def __init__(self, bigrams):
    """Sweep the `score_lmjm` weight over [0, 1) and plot MAP against it.

    Loads the Cranfield test bed, builds a term-count matrix from the
    stemmed abstracts, then for each value in ``np.arange(0, 1, 0.1)``
    builds a ``RetrievalModelsMatrix``, scores every query with
    ``score_lmjm`` and averages the per-query average precision.

    Args:
        bigrams: Falsy to index with the default ``CountVectorizer``;
            truthy to index unigrams and bigrams (``ngram_range=(1, 2)``).

    Side effects:
        Sets ``self.map_val`` (one MAP value per sweep value) and
        ``self.recall`` (the recall returned for the last evaluated
        query), draws on the current matplotlib figure, writes
        ``results/map-lamb.png`` and opens the plot window.
    """
    ### 1. Load the corpus
    cranfield = collectionloaders.CranfieldTestBed()

    ### 2. Parse the corpus
    # Tokenize, stem and remove stop words
    if bigrams:
        vectorizer = CountVectorizer(ngram_range=(1, 2),
                                     token_pattern=r'\b\w+\b', min_df=1)
    else:
        vectorizer = CountVectorizer()
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    ### 3. Create the model
    # Compute the term frequencies matrix and the model statistics
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()

    # Stem each query once: the stemmed text does not depend on the sweep
    # value, so redoing it inside the sweep loop was wasted work.
    # (Assumes parser.stemSentence is deterministic -- TODO confirm.)
    stemmed_queries = [parser.stemSentence(query)
                       for query in cranfield.queries]

    self.map_val = []
    lambs = np.arange(0, 1, 0.1)

    # One MAP value per entry of `lambs`; plotted once the sweep is done.
    for lamb in lambs:
        models = RetrievalModelsMatrix.RetrievalModelsMatrix(
            tf_cranfield, vectorizer, lamb)

        map_model = 0
        # Query ids passed to cranfield.eval are 1-based.
        for qid, stemmed_query in enumerate(stemmed_queries, start=1):
            # Compute the document scores for the parsed query
            scores = models.score_lmjm(stemmed_query)

            # Do the evaluation
            average_precision, _, self.recall, _ = cranfield.eval(scores, qid)
            map_model = map_model + average_precision

        self.map_val.append(map_model / cranfield.num_queries)

    plt.plot(lambs, self.map_val, color='b', alpha=1)
    plt.ylim([0.0, 0.5])
    plt.xlim([0.0, 1.0])
    plt.xlabel('Lambda')
    plt.ylabel('MAP')
    plt.title('MAP-Lambda')
    plt.savefig('results/map-lamb.png', dpi=100)
    plt.show()
def __init__(self, bigrams):
    """Run `score_lmd` over the Cranfield queries and print the mean P10.

    Args:
        bigrams: Falsy to index with the default ``CountVectorizer``;
            truthy to index unigrams and bigrams (``ngram_range=(1, 2)``).

    Side effects:
        Sets ``self.p10_model`` (sum of the per-query P10 values divided
        by ``num_queries``), ``self.precision_model`` (one precision entry
        per query) and ``self.recall`` (recall returned for the last
        query), then prints the mean P10.
    """
    # Step 1: load the test collection.
    testbed = collectionloaders.CranfieldTestBed()

    # Step 2: choose the tokenisation scheme and stem the abstracts.
    vectorizer = (
        CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
        if bigrams
        else CountVectorizer()
    )
    stemmed_docs = parser.stemCorpus(testbed.corpus_cranfield['abstract'])

    # Step 3: term-count matrix and the retrieval model built on top of it.
    # NOTE(review): 0.5 and 250 are presumably the model's smoothing
    # parameters -- confirm against RetrievalModelsMatrix.
    term_counts = vectorizer.fit_transform(stemmed_docs).toarray()
    ranker = RetrievalModelsMatrix.RetrievalModelsMatrix(
        term_counts, vectorizer, 0.5, 250)

    # Step 4: score and evaluate every query (ids are 1-based).
    self.p10_model = 0
    self.precision_model = []
    for query_id, raw_query in enumerate(testbed.queries, start=1):
        doc_scores = ranker.score_lmd(parser.stemSentence(raw_query))
        _, query_precision, self.recall, query_p10 = testbed.eval(
            doc_scores, query_id)

        # Running total of P10; turned into a mean after the loop.
        self.p10_model = self.p10_model + query_p10
        self.precision_model.append(query_precision)

    self.p10_model = self.p10_model / testbed.num_queries
    print('\nP10 =', self.p10_model)
def __init__(self, bigrams):
    """Plot MAP against alpha for `scoreRM3`, one curve per term limit.

    For every limit in ``[-3, -5, -10]`` and every alpha in
    ``np.arange(0, 1, 0.1)`` a ``RetrievalModelsMatrix`` is built, every
    Cranfield query is scored with ``scoreRM3`` and evaluated, and the
    mean of the per-query average precision is recorded. After each
    query the result of ``self.show_query_terms`` is printed.

    Args:
        bigrams: Falsy to index with the default ``CountVectorizer``;
            truthy to index unigrams and bigrams (``ngram_range=(1, 2)``).

    Side effects:
        Sets ``self.map_val`` (MAP values of the last limit processed)
        and ``self.recall`` (recall returned for the last evaluated
        query), prints per-query diagnostics, writes
        ``results/map-alpha.png`` and opens the plot window.
    """
    ### 1. Load the corpus
    cranfield = collectionloaders.CranfieldTestBed()

    ### 2. Parse the corpus
    # Tokenize, stem and remove stop words
    if bigrams:
        vectorizer = CountVectorizer(ngram_range=(1, 2),
                                     token_pattern=r'\b\w+\b', min_df=1)
    else:
        vectorizer = CountVectorizer()
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    ### 3. Create the model
    # Compute the term frequencies matrix and the model statistics
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()

    # Stem each query once: the stemmed text depends on neither the limit
    # nor alpha, so redoing it for all 30 combinations was wasted work.
    # (Assumes parser.stemSentence is deterministic -- TODO confirm.)
    stemmed_queries = [parser.stemSentence(query)
                       for query in cranfield.queries]

    colors = ['green', 'red', 'blue']
    limits = [-3, -5, -10]
    alphas = np.arange(0, 1, 0.1)

    # For each defined limit, compute the MAP variation over the alphas.
    # Pairing limits with colours directly avoids repeated
    # `limits.index(k)` lookups.
    for k, color in zip(limits, colors):
        self.map_val = []

        for alpha in alphas:
            models = RetrievalModelsMatrix.RetrievalModelsMatrix(
                tf_cranfield, vectorizer, alpha)

            map_model = 0
            # Query ids passed to cranfield.eval are 1-based.
            for qid, stemmed_query in enumerate(stemmed_queries, start=1):
                # Compute the document scores for the parsed query
                scores = models.scoreRM3(stemmed_query, k, alpha)

                # Do the evaluation
                average_precision, _, self.recall, _ = cranfield.eval(
                    scores, qid)

                # Compute the words that were considered relevant in this query
                words = self.show_query_terms(vectorizer, models)
                print('\nalpha:', alpha, ', limit:', abs(k), '\n', words)

                map_model = map_model + average_precision

            self.map_val.append(map_model / cranfield.num_queries)

        # One curve per limit showing the MAP variation across alphas.
        plt.plot(alphas, self.map_val, color=color, alpha=1,
                 label='limit = ' + str(abs(k)))

    plt.legend(loc='upper left')
    plt.ylim([0.0, 0.5])
    plt.xlim([0.0, 1.0])
    plt.xlabel('Alpha')
    plt.ylabel('MAP')
    plt.title('MAP-Alpha')
    plt.savefig('results/map-alpha.png', dpi=100)
    plt.show()
def __init__(self, model_type):
    """Plot unigram and bigram precision-recall curves for one model.

    Args:
        model_type: Index into ``["vsm", "lmd", "lmjm", "rm3"]``; it is
            also forwarded to ``self.compute_score``.

    Side effects:
        Sets ``self.map_model``, ``self.precision_model`` and
        ``self.recall`` (values from the last configuration processed),
        draws both curves on the current matplotlib figure and writes
        ``results/uni-bi-<model name>.png``.
    """
    # Names of the models the program can compute.
    models_names = ["vsm", "lmd", "lmjm", "rm3"]

    testbed = collectionloaders.CranfieldTestBed()
    stemmed_docs = parser.stemCorpus(testbed.corpus_cranfield['abstract'])

    # (line colour, legend label, CountVectorizer keyword arguments)
    configurations = [
        ('b', 'unigram', {}),
        ('r', 'bigram', {'ngram_range': (1, 2),
                         'token_pattern': r'\b\w+\b',
                         'min_df': 1}),
    ]

    for line_color, line_label, vectorizer_kwargs in configurations:
        vectorizer = CountVectorizer(**vectorizer_kwargs)
        term_counts = vectorizer.fit_transform(stemmed_docs).toarray()
        ranker = RetrievalModelsMatrix.RetrievalModelsMatrix(
            term_counts, vectorizer)

        self.map_model = 0
        self.precision_model = []

        # Evaluate every query (1-based ids), accumulating AP and keeping
        # each query's precision values.
        for query_id, raw_query in enumerate(testbed.queries, start=1):
            doc_scores = self.compute_score(ranker, model_type, raw_query)
            query_ap, query_precision, self.recall, _ = testbed.eval(
                doc_scores, query_id)
            self.map_model = self.map_model + query_ap
            self.precision_model.append(query_precision)

        self.map_model = self.map_model / testbed.num_queries
        averaged_precision = np.mean(self.precision_model, axis=0)

        # One precision-recall curve per configuration.
        plt.plot(self.recall, averaged_precision, color=line_color, alpha=1,
                 label=line_label)

    axes = plt.gca()
    axes.set_aspect('equal', adjustable='box')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    # Legend goes in the top-left corner of the plot.
    plt.legend(loc='upper left')

    model_name = models_names[model_type]
    plt.title('Precision-Recall (' + model_name.upper() + ')')
    plt.savefig('results/uni-bi-' + model_name + '.png', dpi=100)
def __init__(self, bigrams, model_type, is_sw=0.05, is_ba=0.95):
    """Evaluate one retrieval model on Cranfield and record outlier queries.

    Every query is scored through ``self.compute_score`` and evaluated
    with ``cranfield.eval``; its precision-recall curve is drawn in
    silver on figure 1 and its average precision (AP) is accumulated
    into the MAP.

    Args:
        bigrams: Falsy to index with the default ``CountVectorizer``;
            truthy to index unigrams and bigrams (``ngram_range=(1, 2)``).
        model_type: Model selector forwarded to ``self.compute_score``.
        is_sw: Queries with AP strictly below this value are recorded in
            ``self.worse_query``.
        is_ba: Queries with AP greater than or equal to this value are
            recorded in ``self.better_query``.

    Side effects:
        Sets ``self.map_model``, ``self.precision_model``,
        ``self.recall`` (recall returned for the last query),
        ``self.ap_below``, ``self.better_query`` and
        ``self.worse_query``; plots on figure 1; prints the model type
        and the MAP.
    """
    ### 1. Load the corpus
    cranfield = collectionloaders.CranfieldTestBed()

    ### 2. Parse the corpus
    # Tokenize, stem and remove stop words
    if bigrams:
        vectorizer = CountVectorizer(ngram_range=(1, 2),
                                     token_pattern=r'\b\w+\b', min_df=1)
    else:
        vectorizer = CountVectorizer()
    corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

    ### 3. Create the model
    # Compute the term frequencies matrix and the model statistics
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    # NOTE(review): 0.5 and 250 are presumably the model's smoothing
    # parameters -- confirm against RetrievalModelsMatrix.
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer, 0.5, 250)

    ### 4. Run the queries over the corpus
    self.map_model = 0
    self.precision_model = []
    self.ap_below = 0
    self.better_query = []
    self.worse_query = []

    plt.figure(1)
    # Query ids passed to cranfield.eval are 1-based.
    for qid, query in enumerate(cranfield.queries, start=1):
        # Parse the query and compute the document scores
        scores = self.compute_score(models, model_type, query)

        # Do the evaluation
        average_precision, precision, self.recall, _ = cranfield.eval(
            scores, qid)

        # Low-performing query: AP strictly below `is_sw`.
        # The lists live on the instance; the previous bare
        # `worse_query` / `better_query` names raised NameError.
        if average_precision < is_sw:
            self.ap_below = self.ap_below + average_precision
            self.worse_query.append(qid)

        # High-performing query: AP at or above `is_ba`. The previous
        # test (`is_ba >= average_precision`) was inverted and would
        # have matched almost every query, including the worst ones.
        if average_precision >= is_ba:
            self.better_query.append(qid)

        # Sums all the average_precision values obtained in the queries
        self.map_model = self.map_model + average_precision
        self.precision_model.append(precision)

        plt.plot(self.recall, precision, color='silver', alpha=0.1)

    # Mean of the per-query average precision.
    self.map_model = self.map_model / cranfield.num_queries
    # NOTE(review): `ap_below` accumulates the AP *values* of the queries
    # below `is_sw`, not a count of them, so this is not the share of
    # queries below the threshold -- confirm the intended statistic.
    self.ap_below = (self.ap_below / cranfield.num_queries) * 100

    print('model ', model_type, ' done.')
    print('MAP = ', self.map_model)
facecolor='b', alpha=0.1) plt.xlabel('Recall') plt.ylabel('Precision') plt.ylim([0.0, 1.0]) plt.xlim([0.0, 1.0]) plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_vsm)) plt.savefig('results/vsmtest.png', dpi=100) finalres = [precision_vsm, precision_11point, map_vsm] return finalres ###################################################### Main Code ############################################################################################################# cl = collectionloaders.CranfieldTestBed() user_input = input("Command?") verbose = True while user_input.lower() != "q": if user_input.lower() == "mm": user_model_option = input("model : ex. vsm/lmjm/lmd/bm25") user_input = input("Number of N-grams? \t ex: 1 2 3") result = inputParser(user_input) if result == 0: vectorizer = CountVectorizer(ngram_range=(1, int(num) + 1), token_pattern=r'\b\w+\b', min_df=1, stop_words='english') else: vsmArray = []