Example #1
    def compute_score(self, models, type, query):
        # Dispatch to the scoring model selected by the integer `type`:
        # 0 = vector space model, 1 = language model with Dirichlet smoothing,
        # 2 = language model with Jelinek-Mercer smoothing, anything else = RM3.
        if type == 0:
            return models.score_vsm(parser.stemSentence(query))
        elif type == 1:
            return models.score_lmd(parser.stemSentence(query))
        elif type == 2:
            return models.score_lmjm(parser.stemSentence(query))
        else:
            return models.scoreRM3(parser.stemSentence(query))
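
As a design note, the if/elif ladder above can be collapsed into a mapping from model code to scoring method. This is a hypothetical restructuring, not the original code, and it assumes the same single-argument call signatures shown above:

    def compute_score(self, models, type, query):
        # Map each model code to its scoring method; fall back to RM3.
        scorers = {0: models.score_vsm, 1: models.score_lmd, 2: models.score_lmjm}
        scorer = scorers.get(type, models.scoreRM3)
        return scorer(parser.stemSentence(query))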
Example #2
    def __init__(self, bigrams):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()

        self.map_val = []
        lambs = np.arange(0, 1, 0.1)

        # Compute the MAP for each lambda value; after sweeping the whole
        # range, plot the variation of the MAP across the lambda values.
        for lamb in lambs:
            models = RetrievalModelsMatrix.RetrievalModelsMatrix(
                tf_cranfield, vectorizer, lamb)

            i = 1
            map_model = 0
            for query in cranfield.queries:
                # Parse the query and compute the document scores
                scores = models.score_lmjm(parser.stemSentence(query))

                # Do the evaluation
                [average_precision, precision, self.recall,
                 thresholds] = cranfield.eval(scores, i)

                map_model = map_model + average_precision
                i = i + 1

            self.map_val.append(map_model / cranfield.num_queries)

        plt.plot(lambs, self.map_val, color='b', alpha=1)
        plt.ylim([0.0, 0.5])
        plt.xlim([0.0, 1.0])
        plt.xlabel('Lambda')
        plt.ylabel('MAP')
        plt.title('MAP-Lambda')
        plt.savefig('results/map-lamb.png', dpi=100)
        plt.show()
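
The internals of score_lmjm are not shown in these excerpts. Below is a minimal sketch of Jelinek-Mercer scoring under the usual formulation, where lambda interpolates the document and collection language models; lmjm_scores and query_term_ids are hypothetical names, not the project's API:

import numpy as np

def lmjm_scores(tf, query_term_ids, lamb):
    # tf: (n_docs, n_terms) term-frequency matrix, as produced by the
    # CountVectorizer above; query_term_ids: column indices of the query terms.
    doc_len = tf.sum(axis=1, keepdims=True)             # |d|
    p_ml = tf / np.maximum(doc_len, 1)                  # P(w|d), maximum likelihood
    p_c = tf.sum(axis=0) / tf.sum()                     # P(w|C), collection model
    p_mix = lamb * p_ml + (1 - lamb) * p_c              # smoothed document model
    # Query log-likelihood of each document
    return np.log(np.maximum(p_mix[:, query_term_ids], 1e-12)).sum(axis=1)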
Example #3
def vsm(vectorizer, cl, verbose):
    plt.clf()
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer)
    i = 1
    map_vsm = 0
    p10aux = 0
    precision_vsm = []
    for query in cl.queries_cranfield['query']:
        scores = models.score_vsm(parser.stemSentence(query))
        [average_precision, precision_11point, recall_11point,
         p10] = cl.eval(scores, i)
        map_vsm = map_vsm + average_precision
        p10aux = p10aux + p10
        # Keep the per-query 11-point precision curve for the mean/std band
        precision_vsm.append(precision_11point)

        if verbose:
            plt.plot(recall_11point,
                     precision_11point,
                     color='silver',
                     alpha=0.1)
            print('qid =', i, 'VSM     AP=', average_precision)
        i = i + 1

    map_vsm = map_vsm / cl.num_queries
    p10aux = p10aux / cl.num_queries

    # Plot the mean precision-recall curve over all queries
    plt.plot(recall_11point, np.mean(precision_vsm, axis=0), color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(
        recall_11point,
        np.mean(precision_vsm, axis=0) - np.std(precision_vsm, axis=0),
        np.mean(precision_vsm, axis=0) + np.std(precision_vsm, axis=0),
        facecolor='b',
        alpha=0.1)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_vsm))
    plt.savefig('results/VSMResult.png', dpi=100)

    finalres = [map_vsm, p10aux]
    return finalres
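
score_vsm is assumed here to rank documents by cosine similarity, the standard vector space formulation; vsm_scores below is a hypothetical illustration under that assumption, not the project's implementation:

import numpy as np

def vsm_scores(tf, query_vec):
    # Cosine similarity between the query vector and each document row.
    doc_norms = np.linalg.norm(tf, axis=1)
    q_norm = np.linalg.norm(query_vec)
    denom = np.maximum(doc_norms * q_norm, 1e-12)       # guard against zero norms
    return (tf @ query_vec) / denom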
Example #4
def lmjm(vectorizer, cl, verbose, lmbd):
    plt.clf()
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer)
    precision_lmjm = []
    p10aux = 0
    map_lmjm = 0
    j = 1
    for query in cl.queries_cranfield['query']:
        score = models.score_lmjm(parser.stemSentence(query), lmbd)
        [average_precision, precision_11point, recall_11point,
         p10] = cl.eval(score, j)
        map_lmjm = map_lmjm + average_precision
        p10aux = p10aux + p10
        # Keep the per-query 11-point precision curve for the mean/std band
        precision_lmjm.append(precision_11point)
        if verbose:
            plt.plot(recall_11point,
                     precision_11point,
                     color='silver',
                     alpha=0.1)
            print('qid =', j, 'LMJM     AP=', average_precision)
        j = j + 1
    map_lmjm = map_lmjm / cl.num_queries
    p10aux = p10aux / cl.num_queries

    # Plot the mean precision-recall curve over all queries
    plt.plot(recall_11point, np.mean(precision_lmjm, axis=0), color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(
        recall_11point,
        np.mean(precision_lmjm, axis=0) - np.std(precision_lmjm, axis=0),
        np.mean(precision_lmjm, axis=0) + np.std(precision_lmjm, axis=0),
        facecolor='b',
        alpha=0.1)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_lmjm))
    plt.savefig('results/LMJMResult.png', dpi=100)

    finalres = [map_lmjm, p10aux]
    return finalres
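
A hypothetical driver for the function above, assuming the project's modules are importable as in the other examples:

from sklearn.feature_extraction.text import CountVectorizer
import collectionloaders

cl = collectionloaders.CranfieldTestBed()
vectorizer = CountVectorizer()
# Evaluate Jelinek-Mercer retrieval at a single lambda value
map_lmjm, p10_lmjm = lmjm(vectorizer, cl, verbose=False, lmbd=0.5)
print('LMJM: MAP={0:0.3f}, P@10={1:0.3f}'.format(map_lmjm, p10_lmjm))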
Example #5
    def __init__(self, bigrams):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()
        models = RetrievalModelsMatrix.RetrievalModelsMatrix(
            tf_cranfield, vectorizer, 0.5, 250)

        ### 4. Run the queries over the corpus
        i = 1
        self.p10_model = 0
        self.precision_model = []

        for query in cranfield.queries:
            # Parse the query and compute the document scores
            scores = models.score_lmd(parser.stemSentence(query))

            # Do the evaluation
            [average_precision, precision, self.recall,
             p10] = cranfield.eval(scores, i)

            # Sum the P@10 values obtained over the different queries
            self.p10_model = self.p10_model + p10
            self.precision_model.append(precision)

            i = i + 1

        # Compute the mean P@10 over all queries and print it
        self.p10_model = self.p10_model / cranfield.num_queries
        print('\nP10 =', self.p10_model)
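
score_lmd presumably applies Dirichlet smoothing, with the 250 passed to the constructor acting as the prior mu. A minimal sketch under that assumption; lmd_scores and query_term_ids are hypothetical names:

import numpy as np

def lmd_scores(tf, query_term_ids, mu=250):
    # Dirichlet-smoothed document model:
    # P(w|d) = (tf(w,d) + mu * P(w|C)) / (|d| + mu)
    doc_len = tf.sum(axis=1, keepdims=True)
    p_c = tf.sum(axis=0) / tf.sum()                     # collection model P(w|C)
    p_dir = (tf + mu * p_c) / (doc_len + mu)
    # Query log-likelihood of each document
    return np.log(np.maximum(p_dir[:, query_term_ids], 1e-12)).sum(axis=1)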
Example #6
def bm25(vectorizer, cl, verbose, k1, b):
    plt.clf()  # start from a clean figure, as the other model runners do
    corpus = parser.stemCorpus(cl.corpus_cranfield['abstract'])
    tf_cranfield = vectorizer.fit_transform(corpus).toarray()
    models = RetrievalModelsMatrix.RetrievalModelsMatrix(
        tf_cranfield, vectorizer)
    i = 1
    map_bm25 = 0
    precision_bm25 = []
    # Per-query 11-point precision curves, used for the mean/std band
    precision_curves = []
    for query in cl.queries_cranfield['query']:
        scores = models.score_bm25(parser.stemSentence(query), k1, b)
        [average_precision, precision_11point, recall_11point,
         p10] = cl.eval(scores, i)
        map_bm25 = map_bm25 + average_precision
        precision_bm25.append(average_precision)
        precision_curves.append(precision_11point)

        if verbose:
            plt.plot(recall_11point,
                     precision_11point,
                     color='silver',
                     alpha=0.1)
            print('qid =', i, 'BM25    AP=', average_precision)
        i = i + 1
    map_bm25 = map_bm25 / cl.num_queries

    # Plot the mean precision-recall curve over all queries
    plt.plot(recall_11point, np.mean(precision_curves, axis=0), color='b', alpha=1)
    plt.gca().set_aspect('equal', adjustable='box')
    plt.fill_between(
        recall_11point,
        np.mean(precision_curves, axis=0) - np.std(precision_curves, axis=0),
        np.mean(precision_curves, axis=0) + np.std(precision_curves, axis=0),
        facecolor='b',
        alpha=0.1)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.0])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall (MAP={0:0.2f})'.format(map_bm25))
    plt.savefig('results/bm25test.png', dpi=100)

    finalres = [precision_bm25, recall_11point, map_bm25]
    return finalres
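
The k1 and b arguments control term-frequency saturation and document-length normalization. A minimal sketch of the standard Okapi BM25 weighting, assuming a raw term-frequency matrix; bm25_term_scores is a hypothetical name, not the project's API:

import numpy as np

def bm25_term_scores(tf, query_term_ids, k1=1.2, b=0.75):
    n_docs = tf.shape[0]
    doc_len = tf.sum(axis=1, keepdims=True)
    avg_len = doc_len.mean()
    df = (tf > 0).sum(axis=0)                           # document frequency
    idf = np.log((n_docs - df + 0.5) / (df + 0.5) + 1)  # smoothed IDF
    norm = k1 * (1 - b + b * doc_len / avg_len)         # length normalization
    weights = idf * (tf * (k1 + 1)) / (tf + norm)       # per-term BM25 weight
    return weights[:, query_term_ids].sum(axis=1)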
Example #7
    def __init__(self, bigrams):

        ### 1. Load the corpus
        cranfield = collectionloaders.CranfieldTestBed()

        ### 2. Parse the corpus
        # Tokenize, stem and remove stop words
        if not bigrams:
            vectorizer = CountVectorizer()
        else:
            vectorizer = CountVectorizer(ngram_range=(1, 2),
                                         token_pattern=r'\b\w+\b',
                                         min_df=1)

        corpus = parser.stemCorpus(cranfield.corpus_cranfield['abstract'])

        ### 3. Create the model
        # Compute the term frequencies matrix and the model statistics
        tf_cranfield = vectorizer.fit_transform(corpus).toarray()

        colors = ['green', 'red', 'blue']
        limits = [-3, -5, -10]

        # For each limit, compute how the MAP varies across a range
        # of alpha values.
        for k, color in zip(limits, colors):
            self.map_val = []
            alphas = np.arange(0, 1, 0.1)
            for alpha in alphas:
                models = RetrievalModelsMatrix.RetrievalModelsMatrix(
                    tf_cranfield, vectorizer, alpha)

                i = 1
                map_model = 0
                for query in cranfield.queries:
                    # Parse the query and compute the document scores
                    scores = models.scoreRM3(parser.stemSentence(query), k,
                                             alpha)

                    # Do the evaluation
                    [average_precision, precision, self.recall,
                     thresholds] = cranfield.eval(scores, i)

                    # Show the words that RM3 considered relevant for this query
                    words = self.show_query_terms(vectorizer, models)
                    print('\nalpha:', alpha, ', limit:', abs(k), '\n', words)

                    map_model = map_model + average_precision
                    i = i + 1

                self.map_val.append(map_model / cranfield.num_queries)

            # Plot the MAP variation across the alpha values for this limit
            plt.plot(alphas,
                     self.map_val,
                     color=color,
                     alpha=1,
                     label='limit = ' + str(abs(k)))

        plt.legend(loc='upper left')
        plt.ylim([0.0, 0.5])
        plt.xlim([0.0, 1.0])
        plt.xlabel('Alpha')
        plt.ylabel('MAP')
        plt.title('MAP-Alpha')
        plt.savefig('results/map-alpha.png', dpi=100)
        plt.show()
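
The alpha swept above is the usual RM3 interpolation weight between the original query model and the RM1 relevance model estimated from top-ranked documents. A minimal sketch of that final mixing step; rm3_query_model and rm1_weights are hypothetical names:

import numpy as np

def rm3_query_model(q_vec, rm1_weights, alpha):
    # P'(w|q) = alpha * P(w|q) + (1 - alpha) * P_rm1(w)
    q_model = q_vec / np.maximum(q_vec.sum(), 1e-12)    # normalized query model
    return alpha * q_model + (1 - alpha) * rm1_weights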