Example #1
 def __init__(self,
              index_path,
              field,
              similarity="boolean",
              use_relevance_feedback=False,
              feedback_index_path=None):
     self.reader = DirectoryReader.open(
         FSDirectory.open(Paths.get(index_path)))
     self.searcher = IndexSearcher(self.reader)
     if use_relevance_feedback and feedback_index_path is not None:
         self.feedback_reader = DirectoryReader.open(
             FSDirectory.open(Paths.get(feedback_index_path)))
         self.feedback_searcher = IndexSearcher(self.feedback_reader)
     self.similarity = similarity
     self.stopwords = stop_words()
     if similarity == "boolean":
         self.searcher.setSimilarity(BooleanSimilarity())
     elif similarity == "tf":
         self.searcher.setSimilarity(TFSimilarity())
     elif similarity == "tfidf":
         self.searcher.setSimilarity(ClassicSimilarity())
     elif similarity == "BM25":
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     else:
         print("Unknown similarity; using BM25(1.2, 0.2) as the default")
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     analyzer = StandardAnalyzer()
     print(self.searcher.getSimilarity())
     self.parser = QueryParser(field, analyzer)
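
A minimal usage sketch for the constructor above. The class name LuceneSearcher, the index path and the field name are made up, and the snippet assumes the class (with its stop_words() and TFSimilarity helpers) is importable from the surrounding module:

import lucene

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
# hypothetical class name, index path and field, just to exercise the similarity switch
engine = LuceneSearcher("./my_index", "text", similarity="tfidf")
query = engine.parser.parse("classic similarity example")
for hit in engine.searcher.search(query, 10).scoreDocs:
    print(engine.searcher.doc(hit.doc).get("text"), hit.score)
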
    def createIndex_Stem_Lemma_SpacyIndex(self):
        print("In create index method")
        spacy_file = self.directory+"wiki_spacy_lemma_pos.json"
        my_analyzer = analysis.en.EnglishAnalyzer()
        my_config = index.IndexWriterConfig(my_analyzer)
        my_config.setSimilarity(ClassicSimilarity())
        my_writer = index.IndexWriter(self.in_directory_English_lemma, my_config)
        # Setting up Title field for content we want tokenized
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # keep frequencies and positions for ranking
        # Setting up Body field for content we want tokenized
        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # frequencies and positions are needed for ranking
        # Setting up Categories field for content we want tokenized
        t3 = FieldType()
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(IndexOptions.DOCS)  # DOCS is enough for categories; they are matched but not position-ranked
        # Setting up Body POS  field for content we want tokenized
        t4 = FieldType()
        t4.setStored(True)
        t4.setTokenized(True)
        t4.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # frequencies and positions are needed for ranking

        nDocsAdded = 0
        docs = self.readJSONFromDisk(spacy_file)
        print("Len of file is", len(docs))
        for entry in docs:
            title = entry[0]
            lemma = entry[1]
            category = entry[2]
            pos = entry[3]
            doc = Document()

            doc.add(Field(self.TITLE, title, t1))
            doc.add(Field(self.TEXT, lemma, t2))
            doc.add(Field("Categories", category, t3))
            doc.add(Field("POS", pos, t4))
            my_writer.addDocument(doc)
            nDocsAdded += 1
        # commit and close so everything is safely written to the index directory
        my_writer.commit()
        my_writer.close()
        print("Indexed %d documents with spacy." % nDocsAdded)
        pass
def main():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    queries = makeQueryList(args["queryFile"])
    print('lucene', lucene.VERSION)
    print("\n")

    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    print(directory.getDirectory())
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = StandardAnalyzer()

    run(searcher, analyzer, queries)
    del searcher
 def SearchSimple_Eng(self, query, tfidf):
     # Search the index using Lucene query syntax
     directory = self.in_directory_English
     ireader = index.DirectoryReader.open(directory)
     isearcher = search.IndexSearcher(ireader)
     analyzer = analysis.en.EnglishAnalyzer()
     isMatch = False
     isMatchCat = False
     # change the similarity function to ClassicSimilarity which implements tfidf
     if (tfidf):
         isearcher.setSimilarity(ClassicSimilarity())
     # Parse a simple query that searches in the text field
     parser = queryparser.classic.QueryParser(self.TEXT, analyzer)
     # parse the question text into a Lucene query
     query_text = parser.parse(query[1])
     # also build a query that appends the title-cased category, so category terms
     # can pull in documents the plain question would miss; only the top hit is kept
     query_cat_text = parser.parse(query[1] + " " + query[0].title())
     hits = isearcher.search(query_text, 1).scoreDocs
     hits_cat = isearcher.search(query_cat_text, 1).scoreDocs
     ans = []
     # search for query terms in contents of documents
     for hit in hits:
         hit_doc = isearcher.doc(hit.doc)
         if hit_doc[self.TITLE] in query[2]:
             isMatch = True
         # add Documents to answer
         ans = [query[0], query[1], query[2], hit_doc[self.TITLE], hit.score, isMatch, hit.score, isMatch]
         if hit.score < hits_cat[0].score:
             hit_cat_doc = isearcher.doc(hits_cat[0].doc)
             if hit_cat_doc[self.TITLE] in query[2] :
                 isMatchCat = True
             ans = [query[0], query[1], query[2], hit_cat_doc[self.TITLE], hits_cat[0].score, isMatchCat, hit.score, isMatch]
         # if isMatch or isMatchCat:
         #     print("SSE Query-answer ", ans)
     ireader.close()
     return ans
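
The query argument used by this search method appears to be a (category, question, expected answer) triple indexed as query[0], query[1], query[2]. A hypothetical call, with made-up values and a made-up instance name, might look like:

# "engine" is a hypothetical instance of the class that defines SearchSimple_Eng
q = ["SCIENCE", "This mathematician broke the Enigma cipher", "Alan Turing"]
result = engine.SearchSimple_Eng(q, tfidf=True)
print(result)  # [category, question, expected answer, top title, score, isMatch, ...]
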
Example #5
    def __recs_query(self, positive_rated_document_list, scores, recs_number,
                     items_directory, candidate_list: List) -> pd.DataFrame:
        """
        Builds a query using the contents that the user liked. The terms relative to the contents that
        the user liked are boosted by the rating he/she gave. A filter clause is added to the query to
        consider only candidate items
        Args:
            positive_rated_document_list: List of contents that the user liked
            scores: Ratings given by the user
            recs_number: How many items must be recommended. You can only specify the number,
                not a specific item for which to compute the prediction
            items_directory: Directory where the items are stored
            candidate_list: List of items that can be recommended; only these are scored

        Returns:
            score_frame (pd.DataFrame): DataFrame containing the recommendations for the user
        """
        BooleanQuery.setMaxClauseCount(2000000)
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(
                Paths.get(items_directory))))
        if self.__classic_similarity:
            searcher.setSimilarity(ClassicSimilarity())

        field_list = searcher.doc(positive_rated_document_list[0]).getFields()
        user_fields = {}
        field_parsers = {}
        analyzer = SimpleAnalyzer()
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] = field.stringValue()
            field_parsers[field.name()] = QueryParser(field.name(), analyzer)


        # concatenate the field values of the remaining positively rated documents
        for doc_id in positive_rated_document_list[1:]:
            for field in searcher.doc(doc_id).getFields():
                if field.name() not in user_fields:
                    continue
                user_fields[field.name()] += " " + field.stringValue()

        logger.info("Building query")

        query_builder = BooleanQuery.Builder()
        for score in scores:
            for field_name in user_fields.keys():
                if field_name == 'content_id':
                    continue
                field_parsers[field_name].setDefaultOperator(
                    QueryParser.Operator.OR)

                field_query = field_parsers[field_name].escape(
                    user_fields[field_name])
                field_query = field_parsers[field_name].parse(field_query)
                field_query = BoostQuery(field_query, score)
                query_builder.add(field_query, BooleanClause.Occur.SHOULD)

        if candidate_list is not None:
            id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                          for content_id in candidate_list)
            id_query = QueryParser("testo_libero",
                                   KeywordAnalyzer()).parse(id_query_string)
            query_builder.add(id_query, BooleanClause.Occur.MUST)

        query = query_builder.build()
        docs_to_search = len(positive_rated_document_list) + recs_number
        scoreDocs = searcher.search(query, docs_to_search).scoreDocs

        logger.info("Building score frame to return")

        recorded_items = 0
        columns = ['to_id', 'rating']
        score_frame = pd.DataFrame(columns=columns)
        for scoreDoc in scoreDocs:
            if recorded_items >= recs_number:
                break
            if scoreDoc.doc not in positive_rated_document_list:
                doc = searcher.doc(scoreDoc.doc)
                item_id = doc.getField("content_id").stringValue()
                recorded_items += 1

                score_frame = pd.concat([
                    score_frame,
                    pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                              columns=columns)
                ])

        return score_frame
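
A condensed sketch of the query-building pattern used in the method above: each liked text becomes a SHOULD clause boosted by its rating, and a MUST clause restricts results to candidate ids. The field name "plot", the texts, ratings and ids are made up, and searcher is assumed to be opened as in the example:

from org.apache.lucene.search import BooleanQuery, BooleanClause, BoostQuery
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.analysis.core import SimpleAnalyzer, KeywordAnalyzer

builder = BooleanQuery.Builder()
parser = QueryParser("plot", SimpleAnalyzer())  # "plot" is a made-up field name
for text, rating in [("a space adventure the user loved", 4.5),
                     ("a quiet drama the user liked", 3.5)]:
    builder.add(BoostQuery(parser.parse(parser.escape(text)), rating),
                BooleanClause.Occur.SHOULD)

candidate_ids = ["item_1", "item_2"]  # made-up candidate content ids
id_query = QueryParser("content_id", KeywordAnalyzer()).parse(
    ' OR '.join('content_id:"%s"' % cid for cid in candidate_ids))
builder.add(id_query, BooleanClause.Occur.MUST)
top_docs = searcher.search(builder.build(), 10).scoreDocs
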
Example #6
    import argparse
    parser = argparse.ArgumentParser(
        description='Execute queries on comment body')
    parser.add_argument('user_name', type=str,
                        help="User name (profile to use)")
    parser.add_argument('index_dir', metavar='dir', type=str,
                        help="Index directory")
    parser.add_argument('--sim', type=str, nargs='?',
                        default="tfidf", help="Similarity (in [tfidf, lm, bm25])")
    parser.add_argument('--reorder', type=str, nargs='?',
                        default="no", help="Reordering (in [ups, normups])")
    parser.add_argument('--short', action='store_true',
                        help="Don't show the body of comments")
    args = parser.parse_args()


    if args.sim in ['bm25']:
        similarity = BM25Similarity()
    elif args.sim in ['lm']:
        similarity = LMDirichletSimilarity()
    else:
        similarity = ClassicSimilarity()

    # Sample query
    storeDir = SimpleFSDirectory(Paths.get(args.index_dir))
    searcher = IndexSearcher(DirectoryReader.open(storeDir))
    if similarity is not None:
        searcher.setSimilarity(similarity)
    analyzer = StandardAnalyzer()
    run(searcher, analyzer, args.user_name, reordering=args.reorder, show_bodies=not args.short)

def run(searcher, analyzer):
    while True:
        print("Hit enter with no input to quit.")
        command = input("Query:")
        if command == '':
            return
        translate = makeFrenchQuery.makeFrenchQuery(command)
        commande = ""
        for word in translate:
            commande += word
            commande += " "
        print()
        print("Searching for:", commande)
        query = QueryParser("contents", analyzer).parse(commande)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print("%s total matching documents." % len(scoreDocs))

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print('path:', doc.get("path"), 'name:', doc.get("name"),
                  'score: %f' % scoreDoc.score)


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print('lucene', lucene.VERSION)
    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = FrenchAnalyzer()
    run(searcher, analyzer)
    del searcher
 def SearchSimple_Eng_mult(self, query, tfidf):
     # Search the index using Lucene query syntax
     directory = self.in_directory_English
     ireader = index.DirectoryReader.open(directory)
     isearcher = search.IndexSearcher(ireader)
     analyzer = analysis.en.EnglishAnalyzer()
     isMatch = False
     isMatchCat = False
     # change the similarity function to ClassicSimilarity which implements tfidf
     if (tfidf):
         isearcher.setSimilarity(ClassicSimilarity())
     # Parse a simple query that searches in the text field
     parser = queryparser.classic.QueryParser(self.TEXT, analyzer)
     # parse the question text into a Lucene query
     query_text = parser.parse(query[1])
     # also build a query that appends the title-cased category to the question
     query_cat_text = parser.parse(query[1] + " " + query[0].title())
     hits = isearcher.search(query_text, 10).scoreDocs
     hits_cat = isearcher.search(query_cat_text, 10).scoreDocs
     answers = []
     ans = []
     j = 0
     topTenCorrect = False
     #test_ans = query[2]
     # search for query terms in contents of documents
     query_txt = query[1].split()
     pot_ans = []
     pot_ans_overlap_question = 0
     for i in range(len(hits)):
         hit_doc = isearcher.doc(hits[i].doc)
         hit_cat_doc = isearcher.doc(hits_cat[i].doc)
         ansSet = False
         # keep a hit only if its title is free of unnatural terms, not too long,
         # and does not simply repeat the category
         if  (not self.contains_unnatural_terms(hit_doc[self.TITLE]) and not self.answer_too_long(hit_doc[self.TITLE]) and hit_doc[self.TITLE] not in query[0] ) or (not self.contains_unnatural_terms(hit_cat_doc[self.TITLE])  and not self.answer_too_long(hit_cat_doc[self.TITLE]) and hit_cat_doc[self.TITLE] not in query[0]):
             # accept the plain-text hit if its title is clean, short enough,
             # and not contained in the question itself
             if (not self.contains_unnatural_terms(hit_doc[self.TITLE]) and not self.answer_too_long(hit_doc[self.TITLE]) and hit_doc[self.TITLE] not in query[1]):
                 ansSet = True
                 # Return to this when done
                 pot_ans = hit_doc[self.TITLE].split(" ")
                 for p in pot_ans:
                     if p in query_txt:
                         pot_ans_overlap_question += 1
                 # if at least 75% of answer words in question then not right answer
                 # if hit_doc[self.TITLE] in query[2]:
                 if not pot_ans_overlap_question/len(pot_ans) >= 0.75 :
                     if hit_doc[self.TITLE] in query[2] :
                     #print("Hit_doc title",hit_doc[self.TITLE])
                         isMatch = True
                         topTenCorrect = True
                     ans = [query[0], query[1], query[2], hit_doc[self.TITLE], hits[i].score, isMatch, hits[i].score, isMatch]
                 else:
                     ansSet = False
                 # add Documents to answer
             # apply the same checks to the category-augmented hit
             if (not self.contains_unnatural_terms(hit_cat_doc[self.TITLE])  and not self.answer_too_long(hit_cat_doc[self.TITLE]) and hit_cat_doc[self.TITLE] not in query[1]):
                 pot_ans_overlap_question = 0
                 pot_ans = hit_cat_doc[self.TITLE].split(" ")
                 for p in pot_ans:
                     if p in query_txt:
                         pot_ans_overlap_question += 1
                 if not pot_ans_overlap_question / len(pot_ans) >= 0.75:
                     if (hit_cat_doc[self.TITLE] in query[2] and ((hits[i].score < hits_cat[i].score) or ansSet == False)) :
                         # Return to this when done (hit_cat_doc[self.TITLE] in query[2] or pot_ans_overlap_question/len(pot_ans) >= 0.75 ) and ((hits[i].score < hits_cat[i].score) or ansSet == False):
                         isMatchCat = True
                         topTenCorrect = True
                     # elif hit_cat_doc[self.TITLE] not in query[2] and ((hits[i].score < hits_cat[i].score) or ansSet == False):
                     #     topTenCorrect = False
                         ans = [query[0], query[1], query[2], hit_cat_doc[self.TITLE], hits_cat[i].score, isMatchCat, hits[i].score, isMatch]
             #print("SSEM Query-answer ",i, ans)
             if topTenCorrect == True:
                 #print("Matched in top 10 for ",j, query[2], ans)
                 if j == 0:
                     self.prec_at_1 += 1
                 break
             if pot_ans_overlap_question < 0.75 :
                 j += 1
             pot_ans_overlap_question = 0
         # else:
              # print("Query ans ", query[2], "too_long ", self.answer_too_long(query[2]) , " unnat terms ", self.contains_unnatural_terms(query[2]), " categ" , self.contains_category_term(query[2], query[0]))
     ireader.close()
     return ans
    def SearchSimple_Eng_Lemma(self, query, tfidf):
        # Search the index using Lucene query syntax
        directory = self.in_directory_English_lemma
        ireader = index.DirectoryReader.open(directory)
        isearcher = search.IndexSearcher(ireader)
        analyzer = analysis.en.EnglishAnalyzer()
        isMatch = False
        isMatchNoun = False
        isMatchCat = False
        # change the similarity function to ClassicSimilarity which implements tfidf
        if (tfidf):
            isearcher.setSimilarity(ClassicSimilarity())
        parser = queryparser.classic.QueryParser(self.TEXT, analyzer)
        self.convertQueryToLemmaToString(query[1])
        query_text = re.sub(r'[0-9]+', '', self.query_lemma)
        query_txt = query_text.split(" ")
        query_pos = self.query_pos.split(" ")
        query_sub =""
        # Extract nouns
        for i in range(len(query_pos)):
            # token is PROPN / NOUN
            if query_pos[i] == "PROPN" or query_pos[i] == "NOUN" or query_pos[i] == "-PRON-":
                if query_sub=="":
                    query_sub = query_txt[i]
                else:
                    query_sub += " "+ query_txt[i]
        #stopwords = "a an and are as at be but by for if in into is it no not of on or such that the their then there these they this to was will with"
        # fall back to the full lemma query if the noun phrase is empty, very short,
        # or the POS tags do not line up with the tokens
        if query_sub == "" or len(query_sub) <= 2 or len(query_pos) != len(query_txt):
            query_sub = self.query_lemma
        query_text = parser.parse(self.query_lemma)
        # noun phrases only
        query_text_noun = parser.parse(query_sub)
        query_cat_text = parser.parse(self.query_lemma + " " + query[0].title())

        hits = isearcher.search(query_text, 1).scoreDocs
        hits_noun = isearcher.search(query_text_noun, 1).scoreDocs
        hits_cat = isearcher.search(query_cat_text, 1).scoreDocs
        ans = []
        # search for query terms in contents of documents
        for hit in hits:
            hit_doc = isearcher.doc(hit.doc)
            if hit_doc[self.TITLE] in query[2] :
                isMatch = True
            # add Documents to answer
            ans = [query[0], query[1], query[2], hit_doc[self.TITLE], hit.score, isMatch, hit.score, isMatch]
            if hit.score < hits_noun[0].score:
                hit_noun_doc = isearcher.doc(hits_noun[0].doc)
                if hit_noun_doc[self.TITLE] in query[2]  :
                    isMatchNoun = True
                ans = [query[0], query[1], query[2], hit_noun_doc[self.TITLE], hits_noun[0].score, isMatchNoun, query_sub, hit.score, isMatch]
            if hit.score < hits_cat[0].score  and hits_noun[0].score < hits_cat[0].score :
                hit_cat_doc = isearcher.doc(hits_cat[0].doc)
                if hit_cat_doc[self.TITLE] in query[2] :
                    isMatchCat = True
                ans = [query[0], query[1], query[2], hit_cat_doc[self.TITLE], hits_cat[0].score, isMatchCat, hit.score, isMatch, hits_noun[0].score, isMatchNoun]
            # if isMatch or isMatchCat or isMatchNoun:
            #     print("SSEL Query-answer ", ans)
        ireader.close()
        return ans
Example #10
 def createIndex_Stem(self,input_files):
     cur_title = ""
     cur_body = ""
     cur_category = []
     file_counter = 0
     ip_file_counter = 1
     # Initialize PorterStemmer analyzer & Index writer
     my_analyzer = analysis.en.EnglishAnalyzer()
     my_config = index.IndexWriterConfig(my_analyzer)
     my_config.setSimilarity(ClassicSimilarity())
     my_writer = index.IndexWriter(self.in_directory_English, my_config)
     # Setting up Title field for content we want tokenized
     t1 = FieldType()
     t1.setStored(True)
     t1.setTokenized(True)
     t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
     # Setting up Body field for content we want tokenized
     t2 = FieldType()
     t2.setStored(True)
     t2.setTokenized(True)
     t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # frequencies and positions are needed for ranking
     # Setting up Categories field for content we want tokenized
     t3 = FieldType()
     t3.setStored(True)
     t3.setTokenized(True)
     t3.setIndexOptions(IndexOptions.DOCS)
     nDocsAdded = 0
     print("List of input files is",input_files)
     for input_file in input_files:
         with open(input_file, 'r', encoding='utf8') as f1:
             # Assumes each input file contains documents separated by title lines written as [[Title]]
             my_line = f1.readline()
             while my_line:
                 if my_line.startswith("[[") and my_line.rstrip().endswith("]]"):
                     if cur_title != "":
                         doc = Document()
                         doc.add(Field(self.TITLE, cur_title, t1))
                         doc.add(Field(self.TEXT, cur_body, t2))
                         doc.add(Field("Categories", self.listToString(cur_category), t3))
                         my_writer.addDocument(doc)
                         nDocsAdded += 1
                         cur_body = ""
                         cur_category = []
                         file_counter += 1
                     cur_title = my_line[2:-3]
                 elif my_line.startswith("CATEGORIES:"):
                     cur_category = my_line[11:].strip().split(", ")
                 else:
                     cur_body += my_line
                 my_line = f1.readline()
             file_counter += 1
             doc = Document()
             doc.add(Field(self.TITLE, cur_title, t1))
             doc.add(Field(self.TEXT, cur_body, t2))
             doc.add(Field("Categories", self.listToString(cur_category), t3))
             my_writer.addDocument(doc)
             cur_title = ""
             cur_body = ""
         ip_file_counter += 1
     my_writer.commit()
     my_writer.close()
     print("Indexed %d documents." % nDocsAdded)
     pass
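
For reference, the parser above expects input files shaped like the snippet below: a title line in double square brackets, an optional CATEGORIES: line, then free-text body lines. The titles and categories are made up:

sample = """[[Alan Turing]]
CATEGORIES: Computer scientists, Mathematicians
Alan Turing was a pioneer of theoretical computer science.
[[Apache Lucene]]
CATEGORIES: Search engines, Java libraries
Lucene is a full-text search library written in Java.
"""
with open("wiki_sample.txt", "w", encoding="utf8") as f:
    f.write(sample)
# indexer.createIndex_Stem(["wiki_sample.txt"])  # "indexer" is a hypothetical instance
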
Example #11
def stats_tooltip(word, doc_id, reader):
    # content statistics
    term = Term('content', tokenize(word))
    term_text = str(term).replace('content:', '')
    doc_count = reader.docFreq(term)  # in how many docs the term appears

    total_term_count = reader.totalTermFreq(
        term)  # how many times the term appears in any doc
    n_docs = reader.getDocCount('content')  # total number of docs

    postings = MultiFields.getTermDocsEnum(reader, 'content',
                                           BytesRef(term_text))
    while postings.docID() != doc_id:  # this is bad
        postings.nextDoc()
    term_count = postings.freq()  # how many times the term appears in this doc

    similarity = ClassicSimilarity()
    tf = similarity.tf(float(term_count))  # sqrt(term_freq)
    # whether the term is common or rare among all the docs
    idf = similarity.idf(int(doc_count),
                         int(n_docs))  # log((n_docs+1)/(doc_count+1)) + 1

    # abstract statistics
    abstract_term = Term('abstract', tokenize(word))
    abstract_doc_count = reader.docFreq(abstract_term)
    abstract_total_term_count = reader.totalTermFreq(abstract_term)
    a_idf = similarity.idf(int(abstract_doc_count), int(n_docs))

    abstract_postings = MultiFields.getTermDocsEnum(reader, 'abstract',
                                                    BytesRef(term_text))
    if not abstract_postings:  # the term appears in no document's abstract
        abstract_term_count = 0
        a_tf = 1
    else:
        while abstract_postings.docID() != doc_id:  # this is bad
            if abstract_postings.nextDoc() == abstract_postings.NO_MORE_DOCS:
                abstract_term_count = 0  # it does not appear in this document's abstract
                a_tf = 1
                break
        else:  # no break, it does appear in this document's abstract
            abstract_term_count = abstract_postings.freq()
            a_tf = similarity.tf(float(abstract_term_count))

    content_score = tf * idf**2 * CONTENT_BOOST
    abstract_score = a_tf * a_idf**2 * ABSTRACT_BOOST

    # mixing concerns like nobody's business
    return '''
            <div class="popup">
                <div class="term">{}</div>     
                
                <table>
                <tr>
                    <th> </th>
                    <th>abstr</th>
                    <th>body</th>
                    <th>total</th>
                </tr>
                
                <tr><td>this doc</td>   <td>{}</td>     <td>{}</td>     <td>{}</td>     </tr>
                <tr><td>TF</td>         <td>{:.2g}</td> <td>{:.2g}</td> <td>{:.2g}</td> </tr>
                
                <tr><td>nr docs</td>    <td>{}</td>     <td>{}</td>     <td>{}</td>     </tr>
                <tr><td>IDF</td>        <td>{:.2g}</td> <td>{:.2g}</td> <td>{:.2g}</td> </tr>
                
                <tr><td>score</td>      <td>{:.2g}</td> <td>{:.2g}</td> <td><b>{:.2g}</b></td> </tr>
                <tr><td>all docs</td>   <td>{}</td>     <td>{}</td>     <td>{}</td>     </tr>
                </table>
                
                <div class="total-docs">{}</div>
            </div>
            '''.format(
        term_text, abstract_term_count, term_count - abstract_term_count,
        term_count, a_tf, tf, a_tf * tf, abstract_doc_count, doc_count,
        doc_count, a_idf, idf, a_idf * idf, abstract_score, content_score,
        abstract_score * content_score, abstract_total_term_count,
        total_term_count - abstract_total_term_count, total_term_count, n_docs)
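
To make the tooltip numbers concrete, here is the same ClassicSimilarity arithmetic used above (tf = sqrt(freq), idf = log((n_docs+1)/(doc_count+1)) + 1) computed by hand with made-up counts and an assumed boost of 1.0:

import math

term_count = 4       # occurrences of the term in this document's body (made up)
doc_count = 20       # documents containing the term (made up)
n_docs = 1000        # documents in the index (made up)
CONTENT_BOOST = 1.0  # assumed boost value

tf = math.sqrt(term_count)                          # 2.0
idf = math.log((n_docs + 1) / (doc_count + 1)) + 1  # about 4.86
content_score = tf * idf ** 2 * CONTENT_BOOST       # about 47.3
print(tf, idf, content_score)
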
Example #12
 def set_searcher_similarity(self, sim):
     if sim == 'BM25':
         self.search_object.setSimilarity(BM25Similarity())
     else:
         self.search_object.setSimilarity(ClassicSimilarity())