Example #1
 def test_different_analyzers_are_different(self):
     self.searcher.set_analyzer(
         pyanalysis.get_lucene_analyzer(stemming=False))
     hits_first = self.searcher.search('information retrieval')
     self.searcher.set_analyzer(pyanalysis.get_lucene_analyzer())
     hits_second = self.searcher.search('information retrieval')
     self.assertNotEqual(hits_first, hits_second)
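
The test assumes a searcher over an existing index. A minimal standalone sketch of the same comparison (the pysearch import and the index path are assumptions, not part of the test above):

from pyserini.analysis import pyanalysis
from pyserini.search import pysearch

searcher = pysearch.SimpleSearcher('indexes/lucene-index.cacm')  # placeholder path

# Unstemmed query tokens ('information', 'retrieval')...
searcher.set_analyzer(pyanalysis.get_lucene_analyzer(stemming=False))
hits_no_stem = searcher.search('information retrieval')

# ...versus the default analyzer, which stems them to 'inform', 'retriev'.
searcher.set_analyzer(pyanalysis.get_lucene_analyzer())
hits_stemmed = searcher.search('information retrieval')

print(hits_no_stem[0].docid, hits_no_stem[0].score)
print(hits_stemmed[0].docid, hits_stemmed[0].score)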
Example #2
    def testTermQuery2(self):
        term_query1 = pyquerybuilder.get_term_query('inform', analyzer=get_lucene_analyzer(stemming=False))
        term_query2 = pyquerybuilder.get_term_query('retriev', analyzer=get_lucene_analyzer(stemming=False))

        should = pyquerybuilder.JBooleanClauseOccur['should'].value

        boolean_query1 = pyquerybuilder.get_boolean_query_builder()
        boolean_query1.add(term_query1, should)
        boolean_query1.add(term_query2, should)

        bq1 = boolean_query1.build()
        hits1 = self.searcher.search(bq1)
        hits2 = self.searcher.search('information retrieval')

        for h1, h2 in zip(hits1, hits2):
            self.assertEqual(h1.docid, h2.docid)
            self.assertEqual(h1.score, h2.score)
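
Why the assertions hold: the boolean query is assembled from already-stemmed terms ('inform', 'retriev') with stemming disabled, so it reaches the index with exactly the same terms as the default analysis of the plain string 'information retrieval'. A hedged variant that requires both terms, reusing the term queries built above (the 'must' key is an assumption by analogy with 'should', mirroring Lucene's BooleanClause.Occur.MUST):

must = pyquerybuilder.JBooleanClauseOccur['must'].value

conjunctive = pyquerybuilder.get_boolean_query_builder()
conjunctive.add(term_query1, must)
conjunctive.add(term_query2, must)

# Only documents containing both stemmed terms are returned.
hits_and = self.searcher.search(conjunctive.build())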
Example #3
 def test_analyze_with_analyzer(self):
     tokenizer = pyanalysis.get_lucene_analyzer(stemming=False)
     query = JString('information retrieval')
     only_tokenization = JAnalyzerUtils.analyze(tokenizer, query)
     token_list = []
     for token in only_tokenization.toArray():
         token_list.append(token)
     self.assertEqual(token_list, ['information', 'retrieval'])
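
Example #3 drives the underlying Java AnalyzerUtils class directly, which is why the query must be wrapped in a JString and the result unpacked via toArray(). The Analyzer wrapper shown in Example #7 yields the same tokens without the JVM plumbing; a sketch:

analyzer = pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer(stemming=False))
tokens = analyzer.analyze('information retrieval')
# -> ['information', 'retrieval'], matching the assertion above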
Example #4
 def test_analyze(self):
     self.assertEqual(' '.join(self.index_utils.analyze('retrieval')), 'retriev')
     self.assertEqual(' '.join(self.index_utils.analyze('rapid retrieval, space economy')),
                      'rapid retriev space economi')
     tokenizer = pyanalysis.get_lucene_analyzer(stemming=False)
     self.assertEqual(' '.join(self.index_utils.analyze('retrieval', analyzer=tokenizer)), 'retrieval')
     self.assertEqual(' '.join(self.index_utils.analyze('rapid retrieval, space economy', analyzer=tokenizer)),
                      'rapid retrieval space economy')
     # Test utf encoding:
     self.assertEqual(self.index_utils.analyze('zoölogy')[0], 'zoölog')
     self.assertEqual(self.index_utils.analyze('zoölogy', analyzer=tokenizer)[0], 'zoölogy')
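
The analyze() calls above go through self.index_utils. A standalone sketch, assuming an IndexReaderUtils constructed from an index path as in Example #5 (the path is a placeholder):

index_utils = IndexReaderUtils('indexes/lucene-index.cacm')  # placeholder path

# Default analyzer: Porter stemming plus stopword removal.
print(index_utils.analyze('rapid retrieval, space economy'))
# ['rapid', 'retriev', 'space', 'economi']

# A no-stemming analyzer returns the surface forms.
no_stem = pyanalysis.get_lucene_analyzer(stemming=False)
print(index_utils.analyze('rapid retrieval, space economy', analyzer=no_stem))
# ['rapid', 'retrieval', 'space', 'economy']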
Example #5
 def __init__(self,
              k1: float = 1.6,
              b: float = 0.75,
              index_path: str = None):
     self.k1 = k1
     self.b = b
     self.use_corpus_estimator = False
     self.analyzer = Analyzer(get_lucene_analyzer())
     if index_path:
         self.use_corpus_estimator = True
         self.index_utils = IndexReaderUtils(index_path)
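
The stored k1 and b are the standard BM25 parameters. For reference, a sketch of the textbook Okapi BM25 term weight they control (the classic formula, not necessarily this class's exact implementation):

import math

def bm25_term_weight(tf, df, num_docs, doc_len, avg_doc_len, k1=1.6, b=0.75):
    # idf grows as the term gets rarer in the collection.
    idf = math.log(1 + (num_docs - df + 0.5) / (df + 0.5))
    # k1 caps term-frequency growth; b controls document-length normalization.
    tf_part = tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
    return idf * tf_part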
Example #6
    def test_term_stats(self):
        df, cf = self.index_utils.get_term_counts('retrieval')
        self.assertEqual(df, 138)
        self.assertEqual(cf, 275)

        analyzer = pyanalysis.get_lucene_analyzer(stemming=False, stopwords=False)
        df_no_stem, cf_no_stem = self.index_utils.get_term_counts('retrieval', analyzer)
        # 'retrieval' does not occur as a stemmed word, only 'retriev' does.
        self.assertEqual(df_no_stem, 0)
        self.assertEqual(cf_no_stem, 0)

        df_no_stopword, cf_no_stopword = self.index_utils.get_term_counts('on', analyzer)
        self.assertEqual(df_no_stopword, 326)
        self.assertEqual(cf_no_stopword, 443)
Example #7
    def test_analysis(self):
        # Default is Porter stemmer
        analyzer = pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer())
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

        # Specify Porter stemmer explicitly
        analyzer = pyanalysis.Analyzer(
            pyanalysis.get_lucene_analyzer(stemmer='porter'))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

        # Specify Krovetz stemmer explicitly
        analyzer = pyanalysis.Analyzer(
            pyanalysis.get_lucene_analyzer(stemmer='krovetz'))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['city', 'bus', 'running', 'time'])

        # No stemming
        analyzer = pyanalysis.Analyzer(
            pyanalysis.get_lucene_analyzer(stemming=False))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['city', 'buses', 'running', 'time'])

        # No stopword filter, no stemming
        analyzer = pyanalysis.Analyzer(
            pyanalysis.get_lucene_analyzer(stemming=False, stopwords=False))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens,
                         ['city', 'buses', 'are', 'running', 'on', 'time'])

        # No stopword filter, with stemming
        analyzer = pyanalysis.Analyzer(
            pyanalysis.get_lucene_analyzer(stemming=True, stopwords=False))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'ar', 'run', 'on', 'time'])
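
The same contrasts condensed into a loop over analyzer configurations; the printed token lists match the assertions above:

sentence = 'City buses are running on time.'
configs = {
    'porter (default)': pyanalysis.get_lucene_analyzer(),
    'krovetz': pyanalysis.get_lucene_analyzer(stemmer='krovetz'),
    'no stemming': pyanalysis.get_lucene_analyzer(stemming=False),
    'no stemming, no stopwords':
        pyanalysis.get_lucene_analyzer(stemming=False, stopwords=False),
}
for name, lucene_analyzer in configs.items():
    print(name, pyanalysis.Analyzer(lucene_analyzer).analyze(sentence))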
Example #8
        :param slop: parameter of sdm: https://lucene.apache.org/core/7_3_0/core/org/apache/lucene/search/spans/SpanNearQuery.html
        :param inOrder: parameter of sdm: https://lucene.apache.org/core/7_3_0/core/org/apache/lucene/search/spans/SpanNearQuery.html
        :return: feature value.
        """
        return self.index_utils.object.getFieldFeature(
            self.reader, JString(docid), JString(field),
            JString(query.encode('utf-8')), self.analyzer, JString(method),
            sdm, slop, inOrder)


if __name__ == "__main__":
    topic_dic = parse_topics("topics")
    qrel_dic = parse_qrel("0.txt")

    index_path = '/Volumes/ext3/arvin/anserini/lucene-index.cw09b/'
    analyzer = get_lucene_analyzer()
    extractor = FeatureExtractor(index_path, analyzer)
    fields = ["title", "url", "anchor", "contents"]
    f = open("clueweb09_intent_change.txt", "w+")

    for qid in tqdm(qrel_dic.keys()):
        for docid in qrel_dic[qid].keys():
            rel = qrel_dic[qid][docid]
            query = topic_dic[qid]
            features = []
            s = ""
            for field in fields:
                features.append(extractor.get_field_length(docid, field))
                features.append(extractor.get_tf(query, docid, field))
                features.append(extractor.get_tfidf(query, docid, field))
                features.append(
Example #9
 def test_invalid_analysis(self):
     # Invalid configuration, make sure we get an exception.
     with self.assertRaises(ValueError):
         pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer('blah'))
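
Since an unrecognized configuration raises ValueError, callers can fall back to the default analyzer. A hedged sketch (make_analyzer is a hypothetical helper, and the fallback policy is an illustration, not part of the test):

def make_analyzer(stemmer=None):
    # Hypothetical helper: try the requested stemmer, fall back to the default.
    try:
        if stemmer is not None:
            return pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer(stemmer))
        return pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer())
    except ValueError:
        return pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer())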
Example #10
    'complete', 'do', 'how', 'been', 'against', 'use', 'to', 'had', 'has',
    'approach', 'Studies', 'Stud', 'Inst', 'Divi', 'Thomae', 'Brigham',
    'Young', 'Univ', 'studies', 'volition',
    'severe acute respiratory syndrome', 'affect', 'affected'
]

# NLTK stopwords (the corpus name must be lowercase 'english')
nltk.download('stopwords')
nltk_stopwords = list(set(stopwords.words('english')))
stopwords_manual = list(np.append(stopwords_manual, nltk_stopwords))

token_narrative_list = []

# Extract important narrative text
analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
for i in range(len(R1_topics)):
    tokens = analyzer.analyze(R1_topics['Narrative'][i])
    # Remove stopwords and duplicates from the tokens
    tokens = [w for w in tokens if w not in stopwords_manual]
    tokens = list(set(tokens))
    token_narrative_list.append(tokens)

# Tokenize question
token_question_list = []

# Extract important question text - NOT USED YET
analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
for i in range(len(R1_topics)):
    tokens = analyzer.analyze(R1_topics['Question'][i])
    # Remove stopwords and duplicates from the tokens
    tokens = [w for w in tokens if w not in stopwords_manual]