def test_different_analyzers_are_different(self):
    # Results with stemming disabled should differ from the default (stemmed) analyzer.
    self.searcher.set_analyzer(pyanalysis.get_lucene_analyzer(stemming=False))
    hits_first = self.searcher.search('information retrieval')
    self.searcher.set_analyzer(pyanalysis.get_lucene_analyzer())
    hits_second = self.searcher.search('information retrieval')
    self.assertNotEqual(hits_first, hits_second)
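# For context, a minimal sketch of the fixture these tests assume: a searcher and an
# index reader over the same index, using the old pyserini module layout
# (pysearch/pyanalysis/pyquerybuilder). The index path below is hypothetical.
from pyserini.search import pysearch
from pyserini.index import pyutils

index_path = 'indexes/lucene-index.cacm'  # hypothetical location of a prebuilt index
searcher = pysearch.SimpleSearcher(index_path)
index_utils = pyutils.IndexReaderUtils(index_path)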
def testTermQuery2(self):
    # Build 'inform' OR 'retriev' from already-stemmed terms (hence stemming=False);
    # this should rank and score identically to searching 'information retrieval'.
    term_query1 = pyquerybuilder.get_term_query('inform', analyzer=get_lucene_analyzer(stemming=False))
    term_query2 = pyquerybuilder.get_term_query('retriev', analyzer=get_lucene_analyzer(stemming=False))
    should = pyquerybuilder.JBooleanClauseOccur['should'].value

    boolean_query1 = pyquerybuilder.get_boolean_query_builder()
    boolean_query1.add(term_query1, should)
    boolean_query1.add(term_query2, should)
    bq1 = boolean_query1.build()

    hits1 = self.searcher.search(bq1)
    hits2 = self.searcher.search('information retrieval')

    for h1, h2 in zip(hits1, hits2):
        self.assertEqual(h1.docid, h2.docid)
        self.assertEqual(h1.score, h2.score)
def test_analyze_with_analyzer(self):
    # JString and JAnalyzerUtils come from pyserini's JVM bridge (pyjnius).
    analyzer = pyanalysis.get_lucene_analyzer(stemming=False)
    query = JString('information retrieval')
    only_tokenization = JAnalyzerUtils.analyze(analyzer, query)
    token_list = []
    for token in only_tokenization.toArray():
        token_list.append(token)
    self.assertEqual(token_list, ['information', 'retrieval'])
def test_analyze(self):
    # Default analyzer applies Porter stemming.
    self.assertEqual(' '.join(self.index_utils.analyze('retrieval')), 'retriev')
    self.assertEqual(' '.join(self.index_utils.analyze('rapid retrieval, space economy')),
                     'rapid retriev space economi')

    no_stem_analyzer = pyanalysis.get_lucene_analyzer(stemming=False)
    self.assertEqual(' '.join(self.index_utils.analyze('retrieval', analyzer=no_stem_analyzer)),
                     'retrieval')
    self.assertEqual(' '.join(self.index_utils.analyze('rapid retrieval, space economy', analyzer=no_stem_analyzer)),
                     'rapid retrieval space economy')

    # Test UTF-8 encoding:
    self.assertEqual(self.index_utils.analyze('zoölogy')[0], 'zoölog')
    self.assertEqual(self.index_utils.analyze('zoölogy', analyzer=no_stem_analyzer)[0], 'zoölogy')
def __init__(self, k1: float = 1.6, b: float = 0.75, index_path: str = None):
    self.k1 = k1
    self.b = b
    self.use_corpus_estimator = False
    self.analyzer = Analyzer(get_lucene_analyzer())
    if index_path:
        # With an index available, estimate collection statistics from the corpus.
        self.use_corpus_estimator = True
        self.index_utils = IndexReaderUtils(index_path)
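# A minimal sketch of how k1 and b enter the BM25 weight this scorer computes,
# assuming the standard Lucene-style idf. The helper and its inputs (tf, df,
# document length, corpus statistics) are hypothetical stand-ins for values the
# class would pull from IndexReaderUtils when use_corpus_estimator is set.
import math

def bm25_term_weight(tf, df, doc_len, avg_doc_len, num_docs, k1=1.6, b=0.75):
    # Lucene-style idf; always non-negative.
    idf = math.log(1 + (num_docs - df + 0.5) / (df + 0.5))
    # b controls document-length normalization; k1 controls tf saturation.
    tf_norm = tf * (k1 + 1) / (tf + k1 * (1 - b + b * doc_len / avg_doc_len))
    return idf * tf_norm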
def test_term_stats(self):
    df, cf = self.index_utils.get_term_counts('retrieval')
    self.assertEqual(df, 138)
    self.assertEqual(cf, 275)

    analyzer = pyanalysis.get_lucene_analyzer(stemming=False, stopwords=False)
    df_no_stem, cf_no_stem = self.index_utils.get_term_counts('retrieval', analyzer)
    # 'retrieval' does not occur as a stemmed word, only 'retriev' does.
    self.assertEqual(df_no_stem, 0)
    self.assertEqual(cf_no_stem, 0)

    df_no_stopword, cf_no_stopword = self.index_utils.get_term_counts('on', analyzer)
    self.assertEqual(df_no_stopword, 326)
    self.assertEqual(cf_no_stopword, 443)
def test_analysis(self):
    # Default is Porter stemmer
    analyzer = pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer())
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

    # Specify Porter stemmer explicitly
    analyzer = pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer(stemmer='porter'))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

    # Specify Krovetz stemmer explicitly
    analyzer = pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer(stemmer='krovetz'))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['city', 'bus', 'running', 'time'])

    # No stemming
    analyzer = pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer(stemming=False))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['city', 'buses', 'running', 'time'])

    # No stopword filter, no stemming
    analyzer = pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer(stemming=False, stopwords=False))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['city', 'buses', 'are', 'running', 'on', 'time'])

    # No stopword filter, with stemming
    analyzer = pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer(stemming=True, stopwords=False))
    tokens = analyzer.analyze('City buses are running on time.')
    self.assertEqual(tokens, ['citi', 'buse', 'ar', 'run', 'on', 'time'])
    :param slop: maximum allowed distance between query terms for the SDM span match,
        passed through to Lucene's SpanNearQuery:
        https://lucene.apache.org/core/7_3_0/core/org/apache/lucene/search/spans/SpanNearQuery.html
    :param inOder: whether matching spans must preserve term order (SpanNearQuery's inOrder):
        https://lucene.apache.org/core/7_3_0/core/org/apache/lucene/search/spans/SpanNearQuery.html
    :return: feature value.
    """
    return self.index_utils.object.getFieldFeature(
        self.reader, JString(docid), JString(field), JString(query.encode('utf-8')),
        self.analyzer, JString(method), sdm, slop, inOder)


if __name__ == "__main__":
    topic_dic = parse_topics("topics")
    qrel_dic = parse_qrel("0.txt")
    index_path = '/Volumes/ext3/arvin/anserini/lucene-index.cw09b/'
    analyzer = get_lucene_analyzer()
    extractor = FeatureExtractor(index_path, analyzer)
    fields = ["title", "url", "anchor", "contents"]

    f = open("clueweb09_intent_change.txt", "w+")
    for qid in tqdm(qrel_dic.keys()):
        for docid in qrel_dic[qid].keys():
            rel = qrel_dic[qid][docid]
            query = topic_dic[qid]
            features = []
            s = ""
            for field in fields:
                features.append(extractor.get_field_length(docid, field))
                features.append(extractor.get_tf(query, docid, field))
                features.append(extractor.get_tfidf(query, docid, field))
                features.append(
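# The script above relies on parse_topics and parse_qrel helpers that are not shown.
# A hedged sketch of parse_qrel, assuming the standard four-column TREC qrels format
# "qid 0 docid rel"; the real helper may differ.
def parse_qrel(path):
    qrels = {}
    with open(path) as qrel_file:
        for line in qrel_file:
            qid, _, docid, rel = line.split()
            # Nested dict: qrels[qid][docid] -> relevance grade.
            qrels.setdefault(qid, {})[docid] = int(rel)
    return qrels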
def test_invalid_analysis(self):
    # Invalid configuration, make sure we get an exception.
    with self.assertRaises(ValueError):
        pyanalysis.Analyzer(pyanalysis.get_lucene_analyzer('blah'))
    'complete', 'do', 'how', 'been', 'against', 'use', 'to', 'had', 'has',
    'approach', 'Studies', 'Stud', 'Inst', 'Divi', 'Thomae', 'Brigham',
    'Young', 'Univ', 'studies', 'volition',
    'severe acute respiratory syndrome', 'affect', 'affected'
]

# NLTK stopwords (the corpus fileid is lowercase 'english')
nltk.download('stopwords')
nltk_stopwords = list(set(stopwords.words('english')))
stopwords_manual = stopwords_manual + nltk_stopwords

# Tokenize narrative
token_narrative_list = []
analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))

# Extract important narrative text
for i in range(len(R1_topics)):
    tokens = analyzer.analyze(R1_topics['Narrative'][i])
    # Remove stopwords and duplicates from tokens
    tokens = [w for w in tokens if w not in stopwords_manual]
    tokens = list(set(tokens))
    token_narrative_list.append(tokens)

# Tokenize question
token_question_list = []

# Extract important question text - NOT USED YET
for i in range(len(R1_topics)):
    tokens = analyzer.analyze(R1_topics['Question'][i])
    # Remove stopwords and duplicates from tokens
    tokens = [w for w in tokens if w not in stopwords_manual]
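# The narrative and question loops above differ only in the column they read; a
# hedged refactor into a single helper (names here are hypothetical):
def tokenize_column(topics, column, stopword_list):
    analyzer = Analyzer(get_lucene_analyzer(stemmer='krovetz'))
    tokenized = []
    for text in topics[column]:
        # Analyze, drop stopwords, deduplicate.
        tokens = [w for w in analyzer.analyze(text) if w not in stopword_list]
        tokenized.append(list(set(tokens)))
    return tokenized

# token_narrative_list = tokenize_column(R1_topics, 'Narrative', stopwords_manual)
# token_question_list = tokenize_column(R1_topics, 'Question', stopwords_manual)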