def get_image_pmcid(pmcid, classes=""):
    """Return the image documents that belong to a pmcid (article)."""
    fields = ["pmcid", "class"]
    docs = []
    location = web.__path__[0] + "/static/web/files/index/index.figures"

    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    # multi-field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    # query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    # query = query.parse(query, ('4175339', '1'))
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)  # "Shigella sonnei"

    MAX = 10000
    # hits = searcher.search(query, MAX)

    if classes == "all":
        queryStr = "pmcid:(" + ' '.join(pmcid) + ")"
    else:
        queryStr = "pmcid:(" + ' '.join(pmcid) + ")" + " AND class:" + classes

    query = QueryParser(Version.LUCENE_4_10_1, "pmcid", analyzer)  # needed to build a custom query
    q = query.parse(queryStr)
    hits = searcher.search(q, MAX)

    for hit in hits.scoreDocs:  # should only be one
        # print hit.score, hit.doc, hit.toString()
        docs.append(searcher.doc(hit.doc))

    return docs
def get_query_results(reader, query, n, field):
    searcher = IndexSearcher(reader)
    hits = searcher.search(query, n).scoreDocs
    print("Found %d hits:" % len(hits))
    for i, hit in enumerate(hits):
        doc = searcher.doc(hit.doc)
        print("%d. %s" % (i + 1, doc.get(field)))
def search(self):
    ''' Searches the given query in the index '''
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # print 'lucene', lucene.VERSION
    # base_dir = os.path.dirname(os.path.abspath('.'))
    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command

        query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            # print 'path:', doc.get("path"), 'name:', doc.get("name")
            print doc
def search():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    args = []
    if request.method == 'POST':
        if request.form['ies']:
            args.append('+ies:' + request.form['ies'])
        if request.form['area']:
            args.append('+area:' + request.form['area'])
        if request.form['professor']:
            args.append('+professor:' + request.form['professor'])
        if request.form['conceito']:
            # args.append('m:'+request.form['conceito']+'d:'+request.form['conceito']+'f:'+request.form['conceito'])
            args.append('m:' + request.form['conceito'])
            args.append('d:' + request.form['conceito'])
            args.append('f:' + request.form['conceito'])

    table = []
    if len(args) > 0:
        scoreDocs = mansearch.buscar('indexer/', args)
        fsDir = SimpleFSDirectory(File(indexDir))
        searcher = IndexSearcher(DirectoryReader.open(fsDir))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            table.append(dict((field.name(), field.stringValue()) for field in doc.getFields()))

    return render_template('busca.html', table=table)
class Searcher(object):

    def __init__(self, **kwargs):
        """ Initialize a new instance of the Searcher

        :param count: The number of results to return from a query
        :param output: The output directory of the underlying index
        """
        self.count = kwargs.get("count", 100)
        self.output = kwargs.get("root", "index")
        self.store = SimpleFSDirectory(File(self.output))
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.searcher = IndexSearcher(DirectoryReader.open(self.store))

    def search(self, query):
        """ Given a query, apply it against the existing index.

        :param query: The query to apply to the index
        :returns: A generator of the matching documents
        """
        query = QueryParser(Version.LUCENE_30, "data", self.analyzer).parse(query)
        results = self.searcher.search(query, self.count)
        for result in results.scoreDocs or []:
            # logger.debug("%s %s %s", hit.score, hit.doc, hit.toString())
            document = self.searcher.doc(result.doc)
            yield document.get("path"), result.score
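A minimal usage sketch for the Searcher class above; it assumes the JVM is already initialized and that the index under "index" stores a "path" field and an analyzed "data" field (both assumptions, not shown in the snippet).

# Usage sketch (not part of the original snippet). Assumes lucene.initVM()
# has already been called and the index at "index" has "path"/"data" fields.
searcher = Searcher(count=10, root="index")
for path, score in searcher.search("apache lucene"):
    print("%s %.3f" % (path, score))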
def get_candidates(qatp):
    if prm.create_index:
        create_index()

    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(prm.index_folder)))
    searcher = IndexSearcher(reader)
    candidates = []
    n = 0
    for q, a, t, p in qatp:
        if n % 100 == 0:
            print 'finding candidates sample', n
        n += 1

        q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
        query = QueryParser(Version.LUCENE_4_10_1, "text", analyzer).parse(QueryParser.escape(q))
        hits = searcher.search(query, prm.max_candidates)
        c = []
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            c.append(doc.get("id"))

        candidates.append(c)

    return candidates
def getIndexSearcher(self):
    indexSearcher = IndexSearcher(self.mIndexReader)
    if self.mSimilarity is not None:
        indexSearcher.setSimilarity(self.mSimilarity)
    return indexSearcher
def query(self, data):
    if self.fil.exists():
        searcher = IndexSearcher(DirectoryReader.open(self.d))
        query = QueryParser(Version.LUCENE_30, "id", self.analyzer).parse(data['query'])
        hits = searcher.search(query, 100000)

        results = {}
        results['totalHits'] = hits.totalHits
        results['hits'] = {}

        for hit in hits.scoreDocs:
            record = {}
            doc = searcher.doc(hit.doc)
            fields = doc.getFields()
            record['score'] = hit.score
            for field in fields:
                if field.name() != "id":
                    record[field.name()] = field.stringValue()
            results['hits'][doc.get('id')] = record

        searcher.getIndexReader().close()
        return results
class IndexAndTaxonomy(object):

    def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
        self._settings = settings
        self._similarity = settings.similarity
        self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
        self._reader = DirectoryReader.open(indexDirectory)
        self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
        self._readerSettingsWrapper = ReaderSettingsWrapper()
        self._readerSettingsWrapper.get = lambda: {
            "similarity": self.searcher.getSimilarity().toString(),
            "numberOfConcurrentTasks": self._numberOfConcurrentTasks,
        }
        self._readerSettingsWrapper.set = self._setReadSettings
        self._searcher = None
        self._executor = None
        self._reopenSearcher = True

    def reopen(self):
        reader = DirectoryReader.openIfChanged(self._reader)
        if reader is None:
            return
        self._reader.close()
        self._reader = reader
        self._reopenSearcher = True
        taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
        if taxoReader is None:
            return
        self.taxoReader.close()
        self.taxoReader = taxoReader

    @property
    def searcher(self):
        if not self._reopenSearcher:
            return self._searcher

        if self._settings.multithreaded:
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor, self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher

    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"], similarity["b"])

        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True

    def close(self):
        self.taxoReader.close()
        self._reader.close()
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)

    fields = ["title", "abstract", "authors"]

    st = PorterStemmer()
    for id, q in queries.iteritems():
        query = q
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(query)
        qwords_k = [st.stem(q) for q in qwords]
        query = " ".join(qwords_k)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)
        MAX = 1000
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc + 1, i + 1, hit.score))
            # print hit.doc + 1, hit.score
            # doc = searcher.doc(hit.doc)
            # print doc.get("authors").encode("utf-8")

    f.close()
def buscar(indexDir, args, options=None):
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    fsDir = SimpleFSDirectory(File(indexDir))
    # Create the searcher over the index directory given by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    # Analyzer used to filter the tokens
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Create a QueryParser that searches the "keywords" field by default,
    # holding the constraints of the search
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)

    # Join the parameters passed in with their values
    command = ' +'.join(args)
    query = parser.parse(command)
    print query

    # Return a JArray with the results of the query
    return searcher.search(query, 200).scoreDocs
class LuceneSearcher(object):
    fields = ['id', 'text', 'types']

    def __init__(self, db_path):
        directory = SimpleFSDirectory(File(db_path))
        reader = DirectoryReader.open(directory)
        self.searcher = IndexSearcher(reader)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        logger.info("Loaded DB from %s with %d documents: ",
                    db_path, reader.numDocs())

    def search(self, query, max_matches=1000):
        query = VALID_CHARS_PATTERN.sub(' ', query)
        logger.debug("Searching for %s", query)
        query = QueryParser(Version.LUCENE_CURRENT, "text",
                            self.analyzer).parse(query)
        score_docs = self.searcher.search(query, max_matches).scoreDocs
        logger.debug("%s total matching documents.", len(score_docs))

        docs = [self.searcher.doc(d.doc) for d in score_docs]
        return [self.convert_to_dict(doc) for doc in docs]

    def convert_to_dict(self, doc):
        return {field: doc.get(field) for field in self.fields}
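A short usage sketch for LuceneSearcher; it assumes lucene.initVM() was called elsewhere and that module-level VALID_CHARS_PATTERN and logger objects exist, as the class implies. The index path and query string are placeholders.

# Usage sketch (assumptions: JVM already started, index at "db" holds
# documents with 'id', 'text' and 'types' fields).
searcher = LuceneSearcher("db")
for record in searcher.search("barack obama", max_matches=10):
    print("%s: %s" % (record['id'], record['types']))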
def search(self, field, text):
    """
    search text within indexed data

    input:
        field   fieldname of the value that will be indexed
        text    text to search

    output:
        hits    return a list of hits
    """
    results = []
    idx_reader = DirectoryReader.open(self.directory)
    idx_searcher = IndexSearcher(idx_reader)

    # parse query
    parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field, self.analyser)
    query = parser.parse(text)

    # search
    hits = idx_searcher.search(query, 1000).scoreDocs.tolist()
    for hit in hits:
        doc = idx_searcher.doc(hit.doc)
        score = hit.score
        title = doc.get(field)
        url = doc.get("url")
        results.append((score, url, title))

    return results
def search(self, input_query=None, max_answers=10):
    ''' Searches the given query in the index '''
    if input_query is None:
        return None

    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(input_query)
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                   (self._posts_fields + self._answer_fields),
                                   analyzer)
    query = MultiFieldQueryParser.parse(parser, input_query)

    scoreDocs = searcher.search(query, max_answers).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dict = dict((field.name(), field.stringValue()) for field in doc.getFields())
        docs.append(doc_dict)
        # print doc
    return docs
def search_docs(self, value, field="general_info"):
    MAX_RESULTS = 1000
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(value)
    topDocs = searcher.search(query, MAX_RESULTS)

    return [searcher.doc(hit.doc) for hit in topDocs.scoreDocs]
def config():
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    bm25Sim = BM25Similarity(2.0, 0.75)  # BM25 with k1 = 2.0, b = 0.75 (Lucene's defaults are k1 = 1.2, b = 0.75)
    searcher.setSimilarity(bm25Sim)
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    return searcher, analyzer
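A sketch of how the searcher/analyzer pair returned by config() might be used; the "contents" field name and the sample query are assumptions for illustration.

# Sketch: run one query with the BM25-configured searcher returned by config().
# The field name "contents" and the query text are assumptions.
searcher, analyzer = config()
query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(u"市场")
for hit in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(hit.doc))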
def SearchQuery(queryString, fields, classification):
    # if __name__ == "__main__":
    # if __name__ == "retriever":
    location = web.__path__[0] + "/static/web/files/index/index.articles"

    # lucene.initVM()
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    # multi-field query: http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    # query.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(query, queryString)
    # query = QueryParser(Version.LUCENE_4_10_1, "abstract", analyzer).parse(queryString)  # "Shigella sonnei"

    MAX = 10000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    paths = []
    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title": doc.get("title")}  # we can add any other field we want...
        documentDict[doc.get("pmcid")] = docDict

    # Where we get the images for all the pmcids
    images = get_image_pmcid(pmcids, classification)  # should take in pmcids and class

    # create dictionary of images with pmcid being their key
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        if img_pmcid in imagesDict.keys():
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
        else:
            imagesDict[img_pmcid] = [(img.get("filepath") + "/" + img.get("figureid"))]

    # for each pmcid, we will assign an image to it for the search results
    for pmcid in pmcids:
        if imagesDict:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = imagesDict[pmcid][0]
            documentDict[pmcid] = docDict
        else:
            docDict = documentDict[pmcid]
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
            documentDict[pmcid] = docDict
    # END - Where we get the images for all the pmcids

    return documentDict
def perform_search(self, searchterm, results_per_page, page):
    # if there is a field in the searchterm
    """if ":" in searchterm:
        # processing a query
        parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
        parser.setDefaultOperator(QueryParser.Operator.AND)
        query = parser.parse(searchterm)
    else:
        query = BooleanQuery()
        query_title = TermQuery(Term("title", searchterm))
        query_description = TermQuery(Term("description", searchterm))
        query_content = TermQuery(Term("content", searchterm))
        # BooleanClause.Occur.MUST for AND queries
        query.add(query_title, BooleanClause.Occur.SHOULD)
        query.add(query_description, BooleanClause.Occur.SHOULD)
        query.add(query_content, BooleanClause.Occur.SHOULD)"""

    # create a QueryParser for each field to be searched
    parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
    parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
    parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

    # put the fields together
    query = BooleanQuery()
    query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
    query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

    # conducting search
    searcher = IndexSearcher(DirectoryReader.open(self.store))

    start = datetime.now()
    hits = searcher.search(query, results_per_page + (results_per_page * page))
    score_docs = hits.scoreDocs
    count_results = hits.totalHits
    duration = datetime.now() - start

    # results to return
    results = []
    count = 0

    for scoreDoc in score_docs:
        # skip offset
        if count < results_per_page * page:
            count += 1
            continue
        count += 1

        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue()) for field in doc.getFields())
        results.append(table)

    return results, duration, count_results
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: different query strings for different fields,
    not the same words on different fields
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of functions

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    # SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
def is_article_indexed(art_id, index='index'):
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, 'art_id', analyzer).parse(str(art_id))
    docs = searcher.search(query, 1).scoreDocs

    return len(docs) > 0
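For example, is_article_indexed() could gate re-indexing; index_article() below is a hypothetical helper and not part of this collection.

# Hedged example: skip articles that are already in the index.
# index_article() is a hypothetical helper, not defined in these snippets.
for art_id in (101, 102, 103):
    if not is_article_indexed(art_id, index='index'):
        index_article(art_id)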
class SearchIndex(object):

    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)

    def search(self, q, page=1, duplicates=False):
        query = self.parser.parse(q)

        if not duplicates:
            query = self.addDuplicatesQuery(query)

        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")

            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight})

        del self.searcher

        totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))

        return totalPages, docs

    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
class WikiPageIndex():
    def __init__(self, index_dir):
        # lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])

        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))

    def createIndex(self):
        self.writer = IndexWriter(self.directory, self.config)

        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)

    def addDocumentToIndex(self, title, text):
        doc = Document()

        doc.add(Field("Title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("Text", text, Field.Store.YES, Field.Index.ANALYZED))

        self.writer.addDocument(doc)

    def closeIndex(self):
        self.writer.commit()
        self.writer.close()

    def searchIndex(self, queryString, field="Text", max_results=100):
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(queryString)
        scoreDocs = self.searcher.search(query, max_results).scoreDocs
        log.debug("Found {0} documents for query [{1}]".format(len(scoreDocs), queryString))

        docs = []
        for scoreDoc in scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            log.debug(WikiPageIndex.cleanWikiText(doc.get("Text")))

            # print("title: {0}\ncontents: {1}".format(doc.get("Title"), doc.get("Text")[:70]))
            docs.append(doc)

        return docs

    @staticmethod
    def cleanWikiText(text):
        text = text.encode('ascii', 'ignore')
        text = re.sub('(\[\[.*?\]\]|\{\{.*?\}\}|\{\|.*?\|\})', '', text)
        text = re.sub('[^\na-zA-Z0-9\n_-]+', ' ', text)
        text = re.sub('([ \t]*[\n]+[ \t]*)+', '\n', text)
        return text.strip()
def search(term, n_docs=10, index='index'):
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, 'art_body', analyzer).parse(term)
    # str(query.getClass().toString()) == "class org.apache.lucene.search.TermQuery"
    score_docs = searcher.search(query, n_docs).scoreDocs

    return [(score_doc.score, unicode(searcher.doc(score_doc.doc).get('art_body')))
            for score_doc in score_docs]
def retrieve_wiki(text_query, index_directory_name):
    lucene.initVM()
    directory = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    txt = text_query
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('contents')
def main(indexDir, inputDir):
    """Creates a SQLite database with news linked to other news by at least one term,
    backed by a Lucene Index"""
    lucene.initVM()

    # Open the index
    logger.info("Opening Lucene index [%s]..." % indexDir)
    dir = SimpleFSDirectory(File(indexDir))
    analyzer = KeywordAnalyzer(Version.LUCENE_CURRENT)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)

    # Search documents
    onlyfiles = [f for f in listdir(inputDir) if isfile(join(inputDir, f)) and f.endswith('.json')]
    rels = list()
    for f in onlyfiles:
        journal_code = f.split('.')[0]
        f = join(inputDir, f)
        json_data = open(f)
        data = json.load(json_data)

        # The results collected after comparison
        for entry in data:
            url = entry['url']
            date = entry['date']
            title = entry['title']

            logger.debug("Processing URL [%s] date [%s] - [%s]" % (url, date, title))

            tt = nltk.word_tokenize(title)
            tokens = []
            for t in tt:
                tokens.append(t.lower())

            for token in tokens:
                q = 'title: "%s" AND date: "%s" AND NOT journal: "%s" AND NOT url: "%s"' % (token, date, journal_code, url)
                query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(q)
                hits = searcher.search(query, MAX_HITS)

                logger.debug("Found %d document(s) that matched query '%s':" % (hits.totalHits, q))

                for hit in hits.scoreDocs:
                    doc = searcher.doc(hit.doc)
                    logger.debug(doc)
                    rels.append({'left': url, 'token': token, 'right': doc.get('url')})
        json_data.close()

    with open('relationships.csv', 'wb') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        for rel in rels:
            csvwriter.writerow([rel['left'].encode('utf8'), rel['token'].encode('utf8'), rel['right'].encode('utf8')])
class PyLucene(object):
    def __init__(self):
        if luceneImport:
            self.lucene = True
        else:
            self.lucene = False

        # Lucene connection
        lucene.initVM()
        indexDir = "texts/index"
        directory = MMapDirectory(File(indexDir))
        directory = DirectoryReader.open(directory)
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.searcher = IndexSearcher(directory)

    def query(self, terms=[]):
        query = QueryParser(Version.LUCENE_30, "text", self.analyzer).parse(" OR ".join(terms))
        MAX = 1000
        hits = self.searcher.search(query, MAX)

        results = []
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            results.append([doc.get("doc_id").encode("utf-8"), doc.get("head").encode("utf-8")])

        return results

    def occurencies(self, term, morphs):
        query = []
        already = []
        for morph in morphs:
            query.append(morph)
            # Sometimes, when a term is ambiguous because of the XML hashing in Lucene,
            # a lemma can appear doubled, e.g. "wordword"
            query.append(morph + morph)

        results = self.query(query)

        resultsReturned = []
        for result in results:
            if result[0] not in already:
                resultsReturned.append(result)
                already.append(result[0])

        return resultsReturned, len(resultsReturned)

    def chunk(self, occurency):
        # Could be updated using the section information, but that may only be a milestone
        return occurency  # , len(occurency)
def searcher(self):
    if not self._reopenSearcher:
        return self._searcher

    if self._settings.multithreaded:
        if self._executor:
            self._executor.shutdown()
        self._executor = Executors.newFixedThreadPool(self._numberOfConcurrentTasks)
        self._searcher = SuperIndexSearcher(self._reader, self._executor, self._numberOfConcurrentTasks)
    else:
        self._searcher = IndexSearcher(self._reader)
    self._searcher.setSimilarity(self._similarity)
    self._reopenSearcher = False
    return self._searcher
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        # feature_type is a list of functions
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0] * len(feature_type)

    # escape special characters via the escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)
    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
def __init__(self, index_dir, index_file, rawQuery):
    self.indexFile = os.path.join(index_dir, index_file)
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # uncomment when running Retrieve separately
    directory = SimpleFSDirectory(File(self.indexFile))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(BM25Similarity(1.2, 0.75))  # set BM25 as the similarity metric, k=1.2, b=0.75

    if 'Standard' in self.indexFile:
        print "Use the StandardAnalyzer"
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # build a standard analyzer with default stop words
    if 'Porter' in self.indexFile:
        print "Use the PorterStemmer analyzer"
        analyzer = PorterStemmerAnalyzer()

    self.run(searcher, analyzer, rawQuery)
    del searcher
def author_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File

    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)

    query = parser.parse(qry)
    print 'Searching for:', query.toString(field)

    raw_results = searcher.search(query, limit)
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print numTotalHits, 'total matching documents'

    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')
        entry = entry_map.get(entry_id)
        short_title = entry['short_title']
        print(entry['prim_author'])

        if qry in entry['prim_author'].lower():
            fname = short_title + CONTENT_EXT
            results[entry_id] = {'title': short_title, 'file': fname}

    f = open('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w')
    f.write(json.dumps(results))
    f.close()
    return json.dumps(results)
from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause

# The following imports are for showing context and highlighting it - Highlighter
from org.apache.lucene.analysis import TokenStream
from org.apache.lucene.index import Term
from org.apache.lucene import search
from org.apache.lucene.search import TermQuery
from org.apache.lucene.search.highlight import Highlighter
from org.apache.lucene.search.highlight import QueryScorer
from org.apache.lucene.search.highlight import SimpleHTMLFormatter
from org.apache.lucene.search.highlight import SimpleSpanFragmenter

# The following imports are for the web interface
import web
from web import form

print 'lucene', lucene.VERSION
vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])

STORE_DIR_PLAYLIST = "Playlist"
dir_playlist = SimpleFSDirectory(File(STORE_DIR_PLAYLIST))
searcher_playlist = IndexSearcher(DirectoryReader.open(dir_playlist))

STORE_DIR_SONGS = "Songs"
dir_songs = SimpleFSDirectory(File(STORE_DIR_SONGS))
searcher_songs = IndexSearcher(DirectoryReader.open(dir_songs))

analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
print "Initialization finished."
def createIndexSearcher(indexDir):
    directory = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)))
    searcher = IndexSearcher(directory)
    similarity = BM25Similarity(K1, B)
    searcher.setSimilarity(similarity)
    return searcher
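A small sketch of calling createIndexSearcher(); K1 and B are module-level constants in the original snippet, so the concrete values, index path, and field name below are assumptions.

# Sketch only: K1, B, the index path, and the "contents" field are assumptions.
K1, B = 1.2, 0.75
searcher = createIndexSearcher("/tmp/my_index")
query = QueryParser("contents", StandardAnalyzer()).parse("lucene scoring")
print(searcher.search(query, 10).totalHits)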
        doc = searcher.doc(scoreDoc.doc)
        # titleHighLight = Highlighter.getBestFragment(analyzer, "title", doc.get("标题"))
        # print titleHighLight
        lis.append([
            doc.get('标题'),       # title
            doc.get('图片'),       # image
            doc.get('评分'),       # rating
            doc.get("上映日期:"),  # release date
            doc.get("类型:")       # genre
        ])
    lis.sort(key=lambda x: x[2], reverse=True)
    # print 'path:', doc.get("path"), \
    #     "title:", doc.get("title"), "url:", doc.get("url"), 'score:', scoreDoc.score
    # print 'explain:', searcher.explain(query, scoreDoc.doc)
    return lis


if __name__ == '__main__':
    STORE_DIR = "index"
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    lis = run(searcher, analyzer)
    for i in lis:
        for j in i:
            print j

    del searcher
class QAsearcher(object):
    def __init__(self, dir):
        self.dir = dir
        self.lReader = DirectoryReader.open(FSDirectory.open(Paths.get(self.dir)))
        self.lSearcher = IndexSearcher(self.lReader)

    # def get_collection_size(self):
    #     return self.lReader.numDocs()

    def doc_search(self, field, keywords, numHits):
        if field != 'All':
            analyzer = StandardAnalyzer()
            parser = QueryParser(field, analyzer)
            query = parser.parse(keywords)
            # self.lReader.getDocCount("title")
            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs
            except RuntimeError:
                print "Scoring documents failed"

            self.hits = hits
            self.field = field
            return hits
        else:
            analyzer = WhitespaceAnalyzer()
            parser = MultiFieldQueryParser(['Title', 'Body'], analyzer)
            query = MultiFieldQueryParser.parse(parser, keywords)
            # self.lReader.getDocCount("title")
            try:
                collector = TopScoreDocCollector.create(numHits)
                self.lSearcher.search(query, collector)
                hits = collector.topDocs().scoreDocs
            except RuntimeError:
                print "Scoring documents failed"

            self.hits = hits
            self.field = field
            return hits

        self.hits = hits
        self.field = field
        return hits

    def print_result(self):
        j = 1
        for i in self.hits:
            print "\nResult " + str(j) + "\tDocID: " + str(i.doc) + "\t Score: " + str(i.score)
            try:
                if self.field == 'All':
                    print "Title: " + self.lReader.document(i.doc).get("Title")
                    print "Body: " + self.lReader.document(i.doc).get("Body")
                if self.field == 'Title':
                    print "Title: " + self.lReader.document(i.doc).get("Title")
                if self.field == 'Body':
                    print "Body: " + self.lReader.document(i.doc).get("Body")
            except RuntimeError:
                print "Search failed"
            j = j + 1
        print j

    def close(self):
        try:
            if self.lReader is not None:
                self.lReader.close()
        except RuntimeError:
            print "Closing the reader failed"
import argparse

parser = argparse.ArgumentParser(description='Execute queries on comment body')
parser.add_argument('user_name', type=str, help="User name (profile to use)")
parser.add_argument('index_dir', metavar='dir', type=str, help="Index directory")
parser.add_argument('--sim', type=str, nargs='?', default="tfidf",
                    help="Similarity (in [tfidf, lm, bm25])")
parser.add_argument('--reorder', type=str, nargs='?', default="no",
                    help="Reordering (in [ups, normups])")
parser.add_argument('--short', action='store_false',
                    help="Don't show the body of comments")
args = parser.parse_args()

if args.sim in ['bm25']:
    similarity = BM25Similarity()
elif args.sim in ['lm']:
    similarity = LMDirichletSimilarity()
else:
    similarity = ClassicSimilarity()

# Sample query
storeDir = SimpleFSDirectory(Paths.get(args.index_dir))
searcher = IndexSearcher(DirectoryReader.open(storeDir))
if similarity is not None:
    searcher.setSimilarity(similarity)
analyzer = StandardAnalyzer()
run(searcher, analyzer, args.user_name, reordering=args.reorder, show_bodies=not args.short)
    array = text.split()
    print array[0], " ", array[1], " ", array[2] + "</a><br>"


if __name__ == '__main__':
    # initialize the VM
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    # get the user's input
    form = cgi.FieldStorage()
    searchTerm = form.getvalue('search')
    K = form.getvalue('kValue')

    # open the index
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer()

    print "results for query: ", searchTerm, "<br>"

    # if the checkbox was checked, try 2, 3, 4, 5, 6 for k and see which one's clusters
    # have the higher silhouette coefficient, then use that number as k to search
    # one more time to get the final results
    if form.getvalue('autoK'):
        bestK = 0
        highestSilhouette = 0
        for counter in range(3, 7):
            if counter == 2:
                highestSilhouette = getSilhouette(reader, searcher, analyzer, searchTerm, counter)
                bestK = counter
            else:
                silhouette = getSilhouette(reader, searcher, analyzer, searchTerm, counter)
def getSearcher(self, store):
    return IndexSearcher(DirectoryReader.open(store))
class GitHubSearcher:
    def __init__(self, index_path, query=None):
        self.index_path = index_path
        self.reader = None
        self.query = query
        self.porter_analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT))
        self.load_index()

    def load_index(self):
        indexDir = File(self.index_path)
        a = {"code": self.porter_analyzer}
        self.analyzer = PerFieldAnalyzerWrapper(KeywordAnalyzer(), a)
        index = SimpleFSDirectory(indexDir)
        self.reader = IndexReader.open(index)  # note: the IndexReader is opened here and was never closed
        n_docs = self.reader.numDocs()
        self.searcher = IndexSearcher(self.reader)
        print("Index contains %d documents." % n_docs)

    def get_DF(self, field, term):
        return self.reader.docFreq(Term(field, term))

    def get_IDF(self, field, term):
        from math import log10, sqrt
        docF = self.reader.docFreq(Term(field, term))
        return log10(self.reader.numDocs() / (docF + 1)) + 1

    def get_minimum_IDF(self, docF=2):
        from math import log10, sqrt
        return log10(self.reader.numDocs() / (docF + 1)) + 1

    def document_to_query(self, doc):
        """ Given a document, transform the source-code-related fields into a lucene query string """
        query = ""
        for field in ["typed_method_call", "methods", "used_classes",
                      "class_instance_creation", "methods_called",
                      "annotations", "literals"]:  # "used_classes", "literals", "extends"
            for val in doc.getFields(field):
                if val.stringValue().strip():
                    term = QueryParser.escape(val.stringValue())

                    # Filter out noisy terms
                    stoplist = ["java.lang.Object"]
                    if term not in stoplist:
                        # idf = self.get_IDF(field, term)
                        # print self.get_DF(field, term), term, field
                        # query += "%s:%s^%s " % (field, term, idf)
                        query += "%s:%s " % (field, term)
                        # print "term: %s idf: %s" % (term, self.get_minimum_IDF())
                        # query += "%s:%s " % (field, term)
                        # print "%s:%s^%s" % (field, term, self.getIDF(field, term))

        # for hint in doc.getFields("code_hints"):
        #     tokens = utils.tokenize(hint.stringValue())
        #     for token in tokens:
        #         # print token
        #         token = QueryParser.escape(token)
        #         if token.strip():
        #             print "HINTS", token
        #             query += "code:%s^5.0 " % (token)

        if len(doc.getFields("code_hints")) > 0:
            hints = [hint.stringValue() for hint in doc.getFields("code_hints")]
            hints_str = " ".join(hints)
            for term in hints:
                if term:
                    term = QueryParser.escape(term)
                    print "TERM", term
                    # if term[0].isupper():
                    #     query += "used_classes:%s^5.0 class_instance_creation:%s^5.0 " % (term, term)
                    # elif "(" in term or "." in term or "#" in term:  # Heuristic to boost only code identifiers
                    #     query += "methods:%s^5.0 methods_called:%s^5.0 " % (term, term)
                    # query += "code:%s^5.0 " % (term)

        return query

    def get_matched_keywords(self, query, docid):
        matched_terms = []
        # def _get_matched_keywords(q, matched_terms):
        #     print type(q), matched_terms
        #     if isinstance(q, TermQuery):
        #         if self.searcher.explain(q, docid).isMatch():
        #             matched_terms.append(q.getTerm().text())
        #     elif isinstance(q, BooleanQuery):
        #         for query_term in query.getClauses():
        #             _get_matched_keywords(query_term, matched_terms)
        #             # if self.searcher.explain(query_term.getQuery(), docid).isMatch():
        #             #     matched_terms.append(query_term.getQuery().getTerm().text())
        # _get_matched_keywords(query, matched_terms)

        if isinstance(query, TermQuery):
            if self.searcher.explain(query, docid).isMatch():
                matched_terms.append(query.getTerm().text())
        elif isinstance(query, BooleanQuery):
            for query_term in query.getClauses():
                if self.searcher.explain(query_term.getQuery(), docid).isMatch():
                    matched_terms.append(query_term.getQuery().getTerm().text())

        # print "Matched Terms: %s" % matched_terms
        return matched_terms

    def get_matched_keywords2(self, query, doc):
        matched_terms = []
        weight_expl = self.searcher.explain(query, doc).toString().split("weight(")
        for expl in weight_expl:
            if " in " in expl:
                field_val = expl.split(" in ")[0]
                # field, val = field_val.split(":")
                val = field_val.split(":")[-1]
                matched_terms.append(val)
        return matched_terms

    def code_as_text(self):
        """ Extends a query by matching query keywords in source code as text """
        query = " "
        for term in tokenize_string(self.porter_analyzer, self.query):
            if term:
                term = QueryParser.escape(term)
                query += "code:%s " % (term)
        return query

    def lexical_search(self):
        """ In case no term matches on StackOverflow, perform a simple lexical search on GitHub """
        github_result = []
        query = self.code_as_text().strip()
        query = QueryParser(Version.LUCENE_CURRENT, "code", self.analyzer).parse(query)
        hits = self.searcher.search(query, 10).scoreDocs

        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            matched_terms = self.get_matched_keywords(query, hit.doc)
            # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
            item = GithubResultItem(doc.get("file"), decompress(doc.get("file_content")),
                                    matched_terms, hit.score, so_item,
                                    doc.get("line_numbers"), hit.doc)  # code
            github_result.append(item)

        return github_result

    def more_like_this(self, so_items):
        github_result = []
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))

        for so_item in so_items:
            queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer)
            query = ""
            if so_item.doc:
                query = self.document_to_query(so_item.doc)

            query += self.code_as_text()

            if query:
                print "-" * 30
                print "Query: %s" % query
                print "-" * 30
                try:
                    like_query = queryparser.parse(query)
                    hits = self.searcher.search(like_query, 10).scoreDocs

                    for i, hit in enumerate(hits):
                        doc = self.searcher.doc(hit.doc)
                        matched_terms = self.get_matched_keywords2(like_query, hit.doc)
                        # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
                        item = GithubResultItem(doc.get("file"), decompress(doc.get("file_content")),
                                                matched_terms, hit.score, so_item,
                                                doc.get("line_numbers"), hit.doc)  # code
                        github_result.append(item)
                        # print("%d. File: %s, Matched: %s, Score: %s" % (i + 1, doc.get("file"), matched_terms, hit.score))
                except Exception as e:
                    print "Error: %s" % e
        # print Counter(files).most_common(5)
        return github_result

    def more_like_this2(self, so_items):
        if not so_items:
            so_items.append(SOResultItem(None, 1.0, "No Title", 0, ""))

        query = ""
        queryparser = QueryParser(Version.LUCENE_CURRENT, "typed_method_call", self.analyzer)

        # The loop below builds the augmented query
        for so_item in so_items:
            if so_item.doc:
                query += self.document_to_query(so_item.doc)
            query += self.code_as_text()

        github_result = []
        if query:
            print "-" * 50
            print "Unified Query: %s" % query
            print "-" * 50
            try:
                # Final conversion into a query that Lucene accepts
                like_query = queryparser.parse(query)

                # The line below searches the GitHub indices against like_query and keeps the top 5 hits
                hits = self.searcher.search(like_query, 5).scoreDocs

                for i, hit in enumerate(hits):
                    doc = self.searcher.doc(hit.doc)
                    matched_terms = self.get_matched_keywords2(like_query, hit.doc)
                    print "Matched Terms : ", matched_terms

                    # apis = [d.stringValue() for d in doc.getFields("typed_method_call")]
                    print("file", doc.get("file"), "file_content",
                          doc.get("file_content"), "line_numbers", doc.get("line_numbers"))

                    file_path = doc.get("file")
                    # file_path = "" + doc.get("file")[24:]
                    # file_path = "/root/GitSearch" + doc.get("file")[24:]
                    # print(doc.get("file")[32:])
                    # print(doc.get("file")[0:])
                    # print(file_path)

                    content = None
                    try:
                        # walk down the actual project path, open the file and read it into content
                        with open(file_path) as f:
                            content = f.read()
                    except:
                        pass

                    # If the file is found and has content, wrap it into a result item
                    if content:
                        item = GithubResultItem(doc.get("file"), content, matched_terms,
                                                hit.score, so_item,
                                                doc.get("line_numbers"), hit.doc)  # code
                        github_result.append(item)
            except Exception as e:
                print "GitSearcher: Error: %s" % e
                print(traceback.format_exc())

        return github_result
def __init__(self, index_path):
    indexDir = File(index_path)
    index = SimpleFSDirectory(indexDir)
    self.reader = IndexReader.open(index)
    n_docs = self.reader.numDocs()
    self.searcher = IndexSearcher(self.reader)
def main(): """Function to index negative situations and retrive based on input sentence""" all_sent_df = pd.read_csv("../data/sentiment_data.csv") neg = all_sent_df[all_sent_df["label"] == 1] all_neg_phrases = list(neg["phrase"]) with open("../data/negSituations.txt", "r") as fpointer: all_neg_situations = fpointer.readlines() all_neg_situations = map(lambda s: s.strip(), all_neg_situations) all_neg_phrases = map(lambda s: s.strip(), all_neg_phrases) lucene.initVM() analyzer = StandardAnalyzer() path = Paths.get('negSituationIndex') directory = SimpleFSDirectory(path) writer_config = IndexWriterConfig(analyzer) writer = IndexWriter(directory, writer_config) print(writer.numDocs()) # INDEXING ALL DOCUMENTS/ARTICLES IN THE CORPUS for each in all_neg_situations: document = Document() document.add(Field("negativeSituations", each, TextField.TYPE_STORED)) writer.addDocument(document) print(writer.numDocs()) writer.close() analyzer = StandardAnalyzer() reader = DirectoryReader.open(directory) searcher = IndexSearcher(reader) # QUERYING FOR A QUESTION with open("../data/negative_situation_to_retrieve.txt", "r") as fpointer: all_test_sent = fpointer.readlines() all_test_sent = map(lambda s: s.strip(), all_test_sent) query_parser = QueryParser("negativeSituations", analyzer) total_num = 0 tic = time.time() all_ans = [] for each in all_test_sent: total_num = total_num + 1 if total_num % 1000 == 0: print(total_num, time.time() - tic) query = query_parser.parse(query_parser.escape(each)) hits = searcher.search(query, 3) docs_scores = [hit.score for hit in hits.scoreDocs] current_ans = [] if docs_scores != []: for hit in hits.scoreDocs: doc_t = searcher.doc(hit.doc) doc_text = doc_t.get("negativeSituations") current_ans.append(doc_text) else: continue current_ans = list(set(current_ans)) all_ans.append(current_ans) print(all_ans)
def main(): global lucene_vm_init if not lucene_vm_init: lucene.initVM(vmargs=['-Djava.awt.headless=true']) lucene_vm_init = True is_index_Exist = os.path.exists(LUCENE_INDEX_DIR) # specify index path index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR)) # configure search engine analyzer = StandardAnalyzer() config = IndexWriterConfig(analyzer) # load index to search engine reader = DirectoryReader.open(index_mm) searcher1 = IndexSearcher(reader) searcher1.setSimilarity(BM25Similarity()) searcher2 = IndexSearcher(reader) w = IndexWriter(index_mm,config) # read query read_query() # initialize mongodb client mongoObj=Mongo_Object('localhost',27017) # search docDup=set() finalDup={} for i in xrange(len(queries)): print 'process query %d' %(i) query = queries[i] querystr = stemSentence(query[3]) # build searcher q_lucene = QueryParser("all_text", analyzer).parse(querystr) collector = TopScoreDocCollector.create(hitsPerPage); searcher1.search(q_lucene, collector); hits = collector.topDocs().scoreDocs; # find candidate results after 1st round filter docDup.clear() for j in xrange(len(hits)): docID=hits[j].doc d=searcher1.doc(docID) if d['title'] in docDup: finalDup[d['title']]=d continue docDup.add(d['title']) docDup.clear() for j in xrange(len(hits)): docID=hits[j].doc d=searcher1.doc(docID) title=d['title'] if d['title'] in docDup: continue docDup.add(title) item=(mongoObj.conn_me).find_one({'title':title}) if item is None: continue entitylist=item['entitylist'].split('|') for en_title in entitylist: if title==en_title: continue t=Term('title',en_title) q=TermQuery(t) docs=searcher2.search(q,2) if docs.totalHits<=1: continue docID2=(docs.scoreDocs)[0].doc doc=searcher2.doc(docID2) finalDup[doc['title']]=doc print 'begin to clean index, there are %d dup records' %(len(finalDup)) for title in finalDup: doc=finalDup[title] # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract name=doc['name'] value=doc['value'] category=doc['category'] skos_category=doc['skos_category'] all_text=doc['all_text'] raw_name=doc['raw_name'] raw_value=doc['raw_value'] abstract=doc['abstract'] print 'process '+title t=Term('title',title) q=TermQuery(t) w.deleteDocuments(q) addDoc(w,title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract) # process remaining records #global batch,cnt_batch #if cnt_batch>0: #w.addDocuments(batch) #cnt_batch=0 #del batch[:] w.close()
    index_reader = DirectoryReader.open(index.store)

    # get the vocabulary size
    terms = MultiFields.getTerms(index_reader, 'contents')
    termEnum = terms.iterator()
    vocabCounter = 0
    for term in BytesRefIterator.cast_(termEnum):
        vocabCounter += 1
    print("Number of docs:", index_reader.numDocs())
    print("Vocab size:", vocabCounter)

    # print min, max, mean
    querystr = 'بازار بزرگ تهران'
    print("Query: ", querystr)
    q = QueryParser("contents", index.analyzer).parse(querystr)

    hitsPerPage = 20
    searcher = IndexSearcher(index_reader)
    docs = searcher.search(q, hitsPerPage)
    hits = docs.scoreDocs

    for i, hit in enumerate(hits):
        docId = hit.doc
        score = hit.score
        d = searcher.doc(docId)
        print("Query result(%d): %s , Similarity: %g" % ((i + 1), d.get("id"), score))

    print("Overall time elapsed: ", (timeit.default_timer() - start_time))
except Exception as e:
    print(e)
    raise e
class Searcher(): """A simple interface to search articles. In this class `MultiFieldQueryParse`, `DuplicateFilter` are used to accomplish our application: query should apply on multiple fields, duplication should be avoid. """ def __init__(self, index_dir, search_fields=['canonical_url', 'title', 'meta', 'content'], unique_field='uq_id_str', boost=dict(canonical_url=4.0, title=8.0, meta=2.0, content=1.0), date_format='%Y-%m-%dT%H:%M:%S'): """Constructor of Searcher. Parameters ---------- index_dir : string The location of lucene index. search_fields : list A list of field names indicating fields to search on. unique_field : string The field name, on which the duplication should avoid. boost : dict This dict control the weight when computing score. date_format : string Convert the string into datetime. Should consistent with the index part. """ self.index_dir = index_dir self.search_fields = search_fields self.sort_by_recent = Sort( SortField('date_published', SortField.Type.STRING, True)) self.store = FSDirectory.open(Paths.get(index_dir)) self.reader = DirectoryReader.open(self.store) self.isearcher = IndexSearcher(self.reader) self.analyzer = StandardAnalyzer() self.boost_map = HashMap() for k, v in boost.items(): self.boost_map.put(k, Float(v)) self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer, self.boost_map) self.date_format = date_format def query_between_dates(self, dt1, dt2, original_query=None): '''Update the given query to only allow records between dt1 and dt2.''' return TermRangeQuery( 'date_published', # Field BytesRef(dt1.strftime(self.date_format)), # Lower bound BytesRef(dt2.strftime(self.date_format)), # Upper bound True, # Include lower bound True # Include upper bound ) def refresh(self): """Refresh the searsher, if index is changed.""" nireader = DirectoryReader.openIfChanged(self.reader) if nireader: self.reader.close() self.reader = nireader self.isearcher = IndexSearcher(self.reader) logger.debug('Index file changed, freshed') else: logger.debug('Index file did not change.') def fetch_one_doc(self, score_doc): """Fetch one document from the scored doc results.""" doc = self.isearcher.doc(score_doc.doc) return ( doc.getField("group_id").numericValue().intValue(), doc.get("canonical_url"), doc.get("title"), doc.get("date_published"), doc.get("domain"), doc.get("site_type"), score_doc.score, ) def search(self, query, n1=100, n2=100000, sort_by='relevant', use_lucene_syntax=False, min_score_of_recent_sorting=0.4, min_date_published=None): """Return the matched articles from lucene. Parameters ---------- query : string The query string. n1 : int How many result finally returned. n2 : int How many search results returned when sort by recent. sort_by : string {'relevant', 'recent'}, the sorting order when doing lucene searching. min_score_of_recent_sorting : float The min score when sorting by 'recent'. min_date_published : datetime The min date_published when filtering lucene searching results. Returns ------- tuple (total_hits, df), where total_hits represents the total number of hits and df is a pandas.DataFrame object. 
df.columns = ['id', 'canonical_url', 'title', 'date_published', 'domain', 'site_type', 'score'] """ if min_date_published is not None: dt2 = datetime.utcnow() if isinstance(min_date_published, datetime): dt1 = min_date_published elif isinstance(min_date_published, str): dt1 = utc_from_str(min_date_published) q_dates = self.query_between_dates(dt1, dt2) try: if use_lucene_syntax is False: query = clean_query(query) q = self.mul_parser.parse(self.mul_parser, query) logger.warning(q) if 'date_published:' in query: end = query.find('AND date_published') q_without_date_publushed = query[:end] logger.warning(q_without_date_publushed) q = self.mul_parser.parse(self.mul_parser, q_without_date_publushed) date_published_splits = query.split('date_published:[') date_range = date_published_splits[len(date_published_splits) - 1] date_range = date_range[:-1] logger.warning(date_range) if 'TO' in date_range: date_range_splits = date_range.split('TO') dt1_string = date_range_splits[0] # handling when regex presents if '*' in dt1_string: date1_end = dt1_string.find('*') - 1 dt1_string = dt1_string[:date1_end] logger.warning(dt1_string) dt1 = utc_from_str(dt1_string) dt2_string = date_range_splits[1] if '*' in dt2_string: date2_end = dt2_string.find('*') - 1 dt2_string = dt2_string[:date2_end] logger.warning(dt2_string) dt2 = utc_from_str(dt2_string) query_dates = self.query_between_dates(dt1, dt2) q = combine_queries(q, query_dates) if min_date_published is not None: q = combine_queries(q, q_dates) logger.warning('Parsed query: %s', q) except Exception as e: logger.error(e) if use_lucene_syntax is True: raise APIParseError("""Error when parse the query string! \ You are quering with lucene syntax, be careful of your query string!""") else: raise APIParseError('Error when parse the query string!') cnames = [ 'id', 'canonical_url', 'title', 'date_published', 'domain', 'site_type', 'score' ] if sort_by == 'relevant': top_docs = self.isearcher.search(q, n1) score_docs = top_docs.scoreDocs total_hits = top_docs.totalHits if total_hits == 0: df = pd.DataFrame() else: records = [self.fetch_one_doc(sd) for sd in score_docs] # Index in each record of canonical URL and title canonical_url, title = 1, 2 # Store 2-tuples of (site, article title) as keys in dict then # turn back to list unique_docs = dict() for record in records: key = (record[canonical_url], record[title]) if key not in unique_docs: unique_docs[key] = record # Include only unique records records = list(unique_docs.values()) df = pd.DataFrame(records, columns=cnames) df['date_published'] = pd.to_datetime(df['date_published']) return total_hits, df elif sort_by == 'recent': counter = 0 records = [] top_field_docs = self.isearcher.search(q, n2, self.sort_by_recent, True, True) if top_field_docs.maxScore >= min_score_of_recent_sorting: for sd in top_field_docs.scoreDocs: if sd.score >= min_score_of_recent_sorting: records.append(self.fetch_one_doc(sd)) counter += 1 if counter == n1: break if counter == 0: df = pd.DataFrame() else: df = pd.DataFrame(records, columns=cnames) df['date_published'] = pd.to_datetime(df['date_published']) return counter, df
def getSearcher(self, directory=None, reader=None):
    if reader is not None:
        return IndexSearcher(reader)
    return IndexSearcher(self.getReader(directory=directory))
def __init__(self, lucene_index_dir='/data/zjy/csqa_data/lucene_dir/'):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = SimpleFSDirectory(File(lucene_index_dir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
    self.num_docs_to_return = 5
    self.ireader = IndexReader.open(directory)
elif args.mode == 'no_e':
    args.input_test_file = pre + 'no_e.json'

import lucene, time
import nltk
from java.io import File
from org.apache.lucene.store import FSDirectory
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.util import Version

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
directory = FSDirectory.open(File(index_dir))
searcher = IndexSearcher(DirectoryReader.open(directory))
parser = QueryParser(Version.LUCENE_CURRENT, "sentence", analyzer)

english_punctuations = [
    ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#',
    '$', '%', "''", '``', "'s", "-", "--", '–'
]
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(english_punctuations)
stopwords.remove('by')

with open(raw2Q_file, 'r') as f:
    raw2Q = json.load(f)
with open(Qlabel_dict_file, 'r') as f:
    Qlabel_dict = json.load(f)
def init_lucene(dir_path):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    store = SimpleFSDirectory(Paths.get(dir_path))
    searcher = IndexSearcher(DirectoryReader.open(store))
    # store.close()
    return searcher
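A short usage sketch for init_lucene; the index path, field name, and query text below are placeholders, and the analyzer must match whatever was used at indexing time.

# Hedged usage sketch; path, field name, and query are placeholders.
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser

searcher = init_lucene('/path/to/index')
query = QueryParser('contents', StandardAnalyzer()).parse('hello world')
for sd in searcher.search(query, 10).scoreDocs:
    doc = searcher.doc(sd.doc)
    print(doc.get('contents'))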
def __init__(self, dir):
    self.dir = dir
    self.lReader = DirectoryReader.open(
        FSDirectory.open(Paths.get(self.dir)))
    self.lSearcher = IndexSearcher(self.lReader)
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.util import Version

indexDir = './index'
query = 'null'
lucene.initVM()
print 'lucene', lucene.VERSION

keyWords = ["学术论坛", "媒体", "校园新闻", "创新", "机构设置", "教学在线", "科研处", "招生"]

# Location of the stored index
indir = SimpleFSDirectory(Paths.get(indexDir))
# Analyzer
analyzer = StandardAnalyzer()
# Searcher
searcher = IndexSearcher(DirectoryReader.open(indir))

for i in range(0, 8):
    keyword = keyWords[i]
    query = QueryParser('contents', analyzer).parse(keyword)
    # Run the search
    hits = searcher.search(query, 100)
    print 'Number of hits:', hits.totalHits
    f = open("result.txt", "a")
    f.write("\nTD" + str(i) + " " + keyword)
    f.write('\n')
    f.close()
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        temp = doc.get('url') + " " + str(hit.score)
        f = open("result.txt", "a")
def __recs_query(self, positive_rated_document_list, scores, recs_number,
                 items_directory, candidate_list: List) -> pd.DataFrame:
    """
    Builds a query using the contents that the user liked. The terms relative
    to the contents that the user liked are boosted by the rating he/she gave.
    A filter clause is added to the query to consider only candidate items.

    Args:
        positive_rated_document_list: List of contents that the user liked
        scores: Ratings given by the user
        recs_number: How many items must be recommended. You can only specify
            the number, not a specific item for which to compute the prediction
        items_directory: Directory where the items are stored

    Returns:
        score_frame (pd.DataFrame): DataFrame containing the recommendations
            for the user
    """
    BooleanQuery.setMaxClauseCount(2000000)
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(Paths.get(items_directory))))
    if self.__classic_similarity:
        searcher.setSimilarity(ClassicSimilarity())

    field_list = searcher.doc(positive_rated_document_list[0]).getFields()
    user_fields = {}
    field_parsers = {}
    analyzer = SimpleAnalyzer()
    for field in field_list:
        if field.name() == 'content_id':
            continue
        user_fields[field.name()] = field.stringValue()
        field_parsers[field.name()] = QueryParser(field.name(), analyzer)

    positive_rated_document_list.remove(positive_rated_document_list[0])

    for _ in positive_rated_document_list:
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] += field.stringValue()

    logger.info("Building query")

    query_builder = BooleanQuery.Builder()
    for score in scores:
        for field_name in user_fields.keys():
            if field_name == 'content_id':
                continue
            field_parsers[field_name].setDefaultOperator(
                QueryParser.Operator.OR)
            field_query = field_parsers[field_name].escape(
                user_fields[field_name])
            field_query = field_parsers[field_name].parse(field_query)
            field_query = BoostQuery(field_query, score)
            query_builder.add(field_query, BooleanClause.Occur.SHOULD)

    if candidate_list is not None:
        id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                      for content_id in candidate_list)
        id_query = QueryParser("testo_libero",
                               KeywordAnalyzer()).parse(id_query_string)
        query_builder.add(id_query, BooleanClause.Occur.MUST)

    query = query_builder.build()
    docs_to_search = len(positive_rated_document_list) + recs_number
    scoreDocs = searcher.search(query, docs_to_search).scoreDocs

    logger.info("Building score frame to return")

    recorded_items = 0
    columns = ['to_id', 'rating']
    score_frame = pd.DataFrame(columns=columns)
    for scoreDoc in scoreDocs:
        if recorded_items >= recs_number:
            break
        if scoreDoc.doc not in positive_rated_document_list:
            doc = searcher.doc(scoreDoc.doc)
            item_id = doc.getField("content_id").stringValue()
            recorded_items += 1
            score_frame = pd.concat([
                score_frame,
                pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                          columns=columns)
            ])
    return score_frame
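The core pattern in the method above, parse each profile field into a query, wrap it in a BoostQuery weighted by the user's rating, and OR the clauses together, can be isolated as a small helper; the field contents and rating used here are illustrative only.

# Hedged sketch of the boosted-profile-query pattern; inputs are illustrative.
from org.apache.lucene.analysis.core import SimpleAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import BooleanClause, BooleanQuery, BoostQuery

def boosted_profile_query(user_fields, rating):
    """Build one SHOULD clause per field, each boosted by the user's rating."""
    analyzer = SimpleAnalyzer()
    builder = BooleanQuery.Builder()
    for field_name, text in user_fields.items():
        parser = QueryParser(field_name, analyzer)
        parser.setDefaultOperator(QueryParser.Operator.OR)
        parsed = parser.parse(parser.escape(text))
        builder.add(BoostQuery(parsed, rating), BooleanClause.Occur.SHOULD)
    return builder.build()

profile_query = boosted_profile_query({'plot': 'space opera rebels empire'}, 4.5)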
directory = RAMDirectory()
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
analyzer = LimitTokenCountAnalyzer(analyzer, 1000000)
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(directory, config)

i = 0
for file in files:
    print i, len(files)
    i = i + 1
    doc = create_document(file)
    writer.addDocument(doc)
writer.close()

searcher = IndexSearcher(DirectoryReader.open(directory))

file_expanded_queries = open('expanded_query', 'wb')
file_expanded_queries_vector = open('query_vector', 'wb')
file_expanded_queries_performance = open('Performance_after_query_expansion', 'wb')
file_queries = open('query.txt', 'r')
lines = file_queries.readlines()

avg_prec = 0
avg_recall = 0
avg_fScore = 0
avg_prec_new = 0
avg_recall_new = 0
avg_fScore_new = 0

for query in lines:
    current_query = query[5:].strip()
def __init__(self, folder='modern_index'):
    self.chSearcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(folder + '/chinese'))))
    self.enSearcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(folder + '/english'))))
    self.Analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
def getSecondarySearcher(self):
    if len(self.searchers) < 2:
        self.searchers.append(IndexSearcher(self.reader))
    return self.searchers[1]
subfolder_index_pairs = [
    ('cluster_w2v', 'index_w2v'),
    ('cluster_w2v_n', 'index_w2v_n'),
    ('cluster_tfidf', 'index_tfidf'),
    ('cluster_tfidf_n', 'index_tfidf_n')
]

search_term = sys.argv[1]

for cluster_type, index in subfolder_index_pairs:
    print(f'searching in {index}')
    path = Paths.get(f'../data/indices/{index}')
    reader = DirectoryReader.open(SimpleFSDirectory(path))
    searcher = IndexSearcher(reader)
    query = QueryParser("content", analyzer).parse(search_term)
    MAX = 1000000
    hits = searcher.search(query, MAX)

    month_counter = [
        [0] * 32,
        [0] * 32,
        [0] * 32
    ]
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        month = int(doc.get('day').split('_')[0])
class LuceneInterface:

    def __init__(self, indexPath, initialHeap, maxHeap):
        lucene.initVM(initialheap=initialHeap, maxheap=maxHeap)
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.indexPath = indexPath
        self.indexDir = SimpleFSDirectory(File(self.indexPath))

    def open(self):
        debug('Opening index "%s"' % self.indexPath)
        self.reader = IndexReader.open(self.indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.totalDocs = self.getTotalSentenceCount()

    def close(self):
        self.reader.close()

    def getTFForField(self, field):
        tfs = {}
        fields = MultiFields.getFields(self.reader)
        terms = fields.terms(field)
        enum = BytesRefIterator.cast_(terms.iterator(None))
        try:
            while enum.next():
                termval = TermsEnum.cast_(enum)
                termString = termval.term().utf8ToString()
                freq = self.reader.totalTermFreq(Term(field, termString))
                tfs[termString] = freq
        except:
            pass
        return tfs

    def getTotalSentenceCount(self):
        return self.reader.numDocs()

    def sentenceCountForQuery(self, query, field='text'):
        qp = QueryParser(Version.LUCENE_CURRENT, field,
                         self.analyzer).parse(query)
        collector = TotalHitCountCollector()
        self.searcher.search(qp, collector)
        return collector.getTotalHits()

    def getIntersectionCount(self, query, countTermString, sfield, cfield):
        qp = MultiFieldQueryParser.parse(
            Version.LUCENE_CURRENT, [query, countTermString],
            [sfield, cfield],
            [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST],
            self.analyzer)
        collector = TotalHitCountCollector()
        self.searcher.search(qp, collector)
        return collector.getTotalHits()

    # Return a list of records, where each record is a dictionary;
    # the keys are the field names in Lucene.
    def search(self, query, field, maxReturnLimit):
        qp = QueryParser(Version.LUCENE_CURRENT, field,
                         WhitespaceAnalyzer(Version.LUCENE_CURRENT)).parse(query)
        hits = self.searcher.search(qp, maxReturnLimit)
        result = []
        for hit in hits.scoreDocs:
            record = dict()
            doc = self.searcher.doc(hit.doc)
            record["id"] = doc.get("id")
            record["pos"] = doc.get("pos")
            record["hallmarks"] = doc.get("hallmarks").split()
            record["text"] = doc.get("text")
            result.append(record)
        return result

    def searchGivenHallmarks(self, query, hallmarksList, hallmarksField,
                             maxReturnLimit):
        qList = [query]
        qList.extend(hallmarksList)
        fList = ["text"]
        fList.extend([hallmarksField] * len(hallmarksList))
        flagList = [BooleanClause.Occur.MUST]
        flagList.extend([BooleanClause.Occur.MUST] * len(hallmarksList))
        qp = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, qList,
                                         fList, flagList, self.analyzer)
        hits = self.searcher.search(qp, maxReturnLimit)
        result = []
        for hit in hits.scoreDocs:
            record = dict()
            doc = self.searcher.doc(hit.doc)
            record["id"] = doc.get("id")
            record["pos"] = doc.get("pos")
            record["hallmarks"] = doc.get("hallmarks").split()
            record["text"] = doc.get("text")
            result.append(record)
        return result
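A minimal usage sketch for LuceneInterface, assuming the imports the class relies on (lucene, StandardAnalyzer, SimpleFSDirectory, the query parsers, and a debug logging helper) are already in scope; the index path, heap sizes, queries, and field names are placeholders.

# Hedged usage sketch; path, heap sizes, queries, and fields are placeholders.
iface = LuceneInterface('/path/to/index', initialHeap='512m', maxHeap='4g')
iface.open()
total_sentences = iface.getTotalSentenceCount()
n_matches = iface.sentenceCountForQuery('apoptosis', field='text')
records = iface.search('tumor suppressor', 'text', maxReturnLimit=100)
iface.close()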
def main():
    # constants
    FIELD_CONTENTS = "vectext"
    DOC_NAME = "identifier"
    STORE_DIR = "../full_index1"

    lucene.initVM()
    store = SimpleFSDirectory(Paths.get(STORE_DIR))
    ireader = DirectoryReader.open(store)
    searcher = IndexSearcher(ireader)

    pickle_file = glob.glob('full_word_list.pkl')
    print(pickle_file)

    date_range = (1785, 1805)
    bigrams = False
    remake_word_list = True
    if remake_word_list:
        full_df = get_full_df()
        full_term_data = []
        for year in range(date_range[0], date_range[1]):
            docs_in_year = get_docs_in_year(full_df, year)
            year_dict = Counter({})
            terms = []
            freqs = []
            print(year)
            for cd, doc_id in enumerate(docs_in_year):
                # get the document for this identifier
                q = TermQuery(Term("identifier", doc_id + '_djvu.txt'))
                topDocs = searcher.search(q, 50000)
                one_doc = topDocs.scoreDocs[0].doc
                doc_name = searcher.doc(one_doc)
                if not bigrams:
                    # collect term/frequency pairs from the stored term vector
                    termvec = ireader.getTermVector(topDocs.scoreDocs[0].doc,
                                                    FIELD_CONTENTS)
                    if termvec is not None:
                        termsEnum = termvec.iterator()
                        for term in BytesRefIterator.cast_(termsEnum):
                            terms.append(term.utf8ToString())
                            freqs.append(termsEnum.totalTermFreq())
                else:
                    # count bigrams from the stored text
                    text = doc_name.get("text")
                    text = text.split()
                    text = strip_stopwords_punc(text)
                    for word1, word2 in zip(text[:-1], text[1:]):
                        if len(word1) + len(word2) > 6:
                            try:
                                year_dict[word1 + ' ' + word2] += 1
                            except:
                                year_dict[word1 + ' ' + word2] = 1
            if not bigrams:
                for term, freq in zip(terms, freqs):
                    try:
                        year_dict[term] += freq
                    except:
                        year_dict[term] = freq
            print(len(year_dict))
            # drop hapax terms
            for term in list(year_dict):
                if year_dict[term] < 2:
                    year_dict.pop(term)
            full_term_data.append(year_dict)
            print(len(year_dict))
            print('\n\n')
        if bigrams:
            pickle.dump(full_term_data, open('full_bigram_list.pkl', 'wb'))
        else:
            pickle.dump(full_term_data, open('full_word_list.pkl', 'wb'))
    else:
        if bigrams:
            full_term_data = pickle.load(open('full_bigram_list.pkl', 'rb'))
        else:
            full_term_data = pickle.load(open('full_word_list.pkl', 'rb'))

    # rank terms by year from the complete list of unique words
    # top_words_year = zscore_method(full_term_data, date_range)
    top_words_year = tfidf_method(full_term_data, date_range)
    print(top_words_year)
    pickle.dump(top_words_year, open('trending_ratio.pkl', 'wb'))
def predict_test(indexed_data, index_destination, source='directory',
                 already_indexed=False):
    """
    :param indexed_data: data to index
    :param index_destination: where the index is written
    :return:
    """
    def choose_best():
        scores = []
        for k, v in sorted(res.items(), key=lambda x: x[0]):
            scores.append((k, 1. * sum(data_test['correctAnswer'] == v) / len(v)))
        return sorted(scores, key=lambda x: -x[-1])[0][0]

    def calculate_score(res):
        """
        :param res: predicted answers keyed by question index
        :return: fraction of correct answers
        """
        correct = 0
        total = 0
        for index, row in data_test.iterrows():
            if res[index] == row['correctAnswer']:
                correct += 1
            total += 1
        return float(correct) / total

    if not already_indexed:
        make_index(indexed_data, index_destination, source)

    res = {}
    MAX = 100
    docs_per_q = range(1, 20)
    records = []
    analyzer = SnowballAnalyzer(Version.LUCENE_30, "English",
                                StandardAnalyzer.STOP_WORDS_SET)
    reader = IndexReader.open(SimpleFSDirectory(File(index_destination)))
    searcher = IndexSearcher(reader)

    for index, row in data_test.iterrows():
        queries = [row['answerA'], row['answerB'], row['answerC'], row['answerD']]
        queries = [row['question'] + ' ' + q for q in queries]
        scores = {}
        for q in queries:
            query = QueryParser(Version.LUCENE_30, "content", analyzer).parse(
                re.sub("[^a-zA-Z0-9]", " ", q))
            hits = searcher.search(query, MAX)
            doc_importance = [hit.score for hit in hits.scoreDocs]
            for n in docs_per_q:
                scores.setdefault(n, [])
                scores[n].append(sum(doc_importance[:n]))
        to_records = [index + 102501]
        to_records.append(['A', 'B', 'C', 'D'][np.argmax(scores[4])])
        records.append(to_records)
        for n in docs_per_q:
            res.setdefault(n, [])
            res[n].append(['A', 'B', 'C', 'D'][np.argmax(scores[n])])

    df = pandas.DataFrame.from_records(records, columns=["id", "correctAnswer"])
    df = df.set_index("id")
    df.to_csv("ololo.csv")

    best = choose_best()
    print best
    score = calculate_score(res[best])
    print score