def __init__(self, index_path, field, similarity="boolean",
             use_relevance_feedback=False, feedback_index_path=None):
    self.reader = DirectoryReader.open(
        FSDirectory.open(Paths.get(index_path)))
    self.searcher = IndexSearcher(self.reader)
    if use_relevance_feedback and feedback_index_path is not None:
        self.feedback_reader = DirectoryReader.open(
            FSDirectory.open(Paths.get(feedback_index_path)))
        self.feedback_searcher = IndexSearcher(self.feedback_reader)
    self.similarity = similarity
    self.stopwords = stop_words()
    if similarity == "boolean":
        self.searcher.setSimilarity(BooleanSimilarity())
    elif similarity == "tf":
        self.searcher.setSimilarity(TFSimilarity())
    elif similarity == "tfidf":
        self.searcher.setSimilarity(ClassicSimilarity())
    elif similarity == "BM25":
        self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
    else:
        print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
        self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
    analyzer = StandardAnalyzer()
    print(self.searcher.getSimilarity())
    self.parser = QueryParser(field, analyzer)
def __init__(self, directory):
    self.directory = directory
    # create Directories for the search index and for the taxonomy index
    # in RAM or on Disc
    #indexDir = RAMDirectory()
    #taxoDir = RAMDirectory()
    self.indexDir = FSDirectory.open(File(os.path.join(self.directory, INDEX_DIR)))
    self.taxoDir = FSDirectory.open(File(os.path.join(self.directory, TAXONOMY_DIR)))
def __init__(self, directory):
    self.directory = directory
    # create Directories for the search index and for the taxonomy index
    # in RAM or on Disc
    #indexDir = RAMDirectory()
    #taxoDir = RAMDirectory()
    self.indexDir = FSDirectory.open(Paths.get(os.path.join(self.directory, INDEX_DIR)))
    self.taxoDir = FSDirectory.open(Paths.get(os.path.join(self.directory, TAXONOMY_DIR)))

    # FacetConfig
    self.facets_config = FacetsConfig()
    self.facets_config.setHierarchical("Categories", True)
    self.facets_config.setMultiValued("Categories", True)
def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
    """Constructor of Indexer.

    Parameters
    ----------
    index_dir : string
        The location of the lucene index.
    mode : string
        The mode used when opening the lucene index. Available values are:
        'create', create a new index, overwriting any existing one;
        'append', open an existing index and append to it;
        'create_or_append', append if `index_dir` exists, otherwise create.
    date_format : string
        Datetime fields are saved as strings; `date_format` specifies how to
        format a datetime into a string.
    """
    # self.store = FSDirectory.open(File(index_dir))
    self.store = FSDirectory.open(Paths.get(index_dir))
    # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    self.analyzer = StandardAnalyzer()
    # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
    self.config = IndexWriterConfig(self.analyzer)
    self.mode = mode
    self.date_format = date_format
    if mode == 'create_or_append':
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    elif mode == 'create':
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    elif mode == 'append':
        self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    else:
        raise ValueError('Invalid mode %s' % mode)
    self.writer = IndexWriter(self.store, self.config)
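# A hedged sketch of how the date_format above can be applied when adding a
# document; the field names ("content", "date_published") and the helper name
# are illustrative assumptions, not part of the original class.
from datetime import datetime
from org.apache.lucene.document import Document, Field, StringField, TextField

def make_document(content, published, date_format='%Y-%m-%dT%H:%M:%S'):
    """Store the datetime as a string so it can be filtered/sorted lexicographically."""
    doc = Document()
    doc.add(TextField("content", content, Field.Store.YES))
    doc.add(StringField("date_published", published.strftime(date_format),
                        Field.Store.YES))
    return doc

# Example (assuming an Indexer-style writer is open):
# writer.addDocument(make_document("hello world", datetime(2020, 5, 25)))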
def search(querystr):
    print('lucene', lucene.VERSION)
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    directory = FSDirectory.open(Paths.get("index"))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()
    q = QueryParser("name", analyzer).parse(querystr)
    hitsPerPage = 20
    docs = searcher.search(q, hitsPerPage)
    hits = docs.scoreDocs

    people = []
    number = 1
    for hit in hits:
        # print(hit.doc, hit.score)
        d = searcher.doc(hit.doc)
        person = {}
        print(number, d.get("name"))
        person['Name'] = d.get("name")
        person['Birth date'] = d.get("birth_date")
        person['Death date'] = d.get("death_date")
        person['Birth note'] = d.get("birth_note")
        person['Death note'] = d.get("death_note")
        people.append(person)
        number += 1
    return people
def l_searcher(query_string, directory, number_documents):
    lucene.initVM()
    # analyzer = StandardAnalyzer()
    reader = DirectoryReader.open(FSDirectory.open(Paths.get(directory)))
    searcher = IndexSearcher(reader)

    # Top 'n' documents as result
    topN = number_documents

    try:
        # query = QueryParser("question", analyzer).parse(query_string)
        query = FuzzyQuery(Term("question", query_string), 2)
        print("The query was: {}".format(query))

        hits = searcher.search(query, topN)
        print("The hits were: ")

        options = []
        options_answers = []
        # print(hits.totalHits)
        for hit in hits.scoreDocs:
            print(hit.doc)
            # print(hit.score, hit.doc, hit.toString())
            doc = searcher.doc(hit.doc)
            options_answers.append(doc.get("answer"))
            options.append(doc.get("question"))
            # print(doc.get("answer"))
        return options, options_answers
    except IndexError:
        return None
def find_all_text_occurrences(objects: list) -> (dict, DirectoryReader):
    docs_lookup = dict()
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    # use a raw string so the Windows path separators are not treated as escapes
    file = Paths.get(r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    searcher = IndexSearcher(reader)
    parser = QueryParser('contents', StandardAnalyzer())

    logging.warning(
        'FOR MULTI-WORD OBJECTS, ALL DOCUMENTS WITH BOTH TERMS SEPARATELY '
        'WILL BE RETRIEVED')

    for object in objects:
        tokens = object.split(' ')
        doc_sets = []
        for token in tokens:
            q = parser.parse(f'"{token}"')
            # TODO maybe use minimum score
            topdocs = searcher.search(q, 99999999)
            results = set([topdoc.doc for topdoc in topdocs.scoreDocs])
            doc_sets.append(results)

        docs_lookup[object] = set.intersection(*doc_sets)

    return docs_lookup, reader
def __init__(self, searchDir):
    self.analyzer = MyPythonEnglishAnalyzer(
        stopwords=Indexer.ENGLISH_STOP_WORDS_SET)
    self.directory = FSDirectory.open(Paths.get(searchDir))
    self.reader = DirectoryReader.open(self.directory)
    self.searcher = IndexSearcher(self.reader)
def __init__(self, baseDir, indexDirectory="IR.Index"):
    """
    :param baseDir: The directory where this querier is run
    :param indexDirectory: Directory of indices, default value = 'IR.Index'
    """
    indexDir = FSDirectory.open(Paths.get(os.path.join(baseDir, indexDirectory)))
    self.reader = DirectoryReader.open(indexDir)
def __init__(self, dir, data_file):
    self.dir = dir
    self.data_file = data_file
    index_dir = FSDirectory.open(Paths.get(self.dir))
    analyzer = StandardAnalyzer()
    writer_config = IndexWriterConfig(analyzer)
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(index_dir, writer_config)
def build_corpus(n=0):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    # Hack for parallelizing queries, uses one index per domain.
    directory = FSDirectory.open(File(wiki_index + '-' + sbc))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    build_corpus_from_terms_with_wiki(sbc, searcher, analyzer)
def create_index_dir(self):
    """
    Create the directory where the index is stored
    :return: index directory
    """
    path = Paths.get('index')
    indexDir = FSDirectory.open(path)
    return indexDir
def createIndexWriter(indexDir):
    if not os.path.exists(indexDir):
        os.mkdir(indexDir)
    directory = FSDirectory.open(Paths.get(indexDir))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    #config = config.setRAMBufferSizeMB(ramBufferSize)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(directory, config)
def getLucene(path):
    directory = FSDirectory.open(Paths.get(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setIndexSort(Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
def __init__(self, index_path, update=False):
    dir = FSDirectory.open(Paths.get(index_path))
    analyzer = StandardAnalyzer()
    iwc = IndexWriterConfig(analyzer)
    if update:
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    else:
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self.writer = IndexWriter(dir, iwc)
def _getLucene(self, path):
    directory = FSDirectory.open(Paths.get(path))
    config = IndexWriterConfig(None)
    config.setRAMBufferSizeMB(256.0)  # faster
    config.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
def index_scan():
    print("Scanning the index")
    #pdb.set_trace()
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    fields = MultiFields.getFields(reader)
    for field in fields:
        term = MultiFields.getTerms(reader, field)
        print(field, "->", term)
def getLucene(path):
    directory = FSDirectory.open(File(path))
    analyzer = WhitespaceAnalyzer()
    config = IndexWriterConfig(Version.LATEST, analyzer)
    mergePolicy = config.getMergePolicy()
    sortingMergePolicy = SortingMergePolicy(
        mergePolicy, Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    config.setMergePolicy(sortingMergePolicy)
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return writer, reader, searcher
def __init__(self):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    paths_dict = util.getPaths()
    # textSearcher = TextSearcher(paths_dict['fs_directory'])
    fs_directory = FSDirectory.open(Paths.get(paths_dict['fs_directory']))
    index_reader = DirectoryReader.open(fs_directory)
    self.lucene_dictionary = LuceneDictionary(index_reader, 'contents')
    self.analyzer = StandardAnalyzer()
    self.searcher = IndexSearcher(index_reader)
    self.formatter = SimpleHTMLFormatter()
def init_index(self):
    """
    Initializes the lucene index, and creates a StandardAnalyzer and an
    IndexSearcher.

    Returns:
        vm: The initialized Java VM
        analyzer: StandardAnalyzer word analyzer.
        searcher: Searcher over the lucene index.
    """
    self.vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    ldir = FSDirectory.open(Paths.get(settings.LUCENE_INDEX))
    self.analyzer = StandardAnalyzer()
    self.searcher = IndexSearcher(DirectoryReader.open(ldir))
def open_writer(path):
    from java.io import File
    from org.apache.lucene.analysis.core import WhitespaceAnalyzer
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.index import IndexWriter, IndexWriterConfig
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version

    directory = FSDirectory.open(File(path))
    analyzer = StandardAnalyzer(Version.LUCENE_43)
    config = IndexWriterConfig(Version.LUCENE_43, analyzer)
    writer = IndexWriter(directory, config)
    return writer
def createIndex():
    print(lucene.VERSION)
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    index = FSDirectory.open(Paths.get('index'))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index, config)

    openCSV('../output/outputSpark_full_index.csv', writer)
    writer.close()
def retrieve_wiki(text_query, index_directory_name):
    lucene.initVM()
    directory = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    txt = text_query
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(txt)
    scoreDocs = searcher.search(query, 1000).scoreDocs

    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        yield doc.get('contents')
def __init__(self, indexDir, root="testdocs"):
    # create and open an index writer
    indexDir = FSDirectory.open(Paths.get(indexDir))
    # TODO make appropriate analyzer add to config
    analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iw = IndexWriter(indexDir, config)
    self.authorcount = 0
    self.titlecount = 0
    self.errorcount = 0
    self.indexDocs(root, iw)
def __init__(self, index_dir):
    """
    :param index_dir: the dir where to store the index.
    """
    self.indexDir = index_dir
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)

    self.analyzer = MyPythonEnglishAnalyzer(
        stopwords=self.ENGLISH_STOP_WORDS_SET)
    conf = IndexWriterConfig(self.analyzer)
    conf.setUseCompoundFile(False)
    directory = FSDirectory.open(Paths.get(index_dir))
    self.writer = IndexWriter(directory, conf)
def search(self, query):
    lucene.initVM()
    luceneDirectory = "/index/"

    path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
    directory = FSDirectory.open(Paths.get(path))
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer()

    #args = len(sys.argv) - 1
    #if args < 1:
    #    print("\n No query was submitted! \n")
    #else:
    #    query_string = ""
    #    position = 1
    #    while (args >= position):
    #        query_string = query_string + str(sys.argv[position]) + " "
    #        position = position + 1

    print("Searching for '" + query + "'")

    fields_to_search = ["text", "page title", "date"]
    filter_date = 'date:"May 25"'

    # note the space before AND so the filter and the user query stay separate terms
    filtered_query = filter_date + " AND " + query

    parser = MultiFieldQueryParser(fields_to_search, analyzer)
    updated_query = MultiFieldQueryParser.parse(parser, filtered_query)
    scored_documents = searcher.search(updated_query, 10).scoreDocs  # array of docs

    print("Found " + str(len(scored_documents)) + " matches in the collection.")

    results = []
    for doc in scored_documents:
        scoredTweet = dict()
        scoredTweet['score'] = doc.score
        result = searcher.doc(doc.doc)
        scoredTweet['username'] = result.get("username")
        scoredTweet['tweet_body'] = result.get("text")
        scoredTweet['date'] = result.get("date")
        results.append(scoredTweet)
        print(scoredTweet)

    return results
def author_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File

    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)
    query = parser.parse(qry)
    print('Searching for:', query.toString(field))

    raw_results = searcher.search(query, limit)
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print(numTotalHits, 'total matching documents')

    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')
        entry = entry_map.get(entry_id)
        short_title = entry['short_title']
        print(entry['prim_author'])
        if qry in entry['prim_author'].lower():
            fname = short_title + CONTENT_EXT
            results[entry_id] = {'title': short_title, 'file': fname}

    f = open('/Users/Nelle/Documents/coding/text_analysis/newsvn/RenaissanceNLP/data/dataResults/authorPaths/' + qry + '.json', 'w')
    f.write(json.dumps(results))
    f.close()
    return json.dumps(results)
def index_wiki(wiki_xmlfile, index_directory_name):
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Creates config file.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for xmldoc in wikicorpusxml(wiki_xmlfile):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
def index_wiki(wiki_xmlfile, index_directory_name):
    lucene.initVM()
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)
    # Creates config file.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(
        FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for xmldoc in wikicorpusxml(wiki_xmlfile):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
def find_frequencies_wikipedia(terms: List[str], index_location: str):
    """Find frequencies using a Lucene index of wikipedia."""
    # TODO doesn't find any n>1 grams due to missing location index on contents!
    logger.warning('Not working! Does not find any n>1 grams')
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    file = Paths.get(index_location)
    dir = FSDirectory.open(file)
    reader = DirectoryReader.open(dir)
    freqs = {}
    for term_str in tqdm.tqdm(terms):
        term = Term("contents", term_str)
        freq = reader.totalTermFreq(term)
        freqs[term_str] = freq
    reader.close()
    return freqs
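# Hedged sketch of one way the n>1-gram limitation noted above could be worked
# around if the "contents" field were indexed with positions: count documents
# matching a PhraseQuery instead of reading a single-term frequency. This is an
# assumption-labelled alternative, not part of the original function.
from org.apache.lucene.index import Term
from org.apache.lucene.search import IndexSearcher, PhraseQuery

def phrase_doc_freq(reader, phrase: str, field: str = "contents") -> int:
    """Return the number of documents containing `phrase` as an exact phrase."""
    searcher = IndexSearcher(reader)
    builder = PhraseQuery.Builder()
    # lowercasing assumes the field was analyzed with StandardAnalyzer
    for token in phrase.lower().split(' '):
        builder.add(Term(field, token))
    return searcher.count(builder.build())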
def custom_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()
    rootdir = OUT_RAW_DIR

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File

    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)
    query = parser.parse(qry)
    print('Searching for:', query.toString(field))

    raw_results = searcher.search(query, limit)
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print(numTotalHits, 'total matching documents')
    print(rootdir)

    results = {}
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')
        entry = entry_map.get(entry_id)
        short_title = entry['short_title']
        year = entry['publ_year']
        fname = short_title + CONTENT_EXT
        results[fname] = year
def index(self):
    # if a sent_index already exists, delete it and create a new one
    doc_tool.cleardir(index_root)
    doc_tool.mkdir(index_root)

    index_dir = FSDirectory.open(Paths.get(index_root))
    writer_config = IndexWriterConfig(StandardAnalyzer())
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_dir, writer_config)

    ft1 = FieldType()
    ft1.setStored(True)
    ft1.setIndexOptions(IndexOptions.NONE)

    ft2 = FieldType()
    ft2.setStored(False)
    ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    doc_list = self.doc()
    file_path = os.path.join(config.SENT_RETRIEVAL_ROOT, "merge_doc")
    file_list = os.listdir(file_path)
    num = 0
    for file in file_list:
        docs = doc_tool.load_json_file(file_path, file)
        for page_identifier in docs:
            if page_identifier in doc_list:
                num += 1
                for sent_number in docs[page_identifier]:
                    sentence_text = self.process_sent(
                        docs[page_identifier][sent_number])
                    doc = Document()
                    doc.add(Field("page_identifier", page_identifier, ft1))
                    doc.add(Field("sentence_number", sent_number, ft1))
                    doc.add(Field("sentence_text", sentence_text, ft2))
                    writer.addDocument(doc)

    print(num)
    writer.commit()
    writer.close()
    index_dir.close()
def do_search(qry, limit):
    helper.initPyLucene()
    RNLP_ctxt = _get_rnlp_ctxt()
    entry_map = RNLP_ctxt.get_entry_map()

    from org.apache.lucene.index import DirectoryReader
    from org.apache.lucene.search import IndexSearcher
    from org.apache.lucene.queryparser.classic import QueryParser
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    from org.apache.lucene.store import FSDirectory
    from org.apache.lucene.util import Version
    from java.io import File

    print(os.path.abspath(os.path.pardir))

    reader = DirectoryReader.open(FSDirectory.open(File(IDX_DIR)))
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_40)

    field = 'contents'
    parser = QueryParser(Version.LUCENE_40, field, analyzer)
    query = parser.parse(qry)
    print('Searching for:', query.toString(field))

    raw_results = searcher.search(query, limit)
    hits = raw_results.scoreDocs
    numTotalHits = raw_results.totalHits
    print(numTotalHits, 'total matching documents')

    results = []
    for hit in hits:
        doc = searcher.doc(hit.doc)
        entry_id = doc.get('entry_id')
        entry = entry_map.get(entry_id)
        #print 'entry:', entry
        score = hit.score
        #print 'Hit:', entry['short_title'], score
        results.append((score, doc, entry))
    return results
def l_indexer(directory, load_path):
    lucene.initVM()
    # index_dir = SimpleFSDirectory(File(directory))
    index_dir = FSDirectory.open(Paths.get(directory))
    writer_config = IndexWriterConfig(PortugueseAnalyzer())
    # writer_config = IndexWriterConfig(customPortugueseAnalyser())
    writer = IndexWriter(index_dir, writer_config)

    with open(load_path) as subtles_file:
        subtles_corpus = subtles_file.read().splitlines()

    for i in range(0, len(subtles_corpus), 2):
        doc = Document()
        doc.add(Field("question", subtles_corpus[i], StringField.TYPE_STORED))
        doc.add(Field("answer", subtles_corpus[i + 1], StringField.TYPE_STORED))
        writer.addDocument(doc)
    writer.close()
    print("Index successfully created!")
def __init__(self, index_dir,
             search_fields=['canonical_url', 'title', 'meta', 'content'],
             unique_field='uq_id_str',
             boost=dict(canonical_url=4.0, title=8.0, meta=2.0, content=1.0),
             date_format='%Y-%m-%dT%H:%M:%S'):
    """Constructor of Searcher.

    Parameters
    ----------
    index_dir : string
        The location of the lucene index.
    search_fields : list
        A list of field names indicating which fields to search on.
    unique_field : string
        The field name on which duplicates should be avoided.
    boost : dict
        This dict controls the per-field weight when computing the score.
    date_format : string
        How to convert the stored string back into a datetime. Should be
        consistent with the indexing part.
    """
    self.index_dir = index_dir
    self.search_fields = search_fields
    self.sort_by_recent = Sort(
        SortField('date_published', SortField.Type.STRING, True))
    self.store = FSDirectory.open(File(index_dir))
    self.reader = DirectoryReader.open(self.store)
    self.isearcher = IndexSearcher(self.reader)
    self.analyzer = StandardAnalyzer()
    self.dup_filter = DuplicateFilter(unique_field)
    self.boost_map = HashMap()
    for k, v in boost.iteritems():
        self.boost_map.put(k, Float(v))
    self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                            self.boost_map)
    self.date_format = date_format
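# Hedged companion sketch for the Searcher constructor above: a minimal query
# method built only from the attributes created there. The method name and the
# default hit count are assumptions; the class-level MultiFieldQueryParser.parse
# call is the same PyLucene pattern used in the tweet-search snippet earlier in
# this collection.
def query_recent(self, query_string, n=20):
    """Parse `query_string` over the boosted fields and return newest-first docs."""
    q = MultiFieldQueryParser.parse(self.mul_parser, query_string)
    top_docs = self.isearcher.search(q, n, self.sort_by_recent)
    return [self.isearcher.doc(sd.doc) for sd in top_docs.scoreDocs]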
def __index(self, emailInfo):
    from org.apache.lucene.index import IndexWriterConfig
    from org.apache.lucene.util import Version
    from org.apache.lucene.analysis.standard import StandardAnalyzer
    analyser = StandardAnalyzer(Version.LUCENE_33)
    conf = IndexWriterConfig(Version.LUCENE_33, analyser)

    from org.apache.lucene.store import FSDirectory
    from java.io import File
    # use a temporary directory on disk for the index
    storage = File.createTempFile(u'Tubelight-', '.index')
    storage.delete()
    storage.mkdir()
    storage.deleteOnExit()
    self.storage = storage.getAbsolutePath()
    self.session.setAttribute('directory',
                              storage.getAbsolutePath() + File.separator + 'mail.idx')
    directory = FSDirectory.open(storage)

    from org.apache.lucene.index import IndexWriter
    iw = IndexWriter(directory, conf)

    from us.d8u.tubelight import Configuration
    addr = emailInfo[Configuration.EmailAddressKey]
    (username, server) = addr.split('@')

    from java.lang import System
    System.setProperty("mail.imap.partialfetch", "false")
    urlPrefix = ("imap://%s@%s:%d/Inbox" %
                 (username, server, int(emailInfo[Configuration.EmailPortKey])))

    from javax.mail import Session
    store = Session.getDefaultInstance(System.getProperties(), None).getStore(
        emailInfo[Configuration.EmailProtocolKey])
    store.connect(server, username, emailInfo[Configuration.EmailPasswordKey])
    folder = store.getDefaultFolder()

    from org.apache.lucene.document import Document, Field
    for m in folder.getMessages():
        d = Document()
        subject = Field("subject", m.getSubject(),
                        Field.Store.YES, Field.Index.ANALYZED)
        # build a comma-separated recipient list, then drop the leading ", "
        toSrc = u''
        for r in m.getAllRecipients():
            toSrc = u'%s, %s' % (toSrc, str(r))
        to = Field("to", toSrc[toSrc.index(u',') + 1:].strip(),
                   Field.Store.YES, Field.Index.ANALYZED)
        d.add(to)
        d.add(subject)
        iw.addDocument(d)
    iw.commit()

    from org.apache.lucene.search import IndexSearcher
    self.searcher = IndexSearcher(directory)
def retrieving(searchword):
    indexPath = File("indexOut/").toPath()
    indexDir = FSDirectory.open(indexPath)
    reader = DirectoryReader.open(indexDir)
    idxDocs = reader.maxDoc()
    print("We have", idxDocs, "indexed documents")
    searcher = IndexSearcher(reader)
    idx_analyzer = EnglishAnalyzer()

    # Search for the input term in the field stored as text.
    # To look into multiple fields, try MultiFieldQueryParser, but it is not recommended.
    # It is best to club everything we want to search into a single search field
    # and try WildCard matching on it.
    query = QueryParser("text", idx_analyzer).parse(searchword)
    MAX = 1000
    hits = searcher.search(query, MAX)
    print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
    try:
        for hit in hits.scoreDocs:
            print(hit.score, hit.doc, hit.toString())
            doc = searcher.doc(hit.doc)
            print(doc.get("text").encode("utf-8"))
    except:
        print("Could not find the word")
def __init__(self, dest=None):
    """
    create an apache lucene indexer

    input:
        dest    destination to store index information. If not set, use RAM.
    """
    # where to store information: file or ram
    if dest:
        _dir = FSDirectory.open(java.io.File(dest))
    else:
        _dir = RAMDirectory()
    self.directory = _dir

    # analyser
    self.analyser = StandardAnalyzer(Version.LUCENE_CURRENT)

    # index writer
    cfg = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyser)
    cfg.setDefaultWriteLockTimeout(6000)
    self.idx_writer = IndexWriter(self.directory, cfg)
def getReader(path): return DirectoryReader.open(FSDirectory.open(Paths.get(path)))
def getReader(path): return DirectoryReader.open(FSDirectory.open(File(path)))
# Use MoreLikeThis query by document technology
mlt = MoreLikeThis(reader)
mlt.setFieldNames(["title", "director", "writer", "genre", "cast", "fullplot"])
mlt.setMinTermFreq(0)
mlt.setMinDocFreq(0)
mlt.setAnalyzer(self.analyzer)
mlt_query = mlt.like(results.scoreDocs[0].doc)

# Filter the original film
filtered_query = BooleanQuery()
filtered_query.add(mlt_query, BooleanClause.Occur.MUST)
filtered_query.add(film_query, BooleanClause.Occur.MUST_NOT)

score_docs = self.searcher.search(filtered_query, count).scoreDocs
return self._retrieve_in_order(score_docs)


# Initialize Lucene
lucene.initVM()
logger = logging.getLogger(__name__)
logger.info('Initialising Lucene VM')

base_dir = os.path.abspath(os.path.curdir)
index_file = os.path.join(base_dir, settings.LUCENE['PATH'])
index = FSDirectory.open(File(index_file))
try:
    reader = DirectoryReader.open(index)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
except lucene.JavaError as e:
    logger.error('Lucene not loaded')
""" # lucene modules needed for this script import lucene from java.io import File from org.apache.lucene.util import Version from org.apache.lucene.analysis.core import WhitespaceAnalyzer from org.apache.lucene.store import FSDirectory from org.apache.lucene.index import IndexWriter, IndexWriterConfig from org.apache.lucene.document import Document, Field, TextField # start Java VM lucene.initVM(vmargs=['-Djava.awt.headless=true']) # indexing directory indexDir = FSDirectory.open(File("lucene_index.Index")) # input which will be indexed with Lucene title1 = "text of title1" title2 = "title2" abstract1 = "abstract1 has many words, e.g. hellow world can be the text" abstract2 = "text of abstract2" # configure indexing config = IndexWriterConfig(Version.LUCENE_CURRENT, WhitespaceAnalyzer(Version.LUCENE_CURRENT)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # count number of documents processed nDocsAdded = 0