def indexDocs(self, img_url, toi, tid): try: t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(True) t1.setIndexOptions( FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) print("Adding", img_url) name = "Pictures/1.jpg" conn = urllib.urlopen(img_url) f = open(name, 'wb') f.write(conn.read()) f.close() img = cv2.imread(name) sdf = img_search_color(img) storeDir = 'Picture_new/' + sdf.strs if not os.path.exists(storeDir): os.mkdir(storeDir) cv2.imwrite(storeDir + '/' + str(toi) + '___' + str(tid) + '.jpg', img) '''storeDir2 = 'Picture_user/'+str(tid) if not os.path.exists(storeDir2): n = 0 os.mkdir(storeDir2) else : n = len(os.listdir(storeDir2)) cv2.imwrite(storeDir2+'/'+str(toi)+ '_'+ str(n) + '.jpg',img)''' except Exception as e: print("Failed in indexDocs:", e)
def __init__(self, indexDir, doClear=True, computeLengthNorm=False): # if not jpype.isJVMStarted(): # lucene.initVM() lucene.getVMEnv().attachCurrentThread() self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) # self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 100678)#is here? self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer) self.config.setRAMBufferSizeMB(256.0) # flush automatically once the RAM buffer reaches 256MB self.config.setMaxBufferedDocs(10000) # flush automatically after 10000 buffered docs if not computeLengthNorm: sim = CustomSimilarity() self.config.setSimilarity(sim) self.path = os.path.join(INDEX_PATH, indexDir) # print self.path # path.mkdir(self.path) # if doClear: # self.clearExistingIndex() self.store = SimpleFSDirectory(File(self.path)) self.writer = IndexWriter(self.store, self.config) self.t1 = FieldType() # field type t1 self.t1.setIndexed(True) self.t1.setStored(True) self.t1.setTokenized(False) self.t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) self.t2 = FieldType() # field type t2 self.t2.setIndexed(True) self.t2.setStored(False) self.t2.setTokenized(True) self.t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
def run(self): print "Booting lucene driver worker...." lucene.initVM() self.fieldType1 = FieldType() self.fieldType1.setIndexed(True) self.fieldType1.setStored(False) self.fieldType1.setTokenized(True) self.fieldType2 = FieldType() self.fieldType2.setIndexed(True) self.fieldType2.setStored(True) self.fieldType2.setTokenized(False) while(True): data = self.queue.get() da = data[1] response = None try: self.fil = File(da['data']['indexdir']) self.d = NIOFSDirectory(self.fil) self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) self.conf = IndexWriterConfig( Version.LUCENE_CURRENT, self.analyzer) response = getattr(self, da['action'])(da['data']) self.d.close() except Exception as e: print e if response is None: response = {} self.ret[data[0]] = response
def __init__(self, folder=None, fields=[], similarity="tfidf"): self.jcc = lucene.initVM() if folder: self.directory = SimpleFSDirectory(File(folder)) else: self.directory = RAMDirectory() self.fields = {} for field in fields: ft = FieldType() for pname, pvalue in field.props.items(): setter = getattr(ft, "set" + pname.capitalize()) setter(pvalue) ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) # ft.setOmitNorms(True) self.fields[field.name] = ft self.similarity = similarity.lower() self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) self.writer = None self.searcher = None
def index(self, root): t = FieldType() t.setIndexed(True) t.setStored(True) t.setTokenized(True) t.setStoreTermVectors(True) for path, dirs, files in os.walk(root): for file in files: filePath = os.path.join(path, file) fd = open(filePath) content = unicode(fd.read(), 'iso-8859-1') fd.close() doc = Document() doc.add(Field('name', file, StringField.TYPE_STORED)) parent = os.path.split(path)[1] doc.add(Field('parent', parent, StringField.TYPE_STORED)) if len(content) > 0: doc.add(Field('content', content, t)) print 'Indexing %s' % file self.mWriter.addDocument(doc) self.mWriter.commit() self.mWriter.close()
def create_minidoc(termstring, field='text'): # To store term vectors (used for query expansion) we have to use a custom fieldtype customfield = FieldType() customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS) customfield.setStored(True) customfield.setTokenized(True) customfield.setStoreTermVectors(True) doc = Document() doc.add(Field(field, termstring, customfield)) return doc
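# Illustrative sketch, not from the original source: one way a minidoc produced by
# create_minidoc() could be indexed into an in-memory index and its term vector read back
# for query expansion. RAMDirectory, StandardAnalyzer and the sample term string are
# assumptions made for this example.
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader, IndexWriter, IndexWriterConfig
from org.apache.lucene.store import RAMDirectory
from org.apache.lucene.util import BytesRefIterator

directory = RAMDirectory()
writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))
writer.addDocument(create_minidoc("sample expansion terms sample"))
writer.commit()
writer.close()

reader = DirectoryReader.open(directory)
terms = reader.getTermVector(0, "text")          # term vector stored by create_minidoc
termsEnum = terms.iterator()
for term in BytesRefIterator.cast_(termsEnum):   # iterate candidate expansion terms
    print(term.utf8ToString(), termsEnum.totalTermFreq())
reader.close()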
def init_writing(self): self.__field_type_searching = FieldType(TextField.TYPE_STORED) self.__field_type_frequency = FieldType(StringField.TYPE_STORED) self.__field_type_frequency.setStored(True) self.__field_type_frequency.setTokenized(False) self.__field_type_frequency.setStoreTermVectors(True) self.__field_type_frequency.setStoreTermVectorPositions(True) self.__field_type_frequency.\ setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) fs_directory = SimpleFSDirectory(Paths.get(self.directory)) self.__writer = IndexWriter(fs_directory, IndexWriterConfig())
def _set_fieldtypes(self): self._ft1 = FieldType() self._ft1.setStored(True) self._ft1.setTokenized(False) self._ft1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self._ft2 = FieldType() self._ft2.setStored(True) self._ft2.setTokenized(True) self._ft2.setStoreTermVectors(True) self._ft2.setStoreTermVectorOffsets(True) self._ft2.setStoreTermVectorPositions(True) self._ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
def indexer(self, root, writer): t1 = FieldType() t1.setStored(True) t1.setTokenized(True) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) def repalcer(text): chars = '\\`*_{}[]()>#+-.!$‘' for c in chars: if c in text: text = text.replace(c, ' ') return text for root, dirnames, filenames in os.walk(root): i = 0 for filename in filenames: i += 1 with open(os.path.join(root, filename)) as f: for line in f.readlines(): line = line.split(' ', 2) docname = line[0] + ' ' + line[1] name = repalcer(line[0]) contents = line[2] doc = Document() doc.add(Field('docname', docname, t1)) doc.add(Field('name', name, t1)) doc.add(Field('contents', contents, t1)) writer.addDocument(doc) print('File %d done indexing' % i)
def indexDocs(self, root, writer): t1 = FieldType() t1.setStored(True) t1.setTokenized(True) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) wikiFile = ZipFile(root, 'r') files = wikiFile.namelist() i = 0 for file in files[1:]: i += 1 wiki = wikiFile.open(file, 'r') for line in codecs.iterdecode(wiki, 'utf8'): normalized = unicodedata.normalize('NFD', line).split(' ', 2) if not normalized[1].isdigit(): continue docname = normalized[0] + ' ' + normalized[1] name = re.sub(r'[^a-zA-Z0-9]', ' ', normalized[0]) contents = normalized[2] doc = Document() doc.add(Field('docname', docname, t1)) doc.add(Field('name', name, t1)) doc.add(Field('contents', contents, t1)) writer.addDocument(doc) print('File %d done indexing' % i, file)
def main(): INDEX_DIR = "full_index1" DOCUMENTS_DIR = "/media/joseph/Windows8_OS/Users/Joseph/AppData/Local/lxss/home/jwymbs23/data_science_projects/french_pamphlets/frc-data-master/OCR_text/" # Initialize lucene and JVM lucene.initVM(vmargs=['-Djava.awt.headless=true']) print("lucene version is:", lucene.VERSION, '\n') store = getStore(INDEX_DIR) analyzer = getAnalyzer() writer = getWriter(store=store, analyzer=analyzer, create=True) #get list of documents doc_list = getDoclist(DOCUMENTS_DIR) ftype = FieldType() ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) ftype.setTokenized(True) ftype.setStoreTermVectors(True) ftype.freeze() for cd, doc_name in enumerate(doc_list): if not cd % 1000: print(cd, '--', len(doc_list)) with open(doc_name, 'r') as d: doc_lines = d.readlines() full_text = ''.join([i.strip() for i in doc_lines]).lower() try: # create a document that would we added to the index doc = Document() # Add fields to this document #could process fname here instead of in the dataframe later doc.add( Field("identifier", doc_name.split('/')[-1], TextField.TYPE_STORED) ) #Store.YES))#, Field.Index.ANALYZED)) doc.add( Field("vectext", full_text, ftype) ) #TextField.TYPE_STORED, TermVector.YES, ))#Store.YES))#, Field.Index.ANALYZED)) doc.add(Field("text", full_text, TextField.TYPE_STORED)) # Add the document to the index writer.addDocument(doc) except: print("Failed in indexDocs: ", doc_name) #writer.optimize() writer.commit()
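# Illustrative follow-up sketch, not part of the original main(): once writer.commit() has
# run, the index built above could be queried roughly like this, reusing the 'store' and
# 'analyzer' objects created in main(). The query string is just an example.
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

reader = DirectoryReader.open(store)
searcher = IndexSearcher(reader)
query = QueryParser("text", analyzer).parse("liberte")
for hit in searcher.search(query, 5).scoreDocs:
    print(searcher.doc(hit.doc).get("identifier"))
reader.close()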
def get_field_type(self, stored, indexed): ''' Generate a `FieldType` according to the parameters. Input: `stored`: whether the field is stored `indexed`: whether the field is indexed Output: `FieldType` with the correct parameters ''' t = FieldType() t.setStored(stored) t.setTokenized(indexed) if indexed: # Indexes documents, frequencies and positions t.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) else: # Not indexed t.setIndexOptions(IndexOptions.NONE) return t
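# Illustrative usage sketch, assuming an instance of the surrounding class named 'helper'
# and the usual org.apache.lucene.document imports; the field names are examples only.
from org.apache.lucene.document import Document, Field

stored_only = helper.get_field_type(stored=True, indexed=False)   # stored, IndexOptions.NONE
indexed_text = helper.get_field_type(stored=True, indexed=True)   # stored, docs/freqs/positions

doc = Document()
doc.add(Field("id", "doc-42", stored_only))
doc.add(Field("body", "some searchable text", indexed_text))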
def create_field_types(self): """ Create the field types that will be used to specify what actions lucene should take on the various fields supplied to index. """ self.field_clean = FieldType() self.field_clean.setIndexed(True) self.field_clean.setStored(True) self.field_clean.setTokenized(False) self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) self.field_dirty = FieldType() self.field_dirty.setIndexed(True) self.field_dirty.setStored(False) self.field_dirty.setTokenized(True) self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
def __init__(self, indexDir="", debug=False, verbose=False): """ :Parameters: - `indexDir`: Path where the Index will be saved. (Str) - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean) - `verbose`: Provide additional information about the initialization process. (Boolean) """ self.__verbose = verbose if indexDir != "": INDEX_DIR = indexDir else: INDEX_DIR = os.path.dirname( os.path.realpath(__file__)) + "/luceneIndex" if not os.path.exists(INDEX_DIR): os.makedirs(INDEX_DIR) self.__boAppend = False else: self.__boAppend = True # Initialize lucene and JVM lucene.initVM() # Get index storage if debug: # Store the index in memory self.__indexDir = RAMDirectory() self.__boAppend = False INDEX_DIR = "RAM Memory" else: # Store an index on disk self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR)) # Create Content FieldType self.__contentType = FieldType() self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.__contentType.setTokenized(True) self.__contentType.setStored(True) self.__contentType.setStoreTermVectors(True) self.__contentType.setStoreTermVectorPositions(True) self.__contentType.freeze() # Get the Analyzer self.__analyzer = StandardAnalyzer( StandardAnalyzer.ENGLISH_STOP_WORDS_SET) # Print Indexer Information print("Lucene version is: ", lucene.VERSION) print("Index Directory: ", INDEX_DIR)
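    # Illustrative sketch of a method such an indexer class might add, not part of the
    # original: it reuses the __contentType and __analyzer built in __init__ to index one
    # piece of text. The method name and the "content" field name are assumptions, and
    # IndexWriter/IndexWriterConfig/Document/Field are assumed to be imported as in the
    # other snippets.
    def addArticle(self, content):
        config = IndexWriterConfig(self.__analyzer)
        writer = IndexWriter(self.__indexDir, config)
        doc = Document()
        doc.add(Field("content", content, self.__contentType))
        writer.addDocument(doc)
        writer.commit()
        writer.close()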
def indexDocs(self, lines, writer): ts = [] for i in range(4): t1 = FieldType() t1.setStored(True) t1.setTokenized(True) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) t1.setStoreTermVectors(True) ts.append(t1) file_names = ["all", "context", "last", "response"] for line in lines: doc = Document() for i in range(4): doc.add(Field(file_names[i], line[i], ts[i])) writer.addDocument(doc)
def __init__(self): """Init possible field types""" # FIELD_ID: stored, indexed, non-tokenized self.field_id = FieldType() #self.field_id.setIndexed(True) self.field_id.setIndexOptions(IndexOptions.DOCS) self.field_id.setStored(True) self.field_id.setTokenized(False) # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions) # for storing IDs with term vector info self.field_id_tv = FieldType() #self.field_id_tv.setIndexed(True) self.field_id_tv.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.field_id_tv.setStored(True) self.field_id_tv.setTokenized(False) self.field_id_tv.setStoreTermVectors(True) # FIELD_TEXT: stored, indexed, tokenized, with positions self.field_text = FieldType() #self.field_text.setIndexed(True) self.field_text.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) self.field_text.setStored(True) self.field_text.setTokenized(True) # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions) self.field_text_tv = FieldType() #self.field_text_tv.setIndexed(True) self.field_text_tv.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.field_text_tv.setStored(True) self.field_text_tv.setTokenized(True) self.field_text_tv.setStoreTermVectors(True) # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions # (but no character offsets) self.field_text_tvp = FieldType() #self.field_text_tvp.setIndexed(True) self.field_text_tvp.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) self.field_text_tvp.setStored(True) self.field_text_tvp.setTokenized(True) self.field_text_tvp.setStoreTermVectors(True) self.field_text_tvp.setStoreTermVectorPositions(True)
def setUp(self): super(Test_Bug1842, self).setUp() self.analyzer = StandardAnalyzer() w1 = self.getWriter(analyzer=self.analyzer) doc1 = Document() ftype = FieldType() ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) ftype.setTokenized(True) ftype.setStoreTermVectors(True) ftype.freeze() doc1.add(Field("all", "blah blah blah Gesundheit", ftype)) doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED)) w1.addDocument(doc1) w1.close()
def index_docs(self, input_documents): for document in tqdm(input_documents, total=len(input_documents)): doc = Document() doc.add(StringField(".I", document[".I"].lower(), Field.Store.YES)) doc.add(StringField(".U", document[".U"].lower(), Field.Store.YES)) type = FieldType() type.setIndexOptions( IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) type.setStored(True) type.setStoreTermVectors(True) type.setTokenized(True) if ".W" in document and ".M" in document: doc.add( Field( "text", " ".join( tokenizer.tokenize(document[".M"].lower() + " " + document[".T"].lower() + document[".W"].lower())), type)) elif ".M" in document and ".W" not in document: doc.add( Field( "text", " ".join( tokenizer.tokenize(document[".M"].lower() + " " + document[".T"].lower())), type)) elif ".M" not in document and ".W" in document: doc.add( Field( "text", " ".join( tokenizer.tokenize(document[".T"].lower() + document[".W"].lower())), type)) elif ".M" not in document and ".W" not in document: doc.add( Field("text", " ".join(tokenizer.tokenize(document[".T"].lower())), type)) if self.writer.getConfig().getOpenMode( ) == IndexWriterConfig.OpenMode.CREATE: self.writer.addDocument(doc) else: self.writer.updateDocument(Term(".U", document[".U"]), doc) self.writer.close()
def create_index(self, index_folder): os.mkdir(index_folder) self.t1 = FieldType() self.t1.setStored(True) self.t1.setIndexOptions(IndexOptions.DOCS) self.t2 = FieldType() self.t2.setStored(True) self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) self.t3 = FieldType() self.t3.setStored(True) self.t3.setIndexOptions(IndexOptions.NONE) fsDir = MMapDirectory(Paths.get(index_folder)) writerConfig = IndexWriterConfig( MySimpleAnalyzer( CharArraySet(collections.JavaSet(utils.STOPWORDS), True))) writerConfig.setSimilarity(MyTFIDFSimilarity()) writerConfig.setRAMBufferSizeMB(16384.0) # 14g self.writer = IndexWriter(fsDir, writerConfig) logger.info(f"{self.writer.numDocs()} docs in index") logger.info("Indexing documents...") doc_ids = self.doc_db.get_doc_ids() for doc_id in tqdm(doc_ids, total=len(doc_ids)): text = self.doc_db.get_doc_text(doc_id) tokens = self.doc_db.get_doc_tokens(doc_id) self.add_doc(doc_id, text, tokens) logger.info(f"Indexed {self.writer.numDocs()} docs.") self.writer.forceMerge(1) # to increase search performance self.writer.close()
def testAdd(self, goodname, salenum, price, shopname, url, picturename, comment, historyprice): analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer = IndexWriter(self.dir, config) # True: build a new index; False: build an incremental index noIndexedString = FieldType() noIndexedString.setTokenized(False) noIndexedString.setIndexed(False) noIndexedString.setStored(True) try: print "adding", goodname goodname_s = unicode(goodname, 'utf8') seg_list_good = jieba.cut(goodname_s, cut_all=False) goodname_s = " ".join(seg_list_good) # jieba default mode shopname_s = unicode(shopname, 'utf8') seg_list_shop = jieba.cut(shopname_s, cut_all=False) shopname_s = " ".join(seg_list_shop) # jieba default mode shopnameField = Field("shopName", shopname, noIndexedString) shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO) goodnameField = Field("goodName", goodname, noIndexedString) goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO) salenumField = IntField("saleNum", salenum, Field.Store.YES) priceField = DoubleField("price", price, Field.Store.YES) urlField = Field("url", url, noIndexedString) pictureField = StringField("pictureName", picturename, Field.Store.YES) commentField = Field("comments", comment, noIndexedString) historyPriceField = Field("historyPrice", historyprice, noIndexedString) doc = Document() doc.add(shopnameField) doc.add(shopnameField_s) doc.add(goodnameField) doc.add(goodnameField_s) doc.add(salenumField) doc.add(priceField) doc.add(urlField) doc.add(pictureField) doc.add(commentField) doc.add(historyPriceField) writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e
def create_index(self, index_folder, docs_path, add_terms=False): os.mkdir(index_folder) self.t1 = FieldType() self.t1.setStored(True) self.t1.setIndexOptions(IndexOptions.DOCS) self.t2 = FieldType() self.t2.setStored(False) self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.t3 = FieldType() self.t3.setStored(True) self.t3.setIndexOptions(IndexOptions.NONE) fsDir = MMapDirectory(Paths.get(index_folder)) writerConfig = IndexWriterConfig(StandardAnalyzer()) self.writer = IndexWriter(fsDir, writerConfig) print "%d docs in index" % self.writer.numDocs() print "Indexing documents..." doc_id = 0 import corpus_hdf5 corpus = corpus_hdf5.CorpusHDF5(docs_path) for txt in corpus.get_text_iter(): title = corpus.get_article_title(doc_id) self.add_doc(doc_id, title, txt, add_terms) if doc_id % 1000 == 0: print 'indexing doc', doc_id doc_id += 1 print "Index of %d docs..." % self.writer.numDocs() self.writer.close()
def testBinaryFieldInIndex(self): ft = FieldType() ft.setStored(True) bytes = JArray('byte')(self.binaryValStored) binaryFldStored = StoredField("binaryStored", bytes) stringFldStored = Field("stringStored", self.binaryValStored, ft) doc = Document() doc.add(binaryFldStored) doc.add(stringFldStored) # test for field count self.assertEqual(2, doc.fields.size()) # add the doc to a ram index writer = self.getWriter( analyzer=StandardAnalyzer(Version.LUCENE_CURRENT)) writer.addDocument(doc) writer.close() # open a reader and fetch the document reader = self.getReader() docFromReader = reader.document(0) self.assert_(docFromReader is not None) # fetch the binary stored field and compare it's content with the # original one bytes = docFromReader.getBinaryValue("binaryStored") binaryFldStoredTest = bytes.bytes.string_ self.assertEqual(binaryFldStoredTest, self.binaryValStored) # fetch the string field and compare it's content with the original # one stringFldStoredTest = docFromReader.get("stringStored") self.assertEqual(stringFldStoredTest, self.binaryValStored) reader.close()
def __init__(self): """Init possible field types.""" # FIELD_ID: stored, indexed, non-tokenized self.field_id = FieldType() self.field_id.setIndexed(True) self.field_id.setStored(True) self.field_id.setTokenized(False) # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions) # for storing IDs with term vector info self.field_id_tv = FieldType() self.field_id_tv.setIndexed(True) self.field_id_tv.setStored(True) self.field_id_tv.setTokenized(False) self.field_id_tv.setStoreTermVectors(True) # FIELD_TEXT: stored, indexed, tokenized, with positions self.field_text = FieldType() self.field_text.setIndexed(True) self.field_text.setStored(True) self.field_text.setTokenized(True) # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions) self.field_text_tv = FieldType() self.field_text_tv.setIndexed(True) self.field_text_tv.setStored(True) self.field_text_tv.setTokenized(True) self.field_text_tv.setStoreTermVectors(True) # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions # (but no character offsets) self.field_text_tvp = FieldType() self.field_text_tvp.setIndexed(True) self.field_text_tvp.setStored(True) self.field_text_tvp.setTokenized(True) self.field_text_tvp.setStoreTermVectors(True) self.field_text_tvp.setStoreTermVectorPositions(True) # FIELD_TEXT_NTV: not stored, indexed, tokenized, with term vectors (without positions) self.field_text_ntv = FieldType() self.field_text_ntv.setIndexed(True) self.field_text_ntv.setStored(False) self.field_text_ntv.setTokenized(True) self.field_text_ntv.setStoreTermVectors(True) # FIELD_TEXT_TVP: not stored, indexed, tokenized, with term vectors and positions # (but no character offsets) self.field_text_ntvp = FieldType() self.field_text_ntvp.setIndexed(True) self.field_text_ntvp.setStored(False) self.field_text_ntvp.setTokenized(True) self.field_text_ntvp.setStoreTermVectors(True) self.field_text_ntvp.setStoreTermVectorPositions(True)
def testBinaryFieldInIndex(self): ft = FieldType() ft.setStored(True) bytes = JArray('byte')(self.binaryValStored) binaryFldStored = StoredField("binaryStored", bytes) stringFldStored = Field("stringStored", self.binaryValStored, ft) doc = Document() doc.add(binaryFldStored) doc.add(stringFldStored) # test for field count self.assertEqual(2, doc.fields.size()) # add the doc to a ram index writer = self.getWriter(analyzer=StandardAnalyzer()) writer.addDocument(doc) writer.close() # open a reader and fetch the document reader = self.getReader() docFromReader = reader.document(0) self.assertTrue(docFromReader is not None) # fetch the binary stored field and compare it's content with the # original one bytes = docFromReader.getBinaryValue("binaryStored") binaryFldStoredTest = bytes.bytes.bytes_ self.assertEqual(binaryFldStoredTest, self.binaryValStored) # fetch the string field and compare it's content with the original # one stringFldStoredTest = docFromReader.get("stringStored") self.assertEqual(stringFldStoredTest, self.binaryValStored.decode()) reader.close()
def setUp(self): super(Test_Bug1842, self).setUp() self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) w1 = self.getWriter(analyzer=self.analyzer) doc1 = Document() ftype = FieldType() ftype.setStored(False) ftype.setIndexed(True) ftype.setStoreTermVectors(True) doc1.add(Field("all", "blah blah blah Gesundheit", ftype)) doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED)) w1.addDocument(doc1) w1.close()
def lazyImport(): global imported if imported: return from meresco.pylucene import getJVM getJVM() from java.nio.file import Paths from org.apache.lucene.document import Document, StringField, Field, FieldType from org.apache.lucene.search import IndexSearcher, TermQuery from org.apache.lucene.index import DirectoryReader, Term, IndexWriter, IndexWriterConfig, IndexOptions from org.apache.lucene.store import FSDirectory from org.apache.lucene.util import Version from org.apache.lucene.analysis.core import WhitespaceAnalyzer UNINDEXED_TYPE = FieldType() UNINDEXED_TYPE.setIndexOptions(IndexOptions.NONE) UNINDEXED_TYPE.setStored(True) UNINDEXED_TYPE.setTokenized(False) imported = True globals().update(locals())
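# Illustrative usage sketch, an assumption rather than part of lazyImport(): after calling
# lazyImport(), the UNINDEXED_TYPE above can carry a stored-but-unsearchable payload next to
# an indexed key. 'writer' stands for a hypothetical IndexWriter built with the imported
# WhitespaceAnalyzer.
lazyImport()
doc = Document()
doc.add(StringField("key", "record:1", Field.Store.YES))        # indexed lookup key
doc.add(Field("data", "opaque stored value", UNINDEXED_TYPE))   # stored only, not indexed
writer.addDocument(doc)
writer.commit()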
def indexer(self, root, writer): t1 = FieldType() t1.setStored(True) t1.setTokenized(True) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for root, dirnames, filenames in os.walk(root): i = 0 for filename in filenames: i += 1 with open(os.path.join(root, filename), encoding='utf-8') as f: for line in f.readlines(): line = line.split(' ', 2) docname = line[0] + ' ' + line[1] name = self.repalcer(line[0]) contents = line[2] name_contents = name + ' ' + contents doc = Document() doc.add(Field('name-sid', docname, t1)) doc.add(Field('name', name, t1)) doc.add(Field('contents', contents, t1)) doc.add(Field('name-contents', name_contents, t1)) writer.addDocument(doc) print('File %d done indexing' % i)
def add_documents(self, input_file, writer_obj): """ Read the input file and create the index from the paragraph text Each paragraph is one document. Store the document ids, frequencies and positions in the index for each term :param input_file: the input file used for creating the index :param writer_obj: the index writer object :return: """ contentType = FieldType() contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) contentType.setStored(True) contentType.setTokenized(True) paras = get_paragraphs(input_file) for p in paras: doc = Document() lines_split = p.split('|__|') doc.add(StringField("id", lines_split[0], Field.Store.YES)) doc.add( Field("paragraphs", get_stemmed_string(lines_split[1].lower()), contentType)) writer_obj.addDocument(doc) writer_obj.close()
def create_index(self, index_folder, docs_path, add_terms=False): print('Loading Vocab...') if not self.vocab: self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words) os.mkdir(index_folder) self.t1 = FieldType() self.t1.setStored(True) self.t1.setIndexOptions(IndexOptions.DOCS) self.t2 = FieldType() self.t2.setStored(False) self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.t3 = FieldType() self.t3.setStored(True) self.t3.setIndexOptions(IndexOptions.NONE) fsDir = MMapDirectory(Paths.get(index_folder)) writerConfig = IndexWriterConfig(StandardAnalyzer()) self.writer = IndexWriter(fsDir, writerConfig) print("%d docs in index" % self.writer.numDocs()) print("Indexing documents...") # import corpus_hdf5 # corpus = corpus_hdf5.MSMARCOCorpusHDF5(docs_path) import pickle with open(docs_path, "rb") as read_file: corpus = pickle.load(read_file) idx_cnt = 0 # for doc_id, txt in zip(corpus.get_id_iter(), corpus.get_text_iter()): # for doc_id, txt in corpus.items(): for txt in corpus: self.add_doc(idx_cnt, txt, add_terms) # not lowered if idx_cnt % 1000 == 0: print('indexing doc', idx_cnt) idx_cnt += 1 print("Index of %d docs..." % self.writer.numDocs()) self.writer.close()
def index_docs(self, train_set, writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(True) t2.setTokenized(True) t2.setStoreTermVectorOffsets(True) t2.setStoreTermVectorPayloads(True) t2.setStoreTermVectorPositions(True) t2.setStoreTermVectors(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for ii in train_set: doc = Document() doc.add(Field("answer", ii['Answer'], t1)) doc.add(Field("qid", ii['Question ID'], t1)) doc.add(Field("category", ii['category'], t1)) doc.add(Field("position", ii['Sentence Position'], t1)) doc.add(Field("question", ii['Question Text'], t2)) doc.add(Field("wiki_plain", self.wiki_reader.get_text(ii['Answer']), t2)) writer.addDocument(doc)
def indexDocs(self, root, writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for root, dirnames, filenames in os.walk(root): for filename in filenames: if not filename.endswith('.html'): continue print "adding", filename try: path = os.path.join(root, filename) file = open(path) contents = unicode(file.read(), 'iso-8859-1') file.close() doc = Document() doc.add(Field("name", filename, t1)) doc.add(Field("path", root, t1)) if len(contents) > 0: doc.add(Field("contents", contents, t2)) else: print "warning: no content in %s" % filename writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e
def open_writer(path): directory = FSDirectory.open(File(path)) analyzer = StandardAnalyzer(Version.LUCENE_43) config = IndexWriterConfig(Version.LUCENE_43, analyzer) writer = IndexWriter(directory, config) return writer def open_searcher(writer): from org.apache.lucene.search import IndexSearcher reader = writer.getReader() searcher = IndexSearcher(reader) return reader, searcher from org.apache.lucene.document import Document, Field, FieldType, TextField, StringField from org.apache.lucene.util import BytesRef, BytesRefIterator from org.apache.lucene.index import Term vectorFieldType = FieldType(TextField.TYPE_NOT_STORED) vectorFieldType.setIndexed(True) vectorFieldType.setTokenized(True) vectorFieldType.setStoreTermVectors(True) vectorFieldType.setStoreTermVectorPositions(False) writer = open_writer('data/index') def addToIndex(lxmlNode): uri = xpathFirst(lxmlNode, '//oa:hasTarget/@rdf:resource') print uri seen = set() doc = Document() for fieldName in FIELD_NAMES: seen.clear() for subpath in [
def reindex(self): ''' Re-indexes the entire database into Index file''' start = time.time() # get all posts posts = self._tuples_to_dict(self._fetch_all_questions(), self._posts_fields) if not posts: raise Exception("FATAL Error: Could not fetch posts from Database") # open indexer # lucene.initVM(vmargs=['-Djava.awt.headless=true']) # print 'lucene', lucene.VERSION store = SimpleFSDirectory(File(self.index_dir)) analyzer = StandardAnalyzer(Version.LUCENE_CURRENT) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer = IndexWriter(store, config) indexedField = FieldType() indexedField.setIndexed(True) indexedField.setStored(True) indexedField.setTokenized(True) indexedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) storedField = FieldType() storedField.setIndexed(False) storedField.setStored(True) storedField.setTokenized(False) storedField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) fieldTypes = { 'type' : storedField, 'id' : storedField, 'title' : indexedField, 'question' : indexedField, 'answer' : indexedField, # 'comment' : indexedField, 'tag' : indexedField, 'extra' : indexedField, } # get their comments num_docs = 0 for post in posts: if self.status_mode: print "\r {0:.2f} %complete".format(((num_docs/142627.0)*100)), if self.debug : print "\n","*"*20,"\nIndexing post: ", post['id'], "from ", post['extra'] if self.debug and self.verbose_values: print post answers = self._tuples_to_dict(self._fetch_all_answers(post['id'], post['extra']), self._answer_fields) # add comment field for answer in answers: num_docs += 1 if self.debug: print "\n","+"*10, "\nMaking new Document" doc = Document() if self.debug: print "Adding doc type" doc.add(Field("type", self.doctype, fieldTypes['type'])) # make fields if self.debug: print "Adding post fields" for i in xrange(len(self._posts_fields)): f = Field(self._posts_fields[i], self._cleanup_tag(post[self._posts_fields[i]]), fieldTypes[self._posts_fields[i]]) f.setBoost(self._fields_boost[self._posts_fields[i]]) doc.add(f) if self.status_mode: print "\t Indexing answer: ", answer['answer_id'] if self.debug and self.verbose_values: print answer # answered_doc = copy.deepcopy(doc) # make comment field f = Field("answer", self._cleanup_tag(answer['answer']), fieldTypes['answer']) f.setBoost(self._fields_boost['answer']) doc.add(f) # calculate paths # commented_doc = copy.deepcopy(answered_doc) # comments = self._comments_to_comment_string(self._tuples_to_dict(self._fetch_all_comments(answer['id']), self._comment_fields)) # if self.debug: print "\t\tAdding comments: ", comments # commented_doc.add(Field("comment", self._cleanup_tag(comments), fieldTypes['comment'])) # write index if self.debug: print "\tAdding document {doc_id} to index".format(doc_id=post['id']) writer.addDocument(doc) # del answered_doc # del commented_doc if self.debug: print "Commiting document to index" writer.commit() # close index if self.status_mode: print "Closing index write" writer.close() end = time.time() - start if self.status_mode: print "\n","-"*20, \ "\nTotal time spent in indexing: ", end, "seconds" \ "\nIndexed {num_docs} documents".format(num_docs=num_docs)
def tweetIndexer(self, writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(True) t2.setTokenized(True) t2.setStoreTermVectorOffsets(True) t2.setStoreTermVectorPayloads(True) t2.setStoreTermVectorPositions(True) t2.setStoreTermVectors(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) x = 0 for i in range(0,500): if not os.path.isfile("json/tweets-" + str(i) + ".json"): break print "adding tweets-" + str(i) + ".json" tweets = open("json/tweets-" + str(i) + ".json", "r") for line in tweets.readlines(): tweet = json.loads(line) if 'limit' in tweet: continue try: doc = Document() doc.add(Field("file", "json/tweets-" + str(i) + ".json", t1)) sname = tweet['user']['screen_name'] tid = str(tweet['id']) text = tweet['text'] uname = tweet['user']['name'] created = tweet['created_at'] tstamp = tweet['timestamp_ms'] place = "" if tweet['place']: place = tweet['place']['full_name'] + ", " + tweet['place']['country'] lat = "" lng = "" titles = "" urls = "" exist = "false" if tweet['coordinates']: lat = str(tweet['coordinates']['coordinates'][1]) lng = str(tweet['coordinates']['coordinates'][0]) else: lat = str((tweet['place']['bounding_box']['coordinates'][0][0][1] + tweet['place']['bounding_box']['coordinates'][0][2][1])/2) lng = str((tweet['place']['bounding_box']['coordinates'][0][0][0] + tweet['place']['bounding_box']['coordinates'][0][2][0])/2) if len(tweet['entities']['urls']) != 0: exist = "true" for index in range(len(tweet['entities']['urls'])): title = tweet['entities']['urls'][index]['url_title'] if title == None: titles += ",-" else: title = title.encode('ascii','ignore') titles += "," + str(title) urls += " " + str(tweet['entities']['urls'][index]['expanded_url']) searchable = text + " " + urls + " " + uname + " " + sname + " " + place doc.add(Field("lookup", searchable, t2)) doc.add(Field("text", text, t2)) doc.add(Field("user_name", uname, t2)) doc.add(Field("screen_name", sname, t2)) doc.add(Field("tweet_id", tid, t2)) doc.add(Field("created_at", created, t2)) doc.add(Field("geo_lat", lat, t2)) doc.add(Field("geo_lng", lng, t2)) doc.add(Field("url_exist", exist, t2)) doc.add(Field("url_url", urls, t2)) doc.add(Field("url_title", titles, t2)) doc.add(Field("timestamp", tstamp, t2)) writer.addDocument(doc) x += 1 except Exception, e: pass tweets.close()
def __init__(self): self.mDocumentDirectory = settings.ADMINS_ENGINE.mDocumentDirectory self.mIndexDirectory = settings.ADMINS_ENGINE.mIndexDirectory self.mAnalyzers = settings.ADMINS_ENGINE.getIndexingAnalyzers() ############################# Writer Configurattion ##################################### map = HashMap() map.put('name', self.mAnalyzers['name']) map.put('parent', self.mAnalyzers['parent']) map.put('content', self.mAnalyzers['default']) map.put('id', self.mAnalyzers['id']) analyzerWrapper = PerFieldAnalyzerWrapper(self.mAnalyzers['default'], map) self.mWriterConfig = IndexWriterConfig(Version.LUCENE_CURRENT, analyzerWrapper) self.mWriterConfig.setOpenMode(settings.ADMINS_ENGINE.mOpenMode) if settings.ADMINS_ENGINE.mSimilarity != None: self.mWriterConfig.setSimilarity(settings.ADMINS_ENGINE.mSimilarity) ######################################################################################## directory = SimpleFSDirectory(File(self.mIndexDirectory)) self.mIndexWriter = IndexWriter(directory, self.mWriterConfig) ############################# FieldType Prepration ##################### nameField = FieldType() nameField.setIndexed(True) nameField.setStored(True) nameField.setTokenized(True) nameField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY) parentField = FieldType() parentField.setIndexed(True) parentField.setStored(True) parentField.setTokenized(True) parentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY) contentField = FieldType() contentField.setIndexed(True) contentField.setStored(True) contentField.setTokenized(True) contentField.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) idField = FieldType() idField.setIndexed(True) idField.setStored(True) idField.setTokenized(False) idField.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY) self.mFieldTypes = { 'name' : nameField, 'parent' : parentField, 'content' : contentField, 'id' : idField } ####################################################################### self.mLog = ""
def index_article(writer, art_id, art_body): art_id_field = FieldType() art_id_field.setIndexed(True) art_id_field.setStored(True) art_id_field.setTokenized(False) art_id_field.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) art_body_field = FieldType() art_body_field.setIndexed(True) art_body_field.setStored(True) art_body_field.setTokenized(True) art_body_field.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) doc = Document() doc.add(Field("art_id", str(art_id), art_id_field)) doc.add(Field("art_body", art_body, art_body_field)) writer.addDocument(doc)
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer from org.apache.lucene.analysis.standard import StandardAnalyzer from org.apache.lucene.store import RAMDirectory from org.apache.lucene.document import Document, Field, FieldType from org.apache.lucene.util import BytesRef, BytesRefIterator from org.apache.lucene.index import \ IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions if __name__ == '__main__': lucene.initVM(vmargs=['-Djava.awt.headless=true']) directory = RAMDirectory() iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100)) iwriter = IndexWriter(directory, iconfig) ft = FieldType() ft.setStored(True) ft.setTokenized(True) ft.setStoreTermVectors(True) ft.setStoreTermVectorOffsets(True) ft.setStoreTermVectorPositions(True) ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) ts = ["this bernhard is the text to be index text", "this claudia is the text to be indexed"] for t in ts: doc = Document() doc.add(Field("fieldname", t, ft)) iwriter.addDocument(doc) iwriter.commit()
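    # Illustrative continuation, an assumption rather than part of the original snippet: the
    # DirectoryReader and BytesRefIterator imports above suggest reading the stored term
    # vectors back, which could look roughly like this inside the same __main__ block.
    reader = DirectoryReader.open(directory)
    for docid in range(reader.maxDoc()):
        terms = reader.getTermVector(docid, "fieldname")   # vectors stored via 'ft'
        termsEnum = terms.iterator()
        for term in BytesRefIterator.cast_(termsEnum):
            print(docid, term.utf8ToString(), termsEnum.totalTermFreq())
    reader.close()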
def _getIndex(self, even, odd): mergePolicy = LogDocMergePolicy() mergePolicy.setMergeFactor(1000) directory = RAMDirectory() self.dirs.append(directory) writer = self.getWriter(directory=directory, analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT), maxBufferedDocs=2, mergePolicy=mergePolicy) if self.dvStringSorted: # Index sorted stringDVType = FieldInfo.DocValuesType.SORTED elif self.notSorted: # Index non-sorted stringDVType = FieldInfo.DocValuesType.BINARY else: # sorted anyway stringDVType = FieldInfo.DocValuesType.SORTED ft1 = FieldType() ft1.setStored(True) ft2 = FieldType() ft2.setIndexed(True) for i in xrange(len(self.data)): if (i % 2 == 0 and even) or (i % 2 == 1 and odd): doc = Document() doc.add(Field("tracer", self.data[i][0], ft1)) doc.add(TextField("contents", self.data[i][1], Field.Store.NO)) if self.data[i][2] is not None: doc.add(StringField("int", self.data[i][2], Field.Store.NO)) if self.supportsDocValues: doc.add(NumericDocValuesField("int_dv", Long.parseLong(self.data[i][2]))) if self.data[i][3] is not None: doc.add(StringField("float", self.data[i][3], Field.Store.NO)) if self.supportsDocValues: doc.add(FloatDocValuesField("float_dv", Float.parseFloat(self.data[i][3]))) if self.data[i][4] is not None: doc.add(StringField("string", self.data[i][4], Field.Store.NO)) if self.supportsDocValues: if stringDVType == FieldInfo.DocValuesType.SORTED: doc.add(SortedDocValuesField("string_dv", BytesRef(self.data[i][4]))) elif stringDVType == FieldInfo.DocValuesType.BINARY: doc.add(BinaryDocValuesField("string_dv", BytesRef(self.data[i][4]))) else: raise ValueError("unknown type " + stringDVType) if self.data[i][5] is not None: doc.add(StringField("custom", self.data[i][5], Field.Store.NO)) if self.data[i][6] is not None: doc.add(StringField("i18n", self.data[i][6], Field.Store.NO)) if self.data[i][7] is not None: doc.add(StringField("long", self.data[i][7], Field.Store.NO)) if self.data[i][8] is not None: doc.add(StringField("double", self.data[i][8], Field.Store.NO)) if self.supportsDocValues: doc.add(NumericDocValuesField("double_dv", Double.doubleToRawLongBits(Double.parseDouble(self.data[i][8])))) if self.data[i][9] is not None: doc.add(StringField("short", self.data[i][9], Field.Store.NO)) if self.data[i][10] is not None: doc.add(StringField("byte", self.data[i][10], Field.Store.NO)) if self.data[i][11] is not None: doc.add(StringField("parser", self.data[i][11], Field.Store.NO)) for f in doc.getFields(): if f.fieldType().indexed() and not f.fieldType().omitNorms(): Field.cast_(f).setBoost(2.0) writer.addDocument(doc) reader = writer.getReader() writer.close() return self.getSearcher(reader=reader)
def indexTable(self, writer): #connection con = None #define the index of all the fields #---------step 2:connect to mysql---------- con = mdb.connect('localhost','root','testgce','douban_movie_v3') #t_num = FieldType.NumericType it is wrong!! t_num = FieldType() t_num.setStored(False) t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) t3 = FieldType() t3.setIndexed(True) t3.setStored(True) t3.setTokenized(True) t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) maxDict = utils.maxDict # boost value range base = DOC_BOOST_RANGE[0] upper = DOC_BOOST_RANGE[1] with con: # Careful with codecs con.set_character_set('utf8') cur = con.cursor() # Again the codecs cur.execute('SET NAMES utf8;') cur.execute('SET CHARACTER SET utf8;') cur.execute('SET character_set_connection=utf8;') #------step 3: choose the right table------ cur.execute("SELECT * FROM movie_items") numrows = int(cur.rowcount) print 'numrows:',numrows for i in range(numrows): print row = cur.fetchone() #------step 4:Index your field------ summary = row[SUMMARY] subject_id = row[SUBJECT_ID] print 'id'+subject_id year = utils.formatYear(row[YEAR]) try: date = DateTools.stringToDate(year.replace('-',' ')) wtfFile = open('wtf.txt','a') dateStr = DateTools.dateToString(date,DateTools.Resolution.DAY) except: wtfFile.write(year+'\n') doc = Document() #boosting boostProb = utils.calcBoostProb(row,maxDict,dateStr) boost = base + boostProb*(upper-base) doc.add(FloatField("boost",boost,Field.Store.YES)) doc.add(StringField("year",dateStr,Field.Store.YES)) print 'dateStr:'+dateStr #A text field is a sequence of terms that has been tokenized while a string field is a single term (although it can also be multivalued.) do_count = row[DO_COUNT] if row[DO_COUNT] != None else 0 wish_count = row[WISH_COUNT] if row[WISH_COUNT] != None else 0 #fields which should not be analyzed doc.add(FloatField("rating_average",float(row[RATING_AVERAGE]),Field.Store.YES)) doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.YES)) doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.YES)) #doc.add(FloatField("year", float(row[YEAR]), Field.Store.YES).setBoost(boost)) doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.YES)) doc.add(IntField("do_count", int(do_count), Field.Store.YES)) doc.add(IntField("wish_count", int(wish_count), Field.Store.YES)) doc.add(IntField("subject_id", int(row[SUBJECT_ID]), Field.Store.YES)) doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.YES)) doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.YES)) doc.add(StringField("image_small", row[IMAGE_SMALL], Field.Store.YES)) #fields which should be analyzed with WhitespaceAnalyzer #attention!!!
don't use a long sentence like : #doc.add(Field("genres", row[GENRES].replace(delim,' '), t3).setBoost(boost)) #or you'll get a null pointer error f = Field("countries", row[COUNTRIES].replace(delim,' '), t3) f.setBoost(boost) doc.add(f) #process casts raw_casts = row[CASTS].replace(delim,' ') f = Field("raw_casts", raw_casts , t1) f.setBoost(boost) doc.add(f) # replace the '·' in Western person names raw_casts = raw_casts.replace('·',' ') if len(raw_casts.split(' '))<CASTS_LEN: # average name length is 4 casts = raw_casts + ' ¥¥¥¥'*(CASTS_LEN-len(raw_casts.split(' '))) f = Field("casts", casts , t3) f.setBoost(boost) doc.add(f) #process directors raw_directors = row[DIRECTORS].replace(delim,' ') f = Field("raw_directors",raw_directors, t1) f.setBoost(boost) doc.add(f) # replace the '·' in Western person names raw_directors = raw_directors.replace('·',' ') if len(raw_directors.split(' '))<DIRECTORS_LEN: # average name length is 4 directors = raw_directors + ' ¥¥¥¥'*(DIRECTORS_LEN-len(raw_directors.split(' '))) f = Field("directors", directors, t3) f.setBoost(boost) doc.add(f) f = Field("genres", row[GENRES].replace(delim,' '), t3) f.setBoost(boost) doc.add(f) f = Field("subtype", row[SUBTYPE].replace(delim,' '), t3) f.setBoost(boost) doc.add(f) #it is wrong because IndexableField has no setBoost method # fieldList = doc.getFields() # is not a python 'list' , but a 'List' which is unindexable # for eachField in fieldList: # eachField.setBoost(boost) # the raw user_tags string must be stored; reRank needs it: doc.add(StringField("raw_user_tags",row[USER_TAGS],Field.Store.YES)) doc.add(StringField("raw_others_like",row[OTHERS_LIKE],Field.Store.YES)) user_tags_str = '' others_like_str = '' tags_len = 0 if row[USER_TAGS]!='': user_tags_list = row[USER_TAGS].split(delim) for tag_pair in user_tags_list: if tag_pair!='':# the string ends with ¥, so the last element after split is empty #print 'tag_pair'+tag_pair+'hhe' tag_name = tag_pair.split(delim_uo)[0]+' ' # don't forget this space !! tag_num = tag_pair.split(delim_uo)[1] tag_num_processed = int(int(tag_num)/TAG_SPAN)+1 # minimum is 1 #!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
user_tags_str = user_tags_str +' '+ tag_name * tag_num_processed tags_len = tags_len + tag_num_processed # finally we have the total number of tag words if tags_len<TAGS_AVER_LEN: # pad the tags; 3 looks like the average length, so use ¥¥¥ user_tags_str = user_tags_str +' ¥¥¥'*(TAGS_AVER_LEN - tags_len) # if row[OTHERS_LIKE]!='': for like_pair in row[OTHERS_LIKE].split(delim): if like_pair!='': others_like_str = others_like_str +' '+like_pair.split(delim_uo)[1] #start process adjs if row[ADJS] != None: raw_adjs = row[ADJS][:-1] adjs_str = '' adjs_len = 0 if row[ADJS] != '' and row[ADJS] != '\n': #'重要=4.0,特殊=4.0' adjs_str = row[ADJS] adjs_list = adjs_str.split(',') for adj_pair in adjs_list: #print 'adj_pair:'+adj_pair+'hhe' adj_name = adj_pair.split('=')[0] adj_num = adj_pair.split('=')[1] # strip the newline and convert to int if adj_num[-1] == '\n': adj_num = adj_num[0:-1] adj_num = int(float(adj_num)) add_adj='' # # synonyms # adj_name_bro = searchDictValue(adjMap,adj_name) # if adj_name_bro == -1: # means no result, i.e. no synonym found, so add nothing # add_adj = '' # else: # add_adj = (adj_name_bro+' ')*adj_num # raw_adjs = raw_adjs + ',' + adj_name_bro+'='+str(adj_num) adjs_str = adjs_str + ' ' + (adj_name+' ') * adj_num +add_adj adjs_len = adjs_len + adj_num # finally we have the total number of tags #print raw_adjs doc.add(StringField("raw_adjs",raw_adjs,Field.Store.YES)) if adjs_len<ADJS_AVER_LEN: # pad adjs_str; 2 looks like the average length, so use "¥¥" adjs_str = adjs_str +' ¥¥'*(ADJS_AVER_LEN - adjs_len) f = Field("adjs", adjs_str, t3) f.setBoost(boost) doc.add(f) f = Field("user_tags", user_tags_str, t3) f.setBoost(boost) doc.add(f) f = Field("others_like", others_like_str, t3) f.setBoost(boost) doc.add(f) #fields which should be analyzed with good analyzer f = Field("title", row[TITLE], t3) f.setBoost(boost) doc.add(f) f = Field("original_title", row[ORIGINAL_TITLE], t3) f.setBoost(boost) doc.add(f) f = Field("summary_segmentation", row[SUMMARY_SEGMENTATION], t3) f.setBoost(boost) doc.add(f) f = Field("aka", row[AKA], t2) f.setBoost(boost) doc.add(f) if len(summary) > 0: print subject_id +'--->'+':\n '+ row[TITLE] try: summary_unicoded = unicode(summary, 'utf-8') #test the encoding except Exception,e: print "Decode Failed: ", e f = Field('summary', summary, t2) f.setBoost(boost) doc.add(f) else: print "warning:\n" + subject_id +'---> No content!' print 'boosting:' + str(boost) #for debug if boost>upper: print boostProb print maxDict exit(0) writer.addDocument(doc)
def indexDocs(self, root, writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(True) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for root, dirnames, filenames in os.walk(root): # traverse through the doc directory for filename in filenames: # if not filename.endswith('.cdc'): # continue try: # only add the filename and path for indexing path = os.path.join(root, filename) print "adding file : ", path file = open(path) contents = unicode(file.read(), 'utf-8') file.close() doc = Document() doc.add(Field("name", filename, t1)) doc.add(Field("path", root, t1)) if len(contents) > 0: doc.add(Field("contents", contents, t2)) else: print "warning: no content in ", filename writer.addDocument(doc) except Exception, e: print "failed in indexDocs:", e
former_scents = ' '.join(scents_list[0]) doc.add(Field("former_scents", former_scents, t3)) mid_scents = ' '.join(scents_list[1]) doc.add(Field("mid_scents", mid_scents, t3)) last_scents = ' '.join(scents_list[2]) doc.add(Field("last_scents", last_scents, t3)) except Exception, e: print "Failed in indexDocs:", e print line print cnt cnt += 1 writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t3 = FieldType() t3.setIndexed(True) t3.setStored(True) t3.setTokenized(True) t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(False)
def indexDocs(self, root, writer): # t1 is used for filenames and t2 is used for contents t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(False) t2.setTokenized(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) for root, dirnames, filenames in os.walk(root): for filename in filenames: # We can index only a certain types of files if not (filename.endswith('.txt') or filename.endswith('.pdf') or filename.endswith('.xml') or filename.endswith('.doc') or filename.endswith('.odt')): continue try: file_path = os.path.join(root, filename) outfile_path = file_path # First convert PDF and DOC files to text if filename.endswith('.pdf'): outfile = filename.replace('.pdf', '.txt') outfile_path = os.path.join(root, outfile) cmd = 'pdftotext ' + '-layout ' + "'"+ file_path + "'" + ' ' + "'" + outfile_path + "'" subprocess.check_output(cmd, shell=True) file_path = outfile_path elif filename.endswith('.doc'): outfile = filename.replace('.doc', '.txt') outfile_path = os.path.join(root, outfile) cmd = 'antiword ' + file_path + ' >> ' + outfile_path subprocess.check_output(cmd, shell=True) file_path = outfile_path elif filename.endswith('.odt'): outfile = filename.replace('.odt', '.txt') outfile_path = os.path.join(root, outfile) cmd = 'odttotext ' + '-layout ' + "'"+ file_path + "'" + ' ' + "'" + outfile_path + "'" subprocess.check_output(cmd, shell=True) file_path = outfile_path file = open(file_path) contents = unicode(file.read(), 'iso-8859-1') file.close() doc = Document() doc.add(Field("name", filename, t1)) doc.add(Field("path", root, t1)) if len(contents) > 0: doc.add(Field("contents", contents, t2)) else: logging.debug('warning: no content in %s', filename) writer.addDocument(doc) except Exception, e: logging.debug('Failed in indexDocs: %s', e)
class Indexer(object): def __init__(self, **kwargs): """ Initialize a new instance of the Indexer :param output: The output directory of the underlying index :param anaylzer: The overloaded analyzer to work with """ self.output = kwargs.get("root", "index") if not os.path.exists(self.output): os.mkdir(self.output) self.analyzer = kwargs.get("analyzer", StandardAnalyzer(Version.LUCENE_CURRENT)) self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576) self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer) self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) self.store = SimpleFSDirectory(File(self.output)) self.writer = IndexWriter(self.store, self.config) self.create_field_types() def index(self, document): """ Given a new document, add it to the index. :param document: The document to add to the indexer """ try: self.writer.addDocument(document) except Exception: logger.exception("Failed to index the supplied document") def shutdown(self): """ Shutdown the currently processing indexer. """ try: # self.writer.optimize() self.writer.close() except Exception: logger.exception("Failed to shutdown the indexer correctly") def create_field_types(self): """ Create the field types that will be used to specify what actions lucene should take on the various fields supplied to index. """ self.field_clean = FieldType() self.field_clean.setIndexed(True) self.field_clean.setStored(True) self.field_clean.setTokenized(False) self.field_clean.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) self.field_dirty = FieldType() self.field_dirty.setIndexed(True) self.field_dirty.setStored(False) self.field_dirty.setTokenized(True) self.field_dirty.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
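# Illustrative usage sketch, not from the original source: driving the Indexer above with a
# single document built from its two field types, after lucene.initVM() has been called.
# Document and Field are assumed to be imported from org.apache.lucene.document as in the
# other snippets, and the field names are examples only.
indexer = Indexer(root="index")
doc = Document()
doc.add(Field("id", "doc-1", indexer.field_clean))                            # exact-match, stored
doc.add(Field("body", "some raw text worth indexing", indexer.field_dirty))  # analyzed, unstored
indexer.index(doc)
indexer.shutdown()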
def indexDocs(self, root, writer):
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            print "adding", filename
            doc_parser = HTMLDocumentParser()
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'iso-8859-1')
                doc_parser.feed(contents)
                contents = doc_parser.contents
                html_doc = HTMLDocument(contents)

                # Flip this flag to dump the parsed metadata for debugging.
                flag = False
                if flag:
                    print '=============='
                    print 'Title: ' + html_doc.title
                    print 'Description: ' + html_doc.description
                    print 'Month: ' + html_doc.month
                    print 'Year: ' + html_doc.year
                    print 'Authors: ' + str(html_doc.authors)
                    print 'Keywords: ' + str(html_doc.keywords)
                    print 'Timestamp: ' + str(html_doc.timestamp)
                    print ' '
                file.close()

                doc = Document()

                field_filename = FieldType()
                field_filename.setIndexed(True)
                field_filename.setStored(True)
                field_filename.setTokenized(False)
                field_filename.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                doc.add(Field("filename", filename.replace('.html', ''), field_filename))

                field_path = FieldType()
                field_path.setIndexed(True)
                field_path.setStored(True)
                field_path.setTokenized(True)
                field_path.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                doc.add(Field("path", root, field_path))

                field_title = FieldType()
                field_title.setIndexed(True)
                field_title.setStored(True)
                field_title.setTokenized(True)
                field_title.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                doc.add(Field("title", html_doc.title, field_title))

                # Note: the original branched on html_doc.has_description() here,
                # but both branches called setIndexed(True), so the test was a no-op.
                field_description = FieldType()
                field_description.setIndexed(True)
                field_description.setStored(True)
                field_description.setTokenized(True)
                field_description.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                doc.add(Field("description", html_doc.description, field_description))

                field_month = FieldType()
                field_month.setIndexed(True)
                field_month.setStored(True)
                field_month.setTokenized(False)
                field_month.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                doc.add(Field("month", html_doc.month, field_month))

                field_year = FieldType()
                field_year.setIndexed(True)
                field_year.setStored(True)
                field_year.setTokenized(False)
                field_year.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                doc.add(Field("year", html_doc.year, field_year))

                if html_doc.has_authors():
                    field_author = FieldType()
                    field_author.setIndexed(True)
                    field_author.setStored(True)
                    field_author.setTokenized(True)
                    field_author.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                    for author in html_doc.authors:
                        doc.add(Field("author", author, field_author))

                if html_doc.has_keywords():
                    field_keyword = FieldType()
                    field_keyword.setIndexed(True)
                    field_keyword.setStored(True)
                    field_keyword.setTokenized(True)
                    field_keyword.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                    for keyword in html_doc.keywords:
                        doc.add(Field("keyword", keyword, field_keyword))

                field_timestamp = FieldType()
                field_timestamp.setIndexed(False)
                field_timestamp.setStored(True)
                field_timestamp.setTokenized(False)
                field_timestamp.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
                doc.add(Field("timestamp", html_doc.timestamp, field_timestamp))

                if len(contents) > 0:
                    field_source = FieldType()
                    field_source.setIndexed(True)
                    field_source.setStored(True)
                    field_source.setTokenized(True)
                    field_source.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
                    doc.add(Field("contents", contents, field_source))
                else:
                    print "warning: no content in %s" % filename

                writer.addDocument(doc)
            except Exception as e:
                print "Failed in indexDocs:", e
def indexDocs(root, writer):
    """
    indexed: name, title, content
    stored:  date, name, title, summary

    :param root:
    :param writer:
    :return:
    """
    # index and store
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    # only index, but not store
    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # only store
    t3 = FieldType()
    t3.setIndexed(False)
    t3.setStored(True)
    t3.setTokenized(False)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    for root, dirnames, filenames in os.walk(root):
        print filenames
        for filename in filenames:
            if not filename.endswith('.md'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'utf-8')
                file.close()

                date, name = get_date_name(filename)
                title, content = get_post_title_content(contents)
                summary = content[:200] if content else ''
                print date, name, title

                doc = Document()
                doc.add(Field('date', date, t3))
                doc.add(Field('name', name, t1))
                doc.add(Field('title', title, t1))
                doc.add(Field('content', content, t2))
                doc.add(Field('summary', summary, t3))
                # doc.add(Field("name", filename, t1))
                # doc.add(Field("path", root, t1))
                # if len(contents) > 0:
                #     doc.add(Field("contents", contents, t2))
                # else:
                #     print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception as e:
                print "Failed in indexDocs:", e
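# The helpers used above (get_date_name, get_post_title_content) are not shown
# in this snippet. A minimal sketch of what they might look like, assuming
# Jekyll-style filenames ("2015-06-01-my-post.md") and a leading "# Title"
# heading in the markdown body -- adjust to the real post layout:
def get_date_name(filename):
    stem = filename[:-len('.md')]
    parts = stem.split('-')
    date = '-'.join(parts[:3])   # e.g. "2015-06-01"
    name = '-'.join(parts[3:])   # e.g. "my-post"
    return date, name

def get_post_title_content(contents):
    lines = contents.split('\n')
    title = lines[0].lstrip('# ').strip() if lines else ''
    content = '\n'.join(lines[1:]).strip()
    return title, content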
def indexDocs(self, url, writer):
    type1 = FieldType()
    type1.setIndexed(True)
    type1.setStored(True)
    type1.setTokenized(False)
    type1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    type2 = FieldType()
    type2.setIndexed(True)
    type2.setStored(True)
    type2.setTokenized(True)
    type2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    # Read Feeds
    feeds = feedparser.parse(url)
    for item in feeds["entries"]:
        print "adding", item["title"]
        try:
            link = item["link"]
            contents = item["description"].encode("utf-8")
            contents = re.sub('<[^<]+?>', '', ''.join(contents))
            title = item["title"]

            doc = Document()
            doc.add(Field("url", link, type1))
            doc.add(Field("title", title, type1))
            if len(contents) > 0:
                doc.add(Field("contents", contents, type2))
            else:
                print "warning: no content in %s" % item["title"]
            writer.addDocument(doc)
        except Exception as e:
            print "Failed in indexDocs:", e
def index_wiki(wiki_xmlfile, index_directory_name):
    # Initialize index directory and analyzer.
    version = Version.LUCENE_CURRENT
    store = FSDirectory.open(File(index_directory_name))
    analyzer = StandardAnalyzer(version)

    # Create the writer config.
    config = IndexWriterConfig(version, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(store, config)

    # Set document content field type.
    content_fieldtype = FieldType()
    content_fieldtype.setIndexed(True)
    content_fieldtype.setStored(True)
    content_fieldtype.setTokenized(True)
    content_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document title field type.
    title_fieldtype = FieldType()
    title_fieldtype.setIndexed(True)
    title_fieldtype.setStored(True)
    title_fieldtype.setTokenized(True)
    title_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # Set document url field type.
    url_fieldtype = FieldType()
    url_fieldtype.setIndexed(True)
    url_fieldtype.setStored(True)
    url_fieldtype.setTokenized(False)
    url_fieldtype.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for xmldoc in wikicorpusxml(wiki_xmlfile):
        content = xmldoc.partition('>')[2].partition('<')[0].strip()
        title = xmldoc.partition(' title="')[2].partition('"')[0].strip()
        url = xmldoc.partition(' url="')[2].partition('"')[0].strip()
        doc = Document()
        doc.add(Field("contents", content, content_fieldtype))
        doc.add(Field("title", title, title_fieldtype))
        doc.add(Field("url", url, url_fieldtype))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
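# Illustrative companion to index_wiki (not part of the original snippet): a
# minimal keyword search over the index it builds, assuming the same PyLucene
# 4.x classes used above plus IndexSearcher, DirectoryReader and QueryParser
# are imported. The "contents"/"title"/"url" field names match index_wiki.
def search_wiki(index_directory_name, query_string, n=10):
    store = FSDirectory.open(File(index_directory_name))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(
        QueryParser.escape(query_string))
    for hit in searcher.search(query, n).scoreDocs:
        doc = searcher.doc(hit.doc)
        print doc.get("title"), doc.get("url")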
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'gbk')
                file.close()

                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field("contents", contents, t2))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception as e:
                print "Failed in indexDocs:", e
class LuceneDocumentField(object):
    """Internal handler class for possible field types"""

    def __init__(self):
        """Init possible field types"""

        # FIELD_ID: stored, indexed, non-tokenized
        self.field_id = FieldType()
        self.field_id.setIndexed(True)
        self.field_id.setStored(True)
        self.field_id.setTokenized(False)

        # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
        # for storing IDs with term vector info
        self.field_id_tv = FieldType()
        self.field_id_tv.setIndexed(True)
        self.field_id_tv.setStored(True)
        self.field_id_tv.setTokenized(False)
        self.field_id_tv.setStoreTermVectors(True)

        # FIELD_TEXT: stored, indexed, tokenized, with positions
        self.field_text = FieldType()
        self.field_text.setIndexed(True)
        self.field_text.setStored(True)
        self.field_text.setTokenized(True)

        # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_tv = FieldType()
        self.field_text_tv.setIndexed(True)
        self.field_text_tv.setStored(True)
        self.field_text_tv.setTokenized(True)
        self.field_text_tv.setStoreTermVectors(True)

        # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_tvp = FieldType()
        self.field_text_tvp.setIndexed(True)
        self.field_text_tvp.setStored(True)
        self.field_text_tvp.setTokenized(True)
        self.field_text_tvp.setStoreTermVectors(True)
        self.field_text_tvp.setStoreTermVectorPositions(True)

    def get_field(self, type):
        """Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        elif type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        elif type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        elif type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        elif type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        else:
            raise Exception("Unknown field type")
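# Usage sketch (illustrative): how a caller might combine these field types
# into a Lucene Document. The Lucene.FIELDTYPE_* constants referenced by
# get_field() are assumed to be defined on the surrounding Lucene wrapper
# class; the "id"/"content" field names are assumptions for this example.
def make_document(doc_id, text, fields):
    """fields is a LuceneDocumentField instance."""
    doc = Document()
    doc.add(Field("id", str(doc_id), fields.get_field(Lucene.FIELDTYPE_ID)))
    doc.add(Field("content", text, fields.get_field(Lucene.FIELDTYPE_TEXT_TVP)))
    return doc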
class LuceneSearch():

    def __init__(self):
        self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None
        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)
            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()

    def get_title_id_map(self):
        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        title_id = {}
        id_title = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            title = doc['title']
            title_id[title] = idd
            id_title[idd] = title
        return title_id, id_title

    def add_doc(self, doc_id, title, txt, add_terms):
        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):
        print 'Loading Vocab...'
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0
        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):
        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
        return out

    def search_multithread_part(self, q):
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:
            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print 'Unexpected error when processing query:', str(q)
                print 'Using query "dummy".'
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)
            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = map(int, doc['word_idx'].split(' '))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]
            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):
        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
                except:
                    print 'Unexpected error when processing query:', str(q)
                    print 'Using query "dummy".'
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)
                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = map(int, doc['word_idx'].split(' '))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]
                out.append(c)
        return out

    def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand

        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in itertools.izip(out, terms):
                for cand_id, term in itertools.izip(outt.keys()[:max_full_cand], termss.values()):
                    outt[cand_id] = term

        if save_cache:
            for q, c in itertools.izip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    # traverse through the doc directory
    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            # only if this file ends with '.c'
            if not filename.endswith('.c'):
                continue
            try:
                # only add the filename and path for indexing
                path = os.path.join(root, filename)
                print "adding file : ", path
                file = open(path)
                contents = unicode(file.read(), 'utf-8')
                file.close()

                doc = Document()
                doc.add(Field("name", filename, t1))
                doc.add(Field("path", root, t1))
                # if len(contents) > 0:
                #     doc.add(Field("contents", contents, t2))
                # else:
                #     print "warning: no content in ", filename
                writer.addDocument(doc)
            except Exception as e:
                print "failed in indexDocs:", e
def indexTable(self, writer):
    # connection
    con = None

    # define the index types of all the fields
    # ---------- step 2 ----------
    con = mdb.connect('localhost', 'root', 'testgce', 'moviedata')

    # t_num = FieldType.NumericType  -- this is wrong!
    t_num = FieldType()
    t_num.setStored(False)

    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    t3 = FieldType()
    t3.setIndexed(True)
    t3.setStored(True)
    t3.setTokenized(True)
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    with con:
        # Careful with codecs
        con.set_character_set('utf8')
        cur = con.cursor()
        # Again the codecs
        cur.execute('SET NAMES utf8;')
        cur.execute('SET CHARACTER SET utf8;')
        cur.execute('SET character_set_connection=utf8;')

        # ---------- step 3 ----------
        cur.execute("SELECT * FROM movie_items")
        numrows = int(cur.rowcount)
        print 'numrows:', numrows
        for i in range(numrows):
            row = cur.fetchone()

            # ---------- step 4 ----------
            summary = row[SUMMARY]
            subject_id = row[SUBJECT_ID]
            print 'id' + subject_id
            # print 'summary' + summary + 'end'

            doc = Document()
            # fields which should not be analyzed
            doc.add(FloatField("rating_average", float(row[RATING_AVERAGE]), Field.Store.NO))
            doc.add(FloatField("rating_stars", float(row[RATING_STARS]), Field.Store.NO))
            doc.add(IntField("reviews_count", int(row[REVIEWS_COUNT]), Field.Store.NO))
            # doc.add(FloatField("year", float(row[YEAR]), Field.Store.NO))
            doc.add(IntField("collect_count", int(row[COLLECT_COUNT]), Field.Store.NO))
            doc.add(IntField("subject_id", int(subject_id), Field.Store.YES))
            doc.add(IntField("comments_count", int(row[COMMENTS_COUNT]), Field.Store.NO))
            doc.add(IntField("ratings_count", int(row[RATINGS_COUNT]), Field.Store.NO))
            doc.add(Field("image_small", row[IMAGE_SMALL], t1))

            # fields which should be analyzed with WhitespaceAnalyzer
            doc.add(Field("countries", row[COUNTRIES].replace(delim, ' '), t3))
            doc.add(Field("casts", row[CASTS].replace(delim, ' '), t3))
            doc.add(Field("genres", row[GENRES].replace(delim, ' '), t3))
            doc.add(Field("subtype", row[SUBTYPE].replace(delim, ' '), t2))
            doc.add(Field("directors", row[DIRECTORS].replace(delim, ' '), t3))

            user_tags_str = ''
            others_like_str = ''
            # print 'user_tags' + row[USER_TAGS]
            # print 'others_like' + row[OTHERS_LIKE]
            if row[USER_TAGS] != '':
                for tag_pair in row[USER_TAGS].split(delim):
                    # the string ends with the delimiter, so the last element
                    # after split is an empty string
                    if tag_pair != '':
                        user_tags_str = user_tags_str + ' ' + tag_pair.split(delim_uo)[0]
            if row[OTHERS_LIKE] != '':
                for like_pair in row[OTHERS_LIKE].split(delim):
                    if like_pair != '':
                        others_like_str = others_like_str + ' ' + like_pair.split(delim_uo)[1]
            # print user_tags_str
            # print others_like_str
            doc.add(Field("user_tags", user_tags_str, t3))
            doc.add(Field("others_like", others_like_str, t3))

            # fields which should be analyzed with a good analyzer
            doc.add(Field("title", row[TITLE], t3))
            doc.add(Field("original_title", row[ORIGINAL_TITLE], t2))
            doc.add(Field("summary_segmentation", row[SUMMARY_SEGMENTATION], t2))
            doc.add(Field("aka", row[AKA], t2))

            if len(summary) > 0:
                print subject_id + '--->' + ':\n ' + row[TITLE]
                try:
                    summary_unicoded = unicode(summary, 'utf-8')  # test the encoding
                except Exception as e:
                    print "Decode Failed: ", e
                doc.add(Field('summary', summary, t2))
            else:
                print "warning:\n" + subject_id + '---> No content!'
            writer.addDocument(doc)
def _getFullStrings(self):
    mergePolicy = LogDocMergePolicy()
    mergePolicy.setMergeFactor(97)

    directory = RAMDirectory()
    self.dirs.append(directory)
    writer = self.getWriter(directory=directory,
                            analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
                            maxBufferedDocs=4, mergePolicy=mergePolicy)

    onlyStored = FieldType()
    onlyStored.setStored(True)

    fixedLen = self.getRandomNumber(2, 8)
    fixedLen2 = self.getRandomNumber(1, 4)

    for i in xrange(NUM_STRINGS):
        doc = Document()

        num = self.getRandomCharString(self.getRandomNumber(2, 8), 48, 52)
        doc.add(Field("tracer", num, onlyStored))
        doc.add(StringField("string", num, Field.Store.NO))
        if self.supportsDocValues:
            if self.dvStringSorted:
                doc.add(SortedDocValuesField("string_dv", BytesRef(num)))
            else:
                doc.add(BinaryDocValuesField("string_dv", BytesRef(num)))

        num2 = self.getRandomCharString(self.getRandomNumber(1, 4), 48, 50)
        doc.add(StringField("string2", num2, Field.Store.NO))
        if self.supportsDocValues:
            if self.dvStringSorted:
                doc.add(SortedDocValuesField("string2_dv", BytesRef(num2)))
            else:
                doc.add(BinaryDocValuesField("string2_dv", BytesRef(num2)))
        doc.add(Field("tracer2", num2, onlyStored))

        for f2 in doc.getFields():
            if f2.fieldType().indexed() and not f2.fieldType().omitNorms():
                Field.cast_(f2).setBoost(2.0)

        numFixed = self.getRandomCharString(fixedLen, 48, 52)
        doc.add(Field("fixed_tracer", numFixed, onlyStored))
        doc.add(StringField("string_fixed", numFixed, Field.Store.NO))
        if self.supportsDocValues:
            if self.dvStringSorted:
                doc.add(SortedDocValuesField("string_fixed_dv", BytesRef(numFixed)))
            else:
                doc.add(BinaryDocValuesField("string_fixed_dv", BytesRef(numFixed)))

        num2Fixed = self.getRandomCharString(fixedLen2, 48, 52)
        doc.add(StringField("string2_fixed", num2Fixed, Field.Store.NO))
        if self.supportsDocValues:
            if self.dvStringSorted:
                doc.add(SortedDocValuesField("string2_fixed_dv", BytesRef(num2Fixed)))
            else:
                doc.add(BinaryDocValuesField("string2_fixed_dv", BytesRef(num2Fixed)))
        doc.add(Field("tracer2_fixed", num2Fixed, onlyStored))

        for f2 in doc.getFields():
            if f2.fieldType().indexed() and not f2.fieldType().omitNorms():
                Field.cast_(f2).setBoost(2.0)

        writer.addDocument(doc)

    writer.close()
    return self.getSearcher(directory=directory)
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    count = 0
    filedic = {}
    urlfile = open("sportsinformation.txt", "r")
    for line in urlfile.readlines():
        urlname, pagename, newsdate = line.split('*my_sep*')
        newsdate = newsdate.strip('\n')
        if len(newsdate) == 0:
            continue
        filedic[pagename] = [urlname, newsdate]
        # pageANDurl = line.split('\t')
        # urlname = pageANDurl[0]
        # webpage = pageANDurl[1].strip('\n')
        # filedic[webpage] = urlname
    urlfile.close()

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if (filename.endswith('apk') or filename.endswith('pdf')
                    or filename.endswith('exe') or filename.endswith('rar')
                    or filename.endswith('zip')):
                print filename, " SKIP THIS FILE!"
                continue
            count += 1
            print "adding", filename, " ", count
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = unicode(file.read(), 'utf8', 'ignore')
                soup = BeautifulSoup(contents, "html.parser")
                title = soup.find("title")
                if title:
                    title = title.text
                else:
                    title = "This page has no title"
                contents = ''.join(soup.findAll(text=True))
                contents = ' '.join(jieba.cut(contents))
                file.close()

                url = filedic[filename][0]
                newsdate = str(filedic[filename][1])
                print type(newsdate)
                print type(url)

                doc = Document()
                doc.add(Field("title", title, t1))
                doc.add(Field("url", url, t1))
                doc.add(Field("date", newsdate, t1))

                # index the host and each of its parent domains, e.g.
                # "sports.example.com" -> "com example.com sports.example.com"
                sites = []
                site = urlparse.urlparse(url).netloc
                siteparts = site.split(".")
                length = len(siteparts)
                while length > 0:
                    length -= 1
                    site = '.'.join(siteparts[length:])
                    sites.append(site)
                site = ' '.join(sites)
                doc.add(Field("site", site, t2))

                if len(contents) > 0:
                    print "yes"
                    doc.add(Field("contents", contents, t2))
                else:
                    print "warning: no content in %s" % filename
                writer.addDocument(doc)
            except Exception as e:
                print "Failed in indexDocs:", e
def index_docs(self, tweets, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)

    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            contents = []
            for word in tweet[1].text.split():
                if word.startswith("http://") or word.startswith("https://"):
                    continue
                contents.append(word)
            contents = " ".join(contents)
            if len(contents) == 0:
                continue

            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception as e:
            print "Failed in index_docs:", e
def _create_index(self, index_dir: str) -> None:
    """Index documents

    Parameters
    ----------
    index_dir : str
        The dir to store index
    """
    os.mkdir(index_dir)

    TITLE_FIELD = FieldType()  # pylint: disable=invalid-name
    TITLE_FIELD.setStored(True)
    TITLE_FIELD.setIndexOptions(IndexOptions.DOCS)

    TEXT_FIELD = FieldType()  # pylint: disable=invalid-name
    TEXT_FIELD.setStored(True)
    TEXT_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    fs_dir = MMapDirectory(Paths.get(index_dir))
    writer_config = IndexWriterConfig(StandardAnalyzer())
    writer_config.setRAMBufferSizeMB(16384.0)  # 16g
    self.writer = IndexWriter(fs_dir, writer_config)
    logger.info("%d docs in index", self.writer.numDocs())
    logger.info("Indexing documents...")

    doc_ids = self.doc_db.get_doc_ids()
    for doc_id in tqdm(doc_ids, total=len(doc_ids)):
        text = self.doc_db.get_doc_text(doc_id)
        doc = Document()
        doc.add(Field("title", doc_id, TITLE_FIELD))
        doc.add(Field("text", text, TEXT_FIELD))
        self.writer.addDocument(doc)

    logger.info("Indexed %d docs.", self.writer.numDocs())
    self.writer.forceMerge(1)  # to increase search performance
    self.writer.close()
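# Illustrative companion (not part of the original class): a minimal search
# over the index built by _create_index, assuming the same PyLucene imports
# plus IndexSearcher, DirectoryReader and QueryParser are available. Field
# names ("title", "text") match the indexing code above.
def _search(self, index_dir: str, query_string: str, k: int = 10):
    fs_dir = MMapDirectory(Paths.get(index_dir))
    searcher = IndexSearcher(DirectoryReader.open(fs_dir))
    query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(query_string))
    hits = searcher.search(query, k)
    # return (doc title, score) pairs for the top-k hits
    return [(searcher.doc(hit.doc).get("title"), hit.score) for hit in hits.scoreDocs]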
def _createNoTermsFrequencyFieldType():
    f = FieldType()
    f.setIndexed(True)
    f.setTokenized(True)
    f.setOmitNorms(True)
    f.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY)
    f.freeze()
    return f
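# Usage sketch (illustrative): DOCS_ONLY plus omitNorms() yields a boolean-style
# field where term frequencies, positions, and length norms are all dropped, so
# scoring ignores how often a term occurs. The "tags" field name and sample
# value below are assumptions for the example; Document/Field are the same
# PyLucene classes used throughout this file.
NO_TF_TYPE = _createNoTermsFrequencyFieldType()
doc = Document()
doc.add(Field("tags", "lucene pylucene indexing", NO_TF_TYPE))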