def indexer(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    def replacer(text):
        chars = '\\`*_{}[]()>#+-.!$‘'
        for c in chars:
            if c in text:
                text = text.replace(c, ' ')
        return text

    for root, dirnames, filenames in os.walk(root):
        i = 0
        for filename in filenames:
            i += 1
            with open(os.path.join(root, filename)) as f:
                for line in f.readlines():
                    line = line.split(' ', 2)
                    docname = line[0] + ' ' + line[1]
                    name = replacer(line[0])
                    contents = line[2]
                    doc = Document()
                    doc.add(Field('docname', docname, t1))
                    doc.add(Field('name', name, t1))
                    doc.add(Field('contents', contents, t1))
                    writer.addDocument(doc)
            print('File %d done indexing' % i)
def indexsents(self, sentences, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for i, sent in enumerate(sentences):
        # print "adding", i, sent
        try:
            root = os.getcwd()
            # contents = unicode(sent, 'iso-8859-1')
            doc = Document()
            doc.add(Field("name", str(i), t1))
            doc.add(Field("path", root, t1))
            if len(sent) > 0:
                doc.add(Field("contents", sent.lower(), t2))
            else:
                print "warning: no content in %s" % str(i)
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexsents:", e
def build_index(self, dict_data):
    print("loading data...")

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for k, v in dict_data.items():
        doc = Document()
        doc.add(Field("id", k, t1))
        doc.add(Field("content", v, t2))
        self.writer.addDocument(doc)

    ticker = Ticker()
    print("commit index")
    threading.Thread(target=ticker.run).start()
    self.writer.commit()
    self.writer.close()
    ticker.tick = False
    print("done")
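# Ticker is used by build_index above but not defined in this snippet. The sketch
# below is an assumed, minimal equivalent modeled on the Ticker helper in the
# PyLucene IndexFiles sample: it prints a dot every second until tick is set to
# False, so the console shows progress while writer.commit() runs.
import sys
import time

class Ticker(object):

    def __init__(self):
        self.tick = True

    def run(self):
        while self.tick:
            sys.stdout.write('.')
            sys.stdout.flush()
            time.sleep(1.0)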
def build(self, index):
    writer = self.getWriter(directory=index.index,
                            analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT))

    seed(101)
    for d in xrange(self.minId, self.maxId + 1):
        doc = Document()
        doc.add(Field("id", self.pad(d), StringField.TYPE_STORED))
        if index.allowNegativeRandomInts:
            r = randint(~self.MAX_INT, self.MAX_INT)
        else:
            r = randint(0, self.MAX_INT)
        if index.maxR < r:
            index.maxR = r
        if r < index.minR:
            index.minR = r
        doc.add(Field("rand", self.pad(r), StringField.TYPE_STORED))
        doc.add(Field("body", "body", StringField.TYPE_STORED))
        writer.addDocument(doc)

    writer.commit()
    writer.close()
def indexMovie(movie):
    doc = Document()
    doc.add(Field('id', str(movie), StringField.TYPE_STORED))
    at_least_one_field = False

    maybe_tags = movies_tags.query('item == @movie')
    if not maybe_tags.empty:
        tags = maybe_tags[['tags']].values.flatten()[0]
        doc.add(Field('tags', tags, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    maybe_description = movies_descriptions.query('item == @movie')
    if not maybe_description.empty:
        description = maybe_description[['description']].values.flatten()[0]
        doc.add(Field('description', description, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    maybe_genres = movies_genres.query('item == @movie')
    if not maybe_genres.empty:
        genres = maybe_genres[['genres']].values.flatten()[0]
        doc.add(Field('genres', genres, TextField.TYPE_NOT_STORED))
        at_least_one_field = True

    if at_least_one_field:
        writer.addDocument(doc)
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    inFile = open(str(args["inputFile"]))
    indexName = inFile.readline()
    while indexName != '':
        print "adding", indexName
        doc = Document()
        doc.add(Field("name", indexName, t1))
        # doc.add(Field("path", root, t1))
        text = inFile.readline()
        if len(text) > 0:
            print("contents: %s\n" % text)
            doc.add(Field("contents", text, t2))
        else:
            print "warning: no content in %s" % indexName
        indexName = inFile.readline()
        writer.addDocument(doc)
    inFile.close()
def indexDictionary(d, writer):
    for k, v in d.iteritems():
        doc = Document()
        doc.add(Field('filename', k, Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field('content', v, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    return writer.numDocs()
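# Usage sketch for indexDictionary (not part of the original code): the writer
# setup below assumes the Lucene 4.x-era PyLucene API implied by the
# Field.Store / Field.Index constants above; the index path and analyzer are
# illustrative choices.
import lucene
from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

def index_dictionary_example(d, index_path="index/"):
    lucene.initVM()
    directory = SimpleFSDirectory(File(index_path))
    config = IndexWriterConfig(Version.LUCENE_CURRENT,
                               StandardAnalyzer(Version.LUCENE_CURRENT))
    writer = IndexWriter(directory, config)
    num_docs = indexDictionary(d, writer)  # one document per (filename, content) pair
    writer.close()
    return num_docs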
def wikipedia_indexer(storage, wikipedia_file):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(storage))
    stops = CharArraySet(Version.LUCENE_4_10_1, 0, True)
    for s in stopwords:
        stops.add(s)
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1, stops)
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading Documents"

    f = open(wikipedia_file)
    for i, line in enumerate(f):
        text = line.strip().decode('utf-8').split('\t')
        title = text[0]
        if 'disambigu' in text[0] or len(text) < 2:
            continue
        text = text[1]
        doc = Document()
        doc.add(Field("num", str(i), Field.Store.YES, Field.Index.NO))
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("text", text, Field.Store.NO, Field.Index.ANALYZED))
        writer.addDocument(doc)
        if writer.numDocs() % 1000 == 0:
            print "Indexed (%d docs in index) Last %d" % (writer.numDocs(), i)

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
def addDoc(w, data):
    doc = Document()
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        if type != 'INTEGER_STORED':
            # print('field=%s len=%d' % (field, len(value)))
            print('field=%s value=%s' % (field, value))
        else:
            print('field=%s value=%d' % (field, value))
        '''
        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_NOT_STORED':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_NOT_STORED))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')

    try:
        w.addDocument(doc)
    except:
        # print('error cat=%s' % (data['category'][0]))
        print('-----------------------------------')
        for field in data:
            value, type = data[field][0], data[field][1]
            print('field=%s\nvalue=%s' % (field, str(value)))
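# CUSTOM_FIELD_TEXT and CUSTOM_FIELD_TEXT_NOT_STORED are referenced by addDoc
# above but are not defined in this snippet. A plausible (assumed) definition is
# a pair of reusable FieldTypes for analyzed text that differ only in whether
# the original value is stored:
from org.apache.lucene.document import FieldType
from org.apache.lucene.index import IndexOptions

CUSTOM_FIELD_TEXT = FieldType()
CUSTOM_FIELD_TEXT.setStored(True)
CUSTOM_FIELD_TEXT.setTokenized(True)
CUSTOM_FIELD_TEXT.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

CUSTOM_FIELD_TEXT_NOT_STORED = FieldType()
CUSTOM_FIELD_TEXT_NOT_STORED.setStored(False)
CUSTOM_FIELD_TEXT_NOT_STORED.setTokenized(True)
CUSTOM_FIELD_TEXT_NOT_STORED.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)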
def add_doc(self, doc_id, title, txt, add_terms):
    doc = Document()
    txt = utils.clean(txt)

    if add_terms:
        if prm.top_tfidf > 0:
            words_idx = []
            words, _ = utils.top_tfidf(txt.lower(), self.idf,
                                       prm.top_tfidf, prm.min_term_freq)
            if len(words) == 0:
                words.append('unk')
            for w in words:
                if w in self.vocab:
                    words_idx.append(self.vocab[w])
                else:
                    words_idx.append(-1)  # unknown words.
        else:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab,
                                               prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

    doc.add(Field("id", str(doc_id), self.t1))
    doc.add(Field("title", title, self.t1))
    doc.add(Field("text", txt, self.t2))

    if add_terms:
        doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
        doc.add(Field("word", '<&>'.join(words), self.t3))

    self.writer.addDocument(doc)
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setStoreTermVectors(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPositions(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    file_path = root + 'r52-train-all-terms.txt'
    fd = open(file_path)
    contents = fd.readlines()
    fd.close()
    contents_list = [x.strip() for x in contents]

    for i in xrange(len(contents_list)):
        try:
            [topic, content] = contents_list[i].split('\t')
            doc = Document()
            doc.add(Field("id", str(i), t1))
            doc.add(Field("topic", topic, t1))
            doc.add(Field("contents", content, t2))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
def indexsents(self, sentences, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for i, sent in enumerate(sentences):
        # print('adding', i, sent)
        try:
            root = os.getcwd()
            # contents = unicode(sent, 'iso-8859-1')
            doc = Document()
            doc.add(Field('name', str(i), t1))
            doc.add(Field('path', root, t1))
            if len(sent) > 0:
                doc.add(Field('contents', sent.lower(), t2))
            else:
                print('warning: no content in %s' % str(i))
            writer.addDocument(doc)
        except Exception as e:
            print('Failed in indexsents:', e)

    writer.commit()
    writer.close()
def publish_services(self, service_list):
    transformer = WSDLTransformer()
    current_document = 1
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_CURRENT,
                                     EnglishAnalyzer(Version.LUCENE_CURRENT))
    writerConfig.setSimilarity(BM25Similarity())
    index_writer = IndexWriter(indexDir, writerConfig)

    for wsdl in service_list:
        if self._document_expansion:
            # bag_of_words = ' '.join(self._preprocessor(self._semantic_transformer.transform(transformer.transform(wsdl))))
            bag_of_words = ' '.join(
                self._semantic_transformer.transform(transformer.transform(wsdl)))
        else:
            # bag_of_words = ' '.join(self._preprocessor(transformer.transform(wsdl)))
            bag_of_words = ' '.join(transformer.transform(wsdl))

        doc = Document()
        doc.add(Field("content", bag_of_words, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("path", wsdl, Field.Store.YES, Field.Index.NO))
        index_writer.addDocument(doc)
        current_document += 1

    index_writer.close()
def index_image(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    with open(os.path.join(root, "index.txt"), mode="r", encoding="utf8") as index:
        count = 0  # number of images actually added
        for line in index:
            print("\r", count + 1, end="", sep="")
            try:
                image_url, content = line.strip().split()[:2]
            except ValueError as e:
                print(e)
                continue
            doc = Document()
            doc.add(Field("raw_content", content, t1))
            content = " ".join(
                word for word in jieba.cut_for_search(content)
                if word.strip() and word not in self.stop_words)
            doc.add(Field("url", image_url, t1))
            doc.add(Field("content", content, t2))
            writer.addDocument(doc)
            count += 1
    print("\n{count} image(s) added.".format(count=count))
def indexDocs(self, sourceDir, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for sourceDir, dirnames, filenames in os.walk(sourceDir):
        for filename in filenames:
            if not filename.endswith('.txt'):
                continue
            print(filename)
            try:
                path = os.path.join(sourceDir, filename)
                with open(path, 'r', encoding="utf-8") as file:
                    contents = file.read()
                # contents = str(filecontent, 'utf-8')
                # contents = filecontent.encode('utf-8')
                # print('path', path, len(contents))
                doc = Document()
                doc.add(Field("name", filename, t1))  # filename (title)
                # doc.add(Field("path", root, t1))
                if len(contents) > 0:
                    doc.add(Field(queryField, contents, t2))  # content; queryField is defined elsewhere
                else:
                    print("warning: no content in %s" % filename)
                writer.addDocument(doc)
            except Exception as e:
                print("Failed in indexDocs:", e)
def addDocument(self, writer, new_doc, metadata, fields_to_process, bow_info):
    """
        Add a document to the index. Does this using direct Lucene access.

        :param new_doc: dict of fields with values
        :type new_doc:dict
        :param metadata: ditto
        :type metadata:dict
        :param fields_to_process: only add these fields from the doc dict
        :type fields_to_process:list
    """
    doc = Document()
    total_numTerms = bow_info["total_numterms"]
    # each BOW now comes with its field
    for field in fields_to_process:
        field_object = Field(field, new_doc[field], Field.Store.NO,
                             Field.Index.ANALYZED, Field.TermVector.YES)
        ## boost = math.sqrt(numTerms[field]) / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        boost = 1 / float(math.sqrt(total_numTerms)) if total_numTerms > 0 else float(0)
        field_object.setBoost(float(boost))
        doc.add(field_object)

    json_metadata = json.dumps(metadata)
    # `guid` is not defined in this snippet; it is expected to come from the
    # enclosing scope (most likely the document's unique id in its metadata).
    doc.add(Field("guid", guid, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("bow_info", json.dumps(bow_info), Field.Store.YES, Field.Index.NO))
    doc.add(Field("metadata", json_metadata, Field.Store.YES, Field.Index.NO))
    doc.add(Field("year_from", metadata["year"], Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
def build_index(document_path, dir_path):
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")

            document = Document()
            document.add(Field("content", " ".join(music_tags), t1))
            document.add(Field("url", music_path, t2))
            index_writer.addDocument(document)

    index_writer.close()
def index(indexdir):
    lucene.initVM()
    indexDir = SimpleFSDirectory(File(indexdir))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, EnglishAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    f = open('data/docid.documento-xml.txt')
    st = PorterStemmer()
    for i, line in enumerate(f.readlines()):
        id, xmltext = line.split('\t')
        xmltext = xmltext.rstrip('\n')
        xmldoc = minidom.parseString(xmltext)

        title = xmldoc.getElementsByTagName("TITLE")
        title = "" if len(title) == 0 else title[0].childNodes[0].nodeValue
        authors = xmldoc.getElementsByTagName("AUTHORS")
        authors = "" if len(authors) == 0 else authors[0].childNodes[0].nodeValue
        abstract = xmldoc.getElementsByTagName("ABSTRACT")
        abstract = "" if len(abstract) == 0 else abstract[0].childNodes[0].nodeValue

        doc = Document()
        doc.add(Field("title", title, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("authors", authors, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("abstract", abstract, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", id, Field.Store.YES, Field.Index.NOT_ANALYZED))
        writer.addDocument(doc)

    print "indexed %s docs" % (i + 1)
    writer.close()
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)

    t3 = FieldType()
    t3.setIndexed(True)
    t3.setStored(False)
    t3.setTokenized(True)  # tokenize with the pre-configured analyzer; here terms are split on whitespace
    t3.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    total = 0
    file = open(root, "r")
    for line in file.readlines():
        try:
            imgurl, itemurl, content = line.split('\t')
            total += 1
            print total
            print "adding", content
            contents = ' '.join(jieba.cut(content))
            doc = Document()
            doc.add(Field("imgurl", imgurl, t1))
            doc.add(Field("itemurl", itemurl, t1))
            doc.add(Field("title", content, t1))
            doc.add(Field("contents", contents, t3))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    wikiFile = ZipFile(root, 'r')
    files = wikiFile.namelist()
    i = 0
    for file in files[1:]:
        i += 1
        wiki = wikiFile.open(file, 'r')
        # Note: the outer loop consumes the first raw line of each file before
        # the decoded iteration below processes the remainder.
        for line in wiki:
            for line in codecs.iterdecode(wiki, 'utf8'):
                normalized = unicodedata.normalize('NFD', line).split(' ', 2)
                if not normalized[1].isdigit():
                    continue
                docname = normalized[0] + ' ' + normalized[1]
                name = re.sub(r'[^a-zA-Z0-9]', ' ', normalized[0])
                contents = normalized[2]
                doc = Document()
                doc.add(Field('docname', docname, t1))
                doc.add(Field('name', name, t1))
                doc.add(Field('contents', contents, t1))
                writer.addDocument(doc)
        print('File %d done indexing' % i, file)
def addDoc(w, data):
    doc = Document()
    # print('----------------------------')
    for field in data:
        value, type = data[field][0], data[field][1]
        '''
        print('field:%s type:%s' % (field, type))
        print(value + '\n')
        '''
        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'CUSTOM_FIELD_TEXT_DF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_DF))
        elif type == 'CUSTOM_FIELD_TEXT_BF':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT_BF))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')
    w.addDocument(doc)
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    index_file = open("index.txt", 'r')
    for line in index_file.readlines():
        try:
            src, filename, tag = line.strip().split('\t')[:3]
            path = os.path.join(root, filename)
            doc = Document()
            doc.add(Field("name", filename, t1))
            doc.add(Field("path", root, t1))
            doc.add(Field("src", src, t1))
            if len(tag) > 0:
                doc.add(Field("tag", tag, t2))
            else:
                print "warning: no tag in %s" % filename
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
def index_docs(root, writer):
    # metadata: name and path
    metadata = FieldType()
    metadata.setStored(True)  # as-is value
    metadata.setTokenized(False)
    metadata.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # content: abstract and body
    content_type = FieldType()
    content_type.setStored(True)  # to highlight on search results
    content_type.setTokenized(True)  # tokenize words
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for directory, _, file_names in walk(root):
        for file_name in file_names:
            name, extension = splitext(file_name)
            if extension not in DOC_FORMATS:
                continue  # skip unsupported formats
            file_path = join(directory, file_name)
            print ' ', file_path

            # Build indexed document
            doc = Document()
            doc.add(Field('name', file_name, metadata))
            doc.add(Field('path', directory, metadata))

            # Read file contents
            content = process(file_path, 'utf-8', method='pdfminer')
            abstract = extract_abstract(content)
            doc.add(Field('content', content, content_type))
            doc.add(Field('abstract', abstract, content_type))

            writer.addDocument(doc)
def create_index():
    lucene.initVM()
    if os.path.exists(prm.index_folder):
        shutil.rmtree(prm.index_folder)

    indexDir = SimpleFSDirectory(File(prm.index_folder))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)
    wk = wiki.Wiki(prm.pages_path)

    print "%d docs in index" % writer.numDocs()
    print "Reading files from wikipedia..."

    n = 0
    for l in wk.get_text_iter():
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("id", str(n), Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
        n += 1
        if n % 100000 == 0:
            print 'indexing article', n

    print "Indexed %d docs from wikipedia (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()