def addDoc(w, name, birth_date, death_date, birth_note, death_note):
    doc = Document()
    doc.add(TextField("name", name, Field.Store.YES))
    doc.add(StringField("birth_date", birth_date, Field.Store.YES))
    doc.add(StringField("death_date", death_date, Field.Store.YES))
    doc.add(StringField("birth_note", birth_note, Field.Store.YES))
    doc.add(StringField("death_note", death_note, Field.Store.YES))
    w.addDocument(doc)
def create_doc(data):
    screen_name = data['screen_name']
    tweet = data['tweet']
    tweet_date = data['tweet_date']
    tweet_location = data['tweet_location']
    page_title = data['page_title']
    doc = Document()
    doc.add(TextField("username", screen_name, Field.Store.YES))
    doc.add(TextField("text", tweet, Field.Store.YES))
    doc.add(TextField("date", tweet_date, Field.Store.YES))
    if tweet_location:
        doc.add(TextField("location", tweet_location, Field.Store.YES))
    if page_title:
        doc.add(TextField("page title", page_title, Field.Store.YES))
    return doc
def addDoc(w, doc_name, text, file_name):
    """
    add single doc to the index
    :param w: writer
    :param doc_name:
    :param text:
    :param file_name:
    :return:
    """
    doc = Document()
    # TextField: sequence of terms: tokenized
    doc.add(TextField("text", text, Field.Store.YES))
    # StringField: character strings with all punctuation, spacing, and case preserved.
    doc.add(TextField('doc_name', doc_name, Field.Store.YES))
    # doc.add(StringField('corpus_name', file_name, Field.Store.YES))
    doc.add(TextField('corpus_name', file_name, Field.Store.YES))
    w.addDocument(doc)
def create_document(file_name):
    path = INPUT_DIR + file_name  # assemble the file path
    file = open(path)  # open in read mode
    doc = Document()  # create a new document
    # add the title field (use the file_name parameter, not an undefined name)
    doc.add(StringField("title", file_name, Field.Store.YES))
    # add the whole book
    doc.add(TextField("text", file.read(), Field.Store.YES))
    file.close()  # close the file pointer
    return doc
def get_doc(self, doc_info, contents):
    '''
    Generate a `Document` according to the given info.
    Input: `doc_info`: info of the doc (`name`, `path`, `title`, `url`, `site`)
           `contents`: contents of the webpage
    Output: `Document` with the fields initialized
    '''
    doc = Document()
    doc.add(StringField("name", doc_info['name'], Field.Store.YES))
    doc.add(StringField("path", doc_info['path'], Field.Store.YES))
    doc.add(StringField("title", doc_info['title'], Field.Store.YES))
    doc.add(StringField("url", doc_info['url'], Field.Store.YES))
    doc.add(TextField("site", doc_info['site'], Field.Store.YES))
    if len(contents) > 0:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print("Warning: No content in {}".format(doc_info['name']))
    return doc
def retrival_answer(MAX):
    lucene.initVM()
    directory = RAMDirectory()
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(directory, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from Document..."

    process_doc = open("Huawei_result/document.txt", "r")
    doc_line = process_doc.readlines()
    for l in doc_line:
        doc = Document()
        doc.add(TextField("text", l, Field.Store.YES))
        writer.addDocument(doc)
    print "Indexed from %d docs in index" % (writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()

    accuracy = []
    process_query = open("Huawei_result/query.txt", "r")
    query_line = process_query.readlines()
    for n, one_query in enumerate(query_line):
        analyzer = StandardAnalyzer()
        # reader = IndexReader.open(SimpleFSDirectory(Paths.get('index')))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        # searcher = IndexSearcher(reader)
        query = QueryParser("text", analyzer).parse(one_query)
        hits = searcher.search(query, MAX)
        # print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
        # print "The groundtruth document is:", doc_line[n]
        candidate_doc = []
        for hit in hits.scoreDocs:
            # print hit.score, hit.doc, hit.toString()
            doc = searcher.doc(hit.doc)
            # print doc.get("text").encode("utf-8")
            candidate_doc.append(doc.get("text"))

        choices = process.extract(unicode(doc_line[n]), candidate_doc)
        flag = 0
        for i in range(len(choices)):
            if choices[i][1] >= 89:
                flag = 1
        if flag == 1:
            accuracy.append(1)
        else:
            accuracy.append(0)

    final_accuracy = float(sum(accuracy)) / float(len(accuracy))
    print "the final accuracy is:", final_accuracy
def obj_to_document_old(obj):
    res = Document()
    res.add(StringField('index', str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            res.add(Field(LT_NONE + k, '', Field.Store.YES, Field.Index.NO))
        elif isinstance(v, list):
            if len(v) > 0 and isinstance(v[0], int):
                res.add(TextField(LT_INTLIST + k,
                                  ' '.join((str(x) for x in set(v))),
                                  Field.Store.YES))
            else:
                res.add(TextField(LT_LIST + k,
                                  ' '.join(list(set(v))),
                                  Field.Store.YES))
        elif isinstance(v, str) or isinstance(v, unicode):
            res.add(Field(LT_STRING + k, v, Field.Store.YES, Field.Index.NO))
            res.add(TextField(LT_FOR_QUERY + k,
                              ' '.join(jieba.lcut(v)),
                              Field.Store.NO))
        elif isinstance(v, hyper_text):
            res.add(Field(LT_HYPERTEXT + k, v.raw, Field.Store.YES, Field.Index.NO))
            res.add(TextField(LT_FOR_QUERY + k,
                              ' '.join(jieba.lcut(v.text)),
                              Field.Store.NO))
        elif isinstance(v, bool):
            if v:
                vs = '1'
            else:
                vs = '0'
            res.add(StringField(LT_BOOL + k, vs, Field.Store.YES))
        elif isinstance(v, int) or isinstance(v, long):
            res.add(StringField(LT_INT + k, str(v), Field.Store.YES))
        else:
            raise Exception('unrecognized data type')
    return res
def addDoc(w, text):
    """
    add single doc to the index
    :param w: writer
    :param text: document text
    :return:
    """
    doc = Document()
    # TextField: sequence of terms: tokenized
    doc.add(TextField("text", text, Field.Store.YES))
    w.addDocument(doc)
def make_document(full_path, unix_timestamp, contents):
    """
    Create Lucene document with specific content.
    """
    doc = Document()

    # two separate date fields per recommendation
    # at https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/document/DateTools.html
    doc.add(LongPoint('date_for_pointrangequery', int(unix_timestamp)))
    doc.add(StoredField('last_modified_time', int(unix_timestamp)))

    # https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/document/TextField.html
    # indexed and tokenized
    doc.add(TextField('fullpath', full_path, Field.Store.YES))  # this is the file key but tokenized
    doc.add(TextField('body', contents, Field.Store.YES))

    # It is also possible to add fields that are indexed but not tokenized.
    # See https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/document/StringField.html
    # However there is a limitation: https://stackoverflow.com/a/32654329/130164
    # MultiFieldQueryParser will have bizarre results because the query parser runs the analyzer,
    # while StringField does not run the analyzer.
    # We deliberately store the key as untokenized so we can search by it directly with a TermQuery.
    doc.add(StringField('key', full_path, Field.Store.YES))  # this is the file key

    return doc
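# Query-side sketch to go with make_document above (not part of the original
# snippet): it shows why 'key' is kept untokenized (exact TermQuery lookup, no
# analyzer involved) and how the LongPoint field supports range queries.
# `index_dir` and the timestamp bounds are assumed/hypothetical inputs.
from java.nio.file import Paths
from org.apache.lucene.document import LongPoint
from org.apache.lucene.index import DirectoryReader, Term
from org.apache.lucene.search import IndexSearcher, TermQuery
from org.apache.lucene.store import SimpleFSDirectory

def lookup_examples(index_dir, full_path, start_ts, end_ts):
    searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(Paths.get(index_dir))))
    # exact-match lookup on the untokenized key field
    key_hits = searcher.search(TermQuery(Term('key', full_path)), 10)
    # range query over the LongPoint timestamp field
    date_query = LongPoint.newRangeQuery('date_for_pointrangequery', start_ts, end_ts)
    date_hits = searcher.search(date_query, 10)
    return key_hits, date_hits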
def get_doc(self, img):
    '''
    Generate a `Document` according to the parameters.
    Input: `img`: dict containing a single image info
    Output: `Document` with the fields initialized
    '''
    doc = Document()
    doc.add(StringField("img_url", img['img_url'], Field.Store.YES))
    doc.add(TextField("description", img['description'], Field.Store.YES))
    doc.add(StringField("url", img['url'], Field.Store.YES))
    doc.add(StringField("url_title", img['url_title'], Field.Store.YES))
    return doc
def index(cls, indexDir, taxoDir, facets_config):
    """Create an index, and adds to it sample documents and facets.
    indexDir Directory in which the index should be created.
    taxoDir Directory in which the taxonomy index should be created.
    """
    # create and open an index writer
    config = IndexWriterConfig(Version.LUCENE_48,
                               WhitespaceAnalyzer(Version.LUCENE_48))
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    iw = IndexWriter(indexDir, config)
    # create and open a taxonomy writer
    taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE)

    # loop over sample documents
    nDocsAdded = 0
    nFacetsAdded = 0
    for docNum in range(len(docTexts)):
        # create a plain Lucene document and add some regular Lucene fields to it
        doc = Document()
        doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES))
        doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO))
        # obtain the sample facets for current document
        facets = categories[docNum]
        author = authors[docNum]
        # ... and use the FacetField class for adding facet fields to
        # the Lucene document (and via FacetsConfig to the taxonomy index)
        doc.add(FacetField("Author", author))
        for f in facets:
            doc.add(FacetField("Categories", f))
        # finally add the document to the index
        iw.addDocument(facets_config.build(taxo, doc))
        nDocsAdded += 1

    # close the taxonomy index and the index - all modifications are
    # now safely in the provided directories: indexDir and taxoDir.
    iw.close()
    taxo.close()

    print "Indexed %d documents with facets." % nDocsAdded
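# Hedged companion sketch (not from the original source): counting the facets
# written by the indexing method above, using the standard Lucene 4.x facet
# classes. The reader/searcher setup here is an assumption about how the
# directories above would be reopened for search.
from org.apache.lucene.facet import FacetsCollector
from org.apache.lucene.facet.taxonomy import FastTaxonomyFacetCounts
from org.apache.lucene.facet.taxonomy.directory import DirectoryTaxonomyReader
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher, MatchAllDocsQuery

def count_facets(indexDir, taxoDir, facets_config):
    searcher = IndexSearcher(DirectoryReader.open(indexDir))
    taxo_reader = DirectoryTaxonomyReader(taxoDir)
    collector = FacetsCollector()
    # match all documents, collecting facet ordinals as a side effect
    FacetsCollector.search(searcher, MatchAllDocsQuery(), 10, collector)
    facets = FastTaxonomyFacetCounts(taxo_reader, facets_config, collector)
    # top authors and categories across the whole index
    print(facets.getTopChildren(10, "Author"))
    print(facets.getTopChildren(10, "Categories"))
    taxo_reader.close()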
def create_document(line):
    doc = Document()
    line = line.split()
    keyterm = line[0]
    doc.add(StringField("keyterm", keyterm, Field.Store.YES))
    index = line[1]
    doc.add(StringField("Sno", index, Field.Store.YES))
    del line[0:2]
    line = ' '.join(line)
    qterm = keyterm.replace("_", " ")
    if qterm not in line:
        line = qterm + ' ' + line
    doc.add(TextField("text", line, Field.Store.YES))
    return doc
def indexing(datadir):
    indexedDocs = 0
    #index_outdir = str(input("Enter index output dir: "))
    path = Paths.get('indexOut')
    indexOut = SimpleFSDirectory(path)
    analyzer = EnglishAnalyzer()
    config = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexOut, config)
    for filename in glob.iglob(datadir + '/*.json*', recursive=True):
        try:
            print("Filename is", filename)
            #pdb.set_trace()
            with open(filename) as f:
                for line in f:
                    tweet = json.loads(line)
                    if tweet['lang'] == 'en':
                        # create a fresh Document per tweet; reusing a single Document
                        # across iterations would keep accumulating fields on it
                        doc = Document()
                        doc.add(StringField("id", tweet['id_str'], Field.Store.YES))
                        # doc.add(Field("screen_name", tweet['user.screen_name']))
                        # print(tweet['user.screen_name'])
                        # doc.add(Field("name", tweet['user.name']))
                        #doc.add(Field("location", tweet['user.location']))
                        #print(tweet['user.location'])
                        doc.add(TextField("text", tweet['text'], Field.Store.YES))
                        #doc.add(Field("created_at", DateTools.stringToDate(tweet['created_at']), Field.Store.YES))
                        doc.add(TextField("created_at", tweet['created_at'], Field.Store.YES))
                        # doc.add(IntPoint("followers", tweet['user.followers_count'], Field.Store.YES))
                        # doc.add(IntPoint("friends", tweet['friends_count'], Field.Store.YES))
                        writer.addDocument(doc)
                        writer.commit()
                        indexedDocs += 1
        except:
            continue
    writer.close()
    print("Indexed ", indexedDocs, " documents")
def testScore(self):
    reactor = CallTrace('reactor')
    settings = LuceneSettings(commitCount=1, similarity=TermFrequencySimilarity(), verbose=False)
    lucene = Lucene(join(self.tempdir, 'lucene'), reactor=reactor, settings=settings)
    document = Document()
    document.add(TextField('field', 'x ' * 100, Field.Store.NO))
    returnValueFromGenerator(lucene.addDocument(identifier="identifier", document=document))

    q = TermQuery(Term("field", 'x'))
    result = returnValueFromGenerator(lucene.executeQuery(q))
    self.assertAlmostEqual(0.1, result.hits[0].score)

    q.setBoost(10.0)
    result = returnValueFromGenerator(lucene.executeQuery(q))
    self.assertAlmostEqual(1, result.hits[0].score)
def index_single_file(self, doc_file):
    logger.info("adding {}".format(doc_file))
    single_file_num = 0
    try:
        with open(doc_file) as df:
            for line in df:
                para_no = 1
                wiki_doc = json.loads(line)
                doc_title = wiki_doc['title']
                doc_text = wiki_doc['plaintext']
                doc_id = wiki_doc['_id']
                paragraphs = doc_text.split('\n\n')
                if len(paragraphs) < 3:
                    continue
                # logger.info('doc_id:', doc_id, 'title:', doc_title, 'para_num:', len(paragraphs))
                for para in paragraphs:
                    para = rm_white_space(para)
                    if len(word_tokenize(para)) < 50:
                        continue
                    para_id = '{}_{}'.format(doc_id, para_no)
                    doc = Document()
                    doc.add(StringField("id", para_id, Field.Store.YES))
                    doc.add(TextField("title", doc_title, Field.Store.YES))
                    doc.add(TextField("text", para, Field.Store.YES))
                    self.writer.addDocument(doc)
                    para_no += 1
                    single_file_num += 1
                    if single_file_num % 10000 == 0:
                        logger.info('added {} lucene docs (paragraphs)'.format(single_file_num))
    except Exception as e:
        import traceback
        traceback.print_tb(e.__traceback__)
        logger.error("Failed in: {}".format(doc_file))
    return single_file_num
def createDoc(self, url, html, duplicate):
    title, contents = self.parseHtml(url, html)
    doc = Document()
    doc.add(StringField("title", title, Field.Store.YES))
    doc.add(StringField("url", url, Field.Store.YES))
    doc.add(StringField("duplicate", str(duplicate).lower(), Field.Store.YES))
    if len(contents) > 0:
        doc.add(TextField("contents", contents, Field.Store.YES))
    else:
        print "Warning: No content in %s" % url
    return doc
def addDoc(w, data):
    doc = Document()
    #print ('----------------------------')
    for field in data:
        value, type = data[field][0], data[field][1]
        if type == 'StringField':
            doc.add(StringField(field, value, Field.Store.YES))
        elif type == 'TextField':
            doc.add(TextField(field, value, Field.Store.YES))
        elif type == 'CUSTOM_FIELD_TEXT':
            doc.add(Field(field, value, CUSTOM_FIELD_TEXT))
        elif type == 'INTEGER_STORED':
            doc.add(StoredField(field, value))
        else:
            print('UNKNOWN FIELD')
    w.addDocument(doc)
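# Hedged usage sketch for the dispatcher above (not from the original source):
# `data` maps a field name to a (value, type) pair, where the type string picks
# the branch. The writer is assumed to be created elsewhere; the field names and
# values below are purely illustrative.
example_data = {
    'title': ('The Old Man and the Sea', 'TextField'),    # tokenized and stored
    'isbn': ('978-0684801223', 'StringField'),            # kept as a single exact term
    'pages': (127, 'INTEGER_STORED'),                     # stored only, not searchable
}
# addDoc(writer, example_data)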
def build_index(file_dir):
    indexDir = SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/"))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(indexDir, config)

    # t1 = FieldType()
    # t1.setStored(True)
    # t1.setTokenized(False)
    # t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    #
    # t2 = FieldType()
    # t2.setStored(True)
    # t2.setTokenized(True)
    # t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    print("%d docs in index" % writer.numDocs())
    if writer.numDocs():
        print("Index already built.")
        return
    with open(file_dir + "/train/train.ast.src") as fc:
        codes = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in fc.readlines()
        ]

    for k, code in enumerate(codes):
        doc = Document()
        doc.add(StoredField("id", str(k)))
        doc.add(TextField("code", code, Field.Store.YES))
        writer.addDocument(doc)
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()
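# Hedged retrieval sketch to pair with build_index above (not from the original
# source): it queries the "code" field with the same WhitespaceAnalyzer and the
# same character cleanup used at index time. `file_dir` and `query_text` are
# assumed inputs; the function name is hypothetical.
import re
from java.nio.file import Paths
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory

def search_code(file_dir, query_text, top_k=10):
    indexDir = SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/"))
    searcher = IndexSearcher(DirectoryReader.open(indexDir))
    # strip the same operators/punctuation that were removed before indexing
    cleaned = re.sub("[\W\s]+|AND|NOT|OR", ' ', query_text.strip())
    query = QueryParser("code", WhitespaceAnalyzer()).parse(cleaned)
    hits = searcher.search(query, top_k).scoreDocs
    return [searcher.doc(hit.doc).get("id") for hit in hits]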
def addDoc(w, title, name, value, category, skos_category, all_text, raw_name, raw_value, abstract):
    global batch, cnt_batch
    #print 'title='+title+' category='+category+' skos='+skos_category
    doc = Document()
    doc.add(StringField('title', title, Field.Store.YES))
    doc.add(TextField('name', name, Field.Store.YES))
    doc.add(TextField('value', value, Field.Store.YES))
    doc.add(StoredField('category', category))
    doc.add(StoredField('skos_category', skos_category))
    doc.add(TextField('all_text', all_text, Field.Store.YES))
    doc.add(TextField('raw_name', raw_name, Field.Store.YES))
    doc.add(TextField('raw_value', raw_value, Field.Store.YES))
    doc.add(TextField('abstract', abstract, Field.Store.YES))
    #batch.append(doc)
    #cnt_batch += 1
    #if cnt_batch == 1000:
    #    w.addDocuments(batch)
    #    cnt_batch = 0
    #    del batch[:]
    w.addDocument(doc)
def index_facts(facts, writer):
    for fact in tqdm.tqdm(facts, desc="Indexing facts"):
        doc = Document()
        doc.add(TextField("contents", fact, Field.Store.YES))
        writer.addDocument(doc)
def indexDocs(self, dataFilePath, writer):
    noIndexedString = FieldType()
    noIndexedString.setTokenized(False)
    noIndexedString.setIndexed(False)
    noIndexedString.setStored(True)

    with open(dataFilePath, 'r') as f:
        shopinfo = f.readlines()
    cnt = 0
    validnum = 0
    for script in shopinfo:
        cnt += 1
        print cnt
        script = script.strip().split('\t')
        if len(script) < 7:
            print "data incomplete."
            continue
        try:
            goodname, salenum, price, shopname, url, picturename, comment, historyprice = sentenceModify(script)
            print "adding", goodname
            goodname_s = unicode(goodname, 'utf8')
            seg_list_good = jieba.cut(goodname_s, cut_all=False)
            goodname_s = " ".join(seg_list_good)  # default (accurate) segmentation mode
            shopname_s = unicode(shopname, 'utf8')
            seg_list_shop = jieba.cut(shopname_s, cut_all=False)
            shopname_s = " ".join(seg_list_shop)  # default (accurate) segmentation mode

            shopnameField = Field("shopName", shopname, noIndexedString)
            shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO)
            goodnameField = Field("goodName", goodname, noIndexedString)
            goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO)
            salenumField = IntField("saleNum", salenum, Field.Store.YES)
            priceField = DoubleField("price", price, Field.Store.YES)
            urlField = Field("url", url, noIndexedString)
            pictureField = StringField("pictureName", picturename, Field.Store.YES)
            commentField = Field("comments", comment, noIndexedString)
            historyPriceField = Field("historyPrice", historyprice, noIndexedString)

            doc = Document()
            doc.add(shopnameField)
            doc.add(shopnameField_s)
            doc.add(goodnameField)
            doc.add(goodnameField_s)
            doc.add(salenumField)
            doc.add(priceField)
            doc.add(urlField)
            doc.add(pictureField)
            doc.add(commentField)
            doc.add(historyPriceField)
            writer.addDocument(doc)
            validnum += 1
        except Exception, e:
            print "Failed in indexDocs:", e
def getTextField(self):
    return TextField()
def obj_to_document(obj):
    def conv_to_str(x):
        if isinstance(x, unicode):
            return x.encode('utf8')
        return str(x)

    res = Document()
    tstr = '1'
    if not is_valid_object(obj):
        tstr = '0'
    res.add(StringField(LTPF_TYPE, tstr, Field.Store.NO))
    res.add(StringField('index', conv_to_str(obj.index), Field.Store.YES))
    res.add(StringField('type', obj.__class__.__name__, Field.Store.YES))
    for k, v in vars(obj.data).items():
        if v is None:
            res.add(Field(k, '', Field.Store.YES, Field.Index.NO))
            fieldtype = LT_NONE
        elif isinstance(v, list):
            if len(v) > 0 and isinstance(v[0], int):
                res.add(TextField(k, ' '.join((str(x) for x in set(v))), Field.Store.YES))
                fieldtype = LT_INTLIST
            else:
                res.add(TextField(k, ' '.join(list(set(v))), Field.Store.YES))
                fieldtype = LT_LIST
        elif isinstance(v, str) or isinstance(v, unicode):
            if k == 'author_index':
                res.add(StringField(k, v, Field.Store.YES))
            else:
                res.add(Field(k, v, Field.Store.YES, Field.Index.NO))
                res.add(TextField(k + LTPF_FOR_QUERY,
                                  ' '.join(jieba.lcut_for_search(v)),
                                  Field.Store.NO))
            fieldtype = LT_STRING
        elif isinstance(v, hyper_text):
            res.add(Field(k, v.raw, Field.Store.YES, Field.Index.NO))
            res.add(TextField(k + LTPF_FOR_QUERY,
                              ' '.join(jieba.lcut_for_search(v.text)),
                              Field.Store.NO))
            fieldtype = LT_HYPERTEXT
        elif isinstance(v, bool):
            if v:
                vs = '1'
            else:
                vs = '0'
            res.add(StringField(k, vs, Field.Store.YES))
            fieldtype = LT_BOOL
        elif isinstance(v, int) or isinstance(v, long):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_INT
        elif isinstance(v, float):
            res.add(StringField(k, str(v), Field.Store.YES))
            fieldtype = LT_FLOAT
        else:
            raise Exception('unrecognized data type')
        res.add(Field(k + LTPF_TYPE, fieldtype, Field.Store.YES, Field.Index.NO))
    return res
def add(self, content):
    sent = Document()
    sent.add(TextField("content", content, Field.Store.YES))
    self.writer.addDocument(sent)
def add(self, title, content):
    sent = Document()
    #sent.add(StringField("sid", sid, Field.Store.YES))
    sent.add(StringField("title", title, Field.Store.YES))
    sent.add(TextField("content", content, Field.Store.YES))
    self.writer.addDocument(sent)
def add(self, did, title_en, content):
    doc = Document()
    doc.add(StringField("did", did, Field.Store.YES))
    doc.add(StringField("title_en", title_en, Field.Store.YES))
    doc.add(TextField("content", content, Field.Store.YES))
    self.writer.addDocument(doc)
def add(self, pid, content):
    doc = Document()
    doc.add(StringField("pid", pid, Field.Store.YES))
    doc.add(TextField("content", content, Field.Store.YES))
    self.writer.addDocument(doc)
def main(index_dir, input_dir):
    """Creates a Lucene Index, and indexes every .json file it finds.
    It utilizes a stopwords.txt to filter out stop words"""
    lucene.initVM()

    logger.info("Loading stop words from stopwords.txt")
    f = open('stopwords.txt', 'r')
    stopwords = set([])
    for line in f:
        stopwords.add(line.strip())
    f.close()
    logger.debug('Stop words: %s' % str(stopwords))
    temp = CharArraySet(1, True)
    for stopword in stopwords:
        temp.add(stopword)
    stopwords = temp

    # Create index
    logger.info("Creating Lucene index [%s]..." % index_dir)
    fs_dir = SimpleFSDirectory(Paths.get(index_dir))
    analyzer = StandardAnalyzer(stopwords)
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(fs_dir, writerConfig)
    logger.info("Currently there are %d documents in the index..." % writer.numDocs())

    # Index documents
    onlyfiles = [
        f for f in listdir(input_dir)
        if isfile(join(input_dir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        try:
            journal_code = f.split('.')[0]
            f = join(input_dir, f)
            json_data = open(f)
            data = json.load(json_data)
            for entry in data:
                doc = Document()
                doc.add(StringField("journal", journal_code, Field.Store.YES))
                doc.add(StringField("url", entry['url'], Field.Store.YES))
                doc.add(StringField("date", entry['date'], Field.Store.YES))
                doc.add(TextField("title", entry['title'], Field.Store.YES))
                writer.addDocument(doc)
            json_data.close()
        except IOError as v:
            try:
                (code, message) = v
            except (TypeError, ValueError):
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Indexed lines from stdin (%d documents in index)" % writer.numDocs())

    # Wrap it up
    # logger.info("About to optimize index of %d documents..." % writer.numDocs())
    # writer.optimize()
    # logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    reader = DirectoryReader.open(fs_dir)
    with open('all.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
        for i in range(0, reader.numDocs()):
            doc = reader.document(i)
            csvwriter.writerow([
                doc.get('journal'),
                doc.get('date'),
                doc.get('url'),
                doc.get('title').strip().replace(',', '\,')
            ])
from org.apache.lucene.analysis.standard import StandardAnalyzer

if __name__ == "__main__":
    lucene.initVM()
    path = Paths.get('index')
    indexDir = SimpleFSDirectory(path)
    analyzer = StandardAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from sys.stdin..."

    todo = get_all_rawtext_ids()
    for n, i in enumerate(todo):
        try:
            html = get_rawtext_by_id(i).html
            root = LH.fromstring(html)
            text = root.text_content().strip()
        except:
            #print "Failed to parse doc"
            continue
        doc = Document()
        # print text
        doc.add(TextField("text", text, Field.Store.NO))
        doc.add(StoredField("id", i))
        writer.addDocument(doc)
        if n % 1000 == 0:
            print "Indexed %d files (%d docs in index)" % (n, writer.numDocs())

    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
import sqlite3
import pandas as pd

PATH = ''

if __name__ == "__main__":
    PATH = os.getcwd()
    lucene.initVM()
    indexDir = SimpleFSDirectory(Paths.get('index'))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    print("%d docs in index" % writer.numDocs())
    print("Reading lines from sys.stdin...")

    con = sqlite3.connect(PATH + '/imdb.db')
    df = pd.read_sql('select * from movies', con)
    con.close()

    for v in df.values:
        doc = Document()
        doc.add(StringField("id", str(v[0]), Field.Store.YES))
        doc.add(TextField("name", v[1], Field.Store.YES))
        doc.add(StringField("year", str(v[2]), Field.Store.YES))
        writer.addDocument(doc)

    print("Indexed %d lines from stdin (%d docs in index)" % (df.shape[0], writer.numDocs()))
    print("Closing index of %d docs..." % writer.numDocs())
    writer.close()