def indexFile(cls, writer, path, baseDir):
    """Index the properties file at `path` as one document added to `writer`.

    The file is read as ``name=value`` lines up to the first blank line or
    the end of the file; every value is decoded with the ``unicode-escape``
    codec.  The keys ``isbn``, ``title``, ``author``, ``url``, ``subject``
    and ``pubmonth`` are required.

    Args:
        cls: not used by this body.
        writer: object whose ``addDocument`` receives the finished document.
        path: path of the properties file; also stored in the ``path`` field.
        baseDir: its length is cut off the front of the directory of `path`
            to obtain the category.

    Raises:
        KeyError: if one of the required keys is missing from the file.
        ValueError: if a line contains no ``=`` or ``pubmonth`` is not an
            integer literal.
    """
    props = {}
    handle = open(path)
    try:
        # try/finally: release the handle even if a line fails to parse
        for rawLine in handle:
            line = rawLine.strip()
            if not line:
                # a blank line ends the property list
                break
            name, value = line.split("=", 1)
            props[name] = value.decode("unicode-escape")
    finally:
        handle.close()

    doc = Document()

    # category comes from relative path below the base directory
    category = os.path.dirname(path)[len(baseDir):]
    if os.path.sep != "/":
        category = category.replace(os.path.sep, "/")

    isbn = props["isbn"]
    title = props["title"]
    author = props["author"]
    url = props["url"]
    subject = props["subject"]
    pubmonth = props["pubmonth"]

    # single-argument print(...) behaves the same as the print statement
    print(title.encode("utf8"))
    print(author.encode("utf-8"))
    print(subject.encode("utf-8"))
    print(category.encode("utf-8"))
    print("---------")

    doc.add(Field("isbn", isbn, Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("category", category, Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(
        Field(
            "title",
            title,
            Field.Store.YES,
            Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS,
        )
    )
    # lower-cased copy of the title, indexed as a single un-analyzed value
    doc.add(
        Field(
            "title2",
            title.lower(),
            Field.Store.YES,
            Field.Index.NOT_ANALYZED_NO_NORMS,
            Field.TermVector.WITH_POSITIONS_OFFSETS,
        )
    )

    # split multiple authors into unique field instances
    for a in author.split(","):
        doc.add(
            Field(
                "author",
                a,
                Field.Store.YES,
                Field.Index.NOT_ANALYZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS,
            )
        )

    doc.add(Field("url", url, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS))
    doc.add(
        Field(
            "subject",
            subject,
            Field.Store.NO,
            Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS,
        )
    )

    doc.add(NumericField("pubmonth", Field.Store.YES, True).setIntValue(int(pubmonth)))

    # the same publication date again, as a whole number of days:
    # getTime() divided by the milliseconds in one day, truncated by int()
    d = DateTools.stringToDate(pubmonth)
    d = int(d.getTime() / (1000 * 3600 * 24.0))
    doc.add(NumericField("pubmonthAsDay").setIntValue(d))

    # catch-all field built from title, subject, author and category
    doc.add(
        Field(
            "contents",
            " ".join([title, subject, author, category]),
            Field.Store.NO,
            Field.Index.ANALYZED,
            Field.TermVector.WITH_POSITIONS_OFFSETS,
        )
    )

    doc.add(Field("path", path, Field.Store.YES, Field.Index.NOT_ANALYZED))
    # samplesModified is defined outside this function
    doc.add(
        Field(
            "modified",
            DateField.dateToString(samplesModified),
            Field.Store.YES,
            Field.Index.NOT_ANALYZED,
        )
    )

    writer.addDocument(doc)
def indexFile(cls, writer, path, baseDir):
    """Read the properties file at `path` and add a document for it to `writer`.

    Lines of the form ``name=value`` are consumed until the first blank
    line or the end of the file, and each value is decoded with the
    ``unicode-escape`` codec.  The file must define ``isbn``, ``title``,
    ``author``, ``url``, ``subject`` and ``pubmonth``.

    Args:
        cls: not used by this body.
        writer: object whose ``addDocument`` is called with the new document.
        path: path of the properties file; also stored in the ``path`` field.
        baseDir: its length is sliced off the front of the directory of
            `path` to form the category.

    Raises:
        KeyError: if a required key is absent.
        ValueError: if a line has no ``=`` or ``pubmonth`` cannot be
            converted by ``int``.
    """
    props = {}
    propsFile = open(path)
    try:
        # the finally clause closes the file even when parsing raises
        for rawLine in propsFile:
            line = rawLine.strip()
            if not line:
                # stop at the first blank line
                break
            name, value = line.split('=', 1)
            props[name] = value.decode('unicode-escape')
    finally:
        propsFile.close()

    doc = Document()

    # category comes from relative path below the base directory
    category = os.path.dirname(path)[len(baseDir):]
    if os.path.sep != '/':
        category = category.replace(os.path.sep, '/')

    isbn = props['isbn']
    title = props['title']
    author = props['author']
    url = props['url']
    subject = props['subject']
    pubmonth = props['pubmonth']

    # single-argument print(...) behaves the same as the print statement
    print(title.encode('utf8'))
    print(author.encode('utf-8'))
    print(subject.encode('utf-8'))
    print(category.encode('utf-8'))
    print("---------")

    doc.add(Field("isbn", isbn,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("category", category,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("title", title,
                  Field.Store.YES, Field.Index.ANALYZED,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))
    # lower-cased title kept as one un-analyzed value
    doc.add(Field("title2", title.lower(),
                  Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))

    # split multiple authors into unique field instances
    for a in author.split(','):
        doc.add(Field("author", a,
                      Field.Store.YES, Field.Index.NOT_ANALYZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))

    doc.add(Field("url", url,
                  Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS))
    doc.add(Field("subject", subject,
                  Field.Store.NO, Field.Index.ANALYZED,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))

    doc.add(NumericField("pubmonth",
                         Field.Store.YES, True).setIntValue(int(pubmonth)))

    # second numeric form of the date: getTime() divided by the number of
    # milliseconds in a day, truncated to an int
    d = DateTools.stringToDate(pubmonth)
    d = int(d.getTime() / (1000 * 3600 * 24.0))
    doc.add(NumericField("pubmonthAsDay").setIntValue(d))

    # catch-all field joining title, subject, author and category
    doc.add(Field("contents", ' '.join([title, subject, author, category]),
                  Field.Store.NO, Field.Index.ANALYZED,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))

    doc.add(Field("path", path,
                  Field.Store.YES, Field.Index.NOT_ANALYZED))
    # samplesModified is defined outside this function
    doc.add(Field("modified", DateField.dateToString(samplesModified),
                  Field.Store.YES, Field.Index.NOT_ANALYZED))

    writer.addDocument(doc)
def indexFile(cls, writer, path, baseDir):
    """Turn the properties file at `path` into a document and add it to `writer`.

    ``name=value`` lines are read until the first blank line or the end of
    the file; values are decoded with the ``unicode-escape`` codec.  The
    keys ``isbn``, ``title``, ``author``, ``url``, ``subject`` and
    ``pubmonth`` are required.

    Args:
        cls: not used by this body.
        writer: object whose ``addDocument`` receives the finished document.
        path: path of the properties file; also stored in the ``path`` field.
        baseDir: its length is removed from the front of the directory of
            `path` to obtain the category.

    Raises:
        KeyError: if one of the required keys is missing.
        ValueError: if a line contains no ``=`` or ``pubmonth`` is not an
            integer literal.

    NOTE(review): the FieldTypes below enable term vector positions and
    offsets but never call ``setStoreTermVectors(True)``; confirm against
    the Lucene version in use that this has the intended effect.
    """
    props = {}
    handle = open(path)
    try:
        # try/finally: the file is closed even if a line fails to parse
        for rawLine in handle:
            line = rawLine.strip()
            if not line:
                # a blank line ends the property list
                break
            name, value = line.split('=', 1)
            props[name] = value.decode('unicode-escape')
    finally:
        handle.close()

    doc = Document()

    # category comes from relative path below the base directory
    category = os.path.dirname(path)[len(baseDir):]
    if os.path.sep != '/':
        category = category.replace(os.path.sep, '/')

    isbn = props['isbn']
    title = props['title']
    author = props['author']
    url = props['url']
    subject = props['subject']
    pubmonth = props['pubmonth']

    # single-argument print(...) behaves the same as the print statement
    print(title.encode('utf8'))
    print(author.encode('utf-8'))
    print(subject.encode('utf-8'))
    print(category.encode('utf-8'))
    print("---------")

    doc.add(Field("isbn", isbn, StringField.TYPE_STORED))
    doc.add(Field("category", category, StringField.TYPE_STORED))

    # note: the field types below should ideally be initialized once and
    # re-used across calls rather than rebuilt for every file

    # title: tokenized and stored
    titleType = FieldType()
    titleType.setIndexed(True)
    titleType.setTokenized(True)
    titleType.setStored(True)
    titleType.setStoreTermVectorPositions(True)
    titleType.setStoreTermVectorOffsets(True)
    titleType.freeze()
    doc.add(Field("title", title, titleType))

    # title2: lower-cased title, starting from a copy of TYPE_STORED,
    # not tokenized, norms omitted
    title2Type = FieldType(StringField.TYPE_STORED)
    title2Type.setIndexed(True)
    title2Type.setTokenized(False)
    title2Type.setOmitNorms(True)
    title2Type.setStoreTermVectorPositions(True)
    title2Type.setStoreTermVectorOffsets(True)
    doc.add(Field("title2", title.lower(), title2Type))

    # split multiple authors into unique field instances
    authorType = FieldType()
    authorType.setIndexed(True)
    authorType.setTokenized(False)
    authorType.setStored(True)
    authorType.setStoreTermVectorPositions(True)
    authorType.setStoreTermVectorOffsets(True)
    for a in author.split(','):
        doc.add(Field("author", a, authorType))

    # url: stored, not tokenized, norms omitted
    urlType = FieldType()
    urlType.setIndexed(True)
    urlType.setTokenized(False)
    urlType.setStored(True)
    urlType.setOmitNorms(True)
    doc.add(Field("url", url, urlType))

    # subject and contents use identical settings (tokenized, not stored),
    # so a single FieldType serves both fields
    textType = FieldType()
    textType.setIndexed(True)
    textType.setTokenized(True)
    textType.setStored(False)
    textType.setStoreTermVectorPositions(True)
    textType.setStoreTermVectorOffsets(True)
    doc.add(Field("subject", subject, textType))

    doc.add(IntField("pubmonth", int(pubmonth), Field.Store.YES))

    # the same publication date as a whole number of days: getTime()
    # divided by the milliseconds in one day, truncated by int()
    d = DateTools.stringToDate(pubmonth)
    d = int(d.getTime() / (1000 * 3600 * 24.0))
    doc.add(IntField("pubmonthAsDay", d, IntField.TYPE_NOT_STORED))

    # catch-all field built from title, subject, author and category
    doc.add(Field("contents",
                  ' '.join([title, subject, author, category]),
                  textType))

    doc.add(Field("path", path, StringField.TYPE_STORED))
    # samplesModified is defined outside this function
    doc.add(Field("modified",
                  DateTools.dateToString(samplesModified,
                                         DateTools.Resolution.MILLISECOND),
                  StringField.TYPE_STORED))

    writer.addDocument(doc)