def index_docs(self, tweets, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    t1.setStoreTermVectors(True)
    t1.setStoreTermVectorOffsets(True)

    # add each tweet to the index
    for tweet in tweets:
        try:
            # strip out URLs because they provide false index matches
            contents = []
            for word in tweet[1].text.split():
                if word.startswith("http://") or word.startswith("https://"):
                    continue
                contents.append(word)
            contents = " ".join(contents)
            if len(contents) == 0:
                continue
            doc = Document()
            doc.add(Field("contents", contents, t1))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in index_docs:", e
def index_docs(self, train_set, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPayloads(True)
    t2.setStoreTermVectorPositions(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for ii in train_set:
        doc = Document()
        doc.add(Field("answer", ii['Answer'], t1))
        doc.add(Field("qid", ii['Question ID'], t1))
        doc.add(Field("category", ii['category'], t1))
        doc.add(Field("position", ii['Sentence Position'], t1))
        doc.add(Field("question", ii['Question Text'], t2))
        doc.add(Field("wiki_plain", self.wiki_reader.get_text(ii['Answer']), t2))
        writer.addDocument(doc)
def indexDocs(self, root, iw):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for filename in os.listdir(root):
        if not filename.endswith(".txt"):
            print("file is not a txt file. we skip it.")
            continue
        print("adding", filename)
        path = os.path.join(root, filename)
        self.parseBook(path, t1, t2, iw)

    # Prints a set of statistics displaying missing data
    # AuthorError = number of authors not found
    # TitleError = number of titles not found
    # DocumentError = number of documents where text could not be extracted so entire document was indexed
    print("AuthorError: {}".format(self.authorcount))
    print("TitleError: {}".format(self.titlecount))
    print("DocumentError: {}".format(self.errorcount))
    iw.close()
def index(self, root):
    t = FieldType()
    t.setIndexed(True)
    t.setStored(True)
    t.setTokenized(True)
    t.setStoreTermVectors(True)

    for path, dirs, files in os.walk(root):
        for file in files:
            filePath = os.path.join(path, file)
            fd = open(filePath)
            content = unicode(fd.read(), 'iso-8859-1')
            fd.close()

            doc = Document()
            doc.add(Field('name', file, StringField.TYPE_STORED))
            parent = os.path.split(path)[1]
            doc.add(Field('parent', parent, StringField.TYPE_STORED))
            if len(content) > 0:
                doc.add(Field('content', content, t))

            print 'Indexing %s' % file
            self.mWriter.addDocument(doc)

    self.mWriter.commit()
    self.mWriter.close()
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setStoreTermVectors(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPositions(True)
    t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    file_path = root + 'r52-train-all-terms.txt'
    fd = open(file_path)
    contents = fd.readlines()
    fd.close()
    contents_list = [x.strip() for x in contents]

    for i in xrange(len(contents_list)):
        try:
            [topic, content] = contents_list[i].split('\t')
            doc = Document()
            doc.add(Field("id", str(i), t1))
            doc.add(Field("topic", topic, t1))
            doc.add(Field("contents", content, t2))
            writer.addDocument(doc)
        except Exception, e:
            print "Failed in indexDocs:", e
def create_minidoc(termstring, field='text'):
    # To store term vectors (used for query expansion) we have to use a custom fieldtype
    customfield = FieldType()
    customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    customfield.setStored(True)
    customfield.setTokenized(True)
    customfield.setStoreTermVectors(True)

    doc = Document()
    doc.add(Field(field, termstring, customfield))
    return doc
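# A minimal usage sketch for create_minidoc(), not part of the original helper: the
# RAMDirectory/StandardAnalyzer writer below is an assumption (and lucene.initVM() is
# assumed to have been called already); it only shows mini-documents being added so
# their term vectors can later be pulled back for query expansion.
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import RAMDirectory

directory = RAMDirectory()
writer = IndexWriter(directory, IndexWriterConfig(StandardAnalyzer()))
for termstring in ["neural ranking", "query expansion terms"]:   # illustrative term strings
    writer.addDocument(create_minidoc(termstring))               # one mini-document each
writer.commit()
writer.close()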
def main():
    INDEX_DIR = "full_index1"
    DOCUMENTS_DIR = "/media/joseph/Windows8_OS/Users/Joseph/AppData/Local/lxss/home/jwymbs23/data_science_projects/french_pamphlets/frc-data-master/OCR_text/"

    # Initialize lucene and JVM
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print("lucene version is:", lucene.VERSION, '\n')

    store = getStore(INDEX_DIR)
    analyzer = getAnalyzer()
    writer = getWriter(store=store, analyzer=analyzer, create=True)

    # get list of documents
    doc_list = getDoclist(DOCUMENTS_DIR)

    ftype = FieldType()
    ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    ftype.setTokenized(True)
    ftype.setStoreTermVectors(True)
    ftype.freeze()

    for cd, doc_name in enumerate(doc_list):
        if not cd % 1000:
            print(cd, '--', len(doc_list))
        with open(doc_name, 'r') as d:
            doc_lines = d.readlines()
        full_text = ''.join([i.strip() for i in doc_lines]).lower()
        try:
            # create a document that will be added to the index
            doc = Document()
            # Add fields to this document
            # could process fname here instead of in the dataframe later
            doc.add(Field("identifier", doc_name.split('/')[-1], TextField.TYPE_STORED))
            doc.add(Field("vectext", full_text, ftype))
            doc.add(Field("text", full_text, TextField.TYPE_STORED))
            # Add the document to the index
            writer.addDocument(doc)
        except:
            print("Failed in indexDocs: ", doc_name)
    #writer.optimize()
    writer.commit()
def setUp(self):
    super(Test_Bug1842, self).setUp()
    self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    w1 = self.getWriter(analyzer=self.analyzer)
    doc1 = Document()
    ftype = FieldType()
    ftype.setStored(False)
    ftype.setIndexed(True)
    ftype.setStoreTermVectors(True)
    doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
    doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
    w1.addDocument(doc1)
    w1.close()
def indexDocs(self, root, writer):
    t1 = FieldType()
    t1.setIndexed(True)
    t1.setStored(True)
    t1.setTokenized(False)
    t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(True)
    t2.setTokenized(True)
    t2.setStoreTermVectorOffsets(True)
    t2.setStoreTermVectorPayloads(True)
    t2.setStoreTermVectorPositions(True)
    t2.setStoreTermVectors(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    for root, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if not filename.endswith('.json'):
                continue
            print "adding", filename
            try:
                path = os.path.join(root, filename)
                file = open(path)
                contents = json.load(file)
                file.close()

                doc = Document()
                doc.add(Field("title", contents['title'], t2))
                doc.add(Field("path", root, t1))
                doc.add(Field("playStoreURL", contents['playStoreURL'], t1))
                doc.add(Field("creator", contents['creator'], t1))

                extendedInfo = contents['extendedInfo']
                if len(extendedInfo['description']) > 0:
                    doc.add(Field("description", extendedInfo['description'], t2))
                else:
                    print "warning: no description in %s" % filename

                writer.addDocument(doc)
            except Exception, e:
                print "Failed in indexDocs:", e
def setUp(self):
    super(Test_Bug1842, self).setUp()
    self.analyzer = StandardAnalyzer()

    w1 = self.getWriter(analyzer=self.analyzer)
    doc1 = Document()
    ftype = FieldType()
    ftype.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    ftype.setTokenized(True)
    ftype.setStoreTermVectors(True)
    ftype.freeze()
    doc1.add(Field("all", "blah blah blah Gesundheit", ftype))
    doc1.add(Field('id', '1', StringField.TYPE_NOT_STORED))
    w1.addDocument(doc1)
    w1.close()
def index_docs(self, input_documents):
    for document in tqdm(input_documents, total=len(input_documents)):
        doc = Document()
        doc.add(StringField(".I", document[".I"].lower(), Field.Store.YES))
        doc.add(StringField(".U", document[".U"].lower(), Field.Store.YES))

        type = FieldType()
        type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
        type.setStored(True)
        type.setStoreTermVectors(True)
        type.setTokenized(True)

        if ".W" in document and ".M" in document:
            doc.add(Field("text",
                          " ".join(tokenizer.tokenize(document[".M"].lower() + " " +
                                                      document[".T"].lower() +
                                                      document[".W"].lower())),
                          type))
        elif ".M" in document and ".W" not in document:
            doc.add(Field("text",
                          " ".join(tokenizer.tokenize(document[".M"].lower() + " " +
                                                      document[".T"].lower())),
                          type))
        elif ".M" not in document and ".W" in document:
            doc.add(Field("text",
                          " ".join(tokenizer.tokenize(document[".T"].lower() +
                                                      document[".W"].lower())),
                          type))
        elif ".M" not in document and ".W" not in document:
            doc.add(Field("text",
                          " ".join(tokenizer.tokenize(document[".T"].lower())),
                          type))

        if self.writer.getConfig().getOpenMode() == IndexWriterConfig.OpenMode.CREATE:
            self.writer.addDocument(doc)
        else:
            self.writer.updateDocument(Term(".U", document[".U"]), doc)

    self.writer.close()
class LuceneDocumentField(object):
    """Internal handler class for possible field types"""

    def __init__(self):
        """Init possible field types"""

        # FIELD_ID: stored, indexed, non-tokenized
        self.field_id = FieldType()
        self.field_id.setIndexed(True)
        self.field_id.setStored(True)
        self.field_id.setTokenized(False)

        # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
        # for storing IDs with term vector info
        self.field_id_tv = FieldType()
        self.field_id_tv.setIndexed(True)
        self.field_id_tv.setStored(True)
        self.field_id_tv.setTokenized(False)
        self.field_id_tv.setStoreTermVectors(True)

        # FIELD_TEXT: stored, indexed, tokenized, with positions
        self.field_text = FieldType()
        self.field_text.setIndexed(True)
        self.field_text.setStored(True)
        self.field_text.setTokenized(True)

        # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_tv = FieldType()
        self.field_text_tv.setIndexed(True)
        self.field_text_tv.setStored(True)
        self.field_text_tv.setTokenized(True)
        self.field_text_tv.setStoreTermVectors(True)

        # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_tvp = FieldType()
        self.field_text_tvp.setIndexed(True)
        self.field_text_tvp.setStored(True)
        self.field_text_tvp.setTokenized(True)
        self.field_text_tvp.setStoreTermVectors(True)
        self.field_text_tvp.setStoreTermVectorPositions(True)

    def get_field(self, type):
        """Get Lucene FieldType object for the corresponding internal FIELDTYPE_ value"""
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        elif type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        elif type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        elif type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        elif type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        else:
            raise Exception("Unknown field type")
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import \
    IndexWriterConfig, IndexWriter, DirectoryReader, IndexOptions

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    directory = RAMDirectory()
    iconfig = IndexWriterConfig(LimitTokenCountAnalyzer(StandardAnalyzer(), 100))
    iwriter = IndexWriter(directory, iconfig)

    ft = FieldType()
    ft.setStored(True)
    ft.setTokenized(True)
    ft.setStoreTermVectors(True)
    ft.setStoreTermVectorOffsets(True)
    ft.setStoreTermVectorPositions(True)
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

    ts = ["this bernhard is the text to be index text",
          "this claudia is the text to be indexed"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t, ft))
        iwriter.addDocument(doc)
    iwriter.commit()
    iwriter.close()

    ireader = DirectoryReader.open(directory)
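    # A minimal continuation sketch (not part of the original script) showing how the
    # term vectors stored above could be read back through the ireader opened on the
    # last line; document 0 and the "fieldname" field are taken from the code above.
    terms = ireader.getTermVector(0, "fieldname")        # Terms object for the first document
    termsenum = terms.iterator()
    for term in BytesRefIterator.cast_(termsenum):       # walk every term in the vector
        print(term.utf8ToString(), termsenum.totalTermFreq())   # term and its in-document frequency
    ireader.close()
    directory.close()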
# Open DB
import sqlite3
conn = sqlite3.connect(args.db)
c = conn.cursor()

# Field types
t1 = FieldType()
t1.setStored(True)
t1.setTokenized(False)
t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

t2 = FieldType()
t2.setStored(True)
t2.setTokenized(True)
t2.setStoreTermVectors(True)
t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

t3 = FieldType()
t3.setStored(True)
t3.setTokenized(False)
t3.setIndexOptions(IndexOptions.DOCS)

# Fields to index (comments)
fields = {
    'id': t1,
    'subreddit': t1,
    'subreddit_id': t1,
    'ups': t3,
    'author': t1,
    'name': t1,
class Indexer: """ Indexer Class """ (NAME, CONTENT, DATE, URL, TAGS, TIMESTAMP) = ("name", "content", "date", "url", "tags", "timestamp") def __init__(self, indexDir="", debug=False, verbose=False): """ :Parameters: - `indexDir`: Path where the Index will be saved. (Str) - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean) - `verbose`: Provide additional information about the initialization process. (Boolean) """ self.__verbose = verbose if indexDir != "": INDEX_DIR = indexDir else: INDEX_DIR = os.path.dirname( os.path.realpath(__file__)) + "/luceneIndex" if not os.path.exists(INDEX_DIR): os.makedirs(INDEX_DIR) self.__boAppend = False else: self.__boAppend = True # Initialize lucene and JVM lucene.initVM() # Get index storage if debug: # Store the index in memory self.__indexDir = RAMDirectory() self.__boAppend = False INDEX_DIR = "RAM Memory" else: # Store an index on disk self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR)) # Create Content FieldType self.__contentType = FieldType() self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS) self.__contentType.setTokenized(True) self.__contentType.setStored(True) self.__contentType.setStoreTermVectors(True) self.__contentType.setStoreTermVectorPositions(True) self.__contentType.freeze() # Get the Analyzer self.__analyzer = StandardAnalyzer( StandardAnalyzer.ENGLISH_STOP_WORDS_SET) # Print Indexer Information print("Lucene version is: ", lucene.VERSION) print("Index Directory: ", INDEX_DIR) def __del__(self): self.__indexDir.close() ################################################## #Private Methods ################################################## @staticmethod def __getTimestamp(dateTime): """ Converts the document's date to an integer timestamp :Parameters: - `dateTime`: Document's date (Str) :Returns: - Date timestamp (Int) """ tm = time.strptime(dateTime, '%Y-%m-%dT%H:%M:%SZ') sTime = "{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}{5:0>2}".format( tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec) return int(sTime) @staticmethod def __getDateTime(timeStamp): """ Converts the document's timestamp to date :Parameters: - `timeStamp`: Document's timestamp :Returns: - Date (Str) """ date = datetime.datetime(year=int(timeStamp[0:4]), month=int(timeStamp[4:6]), day=int(timeStamp[6:8]), hour=int(timeStamp[8:10]), minute=int(timeStamp[10:12]), second=int(timeStamp[12:14])) return date.strftime('%Y-%m-%d %H:%M:%S') @staticmethod def __qualifyTags(tags): """ Creates the qualify string for tags :Parameters: - `tags`: List of document's tags :Return: - Qualify Tags (Str) """ sTags = "" for tag in tags: sTags += tag + '|' return sTags[:-1] @staticmethod def __scatterMatrix(numDocs, freqMtx): print("Scattering Frequency Matrix...") pB = ProgressBar(len(freqMtx), prefix='Progress:') matrix = [] innerMatrix = ['Term'] #Generate Document Columns for docIdx in range(numDocs): innerMatrix.append("D{0:0>4}".format(docIdx)) matrix.append(innerMatrix) #Generate Word Rows and Columns for word in sorted(freqMtx): innerMatrix = [] innerMatrix.append(word) for docIdx in range(numDocs): try: termCount = round(freqMtx[word][str(docIdx)], 3) innerMatrix.append(termCount) except KeyError: innerMatrix.append(0) matrix.append(innerMatrix) pB.updateProgress() return matrix @staticmethod def __saveMatrix(numDocs, freqMtx): pathMatrix = os.path.dirname( os.path.realpath(__file__)) + "/freqMtx.txt" fMatrix = open(pathMatrix, 'w') print("Saving Frequency Matrix File: ", pathMatrix) pB = ProgressBar(len(freqMtx), 
prefix='Progress:') # File Generation Start print("+========= Frequency Matrix =========+", file=fMatrix) print("%20s" % (' '), end=' ', file=fMatrix) for docIdx in range(numDocs): print("D{0:0>4}".format(docIdx), end=' ', file=fMatrix) print(file=fMatrix) for word in sorted(freqMtx): print("%20s" % (word), end=' ', file=fMatrix) for docIdx in range(numDocs): try: termCount = freqMtx[word][str(docIdx)] print("%02.03f" % (termCount), end=' ', file=fMatrix) except KeyError: print(" 0 ", end=' ', file=fMatrix) print(file=fMatrix) pB.updateProgress() # Close File fMatrix.close() def __stemString(self, stringToStem): stemmedTerms = [] tknStream = self.__analyzer.tokenStream('STEM', stringToStem) stemmed = SnowballFilter(tknStream, "English") stemmed.reset() while stemmed.incrementToken(): stemmedTerms.append( stemmed.getAttribute(CharTermAttribute.class_).toString()) tknStream.close() return stemmedTerms @staticmethod def __normalize(qVector, freqMtx): for term in qVector: for docId in freqMtx: if (term in freqMtx[docId]) and (freqMtx[docId][term] > qVector[term]): qVector[term] = freqMtx[docId][term] @staticmethod def __dotProduct(aVector, bVector): """ Calculate Dot Product :Parameters: - `aVector`: A Vector. (Dict) - `bVector`: B Vector. (Dict) :Returns: - Dot Product. (Int) """ dotProduct = 0 for term in aVector: if term in bVector: product = aVector[term] * bVector[term] dotProduct += product return dotProduct @staticmethod def __magnitude(vector): """ Calculate Dot Product :Parameters: - `vector`: Query Vector. (Dict) :Returns: - Vector Magnitude. (Int) """ # Magnitude of the vector is the square root of the dot product of the vector with itself. vectorMagnitude = Indexer.__dotProduct(vector, vector) vectorMagnitude = math.sqrt(vectorMagnitude) return vectorMagnitude ################################################## #Public Methods ################################################## def IndexDocs(self, documents): """ Index documents under the directory :Parameters: - `documents`: Documents to be indexed (List) """ # Get the Writer Configuration writerConfig = IndexWriterConfig(self.__analyzer) # Get index writer writer = IndexWriter(self.__indexDir, writerConfig) for document in documents: # Create a document that would we added to the index doc = Document() # Add a field to this document doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES)) doc.add( Field(Indexer.CONTENT, document['content'], self.__contentType)) doc.add( StringField(Indexer.DATE, document['date'], Field.Store.YES)) doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES)) doc.add( TextField(Indexer.TAGS, self.__qualifyTags(document['tags']), Field.Store.YES)) doc.add( LongPoint(Indexer.TIMESTAMP, self.__getTimestamp(document['date']))) # Add or update the document to the index if not self.__boAppend: # New index, so we just add the document (no old document can be there): if self.__verbose: print("Adding " + document['name']) writer.addDocument(doc) else: # Existing index (an old copy of this document may have been indexed) so # we use updateDocument instead to replace the old one matching the exact # path, if present: if self.__verbose: print("Updating " + document['name']) writer.updateDocument(Term(Indexer.NAME, document['name']), doc) # Print index information and close writer print("Indexed %d documents (%d docs in index)" % (len(documents), writer.numDocs())) writer.close() def Search(self, query, field=NAME, maxResult=1000): """ Search for a document into the Lucene's Index :Parameters: - 
`query`: Request to be made to the Index (Str). - `field`: Field to be consulted by the query (NAME, CONTENT, DATE, URL, TAGS). - `maxResult`: Maximum number of results. """ # Get the Index Directory reader = DirectoryReader.open(self.__indexDir) searcher = IndexSearcher(reader) # Create a query queryParser = QueryParser(field, self.__analyzer).parse(query) # Do a search hits = searcher.search(queryParser, maxResult) print("Found %d document(s) that matched query '%s':" % (hits.totalHits, queryParser)) for hit in hits.scoreDocs: doc = searcher.doc(hit.doc) print("Document Nº: %d - Score: %.5f" % (hit.doc, hit.score)) print("Name: " + doc.get('name')) print("Tags: " + doc.get('tags') + "\n") reader.close() def StemDocument(self, docIdx): """ Return an array of the document's stemmed terms :Parameters: - `docIdx`: Document's index ID (Int). """ reader = DirectoryReader.open(self.__indexDir) doc = reader.document(docIdx).get(Indexer.CONTENT) reader.close() return self.__stemString(doc) def FreqMatrix(self, scattered=False, byTerms=True, saveMtx=False): """ Generates a Frequency Matrix of the current Index :Parameters: - `saveMtx`: Save the Frequency Matrix to a .txt file. (Boolean) """ freqMtx = {} # Terms - DocumentID Matrix reader = DirectoryReader.open(self.__indexDir) numDocs = reader.numDocs() print("Generating Frequency Matrix...") pB = ProgressBar(numDocs - 1, prefix='Progress:') for docIdx in range(numDocs): termItr = self.StemDocument(docIdx) termSize = len(termItr) docStr = '{0}'.format(docIdx) termDict = {} for termText in termItr: if byTerms: # Check if the term exists if termText in freqMtx: # Check if the document exists if docStr in freqMtx[termText]: termCount = int( math.ceil( ((freqMtx[termText][docStr] * termSize) / 100))) freqMtx[termText].update( {docStr: ((termCount + 1) / termSize) * 100}) else: freqMtx[termText].update( {docStr: (1 / termSize) * 100}) else: termIdx = {termText: {docStr: (1 / termSize) * 100}} freqMtx.update(termIdx) else: # Check if the term exists termText = termText.replace('.', '_') if termText in termDict: termCount = int( math.ceil((termDict[termText] * termSize) / 100)) termDict[termText] = ((termCount + 1) / termSize) * 100 else: termIdx = {termText: (1 / termSize) * 100} termDict.update(termIdx) if not byTerms: freqMtx.update({docStr: termDict}) pB.updateProgress() if saveMtx and byTerms: self.__saveMatrix(numDocs, freqMtx) if scattered and byTerms: freqMtx = self.__scatterMatrix(numDocs, freqMtx) # Close IndexReader reader.close() return freqMtx def GetSimilarity(self, query, freqMtx): """ Cosine Similarity """ qVector = {} qList = self.__stemString(query) for stem in qList: qVector.update({stem: 0}) self.__normalize(qVector, freqMtx) qList = [] #Get similarity between query and doc[n] for docIdx, dVector in freqMtx.items(): dP = self.__dotProduct(qVector, dVector) qM = self.__magnitude(qVector) dM = self.__magnitude(dVector) cosSimilarity = dP / (qM * dM) qList.append((docIdx, cosSimilarity)) return sorted(qList, key=lambda similarity: similarity[1], reverse=True) def AnalyzeDocument(self, docIdx): """ Generates a list of (entity, relation, entity) tuples as its output. :Parameters: - `docIdx`: Document's index ID (Int). 
""" gpeList = {} geolocator = Geocode() reader = DirectoryReader.open(self.__indexDir) doc = reader.document(docIdx) # Load NLTK Data nltkPath = os.path.dirname( os.path.realpath(__file__)) + '/../tools/nltk_data' nltk.data.path.append(nltkPath) # Named Entity Recognition content = doc.get(Indexer.CONTENT) sentences = nltk.sent_tokenize(content) #ProgressBar print("Analazing Document {0}".format(docIdx)) pB = ProgressBar(len(sentences), prefix='Progress:') # Loop over each sentence and tokenize it separately for sentence in sentences: ner = nltk.word_tokenize(sentence) ner = nltk.pos_tag(ner) ner = nltk.ne_chunk(ner) # Get all the Geo-Political Entities for subtrees in list( ner.subtrees( filter=lambda subtree: subtree.label() == 'GPE')): entityName = ' '.join([child[0] for child in subtrees]) if entityName not in gpeList: location = geolocator.GetGPE(entityName) if location: gpeList.update(location) pB.updateProgress() gpeList = geolocator.GetFeatureCollection(gpeList) return gpeList def GetDocField(self, docIdx, field=CONTENT): """ Get the document's field :Parameters: - `docIdx`: Document's index ID (Int). - `field`: Field to retrieve (Str). :Returns: - Document's field. (Str) """ reader = DirectoryReader.open(self.__indexDir) doc = reader.document(docIdx) content = doc.get(field) reader.close() return content
class LuceneDocumentField(object):
    """Internal handler class for possible field types."""

    def __init__(self):
        """Init possible field types.

        DOCS
            Only documents are indexed: term frequencies and positions are omitted.
        DOCS_AND_FREQS
            Only documents and term frequencies are indexed: positions are omitted.
        DOCS_AND_FREQS_AND_POSITIONS
            Indexes documents, frequencies and positions.
        DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
            Indexes documents, frequencies, positions and offsets.
        NONE
            Not indexed.
        """
        # FIELD_ID: stored, indexed, non-tokenized
        self.field_id = FieldType()
        self.field_id.setIndexOptions(IndexOptions.DOCS)
        self.field_id.setStored(True)
        self.field_id.setTokenized(False)

        # FIELD_ID_TV: stored, indexed, not tokenized, with term vectors (without positions)
        # for storing IDs with term vector info
        self.field_id_tv = FieldType()
        self.field_id_tv.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.field_id_tv.setStored(True)
        self.field_id_tv.setTokenized(False)
        self.field_id_tv.setStoreTermVectors(True)

        # FIELD_TEXT: stored, indexed, tokenized, with positions
        self.field_text = FieldType()
        self.field_text.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.field_text.setStored(True)
        self.field_text.setTokenized(True)

        # FIELD_TEXT_TV: stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_tv = FieldType()
        self.field_text_tv.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.field_text_tv.setStored(True)
        self.field_text_tv.setTokenized(True)
        self.field_text_tv.setStoreTermVectors(True)

        # FIELD_TEXT_TVP: stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_tvp = FieldType()
        self.field_text_tvp.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.field_text_tvp.setStored(True)
        self.field_text_tvp.setTokenized(True)
        self.field_text_tvp.setStoreTermVectors(True)
        self.field_text_tvp.setStoreTermVectorPositions(True)

        # FIELD_TEXT_NTV: not stored, indexed, tokenized, with term vectors (without positions)
        self.field_text_ntv = FieldType()
        self.field_text_ntv.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.field_text_ntv.setStored(False)
        self.field_text_ntv.setTokenized(True)
        self.field_text_ntv.setStoreTermVectors(True)

        # FIELD_TEXT_NTVP: not stored, indexed, tokenized, with term vectors and positions
        # (but no character offsets)
        self.field_text_ntvp = FieldType()
        self.field_text_ntvp.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
        self.field_text_ntvp.setStored(False)
        self.field_text_ntvp.setTokenized(True)
        self.field_text_ntvp.setStoreTermVectors(True)
        self.field_text_ntvp.setStoreTermVectorPositions(True)

    def get_field(self, type):
        """Gets Lucene FieldType object for the corresponding internal FIELDTYPE_ value."""
        if type == Lucene.FIELDTYPE_ID:
            return self.field_id
        elif type == Lucene.FIELDTYPE_ID_TV:
            return self.field_id_tv
        elif type == Lucene.FIELDTYPE_TEXT:
            return self.field_text
        elif type == Lucene.FIELDTYPE_TEXT_TV:
            return self.field_text_tv
        elif type == Lucene.FIELDTYPE_TEXT_TVP:
            return self.field_text_tvp
        elif type == Lucene.FIELDTYPE_TEXT_NTV:
            return self.field_text_ntv
        elif type == Lucene.FIELDTYPE_TEXT_NTVP:
            return self.field_text_ntvp
        else:
            raise Exception("Unknown field type")
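# A hedged usage sketch: how a caller might combine these field types when building a
# document. The surrounding Lucene wrapper (with its FIELDTYPE_* constants) and an
# already-open IndexWriter are assumed from context and not shown here.
fields = LuceneDocumentField()
doc = Document()
doc.add(Field("id", "doc_001", fields.get_field(Lucene.FIELDTYPE_ID)))
doc.add(Field("content", "analyzed text with term vectors and positions",
              fields.get_field(Lucene.FIELDTYPE_TEXT_TVP)))
writer.addDocument(doc)   # writer: an IndexWriter opened elsewhere (assumed)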
import lucene

# for customized field
from org.apache.lucene.document import Field
from org.apache.lucene.document import FieldType
from org.apache.lucene.index import IndexOptions

lucene.initVM(vmargs=['-Djava.awt.headless=true'])

CUSTOM_FIELD_TEXT = FieldType()
CUSTOM_FIELD_TEXT.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
CUSTOM_FIELD_TEXT.setStored(True)
CUSTOM_FIELD_TEXT.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorPositions(True)
CUSTOM_FIELD_TEXT.setStoreTermVectorOffsets(True)
#CUSTOM_FIELD_TEXT.setStoreTermVectorPayloads(True)
CUSTOM_FIELD_TEXT.setTokenized(True)

CUSTOM_FIELD_TEXT_BF = FieldType()
CUSTOM_FIELD_TEXT_BF.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
CUSTOM_FIELD_TEXT_BF.setStored(False)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectorPositions(True)
CUSTOM_FIELD_TEXT_BF.setStoreTermVectorOffsets(True)
CUSTOM_FIELD_TEXT_BF.setTokenized(True)

CUSTOM_FIELD_TEXT_DF = FieldType()
CUSTOM_FIELD_TEXT_DF.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
CUSTOM_FIELD_TEXT_DF.setStored(False)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectors(True)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectorPositions(False)
CUSTOM_FIELD_TEXT_DF.setStoreTermVectorOffsets(False)
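# A short illustrative example (the field names and text are made up, not from the
# original module) of how the three custom field types above might be attached to a
# single document before it is handed to an IndexWriter.
from org.apache.lucene.document import Document

doc = Document()
# stored text with full term vectors (positions + offsets)
doc.add(Field("contents", "the quick brown fox", CUSTOM_FIELD_TEXT))
# unstored text, term vectors with positions and offsets
doc.add(Field("contents_bf", "the quick brown fox", CUSTOM_FIELD_TEXT_BF))
# unstored text, term vectors without positions or offsets
doc.add(Field("contents_df", "the quick brown fox", CUSTOM_FIELD_TEXT_DF))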
def BuildSearchEngine(start, maxPages,domain,first): #only initiate VM if it's the first being called if first == True: lucene.initVM(vmargs=['-Djava.awt.headless=true']) print ('lucene'), lucene.VERSION if not os.path.exists("IndexFiles.index"): os.mkdir("IndexFiles.index") store = SimpleFSDirectory(Paths.get("IndexFiles.index")) config = IndexWriterConfig(StandardAnalyzer(StandardAnalyzer.STOP_WORDS_SET)) #if first time being called, create new index, otherwise only append new pages into old index if first == True: config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) else: config.setOpenMode(IndexWriterConfig.OpenMode.APPEND) writer = IndexWriter(store, config) #configure settings for pages being saved t1 = FieldType() t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setStored(False) t2.setTokenized(True) t2.setStoreTermVectors(True) t2.setStoreTermVectorOffsets(True) t2.setStoreTermVectorPositions(True) t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) pagesToVisit = [start] hashtable=dict() hashtable[start]=1 numberVisited = 0 rp = robotparser.RobotFileParser() robotFileLocation = "http://www."+domain+"/robots.txt" rp.set_url(robotFileLocation) rp.read() # The main loop. Create a LinkParser and get all the links on the page. while numberVisited < maxPages and pagesToVisit != []: numberVisited = numberVisited +1 # Start from the beginning of our collection of pages to visit: url = pagesToVisit[0] pagesToVisit = pagesToVisit[1:] try: print(numberVisited, "Visiting:", url) parser = LinkParser() data, links,hashtable = parser.getLinks(url,domain,hashtable,rp) # Add the pages that we visited to the end of our collection # of pages to visit: print(" **Success!**") path = "files/a.html" urllib.urlretrieve(url,path) file = open("files/a.html") contents = removeTag(file); file.close() file = open("files/a.html","w") file.write(contents) file.close() file = open("files/a.html") contents = file.read() file.close() doc = Document() doc.add(Field("name", "a.html", t1)) doc.add(Field("path", "files", t1)) #index the url doc.add(Field("url", url, t1)) if len(contents) > 0: doc.add(Field("contents", contents.decode("utf-8").replace(u"\u2019","'").replace(u"\u2018","'").replace(u"\ufe0f","'").replace(u"\u20e3","'"), t2)) else: print ("warning: no content in %s") % filename writer.addDocument(doc) pagesToVisit = pagesToVisit + links except Exception,e: print Exception,":",e print(" **Failed!**")
    writer = IndexWriter(directory, config)
    return writer

def open_searcher(writer):
    from org.apache.lucene.search import IndexSearcher
    reader = writer.getReader()
    searcher = IndexSearcher(reader)
    return reader, searcher

from org.apache.lucene.document import Document, Field, FieldType, TextField, StringField
from org.apache.lucene.util import BytesRef, BytesRefIterator
from org.apache.lucene.index import Term

vectorFieldType = FieldType(TextField.TYPE_NOT_STORED)
vectorFieldType.setIndexed(True)
vectorFieldType.setTokenized(True)
vectorFieldType.setStoreTermVectors(True)
vectorFieldType.setStoreTermVectorPositions(False)

writer = open_writer('data/index')

def addToIndex(lxmlNode):
    uri = xpathFirst(lxmlNode, '//oa:hasTarget/@rdf:resource')
    print uri
    seen = set()
    doc = Document()
    for fieldName in FIELD_NAMES:
        seen.clear()
        for subpath in ['', '/*/rdfs:label', '/*/skos:prefLabel', '/*/skos:altLabel',
                        '/*/dcterms:title', '/*/foaf:name']:
            for value in xpath(lxmlNode, '//%(fieldName)s%(subpath)s/text()' % locals()):
def create_index_from_folder(folder, index_file):
    """Lets Lucene create an index of all database files within a specified folder

    :param folder: absolute or relative path to database files
    :param index_file: absolute or relative output location for index

    Notes:
    - Does not go through database folder recursively, i.e. all files have to be at the root of the folder
    - Only CSV files are supported
    - Column headers are hardcoded and should follow:
      ID, text, Reddit ID, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold
    """
    # Set up Lucene
    print()
    print("Starting Lucene ...")
    lucene.initVM()
    index_store = SimpleFSDirectory.open(File(index_file).toPath())
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    writer = IndexWriter(index_store, config)
    print()

    # Go through files, add rows of each as Documents to writer
    for file in os.listdir(folder):
        if file.endswith(".csv"):
            print("Indexing {} ...".format(file), end=" ", flush=True)
            with open(os.path.join(folder, file), newline='') as db:
                reader = csv.reader(db)

                # The Reddit database seems to carry a lot of duplicate posts, so we try to skip those
                post_ids = set()
                duplicate_counter = 0

                # To store term vectors (used for query expansion) we have to use a custom fieldtype
                customfield = FieldType()
                customfield.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
                customfield.setStored(True)
                customfield.setTokenized(True)
                customfield.setStoreTermVectors(True)

                # CSV files have a useless first row...
                skipfirst = True
                # ... and a useless first column. Skip both.
                for _, text, rid, subreddit, meta, time, author, ups, downs, authorlinkkarma, authorkarma, authorisgold in reader:
                    if skipfirst:
                        skipfirst = False
                        continue
                    doc = Document()

                    if rid in post_ids:
                        duplicate_counter += 1
                        continue  # skip
                    else:
                        post_ids.add(rid)

                    # Tokenize, index and store
                    doc.add(Field("text", text, customfield))

                    # Index and store
                    doc.add(StringField("id", rid, Field.Store.YES))
                    doc.add(StringField("subreddit", subreddit, Field.Store.YES))
                    doc.add(StringField("meta", meta, Field.Store.YES))
                    doc.add(StringField("time", time, Field.Store.YES))
                    doc.add(StringField("author", author, Field.Store.YES))

                    # Store only
                    doc.add(StoredField("ups", ups))
                    doc.add(StoredField("downs", downs))
                    doc.add(StoredField("authorlinkkarma", authorlinkkarma))
                    doc.add(StoredField("authorkarma", authorkarma))
                    doc.add(StoredField("authorisgold", authorisgold))

                    writer.addDocument(doc)

            print("DONE!\t(Duplicate posts skipped: {})".format(duplicate_counter))

    writer.commit()
    writer.close()
    print()
    print("Finished indexing!")
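# Not part of the original function: a hedged sketch of how the index written above could
# be queried on the tokenized "text" field. The query string is illustrative; index_file
# is the same path passed to create_index_from_folder.
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher

def search_example(index_file, querystring="term vectors"):
    reader = DirectoryReader.open(SimpleFSDirectory.open(File(index_file).toPath()))
    searcher = IndexSearcher(reader)
    query = QueryParser("text", StandardAnalyzer()).parse(querystring)
    for hit in searcher.search(query, 10).scoreDocs:
        doc = searcher.doc(hit.doc)
        print(doc.get("id"), doc.get("subreddit"), hit.score)
    reader.close()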
class IndexInterface(TextInterface): """ Abstract class that takes care of serializing and deserializing text in an indexed structure This use lucene library Args: directory (str): Path of the directory where the content will be serialized """ def __init__(self, directory: str): super().__init__(directory) self.__doc = None self.__writer = None self.__field_type_frequency = None self.__field_type_searching = None def __str__(self): return "IndexInterface" def init_writing(self): self.__field_type_searching = FieldType(TextField.TYPE_STORED) self.__field_type_frequency = FieldType(StringField.TYPE_STORED) self.__field_type_frequency.setStored(True) self.__field_type_frequency.setTokenized(False) self.__field_type_frequency.setStoreTermVectors(True) self.__field_type_frequency.setStoreTermVectorPositions(True) self.__field_type_frequency.\ setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) fs_directory = SimpleFSDirectory(Paths.get(self.directory)) self.__writer = IndexWriter(fs_directory, IndexWriterConfig()) def new_content(self): """ In the lucene index case the new content is a new document in the index """ self.__doc = Document() def new_field(self, field_name: str, field_data): """ Add a new field Args: field_name (str): Name of the new field field_data: Data to put into the field """ if isinstance(field_data, list): for word in field_data: self.__doc.add( Field(field_name, word, self.__field_type_frequency)) else: self.__doc.add( Field(field_name, field_data, self.__field_type_frequency)) def new_searching_field(self, field_name, field_data): """ Add a new searching field. It will be used by the search engine recommender Args: field_name (str): Name of the new field field_data: Data to put into the field """ self.__doc.add( Field(field_name, field_data, self.__field_type_searching)) def serialize_content(self): """ Serialize the content """ doc_index = self.__writer.addDocument(self.__doc) return doc_index - 1 def stop_writing(self): """ Stop the index writer and commit the operations """ self.__writer.commit() self.__writer.close() def get_tf_idf(self, field_name: str, content_id: str): """ Calculates the tf-idf for the words contained in the field of the content whose id is content_id Args: field_name (str): Name of the field containing the words for which calculate the tf-idf content_id (str): Id of the content that contains the specified field Returns: words_bag (Dict <str, float>): Dictionary whose keys are the words contained in the field, and the corresponding values are the tf-idf values. """ searcher = IndexSearcher( DirectoryReader.open(SimpleFSDirectory(Paths.get(self.directory)))) query = QueryParser("testo_libero", KeywordAnalyzer()).parse("content_id:\"" + content_id + "\"") score_docs = searcher.search(query, 1).scoreDocs document_offset = -1 for score_doc in score_docs: document_offset = score_doc.doc reader = searcher.getIndexReader() words_bag = {} term_vector = reader.getTermVector(document_offset, field_name) term_enum = term_vector.iterator() for term in BytesRefIterator.cast_(term_enum): term_text = term.utf8ToString() postings = term_enum.postings(None) postings.nextDoc() term_frequency = 1 + math.log10( postings.freq()) # normalized term frequency inverse_document_frequency = math.log10( reader.maxDoc() / reader.docFreq(Term(field_name, term))) tf_idf = term_frequency * inverse_document_frequency words_bag[term_text] = tf_idf reader.close() return words_bag def delete_index(self): shutil.rmtree(self.directory, ignore_errors=True)
def tweetIndexer(self, writer): t1 = FieldType() t1.setIndexed(True) t1.setStored(True) t1.setTokenized(False) t1.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS) t2 = FieldType() t2.setIndexed(True) t2.setStored(True) t2.setTokenized(True) t2.setStoreTermVectorOffsets(True) t2.setStoreTermVectorPayloads(True) t2.setStoreTermVectorPositions(True) t2.setStoreTermVectors(True) t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) x = 0 for i in range(0,500): if not os.path.isfile("json/tweets-" + str(i) + ".json"): break print "adding tweets-" + str(i) + ".json" tweets = open("json/tweets-" + str(i) + ".json", "r") for line in tweets.readlines(): tweet = json.loads(line) if 'limit' in tweet: continue try: doc = Document() doc.add(Field("file", "json/tweets-" + str(i) + ".json", t1)) sname = tweet['user']['screen_name'] tid = str(tweet['id']) text = tweet['text'] uname = tweet['user']['name'] created = tweet['created_at'] tstamp = tweet['timestamp_ms'] place = "" if tweet['place']: place = tweet['place']['full_name'] + ", " + tweet['place']['country'] lat = "" lng = "" titles = "" urls = "" exist = "false" if tweet['coordinates']: lat = str(tweet['coordinates']['coordinates'][1]) lng = str(tweet['coordinates']['coordinates'][0]) else: lat = str((tweet['place']['bounding_box']['coordinates'][0][0][1] + tweet['place']['bounding_box']['coordinates'][0][2][1])/2) lng = str((tweet['place']['bounding_box']['coordinates'][0][0][0] + tweet['place']['bounding_box']['coordinates'][0][2][0])/2) if len(tweet['entities']['urls']) != 0: exist = "true" for index in range(len(tweet['entities']['urls'])): title = tweet['entities']['urls'][index]['url_title'] if title == None: titles += ",-" else: title = title.encode('ascii','ignore') titles += "," + str(title) urls += " " + str(tweet['entities']['urls'][index]['expanded_url']) searchable = text + " " + urls + " " + uname + " " + sname + " " + place doc.add(Field("lookup", searchable, t2)) doc.add(Field("text", text, t2)) doc.add(Field("user_name", uname, t2)) doc.add(Field("screen_name", sname, t2)) doc.add(Field("tweet_id", tid, t2)) doc.add(Field("created_at", created, t2)) doc.add(Field("geo_lat", lat, t2)) doc.add(Field("geo_lng", lng, t2)) doc.add(Field("url_exist", exist, t2)) doc.add(Field("url_url", urls, t2)) doc.add(Field("url_title", titles, t2)) doc.add(Field("timestamp", tstamp, t2)) writer.addDocument(doc) x += 1 except Exception, e: pass tweets.close()
class IndexFiles:
    def __init__(self, path, analyzer):
        self.path = path
        self._analyzer = analyzer
        self.errors = []
        self._initialize()

    def index(self, csvs_path):
        all_csvs = [x for x in os.listdir(csvs_path) if x.endswith('csv')]
        for i, csv_file in enumerate(all_csvs, 1):
            print("\nProcessing CSV #{}".format(i), flush=True)
            patents = self._read_csv(csvs_path + "/" + csv_file)
            if patents is None:
                continue
            print("\rProcessed {}/{} patents in file".format(0, len(patents)), end='', flush=True)
            for j, patent in enumerate(patents, 1):
                pid, date, title, author, icn, org, acn, abstract, description, purpose, mechanics, uid = patent
                try:
                    doc = Document()
                    doc.add(Field('id', pid, self._ft1))
                    doc.add(Field('date', date, self._ft1))
                    doc.add(Field('title', title, self._ft2))
                    doc.add(Field('author', author, self._ft1))
                    doc.add(Field('icn', icn, self._ft1))
                    doc.add(Field('organization', org, self._ft1))
                    doc.add(Field('acn', acn, self._ft1))
                    doc.add(Field('abstract', abstract, self._ft2))
                    doc.add(Field('description', description, self._ft2))
                    doc.add(Field('purpose', purpose, self._ft2))
                    doc.add(Field('mechanics', mechanics, self._ft2))
                    doc.add(Field('uid', uid, self._ft1))
                    self._writer.addDocument(doc)
                except Exception as e:
                    print("\nFailed to index '{}': {}\n".format(csvs_path, e))
                print("\rProcessed {}/{} patents in file".format(j, len(patents)), end='', flush=True)
            print()
        self._commit()
        return self

    def _read_csv(self, path):
        try:
            with open(path, 'rU', newline='') as fs:
                reader = csv.reader(x.replace('\0', '') for x in fs)
                rows = [r for r in reader]
                return rows
        except Exception as e:
            print("\nError reading file '{}' : {} \n".format(path, e))
            return None

    def _commit(self):
        ticker = Ticker()
        print("Commiting index", end='', flush=True)
        threading.Thread(target=ticker.run).start()
        self._writer.commit()
        self._writer.close()
        ticker.tick = False
        print("Done!")

    def _initialize(self):
        if not os.path.exists(self.path):
            os.mkdir(self.path)
        self._analyzer = LimitTokenCountAnalyzer(self._analyzer, 1048576)
        self._store = SimpleFSDirectory(Paths.get(self.path))
        self._config = IndexWriterConfig(self._analyzer)
        self._config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self._writer = IndexWriter(self._store, self._config)
        self._set_fieldtypes()

    def _set_fieldtypes(self):
        # ft1: stored, non-tokenized metadata fields
        self._ft1 = FieldType()
        self._ft1.setStored(True)
        self._ft1.setTokenized(False)
        self._ft1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        # ft2: stored, tokenized text fields with term vectors
        self._ft2 = FieldType()
        self._ft2.setStored(True)
        self._ft2.setTokenized(True)
        self._ft2.setStoreTermVectors(True)
        self._ft2.setStoreTermVectorOffsets(True)
        self._ft2.setStoreTermVectorPositions(True)
        self._ft2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
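# A brief, assumed usage note for the class above: the index path, CSV folder, and the
# choice of StandardAnalyzer are illustrative, not taken from the original source.
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
IndexFiles("patent_index", StandardAnalyzer()).index("patent_csvs")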