class SearchIndex(object):

    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()
        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)

    def search(self, q, page=1, duplicates=False):
        query = self.parser.parse(q)
        if not duplicates:
            query = self.addDuplicatesQuery(query)

        perPage = 10
        start = (page - 1) * perPage

        results = TopScoreDocCollector.create(1000, True)
        self.searcher.search(query, results)

        highlighter = Highlighter(QueryScorer(query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))

        docs = []
        for scoreDoc in results.topDocs(start, perPage).scoreDocs:
            doc = self.searcher.doc(scoreDoc.doc)
            tokenStream = self.analyzer.tokenStream("contents", StringReader(doc['contents']))
            highlight = highlighter.getBestFragments(tokenStream, doc['contents'], 3, "...")
            docs.append({
                'title': doc['title'],
                'url': doc['url'],
                'duplicate': doc['duplicate'],
                'highlight': highlight
            })

        del self.searcher

        totalPages = int(math.ceil(results.getTotalHits() / float(perPage)))
        return totalPages, docs

    def addDuplicatesQuery(self, query):
        not_duplicate = TermQuery(Term('duplicate', 'false'))
        booleanQuery = BooleanQuery()
        booleanQuery.add(not_duplicate, BooleanClause.Occur.MUST)
        booleanQuery.add(query, BooleanClause.Occur.MUST)
        return booleanQuery
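# A minimal usage sketch for the SearchIndex class above (not part of the original
# snippet). It assumes the same Flask-style app.config['INDEX_PATH'] setup and that
# lucene.initVM() was already called at application startup; the query string and page
# number are illustrative request parameters.
index = SearchIndex()
total_pages, hits = index.search("wicked problems", page=1, duplicates=False)
for hit in hits:
    # each hit is a dict built in search(): title, url, duplicate flag and highlight snippet
    print("%s - %s" % (hit['title'], hit['highlight']))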
def lucene_search(query, MAX, showHighlight):
    dir = os.getcwd()
    lucene.initVM()
    index_dir = SimpleFSDirectory(File(dir))
    index_reader = DirectoryReader.open(index_dir)
    lucene_searcher = IndexSearcher(index_reader)
    lucene_analyzer = StandardAnalyzer(Version.LUCENE_48)

    my_query = QueryParser(Version.LUCENE_48, "text", lucene_analyzer).parse(query)
    # We can define the MAX number of results (default 10)
    total_hits = lucene_searcher.search(my_query, MAX)

    query_scorer = QueryScorer(my_query)
    formatter = SimpleHTMLFormatter()
    highlighter = Highlighter(formatter, query_scorer)
    # Set the fragment size. We break text into fragments of 50 characters.
    fragmenter = SimpleSpanFragmenter(query_scorer, 50)
    highlighter.setTextFragmenter(fragmenter)

    print "Only shows at most %s documents" % MAX
    if showHighlight:
        print "<br>"

    for hit in total_hits.scoreDocs:
        doc = lucene_searcher.doc(hit.doc)
        text = doc.get("text")
        ts = lucene_analyzer.tokenStream("text", StringReader(text))

        if showHighlight:
            print "<p>"

        print doc.get("title")

        if showHighlight:
            print "<br>"
            print highlighter.getBestFragments(ts, text, 3, "...")
            print "</p>"
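# Hedged example call for lucene_search() above (not from the original source). It
# assumes the index was previously built in the current working directory and that the
# JVM has not been started yet, since the function itself calls lucene.initVM().
lucene_search("wicked problems", 20, True)   # show at most 20 hits, with HTML highlights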
class HighlighterTestCase(PyLuceneTestCase):
    """
    Unit tests ported from Java Lucene. 2004 by Yura Smolsky ;)
    """
    FIELD_NAME = "contents"
    texts = [
        "A wicked problem is one for which each attempt to create a solution changes the understanding of the problem. Wicked problems cannot be solved in a traditional linear fashion, because the problem definition evolves as new possible solutions are considered and/or implemented.",
        "Wicked problems always occur in a social context -- the wickedness of the problem reflects the diversity among the stakeholders in the problem.",
        "From http://cognexus.org/id42.htm",
        "Most projects in organizations -- and virtually all technology-related projects these days -- are about wicked problems. Indeed, it is the social complexity of these problems, not their technical complexity, that overwhelms most current problem solving and project management approaches.",
        "This text has a typo in referring to whicked problems"
    ]

    def __init__(self, *args):
        super(HighlighterTestCase, self).__init__(*args)
        self.parser = QueryParser(self.FIELD_NAME, StandardAnalyzer())

    def setUp(self):
        super(HighlighterTestCase, self).setUp()
        self.analyzer = StandardAnalyzer()
        writer = self.getWriter(analyzer=self.analyzer)
        for text in self.texts:
            self.addDoc(writer, text)
        writer.commit()
        writer.close()
        self.reader = self.getReader()
        self.numHighlights = 0

    def testSimpleHighlighter(self):
        self.doSearching("Wicked")
        highlighter = Highlighter(QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(40))
        maxNumFragmentsRequired = 2
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME, StringReader(text))
            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired, "...")
            print "\t", result
        # Not sure we can assert anything here - just running to check we don't
        # throw any exceptions

    def testGetBestFragmentsSimpleQuery(self):
        self.doSearching("Wicked")
        self.doStandardHighlights()
        self.assert_(self.numHighlights == 3,
                     ("Failed to find correct number of highlights, %d found"
                      % (self.numHighlights)))

    def doSearching(self, queryString):
        self.searcher = self.getSearcher()
        self.query = self.parser.parse(queryString)
        # for any multi-term queries to work (prefix, wildcard, range,
        # fuzzy etc) you must use a rewritten query!
        self.query = self.query.rewrite(self.reader)
        print "Searching for:", self.query.toString(self.FIELD_NAME)
        self.scoreDocs = self.searcher.search(self.query, 100).scoreDocs
        self.numHighlights = 0

    def doStandardHighlights(self):
        formatter = TestFormatter(self)
        highlighter = Highlighter(formatter, QueryScorer(self.query))
        highlighter.setTextFragmenter(SimpleFragmenter(20))
        for scoreDoc in self.scoreDocs:
            text = self.searcher.doc(scoreDoc.doc).get(self.FIELD_NAME)
            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            tokenStream = self.analyzer.tokenStream(self.FIELD_NAME, StringReader(text))
            result = highlighter.getBestFragments(tokenStream, text,
                                                  maxNumFragmentsRequired,
                                                  fragmentSeparator)
            print "\t", result

    def countHighlightTerm(self):
        self.numHighlights += 1  # update stats used in assertions

    def addDoc(self, writer, text):
        d = Document()
        f = Field(self.FIELD_NAME, text, TextField.TYPE_STORED)
        d.add(f)
        writer.addDocument(d)
class LuceneManager(object):

    def __init__(self, index_root_loc, index_subdir_name='.siftindex/index'):
        self.index_root_loc = index_root_loc
        self.index_subdir_name = index_subdir_name

    def __enter__(self):
        """
        Used by "with" statement. Like an "open" / "init" method.
        """
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        index_path = Path(self.index_root_loc).joinpath('%s/' % self.index_subdir_name)
        index_path.mkdir(parents=True, exist_ok=True)
        store = SimpleFSDirectory(Paths.get(str(index_path)))
        self.analyzer = StandardAnalyzer()
        config = IndexWriterConfig(self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # IndexWriter
        self.writer = IndexWriter(store, config)
        # IndexReader
        self.reader = DirectoryReader.open(self.writer)
        # IndexSearcher
        self.searcher = IndexSearcher(self.reader)
        return self

    def insert(self, document):
        self.writer.addDocument(document)
        return document['key']

    def delete(self, key):
        self.writer.deleteDocuments(Term('key', key))
        return key

    def delete_all(self):
        self.writer.deleteAll()

    def num_docs(self):
        return self.reader.numDocs()

    def update(self, key, document):
        # atomic delete and add
        self.writer.updateDocument(Term('key', key), document)
        return key

    def exists(self, key):
        boolean_query = BooleanQuery.Builder()
        boolean_query.add(TermQuery(Term('key', key)), BooleanClause.Occur.MUST)
        results = self.searcher.search(boolean_query.build(), 1)
        return results.totalHits > 0

    def commit(self):
        self.writer.commit()
        # make IndexReader reflect index updates
        # TODO: try IndexReader.isCurrent()
        new_reader = DirectoryReader.openIfChanged(self.reader)
        if new_reader is not None:
            self.reader.close()  # note: not thread safe, may need to revisit
            self.reader = new_reader
            self.searcher = IndexSearcher(self.reader)

    def _process_search_result(self, result, highlighter=None):
        docid = result.doc  # this is not a stable identifier
        # obtain document through an IndexReader
        doc = self.searcher.doc(docid)
        # doc.getFields() -> field.name(), field.stringValue()
        # use highlighter to extract relevant part of body
        highlighted_text = ''
        if highlighter:
            contents = doc['body']
            token_stream = self.analyzer.tokenStream('body', contents)
            n_fragments = 3
            fragment_separator = '...'
            highlighted_text = highlighter.getBestFragments(token_stream, contents,
                                                            n_fragments, fragment_separator)
        return {
            'fullpath': doc['fullpath'],
            'last_modified_time': doc['last_modified_time'],
            'score': result.score,
            'excerpt': highlighted_text
        }

    def search(self, terms, n_hits=5):
        """
        Run search query.
        """
        # TODO: support date range queries
        # build query
        parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
        #parser.setDefaultOperator(QueryParser.Operator.AND)  # defaults to OR unless terms have modifier
        query = MultiFieldQueryParser.parse(parser, terms)  # https://stackoverflow.com/a/26853987/130164
        # create a highlighter
        highlighter = Highlighter(SimpleHTMLFormatter('*', '*'), QueryScorer(query))
        # execute search for top N hits
        return [
            self._process_search_result(result, highlighter)
            for result in self.searcher.search(query, n_hits).scoreDocs
        ]

    def get_all_docs(self, n_hits=1000):
        # debug method
        return [
            self._process_search_result(result)
            for result in self.searcher.search(MatchAllDocsQuery(), n_hits).scoreDocs
        ]

    def __exit__(self, type, value, traceback):
        """
        Used by the "with" statement. Handles close.
        TODO: error handling
        """
        self.writer.close()
        self.reader.close()

    def debug_analyzer(self, text):
        """
        Debug what StandardAnalyzer will give on this text.
        Ref: https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/analysis/package-summary.html
        Ref: pylucene tests --> test_Analyzers.py, BaseTokenStreamTestCase.py
        """
        token_stream = self.analyzer.tokenStream('field', text)
        termAtt = token_stream.getAttribute(CharTermAttribute.class_)
        token_stream.reset()
        tokens = []
        while token_stream.incrementToken():
            #tokens.append(token_stream.reflectAsString(True))
            tokens.append(termAtt.toString())
        token_stream.end()
        token_stream.close()
        return tokens
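# A minimal sketch of how LuceneManager might be used as a context manager (assumed
# usage, not part of the original class). The document fields below are illustrative;
# the class itself only relies on 'key', 'fullpath', 'body' and 'last_modified_time'.
with LuceneManager('/tmp/siftindex-demo') as mgr:
    doc = Document()
    doc.add(StringField('key', 'doc-1', Field.Store.YES))
    doc.add(StringField('fullpath', '/tmp/notes.txt', Field.Store.YES))
    doc.add(StringField('last_modified_time', '2019-01-01T00:00:00Z', Field.Store.YES))
    doc.add(TextField('body', 'PyLucene highlighting example body text.', Field.Store.YES))
    mgr.insert(doc)
    mgr.commit()
    for hit in mgr.search('highlighting'):
        print(hit['fullpath'], hit['score'], hit['excerpt'])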
# Basic tokenizer example.
test = "This is how we do it."
tokenizer = StandardTokenizer()
tokenizer.setReader(StringReader(test))
charTermAttrib = tokenizer.getAttribute(CharTermAttribute.class_)
tokenizer.reset()
tokens = []
while tokenizer.incrementToken():
    tokens.append(charTermAttrib.toString())
print(tokens)

# StandardAnalyzer example.
analyzer = StandardAnalyzer()
stream = analyzer.tokenStream("", StringReader(test))
stream.reset()
tokens = []
while stream.incrementToken():
    tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
print(tokens)

# JapaneseAnalyzer example.
analyzer = JapaneseAnalyzer()
test = "寿司が食べたい。"
stream = analyzer.tokenStream("", StringReader(test))
stream.reset()
tokens = []
while stream.incrementToken():
    tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())
print(tokens)
def rocchio(ireader, query, relevantIDs, nonrelevantIDs, maxaddedterms=5,
            tfidfthresh=0.0, alpha=1.0, beta=0.75, gamma=0.15):
    """
    Performs the Rocchio Query Expansion algorithm to provide additional relevant query terms.

    :param ireader: IndexReader object (you'll probably want to use the one available during search).
    :param query: Query string, the raw string from the user.
    :param relevantIDs: Document IDs of all relevant documents.
    :param nonrelevantIDs: Document IDs of all non-relevant documents.
    :param maxaddedterms: The maximum amount of terms appended to the expanded query
        (can be combined with tfidfthresh). Default: 5
    :param tfidfthresh: TF-IDF threshold for all query terms (can be combined with maxaddedterms). Default: 0
    :param alpha: Rocchio alpha weight (original query vector). Default: 1.0
    :param beta: Rocchio beta weight (relevant docs query vector). Default: 0.75
    :param gamma: Rocchio gamma weight (non-relevant docs query vector). Default: 0.15
    :return: Query string.

    Weight defaults were sourced from:
    https://nlp.stanford.edu/IR-book/html/htmledition/the-rocchio71-algorithm-1.html
    """
    # All score vectors we'll be using
    q0 = {}
    drv = {}
    dnrv = {}

    # Process query into tokens
    analyzer = StandardAnalyzer()
    stream = analyzer.tokenStream("", StringReader(query))
    stream.reset()
    tokens = []
    while stream.incrementToken():
        tokens.append(stream.getAttribute(CharTermAttribute.class_).toString())

    # (Re)generate score vector for current query
    q0 = get_score_vector(ireader, alpha, terms=tokens)
    # Generate score vector for relevant documents
    if relevantIDs:
        drv = get_score_vector(ireader, beta, docs=relevantIDs)
    # Generate score vector for nonrelevant documents
    if nonrelevantIDs:
        dnrv = get_score_vector(ireader, gamma, docs=nonrelevantIDs)

    # Merge score vectors following Rocchio formula. Weights have already been applied
    q1 = q0
    for key, value in drv.items():
        if key in q1:
            q1[key] += value
        else:
            q1[key] = value
    for key, value in dnrv.items():
        if key in q1:
            q1[key] -= value
        else:
            q1[key] = value

    # Return all the best terms
    # Terms are narrowed down using both a TF-IDF threshold and a maximum amount of terms
    # The TF-IDF threshold is 0 (i.e. ignored) by default
    bestterms = sorted([t for t in q1.keys() if q1[t] > tfidfthresh],
                       key=lambda x: q1[x], reverse=True)
    best = bestterms[:len(tokens) + maxaddedterms]
    # print([(t, q1[t]) for t in best])
    return " ".join(best)
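# Illustrative call of rocchio() above (assumed context, not from the original source):
# `reader` would be an open DirectoryReader over the index, the ID lists would come from
# user relevance feedback, and get_score_vector() is the helper the function already
# depends on.
expanded = rocchio(reader, "wicked problems",
                   relevantIDs=[0, 3], nonrelevantIDs=[4],
                   maxaddedterms=5)
print(expanded)  # original terms plus up to five expansion terms, space-separated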
class Indexer:
    """
    Indexer Class
    """
    (NAME, CONTENT, DATE, URL, TAGS, TIMESTAMP) = ("name", "content", "date", "url", "tags", "timestamp")

    def __init__(self, indexDir="", debug=False, verbose=False):
        """
        :Parameters:
        - `indexDir`: Path where the Index will be saved. (Str)
        - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean)
        - `verbose`: Provide additional information about the initialization process. (Boolean)
        """
        self.__verbose = verbose
        if indexDir != "":
            INDEX_DIR = indexDir
        else:
            INDEX_DIR = os.path.dirname(os.path.realpath(__file__)) + "/luceneIndex"

        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)
            self.__boAppend = False
        else:
            self.__boAppend = True

        # Initialize lucene and JVM
        lucene.initVM()

        # Get index storage
        if debug:
            # Store the index in memory
            self.__indexDir = RAMDirectory()
            self.__boAppend = False
            INDEX_DIR = "RAM Memory"
        else:
            # Store an index on disk
            self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR))

        # Create Content FieldType
        self.__contentType = FieldType()
        self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.__contentType.setTokenized(True)
        self.__contentType.setStored(True)
        self.__contentType.setStoreTermVectors(True)
        self.__contentType.setStoreTermVectorPositions(True)
        self.__contentType.freeze()

        # Get the Analyzer
        self.__analyzer = StandardAnalyzer(StandardAnalyzer.ENGLISH_STOP_WORDS_SET)

        # Print Indexer Information
        print("Lucene version is: ", lucene.VERSION)
        print("Index Directory: ", INDEX_DIR)

    def __del__(self):
        self.__indexDir.close()

    ##################################################
    # Private Methods
    ##################################################
    @staticmethod
    def __getTimestamp(dateTime):
        """
        Converts the document's date to an integer timestamp

        :Parameters:
        - `dateTime`: Document's date (Str)

        :Returns:
        - Date timestamp (Int)
        """
        tm = time.strptime(dateTime, '%Y-%m-%dT%H:%M:%SZ')
        sTime = "{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}{5:0>2}".format(
            tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec)
        return int(sTime)

    @staticmethod
    def __getDateTime(timeStamp):
        """
        Converts the document's timestamp to date

        :Parameters:
        - `timeStamp`: Document's timestamp

        :Returns:
        - Date (Str)
        """
        date = datetime.datetime(year=int(timeStamp[0:4]),
                                 month=int(timeStamp[4:6]),
                                 day=int(timeStamp[6:8]),
                                 hour=int(timeStamp[8:10]),
                                 minute=int(timeStamp[10:12]),
                                 second=int(timeStamp[12:14]))
        return date.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def __qualifyTags(tags):
        """
        Creates the qualify string for tags

        :Parameters:
        - `tags`: List of document's tags

        :Return:
        - Qualify Tags (Str)
        """
        sTags = ""
        for tag in tags:
            sTags += tag + '|'
        return sTags[:-1]

    @staticmethod
    def __scatterMatrix(numDocs, freqMtx):
        print("Scattering Frequency Matrix...")
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        matrix = []
        innerMatrix = ['Term']
        # Generate Document Columns
        for docIdx in range(numDocs):
            innerMatrix.append("D{0:0>4}".format(docIdx))
        matrix.append(innerMatrix)
        # Generate Word Rows and Columns
        for word in sorted(freqMtx):
            innerMatrix = []
            innerMatrix.append(word)
            for docIdx in range(numDocs):
                try:
                    termCount = round(freqMtx[word][str(docIdx)], 3)
                    innerMatrix.append(termCount)
                except KeyError:
                    innerMatrix.append(0)
            matrix.append(innerMatrix)
            pB.updateProgress()
        return matrix

    @staticmethod
    def __saveMatrix(numDocs, freqMtx):
        pathMatrix = os.path.dirname(os.path.realpath(__file__)) + "/freqMtx.txt"
        fMatrix = open(pathMatrix, 'w')
        print("Saving Frequency Matrix File: ", pathMatrix)
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        # File Generation Start
        print("+========= Frequency Matrix =========+", file=fMatrix)
        print("%20s" % (' '), end=' ', file=fMatrix)
        for docIdx in range(numDocs):
            print("D{0:0>4}".format(docIdx), end=' ', file=fMatrix)
        print(file=fMatrix)
        for word in sorted(freqMtx):
            print("%20s" % (word), end=' ', file=fMatrix)
            for docIdx in range(numDocs):
                try:
                    termCount = freqMtx[word][str(docIdx)]
                    print("%02.03f" % (termCount), end=' ', file=fMatrix)
                except KeyError:
                    print(" 0 ", end=' ', file=fMatrix)
            print(file=fMatrix)
            pB.updateProgress()
        # Close File
        fMatrix.close()

    def __stemString(self, stringToStem):
        stemmedTerms = []
        tknStream = self.__analyzer.tokenStream('STEM', stringToStem)
        stemmed = SnowballFilter(tknStream, "English")
        stemmed.reset()
        while stemmed.incrementToken():
            stemmedTerms.append(stemmed.getAttribute(CharTermAttribute.class_).toString())
        tknStream.close()
        return stemmedTerms

    @staticmethod
    def __normalize(qVector, freqMtx):
        for term in qVector:
            for docId in freqMtx:
                if (term in freqMtx[docId]) and (freqMtx[docId][term] > qVector[term]):
                    qVector[term] = freqMtx[docId][term]

    @staticmethod
    def __dotProduct(aVector, bVector):
        """
        Calculate Dot Product

        :Parameters:
        - `aVector`: A Vector. (Dict)
        - `bVector`: B Vector. (Dict)

        :Returns:
        - Dot Product. (Int)
        """
        dotProduct = 0
        for term in aVector:
            if term in bVector:
                product = aVector[term] * bVector[term]
                dotProduct += product
        return dotProduct

    @staticmethod
    def __magnitude(vector):
        """
        Calculate Vector Magnitude

        :Parameters:
        - `vector`: Query Vector. (Dict)

        :Returns:
        - Vector Magnitude. (Int)
        """
        # Magnitude of the vector is the square root of the dot product of the vector with itself.
        vectorMagnitude = Indexer.__dotProduct(vector, vector)
        vectorMagnitude = math.sqrt(vectorMagnitude)
        return vectorMagnitude

    ##################################################
    # Public Methods
    ##################################################
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that will be added to the index
            doc = Document()
            # Add a field to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(Field(Indexer.CONTENT, document['content'], self.__contentType))
            doc.add(StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(TextField(Indexer.TAGS, self.__qualifyTags(document['tags']), Field.Store.YES))
            doc.add(LongPoint(Indexer.TIMESTAMP, self.__getTimestamp(document['date'])))
            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been indexed) so
                # we use updateDocument instead to replace the old one matching the exact
                # path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']), doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" % (len(documents), writer.numDocs()))
        writer.close()

    def Search(self, query, field=NAME, maxResult=1000):
        """
        Search for a document in Lucene's Index

        :Parameters:
        - `query`: Request to be made to the Index (Str).
        - `field`: Field to be consulted by the query (NAME, CONTENT, DATE, URL, TAGS).
        - `maxResult`: Maximum number of results.
        """
        # Get the Index Directory
        reader = DirectoryReader.open(self.__indexDir)
        searcher = IndexSearcher(reader)
        # Create a query
        queryParser = QueryParser(field, self.__analyzer).parse(query)
        # Do a search
        hits = searcher.search(queryParser, maxResult)
        print("Found %d document(s) that matched query '%s':" % (hits.totalHits, queryParser))
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("Document Nº: %d - Score: %.5f" % (hit.doc, hit.score))
            print("Name: " + doc.get('name'))
            print("Tags: " + doc.get('tags') + "\n")
        reader.close()

    def StemDocument(self, docIdx):
        """
        Return an array of the document's stemmed terms

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx).get(Indexer.CONTENT)
        reader.close()
        return self.__stemString(doc)

    def FreqMatrix(self, scattered=False, byTerms=True, saveMtx=False):
        """
        Generates a Frequency Matrix of the current Index

        :Parameters:
        - `saveMtx`: Save the Frequency Matrix to a .txt file. (Boolean)
        """
        freqMtx = {}  # Terms - DocumentID Matrix
        reader = DirectoryReader.open(self.__indexDir)
        numDocs = reader.numDocs()

        print("Generating Frequency Matrix...")
        pB = ProgressBar(numDocs - 1, prefix='Progress:')
        for docIdx in range(numDocs):
            termItr = self.StemDocument(docIdx)
            termSize = len(termItr)
            docStr = '{0}'.format(docIdx)
            termDict = {}
            for termText in termItr:
                if byTerms:
                    # Check if the term exists
                    if termText in freqMtx:
                        # Check if the document exists
                        if docStr in freqMtx[termText]:
                            termCount = int(math.ceil(((freqMtx[termText][docStr] * termSize) / 100)))
                            freqMtx[termText].update({docStr: ((termCount + 1) / termSize) * 100})
                        else:
                            freqMtx[termText].update({docStr: (1 / termSize) * 100})
                    else:
                        termIdx = {termText: {docStr: (1 / termSize) * 100}}
                        freqMtx.update(termIdx)
                else:
                    # Check if the term exists
                    termText = termText.replace('.', '_')
                    if termText in termDict:
                        termCount = int(math.ceil((termDict[termText] * termSize) / 100))
                        termDict[termText] = ((termCount + 1) / termSize) * 100
                    else:
                        termIdx = {termText: (1 / termSize) * 100}
                        termDict.update(termIdx)
            if not byTerms:
                freqMtx.update({docStr: termDict})
            pB.updateProgress()

        if saveMtx and byTerms:
            self.__saveMatrix(numDocs, freqMtx)

        if scattered and byTerms:
            freqMtx = self.__scatterMatrix(numDocs, freqMtx)

        # Close IndexReader
        reader.close()
        return freqMtx

    def GetSimilarity(self, query, freqMtx):
        """
        Cosine Similarity
        """
        qVector = {}
        qList = self.__stemString(query)
        for stem in qList:
            qVector.update({stem: 0})
        self.__normalize(qVector, freqMtx)

        qList = []
        # Get similarity between query and doc[n]
        for docIdx, dVector in freqMtx.items():
            dP = self.__dotProduct(qVector, dVector)
            qM = self.__magnitude(qVector)
            dM = self.__magnitude(dVector)
            cosSimilarity = dP / (qM * dM)
            qList.append((docIdx, cosSimilarity))
        return sorted(qList, key=lambda similarity: similarity[1], reverse=True)

    def AnalyzeDocument(self, docIdx):
        """
        Generates a list of (entity, relation, entity) tuples as its output.

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        gpeList = {}
        geolocator = Geocode()
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)

        # Load NLTK Data
        nltkPath = os.path.dirname(os.path.realpath(__file__)) + '/../tools/nltk_data'
        nltk.data.path.append(nltkPath)

        # Named Entity Recognition
        content = doc.get(Indexer.CONTENT)
        sentences = nltk.sent_tokenize(content)

        # ProgressBar
        print("Analyzing Document {0}".format(docIdx))
        pB = ProgressBar(len(sentences), prefix='Progress:')

        # Loop over each sentence and tokenize it separately
        for sentence in sentences:
            ner = nltk.word_tokenize(sentence)
            ner = nltk.pos_tag(ner)
            ner = nltk.ne_chunk(ner)
            # Get all the Geo-Political Entities
            for subtrees in list(ner.subtrees(filter=lambda subtree: subtree.label() == 'GPE')):
                entityName = ' '.join([child[0] for child in subtrees])
                if entityName not in gpeList:
                    location = geolocator.GetGPE(entityName)
                    if location:
                        gpeList.update(location)
            pB.updateProgress()

        gpeList = geolocator.GetFeatureCollection(gpeList)
        return gpeList

    def GetDocField(self, docIdx, field=CONTENT):
        """
        Get the document's field

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        - `field`: Field to retrieve (Str).

        :Returns:
        - Document's field. (Str)
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        content = doc.get(field)
        reader.close()
        return content
class PorterStemmerAnalyzer(PythonAnalyzer):

    def createComponents(self, fieldName, reader):
        source = StandardTokenizer(Version.LUCENE_CURRENT, reader)
        filter = StandardFilter(Version.LUCENE_CURRENT, source)
        filter = LowerCaseFilter(Version.LUCENE_CURRENT, filter)
        filter = PorterStemFilter(filter)
        filter = StopFilter(Version.LUCENE_CURRENT, filter,
                            StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        return self.TokenStreamComponents(source, filter)


lucene.initVM(vmargs=['-Djava.awt.headless=true'])
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
input = 'this is a test string for Analyzer'
ts = analyzer.tokenStream("dummy", StringReader(input))

#matchVersion = Version.LUCENE_XY  ## Substitute desired Lucene version for XY
offsetAtt = ts.addAttribute(OffsetAttribute.class_)
termAtt = ts.addAttribute(CharTermAttribute.class_)
#posAtt = ts.addAttribute(PartOfSpeechAttribute.class_)


def testStandard():
    ts.reset()  ## Resets this stream to the beginning. (Required)
    while ts.incrementToken():
        #print ts.reflectAsString(True)
        print offsetAtt.startOffset()
        print offsetAtt.endOffset()
        print termAtt.toString()  #, posAtt.getPartOfSpeech()
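# Illustrative sketch (not from the original snippet): the custom PorterStemmerAnalyzer
# can be used wherever a regular Analyzer is expected, here simply to tokenize a string
# through the same Version.LUCENE_CURRENT API used above; the sample text is invented.
stemming_analyzer = PorterStemmerAnalyzer(Version.LUCENE_CURRENT)
stemmed_ts = stemming_analyzer.tokenStream("dummy", StringReader("running searches quickly"))
stemmed_ts.reset()
stem_att = stemmed_ts.addAttribute(CharTermAttribute.class_)
while stemmed_ts.incrementToken():
    print stem_att.toString()   # expected lowercased, stemmed tokens such as "run", "search"
stemmed_ts.close()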