def createind(product, url):
    """Creates a Lucene index entry for a product ad."""
    global counter
    counter += 1
    adId = counter
    adLine = product
    field_string = chunker(product.lower())
    field_related_words = getDbpediaMatches(product, field_string)
    lucene.initVM()

    # 1. create an index
    index_path = File("Home/WishMatcherIndex")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    index = SimpleFSDirectory(index_path)

    # 2. fill the index
    config = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    writer = IndexWriter(index, config)
    # for title in TITLES:
    import time
    millis = int(round(time.time() * 1000))
    userid = str(millis)

    doc = Document()
    doc.add(Field("AdId", str(adId), Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("AdLine", adLine, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("FieldString", field_string, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("FieldRelatedWords", field_related_words, Field.Store.YES, Field.Index.ANALYZED))
    doc.add(Field("URL", url, Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    print(adId)

    # 3. close resources
    writer.close()
    index.close()
    return ""
        print 'url:', doc.get('url')
        print 'rate:', doc.get('rate')

    while True:
        print
        print "Hit enter with no input to quit."
        choice = raw_input('1-perfumer, 2-scents:')
        if choice == '':
            return
        command = raw_input("Query:")
        command = unicode(command, 'utf-8')
        if command == '':
            return
        print "Searching for:", command
        if choice == '1':
            perfumer_search(command)
        if choice == '2':
            scent_search(command)


if __name__ == '__main__':
    STORE_DIR = "index"
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    # base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    del searcher
def main(index_dir, input_dir):
    """Creates a Lucene index and indexes every .json file it finds.
    Uses stopwords.txt to filter out stop words."""
    lucene.initVM()

    logger.info("Loading stop words from stopwords.txt")
    f = open('stopwords.txt', 'r')
    stopwords = set([])
    for line in f:
        stopwords.add(line.strip())
    f.close()
    logger.debug('Stop words: %s' % str(stopwords))
    temp = CharArraySet(1, True)
    for stopword in stopwords:
        temp.add(stopword)
    stopwords = temp

    # Create index
    logger.info("Creating Lucene index [%s]..." % index_dir)
    fs_dir = SimpleFSDirectory(Paths.get(index_dir))
    analyzer = StandardAnalyzer(stopwords)
    writerConfig = IndexWriterConfig(analyzer)
    writer = IndexWriter(fs_dir, writerConfig)
    logger.info("Currently there are %d documents in the index..." % writer.numDocs())

    # Index documents
    onlyfiles = [
        f for f in listdir(input_dir)
        if isfile(join(input_dir, f)) and f.endswith('.json')
    ]
    for f in onlyfiles:
        try:
            journal_code = f.split('.')[0]
            f = join(input_dir, f)
            json_data = open(f)
            data = json.load(json_data)
            for entry in data:
                doc = Document()
                doc.add(StringField("journal", journal_code, Field.Store.YES))
                doc.add(StringField("url", entry['url'], Field.Store.YES))
                doc.add(StringField("date", entry['date'], Field.Store.YES))
                doc.add(TextField("title", entry['title'], Field.Store.YES))
                writer.addDocument(doc)
            json_data.close()
        except IOError as v:
            try:
                (code, message) = v
            except (TypeError, ValueError):
                code = 0
                message = v
            logger.error("I/O Error: " + str(message) + " (" + str(code) + ")")
    logger.info("Indexed input files (%d documents in index)" % writer.numDocs())

    # Wrap it up
    # logger.info("About to optimize index of %d documents..." % writer.numDocs())
    # writer.optimize()
    # logger.info("...done optimizing index of %d documents" % writer.numDocs())
    logger.info("Closing index of %d documents..." % writer.numDocs())
    writer.close()

    reader = DirectoryReader.open(fs_dir)
    with open('all.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_ALL)
        for i in range(0, reader.numDocs()):
            doc = reader.document(i)
            csvwriter.writerow([
                doc.get('journal'),
                doc.get('date'),
                doc.get('url'),
                doc.get('title').strip().replace(',', '\\,')
            ])
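
# A minimal sketch of how main() above might be invoked; the "index" and
# "data" directory names are illustrative assumptions, not part of the
# original script.
if __name__ == '__main__':
    main("index", "data")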
def __init__(self, storeDir):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    self.dir = SimpleFSDirectory(File(storeDir))
def index_images_until_stop(session, handler, lbound):
    global _stop, _stopped, _vm
    _vm.attachCurrentThread()
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(TASK_FOLDER))))
    query = BooleanQuery()
    query.add(TermQuery(Term('finish_time', '0')), BooleanClause.Occur.MUST_NOT)
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    if lbound is not None:
        query.add(
            TermRangeQuery.newStringRange('finish_time', lbound, '9999999999',
                                          False, True),
            BooleanClause.Occur.MUST)
    sort = Sort(SortField('finish_time', SortField.Type.INT))
    tmpbk = None
    res = searcher.search(query, 100, sort)
    answer_content_searcher = zh_iatd.create_searcher()
    logger = external_console_logger('/tmp/zh_imgc_info')
    while not _stop:
        print 'got', len(res.scoreDocs), 'docs'
        for x in res.scoreDocs:
            try:
                imgsgot = 0
                realdoc = searcher.doc(x.doc)
                doctype = realdoc['func_name']
                objid = realdoc['id']
                logger.write(' ft:{0}'.format(realdoc['finish_time']))
                if doctype == 'user_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/people/{0}'.format(objid))),
                        HTML_PARSER)
                    cover = soup.select('#ProfileHeader .ProfileHeader-userCover img')
                    if len(cover) > 0:
                        cover_img = cover[0]['src']
                        imgsgot += 1
                        handler(cover_img, ZH_IMGTYPE_USERINFO_COVER, objid)
                    avatar_img = soup.select(
                        '#ProfileHeader .ProfileHeader-main .UserAvatar img')[0]['src']
                    imgsgot += 1
                    handler(avatar_img, ZH_IMGTYPE_USER_AVATAR, objid)
                elif doctype == 'article_data':
                    jsondata = session.get_article_content_raw(objid)
                    if 'titleImage' in jsondata.keys():
                        cover_img = jsondata['titleImage']
                        if len(cover_img) > 0:
                            imgsgot += 1
                            handler(cover_img, ZH_IMGTYPE_ARTICLE_COVER, objid)
                    soup = bs4.BeautifulSoup(jsondata['content'], HTML_PARSER)
                    for x in soup.select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ARTICLE, objid)
                elif doctype == 'topic_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/topic/{0}/hot'.format(objid))),
                        HTML_PARSER)
                    topic_img = soup.select(
                        '.zu-main-content .topic-avatar .zm-entry-head-avatar-link img')[0]['src']
                    imgsgot += 1
                    handler(topic_img, ZH_IMGTYPE_TOPIC_ICON, objid)
                elif doctype == 'answer_comments' and realdoc['start'] == '0':
                    obj, q = zh_iatd.query_object(answer_content_searcher, objid,
                                                  zh_pganlz.answer)
                    for x in obj.data.text.as_soup().select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ANSWER, objid)
                elif doctype == 'question_data':
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/question/{0}'.format(objid))),
                        HTML_PARSER)
                    for x in soup.select('#zh-question-detail img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_QUESTION, objid)
                else:
                    logger.write('\n')
                    continue
                logger.write(' ({0}, +{1})\n'.format(doctype, imgsgot))
                if _stop:
                    break
                time.sleep(3)
            except Exception as e:
                logger.write('\n## ERROR ################################\n')
                logger.write(traceback.format_exc())
        if len(res.scoreDocs) > 0:
            tmpbk = res.scoreDocs[-1]
        res = searcher.searchAfter(tmpbk, query, 100, sort)
    print 'stopped'
    _stopped = True
def __init__(this, storeDir, analyzer):
    store = SimpleFSDirectory(Paths.get(storeDir))
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    this.writer = IndexWriter(store, config)
def index(docDirPath="data", indexDirPath="index"):
    lucene.initVM()
    indexDir = SimpleFSDirectory(Paths.get(indexDirPath))
    analyzer = StandardAnalyzer()
    writerConfig = IndexWriterConfig(analyzer)
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(indexDir, writerConfig)

    files = os.listdir(docDirPath)
    for file in files:
        if not os.path.isdir(file):
            f = open(docDirPath + "/" + file)
            iter_f = iter(f)
            for line in iter_f:
                document = Document()
                data = json.loads(line)
                document.add(Field("user_id", data["user_id"], TextField.TYPE_STORED))
                document.add(Field("text", data["text"], TextField.TYPE_STORED))
                if data["urls"] != "None" and len(data["urls"]) != 0:
                    document.add(Field("title", data["urls"]["title"], TextField.TYPE_STORED))
                else:
                    document.add(Field("title", "None", TextField.TYPE_STORED))
                if data["hashtags"] != "None":
                    hashtags = ""
                    for tag in data['hashtags'].values():
                        hashtags += tag["text"] + ", "
                    document.add(Field("hashtags", hashtags[0:-2], TextField.TYPE_STORED))
                else:
                    document.add(Field("hashtags", "None", TextField.TYPE_STORED))
                if data["user_mentions"] != "None":
                    user_mentions = ""
                    for user in data['user_mentions'].values():
                        user_mentions += user + ", "
                    document.add(Field("user_mentions", user_mentions[0:-2], TextField.TYPE_STORED))
                else:
                    document.add(Field("user_mentions", "None", TextField.TYPE_STORED))
                if data["place"] != "None":
                    document.add(Field("place", data["place"]["place_name"], TextField.TYPE_STORED))
                    document.add(Field("cords_x", str(data["place"]["1"]["x"]), StoredField.TYPE))
                    document.add(Field("cords_y", str(data["place"]["1"]["y"]), StoredField.TYPE))
                else:
                    document.add(Field("place", "None", TextField.TYPE_STORED))
                    document.add(Field("cords_x", "None", StoredField.TYPE))
                    document.add(Field("cords_y", "None", StoredField.TYPE))
                document.add(Field("time", data["time"], StoredField.TYPE))
                text = data['tweet'].split(',')
                for i in range(len(text)):
                    if "'lang':" in text[i]:
                        lang = text[i].split(': ')[1]
                        document.add(Field("lang", lang[1:-1], StoredField.TYPE))
                        break
                index_writer.addDocument(document)
            f.close()
    index_writer.close()
class PyLucene:
    """PyLucene module API."""

    def __init__(self, startJVM=False):
        if startJVM:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.STORE_DIR = "index_dir"
        self.store = SimpleFSDirectory(File(self.STORE_DIR))
        tmp_analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = LimitTokenCountAnalyzer(tmp_analyzer, 10000)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(self.store, config)

    def close_store(self):
        self.store.close()

    def index_doc(self, doc_dict):
        """
        Index a doc to pylucene.
        Note: doc_id is a string, not an integer.
        """
        doc = Document()
        doc.add(Field("doc_id", doc_dict["doc_id"], TextField.TYPE_STORED))
        doc.add(Field("general_info", doc_dict["general_info"], TextField.TYPE_NOT_STORED))
        doc.add(Field("subject", doc_dict["subject"], TextField.TYPE_NOT_STORED))
        doc.add(Field("source", doc_dict["source"], TextField.TYPE_NOT_STORED))
        doc.add(Field("initial_date", doc_dict["initial_date"], TextField.TYPE_NOT_STORED))
        doc.add(Field("final_date", doc_dict["final_date"], TextField.TYPE_NOT_STORED))
        body_text = doc_dict["content"]
        body_reader = StringReader(body_text)
        doc.add(Field("content", body_reader))
        self.writer.addDocument(doc)

        ticker = Ticker()
        print 'commit index',
        threading.Thread(target=ticker.run).start()
        self.writer.commit()
        ticker.tick = False
        print 'done'

    def search_docs(self, value, field="general_info"):
        MAX_RESULTS = 1000
        searcher = IndexSearcher(DirectoryReader.open(self.store))
        query = QueryParser(Version.LUCENE_CURRENT, field, self.analyzer).parse(value)
        topDocs = searcher.search(query, MAX_RESULTS)
        return [searcher.doc(hit.doc) for hit in topDocs.scoreDocs]
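
# A minimal usage sketch for the PyLucene wrapper above, assuming the JVM has
# not been started yet and that its helper dependencies (Ticker, the Lucene
# imports) are available; the document values below are made up for
# illustration only.
engine = PyLucene(startJVM=True)
engine.index_doc({
    "doc_id": "1",
    "general_info": "sample general info",
    "subject": "example subject",
    "source": "example source",
    "initial_date": "2015-01-01",
    "final_date": "2015-12-31",
    "content": "body text of the sample document",
})
print engine.search_docs("sample")
engine.close_store()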
class Indexer:
    """ Indexer Class """

    (NAME, CONTENT, DATE, URL, TAGS, TIMESTAMP) = ("name", "content", "date",
                                                   "url", "tags", "timestamp")

    def __init__(self, indexDir="", debug=False, verbose=False):
        """
        :Parameters:
        - `indexDir`: Path where the Index will be saved. (Str)
        - `debug`: Create the Index in RAM Memory (indexDir will be ignored). (Boolean)
        - `verbose`: Provide additional information about the initialization process. (Boolean)
        """
        self.__verbose = verbose
        if indexDir != "":
            INDEX_DIR = indexDir
        else:
            INDEX_DIR = os.path.dirname(os.path.realpath(__file__)) + "/luceneIndex"

        if not os.path.exists(INDEX_DIR):
            os.makedirs(INDEX_DIR)
            self.__boAppend = False
        else:
            self.__boAppend = True

        # Initialize lucene and JVM
        lucene.initVM()

        # Get index storage
        if debug:
            # Store the index in memory
            self.__indexDir = RAMDirectory()
            self.__boAppend = False
            INDEX_DIR = "RAM Memory"
        else:
            # Store an index on disk
            self.__indexDir = SimpleFSDirectory(Paths.get(INDEX_DIR))

        # Create Content FieldType
        self.__contentType = FieldType()
        self.__contentType.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
        self.__contentType.setTokenized(True)
        self.__contentType.setStored(True)
        self.__contentType.setStoreTermVectors(True)
        self.__contentType.setStoreTermVectorPositions(True)
        self.__contentType.freeze()

        # Get the Analyzer
        self.__analyzer = StandardAnalyzer(StandardAnalyzer.ENGLISH_STOP_WORDS_SET)

        # Print Indexer Information
        print("Lucene version is: ", lucene.VERSION)
        print("Index Directory: ", INDEX_DIR)

    def __del__(self):
        self.__indexDir.close()

    ##################################################
    # Private Methods
    ##################################################
    @staticmethod
    def __getTimestamp(dateTime):
        """
        Converts the document's date to an integer timestamp

        :Parameters:
        - `dateTime`: Document's date (Str)

        :Returns:
        - Date timestamp (Int)
        """
        tm = time.strptime(dateTime, '%Y-%m-%dT%H:%M:%SZ')
        sTime = "{0:0>4}{1:0>2}{2:0>2}{3:0>2}{4:0>2}{5:0>2}".format(
            tm.tm_year, tm.tm_mon, tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec)
        return int(sTime)

    @staticmethod
    def __getDateTime(timeStamp):
        """
        Converts the document's timestamp to date

        :Parameters:
        - `timeStamp`: Document's timestamp

        :Returns:
        - Date (Str)
        """
        date = datetime.datetime(year=int(timeStamp[0:4]),
                                 month=int(timeStamp[4:6]),
                                 day=int(timeStamp[6:8]),
                                 hour=int(timeStamp[8:10]),
                                 minute=int(timeStamp[10:12]),
                                 second=int(timeStamp[12:14]))
        return date.strftime('%Y-%m-%d %H:%M:%S')

    @staticmethod
    def __qualifyTags(tags):
        """
        Creates the qualify string for tags

        :Parameters:
        - `tags`: List of document's tags

        :Return:
        - Qualify Tags (Str)
        """
        sTags = ""
        for tag in tags:
            sTags += tag + '|'
        return sTags[:-1]

    @staticmethod
    def __scatterMatrix(numDocs, freqMtx):
        print("Scattering Frequency Matrix...")
        pB = ProgressBar(len(freqMtx), prefix='Progress:')
        matrix = []
        innerMatrix = ['Term']

        # Generate Document Columns
        for docIdx in range(numDocs):
            innerMatrix.append("D{0:0>4}".format(docIdx))
        matrix.append(innerMatrix)

        # Generate Word Rows and Columns
        for word in sorted(freqMtx):
            innerMatrix = []
            innerMatrix.append(word)
            for docIdx in range(numDocs):
                try:
                    termCount = round(freqMtx[word][str(docIdx)], 3)
                    innerMatrix.append(termCount)
                except KeyError:
                    innerMatrix.append(0)
            matrix.append(innerMatrix)
            pB.updateProgress()
        return matrix

    @staticmethod
    def __saveMatrix(numDocs, freqMtx):
        pathMatrix = os.path.dirname(os.path.realpath(__file__)) + "/freqMtx.txt"
        fMatrix = open(pathMatrix, 'w')
        print("Saving Frequency Matrix File: ", pathMatrix)
        pB = ProgressBar(len(freqMtx), prefix='Progress:')

        # File Generation Start
        print("+========= Frequency Matrix =========+", file=fMatrix)
        print("%20s" % (' '), end=' ', file=fMatrix)
        for docIdx in range(numDocs):
            print("D{0:0>4}".format(docIdx), end=' ', file=fMatrix)
        print(file=fMatrix)
        for word in sorted(freqMtx):
            print("%20s" % (word), end=' ', file=fMatrix)
            for docIdx in range(numDocs):
                try:
                    termCount = freqMtx[word][str(docIdx)]
                    print("%02.03f" % (termCount), end=' ', file=fMatrix)
                except KeyError:
                    print(" 0 ", end=' ', file=fMatrix)
            print(file=fMatrix)
            pB.updateProgress()

        # Close File
        fMatrix.close()

    def __stemString(self, stringToStem):
        stemmedTerms = []
        tknStream = self.__analyzer.tokenStream('STEM', stringToStem)
        stemmed = SnowballFilter(tknStream, "English")
        stemmed.reset()
        while stemmed.incrementToken():
            stemmedTerms.append(
                stemmed.getAttribute(CharTermAttribute.class_).toString())
        tknStream.close()
        return stemmedTerms

    @staticmethod
    def __normalize(qVector, freqMtx):
        for term in qVector:
            for docId in freqMtx:
                if (term in freqMtx[docId]) and (freqMtx[docId][term] > qVector[term]):
                    qVector[term] = freqMtx[docId][term]

    @staticmethod
    def __dotProduct(aVector, bVector):
        """
        Calculate Dot Product

        :Parameters:
        - `aVector`: A Vector. (Dict)
        - `bVector`: B Vector. (Dict)

        :Returns:
        - Dot Product. (Int)
        """
        dotProduct = 0
        for term in aVector:
            if term in bVector:
                product = aVector[term] * bVector[term]
                dotProduct += product
        return dotProduct

    @staticmethod
    def __magnitude(vector):
        """
        Calculate Vector Magnitude

        :Parameters:
        - `vector`: Query Vector. (Dict)

        :Returns:
        - Vector Magnitude. (Int)
        """
        # Magnitude of the vector is the square root of the dot product
        # of the vector with itself.
        vectorMagnitude = Indexer.__dotProduct(vector, vector)
        vectorMagnitude = math.sqrt(vectorMagnitude)
        return vectorMagnitude

    ##################################################
    # Public Methods
    ##################################################
    def IndexDocs(self, documents):
        """
        Index documents under the directory

        :Parameters:
        - `documents`: Documents to be indexed (List)
        """
        # Get the Writer Configuration
        writerConfig = IndexWriterConfig(self.__analyzer)
        # Get index writer
        writer = IndexWriter(self.__indexDir, writerConfig)

        for document in documents:
            # Create a document that will be added to the index
            doc = Document()
            # Add fields to this document
            doc.add(TextField(Indexer.NAME, document['name'], Field.Store.YES))
            doc.add(Field(Indexer.CONTENT, document['content'], self.__contentType))
            doc.add(StringField(Indexer.DATE, document['date'], Field.Store.YES))
            doc.add(StringField(Indexer.URL, document['url'], Field.Store.YES))
            doc.add(TextField(Indexer.TAGS, self.__qualifyTags(document['tags']),
                              Field.Store.YES))
            doc.add(LongPoint(Indexer.TIMESTAMP,
                              self.__getTimestamp(document['date'])))

            # Add or update the document to the index
            if not self.__boAppend:
                # New index, so we just add the document
                # (no old document can be there):
                if self.__verbose:
                    print("Adding " + document['name'])
                writer.addDocument(doc)
            else:
                # Existing index (an old copy of this document may have been
                # indexed) so we use updateDocument instead to replace the old
                # one matching the exact path, if present:
                if self.__verbose:
                    print("Updating " + document['name'])
                writer.updateDocument(Term(Indexer.NAME, document['name']), doc)

        # Print index information and close writer
        print("Indexed %d documents (%d docs in index)" %
              (len(documents), writer.numDocs()))
        writer.close()

    def Search(self, query, field=NAME, maxResult=1000):
        """
        Search for a document in the Lucene index

        :Parameters:
        - `query`: Request to be made to the Index (Str).
        - `field`: Field to be consulted by the query (NAME, CONTENT, DATE, URL, TAGS).
        - `maxResult`: Maximum number of results.
        """
        # Get the Index Directory
        reader = DirectoryReader.open(self.__indexDir)
        searcher = IndexSearcher(reader)
        # Create a query
        queryParser = QueryParser(field, self.__analyzer).parse(query)
        # Do a search
        hits = searcher.search(queryParser, maxResult)
        print("Found %d document(s) that matched query '%s':" %
              (hits.totalHits, queryParser))
        for hit in hits.scoreDocs:
            doc = searcher.doc(hit.doc)
            print("Document Nº: %d - Score: %.5f" % (hit.doc, hit.score))
            print("Name: " + doc.get('name'))
            print("Tags: " + doc.get('tags') + "\n")
        reader.close()

    def StemDocument(self, docIdx):
        """
        Return an array of the document's stemmed terms

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx).get(Indexer.CONTENT)
        reader.close()
        return self.__stemString(doc)

    def FreqMatrix(self, scattered=False, byTerms=True, saveMtx=False):
        """
        Generates a Frequency Matrix of the current Index

        :Parameters:
        - `saveMtx`: Save the Frequency Matrix to a .txt file. (Boolean)
        """
        freqMtx = {}  # Terms - DocumentID Matrix
        reader = DirectoryReader.open(self.__indexDir)
        numDocs = reader.numDocs()

        print("Generating Frequency Matrix...")
        pB = ProgressBar(numDocs - 1, prefix='Progress:')
        for docIdx in range(numDocs):
            termItr = self.StemDocument(docIdx)
            termSize = len(termItr)
            docStr = '{0}'.format(docIdx)
            termDict = {}
            for termText in termItr:
                if byTerms:
                    # Check if the term exists
                    if termText in freqMtx:
                        # Check if the document exists
                        if docStr in freqMtx[termText]:
                            termCount = int(math.ceil(
                                ((freqMtx[termText][docStr] * termSize) / 100)))
                            freqMtx[termText].update(
                                {docStr: ((termCount + 1) / termSize) * 100})
                        else:
                            freqMtx[termText].update(
                                {docStr: (1 / termSize) * 100})
                    else:
                        termIdx = {termText: {docStr: (1 / termSize) * 100}}
                        freqMtx.update(termIdx)
                else:
                    # Check if the term exists
                    termText = termText.replace('.', '_')
                    if termText in termDict:
                        termCount = int(math.ceil(
                            (termDict[termText] * termSize) / 100))
                        termDict[termText] = ((termCount + 1) / termSize) * 100
                    else:
                        termIdx = {termText: (1 / termSize) * 100}
                        termDict.update(termIdx)
            if not byTerms:
                freqMtx.update({docStr: termDict})
            pB.updateProgress()

        if saveMtx and byTerms:
            self.__saveMatrix(numDocs, freqMtx)
        if scattered and byTerms:
            freqMtx = self.__scatterMatrix(numDocs, freqMtx)

        # Close IndexReader
        reader.close()
        return freqMtx

    def GetSimilarity(self, query, freqMtx):
        """
        Cosine Similarity
        """
        qVector = {}
        qList = self.__stemString(query)
        for stem in qList:
            qVector.update({stem: 0})
        self.__normalize(qVector, freqMtx)

        qList = []
        # Get similarity between query and doc[n]
        for docIdx, dVector in freqMtx.items():
            dP = self.__dotProduct(qVector, dVector)
            qM = self.__magnitude(qVector)
            dM = self.__magnitude(dVector)
            cosSimilarity = dP / (qM * dM)
            qList.append((docIdx, cosSimilarity))
        return sorted(qList, key=lambda similarity: similarity[1], reverse=True)

    def AnalyzeDocument(self, docIdx):
        """
        Generates a list of (entity, relation, entity) tuples as its output.

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        """
        gpeList = {}
        geolocator = Geocode()
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)

        # Load NLTK Data
        nltkPath = os.path.dirname(os.path.realpath(__file__)) + '/../tools/nltk_data'
        nltk.data.path.append(nltkPath)

        # Named Entity Recognition
        content = doc.get(Indexer.CONTENT)
        sentences = nltk.sent_tokenize(content)

        # ProgressBar
        print("Analyzing Document {0}".format(docIdx))
        pB = ProgressBar(len(sentences), prefix='Progress:')

        # Loop over each sentence and tokenize it separately
        for sentence in sentences:
            ner = nltk.word_tokenize(sentence)
            ner = nltk.pos_tag(ner)
            ner = nltk.ne_chunk(ner)
            # Get all the Geo-Political Entities
            for subtrees in list(ner.subtrees(
                    filter=lambda subtree: subtree.label() == 'GPE')):
                entityName = ' '.join([child[0] for child in subtrees])
                if entityName not in gpeList:
                    location = geolocator.GetGPE(entityName)
                    if location:
                        gpeList.update(location)
            pB.updateProgress()

        gpeList = geolocator.GetFeatureCollection(gpeList)
        return gpeList

    def GetDocField(self, docIdx, field=CONTENT):
        """
        Get the document's field

        :Parameters:
        - `docIdx`: Document's index ID (Int).
        - `field`: Field to retrieve (Str).

        :Returns:
        - Document's field. (Str)
        """
        reader = DirectoryReader.open(self.__indexDir)
        doc = reader.document(docIdx)
        content = doc.get(field)
        reader.close()
        return content
import sys
import lucene

from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.index import IndexReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
from org.apache.lucene.index import DirectoryReader

if __name__ == "__main__":
    lucene.initVM()
    analyzer = StandardAnalyzer()
    path = Paths.get('index')
    indexDir = SimpleFSDirectory(path)
    searcher = IndexSearcher(DirectoryReader.open(indexDir))

    query = QueryParser("text", analyzer).parse("certificate")
    MAX = 1000
    hits = searcher.search(query, MAX)

    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("id")
def query_index(query, hit_logs_for_each, score_logs_for_each):
    ### 1_Query Alternation
    user_code_query = Generator(query)

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 2_Finding 3 Answer Snippets using the User Query (refined)
    answers = SnippetSearcher(searcher, user_code_query)
    answer_ids = answers.more_like_this(20, query=user_code_query)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    ### 3_Finding the Associated Questions
    question_ids = answers.find_question_ids(answer_ids)

    # Log : Answer - Question count
    if question_ids:
        hit_logs_for_each += str(len(question_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    getDoc = GettingQuestionDocs(searcher)
    # Trim to 7 items here so that at least 7 questions are kept in rank order.
    item_docs = getDoc.search(question_ids, 20)[0:7]

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Question ItemDoc count
    if item_docs:
        hit_logs_for_each += str(len(item_docs)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'questionIndex'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 4_Finding 3 Similar Questions per Question (3 X 3)
    similar_questions = []
    question = SimilarQsSearcher(searcher)

    # Log : Similar Question count for each Question ItemDoc
    i = 1
    if item_docs:
        for item_doc in item_docs:
            # Find 7 similar questions for each question.
            similar_question = question.more_like_this2(item_doc, 7)
            if similar_question:
                hit_logs_for_each += str(len(similar_question)) + '\t'
            else:
                hit_logs_for_each += ('0' + '\t')
            similar_questions += similar_question
            i += 1
    else:
        # 7 empty slots
        hit_logs_for_each += ('0' + '\t' + '0' + '\t' + '0' + '\t' + '0' + '\t' +
                              '0' + '\t' + '0' + '\t' + '0' + '\t')

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Similar Question result count
    if similar_questions:
        hit_logs_for_each += str(len(similar_questions)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    ### 5_Finding Associated Answers for each Question (9 - 9)
    answer_ids = find_answer_ids(similar_questions)

    # Log : Question - Answer count
    if answer_ids:
        hit_logs_for_each += str(len(answer_ids)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'stackoverflow'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    ### 6_Getting Answer Docs for the Final Query
    getDoc = GettingAnswerDocs(searcher)
    answer_docs = getDoc.search(answer_ids)

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    # Log : Answer Docs count
    if answer_docs:
        hit_logs_for_each += str(len(answer_docs)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    directory = SimpleFSDirectory(File(INDICES_PATH + 'bigclonebench_1'))
    searchermgr = SearcherManager(directory, SearcherFactory())
    searchermgr.maybeRefresh()
    searcher = searchermgr.acquire()

    bench_results = []
    benchsearcher = BenchSearcher(searcher)  # BigCloneBench

    # Exceptional
    ### 7_Appending for the user query results
    bench_result, score_logs_for_each = benchsearcher.more_like_this2(
        100, answer_docs[0], score_logs_for_each, user_code_query, 1)
    if bench_result:
        hit_logs_for_each += str(len(bench_result)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')
    bench_results += bench_result

    ### 8_Querying for the Final Results
    # Log : Bench_result for each query
    for answer_doc in answer_docs:
        bench_result, score_logs_for_each = benchsearcher.more_like_this2(
            100, answer_doc, score_logs_for_each, user_code_query, 0)
        # , user_query=user_code_query)
        if bench_result:
            hit_logs_for_each += str(len(bench_result)) + '\t'
        else:
            hit_logs_for_each += ('0' + '\t')
        bench_results += bench_result

    searchermgr.release(searcher)
    searchermgr.close()
    searcher = None
    directory.close()
    directory = None

    if len(answer_docs) < 49:
        for a in range(49 - len(answer_docs)):
            hit_logs_for_each += ('0' + '\t')

    if bench_results:
        hit_logs_for_each += str(len(bench_results)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    sorted_bench_results = sorted(bench_results, key=attrgetter('score'), reverse=True)
    print 'Search Count : ', len(sorted_bench_results)

    recommended = recommend(sorted_bench_results)
    print 'Final Count : ', len(recommended)
    if bench_results:
        hit_logs_for_each += str(len(recommended)) + '\t'
    else:
        hit_logs_for_each += ('0' + '\t')

    return recommended, hit_logs_for_each, score_logs_for_each
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        query = QueryParser("contents", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name"), (
                'score: %f' % (scoreDoc.score))


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = EnglishAnalyzer()
    run(searcher, analyzer)
    del searcher
#!/usr/bin/env python

import web
from web import form
import urllib2
import os
import sys
import lucene

from java.io import File
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.util import Version
from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause

STORE_DIR1 = "index1"
STORE_DIR2 = "index2"

vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
# base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
directory1 = SimpleFSDirectory(File(STORE_DIR1))
searcher1 = IndexSearcher(DirectoryReader.open(directory1))
directory2 = SimpleFSDirectory(File(STORE_DIR2))
searcher2 = IndexSearcher(DirectoryReader.open(directory2))
analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
total_list = []
def GET(self, query):
    data_input = web.input()
    page = 0
    if "page" in data_input:
        page = int(data_input["page"])
    render = web.template.render('templates/')
    anses = []
    num_pages = 0
    if use_elasticsearch:
        # importing libraries for Elasticsearch
        from elasticsearch import Elasticsearch
        from elasticsearch_dsl import Search, document, field, connections, Q
        from elasticsearch_dsl.connections import connections
        from booktype import Book

        es = Elasticsearch()
        es.indices.create(index='book-index', ignore=[400, 404])
        connections.create_connection(hosts=['localhost'], timeout=20)
        connections.add_connection('book', es)
        # print(connections.get_connection().cluster.health())

        s = Search(using=es, index='book-index').doc_type('book').query(
            Q('match', title=query.strip()) |
            Q('match', description=query.strip()) |
            Q("match", userreviews_userReview=query.strip()))
        # Note: the slice must be applied before s.execute().
        s = s[page * 10:page * 10 + 10]
        response = s.execute()
        # print 'total number of hits: ', response.hits.total

        num_pages = (response.hits.total / 10) + 1
        for res in response:
            authors = zip(res.authors_name, res.authors_url)
            anses.append({
                'title': res.title,
                'description': res.description.encode('utf-8'),
                'url': res.url,
                'cover': res.cover,
                'authors': authors
            })
    else:
        # importing libraries for Lucene
        import lucene
        from java.io import File
        from org.apache.lucene.index import DirectoryReader, Term
        from org.apache.lucene.queryparser.classic import QueryParser
        from org.apache.lucene.store import SimpleFSDirectory
        from org.apache.lucene.search import IndexSearcher, BooleanClause, BooleanQuery, TermQuery
        from org.apache.lucene.util import Version
        from org.apache.lucene.analysis.standard import StandardAnalyzer
        import os

        # fields
        title_field = 'title'
        description_field = 'description'
        cover_field = 'cover'
        authors_name_field = 'authors_name'
        authors_url_field = 'authors_url'
        url_field = 'url'

        index_folder = '.'
        index_name = 'lucene.index'
        index_path = os.path.join(index_folder, index_name)

        lucene.initVM()
        version = Version.LUCENE_CURRENT
        directory = SimpleFSDirectory(File(index_path))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        analyzer = StandardAnalyzer(version)

        title_tq = TermQuery(Term(title_field, query))
        desc_tq = TermQuery(Term(description_field, query))
        query = BooleanQuery()
        query.add(BooleanClause(title_tq, BooleanClause.Occur.SHOULD))
        query.add(BooleanClause(desc_tq, BooleanClause.Occur.SHOULD))

        scoreDocs = searcher.search(query, 1000).scoreDocs
        num_pages = (len(scoreDocs) / 10) + 1
        for scoreDoc in scoreDocs[page * 10:page * 10 + 10]:
            doc = searcher.doc(scoreDoc.doc)
            authors = zip([doc.get(authors_name_field)], [doc.get(authors_url_field)])
            anses.append({
                'title': doc.get(title_field),
                'description': doc.get(description_field).encode('utf-8'),
                'url': doc.get(url_field),
                'cover': doc.get(cover_field),
                'authors': authors
            })
    return render.index(anses, query, num_pages)
def func(user_access):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()

    # ------------ #
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    # ------------ #

    tag = {}
    access = user_access.split()
    res = ''
    for i in access:
        b = i
        b = ''.join(b.split('/'))
        query = QueryParser(Version.LUCENE_CURRENT, "Tags", analyzer).parse(b)
        scoreDocs = searcher.search(query, 200).scoreDocs
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tags = doc.get("Tags")
            tag_list = tags.split()
            for j in tag_list:
                if j not in tag:
                    tag[j] = 1
                else:
                    tag[j] += 1

    tags_list = sorted(tag.items(), key=lambda item: item[1], reverse=True)
    for i in tags_list[:3]:
        command = i[0]
        if command == '':
            return
        command = ''.join(command.split('/'))
        query = QueryParser(Version.LUCENE_CURRENT, "Tags", analyzer).parse(command)
        scoreDocs = searcher.search(query, 200).scoreDocs
        tmp = {}
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            collect = doc.get("Likes")
            views = doc.get("Views")
            rate = float(collect) / float(views)
            tmp[doc.get("Page_num")] = rate
        res_list = sorted(tmp.items(), key=lambda item: item[1], reverse=True)
        count = 0
        for i in res_list:
            if i[0] not in res:
                res += i[0]
                res += ' '
                count += 1
            if count > 9:
                break

    tmp_list = res.split()
    res = ''
    for i in tmp_list:
        query = QueryParser(Version.LUCENE_CURRENT, "Page_num", analyzer).parse(i)
        scoreDocs = searcher.search(query, 1).scoreDocs
        doc = searcher.doc(scoreDocs[0].doc)
        ch = doc.get('Page_num') + ' '
        ch += 'data/' + doc.get('Page_num') + '.jpg' + ' '
        ch += doc.get('Page_link') + ' '
        ch += doc.get('Views') + ' '
        ch += doc.get('Likes') + ' '
        tmp_alt = doc.get('Img_alt')
        tmp_alt = '_'.join(tmp_alt.split())
        ch += tmp_alt
        res += ch
        res += ' '

    del searcher
    del analyzer
    return res
def run(self, writer=None, analyzer=None):
    if writer is None:
        writer = self.writer
    if analyzer is None:
        analyzer = self.analyzer

    searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory.open(File(self.store_dir))))
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print "Searching for:", command
        query = QueryParser(Version.LUCENE_43, "contents",
                            analyzer).parse(command)
        # We'll just show the top 10 matching documents for now
        scoreDocs = searcher.search(query, 10).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        # Highlight the matching text in red
        highlighter = Highlighter(
            SimpleHTMLFormatter('<b><font color="red">', '</font></b>'),
            QueryScorer(query))
        # Using NullFragmenter since we still want to see
        # the whole document
        highlighter.setTextFragmenter(NullFragmenter())

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tokenStream = analyzer.tokenStream("contents",
                                               StringReader(doc.get("contents")))
            # arg 3: the maximum number of fragments
            # arg 4: the separator used to intersperse the
            #        document fragments (typically "...")
            # arg 3 and 4 don't really matter with NullFragmenter
            result = highlighter.getBestFragments(tokenStream,
                                                  doc.get("contents"), 2, "...")
            if len(result) > 10:
                file_handler = open(self.hits_dir + '/' + doc.get("name"), 'w+')
                file_handler.write(result)

        # create hit fragments, if we want to show them
        # arg 1: fragment size
        highlighter.setTextFragmenter(SimpleFragmenter(200))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tokenStream = analyzer.tokenStream("contents",
                                               StringReader(doc.get("contents")))
            result = highlighter.getBestFragments(tokenStream,
                                                  doc.get("contents"), 2, "...")
            if len(result) > 10:
                file_handler = open(self.frags_dir + '/' + doc.get("name"), 'w+')
                file_handler.write(result)
def get_writer(index_dir):
    indexDir = SimpleFSDirectory(File(index_dir).toPath())
    writerConfig = IndexWriterConfig()
    print(f"Codec : {writerConfig.getCodec()}")
    writer = IndexWriter(indexDir, writerConfig)
    return writer
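
# A minimal sketch of how get_writer() above could be used; "index" as the
# directory name and the "contents" field are illustrative assumptions, and
# lucene.initVM() plus the Document/TextField/Field imports from
# org.apache.lucene.document are expected to be in place already.
writer = get_writer("index")
doc = Document()
doc.add(TextField("contents", "hello lucene", Field.Store.YES))
writer.addDocument(doc)
writer.close()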
def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    run(searcher, analyzer)
    del searcher
import lucene

from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.analysis.miscellaneous import LimitTokenCountAnalyzer
from org.apache.lucene.index import DirectoryReader, IndexReader, IndexWriterConfig
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search import BooleanQuery
from org.apache.lucene.search import BooleanClause
from org.apache.lucene.search.spell import SpellChecker
from org.apache.lucene.search.spell import LuceneDictionary
from org.apache.lucene.util import Version

vm_env = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
directory = SimpleFSDirectory(File("store"))
searcher = IndexSearcher(DirectoryReader.open(directory))

# create the spell-check index
spell_dic = SimpleFSDirectory(File("spellchecker"))
spellchecker = SpellChecker(spell_dic)
analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
# -*- coding: utf-8 -*-
"""
Build the index.
"""
import sys
import lucene

from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

if __name__ == "__main__":
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("index/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from sys.stdin..."
    for n, l in enumerate(sys.stdin):
        doc = Document()
        doc.add(Field("text", l, Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)
    print "Indexed %d lines from stdin (%d docs in index)" % (n, writer.numDocs())
    print "Closing index of %d docs..." % writer.numDocs()
    writer.close()
for o, a in options:
    if o == "--format":
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'


template = CustomTemplate(format)

fsDir = SimpleFSDirectory(Paths.get(indexDir))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

analyzer = StandardAnalyzer()
parser = QueryParser("keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))

start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print("Found %d document(s) (in %s) that matched query '%s':" %
          (len(scoreDocs), duration, query), file=sys.stderr)

for scoreDoc in scoreDocs:
#!/usr/bin/python

import sys, os
sys.path.append("../lib/lucene-core-3.6.2.jar")
sys.path.append("../lib/lucene-core-3.6.2-javadoc.jar")

from java.io import File
from java.util import Scanner
from org.apache.lucene.index import IndexReader, Term
from org.apache.lucene.store import SimpleFSDirectory
import pdb

if __name__ == "__main__":
    r = IndexReader.open(SimpleFSDirectory(File('../index')))
    print "... total number of documents in the index is " + str(r.maxDoc())

    t = r.terms()
    i = 0
    count_add = 0
    while t.next():
        i = i + 1
        if i > 100010:
            break
        if i > 100000:
            print "[" + str(i) + "]" + t.term().text()

    te = Term("contents", "brute")
    print "... number of documents with the word brute is : " + str(r.docFreq(te))
    td = r.termDocs(te)
def __init__(self, indexDir: str):
    index_dir = SimpleFSDirectory(Paths.get(indexDir))
    self._searcher = IndexSearcher(DirectoryReader.open(index_dir))
def openStore(self):
    return SimpleFSDirectory(Paths.get(self.STORE_DIR))
def __init__(self, id: str, indexDir: str):
    index_dir = SimpleFSDirectory(Paths.get(indexDir))
    self._searcher = IndexSearcher(DirectoryReader.open(index_dir))
    self._id = id
    self._resDict = {}
    self._strDict = ''
import sys
import lucene

from java.io import File
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, IntField, StringField, TextField
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version

LUCENE_TYPES = {'i': IntField, 's': StringField, 't': TextField}

if __name__ == "__main__":
    lucene.initVM()
    indexDir = SimpleFSDirectory(File("data/lucene_full_v1/"))
    writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
    writer = IndexWriter(indexDir, writerConfig)

    print "%d docs in index" % writer.numDocs()
    print "Reading lines from sys.stdin..."

    header = []
    for n, l in enumerate(sys.stdin):
        doc = Document()
        fields = l.rstrip().split("\t")
        # add one more field to the header field set, which will index the
        # concatenated set of all fields for general searches
        all_ = []
        if len(fields) < 1 or len(fields[0]) == 0:
            continue
        for (idx, field) in enumerate(fields):
            if n == 0:
def run(self, writer=None, analyzer=None):
    if writer is None:
        writer = self.writer
    if analyzer is None:
        analyzer = self.analyzer

    searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory.open(File(self.store_dir))))
    while True:
        print()
        print("Hit enter with no input to quit.")
        command = input("Query:")
        if command == '':
            return

        print("Searching for:", command)
        query = QueryParser(Version.LUCENE_43, "contents",
                            analyzer).parse(command)
        # We'll just show the top 10 matching documents for now
        scoreDocs = searcher.search(query, 10).scoreDocs
        print("%s total matching documents." % len(scoreDocs))

        # Highlight the matching text in red
        highlighter = Highlighter(
            SimpleHTMLFormatter('<b><font color="red">', '</font></b>'),
            QueryScorer(query))
        # Using NullFragmenter since we still want to see
        # the whole document
        highlighter.setTextFragmenter(NullFragmenter())

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tokenStream = analyzer.tokenStream(
                "contents", StringReader(doc.get("contents")))
            # arg 3: the maximum number of fragments
            # arg 4: the separator used to intersperse the
            #        document fragments (typically "...")
            # arg 3 and 4 don't really matter with NullFragmenter
            result = highlighter.getBestFragments(tokenStream,
                                                  doc.get("contents"), 2, "...")
            if len(result) > 10:
                file_handler = open(self.hits_dir + '/' + doc.get("name"), 'w+')
                file_handler.write(result)

        # create hit fragments, if we want to show them
        # arg 1: fragment size
        highlighter.setTextFragmenter(SimpleFragmenter(200))
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            tokenStream = analyzer.tokenStream(
                "contents", StringReader(doc.get("contents")))
            result = highlighter.getBestFragments(tokenStream,
                                                  doc.get("contents"), 2, "...")
            if len(result) > 10:
                file_handler = open(self.frags_dir + '/' + doc.get("name"), 'w+')
                file_handler.write(result)
default="tfidf", help="Similarity (in [tfidf, lm, bm25])") parser.add_argument('--reorder', type=str, nargs='?', default="no", help="Reordering (in [ups, normups])") parser.add_argument('--short', action='store_false', help="Don't show the body of comments") args = parser.parse_args() if args.sim in ['bm25']: similarity = BM25Similarity() elif args.sim in ['lm']: similarity = LMDirichletSimilarity() else: similarity = ClassicSimilarity() # Sample query storeDir = SimpleFSDirectory(Paths.get(args.index_dir)) searcher = IndexSearcher(DirectoryReader.open(storeDir)) if similarity is not None: searcher.setSimilarity(similarity) analyzer = StandardAnalyzer() run(searcher, analyzer, ndocs=args.ndocs, reordering=args.reorder, show_bodies=not args.short)
def __recs_query(self, positive_rated_document_list, scores, recs_number,
                 items_directory, candidate_list: List) -> pd.DataFrame:
    """
    Builds a query using the contents that the user liked. The terms relative to the
    contents that the user liked are boosted by the rating he/she gave. A filter clause
    is added to the query to consider only candidate items.

    Args:
        positive_rated_document_list: List of contents that the user liked
        scores: Ratings given by the user
        recs_number: How many items must be recommended. You can only specify the
            number, not a specific item for which to compute the prediction
        items_directory: Directory where the items are stored

    Returns:
        score_frame (pd.DataFrame): DataFrame containing the recommendations for the user
    """
    BooleanQuery.setMaxClauseCount(2000000)
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(Paths.get(items_directory))))
    if self.__classic_similarity:
        searcher.setSimilarity(ClassicSimilarity())

    field_list = searcher.doc(positive_rated_document_list[0]).getFields()
    user_fields = {}
    field_parsers = {}
    analyzer = SimpleAnalyzer()
    for field in field_list:
        if field.name() == 'content_id':
            continue
        user_fields[field.name()] = field.stringValue()
        field_parsers[field.name()] = QueryParser(field.name(), analyzer)

    positive_rated_document_list.remove(positive_rated_document_list[0])

    for _ in positive_rated_document_list:
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] += field.stringValue()

    logger.info("Building query")

    query_builder = BooleanQuery.Builder()
    for score in scores:
        for field_name in user_fields.keys():
            if field_name == 'content_id':
                continue
            field_parsers[field_name].setDefaultOperator(QueryParser.Operator.OR)

            field_query = field_parsers[field_name].escape(user_fields[field_name])
            field_query = field_parsers[field_name].parse(field_query)
            field_query = BoostQuery(field_query, score)
            query_builder.add(field_query, BooleanClause.Occur.SHOULD)

    if candidate_list is not None:
        id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                      for content_id in candidate_list)
        id_query = QueryParser("testo_libero", KeywordAnalyzer()).parse(id_query_string)
        query_builder.add(id_query, BooleanClause.Occur.MUST)

    query = query_builder.build()
    docs_to_search = len(positive_rated_document_list) + recs_number
    scoreDocs = searcher.search(query, docs_to_search).scoreDocs

    logger.info("Building score frame to return")

    recorded_items = 0
    columns = ['to_id', 'rating']
    score_frame = pd.DataFrame(columns=columns)
    for scoreDoc in scoreDocs:
        if recorded_items >= recs_number:
            break
        if scoreDoc.doc not in positive_rated_document_list:
            doc = searcher.doc(scoreDoc.doc)
            item_id = doc.getField("content_id").stringValue()
            recorded_items += 1

            score_frame = pd.concat([
                score_frame,
                pd.DataFrame.from_records([(item_id, scoreDoc.score)], columns=columns)
            ])

    return score_frame
def createSearcher(index_dir):
    reader = DirectoryReader.open(SimpleFSDirectory(File(index_dir).toPath()))
    searcher = IndexSearcher(reader)
    return searcher
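
# A minimal sketch of querying through createSearcher() above; the "index"
# path, the "contents" field and the query text are illustrative assumptions,
# and QueryParser/StandardAnalyzer are expected to be imported as in the
# surrounding snippets.
searcher = createSearcher("index")
query = QueryParser("contents", StandardAnalyzer()).parse("lucene")
for scoreDoc in searcher.search(query, 10).scoreDocs:
    print(searcher.doc(scoreDoc.doc))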
def update(collection_name, tofind, update, commit=False, add_field_if_not_exists=True):
    # As of now the update is implemented as: search, modify the data in the json
    # document, delete the old document and re-write it.
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        # setting writer configurations
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        writer = IndexWriter(direc, config)
    except:
        return 105

    no_of_documents_modified = 0

    # finding the document to update
    # Scope for making this more efficient
    def rewrite(data_string):
        data = json.loads(data_string)
        toupdate = json.loads(update)
        # primary_key_modified = False

        # delete the appropriate document
        query = BooleanQuery()
        for key in primary_keys_map[collection_name]:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        # print query

        # modify the values
        for key, value in toupdate.items():
            # if such a key is not present, we either add and update that key in data
            # or just ignore it (by default add_field_if_not_exists is True)
            if add_field_if_not_exists == False:
                if key in data.keys():
                    data[key] = value
            else:
                data[key] = value

        # this deletion statement has been intentionally added here:
        # the update only continues if the modified data's primary keys
        # do not already exist in the index
        primary_key_update = False
        for key in toupdate.keys():
            if key in primary_keys_map[INDEX_DIR]:
                primary_key_update = True
                break
        if primary_key_update == True:
            query_search = BooleanQuery()
            for key in primary_keys_map[INDEX_DIR]:
                temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(data[key])
                query_search.add(BooleanClause(temp, BooleanClause.Occur.MUST))
            hits = searcher.search(query_search, MAX_RESULTS).scoreDocs
            if len(hits) > 0:
                return 106
        writer.deleteDocuments(query)

        # add the newly modified document
        doc = Document()
        # index files wrt primary key
        for primary_key in primary_keys_map[collection_name]:
            try:
                field = Field(primary_key, data[primary_key], Field.Store.NO,
                              Field.Index.NOT_ANALYZED)
                doc.add(field)
            except:
                primary_keys_map.pop(collection_name)
                return 101
        # compress data using snappy if compression is on
        if to_be_compressed_map[collection_name] == True:
            temp = json.dumps(data)
            data_string = base64.b64encode(snappy.compress(temp))
        else:
            temp = json.dumps(data)
            data_string = base64.b64encode(temp)
        field = Field("$DATA$", data_string, Field.Store.YES, Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)

    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(
                tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            # non primary key filtering (without having to load all the primary key
            # filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    return 106
    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            # non primary key filtering (without having to load all the primary key
            # filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    if rewrite(data) != 106:
                        no_of_documents_modified += 1
                    else:
                        return 106
            else:
                if rewrite(data) != 106:
                    no_of_documents_modified += 1
                else:
                    return 106

    ireader.close()
    if commit == True:
        writer.commit()
    writer.close()
    return str(no_of_documents_modified) + " have been modified"
def store(collection_name, data, commit=False):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT

    print "started indexing input data......"

    # extracting values
    try:
        contents = json.loads(data)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # checking for existence of a record with the same primary_key set
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
        query = BooleanQuery()
        for key in primary_keys_map[INDEX_DIR]:
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(contents[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        if len(hits) > 0:
            return 106
    except:
        pass

    # setting writer configurations
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(direc, config)

    # fix this later.....FieldType not defined
    # field_type = FieldType()
    # field_type.setIndexed(True)
    # field_type.setStored(False)
    # field_type.setTokenized(False)

    try:
        doc = Document()
        # index files wrt primary key
        for primary_key in primary_keys_map[collection_name]:
            try:
                field = Field(primary_key, contents[primary_key], Field.Store.NO,
                              Field.Index.NOT_ANALYZED)
                doc.add(field)
            except:
                primary_keys_map.pop(collection_name)
                return 101
        # compress data using snappy if compression is on
        if to_be_compressed_map[collection_name] == True:
            # print "here"
            # data = data.encode('utf-8')
            data = base64.b64encode(snappy.compress(data))
            # print data
        else:
            data = base64.b64encode(data)
        field = Field("$DATA$", data, Field.Store.YES, Field.Index.ANALYZED)
        doc.add(field)
        writer.addDocument(doc)
        if commit == True:
            writer.commit()
        writer.close()
        return 000
    except:
        return 102
def search(collection_name, tofind):
    if collection_name != "DEFAULT":
        INDEX_DIR = collection_name
    else:
        INDEX_DIR = INDEX_DIR_DEFAULT
    try:
        tofind_keyvalue_pairs = json.loads(tofind)
    except:
        return 100

    direc = SimpleFSDirectory(File(INDEX_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    try:
        ireader = IndexReader.open(direc)
        searcher = IndexSearcher(ireader)
    except:
        return 105

    # initializing return list
    return_list = []
    # check_list = []
    tofind_primary_keyvalue_pairs = {}
    tofind_nonprimary_keyvalue_pairs = {}

    # separating out primary and non_primary keys
    for key in tofind_keyvalue_pairs.keys():
        if key in primary_keys_map[collection_name]:
            tofind_primary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]
        else:
            tofind_nonprimary_keyvalue_pairs[key] = tofind_keyvalue_pairs[key]

    # filtering documents
    if len(tofind_primary_keyvalue_pairs) > 0:
        query = BooleanQuery()
        for key in tofind_primary_keyvalue_pairs.keys():
            temp = QueryParser(Version.LUCENE_CURRENT, key, analyzer).parse(
                tofind_primary_keyvalue_pairs[key])
            query.add(BooleanClause(temp, BooleanClause.Occur.MUST))
        hits = searcher.search(query, MAX_RESULTS).scoreDocs
        for hit in hits:
            doc = searcher.doc(hit.doc)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            # non primary key filtering (without having to load all the primary key
            # filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)
    else:
        for i in range(0, ireader.numDocs()):
            doc = searcher.doc(i)
            if to_be_compressed_map[collection_name] == True:
                temp = doc.get("$DATA$")
                data = snappy.uncompress(base64.b64decode(temp))
            else:
                temp = doc.get("$DATA$")
                data = base64.b64decode(temp)

            # non primary key filtering (without having to load all the primary key
            # filtered values into main memory!)
            if len(tofind_nonprimary_keyvalue_pairs) > 0:
                entry = json.loads(data)
                satisfied = True
                for key in tofind_nonprimary_keyvalue_pairs.keys():
                    if entry.get(key) != tofind_nonprimary_keyvalue_pairs[key]:
                        satisfied = False
                        break
                if satisfied == True:
                    return_list.append(data)
            else:
                return_list.append(data)

    ireader.close()
    if len(return_list) == 0:
        return None
    else:
        return return_list
        html_clean = html_clean + ' ' + token.encode('ascii', 'ignore')
    return html_clean


def clean_countries_dict(dictionary_object):
    cleaned = []
    for country in dictionary_object:
        country_cleaned = [country[0], clean_html(country[1]),
                           country[2], clean_html(country[3])]
        cleaned.append(country_cleaned)
    return cleaned


cleaned_dictionary = clean_countries_dict(merge_country_city_text(countries_dict))

# creating the index
index_path = File(sys.argv[1])
analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
index = SimpleFSDirectory(index_path)

# populating the index
config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
writer = IndexWriter(index, config)


def create_index():
    for country in cleaned_dictionary:
        doc = Document()
        doc.add(Field("country", country[0], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("country_html", country[1], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital", country[2], Field.Store.YES, Field.Index.ANALYZED))
        doc.add(Field("capital_html", country[3], Field.Store.YES, Field.Index.ANALYZED))
        writer.addDocument(doc)


create_index()
using the gutenberg corpus that comes with the NLTK package
"""

# In[3]:

import lucene
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field
from org.apache.lucene.index import IndexWriter, IndexWriterConfig
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.util import Version
import sys
from java.io import File
import re

lucene.initVM()
index_dir = SimpleFSDirectory(File("index/"))
writerConfig = IndexWriterConfig(Version.LUCENE_4_10_1, StandardAnalyzer())
writer = IndexWriter(index_dir, writerConfig)


# In[56]:

pattern = r"\[([A-Za-z0-9_].*)\]"


# In[26]:

f = open('gutenberg/austen-emma.txt')


# In[76]: