def testSlurp(self):

    fsDirReader = IndexReader.open(self.dir, True)
    self.assertEqual(len(self.keywords), fsDirReader.maxDoc())
    self.assertEqual(len(self.keywords), fsDirReader.numDocs())

    ramDir = RAMDirectory(self.dir)
    ramDirReader = IndexReader.open(ramDir, True)
    self.assertEqual(fsDirReader.maxDoc(), ramDirReader.maxDoc())
    self.assertEqual(fsDirReader.numDocs(), ramDirReader.numDocs())

    fsDirReader.close()
    ramDir.close()

def ExportIndex(b_print=False, b_write_file=False, b_filter=True):
    _dict = ReadConfig()
    initVM()
    try:
        if b_write_file:
            output_file = _dict['resultDir'] + '/' + sys.argv[1] + '.xls'
            _fw = open(output_file, 'w')
        directory = SimpleFSDirectory(File(_dict['indexDir']))
        ireader = IndexReader.open(directory)

        # Enum all the terms
        all_terms = ireader.terms()
        word_dict = {}
        _stopword_set = ImportStopword()
        # SetPrint(_stopword_set)
        while all_terms.next():
            term_elem = all_terms.term()
            if term_elem.field() == sys.argv[1]:
                _temp = term_elem.text().rstrip()
                word_dict[_temp] = all_terms.docFreq()

        if b_filter:
            StopwordFilter(word_dict, _stopword_set)
        if b_print:
            DictPrint(word_dict)
        if b_write_file:
            DictPrint(word_dict, out_file=_fw)
            _fw.close()

        all_terms.close()
        return word_dict
    except Exception, e:
        print "Failed: ", e
        traceback.print_exc(file=sys.stdout)

def __init__(self, location):
    lucene.initVM()
    directory = SimpleFSDirectory(File(location))
    self.reader = IndexReader.open(directory, True)
    self.searcher = IndexSearcher(self.reader)
    self.query_parser = QueryParser(Version.LUCENE_CURRENT, "text",
                                    WhitespaceAnalyzer())

def retrieveDocs(q):
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_30)
    reader = IndexReader.open(SimpleFSDirectory(File("index/")))
    searcher = IndexSearcher(reader)

    query = QueryParser(Version.LUCENE_30, "text", analyzer).parse(q)
    MAX = 1000
    hits = searcher.search(query, MAX)
    nonDiverse = []
    docsToScores = {}
    # create a list of html files with relevant websites
    rQ = []
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)
    for hit in hits.scoreDocs:
        print hit.score, hit.doc, hit.toString()
        doc = searcher.doc(hit.doc)
        print doc.get("text").encode("utf-8")
        #print(new_urls[str(hit.doc)])
        result = str(hit.score) + " " + str(hit.doc) + " " + hit.toString()
        if len(nonDiverse) < 10:
            nonDiverse.append(new_urls[str(hit.doc)])
        # find the document that corresponds to the html website and
        # append to a list for min distance
        website = new_urls[str(hit.doc)]
        # html_files numbers of the hit websites added to rQ
        rQ.append(inv_map[website])
        docsToScores[int(inv_map[website])] = hit.score
        print(inv_map[website])
    return docsToScores, rQ, nonDiverse

def setUp(self):
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()

    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)

    self.quick = SpanTermQuery(Term("f", "quick"))
    self.brown = SpanTermQuery(Term("f", "brown"))
    self.red = SpanTermQuery(Term("f", "red"))
    self.fox = SpanTermQuery(Term("f", "fox"))
    self.lazy = SpanTermQuery(Term("f", "lazy"))
    self.sleepy = SpanTermQuery(Term("f", "sleepy"))
    self.dog = SpanTermQuery(Term("f", "dog"))
    self.cat = SpanTermQuery(Term("f", "cat"))

def testUpdate(self):

    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.optimize()
    writer.close()

    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))

def openStore(self, store_dir):
    """Open a lucene store."""
    if self._connected:
        self.closeStore()
    if store_dir == 'dummy':
        directory = RAMDirectory()
        self.initDummyStore(directory)
        store_path = store_dir
    else:
        store_path = os.path.abspath(store_dir)
        try:
            directory = SimpleFSDirectory(File(store_path))  # TODO , False)
        except JavaError:
            print "Error: %s Not found." % store_path
            return
    try:
        self.searcher = IndexSearcher(directory)
    except JavaError:
        print "Error: '%s' is not a valid lucene store." % store_path
        return
    print 'Opening store: %s' % store_path
    self.directory = directory
    self.store_path = store_path
    # TODO - TaxonomyReader??
    self.index_reader = IndexReader.open(directory)
    self.fields = self.getFieldNames()
    self.fields.sort()
    self._connected = True

def c():
    from apps.wantown import dao
    from apps.wantown.models import Entry, Category
    entries = Entry.objects.all()
    from dot.matrixmapper import MatrixMapper
    STOP_WORDS = [u'a', u'an', u'and', u'are', u'as', u'at', u'be', u'but', u'by',
                  u'for', u'if', u'in', u'into', u'is', u'it',
                  u'no', u'not', u'of', u'on', u'or', u'such',
                  u'that', u'the', u'their', u'then', u'there', u'these',
                  u'they', u'this', u'to', u'was', u'will', u'with',
                  u'you', u'your', u'we', u'he', u'him', u'how', u'where',
                  # add by myself
                  u'i', u'been', u'about',
                  u'们', u'这', u'那', u'的', u'己', u'个', u'我', u'你', u'很',
                  u'了', u'是', u'以', u'过', u'一', u'么', u'没', u'在']
    mapper = MatrixMapper(STOP_WORDS)
    ireader = IndexReader.open(STORE_DIR)
    for i in range(len(entries)):
        try:
            doc = ireader.document(i)
            link = doc.get('link')
            entry = dao.get_by_link(link, Entry)
            category = mapper.build([doc])
            weight = 0
            if category:
                cat = category[0].text
                weight = category[0].label_weight
            else:
                cat = '其他'  # i.e. "other"
            entry.category = dao.save_category(cat, weight, 's')
            entry.save()
        except Exception, e:
            print i, e

def loadterms():
    ireader = IndexReader.open(STORE_DIR)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'title'))
    a = ireader.terms()
    rownames = []
    # column names are the terms' (Chinese/English) text
    colnames = []
    # the term-frequency matrix
    data = []
    ireader.document(-1)
    i = 0
    while a.next():
        term = a.term()
        if term.field() == 'summary':
            colnames.append(term.text())
            if term.text() == '':
                print 'ok'
                break
            i = i + 1
            if i == 1000:
                break
            docs = ireader.termDocs(term)
            vector = []
            lastdoc = 0
            while docs.next():
                # pad documents that do not contain the current term with a frequency of 0
                if lastdoc < docs.doc():
                    id = docs.doc()
                    for j in range(id - lastdoc):
                        vector.append(0)
                vector.append(docs.freq())
                lastdoc = docs.doc() + 1  # advance past the document just recorded
            data.append(vector)
    ireader.close()
    return colnames, data

def setUp(self):
    self.directory = RAMDirectory()
    self.analyzer = WhitespaceAnalyzer()

    writer = IndexWriter(self.directory, self.analyzer, True,
                         IndexWriter.MaxFieldLength.UNLIMITED)

    doc = Document()
    doc.add(Field("f", "the quick brown fox jumps over the lazy dog",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("f", "the quick red fox jumps over the sleepy cat",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
    self.reader = IndexReader.open(self.directory, True)

    self.quick = SpanTermQuery(Term("f", "quick"))
    self.brown = SpanTermQuery(Term("f", "brown"))
    self.red = SpanTermQuery(Term("f", "red"))
    self.fox = SpanTermQuery(Term("f", "fox"))
    self.lazy = SpanTermQuery(Term("f", "lazy"))
    self.sleepy = SpanTermQuery(Term("f", "sleepy"))
    self.dog = SpanTermQuery(Term("f", "dog"))
    self.cat = SpanTermQuery(Term("f", "cat"))

def loadterms():
    ireader = IndexReader.open(STORE_DIR)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'title'))
    a = ireader.terms()
    rownames = []
    colnames = []
    data = []
    print dir(a)
    print dir(ireader)
    ireader.document(-1)
    # note: the '0 and' guard disables this loop; it is left here as debugging scaffolding
    while 0 and a.next():
        term = a.term()
        if term.field() == 'summary':
            colnames.append(term.text())
            docs = ireader.termDocs(term)
            vector = []
            lastdoc = 0
            while docs.next():
                if lastdoc < docs.doc():
                    id = docs.doc()
                    for j in range(id - lastdoc):
                        vector.append(0)
                vector.append(docs.freq())
            data.append(vector)
    return colnames, data

def build_lda_corpus(index_folder, paths_index_file, dictionary_file,
                     ldac_file, min_frequency, min_word_len,
                     max_word_len=20):
    '''
    The main function that does the job!
    '''
    initVM()
    store = SimpleFSDirectory(File(index_folder))
    index_reader = IndexReader.open(store)

    # Stores the file paths index (for LDA)
    _store_file_paths_index(index_reader, paths_index_file)

    # Creates the dictionary
    _create_dictionary(index_reader, dictionary_file, min_frequency,
                       min_word_len, max_word_len)

    # Creates the corpus
    dictionary = corpora.Dictionary().load(dictionary_file)
    # doesn't load the corpus into the memory!
    corpus_memory_friendly = _TextCorpus(dictionary, index_reader)
    corpora.BleiCorpus.serialize(ldac_file, corpus_memory_friendly,
                                 id2word=dictionary)

    logging.info('The Enron corpus building is completed.')

def deleteOldDocuments(*args):
    now = datetime.datetime.now() - datetime.timedelta(hours=6)
    # use a local name instead of shadowing the IndexReader class
    reader = writer.getReader()
    for i in xrange(reader.maxDoc()):
        if reader.isDeleted(i):
            continue
        doc = reader.document(i)
        date = doc.get("creation_date")
        realDate = datetime.datetime.strptime(str(date), "%a %b %d %H:%M:%S")
        if now > realDate:
            reader.deleteDocument(i)
    writer.optimize()
    writer.commit()

def runDrillDown(self):
    # open readers
    taxo = DirectoryTaxonomyReader(self.taxoDir)
    indexReader = IndexReader.open(self.indexDir, True)

    facetRes = SimpleSearcher.searchWithDrillDown(indexReader, taxo)

    # close readers
    taxo.close()
    indexReader.close()

    # return result
    return facetRes

def runSimple(self):
    # open readers
    taxo = DirectoryTaxonomyReader(self.taxoDir)
    indexReader = IndexReader.open(self.indexDir, True)

    # returns List<FacetResult>
    facetRes = SimpleSearcher.searchWithFacets(indexReader, taxo)

    # close readers
    taxo.close()
    indexReader.close()

    # return result
    return facetRes

def testDeleteBeforeIndexMerge(self):

    reader = IndexReader.open(self.dir, False)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(2, reader.numDocs())

    reader.deleteDocument(1)

    self.assert_(reader.isDeleted(1))
    self.assert_(reader.hasDeletions())
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())

    reader.close()

    reader = IndexReader.open(self.dir, True)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())

    reader.close()

def loadtermFreq(self, context):
    word_filter = lambda w: (len(w) > 3) and w.isalpha() and \
                            (w.lower() not in nltk.corpus.stopwords.words('english'))
    try:
        reader = IndexReader.open(context.ramIndex, True)
        wordList = []
        termenum = reader.terms()
        while termenum.next():
            wordList.append(termenum.term().text())
        self.termList = filter(word_filter, wordList)
    except Exception, e:
        print 'Unable to read Ram Index', e

def do_describe_store(self, message):
    """\d[escribe_store]
    General information about the store."""
    if not self.checkStore():
        return
    nb_docs = self.numDocs()
    max_doc = self.maxDoc()
    directory = self.directory
    print "Directory info"
    print "--------------"
    print "* Directory path : %s" % self.store_path
    store_size = getTreeSize(self.store_path)
    print "* Directory size : %s" % readableSize(store_size)
    print "* Directory current version : %s" % (
        IndexReader.getCurrentVersion(directory))
    print "* Number of docs : %s (max doc num: %s)" % (
        nb_docs, max_doc)
    print "* Number of fields : %d" % len(self.fields)
    if nb_docs:
        print "* Average doc size : %s" % (
            readableSize(store_size / float(nb_docs)))
    try:
        last_modified = datetime.fromtimestamp(
            IndexReader.lastModified(directory) / 1000.0)
        last_modified = last_modified.isoformat()
    except ValueError:
        last_modified = "Unknown"
    print "* Index last modified : %s" % last_modified
    print "* Index status :",
    # TODO
    #if IndexReader.isLocked(directory):
    #    print "LOCKED"
    #else:
    #    print "unlocked"
    print "* Has deletions :",
    if self.index_reader.hasDeletions():
        print "YES"
    else:
        print "no"
    print "* Directory implementation : %s" % (
        directory.getClass().getName())

def process(self, context):
    self.unigramList = context.termList
    self.ramreader = IndexReader.open(context.ramIndex, True)
    self.ramsearcher = IndexSearcher(context.ramIndex)
    self.N = self.ramreader.numDocs()
    self.m = len(self.unigramList)
    self.createTermDocumentMatrix()
    self.ramsearcher.close()
    self.ramreader.close()
    context.termdocumentmatrix = self.termdocumentMatrix
    print 'finished creating term document matrix'
    self.context = context

def testDeleteAfterIndexMerge(self):

    reader = IndexReader.open(self.dir, False)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(2, reader.numDocs())

    reader.deleteDocument(1)
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.optimize()
    writer.close()

    reader = IndexReader.open(self.dir, True)

    self.assert_(not reader.isDeleted(1))
    self.assert_(not reader.hasDeletions())
    self.assertEqual(1, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())

    reader.close()

def pesquisar_com_lucene():
    initVM()
    #print 'lucene', VERSION

    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

    for query in querys:
        query_number = query.query_number
        # Constructs a query parser. We specify what field to search into.
        query.query_text = query.query_text.replace('?', '')
        query.query_text = query.query_text.replace('*', '')
        queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

        # Create the query
        query = queryParser.parse(query.query_text)

        # Run the query and get the top 50000 results
        topDocs = searcher.search(query, 50000)

        # Get top hits
        scoreDocs = topDocs.scoreDocs

        r = resultado_query(query_number, scoreDocs)
        resultados.append(r)
        #print "%s total matching documents." % len(scoreDocs)
        #for scoreDoc in scoreDocs:
        #    doc = searcher.doc(scoreDoc.doc)
        #    print doc.get(FIELD_PATH)

    with open('resultados_da_busca/resultados.csv', 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=';', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
        for row in resultados:
            resultados_da_row = []
            i = 1
            for resultado_da_query in row.query_results:
                doc = searcher.doc(resultado_da_query.doc)
                resultados_da_row.append((i, int(doc.get(FIELD_PATH))))
                i = i + 1
            spamwriter.writerow([row.query_number, resultados_da_row])

def calculateWeight(self, context):
    #try:
    self.termList = context.termList
    ramreader = IndexReader.open(context.ramIndex, True)
    store = SimpleFSDirectory(File(context.luceneDir))
    storereader = IndexReader.open(store)
    searcher = IndexSearcher(store)
    ramsearcher = IndexSearcher(context.ramIndex)
    # Number of documents in the collection
    N = storereader.numDocs()
    # Number of relevant documents
    R = ramreader.numDocs()
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    for w in self.termList:
        searchString = "'" + w + "'"
        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(searchString)
        # Number of relevant documents having the term
        #r = ramsearcher.docFreq(Term("contents", w))
        hits = ramsearcher.search(query, self.MAX)
        r = hits.totalHits
        # Number of documents having the term
        #n = searcher.docFreq(Term("contents", w))
        query = QueryParser(Version.LUCENE_CURRENT, context.searchField,
                            analyzer).parse(searchString)
        hits = searcher.search(query, self.MAX)
        n = hits.totalHits
        if (R - r) > 0 and (n - r) > 0 and (N - n - R + r) > 0:
            weight = (r / (R - r)) / ((n - r) / (N - n - R + r))
        else:
            weight = 0
        if weight > self.weightThreshold:
            self.gramList.append([w, weight])
    searcher.close()
    ramsearcher.close()
    storereader.close()
    ramreader.close()
    #except Exception, e:
    #    print 'error', e

def buildCategoryVectors(self):
    reader = IndexReader.open(self.directory, True)

    for id in xrange(reader.maxDoc()):
        doc = reader.document(id)
        category = doc.get("category")

        vectorMap = self.categoryMap.get(category, None)
        if vectorMap is None:
            vectorMap = self.categoryMap[category] = {}

        termFreqVector = reader.getTermFreqVector(id, "subject")
        self.addTermFreqToMap(vectorMap, termFreqVector)

def testDeleteAfterIndexMerge(self):

    reader = IndexReader.open(self.dir, False)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(2, reader.numDocs())

    reader.deleteDocument(1)
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    writer.commit()
    writer.close()

    reader = IndexReader.open(self.dir, True)

    deletedDocs = MultiFields.getDeletedDocs(reader)
    self.assert_(deletedDocs is None or not deletedDocs.get(1))
    self.assert_(not reader.hasDeletions())
    self.assertEqual(1, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())

    reader.close()

def testDeleteBeforeIndexMerge(self):

    reader = IndexReader.open(self.dir, False)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(2, reader.numDocs())

    reader.deleteDocument(1)

    deletedDocs = MultiFields.getDeletedDocs(reader)
    self.assert_(deletedDocs is not None and deletedDocs.get(1))
    self.assert_(reader.hasDeletions())
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())

    reader.close()

    reader = IndexReader.open(self.dir, True)
    self.assertEqual(2, reader.maxDoc())
    self.assertEqual(1, reader.numDocs())

    reader.close()

def query(indexName, queryFile, runName):
    indReader = IndexReader.open(SimpleFSDirectory(File(indexName)))
    indSearcher = IndexSearcher(indReader)
    ir = indSearcher.getIndexReader()

    qp = QueryParser(Version.LUCENE_CURRENT, "content",
                     StandardAnalyzer(Version.LUCENE_CURRENT))

    f = open('results-' + runName, 'w')

    while True:
        id = queryFile.readline()
        if id == "":
            break
        id = id.replace("C", "")
        id = id.replace("\n", "")

        queryString = queryFile.readline()
        queryString = queryString.replace("?", "")
        queryString = queryString.replace("*", "")
        queryString = queryString.replace("-", "_")
        queryString = queryString.replace("\n", "")

        query = qp.parse(queryString)
        queryFile.readline()

        returnedDocs = 1000
        collector = TopScoreDocCollector.create(returnedDocs, True)

        indSearcher.search(query, collector)
        hits = collector.topDocs().scoreDocs

        size = len(hits)
        print "Total hits for query " + id + ": " + str(size)

        i = 0
        for hit in hits:
            docId = hits[i].doc
            score = hits[i].score
            doc = ir.document(docId)
            j = i + 1
            f.write(id + " 0 " + doc.get('id') + " " + str(j) + " " +
                    str(score) + " " + runName + "\n")
            i += 1

    f.close()

def delete_old(self, index):
    existing_ids = set([book.id for book in Book.objects.all()])

    reader = IndexReader.open(index.index, False)
    searcher = IndexSearcher(reader)
    try:
        num = searcher.docFreq(Term('is_book', 'true'))
        docs = searcher.search(Search.make_term_query(['true'], 'is_book'), num)
        for result in docs.scoreDocs:
            stored = searcher.doc(result.doc)
            book_id = int(stored.get('book_id'))
            if book_id not in existing_ids:
                print "book id %d doesn't exist." % book_id
                index.remove_book(book_id)
    finally:
        searcher.close()
        reader.close()

def main(cls, argv):

    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return

    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c

    indexDir = argv[1]
    t9dir = argv[2]

    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"

    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue

        newDoc = Document()
        newDoc.add(Field("word", word,
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word),
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)),
                         Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)
        if id % 100 == 0:
            print "Document", id

    writer.optimize()
    writer.close()
    reader.close()

def main(cls, argv):

    if len(argv) != 3:
        print "Usage: T9er <WordNet index dir> <t9 index>"
        return

    for key in cls.keys:
        c = key[0]
        k = key[1:]
        for kc in k:
            cls.keyMap[kc] = c
            print kc, "=", c

    indexDir = argv[1]
    t9dir = argv[2]

    reader = IndexReader.open(indexDir)
    numDocs = reader.maxDoc()
    print "Processing", numDocs, "words"

    writer = IndexWriter(t9dir, WhitespaceAnalyzer(), True)

    for id in xrange(reader.maxDoc()):
        origDoc = reader.document(id)
        word = origDoc.get("word")
        if word is None or len(word) == 0:
            continue

        newDoc = Document()
        newDoc.add(Field("word", word,
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("t9", cls.t9(word),
                         Field.Store.YES, Field.Index.UN_TOKENIZED))
        newDoc.add(Field("length", str(len(word)),
                         Field.Store.NO, Field.Index.UN_TOKENIZED))
        writer.addDocument(newDoc)
        if id % 100 == 0:
            print "Document", id

    writer.commit()
    writer.close()
    reader.close()

def main(cls, argv):
    indexDir = System.getProperty("index.dir")
    directory = SimpleFSDirectory(File(indexDir))

    reader = IndexReader.open(directory, True)
    blt = BooksLikeThis(reader)

    for id in xrange(reader.maxDoc()):
        if reader.isDeleted(id):
            continue
        doc = reader.document(id)
        print ''
        print doc.get("title").encode('utf-8')

        docs = blt.docsLike(id, doc, 10)
        if not docs:
            print "  None like this"
        else:
            for doc in docs:
                print "  ->", doc.get("title").encode('utf-8')

def similar(command, docno):
    STORE_DIR = "index"
    initVM(CLASSPATH)
    directory = FSDirectory.getDirectory(STORE_DIR, False)
    searcher = IndexSearcher(directory)
    analyzer = StandardAnalyzer()

    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    parser.setFuzzyMinSim(0.2)
    query = parser.parse(command)
    hits = searcher.search(query)
    document = hits.id(docno)

    ir = IndexReader.open(STORE_DIR)
    mlt = MoreLikeThis(ir)
    mlt.setFieldNames(['name', 'contents'])
    mlt.setMinWordLen(2)
    mlt.setBoost(True)
    query = mlt.like(document)
    hits = map(transform, searcher.search(query))
    searcher.close()
    return hits

def testUpdate(self):

    self.assertEqual(1, self.getHitCount("city", "Amsterdam"))

    reader = IndexReader.open(self.dir, False)
    reader.deleteDocuments(Term("city", "Amsterdam"))
    reader.close()

    writer = IndexWriter(self.dir, self.getAnalyzer(), False,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    doc = Document()
    doc.add(Field("id", "1", Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("country", "Russia", Field.Store.YES, Field.Index.NO))
    doc.add(Field("contents", "St. Petersburg has lots of bridges",
                  Field.Store.NO, Field.Index.ANALYZED))
    doc.add(Field("city", "St. Petersburg",
                  Field.Store.YES, Field.Index.ANALYZED))
    writer.addDocument(doc)
    writer.commit()
    writer.close()

    self.assertEqual(0, self.getHitCount("city", "Amsterdam"))
    self.assertEqual(1, self.getHitCount("city", "Petersburg"))

directory = RAMDirectory()
iwriter = IndexWriter(directory, StandardAnalyzer(Version.LUCENE_CURRENT),
                      True, IndexWriter.MaxFieldLength.LIMITED)
ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be index"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t,
                  Field.Store.YES, Field.Index.ANALYZED,
                  Field.TermVector.WITH_POSITIONS_OFFSETS))
    iwriter.addDocument(doc)
iwriter.optimize()
iwriter.close()

ireader = IndexReader.open(directory, True)
tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))

for (t, f, i) in zip(tpv.getTerms(), tpv.getTermFrequencies(), xrange(100000)):
    print 'term %s' % t
    print '  freq: %i' % f
    try:
        print '  pos: ' + str([p for p in tpv.getTermPositions(i)])
    except:
        print '  no pos'
    try:
        print '  off: ' + \
              str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                   for o in tpv.getOffsets(i)])
    except:
        print '  no off'

def label_assign(self, docs, labels, lucene_ids):
    term_row = {}
    all = []
    ireader = IndexReader.open(STORE_DIR)
    total_terms = 0
    for i in range(len(lucene_ids)):
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i)
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i)

    # tokenize the labels
    analyzer = CJKAnalyzer()
    labelmatrix = zeros((len(all), len(labels)))
    label_term = []
    for i in range(len(labels)):
        if not labels[i].is_candicate_label and len(labels[i].text) >= 3:
            label_term.append([])
            continue
        #print labels[i].text, labels[i].id
        stream = analyzer.tokenStream('', StringReader(labels[i].text))
        terms = []
        for token in stream:
            if term_row.has_key(token.term()):
                # weighting
                termdocs = ireader.termDocs(Term('summary', token.term()))
                count = 0
                span = 0
                terms.append(token.term())
                while termdocs.next():
                    count += termdocs.freq()
                    span += 1
                weight = labels[i].label_weight
                #if float(span) / ireader.numDocs() >= 0.18 and not re.search('a-zA-z', token.term()):
                #    weight = 0
                labelmatrix[term_row[token.term()]][i] = weight
        label_term.append(terms)

    termmatrix = array(all)
    termmatrix = transpose(termmatrix)
    #for i in range(len(labelmatrix[0])):
    #    for j in range(len(termmatrix[0])):
    # rows are docs, columns are labels
    #p = self.product(termmatrix, labelmatrix)
    d = dot(termmatrix, labelmatrix)
    result = d / (norm(labelmatrix) * norm(termmatrix))

    doc_label = []
    for i in range(len(result)):
        m = -1
        index = -1
        group = []
        for j in range(len(result[i])):
            if result[i][j] > 0:
                labels[j].id = result[i][j]
                group.append(labels[j])
        # Substring objects sort by id, which is exactly what we rely on here
        group.sort()
        group.reverse()
        max_label = group[0]
        # i: doc number (simply its position in docs)
        # label id
        # label score
        # if the label itself does not occur in the current doc
        if not max_label.doc_freq.has_key(i):
            #print 'original:', labels[index].text
            count = 0
            overlap = ''
            for k in label_term[index]:
                if term_row.has_key(k) and termmatrix[i][term_row[k]] != 0:
                    overlap = k
                    print k
                    count += 1
            # at least one overlapping term, and its length is >= 2
            if count == 1 and len(overlap) >= 2:
                new_label = pextractor.Substring()
                new_label.text = overlap
                new_label.id = m
                doc_label.append(group[0])
                continue
            #labels[index].id = m
        doc_label.append(group[0])
    return doc_label

def assign(self, docs, labels, lucene_ids):
    term_row = {}
    all = []
    ireader = IndexReader.open(STORE_DIR)
    total_terms = 0
    term_doc_freq = {}
    for i in range(len(lucene_ids)):
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'summary'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i, term_doc_freq)
        """
        TODO: give extra weight to terms that appear in the title
        """
        tpv = TermPositionVector.cast_(ireader.getTermFreqVector(lucene_ids[i], 'title'))
        self.add2matrix(tpv, all, term_row, lucene_ids, i, term_doc_freq)
    #for k, v in term_doc_freq.items():
    #    if v > 3:
    #        print k, v

    # tokenize the labels
    analyzer = CJKAnalyzer()
    labelmatrix = zeros((len(all), len(labels)))
    label_term = []
    # doc -> label: the labels assigned to each doc
    all_weight_table = {}
    # label -> doc: the docs covered by each label
    label_doc = []
    label_doc_map = {}
    for i in range(len(labels)):
        nonzero_table = []
        # products of the weights between this label and every doc
        weight_table = []
        stream = analyzer.tokenStream('', StringReader(labels[i].text))
        terms = []
        c = 0
        weight_row = {}
        nonzero_index = []
        is_incomplete = False
        for token in stream:
            term = token.term()  #token.decode('utf-8')#
            #print term
            if term_row.has_key(term):
                row = term_row[term]
                terms.append(term)
                docs_with_current_term = all[row]
                for j in range(len(docs_with_current_term)):
                    if docs_with_current_term[j] != 0:
                        if c == 0:
                            nonzero_index.append(j)
                        if c == 0 or j in nonzero_index:
                            weight_row[j] = weight_row.get(j, 0) + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight
                        else:
                            # add 1 so the weight product cannot be 0
                            # for a doc that entered nonzero_index at first but drops out later,
                            # multiply by -100 so the minimal product marks this label as unsuitable for the doc
                            weight_row[j] = (1 + docs_with_current_term[j] * term_doc_freq[term] * labels[i].label_weight) * (-100)
                    # for a doc that was not in nonzero_index at first but shows up later
                    elif docs_with_current_term[j] == 0 and j in nonzero_index:
                        # add 1 so the weight product cannot be 0
                        weight_row[j] = (1 + docs_with_current_term[j] * labels[i].label_weight) * (-100)
                c += 1
            else:
                is_incomplete = True
        label_term.append(terms)
        # bugfix: if, after tokenization, not every term of the current label occurs
        # among the document terms, give up on this label and discard it
        if is_incomplete:
            weight_row = {}

        for doc, weight in weight_row.items():
            last = all_weight_table.get(doc)
            if weight > 0:
                if not label_doc_map.has_key(labels[i].text):
                    kc = dao.get_keyword_category_by_category(self.query, labels[i].text)
                    #label_doc.append([0, labels[i].text, []])
                    label_doc.append([0, labels[i].text, 0])
                    label_doc_map[labels[i].text] = len(label_doc) - 1
                new_label = pextractor.Substring()
                new_label.text = labels[i].text
                new_label.id = weight
                if last:
                    all_weight_table[doc].append(new_label)
                else:
                    all_weight_table[doc] = [new_label]
                #label_doc[label_doc_map[labels[i].text]][2].append(doc)
                label_doc[label_doc_map[labels[i].text]][2] += 1
                label_doc[label_doc_map[labels[i].text]][0] += weight
                #try:
                #    category = dao.save_category(labels[i].text, weight, 'd')
                #    entry = self.entries[doc]
                #    ec = dao.save_entry_cat(entry, category, weight)
                #except Exception, e:
                #    print e
                #if last:
                #    all_weight_table[doc].append(ec)
                #else:
                #    all_weight_table[doc] = [ec]
            # if the doc already has a label, compare the stored doc-label weight with the
            # current one and keep the larger, i.e. choose the label with the maximum weight
            #if last:
            #    if last.id < weight and weight > 0:
            #        labels[i].id = weight
            #        all_weight_table[doc] = labels[i]
            #else:
            #    labels[i].id = weight
            #    all_weight_table[doc] = labels[i]
    label_doc.sort(reverse=True)
    for k, v in all_weight_table.items():
        v.sort(reverse=True)
    # because the dict keys are consecutive integers, they hash into ascending slots,
    # so the returned values are already sorted
    thread = SaveLabelsThread(all_weight_table, label_doc, self.entries, self.query)
    thread.start()
    return all_weight_table, label_doc

def __init__(self, stopWords=None):
    if stopWords is None:
        self.stopWords = StopAnalyzer.ENGLISH_STOP_WORDS
    else:
        self.stopWords = stopWords

def tokenStream(self, fieldName, reader):
    return StopFilter(LowerCaseFilter(LetterTokenizer(reader)),
                      self.stopWords)


if __name__ == '__main__':
    analyzer = CJKAnalyzer()
    directory = RAMDirectory()
    ireader = IndexReader.open(STORE_DIR)
    iwriter = IndexWriter(directory, StandardAnalyzer(), True)
    ts = ["javasd。 $##open所大家教唆犯地方地方即可解放大家空间艰苦奋斗矿井口地方",
          "所看看对抗赛不久交会法觉得拮抗剂"]
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t,
                      Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)
    iwriter.optimize()
    iwriter.close()

    ireader = IndexReader.open(directory)
    tpv = TermPositionVector.cast_(ireader.getTermFreqVector(0, 'fieldname'))

    for (t, f, i) in zip(tpv.getTerms(), tpv.getTermFrequencies(), xrange(100000)):
        print 'term %s' % t

FIELD_CONTENTS = "contents"
FIELD_PATH = "path"
#QUERY_STRING = "lucene and restored"
QUERY_STRING = sys.argv[1]
STORE_DIR = "/home/kanaujia/lucene_index"

if __name__ == '__main__':
    initVM()
    print 'lucene', VERSION

    # Get handle to index directory
    directory = SimpleFSDirectory(File(STORE_DIR))

    # Creates a searcher searching the provided index.
    ireader = IndexReader.open(directory, True)

    # Implements search over a single IndexReader.
    # Use a single instance and use it across queries
    # to improve performance.
    searcher = IndexSearcher(ireader)

    # Get the analyzer
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Constructs a query parser.
    queryParser = QueryParser(Version.LUCENE_CURRENT, FIELD_CONTENTS, analyzer)

    # Create a query
    query = queryParser.parse(QUERY_STRING)

            addedDocs = IndexHelper.index(f, writer)
            noDocs += addedDocs
            nrFiles += 1
        except IOError:
            print "File " + filename + " does not exist. Skipping..."

    writer.close()
    print str(nrFiles) + " files containing " + str(noDocs) + " documents added to index"

elif sys.argv[1] == 'read':
    reader = IndexReader.open(SimpleFSDirectory(File("senses-gh95")))
    doc = reader.document(0)
    content = doc.getValues("content")
    id = doc.getValues("id")
    print content

    nrDocs = reader.numDocs()
    print "Number of docs: " + str(nrDocs)
    print "Doc 1: " + str(id[0])

    # Print all terms (takes some time :-) )
    #terms = reader.terms()
    #while terms.next():
    #    t = terms.term()
    #    freq = terms.docFreq()

def testIndexReader(self):
    reader = IndexReader.open(self.dir, True)
    self.assertEqual(len(self.keywords), reader.maxDoc())
    self.assertEqual(len(self.keywords), reader.numDocs())
    reader.close()

def getIndexReader(self):
    return IndexReader.open(self.getDirectory())