def populate_frame(self, date_range, term_vector) -> pd.DataFrame:
    data_frame = pd.DataFrame(data=0, index=date_range, columns=term_vector)
    iterator = self.lucene_dictionary.getEntryIterator()
    for term in BytesRefIterator.cast_(iterator):
        term_as_string = term.utf8ToString()
        # print('term:', term_as_string)
        query = QueryParser("contents", self.analyzer).parse(term_as_string)
        collector = TopScoreDocCollector.create(10000, 10000)
        hits = self.searcher.search(query, 1000)
        if hits is None:
            # print("No hit for term: ", term_as_string)
            continue
        print("Found hit: " + term_as_string)
        for hit in hits.scoreDocs:
            document = self.searcher.doc(hit.doc)
            doc_name = document.getField("doc_name")
            date = datetime.datetime.strptime(doc_name.stringValue(), '%m%d%y')
            current_value = data_frame.at[date, term_as_string]
            if np.isnan(current_value):
                current_value = 0
            data_frame.at[date, term_as_string] = current_value + 1
    return data_frame
def getMostFrequentTermNoStopwords(route, query):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    for doc in range(ireader.numDocs()):
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                t = Term("content", term)
                freq = ireader.totalTermFreq(t)
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    return currentTerm
def get_terms(self) -> List[str]:
    iterator = self.lucene_dictionary.getEntryIterator()
    map_iterator = map(lambda term: term.utf8ToString(), BytesRefIterator.cast_(iterator))
    return list(map_iterator)
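# A minimal standalone sketch of the same pattern (an assumption, not part of the
# original class): LuceneDictionary wraps a reader and field and exposes
# getEntryIterator(); the reader, field name, and function name are placeholders.
from org.apache.lucene.search.suggest import LuceneDictionary

def terms_from_reader(reader, field="contents"):
    dictionary = LuceneDictionary(reader, field)
    iterator = dictionary.getEntryIterator()
    return [term.utf8ToString() for term in BytesRefIterator.cast_(iterator)]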
def getTermVectors(route):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    ls = []
    for doc in range(ireader.numDocs()):
        vector = FreqVector()
        vector.vector = []
        vector.freqs = []
        norm = 0.0
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString()
                tf = 1 + math.log(termsEnum.totalTermFreq(), 2)
                t = Term("content", term)
                idf = math.log(ireader.numDocs() / ireader.docFreq(t))
                vector.vector.append(text)
                vector.freqs.append(tf * idf)
                norm += (tf * idf) * (tf * idf)
            ls.append((vector, math.sqrt(norm)))
        else:
            ls.append((vector, 0))
    return ls
def getMostFrequentTermStopwords(route, query):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    stopWords = [
        "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if",
        "no", "not", "more", "http", "html", "of", "on", "or", "such", "that",
        "the", "their", "then", "there", "these", "they", "this", "to", "was",
        "will", "with", "el", "la", "lo", "los", "las", "ante", "con", "sin",
        "que", "es", "de", "en", "por", "y", "los"
    ]
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(route))
    ireader = IndexReader.open(directory)
    currentTerm = ""
    currentTermFreq = 0
    for doc in range(ireader.numDocs()):
        terms = ireader.getTermVector(doc, "content")
        if terms is not None:
            termsEnum = terms.iterator(None)
            for term in BytesRefIterator.cast_(termsEnum):
                text = term.utf8ToString().encode('UTF-8')
                t = Term("content", term)
                freq = ireader.totalTermFreq(t)
                if freq > currentTermFreq and text not in query:
                    currentTerm = text
                    currentTermFreq = freq
    return currentTerm
def evaluate_index(index_dir, context, analyzer):
    # eval time of indexing (overall)
    # we should also measure the elapsed time of
    # each index_document call separately
    start = time.clock()
    Indexer(index_dir, context, analyzer)
    end = time.clock()
    duration = end - start
    directory = SimpleFSDirectory(File(index_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)
    # print str(vocab_size)                        # size of vocabulary
    # print str(vocabulary.getDocCount())          # number of docs with at least one term in the title field
    # print str(vocabulary.getSumTotalTermFreq())  # number of tokens
    # print str(vocabulary.getSumDocFreq())        # number of postings
    reader.close()
    return duration, vocab_size
def get_doc_termvector(self, lucene_doc_id, field):
    """Outputs the document term vector as a generator."""
    terms = self.reader.getTermVector(lucene_doc_id, field)
    if terms:
        termenum = terms.iterator(None)
        for bytesref in BytesRefIterator.cast_(termenum):
            yield bytesref.utf8ToString(), termenum
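# Usage sketch for get_doc_termvector (an assumption, not from the original
# source): `indexer` stands for any object exposing the method above, and the
# field name is a placeholder. The generator yields (term, TermsEnum) pairs with
# the enum positioned on that term, so totalTermFreq() returns its frequency.
def doc_termvector_to_dict(indexer, lucene_doc_id, field="contents"):
    return {term: int(termenum.totalTermFreq())
            for term, termenum in indexer.get_doc_termvector(lucene_doc_id, field)}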
def read_vectors():
    for doc in range(0, reader.numDocs()):
        for fieldName in FIELD_NAMES:
            terms = reader.getTermVector(doc, fieldName)
            if terms:
                termsEnum = terms.iterator(None)
                vectors[fieldName][doc] = \
                    set(term.utf8ToString() for term in BytesRefIterator.cast_(termsEnum))
def get_coll_termvector(self, field):
    """Returns collection term vector for the given field."""
    self.open_reader()
    fields = MultiFields.getFields(self.reader)
    if fields is not None:
        terms = fields.terms(field)
        if terms:
            termenum = terms.iterator(None)
            for bytesref in BytesRefIterator.cast_(termenum):
                yield bytesref.utf8ToString(), termenum
def get_term_freq(self, docid, field, terms=None):
    if terms is None:
        terms = self.reader.getTermVector(docid, field)
    term_freq = {}
    if terms is not None:
        te_itr = terms.iterator()
        for bytesref in BytesRefIterator.cast_(te_itr):
            t = bytesref.utf8ToString()
            freq = te_itr.totalTermFreq()
            term_freq[t] = freq
    return term_freq
def getLabelsandTerms(ireader, numDocs):
    labels = []
    terms_list = []
    for doc in xrange(0, numDocs):
        tv = ireader.getTermVector(doc, "contents")
        document = ireader.document(doc)
        topic = document.getField("topic")
        labels.append(topic.stringValue())
        termsEnum = tv.iterator()
        for term in BytesRefIterator.cast_(termsEnum):
            curr_term = term.utf8ToString()
            terms_list.append(curr_term)
    return labels, terms_list
def getTFForField(self, field):
    tfs = {}
    fields = MultiFields.getFields(self.reader)
    terms = fields.terms(field)
    enum = BytesRefIterator.cast_(terms.iterator(None))
    try:
        while enum.next():
            termval = TermsEnum.cast_(enum)
            termString = termval.term().utf8ToString()
            freq = self.reader.totalTermFreq(Term(field, termString))
            tfs[termString] = freq
    except:
        pass
    return tfs
def getTermFrequencyMatrix(ireader, num_docs, te):
    feature_mat = np.zeros([num_docs, len(te.classes_)])
    for doc in xrange(0, num_docs):
        print "Running For Document number:" + str(doc)
        tv = ireader.getTermVector(doc, "contents")
        termsEnum = tv.iterator()
        for term in BytesRefIterator.cast_(termsEnum):
            str_term = term.utf8ToString()
            dpEnum = termsEnum.postings(None)
            dpEnum.nextDoc()
            freq = dpEnum.freq()
            term_ind = te.transform([str_term])[0]
            feature_mat[doc][term_ind] = freq
    return feature_mat
def main(storeDir):
    reader = DirectoryReader.open(storeDir)
    numDocs = reader.numDocs()
    print("n_docs:", numDocs)
    for i in range(numDocs):
        tvec = reader.getTermVector(i, 'body')
        if tvec is not None:
            termsEnum = tvec.iterator()
            vec = {}
            for term in BytesRefIterator.cast_(termsEnum):
                dpEnum = termsEnum.postings(None)
                dpEnum.nextDoc()
                vec[term.utf8ToString()] = dpEnum.freq()
            print(vec)
    reader.close()
def test_bug1842(self):
    reader = self.getReader()
    searcher = self.getSearcher()
    q = TermQuery(Term("id", '1'))
    topDocs = searcher.search(q, 50)
    termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
    terms = []
    freqs = []
    termsEnum = termvec.iterator()
    for term in BytesRefIterator.cast_(termsEnum):
        terms.append(term.utf8ToString())
        freqs.append(termsEnum.totalTermFreq())
    terms.sort()
    self.assert_(terms == ['blah', 'gesundheit'])
    self.assert_(freqs == [3, 1])
def get_term_freq(self, docid, field, is_cached=False):
    if is_cached == True and (field, docid) in self.dict_term_freq:
        return self.dict_term_freq[(field, docid)]
    if len(self.dict_term_freq) > 2000:
        self.dict_term_freq.clear()
    terms = self.reader.getTermVector(docid, field)
    term_freq = {}
    if terms is not None:
        te_itr = terms.iterator()
        for bytesref in BytesRefIterator.cast_(te_itr):
            t = bytesref.utf8ToString()
            freq = te_itr.totalTermFreq()
            term_freq[t] = freq
    self.dict_term_freq[(field, docid)] = term_freq
    return self.dict_term_freq[(field, docid)]
def get_terms(indexReader, field='text'):
    """
    Gets all terms in an index.

    :param indexReader: IndexReader object of your index
    :param field: document field from which terms should be counted
    :return: list of terms (strings)
    """
    terms = []
    multiterms = MultiFields.getTerms(indexReader, field)
    termit = multiterms.iterator()
    it = BytesRefIterator.cast_(termit)  # Inheritance apparently doesn't work in PyLucene...
    term = it.next()
    while term:
        terms.append(term.utf8ToString())
        term = it.next()
    return terms
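# A possible driver for get_terms (illustrative only): the index path and field
# name are placeholders, and lucene.initVM() is assumed to have been called once
# per process before this runs; import paths are the usual PyLucene ones.
from java.nio.file import Paths
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.store import SimpleFSDirectory

def print_vocabulary(index_path="/path/to/index", field="text"):
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(index_path)))
    try:
        for term in get_terms(reader, field=field):
            print(term)
    finally:
        reader.close()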
def getFreqVectorFromText(self, text):
    # Initialization of Java Virtual Machine with Lucene
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    indexDir = "res/index"
    stopWords = []
    stopWordsSet = StopFilter.makeStopSet(Version.LUCENE_CURRENT, stopWords)
    analyzer = StopAnalyzer(Version.LUCENE_CURRENT, stopWordsSet)
    directory = SimpleFSDirectory(File(indexDir))
    conf = IndexWriterConfig(Version.LUCENE_4_10_1, analyzer)
    rebuild = True
    if rebuild:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    else:
        conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    iwriter = IndexWriter(directory, conf)
    doc = Document()
    doc.add(Field("docName", 'url', Field.Store.YES, Field.Index.NOT_ANALYZED))
    doc.add(Field("content", text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
    iwriter.addDocument(doc)
    iwriter.close()
    ireader = IndexReader.open(directory)
    freqVector = []
    docVector = ireader.getTermVector(0, "content")
    termsEnum = docVector.iterator(None)
    for term in BytesRefIterator.cast_(termsEnum):
        text = term.utf8ToString()
        freq = termsEnum.totalTermFreq()
        freqVector.append((text, freq))
    freqVector = sorted(freqVector, key=itemgetter(1), reverse=True)
    self.vector = list()
    self.freqs = list()
    for el in freqVector:
        self.vector.append(el[0])
        self.freqs.append(el[1])
def test_FieldEnumeration(self):
    self.test_indexDocument()
    store = self.openStore()
    writer = None
    try:
        analyzer = self.getAnalyzer()
        writer = self.getWriter(store, analyzer, False)
        doc = Document()
        doc.add(Field("title", "value of testing", TextField.TYPE_STORED))
        doc.add(Field("docid", str(2), StringField.TYPE_NOT_STORED))
        doc.add(Field("owner", "unittester", StringField.TYPE_STORED))
        doc.add(Field("search_name", "wisdom", StoredField.TYPE))
        doc.add(Field("meta_words", "rabbits are beautiful", TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("owner", "unittester", StringField.TYPE_NOT_STORED))
        doc.add(Field("search_name", "wisdom", StoredField.TYPE))
        doc.add(Field("meta_words", "rabbits are beautiful", TextField.TYPE_NOT_STORED))
        writer.addDocument(doc)
    finally:
        self.closeStore(store, writer)

    store = self.openStore()
    reader = None
    try:
        reader = DirectoryReader.open(store)
        term_enum = MultiFields.getTerms(reader, "docid").iterator()
        docids = [term.utf8ToString() for term in BytesRefIterator.cast_(term_enum)]
        self.assertEqual(len(docids), 2)
    finally:
        self.closeStore(store, reader)
def get_doc_termfreqs_all_fields(self, lucene_doc_id):
    """
    Returns term frequency for all fields in the given document.

    :param lucene_doc_id: Lucene document ID
    :return: dictionary {field: {term: freq, ...}, ...}
    """
    doc_termfreqs = {}
    vectors = self.reader.getTermVectors(lucene_doc_id)
    if vectors:
        for field in vectors.iterator():
            doc_termfreqs[field] = {}
            terms = vectors.terms(field)
            if terms:
                termenum = terms.iterator(None)
                for bytesref in BytesRefIterator.cast_(termenum):
                    doc_termfreqs[field][bytesref.utf8ToString()] = int(termenum.totalTermFreq())
            print doc_termfreqs[field]
    return doc_termfreqs
def termsForField(self, field, prefix=None, limit=10, **kwargs):
    convert = lambda term: term.utf8ToString()
    terms = []
    termsEnum = MultiFields.getTerms(self._indexAndTaxonomy.searcher.getIndexReader(), field)
    if termsEnum is None:
        return terms
    iterator = termsEnum.iterator(None)
    if prefix:
        iterator.seekCeil(BytesRef(prefix))
        terms.append((iterator.docFreq(), convert(iterator.term())))
    bytesIterator = BytesRefIterator.cast_(iterator)
    try:
        while len(terms) < limit:
            term = convert(bytesIterator.next())
            if prefix and not term.startswith(prefix):
                break
            terms.append((iterator.docFreq(), term))
    except StopIteration:
        pass
    return terms
def get_tf_idf(self, field_name: str, content_id: str):
    """
    Calculates the tf-idf for the words contained in the field of the content
    whose id is content_id

    Args:
        field_name (str): Name of the field containing the words for which to
            calculate the tf-idf
        content_id (str): Id of the content that contains the specified field

    Returns:
        words_bag (Dict <str, float>): Dictionary whose keys are the words
            contained in the field, and the corresponding values are the
            tf-idf values.
    """
    searcher = IndexSearcher(DirectoryReader.open(SimpleFSDirectory(Paths.get(self.directory))))
    query = QueryParser("testo_libero", KeywordAnalyzer()).parse("content_id:\"" + content_id + "\"")
    score_docs = searcher.search(query, 1).scoreDocs
    document_offset = -1
    for score_doc in score_docs:
        document_offset = score_doc.doc

    reader = searcher.getIndexReader()
    words_bag = {}
    term_vector = reader.getTermVector(document_offset, field_name)
    term_enum = term_vector.iterator()
    for term in BytesRefIterator.cast_(term_enum):
        term_text = term.utf8ToString()
        postings = term_enum.postings(None)
        postings.nextDoc()
        term_frequency = 1 + math.log10(postings.freq())  # normalized term frequency
        inverse_document_frequency = math.log10(reader.maxDoc() / reader.docFreq(Term(field_name, term)))
        tf_idf = term_frequency * inverse_document_frequency
        words_bag[term_text] = tf_idf

    reader.close()
    return words_bag
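# A small helper showing how the get_tf_idf output might be consumed (an
# assumption, not part of the original class): `index` is any object exposing
# get_tf_idf, and the default field name and content id are placeholders.
def top_tf_idf_terms(index, field_name="testo_libero", content_id="0", n=10):
    words_bag = index.get_tf_idf(field_name, content_id)
    return sorted(words_bag.items(), key=lambda kv: kv[1], reverse=True)[:n]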
def get_document_vector(searcher, reader, document_id, id_field, text_field):
    '''
    Given a document id, fetch the tf-idf vector of the document.
    '''
    tc_dict = {}     # Counts of each term
    dc_dict = {}     # Number of docs associated with each term
    tfidf_dict = {}  # TF-IDF values of each term in the doc

    # Get the document id.
    query_parser = QueryParser(id_field, WhitespaceAnalyzer())
    score_docs = searcher.search(query_parser.parse(str(document_id)), 1).scoreDocs
    if len(score_docs) > 0:
        # get the tf-idf vector.
        termVector = reader.getTermVector(score_docs[0].doc, text_field)
        termsEnumvar = termVector.iterator()
        termsref = BytesRefIterator.cast_(termsEnumvar)
        N_terms = 0
        try:
            while termsref.next():
                termval = TermsEnum.cast_(termsref)
                fg = termval.term().utf8ToString()  # Term in unicode
                if len(fg) > 3 and not fg.isdigit():
                    tc = termval.totalTermFreq()  # Term count in the doc
                    # Number of docs having this term in the index
                    dc = reader.docFreq(Term(text_field, termval.term()))
                    N_terms = N_terms + 1
                    tc_dict[fg] = tc
                    dc_dict[fg] = dc
        except:
            print('error in term_dict')

    # Compute TF-IDF for each term
    for term in tc_dict:
        tf = tc_dict[term] / N_terms
        idf = 1 + math.log(reader.numDocs() / (dc_dict[term] + 1))
        tfidf_dict[term] = tf * idf
    return tfidf_dict
def evaluate_index(data_dir, store_dir, analyzer):
    """
    Evaluates vocabulary size and indexing speed for different analyzer configurations.
    """
    start = time.clock()
    Indexer(data_dir, store_dir, analyzer)
    end = time.clock()
    duration = end - start

    directory = SimpleFSDirectory(File(store_dir))
    reader = IndexReader.open(directory)
    vocabulary = MultiFields.getTerms(reader, 'title')
    vocab_size = vocabulary.size()
    # sometimes .size() doesn't return the correct size, in this case
    # we have to count manually
    if vocab_size == -1:
        termsref = BytesRefIterator.cast_(vocabulary.iterator(None))
        vocab_size = sum(1 for _ in termsref)
    reader.close()
    return duration, vocab_size
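# A possible driver for evaluate_index (an illustration, not from the original
# project): the directory paths are placeholders and `analyzers` is any iterable
# of already-constructed Lucene Analyzer instances.
def compare_analyzers(analyzers, data_dir='data/', store_dir='index/'):
    results = {}
    for analyzer in analyzers:
        duration, vocab_size = evaluate_index(data_dir, store_dir, analyzer)
        results[type(analyzer).__name__] = (duration, vocab_size)
    return results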
def main():
    # constants
    FIELD_CONTENTS = "vectext"
    DOC_NAME = "identifier"
    STORE_DIR = "../full_index1"

    lucene.initVM()
    store = SimpleFSDirectory(Paths.get(STORE_DIR))
    ireader = DirectoryReader.open(store)  # , True)
    # print(ireader.readerIndex(0))
    searcher = IndexSearcher(ireader)  # self.getSearcher()

    pickle_file = glob.glob('full_word_list.pkl')
    print(pickle_file)

    date_range = (1785, 1805)
    bigrams = False
    remake_word_list = True
    if remake_word_list:  # not pickle_file:
        full_df = get_full_df()
        full_term_data = []
        for year in range(date_range[0], date_range[1]):
            docs_in_year = get_docs_in_year(full_df, year)
            # print(docs_in_year)
            year_dict = Counter({})
            terms = []
            freqs = []
            print(year)
            for cd, doc_id in enumerate(docs_in_year):
                # if not cd%100:
                #     print(cd, '--', len(docs_in_year))
                # get document (query by id)
                q = TermQuery(Term("identifier", doc_id + '_djvu.txt'))
                topDocs = searcher.search(q, 50000)
                # termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
                one_doc = topDocs.scoreDocs[0].doc
                doc_name = searcher.doc(one_doc)
                # print(doc_name, doc_id)
                if bigrams == False:
                    termvec = ireader.getTermVector(topDocs.scoreDocs[0].doc, FIELD_CONTENTS)
                    if termvec != None:
                        # termvec = reader.getTermVector(topDocs.scoreDocs[0].doc, "all")
                        termsEnum = termvec.iterator()
                        for term in BytesRefIterator.cast_(termsEnum):
                            terms.append(term.utf8ToString())
                            freqs.append(termsEnum.totalTermFreq())
                else:
                    # print(doc_name, doc_id)
                    text = doc_name.get("text")
                    text = text.split()
                    text = strip_stopwords_punc(text)
                    for word1, word2 in zip(text[:-1], text[1:]):
                        if len(word1) + len(word2) > 6:
                            try:
                                year_dict[word1 + ' ' + word2] += 1
                            except:
                                year_dict[word1 + ' ' + word2] = 1
            if bigrams == False:
                for term, freq in zip(terms, freqs):
                    try:
                        year_dict[term] += freq
                    except:
                        year_dict[term] = freq
            print(len(year_dict))
            # print(year_dict)
            for term in list(year_dict):
                if year_dict[term] < 2:  # 5 and term not in stopwords:
                    year_dict.pop(term)
            full_term_data.append(year_dict)
            print(len(year_dict))
            # year_dict = year_dict + doc_dict
            # print(year_dict.most_common(1000))
            print('\n\n')
        if bigrams:
            pickle.dump(full_term_data, open('full_bigram_list.pkl', 'wb'))
        else:
            pickle.dump(full_term_data, open('full_word_list.pkl', 'wb'))
    else:
        if bigrams:
            full_term_data = pickle.load(open('full_bigram_list.pkl', 'rb'))
        else:
            full_term_data = pickle.load(open('full_word_list.pkl', 'rb'))

    # get complete list of unique words
    # top_words_year = zscore_method(full_term_data, date_range)
    top_words_year = tfidf_method(full_term_data, date_range)
    print(top_words_year)
    pickle.dump(top_words_year, open('trending_ratio.pkl', 'wb'))
ft.setStoreTermVectorPositions(True)
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)

ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be indexed"]
for t in ts:
    doc = Document()
    doc.add(Field("fieldname", t, ft))
    iwriter.addDocument(doc)
iwriter.commit()
iwriter.close()

ireader = DirectoryReader.open(directory)
for doc in xrange(0, len(ts)):
    tv = ireader.getTermVector(doc, "fieldname")
    termsEnum = tv.iterator()
    for term in BytesRefIterator.cast_(termsEnum):
        dpEnum = termsEnum.postings(None)
        dpEnum.nextDoc()  # prime the enum which works only for the current doc
        freq = dpEnum.freq()
        print 'term:', term.utf8ToString()
        print '  freq:', freq
        for i in xrange(freq):
            print "  pos:", dpEnum.nextPosition()
            print "  off: %i-%i" % (dpEnum.startOffset(), dpEnum.endOffset())
        print
def run(reader, searcher, analyzer, searchTerm, K):
    # get the user's query and initialize some dictionaries
    command = searchTerm
    # for each word, record how many matched documents contain it
    termOccurrence = {}
    # inverse document frequency for each word
    idf = {}
    # term frequency for each word in each matched document
    tf = {}
    # the product of tf and idf for each word in each matched document
    tfidf = {}
    # the number of words in each matched document
    docLength = {}
    # for each kind of word in each document, record how many times it appeared in the document
    termOccurrenceInADoc = {}
    # record every kind of word in the matched documents as a key; the value of this dictionary doesn't matter
    allWords = {}
    # parse the user query
    query = QueryParser("contents", analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    # the total number of matched documents
    totalDocs = 0
    # for each matched document, calculate the normalized term frequency of all its terms
    for scoreDoc in scoreDocs:
        totalDocs = totalDocs + 1
        # get the url and term vectors of the matched document
        doc = searcher.doc(scoreDoc.doc)
        vectors = reader.getTermVector(scoreDoc.doc, "contents")
        enum = vectors.iterator()
        url = doc.get("url")
        # record the url of the matched document, create a nested dictionary for term occurrences of a word in a document
        termOccurrenceInADoc[url] = {}
        # record the url of the matched document, create a nested dictionary for term frequency
        tf[url] = {}
        # for each term in the matched document, record the term's occurrences and compute the total word count of the document
        for term in BytesRefIterator.cast_(enum):
            term2 = term.utf8ToString()
            # record the term
            allWords[term2] = 1
            # increase the term occurrence by one
            if termOccurrence.has_key(term2):
                termOccurrence[term2] = termOccurrence[term2] + 1
            else:
                termOccurrence[term2] = 1
            dpEnum = enum.postings(None)
            dpEnum.nextDoc()
            # get the occurrence count of the term
            freq = dpEnum.freq()
            # add the occurrences of the term to the total word count of the document
            if docLength.has_key(url):
                docLength[url] = docLength[url] + freq
            else:
                docLength[url] = freq
            # record the occurrences of this term in the corresponding document
            termOccurrenceInADoc[url][term2] = freq
        # for each term in each document, divide its occurrences by the total length of the document to get the normalized term frequency
        for key in termOccurrenceInADoc[url]:
            tf[url][key] = termOccurrenceInADoc[url][key] / float(docLength[url])
    # calculate idf for each term
    for key in termOccurrence:
        idf[key] = math.log(float(totalDocs) / termOccurrence[key], 10)
    # calculate the product of tf and idf for each term in each document
    for key in tf:
        tfidf[key] = {}
        for word in allWords:
            if tf[key].has_key(word):
                tfidf[key][word] = tf[key][word] * idf[word]
    k = K
    # this records the attributes of all centroids
    centroids = {}
    # this records all urls of each cluster
    clusters = {}
    # copy the tfidf dictionary, because some keys need to be deleted to make sure different initial centroids are chosen
    copy = tfidf.copy()
    # randomly choose k vectors as initial centroids
    for counter in range(1, k + 1):
        if len(copy) == 0:
            break
        key = random.choice(copy.keys())
        centroids[counter] = tfidf[key].copy()
        clusters[counter] = {}
        del copy[key]
    # this dictionary records the clusters of the previous iteration; it is compared with the new clusters to judge whether the clusters changed
    oldClusters = {}
    # do at most 100 iterations; if the clusters no longer change before 100 iterations, break the loop
    for counter2 in range(1, 101):
        # for each vector, find its closest centroid and put the vector into the corresponding cluster
        for key in tfidf:
            counter = 1
            # for each centroid, calculate the euclidean distance between it and the vector, and find the centroid with the shortest euclidean distance
            for centroid in centroids:
                # for the first centroid, simply treat it as the closest for now
                if counter == 1:
                    closestDistance = 0
                    for attribute in tfidf[key]:
                        closestDistance = closestDistance + (tfidf[key].get(attribute, 0) - centroids[centroid].get(attribute, 0)) ** 2
                    closestDistance = math.sqrt(closestDistance)
                    closestCentroid = centroid
                # for the other centroids, check whether their euclidean distance is shorter than the temporary closest; if so, replace the closest centroid
                else:
                    temp = 0
                    for attribute in tfidf[key]:
                        temp = temp + (tfidf[key].get(attribute, 0) - centroids[centroid].get(attribute, 0)) ** 2
                    temp = math.sqrt(temp)
                    if temp < closestDistance:
                        closestDistance = temp
                        closestCentroid = centroid
                counter = counter + 1
            # put the url of the document into the new cluster; the value of the dictionary doesn't matter
            clusters[closestCentroid][key] = 1
        # after calculating the new clusters, clear the old centroids
        centroids = {}
        # for each cluster, calculate the new centroid
        for cluster in clusters:
            docNumber = 0
            # initialize a nested dictionary for each cluster's centroid
            centroids[cluster] = {}
            # add every vector of the cluster together
            for document in clusters[cluster]:
                docNumber = docNumber + 1
                for term in allWords:
                    centroids[cluster][term] = centroids[cluster].get(term, 0) + tfidf[document].get(term, 0)
            # if the cluster is not empty, divide the sum of all vectors by the number of vectors; the result is the new centroid
            if docNumber == 0:
                newCentroid = random.choice(tfidf.keys())
                centroids[cluster] = tfidf[newCentroid].copy()
            else:
                for term in allWords:
                    centroids[cluster][term] = float(centroids[cluster][term]) / docNumber
        # if this is not the first iteration, compare the new clusters with the old clusters; if they are exactly the same, break the loop
        if counter2 != 1:
            completelySame = True
            for cluster in clusters:
                for document in clusters[cluster]:
                    if oldClusters[cluster].has_key(document) == False:
                        completelySame = False
                        break
            if completelySame == True:
                break
        # copy every new cluster, to be compared with the clusters of the next iteration
        for cluster in clusters:
            oldClusters[cluster] = clusters[cluster].copy()
            clusters[cluster].clear()
    # after clustering, print every cluster, the three terms with the highest tfidf, and every website of that cluster
    counter2 = 1
    for key in clusters:
        # print the three terms with the highest tfidf, treat them as labels
        if len(clusters[key]) != 0:
            print "cluster", counter2
            print "(3 words with highest tf*idf (labels): "
            for counter in range(1, 4):
                if not bool(centroids[key]):
                    break
                highestTFIDF = max(centroids[key].items(), key=lambda x: x[1])[0]
                print highestTFIDF.encode('ascii', 'ignore')
                del centroids[key][highestTFIDF]
            print ")<br>"
            counter2 = counter2 + 1
            # for each cluster, print all websites:
            # if the file has an HTML TITLE, list the title;
            # if the file has no title but the body begins with an HTML H1, H2, or H3, list that;
            # otherwise use the first three words of the text
            for key2 in clusters[key]:
                soup = BeautifulSoup(urllib2.urlopen(key2))
                print "<a href=\"" + key2 + "\">"
                if soup.find("title"):
                    print soup.find("title").string.encode('ascii', 'ignore') + "</a><br>"
                elif soup.find("h1"):
                    print soup.find("h1").string.encode('ascii', 'ignore') + "</a><br>"
                elif soup.find("h2"):
                    print soup.find("h2").string.encode('ascii', 'ignore') + "</a><br>"
                elif soup.find("h3"):
                    print soup.find("h3").string.encode('ascii', 'ignore') + "</a><br>"
                else:
                    text = soup.get_text().encode("utf-8")
                    array = text.split()
                    print array[0], " ", array[1], " ", array[2] + "</a><br>"
def search_query_with_relevance_feedback(self, query, feedback_qrels, num_returns=50, add_num=1):
    query_text = query["description"]
    print(query_text)
    query_text = " ".join(tokenizer.tokenize(query_text))
    query_text = self.remove_stopwords(query_text.lower())
    print(query_text)
    query_number = query["Number"]
    qrel_doc_ids = [qrel["docno"] for qrel in feedback_qrels if qrel["qid"] == query_number]
    final_list = []
    term_tf_idf = {}
    doc_count = len(qrel_doc_ids)
    for qrel_doc_id in qrel_doc_ids:
        initial_hit = self.feedback_searcher.search(TermQuery(Term(".U", qrel_doc_id)), 1).scoreDocs
        if len(initial_hit) == 0:
            continue
        assert len(initial_hit) == 1
        termVector = self.reader.getTermVector(initial_hit[0].doc, "text")
        terms_enum = termVector.iterator()
        termsref = BytesRefIterator.cast_(terms_enum)
        N_terms = 0
        term_idf = {}
        term_freq = {}
        term_list = []
        while termsref.next():
            termval = TermsEnum.cast_(termsref)
            termText = termval.term().utf8ToString()
            if termText in self.stopwords:
                continue
            tc = termval.totalTermFreq()
            if termText in term_freq:
                term_freq[termText] += tc
            else:
                term_freq[termText] = tc
            if termText in term_idf:
                term_idf[termText] += 1
            else:
                term_idf[termText] = 1
            if termText not in term_list:
                term_list.append(termText)
            N_terms = N_terms + 1
        for term in term_list:
            if term in term_tf_idf:
                term_tf_idf[term] += term_freq[term] / N_terms * (1 + math.log(doc_count / (term_idf[term] + 1)))
            else:
                term_tf_idf[term] = term_freq[term] / N_terms * (1 + math.log(doc_count / (term_idf[term] + 1)))
    sorted_tf_idf = sorted(term_tf_idf.items(), key=lambda x: x[1], reverse=True)
    for each in sorted_tf_idf:
        if each[0] not in self.stopwords and not str(each[0]).isnumeric() and each[0] not in query_text.split(" "):
            final_list.append(each[0])
    print(final_list[:add_num])
    query_text = query_text + " " + " ".join(final_list[:add_num])
    query_text = " ".join(query_text.split(" "))
    print(query_text)
    query_search = self.parser.parse(query_text)
    results = self.searcher.search(query_search, num_returns)
    hits = results.scoreDocs
    trec_results = []
    for rank, hit in enumerate(hits):
        doc = self.searcher.doc(hit.doc)
        trec_result = {
            "QueryID": query["Number"],
            "Q0": "Q0",
            "DocID": doc.get(".U"),
            "Rank": str(rank + 1),
            "Score": str(hit.score),
            "RunID": self.similarity
        }
        trec_results.append(trec_result)
    return trec_results
def getSilhouette(reader, searcher, analyzer, searchTerm, K):
    # get the user's query and initialize some dictionaries
    command = searchTerm
    # for each word, record how many matched documents contain it
    termOccurrence = {}
    # inverse document frequency for each word
    idf = {}
    # term frequency for each word in each matched document
    tf = {}
    # the product of tf and idf for each word in each matched document
    tfidf = {}
    # the number of words in each matched document
    docLength = {}
    # for each kind of word in each document, record how many times it appeared in the document
    termOccurrenceInADoc = {}
    # record every kind of word in the matched documents as a key; the value of this dictionary doesn't matter
    allWords = {}
    # parse the user query
    query = QueryParser("contents", analyzer).parse(command)
    scoreDocs = searcher.search(query, 50).scoreDocs
    # the total number of matched documents
    totalDocs = 0
    # for each matched document, calculate the normalized term frequency of all its terms
    for scoreDoc in scoreDocs:
        totalDocs = totalDocs + 1
        # get the url and term vectors of the matched document
        doc = searcher.doc(scoreDoc.doc)
        vectors = reader.getTermVector(scoreDoc.doc, "contents")
        enum = vectors.iterator()
        url = doc.get("url")
        # record the url of the matched document, create a nested dictionary for term occurrences of a word in a document
        termOccurrenceInADoc[url] = {}
        # record the url of the matched document, create a nested dictionary for term frequency
        tf[url] = {}
        # for each term in the matched document, record the term's occurrences and compute the total word count of the document
        for term in BytesRefIterator.cast_(enum):
            term2 = term.utf8ToString()
            # record the term
            allWords[term2] = 1
            # increase the term occurrence by one
            if termOccurrence.has_key(term2):
                termOccurrence[term2] = termOccurrence[term2] + 1
            else:
                termOccurrence[term2] = 1
            dpEnum = enum.postings(None)
            dpEnum.nextDoc()
            # get the occurrence count of the term
            freq = dpEnum.freq()
            # add the occurrences of the term to the total word count of the document
            if docLength.has_key(url):
                docLength[url] = docLength[url] + freq
            else:
                docLength[url] = freq
            # record the occurrences of this term in the corresponding document
            termOccurrenceInADoc[url][term2] = freq
        # for each term in each document, divide its occurrences by the total length of the document to get the normalized term frequency
        for key in termOccurrenceInADoc[url]:
            tf[url][key] = termOccurrenceInADoc[url][key] / float(docLength[url])
    # calculate idf for each term
    for key in termOccurrence:
        idf[key] = math.log(float(totalDocs) / termOccurrence[key], 10)
    # calculate the product of tf and idf for each term in each document
    for key in tf:
        tfidf[key] = {}
        for word in allWords:
            if tf[key].has_key(word):
                tfidf[key][word] = tf[key][word] * idf[word]
    k = K
    # this records the attributes of all centroids
    centroids = {}
    # this records all urls of each cluster
    clusters = {}
    # copy the tfidf dictionary, because some keys need to be deleted to make sure different initial centroids are chosen
    copy = tfidf.copy()
    # randomly choose k vectors as initial centroids
    for counter in range(1, k + 1):
        if len(copy) == 0:
            break
        key = random.choice(copy.keys())
        centroids[counter] = tfidf[key].copy()
        clusters[counter] = {}
        del copy[key]
    # this dictionary records the clusters of the previous iteration; it is compared with the new clusters to judge whether the clusters changed
    oldClusters = {}
    # do at most 100 iterations; if the clusters no longer change before 100 iterations, break the loop
    for counter2 in range(1, 101):
        # for each vector, find its closest centroid and put the vector into the corresponding cluster
        for key in tfidf:
            counter = 1
            # for each centroid, calculate the euclidean distance between it and the vector, and find the centroid with the shortest euclidean distance
            for centroid in centroids:
                # for the first centroid, simply treat it as the closest for now
                if counter == 1:
                    closestDistance = 0
                    for attribute in tfidf[key]:
                        closestDistance = closestDistance + (tfidf[key].get(attribute, 0) - centroids[centroid].get(attribute, 0)) ** 2
                    closestDistance = math.sqrt(closestDistance)
                    closestCentroid = centroid
                # for the other centroids, check whether their euclidean distance is shorter than the temporary closest; if so, replace the closest centroid
                else:
                    temp = 0
                    for attribute in tfidf[key]:
                        temp = temp + (tfidf[key].get(attribute, 0) - centroids[centroid].get(attribute, 0)) ** 2
                    temp = math.sqrt(temp)
                    if temp < closestDistance:
                        closestDistance = temp
                        closestCentroid = centroid
                counter = counter + 1
            # put the url of the document into the new cluster; the value of the dictionary doesn't matter
            clusters[closestCentroid][key] = 1
        # after calculating the new clusters, clear the old centroids
        centroids = {}
        # for each cluster, calculate the new centroid
        for cluster in clusters:
            docNumber = 0
            # initialize a nested dictionary for each cluster's centroid
            centroids[cluster] = {}
            # add every vector of the cluster together
            for document in clusters[cluster]:
                docNumber = docNumber + 1
                for term in allWords:
                    centroids[cluster][term] = centroids[cluster].get(term, 0) + tfidf[document].get(term, 0)
            # if the cluster is not empty, divide the sum of all vectors by the number of vectors; the result is the new centroid
            if docNumber == 0:
                newCentroid = random.choice(tfidf.keys())
                centroids[cluster] = tfidf[newCentroid].copy()
            else:
                for term in allWords:
                    centroids[cluster][term] = float(centroids[cluster][term]) / docNumber
        # if this is not the first iteration, compare the new clusters with the old clusters; if they are exactly the same, break the loop
        if counter2 != 1:
            completelySame = True
            for cluster in clusters:
                for document in clusters[cluster]:
                    if oldClusters[cluster].has_key(document) == False:
                        completelySame = False
                        break
            if completelySame == True:
                break
        # copy every new cluster, to be compared with the clusters of the next iteration
        for cluster in clusters:
            oldClusters[cluster] = clusters[cluster].copy()
            clusters[cluster].clear()
    # calculate the average silhouette coefficient of all vectors
    silhouette = 0
    numberOfDocuments = 0
    # if no document was found or there is only one cluster, treat the silhouette as 0
    if len(clusters) <= 1:
        return 0
    averageSI = 0
    counter3 = 0
    # for each vector, calculate the average euclidean distance between it and the other vectors in the same cluster (call it ai),
    # and the lowest average euclidean distance between it and the vectors of another cluster (call it lowestBi),
    # then use ai and lowestBi to calculate the silhouette coefficient of the vector
    for key in clusters:
        if len(clusters[key]) != 0:
            for key2 in clusters[key]:
                si = 0
                # if there is only one vector in this cluster, treat its silhouette coefficient as 0
                if len(clusters[key]) == 1:
                    si = 0
                else:
                    ai = 0
                    bi = {}
                    biCounter = {}
                    counter1 = 0
                    # for the other vectors: if they are in the current vector's cluster, add the euclidean distance to ai;
                    # if they are in another cluster, find that cluster and add the euclidean distance (value) and cluster (key) to bi
                    for otherVector in tfidf:
                        if otherVector != key2:
                            if clusters[key].has_key(otherVector):
                                euclidean = 0
                                for attribute in tfidf[otherVector]:
                                    euclidean = euclidean + (tfidf[otherVector].get(attribute, 0) - tfidf[key2].get(attribute, 0)) ** 2
                                euclidean = math.sqrt(euclidean)
                                ai = ai + euclidean
                                counter1 = counter1 + 1
                            else:
                                thisCluster = 0
                                # find which cluster this vector belongs to
                                for key3 in clusters:
                                    if clusters[key3].has_key(otherVector):
                                        thisCluster = key3
                                        break
                                euclidean = 0
                                for attribute in tfidf[otherVector]:
                                    euclidean = euclidean + (tfidf[otherVector].get(attribute, 0) - tfidf[key2].get(attribute, 0)) ** 2
                                euclidean = math.sqrt(euclidean)
                                bi[thisCluster] = bi.get(thisCluster, 0) + euclidean
                                biCounter[thisCluster] = biCounter.get(thisCluster, 0) + 1
                    ai = ai / float(counter1)
                    lowestBi = 0
                    counter4 = 1
                    # find the lowest average euclidean distance between this vector and the other clusters
                    for key3 in bi:
                        bi[key3] = bi[key3] / float(biCounter[key3])
                        if counter4 == 1:
                            lowestBi = bi[key3]
                        else:
                            if bi[key3] < lowestBi:
                                lowestBi = bi[key3]
                        counter4 = counter4 + 1
                    # now we have ai and the lowest bi, calculate the silhouette coefficient of this vector
                    if ai == 0 or lowestBi == 0:
                        si = 0
                    else:
                        if ai < lowestBi:
                            si = 1 - ai / lowestBi
                        elif ai == lowestBi:
                            si = 0
                        else:
                            si = lowestBi / ai - 1
                averageSI = averageSI + si
                counter3 = counter3 + 1
    # calculate the average silhouette coefficient of all vectors
    averageSI = averageSI / float(counter3)
    return averageSI
def get_terms(self, docid, field):
    terms = self.reader.getTermVector(docid, field)
    te_itr = terms.iterator()
    return [brf.utf8ToString() for brf in BytesRefIterator.cast_(te_itr)]
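# Defensive variant (a suggestion, not from the original source): getTermVector
# returns None for documents whose field was indexed without term vectors, so a
# guard avoids an AttributeError in that case; the function name is hypothetical.
def get_terms_safe(reader, docid, field):
    terms = reader.getTermVector(docid, field)
    if terms is None:
        return []
    return [brf.utf8ToString() for brf in BytesRefIterator.cast_(terms.iterator())]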
def search_query(self, query, num_returns=50, use_multipass_pseudo_relevance_feedback=False, doc_counts=None, add_nums=None):
    query_text = query["description"]
    print(query_text.lower())
    query_text = " ".join(tokenizer.tokenize(query_text))
    query_text = self.remove_stopwords(query_text.lower())
    print(query_text)
    query_search = self.parser.parse(query_text)
    if use_multipass_pseudo_relevance_feedback:
        if doc_counts is None:
            doc_counts = [5, 9]
        if add_nums is None:
            add_nums = [2, 13]
        assert len(doc_counts) == len(add_nums), "The number of pass is inconsistent!"
        for doc_count, add_num in zip(doc_counts, add_nums):
            final_list = []
            initial_hits = self.searcher.search(query_search, doc_count).scoreDocs
            term_tf_idf = {}
            for initial_hit in initial_hits:
                termVector = self.reader.getTermVector(initial_hit.doc, "text")
                terms_enum = termVector.iterator()
                termsref = BytesRefIterator.cast_(terms_enum)
                N_terms = 0
                term_idf = {}
                term_freq = {}
                term_list = []
                while termsref.next():
                    termval = TermsEnum.cast_(termsref)
                    termText = termval.term().utf8ToString()
                    if termText in self.stopwords:
                        continue
                    tc = termval.totalTermFreq()
                    if termText in term_freq:
                        term_freq[termText] += tc
                    else:
                        term_freq[termText] = tc
                    if termText in term_idf:
                        term_idf[termText] += 1
                    else:
                        term_idf[termText] = 1
                    if termText not in term_list:
                        term_list.append(termText)
                    N_terms = N_terms + 1
                for term in term_list:
                    if term in term_tf_idf:
                        term_tf_idf[term] += term_freq[term] / N_terms * (1 + math.log(doc_count / (term_idf[term] + 1)))
                    else:
                        term_tf_idf[term] = term_freq[term] / N_terms * (1 + math.log(doc_count / (term_idf[term] + 1)))
            sorted_term_tf_idf = sorted(term_tf_idf.items(), key=lambda x: x[1], reverse=True)
            for each in sorted_term_tf_idf:
                if each[0] not in self.stopwords:
                    final_list.append(each[0])
            print("added query tokens:", final_list[:add_num])
            query_text = query_text + " " + " ".join(final_list[:add_num])
            query_search = self.parser.parse(query_text)
    results = self.searcher.search(query_search, num_returns)
    hits = results.scoreDocs
    trec_results = []
    for rank, hit in enumerate(hits):
        doc = self.searcher.doc(hit.doc)
        trec_result = {
            "QueryID": query["Number"],
            "Q0": "Q0",
            "DocID": doc.get(".U"),
            "Rank": str(rank + 1),
            "Score": str(hit.score),
            "RunID": self.similarity + "-mpprf-" + str(len(doc_counts)) + "passes"
                     if use_multipass_pseudo_relevance_feedback else self.similarity
        }
        trec_results.append(trec_result)
    return trec_results