def test_getFieldInfos(self):
    """Index the test document, then verify the merged FieldInfos.

    Checks that every field name is one of the expected fields, and that
    the subset of indexed fields (with or without term vectors) is
    restricted to the expected indexed names.
    """
    self.test_indexDocument()

    store = self.openStore()
    reader = None
    try:
        reader = DirectoryReader.open(store)
        fieldInfos = MultiFields.getMergedFieldInfos(reader)
        for fieldInfo in fieldInfos.iterator():
            # Every field in the index must be one we put there.
            # Using assertIn instead of the deprecated assert_ alias
            # gives a useful message naming the offending field.
            self.assertIn(fieldInfo.name,
                          ['owner', 'search_name', 'meta_words',
                           'docid', 'title'])

            # Only these fields are expected to be indexed
            # ('search_name' is absent from this list).
            if fieldInfo.isIndexed():
                self.assertIn(fieldInfo.name,
                              ['owner', 'meta_words', 'docid', 'title'])

            # Indexed fields without term vectors: same expected set.
            if fieldInfo.isIndexed() and not fieldInfo.hasVectors():
                self.assertIn(fieldInfo.name,
                              ['owner', 'meta_words', 'docid', 'title'])
    finally:
        store = self.closeStore(store, reader)
def test_getFieldInfos(self):
    """Check the merged FieldInfos of a freshly indexed test document.

    NOTE(review): this is an exact duplicate of a ``test_getFieldInfos``
    defined earlier in this file; in Python the later definition shadows
    the earlier one, so only this copy ever runs.
    """
    self.test_indexDocument()
    store = self.openStore()
    reader = None
    try:
        reader = DirectoryReader.open(store)
        # Hoist the expected-name lists so each check reads clearly.
        all_names = ['owner', 'search_name', 'meta_words', 'docid', 'title']
        indexed_names = ['owner', 'meta_words', 'docid', 'title']
        for info in MultiFields.getMergedFieldInfos(reader).iterator():
            self.assert_(info.name in all_names)
            if info.isIndexed():
                self.assert_(info.name in indexed_names)
            if info.isIndexed() and not info.hasVectors():
                self.assert_(info.name in indexed_names)
    finally:
        store = self.closeStore(store, reader)
def main():
    """Search a Lucene index for a term and score document sentiment.

    Command line: ``python search_docs.py TERM redo(y/n) window_size``.
    Loads the sentiment dictionary and spell-check replacement table,
    runs the Lucene query, scores each hit with ``sa.single_doc``, and
    pickles the resulting dataframe and per-document word lists.
    """
    # Constants.
    FIELD_CONTENTS = "text"
    DOC_NAME = "identifier"
    STORE_DIR = "./full_index1"

    # Take search term as command line argument.
    if len(sys.argv) != 4:
        print(
            'Format should be: python search_docs.py, [term to search for], redo? y/n, window_size'
        )
        exit(0)

    # Parse user input.
    TERM = sys.argv[1]
    # Fixed: was the redundant `True if sys.argv[2] == 'y' else False`.
    remake_df = sys.argv[2] == 'y'
    window_size = int(sys.argv[3])

    # Other options.
    stem_flag = True
    spell_check_flag = False

    # Get dataframe and sentiment dictionary.
    doc_data = get_doc_df(remake_df)
    SA_dict = get_dict(stem_flag)

    print('Searching for: "' + TERM + '"')
    sa_term = []
    date_range = (1791, 1800)
    method = 'linear'  # vs 1/x
    example_flag = False

    full_dict, modern_dict, map_chars, charlist = sp_ch.load_clean_word_list()

    ### replacement table
    # Fixed: `with` closes the pickle file (was a leaked file handle).
    with open('./spellcheck/rep_table.pkl', 'rb') as rep_file:
        rep_data = pickle.load(rep_file)
    print(rep_data)
    rep_table = rep_data['rep_table']
    charlist = rep_data['charlist']
    # Older pickles store the char map under 'map_chars', newer ones
    # under 'charmap'. Fixed: bare `except:` narrowed to KeyError so
    # unrelated errors are no longer silently swallowed.
    try:
        map_chars = rep_data['charmap']
    except KeyError:
        map_chars = rep_data['map_chars']

    # For each source character keep its top_n most likely replacements.
    top_n = 4
    top_replacements = {}
    for cf, from_letter in enumerate(rep_table):
        # Indices of replacement counts, most frequent first.
        sort_idx = np.argsort(from_letter)[::-1]
        top_rep = sort_idx[:top_n]
        top_replacements[charlist[cf]] = [charlist[char] for char in top_rep]

    if 1:  # not glob.glob('./pickles/%s_df.pkl'%TERM):
        lucene.initVM()
        searcher, reader, query = define_search_params(
            STORE_DIR, FIELD_CONTENTS, TERM)

        fieldInfos = MultiFields.getMergedFieldInfos(reader)
        print(fieldInfos)
        for fieldInfo in fieldInfos.iterator():
            print(fieldInfo.name)

        # Run the query and get documents that contain the term.
        docs_containing_term = searcher.search(query, reader.numDocs())
        print('Found ' + str(len(docs_containing_term.scoreDocs)) +
              ' documents with the term "' + TERM + '".')
        print('Calculating sentiment scores...')
        term_words = []
        for hit in tqdm(docs_containing_term.scoreDocs):
            doc = searcher.doc(hit.doc)
            # Get the text from each document.
            doc_text = doc.get("text")
            # single_doc returns the score data for a single document and
            # a list of words appearing in that document's term windows.
            score_data, doc_words = sa.single_doc(
                TERM, doc_text, SA_dict, full_dict, top_replacements,
                window_size, spell_check_flag, example_flag, stem_flag,
                method)
            term_words.append((doc.get(DOC_NAME).split('/')[-1], doc_words))
            sa_doc_score = [doc.get(DOC_NAME)] + score_data
            sa_term.append(sa_doc_score)

        sa_df = a_sa.make_sa_df(doc_data, sa_term, TERM)
        # Fixed: `with` closes the output files (were leaked handles).
        with open('./pickles/%s_df.pkl' % TERM, 'wb') as df_file:
            pickle.dump(sa_df, df_file)
        with open('./pickles/%s_words.pkl' % TERM, 'wb') as words_file:
            pickle.dump(term_words, words_file)
    else:
        sa_df = doc_data

    print(sa_df)

    # Process dataframe for various properties
    # (split this into specific functions later).
    use_weighted = True
    total_doc = False
# titles = index.get_documents(ids, ["id", "title"]) print "\n".join(map(str, docs)) sys.exit() fields = [ DocField("id", stored=True, indexed=True), DocField("text", stored=True, indexed=True) ] index = Index(fields=fields) texts = [ "just writing ", "what ever dude", "el dudino", "your dude", "the Dude" ] for i, text in enumerate(texts): index.add(id='doc_%d' % (i + 1), text=text) index.commit() ids, scores = index.search("dude+ever", ["text"], limit=10) print index.get_documents(ids, "id") # Try out some filters filter = TermsFilter([Term("id", "doc_2")]) ids, scores = index.search("dude+ever", ["text"], filter, limit=10) print index.get_documents(ids, "id") fields = MultiFields.getMergedFieldInfos(index.reader).iterator() for f in fields: print f.attributes() # print filter.getDocIdSet(index.reader)