def index():
    if request.method == 'GET':
        return render_template('index.html')

    abstract = '<strong>Lorem ipsum</strong> dolor sit amet, consectetur adipiscing elit. Suspendisse vitae purus sit amet magna iaculis rhoncus. Aenean ullamcorper nibh vitae lacus commodo condimentum. Aenean ornare pharetra est id porttitor. <strong>Lorem ipsum</strong> dolor sit amet, consectetur adipiscing elit. Morbi eu ante sed arcu maximus imperdiet. Phasellus id nisl quis sem consectetur sagittis. Duis placerat nisi ut nisl condimentum ornare. Sed pulvinar arcu nisl, eu faucibus dui tincidunt aliquam. Aliquam malesuada faucibus nisl, et malesuada turpis sagittis nec. Aliquam id pretium augue.'

    start_time = timeit.default_timer()
    query = request.form['query']

    dproc = DocProcessor()
    dproc.prep_query(query)

    iidx = InvertedIndex(
        '/home/ubuntu/eecs767/var/wikipedia-3833/term.dct',
        '/home/ubuntu/eecs767/var/wikipedia-3833/doc.lst'
    )
    dlist = DocList('/home/ubuntu/eecs767/var/wikipedia-3833/doc.lst')

    results = []
    if '_enhanced' in request.form:
        rel_docs = iidx.enhanced_query(dproc.tokens)
        # Rank by cosine similarity, then term proximity, then inverse window location.
        ranked_docs = sorted(rel_docs, key=itemgetter('cos_sim', 'term_prox', 'i_win_loc'), reverse=True)
        for doc in ranked_docs[:10]:
            results.append({
                'url': dlist[doc['did']]['url'],
                'title': dlist[doc['did']]['title'],
                'abstract': abstract,
                'cos_sim': doc['cos_sim'],
                'term_prox': doc['term_prox'],
                'win_loc': int(round(1 / doc['i_win_loc'])),
                'fscore': doc['fscore']
            })
    else:
        rel_docs = iidx.query(dproc.tokens)
        ranked_docs = sorted(rel_docs.items(), key=itemgetter(1), reverse=True)
        for doc in ranked_docs[:10]:
            results.append({
                'url': dlist[doc[0]]['url'],
                'title': dlist[doc[0]]['title'],
                'score': doc[1],
                'abstract': abstract
            })

    elapsed_time = timeit.default_timer() - start_time

    return render_template(
        'index.html',
        query=query,
        results=results,
        elapsed_time=round(elapsed_time, 3),
        total_docs=len(dlist)
    )
def write(plist, url, title, config):
    '''
    RQ worker function which adds the given document posting list data
    to the inverted index.
    '''
    MAX_DOCS = int(config.get('crawler', 'max_docs'))
    TERM_DICT_FILE = config.get('indexer', 'term_dict_file')
    DOC_LIST_FILE = config.get('indexer', 'doc_list_file')

    dl = DocList(DOC_LIST_FILE)
    if len(dl) < MAX_DOCS:
        did = md5(url).hexdigest()
        if did not in dl:
            dl.append(url, title)
            iidx = InvertedIndex(TERM_DICT_FILE, DOC_LIST_FILE)
            iidx.append(plist, did)
            iidx.update()
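# write() above is documented as an RQ worker function. For context, a minimal,
# hypothetical sketch of the producer side follows; the queue name 'indexing'
# and the module path 'worker.write' are assumptions, not taken from the
# project, and plist/url/title/config would come from the crawler.
from redis import Redis
from rq import Queue

q = Queue('indexing', connection=Redis())
# A separate `rq worker indexing` process picks the job up and runs
# write(plist, url, title, config).
q.enqueue('worker.write', plist, url, title, config)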
def main():
    alphabets = [chr(x) for x in range(ord('a'), ord('z') + 1)]
    alphabets.append("num")
    N = 55393  # total number of documents in the collection

    for i in alphabets:
        f = open(i + ".txt", 'rb')
        current_indexer = InvertedIndex()
        current_indexer.merge(pickle.load(f))
        f.close()

        # Calculate the tf-idf score for each term in this partial index.
        for token, dictionary in current_indexer.getDict().items():
            df = len(dictionary.keys())
            current_dict = dict()
            for docID, tf in dictionary.items():
                current_dict[docID] = round(tf * math.log(N / df, 10), 4)
            current_indexer.getDict()[token] = current_dict

        # Store the results into tf_score_<letter>.txt.
        f = open("tf_score_" + i + ".txt", 'wb')
        pickle.dump(current_indexer.getDict(), f)
        f.close()
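# The weighting above is standard tf-idf with a base-10 logarithm:
# w = tf * log10(N / df). A small self-contained illustration with made-up
# numbers (tf and df are hypothetical; N matches the snippet above):
import math

N = 55393       # collection size
tf, df = 3, 10  # hypothetical term frequency and document frequency
w = round(tf * math.log(N / df, 10), 4)
print(w)        # rarer terms (smaller df) receive a larger idf boost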
def main():
    final_result = InvertedIndex()

    # Load all partial indexes into one InvertedIndex (the class object for the index).
    for i in range(88):
        f = open("indexer" + str(i) + ".txt", 'rb')
        final_result.merge(pickle.load(f))
        f.close()

    # One partial dictionary per letter plus one for numbers: 26 + 1 = 27.
    alphanumIndex = [dict() for i in range(27)]

    for i in final_result.getDict():
        if 97 <= ord(i[0]) <= 122:
            # The key starts with an English letter: the first index selects
            # which partial dictionary the key belongs to, the second maps the
            # key to its value from the merged index.
            alphanumIndex[ord(i[0]) - 97][i] = final_result.getDict()[i]
        else:
            # The key does not start with a letter: insert into the number index.
            alphanumIndex[26][i] = final_result.getDict()[i]

    # Create a partial file for each letter and dump its dict into that file.
    ASCII_code = 97  # ASCII code of 'a'
    for i in range(len(alphanumIndex)):  # == 27
        if i == len(alphanumIndex) - 1:
            filename = "num"  # the last slot is reserved for numbers
        else:
            filename = chr(ASCII_code + i)
        fileObject = open(f"{filename}.txt", "wb")
        pickle.dump(alphanumIndex[i], fileObject)
        fileObject.close()
from operator import itemgetter

from indexer import InvertedIndex, DocList
from tokenizer import DocProcessor

# Trial queries; only the last assignment is actually used.
query = 'mary had a little lamb whos fleece was white as snow'
query = 'tom colwell'
query = 'information retrieval'
query = 'asian women alumni'

dproc = DocProcessor()
dproc.prep_query(query)

iidx = InvertedIndex(
    #'/home/ubuntu/eecs767/var/ku/term.dct',
    #'/home/ubuntu/eecs767/var/ku/doc.lst',
    '/home/ubuntu/eecs767/var/wikipedia/term.dct',
    '/home/ubuntu/eecs767/var/wikipedia/doc.lst',
)

rel_docs = iidx.query(dproc.tokens)
ranked_docs = sorted(rel_docs.items(), key=itemgetter(1), reverse=True)

dlist = DocList('/home/ubuntu/eecs767/var/wikipedia/doc.lst')
for doc in ranked_docs[:10]:
    print '%s: %s' % (doc[1], dlist[doc[0]])
import csv

from indexer import InvertedIndex

with open('/home/ubuntu/eecs767/var/sample.csv') as f:
    term_list = []
    for row in csv.DictReader(f, skipinitialspace=True):
        term_list.append(row)

tdict = InvertedIndex()
tdict.build(term_list)
tdict.write()

for term in sorted(tdict):
    tnode = tdict[term]['tnode']
    pl_str = ''
    for p in tnode.plist:
        pl_str += ' -> %s x %s (%.3f)' % (p['did'], p['tf'], p['w'])
    print '(%s) %s (tf:%s; df:%s; idf:%.3f):%s' % (
        tdict[term]['loc'], tnode.term, tnode.tf, tnode.df, tnode.idf, pl_str
    )
        return [unprocessed_location, most_similar_documents,
                document_titles, document_snapshots]
    except Exception as e:
        raise


if __name__ == "__main__":
    try:
        # The actual name of the folder containing the processed files.
        doc_basename = "docsnew"
        doc_location = "../file_cache/processed/" + doc_basename

        dp = DPClass()
        #dp.runDocProc("../file_cache/unprocessed/" + doc_basename)

        iic = InvertedIndexClass()
        #iic.createInvertedIndex("../file_cache/processed/docsnew")
        #iic.createInvertedIndex("../file_cache/processed/testdoc")
        iic.loadInvertedIndex(doc_location)

        vsm = VSMClass(iic, doc_basename)
        vsm.createEntireModel()

        stemmer = PorterStemmer()

        continueLoop = True
        print("Welcome to the Search Engine\n")
        while continueLoop:
            fromUser = ""
            user_query = ""
            print("\n\nSelect from the Following Options:\n\t1.) Search\n\t2.) Exit")
from tokenizer import DocProcessor
from indexer import InvertedIndex

docs = {
    1: '/home/ubuntu/eecs767/var/docs/doc1.html',
    2: '/home/ubuntu/eecs767/var/docs/doc2.html',
    3: '/home/ubuntu/eecs767/var/docs/doc3.html',
    4: '/home/ubuntu/eecs767/var/docs/doc4.html',
    5: '/home/ubuntu/eecs767/var/docs/doc5.html',
}

dproc = DocProcessor()
iidx = InvertedIndex()

for did, doc in docs.iteritems():
    print '-- Processing Doc #%s: %s' % (did, doc)
    dproc.parse(doc)
    plist = dproc.gen_posting_list()
    iidx.append(plist, did)

# Update the index once all documents have been appended, then clear it.
iidx.update()
iidx.clear()
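# For reference, the diagnostic printout in the sample.csv snippet earlier
# (' -> %s x %s (%.3f)' over p['did'], p['tf'], p['w']) suggests each posting
# is a dict carrying a document id, term frequency, and weight. A hypothetical
# posting list for a single term might look like this (values are made up):
plist = [
    {'did': 1, 'tf': 4, 'w': 0.512},
    {'did': 3, 'tf': 1, 'w': 0.128},
]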
from operator import itemgetter

from indexer import InvertedIndex, DocList
from tokenizer import DocProcessor

# Trial queries; only the last assignment is actually used.
query = 'mary had a little lamb whos fleece was white as snow'
query = 'tom colwell'
query = 'asian women alumni'
query = 'information retrieval'

dproc = DocProcessor()
dproc.prep_query(query)

iidx = InvertedIndex(
    #'/home/ubuntu/eecs767/var/ku/term.dct',
    #'/home/ubuntu/eecs767/var/ku/doc.lst',
    '/home/ubuntu/eecs767/var/wikipedia/term.dct',
    '/home/ubuntu/eecs767/var/wikipedia/doc.lst',
)

rel_docs = iidx.enhanced_query(dproc.tokens)
cos_ranked_docs = sorted(rel_docs, key=itemgetter('cos_sim'), reverse=True)
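# The 'cos_sim' key used for ranking above is a cosine-similarity score.
# Below is a minimal sketch of the textbook formula over sparse term-weight
# dicts; it is the standard definition, not necessarily how
# InvertedIndex.enhanced_query computes the score internally.
import math

def cosine_similarity(qvec, dvec):
    # Dot product over the query's terms; terms absent from the doc contribute 0.
    dot = sum(w * dvec.get(t, 0.0) for t, w in qvec.items())
    qnorm = math.sqrt(sum(w * w for w in qvec.values()))
    dnorm = math.sqrt(sum(w * w for w in dvec.values()))
    return dot / (qnorm * dnorm) if qnorm and dnorm else 0.0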
        return [unprocessed_location, most_similar_documents,
                document_titles, document_snapshots]
    except Exception as e:
        raise


if __name__ == "__main__":
    output = {}
    try:
        # The actual name of the folder containing the processed files.
        doc_basename = "newly_crawled"
        doc_location = "../file_cache/processed/" + doc_basename

        dp = DPClass()
        iic = InvertedIndexClass()
        iic.loadInvertedIndex("../file_cache/processed/" + doc_basename)

        stemmer = PorterStemmer()
        english_file = open("./nltk-3.3/nltk_data/corpora/stopwords/english",
                            "r", encoding="UTF8")
        english_words = english_file.read().strip().split()
        english_file.close()

        if len(sys.argv) < 2:
            output = {"ERROR MESSAGE": "You Need to Give a Search Term"}
        else:
            arguments = ""
            query = []
            # Argument 0 is the file name.
from indexer import InvertedIndex

tdict = InvertedIndex()
tdict.init_index()

for term in sorted(tdict):
    print '(%s) %s' % (tdict[term]['loc'], term)
def main():
    # Define command line parameters.
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--documents_folder', action='store')
    parser.add_argument('-o', '--index_output_file', action='store')
    parser.add_argument('-c', '--docid_map_output_file', action='store')
    parser.add_argument('-i', '--index_file', action='store')
    parser.add_argument('-b', '--docid_map_file', action='store')
    parser.add_argument('-q', '--evaluate_queries', action='store_true')
    parser.add_argument('-a', '--queries_result_file', action='store')
    parser.add_argument('-p', '--part_three_file', action='store')

    # Parse command line parameters.
    args = parser.parse_args()
    print('received the following arguments')
    for k, v in vars(args).items():
        print(k, v)

    inverted_index = None

    # Build the index if requested.
    if args.documents_folder is not None:
        inverted_index = build_inverted_index(args.documents_folder)

    # Export the inverted index to files if requested.
    if args.index_output_file is not None and args.docid_map_output_file is not None:
        inverted_index.index_to_file(args.index_output_file)
        inverted_index.docidmap_to_file(args.docid_map_output_file)

    # Load the inverted index from files.
    if args.index_file is not None and args.docid_map_file is not None:
        inverted_index = InvertedIndex(args.index_file, args.docid_map_file)

    # Write query results to a file.
    if args.evaluate_queries and inverted_index and args.queries_result_file:
        queries_results = BooleanRetrieval(inverted_index)
        with open(args.queries_result_file, 'w') as file:
            file.write('\n'.join(queries_results))

    # Write the homework part 3 answers to a file.
    if args.part_three_file and inverted_index:
        answer = ''
        top = 10
        bottom = 10

        # Write the top 10 terms with the highest document frequency.
        top_df = inverted_index.get_top_df_ids(top)
        answer += '--------------------------------------\n'
        answer += 'Top {} df terms:\n'.format(top)
        answer += '\n'.join(['{}: {}'.format(term, df) for term, df in top_df])

        # Write the top 10 terms with the lowest document frequency.
        bottom_df = inverted_index.get_bottom_df_ids(bottom)
        answer += '\n--------------------------------------\n'
        answer += 'Bottom {} df terms:\n'.format(bottom)
        answer += '\n'.join(['{}: {}'.format(term, df) for term, df in bottom_df])

        # Explain the different characteristics of the above two sets of terms.
        answer += '\n--------------------------------------\n'
        answer += 'The different characteristics of the above two sets of terms:\n'
        answer += ' Top-df terms are the most common terms in the collection, found in a large number of documents\n'
        answer += ' Bottom-df terms are the rarest terms in the collection, found in only a few documents\n'
        answer += ' * our inverted index does not keep track of term frequency inside a document, ' \
                  'so the df of a term is the number of documents in which the term appears ' \
                  'at least once\n'

        with open(args.part_three_file, 'w') as file:
            file.write(answer)
from indexer import InvertedIndex

iidx = InvertedIndex()
iidx.calc_scores()
iidx.write()