def generate_snippet(self, doc, query):
    """Return *doc* with every word that also appears in *query* wrapped
    in double quotes, e.g. 'foo "bar" baz ' for query 'bar'.

    :param doc: document text as a single whitespace-separated string.
    :param query: query text as a single whitespace-separated string.
    :returns: the document string with query terms quoted; every word is
        followed by a single space (same output format as before).

    Fixes over the previous version:
    - all occurrences of a query term are now highlighted (the old code
      used list.index(), which finds only the first occurrence);
    - dead code removed: an unused FileAccess/stop-word lookup and an
      always-true membership test (`if each in intr` inside
      `for each in intr`).
    """
    query_terms = set(query.split())
    highlighted = []
    for word in doc.split():
        if word in query_terms:
            highlighted.append('"' + word + '"')
        else:
            highlighted.append(word)
    # Preserve the original trailing-space-per-word format so downstream
    # printing is unchanged (empty doc still yields '').
    return ''.join(word + ' ' for word in highlighted)
def phase2(model):
    """Phase 2: retrieve with the stopped corpus and expanded stopped queries.

    :param model: name of the retrieval model passed to run_all_queries().
    """
    stopper = Stopper()
    index_and_corpus = stopper.build_stopped_inverted_index()
    inv_index = index_and_corpus[0]
    total_corpus = index_and_corpus[1]

    # Expansion is seeded from the stored task-3a cosine run.
    task3a_folder = os.path.join(os.getcwd(), 'task3a')
    result_file = task3a_folder + '/' + "task3a_cosine_stopped.txt"

    retriever = Retriever()
    file_access = FileAccess()
    relevance = file_access.get_relevance_data()
    raw_queries = file_access.read_queries()

    stopped = stopper.get_stopped_queries(raw_queries)
    expander = QueryExpander(query_dict=stopped, filename=result_file,
                             clean=False)
    expanded = expander.get_expanded_queries()

    retriever.run_all_queries(inverted_index=inv_index,
                              total_corpus=total_corpus,
                              relevance_data=relevance,
                              query_dict=expanded,
                              model=model,
                              task_id="phase2",
                              notes="stopped_expanded",
                              store_queries='stopped_expanded')
def build_stopped_corpus(self): cwd = os.getcwd() clean_cacm = os.path.join(cwd, 'clean_cacm') stopped_cacm = os.path.join(cwd, 'stopped_cacm') fa = FileAccess() if not os.path.exists(clean_cacm): print "Clean corpus doesn't exist. It is created now. " \ "PLease put cleaned files inside the corpus folder" os.makedirs(clean_cacm, 0755) return if not os.path.exists(stopped_cacm): os.makedirs(stopped_cacm, 0755) stop_words = fa.get_stop_words() os.chdir(clean_cacm) for eachfile in glob.glob('*.html'): print eachfile content = open(eachfile).read() content = content.split() stopped_content = [x for x in content if x not in stop_words] final_content = " ".join(stopped_content) clean_file = open(os.path.join(stopped_cacm, eachfile), 'w') clean_file.write(final_content) clean_file.close()
def evalaution():
    """Evaluate every run file under all_runs/ (P@K, MAP, MRR, P/R).

    NOTE(review): the function name is misspelled ('evalaution') but is
    kept as-is because external callers may reference it.

    Side effect: chdir()s into all_runs/ and stays there.
    """
    precision_cutoffs = [5, 20]
    relevance = FileAccess().get_relevance_data()
    base_dir = os.getcwd()
    os.chdir(os.path.join(os.getcwd(), 'all_runs'))
    evaluator = Evaluation()
    for run_file in glob.glob('*.txt'):
        evaluator.evaluate(run_file, precision_cutoffs, base_dir, relevance)
def snippet_generation(): r = Retriever() fa = FileAccess() query_dict = fa.read_queries() query_id = raw_input('Enter the query_id: \n') if int(query_id) > 64 or int(query_id) < 1: print 'No Query exists, please enter between 1 to 64' return query = query_dict[int(query_id) - 1] print 'Query: ' + query fa = FileAccess() relevance_data = fa.get_relevance_data() corpus = r.get_corpus(True) inverted_index = corpus[0] total_corpus = corpus[1] results = r.run_all_queries(inverted_index=inverted_index, total_corpus=total_corpus, relevance_data=relevance_data, query_dict=query_dict, model='cosine', task_id="1", notes='', ret=True) results = results[0:4] snippet_dictionary = {} for each in results: docid = each[1] data = total_corpus[docid] data = " ".join(data) sg = SnippetGenerator() snippet = sg.generate_snippet(data, query) snippet_dictionary[docid] = snippet print '\n' for each in results: print 'Doc-Id: ' + each[1] print snippet_dictionary[each[1]] print '\n'
def __init__(self, filename, query_dict, top_k=12, n=5, clean=True):
    """Load the corpus, the queries, and a previously stored result file.

    :param filename: path of the result file to read back.
    :param query_dict: mapping of query id -> query text.
    :param top_k: number of top-ranked documents to consider.
    :param n: number of expansion terms.
    :param clean: True -> use the 'clean' corpus folder, False -> 'stopped'.
    """
    folder = 'clean' if clean else 'stopped'
    self.total_corpus = Retriever().get_total_corpus(folder=folder)
    self.k = top_k
    self.n = n
    self.query_dict = query_dict
    self.results = FileAccess().read_result_file(filename=filename)
def get_stopped_queries(self, query_dict):
    """Return a copy of *query_dict* with stop words removed from each query.

    :param query_dict: mapping of query id -> query string.
    :returns: dict mapping the same ids -> stop-worded query string.

    Fixes: the stop-word list is converted to a set once (the old code
    scanned a list for every word of every query), and the pointless
    `query_dict = query_dict` self-assignment is gone.
    """
    stop_words = set(FileAccess().get_stop_words())
    return {
        qid: " ".join(w for w in query_dict[qid].split()
                      if w not in stop_words)
        for qid in query_dict
    }
def task3b(model):
    """Task 3b: retrieval over the stemmed corpus with stemmed queries.

    :param model: name of the retrieval model passed to run_all_queries().
    """
    stemmer = Stemmer()
    retriever = Retriever()
    total_corpus = stemmer.build_stemmed_data()
    inv_index = stemmer.build_stemmed_index()
    fa = FileAccess()
    relevance = fa.get_relevance_data()
    stemmed_queries = fa.get_stem_queries()
    retriever.run_all_queries(inverted_index=inv_index,
                              total_corpus=total_corpus,
                              relevance_data=relevance,
                              query_dict=stemmed_queries,
                              model=model,
                              task_id="3b",
                              notes="stemmed",
                              store_queries='stemmed')
def task1(notes=''):
    """Task 1: run every model in the module-level *models* list over the
    clean corpus.

    :param notes: free-text tag forwarded to run_all_queries().
    """
    retriever = Retriever()
    fa = FileAccess()
    queries = fa.read_queries()
    corpus = retriever.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]
    relevance = fa.get_relevance_data()
    for model in models:
        retriever.run_all_queries(inverted_index=inverted_index,
                                  total_corpus=total_corpus,
                                  relevance_data=relevance,
                                  query_dict=queries,
                                  model=model,
                                  task_id="1",
                                  notes=notes)
def task3a(model):
    """Task 3a: retrieval over the stopped corpus with stopped queries.

    :param model: name of the retrieval model passed to run_all_queries().
    """
    stopper = Stopper()
    index_and_corpus = stopper.build_stopped_inverted_index()
    inv_index = index_and_corpus[0]
    total_corpus = index_and_corpus[1]
    fa = FileAccess()
    retriever = Retriever()
    raw_queries = fa.read_queries()
    relevance = fa.get_relevance_data()
    stopped = stopper.get_stopped_queries(raw_queries)
    retriever.run_all_queries(inverted_index=inv_index,
                              total_corpus=total_corpus,
                              relevance_data=relevance,
                              query_dict=stopped,
                              model=model,
                              task_id="3a",
                              notes="stopped",
                              store_queries='stopped')
def main(arg=None):
    """Entry point: dispatch on *arg*, then run the interactive command loop.

    :param arg: None -> update one hard-coded user's playlist and exit;
        any value other than "update" -> update_all() and exit;
        "update" -> fall through to the interactive prompt below.
    """
    if arg is None:
        PlaylistUpdater(user="******").update()
        sys.exit(0)
    # NOTE(review): this condition looks inverted -- one would expect
    # `arg == "update"` to trigger update_all(). As written, only
    # arg == "update" reaches the interactive loop. Confirm intent.
    if arg != "update":
        update_all()
        sys.exit(0)
    fa = FileAccess()
    print_intro_prompt()
    while True:
        print("\nEnter a command: ")
        command = input().lower()
        ## Handle Command
        # The patterns end in '<letter>*' (zero or more of that letter),
        # so e.g. "^he*" matches any input starting with "h" -- a loose
        # prefix match ("h", "he", "help", ...).
        if re.search("^he*", command):
            print_help()
        elif re.search("^up*", command):
            update_all()
        elif re.search("^ne*", command):
            add_user(fa)
        elif re.search("^re*", command):
            # NOTE(review): stub -- user removal is not implemented.
            print("Remove User")
        elif re.search("^in*", command):
            print_info(fa)
        elif re.search("^se*", command):
            schedule_cron()
        elif re.search("^pr*", command):
            print_cron()
        elif re.search("^qu*", command):
            print("Goodbye!")
            sys.exit(0)
        else:
            print("ERROR: Invalid Command")
            print("Enter 'HELP' for list of commands")
    # Unreachable: the loop above only exits via sys.exit().
    return 0
def task2(model):
    """Task 2: query expansion seeded from the stored task-1 results of
    *model*, then a retrieval run with the expanded queries.

    :param model: retrieval model name; selects the task-1 result file
        and is now also the model actually run.

    Bug fix: run_all_queries() previously hard-coded model='cosine' even
    though this function takes a model parameter and uses it to pick the
    task-1 result file. Every sibling task (task1/task3a/task3b/phase2)
    passes model=model, so this now does too.
    """
    fa = FileAccess()
    r = Retriever()
    query_dict = fa.read_queries()
    corpus = r.get_corpus(True)
    inverted_index = corpus[0]
    total_corpus = corpus[1]
    relevance_data = fa.get_relevance_data()
    # Expansion is seeded from this model's stored task-1 run.
    task1_folder = os.path.join(os.getcwd(), 'task1')
    file_name = "task1_" + model + "_.txt"
    result_file = task1_folder + '/' + file_name
    qe = QueryExpander(query_dict=query_dict, filename=result_file,
                       clean=True)
    expanded_queries = qe.get_expanded_queries()
    r.run_all_queries(inverted_index=inverted_index,
                      total_corpus=total_corpus,
                      relevance_data=relevance_data,
                      query_dict=expanded_queries,
                      model=model,
                      task_id="2",
                      notes="expanded",
                      store_queries='expanded')
if __name__ == "__main__": print( 'Please wait for around 30 mins! It will take sometimes while getting data from OMDB API calling.....' ) print( 'If you want to run the program with less data to avoid delay then please minimize the contents of ' 'inputs folder and re-run!!!\n') show = Show() unwatched_movie_dic = {} try: '''read input text file''' fa = FileAccess() movie_list = fa.read_file(fa.movie_list_txt) watched_movie_list = fa.read_file(fa.watched_movie_list_txt) unwatched_movie_list = set(movie_list) - set(watched_movie_list) '''print movie, watched and unwatched movie id''' ''' print('-----------> Movie Id: ') show.print_input_list(movie_list) print('-----------> Watched Movie Id: ') show.print_input_list(watched_movie_list) print('-----------> Unwatched Movie Id: ') show.print_input_list(unwatched_movie_list) ''' '''call OMDB api and load movie related data''' api = OMDBApi() movie_dic = api.call_omdb_api(movie_list)
def evaluate(self, file_name, rank_list, base_dir, relevant_data): fa = FileAccess() scores = fa.read_score_file(file_name) pr_results = [] ap_results = [] mrr = [] p_at_k = {} for each in rank_list: p_at_k[each] = [] for each in scores: ap = 0 if each in relevant_data: relevant_files = relevant_data[each] else: continue data = scores[each] total_retrieved = 1 total_relevant_retrieved = 0 for eachdata in data: qid = each rank = data.index(eachdata) + 1 docid = eachdata[0] doc_score = eachdata[1] if docid in relevant_files: if total_relevant_retrieved == 0: mrr.append(1.0/rank) total_relevant_retrieved += 1 relevance = 1 if docid in relevant_files else 0 precision = float(total_relevant_retrieved)/total_retrieved if rank in rank_list: tup = (qid, precision) p_at_k[rank].append(tup) if relevance: ap += precision recall = float(total_relevant_retrieved)/len(relevant_files) total_retrieved += 1 tup = (qid, rank, docid, doc_score, str(relevance), precision, recall) pr_results.append(tup) if total_relevant_retrieved != 0: avg_p = float(ap)/total_relevant_retrieved else: avg_p = 0 ap_results.append(avg_p) mean_avg_pr = sum(ap_results)/len(ap_results) mean_rr = sum(mrr)/len(mrr) phase2_evaluation = os.path.join(base_dir, 'evaluation_phase2') if not os.path.exists(phase2_evaluation): os.makedirs(phase2_evaluation, 0755) pre_file = file_name.split('.')[0] for each in p_at_k: pk_file_name = pre_file + '_p@k'+str(each)+'.txt' pk_file = open(os.path.join(phase2_evaluation, pk_file_name), 'w') for e in p_at_k[each]: pk_file.write('{} {}\n'.format(e[0], e[1])) pk_file.close() mrr_filename = pre_file + '_mrr.txt' pr_filename = pre_file + '_precision_recall.txt' map_filename = pre_file + '_map_results.txt' mrr_file = open(os.path.join(phase2_evaluation, mrr_filename), 'w') mrr_file.write(str(mean_rr)) mrr_file.close() map_file = open(os.path.join(phase2_evaluation, map_filename), 'w') map_file.write(str(mean_avg_pr)) map_file.close() pr_file = open(os.path.join(phase2_evaluation, 
pr_filename), 'w') for e in pr_results: pr_file.write("{} {} {} {} {} {} {}\n".format(e[0], e[1], e[2], e[3], e[4], round(e[5], 3), round(e[6], 3))) pr_file.close() return