def main(q1, q2, articles, batch, input_type, outputFileName, dictType):
    num_articles = int(articles)
    query = queries.main(q1, q2)
    # Fetch PMIDs for both queries and run TEES, either in batch mode or
    # one article at a time.
    if batch == "yes":
        q1_id_list, q2_id_list = pmids.main(query, num_articles)
        q1_file_paths = run_tees_batch(q1, q1_id_list)
        q2_file_paths = run_tees_batch(q2, q2_id_list)
    elif batch == "no":
        q1_id_list, q2_id_list = pmids.main(query, num_articles)
        q1_file_paths = run_tees(q1, q1_id_list)
        q2_file_paths = run_tees(q2, q2_id_list)
    q1_dict = get_info_from_interaction_xml(q1_file_paths)
    q2_dict = get_info_from_interaction_xml(q2_file_paths)
    if dictType == 'all' or dictType == 'both':
        all_words_dict = get_all_words_dict(q1, q2, q1_dict, q2_dict)
        normalized_all_words_dict = normalize_dict(all_words_dict, query)
        angle_list_all = Cosine_Sim.main(normalized_all_words_dict, q1, q2)
        print_pair_score_dict(angle_list_all, normalized_all_words_dict,
                              q1, q2, input_type, outputFileName)
    if dictType == 'protein' or dictType == 'both':
        query_dicts = [q1_dict, q2_dict]
        combined_dict = combine_dictionaries(query_dicts)
        normalized_protein_dict = normalize_dict(combined_dict, query)
        angle_list_protein = Cosine_Sim.main(normalized_protein_dict, q1, q2)
        print_pair_score_dict(angle_list_protein, normalized_protein_dict,
                              q1, q2, input_type, outputFileName)
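# A minimal invocation sketch for the pipeline above. The argument values
# (query terms, article count, input type, output file name) are hypothetical
# examples, not values taken from the project:
if __name__ == '__main__':
    main('BRCA1', 'TP53', '50', 'yes', 'gene', 'brca1_tp53_scores.txt', 'both')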
def ClusterTweetSim(self, cluster, tweet):
    """Score a tweet against a cluster: return 1 as soon as a near-duplicate
    is found, otherwise return the average similarity to the cluster."""
    tweet_txt = tweet.text
    total_sim = 0
    for index, row in cluster.data.iterrows():
        curr_sim = Cosine_Sim.get_cosine(tweet_txt, row['tweet_text'])
        if round(curr_sim, 2) >= 0.99:
            # Duplicate tweet: return once you find the duplicate.
            return 1
        total_sim = total_sim + curr_sim
    # Average similarity over every tweet in the cluster.
    avg_similarity = total_sim * 1.0 / len(cluster.data.index)
    return round(avg_similarity, 2)
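# ClusterTweetSim relies on Cosine_Sim.get_cosine returning a value in
# [0, 1]. A minimal sketch of a standard bag-of-words cosine similarity,
# assuming get_cosine compares raw token counts (the real Cosine_Sim module
# may tokenize and weight terms differently):
import math
from collections import Counter

def get_cosine_sketch(text1, text2):
    vec1 = Counter(text1.lower().split())
    vec2 = Counter(text2.lower().split())
    # Dot product over the shared vocabulary.
    common = set(vec1) & set(vec2)
    numerator = sum(vec1[w] * vec2[w] for w in common)
    norm1 = math.sqrt(sum(c * c for c in vec1.values()))
    norm2 = math.sqrt(sum(c * c for c in vec2.values()))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return numerator / (norm1 * norm2)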
def writeToEventsFile(self, event_clust):
    print "Writing Events to a file"
    # Summarize the cluster by its ten most frequent tokens.
    words_list = []
    for index, row in event_clust.data.iterrows():
        words_list += Cosine_Sim.tokenize_only(row['tweet_text'])
    word_counts = Counter(words_list)
    most_common = word_counts.most_common(10)
    text_file = open("events/Events.txt", "a")
    text_file.write("Cluster Id =" + str(event_clust.id) + " ,")
    for word, count in most_common:
        text_file.write("{0} : {1} ,".format(word, count))
    text_file.write("\n")
    text_file.close()
    # Write the cluster itself to a CSV file, replacing any stale copy.
    clust_file = 'events/cluster_' + str(event_clust.id) + '.csv'
    if os.path.exists(clust_file):
        os.remove(clust_file)
    try:
        event_clust.data.to_csv(clust_file, index=False, encoding='utf-8')
    except Exception:
        print "Error writing the Event File"
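# For reference, Counter.most_common(n) yields (token, count) pairs in
# descending count order, which is what each Events.txt line is built from:
#   >>> Counter(['fire', 'fire', 'smoke']).most_common(2)
#   [('fire', 2), ('smoke', 1)]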
def main(q1, q2, articles, batch, input_type, outputFileName, dictType,
         outputType, evaluation_mode, stemmed, model, text_file):
    models = model.split(' ')
    num_articles = int(articles)
    query = queries.main(q1, q2)
    q1_dict = {}
    q2_dict = {}
    q1_already_downloaded_ids = get_already_downloaded_ids(q1, models)
    q2_already_downloaded_ids = get_already_downloaded_ids(q2, models)
    q1_already_downloaded_file_path_list = get_already_downloaded_file_paths(
        q1, models, num_articles)
    q2_already_downloaded_file_path_list = get_already_downloaded_file_paths(
        q2, models, num_articles)
    q1_already_dl_slice = None
    q2_already_dl_slice = None
    q1_file_paths = None
    q2_file_paths = None
    # Reuse cached articles when enough are already on disk; otherwise fetch
    # PMIDs and run TEES on whatever still needs processing.
    if num_articles * 100 <= len(q1_already_downloaded_file_path_list):
        q1_already_dl_slice = q1_already_downloaded_file_path_list[:num_articles]
        q1_dict = get_info_from_interaction_xml(q1_already_dl_slice)
    else:
        q1_id_list = pmids.main(query.q1, num_articles, query.q1_search_string,
                                evaluation_mode)
        if len(q1_id_list) == len(q1_already_downloaded_file_path_list):
            q1_dict = get_info_from_interaction_xml(
                q1_already_downloaded_file_path_list)
        else:
            if batch == 'yes':
                q1_file_paths = run_tees_batch(q1, q1_id_list, models, text_file)
            elif batch == 'no':
                q1_file_paths = run_tees(q1, q1_id_list, models, text_file)
            if not q1_file_paths:
                q1_file_paths = q1_already_downloaded_file_path_list[:num_articles]
            q1_dict = get_info_from_interaction_xml(q1_file_paths)
    if num_articles * 100 <= len(q2_already_downloaded_file_path_list):
        q2_already_dl_slice = q2_already_downloaded_file_path_list[:num_articles]
        q2_dict = get_info_from_interaction_xml(q2_already_dl_slice)
    else:
        q2_id_list = pmids.main(query.q2, num_articles, query.q2_search_string,
                                evaluation_mode)
        if len(q2_id_list) == len(q2_already_downloaded_file_path_list):
            q2_dict = get_info_from_interaction_xml(
                q2_already_downloaded_file_path_list)
        else:
            if batch == 'yes':
                q2_file_paths = run_tees_batch(q2, q2_id_list, models, text_file)
            elif batch == 'no':
                q2_file_paths = run_tees(q2, q2_id_list, models, text_file)
            if not q2_file_paths:
                q2_file_paths = q2_already_downloaded_file_path_list[:num_articles]
            q2_dict = get_info_from_interaction_xml(q2_file_paths)
    # Count how many documents each query actually contributed.
    if q1_already_dl_slice:
        q1_num_docs_processed = len(q1_already_dl_slice)
    elif q1_file_paths:
        q1_num_docs_processed = len(q1_file_paths)
    else:
        q1_num_docs_processed = len(q1_already_downloaded_file_path_list)
    if q2_already_dl_slice:
        q2_num_docs_processed = len(q2_already_dl_slice)
    elif q2_file_paths:
        q2_num_docs_processed = len(q2_file_paths)
    else:
        q2_num_docs_processed = len(q2_already_downloaded_file_path_list)
    print q1, 'num_docs_processed', q1_num_docs_processed
    print q2, 'num_docs_processed', q2_num_docs_processed
    num_docs_processed = [q1_num_docs_processed, q2_num_docs_processed]
    return_dict_s = []
    if dictType == 'all':
        all_words_dict = get_all_words_dict(q1, q2, q1_dict, q2_dict)
        normalized_all_words_dict = normalize_dict(all_words_dict, query, stemmed)
        return_dict_s.append(normalized_all_words_dict)
        if (len(normalized_all_words_dict[query.q1.lower()]) < 1 or
                len(normalized_all_words_dict[query.q2.lower()]) < 1):
            # An empty term vector means the pair is orthogonal: 90 degrees.
            angle_list = [90.00]
        else:
            angle_list = Cosine_Sim.main(normalized_all_words_dict, q1, q2)
    if dictType == 'protein':
        query_dicts = [q1_dict, q2_dict]
        combined_dict = combine_dictionaries(query_dicts)
        normalized_protein_dict = normalize_dict(combined_dict, query, stemmed)
        return_dict_s.append(normalized_protein_dict)
        if (len(normalized_protein_dict[query.q1.lower()]) < 1 or
                len(normalized_protein_dict[query.q2.lower()]) < 1):
            angle_list = [90.00]
        else:
            angle_list = Cosine_Sim.main(normalized_protein_dict, q1, q2)
    return angle_list, return_dict_s, num_docs_processed
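# The 90.00 fallback above is consistent with Cosine_Sim.main returning
# angles in degrees: term vectors with no overlap are orthogonal, i.e.
# cos(theta) = 0 and theta = 90 degrees. A sketch of that conversion,
# assuming Cosine_Sim.main reduces to something like this per vector pair:
import math

def similarity_to_angle(cos_sim):
    # Clamp to [-1, 1] to guard acos against floating-point drift.
    cos_sim = max(-1.0, min(1.0, cos_sim))
    return round(math.degrees(math.acos(cos_sim)), 2)

# similarity_to_angle(0.0) == 90.0, matching the empty-dictionary fallback.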
# print "working on tweet ", index # words_list = Cosine_Sim.tokenize_only(row['tweet_text']) # words_list_str = " ".join(words_list) # text_file.write(words_list_str) # text_file.close() # cnt = cnt + 1 cnt = 1 cluster_ids = [] files = glob.glob("\clusters_AvgSimilariy\*.csv") for f in files: print "working on file , ", f data = pd.read_csv(f, encoding='utf-8') cnt_str = f + "," for index, row in data.iterrows(): text_file = open(r"\tweets_avg\tweet" + str(cnt) + ".txt", "w") print "working on tweet ", index words_list = Cosine_Sim.tokenize_only(row['tweet_text']) words_list_str = " ".join(words_list) text_file.write(words_list_str) text_file.close() cnt_str = cnt_str + str(cnt) + "," cnt = cnt + 1 cluster_ids.append(cnt_str) text_file = open(r"\tweets_avg\tweet_clusters.txt", "w") for i in cluster_ids: text_file.write(i) text_file.write("\n") text_file.close()