Example #1
0
def main(q1, q2, articles, batch, input_type, outputFileName, dictType):
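    # Pipeline: build the combined query, fetch PMIDs for both terms, run TEES
    # (batch or per-article), parse the interaction XML into per-query word
    # dicts, then score q1 vs. q2 with cosine similarity and write the pair scores.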
    num_articles = int(articles)
    query = queries.main(q1, q2)

    if batch == "yes":
        q1_id_list, q2_id_list = pmids.main(query, num_articles)
        q1_file_paths = run_tees_batch(q1, q1_id_list)
        q2_file_paths = run_tees_batch(q2, q2_id_list)

    if batch == "no":
        q1_id_list, q2_id_list = pmids.main(query, num_articles)
        q1_file_paths = run_tees(q1, q1_id_list)
        q2_file_paths = run_tees(q2, q2_id_list)

    q1_dict = get_info_from_interaction_xml(q1_file_paths)
    q2_dict = get_info_from_interaction_xml(q2_file_paths)
    
    if dictType == 'all' or dictType == 'both':
        all_words_dict = get_all_words_dict(q1, q2, q1_dict, q2_dict)
        normalized_all_words_dict = normalize_dict(all_words_dict, query)
        angle_list_all = Cosine_Sim.main(normalized_all_words_dict, q1, q2)
        print_pair_score_dict(angle_list_all, normalized_all_words_dict, q1, q2, input_type, outputFileName)        
    
    if dictType == 'protein' or dictType == 'both':
        query_dicts = [q1_dict, q2_dict]
        combined_dict = combine_dictionaries(query_dicts)
        normalized_protein_dict = normalize_dict(combined_dict, query)
        angle_list_protein = Cosine_Sim.main(normalized_protein_dict, q1, q2)
        print_pair_score_dict(angle_list_protein, normalized_protein_dict, q1, q2, input_type, outputFileName)
Example #2
0
def main(q1, q2, articles, batch, input_type, outputFileName, dictType):
    num_articles = int(articles)
    query = queries.main(q1, q2)

    if batch == "yes":
        q1_id_list, q2_id_list = pmids.main(query, num_articles)
        q1_file_paths = run_tees_batch(q1, q1_id_list)
        q2_file_paths = run_tees_batch(q2, q2_id_list)

    if batch == "no":
        q1_id_list, q2_id_list = pmids.main(query, num_articles)
        q1_file_paths = run_tees(q1, q1_id_list)
        q2_file_paths = run_tees(q2, q2_id_list)

    q1_dict = get_info_from_interaction_xml(q1_file_paths)
    q2_dict = get_info_from_interaction_xml(q2_file_paths)

    if dictType == 'all' or dictType == 'both':
        all_words_dict = get_all_words_dict(q1, q2, q1_dict, q2_dict)
        normalized_all_words_dict = normalize_dict(all_words_dict, query)
        angle_list_all = Cosine_Sim.main(normalized_all_words_dict, q1, q2)
        print_pair_score_dict(angle_list_all, normalized_all_words_dict, q1,
                              q2, input_type, outputFileName)

    if dictType == 'protein' or dictType == 'both':
        query_dicts = [q1_dict, q2_dict]
        combined_dict = combine_dictionaries(query_dicts)
        normalized_protein_dict = normalize_dict(combined_dict, query)
        angle_list_protein = Cosine_Sim.main(normalized_protein_dict, q1, q2)
        print_pair_score_dict(angle_list_protein, normalized_protein_dict, q1,
                              q2, input_type, outputFileName)
    def ClusterTweetSim(self, cluster, tweet):
        # Return 1 if the tweet is a near-duplicate of one already in the
        # cluster, otherwise the average cosine similarity between the tweet
        # and every tweet in the cluster, rounded to two decimals.
        tweet_txt = tweet.text
        total_sim = 0
        for index, row in cluster.data.iterrows():
            curr_sim = Cosine_Sim.get_cosine(tweet_txt, row['tweet_text'])
            if round(curr_sim, 2) >= 0.99:  # duplicate tweet
                return 1  # return as soon as a duplicate is found
            else:
                total_sim = total_sim + curr_sim
        # average similarity across the cluster
        avg_similarity = total_sim * 1.0 / len(cluster.data.index)
        return round(avg_similarity, 2)
    def writeToEventsFile(self, event_clust):
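        # Append the cluster id and its 10 most common tokens to
        # events/Events.txt, then dump the full cluster DataFrame to
        # events/cluster_<id>.csv (overwriting any previous file).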
        print "Writing Events to a file"
        words_list = []
        for index, row in event_clust.data.iterrows():
            words_list += Cosine_Sim.tokenize_only(row['tweet_text'])
        word_counts = Counter(words_list)
        most_common = word_counts.most_common(10)
        text_file = open("events/Events.txt", "a")
        text_file.write("Cluster Id =" + str(event_clust.id) + " ,")
        for word, count in most_common:
            text_file.write("{0} : {1} ,".format(word, count))
        text_file.write("\n")
        text_file.close()
        # write cluster to csv file
        clust_file = 'events/cluster_' + str(event_clust.id) + '.csv'
        if os.path.exists(clust_file):
            os.remove(clust_file)

        try:
            event_clust.data.to_csv(clust_file, index=False, encoding='utf-8')
        except Exception as e:
            print "Error writing the event file:", e
Example #5
0
def main(q1, q2, articles, batch, input_type, outputFileName, dictType,
         outputType, evaluation_mode, stemmed, model, text_file):
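    # Reuse TEES output already on disk when enough files exist for a query;
    # otherwise fetch PMIDs and run TEES (batch or per-article). Parse the
    # interaction XML into word dicts, then return the cosine-similarity angle
    # list, the normalized dict(s), and the number of documents processed for
    # each query.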
    models = model.split(' ')
    num_articles = int(articles)
    query = queries.main(q1, q2)
    q1_dict = {}
    q2_dict = {}

    q1_already_downloaded_ids = get_already_downloaded_ids(q1, models)
    q2_already_downloaded_ids = get_already_downloaded_ids(q2, models)
    q1_already_downloaded_file_path_list = get_already_downloaded_file_paths(
        q1, models, num_articles)
    q2_already_downloaded_file_path_list = get_already_downloaded_file_paths(
        q2, models, num_articles)

    q1_already_dl_slice = None
    q2_already_dl_slice = None
    q1_file_paths = None
    q2_file_paths = None

    #     if num_articles <= len(q1_already_downloaded_file_path_list):
    #         q1_already_dl_slice = q1_already_downloaded_file_path_list[:num_articles]
    #         q1_dict = get_info_from_interaction_xml(q1_already_dl_slice)
    #    else:

    if num_articles * 100 <= len(q1_already_downloaded_file_path_list):
        q1_already_dl_slice = q1_already_downloaded_file_path_list[:num_articles]
        q1_dict = get_info_from_interaction_xml(q1_already_dl_slice)
    else:
        q1_id_list = pmids.main(query.q1, num_articles, query.q1_search_string,
                                evaluation_mode)
        if len(q1_id_list) == len(q1_already_downloaded_file_path_list):
            q1_dict = get_info_from_interaction_xml(
                q1_already_downloaded_file_path_list)
        else:
            if batch == 'yes':
                q1_file_paths = run_tees_batch(q1, q1_id_list, models,
                                               text_file)
            elif batch == 'no':
                q1_file_paths = run_tees(q1, q1_id_list, models, text_file)
            if not q1_file_paths:
                q1_file_paths = q1_already_downloaded_file_path_list[:num_articles]
            q1_dict = get_info_from_interaction_xml(q1_file_paths)

    if num_articles * 100 <= len(q2_already_downloaded_file_path_list):
        q2_already_dl_slice = q2_already_downloaded_file_path_list[:num_articles]
        q2_dict = get_info_from_interaction_xml(q2_already_dl_slice)
    else:
        q2_id_list = pmids.main(query.q2, num_articles, query.q2_search_string,
                                evaluation_mode)
        if len(q2_id_list) == len(q2_already_downloaded_file_path_list):
            q2_dict = get_info_from_interaction_xml(
                q2_already_downloaded_file_path_list)
        else:
            if batch == 'yes':
                q2_file_paths = run_tees_batch(q2, q2_id_list, models,
                                               text_file)
            elif batch == 'no':
                q2_file_paths = run_tees(q2, q2_id_list, models, text_file)
            if not q2_file_paths:
                q2_file_paths = q2_already_downloaded_file_path_list[:num_articles]
            q2_dict = get_info_from_interaction_xml(q2_file_paths)

    if q1_already_dl_slice:
        q1_num_docs_processed = len(q1_already_dl_slice)
    elif q1_file_paths:
        q1_num_docs_processed = len(q1_file_paths)
    else:
        q1_num_docs_processed = len(q1_already_downloaded_file_path_list)

    if q2_already_dl_slice:
        q2_num_docs_processed = len(q2_already_dl_slice)
    elif q2_file_paths:
        q2_num_docs_processed = len(q2_file_paths)
    else:
        q2_num_docs_processed = len(q2_already_downloaded_file_path_list)

    print q1, 'num_docs_processed', q1_num_docs_processed
    print q2, 'num_docs_processed', q2_num_docs_processed
    num_docs_processed = [q1_num_docs_processed, q2_num_docs_processed]

    return_dict_s = []
    if dictType == 'all':
        all_words_dict = get_all_words_dict(q1, q2, q1_dict, q2_dict)
        normalized_all_words_dict = normalize_dict(all_words_dict, query,
                                                   stemmed)
        return_dict_s.append(normalized_all_words_dict)
        if len(normalized_all_words_dict[query.q1.lower()]) < 1 or len(
                normalized_all_words_dict[query.q2.lower()]) < 1:
            angle_list = [90.00]
        else:
            angle_list = Cosine_Sim.main(normalized_all_words_dict, q1, q2)

    if dictType == 'protein':
        query_dicts = [q1_dict, q2_dict]
        combined_dict = combine_dictionaries(query_dicts)
        normalized_protein_dict = normalize_dict(combined_dict, query, stemmed)
        return_dict_s.append(normalized_protein_dict)
        if len(normalized_protein_dict[query.q1.lower()]) < 1 or len(
                normalized_protein_dict[query.q2.lower()]) < 1:
            angle_list = [90.00]
        else:
            angle_list = Cosine_Sim.main(normalized_protein_dict, q1, q2)

    return angle_list, return_dict_s, num_docs_processed
Example #6
0
def main(q1, q2, articles, batch, input_type, outputFileName, dictType, outputType, evaluation_mode, stemmed, model, text_file):
    models = model.split(' ')
    num_articles = int(articles)
    query = queries.main(q1, q2)
    q1_dict = {}
    q2_dict = {}

    q1_already_downloaded_ids = get_already_downloaded_ids(q1, models)
    q2_already_downloaded_ids = get_already_downloaded_ids(q2, models)
    q1_already_downloaded_file_path_list = get_already_downloaded_file_paths(q1, models, num_articles)
    q2_already_downloaded_file_path_list = get_already_downloaded_file_paths(q2, models, num_articles)
    
    q1_already_dl_slice = None
    q2_already_dl_slice = None
    q1_file_paths = None
    q2_file_paths = None 
    
    
#     if num_articles <= len(q1_already_downloaded_file_path_list):
#         q1_already_dl_slice = q1_already_downloaded_file_path_list[:num_articles]
#         q1_dict = get_info_from_interaction_xml(q1_already_dl_slice)
#    else:

    if num_articles * 100 <= len(q1_already_downloaded_file_path_list):
        q1_already_dl_slice = q1_already_downloaded_file_path_list[:num_articles]
        q1_dict = get_info_from_interaction_xml(q1_already_dl_slice)
    else:
        q1_id_list = pmids.main(query.q1, num_articles, query.q1_search_string, evaluation_mode)
        if len(q1_id_list) == len(q1_already_downloaded_file_path_list):
            q1_dict = get_info_from_interaction_xml(q1_already_downloaded_file_path_list)
        else:
            if batch == 'yes':
                q1_file_paths = run_tees_batch(q1, q1_id_list, models, text_file)
            elif batch == 'no':
                q1_file_paths = run_tees(q1, q1_id_list, models, text_file)
            if not q1_file_paths:
                q1_file_paths = q1_already_downloaded_file_path_list[:num_articles]
            q1_dict = get_info_from_interaction_xml(q1_file_paths)
    
    if num_articles * 100 <= len(q2_already_downloaded_file_path_list):
        q2_already_dl_slice = q2_already_downloaded_file_path_list[:num_articles]
        q2_dict = get_info_from_interaction_xml(q2_already_dl_slice)
    else:
        q2_id_list = pmids.main(query.q2, num_articles, query.q2_search_string, evaluation_mode)
        if len(q2_id_list) == len(q2_already_downloaded_file_path_list):
            q2_dict = get_info_from_interaction_xml(q2_already_downloaded_file_path_list)
        else:
            if batch == 'yes':
                q2_file_paths = run_tees_batch(q2, q2_id_list, models, text_file)
            elif batch == 'no':
                q2_file_paths = run_tees(q2, q2_id_list, models, text_file)
            if not q2_file_paths:
                q2_file_paths = q2_already_downloaded_file_path_list[:num_articles]
            q2_dict = get_info_from_interaction_xml(q2_file_paths)


    if q1_already_dl_slice:
        q1_num_docs_processed = len(q1_already_dl_slice)
    elif q1_file_paths:
        q1_num_docs_processed = len(q1_file_paths)
    else:
        q1_num_docs_processed = len(q1_already_downloaded_file_path_list)
        
    if q2_already_dl_slice:
        q2_num_docs_processed = len(q2_already_dl_slice)
    elif q2_file_paths:
        q2_num_docs_processed = len(q2_file_paths)
    else:
        q2_num_docs_processed = len(q2_already_downloaded_file_path_list)
        
    print q1, 'num_docs_processed', q1_num_docs_processed
    print q2, 'num_docs_processed', q2_num_docs_processed
    num_docs_processed = [q1_num_docs_processed, q2_num_docs_processed]
    
    return_dict_s = []
    if dictType == 'all':
        all_words_dict = get_all_words_dict(q1, q2, q1_dict, q2_dict)
        normalized_all_words_dict = normalize_dict(all_words_dict, query, stemmed)
        return_dict_s.append(normalized_all_words_dict)
        if len(normalized_all_words_dict[query.q1.lower()]) < 1 or len(normalized_all_words_dict[query.q2.lower()]) < 1:
            angle_list = [90.00]
        else:
            angle_list = Cosine_Sim.main(normalized_all_words_dict, q1, q2)
        
    if dictType == 'protein':
        query_dicts = [q1_dict, q2_dict]
        combined_dict = combine_dictionaries(query_dicts)
        normalized_protein_dict = normalize_dict(combined_dict, query, stemmed)
        return_dict_s.append(normalized_protein_dict)
        if len(normalized_protein_dict[query.q1.lower()]) < 1 or len(normalized_protein_dict[query.q2.lower()]) < 1:
            angle_list = [90.00]
        else:
            angle_list = Cosine_Sim.main(normalized_protein_dict, q1, q2)


    return angle_list, return_dict_s, num_docs_processed
#         print "working on tweet ", index
#         words_list = Cosine_Sim.tokenize_only(row['tweet_text'])
#         words_list_str = " ".join(words_list)
#         text_file.write(words_list_str)
#         text_file.close()
#         cnt = cnt + 1

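# For every cluster CSV, tokenize each tweet into its own text file under
# \tweets_avg\ and record which tweet numbers belong to which cluster file.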
cnt = 1
cluster_ids = []
files = glob.glob(r"\clusters_AvgSimilariy\*.csv")
for f in files:
    print "working on file , ", f
    data = pd.read_csv(f, encoding='utf-8')
    cnt_str = f + ","
    for index, row in data.iterrows():
        text_file = open(r"\tweets_avg\tweet" + str(cnt) + ".txt", "w")
        print "working on tweet ", index
        words_list = Cosine_Sim.tokenize_only(row['tweet_text'])
        words_list_str = " ".join(words_list)
        text_file.write(words_list_str)
        text_file.close()
        cnt_str = cnt_str + str(cnt) + ","
        cnt = cnt + 1
    cluster_ids.append(cnt_str)

text_file = open(r"\tweets_avg\tweet_clusters.txt", "w")
for i in cluster_ids:
    text_file.write(i)
    text_file.write("\n")
text_file.close()