def enrich_user_id_train_mlma(file):
    # df = utils.read_file(file, "~", names=['id', 'datetime', 'text', 'r1'])
    df = utils.read_file(file, "~", names=['id'], dtype='object')
    df["user_id"] = pd.Series(dtype='object')
    df["datetime"] = pd.Series(dtype='object')
    df["text"] = pd.Series(dtype='object')
    db = utils.get_mongo_client_db()
    counter = 0
    try:
        for index, row in df.iterrows():
            tweet_id = row["id"].rstrip("\r")
            res = db.tweet.find_one({"ID": str(tweet_id)})
            if not res:
                logger.info("this tweet does not exist: " + str(tweet_id))
                continue
            if 'user_id' not in res:
                logger.info("this tweet does not have a user id: " + str(tweet_id))
                continue
            df.loc[index, 'user_id'] = res["user_id"]
            df.loc[index, 'datetime'] = res["datetime"]
            df.loc[index, 'text'] = res["text"]
            counter += 1
    except Exception as ex:
        logger.error(str(ex))
    logger.info("counter:" + str(counter))
    df.to_csv(file + "_userid.csv", sep="~", index=False)
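# Example usage (a sketch; the path is hypothetical, and a running MongoDB with a
# "tweet" collection keyed by the string field "ID" is assumed):
# enrich_user_id_train_mlma("F:/tmp/mlma_train_ids.csv")
# -> writes F:/tmp/mlma_train_ids.csv_userid.csv with user_id/datetime/text filled in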
def r1_stats(file):
    df = utils.read_file(file, "~", names=['ID', 'user_id', 'datetime', 'text', 'r1'])
    logger.info("started")
    try:
        logger.info(df['r1'].value_counts())
    except Exception as e:
        logger.error(str(e))
def time_period_splitter(file, separator):
    try:
        df = utils.read_file(file, separator, ["ID", "datetime", "text"], dtype=object)
        df = utils.drop_nans(df)
        # select rows whose datetime string starts with any of the period's month prefixes
        for period in range(2, 9):
            times = getattr(globals, 'p' + str(period) + '_times')
            df_p = df[df["datetime"].str.startswith(tuple(times))]
            df_p.to_csv("p" + str(period) + ".csv",
                        sep=separator, index=False, encoding="utf-8")
        print("ok")
    except Exception as ex:
        logger.error(ex)
async def main():
    if speech_to_text(filename):
        loop = asyncio.get_running_loop()
        data = utils.read_file(filename)
        words_count = utils.total_words(data)
        print("=" * 44)
        print("total spoken words -> ", words_count)
        print("=" * 44)

        fut_speech_fluency = loop.create_future()
        loop.create_task(
            utils.rate_speech_on_fluency(fut_speech_fluency, words_count))
        fluency_rating = await fut_speech_fluency

        spelling_rating = rate_spelling(data, words_count)

        fut_unnecessary_filler = loop.create_future()
        loop.create_task(rate_unnecessary_fillers(fut_unnecessary_filler, data))
        filler_rating = await fut_unnecessary_filler

        grammar_rating = rate_grammar(data)

        print("=" * 44)
        print("fluency rating (out of 1) -> ", fluency_rating)
        print("spelling rating (out of 2) -> ", spelling_rating)
        print("unnecessary fillers rating (out of 1) -> ", filler_rating)
        print("grammar rating (out of 1) -> ", grammar_rating)
        total_rating = fluency_rating + spelling_rating + filler_rating + grammar_rating
        print("=" * 44)
        print("overall rating (out of 5) -> ", total_rating)
        print("=" * 44)
def extract_convert_lda_input(file):
    # Overall process for preparing data as input to the LDA algorithm.
    # First, extract records from mongo by executing the mongo export JavaScript
    # file, mongo_extract_data_script.js
    logger.info("started LDA related operations")
    df = utils.read_file(file, "~", names=['ID', 'datetime', 'text'])
    df_new = utils.preprocess_text_for_topic_discovery(df)
    df_new.to_csv(file + "_out.csv", index=False)
def extract_stance_changes_of_users_with_only_two_tweets(file):
    # note: this method is no longer used
    try:
        df = utils.read_file(file, "~", globals.STANCE_FILE_COLUMNS)
        # per user: {1: (datetime, stance)} for the earliest tweet, {2: ...} for the latest
        dict_users = {}
        for index, row in df.iterrows():
            try:
                user_id = row['user_id']
                datetime_object = datetime.strptime(row['datetime'], '%Y-%m-%d')
                r1 = row['r1']
                if user_id not in dict_users:
                    dict_users[user_id] = {1: (datetime_object, str(r1))}
                else:
                    records = dict_users[user_id]
                    if len(records) == 1:
                        old_datetime_object, old_r1 = records[1]
                        if old_datetime_object > datetime_object:
                            records[1] = (datetime_object, str(r1))
                            records[2] = (old_datetime_object, str(old_r1))
                        elif old_datetime_object < datetime_object:
                            records[2] = (datetime_object, str(r1))
                    else:
                        old_datetime_object, old_r1 = records[1]
                        if old_datetime_object > datetime_object:
                            records[1] = (datetime_object, str(r1))
                        elif old_datetime_object < datetime_object:
                            old_datetime_object, old_r1 = records[2]
                            if old_datetime_object < datetime_object:
                                records[2] = (datetime_object, str(r1))
            except Exception as ex:
                logger.error(row)
                logger.error(str(ex))
                logger.info(traceback.format_exc())
        file_write = open(file + "_out.csv", "w")
        counter = 0
        for key, values_dict in dict_users.items():
            if len(values_dict) != 2:
                continue
            output = str(key) + ","
            ordered_values = collections.OrderedDict(sorted(values_dict.items()))
            for key2, (date, stance) in ordered_values.items():
                output += str(stance) + ","
            output = output[:-1]  # drop the trailing comma
            counter += 1
            file_write.write(output)
            file_write.write("\n")
            if counter % 1000 == 0:
                file_write.flush()
        file_write.close()
        print("ok")
    except Exception as ex:
        logger.error(str(ex))
        logger.info(traceback.format_exc())
def extract_fields_by_r1(file, r1):
    df = utils.read_file(file, "~", names=['ID', 'user_id', 'datetime', 'text', 'r1'])
    df = df[df['r1'] == r1]
    logger.info(df.head())
    # remap labels: 1 -> 0, 2 -> 1
    if r1 == 1:
        df['r1'] = 0
    elif r1 == 2:
        df['r1'] = 1
    logger.info(df.head())
    df.to_csv(file + "_" + str(r1) + "_out.csv",
              index=False,
              columns=['ID', 'user_id', 'datetime', 'text', 'r1'],
              header=None,
              sep="~")
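# Example usage (a sketch; "stances.csv" is a hypothetical name for any
# "~"-separated file with the five expected columns):
# extract_fields_by_r1("stances.csv", 1)  # writes stances.csv_1_out.csv with r1 remapped to 0
# extract_fields_by_r1("stances.csv", 2)  # writes stances.csv_2_out.csv with r1 remapped to 1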
def r1_stats():
    df = utils.read_file("F:/tmp/full_features.csvl_out.csv", "~",
                         names=['ID', 'user_id', 'datetime', 'text', 'r1'])
    logger.info("started")
    try:
        print(df['r1'].value_counts())
        grouped = df.groupby(['datetime', 'r1'])['datetime']
        grouped.count().to_csv('F:/tmp/tt.txt')
    except Exception as e:
        logger.error(str(e))
def extract_stance_changes_of_users_old(file):
    try:
        df = utils.read_file(file, "~", globals.STANCE_FILE_COLUMNS)
        df_records = df[['user_id', 'r1', 'datetime']] \
            .groupby(['user_id', 'r1', 'datetime']) \
            .agg({'r1': ["count"]}) \
            .reset_index()
        converted = utils.convert_consolidate_monthly_stances(df_records)
        df_final = utils.convert_nested_dict_to_line_chart_input_and_write(converted)
        df_final.to_csv(file + "_out.csv")
        print("ok")
    except Exception as ex:
        logger.error(str(ex))
        logger.info(traceback.format_exc())
def analyze_duplicate_tweets(file):
    try:
        df = utils.read_file(file, "~", ['ID', 'user_id', 'datetime', 'text'])
        grouped = df.groupby('text').text.count()
        grouped = grouped[grouped > 5]
        # sort_index returns a new Series, so the result must be reassigned
        grouped = grouped.sort_index(ascending=False)
        logger.info(grouped.value_counts())
        grouped.to_csv('F:/tmp/tt1.txt')
    except Exception as ex:
        logger.error(ex)
def analyze_group_by_influence(file):
    try:
        df = utils.read_file(file, "~", ['datetime', 'nb_retweet', 'nb_like'])
        logger.info(df.head())
        grouped_r = df.groupby('datetime')['nb_retweet'].mean()
        grouped_l = df.groupby('datetime')['nb_like'].mean()
        grouped_r.to_csv('F:/tmp/retweet.txt')
        grouped_l.to_csv('F:/tmp/like.txt')
    except Exception as ex:
        logger.error(ex)
def time_period_splitter(file, separator):
    logger.info("started")
    logger.info(globals.p2_times)
    logger.info("started to read file:" + file)
    try:
        df = utils.read_file(file, separator, ["ID", "datetime", "text"], dtype=object)
        df = utils.drop_nans(df)
        # select rows whose datetime string starts with any of the period's month prefixes
        for period in range(2, 9):
            times = getattr(globals, 'p' + str(period) + '_times')
            df_p = df[df["datetime"].str.startswith(tuple(times))]
            df_p.to_csv(data_path + "p" + str(period) + ".csv",
                        sep=separator, index=False, encoding="utf-8")
    except Exception as ex:
        logger.error(ex)
        logger.info(traceback.format_exc())
    logger.info("completed")
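# Minimal sketch of the prefix filter used above, assuming the pX_times globals
# hold month prefixes such as ["2016-07", "2016-08"] (illustrative values):
# import pandas as pd
# df = pd.DataFrame({"datetime": ["2016-07-01 10:00", "2016-09-02 11:00"]})
# mask = df["datetime"].str.startswith(("2016-07", "2016-08"))
# df[mask]  # keeps only the 2016-07 row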
def plot_stance_transition():
    try:
        scaling_enabled = False
        # 33 monthly periods from 2016-01 to 2018-09
        col_label_list = [
            '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06',
            '2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12',
            '2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
            '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12',
            '2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
            '2018-07', '2018-08', '2018-09'
        ]
        col_list = [str(i) for i in range(1, 34)]
        df = utils.read_file("F:/tmp/datt_out.csv", names=col_list)
        rowsize, colsize = df.shape
        plt.interactive(False)
        fig = plt.figure(figsize=(20, 15))
        ax = fig.add_subplot(111)
        plt.yticks([0, 1])
        counter = 0
        for i in range(0, colsize - 1):
            counter += 1
            if counter % 1000 == 0:
                logger.info(str(counter) + " out of " + str(colsize) + " is completed")
            first = df.iloc[:, i].tolist()
            second = df.iloc[:, i + 1].tolist()
            zero_to_zero, one_to_one, zero_to_one, one_to_zero = \
                calculate_combinated_weights(first, second)
            logger.info("zero_to_zero, one_to_one, zero_to_one, one_to_zero: " +
                        str(zero_to_zero) + "," + str(one_to_one) + "," +
                        str(zero_to_one) + "," + str(one_to_zero))
            if scaling_enabled:
                if zero_to_zero != 0.0:
                    zero_to_zero = np.log(zero_to_zero)
                if one_to_one != 0.0:
                    one_to_one = np.log(one_to_one)
                if zero_to_one != 0.0:
                    zero_to_one = np.log(zero_to_one)
                if one_to_zero != 0.0:
                    one_to_zero = np.log(one_to_zero)
            # line width encodes how many users made each stance transition
            plt.plot([i, i + 1], [0, 0], linewidth=zero_to_zero, c="black", solid_capstyle="round")
            plt.plot([i, i + 1], [1, 1], linewidth=one_to_one, c="black", solid_capstyle="round")
            plt.plot([i, i + 1], [0, 1], linewidth=zero_to_one, c="black", solid_capstyle="round")
            plt.plot([i, i + 1], [1, 0], linewidth=one_to_zero, c="black", solid_capstyle="round")
        ax.set_xticklabels(col_label_list, rotation=45)
        plt.savefig("F:/tmp/pplot")
        print("ok")
    except Exception as ex:
        logger.error(str(ex))
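# calculate_combinated_weights is not shown in this file; a minimal sketch of
# what it plausibly computes, judging from the variable names (an assumption,
# not the original implementation): counts of each 0/1 stance transition
# between two consecutive monthly columns.
# def calculate_combinated_weights(first, second):
#     zero_to_zero = one_to_one = zero_to_one = one_to_zero = 0.0
#     for a, b in zip(first, second):
#         if a == 0 and b == 0:
#             zero_to_zero += 1
#         elif a == 1 and b == 1:
#             one_to_one += 1
#         elif a == 0 and b == 1:
#             zero_to_one += 1
#         elif a == 1 and b == 0:
#             one_to_zero += 1
#     return zero_to_zero, one_to_one, zero_to_one, one_to_zero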
async def main():
    speech_to_text(filename)
    loop = asyncio.get_running_loop()
    data = utils.read_file(filename)
    words_count = utils.total_words(data)
    logs = '=' * 44 + '\n'
    logs += "total spoken words -> " + str(words_count) + '\n'
    logs += '=' * 44 + '\n'

    fut_speech_fluency = loop.create_future()
    loop.create_task(
        utils.rate_speech_on_fluency(fut_speech_fluency, words_count))
    fluency_rating = await fut_speech_fluency

    spelling_rating = rate_spelling(data, words_count)

    fut_unnecessary_filler = loop.create_future()
    loop.create_task(rate_unnecessary_fillers(fut_unnecessary_filler, data))
    filler_rating = await fut_unnecessary_filler

    grammar_rating = rate_grammar(data)

    logs += '=' * 44 + '\n'
    logs += "fluency rating (out of 1) -> " + str(fluency_rating) + '\n'
    logs += "spelling rating (out of 2) -> " + str(spelling_rating) + '\n'
    logs += "unnecessary fillers rating (out of 1) -> " + str(filler_rating) + '\n'
    logs += "grammar rating (out of 1) -> " + str(grammar_rating) + '\n'
    total_rating = fluency_rating + spelling_rating + filler_rating + grammar_rating
    logs += '=' * 44 + '\n'
    logs += "overall rating (out of 5) -> " + str(total_rating) + "\n"
    logs += '=' * 44 + '\n'

    with open("logs.txt", "w") as f:
        f.write(logs)
    return total_rating
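# Minimal sketch of the future/task pattern used in both main() variants above:
# the rater coroutine is scheduled with create_task and reports its result
# through a future the caller awaits. All names here are illustrative, not
# from the codebase.
# import asyncio
#
# async def rater(fut, value):
#     await asyncio.sleep(0)  # stand-in for real async work
#     fut.set_result(value * 2)
#
# async def demo():
#     loop = asyncio.get_running_loop()
#     fut = loop.create_future()
#     loop.create_task(rater(fut, 21))
#     print(await fut)  # 42
#
# asyncio.run(demo())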
def read_rfc_data(rfc_number):
    try:
        rfc_filename = utils.get_rfc_filename(rfc_number)
        rfc_filename = os.path.abspath(os.path.join(rfc_location, rfc_filename))
        # Get the RFC file stats
        file_stats = os.stat(rfc_filename)
        last_modified = file_stats.st_mtime
        content_length = file_stats.st_size
        # Read the file contents
        rfc_data = utils.read_file(rfc_filename)
    except IOError:
        logger.error("The entered RFC was not found")
        return "", "", ""
    return rfc_data, str(last_modified), str(content_length)
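# Example usage (a sketch; RFC 2616 is only an illustration, and rfc_location
# must point at a directory of cached RFC files):
# rfc_data, last_modified, content_length = read_rfc_data(2616)
# if rfc_data:
#     print(content_length, "bytes, last modified at", last_modified)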
def pandas_extract_tweet_text_by_topic_label_random_n_records(
        file, requested_amount, stance):
    try:
        logger.info("started to read file")
        df = utils.read_file(file, "~",
                             names=['ID', 'user_id', 'datetime', 'text', 'r1'])
        df_filtered = df[df['r1'] == stance]
        df_filtered_sample = df_filtered.sample(n=requested_amount)
        df_filtered_sample.to_csv("F:/tmp/random_stance_" + str(stance) +
                                  "_sample" + str(requested_amount) + ".csv",
                                  index=False,
                                  columns=['text', 'r1'],
                                  sep="~",
                                  header=False)
        logger.info("file export operation completed")
    except Exception as ex:
        logger.error(ex)
def rate(duration):
    # if speech_to_text(file):
    data = utils.read_file(filename)
    words_count = utils.total_words(data)
    print("actual words spoken: ", words_count)
    fluency_rating = utils.rate_speech_on_fluency(words_count, duration)
    spelling_rating = rate_spelling(data, words_count)
    filler_rating = rate_unnecessary_fillers(data)
    grammar_rating = rate_grammar(data)
    total_rating = fluency_rating + spelling_rating + filler_rating + grammar_rating
    rating = SpeechRater(fluency_rating, spelling_rating, filler_rating,
                         grammar_rating, total_rating)
    return json.dumps(rating, cls=SpeechRaterEncoder)
def topic_discovery():
    try:
        number_of_files_splitted_periods = 1
        for i in range(1, number_of_files_splitted_periods + 4):
            # Load texts
            filename = data_path + 'p' + str(i) + '.csv'
            # filename = "F:/tmp/test"
            df = utils.read_file(filename, "~", names=['ID', 'datetime', 'text'])
            texts = df["text"].tolist()
            dictionary, corpus, texts = create_dictionary_corpus(texts)
            lm = LdaModel(
                corpus=corpus,
                id2word=dictionary,
                num_topics=topic_number,
                chunksize=chunksize,
                passes=epochs,
                eval_every=model_eval_every,
                iterations=max_iterations,
                alpha=alpha,
                eta=beta,
            )
            output_visual_file_name = filename + ".vis"
            ml_utils.evaluate_lda_results(corpus, dictionary, texts, lm,
                                          topic_number, output_visual_file_name,
                                          visual_enabled)
            combined_topic_id_file_name = filename + "_topic_out.csv"
            combine_lda_results_with_lda_output(corpus, lm, df,
                                                combined_topic_id_file_name)
            if lda_model_save_enabled:
                lm.save(data_path + 'LDA_model_' + str(i) + '.lda')
    except Exception as ex:
        logger.error(str(ex))
        logger.info(traceback.format_exc())
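# Sketch of inspecting a saved model afterwards (standard gensim API; the path
# mirrors the lda_model_save_enabled branch above, and corpus is assumed to be
# built as in topic_discovery):
# from gensim.models import LdaModel
# lm = LdaModel.load(data_path + 'LDA_model_1.lda')
# print(lm.show_topics(num_topics=5, num_words=10))  # top words per topic
# print(lm.get_document_topics(corpus[0]))           # topic mixture of one document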
def extract_nb_of_words():
    try:
        logger.info("started")
        df = utils.read_file(
            data_path + "merged_stance_of_tweets.csv", "~",
            names=['ID', 'user_id', 'datetime', 'text', 'r1', 'sentiment'])
        df["words_cnt"] = df["text"].str.split().str.len()
        cnt = df["words_cnt"].value_counts()
        total_words = 0
        total_tweets = 0
        for words_per_tweet, tweet_count in cnt.items():
            total_tweets += tweet_count
            total_words += words_per_tweet * tweet_count
        logger.info("average nb of words in dataset: " +
                    str(total_words / total_tweets))
    except Exception as ex:
        logger.error(ex)
def discover():
    try:
        logger.info("Started Topic discovery operations")
        # basicConfig lives on the logging module, not on a Logger instance
        logging.basicConfig(level="INFO",
                            filename=globals.WINDOWS_LOG_PATH,
                            format="%(asctime)s %(message)s")
        # data = df.tweet.values.tolist()
        logger.info("started LDA related operations")
        filename_read = "F:/tmp/p3_1000.csv"
        df = utils.read_file(filename_read, ",", names=['ID', 'datetime', 'text'])
        corpus, id2word, data_words_bigrams = text_utils.prepare_lda_input(df)
        logger.info("building LDA model")
        expected_topic_cnt = 20
        lda_model = ml_utils.build_lda_model(corpus, id2word, expected_topic_cnt)
        ml_utils.evaluate_lda_results(corpus, id2word, data_words_bigrams,
                                      lda_model, expected_topic_cnt, filename_read)
        utils.combine_lda_results_with_lda_output(corpus, lda_model, df,
                                                  filename_read)
    except Exception as ex:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=2, file=sys.stdout)
        logger.error("Something bad happened: %s", ex)
    logger.info("Completed everything. Program is being terminated")
def pandas_users_stances(file):
    try:
        df = utils.read_file(file, "~", ['ID', 'user_id', 'datetime', 'text', 'r1'])
        user_stances = {}
        logger.info(str(df.shape))
        df_grouped = df[['user_id', 'r1']] \
            .groupby(['user_id', 'r1']) \
            .agg({'r1': ["count"]}) \
            .reset_index() \
            .groupby('user_id')
        counter_remain_users = 0
        counter_leave_users = 0
        logger.info("number of users: " + str(len(df_grouped)))
        # TODO: this part could be simplified with the idxmax function of pandas
        for user_id, values_per_stance in df_grouped:
            count_remain = 0
            count_leave = 0
            for value in values_per_stance.values:
                if value[1] == 0:
                    count_remain = value[2]
                elif value[1] == 1:
                    count_leave = value[2]
            user_stance = utils.calculate_user_stance(count_remain, count_leave)
            if user_stance != -1:
                if user_stance == 0:
                    counter_remain_users += 1
                elif user_stance == 1:
                    counter_leave_users += 1
                user_stances[str(user_id)] = user_stance
        logger.info("number of users [remain, leave]: " +
                    str(counter_remain_users) + "," + str(counter_leave_users))
        utils.write_dict_to_file(file + "_users.csv", user_stances)
        return user_stances
    except Exception as ex:
        logger.error(ex)
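# utils.calculate_user_stance is not shown here; a plausible majority-vote
# sketch consistent with the -1 check above (an assumption, not the original
# implementation): 0 = remain dominant, 1 = leave dominant, -1 = tie/undecided.
# def calculate_user_stance(count_remain, count_leave):
#     if count_remain > count_leave:
#         return 0
#     if count_leave > count_remain:
#         return 1
#     return -1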
from util.utils import read_file
import matplotlib.pyplot as plt

# load data
epoch = read_file('epoch', 'txt')
avg_loss = read_file('avg_loss', 'txt')
avg_acc = read_file('avg_acc', 'txt')
val_loss = read_file('val_loss', 'txt')

# plot train/val loss and train accuracy on twin y-axes
s = 10
fig = plt.figure()
plt.title('Loss and accuracy', fontsize=s)
ax1 = fig.add_subplot(1, 1, 1)
l1 = ax1.plot(epoch, avg_loss, 'r', label='train loss')
l3 = ax1.plot(epoch, val_loss, 'y', label='val loss')
# plt.legend(bbox_to_anchor=(1.0, 0.15))
ax1.set_ylabel('Loss', fontsize=s)
ax2 = ax1.twinx()
l2 = ax2.plot(epoch, avg_acc, 'g', label='train accuracy')
ls = l1 + l2 + l3
labs = [l.get_label() for l in ls]
ax1.legend(ls, labs, bbox_to_anchor=(1.0, 0.95))
ax2.set_ylabel('Accuracy', fontsize=s)
ax1.set_xlabel('Epoch', fontsize=s)
# save before show(): show() releases the figure, so saving afterwards writes a blank image
plt.savefig('loss_acu.png')
plt.show()
from util import utils
from speech_converter import speech_to_text
from grammar_rater import rate_spelling
from grammar_rater import rate_unnecessary_fillers
from grammar_rater import rate_grammar

filename = "speech.txt"

if speech_to_text(filename):
    data = utils.read_file(filename)
    words_count = utils.total_words(data)
    print("total spoken words ", words_count)
    fluency_rating = utils.rate_speech_on_fluency(words_count)
    print("fluency rating out of 1: ", fluency_rating)
    spelling_rating = rate_spelling(data, words_count)
    print("spelling rating out of 2: ", spelling_rating)
    filler_rating = rate_unnecessary_fillers(data)
    print("unnecessary fillers rating out of 1: ", filler_rating)
    grammar_rating = rate_grammar(data)
    print("grammar rating out of 1: ", grammar_rating)
    total_rating = fluency_rating + spelling_rating + filler_rating + grammar_rating
    print("overall rating out of 5: ", total_rating)
def populate_missing_data_for_stance_transition(file):
    try:
        logger.info("started populating data")
        cols_list = ['id'] + [str(i) for i in range(1, 34)]
        col_list = [str(i) for i in range(1, 34)]
        df = utils.read_file(file, names=cols_list, delimiter=",")
        # round-trip through a -1 sentinel so every cell ends up a digit string or NaN
        df[cols_list] = df[cols_list].fillna(-1)
        df[cols_list] = df[cols_list].astype(int)
        df[cols_list] = df[cols_list].astype(str)
        df[cols_list] = df[cols_list].replace('-1', np.nan)
        # df[cols_list] = df[cols_list].astype(int, errors='ignore')
        df = df[col_list]
        total_count = df.shape[0]
        counter = 0
        for index, row in df.iterrows():
            try:
                if utils.every_col_is_nan(row):
                    logger.info("dropping row index: " + str(index) +
                                " since all columns are NaN values")
                    df.drop(index, inplace=True)
                    continue
                # collect the positions that hold real (non-NaN) stance values
                real_values = {}
                counter += 1
                logger.info(str(counter) + " out of " + str(total_count) + " completed.")
                for i in range(0, row.size):
                    temp = row[i]
                    if type(temp) == str:
                        temp = int(temp)
                    if math.isnan(temp):
                        continue
                    real_values[i] = int(row[i])
                ordered_real_values = collections.OrderedDict(
                    sorted(real_values.items()))
                if counter % 1000 == 0:
                    logger.info("ordering completed for " + str(counter) +
                                "th row, out of " + str(total_count))
                # fill each NaN with the next known value to its right,
                # or with the last known value when none follows
                for i in range(0, row.size):
                    col_name = str(i + 1)
                    temp = row[i]
                    if type(temp) == str:
                        temp = int(temp)
                    if math.isnan(temp):
                        cnt_dict = 0
                        for key, value in ordered_real_values.items():
                            cnt_dict += 1
                            if i < key:
                                df.loc[index, col_name] = str(value)
                                break
                            elif i > key and cnt_dict == len(ordered_real_values):
                                df.loc[index, col_name] = str(value)
                                break
            except Exception as ex:
                logger.error(str(ex))
                logger.info(traceback.format_exc())
        df.to_csv(file + "_out.csv", index=False, header=False)
        logger.info("completed populating data")
    except Exception as ex:
        logger.error(str(ex))
def populate_missing_data_for_stance_transition_s():
    try:
        logger.info("started populating data")
        cols_list = ['id'] + [str(i) for i in range(1, 34)]
        col_list = [str(i) for i in range(1, 34)]
        df = utils.read_file("F:/tmp/datt.txt", names=cols_list, lineterminator="\r")
        df = df[col_list]
        total_count = df.shape[0]
        counter = 0
        for index, row in df.iterrows():
            if utils.every_col_is_nan(row):
                logger.info("dropping row index: " + str(index) +
                            " since all columns are NaN values")
                df.drop(index, inplace=True)
                continue
            real_values = {}
            counter += 1
            if counter % 1000 == 0:
                logger.info(str(counter) + " out of " + str(total_count) + " completed.")
            for i in range(0, row.size):
                if math.isnan(row[i]):
                    continue
                real_values[i] = int(row[i])
            ordered_real_values = collections.OrderedDict(sorted(real_values.items()))
            if counter % 1000 == 0:
                logger.info("ordering completed for " + str(counter) +
                            "th row, out of " + str(total_count))
            for i in range(0, row.size):
                if math.isnan(row[i]):
                    cnt_dict = 0
                    for key, value in ordered_real_values.items():
                        cnt_dict += 1
                        if i < key:
                            # write through df: iterrows yields a copy, so
                            # assigning to row[i] would not update the frame
                            df.loc[index, col_list[i]] = value
                            break
                        elif i > key and cnt_dict == len(ordered_real_values):
                            df.loc[index, col_list[i]] = value
                            break
        df[col_list] = df[col_list].astype(int)
        df.to_csv("F:/tmp/datt_out.csv", index=False, header=False)
        logger.info("completed populating data")
    except Exception as ex:
        logger.error(str(ex))
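# The row-wise gap filling in the two populate_missing_data_* functions above
# (take the next known stance to the right, else the last known one to the left)
# can be expressed vectorized in pandas; a sketch under that reading of the
# loop, with a hypothetical helper name:
# def fill_stance_gaps(df):
#     return df.bfill(axis=1).ffill(axis=1)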
def extract_stance_changes_of_users_before_after_ref(file):
    # note: the input file contains the prediction results of tweets in the last column
    try:
        df = utils.read_file(file, "~", globals.STANCE_FILE_COLUMNS)
        dict_users_before_ref = {}
        dict_users_after_ref = {}
        datetime_ref = datetime.strptime("2016-06-24", '%Y-%m-%d')
        # first, count each user's (remain, leave) posts for the periods before and
        # after the ref; tweets dated exactly on the ref day are skipped
        for index, row in df.iterrows():
            try:
                user_id = row['user_id']
                datetime_object = datetime.strptime(row['datetime'], '%Y-%m-%d')
                r1 = int(row['r1'])
                if datetime_object < datetime_ref:
                    if user_id not in dict_users_before_ref:
                        if r1 == 0:
                            value = (1, 0)
                        elif r1 == 1:
                            value = (0, 1)
                    else:
                        (remain, leave) = dict_users_before_ref[user_id]
                        if r1 == 0:
                            remain += 1
                        elif r1 == 1:
                            leave += 1
                        value = (remain, leave)
                    dict_users_before_ref[user_id] = value
                elif datetime_object > datetime_ref:
                    if user_id not in dict_users_after_ref:
                        if r1 == 0:
                            value = (1, 0)
                        elif r1 == 1:
                            value = (0, 1)
                    else:
                        (remain, leave) = dict_users_after_ref[user_id]
                        if r1 == 0:
                            remain += 1
                        elif r1 == 1:
                            leave += 1
                        value = (remain, leave)
                    dict_users_after_ref[user_id] = value
            except Exception as ex:
                logger.error(row)
                logger.error(str(ex))
                logger.info(traceback.format_exc())
        logger.info("number of people-stance couples before ref: " +
                    str(len(dict_users_before_ref)))
        logger.info("number of people-stance couples after ref: " +
                    str(len(dict_users_after_ref)))
        # keep only the users who tweeted in both periods
        logger.info("discarding people who have no tweets after the ref. current size: " +
                    str(len(dict_users_before_ref)))
        keys_to_be_deleted_from_dict_before = [
            key for key in dict_users_before_ref if key not in dict_users_after_ref
        ]
        for item in keys_to_be_deleted_from_dict_before:
            del dict_users_before_ref[item]
        logger.info("discarded " + str(len(keys_to_be_deleted_from_dict_before)) +
                    " people who have no tweets after the ref. final size: " +
                    str(len(dict_users_before_ref)))
        logger.info("discarding people who have no tweets before the ref. current size: " +
                    str(len(dict_users_after_ref)))
        keys_to_be_deleted_from_dict_after = [
            key for key in dict_users_after_ref if key not in dict_users_before_ref
        ]
        for item in keys_to_be_deleted_from_dict_after:
            del dict_users_after_ref[item]
        logger.info("discarded " + str(len(keys_to_be_deleted_from_dict_after)) +
                    " people who have no tweets before the ref. final size: " +
                    str(len(dict_users_after_ref)))
        # now calculate a single majority stance per user for each period
        dframe = pd.DataFrame(data=None, columns=['before', 'after'],
                              index=dict_users_before_ref.keys())
        for key, (remain, leave) in dict_users_before_ref.items():
            dframe.at[key, 'before'] = 0 if remain >= leave else 1
        for key, (remain, leave) in dict_users_after_ref.items():
            dframe.at[key, 'after'] = 0 if remain >= leave else 1
        dframe.to_csv(file + "out_stance_change.csv")
    except Exception as ex:
        logger.error(str(ex))
        logger.info(traceback.format_exc())
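# Example usage (a sketch; the path is hypothetical, and the file must hold the
# tweet-level stance predictions in its final r1 column):
# extract_stance_changes_of_users_before_after_ref("F:/tmp/merged_stance_of_tweets.csv")
# -> writes ...out_stance_change.csv with one before/after stance pair per user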
    print('Epoch:{:04d} Val loss:{:.4f} Val acc:{:.4f}'.format(
        epoch + 1, loss_val.data[0], acc_val.data[0]))
    return loss_train.data[0], acc_train.data[0], los_val_


def test(test_feature, test_label):
    model.eval()
    output = model(test_feature, adj)
    print(output.max(1)[1].data)
    print(test_label.data)
    acc_test = accuracy(output, test_label)
    loss_test = F.nll_loss(output, test_label)
    print("Test set results: loss={:.4f} test acc={:.4f}".format(
        loss_test.data[0], acc_test.data[0]))


# load data
x_data = read_file('./data/bw_x_data', 'pkl')
y_data = read_file('./data/bw_y_data', 'pkl')
A = read_file('./data/bw_adj_data', 'pkl')
time_step = len(x_data)
nodes = len(x_data[0])
adj = sparse_mx_to_torch_sparse_tensor(A)
adj = Variable(adj)

# Train model
t_total = time.time()
train_step = time_step - 0
ep_avg_loss = []
ep_avg_acc = []
ep = []
    return loss_train.data[0], acc_train.data[0], los_val_


def test(test_feature, test_label):
    model.eval()
    output = model(test_feature, adj)
    # print(output.max(1)[1].data)
    # print(test_label.data)
    acc_test = accuracy(output, test_label)
    loss_test = F.nll_loss(output, test_label)
    print("Test set results: loss={:.4f} test acc={:.4f}".format(
        loss_test.data[0], acc_test.data[0]))


# load data
x_data = read_file('./data/x_data', 'pkl')
y_data = read_file('./data/y_data', 'pkl')
A = read_file('./data/adj_data', 'pkl')
time_step = len(x_data)
nodes = len(x_data[0])
adj = sparse_mx_to_torch_sparse_tensor(A)
adj = Variable(adj)

# Train model
test_num = 3
t_total = time.time()
train_step = time_step - test_num
ep_avg_loss = []
ep_avg_acc = []
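# Note: tensor.data[0] is the pre-0.4 PyTorch idiom for reading a scalar; on
# current versions the equivalent is .item(), e.g.:
# print("Test set results: loss={:.4f} test acc={:.4f}".format(
#     loss_test.item(), acc_test.item()))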