Example #1
def enrich_user_id_train_mlma(file):
    #df = utils.read_file(file,"~",names=['id','datetime','text','r1'])
    df = utils.read_file(file, "~", names=['id'], dtype='object')
    df["user_id"] = pd.Series([])
    df["datetime"] = pd.Series([])
    df["text"] = pd.Series([])

    db = utils.get_mongo_client_db()
    try:
        counter = 0
        for index, row in df.iterrows():
            tweet_id = row["id"].rstrip("\r")  # drop stray carriage returns
            res = db.tweet.find_one({"ID": str(tweet_id)})
            if not res:
                logger.info("this tweet does not exist: " + str(tweet_id))
                continue
            if 'user_id' not in res:
                logger.info("this tweet has no user_id: " + str(tweet_id))
                continue
            user_id = res["user_id"]
            datetime = res["datetime"]
            text = res["text"]
            df.loc[index, 'user_id'] = user_id
            df.loc[index, 'datetime'] = datetime
            df.loc[index, 'text'] = text
            counter += 1
    except Exception as ex:
        logger.error(str(ex))
    logger.info("counter:" + str(counter))
    df.to_csv(file + "_userid.csv", sep="~", index=False)
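The examples in this collection call a project-specific utils.read_file helper whose definition is not shown. As a point of reference, a minimal sketch of what it might look like for the pandas-based examples, assuming it is a thin wrapper over pandas.read_csv (the exact signature and header handling are guesses):

import pandas as pd

# Hypothetical stand-in for utils.read_file; the real helper is not shown.
# It forwards the separator, column names, and any extra options
# (dtype, lineterminator, ...) straight to pandas.read_csv.
def read_file(path, sep=",", names=None, **kwargs):
    # header=None because the inputs in these examples carry no header row
    return pd.read_csv(path, sep=sep, names=names, header=None, **kwargs)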
Example #2
def r1_stats(file):
    df = utils.read_file(file, "~", names=['ID', 'user_id', 'datetime', 'text', 'r1'])
    logger.info("started")
    try:
        logger.info(df['r1'].value_counts())
    except Exception as e:
        logger.error(str(e))
Example #3
def time_period_splitter(file, separator):
    try:
        df = utils.read_file(file,
                             separator, ["ID", "datetime", "text"],
                             dtype=object)
        df = utils.drop_nans(df)
        # split by the month prefixes defined per period in the globals module
        for i in range(2, 9):
            period_times = getattr(globals, "p%d_times" % i)
            df_p = df[df["datetime"].str.startswith(tuple(period_times))]
            df_p.to_csv("p%d.csv" % i,
                        sep=separator,
                        index=False,
                        encoding="utf-8")

        print("ok")
    except Exception as ex:
        logger.error(ex)
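The str.startswith(tuple(...)) idiom above keeps rows whose datetime string begins with any of the month prefixes listed for a period. A toy demonstration (the real globals.pN_times values are not shown in the source; the months below are invented purely for illustration):

import pandas as pd

p2_times = ("2016-04", "2016-05")  # hypothetical period-2 month prefixes
df = pd.DataFrame({"datetime": ["2016-04-03", "2016-05-21", "2016-07-01"]})
print(df[df["datetime"].str.startswith(p2_times)])  # keeps the first two rows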
Example #4
async def main():

    if speech_to_text(filename):

        loop = asyncio.get_running_loop()
        data = utils.read_file(filename)
        words_count = utils.total_words(data)
        print("=" * 44)
        print("total spoken words                   -> ", words_count)
        print("=" * 44)

        fut_speech_fluency = loop.create_future()
        loop.create_task(
            utils.rate_speech_on_fluency(fut_speech_fluency, words_count))
        fluency_rating = await fut_speech_fluency

        spelling_rating = rate_spelling(data, words_count)

        fut_unnecessary_filler = loop.create_future()
        loop.create_task(rate_unnecessary_fillers(fut_unnecessary_filler,
                                                  data))
        filler_rating = await fut_unnecessary_filler

        grammar_rating = rate_grammar(data)

        print("=" * 44)
        print("fluency rating             (out of 1)-> ", fluency_rating)
        print("spelling rating            (out of 2)-> ", spelling_rating)
        print("unnecessary fillers rating (out of 1)-> ", filler_rating)
        print("grammar rating             (out of 1)-> ", grammar_rating)

        total_rating = fluency_rating + spelling_rating + filler_rating + grammar_rating
        print("=" * 44)
        print("overall rating             (out of 5)-> ", total_rating)
        print("=" * 44)
Example #5
def extract_convert_lda_input(file):
    # overall process for data preparation as input to the LDA algorithm:
    # first, extract records from Mongo by running the Mongo export
    # JavaScript file, mongo_extract_data_script.js

    logger.info("started LDA related operations")
    df = utils.read_file(file, "~", names=['ID', 'datetime', 'text'])
    df_new = utils.preprocess_text_for_topic_discovery(df)
    df_new.to_csv(file + "_out.csv", index=False)
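preprocess_text_for_topic_discovery is not shown. A rough sketch of the kind of cleaning such a step typically performs before LDA, under the assumption that it returns a cleaned copy of the DataFrame (lowercasing, stripping URLs and @mentions, dropping rows left empty):

def preprocess_text_for_topic_discovery(df):
    out = df.copy()
    out["text"] = (out["text"].astype(str)
                   .str.lower()
                   .str.replace(r"https?://\S+|@\w+", "", regex=True)
                   .str.strip())
    return out[out["text"] != ""]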
Example #6
def extract_stance_changes_of_users_with_only_two_tweets(file):
    # note: this method is no longer used
    try:
        df = utils.read_file(file, "~", globals.STANCE_FILE_COLUMNS)
        dict_users = {}

        for index, row in df.iterrows():
            try:
                user_id = row['user_id']
                datetime_object = datetime.strptime(row['datetime'], '%Y-%m-%d')
                r1 = row['r1']
                if user_id not in dict_users:
                    dict_users[user_id] = {1: (datetime_object, str(r1))}
                else:
                    records = dict_users[user_id]
                    if len(records) == 1:
                        old_datetime_object, old_r1 = records[1]
                        if old_datetime_object > datetime_object:
                            # the new tweet is older: it becomes the first record
                            records[1] = (datetime_object, str(r1))
                            records[2] = (old_datetime_object, str(old_r1))
                        elif old_datetime_object < datetime_object:
                            records[2] = (datetime_object, str(r1))
                    else:
                        old_datetime_object, old_r1 = records[1]
                        if old_datetime_object > datetime_object:
                            records[1] = (datetime_object, str(r1))
                        elif old_datetime_object < datetime_object:
                            old_datetime_object, old_r1 = records[2]
                            if old_datetime_object < datetime_object:
                                records[2] = (datetime_object, str(r1))
            except Exception as ex:
                logger.error(row)
                logger.error(str(ex))
                logger.info(traceback.format_exc())

        counter = 0
        with open(file + "_out.csv", "w") as file_write:
            for key, values_dict in dict_users.items():
                # keep only users with exactly two dated stance records
                if len(values_dict) != 2:
                    continue
                ordered_values = collections.OrderedDict(
                    sorted(values_dict.items()))
                stances = [stance for (date, stance) in ordered_values.values()]
                file_write.write(str(key) + "," + ",".join(stances) + "\n")
                counter += 1
                if counter % 1000 == 0:
                    file_write.flush()

        print("ok")
    except Exception as ex:
        logger.error(str(ex))
        logger.info(traceback.format_exc())
Example #7
def extract_fields_by_r1(file, r1):
    df = utils.read_file(file, "~", names=['ID', 'user_id', 'datetime', 'text', 'r1'])
    df = df[df['r1'] == r1]
    logger.info(df.head())
    # remap labels: 1 -> 0 and 2 -> 1
    if r1 == 1:
        df['r1'] = 0
    elif r1 == 2:
        df['r1'] = 1
    logger.info(df.head())
    df.to_csv(file + "_" + str(r1) + "_out.csv", index=False, columns=['ID', 'user_id', 'datetime', 'text', 'r1'], header=None, sep="~")
Example #8
def r1_stats():
    df = utils.read_file("F:/tmp/full_features.csvl_out.csv", "~", names=['ID', 'user_id', 'datetime', 'text', 'r1'])
    logger.info("started")
    try:
        print(df['r1'].value_counts())
        grouped = df.groupby(['datetime','r1'])['datetime']
        grouped.count().to_csv('F:/tmp/tt.txt')

    except Exception as e:
        logger.error(str(e))
Example #9
def extract_stance_changes_of_users_old(file):
    try:
        df = utils.read_file(file,"~", globals.STANCE_FILE_COLUMNS)
        df_records = df[['user_id', 'r1', 'datetime']].groupby(['user_id', 'r1', 'datetime']).agg({'r1': ["count"]}).reset_index()
        converted = utils.convert_consolidate_monthly_stances(df_records)
        df_final = utils.convert_nested_dict_to_line_chart_input_and_write(converted)
        df_final.to_csv(file+"_out.csv")
        print("ok")
    except Exception as ex:
        logger.error(str(ex))
        logger.info(traceback.format_exc())
Example #10
def analyze_duplicate_tweets(file):
    try:
        df = utils.read_file(file, "~", ['ID', 'user_id', 'datetime', 'text'])
        grouped = df.groupby('text').text.count()
        grouped = grouped[grouped > 5]
        grouped = grouped.sort_index(ascending=False)  # sort_index returns a copy
        logger.info(grouped.value_counts())
        grouped.to_csv('F:/tmp/tt1.txt')

    except Exception as ex:
        logger.error(ex)
Example #11
def analyze_group_by_influence(file):
    try:
        df = utils.read_file(file, "~", ['datetime', 'nb_retweet', 'nb_like'])
        logger.info(df.head())
        grouped_r = df.groupby('datetime')['nb_retweet'].mean()
        grouped_l = df.groupby('datetime')['nb_like'].mean()

        grouped_r.to_csv('F:/tmp/retweet.txt')
        grouped_l.to_csv('F:/tmp/like.txt')

    except Exception as ex:
        logger.error(ex)
Example #12
def time_period_splitter(file, separator):
    logger.info("started")
    logger.info(globals.p2_times)
    logger.info("started to read file:" + file)

    try:
        df = utils.read_file(file,
                             separator, ["ID", "datetime", "text"],
                             dtype=object)
        df = utils.drop_nans(df)
        # split by the month prefixes defined per period in the globals module
        for i in range(2, 9):
            period_times = getattr(globals, "p%d_times" % i)
            df_p = df[df["datetime"].str.startswith(tuple(period_times))]
            df_p.to_csv(data_path + "p%d.csv" % i,
                        sep=separator,
                        index=False,
                        encoding="utf-8")

    except Exception as ex:
        logger.error(ex)
        logger.info(traceback.format_exc())

    logger.info("completed")
Example #13
def plot_stance_transition():
    try:
        scaling_enabled = False
        col_label_list = [
            '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06',
            '2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12',
            '2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
            '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12',
            '2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
            '2018-07', '2018-08', '2018-09'
        ]
        col_list = [str(i) for i in range(1, 34)]  # '1' .. '33'

        df = utils.read_file("F:/tmp/datt_out.csv", names=col_list)
        rowsize,colsize = df.shape
        plt.interactive(False)
        fig = plt.figure(figsize=(20, 15))
        ax = fig.add_subplot(111)

        plt.yticks([0,1])

        counter = 0
        for i in range(colsize - 1):  # pair each month column with the next
            counter += 1
            if(counter % 1000 == 0):
                logger.info(str(counter) + " out of " + str(colsize) + " is completed")
            first = df.iloc[:,i].tolist()
            second = df.iloc[:,i+1].tolist()
            zero_to_zero, one_to_one, zero_to_one, one_to_zero = calculate_combinated_weights(first, second)
            logger.info("zero_to_zero, one_to_one, zero_to_one, one_to_zero:"+str(zero_to_zero) +str(one_to_one) +str(zero_to_one) +str(one_to_zero))
            if scaling_enabled:
                if zero_to_zero!=0.0:
                    zero_to_zero=np.log(zero_to_zero)
                if one_to_one != 0.0:
                    one_to_one=np.log(one_to_one)
                if zero_to_one != 0.0:
                    zero_to_one=np.log(zero_to_one)
                if one_to_zero != 0.0:
                    one_to_zero=np.log(one_to_zero)

            plt.plot([i, i+1],[0, 0],linewidth=zero_to_zero, c="black", solid_capstyle="round")
            plt.plot([i, i+1],[1, 1], linewidth=one_to_one,  c="black", solid_capstyle="round")
            plt.plot([i, i+1],[0, 1], linewidth=zero_to_one, c="black", solid_capstyle="round")
            plt.plot([i, i+1],[1, 0], linewidth=one_to_zero, c="black", solid_capstyle="round")

        ax.set_xticklabels(col_label_list, rotation=45)
        plt.savefig("F:/tmp/pplot")
        print("ok")
    except Exception as ex:
        logger.error(str(ex))
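calculate_combinated_weights is not defined in this snippet. A guess at its behavior, inferred from how its four return values drive the line widths above (the real implementation may scale the counts differently):

def calculate_combinated_weights(first, second):
    # count the stance transitions between two consecutive month columns
    zero_to_zero = one_to_one = zero_to_one = one_to_zero = 0
    for a, b in zip(first, second):
        if a == 0 and b == 0:
            zero_to_zero += 1
        elif a == 1 and b == 1:
            one_to_one += 1
        elif a == 0 and b == 1:
            zero_to_one += 1
        elif a == 1 and b == 0:
            one_to_zero += 1
    return zero_to_zero, one_to_one, zero_to_one, one_to_zero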
Example #14
async def main():

    speech_to_text(filename)
    loop = asyncio.get_running_loop()
    data = utils.read_file(filename)
    words_count = utils.total_words(data)
    logs = '=' * 44 + '\n'
    logs += "total spoken words                   -> " + str(
        words_count) + '\n'
    logs += '=' * 44 + '\n'

    fut_speech_fluency = loop.create_future()
    loop.create_task(
        utils.rate_speech_on_fluency(fut_speech_fluency, words_count))
    fluency_rating = await fut_speech_fluency

    spelling_rating = rate_spelling(data, words_count)

    fut_unnecessary_filler = loop.create_future()
    loop.create_task(rate_unnecessary_fillers(fut_unnecessary_filler, data))
    filler_rating = await fut_unnecessary_filler

    grammar_rating = rate_grammar(data)

    logs += '=' * 44 + '\n'
    logs += "fluency rating             (out of 1)-> " + \
        str(fluency_rating)+'\n'
    logs += "spelling rating            (out of 2)-> " + \
        str(spelling_rating)+'\n'
    logs += "unnecessary fillers rating (out of 1)-> " + \
        str(filler_rating)+'\n'
    logs += "grammar rating             (out of 1)-> " + \
        str(grammar_rating) + '\n'

    total_rating = fluency_rating + spelling_rating + filler_rating + grammar_rating

    logs += '=' * 44 + '\n'
    logs += "overall rating             (out of 5)-> " + str(
        total_rating) + "\n"

    logs += '=' * 44 + '\n'

    f = open("logs.txt", "w")
    f.write(logs)
    f.close
    return total_rating
Example #15
def read_rfc_data(rfc_number):

    try:
        rfc_filename = utils.get_rfc_filename(rfc_number)
        rfc_filename = os.path.abspath(os.path.join(rfc_location, rfc_filename))

        # Get the RFC file stat
        file_stats = os.stat(rfc_filename)
        last_modified = file_stats.st_mtime
        content_length = file_stats.st_size

        # Read the file contents
        rfc_data = utils.read_file(rfc_filename)
    except IOError:
        logger.error("The entered RFC was not found")
        return "", "", ""

    return rfc_data, str(last_modified), str(content_length)
Example #16
def pandas_extract_tweet_text_by_topic_label_random_n_records(
        file, requested_amount, stance):
    try:
        logger.info("started to read file")

        df = utils.read_file(file,
                             "~",
                             names=['ID', 'user_id', 'datetime', 'text', 'r1'])
        df_filtered = df[df['r1'] == stance]
        df_filtered_sample = df_filtered.sample(n=requested_amount)

        df_filtered_sample.to_csv("F:/tmp/random_stance_" + str(stance) +
                                  "_sample" + str(requested_amount) + ".csv",
                                  index=False,
                                  columns=['text', 'r1'],
                                  sep="~",
                                  header=False)
        logger.info("file export operation completed")
    except Exception as ex:
        logger.error(ex)
Example #17
def rate(duration):

    # if speech_to_text(file):
    data = utils.read_file(filename)
    words_count = utils.total_words(data)
    print("actual words spoken: ", words_count)
    fluency_rating = utils.rate_speech_on_fluency(words_count, duration)

    spelling_rating = rate_spelling(data, words_count)

    filler_rating = rate_unnecessary_fillers(data)

    grammar_rating = rate_grammar(data)

    total_rating = fluency_rating + spelling_rating + filler_rating + grammar_rating

    rating = SpeechRater(fluency_rating, spelling_rating, filler_rating,
                         grammar_rating, total_rating)

    return json.dumps(rating, cls=SpeechRaterEncoder)
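SpeechRater and SpeechRaterEncoder are not shown. Their assumed shape, inferred only from the call json.dumps(rating, cls=SpeechRaterEncoder):

import json

class SpeechRater:
    def __init__(self, fluency, spelling, filler, grammar, total):
        self.fluency, self.spelling = fluency, spelling
        self.filler, self.grammar, self.total = filler, grammar, total

class SpeechRaterEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, SpeechRater):
            return obj.__dict__          # serialize the rater as a plain dict
        return super().default(obj)

print(json.dumps(SpeechRater(1, 2, 1, 1, 5), cls=SpeechRaterEncoder))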
Example #18
def topic_discovery():
    try:
        number_of_files_splitted_periods = 1
        for i in range(1, number_of_files_splitted_periods + 4):

            # Load texts
            filename = data_path + 'p' + str(i) + '.csv'
            #filename = "F:/tmp/test"
            df = utils.read_file(filename,
                                 "~",
                                 names=['ID', 'datetime', 'text'])
            texts = df["text"].tolist()

            dictionary, corpus, texts = create_dictionary_corpus(texts)

            lm = LdaModel(
                corpus=corpus,
                id2word=dictionary,
                num_topics=topic_number,
                chunksize=chunksize,
                passes=epochs,
                eval_every=model_eval_every,
                iterations=max_iterations,
                alpha=alpha,
                eta=beta,
            )
            output_visual_file_name = filename + ".vis"
            ml_utils.evaluate_lda_results(corpus, dictionary, texts, lm,
                                          topic_number,
                                          output_visual_file_name,
                                          visual_enabled)
            combined_topic_id_file_name = filename + "_topic_out.csv"
            combine_lda_results_with_lda_output(corpus, lm, df,
                                                combined_topic_id_file_name)

            if lda_model_save_enabled:
                lm.save(data_path + 'LDA_model_' + str(i) + '.lda')

    except Exception as ex:
        logger.error(str(ex))
        logger.info(traceback.format_exc())
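create_dictionary_corpus is not shown; a minimal sketch of what it is assumed to do with gensim, matching its (dictionary, corpus, texts) return signature:

from gensim.corpora import Dictionary

def create_dictionary_corpus(texts):
    # naive whitespace tokenization; the real helper likely does more cleaning
    tokenized = [str(t).lower().split() for t in texts]
    dictionary = Dictionary(tokenized)
    corpus = [dictionary.doc2bow(doc) for doc in tokenized]
    return dictionary, corpus, tokenized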
Example #19
def extract_nb_of_words():
    try:
        logger.info("started")

        df = utils.read_file(
            data_path + "merged_stance_of_tweets.csv",
            "~",
            names=['ID', 'user_id', 'datetime', 'text', 'r1', 'sentiment'])
        df["words_cnt"] = df["text"].str.split().str.len()
        cnt = df["words_cnt"].value_counts()

        total = 0  # avoid shadowing the built-in sum
        counter = 0
        for index, row in cnt.items():  # iteritems() was removed in pandas 2.0
            counter += row
            total += index * row

        logger.info("average nb of words in dataset: " + str(total / counter))

    except Exception as ex:
        logger.error(ex)
Example #20
def discover():
    try:

        logger.info("Started Topic discovery operations")
        # basicConfig lives on the logging module, not on a Logger instance
        logging.basicConfig(level="INFO",
                            filename=globals.WINDOWS_LOG_PATH,
                            format="%(asctime)s %(message)s")

        # data = df.tweet.values.tolist()
        logger.info("started LDA related operations")
        filename_read = "F:/tmp/p3_1000.csv"
        df = utils.read_file(filename_read,
                             ",",
                             names=['ID', 'datetime', 'text'])

        corpus, id2word, data_words_bigrams = text_utils.prepare_lda_input(df)
        logger.info("building LDA model")

        expected_topic_cnt = 20

        lda_model = ml_utils.build_lda_model(corpus, id2word,
                                             expected_topic_cnt)

        ml_utils.evaluate_lda_results(corpus, id2word, data_words_bigrams,
                                      lda_model, expected_topic_cnt,
                                      filename_read)
        utils.combine_lda_results_with_lda_output(corpus, lda_model, df,
                                                  filename_read)

    except Exception as ex:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_exception(exc_type,
                                  exc_value,
                                  exc_traceback,
                                  limit=2,
                                  file=sys.stdout)
        logger.error(ex)
        logger.error("Something bad happened: %s", ex)

    logger.info("Completed everything. Program is being terminated")
Example #21
def pandas_users_stances(file):
    try:
        df = utils.read_file(file, "~", ['ID', 'user_id', 'datetime', 'text', 'r1'])

        users_stance_dict = {}  # avoid shadowing the built-in dict
        logger.info(str(df.shape))

        df_grouped = df[['user_id', 'r1']].groupby(['user_id', 'r1']).agg({'r1':["count"]}).reset_index().groupby('user_id')
        user_id = -1
        counter_remain_users = 0
        counter_leave_users = 0
        logger.info("number of users: " + str(len(df_grouped)))

        # TODO: this part could be simplified with the idxmax function of pandas DataFrame
        for user_id, values_per_stance in df_grouped:
            count_remain = 0
            count_leave = 0
            for value in values_per_stance.values:
                if value[1] == 0:
                    count_remain = value[2]
                elif value[1] == 1:
                    count_leave = value[2]

            user_stance = utils.calculate_user_stance(count_remain, count_leave)
            if user_stance != -1:
                if user_stance == 0:
                    counter_remain_users += 1
                elif user_stance == 1:
                    counter_leave_users += 1
                users_stance_dict[str(user_id)] = user_stance

        logger.info("number of users: [remain, leave]: " + str(counter_remain_users)+","+str(counter_leave_users))
        utils.write_dict_to_file(file + "_users.csv", users_stance_dict)

        return users_stance_dict

    except Exception as ex:
        logger.error(ex)
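utils.calculate_user_stance is not shown. A plausible sketch consistent with its use above (returns 0 for remain, 1 for leave, -1 when there is nothing to decide on; the tie-breaking rule is a guess):

def calculate_user_stance(count_remain, count_leave):
    if count_remain == 0 and count_leave == 0:
        return -1                       # no classified tweets for this user
    return 0 if count_remain >= count_leave else 1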
Example #22
from util.utils import read_file
import matplotlib.pyplot as plt

#load data
epoch = read_file('epoch', 'txt')
avg_loss = read_file('avg_loss', 'txt')
avg_acc = read_file('avg_acc', 'txt')
val_loss = read_file('val_loss', 'txt')

#plot
s = 10
fig = plt.figure()
plt.title('Loss and accuracy', fontsize=s)
ax1 = fig.add_subplot(1, 1, 1)
l1 = ax1.plot(epoch, avg_loss, 'r', label='train loss')
l3 = ax1.plot(epoch, val_loss, 'y', label='val loss')
#plt.legend(bbox_to_anchor=(1.0,0.15))
ax1.set_ylabel('Loss', fontsize=s)
ax2 = ax1.twinx()
l2 = ax2.plot(epoch, avg_acc, 'g', label='train accuracy')
ls = l1 + l2 + l3
labs = [l.get_label() for l in ls]
ax1.legend(ls, labs, bbox_to_anchor=(1.0, 0.95))
ax2.set_ylabel('Accuracy', fontsize=s)
ax1.set_xlabel('Epoch', fontsize=s)
plt.savefig('loss_acu.png')  # save before show(); show() clears the figure
plt.show()
Example #23
from util import utils
from speech_converter import speech_to_text
from grammar_rater import rate_spelling
from grammar_rater import rate_unnecessary_fillers
from grammar_rater import rate_grammar

filename = "speech.txt"

if speech_to_text(filename):
    data = utils.read_file(filename)
    words_count = utils.total_words(data)
    print("total spoken words ", words_count)
    fluency_rating = utils.rate_speech_on_fluency(words_count)
    print("fluency rating out of 1: ", fluency_rating)
    spelling_rating = rate_spelling(data, words_count)
    print("spelling rating out of 2: ", spelling_rating)
    filler_rating = rate_unnecessary_fillers(data)
    print("unnecessary fillers rating out of 1: ", filler_rating)
    grammar_rating = rate_grammar(data)
    print("grammar rating out of 1: ", grammar_rating)
    total_rating = fluency_rating + spelling_rating + filler_rating + grammar_rating
    print("overall rating out of 5: ", total_rating)
Example #24
def populate_missing_data_for_stance_transition(file):
    try:
        logger.info("started populating data")
        cols_list = [
            'id', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
            '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
            '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33'
        ]
        col_list = [
            '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
            '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
            '24', '25', '26', '27', '28', '29', '30', '31', '32', '33'
        ]

        df = utils.read_file(file, names=cols_list, delimiter=",")
        df[cols_list] = df[cols_list].fillna(-1)
        df[cols_list] = df[cols_list].astype(int)
        df[cols_list] = df[cols_list].astype(str)
        df[cols_list] = df[cols_list].replace('-1', np.nan)

        #df[cols_list] = df[cols_list].astype(int, errors='ignore')
        df = df[col_list]

        total_count = df.shape[0]
        counter = 0
        print("here")
        for index, row in df.iterrows():
            try:
                if utils.every_col_is_nan(row):
                    logger.info("dropping row index: " + str(index) +
                                " since all columns are NaN values")
                    df.drop(index, inplace=True)
                    continue
                real_values = {}
                counter += 1
                logger.info(
                    str(counter) + " out of " + str(total_count) +
                    " completed.")
                for i in range(0, row.size):
                    temp = row[i]
                    if isinstance(temp, str):
                        temp = int(temp)
                    if math.isnan(temp):
                        continue
                    real_values[i] = int(row[i])
                ordered_real_values = collections.OrderedDict(
                    sorted(real_values.items()))
                if (counter % 1000 == 0):
                    logger.info("ordering completed for" + str(counter) +
                                " th row, out of " + str(total_count))
                for i in range(0, row.size):
                    col_name = str(i + 1)
                    temp = row[i]
                    if isinstance(temp, str):
                        temp = int(temp)
                    if math.isnan(temp):
                        cnt_dict = 0
                        for key, value in ordered_real_values.items():
                            cnt_dict += 1
                            if i < key:
                                # back-fill from the next known value
                                df.loc[index, col_name] = str(value)
                                break
                            elif (i > key
                                  and cnt_dict == len(ordered_real_values)):
                                # forward-fill from the last known value
                                df.loc[index, col_name] = str(value)
                                break
            except Exception as ex:
                logger.error(str(ex))
                logger.info(traceback.format_exc())

        df.to_csv(file + "_out.csv", index=False, header=False)
        logger.info("completed populating data")
    except Exception as ex:
        logger.error(str(ex))
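utils.every_col_is_nan is not shown; a one-line sketch of the check it is assumed to perform, where row is the pandas Series yielded by DataFrame.iterrows():

def every_col_is_nan(row):
    return row.isna().all()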
Example #25
def populate_missing_data_for_stance_transition_s():
    try:
        logger.info("started populating data")
        cols_list = [
            'id', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
            '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
            '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33'
        ]
        col_list = [
            '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
            '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23',
            '24', '25', '26', '27', '28', '29', '30', '31', '32', '33'
        ]

        df = utils.read_file("F:/tmp/datt.txt",
                             names=cols_list,
                             lineterminator="\r")
        df = df[col_list]

        total_count = df.shape[0]
        counter = 0
        for index, row in df.iterrows():
            if utils.every_col_is_nan(row):
                logger.info("dropping row index: " + str(index) +
                            " since all columns are NaN values")
                df.drop(index, inplace=True)
                continue
            real_values = {}
            counter += 1
            if (counter % 1000 == 0):
                logger.info(
                    str(counter) + " out of " + str(total_count) +
                    " completed.")
            for i in range(0, row.size):
                if math.isnan(row[i]):
                    continue
                real_values[i] = int(row[i])
            ordered_real_values = collections.OrderedDict(
                sorted(real_values.items()))
            if (counter % 1000 == 0):
                logger.info("ordering completed for" + str(counter) +
                            " th row, out of " + str(total_count))
            for i in range(0, row.size):
                if math.isnan(row[i]):
                    cnt_dict = 0
                    for key, value in ordered_real_values.items():
                        cnt_dict += 1
                        if i < key:
                            # write via df.loc: mutating the row copy that
                            # iterrows yields would not propagate to df
                            df.loc[index, col_list[i]] = value
                            break
                        elif (i > key
                              and cnt_dict == len(ordered_real_values)):
                            df.loc[index, col_list[i]] = value
                            break

        df[col_list] = df[col_list].astype(int)
        df.to_csv("F:/tmp/datt_out.csv", index=False, header=False)
        logger.info("completed populating data")
    except Exception as ex:
        logger.error(str(ex))
Example #26
def extract_stance_changes_of_users_before_after_ref(file):
    #note: input file contains the prediction results of tweets on the last column
    try:
        df = utils.read_file(file,"~", globals.STANCE_FILE_COLUMNS)
        dict_users_before_ref = {}
        dict_users_after_ref = {}
        datetime_ref = datetime.strptime("2016-06-24", '%Y-%m-%d')
        # at first, we count the user-centric stance posts for the period before and after ref
        for index, row in df.iterrows():
            try:
                user_id = row['user_id']
                datetime_object = datetime.strptime(row['datetime'], '%Y-%m-%d')
                r1 = int(row['r1'])
                if(datetime_object < datetime_ref):
                    if user_id not in dict_users_before_ref:
                        if (r1 == 0):
                            value = (1, 0)
                        elif (r1 == 1):
                            value = (0, 1)

                    else:
                        (remain,leave) = dict_users_before_ref[user_id]
                        if (r1 == 0):
                            remain += 1
                        elif (r1 == 1):
                            leave +=1
                        value = (remain, leave)
                    dict_users_before_ref[user_id] = value

                elif (datetime_object > datetime_ref):
                    if user_id not in dict_users_after_ref:
                        if (r1 == 0):
                            value = (1, 0)
                        elif (r1 == 1):
                            value = (0, 1)

                    else:
                        (remain,leave) = dict_users_after_ref[user_id]
                        if (r1 == 0):
                            remain += 1
                        elif (r1 == 1):
                            leave +=1
                        value = (remain, leave)
                    dict_users_after_ref[user_id] = value

            except Exception as ex:
                logger.error(row)
                logger.error(str(ex))
                logger.info(traceback.format_exc())
        logger.info("number of people-stance couple before ref: " + str(len(dict_users_before_ref)))
        logger.info("number of people-stance couple after ref: " + str(len(dict_users_after_ref)))
        logger.info("discarding people who have no tweets in the after-ref period. current size: " + str(len(dict_users_before_ref)))

        keys_to_be_deleted_from_dict_before = [
            key for key in dict_users_before_ref
            if key not in dict_users_after_ref
        ]

        for item in keys_to_be_deleted_from_dict_before:
            del dict_users_before_ref[item]
        logger.info("discarded " + str(len(keys_to_be_deleted_from_dict_before)) + " people who have no tweets in the after-ref period. final size: " + str(len(dict_users_before_ref)))

        ########################################
        logger.info("discarding people who have no tweets in the before-ref period. current size: " + str(len(dict_users_after_ref)))

        keys_to_be_deleted_from_dict_after = [
            key for key in dict_users_after_ref
            if key not in dict_users_before_ref
        ]

        for item in keys_to_be_deleted_from_dict_after:
            del dict_users_after_ref[item]
        logger.info("discarded " + str(len(keys_to_be_deleted_from_dict_after)) + " people who have no tweets in the before-ref period. final size: " + str(len(dict_users_after_ref)))


        # now calculating a single value for each user
        dframe = pd.DataFrame(data=None, columns=['before','after'], index=dict_users_before_ref.keys())
        for key, value in dict_users_before_ref.items():
            (remain,leave) = value
            if(remain>=leave):
                dframe.at[key,'before']=0
            else:
                dframe.at[key,'before']=1

        for key, value in dict_users_after_ref.items():
            (remain,leave) = value

            if(remain>=leave):
                dframe.at[key,'after']=0
            else:
                dframe.at[key,'after']=1
        dframe.to_csv(file+"out_stance_change.csv")

    except Exception as ex:
        logger.error(str(ex))
        logger.info(traceback.format_exc())
Example #27
File: model_train.py  Project: AXinx/py_gcn
        print('Epoch:{:04d} Val loss:{:.4f} Val acc:{:.4f}'.format(epoch+1, loss_val.data[0], acc_val.data[0]))

    return loss_train.data[0], acc_train.data[0], los_val_


def test(test_feature,test_label):
    model.eval()
    output = model(test_feature, adj)
    print(output.max(1)[1].data)
    print(test_label.data)
    acc_test = accuracy(output, test_label)
    loss_test = F.nll_loss(output, test_label)
    print("Test set results: loss={:.4f} test acc={:.4f}".format(loss_test.data[0],acc_test.data[0]))

#load data
x_data = read_file('./data/bw_x_data','pkl')
y_data = read_file('./data/bw_y_data','pkl')
A = read_file('./data/bw_adj_data','pkl')

time_step = len(x_data)
nodes = len(x_data[0])

adj = sparse_mx_to_torch_sparse_tensor(A)
adj = Variable(adj)

# Train model
t_total = time.time()
train_step = time_step - 0
ep_avg_loss = []
ep_avg_acc = []
ep = []
Example #28
    return loss_train.data[0], acc_train.data[0], los_val_


def test(test_feature, test_label):
    model.eval()
    output = model(test_feature, adj)
    #print(output.max(1)[1].data)
    #print(test_label.data)
    acc_test = accuracy(output, test_label)
    loss_test = F.nll_loss(output, test_label)
    print("Test set results: loss={:.4f} test acc={:.4f}".format(
        loss_test.data[0], acc_test.data[0]))


#load data
x_data = read_file('./data/x_data', 'pkl')
y_data = read_file('./data/y_data', 'pkl')
A = read_file('./data/adj_data', 'pkl')

time_step = len(x_data)
nodes = len(x_data[0])

adj = sparse_mx_to_torch_sparse_tensor(A)
adj = Variable(adj)

# Train model
test_num = 3
t_total = time.time()
train_step = time_step - test_num
ep_avg_loss = []
ep_avg_acc = []
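sparse_mx_to_torch_sparse_tensor is a common utility in pygcn-style projects; the version used here is not shown, but a typical implementation converts a scipy sparse matrix into a torch sparse COO tensor:

import numpy as np
import torch

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    # coalesce to COO form and build index/value tensors for torch
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64))
    values = torch.from_numpy(sparse_mx.data)
    shape = torch.Size(sparse_mx.shape)
    return torch.sparse_coo_tensor(indices, values, shape)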