def feature_pred(features, chik, ldak):
    global users
    wn.ensure_loaded()
    facts = gt.get_fact_topics(DIR)

    if NEW_DATA:
        users = gt.get_users(DIR)
        transactions = gt.get_transactions(DIR)
        print(transactions.describe())

        tr_hsh = transactions['fact'].values
        # for the castillo baseline, comment the cond2 filter out
        cond = facts['hash'].isin(tr_hsh)
        # parentheses are required: `|` binds tighter than `==`
        cond2 = (facts['true'] == 1) | (facts['true'] == 0)
        facts = facts[cond & cond2]
        facts = Parallel(n_jobs=num_jobs)(delayed(get_features)(
            fact, transactions[transactions['fact'] == fact['hash']], [
                u for u in users if int(u.user_id) in list(transactions[
                    transactions['fact'] == fact['hash']]['user_id'].values)
            ]) for idx, fact in facts.iterrows())
        facts = pd.DataFrame(facts)
        with open('model_data/feature_data', 'wb') as tmpfile:
            pickle.dump(facts, tmpfile)
    else:
        with open('model_data/feature_data', 'rb') as tmpfile:
            facts = pickle.load(tmpfile)

    print(facts[list(features)].describe())
    X = facts[list(features)].values
    y = facts['y'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), PCA(n_components=ldak),
                            SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred_test_std = std_clf.predict(X_test)
    precision, recall, fscore, sup = precision_recall_fscore_support(
        y_test, pred_test_std, average='macro')
    score = metrics.accuracy_score(y_test, pred_test_std)
    print("Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f" %
          (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))

    return acc_scores.mean()
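
# A minimal usage sketch (assumptions: the surrounding module defines DIR,
# NEW_DATA, num_jobs and get_features, and the feature names below are
# illustrative, taken from the feature lists used later in this file).
# Note that `chik` is unused in the body above; only `ldak`, the number of
# PCA components, shapes the pipeline.
if __name__ == '__main__':
    example_features = ('avg_links', 'avg_sentiment', 'fr_has_url', 'lvl_size')
    mean_cv_acc = feature_pred(example_features, chik=10, ldak=4)
    print('Mean 3-fold CV accuracy: %0.3f' % mean_cv_acc)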
Example #2
def main():
    global bow_corpus
    global word_to_idx
    wn.ensure_loaded()
    if NEW_CORPUS:
        bow_corpus = build_bow_corpus(get_users())
        save_corpus(bow_corpus)
    else:
        bow_corpus = get_corpus()

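    # Keep only words that occur more than twice, then build the two-way
    # vocabulary maps used to encode tweets as index sequences.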
    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}

    users = get_users()
    facts = gt.get_fact_topics()
    transactions = gt.get_transactions()
    users_df = pd.DataFrame([vars(u) for u in users])
    print(users_df.describe())
    print(users_df[users_df['stance'] == 0].describe())
    print(users_df[users_df['stance'] == 1].describe())
    print(users_df[users_df['stance'] == 2].describe())
    print(users_df[users_df['stance'] == 3].describe())
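    # Cross-tabulate user stance against fact veracity; judging by the
    # variable names below, stance 0 is deny and stance 1 is support.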
    users_df['f_t'] = users_df['fact'].map(
        lambda x: facts[facts['hash'] == x]['true'].values[0])
    c_true = users_df['f_t'] == '1'
    c_fal = users_df['f_t'] == '0'
    c_fal1 = users_df['f_t'] == 0
    c_den = users_df['stance'] == 0
    c_sup = users_df['stance'] == 1
    print(users_df[c_true & c_sup].describe())
    print(users_df[(c_fal | c_fal1) & c_den].describe())
    print(users_df[(c_fal | c_fal1) & c_sup].describe())
    print(users_df[c_true & c_den].describe())
    print(users_df[users_df['was_correct'] == 1].describe())
    print(users_df[users_df['was_correct'] == 0].describe())
    print(len([t for u in users if u.tweets is not None for t in u.tweets]))

    corpus_analysis(bow_corpus, word_to_idx, idx_to_word)
    # temporal_analysis(get_users())

    cluster_users_on_tweets(users, word_to_idx, idx_to_word)
Example #3
def main():
    global bow_corpus
    global word_to_idx, idx_to_word, fact_to_words
    global bow_corpus_top_n
    wn.ensure_loaded()
    print('Grabbing Data')
    bow_corpus = gt.get_corpus()
    facts = gt.get_fact_topics()
    facts = facts[facts['true'] != 'unknown']

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}
    fact_to_words = {
        r['hash']: [w for w in r['fact_terms']]
        for index, r in facts[['hash', 'fact_terms']].iterrows()
    }

    if NEW_MODEL:
        users = gt.get_users()
        # Prepping lstm model
        top_words = 50000
        X, y, user_order = lstm_cred.get_prebuilt_data()
        X, y, user_order = lstm_cred.balance_classes(X, y, user_order)

        #X_train, X_test, y_train, y_test = train_test_split_every_user(X, y, user_order)
        #X_train, X_test, y_train, y_test = train_test_split_on_facts(X, y, user_order, facts_train.values, users)
        #X_train, X_test, y_train, y_test = lstm_cred.train_test_split_on_users(X, y, user_order, users, 100)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.5)

        X_train, X_test, word_to_idx = lstm_cred.keep_n_best_words(
            X_train, y_train, X_test, y_test, idx_to_word, top_words)
        max_tweet_length = 12
        X_train = sequence.pad_sequences(X_train, maxlen=max_tweet_length)
        X_test = sequence.pad_sequences(X_test, maxlen=max_tweet_length)

        # Training lstm model
        embedding_vector_length = 32
        model = Sequential()
        model.add(
            Embedding(top_words,
                      embedding_vector_length,
                      input_length=max_tweet_length))
        model.add(Dropout(0.2))
        model.add(LSTM(100))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        print(model.summary())
        model.fit(X_train,
                  y_train,
                  validation_data=(X_test, y_test),
                  epochs=5,
                  batch_size=64)
        model.save('model_data/cred_model.h5')
        scores = model.evaluate(X_test, y_test, verbose=0)
        print("Accuracy: %.2f%%" % (scores[1] * 100))

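        # Cache each user's relevant tweets on disk so later runs can skip
        # the parallel extraction pass.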
        if NEW_REL_TWEETS:
            print('Building new relevant tweets')
            users = Parallel(n_jobs=num_jobs)(
                delayed(get_relevant_tweets)(user) for user in users)
            #users = Parallel(n_jobs=num_jobs)(delayed(get_relevant_tweets_test_set)(user, X_test) for user in users)
            user_to_rel_tweet = {
                user.user_id: user.features['relevant_tweets']
                for user in users if 'relevant_tweets' in user.features
            }
            with open('model_data/relevant_tweets.pkl', 'wb') as tmpfile:
                pickle.dump(user_to_rel_tweet, tmpfile)
        else:
            with open('model_data/relevant_tweets.pkl', 'rb') as tmpfile:
                user_to_rel_tweet = pickle.load(tmpfile)
            for user in users:
                if 'relevant_tweets' in user.features:
                    user.features['relevant_tweets'] = user_to_rel_tweet[
                        user.user_id]

        # Build credibility scores for all users on their topic
        print('Computing credibility')
        users = [prebuild_cred(model, u) for u in users]
        users_df = pd.DataFrame([vars(u) for u in users])

        [store_result(u) for u in users]
        with open('model_data/cred_pred_data', 'wb') as tmpfile:
            pickle.dump({'users': users_df, 'map': word_to_idx}, tmpfile)
    else:
        print('Loading users & model')
        with open('model_data/cred_pred_data', 'rb') as tmpfile:
            construct = pickle.load(tmpfile)
        users_df = construct['users']
        word_to_idx = construct['map']

    print('Making cred*sent predictions')
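    # Aggregate the per-user predictions for each fact into a (mean, std)
    # pair, giving one two-dimensional feature vector per rumor.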
    X = []
    y = []
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x = cred_stance_prediction(this_users)
        this_y = facts['true'].iloc[idx]
        X.append((np.average(this_x), np.std(this_x)))
        y.append(int(this_y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), LinearSVC())
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)

    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print(
        "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))

    print('Making cred*stance predictions')
    X = []
    y = []
    all_evidence = []
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x, evidence = only_cred_support_deny_pred(this_users)
        this_y = facts['true'].iloc[idx]
        evidence = sorted(evidence, reverse=True, key=lambda x: x[0])
        # print(facts[facts['hash']==hsh]['text'].values, int(this_y), this_x[-1])
        # print(evidence if len(evidence) <3 else evidence[:3])
        X.append((np.average(this_x), np.std(this_x)))
        y.append(int(this_y))
    print(X[:20])
    print(y[:20])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), LinearSVC())
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)

    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print(
        "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print(acc_scores)
    print(pr_scores)
    print(re_scores)
    print(f1_scores)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
Example #4
def main():
    global bow_corpus
    global word_to_idx, idx_to_word, fact_to_words
    global bow_corpus_top_n
    wn.ensure_loaded()
    print('Grabbing Data')
    bow_corpus = gt.get_corpus()
    facts = gt.get_fact_topics()
    facts = facts[facts['true'] != 'unknown']

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}
    fact_to_words = {
        r['hash']: [w for w in r['fact_terms']]
        for index, r in facts[['hash', 'fact_terms']].iterrows()
    }

    # Credibility data
    print('Loading users & model')
    with open('model_data/cred_pred_data', 'rb') as tmpfile:
        construct = pickle.load(tmpfile)
    users_df = construct['users']
    word_to_idx = construct['map']
    # Feature data
    with open('model_data/feature_data', 'rb') as tmpfile:
        fact_features = pickle.load(tmpfile)
    features = [
        'avg_links', 'avg_sent_neg', 'avg_sentiment', 'fr_has_url', 'lvl_size',
        'avg_len', 'avg_special_symbol', 'avg_time_retweet',
        'avg_count_distinct_words', 'avg_sent_pos', 'cred_pred',
        'cred_pred_std'
    ]

    print('Making cred*stance +best features predictions')
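    # Attach the credibility predictions as two extra columns so they can be
    # combined with the handcrafted features loaded above.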
    facts['cred_pred'] = facts['hash'].map(
        lambda x: only_cred_support_deny_pred(users_df[users_df['fact'] == x]))
    facts['cred_pred_std'] = facts['cred_pred'].map(lambda x: np.std(x))
    facts['cred_pred'] = facts['cred_pred'].map(lambda x: x[-1])
    facts = facts.set_index('hash').join(fact_features.set_index('hash'),
                                         rsuffix='_other')
    X = facts[features].values
    y = facts['y'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)

    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print(
        "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
Example #5
def main(k_tweets):
    global bow_corpus
    global word_to_idx, idx_to_word, fact_to_words
    global bow_corpus_top_n
    wn.ensure_loaded()
    print('Grabbing Data')
    bow_corpus = gt.get_corpus()
    facts = gt.get_fact_topics()
    facts = facts[facts['true'] != 'unknown']

    bow_corpus_tmp = [w[0] for w in bow_corpus.items() if w[1] > 2]
    word_to_idx = {k: idx for idx, k in enumerate(bow_corpus_tmp)}
    idx_to_word = {idx: k for k, idx in word_to_idx.items()}
    fact_to_words = {
        r['hash']: [w for w in r['fact_terms']]
        for index, r in facts[['hash', 'fact_terms']].iterrows()
    }
    users = gt.get_users()

    if NEW_MODEL:
        # Prepping lstm model
        top_words = 50000
        X, y, user_order = lstm_cred.get_prebuilt_data()
        X, y, user_order = lstm_cred.balance_classes(X, y, user_order)

        X_train, X_test, y_train, y_test = train_test_split_every_user(
            X, y, user_order)
        #X_train, X_test, y_train, y_test = train_test_split_on_facts(X, y, user_order, facts_train.values, users)
        #X_train, X_test, y_train, y_test = lstm_cred.train_test_split_on_users(X, y, user_order, users, 100)
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

        X_train, X_test, word_to_idx = lstm_cred.keep_n_best_words(
            X_train, y_train, X_test, y_test, idx_to_word, top_words)
        max_tweet_length = 12
        X_train = sequence.pad_sequences(X_train, maxlen=max_tweet_length)
        X_test = sequence.pad_sequences(X_test, maxlen=max_tweet_length)

        # Training lstm model
        embedding_vector_length = 32
        model = Sequential()
        model.add(
            Embedding(top_words,
                      embedding_vector_length,
                      input_length=max_tweet_length))
        model.add(Dropout(0.2))
        model.add(LSTM(100))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        #print(model.summary())
        model.fit(X_train,
                  y_train,
                  validation_data=(X_test, y_test),
                  epochs=5,
                  batch_size=64,
                  verbose=0)
        model.save('model_data/cred_model.h5')
        #scores = model.evaluate(X_test, y_test, verbose=0)
        #print("Accuracy: %.2f%%" % (scores[1] * 100))
    else:
        model = load_model('model_data/cred_model.h5')

    if NEW_REL_TWEETS:
        print('Building new relevant tweets')
        users = Parallel(n_jobs=num_jobs)(delayed(get_relevant_tweets)(user)
                                          for user in users)
        #users = Parallel(n_jobs=num_jobs)(delayed(get_relevant_tweets_test_set)(user, X_test) for user in users)
        user_to_rel_tweet = {
            user.user_id: user.features['relevant_tweets']
            for user in users if 'relevant_tweets' in user.features
        }
        with open('model_data/relevant_tweets.pkl', 'wb') as tmpfile:
            pickle.dump(user_to_rel_tweet, tmpfile)
    else:
        with open('model_data/relevant_tweets.pkl', 'rb') as tmpfile:
            user_to_rel_tweet = pickle.load(tmpfile)
        for user in users:
            if ('relevant_tweets' in user.features
                    and user.user_id in user_to_rel_tweet):
                user.features['relevant_tweets'] = user_to_rel_tweet[user.user_id]
            else:
                user.features['relevant_tweets'] = []

    if NEW_CRED:
        # Build credibility scores for all users on their topic
        print('Computing credibility')
        users = [prebuild_cred(model, u, k_tweets) for u in users]
        users_df = pd.DataFrame([vars(u) for u in users])

        [store_result(u) for u in users]
        with open('model_data/cred_pred_data', 'wb') as tmpfile:
            pickle.dump({'users': users_df, 'map': word_to_idx}, tmpfile)
    else:
        print('Loading users & model')
        with open('model_data/cred_pred_data', 'rb') as tmpfile:
            construct = pickle.load(tmpfile)
        users_df = construct['users']
        word_to_idx = construct['map']

    print('Making cred*stance predictions')
    X = []
    y = []
    all_evidence = []
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x, evidence = only_cred_support_deny_pred(this_users)
        this_y = facts['true'].iloc[idx]
        evidence = sorted(evidence, reverse=True, key=lambda x: x[0])
        #print(facts[facts['hash']==hsh]['text'].values, int(this_y), this_x[-1])
        #print(evidence if len(evidence) <3 else evidence[:3])
        X.append((this_x[-1], np.std(this_x)))
        y.append(int(this_y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)

    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print(
        "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))

    if EXP1:
        return

    # Pred with faulty stance

    print('Making cred * faulty stance predictions')
    X = []
    y = []
    all_evidence = []
    with open('model_data/faulty_stances.json', 'rb') as tmpfile:
        f_stances_raw = json.load(tmpfile)
    # shift the raw stance labels: 0 -> 1, 1 -> 2, 3 -> 3, anything else -> 0
    f_stances = {}
    for k, v in f_stances_raw.items():
        this_val = 0
        if v == 0:
            this_val = 1
        elif v == 1:
            this_val = 2
        elif v == 3:
            this_val = 3
        f_stances[k] = this_val

    #print(sum([1 for x in users_df['tweet_id'].values if str(x) not in f_stances]))
    users_df['true_stance'] = users_df['stance']
    users_df['stance'] = users_df['tweet_id'].map(
        lambda x: f_stances[str(x)] if str(x) in f_stances else users_df[
            users_df['tweet_id'] == x]['true_stance'].values[0])
    #print(users_df[['stance', 'true_stance']])
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x, evidence = only_cred_support_deny_pred(this_users)
        this_y = facts['true'].iloc[idx]
        X.append((this_x[-1], np.std(this_x)))
        y.append(int(this_y))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(), SVC(C=1, gamma=1))
    std_clf.fit(X_train, y_train)
    pred = std_clf.predict(X_test)

    score = metrics.accuracy_score(y_test, pred)
    precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
        y_test, pred, average='macro')
    print(
        "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))

    # Pred with cred and standard features
    print('Making cred * stance plus standard feature predictions')
    with open('model_data/feature_data', 'rb') as tmpfile:
        fact_features = pickle.load(tmpfile)
    features = [
        'avg_links', 'avg_sent_neg', 'avg_sentiment', 'fr_has_url', 'lvl_size',
        'avg_len', 'avg_special_symbol', 'avg_time_retweet',
        'avg_count_distinct_words', 'avg_sent_pos'
    ]

    X = []
    y = []
    users_df['stance'] = users_df['true_stance']
    #print(fact_features['hash'])
    for idx, hsh in enumerate(facts['hash'].values):
        this_users = users_df[users_df['fact'] == hsh]
        this_x, evidence = only_cred_support_deny_pred(this_users)
        this_y = facts['true'].iloc[idx]

        this_fact_features = [0] * len(features)
        if hsh in fact_features['hash'].values:
            this_fact_features = fact_features[fact_features['hash'] == hsh][
                list(features)].values

        X.append(
            np.concatenate(([this_x[-1], np.std(this_x)], this_fact_features),
                           axis=None))
        #X.append([this_x[-1], np.std(this_x)] + this_fact_features)
        y.append(int(this_y))

    from sklearn.model_selection import KFold
    kf = KFold(n_splits=3)  # define the split: 3 folds
    kf.get_n_splits(X)  # number of splitting iterations in the cross-validator
    for train_index, test_index in kf.split(X):
        X_train, X_test, y_train, y_test = np.asarray(
            X)[train_index], np.asarray(X)[test_index], np.asarray(
                y)[train_index], np.asarray(y)[test_index]
        #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

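        # Soft-vote the two classifiers: average the class probabilities of
        # the cred-only SVM and the feature-only SVM, then take the argmax.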
        X_train_cred = np.asarray(X_train)[:, :2]
        X_test_cred = np.asarray(X_test)[:, :2]
        std_clf = make_pipeline(StandardScaler(),
                                SVC(C=1, gamma=1, probability=True))
        std_clf.fit(X_train_cred, y_train)
        pred_cred = std_clf.predict_proba(X_test_cred)

        X_train_feat = np.asarray(X_train)[:, 2:]
        X_test_feat = np.asarray(X_test)[:, 2:]
        std_clf = make_pipeline(StandardScaler(), PCA(n_components=8),
                                SVC(C=1, gamma=1, probability=True))
        std_clf.fit(X_train_feat, y_train)
        pred_feat = std_clf.predict_proba(X_test_feat)

        #print(pred_feat)

        pred_proba = np.add(pred_cred, pred_feat)
        #print(pred_proba)

        pred = [np.argmax(x) for x in np.divide(pred_proba, 2)]
        print(pred)
        print(y_test)

        score = metrics.accuracy_score(y_test, pred)
        precision, recall, fscore, sup = metrics.precision_recall_fscore_support(
            y_test, pred, average='macro')
        print(
            "Rumors: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
            % (score, precision, recall, fscore))
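
# A minimal usage sketch (assumption: k_tweets bounds how many relevant
# tweets prebuild_cred considers per user; the value below is illustrative).
if __name__ == '__main__':
    main(k_tweets=10)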
Example #6
def main():
    global users
    wn.ensure_loaded()
    facts = gt.get_fact_topics(DIR)
    features = [
        'avg_mentions', 'avg_emoticons', 'avg_links', 'avg_questionM',
        'avg_personal_pronoun_first', 'avg_sent_pos', 'avg_sent_neg',
        'avg_sentiment', 'fr_has_url', 'share_most_freq_author', 'lvl_size',
        'avg_followers', 'avg_friends', 'avg_status_cnt', 'avg_reg_age'
    ]
    if NEW_DATA:
        users = gt.get_users(DIR)
        transactions = gt.get_transactions(DIR)
        print(transactions.describe())

        tr_hsh = transactions['fact'].values
        cond = facts['hash'].isin(tr_hsh)
        facts = facts[cond]
        facts = pd.DataFrame([
            get_features(fact, transactions, users)
            for idx, fact in facts.iterrows() if fact['true'] != 'unknown'
        ])
        with open('model_data/castillo_data', 'wb') as tmpfile:
            pickle.dump(facts, tmpfile)
    else:
        with open('model_data/castillo_data', 'rb') as tmpfile:
            facts = pickle.load(tmpfile)
    print(facts.describe())
    X = facts[list(features)].values
    y = facts['y'].values

    fig = plt.figure()
    fig.subplots_adjust(hspace=0.4, wspace=0.4)
    for i in range(1, len(features) + 1):
        ax = fig.add_subplot(3, 5, i)
        sns.boxplot(x="y", y=features[i - 1], data=facts, palette="Set3", ax=ax)
    #plt.show()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    std_clf = make_pipeline(StandardScaler(),
                            DecisionTreeClassifier(random_state=42))
    std_clf.fit(X_train, y_train)
    pred_test_std = std_clf.predict(X_test)
    precision, recall, fscore, sup = precision_recall_fscore_support(
        y_test, pred_test_std, average='macro')
    score = metrics.accuracy_score(y_test, pred_test_std)
    print(
        "Random split: Accuracy: %0.3f, Precision: %0.3f, Recall: %0.3f, F1 score: %0.3f"
        % (score, precision, recall, fscore))
    acc_scores = cross_val_score(std_clf, X, y, cv=3)
    pr_scores = cross_val_score(std_clf, X, y, scoring='precision', cv=3)
    re_scores = cross_val_score(std_clf, X, y, scoring='recall', cv=3)
    f1_scores = cross_val_score(std_clf, X, y, scoring='f1', cv=3)
    print("\t Cross validated Accuracy: %0.3f (+/- %0.3f)" %
          (acc_scores.mean(), acc_scores.std() * 2))
    print("\t Cross validated Precision: %0.3f (+/- %0.3f)" %
          (pr_scores.mean(), pr_scores.std() * 2))
    print("\t Cross validated Recall: %0.3f (+/- %0.3f)" %
          (re_scores.mean(), re_scores.std() * 2))
    print("\t Cross validated F1: %0.3f (+/- %0.3f)" %
          (f1_scores.mean(), f1_scores.std() * 2))
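
# A minimal usage sketch for the castillo-style baseline above; uncomment
# plt.show() in main() to inspect the per-feature boxplots.
if __name__ == '__main__':
    main()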