Example #1
def load_data_from_file():
    """
    reads tweets downloaded with twitter_download.py
    and extracts two fields:
    data are the tweets
    labels are the handles corresponding to each tweet
    returns list of tweets and list of labels
    """
    handles = get_handles()
    directory = './tweets/'
    data = []
    labels = []

    for handle in handles:
        filename = directory + handle + '_tweets.csv'
        print(f'[load_data_from_file]: Loading {filename}')

        dat, lab = load_data_by_name(handle, directory)

        data += dat
        labels += lab

        print(
            f'[load_data_from_file]: cumulative data size = {len(data)} tweets\n'
        )

    return data, labels
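
This and the later examples lean on two project helpers that are not shown on this page, get_handles() and load_data_by_name(). A minimal sketch of plausible implementations, assuming the handles come from a hard-coded list and each CSV holds one tweet per row with the text in the first column (the handle values and the CSV layout are assumptions, not the project's actual code):

import csv

def get_handles():
    # Hypothetical stub: the real project may read these from a config
    # file; the handles below are placeholders.
    return ['handle_a', 'handle_b', 'handle_c']

def load_data_by_name(handle, directory):
    # Assumed CSV layout: one tweet per row, text in the first column.
    data, labels = [], []
    with open(f'{directory}{handle}_tweets.csv', newline='', encoding='utf-8') as f:
        for row in csv.reader(f):
            if row:
                data.append(row[0])
                labels.append(handle)
    return data, labels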
Example #2
def classify(X_train, y_train, X_test, y_test, params, clf_algo, clf_name,
             method):
    '''Dispatcher:
    1. train a model via train_classifier
    2. compute and plot the confusion matrix
    3. optionally plot ROC curves and AUC
    '''
    handles = get_handles()
    clf = train_classifier(clf_algo, params, X_train, y_train)
    result_dispatcher(clf, X_test, y_test, np.array(handles), clf_name, method)
    if ROC_flag:  # module-level flag set elsewhere in the script
        ROC(clf, clf_name, method, X_train, X_test, y_train, y_test, save=True)
    return clf
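
train_classifier itself is not shown here. A plausible reading, assuming clf_algo is a scikit-learn estimator class and params its keyword arguments (the real helper may add timing, logging, or cross-validation):

def train_classifier(clf_algo, params, X_train, y_train):
    # Sketch only: instantiate the estimator with the given
    # hyperparameters and fit it on the training split.
    clf = clf_algo(**params)
    clf.fit(X_train, y_train)
    return clf

Under that reading, a call might look like classify(X_train, y_train, X_test, y_test, {'C': 1.0}, LinearSVC, 'LinearSVC', 'tfidf').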
Example #3
def load_all_tweets():
    directory = './tweets/'
    data = []
    labels = []

    handles = get_handles()

    for handle in handles:
        # Unlike Example #1, this keeps one sub-list of tweets per handle
        # instead of flattening everything into a single list.
        datum, label = load_data_by_name(handle, directory)
        data.append(datum)
        labels.append(label)
    print('[load_all_tweets] tweets loaded\n')
    return data, labels
Example #4
def analyzer():
    handles = get_handles()
    data = []
    # For each handle, pull the display name and avatar from the user's timeline
    for handle in handles:
        data_dict = {}
        tweets = api.user_timeline(handle)
        data_dict['Handle'] = handle
        data_dict['Name'] = tweets[0]['user']['name']
        data_dict['Image'] = tweets[0]['user'][
            'profile_image_url_https'].replace('normal', '400x400')
        data.append(data_dict)

    # Set up the VADER sentiment analyzer (named sia so it does not
    # shadow the enclosing function)
    sia = SentimentIntensityAnalyzer()

    # For each user: search accounts matching the display name (each
    # result's 'status' field holds that account's latest tweet) and
    # fetch news articles mentioning the name
    for user in data:
        compound_scores = []
        tweets = api.search_users(user['Name'])
        query_name = user['Name']
        articles = requests.get('https://newsapi.org/v2/everything',
                                params={'q': query_name,
                                        'language': 'en',
                                        'apiKey': api_key}).json()

        # Run sentiment analysis on tweets and append the average
        # compound sentiment score to data
        for tweet in tweets:
            try:
                sent = sia.polarity_scores(tweet['status']['text'])
                compound_scores.append(sent['compound'])
            except KeyError:
                pass

        for article in articles['articles']:
            if article['content']:
                senti = sia.polarity_scores(article['content'])
                compound_scores.append(senti['compound'])

        # Guard against an empty score list (np.mean([]) is nan)
        user['Score'] = np.mean(compound_scores) if compound_scores else 0.0

    # Convert the list of dictionaries to a dataframe
    data_df = pd.DataFrame(data)

    # Sort the dataframe by Score in descending order
    data_df_sorted = data_df.sort_values(by='Score', ascending=False)

    # Convert the dataframe back to a list of dictionaries
    data_ordered = data_df_sorted.to_dict('records')

    return data_ordered
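
The scoring above relies on VADER's polarity_scores, which returns a dict with neg, neu, pos, and compound keys; compound is a normalized score in [-1, 1], which is why averaging it gives a per-user sentiment ranking. A standalone check (using the vaderSentiment package; the nltk port exposes the same class):

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
scores = sia.polarity_scores('Great news, the launch went perfectly!')
print(scores)  # {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}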
Example #5
def ROC_plot(fpr, tpr, roc_auc, n_classes, ix, gmean, algo, method, save=True):
    '''Plots one-vs-rest ROC curves, one per class.
    ix: index of the best threshold for each class
    gmean: geometric mean sqrt(TPR * (1 - FPR)) at that threshold
    '''
    color_dict = get_color_dict()
    names = get_names()
    handles = get_handles()

    lw = 1
    figsize = 6
    title = f'ROC {method}+{algo}'

    # Make plot
    plt.figure(figsize=(figsize, figsize))
    colors = cycle([
        color_dict[handles[0]], color_dict[handles[1]], color_dict[handles[2]]
    ])

    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i],
                 tpr[i],
                 color=color,
                 linewidth=lw,
                 label=f'ROC {names[i]} (auc = {roc_auc[i]:0.2f})')
        if i == 0:  # label the marker once so the legend shows a single 'Best' entry
            plt.scatter(fpr[i][ix[i]],
                        tpr[i][ix[i]],
                        marker='o',
                        color='black',
                        label=f'Best (gmeans={gmean:.3f})')
        else:
            plt.scatter(fpr[i][ix[i]],
                        tpr[i][ix[i]],
                        marker='o',
                        color='black')
    plt.plot([0, 1], [0, 1], 'k--', linewidth=lw, label='No Skill')  # diagonal

    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    if save:
        plt.savefig(clean_filename(title, 'png', plot_directory))
    plt.show()
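
The fpr, tpr, roc_auc, ix, and gmean arguments are standard per-class ROC artifacts. A sketch of how a caller might build them with scikit-learn, assuming y_test_bin is a one-vs-rest binarized label matrix and y_score the classifier's per-class scores (both names are assumptions):

import numpy as np
from sklearn.metrics import roc_curve, auc

fpr, tpr, roc_auc, ix = {}, {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    # Geometric mean of sensitivity and specificity; its argmax is the
    # best-threshold index that ROC_plot highlights.
    gmeans = np.sqrt(tpr[i] * (1 - fpr[i]))
    ix[i] = np.argmax(gmeans)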
Example #6
def to_symmetric(arr, n):
    '''Rebuilds an n x n symmetric matrix from arr, the flattened upper
    triangle. (Name, signature, and loop headers are reconstructed; the
    original snippet begins mid-function.)'''
    mat = np.zeros((n, n))
    k = 0
    for i in range(n):
        for j in range(i + 1, n):
            mat[i][j] = arr[k]
            mat[j][i] = mat[i][j]
            k += 1
    return mat


if __name__ == '__main__':
    t0 = time.time()
    name_dict = get_name_dict()

    stemmer_flag = False
    max_ngram = 2

    # Load tweets from csv files
    names = list()
    handles = get_handles()
    for handle in handles:
        names.append(name_dict[handle])

    raw_corpus, raw_labels = load_all_tweets()
    corpus, labels = iterate_preprocess(raw_corpus, raw_labels, handles,
                                        stemmer_flag)

    for ngram in range(1, max_ngram + 1):  # loop through ngrams
        # Create three sets of tokens
        tokens = list()
        for tweets in corpus:
            tk = list()
            for tweet in tweets:
                tk += nltk.word_tokenize(tweet)
            tokens.append(set(nltk.ngrams(tk, n=ngram)))
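
The fragment at the top of this example packs a flat array into a symmetric matrix; a plausible source for that array, assuming it holds the pairwise Jaccard similarities of the per-handle ngram sets in the same upper-triangle order (the helper name is hypothetical):

def pairwise_jaccard(token_sets):
    # Jaccard similarity for each unordered pair of ngram sets, emitted
    # in upper-triangle order (i < j), matching the matrix builder above.
    sims = []
    n = len(token_sets)
    for i in range(n):
        for j in range(i + 1, n):
            union = token_sets[i] | token_sets[j]
            inter = token_sets[i] & token_sets[j]
            sims.append(len(inter) / len(union) if union else 0.0)
    return sims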
Example #7
    name_dict = get_name_dict()
    color_dict = get_color_dict()

    Gcorpus_nodes = np.array([7350, 8872, 7249])
    Gcorpus_edges = np.array([81032, 103925, 81362])
    Gcore_nodes = np.array([271, 280, 229])
    Gcore_edges = np.array([8007, 8672, 6847])
    Gtruss_nodes = np.array([59, 63, 63])
    Gtruss_edges = np.array([996, 1153, 1249])

    Gcorpus_sparsity = np.zeros(3)
    Gcore_sparsity = np.zeros(3)
    Gtruss_sparsity = np.zeros(3)

    # Sparsity here is the graph density: edges / (n choose 2)
    for i, _handle in enumerate(get_handles()):
        nodes = Gcorpus_nodes[i]
        edges = Gcorpus_edges[i]
        Gcorpus_sparsity[i] = edges / (.5 * nodes * (nodes - 1))
        nodes = Gcore_nodes[i]
        edges = Gcore_edges[i]
        Gcore_sparsity[i] = edges / (.5 * nodes * (nodes - 1))
        nodes = Gtruss_nodes[i]
        edges = Gtruss_edges[i]
        Gtruss_sparsity[i] = edges / (.5 * nodes * (nodes - 1))

    plot_Gmeta(Gcorpus_nodes, Gcore_nodes, Gtruss_nodes,
               'Order of dense subgraphs', '# vertices (log)', False)

    plot_Gmeta(Gcorpus_edges, Gcore_edges, Gtruss_edges,
               'Size of dense subgraphs', '# edges (log)', False)
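
plot_Gmeta itself is not shown. Given the titles and the '(log)' axis labels, a minimal sketch, assuming it draws one bar group per handle on a log-scaled y-axis (the grouping and styling are guesses):

import numpy as np
import matplotlib.pyplot as plt

def plot_Gmeta(corpus_vals, core_vals, truss_vals, title, ylabel, save=False):
    # Sketch only: grouped bars per handle, log y-axis as the label suggests.
    x = np.arange(len(corpus_vals))
    width = 0.25
    plt.figure(figsize=(6, 4))
    plt.bar(x - width, corpus_vals, width, label='corpus')
    plt.bar(x, core_vals, width, label='core')
    plt.bar(x + width, truss_vals, width, label='truss')
    plt.yscale('log')
    plt.xticks(x, get_handles())
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    if save:
        plt.savefig(clean_filename(title, 'png', plot_directory))
    plt.show()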