def load_data_from_file():
    """Reads tweets downloaded with twitter_download.py and extracts two fields:
        data   are the tweets
        labels are the handles corresponding to each tweet
    Returns a list of tweets and a list of labels.
    """
    handles = get_handles()
    directory = './tweets/'
    data = []
    labels = []
    for handle in handles:
        filename = directory + handle + '_tweets.csv'
        print(f'[load_data_from_file]: Loading {filename}')
        dat, lab = load_data_by_name(handle, directory)
        data += dat
        labels += lab
        print(f'[load_data_from_file]: cumulative data size = {len(data)} tweets\n')
    return data, labels
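# Illustrative sketch (not in the original module): `load_data_by_name` is
# called above but not shown in this excerpt. The hedged version below assumes
# each <handle>_tweets.csv written by twitter_download.py has a header row with
# a 'text' column; adjust the column name to the real CSV layout.
import csv


def load_data_by_name(handle, directory):
    """Hypothetical helper: read one handle's CSV and return (tweets, labels)."""
    tweets, labels = [], []
    with open(f'{directory}{handle}_tweets.csv', newline='', encoding='utf-8') as f:
        for row in csv.DictReader(f):
            tweets.append(row['text'])      # assumed column name
            labels.append(handle)           # label each tweet with its author's handle
    return tweets, labels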
def classify(X_train, y_train, X_test, y_test, params, clf_algo, clf_name, method):
    '''Dispatcher:
        1. send to train_classifier to get model
        2. send to compute & plot confusion matrix
        3. optionally send to plot ROC & AUC curves
    '''
    handles = get_handles()
    clf = train_classifier(clf_algo, params, X_train, y_train)
    result_dispatcher(clf, X_test, y_test, np.array(handles), clf_name, method)
    if ROC_flag:
        ROC(clf, clf_name, method, X_train, X_test, y_train, y_test, save=True)
    return clf
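# Illustrative sketch (not in the original module): one way classify() might be
# wired up end to end. TfidfVectorizer, LogisticRegression, and the params dict
# are stand-ins chosen for illustration; the exact clf_algo/params convention is
# whatever train_classifier() in this repo expects, and the module-level
# ROC_flag is assumed to be set.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def example_classification_run():
    data, labels = load_data_from_file()
    X = TfidfVectorizer().fit_transform(data)           # assumed featurization
    y = LabelEncoder().fit_transform(labels)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=0)
    return classify(X_train, y_train, X_test, y_test,
                    params={'C': 1.0},                   # assumed hyperparameters
                    clf_algo=LogisticRegression,
                    clf_name='Logistic Regression',
                    method='TF-IDF')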
def load_all_tweets():
    """Loads tweets for every handle, keeping one sub-list (and label list) per handle."""
    directory = './tweets/'
    data = []
    labels = []
    handles = get_handles()
    for handle in handles:
        datum, label = load_data_by_name(handle, directory)
        data.append(datum)
        labels.append(label)
    print('[load_all_tweets] tweets loaded\n')
    return data, labels
def analyzer():
    handles = get_handles()
    data = []

    # Grab twitter handles and append the name and image to data
    for handle in handles:
        data_dict = {}
        tweets = api.user_timeline(handle)
        data_dict['Handle'] = handle
        data_dict['Name'] = tweets[0]['user']['name']
        data_dict['Image'] = tweets[0]['user'][
            'profile_image_url_https'].replace('normal', '400x400')
        data.append(data_dict)

    # Set up the sentiment analyzer
    analyzer = SentimentIntensityAnalyzer()

    # Grab tweets and news articles mentioning the user name of each handle
    for user in data:
        compound_scores = []
        tweets = api.search_users(user['Name'])
        query_name = user['Name']
        articles = requests.get('https://newsapi.org/v2/everything?q=' + query_name +
                                '&language=en' + '&apiKey=' + api_key).json()

        # Run sentiment analysis on the tweets and articles, collect the
        # compound scores, and store the average per user
        for tweet in tweets:
            try:
                sent = analyzer.polarity_scores(tweet['status']['text'])
                compound_scores.append(sent['compound'])
            except KeyError:
                pass
        for article in articles['articles']:
            if article['content']:
                senti = analyzer.polarity_scores(article['content'])
                compound_scores.append(senti['compound'])
        user['Score'] = np.mean(compound_scores)

    # Convert the list of dictionaries to a dataframe
    data_df = pd.DataFrame(data)
    # Sort the dataframe by Score in descending order
    data_df_sorted = data_df.sort_values(by='Score', ascending=False)
    # Convert the dataframe back to a list of dictionaries
    data_ordered = data_df_sorted.to_dict('records')

    return data_ordered
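# Illustrative sketch (not in the original module): analyzer() relies on
# module-level objects (`api`, `api_key`, `SentimentIntensityAnalyzer`, `np`,
# `pd`, `requests`) configured elsewhere in the repo. One plausible stand-alone
# setup is shown below; the environment-variable names and the choice of
# tweepy's JSONParser (so tweets index like dicts) are assumptions.
import os

import numpy as np
import pandas as pd
import requests
import tweepy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

auth = tweepy.OAuthHandler(os.environ['TWITTER_API_KEY'],
                           os.environ['TWITTER_API_SECRET'])
auth.set_access_token(os.environ['TWITTER_ACCESS_TOKEN'],
                      os.environ['TWITTER_ACCESS_SECRET'])
api = tweepy.API(auth, parser=tweepy.parsers.JSONParser())
api_key = os.environ['NEWSAPI_KEY']      # NewsAPI key used in the articles query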
def ROC_plot(fpr, tpr, roc_auc, n_classes, ix, gmean, algo, method, save=True):
    '''Plots per-class ROC curves.
    gmean = geometric mean of TPR and (1 - FPR), used to mark the best operating point
    '''
    color_dict = get_color_dict()
    names = get_names()
    handles = get_handles()
    lw = 1
    figsize = 6
    title = f'ROC {method}+{algo}'

    # Make plot
    plt.figure(figsize=(figsize, figsize))
    colors = cycle([
        color_dict[handles[0]], color_dict[handles[1]], color_dict[handles[2]]
    ])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, linewidth=lw,
                 label=f'ROC {names[i]} (auc = {roc_auc[i]:0.2f})')
        if i == 0:
            plt.scatter(fpr[i][ix[i]], tpr[i][ix[i]], marker='o', color='black',
                        label=f'Best (gmeans={gmean:.3f})')
        else:
            plt.scatter(fpr[i][ix[i]], tpr[i][ix[i]], marker='o', color='black')
    plt.plot([0, 1], [0, 1], 'k--', linewidth=lw, label='No Skill')  # diagonal
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")
    if save:
        plt.savefig(clean_filename(title, 'png', plot_directory))
    plt.show()
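# Illustrative sketch (not in the original module): ROC_plot() expects per-class
# fpr/tpr/roc_auc containers plus ix, the index of each class's best threshold.
# One way to build them with scikit-learn is shown below; the one-vs-rest
# binarization, the use of predict_proba, and the geometric-mean criterion are
# assumptions, not necessarily what this repo's ROC() helper does.
import numpy as np
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import label_binarize


def prepare_roc_inputs(clf, X_test, y_test, n_classes):
    y_bin = label_binarize(y_test, classes=list(range(n_classes)))
    y_score = clf.predict_proba(X_test)

    fpr, tpr, roc_auc, ix = {}, {}, {}, {}
    best_gmean = 0.0
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_score[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        gmeans = np.sqrt(tpr[i] * (1 - fpr[i]))   # geometric mean per threshold
        ix[i] = int(np.argmax(gmeans))            # best threshold for class i
        best_gmean = max(best_gmean, float(gmeans[ix[i]]))
    return fpr, tpr, roc_auc, ix, best_gmean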
            mat[i][j] = arr[k]
            mat[j][i] = mat[i][j]
            k += 1
    return mat


if __name__ == '__main__':
    t0 = time.time()
    name_dict = get_name_dict()
    stemmer_flag = False
    max_ngram = 2

    # Load tweets from csv files
    names = list()
    handles = get_handles()
    for handle in handles:
        names.append(name_dict[handle])
    raw_corpus, raw_labels = load_all_tweets()
    corpus, labels = iterate_preprocess(raw_corpus, raw_labels, handles,
                                        stemmer_flag)

    for ngram in range(1, max_ngram + 1):  # loop through ngrams
        # Create three sets of tokens
        tokens = list()
        for tweets in corpus:
            tk = list()
            for tweet in tweets:
                tk += nltk.word_tokenize(tweet)
            tokens.append(set(nltk.ngrams(tk, n=ngram)))
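# Illustrative sketch (not in the original script): the truncated helper above
# fills a symmetric matrix from a flat array of pairwise values. The per-handle
# n-gram sets built in the main block could feed a pairwise measure such as the
# Jaccard similarity below; the metric is an assumed example, not necessarily
# the one the repo computes.
from itertools import combinations

import numpy as np


def jaccard_matrix(token_sets):
    """Hypothetical example: symmetric Jaccard-similarity matrix over n-gram sets."""
    n = len(token_sets)
    mat = np.eye(n)                        # every set fully overlaps with itself
    for i, j in combinations(range(n), 2):
        a, b = token_sets[i], token_sets[j]
        mat[i][j] = len(a & b) / len(a | b)
        mat[j][i] = mat[i][j]
    return mat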
name_dict = get_name_dict()
color_dict = get_color_dict()

Gcorpus_nodes = np.array([7350, 8872, 7249])
Gcorpus_edges = np.array([81032, 103925, 81362])
Gcore_nodes = np.array([271, 280, 229])
Gcore_edges = np.array([8007, 8672, 6847])
Gtruss_nodes = np.array([59, 63, 63])
Gtruss_edges = np.array([996, 1153, 1249])

# Edge density of each graph: |E| / (0.5 * |V| * (|V| - 1))
Gcorpus_sparcity = np.zeros(3)
Gcore_sparcity = np.zeros(3)
Gtruss_sparcity = np.zeros(3)
for i, handle in enumerate(get_handles()):
    nodes = Gcorpus_nodes[i]
    edges = Gcorpus_edges[i]
    Gcorpus_sparcity[i] = edges / (.5 * nodes * (nodes - 1))

    nodes = Gcore_nodes[i]
    edges = Gcore_edges[i]
    Gcore_sparcity[i] = edges / (.5 * nodes * (nodes - 1))

    nodes = Gtruss_nodes[i]
    edges = Gtruss_edges[i]
    Gtruss_sparcity[i] = edges / (.5 * nodes * (nodes - 1))

plot_Gmeta(Gcorpus_nodes, Gcore_nodes, Gtruss_nodes,
           'Order of dense subgraphs', '# vertices (log)', False)
plot_Gmeta(Gcorpus_edges, Gcore_edges, Gtruss_edges,
           'Size of dense subgraphs', '# edges (log)', False)
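# Illustrative sketch (not in the original script): plot_Gmeta() is defined
# elsewhere in the repo. A grouped, log-scale bar chart consistent with the two
# calls above might look like this; the parameter meanings (title, y-label,
# save flag), the series labels, and the layout are assumptions.
import matplotlib.pyplot as plt
import numpy as np


def plot_Gmeta(corpus_vals, core_vals, truss_vals, title, ylabel, save=False):
    """Hypothetical sketch: one bar group per handle for corpus / core / truss."""
    handles = get_handles()
    x = np.arange(len(handles))
    width = 0.25
    plt.figure(figsize=(6, 4))
    plt.bar(x - width, corpus_vals, width, label='corpus')
    plt.bar(x, core_vals, width, label='core')
    plt.bar(x + width, truss_vals, width, label='truss')
    plt.yscale('log')                      # matches the '(log)' axis labels above
    plt.xticks(x, handles)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    if save:
        plt.savefig(title + '.png')        # assumed save path
    plt.show()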