def word_predict(save_path, i):
    """
    Creates a TSV with scores for every word in the vocabulary.

    :param save_path: path of the saved model
    :param i: section index
    :return:
    """
    print("WORD PREDICTION SECTION:", i, save_path)
    config = DefaultConfig()
    data = dp.Data_Processing(load_tokenizers=True)
    x_train, y_train, x_test, y_test, x_tokenizer, y_tokenizer = data.run(
        train_file="../../data/train_data/train_data.p",
        test_file="../../data/test_data/test_data.p",
        section=i,
        y_mode="vectorize",
        shuffle=False)

    # Collect every unique word in the test set and vectorize it.
    get_counts = Tokenizer()
    get_counts.fit_on_texts(x_test)
    words = [word[0] for word in get_counts.word_index.items()]
    word_vectors = data.pad_sequences(data.vectorize_text(words), max_len=30)

    # Dummy labels: prediction only needs placeholder targets.
    labels = [[0, 1] for _ in range(len(words))]

    num_words = len(x_tokenizer.word_index) + 1
    num_classes = len(y_tokenizer.word_index.items())
    print("LEN X: ", len(word_vectors))

    w2v_layer = data.create_embedding_layer(
        '../../trained_w2v/twitter2vec.w2v')

    validation_batches = generate_instances(word_vectors,
                                            labels,
                                            num_words,
                                            num_classes,
                                            config.max_timesteps,
                                            batch_size=config.batch_size)

    # Run prediction with the saved model.
    predict(config, validation_batches, w2v_layer, save_path, words,
            pred_type="words")

    print("FINISHED:", save_path, "WORD PREDICTION\n\n")
def train(save_path, i, from_saved=False):
    """
    Trains the RNN model and saves it to save_path. Can resume training from a
    saved model.

    :param save_path: path to save the trained model
    :param i: section index
    :param from_saved: resume training from a previously saved model
    :return:
    """
    print("TRAINING SECTION:", i, save_path)
    config = DefaultConfig()
    data = dp.Data_Processing(load_tokenizers=True)
    x_train, y_train, x_test, y_test, x_tokenizer, y_tokenizer = data.run(
        train_file="../../data/train_data/train_data.p",
        test_file="../../data/test_data/test_data.p",
        section=i,
        x_mode="index",
        y_mode="vectorize")

    num_words = len(x_tokenizer.word_index) + 1
    num_classes = len(y_tokenizer.word_index.items())

    # TODO CHECK INDEX LIST
    w2v_layer = data.create_embedding_layer(
        '../../trained_w2v/twitter2vec.w2v')

    # Generate batches
    train_batches = generate_instances(x_train,
                                       y_train,
                                       num_words,
                                       num_classes,
                                       config.max_timesteps,
                                       batch_size=config.batch_size)
    validation_batches = generate_instances(x_test,
                                            y_test,
                                            num_words,
                                            num_classes,
                                            config.max_timesteps,
                                            batch_size=config.batch_size)

    # Train the model
    train_model(config, train_batches, validation_batches, w2v_layer,
                save_path, from_saved=from_saved)

    print("FINISHED:", save_path, "TRAINING\n\n")
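# The three entry points above are typically driven once per quarter. The
# sketch below shows one way to do that; the sections list shown, the
# "saved_models/" save_path layout, and the 1-based section indexing are
# assumptions for illustration, not taken from this module.
if __name__ == "__main__":
    sections = ["2015_1q", "2015_2q", "2015_3q", "2015_4q"]  # illustrative subset
    for idx, name in enumerate(sections, start=1):
        save_path = "saved_models/" + name  # hypothetical output location
        train(save_path, idx)
        tweet_predict(save_path, idx)
        word_predict(save_path, idx)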
def tweet_predict(save_path, i):
    """
    Creates a TSV with scores for all tweets.

    :param save_path: path of the saved model
    :param i: section index
    :return:
    """
    print("TWEET PREDICTION SECTION:", i, save_path)
    config = DefaultConfig()
    data = dp.Data_Processing(load_tokenizers=True)
    x_train, y_train, x_test, y_test, x_tokenizer, y_tokenizer = data.run(
        train_file="../../data/train_data/train_data.p",
        test_file="../../data/test_data/test_data.p",
        section=i,
        x_mode="index",
        y_mode="vectorize",
        shuffle=False)

    # Second pass without x_mode="index" to keep the raw tweet text for the TSV.
    tweets_train, _, tweets_test, _, _, _ = data.run(
        train_file="../../data/train_data/train_data.p",
        test_file="../../data/test_data/test_data.p",
        section=i,
        shuffle=False)

    num_words = len(x_tokenizer.word_index) + 1
    num_classes = len(y_tokenizer.word_index.items())
    print("NUM WORDS: ", num_words, " NUM CLASSES: ", num_classes)

    w2v_layer = data.create_embedding_layer(
        '../../trained_w2v/twitter2vec.w2v')

    # TODO STILL ONLY VALIDATION BATCHES?
    validation_batches = generate_instances(x_test,
                                            y_test,
                                            num_words,
                                            num_classes,
                                            config.max_timesteps,
                                            batch_size=config.batch_size)

    # Run prediction with the saved model.
    predict(config, validation_batches, w2v_layer, save_path, tweets_test)

    print("FINISHED:", save_path, "TWEET PREDICTION\n\n")
def tfidf_analysis(sections, i):
    """
    Writes the 50 highest scoring words from each political party per quarter.

    :param sections: list of section names
    :param i: section index (1-indexed, as used by data.run)
    :return:
    """
    # data.run sections are 1-indexed, so the matching name is sections[i - 1].
    section_name = sections[i - 1]

    with open("tfidf_results/tfidf_scores.tsv", "w+") as results:
        results.write("time\t" + "d_word\t" + "d_score\t" +
                      "r_word\t" + "r_score\t")

    print("WORKING ON SECTION:", i, section_name)
    data = dp.Data_Processing()
    x, y, x_test, y_test, x_tokenizer, y_tokenizer = data.run(
        train_file="../data/train_data/train_data.p",
        test_file="../data/test_data/test_data.p",
        section=i,
        shuffle=False)

    # Use the full data set (train + test) for the TF-IDF scores.
    x = np.append(x, x_test)
    y = np.append(y, y_test)
    x = data.tweet_tokenizer(x)

    # Split the tweets by party.
    r_x = []
    d_x = []
    x_full = []
    for idx, tweet in enumerate(x):
        if y[idx] == "R":
            r_x.append(tweet)
        else:
            d_x.append(tweet)
        x_full.append(tweet)

    t = Tfidf()
    t.build_model(x_full)

    # Accumulate the tf-idf score of every word over all tweets of each party.
    r_tfidf = dict()
    d_tfidf = dict()
    for tweet in r_x:
        counts, length = t.counts(tweet)
        for word in tweet:
            score = t.tf_idf(counts[word], length, t.num_docs,
                             t.token_appearance[word])
            r_tfidf[word] = r_tfidf.get(word, 0) + score
    for tweet in d_x:
        counts, length = t.counts(tweet)
        for word in tweet:
            score = t.tf_idf(counts[word], length, t.num_docs,
                             t.token_appearance[word])
            d_tfidf[word] = d_tfidf.get(word, 0) + score

    # Sort the words of each party by their accumulated score.
    r_words = [item[0] for item in r_tfidf.items()]
    r_scores = [item[1] for item in r_tfidf.items()]
    r_top_words = [(r_words[idx], r_scores[idx])
                   for idx in list(np.argsort(r_scores))][::-1]
    d_words = [item[0] for item in d_tfidf.items()]
    d_scores = [item[1] for item in d_tfidf.items()]
    d_top_words = [(d_words[idx], d_scores[idx])
                   for idx in list(np.argsort(d_scores))][::-1]

    # Write the 50 highest scoring words of both parties.
    with open("tfidf_results/tfidf_scores.tsv", "a") as results:
        for idx in range(50):
            results.write("\n%s\t%s\t%.5f\t%s\t%.5f" %
                          (section_name,
                           d_top_words[idx][0], d_top_words[idx][1],
                           r_top_words[idx][0], r_top_words[idx][1]))

    print(section_name, "FINISHED")
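# The Tfidf helper class is not shown in this file, so its exact weighting is
# unknown. The sketch below assumes the standard tf-idf formulation matching
# the argument order passed to t.tf_idf() above: term frequency within a tweet
# times log inverse document frequency over all tweets.
import math


def tf_idf(count, doc_length, num_docs, token_appearance):
    """Assumed tf-idf weighting: (count / doc_length) * log(num_docs / token_appearance)."""
    tf = count / doc_length                      # how often the word occurs in this tweet
    idf = math.log(num_docs / token_appearance)  # how rare the word is across all tweets
    return tf * idf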
# Load some categories from the training set.
# Writes: coef_d feat_d coef_r feat_r section model acc_d acc_r acc_all recall f1 support
with open("svm_top_features.tsv", "w", encoding="utf-8") as tsv:
    tsv.write("coef_d\t" + "feat_d\t" + "coef_r\t" + "feat_r\t" + "section\t" +
              "model\t" + "acc_d\t" + "acc_r\t" + "acc_all\t" + "recall\t" +
              "f1\t" + "support")

sections = [
    "2015_1q", "2015_2q", "2015_3q", "2015_4q",
    "2016_1q", "2016_2q", "2016_3q", "2016_4q",
    "2017_1q", "2017_2q", "2017_2q_REDUCED", "2017_2q_REDUCED"
]

data = dp.Data_Processing(load_tokenizers=True)
for i, section in enumerate(sections):
    print("\nWORKING ON SECTION: ", section, "\n")
    data_train, y_train, data_test, y_test, _, _ = data.run(
        train_file="../data/train_data/train_data.p",
        test_file="../data/test_data/test_data.p",
        section=i + 1)

    target_names = list(set(y_train))

    print("Extracting features from the training data using a sparse vectorizer")
    vectorizer = TfidfVectorizer(sublinear_tf=True,
                                 max_df=0.5,
                                 stop_words='english')
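# The snippet above is truncated after the vectorizer, so the classifier and
# the way the coef_d/feat_d/coef_r/feat_r columns are filled are not shown.
# The sketch below is one plausible continuation using scikit-learn's
# LinearSVC; the classifier choice and the top-10 cutoff are assumptions.
from sklearn.svm import LinearSVC
import numpy as np

X_train = vectorizer.fit_transform(data_train)
X_test = vectorizer.transform(data_test)

clf = LinearSVC()
clf.fit(X_train, y_train)

# For a binary problem coef_ has shape (1, n_features): the most negative
# weights push towards clf.classes_[0] ("D"), the most positive towards
# clf.classes_[1] ("R").
feature_names = np.asarray(vectorizer.get_feature_names_out())  # get_feature_names() on older scikit-learn
order = np.argsort(clf.coef_[0])
d_top = list(zip(clf.coef_[0][order[:10]], feature_names[order[:10]]))
r_top = list(zip(clf.coef_[0][order[-10:]][::-1], feature_names[order[-10:]][::-1]))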