def likelihood_and_perplexity(save_path, articles, log_likelihood):
    num_words = sum(utils.get_word_counts(articles).values())  # total number of word tokens in the corpus
    log_likelihood = np.asarray(log_likelihood)
    x_axis = np.asarray(list(range(len(log_likelihood))))

    # plot log likelihood
    path = save_path + "log_likelihood.png"
    plt.subplots_adjust(left=0.2)
    plt.plot(x_axis, log_likelihood)
    plt.title('Log-Likelihood (max value ' + str(round(log_likelihood[-1], 2)) + ")")
    plt.ylabel('Log Likelihood')
    plt.xlabel('Iteration')
    plt.savefig(path)
    plt.close()

    # plot the perplexity
    path = save_path + "perplexity.png"
    perplexity = np.e ** ((-1 / float(num_words)) * log_likelihood)
    print("Best Perplexity: " + str(perplexity[-1]))
    plt.title('Perplexity (min value ' + str(round(perplexity[-1], 2)) + ")")
    plt.plot(x_axis, perplexity)
    plt.ylabel('Perplexity')
    plt.xlabel('Iteration')
    plt.savefig(path)
    plt.close()
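# Per-word perplexity is exp(-LL / num_words), so it decreases monotonically as the
# log-likelihood rises. A minimal, standalone numeric check (the trace values below
# are hypothetical, not from the project):
import numpy as np

log_likelihood = np.array([-120000.0, -115000.0, -112500.0])  # hypothetical EM trace
num_words = 50000
perplexity = np.e ** ((-1 / float(num_words)) * log_likelihood)
print(perplexity)  # strictly decreasing: higher likelihood -> lower perplexity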
def EM(articles):
    N = len(articles)  # number of documents
    V = len(utils.get_word_counts(articles).keys())  # vocabulary size
    n = utils.words_per_doc_vec(articles)  # shape NxV; cell n_tk is the count of word k in document t
    likelihood_vals = []

    # initial weights - each example belongs to 1 cluster only
    w = np.zeros((N, num_classes))
    for idx in range(N):
        w[idx, idx % num_classes] = 1

    # NOTE: `k` (cluster count) and `const` (convergence divisor) are assumed to be
    # module-level constants, like `num_classes` above
    P, alpha = M_step(N, V, n, w)
    likelihood = log_likelihood(N, n, k, P, alpha)
    likelihood_vals.append(likelihood)

    # run EM until convergence
    eps = abs(likelihood / const)
    delta = abs(likelihood)
    iteration = 0
    print("log-likelihood iteration " + str(iteration) + ": " + str(likelihood))
    while delta > eps:
        w = E_step(N, n, k, P, alpha)
        P, alpha = M_step(N, V, n, w)
        likelihood = log_likelihood(N, n, k, P, alpha)
        likelihood_vals.append(likelihood)
        delta = likelihood_vals[-1] - likelihood_vals[-2]
        iteration += 1
        print("log-likelihood iteration " + str(iteration) + ": " + str(likelihood))
    return likelihood_vals, w
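# The E_step / M_step / log_likelihood helpers are not shown above. A minimal sketch of
# what they plausibly do for a mixture-of-multinomials clusterer; the Lidstone smoothing
# constant `lam` and the exact formulas are assumptions, not the original code:
import numpy as np

def E_step(N, n, k, P, alpha):
    # responsibilities w_ti proportional to alpha_i * prod_k P_ik^{n_tk}, in log space
    z = np.log(alpha)[None, :] + n @ np.log(P).T   # shape (N, k)
    z -= z.max(axis=1, keepdims=True)              # stabilize before exponentiating
    w = np.exp(z)
    return w / w.sum(axis=1, keepdims=True)

def M_step(N, V, n, w, lam=0.1):
    alpha = w.sum(axis=0) / N                      # cluster priors
    counts = w.T @ n                               # expected word counts per cluster, shape (k, V)
    P = (counts + lam) / (counts.sum(axis=1, keepdims=True) + lam * V)
    return P, alpha

def log_likelihood(N, n, k, P, alpha):
    # sum over documents of log-sum-exp over clusters (N and k kept to match the call sites)
    z = np.log(alpha)[None, :] + n @ np.log(P).T
    m = z.max(axis=1, keepdims=True)
    return float((m + np.log(np.exp(z - m).sum(axis=1, keepdims=True))).sum())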
def replace_infrequent_words(in_file, out_file, count_thresh=5, symbol="_RARE_"):
    """
    Replace words with frequency < count_thresh in in_file by symbol
    and store in out_file.
    """
    # get frequency of each word in in_file
    word_count_dict = get_word_counts(in_file)
    out_lines_list = []
    in_file.seek(0)
    l = in_file.readline()
    while l:
        line = l.strip()
        if line:  # nonempty line
            fields = line.split(" ")
            word = " ".join(fields[:-1])
            # replace word with symbol if frequency < count_thresh
            if word_count_dict[word] < count_thresh:
                line = " ".join([symbol, fields[-1]])
        out_lines_list.append(line)
        l = in_file.readline()
    out_lines = "\n".join(out_lines_list)
    out_file.write(out_lines)
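# `get_word_counts` is defined elsewhere in this module; a plausible minimal version
# for this "word tag"-per-line format (an assumption, not the original implementation):
from collections import defaultdict

def get_word_counts(in_file):
    counts = defaultdict(int)
    for raw in in_file:
        line = raw.strip()
        if line:
            fields = line.split(" ")
            counts[" ".join(fields[:-1])] += 1  # everything before the trailing tag is the word
    return counts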
def get_wordvectors():
    print("wordvectors read ...")
    word_dict = utils.get_word_counts("../resources/yelp/data/yelp_restaurant_word_counts.txt")
    stop_words = utils.get_stopwords()
    vw_model = utils.get_word2vec_model('../resources/yelp/word2vec/yelp_restaurants_word2vector', ncols, nwin)
    vw_model.syn0 = utils.normalize2(vw_model.syn0)
    glove_dict = utils.get_glove_data('../resources/yelp/glove/', 'vectors_' + str(ncols) + '.txt')
    return vw_model, word_dict, stop_words, glove_dict
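# `utils.normalize2` is not shown; judging by the name and usage, a plausible sketch is
# row-wise L2 normalization of the embedding matrix (an assumption, not the original code):
import numpy as np

def normalize2(m):
    norms = np.linalg.norm(m, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero for all-zero rows
    return m / norms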
def main():
    vw_model = utils.get_word2vec_model('../resources/yelp/word2vec/yelp_restaurants_word2vector', ncols, nwin)
    vw_model.vectors = utils.normalize2(vw_model.vectors)
    glove_dict = utils.get_glove_data('../resources/yelp/glove/', 'vectors_' + str(ncols) + '.txt')
    word_dict = utils.get_word_counts("../resources/yelp/data/yelp_restaurant_word_counts.txt")
    # train_list, test_list = get_list()
    # train_list = utils.get_list('../resources/yelp/train_data50000.txt')
    # test_list = utils.get_list('../resources/yelp/test_data50000.txt')
    max_rlen = get_max_number_of_token()
    model = my_model(max_rlen)
    print("#################### Iterations ################\n")
    results = []
    for j in range(1):
        acc_hist = {}
        train_list = utils.get_list('../resources/yelp/cv_train_data_' + str(j) + '.txt')
        # the original read cv_train_data_ here as well, which looks like a copy-paste
        # slip; the held-out fold is presumably cv_test_data_
        test_list = utils.get_list('../resources/yelp/cv_test_data_' + str(j) + '.txt')
        half = len(test_list) // 2  # test file is assumed ordered: positives first, then negatives
        for e in range(nepochs):
            for i in range(int(len(train_list) / nbatch)):
                x_train, y_train = get_review_windows(
                    vw_model, train_list[nbatch * i:nbatch * (i + 1)],
                    max_rlen, ncols, nbatch, glove_dict, word_dict)
                (loss, acc) = model.train_on_batch(x_train, y_train)
                print("Train: Epoch:" + str(e + 1) + " Loss = " + str(loss) + " -- " + "Accuracy = " + str(acc))
                acc_hist[str(e + 1)] = acc
        with open("../resources/yelp/scores/CNN-VALID_" + str(j), 'w') as out:
            counter = 0
            score = 0
            for i in range(int(len(test_list) / nbatch)):
                x_test, y_test = get_review_windows(
                    vw_model, test_list[nbatch * i:nbatch * (i + 1)],
                    max_rlen, ncols, nbatch, glove_dict, word_dict)
                loss = model.test_on_batch(x_test, y_test)
                pred = model.predict_proba(x_test)
                classes = model.predict_classes(x_test)
                counter += nbatch
                for p, c in zip(pred, classes):
                    # count a hit when the predicted class matches the half of the
                    # file the example came from (first half positive, second half negative)
                    if counter <= half and c == 1:
                        score += 1
                    elif counter > half and c == 0:
                        score += 1
                    out.write(str(c) + " " + str(p[1]) + " " + str(p[0]) + "\n")
                print("Test: Iteration:" + str(i + 1) + " Loss = " + str(loss))
        results.append(score)
        print("######################## Trial = " + str(j))
    acc = []
    for result in results:
        acc.append(result)
        print(result)
    np_acc = np.array(acc)
    print("Mean = " + str(np_acc.mean()))
    print("Std.Dev = " + str(np.std(np_acc, dtype=np.float64)))
def get_indices_from_word_count(lyrics, limits=(5, 1000)):
    remove_indices = []
    print("Filtering based on {} < word count < {}...".format(limits[0], limits[1]))
    word_counts = get_word_counts(lyrics)
    for i in pbar.progressbar(range(len(lyrics))):
        if word_counts[i] > limits[1] or word_counts[i] < limits[0]:
            remove_indices.append(i)
    print("Songs filtered based on word count: {}".format(len(remove_indices)))
    return remove_indices
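# Typical usage (assumes `get_word_counts` returns one count per song, in order, as the
# indexing above implies, and that `pbar` is progressbar2 imported as `import progressbar as pbar`):
lyrics = ["short song", "a " * 500, "another reasonably sized lyric " * 4]
bad = get_indices_from_word_count(lyrics, limits=(5, 1000))
filtered = [song for i, song in enumerate(lyrics) if i not in bad]  # drops the 2-word song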
def get_review_windows(model, reviews, max_rlen, ncols, nsen, glove_dict):
    word_dict = utils.get_word_counts("../resources/semeval/data/word_counts.txt", 6, 2, 1)
    x = np.zeros(shape=(nsen, max_rlen, gm * ncols))
    y = np.zeros(shape=(nsen, num_classes))
    for i, review in enumerate(reviews):
        try:
            # x[i] = utils.get_token_matrix(model, review[0], max_rlen, ncols, glove_dict, gm)
            x[i] = utils.get_token_matrix_weight(model, review[0], max_rlen, ncols, glove_dict, gm, word_dict)
        except IndexError as e:
            print(e)
        y[i] = review[1]
    x = x.reshape(x.shape[0], max_rlen, gm * ncols, 1)
    x = x.astype('float32')
    # y = keras_test.utils.to_categorical(y, num_classes)
    return x, y
def baseline_tagger(counts_file, dev_file, rare_symbol="_RARE_"):
    """
    Implements a baseline tagger that uses only the emission probabilities
    to assign tags and stores the result in a file.
    """
    # get frequently occurring words (set for O(1) membership tests)
    word_count_dict = get_word_counts(open('ner_train.dat'))
    freq_words = set(word for word in word_count_dict if word_count_dict[word] >= 5)

    # compute emission probs
    counter = Hmm(3)
    counter.read_counts(counts_file)
    emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0])

    out_lines_list = []
    l = dev_file.readline()
    while l:
        word = l.strip()
        if word:  # nonempty line
            # use emission probabilities of rare_symbol to assign a tag and its
            # probability for rare or unseen words
            if word not in freq_words:
                tag = sorted(emission_probs[rare_symbol], key=emission_probs[rare_symbol].get, reverse=True)[0]
                prob = emission_probs[rare_symbol][tag]
            # use emission probabilities of the word itself for frequent words
            else:
                tag = sorted(emission_probs[word], key=emission_probs[word].get, reverse=True)[0]
                prob = emission_probs[word][tag]
            log_prob = math.log(prob, 2)
            l = word + " " + tag + " " + str(log_prob)
        else:
            l = ""
        out_lines_list.append(l)
        l = dev_file.readline()
    out_lines = "\n".join(out_lines_list) + "\n"

    # write words, corresponding tags and log probs to file
    with open('4_2.txt', 'w') as out_file:
        out_file.write(out_lines)
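# `compute_emission_probs` comes from the surrounding HMM assignment code. A plausible
# minimal version, given how it is indexed above (emission_probs[word][tag]); the exact
# key shapes of the count dictionaries are assumptions:
from collections import defaultdict

def compute_emission_probs(emission_counts, tag_counts):
    # emission_counts: (word, tag) -> count; tag_counts: tag -> count
    # e(word | tag) = Count(tag -> word) / Count(tag)
    emission_probs = defaultdict(dict)
    for (word, tag), count in emission_counts.items():
        emission_probs[word][tag] = count / float(tag_counts[tag])
    return emission_probs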
def replace_infrequent_words_with_categories(in_file, out_file, count_thresh=5):
    """
    Replace words with frequency < count_thresh in in_file by their category
    and store in out_file.
    """
    # get frequency of each word in in_file
    word_count_dict = get_word_counts(in_file)
    out_lines_list = []
    in_file.seek(0)
    l = in_file.readline()
    while l:
        line = l.strip()
        if line:  # nonempty line
            fields = line.split(" ")
            word = " ".join(fields[:-1])
            # replace word with its category if frequency < count_thresh
            if word_count_dict[word] < count_thresh:
                line = " ".join([get_category(word), fields[-1]])
        out_lines_list.append(line)
        l = in_file.readline()
    out_lines = "\n".join(out_lines_list)
    out_file.write(out_lines)
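# `get_category` is defined elsewhere; a common choice for this kind of NER exercise is
# the four-way split below (numeric / all-caps / trailing-capital / other). The category
# names and rules here are assumptions, not the original function:
def get_category(word):
    if any(ch.isdigit() for ch in word):
        return "_NUMERIC_"
    if word.isalpha() and word.isupper():
        return "_ALLCAPS_"
    if word and word[-1].isupper():
        return "_LASTCAP_"
    return "_RARE_"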
def count_blocks(blocks, word_frequencies):
    block_frequencies = defaultdict(int)
    for word, word_blocks in blocks.items():  # renamed loop variable to avoid shadowing `blocks`
        frequency = word_frequencies[word]
        for block in word_blocks:
            block_frequencies[block] += frequency
    return block_frequencies


if __name__ == '__main__':
    import os
    word_counts = get_word_counts(os.path.join('word_lists', 'filtered_word_counts.txt'))
    words = word_counts.keys()
    bigrams = get_bigram_frequencies(word_counts)
    bigram_dict = get_bigram_dictionary(word_counts)
    # show_in_order(bigrams)
    # total_bigrams = sum(count for count in bigrams.values())
    # print(total_bigrams)
    # for bigram, count in sorted(bigrams.items(), key=lambda item: item[1])[:10]:
    #     print(bigram, count, find_words_containing_substring(words, bigram))

    # Top 40 bigrams
    # top_bigrams = [item for item, count in sorted(bigrams.items(), key=lambda item: item[1])[:40]]
    # letter_counts = Counter("".join(top_bigrams))
    # print(letter_counts)
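# `get_bigram_frequencies` is not shown; a plausible minimal version that weights each
# adjacent letter pair by the word's corpus frequency (an assumption based on the usage above):
from collections import defaultdict

def get_bigram_frequencies(word_counts):
    bigrams = defaultdict(int)
    for word, count in word_counts.items():
        for a, b in zip(word, word[1:]):
            bigrams[a + b] += count  # each adjacent letter pair inherits the word's frequency
    return bigrams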
word = " ".join(fields[:-1]) # replace word with its category if frequency < count_thresh if word_count_dict[word] < count_thresh: line = " ".join([get_category(word), fields[-1]]) out_lines_list.append(line) l = in_file.readline() out_lines = "\n".join(out_lines_list) out_file.write(out_lines) if __name__ == "__main__": # replace infrequent words with categories and write to file replace_infrequent_words_with_categories(file('ner_train.dat'), file('ner_train_cats.dat', 'w')) # generate counts file os.system('python count_freqs.py ner_train_cats.dat > ner_cats.counts') # get frequent words word_count_dict = get_word_counts(file('ner_train.dat')) freq_words = [word for word in word_count_dict if word_count_dict[word] >= 5] # get transition and emission probabilities counter = Hmm(3) counter.read_counts(file('ner_cats.counts')) transition_probs = compute_transition_probs(counter.ngram_counts[1], counter.ngram_counts[2]) emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0]) # store tagged data with the log probs to file tagger(file('ner_dev.dat'), transition_probs, emission_probs, freq_words) os.system('python eval_ne_tagger.py ner_dev.key 6.txt')
def main():
    vw_model = utils.get_word2vec_model('../resources/imdb/data/alldata_word2vector', ncols, nwin)
    vw_model.syn0 = utils.normalize2(vw_model.syn0)
    glove_dict = utils.get_glove_data('../resources/imdb/glove', 'vectors_300_5.txt')
    word_dict = utils.get_word_counts("../resources/imdb/data/word_counts.txt", 1, 1, 2)
    # train_list = utils.get_shuffle_list('../resources/imdb/data/full-train-pos.txt',
    #                                     '../resources/imdb/data/full-train-neg.txt', True)
    # test_list = utils.get_shuffle_list('../resources/imdb/data/test-pos.txt',
    #                                    '../resources/imdb/data/test-neg.txt', False)
    train_list = utils.get_shuffle_list('../resources/imdb/data/small-train-pos.txt',
                                        '../resources/imdb/data/small-train-neg.txt', True)
    test_list = utils.get_shuffle_list('../resources/imdb/data/valid-pos.txt',
                                       '../resources/imdb/data/valid-neg.txt', False)
    max_rlen = get_max_number_of_token()
    model = my_model(max_rlen)
    half = len(test_list) // 2  # test list is not shuffled: positives first, then negatives
    results = []
    for j in range(30):
        acc_hist = {}
        for e in range(nepochs):
            for i in range(int(len(train_list) / nbatch)):
                x_train, y_train = get_review_windows(
                    vw_model, train_list[nbatch * i:nbatch * (i + 1)],
                    max_rlen, ncols, nbatch, glove_dict, word_dict)
                (loss, acc) = model.train_on_batch(x_train, y_train)
                print("Train: Epoch:" + str(e + 1) + " Loss = " + str(loss) + " -- " + "Accuracy = " + str(acc))
                acc_hist[str(e + 1)] = acc
        with open("../resources/imdb/scores/CNN-VALID_" + str(j), 'w') as out:
            counter = 0
            score = 0
            for i in range(int(len(test_list) / nbatch)):
                x_test, y_test = get_review_windows(
                    vw_model, test_list[nbatch * i:nbatch * (i + 1)],
                    max_rlen, ncols, nbatch, glove_dict, word_dict)
                loss = model.test_on_batch(x_test, y_test)
                pred = model.predict_proba(x_test)
                classes = model.predict_classes(x_test)
                counter += nbatch
                for p, c in zip(pred, classes):
                    # count a hit when the predicted class matches the half of the
                    # file the example came from (first half positive, second half negative)
                    if counter <= half and c == 1:
                        score += 1
                    elif counter > half and c == 0:
                        score += 1
                    out.write(str(c) + " " + str(p[1]) + " " + str(p[0]) + "\n")
                print("Test: Iteration:" + str(i + 1) + " Loss = " + str(loss))
        results.append(score)
        print("######################## Trial = " + str(j))
    acc = []
    for result in results:
        acc.append(result)
        print(result)
    np_acc = np.array(acc)
    print("Mean = " + str(np_acc.mean()))
    print("Std.Dev = " + str(np.std(np_acc, dtype=np.float64)))
def main(*args):
    plot = const.PLOT_DEFAULT
    print_ = const.PRINT_DEFAULT
    even_distrib = const.EVEN_DISTRIB_DEFAULT
    plt.rcParams.update({'font.size': const.FONT_SIZE_DEFAULT})

    # parse command line arguments
    for arg in args:
        k = arg.split("=")[0]
        v = arg.split("=")[1]
        if k == 'plot':
            plot = utils.str_to_bool(v)
        elif k == 'print':
            print_ = utils.str_to_bool(v)
        elif k == 'font_size':
            plt.rcParams.update({'font.size': int(v)})
        elif k == 'even_distrib':
            even_distrib = utils.str_to_bool(v)
    if print_:
        print()
        print("--- Stats config ---")
        print("Even distribution dataset: {}".format(even_distrib))
        print("Plot: {}".format(plot))
        print("--------------------")
        print()

    # load data
    gen_spotify_df = pd.read_csv(const.GEN_SPOTIFY)
    clean_spotify_df = pd.read_csv(const.CLEAN_SPOTIFY)
    if not even_distrib:
        clean_spotify_df = pd.read_csv(const.CLEAN_UNEVEN_SPOTIFY)
    if print_:
        print("Spotify missing per col: \n{}".format(clean_spotify_df.isna().sum()))
        print("Spotify unclean shape: {}".format(gen_spotify_df.shape))
        print("Spotify shape: {}".format(clean_spotify_df.shape))
        print()
    gen_deezer_df = pd.read_csv(const.GEN_DEEZER)
    clean_deezer_df = pd.read_csv(const.CLEAN_DEEZER)
    if not even_distrib:
        clean_deezer_df = pd.read_csv(const.CLEAN_UNEVEN_DEEZER)
    if print_:
        print("Deezer missing per col: \n{}".format(clean_deezer_df.isna().sum()))
        print("Deezer unclean shape: {}".format(gen_deezer_df.shape))
        print("Deezer shape: {}".format(clean_deezer_df.shape))
        print()

    # get info on datasets
    clean_spotify_wc = get_word_counts(clean_spotify_df.lyrics.values, print_=print_)
    clean_spotify_uc = get_unique_counts(clean_spotify_df.lyrics.values, print_=print_)
    spotify_class_distrib = get_emotion_counts(clean_spotify_df, print_=print_)
    clean_deezer_wc = get_word_counts(clean_deezer_df.lyrics.values, print_=print_)
    clean_deezer_uc = get_unique_counts(clean_deezer_df.lyrics.values, print_=print_)
    deezer_class_distrib = get_emotion_counts(clean_deezer_df, print_=print_)

    # word count hist
    plot_hist("Dataset Word Count",
              clean_spotify_wc, const.SPOTIFY,
              clean_deezer_wc, const.DEEZER,
              a1=0.4, a2=0.4,
              xlabel="# of Songs", ylabel="Word Count")

    # unique word count hist
    plot_hist("Dataset Unique Words Count",
              clean_spotify_uc, const.SPOTIFY,
              clean_deezer_uc, const.DEEZER,
              a1=0.4, a2=0.4,
              ylabel="Unique Word Count", xlabel="# of Songs")

    # class distrib scatter plot
    plot_val_arousal_scatter(
        "Spotify: Valence-Arousal Distribution",
        clean_spotify_df.valence.values,
        clean_spotify_df.arousal.values,
        gen_spotify_df.valence.values,
        gen_spotify_df.arousal.values,
    )
    plot_val_arousal_scatter(
        "Deezer: Valence-Arousal Distribution",
        clean_deezer_df.valence.values,
        clean_deezer_df.arousal.values,
        gen_deezer_df.valence.values,
        gen_deezer_df.arousal.values,
    )

    datasets = [
        (const.SPOTIFY, clean_spotify_df),
        (const.DEEZER, clean_deezer_df),
    ]
    for name, dataset in datasets:
        for i in dataset.y.unique():
            class_df = utils.get_class_based_data(
                dataset,
                i,
                random_state=const.RANDOM_STATE_DEFAULT,
                include_other_classes=True,
                even_distrib=False,
                limit_size=False,
                print_=True)
            print("{} Class {} data shape: {}".format(name, i, class_df.shape))
            print("{} Class {} data mean valence-arousal: {}".format(
                name, i, (class_df.valence.mean(), class_df.arousal.mean())))
            plot_val_arousal_scatter(
                "{}: Class {} Data Valence-Arousal Distribution".format(name, i),
                class_df.valence.values,
                class_df.arousal.values)

    # class distrib hist
    plt.figure()
    x = np.array([i + 1 for i in range(len(clean_spotify_df.y.unique()))])
    plt.title("Dataset Class Distribution")
    plt.bar(x - 0.125,
            get_y(clean_spotify_df),
            width=0.25,
            align='center',
            label=const.SPOTIFY)
    plt.bar(x + 0.125,
            get_y(clean_deezer_df),
            width=0.25,
            align='center',
            label=const.DEEZER)
    plt.xticks(x, labels=["Happy", "Angry", "Sad", "Relaxed"])
    plt.legend()
    if plot:
        plt.draw()
        plt.show()
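# `plot_hist` is a project helper; a minimal sketch matching the call signature used in
# main above (the overlay behavior and draw-only semantics are assumptions, not the original):
import matplotlib.pyplot as plt

def plot_hist(title, data1, label1, data2, label2, a1=0.5, a2=0.5, xlabel="", ylabel=""):
    plt.figure()
    plt.title(title)
    plt.hist(data1, alpha=a1, label=label1)  # overlay the two datasets with transparency
    plt.hist(data2, alpha=a2, label=label2)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()
    plt.draw()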