Example #1
import numpy as np                # imports assumed by this snippet
import matplotlib.pyplot as plt
import utils                      # project-local helper module


def likelihood_and_perplexity(save_path, articles, log_likelihood):

    num_words = sum(
        utils.get_word_counts(articles).values())  # total number of word tokens
    log_likelihood = np.asarray(log_likelihood)
    x_axis = np.asarray(list(range(len(log_likelihood))))

    # plot log likelihood
    path = save_path + "log_likelihood.png"
    plt.subplots_adjust(left=0.2)
    plt.plot(x_axis, log_likelihood)
    plt.title('Log-Likelihood (max value ' +
              str(round(log_likelihood[-1], 2)) + ")")
    plt.ylabel('Log Likelihood')
    plt.xlabel('Iteration')
    plt.savefig(path)
    plt.close()

    # plot the perplexity
    path = save_path + "perplexity.png"
    perplexity = np.e**((-1 / float(num_words)) * log_likelihood)
    print("Best Perplexity: " + str(perplexity[-1]))
    plt.title('Perplexity (min value ' + str(round(perplexity[-1], 2)) + ")")
    plt.plot(x_axis, perplexity)
    plt.ylabel('Perplexity')
    plt.xlabel('Iteration')
    plt.savefig(path)
    plt.close()
def EM(articles):

    N = len(articles)  # number of documents
    V = len(utils.get_word_counts(articles).keys())  # vocabulary size
    n = utils.words_per_doc_vec(articles)  # N x V matrix; cell (t, k) is the count of word k in document t
    likelihood_vals = []

    # initial weights - each example belongs to one cluster only
    w = np.zeros((N, num_classes))
    for idx in range(N):
        w[idx, idx % num_classes] = 1

    P, alpha = M_step(N, V, n, w)
    likelihood = log_likelihood(N, n, k, P, alpha)
    likelihood_vals.append(likelihood)

    # do the EM until convergence
    eps = abs(likelihood / const)
    delta = abs(likelihood)
    iter = 0
    print("log-likelihood " + "iteration " + str(iter) + ": " +
          str(likelihood))
    while delta > eps:
        w = E_step(N, n, k, P, alpha)
        P, alpha = M_step(N, V, n, w)
        likelihood = log_likelihood(N, n, k, P, alpha)
        likelihood_vals.append(likelihood)
        delta = likelihood_vals[-1] - likelihood_vals[-2]
        iter += 1
        print("log-likelihood " + "iteration " + str(iter) + ": " +
              str(likelihood))

    return likelihood_vals, w
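
The EM example above calls helper functions (E_step, M_step, log_likelihood) and module-level constants (num_classes, k, const) that live elsewhere in its source file. The sketch below is one plausible shape for those pieces, assuming a mixture-of-multinomials model with Lidstone smoothing; it is an illustration only, not the original project's implementation.

import numpy as np

# Illustrative module-level constants (assumed, not from the original source)
k = num_classes = 4        # number of clusters
const = 1e6                # scale used for the convergence threshold eps in EM()
LAMBDA = 0.1               # Lidstone smoothing parameter


def M_step(N, V, n, w):
    # cluster priors alpha[j] and per-cluster word distributions P[j, t]
    alpha = np.clip(w.sum(axis=0) / N, 1e-6, None)
    alpha /= alpha.sum()
    doc_lens = n.sum(axis=1)
    P = (w.T @ n + LAMBDA) / (w.T @ doc_lens + LAMBDA * V)[:, None]
    return P, alpha


def E_step(N, n, k, P, alpha):
    # responsibilities w[i, j] = p(cluster j | document i)
    z = n @ np.log(P).T + np.log(alpha)
    z -= z.max(axis=1, keepdims=True)   # guard against underflow
    w = np.exp(z)
    return w / w.sum(axis=1, keepdims=True)


def log_likelihood(N, n, k, P, alpha):
    # log p(corpus) = sum_i log sum_j alpha[j] * prod_t P[j, t] ** n[i, t]
    z = n @ np.log(P).T + np.log(alpha)
    m = z.max(axis=1, keepdims=True)
    return float((m + np.log(np.exp(z - m).sum(axis=1, keepdims=True))).sum())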
Example #3
def replace_infrequent_words(in_file,
                             out_file,
                             count_thresh=5,
                             symbol="_RARE_"):
    """
	Replace words with frequency < count_thresh in in_file by symbol and store in out_file.
	"""

    # get frequency of each word in in_file
    word_count_dict = get_word_counts(in_file)

    out_lines_list = []
    in_file.seek(0)
    l = in_file.readline()
    while l:
        line = l.strip()
        if line:  # Nonempty line
            fields = line.split(" ")
            word = " ".join(fields[:-1])

            # replace word with symbol if frequency < count_thresh
            if word_count_dict[word] < count_thresh:
                line = " ".join([symbol, fields[-1]])
        out_lines_list.append(line)
        l = in_file.readline()
    out_lines = "\n".join(out_lines_list)
    out_file.write(out_lines)
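
As a usage note, replace_infrequent_words expects open file objects over a "word TAG" per-line training file; a hypothetical call (the output file name is an assumption) could look like:

with open('ner_train.dat') as in_file, open('ner_train_rare.dat', 'w') as out_file:
    replace_infrequent_words(in_file, out_file, count_thresh=5, symbol="_RARE_")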
Example #4
def get_wordvectors():
    print("wordvectors read ...")
    word_dict = utils.get_word_counts("/../resources/yelp/data/yelp_restaurant_word_counts.txt")
    stop_words = utils.get_stopwords()
    vw_model = utils.get_word2vec_model('../resources/yelp/word2vec/yelp_restaurants_word2vector', ncols, nwin)
    vw_model.syn0 = utils.normalize2(vw_model.syn0)
    glove_dict = utils.get_glove_data('../resources/yelp/glove/','vectors_'+str(ncols)+'.txt')
    return vw_model, word_dict, stop_words, glove_dict
Example #5
def main():
    vw_model = utils.get_word2vec_model('../resources/yelp/word2vec/yelp_restaurants_word2vector', ncols, nwin)
    vw_model.vectors = utils.normalize2(vw_model.vectors)
    glove_dict = utils.get_glove_data('../resources/yelp/glove/', 'vectors_' + str(ncols) + '.txt')
    word_dict = utils.get_word_counts("../resources/yelp/data/yelp_restaurant_word_counts.txt")
    # train_list, test_list = get_list()
    # train_list = utils.get_list('../resources/yelp/train_data50000.txt')
    # test_list = utils.get_list('../resources/yelp/test_data50000.txt')

    max_rlen = get_max_number_of_token()
    model = my_model(max_rlen)

    print("#################### Iterations ################\n")
    results = []
    for j in range(1):
        acc_hist = {}
        train_list = utils.get_list('../resources/yelp/cv_train_data_'+str(j) +'.txt')
        test_list = utils.get_list('../resources/yelp/cv_train_data_'+str(j)+'.txt')
        half = len(test_list) // 2
        for e in range(nepochs):
            for i in range(int(len(train_list)/nbatch)):
                x_train, y_train = get_review_windows(vw_model, train_list[nbatch*i:nbatch*(i+1)], max_rlen, ncols, nbatch, glove_dict, word_dict)
                (loss,acc) = model.train_on_batch(x_train, y_train)

            print("Train: Epoch:" + str(e+1) + " Loss = " + str(loss) + " -- " + "Accuracy = " + str(acc))
            acc_hist[str(e+1)] = acc

        with open("../resources/yelp/scores/CNN-VALID_"+str(j), 'w') as out:
            counter = 0
            score = 0
            for i in range(int(len(test_list)/nbatch)):
                x_test, y_test = get_review_windows(vw_model, test_list[nbatch*i:nbatch*(i+1)], max_rlen, ncols, nbatch, glove_dict, word_dict)
                loss = model.test_on_batch(x_test, y_test)
                pred = model.predict_proba(x_test)
                classes = model.predict_classes(x_test)
                counter += nbatch
                for p, c in zip(pred, classes):
                    # score:
                    if counter <= half and c == 1:
                        score += 1
                    elif counter > half and c == 0:
                        score += 1
                    out.write(str(c) + " " + str(p[1]) + " " + str(p[0]) + "\n")

                print("Test: Iteration:" + str(i + 1) + " Loss = " + str(loss))
            results.append(score)
        print("######################## Trial = " + str(j))

    acc = []
    for result in results:
        acc.append(result)
        print(result)

    np_acc = np.array(acc)
    print("Mean = " + str(np_acc.mean()))
    print("Std.Dev = " + str(np.std(np_acc, dtype=np.float64)))
Example #6
File: preproc.py Project: rlzh/mec
def get_indices_from_word_count(lyrics, limits=(5, 1000)):
    remove_indices = []
    print("Filtering based on {} < word count < {}...".format(
        limits[0], limits[1]))
    word_counts = get_word_counts(lyrics)
    for i in pbar.progressbar(range(len(lyrics))):
        if word_counts[i] > limits[1] or word_counts[i] < limits[0]:
            remove_indices.append(i)
    print("Songs filtered based on word count: {}".format(len(remove_indices)))
    return remove_indices
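
A short hypothetical usage of the filter above, assuming lyrics is a list of song lyric strings:

remove_indices = set(get_indices_from_word_count(lyrics, limits=(5, 1000)))
filtered_lyrics = [song for i, song in enumerate(lyrics) if i not in remove_indices]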
Example #7
def get_review_windows(model, reviews, max_rlen, ncols, nsen, glove_dict):
    word_dict = utils.get_word_counts("../resources/semeval/data/word_counts.txt",6,2,1)
    x = np.zeros(shape=(nsen, max_rlen, gm*ncols))
    y = np.zeros(shape=(nsen,num_classes))

    for i,review in enumerate(reviews):
        try:
            #x[i] = utils.get_token_matrix(model, review[0], max_rlen, ncols, glove_dict, gm)
            x[i] = utils.get_token_matrix_weight(model, review[0], max_rlen, ncols, glove_dict, gm, word_dict)
        except IndexError as e:
            print(e)
        y[i] = review[1]

    x = x.reshape(x.shape[0], max_rlen, gm*ncols, 1)
    x = x.astype('float32')

    #y = keras_test.utils.to_categorical(y, num_classes)
    return x,y
Example #8
def baseline_tagger(counts_file, dev_file, rare_symbol="_RARE_"):
	"""
	Implements a baseline tagger that uses only the emission probabilities to assign tags and stores in a file.
	"""

	# get frequently occurring words
	word_count_dict = get_word_counts(open('ner_train.dat'))
	freq_words = [word for word in word_count_dict if word_count_dict[word] >= 5]

	# compute emission probs
	counter = Hmm(3)
	counter.read_counts(counts_file)
	emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0])

	out_lines_list = []
	l = dev_file.readline()
	while l:
		word = l.strip()
		if word:  # Nonempty line
			# use emission probabilities of rare_symbol to assign tag and its probability for rare or unseen words.
			if word not in freq_words:
				tag = sorted(emission_probs[rare_symbol], key=emission_probs[rare_symbol].get, reverse=True)[0]
				prob = emission_probs[rare_symbol][tag]

			# use emission probabilities of the word itself for frequently occurring words.
			else:
				tag = sorted(emission_probs[word], key=emission_probs[word].get, reverse=True)[0]
				prob = emission_probs[word][tag]
			log_prob = math.log(prob, 2)
			l = word + " " + tag + " " + str(log_prob)
		else:
			l = ""
		out_lines_list.append(l)
		l = dev_file.readline()
	out_lines = "\n".join(out_lines_list)
	out_lines = out_lines + "\n"

	# write words, corresponding tags and log probs to file
	with open('4_2.txt','w') as out_file:
		out_file.write(out_lines)
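
A hypothetical invocation of baseline_tagger, following the count_freqs.py pattern used later in this listing (the rare-counts file names here are assumptions):

import os
os.system('python count_freqs.py ner_train_rare.dat > ner_rare.counts')
baseline_tagger(open('ner_rare.counts'), open('ner_dev.dat'))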
Example #9
def replace_infrequent_words_with_categories(in_file, out_file, count_thresh=5):
	"""
	Replace words with frequency < count_thresh in in_file by their category and store in out_file.
	"""

	# get frequency of each word in in_file
	word_count_dict = get_word_counts(in_file)

	out_lines_list = []
	in_file.seek(0)
	l = in_file.readline()
	while l:
		line = l.strip()
		if line:  # Nonempty line
			fields = line.split(" ")
			word = " ".join(fields[:-1])

			# replace word with its category if frequency < count_thresh
			if word_count_dict[word] < count_thresh:
				line = " ".join([get_category(word), fields[-1]])
		out_lines_list.append(line)
		l = in_file.readline()
	out_lines = "\n".join(out_lines_list)
	out_file.write(out_lines)
Example #10

def count_blocks(blocks, word_frequencies):
    block_frequencies = defaultdict(int)

    for word, word_blocks in blocks.items():
        frequency = word_frequencies[word]
        for block in word_blocks:
            block_frequencies[block] += frequency

    return block_frequencies
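
A toy illustration of count_blocks with made-up data, where each word maps to the letter blocks it contains and block counts are weighted by word frequency:

blocks = {"the": ["th", "he"], "he": ["he"]}
word_frequencies = {"the": 10, "he": 4}
print(dict(count_blocks(blocks, word_frequencies)))  # {'th': 10, 'he': 14}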


if __name__ == '__main__':
    import os
    word_counts = get_word_counts(
        os.path.join('word_lists', 'filtered_word_counts.txt'))
    words = word_counts.keys()
    bigrams = get_bigram_frequencies(word_counts)
    bigram_dict = get_bigram_dictionary(word_counts)

    # show_in_order(bigrams)
    # total_bigrams = sum(count for count in bigrams.values())
    # print(total_bigrams)

    # for bigram, count in sorted(bigrams.items(), key=lambda item: item[1])[:10]:
    #     print(bigram, count, find_words_containing_substring(words, bigram))

    # Top 40 bigrams
    # top_bigrams = [item for item, count in sorted(bigrams.items(), key=lambda item: item[1])[:40]]
    # letter_counts = Counter("".join(top_bigrams))
    # print(letter_counts)
Example #11
			word = " ".join(fields[:-1])

			# replace word with its category if frequency < count_thresh
			if word_count_dict[word] < count_thresh:
				line = " ".join([get_category(word), fields[-1]])
		out_lines_list.append(line)
		l = in_file.readline()
	out_lines = "\n".join(out_lines_list)
	out_file.write(out_lines)

if __name__ == "__main__":
	# replace infrequent words with categories and write to file
	replace_infrequent_words_with_categories(open('ner_train.dat'), open('ner_train_cats.dat', 'w'))

	# generate counts file
	os.system('python count_freqs.py ner_train_cats.dat > ner_cats.counts')

	# get frequent words
	word_count_dict = get_word_counts(open('ner_train.dat'))
	freq_words = [word for word in word_count_dict if word_count_dict[word] >= 5]

	# get transition and emission probabilities
	counter = Hmm(3)
	counter.read_counts(open('ner_cats.counts'))
	transition_probs = compute_transition_probs(counter.ngram_counts[1], counter.ngram_counts[2])
	emission_probs = compute_emission_probs(counter.emission_counts, counter.ngram_counts[0])

	# store tagged data with the log probs to file
	tagger(open('ner_dev.dat'), transition_probs, emission_probs, freq_words)

	os.system('python eval_ne_tagger.py ner_dev.key 6.txt')
Example #12
def main():

    vw_model = utils.get_word2vec_model(
        '../resources/imdb/data/alldata_word2vector', ncols, nwin)
    vw_model.syn0 = utils.normalize2(vw_model.syn0)
    glove_dict = utils.get_glove_data('../resources/imdb/glove',
                                      'vectors_300_5.txt')
    word_dict = utils.get_word_counts("../resources/imdb/data/word_counts.txt",
                                      1, 1, 2)

    # train_list = utils.get_shuffle_list('../resources/imdb/data/full-train-pos.txt',
    #                                     '../resources/imdb/data/full-train-neg.txt', True)
    # test_list = utils.get_shuffle_list('../resources/imdb/data/test-pos.txt',
    #                                    '../resources/imdb/data/test-neg.txt', False)
    train_list = utils.get_shuffle_list(
        '../resources/imdb/data/small-train-pos.txt',
        '../resources/imdb/data/small-train-neg.txt', True)
    test_list = utils.get_shuffle_list('../resources/imdb/data/valid-pos.txt',
                                       '../resources/imdb/data/valid-neg.txt',
                                       False)
    max_rlen = get_max_number_of_token()
    model = my_model(max_rlen)

    half = len(test_list) // 2
    results = []

    for j in range(30):
        acc_hist = {}
        for e in range(nepochs):
            for i in range(int(len(train_list) / nbatch)):
                x_train, y_train = get_review_windows(
                    vw_model, train_list[nbatch * i:nbatch * (i + 1)],
                    max_rlen, ncols, nbatch, glove_dict, word_dict)
                (loss, acc) = model.train_on_batch(x_train, y_train)

            print("Train: Epoch:" + str(e + 1) + " Loss = " + str(loss) +
                  " -- " + "Accuracy = " + str(acc))
            acc_hist[str(e + 1)] = acc

        with open("../resources/imdb/scores/CNN-VALID_" + str(j), 'w') as out:
            counter = 0
            score = 0
            for i in range(int(len(test_list) / nbatch)):
                x_test, y_test = get_review_windows(
                    vw_model, test_list[nbatch * i:nbatch * (i + 1)], max_rlen,
                    ncols, nbatch, glove_dict, word_dict)
                loss = model.test_on_batch(x_test, y_test)
                pred = model.predict_proba(x_test)
                classes = model.predict_classes(x_test)
                counter += nbatch
                for p, c in zip(pred, classes):
                    # score:
                    if counter <= half and c == 1:
                        score += 1
                    elif counter > half and c == 0:
                        score += 1
                    out.write(
                        str(c) + " " + str(p[1]) + " " + str(p[0]) + "\n")

                print("Test: Iteration:" + str(i + 1) + " Loss = " + str(loss))
            results.append(score)
        print("######################## Trial = " + str(j))

    acc = []
    for result in results:
        acc.append(result)
        print(result)

    np_acc = np.array(acc)
    print("Mean = " + str(np_acc.mean()))
    print("Std.Dev = " + str(np.std(np_acc, dtype=np.float64)))
Example #13
File: stats_report.py Project: rlzh/mec
def main(*args):

    plot = const.PLOT_DEFAULT
    print_ = const.PRINT_DEFAULT
    even_distrib = const.EVEN_DISTRIB_DEFAULT
    plt.rcParams.update({'font.size': const.FONT_SIZE_DEFAULT})
    # parse command-line arguments
    for arg in args:
        k = arg.split("=")[0]
        v = arg.split("=")[1]
        if k == 'plot':
            plot = utils.str_to_bool(v)
        elif k == 'print':
            print_ = utils.str_to_bool(v)
        elif k == 'font_size':
            plt.rcParams.update({'font.size': int(v)})
        elif k == 'even_distrib':
            even_distrib = utils.str_to_bool(v)

    if print_:
        print()
        print("--- Stats config ---")
        print("Even distribution dataset: {}".format(even_distrib))
        print("Plot: {}".format(plot))
        print("--------------------")
        print()

    # load data
    gen_spotify_df = pd.read_csv(const.GEN_SPOTIFY)
    clean_spotify_df = pd.read_csv(const.CLEAN_SPOTIFY)
    if not even_distrib:
        clean_spotify_df = pd.read_csv(const.CLEAN_UNEVEN_SPOTIFY)
    if print_:
        print("Spotify missing per col: \n{}".format(
            clean_spotify_df.isna().sum()))
        print("Spotify unclean shape: {}".format(gen_spotify_df))
        print("Spotify shape: {}".format(clean_spotify_df))
        print()

    gen_deezer_df = pd.read_csv(const.GEN_DEEZER)
    clean_deezer_df = pd.read_csv(const.CLEAN_DEEZER)
    if not even_distrib:
        clean_deezer_df = pd.read_csv(const.CLEAN_UNEVEN_DEEZER)
    if print_:
        print("Deezer missing per col: \n{}".format(
            clean_deezer_df.isna().sum()))
        print("Spotify unclean shape: {}".format(gen_deezer_df))
        print("Deezer shape: {}".format(clean_deezer_df))
        print()

    # get info on datasets
    clean_spotify_wc = get_word_counts(clean_spotify_df.lyrics.values,
                                       print_=print_)
    clean_spotify_uc = get_unique_counts(clean_spotify_df.lyrics.values,
                                         print_=print_)
    spotify_class_distrib = get_emotion_counts(clean_spotify_df, print_=print_)
    clean_deezer_wc = get_word_counts(clean_deezer_df.lyrics.values,
                                      print_=print_)
    clean_deezer_uc = get_unique_counts(clean_deezer_df.lyrics.values,
                                        print_=print_)
    deezer_class_distrib = get_emotion_counts(clean_deezer_df, print_=print_)

    # word count hist
    plot_hist("Dataset Word Count",
              clean_spotify_wc,
              const.SPOTIFY,
              clean_deezer_wc,
              const.DEEZER,
              a1=0.4,
              a2=0.4,
              xlabel="# of Songs",
              ylabel="Word Count")

    # unique word count hist
    plot_hist("Dataset Unique Words Count",
              clean_spotify_uc,
              const.SPOTIFY,
              clean_deezer_uc,
              const.DEEZER,
              a1=0.4,
              a2=0.4,
              ylabel="Unique Word Count",
              xlabel="# of Songs")

    # class distrib scatter plot
    plot_val_arousal_scatter(
        "Spotify: Valence-Arousal Distribution",
        clean_spotify_df.valence.values,
        clean_spotify_df.arousal.values,
        gen_spotify_df.valence.values,
        gen_spotify_df.arousal.values,
    )
    plot_val_arousal_scatter(
        "Deezer: Valence-Arousal Distribution",
        clean_deezer_df.valence.values,
        clean_deezer_df.arousal.values,
        gen_deezer_df.valence.values,
        gen_deezer_df.arousal.values,
    )

    datasets = [
        (const.SPOTIFY, clean_spotify_df),
        (const.DEEZER, clean_deezer_df),
    ]
    for name, dataset in datasets:
        for i in dataset.y.unique():
            class_df = utils.get_class_based_data(
                dataset,
                i,
                random_state=const.RANDOM_STATE_DEFAULT,
                include_other_classes=True,
                even_distrib=False,
                limit_size=False,
                print_=True)
            print("{} Class {} data shape: {}".format(name, i, class_df.shape))
            print("{} Class {} data mean valence-arousal: {}".format(
                name, i, (class_df.valence.mean(), class_df.arousal.mean())))
            plot_val_arousal_scatter(
                "{}: Class {} Data Valence-Arousal Distribution".format(
                    name, i), class_df.valence.values, class_df.arousal.values)

    # class distrib hist
    plt.figure()
    x = np.array([i + 1 for i in range(len(clean_spotify_df.y.unique()))])

    plt.title("Dataset Class Distribution")
    plt.bar(x - 0.125,
            get_y(clean_spotify_df),
            width=0.25,
            align='center',
            label=const.SPOTIFY)
    plt.bar(x + 0.125,
            get_y(clean_deezer_df),
            width=0.25,
            align='center',
            label=const.DEEZER)
    plt.xticks(x, labels=["Happy", "Angry", "Sad", "Relaxed"])
    plt.legend()
    if plot:
        plt.draw()
        plt.show()