import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
import tools.processing as pre

batch_size = 256
# embedding_dimension = 128
embedding_dimension = 10
negative_samples = 32
LOG_DIR = "logs/word2vec_intro"
EPOCHS = 20

tf.reset_default_graph()

text = pre.get_text("data/cleaned-rap-lyrics/ref_text3.txt")
sentences = []
# Create two kinds of sentences - sequences of odd and even digits.
# for i in range(10000):
#     rand_odd_ints = np.random.choice(range(1, 10, 2), 3)
#     sentences.append(" ".join([digit_to_word_map[r] for r in rand_odd_ints]))
#     rand_even_ints = np.random.choice(range(2, 10, 2), 3)
#     sentences.append(" ".join([digit_to_word_map[r] for r in rand_even_ints]))
# Alternative corpora:
# text = pre.get_text("data/cleaned-rap-lyrics/lyrics_combined.txt")
# text = pre.get_text("data/prepped/clean2_pac.txt")
# sentences = text.split("\n")
sentences = [text.replace("\n", ";")]
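# --- Sketch, not part of the repo: the Vocabulary helper from tools.processing
# (used by the script below via vocab.index2word_map and vocab._dict) is not
# shown in this section. A minimal stand-in that builds the same two maps from
# the ";"-joined sentences might look like this; the exact tokenization and
# casing rules of the real helper are assumptions.
def build_vocab_maps(sentences):
    word2index_map = {}
    for sent in sentences:
        for word in sent.split():
            if word not in word2index_map:
                word2index_map[word] = len(word2index_map)
    index2word_map = {index: word for word, index in word2index_map.items()}
    return word2index_map, index2word_map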
import os
import math
import glob
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
import tools.processing as pre

batch_size = 256
embedding_dimension = 3
negative_samples = 32
LOG_DIR = "logs/phone2vec_v3"
EPOCHS = 5

text = pre.get_text("data/phonem-rap-lyrics/phonem_all.txt")
sentences = [text.replace("\n", ";")]

vocab = pre.Vocabulary(sentences[0])
# Map words to indices
index2word_map = vocab.index2word_map
word2index_map = vocab._dict
vocabulary_size = len(index2word_map)
print("vocab_size: {} \n".format(vocabulary_size))

# Generate skip-gram pairs with a window size of 1: each word is paired with
# its immediate left and right neighbors
skip_gram_pairs = []
for sent in sentences:
    tokenized_sent = sent.split()
    for i in range(1, len(tokenized_sent) - 1):
        center = word2index_map[tokenized_sent[i]]
        skip_gram_pairs.append([center, word2index_map[tokenized_sent[i - 1]]])
        skip_gram_pairs.append([center, word2index_map[tokenized_sent[i + 1]]])
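# --- Sketch of the likely continuation, not verbatim from the repo: the
# hyperparameters above (negative_samples, batch_size, EPOCHS) and the projector
# import point to the standard TF1 skip-gram setup with NCE loss. The helper
# name get_skipgram_batch is an assumption for illustration; it samples random
# (center, context) pairs from skip_gram_pairs built above.
import random

def get_skipgram_batch(batch_size):
    instance_indices = list(range(len(skip_gram_pairs)))
    random.shuffle(instance_indices)
    batch = instance_indices[:batch_size]
    x = [skip_gram_pairs[i][0] for i in batch]
    y = [[skip_gram_pairs[i][1]] for i in batch]
    return x, y

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])

# Embedding lookup table, initialized uniformly
embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_dimension], -1.0, 1.0))
embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# NCE loss: learn to separate true context words from negative_samples
# randomly drawn noise words
nce_weights = tf.Variable(
    tf.truncated_normal([vocabulary_size, embedding_dimension],
                        stddev=1.0 / math.sqrt(embedding_dimension)))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
loss = tf.reduce_mean(
    tf.nn.nce_loss(weights=nce_weights, biases=nce_biases,
                   labels=train_labels, inputs=embed,
                   num_sampled=negative_samples,
                   num_classes=vocabulary_size))
train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(EPOCHS * (len(skip_gram_pairs) // batch_size)):
        x_batch, y_batch = get_skipgram_batch(batch_size)
        sess.run(train_step, feed_dict={train_inputs: x_batch,
                                        train_labels: y_batch})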
import os
import re

import tools.processing as pre
import tools.spell_correction as spell

FILE = "clean2_pac.txt"
SRC = os.path.join("data/cleaned-rap-lyrics", FILE)
DST = os.path.join("data/prepped", FILE)

text = pre.get_text(SRC)
corr_text = spell.correct(text, "data/words_alpha.txt")
print(corr_text[:500])

# Restore the newlines that were encoded as "linebreakhere" markers
corr_text = re.sub(" *linebreakhere *", "\n", corr_text)
pre.write_text(DST, corr_text)
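# --- Illustration only, not part of the script: the substitution above assumes
# the corpus encodes line breaks as the literal token "linebreakhere"
# (presumably so the spell checker sees plain running text). A tiny example of
# the round trip:
import re

sample = "money linebreakhere power linebreakhere respect"
print(re.sub(" *linebreakhere *", "\n", sample))
# money
# power
# respect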
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector
import tools.processing as pre

batch_size = 256
# embedding_dimension = 128
embedding_dimension = 10
negative_samples = 32
# A window_size of 1 gave the best results for predicting words with a similar
# meaning; a window_size of 5 predicted topical words, which we don't need in
# our case. For the best results we decided to take the GloVe embeddings.
window_size = 1
LOG_DIR = "logs/word2vec_v2"
EPOCHS = 10

tf.reset_default_graph()

text = pre.get_text("data/cleaned-rap-lyrics/final_2_pac_rakim_kid_cudi.txt")
# Alternative corpora:
# text = pre.get_text("data/cleaned-rap-lyrics/lyrics_combined.txt")
# text = pre.get_text("data/prepped/clean2_pac.txt")
# sentences = text.split("\n")
sentences = [text.replace("\n", ";")]
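# --- Sketch, not part of this script: since the comments above settle on
# pretrained GloVe embeddings, this shows one minimal way to read a standard
# GloVe text file (one "word v1 v2 ..." entry per line) into a dict, keeping
# only words from the corpus vocabulary. The file path and dimensionality are
# assumptions, not from this repo.
import numpy as np

def load_glove_vectors(path, vocab):
    word_vectors = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            if parts[0] in vocab:
                word_vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return word_vectors

# Example (hypothetical path): vectors = load_glove_vectors(
#     "data/glove/glove.6B.100d.txt", set(text.split()))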