Example #1
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

import tools.processing as pre

batch_size = 256
# embedding_dimension = 128
embedding_dimension = 10
negative_samples = 32
LOG_DIR = "logs/word2vec_intro"
EPOCHS = 20

tf.reset_default_graph()

text = pre.get_text("data/cleaned-rap-lyrics/ref_text3.txt")

sentences = []

# Create two kinds of sentences - sequences of odd and even digits.
# for i in range(10000):
#     rand_odd_ints = np.random.choice(range(1, 10, 2), 3)
#     sentences.append(" ".join([digit_to_word_map[r] for r in rand_odd_ints]))
#     rand_even_ints = np.random.choice(range(2, 10, 2), 3)
#     sentences.append(" ".join([digit_to_word_map[r] for r in rand_even_ints]))

# text = pre.get_text("data/cleaned-rap-lyrics/lyrics_combined.txt")
# text = pre.get_text("data/prepped/clean2_pac.txt")

# sentences = text.split("\n")
sentences = [text.replace("\n", ";")]
Example #2
import os
import math
import glob
import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

import tools.processing as pre

batch_size = 256
embedding_dimension = 3
negative_samples = 32
LOG_DIR = "logs/phone2vec_v3"
EPOCHS = 5

text = pre.get_text("data/phonem-rap-lyrics/phonem_all.txt")
sentences = [text.replace("\n", ";")]

vocab = pre.Vocabulary(sentences[0])

# Map words to indices
index2word_map = vocab.index2word_map
word2index_map = vocab._dict

vocabulary_size = len(index2word_map)
print("vocab_size: {} \n".format(vocabulary_size))

# Generate skip-gram (target, context) pairs.
skip_gram_pairs = []
for sent in sentences:
    tokenized_sent = sent.split()
    # Completion sketch (assumed): pair each word with its immediate
    # neighbours, i.e. a context window of 1.
    for i in range(1, len(tokenized_sent) - 1):
        target = word2index_map[tokenized_sent[i]]
        skip_gram_pairs.append([target, word2index_map[tokenized_sent[i - 1]]])
        skip_gram_pairs.append([target, word2index_map[tokenized_sent[i + 1]]])
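With the pairs in place, a training batch is just a random sample; a minimal sketch (the function name is ours, not from the original):

def get_skipgram_batch(batch_size):
    # Draw a random batch of (target, context) pairs.
    instance_indices = list(range(len(skip_gram_pairs)))
    np.random.shuffle(instance_indices)
    batch = instance_indices[:batch_size]
    x = [skip_gram_pairs[i][0] for i in batch]
    y = [[skip_gram_pairs[i][1]] for i in batch]
    return x, y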
Example #3
import os
import re

import tools.processing as pre
import tools.spell_correction as spell

FILE = "clean2_pac.txt"

SRC = os.path.join("data/cleaned-rap-lyrics", FILE)
DST = os.path.join("data/prepped", FILE)

text = pre.get_text(SRC)

corr_text = spell.correct(text, "data/words_alpha.txt")

print(corr_text[:500])

# Restore the line breaks that were protected with the "linebreakhere" placeholder.
corr_text = re.sub(" *linebreakhere *", "\n", corr_text)

pre.write_text(DST, corr_text)
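The "linebreakhere" token implies that newlines were swapped for a placeholder before correction, so the corrector can treat the lyrics as one stream of plain tokens. A round-trip sketch under that assumption:

import re

raw = "first line\nsecond line"

# Before correction: protect newlines with the placeholder (assumed step).
protected = raw.replace("\n", " linebreakhere ")

# ... spell correction would run over `protected` here ...

# After correction: restore the line structure, exactly as the script above does.
restored = re.sub(" *linebreakhere *", "\n", protected)
assert restored == raw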
Example #4
import tensorflow as tf

import tools.processing as pre

batch_size = 256
# embedding_dimension = 128
embedding_dimension = 10
negative_samples = 32
# A window_size of 1 gave the best results at predicting words with an
# equivalent meaning; a window_size of 5 predicted topical words, which we
# don't need in our case. For the best results we decided to use the GloVe
# embeddings instead.
window_size = 1
LOG_DIR = "logs/word2vec_v2"
EPOCHS = 10

tf.reset_default_graph()

text = pre.get_text("data/cleaned-rap-lyrics/final_2_pac_rakim_kid_cudi.txt")

sentences = []

# Create two kinds of sentences - sequences of odd and even digits.
# for i in range(10000):
#     rand_odd_ints = np.random.choice(range(1, 10, 2), 3)
#     sentences.append(" ".join([digit_to_word_map[r] for r in rand_odd_ints]))
#     rand_even_ints = np.random.choice(range(2, 10, 2), 3)
#     sentences.append(" ".join([digit_to_word_map[r] for r in rand_even_ints]))

# text = pre.get_text("data/cleaned-rap-lyrics/lyrics_combined.txt")
# text = pre.get_text("data/prepped/clean2_pac.txt")

# sentences = text.split("\n")
# Treat the whole corpus as one long sentence; ';' marks the original line breaks.
sentences = [text.replace("\n", ";")]
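The comments above settle on pretrained GloVe vectors rather than the self-trained embeddings. A minimal loading sketch, assuming a downloaded GloVe text file and the word2index_map/vocabulary_size from Example #2; the path and dimension are hypothetical:

import numpy as np

GLOVE_PATH = "data/glove.6B.50d.txt"  # hypothetical location of a GloVe file
glove_dim = 50

# GloVe text format: one word per line followed by its vector components.
embedding_matrix = np.zeros((vocabulary_size, glove_dim))
with open(GLOVE_PATH, encoding="utf-8") as f:
    for line in f:
        values = line.rstrip().split(" ")
        word, vector = values[0], np.asarray(values[1:], dtype=np.float32)
        if word in word2index_map:
            embedding_matrix[word2index_map[word]] = vector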