def __iter__(self):
    """
    Defines how to iterate the MySentences class in order to feed it directly
    into the Word2Vec method. Yields a sentence (as a list of words) for every
    iteration.
    """
    # for root, dirs, files in os.walk(self.dirname):
    for file_path in self.file_paths:
        file_data = VectorManager.read_vector(file_path)
        file_sentences = VectorManager.parse_into_sentences(file_data)
        for sentence in file_sentences:
            yield sentence
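# Usage sketch (illustrative only): assuming MySentences is constructed with a
# list of corpus file paths, the iterable can be handed straight to gensim's
# Word2Vec, which re-iterates it on every training pass. The path and the
# training parameters below are placeholders, not values from the pipeline.
from gensim.models import Word2Vec

sentences = MySentences(["../data/example/AA/wiki_00_wl"])  # hypothetical constructor/path
model = Word2Vec(sentences, size=200, min_count=5, workers=4)  # `size` becomes `vector_size` in gensim >= 4.0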
def is_valid_numpy():
    """
    Translates the word IDs stored in `filename` back into words and compares
    them element-wise against the original 4D word structure in `file_words`.
    Returns True if the equality ratio reaches `confidence`.
    """
    docs_ids = VectorManager.read_vector(filename)
    original = VectorManager.parse_into_4D(VectorManager.read_vector(file_words))
    file_list = []
    comparison = []
    unknowns = 0
    for d in range(0, len(docs_ids)):
        doc_list = []
        for p in range(0, len(docs_ids[d])):
            par_list = []
            for s in range(0, len(docs_ids[d][p])):
                sent_list = []
                for w in range(0, len(docs_ids[d][p][s])):
                    try:
                        translated = to_word(docs_ids[d][p][s][w])
                        if translated == '<unk>':
                            unknowns += 1
                        comparison.append(translated == original[d][p][s][w])
                        sent_list.append(translated)
                    except Exception as e:
                        print("[%s] Indices %s %s %s %s: %s" % (filename, d, p, s, w, e))
                par_list.append(sent_list)
            doc_list.append(par_list)
        file_list.append(doc_list)

    valid = False
    # Compute the unknown ratio up front so it is defined even if the
    # comparison list turns out to be empty.
    u_ratio = round(float(unknowns) / len(comparison), 2) if comparison else 0.0
    try:
        ratio = float(comparison.count(True)) / len(comparison)
        if ratio < confidence:
            print("[WARN] File %s equality ratio is %s with %s unknown ratio"
                  % (filename, round(ratio, 2), u_ratio))
        else:
            print("[OK] File %s equality ratio is %s with %s unknown ratio"
                  % (filename, round(ratio, 2), u_ratio))
            valid = True
    except ZeroDivisionError as e:
        print("[ERROR] File %s is completely different (%s) with %s unknown ratio"
              % (filename, e, u_ratio))
    return valid
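# Toy illustration (not part of the pipeline): the validity check above reduces
# to an element-wise comparison between the round-tripped and original words.
original_words = ["the", "roman", "consul"]
round_tripped = ["the", "<unk>", "consul"]
comparison_toy = [o == r for o, r in zip(original_words, round_tripped)]
ratio_toy = float(comparison_toy.count(True)) / len(comparison_toy)  # 0.666...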
def __init__(self, dictionary_path, word2id, embeddings, lda=None, lsi=None):
    self.dictionary = self.load_dict(dictionary_path)
    self.word2id = VectorManager.read_vector(word2id)
    # self.word2id = self.word2id_to_id2word(word2id)
    self.embeddings = embeddings
    self.lda = lda
    self.lsi = lsi
def transform_numpy():
    """
    Transforms a 4D list of words into a 4D numpy array of integers and writes
    it into file_out.
    """
    docs = VectorManager.parse_into_4D(VectorManager.read_vector(filename))
    file_list = []
    for doc in docs:
        doc_list = []
        for paragraph in doc:
            par_list = []
            for sentence in paragraph:
                s_id = [toId(word) for word in sentence if word]
                if s_id:
                    par_list.append(s_id)
            doc_list.append(par_list)
        file_list.append(doc_list)
    np.save(file_out, np.array(file_list))
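# Minimal sketch of the word -> id step (hypothetical toId_toy and vocabulary;
# the real mapping comes from the word2id vector produced by the embedding stage).
word2id_toy = {"the": 0, "roman": 1, "consul": 2, "<unk>": 3}

def toId_toy(word):
    # Unknown words fall back to the <unk> id, mirroring the pipeline behaviour.
    return word2id_toy.get(word, word2id_toy["<unk>"])

sentence_ids = [toId_toy(w) for w in ["the", "roman", "senate"]]  # [0, 1, 3]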
def _transform_file(file_path, w2id, split_par=False, debug=False):
    """
    Cleans a file containing articles into a flat stream of lowercased tokens
    where every sentence ends with an <eos> marker and, when split_par is set,
    every paragraph additionally ends with an <eop> marker. Writes the result
    to disk with the suffix _clean_simple or _clean_paragraph.
    :param file_path: file to transform
    """
    if debug:
        print("Cleaning %s" % file_path)
    with open(file_path) as f:
        data = f.read().decode("latin-1")
    docs = data.split("</doc>")
    del data

    if not split_par:
        file_out = "%s_clean_simple" % file_path
    else:
        file_out = "%s_clean_paragraph" % file_path

    file_string = ""
    for doc in [d.strip() for d in docs if d.strip()]:
        paragraphs = [tokenize(par)
                      for par in remove_title(cleanhtml(doc)).strip().split("\n\n")
                      if par]
        for p in paragraphs:
            par_a = False
            for sent in p:
                line = [word for word in sent.lower().split()
                        if word.isalpha() or is_number(word)]
                line = " ".join([known(word, w2id) for word in line])
                if line:
                    file_string += line + " <eos> "
                    par_a = True
            if par_a and split_par:
                file_string += " <eop> "

    VectorManager.write_string(file_out, file_string.encode("latin-1"))
    del file_string
    if debug:
        print("Done with %s" % file_path)
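# Illustrative output for a two-sentence paragraph with split_par=True
# (hypothetical content): sentences are delimited by <eos>, paragraphs by <eop>,
# and out-of-vocabulary words are replaced by <unk> via known().
example_clean = "the roman consul is a notable person <eos> he is elected by <unk> voting <eos>  <eop> "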
def _transform_file(file_path, debug=False):
    """
    Cleans a file containing articles into a words-list representation: one
    sentence per line, with blank lines separating paragraphs and documents.
    Writes the result to disk with the name filename_wl (words list).
    :param file_path: file to transform
    """
    if debug:
        print("Cleaning %s" % file_path)
    with open(file_path) as f:
        raw = f.read().decode("latin-1")
    data = cleanhtml(raw)
    docs = data.split("</doc>")
    del data

    file_out = "%s_wl" % file_path
    file_string = ""
    for doc in [d.strip() for d in docs if d.strip()]:
        paragraphs = [tokenize(par)
                      for par in remove_title(cleanhtml(doc)).strip().split("\n\n")
                      if par]
        doc_a = False
        for p in paragraphs:
            par_a = False
            for sent in p:
                line = " ".join([word for word in sent.lower().split()
                                 if word.isalpha() or is_number(word)])
                if line:
                    file_string += line + "\n"
                    par_a = True
                    doc_a = True
            if par_a:
                file_string += "\n"
        if doc_a:
            file_string += "\n"

    VectorManager.write_string(file_out, file_string.encode("latin-1"))
    del file_string
    if debug:
        print("Done with %s" % file_path)
def generate_arrays_from_list(name, files, embeddings,
                              num_steps=35, batch_size=20, embedding_size=200):
    debug = False
    while 1:
        for file_name in files:
            print("Generating from file %s for %s" % (file_name, name))
            raw_list = VectorManager.parse_into_list(open(file_name).read())
            n_words = len(raw_list)
            batch_len = n_words // batch_size
            data = np.reshape(raw_list[0:batch_size * batch_len],
                              [batch_size, batch_len])
            for i in range(0, n_words - num_steps, 1):
                x = data[0:batch_size, i * num_steps:(i + 1) * num_steps]
                x = [[embeddings[int(elem)][2] for elem in l] for l in x]
                y = data[0:batch_size, i * num_steps + 1:(i + 1) * num_steps + 1]
                if len(x[0]) < num_steps or len(y[0]) < num_steps:
                    break
                if debug:
                    print("Batch size %s\nNum steps %s\nEmbedding size %s"
                          % (batch_size, num_steps, embedding_size))
                    print("Len(x): %s\n Len(x[0] %s\n Len(x[0][0] %s"
                          % (len(x), len(x[0]), len(x[0][0])))
                    print("Len(y): %s\n Len(y[0] %s" % (len(y), len(y[0])))
                x = np.reshape(x, newshape=(batch_size, num_steps, embedding_size))
                y = np.reshape(y, newshape=(batch_size, num_steps))
                yield x, y
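# Usage sketch (illustrative path and sizes, assuming the idWordVec-style
# `embeddings` list is already loaded): the generator yields an endless stream
# of (inputs, targets) batches, where inputs are embedded word vectors and
# targets are the ids of the following words.
gen = generate_arrays_from_list("Train", ["wiki_00_num"], embeddings,
                                num_steps=35, batch_size=20, embedding_size=200)
x_batch, y_batch = next(gen)
# x_batch.shape == (20, 35, 200), y_batch.shape == (20, 35)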
def main(_):
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to wiki data directory list")

    vocab_size = 126930

    config = get_config()
    config.vocab_size = vocab_size
    valid_config = get_config()
    valid_config.vocab_size = vocab_size
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    eval_config.vocab_size = vocab_size

    embeddings = VectorManager.read_vector("%s%s.pklz"
                                           % (FLAGS.embeddings, config.embedding_size))

    files = open(FLAGS.data_path).read().split()
    training_list = files[0:int(0.8 * len(files))]
    validation_list = files[int(0.8 * len(files)):int(0.9 * len(files))]
    testing_list = files[int(0.9 * len(files)):len(files)]

    config.epoch_size = get_epoch_size(training_list, config)
    valid_config.epoch_size = get_epoch_size(validation_list, valid_config)
    eval_config.epoch_size = get_epoch_size(testing_list, eval_config)

    gen_train = generate_arrays_from_list("Train",
                                          training_list,
                                          embeddings,
                                          batch_size=config.batch_size,
                                          embedding_size=config.embedding_size,
                                          num_steps=config.num_steps)
    gen_valid = generate_arrays_from_list("Validation",
                                          validation_list,
                                          embeddings,
                                          batch_size=valid_config.batch_size,
                                          embedding_size=valid_config.embedding_size,
                                          num_steps=valid_config.num_steps)
    gen_test = generate_arrays_from_list("Test",
                                         testing_list,
                                         embeddings,
                                         batch_size=eval_config.batch_size,
                                         embedding_size=eval_config.embedding_size,
                                         num_steps=eval_config.num_steps)

    print("Epoch sizes\n * Training: %s\n * Validation: %s\n * Testing: %s"
          % (config.epoch_size, valid_config.epoch_size, eval_config.epoch_size))
    sys.stdout.flush()

    with tf.Graph().as_default():
        # Args: [minval, maxval]
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)

        with tf.name_scope("Train"):
            with tf.variable_scope("Model", reuse=None, initializer=initializer):
                m = WPModel(is_training=True, config=config)
            tf.summary.scalar("Training Loss", m.cost)
            tf.summary.scalar("Learning Rate", m.lr)

        with tf.name_scope("Valid"):
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mvalid = WPModel(is_training=False, config=valid_config)
            tf.summary.scalar("Validation Loss", mvalid.cost)

        with tf.name_scope("Test"):
            with tf.variable_scope("Model", reuse=True, initializer=initializer):
                mtest = WPModel(is_training=False, config=eval_config)

        sv = tf.train.Supervisor(logdir=FLAGS.save_path)
        with sv.managed_session() as session:
            for i in range(config.max_max_epoch):
                lr_decay = config.lr_decay ** max(i + 1 - config.max_epoch, 0.0)
                m.assign_lr(session, config.learning_rate * lr_decay)

                print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
                train_perplexity = run_epoch(session,
                                             generator=gen_train,
                                             model=m,
                                             eval_op=m.train_op,
                                             verbose=True)
                print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))

                valid_perplexity = run_epoch(session, generator=gen_valid, model=mvalid)
                print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))

            test_perplexity = run_epoch(session, generator=gen_test, model=mtest)
            print("Test Perplexity: %.3f" % test_perplexity)

            if FLAGS.save_path:
                print("Saving model to %s." % FLAGS.save_path)
                sv.saver.save(session, FLAGS.save_path, global_step=sv.global_step)
        '--id_word_vec',
        type=str,
        help="Path of id <-> word <-> embedding vector",
        required=True)
    parser.add_argument(
        '-w',
        '--word_vectors',
        type=str,
        help="Path of LM to perform the tests upon",
        required=True)

    args = parser.parse_args()

    # Arguments parsing
    wv_path = args.word_vectors
    path = args.id_word_vec

    print("Loading model...")
    wv = KeyedVectors.load_word2vec_format(wv_path, binary=False)

    print("Loading id-word-vec...")
    id_word_vec = VectorManager.read_vector(path)

    print("Finding subset to plot")
    initial_word = 'jupiter'
    max_elements = 500
    sb = subset(initial_word, id_word_vec, wv, max_elements)

    print("Plotting subset of words...")
    # Plot t-SNE
    plot_tsne(sb)
    sys.stdout.flush()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-d',
        '--data',
        type=str,
        help="Path of the data to be translated with the word2id vector"
             " and cleaned up.",
        required=True)
    parser.add_argument(
        '-w',
        '--word_vector',
        type=str,
        help="Word2ID vector to be used for doc reverse translation.",
        required=True)

    args = parser.parse_args()

    data_path = args.data
    word2id_file = args.word_vector

    begin = time()

    w2Id = VectorManager.read_vector(word2id_file)
    check_translated_files(data_path, w2Id)

    end = time()
    print("Total processing time: %d seconds" % (end - begin))
print("Saving embeddings model...") model.save("../models/word2vec_gensim_%s" % emb_size) model.wv.save_word2vec_format("../models/word2vec_org_%s" % emb_size, "../models/vocabulary_%s" % emb_size, binary=False) # Get only: # * word2id vector (for transforming data to numerical) # * id_word_vec (actually contain word embeddings an associated id <-> word t3 = time() word2id, id_word_vec = transform_gensim(model.wv) t4 = time() print("Time transforming gensim to word2ID and idWordVec vectors: %s" % (t4 - t3)) # Save model for checkpointing VectorManager.write_pickled("../models/word2id_%s" % emb_size, word2id) VectorManager.write_pickled("../models/idWordVec_%s" % emb_size, id_word_vec) t5 = time() translate_files(data_path, word2id) t6 = time() print("Time translating words to numbers: %s" % (t6 - t5)) t7 = time() check_translated_files(data_path, word2id) t8 = time() print("Time translating words to numbers: %s" % (t8 - t7))
help="Id2Word vector path ['wiki_en_wordids.txt'].", required=True, default=None) args = parser.parse_args() model_path = args.model id2word_path = args.id_word word2id_path = args.word2id_path emb_path = args.embeddings begin = time() dictionary = load_dict(id2word_path) id2word = word2id_to_id2word(word2id_path) w2Id = VectorManager.read_vector(word2id_path) embeddings = VectorManager.read_vector(emb_path) demo1 = "the roman consul is normally a notable person from the senate elected " \ "by direct voting of the italic tribes" data = open("../data/small/AA/wiki_01_clean_simple").read().split("<eop>") s1 = data[0].split("<eos>")[0] data = open("../data/small/AA/wiki_00_clean_simple").read().split("<eop>") s2 = data[0].split("<eos>")[0] data = open("../data/small/AB/wiki_00_clean_simple").read().split("<eop>") s3 = data[0].split("<eos>")[0] data = open("../data/small/AB/wiki_01_clean_simple").read().split("<eop>") s4 = data[0].split("<eos>")[0] if "lda" in model_path:
    word2id = dict([(w, id) for id, w, _ in id_word_vec])

    return word2id, id_word_vec


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-k',
        '--kv',
        type=str,
        help="Path of the keyed vectors to translate [word2vec_org_XXX]",
        required=True)

    args = parser.parse_args()
    data_path = args.kv

    print("Loading keyed vectors")
    wv = KeyedVectors.load_word2vec_format(data_path, binary=False)
    emb_size = len(wv.syn0[0])

    word2id, id_word_vec = transform_gensim(wv)

    w2id_filepath = "../models/word2id_%s" % emb_size
    idWordVec_filepath = "../models/idWordVec_%s" % emb_size
    print("Writing files:\n\t * word2id: %s\n\t * idWordVec: %s"
          % (w2id_filepath, idWordVec_filepath))

    VectorManager.write_pickled(w2id_filepath, word2id)
    VectorManager.write_pickled(idWordVec_filepath, id_word_vec)
def generate_arrays_from_list(name, topic_creator, files, embeddings,
                              num_steps=35, batch_size=20, embedding_size=200):
    eos_mark = [id for id, w, vec in embeddings if w == "<eos>"][0]
    eop_mark = [id for id, w, vec in embeddings if w == "<eop>"][0]
    unknown_embedding = [vec for id, w, vec in embeddings if w == "<unk>"][0]
    debug = False
    # print("EOS mark: %s, EOP mark: %s" % (eos_mark, eop_mark))
    while 1:
        for file_name in files:
            raw_list = VectorManager.parse_into_list(open(file_name).read())
            n_words = len(raw_list)
            batch_len = n_words // batch_size
            data = np.reshape(raw_list[0:batch_size * batch_len],
                              [batch_size, batch_len])

            # Running word-id buffers holding the current sentence and paragraph
            # of each batch row; they are reset at the <eos> / <eop> marks.
            sentSegments = [list() for _ in range(batch_size)]
            parSegments = [list() for _ in range(batch_size)]

            for i in range(0, n_words - num_steps, 1):
                x = data[0:batch_size, i * num_steps:(i + 1) * num_steps]
                y = data[0:batch_size, i * num_steps + 1:(i + 1) * num_steps + 1]
                if len(x[0]) < num_steps or len(y[0]) < num_steps:
                    break

                emb_x = [[embeddings[int(elem)][2] for elem in l] for l in x]
                emb_x = np.reshape(emb_x, newshape=(batch_size, num_steps, embedding_size))
                final_x = np.zeros(shape=(batch_size, num_steps, len(embeddings[0][2]) * 3))

                for batch in range(0, batch_size):
                    for step in range(0, num_steps):
                        if debug:
                            print("%s == %s ? %s [eos]\n%s == %s ? %s[eop]"
                                  % (int(x[batch][step]), eos_mark,
                                     int(x[batch][step]) == eos_mark,
                                     int(x[batch][step]), eop_mark,
                                     int(x[batch][step]) == eop_mark))
                        if int(x[batch][step]) == eos_mark:
                            sentSegments[batch] = []
                        else:
                            sentSegments[batch].append(x[batch][step])
                        if int(x[batch][step]) == eop_mark:
                            parSegments[batch] = []
                        else:
                            parSegments[batch].append(x[batch][step])

                        sentTopic = unknown_embedding
                        parTopic = unknown_embedding
                        # Only query the topic model when the current per-batch
                        # segment is non-empty.
                        if sentSegments[batch]:
                            sentTopic = get_context(topic_creator, sentSegments[batch])
                        if parSegments[batch]:
                            if sentSegments[batch] == parSegments[batch]:
                                parTopic = sentTopic
                            else:
                                parTopic = get_context(topic_creator, parSegments[batch])

                        final_x[batch][step] = np.hstack((emb_x[batch][step],
                                                          sentTopic, parTopic))

                if debug:
                    print("Batch size %s\nNum steps %s\nEmbedding size %s"
                          % (batch_size, num_steps, embedding_size))
                    print("Len(x): %s\n Len(x[0] %s\n Len(x[0][0] %s"
                          % (len(x), len(x[0]), len(x[0][0])))
                    print("Len(y): %s\n Len(y[0] %s" % (len(y), len(y[0])))

                y = np.reshape(y, newshape=(batch_size, num_steps))

                yield final_x, y
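# Shape sketch (illustrative sizes): every time step of final_x concatenates the
# word embedding with a sentence-topic vector and a paragraph-topic vector, so
# the language model consumes inputs three times the base embedding size.
import numpy as np

base_size = 200                     # assumed base embedding size
word_vec = np.zeros(base_size)
sent_topic = np.zeros(base_size)
par_topic = np.zeros(base_size)
step_input = np.hstack((word_vec, sent_topic, par_topic))
assert step_input.shape == (3 * base_size,)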
def get_vocab_size():
    word_to_id = VectorManager.read_vector(FLAGS.word_to_id_path)
    size = len(word_to_id)
    print("Vocabulary size: %s" % size)
    return size
def get_file_as_list(filename):
    words_list = VectorManager.parse_into_list(VectorManager.read_vector(filename))
    words_list = [w for w in words_list if w not in stop_words]
    return words_list
def read_file(filename):
    return VectorManager.read_vector(filename)