def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--embeddings", nargs="+", required=True)
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-v", "--vocabulary", default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=1024)
    parser.add_argument("-k", "--num_nearest_neighbor", type=int, default=10)
    # Passing -oov disables OOV generation (the default is to generate OOV words).
    parser.add_argument("-oov", "--generate_oov_words", action="store_false")
    args = parser.parse_args()

    if args.generate_oov_words:
        concatenate_embeddings_generate(
            embeddings_path=args.embeddings,
            out_path=args.output,
            vocab=vocab_from_path(args.vocabulary) if args.vocabulary else None,
            batch_size=args.batch_size,
            k=args.num_nearest_neighbor,
        )
    else:
        concatenate_embeddings(
            embeddings_path=args.embeddings,
            out_path=args.output,
            vocab=vocab_from_path(args.vocabulary) if args.vocabulary else None,
        )
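# Illustrative invocation (the script filename and file paths are assumed, not taken
# from the source). It builds a concatenated meta embedding from two embeddings,
# generating missing (OOV) words by default:
#   python3 concatenate_meta_embedding.py -i emb1.vec emb2.vec -o meta.vec -v vocab.txt -b 1024 -k 10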
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')

    format = 'bin' if path_input.split('/')[-1].split('.')[-1] == 'bin' else 'text'

    emb = load_embedding(
        path_input,
        format=format,
        vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
        length_normalize=args.length_normalize,
        normalize_dimensionwise=args.normalize_dimensionwise,
        to_unicode=True,
        lower=args.lower,
        path2='',
        delete_duplicates=True,
        method_vgg="delete")

    printTrace('Saving result to ' + str(path_output) + '...')

    num_words = 0
    with open(path_output, 'w+') as file:
        for i_word, word in enumerate(emb.words):
            if i_word % 5000 == 0:
                string = "<" + str(datetime.datetime.now()) + "> " + 'Converting : ' + str(
                    int(100 * i_word / len(emb.words))) + '%'
                print(string, end="\r")

            # Keep only words that carry one of the requested language tags
            # (or all words if no language filter was given).
            if args.language is None or any(l in word.split(args.delimiter) for l in args.language):
                print(word.split(args.delimiter)[-1] + ' ' +
                      ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]),
                      file=file)
                num_words += 1
    print()

    if args.word2vec:
        # Prepend the word2vec header (word count and dimensions) to the output file.
        excec_com = 'sed -i \'1s/^/' + str(num_words) + ' ' + str(emb.dims) + '\\n/\' ' + str(path_output)
        print(excec_com)
        os.system(excec_com)

    printTrace('Done.')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-o', '--output', required=True)
    args = parser.parse_args()

    vocab = vocab_from_path(args.embedding)

    with open(args.output, 'w+') as file:
        for word in vocab:
            print(word, file=file)

    print('Done.')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embeddings', nargs='+', required=True)
    parser.add_argument('-o', '--output', type=str, required=True)
    args = parser.parse_args()

    printTrace('Loading vocabulary from embeddings...')
    vocab_embeddings = [vocab_from_path(x) for x in args.embeddings]
    union_vocab = set.union(*vocab_embeddings)
    printTrace('The union of the vocabularies has ' + str(len(union_vocab)) + ' words.')

    printTrace('Printing vocabulary in ' + args.output + '...')
    with open(args.output, 'w+') as file:
        for word in union_vocab:
            print(word, file=file)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-l', '--search_words', required=True)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    parser.add_argument('-k', '--num_nearest_neighbor', type=int, default=10)
    args = parser.parse_args()

    emb = load_embedding(
        args.embedding,
        vocabulary=None,
        lower=False,
        length_normalize=True,
        normalize_dimensionwise=False,
        delete_duplicates=True)

    words_2_search = vocab_from_path(args.search_words)

    m = emb.words_to_matrix(words_2_search)
    M = emb.words_to_matrix(emb.words)

    nn = []
    for i_batch, mb in enumerate(batch(m, args.batch_size)):
        string = "<" + str(datetime.datetime.now()) + "> " + 'Calculating nn words ' + str(
            int(100 * (i_batch * args.batch_size) / len(m))) + '%'
        print(string, end="\r")

        result = cosine_knn(mb, M, args.num_nearest_neighbor)
        for i_result, indexes in enumerate(result):
            nn.append(["\"" + emb.words[i] + "\"" for i in indexes])

    with open(args.output, 'w+', encoding='utf-8') as file:
        for word, nns in zip(words_2_search, nn):
            print(word + ': ' + ' '.join(nns), file=file)
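# The helpers batch() and cosine_knn() used above are imported from elsewhere in this
# repository. The snippet below is only a minimal, illustrative sketch of the behaviour
# those calls assume (rows are length-normalized, so cosine similarity reduces to a dot
# product); it is not the repository's implementation.
import numpy as np


def batch(iterable, n=1):
    # Yield successive chunks of at most n rows.
    for i in range(0, len(iterable), n):
        yield iterable[i:i + n]


def cosine_knn(queries, matrix, k):
    # Rows are assumed unit-length, so the dot product equals cosine similarity.
    sims = np.dot(np.asarray(queries), np.asarray(matrix).T)
    # Indices of the k most similar rows of `matrix` for each query, best first.
    return np.argsort(-sims, axis=1)[:, :k]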
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')

    format = 'bin' if path_input.split('/')[-1].split('.')[-1] == 'bin' else 'text'

    emb = load_embedding(
        path_input,
        format=format,
        vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
        length_normalize=args.length_normalize,
        normalize_dimensionwise=args.normalize_dimensionwise,
        to_unicode=True,
        lower=args.lower,
        path2='',
        delete_duplicates=True,
        method_vgg="delete")

    printTrace('Saving result to ' + str(path_output) + '...')
    emb.export(path=path_output, printHeader=args.word2vec)
    printTrace('Done.')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--embedding", type=str, required=True)
    parser.add_argument("-c", "--emb_4_generation", type=str, required=True)
    parser.add_argument("-d", "--dataset", type=str, required=True)
    parser.add_argument("-b", "--batch_size", type=int, default=1024)
    parser.add_argument("-k", "--num_nearest_neighbor", type=int, default=10)
    args = parser.parse_args()

    dims = get_dimensions(args.embedding)
    if dims != get_dimensions(args.emb_4_generation):
        raise ValueError(
            "All the embeddings must have the same number of dimensions "
            "and the embeddings must be in the word2vec format")

    printTrace("Reading vocab...")

    vocab_emb = vocab_from_path(args.embedding)
    vocab_cross = vocab_from_path(args.emb_4_generation)

    dataset = get_dataset(args.dataset)
    vocab_to_generate = list(set(np.append(dataset.X[:, 0], dataset.X[:, 1])))
    vocab_to_generate_set = set(vocab_to_generate)
    # Remove the dataset words from the embedding vocabulary so they are always
    # generated, even if the embedding already contains them.
    vocab_emb_delete = [x for x in vocab_emb if x not in vocab_to_generate_set]

    total_vocab = set.union(set(vocab_emb_delete), set(vocab_cross))
    interset_vocab = list(set.intersection(set(vocab_emb_delete), set(vocab_cross)))

    print("Final embedding will have " + str(len(total_vocab)) + " words")
    print("We will generate " + str(len(vocab_to_generate)) + " words")

    emb = load_embedding(
        args.emb_4_generation,
        vocabulary=None,
        lower=False,
        length_normalize=True,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    m = emb.words_to_matrix(vocab_to_generate)
    M = emb.words_to_matrix(interset_vocab)

    nn = []
    for i_batch, mb in enumerate(batch(m, args.batch_size)):
        string = ("<" + str(datetime.datetime.now()) + "> " + "Using Embedding " +
                  str(args.emb_4_generation) + " to generate vocab for Embedding " +
                  str(args.embedding) + ": " +
                  str(int(100 * (i_batch * args.batch_size) / len(m))) + "%")
        print(string, end="\r")

        # print(np.asarray(mb).shape)
        # print(np.asarray(M).shape)
        result = cosine_knn(mb, M, args.num_nearest_neighbor)
        for i_result, indexes in enumerate(result):
            nn.append([interset_vocab[i] for i in indexes])

    del emb

    printTrace("===> Generating new_vocab <===")

    emb = load_embedding(
        args.embedding,
        vocabulary=vocab_emb_delete,
        lower=False,
        length_normalize=False,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    new_vectors = []
    for i_word, word in enumerate(vocab_to_generate):
        if i_word % 1000 == 0:
            string = ("<" + str(datetime.datetime.now()) + "> " + "Generating vocab " + ": " +
                      str(int(100 * i_word / len(vocab_to_generate))) + "%")
            print(string, end="\r")

        try:
            # Approximate each word as the average of its k nearest neighbours.
            lw = nn[i_word]
            v = np.zeros([dims], dtype=float)
            for word_nn in lw:
                v += emb.word_to_vector(word_nn)
        except KeyError:
            raise ValueError("Something went wrong in the word generation process")

        new_vectors.append(v / args.num_nearest_neighbor)
    print()

    del emb

    printTrace("===> Loading embeddings to compare <===")

    emb_generated = Embedding(vocabulary=Vocabulary(vocab_to_generate), vectors=new_vectors)

    emb_original = load_embedding(
        args.embedding,
        vocabulary=vocab_to_generate,
        lower=False,
        length_normalize=False,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    printTrace("===> Evaluate <===")

    print("Original Embedding: ", end="")
    print(similarity_emd(
        emb_original,
        dataset.X,
        dataset.y,
        backoff_vector=None,
        lower=False,
        lang1prefix=None,
        lang2prefix=None,
    ))

    print("Generated Embedding: ", end="")
    print(similarity_emd(
        emb_generated,
        dataset.X,
        dataset.y,
        backoff_vector=None,
        lower=False,
        lang1prefix=None,
        lang2prefix=None,
    ))
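# Illustrative invocation (the script filename and dataset identifier are assumed, not
# taken from the source). It compares similarity scores of the original embedding against
# one whose dataset words were re-generated from nearest neighbours in a second embedding:
#   python3 evaluate_oov_generation.py -i emb.vec -c cross_emb.vec -d ws353 -b 1024 -k 10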
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-c', '--cross_embedding', required=True)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    parser.add_argument('-k', '--num_nearest_neighbor', type=int, default=10)
    args = parser.parse_args()

    dims = get_dimensions(args.embedding)
    if dims != get_dimensions(args.cross_embedding):
        raise ValueError('All the embeddings must have the same number of dimensions '
                         'and the embeddings must be in the word2vec format')

    printTrace('Reading vocab...')

    vocab_emb = vocab_from_path(args.embedding)
    vocab_cross = vocab_from_path(args.cross_embedding)

    total_vocab = set.union(set(vocab_emb), set(vocab_cross))
    interset_vocab = list(set.intersection(set(vocab_emb), set(vocab_cross)))
    vocab_to_generate = set(vocab_cross) - set(vocab_emb)

    print('Final embedding will have ' + str(len(total_vocab)) + ' words')
    print('We will generate ' + str(len(vocab_to_generate)) + ' words')

    emb = load_embedding(
        args.cross_embedding,
        vocabulary=None,
        lower=False,
        length_normalize=True,
        normalize_dimensionwise=False,
        delete_duplicates=True)

    m = emb.words_to_matrix(vocab_to_generate)
    M = emb.words_to_matrix(interset_vocab)

    nn = []
    for i_batch, mb in enumerate(batch(m, args.batch_size)):
        string = "<" + str(datetime.datetime.now()) + "> " + 'Using Embedding ' + str(
            args.cross_embedding) + ' to generate vocab for Embedding ' + str(args.embedding) + ': ' + str(
            int(100 * (i_batch * args.batch_size) / len(m))) + '%'
        print(string, end="\r")

        # print(np.asarray(mb).shape)
        # print(np.asarray(M).shape)
        result = cosine_knn(mb, M, args.num_nearest_neighbor)
        for i_result, indexes in enumerate(result):
            nn.append([interset_vocab[i] for i in indexes])

    del emb

    printTrace('===> Generating new_vocab <===')

    emb = load_embedding(
        args.embedding,
        vocabulary=None,
        lower=False,
        length_normalize=False,
        normalize_dimensionwise=False,
        delete_duplicates=True)

    new_vectors = []
    for i_word, word in enumerate(vocab_to_generate):
        if i_word % 1000 == 0:
            string = "<" + str(datetime.datetime.now()) + "> " + 'Generating vocab ' + args.output + ': ' + str(
                int(100 * i_word / len(vocab_to_generate))) + '%'
            print(string, end="\r")

        try:
            # Approximate each missing word as the average of its k nearest neighbours.
            lw = nn[i_word]
            v = np.zeros([dims], dtype=float)
            for word_nn in lw:
                v += emb.word_to_vector(word_nn)
        except KeyError:
            raise ValueError('Something went wrong in the word generation process')

        new_vectors.append(v / args.num_nearest_neighbor)
    print()

    printTrace('===> Printing to file <===')

    with open(args.output, 'w') as file:
        # word2vec header: number of words and dimensions.
        print(str(len(emb.words) + len(vocab_to_generate)) + ' ' + str(dims), file=file)
        for w in emb.words:
            print(w + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(w)]), file=file)
        for w_i, w in enumerate(vocab_to_generate):
            print(w + ' ' + ' '.join(['%.6g' % x for x in new_vectors[w_i]]), file=file)
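# Illustrative invocation (the script filename and file paths are assumed, not taken
# from the source). It extends emb.vec with the words that only appear in cross_emb.vec,
# approximating each one as the average of its k nearest neighbours in the shared vocabulary:
#   python3 generate_oov_words.py -i emb.vec -c cross_emb.vec -o emb_extended.vec -k 10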
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)
    # parser.add_argument('-n', '--name_of_embedding', default=None)
    parser.add_argument('-l', '--lowercase_dataset', action='store_true')
    parser.add_argument('-lg', '--language', nargs='+', default=['en'])
    parser.add_argument('-p', '--add_lang_prefix', action='store_true')
    parser.add_argument('-v', '--vocab', type=str, default=None)
    args = parser.parse_args()

    emb_list = []
    if args.embedding is not None:
        emb_list.append(args.embedding)
    else:
        emb_list = [
            os.path.join(args.directory, f) for f in os.listdir(args.directory)
            if os.path.isfile(os.path.join(args.directory, f))
        ]

    for emb_i, emb_path in enumerate(emb_list):
        printTrace('Evaluating Embedding ' + str(emb_i + 1) + ' of ' + str(len(emb_list)) + ' : ' + str(emb_path))

        emb = load_embedding(
            emb_path,
            vocabulary=(None if args.vocab is None else vocab_from_path(args.vocab)),
            lower=False,
            length_normalize=False,
            delete_duplicates=True)

        for lang in args.language:
            lang1prefix = None
            lang2prefix = None
            if args.add_lang_prefix:
                if lang == 'en':
                    lang1prefix = 'en'
                    lang2prefix = 'en'
                elif lang == 'es':
                    lang1prefix = 'es'
                    lang2prefix = 'es'
                elif lang == 'enes':
                    lang1prefix = 'en'
                    lang2prefix = 'es'
                else:
                    logging.warning('Language not supported, could not add prefix')

            if not os.path.exists('Results_' + lang):
                os.makedirs('Results_' + lang)

            print('>>> Results deleting oov <<< ')
            a, b = results_to_csv(
                evaluate_on_all(
                    emb,
                    backoff_vector=None,
                    lowercase_dataset=args.lowercase_dataset,
                    lang=lang,
                    lang1prefix=lang1prefix,
                    lang2prefix=lang2prefix),
                printRes=False,
                returnRes=True)
            export_to_csv(
                txtResults=a,
                txtCov=b,
                name=emb_path,
                filenameResults='Results_' + lang + '/Sim_Results_delete.csv',
                filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

            print('>>> Result using mean of all word vectors as OOV <<<')
            a, b = results_to_csv(
                evaluate_on_all(
                    emb,
                    backoff_vector=np.mean(emb.vectors, axis=0),
                    lowercase_dataset=args.lowercase_dataset,
                    lang=lang,
                    lang1prefix=lang1prefix,
                    lang2prefix=lang2prefix),
                printRes=False,
                returnRes=True)
            export_to_csv(
                txtResults=a,
                txtCov=b,
                name=emb_path,
                filenameResults='Results_' + lang + '/Sim_Results_mean.csv',
                filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

    print('Results have been exported in csv format to the Results folder')
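# Illustrative invocation (the script filename and directory path are assumed, not taken
# from the source). It evaluates every embedding file found in a directory on the English
# word-similarity datasets and writes the results to Results_en/:
#   python3 evaluate_similarity.py -d ../Embeddings/ -lg en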
import sys

sys.path.insert(0, '../')
from embedding import load_embedding
from utils import vocab_from_path

Joint_path = '../../Embeddings/'

print("====ENGLISH-SPANISH===")

words_eng = []
words_eng.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENES.emb'))
words_eng.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENIT.emb'))
english_words = list(set.intersection(*words_eng))

words_es = []
words_es.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENES.emb'))
words_es.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ESIT.emb'))
spanish_words = list(set.intersection(*words_es))

emb = load_embedding(Joint_path + 'JOINTC-HYB-ENES.emb', length_normalize=False, delete_duplicates=True)

with open('../../Embeddings/separated/JointENES.vec', 'w') as file:
    print(str(len(spanish_words) + len(english_words)) + ' 300', file=file)
    for word in english_words:
        print('en/' + word + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]),
              file=file)
def concatenate_embeddings_generate(embeddings_path, out_path, vocab=None, batch_size=1024, k=10):

    printTrace("Reading vocab...")

    # [[vocab_emb1], [vocab_emb_2], ...]
    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    word_id = set()
    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        [print("Word " + str(w) + " not found in any embedding") for w in word_id - union]
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")
    for i_voc, voc in enumerate(vocab_embeddings):
        print("Embedding " + str(i_voc) + " has " + str(len(voc)) + " words.")
        print("We will generate " + str(len(set(word_id) - voc)) + " words for the embedding " + str(i_voc))
    print()

    printTrace("Building matrix for word generation...")

    # generation_vocab_matrix[y][x]: words missing from embedding x that will be
    # generated using embedding y (each missing word is assigned to a single source).
    generation_vocab_matrix = [[x for x in range(len(embeddings_path))] for x in range(len(embeddings_path))]
    nn_vocab = [defaultdict() for x in range(len(embeddings_path))]

    for x, emb1 in enumerate(vocab_embeddings):
        vocab_to_generate = set(word_id) - emb1
        for y, emb2 in enumerate(vocab_embeddings):
            generation_vocab_matrix[y][x] = list(vocab_to_generate.intersection(emb2))
            vocab_to_generate = vocab_to_generate - emb2

    printTrace("===> Calculating nearest neighbors <===")

    for i_emb_path, emb_path in enumerate(embeddings_path):
        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        for i_g, g in enumerate(generation_vocab_matrix[i_emb_path]):
            if len(g) > 0:
                # print('G: ' + str(g))
                m = emb.words_to_matrix(g)  # generation_vocab_matrix[i_emb_path][i_g]
                # print(len(m))
                # print(generation_vocab_matrix[x][gi])
                interset_vocab = list(set.intersection(vocab_embeddings[i_emb_path], vocab_embeddings[i_g]))
                M = emb.words_to_matrix(interset_vocab)

                total_words = len(m)
                for i_batch, mb in enumerate(batch(m, batch_size)):
                    string = ("<" + str(datetime.datetime.now()) + "> " + "Using Embedding " + str(i_emb_path) +
                              " to generate vocab for Embedding " + str(i_g) + ": " +
                              str(int(100 * (i_batch * batch_size) / total_words)) + "%")
                    print(string, end="\r")

                    result = cosine_knn(mb, M, k)
                    for i_result, indexes in enumerate(result):
                        nn_vocab[i_g][g[i_result + (batch_size * i_batch)]] = [interset_vocab[i] for i in indexes]
                print()

    printTrace("===> Calculating meta embedding <===")

    total_words = len(word_id)
    first_emb = True

    if not os.path.exists("tmp"):
        os.makedirs("tmp")

    total_dims = 0

    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )
        total_dims += emb.dims

        string = "<" + str(datetime.datetime.now()) + "> " + "Embedding " + str(x)
        print(string, end="\r")

        for wi, w in enumerate(word_id):
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError:
                try:
                    # OOV word: average the vectors of its k nearest neighbours.
                    lw = nn_vocab[x][w]
                    v = np.zeros([emb.dims], dtype=float)
                    for word in lw:
                        v += emb.word_to_vector(word)
                except KeyError:
                    raise ValueError("Something went wrong in the word generation process")
                m = normalize_vector(v / k)

            matrix.append(m)

            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " + "Calculating meta embedding for embedding " +
                          str(x) + ": " + str(int(100 * wi / total_words)) + "%")
                print(string, end="\r")
        print()

        with open("tmp/" + str(x), "w") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]), file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]), file=file)

                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> " + "Saving embedding " + str(x) +
                              " to file : " + str(int(100 * wi / total_words)) + "%")
                    print(string, end="\r")
        print()

        first_emb = False

    printTrace("Concatenation...")

    # Concatenate the per-embedding files column-wise and prepend the word2vec header.
    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) + "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        os.system("rm -rf tmp")
    except:
        print("Could not delete the tmp folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))
def concatenate_embeddings(embeddings_path, out_path, vocab):

    printTrace("===> Calculating meta embedding (No OOV) <===")

    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        [print("Word " + str(w) + " not found in any embedding") for w in word_id - union]
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")

    first_emb = True

    if not os.path.exists("tmp_conc"):
        os.makedirs("tmp_conc")

    total_dims = 0

    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )
        total_dims += emb.dims

        string = "<" + str(datetime.datetime.now()) + "> " + "Embedding " + str(x)
        print(string, end="\r")

        for wi, w in enumerate(word_id):
            # OOV words keep a zero vector in this embedding's block.
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError:
                pass

            matrix.append(m)

            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " + "Calculating meta embedding for embedding " +
                          str(x) + ": " + str(int(100 * wi / len(word_id))) + "%")
                print(string, end="\r")
        print()

        with open("tmp_conc/" + str(x), "w+", encoding="utf-8") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]), file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]), file=file)

                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> " + "Saving embedding " + str(x) +
                              " to file : " + str(int(100 * wi / len(word_id))) + "%")
                    print(string, end="\r")
        print()

        first_emb = False

    printTrace("Concatenation...")

    # Concatenate the per-embedding files column-wise and prepend the word2vec header.
    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp_conc/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) + "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        shutil.rmtree("tmp_conc")
    except OSError:
        print("Could not delete the tmp folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))
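# Both concatenation helpers can also be called directly from Python instead of through
# the CLI above; an illustrative call (the file paths are hypothetical):
#
#   concatenate_embeddings(
#       embeddings_path=["emb1.vec", "emb2.vec"],
#       out_path="meta_no_oov.vec",
#       vocab=None,
#   )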