def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)
    parser.add_argument('-k', '--num_clusters', type=int, required=True)
    parser.add_argument('-m', '--metric', type=str, default='cosine')
    parser.add_argument('-o', '--output', type=str)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    args = parser.parse_args()

    if args.embedding is not None:
        output = (''.join(args.embedding.split('/')[-1].split('.')[:-1])
                  + '_' + str(args.metric) + '_' + str(args.num_clusters) + '.clusters'
                  if args.output is None else args.output)
        kmeans4embedding(args.embedding, output, args.num_clusters, args.metric, args.batch_size)
    else:
        files = [os.path.join(args.directory, f) for f in os.listdir(args.directory)
                 if os.path.isfile(os.path.join(args.directory, f))]
        for i_file, file in enumerate(files):
            printTrace('==> Doing clustering for embedding ' + str(i_file) + ' of '
                       + str(len(files)) + ' : ' + str(file))
            if args.output is None and not os.path.exists('Clustering'):
                os.makedirs('Clustering')
            output = (('Clustering/' if args.output is None else args.output)
                      + ''.join(file.split('/')[-1].split('.')[:-1])
                      + '_' + str(args.metric) + '_' + str(args.num_clusters) + '.clusters')
            kmeans4embedding(file, output, args.num_clusters, args.metric, args.batch_size)
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-v', '--vocab', default=None)
    parser.add_argument('-nl', '--length_normalize', action='store_true')
    parser.add_argument('-nd', '--normalize_dimensionwise', action='store_true')
    parser.add_argument('-l', '--lower', action='store_true')
    outputtype = parser.add_mutually_exclusive_group(required=True)
    outputtype.add_argument('-w2v', '--word2vec', action='store_true')
    outputtype.add_argument('-glv', '--glove', action='store_true')
    args = parser.parse_args()

    if args.embedding:
        emb_converter(args.embedding, args.output, args)
    else:
        files = [os.path.join(args.directory, f) for f in os.listdir(args.directory)
                 if os.path.isfile(os.path.join(args.directory, f))]
        for i_file, file in enumerate(files):
            printTrace('Converting Embedding ' + str(i_file) + ' of '
                       + str(len(files)) + ' : ' + str(file))
            emb_converter(file, args.output + '/' + file.split('/')[-1], args)
def kmeans4embedding(embedding_path, output_path, k, metric, batch_size):
    printTrace('Loading embedding ' + str(embedding_path))
    emb = load_embedding(embedding_path, lower=False, length_normalize=False, delete_duplicates=True)

    printTrace('Clustering for embedding ' + str(embedding_path))
    labels = doKmeans(emb.vectors, k, metric, batch_size)

    printTrace('Printing clusters for embedding ' + str(embedding_path))
    with open(output_path, 'w') as file:
        for i_label, label in enumerate(labels):
            print(emb.vocabulary.index_to_word(i_label) + ' ' + str(label), file=file)

    printTrace('Sorting clusters for embedding ' + str(embedding_path))
    excec_com = 'sort -k2 -n ' + str(output_path) + ' > ' + str(output_path) + '_sorted'
    print(excec_com)
    os.system(excec_com)
    excec_com = 'rm ' + str(output_path)
    print(excec_com)
    os.system(excec_com)
    excec_com = 'mv ' + str(output_path) + '_sorted ' + str(output_path)
    print(excec_com)
    os.system(excec_com)

    printTrace('Done, clusters saved in ' + str(output_path))
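# --- Hypothetical helper (not part of the original sources) -----------------
# kmeans4embedding above calls a doKmeans(vectors, k, metric, batch_size)
# helper that is not shown in this section. A minimal sketch of such a helper,
# assuming scikit-learn is available: MiniBatchKMeans only supports Euclidean
# distance, so 'cosine' is approximated by length-normalizing the rows first.
def doKmeans_sketch(vectors, k, metric='cosine', batch_size=1024):
    import numpy as np
    from sklearn.cluster import MiniBatchKMeans

    X = np.asarray(vectors, dtype=np.float32)
    if metric == 'cosine':
        # On unit-length rows, Euclidean k-means approximates cosine k-means.
        norms = np.linalg.norm(X, axis=1, keepdims=True)
        norms[norms == 0] = 1.0
        X = X / norms
    km = MiniBatchKMeans(n_clusters=k, batch_size=batch_size, random_state=0)
    return km.fit_predict(X)  # one integer cluster label per input row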
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--directory', type=str)
    parser.add_argument('-o', '--output_directory', type=str)
    parser.add_argument('-m', '--method', choices=['PCA', 'tSVD', 'DRA'], default='DRA')
    parser.add_argument('-n', '--reduce_to', type=int, default=300)
    parser.add_argument('-b', '--do_in_batches', action='store_true')
    args = parser.parse_args()

    files = [os.path.join(args.directory, f) for f in os.listdir(args.directory)
             if os.path.isfile(os.path.join(args.directory, f))]

    for i_file, file in enumerate(files):
        printTrace('Dimensionality reduction: Embedding ' + str(i_file) + ' of '
                   + str(len(files)) + ' : ' + str(file))
        output_path = os.path.join(args.output_directory,
                                   file.split('/')[-1] + '_' + str(args.method) + '.vec')
        excec_com = ('python3 dimensionality_reduction.py -i ' + str(file)
                     + ' -m ' + str(args.method)
                     + ' -o ' + output_path
                     + ' -n ' + str(args.reduce_to)
                     + (' -b ' if args.do_in_batches else ''))
        print(excec_com)
        os.system(excec_com)
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)
    parser.add_argument('-b', '--batch_size', type=int, default=512)
    parser.add_argument('-dic', '--dictionary_path', type=str, default='DictionaryInductionDataset/es-en.test')
    parser.add_argument('-p', '--add_lang_prefix', action='store_true')
    args = parser.parse_args()

    emb_list = []
    if args.embedding is not None:
        emb_list.append(args.embedding)
    else:
        emb_list = [os.path.join(args.directory, f) for f in os.listdir(args.directory)
                    if os.path.isfile(os.path.join(args.directory, f))]

    if not os.path.exists('Results'):
        os.makedirs('Results')

    for emb_i, emb_path in enumerate(emb_list):
        printTrace('Evaluating Embedding ' + str(emb_i + 1) + ' of ' + str(len(emb_list))
                   + ' : ' + str(emb_path))
        emb = load_embedding(emb_path, lower=False, length_normalize=True, delete_duplicates=True)
        top1, top2, top3, top5, top10, coverage = evaluate_dictionary_induction(
            emb, args.dictionary_path, args.batch_size, emb_path, args.add_lang_prefix)

        with open('Results/dictionary_induction', 'a+') as file:
            print(','.join([str(emb_path), str(top1), str(top2), str(top3),
                            str(top5), str(top10), str(coverage)]), file=file)

    print('Results have been exported in csv format to the Results folder')
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')
    format = 'bin' if path_input.split('/')[-1].split('.')[-1] == 'bin' else 'text'
    emb = load_embedding(path_input,
                         format=format,
                         vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
                         length_normalize=args.length_normalize,
                         normalize_dimensionwise=args.normalize_dimensionwise,
                         to_unicode=True,
                         lower=args.lower,
                         path2='',
                         delete_duplicates=True,
                         method_vgg="delete")

    printTrace('Saving result to ' + str(path_output) + '...')
    num_words = 0
    with open(path_output, 'w+') as file:
        for i_word, word in enumerate(emb.words):
            if i_word % 5000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " + 'Converting : '
                          + str(int(100 * i_word / len(emb.words))) + '%')
                print(string, end="\r")
            if args.language is None or any(l in word.split(args.delimiter) for l in args.language):
                print(word.split(args.delimiter)[-1] + ' '
                      + ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]), file=file)
                num_words += 1
    print()

    if args.word2vec:
        # Prepend the word2vec header line "<num_words> <dims>" to the output file.
        excec_com = 'sed -i \'1s/^/' + str(num_words) + ' ' + str(emb.dims) + '\\n/\' ' + str(path_output)
        print(excec_com)
        os.system(excec_com)

    printTrace('Done.')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embeddings', nargs='+', required=True)
    parser.add_argument('-o', '--output', type=str, required=True)
    args = parser.parse_args()

    printTrace('Loading vocabulary from embeddings...')
    vocab_embeddings = [vocab_from_path(x) for x in args.embeddings]
    union_vocab = set.union(*vocab_embeddings)
    printTrace('The union of the vocabulary has ' + str(len(union_vocab)) + ' words.')

    printTrace('Printing vocabulary in ' + args.output + '...')
    with open(args.output, 'w+') as file:
        for word in union_vocab:
            print(word, file=file)
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')
    format = 'bin' if path_input.split('/')[-1].split('.')[-1] == 'bin' else 'text'
    emb = load_embedding(path_input,
                         format=format,
                         vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
                         length_normalize=args.length_normalize,
                         normalize_dimensionwise=args.normalize_dimensionwise,
                         to_unicode=True,
                         lower=args.lower,
                         path2='',
                         delete_duplicates=True,
                         method_vgg="delete")

    printTrace('Saving result to ' + str(path_output) + '...')
    emb.export(path=path_output, printHeader=args.word2vec)
    printTrace('Done.')
def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--embedding", type=str, required=True) parser.add_argument("-c", "--emb_4_generation", type=str, required=True) parser.add_argument("-d", "--dataset", type=str, required=True) parser.add_argument("-b", "--batch_size", type=int, default=1024) parser.add_argument("-k", "--num_nearest_neighbor", type=int, default=10) args = parser.parse_args() dims = get_dimensions(args.embedding) if dims != get_dimensions(args.emb_4_generation): raise ValueError( "All the embeddings must have the same number of dimensions and the embeddings must be in the word2vec format" ) printTrace("Reading vocab...") vocab_emb = vocab_from_path(args.embedding) vocab_cross = vocab_from_path(args.emb_4_generation) dataset = get_dataset(args.dataset) vocab_to_generate = list( set(np.append((dataset.X[:, 0]), (dataset.X[:, 1])))) vocab_to_generate_set = set(vocab_to_generate) vocab_emb_delete = [x for x in vocab_emb if x not in vocab_to_generate_set] total_vocab = set.union(set(vocab_emb_delete), set(vocab_cross)) interset_vocab = list( set.intersection(set(vocab_emb_delete), set(vocab_cross))) print("Final embedding will have " + str(len(total_vocab)) + " words") print("We will generate " + str(len(vocab_to_generate)) + " words") emb = load_embedding( args.emb_4_generation, vocabulary=None, lower=False, length_normalize=True, normalize_dimensionwise=False, delete_duplicates=True, ) m = emb.words_to_matrix(vocab_to_generate) M = emb.words_to_matrix(interset_vocab) nn = [] for i_batch, mb in enumerate(batch(m, args.batch_size)): string = ("<" + str(datetime.datetime.now()) + "> " + "Using Embedding " + str(args.emb_4_generation) + " to generate vocab for Embedding " + str(args.embedding) + ": " + str(int(100 * (i_batch * args.batch_size) / len(m))) + "%") print(string, end="\r") # print(np.asarray(mb).shape) # print(np.asarray(M).shape) result = cosine_knn(mb, M, args.num_nearest_neighbor) for i_result, indexes in enumerate(result): nn.append([interset_vocab[i] for i in indexes]) del emb printTrace("===> Generating new_vocab <===") emb = load_embedding( args.embedding, vocabulary=vocab_emb_delete, lower=False, length_normalize=False, normalize_dimensionwise=False, delete_duplicates=True, ) new_vectors = [] for i_word, word in enumerate(vocab_to_generate): if i_word % 1000 == 0: string = ("<" + str(datetime.datetime.now()) + "> " + "Generating vocab " + ": " + str(int(100 * i_word / len(vocab_to_generate))) + "%") print(string, end="\r") try: lw = nn[i_word] v = np.zeros([dims], dtype=float) for word_nn in lw: v += emb.word_to_vector(word_nn) except KeyError as r: raise ValueError( "Something went wrong in the word generation process") new_vectors.append(v / args.num_nearest_neighbor) print() del emb printTrace("===> Loading embeddings to compare <===") emb_generated = Embedding(vocabulary=Vocabulary(vocab_to_generate), vectors=new_vectors) emb_original = load_embedding( args.embedding, vocabulary=vocab_to_generate, lower=False, length_normalize=False, normalize_dimensionwise=False, delete_duplicates=True, ) printTrace("===> Evaluate <===") print("Original Embedding: ", end="") print( similarity_emd( emb_original, dataset.X, dataset.y, backoff_vector=None, lower=False, lang1prefix=None, lang2prefix=None, )) print("Generated Embedding: ", end="") print( similarity_emd( emb_generated, dataset.X, dataset.y, backoff_vector=None, lower=False, lang1prefix=None, lang2prefix=None, ))
def main(): parser = argparse.ArgumentParser() parser.add_argument("-i", "--embeddings", nargs="+", required=True) parser.add_argument("-t", "--rotate_to", required=True) parser.add_argument("-o", "--output", required=True) parser.add_argument("-v", "--vocabulary", default=None) parser.add_argument("-b", "--batch_size", type=int, default=256) parser.add_argument("-k", "--num_nearest_neighbor", type=int, default=10) parser.add_argument("-r", "--retrofitting", default=None) parser.add_argument("-rn", "--retrofitting_n_iters", type=int, default=10) # parser.add_argument('-n', '--do_not_normalize_embs', default=False) parser.add_argument("-ir", "--do_not_retrofit_rotate_to", default=False) parser.add_argument("-nc", "--do_not_clean_files", default=False) parser.add_argument("-oov", "--generate_oov_words", action="store_false") args = parser.parse_args() is_rot_in_input = None for emb_i, emb in enumerate(args.embeddings): if emb == args.rotate_to: is_rot_in_input = emb_i if not os.path.exists("tmp"): os.makedirs("tmp") print( "tmp folder created, it will be deleted at the end of the execution (unless you have run the program with the -nc True option)" ) if args.retrofitting is not None: printTrace("==> Retrofitting <==") for emb_i, emb in enumerate(args.embeddings): string = (str(emb_i + 1) + " of " + str( len(args.embeddings) if is_rot_in_input is not None or args. do_not_retrofit_rotate_to else str(len(args.embeddings) + 1))) print(string) excec_com = ("python3 Retrofitting/retrofit.py -i " + str(emb) + " -l " + str(args.retrofitting) + " -n " + str(args.retrofitting_n_iters) + " -o " + "tmp/" + str(emb_i) + ".retro -d " + str(get_dimensions(emb))) print(excec_com) os.system(excec_com) if is_rot_in_input is not None and not args.do_not_retrofit_rotate_to: string = (str(len(args.embeddings + 1)) + " of " + str(len(args.embeddings)) if is_rot_in_input is not None or args.do_not_retrofit_rotate_to else str(len(args.embeddings) + 1)) print(string) excec_com = ("python3 Retrofitting/retrofit.py -i " + str(args.rotate_to) + " -l " + str(args.retrofitting) + " -n " + str(args.retrofitting_n_iters) + " -o " + "tmp/" + "out.retro -d " + str(get_dimensions(emb))) print(excec_com) os.system(excec_com) print() printTrace("==> Generating dictionaries for the mapping <==") for emb_i, emb in enumerate(args.embeddings): string = str(emb_i + 1) + " of " + str(len(args.embeddings)) print(string) print_dictionary_for_vecmap( "tmp/" + str(emb_i) + ".dict", generate_dictionary_for_vecmap(path1=emb, path2=args.rotate_to), ) print() printTrace("==> Normalizing Embeddings <==") for emb_i, emb in enumerate(args.embeddings): string = (str(emb_i + 1) + " of " + str( len(args.embeddings) if is_rot_in_input is not None else str( len(args.embeddings) + 1))) print(string) excec_com = ("python3 VecMap/normalize_embeddings.py unit center -i " + (emb if args.retrofitting is None else "tmp/" + str(emb_i) + ".retro") + " -o tmp/" + str(emb_i) + ".norm") print(excec_com) os.system(excec_com) if is_rot_in_input is None: string = str(len(args.embeddings) + 1) + " of " + str(len(args.embeddings) + 1) print(string) excec_com = ("python3 VecMap/normalize_embeddings.py unit center -i " + (args.rotate_to if args.retrofitting is None or args.do_not_retrofit_rotate_to else "tmp/out.retro") + " -o tmp/out.norm") print(excec_com) os.system(excec_com) print() printTrace("==> Mapping Embeddings <==") for emb_i, emb in enumerate(args.embeddings): if is_rot_in_input is None or (is_rot_in_input is not None and is_rot_in_input != emb_i): string 
= (str(emb_i + 1) + " of " + str(len(args.embeddings) - 1) if is_rot_in_input is not None else str(len(args.embeddings) + 1)) print(string) source_input = "tmp/" + str(emb_i) + ".norm" target_input = ("tmp/out.norm" if is_rot_in_input is None else "tmp/" + str(is_rot_in_input) + ".norm") source_output = "tmp/" + str(emb_i) + ".vecmap" target_output = "tmp/out.vecmap" dictionary = "tmp/" + str(emb_i) + ".dict" excec_com = ("python3 VecMap/map_embeddings.py --orthogonal " + source_input + " " + target_input + " " + source_output + " " + target_output + " -d " + dictionary) print(excec_com) os.system(excec_com) print() printTrace("==> Generating Meta Embedding <==") embs = "" for emb_i, emb in enumerate(args.embeddings): if is_rot_in_input is None or (is_rot_in_input is not None and is_rot_in_input != emb_i): embs = embs + "tmp/" + str(emb_i) + ".vecmap " if is_rot_in_input is not None: embs = embs + "tmp/out.vecmap " excec_com = ("python3 embeddings_mean.py -i " + embs + "-o " + args.output + " -b " + str(args.batch_size) + " -k " + str(args.num_nearest_neighbor)) if not args.generate_oov_words: excec_com = excec_com + " -oov" if args.vocabulary is not None: excec_com = excec_com + " -v " + args.vocabulary print(excec_com) os.system(excec_com) print() print("Done! Meta embedding generated in " + args.output) if not args.do_not_clean_files: print("Cleaning files...") try: os.system("rm -rf tmp") except: print("Could not delete the tmp folder, do it manually")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-c', '--cross_embedding', required=True)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    parser.add_argument('-k', '--num_nearest_neighbor', type=int, default=10)
    args = parser.parse_args()

    dims = get_dimensions(args.embedding)
    if dims != get_dimensions(args.cross_embedding):
        raise ValueError('All the embeddings must have the same number of dimensions '
                         'and the embeddings must be in the word2vec format')

    printTrace('Reading vocab...')
    vocab_emb = vocab_from_path(args.embedding)
    vocab_cross = vocab_from_path(args.cross_embedding)

    total_vocab = set.union(set(vocab_emb), set(vocab_cross))
    interset_vocab = list(set.intersection(set(vocab_emb), set(vocab_cross)))
    vocab_to_generate = set(vocab_cross) - set(vocab_emb)

    print('Final embedding will have ' + str(len(total_vocab)) + ' words')
    print('We will generate ' + str(len(vocab_to_generate)) + ' words')

    emb = load_embedding(args.cross_embedding,
                         vocabulary=None,
                         lower=False,
                         length_normalize=True,
                         normalize_dimensionwise=False,
                         delete_duplicates=True)

    m = emb.words_to_matrix(vocab_to_generate)
    M = emb.words_to_matrix(interset_vocab)

    nn = []
    for i_batch, mb in enumerate(batch(m, args.batch_size)):
        string = ("<" + str(datetime.datetime.now()) + "> "
                  + 'Using Embedding ' + str(args.cross_embedding)
                  + ' to generate vocab for Embedding ' + str(args.embedding) + ': '
                  + str(int(100 * (i_batch * args.batch_size) / len(m))) + '%')
        print(string, end="\r")

        result = cosine_knn(mb, M, args.num_nearest_neighbor)
        for i_result, indexes in enumerate(result):
            nn.append([interset_vocab[i] for i in indexes])

    del emb

    printTrace('===> Generating new_vocab <===')
    emb = load_embedding(args.embedding,
                         vocabulary=None,
                         lower=False,
                         length_normalize=False,
                         normalize_dimensionwise=False,
                         delete_duplicates=True)

    new_vectors = []
    for i_word, word in enumerate(vocab_to_generate):
        if i_word % 1000 == 0:
            string = ("<" + str(datetime.datetime.now()) + "> "
                      + 'Generating vocab ' + args.output + ': '
                      + str(int(100 * i_word / len(vocab_to_generate))) + '%')
            print(string, end="\r")
        try:
            lw = nn[i_word]
            v = np.zeros([dims], dtype=float)
            for word_nn in lw:
                v += emb.word_to_vector(word_nn)
        except KeyError:
            raise ValueError('Something went wrong in the word generation process')
        new_vectors.append(v / args.num_nearest_neighbor)
    print()

    printTrace('===> Printing to file <===')
    with open(args.output, 'w') as file:
        print(str(len(emb.words) + len(vocab_to_generate)) + ' ' + str(dims), file=file)
        for w in emb.words:
            print(w + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(w)]), file=file)
        for w_i, w in enumerate(vocab_to_generate):
            print(w + ' ' + ' '.join(['%.6g' % x for x in new_vectors[w_i]]), file=file)
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)
    # parser.add_argument('-n', '--name_of_embedding', default=None)
    parser.add_argument('-l', '--lowercase_dataset', action='store_true')
    parser.add_argument('-lg', '--language', nargs='+', default=['en'])
    parser.add_argument('-p', '--add_lang_prefix', action='store_true')
    parser.add_argument('-v', '--vocab', type=str, default=None)
    args = parser.parse_args()

    emb_list = []
    if args.embedding is not None:
        emb_list.append(args.embedding)
    else:
        emb_list = [os.path.join(args.directory, f) for f in os.listdir(args.directory)
                    if os.path.isfile(os.path.join(args.directory, f))]

    for emb_i, emb_path in enumerate(emb_list):
        printTrace('Evaluating Embedding ' + str(emb_i + 1) + ' of ' + str(len(emb_list))
                   + ' : ' + str(emb_path))
        emb = load_embedding(emb_path,
                             vocabulary=(None if args.vocab is None else vocab_from_path(args.vocab)),
                             lower=False,
                             length_normalize=False,
                             delete_duplicates=True)

        for lang in args.language:
            lang1prefix = None
            lang2prefix = None
            if args.add_lang_prefix:
                if lang == 'en':
                    lang1prefix = 'en'
                    lang2prefix = 'en'
                elif lang == 'es':
                    lang1prefix = 'es'
                    lang2prefix = 'es'
                elif lang == 'enes':
                    lang1prefix = 'en'
                    lang2prefix = 'es'
                else:
                    logging.warning('Language not supported, could not add prefix')

            if not os.path.exists('Results_' + lang):
                os.makedirs('Results_' + lang)

            print('>>> Results deleting oov <<< ')
            a, b = results_to_csv(evaluate_on_all(emb,
                                                  backoff_vector=None,
                                                  lowercase_dataset=args.lowercase_dataset,
                                                  lang=lang,
                                                  lang1prefix=lang1prefix,
                                                  lang2prefix=lang2prefix),
                                  printRes=False, returnRes=True)
            export_to_csv(txtResults=a,
                          txtCov=b,
                          name=emb_path,
                          filenameResults='Results_' + lang + '/Sim_Results_delete.csv',
                          filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

            print('>>> Result using mean of all word vectors as OOV <<<')
            a, b = results_to_csv(evaluate_on_all(emb,
                                                  backoff_vector=np.mean(emb.vectors, axis=0),
                                                  lowercase_dataset=args.lowercase_dataset,
                                                  lang=lang,
                                                  lang1prefix=lang1prefix,
                                                  lang2prefix=lang2prefix),
                                  printRes=False, returnRes=True)
            export_to_csv(txtResults=a,
                          txtCov=b,
                          name=emb_path,
                          filenameResults='Results_' + lang + '/Sim_Results_mean.csv',
                          filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

    print('Results have been exported in csv format to the Results folder')
def concatenate_embeddings_generate(embeddings_path, out_path, vocab=None, batch_size=1024, k=10):
    printTrace("Reading vocab...")

    # [[vocab_emb1], [vocab_emb_2], ...]
    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        for w in word_id - union:
            print("Word " + str(w) + " not found in any embedding")
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")
    for i_voc, voc in enumerate(vocab_embeddings):
        print("Embedding " + str(i_voc) + " has " + str(len(voc)) + " words.")
        print("We will generate " + str(len(set(word_id) - voc)) + " words for the embedding " + str(i_voc))
    print()

    printTrace("Building matrix for word generation...")
    generation_vocab_matrix = [[x for x in range(len(embeddings_path))] for x in range(len(embeddings_path))]
    nn_vocab = [defaultdict() for x in range(len(embeddings_path))]

    for x, emb1 in enumerate(vocab_embeddings):
        vocab_to_generate = set(word_id) - emb1
        for y, emb2 in enumerate(vocab_embeddings):
            generation_vocab_matrix[y][x] = list(vocab_to_generate.intersection(emb2))
            vocab_to_generate = vocab_to_generate - emb2

    printTrace("===> Calculating nearest neighbors <===")
    for i_emb_path, emb_path in enumerate(embeddings_path):
        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        for i_g, g in enumerate(generation_vocab_matrix[i_emb_path]):
            if len(g) > 0:
                m = emb.words_to_matrix(g)
                interset_vocab = list(set.intersection(vocab_embeddings[i_emb_path], vocab_embeddings[i_g]))
                M = emb.words_to_matrix(interset_vocab)

                total_words = len(m)
                for i_batch, mb in enumerate(batch(m, batch_size)):
                    string = ("<" + str(datetime.datetime.now()) + "> "
                              + "Using Embedding " + str(i_emb_path)
                              + " to generate vocab for Embedding " + str(i_g) + ": "
                              + str(int(100 * (i_batch * batch_size) / total_words)) + "%")
                    print(string, end="\r")

                    result = cosine_knn(mb, M, k)
                    for i_result, indexes in enumerate(result):
                        nn_vocab[i_g][g[i_result + (batch_size * i_batch)]] = [interset_vocab[i] for i in indexes]
        print()

    printTrace("===> Calculating meta embedding <===")
    total_words = len(word_id)
    first_emb = True

    if not os.path.exists("tmp"):
        os.makedirs("tmp")

    total_dims = 0
    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )
        total_dims += emb.dims

        string = "<" + str(datetime.datetime.now()) + "> " + "Embedding " + str(x)
        print(string, end="\r")

        for wi, w in enumerate(word_id):
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError:
                try:
                    lw = nn_vocab[x][w]
                    v = np.zeros([emb.dims], dtype=float)
                    for word in lw:
                        v += emb.word_to_vector(word)
                except KeyError:
                    raise ValueError("Something went wrong in the word generation process")
                m = normalize_vector(v / k)
            matrix.append(m)

            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> "
                          + "Calculating meta embedding for embedding " + str(x) + ": "
                          + str(int(100 * wi / total_words)) + "%")
                print(string, end="\r")
        print()

        with open("tmp/" + str(x), "w") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]), file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]), file=file)
                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> "
                              + "Saving embedding " + str(x) + " to file : "
                              + str(int(100 * wi / total_words)) + "%")
                    print(string, end="\r")
            print()
        first_emb = False

    printTrace("Concatenation...")
    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) + "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        os.system("rm -rf tmp")
    except:
        print("Could not delete the tmp folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))
def concatenate_embeddings(embeddings_path, out_path, vocab):
    printTrace("===> Calculating meta embedding (No OOV) <===")

    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        for w in word_id - union:
            print("Word " + str(w) + " not found in any embedding")
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")

    first_emb = True
    if not os.path.exists("tmp_conc"):
        os.makedirs("tmp_conc")

    total_dims = 0
    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )
        total_dims += emb.dims

        string = "<" + str(datetime.datetime.now()) + "> " + "Embedding " + str(x)
        print(string, end="\r")

        for wi, w in enumerate(word_id):
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError:
                # Words missing from this embedding keep the zero vector.
                pass
            matrix.append(m)
            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> "
                          + "Calculating meta embedding for embedding " + str(x) + ": "
                          + str(int(100 * wi / len(word_id))) + "%")
                print(string, end="\r")
        print()

        with open("tmp_conc/" + str(x), "w+", encoding="utf-8") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]), file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]), file=file)
                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> "
                              + "Saving embedding " + str(x) + " to file : "
                              + str(int(100 * wi / len(word_id))) + "%")
                    print(string, end="\r")
            print()
        first_emb = False

    printTrace("Concatenation...")
    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp_conc/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) + "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        shutil.rmtree("tmp_conc")
    except:
        print("Could not delete the tmp_conc folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))
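# --- Hypothetical helpers (not part of the original sources) ----------------
# The scripts above also rely on a few small utilities that are not shown in
# this section: batch() to iterate a list in fixed-size chunks,
# vocab_from_path() to read the vocabulary of a word2vec-format text
# embedding, and get_dimensions() to read its dimensionality. Minimal
# sketches, assuming word2vec text files whose first line is "<num_words> <dims>":
def batch_sketch(iterable, n=1):
    # Yield consecutive slices of length n (the last one may be shorter).
    for i in range(0, len(iterable), n):
        yield iterable[i:i + n]


def vocab_from_path_sketch(path):
    # Collect the first token of every line after the word2vec header.
    vocab = set()
    with open(path, 'r', encoding='utf-8', errors='surrogateescape') as f:
        next(f)  # skip the "<num_words> <dims>" header
        for line in f:
            vocab.add(line.split(' ', 1)[0])
    return vocab


def get_dimensions_sketch(path):
    # The second field of the header is the number of dimensions.
    with open(path, 'r', encoding='utf-8', errors='surrogateescape') as f:
        return int(f.readline().split()[1])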