def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--embeddings", nargs="+", required=True)
    parser.add_argument("-o", "--output", required=True)
    parser.add_argument("-v", "--vocabulary", default=None)
    parser.add_argument("-b", "--batch_size", type=int, default=1024)
    parser.add_argument("-k", "--num_nearest_neighbor", type=int, default=10)
    parser.add_argument("-oov", "--generate_oov_words", action="store_false")
    args = parser.parse_args()

    if args.generate_oov_words:
        concatenate_embeddings_generate(
            embeddings_path=args.embeddings,
            out_path=args.output,
            vocab=vocab_from_path(args.vocabulary) if args.vocabulary else None,
            batch_size=args.batch_size,
            k=args.num_nearest_neighbor,
        )
    else:
        concatenate_embeddings(
            embeddings_path=args.embeddings,
            out_path=args.output,
            vocab=vocab_from_path(args.vocabulary) if args.vocabulary else None,
        )
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')
    emb_format = 'bin' if path_input.split('/')[-1].split('.')[-1] == 'bin' else 'text'

    emb = load_embedding(path_input, format=emb_format,
                         vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
                         length_normalize=args.length_normalize,
                         normalize_dimensionwise=args.normalize_dimensionwise, to_unicode=True,
                         lower=args.lower, path2='', delete_duplicates=True, method_vgg="delete")

    printTrace('Saving result to ' + str(path_output) + '...')

    num_words = 0
    with open(path_output, 'w+') as file:
        for i_word, word in enumerate(emb.words):

            if i_word % 5000 == 0:
                string = "<" + str(datetime.datetime.now()) + ">  " + 'Converting: ' + str(
                    int(100 * i_word / len(emb.words))) + '%'
                print(string, end="\r")
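            # Keep a word if no language filter is set or one of the requested language codes appears in its delimiter-separated parts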
            if args.language is None or any(l in word.split(args.delimiter) for l in args.language):
                print(word.split(args.delimiter)[-1] + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]), file=file)
                num_words += 1

    print()

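    # Prepend the word2vec header line ("<num_words> <dims>") to the output file with sed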
    if args.word2vec:
        excec_com = 'sed -i \'1s/^/' + str(num_words) + ' ' + str(emb.dims) + '\\n/\' ' + str(path_output)
        print(excec_com)
        os.system(excec_com)

    printTrace('Done.')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-o', '--output', required=True)
    args = parser.parse_args()

    vocab = vocab_from_path(args.embedding)

    with open(args.output, 'w+') as file:
        for word in vocab:
            print(word, file=file)

    print('Done.')
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embeddings', nargs='+', required=True)
    parser.add_argument('-o', '--output', type=str, required=True)
    args = parser.parse_args()

    printTrace('Loading vocabulary from embeddings...')
    vocab_embeddings = [vocab_from_path(x) for x in args.embeddings]
    union_vocab = set.union(*vocab_embeddings)
    printTrace('The union of the vocabulary has ' + str(len(union_vocab)) +
               ' words.')
    printTrace('Printing vocabulary in ' + args.output + '...')
    with open(args.output, 'w+') as file:
        for word in union_vocab:
            print(word, file=file)
Example #5
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-l', '--search_words', required=True)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    parser.add_argument('-k', '--num_nearest_neighbor', type=int, default=10)

    args = parser.parse_args()

    emb = load_embedding(args.embedding,
                         vocabulary=None,
                         lower=False,
                         length_normalize=True,
                         normalize_dimensionwise=False,
                         delete_duplicates=True)

    words_2_search = vocab_from_path(args.search_words)

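    # m holds the vectors of the query words, M those of the full vocabulary; vectors are length-normalized so k-NN reduces to cosine similarity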
    m = emb.words_to_matrix(words_2_search)
    M = emb.words_to_matrix(emb.words)

    nn = []

    for i_batch, mb in enumerate(batch(m, args.batch_size)):

        string = "<" + str(
            datetime.datetime.now()) + ">  " + 'Calculating nn words  ' + str(
                int(100 * (i_batch * args.batch_size) / len(m))) + '%'
        print(string, end="\r")

        result = cosine_knn(mb, M, args.num_nearest_neighbor)

        for i_result, indexes in enumerate(result):
            nn.append(["\"" + emb.words[i] + "\"" for i in indexes])

    print()

    with open(args.output, 'w+', encoding='utf-8') as file:
        for word, nns in zip(words_2_search, nn):
            print(word + ': ' + ' '.join(nns), file=file)
Example #6
def emb_converter(path_input, path_output, args):
    printTrace('Loading Embedding ' + str(path_input) + '...')
    emb_format = 'bin' if path_input.split('/')[-1].split(
        '.')[-1] == 'bin' else 'text'

    emb = load_embedding(
        path_input,
        format=emb_format,
        vocabulary=None if args.vocab is None else vocab_from_path(args.vocab),
        length_normalize=args.length_normalize,
        normalize_dimensionwise=args.normalize_dimensionwise,
        to_unicode=True,
        lower=args.lower,
        path2='',
        delete_duplicates=True,
        method_vgg="delete")

    printTrace('Saving result to ' + str(path_output) + '...')
    emb.export(path=path_output, printHeader=args.word2vec)

    printTrace('Done.')
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--embedding", type=str, required=True)
    parser.add_argument("-c", "--emb_4_generation", type=str, required=True)
    parser.add_argument("-d", "--dataset", type=str, required=True)
    parser.add_argument("-b", "--batch_size", type=int, default=1024)
    parser.add_argument("-k", "--num_nearest_neighbor", type=int, default=10)

    args = parser.parse_args()

    dims = get_dimensions(args.embedding)

    if dims != get_dimensions(args.emb_4_generation):
        raise ValueError(
            "All the embeddings must have the same number of dimensions and the embeddings must be in the word2vec format"
        )

    printTrace("Reading vocab...")

    vocab_emb = vocab_from_path(args.embedding)
    vocab_cross = vocab_from_path(args.emb_4_generation)
    dataset = get_dataset(args.dataset)
    vocab_to_generate = list(
        set(np.append((dataset.X[:, 0]), (dataset.X[:, 1]))))
    vocab_to_generate_set = set(vocab_to_generate)
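    # Drop the dataset words from the embedding vocabulary so they become OOV and have to be re-generated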
    vocab_emb_delete = [x for x in vocab_emb if x not in vocab_to_generate_set]

    total_vocab = set.union(set(vocab_emb_delete), set(vocab_cross))
    interset_vocab = list(
        set.intersection(set(vocab_emb_delete), set(vocab_cross)))

    print("Final embedding will have " + str(len(total_vocab)) + " words")
    print("We will generate " + str(len(vocab_to_generate)) + " words")

    emb = load_embedding(
        args.emb_4_generation,
        vocabulary=None,
        lower=False,
        length_normalize=True,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

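    # m: vectors of the words to generate (from the auxiliary embedding); M: vectors of the words shared by both embeddings, used as the k-NN search space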
    m = emb.words_to_matrix(vocab_to_generate)
    M = emb.words_to_matrix(interset_vocab)

    nn = []

    for i_batch, mb in enumerate(batch(m, args.batch_size)):

        string = ("<" + str(datetime.datetime.now()) + ">  " +
                  "Using Embedding " + str(args.emb_4_generation) +
                  " to generate vocab for Embedding " + str(args.embedding) +
                  ":  " + str(int(100 *
                                  (i_batch * args.batch_size) / len(m))) + "%")
        print(string, end="\r")

        # print(np.asarray(mb).shape)
        # print(np.asarray(M).shape)

        result = cosine_knn(mb, M, args.num_nearest_neighbor)

        for i_result, indexes in enumerate(result):
            nn.append([interset_vocab[i] for i in indexes])

    del emb

    printTrace("===> Generating new_vocab <===")

    emb = load_embedding(
        args.embedding,
        vocabulary=vocab_emb_delete,
        lower=False,
        length_normalize=False,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    new_vectors = []
    for i_word, word in enumerate(vocab_to_generate):
        if i_word % 1000 == 0:
            string = ("<" + str(datetime.datetime.now()) + ">  " +
                      "Generating vocab " + ": " +
                      str(int(100 * i_word / len(vocab_to_generate))) + "%")
            print(string, end="\r")

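        # Build the new vector as the average of the k nearest shared-vocabulary neighbours found above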
        try:
            lw = nn[i_word]
            v = np.zeros([dims], dtype=float)
            for word_nn in lw:
                v += emb.word_to_vector(word_nn)

        except KeyError:
            raise ValueError(
                "Something went wrong in the word generation process")

        new_vectors.append(v / args.num_nearest_neighbor)

    print()

    del emb

    printTrace("===> Loading embeddings to compare <===")
    emb_generated = Embedding(vocabulary=Vocabulary(vocab_to_generate),
                              vectors=new_vectors)
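    # Reload the original vectors of the generated words so both embeddings can be scored on the same dataset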
    emb_original = load_embedding(
        args.embedding,
        vocabulary=vocab_to_generate,
        lower=False,
        length_normalize=False,
        normalize_dimensionwise=False,
        delete_duplicates=True,
    )

    printTrace("===> Evaluate <===")

    print("Original Embedding: ", end="")
    print(
        similarity_emd(
            emb_original,
            dataset.X,
            dataset.y,
            backoff_vector=None,
            lower=False,
            lang1prefix=None,
            lang2prefix=None,
        ))
    print("Generated Embedding: ", end="")
    print(
        similarity_emd(
            emb_generated,
            dataset.X,
            dataset.y,
            backoff_vector=None,
            lower=False,
            lang1prefix=None,
            lang2prefix=None,
        ))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--embedding', required=True)
    parser.add_argument('-c', '--cross_embedding', required=True)
    parser.add_argument('-o', '--output', required=True)
    parser.add_argument('-b', '--batch_size', type=int, default=1024)
    parser.add_argument('-k', '--num_nearest_neighbor', type=int, default=10)

    args = parser.parse_args()

    dims = get_dimensions(args.embedding)

    if dims != get_dimensions(args.cross_embedding):
        raise ValueError('All the embeddings must have the same number of dimensions and the embeddings must be in the word2vec format')

    printTrace('Reading vocab...')

    vocab_emb = vocab_from_path(args.embedding)
    vocab_cross = vocab_from_path(args.cross_embedding)

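    # total_vocab: union of both vocabularies; interset_vocab: shared words used as k-NN anchors; vocab_to_generate: words only in the cross embedding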
    total_vocab = set.union(set(vocab_emb), set(vocab_cross))
    interset_vocab = list(set.intersection(set(vocab_emb), set(vocab_cross)))
    vocab_to_generate = set(vocab_cross) - set(vocab_emb)

    print('Final embedding will have ' + str(len(total_vocab)) + ' words')
    print('We will generate ' + str(len(vocab_to_generate)) + ' words')

    emb = load_embedding(args.cross_embedding, vocabulary=None, lower=False, length_normalize=True, normalize_dimensionwise=False,
                         delete_duplicates=True)

    m = emb.words_to_matrix(vocab_to_generate)

    M = emb.words_to_matrix(interset_vocab)

    nn = []

    for i_batch, mb in enumerate(batch(m, args.batch_size)):

        string = "<" + str(datetime.datetime.now()) + ">  " + 'Using Embedding ' + str(
            args.cross_embedding) + ' to generate vocab for Embedding ' + str(args.embedding) + ':  ' + str(
            int(100 * (i_batch * args.batch_size) / len(m))) + '%'
        print(string, end="\r")

        # print(np.asarray(mb).shape)
        # print(np.asarray(M).shape)

        result = cosine_knn(mb, M, args.num_nearest_neighbor)

        for i_result, indexes in enumerate(result):
            nn.append([interset_vocab[i] for i in indexes])

    del emb

    printTrace('===> Generating new_vocab <===')

    emb = load_embedding(args.embedding, vocabulary=None, lower=False, length_normalize=False, normalize_dimensionwise=False,
                         delete_duplicates=True)

    new_vectors = []
    for i_word, word in enumerate(vocab_to_generate):
        if i_word % 1000 == 0:
            string = "<" + str(datetime.datetime.now()) + ">  " + 'Generating vocab ' + args.output + ': ' + str(
                int(100 * i_word / len(vocab_to_generate))) + '%'
            print(string, end="\r")

        try:
            lw = nn[i_word]
            v = np.zeros([dims], dtype=float)
            for word_nn in lw:
                v += emb.word_to_vector(word_nn)

        except KeyError:
            raise ValueError('Something went wrong in the word generation process')

        new_vectors.append(v / args.num_nearest_neighbor)

    print()

    printTrace('===> Printing to file <===')

    with open(args.output, 'w') as file:

        # word2vec header: total number of words followed by the vector dimensionality
        print(str(len(emb.words) + len(vocab_to_generate)) + ' ' + str(dims), file=file)

        for w in emb.words:
            print(w + ' ' + ' '.join(['%.6g' % x for x in emb.word_to_vector(w)]), file=file)

        for w_i, w in enumerate(vocab_to_generate):
            print(w + ' ' + ' '.join(['%.6g' % x for x in new_vectors[w_i]]), file=file)
Example #9
def main():
    parser = argparse.ArgumentParser()
    inputtype = parser.add_mutually_exclusive_group(required=True)
    inputtype.add_argument('-i', '--embedding', type=str)
    inputtype.add_argument('-d', '--directory', type=str)

    #parser.add_argument('-n', '--name_of_embedding', default=None)
    parser.add_argument('-l', '--lowercase_dataset', action='store_true')
    parser.add_argument('-lg', '--language', nargs='+', default=['en'])

    parser.add_argument('-p', '--add_lang_prefix', action='store_true')

    parser.add_argument('-v', '--vocab', type=str, default=None)

    args = parser.parse_args()

    emb_list = []

    if args.embedding is not None:
        emb_list.append(args.embedding)
    else:
        emb_list = [
            os.path.join(args.directory, f) for f in os.listdir(args.directory)
            if os.path.isfile(os.path.join(args.directory, f))
        ]

    for emb_i, emb_path in enumerate(emb_list):

        printTrace('Evaluating Embedding ' + str(emb_i + 1) + ' of ' +
                   str(len(emb_list)) + ' : ' + str(emb_path))

        emb = load_embedding(emb_path,
                             vocabulary=(None if args.vocab is None else
                                         vocab_from_path(args.vocab)),
                             lower=False,
                             length_normalize=False,
                             delete_duplicates=True)

        for lang in args.language:

            lang1prefix = None
            lang2prefix = None

            if args.add_lang_prefix:
                if lang == 'en':
                    lang1prefix = 'en'
                    lang2prefix = 'en'
                elif lang == 'es':
                    lang1prefix = 'es'
                    lang2prefix = 'es'
                elif lang == 'enes':
                    lang1prefix = 'en'
                    lang2prefix = 'es'
                else:
                    logging.warning(
                        'Language not supported, could not add prefix')

            if not os.path.exists('Results_' + lang):
                os.makedirs('Results_' + lang)

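            # Two evaluations: first OOV words are discarded, then they back off to the mean of all word vectors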
            print('>>> Results deleting OOV <<<')

            a, b = results_to_csv(evaluate_on_all(
                emb,
                backoff_vector=None,
                lowercase_dataset=args.lowercase_dataset,
                lang=lang,
                lang1prefix=lang1prefix,
                lang2prefix=lang2prefix),
                                  printRes=False,
                                  returnRes=True)
            export_to_csv(
                txtResults=a,
                txtCov=b,
                name=emb_path,
                filenameResults='Results_' + lang + '/Sim_Results_delete.csv',
                filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

            print('>>> Result using mean of all word vectors as OOV <<<')

            a, b = results_to_csv(evaluate_on_all(
                emb,
                backoff_vector=np.mean(emb.vectors, axis=0),
                lowercase_dataset=args.lowercase_dataset,
                lang=lang,
                lang1prefix=lang1prefix,
                lang2prefix=lang2prefix),
                                  printRes=False,
                                  returnRes=True)
            export_to_csv(
                txtResults=a,
                txtCov=b,
                name=emb_path,
                filenameResults='Results_' + lang + '/Sim_Results_mean.csv',
                filenameCoverage='Results_' + lang + '/Sim_Coverage.csv')

    print('Results have been exported in csv format to the Results_<lang> folders')
Example #10
import sys
sys.path.insert(0, '../')

from embedding import load_embedding
from utils import vocab_from_path

Joint_path = '../../Embeddings/'

print("====ENGLISH-SPANISH===")

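# Intersect the vocabularies of the joint embeddings to get the words shared across language pairs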
words_eng = []
words_eng.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENES.emb'))
words_eng.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENIT.emb'))
english_words = list(set.intersection(*words_eng))

words_es = []
words_es.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ENES.emb'))
words_es.append(vocab_from_path(Joint_path + 'JOINTC-HYB-ESIT.emb'))
spanish_words = list(set.intersection(*words_es))

emb = load_embedding(Joint_path + 'JOINTC-HYB-ENES.emb',
                     length_normalize=False,
                     delete_duplicates=True)

with open('../../Embeddings/separated/JointENES.vec', 'w') as file:

    print(str(len(spanish_words) + len(english_words)) + ' 300', file=file)

    for word in english_words:
        print('en/' + word + ' ' +
              ' '.join(['%.6g' % x for x in emb.word_to_vector(word)]),
              file=file)
def concatenate_embeddings_generate(embeddings_path,
                                    out_path,
                                    vocab=None,
                                    batch_size=1024,
                                    k=10):
    printTrace("Reading vocab...")

    # [[vocab_emb1], [vocab_emb_2], ...]
    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        for w in word_id - union:
            print("Word " + str(w) + " not found in any embedding")
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")

    for i_voc, voc in enumerate(vocab_embeddings):
        print("Embedding " + str(i_voc) + " has " + str(len(voc)) + " words.")
        print("We will generate " + str(len(set(word_id) - voc)) +
              " words for the embedding " + str(i_voc))

    print()

    printTrace("Building matrix for word generation...")
    generation_vocab_matrix = [[x for x in range(len(embeddings_path))]
                               for x in range(len(embeddings_path))]
    nn_vocab = [defaultdict() for x in range(len(embeddings_path))]

    for x, emb1 in enumerate(vocab_embeddings):
        vocab_to_generate = set(word_id) - emb1
        for y, emb2 in enumerate(vocab_embeddings):
            generation_vocab_matrix[y][x] = list(
                vocab_to_generate.intersection(emb2))
            vocab_to_generate = vocab_to_generate - emb2

    printTrace("===> Calculating nearest neighbors <===")

    for i_emb_path, emb_path in enumerate(embeddings_path):

        printTrace("Loading file: " + str(emb_path))
        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        for i_g, g in enumerate(generation_vocab_matrix[i_emb_path]):
            if len(g) > 0:
                # print('G: ' + str(g))
                m = emb.words_to_matrix(
                    g)  # generation_vocab_matrix[i_emb_path][i_g])

                # print(len(m))
                # print(generation_vocab_matrix[x][gi])

                interset_vocab = list(
                    set.intersection(vocab_embeddings[i_emb_path],
                                     vocab_embeddings[i_g]))

                M = emb.words_to_matrix(interset_vocab)

                total_words = len(m)

                for i_batch, mb in enumerate(batch(m, batch_size)):

                    string = (
                        "<" + str(datetime.datetime.now()) + ">  " +
                        "Using Embedding " + str(i_emb_path) +
                        " to generate vocab for Embedding " + str(i_g) +
                        ":  " +
                        str(int(100 *
                                (i_batch * batch_size) / total_words)) + "%")
                    print(string, end="\r")

                    result = cosine_knn(mb, M, k)
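                    # Map each batch row back to its word in g and store its k nearest shared-vocabulary neighbours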
                    for i_result, indexes in enumerate(result):
                        nn_vocab[i_g][g[i_result + (batch_size * i_batch)]] = [
                            interset_vocab[i] for i in indexes
                        ]

                print()

    printTrace("===> Calculating meta embedding <===")

    total_words = len(word_id)
    first_emb = True

    if not os.path.exists("tmp"):
        os.makedirs("tmp")

    total_dims = 0

    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))

        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        total_dims += emb.dims

        string = "<" + str(
            datetime.datetime.now()) + ">  " + "Embedding " + str(x)
        print(string, end="\r")

        for wi, w in enumerate(word_id):
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError:
                # Word missing from this embedding: average the vectors of its k nearest
                # shared-vocabulary neighbours (computed above) and length-normalize the result
                try:
                    lw = nn_vocab[x][w]
                    v = np.zeros([emb.dims], dtype=float)
                    for word in lw:
                        v += emb.word_to_vector(word)

                except KeyError:
                    raise ValueError(
                        "Something went wrong in the word generation process")

                m = normalize_vector(v / k)

            matrix.append(m)

            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " +
                          "Calculating meta embeddind for embedding " +
                          str(x) + ": " + str(int(100 * wi / total_words)) +
                          "%")
                print(string, end="\r")
        print()

        with open("tmp/" + str(x), "w") as file:
            for wi, w in enumerate(word_id):
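                # Only the first temp file keeps the word itself; later files store just the vector values,
                # so `paste` can later join the columns into one line per word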
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)

                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> " +
                              "Saving embedding " + str(x) + " to file : " +
                              str(int(100 * wi / total_words)) + "%")
                    print(string, end="\r")

            print()

        first_emb = False

    printTrace("Concatenation...")

    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) +
                 "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        shutil.rmtree("tmp")
    except OSError:
        print("Could not delete the tmp folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))
def concatenate_embeddings(
    embeddings_path,
    out_path,
    vocab,
):
    printTrace("===> Calculating meta embedding (No OOV) <===")

    vocab_embeddings = [vocab_from_path(x) for x in embeddings_path]

    if vocab is None:
        word_id = list(set.union(*vocab_embeddings))
    else:
        word_id = set(vocab)
        union = set.union(*vocab_embeddings)
        for w in word_id - union:
            print("Word " + str(w) + " not found in any embedding")
        word_id = list(word_id.intersection(union))

    print("The final embedding will have " + str(len(word_id)) + " words.")

    first_emb = True

    if not os.path.exists("tmp_conc"):
        os.makedirs("tmp_conc")

    total_dims = 0

    for x, emb_path in enumerate(embeddings_path):
        matrix = []
        printTrace("Loading file: " + str(emb_path))

        emb = load_embedding(
            emb_path,
            vocabulary=None,
            length_normalize=True,
            normalize_dimensionwise=False,
            delete_duplicates=True,
        )

        total_dims += emb.dims

        string = "<" + str(
            datetime.datetime.now()) + ">  " + "Embedding " + str(x)
        print(string, end="\r")

        for wi, w in enumerate(word_id):
            m = np.zeros([emb.dims], dtype=float)
            try:
                m = emb.word_to_vector(w)
            except KeyError:
                # Word not present in this embedding: keep the zero vector
                pass

            matrix.append(m)

            if wi % 1000 == 0:
                string = ("<" + str(datetime.datetime.now()) + "> " +
                          "Calculating meta embeddind for embedding " +
                          str(x) + ": " + str(int(100 * wi / len(word_id))) +
                          "%")
                print(string, end="\r")
        print()

        with open("tmp_conc/" + str(x), "w+", encoding="utf-8") as file:
            for wi, w in enumerate(word_id):
                if first_emb:
                    print(w + " " + " ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)
                else:
                    print(" ".join(["%.6g" % x for x in matrix[wi]]),
                          file=file)

                if wi % 1000 == 0:
                    string = ("<" + str(datetime.datetime.now()) + "> " +
                              "Saving embedding " + str(x) + " to file : " +
                              str(int(100 * wi / len(word_id))) + "%")
                    print(string, end="\r")

            print()

        first_emb = False

    printTrace("Concatenation...")

    excec_com = "paste -d ' ' "
    for x in range(len(embeddings_path)):
        excec_com = excec_com + "tmp_conc/" + str(x) + " "
    excec_com = excec_com + "> " + str(out_path)
    print(excec_com)
    os.system(excec_com)

    excec_com = ("sed -i '1s/^/" + str(len(word_id)) + " " + str(total_dims) +
                 "\\n/' " + str(out_path))
    print(excec_com)
    os.system(excec_com)

    try:
        shutil.rmtree("tmp_conc")
    except OSError:
        print("Could not delete the tmp_conc folder, do it manually")

    printTrace("Done. Meta embedding saved in " + str(out_path))