Exemplo n.º 1
0
def augment_pretrained_with_random_initialization(args):
    """Extend a pretrained embedding matrix with randomly initialized rows.

    Builds a vocabulary consisting of (in order) the pretrained vocabulary,
    the special symbols, and any training-data words at or above the
    frequency cutoff, then writes out both the merged word list
    (``new_words.txt``) and an embedding matrix (``new_embeddings.txt``)
    whose leading rows are the pretrained vectors and whose remaining rows
    are drawn uniformly from [-0.01, 0.01).

    Args (attributes of ``args``):
        pretrained_vocab: path to the word list of the pretrained embeddings.
        pretrained: path to the pretrained embedding file.
        specials: iterable of special tokens (e.g. PAD, START, END).
        new_words: path read by ``read_model_defs`` mapping word -> frequency.
        freq_cut: minimum frequency for a training-data word to be kept.
        out: output directory.
    """
    # Insertion order is the contract here: the first embeddings.shape[0]
    # entries of this OrderedDict must line up with the rows of the
    # pretrained matrix copied in below.
    words = OrderedDict()
    # words in pretrained word embedding (file closed deterministically)
    with open(args.pretrained_vocab) as vocab_file:
        for word in vocab_file:
            words[word.strip()] = 1

    # words in specials e.g. PAD, START, END
    for word in args.specials:
        words[word] = 1

    # words found in training data, kept only at/above the frequency cutoff.
    # NOTE(review): these keys are encoded to bytes while the keys above are
    # whatever open()/args.specials yield — assumed consistent under
    # Python 2 str semantics; verify before porting to Python 3.
    for word, freq in read_model_defs(args.new_words).items():
        if freq >= args.freq_cut:
            words[word.encode("utf-8")] = freq

    new_pretrained_vocab = os.path.join(args.out, "new_words.txt")
    print >> sys.stderr, "writing to", new_pretrained_vocab
    with open(new_pretrained_vocab, "w") as f:
        for word, freq in words.items():
            f.write("{} {}\n".format(word, freq))

    embeddings = read_pretrained_embeddings(args.pretrained)
    # Sanity check: the merged vocabulary must cover the pretrained one.
    assert embeddings.shape[0] <= len(words), \
        "pretrained size: {}, read words: {}".format(embeddings.shape[0], len(words))
    # Uniform init in [-0.01, 0.01) for rows without a pretrained vector.
    new_embeddings = 0.02 * np.random.random_sample(
            (len(words), embeddings.shape[1])).astype('f') - 0.01
    # Vectorized copy of the pretrained vectors into the leading rows
    # (replaces a per-row Python loop; same result, single C-level copy).
    new_embeddings[:embeddings.shape[0]] = embeddings

    new_pretrained = os.path.join(args.out, "new_embeddings.txt")
    print >> sys.stderr, "writing to", new_pretrained
    np.savetxt(new_pretrained, new_embeddings)
    print >> sys.stderr, "vocabulary size", len(embeddings), "-->", len(new_embeddings)
Exemplo n.º 2
0
def extract_subset_of_pretrained_embeddings(args):
    """Extract the pretrained embedding rows whose words occur in a model's
    vocabulary.

    Writes two aligned files into ``args.out``: the selected vectors
    (``extracted_embeddings.vector``, space-separated floats per line) and
    the corresponding words (``extracted_embeddings.words``, UTF-8, one per
    line), preserving the order of the pretrained vocabulary.

    Args (attributes of ``args``):
        pretrained: path to the pretrained embedding file.
        pretrained_vocab: path to the pretrained word list (UTF-8).
        new_words: path read by ``read_model_defs``; its keys are the
            vocabulary subset to keep.
        out: output directory.
    """
    embeddings = read_pretrained_embeddings(args.pretrained)
    # Close the vocab file deterministically instead of leaking the handle.
    with open(args.pretrained_vocab) as vocab_file:
        emb_words = [line.strip().decode("utf-8") for line in vocab_file]
    # Wrap in a set: under Python 2, dict.keys() is a list, so the
    # `word in subset` test in the loop below was O(n) per lookup.
    subset = set(read_model_defs(args.new_words).keys())

    new_pretrained = os.path.join(args.out, "extracted_embeddings.vector")
    new_vocab = os.path.join(args.out, "extracted_embeddings.words")
    print >> sys.stderr, "writing to", new_pretrained
    with open(new_vocab, "w") as v:
        with open(new_pretrained, "w") as f:
            for i, word in enumerate(emb_words):
                if word in subset:
                    f.write(" ".join([str(u) for u in embeddings[i]]) + "\n")
                    v.write(word.encode("utf-8") + "\n")
Exemplo n.º 3
0
 def load_pretrained_embeddings(self, path):
     """Overwrite the word-embedding layer's weights with vectors read
     from *path* via ``read_pretrained_embeddings``."""
     pretrained = read_pretrained_embeddings(path)
     self.emb_word.W.data = pretrained