Example #1
import json
import os

import gensim
import numpy as np

# PAD_WORD_INDEX and get_word_vector are module-level names in the original
# project (a hypothetical get_word_vector sketch follows this example).


def construct_vocab_emb(data_path,
                        train_vocab,
                        test_vocab,
                        embed_size,
                        qrepr,
                        base_embed_path,
                        emb_type='word2vec'):
    train_vocab_emb, test_vocab_emb = None, None
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    f = open('%s/OOV_words.txt' % data_path, 'w')

    print('Load %s word embeddings...' % emb_type)
    if emb_type == 'word2vec':
        assert base_embed_path.endswith(
            "GoogleNews-vectors-negative300.bin.gz")
        entity_model = gensim.models.KeyedVectors.load_word2vec_format(
            base_embed_path, binary=True, unicode_errors="ignore")
    elif emb_type == 'glove':
        entity_model = gensim.models.KeyedVectors.load_word2vec_format(
            base_embed_path, binary=False, unicode_errors="ignore")
    else:
        base_emb_vocab = json.load(
            open(base_embed_path.replace("_emb", "").replace(".npy", ".json")))
        base_emb_matrix = np.load(base_embed_path)
        entity_model = (base_emb_vocab, base_emb_matrix)
    print("Building embedding matrix from base embedding at %s..." %
          base_embed_path)
    cnt_oov = 0
    train_vocab_emb = np.zeros(shape=(len(train_vocab), embed_size))
    test_vocab_emb = np.zeros(shape=(len(test_vocab), embed_size))
    print("train vocab size: %d, test vocab size: %d" %
          (len(train_vocab), len(test_vocab)))
    for word in train_vocab:
        wid = train_vocab[word]
        # padded words embedded to vector with all zeros
        if wid != PAD_WORD_INDEX:
            emb = get_word_vector(entity_model, word)
            if emb is None:
                cnt_oov += 1
                # OOV word: use a small random vector in [0, 0.1)
                emb = np.random.rand(embed_size).astype(np.float32) * 0.1
                f.write(word + '\n')
            train_vocab_emb[wid] = emb
    for word in test_vocab:
        # test ids continue after the train vocabulary, so shift them back
        wid = test_vocab[word] - len(train_vocab)
        emb = get_word_vector(entity_model, word)
        if emb is None:
            cnt_oov += 1
            # OOV word: use a small random vector in [0, 0.1)
            emb = np.random.rand(embed_size).astype(np.float32) * 0.1
            f.write(word + '\n')
        test_vocab_emb[wid] = emb

    print('OOV words: %d' % cnt_oov)
    f.close()
    return train_vocab_emb, test_vocab_emb
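The examples on this page call get_word_vector without showing its definition. A minimal sketch consistent with Example #1's two model types (a gensim KeyedVectors object, or a (vocab, matrix) tuple built from the .json/.npy files) might look like the following; the implementation and the None-for-OOV contract are inferred from the callers, not taken from the original project.

import numpy as np

def get_word_vector(entity_model, word):
    # Hypothetical helper, reconstructed from how the examples use it.
    if isinstance(entity_model, tuple):
        # (vocab dict, embedding matrix) pair
        vocab, matrix = entity_model
        return matrix[vocab[word]] if word in vocab else None
    # gensim KeyedVectors path; `word in model` checks the vocabulary
    return entity_model[word] if word in entity_model else None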
Example #2
File: run.py  Project: nguyenlab/SentSum
import numpy as np

import utils  # SentSum's project-local helper module

# `FLAGS` is a module-level TensorFlow-style flags object in run.py,
# bound here as a default argument so callers can override it.
def get_sample(model, data, index, FLAGS=FLAGS):
    sample = data[index]
    word = sample['word']
    stem = sample['stem']
    # copy so that the padding below does not mutate the cached sample
    labels_bin = list(sample['bin'])
    name = sample['name']
    eos = utils.get_eos_vector()

    fw = np.empty(shape=(0, 1, FLAGS.embedding_size), dtype=float)
    labels = np.empty(shape=(0, 1, FLAGS.class_size), dtype=float)
    eos_ex = np.empty(shape=(0, 1, FLAGS.embedding_size), dtype=float)

    seq_len = len(word)
    for i in range(seq_len):
        vector = utils.get_word_vector(model, word[i], stem[i])
        fw = np.append(fw, vector, axis=0)

    # pad the sequence up to max_len with EOS vectors and zero labels
    for i in range(seq_len, FLAGS.max_len):
        eos_ex = np.append(eos_ex, eos, axis=0)
        labels_bin.append(0)

    fw = np.append(fw, eos_ex, axis=0)

    for i in range(FLAGS.max_len):
        # one-hot encode the binary label: 0 -> [1, 0], 1 -> [0, 1]
        if labels_bin[i] == 0:
            x = np.array([[[1, 0]]])
        else:
            x = np.array([[[0, 1]]])
        labels = np.append(labels, x, axis=0)

    return fw, labels, name, np.array([seq_len])
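A hypothetical call, assuming `model` is the loaded embedding model and `data` is the sample list prepared elsewhere in run.py:

fw, labels, name, seq_len = get_sample(model, data, 0)
# fw:      (FLAGS.max_len, 1, FLAGS.embedding_size) word vectors, EOS-padded
# labels:  (FLAGS.max_len, 1, FLAGS.class_size) one-hot labels
# seq_len: 1-element array holding the true (unpadded) sentence length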
Example #3
    def build_vocab(self, vocab_path, vocab_size=5000):
        sorted_words = sorted(self.words_counts.items(),
                              key=lambda x: x[1],
                              reverse=True)
        # keep only Chinese words (is_ch is a helper defined elsewhere)
        filter_words = list(filter(lambda x: is_ch(str(x[0])), sorted_words))

        self.words2id['UNK'] = 0
        self.id2words[0] = 'UNK'
        index = 1

        # save the vocabulary; the `with` block closes the file on exit
        with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
            for item in filter_words:
                word = item[0]

                word_embedding = get_word_vector(word)
                if not word_embedding:
                    # skip words without an embedding so the saved file
                    # matches the in-memory vocabulary
                    continue

                vocab_file.write(word + '\n')
                self.words2id[word] = index
                self.id2words[index] = word
                self.word_embedding[word] = word_embedding
                index += 1

                if index == vocab_size:
                    break
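Unlike Examples #1 and #2, build_vocab calls get_word_vector(word) with a single argument and treats a falsy result as OOV. A hypothetical variant consistent with that usage, assuming a module-level gensim model (the model name and path below are placeholders, not from the original project):

import gensim

_model = gensim.models.KeyedVectors.load_word2vec_format(
    'embeddings.bin', binary=True)  # placeholder path

def get_word_vector(word):
    # Hypothetical single-argument variant: return a plain list so that
    # `if not word_embedding` is a safe emptiness check for OOV words.
    return _model[word].tolist() if word in _model else []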
Example #4

def construct_vocab_emb(data_path, train_vocab, test_vocab, embed_size, qrepr,
                        base_embed_path):
    # relies on the same imports and module-level helpers as Example #1
    train_vocab_emb, test_vocab_emb = None, None
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    f = open('%s/OOV_word.txt' % data_path, 'w')
    if embed_size == 300 and not qrepr == "char":
        print('Load base word embeddings...')
        if base_embed_path.endswith("GoogleNews-vectors-negative300.bin.gz") or \
            base_embed_path.endswith("tweet_vector_0401.bin"):
            entity_model = gensim.models.KeyedVectors.load_word2vec_format(
                base_embed_path, binary=True, unicode_errors="ignore")
        else:
            base_emb_vocab = json.load(
                open(
                    base_embed_path.replace("_emb",
                                            "").replace(".npy", ".json")))
            base_emb_matrix = np.load(base_embed_path)
            entity_model = (base_emb_vocab, base_emb_matrix)
        print("Building embedding matrix from base embedding at %s..." %
              base_embed_path)
        cnt_oov_train, cnt_oov_test = 0, 0
        train_vocab_emb = np.zeros(shape=(len(train_vocab), embed_size))
        test_vocab_emb = np.zeros(shape=(len(test_vocab), embed_size))
        print("train vocab size: %d, test vocab size: %d" %
              (len(train_vocab), len(test_vocab)))
        for word in train_vocab:
            wid = train_vocab[word]
            # padded words embedded to vector with all zeros
            if wid != PAD_WORD_INDEX:
                emb = get_word_vector(entity_model, word)
                if emb is None:
                    cnt_oov_train += 1
                    # OOV word: use a small random vector in [0, 0.1)
                    emb = np.random.rand(embed_size).astype(np.float32) * 0.1
                    f.write(word + '\n')
                train_vocab_emb[wid] = emb
        for word in test_vocab:
            # test ids continue after the train vocabulary, so shift them back
            wid = test_vocab[word] - len(train_vocab)
            emb = get_word_vector(entity_model, word)
            if emb is None:
                cnt_oov_test += 1
                # OOV word: use a small random vector in [0, 0.1)
                emb = np.random.rand(embed_size).astype(np.float32) * 0.1
                f.write(word + '\n')
            test_vocab_emb[wid] = emb

        print('OOV words in train set: {}, OOV words in test set: {}'.format(
            cnt_oov_train, cnt_oov_test))
        f.close()
    return train_vocab_emb, test_vocab_emb
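A hypothetical call for Example #4, assuming train_vocab and test_vocab map words to ids (test ids offset by len(train_vocab), with PAD_WORD_INDEX reserved in train_vocab); the paths are placeholders:

train_emb, test_emb = construct_vocab_emb(
    './cache', train_vocab, test_vocab, embed_size=300, qrepr='word',
    base_embed_path='GoogleNews-vectors-negative300.bin.gz')
# Note: with embed_size != 300 or qrepr == 'char', the guard at the top of
# the function is skipped and (None, None) is returned.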