# Module-level imports assumed by the snippets below; PAD_WORD_INDEX, get_word_vector,
# is_ch, utils, and FLAGS are defined elsewhere in the surrounding projects.
import os
import json

import gensim
import numpy as np


def construct_vocab_emb(data_path, train_vocab, test_vocab, embed_size, qrepr,
                        base_embed_path, type='word2vec'):
    """Build train/test embedding matrices from a base embedding file.

    Out-of-vocabulary (OOV) words get a small random vector and are logged to
    OOV_words.txt under data_path.
    """
    train_vocab_emb, test_vocab_emb = None, None
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    f = open('%s/OOV_words.txt' % data_path, 'w')

    print('Load %s word embeddings...' % type)
    if type == 'word2vec':
        assert base_embed_path.endswith("GoogleNews-vectors-negative300.bin.gz")
        entity_model = gensim.models.KeyedVectors.load_word2vec_format(
            base_embed_path, binary=True, unicode_errors="ignore")
    elif type == 'glove':
        entity_model = gensim.models.KeyedVectors.load_word2vec_format(
            base_embed_path, binary=False, unicode_errors="ignore")
    else:
        # Custom embeddings: a JSON vocab (word -> row index) plus a .npy matrix.
        base_emb_vocab = json.load(
            open(base_embed_path.replace("_emb", "").replace(".npy", ".json")))
        base_emb_matrix = np.load(base_embed_path)
        entity_model = (base_emb_vocab, base_emb_matrix)
    print("Building embedding matrix from base embedding at %s..." % base_embed_path)

    cnt_oov = 0
    train_vocab_emb = np.zeros(shape=(len(train_vocab), embed_size))
    test_vocab_emb = np.zeros(shape=(len(test_vocab), embed_size))
    print("train vocab size: %d, test vocab size: %d" %
          (len(train_vocab), len(test_vocab)))

    for word in train_vocab:
        wid = train_vocab[word]
        # Padded words are embedded as all-zero vectors.
        if wid != PAD_WORD_INDEX:
            emb = get_word_vector(entity_model, word)
            if emb is None:
                cnt_oov += 1
                emb = np.random.rand(embed_size).astype(np.float32) * 0.1
                f.write(word + '\n')
            train_vocab_emb[wid] = emb

    for word in test_vocab:
        # Test-vocab ids are offset by the train-vocab size.
        wid = test_vocab[word] - len(train_vocab)
        emb = get_word_vector(entity_model, word)
        if emb is None:
            cnt_oov += 1
            emb = np.random.rand(embed_size).astype(np.float32) * 0.1
            f.write(word + '\n')
        # print(word, test_vocab[word], wid)
        test_vocab_emb[wid] = emb

    print('OOV words: %d' % cnt_oov)
    f.close()
    return train_vocab_emb, test_vocab_emb
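# The lookup itself is delegated to a `get_word_vector(entity_model, word)` helper that
# is not shown in this snippet. Below is a minimal sketch of such a helper, assuming it
# returns None for OOV words and accepts either a gensim KeyedVectors model or the
# (vocab_dict, matrix) tuple built in the `else` branch above; the name
# `get_word_vector_sketch` is hypothetical.
def get_word_vector_sketch(entity_model, word):
    if isinstance(entity_model, gensim.models.KeyedVectors):
        # gensim path: membership test avoids a KeyError on OOV words.
        return entity_model[word] if word in entity_model else None
    # (vocab, matrix) path: the JSON vocab maps word -> row index in the .npy matrix.
    vocab, matrix = entity_model
    if word in vocab:
        return matrix[vocab[word]]
    return None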
def get_sample(model, data, index, FLAGS=FLAGS):
    """Turn one tokenized sample into (embeddings, one-hot labels, name, length)."""
    sample = data[index]
    word = sample['word']
    stem = sample['stem']
    bin = sample['bin']    # per-token binary labels
    name = sample['name']
    eos = utils.get_eos_vector()

    fw = np.ndarray(shape=(0, 1, FLAGS.embedding_size), dtype=float)
    labels = np.ndarray(shape=(0, 1, FLAGS.class_size), dtype=float)
    eos_ex = np.ndarray(shape=(0, 1, FLAGS.embedding_size), dtype=float)

    l = len(word)
    # Embed each token through the project-level utils helper.
    for i in range(l):
        _word = word[i]    # [2:len(word[i]) - 1]
        _stem = stem[i]    # [2:len(stem[i]) - 1]
        vector = utils.get_word_vector(model, _word, _stem)
        fw = np.append(fw, vector, axis=0)

    # Pad up to FLAGS.max_len with end-of-sentence vectors and label 0.
    for i in range(l, FLAGS.max_len):
        eos_ex = np.append(eos_ex, eos, axis=0)
        bin.append(0)
    fw = np.append(fw, eos_ex, axis=0)

    # One-hot encode the binary labels (FLAGS.class_size is expected to be 2).
    for i in range(FLAGS.max_len):
        if bin[i] == 0:
            x = np.array([[[1, 0]]])
        elif bin[i] == 1:
            x = np.array([[[0, 1]]])
        labels = np.append(labels, x, axis=0)

    # print(d.shape, e.shape, labels.shape)
    return fw, labels, name, np.array([l])
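# Hypothetical usage of get_sample: collect several samples into batch tensors. The
# `model`, `data`, and `indices` objects come from the surrounding project; this
# `build_batch_sketch` helper is illustrative only.
def build_batch_sketch(model, data, indices):
    batch_x, batch_y, names, lengths = [], [], [], []
    for idx in indices:
        fw, labels, name, length = get_sample(model, data, idx)
        batch_x.append(fw)        # (max_len, 1, embedding_size)
        batch_y.append(labels)    # (max_len, 1, class_size)
        names.append(name)
        lengths.append(length[0])
    # Stack to (batch, max_len, embedding_size) and (batch, max_len, class_size).
    return (np.stack(batch_x).squeeze(2),
            np.stack(batch_y).squeeze(2),
            names,
            np.array(lengths))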
def build_vocab(self, vocab_path, vocab_size=5000):
    """Keep the most frequent Chinese words that have a pretrained embedding,
    assign them ids (0 is reserved for UNK), and write them to vocab_path."""
    sorted_words = sorted(self.words_counts.items(), key=lambda x: x[1], reverse=True)
    filter_words = list(filter(lambda x: is_ch(str(x[0])), sorted_words))

    # save vocabulary
    write = open(vocab_path, 'w', encoding='utf-8')
    self.words2id['UNK'] = 0
    self.id2words[0] = 'UNK'
    index = 1
    for item in filter_words:
        word = item[0]
        word_embedding = get_word_vector(word)
        # Skip words without a pretrained embedding (assumes get_word_vector
        # returns None or an empty list), so the written vocabulary stays
        # consistent with words2id.
        if not word_embedding:
            continue
        write.write(word + '\n')
        self.words2id[word] = index
        self.id2words[index] = word
        self.word_embedding[word] = word_embedding
        index += 1
        if index == vocab_size:
            break
    write.close()
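# build_vocab is a method; the class it belongs to is not shown here. Below is a
# minimal sketch of the state it relies on (the class name `VocabBuilderSketch` is
# hypothetical), with word counts gathered by collections.Counter.
from collections import Counter

class VocabBuilderSketch:
    def __init__(self, tokenized_docs):
        # Attributes read and written by build_vocab.
        self.words_counts = Counter(w for doc in tokenized_docs for w in doc)
        self.words2id = {}
        self.id2words = {}
        self.word_embedding = {}

    build_vocab = build_vocab  # reuse the function defined above as a method

# e.g. VocabBuilderSketch([["你好", "世界"], ["你好"]]).build_vocab("vocab.txt")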
def construct_vocab_emb(data_path, train_vocab, test_vocab, embed_size, qrepr,
                        base_embed_path):
    """Variant of construct_vocab_emb that only builds 300-d word embeddings
    (skipped for character-level representations) and reports train/test OOV
    counts separately."""
    train_vocab_emb, test_vocab_emb = None, None
    if not os.path.exists(data_path):
        os.makedirs(data_path)
    f = open('%s/OOV_word.txt' % data_path, 'w')

    if embed_size == 300 and not qrepr == "char":
        print('Load base word embeddings...')
        if base_embed_path.endswith("GoogleNews-vectors-negative300.bin.gz") or \
                base_embed_path.endswith("tweet_vector_0401.bin"):
            entity_model = gensim.models.KeyedVectors.load_word2vec_format(
                base_embed_path, binary=True, unicode_errors="ignore")
        else:
            base_emb_vocab = json.load(
                open(base_embed_path.replace("_emb", "").replace(".npy", ".json")))
            base_emb_matrix = np.load(base_embed_path)
            entity_model = (base_emb_vocab, base_emb_matrix)
        print("Building embedding matrix from base embedding at %s..." % base_embed_path)

        cnt_oov_train, cnt_oov_test = 0, 0
        train_vocab_emb = np.zeros(shape=(len(train_vocab), embed_size))
        test_vocab_emb = np.zeros(shape=(len(test_vocab), embed_size))
        print("train vocab size: %d, test vocab size: %d" %
              (len(train_vocab), len(test_vocab)))

        for word in train_vocab:
            wid = train_vocab[word]
            # Padded words are embedded as all-zero vectors.
            if wid != PAD_WORD_INDEX:
                emb = get_word_vector(entity_model, word)
                if emb is None:
                    cnt_oov_train += 1
                    emb = np.random.rand(embed_size).astype(np.float32) * 0.1
                    # Alternative random initializations that were tried:
                    # emb = emb * 2 - 1
                    # emb = emb * 0.1 - 0.05
                    # emb = emb * 0.5 - 0.25
                    # mu, sigma = 0, 0.2
                    # emb = np.random.normal(mu, sigma, embed_size).astype(np.float32)
                    f.write(word + '\n')
                train_vocab_emb[wid] = emb

        for word in test_vocab:
            # Test-vocab ids are offset by the train-vocab size.
            wid = test_vocab[word] - len(train_vocab)
            emb = get_word_vector(entity_model, word)
            if emb is None:
                cnt_oov_test += 1
                emb = np.random.rand(embed_size).astype(np.float32) * 0.1
                f.write(word + '\n')
            # print(word, test_vocab[word], wid)
            test_vocab_emb[wid] = emb

        print('OOV words in Train Set: {}, OOV words in Test Set: {}'.format(
            cnt_oov_train, cnt_oov_test))

    f.close()
    return train_vocab_emb, test_vocab_emb
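# Hypothetical call to the variant above. The vocab dicts map word -> id (test ids
# continue after the train ids, and PAD_WORD_INDEX is assumed here to be 0); the paths
# and words are illustrative only. The resulting matrices would typically be saved and
# later loaded as the initial weights of an embedding layer.
if __name__ == '__main__':
    train_vocab = {'<PAD>': 0, 'hello': 1, 'world': 2}
    test_vocab = {'unseen': 3}
    train_emb, test_emb = construct_vocab_emb(
        'data/cache', train_vocab, test_vocab, embed_size=300, qrepr='word',
        base_embed_path='GoogleNews-vectors-negative300.bin.gz')
    np.save('data/cache/train_vocab_emb.npy', train_emb)
    np.save('data/cache/test_vocab_emb.npy', test_emb)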