def to_vocab(data, frequency_cutoff=None, size_cutoff=None):
    """Build a ``Vocabulary`` from an iterable of tokenized sentences.

    Exactly one of the two cutoffs must be supplied:

    Args:
        data: iterable of sentences, each an iterable of word tokens.
        frequency_cutoff: keep only words occurring strictly more than
            this many times.
        size_cutoff: keep only the ``size_cutoff`` most common words.

    Returns:
        A masked ``Vocabulary`` containing '<START>', '<END>', and the
        surviving words.

    Raises:
        Exception: if both cutoffs or neither cutoff is provided.
    """
    # Compare against None explicitly: a truthiness-based xor would
    # misclassify a legitimate falsy cutoff (e.g. frequency_cutoff=0)
    # as "not provided".
    if (frequency_cutoff is None) == (size_cutoff is None):
        raise Exception("one or the other cutoffs please")

    counter = Counter(word for sent in data for word in sent)

    if frequency_cutoff is not None:
        print("Using a frequency of {} to reduce vocabulary size.".format(
            frequency_cutoff))
        # Strictly-greater-than comparison: words at exactly the cutoff
        # frequency are dropped.
        words = [word for word, count in counter.most_common()
                 if count > frequency_cutoff]
    else:
        print("Using a cutoff of {} to reduce vocabulary size.".format(
            size_cutoff))
        words = [word for word, _ in counter.most_common(size_cutoff)]
    print("Vocabulary size reduced. {} -> {}".format(len(counter), len(words)))

    vocab = Vocabulary(use_mask=True)
    vocab.add_many(['<START>', "<END>"])
    vocab.add_many(words)
    return vocab
def make_hash_embeddings(igor, vocab):
    """Build hashed-subword embeddings from a GloVe vector file.

    Each word in ``vocab`` is decomposed into hash pieces via
    ``hash_word``; every piece gets its own row in the embedding matrix,
    and a word's GloVe vector is distributed evenly across its pieces.
    Pieces never seen in the GloVe file are initialized with
    glorot-uniform noise.

    Side effects: writes ``hash_embedding_<glove>.vocab`` (current
    directory, via ``Vocabulary.save``), and ``hash_embedding_<glove>.npy``
    plus ``word2hash.json`` under ``igor.save_dir``.

    Args:
        igor: config object; reads ``target_glove`` (path to GloVe text
            file), ``embedding_size``, and ``save_dir``.
        vocab: ``Vocabulary`` mapping word -> integer id.
    """
    assert os.path.exists(igor.target_glove), "You need to specify a real file"

    # Hash-piece vocabulary, with mask and unk reserved first.
    hash_vocab = Vocabulary()
    hash_vocab.use_mask = True
    hash_vocab.add(hash_vocab.mask_symbol)
    hash_vocab.add(hash_vocab.unk_symbol)

    # word id -> ids of that word's hash pieces.
    word2hash = {}
    for word, v_id in vocab.items():
        word2hash[v_id] = hash_vocab.add_many(hash_word(word))

    embeddings = np.zeros((len(hash_vocab), igor.embedding_size))
    remaining_vocab = set(vocab.keys())
    remaining_hashes = set(hash_vocab.values())

    # Use a context manager so the GloVe file is always closed (the
    # original leaked the handle). readlines() keeps tqdm's total count.
    with open(igor.target_glove) as glove_file:
        fileiter = glove_file.readlines()

    for line in tqdm(fileiter):
        fields = line.replace("\n", "").split(" ")
        word, nums = fields[0], [float(x.strip()) for x in fields[1:]]
        if word in remaining_vocab:
            hash_ids = word2hash[vocab[word]]
            remaining_vocab.remove(word)
            remaining_hashes.difference_update(hash_ids)
            # Split the word's vector evenly across its hash pieces.
            embeddings[hash_ids] += np.array(nums) / len(hash_ids)

    print("{} words were not seen. {} hashes were not seen".format(
        len(remaining_vocab), len(remaining_hashes)))

    # Unseen hash pieces get random (glorot-uniform) initialization.
    for hash_id in remaining_hashes:
        embeddings[hash_id] = np.asarray(
            glorot_uniform((igor.embedding_size,)).eval())

    glove_name = igor.target_glove[igor.target_glove.find("glove"):].replace("/", "")
    hash_vocab.save('hash_embedding_{}.vocab'.format(glove_name))
    with open(path.join(igor.save_dir,
                        "hash_embedding_{}.npy".format(glove_name)), "wb") as fp:
        np.save(fp, embeddings)
    # NOTE(review): the original called .format(glove_name) on a string with
    # no placeholder, so glove_name was never inserted; the on-disk filename
    # "word2hash.json" is preserved exactly as before.
    with open(path.join(igor.save_dir, "word2hash.json"), "w") as fp:
        json.dump(word2hash, fp)