from gensim.corpora import Dictionary


def build_dict_by_gensim(docs, special_tokens=None):
    if special_tokens is None:
        special_tokens = {}
    dct = Dictionary(docs)
    # drop tokens that appear in fewer than 3 documents, keep everything else
    dct.filter_extremes(no_below=3, no_above=1.0, keep_n=None)
    # pin the special tokens to fixed ids (existing occupants are moved)
    dct.patch_with_special_tokens(special_tokens)
    return dct
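# A minimal usage sketch for build_dict_by_gensim. The toy corpus below is
# illustrative, not from the original code; note that patch_with_special_tokens
# relocates any token already sitting on a requested id, so the special tokens
# always land exactly where the mapping says.
docs = [["the", "cat", "sat"], ["the", "dog", "sat"], ["the", "cat", "ran"]] * 3
dct = build_dict_by_gensim(docs, special_tokens={'<pad>': 0, '<unk>': 1})
assert dct.token2id['<pad>'] == 0 and dct.token2id['<unk>'] == 1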
def build_dictionary():
    letters = []
    with open('data/unicodes.txt', encoding='utf-8') as f:
        for line in f:
            # the character itself sits in the second whitespace-separated column
            c = line.split()[1]
            letters.append(c)
    special_tokens = {
        '<pad>': 0, '<sos>': 1, '<eos>': 2,
        '<ctx>': 3, '<unk>': 4, '<sep>': 5,
    }
    # special_tokens = {'<pad>':0, '<sos>':1, '<eos>':2, '<sep>':3, '<unk>':4}
    dictionary = Dictionary([letters])  # initialize a Dictionary
    dictionary.patch_with_special_tokens(special_tokens)
    return dictionary
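# The layout of data/unicodes.txt is not shown in the original; the sketch
# below guesses a two-column format (code point, then the character) purely
# for illustration, then round-trips a few characters through the dictionary.
import os

os.makedirs('data', exist_ok=True)
with open('data/unicodes.txt', 'w', encoding='utf-8') as f:
    f.write('0041 A\n0042 B\n0043 C\n')

d = build_dictionary()
# characters outside the file ('?') fall back to the patched '<unk>' id
ids = d.doc2idx(list('AB?'), unknown_word_index=d.token2id['<unk>'])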
def test_patch_with_special_tokens(self):
    special_tokens = {'pad': 0, 'space': 1, 'quake': 3}
    corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
    d = Dictionary(corpus)
    self.assertEqual(len(d.token2id), 5)

    d.patch_with_special_tokens(special_tokens)
    self.assertEqual(d.token2id['pad'], 0)
    self.assertEqual(d.token2id['space'], 1)
    self.assertEqual(d.token2id['quake'], 3)
    # 5 original tokens + 3 special tokens
    self.assertEqual(len(d.token2id), 8)
    self.assertNotIn((0, 1), d.doc2bow(corpus[0]))
    self.assertIn((0, 1), d.doc2bow(['pad'] + corpus[0]))

    # 'space' already occurs in the corpus, so patching must move it to id 1
    corpus_with_special_tokens = [["máma", "mele", "maso"], ["ema", "má", "máma", "space"]]
    d = Dictionary(corpus_with_special_tokens)
    self.assertEqual(len(d.token2id), 6)
    self.assertNotEqual(d.token2id['space'], 1)

    d.patch_with_special_tokens(special_tokens)
    self.assertEqual(len(d.token2id), 8)
    self.assertEqual(max(d.token2id.values()), 7)
    self.assertEqual(d.token2id['space'], 1)
    self.assertNotIn((1, 1), d.doc2bow(corpus_with_special_tokens[0]))
    self.assertIn((1, 1), d.doc2bow(corpus_with_special_tokens[1]))
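# Standalone illustration of the remapping behaviour the test above asserts:
# when a requested id is already taken, the incumbent token is moved to a
# fresh id rather than dropped. Toy data, not from the gensim test suite.
d = Dictionary([["a", "b"]])            # two tokens occupy ids 0 and 1
d.patch_with_special_tokens({'pad': 0})
print(d.token2id['pad'])                # -> 0
print(sorted(d.token2id.values()))      # -> [0, 1, 2]: nothing was lost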
for word_document in tqdm(word_documents):
    sws = [sp.tokenize(word) for word in word_document]
    sw_documents.append(list(chain.from_iterable(sws)))
sw_documents_dict[sp_key] = sw_documents

if os.path.exists(config_dic.get("cache_dir")) and not os.path.exists(sp_cache_path):
    print(f"Write cache data. {sp_cache_path}")
    with open(sp_cache_path, "wb") as f:
        f.write(cloudpickle.dumps(sw_documents))

print("=========== Build vocabulary ===========")
special_token_dict = {PADDING: 0, UNKNOWN: 1, START: 2, END: 3}

# word-level vocabulary
word_dic = Dictionary(word_documents)
word_dic.filter_extremes(no_below=5, no_above=1.0, keep_n=None)
word_dic.patch_with_special_tokens(special_token_dict)

# one subword-level vocabulary per subword tokenizer (sp_key)
sw_dicts = {}
for sp_key, sw_documents in sw_documents_dict.items():
    sw_dic = Dictionary(sw_documents)
    sw_dic.filter_extremes(no_below=5, no_above=1.0, keep_n=None)
    sw_dic.patch_with_special_tokens(special_token_dict)
    sw_dicts[sp_key] = sw_dic

# char_dic = Dictionary([[char for word in word_document for char in word] for word_document in word_documents])
# char_dic.filter_extremes(no_below=5, no_above=1.0, keep_n=None)
# char_dic.patch_with_special_tokens(special_token_dict)

print("============= Save Vocabulary ================")
word_dic.save(os.path.join(config_dic.get("vocab_dir"), f"{args.config}.word.dic"))
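# Hedged sketch (toy data, hypothetical path) of the save/load round trip the
# surrounding fragments rely on: Dictionary.save pickles the whole object and
# Dictionary.load restores it, special-token ids included.
dic = Dictionary([["hello", "world"], ["hello", "there"]])
dic.patch_with_special_tokens({'<pad>': 0, '<unk>': 1})
dic.save('/tmp/example.word.dic')                 # hypothetical path
restored = Dictionary.load('/tmp/example.word.dic')
assert restored.token2id == dic.token2id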
    sw_dicts[sp_key] = Dictionary.load(
        os.path.join(config_dic.get("vocab_dir"), f"{args.config}.{sp_key}.dic"))
else:
    special_token_dict = {PADDING: 0, UNKNOWN: 1, START: 2, END: 3}
    word_dic = Dictionary()
    # give every Dictionary its own copy: add_documents mutates token2id in
    # place, so sharing one dict would leak tokens between vocabularies
    word_dic.token2id = dict(special_token_dict)
    # char_dic = Dictionary()
    # char_dic.token2id = dict(special_token_dict)
    sw_dicts = {}
    for sp_key, sp in sps.items():
        _dic = Dictionary()
        _dic.token2id = dict(special_token_dict)
        sw_dicts[sp_key] = _dic

    label_dic = Dictionary(train_label_documents)
    label_dic.patch_with_special_tokens({PADDING: 0})
    label_dic.id2token = {
        _id: label for label, _id in label_dic.token2id.items()
    }

    # add vocabulary
    word_dic.add_documents(train_word_documents)
    # char_dic.add_documents(list(chain.from_iterable(train_char_documents)))
    for sp_key, train_sw_documents in train_sw_documents_dicts.items():
        sw_dicts[sp_key].add_documents(train_sw_documents)

# load GloVe
if config_dic.get("glove_path"):
    print("========= Load Pretrained Word Embeddings ==========")
    word2vec = load_pretrain_embeddings(
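# Why the dict(...) copies above matter — a minimal demonstration with toy
# documents (names here are illustrative): add_documents extends token2id in
# place, so two Dictionary objects sharing one dict would pollute each other.
shared = {'<pad>': 0, '<unk>': 1}
d1, d2 = Dictionary(), Dictionary()
d1.token2id = dict(shared)   # independent copies
d2.token2id = dict(shared)
d1.add_documents([["only", "in", "d1"]])
assert 'only' in d1.token2id and 'only' not in d2.token2id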
import os

import numpy as np
import pandas as pd
import torch
from torchtext.vocab import Vectors

# read_labeled, doc_padding, batch_iter and weight_matrix are project-specific
# helpers assumed to be defined elsewhere in the original repository.


class Dataset(object):
    '''
    Create a dataset for training a supervised model.
    '''

    def __init__(self, config):
        self.config = config
        self.train_data = None
        self.test_data = None
        self.val_data = None
        self.vocab = None
        self.word_embeddings = None

    def get_pandas_df(self, filename):
        '''
        Load the data into a pandas.DataFrame object.
        This will be used to convert the data to a torchtext object.
        '''
        with open(filename, 'r', encoding='utf-8') as datafile:
            data = [line.strip().split(' ', maxsplit=1) for line in datafile]
        data_text = list(map(lambda x: x[1], data))
        data_label = list(map(lambda x: x[0], data))
        full_df = pd.DataFrame({"text": data_text, "label": data_label})
        return full_df

    def load_data(self, train_file, test_file, dataname, embed_file=None, val_file=None):
        '''
        Loads the data from files.
        Sets up iterators for training, validation and test data.
        Also creates the vocabulary and word embeddings based on the data.

        Inputs:
            embed_file (String): absolute path to file containing word embeddings (GloVe/Word2Vec)
            train_file (String): absolute path to training file
            test_file (String): absolute path to test file
            val_file (String): absolute path to validation file
        '''
        voc_file = dataname + '_vocab.txt'
        new_embed = dataname + '_embed.pkl'
        train_X, train_Y = read_labeled(train_file)
        test_X, test_Y = read_labeled(test_file)
        val_X = None
        val_Y = None
        if val_file:
            val_X, val_Y = read_labeled(val_file)
        else:
            # no validation file given: hold out the last 20% of the training data
            sp = int(len(train_X) * 0.8)
            train_X, val_X = train_X[:sp], train_X[sp:]
            train_Y, val_Y = train_Y[:sp], train_Y[sp:]
        train_X = [doc_padding(x, self.config.max_sen_len) for x in train_X]
        test_X = [doc_padding(x, self.config.max_sen_len) for x in test_X]
        val_X = [doc_padding(x, self.config.max_sen_len) for x in val_X]

        # build vocab
        if os.path.isfile(voc_file):
            self.vocab = Dictionary.load_from_text(voc_file)
        else:
            self.vocab = Dictionary(train_X)
            special_tokens = {'<pad>': 0, '<unk>': 1}
            self.vocab.patch_with_special_tokens(special_tokens)
            self.vocab.save_as_text(voc_file)

        # transform words to indices; out-of-vocabulary words map to 1 ('<unk>')
        train_X = [self.vocab.doc2idx(x, 1) for x in train_X]
        test_X = [self.vocab.doc2idx(x, 1) for x in test_X]
        val_X = [self.vocab.doc2idx(x, 1) for x in val_X]

        # load embeddings
        if os.path.isfile(new_embed):
            self.word_embeddings = torch.load(new_embed)
        else:
            embeds = Vectors(embed_file,
                             unk_init=lambda x: torch.Tensor(np.random.normal(scale=0.6, size=x.size())))
            self.word_embeddings = weight_matrix(self.vocab, embeds)
            torch.save(self.word_embeddings, new_embed)

        self.train_data = (train_X, train_Y)
        self.test_data = (test_X, test_Y)
        self.val_data = (val_X, val_Y)
        print("Loaded {} training examples".format(len(train_X)))
        print("Loaded {} test examples".format(len(test_X)))
        print("Loaded {} validation examples".format(len(val_X)))

    def train_iterator(self):
        return batch_iter(*self.train_data, self.config.batch_size)

    def test_iterator(self):
        return batch_iter(*self.test_data, self.config.batch_size, False)

    def val_iterator(self):
        return batch_iter(*self.val_data, self.config.batch_size, False)
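# Minimal usage sketch. Config is hypothetical here, standing in for whatever
# configuration object the project passes around (the class above only reads
# max_sen_len and batch_size), and the file names are placeholders; batch_iter
# is assumed to yield (inputs, labels) batches.
class Config:
    max_sen_len = 50
    batch_size = 32

dataset = Dataset(Config())
dataset.load_data('train.txt', 'test.txt', 'mydata', embed_file='glove.6B.100d.txt')
for batch_X, batch_Y in dataset.train_iterator():
    pass  # feed each padded, index-encoded batch to the model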