def build_vocabulary(self):
    """Build the vocabulary and record it in ``self.batch_data``.

    The source is chosen by ``self.args.emb_source``; when that attribute
    is absent it defaults to ``'scratch'``:

    * ``'pretrain'`` -- calls ``load_vocab_pretrain`` with the paths
      ``data_dir/file_pretrain_vocab`` and ``data_dir/file_pretrain_vec``
      and additionally stores the third returned value under
      ``'pretrain_emb'``.
    * ``'scratch'`` -- calls ``construct_vocab`` on ``data_dir/file_vocab``
      with ``max_vocab_size`` and ``word_minfreq`` from ``self.args``.

    In both cases ``'vocab2id'``, ``'id2vocab'`` and ``'vocab_size'``
    (``len(vocab2id)``) are written to ``self.batch_data`` and the size is
    printed. Any other ``emb_source`` value leaves ``self.batch_data``
    untouched.
    """
    # getattr with a default only absorbs a *missing* attribute. The former
    # bare ``except:`` also swallowed KeyboardInterrupt/SystemExit and any
    # unrelated error raised while reading the option.
    emb_source = getattr(self.args, 'emb_source', 'scratch')

    if emb_source == 'pretrain':
        vocab2id, id2vocab, pretrain_vec = load_vocab_pretrain(
            os.path.join(self.args.data_dir, self.args.file_pretrain_vocab),
            os.path.join(self.args.data_dir, self.args.file_pretrain_vec))
    elif emb_source == 'scratch':
        vocab2id, id2vocab = construct_vocab(
            file_=os.path.join(self.args.data_dir, self.args.file_vocab),
            max_size=self.args.max_vocab_size,
            mincount=self.args.word_minfreq)
    else:
        # NOTE(review): unknown values were silently ignored before and
        # still are; confirm whether callers would prefer an error here.
        return

    vocab_size = len(vocab2id)
    self.batch_data['vocab2id'] = vocab2id
    self.batch_data['id2vocab'] = id2vocab
    if emb_source == 'pretrain':
        # Stored between id2vocab and vocab_size to keep the original
        # key insertion order.
        self.batch_data['pretrain_emb'] = pretrain_vec
    self.batch_data['vocab_size'] = vocab_size
    print('The vocabulary size: {}'.format(vocab_size))
def build_vocabulary(self):
    """Construct the vocabulary from the vocab file and publish it.

    Calls ``construct_vocab`` on ``data_dir/file_vocab`` with the size and
    minimum-count settings taken from ``self.args``, then stores the two
    returned mappings and the vocabulary size in ``self.batch_data`` and
    prints the size.
    """
    cfg = self.args
    vocab_path = os.path.join(cfg.data_dir, cfg.file_vocab)
    word_to_id, id_to_word = construct_vocab(
        file_=vocab_path,
        max_size=cfg.max_vocab_size,
        mincount=cfg.word_minfreq,
    )
    n_words = len(word_to_id)

    # Written one key at a time, in the same order as before.
    entries = (
        ('vocab2id', word_to_id),
        ('id2vocab', id_to_word),
        ('vocab_size', n_words),
    )
    for key, value in entries:
        self.batch_data[key] = value

    print('The vocabulary size: {}'.format(n_words))