def main(args):
    problem_fns = problems.parse_problems(args.problem, registry_problem_fns)
    for n, (k, [fn, on_list]) in enumerate(problem_fns.items(), start=1):
        if on_list:
            utils.verbose('Start processing no.{} problem [{}]'.format(n, k))
            fn.process(args)
            utils.verbose('Finish processing no.{} problem [{}]'.format(n, k))
def _build_vocab(self, data, vocab_size):
    self.words_count = self.collect_vocab(data)
    # head tokens (copy_head) come first, then words ordered by frequency
    self.vocab = copy_head + list(self.words_count)
    if len(self.vocab) > vocab_size:
        self.vocab = self.vocab[:vocab_size]
    utils.verbose('real vocab: {}, final vocab: {}'.format(
        len(self.words_count), self.vocab_size))
    self.build_vocab_dict()
def split_dialogues(dialogues, train_dev_ratio=10):
    # 1/train_dev_ratio of the shuffled dialogues goes to dev, the rest to train
    random.shuffle(dialogues)
    divider = int(len(dialogues) / train_dev_ratio)
    dev_dialogues = dialogues[:divider]
    train_dialogues = dialogues[divider:]
    utils.verbose('train set #: {}'.format(len(dialogues) - divider))
    utils.verbose('dev set #: {}'.format(divider))
    return train_dialogues, dev_dialogues
def merge_hparam(args):
    if args.hparam_set not in registry_hparams:
        raise ValueError('invalid hyperparameter set {}'.format(args.hparam_set))
    hparam = registry_hparams[args.hparam_set]
    for k, v in hparam.__dict__.items():
        # copy every public attribute of the hparam set onto the args namespace
        if not k.startswith('_'):
            utils.verbose('add attribute {} [{}] to hparams'.format(k, v))
            setattr(args, k, v)
    return args
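# Illustrative sketch only: this assumes a registered hparam set is a simple
# object (here a hypothetical class, not from the repo) whose public attributes
# merge_hparam copies onto the argparse namespace, so downstream code reads
# everything from `args`.
import argparse

class ExampleHparams:            # hypothetical entry, for illustration only
    batch_size = 64
    learning_rate = 1e-3

registry_hparams['example'] = ExampleHparams
args = argparse.Namespace(hparam_set='example')
args = merge_hparam(args)        # now args.batch_size == 64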
def __init__(self, vocab_file=None, segment='jieba'):
    self.words_count = dict()
    if vocab_file is not None:
        self.vocab = utils.read_lines(vocab_file)
        utils.verbose(
            'loading vocab from file {} with vocab_size {}'.format(
                vocab_file, self.vocab_size))
    else:
        self.vocab = []
    self.sub_cutter = SubCutter(chinese_seg=segment)
    self.vocab_dict = dict()
    self.build_vocab_dict()
    self.PAD_ID = 0
def main(args):
    problem_fns = problems.parse_problems(args.problem, registry_problem_fns)
    model = problems.parse_model(args.model, registry_models)
    for n, (k, [fn, on_list]) in enumerate(problem_fns.items(), start=1):
        if on_list:
            utils.verbose('Start processing no.{} problem [{}]'.format(n, k))
            # restrict TF to the requested GPU(s) and cap per-process memory usage
            if args.gpu_device != '':
                os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_device
            config = tf.ConfigProto()
            config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory
            with tf.Session(config=config) as sess:
                fn.process(args, model(args), sess)
            utils.verbose('Finish processing no.{} problem [{}]'.format(n, k))
def collect_vocab(self, lines):
    words_count = dict()
    for n, line in enumerate(lines, start=1):
        if not n % 10:
            utils.verbose('processing no.{} line'.format(n))
        tokens = self.sub_cutter.cut(line)
        for token in tokens:
            if token in words_count:
                words_count[token] += 1
            elif token.startswith('{{') and token.endswith('}}'):
                # normalize templated slots like '{{slot:value}}' to a single
                # '<slot>' token so every value shares one vocabulary entry
                new_token = '<' + token.split(':')[0][2:] + '>'
                if new_token in words_count:
                    words_count[new_token] += 1
                else:
                    words_count[new_token] = 1
            else:
                words_count[token] = 1
    # return the words sorted by frequency, most frequent first
    words_count = sorted(words_count, key=words_count.get, reverse=True)
    return words_count
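# Hedged illustration of the slot normalization above (assumption: templated
# tokens in the corpus look like '{{slot:value}}'); not part of the pipeline.
token = '{{name:Alice}}'
if token.startswith('{{') and token.endswith('}}'):
    normalized = '<' + token.split(':')[0][2:] + '>'
    # normalized == '<name>', so every value of the same slot maps to one entry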
def build_qa(dialogues, directory, prefix='train', mode='qaqaq'):
    q_path = os.path.join(directory, prefix + '_q.txt')
    a_path = os.path.join(directory, prefix + '_a.txt')
    counter = 0
    with open(q_path, 'w', encoding='utf-8') as fq, \
            open(a_path, 'w', encoding='utf-8') as fa:
        for dial in dialogues:
            content, sent_by = zip(*dial)
            full = ''.join(sent_by)
            # slide over the sender sequence: every position where mode + 'a'
            # starts yields a multi-turn question and its following answer
            for match in re.finditer(r'(?={})'.format(mode + 'a'), full):
                question = '<s>'.join(
                    content[match.start():match.start() + len(mode)]) + '<s>'
                answer = content[match.start() + len(mode)]
                fq.write(question + '\n')
                fa.write(answer + '\n')
                counter += 1
                if counter % 10000 == 0:
                    utils.verbose('store {} lines for {} set'.format(
                        counter, prefix))
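# Minimal sketch of the 'qaqaq' windowing used by build_qa, on hypothetical
# data: 'q'/'a' mark who sent each utterance, content holds the matching texts.
import re

sent_by = ('q', 'a', 'q', 'a', 'q', 'a')
content = ('hi', 'hello', 'price?', '99', 'ok?', 'yes')
mode = 'qaqaq'
full = ''.join(sent_by)                       # 'qaqaqa'
for m in re.finditer(r'(?={})'.format(mode + 'a'), full):
    question = '<s>'.join(content[m.start():m.start() + len(mode)]) + '<s>'
    answer = content[m.start() + len(mode)]
    # question == 'hi<s>hello<s>price?<s>99<s>ok?<s>', answer == 'yes'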
def process(hparam, model, sess):
    utils.clean_and_make_directory(hparam.model_dir)
    tokenizer = Tokenizer(hparam.vocab_file, segment=hparam.segment)
    train_batch = BaseBatch(
        tokenizer, init_helper(hparam, hparam.train_q, hparam.train_a))
    dev_batch = BaseBatch(
        tokenizer, init_helper(hparam, hparam.dev_q, hparam.dev_a))
    sess.run(tf.global_variables_initializer())
    starter = time.time()
    saver = tf.train.Saver(pad_step_number=True)
    # running statistics: interval losses/accuracies plus the best dev loss so far
    features = {'lowest_loss': 10, 'train_id': 0, 'dev_id': 0}
    features = reset_features(features)
    for i in range(hparam.max_steps):
        train_batch_features = train_batch.next_batch(
            hparam.batch_size, features['train_id'])
        train_fetches, train_feed_dict = model.train_step(train_batch_features)
        features['train_id'] = train_batch_features['idx']
        _, train_loss, train_acc = sess.run(
            train_fetches, feed_dict=train_feed_dict)
        features['train_losses'].append(train_loss)
        features['train_acc'].append(train_acc)
        if i % hparam.show_steps == 0 and i:
            # evaluate one dev batch and report training speed
            dev_fetches, dev_feed_dict = model.dev_step(
                dev_batch.next_batch(hparam.batch_size, features['dev_id']))
            features['dev_id'] += hparam.batch_size
            dev_loss, dev_acc = sess.run(dev_fetches, feed_dict=dev_feed_dict)
            features['dev_losses'].append(dev_loss)
            features['dev_acc'].append(dev_acc)
            speed = hparam.show_steps / (time.time() - starter)
            utils.verbose('step {:05d} | train [{:.5f} {:.5f}] | '
                          'dev [{:.5f} {:.5f}] | speed {:.5f} it/s'.format(
                              i, train_loss, train_acc, dev_loss, dev_acc, speed))
            starter = time.time()
        if i % hparam.save_steps == 0 and i:
            # average the interval statistics and checkpoint on dev improvement
            features = avg_features(features)
            if features['dev_losses'] < features['lowest_loss']:
                saver.save(sess, hparam.model_path)
                features['lowest_loss'] = features['dev_losses']
            utils.verbose('step {:05d} - {:05d} | train [{:.5f} {:.5f}] | '
                          'dev [{:.5f} {:.5f}]'.format(
                              i - hparam.save_steps, i,
                              features['train_losses'], features['train_acc'],
                              features['dev_losses'], features['dev_acc']))
            print('-+' * 55)
            features = reset_features(features)
        if train_batch_features['update_epoch']:
            train_batch.shuffle_data()
            if train_batch.epoch > 10:
                utils.verbose('update epoch and reorder data...')
                train_batch = reorder_batch(hparam, model, sess, train_batch)
    utils.write_result(hparam, features['lowest_loss'])
def build_vocab(self, data, vocab_size, path):
    self._build_vocab(data, vocab_size)
    utils.write_lines(path, self.vocab)
    utils.verbose('vocab has been dumped in {}'.format(
        os.path.abspath(path)))
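# Hedged usage sketch for the Tokenizer vocab flow above (the file names and
# the 4000 size are made up): build a vocabulary from raw lines, dump it, then
# reload it through the constructor for later runs.
tokenizer = Tokenizer(segment='jieba')
tokenizer.build_vocab(utils.read_lines('train_q.txt'), 4000, 'vocab.txt')
tokenizer = Tokenizer(vocab_file='vocab.txt', segment='jieba')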