def _collect_vocab(self, lines):
    def insert(counter, obj):
        if obj in counter:
            counter[obj] += 1
        else:
            counter[obj] = 1
        return counter

    word_counter = dict()
    char_counter = dict()
    for n, line in enumerate(lines, start=1):
        if not n % 10000:
            utils.verbose('processing no.{} lines'.format(n))
        words = self.cutter.cut(line)
        for word in words:
            if word.startswith('{{') and word.endswith('}}'):
                # collapse slot templates such as '{{name:...}}' into '<name>'
                new_word = '<' + word.split(':')[0][2:] + '>'
                word_counter = insert(word_counter, new_word)
                char_counter = insert(char_counter, new_word)
            else:
                word_counter = insert(word_counter, word)
                for char in word:
                    char_counter = insert(char_counter, char)
    # return both vocabularies as key lists sorted by descending frequency
    word_counter = sorted(word_counter, key=word_counter.get, reverse=True)
    char_counter = sorted(char_counter, key=char_counter.get, reverse=True)
    return word_counter, char_counter
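# Illustrative note (hypothetical input, assuming the cutter keeps a whole
# '{{slot:value}}' template as one token): for the line '你好 {{name:张三}}',
# the word counts become {'你好': 1, '<name>': 1} and the char counts become
# {'你': 1, '好': 1, '<name>': 1}, i.e. slot templates are collapsed into a
# single '<name>'-style token in both the word and char vocabularies.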
def _reconstruct_args(args):
    hp_mode, hparams = args.hparams.split('_')
    args.tokenizer = Tokenizer
    if hparams == 'lstm':
        original = hparams_utils.lstm()
    elif hparams == 'gru':
        original = hparams_utils.gru()
    elif hparams == 'lstmln':
        original = hparams_utils.lstm_ln()
    elif hparams == 'lstmrcnn':
        original = hparams_utils.lstm_rcnn()
    else:
        raise ValueError('Unknown hparams: {}'.format(hparams))
    if hp_mode == 'solo':
        args.batch = data.SoloBatch
        args.model = model.SoloModel
        args.max_lens = [original.x_max_len, original.y_max_len]
    elif hp_mode == 'penta':
        args.batch = data.PentaBatch
        args.model = model.PentaModel
        args.max_lens = [original.y_max_len, original.y_max_len]
    else:
        raise ValueError('Unknown hp_mode: {}'.format(hp_mode))
    for k, v in original.__dict__.items():
        if not k.startswith('_'):
            utils.verbose('add attribute {} [{}] to hparams'.format(k, v))
            setattr(args, k, v)
    return args
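# Hypothetical examples of the expected hparams string, '<mode>_<cell>' joined
# by a single underscore (compound cell names drop theirs):
#   'solo_lstm'    -> SoloBatch / SoloModel with hparams_utils.lstm()
#   'penta_lstmln' -> PentaBatch / PentaModel with hparams_utils.lstm_ln()
# A value such as 'penta_lstm_ln' would break the two-way split above.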
def process(args):
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_x = utils.read_lines(args.path['train_x'])
    train_y = utils.read_lines(args.path['train_y'])
    dataset = train_x + train_y
    keywords = None
    if args.problem == 'lda':
        model = LDAModel(args)
    else:
        trainset = [tokenizer.encode_line_into_words(i) for i in dataset]
        train_keywords(trainset, args.path['model'])
        keywords = load_keywords(args.path['model'])
        model = TFIDFModel(args)
    list_toks = []
    for n, line in enumerate(train_x):
        if not n % 10000 and n:
            utils.verbose('Tokenizing {} lines for {}'.format(n, args.problem))
        if keywords is None:
            list_toks.append(
                [str(s) for s in tokenizer.encode_line_into_words(line)])
        else:
            list_toks.append(
                [str(s) for s in tokenizer.encode_line_into_words(line)
                 if s in keywords[: args.num_keywords]])
    model.fit(list_toks)
def split_dialogues(dialogues, train_dev_ratio=10):
    random.shuffle(dialogues)
    divider = int(len(dialogues) / train_dev_ratio)
    dev_dialogues = dialogues[: divider]
    train_dialogues = dialogues[divider:]
    utils.verbose('train set #: {}'.format(len(dialogues) - divider))
    utils.verbose('dev set #: {}'.format(divider))
    return train_dialogues, dev_dialogues
def _set_vocab(self, data, word_size, char_size):
    self.word_counter, self.char_counter = self._collect_vocab(data)
    self.words = copy_head + list(self.word_counter)[: word_size - len(copy_head)]
    self.chars = copy_head + list(self.char_counter)[: char_size - len(copy_head)]
    # the hard-coded +3 presumably matches len(copy_head)
    utils.verbose('real words: {}, final words: {}'.format(
        len(self.word_counter) + 3, len(self.words)))
    utils.verbose('real chars: {}, final chars: {}'.format(
        len(self.char_counter) + 3, len(self.chars)))
    self._set_dict()
def load_keywords(model_dir):
    path = os.path.join(model_dir, 'keywords.txt')
    idf_freq = {}
    utils.verbose('loading keywords from {}'.format(path))
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            word, freq = line.strip().split(' ')
            idf_freq[int(word)] = float(freq)
    # token ids ranked by ascending IDF (most frequent tokens first)
    keywords = sorted(idf_freq, key=idf_freq.get)
    return keywords
def build_vocab(self, data, token_limits, files):
    """
    Build word and char vocabularies with limited sizes and write them into files
    :param data: list of lines
    :param token_limits: (word_limit_size, char_limit_size)
    :param files: (word_file_path, char_file_path)
    :return:
    """
    self._set_vocab(data, token_limits[0], token_limits[1])
    utils.write_lines(files[0], self.words)
    utils.verbose(
        'words have been dumped in {}'.format(os.path.abspath(files[0])))
    utils.write_lines(files[1], self.chars)
    utils.verbose(
        'chars have been dumped in {}'.format(os.path.abspath(files[1])))
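# A minimal usage sketch (file names and limits below are made up for
# illustration):
#   tokenizer = Tokenizer()
#   lines = utils.read_lines('corpus.txt')
#   tokenizer.build_vocab(lines, (40000, 4000), ('words.txt', 'chars.txt'))
# This keeps at most 40000 word tokens and 4000 char tokens by frequency, each
# list prefixed with the copy_head special tokens, and dumps them to the two
# vocab files.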
def train_keywords(data, model_dir):
    path = os.path.join(model_dir, 'keywords.txt')
    vocab_counter = {}
    i = 0
    for line in data:
        for word in line:
            if word in vocab_counter:
                vocab_counter[word] += 1
            else:
                vocab_counter[word] = 1
        if not i % 10000 and i:
            utils.verbose('processing {} lines'.format(i))
        i += 1
    with open(path, 'w', encoding='utf-8') as f:
        for key, value in vocab_counter.items():
            f.write(str(key) + ' ' + str(math.log(i / value, 2)) + '\n')
    utils.verbose('keywords are saved in {}'.format(path))
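# The value written for each token is an IDF-style weight, log2(N / count),
# where N is the number of lines seen and count is the token's total number of
# occurrences over the corpus (counted per occurrence, not per line). With
# made-up numbers, N = 1000 lines and a token seen 125 times gives
# log2(1000 / 125) = 3.0, so the line '<token_id> 3.0' is written.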
def build_qa(dialogues, directory, prefix='train', mode='qaqaq'):
    q_path = os.path.join(directory, prefix + '_q.txt')
    a_path = os.path.join(directory, prefix + '_a.txt')
    counter = 0
    with open(q_path, 'w', encoding='utf-8') as fq:
        with open(a_path, 'w', encoding='utf-8') as fa:
            for dial in dialogues:
                content, sent_by = zip(*dial)
                full = ''.join(sent_by)
                for i in re.finditer(r'(?={})'.format(mode + 'a'), full):
                    question = '<s>'.join(
                        content[i.start(): i.start() + len(mode)]) + '<s>'
                    answer = content[i.start() + len(mode)]
                    fq.write(question + '\n')
                    fa.write(answer + '\n')
                    counter += 1
                    if counter % 10000 == 0:
                        utils.verbose('store {} lines for {} set'.format(
                            counter, prefix))
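# Worked example (hypothetical dialogue): with the default mode 'qaqaq', the
# lookahead regex scans sent_by for 'qaqaqa'. If a dialogue's sent_by string is
# 'qaqaqaqa', matches start at indices 0 and 2, producing two pairs: each
# question is the five turns content[i:i+5] joined by '<s>' (plus a trailing
# '<s>'), and the answer is the following turn content[i+5].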
def fit(self, list_toks):
    utils.verbose('Start training tfidf dictionary')
    self.dict = corpora.Dictionary(list_toks)
    utils.verbose('Start building tfidf corpus')
    self.corpus = [self.dict.doc2bow(toks) for toks in list_toks]
    utils.verbose('Start training tfidf model')
    self.model = models.TfidfModel(self.corpus)
    utils.verbose('Start saving tfidf dictionary and model')
    self.model.save(self.paths['model'])
    self.dict.save(self.paths['dict'])
    utils.verbose('Start building tfidf index')
    self.index = similarities.SparseMatrixSimilarity(
        self.model[self.corpus], num_features=len(self.dict.dfs))
    # self.index = similarities.MatrixSimilarity(self.model[self.corpus])
    self.index.save(self.paths['index'])
def __init__(self, files=None):
    """
    Character-based tokenizer augmented with a word vocabulary
    :param files: [word_file_path, char_file_path]
    """
    self.word_counter = {}
    self.char_counter = {}
    if files is not None:
        self.words = utils.read_lines(files[0])
        self.chars = utils.read_lines(files[1])
        utils.verbose('loading words from file {} with word size {}'.format(
            files[0], self.word_size))
        utils.verbose('loading chars from file {} with char size {}'.format(
            files[1], self.char_size))
    else:
        self.words = []
        self.chars = []
    self.cutter = SubCutter()
    self.word_dict = dict()
    self.char_dict = dict()
    self._set_dict()
    self.PAD_ID = 0
    self.UNK_ID = 1
    self.EOS_ID = 2
def load(self):
    if all([os.path.exists(i) for i in self.paths.values()]):
        self.model = models.LdaMulticore.load(self.paths['model'])
        utils.verbose('load lda model from {}'.format(self.paths['model']))
        self.dict = corpora.Dictionary.load(self.paths['dict'])
        utils.verbose('load lda dictionary from {}'.format(
            self.paths['dict']))
        self.ann = AnnoyIndex(self.vec_dim)
        self.ann.load(self.paths['ann'])
        utils.verbose('load lda annoy from {}'.format(self.paths['ann']))
    else:
        raise ValueError('Missing files under directory {}'.format(
            self.model_dir))
def load(self):
    if all([os.path.exists(i) for i in self.paths.values()]):
        self.model = models.TfidfModel.load(self.paths['model'])
        utils.verbose('Load tfidf model from {}'.format(
            self.paths['model']))
        self.dict = corpora.Dictionary.load(self.paths['dict'])
        utils.verbose('Load tfidf dictionary from {}'.format(
            self.paths['dict']))
        self.index = similarities.SparseMatrixSimilarity.load(
            self.paths['index'])
        # self.index = similarities.MatrixSimilarity.load(self.paths['index'])
        utils.verbose('Load tfidf index from {}'.format(
            self.paths['index']))
    else:
        raise ValueError('Missing files under directory {}'.format(
            self.model_dir))
def process(args):
    utils.make_directory(args.path['model'])
    tokenizer = args.tokenizer(args.path['vocab'])
    train_batch = args.batch(tokenizer, args.max_lens)
    train_batch.set_data(utils.read_lines(args.path['train_x']),
                         utils.read_lines(args.path['train_y']))
    dev_batch = args.batch(tokenizer, args.max_lens)
    dev_batch.set_data(utils.read_lines(args.path['dev_x']),
                       utils.read_lines(args.path['dev_y']))
    model = args.model(args)
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_device
    config = tf.ConfigProto()
    config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver(pad_step_number=True)
        recorder = Recorder()
        starter = time.time()
        for i in range(args.max_steps):
            input_x, input_y, idx, update_epoch = train_batch.next_batch(
                args.batch_size, recorder.train_idx)
            train_features = {
                'input_x_ph': input_x,
                'input_y_ph': input_y,
                'keep_prob_ph': args.keep_prob
            }
            recorder.train_idx = idx
            train_fetches, train_feed = model.train_step(train_features)
            _, train_loss, train_acc = sess.run(train_fetches, train_feed)
            recorder.train_losses.append(train_loss)
            recorder.train_accs.append(train_acc)
            if not i % args.show_steps and i:
                input_x, input_y, idx, update_epoch = dev_batch.next_batch(
                    args.batch_size, recorder.dev_idx)
                dev_features = {
                    'input_x_ph': input_x,
                    'input_y_ph': input_y,
                    'keep_prob_ph': 1.0
                }
                recorder.dev_idx = idx
                dev_fetches, dev_feed = model.dev_step(dev_features)
                dev_loss, dev_acc = sess.run(dev_fetches, dev_feed)
                recorder.dev_losses.append(dev_loss)
                recorder.dev_accs.append(dev_acc)
                speed = args.show_steps / (time.time() - starter)
                utils.verbose(
                    r' step {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}] | speed {:.5f} it/s'.format(
                        i, train_loss, train_acc, dev_loss, dev_acc, speed))
                starter = time.time()
            if not i % args.save_steps and i:
                features = recorder.stats()
                if features['save']:
                    saver.save(sess, args.path['model'])
                utils.verbose(
                    r'step {:05d} - {:05d} | train [{:.5f} {:.5f}] | '
                    r'dev [{:.5f} {:.5f}]'.format(
                        i - args.save_steps, i,
                        features['train_loss'], features['train_acc'],
                        features['dev_loss'], features['dev_acc']))
                print('-+' * 55)
        utils.write_result(args, recorder.lowest_loss)

        utils.verbose('Start building vector space from dual encoder model')
        vectors = []
        infer_batch = args.batch(tokenizer, args.max_lens)
        infer_batch.set_data(utils.read_lines(args.path['train_x']),
                             utils.read_lines(args.path['train_y']))
        starter = time.time()
        idx = 0
        update_epoch = False
        i = 0
        while not update_epoch:
            input_x, input_y, idx, update_epoch = infer_batch.next_batch(
                args.batch_size, idx)
            infer_features = {'input_x_ph': input_x, 'keep_prob_ph': 1.0}
            infer_fetches, infer_feed = model.infer_step(infer_features)
            enc_questions = sess.run(infer_fetches, infer_feed)
            vectors += enc_questions
            if not i % args.show_steps and i:
                speed = args.show_steps / (time.time() - starter)
                utils.verbose('step : {:05d} | speed: {:.5f} it/s'.format(
                    i, speed))
                starter = time.time()
            i += 1
        vectors = np.reshape(np.array(vectors),
                             [-1, args.hidden])[:infer_batch.data_size]
        vec_dim = vectors.shape[-1]
        ann = AnnoyIndex(vec_dim)
        for n, ii in enumerate(vectors):
            ann.add_item(n, ii)
        ann.build(args.num_trees)
        ann.save(args.path['ann'])
        utils.verbose('Annoy index has been dumped in {}'.format(
            args.path['ann']))
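# A rough sketch of how the dumped Annoy index could be queried at serving
# time (encode_question is a hypothetical helper; the dimensionality must
# match the args.hidden used above):
#   ann = AnnoyIndex(args.hidden)
#   ann.load(args.path['ann'])
#   vec = encode_question(sess, model, tokenizer, 'user question')  # hypothetical
#   neighbour_ids = ann.get_nns_by_vector(vec, 10)
# get_nns_by_vector returns the ids of the nearest stored training questions,
# which map back to line numbers in train_x / train_y.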
# coding:utf-8
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function

from src import utils
from src.utils import args_utils
from src.data_utils import data_generator
from src.data_utils import vocab_generator

if __name__ == '__main__':
    hparams = args_utils.minor_args()
    utils.verbose('Start generating data')
    data_generator.process(hparams)
    utils.verbose('Finish generating data')
    utils.verbose('Start generating vocab')
    vocab_generator.process(hparams)
    utils.verbose('Finish generating vocab')
# coding:utf-8
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function

from src import utils
from src.utils import args_utils
from src.dual_encoder import trainer_lib as dual_encoder_trainer
from src.traditional import trainer_lib as traditional_trainer

trainer_index = {
    'dual_encoder': dual_encoder_trainer,
    'tfidf': traditional_trainer,
    'lda': traditional_trainer
}

if __name__ == '__main__':
    hparams = args_utils.major_args()
    if hparams.problem is None:
        raise ValueError('A problem must be specified')
    elif hparams.problem not in trainer_index:
        raise ValueError('Invalid problem: {}'.format(hparams.problem))
    else:
        utils.verbose('Start training problem: {}'.format(hparams.problem))
        trainer_index[hparams.problem].process(hparams)
        utils.verbose('Finish training problem: {}'.format(hparams.problem))
def fit(self, list_toks):
    utils.verbose('start training lda dictionary')
    self.dict = corpora.Dictionary(list_toks)
    utils.verbose('start building lda corpus')
    self.corpus = [self.dict.doc2bow(toks) for toks in list_toks]
    utils.verbose('start training lda model')
    self.model = models.LdaMulticore(self.corpus, self.vec_dim,
                                     id2word=self.dict)
    utils.verbose('start saving lda dictionary and model')
    self.model.save(self.paths['model'])
    self.dict.save(self.paths['dict'])
    utils.verbose('start vectorization for lda')
    self.ann = AnnoyIndex(self.vec_dim)
    for n, toks in enumerate(list_toks):
        if not n % 10000 and n:
            utils.verbose('vectorizing {} lines for lda'.format(n))
        vec = self.get(toks)
        self.ann.add_item(n, vec)
    utils.verbose('start building lda ann')
    self.ann.build(self.num_trees)
    self.ann.save(self.paths['ann'])
    utils.verbose('dump lda annoy into {}'.format(self.paths['ann']))