def build_emb_matrix(corpus):
    """Build and persist the embedding matrix and the word2id vocabulary.

    Args:
        corpus: iterable of token lists (one list per document/line).

    Side effects:
        Saves the embedding matrix to ``config.emb_matrix_fpath`` (numpy
        binary) and writes ``word\\tid`` lines to ``config.word2id_fpath``.
    """
    # Flatten the per-document token lists eagerly. The original used
    # `_ = map(lambda x: corpus_.extend(x), corpus)`, which is a silent
    # no-op under Python 3 because `map` is lazy and never consumed.
    all_tokens = []
    for tokens in corpus:
        all_tokens.extend(tokens)
    word2id = NlpUtil.build_word2id(all_tokens)
    word2vec = NlpUtil.load_word2vec(config.word2vec_fpath)
    emb_matrix = NlpUtil.build_emb_matrix(
        word2vec, config.embedding_size, word2id)
    np.save(config.emb_matrix_fpath, emb_matrix)
    with codecs.open(config.word2id_fpath, 'w', 'utf-8') as out_f:
        # .items() works on both py2 and py3, unlike .iteritems().
        out_f.write('\n'.join(
            '%s\t%d' % (k, v) for k, v in word2id.items()))
    print('Build emb_matrix done')
def process_dialog(cls, msg, use_task=True):
    """Dialog strategy: let the sub-task handler answer first; when it
    yields nothing, fall back to retrieval search or the seq2seq
    generator, whichever scores better.
    """
    task_response = None
    if use_task:
        task_response, cls.dialog_status = TaskCore.task_handle(
            msg, cls.dialog_status)

    # A very short message with enough history is treated as a
    # context-retrieval ("cr") query built from the last user turns;
    # otherwise it is a plain question-answer ("qa") query.
    context = cls.dialog_status.context
    if len(context) >= 3 and ch_count(msg) <= 4:
        msg = "<s>".join(context[::2][-3:])
        mode = "cr"
    else:
        mode = "qa"

    tokens = NlpUtil.tokenize(msg, True)
    search_response, sim_score = SearchCore.search(tokens, mode=mode)
    seq2seq_response = cls._predict_via_seq2seq(tokens)
    log_print("search_response=%s" % search_response)
    log_print("seq2seq_response=%s" % seq2seq_response)

    # Priority: task answer, then a confident retrieval hit, then seq2seq.
    if task_response:
        return task_response
    if sim_score >= 1.0:
        return search_response
    return seq2seq_response
def precess_line(line, is_train_data=True):
    """Normalize one raw CSV line into a tab-separated record.

    Input format: ``id,text`` or, for training data, ``id,text,label``.
    The text is tokenized and wrapped in <s> ... </s> markers, with
    tokens joined by '|'.

    Args:
        line: raw input line.
        is_train_data: whether the line carries a trailing label column.

    Returns:
        ``id\\ttext[\\tlabel]\\n`` on success, or None when parsing fails.
    """
    try:
        line = line.strip()
        if is_train_data:
            # Training rows end with ",<label>"; peel the label off first.
            line, flag = line.rsplit(',', 1)
        id_, text = line.split(',', 1)
        # '|' is the token delimiter and '\t' the field delimiter of the
        # output record, so scrub them out of the raw text.
        text = text.replace('|', ' ').replace('\t', ' ')
        text = '|'.join(['<s>'] + NlpUtil.tokenize(text, True) + ['</s>'])
        if is_train_data:
            return '\t'.join([id_, text, flag]) + '\n'
        return '\t'.join([id_, text]) + '\n'
    except Exception as e:
        # Bug fix: the original passed the %s placeholders as extra print
        # arguments instead of formatting them into the message.
        print('line=%s, errmsg=%s' % (line, e))
        return None
def info_supply_handle(msg, dialog_status):
    """Handle an information-supplement turn by re-searching the user's
    previous message (context[-3]) against the info-supply pattern.

    NOTE(review): `msg` itself is intentionally unused here — the search
    is driven by the earlier context turn.
    """
    prev_user_msg = dialog_status.context[-3]
    tokens = NlpUtil.tokenize(prev_user_msg, True)
    response, _score = SearchCore.search(tokens, info_supply_pattern)
    return response
def build_emb_matrix(corpus):
    """Build and persist the embedding matrix and the word2id vocabulary.

    Args:
        corpus: iterable of token lists (one list per document/line).

    Side effects:
        Saves the embedding matrix to ``config.emb_matrix_fpath`` (numpy
        binary) and writes ``word\\tid`` lines to ``config.word2id_fpath``.
    """
    # Flatten the per-document token lists eagerly. The original used
    # `_ = map(lambda x: corpus_.extend(x), corpus)`, which is a silent
    # no-op under Python 3 because `map` is lazy and never consumed.
    all_tokens = []
    for tokens in corpus:
        all_tokens.extend(tokens)
    word2id = NlpUtil.build_word2id(all_tokens)
    word2vec = NlpUtil.load_word2vec(config.word2vec_fpath)
    emb_matrix = NlpUtil.build_emb_matrix(
        word2vec, config.embedding_size, word2id)
    np.save(config.emb_matrix_fpath, emb_matrix)
    with codecs.open(config.word2id_fpath, 'w', 'utf-8') as out_f:
        # .items() works on both py2 and py3, unlike .iteritems().
        out_f.write('\n'.join(
            '%s\t%d' % (k, v) for k, v in word2id.items()))
    print('Build emb_matrix done')


if __name__ == '__main__':
    # Tokenize raw train/predict data into the processed corpus files.
    tokenize_corpus(config.raw_train_fpath, config.train_fpath,
                    is_train_data=True)
    tokenize_corpus(config.raw_predict_fpath, config.predict_fpath,
                    is_train_data=False)
    corpus = _get_corpus()
    # Train word2vec on the tokenized corpus.
    NlpUtil.train_word2vec(corpus, './model/word2vec')
    # Build and persist the embedding matrix.
    build_emb_matrix(corpus)