def parse_dialogs_per_response(sentences, lines, candid_dic, char=1):
    '''
    Parse dialogs provided in the babi tasks format.
    Each non-blank line is 'utterance \t response \t salt';
    a blank line marks the end of a dialog.
    '''
    data = []
    context = []
    u = None
    r = None
    for line in lines:
        line = line.strip()
        if line:
            if '\t' in line:
                try:
                    u, r, salt = line.split('\t')
                except ValueError:
                    print('error and exit', line)
                    exit(-1)
                if config.multi_label:
                    a = [candid_dic[single_r] for single_r in r.split(",")]
                else:
                    if r not in candid_dic:
                        print('warning: candidate is not listed..', r)
                        continue
                    a = candid_dic[r]
                u = tokenize(u, char=char)
                if config.fix_vocab:
                    r = translator.en2cn(r)
                r = tokenize(r, char=char)
                placeholder = salt == 'placeholder'
                if config.fix_vocab:
                    salt = translator.en2cn(salt)
                salt = tokenize(salt, char=char)
                sentences.add(','.join(u))
                sentences.add(','.join(r))
                sentences.add(','.join(salt))
                # temporal encoding, and utterance/response encoding
                data.append((context[:], u[:], a))
                context.append(u)
                context.append(r)
                if not placeholder:
                    context.append(salt)
        else:
            # a blank line separates dialogs: clear the context
            context = []
    sentences.add(config.EMPTY)
    return data
def parse_dialogs_per_response(lines, candid_dic, dmn=False):
    '''
    Parse dialogs provided in the babi tasks format.
    '''
    data = []
    context = []
    u = None
    r = None
    for line in lines:
        line = line.strip()
        if line:
            if '\t' in line:
                try:
                    u, r, salt = line.split('\t')
                except ValueError:
                    print('malformed line, exiting:', line)
                    exit(-1)
                if config.MULTILABEL >= 1:
                    a = [candid_dic[single_r] for single_r in r.split(",")]
                else:
                    if r not in candid_dic:
                        continue
                    a = candid_dic[r]
                u = tokenize(u)
                if config.FIX_VOCAB:
                    r = translator.en2cn(r)
                r = tokenize(r)
                placeholder = salt == 'placeholder'
                if config.FIX_VOCAB:
                    salt = translator.en2cn(salt)
                salt = tokenize(salt)
                # temporal encoding, and utterance/response encoding
                data.append((context[:], u[:], a))
                if dmn:
                    context.append(u)
                    context.append(r + salt)
                else:
                    # $u / $r tag each sentence with its speaker
                    u.append('$u')
                    r.append('$r')
                    salt.append('$r')
                    context.append(u)
                    context.append(r)
                    if not placeholder:
                        context.append(salt)
        else:
            # a blank line separates dialogs: clear the context
            context = []
    return data
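# A minimal, hypothetical run of the parser above. From the split('\t')
# logic, each non-blank line is expected to be utterance \t response \t salt,
# and a blank line ends a dialog. The sample line and candidate map below are
# made up for illustration, and this sketch assumes config.MULTILABEL == 0
# and config.FIX_VOCAB is falsy.
def _parse_demo():
    sample_lines = [
        'which brand do you like\tapi_call_slot_brand\tplaceholder',
        '',
    ]
    sample_candid_dic = {'api_call_slot_brand': 0}
    data = parse_dialogs_per_response(sample_lines, sample_candid_dic)
    # each element is (context_before_this_turn, tokenized_utterance, answer)
    print(data)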
def w2v_local_similarity(self, query1, query2):
    # return the best word2vec similarity between query1 and any
    # candidate in query2, together with that candidate's tokens
    tokens1 = tokenize(query1, 3)
    tokens2 = [tokenize(t, 3) for t in query2]
    max_sim = -10
    _g = query2[0]
    for words2 in tokens2:
        sim = computeSentenceSim(tokens1, words2)
        if sim > max_sim:
            max_sim = sim
            _g = words2
    return float(max_sim), _g
def reply(self, msg):
    line = msg.strip().lower()
    if line == 'clear':
        self.context = []
        reply_msg = 'memory cleared!'
    else:
        inputs = []
        questions = []
        q = tokenize(line, char=2)
        q_vector = [self.w2idx[w] for w in q]
        inp_vector = [[self.w2idx[w] for w in s] for s in self.context]
        inputs.append(inp_vector)
        questions.append(np.vstack(q_vector).astype(np.float32))
        input_lens, sen_lens, max_sen_len = dmn_data_utils.get_sentence_lens(
            inputs)
        q_lens = dmn_data_utils.get_lens(questions)
        # pad to the sizes the trained model expects
        max_input_len = self.model.max_input_len
        max_sen_len = self.model.max_sen_len
        max_q_len = self.model.max_q_len
        inputs = dmn_data_utils.pad_inputs(inputs, input_lens, max_input_len,
                                           "split_sentences", sen_lens,
                                           max_sen_len)
        inputs = np.asarray(inputs)
        questions = dmn_data_utils.pad_inputs(questions, q_lens, max_q_len)
        questions = np.asarray(questions)
        preds = self.model.predict(self.session, inputs, input_lens,
                                   max_sen_len, questions, q_lens)
        preds = preds[0].tolist()
        print(preds)
        r = self.idx2candid[preds[0]]
        reply_msg = r
        # store the (translated, tokenized) response back into memory
        r = translator.en2cn(r)
        r = tokenize(r, char=2)
        self.context.append(r)
    return reply_msg
def similarity(self, query1, query2):
    def cos(embed1, embed2):
        # cosine similarity between two embedding vectors
        num = np.dot(embed1, embed2.T)
        denom = np.linalg.norm(embed1) * np.linalg.norm(embed2)
        return num / denom

    tokens1 = tokenize(query1, 3)
    tokens2 = tokenize(query2, 3)
    embed1 = self.embed(tokens1)
    embed2 = self.embed(tokens2)
    return cos(embed1, embed2)
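# A quick standalone check of the cosine formula used in similarity() above,
# on hand-made vectors so no tokenizer or embedding model is needed: a
# vector against itself scores 1.0, orthogonal vectors 0.0. Assumes numpy
# is already imported as np, as elsewhere in this module.
def _cosine_demo():
    a = np.array([1.0, 0.0])
    b = np.array([0.0, 1.0])
    same = np.dot(a, a.T) / (np.linalg.norm(a) * np.linalg.norm(a))
    orth = np.dot(a, b.T) / (np.linalg.norm(a) * np.linalg.norm(b))
    print(same, orth)  # 1.0 0.0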
def build_vocab_beforehand(vocab_base, vocab_path):
    with open(vocab_base, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    vocab = reduce(lambda x, y: x | y,
                   (set(tokenize(line, char=0)) for line in lines))
    # domain-specific tokens that must be in the vocabulary even if they
    # never occur in vocab_base
    extra_list = [
        'api', 'call', 'slot', 'deny', 'rhetorical', 'general', 'brand',
        'price', 'ac', 'power', 'fr', 'cool_type', 'phone', 'sys', 'feature',
        'color', 'memsize', 'size', 'distance', 'resolution', 'panel',
        'dyson', 'root', 'virtual', 'mode', 'energy_lvl', 'connect', 'net',
        'rmem', 'mmem', 'people', 'vol', 'width', 'height', 'control',
        'olec', 'led', 'vr', 'oled', 'tcl', 'lcd', 'oppo', 'vivo', 'moto',
        '1.5', '2.5', 'plugin'
    ]
    # keep vocab a set while extending it so duplicates cannot slip in
    vocab |= set(extra_list)
    for i in range(100):
        vocab.add('placeholder' + str(i + 1))
    vocab = sorted(vocab)
    print(vocab)
    # index 0 is reserved for the nil/padding word
    w2idx = dict((c, i + 1) for i, c in enumerate(vocab))
    with open(vocab_path, 'w', encoding='utf-8') as f:
        json.dump(vocab, f, ensure_ascii=False)
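# Sketch of how a consumer might reload the vocab file written above and
# rebuild the identical word-to-index map. load_vocab is a hypothetical
# helper, not an existing function in this repo; it assumes json is
# imported, as elsewhere in this module.
def load_vocab(vocab_path):
    with open(vocab_path, 'r', encoding='utf-8') as f:
        vocab = json.load(f)
    # mirror build_vocab_beforehand: index 0 stays reserved for the nil word
    return dict((c, i + 1) for i, c in enumerate(vocab))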
def append_memory(self, m):
    if not m:
        return
    if config.FIX_VOCAB:
        m = translator.en2cn(m)
        print('translated..', m)
    m = tokenize(m)
    m.append('$r')
    self.context.append(m)
def vectorize_candidates(candidates, word_idx, sentence_size):
    shape = (len(candidates), sentence_size)
    C = []
    for i, candidate in enumerate(candidates):
        # truncate to sentence_size so every row matches the target shape
        tokens = tokenize(candidate)[:sentence_size]
        lc = max(0, sentence_size - len(tokens))
        # unknown words map to 0 (the nil word), zeros pad the tail
        C.append([word_idx[w] if w in word_idx else 0 for w in tokens]
                 + [0] * lc)
    return tf.constant(C, shape=shape)
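# A standalone check of the padding scheme used in vectorize_candidates
# (no TensorFlow required): known tokens map to their indices, unknown
# tokens to 0, and zeros fill the tail up to sentence_size. The toy
# vocabulary here is made up.
def _pad_demo():
    word_idx = {'api': 1, 'call': 2}
    tokens = ['api', 'call', 'unknown']
    sentence_size = 5
    row = [word_idx[w] if w in word_idx else 0 for w in tokens] + \
        [0] * max(0, sentence_size - len(tokens))
    assert row == [1, 2, 0, 0, 0]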
def append_memory(self, m):
    if not m:
        return
    m = translator.en2cn(m)
    m = tokenize(m, self.char)
    print('appending memory..', m)
    # pad to max_sen_len, then embed every token
    q_vector = m + [self.config.PAD
                    for _ in range(self.max_sen_len - len(m))]
    q_vector = [getVector(word) for word in q_vector]
    self.context.append(q_vector)
    # keep only the most recent memories
    self.context = self.context[-self.config.max_memory_size:]
def test():
    with open(os.path.join(grandfatherdir, 'data/memn2n/train/tree/train.txt'),
              'r', encoding='utf-8') as f:
        candidates = f.readlines()
    translator = Translator(
        os.path.join(grandfatherdir, "model/graph/translator_graph.pkl"))
    for line in candidates:
        line = line.strip('\n')
        line = translator.en2cn(line)
        line = query_util.tokenize(line, char=0)
        print(line)
def build_vocab(data, candidates, memory_size=config.MAX_MEMORY_SIZE):
    if config.FIX_VOCAB:
        with open(grandfatherdir + '/data/char_table/vocab.txt', 'r') as f:
            vocab = json.load(f)
    else:
        vocab = reduce(lambda x, y: x | y,
                       (set(list(chain.from_iterable(s)) + q)
                        for s, q, a in data))
        vocab2 = reduce(lambda x, y: x | y,
                        (set(tokenize(candidate)) for candidate in candidates))
        vocab |= vocab2
    vocab = sorted(vocab)
    print(vocab)
    # index 0 is reserved for the nil word
    w2idx = dict((c, i + 1) for i, c in enumerate(vocab))
    print(w2idx)
    max_story_size = max(map(len, (s for s, _, _ in data)))
    mean_story_size = int(np.mean([len(s) for s, _, _ in data]))
    sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
    tokenized_candidates = [tokenize(candidate) for candidate in candidates]
    candidate_sentence_size = max(map(len, tokenized_candidates))
    query_size = max(map(len, (q for _, q, _ in data)))
    memory_size = min(memory_size, max_story_size)
    vocab_size = len(w2idx) + 1  # +1 for nil word
    sentence_size = max(query_size, sentence_size)  # for the position
    return {
        'w2idx': w2idx,
        'idx2w': vocab,
        'sentence_size': sentence_size,
        'candidate_sentence_size': candidate_sentence_size,
        'memory_size': memory_size,
        'vocab_size': vocab_size,
        'n_cand': len(candidates)
    }  # metadata
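# Illustrative use of the metadata dict returned by build_vocab; the
# variable names below are examples only.
# meta = build_vocab(data, candidates)
# meta['sentence_size']          # longest sentence or query seen
# meta['memory_size']            # requested size, capped at longest story
# meta['vocab_size']             # len(w2idx) + 1, for the nil word at 0
# meta['w2idx'], meta['idx2w']   # word <-> index lookups (0 reserved)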
def reply(self, msg):
    line = msg.strip().lower()
    if line == 'clear':
        self.context = []
        reply_msg = 'memory cleared!'
        values = [0]
    else:
        inputs = []
        questions = []
        q = tokenize(line, self.char)
        q_vector = [self.w2idx.get(w, 0) for w in q]
        print('q_vector:', q_vector)
        inp_vector = [[self.w2idx.get(w, 0) for w in s]
                      for s in self.context]
        # keep only the most recent memories
        inp_vector = inp_vector[-self.config.max_memory_size:]
        inputs.append(inp_vector)
        questions.append(np.vstack(q_vector).astype(np.float32))
        input_lens, sen_lens, max_sen_len = dmn_data_utils.get_sentence_lens(
            inputs)
        q_lens = dmn_data_utils.get_lens(questions)
        # pad to the sizes the trained model expects
        max_input_len = self.model.max_input_len
        max_sen_len = self.model.max_sen_len
        max_q_len = self.model.max_q_len
        inputs = dmn_data_utils.pad_inputs(inputs, input_lens, max_input_len,
                                           "split_sentences", sen_lens,
                                           max_sen_len)
        inputs = np.asarray(inputs)
        questions = dmn_data_utils.pad_inputs(questions, q_lens, max_q_len)
        questions = np.asarray(questions)
        pred, top_prob = self.model.predict(self.session, inputs, input_lens,
                                            max_sen_len, questions, q_lens)
        print(pred)
        indices = top_prob.indices.tolist()[0]
        values = top_prob.values.tolist()[0]
        reply_msg = self.idx2candid[pred[0]]
        # store the (translated, tokenized) response back into memory
        r = translator.en2cn(reply_msg)
        r = tokenize(r, self.char)
        self.context.append(r)
    # both the multi_label and single-label paths return the same pair
    return reply_msg, values
def append_memory(self, m):
    if not m:
        return
    m = translator.en2cn(m)
    m = tokenize(m, self.char)
    self.context.append(m)
def m_similarity(self, query1, m_query2):
    # serialize token lists for mlt_ff_embedding: tokens within a
    # sentence are joined by ',', candidate sentences by '@@'
    tokens1 = ','.join(tokenize(query1, 3))
    tokens2 = '@@'.join([','.join(tokenize(t, 3)) for t in m_query2])
    score, _g = mlt_ff_embedding(tokens1, tokens2)
    return score, _g
def reply(self, msg):
    line = msg.strip().lower()
    if line == 'clear':
        # reset memory to a single all-EMPTY sentence
        self.context = [[
            getVector(self.config.EMPTY) for _ in range(self.max_sen_len)
        ]]
        reply_msg = ['memory cleared!']
        top_prob = [0]
    else:
        inputs = []
        questions = []
        # truncate first so the reported length never exceeds max_sen_len
        q = tokenize(line, self.char)
        q = q[:self.max_sen_len]
        q_len = len(q)
        self.context_raw.append(q)
        # pad the question to max_sen_len, then embed every token
        q_vector = q + [self.config.PAD
                        for _ in range(self.max_sen_len - len(q))]
        q_vector = [getVector(word) for word in q_vector]
        inp_vector = self.context
        pad_vector = [
            getVector(self.config.PAD) for _ in range(self.max_sen_len)
        ]
        # pad the memory with all-PAD sentences up to max_input_len
        inp_vector = inp_vector + [
            pad_vector for _ in range(self.max_input_len - len(inp_vector))
        ]
        inputs.append(inp_vector)
        questions.append(q_vector)
        pred, top_prob = self.model.predict(self.session, inputs,
                                            [len(self.context)],
                                            self.max_sen_len, questions,
                                            [q_len])
        reply_msg = [self.idx2candid[ind] for ind in pred]
        r = reply_msg[0]
        r = translator.en2cn(r)
        r = tokenize(r, self.char)
        r = r[:self.max_sen_len]
        self.context_raw.append(r)
        r_vector = r + [self.config.PAD
                        for _ in range(self.max_sen_len - len(r))]
        r_vector = [getVector(word) for word in r_vector]
        # remember both the question and the response
        self.context.append(q_vector)
        self.context.append(r_vector)
        if len(self.context) > self.config.max_memory_size:
            self.context = self.context[-self.config.max_memory_size:]
    return reply_msg[0], top_prob[0]
def reply(self, msg):
    line = msg.strip().lower()
    prob = 1
    if line == 'clear':
        self.clear_memory()
        reply_msg = 'memory cleared!'
    elif config.MULTILABEL >= 1:
        u = tokenize(line)
        print('context:', self.context)
        data = [(self.context, u, -1)]
        print('data:', data)
        s, q, a = data_utils.vectorize_data(data, self.w2idx,
                                            self.model._sentence_size, 1,
                                            self.n_cand, self.memory_size)
        preds, top_probs = self.model.predict(s, q)
        preds = preds.indices[0].tolist()
        top_probs = top_probs.values[0]
        print(top_probs)
        r = [self.idx2candid[pred] for pred in preds]
        reply_msg = ','.join(r)
        r = reply_msg
        if config.FIX_VOCAB:
            r = translator.en2cn(r)
        r = tokenize(r)
        u.append('$u')  # tag speaker: user
        r.append('$r')  # tag speaker: system
        self.context.append(u)
        self.context.append(r)
        print('context:', self.context)
        self.nid += 1
    else:
        u = data_utils.tokenize(line)
        data = [(self.context, u, -1)]
        s, q, a = data_utils.vectorize_data(data, self.w2idx,
                                            self.model._sentence_size, 1,
                                            self.n_cand, self.memory_size)
        preds, top_probs = self.model.predict(s, q)
        try:
            prob = top_probs.values[0][0]
        except (AttributeError, IndexError):
            pass
        r = self.idx2candid[preds[0]]
        reply_msg = r
        if config.FIX_VOCAB:
            r = translator.en2cn(r)
            print('translated..', r)
        r = data_utils.tokenize(r)
        u.append('$u')  # tag speaker: user
        r.append('$r')  # tag speaker: system
        self.context.append(u)
        self.context.append(r)
        self.nid += 1
    return reply_msg, prob
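# Hypothetical driver loop for any of the reply() methods above; 'bot'
# stands in for whichever chat-session class is instantiated elsewhere.
# while True:
#     msg = input('>> ')
#     answer, prob = bot.reply(msg)
#     print(answer, prob)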