def chat_with_lshf(model_manager): ann = model_manager.load_lshf_model(True) encoder = model_manager.load_currently_selected_model() sampler = search.BeamSampler(encoder) label_decoder = get_label_translator(m) while 1: # get textual input context = raw_input('context: ').strip() if not context.endswith(' __eou__ </s>'): context += ' __eou__ </s>' dia_embs, utt_embs = encoder.encode(context) #print context #''' context_emb = dia_embs[-2][0] utt_emb = utt_embs[-2][0] distances, labels, embeddings = ann.kneighbors(context_emb, 120) print labels for label, distance in zip(labels[:10], distances[0][:10]): print distance, label_decoder(label) print label_decoder((label[0], label[1] + 1)) print search_context = { 'distances': distances[0], 'labels': [(label[0], label[1] + 1) for label in labels], 'candidate_dialogue_embeddings': embeddings, 'utterance_embeddings': ann.utterance_embeddings, 'original_utterance_embedding': utt_emb, 'original_dialogue_embedding': context_emb } scored = candidate_selection.answer_relevance(search_context) print 'answer relevance ' * 10 print scored = sorted(scored, key=lambda pair: pair[0]) for score, label in scored[:10]: print score, label_decoder(label) print #''' samples, costs = sampler.sample([context.split()], n_samples=5, n_turns=1) print 'HRED: ', samples[0][0] print
def evaluation_sample_iterator(model_manager, amount = 30000, seed = 10):
    """Yield per-turn evaluation instances from a shuffled subset of the
    test conversations.

    Each yielded dict holds the textual context/question/answer for one
    conversation turn plus the matching precomputed utterance and dialogue
    embeddings, and progress counters for reporting.

    :param model_manager: gives access to the database and embedding files.
    :param amount: maximum number of test conversations to sample.
    :param seed: RNG seed, so the shuffle is reproducible across runs.
    """
    rand = Random(seed)
    database = data_access.get_database(model_manager)
    test_ids = database[data_access.TEST_IDS_SET_NAME][:]
    rand.shuffle(test_ids)
    test_ids = test_ids[0:min(len(test_ids), amount)]
    debug('yielding evaluation samples from %i conversations' % len(test_ids))
    coords = [database[data_access.EMBEDDINGS_COORDINATES_SET_NAME][d_idx]
              for d_idx in test_ids]
    utt_embs = FileArray(model_manager.files['utterance_embeddings'])
    dia_embs = FileArray(model_manager.files['dialogue_embeddings'])
    utt_embs.open()
    dia_embs.open()
    label_to_text = data_access.get_label_translator(model_manager)
    # FIX: close both embedding files even when the consumer abandons the
    # generator early — the original only closed them on full exhaustion.
    try:
        progress = 0
        for d_idx, (global_idx, conv_length) in zip(test_ids, coords):
            progress += 1
            context = label_to_text((d_idx, 0))
            relevant_utt_embs = utt_embs.read_chunk(global_idx, conv_length)
            relevant_dia_embs = dia_embs.read_chunk(global_idx, conv_length)
            # the original looped enumerate(xrange(...)) where idx always
            # equalled conv_turn; one loop variable says the same thing
            for conv_turn in xrange(conv_length - 1):
                instance = {
                    'question': label_to_text((d_idx, conv_turn)),
                    'question_utterance_emb': relevant_utt_embs[conv_turn],
                    'context': context,
                    'context_emb': relevant_dia_embs[conv_turn],
                    'answer': label_to_text((d_idx, conv_turn + 1)),
                    'answer_utterance_emb': relevant_utt_embs[conv_turn + 1],
                    'answer_context_emb': relevant_dia_embs[conv_turn + 1],
                    'progress': progress,
                    'conversations': len(test_ids),
                }
                yield instance
                # extend the textual context with the answer just yielded
                context = context + ' </s> ' + instance['answer']
    finally:
        utt_embs.close()
        dia_embs.close()
def random_response_generator(model_manager, seed = 10):
    """Endlessly yield (utterance_text, utterance_embedding) pairs drawn
    uniformly at random from the test conversations.

    :param model_manager: gives access to the database and embedding file.
    :param seed: RNG seed so the sequence of picks is reproducible.
    """
    rng = Random(seed)
    database = data_access.get_database(model_manager)
    test_ids = database[data_access.TEST_IDS_SET_NAME][:]
    debug('yielding random responses from %i conversations' % len(test_ids))
    coords = [database[data_access.EMBEDDINGS_COORDINATES_SET_NAME][d_idx]
              for d_idx in test_ids]
    utt_embs = FileArray(model_manager.files['utterance_embeddings'])
    utt_embs.open()
    label_to_text = data_access.get_label_translator(model_manager)
    while True:
        # pick a random conversation, then a random turn inside it
        pick = rng.randint(0, len(test_ids) - 1)
        d_idx = test_ids[pick]
        global_idx, conv_length = coords[pick]
        turn = rng.randint(0, conv_length - 1)
        yield label_to_text((d_idx, turn)), utt_embs.read(global_idx + turn)
#from ann.lsh_forest import save_linked_utterance_embeddings #save_linked_utterance_embeddings(m) #lsh_forest.train_lsh_forest(m, corpus_percentage=0.05) ann = lsh_forest.load_lshf(m) utt_embs = lsh_forest.load_utterance_embeddings(m) encoder = m.load_currently_selected_model() embs = encode('how do i update all packages ? __eou__', encoder) d_emb = embs[0][0][0] distances, labels, embeddings = ann.kneighbors(d_emb, 10) translator = get_label_translator(m, as_text=True) labels = [(label[0], label[1] + 1) for label in labels] search_context = { 'distances': distances, 'labels': labels, 'candidate_dialogue_embeddings': embeddings, 'utterance_embeddings': utt_embs, 'original_utterance_embedding': embs[1][0][0], 'original_dialogue_embedding': embs[0][0][0] } scored = answer_relevance(search_context) scored = sorted(scored, key=lambda tpl: tpl[0]) for score, label in scored: print score, translator(label)
def evaluate(model_manager): rand_iter = random_response_generator(model_manager) # encoder = model_manager.load_currently_selected_model() translator = data_access.get_label_translator(model_manager) evaluator = get_response_evaluator(model_manager.load_currently_selected_model()) rankings = [] start_time = time() progress = 0 result_arr = FileArray('./results/decoder_results_%s.bin'%model_manager.model_name, shape=(1000000, 1), dtype='i4') result_arr.open() for instance in evaluation_sample_iterator(model_manager): prev_result = result_arr.read(progress) if prev_result >= 1: progress += 1 rankings.append(prev_result[0]-1) continue progress += 1 random_responses = [rand_iter.next()[0] for x in xrange(9)] context = instance['context'] candidates = [(evaluator(instance['answer'], context), True)] ''' test = encode(context, encoder) test2 = encode(context + ' </s> ' + instance['answer'], encoder) print 'context emb', sum(test[0][0][0]), sum(instance['context_emb']) print 'question emb', sum(test[1][0][0]), sum(instance['question_utterance_emb']) print 'answer emb', sum(test2[1][-1][0]), sum(instance['answer_utterance_emb']) print 'answer context emb', sum(test2[0][-1][0]), sum(instance['answer_context_emb']) ''' for random_resp in random_responses: cost = evaluator(random_resp, context) candidates.append((cost, False)) candidates = sorted(candidates, key=lambda pair: pair[0]) rank = [idx for idx, cand in enumerate(candidates) if candidates[idx][1]][0] rankings.append(rank) result_arr.write(progress-1, np.array([rank+1], dtype='i4')) rATk = calculate_recall_at_k(rankings, 10) result_str = ' | '.join(['R@%i %.3f%%' % (k + 1, percentage * 100) for k, percentage in rATk.iteritems()]) print_progress_bar(instance['progress'], instance['conversations'], additional_text=result_str, start_time=start_time) if progress % 300 == 0: print 'gc collect' collect() result_arr.close()