Пример #1
0
def chat_with_lshf(model_manager):

    ann = model_manager.load_lshf_model(True)

    encoder = model_manager.load_currently_selected_model()
    sampler = search.BeamSampler(encoder)
    label_decoder = get_label_translator(m)

    while 1:

        # get textual input

        context = raw_input('context: ').strip()

        if not context.endswith(' __eou__ </s>'):
            context += ' __eou__ </s>'

        dia_embs, utt_embs = encoder.encode(context)

        #print context
        #'''
        context_emb = dia_embs[-2][0]
        utt_emb = utt_embs[-2][0]

        distances, labels, embeddings = ann.kneighbors(context_emb, 120)

        print labels
        for label, distance in zip(labels[:10], distances[0][:10]):

            print distance, label_decoder(label)
            print label_decoder((label[0], label[1] + 1))
            print

        search_context = {
            'distances': distances[0],
            'labels': [(label[0], label[1] + 1) for label in labels],
            'candidate_dialogue_embeddings': embeddings,
            'utterance_embeddings': ann.utterance_embeddings,
            'original_utterance_embedding': utt_emb,
            'original_dialogue_embedding': context_emb
        }

        scored = candidate_selection.answer_relevance(search_context)

        print 'answer relevance ' * 10
        print
        scored = sorted(scored, key=lambda pair: pair[0])
        for score, label in scored[:10]:
            print score, label_decoder(label)
            print
        #'''
        samples, costs = sampler.sample([context.split()],
                                        n_samples=5,
                                        n_turns=1)
        print 'HRED: ', samples[0][0]
        print
Пример #2
0
def evaluation_sample_iterator(model_manager, amount = 30000, seed = 10):
    """Yield evaluation instances from a random subset of test conversations.

    Each yielded dict holds the running textual context, the current
    question/answer texts and their precomputed embeddings, plus progress
    counters for reporting.

    :param model_manager: provides the database handle and embedding files
    :param amount: maximum number of test conversations to draw from
    :param seed: RNG seed so the shuffled subset is reproducible
    """
    rand = Random(seed)

    database = data_access.get_database(model_manager)

    test_ids = database[data_access.TEST_IDS_SET_NAME][:]
    rand.shuffle(test_ids)
    # slicing clamps automatically, so min(len, amount) is unnecessary
    test_ids = test_ids[:amount]

    debug('yielding evaluation samples from %i conversations' % len(test_ids))

    # (global start index, number of turns) for every selected conversation
    coords = [database[data_access.EMBEDDINGS_COORDINATES_SET_NAME][d_idx]
              for d_idx in test_ids]

    utt_embs = FileArray(model_manager.files['utterance_embeddings'])
    dia_embs = FileArray(model_manager.files['dialogue_embeddings'])
    utt_embs.open()
    dia_embs.open()

    try:
        label_to_text = data_access.get_label_translator(model_manager)

        progress = 0
        for d_idx, (global_idx, conv_length) in zip(test_ids, coords):
            progress += 1
            # running textual context, extended turn by turn below
            context = label_to_text((d_idx, 0))

            relevant_utt_embs = utt_embs.read_chunk(global_idx, conv_length)
            relevant_dia_embs = dia_embs.read_chunk(global_idx, conv_length)

            # one instance per adjacent (question, answer) turn pair;
            # the original's enumerate(xrange(...)) produced idx == conv_turn,
            # so a single loop variable suffices
            for conv_turn in xrange(conv_length - 1):
                instance = {
                    'question': label_to_text((d_idx, conv_turn)),
                    'question_utterance_emb': relevant_utt_embs[conv_turn],
                    'context': context,
                    'context_emb': relevant_dia_embs[conv_turn],
                    'answer': label_to_text((d_idx, conv_turn + 1)),
                    'answer_utterance_emb': relevant_utt_embs[conv_turn + 1],
                    'answer_context_emb': relevant_dia_embs[conv_turn + 1],
                    'progress': progress,
                    'conversations': len(test_ids),
                }

                yield instance

                context = context + ' </s> ' + instance['answer']
    finally:
        # BUG FIX: close the embedding files even when the consumer abandons
        # the generator early (original only closed them on full exhaustion)
        utt_embs.close()
        dia_embs.close()
Пример #3
0
def random_response_generator(model_manager, seed = 10):
    """Endlessly yield (utterance text, utterance embedding) pairs drawn
    uniformly at random from the test-set conversations.

    :param model_manager: provides the database handle and embedding file
    :param seed: RNG seed for reproducible sampling
    """
    rand = Random(seed)

    database = data_access.get_database(model_manager)

    test_ids = database[data_access.TEST_IDS_SET_NAME][:]

    debug('yielding random responses from %i conversations' % len(test_ids))

    # (global start index, number of turns) per test conversation
    coords = [database[data_access.EMBEDDINGS_COORDINATES_SET_NAME][d_idx]
              for d_idx in test_ids]

    utt_embs = FileArray(model_manager.files['utterance_embeddings'])
    utt_embs.open()

    label_to_text = data_access.get_label_translator(model_manager)

    while True:
        # pick a random conversation, then a random turn inside it
        pick = rand.randint(0, len(test_ids) - 1)
        conversation_id = test_ids[pick]
        start_idx, n_turns = coords[pick]

        turn = rand.randint(0, n_turns - 1)
        text = label_to_text((conversation_id, turn))
        emb = utt_embs.read(start_idx + turn)
        yield text, emb
Пример #4
0
    #from ann.lsh_forest import save_linked_utterance_embeddings
    #save_linked_utterance_embeddings(m)

    #lsh_forest.train_lsh_forest(m, corpus_percentage=0.05)
    # NOTE(review): fragment of a larger function — `m` (a model manager)
    # and `encode` are defined outside this excerpt.
    ann = lsh_forest.load_lshf(m)
    utt_embs = lsh_forest.load_utterance_embeddings(m)
    encoder = m.load_currently_selected_model()

    # encode a sample query; embs presumably holds (dialogue embeddings,
    # utterance embeddings) — TODO confirm against encode()'s contract
    embs = encode('how do i update all packages ? __eou__', encoder)

    d_emb = embs[0][0][0]

    # 10 approximate nearest neighbours of the dialogue embedding
    distances, labels, embeddings = ann.kneighbors(d_emb, 10)

    translator = get_label_translator(m, as_text=True)

    # shift each matched label to the following turn: the candidate response
    labels = [(label[0], label[1] + 1) for label in labels]

    search_context = {
        'distances': distances,
        'labels': labels,
        'candidate_dialogue_embeddings': embeddings,
        'utterance_embeddings': utt_embs,
        'original_utterance_embedding': embs[1][0][0],
        'original_dialogue_embedding': embs[0][0][0]
    }
    # score all candidates and print them best-first (lower score = better)
    scored = answer_relevance(search_context)
    scored = sorted(scored, key=lambda tpl: tpl[0])
    for score, label in scored:
        print score, translator(label)
Пример #5
0
def evaluate(model_manager):

    rand_iter = random_response_generator(model_manager)
    #    encoder = model_manager.load_currently_selected_model()

    translator = data_access.get_label_translator(model_manager)

    evaluator = get_response_evaluator(model_manager.load_currently_selected_model())
    rankings = []
    start_time = time()
    progress = 0

    result_arr = FileArray('./results/decoder_results_%s.bin'%model_manager.model_name, shape=(1000000, 1), dtype='i4')
    result_arr.open()


    for instance in evaluation_sample_iterator(model_manager):


        prev_result = result_arr.read(progress)

        if prev_result >= 1:
            progress += 1
            rankings.append(prev_result[0]-1)
            continue

        progress += 1
        random_responses = [rand_iter.next()[0] for x in xrange(9)]

        context = instance['context']
        candidates = [(evaluator(instance['answer'], context), True)]

        '''

        test = encode(context, encoder)
        test2 = encode(context + ' </s> ' + instance['answer'], encoder)


        print 'context emb', sum(test[0][0][0]), sum(instance['context_emb'])
        print 'question emb', sum(test[1][0][0]), sum(instance['question_utterance_emb'])
        print 'answer emb', sum(test2[1][-1][0]), sum(instance['answer_utterance_emb'])
        print 'answer context emb', sum(test2[0][-1][0]), sum(instance['answer_context_emb'])
        '''
        for random_resp in random_responses:
            cost = evaluator(random_resp, context)
            candidates.append((cost, False))

        candidates = sorted(candidates, key=lambda pair: pair[0])

        rank = [idx for idx, cand in enumerate(candidates) if candidates[idx][1]][0]
        rankings.append(rank)


        result_arr.write(progress-1, np.array([rank+1], dtype='i4'))

        rATk = calculate_recall_at_k(rankings, 10)
        result_str = ' | '.join(['R@%i %.3f%%' % (k + 1, percentage * 100) for k, percentage in rATk.iteritems()])

        print_progress_bar(instance['progress'], instance['conversations'], additional_text=result_str, start_time=start_time)

        if progress % 300 == 0:
            print 'gc collect'
            collect()

    result_arr.close()