def run(in_model_folder):
    """Interactive loop: load a saved model and filter stdin lines until EOF.

    Args:
        in_model_folder: path to the folder holding the saved model,
            config and vocabularies (as produced by ``save``).
    """
    with tf.Session() as sess:
        model, actual_config, vocab, char_vocab, label_vocab = load(
            in_model_folder, sess)
        # Reverse mappings (id -> token / id -> label) to decode model output.
        # NOTE: dict.iteritems()/raw_input()/print-statements were Python 2
        # only; ported to the Python 3 equivalents.
        rev_vocab = {word_id: word for word, word_id in vocab.items()}
        rev_label_vocab = {
            label_id: label
            for label, label_id in label_vocab.items()
        }
        print('Done loading')
        try:
            line = input().strip()
            # An empty line or EOF ends the session.
            while line:
                print(filter_line(line, model,
                                  [(vocab, label_vocab, rev_label_vocab),
                                   (vocab, vocab, rev_vocab)], actual_config,
                                  sess))
                line = input().strip()
        except EOFError:
            # End of the input stream terminates the loop cleanly.
            pass
def init_model(trainset, in_model_folder, resume, in_config, in_session):
    """Build (or resume) a model together with its vocabularies.

    When ``resume`` is falsy a fresh model is created from ``trainset``,
    initialized, and persisted to ``in_model_folder``; in both cases the
    final state is read back via ``load`` so the returned config is the
    persisted one.

    Returns:
        (model, actual_config, vocab, char_vocab, label_vocab)
    """
    model = None
    if not resume:
        # Optionally fuse each token with its POS tag ("token_POS") before
        # building the word vocabulary.
        if in_config['use_pos_tags']:
            utterances = [
                ['{}_{}'.format(tok, tag) for tok, tag in zip(words, postags)]
                for words, postags in zip(trainset['utterance'],
                                           trainset['pos'])
            ]
        else:
            utterances = trainset['utterance']
        vocab, _ = make_vocabulary(utterances,
                                   in_config['max_vocabulary_size'])
        char_vocab = make_char_vocabulary()
        label_vocab, _ = make_vocabulary(trainset['tags'].values,
                                         in_config['max_vocabulary_size'],
                                         special_tokens=[])

        # One output head per configured task: tagging predicts over labels,
        # language modelling predicts over the word vocabulary.
        task_output_dimensions = []
        for task in in_config['tasks']:
            if task == 'tag':
                task_output_dimensions.append(len(label_vocab))
            elif task == 'lm':
                task_output_dimensions.append(len(vocab))
            else:
                raise NotImplementedError

        model = create_model(len(vocab), in_config['embedding_size'],
                             in_config['max_input_length'],
                             task_output_dimensions)
        in_session.run(tf.global_variables_initializer())
        save(in_config, vocab, char_vocab, label_vocab, in_model_folder,
             in_session)
    # Reload so callers always get the persisted config (and, on resume,
    # the stored weights).
    return load(in_model_folder, in_session, existing_model=model)
def main(in_dataset, in_model_folder, in_trainset_size, in_epochs_number, in_result_folder):
    """Train a loaded model on a shuffled split of ``in_dataset`` and report accuracy.

    Args:
        in_dataset: DataFrame with 'utterance' and 'tags' columns.
        in_model_folder: folder with a previously saved model + vocabularies.
        in_trainset_size: number of rows used for training; the rest is the test set.
        in_epochs_number: number of training epochs.
        in_result_folder: output folder for the trained model and vocabularies.
    """
    model, vocab, label_vocab = load(in_model_folder)
    # Shuffle before splitting so train/test are random samples.
    in_dataset = in_dataset.sample(frac=1).reset_index(drop=True)
    trainset, testset = in_dataset[:in_trainset_size], in_dataset[in_trainset_size:]
    train_data = make_dataset(list(zip(trainset['utterance'], trainset['tags'])),
                              vocab, label_vocab)
    test_data = make_dataset(list(zip(testset['utterance'], testset['tags'])),
                             vocab, label_vocab)

    if not os.path.exists(in_result_folder):
        os.makedirs(in_result_folder)

    # The test split doubles as the dev set here — there is no separate
    # validation split.
    train(model,
          train_data,
          test_data,
          test_data,
          os.path.join(in_result_folder, MODEL_NAME),
          epochs=in_epochs_number,
          batch_size=1)
    save(model, vocab, label_vocab, in_result_folder, save_model=False)

    # Python 3 print function (the original Python 2 print statement no
    # longer parses under Python 3).
    print('Testset accuracy: {:.3f}'.format(evaluate(model, *test_data)))
# Exemplo n.º 4  (non-code artifact from the source page, commented out so the file parses)
def main(in_dataset_file, in_model_folder, in_result_file):
    """Tag a JSON dataset with a saved model and write predictions to JSON.

    Args:
        in_dataset_file: path to a pandas-readable JSON dataset with
            'utterance' and 'tags' columns.
        in_model_folder: folder with a saved model + vocabularies.
        in_result_file: path of the output JSON with gold and predicted tags.
    """
    dataset = pd.read_json(in_dataset_file)

    with tf.Session() as sess:
        model, vocab, char_vocab, label_vocab = load(in_model_folder, sess)
        # id -> label mapping to decode predictions.
        # (dict.items() replaces the Python-2-only iteritems().)
        rev_label_vocab = {
            label_id: label
            for label, label_id in label_vocab.items()
        }
        print('Done loading')
        X, y = make_dataset(dataset, vocab, label_vocab)
        y_pred = predict(model, (X, y), rev_label_vocab, sess)
    # predict() returns one flat sequence of tags; re-segment it so each
    # utterance gets a slice matching the length of its gold tag sequence.
    tags_predicted = []
    tag_idx = 0
    for tag_seq in dataset['tags']:
        tags_predicted.append(y_pred[tag_idx:tag_idx + len(tag_seq)])
        tag_idx += len(tag_seq)
    result = pd.DataFrame({
        'utterance': dataset['utterance'],
        'tags_gold': dataset['tags'],
        'tags_predicted': tags_predicted
    })
    result.to_json(in_result_file)
def main(in_dataset_file, in_model_folder, in_mode):
    """Evaluate a saved model on a dataset and print per-metric results.

    Args:
        in_dataset_file: path to the evaluation dataset.
        in_model_folder: folder with a saved model + vocabularies.
        in_mode: 'deep_disfluency' or 'babi'; any other value raises
            NotImplementedError.
    """
    with tf.Session() as sess:
        model, actual_config, vocab, char_vocab, label_vocab = load(
            in_model_folder, sess)
        # Reverse mappings to decode ids back into tokens / labels.
        # (dict.items() replaces the Python-2-only iteritems().)
        rev_vocab = {word_id: word for word, word_id in vocab.items()}
        rev_label_vocab = {
            label_id: label
            for label, label_id in label_vocab.items()
        }
        # Both eval paths take the same (in_vocab, out_vocab, rev_out_vocab)
        # tuples, so build them once.
        vocabs = [(vocab, label_vocab, rev_label_vocab),
                  (vocab, vocab, rev_vocab)]
        if in_mode == 'deep_disfluency':
            eval_result = eval_deep_disfluency(model, vocabs, in_dataset_file,
                                               actual_config, sess)
        elif in_mode == 'babi':
            eval_result = eval_babi(model, vocabs, in_dataset_file,
                                    actual_config, sess)
        else:
            raise NotImplementedError
        for key, value in eval_result.items():
            print('{}:\t{}'.format(key, value))