# Imports assumed by these examples; repo-local helpers (init_model,
# make_multitask_dataset, predict, post_train_lm, the tag-conversion and
# class-weight utilities) come from the surrounding project.
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from nltk import pos_tag  # assumption: NLTK's off-the-shelf POS tagger
from sklearn.preprocessing import MinMaxScaler


def main(in_main_dataset_folder, in_lm_dataset_folder, in_model_folder, resume,
         in_config):
    trainset_main = pd.read_json(
        os.path.join(in_main_dataset_folder, 'trainset.json'))
    devset_main = pd.read_json(
        os.path.join(in_main_dataset_folder, 'devset.json'))
    testset_main = pd.read_json(
        os.path.join(in_main_dataset_folder, 'testset.json'))

    trainset_lm = pd.read_json(
        os.path.join(in_lm_dataset_folder, 'trainset.json'))
    devset_lm = pd.read_json(os.path.join(in_lm_dataset_folder, 'devset.json'))
    testset_lm = pd.read_json(
        os.path.join(in_lm_dataset_folder, 'testset.json'))

    with tf.Session() as sess:
        model, actual_config, vocab, char_vocab, label_vocab = init_model(
            trainset_main, in_model_folder, resume, in_config, sess)
        rev_vocab = {word_id: word for word, word_id in vocab.iteritems()}
        rev_label_vocab = {
            label_id: label
            for label, label_id in label_vocab.iteritems()
        }
        # only the label tensors of the training split are needed here
        # (they feed the class-weight estimation below), so X is discarded
        _, ys_train_main = make_multitask_dataset(trainset_main, vocab,
                                                  label_vocab, actual_config)
        X_dev_main, ys_dev_main = make_multitask_dataset(
            devset_main, vocab, label_vocab, actual_config)
        X_test_main, ys_test_main = make_multitask_dataset(
            testset_main, vocab, label_vocab, actual_config)

        # estimate per-class weights from the (smoothed) training label
        # distribution, then rescale them into [1, 5] so that rare classes
        # are up-weighted without dwarfing the common ones
        y_train_flattened = np.argmax(ys_train_main[0], axis=-1)
        smoothing_coef = actual_config['class_weight_smoothing_coef']
        class_weight = get_class_weight_proportional(
            y_train_flattened, smoothing_coef=smoothing_coef)

        scaler = MinMaxScaler(feature_range=(1, 5))
        class_weight_vector = scaler.fit_transform(
            np.array([weight for _, weight in sorted(class_weight.items())])
            .reshape(-1, 1)).flatten()

        # NOTE: the original snippet passed undefined X_train/ys_train (etc.)
        # variables here; featurizing the LM splits loaded above is the
        # assumed intent, since post_train_lm consumes them and they are
        # otherwise unused
        X_train_lm, ys_train_lm = make_multitask_dataset(
            trainset_lm, vocab, label_vocab, actual_config)
        X_dev_lm, ys_dev_lm = make_multitask_dataset(
            devset_lm, vocab, label_vocab, actual_config)
        X_test_lm, ys_test_lm = make_multitask_dataset(
            testset_lm, vocab, label_vocab, actual_config)
        post_train_lm(model, (X_train_lm, ys_train_lm), (X_dev_lm, ys_dev_lm),
                      (X_test_lm, ys_test_lm),
                      [(vocab, label_vocab, rev_label_vocab),
                       (vocab, vocab, rev_vocab)],
                      in_model_folder,
                      actual_config['epochs_number'],
                      actual_config,
                      sess,
                      class_weights=[class_weight_vector,
                                     np.ones(len(vocab))])
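
A minimal sketch of how this entry point might be invoked from the command line; the argparse wiring and JSON config loading below are assumptions, not part of the original module.

# Hypothetical CLI wrapper around main(); argument names mirror the
# function's parameters, config loading is an assumption.
if __name__ == '__main__':
    import argparse
    import json

    parser = argparse.ArgumentParser()
    parser.add_argument('main_dataset_folder')
    parser.add_argument('lm_dataset_folder')
    parser.add_argument('model_folder')
    parser.add_argument('--resume', action='store_true')
    parser.add_argument('--config', default=None,
                        help='path to a JSON config overriding the defaults')
    args = parser.parse_args()

    config = None
    if args.config:
        with open(args.config) as config_in:
            config = json.load(config_in)
    main(args.main_dataset_folder, args.lm_dataset_folder, args.model_folder,
         args.resume, config)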
Example #2
def predict_babi_file(in_model,
                      vocabs_for_tasks,
                      dataset,
                      in_config,
                      in_session,
                      target_file_path=None):
    if target_file_path:
        target_file = open(target_file_path, "w")

    # featurize the dataset for the (first) tagging task
    X, ys_for_tasks = make_multitask_dataset(dataset, vocabs_for_tasks[0][0],
                                             vocabs_for_tasks[0][1], in_config)
    predictions = predict(in_model, (X, ys_for_tasks),
                          [vocabs_for_tasks[0][-1]], in_session)
    predictions_eval = []
    global_word_index = 0
    broken_sequences_number = 0
    # RNN tags --> eval tags
    for utterance in dataset['utterance']:
        current_tags = predictions[global_word_index:global_word_index +
                                   len(utterance)]
        try:
            current_tags_eval = convert_from_inc_disfluency_tags_to_eval_tags(
                current_tags, utterance, representation="disf1")
        except Exception:
            # fall back to the raw RNN tags if the conversion breaks
            current_tags_eval = current_tags
            broken_sequences_number += 1
        predictions_eval += current_tags_eval
        global_word_index += len(utterance)
    print '#broken sequences after RNN --> eval conversion: {} out of {}'.format(
        broken_sequences_number, dataset.shape[0])

    predictions_eval_iter = iter(predictions_eval)
    for speaker, (_, speaker_data) in enumerate(dataset.iterrows()):
        if target_file_path:
            target_file.write("Speaker: " + str(speaker) + "\n\n")
        timing_data = create_fake_timings(len(speaker_data['utterance']))
        lex_data = speaker_data['utterance']
        pos_data = speaker_data['pos']
        labels = speaker_data['tags']

        for i in range(len(timing_data)):
            _, end = timing_data[i]
            predicted_tags = [next(predictions_eval_iter)]
            current_time = end
            if target_file_path:
                target_file.write("Time: " + str(current_time) + "\n")
                new_words = lex_data[i - (len(predicted_tags) - 1):i + 1]
                new_pos = pos_data[i - (len(predicted_tags) - 1):i + 1]
                new_timings = timing_data[i - (len(predicted_tags) - 1):i + 1]
                for t, w, p, tag in zip(new_timings, new_words, new_pos,
                                        predicted_tags):
                    target_file.write("\t".join(
                        [str(t[0]), str(t[1]), w, p, tag]))
                    target_file.write("\n")
                target_file.write("\n")
        if target_file_path:
            target_file.write("\n")
    if target_file_path:
        target_file.close()
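
A hedged usage sketch for predict_babi_file, reusing init_model from the first example; trainset, model_folder, dataset_folder and the output file name are placeholders, and passing None as the config assumes a resumed model restores its saved configuration.

# Hypothetical usage: load a trained model and dump increco-style
# predictions for a held-out set (all paths are placeholders).
with tf.Session() as sess:
    model, config, vocab, char_vocab, label_vocab = init_model(
        trainset, model_folder, True, None, sess)
    rev_label_vocab = {
        label_id: label for label, label_id in label_vocab.iteritems()
    }
    testset = pd.read_json(os.path.join(dataset_folder, 'testset.json'))
    predict_babi_file(model, [(vocab, label_vocab, rev_label_vocab)],
                      testset, config, sess,
                      target_file_path='babi_predictions.increco')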
Example #3
def filter_line(in_line, in_model, in_vocabs_for_tasks, in_config, in_session):
    # wrap the single lowercased utterance into a one-row dataset with
    # placeholder fluent tags and automatically assigned POS
    tokens = unicode(in_line.lower()).split()
    dataset = pd.DataFrame({
        'utterance': [tokens],
        'tags': [['<f/>'] * len(tokens)],
        'pos': [pos_tag(tokens)]
    })
    (tag_vocab, tag_label_vocab, tag_rev_label_vocab) = in_vocabs_for_tasks[0]
    X_line, ys_line = make_multitask_dataset(dataset, tag_vocab,
                                             tag_label_vocab, in_config)
    result_tokens = predict(in_model, (X_line, ys_line),
                            in_vocabs_for_tasks,
                            in_session,
                            batch_size=1)
    return ' '.join(result_tokens)
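
A small interactive sketch around filter_line; it assumes a model, vocab list, config, and session prepared as in the first example, and reading from stdin is an illustration, not part of the original code.

# Hypothetical interactive loop: tag lines typed on stdin with filter_line.
import sys

for line in sys.stdin:
    line = line.strip()
    if line:
        print filter_line(line, model, vocabs_for_tasks, config, sess)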
Example #4
def predict_increco_file(in_model,
                         vocabs_for_tasks,
                         source_file_path,
                         in_config,
                         in_session,
                         target_file_path=None,
                         is_asr_results_file=False):
    """Return the incremental output in an increco style
    given the incoming words + POS. E.g.:

    Speaker: KB3_1

    Time: 1.50
    KB3_1:1    0.00    1.12    $unc$yes    NNP    <f/><tc/>

    Time: 2.10
    KB3_1:1    0.00    1.12    $unc$yes    NNP    <rms id="1"/><tc/>
    KB3_1:2    1.12    2.00     because    IN    <rps id="1"/><cc/>

    Time: 2.5
    KB3_1:2    1.12    2.00     because    IN    <rps id="1"/><rpndel id="1"/><cc/>

    from an ASR increco style input without the POS tags:

    or a normal style disfluency detection ground truth corpus:

    Speaker: KB3_1
    KB3_1:1    0.00    1.12    $unc$yes    NNP    <rms id="1"/><tc/>
    KB3_1:2    1.12    2.00     $because    IN    <rps id="1"/><cc/>
    KB3_1:3    2.00    3.00    because    IN    <f/><cc/>
    KB3_1:4    3.00    4.00    theres    EXVBZ    <f/><cc/>
    KB3_1:6    4.00    5.00    a    DT    <f/><cc/>
    KB3_1:7    6.00    7.10    pause    NN    <f/><cc/>


    :param in_model: trained disfluency tagging model used for prediction
    :param vocabs_for_tasks: list of (vocab, label_vocab, rev_label_vocab)
        tuples, one per task
    :param source_file_path: str, file path to the input file
    :param in_config: dict, feature/model configuration
    :param in_session: TensorFlow session the model lives in
    :param target_file_path: str, file path to output in the above format
    :param is_asr_results_file: bool, whether the input is increco style
    """
    if target_file_path:
        target_file = open(target_file_path, "w")
    # hoisted so that dialogues is always defined before the loop below
    dialogues = []
    if 'timings' in source_file_path:
        print "input file has timings"
        if not is_asr_results_file:
            IDs, timings, words, pos_tags, labels = \
                get_tag_data_from_corpus_file(source_file_path)
            for dialogue, a, b, c, d in zip(IDs, timings, words, pos_tags,
                                            labels):
                dialogues.append((dialogue, (a, b, c, d)))
    else:
        print "no timings in input file, creating fake timings"
        raise NotImplementedError

    # collecting a single dataset for the model to predict in batches
    utterances, tags, pos = [], [], []
    for speaker, speaker_data in dialogues:
        timing_data, lex_data, pos_data, labels = speaker_data
        # a new utterance starts at each turn-opening tag ("<t...")
        for i in range(len(timing_data)):
            if "<t" in labels[i]:
                utterances.append([])
                tags.append([])
                pos.append([])
            utterances[-1].append(lex_data[i])
            tags[-1].append(labels[i])
            pos[-1].append(pos_data[i])

    # eval tags --> RNN tags
    dataset = pd.DataFrame({
        'utterance': utterances,
        'tags': [
            convert_from_eval_tags_to_inc_disfluency_tags(
                tags_i, words_i, representation="disf1")
            for tags_i, words_i in zip(tags, utterances)
        ],
        'pos': pos
    })
    X, ys_for_tasks = make_multitask_dataset(dataset, vocabs_for_tasks[0][0],
                                             vocabs_for_tasks[0][1], in_config)
    predictions = predict(in_model, (X, ys_for_tasks),
                          [vocabs_for_tasks[0][-1]], in_session)
    predictions_eval = []
    global_word_index = 0
    broken_sequences_number = 0
    # RNN tags --> eval tags
    for utterance in utterances:
        current_tags = predictions[global_word_index:global_word_index +
                                   len(utterance)]
        try:
            current_tags_eval = convert_from_inc_disfluency_tags_to_eval_tags(
                current_tags, utterance, representation="disf1")
        except Exception:
            # fall back to the raw RNN tags if the conversion breaks
            current_tags_eval = current_tags
            broken_sequences_number += 1
        predictions_eval += current_tags_eval
        global_word_index += len(utterance)
    print '#broken sequences after RNN --> eval conversion: {} out of {}'.format(
        broken_sequences_number, len(utterances))

    predictions_eval_iter = iter(predictions_eval)
    for speaker, speaker_data in dialogues:
        if target_file_path:
            target_file.write("Speaker: " + str(speaker) + "\n\n")
        timing_data, lex_data, pos_data, labels = speaker_data

        for i in range(len(timing_data)):
            _, end = timing_data[i]
            predicted_tags = [next(predictions_eval_iter)]
            current_time = end
            if target_file_path:
                target_file.write("Time: " + str(current_time) + "\n")
                new_words = lex_data[i - (len(predicted_tags) - 1):i + 1]
                new_pos = pos_data[i - (len(predicted_tags) - 1):i + 1]
                new_timings = timing_data[i - (len(predicted_tags) - 1):i + 1]
                for t, w, p, tag in zip(new_timings, new_words, new_pos,
                                        predicted_tags):
                    target_file.write("\t".join(
                        [str(t[0]), str(t[1]), w, p, tag]))
                    target_file.write("\n")
                target_file.write("\n")
        if target_file_path:
            target_file.write("\n")
    if target_file_path:
        target_file.close()
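
A hedged call sketch for predict_increco_file, assuming a model, vocab list, config, and session prepared as in the earlier examples; the corpus path is a placeholder, but note that the current implementation only accepts files whose name contains 'timings'.

# Hypothetical usage: run incremental prediction on a timed ground-truth
# corpus and write the increco-style output to disk.
predict_increco_file(model, vocabs_for_tasks,
                     'data/disfluency_corpus_with_timings.text',
                     config, sess,
                     target_file_path='predictions.increco',
                     is_asr_results_file=False)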