def main(in_main_dataset_folder, in_lm_dataset_folder, in_model_folder, resume, in_config):
    # main task (disfluency tag) splits
    trainset_main = pd.read_json(os.path.join(in_main_dataset_folder, 'trainset.json'))
    devset_main = pd.read_json(os.path.join(in_main_dataset_folder, 'devset.json'))
    testset_main = pd.read_json(os.path.join(in_main_dataset_folder, 'testset.json'))
    # language modelling task splits
    trainset_lm = pd.read_json(os.path.join(in_lm_dataset_folder, 'trainset.json'))
    devset_lm = pd.read_json(os.path.join(in_lm_dataset_folder, 'devset.json'))
    testset_lm = pd.read_json(os.path.join(in_lm_dataset_folder, 'testset.json'))

    with tf.Session() as sess:
        model, actual_config, vocab, char_vocab, label_vocab = init_model(
            trainset_main, in_model_folder, resume, in_config, sess)
        rev_vocab = {word_id: word for word, word_id in vocab.iteritems()}
        rev_label_vocab = {label_id: label
                           for label, label_id in label_vocab.iteritems()}

        _, ys_train_main = make_multitask_dataset(trainset_main, vocab, label_vocab, actual_config)
        X_dev_main, ys_dev_main = make_multitask_dataset(devset_main, vocab, label_vocab, actual_config)
        X_test_main, ys_test_main = make_multitask_dataset(testset_main, vocab, label_vocab, actual_config)
        # NOTE: the original snippet passed undefined X_train/X_dev/X_test to
        # post_train_lm; building them from the otherwise unused LM splits is
        # an assumption, not confirmed by the original code.
        X_train, ys_train = make_multitask_dataset(trainset_lm, vocab, label_vocab, actual_config)
        X_dev, ys_dev = make_multitask_dataset(devset_lm, vocab, label_vocab, actual_config)
        X_test, ys_test = make_multitask_dataset(testset_lm, vocab, label_vocab, actual_config)

        # proportional class weights for the tag task, smoothed and rescaled into [1, 5]
        y_train_flattened = np.argmax(ys_train_main[0], axis=-1)
        smoothing_coef = actual_config['class_weight_smoothing_coef']
        class_weight = get_class_weight_proportional(y_train_flattened,
                                                     smoothing_coef=smoothing_coef)
        scaler = MinMaxScaler(feature_range=(1, 5))
        class_weight_vector = scaler.fit_transform(
            np.array(map(itemgetter(1),
                         sorted(class_weight.items(), key=itemgetter(0)))).reshape(-1, 1)).flatten()

        post_train_lm(model,
                      (X_train, ys_train),
                      (X_dev, ys_dev),
                      (X_test, ys_test),
                      [(vocab, label_vocab, rev_label_vocab), (vocab, vocab, rev_vocab)],
                      in_model_folder,
                      actual_config['epochs_number'],
                      actual_config,
                      sess,
                      class_weights=[class_weight_vector, np.ones(len(vocab))])
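# Illustration (not part of the original module) of the class-weight rescaling
# performed in main() above, with toy numbers: a {class_id: weight} dict is
# sorted by class id and squashed into the [1, 5] range. Relies on the module's
# existing numpy / sklearn / itemgetter imports; the toy values are placeholders.
def _illustrate_class_weight_rescaling():
    toy_class_weight = {0: 0.2, 1: 3.7, 2: 11.0}  # hypothetical proportional weights
    toy_scaler = MinMaxScaler(feature_range=(1, 5))
    toy_vector = toy_scaler.fit_transform(
        np.array(map(itemgetter(1),
                     sorted(toy_class_weight.items(), key=itemgetter(0)))).reshape(-1, 1)).flatten()
    # toy_vector == [1.0, ~2.3, 5.0] -- one weight per class id, in class-id order
    return toy_vector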
def predict_babi_file(in_model,
                      vocabs_for_tasks,
                      dataset,
                      in_config,
                      in_session,
                      target_file_path=None):
    if target_file_path:
        target_file = open(target_file_path, "w")

    # eval tags --> RNN tags
    X, ys_for_tasks = make_multitask_dataset(dataset,
                                             vocabs_for_tasks[0][0],
                                             vocabs_for_tasks[0][1],
                                             in_config)
    predictions = predict(in_model, (X, ys_for_tasks), [vocabs_for_tasks[0][-1]], in_session)

    predictions_eval = []
    global_word_index = 0
    broken_sequences_number = 0
    # RNN tags --> eval tags
    for utterance in dataset['utterance']:
        current_tags = predictions[global_word_index:global_word_index + len(utterance)]
        try:
            current_tags_eval = convert_from_inc_disfluency_tags_to_eval_tags(
                current_tags, utterance, representation="disf1")
        except Exception:
            # fall back to the raw RNN tags if the conversion fails
            current_tags_eval = current_tags
            broken_sequences_number += 1
        predictions_eval += current_tags_eval
        global_word_index += len(utterance)
    print '#broken sequences after RNN --> eval conversion: {} out of {}'.format(
        broken_sequences_number, dataset.shape[0])

    predictions_eval_iter = iter(predictions_eval)
    for speaker, (_, speaker_data) in enumerate(dataset.iterrows()):
        if target_file_path:
            target_file.write("Speaker: " + str(speaker) + "\n\n")
        timing_data, lex_data, pos_data, labels = (create_fake_timings(len(speaker_data['utterance'])),
                                                   speaker_data['utterance'],
                                                   speaker_data['pos'],
                                                   speaker_data['tags'])
        for i in range(len(timing_data)):
            _, end = timing_data[i]
            word = lex_data[i]
            pos = pos_data[i]
            predicted_tags = [next(predictions_eval_iter)]
            current_time = end
            if target_file_path:
                target_file.write("Time: " + str(current_time) + "\n")
            new_words = lex_data[i - (len(predicted_tags) - 1):i + 1]
            new_pos = pos_data[i - (len(predicted_tags) - 1):i + 1]
            new_timings = timing_data[i - (len(predicted_tags) - 1):i + 1]
            if target_file_path:
                for t, w, p, tag in zip(new_timings, new_words, new_pos, predicted_tags):
                    target_file.write("\t".join([str(t[0]), str(t[1]), w, p, tag]))
                    target_file.write("\n")
                target_file.write("\n")
        if target_file_path:
            target_file.write("\n")
    if target_file_path:
        target_file.close()
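# create_fake_timings is defined elsewhere in the repo; the sketch below is an
# assumption based only on how it is called in this file (one (start, end) pair
# per token, consumed as "_, end = ..." and str(t[0]) / str(t[1])), not the
# repo's real implementation.
def create_fake_timings(in_tokens_number):
    return [(float(i), float(i + 1)) for i in range(in_tokens_number)]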
def filter_line(in_line, in_model, in_vocabs_for_tasks, in_config, in_session):
    tokens = unicode(in_line.lower()).split()
    # wrap the single utterance into the same dataframe format the training data uses,
    # with placeholder fluent tags
    dataset = pd.DataFrame({'utterance': [tokens],
                            'tags': [['<f/>'] * len(tokens)],
                            'pos': [pos_tag(tokens)]})
    (tag_vocab, tag_label_vocab, tag_rev_label_vocab) = in_vocabs_for_tasks[0]
    X_line, ys_line = make_multitask_dataset(dataset, tag_vocab, tag_label_vocab, in_config)
    result_tokens = predict(in_model, (X_line, ys_line), in_vocabs_for_tasks, in_session, batch_size=1)
    return ' '.join(result_tokens)
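# Hypothetical interactive wrapper around filter_line() (not part of the
# original module): reads utterances from stdin and prints the predicted tag
# sequence for each; assumes the model, task vocabularies, config and TF
# session have already been initialised elsewhere.
import sys


def interactive_filter(in_model, in_vocabs_for_tasks, in_config, in_session):
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        print filter_line(line, in_model, in_vocabs_for_tasks, in_config, in_session)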
def predict_increco_file(in_model,
                         vocabs_for_tasks,
                         source_file_path,
                         in_config,
                         in_session,
                         target_file_path=None,
                         is_asr_results_file=False):
    """Return the incremental output in an increco style
    given the incoming words + POS. E.g.:

    Speaker: KB3_1

    Time: 1.50
    KB3_1:1    0.00    1.12    $unc$yes    NNP    <f/><tc/>

    Time: 2.10
    KB3_1:1    0.00    1.12    $unc$yes    NNP    <rms id="1"/><tc/>
    KB3_1:2    1.12    2.00    because     IN     <rps id="1"/><cc/>

    Time: 2.5
    KB3_1:2    1.12    2.00    because     IN     <rps id="1"/><rpndel id="1"/><cc/>

    from an ASR increco style input without the POS tags,
    or a normal style disfluency detection ground truth corpus:

    Speaker: KB3_1
    KB3_1:1    0.00    1.12    $unc$yes    NNP    <rms id="1"/><tc/>
    KB3_1:2    1.12    2.00    $because    IN     <rps id="1"/><cc/>
    KB3_1:3    2.00    3.00    because     IN     <f/><cc/>
    KB3_1:4    3.00    4.00    theres      EXVBZ  <f/><cc/>
    KB3_1:6    4.00    5.00    a           DT     <f/><cc/>
    KB3_1:7    6.00    7.10    pause       NN     <f/><cc/>

    :param source_file_path: str, file path to the input file
    :param target_file_path: str, file path to output in the above format
    :param is_asr_results_file: bool, whether the input is increco style
    """
    if target_file_path:
        target_file = open(target_file_path, "w")

    if 'timings' in source_file_path:
        print "input file has timings"
        if not is_asr_results_file:
            dialogues = []
            IDs, timings, words, pos_tags, labels = \
                get_tag_data_from_corpus_file(source_file_path)
            for dialogue, a, b, c, d in zip(IDs, timings, words, pos_tags, labels):
                dialogues.append((dialogue, (a, b, c, d)))
    else:
        print "no timings in input file, creating fake timings"
        raise NotImplementedError

    # collecting a single dataset for the model to predict in batches
    utterances, tags, pos = [], [], []
    for speaker, speaker_data in dialogues:
        timing_data, lex_data, pos_data, labels = speaker_data
        # iterate through the utterances; a "<t" label opens a new one
        for i in range(len(timing_data)):
            _, end = timing_data[i]
            if "<t" in labels[i]:
                utterances.append([])
                tags.append([])
                pos.append([])
            utterances[-1].append(lex_data[i])
            tags[-1].append(labels[i])
            pos[-1].append(pos_data[i])

    # eval tags --> RNN tags
    dataset = pd.DataFrame({
        'utterance': utterances,
        'tags': [convert_from_eval_tags_to_inc_disfluency_tags(tags_i,
                                                               words_i,
                                                               representation="disf1")
                 for tags_i, words_i in zip(tags, utterances)],
        'pos': pos})
    X, ys_for_tasks = make_multitask_dataset(dataset,
                                             vocabs_for_tasks[0][0],
                                             vocabs_for_tasks[0][1],
                                             in_config)
    predictions = predict(in_model, (X, ys_for_tasks), [vocabs_for_tasks[0][-1]], in_session)

    predictions_eval = []
    global_word_index = 0
    broken_sequences_number = 0
    # RNN tags --> eval tags
    for utterance in utterances:
        current_tags = predictions[global_word_index:global_word_index + len(utterance)]
        try:
            current_tags_eval = convert_from_inc_disfluency_tags_to_eval_tags(
                current_tags, utterance, representation="disf1")
        except Exception:
            # fall back to the raw RNN tags if the conversion fails
            current_tags_eval = current_tags
            broken_sequences_number += 1
        predictions_eval += current_tags_eval
        global_word_index += len(utterance)
    print '#broken sequences after RNN --> eval conversion: {} out of {}'.format(
        broken_sequences_number, len(utterances))

    predictions_eval_iter = iter(predictions_eval)
    for speaker, speaker_data in dialogues:
        if target_file_path:
            target_file.write("Speaker: " + str(speaker) + "\n\n")
        timing_data, lex_data, pos_data, labels = speaker_data
        for i in range(len(timing_data)):
            _, end = timing_data[i]
            word = lex_data[i]
            pos = pos_data[i]
            predicted_tags = [next(predictions_eval_iter)]
            current_time = end
            if target_file_path:
                target_file.write("Time: " + str(current_time) + "\n")
            new_words = lex_data[i - (len(predicted_tags) - 1):i + 1]
            new_pos = pos_data[i - (len(predicted_tags) - 1):i + 1]
            new_timings = timing_data[i - (len(predicted_tags) - 1):i + 1]
            if target_file_path:
                for t, w, p, tag in zip(new_timings, new_words, new_pos, predicted_tags):
                    target_file.write("\t".join([str(t[0]), str(t[1]), w, p, tag]))
                    target_file.write("\n")
                target_file.write("\n")
        if target_file_path:
            target_file.write("\n")
    if target_file_path:
        target_file.close()
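# Hypothetical command-line entry point (not part of the original module); the
# argument names and the JSON config loading are assumptions about how main()
# would be driven.
if __name__ == '__main__':
    import argparse
    import json

    parser = argparse.ArgumentParser(
        description='Post-train the multitask disfluency tagger on the LM task')
    parser.add_argument('main_dataset_folder')
    parser.add_argument('lm_dataset_folder')
    parser.add_argument('model_folder')
    parser.add_argument('--config', default='config.json')
    parser.add_argument('--resume', action='store_true')
    args = parser.parse_args()

    with open(args.config) as config_in:
        config = json.load(config_in)
    main(args.main_dataset_folder,
         args.lm_dataset_folder,
         args.model_folder,
         args.resume,
         config)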