def get_test_examples(df_path, tokenizer):
    """Loads the test split and tokenizes sources and multi-reference targets."""
    data_dict = load_data_from_file(df_path)
    text_src = data_dict['defused_sent']
    exact_text_trg = data_dict['exact_fused_sent']  # not used by this function
    many_text_trg = data_dict['many_fused_sents']

    # Tokenize each source sentence, keeping both the id sequence and the raw line.
    src_text = []
    src_lines = []
    for src in text_src:
        src_line = tokenization.convert_to_unicode(src)
        tokens = tokenizer.tokenize(src_line)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        src_text.append(ids)
        src_lines.append(src_line)

    # Each source has a list of reference fusions; tokenize every reference.
    trg_text = []
    trg_lines = []
    for tgt in many_text_trg:
        cur_trg_text = []
        cur_trg_lines = []
        for cur_tgt in tgt:
            tgt_line = tokenization.convert_to_unicode(cur_tgt)
            tokens = tokenizer.tokenize(tgt_line)
            ids = tokenizer.convert_tokens_to_ids(tokens)
            cur_trg_text.append(ids)
            cur_trg_lines.append(tgt_line)
        trg_text.append(cur_trg_text)
        trg_lines.append(cur_trg_lines)

    return src_text, src_lines, trg_text, trg_lines
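# Minimal usage sketch for get_test_examples. The vocab path and test file
# name are placeholders, and `load_data_from_file` is assumed to return a
# dict with the 'defused_sent', 'exact_fused_sent', and 'many_fused_sents'
# keys; the tokenizer follows the standard BERT tokenization.FullTokenizer API.
def _demo_get_test_examples():
    tokenizer = tokenization.FullTokenizer(
        vocab_file="vocab.txt", do_lower_case=True)  # hypothetical path/flag
    src_ids, src_lines, trg_ids, trg_lines = get_test_examples(
        "test_data.pkl", tokenizer)  # hypothetical file name
    # Each source aligns with a list of reference fusions.
    assert len(src_ids) == len(trg_ids)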
def _create_examples(self, lines, set_type):
    """Creates examples for the training and dev sets."""
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue
        guid = "%s-%s" % (set_type, i)
        text_a = tokenization.convert_to_unicode(line[2])
        text_b = tokenization.convert_to_unicode(line[1])
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b))
    return examples
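# Sketch of how `lines` is typically produced for this processor. This
# mirrors the _read_tsv helper in BERT-style DataProcessor classes (an
# assumption here; the repo's own reader may differ). Row 0 is treated as
# a header above, and columns 2 and 1 supply text_a and text_b, so each
# row needs at least three tab-separated fields.
import csv

def _read_tsv_sketch(input_file):
    # One list of fields per row, tab-delimited.
    with open(input_file, "r") as f:
        return list(csv.reader(f, delimiter="\t"))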
def _create_examples(self, src_lines, tgt_lines, type_labels, conn_labels, set_type):
    """Creates examples with fusion-type and connective labels."""
    examples = []
    for i, (src, tgt) in enumerate(zip(src_lines, tgt_lines)):
        guid = "%s-%s" % (set_type, i)
        # Join each token list back into a single string before conversion.
        src_text = tokenization.convert_to_unicode(" ".join(src))
        tgt_text = tokenization.convert_to_unicode(" ".join(tgt))
        type_label = type_labels[i]
        conn_label = conn_labels[i]
        examples.append(
            InputExample(guid=guid, text_a=src_text, text_b=tgt_text,
                         type_label=type_label, conn_label=conn_label))
    return examples
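# Hypothetical call of the labeled _create_examples: token lists per
# sentence plus parallel label arrays. All of the example data below is
# invented for illustration; only the argument layout and the
# type_label/conn_label fields are grounded in the signature above.
def _demo_labeled_examples(processor):
    src = [["he", "was", "tired", ".", "he", "slept", "."],
           ["she", "left", ".", "it", "rained", "."]]
    tgt = [["he", "was", "tired", ",", "so", "he", "slept", "."],
           ["she", "left", "because", "it", "rained", "."]]
    type_labels = [3, 7]   # hypothetical fusion-type ids
    conn_labels = [12, 4]  # hypothetical connective ids
    return processor._create_examples(src, tgt, type_labels, conn_labels, "train")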