def get_token_counts(data_path, max_sent, max_highlight):
    doc_paths = [os.path.join(data_path, file)
                 for file in os.listdir(data_path)]
    num_docs = len(doc_paths)
    counts_inp = defaultdict(int)
    counts_hl = defaultdict(int)
    for i, doc_path in enumerate(doc_paths, 1):
        sys.stdout.write("\r {:d} / {:d} ( {:7.4f}% ) ".format(
            i, num_docs, 100 * i / num_docs))
        sys.stdout.flush()
        doc = read_document(doc_path)
        for sent in doc["sentences"][:max_sent]:
            tokens = preprocess_tokens(sent["tokens"], doc["entities"])
            for token in tokens:
                counts_inp[token] += 1
        for sent in doc["highlights"][:max_highlight]:
            tokens = preprocess_tokens(sent["tokens"], doc["entities"])
            for token in tokens:
                counts_hl[token] += 1
    return counts_inp, counts_hl
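# A minimal sketch of how the counts returned above might be turned into
# fixed-size vocabularies. The build_vocab name, the size cutoffs, and the
# exact truncation limits in the usage note are assumptions, not part of the
# original code; only the __UNK__ token appears elsewhere in this repo.
def build_vocab(counts, max_size, unk_token="__UNK__"):
    # Keep the most frequent tokens, reserving index 0 for the unknown token.
    most_common = sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    vocab2id = {unk_token: 0}
    for token, _ in most_common[:max_size - 1]:
        vocab2id[token] = len(vocab2id)
    return vocab2id

# Hypothetical usage:
#   counts_inp, counts_hl = get_token_counts(data_path, 25, 4)
#   vocab2id_in = build_vocab(counts_inp, 80000)
#   vocab2id_out = build_vocab(counts_hl, 20000)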
def process_document(document_path, output_dir, lead):
    output_path = os.path.join(output_dir, os.path.split(document_path)[1])
    doc = read_document(document_path)
    summary_text = build_summary(doc, lead)
    with open(output_path, "w") as f:
        f.write(summary_text)
def process_document(args):
    document_path, output_dir, rouge_settings = args
    output_path = os.path.join(output_dir, os.path.split(document_path)[1])
    doc = read_document(document_path)
    summary_text = build_summary(doc, rouge_settings)
    with open(output_path, "w") as f:
        f.write(summary_text)
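# The single `args` tuple above suggests this variant is meant to be mapped
# over a worker pool. A minimal driver sketch, assuming a multiprocessing.Pool
# and the repo's get_document_paths helper; the process_all_documents name,
# pool size, and rouge_settings value are placeholders, not original code.
from multiprocessing import Pool

def process_all_documents(data_dir, output_dir, rouge_settings, procs=8):
    jobs = [(path, output_dir, rouge_settings)
            for path in get_document_paths(data_dir)]
    with Pool(procs) as pool:
        # imap_unordered keeps memory flat and lets workers finish in any order.
        for _ in pool.imap_unordered(process_document, jobs):
            pass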
def test_read(data_path):
    doc_paths = [os.path.join(data_path, file)
                 for file in os.listdir(data_path)]
    doc_paths.sort()
    num_docs = len(doc_paths)
    bad_paths = list()
    for i, doc_path in enumerate(doc_paths, 1):
        sys.stdout.write("\r {:d} / {:d} ( {:7.4f}% ) ".format(
            i, num_docs, 100 * i / num_docs))
        sys.stdout.flush()
        try:
            read_document(doc_path)
        except ValueError as e:
            bad_paths.append((str(e), doc_path))
    return bad_paths
def process_document(args):
    document_path, output_dir, max_input, max_highlight, no_overwrite = args
    output_path = os.path.join(output_dir, os.path.split(document_path)[1])
    if no_overwrite and os.path.exists(output_path):
        return
    print(document_path)
    doc = read_document(document_path)
    meta = init_doc_meta(doc, max_input)
    data = list()
    for highlight in doc["highlights"][:max_highlight]:
        backbone, support, alignments = find_highlight_alignments(
            highlight, doc, meta)
        data.append([backbone, support, alignments])
    with open(output_path, "w") as f:
        f.write(yaml.dump(data))
def build_summary(doc_path, align_path, summary_path):
    backbones = []
    used = set()
    with open(align_path, "r") as f:
        alignments = yaml.load(f)
    for backbone, support, ta in alignments:
        if backbone is not None and backbone not in used:
            backbones.append(backbone)
            used.add(backbone)
    doc = read_document(doc_path)
    lines = list()
    for b in backbones:
        tokens = doc["sentences"][b]["tokens"]
        sent_str = " ".join(replace_entities(tokens, doc["entities"]))
        lines.append(sent_str)
    with open(summary_path, "w") as f:
        f.write("\n".join(lines))
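# A minimal usage sketch for build_summary, assuming documents and their
# alignment files share basenames across two directories (the same pairing
# the statistics code below relies on); the build_all_summaries name and the
# directory arguments are placeholders, not part of the original code.
def build_all_summaries(data_dir, alignments_dir, summary_dir):
    for doc_path in get_document_paths(data_dir):
        name = os.path.basename(doc_path)
        align_path = os.path.join(alignments_dir, name)
        build_summary(doc_path, align_path, os.path.join(summary_dir, name))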
def display_example(example):
    example = int(example)
    doc_path = app.config["DOC_PATHS"][example]
    align_path = app.config["ALIGN_PATHS"][example]
    doc = read_document(doc_path)
    doc_tokens = [replace_entities(s["tokens"], doc["entities"])
                  for s in doc["sentences"][:25]]
    highlight_tokens = [replace_entities(s["tokens"], doc["entities"])
                        for s in doc["highlights"][:4]]
    i = 0
    doc_token_ids = list()
    for tokens in doc_tokens:
        token_ids = list()
        for token in tokens:
            token_ids.append(i)
            i += 1
        doc_token_ids.append(token_ids)
    backbone_ids = list()
    alignments = list()
    with open(align_path, "r") as f:
        # each entry: (backbone, support, alignments)
        data = yaml.load(f)
    for backbone, support, alignment in data:
        if backbone is not None:
            backbone_ids.append(doc_token_ids[backbone])
        else:
            backbone_ids.append(list())
        alignments.append(alignment)
    return render_template(
        "default.html",
        doc_tokens=doc_tokens,
        highlights=highlight_tokens,
        alignments=alignments,
        alignments_json=json.dumps(alignments),
        backbone_ids=json.dumps(backbone_ids))
def main():
    import argparse
    hlp = "View a random document"
    parser = argparse.ArgumentParser(description=hlp)
    parser.add_argument('--corpus', required=True, help="Corpus to use.",
                        choices=["dailymail", "cnn"])
    parser.add_argument('--data-path', required=True,
                        help="Path to Cheng & Lapata data.")
    parser.add_argument('--split', required=True, help="Data split to use.",
                        choices=["train", "dev", "test"])
    parser.add_argument('--replace-entities', default=False,
                        action="store_true")
    parser.add_argument('--pproc', default=False, action="store_true")
    args = parser.parse_args()

    arg2split = {"test": "test", "train": "training", "dev": "validation"}
    split = arg2split[args.split]
    data_path = os.path.join(args.data_path, args.corpus, split)
    doc_paths = [os.path.join(data_path, file)
                 for file in os.listdir(data_path)]
    doc_paths.sort()
    random.shuffle(doc_paths)
    doc = read_document(doc_paths[0])

    print("url")
    print("===")
    print(doc["url"])

    print("\nINPUT")
    print("=====")
    for s, sent in enumerate(doc["sentences"], 1):
        tokens = sent["tokens"]
        if args.pproc:
            tokens = preprocess_tokens(tokens, doc["entities"])
        if args.replace_entities:
            tokens = replace_entities(tokens, doc["entities"])
        sent_str = " ".join(tokens)
        line = "{}) [{}] {}".format(s, sent["score"], sent_str)
        print(textwrap.fill(line, subsequent_indent=" "))

    print("\nENTITIES")
    print("========")
    for id, entity in sorted(doc["entities"].items(), key=lambda x: x[0]):
        print("{:10} :: {}".format(id, entity))

    print("\nHIGHLIGHTS")
    print("==========")
    for s, sent in enumerate(doc["highlights"], 1):
        tokens = sent["tokens"]
        if args.pproc:
            tokens = preprocess_tokens(tokens, doc["entities"])
        if args.replace_entities:
            tokens = replace_entities(tokens, doc["entities"])
        sent_str = " ".join(tokens)
        line = "{}) {}".format(s, sent_str)
        print(textwrap.fill(line, subsequent_indent=" "))
def collect_split_stats(data_dir, alignments_dir, vocab_out):
    document_paths = get_document_paths(data_dir)
    alignments_paths = get_document_paths(alignments_dir)
    backbone_counts = list()
    highlight_counts = list()
    support_counts = list()
    aligned_counts = list()
    unaligned_ent_counts = list()
    unaligned_counts = list()
    unaligned_common_counts = list()

    for doc_path, align_path in zip(document_paths, alignments_paths):
        if not os.path.basename(doc_path) == os.path.basename(align_path):
            raise Exception(
                "Alignments directory does not contain one file for every "
                "file in data path.")
        doc = read_document(doc_path)
        with open(align_path, "r") as f:
            alignments = yaml.load(f)

        backbone_count = 0
        for a in range(len(alignments)):
            backbone, support, token_alignments = alignments[a]
            if backbone is not None:
                backbone_count += 1
                support_counts.append(len(support))
            highlight_tokens = doc["highlights"][a]["tokens"]
            pp_highlight_tokens = preprocess_tokens(
                highlight_tokens, doc["entities"])
            aligned_tokens = list()
            unaligned_tokens = list()
            unaligned_common_tokens = list()
            unaligned_entity_tokens = list()
            for token, align in zip(pp_highlight_tokens, token_alignments):
                if align == unk_id or align == sw_id:
                    unaligned_tokens.append(token)
                    if token in vocab_out:
                        unaligned_common_tokens.append(token)
                    elif token == "__ENTITY__":
                        unaligned_entity_tokens.append(token)
                else:
                    aligned_tokens.append(token)
            unaligned_ent_counts.append(len(unaligned_entity_tokens))
            aligned_counts.append(len(aligned_tokens))
            unaligned_counts.append(len(unaligned_tokens))
            unaligned_common_counts.append(len(unaligned_common_tokens))
        backbone_counts.append(backbone_count)
        highlight_counts.append(len(alignments))

    print("% highlights w/o alignments",
          1 - np.sum(backbone_counts) / np.sum(highlight_counts))
    print("macro avg. support", np.mean(support_counts))

    aligned_counts = np.array(aligned_counts)
    unaligned_counts = np.array(unaligned_counts)
    unaligned_common_counts = np.array(unaligned_common_counts)
    unaligned_ent_counts = np.array(unaligned_ent_counts)
    total_tokens = aligned_counts + unaligned_counts
    macro_avg_align_recall = (aligned_counts / total_tokens).mean()
    micro_avg_align_recall = aligned_counts.sum() / total_tokens.sum()
    macro_avg_unalign_recall = (unaligned_common_counts / total_tokens).mean()
    micro_avg_unalign_recall = (
        unaligned_common_counts.sum() / total_tokens.sum())
    macro_avg_unalign_ent_recall = (
        unaligned_ent_counts / total_tokens).mean()
    macro_avg_max_recall = (
        (unaligned_common_counts + aligned_counts) / total_tokens).mean()
    micro_avg_max_recall = (
        (unaligned_common_counts.sum() + aligned_counts.sum())
        / total_tokens.sum())

    print("avg. token count", total_tokens.mean())
    print("macro avg. align. recall", macro_avg_align_recall)
    print("micro avg. align. recall", micro_avg_align_recall)
    print("macro avg. unalign. recall", macro_avg_unalign_recall)
    print("micro avg. unalign. recall", micro_avg_unalign_recall)
    print("macro avg. unalign. ent recall", macro_avg_unalign_ent_recall)
    print("macro avg. max recall", macro_avg_max_recall)
    print("micro avg. max recall", micro_avg_max_recall)
def process_example(doc_path, align_path):
    print(doc_path)
    doc = read_document(doc_path)
    sent2token_ids = list()
    sent2pretty_tokens = list()
    sent2tokens = list()
    id = 0
    for sent in doc["sentences"]:
        token_ids = list()
        pretty_tokens = replace_entities(sent["tokens"], doc["entities"])
        pp_tokens = preprocess_tokens(sent["tokens"], doc["entities"])
        for token in pretty_tokens:
            token_ids.append(id)
            id += 1
        sent2token_ids.append(token_ids)
        sent2pretty_tokens.append(pretty_tokens)
        sent2tokens.append(pp_tokens)

    hl_tokens_pretty = replace_entities(doc["highlights"][0]["tokens"],
                                        doc["entities"])
    hl_tokens = preprocess_tokens(doc["highlights"][0]["tokens"],
                                  doc["entities"])

    with open(align_path, "r") as f:
        backbone, supports, alignments = yaml.load(f)[0]

    token_ids_flat = list(["<S>"])
    token_ids_flat.extend(sent2token_ids[backbone])
    pretty_tokens_flat = list(["<S>"])
    pretty_tokens_flat.extend(sent2pretty_tokens[backbone])
    tokens_flat = list(["<S>"])
    tokens_flat.extend(sent2tokens[backbone])
    for support in supports:
        token_ids_flat.append("<B>")
        token_ids_flat.extend(sent2token_ids[support])
        pretty_tokens_flat.append("<B>")
        pretty_tokens_flat.extend(sent2pretty_tokens[support])
        tokens_flat.append("<B>")
        tokens_flat.extend(sent2tokens[support])

    relative_alignments = list()
    for i, a in enumerate(alignments):
        if a > -1:
            index = token_ids_flat.index(a)
            relative_alignments.append(index)
        else:
            if hl_tokens[i] in vocab2id_out:
                relative_alignments.append(-1)
            else:
                relative_alignments.append(-99)

    print()
    print(len(supports))
    print(pretty_tokens_flat)
    print(hl_tokens_pretty)
    print(relative_alignments)
    print([pretty_tokens_flat[a] if a > -1 else -1
           for a in relative_alignments])
    print([a + len(vocab2id_out) if a > -1 else a
           for a in relative_alignments])

    relative_alignments = list()
    for i, a in enumerate(alignments):
        if a > -1:
            index = token_ids_flat.index(a)
            relative_alignments.append(index + len(id2vocab_out))
        else:
            if hl_tokens[i] in vocab2id_out:
                relative_alignments.append(vocab2id_out[hl_tokens[i]])
            else:
                relative_alignments.append(vocab2id_out["__UNK__"])
    print(relative_alignments)

    backbone_data_items = list()
    backbone_data_items.append(vocab2id_in.get("<S>"))
    for token in sent2tokens[backbone]:
        backbone_data_items.append(
            vocab2id_in.get(token, vocab2id_in["__UNK__"]))
    backbone_data_str = " ".join(str(i) for i in backbone_data_items)
    print(sent2tokens[backbone])
    print([vocab2id_in.get(token, vocab2id_in["__UNK__"])
           for token in sent2tokens[backbone]])
    print(backbone_data_str)
    print()

    support_data_items = list()
    for support in supports:
        print(sent2tokens[support])
        print([vocab2id_in.get(token, vocab2id_in["__UNK__"])
               for token in sent2tokens[support]])
        print()
        support_data_items.append(vocab2id_in["<B>"])
        for token in sent2tokens[support]:
            support_data_items.append(
                vocab2id_in.get(token, vocab2id_in["__UNK__"]))
    support_data_items.append(vocab2id_in["<B>"])
    support_data_str = " ".join(str(i) for i in support_data_items)

    relative_alignments = ([vocab2id_out["<D>"]] + relative_alignments
                           + [vocab2id_out["<E>"]])
    target_data_str = " ".join(str(i) for i in relative_alignments)

    print("THEDATA")
    print("=======")
    print(backbone_data_str)
    print(support_data_str)
    print(target_data_str)
    print()
    print([id2vocab_in[i] for i in backbone_data_items])
    print([id2vocab_in[i] for i in support_data_items])
    print([i if i < len(id2vocab_out)
           else pretty_tokens_flat[i - len(id2vocab_out)]
           for i in relative_alignments])

    return " | ".join([backbone_data_str, support_data_str, target_data_str])
def collect_split_stats(data_path):
    doc_paths = [os.path.join(data_path, file)
                 for file in os.listdir(data_path)]
    num_docs = len(doc_paths)
    num_highlights = list()
    num_inputs = list()
    num_input_tokens = list()
    num_highlight_tokens = list()
    doc_len_tokens = list()
    doc_len_tokens_trunc = list()
    ref_len_tokens = list()
    num_ref_trunc75_tokens = list()
    num_ref_trunc250_tokens = list()
    num_ref_truncNA_tokens = list()
    num_ref_trunc75_sents = list()
    num_ref_trunc250_sents = list()
    num_ref_truncNA_sents = list()

    for i, doc_path in enumerate(doc_paths, 1):
        sys.stdout.write("\r {:d} / {:d} ( {:7.4f}% ) ".format(
            i, num_docs, 100 * i / num_docs))
        sys.stdout.flush()
        doc = read_document(doc_path)
        num_highlights.append(len(doc["highlights"]))
        num_inputs.append(len(doc["sentences"]))

        doc_i_len_tokens = 0
        doc_i_len_tokens_trunc = 0
        for s, sent in enumerate(doc["sentences"]):
            tokens = replace_entities(sent["tokens"], doc["entities"])
            num_input_tokens.append(len(tokens))
            doc_i_len_tokens += len(tokens)
            if s < 25:
                doc_i_len_tokens_trunc += len(tokens)
        doc_len_tokens.append(doc_i_len_tokens)
        doc_len_tokens_trunc.append(doc_i_len_tokens_trunc)

        ref_i_len_tokens = 0
        hl_tokens = list()
        hl_tokens_flat = list()
        for highlight in doc["highlights"]:
            tokens = replace_entities(highlight["tokens"], doc["entities"])
            num_highlight_tokens.append(len(tokens))
            hl_tokens.append(tokens)
            hl_tokens_flat.extend(tokens)
            ref_i_len_tokens += len(tokens)
        ref_len_tokens.append(ref_i_len_tokens)

        ref_text = "\n".join([" ".join(tokens) for tokens in hl_tokens])
        ref_text_flat = " ".join(hl_tokens_flat)

        ref_trunc75 = ref_text[:75]
        ref_trunc75_flat = ref_text_flat[:75]
        num_ref_trunc75_tokens.append(len(ref_trunc75_flat.split()))
        num_ref_trunc75_sents.append(len(ref_trunc75.split("\n")))

        ref_trunc250 = ref_text[:250]
        ref_trunc250_flat = ref_text_flat[:250]
        num_ref_trunc250_tokens.append(len(ref_trunc250_flat.split()))
        num_ref_trunc250_sents.append(len(ref_trunc250.split("\n")))

        ref_truncNA = ref_text
        ref_truncNA_flat = ref_text_flat
        num_ref_truncNA_tokens.append(len(ref_truncNA_flat.split()))
        num_ref_truncNA_sents.append(len(ref_truncNA.split("\n")))

    sys.stdout.write("\n")
    sys.stdout.flush()

    percentiles = [20, 30, 40, 50, 60, 70, 80, 90, 95, 99]

    def make_data_row(data):
        row_data = [np.mean(data), np.median(data), np.std(data),
                    np.max(data)]
        row_data.extend(np.percentile(data, percentiles))
        return row_data

    df_data = list()
    df_data.append(make_data_row(num_inputs))
    df_data.append(make_data_row(doc_len_tokens))
    df_data.append(make_data_row(doc_len_tokens_trunc))
    df_data.append(make_data_row(num_input_tokens))
    df_data.append(make_data_row(num_highlights))
    df_data.append(make_data_row(ref_len_tokens))
    df_data.append(make_data_row(num_highlight_tokens))
    df_data.append(make_data_row(num_ref_trunc75_sents))
    df_data.append(make_data_row(num_ref_trunc75_tokens))
    df_data.append(make_data_row(num_ref_trunc250_sents))
    df_data.append(make_data_row(num_ref_trunc250_tokens))
    df_data.append(make_data_row(num_ref_truncNA_sents))
    df_data.append(make_data_row(num_ref_truncNA_tokens))

    columns = pd.MultiIndex.from_tuples(
        [("", "mean"), ("", "median"), ("", "std"), ("", "max")] +
        [("Percentile", "{}th".format(p)) for p in percentiles])
    index = [
        "inp. len. (sents.)", "inp. len. (tok.)",
        "inp. len. trunc25sent (tok.)", "inp. sent. len. (toks.)",
        "hl. len. (sents.)", "hl. len. (tok.)", "hl. sent. len. (toks.)",
        "ref[:75] len. (sents.)", "ref[:75] len. (tok.)",
        "ref[:250] len. (sents.)", "ref[:250] len. (tok.)",
        "ref[:+inf] len. (sents.)", "ref[:+inf] len. (tok.)",
    ]

    df = pd.DataFrame(df_data, columns=columns, index=index)
    df_str_lines = str(df).split("\n")
    print("\n".join(df_str_lines[:2]) + "\n")
    print("\n".join(df_str_lines[2:6]) + "\n")
    print("\n".join(df_str_lines[6:9]) + "\n")
    print("\n".join(df_str_lines[9:11]) + "\n")
    print("\n".join(df_str_lines[11:13]) + "\n")
    print("\n".join(df_str_lines[13:15]) + "\n")