def get_input_sentences(doc, sent_limit):
    """Return the first `sent_limit` document sentences as strings, with
    entity ids replaced by their surface forms."""
    sentences = list()
    for sentence in doc["sentences"][:sent_limit]:
        sent_tokens = replace_entities(sentence["tokens"], doc["entities"])
        sentences.append(" ".join(sent_tokens))
    return sentences
def build_summary(doc, lead):
    """Build a lead baseline summary from the first `lead` sentences."""
    sents = list()
    for sent in doc["sentences"][:lead]:
        sent_tokens = replace_entities(sent["tokens"], doc["entities"])
        sents.append(" ".join(sent_tokens))
    return "\n".join(sents)
def build_summary(doc):
    """Join the reference highlights into a single summary string."""
    highlights = list()
    for highlight in doc["highlights"]:
        highlight_tokens = replace_entities(highlight["tokens"], doc["entities"])
        highlights.append(" ".join(highlight_tokens))
    return "\n".join(highlights)
def find_highlight_alignments(highlight, doc, meta):
    """Align a single highlight to its supporting document sentences.

    Returns the backbone sentence index (the support sentence covering the
    most highlight tokens), the remaining support sentence indices, and the
    per-token alignments."""
    id2token, id2sent, sent2tokens, sent2token_ids, doc_token_sets = meta

    highlight_tokens = replace_entities(highlight["tokens"], doc["entities"])
    highlight_token_set = set(highlight_tokens)

    # Find directly quoted spans first; their support sentences already
    # account for the quoted tokens, so remove those tokens before scoring
    # the remaining sentences.
    quotes, quote_support = find_quotes(
        highlight_tokens, sent2tokens, sent2token_ids)
    for s in quote_support:
        highlight_token_set -= doc_token_sets[s]

    support = find_sentence_support(highlight_token_set, doc_token_sets)
    support.extend(quote_support)
    support.sort()

    src_tokens = [token for sent in support for token in sent2tokens[sent]]
    src_ids = [id for sent in support for id in sent2token_ids[sent]]

    raw_token_alignments = find_token_alignments(
        src_tokens, src_ids, highlight_tokens, quotes)
    token_alignments = rescore_alignments(raw_token_alignments, id2sent)
    fill_stopword_alignments(
        src_tokens, src_ids, highlight_tokens, token_alignments)

    # Count how many highlight tokens align to each sentence.
    sent_counts = defaultdict(int)
    for a in token_alignments:
        if a >= 0:
            sent_counts[id2sent[a]] += 1
    sent_counts = list(sent_counts.items())

    if len(sent_counts) > 0:
        # Sort counts first by sentence id, then by count. Since the sort is
        # stable, count ties are broken by the earliest occurring support
        # sentence.
        sent_counts.sort(key=lambda x: x[0])
        sent_counts.sort(key=lambda x: x[1], reverse=True)
        backbone = sent_counts[0][0]
        support = [sc[0] for sc in sent_counts[1:]]
        return backbone, support, token_alignments
    else:
        return None, list(), token_alignments
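# Example usage (a minimal sketch, not part of the original pipeline; it
# assumes `read_document` and `init_doc_meta` are importable from this
# project and that `path` points at a preprocessed CNN/DailyMail document):
#
#   doc = read_document(path)
#   meta = init_doc_meta(doc, max_sent=25)
#   for highlight in doc["highlights"]:
#       backbone, support, alignments = find_highlight_alignments(
#           highlight, doc, meta)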
def get_reference_file(doc):
    """Write the reference highlights to a temporary file and return it."""
    highlights = list()
    for highlight in doc["highlights"]:
        highlight_tokens = replace_entities(highlight["tokens"], doc["entities"])
        highlights.append(" ".join(highlight_tokens))
    ref_text = "\n".join(highlights)

    ref_file = NamedTemporaryFile("w", delete=False)
    ref_file.write(ref_text)
    ref_file.close()
    return ref_file
def display_example(example):
    """Render a document, its highlights, and their alignments."""
    example = int(example)
    doc_path = app.config["DOC_PATHS"][example]
    align_path = app.config["ALIGN_PATHS"][example]
    doc = read_document(doc_path)

    doc_tokens = [replace_entities(s["tokens"], doc["entities"])
                  for s in doc["sentences"][:25]]
    highlight_tokens = [replace_entities(s["tokens"], doc["entities"])
                        for s in doc["highlights"][:4]]

    # Assign each document token a global id, grouped by sentence.
    i = 0
    doc_token_ids = list()
    for tokens in doc_tokens:
        token_ids = list()
        for token in tokens:
            token_ids.append(i)
            i += 1
        doc_token_ids.append(token_ids)

    backbone_ids = list()
    alignments = list()
    with open(align_path, "r") as f:
        # Each item is (backbone, support, alignment).
        data = yaml.load(f, Loader=yaml.Loader)
    for backbone, support, alignment in data:
        if backbone is not None:
            backbone_ids.append(doc_token_ids[backbone])
        else:
            backbone_ids.append(list())
        alignments.append(alignment)

    return render_template(
        "default.html",
        doc_tokens=doc_tokens,
        highlights=highlight_tokens,
        alignments=alignments,
        alignments_json=json.dumps(alignments),
        backbone_ids=json.dumps(backbone_ids))
def build_summary(doc_path, align_path, summary_path):
    """Write an extractive summary built from each highlight's backbone
    sentence, skipping duplicates."""
    backbones = []
    used = set()
    with open(align_path, "r") as f:
        alignments = yaml.load(f, Loader=yaml.Loader)
    for backbone, support, ta in alignments:
        if backbone is not None and backbone not in used:
            backbones.append(backbone)
            used.add(backbone)

    doc = read_document(doc_path)
    lines = list()
    for b in backbones:
        tokens = doc["sentences"][b]["tokens"]
        sent_str = " ".join(replace_entities(tokens, doc["entities"]))
        lines.append(sent_str)

    with open(summary_path, "w") as f:
        f.write("\n".join(lines))
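# Example call (a sketch only; the file names and extensions here are
# assumptions about the on-disk layout, not paths used by the project):
#
#   build_summary("data/cnn/training/doc0001.summary",
#                 "alignments/doc0001.yaml",
#                 "summaries/doc0001.txt")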
def init_doc_meta(doc, max_sent):
    """Precompute per-document lookup tables used by the alignment routines."""
    id2token = list()
    id2sent = list()
    sent2token_ids = list()
    sent2tokens = list()
    token_sets = list()
    for s, sentence in enumerate(doc["sentences"][:max_sent]):
        tokens = replace_entities(sentence["tokens"], doc["entities"])
        # Global token ids continue across sentences.
        token_ids = [id for id, token in enumerate(tokens, len(id2token))]
        sent2token_ids.append(token_ids)
        sent2tokens.append(tokens)
        id2token.extend(tokens)
        id2sent.extend([s] * len(token_ids))
        token_sets.append(
            set([token for token in tokens if token not in stopwords]))
    return id2token, id2sent, sent2tokens, sent2token_ids, token_sets
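# Illustration (hypothetical two-sentence document, not from the dataset):
# with sentences ["the cat sat", "it slept"], the global token ids are 0-2
# for sentence 0 and 3-4 for sentence 1, so id2sent == [0, 0, 0, 1, 1] and
# sent2token_ids == [[0, 1, 2], [3, 4]] (modulo entity replacement and the
# stopword filtering applied when building token_sets).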
def main():
    import argparse
    hlp = "View a random document"
    parser = argparse.ArgumentParser(hlp)
    parser.add_argument('--corpus', required=True, help="Corpus to use.",
                        choices=["dailymail", "cnn"])
    parser.add_argument('--data-path', required=True,
                        help="Path to Cheng&Lapata data.")
    parser.add_argument('--split', required=True, help="Data split to use.",
                        choices=["train", "dev", "test"])
    parser.add_argument('--replace-entities', default=False,
                        action="store_true")
    parser.add_argument('--pproc', default=False, action="store_true")
    args = parser.parse_args()

    arg2split = {"test": "test", "train": "training", "dev": "validation"}
    split = arg2split[args.split]
    data_path = os.path.join(args.data_path, args.corpus, split)
    doc_paths = [os.path.join(data_path, file)
                 for file in os.listdir(data_path)]
    doc_paths.sort()
    random.shuffle(doc_paths)

    doc = read_document(doc_paths[0])

    print("url")
    print("===")
    print(doc["url"])

    print("\nINPUT")
    print("=====")
    for s, sent in enumerate(doc["sentences"], 1):
        tokens = sent["tokens"]
        if args.pproc:
            tokens = preprocess_tokens(tokens, doc["entities"])
        if args.replace_entities:
            tokens = replace_entities(tokens, doc["entities"])
        sent_str = " ".join(tokens)
        line = "{}) [{}] {}".format(s, sent["score"], sent_str)
        print(textwrap.fill(line, subsequent_indent=" "))

    print("\nENTITIES")
    print("========")
    for id, entity in sorted(doc["entities"].items(), key=lambda x: x[0]):
        print("{:10} :: {}".format(id, entity))

    print("\nHIGHLIGHTS")
    print("==========")
    for s, sent in enumerate(doc["highlights"], 1):
        tokens = sent["tokens"]
        if args.pproc:
            tokens = preprocess_tokens(tokens, doc["entities"])
        if args.replace_entities:
            tokens = replace_entities(tokens, doc["entities"])
        sent_str = " ".join(tokens)
        line = "{}) {}".format(s, sent_str)
        print(textwrap.fill(line, subsequent_indent=" "))
def process_example(doc_path, align_path):
    """Convert one document/alignment pair into the pipe-delimited
    backbone | support | target training format."""
    print(doc_path)
    doc = read_document(doc_path)

    # Per-sentence token ids, display tokens, and preprocessed tokens.
    sent2token_ids = list()
    sent2pretty_tokens = list()
    sent2tokens = list()
    id = 0
    for sent in doc["sentences"]:
        token_ids = list()
        pretty_tokens = replace_entities(sent["tokens"], doc["entities"])
        pp_tokens = preprocess_tokens(sent["tokens"], doc["entities"])
        for token in pretty_tokens:
            token_ids.append(id)
            id += 1
        sent2token_ids.append(token_ids)
        sent2pretty_tokens.append(pretty_tokens)
        sent2tokens.append(pp_tokens)

    hl_tokens_pretty = replace_entities(
        doc["highlights"][0]["tokens"], doc["entities"])
    hl_tokens = preprocess_tokens(
        doc["highlights"][0]["tokens"], doc["entities"])

    with open(align_path, "r") as f:
        backbone, supports, alignments = yaml.load(f, Loader=yaml.Loader)[0]

    # Flatten the backbone and support sentences into a single sequence,
    # delimited by <S> and <B> markers.
    token_ids_flat = ["<S>"]
    token_ids_flat.extend(sent2token_ids[backbone])
    pretty_tokens_flat = ["<S>"]
    pretty_tokens_flat.extend(sent2pretty_tokens[backbone])
    tokens_flat = ["<S>"]
    tokens_flat.extend(sent2tokens[backbone])
    for support in supports:
        token_ids_flat.append("<B>")
        token_ids_flat.extend(sent2token_ids[support])
        pretty_tokens_flat.append("<B>")
        pretty_tokens_flat.extend(sent2pretty_tokens[support])
        tokens_flat.append("<B>")
        tokens_flat.extend(sent2tokens[support])

    # Debug pass: mark copied tokens by their flat index, in-vocabulary
    # tokens by -1, and out-of-vocabulary tokens by -99.
    relative_alignments = list()
    for i, a in enumerate(alignments):
        if a > -1:
            index = token_ids_flat.index(a)
            relative_alignments.append(index)
        else:
            if hl_tokens[i] in vocab2id_out:
                relative_alignments.append(-1)
            else:
                relative_alignments.append(-99)
    print()
    print(len(supports))
    print(pretty_tokens_flat)
    print(hl_tokens_pretty)
    print(relative_alignments)
    print([pretty_tokens_flat[a] if a > -1 else -1
           for a in relative_alignments])
    print([a + len(vocab2id_out) if a > -1 else a
           for a in relative_alignments])

    # Real pass: copied tokens are offset past the output vocabulary;
    # everything else maps to an output vocabulary id (or __UNK__).
    relative_alignments = list()
    for i, a in enumerate(alignments):
        if a > -1:
            index = token_ids_flat.index(a)
            relative_alignments.append(index + len(id2vocab_out))
        else:
            if hl_tokens[i] in vocab2id_out:
                relative_alignments.append(vocab2id_out[hl_tokens[i]])
            else:
                relative_alignments.append(vocab2id_out["__UNK__"])
    print(relative_alignments)

    # Encode the backbone sentence with the input vocabulary.
    backbone_data_items = list()
    backbone_data_items.append(vocab2id_in.get("<S>"))
    for token in sent2tokens[backbone]:
        backbone_data_items.append(
            vocab2id_in.get(token, vocab2id_in["__UNK__"]))
    backbone_data_str = " ".join(str(i) for i in backbone_data_items)
    print(sent2tokens[backbone])
    print([vocab2id_in.get(token, vocab2id_in["__UNK__"])
           for token in sent2tokens[backbone]])
    print(backbone_data_str)
    print()

    # Encode the support sentences, each wrapped in <B> markers.
    support_data_items = list()
    for support in supports:
        print(sent2tokens[support])
        print([vocab2id_in.get(token, vocab2id_in["__UNK__"])
               for token in sent2tokens[support]])
        print()
        support_data_items.append(vocab2id_in["<B>"])
        for token in sent2tokens[support]:
            support_data_items.append(
                vocab2id_in.get(token, vocab2id_in["__UNK__"]))
    support_data_items.append(vocab2id_in["<B>"])
    support_data_str = " ".join(str(i) for i in support_data_items)

    # Wrap the target in start/end symbols.
    relative_alignments = ([vocab2id_out["<D>"]] + relative_alignments
                           + [vocab2id_out["<E>"]])
    target_data_str = " ".join(str(i) for i in relative_alignments)

    print("THEDATA")
    print("=======")
    print(backbone_data_str)
    print(support_data_str)
    print(target_data_str)
    print()
    print([id2vocab_in[i] for i in backbone_data_items])
    print([id2vocab_in[i] for i in support_data_items])
    print([i if i < len(id2vocab_out)
           else pretty_tokens_flat[i - len(id2vocab_out)]
           for i in relative_alignments])

    return " | ".join([backbone_data_str, support_data_str, target_data_str])
def collect_split_stats(data_path):
    doc_paths = [
        os.path.join(data_path, file) for file in os.listdir(data_path)
    ]
    num_docs = len(doc_paths)

    num_highlights = list()
    num_inputs = list()
    num_input_tokens = list()
    num_highlight_tokens = list()
    doc_len_tokens = list()
    doc_len_tokens_trunc = list()
    ref_len_tokens = list()
    num_ref_trunc75_tokens = list()
    num_ref_trunc250_tokens = list()
    num_ref_truncNA_tokens = list()
    num_ref_trunc75_sents = list()
    num_ref_trunc250_sents = list()
    num_ref_truncNA_sents = list()

    for i, doc_path in enumerate(doc_paths, 1):
        sys.stdout.write("\r {:d} / {:d} ( {:7.4f}% ) ".format(
            i, num_docs, 100 * i / num_docs))
        sys.stdout.flush()

        doc = read_document(doc_path)
        num_highlights.append(len(doc["highlights"]))
        num_inputs.append(len(doc["sentences"]))

        doc_i_len_tokens = 0
        doc_i_len_tokens_trunc = 0
        for s, sent in enumerate(doc["sentences"]):
            tokens = replace_entities(sent["tokens"], doc["entities"])
            num_input_tokens.append(len(tokens))
            doc_i_len_tokens += len(tokens)
            if s < 25:
                doc_i_len_tokens_trunc += len(tokens)
        doc_len_tokens.append(doc_i_len_tokens)
        doc_len_tokens_trunc.append(doc_i_len_tokens_trunc)

        ref_i_len_tokens = 0
        hl_tokens = list()
        hl_tokens_flat = list()
        for highlight in doc["highlights"]:
            tokens = replace_entities(highlight["tokens"], doc["entities"])
            num_highlight_tokens.append(len(tokens))
            hl_tokens.append(tokens)
            hl_tokens_flat.extend(tokens)
            ref_i_len_tokens += len(tokens)
        ref_len_tokens.append(ref_i_len_tokens)

        ref_text = "\n".join([" ".join(tokens) for tokens in hl_tokens])
        ref_text_flat = " ".join(hl_tokens_flat)

        # Reference truncated to the first 75 characters.
        ref_trunc75 = ref_text[:75]
        ref_trunc75_flat = ref_text_flat[:75]
        num_ref_trunc75_tokens.append(len(ref_trunc75_flat.split()))
        num_ref_trunc75_sents.append(len(ref_trunc75.split("\n")))

        # Reference truncated to the first 250 characters.
        ref_trunc250 = ref_text[:250]
        ref_trunc250_flat = ref_text_flat[:250]
        num_ref_trunc250_tokens.append(len(ref_trunc250_flat.split()))
        num_ref_trunc250_sents.append(len(ref_trunc250.split("\n")))

        # Untruncated reference.
        ref_truncNA = ref_text
        ref_truncNA_flat = ref_text_flat
        num_ref_truncNA_tokens.append(len(ref_truncNA_flat.split()))
        num_ref_truncNA_sents.append(len(ref_truncNA.split("\n")))

    sys.stdout.write("\n")
    sys.stdout.flush()

    percentiles = [20, 30, 40, 50, 60, 70, 80, 90, 95, 99]

    def make_data_row(data):
        row_data = [np.mean(data), np.median(data), np.std(data), np.max(data)]
        row_data.extend(np.percentile(data, percentiles))
        return row_data

    df_data = list()
    df_data.append(make_data_row(num_inputs))
    df_data.append(make_data_row(doc_len_tokens))
    df_data.append(make_data_row(doc_len_tokens_trunc))
    df_data.append(make_data_row(num_input_tokens))
    df_data.append(make_data_row(num_highlights))
    df_data.append(make_data_row(ref_len_tokens))
    df_data.append(make_data_row(num_highlight_tokens))
    df_data.append(make_data_row(num_ref_trunc75_sents))
    df_data.append(make_data_row(num_ref_trunc75_tokens))
    df_data.append(make_data_row(num_ref_trunc250_sents))
    df_data.append(make_data_row(num_ref_trunc250_tokens))
    df_data.append(make_data_row(num_ref_truncNA_sents))
    df_data.append(make_data_row(num_ref_truncNA_tokens))

    columns = pd.MultiIndex.from_tuples(
        [("", "mean"), ("", "median"), ("", "std"), ("", "max")] + \
        [("Percentile", "{}th".format(p)) for p in percentiles])
    index = [
        "inp. len. (sents.)", "inp. len. (tok.)",
        "inp. len. trunc25sent (tok.)", "inp. sent. len. (toks.)",
        "hl. len. (sents.)", "hl. len. (tok.)", "hl. sent. len. (toks.)",
        "ref[:75] len. (sents.)", "ref[:75] len. (tok.)",
        "ref[:250] len. (sents.)", "ref[:250] len. (tok.)",
        "ref[:+inf] len. (sents.)", "ref[:+inf] len. (tok.)"
    ]
    df = pd.DataFrame(df_data, columns=columns, index=index)

    df_str_lines = str(df).split("\n")
    print("\n".join(df_str_lines[:2]) + "\n")
    print("\n".join(df_str_lines[2:6]) + "\n")
    print("\n".join(df_str_lines[6:9]) + "\n")
    print("\n".join(df_str_lines[9:11]) + "\n")
    print("\n".join(df_str_lines[11:13]) + "\n")
    print("\n".join(df_str_lines[13:15]) + "\n")