Example #1
def get_input_sentences(doc, sent_limit):

    sentences = list()
    for sentence in doc["sentences"][:sent_limit]:
        sent_tokens = replace_entities(sentence["tokens"], doc["entities"])
        sentences.append(" ".join(sent_tokens))
    return sentences
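Note: every example on this page calls replace_entities, which is defined elsewhere in the source module. A minimal sketch of its assumed behavior, namely swapping anonymized entity placeholder tokens (keys of doc["entities"], e.g. "@entity3") for their surface strings, might look like the following; the placeholder format and exact return type are assumptions.

def replace_entities(tokens, entities):
    # Hypothetical sketch: emit the surface string for entity placeholder
    # tokens (keys of the entities dict) and pass ordinary tokens through
    # unchanged. The real implementation may differ.
    return [entities.get(token, token) for token in tokens]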
Example #2
def build_summary(doc, lead):

    sents = list()
    for sent in doc["sentences"][:lead]:
        sent_tokens = replace_entities(sent["tokens"], doc["entities"])
        sents.append(" ".join(sent_tokens))
    return "\n".join(sents)
Example #3
def build_summary(doc):

    highlights = list()
    for highlight in doc["highlights"]:
        highlight_tokens = replace_entities(highlight["tokens"],
                                            doc["entities"])
        highlights.append(" ".join(highlight_tokens))
    return "\n".join(highlights)
Example #4
def find_highlight_alignments(highlight, doc, meta):
    
    id2token, id2sent, sent2tokens, sent2token_ids, doc_token_sets = meta

    highlight_tokens = replace_entities(highlight["tokens"], doc["entities"])
    highlight_token_set = set(highlight_tokens)
    #print len(highlight_tokens)


    #print "###\n"
    quotes, quote_support = find_quotes(highlight_tokens, sent2tokens, sent2token_ids)
    for s in quote_support:
        highlight_token_set -= doc_token_sets[s]
    support = find_sentence_support(highlight_token_set, doc_token_sets)
    #print quote_support, support
   
    support.extend(quote_support) 
    support.sort()

    #for s in support:
    #    print " ".join(sent2tokens[s])

    #print
    #print " ".join(highlight_tokens)

    src_tokens = [token for sent in support for token in sent2tokens[sent]] 
    src_ids = [id for sent in support for id in sent2token_ids[sent]]

    raw_token_alignments = find_token_alignments(
            src_tokens, src_ids, highlight_tokens, quotes)
    token_alignments = rescore_alignments(raw_token_alignments, id2sent)

    fill_stopword_alignments(src_tokens, src_ids, highlight_tokens,
            token_alignments)
    
    sent_counts = defaultdict(int)
    for a in token_alignments:
        if a >= 0: sent_counts[id2sent[a]] += 1

    sent_counts = list(sent_counts.items())  # materialize so the in-place sorts below work on Python 3

    #print token_alignments

    if len(sent_counts) > 0:

        # Sort counts first by sentence id, then by count. Since the sort is
        # stable, count ties are broken by the earliest occurring support
        # sentence.
        sent_counts.sort(key=lambda x: x[0])
        sent_counts.sort(key=lambda x: x[1], reverse=True)
        
        backbone = sent_counts[0][0]
        support = [sc[0] for sc in sent_counts[1:]]
        return backbone, support, token_alignments
            
    else:
        return None, list(), token_alignments
Example #5
def get_reference_file(doc):
    highlights = list()
    for highlight in doc["highlights"]:
        highlight_tokens = replace_entities(highlight["tokens"], doc["entities"])
        highlights.append(" ".join(highlight_tokens))
    ref_text = "\n".join(highlights)
    ref_file = NamedTemporaryFile("w", delete=False)
    ref_file.write(ref_text)
    ref_file.close()
    return ref_file
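Because the reference file is created with delete=False, the caller is responsible for removing it once scoring is finished. A plausible usage pattern is sketched below; score_summary is a hypothetical evaluation call (e.g. a ROUGE wrapper) and summary_path is assumed to exist.

import os

ref_file = get_reference_file(doc)
try:
    # Hypothetical scorer that reads the reference text from ref_file.name.
    score = score_summary(summary_path, ref_file.name)
finally:
    # The file is not auto-deleted (delete=False), so remove it explicitly.
    os.remove(ref_file.name)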
Example #6
    def display_example(example):

        example = int(example)

        doc_path = app.config["DOC_PATHS"][example]
        align_path = app.config["ALIGN_PATHS"][example]
        doc = read_document(doc_path)
        doc_tokens = [replace_entities(s["tokens"], doc["entities"])
                      for s in doc["sentences"][:25]]
        highlight_tokens = [replace_entities(s["tokens"], doc["entities"])
                      for s in doc["highlights"][:4]]

        i = 0

        doc_token_ids = list()
        for tokens in doc_tokens:
            token_ids = list()
            for token in tokens:
                token_ids.append(i)
                i += 1
            doc_token_ids.append(token_ids)


        backbone_ids = list()
        alignments = list()
        with open(align_path, "r") as f:
            # each entry in the alignment file is (backbone, support, alignment)
            data = yaml.load(f)
            for backbone, support, alignment in data:
                if backbone is not None:
                    backbone_ids.append(doc_token_ids[backbone])
                else:
                    backbone_ids.append(list())
                alignments.append(alignment)
        return render_template("default.html", doc_tokens=doc_tokens,
            highlights=highlight_tokens, alignments=alignments,
            alignments_json=json.dumps(alignments),
            backbone_ids=json.dumps(backbone_ids))
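display_example reads like a Flask view function defined inside an enclosing scope that also creates app (it uses app.config and render_template). A hedged sketch of how it might be exposed as a route from that same scope; the URL pattern and endpoint name are assumptions.

    # Hypothetical registration, placed in the same scope that defines
    # display_example; the "/example/<example>" URL rule is an assumption.
    app.add_url_rule("/example/<example>", "display_example", display_example)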
Example #7
def build_summary(doc_path, align_path, summary_path):

    backbones = []
    used = set()
    with open(align_path, "r") as f:
        alignments = yaml.load(f)
        for backbone, support, ta in alignments:
            if backbone is not None and backbone not in used:
                backbones.append(backbone)
                used.add(backbone)

    doc = read_document(doc_path)

    lines = list()
    for b in backbones:
        tokens = doc["sentences"][b]["tokens"]
        sent_str = " ".join(replace_entities(tokens, doc["entities"]))
        lines.append(sent_str)

    with open(summary_path, "w") as f:
        f.write("\n".join(lines))
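A hedged driver loop over a directory of documents might look like the following; doc_dir, align_dir, summary_dir, and the file-name suffixes are all assumptions about the on-disk layout.

import os

for name in os.listdir(doc_dir):
    # Hypothetical layout: alignments and summaries live in sibling
    # directories keyed by the same document file name.
    build_summary(os.path.join(doc_dir, name),
                  os.path.join(align_dir, name + ".yaml"),
                  os.path.join(summary_dir, name + ".txt"))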
Example #8
def init_doc_meta(doc, max_sent):

    id2token = list()
    id2sent = list()
    sent2token_ids = list()
    sent2tokens = list()
    token_sets = list()

    for s, sentence in enumerate(doc["sentences"][:max_sent]):
        
        tokens = replace_entities(sentence["tokens"], doc["entities"])
        token_ids = [id for id, token in enumerate(tokens, len(id2token))]
        sent2token_ids.append(token_ids)
        sent2tokens.append(tokens)
        id2token.extend(tokens)
        id2sent.extend([s] * len(token_ids))

        token_sets.append(
            set([token for token in tokens if token not in stopwords]))

    return id2token, id2sent, sent2tokens, sent2token_ids, token_sets
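The tuple returned by init_doc_meta is the meta argument that find_highlight_alignments in Example #4 expects. A sketch of how the two might be combined for a single document; the wrapper name align_document and the default max_sent of 25 (mirroring the truncation used elsewhere on this page) are assumptions.

def align_document(doc, max_sent=25):
    # Build the per-document lookup tables once, then align each highlight
    # against the truncated input sentences.
    meta = init_doc_meta(doc, max_sent)
    return [find_highlight_alignments(highlight, doc, meta)
            for highlight in doc["highlights"]]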
Example #9
def main():

    import argparse

    hlp = "View a random document"

    parser = argparse.ArgumentParser(description=hlp)
    parser.add_argument('--corpus', required=True, help="Corpus to use.",
        choices=["dailymail", "cnn"])
    parser.add_argument('--data-path', required=True, 
        help="Path to Cheng&Lapata data.")
    parser.add_argument('--split', required=True, help="Data split to use.",
        choices=["train", "dev", "test"])
    parser.add_argument('--replace-entities', default=False, 
        action="store_true")
    parser.add_argument('--pproc', default=False, 
        action="store_true")

    args = parser.parse_args()

    arg2split = {"test": "test", "train": "training", "dev": "validation"}
    split = arg2split[args.split]

    data_path = os.path.join(args.data_path, args.corpus, split)
    doc_paths = [os.path.join(data_path, file) 
                 for file in os.listdir(data_path)]
    doc_paths.sort()
    random.shuffle(doc_paths)

    doc = read_document(doc_paths[0])

    print("url")
    print("===")
    print(doc["url"])

    print("\nINPUT")
    print("=====")
    for s, sent in enumerate(doc["sentences"], 1):
        tokens = sent["tokens"]
        if args.pproc:
            tokens = preprocess_tokens(tokens, doc["entities"])
        if args.replace_entities:
            tokens = replace_entities(tokens, doc["entities"]) 
        sent_str = " ".join(tokens)
        line = "{}) [{}] {}".format(s, sent["score"], sent_str)
        print(textwrap.fill(line, subsequent_indent="   "))

    print("\nENTITIES")
    print("========")
    for id, entity in sorted(doc["entities"].items(), key=lambda x: x[0]):
        print("{:10} :: {}".format(id, entity))

    print("\nHIGHLIGHTS")
    print("==========")

    for s, sent in enumerate(doc["highlights"], 1):
        tokens = sent["tokens"]
        if args.pproc:
            tokens = preprocess_tokens(tokens, doc["entities"])
        if args.replace_entities:
            tokens = replace_entities(tokens, doc["entities"]) 
        sent_str = " ".join(tokens)
        line = "{}) {}".format(s, sent_str)
        print(textwrap.fill(line, subsequent_indent="   "))
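Assuming this script is saved as view_random_doc.py (the file name is an assumption), a typical invocation that matches the arguments defined above would be:

python view_random_doc.py --corpus cnn --data-path /path/to/cheng-lapata-data \
    --split dev --replace-entities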
Example #10
def process_example(doc_path, align_path):

    print(doc_path)
    doc = read_document(doc_path)

    sent2token_ids = list()
    sent2pretty_tokens = list()
    sent2tokens = list()

    id = 0
    for sent in doc["sentences"]:
        token_ids = list()
        pretty_tokens = replace_entities(sent["tokens"], doc["entities"])
        pp_tokens = preprocess_tokens(sent["tokens"], doc["entities"])
        for token in pretty_tokens:
            token_ids.append(id)
            #pretty_tokens.append(token)
            id += 1

        sent2token_ids.append(token_ids)
        sent2pretty_tokens.append(pretty_tokens)
        sent2tokens.append(pp_tokens)

    hl_tokens_pretty = replace_entities(doc["highlights"][0]["tokens"],
                                        doc["entities"])
    hl_tokens = preprocess_tokens(doc["highlights"][0]["tokens"],
                                  doc["entities"])

    with open(align_path, "r") as f:
        backbone, supports, alignments = yaml.load(f)[0]

    token_ids_flat = list(["<S>"])
    token_ids_flat.extend(sent2token_ids[backbone])
    pretty_tokens_flat = list(["<S>"])
    pretty_tokens_flat.extend(sent2pretty_tokens[backbone])
    tokens_flat = list(["<S>"])
    tokens_flat.extend(sent2tokens[backbone])

    for support in supports:
        token_ids_flat.append("<B>")
        token_ids_flat.extend(sent2token_ids[support])
        pretty_tokens_flat.append("<B>")
        pretty_tokens_flat.extend(sent2pretty_tokens[support])
        tokens_flat.append("<B>")
        tokens_flat.extend(sent2tokens[support])

    relative_alignments = list()
    for i, a in enumerate(alignments):
        if a > -1:
            index = token_ids_flat.index(a)
            relative_alignments.append(index)
        else:
            if hl_tokens[i] in vocab2id_out:
                relative_alignments.append(-1)
            else:
                relative_alignments.append(-99)

    print()
    print(len(supports))
    print(pretty_tokens_flat)
    print(hl_tokens_pretty)
    print(relative_alignments)
    print([pretty_tokens_flat[a] if a > -1 else -1
           for a in relative_alignments])

    print([a + len(vocab2id_out) if a > -1 else a
           for a in relative_alignments])

    relative_alignments = list()
    for i, a in enumerate(alignments):
        if a > -1:
            index = token_ids_flat.index(a)
            relative_alignments.append(index + len(id2vocab_out))
        else:
            if hl_tokens[i] in vocab2id_out:
                relative_alignments.append(vocab2id_out[hl_tokens[i]])
            else:
                relative_alignments.append(vocab2id_out["__UNK__"])
    print(relative_alignments)

    backbone_data_items = list()
    backbone_data_items.append(vocab2id_in.get("<S>"))
    for token in sent2tokens[backbone]:
        backbone_data_items.append(
            vocab2id_in.get(token, vocab2id_in["__UNK__"]))
    backbone_data_str = " ".join(str(i) for i in backbone_data_items)

    print(sent2tokens[backbone])
    print([
        vocab2id_in.get(token, vocab2id_in["__UNK__"])
        for token in sent2tokens[backbone]
    ])
    print(backbone_data_str)
    print()

    support_data_items = list()

    for support in supports:
        print(sent2tokens[support])
        print([
            vocab2id_in.get(token, vocab2id_in["__UNK__"])
            for token in sent2tokens[support]
        ])
        print()
        support_data_items.append(vocab2id_in["<B>"])
        for token in sent2tokens[support]:
            support_data_items.append(
                vocab2id_in.get(token, vocab2id_in["__UNK__"]))
    support_data_items.append(vocab2id_in["<B>"])

    support_data_str = " ".join(str(i) for i in support_data_items)

    relative_alignments = [vocab2id_out["<D>"]
                           ] + relative_alignments + [vocab2id_out["<E>"]]
    target_data_str = " ".join(str(i) for i in relative_alignments)

    print("THEDATA")
    print("=======")
    print(backbone_data_str)
    print(support_data_str)
    print(target_data_str)

    print()
    print([id2vocab_in[i] for i in backbone_data_items])
    print([id2vocab_in[i] for i in support_data_items])
    print([
        i if i < len(id2vocab_out)
        else pretty_tokens_flat[i - len(id2vocab_out)]
        for i in relative_alignments
    ])

    return " | ".join([backbone_data_str, support_data_str, target_data_str])
Example #11
def collect_split_stats(data_path):

    doc_paths = [
        os.path.join(data_path, file) for file in os.listdir(data_path)
    ]

    num_docs = len(doc_paths)

    num_highlights = list()
    num_inputs = list()
    num_input_tokens = list()
    num_highlight_tokens = list()

    doc_len_tokens = list()
    doc_len_tokens_trunc = list()
    ref_len_tokens = list()

    num_ref_trunc75_tokens = list()
    num_ref_trunc250_tokens = list()
    num_ref_truncNA_tokens = list()
    num_ref_trunc75_sents = list()
    num_ref_trunc250_sents = list()
    num_ref_truncNA_sents = list()

    for i, doc_path in enumerate(doc_paths, 1):
        sys.stdout.write("\r {:d} / {:d} ( {:7.4f}% ) ".format(
            i, num_docs, 100 * i / num_docs))
        sys.stdout.flush()
        doc = read_document(doc_path)
        num_highlights.append(len(doc["highlights"]))
        num_inputs.append(len(doc["sentences"]))

        doc_i_len_tokens = 0
        doc_i_len_tokens_trunc = 0

        for s, sent in enumerate(doc["sentences"]):
            tokens = replace_entities(sent["tokens"], doc["entities"])
            num_input_tokens.append(len(tokens))
            doc_i_len_tokens += len(tokens)
            if s < 25:
                doc_i_len_tokens_trunc += len(tokens)

        doc_len_tokens.append(doc_i_len_tokens)
        doc_len_tokens_trunc.append(doc_i_len_tokens_trunc)

        ref_i_len_tokens = 0
        hl_tokens = list()
        hl_tokens_flat = list()
        for highlight in doc["highlights"]:
            tokens = replace_entities(highlight["tokens"], doc["entities"])
            num_highlight_tokens.append(len(tokens))
            hl_tokens.append(tokens)
            hl_tokens_flat.extend(tokens)
            ref_i_len_tokens += len(tokens)

        ref_len_tokens.append(ref_i_len_tokens)

        ref_text = "\n".join([" ".join(tokens) for tokens in hl_tokens])
        ref_text_flat = " ".join(hl_tokens_flat)

        ref_trunc75 = ref_text[:75]
        ref_trunc75_flat = ref_text_flat[:75]
        num_ref_trunc75_tokens.append(len(ref_trunc75_flat.split()))
        num_ref_trunc75_sents.append(len(ref_trunc75.split("\n")))

        ref_trunc250 = ref_text[:250]
        ref_trunc250_flat = ref_text_flat[:250]
        num_ref_trunc250_tokens.append(len(ref_trunc250_flat.split()))
        num_ref_trunc250_sents.append(len(ref_trunc250.split("\n")))

        ref_truncNA = ref_text
        ref_truncNA_flat = ref_text_flat
        num_ref_truncNA_tokens.append(len(ref_truncNA_flat.split()))
        num_ref_truncNA_sents.append(len(ref_truncNA.split("\n")))

    sys.stdout.write("\n")
    sys.stdout.flush()

    percentiles = [20, 30, 40, 50, 60, 70, 80, 90, 95, 99]

    def make_data_row(data):

        row_data = [np.mean(data), np.median(data), np.std(data), np.max(data)]
        row_data.extend(np.percentile(data, percentiles))
        return row_data

    df_data = list()
    df_data.append(make_data_row(num_inputs))
    df_data.append(make_data_row(doc_len_tokens))
    df_data.append(make_data_row(doc_len_tokens_trunc))
    df_data.append(make_data_row(num_input_tokens))

    df_data.append(make_data_row(num_highlights))
    df_data.append(make_data_row(ref_len_tokens))
    df_data.append(make_data_row(num_highlight_tokens))

    df_data.append(make_data_row(num_ref_trunc75_sents))
    df_data.append(make_data_row(num_ref_trunc75_tokens))
    df_data.append(make_data_row(num_ref_trunc250_sents))
    df_data.append(make_data_row(num_ref_trunc250_tokens))
    df_data.append(make_data_row(num_ref_truncNA_sents))
    df_data.append(make_data_row(num_ref_truncNA_tokens))


    columns = pd.MultiIndex.from_tuples(
        [("", "mean"), ("", "median"), ("", "std"), ("", "max")] + \
        [("Percentile", "{}th".format(p)) for p in percentiles])

    index = [
        "inp. len. (sents.)", "inp. len. (tok.)",
        "inp. len. trunc25sent (tok.)", "inp. sent. len. (toks.)",
        "hl. len. (sents.)", "hl. len. (tok.)", "hl. sent. len. (toks.)",
        "ref[:75] len. (sents.)", "ref[:75] len. (tok.)",
        "ref[:250] len. (sents.)", "ref[:250] len. (tok.)",
        "ref[:+inf] len. (sents.)", "ref[:+inf] len. (tok.)"
    ]

    df = pd.DataFrame(df_data, columns=columns, index=index)
    df_str_lines = str(df).split("\n")

    print("\n".join(df_str_lines[:2]) + "\n")
    print("\n".join(df_str_lines[2:6]) + "\n")
    print("\n".join(df_str_lines[6:9]) + "\n")
    print("\n".join(df_str_lines[9:11]) + "\n")
    print("\n".join(df_str_lines[11:13]) + "\n")
    print("\n".join(df_str_lines[13:15]) + "\n")