def main():
    gold_tl_fname = sys.argv[1]
    sys_tl_fname = sys.argv[2]

    with open(gold_tl_fname, errors="ignore") as f:
        gold_tl = Timeline.from_file(f)

    with open(sys_tl_fname, errors="ignore") as f:
        sys_tl = Timeline.from_file(f)

    all_dates = sorted(set(gold_tl.get_dates()).union(set(sys_tl.get_dates())))

    for date in all_dates:
        print(date)
        if date in gold_tl.dates_to_summaries:
            gold_sum = "\n".join(gold_tl.dates_to_summaries[date])
        else:
            gold_sum = "-----"

        if date in sys_tl.dates_to_summaries:
            sys_sum = "\n".join(sys_tl.dates_to_summaries[date])
        else:
            sys_sum = "-----"
        print_vertical_split(gold_sum, sys_sum)
        print()
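
# The example above relies on a print_vertical_split helper that is not shown.
# A minimal sketch of what it is assumed to do (print the gold and system
# summaries side by side, one column each):
import itertools

def print_vertical_split(left, right, width=60):
    left_lines = left.split("\n")
    right_lines = right.split("\n")
    for l_line, r_line in itertools.zip_longest(left_lines, right_lines,
                                                fillvalue=""):
        print("{:<{w}} | {:<{w}}".format(l_line[:width], r_line[:width],
                                         w=width))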
Example #2
def generate_question_html(sys_tl_dir, tl_questions):
    for (tl_topic, tl_name), questions in tl_questions.items():
        sys_tl_path = os.path.join(sys_tl_dir, tl_topic, tl_name)

        with open(sys_tl_path, errors="ignore") as f_tl:
            tl = Timeline.from_file(f_tl)

        yield tl_topic, tl_name, generate_template(tl, questions)
Example #3
def read_gold_tl_dir(gold_dir):
    timelines = {}
    for tl_fname in os.listdir(gold_dir):
        tl_path = os.path.join(gold_dir, tl_fname)

        with open(tl_path, errors="ignore") as f:
            timelines[tl_fname] = Timeline.from_file(f)

    return timelines
Example #4
def load_all_gold_timelines():
    all_timelines = {}
    gold_tl_dir = "gold-timelines"
    for topic_gold_tl_dir in iter_dirs(gold_tl_dir):
        for gold_tl_fname in iter_files(topic_gold_tl_dir, ".txt"):
            with open(gold_tl_fname, errors="ignore") as f:
                tl = Timeline.from_file(f)
                all_timelines[os.path.basename(topic_gold_tl_dir),
                              os.path.basename(gold_tl_fname)] = tl

    return all_timelines
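
# Several examples in this collection use iter_dirs and iter_files helpers
# that are not shown. A minimal sketch of their assumed behaviour (yield full
# paths of subdirectories, and of files with a given suffix):
import os

def iter_dirs(path):
    for name in sorted(os.listdir(path)):
        full = os.path.join(path, name)
        if os.path.isdir(full):
            yield full

def iter_files(path, suffix):
    for name in sorted(os.listdir(path)):
        full = os.path.join(path, name)
        if os.path.isfile(full) and name.endswith(suffix):
            yield full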
Example #5
def compute_reliability(date_scores):
    tl_base_path = Path("./gold-timelines")
    all_annotators = sorted(date_scores)

    all_topics = set()

    for scores in date_scores.values():
        for topic, tl_name in scores:
            all_topics.add((topic, tl_name))

    for topic, tl_name in all_topics:
        with open(tl_base_path / topic / (tl_name + ".txt"),
                  errors="ignore") as f:
            tl = Timeline.from_file(f)

            score_matrix = np.zeros((len(date_scores), len(tl.get_dates())))

            all_dates = sorted(tl.get_dates())

            for annotator_idx, annotator in enumerate(all_annotators):
                annotator_tl_scores = date_scores[annotator][(topic, tl_name)]

                sorted_dates = sorted(
                    all_dates,
                    key=lambda date: annotator_tl_scores.get(date, 0),
                    reverse=True)
                new_annotator_tl_scores = {}
                curr_idx = 0
                prev_score = None
                for date in sorted_dates:
                    score = annotator_tl_scores.get(date, 0)
                    if prev_score is None or prev_score != score:
                        curr_idx += 1
                        prev_score = score
                    new_annotator_tl_scores[date] = curr_idx

                for date_idx, date in enumerate(all_dates):
                    score_matrix[annotator_idx,
                                 date_idx] = new_annotator_tl_scores.get(
                                     date, 0)

        print(topic, tl_name,
              k.alpha(score_matrix, level_of_measurement="ordinal"))

        for annotator_1_idx, annotator_2_idx in it.combinations(
                range(len(all_annotators)), 2):
            annotator_1 = ANNOTATORS[all_annotators[annotator_1_idx]]
            annotator_2 = ANNOTATORS[all_annotators[annotator_2_idx]]

            annotator_rows = score_matrix[[annotator_1_idx, annotator_2_idx]]

            print(annotator_1, annotator_2,
                  k.alpha(annotator_rows, level_of_measurement="interval"))
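
# The loop above converts raw annotator scores into dense ranks (the highest
# score gets rank 1 and tied scores share a rank) before Krippendorff's alpha
# is computed. A standalone sketch of the same idea on toy data:
def dense_ranks(scores):
    ranks, curr_rank, prev_score = {}, 0, None
    for date in sorted(scores, key=scores.get, reverse=True):
        if prev_score is None or scores[date] != prev_score:
            curr_rank += 1
            prev_score = scores[date]
        ranks[date] = curr_rank
    return ranks

# Two dates tied on the best score share rank 1, the remaining date gets rank 2:
# dense_ranks({"2011-01-01": 3, "2011-01-02": 3, "2011-01-03": 1})
# -> {"2011-01-01": 1, "2011-01-02": 1, "2011-01-03": 2}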
Example #6
def main():
    with open(sys.argv[2], "w") as f_out:
        for topic_dirname in os.listdir(sys.argv[1]):
            for tl_fname in os.listdir(os.path.join(sys.argv[1],
                                                    topic_dirname)):
                with open(os.path.join(sys.argv[1], topic_dirname, tl_fname),
                          errors="ignore") as f_tl:
                    tl = Timeline.from_file(f_tl)

                f_out.write("== {} {}\n".format(topic_dirname, tl_fname))
                for date in sorted(tl.get_dates()):
                    f_out.write(str(date))
                    f_out.write("\t\t\n")
Example #7
def main():
    all_timelines = []
    gold_tl_dir = "gold-timelines"
    for topic_gold_tl_dir in iter_dirs(gold_tl_dir):
        for gold_tl_fname in iter_files(topic_gold_tl_dir, ".txt"):
            with open(gold_tl_fname, errors="ignore") as f:
                tl = Timeline.from_file(f)

                print(topic_gold_tl_dir, gold_tl_fname)

                if (os.path.split(topic_gold_tl_dir)[-1],
                        os.path.split(gold_tl_fname)[-1]) in [
                            ("tl17-bpoil", "bbc.txt"),
                            ("crisis-syria", "bbc.txt"),
                            ("tl17-mj", "bbc.txt"),
                            ("crisis-libya", "xinhua.txt")
                        ]:
                    all_timelines.append(tl)
                    print(len(tl))

    print("Original TL-Count", len(all_timelines))
    all_timelines = list(filter(lambda i: len(i) <= 50, all_timelines))
    print("New TL-Count", len(all_timelines))

    tl_tuple_counts = [len(tl.get_dates()) * 2 for tl in all_timelines]

    print(sum(tl_tuple_counts))

    price_per_tuple = STUDENT_PRICE_PER_TUPLE
    num_annotations = 3

    tl_words = [
        tokenize(" ".join(summary_sents)) for tl in all_timelines
        for summary_sents in tl.dates_to_summaries.values()
    ]

    tl_reading_times = [len(w) / 200 for w in tl_words]

    print("Reading time", sum(tl_reading_times))
    print(
        "Reading cost",
        sum(tl_reading_times) * STUDENT_PRICE_PER_READING_MINUTE *
        num_annotations)

    tl_tuple_price = sum(tl_tuple_counts) * price_per_tuple * num_annotations

    print("Number of Tuples", sum(tl_tuple_counts))
    print("Annotation Time", sum(tl_tuple_counts) / 1.5)
    print("Annotation Cost", tl_tuple_price)
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("timeline")
    parser.add_argument("replay_file")
    parser.add_argument("--replay", action="store_true", default=False)

    args = parser.parse_args()

    with open(args.timeline, errors="ignore") as f:
        tl = Timeline.from_file(f)

    comparisons = play_comparison_game(tl)
    all_comparisons = []

    if not args.replay:
        with open(args.replay_file, "w") as f_out:
            for date_1, date_2, result in comparisons:
                f_out.write("{}\t{}\t{}\n".format(date_1, date_2, result))
                all_comparisons.append((date_1, date_2, result))
    else:
        with open(args.replay_file) as f:
            for line in f:
                date_1, date_2, result = line.split()
                date_1 = datetime.date(*map(int, date_1.split("-")))
                date_2 = datetime.date(*map(int, date_2.split("-")))
                result = int(result)
                all_comparisons.append((date_1, date_2, result))

    scores = {key: 0 for key in tl.get_dates()}

    for date_1, date_2, result in all_comparisons:
        if result == 0:
            scores[date_1] += 0.5
            scores[date_2] += 0.5
        elif result == -1:
            scores[date_1] += 1
        elif result == 1:
            scores[date_2] += 1

    for date, score in sorted(scores.items(), key=lambda x: x[1],
                              reverse=True):
        print(date, "\n", "\n".join(tl.dates_to_summaries[date]))
        print()
Example #9
def retrieve_hit_ranking(hit_task_name):
    client = MTurkClient(hit_task_name)

    hit_ids = client.list_reviewable_hit_ids_and_annotations()

    date_scores = defaultdict(Counter)

    for hit_id, info in hit_ids:
        results = client.load_answers_for_hit(hit_id)
        task_name, hit_info = info.split(":")
        tl_topic, tl_name = hit_info.split("__")

        for assignment in results:
            for worker_id, assignment_answer in assignment:
                for key, answer in assignment_answer.items():
                    _, row_idx, answer_type = key.split("-")
                    date_scores[tl_topic, tl_name][str_to_tl_date(
                        answer)] += 1 if answer_type == "best" else -1

                    if "K" not in hit_task_name:
                        print(answer, answer_type)

    return date_scores
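    # NOTE: the early return above makes everything below unreachable; the
    # remainder appears to be an older variant that also pairs each score
    # table with its gold timeline.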

    tl_base_path = Path("./gold-timelines")

    timelines_and_scores = {}

    for (topic, tl_name), results in date_scores.items():
        with open(tl_base_path / topic / (tl_name + ".txt"),
                  errors="ignore") as f:
            tl = Timeline.from_file(f)

        timelines_and_scores[topic, tl_name] = (results, tl)

        #summaries = sorted(tl.dates_to_summaries.items(), key=lambda it: results[it[0]], reverse=True)

        #if (info_path / hit_id).is_file():
        #    pass

    return timelines_and_scores
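
# str_to_tl_date, used above and in later examples, is not shown. Judging from
# how date strings are parsed elsewhere in this collection
# (datetime.date(*map(int, s.split("-")))), a minimal sketch would be:
import datetime

def str_to_tl_date(s):
    year, month, day = map(int, s.strip().split("-"))
    return datetime.date(year, month, day)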
Example #10
def compute_stats_for_tl(tlfname):
    num_splitted_sents = 0
    root_counts = []

    with open(tlfname, errors="ignore") as f:
        timeline = Timeline.from_file(f)

    num_sents = 0
    for date in timeline:
        for sent in timeline[date]:
            num_sents += 1
            sents = list(nlp(sent).sents)
            if len(sents) > 1:
                num_splitted_sents += 1

            for sent in sents:
                num_roots = 0
                for tok in sent:
                    if tok.head == tok:
                        num_roots += 1
                root_counts.append(num_roots)

    return num_splitted_sents, root_counts, num_sents
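
# compute_stats_for_tl uses a module-level nlp object that is not shown. It is
# presumably a spaCy pipeline, since Doc.sents and Token.head are spaCy APIs;
# a minimal setup sketch (the model name is an assumption):
import spacy

nlp = spacy.load("en_core_web_sm")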
Example #11
def cross_eval_main():
    parser = argparse.ArgumentParser()

    parser.add_argument("corpus_def")
    parser.add_argument("config")
    parser.add_argument("param_file")

    args = parser.parse_args()

    corpora_and_timelines = []

    with open(args.corpus_def) as f:
        corpus_defs = json.load(f)

    for corpus_def in corpus_defs["corpora"]:
        timeline_dir = corpus_def["tl_dir"]
        corpus_pickle = corpus_def["corpus_pkl"]

        corpus = load_corpus(corpus_pickle)

        timelines = []
        for tl_fname in iter_files(timeline_dir, ".txt"):
            with open(tl_fname, encoding="latin-1") as f:
                timeline = Timeline.from_file(f)
                timelines.append((os.path.basename(tl_fname), timeline))

        corpora_and_timelines.append((corpus, timelines))

    with open(args.config) as f:
        config = json.load(f)

    tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)

    parameters = tl_gen.run_scoring_cv_train_mode(corpora_and_timelines)

    with open(args.param_file, "wb") as f_out:
        pickle.dump(parameters, f_out)
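
# The corpus_def file read by cross_eval_main is assumed to be JSON of roughly
# the following shape; the "corpora", "tl_dir" and "corpus_pkl" keys are taken
# from the code above, the paths are hypothetical:
# {
#     "corpora": [
#         {"tl_dir": "gold-timelines/tl17-bpoil",
#          "corpus_pkl": "corpora/tl17-bpoil.pkl"},
#         {"tl_dir": "gold-timelines/crisis-syria",
#          "corpus_pkl": "corpora/crisis-syria.pkl"}
#     ]
# }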
Example #12
def evaluate_tl_main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-f",
                        dest="filter_corpus",
                        default=False,
                        action="store_true")
    parser.add_argument("-c", dest="constraint", default="sent")
    parser.add_argument("-t", dest="timelines", nargs="+")
    parser.add_argument("-m",
                        dest="num_multi_selection_runs",
                        type=int,
                        default=None)
    parser.add_argument("--queryfile")
    parser.add_argument("corpus_pickle")
    parser.add_argument("config")

    args = parser.parse_args()

    if args.constraint == "sent":
        use_token_count = False
    elif args.constraint == "tok":
        use_token_count = True
    else:
        raise ValueError("Unknown constraint {}".format(args.constraint))

    corpus = load_corpus(args.corpus_pickle,
                         filter_blacklist=args.filter_corpus)

    timelines = []

    for tl_fname in args.timelines:
        with open(tl_fname, errors="ignore") as f:
            timeline = Timeline.from_file(f)
            timelines.append((os.path.basename(tl_fname), timeline))

    #tl_gen = APClusteringTimelineGenerator(True)

    with open(args.config) as f:
        config = json.load(f)

    tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)

    corpus_basename = os.path.basename(corpus.name).split(".")[0]
    print(corpus_basename)
    config_basename = os.path.basename(args.config)

    results_basename = config_basename
    if args.queryfile:
        results_basename += "+queryfilter"

    out_timelines_dir = os.path.join("system_timelines",
                                     results_basename + "+" + args.constraint,
                                     corpus_basename)
    results_dir = os.path.join("evaluation_results",
                               results_basename + "+" + args.constraint)

    if not os.path.isdir(out_timelines_dir):
        os.makedirs(out_timelines_dir)
    if not os.path.isdir(results_dir):
        os.makedirs(results_dir)

    query_words = None
    if args.queryfile is not None:
        with open(args.queryfile) as f:
            query_words = [l.strip() for l in f]

    debug_identifier = results_basename + "+" + corpus_basename

    if use_token_count:
        config["scoring"]["use_length"] = True

    if args.num_multi_selection_runs is None:
        sys_timelines = tl_gen.generate_timelines(
            corpus, [
                determine_tl_parameters(tl, use_token_count=use_token_count)
                for _, tl in timelines
            ],
            reference_timelines=list(map(lambda x: x[1], timelines)),
            query_words=query_words,
            debug_identifier=debug_identifier)

        write_results_file(os.path.join(results_dir, corpus_basename + ".txt"),
                           out_timelines_dir, timelines, sys_timelines)

    else:
        with open("multirun-results+{}.txt".format(config_basename),
                  "a") as f_out:
            print(timelines)

            evaluator = rouge.TimelineRougeEvaluator(
                measures=["rouge_1", "rouge_2"])
            all_run_timelines = tl_gen.generate_timelines(
                corpus, [
                    determine_tl_parameters(tl,
                                            use_token_count=use_token_count)
                    for _, tl in timelines
                ],
                reference_timelines=list(map(lambda x: x[1], timelines)),
                query_words=query_words,
                debug_identifier=debug_identifier,
                num_selection_runs=args.num_multi_selection_runs)
            for sys_timelines in all_run_timelines:
                for (timeline_name, gold_timeline), sys_timeline in zip(
                        timelines, sys_timelines):
                    reference_timeline = GroundTruth([gold_timeline])
                    eval_results = evaluator.evaluate_concat(
                        "TL", sys_timeline, reference_timeline)
                    eval_results_agree = evaluator.evaluate_agreement(
                        "TL", sys_timeline, reference_timeline)
                    eval_results_align = evaluator.evaluate_align_date_content_costs_many_to_one(
                        "TL", sys_timeline, reference_timeline)

                    f_out.write(" ".join(
                        map(str, [
                            eval_results["rouge_1"]["f_score"],
                            eval_results["rouge_2"]["f_score"],
                            eval_results_agree["rouge_1"]["f_score"],
                            eval_results_agree["rouge_2"]["f_score"],
                            eval_results_align["rouge_1"]["f_score"],
                            eval_results_align["rouge_2"]["f_score"]
                        ])))
                    f_out.write("\n")
                f_out.write("--------\n")

            f_out.write("========\n")
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("timeline")
    parser.add_argument("event_info")

    parser.add_argument("--tuple-file")

    args = parser.parse_args()

    tl_path = PurePath(args.timeline)
    tl_topic = tl_path.parts[-2]
    tl_name = tl_path.parts[-1].split(".")[0]

    with open(args.event_info) as f:
        all_event_info = json.load(f)

    event_info = all_event_info[tl_topic]

    with open(args.timeline, errors="ignore") as f:
        tl = Timeline.from_file(f)

    if args.tuple_file is None:
        result_tuples = generate_date_tuples(tl)
    else:
        with open(args.tuple_file) as f:
            result_tuples = f.read().strip().split("\n")[1:]

    tuples = [
        tuple(
            sorted((str_to_tl_date(date),
                    tl.dates_to_summaries[str_to_tl_date(date)])
                   for date in tuple_line.split("\t")))
        for tuple_line in result_tuples
    ]

    batches = []

    curr_idx = 0

    while curr_idx < len(tuples):
        batches.append(tuples[curr_idx:curr_idx + 20])
        curr_idx += 20

    print("Uploading", len(tuples), "tuples")

    client = MTurkClient("Timeline Importance Annotation VIII (K)")

    hit_info_path = Path("./hitinfo")
    hit_info_path.mkdir(parents=True, exist_ok=True)

    for batch in batches:
        hit_ids = client.upload_html([
            ("importance:" + tl_topic + "__" + tl_name,
             generate_template(tl, batch, event_info))
        ])

        hit_id = hit_ids[0]

        hit_info_file_path = hit_info_path / hit_id
        with open(hit_info_file_path, "w") as f:
            f.write(tl_topic)
            f.write("\t")
            f.write(tl_name)
            f.write("\n")
            for candidate_tuple in batch:
                f.write("\t".join(map(lambda it: str(it[0]), candidate_tuple)))
                f.write("\n")
Example #14
def evaluate(tls_model,
             dataset,
             result_path,
             trunc_timelines=False,
             time_span_extension=0,
             word_mover_stop_words='nltk'):
    results = []
    metric = 'align_date_content_costs_many_to_one'
    evaluator = tilse_rouge.TimelineRougeEvaluator(
        measures=["rouge_1", "rouge_2"])
    n_topics = len(dataset.collections)

    for i, collection in enumerate(dataset.collections):

        ref_timelines = [
            TilseTimeline(tl.date_to_summaries) for tl in collection.timelines
        ]
        topic = collection.name
        n_ref = len(ref_timelines)

        if trunc_timelines:
            ref_timelines = data.truncate_timelines(ref_timelines, collection)

        for j, ref_timeline in enumerate(ref_timelines):
            print(
                f'topic {i + 1}/{n_topics}: {topic}, ref timeline {j + 1}/{n_ref}'
            )

            tls_model.load(ignored_topics=[collection.name])

            ref_dates = sorted(ref_timeline.dates_to_summaries)

            start, end = data.get_input_time_span(ref_dates,
                                                  time_span_extension)

            collection.start = start
            collection.end = end

            # utils.plot_date_stats(collection, ref_dates)

            l = len(ref_dates)
            k = data.get_average_summary_length(ref_timeline)

            pred_timeline_ = tls_model.predict(
                collection,
                max_dates=l,
                max_summary_sents=k,
                ref_tl=ref_timeline  # only oracles need this
            )

            print('*** PREDICTED ***')
            utils.print_tl(pred_timeline_)

            print('timeline done')
            pred_timeline = TilseTimeline(pred_timeline_.date_to_summaries)
            sys_len = len(pred_timeline.get_dates())
            ground_truth = TilseGroundTruth([ref_timeline])

            rouge_scores = get_scores(metric, pred_timeline, ground_truth,
                                      evaluator)
            date_scores = evaluate_dates(pred_timeline, ground_truth)
            wm_scores = get_wordmover_score(pred_timeline,
                                            ground_truth,
                                            word_mover_stop_words,
                                            device='cpu')
            dd_scores = date_dist_scores(pred_timeline, ground_truth)

            print('sys-len:', sys_len, 'gold-len:', l, 'gold-k:', k)

            print('Alignment-based ROUGE:')
            pprint(rouge_scores)
            print('Date selection:')
            pprint(date_scores)
            pprint(dd_scores)
            print('WordMover scores:')
            pprint(wm_scores)
            print('-' * 100)
            results.append((rouge_scores, date_scores, wm_scores, dd_scores,
                            pred_timeline_.to_dict()))

            print("Running average:")
            print(get_average_results(results))
            print()

    avg_results = get_average_results(results)
    print('Average results:')
    pprint(avg_results)
    output = {
        'average': avg_results,
        'results': results,
    }
    utils.write_json(output, result_path)
Example #15
from tilse.data.timelines import Timeline

import sys

if __name__ == "__main__":
    timelines = []
    for tl_fname in sys.argv[1:]:
        with open(tl_fname, encoding="latin-1") as f:
            #print(tl_fname)
            timelines.append(Timeline.from_file(f))

    timeline_dates = [sorted(tl.get_dates()) for tl in timelines]

    rows = []
    for date_idx in range(5):
        date_row = []
        sum_row = []
        for tl_idx in range(len(timelines)):
            tl_date = timeline_dates[tl_idx][date_idx]
            tl_sum = timelines[tl_idx][tl_date]
            date_row.append("\\textbf{" + str(tl_date) + "}")
            sum_row.append(str(" \\newline ".join(tl_sum)))

        rows.append(date_row)
        rows.append(sum_row)

    for row in rows:
        print(" & ".join(row), "\\\\\\hline")
Example #16
    args = parser.parse_args()
    relevant_systems = set(args.relevant_systems)

    all_relevant_timelines = defaultdict(lambda: defaultdict(dict))

    for directory in iter_dirs(args.system_tl_dir):
        system_name = os.path.basename(directory)
        for tl_dir in iter_dirs(directory):
            for tlfilename in iter_files(tl_dir, ".txt"):
                #print(system_name, relevant_systems)
                if system_name in relevant_systems:
                    with open(tlfilename) as tlfile:
                        all_relevant_timelines[system_name][os.path.basename(
                            tl_dir)][os.path.basename(
                                tlfilename)] = Timeline.from_file(tlfile)

    #for directory in iter_dirs(args.human_tl_dir):
    #    source_name = os.path.basename(directory)
    #    for tlfilename in iter_files(directory, ".txt"):
    #        with open(tlfilename, errors='ignore') as tlfile:
    #            all_relevant_timelines["human"][source_name][os.path.basename(tlfilename)] = Timeline.from_file(tlfile)

    vectorized_timelines = vectorize_timelines(all_relevant_timelines)

    num_samples_per_tl = 5

    all_samples = []
    for system, timelines in vectorized_timelines.items():
        system_samples = set()
        for topic_name, tl_name, timeline in timelines:
Example #17
def main():
    with open(sys.argv[1]) as f, open("readability-samples-fixed-withcopy.csv",
                                      "w") as f_out:
        reader = csv.reader(f)
        writer = csv.writer(f_out)

        sentences_per_system = defaultdict(list)

        corpus_basepath = pathlib.Path("./corpora")

        header = next(reader)
        writer.writerow(header + ["Copied?"])
        lines = sorted(enumerate(reader), key=lambda x: x[1][1], reverse=True)
        prev_corpus = None

        new_lines = []

        for l_idx, line in lines:
            did_copy = False
            if line[0] != "gold":
                corpus_path = corpus_basepath / (line[1] + ".pkl")
                print(corpus_path)

                corpus = CachedCorpusReader.load_corpus(str(corpus_path))
                if prev_corpus != corpus:
                    corpus_sentences = set(
                        map(
                            lambda s: tuple(
                                s.as_token_attr_sequence("form_lowercase")),
                            corpus.sentences))
                    # Remember the corpus so the expensive sentence set is not
                    # rebuilt for consecutive rows from the same corpus.
                    prev_corpus = corpus
                    #corpus_sentences = set(map(lambda s: "".join(s.as_token_attr_sequence("form_lowercase")).translate(str.maketrans('', '', string.punctuation)).lower(), corpus.sentences))

                #sents = line[-1].split(". ")
                #did_copy = False
                #for sent in sents:
                #    if len("".join(sent.split()).translate(str.maketrans('', '', string.punctuation)).lower()) == 0:
                #        continue
                #    if "".join(sent.split()).translate(str.maketrans('', '', string.punctuation)).lower() in corpus_sentences:
                #        did_copy = True

                tl_name = line[2]
                if "nn" not in line[0] and line[1] == "tl17-mj" and line[
                        2] == "bbc.txt":
                    tl_name = "bbc.co.uk.txt"

                tl_path = os.path.join("filtered", "system_timelines", line[0],
                                       line[1], tl_name)
                print(tl_path)
                with open(tl_path, errors="ignore") as f:
                    tl = Timeline.from_file(f)
                    summary = tl.dates_to_summaries[datetime.date(
                        *map(int, line[3].split("-")))]

                    for summary_line in summary:
                        summary_line = summary_line.strip()
                        if tuple(summary_line.lower().split()
                                 ) in corpus_sentences:
                            print("COPY!")
                            did_copy = True

            new_line = list(line)
            new_line.append("y" if did_copy else "n")

            new_lines.append((l_idx, new_line))

        for idx, new_line in sorted(new_lines):
            writer.writerow(new_line)
Example #18
def main():
    base_config_path = sys.argv[1]
    corpus_fname = sys.argv[2]
    gold_tl_fname = sys.argv[3]

    with open(gold_tl_fname, errors="ignore") as f:
        gold_tl = Timeline.from_file(f)

    with open(base_config_path) as f:
        config = json.load(f)

    corpus = reader.load_corpus(corpus_fname)

    generator = GloballyClusteredSentenceCompressionTimelineGenerator(config)

    corpus_promise, cluster_promise, dated_cluster_promise, cluster_candidates = generator.get_promises(corpus)

    dated_clusters = dated_cluster_promise.get()

    all_svo_tuples_per_date = defaultdict(Counter)

    for cluster, date in dated_clusters:
        for sentence in cluster:
            for pred, subj, obj in sentence.dependency_tree.extract_svo_tuples():
                if pred is not None:
                    pred = pred.lemma.lower()
                if obj is not None:
                    obj = obj.lemma.lower()
                if subj is not None:
                    subj = subj.lemma.lower()

                all_svo_tuples_per_date[date][(pred, subj, obj)] += 1

    triples_by_frequency = []

    for date, counter in sorted(all_svo_tuples_per_date.items()):
        for triple, count in counter.items():
            triples_by_frequency.append((count, date, triple))

    params = determine_tl_parameters(gold_tl)

    selected_triples = dict()

    for (cnt, date, triple) in sorted(triples_by_frequency, reverse=True, key=lambda x: x[0]):
        if date < params.first_date or date > params.last_date:
            continue

        if date not in selected_triples:
            if len(selected_triples) < params.max_date_count:
                selected_triples[date] = []
            else:
                continue

        if len(selected_triples[date]) > params.max_date_sent_count:
            continue

        if triple[0] in ("happen", "say"):
            continue

        selected_triples[date].append(str(triple))

    print(Timeline(selected_triples))
Example #19
def read_tl(fname):
    with open(fname, errors="ignore") as f:
        return Timeline.from_file(f)
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("sys_1_results_dir")
    parser.add_argument("sys_2_results_dir")

    args = parser.parse_args()

    results_1 = read_results_dir(args.sys_1_results_dir)
    results_2 = read_results_dir(args.sys_2_results_dir)
    score_diffs = []
    available_sents = []
    compression_rates = []
    spreads = []

    for corpus_name in results_1:
        if corpus_name not in results_2:
            continue

        corpus = load_corpus("corpora/" + corpus_name.rsplit(".")[0] + ".pkl")

        for tl_name, result_1 in results_1[corpus_name].items():
            result_2 = results_2[corpus_name][tl_name]
            with open("gold-timelines/" + corpus_name.split(".")[0] + "/" +
                      tl_name,
                      errors="ignore") as f:
                print("gold-timelines/" + corpus_name.split(".")[0] + "/" +
                      tl_name)
                gold_tl = Timeline.from_file(f)

            total_tl_length = sum(map(len,
                                      gold_tl.dates_to_summaries.values()))
            total_corpus_length = len(corpus.sentences)

            score_diffs.append(result_1.rouge_2_align.f1 -
                               result_2.rouge_2_align.f1)
            available_sents.append(
                len(
                    corpus.docs_between_dates(min(gold_tl.get_dates()),
                                              max(gold_tl.get_dates()))))
            compression_rates.append(1.0 -
                                     (total_tl_length / total_corpus_length))

            spreads.append(compute_spread(gold_tl))

    print("Sents", scipy.stats.spearmanr(available_sents, score_diffs))
    print("Compression", scipy.stats.spearmanr(compression_rates, score_diffs))
    print("Spread", scipy.stats.spearmanr(spreads, score_diffs))

    plt.axhline(color="b")
    plt.scatter(
        available_sents,
        score_diffs,
        c=["r" if score_diff <= 0.0 else "b" for score_diff in score_diffs])
    plt.figure()
    plt.axhline(color="b")
    plt.scatter(
        compression_rates,
        score_diffs,
        c=["r" if score_diff <= 0.0 else "b" for score_diff in score_diffs])
    plt.figure()
    plt.axhline(color="b")
    plt.scatter(
        spreads,
        score_diffs,
        c=["r" if score_diff <= 0.0 else "b" for score_diff in score_diffs])

    plt.show()
Example #21
def main():
    event_annotations = defaultdict(dict)
    current_tl_key = None

    system_name = None

    if len(sys.argv) == 3:
        system_name = sys.argv[2]

    with open(sys.argv[1]) as f:
        for lidx, line in enumerate(f):
            if line.startswith("=="):
                _, corpus, tl_name = line.split()
                current_tl_key = (corpus, tl_name)
                continue

            elems = line.split()
            if len(elems) == 1:
                continue

            date, response = elems

            year, month, day = map(int, date.split("-"))

            event_annotations[current_tl_key][datetime.date(year, month,
                                                            day)] = response

    event_present_in_first_3_ratios = {}
    event_present_in_last_3_ratios = {}
    event_present_ratios = {}
    for tl_key, annotations in event_annotations.items():
        dates_to_consider = None
        if system_name:
            with open(os.path.join("gold-timelines", tl_key[0], tl_key[1]),
                      errors="ignore") as f:
                gold_tl = Timeline.from_file(f)

            with open(os.path.join("system_timelines", system_name, tl_key[0],
                                   tl_key[1]),
                      errors="ignore") as f:
                sys_tl = Timeline.from_file(f)

            dates_to_consider = set(gold_tl.get_dates()).intersection(
                set(sys_tl.get_dates()))

        num_correct_first_3 = 0
        num_correct_last_3 = 0
        num_correct = 0
        num_total = 0
        annotations = annotations.items()

        for idx, (date, annotation) in enumerate(sorted(annotations)):
            if dates_to_consider and date not in dates_to_consider:
                continue
            if annotation != "-":
                #print("==>", annotation)
                num_correct += 1

                if idx < 3:
                    num_correct_first_3 += 1
                elif idx >= (len(annotations) - 3):
                    num_correct_last_3 += 1
            num_total += 1

        if num_total == 0:
            event_present_ratios[tl_key] = 0
        else:
            event_present_ratios[tl_key] = num_correct / num_total
        event_present_in_first_3_ratios[tl_key] = num_correct_first_3 / 3
        event_present_in_last_3_ratios[tl_key] = num_correct_last_3 / 3

    #print(event_present_ratios)

    print(
        "Last",
        sum(event_present_in_last_3_ratios.values()) /
        len(event_present_in_last_3_ratios))
    print(
        "First",
        sum(event_present_in_first_3_ratios.values()) /
        len(event_present_in_first_3_ratios))
    print("Total",
          sum(event_present_ratios.values()) / len(event_present_ratios))
Example #22
def main_twotasks():
    annotations_j = retrieve_hit_ranking(
        "Timeline Importance Annotation V (J)")
    annotations_k = retrieve_hit_ranking(
        "Timeline Importance Annotation VI (K)")

    assert list(annotations_j) == list(annotations_k)

    tl_base_path = Path("./gold-timelines")

    for topic, tl_name in annotations_j:
        with open(tl_base_path / topic / (tl_name + ".txt"),
                  errors="ignore") as f:
            tl = Timeline.from_file(f)

        all_dates = tl.get_dates()

        scores_j = annotations_j[topic, tl_name]
        scores_k = annotations_k[topic, tl_name]

        for date in all_dates:
            if date not in scores_j:
                scores_j[date] = 0
            if date not in scores_k:
                scores_k[date] = 0

        obs_j = np.zeros(len(tl.get_dates()))
        obs_k = np.zeros(len(tl.get_dates()))

        top_5_dates_j = [
            i[0] for i in sorted(scores_j.items(), key=lambda i: i[1])[:10]
        ]
        top_5_dates_k = [
            i[0] for i in sorted(scores_k.items(), key=lambda i: i[1])[:10]
        ]

        print(top_5_dates_j)
        print(top_5_dates_k)
        print(
            len(set(top_5_dates_j).intersection(set(top_5_dates_k))) /
            len(top_5_dates_j))

        ranking_j = map(
            lambda x: x[0],
            sorted(scores_j.items(), key=lambda x: x[1], reverse=True))
        ranking_k = map(
            lambda x: x[0],
            sorted(scores_k.items(), key=lambda x: x[1], reverse=True))

        ranks_j = dict(map(lambda x: tuple(reversed(x)), enumerate(ranking_j)))
        ranks_k = dict(map(lambda x: tuple(reversed(x)), enumerate(ranking_k)))

        print(ranks_j)

        for idx, date in enumerate(all_dates):
            score_j = scores_j[date]
            score_k = scores_k[date]

            obs_j[idx] = score_j
            obs_k[idx] = score_k

            print(ranks_j[date] + 1, ranks_k[date] + 1)
            print(score_j, score_k)
            print(date)
            print(tl.dates_to_summaries[date])

        print(
            k.alpha(np.stack([obs_j, obs_k]), level_of_measurement="interval"))

        print(scipy.stats.kendalltau(obs_j, obs_k))
Example #23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-e",
                        dest="evaluation_results_dir",
                        default="evaluation_results")
    parser.add_argument("-s",
                        dest="system_timelines_dir",
                        default="system_timelines")
    parser.add_argument("-g",
                        dest="gold_timelines_dir",
                        default="gold-timelines")

    parser.add_argument("-c", dest="cutoff_constraint", default="none")

    parser.add_argument("system_name")
    parser.add_argument("system_tl_files", nargs="+")

    args = parser.parse_args()

    per_corpus_timelines = defaultdict(dict)

    for tl_fname in args.system_tl_files:
        corpus_name, tl_name = parse_tl_name(tl_fname)

        with open(tl_fname) as f:
            timeline = Timeline.from_file(f)
            per_corpus_timelines[corpus_name][tl_name] = timeline

    system_name = args.system_name + "+" + args.cutoff_constraint

    system_sys_tl_dir = os.path.join(args.system_timelines_dir, system_name)

    system_evaluation_dir = os.path.join(args.evaluation_results_dir,
                                         system_name)

    ensure_is_dir(system_sys_tl_dir)
    ensure_is_dir(system_evaluation_dir)

    for corpus_name, tls in per_corpus_timelines.items():
        corpus_sys_tl_dir = os.path.join(system_sys_tl_dir, corpus_name)

        ensure_is_dir(corpus_sys_tl_dir)

        gold_tl_dir = os.path.join(args.gold_timelines_dir,
                                   corpus_name[:-len(".pkl")])

        gold_tls = read_gold_tl_dir(gold_tl_dir)

        sys_tls = [it[1] for it in sorted(tls.items())]
        gold_tls = sorted(gold_tls.items())

        if args.cutoff_constraint != "none":
            sys_tls = [
                timeline_by_applying_constraints(
                    sys_tl,
                    determine_tl_parameters(gold_tl),
                    constraint_type=args.cutoff_constraint)
                for sys_tl, (_, gold_tl) in zip(sys_tls, gold_tls)
            ]

        write_results_file(
            os.path.join(system_evaluation_dir,
                         corpus_name[:-len(".pkl")] + ".txt"),
            corpus_sys_tl_dir, gold_tls, sys_tls)
Example #24
        # (fragment: the snippet starts mid-function; these writes presumably
        # form the tail of the write_entity_detail_question helper that is
        # called in the __main__ block below)
        db_file.write(
            "\t".join([
                os.path.basename(topic_gold_tl_dir),
                os.path.basename(gold_tl_fname),
                "detail",
                str(date)
            ])
        )
        db_file.write("\n")
        annotator_file.write("# Ask about a detail of what happened on that day.\n\n")


if __name__ == "__main__":
    gold_tl_dir = sys.argv[1]
    out_file_prefix = sys.argv[2]

    db_file = open(out_file_prefix + ".db", "w")
    annotator_file = open(out_file_prefix + ".txt", "w")

    for topic_gold_tl_dir in iter_dirs(gold_tl_dir):
        for gold_tl_fname in iter_files(topic_gold_tl_dir, ".txt"):
            with open(gold_tl_fname, errors="ignore") as f:
                tl = Timeline.from_file(f)

                #write_entity_questions(topic_gold_tl_dir, gold_tl_fname, tl, db_file, annotator_file)
                #write_date_questions(topic_gold_tl_dir, gold_tl_fname, tl, db_file, annotator_file)
                write_entity_detail_question(topic_gold_tl_dir, gold_tl_fname, tl, db_file, annotator_file)

    db_file.close()
    annotator_file.close()
Example #25
def evaluate(tls_model,
             dataset,
             result_path,
             trunc_timelines=False,
             time_span_extension=0):

    results = []
    metric = 'align_date_content_costs_many_to_one'
    evaluator = rouge.TimelineRougeEvaluator(measures=["rouge_1", "rouge_2"])
    n_topics = len(dataset.collections)
    ave_cluster = 0

    for i, collection in enumerate(dataset.collections):

        ref_timelines = [
            TilseTimeline(tl.date_to_summaries) for tl in collection.timelines
        ]
        topic = collection.name
        n_ref = len(ref_timelines)

        # only for entity
        if trunc_timelines:
            ref_timelines = data.truncate_timelines(ref_timelines, collection)

        for j, ref_timeline in enumerate(ref_timelines):

            print(
                f'topic {i+1}/{n_topics}: {topic}, ref timeline {j+1}/{n_ref}')

            tls_model.load(ignored_topics=[collection.name])

            ref_dates = sorted(ref_timeline.dates_to_summaries)
            #print("data to summaries = {}".format(ref_dates))

            start, end = data.get_input_time_span(ref_dates,
                                                  time_span_extension)

            collection.start = start
            collection.end = end
            print("name = {} start = {} end = {}".format(topic, start, end))

            #utils.plot_date_stats(collection, ref_dates)

            l = len(ref_dates)
            k = data.get_average_summary_length(ref_timeline)

            pred_timeline_, n_clusters = tls_model.predict(
                collection,
                max_dates=l,
                max_summary_sents=k,
                ref_tl=ref_timeline  # only oracles need this
            )
            ave_cluster = ave_cluster + n_clusters

            # print('*** PREDICTED ***')
            # utils.print_tl(pred_timeline_)

            print('timeline done')
            pred_timeline = TilseTimeline(pred_timeline_.date_to_summaries)
            sys_len = len(pred_timeline.get_dates())
            ground_truth = TilseGroundTruth([ref_timeline])

            rouge_scores = get_scores(metric, pred_timeline, ground_truth,
                                      evaluator)
            date_scores = evaluate_dates(pred_timeline, ground_truth)

            print('sys-len:', sys_len, 'gold-len:', l, 'gold-k:', k)

            print('Alignment-based ROUGE:')
            pprint(rouge_scores)
            print('Date selection:')
            pprint(date_scores)
            print('-' * 100)
            results.append(
                (rouge_scores, date_scores, pred_timeline_.to_dict()))

    avg_results = get_average_results(results)
    print('Average results:')
    pprint(avg_results)
    output = {
        'average_clusters': ave_cluster / len(dataset.collections),
        'average': avg_results,
        'results': results,
    }
    utils.write_json(output, result_path)