Example #1
0
def calculate_coherence(args, save=True):
    """Compute coherence scores for every topic set found under ``args.input_dir``.

    Parameters
    ----------
    args : argparse.Namespace
        Expects ``input_dir``, ``reference_corpus``, ``coherence_measure``,
        ``window_size``, ``top_n``, ``start_at``, ``eval_every_n`` and
        ``eval_last_only`` attributes.
    save : bool
        When True, merge the results into ``<parent>/coherences.json`` on disk
        before returning.

    Returns
    -------
    dict
        ``{measure_name: {idx: {"aggregate": float, "by_topic": [float, ...],
        "path": str}}}``.
    """
    topic_dir = Path(args.input_dir)
    parent_dir = topic_dir.parent
    config = load_yaml(parent_dir / "config.yml")
    try:
        data_dir = config["input_dir"]  # dvae, mallet
    except KeyError:
        data_dir = str(Path(config["data_path"]).parent)  # etm

    #### quick HACK to handle scratch directories, needs cleanup ###
    processed_name = Path(data_dir).name
    data_dir_map = {
        f"/workspace/topic-preprocessing/data/nytimes/processed/{processed_name}":
        f"/scratch/{processed_name}/nytimes",
        f"/workspace/topic-preprocessing/data/wikitext/processed/{processed_name}":
        f"/scratch/{processed_name}/wikitext",
        f"/workspace/topic-preprocessing/data/bbc/processed/{processed_name}":
        f"/scratch/{processed_name}/bbc",
    }

    # for out-of-sample coherence
    ref_corpus = args.reference_corpus
    if ref_corpus in ("wikitext_full", "nytimes_full"):
        try:
            data_dict = Dictionary.load(str(Path(data_dir, "train-dict.npy")))
        except FileNotFoundError:
            data_dict = make_dictionary(data_dir)

        if ref_corpus == "wikitext_full":
            mapped_dir = f"/workspace/topic-preprocessing/data/wikitext/processed/{processed_name}"
        elif ref_corpus == "nytimes_full":
            mapped_dir = f"/workspace/topic-preprocessing/data/nytimes/processed/{processed_name}"
        ref_corpus_fname = "full.txt"
    # standard coherence
    else:
        ref_corpus_fname = f"{ref_corpus}.txt"  # can later update to external if needed
        mapped_dir = Path(data_dir_map[data_dir])

        if Path(mapped_dir, "train-dict.npy").exists() and Path(
                mapped_dir, ref_corpus_fname).exists():
            print("reading files from scratch", flush=True)
            data_dict = Dictionary.load(str(Path(mapped_dir,
                                                 "train-dict.npy")))
        else:
            print("loading files", flush=True)
            try:
                data_dict = Dictionary.load(
                    str(Path(data_dir, "train-dict.npy")))
            except FileNotFoundError:
                data_dict = make_dictionary(data_dir)

            # copy to scratch directory
            print("copying files to scratch", flush=True)
            mapped_dir.mkdir(exist_ok=True, parents=True)
            shutil.copy(Path(data_dir, ref_corpus_fname),
                        Path(mapped_dir, ref_corpus_fname))
            shutil.copy(Path(data_dir, "train-dict.npy"),
                        Path(mapped_dir, "train-dict.npy"))

    ### end hack ###

    topic_sets = collect_topics(
        topic_dir=topic_dir,
        start_at=args.start_at,
        eval_every_n=args.eval_every_n,
        eval_last_only=args.eval_last_only,
    )

    measure_name = gen_measure_name(args.coherence_measure, args.window_size,
                                    args.reference_corpus, args.top_n)
    coherence_results = {measure_name: {}}

    # The reference text is loop-invariant: load it once instead of re-reading
    # the (potentially large) corpus on every topic-set iteration.
    reference_text = load_tokens(Path(mapped_dir, ref_corpus_fname))

    print("calculating coherence...", flush=True)
    for idx, path, topics in topic_sets:
        topics = [t[:args.top_n] for t in topics]

        cm = CoherenceModel(
            topics=topics,
            texts=reference_text,
            dictionary=data_dict,
            coherence=args.coherence_measure,
            window_size=args.window_size,
        )
        confirmed_measures = cm.get_coherence_per_topic()
        mean = cm.aggregate_measures(confirmed_measures)
        coherence_results[measure_name][idx] = {
            "aggregate": float(mean),
            "by_topic": [float(i) for i in confirmed_measures
                         ],  # needs to be python float to json-serialize
            "path": str(path),
        }
    if not save:
        return coherence_results

    output_path = parent_dir / "coherences.json"
    if output_path.exists():
        prev_coherence = load_json(output_path)
        # Merge per measure so previously saved epochs under the same measure
        # name are preserved (a top-level dict update would clobber them).
        for measure, results_by_idx in coherence_results.items():
            prev_coherence.setdefault(measure, {}).update(results_by_idx)
        coherence_results = prev_coherence

    save_json(coherence_results, output_path)
    print("done!")
    return coherence_results