def calculate_coherence(args, save=True):
    """Compute topic-coherence scores for every collected topic checkpoint.

    Loads the run's config from ``config.yml`` next to ``args.input_dir``,
    resolves the reference corpus and gensim dictionary (with a scratch-dir
    caching hack), then scores each checkpoint's topics with a
    ``CoherenceModel`` and (optionally) merges the results into
    ``coherences.json`` in the run's parent directory.

    Args:
        args: Namespace-like object; reads ``input_dir``, ``reference_corpus``,
            ``start_at``, ``eval_every_n``, ``eval_last_only``,
            ``coherence_measure``, ``window_size``, ``top_n``.
        save: When True, merge results into ``coherences.json`` on disk;
            when False, just return them.

    Returns:
        dict mapping measure-name -> {checkpoint idx -> {"aggregate",
        "by_topic", "path"}}.
    """
    topic_dir = Path(args.input_dir)
    parent_dir = topic_dir.parent
    config = load_yaml(parent_dir / "config.yml")
    try:
        data_dir = config["input_dir"]  # dvae, mallet
    except KeyError:
        data_dir = str(Path(config["data_path"]).parent)  # etm

    #### quick HACK to handle scratch directories, needs cleanup ###
    processed_name = Path(data_dir).name
    data_dir_map = {
        f"/workspace/topic-preprocessing/data/nytimes/processed/{processed_name}": f"/scratch/{processed_name}/nytimes",
        f"/workspace/topic-preprocessing/data/wikitext/processed/{processed_name}": f"/scratch/{processed_name}/wikitext",
        f"/workspace/topic-preprocessing/data/bbc/processed/{processed_name}": f"/scratch/{processed_name}/bbc",
    }

    # for out-of-sample coherence
    ref_corpus = args.reference_corpus
    if ref_corpus == "wikitext_full" or ref_corpus == "nytimes_full":
        try:
            data_dict = Dictionary.load(str(Path(data_dir, "train-dict.npy")))
        except FileNotFoundError:
            data_dict = make_dictionary(data_dir)
        if ref_corpus == "wikitext_full":
            mapped_dir = f"/workspace/topic-preprocessing/data/wikitext/processed/{processed_name}"
        if ref_corpus == "nytimes_full":
            mapped_dir = f"/workspace/topic-preprocessing/data/nytimes/processed/{processed_name}"
        ref_corpus_fname = "full.txt"
    # standard coherence
    else:
        ref_corpus_fname = f"{ref_corpus}.txt"  # can later update to external if needed
        mapped_dir = Path(data_dir_map[data_dir])
        if Path(mapped_dir, "train-dict.npy").exists() and Path(
                mapped_dir, ref_corpus_fname).exists():
            # scratch copy already populated: use it directly
            print("reading files from scratch", flush=True)
            data_dict = Dictionary.load(str(Path(mapped_dir, "train-dict.npy")))
        else:
            print("loading files", flush=True)
            try:
                data_dict = Dictionary.load(
                    str(Path(data_dir, "train-dict.npy")))
            except FileNotFoundError:
                data_dict = make_dictionary(data_dir)
            # copy to scratch directory so later runs hit the fast path above
            print("copying files to scratch", flush=True)
            mapped_dir.mkdir(exist_ok=True, parents=True)
            shutil.copy(Path(data_dir, ref_corpus_fname),
                        Path(mapped_dir, ref_corpus_fname))
            shutil.copy(Path(data_dir, "train-dict.npy"),
                        Path(mapped_dir, "train-dict.npy"))
    ### end hack ###

    topic_sets = collect_topics(
        topic_dir=topic_dir,
        start_at=args.start_at,
        eval_every_n=args.eval_every_n,
        eval_last_only=args.eval_last_only,
    )
    measure_name = gen_measure_name(args.coherence_measure, args.window_size,
                                    args.reference_corpus, args.top_n)
    coherence_results = {measure_name: {}}

    # The reference corpus is identical for every checkpoint — load it once
    # instead of re-reading it from disk on every loop iteration.
    reference_text = load_tokens(Path(mapped_dir, ref_corpus_fname))

    print("calculating coherence...", flush=True)
    for idx, path, topics in topic_sets:
        topics = [t[:args.top_n] for t in topics]
        cm = CoherenceModel(
            topics=topics,
            texts=reference_text,
            dictionary=data_dict,
            coherence=args.coherence_measure,
            window_size=args.window_size,
        )
        confirmed_measures = cm.get_coherence_per_topic()
        mean = cm.aggregate_measures(confirmed_measures)
        coherence_results[measure_name][idx] = {
            "aggregate": float(mean),
            "by_topic": [float(i) for i in confirmed_measures
                         ],  # needs to be python float to json-serialize
            "path": str(path),
        }

    if not save:
        return coherence_results

    output_path = parent_dir / "coherences.json"
    if output_path.exists():
        # Merge at the per-epoch level so results saved by earlier runs for
        # the same measure name are preserved (a top-level dict.update()
        # would clobber them wholesale).
        merged = load_json(output_path)
        for measure, per_epoch in coherence_results.items():
            merged.setdefault(measure, {}).update(per_epoch)
        coherence_results = merged
    save_json(coherence_results, output_path)
    print("done!")
    return coherence_results