def compute_rouge(model, dataset, reference_dir):
    """Greedily extract a summary for every example in ``dataset`` and
    score the summaries with ROUGE-1/2 against the references in
    ``reference_dir``.

    Args:
        model: trained extractor exposing ``eval()`` and
            ``greedy_predict(inputs)``.
        dataset: provides batches via ``iter_batch()``; each batch carries
            ``inputs`` and ``metadata`` (ids and sentence texts).
        reference_dir: directory of reference summaries; mapped to example
            ids by ``collect_reference_paths``.

    Returns:
        A one-row DataFrame holding the final (corpus-average) ROUGE row.
    """
    model.eval()
    ids2refs = collect_reference_paths(reference_dir)
    with rouge_papier.util.TempFileManager() as manager:
        path_data = []
        for batch in dataset.iter_batch():
            batch_size = batch.inputs.sequence.size(0)
            predictions = model.greedy_predict(batch.inputs)
            for b in range(batch_size):
                # Avoid shadowing the builtin ``id``.
                example_id = batch.metadata.id[b]
                # Entries <= -1 are padding in the prediction tensor; keep
                # only real sentence indices.
                preds = [p for p in predictions.data[b].cpu().tolist()
                         if p > -1]
                summary = "\n".join(
                    [batch.metadata.text[b][p] for p in preds])
                summary_path = manager.create_temp_file(summary)
                ref_paths = ids2refs[example_id]
                path_data.append([summary_path, ref_paths])
        config_text = rouge_papier.util.make_simple_config_text(path_data)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(config_path, max_ngram=2, lcs=False,
                                        remove_stopwords=True)
        # Last row of the frame is the average over all examples.
        return df[-1:]
def compute_rouge(model, dataset, reference_dir, remove_stopwords=True,
                  summary_length=100):
    """Score model-predicted summaries with ROUGE-1/2.

    Args:
        model: trained model exposing ``eval()`` and
            ``predict(inputs, metadata)`` returning a list of sentence
            lists, one per example.
        dataset: provides batches via ``iter_batch()``.
        reference_dir: directory of reference summaries keyed by example id.
        remove_stopwords: forwarded to the ROUGE scorer.
        summary_length: word budget forwarded to the ROUGE scorer.

    Returns:
        A one-row DataFrame holding the corpus-average ROUGE row.
    """
    model.eval()
    ids2refs = collect_reference_paths(reference_dir)
    with rouge_papier.util.TempFileManager() as manager:
        path_data = []
        for batch in dataset.iter_batch():
            predicted_texts = model.predict(batch.inputs, batch.metadata)
            for example_id, sentences in zip(batch.metadata.id,
                                             predicted_texts):
                summary_path = manager.create_temp_file(
                    "\n".join(sentences))
                path_data.append([summary_path, ids2refs[example_id]])
        config_path = manager.create_temp_file(
            rouge_papier.util.make_simple_config_text(path_data))
        df = rouge_papier.compute_rouge(
            config_path, max_ngram=2, lcs=False,
            remove_stopwords=remove_stopwords, length=summary_length)
        # Final row is the average over all examples.
        return df[-1:]
def compute_rouge(model, dataset, reference_dir):
    """Extract summaries with the "rank" strategy and score them with
    ROUGE-1/2 (stopwords removed) against the references in
    ``reference_dir``.

    Returns:
        A one-row DataFrame holding the corpus-average ROUGE row.
    """
    model.eval()
    ids2refs = collect_reference_paths(reference_dir)
    with rouge_papier.util.TempFileManager() as manager:
        path_data = []
        for batch in dataset.iter_batch():
            extracted = model.extract(batch.inputs, batch.metadata,
                                      strategy="rank")
            for b, summary in enumerate(extracted):
                example_id = batch.metadata.id[b]
                summary_path = manager.create_temp_file(summary)
                path_data.append([summary_path, ids2refs[example_id]])
        config_path = manager.create_temp_file(
            rouge_papier.util.make_simple_config_text(path_data))
        df = rouge_papier.compute_rouge(config_path, max_ngram=2, lcs=False,
                                        remove_stopwords=True)
        # Final row is the average over all examples.
        return df[-1:]
def compute_rouge(model, dataloader, remove_stopwords=True,
                  summary_length=100):
    """Score model predictions with ROUGE and tally which sentence
    positions the model selects.

    Args:
        model: trained model exposing ``eval()`` and
            ``predict(batch, return_indices=True, max_length=...)``
            returning (texts, positions).
        dataloader: iterable of batches carrying ``id`` and
            ``reference_paths``.
        remove_stopwords: forwarded to the ROUGE scorer.
        summary_length: word budget for prediction and scoring.

    Returns:
        (df, hist): ``df`` is the one-row corpus-average ROUGE frame;
        ``hist`` maps sentence position (int) -> selection count.
    """
    model.eval()
    hist = {}
    with rouge_papier.util.TempFileManager() as manager:
        path_data = []
        for batch in dataloader:
            texts, positions = model.predict(batch, return_indices=True,
                                             max_length=summary_length)
            for pos_b in positions:
                for p in pos_b:
                    # Cast to int so the histogram is keyed by plain ints
                    # even when positions are tensor scalars; the sibling
                    # compute_rouge variant in this file already does this.
                    p = int(p)
                    hist[p] = hist.get(p, 0) + 1
            for b, text in enumerate(texts):
                # (The original also read batch.id[b] here but never used
                # it; dropped.)
                summary = "\n".join(text)
                summary_path = manager.create_temp_file(summary)
                path_data.append(
                    [summary_path,
                     [str(x) for x in batch.reference_paths[b]]])
        config_text = rouge_papier.util.make_simple_config_text(path_data)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(config_path, max_ngram=2, lcs=False,
                                        remove_stopwords=remove_stopwords,
                                        length=summary_length)
        # Final row is the average over all examples.
        return df[-1:], hist
def evaluate_method(inputs_path, abs_dir, output_dir, method="lead",
                    summary_length=100):
    """Run a heuristic baseline summarizer over a directory of input JSON
    files, write the summaries to ``output_dir``, and compute per-example
    ROUGE (1, 2, and L) with confidence intervals.

    Args:
        inputs_path: directory of input ``.json`` examples.
        abs_dir: directory of reference abstracts, resolved per example by
            ``find_references``.
        output_dir: directory to write ``<id>.summary`` files into.
        method: one of "lead", "tail", "random".
        summary_length: word budget for both the baseline and the scorer.

    Returns:
        (df, conf): score frame indexed by example id plus an "average"
        row, and the ROUGE confidence-interval frame.

    Raises:
        ValueError: if ``method`` is not recognized.
    """
    # Dispatch table keeps the method choice in one place and fails fast
    # before any files are processed.
    makers = {"lead": make_lead, "tail": make_tail, "random": make_random}
    if method not in makers:
        # ValueError subclasses Exception, so callers catching the old
        # generic Exception still work.
        raise ValueError("method not implemented: " + method)
    make_summary = makers[method]
    ids = []
    rouge_config_paths = []
    for f in os.listdir(inputs_path):
        with open(os.path.join(inputs_path, f), "r") as inp_fp:
            example = json.load(inp_fp)
        sys_summary_text = make_summary(example, limit=summary_length)
        ref_paths = find_references(abs_dir, example["id"])
        sys_path = os.path.join(
            output_dir, "{}.summary".format(example["id"]))
        with open(sys_path, "w") as out_fp:
            out_fp.write(sys_summary_text)
        rouge_config_paths.append([sys_path, ref_paths])
        ids.append(example["id"])
    with rouge_papier.util.TempFileManager() as manager:
        config_text = rouge_papier.util.make_simple_config_text(
            rouge_config_paths)
        config_path = manager.create_temp_file(config_text)
        df, conf = rouge_papier.compute_rouge(
            config_path, max_ngram=2, lcs=True, remove_stopwords=False,
            length=summary_length, return_conf=True)
        # ROUGE returns one row per example (input order) plus an average.
        df.index = ids + ["average"]
        return df, conf
def partition_inputs(inputs_path, abstracts, inputs_out, output_dir,
                     summary_length=100):
    """Score every input with the lead baseline and partition the input
    files into quartile directories ``inputs1`` .. ``inputs4`` (worst to
    best ROUGE-1) under ``inputs_out``.

    Args:
        inputs_path: directory of input ``.json`` examples.
        abstracts: directory of reference abstracts, resolved per example
            by ``find_references``.
        inputs_out: destination root for the four quartile directories.
        output_dir: directory to write ``<id>.summary`` files into.
        summary_length: word budget for the lead baseline and the scorer.
    """
    import shutil  # local import: replaces the shell `cp` calls below

    ids = []
    rouge_config_paths = []
    for f in os.listdir(inputs_path):
        with open(os.path.join(inputs_path, f), "r") as inp_fp:
            example = json.load(inp_fp)
        sys_summary_text = make_lead(example, limit=summary_length)
        ref_paths = find_references(abstracts, example["id"])
        sys_path = os.path.join(
            output_dir, "{}.summary".format(example["id"]))
        with open(sys_path, "w") as out_fp:
            out_fp.write(sys_summary_text)
        rouge_config_paths.append([sys_path, ref_paths])
        ids.append(example["id"])
    with rouge_papier.util.TempFileManager() as manager:
        config_text = rouge_papier.util.make_simple_config_text(
            rouge_config_paths)
        config_path = manager.create_temp_file(config_text)
        df, conf = rouge_papier.compute_rouge(
            config_path, max_ngram=2, lcs=True, remove_stopwords=False,
            length=summary_length, return_conf=True)
    df.index = ids + ["average"]
    # Ids sorted by ascending ROUGE-1; the "average" row is filtered out
    # inside the copy loop below.
    scored_ids = sorted(df.to_dict()["rouge-1"].items(),
                        key=lambda x: x[1])
    idx = len(scored_ids) // 4
    # os.makedirs + shutil.copy replace the original
    # os.system("mkdir -p ..."/"cp ...") calls, which were
    # shell-injection-prone and non-portable.
    # NOTE: as in the original slicing, up to len(scored_ids) % 4 trailing
    # (highest-scoring) items are not copied anywhere.
    for part in range(4):
        part_dir = os.path.join(inputs_out, "inputs{}".format(part + 1))
        os.makedirs(part_dir, exist_ok=True)
        for example_id, score in scored_ids[part * idx:(part + 1) * idx]:
            if example_id != "average":
                shutil.copy(
                    os.path.join(inputs_path,
                                 "{}.json".format(example_id)),
                    part_dir)
def compute_rouge(model, dataset, reference_dir, output_dir,
                  remove_stopwords=True, summary_length=100):
    """Write predicted summaries to ``output_dir``, score them with ROUGE
    (1, 2, and L), and tally which sentence positions were selected.

    Args:
        model: trained model exposing ``eval()`` and
            ``predict(inputs, metadata, return_indices=True, ...)``.
        dataset: provides ``size``, ``batch_size`` and ``iter_batch()``.
        reference_dir: directory of reference summaries keyed by id.
        output_dir: directory to write ``<id>.summary`` files into.
        remove_stopwords: forwarded to the ROUGE scorer.
        summary_length: word budget for scoring (prediction gets 25 extra
            words so ROUGE truncation determines the final cutoff).

    Returns:
        (df, hist): ``df`` is indexed by example id (sorted) with an
        "average" row last; ``hist`` maps sentence position -> count.
    """
    model.eval()
    position_counts = {}
    ids2refs = collect_reference_paths(reference_dir)
    total_batches = int(np.ceil(dataset.size / dataset.batch_size))
    example_ids = []
    with rouge_papier.util.TempFileManager() as manager:
        path_data = []
        for step, batch in enumerate(dataset.iter_batch(), 1):
            # Simple in-place progress indicator.
            sys.stdout.write("{}/{}\r".format(step, total_batches))
            sys.stdout.flush()
            texts, positions = model.predict(
                batch.inputs, batch.metadata, return_indices=True,
                max_length=summary_length + 25)
            for batch_positions in positions:
                for pos in batch_positions:
                    pos = int(pos)
                    position_counts[pos] = position_counts.get(pos, 0) + 1
            for b, sentences in enumerate(texts):
                example_id = batch.metadata.id[b]
                summary_path = os.path.join(
                    output_dir, "{}.summary".format(example_id))
                with open(summary_path, "w") as sfp:
                    sfp.write("\n".join(sentences))
                path_data.append([summary_path, ids2refs[example_id]])
                example_ids.append(example_id)
        print("")
        config_path = manager.create_temp_file(
            rouge_papier.util.make_simple_config_text(path_data))
        df = rouge_papier.compute_rouge(
            config_path, max_ngram=2, lcs=True,
            remove_stopwords=remove_stopwords, length=summary_length)
        # Re-order the per-example rows by id, keeping "average" last.
        df.index = example_ids + ["average"]
        df = pd.concat([df[:-1].sort_index(), df[-1:]], axis=0)
        return df, position_counts
def main(args=None):
    """Lead baseline: take sentences from the top of each input until the
    100-word budget is passed, then score the result with ROUGE-1/2."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--inputs", type=str, required=True)
    parser.add_argument("--remove-stopwords", action="store_true",
                        required=False, default=False)
    parser.add_argument("--reference-summary-dir", type=str, required=True)
    args = parser.parse_args(args)

    ids2refs = collect_reference_paths(args.reference_summary_dir)
    with rouge_papier.util.TempFileManager() as manager:
        data_paths = []
        with open(args.inputs, "r") as fp:
            # One JSON example per line.
            for line in fp:
                example = json.loads(line)
                lead_sentences = []
                words_so_far = 0
                for sent in example["inputs"]:
                    lead_sentences.append(sent["text"])
                    words_so_far += sent["word_count"]
                    # The sentence that crosses the 100-word budget is
                    # kept; ROUGE truncates downstream.
                    if words_so_far > 100:
                        break
                summary_path = manager.create_temp_file(
                    "\n".join(lead_sentences))
                data_paths.append(
                    [summary_path, ids2refs[example["id"]]])
        config_path = manager.create_temp_file(
            rouge_papier.util.make_simple_config_text(data_paths))
        df = rouge_papier.compute_rouge(
            config_path, max_ngram=2, lcs=False,
            remove_stopwords=args.remove_stopwords)
        # Keep only the corpus-average row, labeled "lead".
        result = df[-1:]
        result.index = ["lead"]
        print(result)
        return result
def main():
    """Compare one or more system-summary directories against reference
    summaries with ROUGE-1..4 and ROUGE-L, printing one row per system."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--reference-summaries", type=str, required=True)
    parser.add_argument("--system-summaries", nargs="+", type=str,
                        required=True)
    parser.add_argument("--system-names", type=str, nargs="+",
                        required=False, default=None)
    args = parser.parse_args()

    if args.system_names is None:
        # Default display name: the trailing 20 characters of each path.
        args.system_names = [path[-20:] for path in args.system_summaries]
    if len(args.system_names) != len(args.system_summaries):
        raise Exception("--system-names must have the same number of "
                        "arguments as --system-summaries")

    id2paths = read_reference_summary_manifest(args.reference_summaries)
    rows = []
    row_names = []
    for sys_dir, sys_name in zip(args.system_summaries, args.system_names):
        sys_ids, sys_paths = read_system_summary_manifest(sys_dir, id2paths)
        pairs = [[summary_path, id2paths[sys_id]]
                 for sys_id, summary_path in zip(sys_ids, sys_paths)]
        config_text = rouge_papier.util.make_simple_config_text(pairs)
        with rouge_papier.util.TempFileManager() as manager:
            config_path = manager.create_temp_file(config_text)
            scores = rouge_papier.compute_rouge(config_path, max_ngram=4,
                                                lcs=True)
            # Keep only the corpus-average row for this system.
            rows.append(scores[-1:])
            row_names.append(sys_name)
    df = pd.concat(rows, axis=0)
    df.index = row_names
    print(df)
def compute(self):
    """Run the Perl ROUGE script over all accumulated
    (system path, reference paths) pairs and return the average-row
    scores as a plain dict.

    Returns:
        dict mapping metric name -> average score.

    Raises:
        NotComputableError: if no examples have been added yet.
    """
    if not self._path_data:
        raise NotComputableError(
            'PerlRouge must have at least one example before '
            'it can be computed')
    with rouge_papier.util.TempFileManager() as manager:
        config_text = rouge_papier.util.make_simple_config_text(
            self._path_data)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(
            config_path, max_ngram=2, lcs=False,
            remove_stopwords=self.remove_stopwords,
            length=self.summary_length)
        # Optionally clean up the accumulated system-summary files.
        if self.delete_temp_files:
            for paths in self._path_data:
                pathlib.Path(paths[0]).unlink()
        return df.iloc[-1:].to_dict("records")[0]
# Query-focused extraction + ROUGE evaluation script fragment.
# Reads one JSON example per line from dps_path, extracts a summary with
# `predictor`, writes it to out_path, and scores everything with ROUGE.
# NOTE(review): rouge_paths, predictor, strategy, rescore, out_path,
# dps_path and refs_path are assumed to be defined earlier in this
# script (not visible in this chunk) — confirm before refactoring.
ids2refs = collect_reference_paths(refs_path)
with rouge_papier.util.TempFileManager() as manager:
    for line in open(dps_path):
        dp = json.loads(line)
        id = dp["id"]
        query = dp["query"]
        #print("dp,id: %s,%s" % (id,query))
        # Pre-computed query embedding for this example.
        qry_embds = dp["qembedding"]
        sentences = []
        tokens = []
        sen_embds = []
        # Gather per-sentence text, whitespace tokens, and embeddings.
        for input in dp["inputs"]:
            sen_id = input["sentence_id"]
            sen_embds.append(input["embedding"])
            sentences.append(input["text"])
            tokens.append(input["text"].split(" "))
        ref_paths = ids2refs[id]
        inputs, metadata = get_inputs_metadata(tokens, sentences,
                                               sen_embds, qry_embds)
        # get_inputs_metadata can return None inputs (presumably for
        # empty/degenerate examples — verify); such examples are skipped
        # and contribute nothing to the ROUGE config.
        if inputs is not None:
            summaries, _ = predictor.extract(inputs, metadata,
                                             strategy=strategy,
                                             word_limit=100,
                                             rescore=rescore)
            summary_path = "%s/%s.pred" % (out_path, id)
            # Only the top summary is written and scored.
            write2file("%s" % summaries[0] + "\n", summary_path)
            rouge_paths.append([summary_path, ref_paths])
    # compute rouge
    config_text = rouge_papier.util.make_simple_config_text(rouge_paths)
    config_path = manager.create_temp_file(config_text)
    df = rouge_papier.compute_rouge(config_path, max_ngram=2, lcs=False)
    # Last row is the corpus average.
    print(df[-1:])
def main():
    """Evaluate an nnsum extractive model with the original Perl ROUGE
    script, print the corpus-average row, and optionally write per-example
    and average results to a JSON file (``--results``)."""
    parser = argparse.ArgumentParser(
        "Evaluate nnsum models using original Perl ROUGE script.")
    parser.add_argument("--batch-size", default=32, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument("--sentence-limit", default=None, type=int)
    parser.add_argument("--summary-length", type=int, default=100)
    parser.add_argument("--loader-workers", type=int, default=None)
    parser.add_argument("--remove-stopwords", action="store_true",
                        default=False)
    parser.add_argument("--inputs", type=pathlib.Path, required=True)
    parser.add_argument("--refs", type=pathlib.Path, required=True)
    parser.add_argument("--model", type=pathlib.Path, required=True)
    parser.add_argument("--results", type=pathlib.Path, required=False,
                        default=None)
    args = parser.parse_args()

    if args.loader_workers is None:
        args.loader_workers = min(16, cpu_count())

    print("Loading model...", end="", flush=True)
    # Load to CPU first; move to GPU only if one was requested.
    model = torch.load(args.model,
                       map_location=lambda storage, loc: storage)
    if args.gpu > -1:
        model.cuda(args.gpu)
    vocab = model.embeddings.vocab
    print(" OK!")

    data = nnsum.data.SummarizationDataset(
        vocab, args.inputs, references_dir=args.refs,
        sentence_limit=args.sentence_limit)
    loader = nnsum.data.SummarizationDataLoader(
        data, batch_size=args.batch_size,
        num_workers=args.loader_workers)

    ids = []
    path_data = []
    model.eval()
    with rouge_papier.util.TempFileManager() as manager:
        with torch.no_grad():
            for step, batch in enumerate(loader, 1):
                batch = batch.to(args.gpu)
                print("generating summaries {} / {} ...".format(
                          step, len(loader)),
                      end="\r" if step < len(loader) else "\n",
                      flush=True)
                texts = model.predict(batch,
                                      max_length=args.summary_length)
                for text, ref_paths in zip(texts, batch.reference_paths):
                    summary = "\n".join(text)
                    summary_path = manager.create_temp_file(summary)
                    path_data.append(
                        [summary_path, [str(x) for x in ref_paths]])
                ids.extend(batch.id)

        config_text = rouge_papier.util.make_simple_config_text(path_data)
        config_path = manager.create_temp_file(config_text)
        df = rouge_papier.compute_rouge(
            config_path, max_ngram=2, lcs=True,
            remove_stopwords=args.remove_stopwords,
            length=args.summary_length)
        # One row per example (loader order) plus the average row.
        df.index = ids + ["average"]
        # Sort per-example rows by id, keeping "average" last.
        df = pd.concat([df[:-1].sort_index(), df[-1:]], axis=0)
        print(df[-1:])

        if args.results:
            records = df[:-1].to_dict("records")
            results = {
                # BUG FIX 1: the output key was misspelled "idividual".
                # BUG FIX 2: records come from the id-sorted frame, so pair
                # them with the sorted index, not the original-order `ids`
                # list (the old zip misaligned ids and scores whenever the
                # loader order was not already sorted).
                "individual": {example_id: record
                               for example_id, record
                               in zip(df[:-1].index, records)},
                "average": df[-1:].to_dict("records")[0],
            }
            args.results.parent.mkdir(parents=True, exist_ok=True)
            with args.results.open("w") as fp:
                fp.write(json.dumps(results))