def main():
    """Load a trained model and write salience scores for the train and
    validation input files named on the command line."""
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--batch-size", default=16, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument("--train-salience", type=str, default=None)
    parser.add_argument("--valid-salience", type=str, default=None)
    parser.add_argument("--model-path", type=str, default=None)
    parser.add_argument("--seed", default=48929234, type=int)
    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    print("Reading model from {} ...".format(args.model_path))
    model = torch.load(args.model_path)
    if args.gpu > -1:
        model.cuda(args.gpu)
    else:
        model.cpu()

    def ensure_parent_dir(path):
        # Create the containing directory of `path` if it is missing.
        parent = os.path.dirname(path)
        if not os.path.exists(parent) and parent != "":
            os.makedirs(parent)

    if args.train_salience is not None:
        ensure_parent_dir(args.train_salience)
    if args.valid_salience is not None:
        ensure_parent_dir(args.valid_salience)

    print("Reading training inputs data from {} ...".format(args.train_inputs))
    training_data = make_dataset(
        read_data(args.train_inputs), args.batch_size, args.gpu)
    print("Writing training salience data to {} ...".format(
        args.train_salience))
    write_salience(model, training_data, args.train_salience)

    print("Reading validation inputs data from {} ...".format(
        args.valid_inputs))
    validation_data = make_dataset(
        read_data(args.valid_inputs), args.batch_size, args.gpu)
    print("Writing validation salience data to {} ...".format(
        args.valid_salience))
    write_salience(model, validation_data, args.valid_salience)
def main(args=None):
    """Train a CNN sentence extractor and track validation ROUGE per epoch.

    The checkpoint with the best validation ROUGE-2 so far is saved to
    --save-model (when given).  Returns a pandas DataFrame concatenating the
    per-epoch validation ROUGE results.

    Fixes vs. the previous version: removed the unused `train_rouge_results`
    accumulator and dead commented-out code, and the two per-epoch progress
    callbacks (which were redefined every iteration) are now built by a
    single factory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--train-labels", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--valid-labels", type=str, required=True)
    parser.add_argument("--train-summary-dir", type=str, required=True)
    parser.add_argument("--valid-summary-dir", type=str, required=True)
    parser.add_argument("--gpu", default=-1, type=int, required=False)
    parser.add_argument("--epochs", default=20, type=int, required=False)
    parser.add_argument("--seed", default=83432534, type=int, required=False)
    parser.add_argument("--lr", required=False, default=.0001, type=float)
    parser.add_argument("--batch-size", default=8, type=int, required=False)
    parser.add_argument("--embedding-size", type=int, required=False,
                        default=300)
    parser.add_argument("--rnn-hidden-size", type=int, required=False,
                        default=512)
    parser.add_argument("--rnn-layers", type=int, required=False, default=1)
    parser.add_argument("--hidden-layer-sizes", nargs="+", default=[100],
                        type=int, required=False)
    parser.add_argument("--hidden-layer-activations", nargs="+",
                        default="relu", type=str, required=False)
    parser.add_argument("--hidden-layer-dropout", default=.0, type=float,
                        required=False)
    parser.add_argument("--input-layer-norm", default=False,
                        action="store_true")
    parser.add_argument("--save-model", required=False, type=str)
    args = parser.parse_args(args)

    ntp.set_random_seed(args.seed)

    input_reader = spensum.dataio.init_duc_sds_input_reader(
        args.embedding_size)
    label_reader = spensum.dataio.init_duc_sds_label_reader()

    train_dataset = spensum.dataio.read_input_label_dataset(
        args.train_inputs, args.train_labels, input_reader, label_reader,
        batch_size=args.batch_size, gpu=args.gpu)
    valid_dataset = spensum.dataio.read_input_label_dataset(
        args.valid_inputs, args.valid_labels, input_reader, label_reader,
        batch_size=args.batch_size, gpu=args.gpu)

    model = spensum.model.CNNExtractor2(args.embedding_size)
    if args.gpu > -1:
        model.cuda(args.gpu)

    # Inverse-frequency class weights to counter the salient/non-salient
    # label imbalance.  NOTE(review): assumes both classes are present in
    # the training targets, otherwise this divides by zero -- confirm.
    non_salient_count = train_dataset.targets.eq(0).sum()
    salient_count = train_dataset.targets.eq(1).sum()
    weight = torch.FloatTensor([1 / non_salient_count, 1 / salient_count])

    opt = ntp.optimizer.Adam(model.parameters(), lr=args.lr)
    crit = ntp.criterion.BinaryCrossEntropy(
        mode="prob", weight=weight, mask_value=-1)
    crit.add_reporter(ntp.criterion.BinaryFMeasureReporter(mode="prob"))
    crit.set_selection_criterion("BinaryFMeasureReporter")

    def _make_step_callback(label, epoch):
        # Build a progress-line writer for one pass over the data; the line
        # is cleared when the pass finishes.
        def _callback(step, max_steps, batch_loss, criterion):
            sys.stdout.write("\r")
            sys.stdout.write(" " * 79)
            sys.stdout.write("\r")
            sys.stdout.write("\t{} {}: {} / {} | obj: {:0.9f}".format(
                label, epoch, step, max_steps, criterion.avg_loss))
            sys.stdout.flush()
            if step == max_steps:
                sys.stdout.write("\r" + " " * 79 + "\r")
                sys.stdout.flush()
        return _callback

    valid_rouge_results = []
    best_rouge = 0
    for epoch in range(1, args.epochs + 1):
        ntp.trainer.train_epoch(
            crit, model, opt, train_dataset,
            step_callback=_make_step_callback("train", epoch))
        crit.checkpoint("training")

        ntp.trainer.eval(
            crit, model, valid_dataset,
            step_callback=_make_step_callback("valid", epoch))
        crit.checkpoint("validation")

        valid_rouge = compute_rouge(
            model, valid_dataset, args.valid_summary_dir)
        valid_rouge_results.append(valid_rouge)
        rouge_score = valid_rouge["rouge-2"].values[0]
        if rouge_score > best_rouge:
            best_rouge = rouge_score
            if args.save_model is not None:
                print("Saving model!")
                torch.save(model, args.save_model)

    return pd.concat(valid_rouge_results, axis=0)
def main():
    """Train a PointerNetwork extractor on t-SNE sentence features with rank
    supervision, saving the best model by validation ROUGE-2.

    Fix: the final "Best epoch" summary previously crashed with a TypeError
    when no epoch improved on a ROUGE-2 of 0 (`best_epoch` stayed None and
    was used as a list index); that case is now reported explicitly.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-tsne", type=str, required=True)
    parser.add_argument("--train-ranks", type=str, required=True)
    parser.add_argument("--valid-tsne", type=str, required=True)
    parser.add_argument("--valid-ranks", type=str, required=True)
    parser.add_argument("--batch-size", default=16, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument("--lr", default=.001, type=float)
    parser.add_argument("--remove-stopwords", action="store_true",
                        default=False)
    parser.add_argument("--results-path", type=str, default=None)
    parser.add_argument("--model-path", type=str, default=None)
    parser.add_argument("--context-dropout", default=.5, type=float)
    parser.add_argument("--context-size", default=300, type=int)
    parser.add_argument("--validation-summary-dir", required=True, type=str)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--seed", default=48929234, type=int)
    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    # Dimensionality of the t-SNE feature vectors fed to the network.
    embedding_size = 4

    if args.results_path is not None:
        results_dir = os.path.dirname(args.results_path)
        if not os.path.exists(results_dir) and results_dir != "":
            os.makedirs(results_dir)
    if args.model_path is not None:
        model_dir = os.path.dirname(args.model_path)
        if not os.path.exists(model_dir) and model_dir != "":
            os.makedirs(model_dir)

    print("Reading training tsne and salience data from {} ...".format(
        args.train_tsne))
    training_tsne_data = read_data(args.train_tsne)
    print("Reading training rank data from {} ...".format(args.train_ranks))
    training_ranks_data = read_data(args.train_ranks)
    # Feature and rank files must be aligned example-for-example.
    for a, b in zip(training_tsne_data, training_ranks_data):
        assert a["id"] == b["id"]
    training_data = make_dataset(
        training_tsne_data, training_ranks_data, args.batch_size, args.gpu)

    print("Reading validation tsne and salience data from {} ...".format(
        args.valid_tsne))
    validation_tsne_data = read_data(args.valid_tsne)
    print("Reading validation rank data from {} ...".format(args.valid_ranks))
    validation_ranks_data = read_data(args.valid_ranks)
    for a, b in zip(validation_tsne_data, validation_ranks_data):
        assert a["id"] == b["id"]
    validation_data = make_dataset(
        validation_tsne_data, validation_ranks_data, args.batch_size,
        args.gpu)

    input_module = SequenceStandardizer(embedding_size)
    pn_model = PointerNetwork(
        input_module, args.context_size, attention_hidden_size=150, layers=2,
        context_dropout=args.context_dropout)

    # Xavier init for weight matrices, zeros for biases, normal otherwise.
    for name, param in pn_model.named_parameters():
        if "weight" in name or name.startswith("W") or name == "v":
            nn.init.xavier_normal(param)
        elif "bias" in name:
            nn.init.constant(param, 0)
        else:
            nn.init.normal(param)

    if args.gpu > -1:
        pn_model.cuda(args.gpu)

    optim = torch.optim.Adam(pn_model.parameters(), lr=args.lr)

    train_xents = []
    valid_results = []
    best_rouge_2 = 0
    best_epoch = None
    for epoch in range(1, args.epochs + 1):
        train_xent = train(optim, pn_model, training_data, epoch)
        train_xents.append(train_xent)
        # validate() is expected to return (xent, rouge-1, rouge-2)
        # -- inferred from the unpacking below; confirm against validate().
        valid_result = validate(
            pn_model, validation_data, epoch, args.validation_summary_dir,
            remove_stopwords=args.remove_stopwords)
        valid_results.append(valid_result)
        print(
            "Epoch {} :: Train xent: {:0.3f} | Valid xent: {:0.3f} | "
            "R1: {:0.3f} | R2: {:0.3f}".format(
                epoch, train_xents[-1], *valid_results[-1]))
        if valid_results[-1][-1] > best_rouge_2:
            best_rouge_2 = valid_results[-1][-1]
            best_epoch = epoch
            if args.model_path is not None:
                print("Saving model ...")
                torch.save(pn_model, args.model_path)

    # Guard against best_epoch never being set (no epoch beat ROUGE-2 of 0).
    if best_epoch is not None:
        print("Best epoch: {} ROUGE-1 {:0.3f} ROUGE-2 {:0.3f}".format(
            best_epoch, *valid_results[best_epoch - 1][1:]))
    else:
        print("No epoch improved validation ROUGE-2 above 0.")

    if args.results_path is not None:
        results = {
            "training": {"cross-entropy": train_xents},
            "validation": {
                "cross-entropy": [x[0] for x in valid_results],
                "rouge-1": [x[1] for x in valid_results],
                "rouge-2": [x[2] for x in valid_results]
            }
        }
        print("Writing results to {} ...".format(args.results_path))
        with open(args.results_path, "w") as fp:
            fp.write(json.dumps(results))
def main():
    """Evaluate a saved extractor with ROUGE on the train/valid/test splits.

    Fix: the results JSON was written unconditionally although --results-path
    defaults to None, so every run without that flag crashed with a TypeError
    at the very end; the write is now guarded.  The three identical
    read/evaluate sequences are factored into one helper.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--test-inputs", type=str, required=True)
    parser.add_argument("--batch-size", default=16, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument(
        "--remove-stopwords", action="store_true", default=False)
    parser.add_argument(
        "--summary-length", default=100, type=int)
    parser.add_argument("--results-path", type=str, default=None)
    parser.add_argument("--model-path", type=str, default=None)
    parser.add_argument("--train-summary-dir", required=True, type=str)
    parser.add_argument("--valid-summary-dir", required=True, type=str)
    parser.add_argument("--test-summary-dir", required=True, type=str)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--seed", default=48929234, type=int)
    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    if args.results_path is not None:
        results_dir = os.path.dirname(args.results_path)
        if not os.path.exists(results_dir) and results_dir != "":
            os.makedirs(results_dir)

    print("Loading model from {} ...".format(args.model_path))
    model = torch.load(args.model_path)
    if args.gpu > -1:
        model.cuda(args.gpu)
    else:
        model.cpu()

    def _evaluate(label, inputs_path, summary_dir):
        # Read one split, run the model, and return [rouge-1, rouge-2].
        print("Reading {} input data from {} ...".format(label, inputs_path))
        data = make_dataset(
            read_data(inputs_path), args.batch_size, args.gpu)
        rouge_df = compute_rouge(
            model, data, summary_dir,
            remove_stopwords=args.remove_stopwords,
            summary_length=args.summary_length)
        return rouge_df.values[0].tolist()

    train_r1, train_r2 = _evaluate(
        "training", args.train_inputs, args.train_summary_dir)
    print("TRAIN R1 {:0.3f} R2 {:0.3f}".format(train_r1, train_r2))

    valid_r1, valid_r2 = _evaluate(
        "validation", args.valid_inputs, args.valid_summary_dir)
    print("VALID R1 {:0.3f} R2 {:0.3f}".format(valid_r1, valid_r2))

    test_r1, test_r2 = _evaluate(
        "testing", args.test_inputs, args.test_summary_dir)
    print("TEST R1 {:0.3f} R2 {:0.3f}".format(test_r1, test_r2))

    results = {"training": {"rouge-1": train_r1, "rouge-2": train_r2},
               "validation": {"rouge-1": valid_r1, "rouge-2": valid_r2},
               "testing": {"rouge-1": test_r1, "rouge-2": test_r2}}
    if args.results_path is not None:
        with open(args.results_path, "w") as fp:
            fp.write(json.dumps(results))
def main():
    """Preprocess the NYT single-document summarization corpus.

    Resolves data locations from CLI flags or environment variables, then
    writes reference abstracts, extractive labels/ranks, a fitted SIF
    embedding model, and SIF-embedded inputs for the train/valid/test splits
    under $SPENSUM_DATA/nyt-sds.

    Fixes vs. the previous version: the SIF model path was built as
    os.path.join(spensum_data_path, nyt_sds_data_root, ...) although
    nyt_sds_data_root already starts with spensum_data_path, doubling the
    prefix whenever the path was relative; the five env-var fallback blocks
    are factored into one helper.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--spensum-data-path", type=str, default=None)
    parser.add_argument("--nyt-train-inputs-path", type=str, default=None)
    parser.add_argument("--nyt-train-abstracts-path", type=str, default=None)
    parser.add_argument("--nyt-test-inputs-path", type=str, default=None)
    parser.add_argument("--nyt-test-abstracts-path", type=str, default=None)
    parser.add_argument("--seed", type=int, default=43929524)
    parser.add_argument("--nprocs", type=int, default=None)
    args = parser.parse_args()

    if args.nprocs is None:
        args.nprocs = max(1, mp.cpu_count() // 2)

    ntp.set_random_seed(args.seed)

    def _require_path(current, env_var, message):
        # CLI flag wins; otherwise fall back to the environment variable,
        # and exit with guidance when neither is provided.
        if current is None:
            current = os.getenv(env_var, None)
        if current is None:
            sys.stderr.write(message)
            sys.exit(1)
        return current

    args.spensum_data_path = _require_path(
        args.spensum_data_path, "SPENSUM_DATA",
        "Set SPENSUM_DATA to set location to write data.\n")
    args.nyt_train_inputs_path = _require_path(
        args.nyt_train_inputs_path, "NYT_TRAIN_INPUTS_ORIGINAL",
        "Set NYT_TRAIN_INPUTS_ORIGINAL to location of "
        "NYT preprocessed training inputs directory (see "
        "(https://github.com/gregdurrett/berkeley-doc-summarizer "
        ").\n")
    args.nyt_train_abstracts_path = _require_path(
        args.nyt_train_abstracts_path, "NYT_TRAIN_ABS_ORIGINAL",
        "Set NYT_TRAIN_ABS_ORIGINAL to location of "
        "NYT preprocessed training abstracts directory (see "
        "(https://github.com/gregdurrett/berkeley-doc-summarizer "
        ").\n")
    args.nyt_test_inputs_path = _require_path(
        args.nyt_test_inputs_path, "NYT_TEST_INPUTS_ORIGINAL",
        "Set NYT_TEST_INPUTS_ORIGINAL to location of "
        "NYT preprocessed testing inputs directory (see "
        "(https://github.com/gregdurrett/berkeley-doc-summarizer "
        ").\n")
    args.nyt_test_abstracts_path = _require_path(
        args.nyt_test_abstracts_path, "NYT_TEST_ABS_ORIGINAL",
        "Set NYT_TEST_ABS_ORIGINAL to location of "
        "NYT preprocessed testing abstracts directory (see "
        "(https://github.com/gregdurrett/berkeley-doc-summarizer "
        ").\n")

    nyt_sds_data_root = os.path.join(args.spensum_data_path, "nyt-sds")
    train_ids, valid_ids = make_train_valid_list(args.nyt_train_inputs_path)

    def _long_enough(ex):
        # Keep only examples whose abstract exceeds 50 words in total.
        return sum(s["word_count"] for s in ex["inputs"]) > 50

    print("Reading training abstracts...")
    train_abstracts = [
        ex for ex in read_inputs(args.nyt_train_abstracts_path, train_ids)
        if _long_enough(ex)]
    train_abstracts = train_abstracts[:25000]
    train_ids = [ex["id"] for ex in train_abstracts]
    print(len(train_abstracts))
    print("Writing training reference abstracts...")
    summaries_train_path = os.path.join(
        nyt_sds_data_root, "summaries", "train", "human_abstracts")
    write_summaries(train_abstracts, summaries_train_path)

    print("Reading validation abstracts...")
    valid_abstracts = [
        ex for ex in read_inputs(args.nyt_train_abstracts_path, valid_ids)
        if _long_enough(ex)]
    valid_abstracts = valid_abstracts[:2500]
    valid_ids = [ex["id"] for ex in valid_abstracts]
    print(len(valid_abstracts))
    print("Writing validation reference abstracts...")
    summaries_valid_path = os.path.join(
        nyt_sds_data_root, "summaries", "valid", "human_abstracts")
    write_summaries(valid_abstracts, summaries_valid_path)

    print("Reading test abstracts...")
    test_abstracts = [
        ex for ex in read_inputs(args.nyt_test_abstracts_path)
        if _long_enough(ex)]
    test_ids = [ex["id"] for ex in test_abstracts]
    print(len(test_abstracts))
    print("Writing test reference abstracts...")
    summaries_test_path = os.path.join(
        nyt_sds_data_root, "summaries", "test", "human_abstracts")
    write_summaries(test_abstracts, summaries_test_path)

    print("Reading training inputs...")
    train_inputs_data = read_inputs(args.nyt_train_inputs_path, train_ids)
    print("Reading validation inputs...")
    valid_inputs_data = read_inputs(args.nyt_train_inputs_path, valid_ids)
    print("Reading test inputs...")
    test_inputs_data = read_inputs(args.nyt_test_inputs_path, test_ids)

    print("Writing train labels...")
    train_labels_path = os.path.join(
        nyt_sds_data_root, "labels",
        "nyt.sds.labels.seq.rouge-1.sw.train.json")
    train_ranks_path = os.path.join(
        nyt_sds_data_root, "ranks",
        "nyt.sds.ranks.seq.rouge-1.sw.train.json")
    generate_extracts(train_inputs_data, train_abstracts, "sequential", 1,
                      train_labels_path, train_ranks_path, args.nprocs)

    print("Writing valid labels...")
    valid_labels_path = os.path.join(
        nyt_sds_data_root, "labels",
        "nyt.sds.labels.seq.rouge-1.sw.valid.json")
    valid_ranks_path = os.path.join(
        nyt_sds_data_root, "ranks",
        "nyt.sds.ranks.seq.rouge-1.sw.valid.json")
    generate_extracts(valid_inputs_data, valid_abstracts, "sequential", 1,
                      valid_labels_path, valid_ranks_path, args.nprocs)

    print("Writing test labels...")
    test_labels_path = os.path.join(
        nyt_sds_data_root, "labels",
        "nyt.sds.labels.seq.rouge-1.sw.test.json")
    test_ranks_path = os.path.join(
        nyt_sds_data_root, "ranks",
        "nyt.sds.ranks.seq.rouge-1.sw.test.json")
    generate_extracts(test_inputs_data, test_abstracts, "sequential", 1,
                      test_labels_path, test_ranks_path, args.nprocs)

    print("Collecting sentence tokens...")
    all_training_sents = [
        [token.lower() for token in sent["tokens"]]
        for ex in train_inputs_data for sent in ex["inputs"]]
    print(len(all_training_sents))

    print("Loading sif embedding model...")
    sif_emb = ntp.models.sentence_embedding.SIFEmbedding.from_pretrained()
    print("Fitting principal component...")
    sif_emb.fit_principle_component(all_training_sents)
    # BUG FIX: nyt_sds_data_root already contains spensum_data_path, so the
    # old join(spensum_data_path, nyt_sds_data_root, ...) doubled the prefix.
    sif_path = os.path.join(nyt_sds_data_root, "sif.bin")
    torch.save(sif_emb, sif_path)

    print("Writing training inputs...")
    inputs_train_path = os.path.join(
        nyt_sds_data_root, "inputs", "nyt.sds.inputs.train.json")
    generate_inputs(train_inputs_data, sif_emb, inputs_train_path)
    print("Writing validation inputs...")
    inputs_valid_path = os.path.join(
        nyt_sds_data_root, "inputs", "nyt.sds.inputs.valid.json")
    generate_inputs(valid_inputs_data, sif_emb, inputs_valid_path)
    print("Writing test inputs...")
    inputs_test_path = os.path.join(
        nyt_sds_data_root, "inputs", "nyt.sds.inputs.test.json")
    generate_inputs(test_inputs_data, sif_emb, inputs_test_path)
def main(args=None):
    """Train the SENSEi energy-based sentence extractor.

    Builds the enabled submodules (RNN salience, position, word count, PC
    coverage, neighbor clique), optionally attaches auxiliary cross-entropy
    objectives, runs burn-in when required, and fits the energy model.

    Fixes vs. the previous version: five near-identical submodule
    registration sections are factored into `_register`/`_skip`, the
    `weight` argument to add_aux_criterion is always passed by keyword
    (it was positional for the rnn salience module only), and dead
    commented-out code is removed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--train-labels", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--valid-labels", type=str, required=True)
    parser.add_argument("--valid-summary-dir", type=str, required=True)
    parser.add_argument("--gpu", default=-1, type=int, required=False)
    parser.add_argument("--epochs", default=100, type=int, required=False)
    parser.add_argument("--seed", default=83432534, type=int, required=False)
    parser.add_argument("--lr", required=False, default=.01, type=float)
    parser.add_argument("--batch-size", default=16, type=int, required=False)
    parser.add_argument("--embedding-size", type=int, required=False,
                        default=300)
    parser.add_argument("--rnn-salience", action="store_true", default=False)
    parser.add_argument("--rs-xent", type=float, required=False, default=0)
    parser.add_argument("--rs-burn-in", type=int, default=0, required=False)
    parser.add_argument("--rs-hidden-size", default=150, type=int,
                        required=False)
    parser.add_argument("--position", action="store_true", default=False)
    parser.add_argument("--p-xent", type=float, default=0, required=False)
    parser.add_argument("--p-num-positions", type=int, default=50,
                        required=False)
    parser.add_argument("--p-burn-in", type=int, default=0, required=False)
    parser.add_argument("--word-count", action="store_true", default=False)
    parser.add_argument("--wc-xent", type=float, default=0, required=False)
    parser.add_argument("--wc-burn-in", type=int, default=0, required=False)
    parser.add_argument("--neighbor-clique", action="store_true",
                        default=False)
    parser.add_argument("--report-every", type=int, required=False,
                        default=1000)
    parser.add_argument("--validate-every", type=int, required=False,
                        default=1)
    parser.add_argument("--burn-in-report-every", type=int, required=False,
                        default=500)
    parser.add_argument("--pc-coverage", action="store_true", default=False)
    parser.add_argument("--pcc-xent", type=float, required=False, default=0)
    parser.add_argument("--pcc-burn-in", type=int, default=0, required=False)
    parser.add_argument("--hidden-layer-sizes", nargs="+", default=[100],
                        type=int, required=False)
    parser.add_argument("--hidden-layer-activations", nargs="+",
                        default="relu", type=str, required=False)
    parser.add_argument("--hidden-layer-dropout", default=.0, type=float,
                        required=False)
    parser.add_argument("--input-layer-norm", default=False,
                        action="store_true")
    args = parser.parse_args(args)

    colorama.init()
    print("")
    print(
        " ++=============================================================++")
    print(" || Summary Energy Network Sentence Extractor "
          + colorama.Fore.GREEN + colorama.Style.BRIGHT + "(SENSEi)"
          + colorama.Fore.RESET + colorama.Style.NORMAL + " trainer. ||")
    print(
        " ++=============================================================++")
    print("")
    print("Setting random seed: " + colorama.Style.BRIGHT
          + colorama.Fore.WHITE + str(args.seed)
          + colorama.Style.NORMAL + colorama.Fore.RESET + "\n")
    ntp.set_random_seed(args.seed)

    input_reader = spensum.dataio.init_duc_sds_input_reader(
        args.embedding_size)
    label_reader = spensum.dataio.init_duc_sds_label_reader()

    train_dataset = spensum.dataio.read_input_label_dataset(
        args.train_inputs, args.train_labels, input_reader, label_reader,
        batch_size=args.batch_size, gpu=args.gpu)
    valid_dataset = spensum.dataio.read_input_label_dataset(
        args.valid_inputs, args.valid_labels, input_reader, label_reader,
        batch_size=args.batch_size, gpu=args.gpu)

    # Inverse-frequency class weights for the salient/non-salient imbalance.
    non_salient_count = train_dataset.targets.eq(0).sum()
    salient_count = train_dataset.targets.eq(1).sum()
    weight = torch.FloatTensor([1 / non_salient_count, 1 / salient_count])

    crit = spensum.criterion.SPENLoss(weight=weight)
    crit.add_reporter(ntp.criterion.BinaryFMeasureReporter(mode="prob"))
    crit.set_selection_criterion("BinaryFMeasureReporter")

    print(colorama.Style.BRIGHT + "Beginning preflight check...\n"
          + colorama.Style.NORMAL)
    print("Initializing submodules...")

    submodules = []

    def _register(label, module, xent_weight=0, crit_name=None, ok_word="OK"):
        # Add an enabled submodule, optionally wiring an auxiliary
        # cross-entropy objective, and print its status line.
        submodules.append(module)
        msg = " {:>15} ... {:>8}".format(
            label, colorama.Fore.GREEN + ok_word + colorama.Fore.RESET)
        if module.burn_in > 0:
            msg += " burnin={} iters".format(module.burn_in)
        if xent_weight > 0:
            aux_crit = ntp.criterion.BinaryCrossEntropy(
                name=crit_name, mode="logit", weight=weight, mask_value=-1)
            aux_crit.add_reporter(
                ntp.criterion.BinaryFMeasureReporter(mode="logit"))
            crit.add_aux_criterion(module, aux_crit, weight=xent_weight)
            msg += colorama.Fore.GREEN + " xent obj" + colorama.Fore.RESET
        print(msg)

    def _skip(label):
        print(" {:>15} ... {:>8}".format(
            label, colorama.Fore.YELLOW + 'SKIP' + colorama.Fore.RESET))

    if args.rnn_salience:
        _register(
            "rnn salience",
            spensum.module.RNNSalience(
                args.embedding_size, hidden_size=args.rs_hidden_size,
                burn_in=args.rs_burn_in),
            xent_weight=args.rs_xent, crit_name="RNNSalienceXEnt")
    else:
        _skip("rnn salience")

    if args.position:
        _register(
            "position",
            spensum.module.Position(
                args.p_num_positions, burn_in=args.p_burn_in),
            xent_weight=args.p_xent, crit_name="PositionXEnt")
    else:
        _skip("position")

    if args.word_count:
        _register(
            "word_count",
            spensum.module.WordCount(burn_in=args.wc_burn_in),
            xent_weight=args.wc_xent, crit_name="WordCountXEnt")
    else:
        _skip("word_count")

    if args.pc_coverage:
        _register(
            "pc_coverage",
            spensum.module.PCCoverage(
                args.embedding_size, burn_in=args.pcc_burn_in),
            xent_weight=args.pcc_xent, crit_name="PCCoverageXEnt",
            ok_word="READY")
    else:
        _skip("pc_coverage")

    if args.neighbor_clique:
        # No auxiliary xent objective is defined for this module.
        _register("neighbor_clique", spensum.module.NeighborClique(),
                  ok_word="READY")
    else:
        _skip("neighbor_clique")

    print("\nInitializing energy model...")
    model = spensum.model.EnergyModel(submodules)
    if args.gpu > -1:
        print("Placing model on gpu device: "
              + colorama.Style.BRIGHT + colorama.Fore.WHITE + str(args.gpu)
              + colorama.Style.NORMAL + colorama.Fore.RESET)
        model.cuda(args.gpu)

    if not model.ready:
        print("Running burn in for {} iters...".format(model.burn_in_iters))
        burn_in(model, train_dataset, weight=weight,
                report_every=args.burn_in_report_every)

    opt = ntp.optimizer.Adam(model.parameters(), lr=args.lr)
    max_iters = 1000000
    fit_model(model, crit, opt, train_dataset, max_iters,
              validation_dataset=valid_dataset,
              report_every=args.report_every,
              validate_every=args.validate_every,
              validation_summary_dir=args.valid_summary_dir)
def main():
    """Train a RougePredictor to regress per-sentence salience scores.

    The model with the lowest validation error is saved to --model-path, and
    the train/valid loss curves are written as JSON to --results-path.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--train-salience", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--valid-salience", type=str, required=True)
    parser.add_argument("--batch-size", default=300, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument("--lr", default=.00005, type=float)
    parser.add_argument(
        "--remove-stopwords", action="store_true", default=False)
    parser.add_argument("--results-path", type=str, default=None)
    parser.add_argument("--model-path", type=str, default=None)
    parser.add_argument("--dropout", default=.5, type=float)
    parser.add_argument("--context-size", default=300, type=int)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--seed", default=48929234, type=int)
    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    # Retained from the original script; not referenced below.
    embedding_size = 300

    # Ensure output directories exist before any long-running work.
    for out_path in (args.results_path, args.model_path):
        if out_path is not None:
            out_dir = os.path.dirname(out_path)
            if out_dir != "" and not os.path.exists(out_dir):
                os.makedirs(out_dir)

    print("Reading training salience data from {} ...".format(
        args.train_salience))
    training_salience_data = read_data(args.train_salience)
    print("Reading training inputs data from {} ...".format(
        args.train_inputs))
    training_inputs_data = read_data(args.train_inputs)
    # Inputs and salience files must be aligned example-for-example.
    for inp, sal in zip(training_inputs_data, training_salience_data):
        assert inp["id"] == sal["id"]
    training_data = make_dataset(
        training_inputs_data, training_salience_data, args.batch_size,
        args.gpu)

    print("Reading validation salience data from {} ...".format(
        args.valid_salience))
    validation_salience_data = read_data(args.valid_salience)
    print("Reading validation inputs data from {} ...".format(
        args.valid_inputs))
    validation_inputs_data = read_data(args.valid_inputs)
    for sal, inp in zip(validation_salience_data, validation_inputs_data):
        assert sal["id"] == inp["id"]
    validation_data = make_dataset(
        validation_inputs_data, validation_salience_data, args.batch_size,
        args.gpu)

    model = RougePredictor(dropout=args.dropout)
    # Normal init for embeddings, Xavier for other weights, zeros for biases.
    for name, param in model.named_parameters():
        if "emb" in name:
            nn.init.normal(param)
        elif "weight" in name:
            nn.init.xavier_normal(param)
        elif "bias" in name:
            nn.init.constant(param, 0)
        else:
            nn.init.normal(param)
    if args.gpu > -1:
        model.cuda(args.gpu)

    optim = torch.optim.Adam(model.parameters(), lr=args.lr)

    train_loss = []
    valid_loss = []
    best_loss = float("inf")
    best_epoch = None
    for epoch in range(1, args.epochs + 1):
        train_loss.append(train(optim, model, training_data, epoch))
        valid_loss.append(validate(model, validation_data, epoch))
        print("Epoch {} :: Train err: {:0.5f} | Valid err: {:0.5f} | ".format(
            epoch, train_loss[-1], valid_loss[-1]))
        if valid_loss[-1] < best_loss:
            best_loss = valid_loss[-1]
            best_epoch = epoch
            if args.model_path is not None:
                print("Saving model ...")
                torch.save(model, args.model_path)

    print("Best epoch: {} Error={:0.5f}".format(
        best_epoch, valid_loss[best_epoch - 1]))

    if args.results_path is not None:
        results = {"training": train_loss, "validation": valid_loss}
        print("Writing results to {} ...".format(args.results_path))
        with open(args.results_path, "w") as fp:
            fp.write(json.dumps(results))
def main():
    """Preprocess DUC 2001 single-document-summarization data.

    Pipeline: locate data roots (flags or SPENSUM_DATA / DUC2001_ORIGINAL
    environment variables), fit a SIF sentence-embedding principle
    component on the training split, write embedded inputs for both
    splits, and generate ROUGE-based extract labels/ranks.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--spensum-data-path", type=str, default=None)
    parser.add_argument("--duc-2001-data-path", type=str, default=None)
    parser.add_argument("--seed", type=int, default=43929524)
    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    # Fall back to environment variables when the paths are not supplied.
    if args.spensum_data_path is None:
        args.spensum_data_path = os.getenv("SPENSUM_DATA", None)
        if args.spensum_data_path is None:
            sys.stderr.write(
                "Set SPENSUM_DATA to set location to write data.\n")
            sys.exit(1)
    if args.duc_2001_data_path is None:
        args.duc_2001_data_path = os.getenv("DUC2001_ORIGINAL", None)
        if args.duc_2001_data_path is None:
            sys.stderr.write(
                "Set DUC2001_ORIGINAL to location of nist duc 2001 data.\n")
            sys.exit(1)

    pp_duc_2001_sds_path = os.path.join(
        args.spensum_data_path, "duc-sds", "preprocessed-data", "duc2001")

    # Raw-corpus preprocessing step currently disabled (expects a CoreNLP
    # server on port 9000):
    # print("Preprocessing raw duc 2001 sds data ...")
    # duc2001.preprocess_sds(
    #     pp_duc_2001_sds_path, nist_data_path=args.duc_2001_data_path,
    #     cnlp_port=9000)

    print("Loading sif embedding model ...")
    sif_emb = ntp.models.sentence_embedding.SIFEmbedding.from_pretrained()

    duc_sds_data_root = os.path.join(args.spensum_data_path, "duc-sds")
    train_data, valid_data = generate_train_valid_splits(
        pp_duc_2001_sds_path, duc_sds_data_root)

    # Fit the SIF principle component on every lowercased training
    # sentence (ex[0] is the example's sentence list).
    all_training_sents = [[token.lower() for token in sent["tokens"]]
                          for ex in train_data for sent in ex[0]]
    sif_emb.fit_principle_component(all_training_sents)

    # BUGFIX: was os.path.join(args.spensum_data_path, duc_sds_data_root,
    # "sif.bin"); duc_sds_data_root already begins with
    # args.spensum_data_path, so a relative data path produced a
    # duplicated prefix (an absolute one merely masked the bug because
    # os.path.join drops components preceding an absolute path).
    sif_path = os.path.join(duc_sds_data_root, "sif.bin")
    torch.save(sif_emb, sif_path)

    print("Writing training inputs...")
    inputs_train_path = os.path.join(
        duc_sds_data_root, "inputs", "duc.sds.inputs.train.json")
    generate_inputs(train_data, sif_emb, inputs_train_path)

    print("Writing validation inputs...")
    inputs_valid_path = os.path.join(
        duc_sds_data_root, "inputs", "duc.sds.inputs.valid.json")
    generate_inputs(valid_data, sif_emb, inputs_valid_path)

    # Human-abstract summary export currently disabled:
    # print("Writing training summaries...")
    # summaries_train_path = os.path.join(
    #     duc_sds_data_root, "summaries", "train", "human_abstracts")
    # write_summaries(train_data, summaries_train_path)
    # print("Writing validation summaries...")
    # summaries_valid_path = os.path.join(
    #     duc_sds_data_root, "summaries", "valid", "human_abstracts")
    # write_summaries(valid_data, summaries_valid_path)

    # Generate extract labels/ranks. Only sequential mode and ROUGE-1 are
    # active; the loops keep the shape so "independent" / higher-order
    # ROUGE can be re-enabled easily.
    for mode in ["sequential"]:
        for part, data in [["valid", valid_data], ["train", train_data]]:
            for rouge in [1]:
                tag = "indie" if mode == "independent" else "seq"
                labels_path = os.path.join(
                    duc_sds_data_root, "labels",
                    "duc.sds.labels.{}.rouge-{}.sw.{}.json".format(
                        tag, rouge, part))
                ranks_path = os.path.join(
                    duc_sds_data_root, "ranks",
                    "duc.sds.ranks.{}.rouge-{}.sw.{}.json".format(
                        tag, rouge, part))
                print("Generating {} rouge-{} ranks/labels " \
                      "for {} data".format(mode, rouge, part))
                generate_extracts(data, mode, rouge, labels_path, ranks_path)
def main():
    """Train a SummaRunner extractive summarizer with cross-entropy loss.

    Reads paired inputs/labels JSON files for the train and validation
    splits, trains for ``--epochs`` epochs, evaluates each epoch with
    ROUGE against ``--validation-summary-dir`` references, and tracks
    the best epoch by ROUGE-2, optionally saving the best model
    (``--model-path``) and a JSON metrics file (``--results-path``).
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--train-inputs", type=str, required=True)
    parser.add_argument("--train-labels", type=str, required=True)
    parser.add_argument("--valid-inputs", type=str, required=True)
    parser.add_argument("--valid-labels", type=str, required=True)
    parser.add_argument("--batch-size", default=16, type=int)
    parser.add_argument("--gpu", default=-1, type=int)
    parser.add_argument(
        "--remove-stopwords", action="store_true", default=False)
    parser.add_argument(
        "--summary-length", default=100, type=int)
    parser.add_argument("--results-path", type=str, default=None)
    parser.add_argument("--model-path", type=str, default=None)
    parser.add_argument("--context-dropout", default=.5, type=float)
    parser.add_argument("--context-size", default=200, type=int)
    parser.add_argument("--validation-summary-dir", required=True, type=str)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--seed", default=48929234, type=int)
    # Backward-compatible generalization: the learning rate used to be
    # hard-coded to .001; the sibling salience trainer already exposes
    # --lr, so this script now matches it. Default preserves behavior.
    parser.add_argument("--lr", default=.001, type=float)
    args = parser.parse_args()

    ntp.set_random_seed(args.seed)

    # Create any output directories up front so the save steps at the end
    # of training cannot fail.
    for out_path in (args.results_path, args.model_path):
        if out_path is not None:
            out_dir = os.path.dirname(out_path)
            if out_dir != "" and not os.path.exists(out_dir):
                os.makedirs(out_dir)

    # --- training data -------------------------------------------------
    print("Reading training input data from {} ...".format(
        args.train_inputs))
    training_input_data = read_data(args.train_inputs)
    print("Reading training label data from {} ...".format(
        args.train_labels))
    training_label_data = read_data(args.train_labels)
    # Inputs and labels must be example-aligned.
    for a, b in zip(training_input_data, training_label_data):
        assert a["id"] == b["id"]
    training_data = make_dataset(
        training_input_data, training_label_data,
        args.batch_size, args.gpu)

    # --- validation data -----------------------------------------------
    print("Reading validation input data from {} ...".format(
        args.valid_inputs))
    validation_input_data = read_data(args.valid_inputs)
    print("Reading validation label data from {} ...".format(
        args.valid_labels))
    validation_label_data = read_data(args.valid_labels)
    for a, b in zip(validation_input_data, validation_label_data):
        assert a["id"] == b["id"]
    validation_data = make_dataset(
        validation_input_data, validation_label_data,
        args.batch_size, args.gpu)

    model = SummaRunner(
        hidden_size=args.context_size, dropout=args.context_dropout)
    # Initialize non-embedding parameters: xavier for weights, zeros
    # for biases; embeddings keep their constructor initialization.
    for name, param in model.named_parameters():
        if "emb" not in name and "weight" in name:
            nn.init.xavier_normal(param)
        elif "emb" not in name and "bias" in name:
            nn.init.constant(param, 0)
    if args.gpu > -1:
        model.cuda(args.gpu)

    optim = torch.optim.Adam(model.parameters(), lr=args.lr)

    train_xents = []
    valid_results = []  # each entry: (xent, rouge-1, rouge-2)
    best_rouge_2 = 0
    best_epoch = None
    for epoch in range(1, args.epochs + 1):
        train_xent = train(optim, model, training_data, epoch)
        train_xents.append(train_xent)
        valid_result = validate(
            model, validation_data, epoch, args.validation_summary_dir,
            remove_stopwords=args.remove_stopwords,
            summary_length=args.summary_length)
        valid_results.append(valid_result)
        print(("Epoch {} :: Train xent: {:0.3f} | Valid xent: {:0.3f} | " \
               "R1: {:0.3f} | R2: {:0.3f}").format(
            epoch, train_xents[-1], *valid_results[-1]))
        if valid_results[-1][-1] > best_rouge_2:
            best_rouge_2 = valid_results[-1][-1]
            best_epoch = epoch
            if args.model_path is not None:
                print("Saving model ...")
                torch.save(model, args.model_path)

    # BUGFIX: previously unconditionally indexed
    # valid_results[best_epoch - 1], which raised TypeError when no epoch
    # beat the initial ROUGE-2 of 0 (e.g. --epochs 0 or all-zero ROUGE).
    if best_epoch is not None:
        print("Best epoch: {} ROUGE-1 {:0.3f} ROUGE-2 {:0.3f}".format(
            best_epoch, *valid_results[best_epoch - 1][1:]))
    else:
        print("No epoch achieved a positive validation ROUGE-2.")

    if args.results_path is not None:
        results = {"training": {"cross-entropy": train_xents},
                   "validation": {
                       "cross-entropy": [x[0] for x in valid_results],
                       "rouge-1": [x[1] for x in valid_results],
                       "rouge-2": [x[2] for x in valid_results]}}
        print("Writing results to {} ...".format(args.results_path))
        with open(args.results_path, "w") as fp:
            fp.write(json.dumps(results))