def write(self, dataset_name):
    path = directories.DOC_DATA + dataset_name + '/'
    if not self.columns:
        utils.rmkdir(path)
    self.mention_inds.write(path)
    self.pair_inds.write(path)
    self.features.write(path)
def write(self, dataset_name):
    path = directories.PAIR_DATA + dataset_name + '/'
    if not self.columns:
        utils.rmkdir(path)
    self.pair_indices.write(path)
    self.pair_features.write(path)
    self.y.write(path)
    self.pair_ids.write(path)
def write(self, dataset_name):
    path = directories.MENTION_DATA + dataset_name + '/'
    if not self.columns:
        utils.rmkdir(path)
    self.words.write(path)
    self.spans.write(path)
    self.features.write(path)
    self.mention_nums.write(path)
    self.mention_ids.write(path)
    self.dids.write(path)
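All three write() methods rely on utils.rmkdir to reset the output directory before the individual arrays are written. The helper itself is not shown here; the following is a minimal sketch of a remove-and-recreate helper, assuming that is what utils.rmkdir does:

import os
import shutil

def rmkdir(path):
    # Hypothetical sketch: delete the directory if it already exists,
    # then recreate it so the writers start from an empty directory.
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)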
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--corpus-dir", required=True,
                        help="Location of pre-training text files.")
    parser.add_argument("--vocab-file", required=True,
                        help="Location of vocabulary file.")
    parser.add_argument("--output-dir", required=True,
                        help="Where to write out the tfrecords.")
    parser.add_argument("--max-seq-length", default=128, type=int,
                        help="Number of tokens per example.")
    parser.add_argument("--num-processes", default=1, type=int,
                        help="Parallelize across multiple processes.")
    parser.add_argument("--blanks-separate-docs", default=True, type=bool,
                        help="Whether blank lines indicate document boundaries.")
    parser.add_argument("--do-lower-case", dest='do_lower_case',
                        action='store_true', help="Lower case input text.")
    parser.add_argument("--no-lower-case", dest='do_lower_case',
                        action='store_false', help="Don't lower case input text.")
    parser.add_argument("--num-out-files", default=1000, type=int,
                        help="Number of output files.")
    parser.set_defaults(do_lower_case=True)
    args = parser.parse_args()

    utils.rmkdir(args.output_dir)
    if args.num_processes == 1:
        write_examples(0, args)
    else:
        jobs = []
        for i in range(args.num_processes):
            job = multiprocessing.Process(target=write_examples, args=(i, args))
            jobs.append(job)
            job.start()
        for job in jobs:
            job.join()
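One caveat in the flag definitions above: --blanks-separate-docs is declared with type=bool, and argparse simply calls bool() on the raw string, so any non-empty value (including the string "False") parses as True. A short illustration of that behavior:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--blanks-separate-docs", default=True, type=bool)

# bool("False") is True, so the flag cannot be turned off with the string "False";
# only an empty string yields False.
print(parser.parse_args(["--blanks-separate-docs", "False"]).blanks_separate_docs)  # True
print(parser.parse_args(["--blanks-separate-docs", ""]).blanks_separate_docs)       # False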
def __init__(self, model_props, train_set='train', test_set='dev', n_epochs=200,
             empty_buffer=True, betas=None, write_every=1, max_docs=10000):
    self.model_props = model_props
    if betas is None:
        betas = [0.8 ** i for i in range(1, 5)]
    self.write_every = write_every

    print "Model=" + model_props.path + ", ordering from " + directories.ACTION_SPACE
    self.pair_model, self.anaphoricity_model, self.model, word_vectors = \
        clustering_models.get_models(model_props)
    json_string = self.model.to_json()
    open(model_props.path + 'architecture.json', 'w').write(json_string)
    utils.rmkdir(model_props.path + 'src')
    for fname in os.listdir('.'):
        if fname.endswith('.py'):
            shutil.copyfile(fname, model_props.path + 'src/' + fname)

    self.train_data, self.train_docs = load_docs(train_set, word_vectors)
    print "Train loaded"
    self.dev_data, self.dev_docs = self.train_data, self.train_docs
    print "Dev loaded!"
    self.test_data, self.test_docs = load_docs(test_set, word_vectors)
    print "Test loaded"

    random.seed(0)
    random.shuffle(self.train_docs)
    random.shuffle(self.dev_docs)
    random.shuffle(self.test_docs)
    self.train_docs = self.train_docs[:max_docs]
    self.dev_docs = self.dev_docs[:max_docs]
    self.test_docs = self.test_docs[:max_docs]

    self.epoch = 0
    self.n = 0
    self.history = []
    self.best_conll = 0
    self.best_conll_window = 0

    replay_memory = ReplayMemory(self, self.model)
    for self.epoch in range(n_epochs):
        print 80 * "-"
        print "ITERATION", (self.epoch + 1), "model =", model_props.path
        ar = AgentRunner(self, self.train_docs, self.train_data, "Training",
                         replay_memory,
                         beta=0 if self.epoch >= len(betas) else betas[self.epoch])
        self.train_pairs = ar.merged_pairs
        if empty_buffer:
            replay_memory.train_all()
        self.run_evaluation()
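The beta passed to AgentRunner follows the default geometric schedule and then drops to zero once the schedule is exhausted; spelled out for the default betas:

# Default schedule: beta decays geometrically for the first four epochs, then is 0.
betas = [0.8 ** i for i in range(1, 5)]
# epoch 0 -> 0.8, epoch 1 -> 0.64, epoch 2 -> 0.512, epoch 3 -> 0.4096
# epoch >= 4 -> beta = 0 (the `beta=0 if self.epoch >= len(betas) else ...` branch)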
def train(model_props, n_epochs=10000, reduced=False, dev_set_name='dev'):
    print "Training", model_props.path
    pprint(model_props.__dict__)

    model_props.write(model_props.path + 'model_props.pkl')
    utils.rmkdir(model_props.path + 'src')
    for fname in os.listdir('.'):
        if fname.endswith('.py'):
            shutil.copyfile(fname, model_props.path + 'src/' + fname)

    if model_props.ranking or model_props.top_pairs:
        write_start = 0
        write_every = 10
    else:
        write_start = 80
        write_every = 20

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    train = datasets.DocumentBatchedDataset(
        "train_reduced" if reduced else "train", model_props, with_ids=True)
    dev = datasets.DocumentBatchedDataset(
        dev_set_name + "_reduced" if reduced else dev_set_name, model_props, with_ids=True)

    print "Building model"
    model, _ = pairwise_models.get_model(dev, vectors, model_props)
    json_string = model.to_json()
    open(model_props.path + 'architecture.json', 'w').write(json_string)

    best_val_score = 1000
    best_val_score_in_window = 1000
    history = []
    for epoch in range(n_epochs):
        timer.start("train")

        print "EPOCH {:}, model = {:}".format((epoch + 1), model_props.path)
        epoch_stats = {}
        model_weights = model.get_weights()
        train_docs = utils.load_pickle(directories.DOCUMENTS + 'train_docs.pkl')
        dev_docs = utils.load_pickle(directories.DOCUMENTS + dev_set_name + '_docs.pkl')
        if reduced:
            dev_docs = dev_docs[:3]

        if model_props.ranking:
            print "Running over training set"
            run_model_over_docs(train, train_docs, model)
            epoch_stats.update(compute_metrics(train_docs, "train"))
            if model_props.use_rewards:
                print "Setting costs"
                set_costs(train, train_docs)

        print "Training"
        prog = utils.Progbar(train.n_batches)
        train.shuffle()
        loss_sum, n_examples = 0, 0
        for i, X in enumerate(train):
            if X['y'].size == 0:
                continue
            batch_loss = model.train_on_batch(X)
            loss_sum += batch_loss * train.scale_factor
            n_examples += X['y'].size
            prog.update(i + 1, exact=[("train loss", loss_sum / n_examples)])
        epoch_stats["train time"] = time.time() - prog.start
        for k in prog.unique_values:
            epoch_stats[k] = prog.sum_values[k][0] / max(1, prog.sum_values[k][1])

        epoch_stats["weight diffs"] = [
            (np.sum(np.abs(new_weight - old_weight)), new_weight.size)
            for new_weight, old_weight in zip(model.get_weights(), model_weights)]
        summed = np.sum(map(np.array, epoch_stats["weight diffs"][1:]), axis=0)
        epoch_stats["total weight diff"] = tuple(summed)

        print "Testing on dev set"
        evaluate_model(dev, dev_docs, model, model_props, epoch_stats)

        history.append(epoch_stats)
        utils.write_pickle(history, model_props.path + 'history.pkl')
        score = -epoch_stats["dev conll"] if model_props.ranking else \
            (epoch_stats["dev loss"] if not model_props.anaphoricity_only else
             epoch_stats["dev anaphoricity loss"])
        if score < best_val_score:
            best_val_score = score
            print "New best {:}, saving model".format(
                "CoNLL F1" if model_props.ranking else "validation loss")
            model.save_weights(model_props.path + "best_weights.hdf5", overwrite=True)
        if score < best_val_score_in_window and epoch > write_start:
            print "Best in last {:}, saved to weights_{:}".format(
                write_every, write_every * (epoch / write_every))
            best_val_score_in_window = score
            model.save_weights(model_props.path + "weights_{:}.hdf5".format(
                write_every * (epoch / write_every)), overwrite=True)
            if epoch + write_every >= n_epochs:
                model.save_weights(model_props.path + "final_weights.hdf5", overwrite=True)
        if epoch % write_every == 0:
            best_val_score_in_window = 1000

        timer.stop("train")
        timer.print_totals()
        print

    timer.clear()
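The windowed checkpointing above buckets epochs into groups of write_every: a new within-window best is saved to weights_{write_every * (epoch / write_every)}.hdf5, and best_val_score_in_window resets at the start of each window. A small worked example of the filename arithmetic (integer division, as in the Python 2 code above):

write_every = 10
for epoch in [3, 9, 10, 17, 23]:
    # Python 2's `/` on ints is floor division; `//` makes that explicit.
    print epoch, write_every * (epoch // write_every)
# 3 -> 0, 9 -> 0, 10 -> 10, 17 -> 10, 23 -> 20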
def main():
    # Parse essential args
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", required=True,
                        help="Location of data files (model weights, etc).")
    parser.add_argument("--model_name", required=True,
                        help="The name of the model being fine-tuned.")
    parser.add_argument("--pretrain_tfrecords", type=str)
    parser.add_argument("--seed", type=int)
    parser.add_argument("--num_train_steps", type=int)
    parser.add_argument("--num_warmup_steps", type=int)
    parser.add_argument("--learning_rate", type=float)
    parser.add_argument("--train_batch_size", type=int)
    parser.add_argument("--max_seq_length", type=int)
    parser.add_argument("--mask_prob", type=float)
    parser.add_argument("--disc_weight", type=float)
    parser.add_argument("--generator_hidden_size", type=float)
    parser.add_argument("--save_checkpoints_steps", type=int)
    parser.add_argument("--keep_checkpoint_max", type=int)
    parser.add_argument("--restore_checkpoint", action='store_true')
    parser.add_argument("--optimizer", default="adam", type=str, help="adam or lamb")
    args = parser.parse_args()
    config = PretrainingConfig(**args.__dict__)

    # Set up tensorflow
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
    tf.config.optimizer.set_jit(config.xla)
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": config.amp})
    tf.random.set_seed(config.seed)

    # Set up config
    if config.do_train == config.do_eval:
        raise ValueError("Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    log_config(config)

    # Save pretrain configs
    pretrain_config_json = os.path.join(config.checkpoints_dir, 'pretrain_config.json')
    if is_main_process():
        utils.write_json(config.__dict__, pretrain_config_json)
        log("Configuration saved in {}".format(pretrain_config_json))

    # Set up model
    model = PretrainingModel(config)

    # Set up metrics
    perf_metrics = dict()
    perf_metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf")

    eval_metrics = dict()
    eval_metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss")
    eval_metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy(name="masked_lm_accuracy")
    eval_metrics["masked_lm_loss"] = tf.keras.metrics.Mean(name="masked_lm_loss")
    if config.electra_objective:
        eval_metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
            name="sampled_masked_lm_accuracy")
        if config.disc_weight > 0:
            eval_metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss")
            eval_metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc")
            eval_metrics["disc_accuracy"] = tf.keras.metrics.Accuracy(name="disc_accuracy")
            eval_metrics["disc_precision"] = tf.keras.metrics.Accuracy(name="disc_precision")
            eval_metrics["disc_recall"] = tf.keras.metrics.Accuracy(name="disc_recall")

    # Set up tensorboard
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = os.path.join(
        config.log_dir, current_time,
        'train_' + str(get_rank()) + '_of_' + str(get_world_size()))
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Set up dataset
    dataset = pretrain_utils.get_dataset(
        config, config.train_batch_size, world_size=get_world_size(), rank=get_rank())
    train_iterator = iter(dataset)

    # Set up optimizer
    optimizer = create_optimizer(
        init_lr=config.learning_rate,
        num_train_steps=config.num_train_steps,
        num_warmup_steps=config.num_warmup_steps,
        weight_decay_rate=config.weight_decay_rate,
        optimizer=config.optimizer)
    if config.amp:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")

    if config.do_train:
        # Set up checkpoint manager
        checkpoint = tf.train.Checkpoint(step=tf.Variable(1), optimizer=optimizer, model=model)
        manager = tf.train.CheckpointManager(
            checkpoint, config.checkpoints_dir, max_to_keep=config.keep_checkpoint_max)
        iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator)
        iter_manager = tf.train.CheckpointManager(
            iter_checkpoint,
            os.path.join(config.checkpoints_dir,
                         'iter_ckpt_rank_' + '{:02}'.format(get_rank())),
            checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()),
            max_to_keep=config.keep_checkpoint_max)
        if config.restore_checkpoint and manager.latest_checkpoint:
            checkpoint.restore(manager.latest_checkpoint)
            log(" ** Restored model checkpoint from {}".format(manager.latest_checkpoint))
            if iter_manager.latest_checkpoint:
                iter_checkpoint.restore(iter_manager.latest_checkpoint)
                log(" ** Restored iterator checkpoint from {}".format(
                    iter_manager.latest_checkpoint), all_rank=True)
        else:
            log(" ** Initializing from scratch.")

        utils.heading("Running training")
        train_start, start_step = time.time(), int(checkpoint.step) - 1
        while int(checkpoint.step) <= config.num_train_steps:
            step = int(checkpoint.step)
            features = next(train_iterator)
            iter_start = time.time()

            # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir)
            total_loss, eval_fn_inputs = train_one_step(config, model, optimizer, features, step <= 1)
            # if step == 300: tf.profiler.experimental.stop()

            perf_metrics["train_perf"].update_state(
                config.train_batch_size * get_world_size() / (time.time() - iter_start))
            eval_metrics["total_loss"].update_state(values=total_loss)
            metric_fn(config, eval_metrics, eval_fn_inputs)

            if step % 100 == 0:
                log('Step:{:6d}, Loss:{:10.6f}, Gen_loss:{:10.6f}, Disc_loss:{:10.6f}, Gen_acc:{:6.2f}, '
                    'Disc_acc:{:6.2f}, Perf:{:4.0f}, Elapsed: {}, ETA: {}, '.format(
                        step, total_loss,
                        eval_metrics["masked_lm_loss"].result().numpy(),
                        eval_metrics["disc_loss"].result().numpy(),
                        eval_metrics["masked_lm_accuracy"].result().numpy() * 100,
                        eval_metrics["disc_accuracy"].result().numpy() * 100,
                        perf_metrics["train_perf"].result().numpy(),
                        utils.get_readable_time(time.time() - train_start),
                        utils.get_readable_time((time.time() - train_start) / (step - start_step)
                                                * (config.num_train_steps - step))),
                    all_rank=True)

                with train_summary_writer.as_default():
                    for key, m in eval_metrics.items():
                        tf.summary.scalar(key, m.result(), step=step)

                for m in eval_metrics.values():
                    m.reset_states()

            checkpoint.step.assign_add(1)
            if step % config.save_checkpoints_steps == 0:
                if is_main_process():
                    save_path = manager.save()
                    log(" ** Saved model checkpoint for step {}: {}".format(step, save_path))
                iter_save_path = iter_manager.save()
                log(" ** Saved iterator checkpoint for step {}: {}".format(step, iter_save_path),
                    all_rank=True)

    if config.do_eval:
        pass
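train_one_step and metric_fn are defined elsewhere in the repository. Purely as an illustration of the pattern the training loop relies on, here is a minimal sketch of a Horovod-style distributed training step; the function body, the model call signature, and the broadcast on the first step are assumptions rather than the repository's actual implementation:

import tensorflow as tf
import horovod.tensorflow as hvd

@tf.function
def train_one_step(config, model, optimizer, features, first_step):
    # Hypothetical sketch, not the repository's exact code.
    with tf.GradientTape() as tape:
        total_loss, eval_fn_inputs = model(features, is_training=True)
        unscaled_loss = total_loss
        if config.amp:
            # LossScaleOptimizer: scale the loss before computing gradients.
            total_loss = optimizer.get_scaled_loss(total_loss)

    # Average gradients across Horovod ranks.
    tape = hvd.DistributedGradientTape(tape)
    gradients = tape.gradient(total_loss, model.trainable_variables)
    if config.amp:
        gradients = optimizer.get_unscaled_gradients(gradients)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    # On the first step, broadcast variables so all ranks start from identical state.
    if first_step:
        hvd.broadcast_variables(model.variables, root_rank=0)
        hvd.broadcast_variables(optimizer.variables(), root_rank=0)

    return unscaled_loss, eval_fn_inputs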