Example #1
def write(self, dataset_name):
    path = directories.DOC_DATA + dataset_name + '/'
    if not self.columns:
        utils.rmkdir(path)
    self.mention_inds.write(path)
    self.pair_inds.write(path)
    self.features.write(path)
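
Every example in this listing calls utils.rmkdir(path) before writing its output. The helper itself is not shown here; as an assumption about its behavior (a minimal sketch, not the repository's actual code), it removes the directory if it already exists and then recreates it empty:

import os
import shutil

def rmkdir(path):
    # Assumed behavior: clear out any previous contents, then start fresh.
    if os.path.exists(path):
        shutil.rmtree(path)
    os.makedirs(path)
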
Example #3
def write(self, dataset_name):
    path = directories.PAIR_DATA + dataset_name + '/'
    if not self.columns:
        utils.rmkdir(path)
    self.pair_indices.write(path)
    self.pair_features.write(path)
    self.y.write(path)
    self.pair_ids.write(path)
Example #5
def write(self, dataset_name):
    path = directories.MENTION_DATA + dataset_name + '/'
    if not self.columns:
        utils.rmkdir(path)
    self.words.write(path)
    self.spans.write(path)
    self.features.write(path)
    self.mention_nums.write(path)
    self.mention_ids.write(path)
    self.dids.write(path)
Example #7
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--corpus-dir",
                        required=True,
                        help="Location of pre-training text files.")
    parser.add_argument("--vocab-file",
                        required=True,
                        help="Location of vocabulary file.")
    parser.add_argument("--output-dir",
                        required=True,
                        help="Where to write out the tfrecords.")
    parser.add_argument("--max-seq-length",
                        default=128,
                        type=int,
                        help="Number of tokens per example.")
    parser.add_argument("--num-processes",
                        default=1,
                        type=int,
                        help="Parallelize across multiple processes.")
    parser.add_argument(
        "--blanks-separate-docs",
        default=True,
        type=bool,
        help="Whether blank lines indicate document boundaries.")
    parser.add_argument("--do-lower-case",
                        dest='do_lower_case',
                        action='store_true',
                        help="Lower case input text.")
    parser.add_argument("--no-lower-case",
                        dest='do_lower_case',
                        action='store_false',
                        help="Don't lower case input text.")
    parser.add_argument("--num-out-files",
                        default=1000,
                        type=int,
                        help="Number of output files.")
    parser.set_defaults(do_lower_case=True)
    args = parser.parse_args()

    utils.rmkdir(args.output_dir)
    if args.num_processes == 1:
        write_examples(0, args)
    else:
        jobs = []
        for i in range(args.num_processes):
            job = multiprocessing.Process(target=write_examples,
                                          args=(i, args))
            jobs.append(job)
            job.start()
        for job in jobs:
            job.join()
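
The --do-lower-case / --no-lower-case pair above is the usual argparse idiom of two actions writing to one dest, with parser.set_defaults supplying the fallback value. A small standalone illustration of how the three cases resolve (separately, note that type=bool on --blanks-separate-docs is a known gotcha: argparse passes the raw string to bool(), so any non-empty value, including "False", comes out True):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--do-lower-case", dest="do_lower_case", action="store_true")
parser.add_argument("--no-lower-case", dest="do_lower_case", action="store_false")
parser.set_defaults(do_lower_case=True)

print(parser.parse_args([]).do_lower_case)                   # True (default)
print(parser.parse_args(["--do-lower-case"]).do_lower_case)  # True
print(parser.parse_args(["--no-lower-case"]).do_lower_case)  # False
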
Example #8
    def __init__(self, model_props, train_set='train', test_set='dev', n_epochs=200,
                 empty_buffer=True, betas=None, write_every=1, max_docs=10000):
        self.model_props = model_props
        if betas is None:
            betas = [0.8 ** i for i in range(1, 5)]
        self.write_every = write_every

        print "Model=" + model_props.path + ", ordering from " + directories.ACTION_SPACE
        self.pair_model, self.anaphoricity_model, self.model, word_vectors = \
            clustering_models.get_models(model_props)
        json_string = self.model.to_json()
        open(model_props.path + 'architecture.json', 'w').write(json_string)
        utils.rmkdir(model_props.path + 'src')
        for fname in os.listdir('.'):
            if fname.endswith('.py'):
                shutil.copyfile(fname, model_props.path + 'src/' + fname)

        self.train_data, self.train_docs = load_docs(train_set, word_vectors)
        print "Train loaded"

        self.dev_data, self.dev_docs = self.train_data, self.train_docs
        print "Dev loaded!"

        self.test_data, self.test_docs = load_docs(test_set, word_vectors)
        print "Test loaded"

        random.seed(0)
        random.shuffle(self.train_docs)
        random.shuffle(self.dev_docs)
        random.shuffle(self.test_docs)
        self.train_docs = self.train_docs[:max_docs]
        self.dev_docs = self.dev_docs[:max_docs]
        self.test_docs = self.test_docs[:max_docs]

        self.epoch = 0
        self.n = 0
        self.history = []
        self.best_conll = 0
        self.best_conll_window = 0
        replay_memory = ReplayMemory(self, self.model)
        for self.epoch in range(n_epochs):
            print 80 * "-"
            print "ITERATION", (self.epoch + 1), "model =", model_props.path
            ar = AgentRunner(self, self.train_docs, self.train_data, "Training", replay_memory,
                             beta=0 if self.epoch >= len(betas) else betas[self.epoch])
            self.train_pairs = ar.merged_pairs
            if empty_buffer:
                replay_memory.train_all()
            self.run_evaluation()
Example #9
def train(model_props, n_epochs=10000, reduced=False, dev_set_name='dev'):
    print "Training", model_props.path
    pprint(model_props.__dict__)

    model_props.write(model_props.path + 'model_props.pkl')
    utils.rmkdir(model_props.path + 'src')
    for fname in os.listdir('.'):
        if fname.endswith('.py'):
            shutil.copyfile(fname, model_props.path + 'src/' + fname)
    if model_props.ranking or \
            model_props.top_pairs:
        write_start = 0
        write_every = 10
    else:
        write_start = 80
        write_every = 20

    print "Loading data"
    vectors = np.load(directories.RELEVANT_VECTORS + 'word_vectors.npy')
    train = datasets.DocumentBatchedDataset(
        "train_reduced" if reduced else "train", model_props, with_ids=True)
    dev = datasets.DocumentBatchedDataset(
        dev_set_name + "_reduced" if reduced else dev_set_name,
        model_props,
        with_ids=True)

    print "Building model"
    model, _ = pairwise_models.get_model(dev, vectors, model_props)
    json_string = model.to_json()
    open(model_props.path + 'architecture.json', 'w').write(json_string)

    best_val_score = 1000
    best_val_score_in_window = 1000
    history = []
    for epoch in range(n_epochs):
        timer.start("train")
        print "EPOCH {:}, model = {:}".format((epoch + 1), model_props.path)

        epoch_stats = {}
        model_weights = model.get_weights()
        train_docs = utils.load_pickle(directories.DOCUMENTS +
                                       'train_docs.pkl')
        dev_docs = utils.load_pickle(directories.DOCUMENTS + dev_set_name +
                                     '_docs.pkl')
        if reduced:
            dev_docs = dev_docs[:3]

        if model_props.ranking:
            print "Running over training set"
            run_model_over_docs(train, train_docs, model)
            epoch_stats.update(compute_metrics(train_docs, "train"))
            if model_props.use_rewards:
                print "Setting costs"
                set_costs(train, train_docs)

        print "Training"
        prog = utils.Progbar(train.n_batches)
        train.shuffle()
        loss_sum, n_examples = 0, 0
        for i, X in enumerate(train):
            if X['y'].size == 0:
                continue
            batch_loss = model.train_on_batch(X)
            loss_sum += batch_loss * train.scale_factor
            n_examples += X['y'].size
            prog.update(i + 1, exact=[("train loss", loss_sum / n_examples)])
        epoch_stats["train time"] = time.time() - prog.start
        for k in prog.unique_values:
            epoch_stats[k] = prog.sum_values[k][0] / max(
                1, prog.sum_values[k][1])

        epoch_stats["weight diffs"] = [
            (np.sum(np.abs(new_weight - old_weight)), new_weight.size)
            for new_weight, old_weight in zip(model.get_weights(),
                                              model_weights)
        ]
        summed = np.sum(map(np.array, epoch_stats["weight diffs"][1:]), axis=0)
        epoch_stats["total weight diff"] = tuple(summed)

        print "Testing on dev set"
        evaluate_model(dev, dev_docs, model, model_props, epoch_stats)

        history.append(epoch_stats)
        utils.write_pickle(history, model_props.path + 'history.pkl')
        score = -epoch_stats["dev conll"] if model_props.ranking else \
            (epoch_stats["dev loss"] if not model_props.anaphoricity_only else
             epoch_stats["dev anaphoricity loss"])
        if score < best_val_score:
            best_val_score = score
            print "New best {:}, saving model".format(
                "CoNLL F1" if model_props.ranking else "validation loss")
            model.save_weights(model_props.path + "best_weights.hdf5",
                               overwrite=True)
        if score < best_val_score_in_window and epoch > write_start:
            print "Best in last {:}, saved to weights_{:}".format(
                write_every, write_every * (epoch / write_every))
            best_val_score_in_window = score
            model.save_weights(
                model_props.path +
                "weights_{:}.hdf5".format(write_every * (epoch / write_every)),
                overwrite=True)
            if epoch + write_every >= n_epochs:
                model.save_weights(model_props.path + "final_weights.hdf5",
                                   overwrite=True)
        if epoch % write_every == 0:
            best_val_score_in_window = 1000

        timer.stop("train")
        timer.print_totals()
        print

    timer.clear()
Example #12
def main():
    # Parse essential args
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir",
                        required=True,
                        help="Location of data files (model weights, etc).")
    parser.add_argument("--model_name",
                        required=True,
                        help="The name of the model being fine-tuned.")
    parser.add_argument("--pretrain_tfrecords", type=str)

    parser.add_argument("--seed", type=int)
    parser.add_argument("--num_train_steps", type=int)
    parser.add_argument("--num_warmup_steps", type=int)
    parser.add_argument("--learning_rate", type=float)
    parser.add_argument("--train_batch_size", type=int)
    parser.add_argument("--max_seq_length", type=int)

    parser.add_argument("--mask_prob", type=float)
    parser.add_argument("--disc_weight", type=float)
    parser.add_argument("--generator_hidden_size", type=float)

    parser.add_argument("--save_checkpoints_steps", type=int)
    parser.add_argument("--keep_checkpoint_max", type=int)
    parser.add_argument("--restore_checkpoint", action='store_true')

    parser.add_argument("--optimizer",
                        default="adam",
                        type=str,
                        help="adam or lamb")

    args = parser.parse_args()
    config = PretrainingConfig(**args.__dict__)

    # Set up tensorflow
    hvd.init()
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')
    tf.config.optimizer.set_jit(config.xla)
    tf.config.optimizer.set_experimental_options(
        {"auto_mixed_precision": config.amp})
    tf.random.set_seed(config.seed)

    # Set up config
    if config.do_train == config.do_eval:
        raise ValueError(
            "Exactly one of `do_train` or `do_eval` must be True.")
    if config.debug and config.do_train:
        utils.rmkdir(config.model_dir)
    utils.heading("Config:")
    log_config(config)

    # Save pretrain configs
    pretrain_config_json = os.path.join(config.checkpoints_dir,
                                        'pretrain_config.json')
    if is_main_process():
        utils.write_json(config.__dict__, pretrain_config_json)
        log("Configuration saved in {}".format(pretrain_config_json))

    # Set up model
    model = PretrainingModel(config)

    # Set up metrics
    perf_metrics = dict()
    perf_metrics["train_perf"] = tf.keras.metrics.Mean(name="train_perf")

    eval_metrics = dict()
    eval_metrics["total_loss"] = tf.keras.metrics.Mean(name="total_loss")
    eval_metrics["masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
        name="masked_lm_accuracy")
    eval_metrics["masked_lm_loss"] = tf.keras.metrics.Mean(
        name="masked_lm_loss")
    if config.electra_objective:
        eval_metrics["sampled_masked_lm_accuracy"] = tf.keras.metrics.Accuracy(
            name="sampled_masked_lm_accuracy")
        if config.disc_weight > 0:
            eval_metrics["disc_loss"] = tf.keras.metrics.Mean(name="disc_loss")
            eval_metrics["disc_auc"] = tf.keras.metrics.AUC(name="disc_auc")
            eval_metrics["disc_accuracy"] = tf.keras.metrics.Accuracy(
                name="disc_accuracy")
            eval_metrics["disc_precision"] = tf.keras.metrics.Accuracy(
                name="disc_precision")
            eval_metrics["disc_recall"] = tf.keras.metrics.Accuracy(
                name="disc_recall")

    # Set up tensorboard
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = os.path.join(
        config.log_dir, current_time,
        'train_' + str(get_rank()) + '_of_' + str(get_world_size()))
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Set up dataset
    dataset = pretrain_utils.get_dataset(config,
                                         config.train_batch_size,
                                         world_size=get_world_size(),
                                         rank=get_rank())
    train_iterator = iter(dataset)

    # Set up optimizer
    optimizer = create_optimizer(init_lr=config.learning_rate,
                                 num_train_steps=config.num_train_steps,
                                 num_warmup_steps=config.num_warmup_steps,
                                 weight_decay_rate=config.weight_decay_rate,
                                 optimizer=config.optimizer)
    if config.amp:
        optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, "dynamic")

    if config.do_train:
        # Set up checkpoint manager
        checkpoint = tf.train.Checkpoint(step=tf.Variable(1),
                                         optimizer=optimizer,
                                         model=model)
        manager = tf.train.CheckpointManager(
            checkpoint,
            config.checkpoints_dir,
            max_to_keep=config.keep_checkpoint_max)
        iter_checkpoint = tf.train.Checkpoint(train_iterator=train_iterator)
        iter_manager = tf.train.CheckpointManager(
            iter_checkpoint,
            os.path.join(config.checkpoints_dir,
                         'iter_ckpt_rank_' + '{:02}'.format(get_rank())),
            checkpoint_name='iter_ckpt_rank_' + '{:02}'.format(get_rank()),
            max_to_keep=config.keep_checkpoint_max)
        if config.restore_checkpoint and manager.latest_checkpoint:
            checkpoint.restore(manager.latest_checkpoint)
            log(" ** Restored model checkpoint from {}".format(
                manager.latest_checkpoint))
            if iter_manager.latest_checkpoint:
                iter_checkpoint.restore(iter_manager.latest_checkpoint)
                log(" ** Restored iterator checkpoint from {}".format(
                    iter_manager.latest_checkpoint),
                    all_rank=True)
        else:
            log(" ** Initializing from scratch.")

        utils.heading("Running training")
        train_start, start_step = time.time(), int(checkpoint.step) - 1
        while int(checkpoint.step) <= config.num_train_steps:
            step = int(checkpoint.step)
            features = next(train_iterator)
            iter_start = time.time()

            # if step == 200: tf.profiler.experimental.start(logdir=train_log_dir)
            total_loss, eval_fn_inputs = train_one_step(
                config, model, optimizer, features, step <= 1)
            # if step == 300: tf.profiler.experimental.stop()

            perf_metrics["train_perf"].update_state(config.train_batch_size *
                                                    get_world_size() /
                                                    (time.time() - iter_start))
            eval_metrics["total_loss"].update_state(values=total_loss)
            metric_fn(config, eval_metrics, eval_fn_inputs)

            if step % 100 == 0:
                log('Step:{:6d}, Loss:{:10.6f}, Gen_loss:{:10.6f}, Disc_loss:{:10.6f}, Gen_acc:{:6.2f}, '
                    'Disc_acc:{:6.2f}, Perf:{:4.0f}, Elapsed: {}, ETA: {}, '.
                    format(
                        step, total_loss,
                        eval_metrics["masked_lm_loss"].result().numpy(),
                        eval_metrics["disc_loss"].result().numpy(),
                        eval_metrics["masked_lm_accuracy"].result().numpy() *
                        100,
                        eval_metrics["disc_accuracy"].result().numpy() * 100,
                        perf_metrics["train_perf"].result().numpy(),
                        utils.get_readable_time(time.time() - train_start),
                        utils.get_readable_time(
                            (time.time() - train_start) / (step - start_step) *
                            (config.num_train_steps - step))),
                    all_rank=True)

                with train_summary_writer.as_default():
                    for key, m in eval_metrics.items():
                        tf.summary.scalar(key, m.result(), step=step)

                for m in eval_metrics.values():
                    m.reset_states()

            checkpoint.step.assign_add(1)
            if step % config.save_checkpoints_steps == 0:
                if is_main_process():
                    save_path = manager.save()
                    log(" ** Saved model checkpoint for step {}: {}".format(
                        step, save_path))
                iter_save_path = iter_manager.save()
                log(" ** Saved iterator checkpoint for step {}: {}".format(
                    step, iter_save_path),
                    all_rank=True)

    if config.do_eval:
        pass