Example #1
 def __init__(self, albert_config):
     self.albert_config = albert_config
     model_name = self.albert_config['model_name']
     self.tokenizer = AlbertTokenizer.from_pretrained(model_name)
     self.model = TFAlbertModel.from_pretrained(model_name)
     self.summary_extraction_mode = self.albert_config[
         'summary_extraction_mode']
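A minimal sketch of how the summary_extraction_mode stored above could be applied; the function name extract_summary, the mode values 'cls' and 'mean', and the inline model setup are illustrative assumptions, not part of the original snippet.

import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertModel.from_pretrained('albert-base-v2')

def extract_summary(texts, summary_extraction_mode='cls'):
    # encode a batch of texts and reduce them to one summary vector each
    encoded = tokenizer(texts, padding=True, truncation=True, return_tensors='tf')
    outputs = model(encoded)
    if summary_extraction_mode == 'cls':
        return outputs.pooler_output  # pooled [CLS] vector, shape [batch, hidden]
    # 'mean': average the token states, ignoring padded positions
    mask = tf.cast(encoded['attention_mask'], tf.float32)[:, :, None]
    summed = tf.reduce_sum(outputs.last_hidden_state * mask, axis=1)
    return summed / tf.reduce_sum(mask, axis=1)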
Example #2
    def _build_albert_from_transformers(self):
        from transformers import TFAlbertModel

        model = TFAlbertModel.from_pretrained(os.path.join(
            PYTORCH_MODEL_PATH, "albert_base_zh_pytorch"),
                                              from_pt=True)
        return model
Example #3
    def __init__(self, config, pretrained_path, *inputs, **kwargs):
        super(QaAlbertModel, self).__init__(config, pretrained_path, *inputs,
                                            **kwargs)

        # config needs to be a dict
        self.linear_size = 128
        self.num_labels = config['num_labels']

        # leave the sequence length as None (unknown); this is required so the model can be saved
        self.sequence = tf.keras.Input(shape=(None, ), dtype=np.int32)

        self.albert = TFAlbertModel.from_pretrained(pretrained_path)

        self.relu = tf.keras.layers.Activation('relu')

        # compute score of each output of a word t_i being the start or end of the answer:
        # s*t_i, e*t_j (s: start vector, e: end vector) in top layer with num_labels=2
        self.qa_linear = tf.keras.layers.Dense(
            self.linear_size,
            activation='relu',
            kernel_initializer=u.get_initializer(config['initializer_range']),
            name="qa_linear")
        self.qa_outputs = tf.keras.layers.Dense(
            self.num_labels,
            kernel_initializer=u.get_initializer(config['initializer_range']),
            name="qa_outputs")
        # softmax over the sequence dimension
        self.softmax = tf.keras.layers.Softmax(axis=1)
        self.albert.trainable = False

        # has to come at the end of this constructor, otherwise other layers are unknown
        self._set_inputs(self.sequence)
Example #4
def get_albert_for_comparison():
    model_name = 'albert-base-v2'
    config = AlbertConfig.from_pretrained(model_name)
    config.output_hidden_states = False

    input_ids = tf.keras.Input(shape=(128, ), name='input_ids', dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=(128, ),
                                    name='attention_mask',
                                    dtype=tf.int32)

    transformer_model = TFAlbertModel.from_pretrained(model_name,
                                                      config=config)
    embedding_layer = transformer_model([input_ids, attention_mask])[0]

    X = tf.keras.layers.Dense(
        config.hidden_size,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.initializer_range),
        activation="relu",
        name="pre_classifier",
    )(embedding_layer[:, 0])
    X = tf.keras.layers.Dropout(config.classifier_dropout_prob)(X)
    output_ = tf.keras.layers.Dense(
        1,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.initializer_range),
        name="classifier")(X)

    return tf.keras.Model([input_ids, attention_mask], output_)
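A hedged usage sketch for get_albert_for_comparison() above: tokenize a sentence pair to the fixed length of 128 expected by the Input layers and run a forward pass; the tokenizer choice and the training setup shown are assumptions, not part of the original function.

import tensorflow as tf
from transformers import AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = get_albert_for_comparison()

enc = tokenizer('first sentence', 'second sentence',
                padding='max_length', truncation=True, max_length=128,
                return_tensors='tf')
logits = model([enc['input_ids'], enc['attention_mask']])  # shape (1, 1), raw logits

# since the classifier head has a single unit and no activation,
# a from_logits binary cross-entropy is a natural training objective
model.compile(optimizer=tf.keras.optimizers.Adam(2e-5),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True))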
Example #5
    def __init__(
            self,
            pretrained_model_name_or_path='albert-base-v2',
            reduce_output='cls_pooled',
            trainable=True,
            num_tokens=None,
            **kwargs
    ):
        super(ALBERTEncoder, self).__init__()
        try:
            from transformers import TFAlbertModel
        except ModuleNotFoundError:
            logger.error(
                ' transformers is not installed. '
                'In order to install all text feature dependencies run '
                'pip install ludwig[text]'
            )
            sys.exit(-1)

        self.transformer = TFAlbertModel.from_pretrained(
            pretrained_model_name_or_path
        )
        self.reduce_output = reduce_output
        if not self.reduce_output == 'cls_pooled':
            self.reduce_sequence = SequenceReducer(reduce_mode=reduce_output)
        self.transformer.trainable = trainable
        self.transformer.resize_token_embeddings(num_tokens)
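The resize_token_embeddings(num_tokens) call above only has an effect when the vocabulary actually changed, for example when extra tokens were added to the matching tokenizer; a minimal sketch of that pairing (the added token is an arbitrary example):

from transformers import AlbertTokenizer, TFAlbertModel

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
tokenizer.add_tokens(['<special1>'])  # hypothetical extra token

model = TFAlbertModel.from_pretrained('albert-base-v2')
model.resize_token_embeddings(len(tokenizer))  # base vocab + added tokens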
Example #6
def initialize_embeddings_from_average_representations(config, unique_entity_map, train_samples):
    batch_size = config['embedding_batch_size']
    model = TFAlbertModel.from_pretrained('albert-base-v2')
    embedding_matrix, index_to_eid, eid_to_index = [], {}, {}
    for entity_num, (eid, unique_entity) in enumerate(unique_entity_map.items()):
        embedding = []
        utterances, masks = unique_entity.utterances, unique_entity.masks
        input_data = [utterances[i * batch_size:(i + 1) * batch_size] for i in
                      range((len(utterances) + batch_size - 1) // batch_size)]
        input_masks = [masks[i * batch_size:(i + 1) * batch_size] for i in
                       range((len(masks) + batch_size - 1) // batch_size)]
        for index, (batch, mask) in enumerate(zip(input_data, input_masks)):
            if index == 0:
                embedding = np.asarray(
                    tf.reduce_sum(tf.reduce_mean(model(tf.constant(batch), attention_mask=np.array(mask))[0], axis=1),
                                  axis=0))
            else:
                embedding = np.add(embedding, np.asarray(
                    tf.reduce_sum(tf.reduce_mean(model(tf.constant(batch), attention_mask=np.array(mask))[0], axis=1),
                                  axis=0)))
        embedding = np.divide(embedding, len(utterances))
        unique_entity.set_embedding(embedding)
        embedding_matrix.append(embedding)
        index_to_eid[entity_num] = eid
        eid_to_index[eid] = entity_num
    for train_sample in train_samples:
        train_sample.set_embedding(unique_entity_map[train_sample.entity_id].entity_embedding)
    return index_to_eid, eid_to_index, embedding_matrix
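The tf.reduce_mean in the example above averages over every position, padded or not; a mask-aware variant averages only the real tokens. A sketch of that alternative (the batch shapes and variable names are assumptions):

import numpy as np
import tensorflow as tf
from transformers import TFAlbertModel

model = TFAlbertModel.from_pretrained('albert-base-v2')

def masked_mean_embeddings(batch_token_ids, batch_masks):
    # hidden: [batch, seq, hidden]; mask: [batch, seq, 1]
    hidden = model(tf.constant(batch_token_ids),
                   attention_mask=np.array(batch_masks))[0]
    mask = tf.cast(np.array(batch_masks), tf.float32)[:, :, None]
    return tf.reduce_sum(hidden * mask, axis=1) / tf.reduce_sum(mask, axis=1)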
Example #7
 def __init__(self, dropout=0.1):
     super().__init__()
     # an ALBERT checkpoint is required; a BERT checkpoint ('bert-base-uncased') will not load into TFAlbertModel
     self.albert = TFAlbertModel.from_pretrained('albert-base-v2',
                                                 trainable=True)
     self.drop = tf.keras.layers.Dropout(dropout)
     self.fc = tf.keras.layers.Dense(300, tf.nn.swish)
     self.out = tf.keras.layers.Dense(2)
Example #8
    def __init__(self, max_seq_length=512):

        # ALBERT unique params
        self.max_seq_length = max_seq_length

        # GR params
        self.vectorized_knowledge = {}
        self.text = {}
        self.questions = {}
        self.opt_params = {'learning_rate':0.001,'beta_1':0.9,'beta_2':0.999,'epsilon':1e-07}

        # init saved model
        self.albert_layer = TFAlbertModel.from_pretrained('albert-base-v2')

        # writing the model for the training tasks
        # get inputs
        
        res_id = tf.keras.layers.Input(shape=(self.max_seq_length,), name="input_ids", dtype='int32')
        res_mask = tf.keras.layers.Input(shape=(self.max_seq_length,), name="input_masks", dtype='int32')
        res_segment = tf.keras.layers.Input(shape=(self.max_seq_length,), name="input_seg", dtype='int32')

        # encode the three inputs
        _, res_pooled = self.albert_layer([res_id, res_mask, res_segment])

        # dense layer for encoding the pooled response representation
        self.response_encoder = tf.keras.layers.Dense(768, input_shape=(768,), name='response_dense_layer')
        encoded_response = self.response_encoder(res_pooled)

        # init model
        self.albert_model = tf.keras.Model(inputs=[res_id, res_mask, res_segment],
                                    outputs=encoded_response)
        
        print("Initializing tokenizer and optimizer")
        self.init_signatures()
Example #9
 def _test_TFAlbert(self, size, large=False):
     from transformers import AlbertTokenizer, TFAlbertModel
     tokenizer = AlbertTokenizer.from_pretrained(size)
     model = TFAlbertModel.from_pretrained(size)
     input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
     spec, input_dict = self.spec_and_pad(input_dict)
     outputs = ["last_hidden_state"]
     self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
Example #10
def get_transformer(bert_model_type, output_hidden_states=False):
    config = get_bert_config(bert_model_type, output_hidden_states)
    if bert_model_type in [
            'bert-base-uncased', 'bert-base-cased', 'bert-large-uncased',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        return TFBertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                           config=config)
    elif bert_model_type in [
            'prod-bert-base-uncased', 'tune_bert-base-uncased_nsp'
    ]:
        return TFBertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                           config=config,
                                           from_pt=True)
    elif bert_model_type in [
            'roberta-base', 'roberta-large', 'roberta-large-mnli',
            'distilroberta-base'
    ]:
        return TFRobertaModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                              config=config)
    elif bert_model_type in ['prod-roberta-base-cased']:
        return TFRobertaModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                              config=config,
                                              from_pt=True)
    elif bert_model_type in ['xlnet-base-cased']:
        return TFXLNetModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                            config=config)
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        return TFAlbertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                             config=config)
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        return TFGPT2Model.from_pretrained(BERT_MODEL_FILE[bert_model_type],
                                           config=config)
    elif bert_model_type in ['transfo-xl']:
        return TFTransfoXLModel.from_pretrained(
            BERT_MODEL_FILE[bert_model_type], config=config)
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        return TFDistilBertModel.from_pretrained(
            BERT_MODEL_FILE[bert_model_type], config=config)
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')
Example #11
    def __init__(self, model_name_path: str, use_dropout: bool, name: str):
        """
        Pretrained Classification Model initializer.

        Args:
            model_name_path (str): model_name_path
            use_dropout (bool): use_dropout
            name (str): name
        """
        self.model_name_path = model_name_path
        self.use_dropout = use_dropout
        super().__init__()
        self.pretrained_layer = TFAlbertModel.from_pretrained(model_name_path)
        self.dropout_layer = (tf.keras.layers.Dropout(
            rate=0.1, name="Dropout_layer") if self.use_dropout else None)
Example #12
    def __init__(self, *args, **kwargs):
        super(AlbertForComparison, self).__init__(*args, **kwargs)
        self.model_name = 'albert-base-v2'
        self.config = AlbertConfig.from_pretrained(self.model_name)
        self.config.output_hidden_states = False

        self.embedding_layer = TFAlbertModel.from_pretrained(
            self.model_name, config=self.config)
        self.pre_classifier = tf.keras.layers.Dense(
            self.config.hidden_size,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=self.config.initializer_range),
            activation="relu",
            name="pre_classifier",
        )
        self.classifier = tf.keras.layers.Dense(
            1,
            kernel_initializer=tf.keras.initializers.TruncatedNormal(
                stddev=self.config.initializer_range),
            name="classifier")
Example #13
def initialize_embeddings_from_canonical(config, unique_entity_map, train_samples):
    batch_size = config['embedding_batch_size']
    unique_entities = list(unique_entity_map.values())
    input_data = [unique_entities[i * batch_size:(i + 1) * batch_size] for i in
                  range((len(unique_entities) + batch_size - 1) // batch_size)]
    model = TFAlbertModel.from_pretrained('albert-base-v2')
    embedding_matrix, index_to_eid, eid_to_index, index = [], {}, {}, 0
    for batch in input_data:
        model_inputs = [x.canonical_tokens for x in batch]
        masks = [x.canonical_attention_mask for x in batch]
        embeddings = tf.reduce_mean(model(tf.constant(model_inputs), attention_mask=np.array(masks))[0], axis=1)
        for entity, embedding in zip(batch, embeddings):
            entity.set_embedding(embedding)
            eid = entity.entity_id
            embedding_matrix.append(embedding)
            index_to_eid[index] = eid
            eid_to_index[eid] = index
            index = index + 1
    for train_sample in train_samples:
        train_sample.set_embedding(unique_entity_map[train_sample.entity_id].entity_embedding)
    return index_to_eid, eid_to_index, embedding_matrix
Example #14
 def __init__(self,
              intent_size,
              slot_size,
              lr=1e-4,
              dropout_rate=0.2,
              units=300):
     super().__init__()
     # an ALBERT checkpoint is required; a BERT checkpoint ('bert-base-uncased') will not load into TFAlbertModel
     self.albert = TFAlbertModel.from_pretrained('albert-base-v2',
                                                 trainable=True)
     self.inp_dropout = Dropout(dropout_rate)
     self.intent_dropout = Dropout(dropout_rate)
     self.fc_intent = Dense(units, activation='relu')
     self.trans_params = self.add_weight(shape=(slot_size, slot_size))
     self.out_linear_intent = Dense(intent_size)
     self.out_linear_slot = Dense(slot_size)
     self.optimizer = Adam(lr)
     self.slots_accuracy = tf.keras.metrics.Accuracy()
     self.intent_accuracy = tf.keras.metrics.Accuracy()
     self.decay_lr = tf.optimizers.schedules.ExponentialDecay(
         lr, 1000, 0.95)
     self.logger = logging.getLogger('tensorflow')
     self.logger.setLevel(logging.INFO)
Example #15
 def __init__(self, nb_classes):
     super(AlbertPlusOutputLayer, self).__init__()
     self.albert_embedder = TFAlbertModel.from_pretrained('albert-base-v2')
     self.output_layer = Dense(nb_classes)
Example #16
 def __init__(self):
     super(AlbertEmbedder, self).__init__()
     self.albert_embedder = TFAlbertModel.from_pretrained('albert-base-v2')
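Example #16 only wires up the pretrained layer; a call() along the following lines (a sketch, not part of the original class) is what typically makes it usable inside a Keras model, returning the pooled [CLS] representation:

    def call(self, inputs, attention_mask=None, training=False):
        outputs = self.albert_embedder(inputs,
                                       attention_mask=attention_mask,
                                       training=training)
        return outputs[1]  # pooler output, shape [batch, hidden]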
Example #17
def main(
    fsx_prefix: str,
    model_type: str,
    model_size: str,
    batch_size: int,
    max_seq_length: int,
    gradient_accumulation_steps: int,
    optimizer: str,
    name: str,
    learning_rate: float,
    end_learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    skip_sop: bool,
    skip_mlm: bool,
    pre_layer_norm: bool,
    fast_squad: bool,
    dummy_eval: bool,
    squad_steps: List[int],
    hidden_dropout_prob: float,
):
    # Hard-coded values that don't need to be arguments
    max_predictions_per_seq = 20
    log_frequency = 1000
    checkpoint_frequency = 5000
    validate_frequency = 2000
    histogram_frequency = 100
    do_gradient_accumulation = gradient_accumulation_steps > 1

    if hvd.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "eks" if args.fsx_prefix == "/fsx" else "sm"
        if skip_sop:
            loss_str = "-skipsop"
        elif skip_mlm:
            loss_str = "-skipmlm"
        else:
            loss_str = ""

        amp_str = ("-skipamp"
                   if not tf.config.optimizer.get_experimental_options().get(
                       "auto_mixed_precision", False) else "")
        ln_str = "-preln" if pre_layer_norm else "-postln"
        dropout_str = f"-{hidden_dropout_prob}dropout" if hidden_dropout_prob != 0 else ""
        name_str = f"-{name}" if name else ""
        metadata = f"{model_type}-{model_size}-{args.load_from}-{hvd.size()}gpus-{batch_size}batch-{gradient_accumulation_steps}accum-{learning_rate}lr-{args.max_grad_norm}maxgrad-{optimizer}opt-{total_steps}steps-{max_seq_length}seq{amp_str}{ln_str}{loss_str}{dropout_str}{name_str}"
        run_name = f"{current_time}-{platform}-{metadata}"

        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        handlers = [
            logging.FileHandler(f"{fsx_prefix}/logs/albert/{run_name}.log"),
            logging.StreamHandler(),
        ]
        logging.basicConfig(level=level, format=format, handlers=handlers)

        # Check that arguments passed in properly, only after registering the alert_func and logging
        assert not (skip_sop
                    and skip_mlm), "Cannot use --skip_sop and --skip_mlm"

    wrap_global_functions(do_gradient_accumulation)

    if model_type == "albert":
        model_desc = f"albert-{model_size}-v2"
    elif model_type == "bert":
        model_desc = f"bert-{model_size}-uncased"

    config = AutoConfig.from_pretrained(model_desc)
    config.pre_layer_norm = pre_layer_norm
    config.output_hidden_states = True
    config.hidden_dropout_prob = hidden_dropout_prob
    model = TFAutoModelForPreTraining.from_config(config)

    if args.load_from == "scratch":
        pass
    else:
        assert model_type == "albert", "Only loading pretrained albert models is supported"
        huggingface_name = f"albert-{model_size}-v2"
        if args.load_from == "huggingface":
            albert = TFAlbertModel.from_pretrained(huggingface_name,
                                                   config=config)
            model.albert = albert
        elif args.load_from == "huggingfacepreds":
            mlm_model = TFAlbertForMaskedLM.from_pretrained(huggingface_name,
                                                            config=config)
            model.albert = mlm_model.albert
            model.cls.predictions = mlm_model.predictions

    tokenizer = get_tokenizer()
    schedule = LinearWarmupLinearDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=end_learning_rate,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    if optimizer == "lamb":
        opt = LAMB(
            learning_rate=schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )
    elif optimizer == "adam":
        opt = AdamW(weight_decay=0.0, learning_rate=schedule)

    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        opt, loss_scale="dynamic")
    gradient_accumulator = GradientAccumulator()

    # Train filenames are [1, 2047]
    # Val filenames are [0]
    # Note the different subdirectories
    train_glob = f"{fsx_prefix}/albert_pretraining/tfrecords/train/max_seq_len_{max_seq_length}_max_predictions_per_seq_{max_predictions_per_seq}_masked_lm_prob_15/albert_*.tfrecord"
    validation_glob = f"{fsx_prefix}/albert_pretraining/tfrecords/validation/max_seq_len_{max_seq_length}_max_predictions_per_seq_{max_predictions_per_seq}_masked_lm_prob_15/albert_*.tfrecord"

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)

    train_dataset = get_mlm_dataset(
        filenames=train_filenames,
        max_seq_length=max_seq_length,
        max_predictions_per_seq=max_predictions_per_seq,
        batch_size=batch_size,
    )  # Of shape [batch_size, ...]
    train_dataset = train_dataset.batch(
        gradient_accumulation_steps
    )  # Batch of batches, helpful for gradient accumulation. Shape [grad_steps, batch_size, ...]
    # train_dataset = (
    #    train_dataset.repeat()
    # )  # One iteration with 10 dupes, 8 nodes seems to be 60-70k steps.
    train_dataset = train_dataset.prefetch(buffer_size=8)

    # Validation should only be done on one node, since Horovod doesn't allow allreduce on a subset of ranks
    if hvd.rank() == 0:
        validation_dataset = get_mlm_dataset(
            filenames=validation_filenames,
            max_seq_length=max_seq_length,
            max_predictions_per_seq=max_predictions_per_seq,
            batch_size=batch_size,
        )
        # validation_dataset = validation_dataset.batch(1)
        validation_dataset = validation_dataset.prefetch(buffer_size=8)

        pbar = tqdm.tqdm(total_steps)
        summary_writer = None  # Only create a writer if we make it through a successful step

    if hvd.rank() == 0:
        logger.info(f"Starting training, job name {run_name}")

    for i, batch in enumerate(train_dataset):
        learning_rate = schedule(step=tf.constant(i, dtype=tf.float32))
        loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm = train_step(
            model=model,
            opt=opt,
            gradient_accumulator=gradient_accumulator,
            batch=batch,
            gradient_accumulation_steps=gradient_accumulation_steps,
            skip_sop=skip_sop,
            skip_mlm=skip_mlm,
        )

        # Don't want to wrap broadcast_variables() in a tf.function, can lead to asynchronous errors
        if i == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)

        is_final_step = i >= total_steps - 1
        do_squad = i in squad_steps or is_final_step
        # Squad requires all the ranks to train, but results are only returned on rank 0
        if do_squad:
            squad_results = get_squad_results(
                model=model,
                model_size=model_size,
                step=i,
                fast=fast_squad,
                dummy_eval=dummy_eval,
            )
            if hvd.rank() == 0:
                squad_exact, squad_f1 = squad_results["exact"], squad_results[
                    "f1"]
                logger.info(
                    f"SQuAD step {i} -- F1: {squad_f1:.3f}, Exact: {squad_exact:.3f}"
                )
            # Re-wrap autograph so it doesn't get arg mismatches
            wrap_global_functions(do_gradient_accumulation)

        if hvd.rank() == 0:
            do_log = i % log_frequency == 0
            do_checkpoint = (i % checkpoint_frequency == 0) or is_final_step
            do_validation = (i % validate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, MLM: {mlm_loss:.3f}, SOP: {sop_loss:.3f}, MLM_acc: {mlm_acc:.3f}, SOP_acc: {sop_acc:.3f}"
            pbar.set_description(description)
            if do_log:
                logger.info(f"Train step {i} -- {description}")

            if do_checkpoint:
                checkpoint_path = f"{fsx_prefix}/checkpoints/albert/{run_name}-step{i}.ckpt"
                logger.info(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)
                # model.load_weights(checkpoint_path)

            if do_validation:
                val_loss, val_mlm_loss, val_mlm_acc, val_sop_loss, val_sop_acc = run_validation(
                    model=model,
                    validation_dataset=validation_dataset,
                    skip_sop=skip_sop,
                    skip_mlm=skip_mlm,
                )
                description = f"Loss: {val_loss:.3f}, MLM: {val_mlm_loss:.3f}, SOP: {val_sop_loss:.3f}, MLM_acc: {val_mlm_acc:.3f}, SOP_acc: {val_sop_acc:.3f}"
                logger.info(f"Validation step {i} -- {description}")

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{fsx_prefix}/logs/albert/{run_name}")
            # Log to TensorBoard
            weight_norm = tf.math.sqrt(
                tf.math.reduce_sum([
                    tf.norm(var, ord=2)**2 for var in model.trainable_variables
                ]))
            with summary_writer.as_default():
                tf.summary.scalar("weight_norm", weight_norm, step=i)
                tf.summary.scalar("learning_rate", learning_rate, step=i)
                tf.summary.scalar("train_loss", loss, step=i)
                tf.summary.scalar("train_mlm_loss", mlm_loss, step=i)
                tf.summary.scalar("train_mlm_acc", mlm_acc, step=i)
                tf.summary.scalar("train_sop_loss", sop_loss, step=i)
                tf.summary.scalar("train_sop_acc", sop_acc, step=i)
                tf.summary.scalar("grad_norm", grad_norm, step=i)
                if do_validation:
                    tf.summary.scalar("val_loss", val_loss, step=i)
                    tf.summary.scalar("val_mlm_loss", val_mlm_loss, step=i)
                    tf.summary.scalar("val_mlm_acc", val_mlm_acc, step=i)
                    tf.summary.scalar("val_sop_loss", val_sop_loss, step=i)
                    tf.summary.scalar("val_sop_acc", val_sop_acc, step=i)
                if do_squad:
                    tf.summary.scalar("squad_f1", squad_f1, step=i)
                    tf.summary.scalar("squad_exact", squad_exact, step=i)

        if is_final_step:
            break

    if hvd.rank() == 0:
        pbar.close()
        logger.info(f"Finished pretraining, job name {run_name}")
Example #18
def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments,
                               TrainingArguments, LoggingArguments))
    model_args, data_args, train_args, log_args = parser.parse_args_into_dataclasses(
    )

    tf.random.set_seed(train_args.seed)
    tf.autograph.set_verbosity(0)

    # Settings init
    parse_bool = lambda arg: arg == "true"
    do_gradient_accumulation = train_args.gradient_accumulation_steps > 1
    do_xla = not parse_bool(train_args.skip_xla)
    do_eager = parse_bool(train_args.eager)
    skip_sop = parse_bool(train_args.skip_sop)
    skip_mlm = parse_bool(train_args.skip_mlm)
    pre_layer_norm = parse_bool(model_args.pre_layer_norm)
    fast_squad = parse_bool(log_args.fast_squad)
    dummy_eval = parse_bool(log_args.dummy_eval)
    squad_steps = get_squad_steps(log_args.extra_squad_steps)
    is_sagemaker = data_args.fsx_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker
    global max_grad_norm
    max_grad_norm = train_args.max_grad_norm

    # Horovod init
    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")
    # XLA, AutoGraph
    tf.config.optimizer.set_jit(do_xla)
    tf.config.experimental_run_functions_eagerly(do_eager)

    if hvd.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "sm" if is_sagemaker else "eks"
        if skip_sop:
            loss_str = "-skipsop"
        elif skip_mlm:
            loss_str = "-skipmlm"
        else:
            loss_str = ""

        metadata = (f"{model_args.model_type}"
                    f"-{model_args.model_size}"
                    f"-{model_args.load_from}"
                    f"-{hvd.size()}gpus"
                    f"-{train_args.batch_size}batch"
                    f"-{train_args.gradient_accumulation_steps}accum"
                    f"-{train_args.learning_rate}maxlr"
                    f"-{train_args.end_learning_rate}endlr"
                    f"-{train_args.learning_rate_decay_power}power"
                    f"-{train_args.max_grad_norm}maxgrad"
                    f"-{train_args.optimizer}opt"
                    f"-{train_args.total_steps}steps"
                    f"-{data_args.max_seq_length}seq"
                    f"-{data_args.max_predictions_per_seq}preds"
                    f"-{'preln' if pre_layer_norm else 'postln'}"
                    f"{loss_str}"
                    f"-{model_args.hidden_dropout_prob}dropout"
                    f"-{train_args.seed}seed")
        run_name = f"{current_time}-{platform}-{metadata}-{train_args.name if train_args.name else 'unnamed'}"

        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        handlers = [
            logging.FileHandler(
                f"{data_args.fsx_prefix}/logs/albert/{run_name}.log"),
            TqdmLoggingHandler(),
        ]
        logging.basicConfig(level=level, format=format, handlers=handlers)

        # Check that arguments passed in properly, only after registering the alert_func and logging
        assert not (skip_sop
                    and skip_mlm), "Cannot use --skip_sop and --skip_mlm"

    wrap_global_functions(do_gradient_accumulation)

    if model_args.model_type == "albert":
        model_desc = f"albert-{model_args.model_size}-v2"
    elif model_args.model_type == "bert":
        model_desc = f"bert-{model_args.model_size}-uncased"

    config = AutoConfig.from_pretrained(model_desc)
    config.pre_layer_norm = pre_layer_norm
    config.hidden_dropout_prob = model_args.hidden_dropout_prob
    model = TFAutoModelForPreTraining.from_config(config)

    # Create optimizer and enable AMP loss scaling.
    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=train_args.learning_rate,
        end_learning_rate=train_args.end_learning_rate,
        warmup_steps=train_args.warmup_steps,
        total_steps=train_args.total_steps,
        power=train_args.learning_rate_decay_power,
    )
    if train_args.optimizer == "lamb":
        opt = LAMB(
            learning_rate=schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )
    elif train_args.optimizer == "adam":
        opt = AdamW(weight_decay=0.0, learning_rate=schedule)
    opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        opt, loss_scale="dynamic")
    gradient_accumulator = GradientAccumulator()

    loaded_opt_weights = None
    if model_args.load_from == "scratch":
        pass
    elif model_args.load_from.startswith("huggingface"):
        assert (model_args.model_type == "albert"
                ), "Only loading pretrained albert models is supported"
        huggingface_name = f"albert-{model_args.model_size}-v2"
        if model_args.load_from == "huggingface":
            albert = TFAlbertModel.from_pretrained(huggingface_name,
                                                   config=config)
            model.albert = albert
    else:
        model_ckpt, opt_ckpt = get_checkpoint_paths_from_prefix(
            model_args.checkpoint_path)

        model = TFAutoModelForPreTraining.from_config(config)
        if hvd.rank() == 0:
            model.load_weights(model_ckpt)
            loaded_opt_weights = np.load(opt_ckpt, allow_pickle=True)
            # We do not set the weights yet, we have to do a first step to initialize the optimizer.

    # Train filenames are [1, 2047], Val filenames are [0]. Note the different subdirectories
    # Move to same folder structure and remove if/else
    if model_args.model_type == "albert":
        train_glob = f"{data_args.fsx_prefix}/albert_pretraining/tfrecords/train/max_seq_len_{data_args.max_seq_length}_max_predictions_per_seq_{data_args.max_predictions_per_seq}_masked_lm_prob_15/albert_*.tfrecord"
        validation_glob = f"{data_args.fsx_prefix}/albert_pretraining/tfrecords/validation/max_seq_len_{data_args.max_seq_length}_max_predictions_per_seq_{data_args.max_predictions_per_seq}_masked_lm_prob_15/albert_*.tfrecord"
    if model_args.model_type == "bert":
        train_glob = f"{data_args.fsx_prefix}/bert_pretraining/max_seq_len_{data_args.max_seq_length}_max_predictions_per_seq_{data_args.max_predictions_per_seq}_masked_lm_prob_15/training/*.tfrecord"
        validation_glob = f"{data_args.fsx_prefix}/bert_pretraining/max_seq_len_{data_args.max_seq_length}_max_predictions_per_seq_{data_args.max_predictions_per_seq}_masked_lm_prob_15/validation/*.tfrecord"

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)

    train_dataset = get_mlm_dataset(
        filenames=train_filenames,
        max_seq_length=data_args.max_seq_length,
        max_predictions_per_seq=data_args.max_predictions_per_seq,
        batch_size=train_args.batch_size,
    )  # Of shape [batch_size, ...]
    # Batch of batches, helpful for gradient accumulation. Shape [grad_steps, batch_size, ...]
    train_dataset = train_dataset.batch(train_args.gradient_accumulation_steps)
    # One iteration with 10 dupes, 8 nodes seems to be 60-70k steps.
    train_dataset = train_dataset.prefetch(buffer_size=8)

    # Validation should only be done on one node, since Horovod doesn't allow allreduce on a subset of ranks
    if hvd.rank() == 0:
        validation_dataset = get_mlm_dataset(
            filenames=validation_filenames,
            max_seq_length=data_args.max_seq_length,
            max_predictions_per_seq=data_args.max_predictions_per_seq,
            batch_size=train_args.batch_size,
        )
        # validation_dataset = validation_dataset.batch(1)
        validation_dataset = validation_dataset.prefetch(buffer_size=8)

        pbar = tqdm.tqdm(train_args.total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        logger.info(f"Starting training, job name {run_name}")

    i = 0
    start_time = time.perf_counter()
    for batch in train_dataset:
        learning_rate = schedule(step=tf.constant(i, dtype=tf.float32))
        loss_scale = opt.loss_scale()
        loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm, weight_norm = train_step(
            model=model,
            opt=opt,
            gradient_accumulator=gradient_accumulator,
            batch=batch,
            gradient_accumulation_steps=train_args.gradient_accumulation_steps,
            skip_sop=skip_sop,
            skip_mlm=skip_mlm,
        )

        # Don't want to wrap broadcast_variables() in a tf.function, can lead to asynchronous errors
        if i == 0:
            if hvd.rank() == 0 and loaded_opt_weights is not None:
                opt.set_weights(loaded_opt_weights)
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)
            i = opt.get_weights()[0] - 1

        is_final_step = i >= train_args.total_steps - 1
        do_squad = i in squad_steps or is_final_step
        # Squad requires all the ranks to train, but results are only returned on rank 0
        if do_squad:
            squad_results = get_squad_results_while_pretraining(
                model=model,
                model_size=model_args.model_size,
                fsx_prefix=data_args.fsx_prefix,
                step=i,
                fast=log_args.fast_squad,
                dummy_eval=log_args.dummy_eval,
            )
            if hvd.rank() == 0:
                squad_exact, squad_f1 = squad_results["exact"], squad_results[
                    "f1"]
                logger.info(
                    f"SQuAD step {i} -- F1: {squad_f1:.3f}, Exact: {squad_exact:.3f}"
                )
            # Re-wrap autograph so it doesn't get arg mismatches
            wrap_global_functions(do_gradient_accumulation)

        if hvd.rank() == 0:
            do_log = i % log_args.log_frequency == 0
            do_checkpoint = (
                (i > 0) and
                (i % log_args.checkpoint_frequency == 0)) or is_final_step
            do_validation = (
                (i > 0) and
                (i % log_args.validation_frequency == 0)) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, MLM: {mlm_loss:.3f}, SOP: {sop_loss:.3f}, MLM_acc: {mlm_acc:.3f}, SOP_acc: {sop_acc:.3f}"
            pbar.set_description(description)
            if do_log:
                elapsed_time = time.perf_counter() - start_time
                if i == 0:
                    logger.info(f"First step: {elapsed_time:.3f} secs")
                else:
                    it_per_sec = log_args.log_frequency / elapsed_time
                    logger.info(
                        f"Train step {i} -- {description} -- It/s: {it_per_sec:.2f}"
                    )
                    start_time = time.perf_counter()

            if do_checkpoint:
                checkpoint_prefix = f"{data_args.fsx_prefix}/checkpoints/albert/{run_name}-step{i}"
                model_ckpt = f"{checkpoint_prefix}.ckpt"
                opt_ckpt = f"{checkpoint_prefix}-opt.npy"
                logger.info(
                    f"Saving model at {model_ckpt}, optimizer at {opt_ckpt}")
                model.save_weights(model_ckpt)
                # model.load_weights(model_ckpt)

                opt_weights = opt.get_weights()
                np.save(opt_ckpt, opt_weights)
                # opt.set_weights(opt_weights)

            if do_validation:
                val_loss, val_mlm_loss, val_mlm_acc, val_sop_loss, val_sop_acc = run_validation(
                    model=model,
                    validation_dataset=validation_dataset,
                    skip_sop=skip_sop,
                    skip_mlm=skip_mlm,
                )
                description = f"Loss: {val_loss:.3f}, MLM: {val_mlm_loss:.3f}, SOP: {val_sop_loss:.3f}, MLM_acc: {val_mlm_acc:.3f}, SOP_acc: {val_sop_acc:.3f}"
                logger.info(f"Validation step {i} -- {description}")

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{data_args.fsx_prefix}/logs/albert/{run_name}")
                with summary_writer.as_default():
                    HP_MODEL_TYPE = hp.HParam("model_type",
                                              hp.Discrete(["albert", "bert"]))
                    HP_MODEL_SIZE = hp.HParam("model_size",
                                              hp.Discrete(["base", "large"]))
                    HP_LEARNING_RATE = hp.HParam("learning_rate",
                                                 hp.RealInterval(1e-5, 1e-1))
                    HP_BATCH_SIZE = hp.HParam("global_batch_size",
                                              hp.IntInterval(1, 64))
                    HP_PRE_LAYER_NORM = hp.HParam("pre_layer_norm",
                                                  hp.Discrete([True, False]))
                    HP_HIDDEN_DROPOUT = hp.HParam("hidden_dropout")
                    hparams = [
                        HP_MODEL_TYPE,
                        HP_MODEL_SIZE,
                        HP_BATCH_SIZE,
                        HP_LEARNING_RATE,
                        HP_PRE_LAYER_NORM,
                        HP_HIDDEN_DROPOUT,
                    ]

                    HP_F1 = hp.Metric("squad_f1")
                    HP_EXACT = hp.Metric("squad_exact")
                    HP_MLM = hp.Metric("val_mlm_acc")
                    HP_SOP = hp.Metric("val_sop_acc")
                    HP_TRAIN_LOSS = hp.Metric("train_loss")
                    HP_VAL_LOSS = hp.Metric("val_loss")
                    metrics = [
                        HP_TRAIN_LOSS, HP_VAL_LOSS, HP_F1, HP_EXACT, HP_MLM,
                        HP_SOP
                    ]

                    hp.hparams_config(
                        hparams=hparams,
                        metrics=metrics,
                    )
                    hp.hparams(
                        {
                            HP_MODEL_TYPE: model_args.model_type,
                            HP_MODEL_SIZE: model_args.model_size,
                            HP_LEARNING_RATE: train_args.learning_rate,
                            HP_BATCH_SIZE: train_args.batch_size * hvd.size(),
                            HP_PRE_LAYER_NORM: model_args.pre_layer_norm
                            == "true",
                            HP_HIDDEN_DROPOUT: model_args.hidden_dropout_prob,
                        },
                        trial_id=run_name,
                    )

            # Log to TensorBoard
            with summary_writer.as_default():
                tf.summary.scalar("weight_norm", weight_norm, step=i)
                tf.summary.scalar("loss_scale", loss_scale, step=i)
                tf.summary.scalar("learning_rate", learning_rate, step=i)
                tf.summary.scalar("train_loss", loss, step=i)
                tf.summary.scalar("train_mlm_loss", mlm_loss, step=i)
                tf.summary.scalar("train_mlm_acc", mlm_acc, step=i)
                tf.summary.scalar("train_sop_loss", sop_loss, step=i)
                tf.summary.scalar("train_sop_acc", sop_acc, step=i)
                tf.summary.scalar("grad_norm", grad_norm, step=i)
                if do_validation:
                    tf.summary.scalar("val_loss", val_loss, step=i)
                    tf.summary.scalar("val_mlm_loss", val_mlm_loss, step=i)
                    tf.summary.scalar("val_mlm_acc", val_mlm_acc, step=i)
                    tf.summary.scalar("val_sop_loss", val_sop_loss, step=i)
                    tf.summary.scalar("val_sop_acc", val_sop_acc, step=i)
                if do_squad:
                    tf.summary.scalar("squad_f1", squad_f1, step=i)
                    tf.summary.scalar("squad_exact", squad_exact, step=i)

        i += 1
        if is_final_step:
            break

    if hvd.rank() == 0:
        pbar.close()
        logger.info(f"Finished pretraining, job name {run_name}")
Example #19
import pandas as pd

import nmslib

import re

import tensorflow as tf

from transformers import AlbertTokenizer, TFAlbertModel

albert_tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")

from transformers import AlbertConfig

config = AlbertConfig.from_pretrained('./albert', output_hidden_states=True)

model = TFAlbertModel.from_pretrained('./albert', config=config, from_pt=True)

df = pd.read_csv('final_search.csv')

search_index = nmslib.init(method='hnsw', space='cosinesimil')

search_index.loadIndex('./final.nmslib')


def search(query):
    e = albert_tokenizer.encode(query.lower())
    input = tf.constant(e)[None, :]
    output = model(input)
    v = [0] * 768
    for i in range(-1, -13, -1):
        v = v + output[2][i][0][0]
Example #20
def get_albert():
    ids = keras.layers.Input(shape=(None, ), dtype=tf.int32, name='ids')
    att = keras.layers.Input(shape=(None, ), dtype=tf.int32, name='att')
    tok_type_ids = keras.layers.Input(shape=(None, ),
                                      dtype=tf.int32,
                                      name='tti')

    config = AlbertConfig.from_pretrained(Config.Albert.config)
    config.output_hidden_states = True
    albert_model = TFAlbertModel.from_pretrained(Config.Albert.model,
                                                 config=config)

    _, _, x = albert_model(ids,
                           attention_mask=att,
                           token_type_ids=tok_type_ids)

    x1 = keras.layers.Dropout(0.15)(x[-1])
    x1 = keras.layers.Conv1D(768, 2, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.add([x1, x[-2]])
    x1 = keras.layers.Conv1D(768, 5, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.add([x1, x[-3]])
    x1 = keras.layers.Conv1D(768, 8, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.add([x1, x[-4]])
    x1 = keras.layers.Dense(1)(x1)
    x1 = keras.layers.Flatten()(x1)
    x1 = keras.layers.Activation('softmax', dtype='float32', name='sts')(x1)

    x2 = keras.layers.Dropout(0.15)(x[-1])
    x2 = keras.layers.Conv1D(768, 2, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.add([x2, x[-2]])
    x2 = keras.layers.Conv1D(768, 5, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.add([x2, x[-3]])
    x2 = keras.layers.Conv1D(768, 8, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.add([x2, x[-4]])
    x2 = keras.layers.Dense(1)(x2)
    x2 = keras.layers.Flatten()(x2)
    x2 = keras.layers.Activation('softmax', dtype='float32', name='ets')(x2)

    model = keras.models.Model(inputs=[ids, att, tok_type_ids],
                               outputs=[x1, x2])

    optimizer = keras.optimizers.Adam(learning_rate=6e-5)
    if Config.Train.use_amp:
        optimizer = keras.mixed_precision.experimental.LossScaleOptimizer(
            optimizer, 'dynamic')
    loss = keras.losses.CategoricalCrossentropy(
        label_smoothing=Config.Train.label_smoothing)
    model.compile(loss=loss, optimizer=optimizer)

    return model
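The 'sts' and 'ets' heads above emit per-token start and end probabilities; below is a hedged sketch of decoding them back into a token span with a simple greedy argmax (the decoding strategy is an assumption, not part of the snippet):

import numpy as np

def decode_span(model, input_ids, attention_mask, token_type_ids):
    # model is the Keras model returned by get_albert() above
    start_probs, end_probs = model.predict(
        [input_ids, attention_mask, token_type_ids], verbose=0)
    start = int(np.argmax(start_probs[0]))
    end = int(np.argmax(end_probs[0]))
    if end < start:  # crude guard; a joint search over (start, end) is common in practice
        end = start
    return start, end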
Example #21
import os
import sys
sys.path.insert(0, os.path.abspath('../'))

from transformers import AlbertModel, TFAlbertModel
from tokenization_kbalbert import KbAlbertCharTokenizer

kb_albert_model_path = '../kb-albert-model'
text = '방카슈랑스는 금융의 겸업화 추세에 부응하여 금융산업의 선진화를 도모하고 금융소비자의 편익을 위하여 도입되었습니다.'

tokenizer = KbAlbertCharTokenizer.from_pretrained(kb_albert_model_path)

# PyTorch
pt_model = AlbertModel.from_pretrained(kb_albert_model_path)
pt_inputs = tokenizer(text, return_tensors='pt')
pt_outputs = pt_model(**pt_inputs)[0]
print(pt_outputs)

# TensorFlow 2.0
tf_model = TFAlbertModel.from_pretrained(kb_albert_model_path)
tf_inputs = tokenizer(text, return_tensors='tf')
tf_outputs = tf_model(tf_inputs)[0]
print(tf_outputs)
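Since the example prints both backends' outputs, a quick numerical comparison makes the equivalence check explicit (a sketch; the tolerances are arbitrary):

import numpy as np

np.testing.assert_allclose(pt_outputs.detach().numpy(),
                           tf_outputs.numpy(),
                           rtol=1e-4, atol=1e-4)
print('PyTorch and TensorFlow outputs match within tolerance')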
Example #22
 def _build_albert_from_transformers(self):
     from transformers import TFAlbertModel
     model = TFAlbertModel.from_pretrained(os.path.join(
         BASE_DIR, 'albert_base_zh_pytorch'),
                                           from_pt=True)
     return model
Example #23
from transformers import TFAlbertForMaskedLM, TFAlbertModel, TFAlbertForSequenceClassification, AlbertForMaskedLM
import os

checkpoint = "albert-base-v1"

model = AlbertForMaskedLM.from_pretrained(checkpoint)

# expand "~" explicitly; os.makedirs and from_pretrained do not expand it themselves
save_dir = os.path.expanduser("~/saved/" + checkpoint)

if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# save the PyTorch weights, then convert them to TensorFlow with from_pt=True
model.save_pretrained(save_dir)
model = TFAlbertForMaskedLM.from_pretrained(save_dir, from_pt=True)
model.save_pretrained(save_dir)
model = TFAlbertModel.from_pretrained(save_dir)
model = TFAlbertForMaskedLM.from_pretrained(save_dir)
model = TFAlbertForSequenceClassification.from_pretrained(save_dir)


print("nice model")