def __init__(self, albert_config):
    self.albert_config = albert_config
    model_name = self.albert_config['model_name']
    self.tokenizer = AlbertTokenizer.from_pretrained(model_name)
    self.model = TFAlbertModel.from_pretrained(model_name)
    self.summary_extraction_mode = self.albert_config['summary_extraction_mode']
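# A hypothetical configuration dict for the constructor above. These two keys are
# the only ones the code reads; the values shown are illustrative assumptions, not
# taken from the original source.
albert_config = {
    'model_name': 'albert-base-v2',
    'summary_extraction_mode': 'cls',  # assumed value; valid modes are not listed in the source
}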
def _build_albert_from_transformers(self):
    from transformers import TFAlbertModel
    model = TFAlbertModel.from_pretrained(
        os.path.join(PYTORCH_MODEL_PATH, "albert_base_zh_pytorch"), from_pt=True)
    return model
def __init__(self, config, pretrained_path, *inputs, **kwargs):
    super(QaAlbertModel, self).__init__(config, pretrained_path, *inputs, **kwargs)
    # config needs to be a dict
    self.linear_size = 128
    self.num_labels = config['num_labels']
    # Set the input shape to None (sequence length not known), as this is a
    # condition for being able to save the model.
    self.sequence = tf.keras.Input(shape=(None,), dtype=np.int32)
    self.albert = TFAlbertModel.from_pretrained(pretrained_path)
    self.relu = tf.keras.layers.Activation('relu')
    # Compute the score of each output of a word t_i being the start or end of the
    # answer: s*t_i, e*t_j (s: start vector, e: end vector) in the top layer with
    # num_labels=2.
    self.qa_linear = tf.keras.layers.Dense(
        self.linear_size,
        activation='relu',
        kernel_initializer=u.get_initializer(config['initializer_range']),
        name="qa_linear")
    self.qa_outputs = tf.keras.layers.Dense(
        self.num_labels,
        kernel_initializer=u.get_initializer(config['initializer_range']),
        name="qa_outputs")
    # Softmax over the sequence dimension.
    self.softmax = tf.keras.layers.Softmax(axis=1)
    self.albert.trainable = False
    # Has to come at the end of this constructor, otherwise other layers are unknown.
    self._set_inputs(self.sequence)
def get_albert_for_comparison():
    model_name = 'albert-base-v2'
    config = AlbertConfig.from_pretrained(model_name)
    config.output_hidden_states = False
    input_ids = tf.keras.Input(shape=(128,), name='input_ids', dtype=tf.int32)
    attention_mask = tf.keras.Input(shape=(128,), name='attention_mask', dtype=tf.int32)
    transformer_model = TFAlbertModel.from_pretrained(model_name, config=config)
    embedding_layer = transformer_model([input_ids, attention_mask])[0]
    X = tf.keras.layers.Dense(
        config.hidden_size,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.initializer_range),
        activation="relu",
        name="pre_classifier",
    )(embedding_layer[:, 0])
    X = tf.keras.layers.Dropout(config.classifier_dropout_prob)(X)
    output_ = tf.keras.layers.Dense(
        1,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.initializer_range),
        name="classifier")(X)
    return tf.keras.Model([input_ids, attention_mask], output_)
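# A hypothetical smoke test for get_albert_for_comparison() above. The tokenizer
# choice and the fixed padding length of 128 are assumptions that match the Input
# shapes declared in the function; they are not part of the original snippet.
from transformers import AlbertTokenizer

tok = AlbertTokenizer.from_pretrained('albert-base-v2')
enc = tok("ALBERT is compact.", padding='max_length', max_length=128, return_tensors='tf')
clf = get_albert_for_comparison()
logits = clf.predict([enc['input_ids'], enc['attention_mask']])  # shape (1, 1)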
def __init__(
        self,
        pretrained_model_name_or_path='albert-base-v2',
        reduce_output='cls_pooled',
        trainable=True,
        num_tokens=None,
        **kwargs
):
    super(ALBERTEncoder, self).__init__()
    try:
        from transformers import TFAlbertModel
    except ModuleNotFoundError:
        logger.error(
            ' transformers is not installed. '
            'In order to install all text feature dependencies run '
            'pip install ludwig[text]'
        )
        sys.exit(-1)
    self.transformer = TFAlbertModel.from_pretrained(
        pretrained_model_name_or_path
    )
    self.reduce_output = reduce_output
    if self.reduce_output != 'cls_pooled':
        self.reduce_sequence = SequenceReducer(reduce_mode=reduce_output)
    self.transformer.trainable = trainable
    self.transformer.resize_token_embeddings(num_tokens)
def initialize_embeddings_from_average_representations(config, unique_entity_map, train_samples):
    batch_size = config['embedding_batch_size']
    model = TFAlbertModel.from_pretrained('albert-base-v2')
    embedding_matrix, index_to_eid, eid_to_index = [], {}, {}
    for entity_num, (eid, unique_entity) in enumerate(unique_entity_map.items()):
        embedding = []
        utterances, masks = unique_entity.utterances, unique_entity.masks
        input_data = [utterances[i * batch_size:(i + 1) * batch_size]
                      for i in range((len(utterances) + batch_size - 1) // batch_size)]
        input_masks = [masks[i * batch_size:(i + 1) * batch_size]
                       for i in range((len(masks) + batch_size - 1) // batch_size)]
        for index, (batch, mask) in enumerate(zip(input_data, input_masks)):
            # Mean-pool the token embeddings of each utterance, then sum over the batch.
            batch_embedding = np.asarray(
                tf.reduce_sum(
                    tf.reduce_mean(
                        model(tf.constant(batch), attention_mask=np.array(mask))[0],
                        axis=1),
                    axis=0))
            if index == 0:
                embedding = batch_embedding
            else:
                embedding = np.add(embedding, batch_embedding)
        # Average the summed utterance vectors to get the entity representation.
        embedding = np.divide(embedding, len(utterances))
        unique_entity.set_embedding(embedding)
        embedding_matrix.append(embedding)
        index_to_eid[entity_num] = eid
        eid_to_index[eid] = entity_num
    for train_sample in train_samples:
        train_sample.set_embedding(unique_entity_map[train_sample.entity_id].entity_embedding)
    return index_to_eid, eid_to_index, embedding_matrix
def __init__(self, dropout=0.1):
    super().__init__()
    # Use an ALBERT checkpoint here; TFAlbertModel cannot load a BERT checkpoint
    # such as 'bert-base-uncased' (the original snippet did so, which fails).
    self.albert = TFAlbertModel.from_pretrained('albert-base-v2', trainable=True)
    self.drop = tf.keras.layers.Dropout(dropout)
    self.fc = tf.keras.layers.Dense(300, tf.nn.swish)
    self.out = tf.keras.layers.Dense(2)
def __init__(self, max_seq_length=512):
    # ALBERT-specific params
    self.max_seq_length = max_seq_length
    # GR params
    self.vectorized_knowledge = {}
    self.text = {}
    self.questions = {}
    self.opt_params = {'learning_rate': 0.001, 'beta_1': 0.9, 'beta_2': 0.999, 'epsilon': 1e-07}
    # Init saved model
    self.albert_layer = TFAlbertModel.from_pretrained('albert-base-v2')
    # Build the model for the training tasks.
    # Inputs
    res_id = tf.keras.layers.Input(shape=(self.max_seq_length,), name="input_ids", dtype='int32')
    res_mask = tf.keras.layers.Input(shape=(self.max_seq_length,), name="input_masks", dtype='int32')
    res_segment = tf.keras.layers.Input(shape=(self.max_seq_length,), name="input_seg", dtype='int32')
    # Encode the three inputs
    _, res_pooled = self.albert_layer([res_id, res_mask, res_segment])
    # Dense layer specifically for the response encoding
    self.response_encoder = tf.keras.layers.Dense(768, input_shape=(768,), name='response_dense_layer')
    encoded_response = self.response_encoder(res_pooled)
    # Init model
    self.albert_model = tf.keras.Model(inputs=[res_id, res_mask, res_segment], outputs=encoded_response)
    print("Initializing tokenizer and optimizer")
    self.init_signatures()
def _test_TFAlbert(self, size, large=False):
    from transformers import AlbertTokenizer, TFAlbertModel
    tokenizer = AlbertTokenizer.from_pretrained(size)
    model = TFAlbertModel.from_pretrained(size)
    input_dict = tokenizer("Hello, my dog is cute", return_tensors="tf")
    spec, input_dict = self.spec_and_pad(input_dict)
    outputs = ["last_hidden_state"]
    self.run_test(model, input_dict, input_signature=spec, outputs=outputs, large=large)
def get_transformer(bert_model_type, output_hidden_states=False):
    config = get_bert_config(bert_model_type, output_hidden_states)
    if bert_model_type in [
            'bert-base-uncased', 'bert-base-cased', 'bert-large-uncased',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        return TFBertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type], config=config)
    elif bert_model_type in ['prod-bert-base-uncased', 'tune_bert-base-uncased_nsp']:
        return TFBertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type], config=config,
                                           from_pt=True)
    elif bert_model_type in [
            'roberta-base', 'roberta-large', 'roberta-large-mnli', 'distilroberta-base'
    ]:
        return TFRobertaModel.from_pretrained(BERT_MODEL_FILE[bert_model_type], config=config)
    elif bert_model_type in ['prod-roberta-base-cased']:
        return TFRobertaModel.from_pretrained(BERT_MODEL_FILE[bert_model_type], config=config,
                                              from_pt=True)
    elif bert_model_type in ['xlnet-base-cased']:
        return TFXLNetModel.from_pretrained(BERT_MODEL_FILE[bert_model_type], config=config)
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1', 'albert-xxlarge-v1'
    ]:
        return TFAlbertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type], config=config)
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        return TFGPT2Model.from_pretrained(BERT_MODEL_FILE[bert_model_type], config=config)
    elif bert_model_type in ['transfo-xl']:
        return TFTransfoXLModel.from_pretrained(BERT_MODEL_FILE[bert_model_type], config=config)
    elif bert_model_type in [
            'distilbert-base-uncased', 'distilbert-base-uncased-distilled-squad'
    ]:
        return TFDistilBertModel.from_pretrained(BERT_MODEL_FILE[bert_model_type], config=config)
    else:
        raise ValueError(f'`bert_model_type` not understood: {bert_model_type}')
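# A minimal usage sketch for get_transformer() above. It assumes BERT_MODEL_FILE maps
# 'albert-base-v1' to a loadable checkpoint and that get_bert_config() is defined,
# since both are module-level dependencies of the function.
from transformers import AlbertTokenizer

backbone = get_transformer('albert-base-v1')
tok = AlbertTokenizer.from_pretrained('albert-base-v1')
hidden = backbone(tok("a quick check", return_tensors="tf"))[0]  # [1, seq_len, hidden_size]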
def __init__(self, model_name_path: str, use_dropout: bool, name: str):
    """
    Pretrained Classification Model initializer.

    Args:
        model_name_path (str): name or path of the pretrained model to load
        use_dropout (bool): whether to add a dropout layer after the pretrained layer
        name (str): name of the model
    """
    self.model_name_path = model_name_path
    self.use_dropout = use_dropout
    super().__init__()
    self.pretrained_layer = TFAlbertModel.from_pretrained(model_name_path)
    self.dropout_layer = (tf.keras.layers.Dropout(rate=0.1, name="Dropout_layer")
                          if self.use_dropout else None)
def __init__(self, *args, **kwargs):
    # Fixed: do not pass self to __init__ again, and unpack args/kwargs.
    super(AlbertForComparison, self).__init__(*args, **kwargs)
    self.model_name = 'albert-base-v2'
    self.config = AlbertConfig.from_pretrained(self.model_name)
    self.config.output_hidden_states = False
    self.embedding_layer = TFAlbertModel.from_pretrained(self.model_name, config=self.config)
    self.pre_classifier = tf.keras.layers.Dense(
        self.config.hidden_size,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=self.config.initializer_range),
        activation="relu",
        name="pre_classifier",
    )
    self.classifier = tf.keras.layers.Dense(
        1,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=self.config.initializer_range),
        name="classifier")
def initialize_embeddings_from_canonical(config, unique_entity_map, train_samples):
    batch_size = config['embedding_batch_size']
    unique_entities = list(unique_entity_map.values())
    input_data = [unique_entities[i * batch_size:(i + 1) * batch_size]
                  for i in range((len(unique_entities) + batch_size - 1) // batch_size)]
    model = TFAlbertModel.from_pretrained('albert-base-v2')
    embedding_matrix, index_to_eid, eid_to_index, index = [], {}, {}, 0
    for batch in input_data:
        model_inputs = [x.canonical_tokens for x in batch]
        masks = [x.canonical_attention_mask for x in batch]
        embeddings = tf.reduce_mean(
            model(tf.constant(model_inputs), attention_mask=np.array(masks))[0], axis=1)
        for entity, embedding in zip(batch, embeddings):
            entity.set_embedding(embedding)
            eid = entity.entity_id
            embedding_matrix.append(embedding)
            index_to_eid[index] = eid
            eid_to_index[eid] = index
            index += 1
    for train_sample in train_samples:
        train_sample.set_embedding(unique_entity_map[train_sample.entity_id].entity_embedding)
    return index_to_eid, eid_to_index, embedding_matrix
def __init__(self, intent_size, slot_size, lr=1e-4, dropout_rate=0.2, units=300):
    super().__init__()
    # Use an ALBERT checkpoint here; TFAlbertModel cannot load the BERT checkpoint
    # 'bert-base-uncased' that the original snippet referenced.
    self.albert = TFAlbertModel.from_pretrained('albert-base-v2', trainable=True)
    self.inp_dropout = Dropout(dropout_rate)
    self.intent_dropout = Dropout(dropout_rate)
    self.fc_intent = Dense(units, activation='relu')
    # Transition parameters over slot tags (e.g. for a CRF decoding layer).
    self.trans_params = self.add_weight(shape=(slot_size, slot_size))
    self.out_linear_intent = Dense(intent_size)
    self.out_linear_slot = Dense(slot_size)
    self.optimizer = Adam(lr)
    self.slots_accuracy = tf.keras.metrics.Accuracy()
    self.intent_accuracy = tf.keras.metrics.Accuracy()
    self.decay_lr = tf.optimizers.schedules.ExponentialDecay(lr, 1000, 0.95)
    self.logger = logging.getLogger('tensorflow')
    self.logger.setLevel(logging.INFO)
def __init__(self, nb_classes):
    super(AlbertPlusOutputLayer, self).__init__()
    self.albert_embedder = TFAlbertModel.from_pretrained('albert-base-v2')
    self.output_layer = Dense(nb_classes)
def __init__(self):
    super(AlbertEmbedder, self).__init__()
    self.albert_embedder = TFAlbertModel.from_pretrained('albert-base-v2')
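# A minimal end-to-end sketch of using a bare TFAlbertModel as a sentence embedder,
# as in AlbertEmbedder above. The mean-pooling step is an illustrative choice, not
# taken from the original class.
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertModel

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
embedder = TFAlbertModel.from_pretrained('albert-base-v2')
inputs = tokenizer("ALBERT shares parameters across all layers.", return_tensors='tf')
token_states = embedder(inputs)[0]                   # last hidden state: [1, seq_len, 768]
sentence_vec = tf.reduce_mean(token_states, axis=1)  # [1, 768]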
def main(
    fsx_prefix: str,
    model_type: str,
    model_size: str,
    batch_size: int,
    max_seq_length: int,
    gradient_accumulation_steps: int,
    optimizer: str,
    name: str,
    learning_rate: float,
    end_learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    skip_sop: bool,
    skip_mlm: bool,
    pre_layer_norm: bool,
    fast_squad: bool,
    dummy_eval: bool,
    squad_steps: List[int],
    hidden_dropout_prob: float,
):
    # Hard-coded values that don't need to be arguments
    max_predictions_per_seq = 20
    log_frequency = 1000
    checkpoint_frequency = 5000
    validate_frequency = 2000
    histogram_frequency = 100
    do_gradient_accumulation = gradient_accumulation_steps > 1

    if hvd.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "eks" if args.fsx_prefix == "/fsx" else "sm"
        if skip_sop:
            loss_str = "-skipsop"
        elif skip_mlm:
            loss_str = "-skipmlm"
        else:
            loss_str = ""
        amp_str = ("-skipamp" if not tf.config.optimizer.get_experimental_options().get(
            "auto_mixed_precision", False) else "")
        ln_str = "-preln" if pre_layer_norm else "-postln"
        dropout_str = f"-{hidden_dropout_prob}dropout" if hidden_dropout_prob != 0 else ""
        name_str = f"-{name}" if name else ""
        metadata = (f"{model_type}-{model_size}-{args.load_from}-{hvd.size()}gpus"
                    f"-{batch_size}batch-{gradient_accumulation_steps}accum"
                    f"-{learning_rate}lr-{args.max_grad_norm}maxgrad-{optimizer}opt"
                    f"-{total_steps}steps-{max_seq_length}seq"
                    f"{amp_str}{ln_str}{loss_str}{dropout_str}{name_str}")
        run_name = f"{current_time}-{platform}-{metadata}"

        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        handlers = [
            logging.FileHandler(f"{fsx_prefix}/logs/albert/{run_name}.log"),
            logging.StreamHandler(),
        ]
        logging.basicConfig(level=level, format=format, handlers=handlers)

    # Check that arguments were passed in properly, only after registering the alert_func and logging
    assert not (skip_sop and skip_mlm), "Cannot use --skip_sop and --skip_mlm"
    wrap_global_functions(do_gradient_accumulation)

    if model_type == "albert":
        model_desc = f"albert-{model_size}-v2"
    elif model_type == "bert":
        model_desc = f"bert-{model_size}-uncased"

    config = AutoConfig.from_pretrained(model_desc)
    config.pre_layer_norm = pre_layer_norm
    config.output_hidden_states = True
    config.hidden_dropout_prob = hidden_dropout_prob
    model = TFAutoModelForPreTraining.from_config(config)

    if args.load_from == "scratch":
        pass
    else:
        assert model_type == "albert", "Only loading pretrained albert models is supported"
        huggingface_name = f"albert-{model_size}-v2"
        if args.load_from == "huggingface":
            albert = TFAlbertModel.from_pretrained(huggingface_name, config=config)
            model.albert = albert
        elif args.load_from == "huggingfacepreds":
            mlm_model = TFAlbertForMaskedLM.from_pretrained(huggingface_name, config=config)
            model.albert = mlm_model.albert
            model.cls.predictions = mlm_model.predictions

    tokenizer = get_tokenizer()

    schedule = LinearWarmupLinearDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=end_learning_rate,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    if optimizer == "lamb":
        opt = LAMB(
            learning_rate=schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )
    elif optimizer == "adam":
        opt = AdamW(weight_decay=0.0, learning_rate=schedule)
    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
        opt, loss_scale="dynamic")

    gradient_accumulator = GradientAccumulator()

    # Train filenames are [1, 2047]
    # Val filenames are [0]
    # Note the different subdirectories
    train_glob = f"{fsx_prefix}/albert_pretraining/tfrecords/train/max_seq_len_{max_seq_length}_max_predictions_per_seq_{max_predictions_per_seq}_masked_lm_prob_15/albert_*.tfrecord"
    validation_glob = f"{fsx_prefix}/albert_pretraining/tfrecords/validation/max_seq_len_{max_seq_length}_max_predictions_per_seq_{max_predictions_per_seq}_masked_lm_prob_15/albert_*.tfrecord"
    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)

    train_dataset = get_mlm_dataset(
        filenames=train_filenames,
        max_seq_length=max_seq_length,
        max_predictions_per_seq=max_predictions_per_seq,
        batch_size=batch_size,
    )  # Of shape [batch_size, ...]
    # Batch of batches, helpful for gradient accumulation. Shape [grad_steps, batch_size, ...]
    train_dataset = train_dataset.batch(gradient_accumulation_steps)
    # train_dataset = train_dataset.repeat()
    # One iteration with 10 dupes, 8 nodes seems to be 60-70k steps.
    train_dataset = train_dataset.prefetch(buffer_size=8)

    # Validation should only be done on one node, since Horovod doesn't allow allreduce on a subset of ranks
    if hvd.rank() == 0:
        validation_dataset = get_mlm_dataset(
            filenames=validation_filenames,
            max_seq_length=max_seq_length,
            max_predictions_per_seq=max_predictions_per_seq,
            batch_size=batch_size,
        )
        # validation_dataset = validation_dataset.batch(1)
        validation_dataset = validation_dataset.prefetch(buffer_size=8)

    pbar = tqdm.tqdm(total_steps)
    summary_writer = None  # Only create a writer if we make it through a successful step
    if hvd.rank() == 0:
        logger.info(f"Starting training, job name {run_name}")

    for i, batch in enumerate(train_dataset):
        learning_rate = schedule(step=tf.constant(i, dtype=tf.float32))
        loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm = train_step(
            model=model,
            opt=opt,
            gradient_accumulator=gradient_accumulator,
            batch=batch,
            gradient_accumulation_steps=gradient_accumulation_steps,
            skip_sop=skip_sop,
            skip_mlm=skip_mlm,
        )
        # Don't want to wrap broadcast_variables() in a tf.function, can lead to asynchronous errors
        if i == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)

        is_final_step = i >= total_steps - 1
        do_squad = i in squad_steps or is_final_step
        # SQuAD requires all the ranks to train, but results are only returned on rank 0
        if do_squad:
            squad_results = get_squad_results(
                model=model,
                model_size=model_size,
                step=i,
                fast=fast_squad,
                dummy_eval=dummy_eval,
            )
            if hvd.rank() == 0:
                squad_exact, squad_f1 = squad_results["exact"], squad_results["f1"]
                logger.info(f"SQuAD step {i} -- F1: {squad_f1:.3f}, Exact: {squad_exact:.3f}")
            # Re-wrap autograph so it doesn't get arg mismatches
            wrap_global_functions(do_gradient_accumulation)

        if hvd.rank() == 0:
            do_log = i % log_frequency == 0
            do_checkpoint = (i % checkpoint_frequency == 0) or is_final_step
            do_validation = (i % validate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, MLM: {mlm_loss:.3f}, SOP: {sop_loss:.3f}, MLM_acc: {mlm_acc:.3f}, SOP_acc: {sop_acc:.3f}"
            pbar.set_description(description)
            if do_log:
                logger.info(f"Train step {i} -- {description}")

            if do_checkpoint:
                checkpoint_path = f"{fsx_prefix}/checkpoints/albert/{run_name}-step{i}.ckpt"
                logger.info(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)
                # model.load_weights(checkpoint_path)

            if do_validation:
                val_loss, val_mlm_loss, val_mlm_acc, val_sop_loss, val_sop_acc = run_validation(
                    model=model,
                    validation_dataset=validation_dataset,
                    skip_sop=skip_sop,
                    skip_mlm=skip_mlm,
                )
                description = f"Loss: {val_loss:.3f}, MLM: {val_mlm_loss:.3f}, SOP: {val_sop_loss:.3f}, MLM_acc: {val_mlm_acc:.3f}, SOP_acc: {val_sop_acc:.3f}"
                logger.info(f"Validation step {i} -- {description}")

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{fsx_prefix}/logs/albert/{run_name}")

            # Log to TensorBoard
            weight_norm = tf.math.sqrt(
                tf.math.reduce_sum([
                    tf.norm(var, ord=2) ** 2 for var in model.trainable_variables
                ]))
            with summary_writer.as_default():
                tf.summary.scalar("weight_norm", weight_norm, step=i)
                tf.summary.scalar("learning_rate", learning_rate, step=i)
                tf.summary.scalar("train_loss", loss, step=i)
                tf.summary.scalar("train_mlm_loss", mlm_loss, step=i)
                tf.summary.scalar("train_mlm_acc", mlm_acc, step=i)
                tf.summary.scalar("train_sop_loss", sop_loss, step=i)
                tf.summary.scalar("train_sop_acc", sop_acc, step=i)
                tf.summary.scalar("grad_norm", grad_norm, step=i)
                if do_validation:
                    tf.summary.scalar("val_loss", val_loss, step=i)
                    tf.summary.scalar("val_mlm_loss", val_mlm_loss, step=i)
                    tf.summary.scalar("val_mlm_acc", val_mlm_acc, step=i)
                    tf.summary.scalar("val_sop_loss", val_sop_loss, step=i)
                    tf.summary.scalar("val_sop_acc", val_sop_acc, step=i)
                if do_squad:
                    tf.summary.scalar("squad_f1", squad_f1, step=i)
                    tf.summary.scalar("squad_exact", squad_exact, step=i)

        if is_final_step:
            break

    if hvd.rank() == 0:
        pbar.close()
        logger.info(f"Finished pretraining, job name {run_name}")
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, LoggingArguments))
    model_args, data_args, train_args, log_args = parser.parse_args_into_dataclasses()

    tf.random.set_seed(train_args.seed)
    tf.autograph.set_verbosity(0)

    # Settings init
    parse_bool = lambda arg: arg == "true"
    do_gradient_accumulation = train_args.gradient_accumulation_steps > 1
    do_xla = not parse_bool(train_args.skip_xla)
    do_eager = parse_bool(train_args.eager)
    skip_sop = parse_bool(train_args.skip_sop)
    skip_mlm = parse_bool(train_args.skip_mlm)
    pre_layer_norm = parse_bool(model_args.pre_layer_norm)
    fast_squad = parse_bool(log_args.fast_squad)
    dummy_eval = parse_bool(log_args.dummy_eval)
    squad_steps = get_squad_steps(log_args.extra_squad_steps)
    is_sagemaker = data_args.fsx_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker
    global max_grad_norm
    max_grad_norm = train_args.max_grad_norm

    # Horovod init
    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    # XLA, AutoGraph
    tf.config.optimizer.set_jit(do_xla)
    tf.config.experimental_run_functions_eagerly(do_eager)

    if hvd.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "sm" if is_sagemaker else "eks"
        if skip_sop:
            loss_str = "-skipsop"
        elif skip_mlm:
            loss_str = "-skipmlm"
        else:
            loss_str = ""
        metadata = (f"{model_args.model_type}"
                    f"-{model_args.model_size}"
                    f"-{model_args.load_from}"
                    f"-{hvd.size()}gpus"
                    f"-{train_args.batch_size}batch"
                    f"-{train_args.gradient_accumulation_steps}accum"
                    f"-{train_args.learning_rate}maxlr"
                    f"-{train_args.end_learning_rate}endlr"
                    f"-{train_args.learning_rate_decay_power}power"
                    f"-{train_args.max_grad_norm}maxgrad"
                    f"-{train_args.optimizer}opt"
                    f"-{train_args.total_steps}steps"
                    f"-{data_args.max_seq_length}seq"
                    f"-{data_args.max_predictions_per_seq}preds"
                    f"-{'preln' if pre_layer_norm else 'postln'}"
                    f"{loss_str}"
                    f"-{model_args.hidden_dropout_prob}dropout"
                    f"-{train_args.seed}seed")
        run_name = f"{current_time}-{platform}-{metadata}-{train_args.name if train_args.name else 'unnamed'}"

        # Logging should only happen on a single process
        # https://stackoverflow.com/questions/9321741/printing-to-screen-and-writing-to-a-file-at-the-same-time
        level = logging.INFO
        format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
        handlers = [
            logging.FileHandler(f"{data_args.fsx_prefix}/logs/albert/{run_name}.log"),
            TqdmLoggingHandler(),
        ]
        logging.basicConfig(level=level, format=format, handlers=handlers)

    # Check that arguments were passed in properly, only after registering the alert_func and logging
    assert not (skip_sop and skip_mlm), "Cannot use --skip_sop and --skip_mlm"
    wrap_global_functions(do_gradient_accumulation)

    if model_args.model_type == "albert":
        model_desc = f"albert-{model_args.model_size}-v2"
    elif model_args.model_type == "bert":
        model_desc = f"bert-{model_args.model_size}-uncased"

    config = AutoConfig.from_pretrained(model_desc)
    config.pre_layer_norm = pre_layer_norm
    config.hidden_dropout_prob = model_args.hidden_dropout_prob
    model = TFAutoModelForPreTraining.from_config(config)

    # Create optimizer and enable AMP loss scaling.
    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=train_args.learning_rate,
        end_learning_rate=train_args.end_learning_rate,
        warmup_steps=train_args.warmup_steps,
        total_steps=train_args.total_steps,
        power=train_args.learning_rate_decay_power,
    )
    if train_args.optimizer == "lamb":
        opt = LAMB(
            learning_rate=schedule,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
        )
    elif train_args.optimizer == "adam":
        opt = AdamW(weight_decay=0.0, learning_rate=schedule)
    opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        opt, loss_scale="dynamic")

    gradient_accumulator = GradientAccumulator()

    loaded_opt_weights = None
    if model_args.load_from == "scratch":
        pass
    elif model_args.load_from.startswith("huggingface"):
        assert model_args.model_type == "albert", "Only loading pretrained albert models is supported"
        huggingface_name = f"albert-{model_args.model_size}-v2"
        if model_args.load_from == "huggingface":
            albert = TFAlbertModel.from_pretrained(huggingface_name, config=config)
            model.albert = albert
    else:
        model_ckpt, opt_ckpt = get_checkpoint_paths_from_prefix(model_args.checkpoint_path)
        model = TFAutoModelForPreTraining.from_config(config)
        if hvd.rank() == 0:
            model.load_weights(model_ckpt)
            loaded_opt_weights = np.load(opt_ckpt, allow_pickle=True)
            # We do not set the weights yet; we have to do a first step to initialize the optimizer.

    # Train filenames are [1, 2047], Val filenames are [0]. Note the different subdirectories
    # Move to the same folder structure and remove this if/else
    if model_args.model_type == "albert":
        train_glob = f"{data_args.fsx_prefix}/albert_pretraining/tfrecords/train/max_seq_len_{data_args.max_seq_length}_max_predictions_per_seq_{data_args.max_predictions_per_seq}_masked_lm_prob_15/albert_*.tfrecord"
        validation_glob = f"{data_args.fsx_prefix}/albert_pretraining/tfrecords/validation/max_seq_len_{data_args.max_seq_length}_max_predictions_per_seq_{data_args.max_predictions_per_seq}_masked_lm_prob_15/albert_*.tfrecord"
    if model_args.model_type == "bert":
        train_glob = f"{data_args.fsx_prefix}/bert_pretraining/max_seq_len_{data_args.max_seq_length}_max_predictions_per_seq_{data_args.max_predictions_per_seq}_masked_lm_prob_15/training/*.tfrecord"
        validation_glob = f"{data_args.fsx_prefix}/bert_pretraining/max_seq_len_{data_args.max_seq_length}_max_predictions_per_seq_{data_args.max_predictions_per_seq}_masked_lm_prob_15/validation/*.tfrecord"

    train_filenames = glob.glob(train_glob)
    validation_filenames = glob.glob(validation_glob)

    train_dataset = get_mlm_dataset(
        filenames=train_filenames,
        max_seq_length=data_args.max_seq_length,
        max_predictions_per_seq=data_args.max_predictions_per_seq,
        batch_size=train_args.batch_size,
    )  # Of shape [batch_size, ...]
    # Batch of batches, helpful for gradient accumulation. Shape [grad_steps, batch_size, ...]
    train_dataset = train_dataset.batch(train_args.gradient_accumulation_steps)
    # One iteration with 10 dupes, 8 nodes seems to be 60-70k steps.
    train_dataset = train_dataset.prefetch(buffer_size=8)

    # Validation should only be done on one node, since Horovod doesn't allow allreduce on a subset of ranks
    if hvd.rank() == 0:
        validation_dataset = get_mlm_dataset(
            filenames=validation_filenames,
            max_seq_length=data_args.max_seq_length,
            max_predictions_per_seq=data_args.max_predictions_per_seq,
            batch_size=train_args.batch_size,
        )
        # validation_dataset = validation_dataset.batch(1)
        validation_dataset = validation_dataset.prefetch(buffer_size=8)
        pbar = tqdm.tqdm(train_args.total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        logger.info(f"Starting training, job name {run_name}")

    i = 0
    start_time = time.perf_counter()
    for batch in train_dataset:
        learning_rate = schedule(step=tf.constant(i, dtype=tf.float32))
        loss_scale = opt.loss_scale()
        loss, mlm_loss, mlm_acc, sop_loss, sop_acc, grad_norm, weight_norm = train_step(
            model=model,
            opt=opt,
            gradient_accumulator=gradient_accumulator,
            batch=batch,
            gradient_accumulation_steps=train_args.gradient_accumulation_steps,
            skip_sop=skip_sop,
            skip_mlm=skip_mlm,
        )

        # Don't want to wrap broadcast_variables() in a tf.function, can lead to asynchronous errors
        if i == 0:
            if hvd.rank() == 0 and loaded_opt_weights is not None:
                opt.set_weights(loaded_opt_weights)
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(opt.variables(), root_rank=0)
            i = opt.get_weights()[0] - 1

        is_final_step = i >= train_args.total_steps - 1
        do_squad = i in squad_steps or is_final_step
        # SQuAD requires all the ranks to train, but results are only returned on rank 0
        if do_squad:
            squad_results = get_squad_results_while_pretraining(
                model=model,
                model_size=model_args.model_size,
                fsx_prefix=data_args.fsx_prefix,
                step=i,
                fast=log_args.fast_squad,
                dummy_eval=log_args.dummy_eval,
            )
            if hvd.rank() == 0:
                squad_exact, squad_f1 = squad_results["exact"], squad_results["f1"]
                logger.info(f"SQuAD step {i} -- F1: {squad_f1:.3f}, Exact: {squad_exact:.3f}")
            # Re-wrap autograph so it doesn't get arg mismatches
            wrap_global_functions(do_gradient_accumulation)

        if hvd.rank() == 0:
            do_log = i % log_args.log_frequency == 0
            do_checkpoint = ((i > 0) and (i % log_args.checkpoint_frequency == 0)) or is_final_step
            do_validation = ((i > 0) and (i % log_args.validation_frequency == 0)) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, MLM: {mlm_loss:.3f}, SOP: {sop_loss:.3f}, MLM_acc: {mlm_acc:.3f}, SOP_acc: {sop_acc:.3f}"
            pbar.set_description(description)
            if do_log:
                elapsed_time = time.perf_counter() - start_time
                if i == 0:
                    logger.info(f"First step: {elapsed_time:.3f} secs")
                else:
                    it_per_sec = log_args.log_frequency / elapsed_time
                    logger.info(f"Train step {i} -- {description} -- It/s: {it_per_sec:.2f}")
                start_time = time.perf_counter()

            if do_checkpoint:
                checkpoint_prefix = f"{data_args.fsx_prefix}/checkpoints/albert/{run_name}-step{i}"
                model_ckpt = f"{checkpoint_prefix}.ckpt"
                opt_ckpt = f"{checkpoint_prefix}-opt.npy"
                logger.info(f"Saving model at {model_ckpt}, optimizer at {opt_ckpt}")
                model.save_weights(model_ckpt)
                # model.load_weights(model_ckpt)
                opt_weights = opt.get_weights()
                np.save(opt_ckpt, opt_weights)
                # opt.set_weights(opt_weights)

            if do_validation:
                val_loss, val_mlm_loss, val_mlm_acc, val_sop_loss, val_sop_acc = run_validation(
                    model=model,
                    validation_dataset=validation_dataset,
                    skip_sop=skip_sop,
                    skip_mlm=skip_mlm,
                )
                description = f"Loss: {val_loss:.3f}, MLM: {val_mlm_loss:.3f}, SOP: {val_sop_loss:.3f}, MLM_acc: {val_mlm_acc:.3f}, SOP_acc: {val_sop_acc:.3f}"
                logger.info(f"Validation step {i} -- {description}")

            # Create summary_writer after the first step
            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{data_args.fsx_prefix}/logs/albert/{run_name}")
                with summary_writer.as_default():
                    HP_MODEL_TYPE = hp.HParam("model_type", hp.Discrete(["albert", "bert"]))
                    HP_MODEL_SIZE = hp.HParam("model_size", hp.Discrete(["base", "large"]))
                    HP_LEARNING_RATE = hp.HParam("learning_rate", hp.RealInterval(1e-5, 1e-1))
                    HP_BATCH_SIZE = hp.HParam("global_batch_size", hp.IntInterval(1, 64))
                    HP_PRE_LAYER_NORM = hp.HParam("pre_layer_norm", hp.Discrete([True, False]))
                    HP_HIDDEN_DROPOUT = hp.HParam("hidden_dropout")
                    hparams = [
                        HP_MODEL_TYPE,
                        HP_MODEL_SIZE,
                        HP_BATCH_SIZE,
                        HP_LEARNING_RATE,
                        HP_PRE_LAYER_NORM,
                        HP_HIDDEN_DROPOUT,
                    ]
                    HP_F1 = hp.Metric("squad_f1")
                    HP_EXACT = hp.Metric("squad_exact")
                    HP_MLM = hp.Metric("val_mlm_acc")
                    HP_SOP = hp.Metric("val_sop_acc")
                    HP_TRAIN_LOSS = hp.Metric("train_loss")
                    HP_VAL_LOSS = hp.Metric("val_loss")
                    metrics = [HP_TRAIN_LOSS, HP_VAL_LOSS, HP_F1, HP_EXACT, HP_MLM, HP_SOP]
                    hp.hparams_config(
                        hparams=hparams,
                        metrics=metrics,
                    )
                    hp.hparams(
                        {
                            HP_MODEL_TYPE: model_args.model_type,
                            HP_MODEL_SIZE: model_args.model_size,
                            HP_LEARNING_RATE: train_args.learning_rate,
                            HP_BATCH_SIZE: train_args.batch_size * hvd.size(),
                            HP_PRE_LAYER_NORM: model_args.pre_layer_norm == "true",
                            HP_HIDDEN_DROPOUT: model_args.hidden_dropout_prob,
                        },
                        trial_id=run_name,
                    )

            # Log to TensorBoard
            with summary_writer.as_default():
                tf.summary.scalar("weight_norm", weight_norm, step=i)
                tf.summary.scalar("loss_scale", loss_scale, step=i)
                tf.summary.scalar("learning_rate", learning_rate, step=i)
                tf.summary.scalar("train_loss", loss, step=i)
                tf.summary.scalar("train_mlm_loss", mlm_loss, step=i)
                tf.summary.scalar("train_mlm_acc", mlm_acc, step=i)
                tf.summary.scalar("train_sop_loss", sop_loss, step=i)
                tf.summary.scalar("train_sop_acc", sop_acc, step=i)
                tf.summary.scalar("grad_norm", grad_norm, step=i)
                if do_validation:
                    tf.summary.scalar("val_loss", val_loss, step=i)
                    tf.summary.scalar("val_mlm_loss", val_mlm_loss, step=i)
                    tf.summary.scalar("val_mlm_acc", val_mlm_acc, step=i)
                    tf.summary.scalar("val_sop_loss", val_sop_loss, step=i)
                    tf.summary.scalar("val_sop_acc", val_sop_acc, step=i)
                if do_squad:
                    tf.summary.scalar("squad_f1", squad_f1, step=i)
                    tf.summary.scalar("squad_exact", squad_exact, step=i)

        i += 1
        if is_final_step:
            break

    if hvd.rank() == 0:
        pbar.close()
        logger.info(f"Finished pretraining, job name {run_name}")
import re

import nmslib
import pandas as pd
import tensorflow as tf  # added: tf is used below but was not imported in the original
from transformers import AlbertConfig, AlbertTokenizer, TFAlbertModel

albert_tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
config = AlbertConfig.from_pretrained('./albert', output_hidden_states=True)
model = TFAlbertModel.from_pretrained('./albert', config=config, from_pt=True)

df = pd.read_csv('final_search.csv')
search_index = nmslib.init(method='hnsw', space='cosinesimil')
search_index.loadIndex('./final.nmslib')


def search(query):
    e = albert_tokenizer.encode(query.lower())
    input = tf.constant(e)[None, :]
    output = model(input)
    # Sum the [CLS] hidden state over the last 12 layers; output[2] holds all
    # hidden states because output_hidden_states=True. (The original source is
    # truncated after this loop.)
    v = [0] * 768
    for i in range(-1, -13, -1):
        v = v + output[2][i][0][0]
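# search() above is truncated after the query vector v is built. A plausible
# continuation (hypothetical, not from the original source) queries the loaded
# nmslib index with v and maps the neighbor ids back to rows of the search dataframe.
def search_completed(query, k=10):
    e = albert_tokenizer.encode(query.lower())
    output = model(tf.constant(e)[None, :])
    v = [0] * 768
    for i in range(-1, -13, -1):
        v = v + output[2][i][0][0]
    # knnQuery returns (ids, distances) for the k nearest neighbors.
    ids, distances = search_index.knnQuery(v.numpy(), k=k)
    return df.iloc[list(ids)]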
def get_albert():
    ids = keras.layers.Input(shape=(None,), dtype=tf.int32, name='ids')
    att = keras.layers.Input(shape=(None,), dtype=tf.int32, name='att')
    tok_type_ids = keras.layers.Input(shape=(None,), dtype=tf.int32, name='tti')

    config = AlbertConfig.from_pretrained(Config.Albert.config)
    config.output_hidden_states = True
    albert_model = TFAlbertModel.from_pretrained(Config.Albert.model, config=config)
    # x holds all hidden states because output_hidden_states=True.
    _, _, x = albert_model(ids, attention_mask=att, token_type_ids=tok_type_ids)

    # Start-token head: conv stack with residual connections to earlier hidden states.
    x1 = keras.layers.Dropout(0.15)(x[-1])
    x1 = keras.layers.Conv1D(768, 2, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.add([x1, x[-2]])
    x1 = keras.layers.Conv1D(768, 5, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.add([x1, x[-3]])
    x1 = keras.layers.Conv1D(768, 8, padding='same')(x1)
    x1 = keras.layers.LeakyReLU()(x1)
    x1 = keras.layers.LayerNormalization()(x1)
    x1 = keras.layers.add([x1, x[-4]])
    x1 = keras.layers.Dense(1)(x1)
    x1 = keras.layers.Flatten()(x1)
    x1 = keras.layers.Activation('softmax', dtype='float32', name='sts')(x1)

    # End-token head: same structure as the start-token head.
    x2 = keras.layers.Dropout(0.15)(x[-1])
    x2 = keras.layers.Conv1D(768, 2, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.add([x2, x[-2]])
    x2 = keras.layers.Conv1D(768, 5, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.add([x2, x[-3]])
    x2 = keras.layers.Conv1D(768, 8, padding='same')(x2)
    x2 = keras.layers.LeakyReLU()(x2)
    x2 = keras.layers.LayerNormalization()(x2)
    x2 = keras.layers.add([x2, x[-4]])
    x2 = keras.layers.Dense(1)(x2)
    x2 = keras.layers.Flatten()(x2)
    x2 = keras.layers.Activation('softmax', dtype='float32', name='ets')(x2)

    model = keras.models.Model(inputs=[ids, att, tok_type_ids], outputs=[x1, x2])

    optimizer = keras.optimizers.Adam(learning_rate=6e-5)
    if Config.Train.use_amp:
        optimizer = keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, 'dynamic')
    loss = keras.losses.CategoricalCrossentropy(label_smoothing=Config.Train.label_smoothing)
    model.compile(loss=loss, optimizer=optimizer)
    return model
import os
import sys

sys.path.insert(0, os.path.abspath('../'))

from transformers import AlbertModel, TFAlbertModel
from tokenization_kbalbert import KbAlbertCharTokenizer

kb_albert_model_path = '../kb-albert-model'
# Korean sample input, roughly: "Bancassurance was introduced in response to the
# trend toward diversification in finance, to advance the financial industry, and
# for the convenience of financial consumers."
text = '방카슈랑스는 금융의 겸업화 추세에 부응하여 금융산업의 선진화를 도모하고 금융소비자의 편익을 위하여 도입되었습니다.'

tokenizer = KbAlbertCharTokenizer.from_pretrained(kb_albert_model_path)

# PyTorch
pt_model = AlbertModel.from_pretrained(kb_albert_model_path)
pt_inputs = tokenizer(text, return_tensors='pt')
pt_outputs = pt_model(**pt_inputs)[0]
print(pt_outputs)

# TensorFlow 2.0
tf_model = TFAlbertModel.from_pretrained(kb_albert_model_path)
tf_inputs = tokenizer(text, return_tensors='tf')
tf_outputs = tf_model(tf_inputs)[0]
print(tf_outputs)
def _build_albert_from_transformers(self):
    from transformers import TFAlbertModel
    model = TFAlbertModel.from_pretrained(
        os.path.join(BASE_DIR, 'albert_base_zh_pytorch'), from_pt=True)
    return model
import os

from transformers import (AlbertForMaskedLM, TFAlbertForMaskedLM, TFAlbertForSequenceClassification,
                          TFAlbertModel)

checkpoint = "albert-base-v1"
# Expand '~' explicitly; os.path.exists/makedirs treat a bare "~" as a literal directory name.
save_dir = os.path.expanduser("~/saved/" + checkpoint)

model = AlbertForMaskedLM.from_pretrained(checkpoint)
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
model.save_pretrained(save_dir)

# Round-trip: load the PyTorch weights into TF, save, then reload in several TF head classes.
model = TFAlbertForMaskedLM.from_pretrained(save_dir, from_pt=True)
model.save_pretrained(save_dir)
model = TFAlbertModel.from_pretrained(save_dir)
model = TFAlbertForMaskedLM.from_pretrained(save_dir)
model = TFAlbertForSequenceClassification.from_pretrained(save_dir)
print("nice model")