def get_model(args):
    if args.model_size == 'debug':
        num_hidden_layers = 1
        embedding_size = 8
        hidden_size = 16
        num_hidden_groups = 1
        intermediate_size = 32
        num_attention_heads = 2
        args.gen_ratio = 2
    elif args.model_size == 'small':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 256
        num_hidden_groups = 1
        intermediate_size = 1024
        num_attention_heads = 4
    elif args.model_size == 'base':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 768
        num_hidden_groups = 1
        intermediate_size = 3072
        num_attention_heads = 12
    else:
        raise Exception('Which model? debug, small, base')

    # The generator is a width-scaled copy of the discriminator: hidden size,
    # intermediate size, and head count are all divided by args.gen_ratio.
    generator_config = AlbertConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        num_hidden_groups=num_hidden_groups,
        hidden_size=hidden_size // args.gen_ratio,
        intermediate_size=intermediate_size // args.gen_ratio,
        num_attention_heads=num_attention_heads // args.gen_ratio,
    )
    discriminator_config = AlbertConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        num_hidden_groups=num_hidden_groups,
        hidden_size=hidden_size,
        intermediate_size=intermediate_size,
        num_attention_heads=num_attention_heads,
    )
    model = Electra(args, gen_config=generator_config, dis_config=discriminator_config)
    return model
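# Hedged usage sketch (not from the source): get_model expects an argparse-style
# namespace; the attribute names below are exactly those the function reads, but
# the concrete values, including gen_ratio, are illustrative assumptions.
from types import SimpleNamespace

args = SimpleNamespace(model_size='small', seq_length=128, vocab_size=30522, gen_ratio=4)
electra = get_model(args)  # generator width: hidden 256 // 4 = 64, heads 4 // 4 = 1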
def __init__(self, args, token_vocab_size, output_dim=100):
    super(LMCDecoderBERT, self).__init__()
    self.pool_layers = args.pool_bert
    if args.debug_model:
        bert_dim = 100
        num_hidden_layers = 1
        embedding_size = 100
        intermediate_size = 100
        output_dim = 100
    else:
        bert_dim = 256
        num_hidden_layers = 2
        embedding_size = 128
        intermediate_size = 256
    num_attention_heads = max(1, bert_dim // 64)
    print('Using {} attention heads in decoder'.format(num_attention_heads))
    config = AlbertConfig(
        vocab_size=token_vocab_size,
        embedding_size=embedding_size,
        hidden_size=bert_dim,
        num_hidden_layers=num_hidden_layers,
        intermediate_size=intermediate_size,  # 3072 is the default
        num_attention_heads=num_attention_heads,
        output_hidden_states=self.pool_layers,
    )
    self.bert = AlbertModel(config)
    self.u = nn.Linear(bert_dim, output_dim, bias=True)
    self.v = nn.Linear(bert_dim, 1, bias=True)
    self.att_linear = nn.Linear(bert_dim, 1, bias=True)
    self.dropout = nn.Dropout(0.2)
def from_hocon(cls: Type[QueryCodeSiamese], config: ConfigTree) -> QueryCodeSiamese:
    """Load QueryCodeSiamese from a config tree."""
    if "training.model.encoder.type" in config:
        if config["training.model.encoder.type"] == "albert":
            logger.info("Creating QueryCodeSiamese with Albert encoder")
            albert_config = AlbertConfig(**config["training.model.encoder"])
            encoder = PreTrainedModelRecordable(AlbertModel(albert_config))
        elif config["training.model.encoder.type"] == "bert":
            logger.info("Creating QueryCodeSiamese with Bert encoder")
            bert_config = BertConfig(**config["training.model.encoder"])
            encoder = PreTrainedModelRecordable(BertModel(bert_config))
        else:
            # default is BERT now
            logger.info("Creating QueryCodeSiamese with Bert encoder")
            bert_config = BertConfig(**config["training.model.encoder"])
            encoder = PreTrainedModelRecordable(BertModel(bert_config))

    model = QueryCodeSiamese(
        encoder=encoder,
        pooler=MeanWeightedPooler(
            input_size=config["training.model.encoder.hidden_size"]))
    return model
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    attention_mask = None
    if self.use_attention_mask:
        attention_mask = random_attention_mask([self.batch_size, self.seq_length])

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length],
                                    self.type_vocab_size)

    config = AlbertConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        is_decoder=False,
        initializer_range=self.initializer_range,
    )
    return config, input_ids, token_type_ids, attention_mask
def __init__(self, albert_name="ALBERT-base", device="cuda"):
    super().__init__()
    if albert_name == "ALBERT-base":
        albert_configuration = AlbertConfig(hidden_size=768,
                                            num_attention_heads=12,
                                            intermediate_size=3072)
    elif albert_name == "ALBERT-xxlarge":
        albert_configuration = AlbertConfig()
    else:
        raise ValueError(f"Unknown ALBERT variant: {albert_name}")
    self.device = device
    self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    # Note: the pretrained albert-base-v2 weights are loaded regardless of
    # albert_configuration, which is constructed above but never used.
    self.model = AlbertModel.from_pretrained('albert-base-v2').to(self.device)
    self.linear = nn.Linear(self.model.config.embedding_size, 2).to(self.device)
    self.dropout = nn.Dropout(0.1).to(self.device)
def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(model, config, args.checkpoint)
    model.save_pretrained(args.output)
def prediction():
    # data = input('Please enter test data: ')
    data = "Don't give me your attitude!"
    print(data)
    tokenized_data = tokenizer.tokenize(data)
    tokenized_data.insert(0, "[CLS]")
    tokenized_data.append("[SEP]")
    data_indexed = tokenizer.convert_tokens_to_ids(tokenized_data)
    data = torch.from_numpy(np.array(data_indexed)).to(device)
    data = data.unsqueeze(0)  # [1, seq_length]
    config = AlbertConfig(hidden_size=768)
    model = ALBertClassifyModel(config, num_class=2, fc_dropout=DROPOUT)
    model.load_state_dict(torch.load(SAVE_MODEL_PATH))
    model.to(device)
    model.eval()
    softmax = nn.Softmax(dim=1)
    with torch.no_grad():
        predict = model(data)
        predict_softmax = softmax(predict)
        print(predict_softmax)
        predict = torch.argmax(predict_softmax, dim=1)
        print(predict)
def prepare_config_and_inputs(self):
    input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = ids_tensor([self.batch_size, self.seq_length],
                                    self.type_vocab_size)

    sequence_labels = None
    token_labels = None
    choice_labels = None
    if self.use_labels:
        sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
        token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
        choice_labels = ids_tensor([self.batch_size], self.num_choices)

    config = AlbertConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range,
        return_dict=True,
    )
    return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
def __init__(
    self,
    pretrained_model_name=None,
    config_filename=None,
    vocab_size=None,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    max_position_embeddings=512,
):
    super().__init__()

    # Check that exactly one of pretrained_model_name, config_filename, and
    # vocab_size was passed in.
    total = 0
    if pretrained_model_name is not None:
        total += 1
    if config_filename is not None:
        total += 1
    if vocab_size is not None:
        total += 1
    if total != 1:
        raise ValueError(
            "Only one of pretrained_model_name, vocab_size, "
            "or config_filename should be passed into the "
            "ALBERT constructor."
        )

    # Exactly one option is set at this point, so no trailing else is needed.
    if vocab_size is not None:
        config = AlbertConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            intermediate_size=intermediate_size,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
        )
        model = AlbertModel(config)
    elif pretrained_model_name is not None:
        model = AlbertModel.from_pretrained(pretrained_model_name)
    else:  # config_filename is not None
        config = AlbertConfig.from_json_file(config_filename)
        model = AlbertModel(config)

    model.to(self._device)
    self.add_module("albert", model)
    self.config = model.config
    self._hidden_size = model.config.hidden_size
def __init__(self, bert_model='bert-base-cased'):
    super(BERTRepresenter, self).__init__()
    if 'albert' in bert_model.lower():
        # from_pretrained is a classmethod, so building AlbertModel(AlbertConfig())
        # first only created weights that were immediately discarded.
        self.bert = AlbertModel.from_pretrained(bert_model)
    else:
        # config = BertConfig(vocab_size=24000, hidden_size=264)
        self.bert = BertModel.from_pretrained(bert_model)
def load_tokenizer_model(ckpt):
    state = torch.load(ckpt, map_location=torch.device('cpu'))
    tokenizer = NGRAMTokenizer(state['ngram'])
    config = AlbertConfig(**state['config_dict'])
    model = Consonant(config)
    model.load_state_dict(state['model_state_dict'])
    # Checkpoint names follow 'ckpt-<step>.bin', so recover the step number.
    step = int(ckpt.split('-')[-1].split('.')[0])
    return tokenizer, model, state['ngram'], step
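# Hedged usage sketch (not from the source): the checkpoint name is assumed to
# follow the 'ckpt-<step>.bin' pattern that the step-parsing line above expects.
tokenizer, model, ngram, step = load_tokenizer_model('ckpt-0078000.bin')
print(step)  # 78000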
def main(args):
    with open(args.config) as fp:
        data = json.loads(fp.read())
    config = AlbertConfig(**data)
    model = AlbertForMaskedLM(config)
    model: AlbertForMaskedLM = load_tf_weights_in_albert(model, config, args.checkpoint)
    model.save_pretrained(args.output)
    tokenizer = AlbertTokenizer.from_pretrained(args.spiece, keep_accents=True)
    tokenizer.save_pretrained(args.output)
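# Hedged wiring sketch (hypothetical, not from the source): main() reads
# args.config, args.checkpoint, args.spiece, and args.output, so an argparse
# entry point consistent with those attribute names could look like this.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Convert a TF ALBERT checkpoint to PyTorch.')
    parser.add_argument('--config', required=True, help='AlbertConfig JSON file')
    parser.add_argument('--checkpoint', required=True, help='TensorFlow checkpoint path')
    parser.add_argument('--spiece', required=True, help='SentencePiece model for the tokenizer')
    parser.add_argument('--output', required=True, help='directory passed to save_pretrained')
    main(parser.parse_args())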
def __init__(self, in_dim, hidden_dim, out_dim, num_heads, num_classes=2):
    super(Summarizer, self).__init__()
    albert_base_configuration = AlbertConfig(
        hidden_size=256,
        num_attention_heads=4,
        intermediate_size=1024,
    )
    self.tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    # Note: the embedder is built from a fresh config, so it starts with random
    # weights; only the tokenizer comes from the pretrained albert-base-v2.
    self.embedder = AlbertModel(albert_base_configuration)
    self.gat_classifier = GATClassifier(in_dim, hidden_dim, out_dim, num_heads,
                                        num_classes)
def __init__(
    self,
    d_emb: int,
    d_ff: int,
    d_model: int,
    dropout: float,
    max_seq_len: int,
    num_attention_heads: int,
    num_class: int,
    num_hidden_layers: int,
    type_vocab_size: int,
    vocab_size: int,
):
    super().__init__()
    # Construct ALBERT model.
    self.encoder = AlbertModel(AlbertConfig(
        attention_probs_dropout_prob=dropout,
        classifier_dropout_prob=dropout,
        embedding_size=d_emb,
        hidden_dropout_prob=dropout,
        hidden_size=d_model,
        initializer_range=0.02,
        inner_group_num=1,
        intermediate_size=d_ff,
        layer_norm_eps=1e-12,
        max_position_embeddings=max_seq_len,
        num_hidden_layers=num_hidden_layers,
        num_hidden_groups=1,
        num_attention_heads=num_attention_heads,
        type_vocab_size=type_vocab_size,
        vocab_size=vocab_size
    ))
    # Dropout layer between encoder and linear layer.
    self.dropout = nn.Dropout(dropout)
    # Linear layer projecting from `d_model` to `num_class`.
    self.linear_layer = nn.Linear(
        in_features=d_model,
        out_features=num_class
    )
    # Linear layer initialization.
    with torch.no_grad():
        nn.init.normal_(
            self.linear_layer.weight,
            mean=0.0,
            std=0.02
        )
        nn.init.zeros_(self.linear_layer.bias)
def __init__(self, vocab_size, max_len) -> None:
    super().__init__(
        AlbertConfig(
            vocab_size=vocab_size,
            hidden_size=512,
            num_attention_heads=8,
            num_hidden_layers=4,
            intermediate_size=1024,
            embedding_size=128,
            max_position_embeddings=max_len
        )
    )
def get_config(self):
    return AlbertConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range,
        num_hidden_groups=self.num_hidden_groups,
    )
def train():
    train_sentences, train_labels = generate_data(FILE_PATH + 'Train_v1.txt', MAX_SEQ_LENGTH)
    test_sentences, test_labels = generate_data(FILE_PATH + 'Test_v1.txt', MAX_SEQ_LENGTH)
    config = AlbertConfig(hidden_size=768)
    model = ALBertClassifyModel(config, num_class=2, fc_dropout=DROPOUT)
    model.to(device)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    start = time.time()
    try:
        for epoch in range(EPOCHS):
            model.train()
            total_loss = 0.
            batch_num = len(train_sentences) // BATCH_SIZE
            batch = generate_batch(train_sentences, train_labels, BATCH_SIZE)
            for i in range(batch_num):
                data_batch, labels_batch = next(batch)  # labels_batch: [batch_size]
                outputs = model(data_batch)
                outputs = outputs.view(-1, outputs.shape[-1])  # [batch_size, class]
                optimizer.zero_grad()
                loss = criterion(outputs, labels_batch)
                total_loss += (loss.cpu().item() * BATCH_SIZE)
                loss.backward()
                optimizer.step()
            f1_score_test, accuracy_test = test(model, test_sentences, test_labels, BATCH_SIZE)
            print('epoch %d, loss_train %.4f, accuracy_test %.4f, f1_score_test %.4f, time %.2fmin' %
                  (epoch + 1, total_loss / (batch_num * BATCH_SIZE), accuracy_test,
                   f1_score_test, (time.time() - start) / 60))
            torch.save(model.state_dict(), SAVE_MODEL_PATH)
    except KeyboardInterrupt:  # Ctrl + C
        print('External interrupt detected; training stopped and the model has been saved.')
        path = './albert_model/epoch_' + str(epoch) + '_epochbert_model.pth'
        torch.save(model.state_dict(), path)
def initialize(self, ctx):
    torch.set_num_threads(1)
    self.manifest = ctx.manifest
    properties = ctx.system_properties
    model_dir = properties.get("model_dir")
    serialized_file = self.manifest['model']['serializedFile']
    model_pt_path = os.path.join(model_dir, serialized_file)
    setup_config_path = os.path.join(model_dir, "setup_config.json")
    if os.path.isfile(setup_config_path):
        with open(setup_config_path) as setup_config_file:
            self.setup_config = json.load(setup_config_file)
    else:
        logger.warning('Missing the setup_config.json file.')

    # Load the model and tokenizer from checkpoint and config files based on
    # the user's choice of mode; further setup config can be added.
    # model_pt_path = '../ckpt-0189000.bin'
    self.device = torch.device("cpu")
    # self.device = torch.device("cuda:" + str(properties.get("gpu_id"))
    #                            if torch.cuda.is_available() else "cpu")
    if self.setup_config["save_mode"] == "torchscript":
        self.model = torch.jit.load(model_pt_path)
        self.tokenizer = NGRAMTokenizer(self.setup_config["ngram"])
    elif self.setup_config["save_mode"] == "pretrained":
        state = torch.load(model_pt_path, map_location=self.device)
        config = AlbertConfig(**state['config_dict'])
        self.model = Consonant(config)
        self.model.load_state_dict(state['model_state_dict'])
        self.tokenizer = NGRAMTokenizer(state["ngram"])
    else:
        logger.warning('Missing the checkpoint or state_dict.')
    self.model.to(self.device)
    self.model.eval()
    logger.debug('Transformer model from path {0} loaded successfully'.format(model_pt_path))
    self.initialized = True
def get_model(args):
    if args.model_size == 'debug':
        num_hidden_layers = 1
        embedding_size = 8
        hidden_size = 16
        intermediate_size = 32
        num_attention_heads = 2
        args.gen_ratio = 2
    elif args.model_size == 'tiny':
        num_hidden_layers = 4
        embedding_size = 128
        hidden_size = 336
        intermediate_size = 1344
        num_attention_heads = 12
    elif args.model_size == 'small':
        num_hidden_layers = 12
        embedding_size = 128
        hidden_size = 256
        intermediate_size = 1024
        num_attention_heads = 4
    elif args.model_size == 'base':
        num_hidden_layers = 12
        embedding_size = 768
        hidden_size = 768
        intermediate_size = 3072
        num_attention_heads = 12
    else:
        raise Exception('Which model? debug, tiny, small, base')

    # args.gen_ratio is expected to be set by the caller (1 leaves the sizes
    # unchanged); only the 'debug' branch overrides it here.
    config = AlbertConfig(
        max_position_embeddings=args.seq_length,
        vocab_size=args.vocab_size,
        num_hidden_layers=num_hidden_layers,
        embedding_size=embedding_size,
        hidden_size=hidden_size // args.gen_ratio,
        intermediate_size=intermediate_size // args.gen_ratio,
        num_attention_heads=num_attention_heads // args.gen_ratio,
    )
    model = AlbertForMaskedLM(config)
    return model
def __call_model_tf(self):
    if self.model_to_use.lower() == 'bert':
        self.config = BertConfig(num_labels=2)
        self.model = TFBertForSequenceClassification.from_pretrained(
            'bert-base-uncased', config=self.config)
    elif self.model_to_use.lower() == 'albert':
        self.config = AlbertConfig(num_labels=2)
        self.model = TFAlbertForSequenceClassification.from_pretrained(
            'albert-base-v1', config=self.config)
    elif self.model_to_use.lower() == 'electra':
        print('Electra is not available for sequence classification with TensorFlow yet.')
    elif self.model_to_use.lower() == 'distilbert':
        self.config = DistilBertConfig(num_labels=2)
        self.model = TFDistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased', config=self.config)
    else:
        print('Model not available yet.')
def __call_model_torch(self):
    if self.model_to_use.lower() == 'bert':
        self.config = BertConfig(num_labels=2)
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', config=self.config)
    elif self.model_to_use.lower() == 'albert':
        self.config = AlbertConfig(num_labels=2)
        self.model = AlbertForSequenceClassification.from_pretrained(
            'albert-base-v1', config=self.config)
    elif self.model_to_use.lower() == 'electra':
        self.config = ElectraConfig(num_labels=2)
        self.model = ElectraForSequenceClassification.from_pretrained(
            'google/electra-small-discriminator', config=self.config)
    elif self.model_to_use.lower() == 'distilbert':
        self.config = DistilBertConfig(num_labels=2)
        self.model = DistilBertForSequenceClassification.from_pretrained(
            'distilbert-base-uncased', config=self.config)
    else:
        print('Model not available yet.')
def __init__(self,
             train_path: str = None,
             dev_path: str = None,
             test_path: str = None,
             model_path: str = None,
             config_path: str = None,
             tokenizer: AlbertTokenizer = None,
             num_classes: int = 2,
             cuda_device: int = 0,
             batch_size: int = 4,
             num_workers: int = 0,
             lr: float = 2e-5,
             weight_decay: float = 0.1,
             warm_up: int = 20):
    super(KbAlbertClassificationModel, self).__init__()
    self.num_classes = num_classes
    self.cuda_device = cuda_device
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.lr = lr
    self.weight_decay = weight_decay
    self.warm_up = warm_up
    self.save_hyperparameters()
    self.train_dataset = KbAlbertDataset(train_path, tokenizer)
    self.val_dataset = KbAlbertDataset(dev_path, tokenizer)
    self.test_dataset = KbAlbertDataset(test_path, tokenizer)
    # Read the config JSON; the with-statement ensures the file handle is closed.
    with open(config_path, encoding='UTF-8') as f:
        config_dict = json.loads(f.read())
    config = AlbertConfig(**config_dict)
    self.text_embedding = AlbertModel.from_pretrained(
        pretrained_model_name_or_path=model_path, config=config)
    self.classifier_hidden_size = self.text_embedding.config.hidden_size
    self.classifier = nn.Linear(self.classifier_hidden_size, self.num_classes)
def main(tokenizer_path,
         dataset_path,
         save_path='alectra-small',
         max_steps=1e6,
         accumulate_grad_batches=1,
         gpus=None,
         num_tpu_cores=None,
         distributed_backend=None,
         val_check_interval=0.25,
         val_check_percent=0.25,
         generator_type='albert',
         num_hidden_groups=1,
         d_loss_weight=50,
         mlm_prob=0.15,
         learning_rate=5e-4,
         warmup_steps=10000,
         batch_size=128,
         num_workers=2,
         tie_embedding_proj=False,
         tie_encoder=True,
         shuffle=True,
         lr_schedule='linear',
         resume_from_checkpoint=None,
         use_polyaxon=False):
    # init tokenizer. only need it for the special chars.
    tokenizer = BertWordPieceTokenizer(tokenizer_path)

    # init generator.
    if generator_type == 'albert':
        generator_config = AlbertConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=256,
            embedding_size=128,
            num_hidden_layers=3,
            num_attention_heads=1,
            num_hidden_groups=num_hidden_groups,
            intermediate_size=1024,
            hidden_dropout_prob=0.1,
            attention_probs_dropout_prob=0.1,
            classifier_dropout_prob=0.1,
            max_position_embeddings=128)
        generator = AlbertForMaskedLM(generator_config)
    elif generator_type == 'bert':
        generator_config = BertConfig(
            vocab_size=tokenizer._tokenizer.get_vocab_size(),
            hidden_size=128,
            num_hidden_layers=3,
            num_attention_heads=1,
            intermediate_size=256,
            max_position_embeddings=128)
        generator = BertForMaskedLM(generator_config)
        tie_weights(generator.cls.predictions.decoder,
                    generator.bert.embeddings.word_embeddings)
    else:
        raise Exception(f"invalid generator type: {generator_type}")

    # init discriminator.
    discriminator_config = AlbertConfig(
        vocab_size=tokenizer._tokenizer.get_vocab_size(),
        hidden_size=256,
        embedding_size=128,
        num_hidden_layers=12,
        num_attention_heads=4,
        num_hidden_groups=num_hidden_groups,
        intermediate_size=1024,
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        classifier_dropout_prob=0.1,
        max_position_embeddings=128)
    discriminator = AlbertForTokenClassification(discriminator_config)

    # tie the embedding weights.
    tie_weights(discriminator.base_model.embeddings.word_embeddings,
                generator.base_model.embeddings.word_embeddings)
    tie_weights(discriminator.base_model.embeddings.position_embeddings,
                generator.base_model.embeddings.position_embeddings)
    tie_weights(discriminator.base_model.embeddings.token_type_embeddings,
                generator.base_model.embeddings.token_type_embeddings)

    if generator_type == 'albert' and tie_encoder:
        print('tying albert encoder layers')
        discriminator.albert.encoder.albert_layer_groups = \
            generator.albert.encoder.albert_layer_groups
    if generator_type == 'albert' and tie_embedding_proj:
        print('tying embedding projection layers')
        discriminator.albert.encoder.embedding_hidden_mapping_in = \
            generator.albert.encoder.embedding_hidden_mapping_in

    # init training module.
    training_config = DiscLMTrainingModuleConfig(max_steps,
                                                 d_loss_weight=d_loss_weight,
                                                 save_path=save_path,
                                                 weight_decay=0.01,
                                                 learning_rate=learning_rate,
                                                 epsilon=1e-6,
                                                 lr_schedule=lr_schedule,
                                                 warmup_steps=warmup_steps)
    if use_polyaxon:
        checkpoint_fn = polyaxon_checkpoint_fn
    else:
        checkpoint_fn = None
    lightning_module = DiscLMTrainingModule(generator,
                                            discriminator,
                                            training_config,
                                            checkpoint_fn=checkpoint_fn)

    # init trainer.
    trainer = Trainer(accumulate_grad_batches=accumulate_grad_batches,
                      gpus=gpus,
                      num_tpu_cores=num_tpu_cores,
                      distributed_backend=distributed_backend,
                      max_steps=max_steps,
                      resume_from_checkpoint=resume_from_checkpoint,
                      val_check_percent=val_check_percent,
                      val_check_interval=val_check_interval)

    # init dataloaders.
    train_loader, val_loader, _ = get_dataloaders(tokenizer, dataset_path,
                                                  trainer, mlm_prob, batch_size,
                                                  num_workers, shuffle)
    # train.
    trainer.fit(lightning_module, train_loader, val_loader)

    # save the model.
    output_path = os.path.join(save_path, 'discriminator', 'final')
    os.makedirs(output_path, exist_ok=True)
    lightning_module.discriminator.base_model.save_pretrained(output_path)
    if checkpoint_fn:
        checkpoint_fn(lightning_module)
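# Hedged sketch (an assumption, not the source's definition): tie_weights above is
# used in the ELECTRA style of sharing embeddings between generator and
# discriminator. A minimal version would point both modules at one Parameter so
# updates to either affect both:
def tie_weights(module_a, module_b):
    module_a.weight = module_b.weight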
train_dataset = np.array(list(dict(train_encodings).values()))
val_dataset = np.array(list(dict(val_encodings).values()))
BATCH_SIZE = 16

# Create a callback that saves the model's weights after every epoch.
checkpoint_path = "albert16_ckpt/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True)
save_model = True
config = AlbertConfig(num_labels=3, return_dict=True, model_type='albert-base-v2')
model = TFAlbertForSequenceClassification(config=config)
if save_model:
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
    model.fit(train_dataset[0],
              np.array(y_list),
              epochs=5,
              batch_size=BATCH_SIZE,
              callbacks=[cp_callback])
def main():
    args = make_parser()
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
    seed_everything(args.seed)

    # Prepare output directory
    if not os.path.exists(os.path.join('../', args.output_dir)):
        os.mkdir(os.path.join('../', args.output_dir))
    args.output_dir = os.path.join('../', args.output_dir, args.exp_name)
    if os.path.exists(args.output_dir):
        flag_continue = input(f"Model name [{args.exp_name}] already exists. "
                              "Do you want to overwrite? (y/n): ")
        if flag_continue.lower() == 'y' or flag_continue.lower() == 'yes':
            shutil.rmtree(args.output_dir)
            os.mkdir(args.output_dir)
        else:
            print("Exit pre-training")
            exit()
    else:
        os.mkdir(args.output_dir)

    # Setup for neptune logger
    neptune_api_key = os.environ['NEPTUNE_API_TOKEN']
    neptune_project_name = 'kevinjo/cs372'
    neptune_experiment_name = args.exp_name
    neptune_logger = NeptuneLogger(
        api_key=neptune_api_key,
        project_name=neptune_project_name,
        experiment_name=neptune_experiment_name,
        tags=["torch", "pretrain"],
        params=vars(args)
    )

    # Setup for pytorch-lightning params
    train_params = dict(
        logger=neptune_logger,
        gpus=args.n_gpu,
        gradient_clip_val=args.max_grad_norm,
        early_stop_callback=None,
        checkpoint_callback=False,
        # val_check_interval=args.validation_step,
        accumulate_grad_batches=args.grad_accum_steps,
        max_steps=args.max_steps,
        benchmark=args.benchmark,
    )

    # Setup for albert model
    albert_base_configuration = AlbertConfig(
        classifier_dropout_prob=args.classifier_dropout_prob,
        hidden_size=args.hidden_size,
        embedding_size=args.embedding_size,
        num_attention_heads=args.num_attention_heads,
        num_hidden_layers=args.num_hidden_layers,
        num_hidden_groups=args.num_hidden_groups,
        intermediate_size=args.intermediate_size,
        vocab_size=args.vocab_size,
        max_position_embeddings=args.max_position_embeddings,
        output_vocab_size=args.output_vocab_size,
        type_vocab_size=args.type_vocab_size,
    )
    model = ConsonantAlbert(args, albert_base_configuration)

    # Start model training
    trainer = pl.Trainer(auto_lr_find=False,
                         profiler=False,
                         amp_level='O2',
                         precision=16,
                         **train_params)
    if args.do_train:
        trainer.fit(model)
    return
import torch
from transformers import AlbertModel, AlbertConfig
from consonant.model.modeling import Consonant
from consonant.model.tokenization import NGRAMTokenizer

if __name__ == '__main__':
    ckpt = '../ckpt-0078000.bin'
    device = torch.device("cpu")
    # device = torch.device("cuda:" + str(properties.get("gpu_id"))
    #                       if torch.cuda.is_available() else "cpu")
    state = torch.load(ckpt, map_location=device)
    print(state['ngram'])

    config = AlbertConfig(**state['config_dict'])
    # Disable dropout so tracing captures deterministic inference behavior.
    config.attention_probs_dropout_prob = 0.0
    config.hidden_dropout_prob = 0.0
    print(config)

    model = Consonant(config)
    model.load_state_dict(state['model_state_dict'])

    tokenizer = NGRAMTokenizer(1)
    inputs = tokenizer.encode("sample text", max_char_length=100, return_attention_mask=True)
    input_ids = torch.tensor([inputs["head_ids"]], dtype=torch.long)

    traced_model = torch.jit.trace(model, [input_ids, input_ids])
    torch.jit.save(traced_model, "traced_model.pt")
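# Hedged follow-up (not from the source): the exported TorchScript module can be
# reloaded and called without the Consonant class on the Python path, which is
# the usual reason for tracing; the call mirrors the trace inputs above.
loaded = torch.jit.load("traced_model.pt")
with torch.no_grad():
    output = loaded(input_ids, input_ids)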
def albert_config(cfg, args) -> AlbertConfig:
    model_name = (
        f"calbert-{cfg.model.name}-{'uncased' if cfg.vocab.lowercase else 'cased'}"
    )
    return AlbertConfig(vocab_size=cfg.vocab.max_size, **dict(cfg.model))
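# Hedged usage sketch (not from the source): cfg is assumed to be an
# omegaconf/hydra-style config, consistent with the attribute access above
# (cfg.model.name, cfg.vocab.lowercase, cfg.vocab.max_size); the field values
# are illustrative. Extra keys in cfg.model (like "name") are simply stored as
# attributes on the resulting AlbertConfig.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "model": {"name": "tiny", "hidden_size": 312, "num_hidden_layers": 4},
    "vocab": {"lowercase": True, "max_size": 30000},
})
config = albert_config(cfg, args=None)  # args is unused by the function body
print(config.hidden_size)  # 312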
def update(self, val, n=1):  # tail of a running-average meter (header reconstructed)
    self.sum += val * n
    self.count += n
    self.avg = float(self.sum) / float(self.count)


def is_int(s):
    try:
        int(s)
        return True
    except ValueError:
        return False


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


if __name__ == '__main__':
    from transformers import BertConfig, AlbertConfig, BertModel, AlbertModel
    bert = BertModel(
        BertConfig(hidden_size=768, num_attention_heads=12, intermediate_size=3072))
    albert = AlbertModel(
        AlbertConfig(hidden_size=768, num_attention_heads=12, intermediate_size=3072))
    print("Number of parameters in BERT: %d" % count_parameters(bert))
    print("Number of parameters in ALBERT: %d" % count_parameters(albert))
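# Hedged follow-up sketch (not from the source): ALBERT's cross-layer parameter
# sharing is why it comes out far smaller than BERT above. With a single hidden
# group (the default), changing the depth does not change the parameter count:
shallow = AlbertModel(AlbertConfig(hidden_size=768, num_attention_heads=12,
                                   intermediate_size=3072, num_hidden_layers=2))
deep = AlbertModel(AlbertConfig(hidden_size=768, num_attention_heads=12,
                                intermediate_size=3072, num_hidden_layers=12))
print(count_parameters(shallow) == count_parameters(deep))  # True: one shared layer group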
def __init__(self, hparams):
    torch.nn.Module.__init__(self)
    assert isinstance(
        hparams.encoder, dict
    ), "hparams.encoder must be a dict. If not multi node types, use MonoplexEmbedder instead."
    assert isinstance(
        hparams.embedder, dict
    ), "hparams.embedder must be a dict. If not multi-layer, use MonoplexEmbedder instead."
    self.hparams = copy.copy(hparams)

    ################### Encoding ####################
    self.node_types = list(hparams.encoder.keys())
    for node_type, encoder in hparams.encoder.items():
        if encoder == "ConvLSTM":
            hparams.vocab_size = self.hparams.vocab_size[node_type]
            self.set_encoder(node_type, ConvLSTM(hparams))
        elif encoder == "Albert":
            config = AlbertConfig(
                vocab_size=hparams.vocab_size,
                embedding_size=hparams.word_embedding_size,
                hidden_size=hparams.encoding_dim,
                num_hidden_layers=hparams.num_hidden_layers,
                num_hidden_groups=hparams.num_hidden_groups,
                hidden_dropout_prob=hparams.hidden_dropout_prob,
                attention_probs_dropout_prob=hparams.attention_probs_dropout_prob,
                num_attention_heads=hparams.num_attention_heads,
                intermediate_size=hparams.intermediate_size,
                type_vocab_size=1,
                max_position_embeddings=hparams.max_length,
            )
            self.set_encoder(node_type, AlbertEncoder(config))
        elif "NodeIDEmbedding" in encoder:
            # `encoder` is a dict with {"NodeIDEmbedding": hparams}
            self.set_encoder(
                node_type,
                NodeIDEmbedding(hparams=encoder["NodeIDEmbedding"]))
        elif "Linear" in encoder:
            encoder_hparams = encoder["Linear"]
            self.set_encoder(
                node_type,
                torch.nn.Linear(in_features=encoder_hparams["in_features"],
                                out_features=hparams.encoding_dim))
        else:
            raise Exception(
                "hparams.encoder must be one of {'ConvLSTM', 'Albert', 'NodeIDEmbedding'}"
            )

    ################### Layer-specific Embedding ####################
    self.layers = list(hparams.embedder)
    if hparams.multiplex_embedder == "ExpandedMultiplexGAT":
        self._embedder = ExpandedMultiplexGAT(
            in_channels=hparams.encoding_dim,
            out_channels=int(hparams.embedding_dim / len(self.node_types)),
            node_types=self.node_types,
            layers=self.layers,
            dropout=hparams.nb_attn_dropout)
    else:
        print('"multiplex_embedder" not used. Concatenate multi-layer embeddings instead.')

    ################### Classifier ####################
    if hparams.classifier == "Dense":
        self._classifier = DenseClassification(hparams)
    elif hparams.classifier == "HierarchicalAWX":
        self._classifier = HierarchicalAWX(hparams)
    else:
        raise Exception("hparams.classifier must be one of {'Dense'}")

    if hparams.use_hierar:
        label_map = pd.Series(range(len(hparams.classes)),
                              index=hparams.classes).to_dict()
        hierar_relations = get_hierar_relations(hparams.hierar_taxonomy_file,
                                                label_map=label_map)

    self.criterion = ClassificationLoss(
        n_classes=hparams.n_classes,
        class_weight=None if not hasattr(hparams, "class_weight")
        else torch.tensor(hparams.class_weight),
        loss_type=hparams.loss_type,
        hierar_penalty=hparams.hierar_penalty if hparams.use_hierar else None,
        hierar_relations=hierar_relations if hparams.use_hierar else None)
def __init__(self, hparams):
    torch.nn.Module.__init__(self)
    assert isinstance(
        hparams.encoder, dict
    ), "hparams.encoder must be a dict. If not multi node types, use MonoplexEmbedder instead."
    assert isinstance(
        hparams.embedder, dict
    ), "hparams.embedder must be a dict. If not multi-layer, use MonoplexEmbedder instead."
    self.hparams = hparams

    ################### Encoding ####################
    self.node_types = list(hparams.encoder.keys())
    for node_type, encoder in hparams.encoder.items():
        if encoder == "ConvLSTM":
            assert not (len(hparams.encoder) > 1 and not len(hparams.vocab_size) > 1)
            self.set_encoder(node_type, ConvLSTM(hparams))
        elif encoder == "Albert":
            assert not (len(hparams.encoder) > 1 and not len(hparams.vocab_size) > 1)
            config = AlbertConfig(
                vocab_size=hparams.vocab_size,
                embedding_size=hparams.word_embedding_size,
                hidden_size=hparams.encoding_dim,
                num_hidden_layers=hparams.num_hidden_layers,
                num_hidden_groups=hparams.num_hidden_groups,
                hidden_dropout_prob=hparams.hidden_dropout_prob,
                attention_probs_dropout_prob=hparams.attention_probs_dropout_prob,
                num_attention_heads=hparams.num_attention_heads,
                intermediate_size=hparams.intermediate_size,
                type_vocab_size=1,
                max_position_embeddings=hparams.max_length,
            )
            self.set_encoder(node_type, AlbertEncoder(config))
        elif "NodeIDEmbedding" in encoder:
            # `encoder` is a dict with {"NodeIDEmbedding": hparams}
            self.set_encoder(
                node_type,
                NodeIDEmbedding(hparams=encoder["NodeIDEmbedding"]))
        elif "Linear" in encoder:
            encoder_hparams = encoder["Linear"]
            self.set_encoder(
                node_type,
                torch.nn.Linear(in_features=encoder_hparams["in_features"],
                                out_features=hparams.encoding_dim))
        else:
            raise Exception(
                "hparams.encoder must be one of {'ConvLSTM', 'Albert', 'NodeIDEmbedding'}"
            )

    ################### Layer-specific Embedding ####################
    for subnetwork_type, embedder_model in hparams.embedder.items():
        if embedder_model == "GAT":
            self.set_embedder(subnetwork_type, GAT(hparams))
        elif embedder_model == "GCN":
            self.set_embedder(subnetwork_type, GCN(hparams))
        elif embedder_model == "GraphSAGE":
            self.set_embedder(subnetwork_type, GraphSAGE(hparams))
        else:
            raise Exception(
                f"Embedder model for hparams.embedder[{subnetwork_type}] must be one of ['GAT', 'GCN', 'GraphSAGE']"
            )

    ################### Multiplex Embedding ####################
    layers = list(hparams.embedder.keys())
    self.layers = layers
    if hparams.multiplex_embedder == "MultiplexLayerAttention":
        self._multiplex_embedder = MultiplexLayerAttention(
            embedding_dim=hparams.embedding_dim,
            hidden_dim=hparams.multiplex_hidden_dim,
            attention_dropout=hparams.multiplex_attn_dropout,
            layers=layers)
        hparams.embedding_dim = hparams.multiplex_hidden_dim
    elif hparams.multiplex_embedder == "MultiplexNodeAttention":
        self._multiplex_embedder = MultiplexNodeAttention(
            embedding_dim=hparams.embedding_dim,
            hidden_dim=hparams.multiplex_hidden_dim,
            attention_dropout=hparams.multiplex_attn_dropout,
            layers=layers)
        hparams.embedding_dim = hparams.multiplex_hidden_dim
    else:
        print('"multiplex_embedder" not used. Concatenate multi-layer embeddings instead.')
        hparams.embedding_dim = hparams.embedding_dim * len(hparams.embedder)

    ################### Classifier ####################
    if hparams.classifier == "Dense":
        self._classifier = DenseClassification(hparams)
    elif hparams.classifier == "HierarchicalAWX":
        self._classifier = HierarchicalAWX(hparams)
    else:
        raise Exception("hparams.classifier must be one of {'Dense'}")

    if hparams.use_hierar:
        label_map = pd.Series(range(len(hparams.classes)),
                              index=hparams.classes).to_dict()
        hierar_relations = get_hierar_relations(hparams.hierar_taxonomy_file,
                                                label_map=label_map)

    self.criterion = ClassificationLoss(
        n_classes=hparams.n_classes,
        class_weight=None if not hasattr(hparams, "class_weight")
        else torch.tensor(hparams.class_weight),
        loss_type=hparams.loss_type,
        hierar_penalty=hparams.hierar_penalty if hparams.use_hierar else None,
        hierar_relations=hierar_relations if hparams.use_hierar else None)