def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids,
                                                      input_mask, sequence_labels, token_labels,
                                                      choice_labels):
    config.num_labels = self.num_labels
    model = BertForSequenceClassification(config)
    model.eval()
    loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels)
    result = {
        "loss": loss,
        "logits": logits,
    }
    self.parent.assertListEqual(
        list(result["logits"].size()), [self.batch_size, self.num_labels])
    self.check_loss_output(result)
def main():
    bert_base_config = BertConfig.from_pretrained('bert-base-uncased', num_labels=2)
    bert_base_model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                                    config=bert_base_config)
    count = 0
    for name, param in bert_base_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in bert_base_uncased: ', count)

    roberta_config = RobertaConfig.from_pretrained('roberta-base', num_labels=2)
    roberta_model = RobertaForSequenceClassification.from_pretrained('roberta-base',
                                                                     config=roberta_config)
    count = 0
    for name, param in roberta_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in roberta: ', count)

    albert_config = AlbertConfig.from_pretrained('albert-base-v2', num_labels=2)
    albert_model = AlbertForSequenceClassification.from_pretrained('albert-base-v2',
                                                                   config=albert_config)
    count = 0
    for name, param in albert_model.named_parameters():
        if param.requires_grad:
            size = 1
            for s in param.data.size():
                size = s * size
            count += size
    print('The total number of parameters in albert: ', count)
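# The three counting loops above compute the same quantity. A minimal sketch of an
# equivalent helper (assuming the same models are in scope) that uses torch's numel(),
# the same primitive the parameter-counting snippet further below relies on:
def count_trainable_parameters(model):
    # Sum of element counts over all parameters that require gradients.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# e.g. print('bert_base_uncased:', count_trainable_parameters(bert_base_model))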
def __init__(self, checkpoint_path='logs/checkpoint.pth', eval_report_path='logs/report.txt',
             is_training=True, train_path='train.csv', test_path='test.csv',
             log_dir='drive/My Drive/dm/logs/', batch_size=16):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.learning_rate = 5e-5
    self.num_epochs = 6
    self.batch_size = batch_size
    self.log_interval = 1000
    self.is_training = is_training
    self._plot_server = None
    self.log_dir = log_dir
    self.checkpoint_path = checkpoint_path
    self.best_model_path = checkpoint_path + '.best'
    self.eval_report = eval_report_path
    self.train_data_path = train_path
    self.test_data_path = test_path
    self.train_loader = QQPLoader(self.device, self.train_data_path, self.batch_size)
    self.test_loader = QQPLoader(self.device, self.test_data_path, self.batch_size)
    self.model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
    self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
    self._maybe_load_checkpoint()
    self.model.to(self.device)
def save_and_reload(self, path, model_name):
    torch.cuda.empty_cache()
    self.model.to('cpu')
    # Save a trained model
    model_to_save = self.model.module if hasattr(
        self.model, 'module') else self.model  # Only save the model itself
    output_model_file = os.path.join(path, "{}.bin".format(model_name))
    torch.save(model_to_save.state_dict(), output_model_file)
    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    if self.multi_label:
        self.model = BertForMultiLabelSequenceClassification.from_pretrained(
            self.pretrained_model_path,
            num_labels=len(self.data.labels),
            state_dict=model_state_dict)
    else:
        self.model = BertForSequenceClassification.from_pretrained(
            self.pretrained_model_path,
            num_labels=len(self.data.labels),
            state_dict=model_state_dict)
    if self.is_fp16:
        self.model.half()
    torch.cuda.empty_cache()
    self.model.to(self.device)
def __init__(self, pretrain_path, max_length):
    nn.Module.__init__(self)
    # self.bert = BertModel.from_pretrained(pretrain_path)
    self.bert = BertForSequenceClassification.from_pretrained(
        pretrain_path, num_labels=2)
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained(pretrain_path)
def load_model(model_name: str, task_name: str):
    if model_name not in cache:
        cache[model_name] = dict()
    if task_name not in cache[model_name]:
        model_path = str(Path(f"models/{model_name}/{task_name}/"))
        model = BertForSequenceClassification.from_pretrained(model_path, config=config)
        cache[model_name][task_name] = model
    return cache[model_name][task_name]
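# A hypothetical usage sketch of the cache-backed loader above (the model and task
# names are placeholders, not taken from the original code): the second call returns
# the instance that was already loaded instead of reading the checkpoint from disk again.
model_a = load_model("bert-base", "sst2")
model_b = load_model("bert-base", "sst2")
assert model_a is model_b  # served from the in-memory cache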
def create_model(self):
    if self.model_configuration.bert_model in ("xlnet-base-cased",):
        model = XLNetForSequenceClassification.from_pretrained(
            self.model_configuration.bert_model,
            num_labels=self.model_configuration.num_labels)
    else:
        model = BertForSequenceClassification.from_pretrained(
            self.model_configuration.bert_model,
            num_labels=self.model_configuration.num_labels)
    model.to(device)
    return model
def __init__(self, pretrain_path, max_length):
    nn.Module.__init__(self)
    # self.bert = BertModel.from_pretrained(pretrain_path)
    self.bert = BertForSequenceClassification.from_pretrained(
        pretrain_path, num_labels=2)
    self.max_length = max_length
    self.tokenizer = BertTokenizer.from_pretrained(os.path.join(
        pretrain_path, 'bert_vocab.txt'))
    self.modelName = 'Bert'
def main(text):
    tokenizer = BertTokenizer.from_pretrained('./', do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained('./')
    model.to(device)
    texts = []
    texts.append("[CLS] " + text[:509] + " [SEP]")
    tokenized_texts = [tokenizer.tokenize(sent) for sent in texts]
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(
        input_ids, maxlen=100, dtype="long", truncating="post", padding="post"
    )
    attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]
    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_data = TensorDataset(prediction_inputs, prediction_masks)
    prediction_dataloader = DataLoader(
        prediction_data, sampler=SequentialSampler(prediction_data), batch_size=1
    )
    model.eval()
    preds = []
    for batch in prediction_dataloader:
        # Move the batch to the device (GPU if available)
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from the dataloader
        b_input_ids, b_input_mask = batch
        # Under torch.no_grad() the model does not compute or store gradients,
        # which speeds up prediction on the test data.
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        # Move the logits to the CPU for further processing
        logits = logits[0].detach().cpu().numpy()
        # Store the predicted classes
        batch_preds = np.argmax(logits, axis=1)
        preds.extend(batch_preds)
    return preds
def get_model(model, pretrained, resume, n_classes, dataset, log_dir):
    if resume:
        model = torch.load(os.path.join(log_dir, "last_model.pth"))
        d = train_data.input_size()[0]
    elif model_attributes[model]["feature_type"] in (
        "precomputed",
        "raw_flattened",
    ):
        assert pretrained
        # Load precomputed features
        d = train_data.input_size()[0]
        model = nn.Linear(d, n_classes)
        model.has_aux_logits = False
    elif model == "resnet50":
        model = torchvision.models.resnet50(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif model == "resnet34":
        model = torchvision.models.resnet34(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif model == "wideresnet50":
        model = torchvision.models.wide_resnet50_2(pretrained=pretrained)
        d = model.fc.in_features
        model.fc = nn.Linear(d, n_classes)
    elif model.startswith('bert'):
        if dataset == "MultiNLI":
            assert dataset == "MultiNLI"
            from pytorch_transformers import BertConfig, BertForSequenceClassification
            config_class = BertConfig
            model_class = BertForSequenceClassification
            config = config_class.from_pretrained("bert-base-uncased",
                                                  num_labels=3,
                                                  finetuning_task="mnli")
            model = model_class.from_pretrained("bert-base-uncased",
                                                from_tf=False,
                                                config=config)
        elif dataset == "jigsaw":
            from transformers import BertForSequenceClassification
            model = BertForSequenceClassification.from_pretrained(
                model, num_labels=n_classes)
            print(f'n_classes = {n_classes}')
        else:
            raise NotImplementedError
    else:
        raise ValueError(f"{model} Model not recognized.")
    return model
def train(
    root=True,
    binary=False,
    bert="bert-large-uncased",
    epochs=30,
    batch_size=8,
    save=False,
):
    trainset = SSTDataset("train", root=root, binary=binary)
    devset = SSTDataset("dev", root=root, binary=binary)
    testset = SSTDataset("test", root=root, binary=binary)

    config = BertConfig.from_pretrained(bert)
    if not binary:
        config.num_labels = 5
    model = BertForSequenceClassification.from_pretrained(bert, config=config)
    model = model.to(device)
    lossfn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(1, epochs):
        train_loss, train_acc = train_one_epoch(model, lossfn, optimizer, trainset,
                                                batch_size=batch_size)
        val_loss, val_acc = evaluate_one_epoch(model, lossfn, optimizer, devset,
                                               batch_size=batch_size)
        test_loss, test_acc = evaluate_one_epoch(model, lossfn, optimizer, testset,
                                                 batch_size=batch_size)
        logger.info(f"epoch={epoch}")
        logger.info(
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, test_loss={test_loss:.4f}"
        )
        logger.info(
            f"train_acc={train_acc:.3f}, val_acc={val_acc:.3f}, test_acc={test_acc:.3f}"
        )
        if save:
            label = "binary" if binary else "fine"
            nodes = "root" if root else "all"
            torch.save(model, f"{bert}__{nodes}__{label}__e{epoch}.pickle")

    logger.success("Done!")
def main():
    torch.cuda.empty_cache()
    parser = setup_parser()
    args = parser.parse_args()
    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory already exists and is not empty.")

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device
    set_seed(args)

    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: {}".format(args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load models (num_labels is set on the config rather than passed again to from_pretrained)
    config = BertConfig.from_pretrained(args.config_name)
    config.num_labels = 1
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=args.do_lower_case)
    model = BertForSequenceClassification.from_pretrained(
        args.model_name_or_path, config=config)
    model.to(args.device)
    args.n_gpu = 1

    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info('global step = {}, average loss = {}'.format(global_step, tr_loss))
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        logger.info("saving model checkpoint to {}".format(args.output_dir))
        model_to_save = model.module if hasattr(model, 'module') else model
        # model_to_save.save_pretrained(args.output_dir)
        torch.save(model_to_save.state_dict(), 'saved_model.pth')
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
def predict_bert(text):
    import torch
    from keras.preprocessing.sequence import pad_sequences
    import pandas as pd
    import numpy as np
    from pytorch_transformers import BertTokenizer, BertForSequenceClassification

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == 'cpu':
        print('cpu')
    else:
        n_gpu = torch.cuda.device_count()
        print(torch.cuda.get_device_name(0))

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model.load_state_dict(torch.load(DIR_DATA_MODELS / 'BERT_model.h5'))

    sentences = '[CLS] ' + str(text) + ' [SEP]'
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    tok = tokenizer.tokenize(sentences)
    input_ids = tokenizer.convert_tokens_to_ids(tok)
    input_ids = pad_sequences([input_ids, ''], maxlen=100, dtype="long",
                              truncating="post", padding="post")
    attention_masks = [[float(i > 0) for i in seq] for seq in input_ids]

    train_inputs = torch.tensor(input_ids[0]).long().to(device)
    train_masks = torch.tensor(attention_masks[0]).long().to(device)
    train_inputs = train_inputs.unsqueeze_(0)
    train_masks = train_masks.unsqueeze_(0)

    model.to(device)
    logits = model(train_inputs, token_type_ids=None, attention_mask=train_masks)
    return logits
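# A minimal, hypothetical usage sketch for predict_bert above (the input text is a
# placeholder): with no labels passed, the returned tuple holds the logits at index 0,
# so a class prediction can be read off with an argmax, as other snippets here do.
output = predict_bert("some example text")
pred_class = int(output[0].argmax(dim=1).item())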
def forward(self, captions, position_ids, region_features, attention_mask):
    # batch_size = region_features.shape[0]
    embeddings = self.embedding_layer(region_features, captions, position_ids)
    # print(self.classifier); exit(0)
    attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
    attention_mask = (1.0 - attention_mask) * -10000.0

    model_checkpoint = "distilbert-base-uncased"
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint, num_labels=2)
    print(model)
    exit(0)

    model = BertForSequenceClassification(BertConfig())
    encoder = model.bert.encoder
    pooler = model.bert.pooler
    dropout = model.dropout
    classifier = model.classifier
    output = encoder(embeddings, attention_mask, head_mask=self.head_mask)
    # print(type(output))
    # print(len(output)); exit(0)
    output = pooler(output[0])
    output = dropout(output)
    output = classifier(output)
    print(output.shape)
    print(output)
    exit(0)

    hidden_states = self.encoder(embeddings, attention_mask, self.head_mask)[0]
    output = self.classifier(hidden_states)
    print(hidden_states.shape, output.shape)
    exit(0)

    return self.classifier(hidden_states)
def test(ckpt):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)
    load_checkpoint(model, ckpt)
    model.eval()
    # while(1):
    #     sentence = input("Enter Sentence: ")
    sentence = sys.argv[1]
    encode = tokenizer.encode(sentence, add_special_tokens=True)
    padded = [encode + [0] * (512 - len(encode))]
    sentence = torch.tensor(padded)
    label = torch.tensor([0])
    results = model(sentence, label)
    _softmax = F.softmax(results[0], dim=1)
    pred = torch.argmax(F.softmax(results[0], dim=1)).item()
    print(f"{pred+1}")
def _load_dnli_model(self):
    # Download pretrained weight
    dnli_model_fname = os.path.join(self.opt['datapath'], 'dnli_model.bin')
    if not os.path.exists(dnli_model_fname):
        print(f"[ Download pretrained dnli model params to {dnli_model_fname}]")
        download_from_google_drive(
            '1Qawz1pMcV0aGLVYzOgpHPgG5vLSKPOJ1', dnli_model_fname
        )

    # Load pretrained weight
    print(f"[ Load pretrained dnli model from {dnli_model_fname}]")
    model_state_dict = torch.load(dnli_model_fname)
    dnli_model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased', state_dict=model_state_dict, num_labels=3)
    if self.use_cuda:
        dnli_model.cuda()
    dnli_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    return dnli_model, dnli_tokenizer
def predict_model(args, save=True):
    dataset_name = args.dataset_name[0]
    model_type = args.model_type
    test_dataset = path_tensor_dataset / f"{model_type}_{dataset_name}.pkl"
    test_dataset = pickle_load(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 pin_memory=True,
                                 num_workers=4,
                                 shuffle=False)
    model_dir = path_model / f"{args.model_type}_{args.model_name}/checkpoint_epoch{args.epoch_num}"
    if model_type == "bert":
        model = BertForSequenceClassification.from_pretrained(model_dir, num_labels=126)
    elif model_type == "xlnet":
        model = XLNetForSequenceClassification.from_pretrained(model_dir, num_labels=126)
    else:
        raise ValueError("")
    model.zero_grad()
    model.eval()
    model = model.cuda(args.gpu_device_ids[0])
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=args.gpu_device_ids)
    res = []
    for batch in tqdm(test_dataloader, desc="Iteration"):
        batch = tuple(x.cuda(args.gpu_device_ids[0]) for x in batch)
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2]
        }
        with torch.no_grad():
            outputs = model(**inputs)[0]
        res.append(outputs)
    res = torch.cat(res, 0).cpu()
    if save:
        filename = f"{model_type}_{dataset_name}_epoch{args.epoch_num}_res.pkl"
        pickle_save(res, path_model_output / filename)
    return res
def __init__(self,
             device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
             is_paralleled=False,
             BATCH_SIZE=128,
             CPU_COUNT=1,
             CHUNKSIZE=1):
    self.device = device if isinstance(device, torch.device) else torch.device(device)
    model_path = os.path.join(os.path.dirname(__file__), "support_model.bin")
    self.model_type = 'supportr'
    if not os.path.isfile(model_path):
        logger.info(
            f'Model {self.model_type} does not exist at {model_path}. Trying to download it now.'
        )
        model = 'support_model'
        fetch_pretrained_model(model, model_path)
    if self.device.type == "cpu":
        model_state_dict = torch.load(model_path, map_location=self.device.type)
    else:
        model_state_dict = torch.load(model_path)
    self.model = BertForSequenceClassification.from_pretrained(
        'bert-base-cased', state_dict=model_state_dict, num_labels=1)
    if is_paralleled:
        if self.device.type == "cpu":
            print("Data parallel is not available with cpus")
        else:
            self.model = torch.nn.DataParallel(self.model)
    self.model.to(device)
    self.model.eval()
    self.batch_size = BATCH_SIZE
    self.cpu_count = CPU_COUNT
    self.chunksize = CHUNKSIZE
import pandas as pd
from joblib import dump
import torch
import random
import numpy as np

from utils.dataset import MLBERT

seed = 500
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Network Hyperparameters
num_classes = 1588
batch_size = 128
model_type = 'bert-base-multilingual-cased'

model = BertForSequenceClassification.from_pretrained('model/')
checkpoint = torch.load(
    'model/checkpoints/metric_learning/bert/model_5epochs.pt',
    map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])

cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if cuda else "cpu")
if cuda:
    print("Cuda available")
    model.cuda()
kwargs = {'num_workers': 0, 'pin_memory': True} if cuda else {}

ml_test = MLBERT(train=False, file='processed_data/ml_test_bert.pt')
def main(): torch.manual_seed(42) # Random #params = {'batch_size': 32, 'dropout': 0, 'hidden_dim': 128, 'learning_rate': 0.01, 'num_epochs': 5, 'num_layers': 2, 'oversample': False, 'soft_labels': False} # Glove params = { 'batch_size': 32, 'dropout': 0, 'hidden_dim': 128, 'learning_rate': 0.001, 'num_epochs': 5, 'num_layers': 2, 'oversample': False, 'soft_labels': False } # Random #params = {'batch_size': 32, 'dropout': 0, 'hidden_dim': 256, 'learning_rate': 0.0001, 'num_epochs': 5, 'num_layers': 3, 'oversample': False, 'soft_labels': False} #some params experiment_number = 1 test_percentage = 0.1 val_percentage = 0.2 batch_size = params["batch_size"] num_epochs = 5 #params["num_epochs"] dropout = params["dropout"] embedding_dim = 300 model_name = "CNN" #'Bert' #"CNN" #"LSTM" unsupervised = True embedding = "Glove" #"Random" ##"Glove" # "Both" # soft_labels = False combine = embedding == "Both" # LSTM parameters if model_name == "LSTM": hidden_dim = params["hidden_dim"] num_layers = params["num_layers"] # Bert parameter num_warmup_steps = 100 num_total_steps = 1000 if model_name == "Bert": embedding = "None" if embedding == "Both": combine = True embedding = "Random" else: combine = False learning_rate = params["learning_rate"] #5e-5, 3e-5, 2e-5 oversample_bool = False weighted_loss = True # load data dataset = Dataset("../data/cleaned_tweets_orig.csv", use_embedding=embedding, embedd_dim=embedding_dim, combine=combine, for_bert=(model_name == "Bert")) #dataset.oversample() train_data, val_test_data = split_dataset(dataset, test_percentage + val_percentage) val_data, test_data = split_dataset( val_test_data, test_percentage / (test_percentage + val_percentage)) # print(len(train_data)) #save_data(train_data, 'train') #save_data(test_data, 'test') #define loaders if oversample_bool: weights, targets = get_loss_weights(train_data, return_targets=True) class_sample_count = [ 1024 / 20, 13426, 2898 / 2 ] # dataset has 10 class-1 samples, 1 class-2 samples, etc. 
oversample_weights = 1 / torch.Tensor(class_sample_count) oversample_weights = oversample_weights[targets] # oversample_weights = torch.tensor([0.9414, 0.2242, 0.8344]) #torch.ones((3))- sampler = torch.utils.data.sampler.WeightedRandomSampler( oversample_weights, len(oversample_weights)) train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, collate_fn=my_collate, sampler=sampler) else: train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, collate_fn=my_collate) val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, collate_fn=my_collate) #define model if model_name == "CNN": vocab_size = len(dataset.vocab) model = CNN(vocab_size, embedding_dim, combine=combine) elif model_name == "LSTM": vocab_size = len(dataset.vocab) model = LSTM(vocab_size, embedding_dim, batch_size=batch_size, hidden_dim=hidden_dim, lstm_num_layers=num_layers, combine=combine, dropout=dropout) elif model_name == "Bert": model = BertForSequenceClassification.from_pretrained( "bert-base-uncased", num_labels=3) train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, collate_fn=bert_collate) val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, collate_fn=bert_collate) #device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #LOSS : weighted cross entropy loss, by class counts of other classess if weighted_loss: weights = torch.tensor([0.9414, 0.2242, 0.8344], device=device) else: weights = torch.ones(3, device=device) #weights = torch.tensor([1.0, 1.0, 1.0], device = device) #get_loss_weights(train_data).to(device) # not to run again criterion = nn.CrossEntropyLoss(weight=weights) if soft_labels: criterion = weighted_soft_cross_entropy #latent model if unsupervised: vocab_size = len(dataset.vocab) criterion = nn.CrossEntropyLoss(weight=weights, reduction='none') model = Rationalisation_model(vocab_size, embedding_dim=embedding_dim, model=model_name, batch_size=batch_size, combine=combine, criterion=criterion) if not model_name == "Bert": model.embedding.weight.data.copy_(dataset.vocab.vectors) if combine: model.embedding_glove.weight.data.copy_(dataset.glove.vectors) #model to device model.to(device) #optimiser optimizer = optim.Adam(model.parameters(), lr=learning_rate) if model_name == "Bert": optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False) # Linear scheduler for adaptive lr scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps) else: scheduler = None plot_log = defaultdict(list) for epoch in range(num_epochs): #train and validate epoch_loss, epoch_acc = train_epoch(model, train_loader, optimizer, criterion, device, soft_labels=soft_labels, weights=weights, scheduler=scheduler, unsupervised=unsupervised) val_loss, val_acc = evaluate_epoch(model, val_loader, criterion, device, soft_labels=soft_labels, weights=weights, unsupervised=unsupervised) #save for plotting for name, point in zip( ["train_loss", "train_accuracy", "val_loss", "val_accuracy"], [epoch_loss, epoch_acc, val_loss, val_acc]): plot_log[f'{name}'] = point #realtime feel print(f'Epoch: {epoch+1}') print( f'\tTrain Loss: {epoch_loss:.5f} | Train Acc: {epoch_acc*100:.2f}%' ) print(f'\t Val. Loss: {val_loss:.5f} | Val. 
Acc: {val_acc*100:.2f}%') sample_sentences_and_z(model, train_loader, device, dataset.vocab) #save plot results_directory = f'plots/{experiment_number}' os.makedirs(results_directory, exist_ok=True) for name, data in plot_log.items(): save_plot(data, name, results_directory) #save model torch.save(model, os.path.join(results_directory, 'model_cnn.pth')) #confusion matrix and all that fun loss, acc, predictions, ground_truth = evaluate_epoch( model, val_loader, criterion, device, is_final=True, soft_labels=soft_labels, weights=weights, unsupervised=unsupervised) conf_matrix = confusion_matrix(ground_truth, predictions) class_report = classification_report(ground_truth, predictions) print('\nFinal Loss and Accuracy\n----------------\n') print(f'\t Val. Loss: {loss:.5f} | Val. Acc: {acc*100:.2f}%') print('\nCONFUSION MATRIX\n----------------\n') print(conf_matrix) print('\nCLASSSIFICATION REPORT\n----------------------\n') print(class_report) plot_confusion_matrix(ground_truth, predictions, classes=["Hate speech", "Offensive", "Neither"], normalize=False, title='Confusion matrix') plt.show()
        else:
            input_ids = torch.tensor(tokenizer.encode(s), device=device).unsqueeze(0)  # Batch size 1
            results.append(
                clf.forward(input_ids)[0].detach().cpu().numpy().flatten())
    return np.array(results).reshape(-1, 2)


print('loading models and data...')
default = 'bert-base-uncased'
mdir = '/scratch/users/vision/chandan/pacmed/glue/SST-2-3epoch'  # '/scratch/users/vision/chandan/pacmed/glue/SST-2-middle/'
device = 'cpu'
tokenizer = BertTokenizer.from_pretrained(mdir)
clf = BertForSequenceClassification.from_pretrained(mdir).eval().to(device)
masked_predictor = BertForMaskedLM.from_pretrained(default).eval().to(device)

lines = open('data/stsa.binary.test', 'r').read()
lines = [line for line in lines.split('\n') if line != '']
classes = [int(line[0]) for line in lines]
reviews = [line[2:] for line in lines]

num_reviews = 1821  # 1821
save_freq = 1
scores_iid = {}
scores_conditional = {}
scores_remove = {}
scores_lime = {}

# loop over reviews
        return text, label


rate_train_dataset = RateDataset(train_df)
print(f"Train dataset: {len(rate_train_dataset)}")
itr_num = len(rate_train_dataset)
train_loader = DataLoader(rate_train_dataset, batch_size=16, shuffle=True, num_workers=2)

device = torch.device("cuda:7")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased')
#config = BertConfig.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)
#model = BertForMultiLabelSequenceClassification(config)
model.to(device)
optimizer = Adam(model.parameters(), lr=1e-6)

itr = 1
p_itr = 500
s_itr = 10000
epochs = 5
total_loss = 0
total_len = 0
total_correct = 0


def save_checkpoint(model, save_pth):
def main(log_in_file, lm_path, lm_type, data_path, usegpu, n_fold, total_step, eval_every, early_stop, lr, weight_decay, lr_decay_in_layers, wd_decay_in_layers, max_length, max_title_rate, content_head_rate, batch_size, lr_scheduler_type, input_pattern, clean_method, warmup_rate, classifier_dropout, classifier_active, seed): arg_name_value_pairs = deepcopy(locals()) prefix = time.strftime('%Y%m%d_%H%M') logger = logging.getLogger('default') formatter = logging.Formatter("%(asctime)s %(message)s") if log_in_file: handler1 = logging.FileHandler(prefix + '.log') handler1.setFormatter(formatter) handler1.setLevel(logging.DEBUG) logger.addHandler(handler1) handler2 = logging.StreamHandler() handler2.setFormatter(formatter) handler2.setLevel(logging.DEBUG) logger.addHandler(handler2) logger.setLevel(logging.DEBUG) for arg_name, arg_value in arg_name_value_pairs.items(): logger.info(f'{arg_name}: {arg_value}') global tokenizer if lm_type == 'bert': tokenizer = BertTokenizer(os.path.join(lm_path, 'vocab.txt')) else: tokenizer = XLNetTokenizer(os.path.join(lm_path, 'spiece.model')) global PAD, PAD_t, CLS_t, SEP_t PAD_t = '<pad>' CLS_t = '<cls>' SEP_t = '<sep>' PAD = tokenizer.convert_tokens_to_ids([PAD_t])[0] logger.info(f'padding token is {PAD}') processed_train = preprocess( os.path.join(data_path, 'Train_DataSet.csv'), os.path.join(data_path, 'Train_DataSet_Label.csv'), tokenizer, max_length, input_pattern, clean_method, max_title_rate, content_head_rate, logger) processed_test = preprocess(os.path.join(data_path, 'Test_DataSet.csv'), False, tokenizer, max_length, input_pattern, clean_method, max_title_rate, content_head_rate, logger) logger.info('seed everything and create model') seed_everything(seed) no_decay = ['.bias', 'LayerNorm.bias', 'LayerNorm.weight'] if lm_type == 'xlnet': model = XLNetForSequenceClassification.from_pretrained( lm_path, num_labels=3, summary_last_dropout=classifier_dropout) if classifier_active == 'relu': model.sequence_summary.activation = nn.ReLU() if usegpu: model = model.cuda() model_layer_names = [ 'transformer.mask_emb', 'transformer.word_embedding.weight' ] model_layer_names += [ f'transformer.layer.{i}.' 
for i in range(model.config.n_layer) ] model_layer_names += ['sequence_summary.summary', 'logits_proj'] else: model = BertForSequenceClassification.from_pretrained( lm_path, num_labels=3, hidden_dropout_prob=classifier_dropout) if classifier_active == 'relu': model.bert.pooler.activation = nn.ReLU() if usegpu: model = model.cuda() model_layer_names = ['bert.embeddings'] model_layer_names += [ 'bert.encoder.layer.{}.'.format(i) for i in range(model.config.num_hidden_layers) ] model_layer_names += ['bert.pooler', 'classifier'] optimizer = optimizer = AdamW([{ 'params': [ p for n, p in model.named_parameters() if layer_name in n and not any(nd in n for nd in no_decay) ], 'lr': lr * (lr_decay_in_layers**i), 'weight_decay': weight_decay * (wd_decay_in_layers**i) } for i, layer_name in enumerate(model_layer_names[::-1])] + [{ 'params': [ p for n, p in model.named_parameters() if layer_name in n and any(nd in n for nd in no_decay) ], 'lr': lr * (lr_decay_in_layers**i), 'weight_decay': .0 } for i, layer_name in enumerate(model_layer_names[::-1])]) if lr_scheduler_type == 'linear': lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_rate, t_total=total_step) elif lr_scheduler_type == 'constant': lr_scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmup_rate) else: raise ValueError model_state_0 = deepcopy(model.state_dict()) optimizer_state_0 = deepcopy(optimizer.state_dict()) test_iter = get_data_iter(processed_test, batch_size * 4, collect_test_func, shuffle=False) pred = np.zeros((len(processed_test), 3)) val_scores = [] for fold_idx, (train_idx, val_idx) in enumerate( KFold(n_splits=n_fold, shuffle=True, random_state=seed).split(processed_train)): model.load_state_dict(model_state_0) optimizer.load_state_dict(optimizer_state_0) if lr_scheduler_type == 'linear': lr_scheduler = WarmupLinearSchedule(optimizer, warmup_steps=warmup_rate, t_total=total_step) elif lr_scheduler_type == 'constant': lr_scheduler = WarmupConstantSchedule(optimizer, warmup_steps=warmup_rate) else: raise ValueError train_iter = get_data_iter([processed_train[i] for i in train_idx], batch_size, collect_func) val_iter = get_data_iter([processed_train[i] for i in val_idx], batch_size * 4, collect_func, shuffle=False) best_model, best_score = training(model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, train_iter=train_iter, val_iter=val_iter, total_step=total_step, tokenizer=tokenizer, usegpu=usegpu, eval_every=eval_every, logger=logger, early_stop=early_stop, fold_idx=fold_idx) model.load_state_dict(best_model) val_scores.append(best_score) pred += predict(model, test_iter, usegpu) logger.info(f'average: {np.mean(val_scores):.6f}') pred = pred / n_fold prob_df = pd.DataFrame() submit = pd.DataFrame() submit['id'] = [i['id'] for i in processed_test] submit['label'] = pred.argmax(-1) prob_df['id'] = [i['id'] for i in processed_test] prob_df['0'] = pred[:, 0] prob_df['1'] = pred[:, 1] prob_df['2'] = pred[:, 2] submit.to_csv(f'submit_{prefix}.csv', index=False) prob_df.to_csv(f'probability_{prefix}.csv', index=False)
def main(): #os.environ["CUDA_VISIBLE_DEVICES"] = "0" torch.set_num_threads(1) parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=False, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=False, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) ## Other parameters parser.add_argument( "--cache_dir", default="", type=str, help= "Where do you want to store the pre-trained models downloaded from s3") parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval or not.") parser.add_argument("--eval_on", default="dev", help="Whether to run eval on the dev set or test set.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=64, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--fp16_opt_level', type=str, default='O1', help= "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() #processors = FormationProcessor if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) processor = FormationProcessor() tokenizer = BertTokenizer.from_pretrained( '/home/ypd-19-2/SpERT/model/bertbase-20210122T060007Z-001/bertbase') train_examples = None num_train_optimization_steps = 0 if args.do_train: train_examples = processor.get_train_examples() num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) if args.local_rank not in [-1, 0]: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab # Prepare model config = BertConfig.from_pretrained(args.bert_model, num_labels=1, finetuning_task=args.task_name) model = BertForSequenceClassification.from_pretrained(args.bert_model, from_tf=False, config=config) if args.local_rank == 0: torch.distributed.barrier( ) # Make sure only the first process in distributed training will download model & vocab model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] warmup_steps = int(args.warmup_proportion * num_train_optimization_steps) optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = 
WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=num_train_optimization_steps) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) global_step = 0 nb_tr_steps = 0 tr_loss = 0 #label_map = {i: label for i, label in enumerate(label_list, 1)} if args.do_train: train_features = convert_examples_to_features(train_examples, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() scheduler.step() # Update learning rate schedule model.zero_grad() global_step += 1 # Save a trained model and the associated configuration model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) #label_map = {i : label for i, label in enumerate(label_list,1)} model_config = { "bert_model": args.bert_model, "do_lower": args.do_lower_case, "max_seq_length": args.max_seq_length, "num_labels": 1 } json.dump( model_config, open(os.path.join(args.output_dir, "model_config.json"), "w")) # Load a trained model and config that you have fine-tuned else: # Load a trained model and vocabulary that you have fine-tuned model = BertForSequenceClassification.from_pretrained(args.output_dir) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): loss_test = nn.L1Loss() if args.eval_on == "dev": eval_examples = processor.get_dev_examples() elif args.eval_on == "test": eval_examples = processor.get_test_examples() else: raise ValueError("eval on dev or test set only") eval_features = convert_examples_to_features(eval_examples, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 y_true = [] y_pred = [] for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)[0] #logits = torch.argmax(F.log_softmax(logits,dim=2),dim=2) input_mask = input_mask.to('cpu').numpy() batch_loss = loss_test(logits, label_ids) eval_loss += batch_loss y_true.append(label_ids) y_pred.append(logits) print('eval_loss') print(eval_loss / len(eval_dataloader))
import torch
from pytorch_transformers import BertForSequenceClassification, BertTokenizer
import numpy as np

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def tokenization_step(input_seq, tok=tokenizer, pad=True):
    tokenized_mapped = tok.convert_tokens_to_ids(tok.tokenize(input_seq))
    essay_size = len(tokenized_mapped)
    if pad:
        return (torch.LongTensor(np.array([101] + tokenized_mapped + [102]
                                          + [0] * (510 - essay_size)).reshape(1, -1)),
                torch.LongTensor(np.array([1] * (essay_size + 2)
                                          + [0] * (510 - essay_size)).reshape(1, -1)))
    else:
        return (torch.LongTensor(np.array([101] + tokenized_mapped + [102]).reshape(-1, 1)),
                torch.LongTensor(np.array([1] * (essay_size + 2)).reshape(1, -1)))


x, mask = tokenization_step("Hello World", pad=True)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
y = model(x, attention_mask=mask)
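# A minimal follow-on sketch for the snippet above: since no labels are passed, the
# forward pass returns a tuple whose first element is the logits, so class
# probabilities and a predicted label can be read off with softmax/argmax.
probs = torch.softmax(y[0], dim=-1)
predicted_class = int(torch.argmax(probs, dim=-1).item())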
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    if compute_acc:
        print("correct:", correct, "total:", total)
        acc = correct / total
        return predictions, acc
    return predictions


# 4. Train the downstream-task model
# Define the model
device = torch.device("cuda:4" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=3).to(device)

# Run a prediction pass with the randomly initialized classification head
print("*" * 50)
print(model.config)
print("*" * 50)
_, acc = get_predictions(model, train_loader, compute_acc=True)
print("classification acc:", acc)


# Count the parameters and select the ones that require gradient updates
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad == True]


model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print("Number of parameters in the whole classification model:", sum(p.numel() for p in model_params))
print("Number of parameters in the linear classifier:", sum(p.numel() for p in clf_params))
        elif use_postag:
            idx_matches = svsm.match_senses(idx_vec, None, postags[idx], topn=None)
        else:
            idx_matches = svsm.match_senses(idx_vec, None, None, topn=1)
        matches.append(idx_matches)
    return matches, word_ind, tokens


BERT_BASE_DIR = 'bert_torch_model/'
vec_path = 'lmms_1024.bert-large-cased.npz'

model = BertForSequenceClassification.from_pretrained(BERT_BASE_DIR, output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(BERT_BASE_DIR, do_lower_case=True)
# To load the bert model on GPU
#model = model.cuda()
model.eval()

senses_vsm = SensesVSM(vec_path, normalize=True)


@app.route("/synset_processing", methods=['POST'])
def predict_synset():
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--train_data", default=None, type=str, required=True, help="The input training data file name." " Should be the .tsv file (or other data file) for the task.") parser.add_argument( "--val_data", default=None, type=str, required=True, help="The input validation data file name." " Should be the .tsv file (or other data file) for the task.") parser.add_argument( "--test_data", default=None, type=str, required=True, help="The input test data file name." " Should be the .tsv file (or other data file) for the task.") parser.add_argument("--log_path", default=None, type=str, required=True, help="The log file path.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) parser.add_argument("--save_model", default=False, action='store_true', help="Whether to save the model.") parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--embed_mode", default=None, type=str, required=True, help="The embedding type selected in the list: all, note, chunk, no.") parser.add_argument("--c", type=float, required=True, help="The parameter c for scaled adjusted mean method") parser.add_argument("--task_name", default="BERT_mortality_am", type=str, required=True, help="The name of the task.") ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--max_chunk_num", default=64, type=int, help= "The maximum total input chunk numbers after WordPiece tokenization.") parser.add_argument("--train_batch_size", default=1, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=1, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument( "--warmup_proportion", default=0.0, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--num_train_epochs", default=3, type=int, help="Total number of training epochs to perform.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." 
) args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.save_model: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) LOG_PATH = args.log_path MAX_LEN = args.max_seq_length config = DotMap() config.hidden_dropout_prob = 0.1 config.layer_norm_eps = 1e-12 config.initializer_range = 0.02 config.max_note_position_embedding = 1000 config.max_chunk_position_embedding = 1000 config.embed_mode = args.embed_mode config.layer_norm_eps = 1e-12 config.hidden_size = 768 config.task_name = args.task_name write_log( ("New Job Start! \n" "Data directory: {}, Directory Code: {}, Save Model: {}\n" "Output_dir: {}, Task Name: {}, embed_mode: {}\n" "max_seq_length: {}, max_chunk_num: {}\n" "train_batch_size: {}, eval_batch_size: {}\n" "learning_rate: {}, warmup_proportion: {}\n" "num_train_epochs: {}, seed: {}, gradient_accumulation_steps: {}" ).format(args.data_dir, args.data_dir.split('_')[-1], args.save_model, args.output_dir, config.task_name, config.embed_mode, args.max_seq_length, args.max_chunk_num, args.train_batch_size, args.eval_batch_size, args.learning_rate, args.warmup_proportion, args.num_train_epochs, args.seed, args.gradient_accumulation_steps), LOG_PATH) content = "config setting: \n" for k, v in config.items(): content += "{}: {} \n".format(k, v) write_log(content, LOG_PATH) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() write_log("Number of GPU is {}".format(n_gpu), LOG_PATH) for i in range(n_gpu): write_log(("Device Name: {}," "Device Capability: {}").format( torch.cuda.get_device_name(i), torch.cuda.get_device_capability(i)), LOG_PATH) train_file_path = os.path.join(args.data_dir, args.train_data) val_file_path = os.path.join(args.data_dir, args.val_data) test_file_path = os.path.join(args.data_dir, args.test_data) train_df = pd.read_csv(train_file_path) val_df = pd.read_csv(val_file_path) test_df = pd.read_csv(test_file_path) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=True) write_log("Tokenize Start!", LOG_PATH) train_labels, train_inputs, train_masks, train_note_ids = Tokenize_with_note_id( train_df, MAX_LEN, tokenizer) validation_labels, validation_inputs, validation_masks, validation_note_ids = Tokenize_with_note_id( val_df, MAX_LEN, tokenizer) test_labels, test_inputs, test_masks, test_note_ids = Tokenize_with_note_id( test_df, MAX_LEN, tokenizer) write_log("Tokenize Finished!", LOG_PATH) train_inputs = torch.tensor(train_inputs) validation_inputs = torch.tensor(validation_inputs) test_inputs = torch.tensor(test_inputs) train_labels = torch.tensor(train_labels) validation_labels = torch.tensor(validation_labels) test_labels = torch.tensor(test_labels) train_masks = torch.tensor(train_masks) validation_masks = torch.tensor(validation_masks) test_masks = torch.tensor(test_masks) write_log(("train dataset size is %d,\n" "validation dataset size is %d,\n" "test dataset size is %d") % (len(train_inputs), len(validation_inputs), len(test_inputs)), LOG_PATH) (train_labels, train_inputs, train_masks, train_ids, train_note_ids, train_chunk_ids) = concat_by_id_list_with_note_chunk_id( train_df, train_labels, train_inputs, train_masks, train_note_ids, MAX_LEN) (validation_labels, validation_inputs, validation_masks, validation_ids, 
validation_note_ids, validation_chunk_ids) = concat_by_id_list_with_note_chunk_id( val_df, validation_labels, validation_inputs, validation_masks, validation_note_ids, MAX_LEN) (test_labels, test_inputs, test_masks, test_ids, test_note_ids, test_chunk_ids) = concat_by_id_list_with_note_chunk_id( test_df, test_labels, test_inputs, test_masks, test_note_ids, MAX_LEN) model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=2) model.to(device) if n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] num_train_steps = int( len(train_df) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) m = torch.nn.Softmax(dim=1) start = time.time() # Store our loss and accuracy for plotting train_loss_set = [] # Number of training epochs (authors recommend between 2 and 4) epochs = args.num_train_epochs train_batch_generator = mask_batch_generator(args.max_chunk_num, train_inputs, train_labels, train_masks) validation_batch_generator = mask_batch_generator(args.max_chunk_num, validation_inputs, validation_labels, validation_masks) write_log("Training start!", LOG_PATH) # trange is a tqdm wrapper around the normal python range with torch.autograd.set_detect_anomaly(True): for epoch in trange(epochs, desc="Epoch"): # Training # Set our model to training mode (as opposed to evaluation mode) model.train() # Tracking variables tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 # Train the data for one epoch tr_ids_num = len(train_ids) tr_batch_loss = [] for step in range(tr_ids_num): b_input_ids, b_labels, b_input_mask = next( train_batch_generator) b_input_ids = b_input_ids.to(device) b_input_mask = b_input_mask.to(device) b_labels = b_labels.repeat(b_input_ids.shape[0]).to(device) # Forward pass outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) loss, logits = outputs[:2] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
train_loss_set.append(loss.item()) # Backward pass loss.backward() # Update parameters and take a step using the computed gradient if (step + 1) % args.train_batch_size == 0: optimizer.step() optimizer.zero_grad() train_loss_set.append(np.mean(tr_batch_loss)) tr_batch_loss = [] # Update tracking variables tr_loss += loss.item() nb_tr_examples += b_input_ids.size(0) nb_tr_steps += 1 write_log("Train loss: {}".format(tr_loss / nb_tr_steps), LOG_PATH) # Validation # Put model in evaluation mode to evaluate loss on the validation set model.eval() # Tracking variables eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 # Evaluate data for one epoch ev_ids_num = len(validation_ids) for step in range(ev_ids_num): with torch.no_grad(): b_input_ids, b_labels, b_input_mask = next( validation_batch_generator) b_input_ids = b_input_ids.to(device) b_input_mask = b_input_mask.to(device) b_labels = b_labels.repeat(b_input_ids.shape[0]) outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) # Move logits and labels to CPU logits = outputs[-1] logits = m(logits).detach().cpu().numpy()[:, 1] label_ids = b_labels.numpy() tmp_eval_accuracy = flat_accuracy(logits, label_ids) eval_accuracy += tmp_eval_accuracy nb_eval_steps += 1 write_log( "Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps), LOG_PATH) output_checkpoints_path = os.path.join( args.output_dir, "bert_fine_tuned_with_note_checkpoint_%d.pt" % epoch) if args.save_model: if n_gpu > 1: torch.save( { 'epoch': epoch, 'model_state_dict': model.module.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': loss, }, output_checkpoints_path) else: torch.save( { 'epoch': epoch, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), 'loss': loss, }, output_checkpoints_path) end = time.time() write_log("total training time is: {}s".format(end - start), LOG_PATH) fig1 = plt.figure(figsize=(15, 8)) plt.title("Training loss") plt.xlabel("Chunk Batch") plt.ylabel("Loss") plt.plot(train_loss_set) if args.save_model: output_fig_path = os.path.join( args.output_dir, "bert_fine_tuned_with_note_training_loss.png") plt.savefig(output_fig_path, dpi=fig1.dpi) output_model_state_dict_path = os.path.join( args.output_dir, "bert_fine_tuned_with_note_state_dict.pt") if n_gpu > 1: torch.save(model.module.state_dict(), output_model_state_dict_path) else: torch.save(model.state_dict(), output_model_state_dict_path) write_log("Model saved!", LOG_PATH) else: output_fig_path = os.path.join( args.output_dir, "bert_fine_tuned_with_note_training_loss_{}_{}.png".format( args.seed, args.data_dir.split('_')[-1])) plt.savefig(output_fig_path, dpi=fig1.dpi) write_log("Model not saved as required", LOG_PATH) # Prediction on test set # Put model in evaluation mode model.eval() # Tracking variables predictions, true_labels, test_adm_ids = [], [], [] # Predict te_ids_num = len(test_ids) for step in range(te_ids_num): b_input_ids = test_inputs[step][-args.max_chunk_num:, :].to(device) b_input_mask = test_masks[step][-args.max_chunk_num:, :].to(device) b_labels = test_labels[step].repeat(b_input_ids.shape[0]) # Telling the model not to compute or store gradients, saving memory and speeding up prediction with torch.no_grad(): # Forward pass, calculate logit predictions outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) # Move logits and labels to CPU logits = outputs[-1] logits = m(logits).detach().cpu().numpy()[:, 1] label_ids = b_labels.numpy() adm_ids = 
        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)
        test_adm_ids.append(adm_ids)

    try:
        flat_logits = [item for sublist in predictions for item in sublist]
    except TypeError:
        flat_logits = [
            item for sublist in predictions for item in test_func(sublist)
        ]
    # np.int is deprecated in recent NumPy releases; the builtin int is equivalent here
    flat_predictions = (np.array(flat_logits) >= 0.5).astype(int)
    try:
        flat_true_labels = [
            item for sublist in true_labels for item in sublist
        ]
    except TypeError:
        flat_true_labels = [
            item for sublist in true_labels for item in test_func(sublist)
        ]
    try:
        flat_adm_ids = [item for sublist in test_adm_ids for item in sublist]
    except TypeError:
        flat_adm_ids = [
            item for sublist in test_adm_ids for item in test_func(sublist)
        ]

    output_chunk_df = pd.DataFrame({
        'logits': flat_logits,
        'pred_label': flat_predictions,
        'label': flat_true_labels,
        'Adm_ID': flat_adm_ids
    })
    if args.save_model:
        output_chunk_df.to_csv(os.path.join(args.output_dir,
                                            'test_chunk_predictions.csv'),
                               index=False)
    else:
        output_chunk_df.to_csv(os.path.join(
            args.output_dir,
            'test_chunk_predictions_{}_{}.csv'.format(
                args.seed, args.data_dir.split('_')[-1])),
                               index=False)

    output_df = get_patient_score(output_chunk_df, args.c)
    if args.save_model:
        output_df.to_csv(os.path.join(args.output_dir, 'test_predictions.csv'),
                         index=False)
    else:
        output_df.to_csv(os.path.join(
            args.output_dir,
            'test_predictions_{}_{}.csv'.format(args.seed,
                                                args.data_dir.split('_')[-1])),
                         index=False)

    write_performance(output_df['label'].values, output_df['pred_label'].values,
                      output_df['logits'].values, config, args)
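# ---------------------------------------------------------------------------
# The chunk-level training script above calls several helpers that are not
# defined in this file (write_log, flat_accuracy, get_patient_score, among
# others). The functions below are illustrative sketches only, written under
# the assumption that `logits` holds positive-class probabilities per chunk
# and that the patient-level score combines the chunk maximum and mean with
# the scaling factor `c`; the real implementations may differ.
# ---------------------------------------------------------------------------
import time

import numpy as np
import pandas as pd


def write_log(message, log_path):
    # Append a timestamped line to the training log file.
    with open(log_path, 'a') as log_file:
        log_file.write('{}\t{}\n'.format(
            time.strftime('%Y-%m-%d %H:%M:%S'), message))


def flat_accuracy(probs, labels):
    # Chunk-level accuracy: threshold the positive-class probabilities at 0.5
    # and compare against the (repeated) admission-level label.
    preds = (np.asarray(probs) >= 0.5).astype(int)
    return np.mean(preds == np.asarray(labels))


def get_patient_score(chunk_df, c):
    # Aggregate chunk-level probabilities into one score per admission.
    # A scaled max/mean combination is one plausible choice: the n / c term
    # gives admissions with more chunks a larger weight on the mean.
    rows = []
    for adm_id, group in chunk_df.groupby('Adm_ID'):
        probs = group['logits'].values
        n = len(probs)
        score = (probs.max() + probs.mean() * n / c) / (1 + n / c)
        rows.append({
            'Adm_ID': adm_id,
            'logits': score,
            'pred_label': int(score >= 0.5),
            'label': int(group['label'].iloc[0]),
        })
    return pd.DataFrame(rows)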
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .jsonl files (or other data files) for the task.")
    parser.add_argument("--model_type", default=None, type=str, required=True,
                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument('--num_choices', type=int, default=4,
                        help="Number of answer choices (will pad if less, throw exception if more)")
    parser.add_argument("--config_name", default="", type=str,
                        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument("--tokenizer_name", default="", type=str,
                        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument("--cache_dir", default="", type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after tokenization. Sequences longer "
                             "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--evaluate_during_training", action='store_true',
                        help="Run evaluation during training at each logging step.")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay", default=0.0, type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--logging_steps', type=int, default=50,
                        help="Log every X update steps.")
    parser.add_argument('--save_steps', type=int, default=50,
                        help="Save checkpoint every X update steps.")
    parser.add_argument("--eval_all_checkpoints", action='store_true',
                        help="Evaluate all checkpoints starting with the same prefix as model_name and ending with step number")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument('--overwrite_cache', action='store_true',
                        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
    parser.add_argument('--fp16_opt_level', type=str, default='O1',
                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip', type=str, default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="For distant debugging.")

    ###########################
    # ### KYLE'S NEW SETTINGS #
    ###########################
    parser.add_argument('--dev_name2', default='',
                        help="the name of the dev experiment")
    parser.add_argument('--dev_name', default='',
                        help="the name of the dev experiment")
    parser.add_argument("--remove_model", default=False, action='store_true',
                        help="Remove the pytorch model after done training")
    parser.add_argument("--override", default=False, action='store_true',
                        help="Override the existing directory")
    parser.add_argument("--no_save_checkpoints", default=False, action='store_true',
                        help="Don't save the model after each checkpoint")
    parser.add_argument("--run_existing", default='',
                        help="Run in eval mode with an existing model, points to output_model_file")
    parser.add_argument("--bert_config", default='',
                        help="Location of the existing BERT configuration")
    parser.add_argument("--inoculate", default='',
                        help="Inoculate/continue training the model with challenge dataset (should contain pointer to existing fine-tuned model)")
    parser.add_argument("--intermediate_model", default='',
                        help="Use the BERT weights of an intermediate BERT model (trained on some other task)")
    parser.add_argument("--exclude_dataset", default='', type=str,
                        help="Datasets to exclude (in case of Aristo dataset with multiple datasets built in)")
    parser.add_argument("--train_name", default='', type=str,
                        help="the name of the train experiment")
    parser.add_argument("--limit_train", default='', type=str,
                        help="(for multi-dataset datasets) the datasets to use for training")
    parser.add_argument("--limit_test", default='', type=str,
                        help="(for multi-dataset datasets) the datasets to use for testing")

    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.override:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --override to overcome."
            .format(args.output_dir))
    elif os.path.exists(args.output_dir) and args.override:
        shutil.rmtree(args.output_dir)
        os.makedirs(args.output_dir)
    else:
        os.makedirs(args.output_dir)

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    ## create a backup of the run script
    with open(os.path.join(args.output_dir, "run.sh"), 'w') as runner:
        print("python -m mcqa_datasets.arc_mc %s" % ' '.join(sys.argv[1:]),
              file=runner)

    # Setup logging
    log_file = os.path.join(args.output_dir, "logger.log")
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename=log_file)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=args.num_choices)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config)
    logger.info('loaded a pre-trained model..')

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    reader = ARCExampleReader()

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                reader,
                                                tokenizer,
                                                evaluate=False)

        ## use an existing model or inoculate
        ## continue to train a model
        if args.inoculate:
            logger.info('Trying to load a pre-trained model..')
            model = model_class.from_pretrained(args.inoculate)
            logger.info('Finished loading..')
            #tokenizer = model_class.from_pretrained(args.inoculate)
            model.to(args.device)
        ## use an existing model, similar to the STILT idea; currently only works for BertForSequence
        elif args.intermediate_model:
            intermediate_model = BertForSequenceClassification.from_pretrained(
                args.intermediate_model)
            #intermediate_tokenizer = tokenizer_class.from_pretrained(args.intermediate_model)
            intermediate_model.to(args.device)
            ## just switch the bert weights..?
            model.bert = intermediate_model.bert
            #intermediate_model = model_class.from_trained()

        global_step, tr_loss = train(args,
                                     train_dataset=train_dataset,
                                     model=model,
                                     reader=reader,
                                     tokenizer=tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    ###################################
    # ## EVALUATION (KYLE's VERSION) #
    ###################################
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        ## run an existing model
        if args.run_existing:
            model = model_class.from_pretrained(args.run_existing)
            tokenizer = tokenizer_class.from_pretrained(args.run_existing)
            logger.info('Evaluating using an existing model: %s' %
                        args.run_existing)
        else:
            model = model_class.from_pretrained(args.output_dir)
            tokenizer = tokenizer_class.from_pretrained(args.output_dir)
            logger.info('Evaluating using the trained model: %s' %
                        args.output_dir)
        try:
            model.to(args.device)
        except:
            raise ValueError(
                'No model found, did you not train or link up with pre-trained model?'
            )
        # tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
        # model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
        # model.to(args.device)

        ## the actual evaluation
        global_step = ""
        result = evaluate(args, model, reader, tokenizer, prefix=global_step)
        result = dict(
            (k + '_{}'.format(global_step), v) for k, v in result.items())
        results.update(result)

        ##
        if args.dev_name2:
            global_step = ""
            results = {}
            logger.info('Now running on second development/held-out set...')
            ## update link
            args.dev_name = args.dev_name2
            result = evaluate(args,
                              model,
                              reader,
                              tokenizer,
                              prefix=global_step,
                              next_fname="next")
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    # Evaluation
    # results = {}
    # if args.do_eval and args.local_rank in [-1, 0]:
    #     checkpoints = [args.output_dir]
    #     if args.eval_all_checkpoints:
    #         checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
    #         logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
    #     logger.info("Evaluate the following checkpoints: %s", checkpoints)
    #     for checkpoint in checkpoints:
    #         global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
    #         model = model_class.from_pretrained(checkpoint)
    #         model.to(args.device)
    #         result = evaluate(args, model, reader, tokenizer, prefix=global_step)
    #         result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
    #         results.update(result)

    if args.remove_model:
        logger.info('REMOVING THE MODEL!')
        try:
            os.remove(os.path.join(args.output_dir, "pytorch_model.bin"))
            os.remove(os.path.join(args.output_dir, "vocab.txt"))
        except Exception as e:
            logger.error(e, exc_info=True)

    return results
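# ---------------------------------------------------------------------------
# The multiple-choice runner above relies on module-level objects that are not
# shown here: MODEL_CLASSES, ALL_MODELS and set_seed. The definitions below are
# a minimal sketch in the style of the pytorch_transformers example scripts;
# the model class chosen for each model type (and any additional entries) is
# an assumption, not the script's actual mapping.
# ---------------------------------------------------------------------------
import random

import numpy as np
import torch
from pytorch_transformers import BertConfig, BertForMultipleChoice, BertTokenizer

MODEL_CLASSES = {
    'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
}

# Shortcut names accepted by --model_name_or_path, gathered from the config
# archive maps of the registered model types.
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys())
                  for conf, _, _ in MODEL_CLASSES.values()), ())


def set_seed(args):
    # Seed python, numpy and torch (all GPUs) for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)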
def main():
    # parse the arguments
    parser = argparse.ArgumentParser(description='Process some integers.')
    # required parameters
    parser.add_argument("func", default='help', type=str, help="train/test/help")
    parser.add_argument("--data_dir", default="data", type=str, required=False)
    parser.add_argument("--task_name", default=None, type=str, required=False)
    parser.add_argument("--tag", default=None, type=str, required=False)
    parser.add_argument("--input_dir", default=None, type=str, required=False)
    parser.add_argument("--output_dir", default=None, type=str, required=False)
    parser.add_argument("--model_name", default="bert-base-uncased", type=str, required=False)
    args = parser.parse_args()

    # do the func
    if args.func == "help":
        print("train to generate model, test to evaluate model")
    else:
        # gather parameters
        tag = args.tag
        if tag is None:
            tag = args.tag = str(uuid.uuid1())
        print("params: {}\ntag: {}".format(str(args), tag))
        # the parser above defines no --no_cuda flag, so use CUDA whenever it is available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = args.n_gpu = torch.cuda.device_count()
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
            datefmt='%m/%d/%Y %H:%M:%S',
            level=logging.INFO)
        logger.warning("device: %s, n_gpu: %s", device, n_gpu)
        set_seed(args)
        args.task_name = args.task_name.lower()
        # TODO: task-specific settings (e.g. the number of labels)
        num_labels = None
        if args.func == "train":
            # train on the task
            # gather parameters
            output_dir = args.output_dir = args.output_dir if args.output_dir else "model"
            if os.path.exists(output_dir) and os.listdir(output_dir):
                raise ValueError("Output dir exists")
            config = BertConfig.from_pretrained(args.model_name,
                                                num_labels=num_labels,
                                                finetuning_task=args.task_name)
            tokenizer = BertTokenizer.from_pretrained(
                args.model_name, do_lower_case="uncased" in args.model_name)
            model = BertForSequenceClassification.from_pretrained(
                args.model_name, from_tf=False, config=config)
        elif args.func == "test":
            pass  # test on the task
        else:
            raise NotImplementedError
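# ---------------------------------------------------------------------------
# The "test" branch above is left as a pass/TODO. One hypothetical way to fill
# it in, assuming the train branch eventually saves the fine-tuned model with
# save_pretrained() into output_dir, is to reload the model and tokenizer from
# that directory and switch to eval mode. The helper name, the directory
# layout and the import source are assumptions for illustration only.
# ---------------------------------------------------------------------------
from pytorch_transformers import BertForSequenceClassification, BertTokenizer


def load_finetuned_for_eval(output_dir="model"):
    # Reload a model previously saved with save_pretrained() and prepare it
    # for inference; the caller would then run its own evaluation loop.
    tokenizer = BertTokenizer.from_pretrained(output_dir)
    model = BertForSequenceClassification.from_pretrained(output_dir)
    model.eval()
    return model, tokenizer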