# In[24]:
output_model_file = "bert_pytorch.bin"
lr = 2e-5
batch_size = 32
accumulation_steps = 1

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model = BertForSequenceClassification.from_pretrained("../working", cache_dir=None, num_labels=len(y_columns))
model.zero_grad()
model = model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

train = train_dataset
num_train_optimization_steps = int(EPOCHS * len(train) / batch_size / accumulation_steps)
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=lr,
                     warmup=0.05,
                     t_total=num_train_optimization_steps)
def predict_civility(ds, **kwargs): global db print(os.getcwd()) ls = [] civility = False while civility: data = [] for doc in col.distinct('message', {'civility_class': None}): if len(data) < 2000: data.append({'message': doc}) else: break if len(data) > 0: print(len(data)) df = pd.DataFrame(data) df['label'] = 0 dev_df_bert = pd.DataFrame({ 'id': range(len(df)), 'label': df['label'], 'alpha': ['a'] * df.shape[0], 'text': df['message'].replace(r'\n', ' ', regex=True) }) dev_df_bert.to_csv('./home/jay/airflow/dags/data/dev.tsv', sep='\t', index=False, header=False) device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") # This is where BERT will look for pre-trained models to load parameters from. CACHE_DIR = './home/jay/airflow/dags/cache/' # The maximum total input sequence length after WordPiece tokenization. # Sequences longer than this will be truncated, and sequences shorter than this will be padded. MAX_SEQ_LENGTH = 128 TRAIN_BATCH_SIZE = 24 EVAL_BATCH_SIZE = 8 LEARNING_RATE = 2e-5 RANDOM_SEED = 42 GRADIENT_ACCUMULATION_STEPS = 1 WARMUP_PROPORTION = 0.1 OUTPUT_MODE = 'classification' NUM_TRAIN_EPOCHS = 1 CONFIG_NAME = "config.json" WEIGHTS_NAME = "pytorch_model.bin" Data = 'FB20' DATA_DIR = "./home/jay/airflow/dags/data/" categories = ["Uncivil"] # categories = ["Attack", "Advocacy", "Ceremonial", "CTA", "CI", "Image", "Issue"] for Category in categories: print(Category) TASK_NAME = Data + Category BERT_MODEL = TASK_NAME + '.tar.gz' # The output directory where the fine-tuned model and checkpoints will be written. OUTPUT_DIR = './home/jay/airflow/dags/outputs/' + TASK_NAME + '/' tokenizer = BertTokenizer.from_pretrained(OUTPUT_DIR + 'vocab.txt', do_lower_case=False) processor = BinaryClassificationProcessor() eval_examples = processor.get_dev_examples(DATA_DIR) label_list = processor.get_labels( ) # [0, 1] for binary classification num_labels = len(label_list) eval_examples_len = len(eval_examples) label_map = {label: i for i, label in enumerate(label_list)} eval_examples_for_processing = [ (example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE) for example in eval_examples ] process_count = cpu_count() - 1 # if __name__ == '__main__': # print('Preparing to convert' {eval_examples_len} examples..') # print(f'Spawning {process_count} processes..') with Pool(process_count) as p: eval_features = list( p.imap(convert_example_to_feature, eval_examples_for_processing)) all_input_ids = torch.tensor( [f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor( [f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor( [f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=EVAL_BATCH_SIZE) # Load pre-trained model (weights) model = BertForSequenceClassification.from_pretrained( CACHE_DIR + BERT_MODEL, cache_dir=CACHE_DIR, num_labels=len(label_list)) print(label_list) model.to(device) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): logits = model(input_ids, 
segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task loss_fct = CrossEntropyLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps preds = preds[0] preds = np.argmax(preds, axis=1) df[Category] = preds del df['label'] dc = df.to_dict('records') for doc in dc: doc['civility_class'] = [] for c in categories: if doc[c] == 1: doc['civility_class'].append('uncivil') else: doc['civility_class'].append('civil') del doc[c] print(len(dc)) print(dc[0]) print("Pushing into DB") ct = 0 for doc in dc: for x in col.find({"message": doc['message'], 'marked': 0}): # x['marked']=1 x['civility_class'] = doc['civility_class'] col.update_one({'_id': x['_id']}, {"$set": x}, True) return "Done"
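# The prediction snippet above maps convert_example_to_feature over a multiprocessing Pool but
# does not show that helper. A minimal sketch of what such a feature-conversion function could
# look like (the InputFeatures container, field names, and truncation policy here are
# illustrative assumptions, not the original implementation):

from collections import namedtuple

InputFeatures = namedtuple('InputFeatures', ['input_ids', 'input_mask', 'segment_ids', 'label_id'])

def convert_example_to_feature_sketch(example_row):
    example, label_map, max_seq_length, tokenizer, output_mode = example_row
    # Truncate to leave room for the [CLS] and [SEP] special tokens.
    tokens = tokenizer.tokenize(example.text_a)[:max_seq_length - 2]
    tokens = ['[CLS]'] + tokens + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    # Zero-pad up to the maximum sequence length.
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids = [0] * max_seq_length
    label_id = label_map[example.label] if output_mode == 'classification' else float(example.label)
    return InputFeatures(input_ids, input_mask, segment_ids, label_id)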
def main(): train_df = pd.read_csv(TRAIN_PATH) train_df['male'] = np.load( "../input/identity-column-data/male_labeled.npy") train_df['female'] = np.load( "../input/identity-column-data/female_labeled.npy") train_df['homosexual_gay_or_lesbian'] = np.load( "../input/identity-column-data/homosexual_gay_or_lesbian_labeled.npy") train_df['christian'] = np.load( "../input/identity-column-data/christian_labeled.npy") train_df['jewish'] = np.load( "../input/identity-column-data/jewish_labeled.npy") train_df['muslim'] = np.load( "../input/identity-column-data/muslim_labeled.npy") train_df['black'] = np.load( "../input/identity-column-data/black_labeled.npy") train_df['white'] = np.load( "../input/identity-column-data/white_labeled.npy") train_df['psychiatric_or_mental_illness'] = np.load( "../input/identity-column-data/psychiatric_or_mental_illness_labeled.npy" ) fold_df = pd.read_csv(FOLD_PATH) # y = np.where(train_df['target'] >= 0.5, 1, 0) y = train_df['target'].values y_aux = train_df[AUX_COLUMNS].values identity_columns_new = [] for column in identity_columns + ['target']: train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True, False) if column != "target": identity_columns_new.append(column + "_bin") # Overall weights = np.ones((len(train_df), )) / 4 # Subgroup weights += (train_df[identity_columns].fillna(0).values >= 0.5).sum( axis=1).astype(bool).astype(np.int) / 4 # Background Positive, Subgroup Negative weights += ( ((train_df["target"].values >= 0.5).astype(bool).astype(np.int) + (1 - (train_df[identity_columns].fillna(0).values >= 0.5).sum( axis=1).astype(bool).astype(np.int))) > 1).astype(bool).astype( np.int) / 4 # Background Negative, Subgroup Positive weights += ( ((train_df["target"].values < 0.5).astype(bool).astype(np.int) + (train_df[identity_columns].fillna(0).values >= 0.5).sum( axis=1).astype(bool).astype(np.int)) > 1).astype(bool).astype( np.int) / 4 loss_weight = 0.5 with timer('preprocessing text'): # df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]] train_df['comment_text'] = train_df['comment_text'].astype(str) train_df = train_df.fillna(0) with timer('load embedding'): tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=False) X_text = convert_lines_head_tail( train_df["comment_text"].fillna("DUMMY_VALUE"), max_len, head_len, tokenizer) X_text = np.array(X_text).astype("int32") del tokenizer gc.collect() with timer('train'): train_index = fold_df.fold_id != fold_id valid_index = fold_df.fold_id == fold_id X_train, y_train, y_aux_train, w_train = X_text[train_index], y[ train_index].astype("float32"), y_aux[train_index].astype( "float32"), weights[train_index].astype("float32") X_val, y_val, y_aux_val, w_val = X_text[valid_index], y[valid_index].astype("float32"),\ y_aux[valid_index].astype("float32"), weights[valid_index].astype("float32") test_df = train_df[valid_index] train_size = len(X_train) del X_text, y, y_aux, weights, train_index, valid_index, train_df, fold_df gc.collect() model = BertForSequenceClassification.from_pretrained( WORK_DIR, cache_dir=None, num_labels=n_labels) model.zero_grad() model = model.to(device) y_train = np.concatenate( (y_train.reshape(-1, 1), w_train.reshape(-1, 1), y_aux_train), axis=1).astype("float32") y_val = np.concatenate( (y_val.reshape(-1, 1), w_val.reshape(-1, 1), y_aux_val), axis=1).astype("float32") del w_train, w_val, y_aux_train, y_aux_val gc.collect() train_dataset = torch.utils.data.TensorDataset( torch.tensor(X_train, dtype=torch.long), 
torch.tensor(y_train, dtype=torch.float32)) valid = torch.utils.data.TensorDataset( torch.tensor(X_val, dtype=torch.long), torch.tensor(y_val, dtype=torch.float32)) ran_sampler = torch.utils.data.RandomSampler(train_dataset) len_sampler = LenMatchBatchSampler(ran_sampler, batch_size=batch_size, drop_last=False) train_loader = torch.utils.data.DataLoader(train_dataset, batch_sampler=len_sampler) valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size * 2, shuffle=False) del X_train, y_train, X_val, y_val gc.collect() LOGGER.info(f"done data loader setup") param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = int(epochs * train_size / batch_size / accumulation_steps) total_step = int(epochs * train_size / batch_size) optimizer = BertAdam(optimizer_grouped_parameters, lr=base_lr, warmup=0.005, t_total=num_train_optimization_steps) LOGGER.info(f"done optimizer loader setup") model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) # criterion = torch.nn.BCEWithLogitsLoss().to(device) criterion = CustomLoss(loss_weight).to(device) LOGGER.info(f"done amp setup") for epoch in range(epochs): LOGGER.info(f"Starting {epoch} epoch...") LOGGER.info(f"length {train_size} train...") if epoch == 1: for param_group in optimizer.param_groups: param_group['lr'] = base_lr * gammas[1] tr_loss, train_losses = train_one_epoch(model, train_loader, criterion, optimizer, device, accumulation_steps, total_step, n_labels, base_lr, gamma=gammas[2 * epoch]) LOGGER.info(f'Mean train loss: {round(tr_loss,5)}') torch.save(model.state_dict(), '{}_epoch{}_fold{}.pth'.format(exp, epoch, fold_id)) valid_loss, oof_pred = validate(model, valid_loader, criterion, device, n_labels) LOGGER.info(f'Mean valid loss: {round(valid_loss,5)}') del model gc.collect() torch.cuda.empty_cache() test_df["pred"] = oof_pred[:, 0] test_df = convert_dataframe_to_bool(test_df) bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns) LOGGER.info(bias_metrics_df) score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df)) LOGGER.info(f'final score is {score}') test_df.to_csv("oof.csv", index=False) xs = list(range(1, len(train_losses) + 1)) plt.plot(xs, train_losses, label='Train loss') plt.legend() plt.xticks(xs) plt.xlabel('Iter') plt.savefig("loss.png")
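# LenMatchBatchSampler used in the training setup above is not defined in this snippet. A rough
# sketch of the idea, assuming it groups sequences of similar unpadded length into the same batch
# to reduce wasted padding; the class name, bucket granularity, and padding token (0) are
# illustrative assumptions only:

from torch.utils.data import Sampler

class LenMatchBatchSamplerSketch(Sampler):
    def __init__(self, sampler, batch_size, drop_last):
        self.sampler = sampler
        self.batch_size = batch_size
        self.drop_last = drop_last

    def __iter__(self):
        buckets = {}
        for idx in self.sampler:
            # Bucket by the number of non-padding tokens, rounded to a coarse bin.
            seq = self.sampler.data_source[idx][0]
            length = int((seq != 0).sum())
            bucket = length // 64
            buckets.setdefault(bucket, []).append(idx)
            if len(buckets[bucket]) == self.batch_size:
                yield buckets.pop(bucket)
        leftover = [idx for bucket in buckets.values() for idx in bucket]
        for i in range(0, len(leftover), self.batch_size):
            batch = leftover[i:i + self.batch_size]
            if not self.drop_last or len(batch) == self.batch_size:
                yield batch

    def __len__(self):
        if self.drop_last:
            return len(self.sampler) // self.batch_size
        return (len(self.sampler) + self.batch_size - 1) // self.batch_size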
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForSequenceClassification

# Load each model and tokenizer once, apparently to pre-populate the local cache directory;
# the returned objects are discarded.
BertForSequenceClassification.from_pretrained('bert-large-uncased', cache_dir='./', num_labels=5)
BertTokenizer.from_pretrained('bert-large-uncased', cache_dir='./')
BertForSequenceClassification.from_pretrained('bert-large-cased', cache_dir='./', num_labels=5)
BertTokenizer.from_pretrained('bert-large-cased', cache_dir='./')
BertModel.from_pretrained('bert-large-cased', cache_dir='./')
BertTokenizer.from_pretrained('bert-large-cased', cache_dir='./')
BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='./')
BertModel.from_pretrained('bert-base-uncased', cache_dir='./')
def load_pretrained_model(args, processor):
    label_list = processor.get_labels()
    model = BertForSequenceClassification.from_pretrained(args.bert_checkpoint_dir,
                                                          num_labels=len(label_list))
    # Freeze the pre-trained weights (requires_grad=False).
    module_utils.set_requires_grad(model, False)
    return model
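# module_utils.set_requires_grad is project-specific and not shown here; judging from its usage,
# a minimal equivalent (illustrative helper, not the original) simply toggles requires_grad on
# every parameter of the module:

def set_requires_grad_sketch(module, requires_grad):
    for param in module.parameters():
        param.requires_grad = requires_grad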
test_examples = [
    InputExample('test', row.tweet, label='UNT') for row in test.itertuples()
]
label_list = ['UNT', 'TIN']
if sys.argv[1] == 'C':
    test_examples = [
        InputExample('test', row.tweet, label='IND') for row in test.itertuples()
    ]
    label_list = ['IND', 'GRP', 'OTH']

tokenizer = BertTokenizer.from_pretrained(VOCAB)
if sys.argv[1] == 'C':
    model = BertForSequenceClassification.from_pretrained(MODEL, cache_dir=cache_dir, num_labels=3)
else:
    model = BertForSequenceClassification.from_pretrained(MODEL, cache_dir=cache_dir, num_labels=2)
model.load_state_dict(
    torch.load('./BERT/bert_task' + str(sys.argv[1]) + str(sys.argv[2]) + '.pkl'))
model.to(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)
# Prepare optimizer
def new_model(self):
    self.model = BertForSequenceClassification.from_pretrained(
        self.bert_model, num_labels=self.num_classes)
    self.__init_model()
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, choices=[ "bert-base-uncased", "bert-large-uncased", "bert-base-cased", "bert-large-cased", "bert-base-multilingual-uncased", "bert-base-multilingual-cased", "bert-base-chinese", ], help="Bert pre-trained model selected in the list") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") parser.add_argument("--labels", nargs='+', default=['0', '1'], help="labels") ## Other parameters parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_test", action='store_true', help="Whether to run eval on the test set.") parser.add_argument("--do_distill", action='store_true', help="Whether to run distillation.") parser.add_argument("--blendcnn_channels", nargs='+', default=(100,) * 8, help="BlendCNN channels.") parser.add_argument("--blendcnn_act", default='relu', choices=list(ACT2FN.keys()), help="BlendCNN activation function.") parser.add_argument('--blendcnn_dropout', action='store_true', help="Whether to use dropout in BlendCNN") parser.add_argument('--blendcnn_pair', action='store_true', help="Whether to use BlendCNNForSequencePairClassification") parser.add_argument("--export_onnx", action='store_true', help="Whether to export model to onnx format.") parser.add_argument("--onnx_framework", choices=[ "caffe2", ], help="Select the ONNX framework to run eval") parser.add_argument("--eval_interval", default=1000, type=int, help="Specify eval interval during training.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. 
" "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") distiller.knowledge_distillation.add_distillation_args(parser) args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "custom": lambda: CustomProcessor(args.labels), } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not any((args.do_train, args.do_eval, args.do_test, args.do_distill, args.export_onnx)): raise ValueError("At least one of `do_train`, `do_eval`, `do_test`, `do_distill`, `export_onnx` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) global_step = 0 loss = 0 output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) onnx_model_file = os.path.join(args.output_dir, "model.onnx") eval_data = None if args.do_train: model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format( args.local_rank), num_labels=num_labels) model = convert_model(args, model, device, n_gpu) tensorboard_log_dir = os.path.join(args.output_dir, './log') os.makedirs(tensorboard_log_dir, exist_ok=True) tensorboard_logger = SummaryWriter(tensorboard_log_dir) if args.do_eval and do_eval_or_test(args) and eval_data is None: eval_data = prepare(args, processor, label_list, tokenizer, 'dev') global_step, loss = train(args, model, output_model_file, processor, 
label_list, tokenizer, device, n_gpu, tensorboard_logger, eval_data) model_config = None model_embeddings = None if args.onnx_framework is None: # Load a trained model that you have fine-tuned if os.path.exists(output_model_file): model_state_dict = torch.load(output_model_file, map_location=lambda storage, loc: storage) else: model_state_dict = None model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels) model_config = copy.deepcopy(model.config) model_embeddings = model.bert.embeddings model = convert_model(args, model, device, n_gpu) else: import onnx model = onnx.load(onnx_model_file) onnx.checker.check_model(model) if args.do_distill: assert model_config is not None assert model_embeddings is not None output_distilled_model_file = os.path.join(args.output_dir, DISTILLER_WEIGHTS_NAME) teacher = model model_config.hidden_act = args.blendcnn_act if args.blendcnn_pair: student = BlendCNNForSequencePairClassification(model_config, num_labels=num_labels, channels=(model_config.hidden_size,) + args.blendcnn_channels, n_hidden_dense=(model_config.hidden_size,) * 2, use_dropout=args.blendcnn_dropout) else: student = BlendCNN(model_config, num_labels=num_labels, channels=(model_config.hidden_size,) + args.blendcnn_channels, n_hidden_dense=(model_config.hidden_size,) * 2, use_dropout=args.blendcnn_dropout) student.embeddings.load_state_dict(model_embeddings.state_dict()) student = convert_model(args, student, device, 1) if os.path.exists(output_distilled_model_file): logger.info( 'Loading existing distilled model {}, skipping distillation'.format(output_distilled_model_file)) student.load_state_dict(torch.load(output_distilled_model_file)) else: dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt) args.kd_policy = distiller.KnowledgeDistillationPolicy(student, teacher, args.kd_temp, dlw) tensorboard_log_dir = os.path.join(args.output_dir, './log') os.makedirs(tensorboard_log_dir, exist_ok=True) tensorboard_logger = SummaryWriter(tensorboard_log_dir) if args.do_eval and do_eval_or_test(args) and eval_data is None: eval_data = prepare(args, processor, label_list, tokenizer, 'dev') global_step, loss = distill(args, output_distilled_model_file, processor, label_list, tokenizer, device, n_gpu, tensorboard_logger, eval_data) model = student if do_eval_or_test(args): result = { 'global_step': global_step, 'loss': loss } model.float() name = '_distiller' if args.do_distill else '' if args.do_eval: if eval_data is None: eval_data = prepare(args, processor, label_list, tokenizer, 'dev') eval_loss, eval_accuracy, eval_probs = eval(args, model, eval_data, device, verbose=True) np.savetxt(os.path.join(args.output_dir, 'dev{}_probs.npy'.format(name)), eval_probs) result.update({ 'dev{}_loss'.format(name): eval_loss, 'dev{}_accuracy'.format(name): eval_accuracy, }) if args.do_test: eval_data = prepare(args, processor, label_list, tokenizer, 'test') eval_loss, eval_accuracy, eval_probs = eval(args, model, eval_data, device, verbose=True) np.savetxt(os.path.join(args.output_dir, 'test{}_probs.npy'.format(name)), eval_probs) result.update({ 'test{}_loss'.format(name): eval_loss, 'test{}_accuracy'.format(name): eval_accuracy, }) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, 
str(result[key]))) if args.export_onnx: if not env_enabled(ENV_OPENAIGPT_GELU) or not env_enabled(ENV_DISABLE_APEX): raise ValueError('Both {} and {} must be 1 to properly export ONNX.'.format(ENV_OPENAIGPT_GELU, ENV_DISABLE_APEX)) if not isinstance(model, torch.nn.Module): raise ValueError('model is not an instance of torch.nn.Module.') import onnx import onnx.utils import onnx.optimizer dummy_input = get_dummy_input(args, processor, label_list, tokenizer, device) torch.onnx.export(model, dummy_input, onnx_model_file, input_names=['input_ids', 'input_mask', 'segment_ids'], output_names=['output_logit'], verbose=True) optimized_model = onnx.optimizer.optimize(onnx.load(onnx_model_file), [pass_ for pass_ in onnx.optimizer.get_available_passes() if 'split' not in pass_]) optimized_model = onnx.utils.polish_model(optimized_model) onnx.save(optimized_model, os.path.join(args.output_dir, 'optimized_model.onnx'))
if OUTPUT_MODE == "classification":
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=EVAL_BATCH_SIZE)

model = BertForSequenceClassification.from_pretrained(
    OUTPUT_DIR, cache_dir=CACHE_DIR, num_labels=len(label_list))
model.to(device)
model.eval()

eval_loss = 0
nb_eval_steps = 0
preds = []

for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)
def main(): train_df = pd.read_csv(TRAIN_PATH).sample(train_size + valid_size, random_state=seed) y = np.where(train_df['target'] >= 0.5, 1, 0) y_aux = train_df[AUX_COLUMNS].values identity_columns_new = [] for column in identity_columns + ['target']: train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True, False) if column != "target": identity_columns_new.append(column + "_bin") weights = np.ones((len(train_df), )) / 4 weights += (train_df[identity_columns].fillna(0).values >= 0.5).sum( axis=1).astype(bool).astype(np.int) / 4 weights += ( ((train_df["target"].values >= 0.5).astype(bool).astype(np.int) + (train_df[identity_columns].fillna(0).values < 0.5).sum( axis=1).astype(bool).astype(np.int)) > 1).astype(bool).astype( np.int) / 4 weights += ( ((train_df["target"].values < 0.5).astype(bool).astype(np.int) + (train_df[identity_columns].fillna(0).values >= 0.5).sum( axis=1).astype(bool).astype(np.int)) > 1).astype(bool).astype( np.int) / 4 loss_weight = 1.0 / weights.mean() with timer('preprocessing text'): #df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]] train_df['comment_text'] = train_df['comment_text'].astype(str) train_df = train_df.fillna(0) with timer('load embedding'): tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True) train_lines = zip( train_df['comment_text'].fillna("DUMMY_VALUE").values.tolist()) result = Parallel(n_jobs=4, backend='multiprocessing')( delayed(convert_line_fast)(i, max_len, tokenizer) for i in train_lines) X_text = [r[0] for r in result] train_lengths = [r[1] for r in result] #X_text, train_lengths = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"), max_len, tokenizer) test_df = train_df[train_size:] with timer('train'): X_train, y_train, y_aux_train, w_train = X_text[: train_size], y[: train_size], y_aux[: train_size], weights[: train_size] X_val, y_val, y_aux_val, w_val = X_text[train_size:], y[ train_size:], y_aux[train_size:], weights[train_size:] model = BertForSequenceClassification.from_pretrained( WORK_DIR, cache_dir=None, num_labels=n_labels) model.zero_grad() model = model.to(device) y_train = np.concatenate( (y_train.reshape(-1, 1), w_train.reshape(-1, 1), y_aux_train), axis=1) y_val = np.concatenate( (y_val.reshape(-1, 1), w_val.reshape(-1, 1), y_aux_val), axis=1) train_dataset = torch.utils.data.TensorDataset( torch.tensor(X_train, dtype=torch.long), torch.tensor(y_train, dtype=torch.float)) valid = torch.utils.data.TensorDataset( torch.tensor(X_val, dtype=torch.long), torch.tensor(y_val, dtype=torch.float)) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size * 2, shuffle=False) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = int(epochs * train_size / batch_size / accumulation_steps) total_step = int(epochs * train_size / batch_size) optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=0.05, t_total=num_train_optimization_steps) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) #criterion = torch.nn.BCEWithLogitsLoss().to(device) criterion = 
CustomLoss(loss_weight).to(device) LOGGER.info(f"Starting 1 epoch...") tr_loss, train_losses = train_one_epoch(model, train_loader, criterion, optimizer, device, accumulation_steps, total_step, n_labels) LOGGER.info(f'Mean train loss: {round(tr_loss,5)}') torch.save(model.state_dict(), '{}_dic'.format(exp)) valid_loss, oof_pred = validate(model, valid_loader, criterion, device, n_labels) del model gc.collect() torch.cuda.empty_cache() test_df["pred"] = oof_pred[:, 0] test_df = convert_dataframe_to_bool(test_df) bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns) LOGGER.info(bias_metrics_df) score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df)) LOGGER.info(f'final score is {score}') test_df.to_csv("oof.csv", index=False) xs = list(range(1, len(train_losses) + 1)) plt.plot(xs, train_losses, label='Train loss') plt.legend() plt.xticks(xs) plt.xlabel('Iter') plt.savefig("loss.png")
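# CustomLoss is defined elsewhere in this project and is not shown in the snippet above. A hedged
# sketch of one common formulation for this target layout (column 0 = main target, column 1 =
# per-sample weight, remaining columns = auxiliary targets); the exact weighting the original
# code uses may differ:

import torch
from torch import nn

class CustomLossSketch(nn.Module):
    def __init__(self, loss_weight):
        super().__init__()
        self.loss_weight = loss_weight

    def forward(self, logits, targets):
        # Weighted BCE on the main target plus plain BCE on the auxiliary targets.
        bce = nn.functional.binary_cross_entropy_with_logits
        main_loss = bce(logits[:, 0], targets[:, 0], weight=targets[:, 1])
        aux_loss = bce(logits[:, 1:], targets[:, 2:])
        return main_loss * self.loss_weight + aux_loss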
device = torch.device("cuda", LOCAL_RANK) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(LOCAL_RANK != -1), FP16)) # Load a trained model that you have fine-tuned output_model_file = os.path.join(OUTPUT_DIR, "pytorch_model.bin") model_state_dict = torch.load(output_model_file, map_location='cpu') # Modify if running on GPU model = BertForSequenceClassification.from_pretrained( BERT_MODEL, state_dict=model_state_dict, num_labels=len(label_list), multi_label=True) model.to(device) def eval_and_predict(examples: List[InputExample], multi_label, batch_size=8, eval=True): features = convert_examples_to_features(examples, label_list, MAX_SEQ_LENGTH, tokenizer, multi_label=multi_label) logger.info("***** Running evaluation *****")
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--preprocess_data", action='store_true', help="to activate the preprocessing of the data if not done yet") parser.add_argument("--sup_input", default='data', type=str, required=False, help="The input labelled pickle file") parser.add_argument( "--pickle_input_sup", default="supervised.p", required=False, help="The preprocessed supervised data to unpickle from") parser.add_argument("--sequence_length", default=256, type=int, help="Length of the sequence used in the model") parser.add_argument( "--load_model", default=None, required=False, type=str, help="Name of a save model file to load and start from") parser.add_argument( "--unsup_input", default='data', type=str, required=False, help= "The input unlabelled pickle file. If preprocess_data is activate please enter the prefix of the files." ) parser.add_argument("--uda", default=True, type=bool, help="Whether or not to use uda.") parser.add_argument("--multi_gpu", action='store_true', help='to activate multi gpus') parser.add_argument("--batch_size", default=4, type=int, help='Batch size of the labelled data') parser.add_argument( '--unsup_ratio', default=3, type=int, help= 'To define the batch_size of unlabelled data, unsup_ratio * batch_size.' ) parser.add_argument( "--gradient_accumulation", default=3, type=int, help="how many gradients to accumulate before stepping down.") parser.add_argument( "--lr_classifier", default=10e-4, type=float, help=" Learning rate applied to the last layer - classifier layer - .") parser.add_argument( "--lr_model", default=10e-6, type=float, help= "Learning rate applied to the whole model bar the classifier layer.") parser.add_argument('--verbose', action='store_true', help="to activate the printing of intermediate values") parser.add_argument('--tensorboard', action='store_true', help="to activate tensorboard on port") parser.add_argument("--epoch", default=3, type=int, help="how many epochs to perform") parser.add_argument("--labelled_examples", default=20000, type=int, help="how many labelled examples to learn from") parser.add_argument( "--temperature", default=0.85, type=float, help= "Set the temperature on the pre_softmax layer for unsupervisded entropy" ) parser.add_argument( "--uda_threshold", default=-1, type=float, help="Set the minimal acceptable max probability for unsupervised data" ) parser.add_argument( "--sup_threshold", default=0.5, type=float, help= "Unused ... 
Set the maximal acceptable correct probability for supervised data" ) parser.add_argument( "--tsa", default='linear', type=str, help="Set the method to perform threshold annealing on supervised data" ) parser.add_argument( "--test_frequency", default=20, type=int, help="Perform test scoring every -test_frequency- gradient steps") parser.add_argument("--regularisation", action='store_true', help="Regularize the last layer.") args = parser.parse_args() if True: #args.tensorboard : train_log_dir = 'logs/' train_summary_writer = summary.create_file_writer(train_log_dir) if args.preprocess_data: with open(args.unsup_input + '/original.txt') as original: src = original.readlines() with open(args.unsup_input + '/paraphrase.txt') as paraphrase: tgt = paraphrase.readlines() unsupervised_data = prepare_unsupervised_data( src, tgt, max_seq_length=args.sequence_length) df_train = p.load(open(args.sup_input + '/train_label.p', 'rb')) df_test = p.load(open(args.sup_input + '/test_label.p', 'rb')) supervised_data = prepare_supervised_data( df_train, max_seq_length=args.sequence_length) test_data = prepare_supervised_data( df_test, max_seq_length=args.sequence_length) p.dump(unsupervised_data, open('unsupervised.p', 'wb')) p.dump(supervised_data, open(args.pickle_input_sup, 'wb')) p.dump(test_data, open('test.p', 'wb')) unsupervised_data = p.load(open('unsupervised.p', 'rb')) unsupervised_data = list(np.array(unsupervised_data).reshape(-1)) supervised_data = p.load(open(args.pickle_input_sup, 'rb')) test_data = p.load(open('test.p', 'rb')) ### Recuperation sous tensors des données non supervisées original_input_ids = torch.tensor( [f.input_ids[0] for f in unsupervised_data], dtype=torch.long) original_input_mask = torch.tensor( [f.input_mask[0] for f in unsupervised_data], dtype=torch.long) original_segment_ids = torch.tensor( [f.segment_ids[0] for f in unsupervised_data], dtype=torch.long) augmented_input_ids = torch.tensor( [f.input_ids[1] for f in unsupervised_data], dtype=torch.long) augmented_input_mask = torch.tensor( [f.input_mask[1] for f in unsupervised_data], dtype=torch.long) augmented_segment_ids = torch.tensor( [f.segment_ids[1] for f in unsupervised_data], dtype=torch.long) ### Recuperation sous tensors des données supervisées supervised_input_ids = torch.tensor([f.input_ids for f in supervised_data], dtype=torch.long) supervised_input_mask = torch.tensor( [f.input_mask for f in supervised_data], dtype=torch.long) supervised_segment_ids = torch.tensor( [f.segment_ids for f in supervised_data], dtype=torch.long) supervised_label_ids = torch.tensor([f.label_id for f in supervised_data], dtype=torch.long) test_input_ids = torch.tensor([f.input_ids for f in test_data], dtype=torch.long) test_input_mask = torch.tensor([f.input_mask for f in test_data], dtype=torch.long) test_segment_ids = torch.tensor([f.segment_ids for f in test_data], dtype=torch.long) test_label_ids = torch.tensor([f.label_id for f in test_data], dtype=torch.long) ### Creation des datasets unsupervised_dataset = TensorDataset(original_input_ids, original_input_mask, original_segment_ids,\ augmented_input_ids,augmented_input_mask,augmented_segment_ids) supervised_dataset = TensorDataset(supervised_input_ids,\ supervised_input_mask,supervised_segment_ids,\ supervised_label_ids) test_dataset = TensorDataset(test_input_ids,\ test_input_mask,test_segment_ids,\ test_label_ids) ### Training ### Variables unsup_train_batch_size = args.batch_size * args.unsup_ratio sup_train_batch_size = args.batch_size labelled_examples = 
args.labelled_examples unsup_train_sampler = RandomSampler(unsupervised_dataset) unsup_train_dataloader = DataLoader(unsupervised_dataset, sampler=unsup_train_sampler, batch_size=unsup_train_batch_size) # sup_train_sampler = RandomSampler(supervised_dataset) sup_subset_sampler = torch.utils.data.SubsetRandomSampler(\ np.random.randint(supervised_input_ids.size(0), size=labelled_examples)) sup_train_dataloader = DataLoader(supervised_dataset, sampler=sup_subset_sampler, batch_size=sup_train_batch_size) test_sampler = torch.utils.data.SubsetRandomSampler(\ np.random.randint(test_input_ids.size(0), size=10000)) test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=16) num_labels = 2 if args.load_model is not None: model = torch.load(args.load_model) else: model = BertForSequenceClassification.from_pretrained( 'bert-large-uncased', num_labels=num_labels).to(device) if args.multi_gpu: model = nn.DataParallel(model) ### Parameters param_optimizer = list(model.module.classifier.named_parameters()) lr = args.lr_classifier lr_bert = args.lr_model epochs = args.epoch accumulation_steps = args.gradient_accumulation uda_threshold = args.uda_threshold temperature = args.temperature tsa = True verbose = False no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0., 'lr': lr, 'max_grad_norm': -1 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.01, 'lr': lr }, { 'params': model.module.bert.parameters(), 'weight_decay': 0.01, 'lr': lr_bert, 'max_grad_norm': -1 }] #optimizer = BertAdam(optimizer_grouped_parameters) optimizer = torch.optim.Adam(optimizer_grouped_parameters) # Locally used variables global_step = 0 accuracy = 0 counter = 1 test_counter = 0 loss_function = CrossEntropyLoss(reduction='none') optimizer.zero_grad() best = 0 ### TRAINING for epoch in range(epochs): for step, batch in tqdm(enumerate(unsup_train_dataloader)): model.train() ### Unsupervised Loss batch = tuple(t.to(device) for t in batch) original_input, _, _, augmented_input, _, _ = batch if args.regularisation: with torch.no_grad(): originals = model(original_input) / temperature logits_original = F.log_softmax( model.bert(original_input)[1], dim=-1) entropy = -torch.exp(logits_original) * logits_original with train_summary_writer.as_default(): tf.summary.scalar('entropy', entropy.sum(-1).mean(0).item(), step=global_step) max_logits = torch.max(logits_original, dim=-1)[0] if uda_threshold > 0: loss_unsup_mask = torch.where( max_logits.cpu() < np.log(uda_threshold), torch.tensor([1], dtype=torch.uint8), torch.tensor([0], dtype=torch.uint8)) loss_unsup_mask.to(device) loss_unsup_mask = loss_unsup_mask.view(-1) logits_augmented = F.log_softmax( model.bert(augmented_input)[1], dim=-1) loss_unsup = kl_for_log_probs(logits_augmented, logits_original) if uda_threshold > 0: loss_unsup[loss_unsup_mask] = 0 loss_unsup = loss_unsup[loss_unsup > 0.] 
if loss_unsup.size(0) > 0: loss_unsup_mean = loss_unsup.mean(-1) with train_summary_writer.as_default(): tf.summary.scalar('Number of elements unsup', loss_unsup.size(0), global_step) tf.summary.scalar('Loss_Unsup', loss_unsup_mean.item(), step=global_step) loss_unsup_mean.backward() else: with torch.no_grad(): originals = model(original_input) / temperature logits_original = F.log_softmax(model(original_input), dim=-1) entropy = -torch.exp(logits_original) * logits_original with train_summary_writer.as_default(): tf.summary.scalar('entropy', entropy.sum(-1).mean(0).item(), step=global_step) max_logits = torch.max(logits_original, dim=-1)[0] if uda_threshold > 0: loss_unsup_mask = torch.where( max_logits.cpu() < np.log(uda_threshold), torch.tensor([1], dtype=torch.uint8), torch.tensor([0], dtype=torch.uint8)) loss_unsup_mask.to(device) loss_unsup_mask = loss_unsup_mask.view(-1) logits_augmented = F.log_softmax(model(augmented_input), dim=-1) loss_unsup = kl_for_log_probs(logits_augmented, logits_original) if uda_threshold > 0: loss_unsup[loss_unsup_mask] = 0 loss_unsup = loss_unsup[loss_unsup > 0.] if loss_unsup.size(0) > 0: loss_unsup_mean = loss_unsup.mean(-1) with train_summary_writer.as_default(): tf.summary.scalar('Number of elements unsup', loss_unsup.size(0), global_step) tf.summary.scalar('Loss_Unsup', loss_unsup_mean.item(), step=global_step) loss_unsup_mean.backward() ### Cleaning del loss_unsup del loss_unsup_mean del logits_original del logits_augmented gc.collect() torch.cuda.empty_cache() torch.cuda.ipc_collect() ### Supervised Loss for i, batch_sup in enumerate(sup_train_dataloader): if counter % (i + 1) == 0: batch_sup = tuple(t.to(device) for t in batch_sup) input_ids, input_mask, segment_ids, label_ids = batch_sup # tf.summary.scalar('learning rate', np.max(optimizer.get_lr(), step=global_step) logits = model(input_ids) loss_sup = loss_function(logits.view(-1, 2), label_ids.view(-1)) with torch.no_grad(): outputs = F.softmax(logits, dim=-1) sentiment_corrects = torch.sum( torch.max(outputs, -1)[1] == label_ids) sentiment_acc = sentiment_corrects.double( ) / sup_train_batch_size accuracy += sentiment_acc #accuracy_temp = accuracy/step with train_summary_writer.as_default(): tf.summary.scalar('Batch_score', sentiment_acc.item(), step=global_step) #tf.summary.scalar('Global_score', accuracy_temp.item(), step=global_step) number_of_elements = outputs.size(0) ### Threshold Annealing if tsa: tsa_start = 1. / num_labels tsa_threshold = get_tsa_threshold(global_step = global_step,\ num_train_step = 3000, start = tsa_start,\ end=1.,schedule = 'linear', scale = 5) loss_mask = torch.ones(loss_sup.size()).long() probas = torch.gather( outputs, dim=-1, index=label_ids.unsqueeze(1)).cpu() loss_mask = torch.where( probas > tsa_threshold, torch.tensor([1], dtype=torch.uint8), torch.tensor([0], dtype=torch.uint8)) loss_mask.to(device) loss_mask = loss_mask.view(-1) with train_summary_writer.as_default(): tf.summary.scalar('tsa_threshold', tsa_threshold, global_step) tf.summary.scalar('loss_sup', loss_sup.mean(-1).item(), step=global_step) loss_sup[loss_mask] = 0. number_of_elements = loss_mask.size(0) - loss_mask.sum( 0) if verbose: print('outputs', outputs) print('tsa_threshold', tsa_threshold) print('label_ids', label_ids) print('probas', probas) print('mask', loss_mask) print('post_loss', loss_sup) print('number_of_elements : ', loss_mask.size(0) - loss_mask.sum(0)) if number_of_elements > 0: loss_sup = loss_sup[loss_sup > 0.] 
nb_elements = loss_sup.size(0) loss_sup = loss_sup.mean(-1) loss_sup.backward() else: nb_elements = 0 loss_sup = torch.tensor([0.]) with train_summary_writer.as_default(): tf.summary.scalar('nb_elements_sup', nb_elements, global_step) tf.summary.scalar('Post_loss', loss_sup.item(), step=global_step) #tf.summary.scalar('Learning Rate',optimizer.get_lr()[0], step=global_step) # loss_sup.backward() #else: # loss_sup = torch.tensor([0.]) ### Cleaning del loss_sup del logits gc.collect() torch.cuda.empty_cache() torch.cuda.ipc_collect() counter += 1 if counter > labelled_examples + 1: counter = 1 break else: gc.collect() torch.cuda.empty_cache() torch.cuda.ipc_collect() continue ### Accumulation Steps and Gradient steps if (step + 1) % accumulation_steps == 0: torch.nn.utils.clip_grad_value_(model.parameters(), 1) optimizer.step() optimizer.zero_grad() ### Test set and Evaluation every x gradient steps if (step + 1) % 100 == 0: loss = [] sentiment_test_acc = 0 for test_step, test_batch in enumerate(test_dataloader): test_batch = tuple(t.to(device) for t in test_batch) input_ids, input_mask, segment_ids, label_ids = test_batch with torch.no_grad(): logits = model(input_ids) loss_test = loss_function(logits.view(-1, 2), label_ids.view(-1)).mean(-1) with train_summary_writer.as_default(): tf.summary.scalar( 'Test_loss_continuous', loss_test.item(), step=test_step + test_counter * len(test_dataloader)) loss.append(loss_test.item()) outputs = F.softmax(logits, dim=-1) sentiment_corrects = torch.sum( torch.max(outputs, -1)[1] == label_ids) sentiment_test_acc += sentiment_corrects.double() accuracy += sentiment_acc / input_ids.size(0) sentiment_test_acc = sentiment_test_acc / len(test_dataloader) with train_summary_writer.as_default(): tf.summary.scalar('Test_score', sentiment_test_acc.item() / 16, step=global_step) tf.summary.scalar('test_loss', np.array(loss).mean(), step=global_step) tf.summary.scalar('test_loss_std', np.array(loss).std(), step=global_step) test_counter += 1 print('best_score', best) if sentiment_test_acc.item() / 16 > best: model_to_save = model.module if hasattr( model, 'module') else model torch.save(model_to_save, "best_model_score.pt") best = sentiment_test_acc.item() / 16 ### Increase the global step tracker global_step += 1
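# The UDA training loop above relies on kl_for_log_probs and get_tsa_threshold, which are defined
# elsewhere. Minimal sketches under the usual UDA definitions; the bodies below are assumptions,
# only the call signatures are taken from the snippet:

import numpy as np
import torch

def kl_for_log_probs_sketch(log_p, log_q):
    # Per-example KL(p || q) computed from log-probabilities.
    p = torch.exp(log_p)
    return torch.sum(p * (log_p - log_q), dim=-1)

def get_tsa_threshold_sketch(global_step, num_train_step, start, end, schedule='linear', scale=5):
    # Training signal annealing: the acceptance threshold grows from `start` toward `end`.
    progress = min(1.0, global_step / float(num_train_step))
    if schedule == 'linear':
        coeff = progress
    elif schedule == 'exp':
        coeff = np.exp((progress - 1) * scale)
    else:  # 'log'
        coeff = 1 - np.exp(-progress * scale)
    return start + (end - start) * coeff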
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--data_dir", default=None, type=str, required=True, help= "The input data dir. Should contain the .tsv files (or other data files) for the task." ) parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run prediction on a given dataset.") parser.add_argument("--input_file_for_pred", default=None, type=str, help="File to run prediction on.") parser.add_argument("--output_file_for_pred", default=None, type=str, help="File to output predictions into.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' 
) args = parser.parse_args() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "anli": AnliProcessor, "anli3": AnliProcessor3Option, 'anli_csk': AnliWithCSKProcessor, 'bin_anli': BinaryAnli, 'wsc': WSCProcessor } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model) train_examples = None num_train_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model if task_name == 'bin_anli': model = BertForSequenceClassification.from_pretrained( args.bert_model, len(label_list)) else: model = BertForMultipleChoice.from_pretrained(args.bert_model, len(label_list), len(label_list)) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 model_save_path = os.path.join(args.output_dir, "bert-finetuned.model") tr_loss = None if args.do_train: if task_name.lower().startswith( "anli") or task_name.lower().startswith("wsc"): 
train_features = convert_examples_to_features_mc( train_examples, label_list, args.max_seq_length, tokenizer) else: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 status_tqdm = tqdm(train_dataloader, desc="Iteration") for step, batch in enumerate(status_tqdm): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss, _ = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 status_tqdm.set_description_str( "Iteration / Training Loss: {}".format( (tr_loss / nb_tr_examples))) torch.save(model, model_save_path) if args.do_eval: if args.do_predict and args.input_file_for_pred is not None: eval_examples = processor.get_examples_from_file( args.input_file_for_pred) else: eval_examples = processor.get_dev_examples(args.data_dir) if task_name.lower().startswith( "anli") or task_name.lower().startswith("wsc"): eval_features = convert_examples_to_features_mc( eval_examples, label_list, args.max_seq_length, tokenizer) else: eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = 
torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) logger.info( "***** Loading model from: {} *****".format(model_save_path)) model = torch.load(model_save_path) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 eval_predictions = [] eval_pred_probs = [] logger.info("***** Predicting ... *****".format(model_save_path)) for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_predictions.extend(np.argmax(logits, axis=1).tolist()) eval_pred_probs.extend([_compute_softmax(list(l)) for l in logits]) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps if tr_loss is not None else 0.0 } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) if task_name == "wsc": pred_examples = list(TsvIO.read(args.input_file_for_pred)) else: pred_examples = read_jsonl_lines(args.input_file_for_pred) logger.info("***** Eval predictions *****") for record, pred, probs in zip(pred_examples, eval_predictions, eval_pred_probs): record['bert_prediction'] = pred record['bert_correct'] = pred == ( int(record[processor.label_field()]) - 1) record['bert_pred_probs'] = probs write_items([json.dumps(r) for r in pred_examples], args.output_file_for_pred)
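# _compute_softmax above turns a list of logits into probabilities; a standard numerically stable
# version is sketched here (assumed, not copied from the original source):

import math

def _compute_softmax_sketch(scores):
    if not scores:
        return []
    max_score = max(scores)
    exp_scores = [math.exp(s - max_score) for s in scores]
    total = sum(exp_scores)
    return [e / total for e in exp_scores]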
label_list = processor.get_labels()
num_labels = len(label_list)
num_train_optimization_steps = int(
    train_examples_len / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
label_map = {label: i for i, label in enumerate(label_list)}
train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE)
                                 for example in train_examples]

process_count = cpu_count() - 1
with Pool(process_count) as p:
    train_features = list(tqdm_notebook(
        p.imap(convert_examples_to_features.convert_example_to_feature,
               train_examples_for_processing),
        total=train_examples_len))

model = BertForSequenceClassification.from_pretrained(BERT_MODEL,
                                                      cache_dir=CACHE_DIR,
                                                      num_labels=num_labels)
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=WARMUP_PROPORTION,
                     t_total=num_train_optimization_steps)
test_inputs = torch.tensor(input_ids)
test_masks = torch.tensor(attention_masks)
fake_ids = torch.Tensor(fake_ids)

print("set batch size")
batch_size = 24
test_data = TensorDataset(test_inputs, test_masks, fake_ids)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
print("finished setting batch size")

model = BertForSequenceClassification.from_pretrained(output_dir, num_labels=2)
model.cuda()
# Put model in evaluation mode to evaluate loss on the validation set
model.eval()

# Predict data by minibatch
output_predictions = []
batch_cnt = 0
for batch in test_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_idstrs = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
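    # Hypothetical continuation (not in the original snippet, which is cut off here):
    # a minimal sketch of the forward pass the loop would typically perform, collecting
    # the predicted class per row id. Variable handling below is an assumption.
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    preds = logits.detach().cpu().numpy().argmax(axis=1)
    output_predictions.extend(zip(b_idstrs.cpu().numpy().tolist(), preds.tolist()))
    batch_cnt += 1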
Y_Val = train_df[target_column].values[num_to_load:]
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X, dtype=torch.long),
    torch.tensor(Y, dtype=torch.float))

output_model_file = 'bert_pytorch.bin'
lr = 2e-5
batch_size = 32
accumulation_step = 2

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model = BertForSequenceClassification.from_pretrained(
    "./", cache_dir=None, num_labels=len(target_column))
model.zero_grad()
model = model.to(device)

params = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01
}, {
    'params': [p for n, p in params if any(nd in n for nd in no_decay)],
    'weight_decay': 0.0
}]

train = train_dataset
train_batch_size = 32
eval_batch_size = 128
train_batch_size = train_batch_size // gradient_accumulation_steps
output_dir = OutputDir
num_train_epochs = NUMofEPOCH
num_train_optimization_steps = int(
    len(TrainExamples) / train_batch_size / gradient_accumulation_steps) * num_train_epochs
cache_dir = CacheDir
learning_rate = LearningRate
warmup_proportion = 0.1
max_seq_length = MAXSEQLEN

# Load model
tokenizer = BertTokenizer.from_pretrained(BERTModel)
Model = BertForSequenceClassification.from_pretrained(
    BERTModel, cache_dir=cache_dir, num_labels=len(LabelList))
Model.to(device)
if n_gpu > 1:
    Model = torch.nn.DataParallel(Model)

# Load a trained model and config that you have fine-tuned
# tokenizer = BertTokenizer.from_pretrained(BERTModel)
# config = BertConfig(load_config_file)
# Model = BertForSequenceClassification(config, num_labels=len(LabelList))
# Model.load_state_dict(torch.load(load_model_file))
# Model.to(device)  # important to specify the device
# if n_gpu > 1:
#     Model = torch.nn.DataParallel(Model)

# Prepare optimizer
param_optimizer = list(Model.named_parameters())
train_examples_len = len(train_features)
print("Train features count: ", train_examples_len)
num_train_optimization_steps = ceil(
    train_examples_len / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print('n_gpu:', n_gpu)
torch.cuda.manual_seed_all(RANDOM_SEED)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      cache_dir='cache',
                                                      num_labels=num_labels)
model.to(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay': 0.01
}, {
    'params':
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=64, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, "arg": ArgProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, # "arg": 2, "arg": 3, } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() model_state_dict = torch.load('models/pytorch_model.bin', map_location=torch.device('cpu')) cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank)) model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels) # model.load_state_dict(torch.load('./models/pytorch_model1.bin', map_location=torch.device('cpu'))) # cache_dir=cache_dir, if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex 
from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 if args.do_train: # Save a trained model and the associated configuration model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self Path(str(Path.cwd() / "data" / "output")).mkdir(parents=True, exist_ok=True) output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) torch.save(model_to_save.state_dict(), output_model_file) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file, map_location=torch.device('cpu'))) else: # model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels) # WHY DO THEY DO THIS # MY TRAINED ONE model_state_dict = torch.load('./models/pytorch_model.bin', map_location=torch.device('cpu')) model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, num_labels=num_labels) # INTERMDEDIATE IMHO TRAINED ONE - oh this one doesn't work.... why?! 
# model_state_dict = torch.load('./models/pytorch_model.bin', map_location=torch.device('cpu')) # model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict, # num_labels=num_labels) model.to(device) pred, prob = [], [] gold = [] if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) for a, b in zip(logits, label_ids): pred.append(np.argmax(a)) gold.append(b) # prob.append(a) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss} print(classification_report(gold, pred)) output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: f = open('predictions.txt', 'w') for line1 in pred: f.write(str(line1) + '\n')
api = KhaiiiApi()

# Declare the tokenizer
tokenizer = BertTokenizer.from_pretrained("./vocab.korean_morp.list", do_lower_case=False)

# Set the maximum number of tokens
MAX_LEN = 256

# Select the device
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Create the BERT model for classification
model = BertForSequenceClassification.from_pretrained(
    "/home/jupyter/pytorch-korbert", num_labels=2)

# Load the model checkpoint
checkpoint = torch.load('./kor_bert_senti1', map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model_state_dict'])


# Update module
def update(changing_doc):
    for i in range(len(changing_doc['messages'])):
        if ((changing_doc['messages'][i]['sender'] != 'admin')
                and (changing_doc['messages'][i]['complain'] == 100)):
            changing_doc['messages'][i]['complain'] = int(
                analysis(changing_doc['messages'][i]['message']) * 100)
        else:
            continue
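# The update() module above calls an analysis() helper that is not defined in this snippet.
# A minimal, hypothetical sketch of what such a helper could look like, assuming Khaiii
# morpheme preprocessing and a softmax over the two classes; none of this is the original code.
def analysis(message):
    # Split the sentence into morphemes with Khaiii before WordPiece tokenization.
    morphs = " ".join(m.lex for word in api.analyze(message) for m in word.morphs)
    tokens = tokenizer.tokenize("[CLS] " + morphs + " [SEP]")[:MAX_LEN]
    input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(input_ids)
    # Return the probability of the positive (complaint) class.
    return torch.softmax(logits, dim=-1)[0, 1].item()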
if OUTPUT_MODE == "classification":
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=EVAL_BATCH_SIZE)

# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(
    CACHE_DIR + BERT_MODEL, cache_dir=CACHE_DIR, num_labels=len(label_list))
model.to(device)
model.eval()

eval_loss = 0
nb_eval_steps = 0
preds = []

# Predicting
for input_ids, input_mask, segment_ids, label_ids in tqdm_notebook(
        eval_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)
    with torch.no_grad():
import numpy as np

torch.backends.cudnn.deterministic = True
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

tokenizer = BertTokenizer.from_pretrained(model_type)

# ## Load Pre-Trained BERT Model
from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam

print('loading model')
model = BertForSequenceClassification.from_pretrained(model_type, cache_dir=None, num_labels=1)

# ## Fine-Tune BERT
from torch.nn import functional as F
from tqdm import tqdm, trange

train_optimization_steps = int(epochs * len(dataset) / batch_size / accumulation_steps)
param_optimizer = list(model.named_parameters())
optimizer_grouped_parameters = [{
    'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    all_tokens = []
    longer = 0
    for text in tqdm_notebook(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a) > max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"]) \
            + [0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    print(f"There are {longer} lines longer than {max_seq_length}")
    return np.array(all_tokens)


device = torch.device('cpu')
BERT_MODEL_PATH = Path('/content/drive/My Drive/cb1bert/uncased_L-12_H-768_A-12/')
model = BertForSequenceClassification.from_pretrained(WORK_DIR, cache_dir=None, num_labels=3)
model.load_state_dict(torch.load('/content/drive/My Drive/cb1bert/bert_pytorch.bin',
                                 map_location='cpu'))


def test():
    tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)
    print("Enter title")
    title = input()
    print("Enter text")
    text = input()
    # Initialise data of lists.
    input_data = {'title': [title], 'text': [text]}
    # Create DataFrame
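    # Hypothetical continuation of test() (the original snippet is cut off here). The
    # truncated conversion helper above is assumed to be named `convert_lines` and to take
    # (texts, max_seq_length, tokenizer); the name, signature, and the 220-token limit
    # are guesses, not the original code.
    df = pd.DataFrame(input_data, columns=['title', 'text'])
    token_ids = convert_lines(df['title'] + '. ' + df['text'], 220, tokenizer)

    model.eval()
    with torch.no_grad():
        logits = model(torch.tensor(token_ids, dtype=torch.long).to(device))
    print("Predicted class:", int(logits.argmax(dim=1)[0]))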
for index in range(NUM_MODEL):
    seed_everything(seed + index)
    x_train_fold = torch.tensor(x_train, dtype=torch.long)
    y_train_fold = torch.tensor(y_train, dtype=torch.float)
    train_data = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

    print("model: {}".format(index))
    net = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)
    ## load pretrained model
    # net.load_state_dict(torch.load("../input/bert-model3/bert_pytorch_v3.pt"))
    net.load_state_dict(
        torch.load("../input/pytorch-943-bert/bert_pytorch.pt"))
    net.cuda()

    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean')
    param_optimizer = list(net.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
def train(args, train_dataloader, valid_dataloader, num_train_examples): device = torch.device(args.device) model = BertForSequenceClassification.from_pretrained( args.bert_model, #cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1), num_labels=args.class_size).to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_steps = int(num_train_examples / args.batch_size / 1 * args.max_epoch) optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) if args.fl_loss: others_idx = 0 alpha = [(1. - args.fl_alpha) / 3.] * args.class_size alpha[others_idx] = args.fl_alpha criterion = FocalLoss(gamma=args.fl_gamma, alpha=alpha, size_average=True) else: criterion = nn.CrossEntropyLoss() writer = SummaryWriter(log_dir='runs/' + args.model_time) model.train() acc, loss, size, last_epoch = 0, 0, 0, -1 max_dev_acc = 0 max_dev_f1 = 0 best_model = None print("tarining start") for epoch in range(args.max_epoch): print('epoch: ', epoch + 1) for i, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch pred = model(input_ids, segment_ids, input_mask) optimizer.zero_grad() batch_loss = criterion(pred, label_ids) loss += batch_loss.item() batch_loss.backward() optimizer.step() _, pred = pred.max(dim=1) acc += (pred == label_ids).sum().float().cpu().item() size += len(pred) if (i + 1) % args.print_every == 0: acc = acc / size c = (i + 1) // args.print_every writer.add_scalar('loss/train', loss, c) writer.add_scalar('acc/train', acc, c) print( f'{i+1} steps - train loss: {loss:.3f} / train acc: {acc:.3f}' ) acc, loss, size = 0, 0, 0 if (i + 1) % args.validate_every == 0: c = (i + 1) // args.validate_every dev_loss, dev_acc, dev_f1 = test(model, valid_dataloader, criterion, args, device) if dev_acc > max_dev_acc: max_dev_acc = dev_acc if dev_f1 > max_dev_f1: max_dev_f1 = dev_f1 best_model = copy.deepcopy(model.state_dict()) writer.add_scalar('loss/dev', dev_loss, c) writer.add_scalar('acc/dev', dev_acc, c) writer.add_scalar('f1/dev', dev_f1, c) print( f'dev loss: {dev_loss:.4f} / dev acc: {dev_acc:.4f} / dev f1: {dev_f1:.4f} ' f'(max dev acc: {max_dev_acc:.4f} / max dev f1: {max_dev_f1:.4f})' ) model.train() writer.close() return best_model, max_dev_f1
labels = [i for i in range(4)]
target_names = list(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}

TARGET_NAME_PATH = os.path.join(os.path.expanduser("~"), "target_names.json")
target_names = list(set(labels))
with open(TARGET_NAME_PATH, "w") as o:
    json.dump(target_names, o)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BERT_MODEL = "bert-base-multilingual-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=4)
model.to(device)

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

MAX_SEQ_LENGTH = 100


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
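# Illustrative helper, not part of the original snippet: a minimal sketch of how a single
# (text, label) pair could be packed into the InputFeatures container above, using the
# tokenizer, MAX_SEQ_LENGTH, and label2idx already defined. The helper name is hypothetical.
def text_to_features(text, label, max_seq_length=MAX_SEQ_LENGTH):
    tokens = ["[CLS]"] + tokenizer.tokenize(text)[:max_seq_length - 2] + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    segment_ids = [0] * len(input_ids)

    # Zero-pad everything to the fixed sequence length.
    pad = [0] * (max_seq_length - len(input_ids))
    return InputFeatures(input_ids=input_ids + pad,
                         input_mask=input_mask + pad,
                         segment_ids=segment_ids + pad,
                         label_id=label2idx[label])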
# device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()
if n_gpu > 1:
    logger.info(f"let's use {n_gpu} gpu")

# random seed
random.seed(44)
np.random.seed(44)
torch.manual_seed(44)
if n_gpu > 1:
    torch.cuda.manual_seed_all(44)

model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path=bert_model_path,
                                                      cache_dir=bert_data_path,
                                                      num_labels=num_class)
model.to(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

# optim
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
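# The snippet stops right after grouping the parameters; a minimal sketch of the BertAdam
# optimizer those groups would typically feed. The step-count inputs (num_train_examples,
# train_batch_size, num_train_epochs) and the hyperparameter values are assumptions, not
# values from the original.
num_train_optimization_steps = int(num_train_examples / train_batch_size) * num_train_epochs
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=5e-5,     # assumed learning rate
                     warmup=0.1,  # assumed warmup proportion
                     t_total=num_train_optimization_steps)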
def train_Pytorch_BERT(texts, targets): epochs = 1 # No Time for more.. MAX_LEN = 35 batch_size = 32 nb_labels = 2 # OR len(input_data['target'].unique()) train_x, test_x, train_y, test_y = train_test_split(texts, targets, test_size=0.1, random_state=42) device_name = tf.test.gpu_device_name() if device_name != '/device:GPU:0': raise SystemError('GPU device not found') print('Found GPU at: {}'.format(device_name)) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") torch.cuda.get_device_name(0) sentences = ["[CLS] " + str(tweet) + " [SEP]" for tweet in train_x] print(sentences[0]) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences] print("Tokenize the first sentence:") print(tokenized_texts[0]) #input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], # maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts] input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") attention_masks = [] for seq in input_ids: seq_mask = [float(i > 0) for i in seq] attention_masks.append(seq_mask) train_inputs, validation_inputs, train_labels, validation_labels = train_test_split( input_ids, train_y, random_state=2018, test_size=0.1) train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=2018, test_size=0.1) train_inputs = torch.tensor(train_inputs) validation_inputs = torch.tensor(validation_inputs) train_labels = torch.tensor(train_labels) validation_labels = torch.tensor(validation_labels) train_masks = torch.tensor(train_masks) validation_masks = torch.tensor(validation_masks) train_data = TensorDataset(train_inputs, train_masks, train_labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels) validation_sampler = SequentialSampler(validation_data) validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size) model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=nb_labels) model.cuda() param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1) # Function to calculate the accuracy of our predictions vs labels def flat_accuracy(preds, labels): pred_flat = np.argmax(preds, axis=1).flatten() labels_flat = labels.flatten() return np.sum(pred_flat == labels_flat) / len(labels_flat) train_loss_set = [] for _ in trange(epochs, desc="Epoch"): model.train() tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch optimizer.zero_grad() loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels) train_loss_set.append(loss.item()) loss.backward() optimizer.step() tr_loss += 
loss.item() nb_tr_examples += b_input_ids.size(0) nb_tr_steps += 1 print("Train loss: {}".format(tr_loss / nb_tr_steps)) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in validation_dataloader: batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch with torch.no_grad(): logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() tmp_eval_accuracy = flat_accuracy(logits, label_ids) eval_accuracy += tmp_eval_accuracy nb_eval_steps += 1 print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps)) plt.figure(figsize=(15, 5)) plt.title("Training loss") plt.xlabel("Batch") plt.ylabel("Loss") plt.plot(train_loss_set) plt.show() ####################### ###Evaluation part##### ####################### sentences = ["[CLS] " + str(query) + " [SEP]" for query in test_x] labels = test_y tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences] # input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts], # maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts] input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post") attention_masks = [] for seq in input_ids: seq_mask = [float(i > 0) for i in seq] attention_masks.append(seq_mask) prediction_inputs = torch.tensor(input_ids) prediction_masks = torch.tensor(attention_masks) prediction_labels = torch.tensor(labels) batch_size = 32 prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) prediction_sampler = SequentialSampler(prediction_data) prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size) model.eval() predictions, true_labels = [], [] for batch in prediction_dataloader: batch = tuple(t.to(device) for t in batch) b_input_ids, b_input_mask, b_labels = batch with torch.no_grad(): logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) logits = logits.detach().cpu().numpy() label_ids = b_labels.to('cpu').numpy() predictions.append(logits) true_labels.append(label_ids) flat_predictions = [item for sublist in predictions for item in sublist] flat_predictions = np.argmax(flat_predictions, axis=1).flatten() flat_true_labels = [item for sublist in true_labels for item in sublist] print('Classification accuracy using BERT Fine Tuning: {0:0.2%}'.format( accuracy_score(flat_true_labels, flat_predictions))) return model, tokenizer
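# Illustrative follow-up, not part of the original: call train_Pytorch_BERT above and
# persist the fine-tuned weights so they can be reloaded without retraining. The `texts`
# and `targets` inputs and the output filename are assumptions.
model, tokenizer = train_Pytorch_BERT(texts, targets)
torch.save(model.state_dict(), "bert_finetuned.bin")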
# Create an iterator of our data with torch DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# model.cuda()

# # BERT model summary
# BertForSequenceClassification(
#   (bert): BertModel(
#     (embeddings): BertEmbeddings(
#       (word_embeddings): Embedding(30522, 768, padding_idx=0)
#       (position_embeddings): Embedding(512, 768)
#       (token_type_embeddings): Embedding(2, 768)
#       (LayerNorm): BertLayerNorm()
#       (dropout): Dropout(p=0.1)
#     )
#     (encoder): BertEncoder(
#       (layer): ModuleList(
#         (0): BertLayer(
def train_bert(config: PipeLineConfig): logging.basicConfig(level=logging.INFO) logging.info("Reading data...") input_folder = "../input/jigsaw-unintended-bias-in-toxicity-classification/" train = pd.read_csv(os.path.join(input_folder, "train.csv")) logging.info("Tokenizing...") with multiprocessing.Pool(processes=32) as pool: text_list = train.comment_text.tolist() sequences = pool.map(convert_line_cased, text_list) logging.info("Building ttensors for training...") sequences = np.array(sequences) lengths = np.argmax(sequences == 0, axis=1) lengths[lengths == 0] = sequences.shape[1] logging.info("Bulding target tesnor...") iden = train[IDENTITY_COLUMNS].fillna(0).values subgroup_target = np.hstack([ (iden >= 0.5).any(axis=1, keepdims=True).astype(np.int), iden, iden.max(axis=1, keepdims=True), ]) sub_target_weigths = (~train[IDENTITY_COLUMNS].isna().values.any( axis=1, keepdims=True)).astype(np.int) weights = np.ones(len(train)) weights += (iden >= 0.5).any(1) weights += (train["target"].values >= 0.5) & (iden < 0.5).any(1) weights += (train["target"].values < 0.5) & (iden >= 0.5).any(1) weights /= weights.mean() y_aux_train = train[AUX_TARGETS] y_train_torch = torch.tensor( np.hstack([ train.target.values[:, None], weights[:, None], y_aux_train, subgroup_target, sub_target_weigths, ])).float() perfect_output = torch.tensor( np.hstack([train.target.values[:, None], y_aux_train, subgroup_target])).float() logging.info("Seeding with seed %d ...", config.seed) seed_everything(config.seed) logging.info("Creating dataset...") dataset = data.TensorDataset( torch.from_numpy(sequences).long(), y_train_torch, torch.from_numpy(lengths)) train_loader = data.DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=clip_to_max_len, shuffle=True) logging.info("Creating a model...") model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=18) model.zero_grad() model = model.cuda() model.classifier.bias = nn.Parameter( perfect_bias(perfect_output.mean(0)).cuda()) logs_file = f"./tb_logs/final_{config.expname}" optimizer_grouped_parameters = [ { "params": [p for n, p in model.named_parameters() if should_decay(n)], "weight_decay": config.decay, }, { "params": [p for n, p in model.named_parameters() if not should_decay(n)], "weight_decay": 0.00, }, ] optimizer = BertAdam( optimizer_grouped_parameters, lr=config.lr, warmup=config.warmup, t_total=config.epochs * len(train_loader) // ACCUM_STEPS, ) model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0) model = model.train() writer = SummaryWriter(logs_file) agg = TensorboardAggregator(writer) custom_loss = prepare_loss(config) for _ in range(config.epochs): for j, (X, y) in enumerate(train_loader): X = X.cuda() y = y.cuda() y_pred = model(X, attention_mask=(X > 0)) loss = custom_loss(y_pred, y) accuracy = ((y_pred[:, 0] > 0) == (y[:, 0] > 0.5)).float().mean() agg.log({ "train_loss": loss.item(), "train_accuracy": accuracy.item() }) with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if (j + 1) % ACCUM_STEPS == 0: optimizer.step() optimizer.zero_grad() torch.save(model.state_dict(), f"./models/final-pipe3-{config.expname}.bin")
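# Illustrative invocation of train_bert above. PipeLineConfig's constructor is not shown
# in this snippet, so the field names below are inferred from how `config` is used
# (seed, expname, decay, lr, warmup, epochs); any fields consumed by prepare_loss are
# omitted. Treat all values and names as assumptions.
if __name__ == "__main__":
    config = PipeLineConfig(seed=1234, expname="baseline",
                            decay=0.01, lr=2e-5, warmup=0.05, epochs=2)
    train_bert(config)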