def build_model(args):
    """Build the tokenizer and classifier selected by args.clf_model."""
    if args.clf_model.lower() == "cnn":
        # The DistilBERT tokenizer is only used here for text tokenization
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        model = CNN_Text(args)
    elif args.clf_model.lower() == "robert":
        print("name is {}".format(args.model_name_or_path))
        tokenizer = RobertaTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = RobertaConfig.from_pretrained(args.model_name_or_path,
                                               num_labels=args.num_labels,
                                               finetuning_task=args.task_name)
        model = RobertaForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # Optionally freeze the transformer weights
        if args.freeze:
            for n, p in model.named_parameters():
                if "bert" in n:
                    p.requires_grad = False
    elif args.clf_model.lower() == "bert":
        tokenizer = BertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=args.num_labels,
                                            finetuning_task=args.task_name)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # Freezing the transformer weights is disabled for this branch
        # if args.freeze:
        #     for n, p in model.named_parameters():
        #         if "bert" in n:
        #             p.requires_grad = False
    else:
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = DistilBertConfig.from_pretrained(
            args.model_name_or_path,
            num_labels=args.num_labels,
            finetuning_task=args.task_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        model.expand_class_head(args.multi_head)

    model = model.to(args.device)
    return tokenizer, model
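# Usage sketch (not from the original source): build_model() only needs an
# argparse-style namespace with the fields read above; every value below is
# illustrative.
if __name__ == "__main__":
    from argparse import Namespace
    import torch

    demo_args = Namespace(
        clf_model="bert",                 # "cnn", "robert", "bert", or anything else for DistilBERT
        model_name_or_path="bert-base-uncased",
        do_lower_case=True,
        num_labels=2,
        task_name="sst-2",                # hypothetical fine-tuning task name
        freeze=False,
        multi_head=1,                     # only read by the DistilBERT branch
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    )
    tokenizer, model = build_model(demo_args)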
def main():
    import config as args

    processors = {
        "bert": bertProcessor,
        "bertf1c": bertf1cProcessor,
        "berts": bertsProcessor,
        "bertsf1c": bertsf1cProcessor,
    }

    device = torch.device("cuda")
    n_gpu = torch.cuda.device_count()
    logger.info("device %s n_gpu %d", device, n_gpu)
    # args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and 'model.pt' in os.listdir(args.output_dir):
        if args.do_train and not args.resume:
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    label_list = processor.get_labels()
    print(label_list)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / n_class / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    model = BertForSequenceClassification(args.bert_dir, 1)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if n not in no_decay],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if n in no_decay],
         'weight_decay_rate': 0.0},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)
    global_step = 0

    if args.do_eval:
        eval_examples = processor.get_test_examples(args.data_dir)  # for test datasets
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer)
        input_ids = []
        input_mask = []
        segment_ids = []
        label_id = []
        for f in eval_features:
            input_ids.append([])
            input_mask.append([])
            segment_ids.append([])
            for i in range(n_class):
                input_ids[-1].append(f[i].input_ids)
                input_mask[-1].append(f[i].input_mask)
                segment_ids[-1].append(f[i].segment_ids)
            label_id.append([f[0].label_id])

        all_input_ids = torch.tensor(input_ids, dtype=torch.long)
        all_input_mask = torch.tensor(input_mask, dtype=torch.long)
        all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        all_label_ids = torch.tensor(label_id, dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

    if args.do_train:
        best_metric = 0
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        input_ids = []
        input_mask = []
        segment_ids = []
        label_id = []
        for f in train_features:
            input_ids.append([])
            input_mask.append([])
            segment_ids.append([])
            for i in range(n_class):
                input_ids[-1].append(f[i].input_ids)
                input_mask[-1].append(f[i].input_mask)
                segment_ids[-1].append(f[i].segment_ids)
            label_id.append([f[0].label_id])

        all_input_ids = torch.tensor(input_ids, dtype=torch.long)
        all_input_mask = torch.tensor(input_mask, dtype=torch.long)
        all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        all_label_ids = torch.tensor(label_id, dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, label_ids, 1)
                loss = loss.mean()
                # if args.gradient_accumulation_steps > 1:
                #     loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                model.zero_grad()
                global_step += 1

            # Evaluate after every epoch
            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            logits_all = []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, 1)

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                for i in range(len(logits)):
                    logits_all += [logits[i]]

                tmp_eval_accuracy = accuracy(logits, label_ids.reshape(-1))
                eval_loss += tmp_eval_loss.mean().item()
                eval_accuracy += tmp_eval_accuracy
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples

            if args.do_train:
                result = {'eval_loss': eval_loss,
                          'global_step': global_step,
                          'loss': tr_loss / nb_tr_steps}
            else:
                result = {'eval_loss': eval_loss}

            eval_f1, eval_T2 = f1_eval(logits_all, eval_features)
            result["f1"] = eval_f1
            result["T2"] = eval_T2

            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))

            if eval_f1 >= best_metric:
                torch.save(model.state_dict(), os.path.join(args.output_dir, "model_best.pt"))
                best_metric = eval_f1

        # Reload the best checkpoint and save it as the final model
        model.load_state_dict(torch.load(os.path.join(args.output_dir, "model_best.pt")))
        torch.save(model.state_dict(), os.path.join(args.output_dir, "model.pt"))

    model.load_state_dict(torch.load(os.path.join(args.output_dir, "model.pt")))

    if args.do_eval:
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        logits_all = []
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, 1)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            for i in range(len(logits)):
                logits_all += [logits[i]]

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        if args.do_train:
            result = {'eval_loss': eval_loss,
                      'global_step': global_step,
                      'loss': tr_loss / nb_tr_steps}
        else:
            result = {'eval_loss': eval_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        output_eval_file = os.path.join(args.output_dir, "logits_dev.txt")
        with open(output_eval_file, "w") as f:
            for i in range(len(logits_all)):
                for j in range(len(logits_all[i])):
                    f.write(str(logits_all[i][j]))
                    if j == len(logits_all[i]) - 1:
                        f.write("\n")
                    else:
                        f.write(" ")

        # Rebuild the evaluation data and evaluate again, writing *_test.txt outputs
        eval_examples = processor.get_test_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        input_ids = []
        input_mask = []
        segment_ids = []
        label_id = []
        for f in eval_features:
            input_ids.append([])
            input_mask.append([])
            segment_ids.append([])
            for i in range(n_class):
                input_ids[-1].append(f[i].input_ids)
                input_mask[-1].append(f[i].input_mask)
                segment_ids[-1].append(f[i].segment_ids)
            label_id.append([f[0].label_id])

        all_input_ids = torch.tensor(input_ids, dtype=torch.long)
        all_input_mask = torch.tensor(input_mask, dtype=torch.long)
        all_segment_ids = torch.tensor(segment_ids, dtype=torch.long)
        all_label_ids = torch.tensor(label_id, dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        logits_all = []
        for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids, input_mask, label_ids, 1)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            for i in range(len(logits)):
                logits_all += [logits[i]]

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        if args.do_train:
            result = {'eval_loss': eval_loss,
                      'global_step': global_step,
                      'loss': tr_loss / nb_tr_steps}
        else:
            result = {'eval_loss': eval_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results_test.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        output_eval_file = os.path.join(args.output_dir, "logits_test.txt")
        with open(output_eval_file, "w") as f:
            for i in range(len(logits_all)):
                for j in range(len(logits_all[i])):
                    f.write(str(logits_all[i][j]))
                    if j == len(logits_all[i]) - 1:
                        f.write("\n")
                    else:
                        f.write(" ")
            mge.tensor(t) for t in batch
        )
        batch_size = input_ids.shape[0]
        loss, logits, label_ids = net_eval(
            input_ids, segment_ids, input_mask, label_ids, net=net
        )
        sum_loss += loss.mean().item()
        sum_accuracy += accuracy(logits, label_ids)
        total_examples += batch_size
        total_steps += 1

    result = {
        "eval_loss": sum_loss / total_steps,
        "eval_accuracy": sum_accuracy / total_examples,
    }
    logger.info("***** Eval results *****")
    for key in sorted(result.keys()):
        logger.info("%s = %s", key, str(result[key]))


if __name__ == "__main__":
    bert, config, vocab_file = create_hub_bert(args.pretrained_bert, pretrained=False)
    args.vocab_file = vocab_file
    model = BertForSequenceClassification(config, num_labels=2, bert=bert)
    mrpc_dataset = MRPCDataset(args)
    model.load_state_dict(mge.load(args.load_model_path))
    mrpc_dataset = MRPCDataset(args)
    eval_dataloader, eval_size = mrpc_dataset.get_eval_dataloader()
    eval(eval_dataloader, model)
def train(args, train_dataset, tokenizer):
    """Train the model."""
    # Load the pretrained model
    nn.load_parameters(args.pretrained_model)
    # Drop the final layer for task-specific fine-tuning
    nn.parameter.pop_parameter('affine_seq_class/affine/W')
    nn.parameter.pop_parameter('affine_seq_class/affine/b')

    train_dataloader = data_iterator(train_dataset, batch_size=args.train_batch_size)

    global_step = 0
    train_loss = 0.0
    model = BertForSequenceClassification()

    input_ids = nn.Variable((args.train_batch_size, args.max_seq_length))
    attention_mask = nn.Variable((args.train_batch_size, args.max_seq_length))
    token_type_ids = nn.Variable((args.train_batch_size, args.max_seq_length))
    labels = nn.Variable((args.train_batch_size, ))

    input_ids_eval = nn.Variable((args.eval_batch_size, args.max_seq_length))
    attention_mask_eval = nn.Variable((args.eval_batch_size, args.max_seq_length))
    token_type_ids_eval = nn.Variable((args.eval_batch_size, args.max_seq_length))
    labels_eval = nn.Variable((args.eval_batch_size, ))

    activation = F.gelu
    if args.activation == 'relu':
        activation = F.relu

    loss, _, train_error = model(args,
                                 input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=labels,
                                 num_labels=args.num_labels,
                                 vocab_size=args.vocab_size,
                                 num_embed_dim=args.num_embed_dim,
                                 num_pos_ids=args.num_position_ids,
                                 num_attention_layers=args.num_attention_layers,
                                 num_attention_embed_dim=args.num_attention_embed_dim,
                                 num_attention_heads=args.num_attention_heads,
                                 num_attention_dim_feedforward=args.num_attention_dim_feedforward,
                                 attention_activation=activation,
                                 pool_outmap=args.num_pool_outmap,
                                 embed_dropout_prob=args.embed_dropout,
                                 attention_dropout_prob=args.attention_dropout,
                                 dropout_prob=args.last_dropout,
                                 test=False)
    loss.persistent = True

    if args.solver == 'Adam':
        solver = S.Adam(args.learning_rate, eps=args.adam_epsilon)
    else:
        solver = S.AdamW(args.learning_rate, eps=args.adam_epsilon)
    solver.set_parameters(nn.get_parameters())

    monitor = Monitor(args.output_dir)
    monitor_loss = MonitorSeries("Training Loss", monitor, interval=10)
    monitor_eloss = MonitorSeries("Evaluation Loss", monitor, interval=10)
    monitor_train_error = MonitorSeries("Training Error Rate", monitor, interval=10)
    monitor_lr = MonitorSeries("Learning Rate", monitor, interval=10)

    total_steps = train_dataloader.size // args.train_batch_size
    var_linear = total_steps * args.num_train_epochs
    var_warmup = total_steps * (args.num_train_epochs - 1)

    for epoch in range(args.num_train_epochs):
        logger.info("Starting Epoch %d out of %d", epoch + 1, args.num_train_epochs)
        for it in range(total_steps):
            batch = train_dataloader.next()
            input_ids.d = batch[0]
            attention_mask.d = batch[1]
            token_type_ids.d = batch[2]
            labels.d = batch[3]

            learning_rate_linear = lr_linear(global_step, var_linear)
            learning_rate = args.learning_rate * learning_rate_linear
            if epoch == 0:
                learning_rate = args.learning_rate * (global_step / total_steps)
            if epoch > 0:
                learning_rate_linear = lr_linear((global_step - total_steps), var_warmup)
                learning_rate = args.learning_rate * learning_rate_linear

            solver.zero_grad()
            nn.forward_all([loss, train_error], clear_no_need_grad=True)
            loss.backward(clear_buffer=True)
            solver.weight_decay(args.weight_decay)
            solver.clip_grad_by_norm(args.max_grad_norm)
            solver.set_learning_rate(learning_rate)
            solver.update()

            monitor_loss.add(
                (train_dataloader.size // args.train_batch_size) * epoch + it,
                loss.d.copy())
            monitor_train_error.add(
                (train_dataloader.size // args.train_batch_size) * epoch + it,
                train_error.d.copy())
            monitor_lr.add(global_step, learning_rate)
            global_step += 1
            train_loss += F.mean(loss.data)

        # Evaluate at the end of every epoch
        eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
        eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
        results = {}
        for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
            print(eval_task)
            eval_dataset = BERTDataSource(args, tokenizer, evaluate=True, shuffle=False)
            if not os.path.exists(eval_output_dir):
                os.makedirs(eval_output_dir)
            eval_dataloader = data_iterator(eval_dataset, batch_size=args.eval_batch_size)
            total_eval_steps = eval_dataloader.size // args.eval_batch_size
            eval_loss = 0.0
            nb_eval_steps = 0
            preds = None
            out_label_ids = None

            tmp_eval_loss, logits, eval_error = model(args,
                                                      input_ids=input_ids_eval,
                                                      attention_mask=attention_mask_eval,
                                                      token_type_ids=token_type_ids_eval,
                                                      labels=labels_eval,
                                                      num_labels=args.num_labels,
                                                      vocab_size=args.vocab_size,
                                                      num_embed_dim=args.num_embed_dim,
                                                      num_pos_ids=args.num_position_ids,
                                                      num_attention_layers=args.num_attention_layers,
                                                      num_attention_embed_dim=args.num_attention_embed_dim,
                                                      num_attention_heads=args.num_attention_heads,
                                                      num_attention_dim_feedforward=args.num_attention_dim_feedforward,
                                                      attention_activation=activation,
                                                      pool_outmap=args.num_pool_outmap,
                                                      embed_dropout_prob=args.embed_dropout,
                                                      attention_dropout_prob=args.attention_dropout,
                                                      dropout_prob=args.last_dropout,
                                                      test=True)
            tmp_eval_loss.persistent = True
            eval_loss += F.mean(tmp_eval_loss)

            for it in range(total_eval_steps):
                print(it, " ", total_eval_steps)
                batch_eval = eval_dataloader.next()
                input_ids_eval.d = batch_eval[0]
                attention_mask_eval.d = batch_eval[1]
                token_type_ids_eval.d = batch_eval[2]
                labels_eval.d = batch_eval[3]
                nb_eval_steps += 1
                eval_loss.forward()
                monitor_eloss.add(it, eval_loss.d.copy())
                if preds is None:
                    preds = logits.d.copy()
                    out_label_ids = labels_eval.d.copy()
                else:
                    preds = np.append(preds, logits.d.copy(), axis=0)
                    out_label_ids = np.append(out_label_ids, labels_eval.d.copy(), axis=0)

            eval_loss = eval_loss.d / nb_eval_steps
            if args.output_mode == "classification":
                preds = np.argmax(preds, axis=1)
            elif args.output_mode == "regression":
                preds = np.squeeze(preds)
            result = compute_metrics(eval_task, preds, out_label_ids)
            results.update(result)

            output_eval_file = os.path.join(eval_output_dir, "", "eval_results.txt")
            with open(output_eval_file, "a") as writer:
                logger.info("***** Evaluation results {} *****".format(""))
                for key in sorted(result.keys()):
                    logger.info("%d %s = %s\n", epoch + 1, key, str(result[key]))
                    writer.write("%d %s = %s\n" % (epoch + 1, key, str(result[key])))
            print("results", results)

    return results
checkpoint = "./output/tacred-large/checkpoint-12000/" pretrained_model_name = "bert-large-cased" do_lower = ("-uncased" in pretrained_model_name) input_file = "/home/jiaming/datasets/TACRED/data/tsv_cased/test.tsv" # output_eval_file = "./eval/tac_res.txt" output_eval_file = "./eval/tac_res_large.txt" batch_size = 16 """ Start eval """ additional_special_tokens = ["[E11]", "[E12]", "[E21]", "[E22]"] tokenizer = BertTokenizer.from_pretrained( pretrained_model_name, do_lower_case=do_lower, additional_special_tokens=additional_special_tokens) model = BertForSequenceClassification.from_pretrained(checkpoint) model.to(device) eval_dataset = load_examples(input_file, tokenizer) eval_sampler = SequentialSampler(eval_dataset) eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size, shuffle=False) eval_loss = 0.0 nb_eval_steps = 0 pred_logits = None out_label_ids = None input_ids = None
from data import read_data
from model import BertForSequenceClassification
from utils import get_project_root

# loading config params
project_root: Path = get_project_root()
with open(str(project_root / "config.yml")) as f:
    params = yaml.load(f, Loader=yaml.FullLoader)

# read and process data
train_val_loaders, test_loaders = read_data(params)

# initialize the model
model = BertForSequenceClassification(
    pretrained_model_name=params["model"]["model_name"],
    num_classes=params["model"]["num_classes"],
)

# specify the criterion for the multi-class classification task, the optimizer and the scheduler
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=float(params["training"]["learn_rate"]))
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)

# reproducibility
set_global_seed(params["general"]["seed"])
prepare_cudnn(deterministic=True)

# here we specify that we pass masks to the runner, so the model's forward method
# will be called with these arguments
runner = SupervisedRunner(input_key=("features", "attention_mask"))
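# A minimal sketch of how training might be launched from here, assuming
# train_val_loaders is the OrderedDict of "train"/"valid" loaders that Catalyst's
# SupervisedRunner.train() expects; the logdir value and the "num_epochs" config
# key are illustrative, not taken from config.yml.
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=train_val_loaders,
    logdir="./logdir",  # hypothetical output directory
    num_epochs=int(params["training"].get("num_epochs", 3)),  # assumed config key
    verbose=True,
)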
def main(i):
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default='Cross-Modal-BERT-master/data/text',
                        type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model",
                        default='Cross-Modal-BERT-master/pre-trained BERT',
                        type=str,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default='Multi',
                        type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='Cross-Modal-BERT-master/CM-BERT_output',
                        type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=50,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. "
                             "Sequences longer than this will be truncated, and sequences shorter "
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        default=True,
                        help="Whether to run training.")
    parser.add_argument("--do_test",
                        default=True,
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        default=True,
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=24,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=24,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--test_batch_size",
                        default=24,
                        type=int,
                        help="Total batch size for test.")
    parser.add_argument("--learning_rate",
                        default=2e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=11111,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    args = parser.parse_args()

    processors = {
        "multi": PgProcessor,
    }
    num_labels_task = {
        "multi": 1,
    }

    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = 2
    logger.info("device: {} n_gpu: {}".format(device, n_gpu))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    seed_num = np.random.randint(1, 10000)
    random.seed(seed_num)
    np.random.seed(seed_num)
    torch.manual_seed(seed_num)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed_num)

    if not args.do_train and not args.do_test:
        raise ValueError("At least one of `do_train` or `do_test` must be True.")
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format("-1"))
    ##############################################################################################################
    model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                          cache_dir=cache_dir,
                                                          num_labels=num_labels)
    # Freeze all parameters except the transformer encoder layers, the fine-tuning
    # module (BertFinetun) and the pooler
    for name, param in model.named_parameters():
        param.requires_grad = False
        if "encoder.layer.0" in name or "encoder.layer.1" in name:
            param.requires_grad = True
        if "encoder.layer.2" in name or "encoder.layer.3" in name:
            param.requires_grad = True
        if "encoder.layer.4" in name or "encoder.layer.5" in name:
            param.requires_grad = True
        if "encoder.layer.6" in name or "encoder.layer.7" in name:
            param.requires_grad = True
        if "encoder.layer.8" in name or "encoder.layer.9" in name:
            param.requires_grad = True
        if "encoder.layer.10" in name or "encoder.layer.11" in name:
            param.requires_grad = True
        if "BertFinetun" in name or "pooler" in name:
            param.requires_grad = True
    ##############################################################################################################
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    new_decay = ['BertFine']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay) and not any(np in n for np in new_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay) and any(np in n for np in new_decay)],
         'lr': 0.01},
    ]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    train_audio, valid_audio, test_audio = pickle.load(
        open('Cross-Modal-BERT-master/data/audio/MOSI_cmu_audio_CLS.pickle', 'rb'))
    max_acc = 0
    min_loss = 100

    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_train_audio = torch.tensor(train_audio, dtype=torch.float32)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float32)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_train_audio, all_label_ids)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        ## Evaluate after each epoch
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer)
        all_valid_audio = torch.tensor(valid_audio, dtype=torch.float32, requires_grad=True)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float32)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_valid_audio, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, all_train_audio, label_ids = batch
                loss = model(input_ids, all_train_audio, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            model.eval()
            eval_loss, eval_accuracy = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            for input_ids, input_mask, segment_ids, all_valid_audio, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                all_valid_audio = all_valid_audio.to(device)
                with torch.no_grad():
                    tmp_eval_loss = model(input_ids, all_valid_audio, segment_ids, input_mask, label_ids)
                    logits, _, _ = model(input_ids, all_valid_audio, segment_ids, input_mask)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                tmp_eval_accuracy = accuracy(logits, label_ids)
                eval_loss += tmp_eval_loss.mean().item()
                eval_accuracy += tmp_eval_accuracy
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            eval_accuracy = eval_accuracy / nb_eval_examples
            loss = tr_loss / nb_tr_steps if args.do_train else None
            result = {'eval_loss': eval_loss,
                      'eval_accuracy': eval_accuracy,
                      'global_step': global_step,
                      'loss': loss}
            output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))

            # Save a trained model and the associated configuration
            if eval_loss < min_loss:
                min_loss = eval_loss
                model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
                output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                torch.save(model_to_save.state_dict(), output_model_file)
                output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
                with open(output_config_file, 'w') as f:
                    f.write(model_to_save.config.to_json_string())

    if args.do_test:
        ## Evaluate on the test set
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features(test_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("")
        logger.info("***** Running test *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.test_batch_size)
        all_test_audio = torch.tensor(test_audio, dtype=torch.float32, requires_grad=True)
        all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.float32)
        test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_test_audio)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.test_batch_size)
        model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                              cache_dir=cache_dir,
                                                              num_labels=num_labels)
        model.load_state_dict(torch.load('Cross-Modal-BERT-master/CM-BERT_output/pytorch_model.bin'))
        model.to(device)
        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_test_steps, nb_test_examples = 0, 0
        predict_list = []
        truth_list = []
        text_attention_list = []
        fusion_attention_list = []
        with torch.no_grad():
            for input_ids, input_mask, segment_ids, label_ids, all_test_audio in tqdm(test_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)
                all_test_audio = all_test_audio.to(device)
                with torch.no_grad():
                    tmp_test_loss = model(input_ids, all_test_audio, segment_ids, input_mask, label_ids)
                    logits, text_attention, fusion_attention = model(input_ids, all_test_audio, segment_ids, input_mask)
                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                text_attention = text_attention.cpu().numpy()
                fusion_attention = fusion_attention.cpu().numpy()
                test_loss += tmp_test_loss.mean().item()
                for i in range(len(logits)):
                    predict_list.append(logits[i])
                    truth_list.append(label_ids[i])
                    text_attention_list.append(text_attention[i])
                    fusion_attention_list.append(fusion_attention[i])
                nb_test_examples += input_ids.size(0)
                nb_test_steps += 1

        exclude_zero = False
        non_zeros = np.array([i for i, e in enumerate(predict_list) if e != 0 or (not exclude_zero)])
        predict_list = np.array(predict_list).reshape(-1)
        truth_list = np.array(truth_list)
        predict_list1 = (predict_list[non_zeros] > 0)
        truth_list1 = (truth_list[non_zeros] > 0)
        test_loss = test_loss / nb_test_steps
        test_preds_a7 = np.clip(predict_list, a_min=-3., a_max=3.)
        test_truth_a7 = np.clip(truth_list, a_min=-3., a_max=3.)
        acc7 = accuracy_7(test_preds_a7, test_truth_a7)
        f_score = f1_score(predict_list1, truth_list1, average='weighted')
        acc = accuracy_score(truth_list1, predict_list1)
        corr = np.corrcoef(predict_list, truth_list)[0][1]
        mae = np.mean(np.absolute(predict_list - truth_list))
        loss = tr_loss / nb_tr_steps if args.do_train else None
        results = {'test_loss': test_loss,
                   'global_step': global_step,
                   'loss': loss,
                   'acc': acc,
                   'F1': f_score,
                   'mae': mae,
                   'corr': corr,
                   'acc7': acc7}
        logger.info("***** test results *****")
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
    return results
        optimizer.step()
        sum_loss += loss.mean().item()
        sum_accuracy += accuracy(logits, label_ids)
        total_examples += batch_size
        total_steps += 1

    result = {
        "train_loss": sum_loss / total_steps,
        "train_accuracy": sum_accuracy / total_examples,
    }
    logger.info("***** Train results *****")
    for key in sorted(result.keys()):
        logger.info("%s = %s", key, str(result[key]))


if __name__ == "__main__":
    bert, config, vocab_file = create_hub_bert(args.pretrained_bert, pretrained=True)
    args.vocab_file = vocab_file
    model = BertForSequenceClassification(config, num_labels=2, bert=bert)
    mrpc_dataset = MRPCDataset(args)
    optimizer = optim.Adam(
        model.parameters(requires_grad=True),
        lr=args.learning_rate,
    )
    mrpc_dataset = MRPCDataset(args)
    train_dataloader, train_size = mrpc_dataset.get_train_dataloader()
    eval_dataloader, eval_size = mrpc_dataset.get_eval_dataloader()
    for epoch in range(args.num_train_epochs):
        logger.info("***** Epoch {} *****".format(epoch + 1))
        train(train_dataloader, model, optimizer)
        mge.save(model.state_dict(), args.save_model_path)
        eval(eval_dataloader, model)
import time
import datetime
import os
import os.path
import random

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from transformers import AdamW, BertConfig, get_linear_schedule_with_warmup

# Local modules, as in the companion scripts of this project
from model import BertForSequenceClassification
from dataloader_training import dataloader

train_dataloader = dataloader()

device = torch.device("cuda")
model = BertForSequenceClassification(3)
model.cuda()

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 10
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)


def format_time(elapsed):
    """Format a duration in seconds as hh:mm:ss."""
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))


def save_checkpoint(state, is_best, filename='./checkpoint_4.pth.tar'):
    # The original body is truncated here; this minimal version just persists the
    # checkpoint dict. `is_best` could additionally trigger a copy to a "best" file.
    torch.save(state, filename)
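# Example call (not from the original source): the checkpoint dict mirrors the keys
# that the companion validation script reads back ('epoch', 'best_accuracy',
# 'state_dict'); the values here are placeholders.
save_checkpoint(
    {"epoch": 0, "best_accuracy": 0.0, "state_dict": model.state_dict()},
    is_best=False,
)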
import json
import math
import os
import os.path

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import nltk
from nltk.corpus import stopwords
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

from model import BertForSequenceClassification

nltk.download('stopwords')
stop = set(stopwords.words('english'))

device = torch.device("cuda")
model = BertForSequenceClassification(3)
model.cuda()

# string = r"#$%&'!()*+,-.:;<=>?@[\]^_\"`{|}~"
# def remove_URL(text):
#     return re.sub(r"https?://\S+|www\.\S+", "", text)
# def remove_html(text):
#     html = re.compile(r'<.*?>')
#     return html.sub(r'', text)
# def remove_punct(text):
def main():
    parser = ArgumentParser(description="BERT for relation extraction (classification)")
    parser.add_argument('--config', dest='config')
    args = parser.parse_args()
    config = Config(args.config)

    if os.path.exists(config.output_dir) and os.listdir(config.output_dir) \
            and config.train and not config.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(config.output_dir))

    # Setup CUDA, GPU & distributed training
    if config.local_rank == -1 or config.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not config.no_cuda else "cpu")
        config.n_gpu = torch.cuda.device_count()
    else:
        # Initializes the distributed backend, which takes care of synchronizing nodes/GPUs
        torch.cuda.set_device(config.local_rank)
        device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        config.n_gpu = 1
    config.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s",
                   config.local_rank, device, config.n_gpu, bool(config.local_rank != -1))

    # Set seed
    set_seed(config.seed)

    # Prepare the task
    processor = data_processors["semeval"]()
    output_mode = output_modes["semeval"]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if config.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training downloads model & vocab
        torch.distributed.barrier()

    bertconfig = BertConfig.from_pretrained(config.pretrained_model_name,
                                            num_labels=num_labels,
                                            finetuning_task=config.task_name)
    # './large-uncased-model', num_labels=num_labels, finetuning_task=config.task_name)
    bertconfig.l2_reg_lambda = config.l2_reg_lambda
    bertconfig.latent_entity_typing = config.latent_entity_typing
    if config.l2_reg_lambda > 0:
        logger.info("using L2 regularization with lambda %.5f", config.l2_reg_lambda)
    if config.latent_entity_typing:
        logger.info("adding the component of latent entity typing: %s",
                    str(config.latent_entity_typing))

    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased',
        do_lower_case=True,
        additional_special_tokens=additional_special_tokens)
    # 'bert-large-uncased', do_lower_case=True, additional_special_tokens=additional_special_tokens)
    model = BertForSequenceClassification.from_pretrained(
        config.pretrained_model_name, config=bertconfig)
    # './large-uncased-model', config=bertconfig)

    if config.local_rank == 0:
        # Make sure only the first process in distributed training downloads model & vocab
        torch.distributed.barrier()

    model.to(config.device)
    # logger.info("Training/evaluation parameters %s", config)

    # Training
    if config.train:
        train_dataset = load_and_cache_examples(config, config.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(config, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if config.train and (config.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(config.output_dir) and config.local_rank in [-1, 0]:
            os.makedirs(config.output_dir)
        logger.info("Saving model checkpoint to %s", config.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`.
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(config.output_dir)
        tokenizer.save_pretrained(config.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(config, os.path.join(config.output_dir, 'training_config.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForSequenceClassification.from_pretrained(config.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            config.output_dir,
            do_lower_case=True,
            additional_special_tokens=additional_special_tokens)
        model.to(config.device)

    # Evaluation
    results = {}
    if config.eval and config.local_rank in [-1, 0]:
        tokenizer = BertTokenizer.from_pretrained(
            config.output_dir,
            do_lower_case=True,
            additional_special_tokens=additional_special_tokens)
        checkpoints = [config.output_dir]
        if config.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(config.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            model = BertForSequenceClassification.from_pretrained(checkpoint)
            model.to(config.device)
            result = evaluate(config, model, tokenizer, prefix=global_step)
            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
import math
import os
import os.path
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from model import BertForSequenceClassification
from dataloader_training import dataloader

validation_dataloader = dataloader()

device = torch.device("cuda")
model = BertForSequenceClassification(3)
model.cuda()


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)


# Resume from a saved checkpoint if one exists
resume_weights = './checkpoint_4.pth.tar'
if os.path.isfile(resume_weights):
    if device:
        checkpoint = torch.load(resume_weights)
    start_epoch = checkpoint['epoch']
    best_accuracy = checkpoint['best_accuracy']
    model.load_state_dict(checkpoint['state_dict'])