N, D = train_x.shape
_, num_classes = train_y.shape
train_x = torch.from_numpy(train_x).type(torch.float)
train_y = torch.from_numpy(train_y)
test_x = torch.from_numpy(test_x).type(torch.float)
test_y = torch.from_numpy(test_y)

max_iters = 51
# pick a batch size, learning rate
batch_size = 16
learning_rate = 1e-3
hidden_size = 64

batches = DataLoader(TensorDataset(train_x, train_y),
                     shuffle=True,
                     batch_size=batch_size)

model = torch.nn.Sequential(
    torch.nn.Linear(D, hidden_size),
    torch.nn.Sigmoid(),
    torch.nn.Linear(hidden_size, num_classes),
    # torch.nn.Softmax()
)
# print(model)

optimizer = torch.optim.SGD(model.parameters(),
                            lr=learning_rate,
                            momentum=0.9)
# optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

train_loss = []
def load_and_cache_examples(self, examples, evaluate=False, no_cache=False):
    """
    Converts a list of InputExample objects to a TensorDataset containing InputFeatures.
    Caches the InputFeatures.

    Utility function for train() and eval() methods. Not intended to be used directly.
    """
    process_count = self.args['process_count']

    tokenizer = self.tokenizer
    output_mode = 'classification'
    args = self.args

    if not os.path.isdir(self.args['cache_dir']):
        os.mkdir(self.args['cache_dir'])

    mode = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(
        args['cache_dir'],
        f"cached_{mode}_{args['model_type']}_{args['max_seq_length']}_binary")

    if os.path.exists(cached_features_file) and not args['reprocess_input_data'] and not no_cache:
        features = torch.load(cached_features_file)
    else:
        features = convert_examples_to_features(
            examples,
            args['max_seq_length'],
            tokenizer,
            output_mode,
            # xlnet has a cls token at the end
            cls_token_at_end=bool(args['model_type'] in ['xlnet']),
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            # roberta uses an extra separator b/w pairs of sentences,
            # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            sep_token_extra=bool(args['model_type'] in ['roberta']),
            # pad on the left for xlnet
            pad_on_left=bool(args['model_type'] in ['xlnet']),
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0,
            process_count=process_count)

        if not no_cache:
            torch.save(features, cached_features_file)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)

    if output_mode == "classification":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    elif output_mode == "regression":
        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_input_mask,
                            all_segment_ids, all_label_ids)

    return dataset
def test_eval(self): data = DATAMultiWOZ(debug=False, data_dir=self.data_dir) test_examples = data.read_examples( os.path.join(self.data_dir, 'test.json')) print('eval_examples的数量', len(test_examples)) dialogueID = [x.guid for x in test_examples] utterance_text = [x.text_history for x in test_examples] test_features = data.convert_examples_to_features( test_examples, self.tokenizer, self.max_seq_length) test_input_ids = torch.tensor(data.select_field( test_features, 'input_ids'), dtype=torch.long) test_input_mask = torch.tensor(data.select_field( test_features, 'input_mask'), dtype=torch.long) test_segment_ids = torch.tensor(data.select_field( test_features, 'segment_ids'), dtype=torch.long) test_utterance_mask = torch.tensor(data.select_field( test_features, 'utterance_mask'), dtype=torch.long) test_domainslot_mask = torch.tensor(data.select_field( test_features, 'domainslot_mask'), dtype=torch.long) test_label_tokens_start = torch.tensor( [f.label_tokens_start for f in test_features], dtype=torch.long) test_label_tokens_end = torch.tensor( [f.label_tokens_end for f in test_features], dtype=torch.long) test_label_sentence_domainslot = torch.tensor( [f.label_sentence_domainslot for f in test_features], dtype=torch.long) text_histtokens = [f.hist_token for f in test_features] test_data = TensorDataset(test_input_ids, test_input_mask, test_segment_ids, test_utterance_mask, test_domainslot_mask, test_label_tokens_start, test_label_tokens_end, test_label_sentence_domainslot) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=self.eval_batch_size) config = BertConfig.from_pretrained(self.model_name_or_path) model = BertForTokenClassification.from_pretrained(os.path.join( self.output_dir, "pytorch_model.bin"), self.args, config=config) model.to(self.device) model.eval() gold_labels_tokens_start = [] gold_labels_tokens_end = [] gold_label_sentence_domainslot = [] scores_tokens_start = [] scores_tokens_end = [] scores_sentence_domainslot = [] # ID = [x.guid for x in eval_examples] dialogueID = [x.guid for x in test_examples] # utterance_text = [x.text_eachturn for x in test_examples] for input_ids, input_mask, segment_ids, \ utterance_mask, domainslot_mask, \ label_tokens_start, label_tokens_end, \ label_sentence_domainslot in test_dataloader: input_ids = input_ids.to(self.device) input_mask = input_mask.to(self.device) segment_ids = segment_ids.to(self.device) utterance_mask = utterance_mask.to(self.device) domainslot_mask = domainslot_mask.to(self.device) label_tokens_start = label_tokens_start.to(self.device) label_tokens_end = label_tokens_end.to(self.device) label_sentence_domainslot = label_sentence_domainslot.to( self.device) logits_tokens_start, logits_tokens_end, logits_sentence_domainslot, _ = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, utterance_mask=utterance_mask, domainslot_mask=domainslot_mask) logits_tokens_start = logits_tokens_start.detach().view( -1, 2).cpu().numpy() logits_tokens_end = logits_tokens_end.detach().view( -1, 2).cpu().numpy() logits_sentence_domainslot = logits_sentence_domainslot.view( -1, 2).detach().cpu().numpy() label_tokens_start = label_tokens_start.view(-1).to('cpu').numpy() label_tokens_end = label_tokens_end.view(-1).to('cpu').numpy() label_sentence_domainslot = label_sentence_domainslot.to( 'cpu').numpy() scores_tokens_start.append(logits_tokens_start) scores_tokens_end.append(logits_tokens_end) 
scores_sentence_domainslot.append(logits_sentence_domainslot) gold_labels_tokens_start.append(label_tokens_start) gold_labels_tokens_end.append(label_tokens_end) gold_label_sentence_domainslot.append(label_sentence_domainslot) gold_labels_tokens_start = np.concatenate(gold_labels_tokens_start, 0) gold_labels_tokens_end = np.concatenate(gold_labels_tokens_end, 0) gold_label_sentence_domainslot = np.concatenate( gold_label_sentence_domainslot, 0) scores_tokens_start = np.concatenate(scores_tokens_start, 0) scores_tokens_end = np.concatenate(scores_tokens_end, 0) scores_sentence_domainslot = np.concatenate(scores_sentence_domainslot, 0) # 计算评价指标 # print(scores_tokens_start.shape) # print(scores_tokens_end.shape) # print(scores_sentence_domainslot.shape) # print(gold_labels_tokens_start.shape) assert scores_tokens_start.shape[0] == scores_tokens_end.shape[ 0] == gold_labels_tokens_start.shape[ 0] == gold_labels_tokens_end.shape[0] # eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain,mode='domain',report=True) # eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy,mode='dependcy',report=True) test_F1_tokenstart, test_F1_tokenend, F1_sentence_domainslot, F1_token_domainslot = compute_jointGoal_domainslot( dialogueID, text_histtokens, scores_tokens_start, scores_tokens_end, scores_sentence_domainslot, gold_labels_tokens_start, gold_labels_tokens_end, gold_label_sentence_domainslot, ) print('F1_token_domainslot', F1_token_domainslot, 'F1_sentence_domainslot', F1_sentence_domainslot, 'eval_F1_tokenstart', test_F1_tokenstart, 'eval_F1_tokenend', test_F1_tokenend)
def run(): args = parser.parse_args() nlayer = args.nlayer bidirection = args.bidirection file_path = args.file_path #'/content/drive/My Drive/Master_Final_Project/Genetic_attack/Code/nlp_adversarial_example_master_pytorch/glove.840B.300d.txt'#'/lustre/scratch/scratch/ucabdc3/lstm_attack' save_path = os.path.join(file_path, 'results') MAX_VOCAB_SIZE = 50000 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # with open(os.path.join(file_path, 'dataset_%d.pkl' %MAX_VOCAB_SIZE), 'rb') as f: # dataset = pickle.load(f) with open('aux_files/dataset_%d.pkl' % MAX_VOCAB_SIZE, 'rb') as f: dataset = pickle.load(f) # skip_list = np.load('aux_files/missed_embeddings_counter_%d.npy' %MAX_VOCAB_SIZE) embedding_matrix = np.load('aux_files/embeddings_glove_%d.npy' % (MAX_VOCAB_SIZE)) embedding_matrix = torch.tensor(embedding_matrix.T).to(device) dist = np.load(('aux_files/dist_counter_%d.npy' % (MAX_VOCAB_SIZE))) # goog_lm = LM() # pytorch max_len = args.max_len # padded_train_raw = pad_sequences(dataset.train_seqs2, maxlen = max_len, padding = 'post') padded_test_raw = pad_sequences(dataset.test_seqs2, maxlen=max_len, padding='post') # # TrainSet # data_set = Data_infor(padded_train_raw, dataset.train_y) # num_train = len(data_set) # indx = list(range(num_train)) # train_set = Subset(data_set, indx) # TestSet batch_size = 1 SAMPLE_SIZE = args.sample_size data_set = Data_infor(padded_test_raw, dataset.test_y) num_test = len(data_set) indx = list(range(num_test)) all_test_set = Subset(data_set, indx) #indx = random.sample(indx, SAMPLE_SIZE) with open('attack_results_final_300.pkl', 'rb') as f: results = pickle.load(f) seqs = [] lens = [] tgts = [] for i in range(len(results[1])): if np.array(results[1][i]).shape == (): continue seqs.append(results[1][i]) lens.append(results[2][i]) tgts.append(results[3][i]) seqs = torch.tensor(seqs) lens = torch.tensor(lens) tgts = torch.tensor(tgts) test_set = TensorDataset(seqs, lens, tgts) all_test_loader = DataLoader(test_set, batch_size=128, shuffle=True) lstm_size = 128 rnn_state_save = os.path.join(file_path, 'best_lstm_0.7_0.001_300') model = SentimentAnalysis(batch_size=lstm_size, embedding_matrix=embedding_matrix, hidden_size=lstm_size, kept_prob=0.7, num_layers=nlayer, bidirection=bidirection) model.load_state_dict(torch.load(rnn_state_save)) model = model.to(device) model.eval() test_pred = torch.tensor([]) test_targets = torch.tensor([]) with torch.no_grad(): for batch_index, (seqs, length, target) in enumerate(all_test_loader): seqs, target, length = seqs.to(device), target.to( device), length.to(device) seqs = seqs.type(torch.LongTensor) len_order = torch.argsort(length, descending=True) length = length[len_order] seqs = seqs[len_order] target = target[len_order] output, pred_out = model.pred(seqs, length, False) test_pred = torch.cat((test_pred, pred_out.cpu()), dim=0) test_targets = torch.cat( (test_targets, target.type(torch.float).cpu())) accuracy = model.evaluate_accuracy(test_pred.numpy(), test_targets.numpy()) print('Test Accuracy:{:.4f}.'.format(accuracy))
def main(): class DictAttr(dict): def __getattr__(self, key): if key not in self: raise AttributeError(key) return self[key] def __setattr__(self, key, value): self[key] = value def __delattr__(self, key): del self[key] args = DictAttr() args.model_name = 'openai-gpt' args.train_dataset = "data_in/ROCStories/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" args.eval_dataset = "data_in/ROCStories/cloze_test_test__spring2016 - cloze_test_ALL_test.csv" args.train_batch_size = 8 # parser = argparse.ArgumentParser() # parser.add_argument('--model_name', type=str, default='openai-gpt', # help='pretrained model name') parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--train_dataset', type=str, default='') parser.add_argument('--eval_dataset', type=str, default='') parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=16) parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training \ steps to perform. Override num_train_epochs.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before\ performing a backward/update pass.") parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--n_valid', type=int, default=374) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset special_tokens = ['_start_', '_delimiter_', '_classify_'] tokenizer = OpenAIGPTTokenizer.from_pretrained( args.model_name, 
special_tokens=special_tokens) special_tokens_ids = list( tokenizer.convert_tokens_to_ids(token) for token in special_tokens) model = OpenAIGPTDoubleHeadsModel.from_pretrained( args.model_name, num_special_tokens=len(special_tokens)) model.to(device) # Load and encode the datasets if not args.train_dataset and not args.eval_dataset: roc_stories = cached_path(ROCSTORIES_URL) def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) elif isinstance(obj, int): return obj return list(tokenize_and_encode(o) for o in obj) logger.info("Encoding dataset...") train_dataset = load_rocstories_dataset(args.train_dataset) #("Rick grew up in a troubled household. He never found good support in family, and turned to gangs. It wasn't long before Rick got shot in a robbery. The incident caused him to turn a new leaf.", 'He is happy now.', 'He joined a gang.', 0) eval_dataset = load_rocstories_dataset(args.eval_dataset) datasets = (train_dataset, eval_dataset) encoded_datasets = tokenize_and_encode(datasets) # Compute the max input length for the Transformer max_length = model.config.n_positions // 2 - 2 input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \ for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) input_length = min(input_length, model.config.n_positions ) # Max size of input for the pre-trained model # Prepare inputs tensors and dataloaders tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids) train_tensor_dataset, eval_tensor_dataset = tensor_datasets[ 0], tensor_datasets[1] train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare optimizer if args.do_train: if args.max_steps > 0: t_total = args.max_steps args.num_train_epochs = args.max_steps //\ (len(train_dataloader) // args.gradient_accumulation_steps) + 1 else: t_total = len(train_dataloader)\ // args.gradient_accumulation_steps * args.num_train_epochs param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch losses = model(input_ids, mc_token_ids, lm_labels, mc_labels) loss = args.lm_coef * losses[0] + losses[1] loss.backward() scheduler.step() optimizer.step() optimizer.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 
0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format( exp_average_loss, scheduler.get_lr()[0]) # Save a trained model if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir) tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir) model.to(device) if args.do_eval: model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch with torch.no_grad(): _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids, lm_labels, mc_labels) mc_logits = mc_logits.detach().cpu().numpy() mc_labels = mc_labels.to('cpu').numpy() tmp_eval_accuracy = accuracy(mc_logits, mc_labels) eval_loss += mc_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples train_loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'train_loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def setUp(self):
    super(TestCustomPinFn, self).setUp()
    inps = torch.arange(10 * 5, dtype=torch.float32).view(10, 5)
    tgts = torch.arange(10 * 5, dtype=torch.float32).view(10, 5)
    self.dataset = TensorDataset(inps, tgts)
def setUp(self):
    super(TestDataLoader, self).setUp()
    self.data = torch.randn(100, 2, 3, 5)
    self.labels = torch.randperm(50).repeat(2)
    self.dataset = TensorDataset(self.data, self.labels)
# for validation set
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels)

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels)

print(f'shape of train val test set: {train_y.shape}, {val_y.shape}, {test_y.shape}')

# # Create DataLoaders

# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

"""# # Freeze BERT Parameters
for param in bert.parameters():
    param.requires_grad = False"""
def get(logger=None, args=None): data = {} taskcla = [] t = 0 for dataset in datasets: data[t] = {} if 'Bing' in dataset: data[t]['name'] = dataset data[t]['ncla'] = 2 elif 'XuSemEval' in dataset: data[t]['name'] = dataset data[t]['ncla'] = 3 processor = data_utils.AscProcessor() label_list = processor.get_labels() tokenizer = Tokenizer() train_examples = processor.get_train_examples(dataset) train_features = data_utils.convert_examples_to_features_w2v( train_examples, label_list, args.max_term_length, args.max_sentence_length, tokenizer, word_index_pretrained, vocab_size) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) all_tokens_term_ids = torch.tensor( [f.tokens_term_ids for f in train_features], dtype=torch.long) all_tokens_sentence_ids = torch.tensor( [f.tokens_sentence_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) # print('all_tokens_term_ids: ',all_tokens_term_ids) train_data = TensorDataset(all_tokens_term_ids, all_tokens_sentence_ids, all_label_ids) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) data[t]['train'] = train_dataloader valid_examples = processor.get_dev_examples(dataset) valid_features=data_utils.convert_examples_to_features_w2v\ (valid_examples, label_list, args.max_term_length, args.max_sentence_length, tokenizer, word_index_pretrained, vocab_size) valid_all_tokens_term_ids = torch.tensor( [f.tokens_term_ids for f in valid_features], dtype=torch.long) valid_all_tokens_sentence_ids = torch.tensor( [f.tokens_sentence_ids for f in valid_features], dtype=torch.long) valid_all_label_ids = torch.tensor( [f.label_id for f in valid_features], dtype=torch.long) valid_data = TensorDataset(valid_all_tokens_term_ids, valid_all_tokens_sentence_ids, valid_all_label_ids) logger.info("***** Running validations *****") logger.info(" Num orig examples = %d", len(valid_examples)) logger.info(" Num split examples = %d", len(valid_features)) logger.info(" Batch size = %d", args.train_batch_size) valid_sampler = SequentialSampler(valid_data) valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=args.train_batch_size) data[t]['valid'] = valid_dataloader processor = data_utils.AscProcessor() label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model) eval_examples = processor.get_test_examples(dataset) eval_features = \ data_utils.convert_examples_to_features_w2v\ (eval_examples, label_list, args.max_term_length, args.max_sentence_length, tokenizer, word_index_pretrained, vocab_size) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_tokens_term_ids = torch.tensor( [f.tokens_term_ids for f in eval_features], dtype=torch.long) all_tokens_sentence_ids = torch.tensor( [f.tokens_sentence_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_tokens_term_ids, all_tokens_sentence_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) data[t]['test'] = eval_dataloader t += 1 # Others f_name = 'asc_random' data_asc = {} 
with open(f_name, 'r') as f_random_seq: random_sep = f_random_seq.readlines()[args.idrandom].split() print('random_sep: ', random_sep) print('domains: ', domains) print('random_sep: ', len(random_sep)) print('domains: ', len(domains)) for task_id in range(args.ntasks): # print('task_id: ',task_id) asc_id = domains.index(random_sep[task_id]) data_asc[task_id] = data[asc_id] taskcla.append((task_id, int(data[asc_id]['ncla']))) # Others n = 0 for t in data.keys(): n += data[t]['ncla'] data['ncla'] = n return data_asc, taskcla, vocab_size, embeddings
              activation, w_init=w_init)
    torch.save(vae, 'pretrained_vae_n.pkl')
else:
    if layer_wised == 0:
        vae = VAE(encoder_sizes, image_train[:, :, 0], activation)
        torch.save(vae, 'layerwisetrained_vae_n.pkl')
    else:
        vae = torch.load('layerwisetrained_vae_n.pkl')

    vae = vae.cuda()
    x_mean, _ = vae.get_latent(torch.from_numpy(image_train[:, :, 0]).cuda())
    print("| Latent range: {}/{}".format(x_mean.min(), x_mean.max()))

    dataloader = DataLoader(TensorDataset(torch.from_numpy(image_train[:, :, 0])),
                            batch_size=BATCH_SIZE,
                            shuffle=True)

    # optimizer = optim.Adam(vae.get_para(), lr=0.0001, weight_decay=0.0001)
    # optimizer = optim.Adam(vae.get_para(), lr=0.002)
    optimizer = optim.SGD(vae.get_para(), lr=0.0001, momentum=0.9)
    lr_scheduler = StepLR(optimizer, step_size=1000, gamma=0.5)

    print("2.1 pretrain the VAE model")
    vae = pretrain(vae, optimizer, lr_scheduler, dataloader, epoch_num=100)
    torch.save(vae, 'pretrained_vae_n.pkl')

if resume:
    print("|Load pretrained model: {}".format(resume))
input_ids_dev, attention_masks_dev = encode_data(tokenizer, questions_dev,
                                                 passages_dev, max_seq_length)

train_features = (input_ids_train, attention_masks_train, answers_train)
dev_features = (input_ids_dev, attention_masks_dev, answers_dev)

batch_size = 32

train_features_tensors = [
    torch.tensor(feature, dtype=torch.long) for feature in train_features
]
dev_features_tensors = [
    torch.tensor(feature, dtype=torch.long) for feature in dev_features
]

train_dataset = TensorDataset(*train_features_tensors)
dev_dataset = TensorDataset(*dev_features_tensors)

train_sampler = RandomSampler(train_dataset)
dev_sampler = SequentialSampler(dev_dataset)

train_dataloader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=batch_size)
dev_dataloader = DataLoader(dev_dataset,
                            sampler=dev_sampler,
                            batch_size=batch_size)

########################################################
######################## this should be 5 ##############
########################################################
def create_batch_iter(mode):
    """Build the data iterator for the given mode."""
    processor, tokenizer = init_params()

    if mode == "train":
        examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)
        batch_size = args.train_batch_size
        logger.info("  Num steps = %d", num_train_steps)
    elif mode == "dev":
        examples = processor.get_dev_examples(args.data_dir)
        batch_size = args.eval_batch_size
    else:
        raise ValueError("Invalid mode %s" % mode)

    label_list = processor.get_labels()

    # Convert examples to features
    features = convert_examples_to_features(examples, label_list,
                                            args.max_seq_length, tokenizer)

    logger.info("  Num examples = %d", len(examples))
    logger.info("  Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
    all_output_mask = torch.tensor([f.output_mask for f in features], dtype=torch.long)

    # Build the dataset
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids, all_output_mask)

    if mode == "train":
        sampler = RandomSampler(data)
    elif mode == "dev":
        sampler = SequentialSampler(data)
    else:
        raise ValueError("Invalid mode %s" % mode)

    # Build the iterator
    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)

    if mode == "train":
        torch.save((iterator, num_train_steps), args.TRAIN_CACHE)
        return iterator, num_train_steps
    elif mode == "dev":
        torch.save(iterator, args.VALID_CACHE)
        return iterator
    else:
        raise ValueError("Invalid mode %s" % mode)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--iterations_per_loop", default=1000, type=int, help="How many steps to make in each estimator call.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. " "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( '--optimize_on_cpu', default=False, action='store_true', help= "Whether to perform optimization and keep the optimizer averages on CPU" ) parser.add_argument( '--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=128, help= 'Loss scaling, positive power of 2 values can improve fp16 convergence.' ) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') if args.fp16: logger.info( "16-bits training currently not supported in distributed training" ) args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496) logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits trainiing: {}" .format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model) train_examples = None num_train_steps = None if args.do_train: train_examples = read_squad_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare model model = BertForQuestionAnswering.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.fp16: param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \ for n, param in model.named_parameters()] elif args.optimize_on_cpu: param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \ for n, param in model.named_parameters()] else: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if n in no_decay], 'weight_decay_rate': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_steps) global_step = 0 if args.do_train: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_start_positions = torch.tensor( [f.start_position for f in train_features], dtype=torch.long) all_end_positions = torch.tensor( [f.end_position for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if n_gpu == 1: batch = tuple( t.to(device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16 or args.optimize_on_cpu: if args.fp16 and args.loss_scale != 1.0: # scale down gradients for fp16 training for param in model.parameters(): param.grad.data = param.grad.data / args.loss_scale is_nan = set_optimizer_params_grad( param_optimizer, model.named_parameters(), test_nan=True) if is_nan: logger.info( "FP16 TRAINING: Nan in gradients, reducing loss scaling" ) args.loss_scale = args.loss_scale / 2 model.zero_grad() continue optimizer.step() copy_optimizer_params_to_model( model.named_parameters(), param_optimizer) else: optimizer.step() model.zero_grad() global_step += 1 if args.do_predict: eval_examples = read_squad_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") for input_ids, input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_start_logits, batch_end_logits = model( input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): start_logits = batch_start_logits[i].detach().cpu().tolist() end_logits = batch_end_logits[i].detach().cpu().tolist() eval_feature = eval_features[example_index.item()] unique_id = int(eval_feature.unique_id) all_results.append( RawResult(unique_id=unique_id, start_logits=start_logits, end_logits=end_logits)) output_prediction_file = os.path.join(args.output_dir, "predictions.json") output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json") write_predictions(eval_examples, eval_features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, args.verbose_logging)
def trainingBert(data, bert_model):
    from transformers import BertTokenizer, BertForNextSentencePrediction
    import torch

    model = BertForNextSentencePrediction.from_pretrained(bert_model, return_dict=True)
    tokenizer = BertTokenizer.from_pretrained(bert_model)

    sentence1 = data[0]
    sentence2 = data[1]
    max_len = 1500

    input_ids = []
    attention_masks = []
    labels = []

    for x in range(len(data[0])):
        s1 = data[0][x]
        s2 = data[1][x]
        encoded_dict = tokenizer.encode_plus(
            s1,
            text_pair=s2,
            add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
            max_length=max_len,           # Pad & truncate all sentences.
            truncation=True,
            pad_to_max_length=True,
            return_attention_mask=True,   # Construct attn. masks.
            return_tensors='pt',          # Return pytorch tensors.
        )
        encoded_dict_reverse = tokenizer.encode_plus(
            s2,
            text_pair=s1,
            add_special_tokens=True,      # Add '[CLS]' and '[SEP]'
            max_length=max_len,           # Pad & truncate all sentences.
            truncation=True,
            pad_to_max_length=True,
            return_attention_mask=True,   # Construct attn. masks.
            return_tensors='pt',          # Return pytorch tensors.
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        labels.append(0)

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    from torch.utils.data import TensorDataset, random_split

    dataset = TensorDataset(input_ids, attention_masks, labels)

    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    print('{:>5,} training samples'.format(train_size))
    print('{:>5,} validation samples'.format(val_size))

    from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

    batch_size = 16

    train_dataloader = DataLoader(
        train_dataset,          # The training samples.
        batch_size=batch_size   # Trains with this batch size.
    )
    validation_dataloader = DataLoader(
        val_dataset,            # The validation samples.
        batch_size=batch_size   # Evaluate with this batch size.
    )

    from transformers.optimization import AdamW
    optimizer = AdamW(model.parameters(),
                      lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
                      eps=1e-8   # args.adam_epsilon - default is 1e-8.
                      )

    from transformers import get_linear_schedule_with_warmup

    epochs = 4
    total_steps = len(train_dataloader) * epochs
    # The scheduler needs the warmup and total step counts.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    training_stats = []
    total_t0 = time.time()  # start timer for the final training-time summary

    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0]
            b_input_mask = batch[1]
            b_labels = batch[2]

            model.zero_grad()
            res1 = model(b_input_ids,
                         token_type_ids=None,
                         attention_mask=b_input_mask,
                         next_sentence_label=b_labels)
            loss = res1[0]
            logits = res1[1]
            total_train_loss += loss.item()

            # Backward pass and parameter update.
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)

        import os
        model_dir = str(epoch_i) + '/'
        output_dir = PROJECT_ROOT + 'model_save/' + model_dir
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        print("Saving model to %s" % output_dir)
        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

        print("")
        print("Running Validation...")

        # Tracking variables
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        model.eval()

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            b_input_ids = batch[0]
            b_input_mask = batch[1]
            b_labels = batch[2]

            with torch.no_grad():
                res2 = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             next_sentence_label=b_labels)
            loss = res2[0]
            logits = res2[1]
            total_eval_loss += loss.item()

            logits = logits.numpy()
            label_ids = b_labels.numpy()
            accs = calc_acc(logits, label_ids)
            total_eval_accuracy += accs

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        avg_val_loss = total_eval_loss / len(validation_dataloader)
        print("  Validation Loss: {0:.2f}".format(avg_val_loss))

    print("")
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))

    return model
def predict(epoch=None):
    test_examples = processor.get_test_examples()
    test_features = convert_examples_to_features(test_examples, label_list,
                                                 args.max_seq_length, tokenizer)

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(test_examples))
    logger.info("  Batch size = %d", args.eval_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
    all_doc_ids = torch.tensor([f.guid for f in test_features], dtype=torch.long)

    test_data = TensorDataset(all_input_ids, all_input_mask,
                              all_segment_ids, all_doc_ids)

    # Run prediction for full data
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                 batch_size=args.eval_batch_size)

    model.eval()
    eval_loss = 0
    nb_eval_steps = 0
    preds = []
    ids = []

    # FIXME: make it flexible to accept path
    all_ids_test = read_ids(os.path.join(args.data_dir, "ids_testing.txt"))

    for input_ids, input_mask, segment_ids, doc_ids in \
            tqdm(test_dataloader, desc="Evaluating"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        doc_ids = doc_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask, labels=None)

        nb_eval_steps += 1
        if len(preds) == 0:
            preds.append(logits.detach().cpu().numpy())
        else:
            preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
        if len(ids) == 0:
            ids.append(doc_ids.detach().cpu().numpy())
        else:
            ids[0] = np.append(ids[0], doc_ids.detach().cpu().numpy(), axis=0)

    ids = ids[0]
    preds = sigmoid(preds[0])
    preds = (preds > 0.5).astype(int)

    id2preds = {val: preds[i] for i, val in enumerate(ids)}
    for i, val in enumerate(all_ids_test):
        if val not in id2preds:
            id2preds[val] = []

    with open(os.path.join(args.data_dir, f"mlb_{args.corpus_type}.pkl"), "rb") as rf:
        mlb = pkl.load(rf)

    preds = [mlb.classes_[preds[i, :].astype(bool)].tolist()
             for i in range(preds.shape[0])]
    id2preds = {val: preds[i] for i, val in enumerate(ids)}
    preds = [id2preds[val] if val in id2preds else []
             for i, val in enumerate(all_ids_test)]

    with open(os.path.join(args.output_dir, f"preds_test{epoch}.txt"), "w") as wf:
        for idx, doc_id in enumerate(all_ids_test):
            line = str(doc_id) + "\t" + "|".join(preds[idx]) + "\n"
            wf.write(line)
                                              max_seq_length, tokenizer)

    dev_examples = processor.get_dev_examples(
        "/home/wangwei/pt_workdir/bert_ner_task/data")
    # Build dev features from the dev examples (the original built them from the
    # training examples, which looks like a copy-paste slip).
    dev_features = convert_examples_to_features(dev_examples, label_list,
                                                max_seq_length, tokenizer)
    dev_input_ids = torch.tensor([f.input_ids for f in dev_features], dtype=torch.long)
    dev_input_mask = torch.tensor([f.input_mask for f in dev_features], dtype=torch.long)
    dev_segment_ids = torch.tensor([f.segment_ids for f in dev_features], dtype=torch.long)
    dev_label_ids = torch.tensor([f.label_id for f in dev_features], dtype=torch.long)
    dev_data = TensorDataset(dev_input_ids, dev_input_mask,
                             dev_segment_ids, dev_label_ids)
    dev_loader = DataLoader(dev_data,
                            sampler=RandomSampler(dev_data),
                            batch_size=train_batch_size)

    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", train_batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
def data_reader(self, data_filepath, label_filepath, jitter_filepath, train, type, should_batch=True, shuffle=True, infer=False): if infer: pass else: input_data, labels, jitter = read_npy(data_filepath), read_npy( label_filepath), read_npy(jitter_filepath) if train: self.logger.info(f'Original data size - before Augmentation') self.logger.info(f'Total data {str(len(input_data))}') self.logger.info( f'Event rate {str(sum(labels) / len(labels))}') self.logger.info( f'Input data shape:{np.array(input_data).shape} | Output data shape:{np.array(labels).shape}' ) for x in input_data: self._min = min(np.min(x), self._min) self._max = max(np.max(x), self._max) self._mean, self._std = np.mean(input_data), np.std(input_data) self._jmean, self._jstd = np.mean(jitter), np.std(jitter) self._jmin, self._jmax = np.min(jitter), np.max(jitter) if self.data_augment: self.logger.info(f'Data Augmentation starts . . .') label_to_augment = 1 amount_to_augment = 1.3 ones_ids = [ idx for idx, x in enumerate(labels) if x == label_to_augment ] random_idxs = random.choices( ones_ids, k=int(len(ones_ids) * amount_to_augment)) data_to_augment = input_data[random_idxs] augmented_data, jitter_augmented_data = [], [] augmented_labels = [] for x in data_to_augment: x = librosaSpectro_to_torchTensor(x) x = random.choice([time_mask, freq_mask])(x)[0].numpy() augmented_data.append(x), augmented_labels.append( label_to_augment) # Jitter and shimmer # jitter_augmented_data, jitter_labels = BorderlineSMOTE().fit_resample(X=jitter, y=labels) # # assert np.mean(jitter_labels[len(jitter):][ # :len(augmented_data)]) == 1, 'Issue with Jitter Shimmer Augmentation' # # jitter = np.concatenate((jitter, jitter_augmented_data[len(jitter):][:len(augmented_data)])) input_data = np.concatenate((input_data, augmented_data)) labels = np.concatenate((labels, augmented_labels)) # Temp fix # input_data = input_data[:len(jitter)] # labels = labels[:len(jitter)] # assert len(jitter) == len( # input_data), "Input data and Jitter Shimmer augmentations don't match in length" self.logger.info(f'Data Augmentation done . . 
.') # data = [(x, y, z) for x, y, z in zip(input_data, labels, jitter)] # random.shuffle(data) # input_data, labels, jitter = np.array([x[0] for x in data]), [x[1] for x in data], np.array( # [x[2] for x in data]) data = [(x, y) for x, y in zip(input_data, labels)] random.shuffle(data) input_data, labels = np.array([x[0] for x in data ]), [x[1] for x in data] # Initialize pos_weight based on training data self.pos_weight = len([x for x in labels if x == 0]) / len( [x for x in labels if x == 1]) self.logger.info( f'Pos weight for the train data - {self.pos_weight}') self.logger.info(f'Total data {str(len(input_data))}') self.logger.info(f'Event rate {str(sum(labels) / len(labels))}') self.logger.info( f'Input data shape:{np.array(input_data).shape} | Output data shape:{np.array(labels).shape}' ) self.logger.info( f'Min max values used for normalisation {self._min, self._max}' ) self.logger.info( f'Min max values used for normalisation {self._min, self._max}' ) # Normalizing `input data` on train dataset's min and max values if self.normalise: input_data = (input_data - self._min) / (self._max - self._min) input_data = (input_data - self._mean) / self._std # jitter = (jitter - self._jmin) / (self._jmax - self._jmin) # jitter = (jitter - self._jmean) / self._jstd self.dataset_sizes[type] = len(input_data) return DataLoader( TensorDataset( torch.Tensor(input_data).unsqueeze(1).repeat(1, 3, 1, 1), torch.Tensor(labels)), batch_size=self.batch_size # ,sampler=torch.utils.data.SubsetRandomSampler(list([x for x in range(10)])) )
def load_examples(args, tokenizer, mode):
    processor = processors[args.task](args)

    # Load data features from dataset file
    # NOTE: Get image features
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(args.data_dir,
                                        'cached_{}_{}'.format(args.task, mode))
    cached_img_features_file = os.path.join(
        args.data_dir, 'cached_img_{}_{}'.format(args.task, mode))

    if os.path.exists(cached_features_file) and os.path.exists(cached_img_features_file):
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
        logger.info("Loading img features from cached file %s", cached_img_features_file)
        all_img_features = torch.load(cached_img_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        img_feature_file = h5py.File(os.path.join(args.data_dir, args.h5_filename), 'r')
        if mode == "train":
            examples = processor.get_examples("train")
            img_ids = get_image_nums(args, args.train_file)
            all_img_features = load_vgg_features(img_feature_file, img_ids)
        elif mode == "dev":
            examples = processor.get_examples("dev")
            img_ids = get_image_nums(args, args.dev_file)
            all_img_features = load_vgg_features(img_feature_file, img_ids)
        elif mode == "test":
            examples = processor.get_examples("test")
            img_ids = get_image_nums(args, args.test_file)
            all_img_features = load_vgg_features(img_feature_file, img_ids)
        else:
            raise Exception("For mode, Only train, dev, test is available")

        label_len = len(get_label(args))
        features = convert_examples_to_features(examples, args.max_seq_len,
                                                tokenizer, label_len)
        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)
        torch.save(all_img_features, cached_img_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)

    print(all_input_ids.size())
    print(all_attention_mask.size())
    print(all_token_type_ids.size())
    print(all_input_ids.size())
    print(all_img_features.size())

    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids,
                            all_label_ids, all_img_features)
    return dataset
def test_single_tensor(self):
    t = torch.randn(5, 10)
    source = TensorDataset(t)
    self.assertEqual(len(source), 5)
    for i in range(5):
        self.assertEqual(t[i], source[i][0])
def init_model():
    bert_tokenizer = "/Users/quantum/Downloads/bert-base-chinese/bert_chinese_vocab.txt"
    train_model = "/Users/quantum/Downloads/2019217/pytorch_model_epoch_0.bin"
    bert_model = "/Users/quantum/Downloads/bert-base-chinese/bert-base-chinese.tar.gz"
    data_dir = "/Users/quantum/Downloads/bert-base-chinese/"
    max_seq_length = 256
    do_lower_case = False

    processors = {"wnli": WNLIProcessor}
    task_name = "wnli"
    processor = processors[task_name]()
    label_list = processor.get_labels()
    tokenizer = BertTokenizer.from_pretrained(bert_tokenizer, do_lower_case=do_lower_case)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(train_model, map_location='cpu')
    # print(model_state_dict)
    model = BertForSequenceClassification.from_pretrained(bert_model,
                                                          state_dict=model_state_dict)
    model.to(device)

    eval_examples = processor.get_test_examples(data_dir)
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 max_seq_length, tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))

    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask,
                              all_segment_ids, all_label_ids)

    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=10)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples

    result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy}

    logger.info("***** Test results *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))

    return tokenizer, model
def test_len(self):
    source = TensorDataset(torch.randn(15, 10, 2, 3, 4, 5),
                           torch.randperm(15))
    self.assertEqual(len(source), 15)
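# The two unit tests above exercise TensorDataset's basic contract: every tensor
# must share the same first dimension, len() reports that dimension, and indexing
# returns one tuple element per tensor. A minimal standalone sketch (added here
# for illustration, not part of the test suite):
import torch
from torch.utils.data import TensorDataset

features = torch.randn(8, 3)
labels = torch.arange(8)
dataset = TensorDataset(features, labels)

assert len(dataset) == 8            # the shared first dimension of every tensor
x0, y0 = dataset[0]                 # indexing yields a (feature, label) tuple
assert torch.equal(x0, features[0]) and y0.item() == 0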
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes the
        # dataset; the others will use the cache.
        torch.distributed.barrier()

    processor = processors[task](language=args.language,
                                 train_language=args.train_language)
    output_mode = output_modes[task]

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}_{}".format(
            "test" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
            str(args.train_language if (
                not evaluate and args.train_language is not None
            ) else args.language),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_test_examples(args.data_dir)
                    if evaluate else processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training processes the
        # dataset; the others will use the cache.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    else:
        raise ValueError("No other `output_mode` for XNLI.")

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    return dataset
class NN(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(NN, self).__init__()
        # Define the fully connected layer
        self.layer1 = torch.nn.Linear(input_size, output_size, bias=False)
        # Initialize the weights with normally distributed random values
        torch.nn.init.normal_(self.layer1.weight, 0.0, 1.0)

    def forward(self, input):
        activation = torch.nn.Softmax(dim=-1)
        output = activation(self.layer1(input))
        return output


model = NN(300, 4)

# Build the (X, y) pairs
data_train = TensorDataset(X_train, y_train)
# Create the DataLoader
dataloader = DataLoader(data_train, batch_size=1, shuffle=True)

creterion = torch.nn.CrossEntropyLoss()
# Define the optimization algorithm
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)


def accuracy(probs, y):
    cnt = 0
    for i, prob in enumerate(probs):
        # Convert the tensor to an ndarray and take the index of the largest element
        y_pred = np.argmax(prob.detach().numpy())
        if y_pred == y.detach().numpy()[i]:
            cnt += 1
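# Editorial note with a sketch (added, not from the original notebook):
# torch.nn.CrossEntropyLoss applies log-softmax internally, so feeding it the
# softmax output of forward() above effectively applies softmax twice. A common
# alternative is to return raw logits and take softmax only when probabilities
# are needed; the class below is a hypothetical variant for illustration.
import torch

class LogitNN(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.layer1 = torch.nn.Linear(input_size, output_size, bias=False)

    def forward(self, x):
        return self.layer1(x)                    # raw logits, no softmax here

logit_model = LogitNN(300, 4)
ce = torch.nn.CrossEntropyLoss()                 # softmax happens inside the loss
x, y = torch.randn(2, 300), torch.tensor([0, 3])
loss = ce(logit_model(x), y)
probs = torch.softmax(logit_model(x), dim=-1)    # probabilities only for reporting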
te_masks = [[float(i > 0) for i in ii] for ii in te_inputs]

# convert to tensor!
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
te_inputs = torch.tensor(te_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
te_tags = torch.tensor(te_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
te_masks = torch.tensor(te_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

test_data = TensorDataset(te_inputs, te_masks, te_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=bs)

# config = DistilBertConfig.from_pretrained("distillbert_ner_c_model_save")
# model = DistillBertTagger(config = config)
model = DistilBertForTokenClassification.from_pretrained(
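# Small sketch of the masking convention used above (added for illustration):
# any non-zero token id is treated as a real token (mask 1.0) and the 0 pad id
# as padding (mask 0.0). The ids and tags below are made up just to keep this
# snippet self-contained and runnable.
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset

padded_ids = torch.tensor([[101, 7592, 102,   0,   0],
                           [101, 2088, 999, 102,   0]])
attention_mask = (padded_ids > 0).float()    # same rule as the list comprehension above
tags = torch.tensor([[1, 2, 0, 0, 0],
                     [1, 3, 3, 2, 0]])

demo_data = TensorDataset(padded_ids, attention_mask, tags)
demo_loader = DataLoader(demo_data, sampler=RandomSampler(demo_data), batch_size=2)
batch_ids, batch_mask, batch_tags = next(iter(demo_loader))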
X_valid = joblib.load('ch08/X_valid.joblib')
y_valid = joblib.load('ch08/y_valid.joblib')
X_valid = torch.from_numpy(X_valid.astype(np.float32)).clone()
y_valid = torch.from_numpy(y_valid.astype(np.int64)).clone()

X_test = joblib.load('ch08/X_test.joblib')
y_test = joblib.load('ch08/y_test.joblib')
X_test = torch.from_numpy(X_test.astype(np.float32)).clone()
y_test = torch.from_numpy(y_test.astype(np.int64)).clone()

X = X_train
y = y_train
X = X.to('cuda:0')
y = y.to('cuda:0')

ds = TensorDataset(X, y)

net = nn.Linear(X.size()[1], 4)
net = net.to('cuda:0')
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

batchSize = [1, 2, 4, 8]
for bs in batchSize:
    loader = DataLoader(ds, batch_size=bs, shuffle=True)
    train_losses = []
    valid_losses = []
    train_accs = []
    valid_accs = []
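# A self-contained sketch (added; the original loop body is not shown in this
# excerpt) of what one pass of the batch-size sweep above typically looks like:
# train for an epoch at each batch size and record the running loss. All names
# prefixed with sweep_ are stand-ins, not the variables defined above.
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

Xs = torch.randn(32, 300)                 # stand-in features
ys = torch.randint(0, 4, (32,))           # stand-in labels
sweep_ds = TensorDataset(Xs, ys)
sweep_net = nn.Linear(300, 4)
sweep_loss_fn = nn.CrossEntropyLoss()
sweep_opt = optim.SGD(sweep_net.parameters(), lr=0.01)

for sweep_bs in [1, 2, 4, 8]:
    sweep_loader = DataLoader(sweep_ds, batch_size=sweep_bs, shuffle=True)
    running = 0.0
    for xb, yb in sweep_loader:
        sweep_opt.zero_grad()
        batch_loss = sweep_loss_fn(sweep_net(xb), yb)
        batch_loss.backward()
        sweep_opt.step()
        running += batch_loss.item()
    print(sweep_bs, running / len(sweep_loader))   # mean training loss per batch size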
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--corpus_type", default="mixed", type=str, required=True, help="Corpus type, mixed or categories") parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain.pkl files, " "named: train_data.pkl, dev_data.pkl, " "test_data.pkl and mlb.pkl (e.g. as in " "`exps-data/data`).") parser.add_argument("--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: " "bert-base-german-cased, bert-base-uncased, " "bert-large-uncased, bert-base-cased, " "bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, " "bert-base-chinese.") parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train.") parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model " "predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--use_data", default="orig", type=str, help="Original DE, tokenized DE or tokenized EN.") parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained " "models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after " "WordPiece tokenization. \n" "Sequences longer than this will be truncated, " "and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if using an uncased model.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--loss_fct", default="bbce", type=str, help="Loss function to use BCEWithLogitsLoss (`bbce`)") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear " "learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before " "performing a backward/update pass.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead " "of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
" "Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/ # debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "nts": NTSTaskProcessor } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of # sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - ' '%(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, " "16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // \ args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must " "be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \ and args.do_train: raise ValueError("Output directory ({}) already exists and is not " "empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name](args.data_dir, args.corpus_type, use_data=args.use_data) pos_weight = torch.tensor(processor.pos_weight, requires_grad=False, dtype=torch.float, device=device) label_list = processor.get_labels() num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples() num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps ) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // \ torch.distributed.get_world_size() # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) model = BertForMultiLabelSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels, loss_fct=args.loss_fct ) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: 
raise ImportError("Please install apex from https:/" "/www.github.com/nvidia/apex to use distributed " "and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any( nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any( nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.do_train: if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError("Please install apex from " "https://www.github.com/nvidia/apex to use " "distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) warmup_linear = WarmupLinearSchedule( warmup=args.warmup_proportion, t_total=num_train_optimization_steps) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, schedule='warmup_cosine') global_step = 0 nb_tr_steps = 0 tr_loss = 0 def eval(epoch=None): eval_examples = processor.get_dev_examples() eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_doc_ids = torch.tensor([f.guid for f in eval_features], dtype=torch.long) # output_mode == "classification": all_label_ids = torch.tensor([f.label_ids for f in eval_features], dtype=torch.float) all_label_ids = all_label_ids.view(-1, num_labels) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids, all_doc_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] ids = [] # FIXME: make it flexible to accept path all_ids_dev = read_ids(os.path.join(args.data_dir, "ids_development.txt")) for input_ids, input_mask, segment_ids, label_ids, doc_ids in \ tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) doc_ids = doc_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) # create eval loss and other metric required by the task # output_mode == "classification": loss_fct = BCEWithLogitsLoss() tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1, num_labels)) eval_loss += tmp_eval_loss.mean().item() nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) if len(ids) == 0: 
ids.append(doc_ids.detach().cpu().numpy()) else: ids[0] = np.append( ids[0], doc_ids.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps ids = ids[0] preds = sigmoid(preds[0]) preds = (preds > 0.5).astype(int) result = compute_metrics(task_name, preds, all_label_ids.numpy()) #result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss/nb_tr_steps if args.do_train else None result['train_loss'] = loss result['eval_loss'] = eval_loss result['global_step'] = global_step output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key]))) writer.write('\n') with open(os.path.join(args.data_dir, f"mlb_{args.corpus_type}.pkl"), "rb") as rf: mlb = pkl.load(rf) preds = [mlb.classes_[preds[i, :].astype(bool)].tolist() for i in range(preds.shape[0])] id2preds = {val:preds[i] for i, val in enumerate(ids)} preds = [id2preds[val] if val in id2preds else [] for i, val in enumerate(all_ids_dev)] with open(os.path.join(args.output_dir, f"preds_development" f"{epoch}.txt"), "w") as wf: for idx, doc_id in enumerate(all_ids_dev): line = str(doc_id) + "\t" + "|".join(preds[idx]) + "\n" wf.write(line) def predict(epoch=None): test_examples = processor.get_test_examples() test_features = convert_examples_to_features( test_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(test_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long) all_doc_ids = torch.tensor([f.guid for f in test_features], dtype=torch.long) test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_doc_ids) # Run prediction for full data test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = [] ids = [] # FIXME: make it flexible to accept path all_ids_test = read_ids(os.path.join(args.data_dir, "ids_testing.txt")) for input_ids, input_mask, segment_ids, doc_ids in \ tqdm(test_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) doc_ids = doc_ids.to(device) with torch.no_grad(): logits = model(input_ids, segment_ids, input_mask, labels=None) nb_eval_steps += 1 if len(preds) == 0: preds.append(logits.detach().cpu().numpy()) else: preds[0] = np.append( preds[0], logits.detach().cpu().numpy(), axis=0) if len(ids) == 0: ids.append(doc_ids.detach().cpu().numpy()) else: ids[0] = np.append( ids[0], doc_ids.detach().cpu().numpy(), axis=0) ids = ids[0] preds = sigmoid(preds[0]) preds = (preds > 0.5).astype(int) id2preds = {val:preds[i] for i, val in enumerate(ids)} for i, val in enumerate(all_ids_test): if val not in id2preds: id2preds[val] = [] with open(os.path.join(args.data_dir, f"mlb_{args.corpus_type}.pkl"), "rb") as rf: mlb = pkl.load(rf) preds = [mlb.classes_[preds[i, :].astype(bool)].tolist() for i in range(preds.shape[0])] id2preds = {val:preds[i] for i, val in enumerate(ids)} preds = [id2preds[val] if val 
in id2preds else [] for i, val in enumerate(all_ids_test)] with open(os.path.join(args.output_dir, f"preds_test{epoch}.txt"), "w") as\ wf: for idx, doc_id in enumerate(all_ids_test): line = str(doc_id) + "\t" + "|".join(preds[idx]) + "\n" wf.write(line) if args.do_train: train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) # output_mode == "classification": all_label_ids = torch.tensor([f.label_ids for f in train_features], dtype=torch.float) all_label_ids = all_label_ids.view(-1, num_labels) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch # define a new function to compute loss values for both # output_modes logits = model(input_ids, segment_ids, input_mask, labels=None) # if output_mode == "classification": loss_fct = BCEWithLogitsLoss() loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1, num_labels)) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles # this automatically lr_this_step = args.learning_rate * \ warmup_linear.get_lr( global_step/num_train_optimization_steps, args.warmup_proportion ) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 eval(epoch=epoch) predict(epoch=epoch) # save checkpoints # Save a trained model, configuration and tokenizer # model_to_save = model.module if hasattr(model, # 'module') else model # # If we save using the predefined names, we can load using # # `from_pretrained` # os.makedirs(f"{args.output_dir}/{epoch}") # output_model_file = os.path.join(f"{args.output_dir}/{epoch}", " # f"WEIGHTS_NAME) # output_config_file = os.path.join(f"{args.output_dir}/{epoch}", # CONFIG_NAME) # # torch.save(model_to_save.state_dict(), output_model_file) # model_to_save.config.to_json_file(output_config_file) # tokenizer.save_vocabulary(f"{args.output_dir}/{epoch}") # end save checkpoints if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # If we save using the predefined names, we can load using # `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = BertForMultiLabelSequenceClassification.from_pretrained( args.output_dir, num_labels=num_labels) tokenizer = BertTokenizer.from_pretrained( args.output_dir, do_lower_case=args.do_lower_case) else: model = BertForMultiLabelSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) model.to(device) if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval() predict()
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

"""# Preprocessing of data to feed to BERT"""

# Run `preprocessing_for_bert` on the prediction set
print('Tokenizing data...')
pre_inputs, pre_masks = preprocessing_for_bert(df['reviews.text'])

# Create the DataLoader for our prediction set
pre_dataset = TensorDataset(pre_inputs, pre_masks)
pre_sampler = SequentialSampler(pre_dataset)
pre_dataloader = DataLoader(pre_dataset, sampler=pre_sampler, batch_size=10)

"""# **Load the saved BERT trained Model**"""

model = torch.load("/content/drive/MyDrive/FYP Datasets/trained_model1.pth")

"""# **Testing**

## Get predictions
"""

# Compute predicted probabilities on the test set
probs = bert_predict(model, pre_dataloader)
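# `bert_predict` is assumed here to return one row of class probabilities per
# review; a hypothetical follow-up step (added for illustration, not part of the
# original notebook) turns those rows into hard label predictions.
import numpy as np

preds = np.argmax(probs, axis=1)   # index of the most probable class per example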
def create_dataloader(self):
    data = DATAMultiWOZ(
        debug=False,
        data_dir=self.data_dir,
    )
    train_examples = data.read_examples(
        os.path.join(self.data_dir, 'train.json'))
    train_features = data.convert_examples_to_features(
        train_examples, self.tokenizer, self.max_seq_length)
    all_input_ids = torch.tensor(data.select_field(train_features, 'input_ids'),
                                 dtype=torch.long)
    all_input_mask = torch.tensor(data.select_field(train_features, 'input_mask'),
                                  dtype=torch.long)
    all_segment_ids = torch.tensor(data.select_field(train_features, 'segment_ids'),
                                   dtype=torch.long)
    all_utterance_mask = torch.tensor(data.select_field(train_features, 'utterance_mask'),
                                      dtype=torch.long)
    all_domainslot_mask = torch.tensor(data.select_field(train_features, 'domainslot_mask'),
                                       dtype=torch.long)
    all_label_tokens_start = torch.tensor(
        [f.label_tokens_start for f in train_features], dtype=torch.long)
    all_label_tokens_end = torch.tensor(
        [f.label_tokens_end for f in train_features], dtype=torch.long)
    all_label_sentence_domainslot = torch.tensor(
        [f.label_sentence_domainslot for f in train_features], dtype=torch.long)

    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_utterance_mask, all_domainslot_mask,
                               all_label_tokens_start, all_label_tokens_end,
                               all_label_sentence_domainslot)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=self.train_batch_size)

    eval_examples = data.read_examples(
        os.path.join(self.data_dir, 'test.json'))
    eval_features = data.convert_examples_to_features(
        eval_examples, self.tokenizer, self.max_seq_length)
    eval_input_ids = torch.tensor(data.select_field(eval_features, 'input_ids'),
                                  dtype=torch.long)
    eval_input_mask = torch.tensor(data.select_field(eval_features, 'input_mask'),
                                   dtype=torch.long)
    eval_segment_ids = torch.tensor(data.select_field(eval_features, 'segment_ids'),
                                    dtype=torch.long)
    eval_utterance_mask = torch.tensor(data.select_field(eval_features, 'utterance_mask'),
                                       dtype=torch.long)
    eval_domainslot_mask = torch.tensor(data.select_field(eval_features, 'domainslot_mask'),
                                        dtype=torch.long)
    eval_label_tokens_start = torch.tensor(
        [f.label_tokens_start for f in eval_features], dtype=torch.long)
    eval_label_tokens_end = torch.tensor(
        [f.label_tokens_end for f in eval_features], dtype=torch.long)
    eval_label_sentence_domainslot = torch.tensor(
        [f.label_sentence_domainslot for f in eval_features], dtype=torch.long)

    eval_data = TensorDataset(eval_input_ids, eval_input_mask, eval_segment_ids,
                              eval_utterance_mask, eval_domainslot_mask,
                              eval_label_tokens_start, eval_label_tokens_end,
                              eval_label_sentence_domainslot)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=self.eval_batch_size)

    return train_dataloader, eval_dataloader, train_examples, eval_examples
def main():
    # Training settings
    def strpair(arg):
        p = tuple(arg.split(':'))
        if len(p) == 1:
            p = p + p
        return p

    parser = argparse.ArgumentParser(
        description='Ablation eval',
        epilog=textwrap.dedent(help_epilog),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--model', type=str, default=None,
                        help='constructor for the model to test')
    parser.add_argument('--pthfile', type=str, default=None,
                        help='filename of .pth file for the model')
    parser.add_argument('--outdir', type=str, default='dissect', required=True,
                        help='directory for dissection output')
    parser.add_argument('--layers', type=strpair, nargs='+',
                        help='space-separated list of layer names to edit' +
                        ', in the form layername[:reportedname]')
    parser.add_argument('--classes', type=str, nargs='+',
                        help='space-separated list of class names to ablate')
    parser.add_argument('--metric', type=str, default='iou',
                        help='ordering metric for selecting units')
    parser.add_argument('--unitcount', type=int, default=30,
                        help='number of units to ablate')
    parser.add_argument('--segmenter', type=str,
                        help='directory containing segmentation dataset')
    parser.add_argument('--netname', type=str, default=None,
                        help='name for network in generated reports')
    parser.add_argument('--batch_size', type=int, default=5,
                        help='batch size for forward pass')
    parser.add_argument('--size', type=int, default=200,
                        help='number of images to test')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA usage')
    parser.add_argument('--quiet', action='store_true', default=False,
                        help='silences console output')
    if len(sys.argv) == 1:
        parser.print_usage(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()

    # Set up console output
    verbose_progress(not args.quiet)

    # Speed up pytorch
    torch.backends.cudnn.benchmark = True

    # Set up CUDA
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        torch.backends.cudnn.benchmark = True

    # Take defaults for model constructor etc from dissect.json settings.
    with open(os.path.join(args.outdir, 'dissect.json')) as f:
        dissection = EasyDict(json.load(f))
    if args.model is None:
        args.model = dissection.settings.model
    if args.pthfile is None:
        args.pthfile = dissection.settings.pthfile
    if args.segmenter is None:
        args.segmenter = dissection.settings.segmenter

    # Instantiate generator
    model = create_instrumented_model(args, gen=True, edit=True)
    if model is None:
        print('No model specified')
        sys.exit(1)

    # Instantiate model
    device = next(model.parameters()).device
    input_shape = model.input_shape

    # 4d input if convolutional, 2d input if first layer is linear.
    raw_sample = standard_z_sample(args.size, input_shape[1],
                                   seed=2).view((args.size,) + input_shape[1:])
    dataset = TensorDataset(raw_sample)

    # Create the segmenter
    segmenter = autoimport_eval(args.segmenter)

    # Now do the actual work.
    labelnames, catnames = (segmenter.get_label_and_category_names(dataset))
    label_category = [catnames.index(c) if c in catnames else 0
                      for l, c in labelnames]
    labelnum_from_name = {n[0]: i for i, n in enumerate(labelnames)}
    segloader = torch.utils.data.DataLoader(dataset,
                                            batch_size=args.batch_size,
                                            num_workers=10,
                                            pin_memory=(device.type == 'cuda'))

    # Index the dissection layers by layer name.
    dissect_layer = {lrec.layer: lrec for lrec in dissection.layers}

    # First, collect a baseline
    for l in model.ablation:
        model.ablation[l] = None

    # For each sort-order, do an ablation
    progress = default_progress()
    for classname in progress(args.classes):
        post_progress(c=classname)
        for layername in progress(model.ablation):
            post_progress(l=layername)
            rankname = '%s-%s' % (classname, args.metric)
            classnum = labelnum_from_name[classname]
            try:
                ranking = next(r for r in dissect_layer[layername].rankings
                               if r.name == rankname)
            except:
                print('%s not found' % rankname)
                sys.exit(1)
            ordering = numpy.argsort(ranking.score)

            # Check if already done
            ablationdir = os.path.join(args.outdir, layername, 'pixablation')
            if os.path.isfile(os.path.join(ablationdir, '%s.json' % rankname)):
                with open(os.path.join(ablationdir, '%s.json' % rankname)) as f:
                    data = EasyDict(json.load(f))
                # If the unit ordering is not the same, something is wrong
                if not all(a == o for a, o in
                           zip(data.ablation_units, ordering)):
                    continue
                if len(data.ablation_effects) >= args.unitcount:
                    continue  # file already done.
                measurements = data.ablation_effects
            measurements = measure_ablation(segmenter, segloader, model,
                                            classnum, layername,
                                            ordering[:args.unitcount])
            measurements = measurements.cpu().numpy().tolist()
            os.makedirs(ablationdir, exist_ok=True)
            with open(os.path.join(ablationdir, '%s.json' % rankname), 'w') as f:
                json.dump(
                    dict(classname=classname,
                         classnum=classnum,
                         baseline=measurements[0],
                         layer=layername,
                         metric=args.metric,
                         ablation_units=ordering.tolist(),
                         ablation_effects=measurements[1:]), f)