import pandas as pd
import torch
from sklearn.model_selection import train_test_split, KFold
from transformers import DistilBertTokenizerFast


def get_compiled_dataset(path: str, type: str = "split", test_size: float = 0.1,
                         allowed_labelers: list = ['Vasily', 'Aydar'],
                         allowed_labels: list = ['Access control', 'Confidentiality', 'Availability',
                                                 'Integrity', 'Operational', 'Accountability']):
    """
    :param path: path to the dataset file
    :param type (optional): either 'split' or '10-fold'; 'split' splits the data into one train and one
        test dataset, '10-fold' splits the data into 10 train/test splits.
    :param test_size (optional): fraction of the data used for the test dataset; only used if ``type`` is 'split'
    :param allowed_labelers (optional): only samples from these labelers are kept
    :param allowed_labels (optional): only samples with these labels are kept
    :return: This function returns three objects:

        - train – train dataset, or a list of train datasets if ``type`` was set to '10-fold'
        - test – test dataset, or a list of test datasets if ``type`` was set to '10-fold'
        - encode_dict – dictionary which maps label names to label ids
    """
    def read_csv_dataset(path):
        df = pd.read_csv(path, sep=',')
        df = df[['Requirement', 'Context (Keywords)', 'Name of Doc', 'Label', 'Comments.1', 'Labeled by.1']]
        df.columns = ['text', 'context', 'doc', 'label', 'comments', 'labeler']
        # keep only samples with allowed labels
        df = df[df['label'].isin(allowed_labels)]
        # keep only samples from allowed labelers
        df = df[df['labeler'].isin(allowed_labelers)]

        encode_dict = {}

        def encode_cat(x):
            if x not in encode_dict:
                encode_dict[x] = len(encode_dict)
            return encode_dict[x]

        encoded_labels = [encode_cat(label) for label in df['label'].values]
        df = df.assign(encoded_label=pd.Series(encoded_labels, index=df.index))
        return df, encode_dict

    df, encode_dict = read_csv_dataset(path)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    class OwnDataset(torch.utils.data.Dataset):
        def __init__(self, df, encoder):
            self.df = df
            self.encoder = encoder
            # tokenize all texts up front with the tokenizer passed in as `encoder`
            self.encodings = self.encoder(list(self.df['text'].values), truncation=True, padding=True)
            self.labels = df['encoded_label'].values

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]).to(device) for key, val in self.encodings.items()}
            item['labels'] = torch.tensor(self.labels[idx]).to(device)
            return item

        def __len__(self):
            return len(self.labels)

    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if type == "split":
        train, test = train_test_split(df, test_size=test_size, random_state=42)
        train_dataset = OwnDataset(train, tokenizer)
        test_dataset = OwnDataset(test, tokenizer)
    elif type == "10-fold":
        kfold = KFold(n_splits=10, shuffle=True, random_state=42)
        train_dataset, test_dataset = [], []
        for train_index, val_index in kfold.split(df):
            train_df = df.iloc[train_index]
            val_df = df.iloc[val_index]
            train_dataset.append(OwnDataset(train_df, tokenizer))
            test_dataset.append(OwnDataset(val_df, tokenizer))
    else:
        raise ValueError("type is either 'split' or '10-fold'")

    return train_dataset, test_dataset, encode_dict
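# Illustrative usage sketch (not part of the original module): the CSV path below is an
# assumption; the call simply exercises the function defined above.
if __name__ == "__main__":
    train_ds, test_ds, label2id = get_compiled_dataset("requirements.csv", type="split", test_size=0.1)
    print(f"train size: {len(train_ds)}, test size: {len(test_ds)}")
    print(f"label mapping: {label2id}")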
from transformers import DistilBertTokenizerFast, DistilBertForMaskedLM

# Load the tokenizer
#tokenizer = DistilBertTokenizerFast.from_pretrained("distilabena-base-v2-akuapem-twi-cased", max_len=512)  # the one we trained ourselves (akuapem)
tokenizer = DistilBertTokenizerFast.from_pretrained("distilabena-base-v2-akuapem-twi-cased", max_len=512, do_lower_case=True)  # the one we trained ourselves (asante, lowercase everything)
#tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased")  # you could also use the pre-trained DistilmBERT tokenizer
#tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-multilingual-cased", do_lower_case=True)  # for asante, lowercase the pretrained tokenizer

#tokenizer.save_vocabulary("distilabena-base-akuapem-twi-cased")  # when using the pretrained tokenizer, be sure to save it locally
tokenizer.save_vocabulary("distilabena-base-v2-asante-twi-uncased")  # saving the tokenizer locally in the case of asante

# Load DistilBERT multilingual base checkpoint
#model = DistilBertForMaskedLM.from_pretrained("distilbert-base-multilingual-cased")  # pretrained DistilmBERT weights
model = DistilBertForMaskedLM.from_pretrained("distilabena-base-v2-akuapem-twi-cased")  # in the case of Asante Twi, start with the Akuapem model weights

print("Number of parameters in the model:")
print(model.num_parameters())

# Create dataset object for the JW300 dataset (Akuapem) or the Asante Twi Bible
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    # file_path="../../data/jw300.en-tw.tw",  # stage 1 - akuapem
    file_path="../../data/asante_twi_bible.txt",  # stage 2 - asante
    block_size=128,
)

# Create "data collator" from dataset and tokenizer - with a 15% chance of masking each token
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

# Define training arguments
from transformers import Trainer, TrainingArguments
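# Sketch of the remaining training setup. This is illustrative only: the output_dir,
# epoch count, batch size and save_steps values below are assumptions, not taken from
# the original script; only the Trainer/TrainingArguments pattern itself is standard.
training_args = TrainingArguments(
    output_dir="distilabena-base-v2-asante-twi-uncased",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=10_000,
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()
trainer.save_model("distilabena-base-v2-asante-twi-uncased")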
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast

aita_data = pd.read_csv('aita_clean.csv')
aita_data_trimmed = aita_data[['body', 'is_asshole']].copy()

print("Dataframe size before dropping empty/short rows is: " + str(aita_data_trimmed.size))
# keep only posts whose body is longer than 50 characters
aita_data_trimmed = aita_data_trimmed[aita_data_trimmed['body'].astype(str).map(len) > 50]
print("Dataframe size after dropping empty/short rows is: " + str(aita_data_trimmed.size))

aita_trimmed_texts = list(aita_data_trimmed['body'])
aita_trimmed_labels = list(aita_data_trimmed['is_asshole'])

train_texts, val_texts, train_labels, val_labels = train_test_split(aita_trimmed_texts,
                                                                    aita_trimmed_labels,
                                                                    test_size=.2)
#print(aita_data_train['body'].astype(str).apply(lambda x: len(x)).max())

print("Generating tokens...")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
print("Tokens generated. Constructing dataset...")


class AITADataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
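# Illustrative continuation (assumed, not from the original file): wrap the encodings in
# the dataset class defined above so they can be fed to a DataLoader or a Trainer.
train_dataset = AITADataset(train_encodings, train_labels)
val_dataset = AITADataset(val_encodings, val_labels)
print(f"Train examples: {len(train_dataset)}, validation examples: {len(val_dataset)}")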
def main():
    # define parser and arguments
    args = get_train_test_args()

    util.set_seed(args.seed)
    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

    if args.do_train:
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        args.save_dir = util.get_save_dir(args.save_dir, args.run_name)
        log = util.get_logger(args.save_dir, 'log_train')
        log.info(f'Args: {json.dumps(vars(args), indent=4, sort_keys=True)}')
        log.info("Preparing Training Data...")

        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        trainer = Trainer(args, log)
        train_dataset, _ = get_dataset(args, args.train_datasets, args.train_dir, tokenizer, 'train')

        log.info("Preparing Validation Data...")
        val_dataset, val_dict = get_dataset(args, args.train_datasets, args.val_dir, tokenizer, 'val')

        train_loader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  sampler=RandomSampler(train_dataset))
        val_loader = DataLoader(val_dataset,
                                batch_size=args.batch_size,
                                sampler=SequentialSampler(val_dataset))

        if args.checkpoint != 'none':
            checkpoint_path = os.path.join(args.checkpoint, 'checkpoint')
            model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)

        best_scores = trainer.train(model, train_loader, val_loader, val_dict)

    if args.do_eval:
        args.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        split_name = 'test' if 'test' in args.eval_dir else 'validation'
        log = util.get_logger(args.save_dir, f'log_{split_name}')
        trainer = Trainer(args, log)
        checkpoint_path = os.path.join(args.save_dir, 'checkpoint')
        model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_path)
        model.to(args.device)

        eval_dataset, eval_dict = get_dataset(args, args.eval_datasets, args.eval_dir, tokenizer, split_name)
        eval_loader = DataLoader(eval_dataset,
                                 batch_size=args.batch_size,
                                 sampler=SequentialSampler(eval_dataset))
        eval_preds, eval_scores = trainer.evaluate(model, eval_loader, eval_dict,
                                                   return_preds=True, split=split_name)
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in eval_scores.items())
        log.info(f'Eval {results_str}')

        # Write submission file
        sub_path = os.path.join(args.save_dir, split_name + '_' + args.sub_file)
        log.info(f'Writing submission file to {sub_path}...')
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(eval_preds):
                csv_writer.writerow([uuid, eval_preds[uuid]])
def init_process(local_rank, backend, hparams, logger):
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=local_rank, world_size=hparams.num_gpus)
    torch.cuda.set_device(local_rank)
    torch.backends.cudnn.benchmark = True

    if local_rank != 0:
        logger.setLevel(logging.WARNING)
    if local_rank == 0:
        writer = SummaryWriter()
        if not os.path.exists("save"):
            os.mkdir("save")
    save_path = "save/model_{}.pt".format(re.sub(r"\s+", "_", time.asctime()))

    random.seed(hparams.seed)

    reader = Reader(hparams)
    start = time.time()
    logger.info("Loading data...")
    reader.load_data("train")
    end = time.time()
    logger.info("Loaded. {} secs".format(end - start))

    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

    lr = hparams.lr
    model = Dial(hparams).cuda()
    optimizer = Adam(model.parameters(), lr)
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[local_rank],
                                                      output_device=local_rank,
                                                      find_unused_parameters=True)

    # load saved model, optimizer
    if hparams.save_path is not None:
        load(model, optimizer, hparams.save_path)
    dist.barrier()

    train.max_iter = len(list(reader.make_batch(reader.train)))
    validate.max_iter = len(list(reader.make_batch(reader.dev)))
    train.warmup_steps = train.max_iter * hparams.max_epochs * hparams.warmup_steps
    train.global_step = 0
    max_joint_acc = 0
    early_stop_count = hparams.early_stop_count

    for epoch in range(hparams.max_epochs):
        logger.info("Train...")
        start = time.time()
        if local_rank == 0:
            train(model, reader, optimizer, writer, hparams, tokenizer, local_rank)
        else:
            train(model, reader, optimizer, None, hparams, tokenizer, local_rank)
        end = time.time()
        logger.info("epoch: {}, {:.4f} secs".format(epoch + 1, end - start))

        logger.info("Validate...")
        loss, joint_acc, slot_acc = validate(model, reader, hparams, tokenizer, local_rank)
        logger.info("loss: {:.4f}, joint accuracy: {:.4f}, slot accuracy: {:.4f}".format(loss, joint_acc, slot_acc))

        if local_rank == 0:
            writer.add_scalar("Val/loss", loss, epoch + 1)
            writer.add_scalar("Val/joint_acc", joint_acc, epoch + 1)
            writer.add_scalar("Val/slot_acc", slot_acc, epoch + 1)

        if joint_acc > max_joint_acc:  # save model
            if local_rank == 0:
                save(model, optimizer, save_path)
                logger.info("Saved to {}.".format(os.path.abspath(save_path)))
            dist.barrier()  # synchronize
            max_joint_acc = joint_acc
            early_stop_count = hparams.early_stop_count
        else:  # early stopping
            if early_stop_count == 0:
                logger.info("Early stopped.")
                break
            elif early_stop_count == 2:
                lr = lr / 2
                logger.info("learning rate schedule: {}".format(lr))
                for param in optimizer.param_groups:
                    param["lr"] = lr
            early_stop_count -= 1
            logger.info("early stop count: {}".format(early_stop_count))

    logger.info("Training finished.")
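# Assumed launch sketch (not part of the original excerpt). init_process() takes the
# process rank as its first positional argument, which is the signature expected by
# torch.multiprocessing.spawn, so a typical entry point starts one worker per GPU.
# The "nccl" backend and the `build_hparams` helper below are illustrative assumptions.
import torch.multiprocessing as mp

def launch(hparams):
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    mp.spawn(init_process, args=("nccl", hparams, logger), nprocs=hparams.num_gpus)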
def tag():
    parser = ArgumentParser()
    parser.add_argument('text', type=str, help='Text to tag')
    parser.add_argument('--name', type=str, help='Name of the model to use', default='best')
    args = parser.parse_args()

    # ########## P R E P A R E   D A T A ##########
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
    # Pad to 512. All sentences in the dataset have a lower number of tokens.
    tokenized = tokenizer(args.text,
                          padding='max_length',
                          max_length=512,
                          return_attention_mask=True,
                          return_special_tokens_mask=True,
                          return_offsets_mapping=True,
                          return_token_type_ids=False)
    token_ids = torch.tensor(tokenized['input_ids'], dtype=torch.long).unsqueeze(0)
    offsets = torch.tensor(tokenized['offset_mapping'], dtype=torch.long).unsqueeze(0)
    att_masks = torch.tensor(tokenized['attention_mask'], dtype=torch.long).unsqueeze(0)
    special_masks = torch.tensor(tokenized['special_tokens_mask'], dtype=torch.long).unsqueeze(0)
    special_masks = special_masks.logical_not()

    # ########## P R E P A R E   M O D E L (S) ##########
    weights_path = os.path.join('weights', args.name)
    checkpoint_names = []
    for file in os.listdir(weights_path):
        if file.endswith('.ckpt'):
            checkpoint_names.append(os.path.join(weights_path, file))

    # ########## O B T A I N   P R E D I C T I O N S ##########
    predicted_spans = []
    for i, checkpoint in enumerate(checkpoint_names):
        model = MultiDepthDistilBertModel.load_from_checkpoint(checkpoint_path=checkpoint)
        logits = model(token_ids, att_masks)
        preds = torch.argmax(logits, -1)
        predicted_spans.append(preds2spans(preds, special_masks, offsets))

    if len(checkpoint_names) == 1:
        predicted_spans = predicted_spans[0]
    else:
        predicted_spans = compute_ensemble_predictions(predicted_spans)

    # ########## G E N E R A T E   O U T P U T ##########
    predicted_spans = predicted_spans[0]
    text = ''
    inside = False
    for i, char in enumerate(args.text):
        if not inside and i in predicted_spans:
            text += '['
            inside = True
        elif inside and i not in predicted_spans:
            text += ']'
            inside = False
        text += char
    if inside:
        text += ']'

    print(f"Input text --> {args.text}")
    print(f"Tagged text --> {text}")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp", type=int, default=2)
    parser.add_argument("--save", type=str, default="./model1_best_base.pt")
    args = parser.parse_args()

    # Data and Tokenization
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
    batch_size = 4
    train_dataset = TorchDataset(
        file_name="./data/diverse.triplets.train.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    dev_dataset = TorchDataset(
        file_name="./data/diverse.triplets.dev.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)

    # Model Training and Evaluation
    NUM_EPOCHS = 1
    LEARNING_RATE = 0.00003

    # load model
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    if args.exp == 3:
        # model.load_state_dict(torch.load(model_path + "model1_best.pt"))
        model_frozen = copy.deepcopy(model)
        for param in model_frozen.distilbert.parameters():
            param.requires_grad = False
        model = model_frozen
    model.to(device)
    model.train()

    if args.exp < 3:
        optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    elif args.exp == 3:
        optimizer = torch.optim.Adam(model.distilbert.parameters(), lr=LEARNING_RATE)

    def evaluate(inputs, model, tokenizer, labels):
        encodings = tokenizer(
            inputs,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        ids, masks = encodings["input_ids"], encodings["attention_mask"]
        outputs = model(ids.to(device), masks.to(device), labels=labels.to(device))
        return outputs

    dataloader = train_dataloader
    N = len(dataloader)
    lowest_loss = float("inf")
    start = time.time()
    learning_curve_y = []
    learning_curve_x = []

    for epoch in range(NUM_EPOCHS):
        epoch_loss = 0
        for i, (queries, pos_docs, neg_docs) in enumerate(dataloader):
            if args.exp != 1:
                optimizer.zero_grad()  # set gradients to zero
            queries = list(queries) * 2  # 2*B
            docs = list(pos_docs) + list(neg_docs)
            labels = torch.cat([torch.ones(len(pos_docs)),
                                torch.zeros(len(neg_docs))]).long().to(device)  # 2*B
            outputs = evaluate(
                inputs=list(zip(queries, docs)),
                model=model,
                tokenizer=tokenizer,
                labels=labels,
            )
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            epoch_loss += float(loss)

            if i % 10 == 0:
                elapsed_time = time.time() - start
                remaining_time = elapsed_time * (1 / (i + 1) * N - 1)
                print(f"{i}: remaining time: {remaining_time:.1f} | est. epoch loss: {epoch_loss / (i + 1):.4f}")

            if i % 10 == 0:
                with torch.no_grad():
                    correct = total = 0
                    val_start = time.time()
                    for dq, dp, dn in dev_dataloader:
                        queries = list(dq) * 2  # 2*B
                        docs = list(dp) + list(dn)
                        labels = torch.cat([torch.ones(len(dp)),
                                            torch.zeros(len(dn))]).long().to(device)
                        outputs = evaluate(inputs=list(zip(queries, docs)),
                                           model=model,
                                           tokenizer=tokenizer,
                                           labels=labels)
                        predicted_classes = outputs.logits.argmax(dim=-1)
                        correct += (labels == predicted_classes).sum()
                        total += len(labels)
                        if time.time() - val_start > 15:
                            break
                    print(f"{i}: est. validation accuracy: {correct / total:.4f}")
                    learning_curve_y.append(correct / total)
                    learning_curve_x.append(i * batch_size)

        # end of epoch: save the model if this epoch improved the average training loss
        if (epoch_loss / (i + 1)) < lowest_loss:
            if args.exp == 1:
                torch.save(model.state_dict(), "model1_best_pretrain.pt")
            elif args.exp == 2:
                torch.save(model.state_dict(), "model1_best_base.pt")
            elif args.exp == 3:
                torch.save(model.state_dict(), "model1_best_freeze.pt")
            lowest_loss = epoch_loss / (i + 1)

        print(f"loss for epoch {epoch} is {epoch_loss}")

    generate_data_for_plot(learning_curve_y, learning_curve_x)
def run(self):
    torch.cuda.empty_cache()

    # raw data loading and preprocessing
    self.generate_data()

    # create the directory where models will be saved
    self.generate_model_directory()

    if self.tokenizer_type != '':
        # generate corpus with the Okt tokenizer from konlpy
        # self.generate_custom_morphs(self.list_memo)
        # generate tokenizer model
        self.generate_custom_vocab()

    tokenizer = None
    if self.tokenizer_type == '':
        # base tokenizer
        tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased",
                                                        lowercase=True,
                                                        strip_accents=False,
                                                        local_files_only=False)
    else:
        # WordPiece tokenizer
        tokenizer = DistilBertTokenizerFast.from_pretrained(self.vocab_root_dir + self.vocab_dir,
                                                            strip_accents=False,
                                                            lowercase=True)

    self.setPrint('Load Custom Vocab size : {}'.format(tokenizer.vocab_size))

    # tokenizer loading check
    # tokenized_input_for_pytorch = tokenizer_for_load("i am very happy now", return_tensors="pt")
    # encoded_text = tokenizer("전화 통화가 정상적으로 안됨", return_tensors="pt")  # "phone calls do not work properly"
    # self.setPrint("Tokens Text List: {}".format(
    #     [tokenizer.convert_ids_to_tokens(s) for s in encoded_text['input_ids'].tolist()[0]]))
    # self.setPrint("Tokens IDX List: {}".format(encoded_text['input_ids'].tolist()[0]))
    # self.setPrint("Tokens Mask List: {}".format(encoded_text['attention_mask'].tolist()[0]))

    # encode the training data
    encoded_data_train = tokenizer.batch_encode_plus(
        self.Train_Data_X,
        add_special_tokens=True,
        return_attention_mask=True,
        # padding='longest',
        padding=True,
        max_length=256,
        return_tensors='pt',
        truncation=True
    )

    # encode the validation data
    encoded_data_val = tokenizer.batch_encode_plus(
        self.Test_Data_X,
        add_special_tokens=True,
        return_attention_mask=True,
        # padding='longest',
        padding=True,
        max_length=256,
        return_tensors='pt',
        truncation=True
    )

    input_ids_train = encoded_data_train['input_ids']
    attention_masks_train = encoded_data_train['attention_mask']
    labels_train = torch.tensor(self.Train_Data_Y)

    input_ids_test = encoded_data_val['input_ids']
    attention_masks_test = encoded_data_val['attention_mask']
    labels_test = torch.tensor(self.Test_Data_Y)

    dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
    dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

    # local_files_only=True
    self.model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased",
                                                                     num_labels=len(self.label_index),
                                                                     output_attentions=False,
                                                                     output_hidden_states=False,
                                                                     local_files_only=False).to(self.device)

    # data loaders
    dataloader_train = DataLoader(dataset_train,
                                  sampler=RandomSampler(dataset_train),
                                  batch_size=self.batch_size,
                                  drop_last=True)
    dataloader_test = DataLoader(dataset_test,
                                 sampler=RandomSampler(dataset_test),
                                 batch_size=self.batch_size)

    optimizer = AdamW(self.model.parameters(), lr=self.learning_rate, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=len(dataloader_train) * self.epoch)
    # scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer=optimizer,
    #                                                                num_warmup_steps=0,
    #                                                                num_training_steps=len(dataloader_train) * self.epoch)

    # track loss and F1 per epoch for the graphs
    total_train_loss = np.array([0.0000] * self.epoch)
    total_val_loss = np.array([0.0000] * self.epoch)
    total_score = np.array([0.0000] * self.epoch)

    # Training start
    for epoch in range(1, self.epoch + 1):
        self.setPrint('Start of Epoch {}'.format(epoch))
        self.model.train()
        loss_train_total = 0

        for idx, batch in enumerate(dataloader_train):
            self.model.zero_grad()
            batch = tuple(b.to(self.device) for b in batch)
            inputs = {'input_ids': batch[0],
                      'attention_mask': batch[1],
                      'labels': batch[2],
                      }
            outputs = self.model(**inputs)
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            if idx % 100 == 0:
                self.setPrint('[{}]Epoch {}/{} training_loss : {:.4f}'.format(epoch, idx, len(dataloader_train),
                                                                              loss.item() / len(batch)))

        # free GPU memory
        batch = None
        torch.cuda.empty_cache()

        # save the model after every epoch
        torch.save(self.model.state_dict(),
                   self.model_root_dir + self.model_dir + 'BERT_dict_epoch_{}.model'.format(epoch))
        self.setPrint('Save fine_tuned_BERT_epoch_{}.model'.format(epoch))
        self.setPrint('\nEnd of Epoch {}'.format(epoch))

        loss_train_avg = loss_train_total / len(dataloader_train)
        self.setPrint('[{}] Epoch Training loss: {:.4f}'.format(epoch, loss_train_avg))
        total_train_loss[epoch - 1] = round(loss_train_avg, 4)

        val_loss, predictions, true_vals = self.evaluate(dataloader_test)
        val_f1 = self.f1_score_func(predictions, true_vals)
        total_score[epoch - 1] = round(val_f1, 4)
        total_val_loss[epoch - 1] = round(val_loss, 4)
        self.setPrint('[{}] Validation loss: {:.4f}'.format(epoch, val_loss))
        self.setPrint('[{}] F1 Score : {:.4f}'.format(epoch, val_f1))

    # generate loss/F1 graphs
    self.generate_graph(total_train_loss, total_val_loss, total_score)
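# The self.evaluate() and self.f1_score_func() methods used above are not shown in this
# excerpt. The sketch below is only an assumed outline of what such helpers typically look
# like for a DistilBERT classifier (they would live on the same class as run()); it is not
# the original implementation.
from sklearn.metrics import f1_score

def evaluate(self, dataloader_test):
    # run the model on the validation loader and collect loss, logits and labels
    self.model.eval()
    loss_val_total = 0
    predictions, true_vals = [], []
    with torch.no_grad():
        for batch in dataloader_test:
            batch = tuple(b.to(self.device) for b in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            outputs = self.model(**inputs)
            loss_val_total += outputs[0].item()
            predictions.append(outputs[1].detach().cpu().numpy())
            true_vals.append(inputs['labels'].cpu().numpy())
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
    return loss_val_total / len(dataloader_test), predictions, true_vals

def f1_score_func(self, predictions, true_vals):
    # weighted F1 over the argmax of the logits
    preds_flat = np.argmax(predictions, axis=1).flatten()
    return f1_score(true_vals.flatten(), preds_flat, average='weighted')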