import os
import logging
from collections import deque

import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler
from tqdm import tqdm, trange
from tensorboardX import SummaryWriter
# AdamW is imported from transformers as in older releases that still export it;
# the SummaryWriter(logdir=...) call matches the tensorboardX API.
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForTokenClassification,
    AutoTokenizer,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

# SpanLabeledTextDataset is a project-local helper whose import is omitted here;
# sketches of the calc_slope and prepare_texts helpers are given further below.

logger = logging.getLogger(__name__)


def fit(self, completions, workdir=None, model_type='bert', pretrained_model='dslim/bert-base-NER',
        batch_size=32, learning_rate=5e-5, adam_epsilon=1e-8, num_train_epochs=3, weight_decay=0.0,
        logging_steps=1, warmup_steps=0, save_steps=50, dump_dataset=True,
        cache_dir='~/.heartex/cache', train_logs=None, **kwargs):
    print('\n\n >> entered fit in transformers')
    train_logs = train_logs or os.path.join(workdir, 'train_logs')
    os.makedirs(train_logs, exist_ok=True)
    logger.debug('Prepare models')
    cache_dir = os.path.expanduser(cache_dir)
    os.makedirs(cache_dir, exist_ok=True)
    model_type = model_type.lower()
    # assert model_type in MODEL_CLASSES.keys(), f'Input model type {model_type} not in {MODEL_CLASSES.keys()}'
    # assert pretrained_model in ALL_MODELS, f'Pretrained model {pretrained_model} not in {ALL_MODELS}'
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model, cache_dir=cache_dir)

    logger.debug('Read data')
    # read input data stream
    texts, list_of_spans = [], []
    print('\n\n >> completions given to the fit method in ner:', completions)
    for item in completions:
        print('item', item)
        texts.append(item['data'][self.value])
        list_of_spans.append(self.get_spans(item['annotations'][0]))
    print('>> extracted list of spans', list_of_spans)
    print('>> extracted texts', texts)

    logger.debug('Prepare dataset')
    pad_token_label_id = CrossEntropyLoss().ignore_index
    print('\n\n Creating the train set from the following inputs:',
          'texts:', texts, 'list_of_spans:', list_of_spans, 'tokenizer:', tokenizer)
    train_set = SpanLabeledTextDataset(
        texts, list_of_spans, tokenizer,
        cls_token_at_end=model_type in ['xlnet'],
        cls_token_segment_id=2 if model_type in ['xlnet'] else 0,
        sep_token_extra=model_type in ['roberta'],
        pad_token_label_id=pad_token_label_id)
    try:
        print('train_set supports indexing; train_set[0]:', train_set[0])
    except Exception:
        print('train_set does not support indexing')

    if dump_dataset:
        dataset_file = os.path.join(workdir, 'train_set.txt')
        train_set.dump(dataset_file)
        print('\n\n >> successfully dumped train set at', dataset_file)

    print('\n\n Adding the following number of labels to the model config:', train_set.num_labels)
    print('if you are loading from a pretrained checkpoint (Hugging Face Hub), this number of labels should match')
    # config = AutoConfig.from_pretrained(pretrained_model, num_labels=train_set.num_labels, cache_dir=cache_dir)
    # num_labels is hard-coded to the 9-tag CoNLL-2003 scheme used by dslim/bert-base-NER
    config = AutoConfig.from_pretrained(pretrained_model, num_labels=9, cache_dir=cache_dir)
    print('\n\n > config:', config)
    print('\n\n >> loading model from hub checkpoint', pretrained_model)
    model = AutoModelForTokenClassification.from_pretrained(
        pretrained_model, config=config, cache_dir=cache_dir)
    print('\n\n successfully loaded model from hub checkpoint')

    batch_padding = SpanLabeledTextDataset.get_padding_function(model_type, tokenizer, pad_token_label_id)

    train_loader = DataLoader(
        dataset=train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=batch_padding)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            'weight_decay': weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]
    num_training_steps = len(train_loader) * num_train_epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)

    tr_loss, logging_loss = 0, 0
    global_step = 0
    if train_logs:
        tb_writer = SummaryWriter(logdir=os.path.join(train_logs, os.path.basename(workdir)))
    epoch_iterator = trange(num_train_epochs, desc='Epoch')
    loss_queue = deque(maxlen=10)
    for _ in epoch_iterator:
        batch_iterator = tqdm(train_loader, desc='Batch')
        for step, batch in enumerate(batch_iterator):
            model.train()
            inputs = {
                'input_ids': batch['input_ids'],
                'attention_mask': batch['input_mask'],
                'labels': batch['label_ids'],
                'token_type_ids': batch['segment_ids']
            }
            if model_type == 'distilbert':
                inputs.pop('token_type_ids')
            model_output = model(**inputs)
            loss = model_output[0]
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1
            if global_step % logging_steps == 0:
                last_loss = (tr_loss - logging_loss) / logging_steps
                loss_queue.append(last_loss)
                if train_logs:
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', last_loss, global_step)
                logging_loss = tr_loss

            # slope-based early stopping
            if len(loss_queue) == loss_queue.maxlen:
                slope = calc_slope(loss_queue)
                if train_logs:
                    tb_writer.add_scalar('slope', slope, global_step)
                if abs(slope) < 1e-2:
                    break

    print('\n\n >>> trained successfully')
    if train_logs:
        tb_writer.close()

    # Take care of distributed/parallel training
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(workdir)
    tokenizer.save_pretrained(workdir)

    print('\n\n >>> this mapping based on the train set will be used to create the label map:',
          train_set.tag_idx_map.items())
    label_map = {i: t for t, i in train_set.tag_idx_map.items()}
    print('\n\n created label_map in fit', label_map)
    print('\n\n overriding label map')
    # Override with the standard CoNLL-2003 tag ids used by dslim/bert-base-NER
    label_map = {
        "0": "O",
        "1": "B-MISC",
        "2": "I-MISC",
        "3": "B-PER",
        "4": "I-PER",
        "5": "B-ORG",
        "6": "I-ORG",
        "7": "B-LOC",
        "8": "I-LOC"
    }
    print(label_map)

    return {
        'model_path': workdir,
        'batch_size': batch_size,
        'pad_token_label_id': pad_token_label_id,
        'dataset_params_dict': train_set.get_params_dict(),
        'model_type': model_type,
        'pretrained_model': pretrained_model,
        'label_map': label_map
    }
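# `calc_slope` is a project-local helper used by the slope-based early stopping
# above but not shown in this file. The sketch below is an assumption about its
# behaviour: it fits a least-squares line to the recent loss values and returns
# the slope. The project's real implementation may differ.
def calc_slope(y):
    """Slope of the least-squares line fitted to the sequence y."""
    y = list(y)
    n = len(y)
    if n < 2:
        raise ValueError('Need at least two points to estimate a slope')
    x = range(n)
    x_mean = sum(x) / n
    y_mean = sum(y) / n
    numerator = sum((xi - x_mean) * (yi - y_mean) for xi, yi in zip(x, y))
    denominator = sum((xi - x_mean) ** 2 for xi in x)
    return numerator / denominator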
def fit(self, completions, workdir=None, model_type='bert', pretrained_model='bert-base-uncased',
        batch_size=32, learning_rate=5e-5, adam_epsilon=1e-8, num_train_epochs=100, weight_decay=0.0,
        logging_steps=1, warmup_steps=0, save_steps=50, dump_dataset=True,
        cache_dir='~/.heartex/cache', train_logs=None, **kwargs):
    train_logs = train_logs or os.path.join(workdir, 'train_logs')
    os.makedirs(train_logs, exist_ok=True)
    logger.debug('Prepare models')
    cache_dir = os.path.expanduser(cache_dir)
    os.makedirs(cache_dir, exist_ok=True)
    model_type = model_type.lower()
    # assert model_type in MODEL_CLASSES.keys(), f'Input model type {model_type} not in {MODEL_CLASSES.keys()}'
    # assert pretrained_model in ALL_MODELS, f'Pretrained model {pretrained_model} not in {ALL_MODELS}'
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model, cache_dir=cache_dir)

    logger.debug('Read data')
    # read input data stream
    texts, list_of_spans = [], []
    for item in completions:
        texts.append(item['data'][self.value])
        list_of_spans.append(self.get_spans(item['completions'][0]))

    logger.debug('Prepare dataset')
    pad_token_label_id = CrossEntropyLoss().ignore_index
    train_set = SpanLabeledTextDataset(
        texts, list_of_spans, tokenizer,
        cls_token_at_end=model_type in ['xlnet'],
        cls_token_segment_id=2 if model_type in ['xlnet'] else 0,
        sep_token_extra=model_type in ['roberta'],
        pad_token_label_id=pad_token_label_id)

    if dump_dataset:
        dataset_file = os.path.join(workdir, 'train_set.txt')
        train_set.dump(dataset_file)

    # config = config_class.from_pretrained(pretrained_model, num_labels=train_set.num_labels, cache_dir=cache_dir)
    config = AutoConfig.from_pretrained(pretrained_model, num_labels=train_set.num_labels, cache_dir=cache_dir)
    # model = model_class.from_pretrained(pretrained_model, config=config, cache_dir=cache_dir)
    model = AutoModelForTokenClassification.from_pretrained(
        pretrained_model, config=config, cache_dir=cache_dir)

    batch_padding = SpanLabeledTextDataset.get_padding_function(model_type, tokenizer, pad_token_label_id)

    train_loader = DataLoader(
        dataset=train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=batch_padding)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            'weight_decay': weight_decay
        },
        {
            'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }
    ]
    num_training_steps = len(train_loader) * num_train_epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)

    tr_loss, logging_loss = 0, 0
    global_step = 0
    if train_logs:
        tb_writer = SummaryWriter(logdir=os.path.join(train_logs, os.path.basename(workdir)))
    epoch_iterator = trange(num_train_epochs, desc='Epoch')
    loss_queue = deque(maxlen=10)
    for _ in epoch_iterator:
        batch_iterator = tqdm(train_loader, desc='Batch')
        for step, batch in enumerate(batch_iterator):
            model.train()
            inputs = {
                'input_ids': batch['input_ids'],
                'attention_mask': batch['input_mask'],
                'labels': batch['label_ids'],
                'token_type_ids': batch['segment_ids']
            }
            if model_type == 'distilbert':
                inputs.pop('token_type_ids')
            model_output = model(**inputs)
            loss = model_output[0]
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1
            if global_step % logging_steps == 0:
                last_loss = (tr_loss - logging_loss) / logging_steps
                loss_queue.append(last_loss)
                if train_logs:
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', last_loss, global_step)
                logging_loss = tr_loss

            # slope-based early stopping
            if len(loss_queue) == loss_queue.maxlen:
                slope = calc_slope(loss_queue)
                if train_logs:
                    tb_writer.add_scalar('slope', slope, global_step)
                if abs(slope) < 1e-2:
                    break

    if train_logs:
        tb_writer.close()

    # Take care of distributed/parallel training
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(workdir)
    tokenizer.save_pretrained(workdir)

    label_map = {i: t for t, i in train_set.tag_idx_map.items()}

    return {
        'model_path': workdir,
        'batch_size': batch_size,
        'pad_token_label_id': pad_token_label_id,
        'dataset_params_dict': train_set.get_params_dict(),
        'model_type': model_type,
        'pretrained_model': pretrained_model,
        'label_map': label_map
    }
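# The dict returned by fit() is what a companion load/predict step would consume.
# The sketch below is illustrative only: `load_for_inference` is not part of the
# original code. It assumes the artifacts written by save_pretrained() above and
# the integer-keyed `label_map` built from train_set.tag_idx_map.
def load_for_inference(train_output):
    import torch
    from transformers import AutoModelForTokenClassification, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(train_output['model_path'])
    model = AutoModelForTokenClassification.from_pretrained(train_output['model_path'])
    model.eval()
    label_map = train_output['label_map']

    def predict_tags(text):
        encoded = tokenizer(text, return_tensors='pt', truncation=True)
        with torch.no_grad():
            outputs = model(**encoded)
        # logits have shape (1, seq_len, num_labels); pick the best tag id per token
        pred_ids = outputs[0].argmax(dim=-1)[0].tolist()
        tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
        return [(token, label_map[i]) for token, i in zip(tokens, pred_ids)]

    return predict_tags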
def fit(self, completions, workdir=None, cache_dir=None, pretrained_model='bert-base-multilingual-cased',
        maxlen=64, batch_size=32, num_epochs=100, logging_steps=1, train_logs=None, **kwargs):
    input_texts = []
    output_labels, output_labels_idx = [], []
    label2idx = {l: i for i, l in enumerate(self.labels)}

    for completion in completions:
        # get input text from task data
        if completion['completions'][0].get('skipped'):
            continue
        input_text = completion['data'][self.value]
        input_texts.append(input_text)

        # get an annotation
        output_label = completion['completions'][0]['result'][0]['value']['choices'][0]
        output_labels.append(output_label)
        output_label_idx = label2idx[output_label]
        output_labels_idx.append(output_label_idx)

    new_labels = set(output_labels)
    added_labels = new_labels - set(self.labels)
    if len(added_labels) > 0:
        print('Label set has been changed. Added ones: ' + str(list(added_labels)))
        self.labels = list(sorted(new_labels))
        label2idx = {l: i for i, l in enumerate(self.labels)}
        output_labels_idx = [label2idx[label] for label in output_labels]

    tokenizer = BertTokenizer.from_pretrained(pretrained_model, cache_dir=cache_dir)

    train_dataloader = prepare_texts(input_texts, tokenizer, maxlen, RandomSampler, batch_size, output_labels_idx)
    # `device` is assumed to be defined at module level (e.g. a torch.device for 'cuda' or 'cpu')
    model = self.reset_model(pretrained_model, cache_dir, device)

    total_steps = len(train_dataloader) * num_epochs
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    global_step = 0
    total_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(num_epochs, desc='Epoch')
    if train_logs:
        tb_writer = SummaryWriter(logdir=os.path.join(train_logs, os.path.basename(workdir)))
    else:
        tb_writer = None
    loss_queue = deque(maxlen=10)
    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc='Iteration')
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            loss.backward()
            total_loss += loss.item()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1
            if global_step % logging_steps == 0:
                last_loss = (total_loss - logging_loss) / logging_steps
                loss_queue.append(last_loss)
                if tb_writer:
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', last_loss, global_step)
                logging_loss = total_loss

            # slope-based early stopping
            if len(loss_queue) == loss_queue.maxlen:
                slope = calc_slope(loss_queue)
                if tb_writer:
                    tb_writer.add_scalar('slope', slope, global_step)
                if abs(slope) < 1e-2:
                    break

    if tb_writer:
        tb_writer.close()

    # Take care of distributed/parallel training
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(workdir)
    tokenizer.save_pretrained(workdir)

    return {
        'model_path': workdir,
        'batch_size': batch_size,
        'maxlen': maxlen,
        'pretrained_model': pretrained_model,
        'labels': self.labels
    }
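# `prepare_texts` is a project-local helper that is not shown here. The sketch
# below is an assumption about its behaviour: encode the texts to a fixed
# `maxlen`, wrap ids/masks/labels in a TensorDataset, and return a DataLoader
# driven by the given sampler class (e.g. RandomSampler). The project's real
# helper may differ in details.
def prepare_texts(texts, tokenizer, maxlen, sampler_class, batch_size, labels_idx=None):
    import torch
    from torch.utils.data import DataLoader, TensorDataset

    encoded = tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=maxlen,
        return_tensors='pt',
    )
    tensors = [encoded['input_ids'], encoded['attention_mask']]
    if labels_idx is not None:
        tensors.append(torch.tensor(labels_idx, dtype=torch.long))
    dataset = TensorDataset(*tensors)
    # batches then unpack as (input_ids, attention_mask[, labels]) in the training loop
    return DataLoader(dataset, sampler=sampler_class(dataset), batch_size=batch_size)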