Example #1
    def fit(self,
            completions,
            workdir=None,
            model_type='bert',
            pretrained_model='dslim/bert-base-NER',
            batch_size=32,
            learning_rate=5e-5,
            adam_epsilon=1e-8,
            num_train_epochs=3,
            weight_decay=0.0,
            logging_steps=1,
            warmup_steps=0,
            save_steps=50,
            dump_dataset=True,
            cache_dir='~/.heartex/cache',
            train_logs=None,
            **kwargs):
        print('\n\n\n\n >> entered fit in transformers')

        train_logs = train_logs or os.path.join(workdir, 'train_logs')
        os.makedirs(train_logs, exist_ok=True)
        logger.debug('Prepare models')
        cache_dir = os.path.expanduser(cache_dir)
        os.makedirs(cache_dir, exist_ok=True)

        model_type = model_type.lower()
        # assert model_type in MODEL_CLASSES.keys(), f'Input model type {model_type} not in {MODEL_CLASSES.keys()}'
        # assert pretrained_model in ALL_MODELS, f'Pretrained model {pretrained_model} not in {ALL_MODELS}'

        tokenizer = AutoTokenizer.from_pretrained(pretrained_model,
                                                  cache_dir=cache_dir)

        logger.debug('Read data')
        # read input data stream
        texts, list_of_spans = [], []
        print('\n\n >> these are the completions given to the fit method in ner',
              completions)
        for item in completions:
            print('item', item)
            texts.append(item['data'][self.value])
            list_of_spans.append(self.get_spans(item['annotations'][0]))

        print('>> extracted list of spans', list_of_spans)
        print('>> extracted texts', texts)
        logger.debug('Prepare dataset')
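        # CrossEntropyLoss().ignore_index is -100 by default; label positions
        # set to this value are excluded from the loss computation.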
        pad_token_label_id = CrossEntropyLoss().ignore_index
        print('\n\n Giving the following inputs to create a train set:',
              'texts:', texts, 'list_of_spans', list_of_spans, 'tokenizer',
              tokenizer)
        train_set = SpanLabeledTextDataset(
            texts,
            list_of_spans,
            tokenizer,
            cls_token_at_end=model_type in ['xlnet'],
            cls_token_segment_id=2 if model_type in ['xlnet'] else 0,
            sep_token_extra=model_type in ['roberta'],
            pad_token_label_id=pad_token_label_id)
        try:
            print('train_set[0]', train_set[0])
            print('so train_set supports indexing')
        except (TypeError, IndexError):
            print('train_set did not support indexing')

        if dump_dataset:
            dataset_file = os.path.join(workdir, 'train_set.txt')
            train_set.dump(dataset_file)
            print('\n\n >> successfully dumped trainset at ', dataset_file)

        print(
            '\n\n Adding the following number of labels to the model config: ',
            train_set.num_labels)
        print(
            'if you are loading from a pretrained checkpoint (huggingface hub), this number of labels should match'
        )
        # config = config_class.from_pretrained(pretrained_model, num_labels=train_set.num_labels, cache_dir=cache_dir)
        # config = AutoConfig.from_pretrained(pretrained_model, num_labels=train_set.num_labels, cache_dir=cache_dir)
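        # num_labels is hard-coded to 9 here to match the dslim/bert-base-NER
        # checkpoint (O plus B-/I- tags for MISC, PER, ORG and LOC), instead of
        # using train_set.num_labels as in the commented-out lines above.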
        config = AutoConfig.from_pretrained(pretrained_model,
                                            num_labels=9,
                                            cache_dir=cache_dir)
        print('\n\n> config, ', config)
        # model = model_class.from_pretrained(pretrained_model, config=config, cache_dir=cache_dir)
        print('\n\n>> loading model from hub checkpoint, ', pretrained_model)
        model = AutoModelForTokenClassification.from_pretrained(
            pretrained_model, config=config, cache_dir=cache_dir)
        print('\n\n successfully loaded model from hub checkpoint')
        batch_padding = SpanLabeledTextDataset.get_padding_function(
            model_type, tokenizer, pad_token_label_id)

        train_loader = DataLoader(dataset=train_set,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=batch_padding)

        no_decay = ['bias', 'LayerNorm.weight']
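        # Biases and LayerNorm weights are exempt from weight decay, the usual
        # convention for BERT-style fine-tuning.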
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay,
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]

        num_training_steps = len(train_loader) * num_train_epochs
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=learning_rate,
                          eps=adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps)

        tr_loss, logging_loss = 0, 0
        global_step = 0
        if train_logs:
            tb_writer = SummaryWriter(
                logdir=os.path.join(train_logs, os.path.basename(workdir)))
        epoch_iterator = trange(num_train_epochs, desc='Epoch')
        loss_queue = deque(maxlen=10)
        for _ in epoch_iterator:
            batch_iterator = tqdm(train_loader, desc='Batch')
            for step, batch in enumerate(batch_iterator):

                model.train()
                inputs = {
                    'input_ids': batch['input_ids'],
                    'attention_mask': batch['input_mask'],
                    'labels': batch['label_ids'],
                    'token_type_ids': batch['segment_ids']
                }
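                # DistilBERT has no token type (segment) embeddings, so drop them.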
                if model_type == 'distilbert':
                    inputs.pop('token_type_ids')

                model_output = model(**inputs)
                loss = model_output[0]
                loss.backward()
                tr_loss += loss.item()
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if global_step % logging_steps == 0:
                    last_loss = (tr_loss - logging_loss) / logging_steps
                    loss_queue.append(last_loss)
                    if train_logs:
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', last_loss, global_step)
                    logging_loss = tr_loss

            # slope-based early stopping
            if len(loss_queue) == loss_queue.maxlen:
                slope = calc_slope(loss_queue)
                if train_logs:
                    tb_writer.add_scalar('slope', slope, global_step)
                if abs(slope) < 1e-2:
                    break
        print('\n\n >>> trained successfully')
        if train_logs:
            tb_writer.close()

        model_to_save = model.module if hasattr(
            model,
            "module") else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(workdir)
        tokenizer.save_pretrained(workdir)
        print(
            '\n\n\n >>> this mapping based on the train set will be used to create the label map',
            train_set.tag_idx_map.items())
        label_map = {i: t for t, i in train_set.tag_idx_map.items()}
        print('\n\n created label_map in fit ', label_map)

        print('\n\n overriding label map')
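        # Override with the label set of the dslim/bert-base-NER checkpoint
        # (CoNLL-2003 entity types), so predicted class ids map back to its tags.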
        label_map = {
            "0": "O",
            "1": "B-MISC",
            "2": "I-MISC",
            "3": "B-PER",
            "4": "I-PER",
            "5": "B-ORG",
            "6": "I-ORG",
            "7": "B-LOC",
            "8": "I-LOC"
        }
        print(label_map)
        return {
            'model_path': workdir,
            'batch_size': batch_size,
            'pad_token_label_id': pad_token_label_id,
            'dataset_params_dict': train_set.get_params_dict(),
            'model_type': model_type,
            'pretrained_model': pretrained_model,
            'label_map': label_map
        }
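
The training loop above stops early when calc_slope over the last ten logged losses flattens out, but that helper is not shown in these examples. A minimal sketch, assuming it is simply a least-squares slope over the queued loss values (a hypothetical reimplementation, not the original helper):

import numpy as np

def calc_slope(values):
    # Hypothetical sketch: fit a least-squares line to the recent loss values
    # and return its slope; training stops once |slope| falls below a threshold.
    y = np.asarray(values, dtype=float)
    x = np.arange(len(y), dtype=float)
    slope, _intercept = np.polyfit(x, y, 1)
    return slope
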
Example #2
    def fit(self,
            completions,
            workdir=None,
            model_type='bert',
            pretrained_model='bert-base-uncased',
            batch_size=32,
            learning_rate=5e-5,
            adam_epsilon=1e-8,
            num_train_epochs=100,
            weight_decay=0.0,
            logging_steps=1,
            warmup_steps=0,
            save_steps=50,
            dump_dataset=True,
            cache_dir='~/.heartex/cache',
            train_logs=None,
            **kwargs):
        train_logs = train_logs or os.path.join(workdir, 'train_logs')
        os.makedirs(train_logs, exist_ok=True)
        logger.debug('Prepare models')
        cache_dir = os.path.expanduser(cache_dir)
        os.makedirs(cache_dir, exist_ok=True)

        model_type = model_type.lower()
        # assert model_type in MODEL_CLASSES.keys(), f'Input model type {model_type} not in {MODEL_CLASSES.keys()}'
        # assert pretrained_model in ALL_MODELS, f'Pretrained model {pretrained_model} not in {ALL_MODELS}'

        tokenizer = AutoTokenizer.from_pretrained(pretrained_model,
                                                  cache_dir=cache_dir)

        logger.debug('Read data')
        # read input data stream
        texts, list_of_spans = [], []
        for item in completions:
            texts.append(item['data'][self.value])
            list_of_spans.append(self.get_spans(item['completions'][0]))

        logger.debug('Prepare dataset')
        pad_token_label_id = CrossEntropyLoss().ignore_index
        train_set = SpanLabeledTextDataset(
            texts,
            list_of_spans,
            tokenizer,
            cls_token_at_end=model_type in ['xlnet'],
            cls_token_segment_id=2 if model_type in ['xlnet'] else 0,
            sep_token_extra=model_type in ['roberta'],
            pad_token_label_id=pad_token_label_id)

        if dump_dataset:
            dataset_file = os.path.join(workdir, 'train_set.txt')
            train_set.dump(dataset_file)

        # config = config_class.from_pretrained(pretrained_model, num_labels=train_set.num_labels, cache_dir=cache_dir)
        config = AutoConfig.from_pretrained(pretrained_model,
                                            num_labels=train_set.num_labels,
                                            cache_dir=cache_dir)
        # model = model_class.from_pretrained(pretrained_model, config=config, cache_dir=cache_dir)
        model = AutoModelForTokenClassification.from_pretrained(
            pretrained_model, config=config, cache_dir=cache_dir)

        batch_padding = SpanLabeledTextDataset.get_padding_function(
            model_type, tokenizer, pad_token_label_id)

        train_loader = DataLoader(dataset=train_set,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=batch_padding)

        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                'weight_decay': weight_decay,
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]

        num_training_steps = len(train_loader) * num_train_epochs
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=learning_rate,
                          eps=adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=num_training_steps)

        tr_loss, logging_loss = 0, 0
        global_step = 0
        if train_logs:
            tb_writer = SummaryWriter(
                logdir=os.path.join(train_logs, os.path.basename(workdir)))
        epoch_iterator = trange(num_train_epochs, desc='Epoch')
        loss_queue = deque(maxlen=10)
        for _ in epoch_iterator:
            batch_iterator = tqdm(train_loader, desc='Batch')
            for step, batch in enumerate(batch_iterator):

                model.train()
                inputs = {
                    'input_ids': batch['input_ids'],
                    'attention_mask': batch['input_mask'],
                    'labels': batch['label_ids'],
                    'token_type_ids': batch['segment_ids']
                }
                if model_type == 'distilbert':
                    inputs.pop('token_type_ids')

                model_output = model(**inputs)
                loss = model_output[0]
                loss.backward()
                tr_loss += loss.item()
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if global_step % logging_steps == 0:
                    last_loss = (tr_loss - logging_loss) / logging_steps
                    loss_queue.append(last_loss)
                    if train_logs:
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', last_loss, global_step)
                    logging_loss = tr_loss

            # slope-based early stopping
            if len(loss_queue) == loss_queue.maxlen:
                slope = calc_slope(loss_queue)
                if train_logs:
                    tb_writer.add_scalar('slope', slope, global_step)
                if abs(slope) < 1e-2:
                    break

        if train_logs:
            tb_writer.close()

        model_to_save = model.module if hasattr(
            model,
            "module") else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(workdir)
        tokenizer.save_pretrained(workdir)
        label_map = {i: t for t, i in train_set.tag_idx_map.items()}

        return {
            'model_path': workdir,
            'batch_size': batch_size,
            'pad_token_label_id': pad_token_label_id,
            'dataset_params_dict': train_set.get_params_dict(),
            'model_type': model_type,
            'pretrained_model': pretrained_model,
            'label_map': label_map
        }
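
Both NER examples build their span lists with self.get_spans(...), which is not shown here. A rough sketch of what such a method might look like, assuming Label Studio-style span results with start/end character offsets and a labels list (the field names are assumptions, not taken from the original):

    def get_spans(self, completion):
        # Hypothetical sketch: collect labelled character spans from a single
        # annotation/completion; field names follow Label Studio's span format.
        spans = []
        for result in completion.get('result', []):
            value = result.get('value', {})
            if value.get('labels'):
                spans.append({
                    'start': value['start'],
                    'end': value['end'],
                    'label': value['labels'][0],
                })
        return spans
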
    def fit(self,
            completions,
            workdir=None,
            cache_dir=None,
            pretrained_model='bert-base-multilingual-cased',
            maxlen=64,
            batch_size=32,
            num_epochs=100,
            logging_steps=1,
            train_logs=None,
            **kwargs):
        input_texts = []
        output_labels, output_labels_idx = [], []
        label2idx = {l: i for i, l in enumerate(self.labels)}
        for completion in completions:
            # get input text from task data

            if completion['completions'][0].get('skipped'):
                continue

            input_text = completion['data'][self.value]
            input_texts.append(input_text)

            # get an annotation
            result = completion['completions'][0]['result'][0]
            output_label = result['value']['choices'][0]
            output_labels.append(output_label)
            output_label_idx = label2idx[output_label]
            output_labels_idx.append(output_label_idx)

        new_labels = set(output_labels)
        added_labels = new_labels - set(self.labels)
        if len(added_labels) > 0:
            print('Label set has changed. Added labels: ' +
                  str(list(added_labels)))
            self.labels = list(sorted(new_labels))
            label2idx = {l: i for i, l in enumerate(self.labels)}
            output_labels_idx = [label2idx[label] for label in output_labels]

        tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                                  cache_dir=cache_dir)

        train_dataloader = prepare_texts(input_texts, tokenizer, maxlen,
                                         RandomSampler, batch_size,
                                         output_labels_idx)
        model = self.reset_model(pretrained_model, cache_dir, device)

        total_steps = len(train_dataloader) * num_epochs
        optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        global_step = 0
        total_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(num_epochs, desc='Epoch')
        if train_logs:
            tb_writer = SummaryWriter(
                logdir=os.path.join(train_logs, os.path.basename(workdir)))
        else:
            tb_writer = None
        loss_queue = deque(maxlen=10)
        for epoch in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc='Iteration')
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[2]
                }
                outputs = model(**inputs)
                loss = outputs[0]
                loss.backward()
                total_loss += loss.item()

                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if global_step % logging_steps == 0:
                    last_loss = (total_loss - logging_loss) / logging_steps
                    loss_queue.append(last_loss)
                    if tb_writer:
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', last_loss, global_step)
                    logging_loss = total_loss

            # slope-based early stopping
            if len(loss_queue) == loss_queue.maxlen:
                slope = calc_slope(loss_queue)
                if tb_writer:
                    tb_writer.add_scalar('slope', slope, global_step)
                if abs(slope) < 1e-2:
                    break

        if tb_writer:
            tb_writer.close()

        model_to_save = model.module if hasattr(
            model, 'module'
        ) else model  # Take care of distributed/parallel training  # noqa
        model_to_save.save_pretrained(workdir)
        tokenizer.save_pretrained(workdir)

        return {
            'model_path': workdir,
            'batch_size': batch_size,
            'maxlen': maxlen,
            'pretrained_model': pretrained_model,
            'labels': self.labels
        }
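
The classifier fit above depends on two helpers that are not shown: prepare_texts, which tokenizes the inputs and wraps them in a DataLoader with the given sampler, and self.reset_model, which loads a fresh classification head. A rough sketch of prepare_texts under those assumptions (a hypothetical reimplementation matching the call site above, not the original code):

import torch
from torch.utils.data import DataLoader, TensorDataset

def prepare_texts(texts, tokenizer, maxlen, sampler_class, batch_size, labels_idx=None):
    # Hypothetical sketch: encode texts to fixed-length tensors and wrap them
    # in a DataLoader using the supplied sampler class (e.g. RandomSampler).
    encoded = tokenizer(list(texts),
                        max_length=maxlen,
                        padding='max_length',
                        truncation=True,
                        return_tensors='pt')
    # Order matches the batch unpacking in fit: input_ids, attention_mask, labels.
    tensors = [encoded['input_ids'], encoded['attention_mask']]
    if labels_idx is not None:
        tensors.append(torch.tensor(labels_idx, dtype=torch.long))
    dataset = TensorDataset(*tensors)
    return DataLoader(dataset,
                      sampler=sampler_class(dataset),
                      batch_size=batch_size)

And a matching sketch of reset_model, assumed to live on the same class as fit (the num_labels choice is an assumption):

    def reset_model(self, pretrained_model, cache_dir, device):
        # Hypothetical sketch: load a fresh sequence classification head sized
        # to the current label set and move it to the target device.
        from transformers import BertForSequenceClassification
        model = BertForSequenceClassification.from_pretrained(
            pretrained_model, cache_dir=cache_dir, num_labels=len(self.labels))
        return model.to(device)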