    def predict(self, tasks, **kwargs):
        texts = [task['data'][self.value] for task in tasks]
        predict_dataloader = prepare_texts(texts, self.tokenizer, self.maxlen,
                                           SequentialSampler, self.batch_size)

        pred_labels, pred_scores = [], []
        for batch in predict_dataloader:
            batch = tuple(t.to(device) for t in batch)
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
            with torch.no_grad():
                outputs = self.model(**inputs)
                logits = outputs[0]

            batch_preds = logits.detach().cpu().numpy()

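            # softmax is monotonic, so argmax over the raw logits already
            # gives the predicted class index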
            argmax_batch_preds = np.argmax(batch_preds, axis=-1)
            pred_labels.extend(str(self.labels[i]) for i in argmax_batch_preds)

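            # the max logit serves as a rough confidence score
            # (note: not a normalized probability)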
            max_batch_preds = np.max(batch_preds, axis=-1)
            pred_scores.extend(float(s) for s in max_batch_preds)

        predictions = []
        for predicted_label, score in zip(pred_labels, pred_scores):
            result = [{
                'from_name': self.from_name,
                'to_name': self.to_name,
                'type': 'choices',
                'value': {
                    'choices': [predicted_label]
                }
            }]

            predictions.append({'result': result, 'score': score})
        return predictions

    def fit(self,
            completions,
            workdir=None,
            cache_dir=None,
            pretrained_model='bert-base-multilingual-cased',
            maxlen=64,
            batch_size=32,
            num_epochs=100,
            logging_steps=1,
            train_logs=None,
            **kwargs):
        input_texts = []
        output_labels, output_labels_idx = [], []
        label2idx = {l: i for i, l in enumerate(self.labels)}
        for completion in completions:
            # skip tasks that the annotator skipped
            if completion['completions'][0].get('skipped'):
                continue

            # get input text from task data
            input_text = completion['data'][self.value]
            input_texts.append(input_text)

            # get an annotation
            output_label = completion['completions'][0]['result'][0]['value'][
                'choices'][0]
            output_labels.append(output_label)
            output_label_idx = label2idx[output_label]
            output_labels_idx.append(output_label_idx)

        new_labels = set(output_labels)
        added_labels = new_labels - set(self.labels)
        if len(added_labels) > 0:
            print('Label set has changed; new labels: ' +
                  str(list(added_labels)))
            self.labels = list(sorted(new_labels))
            label2idx = {l: i for i, l in enumerate(self.labels)}
            output_labels_idx = [label2idx[label] for label in output_labels]

        tokenizer = BertTokenizer.from_pretrained(pretrained_model,
                                                  cache_dir=cache_dir)

        train_dataloader = prepare_texts(input_texts, tokenizer, maxlen,
                                         RandomSampler, batch_size,
                                         output_labels_idx)
        model = self.reset_model(pretrained_model, cache_dir, device)

        total_steps = len(train_dataloader) * num_epochs
        optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        global_step = 0
        total_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(num_epochs, desc='Epoch')
        if train_logs:
            tb_writer = SummaryWriter(
                logdir=os.path.join(train_logs, os.path.basename(workdir)))
        else:
            tb_writer = None
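        # rolling window of recent losses, used for slope-based early stopping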
        loss_queue = deque(maxlen=10)
        for epoch in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc='Iteration')
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[2]
                }
                outputs = model(**inputs)
                loss = outputs[0]
                loss.backward()
                total_loss += loss.item()

                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if global_step % logging_steps == 0:
                    last_loss = (total_loss - logging_loss) / logging_steps
                    loss_queue.append(last_loss)
                    if tb_writer:
                        tb_writer.add_scalar('lr',
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar('loss', last_loss, global_step)
                    logging_loss = total_loss

            # slope-based early stopping
            if len(loss_queue) == loss_queue.maxlen:
                slope = calc_slope(loss_queue)
                if tb_writer:
                    tb_writer.add_scalar('slope', slope, global_step)
                if abs(slope) < 1e-2:
                    break

        if tb_writer:
            tb_writer.close()

        # take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(workdir)
        tokenizer.save_pretrained(workdir)

        return {
            'model_path': workdir,
            'batch_size': batch_size,
            'maxlen': maxlen,
            'pretrained_model': pretrained_model,
            'labels': self.labels
        }
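
The methods above lean on two helpers that are defined elsewhere in the module: prepare_texts, which turns raw strings into a DataLoader, and calc_slope, which the early-stopping check applies to the loss queue. A minimal sketch of both, inferred from the call sites; the encode_plus arguments assume a reasonably recent transformers release, so treat the exact padding/truncation flags as an assumption rather than the original implementation.

import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset


def prepare_texts(texts, tokenizer, maxlen, sampler_class, batch_size, labels=None):
    # tokenize and pad every text to a fixed length
    input_ids, attention_masks = [], []
    for text in texts:
        encoded = tokenizer.encode_plus(text,
                                        max_length=maxlen,
                                        padding='max_length',
                                        truncation=True,
                                        return_attention_mask=True)
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    tensors = [torch.tensor(input_ids), torch.tensor(attention_masks)]
    if labels is not None:
        tensors.append(torch.tensor(labels))
    dataset = TensorDataset(*tensors)
    return DataLoader(dataset, sampler=sampler_class(dataset), batch_size=batch_size)


def calc_slope(values):
    # least-squares slope of the recent losses; a near-zero slope means
    # the loss curve has flattened and training can stop early
    x = np.arange(len(values))
    slope, _ = np.polyfit(x, np.asarray(values, dtype=float), deg=1)
    return slope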
Example #3
from utils import load_data, prepare_texts, query_for_answers
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Client for querying a Docker image hosted on a '
                    'specified server')
    parser.add_argument('test_path', type=str, help='path to the test.txt file')
    parser.add_argument('-s',
                        '--separator',
                        default=';',
                        help='separator used in the test.txt file')
    parser.add_argument('-u',
                        '--server_url',
                        default='http://35.234.121.157:8501/v1/models/cnn:predict',
                        help='address of the GCP Kubernetes cluster hosting the '
                             'Docker image and trained model')

    args = parser.parse_args()
    texts, labels = load_data(filename=args.test_path,
                              separator=args.separator)
    texts = prepare_texts(texts)
    print('Connecting to GCP...')
    SERVER_URL = args.server_url
    acc = query_for_answers(texts, labels, SERVER_URL)
    print('Accuracy on the test set:')
    print(acc)
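
Note that the prepare_texts imported here takes a single argument and comes from a different utils module than the helper sketched earlier, despite the shared name. The query_for_answers implementation is not shown either; since the default URL follows the TensorFlow Serving REST convention (/v1/models/cnn:predict), a plausible sketch is below. The exact payload shape and the assumption that labels are integer class indices are inferred from that convention, not from the original utils module.

import json

import numpy as np
import requests


def query_for_answers(texts, labels, server_url):
    # TF Serving's REST API expects {"instances": [...]} in the request body
    # and answers with {"predictions": [...]}
    payload = json.dumps({'instances': np.asarray(texts).tolist()})
    response = requests.post(server_url,
                             data=payload,
                             headers={'content-type': 'application/json'})
    response.raise_for_status()
    # assumes labels are integer class indices matching the model's outputs
    predicted = np.argmax(response.json()['predictions'], axis=-1)
    return float(np.mean(predicted == np.asarray(labels)))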