Example #1
    def predict(self, X, device=None, path=None):

        logging.info('')
        logging.info('=' * 72)
        logging.info("Predict")
        logging.info('=' * 72)

        # Get/set device
        set_model_device(self, device)

        # Configure evaluation mode
        self.eval()

        # Create data set
        dataset = self.dataset_class( \
                                X = X,
                                pretrained = self.pretrained,
                                device = device,
                                doc_definition = self.doc_definition,
                                sent_definition = self.sent_definition,
                                max_length = self.max_length,
                                max_sent_count = self.max_sent_count,
                                linebreak_bound = self.linebreak_bound,
                                keep_ws = self.keep_ws)

        # Create data loader
        dataloader = DataLoader(dataset,  \
                                shuffle = False,
                                batch_size = self.batch_size)

        # deactivate autograd
        with torch.no_grad():

            # One pbar update per mini-batch, so the total is len(dataloader)
            pbar = tqdm(total=len(dataloader))
            y = []
            for i, (input_ids, attention_mask) in enumerate(dataloader):

                verbose = False

                # Push data through model
                doc_scores, sent_scores = self(input_ids,
                                               attention_mask,
                                               verbose=verbose)


                y_batch = dataset.postprocess_y( \
                                    attention_mask  = attention_mask,
                                    doc_scores = doc_scores,
                                    sent_scores = sent_scores,
                                    )
                y.extend(y_batch)

                pbar.update()
            pbar.close()

        if path is not None:
            f = os.path.join(path, PREDICTIONS_FILE)
            joblib.dump(y, f)

        return y
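
A minimal usage sketch for the predict method above. The class that owns it is not shown in the snippet, so the class name and constructor arguments below are assumptions; what is grounded in the code is that X is the raw document input handed to dataset_class and that passing path makes predict save the results with joblib.

# Hypothetical usage; DocumentClassifier and its constructor arguments are
# assumed, only the predict() signature comes from the snippet above.
model = DocumentClassifier(pretrained='bert-base-uncased')
y_pred = model.predict(X_test, device='cuda:0', path='./output')
# With path set, predict() also writes the predictions via joblib.dump
# to os.path.join(path, PREDICTIONS_FILE).
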
Example #2
    def predict(self, X, device=None, path=None):

        logging.info('')
        logging.info('=' * 72)
        logging.info("Predict")
        logging.info('=' * 72)

        # Get/set device
        set_model_device(self, device)

        # Configure evaluation mode
        self.eval()

        # Set number of cores
        torch.set_num_threads(self.num_workers)

        # Create data set
        dataset = self.dataset_class(X, **self.dataset_params, device=device)

        # Create data loader
        dataloader = DataLoader(dataset,
                                shuffle=False,
                                **self.dataloader_params)

        # One pbar update per mini-batch, so the total is len(dataloader)
        pbar = tqdm(total=len(dataloader))
        y = []
        for i, (indices, seq_tensor, seq_mask, span_indices,
                span_mask) in enumerate(dataloader):

            verbose = False

            # Push data through model
            out = self( \
                            seq_tensor = seq_tensor,
                            seq_mask = seq_mask,
                            span_indices = span_indices,
                            span_mask = span_mask,
                            verbose = verbose)

            y_batch = dataset.postprocess_y( \
                                indices = indices,
                                span_scores = out["span_scores"],
                                span_mask = span_mask,
                                role_scores = out["top_role_scores"],
                                role_span_mask = out["top_span_mask"],
                                role_indices = out["top_indices"],
                                )
            y.extend(y_batch)

            pbar.update()
        pbar.close()

        return y
Example #3
def encode_documents(encoded_dict, pretrained, \
            word_pieces_keep = None,
            device = None,
            train = False,
            detach = True,
            move_to_cpu = True,
            max_length = None):

    model = AutoModel.from_pretrained(pretrained)

    if train:
        model.train()
    else:
        model.eval()
    set_model_device(model, device)

    if word_pieces_keep is None:
        word_pieces_keep = [None for _ in encoded_dict]
    assert len(word_pieces_keep) == len(encoded_dict)

    logging.info("Encoding documents...")

    X = []
    mask = []
    pbar = tqdm(total=len(encoded_dict))
    for encoded, wp_keep in zip(encoded_dict, word_pieces_keep):
        x, m = encode_document( \
                    encoded_dict = encoded,
                    model = model,
                    word_pieces_keep = wp_keep,
                    device = device,
                    detach = detach,
                    move_to_cpu = move_to_cpu,
                    max_length = max_length)

        X.append(x)
        mask.append(m)

        pbar.update()

    pbar.close()

    return (X, mask)
Example #4
    def fit(self, X, y, device=None, path=None, shuffle=True):

        logging.info('')
        logging.info('=' * 72)
        logging.info("Fit")
        logging.info('=' * 72)

        # Get/set device
        set_model_device(self, device)

        # Configure training mode
        self.train()

        # Set number of cores
        torch.set_num_threads(self.num_workers)

        # Create data set
        dataset = self.dataset_class(X,
                                     y=y,
                                     **self.dataset_params,
                                     device=device)

        # Create data loader
        dataloader = DataLoader(dataset,
                                shuffle=shuffle,
                                **self.dataloader_params)

        # Create optimizer
        optimizer = optim.Adam(self.parameters(), **self.optimizer_params)

        # Create loss plotter
        plotter = PlotLoss(path=path)

        # Create prf aggregator
        prf_agg = PRFAggregator()

        # Loop on epochs
        pbar = tqdm(total=self.num_epochs)
        for j in range(self.num_epochs):

            loss_epoch = 0
            losses_epoch = OrderedDict()
            prf = []

            # Loop on mini-batches
            for i, (indices, seq_tensor, seq_mask, span_indices, span_mask,
                    y_true) in enumerate(dataloader):

                verbose = False  #(i == 0) and (j == 0)

                # Reset gradients
                self.zero_grad()

                y_pred = self( \
                                seq_tensor = seq_tensor,
                                seq_mask = seq_mask,
                                span_indices = span_indices,
                                span_mask = span_mask,
                                verbose = verbose)

                loss, loss_dict = self.loss(y_true, y_pred)

                plotter.update_batch(loss, loss_dict)

                prf_agg.update_counts(self.perf_counts(y_true, y_pred))

                # Backprop loss
                loss.backward()

                loss_epoch += loss.item()
                for k, v in loss_dict.items():
                    if i == 0:
                        losses_epoch[k] = v.item()
                    else:
                        losses_epoch[k] += v.item()

                # Clip loss
                clip_grad_norm_(self.parameters(),
                                self.hyperparams['grad_max_norm'])

                # Update
                optimizer.step()

            plotter.update_epoch(loss_epoch, losses_epoch)

            msg = []
            msg.append('epoch={}'.format(j))
            msg.append('{}={:.1e}'.format('Total', loss_epoch))
            for k, ls in losses_epoch.items():
                msg.append('{}={:.1e}'.format(k, ls))

            msg.append(prf_agg.prf())
            prf_agg.reset()

            msg = ", ".join(msg)

            pbar.set_description(desc=msg)
            pbar.update()
            print()

        pbar.close()

        return True
Example #5
    def predict(self, X, device=None, path=None, return_prob=False):

        logging.info('')
        logging.info('='*72)
        logging.info("Predict")
        logging.info('='*72)

        # Get/set device
        set_model_device(self, device)

        # Configure evaluation mode
        self.eval()

        # Set number of cores
        torch.set_num_threads(self.num_workers)

        # Create data set
        dataset = self.dataset_class(X, **self.dataset_params, device=device)

        # Create data loader
        dataloader = DataLoader(dataset, shuffle=False, **self.dataloader_params)

        # One pbar update per mini-batch, so the total is len(dataloader)
        pbar = tqdm(total=len(dataloader))
        y = []
        #for i, (doc_indices, seq_tensor, seq_mask, span_indices, span_mask) in enumerate(dataloader):
        for i, (doc_indices, seq_tensor, seq_mask) in enumerate(dataloader):

            verbose = False

            # Push data through model
            #out = self(seq_tensor, seq_mask, span_indices, span_mask, verbose=verbose)
            out = self(seq_tensor, seq_mask, verbose=verbose)


            if return_prob:
                y_batch = dataset.postprocess_y_prob( \
                                doc_indices = doc_indices,
                                seq_mask  = seq_mask,
                                doc_scores = out["doc_scores"]
                                )
            else:
                y_batch = dataset.postprocess_y( \
                                doc_indices = doc_indices,
                                seq_mask  = seq_mask,
                                doc_scores = out["doc_scores"],
                                sent_scores = out["sent_scores"],
                                #span_scores = out["span_scores"],
                                #span_mask = span_mask,
                                #role_scores = out["top_role_scores"],
                                #role_span_mask = out["top_span_mask"],
                                #role_indices = out["top_indices"],
                                )
            y.extend(y_batch)

            pbar.update()
        pbar.close()


        if path is not None:
            f = os.path.join(path, PREDICTIONS_FILE)
            joblib.dump(y, f)

        return y
Example #6
    def fit(self, X, y, device=None, path=None, shuffle=True):
        '''
        Parameters
        ----------
        X: documents as list of strings [doc [str]]
        y: labels as list of dictionaries
        '''

        logging.info('')
        logging.info('=' * 72)
        logging.info("Fit")
        logging.info('=' * 72)

        # Get/set device
        set_model_device(self, device)

        # Configure training mode
        self.train()

        # Create data set
        dataset = self.dataset_class( \
                                X = X,
                                y = y,
                                pretrained = self.pretrained,
                                device = device,
                                doc_definition = self.doc_definition,
                                sent_definition = self.sent_definition,
                                max_length = self.max_length,
                                max_sent_count = self.max_sent_count,
                                linebreak_bound = self.linebreak_bound,
                                keep_ws = self.keep_ws)

        # Create data loader
        dataloader = DataLoader(dataset,  \
                                shuffle = shuffle,
                                batch_size = self.batch_size)

        # Create optimizer
        '''
        https://github.com/huggingface/transformers/issues/657

        pretrained = model.bert.parameters()
        # Get names of pretrained parameters (including `bert.` prefix)
        pretrained_names = [f'bert.{k}' for (k, v) in model.bert.named_parameters()]

        new_params = [v for k, v in model.named_parameters() if k not in pretrained_names]

        optimizer = AdamW(
            [{'params': pretrained}, {'params': new_params, 'lr': learning_rate * 10}],
            lr=learning_rate,
        )
        '''

        if self.lr_ratio == 1:
            optimizer = AdamW(self.parameters(), lr=self.lr)
        else:
            pretrained = self.bert.parameters()
            pretrained_names = [
                f'bert.{k}' for (k, v) in self.bert.named_parameters()
            ]
            new_params = [
                v for k, v in self.named_parameters()
                if k not in pretrained_names
            ]
            optimizer = AdamW(
                [{'params': pretrained},
                 {'params': new_params, 'lr': self.lr * self.lr_ratio}],
                lr=self.lr)
        # define cross entropy
        #cross_entropy  = nn.NLLLoss(reduction=self.loss_reduction)

        # Create loss plotter
        plotter = PlotLoss(path=path)

        # Create prf aggregator
        prf_agg = PRFAggregator()

        # Loop on epochs
        pbar = tqdm(total=self.num_epochs)
        for j in range(self.num_epochs):

            loss_epoch = 0
            losses_epoch = OrderedDict()
            prf = []

            # Loop on mini-batches
            for i, (input_ids, attention_mask, doc_labels,
                    sent_labels) in enumerate(dataloader):

                verbose = False  #(i == 0) and (j == 0)

                # Reset gradients
                self.zero_grad()

                doc_scores, sent_scores = self(input_ids,
                                               attention_mask,
                                               verbose=verbose)

                loss_dict = OrderedDict()
                for k in doc_labels:
                    loss_dict[f"doc_{k[0:3]}"] = F.cross_entropy( \
                                                    input = doc_scores[k],
                                                    target = doc_labels[k],
                                                    reduction = self.loss_reduction)

                if self.use_sent_objective:
                    for k in doc_labels:
                        ls = []
                        for t in sent_labels[k]:
                            scores = sent_scores[k][t]
                            labels = sent_labels[k][t]

                            doc_count, sent_count, _ = tuple(scores.shape)

                            scores = scores.view(doc_count * sent_count, -1)
                            labels = labels.view(doc_count * sent_count)

                            l = F.cross_entropy( \
                                input = scores,
                                target = labels,
                                reduction = self.loss_reduction)
                            ls.append(l)
                        ls = aggregate(torch.stack(ls), self.loss_reduction)
                        loss_dict[f"sent_{k[0:3]}"] = ls

                loss = [v for k, v in loss_dict.items() if v is not None]
                loss = aggregate(torch.stack(loss), self.loss_reduction)

                plotter.update_batch(loss, loss_dict)

                #prf_agg.update_counts(self.perf_counts(y_true, y_pred))

                # Backprop loss
                loss.backward()

                loss_epoch += loss.item()
                for k, v in loss_dict.items():
                    if i == 0:
                        losses_epoch[k] = v.item()
                    else:
                        losses_epoch[k] += v.item()

                # Clip loss
                clip_grad_norm_(self.parameters(), self.grad_max_norm)

                # Update
                optimizer.step()

            plotter.update_epoch(loss_epoch, losses_epoch)

            msg = []
            msg.append('epoch={}'.format(j))
            msg.append('{}={:.1e}'.format('Total', loss_epoch))
            for k, ls in losses_epoch.items():
                msg.append('{}={:.1e}'.format(k, ls))

            #msg.append(prf_agg.prf())
            #prf_agg.reset()

            msg = ", ".join(msg)
            pbar.set_description(desc=msg)
            pbar.update()

        pbar.close()

        return True
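
A hedged sketch of calling the fit method above, based on its docstring (X as a list of document strings, y as a list of label dictionaries). The label keys and values are placeholders; the real schema comes from doc_definition and sent_definition, which are not shown in the snippet.

# Hypothetical inputs; the label keys are placeholders for whatever
# doc_definition defines, they are not taken from the snippet.
X = ["First document text ...", "Second document text ..."]
y = [{"label_a": 0, "label_b": 1},
     {"label_a": 2, "label_b": 0}]
model.fit(X, y, device='cuda:0', path='./checkpoints', shuffle=True)
# When lr_ratio != 1, AdamW gives parameters outside self.bert a learning
# rate of lr * lr_ratio, while the pretrained BERT weights keep lr.
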
Example #7
def encode_documents(input_ids, mask, \
    pretrained=PRETRAINED,
    device=None,
    train=False):


    logging.info("Embedding using AutoModel")

    model = AutoModel.from_pretrained(pretrained)

    if train:
        model.train()
    else:
        model.eval()


    set_model_device(model, device)

    X = []
    masks = []
    pbar = tqdm(total=len(input_ids))
    assert len(input_ids) == len(mask)
    for i, (ids, msk) in enumerate(zip(input_ids, mask)):


        ids = set_tensor_device(ids, device)
        msk = set_tensor_device(msk, device)

        x = model( \
            ids,
            token_type_ids=None,
            attention_mask=msk)[0]

        x = x.cpu().detach()
        X.append(x)

        if i == 1:

            logging.info("Encode documents")

            #logging.info("-"*80)

            #logging.info("")
            #logging.info('IDs: {}\n{}'.format(ids.shape, ids))
            logging.info('IDs: {}'.format(ids.shape))

            #logging.info("")
            #logging.info('Mask: {}\n{}'.format(msk.shape, msk))
            logging.info('Mask: {}'.format(msk.shape))

            #logging.info("")
            #logging.info('X: {}\n{}'.format(x.shape, x))
            logging.info('X: {}'.format(x.shape))
            logging.info('')
            #logging.info("")
            #logging.info("-"*80)

        # Advance the progress bar once per document (inside the loop)
        pbar.update()
    pbar.close()

    logging.info("")
    logging.info('Document count: {}'.format(len(X)))
    logging.info("")

    return X
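
A sketch of one way to prepare the input_ids and mask that the encode_documents function above expects: one (1, seq_len) tensor pair per document. The tokenizer call is an assumption about how those inputs were built; only the encode_documents signature and its per-document model call come from the snippet.

from transformers import AutoTokenizer

# Hypothetical preparation of the per-document inputs (assumed, not from
# the snippet): each document becomes one (1, seq_len) input_ids/mask pair.
docs = ["First document text.", "Second document text."]
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)

input_ids, mask = [], []
for doc in docs:
    enc = tokenizer(doc, truncation=True, max_length=512, return_tensors='pt')
    input_ids.append(enc['input_ids'])
    mask.append(enc['attention_mask'])

X = encode_documents(input_ids, mask, pretrained=PRETRAINED)
# X is a list with one detached CPU tensor of shape (1, seq_len, hidden) per document.
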