Example #1
    def train_batch(self, train_data):
        wl = self.args.vocab.wl
        cl = self.args.vocab.cl
        clip_rate = self.args.clip

        batch_size = self.args.batch_size
        num_train = len(train_data)
        total_batch = num_train // batch_size + 1
        prog = Progbar(target=total_batch)
        ## set model in train mode
        self.model.train()
        train_loss = []
        for i, (words, label_ids) in enumerate(
                self.args.vocab.minibatches(train_data,
                                            batch_size=batch_size)):
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids,
                                                              pad_tok=0,
                                                              wthres=wl,
                                                              cthres=cl)
            char_ids, word_lengths = seqPAD.pad_sequences(char_ids,
                                                          pad_tok=0,
                                                          nlevels=2,
                                                          wthres=wl,
                                                          cthres=cl)
            label_ids, _ = seqPAD.pad_sequences(label_ids,
                                                pad_tok=0,
                                                wthres=wl,
                                                cthres=cl)

            data_tensors = Data2tensor.sort_tensors(label_ids, word_ids,
                                                    sequence_lengths, char_ids,
                                                    word_lengths)
            label_tensor, word_tensor, sequence_lengths, word_seq_recover, char_tensor, word_lengths, char_seq_recover = data_tensors
            mask_tensor = word_tensor > 0

            label_score = self.model(word_tensor, sequence_lengths,
                                     char_tensor, word_lengths,
                                     char_seq_recover)

            batch_loss = self.model.NLL_loss(label_score, mask_tensor,
                                             label_tensor)

            train_loss.append(batch_loss.item())
            self.model.zero_grad()
            batch_loss.backward()
            if clip_rate > 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               clip_rate)
            self.optimizer.step()

            prog.update(i + 1, [("Train loss", batch_loss.item())])
        return np.mean(train_loss)
    def evaluate_batch(self, eva_data):
        with torch.no_grad():
            wl = self.args.vocab.wl
            batch_size = self.args.batch_size
            ## set model in eval mode
            self.model.eval()
            start = time.time()
            y_true = Data2tensor.idx2tensor([], self.device)
            y_pred = Data2tensor.idx2tensor([], self.device)
            for i, (words, label_ids) in enumerate(
                    self.args.vocab.minibatches(eva_data,
                                                batch_size=batch_size)):
                word_ids, sequence_lengths = seqPAD.pad_sequences(words,
                                                                  pad_tok=0,
                                                                  wthres=wl)

                data_tensors = Data2tensor.sort_tensors(
                    label_ids, word_ids, sequence_lengths, self.device)
                label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors

                y_true = torch.cat([y_true, label_tensor])
                label_score = self.model(word_tensor, sequence_lengths)
                label_prob, label_pred = self.model.inference(label_score, k=1)

                y_pred = torch.cat([y_pred, label_pred])
            #measures = Classifier.class_metrics(y_true, y_pred.squeeze())
            measures = Classifier.class_metrics(
                y_true.data.cpu().numpy(),
                y_pred.squeeze().data.cpu().numpy())

            end = time.time() - start
            speed = len(y_true) / end
        return measures, speed
    def predict(self, sent, k=1):
        """

        :param sent: processed sentence
        :param asp: an aspect mentioned inside sent
        :param k: int
        :return: top k predictions
        """
        wl = self.args.vocab.wl
        ## set model in eval mode
        self.model.eval()

        fake_label = [0]
        words = self.word2idx(sent)
        word_ids, sequence_lengths = seqPAD.pad_sequences([words],
                                                          pad_tok=0,
                                                          wthres=wl)

        data_tensors = Data2tensor.sort_tensors(fake_label, word_ids,
                                                sequence_lengths, self.device)
        fake_label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors

        label_score = self.model(word_tensor, sequence_lengths)
        label_prob, label_pred = self.model.inference(label_score, k)
        return label_prob, label_pred
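The examples on this page lean heavily on seqPAD.pad_sequences, whose source is not shown. Below is a minimal sketch of the single-level (word) case, under the assumption that wthres caps the padded length and that the original lengths are returned alongside the padded batch; the repository's version also supports nlevels=2 for character ids and a cthres cap, which this sketch omits.

def pad_sequences(sequences, pad_tok=0, wthres=-1):
    """Pad a batch of id lists to a common length (a sketch of the assumed behaviour)."""
    max_len = max(len(seq) for seq in sequences)
    if wthres > 0:
        max_len = min(max_len, wthres)       # cap at the word-length threshold
    padded, lengths = [], []
    for seq in sequences:
        seq = list(seq)[:max_len]
        lengths.append(len(seq))
        padded.append(seq + [pad_tok] * (max_len - len(seq)))
    return padded, lengths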
Example #4
    def train_batch(self, train_data):
        wl = self.args.vocab.wl
        clip_rate = self.args.clip
        
        batch_size = self.args.batch_size
        num_train = len(train_data)
        total_batch = num_train//batch_size+1
        prog = Progbar(target=total_batch)
        ## set model in train mode
        self.model.train()
        train_loss = []
        for i, (words, label_ids) in enumerate(self.args.vocab.minibatches(train_data, batch_size=batch_size)):
            word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0, wthres=wl)

            data_tensors = Data2tensor.sort_tensors(label_ids, word_ids, sequence_lengths, self.device)
            label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors

            self.model.zero_grad()
            label_score = self.model(word_tensor, sequence_lengths)
            # print("inside training batch, ", label_score.size(), label_tensor.size(), label_score, label_tensor)
            batch_loss = self.model.NLL_loss(label_score, label_tensor)
            train_loss.append(batch_loss.item())
            
            batch_loss.backward()
            
            if clip_rate > 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_rate)
                
            self.optimizer.step()
            
            prog.update(i + 1, [("Train loss", batch_loss.item())])
        return np.mean(train_loss)
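For orientation, a minimal sketch of the outer loop that could drive a train_batch method like the one above. The trainer object, the dev-set evaluation call, the epoch budget, the patience value, and the checkpoint path are all illustrative assumptions, not taken from the source.

import torch

best_metric, patience = 0.0, 0
for epoch in range(50):                              # epoch budget is illustrative
    train_loss = trainer.train_batch(train_data)     # mean NLL over the epoch
    dev_metric = evaluate(trainer, dev_data)         # e.g. accuracy or F1 from an evaluate_batch variant
    if dev_metric > best_metric:
        best_metric, patience = dev_metric, 0
        torch.save(trainer.model.state_dict(), "best_model.pt")
    else:
        patience += 1
        if patience >= 5:                            # simple early stopping
            break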
Example #5
    def evaluate_batch(self, eval_data):
        start_time = time.time()
        eval_batch = self.args.vocab.minibatches(
            eval_data, batch_size=self.args.batch_size)
        # Turn on evaluation mode which disables dropout.
        self.model.eval()
        total_loss = 0.
        total_word = 0
        with torch.no_grad():
            for seq_batch in eval_batch:
                word_pad_ids, seq_lens = seqPAD.pad_sequences(
                    seq_batch, pad_tok=self.args.vocab.w2i[PAD])
                seq_tensor = Data2tensor.idx2tensor(word_pad_ids, self.device)
                hidden = self.model.init_hidden(seq_tensor.size(0))
                for i in range(0, seq_tensor.size(1) - 1, self.args.bptt):
                    data, target = self.bptt_batch(seq_tensor, i)
                    mask_target = target > 0
                    output, hidden = self.model(data, hidden)
                    batch_loss = self.model.NLL_loss(output, target)
                    total_loss += batch_loss.item()
                    hidden = self.repackage_hidden(hidden)
                    total_word = total_word + mask_target.sum().item()

        cur_loss = total_loss / total_word
        elapsed = time.time() - start_time
        print('-' * 89)
        print('| EVALUATION | words {:5d} | lr {:02.2f} | words/s {:5.2f} | '
              'loss {:5.2f} | ppl {:8.2f}'.format(total_word, self.args.lr,
                                                  total_word / elapsed,
                                                  cur_loss,
                                                  math.exp(cur_loss)))
        print('-' * 89)
        return cur_loss, total_word, elapsed
Example #6
    def predict(self, sent):
        numtags = len(self.args.vocab.l2i)
        wl = self.args.vocab.wl
        cl = self.args.vocab.cl
        ## set model in eval mode
        self.model.eval()

        words = self.word2idx(sent)
        char_ids, word_ids = zip(*words)
        fake_label = [[0] * len(word_ids)]

        word_ids, sequence_lengths = seqPAD.pad_sequences([word_ids],
                                                          pad_tok=0,
                                                          wthres=wl,
                                                          cthres=cl)
        char_ids, word_lengths = seqPAD.pad_sequences([char_ids],
                                                      pad_tok=0,
                                                      nlevels=2,
                                                      wthres=wl,
                                                      cthres=cl)

        data_tensors = Data2tensor.sort_tensors(fake_label,
                                                word_ids,
                                                sequence_lengths,
                                                char_ids,
                                                word_lengths,
                                                volatile_flag=True)
        fake_label_tensor, word_tensor, sequence_lengths, word_seq_recover, char_tensor, word_lengths, char_seq_recover = data_tensors
        label_score = self.model(word_tensor, sequence_lengths, char_tensor,
                                 word_lengths, char_seq_recover)

        if numtags > 2:
            label_prob, label_pred = label_score.data.max(1)
        else:
            label_prob = torch.sigmoid(label_score.squeeze())
            label_pred = (label_prob >= 0.5).data.long()
        return label_prob, label_pred
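The branch at the end of this example can be read in isolation. Here is a hedged, standalone restatement of that decision rule, assuming label_score is [batch, num_tags] in the multiclass case and a single raw score in the binary case; it is a sketch, not the repository's code.

import torch

def decide(label_score, numtags):
    """Argmax for multiclass scores, sigmoid plus a 0.5 threshold for a single binary score."""
    if numtags > 2:
        label_prob, label_pred = label_score.max(dim=1)
        return label_prob, label_pred
    label_prob = torch.sigmoid(label_score.squeeze())
    label_pred = (label_prob >= 0.5).long()
    return label_prob, label_pred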
Example #7
    def predict(self, sent, k=1):
        cl = self.args.vocab.cl
        ## set model in eval mode
        self.model.eval()

        fake_label = [0]
        words = self.word2idx(sent)
        word_ids, sequence_lengths = seqPAD.pad_sequences([words], pad_tok=0, wthres=cl)

        data_tensors = Data2tensor.sort_tensors(fake_label, word_ids,
                                                sequence_lengths,
                                                volatile_flag=True)
        fake_label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
        label_score = self.model(word_tensor, sequence_lengths)
        label_prob, label_pred = self.model.inference(label_score, k)
        return label_prob, label_pred
Example #8
    def train_batch(self, train_data, epoch=0):
        total_loss = 0.
        total_word = 0
        total_seq = 0
        start_time = time.time()
        train_batch = self.args.vocab.minibatches(
            train_data, batch_size=self.args.batch_size)
        # Turn on training mode which enables dropout.
        self.model.train()
        for batch, seq_batch in enumerate(train_batch):
            word_pad_ids, seq_lens = seqPAD.pad_sequences(
                seq_batch, pad_tok=self.args.vocab.w2i[PAD])
            seq_tensor = Data2tensor.idx2tensor(word_pad_ids, self.device)
            # seq_tensor = [batch_size, seq_len]
            total_seq += seq_tensor.size(0)
            hidden = self.model.init_hidden(seq_tensor.size(0))
            for i in range(0, seq_tensor.size(1) - 1, self.args.bptt):
                # data = [batch_size, bptt]
                # target = [batch_size, bptt]
                data, target = self.bptt_batch(seq_tensor, i)
                mask_target = target > 0
                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                hidden = self.repackage_hidden(hidden)
                self.model.zero_grad()
                output, hidden = self.model(data, hidden)
                loss = self.model.NLL_loss(output, target)
                loss.backward()

                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.args.clip)
                for p in self.model.parameters():
                    p.data.add_(p.grad.data, alpha=-self.args.lr)

                total_loss += loss.item()
                total_word = total_word + mask_target.sum().item()

            cur_loss = total_loss / total_word
            elapsed = time.time() - start_time
            print('-' * 89)
            print(
                '| TRAINING | epoch {:3d} | batch {:5d} | sequences {:5d} | words {:5d} | lr {:02.2f} | '
                'words/s {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch + 1, total_seq, total_word, self.args.lr,
                    total_word / elapsed, cur_loss, math.exp(cur_loss)))
            print('-' * 89)
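The truncated-BPTT loops in Example #5 and Example #8 depend on two helpers, repackage_hidden and bptt_batch, that are not shown on this page. A minimal sketch under the assumptions implied by the comments above (hidden is a tensor or a tuple of tensors, seq_tensor is [batch_size, seq_len], and the window size mirrors self.args.bptt); the repository's own implementation may differ.

import torch

def repackage_hidden(hidden):
    """Detach hidden states from the graph so backprop stops at the batch boundary."""
    if isinstance(hidden, torch.Tensor):
        return hidden.detach()
    return tuple(repackage_hidden(h) for h in hidden)

def bptt_batch(seq_tensor, i, bptt=35):
    """Slice a [batch, seq_len] tensor into an input/target pair shifted by one step."""
    seq_len = min(bptt, seq_tensor.size(1) - 1 - i)
    data = seq_tensor[:, i:i + seq_len]
    target = seq_tensor[:, i + 1:i + 1 + seq_len]
    return data, target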
Example #9
def scoring(sent, args, classifier):
    cl = args.vocab.cl
    ## set model in eval mode
    classifier.model.eval()

    fake_label = [0]
    words = classifier.word2idx(sent)
    word_ids, sequence_lengths = seqPAD.pad_sequences([words],
                                                      pad_tok=0,
                                                      wthres=cl)

    data_tensors = Data2tensor.sort_tensors(fake_label,
                                            word_ids,
                                            sequence_lengths,
                                            volatile_flag=True)
    fake_label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
    label_score = classifier.model(word_tensor, sequence_lengths)
    #    label_prob, label_pred = classifier.model.inference(label_score)
    return label_score
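A hypothetical call to scoring(). Whether label_score holds raw logits or log-probabilities depends on the model's final layer, so both conversions below are shown as assumptions, and the input sentence is purely illustrative.

import torch

sent = "the screen is sharp but the battery drains quickly"   # illustrative input
score = scoring(sent, args, classifier)
probs_if_logits = torch.softmax(score, dim=-1)    # if the model returns raw scores
probs_if_logprobs = torch.exp(score)              # if it returns log-probabilities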
Example #10
    def evaluate_batch(self, eval_data):
        start_time = time.time()
        eval_batch = self.args.vocab.minibatches_with_label(
            eval_data, batch_size=self.args.batch_size)
        # Turn on evaluation mode which disables dropout.
        self.model.eval()
        total_loss = 0.
        total_docs = 0
        y_true, y_pred = [], []
        with torch.no_grad():
            for doc_batch, lb_batch in eval_batch:
                doc_pad_ids, doc_lengths = seqPAD.pad_sequences(
                    doc_batch, pad_tok=self.args.vocab.w2i[PAD])
                #######################
                # YOUR CODE STARTS HERE
                doc_tensor = Data2tensor.idx2tensor(doc_pad_ids, self.device)
                doc_lengths_tensor = Data2tensor.idx2tensor(
                    doc_lengths, self.device)
                lb_tensor = Data2tensor.idx2tensor(lb_batch, self.device)
                total_docs += doc_tensor.size(0)
                output, _, _ = self.model(doc_tensor, doc_lengths_tensor)
                loss = self.model.NLL_loss(output, lb_tensor)
                label_prob, label_pred = self.model.inference(output, k=1)
                #print("shape label_tensor",lb_tensor.shape)
                #print("shape label_pred",label_pred.squeeze(1).shape)
                y_true.extend(lb_tensor)
                y_pred.extend(label_pred.squeeze(1))
                total_loss += loss.item()
                # YOUR CODE ENDS HERE
                #######################

        precision, recall, f1, acc = Sentimentmodel.cal_metrics(y_true, y_pred)
        cur_loss = total_loss / total_docs
        elapsed = time.time() - start_time
        metrics = {
            "precision": precision * 100,
            "recall": recall * 100,
            "f1": f1 * 100,
            "acc": acc * 100,
            "loss": cur_loss
        }
        return metrics, total_docs, elapsed
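Sentimentmodel.cal_metrics is not shown here. A hedged sketch of what it might compute, assuming macro-averaged scores via scikit-learn; the repository's own implementation may differ.

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def cal_metrics(y_true, y_pred):
    """Macro precision/recall/F1 plus accuracy over two lists of label ids (a sketch)."""
    y_true = [int(y) for y in y_true]   # 0-dim tensors -> Python ints
    y_pred = [int(y) for y in y_pred]
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="macro", zero_division=0)
    acc = accuracy_score(y_true, y_pred)
    return precision, recall, f1, acc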
Example #11
    def train_batch(self, train_data):
        total_loss = 0.
        total_docs = 0
        start_time = time.time()
        train_batch = self.args.vocab.minibatches_with_label(
            train_data, batch_size=self.args.batch_size)
        # Turn on training mode which enables dropout.
        self.model.train()
        for batch, (doc_batch, lb_batch) in enumerate(train_batch):
            doc_pad_ids, doc_lengths = seqPAD.pad_sequences(
                doc_batch, pad_tok=self.args.vocab.w2i[PAD])
            doc_tensor = Data2tensor.idx2tensor(doc_pad_ids, self.device)
            doc_lengths_tensor = Data2tensor.idx2tensor(
                doc_lengths, self.device)
            lb_tensor = Data2tensor.idx2tensor(lb_batch, self.device)
            # doc_tensor = [batch_size, max_doc_length]
            total_docs += doc_tensor.size(0)

            self.model.zero_grad()
            output, _, _ = self.model(doc_tensor, doc_lengths_tensor)
            loss = self.model.NLL_loss(output, lb_tensor)
            avg_loss = loss / doc_tensor.size(0)
            avg_loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           self.args.clip)

            # update parameters in all sub-graphs
            self.model_optimizer.step()
            # for p in self.model.parameters():
            #     p.data.add_(p.grad.data, alpha=-self.args.lr)

            total_loss += loss.item()

        cur_loss = total_loss / total_docs
        elapsed = time.time() - start_time
        # print('-' * 89)
        # print('| TRAINING | epoch {:3d} | documents {:5d} | lr {:02.2f} | documents/s {:5.2f} | '
        #       'loss {:5.2f}'.format(epoch, total_docs, self.args.lr, total_docs / elapsed, cur_loss))
        # print('-' * 89)
        return cur_loss, total_docs, elapsed
Example #12
    def predict_null(self, sent, asp):
        wl = self.classifier.args.vocab.wl
        ## set model in eval mode
        self.classifier.model.eval()
        fake_label = [0]
        words, asp_loc = self.classifier.word2idx(sent, asp)
        word_ids, sequence_lengths = seqPAD.pad_sequences([words],
                                                          pad_tok=0,
                                                          wthres=wl)
        data_tensors = Data2tensor.sort_tensors(fake_label, [asp_loc],
                                                word_ids, sequence_lengths,
                                                self.classifier.device)
        fake_label_tensor, aspect_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
        word_h_n = self.classifier.model.rnn.get_all_hiddens(
            word_tensor, sequence_lengths).mean(1)
        label_score = self.classifier.model.hidden2tag(word_h_n)
        label_score = self.classifier.model.dropfinal(label_score)
        label_prob, label_pred = self.classifier.model.inference(
            label_score, len(self.i2l))
        return label_prob, label_pred
Example #13
def predict_null(classifier, sent, asp, i2l):
    from utils.data_utils import Data2tensor, seqPAD
    wl = classifier.args.vocab.wl
    ## set model in eval mode
    classifier.model.eval()
    fake_label = [0]
    words, asp_loc = classifier.word2idx(sent, asp)
    word_ids, sequence_lengths = seqPAD.pad_sequences([words],
                                                      pad_tok=0,
                                                      wthres=wl)
    data_tensors = Data2tensor.sort_tensors(fake_label, [asp_loc], word_ids,
                                            sequence_lengths,
                                            classifier.device)
    fake_label_tensor, aspect_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
    arange_tensor = Data2tensor.idx2tensor(list(range(word_tensor.size(0))),
                                           classifier.device)
    word_h_n = classifier.model.rnn.get_all_hiddens(word_tensor,
                                                    sequence_lengths).mean(1)
    label_score = classifier.model.hidden2tag(word_h_n)
    label_score = classifier.model.dropfinal(label_score)
    label_prob, label_pred = classifier.model.inference(label_score, len(i2l))
    return label_prob, label_pred
Example #14
    def evaluate_batch(self, eva_data):
        cl = self.args.vocab.cl

        batch_size = self.args.batch_size
        ## set model in eval mode
        self.model.eval()
        num_label = 0
        num_correct = 0
        for i, (words, label_ids) in enumerate(self.args.vocab.minibatches(eva_data, batch_size=batch_size)):
            word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0, wthres=cl)
            data_tensors = Data2tensor.sort_tensors(label_ids, word_ids,
                                                    sequence_lengths,
                                                    volatile_flag=True)
            label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors

            label_score = self.model(word_tensor, sequence_lengths)
            label_prob, label_pred = self.model.inference(label_score, k=1)

            assert len(label_pred) == len(label_tensor)
            correct_pred = (label_pred.squeeze() == label_tensor.data).sum()
            assert correct_pred <= batch_size
            num_label += len(label_tensor)
            num_correct += correct_pred.item()
        acc = num_correct / num_label
        return acc
Example #15
    def evaluate_batch(self, eva_data):
        with torch.no_grad():
            wl = self.args.vocab.wl
            batch_size = self.args.batch_size
            ## set model in eval mode
            self.model.eval()
            start = time.time()
            y_true = Data2tensor.idx2tensor([], self.device)
            y_pred = Data2tensor.idx2tensor([], self.device)
            for i, (words, asp_locs, label_ids) in enumerate(
                    self.args.vocab.minibatches(eva_data,
                                                batch_size=batch_size)):
                word_ids, sequence_lengths = seqPAD.pad_sequences(words,
                                                                  pad_tok=0,
                                                                  wthres=wl)

                data_tensors = Data2tensor.sort_tensors(
                    label_ids, asp_locs, word_ids, sequence_lengths,
                    self.device)
                label_tensor, aspect_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
                arange_tensor = Data2tensor.idx2tensor(
                    list(range(word_tensor.size(0))), self.device)

                y_true = torch.cat([y_true, label_tensor])
                label_score = self.model(word_tensor, sequence_lengths,
                                         aspect_tensor, arange_tensor)
                label_prob, label_pred = self.model.inference(label_score, k=1)

                y_pred = torch.cat([y_pred, label_pred])
#            acc = metrics.accuracy_score(y_true, y_pred)
#            print(y_pred.size())
#            print(y_true.size())
            measures = Classifier.class_metrics(y_true, y_pred.squeeze())
            end = time.time() - start
            speed = len(y_true) / end
#        print("Gradient flag: ", label_score.requires_grad)
        return measures, speed
Example #16
    def evaluate_batch(self, eva_data):
        wl = self.args.vocab.wl
        cl = self.args.vocab.cl

        batch_size = self.args.batch_size
        ## set model in eval mode
        self.model.eval()
        correct_preds = 0.
        total_preds = 0.
        total_correct = 0.
        accs = []
        pred_results = []
        gold_results = []
        for i, (words, label_ids) in enumerate(
                self.args.vocab.minibatches(eva_data, batch_size=batch_size)):
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids,
                                                              pad_tok=0,
                                                              wthres=wl,
                                                              cthres=cl)
            char_ids, word_lengths = seqPAD.pad_sequences(char_ids,
                                                          pad_tok=0,
                                                          nlevels=2,
                                                          wthres=wl,
                                                          cthres=cl)
            label_ids, _ = seqPAD.pad_sequences(label_ids,
                                                pad_tok=0,
                                                wthres=wl,
                                                cthres=cl)

            data_tensors = Data2tensor.sort_tensors(label_ids,
                                                    word_ids,
                                                    sequence_lengths,
                                                    char_ids,
                                                    word_lengths,
                                                    volatile_flag=True)
            label_tensor, word_tensor, sequence_lengths, word_seq_recover, char_tensor, word_lengths, char_seq_recover = data_tensors
            mask_tensor = word_tensor > 0

            label_score = self.model(word_tensor, sequence_lengths,
                                     char_tensor, word_lengths,
                                     char_seq_recover)

            label_prob, label_pred = self.model.inference(
                label_score, mask_tensor)

            pred_label, gold_label = recover_label(label_pred, label_tensor,
                                                   mask_tensor,
                                                   self.args.vocab.l2i,
                                                   word_seq_recover)
            pred_results += pred_label
            gold_results += gold_label
        acc, p, r, f = get_ner_fmeasure(gold_results, pred_results)

        #            label_pred = label_pred.cpu().data.numpy()
        #            label_tensor = label_tensor.cpu().data.numpy()
        #            sequence_lengths = sequence_lengths.cpu().data.numpy()
        #
        #            for lab, lab_pred, length in zip(label_tensor, label_pred, sequence_lengths):
        #                lab      = lab[:length]
        #                lab_pred = lab_pred[:length]
        #                accs    += [a==b for (a, b) in zip(lab, lab_pred)]
        #
        #                lab_chunks      = set(NERchunks.get_chunks(lab, self.args.vocab.l2i))
        #                lab_pred_chunks = set(NERchunks.get_chunks(lab_pred, self.args.vocab.l2i))
        #
        #                correct_preds += len(lab_chunks & lab_pred_chunks)
        #                total_preds   += len(lab_pred_chunks)
        #                total_correct += len(lab_chunks)
        #
        #        p   = correct_preds / total_preds if correct_preds > 0 else 0
        #        r   = correct_preds / total_correct if correct_preds > 0 else 0
        #        f  = 2 * p * r / (p + r) if correct_preds > 0 else 0
        #        acc = np.mean(accs)

        return acc, f
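get_ner_fmeasure and recover_label are not shown on this page. For orientation, here is a hedged sketch of span-level scoring in the spirit of the commented-out block above, assuming BIO-tagged label strings; the real function also returns token accuracy, which this sketch omits.

def span_f1(gold_seqs, pred_seqs):
    """Span-level precision/recall/F1 over BIO-tagged sequences (a sketch)."""
    def chunks(seq):
        spans, start = set(), None
        for i, tag in enumerate(list(seq) + ["O"]):
            if start is not None and (tag == "O" or tag.startswith("B-")):
                spans.add((start, i, seq[start][2:]))   # (start, end, entity type)
                start = None
            if tag.startswith("B-"):
                start = i
        return spans

    correct = total_pred = total_gold = 0
    for gold, pred in zip(gold_seqs, pred_seqs):
        gold_chunks, pred_chunks = chunks(gold), chunks(pred)
        correct += len(gold_chunks & pred_chunks)
        total_pred += len(pred_chunks)
        total_gold += len(gold_chunks)
    p = correct / total_pred if total_pred else 0.0
    r = correct / total_gold if total_gold else 0.0
    f = 2 * p * r / (p + r) if p + r else 0.0
    return p, r, f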