Example #1
    def train(self, train_corpus):
        # Turn on training mode which enables dropout.
        self.model.train()

        # Splitting the data in batches
        train_batches = helper.batchify(train_corpus.data,
                                        self.config.batch_size)
        print('number of train batches = ', len(train_batches))

        start = time.time()
        print_acc_total = 0
        plot_acc_total = 0

        num_batches = len(train_batches)
        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
                train_batches[batch_no - 1], self.dictionary)
            if self.config.cuda and torch.cuda.is_available():
                train_sentences1 = train_sentences1.cuda()
                train_sentences2 = train_sentences2.cuda()
                train_labels = train_labels.cuda()

            assert train_sentences1.size(0) == train_sentences2.size(0)

            score = self.model(train_sentences1, sent_len1, train_sentences2,
                               sent_len2)
            n_correct = (torch.max(score, 1)[1].view(
                train_labels.size()).data == train_labels.data).sum()

            loss = self.criterion(score, train_labels)
            # Important if we are using nn.DataParallel()
            if loss.size(0) > 1:
                loss = loss.mean()
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            clip_grad_norm(
                filter(lambda p: p.requires_grad, self.model.parameters()),
                self.config.max_norm)
            self.optimizer.step()

            print_acc_total += 100. * n_correct / len(
                train_batches[batch_no - 1])
            plot_acc_total += 100. * n_correct / len(
                train_batches[batch_no - 1])

            if batch_no % self.config.print_every == 0:
                print_acc_avg = print_acc_total / self.config.print_every
                print_acc_total = 0
                print('%s (%d %d%%) %.2f' %
                      (helper.show_progress(start, batch_no / num_batches),
                       batch_no, batch_no / num_batches * 100, print_acc_avg))

            if batch_no % self.config.plot_every == 0:
                plot_acc_avg = plot_acc_total / self.config.plot_every
                self.train_accuracies.append(plot_acc_avg)
                plot_acc_total = 0
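
Note: these examples target the pre-0.4 PyTorch API (e.g. `clip_grad_norm` without the trailing underscore). In current PyTorch the clipping utility is `torch.nn.utils.clip_grad_norm_`, which operates in place; a minimal sketch of the equivalent call, reusing the same `self.model` and `self.config.max_norm` attributes from the example above:

            # In-place clipping of the global gradient norm (PyTorch >= 0.4);
            # returns the total gradient norm measured before clipping.
            torch.nn.utils.clip_grad_norm_(
                (p for p in self.model.parameters() if p.requires_grad),
                self.config.max_norm)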
Example #2
    def train(self):
        # Turn on training mode which enables dropout.
        self.model.train()

        # Splitting the data in batches
        batches, batch_labels = [], []
        for task_name, task in self.train_corpus.items():
            train_batches = helper.batchify(task.data, self.config.batch_size)
            batches.extend(train_batches)
            batch_labels.extend([task_name] * len(train_batches))

        combined = list(zip(batches, batch_labels))
        numpy.random.shuffle(combined)
        batches[:], batch_labels[:] = zip(*combined)
        print('number of train batches = ', len(batches))

        start = time.time()
        print_acc_total = 0
        plot_acc_total = 0
        num_back = 0

        num_batches = len(batches)
        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            if self.config.use_elmo:
                train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_elmo_tensors(
                    batches[batch_no - 1], self.dictionary)
            else:
                train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
                    batches[batch_no - 1], self.dictionary)

            if self.config.cuda:
                train_sentences1 = train_sentences1.cuda()
                train_sentences2 = train_sentences2.cuda()
                train_labels = train_labels.cuda()
            assert train_sentences1.size(0) == train_sentences2.size(0)

            score = self.model(train_sentences1, sent_len1, train_sentences2, sent_len2, batch_labels[batch_no - 1])
            n_correct = (torch.max(score, 1)[1].view(train_labels.size()).data == train_labels.data).sum()
            loss = self.criterion(score, train_labels)

            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            clip_grad_norm(filter(lambda p: p.requires_grad, self.model.parameters()), self.config.max_norm)
            self.optimizer.step()

            print_acc_total += 100. * n_correct / len(batches[batch_no - 1])
            plot_acc_total += 100. * n_correct / len(batches[batch_no - 1])

            if batch_no % self.config.print_every == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = '%s (%d %d%%) %.2f' % (helper.show_progress(start, batch_no / num_batches), batch_no,
                                                  batch_no / num_batches * 100, print_acc_total / batch_no)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)
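
Note: the multi-task variants (this example and Example #12) interleave batches from several corpora by zipping each batch with its task name, shuffling the pairs together, and unzipping them back into two aligned lists. A self-contained sketch of that pattern, with toy lists standing in for the helper.batchify output and made-up task names:

import numpy

batches = [['b0'], ['b1'], ['b2'], ['b3']]          # stand-ins for per-task batches
batch_labels = ['snli', 'snli', 'quora', 'quora']   # hypothetical task names

combined = list(zip(batches, batch_labels))
numpy.random.shuffle(combined)                      # shuffle batches and labels in lockstep
batches[:], batch_labels[:] = zip(*combined)        # unzip back into aligned lists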
Example #3
    def train(self, train_corpus):
        # Turn on training mode which enables dropout.
        self.model.train()

        # splitting the data in batches
        train_batches = helper.batchify(train_corpus.data,
                                        self.config.batch_size)
        print('number of train batches = ', len(train_batches))

        start = time.time()
        print_loss_total = 0
        plot_loss_total = 0

        num_batches = len(train_batches)
        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            session_queries, session_query_length, rel_docs, rel_docs_length, doc_labels = helper.session_to_tensor(
                train_batches[batch_no - 1], self.dictionary)
            if self.config.cuda:
                # batch_size x session_length x max_query_length
                session_queries = session_queries.cuda()
                # batch_size x session_length
                session_query_length = session_query_length.cuda()
                # batch_size x session_length x num_rel_docs_per_query x max_doc_length
                rel_docs = rel_docs.cuda()
                # batch_size x session_length x num_rel_docs_per_query
                rel_docs_length = rel_docs_length.cuda()
                # batch_size x session_length x num_rel_docs_per_query
                doc_labels = doc_labels.cuda()

            loss = self.model(session_queries, session_query_length, rel_docs,
                              rel_docs_length, doc_labels)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            clip_grad_norm(
                filter(lambda p: p.requires_grad, self.model.parameters()),
                self.config.max_norm)
            self.optimizer.step()

            print_loss_total += loss.data[0]
            plot_loss_total += loss.data[0]

            if batch_no % self.config.print_every == 0:
                print_loss_avg = print_loss_total / self.config.print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' %
                      (helper.show_progress(start, batch_no / num_batches),
                       batch_no, batch_no / num_batches * 100, print_loss_avg))

            if batch_no % self.config.plot_every == 0:
                plot_loss_avg = plot_loss_total / self.config.plot_every
                self.train_losses.append(plot_loss_avg)
                plot_loss_total = 0
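
Note: `loss.data[0]` is the pre-0.4 idiom for reading a scalar loss; on PyTorch >= 0.4 indexing a 0-dim tensor is an error and `loss.item()` is used instead. The running-loss bookkeeping above, written against the current API:

            print_loss_total += loss.item()
            plot_loss_total += loss.item()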
Example #4
    def train(self, train_corpus):
        # Turn on training mode which enables dropout.
        self.model.train()

        # splitting the data in batches
        train_batches = helper.batchify(train_corpus.data, self.config.batch_size)
        print('number of train batches = ', len(train_batches))

        start = time.time()
        print_loss_total = 0
        plot_loss_total = 0

        num_batches = len(train_batches)
        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            train_queries, query_len, train_clicks, doc_len, click_labels = helper.batch_to_tensor(
                train_batches[batch_no - 1], self.dictionary)
            if self.config.cuda:
                # batch_size x max_query_length
                train_queries = train_queries.cuda()
                # batch_size x num_clicks_per_query x max_document_length
                train_clicks = train_clicks.cuda()
                # batch_size x num_clicks_per_query
                click_labels = click_labels.cuda()

            score = self.model(train_queries, query_len, train_clicks, doc_len)
            # loss = self.compute_loss(score, click_labels)
            loss = f.binary_cross_entropy_with_logits(score, click_labels)
            # Important if we are using nn.DataParallel()
            if loss.size(0) > 1:
                loss = loss.mean()
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            clip_grad_norm(filter(lambda p: p.requires_grad, self.model.parameters()), self.config.max_norm)
            self.optimizer.step()

            print_loss_total += loss.data[0]
            plot_loss_total += loss.data[0]

            if batch_no % self.config.print_every == 0:
                print_loss_avg = print_loss_total / self.config.print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (
                    helper.show_progress(start, batch_no / num_batches), batch_no,
                    batch_no / num_batches * 100, print_loss_avg))

            if batch_no % self.config.plot_every == 0:
                plot_loss_avg = plot_loss_total / self.config.plot_every
                self.train_losses.append(plot_loss_avg)
                plot_loss_total = 0
Example #5
    def train(self, train_corpus):
        # Turn on training mode which enables dropout.
        self.model.train()

        # splitting the data in batches
        train_batches = helper.batchify(train_corpus.data,
                                        self.config.batch_size)
        print('number of train batches = ', len(train_batches))

        start = time.time()
        print_loss_total = 0
        plot_loss_total = 0

        num_batches = len(train_batches)
        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            videos, video_len, descriptions, des_len = helper.videos_to_tensor(
                train_batches[batch_no - 1], self.dictionary)
            if self.config.cuda:
                videos = videos.cuda()  # batch_size x max_images_per_video x num_image_features
                descriptions = descriptions.cuda()  # batch_size x max_description_length
                des_len = des_len.cuda()  # batch_size

            loss = self.model(videos, video_len, descriptions, des_len)
            # Important if we are using nn.DataParallel()
            if loss.size(0) > 1:
                loss = loss.mean()
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            clip_grad_norm(
                filter(lambda p: p.requires_grad, self.model.parameters()),
                self.config.max_norm)
            self.optimizer.step()

            print_loss_total += loss.data[0]
            plot_loss_total += loss.data[0]

            if batch_no % self.config.print_every == 0:
                print_loss_avg = print_loss_total / self.config.print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' %
                      (helper.show_progress(start, batch_no / num_batches),
                       batch_no, batch_no / num_batches * 100, print_loss_avg))

            if batch_no % self.config.plot_every == 0:
                plot_loss_avg = plot_loss_total / self.config.plot_every
                self.train_losses.append(plot_loss_avg)
                plot_loss_total = 0
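
Note: the blocks of `.cuda()` calls predate the device-agnostic API. Since PyTorch 0.4 the same transfer is usually written through a `torch.device` object, e.g. (a sketch reusing `self.config.cuda` from the example above):

            device = torch.device('cuda' if self.config.cuda else 'cpu')
            videos = videos.to(device)              # batch_size x max_images_per_video x num_image_features
            descriptions = descriptions.to(device)  # batch_size x max_description_length
            des_len = des_len.to(device)            # batch_size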
Example #6
    def train(self, train_corpus):
        # Turn on training mode which enables dropout.
        self.model.train()

        # splitting the data in batches
        train_batches = helper.batchify(train_corpus.data,
                                        self.config.batch_size)
        print('number of train batches = ', len(train_batches))

        start = time.time()
        print_loss_total = 0
        plot_loss_total = 0

        num_batches = len(train_batches)
        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            train_sessions, length, train_clicks, click_labels = helper.session_to_tensor(
                train_batches[batch_no - 1], self.dictionary)
            if self.config.cuda:
                # batch_size x session_length x max_query_length
                train_sessions = train_sessions.cuda()
                # batch_size x session_length x num_clicks_per_query x max_document_length
                train_clicks = train_clicks.cuda()
                # batch_size x session_length
                length = length.cuda()
                # batch_size x session_length x num_clicks_per_query
                click_labels = click_labels.cuda()

            loss = self.model(train_sessions, length, train_clicks,
                              click_labels)
            # Important if we are using nn.DataParallel()
            if loss.size(0) > 1:
                loss = loss.mean()
            loss.backward()
            self.optimizer.step()

            print_loss_total += loss.data[0]
            plot_loss_total += loss.data[0]

            if batch_no % self.config.print_every == 0:
                print_loss_avg = print_loss_total / self.config.print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' %
                      (helper.show_progress(start, batch_no / num_batches),
                       batch_no, batch_no / num_batches * 100, print_loss_avg))

            if batch_no % self.config.plot_every == 0:
                plot_loss_avg = plot_loss_total / self.config.plot_every
                self.train_losses.append(plot_loss_avg)
                plot_loss_total = 0
Example #7
    def train(self, train_corpus):
        # Turn on training mode which enables dropout.
        self.model.train()

        # splitting the data in batches
        train_batches = helper.batchify(train_corpus.data,
                                        self.config.batch_size)
        print('number of train batches = ', len(train_batches))

        start = time.time()
        print_loss_total = 0
        plot_loss_total = 0

        num_batches = len(train_batches)
        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            train_queries, train_docs, click_labels = helper.batch_to_tensor(
                train_batches[batch_no - 1], self.dictionary,
                self.config.max_query_length, self.config.max_doc_length)
            if self.config.cuda:
                # batch_size x max_query_length x vocab_size
                train_queries = train_queries.cuda()
                # batch_size x num_rel_docs_per_query x max_doc_length x vocab_size
                train_docs = train_docs.cuda()
                # batch_size x num_rel_docs_per_query
                click_labels = click_labels.cuda()

            softmax_prob = self.model(train_queries, train_docs)
            loss = self.compute_loss(softmax_prob, click_labels)
            loss.backward()
            self.optimizer.step()

            print_loss_total += loss.data[0]
            plot_loss_total += loss.data[0]

            if batch_no % self.config.print_every == 0:
                print_loss_avg = print_loss_total / self.config.print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' %
                      (helper.show_progress(start, batch_no / num_batches),
                       batch_no, batch_no / num_batches * 100, print_loss_avg))

            if batch_no % self.config.plot_every == 0:
                plot_loss_avg = plot_loss_total / self.config.plot_every
                self.train_losses.append(plot_loss_avg)
                plot_loss_total = 0
Example #8
    def validate(self, dev_corpus):
        # Turn on evaluation mode which disables dropout.
        self.model.eval()

        print_every = self.config.print_every
        start = time.time()

        dev_batches = helper.batchify(dev_corpus.data, self.config.batch_size)
        print('number of dev batches = ', len(dev_batches))

        num_batches = len(dev_batches)
        n_correct, n_total = 0, 0
        for batch_no in range(1, num_batches + 1):
            dev_sentences1, sent_len1, dev_sentences2, sent_len2, dev_labels = helper.batch_to_tensors(
                dev_batches[batch_no - 1], self.dictionary, True)
            if self.config.cuda:
                dev_sentences1 = dev_sentences1.cuda()
                dev_sentences2 = dev_sentences2.cuda()
                dev_labels = dev_labels.cuda()

            assert dev_sentences1.size(0) == dev_sentences2.size(0)

            score = self.model(dev_sentences1, sent_len1, dev_sentences2,
                               sent_len2)
            n_correct += (torch.max(score, 1)[1].view(
                dev_labels.size()).data == dev_labels.data).sum()
            n_total += len(dev_batches[batch_no - 1])

            print_acc = 100. * n_correct / n_total
            if batch_no % print_every == 0 or self.config.debug:
                p = 100.0
                print('%s (%d %d%%) (%.2f) %.2f' %
                      (helper.show_progress(start, batch_no / num_batches),
                       batch_no, batch_no / num_batches * 100, p, print_acc))

        return 100. * n_correct / n_total
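
Note: `self.model.eval()` only switches layers such as dropout to inference mode; it does not stop autograd from recording operations. On PyTorch >= 0.4 a validation loop like this one is normally also wrapped in `torch.no_grad()` to save memory, e.g. (a sketch):

        with torch.no_grad():
            for batch_no in range(1, num_batches + 1):
                ...  # same evaluation body as above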
Example #9
    def train(self, train_corpus):
        # Turn on training mode which enables dropout.
        self.model.train()

        # Splitting the data in batches
        train_batches = helper.batchify(train_corpus.data, self.config.batch_size)
        print('number of train batches = ', len(train_batches))

        start = time.time()
        print_acc_total = 0
        plot_acc_total = 0

        num_batches = len(train_batches)
        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
                train_batches[batch_no - 1], self.dictionary)
            if self.config.cuda:
                train_sentences1 = train_sentences1.cuda()
                train_sentences2 = train_sentences2.cuda()
                train_labels = train_labels.cuda()

            assert train_sentences1.size(0) == train_sentences2.size(0)

            score = self.model(train_sentences1, sent_len1, train_sentences2, sent_len2)
            n_correct = (torch.max(score, 1)[1].view(train_labels.size()).data == train_labels.data).sum()
            loss = self.criterion(score, train_labels)
            # Important if we are using nn.DataParallel()
            if loss.size(0) > 1:
                loss = loss.mean()
            loss.backward()

            # gradient clipping (off by default)
            shrink_factor = 1
            total_norm = 0

            for p in self.model.parameters():
                if p.requires_grad:
                    p.grad.data.div_(train_sentences1.size(0))  # divide by the actual batch size
                    total_norm += p.grad.data.norm() ** 2
            total_norm = numpy.sqrt(total_norm)

            if total_norm > self.config.clip:
                shrink_factor = self.config.clip / total_norm
            current_lr = self.optimizer.param_groups[0]['lr']  # current lr (no external "lr", for adam)
            self.optimizer.param_groups[0]['lr'] = current_lr * shrink_factor  # just for update

            self.optimizer.step()
            self.optimizer.param_groups[0]['lr'] = current_lr

            print_acc_total += 100. * n_correct / len(train_batches[batch_no - 1])
            plot_acc_total += 100. * n_correct / len(train_batches[batch_no - 1])

            if batch_no % self.config.print_every == 0:
                print_acc_avg = print_acc_total / self.config.print_every
                print_acc_total = 0
                print('%s (%d %d%%) %.2f' % (
                    helper.show_progress(start, batch_no / num_batches), batch_no,
                    batch_no / num_batches * 100, print_acc_avg))

            if batch_no % self.config.plot_every == 0:
                plot_acc_avg = plot_acc_total / self.config.plot_every
                self.train_accuracies.append(plot_acc_avg)
                plot_acc_total = 0
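
Note: the shrink-factor block above divides every gradient by the batch size and then, when the global gradient norm exceeds `self.config.clip`, scales the learning rate by clip / total_norm for a single step. For plain SGD that is equivalent to rescaling the gradients themselves (for Adam it is only an approximation); with the built-in utility the rescaling collapses to one call (a sketch reusing the same attributes):

            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.clip)
            self.optimizer.step()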
Example #10
def evaluate(model, batches, dictionary, outfile=None, selection_time=0.9318):  # selection_time=0.9318 for IMDB by budget model
    # Turn on evaluation mode which disables dropout.
    model.eval()

    n_correct, n_total = 0, 0
    y_preds, y_true, output = [], [], []
    start = time.time()
    num_batches = len(batches)

    num_tokens_padded = 0
    selection_time = 0
    selected_tokens = 0

    for batch_no in range(len(batches)):
        test_sentences1, sent_len1, test_sentences2, sent_len2, test_labels = helper.batch_to_tensors(batches[batch_no],
                                                                                                      dictionary, True)
        if args.cuda:
            test_sentences1 = test_sentences1.cuda()
            test_sentences2 = test_sentences2.cuda()
            test_labels = test_labels.cuda()
        assert test_sentences1.size(0) == test_sentences2.size(0)

        selected_tokens += sum(sent_len1) + sum(sent_len2)
        num_tokens_padded += 2 * (force_min_sen_len * args.eval_batch_size)

        score = model(test_sentences1, sent_len1, test_sentences2, sent_len2)
        preds = torch.max(score, 1)[1]
        if outfile:
            predictions = preds.data.cpu().tolist()
            for i in range(len(batches[batch_no])):
                output.append([batches[batch_no][i].id, predictions[i]])
        else:
            y_preds.extend(preds.data.cpu().tolist())
            y_true.extend(test_labels.data.cpu().tolist())
            n_correct += (preds.view(test_labels.size()).data == test_labels.data).sum()
            n_total += len(batches[batch_no])

        if (batch_no+1) % args.print_every == 0:
            padded_p = 100.0 * selected_tokens/num_tokens_padded
            print_acc_avg = 100. * n_correct / n_total
            print('%s (%d %d%%) (padded %.2f) %.2f' % (
                helper.show_progress(start, (batch_no+1) / num_batches), (batch_no+1),
                (batch_no+1) / num_batches * 100, padded_p, print_acc_avg))


    now = time.time()
    s = now - start

    estimated_full_text_padded_time = s * num_tokens_padded / selected_tokens
    s += selection_time

    print('estimated full text time padded = %s'% (helper.convert_to_minutes(estimated_full_text_padded_time)))

    padded_p = 100.0 * selected_tokens / num_tokens_padded
    padded_speed_up = 1.0 * estimated_full_text_padded_time / s

    print_acc_avg = 100. * n_correct / n_total
    print('total: %s (%d %d%%) (padded %.2f) %.2f' % (
        helper.show_progress(start, (batch_no + 1) / num_batches), (batch_no + 1),
        (batch_no + 1) / num_batches * 100, padded_p, print_acc_avg))
    print('estimated padded speed up = %0.2f, selected text percentage speed up (padded) = %0.2f' %
          (padded_speed_up, 100.0 / padded_p))



    if outfile:
        target_names = ['entailment', 'neutral', 'contradiction']
        with open(outfile, 'w') as f:
            f.write('pairID,gold_label' + '\n')
            for item in output:
                f.write(str(item[0]) + ',' + target_names[item[1]] + '\n')
    else:
        return 100. * n_correct / n_total, 100. * f1_score(numpy.asarray(y_true), numpy.asarray(y_preds),
                                                           average='weighted'), s
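
Note: this function relies on module-level names (`args`, `force_min_sen_len`, `helper`) and on `f1_score`, which is presumably scikit-learn's weighted F1; the import the snippet assumes would be:

from sklearn.metrics import f1_score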
Example #11
    def train(self, train_corpus, epoch):
        # Turn on training mode which enables dropout.
        self.model.train()

        # Splitting the data in batches
        shuffle = True
        # if self.config.task == 'sst': shuffle = False
        print(shuffle)

        train_batches = helper.batchify(train_corpus.data,
                                        self.config.batch_size, shuffle)
        print('number of train batches = ', len(train_batches))

        start = time.time()
        print_acc_total = 0
        plot_acc_total = 0

        num_batches = len(train_batches)
        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
                train_batches[batch_no - 1], self.dictionary)
            if self.config.cuda:
                train_sentences1 = train_sentences1.cuda()
                train_sentences2 = train_sentences2.cuda()
                train_labels = train_labels.cuda()

            assert train_sentences1.size(0) == train_sentences2.size(0)

            score = self.model(train_sentences1, sent_len1, train_sentences2,
                               sent_len2)
            n_correct = (torch.max(score, 1)[1].view(
                train_labels.size()).data == train_labels.data).sum()
            # print (' score size ', score.size(), train_labels.size())
            loss = self.criterion(score, train_labels)

            ############################ custom new_loss ############################

            # z2 = z_pred.dimshuffle((0,1,"x"))
            # logpz = - T.nnet.binary_crossentropy(probs, z2) * masks
            # logpz = self.logpz = logpz.reshape(x.shape)
            # probs = self.probs = probs.reshape(x.shape)

            # # batch
            # z = z_pred
            # self.zsum = T.sum(z, axis=0, dtype=theano.config.floatX)
            # self.zdiff = T.sum(T.abs_(z[1:]-z[:-1]), axis=0, dtype=theano.config.floatX)

            # zsum = generator.zsum
            # zdiff = generator.zdiff
            # logpz = generator.logpz

            # coherent_factor = args.sparsity * args.coherent
            # loss = self.loss = T.mean(loss_vec) #this is not needed as in cost_vec loss_vec is used
            # sparsity_cost = self.sparsity_cost = T.mean(zsum) * args.sparsity + \
            #                                      T.mean(zdiff) * coherent_factor
            # cost_vec = loss_vec + zsum * args.sparsity + zdiff * coherent_factor
            # cost_logpz = T.mean(cost_vec * T.sum(logpz, axis=0))
            # self.obj = T.mean(cost_vec)

            ############################ custom new_loss ############################

            if loss.size(0) > 1:
                loss = loss.mean()
            # print ('loss:', loss)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            grad_norm = clip_grad_norm(
                filter(lambda p: p.requires_grad, self.model.parameters()),
                self.config.max_norm)
            # if epoch==11:
            # print(batch_no, grad_norm)
            self.optimizer.step()

            print_acc_total += 100. * n_correct / len(
                train_batches[batch_no - 1])
            plot_acc_total += 100. * n_correct / len(
                train_batches[batch_no - 1])

            if batch_no % self.config.print_every == 0:
                print_acc_avg = print_acc_total / self.config.print_every
                print_acc_total = 0
                print('%s (%d %d%%) %.2f' %
                      (helper.show_progress(start, batch_no / num_batches),
                       batch_no, batch_no / num_batches * 100, print_acc_avg))

            if batch_no % self.config.plot_every == 0:
                plot_acc_avg = plot_acc_total / self.config.plot_every
                self.train_accuracies.append(plot_acc_avg)
                plot_acc_total = 0
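
Note: the commented-out block above sketches, in Theano, a sparsity/coherence regularizer for a rationale generator (the zsum / zdiff terms resemble the objective of Lei et al., 2016). A rough PyTorch translation of just the regularizer, assuming `z` is a (sequence_length x batch_size) tensor of selection probabilities and `sparsity` / `coherent` are hypothetical config weights, might look like:

            # zsum penalises how much text is selected; zdiff penalises
            # non-contiguous selections (many on/off transitions along the sequence).
            zsum = z.sum(dim=0)                          # per-example selection mass
            zdiff = (z[1:] - z[:-1]).abs().sum(dim=0)    # per-example transition count
            coherent_factor = sparsity * coherent
            sparsity_cost = zsum.mean() * sparsity + zdiff.mean() * coherent_factor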
Example #12
    def train(self):
        # Turn on training mode which enables dropout.
        self.generator.train()

        # Splitting the data in batches
        batches, batch_labels = [], []
        for task_name, task in self.train_corpus.items():
            train_batches = helper.batchify(task.data, self.config.batch_size)
            batches.extend(train_batches)
            batch_labels.extend([task_name] * len(train_batches))

        combined = list(zip(batches, batch_labels))
        numpy.random.shuffle(combined)
        batches[:], batch_labels[:] = zip(*combined)
        print('number of train batches = ', len(batches))

        start = time.time()
        num_back, print_acc_total, plot_acc_total = 0, 0, 0

        num_batches = len(batches)
        for batch_no in range(1, num_batches + 1):
            if self.config.use_elmo:
                train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_elmo_input(
                    batches[batch_no - 1], self.dictionary)
            else:
                train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
                    batches[batch_no - 1], self.dictionary)

            if self.config.cuda:
                train_sentences1 = train_sentences1.cuda()
                train_sentences2 = train_sentences2.cuda()
                train_labels = train_labels.cuda()

            assert train_sentences1.size(0) == train_sentences2.size(0)

            if self.config.adversarial:
                self.optimizerD.zero_grad()
                scores, diff_loss, shared_rep = self.generator(
                    train_sentences1, sent_len1, train_sentences2, sent_len2,
                    batch_labels[batch_no - 1])
                n_correct = (torch.max(scores, 1)[1].view(
                    train_labels.size()).data == train_labels.data).sum()
                shared_sent_rep1 = shared_rep[0]
                shared_sent_rep2 = shared_rep[1]
                # run the discriminator to distinguish tasks
                task_prob1 = self.discriminator(
                    shared_sent_rep1.detach())  # B X num_tasks
                task_prob2 = self.discriminator(
                    shared_sent_rep2.detach())  # B X num_tasks
                comb_prob = torch.cat((task_prob1, task_prob2),
                                      0)  # 2B X num_tasks
                task_prob = torch.sum(comb_prob,
                                      0).squeeze()  # size = |num_tasks|
                adv_loss = -1 * task_prob[self.task_ids[batch_labels[batch_no -
                                                                     1]]]
                adv_loss.backward()
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
                clip_grad_norm(
                    filter(lambda p: p.requires_grad,
                           self.discriminator.parameters()),
                    self.config.max_norm)
                self.optimizerD.step()

                self.optimizerG.zero_grad()
                cross_entropy_loss = self.criterion(scores, train_labels)
                # run the discriminator to distinguish tasks
                task_prob1 = self.discriminator(
                    shared_sent_rep1)  # B X num_tasks
                task_prob2 = self.discriminator(
                    shared_sent_rep2)  # B X num_tasks
                comb_prob = torch.cat((task_prob1, task_prob2),
                                      0)  # 2B X num_tasks
                task_prob = torch.sum(comb_prob,
                                      0).squeeze()  # size = |num_tasks|
                adv_loss = -1 * task_prob[self.task_ids[batch_labels[batch_no -
                                                                     1]]]
                total_loss = cross_entropy_loss + self.config.beta * adv_loss + self.config.gamma * diff_loss
                # Important if we are using nn.DataParallel()
                if total_loss.size(0) > 1:
                    total_loss = total_loss.mean()
                total_loss.backward()
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
                clip_grad_norm(
                    filter(lambda p: p.requires_grad,
                           self.generator.parameters()), self.config.max_norm)
                self.optimizerG.step()
            else:
                self.optimizerG.zero_grad()
                scores = self.generator(train_sentences1, sent_len1,
                                        train_sentences2, sent_len2,
                                        batch_labels[batch_no - 1])
                n_correct = (torch.max(scores, 1)[1].view(
                    train_labels.size()).data == train_labels.data).sum()
                loss = self.criterion(scores, train_labels)
                # Important if we are using nn.DataParallel()
                if loss.size(0) > 1:
                    loss = loss.mean()
                loss.backward()
                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
                clip_grad_norm(
                    filter(lambda p: p.requires_grad,
                           self.generator.parameters()), self.config.max_norm)
                self.optimizerG.step()

            print_acc_total += 100. * n_correct / len(batches[batch_no - 1])
            plot_acc_total += 100. * n_correct / len(batches[batch_no - 1])

            if batch_no % self.config.print_every == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = '%s (%d %d%%) %.2f%%' % (
                    helper.show_progress(start, batch_no / num_batches), batch_no,
                    batch_no / num_batches * 100, print_acc_total / batch_no)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

            if batch_no % self.config.plot_every == 0:
                plot_acc_avg = plot_acc_total / self.config.plot_every
                self.train_accuracies.append(plot_acc_avg)
                plot_acc_total = 0

            # Release all unoccupied cached GPU memory so it becomes visible to other applications.
            torch.cuda.empty_cache()
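
Note: in the adversarial branch the shared sentence representations are fed to the discriminator twice: once through `.detach()`, so the discriminator update cannot push gradients into the generator, and once without it, so the generator is trained against the current discriminator. A minimal sketch of that pattern with hypothetical names (`shared`, `disc_loss_fn`, `task_target`):

            d_loss = disc_loss_fn(discriminator(shared.detach()), task_target)
            d_loss.backward()    # reaches only the discriminator parameters
            g_loss = -disc_loss_fn(discriminator(shared), task_target)
            g_loss.backward()    # flows back through the discriminator into the generator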
Example #13
    def train(self, train_corpus, epoch):
        # Turn on training mode which enables dropout.
        self.model.train()

        # Splitting the data in batches
        shuffle = True
        # if self.config.task == 'sst': shuffle = False
        print(shuffle)

        train_batches = helper.batchify(train_corpus.data,
                                        self.config.batch_size, shuffle)
        print('number of train batches = ', len(train_batches))

        start = time.time()
        print_acc_total = 0
        plot_acc_total = 0

        num_batches = len(train_batches)
        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            train_sentences1, sent_len1, train_sentences2, sent_len2, train_labels = helper.batch_to_tensors(
                train_batches[batch_no - 1], self.dictionary)
            if self.config.cuda:
                train_sentences1 = train_sentences1.cuda()
                train_sentences2 = train_sentences2.cuda()
                train_labels = train_labels.cuda()

            assert train_sentences1.size(0) == train_sentences2.size(0)

            # print(' train label size: ', train_labels.size(), ' train data size: ', train_sentences1.size())
            # print(' labels: ', train_labels)
            score = self.model(train_sentences1)
            n_correct = (torch.max(score, 1)[1].view(
                train_labels.size()).data == train_labels.data).sum()
            # print (' score size ', score.size(), train_labels.size())
            loss = self.criterion(score, train_labels)

            if loss.size(0) > 1:
                loss = loss.mean()
            # print ('loss:', loss)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            grad_norm = clip_grad_norm(
                filter(lambda p: p.requires_grad, self.model.parameters()),
                self.config.max_norm)
            # if epoch==11:
            # print(batch_no, grad_norm)
            self.optimizer.step()

            print_acc_total += 100. * n_correct / len(
                train_batches[batch_no - 1])
            plot_acc_total += 100. * n_correct / len(
                train_batches[batch_no - 1])

            if batch_no % self.config.print_every == 0:
                print_acc_avg = print_acc_total / self.config.print_every
                print_acc_total = 0
                print('%s (%d %d%%) %.2f' %
                      (helper.show_progress(start, batch_no / num_batches),
                       batch_no, batch_no / num_batches * 100, print_acc_avg))

            if batch_no % self.config.plot_every == 0:
                plot_acc_avg = plot_acc_total / self.config.plot_every
                self.train_accuracies.append(plot_acc_avg)
                plot_acc_total = 0
Example #14
    def train(self, train_batches, dev_batches, epoch_no):
        # Turn on training mode which enables dropout.
        self.model.train()

        start = time.time()
        print_loss_total = 0
        plot_loss_total = 0

        num_batches = len(train_batches)
        print('epoch %d started' % epoch_no)

        for batch_no in range(1, num_batches + 1):
            # Clearing out all previous gradient computations.
            self.optimizer.zero_grad()
            train_sessions, length = helper.session_to_tensor(
                train_batches[batch_no - 1], self.dictionary)
            if self.config.cuda:
                train_sessions = train_sessions.cuda()
                length = length.cuda()

            loss = self.model(train_sessions, length)
            # Important if we are using nn.DataParallel()
            if loss.size(0) > 1:
                loss = torch.mean(loss)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs.
            clip_grad_norm(self.model.parameters(), self.config.clip)
            self.optimizer.step()

            print_loss_total += loss.data[0]
            plot_loss_total += loss.data[0]

            if batch_no % self.config.print_every == 0:
                print_loss_avg = print_loss_total / self.config.print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' %
                      (helper.show_progress(start, batch_no / num_batches),
                       batch_no, batch_no / num_batches * 100, print_loss_avg))

            if batch_no % self.config.plot_every == 0:
                plot_loss_avg = plot_loss_total / self.config.plot_every
                self.train_losses.append(plot_loss_avg)
                plot_loss_total = 0

            if batch_no % self.config.dev_every == 0:
                dev_loss = self.validate(dev_batches)
                self.dev_losses.append(dev_loss)
                print('validation loss = %.4f' % dev_loss)
                if self.best_dev_loss == -1 or self.best_dev_loss > dev_loss:
                    self.best_dev_loss = dev_loss
                    helper.save_checkpoint(
                        {
                            'epoch': epoch_no,
                            'state_dict': self.model.state_dict(),
                            'best_loss': self.best_dev_loss,
                            'optimizer': self.optimizer.state_dict(),
                        }, self.config.save_path + 'model_best.pth.tar')
                else:
                    self.times_no_improvement += 1
                    # no improvement in validation loss for last n times, so stop training
                    if self.times_no_improvement == 20:
                        self.stop = True
                        break
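
Note: `self.times_no_improvement` is only ever incremented here, so (unless it is reset elsewhere) training stops after 20 non-improving validations in total rather than 20 consecutive ones. If consecutive patience is the intent, the improving branch would also reset the counter, e.g. (a sketch):

                if self.best_dev_loss == -1 or self.best_dev_loss > dev_loss:
                    self.best_dev_loss = dev_loss
                    self.times_no_improvement = 0  # reset the patience counter on improvement
                    helper.save_checkpoint(...)    # as above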
Example #15
    def train(self, train_dataset):
        batches_idx = helper.get_batches_idx(len(train_dataset),
                                             self.args.batch_size)
        print('number of train batches = ', len(batches_idx))

        start = time.time()
        print_loss_total = 0
        plot_loss_total = 0

        num_batches = len(batches_idx)
        for batch_no in range(1, num_batches + 1):  #1,...num_batches
            batch_idx = batches_idx[batch_no - 1]
            batch_data = [train_dataset.dataset[i] for i in batch_idx]

            # Convert a batch of data into the model's input format
            (hist_query_input, hist_doc_input, session_num, hist_query_num,
             hist_query_len, hist_click_num, hist_doc_len, cur_query_input,
             cur_doc_input, cur_query_num, cur_query_len, cur_click_num,
             cur_doc_len, query, q_len, doc, d_len, y, next_q, next_q_len,
             _) = helper.batch_to_tensor(batch_data, self.args.max_query_len,
                                         self.args.max_doc_len)

            indices, slots_num = self.model.get_memory_input(session_num)
            feed_dict = {
                self.model.hist_query_input: hist_query_input,
                self.model.hist_doc_input: hist_doc_input,
                self.model.session_num: session_num,
                self.model.hist_query_num: hist_query_num,
                self.model.hist_query_len: hist_query_len,
                self.model.hist_click_num: hist_click_num,
                self.model.hist_doc_len: hist_doc_len,
                self.model.cur_query_input: cur_query_input,
                self.model.cur_doc_input: cur_doc_input,
                self.model.cur_query_num: cur_query_num,
                self.model.cur_query_len: cur_query_len,
                self.model.cur_click_num: cur_click_num,
                self.model.cur_doc_len: cur_doc_len,
                self.model.q: query,
                self.model.q_len: q_len,
                self.model.d: doc,
                self.model.d_len: d_len,
                self.model.y: y,  # 0/1
                self.model.indices: indices,
                self.model.slots_num: slots_num,
                self.model.next_q: next_q,
                self.model.next_q_len: next_q_len
            }

            # Compute the loss and update the parameters
            loss_ = self.sess.run(self.model.loss, feed_dict=feed_dict)
            train_op_ = self.sess.run(self.model.train_op, feed_dict=feed_dict)

            print_loss_total += loss_
            plot_loss_total += loss_

            if batch_no % self.args.print_every == 0:
                print_loss_avg = print_loss_total / self.args.print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' %
                      (helper.show_progress(start, batch_no / num_batches),
                       batch_no, batch_no / num_batches * 100, print_loss_avg))

            if batch_no % self.args.plot_every == 0:
                plot_loss_avg = plot_loss_total / self.args.plot_every
                self.train_losses.append(plot_loss_avg)
                plot_loss_total = 0
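
Note: calling `sess.run` separately for `self.model.loss` and `self.model.train_op` executes the TensorFlow graph twice per batch; the usual TF1 idiom fetches both in one call (a sketch with the same feed_dict):

            # One graph execution per batch: read the loss and apply the update together.
            loss_, _ = self.sess.run([self.model.loss, self.model.train_op],
                                     feed_dict=feed_dict)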