Пример #1
0
def save_model(net, dev_loss, em, f1, global_steps,
               model_dir='/data/xuwenshen/workspace/squad/code/multi_task/models/'):
    """Save the network's state dict, encoding current metrics in the filename.

    Args:
        net: model whose ``state_dict()`` is saved.
        dev_loss (float): dev-set loss at save time.
        em (float): exact-match score.
        f1 (float): F1 score.
        global_steps (int): training step counter.
        model_dir (str): directory (ending with '/') to write into; kept as a
            default for backward compatibility with existing callers.
    """
    # BUG FIX: '{:3f}' only sets a minimum field width of 3 (six default
    # decimals); '{:.3f}' (three decimal places) was clearly intended.
    filename = "loss-{:.3f}-em-{:.3f}-f1-{:.3f}-steps-{:d}-model.pkl".format(
        dev_loss, em, f1, global_steps)
    torch.save(net.state_dict(), model_dir + filename)
def save_models(netG, netD, outputDir, epoch):
    """Write checkpoints for both GAN networks.

    The state dicts of the generator ``netG`` and discriminator ``netD`` are
    stored under ``outputDir`` with the current ``epoch`` baked into the
    file names.
    """
    checkpoints = {
        f'{outputDir}/netG_epoch_{epoch}.pth': netG,
        f'{outputDir}/netD_epoch_{epoch}.pth': netD,
    }
    for target_path, network in checkpoints.items():
        torch.save(network.state_dict(), target_path)
Пример #3
0
    def save_checkpoint(self, model, optimizer, epoch, index=0):
        """Serialize model and optimizer state dicts plus the epoch to disk.

        Only ``state_dict()`` is stored rather than whole objects: hooks
        registered via ``register_hook`` cannot be pickled, and saving a full
        model also bakes in GPU placement, which causes problems when the
        checkpoint is loaded on a different machine.

        :params model: model (or list of modules) to snapshot
        :params optimizer: optimizer to snapshot
        :params epoch: training epoch
        :params index: index of saved file, default: 0
        """
        net = self.list2sequential(model)
        # unwrap DataParallel so the saved keys are not prefixed with 'module.'
        if isinstance(net, nn.DataParallel):
            net = net.module

        # collect everything into the shared checkpoint dict
        self.check_point_params['model'] = net.state_dict()
        self.check_point_params['optimizer'] = optimizer.state_dict()
        self.check_point_params['epoch'] = epoch

        target = os.path.join(self.save_path, "checkpoint_%03d.pth" % index)
        torch.save(self.check_point_params, target)
Пример #4
0
def load_word_vectors(path):
    """Load cached word vectors and the matching Vocab, building them if needed.

    Looks for ``path + '.pth'`` / ``path + '.vocab'`` first; otherwise parses
    ``path + '.txt'`` (one word followed by its vector components per line),
    writes both caches, and returns the results.

    Returns:
        (vocab, vectors): a Vocab over the words and a FloatTensor of shape
        (num_words, dim).
    """
    if os.path.isfile(path+'.pth') and os.path.isfile(path+'.vocab'):
        print('==> File found, loading to memory')
        vectors = torch.load(path+'.pth')
        vocab = Vocab(filename=path+'.vocab')
        return vocab, vectors
    # saved file not found, read from txt file
    # and create tensors for word vectors
    print('==> File not found, preparing, be patient')
    # BUG FIX: the original counted lines via a bare open() inside a genexp,
    # leaking the file handle; use a context manager.
    with open(path+'.txt', encoding='latin-1') as f:
        count = sum(1 for _ in f)
    # BUG FIX: this open() was the only access without encoding='latin-1';
    # use the same encoding everywhere to avoid platform-dependent decoding.
    with open(path+'.txt', 'r', encoding='latin-1') as f:
        contents = f.readline().rstrip('\n').split(' ')
        dim = len(contents[1:])
    words = [None]*(count)
    vectors = torch.zeros(count, dim)
    with open(path+'.txt', 'r', encoding='latin-1') as f:
        for idx, line in enumerate(f):
            contents = line.rstrip('\n').split(' ')
            words[idx] = contents[0]
            vectors[idx] = torch.Tensor(list(map(float, contents[1:])))
    with open(path+'.vocab', 'w', encoding='latin-1') as f:
        for word in words:
            f.write(word+'\n')
    vocab = Vocab(filename=path+'.vocab')
    torch.save(vectors, path+'.pth')
    return vocab, vectors
Пример #5
0
    def fit(self, train_loader, dev_loader, test_loader,
            epochs, interval, eta, file):
        """Train with Adam, evaluating every epoch and early-stopping.

        Args:
            train_loader, dev_loader, test_loader: loaders for each split.
            epochs: maximum number of training epochs.
            interval: patience — stop once this many epochs pass without a
                new best dev accuracy.
            eta: Adam learning rate.
            file: path the best model is written to (the whole model object
                is pickled via ``torch.save(self, file)``).
        """
        # accumulated wall-clock training time
        total_time = timedelta()
        # best dev accuracy so far and the epoch it was reached
        max_e, max_acc = 0, 0.0
        # use Adam as the optimizer
        self.optimizer = optim.Adam(params=self.parameters(), lr=eta)

        for epoch in range(1, epochs + 1):
            start = datetime.now()
            # update parameters over one pass of the training data
            self.update(train_loader)

            print(f"Epoch: {epoch} / {epochs}:")
            loss, train_acc = self.evaluate(train_loader)
            print(f"{'train:':<6} Loss: {loss:.4f} Accuracy: {train_acc:.2%}")
            loss, dev_acc = self.evaluate(dev_loader)
            print(f"{'dev:':<6} Loss: {loss:.4f} Accuracy: {dev_acc:.2%}")
            loss, test_acc = self.evaluate(test_loader)
            print(f"{'test:':<6} Loss: {loss:.4f} Accuracy: {test_acc:.2%}")
            t = datetime.now() - start
            print(f"{t}s elapsed\n")
            total_time += t

            # keep only the checkpoint with the best dev accuracy
            if dev_acc > max_acc:
                torch.save(self, file)
                max_e, max_acc = epoch, dev_acc
            elif epoch - max_e >= interval:
                # no dev improvement for `interval` epochs: stop early
                break
        print(f"max accuracy of dev is {max_acc:.2%} at epoch {max_e}")
        print(f"mean time of each epoch is {total_time / epoch}s\n")
def get_vanilla_vgg_features(cut_idx=-1):
    """Return (and cache on disk) the pretrained VGG-19 feature extractor.

    Downloads Justin Johnson's VGG-19 weights on first use, remaps the two
    classifier keys shifted by prepending a View() layer, and caches
    ``model.features`` / ``model.classifier``. When ``cut_idx`` points past
    the convolutional stack (> 36) the classifier is appended as well. The
    returned module is set to eval mode.
    """
    if not os.path.exists('vgg_features.pth'):
        os.system(
            'wget --no-check-certificate -N https://s3-us-west-2.amazonaws.com/jcjohns-models/vgg19-d01eb7cb.pth')
        vgg_weights = torch.load('vgg19-d01eb7cb.pth')
        # fix compatibility issues: inserting View() shifts the classifier
        # indices by one, so the affected keys must be renamed
        key_map = {'classifier.6.weight': u'classifier.7.weight',
                   'classifier.6.bias': u'classifier.7.bias'}
        # BUG FIX: dict.iteritems() is Python 2 only — use items().
        # (renamed `map` -> `key_map` to stop shadowing the builtin)
        vgg_weights = OrderedDict(
            (key_map.get(k, k), v) for k, v in vgg_weights.items())

        model = models.vgg19()
        model.classifier = nn.Sequential(View(), *model.classifier._modules.values())

        model.load_state_dict(vgg_weights)

        torch.save(model.features, 'vgg_features.pth')
        torch.save(model.classifier, 'vgg_classifier.pth')

    vgg = torch.load('vgg_features.pth')
    if cut_idx > 36:
        vgg_classifier = torch.load('vgg_classifier.pth')
        # BUG FIX: odict_values objects do not support '+' in Python 3;
        # materialize both into lists before concatenating.
        vgg = nn.Sequential(*(list(vgg._modules.values())
                              + list(vgg_classifier._modules.values())))

    vgg.eval()

    return vgg
Пример #7
0
    def download(self):
        """Download, extract, and cache the dataset as a single torch file.

        No-op when the cached ``self.data_file`` already exists. Otherwise the
        archive is fetched (unless previously downloaded), unzipped into
        ``self.data_dir``, and the parsed (images, info, matches) tuple is
        written with ``torch.save``.
        """
        if self._check_datafile_exists():
            print('# Found cached data {}'.format(self.data_file))
            return

        if not self._check_downloaded():
            # download files
            url = self.urls[self.name][0]
            filename = self.urls[self.name][1]
            md5 = self.urls[self.name][2]
            fpath = os.path.join(self.root, filename)

            download_url(url, self.root, filename, md5)

            print('# Extracting data {}\n'.format(self.data_down))

            import zipfile
            with zipfile.ZipFile(fpath, 'r') as z:
                z.extractall(self.data_dir)

            # delete the archive once its contents are extracted
            os.unlink(fpath)

        # process and save as torch files
        print('# Caching data {}'.format(self.data_file))

        dataset = (
            read_image_file(self.data_dir, self.image_ext, self.lens[self.name]),
            read_info_file(self.data_dir, self.info_file),
            read_matches_files(self.data_dir, self.matches_files)
        )

        with open(self.data_file, 'wb') as f:
            torch.save(dataset, f)
Пример #8
0
    def save(self, path=None):
        """Save model parameters if model_file is set.

        Collects the state dicts of every sub-module that is not shared
        (shared lookup tables / decoders are skipped), the optimizers, and
        the options dict, then pickles the whole bundle to ``path``.
        """
        if path is None:
            path = self.opt.get('model_file', None)
        if not (path and hasattr(self, 'optims')):
            return

        model = {'enc_lt': self.enc_lt.state_dict()}
        if self.opt['lookuptable'] not in ['enc_dec', 'all']:
            # dec_lt is not shared with enc_lt, so save it
            model['dec_lt'] = self.dec_lt.state_dict()
        if self.opt['decoder'] != 'shared':
            model['encoder'] = self.encoder.state_dict()
        model['decoder'] = self.decoder.state_dict()
        model['h2e'] = self.h2e.state_dict()
        model['e2o'] = self.e2o.state_dict()
        model['optims'] = {name: optimizer.state_dict()
                           for name, optimizer in self.optims.items()}
        model['longest_label'] = self.longest_label
        model['opt'] = self.opt

        # optional attention modules are saved only when present
        for attn_name in ['attn', 'attn_v', 'attn_combine']:
            if hasattr(self, attn_name):
                model[attn_name] = getattr(self, attn_name).state_dict()

        with open(path, 'wb') as write:
            torch.save(model, write)
Пример #9
0
    def save(self, save_optimizer=False, save_path=None, **kwargs):
        """serialize models include optimizer and other info
        return path where the model-file is stored.

        Args:
            save_optimizer (bool): whether save optimizer.state_dict().
            save_path (string): where to save model, if it's None, save_path
                is generate using time str and info from kwargs.

        Returns:
            save_path(str): the path to save models.
        """
        save_dict = dict()

        save_dict['model'] = self.faster_rcnn.state_dict()
        # `opt` is a module-level config object; snapshot it so the run can
        # be reproduced from the checkpoint alone
        save_dict['config'] = opt._state_dict()
        save_dict['other_info'] = kwargs
        save_dict['vis_info'] = self.vis.state_dict()

        if save_optimizer:
            save_dict['optimizer'] = self.optimizer.state_dict()

        if save_path is None:
            # e.g. checkpoints/fasterrcnn_MMDDHHMM_<kwarg values...>
            timestr = time.strftime('%m%d%H%M')
            save_path = 'checkpoints/fasterrcnn_%s' % timestr
            for k_, v_ in kwargs.items():
                save_path += '_%s' % v_

        # `t` is this project's alias for the torch module
        t.save(save_dict, save_path)
        # persist the visdom environment alongside the checkpoint
        self.vis.save([self.vis.env])
        return save_path
Пример #10
0
def test(epoch, best_acc):
    """Evaluate on the test set; checkpoint the model when accuracy improves.

    Uses the module-level ``model``, ``test_loader``, ``args`` and
    ``model_name``. Returns ``(test_loss, test_acc)``.
    """
    slope = get_slope(epoch)

    model.eval()
    test_loss = 0.0
    correct = 0.0
    # replaces the deprecated Variable(..., volatile=True) inference mode
    with torch.no_grad():
        for data, target in test_loader:
            if args.cuda:
                data, target = data.cuda(), target.cuda()
            output = model((data, slope))
            # sum up batch loss (reduction='sum' is the modern spelling of
            # the deprecated size_average=False)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.data.max(1, keepdim=True)[1]  # get the index of the max log-probability
            correct += pred.eq(target.data.view_as(pred)).cpu().sum().item()

    test_loss /= len(test_loader.dataset)
    test_acc = correct / len(test_loader.dataset)
    # BUG FIX: the original used a Python 2 print statement, which is a
    # SyntaxError under Python 3 (the rest of this file is Python 3).
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
          test_loss, int(correct), len(test_loader.dataset),
          100. * test_acc))

    if test_acc >= best_acc:
        torch.save(model.state_dict(), os.path.join('models', '{}.pth'.format(model_name)))

    return test_loss, test_acc
Пример #11
0
def train(train_iter, dev_iter, test_iter, model_lstm, text_field, label_field, args):
    """Train the LSTM classifier, checkpointing on every new best test accuracy.

    Training halts (via exit()) after 10 consecutive epochs without a new
    best test accuracy.
    """
    loss_function = nn.NLLLoss()
    optimizer = optim.Adam(model_lstm.parameters(), lr=1e-3)
    best_test_acc = 0.0
    no_up = 0

    for i in range(1, args.epochs+1):
        print('epoch: %d start!' % i)
        train_epoch(model_lstm, train_iter, dev_iter, test_iter, loss_function, optimizer, i, args)
        # dev evaluation is run for its logging side effects
        dev_acc = evaluate(model_lstm, dev_iter, loss_function, 'dev')
        test_acc = evaluate(model_lstm, test_iter, loss_function, 'test')

        if test_acc <= best_test_acc:
            no_up += 1
            if no_up >= 10:
                exit()
        else:
            print('New Best Test!!!')
            best_test_acc = test_acc
            if not os.path.isdir(args.save_dir):
                os.makedirs(args.save_dir)
            save_prefix = os.path.join(args.save_dir, 'snapshot')
            save_path = '{}epoch{}.pt'.format(save_prefix, i)
            # the entire model object is pickled, not just its state dict
            torch.save(model_lstm, save_path)
            no_up = 0
        print('now best test acc:', best_test_acc)
Пример #12
0
def save_checkpoint(state, track_list, filename):
    """Write a training checkpoint.

    ``track_list`` is dumped as JSON next to the checkpoint
    (``filename + '.json'``) and ``state`` is serialized by torch to
    ``filename + '.model'``.
    """
    with open(filename + '.json', 'w') as handle:
        json.dump(track_list, handle)
    torch.save(state, filename + '.model')
def extract_features_targets(model, features_size, loader, path_data, cuda=False):
    """Run ``model`` over ``loader`` and cache (features, targets) at ``path_data``.

    Returns the cached tensors immediately when ``path_data`` exists.
    Batches are expected to be indexable as (img, _, target); rows are
    written into preallocated tensors by absolute dataset position.

    Returns:
        (features, targets): tensors of shape (N, features_size) and
        (N, num_classes).
    """
    if os.path.isfile(path_data):
        print('Load features from {}'.format(path_data))
        return torch.load(path_data)

    print('\nExtract features on {}set'.format(loader.dataset.set))

    features = torch.Tensor(len(loader.dataset), features_size)
    targets = torch.Tensor(len(loader.dataset), len(loader.dataset.classes))

    for batch_id, batch in enumerate(tqdm(loader)):
        img = batch[0]
        target = batch[2]
        current_bsize = img.size(0)
        from_ = int(batch_id * loader.batch_size)
        to_ = int(from_ + current_bsize)

        if cuda:
            # BUG FIX: `async` became a reserved keyword in Python 3.7
            # (SyntaxError); torch renamed the argument to non_blocking.
            img = img.cuda(non_blocking=True)

        input = Variable(img, requires_grad=False)
        output = model(input)

        features[from_:to_] = output.data.cpu()
        targets[from_:to_] = target

    # portable replacement for os.system('mkdir -p ...')
    if os.path.dirname(path_data):
        os.makedirs(os.path.dirname(path_data), exist_ok=True)
    print('save ' + path_data)
    torch.save((features, targets), path_data)
    print('')
    return features, targets
def train(args):
    """Imitation-learning training loop over an HDF5 gameplay dataset.

    Streams (screens, variables, labels, terminals) batches from
    ``data_generator``, trains a BaseModelLSTM with cross-entropy, steps the
    optimizer once per episode (gradients carried across the episode's
    batches), and periodically pickles the whole model to
    ``args.checkpoint_file``.
    """

    data_file = h5py.File(args.h5_path, 'r')
    screens = data_file['screens']
    variables = data_file['variables']
    labels = data_file['action_labels']
    print('Dataset size =', len(screens))
    action_sets = data_file['action_sets'][:]
    episodes = data_file['episodes'][:]
    input_shape = screens[0].shape
    train_generator = data_generator(args, screens, variables, labels, episodes)

    model = BaseModelLSTM(input_shape[0], len(action_sets), variables.shape[1])

    #source_model = torch.load('imitation_model_lstm_bn0.pth')
    #model.load_state_dict(source_model.state_dict())
    #del source_model

    if USE_CUDA:
        model.cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=5e-4)
    optimizer.zero_grad()
    running_loss = 0
    running_accuracy = 0
    batch_time = time.time()
    cp = 0

    for batch, (screens, variables, labels, terminals) in enumerate(train_generator):
        screens, variables, labels = Variable(screens), Variable(variables), Variable(labels)
        outputs = model(screens, variables)
        loss = criterion(outputs, labels)
        model.set_terminal(terminals)

        # NOTE(review): loss.data[0] is pre-0.4 PyTorch; on modern versions
        # this raises for 0-dim tensors and should read loss.item().
        running_loss += loss.data[0]
        _, pred = outputs.data.max(1)
        accuracy = (pred == labels.data).float().mean()
        running_accuracy += accuracy

        if batch % args.episode_size == args.episode_size - 1:
            # NOTE(review): only the final batch's loss in the episode is
            # backpropagated here — confirm this is intended rather than an
            # accumulated episode loss.
            loss.backward()
            optimizer.step()
            model.reset()
            optimizer.zero_grad()

            running_loss /= args.episode_size
            running_accuracy /= args.episode_size

            print(
                '[{:d}] loss: {:.3f}, accuracy: {:.3f}, time: {:.6f}'.format(
                    batch + 1, running_loss, running_accuracy, time.time()-batch_time
                )
            )
            running_loss = 0
            running_accuracy = 0
            batch_time = time.time()

        if batch % args.checkpoint_rate == args.checkpoint_rate - 1:
            cp += 1
            # the entire model object is saved (same file, overwritten)
            torch.save(model, args.checkpoint_file)
Пример #15
0
def torch_to_pytorch(model, t7_file, output):
    """Copy weights from a Lua Torch .t7 checkpoint into ``model`` and save.

    Both the PyTorch model and the t7 checkpoint are flattened into linear
    layer lists; layers are matched positionally via ``layer_map`` and their
    parameters copied with ``load_params``. The resulting state dict is
    written to ``output``.

    Raises:
        RuntimeError: when positionally-matched layer types disagree.
    """
    py_layers = []
    for layer in list(model.children()):
        py_layer_serial(layer, py_layers)

    t7_data = torchfile.load(t7_file)
    t7_layers = []
    for layer in t7_data:
        torch_layer_serial(layer, t7_layers)

    j = 0
    for i, py_layer in enumerate(py_layers):
        py_name = type(py_layer).__name__
        t7_layer = t7_layers[j]
        t7_name = t7_layer[0].split('.')[-1]
        if layer_map[t7_name] != py_name:
            raise RuntimeError('%s does not match %s' % (py_name, t7_name))

        if py_name == 'LSTM':
            # one PyTorch LSTM corresponds to several consecutive t7 layers:
            # one per direction per stacked layer
            n_layer = 2 if py_layer.bidirectional else 1
            n_layer *= py_layer.num_layers
            t7_layer = t7_layers[j:j + n_layer]
            j += n_layer
        else:
            j += 1

        load_params(py_layer, t7_layer)

    torch.save(model.state_dict(), output)
Пример #16
0
def train(net):
    """Train the FaceBoxes-style detector for 1000 epochs on VOC.

    Relies on module-level ``optimizer``, ``criterion`` and ``device``.
    Priors (anchors) are generated once up front; the learning rate is
    dropped to 1e-4 after epoch 500, and the state dict is rewritten to
    'Final_FaceBoxes.pth' at the end of every epoch.
    """
    net.train()
    priorbox = PriorBox()
    # anchors are fixed, so generate them once outside the training loop
    with torch.no_grad():
        priors = priorbox.forward()
        priors = priors.to(device)

    dataloader = DataLoader(VOCDetection(), batch_size=2, collate_fn=detection_collate, num_workers=12)

    for epoch in range(1000):
        loss_ls, loss_cs = [], []
        load_t0 = time.time()
        if epoch > 500:
            adjust_learning_rate(optimizer, 1e-4)

        for images, targets in dataloader:
            images = images.to(device)
            targets = [anno.to(device) for anno in targets]
            out = net(images)
            optimizer.zero_grad()
            # localization + classification losses; loc loss is weighted 2x
            loss_l, loss_c = criterion(out, priors, targets)

            loss = 2 * loss_l + loss_c
            loss.backward()
            optimizer.step()
            loss_cs.append(loss_c.item())
            loss_ls.append(loss_l.item())
        load_t1 = time.time()

        print(f'{np.mean(loss_cs)}, {np.mean(loss_ls)} time:{load_t1-load_t0}')
        torch.save(net.state_dict(), 'Final_FaceBoxes.pth')
Пример #17
0
def decompose_model_seq(model, layer_name, model_file):
    """Replace the layer matching ``layer_name`` in a sequential model with a
    low-rank decomposition, save the modified model, and return it.

    CP decomposition is used when ``args.cp`` is set, Tucker otherwise.

    NOTE(review): if no module name contains ``layer_name``, ``decomposed``
    is never assigned and the assignment below raises NameError; if several
    names match, only the last decomposition is kept. Verify callers pass a
    unique, existing layer name.
    """
    print(model)
    # decomposition runs on CPU
    model.cpu()
    for i, (name, conv_layer) in enumerate(model.named_modules()):
        ## for sequential nets, 'in' is sufficient
        ## as long as there are not 2 homonimous layers
        if layer_name in name:
            print(name)

            if args.cp:
                rank = max(conv_layer.weight.data.shape) // 3
                rank, _ = choose_compression(
                    conv_layer, ranks=[rank, rank], compression_factor=5, flag='cpd')
                print('rank: ', rank)

                # the heuristic rank above is superseded by cp_ranks
                rank = cp_ranks(conv_layer)
                print('rank: ', rank)

                decomposed = cp_decomposition_conv_layer_BN(conv_layer, rank, matlab=False)
                # decomposed = cp_xavier_conv_layer(conv_layer, rank)
            else:
                
                decomposed = tucker_decomposition_conv_layer(conv_layer)

    # first modules return a sequential, then we need to call the proper layer 
    model._modules['sequential']._modules[layer_name] = decomposed 
    torch.save(model, model_file)
    return model
Пример #18
0
def train(train_loader, num_epochs=20):
    """Train the convolutional denoising autoencoder.

    Runs ``num_epochs`` passes over ``train_loader`` with BCE reconstruction
    loss and Adadelta, printing the loss after every batch, then saves the
    final state dict to MODELS_DIR/autoencoder.pth.
    """
    autoencoder = deep_models.ConvolutionalDenoisingAutoencoder()
    criterion = nn.BCELoss() # binary cross entropy loss
    optimizer = optim.Adadelta(autoencoder.parameters())

    for epoch in range(num_epochs):
        for i, data in enumerate(train_loader, 0):
            ## zero the gradient params
            optimizer.zero_grad()
            ### forward + backprop + optimize
            output = autoencoder(data)
            loss = criterion(output, data)
            loss.backward()
            optimizer.step()
            # BUG FIX: loss.data[0] is pre-0.4 PyTorch and raises on modern
            # versions (0-dim tensor); use loss.item(), as the rest of the
            # codebase does.
            print(
                'epoch [{}/{}], loss:{:.4f}'
                .format(epoch+1, num_epochs, loss.item())
            )
    torch.save(autoencoder.state_dict(), join(MODELS_DIR, 'autoencoder.pth'))
    return
Пример #19
0
def train_model(args):
    """Train, evaluate, and persist the sonar classifier.

    Loads the train/test split, runs ``args.epochs`` rounds of
    train()/test(), writes the final state dict to ``args.model_name``, and
    optionally mirrors it to GCS when ``args.model_dir`` is set.
    """
    torch.manual_seed(args.seed)

    # dataset
    train_loader, test_loader = data_utils.load_data(
        args.test_split, args.batch_size)

    # model + plain SGD (Nesterov momentum disabled)
    network = model.SonarDNN().double()
    optimizer = optim.SGD(network.parameters(), lr=args.lr,
                          momentum=args.momentum, nesterov=False)

    # one train + test pass per epoch
    for epoch in range(1, args.epochs + 1):
        train(network, train_loader, optimizer, epoch)
        test(network, test_loader)

    # export the trained weights
    torch.save(network.state_dict(), args.model_name)

    if args.model_dir:
        # upload the checkpoint to GCS
        data_utils.save_model(args.model_dir, args.model_name)
Пример #20
0
    def on_end_epoch(state):
        """torchnet engine hook: log training metrics, run the test pass,
        checkpoint the model, and log reconstruction visualizations.
        """
        print('[Epoch %d] Training Loss: %.4f (Accuracy: %.2f%%)' % (
            state['epoch'], meter_loss.value()[0], meter_accuracy.value()[0]))

        train_loss_logger.log(state['epoch'], meter_loss.value()[0])
        train_error_logger.log(state['epoch'], meter_accuracy.value()[0])

        # clear the meters so the test pass accumulates fresh statistics
        reset_meters()

        engine.test(processor, get_iterator(False))
        test_loss_logger.log(state['epoch'], meter_loss.value()[0])
        test_accuracy_logger.log(state['epoch'], meter_accuracy.value()[0])
        confusion_logger.log(confusion_meter.value())

        print('[Epoch %d] Testing Loss: %.4f (Accuracy: %.2f%%)' % (
            state['epoch'], meter_loss.value()[0], meter_accuracy.value()[0]))

        torch.save(model.state_dict(), 'epochs/epoch_%d.pt' % state['epoch'])

        # Reconstruction visualization.

        test_sample = next(iter(get_iterator(False)))

        # scale raw uint8 images into [0, 1] floats with a channel dim
        ground_truth = (test_sample[0].unsqueeze(1).float() / 255.0)
        _, reconstructions = model(Variable(ground_truth).cuda())
        reconstruction = reconstructions.cpu().view_as(ground_truth).data

        ground_truth_logger.log(
            make_grid(ground_truth, nrow=int(BATCH_SIZE ** 0.5), normalize=True, range=(0, 1)).numpy())
        reconstruction_logger.log(
            make_grid(reconstruction, nrow=int(BATCH_SIZE ** 0.5), normalize=True, range=(0, 1)).numpy())
Пример #21
0
    def save_network(self, network, network_name, epoch_count, gpu_ids):
        """Checkpoint ``network``'s CPU state dict under ``self.save_dir``.

        The file is named ``<network_name>-<epoch_count>``. The module is
        moved to CPU for saving and pushed back onto the first GPU in
        ``gpu_ids`` when CUDA is available.
        """
        target = os.path.join(self.save_dir, network_name + '-' + str(epoch_count))
        torch.save(network.cpu().state_dict(), target)

        if len(gpu_ids) and torch.cuda.is_available():
            network.cuda(device=gpu_ids[0])
Пример #22
0
    def test_serialization_built_vocab(self):
        """Round-trip a torchtext Field with a built vocab through
        torch.save/torch.load and verify equality plus identical
        numericalization of new example data.
        """
        self.write_test_ppid_dataset(data_format="tsv")
        question_field = data.Field(sequential=True)
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", None)]
        tsv_dataset = data.TabularDataset(
            path=self.test_ppid_dataset_path, format="tsv",
            fields=tsv_fields)

        question_field.build_vocab(tsv_dataset)

        question_pickle_filename = "question.pl"
        question_pickle_path = os.path.join(self.test_dir, question_pickle_filename)
        torch.save(question_field, question_pickle_path)

        loaded_question_field = torch.load(question_pickle_path)

        assert loaded_question_field == question_field

        # includes OOV tokens and explicit padding to exercise the vocab
        test_example_data = [["When", "do", "you", "use", "シ",
                              "instead", "of", "し?"],
                             ["What", "is", "2+2", "<pad>", "<pad>",
                              "<pad>", "<pad>", "<pad>"],
                             ["Here", "is", "a", "sentence", "with",
                              "some", "oovs", "<pad>"]]

        # Test results of numericalization
        original_numericalization = question_field.numericalize(test_example_data)
        pickled_numericalization = loaded_question_field.numericalize(test_example_data)

        assert torch.all(torch.eq(original_numericalization, pickled_numericalization))
Пример #23
0
def save(net, filename):
    """Serialize a network's constructor args and weights to ``filename``.

    DataParallel wrappers are unwrapped first so the stored state-dict keys
    are not prefixed with 'module.'.
    """
    target = net.module if isinstance(net, nn.DataParallel) else net
    payload = dict(args=target.args,
                   state_dict=target.state_dict())
    torch.save(payload, filename)
    def test_load_to_gpu_from_gpu(self):
        """Verify the pretrained-weight applicator works entirely on the GPU.

        Requires a CUDA device: both nets are moved to GPU 0, net2's weights
        are saved and applied onto net1's matching layers, and the copied
        parameters must remain on the GPU and match exactly.
        """
        # This test will make sure that the initializer works on the GPU
        self.net1.cuda(device=0)
        self.net2.cuda(device=0)

        # Verify the parameters are on the GPU
        assert self.net1.linear_1.weight.is_cuda is True
        assert self.net1.linear_1.bias.is_cuda is True
        assert self.net2.linear_1.weight.is_cuda is True
        assert self.net2.linear_1.bias.is_cuda is True

        # We need to manually save the parameters to a file because setUp()
        # only does it for the CPU
        temp_file = self.TEST_DIR / "gpu_weights.th"
        torch.save(self.net2.state_dict(), temp_file)

        applicator = self._get_applicator("linear_1.*", temp_file)
        applicator(self.net1)

        # Verify the parameters are still on the GPU
        assert self.net1.linear_1.weight.is_cuda is True
        assert self.net1.linear_1.bias.is_cuda is True
        assert self.net2.linear_1.weight.is_cuda is True
        assert self.net2.linear_1.bias.is_cuda is True

        # Make sure the weights are identical
        assert self._are_equal(self.net1.linear_1, self.net2.linear_1)
def save():
    """Fit a small regression net on the module-level (x, y) tensors, plot
    the fit, then demonstrate the two ways of saving a network.
    """
    # save net1
    net1 = torch.nn.Sequential(
        torch.nn.Linear(1, 10),
        torch.nn.ReLU(),
        torch.nn.Linear(10, 1)
    )
    optimizer = torch.optim.SGD(net1.parameters(), lr=0.5)
    loss_func = torch.nn.MSELoss()

    # standard regression loop over the global (x, y) dataset
    for t in range(100):
        prediction = net1(x)
        loss = loss_func(prediction, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # plot result
    plt.figure(1, figsize=(10, 3))
    plt.subplot(131)
    plt.title('Net1')
    plt.scatter(x.data.numpy(), y.data.numpy())
    plt.plot(x.data.numpy(), prediction.data.numpy(), 'r-', lw=5)

    # 2 ways to save the net
    torch.save(net1, 'net.pkl')  # save entire net
    torch.save(net1.state_dict(), 'net_params.pkl')   # save only the parameters
Пример #26
0
def save_checkpoint(model, output_path):
    """Serialize the entire ``model`` object to ``output_path`` and log it.

    Note: this pickles the whole model, not just a state dict.
    """
    torch.save(model, output_path)
    print("Checkpoint saved to {}".format(output_path))
Пример #27
0
def test(epoch):
    """Evaluate ``net`` on ``testloader``; checkpoint on a new best accuracy.

    Uses the module-level ``net``, ``testloader``, ``criterion``,
    ``use_cuda`` and ``best_acc`` globals.
    """
    global best_acc
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    for batch_idx, (inputs, targets) in enumerate(testloader):
        inputs, targets = Variable(inputs), Variable(targets)
        outputs = net(inputs)
        loss = criterion(outputs, targets)

        test_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += targets.size(0)
        correct += predicted.eq(targets.data).cpu().sum()

        progress_bar(batch_idx, len(testloader), 'Loss: %.3f | Acc: %.3f%% (%d/%d)'
                     % (test_loss / (batch_idx + 1), 100. * correct / total, correct, total))

    # Save checkpoint.
    # BUG FIX: the original ran this block inside the batch loop, re-saving
    # the checkpoint on every batch based on partial accuracy; it belongs
    # after the full pass over the test set.
    acc = 100. * correct / total
    if acc > best_acc:
        print('Saving..')
        state = {
            'net': net.module if use_cuda else net,
            'acc': acc,
            'epoch': epoch,
        }
        if not os.path.isdir('checkpoint'):
            os.mkdir('checkpoint')
        torch.save(state, './checkpoint/ckpt.t7')
        best_acc = acc
Пример #28
0
    def _comput_mean(self):
        """Compute (or load cached) per-channel mean/std of the training images.

        The statistics are cached at ``meanstd_file``; on the first call they
        are accumulated over every training image and saved.

        Returns:
            (mean, std): two 3-element tensors of RGB channel statistics.
        """
        meanstd_file = './data/300W_LP/mean.pth.tar'
        if os.path.isfile(meanstd_file):
            ms = torch.load(meanstd_file)
        else:
            print("\tcomputing mean and std for the first time, it may takes a while, drink a cup of coffe...")
            mean = torch.zeros(3)
            std = torch.zeros(3)
            if self.is_train:
                for i in range(self.total):
                    # (removed an unused local that merely aliased self.anno[i])
                    img_path = os.path.join(self.img_folder, self.anno[i].split('_')[0],
                                            self.anno[i][:-8] + '.jpg')
                    img = load_image(img_path)
                    # per-channel statistics over all pixels of this image
                    mean += img.view(img.size(0), -1).mean(1)
                    std += img.view(img.size(0), -1).std(1)

            mean /= self.total
            std /= self.total
            ms = {
                'mean': mean,
                'std': std,
            }
            torch.save(ms, meanstd_file)
        if self.is_train:
            print('\tMean: %.4f, %.4f, %.4f' % (ms['mean'][0], ms['mean'][1], ms['mean'][2]))
            print('\tStd:  %.4f, %.4f, %.4f' % (ms['std'][0], ms['std'][1], ms['std'][2]))
        return ms['mean'], ms['std']
Пример #29
0
    def save_model(self):
        """Write ``self.model``'s state dict to the configured path.

        When a local ``data`` directory exists, the checkpoint is placed
        inside it; otherwise ``self.config.data_path`` is used as-is.
        """
        destination = self.config.data_path
        if os.path.isdir('data'):
            destination = 'data/{0}'.format(self.config.data_path)

        print('save model parameters to {0}'.format(destination))
        torch.save(self.model.state_dict(), destination)
Пример #30
0
    def test_serialization(self):
        """Round-trip a torchtext NestedField through torch.save/torch.load
        and verify equality plus identical numericalization of padded
        character-level example data.
        """
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        # expected char-level tokenization with <w>/</w> word markers,
        # <s>/</s> sentence markers and <cpad> padding
        examples_data = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>"] + list("loves") + ["</w>"],
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ],
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>"] + list("cries") + ["</w>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                ["<cpad>"] * 7,
            ]
        ]

        field_pickle_filename = "char_field.pl"
        field_pickle_path = os.path.join(self.test_dir, field_pickle_filename)
        torch.save(field, field_pickle_path)

        loaded_field = torch.load(field_pickle_path)
        assert loaded_field == field

        original_numericalization = field.numericalize(examples_data)
        pickled_numericalization = loaded_field.numericalize(examples_data)

        assert torch.all(torch.eq(original_numericalization, pickled_numericalization))
Пример #31
0
def train(args, train_dataset, model, tokenizer):
    """Fine-tune a transformers classification model on `train_dataset`.

    Supports single-GPU, DataParallel multi-GPU, DistributedDataParallel, and
    apex fp16 training, with gradient accumulation, linear warmup/decay
    scheduling, periodic TensorBoard logging, and periodic checkpointing.

    Args:
        args: argparse namespace with all hyper-parameters (batch sizes,
            learning rate, fp16/distributed flags, logging/save intervals, ...).
        train_dataset: TensorDataset of pre-tensorized training features.
        model: model to train; wrapped here for multi-GPU/distributed as needed.
        tokenizer: passed through to evaluate() when evaluating during training.

    Returns:
        (global_step, tr_loss / global_step): total optimization steps taken
        and the average training loss per step.
    """
    # TensorBoard writer only on the main process (rank -1 = non-distributed).
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    # Effective per-node batch size scales with the number of visible GPUs.
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    # t_total = number of optimizer updates; max_steps overrides epoch count.
    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay).
    # Biases and LayerNorm weights are excluded from weight decay, per the
    # standard BERT fine-tuning recipe.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            # Batch layout: [input_ids, attention_mask, token_type_ids, labels].
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet'
                ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                # Scale loss so accumulated gradients sum to the full-batch gradient.
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            # Only step the optimizer once per accumulation window.
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    # Report the mean loss since the previous logging point.
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Пример #32
0
 def save_network(network, epoch_label):
     """Write the network's weights (moved to CPU first) to
     ./model/<name>/net_<epoch_label>.pth, then restore the network to the
     GPU if CUDA is available.

     NOTE: `name` and `gpu_ids` are resolved from the enclosing scope.
     """
     filename = 'net_%s.pth' % epoch_label
     destination = os.path.join('./model', name, filename)
     torch.save(network.cpu().state_dict(), destination)
     if torch.cuda.is_available():
         network.cuda(gpu_ids[0])
Пример #33
0
def train():
    """Train a CTC-CRF speech-recognition model from command-line arguments.

    Parses CLI options, builds the model (optionally resuming from a
    pretrained checkpoint), then alternates training and cross-validation
    epochs indefinitely, decaying the learning rate by 10x whenever cv loss
    stops improving, until the learning rate drops below --stop_lr.

    Side effects: writes config.json, TensorBoard logs under <dir>/board,
    per-epoch checkpoints (<dir>/model.epoch.N) and <dir>/best_model.
    """
    parser = argparse.ArgumentParser(description="recognition argument")
    parser.add_argument("dir", default="models")
    parser.add_argument(
        "--arch",
        choices=[
            'BLSTM', 'LSTM', 'VGGBLSTM', 'VGGLSTM', 'LSTMrowCONV', 'TDNN_LSTM',
            'BLSTMN'
        ],
        default='BLSTM')
    parser.add_argument("--min_epoch", type=int, default=15)
    parser.add_argument("--output_unit", type=int)
    parser.add_argument("--lamb", type=float, default=0.1)
    parser.add_argument("--hdim", type=int, default=512)
    parser.add_argument("--layers", type=int, default=6)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--feature_size", type=int, default=120)
    parser.add_argument("--data_path")
    parser.add_argument("--lr", type=float,default=0.001)
    parser.add_argument("--stop_lr", type=float,default=0.00001)
    parser.add_argument("--resume", action="store_true")
    parser.add_argument("--pkl", action="store_true")
    parser.add_argument("--pretrained_model_path")
    args = parser.parse_args()

    os.makedirs(args.dir + '/board', exist_ok=True)
    writer = SummaryWriter(args.dir +'/board')
    # save configuration
    with open(args.dir + '/config.json', "w") as fout:
        config = {
            "arch": args.arch,
            "output_unit": args.output_unit,
            "hdim": args.hdim,
            "layers": args.layers,
            "dropout": args.dropout,
            "feature_size": args.feature_size,
        }
        json.dump(config, fout)

    model = Model(args.arch, args.feature_size, args.hdim, args.output_unit,
                  args.layers, args.dropout, args.lamb)

    # Resume loads weights only (no optimizer/epoch state is restored).
    if args.resume:
        print("resume from {}".format(args.pretrained_model_path))
        pretrained_dict = torch.load(args.pretrained_model_path)
        model.load_state_dict(pretrained_dict)

    device = torch.device("cuda:0")
    model.cuda()
    model = nn.DataParallel(model)
    model.to(device)

    lr = args.lr
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Training set: pickle or HDF5 backend, both fully loaded into memory.
    if args.pkl:
        tr_dataset = SpeechDatasetMemPickle(args.data_path + "/tr.pkl")
    else:
        tr_dataset = SpeechDatasetMem(args.data_path + "/tr.hdf5")

    tr_dataloader = DataLoader(
        tr_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        pin_memory=True,
        num_workers=0,
        collate_fn=PadCollate())

    if args.pkl:
        cv_dataset = SpeechDatasetMemPickle(args.data_path + "/cv.pkl")
    else:
        cv_dataset = SpeechDatasetMem(args.data_path + "/cv.hdf5")

    cv_dataloader = DataLoader(
        cv_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        pin_memory=True,
        num_workers=0,
        collate_fn=PadCollate())

    # NOTE(review): prev_t/t2 below are leftover timing instrumentation; they
    # are assigned but never used for output.
    prev_t = 0
    epoch = 0
    prev_cv_loss = np.inf
    model.train()
    while True:
        # training stage
        # NOTE(review): this overwrites <dir>/best_model with the *current*
        # weights at the start of every epoch, which appears to defeat the
        # "only save when cv loss improves" logic further down — confirm intent.
        torch.save(model.module.state_dict(), args.dir + "/best_model")
        epoch += 1

        for i, minibatch in enumerate(tr_dataloader):
            print("training epoch: {}, step: {}".format(epoch, i))
            logits, input_lengths, labels_padded, label_lengths, path_weights = minibatch

            sys.stdout.flush()
            model.zero_grad()
            optimizer.zero_grad()

            loss = model(logits, labels_padded, input_lengths, label_lengths)
            # The reported loss subtracts the mean path weight (CRF
            # normalization term) from the raw partial loss.
            partial_loss = torch.mean(loss.cpu())
            weight = torch.mean(path_weights)
            real_loss = partial_loss - weight

            # DataParallel returns one loss per GPU; backward with a ones
            # vector sums the per-GPU gradients. TARGET_GPUS is assumed to be
            # a module-level list of the GPUs in use — TODO confirm.
            loss.backward(loss.new_ones(len(TARGET_GPUS)))

            optimizer.step()
            t2 = timeit.default_timer()
            writer.add_scalar('training loss',
                            real_loss.item(),
                            (epoch-1) * len(tr_dataloader) + i)
            prev_t = t2

        # save model
        torch.save(model.module.state_dict(),
                   args.dir + "/model.epoch.{}".format(epoch))

        # cv stage
        model.eval()
        cv_losses_sum = []
        count = 0

        for i, minibatch in enumerate(cv_dataloader):
            print("cv epoch: {}, step: {}".format(epoch, i))
            logits, input_lengths, labels_padded, label_lengths, path_weights = minibatch

            loss = model(logits, labels_padded, input_lengths, label_lengths)
            loss_size = loss.size(0)
            count = count + loss_size
            partial_loss = torch.mean(loss.cpu())
            weight = torch.mean(path_weights)
            real_loss = partial_loss - weight
            # Weight each batch's mean loss by its size to get a proper
            # dataset-level average after the loop.
            real_loss_sum = real_loss * loss_size
            cv_losses_sum.append(real_loss_sum.item())
            print("cv_real_loss: {}".format(real_loss.item()))

        cv_loss = np.sum(np.asarray(cv_losses_sum)) / count
        print("mean_cv_loss: {}".format(cv_loss))

        writer.add_scalar('mean_cv_loss',cv_loss,epoch)
        # Always keep training for the first min_epoch epochs; afterwards,
        # decay the LR whenever cv loss fails to improve.
        if epoch < args.min_epoch or cv_loss <= prev_cv_loss:
            torch.save(model.module.state_dict(), args.dir + "/best_model")
            prev_cv_loss = cv_loss
        else:
            print(
                "cv loss does not improve, decay the learning rate from {} to {}"
                .format(lr, lr / 10.0))
            adjust_lr(optimizer, lr / 10.0)
            lr = lr / 10.0
            if (lr < args.stop_lr):
                print("learning rate is too small, finish training")
                break

        model.train()

    # `gpus` is assumed to be a module-level handle set up before training —
    # TODO confirm against the ctc_crf_base init call elsewhere in the file.
    ctc_crf_base.release_env(gpus)
Пример #34
0
# %%
# other example
plot_knn_examples(embeddings)

# %%
# What's next?

# You could use the pre-trained model and train a classifier on top.
pretrained_resnet_backbone = model.backbone

# you can also store the backbone and use it in another code
state_dict = {
    'resnet18_parameters': pretrained_resnet_backbone.state_dict()
}
torch.save(state_dict, 'model.pth')

# %%
# THIS COULD BE IN A NEW FILE (e.g. inference.py)
#
# Make sure you place the `model.pth` file in the same folder as this code

# load the model in a new file for inference
resnet18_new = torchvision.models.resnet18()
# BUGFIX: the model created above is bound to `resnet18_new`; the previous
# code referenced an undefined name `resnet` here, which raises NameError.
last_conv_channels = list(resnet18_new.children())[-1].in_features
# note that we need to create exactly the same backbone in order to load the weights
backbone_new = nn.Sequential(
    *list(resnet18_new.children())[:-1],
    nn.Conv2d(last_conv_channels, num_ftrs, 1),
)
     z=znew
     put_trainable_values(net1,z)
     put_trainable_values(net2,z)
     put_trainable_values(net3,z)

     if check_results:
       verification_error_check(net1,net2,net3)
  

print('Finished Training')


if save_model:
    # Persist each sub-network checkpoint together with the shared epoch
    # counter, its optimizer state, and its latest running loss.
    checkpoints = [
        (net1, opt1, running_loss1, './s1.model'),
        (net2, opt2, running_loss2, './s2.model'),
        (net3, opt3, running_loss3, './s3.model'),
    ]
    for net, opt, running, path in checkpoints:
        torch.save({
            'model_state_dict': net.state_dict(),
            'epoch': epoch,
            'optimizer_state_dict': opt.state_dict(),
            'running_loss': running,
        }, path)
Пример #36
0
    #         t+=1
    #     # if svm_predict == label:
    #     #     s+=1
    #     #print(t,s)
    # # here map is acc

    # knn_map = (t / float(len(test_loader)))
    #svm_map = (s / float(len(test_loader)))
    map = CalcHR.CalcMap(T,H_B,test_labels_onehot.numpy(),train_labels_onehot.numpy())
    print('####################################')
#    print('knn_map:',knn_map)
    print('map:',map)
    #print('svm_map:',svm_map)
    #map = round(map,5)

    if map > max_map:
        max_map = map
        np.save(str(opt.bit)+"H_B.npy",H_B)
        np.save(str(opt.bit)+'test.npy',T)
        np.save('train_label.npy',train_labels_onehot.numpy())
        np.save('test_label.npy',test_labels_onehot.numpy())
        torch.save(G,'./G3_models.pt')
        torch.save(H,'./H3_models.pt')







Пример #37
0
        optimizerD.step()

        # Update G with fake data
        netG.zero_grad()
        y_fake_r = netD(fake_maps)
        loss_G = criterion(y_fake_r, real_label)
        loss_G.backward()
        optimizerG.step()

    # print info about losses and save them to file
    print('Epoch {} loss_D_real: {:.4f} loss_D_fake: {:.4f} loss_G: {:.4f}'\
          .format(epoch, loss_D_real.mean().item(), loss_D_fake.mean().item(),
                  loss_G.mean().item()))
    f = open('%s/losses/loss_64.txt' % root_out, 'a')
    f.write('%d %.3e %.3e %.3e\n' %
            (epoch, loss_D_real.mean().item(), loss_D_fake.mean().item(),
             loss_G.mean().item()))
    f.close()

    # save images from generator to file
    with torch.no_grad():
        viz_sample = netG(viz_noise)[:49]
        vutils.save_image(viz_sample, '%s/images_64/fake_samples_%d.png'\
                          %(root_out,epoch), normalize=True, nrow=7, range=(-1.0, 1.0))

    # save networks
    torch.save(netG.state_dict(),
               '%s/models_64/Net_Gen_%d.pt' % (root_out, epoch))
    torch.save(netD.state_dict(),
               '%s/models_64/Net_Dis_%d.pt' % (root_out, epoch))
Пример #38
0
def load_and_cache_examples(args,
                            task,
                            tokenizer,
                            evaluate=False,
                            filter_long_sequences=False):
    """Build (or load from cache) a TensorDataset of features for a GLUE task.

    Args:
        args: argparse namespace (data_dir, model type/name, max_seq_length,
            local_rank, overwrite_cache, ...).
        task: GLUE task name used to pick the processor and output mode.
        tokenizer: tokenizer used to convert examples to features.
        evaluate: if True, load the dev split; otherwise the train split.
        filter_long_sequences: if True, drop over-length sequences instead of
            truncating them (also part of the cache-file name).

    Returns:
        TensorDataset of (input_ids, attention_mask, token_type_ids, labels).
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    processor = processors[task]()
    output_mode = output_modes[task]
    # Load data features from cache or dataset file. The cache name encodes
    # split, model, max length, task, and the filtering mode so different
    # configurations never collide.
    cached_features_file = os.path.join(
        args.data_dir, 'cached_{}_{}_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length), str(task),
            'filtered' if filter_long_sequences else 'all'))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
            # HACK(label indices are swapped in RoBERTa pretrained model)
            label_list[1], label_list[2] = label_list[2], label_list[1]
        examples = processor.get_dev_examples(
            args.data_dir) if evaluate else processor.get_train_examples(
                args.data_dir)
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=bool(
                args.model_type in ['xlnet']),  # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            filter_long_sequences=filter_long_sequences,
        )
        # Only the main process writes the cache; other ranks were held at the
        # barrier above and will read it afterwards.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    # NOTE(review): if output_mode is neither "classification" nor
    # "regression", all_labels is never bound and the TensorDataset call below
    # raises UnboundLocalError — confirm all tasks map to one of these modes.
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.long)
    elif output_mode == "regression":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.float)

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    return dataset
Пример #39
0
def main():
    """Entry point for 3D hourglass pose training/evaluation.

    Builds (or loads) a HourglassNet3D model, sets up MSE loss, RMSprop, and
    the train/val dataloaders, then either runs a single validation pass
    (--test) or trains for opt.nEpochs epochs with periodic validation,
    checkpointing, and learning-rate decay. Metrics are written through a
    file-backed Logger.
    """
    opt = opts().parse()
    now = datetime.datetime.now()
    logger = Logger(opt.saveDir + '/logs_{}'.format(now.isoformat()))

    # Either resume a whole serialized model or build a fresh network.
    if opt.loadModel != 'none':
        model = torch.load(opt.loadModel).cuda()
    else:
        model = HourglassNet3D(opt.nStack, opt.nModules, opt.nFeats,
                               opt.nRegModules).cuda()

    criterion = torch.nn.MSELoss().cuda()
    optimizer = torch.optim.RMSprop(model.parameters(),
                                    opt.LR,
                                    alpha=ref.alpha,
                                    eps=ref.epsilon,
                                    weight_decay=ref.weightDecay,
                                    momentum=ref.momentum)

    # Validation source depends on how much 3D supervision is used:
    # (near-)zero ratio3D validates on 2D MPII, otherwise on Human3.6M.
    if opt.ratio3D < ref.eps:
        val_loader = torch.utils.data.DataLoader(MPII(opt,
                                                      'val',
                                                      returnMeta=True),
                                                 batch_size=1,
                                                 shuffle=False,
                                                 num_workers=int(ref.nThreads))
    else:
        val_loader = torch.utils.data.DataLoader(H36M(opt, 'val'),
                                                 batch_size=1,
                                                 shuffle=False,
                                                 num_workers=int(ref.nThreads))

    # --test: run a single validation pass and exit without training.
    if opt.test:
        val(0, opt, val_loader, model, criterion)
        return

    train_loader = torch.utils.data.DataLoader(
        Fusion(opt, 'train'),
        batch_size=opt.trainBatch,
        shuffle=True if opt.DEBUG == 0 else False,  # deterministic order when debugging
        num_workers=int(ref.nThreads))

    for epoch in range(1, opt.nEpochs + 1):
        loss_train, acc_train, mpjpe_train, loss3d_train = train(
            epoch, opt, train_loader, model, criterion, optimizer)
        logger.scalar_summary('loss_train', loss_train, epoch)
        logger.scalar_summary('acc_train', acc_train, epoch)
        logger.scalar_summary('mpjpe_train', mpjpe_train, epoch)
        logger.scalar_summary('loss3d_train', loss3d_train, epoch)
        # Validate, checkpoint (whole model, not just state_dict), and log the
        # extended metric line only every valIntervals epochs.
        if epoch % opt.valIntervals == 0:
            loss_val, acc_val, mpjpe_val, loss3d_val = val(
                epoch, opt, val_loader, model, criterion)
            logger.scalar_summary('loss_val', loss_val, epoch)
            logger.scalar_summary('acc_val', acc_val, epoch)
            logger.scalar_summary('mpjpe_val', mpjpe_val, epoch)
            logger.scalar_summary('loss3d_val', loss3d_val, epoch)
            torch.save(model,
                       os.path.join(opt.saveDir, 'model_{}.pth'.format(epoch)))
            logger.write(
                '{:8f} {:8f} {:8f} {:8f} {:8f} {:8f} {:8f} {:8f} \n'.format(
                    loss_train, acc_train, mpjpe_train, loss3d_train, loss_val,
                    acc_val, mpjpe_val, loss3d_val))
        else:
            logger.write('{:8f} {:8f} {:8f} {:8f} \n'.format(
                loss_train, acc_train, mpjpe_train, loss3d_train))
        adjust_learning_rate(optimizer, epoch, opt.dropLR, opt.LR)
    logger.close()
Пример #40
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected in the list: " +
                        ", ".join(MODEL_CLASSES.keys()))
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: "
        + ", ".join(ALL_MODELS))
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list: " +
        ", ".join(processors.keys()))
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name")
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after tokenization. Sequences longer "
        "than this will be truncated, sequences shorter will be padded.")
    parser.add_argument(
        "--filter_long_sequences_train",
        action='store_true',
        help=
        "If set train sequences longer as max_seq_length are filtered instead of truncated."
    )
    parser.add_argument(
        "--filter_long_sequences_eval",
        action='store_true',
        help=
        "If set eval sequences longer as max_seq_length are filtered instead of truncated."
    )
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action='store_true',
        help="Rul evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs."
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")

    parser.add_argument('--logging_steps',
                        type=int,
                        default=50,
                        help="Log every X updates steps.")
    parser.add_argument('--save_steps',
                        type=int,
                        default=50,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action='store_true',
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number"
    )
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Avoid using CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        '--overwrite_cache',
        action='store_true',
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument(
        '--fp16',
        action='store_true',
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit"
    )
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="For distributed training: local_rank")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="For distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="For distant debugging.")
    args = parser.parse_args()

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl')
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank, device, args.n_gpu, bool(args.local_rank != -1),
        args.fp16)

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None)
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None)
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(
            args,
            args.task_name,
            tokenizer,
            evaluate=False,
            filter_long_sequences=args.filter_long_sequences_train)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = model.module if hasattr(
            model,
            'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(
                    glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME,
                              recursive=True)))
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                '-')[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""

            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict(
                (k + '_{}'.format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
        100. * correct / len(test_loader.dataset)))

if __name__ == '__main__':

    n_epochs = 50
    n_batch = 100

    cuda_ok = torch.cuda.is_available()
    run_device = torch.device("cuda" if cuda_ok else "cpu")

    # NOTE(review): these normalization stats and the "mnist_cnn.pt" output
    # name look copied from an MNIST example, yet CIFAR-10 is loaded below —
    # confirm they are intended.
    tfm = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    loader_kwargs = {'num_workers': 1, 'pin_memory': True} if cuda_ok else {}

    trainset = datasets.CIFAR10('./data', train=True, download=True, transform=tfm)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=n_batch,
                                              shuffle=True, **loader_kwargs)
    testset = datasets.CIFAR10('./data', train=False, transform=tfm)
    testloader = torch.utils.data.DataLoader(testset, batch_size=n_batch,
                                             shuffle=True, **loader_kwargs)

    net = Model().to(run_device)
    loss_fn = torch.nn.CrossEntropyLoss()
    sgd = optim.SGD(net.parameters(), lr=0.01)

    # one train + eval pass per epoch, then persist the final weights
    for ep in range(1, n_epochs + 1):
        train(net, run_device, trainloader, ep, sgd, loss_fn)
        test(net, run_device, testloader, loss_fn)

    torch.save(net.state_dict(), "mnist_cnn.pt")
Пример #42
0
        G_exp.add_scalar_value('G_loss', G_loss.data[0], step=batch_idx + it * train_size)
        G_loss.backward(retain_variables = True)
        G_solver.step()

    if  it % 2 == 0:
        z.data.resize_(mb_size, z_dim).normal_(0, 1)
        samples = netG(z).data.numpy()[:16]

        fig = plt.figure(figsize=(4, 4))
        gs = gridspec.GridSpec(4, 4)
        gs.update(wspace=0.05, hspace=0.05)

        for index, sample in  enumerate(samples):
            ax = plt.subplot(gs[index])
            plt.axis('off')
            ax.set_xticklabels([])
            ax.set_yticklabels([])
            ax.set_aspect('equal')
            plt.imshow(sample.reshape(28, 28), cmap='Greys_r')

        if not os.path.exists('out/'):
            os.makedirs('out/')
        
        plt.savefig('out/{}.png'.format(str(cnt).zfill(3)), bbox_inches='tight')
        torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % ('./out', it))
        torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % ('./out', it))
        cnt += 1
        plt.close(fig)


Пример #43
0
                                warmup_for=5,
                                min_lr=args.learning_rate / args.epochs)

    trainer = Trainer(net, train_loader, val_loader, loss, optimizer, scheduler)
    if iscuda: trainer = trainer.cuda()

    # Training loop #
    max_val_error = 1e5
    for epoch in range(args.epochs):
        print(f"\n>> Starting epoch {epoch}...")
        trainer.train()

        val_error = trainer.validate()
        if val_error < max_val_error:
            print(f"\n>> Saving model (the lowest validation error)")
            torch.save({'net': args.net, 'state_dict': net.state_dict()},
                       osp.join(args.save_path, "min_val_error_checkpoint.pt"))
            max_val_error = val_error

        torch.save({'net': args.net, 'state_dict': net.state_dict()},
                   osp.join(args.save_path, "checkpoint_" + str(epoch) + ".pt"))
        '''
        if epoch == 30:
            print(f"\n>> Saving model (30th epoch)")
            torch.save({'net': args.net, 'state_dict': net.state_dict()},
                       osp.join(args.save_path, "checkpoint_30th.pt"))
        '''

    print(f"\n>> Saving model to {args.save_path}")
    torch.save({'net': args.net, 'state_dict': net.state_dict()},
               osp.join(args.save_path, "end_train_checkpoint.pt"))
        print('loss: %.7f' % avg_total_loss)
        writer.add_scalar('data/total_loss', avg_total_loss, epoch)
        avg_unsupervised_loss = unsupervised_loss.avg
        print('unsupervised loss: %.7f' % avg_unsupervised_loss)
        writer.add_scalar('data/unsupervised_loss', avg_unsupervised_loss,
                          epoch)
        avg_supervised_loss = supervised_loss.avg
        print('supervised loss: %.7f' % avg_supervised_loss)
        writer.add_scalar('data/supervised_loss', avg_supervised_loss, epoch)
        print('end of display \n')

    if (epoch % opt.save_latest_freq == 0):
        print('saving the latest model (epoch %d, total_epoch %d)' %
              (epoch, opt.niter))
        torch.save(
            model.state_dict(),
            os.path.join('.', opt.checkpoints_dir, opt.name,
                         'sound_localization_latest' + str(epoch) + '.pth'))

    if (epoch % opt.validation_freq == 0 and opt.validation_on):
        model.eval()
        opt.mode = 'val'
        print('Display validation results at (epoch %d, total_epoch %d)' %
              (epoch, opt.niter))
        val_err = evaluate(model, writer, epoch, dataloader_test, opt)
        print('end of display \n')
        model.train()
        opt.mode = 'train'
        #save the model that achieves the smallest validation error
        if val_err < best_err:
            best_err = val_err
            print(
Пример #45
0
def main():
    """Run the full pipeline: prepare data, set up a model directory,
    optionally train, then evaluate on the test split.

    Relies on module-level `args` (argparse namespace) and helpers defined
    elsewhere in this file: download_data, preprocess_data, get_dataloader,
    Net, train_epoch, test_epoch.
    """
    # prepare data
    if args.download:
        print("Downloading data . . .")
        download_data()
    if args.load_preproc:
        print("Loading preprocessed data . . .")
        data = torch.load('data/preproc_data')
    else:
        print("Preprocessing data . . .")
        data = preprocess_data()
        print("Saving preprocessed data . . .")
        torch.save(data, 'data/preproc_data')
    print("Making DataLoader . . .")
    train_loader, val_loader, test_loader = get_dataloader(data, args.batch_size)

    # setup model & dirs
    if args.load_model != '':
        print("Loading model . . .")
        # BUG FIX: the original branch was a bare `pass`, leaving `net`
        # undefined and crashing at net.train()/net.eval() below.  Models
        # are checkpointed with torch.save(net, path) (whole object), so
        # load symmetrically with torch.load.
        net = torch.load(args.load_model)
    else:
        print("Initializing model . . .")
        net = Net()
    model_root = 'data/models/'
    os.makedirs(model_root, exist_ok=True)
    # pick the first unused "<name>(<i>)" directory under model_root
    i = 0
    while True:
        model_name = args.name + "(%d)"%(i)
        model_dir = os.path.join(model_root, model_name)
        if os.path.exists(model_dir):
            i += 1
            continue
        os.makedirs(model_dir)
        break

    # train model
    if args.train:
        print("Training model . . .")
        net.train()
        optimizer = optim.Adam(net.parameters(), lr=args.lr)
        best_perf = None
        for epoch in range(1, args.epochs+1):
            loss = train_epoch(net, train_loader, optimizer)
            if epoch % args.print_every == 0:
                print("(Epoch %d) Training loss: %4f"%(epoch, loss))
            if epoch % args.validate_every == 0:
                loss, perf = test_epoch(net, val_loader)
                print("Validation loss: %4f, accuracy: %g"%(loss, perf))
                # keep the checkpoint with the best validation accuracy
                if best_perf is None or perf > best_perf:
                    best_perf = perf
                    fname = model_name+'_best_(epoch%d)'%(epoch)
                    # NOTE(review): saves the whole model object (not just
                    # state_dict) — matched by torch.load above.
                    torch.save(net, os.path.join(model_dir, fname))

    # test model
    print("Testing model . . .")
    net.eval()
    loss, perf = test_epoch(net, test_loader)
    print("Test loss: %4f, accuracy: %g"%(loss, perf))
Пример #46
0
                    marker='.')
        plt.scatter(num_epoch + 1,
                    one_epoch_validation_loss,
                    linewidths=0.5,
                    c='blue',
                    marker='.')
        plt.pause(0.00001)

        if num_epoch % FC_save_model_epochs_interval is 0:
            print('saving model...')
            model_save_path = FC_model_save_folder + '/epoch_' + str(
                num_epoch + 1)
            torch.save(
                {
                    'epoch': num_epoch + 1,
                    'FC_model_state_dict': FC_model.state_dict(),
                    'FC_optimizer_state_dict': FC_model_optimizer.state_dict(),
                    'loss': all_epochs_train_loss_hist_for_final_graph,
                }, model_save_path)

    plt.clf()
    shist = [h for h in all_epochs_train_loss_hist_for_final_graph]
    vhist = [h for h in all_epochs_validation_loss_hist_for_final_graph]
    plt.title('training_loss')
    plt.xlabel("Training Epochs")
    plt.ylabel("Error")
    plt.plot(range(1, total_epochs + 1), shist, label="train")
    plt.plot(range(1, total_epochs + 1), vhist, label="validation")
    # plt.ylim((0,1.))
    # plt.xticks(np.arange(1, num_epochs+1, 1.0))
    plt.legend()
Пример #47
0
def main():
    """Train or evaluate a BERT-based sequence-tagging model with document
    attention.

    Mode is selected via --mode {train,test} (a 'predict' branch is stubbed
    with TODO).  Relies on names defined elsewhere in this file: device,
    metrics, read_tagset, load_model, DocumentAttentionDataset, get_batches,
    the dataset config dicts (GENIA, LITBANK, ...), AutoTokenizer, AdamW and
    DataLoader.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('--mode', help='{train,test}', required=True)
    parser.add_argument('--output_dir', help='File to write model checkpoints and log files to', required=False)
    parser.add_argument('--dataset', help='{genia, genia_full, litbank, litbank_full, ontonotes}', required=True)
    parser.add_argument('--metric', help='{accuracy,fscore,span_fscore}', default='span_fscore', required=False)

    parser.add_argument("--batch_size", default=16, type=int, help="The batch size on GPU.")
    parser.add_argument("--gradient_accumulation_steps", default=1, type=int,
                        help="Number of updates steps to accumulate before performing a backward pass.")

    parser.add_argument('--freeze_bert', help='Whether to freeze BERT weights', action='store_true')
    parser.add_argument('--self_attention', help='Whether sequences should be allowed to attention to themsleves', action='store_true')
    parser.add_argument('--vanilla', help='Whether to add LSTM encoders to model', action='store_true')
    parser.add_argument("--lstm_dim", default=128, type=int, help="LSTM hidden dimension size.")

    parser.add_argument('--pretrained_dir', help='Directory to read custom fine-tuned (BERT) base model weights from', required=False)
    parser.add_argument('--checkpoint_file', help='File to read checkpointed model weights from (to resume training or test)', required=False)
    parser.add_argument('--model_type', help='Pretrained BERT configuration checkpoint, e.g. bert-base-cased', required=True)

    parser.add_argument('--lr', type=float, default=2e-5, required=False)
    parser.add_argument('--num_epochs', type=int, default=100, required=False)
    parser.add_argument('--k', type=int, help='How many context sequences to attend over', default=10, required=False)

    args = parser.parse_args()

    # NOTE(review): --output_dir is required=False but is used unconditionally
    # here and below — confirm it should be required=True.
    os.makedirs(args.output_dir, exist_ok=True)

    # log to both a mode-specific file and stderr
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[
            logging.FileHandler(os.path.join(args.output_dir,
                "training.log" if args.mode == 'train' else 'test.log'), mode='w+'),
            logging.StreamHandler()
        ]
    )

    logging.info("Running on: {}".format(device))
    logging.getLogger("transformers.configuration_utils").setLevel(logging.ERROR)
    logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

    # resolve the dataset config dict
    if args.dataset == 'genia':
        dataset = GENIA
    elif args.dataset == 'genia_full':
        dataset = GENIA_FULL
    elif args.dataset == 'litbank':
        dataset = LITBANK
    elif args.dataset == 'litbank_full':
        dataset = LITBANK_FULL
    elif args.dataset == 'ontonotes':
        dataset = ONTONOTES
    else:
        raise ValueError("Invalid dataset")

    # custom 'google/...' checkpoints reuse the stock bert-base-uncased tokenizer
    if 'google' in args.model_type:
        tokenizer_type = 'bert-base-uncased'
    else:
        tokenizer_type = args.model_type

    do_lower_case = 'uncased' in tokenizer_type
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_type, do_lower_case=do_lower_case, do_basic_tokenize=False)

    metric = None
    if args.metric.lower() == "fscore":
        metric = metrics.check_f1_two_lists
    elif args.metric.lower() == "accuracy":
        metric = metrics.get_accuracy
    elif args.metric.lower() == "span_fscore":
        metric = metrics.check_span_f1_two_lists

    tagset = read_tagset(dataset['tagset'])

    model = load_model(args.model_type, args.pretrained_dir, args.checkpoint_file, len(tagset), args.freeze_bert, args.lstm_dim, args.vanilla)
    model.to(device)

    mode = args.mode

    if mode == 'train':

        # no weight decay on biases and LayerNorm weights (standard BERT recipe)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": 0.01,
            },
            {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-6)
        loss_function = nn.CrossEntropyLoss(ignore_index=-100)

        train_documents = DocumentAttentionDataset(dataset['train_dir'], tokenizer, args.k, args.self_attention)
        train_data_loader = DataLoader(train_documents, batch_size=1, shuffle=True, num_workers=1, collate_fn=lambda x: x)

        dev_documents = DocumentAttentionDataset(dataset['dev_dir'], tokenizer, args.k, args.self_attention)
        dev_data_loader = DataLoader(dev_documents, batch_size=1, shuffle=True, num_workers=1, collate_fn=lambda x: x)

        global_steps = 0 # number of backward passes
        steps = 0 # number of forward passes (not divided by accummulation)
        total_loss = 0
        best_val = -1
        best_idx = 0
        patience = 3

        for epoch in range(args.num_epochs):

            logging.info("*** TRAINING ****\n")
            logging.info("Epoch: {}".format(epoch))
            total_loss = 0

            for i, document_batch in enumerate(train_data_loader):

                document = document_batch[0]
                document_loss = 0

                batches = get_batches(document, args.batch_size, tagset, args.k)

                context = model.get_static_context_representation(batches, args.batch_size)

                num_batches = len(batches['inputs'])
                num_labeled = sum([len(batches['inputs'][b]) for b in range(num_batches)])
                num_attn = sum([np.prod(batches['attn_sentence_idx'][b].shape) for b in range(num_batches)])

                logging.info("Document {}/{}: (len={}, attn={}, batches={})".format(i+1, len(train_documents), num_labeled, num_attn, num_batches))

                model.train()

                for b in range(num_batches):
                    inputs = batches['inputs'][b].to(device)
                    transforms = batches['transforms'][b].to(device)
                    masks = batches['masks'][b].to(device)
                    labels = batches['labels'][b].to(device)

                    attn_sentence_idx = batches['attn_sentence_idx'][b].to(device)
                    attn_word_idx = batches['attn_word_idx'][b].to(device)
                    attn_dists = batches['attn_dists'][b].to(device)
                    attn_mask = batches['attn_masks'][b].to(device)

                    logits = model.forward(inputs=inputs, masks=masks, transforms=transforms, full_context=context,
                                            attn_sentence_idx=attn_sentence_idx, attn_word_idx=attn_word_idx,
                                            attn_dists=attn_dists, attn_mask=attn_mask)

                    loss = loss_function(logits.view(-1, model.num_labels), labels.view(-1))
                    # scale so the accumulated gradient averages over the window
                    loss /= args.gradient_accumulation_steps
                    document_loss += loss.item()
                    steps += 1

                    # BUG FIX: backward() must run on *every* mini-batch so its
                    # gradients accumulate in .grad; the original called it only
                    # on the update step, silently discarding the gradients of
                    # all other batches whenever gradient_accumulation_steps > 1.
                    loss.backward()

                    if steps % args.gradient_accumulation_steps == 0:
                        optimizer.step()
                        model.zero_grad()
                        global_steps += 1

                        if global_steps % 100 == 0:
                            logging.info("Global step: {}".format(global_steps))

                logging.info("Loss: {}".format(document_loss / num_labeled))
                # BUG FIX: the original also added loss.item() per update step,
                # double-counting those contributions in the epoch total.
                total_loss += document_loss

            logging.info("Epoch total loss: {}".format(total_loss))
            logging.info("*** EVALUATING ***\n")
            value = model.evaluate(dev_data_loader, args.batch_size, metric, tagset, args.k)
            logging.info("DEV {}: {}".format(args.metric, value))

            if value > best_val:
                best_idx = epoch
                best_val = value

                model_dir = os.path.join(args.output_dir, "checkpoint-{}.bin".format(best_idx))
                logging.info("Saving model @ {}".format(model_dir))
                torch.save(model.state_dict(), model_dir)

            elif (epoch - best_idx) > patience:
                logging.info("Aborting training after {} epochs of patience".format(patience))
                logging.info("Best model @ epoch {} with {}={}".format(best_idx, args.metric, best_val))
                del model # allows torch to free gpu memory before loading best model from disk
                break

        logging.info("*** TESTING ***\n")

        best_model_dir = os.path.join(args.output_dir, "checkpoint-{}.bin".format(best_idx))
        logging.info("Loading best model from {}".format(best_model_dir))
        best_model = load_model(args.model_type, None, best_model_dir, len(tagset), args.freeze_bert, args.lstm_dim, args.vanilla)
        best_model.to(device)

        test_documents = DocumentAttentionDataset(dataset['test_dir'], tokenizer, args.k, args.self_attention)
        test_data_loader = DataLoader(test_documents, batch_size=1, shuffle=True, num_workers=1, collate_fn=lambda x: x)

        value = best_model.evaluate(test_data_loader, args.batch_size, metric, tagset, args.k)
        logging.info("TEST {}: {}".format(args.metric, value))

        return


    elif mode == 'predict':
        prediction_file = os.path.join(args.output_dir, "predictions.txt")

        # TODO
        pass


    elif mode == 'test':
        logging.info("*** TESTING ***\n")

        test_documents = DocumentAttentionDataset(dataset['test_dir'], tokenizer, args.k, args.self_attention)
        test_data_loader = DataLoader(test_documents, batch_size=1, shuffle=True, num_workers=1, collate_fn=lambda x: x)

        value = model.evaluate(test_data_loader, args.batch_size, metric, tagset, args.k)
        logging.info("TEST {}: {}".format(args.metric, value))
Пример #48
0
def train_net(model, data_path, pre_model, save_dir, batch_size, lr, log_after, cuda, device, one_hot=False):
    """Train `model` on data loaded from `data_path`, checkpointing into `save_dir`.

    :param model: network to train (moved to GPU when `cuda` is True)
    :param data_path: path handed to get_dataloaders()
    :param pre_model: checkpoint number (string) to resume from, or falsy to start fresh
    :param save_dir: directory for model-<epoch>.pt checkpoints (only the 5 newest kept)
    :param batch_size: mini-batch size for the dataloaders
    :param lr: initial learning rate, decayed exponentially to 3e-7 over 500 epochs
    :param log_after: log a progress line every `log_after` batches
    :param cuda: run on GPU when True
    :param device: CUDA device index
    :param one_hot: whether labels are one-hot (changes the accuracy computation)
    """
    if not pre_model:
        print(model)
    writer = SummaryWriter()
    if cuda:
        print('GPU')
        model.cuda(device=device)
        print('log: training started on device: {}'.format(device))
    # define loss and optimizer
    optimizer = Adam(model.parameters(), lr=lr)
    lr_final = 0.0000003
    num_epochs = 500
    # per-epoch multiplicative decay chosen so lr reaches lr_final after num_epochs
    LR_decay = (lr_final/lr)**(1./num_epochs)
    scheduler = lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=LR_decay)
    criterion = nn.CrossEntropyLoss()
    train_loader, val_dataloader, test_loader = get_dataloaders(path_to_nparray=data_path,
                                                                batch_size=batch_size,
                                                                normalize=True)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)

    i = 1
    m_loss, m_accuracy = [], []
    if pre_model:
        model.load_state_dict(torch.load(os.path.join(save_dir, "model-"+pre_model+'.pt')))
        print('log: resumed model {} successfully!'.format(pre_model))
        print(model)
        # resume the epoch counter from the checkpoint number
        model_number = int(pre_model)
        i = i + model_number - 1
    else:
        print('log: starting anew...')

    # NOTE(review): `i` is incremented before the epoch body runs, so a fresh
    # run labels its first epoch 2 — kept as-is to preserve the original
    # checkpoint numbering scheme.
    while i < num_epochs:
        i += 1
        net_loss = []
        # new checkpoint path for this epoch
        save_path = os.path.join(save_dir, 'model-{}.pt'.format(i))
        # keep only the five most recent checkpoints
        del_this = os.path.join(save_dir, 'model-{}.pt'.format(i-5))
        if os.path.exists(del_this):
            os.remove(del_this)
            print('log: removed {}'.format(del_this))

        if i > 1 and not os.path.exists(save_path):
            torch.save(model.state_dict(), save_path)
            print('log: saved {}'.format(save_path))

        correct_count, total_count = 0, 0
        for idx, data in enumerate(train_loader):
            model.train()  # re-assert train mode each batch, just in case
            test_x, label = data
            if cuda:
                test_x = test_x.cuda(device=device)
                label = label.cuda(device=device)
            # forward
            out_x, pred = model(test_x)
            loss = criterion(out_x, label)
            net_loss.append(loss.item())

            # get accuracy metric
            if one_hot:
                batch_correct = (torch.argmax(label, dim=1).eq(pred.long())).double().sum().item()
            else:
                batch_correct = (label.eq(pred.long())).double().sum().item()
            correct_count += batch_correct
            # BUG FIX: np.float was deprecated and removed in NumPy >= 1.24
            # (raises AttributeError); the builtin float is equivalent here.
            total_count += float(pred.size(0))
            if idx % log_after == 0 and idx > 0:
                print('{}. ({}/{}) image size = {}, loss = {}: accuracy = {}/{}'.format(i,
                                                                                        idx,
                                                                                        len(train_loader),
                                                                                        out_x.size(),
                                                                                        loss.item(),
                                                                                        batch_correct,
                                                                                        pred.size(0)))
            # three steps for backprop
            model.zero_grad()
            loss.backward()
            # gradient clipping between loss backward and optimizer step
            clip_grad_norm_(model.parameters(), 0.05)
            optimizer.step()
        # decay the learning rate once per epoch
        scheduler.step()
        mean_accuracy = correct_count / total_count * 100
        mean_loss = np.asarray(net_loss).mean()
        m_loss.append((i, mean_loss))
        m_accuracy.append((i, mean_accuracy))

        writer.add_scalar(tag='train loss', scalar_value=mean_loss, global_step=i)
        writer.add_scalar(tag='train over_all accuracy', scalar_value=mean_accuracy, global_step=i)

        print('####################################')
        print('epoch {} -> total loss = {:.5f}, total accuracy = {:.5f}% (lr: {})'.format(i,
                                                                                          mean_loss,
                                                                                          mean_accuracy,
                                                                                          optimizer.param_groups[0]['lr']))
        print('####################################')

        # validate model after each epoch
        with torch.no_grad():
            eval_net(model=model, writer=writer, criterion=criterion,
                     val_loader=val_dataloader, denominator=batch_size,
                     cuda=cuda, device=device, global_step=i, one_hot=one_hot)
Пример #49
0
                ###########################
                netG.zero_grad()
                label.fill_(
                    real_label)  # fake labels are real for generator cost
                output = netD(fake)
                errG = criterion(output, label)
                errG.backward()
                D_G_z2 = output.mean().item()
                optimizerG.step()

                print(
                    '[%d/%d][%d/%d] Loss_D: %.4f Loss_G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
                    % (epoch, opt.niter, i, len(dataloader), errD.item(),
                       errG.item(), D_x, D_G_z1, D_G_z2))
                if i % 1 == 0:
                    vutils.save_image(real_cpu,
                                      '%s/real_samples.png' % opt.outDict,
                                      normalize=True)
                    fake = netG(fixed_noise)
                    vutils.save_image(fake.detach(),
                                      '%s/fake_samples_epoch_%03d_%06d.png' %
                                      (opt.outDict, epoch, i),
                                      normalize=True)
    # ---DCGAN Training End---

    # do checkpointing
    torch.save(netG.state_dict(),
               '%s/netG_epoch_%d.pth' % (opt.outDict, epoch))
    torch.save(netD.state_dict(),
               '%s/netD_epoch_%d.pth' % (opt.outDict, epoch))
Пример #50
0
 def snapshot(self, model, name):
     """Persist `model`'s state_dict as '<name>.pth' inside self.output_dir
     and report the destination through self.logger."""
     target = os.path.join(self.output_dir, f"{name}.pth")
     torch.save(model.state_dict(), target)
     self.logger('Wrote snapshot to: {:s}'.format(target))
Пример #51
0
def train_model(model,
                train_dataloader,
                test_dataloader,
                criterion,
                optimizer,
                scheduler,
                num_epochs=25,
                inference=False):
    """Train and/or evaluate CRNet.

    Args:
        model: network returning ``(regression_out, classification_out)``.
        train_dataloader: loader yielding dicts with 'image', 'score', 'class'.
        test_dataloader: loader yielding dicts with 'image', 'score', 'class',
            'filename'.
        criterion: joint loss ``criterion(cls_out, classes, reg_out, scores)``.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped once per epoch.
        num_epochs: number of training epochs.
        inference: if True, skip training and load ./model/crnet.pth instead.

    Side effects: saves trained weights to ./model/crnet.pth and writes the
    per-sample predictions to ./output.xlsx.
    """
    model = model.float()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
    model = model.to(device)

    if not inference:
        print('Start training CRNet...')
        for epoch in range(num_epochs):
            model.train()

            running_loss = 0.0
            for i, data in enumerate(train_dataloader, 0):
                inputs, scores, classes = data['image'], data['score'], data[
                    'class']

                inputs = inputs.to(device)
                scores = scores.to(device)
                classes = classes.to(device)

                optimizer.zero_grad()

                inputs = inputs.float()
                # view(-1, 1) instead of view(cfg['batch_size'], 1): the last
                # batch of an epoch may be smaller than the configured size.
                scores = scores.float().view(-1, 1)

                reg_out, cls_out = model(inputs)
                loss = criterion(cls_out, classes, reg_out, scores)
                loss.backward()
                optimizer.step()

                # print statistics
                running_loss += loss.item()
                if i % 10 == 9:  # print every 10 mini-batches
                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 10))
                    running_loss = 0.0

            # step the scheduler after the epoch's optimizer updates
            # (required ordering since PyTorch 1.1).
            scheduler.step()

        print('Finished training CRNet...\n')
        print('Saving trained model...')
        model_path_dir = './model'
        mkdirs_if_not_exist(model_path_dir)
        torch.save(model.state_dict(), os.path.join(model_path_dir,
                                                    'crnet.pth'))
        print('CRNet has been saved successfully~')

    else:
        print('Loading pre-trained model...')
        # map_location makes a GPU-trained checkpoint loadable on CPU-only hosts
        model.load_state_dict(
            torch.load(os.path.join('./model/crnet.pth'),
                       map_location=device))

    model.eval()

    print('Start testing CRNet...')
    predicted_labels = []
    gt_labels = []
    filenames = []
    # no gradients needed at test time; saves memory and compute
    with torch.no_grad():
        for data in test_dataloader:
            images, scores, classes, filename = data['image'], data['score'], data[
                'class'], data['filename']
            images = images.to(device)

            reg_out, cls_out = model(images)

            predicted_labels += reg_out.to("cpu").detach().numpy().tolist()
            gt_labels += scores.to("cpu").detach().numpy().tolist()
            filenames += filename

    from sklearn.metrics import mean_absolute_error, mean_squared_error

    mae_lr = round(
        mean_absolute_error(np.array(gt_labels),
                            np.array(predicted_labels).ravel()), 4)
    # np.sqrt replaces the removed np.math alias
    rmse_lr = round(
        float(np.sqrt(
            mean_squared_error(np.array(gt_labels),
                               np.array(predicted_labels).ravel()))), 4)
    pc = round(
        np.corrcoef(np.array(gt_labels),
                    np.array(predicted_labels).ravel())[0, 1], 4)

    print(
        '===============The Mean Absolute Error of CRNet is {0}===================='
        .format(mae_lr))
    print(
        '===============The Root Mean Square Error of CRNet is {0}===================='
        .format(rmse_lr))
    print(
        '===============The Pearson Correlation of CRNet is {0}===================='
        .format(pc))

    col = ['filename', 'gt', 'pred']
    df = pd.DataFrame([[filenames[i], gt_labels[i], predicted_labels[i][0]]
                       for i in range(len(gt_labels))],
                      columns=col)
    df.to_excel("./output.xlsx", sheet_name='Output', index=False)
    print('Output Excel has been generated~')
def train(train_data, val_data, model, args):
    '''
        Train the model
        Use val_data to do early stopping

        Args:
            train_data: training episodes, consumed through ParallelSampler
            val_data: validation episodes used for early stopping
            model (dict): {'ebd': embedding, 'clf': classifier}
            args: namespace providing lr, patience, train_epochs,
                  train_episodes, val_episodes, notqdm, save, ...

        Side effects:
            Saves the per-epoch best checkpoints under tmp-runs/<timestamp>/
            and, if args.save, a final copy under saved-runs/<timestamp>/.
    '''
    # creating a tmp directory to save the models
    out_dir = os.path.abspath(os.path.join(
                                  os.path.curdir,
                                  "tmp-runs",
                                  str(int(time.time() * 1e7))))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # Write results
    # write_acc_tr = 'acc_base.csv'
    # init_csv(write_acc_tr)
    # write_acc_val = 'val_acc_base.csv'
    # init_csv(write_acc_val)

    best_acc = 0
    sub_cycle = 0  # epochs since the last validation improvement
    best_path = None

    # grad_param generates the learnable parameters from the classifier
    params_to_opt = grad_param(model, ['ebd', 'clf'])
    opt = torch.optim.Adam(params_to_opt, lr=args.lr)

    # decay LR by 10x when validation accuracy ('max' mode) plateaus
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            opt, 'max', patience=args.patience//2, factor=0.1, verbose=True)

    print("{}, Start training".format(
        datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S')), flush=True)

    train_gen = ParallelSampler(train_data, args, args.train_episodes)
    train_gen_val = ParallelSampler(train_data, args, args.val_episodes)
    val_gen = ParallelSampler(val_data, args, args.val_episodes)

    for ep in range(args.train_epochs):
        sampled_tasks = train_gen.get_epoch()

        # gradient stats per component; presumably filled in by train_one,
        # which receives this dict — confirm against train_one's definition
        grad = {'clf': [], 'ebd': []}

        if not args.notqdm:
            sampled_tasks = tqdm(sampled_tasks, total=train_gen.num_episodes,
                    ncols=80, leave=False, desc=colored('Training on train',
                        'yellow'))

        for task in sampled_tasks:
            if task is None:
                break
            train_one(task, model, opt, args, grad)

        # report accuracy on held-out training episodes every 10 epochs
        if ep % 10 == 0:
            acc, std = test(train_data, model, args, args.val_episodes, False,
                            train_gen_val.get_epoch())
            print("{}, {:s} {:2d}, {:s} {:s}{:>7.4f} ± {:>6.4f} ".format(
                datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S'),
                "ep", ep,
                colored("train", "red"),
                colored("acc:", "blue"), acc, std,
                ), flush=True)

            # write_csv(write_acc_tr, acc, std, ep)

        # Evaluate validation accuracy
        cur_acc, cur_std = test(val_data, model, args, args.val_episodes, False,
                                val_gen.get_epoch())
        print(("{}, {:s} {:2d}, {:s} {:s}{:>7.4f} ± {:>6.4f}, "
               "{:s} {:s}{:>7.4f}, {:s}{:>7.4f}").format(
               datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S'),
               "ep", ep,
               colored("val  ", "cyan"),
               colored("acc:", "blue"), cur_acc, cur_std,
               colored("train stats", "cyan"),
               colored("ebd_grad:", "blue"), np.mean(np.array(grad['ebd'])),
               colored("clf_grad:", "blue"), np.mean(np.array(grad['clf'])),
               ), flush=True)

        # if ep % 10 == 0: write_csv(write_acc_val, cur_acc, cur_std, ep)

        # Update the current best model if val acc is better
        if cur_acc > best_acc:
            best_acc = cur_acc
            best_path = os.path.join(out_dir, str(ep))

            # save current model
            print("{}, Save cur best model to {}".format(
                datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S'),
                best_path))

            torch.save(model['ebd'].state_dict(), best_path + '.ebd')
            torch.save(model['clf'].state_dict(), best_path + '.clf')

            sub_cycle = 0
        else:
            sub_cycle += 1

        # Break if the val acc hasn't improved in the past patience epochs
        if sub_cycle == args.patience:
            break

    print("{}, End of training. Restore the best weights".format(
            datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S')),
            flush=True)

    # restore the best saved model
    # NOTE(review): if no epoch ever beats best_acc=0, best_path is still None
    # here and these loads would fail — confirm cur_acc is always positive
    model['ebd'].load_state_dict(torch.load(best_path + '.ebd'))
    model['clf'].load_state_dict(torch.load(best_path + '.clf'))

    if args.save:
        # save the current model
        out_dir = os.path.abspath(os.path.join(
                                      os.path.curdir,
                                      "saved-runs",
                                      str(int(time.time() * 1e7))))
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        best_path = os.path.join(out_dir, 'best')

        print("{}, Save best model to {}".format(
            datetime.datetime.now().strftime('%02y/%02m/%02d %H:%M:%S'),
            best_path), flush=True)

        torch.save(model['ebd'].state_dict(), best_path + '.ebd')
        torch.save(model['clf'].state_dict(), best_path + '.clf')

        # dump the full run configuration next to the weights
        with open(best_path + '_args.txt', 'w') as f:
            for attr, value in sorted(args.__dict__.items()):
                f.write("{}={}\n".format(attr, value))

    return
Пример #53
0
def get_gqn_dataset_with_name(name,
                              train_batch_size,
                              eval_batch_size,
                              kwargs,
                              allow_empty_context=False,
                              target_sample_method='remaining',
                              max_cond_size=20,
                              max_target_size=20,
                              num_data=None
                              ):
    """Build train/val/test dataloaders for a GQN scene dataset.

    Args:
        name: dataset name under data/gqn-datasets.
        train_batch_size: batch size for the train and val loaders.
        eval_batch_size: batch size for the test loader.
        kwargs: extra keyword args forwarded to every DataLoader.
        allow_empty_context: whether scenes may have an empty context set.
        target_sample_method: target sampling strategy for the train split.
        max_cond_size: cap on the context set size.
        max_target_size: cap on the target set size.
        num_data: optional cap on the number of training scenes.

    Returns:
        (train_loader, val_loader, test_loader, info), where info describes
        the dataset geometry and sampling options.

    The train/val split (last 20000 permuted scenes go to val) is cached
    under cache/gqn-datasets/<name>/ so repeated runs reuse the same
    permutation.
    """
    # init dataset (train / val)
    train_dataset = SceneDataset(
            root='data/gqn-datasets',
            name=name,
            train=True,
            img_size=64,
            allow_empty_context=allow_empty_context,
            target_sample_method=target_sample_method,
            max_cond_size=max_cond_size,
            max_target_size=max_target_size,
            )
    val_dataset = SceneDataset(
            root='data/gqn-datasets',
            name=name,
            train=True,
            img_size=64,
            allow_empty_context=allow_empty_context,
            target_sample_method='remaining', #'full',
            max_cond_size=max_cond_size,
            max_target_size=max_target_size,
            )
    test_dataset = SceneDataset(
            root='data/gqn-datasets',
            name=name,
            train=False,
            img_size=64,
            allow_empty_context=allow_empty_context,
            target_sample_method='remaining', #'full',
            max_cond_size=max_cond_size,
            max_target_size=max_target_size,
            )

    # set num data
    if num_data is not None:
        num_data = min(len(train_dataset.samples), num_data)
        suffix = num_data
        train_dataset.samples = train_dataset.samples[:num_data]
        val_dataset.samples   = val_dataset.samples[:num_data]
    else:
        num_data = len(train_dataset.samples)
        suffix = None

    # split train and val
    # os.makedirs replaces the previous `os.system('mkdir -p ...')` shell-out:
    # portable, no subprocess, and exist_ok avoids a pre-check race.
    os.makedirs('cache/gqn-datasets/{}'.format(name), exist_ok=True)
    split_filename = os.path.join(
            'cache/gqn-datasets/{}'.format(name),
            'split-{}.pt'.format(suffix) if suffix is not None else 'split.pt'
            )
    if os.path.exists(split_filename):
        indices = torch.load(split_filename)
    else:
        indices = torch.from_numpy(np.random.permutation(num_data))
        # pass the path directly: the previous `open(split_filename, 'wb')`
        # leaked an unclosed file handle
        torch.save(indices, split_filename)
    train_dataset.samples = [train_dataset.samples[index] for index in indices[:num_data-20000]]
    val_dataset.samples   = [val_dataset.samples[index]   for index in indices[num_data-20000:]]

    # init dataloader
    train_loader = torch.utils.data.DataLoader(train_dataset,
            batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn, **kwargs)
    val_loader = torch.utils.data.DataLoader(val_dataset,
            batch_size=train_batch_size, shuffle=False, collate_fn=collate_fn, **kwargs)
    test_loader = torch.utils.data.DataLoader(test_dataset,
            batch_size=eval_batch_size, shuffle=False, collate_fn=collate_fn, **kwargs)

    # init info
    info = {}
    info['nviews'] = train_dataset.num_views
    info['max_cond_size'] = train_dataset.max_cond_size
    info['max_target_size'] = train_dataset.max_target_size
    info['allow_empty_context'] = train_dataset.allow_empty_context
    info['target_sample_method'] = train_dataset.target_sample_method

    return train_loader, val_loader, test_loader, info
Пример #54
0
def trainEpoches(encoder,
                 decoder,
                 criterion,
                 print_every=10,
                 learning_rate=0.001,
                 l2=0.0001,
                 epoch=0):
    """Run one epoch's worth of minibatches through an encoder/decoder pair.

    Args:
        encoder: encoder network, optimized with Adam.
        decoder: decoder network, optimized with Adam.
        criterion: loss forwarded to the global ``train`` step.
        print_every: report the running average loss every N minibatches.
        learning_rate: Adam learning rate for both optimizers.
        l2: weight decay (L2 regularization) for both optimizers.
        epoch: current epoch index. Added as a parameter because the original
            code read an undefined name here (NameError at the checkpoint
            step); default 0 keeps the signature backward compatible.

    Side effects: saves the per-batch losses to loss.npy and, every 10th
    epoch, pickles the full encoder/decoder modules under ./model/.
    """
    start = time.time()
    out_losses = []
    print_loss_total = 0  # Reset every print_every

    encoder_optimizer = optim.Adam(encoder.parameters(),
                                   lr=learning_rate,
                                   weight_decay=l2)
    decoder_optimizer = optim.Adam(decoder.parameters(),
                                   lr=learning_rate,
                                   weight_decay=l2)

    mini_batches = get_minibatches(train_datasets, BATCH)
    batches_size = len(train_datasets[0]) // BATCH
    for i, data in enumerate(mini_batches):
        # stop after one full epoch's worth of minibatches
        if i == batches_size:
            break
        sentences, tags = data
        input_tensor, input_length = padding_sequence(sentences,
                                                      pad_token=EMBEDDING_SIZE)
        target_tensor, target_length = padding_sequence(tags,
                                                        pad_token=TAG_SIZE)
        # torch.tensor replaces the deprecated Variable/torch.cuda.LongTensor
        # combination and places the data on the right device directly.
        input_tensor = torch.tensor(input_tensor, dtype=torch.long,
                                    device=device)
        target_tensor = torch.tensor(target_tensor, dtype=torch.long,
                                     device=device)

        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer,
                     criterion)
        out_losses.append(loss)
        print_loss_total += loss

        if i % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(' (%d %d%%) %.4f' %
                  (i, float(i) / batches_size * 100, print_loss_avg))

    np.save("loss", out_losses)
    if epoch % 10 == 0:
        # NOTE: saving the whole module (not just state_dict) is kept for
        # checkpoint-format compatibility, although state_dict is preferred.
        model_name = "./model/model_encoder_epoch" + str(epoch) + ".pkl"
        torch.save(encoder, model_name)
        model_name = "./model/model_decoder_epoch" + str(epoch) + ".pkl"
        torch.save(decoder, model_name)
        print("Model has been saved")
Пример #55
0
    def cache(self, name, cache, url=None):
        """Load word vectors *name* from directory *cache*, building a .pt cache.

        If '<cache>/<name>.pt' exists it is loaded directly. Otherwise the raw
        vector file is located (downloaded and unpacked from *url* if given),
        parsed line by line, and the resulting (itos, stoi, vectors, dim)
        tuple is saved to the .pt file for fast reuse.

        Sets: self.itos, self.stoi, self.vectors, self.dim.

        Raises:
            RuntimeError: if no vector file can be found, or if rows have
                inconsistent dimensionality.
        """
        path = os.path.join(cache, name)
        path_pt = path + '.pt'

        if not os.path.isfile(path_pt):
            if not os.path.isfile(path) and url:
                logger.info('Downloading vectors from {}'.format(url))
                if not os.path.exists(cache):
                    os.makedirs(cache)
                dest = os.path.join(cache, os.path.basename(url))
                if not os.path.isfile(dest):
                    with tqdm(unit='B', unit_scale=True, miniters=1, desc=dest) as t:
                        urlretrieve(url, dest, reporthook=reporthook(t))
                logger.info('Extracting vectors into {}'.format(cache))
                ext = os.path.splitext(dest)[1][1:]
                if ext == 'zip':
                    with zipfile.ZipFile(dest, "r") as zf:
                        zf.extractall(cache)
                elif ext == 'gz':
                    # SECURITY: extractall on an untrusted archive can write
                    # outside `cache` via '../' member names (path traversal);
                    # only pass trusted URLs here.
                    with tarfile.open(dest, 'r:gz') as tar:
                        tar.extractall(path=cache)
            if not os.path.isfile(path):
                raise RuntimeError('no vectors found at {}'.format(path))

            # str call is necessary for Python 2/3 compatibility, since
            # argument must be Python 2 str (Python 3 bytes) or
            # Python 3 str (Python 2 unicode)
            itos, vectors, dim = [], array.array(str('d')), None

            # Try to read the whole file with utf-8 encoding.
            binary_lines = False
            try:
                with io.open(path, encoding="utf8") as f:
                    lines = [line for line in f]
            # If there are malformed lines, read in binary mode
            # and manually decode each word from utf-8.
            # (was a bare `except:`, which also swallowed KeyboardInterrupt)
            except UnicodeDecodeError:
                logger.warning("Could not read {} as UTF8 file, "
                               "reading file as bytes and skipping "
                               "words with malformed UTF8.".format(path))
                with open(path, 'rb') as f:
                    lines = [line for line in f]
                binary_lines = True

            logger.info("Loading vectors from {}".format(path))
            for line in tqdm(lines, total=len(lines)):
                # Explicitly splitting on " " is important, so we don't
                # get rid of Unicode non-breaking spaces in the vectors.
                entries = line.rstrip().split(" ")
                word, entries = entries[0], entries[1:]
                if dim is None and len(entries) > 1:
                    dim = len(entries)
                elif len(entries) == 1:
                    logger.warning("Skipping token {} with 1-dimensional "
                                   "vector {}; likely a header".format(word, entries))
                    continue
                elif dim != len(entries):
                    raise RuntimeError(
                        "Vector for token {} has {} dimensions, but previously "
                        "read vectors have {} dimensions. All vectors must have "
                        "the same number of dimensions.".format(word, len(entries), dim))

                if binary_lines:
                    try:
                        if isinstance(word, six.binary_type):
                            word = word.decode('utf-8')
                    # narrowed from a bare `except:`; decode failures are the
                    # only expected error here
                    except UnicodeDecodeError:
                        logger.info("Skipping non-UTF8 token {}".format(repr(word)))
                        continue
                vectors.extend(float(x) for x in entries)
                itos.append(word)

            self.itos = itos
            self.stoi = {word: i for i, word in enumerate(itos)}
            self.vectors = torch.Tensor(vectors).view(-1, dim)
            self.dim = dim
            logger.info('Saving vectors to {}'.format(path_pt))
            torch.save((self.itos, self.stoi, self.vectors, self.dim), path_pt)
        else:
            logger.info('Loading vectors from {}'.format(path_pt))
            self.itos, self.stoi, self.vectors, self.dim = torch.load(path_pt)
Пример #56
0
def saveModel(FOLD, model_dict):
    """Persist every model in *model_dict* under ./new_trained_models/fold<FOLD>/.

    Each entry is saved as its state dict, using the dict key as the
    file name. The target directory is created on demand.
    """
    dirs = f'./new_trained_models/fold{FOLD}/'
    os.makedirs(dirs, exist_ok=True)
    for name, net in model_dict.items():
        torch.save(net.state_dict(), f'{dirs}/{name}')
def main():
    """BERT pre-training entry point.

    Loads a YAML config, seeds all RNGs, prepares (possibly distributed)
    training, then loops over shuffled shard files of pre-training data.
    Checkpoints every ``args.num_steps_per_checkpoint`` optimizer steps and
    returns ``args`` once ``args.max_steps`` optimizer steps are reached.
    """
    config_yaml, local_rank = parse_my_arguments()
    args = args_from_yaml(config_yaml)
    args.local_rank = local_rank

    # args = parse_arguments()
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    device, args = setup_training(args)

    # Prepare optimizer
    model, optimizer, checkpoint, global_step = prepare_model_and_optimizer(args, device)

    if is_main_process():
        print("SEED {}".format(args.seed))

    if args.do_train:
        if is_main_process():
            logger.info("***** Running training *****")
            # logger.info("  Num examples = %d", len(train_data))
            logger.info("  Batch size = %d", args.train_batch_size)
            print("  LR = ", args.learning_rate)
            print("Training. . .")

        model.train()
        most_recent_ckpts_paths = []
        average_loss = 0.0  # averaged loss every args.log_freq steps
        epoch = 0
        training_steps = 0

        # single background worker pre-loads the next data shard while the
        # current one is being trained on
        pool = ProcessPoolExecutor(1)

        # Note: We loop infinitely over epochs, termination is handled via iteration count
        while True:
            if not args.resume_from_checkpoint or epoch > 0 or args.phase2:
                files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
                         os.path.isfile(os.path.join(args.input_dir, f)) and 'training' in f]
                files.sort()
                num_files = len(files)
                random.shuffle(files)
                f_start_id = 0
            else:
                # resume path: the checkpoint stores [next_file_id, remaining_files...]
                f_start_id = checkpoint['files'][0]
                files = checkpoint['files'][1:]
                args.resume_from_checkpoint = False
                num_files = len(files)

            shared_file_list = {}

            # pick this rank's shard; `remainder` skews the mapping when the
            # world size is not a multiple of the shard count
            if torch.distributed.is_initialized() and torch.distributed.get_world_size() > num_files:
                remainder = torch.distributed.get_world_size() % num_files
                data_file = files[(
                                              f_start_id * torch.distributed.get_world_size() + torch.distributed.get_rank() + remainder * f_start_id) % num_files]
            else:
                data_file = files[
                    (f_start_id * torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files]

            previous_file = data_file

            train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
            train_sampler = RandomSampler(train_data)
            train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                          batch_size=args.train_batch_size * args.n_gpu, num_workers=4,
                                          pin_memory=True)
            # shared_file_list["0"] = (train_dataloader, data_file)

            overflow_buf = None
            if args.allreduce_post_accumulation:
                overflow_buf = torch.cuda.IntTensor([0])

            for f_id in range(f_start_id + 1, len(files)):

                if torch.distributed.get_world_size() > num_files:
                    data_file = files[(f_id * torch.distributed.get_world_size() + torch.distributed.get_rank() + remainder * f_id) % num_files]
                else:
                    data_file = files[
                        (f_id * torch.distributed.get_world_size() + torch.distributed.get_rank()) % num_files]

                logger.info("file no %s file %s" % (f_id, previous_file))

                previous_file = data_file

                # kick off asynchronous loading of the *next* shard
                dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq,
                                             shared_file_list, args)

                train_iter = tqdm(train_dataloader, desc="Iteration") if is_main_process() else train_dataloader
                for step, batch in enumerate(train_iter):

                    training_steps += 1
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask,
                                 masked_lm_labels=masked_lm_labels, next_sentence_label=next_sentence_labels,
                                 checkpoint_activations=args.checkpoint_activations)
                    if args.n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.

                    divisor = args.gradient_accumulation_steps
                    if args.gradient_accumulation_steps > 1:
                        if not args.allreduce_post_accumulation:
                            # this division was merged into predivision
                            loss = loss / args.gradient_accumulation_steps
                            divisor = 1.0
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer,
                                            delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    average_loss += loss.item()

                    # the optimizer only steps once per accumulation window
                    if training_steps % args.gradient_accumulation_steps == 0:
                        global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)

                    if global_step >= args.max_steps:
                        last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps * divisor)
                        if torch.distributed.is_initialized():
                            average_loss /= torch.distributed.get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        if is_main_process():
                            logger.info("Total Steps:{} Final Loss = {}".format(
                                training_steps / args.gradient_accumulation_steps, average_loss.item()))
                    elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            print("Step:{} Average Loss = {} Step Loss = {} LR {}".format(global_step, average_loss / (
                                    args.log_freq * divisor),
                                                                                          loss.item() * args.gradient_accumulation_steps / divisor,
                                                                                          optimizer.param_groups[0][
                                                                                              'lr']))
                        average_loss = 0

                    if global_step >= args.max_steps or training_steps % (
                            args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            # Save a trained model
                            logger.info("** ** * Saving fine - tuned model ** ** * ")
                            model_to_save = model.module if hasattr(model,
                                                                    'module') else model  # Only save the model it-self
                            if args.resume_step < 0 or not args.phase2:
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
                            else:
                                output_save_file = os.path.join(args.output_dir,
                                                                "ckpt_{}.pt".format(global_step + args.phase1_end_step))
                            if args.do_train:
                                torch.save({'model': model_to_save.state_dict(),
                                            'optimizer': optimizer.state_dict(),
                                            'master params': list(amp.master_params(optimizer)),
                                            'files': [f_id] + files}, output_save_file)

                                most_recent_ckpts_paths.append(output_save_file)
                                # keep only the three most recent checkpoints on disk
                                if len(most_recent_ckpts_paths) > 3:
                                    ckpt_to_be_removed = most_recent_ckpts_paths.pop(0)
                                    os.remove(ckpt_to_be_removed)

                        if global_step >= args.max_steps:
                            del train_dataloader
                            # thread.join()
                            return args

                del train_dataloader
                # thread.join()
                # Make sure pool has finished and switch train_dataloader
                # NOTE: Will block until complete
                train_dataloader, data_file = dataset_future.result(timeout=None)

            epoch += 1
Пример #58
0
def main():
    """Adversarial training loop with Proximity / Con-Proximity losses.

    Builds the network selected by ``args.network``, optionally resumes from
    a fine-tune checkpoint, then for each epoch: runs adversarial training,
    evaluates white-box robustness, appends accuracy stats to disk, and
    checkpoints the model and optimizer every ``args.save_freq`` epochs.

    NOTE(review): relies on module-level globals (``args``, ``device``,
    ``train_loader``, ``test_loader``, ``stats_dir``, ``model_dir``) defined
    elsewhere in this file — confirm they are set before calling.
    """
    # init model, ResNet18() can be also used here for training
    # model = WideResNet().to(device)
    if args.network == 'smallCNN':
        model = SmallCNN().to(device)
    elif args.network == 'wideResNet':
        model = WideResNet().to(device)
    elif args.network == 'resnet':
        model = ResNet().to(device)
    else:
        model = VGG(args.network, num_classes=10).to(device)
    # mirror stdout into the log file for the rest of the run
    sys.stdout = Logger(os.path.join(args.log_dir, args.log_file))
    print(model)
    # 10 = number of classes; the trailing True flag's meaning is not visible
    # here — presumably GPU usage, confirm against Proximity's definition
    criterion_prox = Proximity(10, args.feat_size, True)
    criterion_conprox = Con_Proximity(10, args.feat_size, True)
    optimizer_prox = optim.SGD(criterion_prox.parameters(), lr=args.lr_prox)
    optimizer_conprox = optim.SGD(criterion_conprox.parameters(), lr=args.lr_conprox)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    if args.fine_tune:
        # resume model weights and optimizer state from a prior run
        base_dir = args.base_dir
        state_dict = torch.load("{}/{}_ep{}.pt".format(base_dir, args.base_model, args.checkpoint))
        opt = torch.load("{}/opt-{}_ep{}.tar".format(base_dir, args.base_model, args.checkpoint))
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(opt)


    natural_acc = []
    robust_acc = []

    for epoch in range(1, args.epochs + 1):
        # adjust learning rate for SGD
        adjust_learning_rate(optimizer, epoch)
        adjust_learning_rate(optimizer_prox, epoch)
        adjust_learning_rate(optimizer_conprox, epoch)

        start_time = time.time()

        # adversarial training
        train(model, device, train_loader, optimizer,
              criterion_prox, optimizer_prox,
              criterion_conprox, optimizer_conprox, epoch)

        # evaluation on natural examples
        print('================================================================')
        print("Current time: {}".format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
        # eval_train(model, device, train_loader)
        # eval_test(model, device, test_loader)
        natural_err_total, robust_err_total = eval_adv_test_whitebox(model, device, test_loader)
        # append per-epoch error totals to a running text log
        with open(os.path.join(stats_dir, '{}.txt'.format(args.save_model)), "a") as f:
            f.write("{} {} {}\n".format(epoch, natural_err_total, robust_err_total))

        print('using time:', datetime.timedelta(seconds=round(time.time() - start_time)))

        natural_acc.append(natural_err_total)
        robust_acc.append(robust_err_total)


        # snapshot the full accuracy history so far as a 2xN array
        file_name = os.path.join(stats_dir, '{}_stat{}.npy'.format(args.save_model, epoch))
        # np.save(file_name, np.stack((np.array(self.train_loss), np.array(self.test_loss),
        #                              np.array(self.train_acc), np.array(self.test_acc),
        #                              np.array(self.elasticity), np.array(self.x_grads),
        #                              np.array(self.fgsms), np.array(self.pgds),
        #                              np.array(self.cws))))
        np.save(file_name, np.stack((np.array(natural_acc), np.array(robust_acc))))

        # save checkpoint
        if epoch % args.save_freq == 0:
            torch.save(model.state_dict(),
                       os.path.join(model_dir, '{}_ep{}.pt'.format(args.save_model, epoch)))
            torch.save(optimizer.state_dict(),
                       os.path.join(model_dir, 'opt-{}_ep{}.tar'.format(args.save_model, epoch)))
            print("Ep{}: Model saved as {}.".format(epoch, args.save_model))
        print('================================================================')
Пример #59
0
def strip_optimizer(f='weights/last.pt'):  # from utils.utils import *; strip_optimizer()
    """Drop the optimizer state from a ``*.pt`` checkpoint to shrink it.

    The checkpoint at *f* is loaded onto the CPU, its ``'optimizer'`` entry is
    replaced with ``None`` (the optimizer state is not needed for inference and
    typically accounts for roughly two thirds of the file size), and the result
    is written back to the same path in place.
    """
    checkpoint = torch.load(f, map_location=torch.device('cpu'))
    checkpoint['optimizer'] = None  # optimizer state is the bulk of the file
    torch.save(checkpoint, f)
Пример #60
0
        # NOTE(review): tail of a GAN training loop. netG/netD/netA, the data
        # tensors, losses (errGD, errA, errD, lossG/lossA/lossD) and `writer`
        # are all defined upstream of this fragment.
        # Generator step: fake samples are labelled "real" so the generator is
        # rewarded when it fools the downstream network.
        label.fill_(real_label)#-- fake labels are real for generator cost
        errGA = criterion.forward(output_A,label)
        lossG += errGA.item()  # accumulate for the periodic scalar log below
        #errGA = errGA * 1/2
        errGA.backward()
        # errGD is presumably computed earlier in the batch; errG is combined
        # here only for logging, after backward() has already run.
        errG = (errGA + errGD)/2
        optimizerG.step()
        # print('batch_step',i+len(train_dataloader)*epoch)
        # Rough wall-clock timing of the first epoch.  NOTE(review): `&` is a
        # bitwise AND; it works here only because both operands are booleans,
        # and it does NOT short-circuit like `and`.  536 is assumed to be the
        # last batch index of an epoch -- TODO confirm vs len(train_dataloader).
        if (i==0) & (epoch == 0):
            start_time=time.time()
        if (i == 536) & (epoch == 0):
            end_time = time.time()
            print('1_epoch takes {} seconds'.format(end_time-start_time))
        # Scalar logging every 67 global steps; the divisors 2/3/3 presumably
        # match how many terms were accumulated into lossG/lossA/lossD upstream
        # -- verify against the accumulation code.
        if (i+len(train_dataloader)*epoch) % 67==0:
            print('epoch={},batch={}, lossG={}, lossA={}, lossD={}'.format(epoch,i,lossG/2,lossA/3,lossD/3))
            writer.add_scalar('data_adam/lossA', errA, i+len(train_dataloader)*epoch)
            writer.add_scalar('data_adam/lossD', errD, i+len(train_dataloader)*epoch)
            writer.add_scalar('data_adam/lossG', errG, i+len(train_dataloader)*epoch)
        # Image logging every 134 global steps; tensors are rescaled from
        # [-1, 1] (presumably tanh output -- confirm) to [0, 1] for display.
        # NOTE(review): this rescaling mutates `fake`/`ass_label` in the
        # enclosing scope; safe only if nothing reads them again afterwards.
        if (i+len(train_dataloader)*epoch) % 134==0 :
            fake = (fake +1)/2
            ass_label = (ass_label +1)/2
            writer.add_image('data/generator_adam_loss_{} input_picture'.format(i+len(train_dataloader)*epoch),input_img[:8],i+len(train_dataloader)*epoch)
            writer.add_image('data/generator_adam_loss_{} fake_picture'.format(i+len(train_dataloader)*epoch),fake[:8],i+len(train_dataloader)*epoch)
            writer.add_image('data/generator_adam_loss_{} ground_truth'.format(i+len(train_dataloader)*epoch),ass_label[:8],i+len(train_dataloader)*epoch)
    # do checkpointing
    # Save all three networks' state dicts each epoch; the "% 1" guard is a
    # no-op, apparently left over from a configurable save frequency.
    if (epoch+1) % 1 ==0:
        torch.save(netG.state_dict(), '%s/adam_netG_epoch_%d.pth' % (opt.outf, epoch+1))
        torch.save(netD.state_dict(), '%s/adam_netD_epoch_%d.pth' % (opt.outf, epoch+1))
        torch.save(netA.state_dict(), '%s/adam_netA_epoch_%d.pth' % (opt.outf, epoch+1))
writer.close()