Example #1
    def train(self):
        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            train_iterator = iter(train_loader)

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data.append(data)
                        iter_count += 1

                avg_train_loss = .0
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            data = {}
                            data['adj'] = batch_data[dd][ff]['adj'].pin_memory(
                            ).to(gpu_id, non_blocking=True)
                            data['edges'] = batch_data[dd][ff][
                                'edges'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            data['node_idx_gnn'] = batch_data[dd][ff][
                                'node_idx_gnn'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['node_idx_feat'] = batch_data[dd][ff][
                                'node_idx_feat'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['label'] = batch_data[dd][ff][
                                'label'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            data['att_idx'] = batch_data[dd][ff][
                                'att_idx'].pin_memory().to(gpu_id,
                                                           non_blocking=True)
                            data['subgraph_idx'] = batch_data[dd][ff][
                                'subgraph_idx'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            batch_fwd.append((data, ))

                    if batch_fwd:
                        train_loss = model(*batch_fwd).mean()
                        avg_train_loss += train_loss

                        # assign gradient
                        train_loss.backward()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

                # reduce
                train_loss = float(avg_train_loss.data.cpu().numpy())

                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                    logger.info(
                        "NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count, train_loss))

            # step the LR schedule once per epoch, after the optimizer updates
            lr_scheduler.step()

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()

        return 1
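
The per-key pin_memory().to(...) block above repeats the same pattern seven times; a minimal helper one could factor out (hypothetical, not part of the original trainer) is sketched below. pin_memory() gives page-locked host buffers so the copy with non_blocking=True can overlap with computation.

def dict_to_device(batch, device, keys):
    # pin each CPU tensor, then copy it to the target device asynchronously
    return {
        k: batch[k].pin_memory().to(device, non_blocking=True)
        for k in keys
    }

# usage inside the loop above (sketch):
#   data = dict_to_device(batch_data[dd][ff], gpu_id,
#                         ['adj', 'edges', 'node_idx_gnn', 'node_idx_feat',
#                          'label', 'att_idx', 'subgraph_idx'])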
Example #2
  def train(self):
    # create data loader
    train_dataset = eval(self.dataset_conf.loader_name)(
        self.config, split='train')
    dev_dataset = eval(self.dataset_conf.loader_name)(self.config, split='dev')
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=self.train_conf.shuffle,
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False)
    subset_indices = range(self.subsample_size)
    train_loader_sub = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=False,
        num_workers=self.train_conf.num_workers,
        collate_fn=train_dataset.collate_fn,
        drop_last=False,
        sampler=SubsetRandomSampler(subset_indices))
    dev_loader_sub = torch.utils.data.DataLoader(
        dev_dataset,
        batch_size=self.train_conf.batch_size,
        shuffle=False,
        num_workers=self.train_conf.num_workers,
        collate_fn=dev_dataset.collate_fn,
        drop_last=False,
        sampler=SubsetRandomSampler(subset_indices))

    # create models
    model = eval(self.model_conf.name)(self.model_conf)

    if self.use_gpu:
      model = nn.DataParallel(model, device_ids=self.gpus).cuda()

    # create optimizer
    params = filter(lambda p: p.requires_grad, model.parameters())
    if self.train_conf.optimizer == 'SGD':
      optimizer = optim.SGD(
          params,
          lr=self.train_conf.lr,
          momentum=self.train_conf.momentum,
          weight_decay=self.train_conf.wd)
    elif self.train_conf.optimizer == 'Adam':
      optimizer = optim.Adam(
          params, lr=self.train_conf.lr, weight_decay=self.train_conf.wd)
    else:
      raise ValueError("Non-supported optimizer!")

    early_stop = EarlyStopper([0.0], win_size=10, is_decrease=False)

    lr_scheduler = get_constant_schedule_with_warmup(
        optimizer, num_warmup_steps=self.warmup_steps)

    # reset gradient
    optimizer.zero_grad()

    # resume training or start from a pretrained model
    if self.train_conf.is_resume:
      model_snapshot = torch.load(
          self.train_conf.resume_model, map_location=self.device)
      # a pretrained snapshot may not match every head, so load it non-strictly
      model.load_state_dict(
          model_snapshot["model"], strict=not self.train_conf.pretrain)
      model.to(self.device)

    # Training Loop
    num_train = len(train_dataset)
    iter_count = 0
    best_val_loss = np.inf
    best_val_loss_test = np.inf
    best_win_pct_val = 0
    best_win_pct_val_test = 0

    results = defaultdict(list)
    for epoch in range(self.train_conf.max_epoch):

      # --------------------------------validation---------------------------------------------
      if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:

        #calculate validation loss
        model.eval()
        with torch.no_grad():
          result_dataset_val = self.cal_dataset_loss(model,dev_loader_sub)

        if self.is_val:
          logger.info("-----------------Avg. Validation Loss = {:.4f}, "
          "NMLL = {:.4f}, NMLL_opt = {:.4f}, Win_pct = {:.2f}%, "
          "NMLL_test = {:.4f}, NMLL_test_opt = {:.4f}, "
          "Win_pct_test = {:.2f}%--------------------".format(
            result_dataset_val['loss'], 
            result_dataset_val['nmll'], result_dataset_val['nmll_opt_sm'],
            result_dataset_val['win_pct_ai']*100, 
            result_dataset_val['nmll_test'], result_dataset_val['nmll_opt_sm_test'],
            result_dataset_val['win_pct_ai_test']*100))
          self.writer.add_scalar('nmll_opt_val', result_dataset_val['nmll_opt_sm'], iter_count)
          self.writer.add_scalar('nmll_opt_test_val', result_dataset_val['nmll_opt_sm_test'], iter_count)
          self.writer.add_scalar('win_pct_ai_val', result_dataset_val['win_pct_ai'], iter_count)
          self.writer.add_scalar('win_pct_ai_test_val', result_dataset_val['win_pct_ai_test'], iter_count)
        else:
          logger.info("-----------------Avg. Validation Loss = {:.4f}, "
            "NMLL = {:.4f}, NMLL_orig = {:.4f}, Win_pct = {:.2f}%, "
            "NMLL_test = {:.4f}, NMLL_test_orig = {:.4f}, "
            "Win_pct_test = {:.2f}%--------------------".format(
              result_dataset_val['loss'], 
              result_dataset_val['nmll'], result_dataset_val['nmll_orig'],
              result_dataset_val['win_pct']*100, 
              result_dataset_val['nmll_test'], result_dataset_val['nmll_test_orig'],
              result_dataset_val['win_pct_test']*100))

        self.writer.add_scalar('val_loss', result_dataset_val['loss'], iter_count)
        self.writer.add_scalar('nmll_loss_val', result_dataset_val['nmll'], iter_count)
        self.writer.add_scalar('nmll_loss_orig_val', result_dataset_val['nmll_orig'], iter_count)
        self.writer.add_scalar('nmll_loss_test_val', result_dataset_val['nmll_test'], iter_count)
        self.writer.add_scalar('nmll_loss_test_orig_val', result_dataset_val['nmll_test_orig'], iter_count)
        self.writer.add_scalar('win_pct_val', result_dataset_val['win_pct'], iter_count)
        self.writer.add_scalar('win_pct_val_test', result_dataset_val['win_pct_test'], iter_count)
        results['val_loss'] += [result_dataset_val['loss']]
        results['nmll_loss_val'] += [result_dataset_val['nmll']]
        results['nmll_loss_orig_val'] += [result_dataset_val['nmll_orig']]
        results['nmll_loss_test_val'] += [result_dataset_val['nmll_test']]
        results['nmll_loss_test_orig_val'] += [result_dataset_val['nmll_test_orig']]
        results['win_pct_val'] += [result_dataset_val['win_pct']]
        results['win_pct_val_test'] += [result_dataset_val['win_pct_test']]

        # save best model
        if result_dataset_val['loss'] < best_val_loss:
          best_val_loss = result_dataset_val['loss']
          best_val_loss_test = result_dataset_val['nmll_test']
          if self.is_val:
            best_win_pct_val = result_dataset_val['win_pct_ai']
            best_win_pct_val_test = result_dataset_val['win_pct_ai_test']
          else:
            best_win_pct_val = result_dataset_val['win_pct']
            best_win_pct_val_test = result_dataset_val['win_pct_test']
          snapshot(
              model.module if self.use_gpu else model,
              optimizer,
              self.config,
              epoch + 1,
              tag='best')

        logger.info("Current Best Validation Loss = {:.4f}".format(best_val_loss))

        # check early stop
        if early_stop.tick([result_dataset_val['loss']]):
          snapshot(
              model.module if self.use_gpu else model,
              optimizer,
              self.config,
              epoch + 1,
              tag='last')
          self.writer.close()
          break

      # --------------------------------------training-----------------------------------
      model.train()
      for data in train_loader:
        optimizer.zero_grad()

        if self.use_gpu:
          gpu_keys = [
              'max_node_size', 'X_data_tr', 'X_data_val', 'X_data_test',
              'F_tr', 'F_val', 'F_test', 'N_val', 'kernel_mask_val',
              'diagonal_mask_val', 'N_test', 'kernel_mask_test',
              'diagonal_mask_test', 'node_mask_tr', 'dim_mask', 'nmll',
              'dim_size'
          ]
          # move the whole batch dict to GPU in one call
          gpu_vals = data_to_gpu(*(data[k] for k in gpu_keys))
          data.update(zip(gpu_keys, gpu_vals))

        # both model variants take the same inputs; only the outputs differ
        model_args = (data['X_data_tr'], data['X_data_val'], data['F_tr'],
                      data['F_val'], data['node_mask_tr'], data['dim_mask'],
                      data['kernel_mask_val'], data['diagonal_mask_val'],
                      data['N_val'])
        model_kwargs = dict(
            device=self.device,
            eval_mode=True,
            X_data_test=data['X_data_test'],
            F_data_test=data['F_test'],
            kernel_mask_test=data['kernel_mask_test'],
            diagonal_mask_test=data['diagonal_mask_test'],
            N_data_test=data['N_test'])
        if self.model_conf.name == 'GpSMDoubleAtt':
          mu, var, weights, nmll, nmll_test = model(*model_args, **model_kwargs)
        elif self.model_conf.name == 'GpSMDoubleAttNoMu':
          var, weights, nmll, nmll_test = model(*model_args, **model_kwargs)
        else:
          raise ValueError("No model of given name!")
        # print("Outside: input size", data['X_data'].shape, "output_size", nmll.shape)

        nmll_orig = data['nmll']
        # fraction of GPs where the learned kernel beats (or ties within
        # 0.01 nats) the original NMLL
        win_pct_train = torch.sum(
            nmll < nmll_orig + 0.01).float() / nmll.shape[0]

        data_dim_vec = data['X_data_tr'].shape[-1]
        nmll_loss_train = torch.mean(nmll)

        train_loss = nmll_loss_train

        # calculate gradient
        train_loss.backward()

        nmll_loss_orig = torch.mean(nmll_orig)

        # clip gradients; clip_grad_norm_ returns the total norm computed
        # before clipping, which is what we log below
        grad_norm = float(nn.utils.clip_grad_norm_(model.parameters(), 1))
        optimizer.step()

        train_loss = float(train_loss.data.cpu().numpy())
        nmll_loss_train = float(nmll_loss_train.data.cpu().numpy())
        nmll_loss_train_orig = float(nmll_loss_orig.data.cpu().numpy())
        win_pct_train = float(win_pct_train.data.cpu().numpy())


        self.writer.add_scalar('train_loss', train_loss, iter_count)
        self.writer.add_scalar('nmll_loss_train', nmll_loss_train, iter_count)
        self.writer.add_scalar('nmll_loss_train_orig', nmll_loss_train_orig, iter_count)
        self.writer.add_scalar('win_pct_train', win_pct_train, iter_count)
        self.writer.add_scalar('grad_norm', grad_norm, iter_count)

        results['nmll_loss_train'] += [nmll_loss_train]
        results['nmll_loss_train_orig'] += [nmll_loss_train_orig]
        results['train_loss'] += [train_loss]
        results['win_pct_train'] += [win_pct_train]
        results['train_step'] += [iter_count]
        results['grad_norm'] += [grad_norm]

        # display loss
        if (iter_count + 1) % self.train_conf.display_iter == 0:
          logger.info("Loss @ epoch {:04d} iteration {:08d} = {:.4f}, NMLL = {:.4f}, NMLL_orig = {:.4f}, Win_pct = {:.2f}%, Grad_norm = {:.4f}, LR = {:.2e}".format(
              epoch + 1, iter_count + 1, train_loss, nmll_loss_train, nmll_loss_train_orig, win_pct_train*100, grad_norm, get_lr(optimizer)))

        iter_count += 1

      # snapshot model
      if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
        logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
        snapshot(model.module
                 if self.use_gpu else model, optimizer, self.config, epoch + 1)

      # note: the warmup schedule is stepped once per epoch here, so
      # num_warmup_steps is effectively measured in epochs
      lr_scheduler.step()


    #look at predictions, for debug purpose
    model.eval()
    with torch.no_grad():
      results_sample_tr = self.cal_sample_result(model,train_loader_sub)
      results_sample_dev = self.cal_sample_result(model,dev_loader_sub)
      result_dataset_tr = self.cal_dataset_loss(model,train_loader_sub)
      result_dataset_dev = self.cal_dataset_loss(model,dev_loader_sub)

    
    train_loss = result_dataset_tr['loss']
    results['best_val_loss'] = best_val_loss
    results['win_count_tr'] = results_sample_tr['win_pct']
    results['win_count_dev'] = results_sample_dev['win_pct']
    results['nmll_loss_sample_tr'] = results_sample_tr['nmll_loss_sample']
    results['nmll_loss_sample_dev'] = results_sample_dev['nmll_loss_sample']
    pickle.dump(results,
                open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
    self.writer.close()
    logger.info("Best Validation Loss = {:.4f}, "
      "Best Win_pct_val = {:.2f}%, " 
      "Best Val Loss on Test = {:.4f}, "
      "Best Win_pct_val_test = {:.2f}%, "
      "Final Training NMLL = {:.4f}, "
      "Training NMLL original = {:.4f}, "
      "Win_pct_train = {:.2f}%, "
      "Final Dev NMLL = {:.4f}, "
      "Dev NMLL original = {:.4f}, "
      "Win_pct_dev = {:.2f}%, "
      "Final Dev Test NMLL = {:.4f}, "
      "Dev Test NMLL original = {:.4f}, "
      "Win_pct_test_dev = {:.2f}%.".format(
        best_val_loss,
        best_win_pct_val*100,
        best_val_loss_test,
        best_win_pct_val_test*100,
        result_dataset_tr['nmll'],
        result_dataset_tr['nmll_orig'],
        result_dataset_tr['win_pct']*100,
        result_dataset_dev['nmll'],
        result_dataset_dev['nmll_orig'],
        result_dataset_dev['win_pct']*100,
        result_dataset_dev['nmll_test'],
        result_dataset_dev['nmll_test_orig'],
        result_dataset_dev['win_pct_test']*100))


    avg_nmll_tr = np.mean(results_sample_tr['nmll_sample_compare'],0)
    logger.info('% of GPs with higher marginal likelihood = {:.2f}%'.format(results_sample_tr['win_pct']*100))
    logger.info('Average NMLL on training samples: true = {}, learned = {}'.format(avg_nmll_tr[1],avg_nmll_tr[0]))
    avg_nmll_dev = np.mean(results_sample_dev['nmll_sample_compare'],0)
    logger.info('% of GPs with higher marginal likelihood = {:.2f}%'.format(results_sample_dev['win_pct']*100))
    logger.info('Average NMLL on testing samples: true = {}, learned = {}'.format(avg_nmll_dev[1],avg_nmll_dev[0]))
    snapshot(
        model.module if self.use_gpu else model,
        optimizer,
        self.config,
        self.train_conf.max_epoch + 1,
        tag='final')
    return None
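
Example #2 pulls get_constant_schedule_with_warmup from Hugging Face transformers; a minimal LambdaLR equivalent of that behavior (a sketch, not the library's implementation) is:

from torch.optim.lr_scheduler import LambdaLR

def constant_schedule_with_warmup(optimizer, num_warmup_steps):
    # scale the base LR from 0 up to 1 over the warmup, then hold it constant
    def lr_lambda(step):
        if step < num_warmup_steps:
            return float(step) / float(max(1, num_warmup_steps))
        return 1.0
    return LambdaLR(optimizer, lr_lambda)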
Example #3
    def train(self):
        # create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            split='train')
        dev_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                          split='dev')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)
        dev_loader = torch.utils.data.DataLoader(
            dev_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=False,
            num_workers=self.train_conf.num_workers,
            collate_fn=dev_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)

        if self.use_gpu:
            model = nn.DataParallel(model, device_ids=self.gpus).cuda()

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=10, is_decrease=False)

        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_steps,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        if self.train_conf.is_resume:
            load_model(model,
                       self.train_conf.resume_model,
                       optimizer=optimizer)

        # Training Loop
        iter_count = 0
        best_val_loss = np.inf
        results = defaultdict(list)
        for epoch in range(self.train_conf.max_epoch):
            # validation
            if (epoch + 1) % self.train_conf.valid_epoch == 0 or epoch == 0:
                model.eval()
                val_loss = []

                for data in tqdm(dev_loader):
                    if self.use_gpu:
                        data['node_feat'], data['node_mask'], data[
                            'label'] = data_to_gpu(data['node_feat'],
                                                   data['node_mask'],
                                                   data['label'])

                        if self.model_conf.name == 'LanczosNet':
                            data['L'], data['D'], data['V'] = data_to_gpu(
                                data['L'], data['D'], data['V'])
                        elif self.model_conf.name == 'GraphSAGE':
                            data['nn_idx'], data[
                                'nonempty_mask'] = data_to_gpu(
                                    data['nn_idx'], data['nonempty_mask'])
                        elif self.model_conf.name == 'GPNN':
                            data['L'], data['L_cluster'], data[
                                'L_cut'] = data_to_gpu(data['L'],
                                                       data['L_cluster'],
                                                       data['L_cut'])
                        else:
                            data['L'] = data_to_gpu(data['L'])[0]

                    with torch.no_grad():
                        if self.model_conf.name == 'AdaLanczosNet':
                            pred, _ = model(data['node_feat'],
                                            data['L'],
                                            label=data['label'],
                                            mask=data['node_mask'])
                        elif self.model_conf.name == 'LanczosNet':
                            pred, _ = model(data['node_feat'],
                                            data['L'],
                                            data['D'],
                                            data['V'],
                                            label=data['label'],
                                            mask=data['node_mask'])
                        elif self.model_conf.name == 'GraphSAGE':
                            pred, _ = model(data['node_feat'],
                                            data['nn_idx'],
                                            data['nonempty_mask'],
                                            label=data['label'],
                                            mask=data['node_mask'])
                        elif self.model_conf.name == 'GPNN':
                            pred, _ = model(data['node_feat'],
                                            data['L'],
                                            data['L_cluster'],
                                            data['L_cut'],
                                            label=data['label'],
                                            mask=data['node_mask'])
                        else:
                            pred, _ = model(data['node_feat'],
                                            data['L'],
                                            label=data['label'],
                                            mask=data['node_mask'])

                    curr_loss = (pred - data['label']
                                 ).abs().cpu().numpy() * self.const_factor
                    val_loss += [curr_loss]

                val_loss = float(np.mean(np.concatenate(val_loss)))
                logger.info("Avg. Validation MAE = {}".format(val_loss))
                self.writer.add_scalar('val_loss', val_loss, iter_count)
                results['val_loss'] += [val_loss]

                # save best model
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    snapshot(model.module if self.use_gpu else model,
                             optimizer,
                             self.config,
                             epoch + 1,
                             tag='best')

                logger.info(
                    "Current Best Validation MAE = {}".format(best_val_loss))

                # check early stop
                if early_stop.tick([val_loss]):
                    snapshot(model.module if self.use_gpu else model,
                             optimizer,
                             self.config,
                             epoch + 1,
                             tag='last')
                    self.writer.close()
                    break

            # training
            model.train()
            for data in train_loader:
                optimizer.zero_grad()

                if self.use_gpu:
                    data['node_feat'], data['node_mask'], data[
                        'label'] = data_to_gpu(data['node_feat'],
                                               data['node_mask'],
                                               data['label'])

                    if self.model_conf.name == 'LanczosNet':
                        data['L'], data['D'], data['V'] = data_to_gpu(
                            data['L'], data['D'], data['V'])
                    elif self.model_conf.name == 'GraphSAGE':
                        data['nn_idx'], data['nonempty_mask'] = data_to_gpu(
                            data['nn_idx'], data['nonempty_mask'])
                    elif self.model_conf.name == 'GPNN':
                        data['L'], data['L_cluster'], data[
                            'L_cut'] = data_to_gpu(data['L'],
                                                   data['L_cluster'],
                                                   data['L_cut'])
                    else:
                        data['L'] = data_to_gpu(data['L'])[0]

                if self.model_conf.name == 'AdaLanczosNet':
                    _, train_loss = model(data['node_feat'],
                                          data['L'],
                                          label=data['label'],
                                          mask=data['node_mask'])
                elif self.model_conf.name == 'LanczosNet':
                    _, train_loss = model(data['node_feat'],
                                          data['L'],
                                          data['D'],
                                          data['V'],
                                          label=data['label'],
                                          mask=data['node_mask'])
                elif self.model_conf.name == 'GraphSAGE':
                    _, train_loss = model(data['node_feat'],
                                          data['nn_idx'],
                                          data['nonempty_mask'],
                                          label=data['label'],
                                          mask=data['node_mask'])
                elif self.model_conf.name == 'GPNN':
                    _, train_loss = model(data['node_feat'],
                                          data['L'],
                                          data['L_cluster'],
                                          data['L_cut'],
                                          label=data['label'],
                                          mask=data['node_mask'])
                else:
                    _, train_loss = model(data['node_feat'],
                                          data['L'],
                                          label=data['label'],
                                          mask=data['node_mask'])

                # assign gradient
                train_loss.backward()
                optimizer.step()
                train_loss = float(train_loss.data.cpu().numpy())
                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                # display loss
                if (iter_count + 1) % self.train_conf.display_iter == 0:
                    logger.info(
                        "Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count + 1, train_loss))

                iter_count += 1

            # step the LR schedule once per epoch, after the optimizer updates
            lr_scheduler.step()

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model, optimizer,
                         self.config, epoch + 1)

        results['best_val_loss'] += [best_val_loss]
        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()
        logger.info("Best Validation MAE = {}".format(best_val_loss))

        return best_val_loss
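
Example #3 moves tensors with a data_to_gpu utility defined elsewhere in the repo; a plausible minimal version (an assumption about its interface, inferred from call sites such as data['L'] = data_to_gpu(data['L'])[0]) is:

def data_to_gpu(*tensors, device='cuda'):
    # always return a tuple so call sites can unpack any number of tensors
    return tuple(t.to(device) for t in tensors)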
Example #4
    def train(self):
        torch.autograd.set_detect_anomaly(True)  # debugging aid; slows training noticeably

        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,  # true for grid
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)
        criterion = nn.BCEWithLogitsLoss()

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)
            criterion = criterion.cuda()
        model.train()

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        # TODO: not used?
        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        best_acc = 0.
        # resume training
        # TODO: record resume_epoch to the saved file
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            train_iterator = iter(train_loader)

            avg_acc_whole_epoch = 0.
            cnt = 0.

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data.append(data)
                        iter_count += 1

                avg_train_loss = .0
                avg_acc = 0.
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            data = {}
                            data['adj'] = batch_data[dd][ff]['adj'].pin_memory(
                            ).to(gpu_id, non_blocking=True)
                            data['edges'] = batch_data[dd][ff][
                                'edges'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            # data['node_idx_gnn'] = batch_data[dd][ff]['node_idx_gnn'].pin_memory().to(gpu_id, non_blocking=True)
                            # data['node_idx_feat'] = batch_data[dd][ff]['node_idx_feat'].pin_memory().to(gpu_id, non_blocking=True)
                            # data['label'] = batch_data[dd][ff]['label'].pin_memory().to(gpu_id, non_blocking=True)
                            # data['att_idx'] = batch_data[dd][ff]['att_idx'].pin_memory().to(gpu_id, non_blocking=True)
                            data['subgraph_idx'] = batch_data[dd][ff][
                                'subgraph_idx'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['complete_graph_label'] = batch_data[dd][ff][
                                'complete_graph_label'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            batch_fwd.append((data, ))

                    pred = model(*batch_fwd)
                    label = data['complete_graph_label'][:, None]
                    train_loss = criterion(pred, label).mean()
                    train_loss.backward()

                    pred = (torch.sigmoid(pred) > 0.5).type_as(label)
                    avg_acc += (pred.eq(label)).float().mean().item()

                    avg_train_loss += train_loss.item()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                # note: the scheduler is stepped per iteration here, so the
                # milestones in lr_decay_epoch are counted in iterations
                lr_scheduler.step()
                avg_train_loss /= self.dataset_conf.num_fwd_pass  # num_fwd_pass always 1
                avg_acc /= self.dataset_conf.num_fwd_pass

                # weight each batch's mean accuracy by its size so the
                # division by cnt samples below gives a correct epoch average
                avg_acc_whole_epoch += avg_acc * len(
                    data['complete_graph_label'])
                cnt += len(data['complete_graph_label'])

                # reduce
                self.writer.add_scalar('train_loss', avg_train_loss,
                                       iter_count)
                self.writer.add_scalar('train_acc', avg_acc, iter_count)
                results['train_loss'] += [avg_train_loss]
                results['train_acc'] += [avg_acc]
                results['train_step'] += [iter_count]

                # if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                #   logger.info("NLL Loss @ epoch {:04d} iteration {:08d} = {}\tAcc = {}".format(epoch + 1, iter_count, train_loss, avg_acc))

            avg_acc_whole_epoch /= cnt
            is_new_best = avg_acc_whole_epoch > best_acc
            if is_new_best:
                logger.info('!!! New best')
                best_acc = avg_acc_whole_epoch
            logger.info("Avg acc = {} @ epoch {:04d}".format(
                avg_acc_whole_epoch, epoch + 1))

            # snapshot model
            if (epoch +
                    1) % self.train_conf.snapshot_epoch == 0 or is_new_best:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()

        return 1
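
Every trainer here constructs an EarlyStopper, and Examples #2 and #3 call its tick() method; a minimal sketch of the interface those call sites imply (an assumption -- the real class lives in the repo's utils) is:

class EarlyStopper:
    """Stop when the tracked metrics fail to improve for win_size ticks."""

    def __init__(self, init_vals, win_size=10, is_decrease=True):
        self.best = list(init_vals)
        self.win_size = win_size
        self.is_decrease = is_decrease  # True if a lower metric is better
        self.bad_ticks = 0

    def tick(self, vals):
        improved = all(
            (v < b) if self.is_decrease else (v > b)
            for v, b in zip(vals, self.best))
        if improved:
            self.best = list(vals)
            self.bad_ticks = 0
        else:
            self.bad_ticks += 1
        return self.bad_ticks >= self.win_size  # True means: stop training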
Example #5
    def train(self):
        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        model = eval(self.model_conf.name)(self.config)
        print('number of parameters : {}'.format(
            sum([np.prod(x.shape) for x in model.parameters()])))

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)

        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            # lr_scheduler.step()
            train_iterator = iter(train_loader)

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data.append(data)
                        iter_count += 1

                avg_train_loss = .0
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            data = {}
                            data['adj'] = batch_data[dd][ff]['adj'].pin_memory(
                            ).to(gpu_id, non_blocking=True)
                            data['edges'] = batch_data[dd][ff][
                                'edges'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            data['node_idx_gnn'] = batch_data[dd][ff][
                                'node_idx_gnn'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['node_idx_feat'] = batch_data[dd][ff][
                                'node_idx_feat'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            data['label'] = batch_data[dd][ff][
                                'label'].pin_memory().to(gpu_id,
                                                         non_blocking=True)
                            data['att_idx'] = batch_data[dd][ff][
                                'att_idx'].pin_memory().to(gpu_id,
                                                           non_blocking=True)
                            data['subgraph_idx'] = batch_data[dd][ff][
                                'subgraph_idx'].pin_memory().to(
                                    gpu_id, non_blocking=True)
                            batch_fwd.append((data, ))

                    if batch_fwd:
                        train_loss = model(*batch_fwd).mean()
                        avg_train_loss += train_loss

                        # assign gradient
                        train_loss.backward()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

                # reduce
                train_loss = float(avg_train_loss.data.cpu().numpy())

                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                    logger.info(
                        "NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count, train_loss))

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

            if (epoch + 1) % 20 == 0:
                print('saving graphs')
                model.eval()
                graphs_gen = [
                    get_graph(aa.cpu().data.numpy())
                    for aa in model.module._sampling(10)
                ]
                model.train()

                vis_graphs = []
                for gg in graphs_gen:
                    CGs = [gg.subgraph(c) for c in nx.connected_components(gg)]
                    CGs = sorted(CGs,
                                 key=lambda x: x.number_of_nodes(),
                                 reverse=True)
                    vis_graphs += [CGs[0]]

                total = len(vis_graphs)  #min(3, len(vis_graphs))
                draw_graph_list(vis_graphs[:total],
                                2,
                                int(total // 2),
                                fname='sample/gran_%d.png' % epoch,
                                layout='spring')

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()

        return 1
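
The snapshot()/load_model() pair used throughout is assumed from the repo's utils; a minimal sketch consistent with the call sites above (hypothetical -- the real helpers may store more state) is:

import os
import torch

def snapshot(model, optimizer, config, epoch, tag=None, scheduler=None):
    # bundle everything load_model needs to restore training
    state = {
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'epoch': epoch,
    }
    if scheduler is not None:
        state['scheduler'] = scheduler.state_dict()
    name = 'model_snapshot_{}.pth'.format(tag or '{:07d}'.format(epoch))
    torch.save(state, os.path.join(config.save_dir, name))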
Example #6
    def train(self):
        ### create data loader
        train_dataset = eval(self.dataset_conf.loader_name)(self.config,
                                                            self.graphs_train,
                                                            tag='train')
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.train_conf.batch_size,
            shuffle=self.train_conf.shuffle,
            num_workers=self.train_conf.num_workers,
            collate_fn=train_dataset.collate_fn,
            drop_last=False)

        # create models
        # model = eval(self.model_conf.name)(self.config)
        from model.transformer import make_model
        model = make_model(max_node=self.config.model.max_num_nodes,
                           d_out=20,
                           N=7,
                           d_model=64,
                           d_ff=64,
                           dropout=0.4)  # d_out, N, d_model, d_ff, h
        # d_out=20, N=15, d_model=16, d_ff=16, dropout=0.2) # d_out, N, d_model, d_ff, h
        # d_out=20, N=3, d_model=64, d_ff=64, dropout=0.1) # d_out, N, d_model, d_ff, h

        if self.use_gpu:
            model = DataParallel(model, device_ids=self.gpus).to(self.device)

        # create optimizer
        params = filter(lambda p: p.requires_grad, model.parameters())
        if self.train_conf.optimizer == 'SGD':
            optimizer = optim.SGD(params,
                                  lr=self.train_conf.lr,
                                  momentum=self.train_conf.momentum,
                                  weight_decay=self.train_conf.wd)
        elif self.train_conf.optimizer == 'Adam':
            optimizer = optim.Adam(params,
                                   lr=self.train_conf.lr,
                                   weight_decay=self.train_conf.wd)
        else:
            raise ValueError("Non-supported optimizer!")

        early_stop = EarlyStopper([0.0], win_size=100, is_decrease=False)
        lr_scheduler = optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=self.train_conf.lr_decay_epoch,
            gamma=self.train_conf.lr_decay)

        # reset gradient
        optimizer.zero_grad()

        # resume training
        resume_epoch = 0
        if self.train_conf.is_resume:
            model_file = os.path.join(self.train_conf.resume_dir,
                                      self.train_conf.resume_model)
            load_model(model.module if self.use_gpu else model,
                       model_file,
                       self.device,
                       optimizer=optimizer,
                       scheduler=lr_scheduler)
            resume_epoch = self.train_conf.resume_epoch

        # Training Loop
        iter_count = 0
        results = defaultdict(list)
        for epoch in range(resume_epoch, self.train_conf.max_epoch):
            model.train()
            train_iterator = iter(train_loader)

            for inner_iter in range(len(train_loader) // self.num_gpus):
                optimizer.zero_grad()

                batch_data = []
                if self.use_gpu:
                    for _ in self.gpus:
                        data = next(train_iterator)
                        batch_data += [data]

                avg_train_loss = .0
                for ff in range(self.dataset_conf.num_fwd_pass):
                    batch_fwd = []

                    if self.use_gpu:
                        for dd, gpu_id in enumerate(self.gpus):
                            data = batch_data[dd]

                            adj, lens = data['adj'], data['lens']

                            # this is only for grid
                            # adj = adj[:, :, :100, :100]
                            # lens = [min(99, x) for x in lens]

                            adj = adj.to('cuda:%d' % gpu_id)

                            # build masks
                            node_feat, attn_mask, lens = preprocess(adj, lens)
                            batch_fwd.append(
                                (node_feat, attn_mask.clone(), lens))

                    if batch_fwd:
                        node_feat, attn_mask, lens = batch_fwd[0]
                        log_theta, log_alpha = model(*batch_fwd)

                        train_loss = model.module.mix_bern_loss(
                            log_theta, log_alpha, adj, lens)

                        avg_train_loss += train_loss

                        # assign gradient
                        train_loss.backward()

                # clip_grad_norm_(model.parameters(), 5.0e-0)
                optimizer.step()
                avg_train_loss /= float(self.dataset_conf.num_fwd_pass)

                # reduce
                train_loss = float(avg_train_loss.data.cpu().numpy())

                self.writer.add_scalar('train_loss', train_loss, iter_count)
                results['train_loss'] += [train_loss]
                results['train_step'] += [iter_count]

                if iter_count % self.train_conf.display_iter == 0 or iter_count == 1:
                    logger.info(
                        "NLL Loss @ epoch {:04d} iteration {:08d} = {}".format(
                            epoch + 1, iter_count, train_loss))

                if epoch % 50 == 0 and inner_iter == 0:
                    model.eval()
                    print('saving graphs')
                    graphs_gen = [get_graph(adj[0].cpu().data.numpy())] + [
                        get_graph(aa.cpu().data.numpy())
                        for aa in model.module.sample(
                            19, max_node=self.config.model.max_num_nodes)
                    ]
                    model.train()

                    vis_graphs = []
                    for gg in graphs_gen:
                        CGs = [
                            gg.subgraph(c) for c in nx.connected_components(gg)
                        ]
                        CGs = sorted(CGs,
                                     key=lambda x: x.number_of_nodes(),
                                     reverse=True)
                        if CGs:
                            vis_graphs += [CGs[0]]

                    try:
                        total = len(vis_graphs)  #min(3, len(vis_graphs))
                        draw_graph_list(vis_graphs[:total],
                                        4,
                                        int(total // 4),
                                        fname='sample/trans_sl:%d_%d.png' %
                                        (int(model.module.self_loop), epoch),
                                        layout='spring')
                    except Exception as e:
                        print('sample saving failed: {}'.format(e))

            # snapshot model
            if (epoch + 1) % self.train_conf.snapshot_epoch == 0:
                logger.info("Saving Snapshot @ epoch {:04d}".format(epoch + 1))
                snapshot(model.module if self.use_gpu else model,
                         optimizer,
                         self.config,
                         epoch + 1,
                         scheduler=lr_scheduler)

            # step the LR schedule once per epoch, after the optimizer updates
            lr_scheduler.step()

        pickle.dump(
            results,
            open(os.path.join(self.config.save_dir, 'train_stats.p'), 'wb'))
        self.writer.close()

        return 1
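
Examples #5 and #6 both turn sampled adjacency matrices into networkx graphs via get_graph and keep only the largest connected component for plotting; a plausible minimal get_graph (an assumption about the helper, not its actual definition) is:

import networkx as nx
import numpy as np

def get_graph(adj):
    # binarize, symmetrize, and drop self-loops before building the graph
    adj = (np.asarray(adj) > 0.5).astype(int)
    adj = np.maximum(adj, adj.T)
    np.fill_diagonal(adj, 0)
    return nx.from_numpy_array(adj)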