예제 #1
0
파일: models.py 프로젝트: zgcgreat/WSDM
    def __init__(self, input_dim=None, output_dim=1, init_path=None, opt_algo='gd', learning_rate=1e-2, l2_weight=0,
                 random_seed=None):
        """Sparse logistic regression: y_prob = sigmoid(X @ w + b).

        Loss is mean sigmoid cross-entropy plus an L2 penalty (weighted by
        ``l2_weight``) on the linear term.  Variables may be restored from
        ``init_path`` via ``utils.init_var_map``.
        """
        Model.__init__(self)
        # Variable specs consumed by utils.init_var_map: (name, shape, init, dtype).
        var_specs = [
            ('w', [input_dim, output_dim], 'xavier', dtype),
            ('b', [output_dim], 'zero', dtype),
        ]
        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            # Features arrive as a sparse matrix; labels as a dense tensor.
            self.X = tf.sparse_placeholder(dtype)
            self.y = tf.placeholder(dtype)
            self.vars = utils.init_var_map(var_specs, init_path)  # create/restore w, b

            weight = self.vars['w']
            bias = self.vars['b']
            linear = tf.sparse_tensor_dense_matmul(self.X, weight)
            logits = tf.reshape(linear + bias, [-1])
            self.y_prob = tf.sigmoid(logits)

            # Cross-entropy plus L2 on the linear term (matches the original code).
            xent = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y, logits=logits)
            self.loss = tf.reduce_mean(xent) + l2_weight * tf.nn.l2_loss(linear)
            self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)
예제 #2
0
파일: models.py 프로젝트: zgcgreat/WSDM
    def __init__(self, field_sizes=None, embed_size=10, layer_sizes=None, layer_acts=None, drop_out=None,
                 embed_l2=None, layer_l2=None, init_path=None, opt_algo='gd', learning_rate=1e-2, random_seed=None):
        """Field embeddings followed by a fully-connected net (FNN-style model).

        Each sparse field is embedded into ``embed_size`` dims; the concatenated
        embeddings feed an MLP defined by ``layer_sizes``/``layer_acts`` with
        dropout keep-probs fed through ``self.layer_keeps``.

        Fixes vs. the previous revision:
        - the debug print used a doubled backslash, so it emitted a literal
          backslash-t instead of a tab;
        - the embedding L2 penalty is now gated on ``embed_l2`` itself (the old
          code gated it on ``layer_l2`` and raised TypeError when ``layer_l2``
          was set while ``embed_l2`` was None).
        """
        Model.__init__(self)
        init_vars = []
        num_inputs = len(field_sizes)
        print('num_inputs:{0}\tlayer_size:{1}'.format(num_inputs, layer_sizes))
        # One embedding matrix per field: each field value maps to an embed_size vector.
        for i in range(num_inputs):
            init_vars.append(('embed_%d' % i, [field_sizes[i], embed_size], 'xavier', dtype))
        node_in = num_inputs * embed_size  # width of the concatenated embedding layer
        for i in range(len(layer_sizes)):
            init_vars.append(('w%d' % i, [node_in, layer_sizes[i]], 'xavier', dtype))
            init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
            node_in = layer_sizes[i]

        print('init_vars:', init_vars)
        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
            self.y = tf.placeholder(dtype)
            # Dropout keep-probs: train keeps 1 - drop_out, test keeps everything.
            self.keep_prob_train = 1 - np.array(drop_out)
            self.keep_prob_test = np.ones_like(drop_out)
            self.layer_keeps = tf.placeholder(dtype)
            self.vars = utils.init_var_map(init_vars, init_path)
            w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
            # Concatenate every field's embedding into one input vector.
            xw = tf.concat([tf.sparse_tensor_dense_matmul(self.X[i], w0[i]) for i in range(num_inputs)], 1)
            l = xw

            for i in range(len(layer_sizes)):
                wi = self.vars['w%d' % i]
                bi = self.vars['b%d' % i]
                print('第{0}个隐藏层l.shape, wi.shape, bi.shape'.format(i), l.shape, wi.shape, bi.shape)
                l = tf.nn.dropout(
                    utils.activate(
                        tf.matmul(l, wi) + bi,
                        layer_acts[i]),
                    self.layer_keeps[i])

            l = tf.squeeze(l)  # drop all size-1 dims -> per-example logits
            self.y_prob = tf.sigmoid(l)

            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=l, labels=self.y))
            # Apply each regularizer only when its coefficient is provided.
            if embed_l2 is not None:
                self.loss += embed_l2 * tf.nn.l2_loss(xw)
            if layer_l2 is not None:
                for i in range(len(layer_sizes)):
                    self.loss += layer_l2[i] * tf.nn.l2_loss(self.vars['w%d' % i])
            self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)
예제 #3
0
파일: models.py 프로젝트: zgcgreat/WSDM
    def __init__(self, input_dim=None, output_dim=1, factor_order=10, init_path=None, opt_algo='gd', learning_rate=1e-2,
                 l2_w=0, l2_v=0, random_seed=None):
        """Factorization Machine: linear term plus second-order interactions.

        Interactions are computed with the O(n*k) identity
        0.5 * ((Xv)^2 - X^2 v^2); ``l2_w``/``l2_v`` weight the L2 penalties on
        the linear and factor terms respectively.
        """
        Model.__init__(self)
        var_specs = [
            ('w', [input_dim, output_dim], 'xavier', dtype),
            ('v', [input_dim, factor_order], 'xavier', dtype),
            ('b', [output_dim], 'zero', dtype),
        ]
        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            self.X = tf.sparse_placeholder(dtype)
            self.y = tf.placeholder(dtype)
            self.vars = utils.init_var_map(var_specs, init_path)

            w = self.vars['w']
            v = self.vars['v']
            b = self.vars['b']

            # Element-wise square of X, kept sparse (same indices, squared values).
            X_square = tf.SparseTensor(self.X.indices, tf.square(self.X.values), tf.to_int64(tf.shape(self.X)))
            xv = tf.square(tf.sparse_tensor_dense_matmul(self.X, v))
            # Pairwise-interaction term via the (Xv)^2 - X^2 v^2 identity.
            interaction = 0.5 * tf.reshape(
                tf.reduce_sum(xv - tf.sparse_tensor_dense_matmul(X_square, tf.square(v)), 1),
                [-1, output_dim])
            xw = tf.sparse_tensor_dense_matmul(self.X, w)
            logits = tf.reshape(xw + b + interaction, [-1])
            self.y_prob = tf.sigmoid(logits)

            cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.y)
            self.loss = (tf.reduce_mean(cross_entropy)
                         + l2_w * tf.nn.l2_loss(xw)
                         + l2_v * tf.nn.l2_loss(xv))
            self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)
예제 #4
0
    def __init__(self,
                 train_loader,
                 test_loader,
                 embed_size=10,
                 layer_size=None,
                 layer_act=None,
                 layer_keeps=None,
                 opt_algo='gd',
                 learning_rate=0.01,
                 epoch=10,
                 early_stop_round=None,
                 l2=None,
                 random_seed=None):
        """Embedding + MLP model over the fields declared in config.FIELD_SIZES.

        One embedding matrix per field is created; the embedded fields are
        concatenated and pushed through dense layers of widths ``layer_size``
        with activations ``layer_act`` and dropout keep-probs ``layer_keeps``.
        Loss is mean sigmoid cross-entropy, optionally L2-regularized with the
        single scalar ``l2`` on every embedding and layer weight.
        """
        # NOTE(review): this graph is never entered — the `with
        # self.graph.as_default():` below is commented out, so all ops are
        # built on TF's default graph. Confirm this is intended.
        self.graph = tf.Graph()

        self.train_loader = train_loader
        self.test_loader = test_loader
        self.embed_size = embed_size
        self.layer_size = layer_size
        self.layer_act = layer_act
        self.layer_keeps = layer_keeps

        # Variable specs: one 'embed_{i}' matrix per field, then w/b per layer.
        self.num_fields = len(config.FIELD_SIZES)
        self.var_list = []
        for idx in range(self.num_fields):
            self.var_list.append([
                'embed_{}'.format(idx),
                [config.FIELD_SIZES[idx], self.embed_size], 'xavier'
            ])

        # First hidden layer consumes all field embeddings concatenated.
        in_size = self.num_fields * self.embed_size
        for idx in range(len(layer_size)):
            self.var_list.append(
                ['w_{}'.format(idx), [in_size, layer_size[idx]], 'xavier'])
            self.var_list.append(
                ['b_{}'.format(idx), [layer_size[idx]], 'zero'])
            in_size = layer_size[idx]

        self.var_dict = utils.get_var(self.var_list)

        self.opt_algo = opt_algo
        self.learning_rate = learning_rate
        self.epoch = epoch
        self.early_stop_round = early_stop_round
        self.l2 = l2
        self.random_seed = random_seed

        # Histories populated during training/evaluation elsewhere.
        self.time_scores = []
        self.train_scores = []
        self.test_scores = []

        #         with self.graph.as_default():
        if self.random_seed is not None:
            tf.set_random_seed(self.random_seed)
        # One sparse placeholder per field; labels are dense.
        self.X = [
            tf.sparse_placeholder(config.DTYPE) for n in range(self.num_fields)
        ]
        self.y = tf.placeholder(config.DTYPE)

        with tf.variable_scope('Dense_Real_Layer'):
            # Embed each sparse field and concatenate into one dense input.
            w_embed = [
                self.var_dict['embed_{}'.format(idx)]
                for idx in range(self.num_fields)
            ]
            xw = tf.concat([
                tf.sparse_tensor_dense_matmul(self.X[idx], w_embed[idx])
                for idx in range(self.num_fields)
            ], 1)
            layer_out = xw

        for idx in range(len(layer_size)):
            with tf.variable_scope('Hiden_Layer_{}'.format(idx)):
                wi = self.var_dict['w_{}'.format(idx)]
                bi = self.var_dict['b_{}'.format(idx)]
                # affine -> activation -> dropout, per layer
                layer_out = tf.nn.dropout(
                    utils.activate(
                        tf.matmul(layer_out, wi) + bi, self.layer_act[idx]),
                    self.layer_keeps[idx])

        layer_out = tf.squeeze(layer_out)  # drop size-1 dims -> logits
        self.y_preds = tf.sigmoid(layer_out)

        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y,
                                                    logits=layer_out))
        # Single scalar l2 applied to all embedding and layer weights.
        if self.l2 is not None:
            for idx in range(self.num_fields):
                self.loss += self.l2 * tf.nn.l2_loss(
                    self.var_dict['embed_{}'.format(idx)])
            for idx in range(len(self.layer_size)):
                self.loss += self.l2 * tf.nn.l2_loss(
                    self.var_dict['w_{}'.format(idx)])

        self.optimizer = utils.get_optimizer(self.opt_algo, self.learning_rate,
                                             self.loss)

        self.sess = tf.Session()
        tf.global_variables_initializer().run(session=self.sess)
예제 #5
0
def main(_):
    """Build the encoder-decoder (optionally GAN) language model and run it.

    Training mode: constructs a training model and a variable-sharing eval
    model, restores a checkpoint when available, then runs up to
    ``cfg.max_epoch`` epochs with periodic validation.
    Test mode: requires a restorable checkpoint and runs one test epoch.
    """
    vocab = Vocab()
    vocab.load_from_pickle()
    reader = Reader(vocab)

    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    with tf.Graph().as_default(), tf.Session(config=config_proto) as session:
        with tf.variable_scope("Model") as scope:
            if cfg.training:
                # Learning rates live in non-trainable variables so they can
                # be updated via tf.assign while the session runs.
                with tf.variable_scope("LR"):
                    g_lr = tf.get_variable("g_lr",
                                           shape=[],
                                           initializer=tf.zeros_initializer,
                                           trainable=False)
                    d_lr = tf.get_variable("d_lr",
                                           shape=[],
                                           initializer=tf.zeros_initializer,
                                           trainable=False)
                g_optimizer = utils.get_optimizer(g_lr, cfg.g_optimizer)
                d_optimizer = utils.get_optimizer(d_lr, cfg.d_optimizer)
                model = EncoderDecoderModel(vocab,
                                            True,
                                            use_gan=cfg.use_gan,
                                            g_optimizer=g_optimizer,
                                            d_optimizer=d_optimizer)
                scope.reuse_variables()  # eval model shares the training weights
                eval_model = EncoderDecoderModel(vocab,
                                                 False,
                                                 use_gan=cfg.use_gan)
            else:
                test_model = EncoderDecoderModel(vocab,
                                                 False,
                                                 use_gan=cfg.use_gan)
                scope.reuse_variables()
            # Generator instance used to decode samples in either mode.
            generator = EncoderDecoderModel(vocab,
                                            False,
                                            use_gan=cfg.use_gan,
                                            generator=True)
            decode_op = beam_decode_op(generator, vocab, cfg.beam_size)
        saver = tf.train.Saver()
        try:
            # try to restore a saved model file
            saver.restore(session, cfg.load_file)
            print("Model restored from", cfg.load_file)
        except ValueError:
            # No restorable checkpoint: fresh init is acceptable for training,
            # fatal for testing.
            if cfg.training:
                tf.initialize_all_variables().run()
                print("No loadable model file, new model initialized.")
            else:
                print("You need to provide a valid model file for testing!")
                sys.exit(1)

        if cfg.training:
            steps = 0
            train_perps = []
            valid_perps = []
            session.run(tf.assign(g_lr, cfg.g_learning_rate))
            session.run(tf.assign(d_lr, cfg.d_learning_rate))
            if cfg.sc_use_kld_weight:
                min_kld_weight = cfg.anneal_max - 1e-4
            else:
                min_kld_weight = -1  # sentinel: KLD weight not used by the scheduler
            scheduler = utils.Scheduler(cfg.min_d_acc, cfg.max_d_acc,
                                        cfg.max_perplexity, min_kld_weight,
                                        cfg.sc_list_size, cfg.sc_decay)
            for i in range(cfg.max_epoch):
                print("\nEpoch: %d" % (i + 1))
                perplexity, steps = run_epoch(i, session, model, generator,
                                              reader.training(), vocab, saver,
                                              steps, cfg.max_steps, scheduler,
                                              cfg.use_gan, cfg.gen_every,
                                              decode_op)
                print("Epoch: %d Train Perplexity: %.3f" % (i + 1, perplexity))
                train_perps.append(perplexity)
                # Validate every cfg.validate_every epochs (<= 0 disables).
                if cfg.validate_every > 0 and (i +
                                               1) % cfg.validate_every == 0:
                    perplexity, _ = run_epoch(i, session,
                                              eval_model, generator,
                                              reader.validation(), vocab, None,
                                              0, -1, None, cfg.use_gan, -1,
                                              decode_op)
                    print("Epoch: %d Validation Perplexity: %.3f" %
                          (i + 1, perplexity))
                    valid_perps.append(perplexity)
                else:
                    valid_perps.append(None)  # keep one entry per epoch, aligned
                print('Train:', train_perps)
                print('Valid:', valid_perps)
                if steps >= cfg.max_steps:
                    break
        else:
            print('\nTesting')
            perplexity, _ = run_epoch(0, session, test_model, generator,
                                      reader.testing(), vocab, None, 0,
                                      cfg.max_steps, None, cfg.use_gan, -1,
                                      decode_op)
            print("Test Perplexity: %.3f" % perplexity)
예제 #6
0
def train(args):
    """Train a GCMC-style rating model on MovieLens; log train/valid/test RMSE.

    Runs up to ``args.train_max_iter`` full-batch iterations, validating every
    ``args.train_valid_interval`` iterations with LR decay and early stopping
    driven by validation RMSE.

    Fixes vs. the previous revision:
    - ``best_test_rmse`` and ``logging_str`` are initialized before the loop,
      so neither the final summary nor the first validation log line can raise
      NameError when validation never runs/improves or when the valid interval
      fires before the first log interval;
    - the running loss written to the CSV logger now uses the same denominator
      (``iter_idx``) as the console message (was off by one);
    - ``np.average`` is no longer called on an empty duration list.
    """
    print(args)
    dataset = MovieLens(args.data_name,
                        args.device,
                        use_one_hot_fea=args.use_one_hot_fea,
                        symm=args.gcn_agg_norm_symm,
                        test_ratio=args.data_test_ratio,
                        valid_ratio=args.data_valid_ratio)
    print("Loading data finished ...\n")

    args.src_in_units = dataset.user_feature_shape[1]
    args.dst_in_units = dataset.movie_feature_shape[1]
    args.rating_vals = dataset.possible_rating_values

    ### build the net
    net = Net(args=args)
    net = net.to(args.device)
    nd_possible_rating_values = th.FloatTensor(
        dataset.possible_rating_values).to(args.device)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### prepare training data
    train_gt_labels = dataset.train_labels
    train_gt_ratings = dataset.train_truths

    ### prepare the CSV loggers under args.save_dir
    train_loss_logger = MetricLogger(
        ['iter', 'loss', 'rmse'], ['%d', '%.4f', '%.4f'],
        os.path.join(args.save_dir, 'train_loss%d.csv' % args.save_id))
    valid_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                     os.path.join(
                                         args.save_dir,
                                         'valid_loss%d.csv' % args.save_id))
    test_loss_logger = MetricLogger(['iter', 'rmse'], ['%d', '%.4f'],
                                    os.path.join(
                                        args.save_dir,
                                        'test_loss%d.csv' % args.save_id))

    ### training-state bookkeeping
    best_valid_rmse = np.inf
    best_test_rmse = np.inf  # defined up front: validation may never run
    no_better_valid = 0
    best_iter = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0
    logging_str = ''  # defined up front: valid interval may fire before log interval

    # Validation shares the training encoder graph; only the decoder differs.
    dataset.train_enc_graph = dataset.train_enc_graph.to(args.device)
    dataset.train_dec_graph = dataset.train_dec_graph.to(args.device)
    dataset.valid_enc_graph = dataset.train_enc_graph
    dataset.valid_dec_graph = dataset.valid_dec_graph.to(args.device)
    dataset.test_enc_graph = dataset.test_enc_graph.to(args.device)
    dataset.test_dec_graph = dataset.test_dec_graph.to(args.device)

    print("Start training ...")
    dur = []
    for iter_idx in range(1, args.train_max_iter):
        if iter_idx > 3:  # skip warm-up iterations when timing
            t0 = time.time()
        net.train()
        pred_ratings = net(dataset.train_enc_graph, dataset.train_dec_graph,
                           dataset.user_feature, dataset.movie_feature)
        loss = rating_loss_net(pred_ratings, train_gt_labels).mean()
        count_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(net.parameters(), args.train_grad_clip)
        optimizer.step()

        if iter_idx > 3:
            dur.append(time.time() - t0)

        if iter_idx == 1:
            print("Total #Param of net: %d" % (torch_total_param_num(net)))
            print(
                torch_net_info(net,
                               save_path=os.path.join(
                                   args.save_dir, 'net%d.txt' % args.save_id)))

        # Expected rating under the softmax distribution over discrete values.
        real_pred_ratings = (th.softmax(pred_ratings, dim=1) *
                             nd_possible_rating_values.view(1, -1)).sum(dim=1)
        rmse = ((real_pred_ratings - train_gt_ratings)**2).sum()
        count_rmse += rmse.item()
        count_num += pred_ratings.shape[0]

        if iter_idx % args.train_log_interval == 0:
            # iter_idx iterations have run so far (range starts at 1), so use
            # the same denominator for the CSV and the console message.
            avg_loss = count_loss / iter_idx
            train_loss_logger.log(iter=iter_idx,
                                  loss=avg_loss,
                                  rmse=count_rmse / count_num)
            logging_str = "Iter={}, loss={:.4f}, rmse={:.4f}, time={:.4f}".format(
                iter_idx, avg_loss, count_rmse / count_num,
                np.average(dur) if dur else 0.0)
            count_rmse = 0
            count_num = 0

        if iter_idx % args.train_valid_interval == 0:
            valid_rmse = evaluate(args=args,
                                  net=net,
                                  dataset=dataset,
                                  segment='valid')
            valid_loss_logger.log(iter=iter_idx, rmse=valid_rmse)
            logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

            if valid_rmse < best_valid_rmse:
                best_valid_rmse = valid_rmse
                no_better_valid = 0
                best_iter = iter_idx
                # Test RMSE is reported at the best-validation iteration.
                test_rmse = evaluate(args=args,
                                     net=net,
                                     dataset=dataset,
                                     segment='test')
                best_test_rmse = test_rmse
                test_loss_logger.log(iter=iter_idx, rmse=test_rmse)
                logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
            else:
                no_better_valid += 1
                if no_better_valid > args.train_early_stopping_patience\
                    and learning_rate <= args.train_min_lr:
                    logging.info(
                        "Early stopping threshold reached. Stop training.")
                    break
                if no_better_valid > args.train_decay_patience:
                    new_lr = max(learning_rate * args.train_lr_decay_factor,
                                 args.train_min_lr)
                    if new_lr < learning_rate:
                        learning_rate = new_lr
                        logging.info("\tChange the LR to %g" % new_lr)
                        for p in optimizer.param_groups:
                            p['lr'] = learning_rate
                        no_better_valid = 0
        if iter_idx % args.train_log_interval == 0:
            print(logging_str)
    print('Best Iter Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
          format(best_iter, best_valid_rmse, best_test_rmse))
    train_loss_logger.close()
    valid_loss_logger.close()
    test_loss_logger.close()
    def __init__(self,
                 field_size=None,
                 embed_size=10,
                 layer_sizes=None,
                 layer_acts=None,
                 drop_out=None,
                 embed_l2=None,
                 layer_l2=None,
                 init_path=None,
                 opt_algo='gd',
                 learning_rate=1e-3,
                 random_seed=None):
        """Inner-Product Neural Network (IPNN): field embeddings plus all
        pairwise inner products between them, fed into an MLP.

        Fix vs. the previous revision: the embedding L2 penalty is gated on
        ``embed_l2`` itself instead of ``layer_l2`` (the old code raised a
        TypeError when ``layer_l2`` was set while ``embed_l2`` was None, and
        silently dropped both penalties when only ``embed_l2`` was set).
        """
        Model.__init__(self)

        init_vars = []
        num_inputs = len(field_size)
        # One embedding matrix per field.
        for i in range(num_inputs):
            init_vars.append(('embed_%d' % i, [field_size[i],
                                               embed_size], 'xavier', dtype))
        # All unordered field pairs contribute one inner-product feature each.
        num_pairs = int(num_inputs * (num_inputs - 1) / 2)
        node_in = num_inputs * embed_size + num_pairs
        for i in range(len(layer_sizes)):
            init_vars.append(('w%d' % i, [node_in,
                                          layer_sizes[i]], 'xavier', dtype))
            init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
            node_in = layer_sizes[i]

        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
            self.y = tf.placeholder(dtype)
            # Dropout keep-probs: train keeps 1 - drop_out, test keeps everything.
            self.keep_prob_train = 1 - np.array(drop_out)
            self.keep_prob_test = np.ones_like(drop_out)
            self.layer_keeps = tf.placeholder(dtype)
            self.vars = utils.init_var_map(init_vars, init_path)
            w0 = [self.vars['embed_%d' % i]
                  for i in range(num_inputs)]  # per-field embedding matrices
            xw = tf.concat([
                tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
                for i in range(num_inputs)
            ], 1)  # [batch, num_inputs*k]
            xw3d = tf.reshape(
                xw, [-1, num_inputs, embed_size])  # [batch, num_inputs, k]

            # Enumerate index pairs (i, j) with i < j.
            row = []  # num_pairs
            col = []  # num_pairs
            for i in range(num_inputs - 1):
                for j in range(i + 1, num_inputs):
                    row.append(i)
                    col.append(j)

            # Gather the left member of every pair.
            p = tf.transpose(
                tf.gather(
                    tf.transpose(xw3d, [1, 0, 2]),  # [num_inputs, batch, k]
                    row),  # [num_pairs, batch, k]
                [1, 0, 2])  # [batch, num_pairs, k]

            # Gather the right member of every pair.
            q = tf.transpose(
                tf.gather(
                    tf.transpose(xw3d, [1, 0, 2]),  # [num_inputs, batch, k]
                    col),  # [num_pairs, batch, k]
                [1, 0, 2])  # [batch, num_pairs, k]

            p = tf.reshape(
                p, [-1, num_pairs, embed_size])  # [batch, num_pairs, k]
            q = tf.reshape(
                q, [-1, num_pairs, embed_size])  # [batch, num_pairs, k]

            # Inner product per pair, concatenated with the raw embeddings.
            ip = tf.reshape(tf.reduce_sum(p * q, [-1]), [-1, num_pairs])
            l = tf.concat([xw, ip], 1)  # [num_inputs*k + num_pairs]

            for i in range(len(layer_sizes)):
                w = self.vars['w%d' % i]
                b = self.vars['b%d' % i]
                l = utils.activate(tf.matmul(l, w) + b, layer_acts[i])
                l = tf.nn.dropout(l, self.layer_keeps[i])

            print('l', l)
            l = tf.squeeze(l)
            self.y_prob = tf.sigmoid(l)
            print('l', l)
            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=l,
                                                        labels=self.y))

            # Apply each regularizer only when its coefficient is provided.
            if embed_l2 is not None:
                self.loss += embed_l2 * tf.nn.l2_loss(xw)
            if layer_l2 is not None:
                for i in range(len(layer_sizes)):
                    self.loss += layer_l2 * tf.nn.l2_loss(self.vars['w%d' % i])

            self.optimizer = utils.get_optimizer(opt_algo, learning_rate,
                                                 self.loss)

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)
예제 #8
0
    def __init__(self, params):
        """Creates a Trainer.

        Wires up, in order: logging, distributed process group (when a job
        name is set), the data reader, the model (wrapped in SyncBatchNorm +
        DistributedDataParallel), optimizer and LR scheduler, optional
        checkpoint restore, and the optional training add-ons configured in
        ``params`` (Lipschitz regularization, EMA, adversarial training,
        noise injection, stability training).
        """
        utils.set_default_param_values_and_env_vars(params)
        self.params = params

        # Setup logging & log the version.
        utils.setup_logging(params.logging_verbosity)

        self.job_name = self.params.job_name  # "" for local training
        self.is_distributed = bool(self.job_name)
        self.task_index = self.params.task_index
        self.local_rank = self.params.local_rank
        self.start_new_model = self.params.start_new_model
        self.train_dir = self.params.train_dir
        self.num_gpus = self.params.num_gpus
        # Local multi-GPU run: scale the batch across devices.
        if self.num_gpus and not self.is_distributed:
            self.batch_size = self.params.batch_size * self.num_gpus
        else:
            self.batch_size = self.params.batch_size

        # print self.params parameters (only from the local rank-0 process)
        if self.start_new_model and self.local_rank == 0:
            pp = pprint.PrettyPrinter(indent=2, compact=True)
            logging.info(pp.pformat(params.values()))

        if self.local_rank == 0:
            logging.info("PyTorch version: {}.".format(torch.__version__))
            logging.info("NCCL Version {}".format(torch.cuda.nccl.version()))
            logging.info("Hostname: {}.".format(socket.gethostname()))

        if self.is_distributed:
            # Global rank = node index * GPUs-per-node + local GPU index.
            self.num_nodes = len(params.worker_hosts.split(';'))
            self.world_size = self.num_nodes * self.num_gpus
            self.rank = self.task_index * self.num_gpus + self.local_rank
            dist.init_process_group(backend='nccl',
                                    init_method='env://',
                                    timeout=datetime.timedelta(seconds=30))
            if self.local_rank == 0:
                logging.info('World Size={} => Total batch size {}'.format(
                    self.world_size, self.batch_size * self.world_size))
            self.is_master = bool(self.rank == 0)
        else:
            self.world_size = 1
            self.is_master = True

        # create a message builder for logging
        self.message = utils.MessageBuilder()

        # load reader and model
        self.reader = readers_config[self.params.dataset](self.params,
                                                          self.batch_size,
                                                          self.num_gpus,
                                                          is_training=True)

        # load model
        self.model = model_config.get_model_config(self.params.model,
                                                   self.params.dataset,
                                                   self.params,
                                                   self.reader.n_classes,
                                                   is_training=True)
        # add normalization as first layer of model
        if self.params.add_normalization:
            # In order to certify radii in original coordinates rather than standardized coordinates, we
            # add the noise _before_ standardizing, which is why we have standardization be the first
            # layer of the classifier rather than as a part of preprocessing as is typical.
            normalize_layer = self.reader.get_normalize_layer()
            self.model = torch.nn.Sequential(normalize_layer, self.model)

        # define DistributedDataParallel job: sync BN stats across ranks and
        # pin the model to this process's GPU.
        self.model = SyncBatchNorm.convert_sync_batchnorm(self.model)
        torch.cuda.set_device(params.local_rank)
        self.model = self.model.cuda()
        i = params.local_rank
        self.model = DistributedDataParallel(self.model,
                                             device_ids=[i],
                                             output_device=i)
        if self.local_rank == 0:
            logging.info('Model defined with DistributedDataParallel')

        # define set for saved ckpt (epoch 0 counts as already saved)
        self.saved_ckpts = set([0])

        # define optimizer
        self.optimizer = utils.get_optimizer(self.params.optimizer,
                                             self.params.optimizer_params,
                                             self.params.init_learning_rate,
                                             self.params.weight_decay,
                                             self.model.parameters())

        # define learning rate scheduler
        self.scheduler = utils.get_scheduler(self.optimizer,
                                             self.params.lr_scheduler,
                                             self.params.lr_scheduler_params)

        # if start_new_model is False, we restart training from a checkpoint
        if not self.start_new_model:
            if self.local_rank == 0:
                logging.info('Restarting training...')
            self._load_state()

        # define Lipschitz regularization module
        if self.params.lipschitz_regularization:
            if self.local_rank == 0:
                logging.info(
                    "Lipschitz regularization with decay {}, start after epoch {}"
                    .format(self.params.lipschitz_decay,
                            self.params.lipschitz_start_epoch))
            self.lipschitz = LipschitzRegularization(self.model, self.params,
                                                     self.reader,
                                                     self.local_rank)

        # exponential moving average of weights (optional)
        self.ema = None
        # NOTE(review): `getattr(..., False) > 0` enables EMA for any positive
        # `params.ema` value; confirm the intended type of that param.
        if getattr(self.params, 'ema', False) > 0:
            self.ema = utils.EMA(self.params.ema)

        # if adversarial training, create the attack class
        if self.params.adversarial_training:
            if self.local_rank == 0:
                logging.info('Adversarial Training')
            attack_params = self.params.adversarial_training_params
            # eps_iter == -1 is a sentinel: derive the per-step size from eps.
            if 'eps_iter' in attack_params.keys(
            ) and attack_params['eps_iter'] == -1:
                eps = attack_params['eps']
                n_iter = attack_params['nb_iter']
                attack_params['eps_iter'] = eps / n_iter * 2
                if self.local_rank == 0:
                    logging.info('Learning rate for attack: {}'.format(
                        attack_params['eps_iter']))
            self.attack = utils.get_attack(
                self.model, self.reader.n_classes,
                self.params.adversarial_training_name, attack_params)

        # init noise: adaptive and additive modes are mutually exclusive
        if self.params.adaptive_noise and self.params.additive_noise:
            raise ValueError(
                "Adaptive and Additive Noise should not be set together")
        if self.params.adaptive_noise:
            if self.local_rank == 0:
                logging.info('Training with Adaptive Noise: {} {}'.format(
                    self.params.noise_distribution, self.params.noise_scale))
        elif self.params.additive_noise:
            if self.local_rank == 0:
                logging.info('Training with Noise: {} {}'.format(
                    self.params.noise_distribution, self.params.noise_scale))
        if self.params.adaptive_noise or self.params.additive_noise:
            self.noise = utils.Noise(self.params)

        # stability training requires some perturbation source to be active
        if self.params.stability_training:
            if self.local_rank == 0:
                logging.info("Training with Stability Training: {}".format(
                    self.params.stability_training_lambda))
            if not any([
                    self.params.adversarial_training,
                    self.params.adaptive_noise, self.params.additive_noise
            ]):
                raise ValueError(
                    "Adversarial Training or Adaptive Noise should be activated"
                )
예제 #9
0
if __name__ == "__main__":
    # Script entry point: configure, build model/optimizer, run training loop.
    config = get_config()

    # Setting seed for reproducibility
    set_seed()

    # Get data
    train_dataset, val_dataset = get_datasets(config)

    # MODEL
    model = get_model(config['model'])
    # Define loss
    loss = CrossEntropy()
    # Define optimizer (custom API: it holds the model and loss and performs updates)
    optimizer = get_optimizer(config['train']['optimizer'], model, loss)

    # Main loop
    # NOTE(review): val_loss_hist and val_acc_hist are never appended to in
    # this excerpt — presumably filled by validation code not visible here.
    train_loss_hist, val_loss_hist, val_acc_hist = list(), list(), list()
    pbar = tqdm(range(config['train']['epochs']))
    lr_decay_config = config['train']['lr_decay']
    for i in pbar:
        # TRAINING: one optimizer step per dataset sample
        model.train()
        for j in range(len(train_dataset)):
            x, y = train_dataset[j]
            # step() appears to both update the model and return the loss
            train_loss = optimizer.step(x, y)
            if lr_decay_config['use']:
                optimizer.decay_learning_rate(i, lr_decay_config['decay_fraction'], lr_decay_config['decay_frequency'])
            # i + j/len(...) gives a fractional-epoch x-axis for plotting
            train_loss_hist.append((i + j / len(train_dataset), train_loss))
예제 #10
0
def main(_):
    """Supervised training entry point over a TF1 queue-runner pipeline.

    Loads labeled train/test splits plus unlabeled data, builds a training
    graph and a weight-sharing evaluation graph, then runs FLAGS.num_iters
    training steps, printing test loss/accuracy every FLAGS.eval_interval
    steps.  The `_` parameter absorbs the argv list that tf.app.run passes.
    """
    data_tr, labels_tr, data_te, labels_te, unlabeled = input_data.load_data(
        FLAGS.dataset_name, FLAGS.num_labeled)
    print("    train shapes:", data_tr.shape, labels_tr.shape)
    print("     test shapes:", data_te.shape, labels_te.shape)
    print("unlabeled shapes:", unlabeled.shape)

    # Shuffled batches for training; plain sequential batches for evaluation.
    data_tr_batch, labels_tr_batch = u.load_shuffle_batch(
        data_tr,
        labels_tr,
        batch_size=FLAGS.batch_size,
        capacity=FLAGS.batch_size * 100,
        min_after_dequeue=FLAGS.batch_size * 20)
    data_te_batch, labels_te_batch = u.load_batch(data_te, labels_te,
                                                  FLAGS.batch_size)
    # Unlabeled data gets dummy all-zero labels; the labels are discarded
    # here, and this function only ever uses the supervised losses.
    data_unlabeled_batch, _ = u.load_batch(unlabeled,
                                           np.zeros(unlabeled.shape[0]),
                                           FLAGS.batch_size)

    # Build the eval graph with the same weights via variable-scope reuse.
    with tf.variable_scope('model') as scope:
        model = models.get_model(FLAGS.model_name)
        logits_tr = model(data_tr_batch, is_training=True)
        scope.reuse_variables()
        logits_te = model(data_te_batch, is_training=False)

    loss_tr = u.get_supervised_loss(logits=logits_tr, labels=labels_tr_batch)
    loss_te = u.get_supervised_loss(logits=logits_te, labels=labels_te_batch)

    acc_tr = u.get_accuracy(logits_tr, labels_tr_batch)
    acc_te = u.get_accuracy(logits_te, labels_te_batch)

    # Global step variable feeds the optimizer's learning-rate decay schedule.
    step = tf.Variable(0, trainable=False, dtype=tf.int32)
    optimizer = u.get_optimizer(FLAGS.optimizer_type, FLAGS.learning_rate,
                                step, FLAGS.lr_decay_steps,
                                FLAGS.lr_decay_factor)
    train_op = u.get_train_op(optimizer, loss_tr, step)

    with tf.Session() as sess:

        def eval_test():
            # Average loss/accuracy over the test set; the final partial
            # batch is dropped by the integer division below.
            loss = 0.0
            acc = 0.0
            eval_iters = int(data_te.shape[0] / FLAGS.batch_size)
            for j in range(eval_iters):
                l, a = sess.run([loss_te, acc_te])
                loss += l
                acc += a
            loss /= eval_iters
            acc /= eval_iters
            return loss, acc

        # initialize the variables
        init_op = tf.global_variables_initializer()
        sess.run(init_op)

        # initialize the queue threads to start to shovel data
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)

        for i in tqdm(range(FLAGS.num_iters)):
            _, cur_loss_tr, cur_acc_tr = sess.run([train_op, loss_tr, acc_tr])

            # Periodic report: current train batch metrics + full test sweep.
            if i % FLAGS.eval_interval == 0:
                print('train loss: %.4f train acc: %.4f' %
                      (cur_loss_tr, cur_acc_tr))
                cur_loss_te, cur_acc_te = eval_test()
                print(' test loss: %.4f  test acc: %.4f' %
                      (cur_loss_te, cur_acc_te))

        # stop our queue threads and properly close the session
        coord.request_stop()
        coord.join(threads)
        sess.close()
예제 #11
0
파일: train.py 프로젝트: qingnengli/MMFI
def TFGAN(inputs,targets):
    """Train a pix2pix-style conditional GAN with TF-GAN.

    Builds a tfgan.gan_model from the fiber output (generator input) and the
    concatenated (label, fiber_input) real data, combines a pixel-wise loss
    with the adversarial loss, and runs tfgan.gan_train with joint G/D steps.
    The Generator is warm-started from a pre-trained CNN checkpoint.

    Args:
      inputs: (fiber_output, fiber_input) image tensor pair.
      targets: (encoder, label) tensor pair; only `label` is used here.
    """

    # Fresh train dir each run: any previous checkpoints/summaries are wiped.
    traindir = os.path.join(logdir, 'GG12\\PIX2PIX_MINMAX_1024')
    if tf.gfile.Exists(traindir):
      tf.gfile.DeleteRecursively(traindir)
    tf.gfile.MakeDirs(traindir)

    # Create a GANModel tuple.
    fiber_output, fiber_input = inputs
    encoder, label = targets
    # Real data = label and fiber_input stacked on the channel axis, so the
    # generator is trained to produce both halves jointly.
    real_data = tf.concat((label,fiber_input),-1)
    #######################################################################
    ##########################  GAN MODEL #################################
    #######################################################################
    gan_model = tfgan.gan_model(
        generator_fn=generator_fn,
        discriminator_fn=pix2pix_D,
        real_data=real_data,
        generator_inputs=fiber_output,
        generator_scope='Generator',
        discriminator_scope='Discriminator')

    #######################################################################
    ##########################  GAN SUMMARY ###############################
    #######################################################################
    with tf.name_scope('Train_summary'):
      # Undo the channel concat: split generated output back into the
      # (data, input) halves for separate image summaries.
      generated_data, generated_input = tf.split(gan_model.generated_data,2,-1)
      reshaped_fiber_input = get_summary_image(fiber_input, FLAGS.grid_size)
      reshaped_label = get_summary_image(label, FLAGS.grid_size)
      reshaped_generated_input = get_summary_image(generated_input, FLAGS.grid_size)
      reshaped_generated_data = get_summary_image(generated_data, FLAGS.grid_size)
      tf.summary.image('Input_Fiber', reshaped_fiber_input)
      tf.summary.image('Input_Generator', reshaped_generated_input)
      tf.summary.image('Data_Real', reshaped_label)
      tf.summary.image('Data_Generator', reshaped_generated_data)

    #######################################################################
    ##########################  GAN LOSS  #################################
    #######################################################################
    with tf.name_scope('pixel_loss'):
      pixel_loss = combine_loss(gan_model.generated_data,
                                gan_model.real_data,
                                add_summary=True)
    with tf.name_scope('gan_loss'):
      gan_loss = tfgan.gan_loss(
        gan_model,
        generator_loss_fn=tfgan.losses.modified_generator_loss,
        discriminator_loss_fn=tfgan.losses.modified_discriminator_loss,
        gradient_penalty_weight=1.0, # only in wassertein_loss
      )
      tfgan.eval.add_regularization_loss_summaries(gan_model)
    with tf.name_scope('Train_Loss'):
      # Blend adversarial + pixel losses; FLAGS.adversarial_loss_weight sets
      # the relative contribution of the adversarial term.
      gan_loss = tfgan.losses.combine_adversarial_loss(
          gan_loss, gan_model, pixel_loss,
          weight_factor=FLAGS.adversarial_loss_weight)

    #######################################################################
    ##########################   GAN OPS   ################################
    #######################################################################
    with tf.name_scope('Train_ops'):
      # Discriminator learns 5x faster than the generator here.
      gen_lr = get_lr(1e-5,decay_steps=5000)
      dis_lr = get_lr(5e-5,decay_steps=5000)
      train_ops = tfgan.gan_train_ops(
          gan_model,  gan_loss,
          generator_optimizer=get_optimizer(gen_lr),
          discriminator_optimizer=get_optimizer(dis_lr),
          # summarize_gradients=False,
          # colocate_gradients_with_ops=True,
          # transform_grads_fn=tf.contrib.training.clip_gradient_norms_fn(1e3),
          # aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
          )
      # Image-quality metrics; max_val=1.0 assumes images are scaled to
      # [0, 1] -- TODO confirm against the input pipeline.
      psnr = tf.reduce_mean(tf.image.psnr(generated_data, label, max_val = 1.0))
      ssim = tf.reduce_mean(tf.image.ssim(generated_data, label, max_val = 1.0))
      corr = correlation(generated_data, label)
      tf.summary.scalar('PSNR', psnr)
      tf.summary.scalar('SSIM', ssim)
      tf.summary.scalar('Relation', corr)
      tf.summary.scalar('generator_lr', gen_lr)
      # tf.summary.scalar('discriminator_lr', dis_lr)

    #######################################################################
    ##########################   GAN TRAIN   ##############################
    #######################################################################
    # One generator step per discriminator step, run jointly.
    train_steps = tfgan.GANTrainSteps(generator_train_steps=1, discriminator_train_steps=1)
    message = tf.string_join([' Train step: ', tf.as_string(tf.train.get_or_create_global_step()),
                              '   PSNR:', tf.as_string(psnr), '   SSIM:', tf.as_string(ssim),
                              '   Correlation:', tf.as_string(corr)
                              ], name='status_message')

    tfgan.gan_train(train_ops, logdir = traindir,  get_hooks_fn=tfgan.get_joint_train_hooks(train_steps),
                    hooks=[tf.train.StopAtStepHook(num_steps=FLAGS.max_iter),
                           tf.train.LoggingTensorHook([message], every_n_iter=FLAGS.log_n_steps),
                           get_tfgan_init_fn('E:\GitHub\MMFI\log\\GG12\\CNN', 'Generator'),
                           # get_tfgan_init_fn('E:\GitHub\MMFI\log\\G2\\pix2pix_D', 'Discriminator'),
                           ],
                    save_summaries_steps = FLAGS.save_summaries_steps*2,
                    save_checkpoint_secs = FLAGS.save_interval_secs)
예제 #12
0
파일: train.py 프로젝트: qingnengli/MMFI
def Generator_all(inputs,targets):
  """Jointly train the cascaded G1 -> G2 generator (no discriminator).

  G1 reconstructs the fiber input from the fiber output; G2 maps that
  reconstruction to the label image.  Both stages are masked by a circular
  aperture, trained against a combined per-stage loss plus regularization,
  and warm-started from separately pre-trained G1/G2 checkpoints.

  Args:
    inputs: (fiber_output, fiber_input) image tensor pair.
    targets: (encoder, label) tensor pair; only `label` is used here.
  """

  # Fresh train dir each run: any previous checkpoints/summaries are wiped.
  traindir = os.path.join(logdir, 'GG12\\CNN')
  if tf.gfile.Exists(traindir):
    tf.gfile.DeleteRecursively(traindir)
  tf.gfile.MakeDirs(traindir)

  fiber_output,fiber_input = inputs
  encoder, label = targets

  # Two pix2pix generators chained under one 'Generator' scope; each output
  # is masked by a circular aperture of the configured input size.
  with tf.variable_scope('Generator'):
    with tf.variable_scope('G1'):
      generated_input = pix2pix_G(fiber_output) * circle(FLAGS.input_size,FLAGS.input_size)
    with tf.variable_scope('G2'):
      generated_data = pix2pix_G(generated_input) * circle(FLAGS.input_size,FLAGS.input_size)

  with tf.name_scope('Train_summary'):
    reshaped_fiber_input = get_summary_image(fiber_input,FLAGS.grid_size)
    reshaped_label = get_summary_image(label,FLAGS.grid_size)
    reshaped_generated_input = get_summary_image(generated_input,FLAGS.grid_size)
    reshaped_generated_data = get_summary_image(generated_data,FLAGS.grid_size)
    tf.summary.image('Input_Fiber', reshaped_fiber_input)
    tf.summary.image('Input_Generator', reshaped_generated_input)
    tf.summary.image('Data_Real', reshaped_label)
    tf.summary.image('Data_Generator', reshaped_generated_data)

  # Stage losses: G1 against the true fiber input, G2 against the label.
  with tf.name_scope('g1_loss'):
    G1_loss = combine_loss(generated_input, fiber_input, add_summary=True)
  with tf.name_scope('g2_loss'):
    G2_loss = combine_loss(generated_data, label, add_summary=True)
  with tf.name_scope('Train_Loss'):
    reg_loss = tf.losses.get_regularization_loss()
    total_loss = G1_loss + G2_loss + reg_loss
    # Abort training immediately if the loss degenerates to inf/NaN.
    total_loss = tf.check_numerics(total_loss, 'Loss is inf or nan.')
    tf.summary.scalar('Regularization_loss',reg_loss)
    tf.summary.scalar('G1_loss', G1_loss)
    tf.summary.scalar('G2_loss', G2_loss)
    tf.summary.scalar('Total_loss',total_loss)

  lr = get_lr(1e-5,decay_steps=5000)
  optimizer = get_optimizer(lr)
  # Run UPDATE_OPS (e.g. batch-norm statistics) with each train step and
  # restrict training to the Generator variables.
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  train_op = slim.learning.create_train_op(total_loss, optimizer, update_ops =update_ops,
                                           variables_to_train=
                                           tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                                             scope='Generator')
                                           )

  with tf.name_scope('Train_ops'):
    # max_val=1.0 assumes images are scaled to [0, 1] -- TODO confirm.
    psnr = tf.reduce_mean(tf.image.psnr(generated_data, label, max_val=1.0))
    ssim = tf.reduce_mean(tf.image.ssim(generated_data, label, max_val=1.0))
    corr = correlation(generated_data, label)
    tf.summary.scalar('PSNR', psnr)
    tf.summary.scalar('SSIM', ssim)
    tf.summary.scalar('Relation', corr)
    tf.summary.scalar('Learning_rate', lr)

  # G1/G2 are initialized from their individually pre-trained checkpoints.
  slim.learning.train(train_op, traindir,
                      number_of_steps =FLAGS.max_iter,
                      log_every_n_steps=FLAGS.log_n_steps,
                      init_fn=get_multimodel_init_fn(ckpt1='E:\GitHub\MMFI\log\\G1\\pix2pix_G',
                                                     include1='Generator/G1',
                                                     ckpt2='E:\GitHub\MMFI\log\\G2\\pix2pix_G',
                                                     include2='Generator/G2'),
                      save_summaries_secs=FLAGS.save_summaries_secs,
                      save_interval_secs = FLAGS.save_interval_secs)
예제 #13
0
def main():
    """Build a pix2pix generator/discriminator pair and run training.

    Reads the run configuration, constructs G and D (optionally wrapped in
    DataParallel for multi-GPU), and in 'train' mode wires up datasets,
    losses, optimizers, schedulers and TensorBoard before delegating to
    train_process().  'test' mode is not implemented yet and raises
    NotImplementedError.

    Raises:
        NotImplementedError: when opt.mode == 'test' (inference is a TODO).
    """
    opt = get_model_config()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(opt)

    # Model setting
    logger.info('Build Model')

    # Generator maps a 3-channel image to a 3-channel image.
    generator = define_G(3, 3, opt.ngf).to(device)
    total_param = sum(p.numel() for p in generator.parameters())
    logger.info(f'Generator size: {total_param} tensors')

    # Discriminator sees input and output images stacked channel-wise (3+3).
    discriminator = define_D(3 + 3, opt.ndf, opt.disc).to(device)
    total_param = sum(p.numel() for p in discriminator.parameters())
    logger.info(f'Discriminator size: {total_param} tensors')

    if torch.cuda.device_count() > 1:
        logger.info(f"Let's use {torch.cuda.device_count()} GPUs!")
        generator = DataParallel(generator)
        discriminator = DataParallel(discriminator)

    if opt.mode == 'train':
        # Each run gets a timestamped experiment directory.
        dirname = datetime.now().strftime("%m%d%H%M") + f'_{opt.name}'
        log_dir = os.path.join('./experiments', dirname)
        os.makedirs(log_dir, exist_ok=True)
        logger.info(f'LOG DIR: {log_dir}')

        # Dataset setting
        logger.info('Set the dataset')
        image_size: Tuple[int, int] = (opt.image_h, opt.image_w)
        train_transform, val_transform = get_transforms(
            image_size,
            augment_type=opt.augment_type,
            image_norm=opt.image_norm)

        trainset = TrainDataset(image_dir=os.path.join(opt.data_dir, 'train'),
                                transform=train_transform)
        valset = TrainDataset(image_dir=os.path.join(opt.data_dir, 'val'),
                              transform=val_transform)

        train_loader = DataLoader(dataset=trainset,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.num_workers)
        val_loader = DataLoader(dataset=valset,
                                batch_size=opt.batch_size,
                                shuffle=False,
                                num_workers=opt.num_workers)

        # Loss setting: LSGAN adversarial loss + L1 reconstruction loss.
        criterion = {}
        criterion['gan'] = GANLoss(use_lsgan=True).to(device)
        criterion['l1'] = torch.nn.L1Loss().to(device)

        # Optimizer setting (same hyper-parameters for G and D).
        g_optimizer = get_optimizer(generator.parameters(), opt.optimizer,
                                    opt.lr, opt.weight_decay)
        d_optimizer = get_optimizer(discriminator.parameters(), opt.optimizer,
                                    opt.lr, opt.weight_decay)
        logger.info(
            f'Initial Learning rate(G): {g_optimizer.param_groups[0]["lr"]:.6f}'
        )
        logger.info(
            f'Initial Learning rate(D): {d_optimizer.param_groups[0]["lr"]:.6f}'
        )

        # Scheduler setting
        g_scheduler = get_scheduler(g_optimizer, opt.scheduler, opt)
        d_scheduler = get_scheduler(d_optimizer, opt.scheduler, opt)

        # Tensorboard setting
        writer = SummaryWriter(log_dir=log_dir)

        logger.info('Start to train!')
        train_process(opt,
                      generator,
                      discriminator,
                      criterion,
                      g_optimizer,
                      d_optimizer,
                      g_scheduler,
                      d_scheduler,
                      train_loader=train_loader,
                      val_loader=val_loader,
                      log_dir=log_dir,
                      writer=writer,
                      device=device)

    # TODO: write inference code
    elif opt.mode == 'test':
        # BUG FIX: this branch previously referenced `model`, `criterion`
        # and `test_loader`, none of which are defined in this function, so
        # it always crashed with a NameError.  Fail fast with an explicit
        # error until inference is actually implemented.
        raise NotImplementedError(
            'Inference mode is not implemented yet; run with mode="train".')
예제 #14
0
def do_train(sess, args):
  """Run the multi-GPU training loop, with optional per-epoch validation.

  Builds input placeholders, data-loading pipelines and the model graph on
  the CPU device (ops are moved to GPUs by the model itself), then trains
  for args.num_epochs epochs.  A checkpoint is saved after every epoch; if
  args.run_validation is set, loss/accuracy are measured on the validation
  split at the end of each epoch.

  Args:
    sess: an open tf.Session used for every run call.
    args: parsed argument namespace carrying all training hyper-parameters.
  """
  # set CPU as the default device for the graph. Some of the operations will be moved to GPU later.
  with tf.device('/cpu:0'):

    # Images and labels placeholders
    images_ph= tf.placeholder(tf.float32, shape=(None,)+ tuple(args.processed_size), name='input')
    labels_ph= tf.placeholder(tf.int32, shape=(None), name='label')

    # a placeholder for determining if we train or validate the network. This placeholder will be used to set dropout rates and batchnorm paramaters.
    is_training_ph= tf.placeholder(tf.bool, name='is_training')

    # epoch number and global step, persisted into checkpoints through the
    # SAVE_VARIABLES collection so training can resume where it stopped.
    epoch_number = tf.get_variable('epoch_number', [], dtype= tf.int32, initializer= tf.constant_initializer(0), trainable= False, collections=[tf.GraphKeys.GLOBAL_VARIABLES, SAVE_VARIABLES])
    global_step = tf.get_variable('global_step', [], dtype= tf.int32, initializer= tf.constant_initializer(0), trainable= False, collections=[tf.GraphKeys.GLOBAL_VARIABLES, SAVE_VARIABLES])

    # Weight Decay policy
    wd = utils.get_policy(args.WD_policy, args.WD_details)

    # Learning rate decay policy (if needed)
    lr = utils.get_policy(args.LR_policy, args.LR_details)

    # Create an optimizer that performs gradient descent.
    optimizer = utils.get_optimizer(args.optimizer, lr)

    # build the computational graph using the provided configuration.
    dnn_model= model(images_ph, labels_ph, utils.loss, optimizer, wd, args.architecture, args.depth, args.num_classes, is_training_ph, args.transfer_mode, num_gpus= args.num_gpus)

    # Create a pipeline to read data from disk
    # a placeholder for setting the input pipeline batch size. This is employed to ensure that we feed each validation example only once to the network.
    # Because we only use 1 GPU for validation, the validation batch size should not be more than 512.
    batch_size_tf= tf.placeholder_with_default(min(512, args.batch_size), shape=())

    # A data loader pipeline to read training images and their labels
    train_loader= loader(args.train_info, args.delimiter, args.raw_size, args.processed_size, True, args.chunked_batch_size, args.num_prefetch, args.num_threads, args.path_prefix, args.shuffle)
    # The loader returns images, their labels, and their paths
    images, labels, info = train_loader.load()

    # If validation data are provided, we create an input pipeline to load the validation data
    if args.run_validation:
      val_loader= loader(args.val_info, args.delimiter, args.raw_size, args.processed_size, False, batch_size_tf, args.num_prefetch, args.num_threads, args.path_prefix)
      val_images, val_labels, val_info = val_loader.load()

    # Get training operations to run from the deep learning model
    train_ops = dnn_model.train_ops()

    # Build an initialization operation to run below.
    init = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
    sess.run(init)

    if args.retrain_from is not None:
      dnn_model.load(sess, args.retrain_from)

    # Set the start epoch number
    start_epoch = sess.run(epoch_number + 1)

    # Start the queue runners.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Setup a summary writer
    summary_writer = tf.summary.FileWriter(args.log_dir, sess.graph)

    # PERF/BUG FIX: merge all summaries ONCE, outside the training loop.
    # The old code called tf.summary.merge_all() every 100th step, which
    # adds a brand-new merge op to the graph on each call, so the graph
    # (and per-step time) grew without bound over a long run.
    merged_summaries = tf.summary.merge_all()

    # The main training loop
    for epoch in range(start_epoch, start_epoch + args.num_epochs):

      # update epoch_number
      sess.run(epoch_number.assign(epoch))

      print("Epoch %d started"%(epoch))
      # Training batches
      for step in range(args.num_batches):
        sess.run(global_step.assign(step+epoch*args.num_batches))
        # train the network on a batch of data (It also measures time)
        start_time = time.time()

        # load a batch from input pipeline
        img,lbl = sess.run([images, labels], options= args.run_options, run_metadata= args.run_metadata)

        # train on the loaded batch of data
        _, loss_value, top1_accuracy, topn_accuracy = sess.run(train_ops, feed_dict = {images_ph: img, labels_ph: lbl, is_training_ph: True},
                options= args.run_options, run_metadata= args.run_metadata)
        duration = time.time() - start_time

        # Check for errors
        assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

        # Logging every ten batches and writing tensorboard summaries every hundred batches
        if step % 10 == 0:

          num_examples_per_step = args.chunked_batch_size * args.num_gpus
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = duration / args.num_gpus

          # Log
          format_str = ('%s: epoch %d, step %d, loss = %.2f, Top-1 = %.2f Top-'+str(args.top_n)+' = %.2f (%.1f examples/sec; %.3f sec/batch)')
          print(format_str % (datetime.now(), epoch, step, loss_value, top1_accuracy, topn_accuracy, examples_per_sec, sec_per_batch))
          sys.stdout.flush()

        if step % 100 == 0:
          summary_str = sess.run(merged_summaries, feed_dict={images_ph: img, labels_ph: lbl, is_training_ph: True})
          summary_writer.add_summary(summary_str, args.num_batches * epoch + step)
          if args.log_debug_info:
            # BUG FIX: `run_metadata` was an undefined bare name here
            # (NameError when log_debug_info is enabled); the metadata
            # collected by the session calls lives on `args`.
            summary_writer.add_run_metadata(args.run_metadata, 'epoch%d step%d' % (epoch, step))

      # Save the model checkpoint periodically after each training epoch
      checkpoint_path = os.path.join(args.log_dir, args.snapshot_prefix)
      dnn_model.save(sess, checkpoint_path, global_step= epoch)

      print("Epoch %d ended. a checkpoint saved at %s"%(epoch,args.log_dir))
      sys.stdout.flush()
      # if validation data are provided, evaluate accuracy on the validation set after the end of each epoch
      if args.run_validation:

        print("Evaluating on validation set")

        true_predictions_count = 0  # Counts the number of correct predictions
        true_topn_predictions_count = 0 # Counts the number of top-n correct predictions
        total_loss= 0.0 # measures cross entropy loss
        all_count = 0 # Count the total number of examples

        # The validation loop
        for step in range(args.num_val_batches):
          # Load a batch of data; the last batch is shrunk so each example
          # is fed exactly once.
          val_img,val_lbl = sess.run([val_images, val_labels], feed_dict={batch_size_tf: args.num_val_samples%min(512, args.batch_size)} if step== args.num_val_batches-1 else None, 
                  options= args.run_options, run_metadata= args.run_metadata)

          # validate the network on the loaded batch
          val_loss, top1_predictions, topn_predictions = sess.run([train_ops[1], train_ops[2], train_ops[3]], feed_dict={ images_ph: val_img, labels_ph: val_lbl, is_training_ph: False}, 
                  options= args.run_options, run_metadata= args.run_metadata)

          all_count += val_lbl.shape[0]
          true_predictions_count += int(round(val_lbl.shape[0]*top1_predictions))
          true_topn_predictions_count += int(round(val_lbl.shape[0]*topn_predictions))
          total_loss += val_loss*val_lbl.shape[0]
          if step%10==0:
            print("Validation step %d of %d"%(step, args.num_val_batches))
            sys.stdout.flush()

        print("Total number of validation examples %d, Loss %.2f, Top-1 Accuracy %.2f, Top-%d Accuracy %.2f" %
                (all_count, total_loss/all_count, true_predictions_count/all_count, args.top_n, true_topn_predictions_count/all_count))
        sys.stdout.flush()

    coord.request_stop()
    coord.join(threads)
    sess.close()
예제 #15
0
파일: models.py 프로젝트: ZM7/Tencent-Game
    def __init__(self,
                 field_sizes=None,
                 embed_size=10,
                 layer_sizes=None,
                 layer_acts=None,
                 drop_out=None,
                 embed_l2=None,
                 layer_l2=None,
                 init_path=None,
                 opt_algo='gd',
                 learning_rate=1e-2,
                 random_seed=None,
                 layer_norm=True):
        """Build a kernel-product network for binary (CTR-style) prediction.

        Each sparse field i is embedded via an 'embed_%d' matrix; every pair
        of field embeddings is scored through a trainable 3-D kernel tensor,
        and [all embeddings, pair scores] feed a fully connected stack with
        dropout.  Trained with sigmoid cross-entropy.

        Args:
            field_sizes: vocabulary size of each sparse input field.
            embed_size: embedding dimension k shared by all fields.
            layer_sizes: widths of the fully connected layers.
            layer_acts: activation name per layer (see utils.activate).
            drop_out: per-layer dropout rates; converted to keep probs below.
            embed_l2: L2 weight on the concatenated embeddings.
            layer_l2: per-layer L2 weights; NOTE the embed_l2 term is only
                added when layer_l2 is not None (see loss below).
            init_path: optional path to pre-trained variable values.
            opt_algo: optimizer name for utils.get_optimizer.
            learning_rate: optimizer learning rate.
            random_seed: graph-level TF seed, if given.
            layer_norm: accepted but currently unused -- the layer-norm code
                below is commented out.
        """
        Model.__init__(self)
        init_vars = []
        num_inputs = len(field_sizes)
        for i in range(num_inputs):
            init_vars.append(('embed_%d' % i, [field_sizes[i],
                                               embed_size], 'xavier', dtype))
        # One interaction score per unordered field pair.
        num_pairs = int(num_inputs * (num_inputs - 1) / 2)
        node_in = num_inputs * embed_size + num_pairs
        # Kernel tensor of shape [k, num_pairs, k] used for the pair scores.
        init_vars.append(('kernel', [embed_size, num_pairs,
                                     embed_size], 'xavier', dtype))
        for i in range(len(layer_sizes)):
            init_vars.append(('w%d' % i, [node_in,
                                          layer_sizes[i]], 'xavier', dtype))
            init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
            node_in = layer_sizes[i]
        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            # One sparse placeholder per field; y holds the binary labels.
            self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
            self.y = tf.placeholder(dtype)
            # Dropout rates become keep probabilities; at test time keep=1.
            self.keep_prob_train = 1 - np.array(drop_out)
            self.keep_prob_test = np.ones_like(drop_out)
            self.layer_keeps = tf.placeholder(dtype)
            self.vars = utils.init_var_map(init_vars, init_path)
            w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
            # Concatenate all field embeddings: [batch, num_inputs * k].
            xw = tf.concat([
                tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
                for i in range(num_inputs)
            ], 1)
            xw3d = tf.reshape(xw, [-1, num_inputs, embed_size])

            # Enumerate the (row, col) field indices of every pair.
            row = []
            col = []
            for i in range(num_inputs - 1):
                for j in range(i + 1, num_inputs):
                    row.append(i)
                    col.append(j)
            # batch * pair * k
            p = tf.transpose(
                # pair * batch * k
                tf.gather(
                    # num * batch * k
                    tf.transpose(xw3d, [1, 0, 2]),
                    row),
                [1, 0, 2])
            # batch * pair * k
            q = tf.transpose(tf.gather(tf.transpose(xw3d, [1, 0, 2]), col),
                             [1, 0, 2])
            # b * p * k
            p = tf.reshape(p, [-1, num_pairs, embed_size])
            # b * p * k
            q = tf.reshape(q, [-1, num_pairs, embed_size])
            # k * p * k
            k = self.vars['kernel']

            # batch * 1 * pair * k
            p = tf.expand_dims(p, 1)
            # Bilinear pair score: kp[b, pair] = p[b, pair] . K[:, pair, :] . q[b, pair]
            # batch * pair
            kp = tf.reduce_sum(
                # batch * pair * k
                tf.multiply(
                    # batch * pair * k
                    tf.transpose(
                        # batch * k * pair
                        tf.reduce_sum(
                            # batch * k * pair * k
                            tf.multiply(p, k),
                            -1),
                        [0, 2, 1]),
                    q),
                -1)

            #
            # if layer_norm:
            #     # x_mean, x_var = tf.nn.moments(xw, [1], keep_dims=True)
            #     # xw = (xw - x_mean) / tf.sqrt(x_var)
            #     # x_g = tf.Variable(tf.ones([num_inputs * embed_size]), name='x_g')
            #     # x_b = tf.Variable(tf.zeros([num_inputs * embed_size]), name='x_b')
            #     # x_g = tf.Print(x_g, [x_g[:10], x_b])
            #     # xw = xw * x_g + x_b
            #     p_mean, p_var = tf.nn.moments(op, [1], keep_dims=True)
            #     op = (op - p_mean) / tf.sqrt(p_var)
            #     p_g = tf.Variable(tf.ones([embed_size**2]), name='p_g')
            #     p_b = tf.Variable(tf.zeros([embed_size**2]), name='p_b')
            #     # p_g = tf.Print(p_g, [p_g[:10], p_b])
            #     op = op * p_g + p_b

            # MLP over [embeddings, pair scores] with per-layer dropout.
            l = tf.concat([xw, kp], 1)
            for i in range(len(layer_sizes)):
                wi = self.vars['w%d' % i]
                bi = self.vars['b%d' % i]
                l = tf.nn.dropout(
                    utils.activate(tf.matmul(l, wi) + bi, layer_acts[i]),
                    self.layer_keeps[i])

            l = tf.squeeze(l)
            self.y_prob = tf.sigmoid(l)

            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=l,
                                                        labels=self.y))
            # NOTE(review): the embedding L2 term is gated on layer_l2, not
            # embed_l2 -- looks intentional here but worth confirming.
            if layer_l2 is not None:
                self.loss += embed_l2 * tf.nn.l2_loss(xw)  #tf.concat(w0, 0))
                for i in range(len(layer_sizes)):
                    wi = self.vars['w%d' % i]
                    self.loss += layer_l2[i] * tf.nn.l2_loss(wi)
            self.optimizer = utils.get_optimizer(opt_algo, learning_rate,
                                                 self.loss)

            # Private graph + session; allow_growth avoids grabbing all GPU memory.
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)
예제 #16
0
파일: models.py 프로젝트: ZM7/Tencent-Game
    def __init__(self,
                 field_sizes=None,
                 embed_size=10,
                 filter_sizes=None,
                 layer_acts=None,
                 drop_out=None,
                 init_path=None,
                 opt_algo='gd',
                 learning_rate=1e-2,
                 random_seed=None):
        """Build a convolutional click-prediction model (CCPM-style).

        Field embeddings are arranged as a [k, num_fields] "image", passed
        through two conv layers (filters f1, f2) each followed by max
        pooling along the field axis, then through a single dense output
        unit.  Trained with sigmoid cross-entropy.

        Args:
            field_sizes: vocabulary size of each sparse input field.
            embed_size: embedding dimension k shared by all fields.
            filter_sizes: widths of the two convolution filters.
            layer_acts: activation names; only layer_acts[0] is used here.
            drop_out: dropout rates; only the first entry is used here.
            init_path: optional path to pre-trained variable values.
            opt_algo: optimizer name for utils.get_optimizer.
            learning_rate: optimizer learning rate.
            random_seed: graph-level TF seed, if given.
        """
        Model.__init__(self)
        init_vars = []
        num_inputs = len(field_sizes)
        for i in range(num_inputs):
            init_vars.append(('embed_%d' % i, [field_sizes[i],
                                               embed_size], 'xavier', dtype))
        # Conv filters: f1 maps 1 -> 2 channels, f2 maps 2 -> 2 channels.
        init_vars.append(('f1', [embed_size, filter_sizes[0], 1,
                                 2], 'xavier', dtype))
        init_vars.append(('f2', [embed_size, filter_sizes[1], 2,
                                 2], 'xavier', dtype))
        # Final dense layer consumes the flattened [k * 3 * 2] feature map.
        init_vars.append(('w1', [2 * 3 * embed_size, 1], 'xavier', dtype))
        init_vars.append(('b1', [1], 'zero', dtype))

        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            # One sparse placeholder per field; y holds the binary labels.
            self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
            self.y = tf.placeholder(dtype)
            # Dropout rates become keep probabilities; at test time keep=1.
            self.keep_prob_train = 1 - np.array(drop_out)
            self.keep_prob_test = np.ones_like(drop_out)
            self.layer_keeps = tf.placeholder(dtype)
            self.vars = utils.init_var_map(init_vars, init_path)
            w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
            # Concatenate all field embeddings: [batch, num_inputs * k].
            xw = tf.concat([
                tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
                for i in range(num_inputs)
            ], 1)
            l = xw

            # Reshape to [batch, k, num_inputs, 1] so conv slides over fields.
            l = tf.transpose(tf.reshape(l, [-1, num_inputs, embed_size, 1]),
                             [0, 2, 1, 3])
            f1 = self.vars['f1']
            l = tf.nn.conv2d(l, f1, [1, 1, 1, 1], 'SAME')
            # Max-pool the field axis down to num_inputs/2 positions.
            l = tf.transpose(
                utils.max_pool_4d(tf.transpose(l, [0, 1, 3, 2]),
                                  int(num_inputs / 2)), [0, 1, 3, 2])
            f2 = self.vars['f2']
            l = tf.nn.conv2d(l, f2, [1, 1, 1, 1], 'SAME')
            # Second pooling stage keeps the top 3 positions per channel.
            l = tf.transpose(
                utils.max_pool_4d(tf.transpose(l, [0, 1, 3, 2]), 3),
                [0, 1, 3, 2])
            # Flatten, activate, apply dropout, then the dense output unit.
            l = tf.nn.dropout(
                utils.activate(tf.reshape(l, [-1, embed_size * 3 * 2]),
                               layer_acts[0]), self.layer_keeps[0])
            w1 = self.vars['w1']
            b1 = self.vars['b1']
            l = tf.matmul(l, w1) + b1

            l = tf.squeeze(l)
            self.y_prob = tf.sigmoid(l)

            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=l,
                                                        labels=self.y))
            self.optimizer = utils.get_optimizer(opt_algo, learning_rate,
                                                 self.loss)

            # Private graph + session; allow_growth avoids grabbing all GPU memory.
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)
예제 #17
0
파일: models.py 프로젝트: zgcgreat/WSDM
    def __init__(self, field_sizes=None, embed_size=10, layer_sizes=None, layer_acts=None, drop_out=None,
                 embed_l2=None, layer_l2=None, init_path=None, opt_algo='gd', learning_rate=1e-2, random_seed=None):
        """Build an inner-product network (IPNN-style) for binary prediction.

        Each sparse field is embedded; every unordered pair of field
        embeddings is reduced to a single inner-product score; then
        [all embeddings, pair scores] feed a fully connected stack with
        dropout.  Trained with sigmoid cross-entropy plus optional L2.

        Args:
            field_sizes: vocabulary size of each sparse input field.
            embed_size: embedding dimension k shared by all fields.
            layer_sizes: widths of the fully connected layers.
            layer_acts: activation name per layer (see utils.activate).
            drop_out: per-layer dropout rates; converted to keep probs below.
            embed_l2: L2 weight on the concatenated embeddings.
            layer_l2: per-layer L2 weights; NOTE the embed_l2 term is only
                added when layer_l2 is not None (see loss below).
            init_path: optional path to pre-trained variable values.
            opt_algo: optimizer name for utils.get_optimizer.
            learning_rate: optimizer learning rate.
            random_seed: graph-level TF seed, if given.
        """
        Model.__init__(self)
        init_vars = []
        num_inputs = len(field_sizes)
        for i in range(num_inputs):
            init_vars.append(('embed_%d' % i, [field_sizes[i], embed_size], 'xavier', dtype))
        # One inner-product score per unordered field pair.
        num_pairs = int(num_inputs * (num_inputs - 1) / 2)
        node_in = num_inputs * embed_size + num_pairs
        # node_in = num_inputs * (embed_size + num_inputs)
        for i in range(len(layer_sizes)):
            init_vars.append(('w%d' % i, [node_in, layer_sizes[i]], 'xavier', dtype))
            init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
            node_in = layer_sizes[i]
        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            # One sparse placeholder per field; y holds the binary labels.
            self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
            self.y = tf.placeholder(dtype)
            # Dropout rates become keep probabilities; at test time keep=1.
            self.keep_prob_train = 1 - np.array(drop_out)
            self.keep_prob_test = np.ones_like(drop_out)
            self.layer_keeps = tf.placeholder(dtype)
            self.vars = utils.init_var_map(init_vars, init_path)
            w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
            # Concatenate all field embeddings: [batch, num_inputs * k].
            xw = tf.concat([tf.sparse_tensor_dense_matmul(self.X[i], w0[i]) for i in range(num_inputs)], 1)
            xw3d = tf.reshape(xw, [-1, num_inputs, embed_size])

            # Enumerate the (row, col) field indices of every pair.
            row = []
            col = []
            for i in range(num_inputs-1):
                for j in range(i+1, num_inputs):
                    row.append(i)
                    col.append(j)
            # batch * pair * k
            p = tf.transpose(
                # pair * batch * k
                tf.gather(
                    # num * batch * k
                    tf.transpose(
                        xw3d, [1, 0, 2]),
                    row),
                [1, 0, 2])
            # batch * pair * k
            q = tf.transpose(
                tf.gather(
                    tf.transpose(
                        xw3d, [1, 0, 2]),
                    col),
                [1, 0, 2])
            p = tf.reshape(p, [-1, num_pairs, embed_size])
            q = tf.reshape(q, [-1, num_pairs, embed_size])
            # Inner product of each pair's embeddings: [batch, num_pairs].
            ip = tf.reshape(tf.reduce_sum(p * q, [-1]), [-1, num_pairs])

            # simple but redundant
            # batch * n * 1 * k, batch * 1 * n * k
            # ip = tf.reshape(
            #     tf.reduce_sum(
            #         tf.expand_dims(xw3d, 2) *
            #         tf.expand_dims(xw3d, 1),
            #         3),
            #     [-1, num_inputs**2])
            l = tf.concat([xw, ip], 1)

            # MLP over [embeddings, pair scores] with per-layer dropout.
            for i in range(len(layer_sizes)):
                wi = self.vars['w%d' % i]
                bi = self.vars['b%d' % i]
                l = tf.nn.dropout(
                    utils.activate(
                        tf.matmul(l, wi) + bi,
                        layer_acts[i]),
                    self.layer_keeps[i])

            l = tf.squeeze(l)
            self.y_prob = tf.sigmoid(l)

            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=l, labels=self.y))
            # NOTE(review): the embedding L2 term is gated on layer_l2, not
            # embed_l2 -- consistent with the sibling models; confirm intent.
            if layer_l2 is not None:
                self.loss += embed_l2 * tf.nn.l2_loss(xw)
                for i in range(len(layer_sizes)):
                    wi = self.vars['w%d' % i]
                    self.loss += layer_l2[i] * tf.nn.l2_loss(wi)
            self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)

            # Private graph + session; allow_growth avoids grabbing all GPU memory.
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)
예제 #18
0
def main():
    """CLI entry point for imSitu VSRL training and evaluation.

    Builds the relation-network model, groups its parameters
    (CNN / verb / role), constructs train/dev data loaders, picks an
    optimizer configuration from the CLI flags, then either evaluates on
    the dev set (writing predictions to CSV files) or runs the training
    loop.
    """
    import argparse
    parser = argparse.ArgumentParser(
        description="imsitu VSRL. Training, evaluation and prediction.")
    parser.add_argument("--gpuid",
                        default=-1,
                        help="put GPU id > -1 in GPU mode",
                        type=int)
    parser.add_argument('--resume_training',
                        action='store_true',
                        help='Resume training from the model [resume_model]')
    parser.add_argument('--resume_model',
                        type=str,
                        default='',
                        help='The model we resume')
    parser.add_argument('--verb_module',
                        type=str,
                        default='',
                        help='pretrained verb module')
    parser.add_argument('--train_role',
                        action='store_true',
                        help='cnn fix, verb fix, role train from the scratch')
    parser.add_argument(
        '--finetune_verb',
        action='store_true',
        help='cnn fix, verb finetune, role train from the scratch')
    parser.add_argument(
        '--finetune_cnn',
        action='store_true',
        help='cnn finetune, verb finetune, role train from the scratch')
    parser.add_argument('--output_dir',
                        type=str,
                        default='./trained_models',
                        help='Location to output the model')
    parser.add_argument('--evaluate',
                        action='store_true',
                        help='Only use the testing mode')
    # TODO: train role module separately with gt verbs

    args = parser.parse_args()

    # Hyper-parameters.
    # NOTE(review): the DataLoaders below hard-code batch_size=32/8, so
    # this value is currently unused -- confirm which batch size is intended.
    batch_size = 640
    lr = 0.0001
    lr_max = 5e-4
    lr_gamma = 0.1
    lr_step = 25
    clip_norm = 50
    weight_decay = 1e-4
    n_epoch = 500
    n_worker = 3

    dataset_folder = 'imSitu'
    imgset_folder = 'resized_256'

    train_set = json.load(open(dataset_folder + "/train.json"))
    encoder = imsitu_encoder(train_set)

    model = model_vsrl_small_finetune.RelationNetworks(encoder, args.gpuid)

    # Group parameters so the optimizer can treat CNN / verb / role parts
    # differently depending on the selected training mode.
    cnn_features, verb_features, role_features = utils.group_features(model)

    train_set = imsitu_loader(imgset_folder, train_set, encoder,
                              model.train_preprocess())

    train_loader = torch.utils.data.DataLoader(train_set,
                                               batch_size=32,
                                               shuffle=True,
                                               num_workers=n_worker)

    dev_set = json.load(open(dataset_folder + "/dev.json"))
    dev_set = imsitu_loader(imgset_folder, dev_set, encoder,
                            model.train_preprocess())
    dev_loader = torch.utils.data.DataLoader(dev_set,
                                             batch_size=32,
                                             shuffle=True,
                                             num_workers=n_worker)

    # NOTE(review): "traindev" reloads dev.json rather than a separate
    # train-dev split -- confirm this is intended for this dataset layout.
    traindev_set = json.load(open(dataset_folder + "/dev.json"))
    traindev_set = imsitu_loader(imgset_folder, traindev_set, encoder,
                                 model.train_preprocess())
    traindev_loader = torch.utils.data.DataLoader(traindev_set,
                                                  batch_size=8,
                                                  shuffle=True,
                                                  num_workers=n_worker)

    # Freeze everything first; the selected mode re-enables what it needs.
    utils.set_trainable(model, False)
    if args.train_role:
        print('CNN fix, Verb fix, train role from the scratch from: {}'.format(
            args.verb_module))
        args.train_all = False
        if len(args.verb_module) == 0:
            raise Exception('[pretrained verb module] not specified')
        utils.load_net(args.verb_module, [model.conv, model.verb],
                       ['conv', 'verb'])
        optimizer_select = 1
        model_name = 'cfx_vfx_rtrain'

    elif args.finetune_verb:
        print('CNN fix, Verb finetune, train role from the scratch from: {}'.
              format(args.verb_module))
        args.train_all = True
        if len(args.verb_module) == 0:
            raise Exception('[pretrained verb module] not specified')
        utils.load_net(args.verb_module, [model.conv, model.verb],
                       ['conv', 'verb'])
        optimizer_select = 2
        model_name = 'cfx_vft_rtrain'

    elif args.finetune_cnn:
        print(
            'CNN finetune, Verb finetune, train role from the scratch from: {}'
            .format(args.verb_module))
        args.train_all = True
        if len(args.verb_module) == 0:
            raise Exception('[pretrained verb module] not specified')
        utils.load_net(args.verb_module, [model.conv, model.verb],
                       ['conv', 'verb'])
        optimizer_select = 3
        model_name = 'cft_vft_rtrain'

    elif args.resume_training:
        print('Resume training from: {}'.format(args.resume_model))
        args.train_all = True
        # Fixed: previously raised the verb-module message even though it is
        # the resume model that is missing here.
        if len(args.resume_model) == 0:
            raise Exception('[resume_model] not specified')
        utils.load_net(args.resume_model, [model])
        optimizer_select = 0
        model_name = 'resume_all'
    else:
        if not args.evaluate:
            print('Training from the scratch.')
        optimizer_select = 0
        args.train_all = True
        model_name = 'train_full'

    optimizer = utils.get_optimizer(lr, weight_decay, optimizer_select,
                                    cnn_features, verb_features, role_features)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    if args.gpuid >= 0:
        model.cuda()

    # Decay LR by lr_gamma every lr_step epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=lr_step,
                                                gamma=lr_gamma)
    # TODO: gradient clipping, grad check

    if args.evaluate:
        top1, top5, val_loss = eval(model,
                                    dev_loader,
                                    encoder,
                                    args.gpuid,
                                    write_to_file=True)

        top1_avg = top1.get_average_results()
        top5_avg = top5.get_average_results()

        # NOTE(review): six terms are summed but divided by 8 -- confirm
        # against the intended imSitu averaging convention.
        avg_score = top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"] + top5_avg["verb"] + \
                    top5_avg["value"] + top5_avg["value-all"]
        avg_score /= 8

        print('Dev average :{:.2f} {} {}'.format(
            avg_score * 100, utils.format_dict(top1_avg, '{:.2f}', '1-'),
            utils.format_dict(top5_avg, '{:.2f}', '5-')))

        # Write ground-truth / predicted situations and per-verb stats to CSV.
        gt_labels = top1.gt_situation
        pred_labels = top1.predicted_situation
        verb_pred = top1.verb_pred

        with open("gt_rn_only.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerows(gt_labels)

        with open("pred_rn_only.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerows(pred_labels)

        with open("verbpred_rn_only.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerow(['verb', 'total', 'predicted'])
            for key, value in verb_pred.items():
                writer.writerow([key, value[0], value[1]])

        print('Writing predictions to file completed !')

    else:

        print('Model training started!')
        train(model, train_loader, dev_loader, traindev_loader, optimizer,
              scheduler, n_epoch, args.output_dir, encoder, args.gpuid,
              clip_norm, lr_max, model_name, args)
예제 #19
0
파일: models.py 프로젝트: ZM7/Tencent-Game
    def __init__(self,
                 field_sizes=None,
                 embed_size=10,
                 layer_sizes=None,
                 layer_acts=None,
                 drop_out=None,
                 embed_l2=None,
                 layer_l2=None,
                 init_path=None,
                 opt_algo='gd',
                 learning_rate=1e-2,
                 random_seed=None):
        """Build a TF1 graph: per-field embeddings -> MLP, plus a
        self-attention pooling over the intermediate hidden-layer outputs
        whose summary is concatenated with the last hidden layer before
        the final projection.

        Args:
            field_sizes: vocabulary size per sparse input field.
            embed_size: embedding width per field.
            layer_sizes: hidden widths; layer_sizes[-1] is the output width.
            layer_acts: activation name per layer (see utils.activate).
            drop_out: per-layer dropout rates; converted to keep-probs.
            embed_l2, layer_l2: optional L2 penalty coefficients.
            init_path: optional pre-trained values for utils.init_var_map.
            opt_algo, learning_rate: passed to utils.get_optimizer.
            random_seed: graph-level TF seed when not None.
        """
        Model.__init__(self)
        init_vars = []
        num_inputs = len(field_sizes)
        # One embedding matrix per input field.
        for i in range(num_inputs):
            init_vars.append(('embed_%d' % i, [field_sizes[i],
                                               embed_size], 'xavier', dtype))
        node_in = num_inputs * embed_size
        for i in range(len(layer_sizes) - 1):
            init_vars.append(('w%d' % i, [node_in,
                                          layer_sizes[i]], 'xavier', dtype))
            init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero', dtype))
            node_in = layer_sizes[i]
        # Final projection consumes [last hidden ; attention summary G],
        # hence the 2 * node_in input width.
        init_vars.append(('w_final', [2 * node_in,
                                      layer_sizes[-1]], 'xavier', dtype))
        init_vars.append(('b_final', [layer_sizes[-1]], 'zero', dtype))
        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            # Each field arrives as its own sparse tensor; fields are embedded
            # separately and the embeddings concatenated.
            # (original note: 38 fields over one large ~6086-dim sparse matrix)
            self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
            self.y = tf.placeholder(dtype)
            self.keep_prob_train = 1 - np.array(drop_out)
            self.keep_prob_test = np.ones_like(drop_out)
            self.layer_keeps = tf.placeholder(dtype)
            self.vars = utils.init_var_map(init_vars, init_path)
            w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
            xw = tf.concat([
                tf.sparse_tensor_dense_matmul(self.X[i], w0[i])
                for i in range(num_inputs)
            ], 1)
            l = xw
            la = []
            # Hidden layers; every intermediate output is collected in `la`
            # for the attention block below.
            for i in range(len(layer_sizes) - 2):
                wi = self.vars['w%d' % i]
                bi = self.vars['b%d' % i]
                print(l.shape, wi.shape, bi.shape)
                l = tf.nn.dropout(
                    utils.activate(tf.matmul(l, wi) + bi, layer_acts[i]),
                    self.layer_keeps[i])
                la.append(l)
            # Last hidden layer (index len(layer_sizes) - 2) applied separately.
            l_final = tf.nn.dropout(
                utils.activate(
                    tf.matmul(l, self.vars['w%d' % (len(layer_sizes) - 2)]) +
                    self.vars['b%d' % (len(layer_sizes) - 2)],
                    layer_acts[len(layer_sizes) - 2]),
                self.layer_keeps[len(layer_sizes) - 2])
            # NOTE(review): concat along axis 0 stacks layer outputs as
            # [n_layers * batch, width]; the reshape below assumes every
            # collected layer has width layer_sizes[0] -- confirm.
            la_new = tf.concat([x for x in la], 0)
            H = tf.reshape(la_new,
                           [-1, len(layer_sizes) - 2, layer_sizes[0]
                            ])  # shape = [batch_size,3,128]
            H_T = tf.transpose(H, [0, 2, 1])  # shape=[batch_size,128,3]
            S_0 = tf.matmul(H, H_T)
            # Zero the diagonal of the similarity matrix (no self-attention).
            mask = [x for x in range(len(layer_sizes) - 2)]
            mask_zero = tf.ones([len(layer_sizes) - 2,
                                 len(layer_sizes) - 2]) - tf.one_hot(
                                     mask,
                                     len(layer_sizes) - 2)
            S = tf.multiply(S_0, mask_zero)
            print(S.shape)
            A = tf.nn.softmax(S, name='attention')  # shape = batch_size *3*3
            G = tf.reduce_sum(tf.matmul(A, H), 1)  # shape = batch_size * 128
            print(G.shape)
            M = tf.concat([l_final, G], 1)
            w_final = self.vars['w_final']
            b_final = self.vars['b_final']
            l_final = tf.matmul(M, w_final) + b_final
            l_final = tf.squeeze(l_final)
            # NOTE(review): y_prob is the raw logit here (no sigmoid), unlike
            # the sibling models in this file -- confirm downstream expects
            # logits rather than probabilities.
            self.y_prob = l_final

            # reduce_sum (not mean): the loss scales with batch size.
            self.loss = tf.reduce_sum(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=l_final,
                                                        labels=self.y))
            if layer_l2 is not None:
                self.loss += embed_l2 * tf.nn.l2_loss(xw)
                for i in range(len(layer_sizes) - 1):
                    wi = self.vars['w%d' % i]
                    self.loss += layer_l2[i] * tf.nn.l2_loss(wi)
            self.optimizer = utils.get_optimizer(opt_algo, learning_rate,
                                                 self.loss)

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)
            self.saver = tf.train.Saver(max_to_keep=3)
예제 #20
0
파일: PNN.py 프로젝트: fanqieCJL/tfctr
    def __init__(self,
                 train_loader,
                 test_loader,
                 embed_size=10,
                 product_way='in',
                 layer_size=None,
                 layer_act=None,
                 layer_keeps=None,
                 opt_algo='gd',
                 learning_rate=0.01,
                 epoch=10,
                 early_stop_round=None,
                 l2=None,
                 random_seed=None):
        """Build a Product-based Neural Network (PNN) for CTR prediction.

        Per-field embeddings are concatenated with pairwise field products
        (inner products when product_way == 'in', kernel-weighted outer
        products otherwise) and fed through an MLP ending in a sigmoid.
        Field sizes come from config.FIELD_SIZES.
        """
        self.graph = tf.Graph()

        self.train_loader = train_loader
        self.test_loader = test_loader
        self.embed_size = embed_size
        self.product_way = product_way
        self.layer_size = layer_size
        self.layer_act = layer_act
        self.layer_keeps = layer_keeps

        self.num_fields = len(config.FIELD_SIZES)
        # One embedding matrix per field.
        self.var_list = []
        for idx in range(self.num_fields):
            self.var_list.append([
                'embed_{}'.format(idx),
                [config.FIELD_SIZES[idx], self.embed_size], 'xavier'
            ])

        # Number of unordered field pairs (i < j).
        num_pairs = int(self.num_fields * (self.num_fields - 1) / 2)
        if self.product_way == 'out':
            # Kernel tensor for the outer-product variant only.
            self.var_list.append([
                'kernel', [self.embed_size, num_pairs, self.embed_size],
                'xavier'
            ])

        # MLP input = concatenated embeddings + one scalar per field pair.
        in_size = self.num_fields * self.embed_size + num_pairs
        for idx in range(len(layer_size)):
            self.var_list.append(
                ['w_{}'.format(idx), [in_size, layer_size[idx]], 'xavier'])
            self.var_list.append(
                ['b_{}'.format(idx), [layer_size[idx]], 'zero'])
            in_size = layer_size[idx]

        self.var_dict = utils.get_var(self.var_list)

        self.opt_algo = opt_algo
        self.learning_rate = learning_rate
        self.epoch = epoch
        self.early_stop_round = early_stop_round
        self.l2 = l2
        self.random_seed = random_seed

        # Per-run history filled in by the training loop.
        self.time_scores = []
        self.train_scores = []
        self.test_scores = []

        # NOTE(review): the graph scope below is commented out, so all ops
        # are created in TF's default graph and self.graph stays empty.
        #         with self.graph.as_default():
        if self.random_seed is not None:
            tf.set_random_seed(self.random_seed)
        self.X = [
            tf.sparse_placeholder(config.DTYPE) for n in range(self.num_fields)
        ]
        self.y = tf.placeholder(config.DTYPE)

        with tf.variable_scope('Embedding_Layer'):
            w_embed = [
                self.var_dict['embed_{}'.format(idx)]
                for idx in range(self.num_fields)
            ]
            # Embed each sparse field and concatenate: [batch, fields * k].
            xw = tf.concat([
                tf.sparse_tensor_dense_matmul(self.X[idx], w_embed[idx])
                for idx in range(self.num_fields)
            ], 1)
            layer_out = xw
            # Same embeddings viewed as [batch, fields, k] for the products.
            xw3d = tf.reshape(xw, [-1, self.num_fields, self.embed_size])

        with tf.variable_scope('Product_Layer'):
            # Enumerate all field pairs (i, j) with i < j.
            row = []
            col = []
            for i in range(self.num_fields - 1):
                for j in range(i + 1, self.num_fields):
                    row.append(i)
                    col.append(j)
            # Gather the left (p) and right (q) embedding of every pair.
            p = tf.transpose(tf.gather(tf.transpose(xw3d, [1, 0, 2]), row),
                             [1, 0, 2])

            q = tf.transpose(tf.gather(tf.transpose(xw3d, [1, 0, 2]), col),
                             [1, 0, 2])

            p = tf.reshape(p, [-1, num_pairs, self.embed_size])
            q = tf.reshape(q, [-1, num_pairs, self.embed_size])
            if self.product_way == 'in':
                # Inner product per pair: [batch, num_pairs].
                product = tf.reshape(tf.reduce_sum(p * q, [-1]),
                                     [-1, num_pairs])
            else:
                # Kernel-weighted outer product per pair.
                k = self.var_dict['kernel']
                p = tf.expand_dims(p, 1)
                product = tf.reduce_sum(
                    tf.multiply(
                        tf.transpose(tf.reduce_sum(tf.multiply(p, k), -1),
                                     [0, 2, 1]), q), -1)

            layer_out = tf.concat([layer_out, product], 1)

        for idx in range(len(layer_size)):
            with tf.variable_scope('Hiden_Layer_{}'.format(idx)):
                wi = self.var_dict['w_{}'.format(idx)]
                bi = self.var_dict['b_{}'.format(idx)]
                layer_out = tf.nn.dropout(
                    utils.activate(
                        tf.matmul(layer_out, wi) + bi, self.layer_act[idx]),
                    self.layer_keeps[idx])

        layer_out = tf.squeeze(layer_out)
        self.y_preds = tf.sigmoid(layer_out)

        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y,
                                                    logits=layer_out))
        # Optional L2 on embeddings and MLP weights (single shared weight).
        if self.l2 is not None:
            for idx in range(self.num_fields):
                self.loss += self.l2 * tf.nn.l2_loss(
                    self.var_dict['embed_{}'.format(idx)])
            for idx in range(len(self.layer_size)):
                self.loss += self.l2 * tf.nn.l2_loss(
                    self.var_dict['w_{}'.format(idx)])

        self.optimizer = utils.get_optimizer(self.opt_algo, self.learning_rate,
                                             self.loss)

        self.sess = tf.Session()
        tf.global_variables_initializer().run(session=self.sess)
예제 #21
0
파일: train.py 프로젝트: ChoiDM/PIPnet
# Option
# NOTE(review): USE_NSML, DATASET_PATH, nsml and the helper functions below
# are defined/imported earlier in this file -- not visible here.
DATAROOT = '/home/taey16/storage' if not USE_NSML else os.path.join(
    DATASET_PATH[1], 'taey16', 'storage')
opt = parse_option(DATAROOT, USE_NSML=USE_NSML, print_option=False)

# Data Loader
dataset_trn, dataset_val = get_dataloader(opt)

# Loading model
net = create_model(opt)

# Loss Function: heatmap loss and offset loss.
hm_criterion, off_criterion = get_loss_function(opt)

# Optimizer with cosine-annealed LR over the post-warmup epochs.
optimizer = get_optimizer(net, opt)
scheduler = CosineAnnealingLR(optimizer,
                              eta_min=opt.lr * opt.eta_min_ratio,
                              T_max=(opt.max_epoch - opt.lr_warmup_epoch))

# Initial Best Score: iteration counter, best NME (lower is better), epoch.
global_iter, best_nme, best_epoch = [0, 10000, 0]

#NOTE: main loop for training
if __name__ == "__main__":

    if USE_NSML:
        # Bind model/optimizer state to the NSML experiment platform.
        scope = locals()
        nsml_bind_model(nsml, scope, 0, net, optimizer)
    else:
        scope = None
예제 #22
0
def main():
    """Train and evaluate a classifier (Nexperia datasets supported).

    Flow: build loaders/model/criterion/optimizer/scheduler, optionally
    resume from a checkpoint, train for args.epochs epochs with per-epoch
    val/test evaluation and best-checkpoint saving (AUC for
    'nexperia_split', prec@1 otherwise), then dump soft labels, weight
    trajectories, and per-epoch timeline metrics as .npy files under
    args.save_dir/{train,val,test}.
    """
    ## dynamically adjust hyper-parameters for ResNets according to base_width
    if args.base_width != 64 and 'sat' in args.loss:
        factor = 64. / args.base_width
        args.sat_alpha = args.sat_alpha**(1. / factor)
        args.sat_es = int(args.sat_es * factor)
        print("Adaptive parameters adjustment: alpha = {:.3f}, Es = {:d}".format(args.sat_alpha, args.sat_es))

    print(args)
    global best_prec1, best_auc

    # Check the save_dir exists or not
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
        os.makedirs(os.path.join(args.save_dir, 'train'))
        os.makedirs(os.path.join(args.save_dir, 'val'))
        os.makedirs(os.path.join(args.save_dir, 'test'))

    # prepare dataset ('nexperia' has no separate val/test loaders)
    if args.dataset == 'nexperia':
        train_loader, num_classes, targets = get_loader(args)
    else:
        train_loader, val_loaders, test_loader, num_classes, targets = get_loader(args)
    
    model = get_model(args, num_classes, base_width=args.base_width)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    model.cuda()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            if args.dataset=='nexperia_split':
                best_auc = checkpoint['best_auc']
            else:
                best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    torch.cuda.manual_seed(args.seed)
    cudnn.benchmark = True

    criterion = get_loss(args, labels=targets, num_classes=num_classes)
    optimizer = get_optimizer(model, args)
    scheduler = get_scheduler(optimizer, args)
    
    # Per-epoch metric recorders for each split.
    train_timeline = Timeline()
    val_timeline = Timeline()
    test_timeline = Timeline()

    if args.evaluate:
        validate(test_loader, model)
        return

    print("*" * 40)
    start = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        scheduler.step(epoch)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, train_timeline, args.sat_es, args.dataset, args.mod)
        print("*" * 40)
        
        if args.dataset!='nexperia':
            # evaluate on validation sets
            prec1 = 0
            if args.dataset=='nexperia_split':
                print('val:')
                val_auc = validate(val_loaders, model, epoch, val_timeline, args.dataset, 'val', criterion)
                print("*" * 40)
                
                print('test:')
                test_auc = validate(test_loader, model, epoch, test_timeline, args.dataset, 'test', criterion)
            else:
                for name, val_loader in zip(args.val_sets, val_loaders):
                    print(name +":", end="\t")
                    prec1 = validate(val_loader, model)
            print("*" * 40)
            
            if args.dataset=='nexperia_split':
                # remember best auc and save checkpoint
                is_best = val_auc > best_auc
                best_auc = max(val_auc, best_auc)
                if args.save_freq > 0 and (epoch + 1) % args.save_freq == 0:
                    filename = 'checkpoint_{}.tar'.format(epoch + 1)
                else:
                    filename = None
                save_checkpoint(args.save_dir, {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_auc': best_auc,
                }, is_best, filename=filename)

            else:
                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                best_prec1 = max(prec1, best_prec1)
                if args.save_freq > 0 and (epoch + 1) % args.save_freq == 0:
                    filename = 'checkpoint_{}.tar'.format(epoch + 1)
                else:
                    filename = None
                save_checkpoint(args.save_dir, {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'best_prec1': best_prec1,
                }, is_best, filename=filename)
                
        # Record per-epoch label weights for the tracked (true/clean) samples.
        if hasattr(criterion, 'outputs'):
            criterion.weights[epoch] = criterion.outputs[criterion.true_labels.index]
            criterion.clean_weights[epoch] = criterion.outputs[criterion.clean_labels.index]
        else:
            criterion.weights[epoch] = criterion.soft_labels[criterion.true_labels.index]
            criterion.clean_weights[epoch] = criterion.soft_labels[criterion.clean_labels.index]
                            
    if args.dataset!='nexperia':
        # evaludate latest checkpoint
        print("Test acc of latest checkpoint:", end='\t')
        validate(test_loader, model, epoch, test_timeline, args.dataset, last=True)
        print("*" * 40)
        
        # evaluate best checkpoint
        if args.dataset=='nexperia_split':
            checkpoint = torch.load(os.path.join(args.save_dir, 'checkpoint_best.tar'))
            print("Best validation auc ({}th epoch): {:.2f}%".format(checkpoint['epoch'], best_auc*100.))
            model.load_state_dict(checkpoint['state_dict'])
            print("Test acc of best checkpoint:", end='\t')
            validate(test_loader, model, checkpoint['epoch'], test_timeline, args.dataset, last=True)
            print("*" * 40)
        else:
            if len(val_loaders) > 0:
                checkpoint = torch.load(os.path.join(args.save_dir, 'checkpoint_best.tar'))
                print("Best validation acc ({}th epoch): {:.2f}%".format(checkpoint['epoch'], best_prec1))
                model.load_state_dict(checkpoint['state_dict'])
                print("Test acc of best checkpoint:", end='\t')
                validate(test_loader, model, last=True)
                print("*" * 40)

    time_elapsed = time.time() - start
    print('It takes {:.0f}m {:.0f}s to train.'.format(time_elapsed // 60, time_elapsed % 60))
    
    # save best result
    filename = 'train_results.tar'
    save_checkpoint(args.save_dir, {
        'num_epochs': args.epochs,
        'state_dict': model.state_dict(),
    }, is_best=True, filename=filename)

    # save soft label
    if hasattr(criterion, 'soft_labels'):
        out_fname = os.path.join(args.save_dir, 'updated_soft_labels.npy')
        np.save(out_fname, criterion.soft_labels.cpu().numpy())
        print("Updated soft labels is saved to {}".format(out_fname))
        
    # save weights change of 106 images
    if hasattr(criterion, 'weights'):
        out_fname = os.path.join(args.save_dir, 'weights_change.npy')
        np.save(out_fname, criterion.weights.cpu().numpy())
        print("weights change is saved to {}".format(out_fname))
        
    if hasattr(criterion, 'clean_weights'):
        out_fname = os.path.join(args.save_dir, 'clean_weights_change.npy')
        np.save(out_fname, criterion.clean_weights.cpu().numpy())
        print("clean weights change is saved to {}".format(out_fname))

    # save timelines: concatenate per-epoch per-class tensors before dumping.
    train_acc_class = torch.cat(train_timeline.acc_class, dim=0)
    train_loss_class = torch.cat(train_timeline.loss_class, dim=0)
    train_acc_bi_class = torch.cat(train_timeline.acc_bi_class, dim=0)
    train_loss_bi_class = torch.cat(train_timeline.loss_bi_class, dim=0)
    train_me_class = torch.cat(train_timeline.me_class, dim=0)
    train_me_bi_class = torch.cat(train_timeline.me_bi_class, dim=0)
    
    val_acc_class = torch.cat(val_timeline.acc_class, dim=0)
    val_loss_class = torch.cat(val_timeline.loss_class, dim=0)
    val_acc_bi_class = torch.cat(val_timeline.acc_bi_class, dim=0)
    val_loss_bi_class = torch.cat(val_timeline.loss_bi_class, dim=0)
    val_me_class = torch.cat(val_timeline.me_class, dim=0)
    val_me_bi_class = torch.cat(val_timeline.me_bi_class, dim=0)
    
    test_acc_class = torch.cat(test_timeline.acc_class, dim=0)
    test_loss_class = torch.cat(test_timeline.loss_class, dim=0)
    test_acc_bi_class = torch.cat(test_timeline.acc_bi_class, dim=0)
    test_loss_bi_class = torch.cat(test_timeline.loss_bi_class, dim=0)
    test_me_class = torch.cat(test_timeline.me_class, dim=0)
    test_me_bi_class = torch.cat(test_timeline.me_bi_class, dim=0)
    
    np.save(os.path.join(args.save_dir, 'train', 'loss.npy'), train_timeline.loss)
    np.save(os.path.join(args.save_dir, 'train', 'acc.npy'), train_timeline.acc)
    np.save(os.path.join(args.save_dir, 'train', 'loss_bi.npy'), train_timeline.loss_bi)
    np.save(os.path.join(args.save_dir, 'train', 'acc_bi.npy'), train_timeline.acc_bi)
    np.save(os.path.join(args.save_dir, 'train', 'loss_class.npy'), train_loss_class)
    np.save(os.path.join(args.save_dir, 'train', 'acc_class.npy'), train_acc_class)
    np.save(os.path.join(args.save_dir, 'train', 'loss_bi_class.npy'), train_loss_bi_class)
    np.save(os.path.join(args.save_dir, 'train', 'acc_bi_class.npy'), train_acc_bi_class)
    np.save(os.path.join(args.save_dir, 'train', 'margin_error.npy'), train_timeline.margin_error)
    np.save(os.path.join(args.save_dir, 'train', 'margin_error_bi.npy'), train_timeline.margin_error_bi)
    np.save(os.path.join(args.save_dir, 'train', 'margin_error_class.npy'), train_me_class)
    np.save(os.path.join(args.save_dir, 'train', 'margin_error_bi_class.npy'), train_me_bi_class)
    np.save(os.path.join(args.save_dir, 'train', 'auc.npy'), train_timeline.auc)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_991.npy'), train_timeline.fpr_991)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_993.npy'), train_timeline.fpr_993)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_995.npy'), train_timeline.fpr_995)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_997.npy'), train_timeline.fpr_997)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_999.npy'), train_timeline.fpr_999)
    np.save(os.path.join(args.save_dir, 'train', 'fpr_1.npy'), train_timeline.fpr_1)
    print("other training details are saved to {}".format(os.path.join(args.save_dir, 'train')))

    np.save(os.path.join(args.save_dir, 'val', 'loss.npy'), val_timeline.loss)
    np.save(os.path.join(args.save_dir, 'val', 'acc.npy'), val_timeline.acc)
    np.save(os.path.join(args.save_dir, 'val', 'loss_bi.npy'), val_timeline.loss_bi)
    np.save(os.path.join(args.save_dir, 'val', 'acc_bi.npy'), val_timeline.acc_bi)
    np.save(os.path.join(args.save_dir, 'val', 'loss_class.npy'), val_loss_class)
    np.save(os.path.join(args.save_dir, 'val', 'acc_class.npy'), val_acc_class)
    np.save(os.path.join(args.save_dir, 'val', 'loss_bi_class.npy'), val_loss_bi_class)
    np.save(os.path.join(args.save_dir, 'val', 'acc_bi_class.npy'), val_acc_bi_class)
    # NOTE(review): 'margin_error.npy' is written from margin_error_bi here
    # (train above uses .margin_error) -- likely a copy-paste slip; confirm
    # the Timeline's margin_error is populated for val before changing.
    np.save(os.path.join(args.save_dir, 'val', 'margin_error.npy'), val_timeline.margin_error_bi)
    np.save(os.path.join(args.save_dir, 'val', 'margin_error_bi.npy'), val_timeline.margin_error_bi)
    np.save(os.path.join(args.save_dir, 'val', 'margin_error_class.npy'), val_me_class)
    np.save(os.path.join(args.save_dir, 'val', 'margin_error_bi_class.npy'), val_me_bi_class)
    np.save(os.path.join(args.save_dir, 'val', 'auc.npy'), val_timeline.auc)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_991.npy'), val_timeline.fpr_991)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_993.npy'), val_timeline.fpr_993)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_995.npy'), val_timeline.fpr_995)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_997.npy'), val_timeline.fpr_997)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_999.npy'), val_timeline.fpr_999)
    np.save(os.path.join(args.save_dir, 'val', 'fpr_1.npy'), val_timeline.fpr_1)
    print("other validating details are saved to {}".format(os.path.join(args.save_dir, 'val')))

    np.save(os.path.join(args.save_dir, 'test', 'loss.npy'), test_timeline.loss)
    np.save(os.path.join(args.save_dir, 'test', 'acc.npy'), test_timeline.acc)
    np.save(os.path.join(args.save_dir, 'test', 'loss_bi.npy'), test_timeline.loss_bi)
    np.save(os.path.join(args.save_dir, 'test', 'acc_bi.npy'), test_timeline.acc_bi)
    np.save(os.path.join(args.save_dir, 'test', 'loss_class.npy'), test_loss_class)
    np.save(os.path.join(args.save_dir, 'test', 'acc_class.npy'), test_acc_class)
    np.save(os.path.join(args.save_dir, 'test', 'loss_bi_class.npy'), test_loss_bi_class)
    np.save(os.path.join(args.save_dir, 'test', 'acc_bi_class.npy'), test_acc_bi_class)
    # NOTE(review): same margin_error_bi-under-margin_error.npy pattern as
    # the val block above -- confirm intent.
    np.save(os.path.join(args.save_dir, 'test', 'margin_error.npy'), test_timeline.margin_error_bi)
    np.save(os.path.join(args.save_dir, 'test', 'margin_error_bi.npy'), test_timeline.margin_error_bi)
    np.save(os.path.join(args.save_dir, 'test', 'margin_error_class.npy'), test_me_class)
    np.save(os.path.join(args.save_dir, 'test', 'margin_error_bi_class.npy'), test_me_bi_class)
    np.save(os.path.join(args.save_dir, 'test', 'auc.npy'), test_timeline.auc)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_991.npy'), test_timeline.fpr_991)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_993.npy'), test_timeline.fpr_993)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_995.npy'), test_timeline.fpr_995)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_997.npy'), test_timeline.fpr_997)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_999.npy'), test_timeline.fpr_999)
    np.save(os.path.join(args.save_dir, 'test', 'fpr_1.npy'), test_timeline.fpr_1)
    print("other testing details are saved to {}".format(os.path.join(args.save_dir, 'test')))
예제 #23
0
            correct = test(model, test_dataloader)

        state_keeper.update(time, epoch, loss_dict, correct)

        save_path = "pretrained/{prefix}.{time}.pth".format(prefix=args.prefix,
                                                            time=time)
        torch.save(model.state_dict(), f=save_path)
        print("Current model has been saved under {}.".format(save_path))


if __name__ == "__main__":
    # Repeat the experiment args.times times; in transfer-learning mode each
    # run is followed by an auxiliary fine-tuning phase tracked separately.
    state_keeper = utils.StateKeeper(args)
    is_transfer = args.exname == "TransferLearning"
    if is_transfer:
        state_keeper_aux = utils.StateKeeper(args, state_keeper_name="aux")

    for run_idx in range(args.times):
        # Fresh model and optimizer per repetition.
        model = utils.get_model(args)
        optimizer = utils.get_optimizer(args.optim, args.lr, model)
        forward_epoch(model, train_dataloader, test_dataloader, optimizer,
                      state_keeper, run_idx, args.epochs)
        if is_transfer:
            optimizer_aux = utils.get_optimizer(args.optim, args.lr_aux,
                                                model)
            forward_epoch(model, train_dataloader_aux, test_dataloader_aux,
                          optimizer_aux, state_keeper_aux, run_idx,
                          args.epochs_aux)

    state_keeper.save()
    if is_transfer:
        state_keeper_aux.save()
    print("Done!")
예제 #24
0
def train(args):
    """Train a semantic-segmentation network (UNet or TriangleNet).

    Sets up file + console logging, builds train/val loaders from
    ``args.dataroot``, instantiates the network and criterion, then runs
    the epoch loop with periodic loss logging, validation (mPA / mIOU),
    and checkpointing into the logging directory.

    Note: uses legacy PyTorch idioms (``Variable``, ``loss.data[0]``) —
    presumably written for PyTorch < 0.4; TODO confirm target version.
    """
    # set logger
    logging_dir = args.output_dir if args.output_dir else 'train-{}'.format(utils.get_datetime_string())
    # NOTE(review): os.mkdir raises if the directory already exists.
    os.mkdir('{}'.format(logging_dir))
    logging.basicConfig(
        level=logging.INFO,
        filename='{}/log.txt'.format(logging_dir),
        format='%(asctime)s %(message)s',
        filemode='w'
    )

    # Mirror log records to the console in addition to the log file.
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    logging.info('=========== Taks {} started! ==========='.format(args.output_dir))
    # Dump every CLI argument for reproducibility.
    for arg in vars(args):
        logging.info('{}: {}'.format(arg, getattr(args, arg)))
    logging.info('========================================')

    # initialize loader
    # multi_scale > 0 means the net emits one output per layer (TriangleNet);
    # UNet is single-scale so the flag is 0.
    multi_scale = len(args.layers) if args.network != 'unet' else 0
    train_set = utils.SegmentationImageFolder(os.sep.join([args.dataroot, 'train']),
                                              image_folder=args.img_dir,
                                              segmentation_folder=args.seg_dir,
                                              labels=args.color_labels,
                                              image_size=(args.image_width, args.image_height),
                                              random_horizontal_flip=args.random_horizontal_flip,
                                              random_rotation=args.random_rotation,
                                              random_crop=args.random_crop,
                                              random_square_crop=args.random_square_crop,
                                              label_regr=args.regression,
                                              multi_scale=multi_scale)
    # Validation set: no augmentation beyond the deterministic square crop.
    val_set = utils.SegmentationImageFolder(os.sep.join([args.dataroot, 'val']),
                                            image_folder=args.img_dir,
                                            segmentation_folder=args.seg_dir,
                                            labels=args.color_labels,
                                            image_size=(args.image_width, args.image_height),
                                            random_square_crop=args.random_square_crop,
                                            label_regr=args.regression)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=args.batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=args.val_batch_size)

    # initialize model, input channels need to be calculated by hand
    n_classes = len(args.color_labels)

    if args.network == 'unet':
        network = networks.UNet
        criterion = nn.MSELoss() if args.regression else utils.CrossEntropyLoss2D()
    elif args.network == 'triangle':
        network = networks.TriangleNet
        # Weighted multi-scale loss: 0.15 on the fused output, the rest
        # split evenly across the per-scale outputs.
        criterion = utils.MSCrossEntropyLoss2D([0.15]+[0.85/float(multi_scale)]*multi_scale)
    else:
        # NOTE(review): unknown network name falls through — `network` and
        # `criterion` stay undefined and the code below raises NameError.
        pass
    val_criterion = utils.CrossEntropyLoss2D()

    # Regression predicts a single channel; classification one per class.
    if args.regression:
        model = network(args.layers, 3, 1, groups=args.groups)
    else:
        model = network(args.layers, 3, n_classes, groups=args.groups)
    if not args.cpu:
        model.cuda()

    # train
    iterations = 0
    for epoch in range(args.epochs):
        model.train()
        # update lr according to lr policy
        # NOTE(review): a fresh optimizer is created at each policy epoch,
        # which also discards any momentum state — confirm intended.
        if epoch in args.lr_policy:
            lr = args.lr_policy[epoch]
            optimizer = utils.get_optimizer(args.optimizer, model.parameters(),
                                            lr=lr, momentum=args.momentum, nesterov=args.nesterov)
            if epoch > 0:
                logging.info('| Learning Rate | Epoch: {: >3d} | Change learning rate to {}'.format(epoch+1, lr))
            else:
                logging.info('| Learning Rate | Initial learning rate: {}'.format(lr))

        # iterate all samples
        losses = utils.AverageMeter()
        for i_batch, (img, seg) in enumerate(train_loader):

            # Multi-scale targets arrive as a list (one tensor per scale).
            img = Variable(img)
            seg = Variable(seg) if not multi_scale else [Variable(x) for x in seg]

            if not args.cpu:
                img = img.cuda()
                seg = seg.cuda() if not multi_scale else [x.cuda() for x in seg]

            # compute output
            output = model(img)
            loss = criterion(output, seg)
            # Legacy tensor-scalar access (PyTorch < 0.4 style).
            losses.update(loss.data[0])

            # compute gradient and do SGD step
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # logging training curve
            if iterations % args.print_interval == 0:
                logging.info(
                    '| Iterations: {: >6d} '
                    '| Epoch: {: >3d}/{: >3d} '
                    '| Batch: {: >4d}/{: >4d} '
                    '| Training loss: {:.6f}'.format(
                        iterations, 
                        epoch+1, args.epochs,
                        i_batch, len(train_loader)-1,
                        losses.avg
                    )
                )
                # Reset the meter so each log line reports a fresh window.
                losses = utils.AverageMeter()

            # validation on all val samples
            if iterations % args.validation_interval == 0:
                model.eval()
                val_losses = utils.AverageMeter()
                # Per-class pixel tallies used for mPA / mIOU below.
                gt_pixel_count = [0] * n_classes
                pred_pixel_count = [0] * n_classes
                intersection_pixel_count = [0] * n_classes
                union_pixel_count = [0] * n_classes

                for img, seg in val_loader:

                    img = Variable(img)
                    seg = Variable(seg)

                    if not args.cpu:
                        img = img.cuda()
                        seg = seg.cuda()

                    # compute output
                    output = model(img)
                    loss = val_criterion(output, seg)
                    # Weight by batch-size fraction so partial last batches
                    # contribute proportionally to the average.
                    val_losses.update(loss.data[0], float(img.size(0))/float(args.batch_size))
                    output_numpy = output.data.numpy() if args.cpu else output.data.cpu().numpy()
                    pred_labels = numpy.argmax(output_numpy, axis=1)
                    gt_labels = seg.data.numpy() if args.cpu else seg.data.cpu().numpy()

                    pred_labels = pred_labels.flatten()
                    gt_labels = gt_labels.flatten()

                    for i in range(n_classes):
                        pred_pixel_count[i] += (pred_labels == i).sum()
                        gt_pixel_count[i] += (gt_labels == i).sum()
                        # Sentinel trick: mark class-i pixels 0 in otherwise
                        # unequal arrays so equality gives the intersection.
                        # NOTE(review): numpy.int is removed in NumPy >= 1.24.
                        gt_dumb = numpy.full(gt_labels.shape, -1, dtype=numpy.int)
                        pred_dumb = numpy.full(pred_labels.shape, -2, dtype=numpy.int)
                        gt_dumb[gt_labels == i] = 0
                        pred_dumb[pred_labels == i] = 0
                        intersection_pixel_count[i] += (gt_dumb == pred_dumb).sum()
                        # After also zeroing gt positions, zeros = union.
                        pred_dumb[gt_labels == i] = 0
                        union_pixel_count[i] += (pred_dumb == 0).sum()

                # calculate mPA & mIOU
                # NOTE(review): divides by per-class counts — a class absent
                # from the whole val set raises ZeroDivisionError.
                mPA = 0
                mIOU = 0
                for i in range(n_classes):
                    mPA += float(intersection_pixel_count[i]) / float(gt_pixel_count[i])
                    mIOU += float(intersection_pixel_count[i]) / float(union_pixel_count[i])
                mPA /= float(n_classes)
                mIOU /= float(n_classes)

                logging.info(
                    '| Iterations: {: >6d} '
                    '| Epoch: {: >3d}/{: >3d} '
                    '| Average mPA: {:.4f} '
                    '| Average mIOU: {:.4f} '
                    '| Validation loss: {:.6f} '.format(
                        iterations, 
                        epoch+1, args.epochs,
                        mPA,
                        mIOU,
                        val_losses.avg
                    )
                )

                # Back to training mode after the eval pass.
                model.train()

            if iterations % args.checkpoint_interval == 0 and iterations > 0:
                model_weights_path = '{}/iterations-{:0>6d}-epoch-{:0>3d}.pth'.format(logging_dir, iterations, epoch+1)
                torch.save(model.state_dict(), model_weights_path)
                logging.info('| Checkpoint | {} is saved!'.format(model_weights_path))

            iterations += 1
예제 #25
0
def run(proc_id, n_gpus, args, devices, dataset):
    """Per-process training driver for a sampling-based GCMC-style model.

    Runs minibatch edge training on `dataset.train_enc_graph`; process 0
    additionally owns validation/testing, early stopping, and LR decay.

    Args:
        proc_id: rank of this process (0 does all evaluation/printing).
        n_gpus:  number of GPUs; >1 enables NCCL DistributedDataParallel.
        args:    hyper-parameter namespace (train_lr, minibatch_size, ...).
        devices: list of device ids, indexed by proc_id.
        dataset: data wrapper exposing the enc/dec graphs and labels.

    Fixes vs. original: `logging_str` and `best_test_rmse` are now
    initialized before use (previously NameError on the first epoch /
    on the final print when validation never improved).
    """
    dev_id = devices[proc_id]
    train_labels = dataset.train_labels
    train_truths = dataset.train_truths
    num_edges = train_truths.shape[0]

    # Map each rating etype to its reverse etype and vice versa.
    reverse_types = {
        to_etype_name(k): 'rev-' + to_etype_name(k)
        for k in dataset.possible_rating_values
    }
    reverse_types.update({v: k for k, v in reverse_types.items()})
    # Single-layer full-neighbor sampler; eids are needed to fetch labels.
    sampler = dgl.dataloading.MultiLayerNeighborSampler([None],
                                                        return_eids=True)
    dataloader = dgl.dataloading.EdgeDataLoader(dataset.train_enc_graph, {
        to_etype_name(k): th.arange(
            dataset.train_enc_graph.number_of_edges(etype=to_etype_name(k)))
        for k in dataset.possible_rating_values
    },
                                                sampler,
                                                batch_size=args.minibatch_size,
                                                shuffle=True,
                                                drop_last=False)

    # Only rank 0 evaluates, so only it builds the eval dataloaders.
    if proc_id == 0:
        valid_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.valid_dec_graph,
            th.arange(dataset.valid_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.valid_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)
        test_dataloader = dgl.dataloading.EdgeDataLoader(
            dataset.test_dec_graph,
            th.arange(dataset.test_dec_graph.number_of_edges()),
            sampler,
            g_sampling=dataset.test_enc_graph,
            batch_size=args.minibatch_size,
            shuffle=False,
            drop_last=False)

    if n_gpus > 1:
        dist_init_method = 'tcp://{master_ip}:{master_port}'.format(
            master_ip='127.0.0.1', master_port='12345')
        world_size = n_gpus
        th.distributed.init_process_group(backend="nccl",
                                          init_method=dist_init_method,
                                          world_size=world_size,
                                          rank=dev_id)
    if n_gpus > 0:
        th.cuda.set_device(dev_id)

    # Rating values as a tensor, used to turn class probabilities into an
    # expected rating (soft prediction) below.
    nd_possible_rating_values = \
        th.FloatTensor(dataset.possible_rating_values)
    nd_possible_rating_values = nd_possible_rating_values.to(dev_id)

    net = Net(args=args, dev_id=dev_id)
    net = net.to(dev_id)
    if n_gpus > 1:
        net = DistributedDataParallel(net,
                                      device_ids=[dev_id],
                                      output_device=dev_id)
    rating_loss_net = nn.CrossEntropyLoss()
    learning_rate = args.train_lr
    optimizer = get_optimizer(args.train_optimizer)(net.parameters(),
                                                    lr=learning_rate)
    print("Loading network finished ...\n")

    ### declare the loss information
    best_valid_rmse = np.inf
    # FIX: initialize so the final summary print below cannot NameError
    # when validation never improves.
    best_test_rmse = np.inf
    no_better_valid = 0
    best_epoch = -1
    count_rmse = 0
    count_num = 0
    count_loss = 0
    print("Start training ...")
    dur = []
    iter_idx = 1

    for epoch in range(1, args.train_max_epoch):
        if epoch > 1:
            t0 = time.time()
        net.train()
        with tqdm.tqdm(dataloader) as tq:
            for step, (input_nodes, pair_graph, blocks) in enumerate(tq):
                head_feat, tail_feat, blocks = load_subtensor(
                    input_nodes, pair_graph, blocks, dataset,
                    dataset.train_enc_graph)
                frontier = blocks[0]
                compact_g = flatten_etypes(pair_graph, dataset,
                                           'train').to(dev_id)
                true_relation_labels = compact_g.edata['label']
                true_relation_ratings = compact_g.edata['rating']

                head_feat = head_feat.to(dev_id)
                tail_feat = tail_feat.to(dev_id)
                frontier = frontier.to(dev_id)

                pred_ratings = net(compact_g, frontier, head_feat, tail_feat,
                                   dataset.possible_rating_values)
                loss = rating_loss_net(pred_ratings,
                                       true_relation_labels.to(dev_id)).mean()
                count_loss += loss.item()
                optimizer.zero_grad()
                loss.backward()
                nn.utils.clip_grad_norm_(net.parameters(),
                                         args.train_grad_clip)
                optimizer.step()

                if proc_id == 0 and iter_idx == 1:
                    print("Total #Param of net: %d" %
                          (torch_total_param_num(net)))

                # Expected rating = sum_k p(k) * rating_value(k).
                real_pred_ratings = (
                    th.softmax(pred_ratings, dim=1) *
                    nd_possible_rating_values.view(1, -1)).sum(dim=1)
                rmse = ((real_pred_ratings -
                         true_relation_ratings.to(dev_id))**2).sum()
                count_rmse += rmse.item()
                count_num += pred_ratings.shape[0]

                tq.set_postfix(
                    {
                        'loss': '{:.4f}'.format(count_loss / iter_idx),
                        'rmse': '{:.4f}'.format(count_rmse / count_num)
                    },
                    refresh=False)

                iter_idx += 1

        if epoch > 1:
            epoch_time = time.time() - t0
            print("Epoch {} time {}".format(epoch, epoch_time))

        # FIX: base log line for this epoch; validation branches below
        # append to it. Initialized unconditionally so the print() at the
        # bottom of the loop never hits a NameError (previously
        # `logging_str` was only ever `+=`-ed, crashing on epoch 1).
        logging_str = "Epoch={}, loss={:.4f}, rmse={:.4f}".format(
            epoch, count_loss / iter_idx, count_rmse / max(count_num, 1))

        if epoch % args.train_valid_interval == 0:
            if n_gpus > 1:
                th.distributed.barrier()
            if proc_id == 0:
                valid_rmse = evaluate(args=args,
                                      dev_id=dev_id,
                                      net=net,
                                      dataset=dataset,
                                      dataloader=valid_dataloader,
                                      segment='valid')
                logging_str += ',\tVal RMSE={:.4f}'.format(valid_rmse)

                if valid_rmse < best_valid_rmse:
                    best_valid_rmse = valid_rmse
                    no_better_valid = 0
                    best_epoch = epoch
                    # Only evaluate on test when validation improves.
                    test_rmse = evaluate(args=args,
                                         dev_id=dev_id,
                                         net=net,
                                         dataset=dataset,
                                         dataloader=test_dataloader,
                                         segment='test')
                    best_test_rmse = test_rmse
                    logging_str += ', Test RMSE={:.4f}'.format(test_rmse)
                else:
                    no_better_valid += 1
                    # NOTE(review): this `break` only exits rank 0's loop;
                    # other ranks are not notified — confirm for n_gpus > 1.
                    if no_better_valid > args.train_early_stopping_patience\
                        and learning_rate <= args.train_min_lr:
                        logging.info(
                            "Early stopping threshold reached. Stop training.")
                        break
                    if no_better_valid > args.train_decay_patience:
                        new_lr = max(
                            learning_rate * args.train_lr_decay_factor,
                            args.train_min_lr)
                        if new_lr < learning_rate:
                            logging.info("\tChange the LR to %g" % new_lr)
                            learning_rate = new_lr
                            for p in optimizer.param_groups:
                                p['lr'] = learning_rate
                            no_better_valid = 0
                            print("Change the LR to %g" % new_lr)
            # sync on evalution
            if n_gpus > 1:
                th.distributed.barrier()

        print(logging_str)
    if proc_id == 0:
        print(
            'Best epoch Idx={}, Best Valid RMSE={:.4f}, Best Test RMSE={:.4f}'.
            format(best_epoch, best_valid_rmse, best_test_rmse))
예제 #26
0
            # Log all train files
            shutil.copyfile('experiment.ini',
                            experiment_dir + 'experiment.ini')
            shutil.copyfile('train.py', experiment_dir + 'train.py')
            shutil.copyfile('flow_utils.py', experiment_dir + 'flow_utils.py')

            # Log serialized prior
            if prior_type != 'normal':
                shutil.copyfile('centers.mat', experiment_dir + 'centers.mat')
                utils.log(means, experiment_dir + 'means.pkl')
                utils.log(covariances, experiment_dir + 'covariances.pkl')
                utils.log(weights, experiment_dir + 'weights.pkl')

        # Create optimizer
        scheduler = utils.get_scheduler(lr, lr_schedule)
        opt_init, opt_update, get_params = utils.get_optimizer(
            optimizer, scheduler, b1, b2)
        opt_state = opt_init(params)
        update_fn = private_update if private else update

        best_test_params, best_test_loss = None, None
        pbar = tqdm(pbar_range)
        for iteration in pbar:
            batch, X = utils.get_batch(sampling, key, X, minibatch_size,
                                       iteration)

            # Possible with Poisson subsampling
            if batch.shape[0] == 0:
                continue

            # Perform model update
            temp_key, key = random.split(key)
예제 #27
0
def main():
    """Train/evaluate an image classifier with checkpointing and logging.

    Parses CLI args, optionally resumes from a checkpoint (restoring
    architecture-related args), builds model/criterion/optimizer/Trainer,
    then either runs a one-shot evaluation (``--evaluate``) or the full
    epoch loop with tsv score logging, tensorboard logging, best-model
    checkpointing, and patience-based early stopping.

    Fixes vs. original: corrected user-facing string typos — tsv header
    last column ``val_err`` -> ``val_err5`` (the row format writes
    val_err5 there), ``Best var_err1`` -> ``Best val_err1``, and
    ``tesnorboard_logger`` -> ``tensorboard_logger``.
    """
    # parse arg and start experiment
    global args
    best_err1 = 100.
    best_epoch = 0

    args = arg_parser.parse_args()
    args.config_of_data = config.datasets[args.data]
    args.num_classes = config.datasets[args.data]['num_classes']
    # `configure` is None when tensorboard_logger failed to import.
    if configure is None:
        args.tensorboard = False
        print(Fore.RED +
              'WARNING: you don\'t have tensorboard_logger installed' +
              Fore.RESET)

    # optionally resume from a checkpoint
    if args.resume:
        if args.resume and os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_args = checkpoint['args']
            print('Old args:')
            print(old_args)
            # set args based on checkpoint
            if args.start_epoch <= 0:
                args.start_epoch = checkpoint['epoch'] + 1
            best_epoch = args.start_epoch - 1
            best_err1 = checkpoint['best_err1']
            # Architecture args must match the checkpoint, so copy them over.
            for name in arch_resume_names:
                if name in vars(args) and name in vars(old_args):
                    setattr(args, name, getattr(old_args, name))
            model = getModel(**vars(args))
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(Fore.RED +
                                                          args.resume +
                                                          Fore.RESET),
                  file=sys.stderr)
            return
    else:
        # create model
        print("=> creating model '{}'".format(args.arch))
        model = getModel(**vars(args))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    # define optimizer
    optimizer = get_optimizer(model, args)

    # Trainer class is loaded dynamically from the module named by --trainer.
    Trainer = import_module(args.trainer).Trainer
    trainer = Trainer(model, criterion, optimizer, args)

    # create dataloader
    # NOTE(review): splits=('train') is the string 'train', not a tuple —
    # presumably getDataloaders accepts either; confirm.
    if args.evaluate == 'train':
        train_loader, _, _ = getDataloaders(splits=('train'), **vars(args))
        trainer.test(train_loader, best_epoch)
        return
    elif args.evaluate == 'val':
        _, val_loader, _ = getDataloaders(splits=('val'), **vars(args))
        trainer.test(val_loader, best_epoch)
        return
    elif args.evaluate == 'test':
        _, _, test_loader = getDataloaders(splits=('test'), **vars(args))
        trainer.test(test_loader, best_epoch)
        return
    else:
        train_loader, val_loader, _ = getDataloaders(splits=('train', 'val'),
                                                     **vars(args))

    # check if the folder exists
    create_save_folder(args.save, args.force)

    # set up logging
    global log_print, f_log
    f_log = open(os.path.join(args.save, 'log.txt'), 'w')

    def log_print(*args):
        # Echo to stdout and to the log file.
        print(*args)
        print(*args, file=f_log)

    log_print('args:')
    log_print(args)
    print('model:', file=f_log)
    print(model, file=f_log)
    log_print('# of params:',
              str(sum([p.numel() for p in model.parameters()])))
    f_log.flush()
    torch.save(args, os.path.join(args.save, 'args.pth'))
    # tsv header; rows below append 8 columns matching these names.
    scores = [
        'epoch\tlr\ttrain_loss\tval_loss\ttrain_err1'
        '\tval_err1\ttrain_err5\tval_err5'
    ]
    if args.tensorboard:
        configure(args.save, flush_secs=5)

    for epoch in range(args.start_epoch, args.epochs + 1):

        # train for one epoch
        train_loss, train_err1, train_err5, lr = trainer.train(
            train_loader, epoch)

        if args.tensorboard:
            log_value('lr', lr, epoch)
            log_value('train_loss', train_loss, epoch)
            log_value('train_err1', train_err1, epoch)
            log_value('train_err5', train_err5, epoch)

        # evaluate on validation set
        val_loss, val_err1, val_err5 = trainer.test(val_loader, epoch)

        if args.tensorboard:
            log_value('val_loss', val_loss, epoch)
            log_value('val_err1', val_err1, epoch)
            log_value('val_err5', val_err5, epoch)

        # save scores to a tsv file, rewrite the whole file to prevent
        # accidental deletion
        scores.append(
            ('{}\t{}' + '\t{:.4f}' * 6).format(epoch, lr, train_loss, val_loss,
                                               train_err1, val_err1,
                                               train_err5, val_err5))
        with open(os.path.join(args.save, 'scores.tsv'), 'w') as f:
            print('\n'.join(scores), file=f)

        # remember best err@1 and save checkpoint
        is_best = val_err1 < best_err1
        if is_best:
            best_err1 = val_err1
            best_epoch = epoch
            print(Fore.GREEN + 'Best val_err1 {}'.format(best_err1) +
                  Fore.RESET)
            # test_loss, test_err1, test_err1 = validate(
            #     test_loader, model, criterion, epoch, True)
            # save test
        save_checkpoint(
            {
                'args': args,
                'epoch': epoch,
                'best_epoch': best_epoch,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_err1': best_err1,
            }, is_best, args.save)
        # Early stop when no improvement for `patience` epochs (if set > 0).
        if not is_best and epoch - best_epoch >= args.patience > 0:
            break
    print('Best val_err1: {:.4f} at epoch {}'.format(best_err1, best_epoch))
예제 #28
0
    config = utils.load_config(args.config)

    global_params = config["globals"]

    utils.set_seed(global_params["seed"])
    device = utils.get_device(global_params)
    output_dir = global_params["output_dir"]

    data_conf = config["data"]
    if args.generate:
        for c in data_conf.values():
            utils.generate_data(c)

    model = models.get_model(config).to()
    criterion = utils.get_criterion(config)
    optimizer = utils.get_optimizer(model, config)
    scheduler = utils.get_scheduler(optimizer, config)
    loaders = {
        phase: utils.get_loader(config, phase)
        for phase in ["train", "valid3", "valid", "valid12"]
    }

    runner = SupervisedRunner(device=device,
                              input_key=["objects", "externals", "triplet"],
                              input_target_key="targets")
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 loaders=loaders,
                 scheduler=scheduler,
                 num_epochs=global_params["num_epochs"],
예제 #29
0
 def get_optimizer(self):
     """Build the optimizer named in this network's config section.

     Looks up ``config[network_type]["optimizer"]`` and constructs it via
     the ``U.get_optimizer`` factory with the hyper-parameters each
     optimizer expects.

     Returns:
         The constructed optimizer instance.

     Raises:
         ValueError: if the configured optimizer name is not supported
             (previously this silently returned ``None``).
     """
     optimizer_name = self.config[self.network_type]["optimizer"]
     if optimizer_name == "DistillOptimizer":
         # DistillOptimizer additionally takes a gradient-clipping max norm.
         return U.get_optimizer(optimizer_name)(self.learning_rate, self.momentum, self.max_norm)
     elif optimizer_name == "Momentum":
         return U.get_optimizer(optimizer_name)(self.learning_rate, self.momentum)
     raise ValueError("unsupported optimizer: {}".format(optimizer_name))
예제 #30
0
def main():
    """Entry point for the Deep Thinking training script.

    Parses CLI arguments, builds dataloaders/model/optimizer (optionally
    resuming from a checkpoint), trains with warmup + MultiStepLR
    scheduling, logs to TensorBoard, saves periodic checkpoints, and
    finally evaluates train/test accuracy (optionally with a different
    iteration count for iterative models).

    Fix vs. original: the bare ``except:`` around the task-id parse is
    narrowed to ``except Exception:`` so KeyboardInterrupt/SystemExit are
    no longer swallowed.
    """

    print("\n_________________________________________________\n")
    print(now(), "train_model.py main() running.")

    parser = argparse.ArgumentParser(description="Deep Thinking")
    parser.add_argument("--checkpoint",
                        default="check_default",
                        type=str,
                        help="where to save the network")
    parser.add_argument("--dataset",
                        default="CIFAR10",
                        type=str,
                        help="dataset")
    parser.add_argument("--depth",
                        default=1,
                        type=int,
                        help="depth of the network")
    parser.add_argument("--epochs",
                        default=200,
                        type=int,
                        help="number of epochs for training")
    parser.add_argument("--lr", default=0.1, type=float, help="learning rate")
    parser.add_argument("--lr_factor",
                        default=0.1,
                        type=float,
                        help="learning rate decay factor")
    parser.add_argument("--lr_schedule",
                        nargs="+",
                        default=[100, 150],
                        type=int,
                        help="how often to decrease lr")
    parser.add_argument("--mode",
                        default="default",
                        type=str,
                        help="which  testing mode?")
    parser.add_argument("--model",
                        default="resnet18",
                        type=str,
                        help="model for training")
    parser.add_argument("--model_path",
                        default=None,
                        type=str,
                        help="where is the model saved?")
    parser.add_argument("--no_save_log",
                        action="store_true",
                        help="do not save log file")
    parser.add_argument("--optimizer",
                        default="SGD",
                        type=str,
                        help="optimizer")
    parser.add_argument("--output",
                        default="output_default",
                        type=str,
                        help="output subdirectory")
    parser.add_argument("--problem",
                        default="classification",
                        type=str,
                        help="problem type (classification or segmentation)")
    parser.add_argument("--save_json", action="store_true", help="save json")
    parser.add_argument("--save_period",
                        default=None,
                        type=int,
                        help="how often to save")
    parser.add_argument("--test_batch_size",
                        default=50,
                        type=int,
                        help="batch size for testing")
    parser.add_argument("--test_dataset",
                        type=str,
                        default=None,
                        help="name of the testing dataset")
    parser.add_argument("--test_iterations",
                        default=None,
                        type=int,
                        help="how many, if testing with a different "
                        "number iterations than training")
    parser.add_argument("--train_batch_size",
                        default=128,
                        type=int,
                        help="batch size for training")
    parser.add_argument("--train_log",
                        default="train_log.txt",
                        type=str,
                        help="name of the log file")
    parser.add_argument("--val_period",
                        default=20,
                        type=int,
                        help="how often to validate")
    parser.add_argument("--width",
                        default=4,
                        type=int,
                        help="width of the network")

    args = parser.parse_args()

    # Default: save exactly once, at the end of training.
    if args.save_period is None:
        args.save_period = args.epochs
    print(args)

    # summary writer
    # Array task id (e.g. from a SLURM job array) is encoded as the last
    # "_"-separated token of the log filename, minus the ".txt" suffix.
    train_log = args.train_log
    try:
        array_task_id = train_log[:-4].split("_")[-1]
    except Exception:
        # FIX: was a bare `except:`; fall back to a default id but no
        # longer swallow KeyboardInterrupt/SystemExit.
        array_task_id = 1
    writer = SummaryWriter(log_dir=f"{args.output}/runs/{train_log[:-4]}")

    if not args.no_save_log:
        to_log_file(args, args.output, train_log)

    # set device
    device = "cuda" if torch.cuda.is_available() else "cpu"
    ####################################################
    #               Dataset and Network and Optimizer
    trainloader, testloader = get_dataloaders(
        args.dataset,
        args.train_batch_size,
        test_batch_size=args.test_batch_size)

    # load model from path if a path is provided
    if args.model_path is not None:
        print(f"Loading model from checkpoint {args.model_path}...")
        net, start_epoch, optimizer_state_dict = load_model_from_checkpoint(
            args.model, args.model_path, args.dataset, args.width, args.depth)
        start_epoch += 1

    else:
        net = get_model(args.model, args.dataset, args.width, args.depth)
        start_epoch = 0
        optimizer_state_dict = None

    net = net.to(device)
    pytorch_total_params = sum(p.numel() for p in net.parameters())
    optimizer = get_optimizer(args.optimizer, args.model, net, args.lr,
                              args.dataset)

    print(net)
    print(
        f"This {args.model} has {pytorch_total_params/1e6:0.3f} million parameters."
    )
    print(f"Training will start at epoch {start_epoch}.")

    # When resuming, restore optimizer state and skip LR warmup.
    if optimizer_state_dict is not None:
        print(f"Loading optimizer from checkpoint {args.model_path}...")
        optimizer.load_state_dict(optimizer_state_dict)
        warmup_scheduler = warmup.ExponentialWarmup(optimizer, warmup_period=0)
    else:
        warmup_scheduler = warmup.ExponentialWarmup(optimizer, warmup_period=5)

    lr_scheduler = MultiStepLR(optimizer,
                               milestones=args.lr_schedule,
                               gamma=args.lr_factor,
                               last_epoch=-1)
    # Bundle optimizer + schedulers so train() can step them together.
    optimizer_obj = OptimizerWithSched(optimizer, lr_scheduler,
                                       warmup_scheduler)
    np.set_printoptions(precision=2)
    torch.backends.cudnn.benchmark = True
    test_setup = TestingSetup(args.problem.lower(), args.mode.lower())
    ####################################################

    ####################################################
    #        Train
    print(f"==> Starting training for {args.epochs - start_epoch} epochs...")

    for epoch in range(start_epoch, args.epochs):

        loss, acc = train(net, trainloader, args.problem.lower(),
                          optimizer_obj, device)

        print(f"{now()} Training loss at epoch {epoch}: {loss}")
        print(f"{now()} Training accuracy at epoch {epoch}: {acc}")

        # if the loss is nan, then stop the training
        if np.isnan(float(loss)):
            print("Loss is nan, exiting...")
            sys.exit()

        # tensorboard loss writing
        writer.add_scalar("Loss/loss", loss, epoch)
        writer.add_scalar("Accuracy/acc", acc, epoch)

        for i in range(len(optimizer.param_groups)):
            writer.add_scalar(f"Learning_rate/group{i}",
                              optimizer.param_groups[i]["lr"], epoch)

        # Periodic validation on both train and test splits.
        if (epoch + 1) % args.val_period == 0:
            train_acc = test(net, trainloader, test_setup, device)
            test_acc = test(net, testloader, test_setup, device)

            print(f"{now()} Training accuracy: {train_acc}")
            print(f"{now()} Testing accuracy: {test_acc}")

            stats = [train_acc, test_acc]
            stat_names = ["train_acc", "test_acc"]
            for stat_idx, stat in enumerate(stats):
                stat_name = os.path.join("val", stat_names[stat_idx])
                writer.add_scalar(stat_name, stat, epoch)

        # Periodic checkpointing (and always at the final epoch).
        if (epoch + 1) % args.save_period == 0 or (epoch + 1) == args.epochs:
            state = {
                "net": net.state_dict(),
                "epoch": epoch,
                "optimizer": optimizer.state_dict()
            }
            # NOTE(review): filename embeds args.epochs-1 rather than the
            # current epoch, so intermediate checkpoints overwrite each
            # other — confirm intended.
            out_str = os.path.join(
                args.checkpoint,
                f"{args.model}_{args.dataset}_{args.optimizer}"
                f"_depth={args.depth}"
                f"_width={args.width}"
                f"_lr={args.lr}"
                f"_batchsize={args.train_batch_size}"
                f"_epoch={args.epochs-1}"
                f"_{array_task_id}.pth")

            print("saving model to: ", args.checkpoint, " out_str: ", out_str)
            if not os.path.isdir(args.checkpoint):
                os.makedirs(args.checkpoint)
            torch.save(state, out_str)

    writer.flush()
    writer.close()
    ####################################################

    ####################################################
    #        Test
    print("==> Starting testing...")

    if args.test_iterations is not None:
        # Only iterative ("thinking") models expose an integer `iters`.
        assert isinstance(
            net.iters, int), "Cannot test feed-forward model with iterations."
        net.iters = args.test_iterations

    train_acc = test(net, trainloader, test_setup, device)
    test_acc = test(net, testloader, test_setup, device)

    print(f"{now()} Training accuracy: {train_acc}")
    print(f"{now()} Testing accuracy: {test_acc}")

    model_name_str = f"{args.model}_depth={args.depth}_width={args.width}"
    stats = OrderedDict([("model", model_name_str),
                         ("num_params", pytorch_total_params),
                         ("learning rate", args.lr),
                         ("lr_factor", args.lr_factor), ("lr", args.lr),
                         ("epochs", args.epochs),
                         ("train_batch_size", args.train_batch_size),
                         ("optimizer", args.optimizer),
                         ("dataset", args.dataset), ("train_acc", train_acc),
                         ("test_acc", test_acc),
                         ("test_iter", args.test_iterations)])

    if args.save_json:
        to_json(stats, args.output)
예제 #31
0
def main():
    """Parse CLI arguments and run a full train/validate experiment.

    Handles optional checkpoint resuming, dataloader construction,
    save-folder setup (with a snapshot of the source tree), file logging,
    and the main train/eval loop with best-checkpoint tracking and
    patience-based early stopping.
    """
    # parse arg and start experiment
    global args
    best_ap = -1.
    best_iter = 0

    args = parser.parse_args()
    args.config_of_data = config.datasets[args.data]
    # args.num_classes = config.datasets[args.data]['num_classes']
    if configure is None:
        # tensorboard_logger is optional; degrade gracefully without it
        args.tensorboard = False
        print(Fore.RED +
              'WARNING: you don\'t have tesnorboard_logger installed' +
              Fore.RESET)

    # optionally resume from a checkpoint
    if args.resume:
        if args.resume and os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            old_args = checkpoint['args']
            print('Old args:')
            print(old_args)
            # set args based on checkpoint
            if args.start_iter <= 0:
                args.start_iter = checkpoint['iter'] + 1
            best_iter = args.start_iter - 1
            best_ap = checkpoint['best_ap']
            # restore architecture-defining options from the old run so the
            # freshly built model matches the saved weights
            for name in arch_resume_names:
                if name in vars(args) and name in vars(old_args):
                    setattr(args, name, getattr(old_args, name))
            model = get_model(**vars(args))
            model.load_state_dict(checkpoint['state_dict'])
            print("=> loaded checkpoint '{}' (iter {})"
                  .format(args.resume, checkpoint['iter']))
        else:
            print(
                "=> no checkpoint found at '{}'".format(
                    Fore.RED +
                    args.resume +
                    Fore.RESET),
                file=sys.stderr)
            return
    else:
        # create model from scratch
        print("=> creating model '{}'".format(args.arch))
        model = get_model(**vars(args))

    # cudnn.benchmark = True
    cudnn.enabled = False

    # create dataloader
    # NOTE(fix): the original passed splits=('val') / ('test'), which are
    # plain strings, not 1-tuples; use ('val',) / ('test',) for consistency
    # with the ('train', 'val') call below.
    if args.evaluate == 'val':
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('val',), **vars(args))
        validate(val_loader, model, best_iter)
        return
    elif args.evaluate == 'test':
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('test',), **vars(args))
        validate(test_loader, model, best_iter)
        return
    else:
        train_loader, val_loader, test_loader = getDataloaders(
            splits=('train', 'val'), **vars(args))

    # define optimizer
    optimizer = get_optimizer(model, args)

    # check if the save folder already exists; ask before clobbering
    if os.path.exists(args.save):
        print(Fore.RED + args.save + Fore.RESET
              + ' already exists!', file=sys.stderr)
        if not args.force:
            ans = input('Do you want to overwrite it? [y/N]:')
            if ans not in ('y', 'Y', 'yes', 'Yes'):
                # fix: os.exit does not exist (AttributeError); use sys.exit
                sys.exit(1)
        print('remove existing ' + args.save)
        shutil.rmtree(args.save)
    os.makedirs(args.save)
    print('create folder: ' + Fore.GREEN + args.save + Fore.RESET)

    # copy code to save folder for reproducibility (skipped for debug runs)
    if args.save.find('debug') < 0:
        shutil.copytree(
            '.',
            os.path.join(
                args.save,
                'src'),
            symlinks=True,
            ignore=shutil.ignore_patterns(
                '*.pyc',
                '__pycache__',
                '*.path.tar',
                '*.pth',
                '*.ipynb',
                '.*',
                'data',
                'save',
                'save_backup'))

    # set up logging: everything goes to stdout and to log.txt
    global log_print, f_log
    f_log = open(os.path.join(args.save, 'log.txt'), 'w')

    def log_print(*print_args):
        # renamed from *args to avoid shadowing the global `args` namespace
        print(*print_args)
        print(*print_args, file=f_log)
    log_print('args:')
    log_print(args)
    print('model:', file=f_log)
    print(model, file=f_log, flush=True)
    log_print('# of params:',
              str(sum(p.numel() for p in model.parameters())))
    torch.save(args, os.path.join(args.save, 'args.pth'))
    scores = ['iter\tlr\ttrain_loss\tval_ap']
    if args.tensorboard:
        configure(args.save, flush_secs=5)

    for i in range(args.start_iter, args.niters + 1, args.eval_freq):
        # train for args.eval_freq iterations
        train_loss = train(train_loader, model, optimizer,
                           i, args.eval_freq)
        i += args.eval_freq - 1

        # evaluate on validation set
        val_ap = validate(val_loader, model, i)

        # fix: `lr` was never defined before being logged below (NameError);
        # read the current learning rate from the optimizer instead
        lr = optimizer.param_groups[0]['lr']

        # save scores to a tsv file, rewrite the whole file to prevent
        # accidental deletion
        scores.append(('{}\t{}' + '\t{:.4f}' * 2)
                      .format(i, lr, train_loss, val_ap))
        with open(os.path.join(args.save, 'scores.tsv'), 'w') as f:
            print('\n'.join(scores), file=f)

        # remember best val AP and save checkpoint
        is_best = val_ap > best_ap
        if is_best:
            best_ap = val_ap
            best_iter = i
            print(Fore.GREEN + 'Best var_err1 {}'.format(best_ap) +
                  Fore.RESET)
        save_checkpoint({
            'args': args,
            'iter': i,
            'best_iter': best_iter,
            'arch': args.arch,
            'state_dict': model.state_dict(),
            'best_ap': best_ap,
        }, is_best, args.save)
        # early stopping: no improvement for `patience` iterations
        if not is_best and i - best_iter >= args.patience > 0:
            break
    print('Best val_ap: {:.4f} at iter {}'.format(best_ap, best_iter))
예제 #32
0
파일: main.py 프로젝트: omihub777/sim-real
    api_key=api_key,
    project_name="sim_real",
    auto_metric_logging=True,
    auto_param_logging=True,
)

# Optionally run compute in float16 (Keras keeps float32 master weights).
if args.mixed_precision:
    print("Applied: Mixed Precision")
    tf.keras.mixed_precision.set_global_policy("mixed_float16")

# Build datasets and log a sample image grid from the first training batch.
train_ds, test_ds = get_dataset(args)
grid = image_grid(next(iter(train_ds))[0])[0]
logger.log_image(grid.numpy())
# Assemble model, loss, optimizer and LR schedule from the CLI arguments.
model = get_model(args)
criterion = get_criterion(args)
optimizer = get_optimizer(args)
lr_scheduler = get_lr_scheduler(args)
# Stop when val_accuracy stops improving; roll back to the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', patience=args.patience, restore_best_weights=True)
experiment_name = get_experiment_name(args)
logger.set_name(experiment_name)
logger.log_parameters(vars(args))
with logger.train():
    # Checkpoint only the best-val_accuracy weights to <model_name>.hdf5.
    filename =f'{args.model_name}.hdf5'
    checkpoint = tf.keras.callbacks.ModelCheckpoint(filename, monitor='val_accuracy', mode='max', save_best_only=True, verbose=True)

    model.compile(loss=criterion, optimizer=optimizer, metrics=['accuracy'])
    if args.dry_run:
        # Smoke test: 2 steps per epoch, no callbacks (no files written).
        print("[INFO] Turn off all callbacks")
        model.fit(train_ds, validation_data=test_ds, epochs=args.epochs, steps_per_epoch=2)
    else:
        model.fit(train_ds, validation_data=test_ds, epochs=args.epochs, callbacks=[lr_scheduler, early_stop, checkpoint])
예제 #33
0
파일: models.py 프로젝트: zgcgreat/WSDM
    def __init__(self, field_sizes=None, embed_size=10, filter_sizes=None, layer_acts=None, drop_out=None,
                 init_path=None, opt_algo='gd', learning_rate=1e-2, random_seed=None):
        """Build a CCPM-style CTR model: per-field embeddings, two conv +
        pooling stages over the field axis, then a single linear output.

        The TF1 graph, session and variables are created eagerly here;
        ``self.y_prob``, ``self.loss`` and ``self.optimizer`` are exposed
        for the surrounding training code.
        """
        Model.__init__(self)
        field_count = len(field_sizes)
        # one embedding matrix per field, two conv filters, final affine layer
        init_vars = [('embed_%d' % idx, [field_sizes[idx], embed_size], 'xavier', dtype)
                     for idx in range(field_count)]
        init_vars.append(('f1', [embed_size, filter_sizes[0], 1, 2], 'xavier', dtype))
        init_vars.append(('f2', [embed_size, filter_sizes[1], 2, 2], 'xavier', dtype))
        init_vars.append(('w1', [2 * 3 * embed_size, 1], 'xavier', dtype))
        init_vars.append(('b1', [1], 'zero', dtype))
        print('init_vars: ', init_vars)

        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            # one sparse placeholder per field; dense label placeholder
            self.X = [tf.sparse_placeholder(dtype) for _ in range(field_count)]
            self.y = tf.placeholder(dtype)
            self.keep_prob_train = 1 - np.array(drop_out)
            self.keep_prob_test = np.ones_like(drop_out)
            self.layer_keeps = tf.placeholder(dtype)
            self.vars = utils.init_var_map(init_vars, init_path)

            # embed every field and concatenate along the feature axis
            embeds = [self.vars['embed_%d' % idx] for idx in range(field_count)]
            concat_embed = tf.concat(
                [tf.sparse_tensor_dense_matmul(self.X[idx], embeds[idx])
                 for idx in range(field_count)], 1)

            # arrange as (batch, embed_size, field_count, 1) for convolution
            net = tf.transpose(
                tf.reshape(concat_embed, [-1, field_count, embed_size, 1]),
                [0, 2, 1, 3])

            # first conv, then pool the field axis down to field_count // 2
            net = tf.nn.conv2d(net, self.vars['f1'], [1, 1, 1, 1], 'SAME')
            net = tf.transpose(
                utils.max_pool_4d(tf.transpose(net, [0, 1, 3, 2]),
                                  int(field_count / 2)),
                [0, 1, 3, 2])

            # second conv, then pool the field axis down to 3 positions
            net = tf.nn.conv2d(net, self.vars['f2'], [1, 1, 1, 1], 'SAME')
            net = tf.transpose(
                utils.max_pool_4d(tf.transpose(net, [0, 1, 3, 2]), 3),
                [0, 1, 3, 2])

            # flatten, activate, apply dropout (keep prob fed at run time)
            net = tf.nn.dropout(
                utils.activate(
                    tf.reshape(net, [-1, embed_size * 3 * 2]),
                    layer_acts[0]),
                self.layer_keeps[0])

            # final affine layer producing one logit per example
            logits = tf.squeeze(tf.matmul(net, self.vars['w1']) + self.vars['b1'])
            self.y_prob = tf.sigmoid(logits)

            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=self.y))
            self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)
    input_vector = concatenate([q_vector, d_vector])
    print("Concatenated vector: {iv}".format(iv=input_vector))
    dense = Dense(config_model_param["layers_size"][0], activation=config_model_param['hidden_activation'],
                  name="MLP_combine_0")(input_vector)
    i = 0
    for i in range(config_model_param["num_layers"]-2):
        # dense = Dropout(0.25)(dense)
        dense = Dense(config_model_param["layers_size"][i+1], activation=config_model_param['hidden_activation'],
                      name="MLP_combine_"+str(i+1))(dense)

    # dense = Dropout(0.5)(dense)
    out_size = get_input_label_size(config_data)
    out_labels = Dense(out_size, activation=config_model_param['output_activation'], name="MLP_out")(dense)
    model = Model(inputs=[query, doc], outputs=out_labels)
    model2 = Model(inputs=[query, doc], outputs=input_vector)
    optimizer = get_optimizer(config_model_param["optimizer"])(lr=config_model_param["learning_rate"])
    print(optimizer)
    model.compile(optimizer=optimizer, loss=config_model_train["loss_function"],
                  metrics=config_model_train["metrics"])
    print(model.summary())
    plot_model(model, to_file=join(config_model_train["train_details"], config_model_param['model_name']+".png"))
    # save model and resume
    # serialize model to JSON
    model_json = model.to_json()
    with open(join(config_model_train["train_details"], config_model_param["model_name"] + ".json"), "w") as json_file:
        json_file.write(model_json)
    print("Saved model to disk.")

    print("Reading training data:")
    print("[First]:\nRead label files to relations...")
    relations, relation_labeler = read_lablers_to_relations(config_data["labels"])
예제 #35
0
파일: models.py 프로젝트: zgcgreat/WSDM
    def __init__(self, field_sizes=None, embed_size=10, layer_sizes=None, layer_acts=None, drop_out=None,
                 embed_l2=None, layer_l2=None, init_path=None, opt_algo='gd', learning_rate=1e-2, random_seed=None,
                 layer_norm=True):
        """Build a product-based network: per-field embeddings plus an
        outer-product feature map (optionally layer-normalized), fed into an
        MLP with a sigmoid output.

        FIX: embedding L2 regularization (``embed_l2``) was previously only
        applied when ``layer_l2`` was not None, and setting ``layer_l2``
        without ``embed_l2`` raised a TypeError. Each term is now guarded by
        its own parameter.
        """
        Model.__init__(self)
        init_vars = []
        num_inputs = len(field_sizes)
        for i in range(num_inputs):
            init_vars.append(('embed_%d' % i, [field_sizes[i], embed_size], 'xavier', dtype))
        # MLP input = concatenated embeddings + outer-product features
        node_in = num_inputs * embed_size + embed_size * embed_size
        for i in range(len(layer_sizes)):
            init_vars.append(('w%d' % i, [node_in, layer_sizes[i]], 'xavier', dtype))
            init_vars.append(('b%d' % i, [layer_sizes[i]], 'zero',  dtype))
            node_in = layer_sizes[i]
        self.graph = tf.Graph()
        with self.graph.as_default():
            if random_seed is not None:
                tf.set_random_seed(random_seed)
            # one sparse placeholder per field; dense label placeholder
            self.X = [tf.sparse_placeholder(dtype) for i in range(num_inputs)]
            self.y = tf.placeholder(dtype)
            self.keep_prob_train = 1 - np.array(drop_out)
            self.keep_prob_test = np.ones_like(drop_out)
            self.layer_keeps = tf.placeholder(dtype)
            self.vars = utils.init_var_map(init_vars, init_path)
            w0 = [self.vars['embed_%d' % i] for i in range(num_inputs)]
            xw = tf.concat([tf.sparse_tensor_dense_matmul(self.X[i], w0[i]) for i in range(num_inputs)], 1)

            # sum embeddings across fields, then form the outer product z z^T
            z = tf.reduce_sum(tf.reshape(xw, [-1, num_inputs, embed_size]), 1)
            op = tf.reshape(
                tf.matmul(tf.reshape(z, [-1, embed_size, 1]),
                          tf.reshape(z, [-1, 1, embed_size])),
                [-1, embed_size * embed_size])

            if layer_norm:
                # normalize the product features, then apply a learned
                # per-feature gain and bias
                p_mean, p_var = tf.nn.moments(op, [1], keep_dims=True)
                op = (op - p_mean) / tf.sqrt(p_var)
                p_g = tf.Variable(tf.ones([embed_size**2]), name='p_g')
                p_b = tf.Variable(tf.zeros([embed_size**2]), name='p_b')
                op = op * p_g + p_b

            # MLP over [embeddings, product features] with dropout per layer
            l = tf.concat([xw, op], 1)
            for i in range(len(layer_sizes)):
                wi = self.vars['w%d' % i]
                bi = self.vars['b%d' % i]
                l = tf.nn.dropout(
                    utils.activate(
                        tf.matmul(l, wi) + bi,
                        layer_acts[i]),
                    self.layer_keeps[i])

            l = tf.squeeze(l)
            self.y_prob = tf.sigmoid(l)

            self.loss = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(logits=l, labels=self.y))
            # fix: guard each regularization term by its own parameter
            if embed_l2 is not None:
                self.loss += embed_l2 * tf.nn.l2_loss(tf.concat(w0, 0))
            if layer_l2 is not None:
                for i in range(len(layer_sizes)):
                    wi = self.vars['w%d' % i]
                    self.loss += layer_l2[i] * tf.nn.l2_loss(wi)
            self.optimizer = utils.get_optimizer(opt_algo, learning_rate, self.loss)

            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            tf.global_variables_initializer().run(session=self.sess)
예제 #36
0
def vae_estimator(hparams):
    """Return an estimator that reconstructs images from compressed
    measurements using a VAE generator as the prior.

    Builds a TF1 graph that optimizes the latent batch ``z_batch`` to
    minimize a weighted sum of L1/L2 measurement losses plus a Gaussian
    prior on z, with random restarts; the returned ``estimator`` closure
    runs that optimization and keeps the best reconstruction per image.

    FIX: the progress logging used a Python-2 ``print`` statement, which is
    a SyntaxError under Python 3; converted to a ``print()`` call.
    """

    # Get a session
    sess = tf.Session()

    # Set up placeholders for the measurement matrix and measurements
    A = tf.placeholder(tf.float32,
                       shape=(hparams.n_input, hparams.num_measurements),
                       name='A')
    y_batch = tf.placeholder(tf.float32,
                             shape=(hparams.batch_size,
                                    hparams.num_measurements),
                             name='y_batch')

    # Create the generator
    # TODO: Move z_batch definition here
    z_batch, x_hat_batch, restore_path, restore_dict = mnist_model_def.vae_gen(
        hparams)

    # measure the estimate
    if hparams.measurement_type == 'project':
        y_hat_batch = tf.identity(x_hat_batch, name='y_hat_batch')
    else:
        y_hat_batch = tf.abs(tf.matmul(x_hat_batch, A, name='y_hat_batch'))

    # define all losses (per-example)
    m_loss1_batch = tf.reduce_mean(tf.abs(y_batch - y_hat_batch), 1)
    m_loss2_batch = tf.reduce_mean((y_batch - y_hat_batch)**2, 1)
    zp_loss_batch = tf.reduce_sum(z_batch**2, 1)

    # define total loss as the weighted sum of the three terms
    total_loss_batch = hparams.mloss1_weight * m_loss1_batch \
                     + hparams.mloss2_weight * m_loss2_batch \
                     + hparams.zprior_weight * zp_loss_batch
    total_loss = tf.reduce_mean(total_loss_batch)

    # Compute means for logging
    m_loss1 = tf.reduce_mean(m_loss1_batch)
    m_loss2 = tf.reduce_mean(m_loss2_batch)
    zp_loss = tf.reduce_mean(zp_loss_batch)

    # Set up gradient descent over the latent variables only
    var_list = [z_batch]
    global_step = tf.Variable(0, trainable=False, name='global_step')
    learning_rate = utils.get_learning_rate(global_step, hparams)
    opt = utils.get_optimizer(learning_rate, hparams)
    update_op = opt.minimize(total_loss,
                             var_list=var_list,
                             global_step=global_step,
                             name='update_op')
    opt_reinit_op = utils.get_opt_reinit_op(opt, var_list, global_step)

    # Initialize and restore model parameters
    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    restorer = tf.train.Saver(var_list=restore_dict)
    restorer.restore(sess, restore_path)

    def estimator(A_val, y_batch_val, z_batch_val, hparams):
        """Function that returns the estimated image"""
        best_keeper = utils.BestKeeper(hparams)
        assign_z_opt_op = z_batch.assign(z_batch_val)
        if hparams.measurement_type == 'project':
            feed_dict = {y_batch: y_batch_val}
        else:
            feed_dict = {A: A_val, y_batch: y_batch_val}
        for i in range(hparams.num_random_restarts):
            # reset optimizer state and latent variables for each restart
            sess.run(opt_reinit_op)
            sess.run(assign_z_opt_op)
            for j in range(hparams.max_update_iter):
                _, lr_val, total_loss_val, \
                m_loss1_val, m_loss2_val, zp_loss_val, z_batch_val = sess.run([update_op, learning_rate, total_loss,
                                        m_loss1,
                                        m_loss2,
                                        zp_loss, z_batch], feed_dict=feed_dict)
                logging_format = 'rr {} iter {} lr {} total_loss {} m_loss1 {} m_loss2 {} zp_loss {}'
                # fix: Python-3 print() call (was a Python-2 print statement)
                print(logging_format.format(i, j, lr_val, total_loss_val,
                                            m_loss1_val, m_loss2_val,
                                            zp_loss_val))

                if hparams.gif and ((j % hparams.gif_iter) == 0):
                    # periodically dump the current reconstructions as frames
                    images = sess.run(x_hat_batch, feed_dict=feed_dict)
                    for im_num, image in enumerate(images):
                        save_dir = '{0}/{1}/'.format(hparams.gif_dir, im_num)
                        utils.set_up_dir(save_dir)
                        save_path = save_dir + '{0}.png'.format(j)
                        image = image.reshape(hparams.image_shape)
                        save_image(image, save_path)

            # keep the best-loss reconstruction across restarts
            x_hat_batch_val, z_batch_val, total_loss_batch_val = sess.run(
                [x_hat_batch, z_batch, total_loss_batch], feed_dict=feed_dict)
            best_keeper.report(x_hat_batch_val, z_batch_val,
                               total_loss_batch_val)

        return best_keeper.get_best()

    return estimator
def train_supernet(results_dir, model, task_sampler, train_iter, valid_iter, device, config):
    """Train a supernet by Monte-Carlo sampling tasks (sub-graphs) per batch.

    Each training batch averages the loss over ``n_monte`` sampled tasks
    before one optimizer step. Per-epoch metrics are averaged, optionally
    sent to TensorBoard, and periodically checkpointed alongside a pickle
    of all metrics.

    :param results_dir: directory for config.yaml, weights and metrics
    :param model: supernet; must expose forward(x, task) and none_grad()
    :param task_sampler: provides .sample(n_monte=...) -> list of tasks
    :param train_iter: training data iterator yielding (x, y)
    :param valid_iter: validation data iterator yielding (x, y)
    :param device: torch device for inputs/targets
    :param config: nested dict; only config["TRAIN"][...] is read here
    :return: dict {"train": [...], "valid": [...]} of per-epoch metrics
    """
    writer = None
    since = time.time()

    # fix the seed and persist the fully-resolved config for reproducibility
    seed = set_seed(config["TRAIN"]["train_seed"])
    config["TRAIN"]["train_seed"] = seed
    with open(os.path.join(results_dir, "config.yaml"), "w") as f:
        yaml.dump(config, f)

    # metrics accumulated across all epochs
    total_metrics = {
        "train": [],
        "valid": [],
    }

    # data iterators
    iters = {"train": train_iter, "valid": valid_iter}

    # training stuff
    optimizer = get_optimizer(model.parameters(), config["TRAIN"]["OPTIMIZER"])
    scheduler = get_scheduler(optimizer, config["TRAIN"]["SCHEDULER"])
    criterion = get_criterion(config["TRAIN"]["CRITERION"])

    # training
    for epoch in range(config["TRAIN"]["num_epochs"]):

        print("-" * 100)
        print("Iter Epoch {}/{}".format(epoch + 1, config["TRAIN"]["num_epochs"]))
        print("-" * 100)

        # per-epoch metric buffers (lists of per-batch values, averaged below)
        epoch_metrics = {
            "train": {
                "learning_rate": [],
                "losses_train": [],
                "accs_train": [],
            },
            "valid": {
                "losses_valid": [],
                "accs_valid": [],
            }
        }

        for phase in ["train", "valid"]:

            for iter_cpt, (x, y) in tqdm(enumerate(iters[phase]), ncols=100, total=len(iters[phase])):

                # perform an update
                if phase == "train":

                    model.train()
                    # sample n_monte tasks and average their losses so one
                    # backward/step covers the whole Monte-Carlo estimate
                    tasks = task_sampler.sample(n_monte=config["TRAIN"]["GRAPH_SAMPLER"]["n_monte"])
                    loss_t = None
                    accs_t = []

                    for task in tasks:

                        # forward
                        x_t, y_t = x.to(device), y.to(device)
                        preds_t = model.forward(x_t, task)

                        # computing gradient (running average of task losses)
                        if loss_t is None:
                            loss_t = criterion(preds_t, y_t) / config["TRAIN"]["GRAPH_SAMPLER"]["n_monte"]
                        else:
                            loss_t += criterion(preds_t, y_t) / config["TRAIN"]["GRAPH_SAMPLER"]["n_monte"]

                        # saving accuracies
                        accs_t.append(np.mean((torch.max(preds_t, dim=1)[1] == y_t).cpu().numpy()))

                    # update: backward once on the averaged loss, then step;
                    # grads are cleared afterwards via the model's none_grad()
                    loss_t.backward()
                    optimizer.step()
                    scheduler.step(epoch)
                    model.none_grad()

                    # adding metrics
                    epoch_metrics[phase]["learning_rate"].append(scheduler.get_lr())
                    epoch_metrics[phase]["losses_train"].append(loss_t.item())
                    epoch_metrics[phase]["accs_train"].append(np.mean(accs_t))

                elif config["TRAIN"]["perform_valid"]:

                    model.eval()
                    # validate on a single sampled task per batch
                    task = task_sampler.sample()[0]

                    # forward
                    x_v, y_v = x.to(device), y.to(device)
                    with torch.no_grad():
                        preds_v = model.forward(x_v, task)
                        loss_v = criterion(preds_v, y_v)

                    # adding metrics
                    epoch_metrics[phase]["losses_valid"].append(loss_v.item())
                    epoch_metrics[phase]["accs_valid"].append(
                        np.mean((torch.max(preds_v, dim=1)[1] == y_v).cpu().numpy()))

                else:
                    # validation disabled: skip the whole valid phase
                    break

        # average metrics over epoch (empty buffers become None)
        to_print = "\n"
        for phase in ["train", "valid"]:
            to_print += phase.upper() + ":\n"
            for key in epoch_metrics[phase].keys():
                if len(epoch_metrics[phase][key]) > 0:
                    epoch_metrics[phase][key] = np.mean(epoch_metrics[phase][key])
                    to_print += "{}: {:.4f}".format(key, epoch_metrics[phase][key]) + "\n"
                else:
                    epoch_metrics[phase][key] = None
            total_metrics[phase].append(epoch_metrics[phase])
            to_print += "\n"

        # tensorboard integration to plot nice curves (writer created lazily)
        if config["TRAIN"]["use_tensorboard"]:
            if config["TRAIN"]["use_tensorboard"] and writer is None:
                writer = SummaryWriter(results_dir)
            for phase in ["train", "valid"]:
                for key, value in epoch_metrics[phase].items():
                    if value is not None:
                        writer.add_scalar(phase + "/" + key, value, epoch)

        time_elapsed = time.time() - since
        print(to_print + "Time Elapsed: {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60))

        # save everything every save_period epochs
        if config["TRAIN"]["save"] and ((epoch + 1) % config["TRAIN"]["save_period"] == 0):

            # saving model
            weights_path = os.path.join(results_dir, "model_weights_epoch_{0}_of_{1}.pth".
                                        format(epoch + 1, config["TRAIN"]["num_epochs"]))
            torch.save(model.state_dict(), weights_path)

            # saving metrics so training curves can be rebuilt offline
            with open(os.path.join(results_dir, "total_metrics.pkl"), "wb") as handle:
                pickle.dump(total_metrics, handle, protocol=pickle.HIGHEST_PROTOCOL)

    time_elapsed = time.time() - since
    print("Training complete in {:.0f}m {:.0f}s".format(time_elapsed // 60, time_elapsed % 60))

    return total_metrics
예제 #38
0
    def __init__(self, opt):
        """Set up model, loss, optimizer, schedulers, data loaders,
        metric meters and Visdom logging from the option namespace `opt`.
        """
        self.opt = opt
        # use CUDA whenever at least one GPU was requested
        self.device = torch.device("cuda" if opt.ngpu else "cpu")

        # backbone + classifier head are built together from the options
        self.model, self.classifier = models.get_model(opt.net_type,
                                                       opt.classifier_type,
                                                       opt.pretrained,
                                                       int(opt.nclasses))
        self.model = self.model.to(self.device)
        self.classifier = self.classifier.to(self.device)

        # NOTE(review): only the backbone is wrapped in DataParallel; the
        # classifier stays single-device — confirm this is intentional
        if opt.ngpu > 1:
            self.model = nn.DataParallel(self.model)

        self.loss = models.init_loss(opt.loss_type)
        self.loss = self.loss.to(self.device)

        self.optimizer = utils.get_optimizer(self.model, self.opt)
        self.lr_scheduler = utils.get_lr_scheduler(self.opt, self.optimizer)
        self.alpha_scheduler = utils.get_margin_alpha_scheduler(self.opt)

        self.train_loader = datasets.generate_loader(opt, 'train')
        self.test_loader = datasets.generate_loader(opt, 'val')

        # bookkeeping state for the training loop
        self.epoch = 0
        self.best_epoch = False
        self.training = False
        self.state = {}

        # running meters; best loss starts at +inf so any result improves it
        self.train_loss = utils.AverageMeter()
        self.test_loss = utils.AverageMeter()
        self.batch_time = utils.AverageMeter()
        self.test_metrics = utils.ROCMeter()
        self.best_test_loss = utils.AverageMeter()
        self.best_test_loss.update(np.array([np.inf]))

        # Visdom dashboard, logged to file so sessions can be replayed
        self.visdom_log_file = os.path.join(self.opt.out_path, 'log_files',
                                            'visdom.log')
        self.vis = Visdom(port=opt.visdom_port,
                          log_to_filename=self.visdom_log_file,
                          env=opt.exp_name + '_' + str(opt.fold))

        # plot configs for per-batch losses, TPR-at-FPR curves, epoch losses
        self.vis_loss_opts = {
            'xlabel': 'epoch',
            'ylabel': 'loss',
            'title': 'losses',
            'legend': ['train_loss', 'val_loss']
        }

        self.vis_tpr_opts = {
            'xlabel': 'epoch',
            'ylabel': 'tpr',
            'title': 'val_tpr',
            'legend': ['tpr@fpr10-2', 'tpr@fpr10-3', 'tpr@fpr10-4']
        }

        self.vis_epochloss_opts = {
            'xlabel': 'epoch',
            'ylabel': 'loss',
            'title': 'epoch_losses',
            'legend': ['train_loss', 'val_loss']
        }