예제 #1
0
class Trainer(object):
    def __init__(self, config, rng):
        """Build the data loader, model, savers and TensorFlow session.

        Args:
            config: Settings object. This constructor reads ``task``,
                ``model_dir``, ``gpu_memory_fraction``, ``checkpoint_secs``,
                ``log_step``, ``num_epochs``, ``data_dir``, ``batch_size``
                and ``num_time_steps`` from it, and passes the whole object
                on to ``Model``.
            rng: Kept on the instance as ``self.rng``; not used here.
        """
        self.config, self.rng = config, rng
        self.task = config.task
        self.model_dir = config.model_dir
        self.gpu_memory_fraction = config.gpu_memory_fraction
        self.checkpoint_secs = config.checkpoint_secs
        self.log_step = config.log_step
        self.num_epoch = config.num_epochs

        # Data loader; the task name doubles as the dataset name.
        self.data_loader = BatchLoader(config.data_dir, config.task,
                                       config.batch_size,
                                       config.num_time_steps)

        # The matrix handed to Model as its "laplacian" is the loader's
        # adjacency matrix divided by its largest entry, stored as float32 CSR.
        # (Original author's note: the construction of W is still undecided.)
        adjacency = self.data_loader.adj
        scaled = scipy.sparse.csr_matrix(adjacency / adjacency.max(),
                                         dtype=np.float32)
        largest_eig = graph.lmax(scaled)

        # Model definition.
        self.model = Model(config, scaled, largest_eig)

        # Savers and summary writer. `saver` takes no variable list, while
        # `model_saver` is restricted to the model's own variables.
        self.saver = tf.train.Saver()
        self.model_saver = tf.train.Saver(self.model.model_vars)
        self.summary_writer = tf.summary.FileWriter(self.model_dir)

        supervisor = tf.train.Supervisor(
            logdir=self.model_dir,
            is_chief=True,
            saver=self.saver,
            summary_op=None,
            summary_writer=self.summary_writer,
            save_summaries_secs=300,
            save_model_secs=self.checkpoint_secs,
            global_step=self.model.model_step)

        # Original author's remark on allow_growth: "seems to be not working".
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=self.gpu_memory_fraction,
                allow_growth=True))

        self.sess = supervisor.prepare_or_wait_for_session(
            config=session_config)

    def train(self):
        """Train for ``self.num_epoch`` epochs, resuming from a checkpoint.

        The newest checkpoint in ``self.model_dir`` (if any) is restored and
        the outcome is recorded in ``self.b_pretrain_loaded``.  Every training
        batch is then one-hot encoded, reshaped according to
        ``config.model_type`` and passed to ``self.model.train``.  On each
        epoch where ``n_epoch % 10 == 0`` a checkpoint is written and the
        last batch is printed.

        Returns:
            None.
        """
        import os  # function-scope: only needed to build the checkpoint prefix

        print("[*] Checking if previous run exists in {}"
              "".format(self.model_dir))
        # Query the checkpoint once and reuse the answer for both the test
        # and the restore call.
        latest_checkpoint = tf.train.latest_checkpoint(self.model_dir)
        if latest_checkpoint is not None:
            print("[*] Saved result exists! loading...")
            self.saver.restore(self.sess, latest_checkpoint)
            print("[*] Loaded previously trained weights")
            self.b_pretrain_loaded = True
        else:
            print("[*] No previous result")
            self.b_pretrain_loaded = False

        print("[*] Training starts...")
        self.model_summary_writer = None

        # Saver.save() takes a filename *prefix*, not a directory.  Saving
        # under <model_dir>/model.ckpt keeps the files inside model_dir, where
        # tf.train.latest_checkpoint(model_dir) above looks for them.
        checkpoint_prefix = os.path.join(self.model_dir, 'model.ckpt')

        # Pre-bound so the debug print below cannot raise NameError when the
        # loader reports zero training batches.
        batch_x = batch_y = None

        for n_epoch in trange(self.num_epoch, desc="Training[epoch]"):
            self.data_loader.reset_batch_pointer(0)
            for _ in trange(self.data_loader.sizes[0], desc="[per_batch]"):
                # Fetch training data (split index 0).
                batch_x, batch_y = self.data_loader.next_batch(0)
                batch_x_onehot = convert_to_one_hot(batch_x,
                                                    self.config.num_node)
                if self.config.model_type == 'lstm':
                    # NOTE(review): this is a plain reshape, whereas the
                    # 'glstm' branch reshapes and then transposes; confirm
                    # the one-hot layout really matches
                    # (batch, num_node, num_time_steps).
                    batch_x = batch_x_onehot.reshape([
                        self.config.batch_size, self.config.num_node,
                        self.config.num_time_steps
                    ])
                elif self.config.model_type == 'glstm':
                    reshaped = batch_x_onehot.reshape([
                        self.config.batch_size, self.config.num_time_steps, 1,
                        self.config.num_node
                    ])
                    # -> (batch_size, num_node, 1, num_time_steps)
                    batch_x = np.transpose(reshaped, (0, 3, 2, 1))
                # Any other model_type feeds the loader's batch unchanged.

                feed_dict = {
                    self.model.rnn_input: batch_x,
                    self.model.rnn_output: batch_y
                }
                res = self.model.train(self.sess,
                                       feed_dict,
                                       self.model_summary_writer,
                                       with_output=True)
                # A writer is handed to the next step only on logging steps.
                self.model_summary_writer = self._get_summary_writer(res)

            if n_epoch % 10 == 0:
                self.saver.save(self.sess,
                                checkpoint_prefix,
                                global_step=self.model.model_step)
                print(batch_x, batch_y)

    def test(self):
        """Push every test batch (split index 2) through ``self.model.test``.

        Each batch is one-hot encoded, reshaped to
        (batch_size, num_time_steps, 1, num_node) and transposed to
        (batch_size, num_node, 1, num_time_steps) before being fed.  The
        summary writer is passed to the model only after a step on which
        ``_get_summary_writer`` returns one.

        Returns:
            None.
        """
        self.model_summary_writer = None
        cfg = self.config

        for _ in trange(self.data_loader.sizes[2], desc="Testing"):
            raw_inputs, targets = self.data_loader.next_batch(2)
            one_hot = convert_to_one_hot(raw_inputs, cfg.num_node)
            time_major = one_hot.reshape(
                [cfg.batch_size, cfg.num_time_steps, 1, cfg.num_node])
            node_major = np.transpose(time_major, (0, 3, 2, 1))

            outcome = self.model.test(
                self.sess,
                {
                    self.model.rnn_input: node_major,
                    self.model.rnn_output: targets
                },
                self.model_summary_writer,
                with_output=True)
            self.model_summary_writer = self._get_summary_writer(outcome)

    def _get_summary_writer(self, result):
        if result['step'] % self.log_step == 0:
            return self.summary_writer
        else:
            return None
예제 #2
0
    logging.info("Validation Accuracy: " + str(sum(validation_accuracy) / valid_loader.num_batches) + " " + str(datetime.now()))
    print("Validation Accuracy: " + str(sum(validation_accuracy) / valid_loader.num_batches))

# Start training
# Top-level training script: opens a session, runs the initializer, then
# iterates `epochs` times over every batch of `train_loader`.
# NOTE(review): this snippet is cut off after the final comment below; the
# validation call it announces is not visible here.
with tf.Session() as sess:
    # instrument for tensorboard
    # The graph is written to a timestamped directory under tensorboard/log/.
    # NOTE(review): `writer` is not closed anywhere in the visible part.
    writer = tf.summary.FileWriter(
            os.path.join('tensorboard/log/', time.strftime("%Y-%m-%d-%H-%M-%S")))
    writer.add_graph(sess.graph)
 
    # Run the initializer
    sess.run(init)

    for e in range(epochs):
        # Per-batch accuracies of the current epoch; averaged when reporting.
        epoch_accuracy = []
        train_loader.reset_batch_pointer()
        for b in range(train_loader.num_batches):
            batch_x, batch_y = train_loader.next_batch()
            # One optimisation step; also fetches this batch's loss and accuracy.
            loss, acc, _ = sess.run([loss_op, accuracy, train_op], feed_dict={X: batch_x, Y: batch_y})
            epoch_accuracy.append(acc)
            # Report only on the last batch of every `display_step`-th epoch.
            if b == train_loader.num_batches-1  and (e + 1) % display_step == 0:      
                # Calculate epoch loss and accuracy
                # The logged loss is the last batch's loss only; the accuracy
                # is the mean over all batches of the epoch.
                logging.info("Epoch " + str(e + 1) + 
                        ", batch " + str(b) +
                        ", Minibatch Loss= " + \
                        "{:.4f}".format(loss) +  "    " + 
                        str(datetime.now()))
                logging.info("Epoch Accuracy: " + str(sum(epoch_accuracy) / train_loader.num_batches))  
                print("Epoch Accuracy: " + str(sum(epoch_accuracy) / train_loader.num_batches) + "    Batch Loss: " +str( loss)) 
 
                # check validation accuracy
예제 #3
0
class Trainer(object):
    def __init__(self, config, rng):
        """Create the data loader, model, savers and supervised session.

        Args:
            config: Settings object. Read here: ``task``, ``model_dir``,
                ``gpu_memory_fraction``, ``checkpoint_secs``, ``log_step``,
                ``num_epochs``, ``data_dir``, ``batch_size`` and
                ``num_time_steps``; the object itself is also given to
                ``Model``.
            rng: Stored as ``self.rng``; not otherwise used here.
        """
        self.config = config
        self.rng = rng
        self.task = config.task
        self.model_dir = config.model_dir
        self.gpu_memory_fraction = config.gpu_memory_fraction
        self.checkpoint_secs = config.checkpoint_secs
        self.log_step = config.log_step
        self.num_epoch = config.num_epochs

        # Data loader (the dataset name is the task name).
        loader_args = (config.data_dir, config.task, config.batch_size,
                       config.num_time_steps)
        self.data_loader = BatchLoader(*loader_args)

        # Original author's open question: how the adjacency matrix W should
        # be constructed is still undecided.
        adj = self.data_loader.adj
        normalized_adj = adj / adj.max()  # normalise by the largest entry
        # Keep the matrix in CSR (compressed sparse row) form as float32.
        sparse_adj = scipy.sparse.csr_matrix(normalized_adj, dtype=np.float32)
        # Original author's note: the purpose of lmax is unknown to them.
        lmax_value = graph.lmax(sparse_adj)

        # Model definition.
        self.model = Model(config, sparse_adj, lmax_value)

        # Savers / summary writer.
        self.saver = tf.train.Saver()
        self.model_saver = tf.train.Saver(self.model.model_vars)
        self.summary_writer = tf.summary.FileWriter(self.model_dir)

        supervisor_options = dict(
            logdir=self.model_dir,
            is_chief=True,
            saver=self.saver,
            summary_op=None,
            summary_writer=self.summary_writer,
            save_summaries_secs=300,
            save_model_secs=self.checkpoint_secs,
            global_step=self.model.model_step,
        )
        supervisor = tf.train.Supervisor(**supervisor_options)

        # Original author's remark on allow_growth: "seems to be not working".
        gpu_opts = tf.GPUOptions(
            per_process_gpu_memory_fraction=self.gpu_memory_fraction,
            allow_growth=True)
        session_cfg = tf.ConfigProto(allow_soft_placement=True,
                                     gpu_options=gpu_opts)

        self.sess = supervisor.prepare_or_wait_for_session(config=session_cfg)

    def train(self):
        """Train the model, score each epoch, then export attributions.

        For every epoch all training batches (split index 0) are reshaped
        according to ``config.model_type`` and passed to ``self.model.train``.
        After the epoch, the output of the *last* batch is binarised with a
        margin threshold and scored against that batch's labels with
        ``roc_auc_score``.  A checkpoint is written on epochs where
        ``n_epoch % 10 == 0``.  Once training is finished, ``grad*input``
        attributions for one more training batch are computed with
        DeepExplain and the first sample's attributions are written to
        ``0_features.csv``.

        This method does not restore a previous checkpoint itself.

        Returns:
            None.
        """
        import os  # function-scope: only needed to build the checkpoint prefix

        cfg = self.config
        # Minimum channel-0/channel-1 margin for a prediction to count.
        threshold = 1.0
        # Saver.save() takes a filename prefix, not a directory; keep the
        # checkpoint files inside model_dir.
        checkpoint_prefix = os.path.join(self.model_dir, 'model.ckpt')

        for n_epoch in trange(self.num_epoch, desc="Training[epoch]"):
            self.data_loader.reset_batch_pointer(0)
            # Reset so an epoch without batches skips scoring instead of
            # raising NameError.
            res = None
            batch_y = None
            # sizes[0] is used as the number of training batches.
            for _ in trange(self.data_loader.sizes[0], desc="[per_batch]"):
                # Fetch training data.
                batch_x, batch_y = self.data_loader.next_batch(0)
                if cfg.model_type == 'lstm':
                    batch_x = batch_x.reshape([
                        cfg.batch_size, cfg.num_node, cfg.num_time_steps
                    ])
                elif cfg.model_type == 'glstm':
                    reshaped = batch_x.reshape([
                        cfg.batch_size, cfg.num_time_steps, cfg.feat_in,
                        cfg.num_node
                    ])
                    # -> (batch_size, num_node, feat_in, num_time_steps)
                    batch_x = np.transpose(reshaped, (0, 3, 2, 1))

                # Labels are fed with axes 1 and 3 swapped as well.
                batch_y = np.transpose(batch_y, (0, 3, 2, 1))
                feed_dict = {
                    self.model.rnn_input: batch_x,
                    self.model.rnn_output: batch_y
                }
                res = self.model.train(self.sess, feed_dict, with_output=True)

            if res is not None:
                res_output = res['output']
                # Vectorised form of the per-element thresholding: channel 0
                # wins when it leads by more than `threshold`, channel 1 wins
                # when it leads by more than `threshold`, otherwise both
                # channels are zeroed.  `margin` is a fresh array, so the
                # in-place writes below do not disturb it.
                margin = res_output[:, :, 0, :] - res_output[:, :, 1, :]
                res_output[:, :, 0, :] = margin > threshold
                res_output[:, :, 1, :] = margin < -threshold

                predictions = np.swapaxes(res_output, 2, 3).reshape([-1, 2])
                labels = np.swapaxes(batch_y, 2, 3).reshape([-1, 2])
                # roc_auc_score expects (y_true, y_score): labels go first.
                # NOTE(review): this assumes batch_y holds binary indicator
                # labels - confirm against the data loader.
                acc = roc_auc_score(labels, predictions)
                print('acc:  ', acc)

            if n_epoch % 10 == 0:
                self.saver.save(self.sess,
                                checkpoint_prefix,
                                global_step=self.model.model_step)

        with DeepExplain(session=self.sess) as de:
            xi, _ = self.data_loader.next_batch(0)
            reshaped = xi.reshape([
                cfg.batch_size, cfg.num_time_steps, cfg.feat_in, cfg.num_node
            ])
            # -> (batch_size, num_node, feat_in, num_time_steps)
            xi = np.transpose(reshaped, (0, 3, 2, 1))

            print("[*] Computing grad*input attributions...")
            # NOTE(review): the second argument of explain() is the *target
            # tensor*; passing the numpy batch `xi` there looks unintended.
            # The model's output tensor is not visible from this file, so the
            # call is left as it was - please substitute the right tensor.
            attributions = de.explain('grad*input', xi, self.model.rnn_input,
                                      xi)
            first_sample = np.asarray(attributions[0])
            # np.savetxt only accepts 1-D or 2-D data; keep the leading axis
            # as rows and flatten the remaining axes into columns.
            if first_sample.ndim > 2:
                first_sample = first_sample.reshape(first_sample.shape[0], -1)
            np.savetxt('0_features.csv', first_sample, delimiter=', ')
            print('Done')

    def test(self):
        """Restore the latest checkpoint (if any) and run all test batches.

        The newest checkpoint in ``self.model_dir`` is restored and the
        outcome recorded in ``self.b_pretrain_loaded``.  Every test batch
        (split index 2) is then reshaped the same way ``train`` does for the
        'glstm' model type and passed to ``self.model.test``.

        Returns:
            None.
        """
        print("[*] Checking if previous run exists in {}"
              "".format(self.model_dir))
        # Query the checkpoint once and reuse the answer.
        latest_checkpoint = tf.train.latest_checkpoint(self.model_dir)
        if latest_checkpoint is not None:
            print("[*] Saved result exists! loading...")
            self.saver.restore(self.sess, latest_checkpoint)
            print("[*] Loaded previously trained weights")
            self.b_pretrain_loaded = True
        else:
            print("[*] No previous result")
            self.b_pretrain_loaded = False

        print("[*] Testing starts...")
        self.model_summary_writer = None
        cfg = self.config

        for _ in trange(self.data_loader.sizes[2], desc="Testing"):
            batch_x, batch_y = self.data_loader.next_batch(2)
            reshaped = batch_x.reshape([
                cfg.batch_size, cfg.num_time_steps, cfg.feat_in, cfg.num_node
            ])
            # -> (batch_size, num_node, feat_in, num_time_steps)
            batch_x = np.transpose(reshaped, (0, 3, 2, 1))
            # train() feeds labels with axes 1 and 3 swapped into the same
            # placeholder; do the same here so both paths use one layout.
            batch_y = np.transpose(batch_y, (0, 3, 2, 1))

            feed_dict = {
                self.model.rnn_input: batch_x,
                self.model.rnn_output: batch_y
            }
            # NOTE(review): train() calls model.train without a summary
            # writer, while model.test is still given one - confirm that
            # Model.test accepts this argument.
            res = self.model.test(self.sess,
                                  feed_dict,
                                  self.model_summary_writer,
                                  with_output=True)
            self.model_summary_writer = self._get_summary_writer(res)

    def _get_summary_writer(self, result):
        if result['step'] % self.log_step == 0:
            return self.summary_writer
        else:
            return None
예제 #4
0
class Trainer(object):
    def __init__(self, config, rng):
        """Set up the data loader, per-level graph input, model and session.

        Args:
            config: Settings object. Besides the run options stored on the
                instance, its data-selection fields are passed positionally
                to ``BatchLoader`` and ``conv`` / ``is_coarsen`` /
                ``normalized`` decide what is built for ``Model``.
            rng: Stored as ``self.rng``; not otherwise used here.

        Raises:
            ValueError: If ``config.conv`` is neither ``'gcnn'`` nor ``'cnn'``.
        """
        self.config, self.rng = config, rng
        self.model_dir = config.model_dir
        self.gpu_memory_fraction = config.gpu_memory_fraction
        self.checkpoint_secs = config.checkpoint_secs
        self.log_step = config.log_step
        self.num_epoch = config.num_epochs
        self.stop_win_size = config.stop_win_size
        self.stop_early = config.stop_early

        # Data loader; the argument order is fixed by BatchLoader's signature.
        self.data_loader = BatchLoader(
            config.server_name, config.mode, config.target,
            config.sample_rate, config.win_size, config.hist_range,
            config.s_month, config.s_date, config.e_month, config.e_date,
            config.data_rm, config.batch_size, config.coarsening_level,
            config.conv, config.is_coarsen)

        actual_node = self.data_loader.adj.shape[0]
        conv_kind = config.conv
        if conv_kind not in ('gcnn', 'cnn'):
            raise ValueError("Unsupported config.conv {}".format(config.conv))

        if conv_kind == 'gcnn':
            graphs = self.data_loader.graphs
            if config.is_coarsen:
                # One Laplacian per graph supplied by the loader.
                L = []
                for A in graphs:
                    L.append(graph.laplacian(A, normalized=config.normalized))
            else:
                # The same Laplacian object, repeated once per graph.
                shared = graph.laplacian(self.data_loader.adj,
                                         normalized=config.normalized)
                L = [shared] * len(graphs)
        else:
            # 'cnn': node counts obtained by repeated halving; the list ends
            # with the first value that reaches 0.
            L = [actual_node]
            remaining = actual_node
            while remaining > 0:
                remaining = int(remaining / 2)
                L.append(remaining)

        tf.reset_default_graph()
        # Model definition.
        self.model = Model(config, L, actual_node)

        # Savers / summary writer.
        self.saver = tf.train.Saver()
        self.model_saver = tf.train.Saver(self.model.model_vars)
        self.summary_writer = tf.summary.FileWriter(self.model_dir)

        # Checkpoint files (original author's notes):
        # - meta file: describes the saved graph structure (GraphDef,
        #   SaverDef, ...); tf.train.import_meta_graph('/tmp/model.ckpt.meta')
        #   restores the Saver and the Graph from it.
        # - index file: an immutable string-to-string table; each key is a
        #   tensor name and each value a serialized BundleEntryProto giving
        #   that tensor's metadata (which data file holds it, the offset into
        #   that file, checksum, auxiliary data, ...).
        # - data file: the TensorBundle collection with the variable values.
        supervisor = tf.train.Supervisor(
            logdir=self.model_dir,
            is_chief=True,
            saver=self.saver,
            summary_op=None,
            summary_writer=self.summary_writer,
            save_summaries_secs=300,
            save_model_secs=self.checkpoint_secs,
            global_step=self.model.model_step)

        # Original author's remark on allow_growth: "seems to be not working".
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            gpu_options=tf.GPUOptions(
                per_process_gpu_memory_fraction=self.gpu_memory_fraction,
                allow_growth=True))

        self.sess = supervisor.prepare_or_wait_for_session(
            config=session_config)

    def train(self, val_best_score=10, save=False, index=1, best_model=None):
        """Train with per-epoch validation and optional early stopping.

        The newest checkpoint in ``self.model_dir`` (if any) is restored
        first and the outcome recorded in ``self.b_pretrain_loaded``.  Each
        epoch runs all training batches, then ``self.validate()``.  With
        ``self.stop_early`` set, training stops once the current validation
        loss is within 1e-4 of the mean of the last ``self.stop_win_size``
        validation losses.

        Args:
            val_best_score: Best validation loss known so far; lowered when a
                reporting epoch beats it.
            save: When true, write a checkpoint on every reporting epoch.
            index: Identifier printed in the "Searching ..." progress line.
            best_model: Directory of the best model so far; only printed.

        Returns:
            The lowest validation loss seen on a reporting epoch
            (``n_epoch % 10 == 0``), or ``+inf`` if no reporting epoch ran.
        """
        import os  # function-scope: only needed to build the checkpoint prefix

        print("[*] Checking if previous run exists in {}"
              "".format(self.model_dir))
        # Query the checkpoint once and reuse the answer.
        latest_checkpoint = tf.train.latest_checkpoint(self.model_dir)
        if latest_checkpoint is not None:
            print("[*] Saved result exists! loading...")
            self.saver.restore(self.sess, latest_checkpoint)
            print("[*] Loaded previously trained weights")
            self.b_pretrain_loaded = True
        else:
            print("[*] No previous result")
            self.b_pretrain_loaded = False

        print("[*] Training starts...")
        self.model_summary_writer = None

        # Saver.save() takes a filename prefix, not a directory; keep the
        # checkpoint files inside model_dir, where latest_checkpoint() above
        # looks for them.
        checkpoint_prefix = os.path.join(self.model_dir, 'model.ckpt')

        lr = 0
        tmp_best_loss = float('+inf')
        # Ring buffer of recent validation losses.  It starts at +inf, so the
        # early-stop test cannot fire before the window has been filled.
        validation_loss_window = np.full(self.stop_win_size, float('+inf'))

        for n_epoch in trange(self.num_epoch, desc="Training[epoch]"):
            self.data_loader.reset_batch_pointer(0)
            loss_epoch = []
            for _ in trange(self.data_loader.sizes[0], desc="[per_batch]"):
                # Fetch training data; the last two items are not needed here.
                batch_x, batch_y, weight_y, _, _ = \
                    self.data_loader.next_batch(0)

                feed_dict = {
                    self.model.cnn_input: batch_x,
                    self.model.output_label: batch_y,
                    self.model.ph_labels_weight: weight_y,
                    self.model.is_training: True
                }
                res = self.model.train(self.sess,
                                       feed_dict,
                                       self.model_summary_writer,
                                       with_output=True)
                loss_epoch.append(res['loss'])
                lr = res['lr']
                self.model_summary_writer = self._get_summary_writer(res)

            # NOTE(review): the validation batch pointer is not reset between
            # epochs here - confirm the loader wraps around on its own.
            val_loss = self.validate()
            train_loss = np.mean(loss_epoch)

            validation_loss_window[n_epoch % self.stop_win_size] = val_loss

            if self.stop_early:
                if np.abs(validation_loss_window.mean() - val_loss) < 1e-4:
                    print('Validation loss did not decrease. Stopping early.')
                    break

            if n_epoch % 10 == 0:
                if save:
                    self.saver.save(self.sess,
                                    checkpoint_prefix,
                                    global_step=self.model.model_step)
                if val_loss < val_best_score:
                    val_best_score = val_loss
                    best_model = self.model_dir
                if val_loss < tmp_best_loss:
                    tmp_best_loss = val_loss
                print("Searching {}...".format(index))
                print("Epoch {}: ".format(n_epoch))
                print("LR: ", lr)
                print("  Train Loss: ", train_loss)
                print("  Validate Loss: ", val_loss)
                print("  Current Best Loss: ", val_best_score)
                print("  Current Model Dir: ", best_model)

        return tmp_best_loss

    def validate(self):
        """Run all validation batches (split index 1) through the model.

        Returns:
            The mean of the per-batch ``'loss'`` values, ignoring NaNs.
        """
        batch_losses = []
        num_batches = self.data_loader.sizes[1]
        for _ in trange(num_batches, desc="Validating"):
            inputs, labels, label_weights, _, _ = \
                self.data_loader.next_batch(1)

            outcome = self.model.test(
                self.sess,
                {
                    self.model.cnn_input: inputs,
                    self.model.output_label: labels,
                    self.model.ph_labels_weight: label_weights,
                    self.model.is_training: False
                },
                self.summary_writer,
                with_output=True)
            batch_losses.append(outcome['loss'])

        return np.nanmean(batch_losses)

    def test(self):
        """Run all test batches (split index 2) and gather the results.

        Prints the mean test loss.

        Returns:
            A dict with keys ``'ground_truth'``, ``'prediction'``,
            ``'weight'``, ``'count'`` and ``'vel_list'``; each value is the
            per-batch data concatenated along axis 0.
        """
        losses = []
        # Per-batch pieces, keyed by their name in the returned dict.
        pieces = {
            'ground_truth': [],
            'prediction': [],
            'weight': [],
            'count': [],
            'vel_list': []
        }

        for _ in trange(self.data_loader.sizes[2], desc="Testing"):
            inputs, labels, label_weights, counts, velocities = \
                self.data_loader.next_batch(2)

            outcome = self.model.test(
                self.sess,
                {
                    self.model.cnn_input: inputs,
                    self.model.output_label: labels,
                    self.model.ph_labels_weight: label_weights,
                    self.model.is_training: False
                },
                self.summary_writer,
                with_output=True)

            losses.append(outcome['loss'])
            pieces['ground_truth'].append(labels)
            pieces['weight'].append(label_weights)
            pieces['count'].append(counts)
            pieces['vel_list'].append(velocities)
            pieces['prediction'].append(outcome['pred'])

        result_dict = {}
        for key in ('ground_truth', 'prediction', 'weight', 'count',
                    'vel_list'):
            result_dict[key] = np.concatenate(pieces[key], axis=0)

        test_loss = np.mean(losses)
        print("Test Loss: ", test_loss)

        return result_dict

    def _get_summary_writer(self, result):
        if result['step'] % self.log_step == 0:
            return self.summary_writer
        else:
            return None