Example #1
    def execute(self):
        result_file = os.path.join(self.result_dir, "train_result_{}.txt".format(self.task_index))
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(self.server.target, config=config) as sess:
            K.set_session(sess)
            if self.go_on:
                self.restore_model()
            tb_callback = TensorBoard(log_dir=self.log_dir, write_grads=True, write_images=True)
            ckpt_callback = ModelCheckpoint(self.checkpoint_path,
                                            monitor='loss',
                                            save_weights_only=True)
            reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.1, patience=3, verbose=1)
            early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=10, verbose=1)

            # add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
            callbacks = [tb_callback, ckpt_callback] if self.task_index == 0 else []

            callbacks += [reduce_lr, early_stopping]
            # try:
            his = self.model.fit_generator(self.generate_rdd_data(),
                                           steps_per_epoch=self.steps_per_epoch,
                                           # validation_data=self.val_generate_data(val_data),
                                           # validation_steps=max(1, self.val_num // self.batch_size),
                                           epochs=self.epochs + self.initial_epoch,
                                           initial_epoch=self.initial_epoch,
                                           workers=0,
                                           callbacks=callbacks)
            logger.debug("{}-{}".format(self.task_index, his.history))
            ModelDir.write_result(result_file, self.get_results(his), self.go_on)
            # except Exception as e:
            #     logger.debug(str(e))
            self.save_model()
            self.tf_feed.terminate()
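The fit_generator and predict_generator calls in these examples all consume self.generate_rdd_data(), which is not shown here. A minimal sketch of what such a generator might look like, assuming self.tf_feed is a TensorFlowOnSpark TFNode.DataFeed, each record is a (features, label) pair, and numpy is imported as np:

def generate_rdd_data(self):
    # Hypothetical sketch, not the original implementation: pull batches from
    # the TensorFlowOnSpark data feed until the driver stops feeding the RDD.
    while not self.tf_feed.should_stop():
        batch = self.tf_feed.next_batch(self.batch_size)
        if not batch:
            return
        xs = np.array([row[0] for row in batch])
        ys = np.array([row[1] for row in batch])
        yield xs, ys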
Example #2
    def execute(self):
        result_file = os.path.join(
            self.result_dir,
            "recurrent_predict_result_{}.txt".format(self.task_index))
        with tf.Session(self.server.target) as sess:
            K.set_session(sess)
            self.load_model()
            for x, y in self.generate_rdd_data():
                x_len = x.shape[1]
                if x_len < self.units:
                    break
                x_train = np.array(x[:self.units])
                for _ in range(self.steps):
                    ys = self.model.predict(x_train, batch_size=1)
                    y_label = np.argmax(ys, 1)
                    if self.feature_type == 'one_hot':
                        shape = ys.shape
                        y_l = np.zeros(shape)
                        y_l[..., y_label] = 1
                        x_train = np.array(
                            [x_train.tolist()[0][1:] + y_l.tolist()])
                    else:
                        x_train = np.array(
                            [x_train.tolist()[0][1:] + y_label.tolist()])

                    ModelDir.write_str(result_file,
                                       str(y_label.tolist()[0]) + " ", True)
                ModelDir.write_str(result_file, "\n", True)

            self.tf_feed.terminate()
            self.delete_tmp_dir()
Example #3
    def execute(self):
        result_file = os.path.join(
            self.result_dir, "train_result_{}.txt".format(self.task_index))
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.Session(self.server.target, config=config) as sess:
            K.set_session(sess)
            if self.go_on:
                self.restore_model()
            tb_callback = TensorBoard(log_dir=self.log_dir,
                                      write_grads=True,
                                      write_images=True)
            ckpt_callback = ModelCheckpoint(self.checkpoint_path,
                                            monitor='loss',
                                            save_weights_only=True)

            # add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
            callbacks = ([tb_callback, ckpt_callback]
                         if self.task_index == 0 else None)

            # train on data read from a generator which is producing data from a Spark RDD
            his = self.model.fit_generator(
                generator=self.generate_rdd_data(),
                steps_per_epoch=self.steps_per_epoch,
                epochs=self.epochs + self.initial_epoch,
                callbacks=callbacks,
                workers=0,
                initial_epoch=self.initial_epoch)
            self.save_model()
            ModelDir.write_result(result_file, self.get_results(his),
                                  self.go_on)
            self.tf_feed.terminate()
Example #4
    def execute(self):
        result_file = os.path.join(
            self.result_dir, "train_result_{}.txt".format(self.task_index))
        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.summary.merge_all()
        # Start running operations on the Graph.
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=self.gpu_memory_fraction)
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options, log_device_placement=False)) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            summary_writer = tf.summary.FileWriter(self.log_dir, sess.graph)
            coord = tf.train.Coordinator()
            tf.train.start_queue_runners(coord=coord, sess=sess)

            steps = 0
            while not coord.should_stop() and not self.tf_feed.should_stop():
                if self.go_on:
                    self.restore_model(sess)

                # Training and validation loop
                print('Running training')
                image_list, label_list = self.get_data()
                # Enqueue one epoch of image paths and labels
                labels_array = np.expand_dims(np.array(label_list), 1)
                image_paths_array = np.expand_dims(np.array(image_list), 1)
                control_value = facenet.RANDOM_ROTATE * self.random_rotate + \
                                facenet.RANDOM_CROP * self.random_crop + \
                                facenet.RANDOM_FLIP * self.random_flip + \
                                facenet.FIXED_STANDARDIZATION * self.use_fixed_image_standardization
                control_array = np.ones_like(labels_array) * control_value
                enqueue_op = tf.get_collection(OUTPUTS)[0]
                feed_dict = dict(
                    zip(tf.get_collection(INPUTS),
                        [image_paths_array, labels_array, control_array]))
                sess.run(enqueue_op, feed_dict)

                self.model.add_params(batch_size=self.batch_size,
                                      steps_per_epoch=self.steps_per_epoch,
                                      phase_train=True,
                                      n_classes=self.n_classes)
                keys = ["_task_index", "_epoch"]
                for epoch in range(1, self.epochs + 1):
                    for _ in range(self.steps_per_epoch - 1):
                        sess.run(self.model.fetches,
                                 feed_dict=self.model.feed_dict)
                    res = sess.run(self.model.fetches + [summary_op],
                                   feed_dict=self.model.feed_dict)
                    steps = sess.run(self.global_step)
                    summary_writer.add_summary(res[-1], global_step=steps)
                    results = [dict(zip(keys, res))]
                    ModelDir.write_result(result_file, results, True)
            summary = tf.Summary()
            summary_writer.add_summary(summary, global_step=steps)
            self.tf_feed.terminate()
Example #5
    def train(self, save_dir, result_dir, checkpoint_dir, log_dir):
        result_file = os.path.join(result_dir, "train_result.txt")
        train_set = self.train_set
        config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        with tf.compat.v1.Session(config=config) as sess:
            # K.set_session(sess)
            if self.go_on:
                self.restore_model(checkpoint_dir)
            tb_callback = TensorBoard(log_dir=log_dir, write_images=True)
            checkpoint_file = os.path.join(checkpoint_dir,
                                           self.name + '_checkpoint_{epoch}')
            ckpt_callback = ModelCheckpoint(
                checkpoint_file,
                # monitor='loss',
                save_weights_only=True)
            reduce_lr = ReduceLROnPlateau(monitor='loss',
                                          factor=0.1,
                                          patience=3,
                                          verbose=1)
            early_stopping = EarlyStopping(monitor='loss',
                                           min_delta=0,
                                           patience=10,
                                           verbose=1)

            # callbacks to save model checkpoints and TensorBoard events
            callbacks = [tb_callback, ckpt_callback]
            # callbacks = []

            self.model.compile(optimizer=Adam(lr=1e-4),
                               loss={
                                   'yolo_loss': lambda y_true, y_pred: y_pred
                               })
            # print('Unfreeze all of the layers.')
            callbacks.extend([reduce_lr, early_stopping])
            steps_per_epoch = len(train_set) // self.batch_size
            # note that more GPU memory is required after unfreezing the body
            # try:
            his = self.model.fit_generator(
                self.train_generate_data(train_set),
                steps_per_epoch=steps_per_epoch,
                # validation_data=self.val_generate_data(val_data),
                # validation_steps=max(1, self.val_num // self.batch_size),
                epochs=self.initial_epoch + 1,
                initial_epoch=self.initial_epoch,
                workers=1,
                callbacks=callbacks)
            logger.debug(str(his.history))
            # except Exception as e:
            #     logger.debug(str(e))
            # logger.debug('end')
            save_model_path = os.path.join(save_dir, 'model.h5')
            self.model.save(save_model_path)
            ModelDir.write_result(result_file, self.get_results(his))
Example #6
 def execute(self):
     result_file = os.path.join(
         self.result_dir, "predict_result_{}.txt".format(self.task_index))
     with tf.Session(self.server.target) as sess:
         K.set_session(sess)
         self.load_model()
         his = self.model.predict_generator(self.generate_rdd_data(),
                                            steps=self.steps_per_epoch)
         ModelDir.write_result(result_file, self.get_results(his))
         self.tf_feed.terminate()
         self.delete_tmp_dir()
Example #7
 def train(self,
           data_rdd,
           model_rdd,
           batch_size,
           epochs,
           model_dir,
           go_on=False):
     n_samples = data_rdd.count()
     # steps_per_epoch = n_samples // batch_size // self.num_workers
     steps_per_epoch = math.ceil(n_samples / batch_size / self.num_workers)
     assert steps_per_epoch > 0
     md = ModelDir(model_dir, 'train*')
     if go_on:
         md.create_model_dir()
     else:
         md = md.rebuild_model_dir()
     worker = TFTrainWorker(model_rdd,
                            go_on=go_on,
                            batch_size=batch_size,
                            epochs=epochs,
                            steps_per_epoch=steps_per_epoch,
                            **md.to_dict())
     cluster = TFCluster.run(self.sc,
                             worker,
                             self.tf_args,
                             self.cluster_size,
                             self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd.rdd, num_epochs=epochs, feed_timeout=60000)
     cluster.shutdown()
     results = md.read_result()
     return self.sqlc.createDataFrame(results)
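Examples #7 and #11 through #15 all follow the same driver skeleton: compute steps per epoch from the RDD, build a worker, launch it with TFCluster.run, feed the Spark RDD through cluster.train, shut the cluster down, and read the results back from the model directory. A stripped-down sketch of that skeleton (the method name and worker argument here are placeholders, not the classes used above):

def run_on_cluster(self, data_rdd, worker, epochs=1):
    # Illustrative outline of the shared TFCluster driver pattern; the real
    # methods above add ModelDir bookkeeping around these calls.
    cluster = TFCluster.run(self.sc, worker, self.tf_args,
                            self.cluster_size, self.num_ps,
                            input_mode=self.input_mode)
    cluster.train(data_rdd.rdd, num_epochs=epochs)  # stream RDD partitions to the workers
    cluster.shutdown()  # block until every worker has finished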
Example #8
 def execute(self):
     result_file = os.path.join(
         self.result_dir, "predict_result_{}.txt".format(self.task_index))
     config = tf.ConfigProto(allow_soft_placement=True)
     config.gpu_options.allow_growth = True
     with tf.Session(self.server.target, config=config) as sess:
         self.load_model(sess)
         for _ in range(self.steps_per_epoch):
             x, y = self.generate_rdd_data
             if len(x) == 0:
                 break
             predictions = sess.run(self.model.outputs['y'],
                                    self.feed_dict(x=x))
             y_pred = np.argmax(predictions, 1)
             y_true = np.argmax(y, 1) if y is not None else None
             logger.debug(predictions)
             results = self.get_results(y_pred, y_true)
             ModelDir.write_result(result_file, results, True)
         self.tf_feed.terminate()
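Examples #8 and #9 run the graph with self.feed_dict(...), a helper that is not shown. A possible sketch, assuming the model exposes its input placeholders in a dict the same way self.model.outputs is used above (the key names are illustrative):

def feed_dict(self, x, y=None):
    # Hypothetical helper: map a batch onto the graph's input placeholders.
    feed = {self.model.inputs['x']: x}
    if y is not None:
        feed[self.model.inputs['y']] = y
    return feed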
Example #9
    def execute(self):
        result_file = os.path.join(
            self.result_dir, "train_result_{}.txt".format(self.task_index))
        config = tf.ConfigProto(allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        summary_op = tf.summary.merge_all()
        with tf.Session(self.server.target, config=config) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            self.summary_writer = tf.summary.FileWriter(
                self.log_dir, sess.graph)
            coord = tf.train.Coordinator()
            tf.train.start_queue_runners(coord=coord, sess=sess)

            if self.go_on:
                self.restore_checkpoint(sess)
            names, values = zip(*self.model.fetches.items())
            names = list(names)
            values = list(values)
            res, summary_str = None, None
            for epoch in range(1, self.epochs + 1):
                for _ in range(self.steps_per_epoch):
                    x, y = self.generate_rdd_data
                    if len(x) == 0:
                        break
                    if summary_op is not None:
                        *res, summary_str = sess.run(values + [summary_op],
                                                     self.feed_dict(x=x, y=y))
                    else:
                        res = sess.run(values, self.feed_dict(x=x, y=y))
                result = dict(
                    (k, v) for k, v in zip(names, res) if v is not None)
                result.update(self.common_dict(epoch + self.initial_epoch))
                ModelDir.write_result(result_file, [result], True)
                self.save_checkpoint(sess, epoch + self.initial_epoch,
                                     summary_str)

            self.model.write_model(self.model_config_path, False)
            self.save_model(sess)
            self.tf_feed.terminate()
Example #10
 def main(self):
     md = ModelDir(self.model_dir, 'train*')
     if self.go_on:
         md.create_model_dir()
     else:
         md = md.rebuild_model_dir()
     self.build_model()
     self.train(**md.to_dict())
Example #11
 def evaluate(self, data_rdd, steps, model_dir):
     md = ModelDir(model_dir, 'evaluate*')
     steps_per_epoch = data_rdd.count() if steps <= 0 else steps
     steps_per_epoch = math.ceil(steps_per_epoch / self.num_workers)
     worker = EvaluateWorker(steps_per_epoch=steps_per_epoch, **md.to_dict())
     md.delete_result_file()
     cluster = TFCluster.run(self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd.rdd, num_epochs=1)
     cluster.shutdown()
     results = md.read_result()
     return self.sqlc.createDataFrame(results)
Example #12
 def recurrent_predict(self, data_rdd, units, steps, feature_type, model_dir):
     md = ModelDir(model_dir, 'recurrent_predict*')
     worker = RecurrentPredictWorker(units=units,
                                     steps=steps,
                                     feature_type=feature_type,
                                     **md.to_dict())
     md.delete_result_file()
     cluster = TFCluster.run(self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd.rdd, num_epochs=1, feed_timeout=6000)
     cluster.shutdown()
     results = md.read_result(True)
     return self.sqlc.createDataFrame([{"result": result} for result in results])
Example #13
 def yolov3_tiny_train(self,
                       model_rdd,
                       batch_size,
                       epochs,
                       classes_path,
                       anchors_path,
                       train_path,
                       val_path,
                       image_size,
                       model_dir,
                       weights_path=None,
                       freeze_body=2,
                       go_on=False):
     columns = model_rdd.columns
     assert "model_config" in columns, "not exists model layer config!"
     assert tf.io.gfile.exists(train_path), "train dataset path not exists!"
     data_rdd = self.sc.textFile(train_path)
     n_samples = data_rdd.count()
     steps_per_epoch = math.ceil(n_samples / batch_size / self.num_workers)
     md = ModelDir(model_dir, 'train*')
     if go_on:
         md.create_model_dir()
     else:
         md = md.rebuild_model_dir()
     worker = YOLOV3TinyModelTrainWorker(model_rdd,
                                         go_on=go_on,
                                         batch_size=batch_size,
                                         epochs=epochs,
                                         classes_path=classes_path,
                                         anchors_path=anchors_path,
                                         weights_path=weights_path,
                                         val_path=val_path,
                                         image_size=image_size,
                                         steps_per_epoch=steps_per_epoch,
                                         freeze_body=freeze_body,
                                         **md.to_dict())
     cluster = TFCluster.run(self.sc,
                             worker,
                             self.tf_args,
                             self.cluster_size,
                             self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd, num_epochs=epochs, feed_timeout=60000)
     cluster.shutdown()
     results = md.read_result()
     return self.sqlc.createDataFrame(results)
Example #14
 def predict(self, data_rdd, steps, model_dir, output_prob=False):
     md = ModelDir(model_dir, 'predict*')
     steps_per_epoch = data_rdd.count() if steps <= 0 else steps
     steps_per_epoch = math.ceil(steps_per_epoch / self.num_workers)
     worker = PredictWorker(steps_per_epoch=steps_per_epoch,
                            output_prob=output_prob,
                            **md.to_dict())
     md.delete_result_file()
     cluster = TFCluster.run(self.sc, worker, self.tf_args, self.cluster_size, self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd.rdd, num_epochs=1, feed_timeout=6000)
     cluster.shutdown()
     results = md.read_result()
     return self.sqlc.createDataFrame(results)
Example #15
 def yolov3_train(self,
                  model_rdd,
                  data_dir,
                  batch_size,
                  epochs,
                  image_size,
                  model_dir,
                  weights_path=None,
                  freeze_body=2,
                  go_on=False):
     train_path = os.path.join(data_dir, 'train.txt')
      assert tf.io.gfile.exists(train_path), "train dataset path does not exist!"
     data_rdd = self.sc.textFile(train_path)
     n_samples = data_rdd.count()
     steps_per_epoch = math.ceil(n_samples / batch_size / self.num_workers)
     md = ModelDir(model_dir, 'train*')
     if go_on:
         md.create_model_dir()
     else:
         md = md.rebuild_model_dir()
     worker = YOLOV3ModelTrainWorker(model_rdd,
                                     data_dir,
                                     go_on=go_on,
                                     batch_size=batch_size,
                                     epochs=epochs,
                                     image_size=image_size,
                                     steps_per_epoch=steps_per_epoch,
                                     freeze_body=freeze_body,
                                     **md.to_dict())
     cluster = TFCluster.run(self.sc,
                             worker,
                             self.tf_args,
                             self.cluster_size,
                             self.num_ps,
                             input_mode=self.input_mode)
     cluster.train(data_rdd, num_epochs=epochs, feed_timeout=60000)
     cluster.shutdown()
     results = md.read_result()
     if results:
         return self.sqlc.createDataFrame(results)