Example #1
if __name__ == '__main__':
    cf = fl.FLAGS

    tf.logging.set_verbosity(tf.logging.INFO)

    # tf.set_random_seed(123)
    if cf.mode == 'train':
        train_dataset_path = DATASET_PATH + '/train/train_data'
        tb.make_tfrecords("ir_ph1_v2", "train", train_dataset_path,
                          "./dataset/train/", 4, 3, False)

        files = glob.glob("./dataset/train/*_train*tfrecord")
        print(files)
        files.sort()
        assert len(files) > 0
        num_examples = util.count_records(files)
        global_step = tf.Variable(0, trainable=False)

        image_preprocessing_fn = None
        if cf.preprocessing_name:
            image_preprocessing_fn = preprocessing_factory.get_preprocessing(
                cf.preprocessing_name, is_training=True)

        def train_pre_process(example_proto):
            features = {
                "image/encoded":
                tf.FixedLenFeature((), tf.string, default_value=""),
                "image/class/label":
                tf.FixedLenFeature((), tf.int64, default_value=0),
                'image/height':
                tf.FixedLenFeature((), tf.int64, default_value=0),
                'image/width':
                tf.FixedLenFeature((), tf.int64, default_value=0)
            }
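A minimal sketch (not from the snippet itself, TF 1.x assumed) of how a record parsed with these features is typically decoded into an image tensor; it mirrors the train_pre_process continuation shown in Example #2:

import tensorflow as tf

def decode_example(example_proto):
    # Parse one serialized tf.Example, then decode the JPEG bytes into an
    # HxWx3 uint8 tensor and return it with the integer class label.
    features = {
        "image/encoded": tf.FixedLenFeature((), tf.string, default_value=""),
        "image/class/label": tf.FixedLenFeature((), tf.int64, default_value=0),
    }
    parsed = tf.parse_single_example(example_proto, features)
    image = tf.image.decode_jpeg(parsed["image/encoded"], channels=3)
    return image, parsed["image/class/label"]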
Example #2
        return image, parsed_features["image/class/label"]

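    # Feedable evaluation pipeline: the file list and record count are
    # supplied at run time, so the same graph can embed both the query
    # and the index splits.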
    files_op = tf.placeholder(tf.string, shape=[None], name="files")
    num_examples_op = tf.placeholder(tf.int64, shape=(), name="num_examples")
    dataset = tf.data.TFRecordDataset(files_op)
    dataset = dataset.map(train_pre_process)
    dataset = dataset.batch(num_examples_op)
    iterator = dataset.make_initializable_iterator()
    images, labels = iterator.get_next()

    embedding_op = model_fn.build_model(images, None, args, None, False)

    query_files = glob.glob(os.path.join(args.data_dir, "*_query*tfrecord"))
    query_files.sort()
    assert len(query_files) > 0
    query_num_examples = util.count_records(query_files)
    index_files = glob.glob(os.path.join(args.data_dir, "*_index*tfrecord"))
    index_files.sort()
    assert len(index_files) > 0
    index_num_examples = util.count_records(index_files)

    embedding_batch_size = 512

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())
    saver.restore(sess, tf.train.latest_checkpoint(args.model_dir))

    sess.run(iterator.initializer,
             feed_dict={files_op: query_files,
                        num_examples_op: query_num_examples})

Example #3
def main(cf, hyper_param_txt, hostname):
    tf.logging.set_verbosity(tf.logging.INFO)
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = F.gpu_no
    print("CUDA Visible device", device_lib.list_local_devices())
    start_time = datetime.now().strftime('%Y%m%d%H%M%S')
    start_time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    if not os.path.isdir(cf.save_dir):
        os.makedirs(cf.save_dir)
    f = open(os.path.join(cf.save_dir, "train_parameters_%s.txt" % start_time),
             mode="w+")
    f.write(hyper_param_txt)

    # inputs_ph = tf.placeholder(tf.float32, [None, cf.train_image_size, cf.train_image_size, cf.train_image_channel],
    #                            name="inputs")
    # labels_ph = tf.placeholder(tf.int32, [None], name="labels")
    tf.set_random_seed(123)

    files = glob.glob(os.path.join(cf.data_dir, "*_train*tfrecord"))
    files.sort()
    assert len(files) > 0
    num_examples = util.count_records(files)
    global_step = tf.Variable(0, trainable=False)

    image_preprocessing_fn = None
    if cf.preprocessing_name:
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            cf.preprocessing_name, is_training=True)

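    # sampling_pre_process keeps the image as raw JPEG bytes: decoding is
    # deferred to train_pre_process below, so the pair sampler can shuffle
    # and batch large buffers of cheap, undecoded records.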
    def sampling_pre_process(example_proto):
        features = {
            "image/encoded": tf.FixedLenFeature((),
                                                tf.string,
                                                default_value=""),
            "image/class/label": tf.FixedLenFeature((),
                                                    tf.int64,
                                                    default_value=0),
            'image/height': tf.FixedLenFeature((), tf.int64, default_value=0),
            'image/width': tf.FixedLenFeature((), tf.int64, default_value=0)
        }
        if cf.use_attr:
            features["image/attr"] = tf.VarLenFeature(dtype=tf.int64)

        parsed_features = tf.parse_single_example(example_proto, features)
        image = parsed_features["image/encoded"]

        label = parsed_features["image/class/label"]
        if cf.use_attr:
            return image, label, parsed_features["image/attr"]
        else:
            return image, label

    def train_pre_process(img_string):
        image = tf.image.decode_jpeg(img_string, cf.train_image_channel)

        if image_preprocessing_fn is not None:
            image = image_preprocessing_fn(image, cf.train_image_size,
                                           cf.train_image_size)
        else:
            image = tf.cast(image, tf.float32)

            image = tf.expand_dims(image, 0)
            image = tf.image.resize_image_with_pad(image, cf.train_image_size,
                                                   cf.train_image_size)
            # image = tf.image.resize_bilinear(image, [224, 224], align_corners=False)
            image = tf.squeeze(image, [0])

            image = tf.divide(image, 255.0)
            image = tf.subtract(image, 0.5)
            image = tf.multiply(image, 2.0)

        return image

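    # Second, feedable pipeline: the raw JPEG strings selected by the pair
    # sampler are fed back in here to be decoded and preprocessed in
    # parallel, one training batch at a time.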
    string_img_pl = tf.placeholder(tf.string, (None,))  # vector of raw JPEG strings
    pair_dataset = tf.data.Dataset.from_tensor_slices(string_img_pl)
    pair_dataset = pair_dataset.map(
        train_pre_process, num_parallel_calls=cf.num_preprocessing_threads)
    pair_dataset = pair_dataset.batch(cf.batch_size)
    pair_dataset = pair_dataset.prefetch(cf.batch_size)
    pair_iterator = pair_dataset.make_initializable_iterator()
    pair_images = pair_iterator.get_next()

    steps_each_epoch = int(num_examples / cf.batch_size)
    if num_examples % cf.batch_size > 0:
        steps_each_epoch += 1
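    # Sampling pipeline: each pull yields sampling_buffer_size raw records,
    # giving the pair miner a large pool of labels to find positives in.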
    dataset = tf.data.TFRecordDataset(files)
    dataset = dataset.map(sampling_pre_process,
                          num_parallel_calls=cf.num_preprocessing_threads)
    dataset = dataset.shuffle(cf.shuffle_buffer_size)
    dataset = dataset.repeat()
    dataset = dataset.batch(cf.sampling_buffer_size)
    dataset = dataset.prefetch(cf.sampling_buffer_size)

    iterator = dataset.make_one_shot_iterator()
    # iterator = dataset.make_initializable_iterator()
    if cf.use_attr:
        images, labels, attrs = iterator.get_next()
    else:
        images, labels = iterator.get_next()

    images_ph = tf.placeholder(tf.float32, [
        cf.batch_size, cf.train_image_size, cf.train_image_size,
        cf.train_image_channel
    ],
                               name="inputs")
    labels_ph = tf.placeholder(tf.int32, [cf.batch_size], name="labels")
    if cf.use_attr:
        attrs_ph = tf.placeholder(tf.float32, [cf.batch_size, cf.attr_dim],
                                  name="attrs")
        if not cf.use_attr_net:
            cf.embedding_size = cf.attr_dim
    else:
        attrs_ph = None
    # seed_ph = tf.placeholder(tf.int64, (), name="shuffle_seed")

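    # Build the training graph: the loss, the endpoint activations (used
    # for the summaries below), and the train op.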
    loss_op, end_points, train_op = model_fn.build_model(
        images_ph,
        labels_ph,
        cf,
        attrs_ph,
        True,
        cf.use_attr_net,
        cf.num_hidden_attr_net,
        num_examples,
        global_step,
        use_old_model=cf.use_old_model)
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Add summaries for end_points.
    for end_point in end_points:
        x = end_points[end_point]
        summaries.add(tf.summary.histogram('activations/' + end_point, x))
        summaries.add(
            tf.summary.scalar('sparsity/' + end_point, tf.nn.zero_fraction(x)))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES):
        summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
        summaries.add(tf.summary.histogram(variable.op.name, variable))

    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    if cf.quantize_delay >= 0:
        tf.contrib.quantize.create_training_graph(
            quant_delay=cf.quantize_delay)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)
    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(cf.save_dir, sess.graph)

    epoch = 1
    steps = 1
    latest_epoch = 0
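    # Optionally restore weights from cf.checkpoint_path, skipping any
    # variable scopes listed in cf.checkpoint_exclude_scopes (useful when
    # fine-tuning), and resume the epoch counter parsed from the
    # checkpoint file name.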
    if cf.checkpoint_path is not None and (
            os.path.isfile(cf.checkpoint_path) or
        (os.path.isdir(cf.checkpoint_path)
         and tf.train.latest_checkpoint(cf.checkpoint_path) is not None)):
        latest_checkpoint = tf.train.latest_checkpoint(cf.checkpoint_path)
        exclusions = []
        if cf.checkpoint_exclude_scopes:
            exclusions = [
                scope.strip()
                for scope in cf.checkpoint_exclude_scopes.split(',')
            ]
        variables_to_restore = []
        for var in slim.get_model_variables():
            for exclusion in exclusions:
                if var.op.name.startswith(exclusion):
                    break
            else:
                variables_to_restore.append(var)

        saver_for_restore = tf.train.Saver(var_list=variables_to_restore,
                                           max_to_keep=cf.keep_checkpoint_max)
        if os.path.isdir(cf.checkpoint_path) and tf.train.latest_checkpoint(
                cf.checkpoint_path) is not None:
            cp = tf.train.latest_checkpoint(cf.checkpoint_path)
        else:
            cp = cf.checkpoint_path
        saver_for_restore.restore(sess, cp)
        if os.path.isdir(cf.checkpoint_path) and tf.train.latest_checkpoint(
                cf.checkpoint_path) is not None:
            latest_epoch = int(
                os.path.basename(latest_checkpoint).split("-")[1])
            epoch = latest_epoch + 1
            cf.max_number_of_epochs += latest_epoch
        f.write("%s:%s\n" % ("restore_checkpoint", latest_checkpoint))
    saver = tf.train.Saver(tf.global_variables(),
                           max_to_keep=cf.keep_checkpoint_max)
    f.close()
    num_trained_images = 0
    last_saved_epoch = None
    last_saved_step = None
    start_avg_loss_steps = 10
    start_total_loss = 0.
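    # Training loop: draw a sampling buffer, mine label pairs from it,
    # decode the selected images, then run one optimization step.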
    while True:
        # sess.run(iterator.initializer, feed_dict={seed_ph: steps})
        try:
            start = time.time()
            if cf.use_attr:
                tmp_images, tmp_labels, tmp_attrs = sess.run(
                    [images, labels, attrs])
                tmp_attrs = np.reshape(tmp_attrs.values,
                                       [cf.sampling_buffer_size, cf.attr_dim])
                tmp_attrs = tmp_attrs.astype(np.float32)  # match attrs_ph dtype
            else:
                tmp_images, tmp_labels = sess.run([images, labels])

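            # Mine positive pairs: an index joins pair_indices as soon as
            # its label has been seen before in this buffer; labels seen
            # only once are kept as fall-back singles to pad the batch up
            # to cf.batch_size.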
            pair_indices = set()
            single_index_map = {}
            label_buffer = {}
            for i, tmp_label in enumerate(tmp_labels):
                if tmp_label in label_buffer:
                    pair_indices.add(i)
                    pair_indices.add(label_buffer[tmp_label])
                    if tmp_label in single_index_map:
                        del single_index_map[tmp_label]
                else:
                    label_buffer[tmp_label] = i
                    single_index_map[tmp_label] = i
            pair_indices = list(pair_indices)
            if len(pair_indices) > cf.batch_size:
                pair_indices = pair_indices[:cf.batch_size]
            elif len(pair_indices) < cf.batch_size:
                pair_indices += list(
                    single_index_map.values())[:cf.batch_size -
                                               len(pair_indices)]
            # print(pair_indices)
            batch_images = tmp_images[pair_indices]
            sess.run(pair_iterator.initializer,
                     feed_dict={string_img_pl: batch_images})
            batch_images = sess.run(pair_images)

            batch_labels = tmp_labels[pair_indices]
            if cf.use_attr:
                batch_attrs = tmp_attrs[pair_indices]

            sampling_time = time.time() - start
            tmp_images = None
            tmp_labels = None
            start = time.time()
            feed_dict = {images_ph: batch_images, labels_ph: batch_labels}
            if cf.use_attr:
                feed_dict[attrs_ph] = batch_attrs
            if steps % cf.save_summaries_steps == 0:
                loss, _, summary = sess.run([loss_op, train_op, summary_op],
                                            feed_dict=feed_dict)
                summary_writer.add_summary(summary, steps)
            else:
                loss, _ = sess.run([loss_op, train_op], feed_dict=feed_dict)
            if steps <= start_avg_loss_steps:
                start_total_loss += loss
            train_time = time.time() - start

            if steps % cf.log_every_n_steps == 0:
                now = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
                print(
                    "[%s: %d epoch(%d/%d), %d steps] sampling time: %f, train time: %f, loss: %f"
                    % (now, epoch, steps % steps_each_epoch, steps_each_epoch,
                       steps, sampling_time, train_time, loss))
            num_trained_images += cf.batch_size

            if cf.use_save_steps:
                if steps % cf.save_interval_steps == 0:
                    saver.save(sess, cf.save_dir + "/model.ckpt", steps)
                    last_saved_step = steps

            if cf.max_number_of_steps is not None and steps >= cf.max_number_of_steps:
                break
            steps += 1

            if num_trained_images >= num_examples:
                if not cf.use_save_steps and cf.save_interval_epochs >= 1 and (
                        epoch - latest_epoch) % cf.save_interval_epochs == 0:
                    saver.save(sess, cf.save_dir + "/model.ckpt", epoch)
                    last_saved_epoch = epoch
                if epoch >= cf.max_number_of_epochs:
                    break
                epoch += 1
                num_trained_images = 0

        except tf.errors.OutOfRangeError:
            break

    if cf.use_save_steps:
        if last_saved_step is None or last_saved_step < steps:
            saver.save(sess, cf.save_dir + "/model.ckpt", steps)
    else:
        if last_saved_epoch is None or last_saved_epoch < epoch:
            saver.save(sess, cf.save_dir + "/model.ckpt", epoch)

    summary_writer.add_summary(sess.run(summary_op, feed_dict=feed_dict),
                               steps)

    sess.close()
    tf.reset_default_graph()

    if cf.notify_after_training:
        txt = "%s[%s]\n\n" % (hostname,
                              socket.gethostbyname(socket.gethostname()))
        txt += "start avg loss : %f" % (start_total_loss /
                                        start_avg_loss_steps)
        txt += "last loss : %f" % loss
        txt += "start time: %s\n" % start_time_str
        txt += "end time: %s\n" % datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        if cf.eval_after_training:
            txt += "going to evaluate"
        else:
            txt += "not going to evaluate"
        txt += "\n[params]\n"
        txt += hyper_param_txt
        util.send_msg_to_slack(
            "\n\n==================================\nTraining is Done\n" + txt)

    if cf.eval_after_training:
        cuda.select_device(0)
        cuda.close()
        eval_cmd = 'python -u multiple_search_models.py --model_dir="%s" --embedding_size=%d --data_dir="%s" --model_name=%s --max_top_k=%d --shutdown_after_train=%d --gpu_no=%s --step_type=%s --image_size=%s --eval_batch_size=%d --preprocessing_name=%s --notify_after_training=%d --use_old_model=%d --save_static_data=%d' % (
            cf.save_dir, cf.embedding_size, cf.data_dir, cf.model_name,
            cf.eval_max_top_k, 1 if cf.shutdown_after_train else 0, cf.gpu_no,
            "step" if cf.use_save_steps else "epoch", cf.train_image_size,
            cf.eval_batch_size, cf.preprocessing_name,
            1 if cf.notify_after_training else 0, 1 if cf.use_old_model else 0,
            1 if cf.save_static_data else 0)
        print(eval_cmd)
        os.system(eval_cmd)
    else:
        if cf.shutdown_after_train:
            os.system("sudo shutdown now")