Example #1
            dataset = dataset.prefetch(cf.batch_size * 8)

        iterator = dataset.make_one_shot_iterator()
        images, labels = iterator.get_next()
    else:
        num_examples = None
        global_step = None

    images_ph = tf.placeholder(tf.float32, [
        None, cf.train_image_size, cf.train_image_size, cf.train_image_channel
    ],
                               name="inputs")
    labels_ph = tf.placeholder(tf.int32, [None], name="labels")
    if cf.mode == 'train':
        if cf.use_pair_sampling:
            loss_op, end_points, train_op, embeddings_op = model_fn.build_model(
                images_ph, labels_ph, cf, True, num_examples, global_step)
        else:
            loss_op, end_points, train_op, embeddings_op = model_fn.build_model(
                images, labels, cf, True, num_examples, global_step)
    else:
        embeddings_op = model_fn.build_model(images_ph,
                                             labels_ph,
                                             cf,
                                             is_training=False)

    if cf.mode == 'train':
        if cf.fine_tuning and cf.model_name in pretrained_map:
            import urllib.request
            import tarfile

            pretrained_url = pretrained_map[cf.model_name]
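            # the snippet is truncated here; given the urllib.request and
            # tarfile imports above, a minimal sketch of the usual next step
            # (assuming the archive is unpacked into cf.save_dir):
            #   archive_path, _ = urllib.request.urlretrieve(pretrained_url)
            #   tarfile.open(archive_path).extractall(cf.save_dir)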
Example #2
    assert os.path.isfile(args.image), "Image {} not found".format(args.image)

    image_string = tf.read_file(args.image)
    image_decoded = tf.image.decode_jpeg(image_string, channels=3)
    image = tf.image.convert_image_dtype(image_decoded, tf.float32)
    resized_image = tf.image.resize_images(image, [params.image_size, params.image_size])
    image = tf.clip_by_value(resized_image, 0.0, 1.0)
    image = tf.expand_dims(image, 0)

    inputs = {"images": image}

    print(image.get_shape().as_list())
    
    # Building model
    with tf.variable_scope('model'):
        logits = build_model(False, inputs, params) # logits shape: (1, 6)
   
    predictions = tf.argmax(logits, 1)             # index of the max logit along axis 1 (the class axis)
    probs = tf.nn.softmax(logits=logits)            
    
    # list all the variables in the graph
    # for var in tf.global_variables():
    #    print(var)

    # from tensorflow.contrib.framework.python.framework import checkpoint_utils
    # var_list = checkpoint_utils.list_variables(os.path.join(args.model_dir, args.restore_from))
    # for var in var_list:
    #    print(var)


    # Initialize tf.Saver
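    # A minimal sketch of the restore-and-predict step the comment above
    # points to; args.model_dir and args.restore_from are taken from the
    # commented-out checkpoint inspection code, everything else is assumed:
    saver = tf.train.Saver()
    with tf.Session() as sess:
        saver.restore(sess, os.path.join(args.model_dir, args.restore_from))
        pred, prob = sess.run([predictions, probs])
        print("prediction:", pred, "class probabilities:", prob)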
Example #3
    def infer(queries, db):
        def _parse_function(filename):
            image_string = tf.read_file(filename)
            image_decoded = tf.image.decode_jpeg(image_string, channels=3)
            eval_image_size = cf.train_image_size
            if cf.preprocessing_name is not None:
                image_preprocessing_fn = preprocessing_factory.get_preprocessing(cf.preprocessing_name,
                                                                                 is_training=False)
                image_decoded = image_preprocessing_fn(image_decoded, eval_image_size, eval_image_size)
            else:
                image = tf.cast(image_decoded, tf.float32)

                image = tf.expand_dims(image, 0)
                image = tf.image.resize_image_with_pad(image, cf.train_image_size, cf.train_image_size)
                image = tf.squeeze(image, [0])

                image = tf.divide(image, 255.0)
                image = tf.subtract(image, 0.5)
                image_decoded = tf.multiply(image, 2.0)
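                # the divide/subtract/multiply sequence above maps pixel
                # values from [0, 255] to [-1, 1] (Inception-style scaling)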

            return image_decoded

        dataset_queries = tf.data.Dataset.from_tensor_slices(queries)
        dataset_queries = dataset_queries.map(_parse_function)
        dataset_queries = dataset_queries.batch(len(queries))
        iterator = dataset_queries.make_one_shot_iterator()
        features = iterator.get_next()
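        # `sess` below is assumed to be a TF session created in the enclosing
        # scope (e.g. by the surrounding NSML bind code); it is not defined
        # in this snippet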
        query_imgs = sess.run(features)

        dataset_db = tf.data.Dataset.from_tensor_slices(db)
        dataset_db = dataset_db.map(_parse_function)
        dataset_db = dataset_db.batch(len(db))
        iterator_db = dataset_db.make_one_shot_iterator()
        features_db = iterator_db.get_next()
        db_imgs = sess.run(features_db)

        checkpoints = cf.nsml_eval_checkpoints.split(",")
        sim_matrix = None
        model_names = cf.nsml_eval_models.split(",")
        eval_sessions = cf.nsml_eval_sessions.split(",")

        embedding_nums = [int(v) for v in cf.nsml_eval_embeddings.split(",")]
        for i, cp in enumerate(checkpoints):
            tf.reset_default_graph()
            images_ph = tf.placeholder(tf.float32,
                                       [None, cf.train_image_size, cf.train_image_size, cf.train_image_channel],
                                       name="inputs")
            query_feed_dict = {images_ph: query_imgs}
            index_feed_dict = {images_ph: db_imgs}
            model_cf = {"model_name": model_names[i], "embedding_size": embedding_nums[i]}
            embeddings_op = model_fn.build_model(images_ph, None, model_cf, is_training=False)

            tf_config = tf.ConfigProto()
            tf_config.gpu_options.allow_growth = True
            global_sess = tf.Session(config=tf_config)
            global_sess.run(tf.global_variables_initializer())

            nsml.load(checkpoint=cp, session=eval_sessions[i])
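            # nsml.load restores the weights saved under the given NSML
            # session/checkpoint, overwriting the fresh initialization above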

            query_vecs = global_sess.run(embeddings_op, feed_dict=query_feed_dict)
            reference_vecs = global_sess.run(embeddings_op, feed_dict=index_feed_dict)

            print('test data load queries {} query_img {} references {} reference_img {}'.
                  format(len(queries), len(query_imgs), len(db), len(db_imgs)))

            print('inference start')

            # l2 normalization
            query_vecs = l2_normalize(query_vecs)
            reference_vecs = l2_normalize(reference_vecs)
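            # l2_normalize is defined elsewhere in this project; it is assumed
            # to row-normalize, i.e. v / np.linalg.norm(v, axis=1, keepdims=True),
            # so the dot products below are cosine similarities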

            # Calculate cosine similarity
            if sim_matrix is None:
                sim_matrix = np.dot(query_vecs, reference_vecs.T)
            else:
                sim_matrix += np.dot(query_vecs, reference_vecs.T)
        sim_matrix /= len(checkpoints)
        retrieval_results = {}

        for (i, query) in enumerate(queries):
            query = query.split('/')[-1].split('.')[0]
            sim_list = zip(db, sim_matrix[i].tolist())
            sorted_sim_list = sorted(sim_list, key=lambda x: x[1], reverse=True)

            ranked_list = [k.split('/')[-1].split('.')[0] for (k, v) in sorted_sim_list]  # ranked list

            retrieval_results[query] = ranked_list
        print('done')

        return list(zip(range(len(retrieval_results)), retrieval_results.items()))

def main(cf, hyper_param_txt, hostname):
    tf.logging.set_verbosity(tf.logging.INFO)
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = F.gpu_no
    print("CUDA Visible device", device_lib.list_local_devices())
    start_time = datetime.now().strftime('%Y%m%d%H%M%S')
    start_time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    if not os.path.isdir(cf.save_dir):
        os.makedirs(cf.save_dir)
    f = open(os.path.join(cf.save_dir, "train_parameters_%s.txt" % start_time),
             mode="w+")
    f.write(hyper_param_txt)

    # inputs_ph = tf.placeholder(tf.float32, [None, cf.train_image_size, cf.train_image_size, cf.train_image_channel],
    #                            name="inputs")
    # labels_ph = tf.placeholder(tf.int32, [None], name="labels")
    tf.set_random_seed(123)

    files = glob.glob(os.path.join(cf.data_dir, "*_train*tfrecord"))
    files.sort()
    assert len(files) > 0
    num_examples = util.count_records(files)
    global_step = tf.Variable(0, trainable=False)

    image_preprocessing_fn = None
    if cf.preprocessing_name:
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            cf.preprocessing_name, is_training=True)

    def sampling_pre_process(example_proto):
        features = {
            "image/encoded": tf.FixedLenFeature((),
                                                tf.string,
                                                default_value=""),
            "image/class/label": tf.FixedLenFeature((),
                                                    tf.int64,
                                                    default_value=0),
            'image/height': tf.FixedLenFeature((), tf.int64, default_value=0),
            'image/width': tf.FixedLenFeature((), tf.int64, default_value=0)
        }
        if cf.use_attr:
            features["image/attr"] = tf.VarLenFeature(dtype=tf.int64)

        parsed_features = tf.parse_single_example(example_proto, features)
        image = parsed_features["image/encoded"]

        label = parsed_features["image/class/label"]
        if cf.use_attr:
            return image, label, parsed_features["image/attr"]
        else:
            return image, label

    def train_pre_process(img_string):
        image = tf.image.decode_jpeg(img_string, cf.train_image_channel)

        if image_preprocessing_fn is not None:
            image = image_preprocessing_fn(image, cf.train_image_size,
                                           cf.train_image_size)
        else:
            image = tf.cast(image, tf.float32)

            image = tf.expand_dims(image, 0)
            image = tf.image.resize_image_with_pad(image, cf.train_image_size,
                                                   cf.train_image_size)
            # image = tf.image.resize_bilinear(image, [224, 224], align_corners=False)
            image = tf.squeeze(image, [0])

            image = tf.divide(image, 255.0)
            image = tf.subtract(image, 0.5)
            image = tf.multiply(image, 2.0)

        return image

    string_img_pl = tf.placeholder(tf.string, [None])
    pair_dataset = tf.data.Dataset.from_tensor_slices(string_img_pl)
    pair_dataset = pair_dataset.map(
        train_pre_process, num_parallel_calls=cf.num_preprocessing_threads)
    pair_dataset = pair_dataset.batch(cf.batch_size)
    pair_dataset = pair_dataset.prefetch(cf.batch_size)
    pair_iterator = pair_dataset.make_initializable_iterator()
    pair_images = pair_iterator.get_next()

    steps_each_epoch = int(num_examples / cf.batch_size)
    if num_examples % cf.batch_size > 0:
        steps_each_epoch += 1
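    # i.e. steps_each_epoch = ceil(num_examples / batch_size)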
    dataset = tf.data.TFRecordDataset(files)
    dataset = dataset.map(sampling_pre_process,
                          num_parallel_calls=cf.num_preprocessing_threads)
    dataset = dataset.shuffle(cf.shuffle_buffer_size)
    dataset = dataset.repeat()
    dataset = dataset.batch(cf.sampling_buffer_size)
    dataset = dataset.prefetch(cf.sampling_buffer_size)

    iterator = dataset.make_one_shot_iterator()
    # iterator = dataset.make_initializable_iterator()
    if cf.use_attr:
        images, labels, attrs = iterator.get_next()
    else:
        images, labels = iterator.get_next()

    images_ph = tf.placeholder(tf.float32, [
        cf.batch_size, cf.train_image_size, cf.train_image_size,
        cf.train_image_channel
    ],
                               name="inputs")
    labels_ph = tf.placeholder(tf.int32, [cf.batch_size], name="labels")
    if cf.use_attr:
        attrs_ph = tf.placeholder(tf.float32, [cf.batch_size, cf.attr_dim],
                                  name="attrs")
        if not cf.use_attr_net:
            cf.embedding_size = cf.attr_dim
    else:
        attrs_ph = None
    # seed_ph = tf.placeholder(tf.int64, (), name="shuffle_seed")

    loss_op, end_points, train_op = model_fn.build_model(
        images_ph,
        labels_ph,
        cf,
        attrs_ph,
        True,
        cf.use_attr_net,
        cf.num_hidden_attr_net,
        num_examples,
        global_step,
        use_old_model=cf.use_old_model)
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    # Add summaries for end_points.
    for end_point in end_points:
        x = end_points[end_point]
        summaries.add(tf.summary.histogram('activations/' + end_point, x))
        summaries.add(
            tf.summary.scalar('sparsity/' + end_point, tf.nn.zero_fraction(x)))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES):
        summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
        summaries.add(tf.summary.histogram(variable.op.name, variable))

    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    if cf.quantize_delay >= 0:
        tf.contrib.quantize.create_training_graph(
            quant_delay=cf.quantize_delay)

    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    sess = tf.Session(config=tf_config)
    sess.run(tf.global_variables_initializer())

    summary_writer = tf.summary.FileWriter(cf.save_dir, sess.graph)

    epoch = 1
    steps = 1
    latest_epoch = 0
    if cf.checkpoint_path is not None and (
            os.path.isfile(cf.checkpoint_path) or
        (os.path.isdir(cf.checkpoint_path)
         and tf.train.latest_checkpoint(cf.checkpoint_path) is not None)):
        latest_checkpoint = tf.train.latest_checkpoint(cf.checkpoint_path)
        exclusions = []
        if cf.checkpoint_exclude_scopes:
            exclusions = [
                scope.strip()
                for scope in cf.checkpoint_exclude_scopes.split(',')
            ]
        variables_to_restore = []
        for var in slim.get_model_variables():
            for exclusion in exclusions:
                if var.op.name.startswith(exclusion):
                    break
            else:
                variables_to_restore.append(var)

        saver_for_restore = tf.train.Saver(var_list=variables_to_restore,
                                           max_to_keep=cf.keep_checkpoint_max)
        if os.path.isdir(cf.checkpoint_path) and tf.train.latest_checkpoint(
                cf.checkpoint_path) is not None:
            cp = tf.train.latest_checkpoint(cf.checkpoint_path)
        else:
            cp = cf.checkpoint_path
        saver_for_restore.restore(sess, cp)
        if os.path.isdir(cf.checkpoint_path) and tf.train.latest_checkpoint(
                cf.checkpoint_path) is not None:
            latest_epoch = int(
                os.path.basename(latest_checkpoint).split("-")[1])
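            # saver.save(..., epoch) names checkpoints "model.ckpt-<epoch>",
            # so the dash-suffix gives the epoch to resume from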
            epoch = latest_epoch + 1
            cf.max_number_of_epochs += latest_epoch
        f.write("%s:%s\n" % ("restore_checkpoint", latest_checkpoint))
    saver = tf.train.Saver(tf.global_variables(),
                           max_to_keep=cf.keep_checkpoint_max)
    f.close()
    num_trained_images = 0
    last_saved_epoch = None
    last_saved_step = None
    start_avg_loss_steps = 10
    start_total_loss = 0.
    while True:
        # sess.run(iterator.initializer, feed_dict={seed_ph: steps})
        try:
            start = time.time()
            if cf.use_attr:
                tmp_images, tmp_labels, tmp_attrs = sess.run(
                    [images, labels, attrs])
                tmp_attrs = np.reshape(tmp_attrs.values,
                                       [cf.sampling_buffer_size, cf.attr_dim])
                tmp_attrs = tmp_attrs.astype(np.float64)
            else:
                tmp_images, tmp_labels = sess.run([images, labels])

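            # Pair mining: collect the indices of buffer samples whose label
            # occurs at least twice, so each batch contains positive pairs;
            # if there are too few pairs, pad the batch with singleton labels.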
            pair_indices = set()
            single_index_map = {}
            label_buffer = {}
            for i, tmp_label in enumerate(tmp_labels):
                if tmp_label in label_buffer:
                    pair_indices.add(i)
                    pair_indices.add(label_buffer[tmp_label])
                    if tmp_label in single_index_map:
                        del single_index_map[tmp_label]
                else:
                    label_buffer[tmp_label] = i
                    single_index_map[tmp_label] = i
            pair_indices = list(pair_indices)
            if len(pair_indices) > cf.batch_size:
                pair_indices = pair_indices[:cf.batch_size]
            elif len(pair_indices) < cf.batch_size:
                pair_indices += list(
                    single_index_map.values())[:cf.batch_size -
                                               len(pair_indices)]
            # print(pair_indices)
            batch_images = tmp_images[pair_indices]
            sess.run(pair_iterator.initializer,
                     feed_dict={string_img_pl: batch_images})
            batch_images = sess.run(pair_images)

            batch_labels = tmp_labels[pair_indices]
            if cf.use_attr:
                batch_attrs = tmp_attrs[pair_indices]

            sampling_time = time.time() - start
            tmp_images = None
            tmp_labels = None
            start = time.time()
            feed_dict = {images_ph: batch_images, labels_ph: batch_labels}
            if cf.use_attr:
                feed_dict[attrs_ph] = batch_attrs
            if steps % cf.save_summaries_steps == 0:
                loss, _, summary = sess.run([loss_op, train_op, summary_op],
                                            feed_dict=feed_dict)
                summary_writer.add_summary(summary, steps)
            else:
                loss, _ = sess.run([loss_op, train_op], feed_dict=feed_dict)
            if steps <= start_avg_loss_steps:
                start_total_loss += loss
            train_time = time.time() - start

            if steps % cf.log_every_n_steps == 0:
                now = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
                print(
                    "[%s: %d epoch(%d/%d), %d steps] sampling time: %f, train time: %f, loss: %f"
                    % (now, epoch, steps % steps_each_epoch, steps_each_epoch,
                       steps, sampling_time, train_time, loss))
            num_trained_images += cf.batch_size

            if cf.use_save_steps:
                if steps % cf.save_interval_steps == 0:
                    saver.save(sess, cf.save_dir + "/model.ckpt", steps)
                    last_saved_step = steps

            if cf.max_number_of_steps is not None and steps >= cf.max_number_of_steps:
                break
            steps += 1

            if num_trained_images >= num_examples:
                if not cf.use_save_steps and cf.save_interval_epochs >= 1 and (
                        epoch - latest_epoch) % cf.save_interval_epochs == 0:
                    saver.save(sess, cf.save_dir + "/model.ckpt", epoch)
                    last_saved_epoch = epoch
                if epoch >= cf.max_number_of_epochs:
                    break
                epoch += 1
                num_trained_images = 0

        except tf.errors.OutOfRangeError:
            break

    if cf.use_save_steps:
        if last_saved_step is None or last_saved_step < steps:
            saver.save(sess, cf.save_dir + "/model.ckpt", steps)
    else:
        if last_saved_epoch is None or last_saved_epoch < epoch:
            saver.save(sess, cf.save_dir + "/model.ckpt", epoch)

    summary_writer.add_summary(sess.run(summary_op, feed_dict=feed_dict),
                               steps)

    sess.close()
    tf.reset_default_graph()

    if cf.notify_after_training:
        txt = "%s[%s]\n\n" % (hostname,
                              socket.gethostbyname(socket.gethostname()))
        txt += "start avg loss : %f" % (start_total_loss /
                                        start_avg_loss_steps)
        txt += "last loss : %f" % loss
        txt += "start time: %s\n" % start_time_str
        txt += "end time: %s\n" % datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        if cf.eval_after_training:
            txt += "going to evaluate"
        else:
            txt += "not going to evaluate"
        txt += "\n[params]\n"
        txt += hyper_param_txt
        util.send_msg_to_slack(
            "\n\n==================================\nTraining is Done\n" + txt)

    if cf.eval_after_training:
        cuda.select_device(0)
        cuda.close()
        eval_cmd = 'python -u multiple_search_models.py --model_dir="%s" --embedding_size=%d --data_dir="%s" --model_name=%s --max_top_k=%d --shutdown_after_train=%d --gpu_no=%s --step_type=%s --image_size=%s --eval_batch_size=%d --preprocessing_name=%s --notify_after_training=%d --use_old_model=%d --save_static_data=%d' % (
            cf.save_dir, cf.embedding_size, cf.data_dir, cf.model_name,
            cf.eval_max_top_k, 1 if cf.shutdown_after_train else 0, cf.gpu_no,
            "step" if cf.use_save_steps else "epoch", cf.train_image_size,
            cf.eval_batch_size, cf.preprocessing_name,
            1 if cf.notify_after_training else 0, 1 if cf.use_old_model else 0,
            1 if cf.save_static_data else 0)
        print(eval_cmd)
        os.system(eval_cmd)
    else:
        if cf.shutdown_after_train:
            os.system("sudo shutdown now")