Example #1
def get_frame_input_feature(input_file):
    features = []
    record_iterator = tf.python_io.tf_record_iterator(path=input_file)
    for string_record in record_iterator:
        example = tf.train.SequenceExample()
        example.ParseFromString(string_record)

        # traverse the Example format to get data
        video_id = example.context.feature['video_id'].bytes_list.value[0]
        label = example.context.feature['labels'].int64_list.value[:]
        rgbs = []
        audios = []
        rgb_feature = example.feature_lists.feature_list['rgb'].feature
        for frame in rgb_feature:
            rgb = np.frombuffer(frame.bytes_list.value[0], dtype=np.uint8).astype(np.float32)
            rgb = utils.Dequantize(rgb, 2, -2)
            rgbs.append(rgb)
        audio_feature = example.feature_lists.feature_list['audio'].feature
        for frame in audio_feature:
            audio = np.frombuffer(frame.bytes_list.value[0], dtype=np.uint8).astype(np.float32)
            audio = utils.Dequantize(audio, 2, -2)
            audios.append(audio)
        rgbs = np.array(rgbs)
        audios = np.array(audios)
        features.append((video_id, label, rgbs, audios))
    return features
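The examples in this listing all lean on utils.Dequantize, whose source is not shown here. A minimal NumPy sketch of the affine mapping these call sites imply (uint8 values in [0, 255] back into the [min_quantized_value, max_quantized_value] range, with a half-step bias so values land mid-bucket) would be:

import numpy as np

def dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
    # Map uint8 levels back to floats in [min_quantized_value, max_quantized_value].
    quantized_range = max_quantized_value - min_quantized_value
    scalar = quantized_range / 255.0
    bias = (quantized_range / 512.0) + min_quantized_value
    return feat_vector * scalar + bias

The TensorFlow call sites below apply the same transform to tensors; only the constants matter, not the array type.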
Example #2
    def get_video_matrix(self, features, feature_size, max_frames,
                         max_quantized_value, min_quantized_value):
        """Decodes features from an input string and quantizes it.

    Args:
      features: raw feature values
      feature_size: length of each frame feature vector
      max_frames: number of frames (rows) in the output feature_matrix
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      feature_matrix: matrix of all frame-features
      num_frames: number of frames in the sequence
    """
        decoded_features = tf.reshape(
            tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
            [-1, feature_size])

        num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
        feature_matrix = utils.Dequantize(decoded_features,
                                          max_quantized_value,
                                          min_quantized_value)
        feature_matrix = resize_axis(feature_matrix, 0, max_frames)
        if self.prepare_distill:
            def_feature_matrix = tf.reshape(tf.decode_raw(features, tf.uint8),
                                            [-1, feature_size])
            def_feature_matrix = resize_axis(def_feature_matrix, 0, max_frames)
            return feature_matrix, num_frames, def_feature_matrix
        return feature_matrix, num_frames
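resize_axis is also not defined in this listing. A sketch consistent with how these examples use it, truncating or zero-padding a tensor along one axis to a fixed size (the implementation details here are an assumption, not the original code), is:

import tensorflow as tf

def resize_axis(tensor, axis, new_size, fill_value=0):
    # Truncate or zero-pad `tensor` along `axis` so that dimension equals new_size.
    tensor = tf.convert_to_tensor(tensor)
    shape = tf.unstack(tf.shape(tensor))
    pad_shape = shape[:]
    pad_shape[axis] = tf.maximum(0, new_size - shape[axis])
    shape[axis] = tf.minimum(shape[axis], new_size)
    shape = tf.stack(shape)
    resized = tf.concat([
        tf.slice(tensor, tf.zeros_like(shape), shape),
        tf.fill(tf.stack(pad_shape), tf.cast(fill_value, tensor.dtype))
    ], axis)
    new_shape = tensor.get_shape().as_list()
    new_shape[axis] = new_size
    resized.set_shape(new_shape)
    return resized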
Example #3
    def do_pca(self, input, max_quantized_value=2.0, min_quantized_value=-2.0):
        reduce_dim = 1024
        load_file = open("model_pca_tag_category_100w.pickle", "rb")
        mean_block3 = pickle.load(load_file)
        component_block3 = pickle.load(load_file)
        component_block3 = component_block3[:, 0:reduce_dim]
        singular_values_ = pickle.load(load_file)
        singular_block3 = tf.constant(singular_values_,
                                      dtype=tf.float32,
                                      name='pca_singular_block3')
        mean_block3 = tf.constant(mean_block3,
                                  dtype=tf.float32,
                                  name='pca_mean_block3')
        component_block3 = tf.constant(component_block3,
                                       dtype=tf.float32,
                                       name='pca_component_block3')
        res_fea_pca = tf.matmul(
            input - mean_block3,
            component_block3) / tf.sqrt(singular_block3[0:reduce_dim] + 1e-4)

        res_fea = utils.quantize(res_fea_pca,
                                 max_quantized_value=max_quantized_value,
                                 min_quantized_value=min_quantized_value)
        res_fea = utils.Dequantize(res_fea,
                                   max_quantized_value=max_quantized_value,
                                   min_quantized_value=min_quantized_value)
        # res_fea_pca = tf.reshape(res_fea_pca, [-1, frams, reduce_dim])
        # res_fea = tf.reshape(res_fea_pca, tf.shape(res_fea_pca))
        return res_fea
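The quantize-then-Dequantize round trip above reproduces the precision loss of storing the PCA output as uint8. utils.quantize itself is not shown in this listing; a sketch matching the keyword arguments used above (an assumption, not the original implementation) could be:

import tensorflow as tf

def quantize(features, max_quantized_value=2.0, min_quantized_value=-2.0):
    # Hypothetical counterpart to Dequantize: clip to the target range and
    # rescale to the 255 levels used for uint8 storage.
    quantized_range = max_quantized_value - min_quantized_value
    clipped = tf.clip_by_value(features, min_quantized_value, max_quantized_value)
    return tf.round((clipped - min_quantized_value) * (255.0 / quantized_range))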
Example #4
    def get_video_matrix(self, features, feature_size, max_frames,
                         max_quantized_value, min_quantized_value):
        """Decodes features from an input string and quantizes it.

    Args:
      features: raw feature values
      feature_size: length of each frame feature vector
      max_frames: number of frames (rows) in the output feature_matrix
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      feature_matrix: matrix of all frame-features
      num_frames: number of frames in the sequence
    """
        decoded_features = tf.reshape(
            tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
            [-1, feature_size])

        num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
        feature_matrix = utils.Dequantize(decoded_features,
                                          max_quantized_value,
                                          min_quantized_value)
        if feature_size == 1024:
            feature_matrix = feature_matrix * tf.transpose(
                tf.sqrt(self.pca_eigenvals + 1e-4))
            feature_matrix = tf.reduce_sum(
                tf.multiply(tf.expand_dims(feature_matrix, 1),
                            self.pca_eigenvecs), 2)
            feature_matrix += np.transpose(self.pca_mean)
        feature_matrix = resize_axis(feature_matrix, 0, max_frames)
        return feature_matrix, num_frames
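This example assumes self.pca_mean, self.pca_eigenvals and self.pca_eigenvecs have already been loaded as tensors elsewhere in the class. A hypothetical loader (file names and layout are placeholders, not from the original code) might look like:

import numpy as np
import tensorflow as tf

def load_pca_constants(model_dir):
    # Placeholder .npy file names; the real storage format is not shown here.
    pca_mean = tf.constant(np.load(model_dir + "/mean.npy"), dtype=tf.float32)
    pca_eigenvals = tf.constant(np.load(model_dir + "/eigenvals.npy"), dtype=tf.float32)
    pca_eigenvecs = tf.constant(np.load(model_dir + "/eigenvecs.npy"), dtype=tf.float32)
    return pca_mean, pca_eigenvals, pca_eigenvecs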
Example #5
def extract_n_predict(input_wav_file, pca_params, checkpoint, checkpoint_file, train_dir, output_file):
    print("Input file: " +input_wav_file)

    
    if (os.path.isfile(input_wav_file)):
      examples_batch = vggish_input.wavfile_to_examples(input_wav_file)
      #print(examples_batch)
      pproc = vggish_postprocess.Postprocessor(pca_params)

      with tf.Graph().as_default(), tf.Session() as sess:
       # Define the model in inference mode, load the checkpoint, and
       # locate input and output tensors.
       vggish_slim.define_vggish_slim(training=False)
       vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
       features_tensor = sess.graph.get_tensor_by_name(
          vggish_params.INPUT_TENSOR_NAME)
       embedding_tensor = sess.graph.get_tensor_by_name(
          vggish_params.OUTPUT_TENSOR_NAME)

       # Run inference and postprocessing.
       [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
       #print(embedding_batch)
       postprocessed_batch = pproc.postprocess(embedding_batch)
       #print(postprocessed_batch)
       num_frames_batch_val = np.array([postprocessed_batch.shape[0]], dtype=np.int32)

       video_batch_val = np.zeros((1, 300, 128), dtype=np.float32)
       video_batch_val[0, 0:postprocessed_batch.shape[0], :] = utils.Dequantize(
           postprocessed_batch.astype(float), 2, -2)


       predicted_class = inference(video_batch_val, num_frames_batch_val,
                                   checkpoint_file, train_dir, output_file)
      tf.reset_default_graph()
      return predicted_class
Example #6
    def get_video_matrix(self, features, feature_size, max_frames,
                         max_quantized_value, min_quantized_value):
        """Decodes features from an input string and quantizes it.

    Args:
      features: raw feature values
      feature_size: length of each frame feature vector
      max_frames: number of frames (rows) in the output feature_matrix
      max_quantized_value: the maximum of the quantized value.
      min_quantized_value: the minimum of the quantized value.

    Returns:
      feature_matrix: matrix of all frame-features
      num_frames: number of frames in the sequence
    """
        decoded_features = tf.reshape(
            tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
            [-1, feature_size])
        interval = FLAGS.crop_interval

        if FLAGS.crop:
            ind = tf.multinomial(tf.log([[1.] * interval]), 1)[0, 0]
            length_local = tf.shape(decoded_features, out_type=tf.int64)[0]
            start_idx = tf.minimum(ind, length_local - 1)
            index = tf.range(start_idx, length_local, interval)
            decoded_features = tf.reshape(tf.gather(decoded_features, index),
                                          [-1, feature_size])

        num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
        feature_matrix = utils.Dequantize(decoded_features,
                                          max_quantized_value,
                                          min_quantized_value)
        feature_matrix = resize_axis(feature_matrix, 0, max_frames)
        return feature_matrix, num_frames
Example #7
def build_graph():
    feature_names = ['rgb', 'audio']
    feature_sizes = [1024, 128]
    max_quantized_value = 2
    min_quantized_value = -2

    seq_example_bytes = tf.placeholder(tf.string)
    contexts, features = tf.parse_single_sequence_example(
        seq_example_bytes,
        context_features={
            "video_id": tf.FixedLenFeature([], tf.string),
            "labels": tf.VarLenFeature(tf.int64)
        },
        sequence_features={
            feature_name: tf.FixedLenSequenceFeature([], dtype=tf.string)
            for feature_name in feature_names
        })

    decoded_features = {
        name: tf.reshape(
            tf.cast(tf.decode_raw(features[name], tf.uint8), tf.float32),
            [-1, size])
        for name, size in zip(feature_names, feature_sizes)
    }
    feature_matrices = {
        name: utils.Dequantize(decoded_features[name], max_quantized_value,
                               min_quantized_value)
        for name in feature_names
    }

    tf.add_to_collection("vid_tsr", contexts['video_id'])
    tf.add_to_collection("labs_tsr", contexts['labels'].values)
    tf.add_to_collection("rgb_tsr", feature_matrices['rgb'])
    tf.add_to_collection("audio_tsr", feature_matrices['audio'])
    tf.add_to_collection("seq_example_bytes", seq_example_bytes)
Example #8
    def get_video_matrix(self, features, feature_size, max_frames,
                         max_quantized_value, min_quantized_value):

        decoded_features = tf.reshape(
            tf.cast(tf.decode_raw(features, tf.uint8), tf.float32),
            [-1, feature_size])

        num_frames = tf.minimum(tf.shape(decoded_features)[0], max_frames)
        feature_matrix = utils.Dequantize(decoded_features,
                                          max_quantized_value,
                                          min_quantized_value)
        feature_matrix = resize_axis(feature_matrix, 0, max_frames)
        return feature_matrix, num_frames
Example #9
	def prepare_reader(self, filename_queue, max_quantized_value=2, min_quantized_value=-2):
		"""Creates a single reader thread for YouTube8M SequenceExamples.

		Args:
			filename_queue: A tensorflow queue of filename locations.
			max_quantized_value: the maximum of the quantized value.
			min_quantized_value: the minimum of the quantized value.

		Returns:
			A tuple of video indexes, video features, labels, and padding data.
		"""
		reader = tf.TFRecordReader()
		_, serialized_example = reader.read(filename_queue)

		contexts, features = tf.parse_single_sequence_example(
				serialized_example,
				context_features={"video_id": tf.FixedLenFeature([], tf.string),
								   "labels": tf.VarLenFeature(tf.int64)},
				sequence_features={
						"rgb" : tf.FixedLenSequenceFeature([], dtype=tf.string),
						"audio": tf.FixedLenSequenceFeature([], dtype=tf.string)
				})

		# read ground truth labels
		labels = tf.cast(tf.sparse_to_dense(contexts["labels"].values, (self.num_classes,), 1, 
			validate_indices=False), tf.int32)

		rgb = tf.reshape(tf.cast(tf.decode_raw(features["rgb"], tf.uint8), tf.float32), [-1, FLAGS.rgb_size])
		audio = tf.reshape(tf.cast(tf.decode_raw(features["audio"], tf.uint8), tf.float32), [-1, FLAGS.audio_size])
		num_frames = tf.minimum(tf.shape(rgb)[0], self.max_frames)
		# The shape check only takes effect if it gates the ops that consume the tensors.
		assert_op = tf.assert_equal(tf.shape(rgb)[0], tf.shape(audio)[0])
		with tf.control_dependencies([assert_op]):
			rgb = resize_axis(utils.Dequantize(rgb, max_quantized_value, min_quantized_value), 0, self.max_frames)
			audio = resize_axis(utils.Dequantize(audio, max_quantized_value, min_quantized_value), 0, self.max_frames)
		

		return contexts["video_id"], labels, rgb, audio, num_frames
Example #10
def main(unused_argv):
    print("Input file: " + FLAGS.input_video_label)

    for wav_file, st_time, end_time, label in csv.reader(
            open(FLAGS.input_video_label), delimiter='\t'):
        print(wav_file, st_time, end_time, label)
        if (os.path.isfile(wav_file)):
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            #print(examples_batch)
            pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

            with tf.Graph().as_default(), tf.Session() as sess:
                # Define the model in inference mode, load the checkpoint, and
                # locate input and output tensors.
                vggish_slim.define_vggish_slim(training=False)
                vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
                features_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.INPUT_TENSOR_NAME)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_NAME)

                # Run inference and postprocessing.
                [embedding_batch
                 ] = sess.run([embedding_tensor],
                              feed_dict={features_tensor: examples_batch})
                #print(embedding_batch)
                postprocessed_batch = pproc.postprocess(embedding_batch)
                #print(postprocessed_batch)
                num_frames_batch_val = np.array([postprocessed_batch.shape[0]],
                                                dtype=np.int32)

                video_batch_val = np.zeros((1, 300, 128), dtype=np.float32)
                video_batch_val[
                    0, 0:postprocessed_batch.shape[0], :] = utils.Dequantize(
                        postprocessed_batch.astype(float), 2, -2)

                inference(video_batch_val, num_frames_batch_val,
                          FLAGS.checkpoint_file, FLAGS.train_dir,
                          FLAGS.output_file)

            tf.reset_default_graph()
Example #11
def frame_example_2_np(seq_example_bytes,
                       max_quantized_value=2,
                       min_quantized_value=-2):
    feature_names = ['rgb', 'audio']
    feature_sizes = [1024, 128]
    with tf.Graph().as_default():
        contexts, features = tf.parse_single_sequence_example(
            seq_example_bytes,
            context_features={
                "video_id": tf.FixedLenFeature([], tf.string),
                "labels": tf.VarLenFeature(tf.int64)
            },
            sequence_features={
                feature_name: tf.FixedLenSequenceFeature([], dtype=tf.string)
                for feature_name in feature_names
            })

        decoded_features = {
            name: tf.reshape(
                tf.cast(tf.decode_raw(features[name], tf.uint8), tf.float32),
                [-1, size])
            for name, size in zip(feature_names, feature_sizes)
        }
        feature_matrices = {
            name: utils.Dequantize(decoded_features[name], max_quantized_value,
                                   min_quantized_value)
            for name in feature_names
        }

        with tf.Session() as sess:
            # One run fetches everything together instead of four separate runs.
            vid, labs, rgb, audio = sess.run([
                contexts['video_id'], contexts['labels'].values,
                feature_matrices['rgb'], feature_matrices['audio']
            ])

    return vid, labs, rgb, audio
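A small usage sketch (the shard path is a placeholder) that feeds serialized records from a TFRecord file into frame_example_2_np:

for serialized in tf.python_io.tf_record_iterator("/path/to/one_shard.tfrecord"):
    vid, labs, rgb, audio = frame_example_2_np(serialized)
    print(vid, labs, rgb.shape, audio.shape)
    break  # inspect just the first record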
Example #12
def _process_videos(thread_index, ranges, name, videos, num_shards):
    """Processes and saves a subset of video metadata as TFRecord files in one thread.

    Each thread produces N shards where N = num_shards / num_threads.
    For instance, if num_shards = 128, and num_threads = 2, then the first
    thread would produce shards [0, 64).

    Args:
        thread_index: Integer thread identifier within [0, len(ranges)].
        ranges: A list of pairs of integers specifying the ranges of the dataset to
            process in parallel.
        name: Unique identifier specifying the dataset.
        videos: List of VideoMetadata.
        num_shards: Integer number of shards for the output files.
    """

    for i in range(len(videos)):
        vid = videos[i]
        filename_queue = tf.train.string_input_producer([vid],
                                                        num_epochs=1,
                                                        shuffle=True)
        reader = tf.TFRecordReader()
        _, serialized_examples = reader.read(filename_queue)
        context_features = {
            "video_id": tf.FixedLenFeature([], tf.string),
            "labels": tf.VarLenFeature(tf.int64)
        }

        sequence_features = {
            "rgb": tf.FixedLenSequenceFeature([], dtype=tf.string),
            "audio": tf.FixedLenSequenceFeature([], dtype=tf.string)
        }

        contexts, features = tf.parse_single_sequence_example(
            serialized_examples,
            context_features=context_features,
            sequence_features=sequence_features)
        output_file = os.path.join(FLAGS.output_dir, vid.split('/')[-1])
        writer = tf.python_io.TFRecordWriter(output_file)

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            vis_dic = tf.contrib.learn.KMeansClustering(
                num_clusters=256,
                relative_tolerance=0.0001,
                model_dir='/data1/yj/kmeans/')
            counter = 0
            # Build the decode op once; recreating it inside the loop would keep
            # adding nodes to the graph.
            decoded_rgb = tf.reshape(tf.cast(
                tf.decode_raw(features['rgb'], tf.uint8), tf.float32),
                                     shape=[-1, 1024])
            try:
                while not coord.should_stop():
                    # Fetch the context and the rgb frames from the same record
                    # in a single run.
                    cont, temp = sess.run([contexts, decoded_rgb])
                    vlad = VLAD_tf(utils.Dequantize(temp), vis_dic)
                    sequence_example = _to_sequence_example(cont, vlad)
                    if sequence_example is not None:
                        counter += 1
                        writer.write(sequence_example.SerializeToString())
            except tf.errors.OutOfRangeError:
                # The single-epoch filename queue has been exhausted.
                pass
            finally:
                coord.request_stop()
                coord.join(threads)
            writer.close()
            print("%s [thread %d]: Wrote %d %s working data to %s." %
                  (datetime.now(), thread_index, counter, FLAGS.type,
                   output_file))
            sys.stdout.flush()
        print("%s [thread %d]: Wrote %d %s working data to %d shards." %
              (datetime.now(), thread_index, counter, FLAGS.type, num_shards))
        sys.stdout.flush()
Example #13
    def create_model(self, model_input, vocab_size, num_frames,
                     **unused_params):
        """Creates a model which uses a logistic classifier over the average of the
    frame-level features.

    This class is intended to be an example for implementors of frame level
    models. If you want to train a model over averaged features it is more
    efficient to average them beforehand rather than on the fly.

    Args:
      model_input: A 'batch_size' x 'max_frames' x 'num_features' matrix of
                   input features.
      vocab_size: The number of classes in the dataset.
      num_frames: A vector of length 'batch' which indicates the number of
           frames for each video (before padding).

    Returns:
      A dictionary with a tensor containing the probability predictions of the
      model in the 'predictions' key. The dimensions of the tensor are
      'batch_size' x 'num_classes'.
    """
        ##########################################################original logisticModel

        # logging.info("model_input_shape: %s." ,str(model_input))
        #
        # ###(1,300,1024),padding to 300 frames even if the true num_frames not 300.
        # ##if use audio_information, the vector becomes(?,300,1152),since 1152=1024+128
        # num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        # feature_size = model_input.get_shape().as_list()[2]
        #
        # denominators = tf.reshape(
        #     tf.tile(num_frames, [1, feature_size]), [-1, feature_size])
        # ##
        # logging.info("denominators: %s.", str(denominators))
        #
        # ##(1,1024)
        # avg_pooled = tf.reduce_sum(model_input,
        #                            axis=[1]) / denominators
        # ##an average 1024 feature
        # output = slim.fully_connected(
        #     avg_pooled, vocab_size, activation_fn=tf.nn.sigmoid,
        #     weights_regularizer=slim.l2_regularizer(1e-8))
        # return {"predictions": output}

        #############################################################

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        feature_size = model_input.get_shape().as_list()[2]
        extract_frames = 100
        model_input = utils.SampleFramesOrdered(model_input, num_frames,
                                                extract_frames)
        model_input = tf.expand_dims(model_input, -1)
        logging.info("model_input_after_shape: %s.", str(model_input))
        #batchsize*extrac_frames*feature_size

        ########

        filters = [16, 64, 256, 1024, 4096]
        #dequantize
        model_input = tools.Dequantize(model_input)

        x = self._conv('conv1',
                       model_input,
                       time_stride=30,
                       in_filters=1,
                       out_filters=400,
                       feature_size=feature_size,
                       strides=[1, 10, 1, 1],
                       padding='VALID')
        logging.info("after_conv1: %s.", str(x))
        #8
        bias = tf.get_variable('bias1', [400],
                               tf.float32,
                               initializer=tf.zeros_initializer())

        x = self._relu(x + bias, 0.0)

        x = tf.nn.max_pool(x,
                           ksize=[1, 8, 1, 1],
                           strides=[1, 8, 1, 1],
                           padding='VALID',
                           name="max1")
        #42

        # logging.info("x_after_maxpool1: %s.", str(x))
        # x=self._conv('conv2',x,time_stride=3,in_filters=filters[0],out_filters=filters[2],feature_size=1,
        #              strides=[1,1,1,1],padding='SAME')
        # bias = tf.get_variable('bias2', [filters[2]], tf.float32, initializer=tf.zeros_initializer())
        #
        # x = self._relu(x + bias,0.0)
        #
        # x=tf.nn.max_pool(x,ksize=[1,41,1,1],strides=[1,41,1,1],padding='VALID',name="max2")
        # #21
        #
        # # x=self.group_conv(name='group',x=x,time_stride=21,in_filters=filters[1],out_filters=filters[2],strides=[1,1,1,1])
        #
        # x=tf.nn.relu6(x,name='relu6')

        x = tf.contrib.layers.flatten(x)
        # x=tf.nn.dropout(x,keep_prob=0.5)

        logging.info("output: %s.", x)

        # hidden = slim.fully_connected(
        #     x, 8196, activation_fn=None,
        #     weights_regularizer=slim.l2_regularizer(1e-8))
        # # drop=tf.nn.dropout(hidden,keep_prob=0.5)
        # hidden=tf.nn.relu(hidden,'relu6')

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        # logging.info("DBoF_activitions:%s", str(activation))
        return aggregated_model().create_model(model_input=x,
                                               vocab_size=vocab_size,
                                               **unused_params)
Example #14
    def create_model(self,
                     model_input,
                     vocab_size,
                     num_frames,
                     iterations=None,
                     add_batch_norm=None,
                     sample_random_frames=None,
                     cluster_size=None,
                     hidden_size=None,
                     is_training=True,
                     **unused_params):
        iterations = iterations or FLAGS.iterations
        add_batch_norm = add_batch_norm or FLAGS.dbof_add_batch_norm
        random_frames = sample_random_frames or FLAGS.sample_random_frames
        cluster_size = cluster_size or FLAGS.dbof_cluster_size
        hidden1_size = hidden_size or FLAGS.dbof_hidden_size

        num_frames = tf.cast(tf.expand_dims(num_frames, 1), tf.float32)
        if random_frames:
            model_input = utils.SampleRandomFrames(model_input, num_frames,
                                                   iterations)
        else:
            model_input = utils.SampleRandomSequence(model_input, num_frames,
                                                     iterations)
        max_frames = model_input.get_shape().as_list()[1]
        feature_size = model_input.get_shape().as_list()[2]
        reshaped_input = tf.reshape(model_input, [-1, feature_size])
        ##dequantize
        logging.info("preprocessed input:%s", str(reshaped_input))

        reshaped_input = tools.Dequantize(reshaped_input)

        logging.info("deQuantized input:%s", str(reshaped_input))

        tf.summary.histogram("input_hist", reshaped_input)

        if add_batch_norm:
            reshaped_input = slim.batch_norm(reshaped_input,
                                             center=True,
                                             scale=True,
                                             is_training=is_training,
                                             scope="input_bn")

        cluster_weights = tf.get_variable(
            "cluster_weights", [feature_size, cluster_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(feature_size)))
        tf.summary.histogram("cluster_weights", cluster_weights)
        activation = tf.matmul(reshaped_input, cluster_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="cluster_bn")
        else:
            cluster_biases = tf.get_variable(
                "cluster_biases", [cluster_size],
                initializer=tf.random_normal_initializer(
                    stddev=1 / math.sqrt(feature_size)))
            tf.summary.histogram("cluster_biases", cluster_biases)
            activation += cluster_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("cluster_output", activation)

        activation = tf.reshape(activation, [-1, max_frames, cluster_size])
        activation = utils.FramePooling(activation, FLAGS.dbof_pooling_method)

        hidden1_weights = tf.get_variable(
            "hidden1_weights", [cluster_size, hidden1_size],
            initializer=tf.random_normal_initializer(stddev=1 /
                                                     math.sqrt(cluster_size)))
        tf.summary.histogram("hidden1_weights", hidden1_weights)
        activation = tf.matmul(activation, hidden1_weights)
        if add_batch_norm:
            activation = slim.batch_norm(activation,
                                         center=True,
                                         scale=True,
                                         is_training=is_training,
                                         scope="hidden1_bn")
        else:
            hidden1_biases = tf.get_variable(
                "hidden1_biases", [hidden1_size],
                initializer=tf.random_normal_initializer(stddev=0.01))
            tf.summary.histogram("hidden1_biases", hidden1_biases)
            activation += hidden1_biases
        activation = tf.nn.relu6(activation)
        tf.summary.histogram("hidden1_output", activation)
        #dropout
        activation = tf.nn.dropout(activation, 0.5)

        aggregated_model = getattr(video_level_models,
                                   FLAGS.video_level_classifier_model)
        logging.info("DBoF_activitions:%s", str(activation))
        return aggregated_model().create_model(model_input=activation,
                                               vocab_size=vocab_size,
                                               **unused_params)
Example #15
    vid_batch = tf.train.batch_join(
        [[decoded_rgb]],
        batch_size=batch_size,
        capacity=batch_size * 2,
        dynamic_pad=True)
    # vis_dic = tf.contrib.learn.KMeansClustering(num_clusters=256,
    #                                             relative_tolerance=0.0001,
    #                                             model_dir='/data1/yj/kmeans/')

    # Fragment of a larger script: sess, coord, decoded_audio, rgbs, cluster_dir
    # and pkl are defined elsewhere in the original file.
    while not coord.should_stop():
        # print(type(sess.run(features)), len(sess.run(features)))
        # print(sess.run(features).keys(), type(sess.run(features)['rgb']))
        # drgb = utils.Dequantize(sess.run(decoded_rgb))
        # rgb_VLAD = VLAD_tf(drgb, vis_dic)
        drgb = sess.run(decoded_audio)
        drgb = drgb[::20]  # keep every 20th frame
        drgb = utils.Dequantize(drgb)
        rgbs.append(drgb)
        if len(rgbs) % 10000 == 0:
            print('doing...')
        if len(rgbs) > 80000:
            rgb_stack = np.concatenate(rgbs, axis=0)
            pkl.dump(rgb_stack, open(cluster_dir, 'wb'))
            break