Example No. 1
def extract_audioset_features(paths, path2gt):
    """Extracts Audioset features and their corresponding ground_truth and identifiers (the path).

       Audioset features are extracted from non-overlapping audio patches of 0.96 seconds, 
       where each audio patch covers 64 mel bands and 96 frames of 10 ms each.

       We repeat ground_truth and identifiers to fit the number of extracted Audioset features.
    """
    # 1) Extract log-mel spectrograms
    first_audio = True
    for p in paths:
        if first_audio:
            input_data = vggish_input.wavfile_to_examples(
                config['audio_folder'] + p)
            ground_truth = np.repeat(path2gt[p], input_data.shape[0], axis=0)
            identifiers = np.repeat(p, input_data.shape[0], axis=0)
            first_audio = False
        else:
            tmp_in = vggish_input.wavfile_to_examples(config['audio_folder'] +
                                                      p)
            input_data = np.concatenate((input_data, tmp_in), axis=0)
            tmp_gt = np.repeat(path2gt[p], tmp_in.shape[0], axis=0)
            ground_truth = np.concatenate((ground_truth, tmp_gt), axis=0)
            tmp_id = np.repeat(p, tmp_in.shape[0], axis=0)
            identifiers = np.concatenate((identifiers, tmp_id), axis=0)

    # 2) Load the TensorFlow model to extract AudioSet features
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        extracted_feat = sess.run([embedding_tensor],
                                  feed_dict={features_tensor: input_data})
        feature = np.squeeze(np.asarray(extracted_feat))

    return [feature, ground_truth, identifiers]
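A minimal usage sketch for the function above. The config dict, file names, and label ids are hypothetical placeholders; the surrounding module is assumed to import numpy and the vggish_* helpers as in the snippet.

config = {'audio_folder': './audio/'}
paths = ['dog_bark.wav', 'siren.wav']
path2gt = {'dog_bark.wav': 0, 'siren.wav': 1}

features, ground_truth, identifiers = extract_audioset_features(paths, path2gt)
# One 128-dimensional embedding per 0.96 s patch; labels and ids are repeated to match.
print(features.shape, ground_truth.shape, identifiers.shape)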
Example No. 2
    def embedding(self, input_paths, output_paths):
        """Run VGGish embedding."""
        paths = list(zip(input_paths, output_paths))

        with tf.Graph().as_default(), tf.Session() as sess:
            vggish_slim.define_vggish_slim()
            vggish_slim.load_vggish_slim_checkpoint(sess,
                                                    self.model_checkpoint)

            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            func = partial(
                self._embed,
                sess=sess,
                features_tensor=features_tensor,
                embedding_tensor=embedding_tensor,
            )

            self.single_process(func, paths)
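The _embed helper bound by the partial is not shown in this example. A plausible sketch, assuming it receives one (input_path, output_path) pair and saves the raw embedding with numpy:

    def _embed(self, paths, sess, features_tensor, embedding_tensor):
        # Hypothetical helper: one (input_path, output_path) pair per call.
        input_path, output_path = paths
        examples = vggish_input.wavfile_to_examples(input_path)
        [embedding] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: examples})
        np.save(output_path, embedding)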
Example No. 3
def CreateVGGishNetwork(sess, hop_size=0.96):  # Hop size is in seconds.
    """Define VGGish model, load the checkpoint, and return a dictionary that points
  to the different tensors defined by the model.
  """
    vggish_slim.define_vggish_slim()
    checkpoint_path = 'vggish_model.ckpt'
    vggish_params.EXAMPLE_HOP_SECONDS = hop_size

    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    layers = {
        'conv1': 'vggish/conv1/Relu',
        'pool1': 'vggish/pool1/MaxPool',
        'conv2': 'vggish/conv2/Relu',
        'pool2': 'vggish/pool2/MaxPool',
        'conv3': 'vggish/conv3/conv3_2/Relu',
        'pool3': 'vggish/pool3/MaxPool',
        'conv4': 'vggish/conv4/conv4_2/Relu',
        'pool4': 'vggish/pool4/MaxPool',
        'fc1': 'vggish/fc1/fc1_2/Relu',
        'fc2': 'vggish/fc2/Relu',
        'embedding': 'vggish/embedding',
        'features': 'vggish/input_features',
    }
    g = tf.get_default_graph()
    for k in layers:
        layers[k] = g.get_tensor_by_name(layers[k] + ':0')

    return {
        'features': features_tensor,
        'embedding': embedding_tensor,
        'layers': layers,
    }
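A short usage sketch for the helper above; the WAV path is a placeholder. The returned layers dict also allows inspecting intermediate activations:

with tf.Graph().as_default(), tf.Session() as sess:
    vgg = CreateVGGishNetwork(sess, hop_size=0.96)
    examples = vggish_input.wavfile_to_examples('some_audio.wav')  # placeholder path
    embedding, conv1 = sess.run(
        [vgg['embedding'], vgg['layers']['conv1']],
        feed_dict={vgg['features']: examples})
    print(embedding.shape, conv1.shape)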
Example No. 4
def OutputAudioEmbeddings(wav_file_path, save_path):

    # Run the examples from a single audio file through the model and save the
    # postprocessed embeddings, unless the output .npy file already exists.

    if os.path.isfile(wav_file_path) and not os.path.isfile(save_path +
                                                            '.npy'):
        wav_file = wav_file_path
        print(wav_file_path)
        print(save_path + '.npy')
        examples_batch = vggish_input.wavfile_to_examples(wav_file)
        # print(examples_batch)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        # If needed, prepare a record writer to store the postprocessed embeddings.
        writer = tf.python_io.TFRecordWriter(
            FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None
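        # Note: this writer is prepared here but never used below; the
        # postprocessed embeddings are saved with np.save instead.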

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch
             ] = sess.run([embedding_tensor],
                          feed_dict={features_tensor: examples_batch})
            # print(embedding_batch)
            postprocessed_batch = pproc.postprocess(embedding_batch)
            # print(postprocessed_batch.shape)
            np.save(save_path, postprocessed_batch)
Example No. 5
def extract_wav_features(f_dir):
    examples_batch = vggish_input.wavfile_to_examples(f_dir)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor('vggish_pca_params.npz')
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)
        return postprocessed_batch
Example No. 6
def generate_embedding(filePath):
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor("vggish_pca_params.npz")
    output = None
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        examples_batch = vggish_input_mod.wavfile_to_examples(filePath)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        output = postprocessed_batch
    return output
Example No. 7
def main(wav_file, flag_for_data, data, model_type):
    """
    #Specify the path for the downloaded or recorded audio files and
    #also path for writing the embeddings or pickle files
    """
    if flag_for_data == 0:
        if wav_file:
            pkl = wav_file[:-4] + '.pkl'
        # print (pkl)
        examples_batch = vggish_input.wavfile_to_examples(wav_file)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        with tf.Graph().as_default(), tf.Session() as sess:

            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch
             ] = sess.run([embedding_tensor],
                          feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            return postprocessed_batch
        # print(postprocessed_batch)
    elif flag_for_data == 1:
        predict_prob, predictions = model_function_binary_relevance.predictions_wavfile(
            data, model_type)
        K.clear_session()
        return predict_prob, predictions
Example No. 8
    def setup(self):
        # Paths to downloaded VGGish files.
        self.checkpoint_path = 'vggish_model.ckpt'
        self.pca_params_path = 'vggish_pca_params.npz'
        self.batch_size = 60

        # If we can't find the trained model files, download them
        if not os.path.exists(self.checkpoint_path):
            print('AudiosetAnalysis: Downloading model file {} (please wait - this may take a while)'.format(self.checkpoint_path))
            urllib.urlretrieve('https://storage.googleapis.com/audioset/vggish_model.ckpt', self.checkpoint_path)
        if not os.path.exists(self.pca_params_path):
            print('AudiosetAnalysis: Downloading params file {} (please wait - this may take a while)'.format(self.pca_params_path))
            urllib.urlretrieve('https://storage.googleapis.com/audioset/vggish_pca_params.npz', self.pca_params_path)

        # Define VGGish on the default graph and create a persistent session.
        config = tf.ConfigProto(device_count={'CPU': 4})
        self.sess = tf.Session(config=config)

        # Load the checkpoint
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(self.sess, self.checkpoint_path)
        self.features_tensor = self.sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        self.embedding_tensor = self.sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
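A companion method built on the session and tensors cached by setup() might look like the following sketch; the method name and batching are assumptions, not part of the original.

    def analyse_audio(self, wav_path):
        # Hypothetical helper: embed one file in batches of self.batch_size.
        examples = vggish_input.wavfile_to_examples(wav_path)
        embeddings = []
        for start in range(0, len(examples), self.batch_size):
            batch = examples[start:start + self.batch_size]
            [emb] = self.sess.run([self.embedding_tensor],
                                  feed_dict={self.features_tensor: batch})
            embeddings.append(emb)
        return np.concatenate(embeddings, axis=0)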
Example No. 9
def loadVGGish(sess, number_of_classes, lr = vggish_params.LEARNING_RATE):
    embeddings = vggish_slim.define_vggish_slim(True) # Do we train VGG-ish?

    # Define a shallow classification model and associated training ops on top
    # of VGGish.
    with tf.variable_scope('mymodel'):
        # Add a fully connected layer with 100 units.
        num_units = 100
        fc = slim.fully_connected(embeddings, num_units)

        # Add a classifier layer at the end, consisting of parallel logistic
        # classifiers, one per class. This allows for multi-class tasks.
        logits = slim.fully_connected(
          fc, number_of_classes, activation_fn=None, scope='logits')
        pred = tf.sigmoid(logits, name='prediction')

        # Add training ops.
        with tf.variable_scope('train'):
            global_step = tf.Variable(
                0, name='global_step', trainable=False,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                             tf.GraphKeys.GLOBAL_STEP])

        # Labels are assumed to be fed as a batch of multi-hot vectors, with
        # a 1 in the position of each positive class label, and 0 elsewhere.
        labels = tf.placeholder(
            tf.float32, shape=(None, number_of_classes), name='labels')

        # Cross-entropy label loss.
        xent = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=labels, name='xent')
        loss = tf.reduce_mean(xent, name='loss_op')
        tf.summary.scalar('loss', loss)

        # We use the same optimizer and hyperparameters as used to train VGGish.
        optimizer = tf.train.AdamOptimizer(
            learning_rate=lr,
            epsilon=vggish_params.ADAM_EPSILON)
        optimizer.minimize(loss, global_step=global_step, name='train_op')

    # Initialize all variables in the model, and then load the pre-trained
    # VGGish checkpoint.
    sess.run(tf.global_variables_initializer())
    vggish_slim.load_vggish_slim_checkpoint(sess, './vggish_model.ckpt') 
    return logits, pred
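A hedged training sketch for the model defined by loadVGGish. The tensor and op names are inferred from the variable scopes above, and the random batches are placeholders for real log-mel examples and multi-hot labels.

with tf.Graph().as_default(), tf.Session() as sess:
    logits, pred = loadVGGish(sess, number_of_classes=10)

    features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
    labels_tensor = sess.graph.get_tensor_by_name('mymodel/labels:0')
    loss_tensor = sess.graph.get_tensor_by_name('mymodel/loss_op:0')
    train_op = sess.graph.get_operation_by_name('mymodel/train_op')

    # Placeholder batch: 8 log-mel examples and random multi-hot labels.
    features_batch = np.random.rand(
        8, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS).astype(np.float32)
    labels_batch = np.random.randint(0, 2, size=(8, 10)).astype(np.float32)

    loss, _ = sess.run([loss_tensor, train_op],
                       feed_dict={features_tensor: features_batch,
                                  labels_tensor: labels_batch})
    print('loss:', loss)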
Example No. 10
    def define_model(self, sess):
        # Define VGGish.
        embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

        # Define a shallow classification model and associated training ops on top
        # of VGGish.
        with tf.variable_scope('mymodel'):
            # Add a fully connected layer with 100 units.
            num_units = 100
            fc = slim.fully_connected(embeddings, num_units)

            # Add a classifier layer at the end, consisting of parallel logistic
            # classifiers, one per class. This allows for multi-class tasks.
            self.logits = slim.fully_connected(fc,
                                               self._NUM_CLASSES,
                                               activation_fn=None,
                                               scope='logits')
            self.prediction = tf.sigmoid(self.logits, name='prediction')

            if (self.isTrain):
                self.add_training_op(sess)
Example No. 11
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim()
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: input_batch})
    print('VGGish embedding: ', embedding_batch[0])
    expected_embedding_mean = 0.131
    expected_embedding_std = 0.238
    np.testing.assert_allclose(
        [np.mean(embedding_batch),
         np.std(embedding_batch)],
        [expected_embedding_mean, expected_embedding_std],
        rtol=rel_error)
Example No. 12
def main(_):

    parser = argparse.ArgumentParser(description="Options")
    parser.add_argument("-i", "--input", help="Input of catalog", default="./")
    parser.add_argument("-s",
                        "--save_serialized_file",
                        default="../serialized_file.pickle",
                        help="Save serialiazed file")
    parser.add_argument("-l",
                        "--load_serialized_files",
                        help="Load serialiazed files from catalog")
    parser.add_argument("-e",
                        "--etalon_class_file",
                        help="Load etalon file of class")
    parser.add_argument("-t",
                        "--etalon_class_id",
                        help="Load etalon file with content of class id")
    parser.add_argument("-m",
                        "--merge",
                        help="Number seconds for merge",
                        default=1)
    parser.add_argument("-v",
                        "--vad",
                        help="use VAD optimization",
                        default=True)
    parser.add_argument("--save_list_files",
                        help="Serialize list of files",
                        default=False)
    parser.add_argument("--layer",
                        help="Layer of serialized data",
                        default='embedding')
    parser.add_argument("--load_test_data", help="Load test data from catalog")
    parser.add_argument("--models",
                        help="Catalog for save/load checkpoint of models")
    parser.add_argument("--folds", help="Number of folds", default=int(3))
    parser.add_argument("--lr", help="learning rate", default=float(0.0004))
    parser.add_argument("--lr_lim",
                        help="limit of learning rate",
                        default=float(0.0001))
    parser.add_argument("--factor",
                        help="new_lr = factor * lr",
                        default=float(0.8))
    parser.add_argument("--tensorboard",
                        help="new_lr = factor * lr",
                        default="tensorboard")
    parser.add_argument("--batch_size", help="Batch size", default=int(32))
    parser.add_argument("--scheduler_mode",
                        help="Scheduler of learning rate",
                        default=None)
    parser.add_argument("--optimizer",
                        help="Optimizer of learning rate",
                        default='SGD')

    args = parser.parse_args()

    checkPath = os.path.normpath(str(args.models))
    loadSerializeFiles = args.load_serialized_files
    loadTestData = args.load_test_data
    etalon_class_file = args.etalon_class_file
    etalon_class_id = args.etalon_class_id
    merge_sec = args.merge
    vad_optimization = bool(args.vad)
    save_list_files = bool(args.save_list_files)
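    # Note: values passed on the command line arrive as strings, so bool("False")
    # still evaluates to True; only the defaults behave as real booleans here.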
    saveSerilizationFile = args.save_serialized_file
    inputCatalog = args.input
    layer = args.layer
    folds = int(args.folds)
    lr = float(args.lr)
    lr_lim = float(args.lr_lim)
    factor = float(args.factor)
    tboard = str(args.tensorboard)
    batch_size = int(args.batch_size)
    scheduler_mode = args.scheduler_mode
    optimizer = str(args.optimizer)

    if checkPath == "None":
        checkPath = getExecPath()
        t, h = os.path.split(checkPath)
        dtn = str(dt.datetime.now()).split(" ")
        dtn = str(dtn[1]).split(".")
        dtn = str(dtn[0]).replace(":", "_")
        checkPath = t + '//nnmodels_' + dtn

    checkPath = os.path.normpath(checkPath)
    t, h = os.path.split(checkPath)
    tboard = t + '//' + tboard + "//" + h

    if not os.path.exists(checkPath):
        os.makedirs(checkPath)

    checkPath = checkPath + '//' + str(layer) + '_lr_' + str(
        lr) + '_factor_' + str(factor) + '_folds_' + str(folds)
    checkPath = os.path.normpath(checkPath)
    _, h = os.path.split(checkPath)
    tboard = os.path.normpath(tboard + '//' + h)

    if save_list_files:

        serializeData = loadSerializedListFiles(loadTestData)

        with open(os.path.normpath(
                "D:\\repo\\ML\\test_post_competition.csv")) as f:

            et_files = list()

            for line in f.readlines():
                block = line.split(",")
                et_files.append([block[0], block[1]])

        new_merge_data = list()
        count = len(serializeData)
        pos = 0
        for sdata in serializeData:
            fname = sdata['file_name']
            features = sdata['features']

            for label_et in et_files:

                if (str(fname) == str(label_et[0])
                        and str(label_et[1]) != "None"):
                    label_id, _ = find_type_class(etalon_class_id,
                                                  label_et[1],
                                                  pos=1)
                    features_data = {
                        "file_name": fname,
                        "label_id": int(label_id),
                        "features": features
                    }
                    new_merge_data.append(features_data)
                    break

            pos = pos + 1

            status_string = str(pos) + "/" + str(count)
            print(termcolor.colored(status_string, "green"))

        t, h = os.path.split(loadTestData)
        h = str(h).split(".")
        save_list = os.path.normpath(str(t + h[0] + "_label.features"))
        with open(save_list, "wb") as f2:
            pickle.dump(new_merge_data, f2)
            print(termcolor.colored("save: " + save_list, "green"))

        return

    if loadSerializeFiles:

        serializeData = loadSerializedListFiles(loadSerializeFiles)

        if not serializeData:
            raise Exception("Can not unpack serialized data")

        skf = StratifiedKFold(n_splits=folds)
        y = np.asarray([labels['label_id'] for labels in serializeData])

        modelsPath, _ = os.path.split(checkPath)
        modelsPath = modelsPath + "//*.hdf5"
        models = glob.glob(modelsPath)

        if len(models) > 0:
            for i in models:
                print(termcolor.colored(str(i), "green"))
        else:
            print(
                termcolor.colored(
                    str("Not find models in catalog:" + checkPath), "red"))

        if len(models) == 0:
            i = int(0)
            for train_index, valid_index in skf.split(serializeData, y):

                train_data = DataGenerator(serializeData,
                                           train_index,
                                           count_class=41,
                                           batch_size=batch_size,
                                           koeff_merge=int(merge_sec),
                                           layer=layer)
                valid_data = DataGenerator(serializeData,
                                           valid_index,
                                           count_class=41,
                                           batch_size=batch_size,
                                           koeff_merge=int(merge_sec),
                                           layer=layer)

                shape = train_data.get_shape()

                dnn = VggDNN(input_shape=shape, lr=lr, optimizer=optimizer)

                postfix = '_' + str(i) + '.hdf5'
                dnn_model_path = os.path.normpath(checkPath + postfix)
                tboard = os.path.normpath(tboard + postfix)

                dnn.train(train_data,
                          valid_data,
                          checkPath=dnn_model_path,
                          batch_size=batch_size,
                          factor=float(factor),
                          tensorboardPath=tboard,
                          lim_lr=lr_lim,
                          scheduler_mode=scheduler_mode,
                          iteration=train_data.__len__())
                i = i + 1
                # del dnn

            models = glob.glob(modelsPath)

        # batch_data.deleted_garbage()

        actual = list()
        predicts = list()

        pos = int(0)

        # status_string = "Number of models: " + str(len(models))
        # print(termcolor.colored(status_string,"green"))

        dnn_models = list()
        for m in models:
            dnn_models.append(VggDNN(path=m))

        # resuls_predict_string = list()
        #
        # resuls_predict_string.append(["fname,label"])
        # #
        # # for soundfiles,labels,names in batch_data.get_soundfiles():
        # #         count = batch_data.get_countfiles()
        # #
        # #         for sound,label,name in zip(soundfiles,labels,names):
        # #
        # #             predict_merge = np.empty(shape=(0,41));
        # #
        # #             for model in dnn_models:
        # #
        # #                 # sound_ex = np.expand_dims(sound, axis=2)
        # #                 # predict = model.predict_on_batch(np.asarray(sound_ex))
        # #                 predict = model.predict_on_batch(np.asarray(sound))
        # #                 # mean_predict = np.mean(predict, axis=0)
        # #                 # mean_predict = mean_predict.reshape(np.shape(predict)[1],1)
        # #                 predict_merge = np.concatenate((predict_merge,predict), axis=0)
        # #
        # #             mean = np.mean(np.asarray(predict_merge),axis=0)
        # #             amax = np.argsort(mean, axis=0)
        # #             amax = amax[::-1]
        # #
        # #             predict_string = name
        # #
        # #             if etalon_class_id:
        # #                 for r in amax[0:3]:
        # #                     predict_string = predict_string + str(" ") + str(find_type_class(etalon_class_id,r)[1])
        # #
        # #                 predict_string = predict_string + " origin: "+ str(find_type_class(etalon_class_id,np.argmax(label)))
        # #
        # #             resuls_predict_string.append(predict_string)
        # #
        # #             actual.append([np.argmax(label)])
        # #             predicts.append(list(amax))
        # #
        # #             pos = pos + 1
        # #             status_string = "Calculate predict: " + str(pos) + "/" + str(count)
        # #             print(termcolor.colored(status_string,"green"))
        # #             # print(termcolor.colored(str(mean),"green"))
        #
        # met = average_precision.mapk(actual,predicts,k=3)
        # result_string = "Predict: "+str(met)
        #
        # print(termcolor.colored(result_string,"green"))
        # # print(termcolor.colored(resuls_predict_string,"green"))
        # rp = getExecPath()
        # t,h = os.path.split(rp)
        # t = t + "//result.log"
        # print_result(t,resuls_predict_string)

        pos = int(0)

        result_predict_string = list()
        result_predict_string.append(str("fname,label"))

        actual = list()
        predicts = list()

        if loadTestData and os.path.isdir(loadTestData):
            test_data = loadSerializedListFiles(loadTestData)
            batch_data = Batcher(test_data,
                                 layer=layer,
                                 koeff_merge=int(merge_sec),
                                 shuffle=True,
                                 n_splits=2)

            for soundfiles, labels, names in batch_data.get_soundfiles():
                count = batch_data.get_countfiles()

                for sound, label, name in zip(soundfiles, labels, names):

                    predict_merge = np.empty(shape=(0, 41))

                    for model in dnn_models:
                        predict = model.predict_on_batch(np.asarray(sound))
                        predict_merge = np.concatenate(
                            (predict_merge, predict), axis=0)

                    mean = np.mean(np.asarray(predict_merge), axis=0)
                    amax = np.argsort(mean, axis=0)
                    amax = amax[::-1]

                    predict_string = name + str(",")

                    if etalon_class_id:
                        for r in amax[0:3]:
                            predict_string = predict_string + str(" ") + str(
                                find_type_class(etalon_class_id, r)[1])

                    result_predict_string.append(predict_string)

                    actual.append([np.argmax(label)])
                    predicts.append(list(amax))

                    pos = pos + 1
                    status_string = "Calculate predict: " + str(
                        pos) + "/" + str(count)
                    print(termcolor.colored(status_string, "green"))

            met = average_precision.mapk(actual, predicts, k=3)
            result_string = "Predict test data: " + str(met)

            print(termcolor.colored(result_string, "green"))

            rp = getExecPath()
            t, h = os.path.split(rp)
            t = t + "//test_result.log"
            print_result(t, result_predict_string)

        return

    Catalogs = inputCatalog + "/*/"
    Catalogs = glob.glob(Catalogs)

    Catalogs = Catalogs + [inputCatalog]

    listWavFiles, countFiles = makeWaveFilesList(Catalogs)

    if not listWavFiles:
        print(inputCatalog + ": this catalog has no wav files")
        return
    else:

        processedFiles = 0

        for wav_file in listWavFiles:
            processedFiles = processedFiles + 1
            error_string = "Input file : " + wav_file + " - "

            if (os.path.getsize(wav_file) == 0):
                print(termcolor.colored(error_string, "red"))
                continue

            sample_rate, wav_data = wavfile.read(wav_file)

            if vad_optimization == True:
                wav_data = vad.apply_vad(wav_data, sample_rate)

            assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
            samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]

            if (np.shape(samples)[0] < sample_rate):
                append_size = sample_rate - np.shape(samples)[0]
                samples = np.append(samples, np.full(append_size, float(0)))

            examples_batch = vggish_input.waveform_to_examples(
                samples, sample_rate)

            # Prepare a postprocessor to munge the model embeddings.
            pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

            try:
                with tf.Graph().as_default(), tf.Session() as sess:
                    # Define the model in inference mode, load the checkpoint, and
                    # locate input and output tensors.
                    vggish_slim.define_vggish_slim(training=False)
                    vggish_slim.load_vggish_slim_checkpoint(
                        sess, FLAGS.checkpoint)
                    features_tensor = sess.graph.get_tensor_by_name(
                        vggish_params.INPUT_TENSOR_NAME)
                    embedding_tensor = sess.graph.get_tensor_by_name(
                        vggish_params.OUTPUT_TENSOR_NAME)
                    flatten_result = sess.graph.get_tensor_by_name(
                        vggish_params.OUTPUT_TENSOR_FL_NAME)

                    # Run inference and postprocessing.
                    [embedding_batch, flatten_batch
                     ] = sess.run([embedding_tensor, flatten_result],
                                  feed_dict={features_tensor: examples_batch})

                    postprocessed_batch = pproc.postprocess(embedding_batch)

                    wav_file_name = os.path.basename(wav_file)
                    label_id, _ = find_id_class(etalon_class_file,
                                                etalon_class_id, wav_file_name)

                    if label_id == -1:
                        label_id = getCatalogName(wav_file)

                    features_batch = {
                        "embedding": embedding_batch,
                        "flatten": flatten_batch,
                        "postprocessing": postprocessed_batch
                    }
                    features_data = {
                        "file_name": wav_file_name,
                        "label_id": int(label_id),
                        "features": features_batch
                    }

                    if saveSerilizationFile:
                        with open(
                                saveSerilizationFile + "." +
                                str(processedFiles) + ".features", "wb") as f:
                            pickle.dump(features_data, f)

                    error_string = error_string + "successful"

                    color = "green"

                    if label_id == -1:
                        color = "yellow"
                        error_string = error_string + ". File is not classification"

                    error_string = error_string + " (" + str(
                        processedFiles) + "/" + str(countFiles) + ")"
                    print(termcolor.colored(error_string, color))
            except:
                error_string = error_string + "failed"
                error_string = error_string + " (" + str(
                    processedFiles) + "/" + str(countFiles) + ")"
                print(termcolor.colored(error_string, "red"))
                continue

        if saveSerilizationFile:

            tail, head = os.path.split(saveSerilizationFile)

            serilizationFilesList = tail + "//*.features"
            serilizationFilesList = glob.glob(serilizationFilesList)

            features_data_merge = list()

            for sfeature in serilizationFilesList:
                with open(sfeature, "rb") as fd:
                    features_data = pickle.load(fd)
                    features_data_merge.append(features_data)

                os.remove(sfeature)

            with open(saveSerilizationFile + ".features", "wb") as f:
                pickle.dump(features_data_merge, f)
Example No. 13
def preprocess_data():
    audio_root_dir = Path(r'C:\Users\zhanglichuan\Desktop\ECE496\data')
    audio_file_pattern = Path(r'**/*.wav')
    # takes about 6-8 min on my machine
    counter = 0
    oldm, oldn = 0, 0

    for audio_file in glob.iglob(str(audio_root_dir / audio_file_pattern),
                                 recursive=True):
        #load label
        sample = wavfile_to_examples(audio_file)
        #print(audio_file)
        image_path = re.sub('.wav', '.jpg', os.path.split(audio_file)[1])
        image_path = os.path.join(image_dir, image_path)
        #print(image_path)
        input_image = load_img(image_path,
                               target_size=(image_size, image_size),
                               color_mode='grayscale')
        input_image = img_to_array(input_image)
        #input_image = np.expand_dims(input_image, axis=0)
        input_image = preprocess_input(input_image)
        if sample.shape[0] == 0 or get_emotion_label(
                audio_file) == 0 or get_emotion_label(audio_file) == 1:
            continue
        else:
            labels.append(get_emotion_label(audio_file) - 2)
            temp_dict[counter] = sample
            image_list.append(input_image)
        if counter % 100 == 0:
            print('Processing the {}th file: {}'.format(counter, audio_file))
        counter += 1

    oldm, oldn = 0, 0
    check = temp_dict
    print("start to construct embedding feature from input")
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(
            sess,
            r'C:\Users\zhanglichuan\Desktop\ECE496\lstm\vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        counter = 0
        for key in temp_dict:
            #print(counter)
            #print(temp_dict[key])
            [embedding_batch
             ] = sess.run([embedding_tensor],
                          feed_dict={features_tensor: temp_dict[key]})
            embedding_dict[key] = embedding_batch
            m, n = embedding_batch.shape[0], embedding_batch.shape[1]
            if m > oldm:
                oldm = m
            if n > oldn:
                oldn = n
            if counter % 100 == 0:
                print('Processing the {}th file: {}'.format(
                    counter, audio_file))
            counter += 1
    maxLen = oldm * oldn
    pproc = vggish_postprocess.Postprocessor(
        r'C:\Users\zhanglichuan\Desktop\ECE496\lstm\vggish_pca_params.npz')
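    # Note: this postprocessor is created but not applied below; the raw VGGish
    # embeddings are padded and reshaped directly.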
    train_set = []
    counter = 0
    for key in embedding_dict:
        print(key)
        test = embedding_dict[key]
        embed_sample = embedding_dict[key].flatten()
        tempOne = np.pad(embed_sample, (0, maxLen - embed_sample.shape[0]),
                         mode='constant',
                         constant_values=0)
        temp_embed = np.reshape(tempOne, (1, oldm, oldn))

        if counter == 0:
            train_set = temp_embed
        else:
            train_set = np.concatenate((train_set, temp_embed), axis=0)
        if counter % 100 == 0:
            print('Processing the {}th file: {}'.format(counter, audio_file))
        counter += 1
    print("preprocess finished")

    with open(os.path.join(script_path, 'labels.txt'), 'wb') as tfp:
        pickle.dump(labels, tfp)
    with open(os.path.join(script_path, 'train_set.txt'), 'wb') as tdfp:
        pickle.dump(train_set, tdfp)
    with open(os.path.join(script_path, 'image.txt'), 'wb') as imfp:
        pickle.dump(image_list, imfp)
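A small sketch for loading the pickled outputs written above, assuming the same script_path and module-level imports as in the snippet:

with open(os.path.join(script_path, 'train_set.txt'), 'rb') as f:
    train_set = pickle.load(f)   # (num_clips, oldm, oldn) padded embeddings
with open(os.path.join(script_path, 'labels.txt'), 'rb') as f:
    labels = pickle.load(f)
print(len(labels), train_set.shape)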
Example No. 14
def main(_):
    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    # In this simple example, we run the examples from all audio files in a directory through the model.
    for infile in os.listdir(path=FLAGS.wav_dir):
        wav_file = str(FLAGS.wav_dir) + infile

        ##### Parse the file name for the video id, depending on the dataset #####

        ## EC 50 Dataset, Format: cross validation group - sound file id from freesound - Sound segment - Label (0 to 49).wav ##
        if (FLAGS.dataset == "EC50"):
            namegps = infile.split('-')
            videoid = namegps[0] + "-" + namegps[1] + "-" + namegps[2]
            label = namegps[3].split('.')[0]

        ## Urban Sound Dataset, Format: sound file id from freesound - Label (0 to 9) - occurrence id - sound segment id.wav ##
        elif (FLAGS.dataset == "UrbanSound"):
            namegps = infile.split('-')
            videoid = namegps[0] + "-" + namegps[2] + "-" + namegps[3].split(
                '.')[0]
            label = namegps[1]

        else:
            print("Please specify one of the supported datasets.")
            continue  # skip this file; label and videoid are undefined otherwise

        examples_batch = vggish_input.wavfile_to_examples(wav_file)
        print(examples_batch)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            [embedding_batch
             ] = sess.run([embedding_tensor],
                          feed_dict={features_tensor: examples_batch})
            print(embedding_batch)
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(postprocessed_batch)

            # Write the postprocessed embeddings as a SequenceExample, in a similar
            # format as the features released in AudioSet. Each row of the batch of
            # embeddings corresponds to roughly a second of audio (96 10ms frames), and
            # the rows are written as a sequence of bytes-valued features, where each
            # feature value contains the 128 bytes of the whitened quantized embedding.
            seq_example = tf.train.SequenceExample(
                context=tf.train.Features(
                    feature={
                        'labels':
                        tf.train.Feature(int64_list=tf.train.Int64List(
                            value=[int(label)])),
                        'video_id':
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[bytes(videoid, 'utf-8')]))
                    }),
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                    }))
            print(seq_example)
            if writer:
                writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()
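The SequenceExample layout described in the comments above can be read back with a short parsing sketch; the record file name is a placeholder.

for record in tf.python_io.tf_record_iterator('embeddings.tfrecord'):  # placeholder path
    seq = tf.train.SequenceExample()
    seq.ParseFromString(record)
    label = seq.context.feature['labels'].int64_list.value[0]
    video_id = seq.context.feature['video_id'].bytes_list.value[0].decode('utf-8')
    embeddings = np.array([
        np.frombuffer(feat.bytes_list.value[0], dtype=np.uint8)
        for feat in seq.feature_lists.feature_list[
            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME].feature
    ])
    print(video_id, label, embeddings.shape)  # roughly (num_seconds, 128)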
Example No. 15
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
with tf.Graph().as_default(), tf.Session() as sess:
  vggish_slim.define_vggish_slim()
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

  features_tensor = sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  embedding_tensor = sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)
  [embedding_batch] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: input_batch})
  print('VGGish embedding: ', embedding_batch[0])
  expected_embedding_mean = 0.131
  expected_embedding_std = 0.238
  np.testing.assert_allclose(
      [np.mean(embedding_batch), np.std(embedding_batch)],
      [expected_embedding_mean, expected_embedding_std],
      rtol=rel_error)
Example No. 16
                    os.path.join(audio_path, folder, sentence, scenario)):
                if audio[-4:] == '.wav':
                    wav_name = os.path.join(audio_path, folder, sentence,
                                            scenario, audio)
                    wav_rate, wav_samples = wavfile.read(wav_name)
                    if len(wav_samples) < wav_rate:
                        wav_samples = numpy.pad(
                            wav_samples, (0, wav_rate - len(wav_samples)),
                            'constant')

                    samples = vggish_input.waveform_to_examples(
                        wav_samples, wav_rate)

                    with tensorflow.Graph().as_default(), tensorflow.Session(
                    ) as session:
                        vggish_slim.define_vggish_slim(training=False)
                        vggish_slim.load_vggish_slim_checkpoint(
                            session, args.model_file)

                        samples_tensor = session.graph.get_tensor_by_name(
                            vggish_params.INPUT_TENSOR_NAME)
                        features_tensor = session.graph.get_tensor_by_name(
                            vggish_params.OUTPUT_TENSOR_NAME)

                        [features
                         ] = session.run([features_tensor],
                                         feed_dict={samples_tensor: samples})

                    output_file = os.path.join(acoustic_features_path, folder,
                                               sentence, scenario)
                    os.makedirs(output_file, exist_ok=True)
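                    # Hypothetical continuation, not part of the original snippet
                    # (which is cut off here): persist the per-second VGGish
                    # features into the directory created above.
                    numpy.save(os.path.join(output_file, audio[:-4] + '.npy'),
                               features)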
Example No. 17
def main(_):
  # In this simple example, we run the examples from a single audio file through
  # the model. If none is provided, we generate a synthetic input.
  if FLAGS.wav_file:
    wav_file = FLAGS.wav_file
  else:
    # Write a WAV of a sine wav into an in-memory file object.
    num_secs = 5
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)
    # Convert to signed 16-bit samples.
    samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
    wav_file = six.BytesIO()
    wavfile.write(wav_file, sr, samples)
    wav_file.seek(0)
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  print(examples_batch)

  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

  # If needed, prepare a record writer to store the postprocessed embeddings.
  writer = tf.python_io.TFRecordWriter(
      FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    print(embedding_batch)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    print(postprocessed_batch)

    #pdb.set_trace()

    # Write the postprocessed embeddings as a SequenceExample, in a similar
    # format as the features released in AudioSet. Each row of the batch of
    # embeddings corresponds to roughly a second of audio (96 10ms frames), and
    # the rows are written as a sequence of bytes-valued features, where each
    # feature value contains the 128 bytes of the whitened quantized embedding.
    seq_example = tf.train.SequenceExample(
        feature_lists=tf.train.FeatureLists(
            feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(
                        feature=[
                            tf.train.Feature(
                                bytes_list=tf.train.BytesList(
                                    value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ]
                    )
            }
        )
    )
    print(seq_example)
    if writer:
      writer.write(seq_example.SerializeToString())

  if writer:
    writer.close()
Example No. 18
def main(_):
  # In this simple example, we run the examples from a single audio file through
  # the model. If none is provided, we generate a synthetic input.
  if FLAGS.wav_file:
    wav_file = FLAGS.wav_file
  else:
    # Write a WAV of a sine wav into an in-memory file object.
    num_secs = 5
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)
    # Convert to signed 16-bit samples.
    samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
    wav_file = six.BytesIO()
    wavfile.write(wav_file, sr, samples)
    wav_file.seek(0)
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  print(examples_batch)

  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

  # If needed, prepare a record writer to store the postprocessed embeddings.
  writer = tf.python_io.TFRecordWriter(
      FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    print(embedding_batch)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    print(postprocessed_batch)

    # Write the postprocessed embeddings as a SequenceExample, in a similar
    # format as the features released in AudioSet. Each row of the batch of
    # embeddings corresponds to roughly a second of audio (96 10ms frames), and
    # the rows are written as a sequence of bytes-valued features, where each
    # feature value contains the 128 bytes of the whitened quantized embedding.
    seq_example = tf.train.SequenceExample(
        feature_lists=tf.train.FeatureLists(
            feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(
                        feature=[
                            tf.train.Feature(
                                bytes_list=tf.train.BytesList(
                                    value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ]
                    )
            }
        )
    )
    print(seq_example)
    if writer:
      writer.write(seq_example.SerializeToString())

  if writer:
    writer.close()
Example No. 19
def main(_):
    # In this simple example, we run the examples from a single audio file through
    # the model. If none is provided, we generate a synthetic input.
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Write a WAV of a sine wav into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    #print(examples_batch)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        #print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        #print(postprocessed_batch)

        ######### ######### ######### ######### ######### ######### #########
        # CHANGED CODE - SUPPRESSED THIS PART
        #
        # supresss this part; we don't really need to write a sequence example
        #
        ######### ######### ######### ######### ######### ######### #########

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames), and
        # the rows are written as a sequence of bytes-valued features, where each
        # feature value contains the 128 bytes of the whitened quantized embedding.
    #   seq_example = tf.train.SequenceExample(
    #       feature_lists=tf.train.FeatureLists(
    #           feature_list={
    #               vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
    #                   tf.train.FeatureList(
    #                       feature=[
    #                           tf.train.Feature(
    #                               bytes_list=tf.train.BytesList(
    #                                   value=[embedding.tobytes()]))
    #                           for embedding in postprocessed_batch
    #                       ]
    #                   )
    #           }
    #       )
    #   )
    #   #print(seq_example)
    #   if writer:
    #     writer.write(seq_example.SerializeToString())

    # if writer:
    #   writer.close()

    ######### ######### ######### ######### ######### ######### ######### #########
    # MODIFIED CODE HERE
    #
    # to allow for featurization into a processdir
    #
    ######### ######### ######### ######### ######### ######### ######### #########

    try:
        os.chdir(os.getcwd() + '/processdir')
    except:
        os.mkdir(os.getcwd() + '/processdir')
        os.chdir(os.getcwd() + '/processdir')

    #print(len(postprocessed_batch))
    #print(type(postprocessed_batch))
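    # Derive the JSON name from the input file's basename (strip the directory
    # and the ".wav" extension).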
    filepath = sys.argv[2]
    i1 = filepath[::-1].find('/')
    jsonfilename = filepath[-1 * i1:][0:-4] + '.json'
    print('writing data to ' + jsonfilename)
    jsonfile = open(jsonfilename, 'w')
    data = {
        'features': postprocessed_batch.tolist(),
    }
    json.dump(data, jsonfile)
    jsonfile.close()
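Reading the featurized JSON back is symmetric; the file name below is a placeholder, and the module-level json/numpy imports are assumed as above.

with open('some_audio.json') as jf:                 # placeholder name
    features = np.array(json.load(jf)['features'])  # (num_patches, 128) postprocessed embeddings
print(features.shape)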
Example No. 20
def main(_):
    # In this simple example, we run the examples from a single audio file through
    # the model. If none is provided, we generate a synthetic input.

    # if FLAGS.wav_file:
    #   wav_file = FLAGS.wav_file
    # else:
    #   # Write a WAV of a sine wav into an in-memory file object.
    #   num_secs = 5
    #   freq = 1000
    #   sr = 44100
    #   t = np.linspace(0, num_secs, int(num_secs * sr))
    #   x = np.sin(2 * np.pi * freq * t)
    #   # Convert to signed 16-bit samples.
    #   samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
    #   wav_file = six.BytesIO()
    #   wavfile.write(wav_file, sr, samples)
    #   wav_file.seek(0)

    #global vggish_params.VERBOSE
    vggish_params.VERBOSE = FLAGS.verbose

    if FLAGS.wav_file_inputdir:
        wav_file_list = glob.glob(
            os.path.join(FLAGS.wav_file_inputdir, "*.wav"))
    else:
        if FLAGS.wav_file_list:
            wav_file_list = [
                x for x in map(lambda x: x.strip('\n'),
                               open(FLAGS.wav_file_list, 'r').readlines())
            ]
        else:
            if FLAGS.wav_file:
                wav_file_list = [FLAGS.wav_file]
            else:
                print(
                    "must supply a wav file path, a file with a list of paths, or an input directory"
                )
                return

    for wav_file in wav_file_list:

        print('RAW WAV FILE: {}'.format(wav_file))

        examples_batch = vggish_input.wavfile_to_examples(wav_file)
        vprint('examples_batch shape')
        vprint(str(examples_batch.shape))

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        # If needed, prepare a record writer to store the postprocessed embeddings.
        outputfile_tf = os.path.join(
            FLAGS.output, "{}.tfrecord".format(os.path.basename(wav_file)))
        print('TF FILE output : {}'.format(outputfile_tf))

        writer = tf.python_io.TFRecordWriter(outputfile_tf)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch
             ] = sess.run([embedding_tensor],
                          feed_dict={features_tensor: examples_batch})

            vprint('embedding_batch shape')
            vprint(str(embedding_batch.shape))

            postprocessed_batch = pproc.postprocess(embedding_batch,
                                                    FLAGS.clip_and_quantize)
            #vprint(postprocessed_batch)
            vprint('postprocessed_batch shape')
            vprint(str(postprocessed_batch.shape))

            #calculate means
            postprocessed_batch_mean = np.mean(postprocessed_batch, axis=0)
            postprocessed_batch_median = np.median(postprocessed_batch, axis=0)

            vprint('postprocessed_batch_mean shape')
            vprint(str(postprocessed_batch_mean.shape))

            # Write the postprocessed embeddings as a SequenceExample, in a similar
            # format as the features released in AudioSet. Each row of the batch of
            # embeddings corresponds to roughly a second of audio (96 10ms frames), and
            # the rows are written as a sequence of bytes-valued features, where each
            # feature value contains the 128 bytes of the whitened quantized embedding.

            context_features = {
                'mean_audio':
                tf.train.Feature(float_list=tf.train.FloatList(
                    value=postprocessed_batch_mean)),
                'median_audio':
                tf.train.Feature(float_list=tf.train.FloatList(
                    value=postprocessed_batch_median))
            }

            seq_example = tf.train.SequenceExample(
                context=tf.train.Features(feature=context_features),
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                    }))
            #print(seq_example)
            if writer:
                writer.write(seq_example.SerializeToString())

                jsout = {
                    'filename': os.path.basename(wav_file),
                    'mean_audio': postprocessed_batch_mean.tolist(),
                    'median_audio': postprocessed_batch_median.tolist(),
                    'audio': postprocessed_batch.tolist()
                }

                outputfile = os.path.join(
                    FLAGS.output, "{}.json".format(os.path.basename(wav_file)))
                print(outputfile)
                print('JSON FILE output : {}'.format(outputfile))

                with open(outputfile, 'w') as outfile:
                    json.dump(jsout, outfile)

        if writer:
            writer.close()
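
# A minimal sketch (an addition, not part of the original example) of reading the
# TFRecord written above and recovering the per-second quantized embeddings as a
# (num_seconds, 128) uint8 array.
def read_embeddings_tfrecord(tfrecord_path):
    import numpy as np
    embeddings = []
    for record in tf.python_io.tf_record_iterator(tfrecord_path):
        seq_example = tf.train.SequenceExample()
        seq_example.ParseFromString(record)
        feature_list = seq_example.feature_lists.feature_list[
            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME]
        for feature in feature_list.feature:
            # Each value holds the 128 uint8 bytes of one whitened, quantized embedding.
            embeddings.append(
                np.frombuffer(feature.bytes_list.value[0], dtype=np.uint8))
    return np.stack(embeddings)
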
def extract_features(dirname, label):
    pproc = vggish_postprocess.Postprocessor(
        os.path.join(SELF_DIR, "vggish_pca_params.npz"))

    for wav_file in glob.glob(os.path.join(dirname, "*.wav")):
        print(wav_file)
        try:
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
        except Exception as e:
            # Skip files that cannot be decoded into log-mel examples.
            print("skipping {}: {}".format(wav_file, e))
            continue
        tfrecord_path = os.path.join(
            FLAGS.dest,
            os.path.basename(wav_file)[:-3] + "tfrecord")
        writer = tf.python_io.TFRecordWriter(tfrecord_path)

        with tf.Graph().as_default(), tf.Session() as sess:
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(
                sess, os.path.join(SELF_DIR, "vggish_model.ckpt"))
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)
            try:
                [embedding_batch
                 ] = sess.run([embedding_tensor],
                              feed_dict={features_tensor: examples_batch})
            except Exception as e:
                # Skip clips where inference fails (e.g. an empty example batch).
                print("skipping {}: {}".format(wav_file, e))
                continue
            postprocessed_batch = pproc.postprocess(embedding_batch)

            # Write the embeddings in windows of 10 frames (~10 s of audio).
            # Clips shorter than 10 frames are written as one shorter example;
            # trailing frames beyond the last full window are dropped.
            nBatches = len(postprocessed_batch)

            if nBatches < 10:
                nBatches = 1
            else:
                nBatches = int(nBatches / 10)

            for i in range(nBatches):
                seq_example = tf.train.SequenceExample(
                    context=tf.train.Features(
                        feature={
                            "labels":
                            tf.train.Feature(int64_list=tf.train.Int64List(
                                value=[label]))
                        }),
                    feature_lists=tf.train.FeatureLists(
                        feature_list={
                            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                            tf.train.FeatureList(feature=[
                                tf.train.Feature(bytes_list=tf.train.BytesList(
                                    value=[embedding.tobytes()]))
                                for embedding in postprocessed_batch[i * 10:i *
                                                                     10 + 10]
                            ])
                        }))

                if writer:
                    writer.write(seq_example.SerializeToString())

        if writer:
            writer.close()
Exemplo n.º 22
0
def main(_):

    print("please speak a word into the microphone")
    record_to_file('demo.wav')

    y, sr = librosa.load('demo.wav')
    print("sampling rate:", sr)
    print("Recorded sound wave: ")
    print(sr)

    wav_file = 'demo.wav'
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    # print(examples_batch)
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        # print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        # print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames), and
        # the rows are written as a sequence of bytes-valued features, where each
        # feature value contains the 128 bytes of the whitened quantized embedding.
        tf_seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=[
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
                }))
        # print(tf_seq_example)
        if writer:
            writer.write(tf_seq_example.SerializeToString())

    if writer:
        writer.close()

    X = []
    max_len = 10
    n_frames = len(
        tf_seq_example.feature_lists.feature_list['audio_embedding'].feature)
    # print("number of frames = ", n_frames)

    audio_frame = []
    for i in range(n_frames):
        audio_frame.append(
            np.frombuffer(
                tf_seq_example.feature_lists.feature_list['audio_embedding'].
                feature[i].bytes_list.value[0], np.uint8).astype(np.float32))

    # Pad with zeros up to max_len; truncate longer clips to keep a fixed LSTM input.
    audio_frame = audio_frame[:max_len]
    pad = [np.zeros([128], np.float32) for i in range(max_len - len(audio_frame))]
    audio_frame += pad
    X.append(audio_frame)

    X = np.array(X)
    # print("Dimension before adding newaxis", X.shape)

    # X = X[newaxis,:,:]
    # print("Dimension after adding newaxis", X.shape)

    #Loading LSTM model
    m4 = load_model('src/models/1LayerLSTM__Loss=BinCE_20Epochs_july02.h5')
    p4 = m4.predict(X)

    print("Gunshot score for inference_sample: ====> ", float(p4 * 100),
          "percent confidence")
    if (p4 >= 0.51):
        print("Gunshot present in the clip")
    else:
        print("Gunshot is not present in the clip")
Exemplo n.º 23
0
def main(_):
    # Create folders, if necessary
    for p in (output_dir, log_dir, log_dir_test, log_dir_train, model_dir):
        create_dir(p)

    # allow_soft_placement lets ops fall back to another supported device;
    # log_device_placement=True would additionally print device placement info.
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True)) as sess:
        now = datetime.datetime.now().isoformat().replace(":", "_")
        fmt = logging.Formatter(
            '%(asctime)s:%(name)s:%(levelname)s:%(message)s', '%Y%m%d-%H%M%S')

        # TF logger
        tflog = logging.getLogger('tensorflow')
        tflog.setLevel(log_level)
        tflog_fh = logging.FileHandler(
            os.path.join(log_dir,
                         "{}-{}-tf.log".format(FLAGS.model_version, now)))
        tflog_fh.setLevel(log_level)
        tflog_fh.setFormatter(fmt)
        tflog_sh = logging.StreamHandler(sys.stdout)
        tflog_sh.setLevel(log_level)
        tflog_sh.setFormatter(fmt)
        tflog.addHandler(tflog_fh)
        tflog.addHandler(tflog_sh)

        # Root logger
        log = logging.getLogger()
        log.setLevel(log_level)
        root_fh = logging.FileHandler(
            os.path.join(log_dir,
                         "{}-{}-run.log".format(FLAGS.model_version, now)))
        root_fh.setFormatter(fmt)
        root_fh.setLevel(log_level)
        root_sh = logging.StreamHandler(sys.stdout)
        root_sh.setFormatter(fmt)
        root_sh.setLevel(log_level)
        log.addHandler(root_fh)
        log.addHandler(root_sh)

        start = time.time()
        log.info("Model version: {}".format(FLAGS.model_version))
        log.info("Number of epochs: {}".format(FLAGS.num_batches))
        log.info("Number of classes: {}".format(FLAGS.num_classes))
        log.info("Number of Mini batches: {}".format(FLAGS.num_mini_batches))
        log.info("Validation enabled: {}".format(FLAGS.validation))
        log.info("Size of Validation set: {}".format(FLAGS.test_size))
        log.info("Saving model after every {}th step".format(FLAGS.save_step))

        run_options = tf.RunOptions(report_tensor_allocations_upon_oom=True)

        # Define VGGish as our convolutional blocks
        embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

        # Define a shallow classification model and associated training ops on top of VGGish.
        with tf.variable_scope('mymodel'):
            # Add a fully connected layer with 100 units.
            num_units = 100
            fc = slim.fully_connected(embeddings, num_units)

            # Add a classifier layer at the end, consisting of parallel logistic
            # classifiers, one per class. This allows for multi-class tasks.
            logits = slim.fully_connected(fc,
                                          FLAGS.num_classes,
                                          activation_fn=None,
                                          scope='logits')

            # Use Sigmoid as our activation function
            tf.sigmoid(logits, name='prediction')

            log.debug("Logits: {}".format(logits))

            # Add training ops.
            with tf.variable_scope('train'):

                global_step = tf.Variable(0,
                                          name='global_step',
                                          trainable=False,
                                          collections=[
                                              tf.GraphKeys.GLOBAL_VARIABLES,
                                              tf.GraphKeys.GLOBAL_STEP
                                          ])

                # Labels are assumed to be fed as a batch of multi-hot vectors,
                # with a 1 in the position of each positive class label and 0
                # elsewhere, e.g.:
                #   Accipiter_gentilis  --> [1, 0, 0]
                #   Cygnus_olor         --> [0, 1, 0]
                #   Regulus_regulus     --> [0, 0, 1]
                # (see the encoding sketch after this example)
                labels = tf.placeholder(tf.float32,
                                        shape=(None, FLAGS.num_classes),
                                        name='labels')

                # Cross-entropy label loss.
                xent = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                               labels=labels,
                                                               name='xent')
                loss = tf.reduce_mean(xent, name='loss_op')
                tf.summary.scalar('loss', loss)

                # We use the same optimizer and hyperparameters as used to train VGGish.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=vggish_params.LEARNING_RATE,
                    epsilon=vggish_params.ADAM_EPSILON)
                optimizer.minimize(loss,
                                   global_step=global_step,
                                   name='train_op')

            # Add evaluation ops.
            with tf.variable_scope("evaluation"):
                prediction = tf.argmax(logits, 1)
                correct_prediction = tf.equal(tf.argmax(logits, 1),
                                              tf.argmax(labels, 1))
                accuracy = tf.reduce_mean(
                    tf.cast(correct_prediction, tf.float32))

        # Create a summarizer that summarizes loss and accuracy
        # TODO: Fix validation loss summary
        tf.summary.scalar("Accuracy", accuracy)
        # Add average loss summary over entire batch
        tf.summary.scalar("Loss", tf.reduce_mean(xent))
        # Merge all the summaries and write them out to /tmp/mnist_logs (by default)
        summary_op = tf.summary.merge_all()

        # TensorBoard stuff
        train_writer = tf.summary.FileWriter(log_dir_train, sess.graph)
        validation_writer = tf.summary.FileWriter(log_dir_test, sess.graph)

        #tf.global_variables_initializer().run()

        # Initialize all variables in the model, and then load the pre-trained
        # VGGish checkpoint.
        sess.run(tf.global_variables_initializer())
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        # Locate all the tensors and ops we need for the training loop.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        output_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
        global_step_tensor = sess.graph.get_tensor_by_name(
            'mymodel/train/global_step:0')
        loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
        train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')

        # Load all input with corresponding labels
        log.info("Loading data set and mapping birds to training IDs...")
        all_examples, all_labels = load_spectrogram(os.path.join(data_dir),
                                                    log)

        # Create training and test sets
        X_train_entire, X_validation_entire, y_train_entire, y_validation_entire = sk.train_test_split(
            all_examples, all_labels, test_size=FLAGS.test_size)

        # Test set stays the same throughout all epochs
        (X_validation,
         y_validation) = get_random_batches(X_validation_entire,
                                            y_validation_entire, log)

        # Start training
        for step in range(FLAGS.num_batches):
            log.info("######## Epoch {}/{} started ########".format(
                step + 1, FLAGS.num_batches))

            # Shuffle the order of input examples to foster generalization
            (X_train, y_train) = get_random_batches(X_train_entire,
                                                    y_train_entire, log)

            # Train on n minibatches per epoch (use integer division so the
            # slice indices below are ints).
            minibatch_n = FLAGS.num_mini_batches
            minibatch_size = len(X_train) // minibatch_n
            if minibatch_size <= 0:
                log.error(
                    "Size of minibatch too small ({}), choose a smaller number of minibatches or use more classes!"
                    .format(minibatch_size))
                sys.exit(1)

            counter = 1
            for i in range(0, len(X_train), minibatch_size):
                log.info("(Epoch {}/{}) ==> Minibatch {} started ...".format(
                    step + 1, FLAGS.num_batches, counter))

                # Get pair of (X, y) of the current minibatch/chunk
                X_train_mini = X_train[i:i + minibatch_size]
                y_train_mini = y_train[i:i + minibatch_size]

                log.info("Size of mini batch (features): {}".format(
                    len(X_train_mini)))
                log.info("Size of mini batch (labels): {}".format(
                    len(y_train_mini)))

                # Actual execution of the graph
                [summary, num_steps, loss, _, train_acc,
                 temp] = sess.run([
                     summary_op, global_step_tensor, loss_tensor, train_op,
                     accuracy, prediction
                 ],
                                  feed_dict={
                                      features_tensor: X_train_mini,
                                      labels_tensor: y_train_mini
                                  },
                                  options=run_options)

                train_writer.add_summary(summary, step * minibatch_size + i)
                log.info("Loss in minibatch: {} ".format(loss))
                log.info(
                    "Training accuracy in minibatch: {}".format(train_acc))

                log.info(
                    "(Epoch {}/{}) ==> Minibatch {} finished ...\n".format(
                        step + 1, FLAGS.num_batches, counter))
                counter += 1

                # Test set mini batching
                minibatch_valid_size = 4
                val_acc_entire = 0.
                for j in range(0, len(X_validation), minibatch_valid_size):
                    X_validation_mini = X_validation[j:j +
                                                     minibatch_valid_size]
                    y_validation_mini = y_validation[j:j +
                                                     minibatch_valid_size]

                    summary, val_acc, pred, corr_pred = sess.run(
                        [summary_op, accuracy, prediction, correct_prediction],
                        feed_dict={
                            features_tensor: X_validation_mini,
                            labels_tensor: y_validation_mini
                        },
                        options=run_options)
                    val_acc_entire += val_acc

                    validation_writer.add_summary(
                        summary, step * minibatch_valid_size + j)

                num_val_batches = -(-len(X_validation) // minibatch_valid_size)
                average_val_acc = val_acc_entire / num_val_batches
                log.info("Epoch {} -- Validation Accuracy: {}".format(
                    step + 1, average_val_acc))
                log.debug("Correct prediction: {}".format(corr_pred))
            # Save model to disk.
            saver = tf.train.Saver()
            if step % FLAGS.save_step == 0:
                save_path = saver.save(sess,
                                       os.path.join(
                                           model_dir,
                                           "jibjib_model-{}.ckpt".format(
                                               FLAGS.model_version)),
                                       global_step=step)
                log.info("Model saved to {}".format(save_path))

        now = datetime.datetime.now().isoformat().replace(":",
                                                          "_").split(".")[0]
        end = time.time()
        out = "Training finished after {}s".format(end - start)
        log.info(out)
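
# Minimal sketch (an assumption, not from the original code) of the multi-hot
# label encoding described in the comments above: one slot per class, 1 for each
# positive class, 0 elsewhere.
def encode_multi_hot(positive_class_ids, num_classes):
    import numpy as np
    label = np.zeros(num_classes, dtype=np.float32)
    label[list(positive_class_ids)] = 1.0
    return label

# e.g. encode_multi_hot([1], 3) -> [0., 1., 0.]  (Cygnus_olor in the mapping above)
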
Exemplo n.º 24
0
def get_batch(labeled_data, x, batch_size):
    # Batching helper; the function header here is an assumption, since only
    # the body survives in this snippet. Slices out the x-th batch of
    # (example, label) pairs.
    batch_labeled_data = labeled_data[x * batch_size:min(
        (x + 1) * batch_size, len(labeled_data))]
    features = [example for (example, _) in batch_labeled_data]
    labels = [label for (_, label) in batch_labeled_data]
    return (features, labels)


f2 = h5py.File('val.hdf5', 'r')
val_data, val_label = f2['val_data'], f2['val_label']
val_labeled_data = list(zip(val_data, val_label))

f3 = h5py.File('test.hdf5', 'r')
test_data, test_label = f3['test_data'], f3['test_label']
test_labeled_data = list(zip(test_data, test_label))

# Define VGGish.
embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

# Define a shallow classification model and associated training ops on top of VGGish.
# Add a fully connected layer with FLAGS.num_units units.
num_units = FLAGS.num_units

fc = slim.fully_connected(embeddings, num_units)

# Add a classifier layer at the end, consisting of parallel logistic classifiers, one per class. This allows for multi-class tasks.
logits = slim.fully_connected(fc,
                              _NUM_CLASSES,
                              activation_fn=None,
                              scope='logits')
# logits = tf.sigmoid(logits, name='prediction')

# Add training ops.
Exemplo n.º 25
0
def main(unused_argv):
    print("Input file: " + FLAGS.input_video_label)
    print("Output tfrecord file: " + FLAGS.tfrecord_file)

    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    for wav_file, st_time, end_time, label in csv.reader(open(
            FLAGS.input_video_label),
                                                         delimiter='\t'):
        print(wav_file, st_time, end_time, label)
        if (os.path.isfile(wav_file)):
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            #print(examples_batch)
            pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

            with tf.Graph().as_default(), tf.Session() as sess:
                # Define the model in inference mode, load the checkpoint, and
                # locate input and output tensors.
                vggish_slim.define_vggish_slim(training=False)
                vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
                features_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.INPUT_TENSOR_NAME)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    vggish_params.OUTPUT_TENSOR_NAME)

                # Run inference and postprocessing.
                [embedding_batch
                 ] = sess.run([embedding_tensor],
                              feed_dict={features_tensor: examples_batch})
                #print(embedding_batch)
                postprocessed_batch = pproc.postprocess(embedding_batch)
                #print(postprocessed_batch)

                # Write the postprocessed embeddings as a SequenceExample, in a similar
                # format as the features released in AudioSet. Each row of the batch of
                # embeddings corresponds to roughly a second of audio (96 10ms frames), and
                # the rows are written as a sequence of bytes-valued features, where each
                # feature value contains the 128 bytes of the whitened quantized embedding.
                seq_example = tf.train.SequenceExample(
                    context=tf.train.Features(
                        feature={
                            vggish_params.LABELS_FEATURE_KEY:
                            _int64_list_feature(sorted(map(int, label))),
                            vggish_params.VIDEO_FILE_KEY_FEATURE_KEY:
                            _bytes_feature(_make_bytes(map(ord, wav_file))),
                        }),
                    feature_lists=tf.train.FeatureLists(
                        feature_list={
                            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                            tf.train.FeatureList(feature=[
                                tf.train.Feature(bytes_list=tf.train.BytesList(
                                    value=[embedding.tobytes()]))
                                for embedding in postprocessed_batch
                            ])
                        }))

                #print(seq_example)
                if writer:
                    writer.write(seq_example.SerializeToString())

            tf.reset_default_graph()

    if writer:
        writer.close()
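
# The helpers _int64_list_feature, _bytes_feature and _make_bytes are referenced
# above but not shown in this snippet. These are plausible minimal definitions
# (an assumption based on common tf.train.Example usage, not the original code).
def _int64_list_feature(int64_list):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=int64_list))


def _bytes_feature(value):
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _make_bytes(int_array):
    # Pack a list of small integers (e.g. ord() values) into a bytes object.
    return bytes(bytearray(int_array))
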
Exemplo n.º 26
0
def main(_):
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define VGGish.
        embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

        # Define a shallow classification model and associated training ops on top
        # of VGGish.
        with tf.variable_scope("mymodel"):
            # Add a fully connected layer with 100 units.
            num_units = 100
            fc = slim.fully_connected(embeddings, num_units)

            # Add a classifier layer at the end, consisting of parallel logistic
            # classifiers, one per class. This allows for multi-class tasks.
            logits = slim.fully_connected(fc,
                                          _NUM_CLASSES,
                                          activation_fn=None,
                                          scope="logits")
            tf.sigmoid(logits, name="prediction")

            # Add training ops.
            with tf.variable_scope("train"):
                global_step = tf.Variable(
                    0,
                    name="global_step",
                    trainable=False,
                    collections=[
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        tf.GraphKeys.GLOBAL_STEP,
                    ],
                )

                # Labels are assumed to be fed as a batch multi-hot vectors, with
                # a 1 in the position of each positive class label, and 0 elsewhere.
                labels = tf.placeholder(tf.float32,
                                        shape=(None, _NUM_CLASSES),
                                        name="labels")

                # Cross-entropy label loss.
                xent = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                               labels=labels,
                                                               name="xent")
                loss = tf.reduce_mean(xent, name="loss_op")
                tf.summary.scalar("loss", loss)

                # We use the same optimizer and hyperparameters as used to train VGGish.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=vggish_params.LEARNING_RATE,
                    epsilon=vggish_params.ADAM_EPSILON,
                )
                optimizer.minimize(loss,
                                   global_step=global_step,
                                   name="train_op")

        # Initialize all variables in the model, and then load the pre-trained
        # VGGish checkpoint.
        sess.run(tf.global_variables_initializer())
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        # Locate all the tensors and ops we need for the training loop.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        labels_tensor = sess.graph.get_tensor_by_name("mymodel/train/labels:0")
        global_step_tensor = sess.graph.get_tensor_by_name(
            "mymodel/train/global_step:0")
        loss_tensor = sess.graph.get_tensor_by_name("mymodel/train/loss_op:0")
        train_op = sess.graph.get_operation_by_name("mymodel/train/train_op")

        # The training loop.
        for _ in range(FLAGS.num_batches):
            (features, labels) = _get_examples_batch()
            [num_steps, loss, _] = sess.run(
                [global_step_tensor, loss_tensor, train_op],
                feed_dict={
                    features_tensor: features,
                    labels_tensor: labels
                },
            )
            print("Step %d: loss %g" % (num_steps, loss))
Exemplo n.º 27
0
def main(_):
  # Run every .wav file found in the input directory through the model and
  # write one JSON file of embeddings per audio clip.

  #Read in .wav files from input directory, create array of wav_file names

  wav_file_direc = "./audio_input/"
  embedding_direc = "./json_output/"
  checkpoint = "vggish_model.ckpt"
  pca_params = "vggish_pca_params.npz"


  wav_files = listdir(wav_file_direc)
  
  #Initialize array of batches and read each wav_file in wav_files array
  batches = []

  for wav_file in wav_files:
    if "wav" in wav_file:
      print(join(wav_file_direc,wav_file))
      examples_batch = vggish_input.wavfile_to_examples(join(wav_file_direc,wav_file))
      batches.append(examples_batch)

  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(pca_params)

  output_dicts = []
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)


    output_sequences = []
    #Create a JSON output file for each audio file
    for batch in batches:
      # Run inference and postprocessing.
      [embedding_batch] = sess.run([embedding_tensor],
                                   feed_dict={features_tensor: batch})
      postprocessed_batch = pproc.postprocess(embedding_batch)

      seq_example = tf.train.SequenceExample(
          feature_lists=tf.train.FeatureLists(
              feature_list={
                  vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                      tf.train.FeatureList(
                          feature=[
                              tf.train.Feature(
                                  bytes_list=tf.train.BytesList(
                                      value=[embedding.tobytes()]))
                              for embedding in postprocessed_batch
                          ]
                      )
              }
          )
      )

      output_sequences.append(seq_example)

  # SequenceExample protos are not JSON-serializable, so convert them to dicts.
  from google.protobuf.json_format import MessageToDict
  wav_names = [f for f in wav_files if "wav" in f]
  for i in range(0, len(wav_names)):
    with open(join(embedding_direc, wav_names[i][:-3]) + "json", 'w') as outfile:
      json.dump(MessageToDict(output_sequences[i]), outfile)
Exemplo n.º 28
0
def main(unused_argv):
    extractor = feature_extractor.YouTube8MFeatureExtractor(FLAGS.model_dir)
    writer = tf.python_io.TFRecordWriter(FLAGS.output_tfrecords_file)
    total_written = 0
    total_error = 0
    for video_file, labels in csv.reader(open(FLAGS.input_videos_csv)):
        rgb_features = []
        for rgb in frame_iterator(video_file,
                                  every_ms=1000.0 / FLAGS.frames_per_second):
            features = extractor.extract_rgb_frame_features(rgb[:, :, ::-1])
            rgb_features.append(_bytes_feature(quantize(features)))

        if not rgb_features:
            print >> sys.stderr, 'Could not get features for ' + video_file
            total_error += 1
            continue

        # Create SequenceExample proto and write to output.
        feature_list = {
            FLAGS.image_feature_key:
            tf.train.FeatureList(feature=rgb_features),
        }
        if FLAGS.insert_zero_audio_features:
            try:
                wav_file = video_file + '.wav'
                examples_batch = vggish_input.wavfile_to_examples(wav_file)
                pproc = vggish_postprocess.Postprocessor(
                    'vggish_pca_params.npz')
                with tf.Graph().as_default(), tf.Session() as sess:
                    # Define the model in inference mode, load the checkpoint, and
                    # locate input and output tensors.
                    vggish_slim.define_vggish_slim(training=False)
                    vggish_slim.load_vggish_slim_checkpoint(
                        sess, 'vggish_model.ckpt')
                    features_tensor = sess.graph.get_tensor_by_name(
                        vggish_params.INPUT_TENSOR_NAME)
                    embedding_tensor = sess.graph.get_tensor_by_name(
                        vggish_params.OUTPUT_TENSOR_NAME)
                    [embedding_batch
                     ] = sess.run([embedding_tensor],
                                  feed_dict={features_tensor: examples_batch})
                    postprocessed_batch = pproc.postprocess(embedding_batch)
                    feature_list['audio'] = tf.train.FeatureList(feature=[
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
            except Exception:
                # Fall back to all-zero audio features when the WAV is missing
                # or cannot be embedded.
                feature_list['audio'] = tf.train.FeatureList(
                    feature=[_bytes_feature(_make_bytes([0] * 128))] *
                    len(rgb_features))

        example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    FLAGS.labels_feature_key:
                    _int64_list_feature(sorted(map(int, labels.split(';')))),
                    FLAGS.video_file_key_feature_key:
                    _bytes_feature(_make_bytes(map(ord, video_file))),
                }),
            feature_lists=tf.train.FeatureLists(feature_list=feature_list))
        writer.write(example.SerializeToString())
        total_written += 1

    writer.close()
    print('Successfully encoded %i out of %i videos' %
          (total_written, total_written + total_error))
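
# quantize() and frame_iterator() are referenced above but not shown here. A
# plausible sketch of quantize (an assumption, not the original code): clip the
# float features to [-2, 2] and pack them into uint8 bytes for a BytesList.
def quantize(features, min_quantized_value=-2.0, max_quantized_value=2.0):
    import numpy as np
    features = np.clip(features, min_quantized_value, max_quantized_value)
    scaled = (features - min_quantized_value) * (
        255.0 / (max_quantized_value - min_quantized_value))
    return scaled.astype(np.uint8).tobytes()
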
Exemplo n.º 29
0
def main(wav_file=None,
         checkpoint='audioset/vggish_model.ckpt',
         pca_params='audioset/vggish_pca_params.npz',
         tfrecord_file=None):
    # In this simple example, we run the examples from a single audio file through
    # the model. If none is provided, we generate a synthetic input.
    if not wav_file:
        # Write a WAV of a sine wav into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    # print(examples_batch.shape)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    # writer = tf.python_io.TFRecordWriter(
    #     tfrecord_file) if tfrecord_file else None

    # with tf.Graph().as_default(), tf.Session() as sess:
    with tf.Graph().as_default():
        # config = tf.ConfigProto()
        # restrict tensorflow memory usage
        config = tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.2),\
            allow_soft_placement=True)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        # print(embedding_batch.shape)
        sess.close()
        postprocessed_batch = pproc.postprocess(embedding_batch)
        # print(postprocessed_batch.shape)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames), and
        # the rows are written as a sequence of bytes-valued features, where each
        # feature value contains the 128 bytes of the whitened quantized embedding.
        # seq_example = tf.train.SequenceExample(
        #     feature_lists=tf.train.FeatureLists(
        #         feature_list={
        #             vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
        #                 tf.train.FeatureList(
        #                     feature=[
        #                         tf.train.Feature(
        #                             bytes_list=tf.train.BytesList(
        #                                 value=[embedding.tobytes()]))
        #                         for embedding in postprocessed_batch
        #                     ]
        #                 )
        #         }
        #     )
        # )
        # print(seq_example)
        # if writer:
        #   writer.write(seq_example.SerializeToString())
    # if writer:
    #   writer.close()
    tf.reset_default_graph()
    return embedding_batch, postprocessed_batch
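
# Hypothetical usage (the WAV path is an assumption, not from the original
# example): main() returns both the raw float embeddings and the whitened,
# quantized embeddings, one 128-d row per ~0.96 s patch.
if __name__ == '__main__':
    embedding, postprocessed = main('example_clip.wav')
    print(embedding.shape)      # e.g. (num_patches, 128), float32
    print(postprocessed.shape)  # e.g. (num_patches, 128), uint8
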
Exemplo n.º 30
0
def main(_):
    (train_addrs, train_labels, val_addrs, val_labels, test_addrs,
     test_labels) = utils.adressLabelSort('sortedTestAudio2')
    addr = train_addrs
    embedding_labels = train_labels
    print('number of addr: ', len(addr))
    print('number of labels: ', len(embedding_labels))

    (examples_batch,
     embedding_labels) = utils._get_batch(addr, embedding_labels)

    tfrecords_filename = 'Evalval1.tfrecords'
    writer = tf.python_io.TFRecordWriter(tfrecords_filename)

    # restricting memory usage, TensorFlow is greedy and will use all memory otherwise
    config = tf.ConfigProto()
    #config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allocator_type = 'BFC'
    #config.gpu_options.allow_growth = True
    config.gpu_options.per_process_gpu_memory_fraction = 0.90

    with tf.Graph().as_default(), tf.Session(config=config) as sess:

        vggish_slim.define_vggish_slim(
            training=False)  # Defines the VGGish TensorFlow model.
        vggish_slim.load_vggish_slim_checkpoint(
            sess, 'vggish_model.ckpt'
        )  # Loads a pre-trained VGGish-compatible checkpoint.

        # locate input and output tensors.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        feed_dict = {features_tensor: examples_batch}

        [embedding_batch] = sess.run([embedding_tensor], feed_dict=feed_dict)

        print('example_batch shape: ', examples_batch.shape)
        print('embedding_batch shape: ', embedding_batch.shape)
        print('labels_batch shape: ', len(embedding_labels))

        # store the data to the TFRecords file.
        for i in range(len(embedding_batch)):
            embedding = embedding_batch[i]

            # Convert into the proper data types: the label for this clip and
            # the raw bytes of its 128-d embedding.
            embedding_label = embedding_labels[i]
            embedding_raw = embedding.tostring()

            # Create a feature
            feature = {
                'Evalval1/labels': utils._int64_feature(embedding_label),
                'Evalval1/embedding': utils._bytes_feature(embedding_raw)
            }

            # Create an example protocol buffer
            example = tf.train.Example(features=tf.train.Features(
                feature=feature))
            # Serialize to string and write on the file

            writer.write(example.SerializeToString())

        writer.close()
        sys.stdout.flush()
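
# Hedged sketch (not part of the original example): parsing the 'Evalval1.tfrecords'
# file written above back into (embedding, label) pairs with a TF1-style pipeline.
def parse_evalval1(serialized):
    features = tf.parse_single_example(
        serialized,
        features={
            'Evalval1/labels': tf.FixedLenFeature([], tf.int64),
            'Evalval1/embedding': tf.FixedLenFeature([], tf.string),
        })
    # The embeddings were written as float32 bytes, so decode them back.
    embedding = tf.decode_raw(features['Evalval1/embedding'], tf.float32)
    return embedding, features['Evalval1/labels']
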
Exemplo n.º 31
0
def define_vggish(waveform):
    with tf.variable_creator_scope(var_tracker):
        features = waveform_to_features(waveform)
        return vggish_slim.define_vggish_slim(features, training=False)
Exemplo n.º 32
0
def main(_):
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define VGGish.
        embeddings = vggish_slim.define_vggish_slim(
            training=FLAGS.train_vggish)

        # Define a shallow classification model and associated training ops on top
        # of VGGish.
        with tf.variable_scope('mymodel'):
            # Add a fully connected layer with 100 units. Add an activation function
            # to the embeddings since they are pre-activation.
            num_units = 100
            fc = slim.fully_connected(tf.nn.relu(embeddings), num_units)

            # Add a classifier layer at the end, consisting of parallel logistic
            # classifiers, one per class. This allows for multi-class tasks.
            logits = slim.fully_connected(fc,
                                          _NUM_CLASSES,
                                          activation_fn=None,
                                          scope='logits')
            tf.sigmoid(logits, name='prediction')

            # Add training ops.
            with tf.variable_scope('train'):
                global_step = tf.train.create_global_step()

                # Labels are assumed to be fed as a batch multi-hot vectors, with
                # a 1 in the position of each positive class label, and 0 elsewhere.
                labels_input = tf.placeholder(tf.float32,
                                              shape=(None, _NUM_CLASSES),
                                              name='labels')

                # Cross-entropy label loss.
                xent = tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=logits, labels=labels_input, name='xent')
                loss = tf.reduce_mean(xent, name='loss_op')
                tf.summary.scalar('loss', loss)

                # We use the same optimizer and hyperparameters as used to train VGGish.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=vggish_params.LEARNING_RATE,
                    epsilon=vggish_params.ADAM_EPSILON)
                train_op = optimizer.minimize(loss, global_step=global_step)

        # Initialize all variables in the model, and then load the pre-trained
        # VGGish checkpoint.
        sess.run(tf.global_variables_initializer())
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        # The training loop.
        features_input = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        for _ in range(FLAGS.num_batches):
            (features, labels) = _get_examples_batch()
            [num_steps, loss_value,
             _] = sess.run([global_step, loss, train_op],
                           feed_dict={
                               features_input: features,
                               labels_input: labels
                           })
            print('Step %d: loss %g' % (num_steps, loss_value))
Exemplo n.º 33
0
def main(_):
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define VGGish.
    embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

    # Define a shallow classification model and associated training ops on top
    # of VGGish.
    with tf.variable_scope('mymodel'):
      # Add a fully connected layer with 100 units.
      num_units = 100
      fc = slim.fully_connected(embeddings, num_units)

      # Add a classifier layer at the end, consisting of parallel logistic
      # classifiers, one per class. This allows for multi-class tasks.
      logits = slim.fully_connected(
          fc, _NUM_CLASSES, activation_fn=None, scope='logits')
      tf.sigmoid(logits, name='prediction')

      # Add training ops.
      with tf.variable_scope('train'):
        global_step = tf.Variable(
            0, name='global_step', trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                         tf.GraphKeys.GLOBAL_STEP])

        # Labels are assumed to be fed as a batch multi-hot vectors, with
        # a 1 in the position of each positive class label, and 0 elsewhere.
        labels = tf.placeholder(
            tf.float32, shape=(None, _NUM_CLASSES), name='labels')

        # Cross-entropy label loss.
        xent = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=labels, name='xent')
        loss = tf.reduce_mean(xent, name='loss_op')
        tf.summary.scalar('loss', loss)

        # We use the same optimizer and hyperparameters as used to train VGGish.
        optimizer = tf.train.AdamOptimizer(
            learning_rate=vggish_params.LEARNING_RATE,
            epsilon=vggish_params.ADAM_EPSILON)
        optimizer.minimize(loss, global_step=global_step, name='train_op')

    # Initialize all variables in the model, and then load the pre-trained
    # VGGish checkpoint.
    sess.run(tf.global_variables_initializer())
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

    # Locate all the tensors and ops we need for the training loop.
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
    global_step_tensor = sess.graph.get_tensor_by_name(
        'mymodel/train/global_step:0')
    loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
    train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')

    # The training loop.
    for _ in range(FLAGS.num_batches):
      (features, labels) = _get_examples_batch()
      [num_steps, loss, _] = sess.run(
          [global_step_tensor, loss_tensor, train_op],
          feed_dict={features_tensor: features, labels_tensor: labels})
      print('Step %d: loss %g' % (num_steps, loss))