def extract_audioset_features(ids, id2audio_path, id2label):
    # Accumulate log-mel examples, ground-truth labels and identifiers for every audio file.
    first_audio = True
    for i in ids:
        if first_audio:
            input_data = vggish_input.wavfile_to_examples(id2audio_path[i])
            ground_truth = np.repeat(id2label[i], input_data.shape[0], axis=0)
            identifiers = np.repeat(i, input_data.shape[0], axis=0)
            first_audio = False
        else:
            tmp_in = vggish_input.wavfile_to_examples(id2audio_path[i])
            input_data = np.concatenate((input_data, tmp_in), axis=0)
            tmp_gt = np.repeat(id2label[i], tmp_in.shape[0], axis=0)
            ground_truth = np.concatenate((ground_truth, tmp_gt), axis=0)
            tmp_id = np.repeat(i, tmp_in.shape[0], axis=0)
            identifiers = np.concatenate((identifiers, tmp_id), axis=0)

    # Load VGGish and run the whole batch through the embedding layer.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        extracted_feat = sess.run([embedding_tensor],
                                  feed_dict={features_tensor: input_data})
        feature = np.squeeze(np.asarray(extracted_feat))

    return [feature, ground_truth, identifiers]
Example #2
def _folder_to_mel(path):
    os.chdir(path)
    files = os.listdir(path)
    sound_examples = vggish_input.wavfile_to_examples(files[0])
    for i in range(1, len(files)):
        sound_examples = np.concatenate(
            (sound_examples, vggish_input.wavfile_to_examples(files[i])))
    return sound_examples
Example #3
def extract_n_predict(input_wav_file, pca_params, checkpoint, checkpoint_file, train_dir, output_file):
    print("Input file: " + input_wav_file)

    if os.path.isfile(input_wav_file):
        examples_batch = vggish_input.wavfile_to_examples(input_wav_file)
        pproc = vggish_postprocess.Postprocessor(pca_params)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run([embedding_tensor],
                                         feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            num_frames_batch_val = np.array([postprocessed_batch.shape[0]], dtype=np.int32)

            # Dequantize the embeddings into the fixed-size (1, 300, 128) batch
            # expected by the downstream classifier.
            video_batch_val = np.zeros((1, 300, 128), dtype=np.float32)
            video_batch_val[0, 0:postprocessed_batch.shape[0], :] = utils.Dequantize(
                postprocessed_batch.astype(float), 2, -2)

            predicted_class = inference(video_batch_val, num_frames_batch_val,
                                        checkpoint_file, train_dir, output_file)

        tf.reset_default_graph()
        return predicted_class
Example #4
    def extraccion_embeddings_audio(self, rutaAudio, embeddings):
        '''Extract, from a single audio file, the mean and standard deviation of the
        128 embeddings, returning a 256-feature vector per audio file, or the 128
        spectral bins if embeddings is False.

        Parameters
        ----------
        rutaAudio : str
            Path of the audio file.
        embeddings : boolean
            If True, return embedding statistics; if False, return the spectra.

        Returns
        -------
        final_ccas : numpy.array
            Feature array for the audio file: embeddings or spectra.
        '''
        # 1. Compute the log-mel spectrogram examples.
        input_batch = vggish_input.wavfile_to_examples(rutaAudio)

        # 2. Produce the embeddings with the model, or the spectra, as requested.
        if embeddings:
            ccas = self.model.predict(input_batch[:, :, :, None])
        else:
            # "Break up" the 0.96 s groups.
            ccas = input_batch.reshape(-1, input_batch.shape[-1])

        # 3. Take the mean and standard deviation of the embeddings.
        media = np.mean(ccas, axis=0)
        desvs = np.std(ccas, axis=0)
        final_ccas = np.append(media, desvs)

        return final_ccas
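
The mean/std pooling above does not depend on VGGish itself. A minimal numpy sketch, with a made-up (num_examples, 128) matrix standing in for self.model.predict, shows how the 256-feature vector is assembled:

import numpy as np

ccas = np.random.rand(7, 128)          # stand-in for the per-example embeddings
media = np.mean(ccas, axis=0)          # shape (128,)
desvs = np.std(ccas, axis=0)           # shape (128,)
final_ccas = np.append(media, desvs)   # shape (256,)
assert final_ccas.shape == (256,)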
Example #5
def main(_):
  with open(FLAGS.wav_files) as f:
      files_list = [line.replace('\n', '') for line in f]

  n_files = len(files_list)
  output_embedding = np.zeros((n_files, 128))
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
  processed_fnames = []
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    for n_file, wav_file in enumerate(files_list):
        examples_batch = vggish_input.wavfile_to_examples(wav_file)
        print(n_file, '/', n_files)

        if examples_batch.shape[0] == 0:
          with open('bad_files.log', 'a') as logf:
            logf.write(wav_file + '\n')
        else:
          processed_fnames.append(wav_file)

          [embedding_batch] = sess.run([embedding_tensor],
                                       feed_dict={features_tensor: examples_batch})
          postprocessed_batch = pproc.postprocess(embedding_batch)
          postprocessed_batch_mean = np.mean(postprocessed_batch, axis=0)
          output_embedding[n_file, :] = postprocessed_batch_mean
      
    np.save(FLAGS.npy_file, output_embedding)
Example #6
def extract_and_predict(wav):
    print("Boom from PYTHON!!!")

    # tf.enable_v2_behavior()
    # loaded_model = tf2.saved_model.load(saved_model_path)
    # print("I can load model now!!!")

    wav_file = wav
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    print("Jerry audio_to_prediction.py: after wavfile_to_examples")


    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor()

    print("Jerry audio_to_prediction.py: after pproc")

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        print("Jerry audio_to_prediction.py: after load vggish_slim")
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                    feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        postprocessed_batch = [postprocessed_batch[i] for i in range(len(postprocessed_batch))]
    pred_each_n_seconds = predict_with_saved_model(postprocessed_batch)
    return str(pred_each_n_seconds)
Example #7
def extract_and_predict(wav):
    wav_file = wav
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor()

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        postprocessed_batch = [
            postprocessed_batch[i] for i in range(len(postprocessed_batch))
        ]
    pred_each_n_seconds = predict_with_saved_model(postprocessed_batch)
    print(str(pred_each_n_seconds))
Example #8
def extract(wav_file):
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    pproc = vggish_postprocess.Postprocessor(
        '/storage/haibn/yt8m/code/video_classification/feature_extractor/vggish/vggish_pca_params.npz'
    )

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(
            sess,
            '/storage/haibn/yt8m/code/video_classification/feature_extractor/vggish/vggish_model.ckpt'
        )
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

    return postprocessed_batch
Example #9
def main(_):
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    # pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
    # print(FLAGS.checkpoint)
    # print(os.getcwd())
    # print(path.exists(FLAGS.checkpoint))
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        # embedding_batch holds the extracted embeddings.
        with open(path.splitext(wav_file)[0] + '.pk', "wb") as f:
            pickle.dump(embedding_batch, f)
Example #10
def main(_):
  # In this simple example, we run the examples from a single audio file through
  # the model. If none is provided, we generate a synthetic input.

  wav_file = FLAGS.wav_file

  examples_batch = vggish_input.wavfile_to_examples(wav_file)

  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor], feed_dict={features_tensor: examples_batch})

    postprocessed_batch = pproc.postprocess(embedding_batch)
    print(postprocessed_batch)
    np.save("/postprocessed_batch.npy", postprocessed_batch)
Example #11
def extract_vggish_features(wav_path):
    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.wavfile_to_examples(wav_path)
    if input_batch.shape[0] < 1:
        print('{}: Audio sample shorter than 1 second. Ignoring ...'.format(
            os.path.basename(wav_path)))
        return None

    # print('Log Mel Spectrogram example: ', input_batch[0])

    # Define VGGish, load the checkpoint, and run the batch through the model to
    # produce embeddings.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: input_batch})
    # Postprocess the results to produce whitened quantized embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    return postprocessed_batch
Example #12
def main(wav_file):
    """
    #Specify the path for the downloaded or recorded audio files and
    #also path for writing the embeddings or pickle files
    """
    if wav_file:
        pkl = wav_file[:-4] + '.pkl'
        print(pkl)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:

        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

    predict_prob, predictions = model_function.predictions_wavfile(
        postprocessed_batch)
    K.clear_session()
    return predict_prob, predictions
Example #13
    def main(_):
        audio_files = os.listdir(audio_path)
        # maxi = 0
        for each_file in tqdm.tqdm(audio_files):
            file_nm = dest_path + each_file.split('.')[0] + '.npy'
            if not (path.exists(file_nm)):
                try:
                    wav_file = audio_path + each_file
                    examples_batch = vggish_input.wavfile_to_examples(wav_file)

                    with tf.Graph().as_default(), tf.Session() as sess:
                        vggish_slim.define_vggish_slim(training=False)
                        vggish_slim.load_vggish_slim_checkpoint(
                            sess, FLAGS.checkpoint)
                        features_tensor = sess.graph.get_tensor_by_name(
                            vggish_params.INPUT_TENSOR_NAME)
                        embedding_tensor = sess.graph.get_tensor_by_name(
                            vggish_params.OUTPUT_TENSOR_NAME)
                        [embedding_batch] = sess.run(
                            [embedding_tensor],
                            feed_dict={features_tensor: examples_batch})
                        postprocessed_batch = embedding_batch
                        #indices = np.linspace(0, len(postprocessed_batch), max_frames, endpoint=False, dtype=int)
                        #postprocessed_batch = postprocessed_batch[indices]
                        np.save(dest_path + each_file.split('.')[0] + '.npy',
                                postprocessed_batch)
                except Exception as e:
                    print('Failed to extract features for {}: {}'.format(each_file, e))
                    continue
Example #14
def main(_):
    opt = parse_opt()
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
    writer = tf.python_io.TFRecordWriter(FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
        keys, values = ['train', 'val', 'test'], [opt.train_range, opt.val_range, opt.test_range]

        for i in range(3):
            h5_path = opt.feat_h5 + '2016' + '_' + keys[i] + '_' + opt.type + '.h5'
            if os.path.exists(h5_path):
                os.remove(h5_path)
            h5 = h5py.File(h5_path, 'w')
            dataset_feats = h5.create_dataset(
                'feats', ((values[i][1] - values[i][0] + 1), opt.feat_size), dtype='float32')
            for audio_id in range(values[i][0], values[i][1] + 1):
                wav_file = opt.video_root + 'video' + str(audio_id) + '.mp4.wav'
                if os.path.isfile(wav_file):
                    examples_batch = vggish_input.wavfile_to_examples(wav_file)
                    [embedding_batch] = sess.run([embedding_tensor],
                                                 feed_dict={features_tensor: examples_batch})
                    # Average the per-example embeddings into one clip-level feature.
                    embedding_batch = embedding_batch.mean(0)
                    dataset_feats[audio_id - values[i][0]] = embedding_batch
            h5.close()

    if writer:
        writer.close()
Example #15
def extract_vggish_features(paths, path2gt, model):
    """Extracts VGGish features and their corresponding ground_truth and identifiers (the path).

       VGGish features are extracted from non-overlapping audio patches of 0.96 seconds, 
       where each audio patch covers 64 mel bands and 96 frames of 10 ms each.

       We repeat ground_truth and identifiers to fit the number of extracted VGGish features.
    """
    # 1) Extract log-mel spectrograms
    first_audio = True
    for p in paths:
        if first_audio:
            input_data = vggish_input.wavfile_to_examples(
                config['audio_folder'] + p)
            ground_truth = np.repeat(path2gt[p], input_data.shape[0], axis=0)
            identifiers = np.repeat(p, input_data.shape[0], axis=0)
            first_audio = False
        else:
            tmp_in = vggish_input.wavfile_to_examples(config['audio_folder'] +
                                                      p)
            input_data = np.concatenate((input_data, tmp_in), axis=0)
            tmp_gt = np.repeat(path2gt[p], tmp_in.shape[0], axis=0)
            ground_truth = np.concatenate((ground_truth, tmp_gt), axis=0)
            tmp_id = np.repeat(p, tmp_in.shape[0], axis=0)
            identifiers = np.concatenate((identifiers, tmp_id), axis=0)

    # 2) Load Tensorflow model to extract VGGish features
    tfconfig = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=True,
        device_count={'CPU': 4},
        intra_op_parallelism_threads=4,
        inter_op_parallelism_threads=2,
    )
    with tf.Graph().as_default(), tf.Session(config=tfconfig) as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(
            sess, '/kaggle/input/vggishmodel/vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        extracted_feat = sess.run([embedding_tensor],
                                  feed_dict={features_tensor: input_data})
        feature = np.squeeze(np.asarray(extracted_feat))

    return [feature, ground_truth, identifiers]
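
As the docstring above notes, wavfile_to_examples yields roughly one (96, 64) log-mel example per 0.96 s of audio, and the labels and identifiers are repeated to stay aligned with those examples. A small numpy-only sketch with fabricated example counts illustrates the alignment:

import numpy as np

batch_a = np.zeros((12, 96, 64))   # stand-in for wavfile_to_examples(path_a)
batch_b = np.zeros((5, 96, 64))    # stand-in for wavfile_to_examples(path_b)

input_data = np.concatenate((batch_a, batch_b), axis=0)
ground_truth = np.concatenate((np.repeat('label_a', batch_a.shape[0]),
                               np.repeat('label_b', batch_b.shape[0])))

# One label per log-mel example, exactly as in the loop above.
assert input_data.shape[0] == ground_truth.shape[0] == 17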
Example #16
def main(_):
    # In this simple example, we run the examples from a single audio file through
    # the model. If none is provided, we generate a synthetic input.
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params)
    vggish_params.EXAMPLE_HOP_SECONDS = (
        1 - args.overlap) * vggish_params.EXAMPLE_WINDOW_SECONDS

    # If needed, prepare a record writer_dict to store the postprocessed embeddings.

    with tf.Graph().as_default(), tf.Session(config=config) as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        movie_id = args.wav_file[args.wav_file.rfind('/') +
                                 1:args.wav_file.rfind('.')]

        examples_batch = vggish_input.wavfile_to_examples(args.wav_file)
        num_splits = min(int(examples_batch.shape[0] / 10), 100)
        num_splits = max(1, num_splits)
        examples_batch = np.array_split(examples_batch, num_splits)

        embedding_batch = []
        for i in range(num_splits):
            [batch] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: examples_batch[i]})
            embedding_batch.extend(batch)

        postprocessed_batch = pproc.postprocess(np.array(embedding_batch))

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames), and
        # the rows are written as a sequence of bytes-valued features, where each
        # feature value contains the 128 bytes of the whitened quantized embedding.
        seq_example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'movie_id':
                    tf.train.Feature(bytes_list=tf.train.BytesList(
                        value=[movie_id.encode('utf-8')]))
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=[
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
                }))
        writer = tf.python_io.TFRecordWriter(
            os.path.join(args.write_dir, movie_id + '.tfrecord'))
        writer.write(seq_example.SerializeToString())
        writer.close()
Example #17
    def __getitem__(self, idx):
        filename = self.X[idx]
        wav = wavfile_to_examples(filename)
        wav = wav.cuda()

        target = self.y[idx]

        return wav, target
Example #18
 def _preprocess(self, x, fs):
     if isinstance(x, np.ndarray):
         x = vggish_input.waveform_to_examples(x, fs)
     elif isinstance(x, str):
         x = vggish_input.wavfile_to_examples(x)
     else:
         raise AttributeError
     return x
Example #19
def main(_):
  
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        return "No wav file"
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                    feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames), and
        # the rows are written as a sequence of bytes-valued features, where each
        # feature value contains the 128 bytes of the whitened quantized embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(
                            feature=[
                                tf.train.Feature(
                                    bytes_list=tf.train.BytesList(
                                        value=[embedding.tobytes()]))
                                for embedding in postprocessed_batch
                            ]
                        )
                }
            )
        )
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()
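
The comment above describes the AudioSet release format: each row of the postprocessed batch covers roughly one second of audio and is stored as 128 quantized bytes. A short sketch, assuming a tfrecord written by the code above (the file name here is hypothetical) and that vggish_params is imported as in the other examples, reads the embeddings back:

import numpy as np
import tensorflow as tf

for record in tf.python_io.tf_record_iterator('embeddings.tfrecord'):
    seq = tf.train.SequenceExample.FromString(record)
    frames = seq.feature_lists.feature_list[
        vggish_params.AUDIO_EMBEDDING_FEATURE_NAME].feature
    # Each frame carries the 128 uint8 bytes of one whitened, quantized embedding.
    embeddings = np.array(
        [np.frombuffer(f.bytes_list.value[0], dtype=np.uint8) for f in frames])
    print(embeddings.shape)  # (num_frames, 128)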
Example #20
def readDirectory(dirname, label):
    pproc = vggish_postprocess.Postprocessor("vggish_pca_params.npz")

    for wav_file in glob.glob(dirname + "*.wav"):
        print(wav_file)
        try:
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
        except:
            continue
        writer = tf.python_io.TFRecordWriter(wav_file[:-3] + "tfrecord")

        with tf.Graph().as_default(), tf.Session() as sess:
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)
            try:
                [embedding_batch
                 ] = sess.run([embedding_tensor],
                              feed_dict={features_tensor: examples_batch})
            except:
                continue
            postprocessed_batch = pproc.postprocess(embedding_batch)

            nBatches = len(postprocessed_batch)

            if nBatches < 10:
                nBatches = 1
            else:
                nBatches = nBatches // 10

            for i in range(nBatches):
                seq_example = tf.train.SequenceExample(
                    context=tf.train.Features(
                        feature={
                            "labels":
                            tf.train.Feature(int64_list=tf.train.Int64List(
                                value=[label]))
                        }),
                    feature_lists=tf.train.FeatureLists(
                        feature_list={
                            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                            tf.train.FeatureList(feature=[
                                tf.train.Feature(bytes_list=tf.train.BytesList(
                                    value=[embedding.tobytes()]))
                                for embedding in postprocessed_batch[i * 10:i *
                                                                     10 + 10]
                            ])
                        }))

                if writer:
                    writer.write(seq_example.SerializeToString())

        if writer:
            writer.close()
Example #21
def _folder_to_mel(path):
    scaler = StandardScaler()
    os.chdir(path)
    files = os.listdir(".")
    sound_examples = vggish_input.wavfile_to_examples(files[0])
    for i in range(0, sound_examples.shape[0]):
        sound_examples[i, :, :] = scaler.fit_transform(sound_examples[i, :, :])
    sound_examples = sound_examples.reshape(sound_examples.shape[0], 96, 64, 1)
    sound_examples = np.repeat(sound_examples, 3, axis=3)
    for i in range(1, len(files)):
        if sf.SoundFile(files[i]).subtype == "PCM_16":
            temp_example = vggish_input.wavfile_to_examples(files[i])
            for j in range(0, temp_example.shape[0]):
                temp_example[j, :, :] = scaler.fit_transform(
                    temp_example[j, :, :])
            temp_example = temp_example.reshape(temp_example.shape[0], 96, 64,
                                                1)
            temp_example = np.repeat(temp_example, 3, axis=3)
            sound_examples = np.concatenate((sound_examples, temp_example))
    return sound_examples
Example #22
def main(wav_file, npz_path):
    # In this simple example, we run the examples from a single audio file through
    # the model. If none is provided, we generate a synthetic input.
    #if FLAGS.wav_file:
    #  wav_file = str(FLAGS.wav_file)
    #  print (FLAGS.wav_file)

    if wav_file:
        wav_file = wav_file
    else:
        # Write a WAV of a sine wav into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    #print(examples_batch)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(tfrecord_file)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})
        #print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        #print(postprocessed_batch)

        if 0 in embedding_batch.shape:
            print('NO')
            return 0

        np.savez_compressed(npz_path, postprocessed_batch)
    return 1
Example #23
def main(_):
    ontology_lookup = {}
    with open(ONTROLOGY, 'r') as f:
        label_json = json.load(f)
    for entry in label_json:
        label_id = entry['id'].replace('/', '_')
        assert label_id not in ontology_lookup.keys()
        ontology_lookup[label_id] = entry
    wav_paths = glob.glob(os.path.join(AUDIO_CHUNKS, '*', '*.wav'))

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    audio_tsv = []
    label_tsv = []
    emb_tsv = []
    for wavfile in tqdm(wav_paths):
        label = Path(Path(wavfile).parent).stem
        filename = Path(wavfile).name
        examples_batch = vggish_input.wavfile_to_examples(wavfile)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch
             ] = sess.run([embedding_tensor],
                          feed_dict={features_tensor: examples_batch})
            # emb = []
            # for embedding in embedding_batch:
            #     emb.append(embedding.tolist())
            emb = np.mean(embedding_batch, axis=0).tolist()

        label_tsv.append([ontology_lookup[label]['name']])
        audio_tsv.append([f'{label}/{filename}'])
        emb_tsv.append(emb)
        assert len(emb_tsv[0]) == len(emb)

    with open(f'{OUTPUTDIR}/emb.tsv', 'w') as f:
        tsv_writer = csv.writer(f, delimiter='\t')
        for emb in emb_tsv:
            tsv_writer.writerow(emb)
    with open(f'{OUTPUTDIR}/label.tsv', 'w') as f:
        tsv_writer = csv.writer(f, delimiter='\t')
        for label in label_tsv:
            tsv_writer.writerow(label)
    with open(f'{OUTPUTDIR}/audio.tsv', 'w') as f:
        tsv_writer = csv.writer(f, delimiter='\t')
        for audio_path in audio_tsv:
            tsv_writer.writerow(audio_path)
Example #24
    def _pre_process(paths):
        """Individual VGGish preprocessing process."""
        input_path, output_path = paths
        input_path_exists, output_path_exists = FeatureExtractor.feature_path_checker(
            input_path, output_path)

        if input_path_exists and not output_path_exists:
            features = vggish_input.wavfile_to_examples(
                input_path)  # can also do .ogg files
            pickle.dump(features, open(output_path, "wb"))
            del features
Example #25
def ProcessWithVGGish(sess, vgg, file_name, start=0, stop=None):
    '''Run the VGGish model on the audio in file_name (optionally restricted to
    the [start, stop] range). vgg is a dict holding the model's input
    ('features') and output ('embedding') tensors. Returns the raw embeddings
    together with the log mel spectrogram input batch.'''

    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.wavfile_to_examples(file_name, start, stop)
    # print('Log Mel Spectrogram example: ', input_batch[0])
    [embedding_batch] = sess.run([vgg['embedding']],
                                 feed_dict={vgg['features']: input_batch})

    return embedding_batch, input_batch
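
ProcessWithVGGish expects a session plus a dict holding the model's input and output tensors under the keys 'features' and 'embedding'. A sketch of how such a dict might be built, following the loading pattern of the other examples (the helper name and the usage lines are assumptions, not part of the original code):

def CreateVGGishNetwork(sess, checkpoint_path):
    # Hypothetical helper: define VGGish in inference mode, load the checkpoint,
    # and return the tensors under the keys ProcessWithVGGish reads.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
    return {
        'features': sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME),
        'embedding': sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME),
    }

# Usage sketch:
# with tf.Graph().as_default(), tf.Session() as sess:
#     vgg = CreateVGGishNetwork(sess, 'vggish_model.ckpt')
#     embedding_batch, input_batch = ProcessWithVGGish(sess, vgg, 'some_audio.wav')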
Example #26
def audio_inference(wav_file):
    # Assumes sess, features_tensor, embedding_tensor and pproc are created at
    # module level, as in the preceding examples.
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    with tf.Graph().as_default() as g:
        # Run inference and postprocessing.
        [embedding_batch
         ] = sess.run([embedding_tensor],
                      feed_dict={features_tensor: examples_batch})

        postprocessed_batch = pproc.postprocess(embedding_batch)

    return postprocessed_batch


# postprocessed_batch = audio_inference('../data/audio/00059.wav')
Example #27
def get_audio_input(wave_file_address, sess, features_tensor, embedding_tensor,
                    pproc):
    wave_file = wavfile_to_examples(wave_file_address)

    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: wave_file})

    sample4 = pproc.postprocess(embedding_batch)
    #print(np.shape(sample4))
    sample5 = align(sample4)
    sample5 = np.reshape(sample5, (1, 5, 128))

    return sample5
Example #28
 def get_examples_(self, i, audio_name):
     # audio_name = self.labeled_data.iloc[i,0]
     context_num = self.labeled_data.iloc[i, 1]
     all_examples = vggish_input.wavfile_to_examples(audio_name)
     all_labels = np.array([self.onehot_label[context_num - 1]] *
                           all_examples.shape[0])
     labeled_examples = list(zip(all_examples, all_labels))
     # Separate and return the features and labels.
     features = [example for (example, _) in labeled_examples]
     labels = [label for (_, label) in labeled_examples]
     if features is None:
         print("ERROR: None features")
         exit()
     return (features, labels, context_num)
Example #29
def add_augmented_files(X_files, y_categories):
    path_to_dataset = os.path.join(DATA_DIR, AUGMENTATION_DIR)
    augmented_features = []
    augmented_categories = []
    for sound_file in X_files:
        if sound_file[0] != '.':
            sound_category = sound_file.split('_')[0]
            path_to_directory = os.path.join(path_to_dataset, sound_category)
            path_to_file = os.path.join(path_to_directory, sound_file)
            # Get all log-mel examples of the sound file.
            features = wavfile_to_examples(path_to_file)
            features = np.array(features)
            augmented_features.append(features)
            augmented_categories.append(sound_category)
    return augmented_features, augmented_categories
Example #30
def OutputAudioEmbeddings(pathIn, row):
    video_id = row['video_id']
    video_path = row['video_path']
    split = row['split']
    full_path = os.path.join(pathIn, video_path)
    full_path = full_path.replace("%(ext)s",
                                  "wav")  # output file of the downloader path
    if split == 'train':
        full_path_cut = full_path.replace("train", "train/cut")
    elif split == 'test':
        full_path_cut = full_path.replace("test", "test/cut")

    # In this simple example, we run the examples from a single audio file through
    # the model. If none is provided, we generate a synthetic input.

    if os.path.isfile(full_path_cut):
        wav_file = full_path_cut

        examples_batch = vggish_input.wavfile_to_examples(wav_file)
        #print(examples_batch)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        # If needed, prepare a record writer to store the postprocessed embeddings.
        writer = tf.python_io.TFRecordWriter(
            FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch
             ] = sess.run([embedding_tensor],
                          feed_dict={features_tensor: examples_batch})
            #print(embedding_batch)
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(postprocessed_batch)
            #print(postprocessed_batch.shape)
            np.save(
                '/lfs01/workdirs/shams010/shams010u1/code/audio_features/' +
                split + '/' + video_id, postprocessed_batch)
Example #31
def main(_):
  # In this simple example, we run the examples from a single audio file through
  # the model. If none is provided, we generate a synthetic input.
  if FLAGS.wav_file:
    wav_file = FLAGS.wav_file
  else:
    # Write a WAV of a sine wav into an in-memory file object.
    num_secs = 5
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)
    # Convert to signed 16-bit samples.
    samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
    wav_file = six.BytesIO()
    wavfile.write(wav_file, sr, samples)
    wav_file.seek(0)
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  print(examples_batch)

  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

  # If needed, prepare a record writer to store the postprocessed embeddings.
  writer = tf.python_io.TFRecordWriter(
      FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    print(embedding_batch)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    print(postprocessed_batch)

    # Write the postprocessed embeddings as a SequenceExample, in a similar
    # format as the features released in AudioSet. Each row of the batch of
    # embeddings corresponds to roughly a second of audio (96 10ms frames), and
    # the rows are written as a sequence of bytes-valued features, where each
    # feature value contains the 128 bytes of the whitened quantized embedding.
    seq_example = tf.train.SequenceExample(
        feature_lists=tf.train.FeatureLists(
            feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(
                        feature=[
                            tf.train.Feature(
                                bytes_list=tf.train.BytesList(
                                    value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ]
                    )
            }
        )
    )
    print(seq_example)
    if writer:
      writer.write(seq_example.SerializeToString())

  if writer:
    writer.close()