Example #1
import logging
import time

import numpy as np
import tensorflow as tf

import vggish_input
import vggish_postprocess

logger = logging.getLogger(__name__)


def generate_embeddings(frm_rcv,
                        emb_snd,
                        mon_snd,
                        emb_rec_snd=None,
                        aud_cmd_rcv=None,
                        RATE=16000):
    pproc = vggish_postprocess.Postprocessor(
        '/opt/vggish/vggish_pca_params.npz')

    # Load TFLite model and allocate tensors.
    interpreter = tf.lite.Interpreter(
        model_path="/opt/soundscene/vggish.tflite")
    interpreter.allocate_tensors()

    # Get input and output tensors.
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()

    logger.info('starting embedding processor:%s', input_details[0])

    # Timestamp gate: audio is processed once time.time() >= resume_at; a
    # command can push this into the future to pause processing.
    resume_at = 0
    while True:
        try:
            mon_snd.send(time.time())

            if aud_cmd_rcv is not None and aud_cmd_rcv.poll(.01):
                resume_at = aud_cmd_rcv.recv()
                logger.info('received embedding command:%s', resume_at)

            if frm_rcv.poll(.01):
                aud_time, normalized_audio = frm_rcv.recv()
                if time.time() >= resume_at:
                    mel_samples = vggish_input.waveform_to_examples(
                        normalized_audio, RATE)
                    mel_samples = mel_samples.astype(dtype=np.float32)

                    interpreter.set_tensor(input_details[0]['index'],
                                           mel_samples)
                    interpreter.invoke()
                    output_data = interpreter.get_tensor(
                        output_details[0]['index'])

                    postprocessed_batch = pproc.postprocess(output_data)
                    t = aud_time
                    for i, _n in enumerate(postprocessed_batch):
                        emb_snd.send((t, _n))
                        if emb_rec_snd:
                            emb_rec_snd.send(_n)
                        t += 1.0  # each sample is ~ 1 sec
            else:
                time.sleep(.3)
        except Exception:
            logger.exception('embedding handler exception!:')
            raise
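Since generate_embeddings communicates only through pipe endpoints, a minimal wiring sketch may help; the producer side and all names below are hypothetical, not part of the example above.

# Hypothetical wiring sketch: the function expects Connection endpoints
# from multiprocessing.Pipe.
from multiprocessing import Pipe, Process

frm_snd, frm_rcv = Pipe()   # producer pushes (timestamp, audio) tuples
emb_snd, emb_rcv = Pipe()   # (timestamp, embedding) tuples come out here
mon_snd, mon_rcv = Pipe()   # heartbeat timestamps for a watchdog

worker = Process(target=generate_embeddings,
                 args=(frm_rcv, emb_snd, mon_snd))
worker.start()
# Elsewhere: frm_snd.send((time.time(), normalized_audio)) feeds audio in,
# and emb_rcv.recv() yields roughly one (timestamp, embedding) pair per
# second of input audio.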
Example #2

def process_with_vggish(wavfile, vgg, sess, config):
    """
    Run the VGGish model, starting with a sound (x) at sample rate (sr).
    Return a whitened version of the embedding. Sound must be scaled to be
    floats between -1 and +1.
    """
    # Produce a batch of log mel spectrogram examples.
    input_features = vggish_input.wavfile_to_examples(wavfile)

    [embedding] = sess.run([vgg['embedding']],
                           feed_dict={vgg['features']: input_features})

    # Postprocess the results to produce whitened quantized embeddings.
    pproc = vggish_postprocess.Postprocessor(config.vggish_pca_params_path)
    postprocessed_embedding = pproc.postprocess(embedding)

    return postprocessed_embedding
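For context, here is a hedged sketch of how the vgg dict and config might be assembled before calling process_with_vggish; the structure is inferred from how the function uses them and is an assumption, not code from the source project.

# Assumed setup: `vgg` maps 'features'/'embedding' to graph tensors, and
# `config` is any object with a vggish_pca_params_path attribute.
from collections import namedtuple

import tensorflow as tf
import vggish_params
import vggish_slim

Config = namedtuple('Config', ['vggish_pca_params_path'])
config = Config(vggish_pca_params_path='vggish_pca_params.npz')

with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
    vgg = {
        'features': sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME),
        'embedding': sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME),
    }
    emb = process_with_vggish('example.wav', vgg, sess, config)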
Example #3

batches = []
count = 0
N = len(wav_files)  # total number of files, for progress reporting
for wav_file in wav_files:
    if wav_file.endswith(".wav"):
        examples_batch = vggish_input.wavfile_to_examples(
            join(wav_file_direc, wav_file))
        batches.append(examples_batch)
        count += 1
        if count % 100 == 0:
            print("At File ", count, "/", N)

print("Done!")

print("Computing Tensorflow Embeddings...")
# Prepare a postprocessor to munge the model embeddings.
pproc = vggish_postprocess.Postprocessor(pca_params)

output_sequences = []

with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    count = 0
    for batch in batches:
        # Run inference and postprocess each batch (the loop body here is
        # assumed from the inference pattern used elsewhere on this page).
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: batch})
        output_sequences.append(pproc.postprocess(embedding_batch))
        count += 1
        if count % 100 == 0:
            print("At Batch ", count, "/", N)
Example #4

def main(_):
    # In this simple example, we run the examples from a single audio file through
    # the model. If none is provided, we generate a synthetic input.
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Write a WAV of a sine wave into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    print(examples_batch)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames), and
        # the rows are written as a sequence of bytes-valued features, where each
        # feature value contains the 128 bytes of the whitened quantized embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=[
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
                }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()
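To make the SequenceExample layout concrete, here is a hedged sketch of reading the embeddings back out of the TFRecord file; it assumes the same TF1 API used above, and the file path is hypothetical.

import numpy as np
import tensorflow as tf
import vggish_params

for record in tf.python_io.tf_record_iterator('embeddings.tfrecord'):
    seq = tf.train.SequenceExample.FromString(record)
    feature_list = seq.feature_lists.feature_list[
        vggish_params.AUDIO_EMBEDDING_FEATURE_NAME]
    for feature in feature_list.feature:
        # Each value holds 128 bytes: one quantized embedding per ~1 s.
        emb = np.frombuffer(feature.bytes_list.value[0], dtype=np.uint8)
        print(emb.shape)  # (128,)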
Example #5

def make_extract_vggish_embedding(frame_duration,
                                  hop_duration,
                                  input_op_name='vggish/input_features',
                                  output_op_name='vggish/embedding',
                                  embedding_size=128,
                                  resources_dir=None):
    """
    Creates a coroutine generator for extracting and saving VGGish embeddings

    Parameters
    ----------
    frame_duration
    hop_duration
    input_op_name
    output_op_name
    embedding_size
    resources_dir

    Returns
    -------
    coroutine

    """
    params = {
        'frame_win_sec': frame_duration,
        'frame_hop_sec': hop_duration,
        'embedding_size': embedding_size
    }

    if not resources_dir:
        resources_dir = os.path.join(os.path.dirname(__file__),
                                     'vggish/resources')

    pca_params_path = os.path.join(resources_dir, 'vggish_pca_params.npz')
    model_path = os.path.join(resources_dir, 'vggish_model.ckpt')

    try:
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False, **params)
            vggish_slim.load_vggish_slim_checkpoint(sess, model_path, **params)

            while True:
                # We use a coroutine so the TensorFlow session stays open
                # and the model does not have to be reloaded for every file.
                audio_path, output_path = (yield)

                if os.path.exists(output_path):
                    continue

                try:
                    examples_batch = vggish_input.wavfile_to_examples(
                        audio_path, **params)
                except ValueError:
                    print("Error opening {}. Skipping...".format(audio_path))
                    continue

                # Prepare a postprocessor to munge the model embeddings.
                pproc = vggish_postprocess.Postprocessor(
                    pca_params_path, **params)

                input_tensor_name = input_op_name + ':0'
                output_tensor_name = output_op_name + ':0'

                features_tensor = sess.graph.get_tensor_by_name(
                    input_tensor_name)
                embedding_tensor = sess.graph.get_tensor_by_name(
                    output_tensor_name)

                # Run inference and postprocessing.
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})

                emb = pproc.postprocess(embedding_batch,
                                        **params).astype(np.float32)

                # Pickle the embedding array into a gzip-compressed file.
                with gzip.open(output_path, 'wb') as f:
                    emb.dump(f)

    except GeneratorExit:
        pass
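A minimal usage sketch, assuming hypothetical file paths and the standard 0.96 s VGGish frame length: the coroutine must be primed with next() before (audio_path, output_path) pairs can be sent in.

extract = make_extract_vggish_embedding(frame_duration=0.96,
                                        hop_duration=0.96)
next(extract)  # advance to the first `yield`
extract.send(('clip_0001.wav', 'clip_0001.emb.gz'))
extract.send(('clip_0002.wav', 'clip_0002.emb.gz'))
extract.close()  # raises GeneratorExit inside, tearing down the session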