Example No. 1
def create_tf_example(
    file_id, start, end, sub_segment, sub_segment_id, sub_segment_len, label
):
    # Randomly drop 3/4 of the speech sub-segments to balance the data
    # (keep a speech example only when drop == 0, i.e. with probability 1/4)
    if label == 1:
        drop = np.random.randint(0, 4)
        if drop > 0:
            return None

    # MFCC feature extraction
    signal_to_process = np.copy(sub_segment)
    signal_to_process = np.float32(signal_to_process)
    features = extract_features(
        signal_to_process, freq=16000, n_mfcc=5, size=512, step=16
    )
    features = np.reshape(features, -1)

    feature_dict = {
        "signal/id": bytes_feature(file_id.encode()),
        "segment/start": int64_feature(int(start)),
        "segment/end": int64_feature(int(end)),
        "subsegment/id": int64_feature(sub_segment_id),
        "subsegment/length": int64_feature(sub_segment_len),
        "subsegment/signal": float_list_feature(sub_segment.tolist()),
        "subsegment/features": float_list_feature(features.tolist()),
        "subsegment/label": int64_feature(label),
    }

    example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
    return example
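The bytes_feature, int64_feature, and float_list_feature helpers are not shown in this example. They are typically thin wrappers around tf.train.Feature; the following is a minimal sketch of what such helpers usually look like (the exact definitions in the original module may differ):

import tensorflow as tf


def bytes_feature(value):
    # Wrap a byte string in a tf.train.Feature.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def int64_feature(value):
    # Wrap a single integer in a tf.train.Feature.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def float_list_feature(value):
    # Wrap a list of floats in a tf.train.Feature.
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))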
Example No. 2
def create_tf_example(
    file_id,
    start,
    end,
    sub_segment,
    sub_segment_id,
    sub_segment_len,
    label,
):
    """Build a TF training example from raw data.

    Args:
        file_id (str): file ID
        start (int): global audio segment start frame
        end (int): global audio segment end frame
        sub_segment (np.ndarray): sub audio segment signal
        sub_segment_id (int): sub audio segment ID
        sub_segment_len (int): sub audio segment length
        label (int): sub audio segment label (0 for noise, 1 for speech)

    Returns:
        example (tf.train.Example): TF training example, or None if the sub-segment was dropped
    """
    # Randomly drop 3/4 of the speech sub-segments to balance the data
    # (keep a speech example only when drop == 0, i.e. with probability 1/4)
    if label == 1:
        drop = np.random.randint(0, 4)
        if drop > 0:
            return None

    # MFCC feature extraction
    signal_to_process = np.copy(sub_segment)
    signal_to_process = np.float32(signal_to_process)
    features = extract_features(
        signal_to_process, freq=16000, n_mfcc=5, size=512, step=16
    )
    features = np.reshape(features, -1)

    feature_dict = {
        "signal/id": bytes_feature(file_id.encode()),
        "segment/start": int64_feature(int(start)),
        "segment/end": int64_feature(int(end)),
        "subsegment/id": int64_feature(sub_segment_id),
        "subsegment/length": int64_feature(sub_segment_len),
        "subsegment/signal": float_list_feature(sub_segment.tolist()),
        "subsegment/features": float_list_feature(features.tolist()),
        "subsegment/label": int64_feature(label),
    }

    example = tf.train.Example(features=tf.train.Features(feature=feature_dict))
    return example
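A hedged usage sketch: iter_sub_segments() below is a hypothetical generator (not part of the original code) yielding the arguments expected by create_tf_example; the sketch only shows how the returned examples would typically be serialized to a TFRecord file with the TF 1.x API.

# Sketch only: iter_sub_segments() is hypothetical and the output path is a placeholder.
writer = tf.python_io.TFRecordWriter("train.tfrecord")
for args in iter_sub_segments():
    example = create_tf_example(*args)
    if example is None:  # speech sub-segment dropped for class balancing
        continue
    writer.write(example.SerializeToString())
writer.close()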
Example No. 3
def main(_):
    np.random.seed(0)

    # Directories
    data_dir = os.path.join(FLAGS.data_dir, 'test-clean/')
    label_dir = os.path.join(FLAGS.data_dir, 'labels/')

    _, _, test = split_data(label_dir, split='0.7/0.15', random_seed=0)
    file_it = file_iter(data_dir, label_dir, files=test)

    # TensorFlow inputs
    features_input_ph = tf.placeholder(shape=FEAT_SIZE, dtype=tf.float32)
    features_input_op = tf.transpose(features_input_ph, perm=[1, 0])
    features_input_op = tf.expand_dims(features_input_op, axis=0)

    # TensorFlow exported model
    speech_predictor = tf.contrib.predictor.from_saved_model(export_dir=FLAGS.exported_model)
    init = tf.initializers.global_variables()
    classes = ['Noise', 'Speech']

    # Iterate through test data
    with tf.Session() as sess:
        for signal, labels, fn in file_it:
            sess.run(init)
            print('\nPrediction on file {} ...'.format(fn))
            signal_input = deque(signal[:FLAGS.seq_len].tolist(), maxlen=FLAGS.seq_len)

            preds, pred_time = [], []
            pointer = FLAGS.seq_len
            while pointer < len(signal):
                start = time()
                # Preprocess signal & extract features
                signal_to_process = np.copy(signal_input)
                signal_to_process = np.float32(signal_to_process)
                features = extract_features(signal_to_process, freq=16000, n_mfcc=5, size=512, step=16)

                # Prediction
                features_input = sess.run(features_input_op, feed_dict={features_input_ph: features})
                speech_prob = speech_predictor({'features_input': features_input})['speech'][0]
                speech_pred = classes[int(np.round(speech_prob))]

                # Time prediction & processing
                end = time()
                dt = end - start
                pred_time.append(dt)
                print('Prediction = {} | proba = {:.2f} | time = {:.2f} s'.format(speech_pred, speech_prob[0], dt))

                # For visualization
                preds.append([pointer - FLAGS.seq_len, pointer, np.round(speech_prob)])

                # Update signal segment
                signal_input.extend(signal[pointer + FLAGS.stride:pointer + FLAGS.stride + FLAGS.seq_len])
                pointer += FLAGS.seq_len + FLAGS.stride

            print('Average prediction time = {:.2f} ms'.format(np.mean(pred_time) * 1e3))

            # Smoothing & hangover
            if FLAGS.smoothing:
                preds = smooth_predictions(preds)

            # Visualization
            visualize_predictions(signal, fn, preds)
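The small graph built on features_input_ph above only transposes the 2-D feature matrix and adds a leading batch dimension; running it through sess.run is equivalent to the NumPy sketch below (shown for illustration only, assuming features is the matrix returned by extract_features).

# Equivalent preprocessing without a session: transpose the feature matrix
# and add a batch axis, e.g. a (16, 65) matrix becomes a (1, 65, 16) input.
features_input = np.expand_dims(np.transpose(features), axis=0).astype(np.float32)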
Example No. 4
def main(_):
    np.random.seed(0)
    file_it = file_iter(FLAGS.data_dir)
    if not tf.gfile.IsDirectory(FLAGS.out_dir):
        tf.gfile.MakeDirs(FLAGS.out_dir)

    # TensorFlow inputs
    features_input_ph = tf.placeholder(shape=(16, 65), dtype=tf.float32)
    features_input_op = tf.transpose(features_input_ph, perm=[1, 0])
    features_input_op = tf.expand_dims(features_input_op, axis=0)

    # TensorFlow exported model
    speech_predictor = tf.contrib.predictor.from_saved_model(
        export_dir=FLAGS.exported_model)
    init = tf.initializers.global_variables()
    classes = ["Noise", "Speech"]

    # Iterate through test data
    with tf.Session() as sess:
        for signal, fn in file_it:
            sess.run(init)
            print("\nPrediction on file {} ...".format(fn))
            signal_input = deque(signal[:1024].tolist(), maxlen=1024)

            labels = {"speech_segments": []}
            preds, pred_time = [], []
            pointer = 1024
            while pointer < len(signal):
                start = time.time()
                # Preprocess signal & extract features
                signal_to_process = np.copy(signal_input)
                signal_to_process = np.float32(signal_to_process)
                signal_to_process = np.add(signal_to_process, 1.0)
                signal_to_process = np.divide(signal_to_process, 2.0)
                features = extract_features(signal_to_process,
                                            freq=16000,
                                            n_mfcc=5,
                                            size=512,
                                            step=16)

                # Prediction
                features_input = sess.run(
                    features_input_op, feed_dict={features_input_ph: features})
                speech_prob = speech_predictor(
                    {"features_input": features_input})["speech"][0]
                speech_pred = classes[int(np.round(speech_prob))]

                # Time prediction & processing
                end = time.time()
                dt = end - start
                pred_time.append(dt)
                if FLAGS.viz:
                    print("Prediction = {} | proba = {:.2f} | time = {:.2f} s".
                          format(speech_pred, speech_prob[0], dt))

                # For visualization
                preds.append([pointer - 1024, pointer, np.round(speech_prob)])

                # For label recording
                if np.round(speech_prob) > 0:
                    labels["speech_segments"].append({
                        "start_time": pointer - 1024,
                        "end_time": pointer
                    })

                # Update signal segment
                signal_input.extend(signal[pointer + 1:pointer + 1 + 1024])
                pointer += 1024 + 1

            print("Average prediction time = {:.2f} ms".format(
                np.mean(pred_time) * 1e3))

            # Visualization
            if FLAGS.viz:
                visualize_predictions(signal, fn, preds)

            # Record labels to .json
            if not FLAGS.viz:
                out_fn = "{}.json".format(fn.split(".")[0])
                out_fp = os.path.join(FLAGS.out_dir, out_fn)
                with open(out_fp, "w") as f:
                    json.dump(labels, f)
                print("{} predictions recorded to {}".format(
                    len(labels["speech_segments"]), FLAGS.out_dir))
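For reference, the labels dictionary dumped to .json above has the shape sketched below (values are illustrative); start_time and end_time are sample offsets of each 1024-sample window within the signal, not seconds.

# Illustrative contents of a dumped .json label file (offsets in samples).
labels = {
    "speech_segments": [
        {"start_time": 0, "end_time": 1024},
        {"start_time": 1025, "end_time": 2049},
    ]
}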
Example No. 5
def automatic_labeling(data_dir, exported_model, visualize=False):
    """Run automatic labeling over a given dataset of raw audio signals, given a pre-trained VAD model.

    Args:
        data_dir (str): path to raw dataset directory
        exported_model (str): path to exported pre-trained TF model directory
        visualize (bool, optional): option to visualize automatic labeling. Defaults to False.
    """
    np.random.seed(0)
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
    tf.logging.set_verbosity(tf.logging.INFO)

    test_data_dir = os.path.join(data_dir, "test-clean/")
    labels_dir = os.path.join(data_dir, "labels/")

    file_it = file_iter(test_data_dir)
    if not tf.gfile.IsDirectory(labels_dir):
        tf.gfile.MakeDirs(labels_dir)

    # TensorFlow inputs
    features_input_ph = tf.placeholder(shape=(16, 65), dtype=tf.float32)
    features_input_op = tf.transpose(features_input_ph, perm=[1, 0])
    features_input_op = tf.expand_dims(features_input_op, axis=0)

    # TensorFlow exported model
    speech_predictor = tf.contrib.predictor.from_saved_model(
        export_dir=exported_model)
    init = tf.initializers.global_variables()
    classes = ["Noise", "Speech"]

    # Iterate through test data
    with tf.Session() as sess:
        for signal, fn in file_it:
            sess.run(init)
            logger.info(f"Prediction on file {fn} ...")
            signal_input = deque(signal[:1024].tolist(), maxlen=1024)

            labels = {"speech_segments": []}
            preds, pred_time = [], []
            pointer = 1024
            while pointer < len(signal):
                start = time.time()
                # Preprocess signal & extract features
                signal_to_process = np.copy(signal_input)
                signal_to_process = np.float32(signal_to_process)
                signal_to_process = np.add(signal_to_process, 1.0)
                signal_to_process = np.divide(signal_to_process, 2.0)
                features = extract_features(signal_to_process,
                                            freq=16000,
                                            n_mfcc=5,
                                            size=512,
                                            step=16)

                # Prediction
                features_input = sess.run(
                    features_input_op, feed_dict={features_input_ph: features})
                speech_prob = speech_predictor(
                    {"features_input": features_input})["speech"][0]
                speech_pred = classes[int(np.round(speech_prob))]

                # Time prediction & processing
                end = time.time()
                dt = end - start
                pred_time.append(dt)
                if visualize:
                    logger.info(
                        f"Prediction = {speech_pred} | proba = {speech_prob[0]:.2f} | time = {dt:.2f} s"
                    )

                # For visualization
                preds.append([pointer - 1024, pointer, np.round(speech_prob)])

                # For label recording
                if np.round(speech_prob) > 0:
                    labels["speech_segments"].append({
                        "start_time": pointer - 1024,
                        "end_time": pointer
                    })

                # Update signal segment
                signal_input.extend(signal[pointer + 1:pointer + 1 + 1024])
                pointer += 1024 + 1

            logger.info(
                f"Average prediction time = {np.mean(pred_time) * 1e3:.2f} ms")

            # Visualization
            if visualize:
                visualize_predictions(signal, fn, preds)

            # Record labels to .json
            if not visualize:
                base_name = fn.split(".")[0]
                out_fn = f"{base_name}.json"
                out_fp = os.path.join(labels_dir, out_fn)
                with open(out_fp, "w") as f:
                    json.dump(labels, f)

                nb_preds = len(labels["speech_segments"])
                logger.info(f"{nb_preds} predictions recorded to {labels_dir}")
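A minimal calling sketch for the function above; the paths are placeholders, not taken from the original code.

# Hypothetical invocation; replace the paths with a real dataset and exported model.
if __name__ == "__main__":
    automatic_labeling(
        data_dir="/path/to/data/",
        exported_model="/path/to/exported_model/",
        visualize=False,
    )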
Example No. 6
def run_inference(params, data_dir, exported_model):
    """Run Voice Activity Detection CNN inference over raw audio signals.

    Args:
        params (dict): dictionary of inference parameters
        data_dir (str): path to raw dataset directory
        exported_model (str): path to exported pre-trained TF model directory
    """
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
    tf.logging.set_verbosity(tf.logging.INFO)
    np.random.seed(0)

    input_size = params["input_size"]
    stride = params["stride"]
    smoothing = params["smoothing"]

    # Directories
    test_data_dir = os.path.join(data_dir, "test-clean/")
    label_dir = os.path.join(data_dir, "labels/")

    _, _, test = split_data(label_dir, split="0.7/0.15", random_seed=0)
    file_it = file_iter(test_data_dir, label_dir, files=test)

    # TensorFlow inputs
    features_input_ph = tf.placeholder(shape=FEAT_SIZE, dtype=tf.float32)
    features_input_op = tf.transpose(features_input_ph, perm=[1, 0])
    features_input_op = tf.expand_dims(features_input_op, axis=0)

    # TensorFlow exported model
    speech_predictor = tf.contrib.predictor.from_saved_model(export_dir=exported_model)
    init = tf.initializers.global_variables()
    classes = ["Noise", "Speech"]

    # Iterate through test data
    with tf.Session() as sess:
        for signal, labels, fn in file_it:
            sess.run(init)
            logger.info(f"Prediction on file {fn} ...")
            signal_input = deque(signal[:input_size].tolist(), maxlen=input_size)

            preds, pred_time = [], []
            pointer = input_size
            while pointer < len(signal):
                start = time.time()
                # Preprocess signal & extract features
                signal_to_process = np.copy(signal_input)
                signal_to_process = np.float32(signal_to_process)
                features = extract_features(
                    signal_to_process, freq=16000, n_mfcc=5, size=512, step=16
                )

                # Prediction
                features_input = sess.run(
                    features_input_op, feed_dict={features_input_ph: features}
                )
                speech_prob = speech_predictor({"features_input": features_input})[
                    "speech"
                ][0]
                speech_pred = classes[int(np.round(speech_prob))]

                # Time prediction & processing
                end = time.time()
                dt = end - start
                pred_time.append(dt)
                logger.info(
                    f"Prediction = {speech_pred} | proba = {speech_prob[0]:.2f} | time = {dt:.2f} s"
                )

                # For visualization
                preds.append([pointer - input_size, pointer, np.round(speech_prob)])

                # Update signal segment
                signal_input.extend(
                    signal[pointer + stride : pointer + stride + input_size]
                )
                pointer += input_size + stride

            logger.info(f"Average prediction time = {np.mean(pred_time) * 1e3:.2f} ms")

            # Smoothing & hangover
            if smoothing:
                preds = smooth_predictions(preds)

            # Visualization
            visualize_predictions(signal, fn, preds)
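run_inference only reads the keys input_size, stride and smoothing from params; below is a minimal calling sketch under that assumption (values and paths are illustrative, with input_size matching the 1024-sample windows used in the other examples).

# Sketch of the expected params dict; values and paths are illustrative.
params = {
    "input_size": 1024,  # samples per sub-segment fed to the model
    "stride": 1,         # hop between consecutive sub-segments, in samples
    "smoothing": True,   # apply smooth_predictions() to the raw predictions
}
run_inference(params, data_dir="/path/to/data/", exported_model="/path/to/exported_model/")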