Example #1
  def test_fit_list_and_predict_single_label(self):
    """Train and test model while training data has single label.

    Training data are not concatenated.
    """
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.enable_cuda = False
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50
    inference_args.test_iteration = 1

    # generate fake training data, as a list
    train_sequences = [
        np.random.rand(100, model_args.observation_dim),
        np.random.rand(200, model_args.observation_dim),
        np.random.rand(300, model_args.observation_dim)]
    train_cluster_ids = [
        np.array(['A'] * 100),
        np.array(['A'] * 200),
        np.array(['A'] * 300),]

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequences, train_cluster_ids, training_args)

    # testing, where data has less variation than training
    test_sequence = np.random.rand(10, model_args.observation_dim) / 10.0
    predicted_label = model.predict(test_sequence, inference_args)
    self.assertListEqual([0] * 10, predicted_label)
Example #2
  def test_fit_concatenated_and_predict_single_label(self):
    """Train and test model while training data has single label.

    Training data have already been concatenated.
    """
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50
    inference_args.test_iteration = 1

    # generate fake training data, assume already concatenated
    train_sequence = np.random.rand(1000, model_args.observation_dim)
    train_cluster_id = np.array(['A'] * 1000)

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequence, train_cluster_id, training_args)

    # testing, where data has less variation than training
    test_sequence = np.random.rand(10, model_args.observation_dim) / 10.0
    predicted_label = model.predict(test_sequence, inference_args)
    self.assertListEqual([0] * 10, predicted_label)
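
Examples #1 and #2 differ only in how training data reaches fit(): either a list of per-utterance sequences with one label array each, or a single pre-concatenated sequence with a single label array. A minimal sketch of both call forms (hypothetical toy data; train_iteration reduced so the sketch runs quickly):

import numpy as np
import uisrnn

model_args, training_args, _ = uisrnn.parse_arguments()
model_args.enable_cuda = False
model_args.observation_dim = 16
training_args.train_iteration = 10

model = uisrnn.UISRNN(model_args)

# Form 1: a list of sequences, one label array per sequence.
seqs = [np.random.rand(100, 16), np.random.rand(200, 16)]
ids = [np.array(['A'] * 100), np.array(['A'] * 200)]
model.fit(seqs, ids, training_args)

# Form 2: one pre-concatenated sequence with a single label array.
seq = np.random.rand(300, 16)
cid = np.array(['A'] * 300)
model.fit(seq, cid, training_args)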
Example #3
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

  Load data --> train model --> test model --> output result

  Args:
    model_args: model configurations
    training_args: training configurations
    inference_args: inference configurations
  """

    predicted_labels = []
    test_record = []

    train_data = np.load('./ghostvlad/training_data1.npz')
    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']
    train_sequence_list = [
        seq.astype(float) + 0.00001 for seq in train_sequence
    ]
    train_cluster_id_list = [
        np.array(cid).astype(str) for cid in train_cluster_id
    ]

    model = uisrnn.UISRNN(model_args)
    #model.load(SAVED_MODEL_NAME)
    # training
    model.fit(train_sequence_list, train_cluster_id_list, training_args)
    model.save(SAVED_MODEL_NAME + str(2))
Example #4
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

  Load data --> train model --> test model --> output result

  Args:
    model_args: model configurations
    training_args: training configurations
    inference_args: inference configurations
  """

    predicted_labels = []
    test_record = []

    train_data = np.load('./training_data.npz')
    test_data = np.load('./data/testing_data.npz')
    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']
    test_sequences = test_data['test_sequences']
    test_cluster_ids = test_data['test_cluster_ids']

    model = uisrnn.UISRNN(model_args)

    print("train_sequence = {}".format(train_sequence.shape))
    print("train_cluster_id = {}".format(train_cluster_id.shape))
    print(train_cluster_id[:1000])
Example #5
 def test_save_and_load(self):
   """Save model and load it."""
   model_args, _, _ = uisrnn.parse_arguments()
   model_args.observation_dim = 16
   model_args.transition_bias = 0.5
   model_args.sigma2 = 0.05
   model = uisrnn.UISRNN(model_args)
   temp_file_path = tempfile.mktemp()
   model.save(temp_file_path)
   model.load(temp_file_path)
   self.assertEqual(0.5, model.transition_bias)
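
Note that tempfile.mktemp(), used here and in later tests, is deprecated because the returned path can be claimed by another process before the file is created. A safer sketch of the same save/load round trip, assuming only a writable path is needed:

import os
import tempfile
import uisrnn

model_args, _, _ = uisrnn.parse_arguments()
model_args.observation_dim = 16
model_args.transition_bias = 0.5
model_args.sigma2 = 0.05
model = uisrnn.UISRNN(model_args)

# mkstemp() creates the file atomically; close the fd and reuse the path.
fd, temp_file_path = tempfile.mkstemp()
os.close(fd)
model.save(temp_file_path)
model.load(temp_file_path)
os.remove(temp_file_path)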
Example #6
def diarization_experiment(model_args, training_args, inference_args, isLoaded=True):
    """Experiment pipeline.

    Load data --> train model --> test model --> output result

    Args:
      model_args: model configurations
      training_args: training configurations
      inference_args: inference configurations
      isLoaded: if True, load a previously saved model instead of training
    """

    predicted_labels = []
    test_record = []

    train_data = np.load('./ghostvlad/training_data_100.npz', allow_pickle=True)
    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']

    train_sequence_list = [seq.astype(float) + 1e-5 for seq in train_sequence]
    train_cluster_id_list = [np.array(cid).astype(str) for cid in train_cluster_id]

    test_sequences = train_sequence_list[-2:-1]
    test_cluster_ids = [e.tolist() for e in train_cluster_id_list[-2:-1]]

    model = uisrnn.UISRNN(model_args)

    if not isLoaded:
        # training
        model.fit(train_sequence_list, train_cluster_id_list, training_args)
        model.save(SAVED_MODEL_NAME)
    else:
        # skip training by loading a previously saved model
        model.load(SAVED_MODEL_NAME)

    for (test_sequence, test_cluster_id) in zip(test_sequences, test_cluster_ids):
        predicted_label = model.predict(test_sequence, inference_args)
        predicted_labels.append(predicted_label)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            test_cluster_id, predicted_label)
        test_record.append((accuracy, len(test_cluster_id)))
        print('Ground truth labels:')
        print(test_cluster_id)
        print('Predicted labels:')
        print(predicted_label)
        print('-' * 80)

    output_string = uisrnn.output_result(model_args, training_args, test_record)

    print('Finished diarization experiment')
    print(output_string)
Example #7
def diarization_experiment(model_args, training_args, inference_args):
  """Experiment pipeline.

  Load data --> train model --> test model --> output result

  Args:
    model_args: model configurations
    training_args: training configurations
    inference_args: inference configurations
  """

  predicted_cluster_ids = []
  test_record = []

  train_data = np.load('./data/toy_training_data.npz', allow_pickle=True)
  test_data = np.load('./data/toy_testing_data.npz', allow_pickle=True)
  train_sequence = train_data['train_sequence']
  train_cluster_id = train_data['train_cluster_id']
  test_sequences = test_data['test_sequences'].tolist()
  test_cluster_ids = test_data['test_cluster_ids'].tolist()

  model = uisrnn.UISRNN(model_args)

  # Training.
  # If we have saved a model previously, we can also skip training by
  # calling:
  # model.load(SAVED_MODEL_NAME)
  model.fit(train_sequence, train_cluster_id, training_args)
  model.save(SAVED_MODEL_NAME)

  # Testing.
  # You can also try uisrnn.parallel_predict to speed up with GPU.
  # But that is a beta feature which is not thoroughly tested, so
  # proceed with caution.
  for (test_sequence, test_cluster_id) in zip(test_sequences, test_cluster_ids):
    predicted_cluster_id = model.predict(test_sequence, inference_args)
    predicted_cluster_ids.append(predicted_cluster_id)
    accuracy = uisrnn.compute_sequence_match_accuracy(
        test_cluster_id, predicted_cluster_id)
    test_record.append((accuracy, len(test_cluster_id)))
    print('Ground truth labels:')
    print(test_cluster_id)
    print('Predicted labels:')
    print(predicted_cluster_id)
    print('-' * 80)

  output_string = uisrnn.output_result(model_args, training_args, test_record)

  print('Finished diarization experiment')
  print(output_string)
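
For the uisrnn.parallel_predict beta feature mentioned in the comment above, the call shape is roughly as follows; argument names follow the upstream uisrnn repository and should be verified against the installed version:

# Beta: predict several test sequences in parallel worker processes.
predicted_cluster_ids = uisrnn.parallel_predict(
    model, test_sequences, inference_args, num_processes=4)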
Example #8
def diarize(segments, sr=16000, win_len=400, hop_len=160, embedding_per_sec=1.0, overlap_rate=0.1):
    logger.debug("[Speaker diarization] Initializing models")
    # Initialize ghostvlad
    toolkits.initialize_GPU(Expando({"gpu": ""}))
    ghostvlad_model = model.vggvox_resnet2d_icassp(input_dim=(257, None, 1),
                                                   num_class=5994,
                                                   mode="eval",
                                                   args=Expando({"net": "resnet34s",
                                                                 "loss": "softmax",
                                                                 "vlad_cluster": 8,
                                                                 "ghost_cluster": 2,
                                                                 "bottleneck_dim": 512,
                                                                 "aggregation_mode": "gvlad"}))
    ghostvlad_model.load_weights("ghostvlad/pretrained/weights.h5", by_name=True)

    # Initialize uisrnn
    sys.argv = sys.argv[:1]
    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnn_model = uisrnn.UISRNN(model_args)
    uisrnn_model.load("uisrnn/pretrained/saved_model.uisrnn_benchmark")

    logger.debug("[Speaker diarization] Calculating utterance features")
    utterances_spec = prepare_ghostvlad_data(segments, sr, win_len, hop_len, embedding_per_sec, overlap_rate)
    feats = []
    for spec in utterances_spec:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = ghostvlad_model.predict(spec)
        feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)

    logger.debug("[Speaker diarization] Clustering utterance features")
    labels = uisrnn_model.predict(feats, inference_args)

    logger.debug("[Speaker diarization] Tagging segments speakers")
    embedding_duration = (1/embedding_per_sec) * (1.0 - overlap_rate)
    labels_count = len(labels)
    current = 0
    for segment in segments:
        begin_index = math.floor(current/embedding_duration)
        current += segment.end-segment.begin
        end_index = math.ceil(current/embedding_duration)
        segment_labels = [labels[index] for index in range(begin_index, min(end_index, labels_count))]
        if len(segment_labels) > 0:
            segment.speaker = max(segment_labels, key=segment_labels.count)
        else:
            segment.speaker = 999
    return segments
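
The segment tagging above assumes each embedding covers (1 / embedding_per_sec) * (1 - overlap_rate) seconds. A quick numeric check of the index math with the defaults from the signature (values chosen for illustration):

import math

embedding_per_sec, overlap_rate = 1.0, 0.1
embedding_duration = (1 / embedding_per_sec) * (1.0 - overlap_rate)  # 0.9 s

# A segment spanning 0.0 s to 2.0 s covers embedding indices 0..2:
begin_index = math.floor(0.0 / embedding_duration)  # 0
end_index = math.ceil(2.0 / embedding_duration)     # 3 (exclusive)
print(list(range(begin_index, end_index)))          # [0, 1, 2]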
Example #9
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

  Load data --> train model --> test model --> output result

  Args:
    model_args: model configurations
    training_args: training configurations
    inference_args: inference configurations
  """

    predicted_cluster_ids = []
    test_record = []

    train_data = np.load('./data/toy_training_data.npz')
    test_data = np.load('./data/toy_testing_data.npz')
    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']
    test_sequences = test_data['test_sequences'].tolist()
    test_cluster_ids = test_data['test_cluster_ids'].tolist()

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequence, train_cluster_id, training_args)
    model.save(SAVED_MODEL_NAME)
    # we can also skip training by calling:
    # model.load(SAVED_MODEL_NAME)

    # testing
    for (test_sequence, test_cluster_id) in zip(test_sequences,
                                                test_cluster_ids):
        predicted_cluster_id = model.predict(test_sequence, inference_args)
        predicted_cluster_ids.append(predicted_cluster_id)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            test_cluster_id, predicted_cluster_id)
        test_record.append((accuracy, len(test_cluster_id)))
        print('Ground truth labels:')
        print(test_cluster_id)
        print('Predicted labels:')
        print(predicted_cluster_id)
        print('-' * 80)

    output_string = uisrnn.output_result(model_args, training_args,
                                         test_record)

    print('Finished diarization experiment')
    print(output_string)
Example #10
  def test_fit_with_wrong_dim(self):
    """Training data has wrong dimension."""
    model_args, training_args, _ = uisrnn.parse_arguments()
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 5

    # generate fake data
    train_sequence = np.random.rand(1000, 18)
    train_cluster_id = np.array(['A'] * 1000)

    model = uisrnn.UISRNN(model_args)

    # training
    with self.assertRaises(ValueError):
      model.fit(train_sequence, train_cluster_id, training_args)
Example #11
  def test_predict_with_wrong_dim(self):
    """Testing data has wrong dimension."""
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    model_args.enable_cuda = False
    model_args.rnn_depth = 1
    model_args.rnn_hidden_size = 8
    model_args.observation_dim = 16
    training_args.learning_rate = 0.01
    training_args.train_iteration = 50

    # generate fake data
    train_sequence = np.random.rand(1000, model_args.observation_dim)
    train_cluster_id = np.array(['A'] * 1000)

    model = uisrnn.UISRNN(model_args)

    # training
    model.fit(train_sequence, train_cluster_id, training_args)

    # testing
    test_sequence = np.random.rand(10, 18)
    with self.assertRaises(ValueError):
      model.predict(test_sequence, inference_args)
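
Both dimension tests reduce to one invariant: the second axis of every sequence must equal model_args.observation_dim. A small pre-flight check can fail earlier with a clearer message (illustrative helper, not part of the uisrnn API):

import numpy as np

def check_observation_dim(sequence, observation_dim):
    # Raise before fit()/predict() with an explicit shape message.
    if sequence.ndim != 2 or sequence.shape[1] != observation_dim:
        raise ValueError('expected shape (n, {}), got {}'.format(
            observation_dim, sequence.shape))

check_observation_dim(np.random.rand(10, 16), 16)  # passes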
Example #12
def process(wav_path,
            embedding_per_second=1.0,
            overlap_rate=0.5,
            after_shift=0,
            output_seg=False,
            show=False,
            segment_fn='output.seg',
            args=None):

    if args is None:
        args = Args()
        args.gpu = ''
        args.resume = os.path.join(BASE_DIR, 'ghostvlad/pretrained/weights.h5')
        args.data_path = '4persons'
        # set up network configuration.
        args.net = 'resnet34s'  #, choices=['resnet34s', 'resnet34l'], type=str)
        args.ghost_cluster = 2
        args.vlad_cluster = 8
        args.bottleneck_dim = 512
        args.aggregation_mode = 'gvlad'  #, choices=['avg', 'vlad', 'gvlad'], type=str)
        # set up learning rate, training loss and optimizer.
        args.loss = 'softmax'  #, choices=['softmax', 'amsoftmax'], type=str)
        args.test_type = 'normal'  #, choices=['normal', 'hard', 'extend'], type=str)

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }
    t0 = time.time()
    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args = Args()
    model_args.observation_dim = 512
    model_args.rnn_hidden_size = 512
    model_args.rnn_depth = 1
    model_args.rnn_dropout = 0.2
    model_args.transition_bias = None
    model_args.crp_alpha = 1.0
    model_args.sigma2 = None
    model_args.verbosity = 2

    inference_args = Args()
    inference_args.beam_size = 10
    inference_args.look_ahead = 1
    inference_args.test_iteration = 2

    # model_args, _, inference_args = uisrnn.parse_arguments()
    # model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)
    td = time.time() - t0
    print('Load model time:', td)

    print('Loading data...')
    t0 = time.time()
    # specs, intervals = load_data(wav_path, embedding_per_second=embedding_per_second, overlap_rate=overlap_rate)
    specs, intervals, feats = load_data(
        wav_path,
        embedding_per_second=embedding_per_second,
        overlap_rate=overlap_rate,
        network_eval=network_eval)
    mapTable, keys = genMap(intervals)
    td = time.time() - t0
    print('Load data time:', td)

    print('Generating feats...')
    t0 = time.time()
    # feats = []
    # for spec in specs:
    # spec = np.expand_dims(np.expand_dims(spec, 0), -1)
    # v = network_eval.predict(spec)
    # feats += [v]
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    td = time.time() - t0
    print('Feature generation time:', td)

    print('inference_args:', inference_args)
    print('running uisrnn.predict...')
    t0 = time.time()
    predicted_label = uisrnnModel.predict(feats, inference_args)
    td = time.time() - t0
    print('uisrnn.predict time:', td)

    t0 = time.time()
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    td = time.time() - t0
    print('arrangeResult time:', td)

    t0 = time.time()
    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e
    td = time.time() - t0
    print('speakerSlice mapping time:', td)

    audacity_segments = []
    for spk, timeDicts in speakerSlice.items():
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = s * 1 / 1000.
            e = e * 1 / 1000.
            s += after_shift
            e += after_shift
            audacity_segments.append((s, e, spk))

    if output_seg:
        with open(segment_fn, 'w') as fout:
            for s, e, l in audacity_segments:
                fout.write('%s\t%s\t%s\n' % (round(s, 6), round(e, 6), l))

    if show:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return audacity_segments
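
The time_spec_rate expression above converts the embedding stride into milliseconds per predicted label. With this function's defaults, embedding_per_second=1.0 and overlap_rate=0.5, one label covers 500 ms:

embedding_per_second, overlap_rate = 1.0, 0.5
time_spec_rate = 1000 * (1.0 / embedding_per_second) * (1.0 - overlap_rate)
print(time_spec_rate)  # 500.0 ms per predicted label
center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
print(center_duration)  # 500: change points sit at slice centers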
Example #13
# from matplotlib import cm
# from time import sleep, perf_counter as timer
# from umap import UMAP
# import matplotlib.pyplot as plt

sys.path.append("Resemblyzer")
from resemblyzer import preprocess_wav, VoiceEncoder, sampling_rate  # noqa

# %%
# Load file
wav = preprocess_wav("Resemblyzer/audio_data/X2zqiX6yL3I.mp3")

# %%
# Audio features
encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=5)

# %%
# Load UIS-RNN model
sys.argv = ['dummy']
model_args, training_args, inference_args = uisrnn.parse_arguments()
model = uisrnn.UISRNN(model_args)
model.load('uis-rnn/saved_model.uisrnn')

# %%
# Testing
test_sequence = cont_embeds.astype(float)
predictions = model.predict(test_sequence, inference_args)

# %%
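
To attach timestamps to these predictions, return_partials=True also yields wav_splits, which in the Resemblyzer demos is a list of sample-index slice objects; pairing each label with its window midpoint is a common pattern (a sketch under that assumption):

# Midpoint time (seconds) of each partial-embedding window.
times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
for t, label in zip(times, predictions):
    print('%7.2fs  speaker %s' % (t, label))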
Example #14
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

  Load data --> train model --> test model --> output result

  Args:
    model_args: model configurations
    training_args: training configurations
    inference_args: inference configurations
  """

    predicted_labels = []
    test_record = []

    # Workaround for "ValueError: Object arrays cannot be loaded when
    # allow_pickle=False" raised by newer NumPy versions: temporarily
    # patch np.load to pass allow_pickle=True, then restore it.
    np_load_old = np.load
    np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True, **k)
    train_data = np.load('./ghostvlad/training_data.npz')
    np.load = np_load_old

    train_sequence = train_data['train_sequence']
    train_cluster_id = train_data['train_cluster_id']
    train_sequence_list = [
        seq.astype(float) + 0.00001 for seq in train_sequence
    ]
    train_cluster_id_list = [
        np.array(cid).astype(str) for cid in train_cluster_id
    ]

    model = uisrnn.UISRNN(model_args)

    # training
    #model.fit(train_sequence_list, train_cluster_id_list, training_args)
    #model.save(SAVED_MODEL_NAME)

    # testing
    # we can also skip training by calling:
    model.load(SAVED_MODEL_NAME)
    # NOTE: test_sequences and test_cluster_ids are not defined in this
    # snippet; they must be prepared beforehand (e.g. loaded from a test
    # .npz file or split from the training lists).
    for (test_sequence, test_cluster_id) in zip(test_sequences,
                                                test_cluster_ids):
        predicted_label = model.predict(test_sequence, inference_args)
        predicted_labels.append(predicted_label)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            test_cluster_id, predicted_label)
        test_record.append((accuracy, len(test_cluster_id)))
        print('Ground truth labels:')
        print(test_cluster_id)
        print('Predicted labels:')
        print(predicted_label)
        print('-' * 80)

    output_string = uisrnn.output_result(model_args, training_args,
                                         test_record)

    print('Finished diarization experiment')
    print(output_string)
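
The np.load monkey-patch in this example predates NumPy 1.16.3, which added the keyword directly; the same effect is now a single argument, as other examples in this collection already use:

train_data = np.load('./ghostvlad/training_data.npz', allow_pickle=True)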
Example #15
def main(wav_path, check, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)
    if check != '':
        specs1, interval1 = load_data(check,
                                      embedding_per_second=1.2,
                                      overlap_rate=0.4)
        mapTable1, keys1 = genMap(interval1)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]
    featss = np.array(feats)[:, 0, :].astype(float)
    predicted_label = uisrnnModel.predict(featss, inference_args)
    total_speaker = len(set(predicted_label))
    global no_speakers
    print("predicted_label: %s" % predicted_label)
    no_speakers = len(set(predicted_label))
    print('total no of speakers', no_speakers)
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    if check != '':
        for spec1 in specs1:
            spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
            v = network_eval.predict(spec1)
            feats += [v]
        featss = np.array(feats)[:,
                                 0, :].astype(float)  # [splits, embedding dim]
        print("=====================")
        print(feats)
        print(featss)
        print("=====================")
        predicted_label2 = uisrnnModel.predict(featss, inference_args)
        check_speaker = len(set(predicted_label2))
        print("predicted_label2: %s" % predicted_label2)
        print('same speaker' if total_speaker ==
              check_speaker else 'not the same speaker')
        print('speaker detected as ' +
              str(predicted_label2[-1]) if total_speaker ==
              check_speaker else '')
        speakerSlice2 = arrangeResult(predicted_label2, time_spec_rate)
        print("=============speakerSlice2===============")
        for spk, timeDicts in speakerSlice2.items():  # map times back to the original wav (which contains silence)
            for tid, timeDict in enumerate(timeDicts):
                s = 0
                e = 0
                for i, key in enumerate(keys):
                    if (s != 0 and e != 0):
                        break
                    if (s == 0 and key > timeDict['start']):
                        offset = timeDict['start'] - keys[i - 1]
                        s = mapTable[keys[i - 1]] + offset
                    if (e == 0 and key > timeDict['stop']):
                        offset = timeDict['stop'] - keys[i - 1]
                        e = mapTable[keys[i - 1]] + offset

                speakerSlice2[spk][tid]['start'] = s
                speakerSlice2[spk][tid]['stop'] = e

        for spk, timeDicts in speakerSlice2.items():
            print('========= ' + str(spk) + ' =========')
            for timeDict in timeDicts:
                s = timeDict['start']
                e = timeDict['stop']
                s = fmtTime(s)  # change point moves to the center of the slice
                e = fmtTime(e)
                print(s + ' ==> ' + e)
        print("=============speakerSlice2===============")
        #print(predicted_label,'**************************')
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)
Example #16
    def test_four_clusters(self):
        """Four clusters on vertices of a square."""
        label_to_center = {
            'A': np.array([0.0, 0.0]),
            'B': np.array([0.0, 1.0]),
            'C': np.array([1.0, 0.0]),
            'D': np.array([1.0, 1.0]),
        }

        # generate training data
        train_cluster_id = (['A'] * 400 + ['B'] * 300 + ['C'] * 200 +
                            ['D'] * 100)
        random.shuffle(train_cluster_id)
        train_sequence = _generate_random_sequence(train_cluster_id,
                                                   label_to_center,
                                                   sigma=0.01)
        train_sequences = [
            train_sequence[:100, :], train_sequence[100:300, :],
            train_sequence[300:600, :], train_sequence[600:, :]
        ]
        train_cluster_ids = [
            train_cluster_id[:100], train_cluster_id[100:300],
            train_cluster_id[300:600], train_cluster_id[600:]
        ]

        # generate testing data
        test_cluster_id = ['A'] * 10 + ['B'] * 20 + ['C'] * 30 + ['D'] * 40
        random.shuffle(test_cluster_id)
        test_sequence = _generate_random_sequence(test_cluster_id,
                                                  label_to_center,
                                                  sigma=0.01)

        # construct model
        model_args, training_args, inference_args = uisrnn.parse_arguments()
        model_args.enable_cuda = True  #for prince
        model_args.rnn_depth = 2
        model_args.rnn_hidden_size = 8
        model_args.observation_dim = 2
        model_args.verbosity = 3
        training_args.learning_rate = 0.01
        training_args.train_iteration = 200
        training_args.enforce_cluster_id_uniqueness = False
        inference_args.test_iteration = 2

        model = uisrnn.UISRNN(model_args)
        verbose = True
        if verbose:
            print("Training prints")
            print('TYPES(seq, id):', type(train_sequences),
                  type(train_cluster_ids))
            print('emb shape:', np.shape(train_sequences))
            print('label shape:', np.shape(train_sequences[0]))
            print('flat label:', np.shape(train_cluster_ids[0]))
            print('*' * 10, '\n\n')
        # run training, and save the model
        model.fit(train_sequences, train_cluster_ids, training_args)
        temp_file_path = tempfile.mktemp()
        model.save(temp_file_path)

        # run testing
        predicted_label = model.predict(test_sequence, inference_args)

        if verbose:
            print("Prediction prints")
            print(type(predicted_label))
            #print(len(predicted_label))
            print('*' * 10, '\n\n')
        # run evaluation
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)

        # load new model
        loaded_model = uisrnn.UISRNN(model_args)
        loaded_model.load(temp_file_path)

        # run testing with loaded model
        predicted_label = loaded_model.predict(test_sequence, inference_args)

        # run evaluation with loaded model
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)

        # keep training from loaded model on a subset of training data
        transition_bias_1 = model.transition_bias
        training_args.learning_rate = 0.001
        training_args.train_iteration = 50
        model.fit(train_sequence[:100, :], train_cluster_id[:100],
                  training_args)
        transition_bias_2 = model.transition_bias
        self.assertNotAlmostEqual(transition_bias_1, transition_bias_2)
        model.logger.print(
            3, 'Asserting transition_bias changed from {} to {}'.format(
                transition_bias_1, transition_bias_2))

        # run evaluation
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)
Example #17
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)

    #specs1,interval1 = load_data(r'wavs/REC20190716140159.wav', embedding_per_second=1.2, overlap_rate=0.4)
    #mapTable1,keys1 =genMap(interval1)
    mapTable, keys = genMap(intervals)
    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]


# =============================================================================
#     for spec1 in specs1:
#         spec1 = np.expand_dims(np.expand_dims(spec1, 0), -1)
#         v = network_eval.predict(spec1)
#         feats += [v]
# =============================================================================
    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    #print(len(feats),'00000000')
    #predicted_label = uisrnnModel.predict(feats, inference_args)

    # silhouette score
    # =============================================================================
    #     sli=[]
    #     fromsel=[]
    #     li=[]
    #     knum=[]
    #     for i in range(10):
    #         li=[]
    #         range_n_clusters = list (range(2,5))
    #         for n_clusters in range_n_clusters:
    #             clusterer = KMeans(n_clusters=n_clusters)
    #             preds = clusterer.fit_predict(feats)
    #             centers = clusterer.cluster_centers_
    #
    #             score = silhouette_score (feats, preds, metric='euclidean')
    #             print ("For n_clusters = {}, silhouette score is {})".format(n_clusters, score))
    #             li.append([n_clusters,score,clusterer,centers])
    #     # =============================================================================
    #     #     print([float(str(i[1])[:4]) for i in li])
    #     #     kvalue=(max([float(str(i[1])[:4]) for i in li]))
    #     #     for i in range(len(li)):
    #     #         if kvalue==float(str(li[i][1])[:4]):
    #     #             true_k=li[i][0]
    #     #             break
    #     # =============================================================================
    #         maxi=li[0][1]
    #         for i in range(1,len(li)):
    #             if li[i][1]-maxi>=0.005:
    #                 maxi=li[i][1]
    #         for i in li:
    #             if i[1]==maxi:
    #                 true_k=i[0]
    #     # =============================================================================
    #     #     maxi=max([i[1] for i in li])
    #     #     for i in li:
    #     #         if i[1]==maxi:
    #     #             true_k=i[0]
    #     # =============================================================================
    #         fromsel.append(li[true_k-2])
    #         print(true_k)
    #         knum.append(true_k)
    #     kval=(max(set(knum), key=knum.count))
    #     print(kval)
    # =============================================================================

    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.95,
                                  gaussian_blur_sigma=1)
    predicted_label = clusterer.predict(feats)

    # =============================================================================
    #     clusters = KMeans(n_clusters=40, init='k-means++', max_iter=100, n_init=1, random_state = 0)
    #     clusters.fit(feats)
    #     tsne = TSNEVisualizer()
    #     tsne.fit(feats, ["c{}".format(c) for c in clusters.labels_])
    #     tsne.poof()
    # =============================================================================

    global no_speakers
    no_speakers = len(set(predicted_label))
    #print(predicted_label,'**************************')
    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()
Example #18
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    #toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.GVladModel(input_dim=params['dim'],
                                       num_class=params['n_classes'],
                                       mode='eval',
                                       args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v + 0.00001]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    saveAudacity(wav_path + ".txt", speakerSlice)
Example #19
def diarization_experiment(model_args, training_args, inference_args):
  """Experiment pipeline.

  Load data --> train model --> test model --> output result

  Args:
    model_args: model configurations
    training_args: training configurations
    inference_args: inference configurations
  """
  # data loading
  train_data = np.load('./data/toy_training_data.npz', allow_pickle=True)
  test_data = np.load('./data/toy_testing_data.npz', allow_pickle=True)
  train_sequence = train_data['train_sequence']
  train_cluster_id = train_data['train_cluster_id']
  test_sequences = test_data['test_sequences'].tolist()
  test_cluster_ids = test_data['test_cluster_ids'].tolist()

  # model init
  model = uisrnn.UISRNN(model_args)
  # model.load(SAVED_MODEL_NAME) # to load a checkpoint
  # tensorboard writer init
  writer = SummaryWriter()

  # training
  for epoch in range(training_args.epochs):
    stats = model.fit(train_sequence, train_cluster_id, training_args)
    # add to tensorboard
    for loss, cur_iter in stats:
      for loss_name, loss_value in loss.items():
        writer.add_scalar('loss/' + loss_name, loss_value, cur_iter)
    # save the model
    model.save(SAVED_MODEL_NAME)

  # testing
  predicted_cluster_ids = []
  test_record = []
  # predict sequences in parallel
  model.rnn_model.share_memory()
  pool = mp.Pool(NUM_WORKERS, maxtasksperchild=None)
  pred_gen = pool.imap(
      func=partial(model.predict, args=inference_args),
      iterable=test_sequences)
  # collect and score predictions
  for idx, predicted_cluster_id in enumerate(pred_gen):
    accuracy = uisrnn.compute_sequence_match_accuracy(
        test_cluster_ids[idx], predicted_cluster_id)
    predicted_cluster_ids.append(predicted_cluster_id)
    test_record.append((accuracy, len(test_cluster_ids[idx])))
    print('Ground truth labels:')
    print(test_cluster_ids[idx])
    print('Predicted labels:')
    print(predicted_cluster_id)
    print('-' * 80)

  # close multiprocessing pool
  pool.close()
  # close tensorboard writer
  writer.close()

  print('Finished diarization experiment')
  print(uisrnn.output_result(model_args, training_args, test_record))
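
The mp.Pool plus share_memory() pattern above needs the usual multiprocessing hygiene: the pool must be created under a main guard, since spawn-based platforms re-import the module in each worker. A minimal harness (sketch; NUM_WORKERS and SummaryWriter setup as in the original script):

if __name__ == '__main__':
    model_args, training_args, inference_args = uisrnn.parse_arguments()
    diarization_experiment(model_args, training_args, inference_args)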
Example #20
    def test_four_clusters(self):
        """Four clusters on vertices of a square."""
        label_to_center = {
            'A': np.array([0.0, 0.0]),
            'B': np.array([0.0, 1.0]),
            'C': np.array([1.0, 0.0]),
            'D': np.array([1.0, 1.0]),
        }

        # generate training data
        train_cluster_id = (['A'] * 400 + ['B'] * 300 + ['C'] * 200 +
                            ['D'] * 100)
        random.shuffle(train_cluster_id)
        train_sequence = _generate_random_sequence(train_cluster_id,
                                                   label_to_center,
                                                   sigma=0.01)

        # generate testing data
        test_cluster_id = ['A'] * 10 + ['B'] * 20 + ['C'] * 30 + ['D'] * 40
        random.shuffle(test_cluster_id)
        test_sequence = _generate_random_sequence(test_cluster_id,
                                                  label_to_center,
                                                  sigma=0.01)

        # construct model
        model_args, training_args, inference_args = uisrnn.parse_arguments()
        model_args.rnn_depth = 2
        model_args.rnn_hidden_size = 8
        model_args.observation_dim = 2
        model_args.verbosity = 3
        training_args.learning_rate = 0.01
        training_args.learning_rate_half_life = 50
        training_args.train_iteration = 200
        inference_args.test_iteration = 2

        model = uisrnn.UISRNN(model_args)

        # run training, and save the model
        model.fit(train_sequence, train_cluster_id, training_args)
        temp_file_path = tempfile.mktemp()
        model.save(temp_file_path)

        # run testing
        predicted_label = model.predict(test_sequence, inference_args)

        # run evaluation
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)

        # load new model
        loaded_model = uisrnn.UISRNN(model_args)
        loaded_model.load(temp_file_path)

        # run testing with loaded model
        predicted_label = loaded_model.predict(test_sequence, inference_args)

        # run evaluation with loaded model
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)

        # keep training from loaded model on a subset of training data
        transition_bias_1 = model.transition_bias
        training_args.learning_rate = 0.001
        training_args.train_iteration = 50
        model.fit(train_sequence[:100, :], train_cluster_id[:100],
                  training_args)
        transition_bias_2 = model.transition_bias
        self.assertNotAlmostEqual(transition_bias_1, transition_bias_2)
        model.logger.print(
            3, 'Asserting transition_bias changed from {} to {}'.format(
                transition_bias_1, transition_bias_2))

        # run evaluation
        model.logger.print(
            3, 'Asserting the equivalence between'
            '\nGround truth: {}\nPredicted: {}'.format(test_cluster_id,
                                                       predicted_label))
        accuracy = uisrnn.compute_sequence_match_accuracy(
            predicted_label, test_cluster_id)
        self.assertEqual(1.0, accuracy)
Example #21
def run_experiment(train_sequence, train_cluster_id, test_sequence,
                   test_cluster_id, model_args, training_args, inference_args,
                   exp_name):
    start = datetime.now()

    if training_args.debug:
        print('\n\n===== DEBUG MODE =====\n\n')

    def debug(m):
        if training_args.debug:
            print(m)

    # Create model class
    model = uisrnn.UISRNN(model_args)
    print('{} - Created {} model with {:,} params:'.format(
        datetime.now() - start, model.__class__.__name__,
        count_parameters(model.rnn_model)))
    print(model.rnn_model)

    # Training
    model_loc = os.path.join(training_args.out_dir, exp_name)
    model_constructed = (not training_args.overwrite) \
         and os.path.exists(model_loc)
    if model_constructed:
        try:
            model.load(model_loc)
            print('{} - Loaded trained model from {}'.format(
                datetime.now() - start,
                model_loc,
            ))
        except Exception as e:
            print('Unable to load model from {}:\n{}'.format(model_loc, e))
            model_constructed = False
    if not model_constructed:
        model.fit(train_sequence, train_cluster_id, training_args)
        print('{} - Trained model!'.format(datetime.now() - start))
        model.save(model_loc)
        print('{} - Saved model to {}'.format(datetime.now() - start,
                                              model_loc))

    # Testing
    predicted_cluster_ids = []
    test_record = []
    with torch.no_grad():
        for i, (test_seq, test_cluster) in tqdm(enumerate(
                zip(test_sequence, test_cluster_id)),
                                                total=len(test_cluster_id)):
            debug('Test seq ({}) shape: {}'.format(test_seq.__class__.__name__,
                                                   test_seq.shape))
            debug('Test cluster ({}): {}'.format(
                test_cluster.__class__.__name__, test_cluster))
            predicted_cluster_id = model.predict(test_seq, inference_args)
            debug('Predicted cluster ID: {}, class {}'.format(
                predicted_cluster_id, predicted_cluster_id.__class__.__name__))
            predicted_cluster_ids.append(predicted_cluster_id)
            accuracy = uisrnn.compute_sequence_match_accuracy(
                test_cluster.tolist(), predicted_cluster_id)

            # We are getting accuracy per batch
            test_record.append((accuracy, len(test_cluster)))
            debug('Gold labels: {}'.format(list(test_cluster)))
            debug('Pred labels: {}'.format(list(predicted_cluster_id)))
            debug('-' * 80)

    # Output
    output_string = uisrnn.output_result(model_args, training_args,
                                         test_record)
    print('Finished diarization experiment')
    print(output_string)
    with open(
            os.path.join(training_args.out_dir,
                         '{}_test.pkl'.format(exp_name)), 'wb') as wf:
        pickle.dump(test_record, wf)

    accuracy_array, _ = zip(*test_record)
    exp_accuracy = np.mean(accuracy_array)
    return exp_accuracy
Example #22
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for_json = {}
    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for_json[str(spk)] = []
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            for_json[str(spk)] += [(s / 1000, e / 1000)]
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    if args.out_path:
        print('writing segments:', for_json)
        with open(args.out_path, "w+", encoding='utf-8') as f:
            f.write(json.dumps(for_json))
Example #23
def main(wav_path,
         embedding_per_second=1.0,
         overlap_rate=0.5,
         exportFile=None,
         expectedSpeakers=2):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # speaker embedding every ?ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e
    n_speakers = len(speakerSlice)
    print('N-Speakers:', n_speakers)
    global speaker_final
    speaker_final = [pdb.empty()] * n_speakers
    for spk, timeDicts in speakerSlice.items():
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            diarization_try(wav_path, s / 1000, e / 1000, spk)
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)

    # Find the Top n Speakers
    speaker_final.sort(key=lambda speaker: speaker.duration_seconds,
                       reverse=True)
    speaker_final = speaker_final[0:expectedSpeakers]

    # Export the Files
    iso_wav_path = wav_path.split(".")[0]
    itr = 0
    while itr < len(speaker_final):
        write_path = exportFile + "_speaker" + str(itr) + ".wav"
        speaker_final[itr].export(write_path, format="wav")
        itr += 1

    del speaker_final
Example #24
def diarization_experiment(model_args, training_args, inference_args):
    """Experiment pipeline.

  Load data --> train model --> test model --> output result

  Args:
    model_args: model configurations
    training_args: training configurations
    inference_args: inference configurations
  """
    predicted_cluster_ids = []
    test_record = []

    # train_data = np.load('./data/toy_training_data.npz')
    # test_data = np.load('./data/toy_testing_data.npz')
    # train_sequence = train_data['train_sequence']
    # train_cluster_id = train_data['train_cluster_id']
    # test_sequences = test_data['test_sequences'].tolist()
    # test_cluster_ids = test_data['test_cluster_ids'].tolist()
    orig_train_sequences = np.load('data/train_sequence.npy').astype(
        np.float64)
    orig_train_cluster_ids = np.array(np.load('data/train_cluster_id.npy'))
    orig_test_sequences = np.load('data/test_sequence.npy').astype(np.float64)
    orig_test_cluster_ids = np.array(np.load('data/test_cluster_id.npy'))

    print(orig_test_sequences.shape)
    print(orig_test_cluster_ids.shape)

    # keep only the first 1% of the test data to speed the experiment up
    orig_test_sequences = orig_test_sequences[:orig_test_sequences.shape[0] // 100]
    orig_test_cluster_ids = orig_test_cluster_ids[:orig_test_cluster_ids.shape[0] // 100]

    print(orig_test_sequences.shape)
    print(orig_test_cluster_ids.shape)

    test_chunk_size = orig_test_sequences.shape[0] // 86
    test_left_over = orig_test_sequences.shape[0] % test_chunk_size
    test_new_len = orig_test_sequences.shape[0] - test_left_over

    test_sequences = np.split(orig_test_sequences[:test_new_len],
                              test_chunk_size)
    test_cluster_ids = np.split(orig_test_cluster_ids[:test_new_len],
                                test_chunk_size)
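    # NB: np.split's second argument is a number of equal sections, so
    # `test_chunk_size` is really a section count; after trimming the
    # leftover frames above, each chunk holds roughly 86 observations.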

    model = uisrnn.UISRNN(model_args)

    # train_sequences = np.array(train_sequences)
    # train_cluster_ids = np.array(train_cluster_ids)

    # d = vars(training_args)
    # # training
    # for i in range(train_sequences.shape[0]):
    #   train_sequence = train_sequences[i]
    #   train_cluster_id = train_cluster_ids[i]
    #   train_cluster_id = train_cluster_id.tolist()
    #   d['learning_rate'] = 1e-3
    #   model.fit(train_sequence, train_cluster_id, training_args)

    # # Take care of leftovers
    # train_sequence = orig_train_sequences[train_new_len:]
    # train_cluster_id = orig_train_cluster_id[train_new_len:]
    # d['learning_rate'] = 1e-3
    # model.fit(train_sequence, train_cluster_id, training_args)
    # model.save(SAVED_MODEL_NAME)

    # we can also skip training by calling:
    model.load(SAVED_MODEL_NAME)

    # testing
    # Take care of leftover
    # test_sequence = orig_test_sequences[test_new_len:]
    # test_cluster_id = orig_test_cluster_ids[test_new_len:].tolist()
    # predicted_cluster_id = model.predict(test_sequence, inference_args)
    # predicted_cluster_ids.append(predicted_cluster_id)
    # accuracy = uisrnn.compute_sequence_match_accuracy(
    #     test_cluster_id, predicted_cluster_id)
    # test_record.append((accuracy, len(test_cluster_id)))
    # print('Ground truth labels:')
    # print(test_cluster_id)
    # print('Predicted labels:')
    # print(predicted_cluster_id)
    # print('-' * 80)

    # Then the rest
    for (test_sequence, test_cluster_id) in zip(test_sequences,
                                                test_cluster_ids):
        test_cluster_id = test_cluster_id.tolist()
        predicted_cluster_id = model.predict(test_sequence, inference_args)
        predicted_cluster_ids.append(predicted_cluster_id)
        accuracy = uisrnn.compute_sequence_match_accuracy(
            test_cluster_id, predicted_cluster_id)
        test_record.append((accuracy, len(test_cluster_id)))
        print('Ground truth labels:')
        print(test_cluster_id)
        print('Predicted labels:')
        print(predicted_cluster_id)
        print('-' * 80)

    output_string = uisrnn.output_result(model_args, training_args,
                                         test_record)

    print('Finished diarization experiment')
    print(output_string)
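Note: uisrnn.compute_sequence_match_accuracy scores a prediction under the best one-to-one relabeling of cluster IDs. A sketch of that metric using scipy's Hungarian solver (an illustration of the idea, not the library's implementation):

import numpy as np
from scipy.optimize import linear_sum_assignment

def sequence_match_accuracy(labels_a, labels_b):
    """Accuracy under the best one-to-one label mapping (illustrative)."""
    assert len(labels_a) == len(labels_b)
    ids_a = {l: i for i, l in enumerate(sorted(set(labels_a)))}
    ids_b = {l: i for i, l in enumerate(sorted(set(labels_b)))}
    counts = np.zeros((len(ids_a), len(ids_b)), dtype=int)
    for a, b in zip(labels_a, labels_b):
        counts[ids_a[a], ids_b[b]] += 1  # confusion matrix over frames
    rows, cols = linear_sum_assignment(-counts)  # maximize matched frames
    return counts[rows, cols].sum() / len(labels_a)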
Example #25
0
def main(wav_path, embedding_per_second=1.0, overlap_rate=0.5):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # one speaker embedding every time_spec_rate ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which still contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    # for spk,timeDicts in speakerSlice.items():
    #     print('========= ' + str(spk) + ' =========')
    #     for timeDict in timeDicts:
    #         s = timeDict['start']
    #         e = timeDict['stop']
    #         s = fmtTime(s)  # change point moves to the center of the slice
    #         e = fmtTime(e)
    #         print(s+' ==> '+e)
    # p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    # p.draw()
    # p.plot.show()
    speech_r = speech_reg.Recognizer()
    sound = AudioSegment.from_wav(wav_path)
    for spk in speakerSlice.keys():
        print('========= ' + str(spk) + ' =========')
        for item_dict in speakerSlice[spk]:
            audio_seg = sound[item_dict['start']:item_dict['stop']]
            s = item_dict['start']
            e = item_dict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            print(s + ' ==> ' + e)
            item_dict.update({'content': audio_seg})  # placeholder; overwritten with the transcript below
            filename = 'speaker' + str(spk) + '-' + str(
                item_dict['start'] / 1000) + '-' + str(
                    item_dict['stop'] / 1000) + '.wav'
            audio_seg.export(filename, format="wav")
            audio = speech_reg.AudioFile(filename)
            # words=speech_reg.AudioData(audio_seg,sample_rate=fs,sample_width=2)
            with audio as source:
                words = speech_r.record(source)
                try:
                    res = speech_r.recognize_google(words)
                except speech_reg.UnknownValueError:
                    try:
                        res = speech_r.recognize_sphinx(words)
                    except speech_reg.UnknownValueError:
                        res = ''
                item_dict.update({'content': res})
            print(res)

    return speakerSlice
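Note: a possible refinement of the snippet above is to keep each slice in memory instead of exporting a temporary .wav per segment; speech_recognition's AudioFile also accepts a file-like object. A sketch reusing the `sound` and recognizer objects from above; `transcribe_segment` is a hypothetical helper:

import io

def transcribe_segment(sound, start_ms, stop_ms, recognizer):
    """Transcribe sound[start_ms:stop_ms] without touching disk (sketch)."""
    buf = io.BytesIO()
    sound[start_ms:stop_ms].export(buf, format="wav")  # pydub writes WAV bytes
    buf.seek(0)
    with speech_reg.AudioFile(buf) as source:
        audio = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio)
    except speech_reg.UnknownValueError:
        return ''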
Example #26
0
def main(wav_path,
         embedding_per_second=1.0,
         n_classes=5994,
         overlap_rate=0.5,
         plot_results=True):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': n_classes,  # use the function argument rather than a hard-coded value
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    print('intervals', intervals, len(intervals))
    print('mapTable', mapTable, len(mapTable))
    print('keys', keys, len(keys))
    # print('mapTable, keys', mapTable, keys)
    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        # print('v',v.shape)
        #print('feats', feats.shape)

        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)
    print(feats.shape)
    print(inference_args)
    print('predicted_label', predicted_label)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # one speaker embedding every time_spec_rate ms
    print('time_spec_rate', time_spec_rate)
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)
    print('speakerSlice', speakerSlice)
    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which still contains silence)
        print(spk, timeDicts)
        for tid, timeDict in enumerate(timeDicts):
            print(tid, timeDict)
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    print('offset', offset)
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

                print('i, s, e, tid, spk =', i, s, e, tid, spk)
            print('>>>>>', i, s, e, tid, spk)
            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    speaker_assignments = []
    for spk, timeDicts in speakerSlice.items():
        speaker = str(spk)
        print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            start = timeDict['start']
            end = timeDict['stop']
            start = fmtTime(start)  # change point moves to the center of the slice
            end = fmtTime(end)
            print(start + ' ==> ' + end)
            speaker_assignments.append((start, end, speaker, wav_path))

    if plot_results:
        p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
        p.draw()
        p.plot.show()

    return feats, predicted_label, intervals, speaker_assignments, time_spec_rate
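Note: arrangeResult is defined outside these snippets. Judging from how its output is consumed, it plausibly merges runs of identical predicted labels into per-speaker {'start', 'stop'} slices on the time_spec_rate grid; a hypothetical reconstruction under that assumption:

def arrange_result(labels, time_spec_rate):
    """Group consecutive identical labels into per-speaker time slices."""
    speaker_slice = {}
    run_start = 0
    for i in range(1, len(labels) + 1):
        # close the current run at a label change or at the end of the sequence
        if i == len(labels) or labels[i] != labels[run_start]:
            speaker_slice.setdefault(labels[run_start], []).append({
                'start': int(run_start * time_spec_rate),
                'stop': int(i * time_spec_rate),
            })
            run_start = i
    return speaker_slice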
Example #27
0
File: pipeline.py Project: gooltz/MoM.ai
def dia_audio(wav_path, embedding_per_second=0.3, overlap_rate=0.33):

    # gpu configuration
    #toolkits.initialize_GPU(args)

    params = {'dim': (257, None, 1),
              'nfft': 512,
              'spec_len': 250,
              'win_length': 400,
              'hop_length': 160,
              'n_classes': 5994,
              'sampling_rate': 16000,
              'normalize': True,
              }

    network_eval = spkModel.vggvox_resnet2d_icassp(input_dim=params['dim'],
                                                   num_class=params['n_classes'],
                                                   mode='eval', args=args)
    network_eval.load_weights(args.resume, by_name=True)

    # parse_arguments() is called here so the snippet is self-contained; the
    # original pipeline.py presumably defined these names at module scope
    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(
        wav_path, embedding_per_second=embedding_per_second, overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000*(1.0/embedding_per_second) * \
        (1.0-overlap_rate)  # one speaker embedding every time_spec_rate ms
    center_duration = int(1000*(1.0/embedding_per_second)//2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():    # map times back to the original wav (which still contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if(s != 0 and e != 0):
                    break
                if(s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i-1]
                    s = mapTable[keys[i-1]] + offset
                if(e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i-1]
                    e = mapTable[keys[i-1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():  # loop kept only for optional logging
        # print('========= ' + str(spk) + ' =========')
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            s = fmtTime(s)  # change point moves to the center of the slice
            e = fmtTime(e)
            # print(s + ' ==> ' + e)

#     p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
#     p.draw()
#     p.plot.show()
    return speakerSlice
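Note: genMap is also external. Inferred from how `mapTable` and `keys` are used across these snippets, it plausibly maps cumulative voiced time back to original-wav offsets at each VAD boundary; a hypothetical reconstruction, assuming `intervals` is a sequence of (start_ms, end_ms) voiced spans:

def gen_map(intervals):
    """Cumulative voiced time -> original-wav time at each boundary (sketch)."""
    map_table = {}
    cumulative = 0
    for start, end in intervals:
        map_table[cumulative] = start  # boundary on the silence-removed axis
        cumulative += end - start
    map_table[cumulative] = intervals[-1][1]  # sentinel closing the last span
    keys = sorted(map_table)
    return map_table, keys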
Example #28
0
def main(wav_path,
         embedding_per_second=1.0,
         overlap_rate=0.5,
         retain_audio_clip=False):

    # gpu configuration
    toolkits.initialize_GPU(args)

    params = {
        'dim': (257, None, 1),
        'nfft': 512,
        'spec_len': 250,
        'win_length': 400,
        'hop_length': 160,
        'n_classes': 5994,
        'sampling_rate': 16000,
        'normalize': True,
    }

    network_eval = spkModel.vggvox_resnet2d_icassp(
        input_dim=params['dim'],
        num_class=params['n_classes'],
        mode='eval',
        args=args)
    network_eval.load_weights(args.resume, by_name=True)

    model_args, _, inference_args = uisrnn.parse_arguments()
    model_args.observation_dim = 512
    uisrnnModel = uisrnn.UISRNN(model_args)
    uisrnnModel.load(SAVED_MODEL_NAME)

    specs, intervals = load_data(wav_path,
                                 embedding_per_second=embedding_per_second,
                                 overlap_rate=overlap_rate)
    mapTable, keys = genMap(intervals)

    feats = []
    for spec in specs:
        spec = np.expand_dims(np.expand_dims(spec, 0), -1)
        v = network_eval.predict(spec)
        feats += [v]

    feats = np.array(feats)[:, 0, :].astype(float)  # [splits, embedding dim]
    predicted_label = uisrnnModel.predict(feats, inference_args)

    time_spec_rate = 1000 * (1.0 / embedding_per_second) * (
        1.0 - overlap_rate)  # one speaker embedding every time_spec_rate ms
    center_duration = int(1000 * (1.0 / embedding_per_second) // 2)
    speakerSlice = arrangeResult(predicted_label, time_spec_rate)

    for spk, timeDicts in speakerSlice.items():  # map times back to the original wav (which still contains silence)
        for tid, timeDict in enumerate(timeDicts):
            s = 0
            e = 0
            for i, key in enumerate(keys):
                if (s != 0 and e != 0):
                    break
                if (s == 0 and key > timeDict['start']):
                    offset = timeDict['start'] - keys[i - 1]
                    s = mapTable[keys[i - 1]] + offset
                if (e == 0 and key > timeDict['stop']):
                    offset = timeDict['stop'] - keys[i - 1]
                    e = mapTable[keys[i - 1]] + offset

            speakerSlice[spk][tid]['start'] = s
            speakerSlice[spk][tid]['stop'] = e

    for spk, timeDicts in speakerSlice.items():
        for timeDict in timeDicts:
            s = timeDict['start']
            e = timeDict['stop']
            get_transcript(str(spk), s, e)

    result = print_transcipt()
    try:
        # open the transcript once, outside the loop, so the handle is always
        # bound and is closed even when `result` is empty
        with open(os.path.join(dir_name, 'FinalTranscript.txt'), 'a') as file:
            for item in result:
                start = fmtTime(item[1])
                end = fmtTime(item[2])
                transcription = f"{start} ==> {end}: [Speaker : {item[0]}] : {item[3]}"
                print(transcription)
                file.write(transcription + '\n')  # newline so entries do not run together
    except Exception as exp:
        print(f"Failed in main() while writing to file with exception {exp}")

    if not retain_audio_clip:
        shutil.rmtree(dir_name)
    else:
        print(
            f'Audio files of transcriptions can be found in {dir_name} folder')

    p = PlotDiar(map=speakerSlice, wav=wav_path, gui=True, size=(25, 6))
    p.draw()
    p.plot.show()

    return result