def extract_embed(seq_file, model_file, preproc_file, output_path,
                  max_seq_length, pooling_output, write_format, **kwargs):
    """Extract one embedding per sequence with a SeqQEmbed model and write them.

    Args:
      seq_file: input specifier handed to ``SDRF.create``.
      model_file: path handed to ``SeqQEmbed.load``.
      preproc_file: optional path loaded with ``TransformList.load`` and
        given to the reader as ``transform``; ``None`` disables it.
      output_path: output specifier handed to ``DWF.create``.
      max_seq_length: value passed to ``model.build``.
      pooling_output: value passed to ``model.build_embed``.
      write_format: ``'p1'`` writes the first output of
        ``model.predict_embed``; ``'p1+p2'`` writes both outputs stacked
        horizontally; any other value writes the second output.
      **kwargs: extra options; only those selected by ``SDRF.filter_args``
        are forwarded to the reader.
    """
    set_float_cpu('float32')

    sr_args = SDRF.filter_args(**kwargs)

    if preproc_file is not None:
        preproc = TransformList.load(preproc_file)
    else:
        preproc = None

    sr = SDRF.create(seq_file, transform=preproc, **sr_args)

    t1 = time.time()

    model = SeqQEmbed.load(model_file)
    model.build(max_seq_length)
    model.build_embed(pooling_output)
    y_dim = model.embed_dim

    # Only the number of sequences is needed here; the reader is reset
    # before the extraction pass (presumably this rewinds it -- confirm).
    _, seq_lengths = sr.read_num_rows()
    sr.reset()
    num_seqs = len(seq_lengths)

    p1_y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    p2_y = np.zeros((num_seqs, y_dim), dtype=float_keras())
    keys = []

    for i in xrange(num_seqs):
        ti1 = time.time()
        # read(1) returns parallel containers of keys and data; one item each
        read_keys, read_data = sr.read(1)
        key = read_keys[0]
        x = read_data[0]

        ti2 = time.time()
        logging.info('Extracting embeddings %d/%d for %s, num_frames: %d' %
                     (i, num_seqs, key, x.shape[0]))
        keys.append(key)
        p1_y[i], p2_y[i] = model.predict_embed(x)

        ti4 = time.time()
        # Log the key string itself (not the one-element container returned
        # by the reader) so both messages name the sequence the same way.
        logging.info('Elapsed time embeddings %d/%d for %s, total: %.2f read: %.2f, vae: %.2f' %
                     (i, num_seqs, key, ti4-ti1, ti2-ti1, ti4-ti2))

    logging.info('Extract elapsed time: %.2f' % (time.time() - t1))

    if write_format == 'p1':
        y = p1_y
    elif write_format == 'p1+p2':
        y = np.hstack((p1_y, p2_y))
    else:
        # Any other value falls back to the second output
        y = p2_y

    hw = DWF.create(output_path)
    hw.write(keys, y)
예제 #2
0
def convert(input_file, output_file, class_file):
    """Re-save a score matrix as a ``TrialScores`` object.

    Args:
      input_file: specifier opened with ``DRF.create``; its first read
        supplies the segment names and the score matrix.
      output_file: destination passed to ``TrialScores.save``.
      class_file: text file whose first whitespace-separated token on each
        line is taken as a model name.
    """
    reader = DRF.create(input_file)
    seg_set, score_mat = reader.read(0, squeeze=True)

    model_set = []
    with open(class_file, 'r') as f:
        for line in f:
            fields = line.rstrip().split()
            model_set.append(fields[0])

    # The matrix is transposed before being paired with (model_set, seg_set)
    TrialScores(model_set, seg_set, score_mat.T).save(output_file)
예제 #3
0
def compute_mfcc_feats(input_path, output_path,
                       compress, compression_method, write_num_frames, **kwargs):
    """Compute MFCC features for every item of the input and write them.

    Args:
      input_path: input specifier; opened with ``AR`` when
        ``mfcc.input_step == 'wave'`` and with ``DRF.create`` otherwise.
      output_path: output specifier handed to ``DWF.create``.
      compress: forwarded to ``DWF.create``.
      compression_method: forwarded to ``DWF.create``.
      write_num_frames: optional path; when not ``None`` a line
        ``"<key> <num_frames>"`` is written there for every item.
      **kwargs: extra options, filtered through ``MFCC.filter_args`` and
        the reader's ``filter_args``.
    """
    mfcc_args = MFCC.filter_args(**kwargs)
    mfcc = MFCC(**mfcc_args)

    if mfcc.input_step == 'wave':
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ',
                        compress=compress,
                        compression_method=compression_method)

    f_num_frames = None
    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')

    # try/finally guarantees the num-frames file is closed even when
    # feature extraction or writing raises part-way through the input.
    try:
        for data in reader:
            if mfcc.input_step == 'wave':
                # the audio reader also yields a third value, unused here
                key, x, fs = data
            else:
                key, x = data
            logging.info('Extracting MFCC for %s' % (key))
            t1 = time.time()
            y = mfcc.compute(x)
            dt = (time.time() - t1)*1000
            # dt can be 0 with a coarse clock; do not let a log-only
            # statistic abort the extraction with ZeroDivisionError.
            rtf = mfcc.frame_shift*y.shape[0]/dt if dt > 0 else float('inf')
            logging.info('Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
                         (key, y.shape[0], dt, rtf))
            writer.write([key], [y])

            if f_num_frames is not None:
                f_num_frames.write('%s %d\n' % (key, y.shape[0]))

            mfcc.reset()
    finally:
        if f_num_frames is not None:
            f_num_frames.close()
def compute_mfcc_feats(input_path, output_path, compress, compression_method,
                       write_num_frames, use_gpu, nn_model_path, chunk_size,
                       context, **kwargs):
    """Compute features, run them through an enhancement network, write them.

    Args:
      input_path: input specifier; opened with ``AR`` when
        ``mfcc.input_step == 'wave'`` and with ``DRF.create`` otherwise.
      output_path: output specifier handed to ``DWF.create``.
      compress: forwarded to ``DWF.create``.
      compression_method: forwarded to ``DWF.create``.
      write_num_frames: optional path; when not ``None`` a line
        ``"<key> <num_frames>"`` is written there for every item.
      use_gpu: use CUDA device 0 when true and CUDA is available,
        otherwise the CPU.
      nn_model_path: checkpoint loaded with ``torch.load``; its
        ``'state_dict'`` entry is loaded into the network.
      chunk_size: forwarded to ``apply_nnet``.
      context: forwarded to ``apply_nnet``.
      **kwargs: extra options, filtered through ``MFCC.filter_args`` and
        the reader's ``filter_args``.
    """
    # open device
    if use_gpu and torch.cuda.is_available():
        # The variable may be unset; .get avoids a KeyError in a log line.
        logging.info('CUDA_VISIBLE_DEVICES=%s' %
                     os.environ.get('CUDA_VISIBLE_DEVICES', ''))
        logging.info('init gpu device')
        device = torch.device('cuda', 0)
        # touch the device once so initialization problems surface here
        torch.tensor([0]).to(device)
    else:
        logging.info('init cpu device')
        device = torch.device('cpu')

    mfcc_args = MFCC.filter_args(**kwargs)
    mfcc = MFCC(**mfcc_args)
    # PUT YOUR NNET MODEL HERE!!!!
    # The architecture (CAN with 45 channels) is hard-coded and must match
    # the checkpoint in nn_model_path.
    enhancer = CAN(num_channels=45)
    enhancer.load_state_dict(
        torch.load(nn_model_path, map_location=device)['state_dict'])
    enhancer.to(device)
    enhancer.eval()

    if mfcc.input_step == 'wave':
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path,
                        scp_sep=' ',
                        compress=compress,
                        compression_method=compression_method)

    f_num_frames = None
    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')

    # try/finally guarantees the num-frames file is closed even when
    # extraction, enhancement or writing raises part-way through the input.
    try:
        for data in reader:
            if mfcc.input_step == 'wave':
                # the audio reader also yields a third value, unused here
                key, x, fs = data
            else:
                key, x = data
            logging.info('Extracting filter-banks for %s' % (key))
            t1 = time.time()
            y = mfcc.compute(x)

            # run the loaded enhancement network over the features
            logging.info('Running enhancement network')
            y = apply_nnet(y, enhancer, chunk_size, context, device)

            dt = (time.time() - t1) * 1000
            # dt can be 0 with a coarse clock; do not let a log-only
            # statistic abort the extraction with ZeroDivisionError.
            rtf = mfcc.frame_shift * y.shape[0] / dt if dt > 0 else float('inf')
            logging.info(
                'Extracted filter-banks for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f'
                % (key, y.shape[0], dt, rtf))
            writer.write([key], [y])

            if f_num_frames is not None:
                f_num_frames.write('%s %d\n' % (key, y.shape[0]))

            mfcc.reset()
    finally:
        if f_num_frames is not None:
            f_num_frames.close()
if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        fromfile_prefix_chars='@',
        description=
        'Compute filter-bank features and enhance with pytorch model')

    parser.add_argument('--input', dest='input_path', required=True)
    parser.add_argument('--output', dest='output_path', required=True)
    parser.add_argument('--write-num-frames',
                        dest='write_num_frames',
                        default=None)

    DRF.add_argparse_args(parser)
    MFCC.add_argparse_args(parser)
    parser.add_argument('--compress',
                        dest='compress',
                        default=False,
                        action='store_true',
                        help='Compress the features')
    parser.add_argument('--compression-method',
                        dest='compression_method',
                        default='auto',
                        choices=compression_methods,
                        help='Compression method')
    parser.add_argument('-v',
                        '--verbose',
                        dest='verbose',
                        default=1,
예제 #6
0
def extract_embed(seq_file, model_file, preproc_file, output_path, max_length,
                  layer_names, **kwargs):
    """Extract one embedding per sequence with a SeqEmbed model and write them.

    Sequences no longer than the effective maximum length are zero-padded
    into a fixed-size buffer and embedded in one call.  Longer sequences are
    split into equally sized chunks (the last chunk is taken from the end of
    the sequence, so it may overlap the previous one) and the chunk
    embeddings are averaged.

    Args:
      seq_file: input specifier handed to ``SDRF.create``.
      model_file: path handed to ``SeqEmbed.load``.
      preproc_file: optional path loaded with ``TransformList.load`` and
        given to the reader as ``transform``; ``None`` disables it.
      output_path: output specifier handed to ``DWF.create``.
      max_length: upper bound on the buffer length; the effective value is
        the minimum of this and the longest sequence length.
      layer_names: value passed to ``model.build_embed``.
      **kwargs: extra options; only those selected by ``SDRF.filter_args``
        are forwarded to the reader.
    """
    set_float_cpu('float32')

    reader_args = SDRF.filter_args(**kwargs)
    preproc = None if preproc_file is None else TransformList.load(preproc_file)
    reader = SDRF.create(seq_file, transform=preproc, **reader_args)

    start_time = time.time()

    model = SeqEmbed.load(model_file)
    model.build()
    model.build_embed(layer_names)
    embed_dim = model.embed_dim

    _, seq_lengths = reader.read_num_rows()
    reader.reset()
    num_seqs = len(seq_lengths)
    max_length = np.minimum(np.max(seq_lengths), max_length)

    embeds = np.zeros((num_seqs, embed_dim), dtype=float_keras())
    # single-item input buffer reused for every prediction
    buf = np.zeros((1, max_length, model.x_dim), dtype=float_keras())
    keys = []

    for i in xrange(num_seqs):
        t_begin = time.time()
        batch = reader.read(1)
        key, x = batch[0][0], batch[1][0]

        t_read = time.time()
        num_frames = x.shape[0]
        logging.info('Extracting embeddings %d/%d for %s, num_frames: %d' %
                     (i, num_seqs, key, num_frames))
        keys.append(key)
        # clear whatever the previous sequence left in the buffer
        buf[...] = 0

        if num_frames > max_length:
            num_chunks = int(np.ceil(float(num_frames) / max_length))
            chunk_size = int(np.ceil(float(num_frames) / num_chunks))
            # all chunks but the last are consecutive; the last one is
            # anchored at the end of the sequence
            chunks = [x[j * chunk_size:(j + 1) * chunk_size]
                      for j in xrange(num_chunks - 1)]
            chunks.append(x[-chunk_size:])
            for chunk in chunks:
                buf[0, :chunk_size] = chunk
                embeds[i] += model.predict_embed(buf, batch_size=1).ravel()
            embeds[i] /= num_chunks
        else:
            buf[0, :num_frames] = x
            embeds[i] = model.predict_embed(buf, batch_size=1)

        t_end = time.time()
        logging.info(
            'Elapsed time embeddings %d/%d for %s, total: %.2f read: %.2f, vae: %.2f'
            % (i, num_seqs, key, t_end - t_begin, t_read - t_begin,
               t_end - t_read))

    logging.info('Extract elapsed time: %.2f' % (time.time() - start_time))

    DWF.create(output_path).write(keys, embeds)
예제 #7
0
def compute_mfcc_feats(input_path, output_path,
                       compress, compression_method, write_num_frames, 
                       use_gpu, nn_model_path, chunk_size, context,
                       **kwargs):
    """Compute log filter-banks, enhance them with a network, then finish MFCC.

    The pipeline is split in two ``MFCC`` objects built from the same
    arguments: the first stops at ``output_step='logfb'``, the second starts
    from ``input_step='logfb'``.  Between them the filter-banks are
    mean-normalized, passed through the network, and column 0 (treated as
    the log-energy) is shifted by the change in log-sum-exp of the
    filter-banks caused by the enhancement.

    Args:
      input_path: input specifier; opened with ``AR`` when
        ``mfcc1.input_step == 'wave'`` and with ``DRF.create`` otherwise.
      output_path: output specifier handed to ``DWF.create``.
      compress: forwarded to ``DWF.create``.
      compression_method: forwarded to ``DWF.create``.
      write_num_frames: optional path; when not ``None`` a line
        ``"<key> <num_frames>"`` is written there for every item.
      use_gpu: try to initialize a free GPU when true and CUDA is
        available; falls back to the CPU after ``max_tries`` failures.
      nn_model_path: state dict loaded with ``torch.load`` into ``CGN``.
      chunk_size: forwarded to ``apply_nnet``.
      context: forwarded to ``apply_nnet``.
      **kwargs: extra options, filtered through ``MFCC.filter_args`` and
        the reader's ``filter_args``.
    """
    # open device
    if use_gpu and torch.cuda.is_available():
        os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
        max_tries = 100
        for g in range(max_tries):
            try:
                gpu_ids = find_free_gpus()
                os.environ['CUDA_VISIBLE_DEVICES'] = gpu_ids
                logging.info('CUDA_VISIBLE_DEVICES=%s' % os.environ['CUDA_VISIBLE_DEVICES'])
                logging.info('init gpu device')
                device = torch.device('cuda', 0)
                # touch the device once so initialization problems surface here
                torch.tensor([0]).to(device)
                break
            except Exception:
                # Retry on any ordinary error, but let KeyboardInterrupt and
                # SystemExit propagate (a bare except would swallow them).
                if g < max_tries-1:
                    logging.info('failing init gpu, trying again')
                    time.sleep(10)
                else:
                    logging.info('failing init gpu, using cpu')
                    device = torch.device('cpu')

    else:
        logging.info('init cpu device')
        device = torch.device('cpu')

    mfcc_args1 = MFCC.filter_args(**kwargs)
    mfcc_args2 = copy.deepcopy(mfcc_args1)
    mfcc_args1['output_step'] = 'logfb'
    mfcc_args2['input_step'] = 'logfb'
    # Report the configuration through logging rather than print so it does
    # not end up on stdout.
    logging.info('kwargs=%s' % (kwargs,))
    logging.info('mfcc_args1=%s' % (mfcc_args1,))
    logging.info('mfcc_args2=%s' % (mfcc_args2,))
    mfcc1 = MFCC(**mfcc_args1)
    mfcc2 = MFCC(**mfcc_args2)

    mvn = MVN(norm_var=False, left_context=150, right_context=150)

    # PUT YOUR NNET MODEL HERE!!!!
    enhancer = CGN()
    # the checkpoint is expected to be the state dict itself
    enhancer.load_state_dict(torch.load(nn_model_path, map_location=device))
    enhancer.to(device)
    enhancer.eval()

    if mfcc1.input_step == 'wave':
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ',
                        compress=compress,
                        compression_method=compression_method)

    f_num_frames = None
    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')

    # try/finally guarantees the num-frames file is closed even when
    # extraction, enhancement or writing raises part-way through the input.
    try:
        for data in reader:
            if mfcc1.input_step == 'wave':
                # the audio reader also yields a third value, unused here
                key, x, fs = data
            else:
                key, x = data
            logging.info('Extracting filter-banks for %s' % (key))
            t1 = time.time()
            y = mfcc1.compute(x)

            # separate logE and filterbanks
            logE = y[:, 0]
            y = y[:, 1:]

            # estimate log energy from filterbanks
            logEy1 = logsumexp(y, axis=-1)

            # normalize the filterbanks and run the enhancement network
            logging.info('Running enhancement network')
            y = mvn.normalize(y)
            y = apply_nnet(y, enhancer, chunk_size, context, device)

            # rescale logE by the change in filterbank log-energy
            logEy2 = logsumexp(y, axis=-1)
            logE = logE + (logEy2 - logEy1)

            # concatenate logE and filterbanks
            y = np.concatenate((logE[:, None], y), axis=-1)

            # apply DCT
            logging.info('Applying DCT')
            y = mfcc2.compute(y)

            dt = (time.time() - t1)*1000
            # dt can be 0 with a coarse clock; do not let a log-only
            # statistic abort the extraction with ZeroDivisionError.
            rtf = mfcc1.frame_shift*y.shape[0]/dt if dt > 0 else float('inf')
            logging.info('Extracted filter-banks for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' %
                         (key, y.shape[0], dt, rtf))
            writer.write([key], [y])

            if f_num_frames is not None:
                f_num_frames.write('%s %d\n' % (key, y.shape[0]))

            mfcc1.reset()
    finally:
        if f_num_frames is not None:
            f_num_frames.close()
예제 #8
0
    print(mfcc_args2)
    mfcc1 = MFCC(**mfcc_args1)
    mfcc2 = MFCC(**mfcc_args2)   

    # PUT YOUR NNET MODEL HERE!!!!
    enhancer = CGN()
    #enhancer.load_state_dict(torch.load(nn_model_path, map_location=device)['state_dict'])
    enhancer.load_state_dict(torch.load(nn_model_path, map_location=device))
    enhancer.to(device)
    enhancer.eval()

    if mfcc1.input_step == 'wave':
        input_args = AR.filter_args(**kwargs)
        reader = AR(input_path, **input_args)
    else:
        input_args = DRF.filter_args(**kwargs)
        reader = DRF.create(input_path, **input_args)

    writer = DWF.create(output_path, scp_sep=' ',
                        compress=compress,
                        compression_method=compression_method)

    if write_num_frames is not None:
        f_num_frames = open(write_num_frames, 'w')
    
    for data in reader:
        if mfcc1.input_step == 'wave':
            key, x, fs = data
        else:
            key, x = data
        logging.info('Extracting filter-banks for %s' % (key))