Python ReadHelper 예제들, kaldiio.ReadHelper Python 예제들

예제 #1

0

파일 보기

파일: prepare_lda_plda_inputs.py 프로젝트: kailashkarthik9/MultiModalEmotionDetection

def get_meld_utterances(speech_dir, text_dir):
    """Collect emotion labels and optional speech/text vectors for MELD.

    Utterance ids are read from ``MELD_PATH/utt2spk``; the emotion label is
    the part of the id before the first ``-``.

    Args:
        speech_dir: Directory containing ``meld/xvector.scp``, or the string
            ``'none'`` to skip speech vectors.
        text_dir: Directory containing ``meld_embedding.pkl``, or the string
            ``'none'`` to skip text embeddings.

    Returns:
        A ``defaultdict(dict)`` mapping utterance id to a dict holding
        ``'emotion'`` and, when loaded, ``'speech'`` and ``'text'``.

    Raises:
        Exception: On a duplicate id in ``utt2spk`` or when a vector belongs
            to an utterance id that ``utt2spk`` does not list.
    """
    utterances = defaultdict(dict)

    with open('%s/utt2spk' % (MELD_PATH), 'r') as utt2spk:
        for entry in utt2spk.readlines():
            # Each line must contain exactly two space-separated fields.
            utterance_id, _ = entry.split(' ')

            if utterance_id in utterances:
                raise Exception("Duplicate utterance: %s" % utterance_id)

            emotion = utterance_id.split('-')[0]
            utterances[utterance_id]['emotion'] = emotion

    if speech_dir != 'none':
        xvector_rspec = 'scp:%s/meld/xvector.scp' % speech_dir
        with ReadHelper(xvector_rspec) as reader:
            for utterance_id, speech_vector in reader:
                known = utterance_id in utterances
                if not known:
                    raise Exception("Speech vector for unknown utterance: %s" %
                                    utterance_id)

                utterances[utterance_id]['speech'] = speech_vector

    if text_dir != 'none':
        with open('%s/meld_embedding.pkl' % text_dir, 'rb') as pkl:
            meld = pickle.load(pkl)
            for _, row in meld.iterrows():
                utterance_id = row['ID']
                text_embeddings = row['Text Embeddings']

                known = utterance_id in utterances
                if not known:
                    raise Exception("Text vector for unknown utterance: %s" %
                                    utterance_id)

                utterances[utterance_id]['text'] = text_embeddings

    return utterances

예제 #2

0

파일 보기

def par_core_extractXvectors(inFeatsScp, outXvecArk, outXvecScp, net,
                             layerName):
    """Write the output of one layer of ``net`` for every utterance.

    To be called using pytorch multiprocessing. Each matrix read from
    ``inFeatsScp`` is transposed, given a leading batch axis, moved to the
    GPU and passed through ``net``; the output of the layer named
    ``layerName`` is captured by a forward hook and written under the same
    key.

    The original author's note recommends keeping feats.scp small (hint: use
    split_data_dir.sh).

    Args:
        inFeatsScp: Path of the input feats.scp.
        outXvecArk: Path of the output archive.
        outXvecScp: Path of the output scp index.
        net: Model called as ``net(x=..., eps=0)``.
        layerName: Attribute path of the layer on ``net`` whose output is
            extracted.
    """

    activation = {}

    def get_activation(name):
        def hook(model, input, output):
            activation[name] = output.detach()

        return hook

    # NOTE(review): eval() runs layerName as code; only pass trusted names.
    handle = eval('net.%s.register_forward_hook(get_activation(layerName))' %
                  layerName)

    try:
        with kaldi_python_io.ArchiveWriter(outXvecArk, outXvecScp,
                                           matrix=False) as writer:
            with ReadHelper('scp:%s' % inFeatsScp) as reader:
                for key, mat in reader:
                    # Only the hooked activation is used; the model's own
                    # return value is discarded.
                    net(x=torch.Tensor(mat).permute(1, 0).unsqueeze(0).cuda(),
                        eps=0)
                    writer.write(
                        key, np.squeeze(activation[layerName].cpu().numpy()))
    finally:
        # Detach the hook so repeated calls on the same model do not stack
        # hooks on the layer.
        handle.remove()

예제 #3

0

파일 보기

def main():
    """Apply ``specaug`` to every matrix in feats.scp and write new archives.

    Reads ``feats.scp`` from the first entry of ``args.spec_feat_dir``,
    augments each matrix, keeps the results in memory, and then writes them
    to ``feats_spec.ark`` / ``feats_spec.scp`` in the same directory.
    """

    args = parse()

    # Positional arguments forwarded to specaug() after the feature tensor.
    specaug_params = (
        args.spec_time_warp,
        args.spec_freq_mask_width,
        args.spec_time_mask_width,
        args.spec_num_freq_masks,
        args.spec_num_time_masks,
        args.spec_time_mask_bound_ratio,
        args.spec_replace_with_zero,
    )

    featdir = args.spec_feat_dir[0]
    featscp = os.path.join(featdir, 'feats.scp')

    # The line count is used only to size the progress bar.
    with open(featscp) as f:
        num_entries = sum(1 for _ in f)

    feats_dict = {}
    # Using tqdm as a context manager closes the bar when the loop ends or
    # fails.
    with ReadHelper('scp:' + featscp) as reader, \
            tqdm(total=num_entries) as pbar:
        for key, mat in reader:
            spec_feat = specaug(torch.from_numpy(mat), *specaug_params)
            feats_dict[key] = spec_feat.numpy()
            pbar.update(1)

    with WriteHelper('ark,scp:' + featdir + '/feats_spec.ark,' + featdir +
                     '/feats_spec.scp') as writer:
        for key, mat in feats_dict.items():
            writer(key, mat)

예제 #4

0

파일 보기

파일: calc_cossim_scores.py 프로젝트: siddalmia/espnet

def ReadXvecs(rspec):
    """Read every entry of ``rspec`` into a dict.

    Args:
        rspec: Read specifier passed to ``ReadHelper``.

    Returns:
        Dict mapping utterance id to the value read for it.
    """
    xvecs = dict()
    # The with-block closes the reader on exit; no explicit close is needed.
    with ReadHelper(rspec) as reader:
        for utid, xvec in reader:
            xvecs[utid] = xvec
    return xvecs

예제 #5

0

파일 보기

def main():
    """Convert the fMLLR feature archives of each TIMIT set to .npy files.

    For every name in ``SETS`` this reads ``feats_fmllr_<set>.<1..10>.ark``
    under ``TIMIT_PATH``, saves each matrix as float32 ``<key>.npy`` in a
    per-set directory under ``OUTPUT_DIR``, and writes ``<set>.csv`` listing
    file path, length and a placeholder label, sorted by length descending.
    The process exits when done, and also when ``KALDI_ROOT`` or
    ``TIMIT_PATH`` is not a directory.
    """
    if not os.path.isdir(KALDI_ROOT):
        print('CHANGE THIS TO YOUR OWN KALDI ROOT: ', KALDI_ROOT)
        exit()

    if not os.path.isdir(TIMIT_PATH):
        print('Invalid path for the kaldi TIMIT dataset: ', TIMIT_PATH)
        print('Please run the kaldi scripts first! More information are described in the README file and Wiki page.')
        # Nothing below can succeed without the data directory, so stop here
        # just like the KALDI_ROOT check above.
        exit()

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # read data from the preprocessed kaldi directory
    for s in SETS:
        set_name = s.replace('_', '-')

        output = {}
        print('Preprocessing', s, 'data...')
        cur_dir = os.path.join(OUTPUT_DIR, set_name)
        os.makedirs(cur_dir, exist_ok=True)

        for i in range(10):
            ark_path = TIMIT_PATH + s + '/data/feats_fmllr_' + s + '.' + str(i + 1) + '.ark'
            with ReadHelper('ark:' + ark_path) as reader:
                for key, array in tqdm(reader):
                    array = np.asarray(array).astype('float32')
                    np.save(os.path.join(cur_dir, key), array)
                    # CSV paths are relative to OUTPUT_DIR.
                    output[os.path.join(set_name, key + '.npy')] = len(array)

        # Longest utterances first.
        output = sorted(output.items(), key=operator.itemgetter(1), reverse=True)
        df = pd.DataFrame(data={'file_path': [fp for fp, l in output], 'length': [l for fp, l in output], 'label': 'None'})
        df.to_csv(os.path.join(OUTPUT_DIR, set_name + '.csv'))

    print('[ARK-TO-TIMIT] - All done, saved at \'' + str(OUTPUT_DIR) + '\', exit.')
    exit()

예제 #6

0

파일 보기

    def __init__(self, vad_rspec, reg_exp):
        """Load per-fragment VAD data and concatenate it per session.

        The result is stored in ``self.data``, mapping
        ``"<session>-<speaker>"`` to the ``np.hstack`` of that session's
        fragments in the order they were read.

        Args:
            vad_rspec: Read specifier passed to ``ReadHelper``.
            reg_exp: Compiled pattern applied twice: first to the utterance
                id (group 1 = session with index, group 2 = speaker), then
                to group 1 itself (group 1 = session, group 2 = integer
                fragment index).

        Raises:
            AssertionError: If an id does not match ``reg_exp``, a new
                session does not start at index 1, or an index is lower than
                the previously read one.
        """
        data = dict()
        prev = -1
        # The with-block closes the reader; no explicit close() is needed.
        with ReadHelper(vad_rspec) as reader:
            for utid, prob in reader:
                result = reg_exp.match(utid)
                assert result is not None, 'Wrong utterance ID format: \"{}\"'.format(
                    utid)
                sess_indx = result.group(1)
                spkr = result.group(2)

                result = reg_exp.match(sess_indx)
                assert result is not None, 'Wrong utterance ID format: \"{}\"'.format(
                    sess_indx)
                sess = result.group(1)
                indx = int(result.group(2))

                sess = sess + '-' + spkr

                if sess not in data:
                    # The first fragment of every session must have index 1.
                    assert indx == 1
                    prev = -1
                    data[sess] = list()
                assert indx >= prev
                data[sess].append(prob)
                prev = indx
        print('  loaded {} sessions'.format(len(data)))
        print('  combining fragments')
        self.data = dict()
        for sess, items in data.items():
            self.data[sess] = np.hstack(items)

예제 #7

0

파일 보기

파일: prepare_lda_plda_inputs.py 프로젝트: kailashkarthik9/MultiModalEmotionDetection

def get_cremad_utterances(speech_dir, text_dir):
    """Collect emotion labels and optional speech/text vectors for CREMA-D.

    Utterance ids are read from ``CREMA_D_PATH/utt2spk``; the emotion label
    is the part of the id before the first ``-``. Text embeddings are not
    matched by id: each utterance gets one embedding drawn from the
    ``dd_embedding.pkl`` rows that share its emotion, chosen by a generator
    seeded with the utterance id so the choice is reproducible.

    Args:
        speech_dir: Directory containing ``cremad/xvector.scp``, or the
            string ``'none'`` to skip speech vectors.
        text_dir: Directory containing ``dd_embedding.pkl``, or the string
            ``'none'`` to skip text embeddings.

    Returns:
        A ``defaultdict(dict)`` mapping utterance id to a dict holding
        ``'emotion'`` and, when loaded, ``'speech'`` and ``'text'``.

    Raises:
        Exception: On a duplicate id in ``utt2spk`` or a speech vector for
            an id that ``utt2spk`` does not list.
        IndexError: If no text embedding exists for an utterance's emotion.
    """
    utterances = defaultdict(dict)
    with open('%s/utt2spk' % (CREMA_D_PATH), 'r') as f:
        for line in f:
            # Each line must contain exactly two space-separated fields.
            utterance_id, _ = line.split(' ')

            if utterance_id in utterances:
                raise Exception("Duplicate utterance: %s" % utterance_id)

            utterances[utterance_id]['emotion'] = utterance_id.split('-')[0]

    if speech_dir != 'none':
        with ReadHelper('scp:%s/cremad/xvector.scp' % speech_dir) as reader:
            for utterance_id, speech_vector in reader:
                if utterance_id not in utterances:
                    raise Exception("Speech vector for unknown utterance: %s" %
                                    utterance_id)

                utterances[utterance_id]['speech'] = speech_vector

    if text_dir != 'none':
        embeddings_by_emotion = defaultdict(list)
        with open('%s/dd_embedding.pkl' % text_dir, 'rb') as f:
            dd = pickle.load(f)
        for _, row in dd.iterrows():
            emotion = row['ID'].split('-')[0]
            embeddings_by_emotion[emotion].append(row['Text Embeddings'])

        for utterance_id, utterance in utterances.items():
            # A private generator seeded with the id makes the same pick as
            # random.seed(id) + random.choice(), without reseeding the
            # process-wide random module.
            rng = random.Random(utterance_id)
            utterance['text'] = rng.choice(
                embeddings_by_emotion[utterance['emotion']])
    return utterances

예제 #8

0

파일 보기

파일: flickr8k_word_image.py 프로젝트: lwang114/DiscoPhoneInfoBottleneck

    def load_audio(self, audio_file):
        """Load one audio feature matrix and its frame mask.

        For ``"mfcc"`` the waveform is loaded with torchaudio and passed
        through ``self.audio_transforms``. For ``"cpc"`` the features are
        read with ``np.loadtxt`` when ``audio_file`` exists, otherwise from
        a gzipped archive through ``ReadHelper`` (the last entry wins).

        Args:
            audio_file: Path of the audio or feature file.

        Returns:
            Tuple ``(inputs, input_mask)``: ``inputs`` after
            ``fix_embedding_length`` to ``self.max_feat_len``, and a mask of
            length ``self.max_feat_len`` set to 1 for the first ``nframes``
            positions.

        Raises:
            ValueError: If ``self.audio_feature_type`` is not supported.
        """
        if self.audio_feature_type == "mfcc":
            audio, _ = torchaudio.load(audio_file)
            try:
                # NOTE(review): `begin`/`end` are not defined in this method;
                # unless they exist as module globals this raises NameError
                # and the whole clip is used below. Confirm the intent.
                inputs = self.audio_transforms(audio[:, begin:end]).squeeze(0)
            except Exception:
                inputs = self.audio_transforms(audio).squeeze(0)
        elif self.audio_feature_type == "cpc":
            if os.path.exists(audio_file):
                audio = np.loadtxt(audio_file)
            else:
                with ReadHelper(f"ark: gunzip -c {audio_file} |") as ark_f:
                    # Drain the archive; the last entry is kept.
                    for _, audio in ark_f:
                        pass
            inputs = torch.FloatTensor(audio).t()
        else:
            # Actually raise; otherwise `inputs` is unbound below and the
            # caller gets an unrelated UnboundLocalError.
            raise ValueError(
                f"Audio feature type {self.audio_feature_type} not supported")

        nframes = inputs.size(-1)
        input_mask = torch.zeros(self.max_feat_len)
        input_mask[:nframes] = 1.
        inputs = fix_embedding_length(inputs.t(), self.max_feat_len).t()
        return inputs, input_mask

예제 #9

0

파일 보기

def parse_predictions_ark(filepath):
    """Load a predictions archive into a dict keyed by utterance id.

    Args:
        filepath: Path of the archive, read through ``'ark:<filepath>'``.

    Returns:
        Dict mapping each utterance id to its predictions.

    Raises:
        Exception: If an utterance id occurs more than once in the archive.
    """
    rspecifier = 'ark:%s' % (filepath)
    utt2predictions = {}
    with ReadHelper(rspecifier) as reader:
        for utterance_id, predictions in reader:
            already_seen = utterance_id in utt2predictions
            if already_seen:
                raise Exception('%s duped in %s' % (utterance_id, filepath))
            utt2predictions[utterance_id] = predictions
    return utt2predictions

예제 #10

0

파일 보기

파일: gen_pseudo_xvecs.py 프로젝트: ssloxford/DistributionPreservingXVectors

def load_xvecs(xvec_file):
    """Read the original x-vectors listed in an scp file.

    Args:
        xvec_file: Path of the scp file.

    Returns:
        Dict mapping each key to its x-vector.
    """
    with ReadHelper('scp:' + xvec_file) as reader:
        return {key: xvec for key, xvec in reader}

예제 #11

0

파일 보기

파일: timit_prepare_phones.py 프로젝트: danpovey/gruber

def main():
    """Read phone arrays from ``'ark:-'`` and save them as a compressed .npz.

    Each array is cast to ``np.int8`` and stored under its utterance id in
    ``args.output_file``.
    """
    args = parser.parse_args()

    # Maps each utterance-id string to a NumPy array containing its phones
    # (in 1-based indexing, per the original author's comment).
    with ReadHelper('ark:-') as reader:
        name_to_phones = {
            key: numpy_array.astype(np.int8)
            for key, numpy_array in reader
        }
    np.savez_compressed(args.output_file, **name_to_phones)

예제 #12

0

파일 보기

def data_gen_test_1():
    """Yield ``(features, label)`` pairs for the selected test utterances.

    Changes the working directory to ``../../Accent_Data/test/``, loads the
    pickled ``test_dict``, then walks ``feats1.scp`` .. ``feats8.scp``. Only
    utterances whose ``test_dict`` entry equals 1 are yielded. Features are
    zero-padded or truncated to 1000 frames; the label is a one-hot vector
    of length 8 selected by the second ``-``-separated field of the key.

    Raises:
        KeyError: If a key is missing from ``test_dict``.
        ValueError: If a selected key names an accent outside the known set.
    """
    # Position in this tuple is the index of the 1 in the one-hot label.
    accents = ('AMERICAN', 'BRITISH', 'CHINESE', 'INDIAN', 'JAPANESE',
               'KOREAN', 'PORTUGUESE', 'RUSSIAN')
    max_frames = 1000

    os.chdir('../../')
    root_dir_test = 'Accent_Data/test/'
    print('FOR TEST DIR..............')
    print(os.getcwd())
    os.chdir(root_dir_test)
    print(os.getcwd())

    # NOTE(review): pickle.load executes code from the file; only load
    # trusted data.
    with open("test_dict", "rb") as file_to_read:
        test_dict = pickle.load(file_to_read)

    for i in range(1, 9):
        with ReadHelper("scp:" + "feats" + str(i) + ".scp") as reader:
            for key, numpy_array in reader:
                if test_dict[key] != 1:
                    continue

                num_frames = numpy_array.shape[0]
                if num_frames < max_frames:
                    # Integer zeros, as before, so the padded result has the
                    # same dtype as with the previous list-built padding.
                    padding = np.zeros(
                        (max_frames - num_frames, numpy_array.shape[1]),
                        dtype=int)
                    numpy_array = np.concatenate([numpy_array, padding])
                elif num_frames > max_frames:
                    numpy_array = numpy_array[:max_frames]

                accent = key.split('-')[1]
                if accent not in accents:
                    # Previously the label of the preceding utterance was
                    # silently reused (or the name was unbound).
                    raise ValueError(
                        'Unknown accent %r in key %r' % (accent, key))
                label = np.zeros(len(accents))
                label[accents.index(accent)] = 1.

                yield numpy_array, label

예제 #13

0

파일 보기

파일: gen_pseudo_xvecs.py 프로젝트: ssloxford/DistributionPreservingXVectors

def train_models(pool_data, xvec_out_dir, combine_genders=False):
    """Fit PCA+GMM transforms on the pooled speaker x-vectors.

    Args:
        pool_data: Directory whose sub-directories are pool sources; a
            source is used only if it contains ``wav.scp``.
        xvec_out_dir: Directory holding
            ``xvectors_<source>/spk_xvector.scp`` for each source.
        combine_genders: If true, fit one model on all x-vectors; otherwise
            fit one per gender.

    Returns:
        The result of ``generate_pca_and_gmm`` when ``combine_genders`` is
        true, else a dict with keys ``'m'`` and ``'f'`` holding one such
        result each.

    Raises:
        ValueError: If the x-vector scp of a pool source does not exist.
    """
    # Keep only sub-directories that carry a wav.scp.
    pool_data_sources = []
    for entry in os.listdir(pool_data):
        is_source_dir = os.path.isdir(join(pool_data, entry))
        if is_source_dir and os.path.isfile(
                os.path.join(pool_data, entry, 'wav.scp')):
            pool_data_sources.append(entry)

    gender_pools = {'m': [], 'f': []}
    xvector_pool = []

    for pool_source in pool_data_sources:
        print('Adding {} to the pool'.format(join(pool_data, pool_source)))

        # Speaker -> gender map of this source.
        pool_spk2gender = {}
        with open(join(pool_data, pool_source, 'spk2gender')) as f:
            for line in f.read().splitlines():
                fields = line.split()
                pool_spk2gender[fields[0]] = fields[1]

        # Speaker-level x-vectors of this source.
        pool_xvec_file = join(xvec_out_dir, 'xvectors_' + pool_source,
                              'spk_xvector.scp')
        if not os.path.exists(pool_xvec_file):
            raise ValueError(
                'Xvector file: {} does not exist'.format(pool_xvec_file))

        with ReadHelper('scp:' + pool_xvec_file) as reader:
            for spk, xvec in reader:
                xvector_pool.append(xvec)
                gender_pools[pool_spk2gender[spk]].append(xvec)

    print("Read ", len(gender_pools['m']), " male pool xvectors")
    print("Read ", len(gender_pools['f']), " female pool xvectors")

    # Fit and train the GMMs.
    if combine_genders:
        return generate_pca_and_gmm(xvector_pool,
                                    pca_parameter,
                                    random_state=random_seed)

    return {
        gender: generate_pca_and_gmm(gender_pools[gender],
                                     pca_parameter,
                                     random_state=random_seed)
        for gender in ('m', 'f')
    }

예제 #14

0

파일 보기

파일: extract_wav2vec.py 프로젝트: LeBenchmark/Interspeech2021

def write_features(model, input, output):
    """Run ``model`` on every segment and write the features as ark/scp.

    Segments are produced by piping ``extract-segments`` over
    ``<input>/wav.scp`` and ``<input>/segments``. Each model output is
    repeated twice along axis 0 before it is written to
    ``<output>/feats.ark`` and ``<output>/feats.scp``.

    Args:
        model: Callable applied to each float32 waveform.
        input: Data directory containing ``wav.scp`` and ``segments``.
        output: Output directory; created if it does not exist.
    """
    os.makedirs(output, exist_ok=True)

    rspecifier = (
        f'ark:extract-segments scp:{input}/wav.scp {input}/segments ark:-|')
    wspecifier = f'ark,scp:{output}/feats.ark,{output}/feats.scp'

    with ReadHelper(rspecifier) as reader, \
            WriteHelper(wspecifier) as writer:
        # The first element of each value is not used.
        for key, (_sf, wav) in reader:
            feat = model(wav.astype(dtype=np.float32))
            writer(key, np.repeat(feat, 2, axis=0))

예제 #15

0

파일 보기

def main(args):
    """Save a phone posteriorgram from ``phone_post.1.scp`` as ``.npy``.

    Reads ``<args.phone_post_dir>/phone_post.1.scp`` and writes
    ``<args.output_dir>/phone_post.npy``.

    NOTE(review): every entry rebinds ``array``, so only the last matrix in
    the scp is saved, and an empty scp leaves ``array`` unbound when
    ``np.save`` runs. Confirm whether that is intended.
    """
    source_dir = args.phone_post_dir
    target_dir = args.output_dir

    rspecifier = 'scp:' + source_dir + '/phone_post.1.scp'

    with ReadHelper(rspecifier) as reader:
        for _key, matrix in reader:
            array = np.asarray(matrix)

    np.save(target_dir + '/phone_post.npy', array)

예제 #16

0

파일 보기

def data_gen_test():
    """Yield ``(features, label)`` pairs for every test utterance.

    Changes the working directory to ``../../Accent_Data/test/`` and walks
    ``feats1.scp`` .. ``feats8.scp``. Features are zero-padded or truncated
    to 1000 frames; the label is a one-hot vector of length 8 selected by
    the second ``-``-separated field of the key.

    Raises:
        ValueError: If a key names an accent outside the known set.
    """
    # Position in this tuple is the index of the 1 in the one-hot label.
    accents = ('AMERICAN', 'BRITISH', 'CHINESE', 'INDIAN', 'JAPANESE',
               'KOREAN', 'PORTUGUESE', 'RUSSIAN')
    max_frames = 1000

    os.chdir('../../')
    print('FOR TEST DIR..............')
    root_dir_test = 'Accent_Data/test/'
    os.chdir(root_dir_test)
    print(os.getcwd())

    for i in range(1, 9):
        with ReadHelper("scp:" + "feats" + str(i) + ".scp") as reader:
            for key, numpy_array in reader:
                num_frames = numpy_array.shape[0]
                if num_frames < max_frames:
                    # Integer zeros, as before, so the padded result has the
                    # same dtype as with the previous list-built padding.
                    padding = np.zeros(
                        (max_frames - num_frames, numpy_array.shape[1]),
                        dtype=int)
                    numpy_array = np.concatenate([numpy_array, padding])
                elif num_frames > max_frames:
                    numpy_array = numpy_array[:max_frames]

                accent = key.split('-')[1]
                if accent not in accents:
                    # Previously the label of the preceding utterance was
                    # silently reused (or the name was unbound).
                    raise ValueError(
                        'Unknown accent %r in key %r' % (accent, key))
                label = np.zeros(len(accents))
                label[accents.index(accent)] = 1.

                yield numpy_array, label

예제 #17

0

파일 보기

def main():
    """Score enrollment against test embeddings.

    Loads both embedding sets from their scp files, optionally subtracts a
    mean vector from each, and hands the matrices plus their key-to-row
    indices to ``generate_score``.
    """
    opt = parse_opt()

    enroll_mat = []
    test_mat = []
    # Looking up a new key assigns it the next row index (the current size
    # of the dict), which matches the order of rows appended below.
    enroll_idx_dict = defaultdict(lambda: len(enroll_idx_dict))
    test_idx_dict = defaultdict(lambda: len(test_idx_dict))

    enroll = 'scp:' + opt.enroll_embedding_path
    verify = 'scp:' + opt.test_embedding_path
    with ReadHelper(enroll) as reader:
        for key, numpy_array in reader:
            enroll_mat.append(numpy_array)
            enroll_idx_dict[key]
    with ReadHelper(verify) as reader:
        for key, numpy_array in reader:
            test_mat.append(numpy_array)
            test_idx_dict[key]
    enroll_mat = np.stack(enroll_mat)
    test_mat = np.stack(test_mat)

    if opt.is_mean:
        if opt.mean_vec == "self":
            mean_feat = centering_mean(enroll_mat)
        else:
            mean_feat = np.loadtxt(opt.mean_vec)
        enroll_mat = enroll_mat - mean_feat
        # Center the test embeddings themselves; subtracting from enroll_mat
        # here would replace the test set with enrollment data.
        test_mat = test_mat - mean_feat

    enroll_idx_dict = dict(enroll_idx_dict)
    test_idx_dict = dict(test_idx_dict)
    assert enroll_mat.ndim == 2, "dimension should be 2"
    generate_score(enroll_mat, test_mat, enroll_idx_dict, test_idx_dict,
                   opt.trial_list, opt.result_cosine)

예제 #18

0

파일 보기

파일: cos_snorm.py 프로젝트: twistedmove/open-speaker-verification

def main():
    """Compute score-normalisation statistics and write normalised scores.

    Loads the trial and cohort embeddings from their scp files, derives the
    statistics with ``generate_stat`` and passes them to ``output_score``.
    """
    opt = parse_opt()

    # Looking up a new key assigns it the next row index (the current size
    # of the dict).
    whole_idx_dict = defaultdict(lambda: len(whole_idx_dict))

    whole_rspec = 'scp:' + opt.whole_trial_embedding_path
    cohort_rspec = 'scp:' + opt.cohort_embedding_path

    whole_rows = []
    with ReadHelper(whole_rspec) as reader:
        for key, numpy_array in reader:
            whole_rows.append(numpy_array)
            whole_idx_dict[key]

    with ReadHelper(cohort_rspec) as reader:
        cohort_rows = [numpy_array for _key, numpy_array in reader]

    whole_mat = np.stack(whole_rows)
    cohort_mat = np.stack(cohort_rows)

    assert whole_mat.ndim == 2 and cohort_mat.ndim == 2, "dimension should be 2"

    # The --snorm option switches the adaptive variant off.
    stats_dict = generate_stat(whole_mat, cohort_mat, whole_idx_dict,
                               not opt.snorm, opt.topN)
    output_score(opt.raw_scores, stats_dict, opt.result_snorm)

예제 #19

0

파일 보기

파일: calculate_gop.py 프로젝트: MarceloSancinetti/epa-gop-pykaldi

def compute_gop(df_phones_pure, df_alignments):
    """Compute a GOP entry per utterance and pickle the result.

    Reads ``loglikes.ark`` from the working directory, applies a softmax
    over axis 1 of each matrix, scores it with ``gop_robust_with_matrix``
    and dumps the ``{key: gop}`` dict to ``gop_epa.pickle``.

    Args:
        df_phones_pure: Passed through to ``gop_robust_with_matrix``.
        df_alignments: DataFrame with one column per utterance key.
    """

    def _score(key, loglikes):
        # Apply softmax before computing.
        posteriors = softmax(np.array(loglikes), axis=1)
        df_scores = pd.DataFrame(df_alignments.loc[:, key]).transpose()
        df_scores['p'] = [posteriors]
        return gop_robust_with_matrix(df_scores, df_phones_pure, 6024, 1, [],
                                      [])

    gop = {}
    with ReadHelper('ark:loglikes.ark') as reader:
        for key, loglikes in tqdm.tqdm(reader):
            gop[key] = _score(key, loglikes)

    with open('gop_epa.pickle', 'wb') as handle:
        pickle.dump(gop, handle, protocol=pickle.HIGHEST_PROTOCOL)

예제 #20

0

파일 보기

def extract_mfcc(name, original_mfcc_dir, mfcc_npy_root_dir):
    """Dump every MFCC matrix of the ``raw_mfcc_<name>.*.scp`` files to .npy.

    For each matching scp, matrices are saved as ``<utt_id>.npy`` in
    ``<mfcc_npy_root_dir>/<name>.<num>``, where ``<num>`` is the
    second-to-last dot-separated field of the scp path.

    Args:
        name: Name used in the scp file pattern.
        original_mfcc_dir: Directory searched for the scp files.
        mfcc_npy_root_dir: Root directory for the .npy output.

    Returns:
        Dict mapping utterance id to the path of its saved .npy file.
    """
    scp_pattern = p_join(original_mfcc_dir, 'raw_mfcc_{}.*.scp'.format(name))

    utt_id2mfcc = {}
    for scp in glob(scp_pattern):
        num = scp.split('.')[-2]
        print('extract:', scp)

        mfcc_npy_dir = p_join(mfcc_npy_root_dir, name + '.' + num)
        os.makedirs(mfcc_npy_dir, exist_ok=True)

        with ReadHelper('scp:' + scp) as reader:
            for utt_id, mfcc in reader:
                npy_path = p_join(mfcc_npy_dir, utt_id + '.npy')
                np.save(npy_path, mfcc)
                utt_id2mfcc[utt_id] = npy_path
    return utt_id2mfcc

예제 #21

0

파일 보기

파일: prepare_lda_plda_inputs.py 프로젝트: kailashkarthik9/MultiModalEmotionDetection

def get_iemocap_utterances(speech_dir, text_dir, subset):
    """Collect labels and optional vectors for one IEMOCAP session.

    Utterance ids are read from ``IEMOCAP_PATH/utt2spk``. The session number
    is parsed from the first two characters of the third ``-``-separated
    field of the id, and only utterances whose session equals the last
    character of ``subset`` are kept.

    Args:
        speech_dir: Directory containing ``iemocap/xvector.scp``, or the
            string ``'none'`` to skip speech vectors.
        text_dir: Directory containing ``iemocap_embedding.pkl``, or the
            string ``'none'`` to skip text embeddings.
        subset: String whose last character is the session number to keep.

    Returns:
        A ``defaultdict(dict)`` mapping utterance id to a dict holding
        ``'session'``, ``'emotion'`` and, when loaded, ``'speech'`` and
        ``'text'``.

    Raises:
        Exception: On a duplicate id in ``utt2spk`` or when a vector of the
            selected session belongs to an id ``utt2spk`` does not list.
    """

    def session_of(utterance_id):
        # Third '-'-separated field; its first two characters are the
        # session number.
        return int(utterance_id.split('-')[2][0:2])

    utterances = defaultdict(dict)

    with open('%s/utt2spk' % (IEMOCAP_PATH), 'r') as utt2spk:
        for entry in utt2spk.readlines():
            utterance_id, _ = entry.split(' ')
            session = session_of(utterance_id)

            if session == int(subset[-1:]):
                if utterance_id in utterances:
                    raise Exception("Duplicate utterance: %s" % utterance_id)

                utterances[utterance_id]['session'] = session
                utterances[utterance_id]['emotion'] = utterance_id.split(
                    '-')[0]

    if speech_dir != 'none':
        xvector_rspec = 'scp:%s/iemocap/xvector.scp' % speech_dir
        with ReadHelper(xvector_rspec) as reader:
            for utterance_id, speech_vector in reader:
                if session_of(utterance_id) == int(subset[-1:]):
                    if utterance_id not in utterances:
                        raise Exception(
                            "Speech vector for unknown utterance: %s" %
                            utterance_id)

                    utterances[utterance_id]['speech'] = speech_vector

    if text_dir != 'none':
        with open('%s/iemocap_embedding.pkl' % text_dir, 'rb') as pkl:
            iemocap = pickle.load(pkl)
            for _, row in iemocap.iterrows():
                utterance_id = row['ID']
                if session_of(utterance_id) == int(subset[-1:]):
                    if utterance_id not in utterances:
                        raise Exception(
                            "Text vector for unknown utterance: %s" %
                            utterance_id)

                    utterances[utterance_id]['text'] = row['Text Embeddings']

    return utterances

예제 #22

0

파일 보기

def get_train_data():
    """Load the augmented training features with a gender column appended.

    Reads ``feats1.scp`` .. ``feats20.scp`` from the training dump
    directory, applies ``spec_augment``, pads or truncates each matrix to
    800 frames and appends one extra column: 1 when the key starts with
    ``'F'``, 0 when it starts with ``'M'``. Keys with any other first
    character are reported and skipped.

    Returns:
        Tuple ``(feat_arr_train, ids_train)``: the feature matrices and
        their keys, in the same order and of equal length.
    """
    feat_arr_train = []
    ids_train = []
    num_frames = 800

    root_dir_train = 'TIMIT_Data/speed_perturbation_80fbanks3pitchs/dump/trainNet_sp/deltafalse/'  # Root directory for training set
    start_dir = os.getcwd()
    os.chdir(root_dir_train)

    try:
        for i in range(1, 21):
            with ReadHelper('scp:' + 'feats' + str(i) + '.scp') as reader:
                for key, numpy_array in reader:
                    numpy_array = spec_augment(numpy_array)

                    if numpy_array.shape[0] < num_frames:
                        # pad_sequences is applied to the transpose and the
                        # result is transposed back.
                        numpy_array = pad_sequences(numpy_array.T,
                                                    maxlen=num_frames,
                                                    padding='post')
                        numpy_array = numpy_array.T
                    elif numpy_array.shape[0] > num_frames:
                        numpy_array = numpy_array[:num_frames]

                    # Incorporating gender information as a binary feature.
                    if key[0] == 'F':  # female samples
                        gender_flag = 1
                    elif key[0] == 'M':  # male samples
                        gender_flag = 0
                    else:
                        print('ERROR! ' + str(key))
                        continue

                    gender_column = np.array([gender_flag] *
                                             num_frames).reshape(num_frames, 1)
                    feat_arr_train.append(
                        np.concatenate((numpy_array, gender_column), axis=1))
                    # Record the id only together with its feature so both
                    # lists stay aligned when a key is skipped above.
                    ids_train.append(key)
    finally:
        # Return to the starting directory even if reading fails.
        os.chdir(start_dir)

    return feat_arr_train, ids_train

예제 #23

0

파일 보기

파일: vad_prob_mod.py 프로젝트: Elli-Kafritsa/asr-tts-class-2021

 def __init__(self, vad_rspec, reg_exp):
     """Load VAD alignments into a nested session/piece/speaker mapping.

     The result is stored in ``self.data`` as
     ``data[session][piece][speaker] = alignment``.

     Args:
         vad_rspec: Read specifier passed to ``ReadHelper``.
         reg_exp: Compiled pattern matched against each utterance id;
             groups 1, 2 and 3 give session, piece and speaker.

     Raises:
         AssertionError: If an utterance id does not match ``reg_exp``.
     """
     data = dict()
     # The with-block closes the reader; no explicit close() is needed.
     with ReadHelper(vad_rspec) as reader:
         for utid, align in reader:
             result = reg_exp.match(utid)
             assert result is not None, 'Wrong VAD alignment utterance ID format: \"{}\"'.format(
                 utid)
             sess = result.group(1)
             piece = result.group(2)
             spkr = result.group(3)
             # Create the nested levels on first use.
             data.setdefault(sess, dict()).setdefault(piece,
                                                      dict())[spkr] = align
     print('  loaded {} sessions'.format(len(data)))
     self.data = data

예제 #24

0

파일 보기

파일: inference.py 프로젝트: cvqluu/MTL-Speaker-Embeddings

def test_nosil(generator, ds_test, device, mindcf=False):
    """Evaluate speaker verification on voiced frames only.

    Features are streamed through a Kaldi pipe (``apply-cmvn-sliding``
    followed by ``select-voiced-frames``), embedded with ``generator`` in
    eval mode, L2-normalised and scored on the trial pairs of ``ds_test``.
    ``generator`` is switched back to train mode before returning.

    Args:
        generator: Embedding model.
        ds_test: Dataset providing ``veri_utts``, ``veri_0``, ``veri_1``,
            ``veri_labs`` and ``data_base_path``.
        device: Device the feature tensors are moved to.
        mindcf: If true, also return two minDCF values.

    Returns:
        ``eer``, or ``(eer, mindcf1, mindcf2)`` when ``mindcf`` is true
        (computed with ``p_target`` 0.01 and 0.001).
    """
    generator.eval()
    num_examples = len(ds_test.veri_utts)

    rspecifier = (
        'ark:apply-cmvn-sliding --norm-vars=false --center=true --cmn-window=300 scp:{0}/feats_trimmed.scp '
        'ark:- | select-voiced-frames ark:- scp:{0}/vad_trimmed.scp ark:- |'
        .format(ds_test.data_base_path))

    all_embeds = []
    all_utts = []
    with torch.no_grad():
        with ReadHelper(rspecifier) as reader:
            for key, feat in tqdm(reader, total=num_examples):
                if key not in ds_test.veri_utts:
                    continue
                all_utts.append(key)
                feats = torch.FloatTensor(feat).unsqueeze(0).to(device)
                all_embeds.append(generator(feats).cpu().numpy())

    metric = SpeakerRecognitionMetrics(distance_measure='cosine')
    all_embeds = normalize(np.vstack(all_embeds), axis=1)
    all_utts = np.array(all_utts)

    print(all_embeds.shape, len(ds_test.veri_utts))
    utt_embed = OrderedDict(zip(all_utts, all_embeds))

    emb0 = np.array([utt_embed[utt] for utt in ds_test.veri_0])
    emb1 = np.array([utt_embed[utt] for utt in ds_test.veri_1])

    scores = metric.scores_from_pairs(emb0, emb1)
    fpr, tpr, thresholds = roc_curve(1 - ds_test.veri_labs,
                                     scores,
                                     pos_label=1,
                                     drop_intermediate=False)
    eer = metric.eer_from_ers(fpr, tpr)
    generator.train()

    if not mindcf:
        return eer

    mindcf1 = metric.compute_min_dcf(fpr, tpr, thresholds, p_target=0.01)
    mindcf2 = metric.compute_min_dcf(fpr, tpr, thresholds, p_target=0.001)
    return eer, mindcf1, mindcf2

예제 #25

0

파일 보기

import sys
from glob import iglob, glob
from os.path import basename, dirname, join as p_join
import json

import numpy as np
from kaldiio import ReadHelper, WriteHelper

# Usage: <script> <vad.scp> <utt2num_frames output path>
# Writes one "<utt_id> <number of VAD entries>" line per utterance and keeps
# every VAD vector in utt_id2vad.
vad_scp = sys.argv[1]
utt2num_frames = sys.argv[2]

utt_ids = []
vads = []
utt_id2vad = {}

with ReadHelper('scp:' + vad_scp) as vad_reader, \
        open(utt2num_frames, 'w') as writer:
    for i, (utt_id, vad) in enumerate(vad_reader):
        utt_id2vad[utt_id] = vad
        writer.write('{} {}\n'.format(utt_id, len(vad)))

#print('vad reading completed')
#with ReadHelper('scp:' + mfcc_scp) as mfcc_reader:
#    for i, (utt_id, mfcc) in enumerate(mfcc_reader):
#        if utt_id in utt_id2vad.keys():
#            vad = utt_id2vad[utt_id]
#            assert(len(vad) == len(mfcc))
#            print(i, 'check',  vad.shape, mfcc.shape)

예제 #26

0

파일 보기

파일: preprocess.py 프로젝트: dannigt/NMTGMinor.lowLatency

def make_asr_data(src_file, tgt_file, tgt_dicts, max_src_length=64, max_tgt_length=64,
                  input_type='word', stride=1, concat=1, prev_context=0, fp16=False,
                  reshape=True, asr_format="h5"):
    """Build parallel (audio features, target indices) lists for ASR.

    The target file is read line by line; for every line the next feature
    matrix is fetched from the source: by dataset key ``str(index)`` for
    ``asr_format == "h5"`` (optionally spread over ``<src_file>.<k>.h5``
    shards), or by sequential iteration for ``asr_format == "scp"``.

    Args:
        src_file: Path of the h5 file / shard prefix, or of the Kaldi scp.
        tgt_file: Text file with one target sentence per line.
        tgt_dicts: Dictionary object providing ``convertToIdx``.
        max_src_length: Maximum number of source frames (after stride and
            concat) for a pair to be kept.
        max_tgt_length: Pairs are kept only if the target has at most
            ``max_tgt_length - 2`` tokens.
        input_type: ``'word'`` (whitespace split) or ``'char'``.
        stride: Keep every ``stride``-th frame.
        concat: When ``reshape`` is true and ``concat != 1``, zero-pad the
            frames to a multiple of ``concat`` and stack ``concat``
            consecutive frames into one.
        prev_context: Must be 0; anything larger raises.
        fp16: Convert the kept source tensors to half precision.
        reshape: Enables the ``concat`` stacking.
        asr_format: ``"h5"`` or ``"scp"``.

    Returns:
        ``(src, tgt)``: lists of tensors sorted by source length, ties
        broken by target length.

    Raises:
        ValueError: Unknown ``asr_format`` or ``input_type``.
        NotImplementedError: ``prev_context > 0``.
        SystemExit: A feature matrix is missing from a single h5 file.

    Note:
        Also reads the module-level ``opt`` (``tgt_seq_length_trunc``,
        ``report_every``, ``shuffle``) and ``onmt.Constants``.
    """
    # Fail fast on arguments that could otherwise only surface as a
    # NameError in the middle of the loop.
    if asr_format not in ("h5", "scp"):
        raise ValueError("Unsupported asr_format: %r (expected 'h5' or 'scp')"
                         % (asr_format,))
    if input_type not in ('word', 'char'):
        raise ValueError("Unsupported input_type: %r (expected 'word' or 'char')"
                         % (input_type,))
    if prev_context > 0:
        print("Multiple ASR context isn't supported at the moment   ")
        raise NotImplementedError

    src, tgt = [], []
    src_sizes, tgt_sizes = [], []
    count, ignored = 0, 0
    n_unk_words = 0

    print('Processing %s & %s ...' % (src_file, tgt_file))

    srcf = None
    tgtf = None
    audio_data = None
    # -1 means "single h5 file"; >= 0 is the index of the current shard.
    file_idx = -1

    try:
        if asr_format == "h5":
            if src_file[-2:] == "h5":
                srcf = h5.File(src_file, 'r')
            else:
                file_idx = 0
                srcf = h5.File(src_file + "." + str(file_idx) + ".h5", 'r')
        else:
            from kaldiio import ReadHelper
            audio_data = iter(ReadHelper('scp:' + src_file))

        tgtf = open(tgt_file)

        # `index` counts every target line, including the ones dropped
        # later, so features and targets stay aligned.
        for index, tline in enumerate(tgtf):
            if asr_format == "h5":
                if str(index) in srcf:
                    feature_vectors = np.array(srcf[str(index)])
                elif file_idx != -1:
                    # Key not in the current shard: move on to the next one.
                    srcf.close()
                    srcf = None
                    file_idx += 1
                    srcf = h5.File(src_file + "." + str(file_idx) + ".h5", 'r')
                    feature_vectors = np.array(srcf[str(index)])
                else:
                    print("No feature vector for index:", index, file=sys.stderr)
                    sys.exit(-1)
            else:
                _, feature_vectors = next(audio_data)

            # Only the first 40 feature columns are used.
            feature_vectors = feature_vectors[:, :40]

            # Subsample frames with the `stride` argument (not a global).
            if stride == 1:
                sline = torch.from_numpy(feature_vectors)
            else:
                sline = torch.from_numpy(feature_vectors[0::stride])

            if reshape and concat != 1:
                # Zero-pad to a multiple of `concat`, then fold `concat`
                # consecutive frames into one wider frame.
                pad = (concat - sline.size()[0] % concat) % concat
                padding = torch.FloatTensor(pad, sline.size()[1]).zero_()
                sline = torch.cat((sline, padding), 0)
                sline = sline.reshape((int(sline.size()[0] / concat),
                                       sline.size()[1] * concat))

            tline = tline.strip()

            # Empty target: skip the pair (not counted as processed).
            if tline == "":
                print('WARNING: ignoring an empty line (' + str(count + 1) + ')')
                continue

            if input_type == 'word':
                tgt_words = tline.split()
            else:
                tgt_words = split_line_by_char(tline)

            if len(tgt_words) <= max_tgt_length - 2 and sline.size(0) <= max_src_length:
                # Check truncation condition.
                if opt.tgt_seq_length_trunc != 0:
                    tgt_words = tgt_words[:opt.tgt_seq_length_trunc]

                if fp16:
                    sline = sline.half()
                src.append(sline)

                tgt_tensor = tgt_dicts.convertToIdx(tgt_words,
                                                    onmt.Constants.UNK_WORD,
                                                    onmt.Constants.BOS_WORD,
                                                    onmt.Constants.EOS_WORD)
                tgt.append(tgt_tensor)
                src_sizes.append(len(sline))
                tgt_sizes.append(len(tgt_words))

                unks = tgt_tensor.eq(onmt.Constants.UNK).sum().item()
                n_unk_words += unks

                if unks > 0 and "<unk>" not in tline:
                    print("DEBUGGING: This line contains UNK: %s" % tline)
            else:
                ignored += 1

            count += 1

            if count % opt.report_every == 0:
                print('... %d sentences prepared' % count)
    finally:
        # Release the file handles even when the loop aborts.
        if srcf is not None:
            srcf.close()
        if tgtf is not None:
            tgtf.close()

    print('Total number of unk words: %d' % n_unk_words)

    if opt.shuffle == 1:
        print('... shuffling sentences')
        perm = torch.randperm(len(src))
        src = [src[idx] for idx in perm]
        tgt = [tgt[idx] for idx in perm]
        src_sizes = [src_sizes[idx] for idx in perm]
        tgt_sizes = [tgt_sizes[idx] for idx in perm]

    print('... sorting sentences by size')

    # Stable sort on (source size, target size): source size is the primary
    # key, target size breaks ties, remaining ties keep their current order.
    ordered = sorted(zip(src, tgt, src_sizes, tgt_sizes),
                     key=lambda item: (item[2], item[3]))

    src = [item[0] for item in ordered]
    tgt = [item[1] for item in ordered]

    print(('Prepared %d sentences ' +
           '(%d ignored due to length == 0 or src len > %d or tgt len > %d)') %
          (len(src), ignored, max_src_length, max_tgt_length))

    return src, tgt

예제 #27

0

파일 보기

from kaldiio import ReadHelper
import numpy as np
# Gather one (1, dim) row per scp entry and stack them in a single call;
# stacking inside the loop would re-copy the whole matrix on every entry.
rows = []
with ReadHelper('scp:spk_xvector.scp') as reader:
    for key, ab in reader:
        rows.append(np.asarray(ab).reshape(1, -1))

# An scp without entries yields an empty 1-D array.
nsp = np.vstack(rows) if rows else np.array([])
print(nsp.shape)
np.save("spk_vector_bigd.npy", nsp)

예제 #28

0

파일 보기

파일: view_spk_space.py 프로젝트: fengpeng-yue/ASRTTS

    sc = ax.scatter(x, y, **kw)
    if (m is not None) and (len(m) == len(x)):
        paths = []
        for marker in m:
            if isinstance(marker, mmarkers.MarkerStyle):
                marker_obj = marker
            else:
                marker_obj = mmarkers.MarkerStyle(marker)
            path = marker_obj.get_path().transformed(
                marker_obj.get_transform())
            paths.append(path)
        sc.set_paths(paths)
    return sc


# Read every entry of the first feature scp into the module-level lists:
# the array goes to `data`, the key prefix to `speaker`, and each entry is
# tagged with marker "x" and alpha 1.0.
with ReadHelper('scp:%s' % args.feat_scp_1) as reader:
    for key, numpy_array in reader:
        # Disabled: normalization of each vector through torch.
        # torch_array = F.normalize(torch.from_numpy(numpy_array),dim=0)
        # numpy_array = torch_array.numpy()
        #print(numpy_array)
        data.append(numpy_array)
        # Label is the text before the first "_" in the key
        # (presumably the speaker id; verify against the scp naming).
        speaker.append(key.split("_")[0])
        marker.append("x")
        alpha.append(1.0)
# Same for the second feature scp, tagged with marker "o".
# NOTE(review): unlike the first loop, nothing is appended to `alpha` here,
# so `alpha` does not grow with `data` -- confirm whether this is intended.
with ReadHelper('scp:%s' % args.feat_scp_2) as reader:
    for key, numpy_array in reader:
        # Disabled: normalization of each vector through torch.
        # torch_array = F.normalize(torch.from_numpy(numpy_array),dim=0)
        # numpy_array = torch_array.numpy()
        data.append(numpy_array)
        speaker.append(key.split("_")[0])
        marker.append("o")

예제 #29

0

파일 보기

# Positional command-line arguments (args[0] is the script path itself).
args = sys.argv
data_dir = args[1]
target_spk = args[2]
out_dir = args[3]
dataset_of_target = args[4]

# Paths derived from the arguments: the data set name is the last path
# component of data_dir; YAAPT pitch files are read from
# <data_dir>/yaapt_pitch and the f0 output directory is <out_dir>/f0.
dataname = basename(data_dir)
yaap_pitch_dir = join(data_dir, 'yaapt_pitch')
pitch_out_dir = join(out_dir, "f0")

# Hard-coded directory prefix for the statistics files opened further down.
statsdir = "exp/vc_toolkit_exp_voice_privacy/feats/f0/"

# Write pitch features
pitch_file = join(data_dir, 'pitch.scp')
# Filled in the reader loop below with mat.shape[0] for every key.
pitch2shape = {}
with ReadHelper('scp:' + pitch_file) as reader:
    for key, mat in reader:
        pitch2shape[key] = mat.shape[0]
        kaldi_f0 = mat[:, 1].squeeze().copy()
        yaapt_f0 = readwrite.read_raw_mat(join(yaap_pitch_dir, key + '.f0'), 1)
        #unvoiced = np.where(yaapt_f0 == 0)[0]
        #kaldi_f0[unvoiced] = 0
        #readwrite.write_raw_mat(kaldi_f0, join(pitch_out_dir, key+'.f0'))
        if kaldi_f0.shape < yaapt_f0.shape:
            print("Warning yaapt_f0 > kaldi_f0 for utt:", key)
            yaapt_f0 = yaapt_f0[:kaldi_f0.shape[0]]
        f0 = np.zeros(kaldi_f0.shape)
        f0[:yaapt_f0.shape[0]] = yaapt_f0

        source_stats = {}
        with open(statsdir + dataname + "/" +

예제 #30

0

파일 보기

def inference(rspecifier, wspecifier, model_type, model_path, model_config,
              bnf_feature_kind, data_config, output_txt):
    """Encode every waveform of ``rspecifier`` and write the result.

    For each ``(utt, (rate, X))`` entry the waveform is scaled by
    ``MAX_WAV_VALUE``, resampled to ``sampling_rate``, peak-normalized if
    its peak reaches 1.0, converted to normalized mel features and passed
    through ``model.encoder`` and ``model.quantizer``.

    Args:
        rspecifier: Read specifier passed to ``ReadHelper``.
        wspecifier: Output text path when writing text, otherwise the write
            specifier passed to ``WriteHelper``.
        model_type: Name of the sub-module of ``model`` that defines
            ``Model``.
        model_path: Checkpoint whose ``'model'`` entry holds the state dict.
        model_config: Configuration handed to ``Model``.
        bnf_feature_kind: ``'id'`` (quantizer ids), ``'csid'`` (ids with
            consecutive repeats collapsed) or ``'token'`` (decoded
            quantizer output).
        data_config: Mapping with the feature-extraction settings and the
            location of the statistics file.
        output_txt: Write ``"<utt> <id><id>..."`` text lines instead of
            arrays; only honoured for ``'id'`` and ``'csid'``.

    Raises:
        AssertionError: Unsupported feature kind or ``bnf_feature_kind``.
    """
    import os

    stat_dict = data_config.get('statistic_file', None)
    feat_kind = data_config.get('feature_kind', 'mel-id')
    filter_length = data_config.get('filter_length', 1024)
    hop_length = data_config.get('hop_length', 256)
    win_length = data_config.get('win_length', 1024)
    n_mel_channels = data_config.get('n_mel_channels', 80)
    sampling_rate = data_config.get('sampling_rate', 24000)
    mel_fmin = data_config.get('mel_fmin', 80)
    mel_fmax = data_config.get('mel_fmax', 7600)

    feature_kinds = feat_kind.split('-')
    assert feature_kinds[0] in ['mel']
    assert bnf_feature_kind in ['id', 'csid', 'token']

    module = import_module('model.{}'.format(model_type), package=None)
    model = getattr(module, 'Model')(model_config)
    model.load_state_dict(torch.load(model_path, map_location='cpu')['model'])
    model.cuda().eval()

    # Resolve the statistics file. A ".scp" file (or the default
    # <training_dir>/stat.scp) holds the real path on its first line.
    if stat_dict is None:
        # os.path.join works for both str and path-like training_dir values.
        stat_scp = os.path.join(data_config.get('training_dir', ''), 'stat.scp')
    elif stat_dict.split('.')[-1] == 'scp':
        stat_scp = stat_dict
    else:
        stat_scp = None
    if stat_scp is not None:
        with open(stat_scp, 'r') as rf:
            stat_dict = rf.readlines()[0].rstrip()

    feat_stat = torch.load(stat_dict)

    feat_fn = MelSpectrum(filter_length=filter_length,
                          hop_length=hop_length,
                          win_length=win_length,
                          n_mel_channels=n_mel_channels,
                          sampling_rate=sampling_rate,
                          mel_fmin=mel_fmin,
                          mel_fmax=mel_fmax,
                          feat_stat=feat_stat).cuda()

    if output_txt and bnf_feature_kind in ['id', 'csid']:
        bnf_writer = open(wspecifier, 'w')
    else:
        # The writer is built from the write specifier.
        bnf_writer = WriteHelper(wspecifier, compression_method=1)
        output_txt = False

    try:
        with ReadHelper(rspecifier) as reader:
            for utt, (rate, X) in reader:
                X = X.astype(np.float32) / MAX_WAV_VALUE
                # Sample rates are passed by keyword, which every librosa
                # version accepts.
                X = librosa.core.resample(X,
                                          orig_sr=rate,
                                          target_sr=sampling_rate,
                                          res_type='kaiser_best')
                peak = np.max(np.abs(X))
                if peak >= 1.0:
                    X /= peak
                # Extract features
                X = feat_fn(torch.from_numpy(X).cuda().unsqueeze(0))
                X = feat_fn.normalize(X)

                X_in = X['mel']

                with torch.no_grad():
                    z = model.encoder(X_in)
                    z_id = model.quantizer.encode(z)
                    z_vq = model.quantizer.decode(z_id)

                # Save converted feats
                if bnf_feature_kind == 'id':
                    X_bnf = z_id.view(-1).cpu().numpy()
                elif bnf_feature_kind == 'csid':
                    X_bnf = z_id.view(-1).unique_consecutive().cpu().numpy()
                else:  # 'token', guaranteed by the assertion above
                    X_bnf = z_vq.squeeze(0).t().cpu().numpy()

                if output_txt:
                    X_bnf = X_bnf.reshape(-1)
                    X_bnf = ''.join(['<{}>'.format(bnf) for bnf in X_bnf])
                    bnf_writer.write('{} {}\n'.format(utt, X_bnf))
                else:
                    # NOTE(review): kaldiio documents WriteHelper as callable,
                    # `writer(key, array)`; confirm that the WriteHelper in
                    # scope really provides `.write(key, array)`.
                    bnf_writer.write(utt, X_bnf)

                print('Extracting BNF {} of {}.'.format(bnf_feature_kind, utt),
                      end=' ' * 30 + '\r')
    finally:
        # Close the output even when an utterance fails mid-run.
        bnf_writer.close()