def _zero_padded_sort_key(stem):
    """Build the key used to order matched file pairs.

    The stem is split on '-'; the second field is left-padded with zeros to
    10 characters so that numeric suffixes sort in numeric order as strings.
    Only the first two fields take part in the key. A stem without any '-'
    is returned unchanged instead of raising IndexError.
    """
    fields = stem.split('-')
    if len(fields) < 2:
        return stem
    return fields[0] + '-' + fields[1].rjust(10, '0')


def get_timestamps(train_files, test_files, output_folder):
    """Align every matching audio/transcript pair and write word timings.

    Files are paired when their base names (text before the first '.') are
    equal; within a pair, the path ending in '.txt' is treated as the
    transcript and the other as the audio. Pairs are processed in sort-key
    order (see ``_zero_padded_sort_key``).

    For each pair ``align.align(wav, txt)`` is called and the word alignments
    are written as ``word,start,stop`` lines to
    ``<output_folder>/train/<txt name>`` or ``<output_folder>/test/<txt name>``.
    The output directory is created only once an alignment has succeeded.

    Args:
        train_files: Sequence of '/'-separated file paths for the train split.
        test_files: Sequence of '/'-separated file paths for the test split.
        output_folder: Directory under which 'train' and 'test' are created.

    Returns:
        None. Progress and failures are reported on stdout; a failing pair
        never aborts the remaining pairs.
    """
    for subset, files in zip(('train', 'test'), (train_files, test_files)):
        # Compute each base name once instead of once per candidate pair.
        stems = [path.split('/')[-1].split('.')[0] for path in files]

        pairs = []
        for i, first in enumerate(files):
            for j in range(i + 1, len(files)):
                if stems[i] != stems[j]:
                    continue
                key = _zero_padded_sort_key(stems[i])
                second = files[j]
                if first.endswith('.txt'):
                    pairs.append((second, first, key))
                else:
                    pairs.append((first, second, key))

        pairs.sort(key=lambda pair: pair[-1])
        output_path = os.path.join(output_folder, subset)

        for wav, txt, _ in pairs:
            try:
                _, word_alignments = align.align(wav, txt)
                os.makedirs(output_path, exist_ok=True)
                output_file = os.path.join(output_path, txt.split('/')[-1])
                with open(output_file, 'w', encoding='utf8') as f:
                    for word, start, stop in word_alignments:
                        f.write(','.join((word, str(start), str(stop))) + '\n')
                print('Alignment successful', wav, txt)
            except Exception as exc:
                # Best-effort batch: report the failure and keep going, but
                # let KeyboardInterrupt/SystemExit propagate.
                print('Alignment failed: ', wav, txt)
                print('Reason:', repr(exc))
            print('-----------------------------')
            print()
Exemplo n.º 2
0
# Keep only the entries of `items` whose name ends in ".txt".
for names in items:
    if not names.endswith(".txt"):
        continue
    txtfilelist.append(names)


# Iterate through text files, performing aligner and saving Praat as we go.
# For every transcript the aligner is given the matching .wav and a .TextGrid
# output path, and the returned alignments are dumped to a text file under
# `matdirname`.
for ff, curr_text in enumerate(txtfilelist):

    # Swap only the trailing ".txt" extension. str.replace('txt', ...) would
    # also rewrite a "txt" occurring elsewhere in the file name and point the
    # aligner at a file that does not exist.
    stem = curr_text[:-len('.txt')]
    curr_wav = stem + '.wav'
    curr_TextGird = stem + '.TextGrid'

    print([str(ff)+': '+curr_text])

    phoneme_alignments, word_alignments = align.align(
        wavdirname + '/' + curr_wav,
        txtdirname + '/' + curr_text,
        TGdirname + '/' + curr_TextGird,
    )

    with open(Path(matdirname, curr_text), 'w') as fp:
        # NOTE(review): the first entry of each alignment list is skipped,
        # exactly as before - confirm that index 0 is meant to be dropped.
        fp.write('Phonemes\n')
        for entry in phoneme_alignments[1:]:
            fp.write(' '.join('%s' % x for x in entry))
            fp.write('\n')
        fp.write('Words\n')
        for entry in word_alignments[1:]:
            fp.write(' '.join('%s' % x for x in entry))
            fp.write('\n')
            


## For Spanish (unfortunately P2FA doesn't have Spanish support)
# Points `txtdirname` at the Fast-Spanish transcript folder. This is a
# machine-specific absolute path; adjust it when running elsewhere.
txtdirname = r'/home/sakkol/Documents/Forced_Alignment/FORCE/Fast-Spanish/txt'
Exemplo n.º 3
0
 def test_aligner(self):
     """Run the aligner on the test inputs and compare its output file.

     The aligner writes to ``self.outfile``; the test passes when
     ``filecmp.cmp`` reports that file equal to ``self.true_alignment_file``.
     """
     align.align(self.input_wav, self.input_transcription, self.outfile)
     output_matches_reference = filecmp.cmp(self.outfile,
                                            self.true_alignment_file)
     self.assertTrue(output_matches_reference)
Exemplo n.º 4
0
def _label_frames(ph_align, audio_length, step):
    """Return one phoneme label per frame.

    Frame ``x`` sits at time ``x * step``. It gets the label of the alignment
    entry whose [start, stop) interval contains that time, "sp" when the
    current entry has not started yet, and the last entry's label once the
    alignment has run out. An empty alignment yields "sp" for every frame.
    """
    if len(ph_align) == 0:
        return ["sp"] * audio_length

    labels = []
    position = 0
    frame = 0
    while frame < audio_length:
        entry = ph_align[position]
        now = frame * step
        if entry[1] > now:
            # Current phoneme has not started: fill the gap with silence.
            labels.append("sp")
            frame += 1
        elif entry[2] > now:
            labels.append(entry[0])
            frame += 1
        elif position + 1 < len(ph_align):
            # Frame is past this phoneme; retry the same frame on the next one.
            position += 1
        else:
            # Past the final phoneme: keep repeating its label.
            labels.append(entry[0])
            frame += 1
    return labels


def _timing_buckets(labels):
    """Return, per frame, a 0/1/2 bucket for its position inside its run.

    A run is a maximal stretch of consecutive equal labels. The k-th frame
    (1-based) of a run of length n has relative position k / n, bucketed as
    <= 0.333 -> 0, <= 0.666 -> 1, otherwise 2.
    """
    # NOTE(review): the thresholds are the literals 0.333 and 0.666, so 1/3
    # lands in bucket 1 and 2/3 in bucket 2. Kept as-is because downstream
    # consumers may depend on these exact values - confirm before changing.
    buckets = []
    total = len(labels)
    run_start = 0
    while run_start < total:
        run_length = 1
        while (run_start + run_length < total
               and labels[run_start + run_length] == labels[run_start]):
            run_length += 1
        for k in range(1, run_length + 1):
            relative = k / run_length
            if relative <= 0.333:
                buckets.append(0)
            elif relative <= 0.666:
                buckets.append(1)
            else:
                buckets.append(2)
        run_start += run_length
    return buckets


def extract_phoneme_data(args):
    """Build per-frame phoneme features for one audio/lyrics pair.

    Args:
        args: A 3-tuple ``(audio_file_name, lyrics_file_name, audio_length)``.
            ``audio_length`` is used as the number of frames to produce.

    Returns:
        A pandas DataFrame with one row per frame and the columns
        'Phoneme', 'Phoneme_timings', 'Pre_phoneme', 'Post_phoneme'
        (in that order). 'Pre_phoneme' / 'Post_phoneme' hold the label of
        the previous / next frame, with "sp" at the sequence boundaries.
    """
    audio_file_name, lyrics_file_name, audio_length = args
    print(audio_file_name)

    # Extract phonemes using Penn's force aligner
    ph_align, _ = align.align(audio_file_name, lyrics_file_name)

    # Presumably frame_period is in milliseconds, making `step` seconds per
    # frame - TODO confirm against `params`.
    step = params.frame_period / 1000

    phoneme_array = _label_frames(ph_align, audio_length, step)
    timing_array = _timing_buckets(phoneme_array)

    frame_count = len(phoneme_array)
    # Explicit boundary checks: indexing with y - 1 at y == 0 would silently
    # wrap around to the LAST frame instead of raising IndexError.
    pre_phoneme_array = [
        phoneme_array[y - 1] if y > 0 else "sp" for y in range(frame_count)
    ]
    post_phoneme_array = [
        phoneme_array[y + 1] if y + 1 < frame_count else "sp"
        for y in range(frame_count)
    ]

    columns = (
        ('Phoneme', phoneme_array),
        ('Phoneme_timings', timing_array),
        ('Pre_phoneme', pre_phoneme_array),
        ('Post_phoneme', post_phoneme_array),
    )
    phoneme_data = pd.concat(
        [pd.DataFrame(values, columns=[name]) for name, values in columns],
        axis=1)
    return phoneme_data