def run_gentle(key='103/1241/103_1241_000000_000001'):
    """Force-align one utterance with gentle, caching the result as JSON.

    Looks for ``<TRAIN_PATH>/<key>.json``; if present, returns the cached
    alignment. Otherwise reads the normalized transcript, resamples the
    audio to 8K, runs gentle's ForcedAligner, writes the JSON cache, and
    returns the alignment as a dict.

    Args:
        key (str): dataset-relative utterance id (no file extension).

    Returns:
        dict: gentle alignment result.
    """
    text_file = f'{TRAIN_PATH}/{key}.normalized.txt'
    audio_file = f'{TRAIN_PATH}/{key}.wav'
    json_file = f'{TRAIN_PATH}/{key}.json'

    # Cached result from a previous run -- skip alignment entirely.
    if os.path.isfile(json_file):
        with open(json_file) as r:
            return json.load(r)

    with open(text_file, encoding="utf-8") as fh:
        transcript = fh.read()

    logging.info("converting audio to 8K sampled wav")
    # NOTE: the original defined an unused on_progress callback; transcribe
    # is deliberately called with progress_cb=None, so it was dead code.
    with gentle.resampled(audio_file) as wavfile:
        aligner = gentle.ForcedAligner(
            resources, transcript,
            nthreads=multiprocessing.cpu_count(),
            disfluency=False,  # do not include disfluencies (uh, um) in alignment
            conservative=False,
            disfluencies=set(['uh', 'um']))
        result = aligner.transcribe(wavfile, progress_cb=None, logging=logging)

    result_dict = result.to_dict()
    with open(json_file, 'w') as f:
        json.dump(result_dict, f, indent=2)
    return result_dict
def align_words(audio, text):
    """Force-align *text* against *audio*; return [word, start, end] triples.

    The audio (16 kHz) is downsampled to 8 kHz and written to a temporary
    wav that gentle aligns against the transcript. Interior words gentle
    could not align get timestamps bracketed by their neighbours; an
    unaligned first or last word is dropped.
    """
    # gentle expects an 8 kHz wav file on disk
    downsampled = librosa.resample(audio, 16000, 8000)
    tmp_wav = 'output/temp.wav'
    sf.write(tmp_wav, downsampled, 8000, 'PCM_16')

    forced = gentle.ForcedAligner(gentle_resources, text, nthreads=2,
                                  disfluency=False, conservative=False)
    alignment = forced.transcribe(tmp_wav, logging=logging)

    words = alignment.words
    last_idx = len(words) - 1
    timestamps = []
    for idx, entry in enumerate(words):
        if entry.case == 'success':
            timestamps.append([entry.word, entry.start, entry.end])
        elif 0 < idx < last_idx:
            # bracket an unaligned interior word by its neighbours' times
            timestamps.append(
                [entry.word, words[idx - 1].end, words[idx + 1].start])
    return timestamps
def run_gentle(audio_path: str, text_content: str, tokenization_view: View = None):
    """Align *text_content* against the audio at *audio_path* with gentle.

    When *tokenization_view* is provided, gentle's internal token sequence
    is rebuilt from that view's TOKEN annotations (normalized against the
    aligner's vocabulary) instead of gentle's own tokenization.
    """
    with gentle.resampled(audio_path) as resampled_audio:
        aligner = gentle.ForcedAligner(
            gentle.Resources(),
            text_content,
            nthreads=multiprocessing.cpu_count(),
            disfluencies={'uh', 'um'},
            disfluency=True,
            conservative=False)

        if tokenization_view is not None:
            # NOTE(review): this reaches into gentle's private `ms._seq`
            # attribute to substitute an external tokenization.
            aligner.ms._seq = []
            for token in tokenization_view.get_annotations(Uri.TOKEN):
                print(token.serialize(pretty=True))
                span_start = token.properties['start']
                span_end = token.properties['end']
                surface = text_content[span_start:span_end]
                aligner.ms._seq.append({
                    'start': span_start,
                    'end': span_end,
                    'token': metasentence.kaldi_normalize(surface,
                                                          aligner.ms.vocab),
                })

        result = aligner.transcribe(resampled_audio)
    return result
def run_gentle(seg, transcript):
    """Align one segment's audio against its transcript with gentle.

    1. export the segment's audio slice to a temporary wav file
    2. run Gentle on the temp audio and the transcript
    3. delete the temp audio file (even if alignment fails)
    4. fix unaligned words and shift timestamps to whole-file time

    Parameters
    ----------
    seg : Segment
        object to align with Gentle; provides `audio_file` (pydub audio,
        sliceable in milliseconds), `start_audio` and `end_audio` (seconds)
    transcript : string
        holding the relevant transcript for this segment

    Returns
    -------
    list of gentle word objects with `start`/`end` relative to the full file
    """
    # pydub slices in milliseconds; segment boundaries are in seconds
    audio_cut = seg.audio_file[1000 * seg.start_audio:1000 * seg.end_audio]
    audio_cut.export("temp_audio.wav", format="wav")

    try:
        # run Gentle
        resources = gentle.Resources()
        with gentle.resampled("temp_audio.wav") as wavfile:
            aligner = gentle.ForcedAligner(resources, transcript)
            result = aligner.transcribe(wavfile).words
    finally:
        # delete cut audio file -- even when alignment raises, so the
        # fixed-name temp file does not leak into the next call
        os.remove("temp_audio.wav")

    # fix unaligned-word start/end time data
    fix_unaligned(result, len(audio_cut) / 1000)

    # put gentle timestamps in relation to entire file
    for word in result:
        word.start += seg.start_audio
        word.end += seg.start_audio
    return result
def get_gentle_response(self, parsed_txt_path):
    """Run gentle forced alignment for the transcript at *parsed_txt_path*.

    Args:
        parsed_txt_path (str): parsed txt path

    Returns:
        list: aligned words, one dict per word (``duration`` omitted)
    """
    with open(parsed_txt_path, encoding="utf-8") as handle:
        text = handle.read()

    gentle_resources = gentle.Resources()
    # words for gentle to ignore when aligning
    fillers = set(['uh', 'um'])

    with gentle.resampled(self.audiopath) as resampled_wav:
        forced_aligner = gentle.ForcedAligner(
            gentle_resources,
            text,
            nthreads=multiprocessing.cpu_count(),
            disfluency=False,
            conservative=False,
            disfluencies=fillers)
        alignment = forced_aligner.transcribe(resampled_wav)

    return [w.as_dict(without="duration") for w in alignment.words]
def align_file(transcription, snd_filename):
    """Align *transcription* against the audio at *snd_filename*.

    The alignment is aborted after 10 seconds via SIGALRM (Unix only).

    Args:
        transcription: transcript text to align
        snd_filename: path to the sound file

    Returns:
        dict mapping snd_filename to gentle's result, or to None on timeout.
    """
    # TODO: Add a file of "sound files that did not finish"
    class TimeoutException(Exception):
        pass

    @contextmanager
    def time_limit(seconds):
        # SIGALRM-based watchdog: raises TimeoutException after *seconds*
        def signal_handler(signum, frame):
            raise TimeoutException("Timed out!")
        signal.signal(signal.SIGALRM, signal_handler)
        signal.alarm(seconds)
        try:
            yield
        finally:
            signal.alarm(0)  # always cancel the pending alarm

    with gentle.resampled(snd_filename) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcription,
                                       nthreads=nthreads)
        try:
            with time_limit(10):
                result = {snd_filename: aligner.transcribe(wavfile)}
        except TimeoutException as e:
            # BUGFIX: was .format(fid) -- `fid` is undefined here and raised
            # NameError whenever the timeout actually fired
            print(
                "Transcription of {} timed out! Please check that your transcription is accurate."
                .format(snd_filename))
            result = {snd_filename: None}
    return result
def align(audiopath, text, nthreads=1):
    """Run gentle forced alignment of *text* against the audio file.

    Args:
        audiopath: path to the source audio
        text: transcript to align
        nthreads: number of aligner worker threads

    Returns:
        gentle's alignment result object
    """
    gentle_resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiopath) as resampled:
        logging.info("Starting alignment")
        forced = gentle.ForcedAligner(gentle_resources,
                                      text,
                                      nthreads=nthreads,
                                      disfluency=False,
                                      conservative=False)
        return forced.transcribe(resampled,
                                 progress_cb=on_progress,
                                 logging=logging)
def align(audiopath, transcript):
    """Align *transcript* to the audio at *audiopath*; return a dict.

    Uses the module-level `resources`, `nthreads` and `on_progress`.
    """
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiopath) as wav_path:
        logging.info("starting alignment")
        forced = gentle.ForcedAligner(resources,
                                      transcript,
                                      nthreads=nthreads,
                                      disfluency=False,
                                      conservative=False)
        alignment = forced.transcribe(wav_path,
                                      progress_cb=on_progress,
                                      logging=logging)
        return json.loads(alignment.to_json())
def align_audio(wav_path, transcript):
    """Force-align *transcript* to *wav_path*; return the alignment as a dict."""
    with gentle.resampled(wav_path) as resampled:
        print("starting alignment {}".format(wav_path))
        forced = gentle.ForcedAligner(RESOURCES,
                                      transcript,
                                      nthreads=N_THREADS,
                                      disfluency=False,
                                      conservative=False,
                                      disfluencies=DISFLUENCIES)
        alignment = forced.transcribe(resampled,
                                      progress_cb=_on_progress,
                                      logging=logging)
        return json.loads(alignment.to_json())
def run_gentle(video_path, vid, result_path):
    """Align a video's subtitle text against its audio track with gentle.

    Reads the VTT subtitle for *vid*, concatenates the cue texts into one
    transcript, aligns it against *video_path*, and writes the alignment
    JSON to *result_path*. Relies on module-level `resources`, `nthreads`
    and `disfluencies`.
    """
    vtt_subtitle = read_subtitle(vid)
    # join all cue texts at once instead of quadratic += in a loop
    # (the old loop also indexed vtt_subtitle[i] redundantly under enumerate)
    transcript = ''.join(sub.text + ' ' for sub in vtt_subtitle)
    transcript = re.sub('\n', ' ', transcript)  # remove newline characters

    # align
    with gentle.resampled(video_path) as wav_file:
        aligner = gentle.ForcedAligner(resources,
                                       transcript,
                                       nthreads=nthreads,
                                       disfluency=False,
                                       conservative=False,
                                       disfluencies=disfluencies)
        result = aligner.transcribe(wav_file, logging=logging)

    # write results
    with open(result_path, 'w', encoding="utf-8") as fh:
        fh.write(result.to_json(indent=2))
def align(args):
    """Align one (audiopath, transcript) pair and return the result dict.

    *args* is a 2-tuple so this function can be mapped over a worker pool.
    The returned dict is gentle's JSON output plus an 'audiopath' key.
    """
    audiopath, transcript = args
    with gentle.resampled(audiopath) as resampled:
        logging.info("Audio file: {}".format(audiopath))
        logging.info("Transcript: <{}...>".format(transcript[:40]))
        forced = gentle.ForcedAligner(resources,
                                      transcript,
                                      nthreads=1,
                                      disfluency=False,
                                      conservative=False)
        alignment = forced.transcribe(resampled,
                                      progress_cb=on_progress,
                                      logging=logging)
        parsed = json.loads(alignment.to_json())
    parsed['audiopath'] = audiopath
    return parsed
def align_db(data):
    """Align every row of *data* with gentle, retrying failures until done.

    Each row provides `sentence_path` (audio) and `transcription` (text).
    Results are written as JSON under `alignments/`, mirroring the last
    four path components of the audio file. Rows whose alignment raises
    are retried on subsequent passes.

    BUGFIX: the original seeded `except_i_list` with every row index and
    only ever appended failures to it (while iterating it!), so
    `len(except_i_list) == 0` could never become true and the `while True`
    loop never terminated. Each pass now iterates a stable `pending` list
    and collects failures into a fresh list for the next pass.
    """
    import pathlib

    pending = list(range(len(data)))
    while pending:
        failed = []  # indices to retry on the next pass
        for i in tqdm(pending):
            row = data.iloc[i]
            audio_path = row.sentence_path
            transcript = row.transcription
            with gentle.resampled(audio_path) as wavfile:
                aligner = gentle.ForcedAligner(resources, transcript,
                                               nthreads=40)
                try:
                    result = aligner.transcribe(wavfile,
                                                progress_cb=on_progress,
                                                logging=logging)
                except Exception:
                    # remember the failure; retry it on the next pass
                    failed.append(i)
                    continue
            # mirror .../<spk>/<emo>/<id>.wav -> alignments/.../<id>.json
            output = os.path.join(
                'alignments',
                '/'.join(audio_path.split('/')[-4:]).split('.')[0] + '.json')
            pathlib.Path('/'.join(output.split('/')[0:-1])).mkdir(
                parents=True, exist_ok=True)
            with open(output, 'w') as fh:
                fh.write(result.to_json(indent=2))
            logging.info("output written to %s" % (output))
        print("except_i_list:", failed)
        pending = failed
def call_gentle_chunk(wav_path,
                      transcript,
                      disfluency=False,
                      conservative=False):
    """Align one audio chunk against *transcript* with gentle.

    (The original carried an empty docstring.)

    Args:
        wav_path: path to the wav chunk to align
        transcript: transcript text for this chunk
        disfluency: include disfluencies (uh, um) in the alignment
        conservative: use gentle's conservative alignment mode

    Returns:
        dict: gentle's alignment result parsed from its JSON output
    """
    resources = gentle.Resources()
    aligner = gentle.ForcedAligner(resources,
                                   transcript,
                                   nthreads=multiprocessing.cpu_count(),
                                   disfluency=disfluency,
                                   conservative=conservative,
                                   disfluencies=('uh', 'um'))
    result = aligner.transcribe(wav_path,
                                progress_cb=_on_progress,
                                logging=logging)
    return json.loads(result.to_json())
def transcribe(self, output_dir, **kwargs):
    """Resample the audio in *output_dir* and force-align its transcript.

    Expects AUDIO_FILENAME, RESAMPLE_FILENAME and TEXT_FILENAME inside
    *output_dir*. Extra keyword arguments are passed to ForcedAligner.

    Returns:
        gentle's alignment output, or -1 when resampling fails.
    """
    source_audio = os.path.join(output_dir, AUDIO_FILENAME)
    resampled_audio = os.path.join(output_dir, RESAMPLE_FILENAME)
    transcript_path = os.path.join(output_dir, TEXT_FILENAME)

    logging.info('Resampling audio file %s', source_audio)
    if gentle.resample(source_audio, resampled_audio) != 0:
        logging.info('Failed to resample %s', source_audio)
        return -1

    def on_progress(progress):
        # Relay gentle's progress dictionary to the log.
        for key, value in progress.items():
            logging.info('Transcribing %s, %s, %s', resampled_audio, key, value)

    logging.info('Starting to transcribe %s', output_dir)
    with open(transcript_path, 'r', encoding='utf-8') as handle:
        transcript_text = handle.read()

    aligner = gentle.ForcedAligner(self.resources,
                                   transcript_text,
                                   nthreads=self.nthreads,
                                   **kwargs)
    alignment = aligner.transcribe(resampled_audio,
                                   progress_cb=on_progress,
                                   logging=logging)
    logging.info('Finished transcribing %s', output_dir)
    return alignment
def get_transcribed_words(textFile, audioFile):
    """Align *audioFile* against the transcript in *textFile*.

    Returns a list of Word objects, each holding the word text and its
    phones stripped of gentle's positional suffix (e.g. 'ah_I' -> 'ah').
    Words gentle assigned no phones to are skipped.
    """
    with open(textFile) as file:
        transcript = file.read()

    resources = gentle.Resources()
    with gentle.resampled(audioFile) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcript)
        result = aligner.transcribe(wavfile)

    transcribed_words = []
    for word in result.words:
        phones = word.phones
        if phones is not None:
            # keep only the base phone; split() avoids the ValueError that
            # str.index('_') raised when a phone had no '_' suffix
            root_phones = [phone['phone'].split('_', 1)[0]
                           for phone in phones]
            transcribed_words.append(Word(word.word, root_phones))
    return transcribed_words
def align_onefile(data, align_json_path):
    """Align the single utterance that *align_json_path* refers to.

    The path encodes .../<speaker>/<emotion>/<id>.json; the matching row
    (by id and speaker) is looked up in *data*, its audio is aligned with
    gentle, and the result is written as JSON under `alignments/`.
    Alignment failures are logged and skipped (best effort).
    """
    import pathlib

    parts = split_path(align_json_path)
    json_file_name = parts[-1]
    utt_id, _ = os.path.splitext(json_file_name)  # don't shadow builtin id()
    speaker = parts[-3]

    # iloc[0]: reduce the single-row DataFrame selection to a Series
    row = data[(data.id == utt_id) & (data.speaker == speaker)].iloc[0]
    audio_path = row.sentence_path
    transcript = row.transcription

    with gentle.resampled(audio_path) as wavfile:
        aligner = gentle.ForcedAligner(resources, transcript, nthreads=40)
        try:
            result = aligner.transcribe(wavfile,
                                        progress_cb=on_progress,
                                        logging=logging)
        except Exception:
            # best effort: log the failure instead of swallowing it silently
            logging.exception("alignment failed for %s", audio_path)
            return

    # mirror .../<spk>/<emo>/<id>.wav -> alignments/.../<id>.json
    output = os.path.join(
        'alignments',
        '/'.join(audio_path.split('/')[-4:]).split('.')[0] + '.json')
    pathlib.Path('/'.join(output.split('/')[0:-1])).mkdir(parents=True,
                                                          exist_ok=True)
    with open(output, 'w') as fh:
        fh.write(result.to_json(indent=2))
    logging.info("output written to %s" % (output))
def gentle_solve(self, audio_path, transcript):
    """
    gentle wrapper to solve the forced alignment given
    audio file and text string; returns one dict per aligned
    word with the phone details omitted
    """
    aligner_args = {
        'log': 'INFO',
        'nthreads': self.num_thread,
        'conservative': True,
        'disfluency': True,
    }
    filler_words = set(['uh', 'um'])
    gentle_resources = gentle.Resources()

    with gentle.resampled(audio_path) as resampled_wav:
        forced_aligner = gentle.ForcedAligner(
            gentle_resources,
            transcript,
            nthreads=aligner_args['nthreads'],
            disfluency=aligner_args['disfluency'],
            conservative=aligner_args['conservative'],
            disfluencies=filler_words)
        alignment = forced_aligner.transcribe(resampled_wav)

    return [w.as_dict(without="phones") for w in alignment.words]
def align_db(data):
    """Align every (sentence_path, transcription) row of *data* with gentle.

    Writes one JSON alignment per row under `alignments/`, mirroring the
    last four path components of the audio file.
    """
    import pathlib
    for i, row in data.iterrows():
        audio_path = row.sentence_path
        transcript = row.transcription
        with gentle.resampled(audio_path) as wavfile:
            aligner = gentle.ForcedAligner(resources, transcript)
            result = aligner.transcribe(wavfile,
                                        progress_cb=on_progress,
                                        logging=logging)
        output = os.path.join(
            'alignments',
            '/'.join(audio_path.split('/')[-4:]).split('.')[0] + '.json')
        pathlib.Path('/'.join(output.split('/')[0:-1])).mkdir(parents=True,
                                                              exist_ok=True)
        # context manager guarantees the handle is closed even on error
        # (the original used bare open()/close(), leaking on exceptions,
        # and guarded the log with an always-true `if output:`)
        with open(output, 'w') as fh:
            fh.write(result.to_json(indent=2))
        logging.info("output written to %s" % (output))
def start_aligning(audiofile, txtfile, output):
    """Align *txtfile*'s transcript to *audiofile*; write JSON to *output*."""
    log_level = "INFO"  #can be one of the following: (DEBUG, INFO, WARNING, ERROR, or CRITICAL)
    logging.getLogger().setLevel(log_level)

    disfluencies = set(['uh', 'um'])  # NOTE: currently not passed to the aligner
    with open(txtfile, encoding="utf-8") as fh:
        transcript = fh.read()

    resources = gentle.Resources()
    logging.info("converting audio to 8K sampled wav")
    with gentle.resampled(audiofile) as wavfile:
        logging.info("starting alignment")
        aligner = gentle.ForcedAligner(
            resources, transcript, nthreads
        )  #, True, False, disfluencies)#, conservative, disfluencies)
        result = aligner.transcribe(wavfile,
                                    progress_cb=on_progress,
                                    logging=logging)

    # context manager instead of the bare open() that was never closed
    with open(output, 'w', encoding="utf-8") as out_fh:
        out_fh.write(result.to_json(indent=2))
    logging.info("output written to %s" % (output))
audio_path = os.path.join(DIR_PATH, "audio", audio_name) transcript_path = os.path.join(DIR_PATH, "transcripts", transcript_name) if not os.path.isfile(transcript_path): continue # get transcript text transcript_text = "" with open(transcript_path) as f: transcript_text = f.read() # run Gentle print "Running Gentle on", transcript_name resources = gentle.Resources() with gentle.resampled(audio_path) as wavfile: aligner = gentle.ForcedAligner(resources, transcript_text) result = aligner.transcribe(wavfile).words # create gentle_results directory if it doesn't already exist # usually better to use try-catch here, but not worried about race conditions right now if not os.path.exists(os.path.join(DIR_PATH, "gentle_results")): os.makedirs(os.path.join(DIR_PATH, "gentle_results")) # write Gentle output to gentle_results directory with open( os.path.join(DIR_PATH, "gentle_results", transcript_name + ".txt"), "w") as f: output = [] for word in result: output.append({ "word": word.word,
def on_progress(p):
    # Relay gentle's progress events to the debug log.
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))


with open(args.txtfile) as fh:
    transcript = fh.read()

resources = gentle.Resources(args.model_dir)
config = resources.getConfig()
logging.info("converting audio to {} sampled wav".format(config['samplerate']))

with gentle.resampled(args.audiofile) as wavfile:
    logging.info("starting alignment")
    aligner = gentle.ForcedAligner(resources,
                                   transcript,
                                   nthreads=args.nthreads,
                                   context_width=config['context-width'],
                                   disfluency=args.disfluency,
                                   conservative=args.conservative,
                                   disfluencies=disfluencies)
    result = aligner.transcribe(wavfile,
                                progress_cb=on_progress,
                                logging=logging)

# write to the requested output file, or stdout when no path was given
fh = open(args.output, 'w') if args.output else sys.stdout
try:
    fh.write(result.to_json(indent=2))
finally:
    # BUGFIX: the original never closed the output file; close only the
    # handle we opened -- never sys.stdout
    if args.output:
        fh.close()
        logging.info("output written to %s" % (args.output))
class Transcriber():
    """Server-side wrapper around gentle's aligners.

    Manages per-job output directories under ``<data_dir>/transcriptions/``
    and an in-memory status dict per job id.

    NOTE(review): this is Python-2-only code -- the `transcribe` method
    takes a parameter literally named ``async`` (a keyword in Python 3)
    and `next_id` uses ``uuid4().get_hex()`` (removed in Python 3).
    """

    def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
        self.data_dir = data_dir
        self.nthreads = nthreads
        self.ntranscriptionthreads = ntranscriptionthreads
        self.resources = gentle.Resources()
        # full (transcript-free) transcriber, used when no transcript is given
        self.full_transcriber = gentle.FullTranscriber(self.resources, nthreads=ntranscriptionthreads)
        self._status_dicts = {}

    def get_status(self, uid):
        """Return the (possibly new, empty) status dict for job *uid*."""
        return self._status_dicts.setdefault(uid, {})

    def out_dir(self, uid):
        """Return the output directory path for job *uid*."""
        return os.path.join(self.data_dir, 'transcriptions', uid)

    # TODO(maxhawkins): refactor so this is returned by transcribe()
    def next_id(self):
        """Pick a fresh 8-hex-char job id not already used under data_dir."""
        uid = None
        while uid is None or os.path.exists(os.path.join(self.data_dir, uid)):
            uid = uuid.uuid4().get_hex()[:8]
        return uid

    def transcribe(self, uid, transcript, audio, async, **kwargs):
        """Encode *audio*, align (or fully transcribe) it, and save results.

        Writes transcript.txt, a.wav, align.json, align.csv and an
        index.html with the alignment inlined into the job's out dir.
        Progress and errors are reported through the job's status dict.
        """
        status = self.get_status(uid)

        status['status'] = 'STARTED'
        output = {
            'transcript': transcript
        }

        outdir = os.path.join(self.data_dir, 'transcriptions', uid)

        tran_path = os.path.join(outdir, 'transcript.txt')
        with open(tran_path, 'w') as tranfile:
            tranfile.write(transcript)
        # raw upload is staged to disk, then resampled to a.wav below
        audio_path = os.path.join(outdir, 'upload')
        with open(audio_path, 'w') as wavfile:
            wavfile.write(audio)

        status['status'] = 'ENCODING'

        wavfile = os.path.join(outdir, 'a.wav')
        if gentle.resample(os.path.join(outdir, 'upload'), wavfile) != 0:
            status['status'] = 'ERROR'
            status['error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        #XXX: Maybe we should pass this wave object instead of the
        # file path to align_progress
        wav_obj = wave.open(wavfile, 'r')
        status['duration'] = wav_obj.getnframes() / float(wav_obj.getframerate())
        status['status'] = 'TRANSCRIBING'

        def on_progress(p):
            # mirror gentle's progress keys straight into the status dict
            for k,v in p.items():
                status[k] = v

        if len(transcript.strip()) > 0:
            # transcript given: forced alignment
            trans = gentle.ForcedAligner(self.resources, transcript, nthreads=self.nthreads, **kwargs)
        elif self.full_transcriber.available:
            # no transcript: fall back to full transcription
            trans = self.full_transcriber
        else:
            status['status'] = 'ERROR'
            status['error'] = 'No transcript provided and no language model for full transcription'
            return

        output = trans.transcribe(wavfile, progress_cb=on_progress, logging=logging)

        # ...remove the original upload
        os.unlink(os.path.join(outdir, 'upload'))

        # Save
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            jsfile.write(output.to_json(indent=2))
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(output.to_csv())

        # Inline the alignment into the index.html file.
        htmltxt = open(get_resource('www/view_alignment.html')).read()
        htmltxt = htmltxt.replace("var INLINE_JSON;", "var INLINE_JSON=%s;" % (output.to_json()));
        open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt)

        status['status'] = 'OK'

        logging.info('done with transcription.')

        return output
def transcribe(self, uid, transcript, audio, async_mode, **kwargs):
    """Encode *audio*, align (or fully transcribe) it, and save results.

    *audio* is either raw bytes (an upload, staged to disk and resampled)
    or a str (treated as a media path/URL and resampled directly).
    Writes transcript.txt, a.wav, align.json, align.csv and an index.html
    with the alignment inlined into the job's output directory; progress
    and errors are reported through the job's status dict.

    NOTE(review): `async_mode` is not referenced in this method body --
    presumably consumed by the calling endpoint; confirm before removing.
    """
    status = self.get_status(uid)

    status['status'] = 'STARTED'
    output = {'transcript': transcript}

    outdir = os.path.join(self.data_dir, 'transcriptions', uid)

    tran_path = os.path.join(outdir, 'transcript.txt')
    with open(tran_path, 'w') as tranfile:
        tranfile.write(transcript)
    # raw bytes -> stage the upload to disk; a str is used as a path/URL
    if not isinstance(audio, str):
        audio_path = os.path.join(outdir, 'upload')
        with open(audio_path, 'wb') as wavfile:
            wavfile.write(audio)

    status['status'] = 'ENCODING'

    wavfile = os.path.join(outdir, 'a.wav')
    # if ((not isinstance(audio, str)) and gentle.resample(os.path.join(outdir, 'upload'), wavfile) != 0) or gentle.resample(audio, wavfile) != 0:
    if (not isinstance(audio, str)) and gentle.resample(
            os.path.join(outdir, 'upload'), wavfile) != 0:
        status['status'] = 'ERROR'
        status[
            'error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
        # Save the status so that errors are recovered on restart of the server
        # XXX: This won't work, because the endpoint will override this file
        with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
            json.dump(status, jsfile, indent=2)
        return
    if isinstance(audio, str) and gentle.resample(audio, wavfile) != 0:
        status['status'] = 'ERROR'
        status[
            'error'] = "Encoding failed. Make sure that you've referenced a valid media URL."
        # Save the status so that errors are recovered on restart of the server
        # XXX: This won't work, because the endpoint will override this file
        with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
            json.dump(status, jsfile, indent=2)
        return

    # XXX: Maybe we should pass this wave object instead of the
    # file path to align_progress
    # NOTE(review): duration is only computed for the bytes case -- for a
    # str media reference the status dict gets no 'duration'; confirm intent.
    if not isinstance(audio, str):
        wav_obj = wave.open(wavfile, 'rb')
        status['duration'] = wav_obj.getnframes() / \
            float(wav_obj.getframerate())
    status['status'] = 'TRANSCRIBING'

    def on_progress(p):
        # mirror gentle's progress keys straight into the status dict
        print(p)
        for k, v in p.items():
            status[k] = v

    if len(transcript.strip()) > 0:
        # transcript given: forced alignment
        trans = gentle.ForcedAligner(self.resources,
                                     transcript,
                                     nthreads=self.nthreads,
                                     **kwargs)
    elif self.full_transcriber.available:
        # no transcript: fall back to full transcription
        trans = self.full_transcriber
    else:
        status['status'] = 'ERROR'
        status[
            'error'] = 'No transcript provided and no language model for full transcription'
        return

    output = trans.transcribe(wavfile,
                              progress_cb=on_progress,
                              logging=logging)

    # ...remove the original upload
    if not isinstance(audio, str):
        os.unlink(os.path.join(outdir, 'upload'))

    # Save
    with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
        jsfile.write(output.to_json(indent=2))
    with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
        csvfile.write(output.to_csv())

    # Inline the alignment into the index.html file.
    htmltxt = open(get_resource('www/view_alignment.html')).read()
    htmltxt = htmltxt.replace("var INLINE_JSON;",
                              "var INLINE_JSON=%s;" % (output.to_json()))
    open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt)

    status['status'] = 'OK'
    logging.info('done with transcription.')
    return output
def generate_diphones(audio_file,
                      transcript_file,
                      output_folder,
                      pre_padding=0.0,
                      post_padding=0.0) -> set:
    """Generates the list of diphones for a given audio_file using the
    transcript and store the diphones in the output_folder

    Args:
        :param audio_file:(str) Name of the audio file to segment (.wav)
        :param transcript_file:(str) Name of the text file with the transcript
        :param output_folder:(str) Name of the destination directory to store the diphones
        :param pre_padding:(float) A fraction of audio to clip before the generated diphone
        :param post_padding:(float) A fraction of audio to clip after the generated diphone

    Returns:
        :return set of generated diphones
    """
    nthreads = multiprocessing.cpu_count()
    disfluency = False
    conservative = False
    disfluencies = {'uh', 'um'}

    with open(transcript_file, encoding="utf-8") as fh:
        transcript = fh.read()
    print(transcript)

    resources = gentle.Resources()
    with gentle.resampled(audio_file) as wavfile:
        aligner = gentle.ForcedAligner(resources,
                                       transcript,
                                       nthreads=nthreads,
                                       disfluency=disfluency,
                                       conservative=conservative,
                                       disfluencies=disfluencies)
        result = aligner.transcribe(wavfile)

    r = json.loads(result.to_json())
    phone_time_list = []
    diphones = set()
    # Walk every aligned word and accumulate (phone, start_ms, end_ms)
    # triples; `start` advances by each phone's duration within the word.
    # NOTE(review): assumes every word dict has 'start' and 'phones' --
    # gentle omits these for words it failed to align; confirm inputs.
    for word in r['words']:
        start = word['start'] * 1000  # gentle times are seconds; work in ms
        for phone in word['phones']:
            diphones.add(phone['phone'])
            phone_time_list.append(
                [phone['phone'], start, start + phone['duration'] * 1000])
            start = start + phone['duration'] * 1000

    # Cut each phone span out of the source audio and export it as a wav.
    for entry in phone_time_list:
        diphone = segment_audio(audio_file, entry[1], entry[2], pre_padding,
                                post_padding)
        # print('Old ' + str(entry[0]) + ':' + str(len(diphone)))
        if len(diphone) < 150:
            # pad clips shorter than 150 ms up to a usable length
            try:
                diphone = ensure_length(diphone, 150)
            except exceptions.CouldntDecodeError:
                print(
                    entry[0],
                    'is very small.........................................................'
                )
        if not os.path.exists(output_folder):
            os.mkdir(output_folder)
        # later occurrences of the same phone overwrite earlier files
        output_filename = output_folder + '/' + str(entry[0]) + '.wav'
        diphone.export(output_filename, format='wav')
        print('New ' + str(entry[0]) + ':' + str(len(diphone)))
    return diphones
def data_generator(file_id, min_dur=2, max_dur=(5, 20), randomize=False):
    """Given a file id and random seed, align the audio and text versions
    after dividing into single-speaker utterances, and write out texts of
    unbroken captured strings and their corresponding audio segments when
    the latter are between 2 and max_length seconds.
    """
    # pick the capture-length cap: random within max_dur when requested,
    # seeded from the file id so runs are reproducible per file
    if randomize:
        seed = ord(file_id[-1])
        random.seed(seed)
        max_length = random.randint(max_dur[0], max_dur[1])
    else:
        max_length = max_dur[1]

    logger.info("Processing file id {}...".format(file_id))

    # grab audio file from s3
    mp3 = os.path.join(mp3_dir, "{}.mp3".format(file_id))
    wav = os.path.join(mp3_dir, "{}.wav".format(file_id))
    if not os.path.isfile(wav):
        if not os.path.isfile(mp3):
            bucket = boto3.resource("s3").Bucket("cgws")
            logger.info("Downloading file {} from S3...".format(file_id))
            try:
                bucket.download_file("{}.mp3".format(file_id), mp3)
            except:  # NOTE(review): bare except also hides KeyboardInterrupt
                logger.warning(
                    "Could not download file {} from S3.".format(file_id))
                return
        # convert to 16 kHz mono wav via sox, discarding sox's own output
        FNULL = open(os.devnull, 'w')
        subprocess.call([
            "sox", "{}".format(mp3), "-r", "16k", "{}".format(wav), "remix",
            "-"
        ],
                        stdout=FNULL,
                        stderr=FNULL)

    # transcript
    txt_file = os.path.join(records_dir, "{}.txt".format(file_id))
    logger.info("Reading transcript {}...".format(file_id))
    try:
        with open(txt_file, "r") as tr:
            transcript = tr.read()
    except IOError:
        logger.warning("File {} does not exist.".format(txt_file))
        return

    # split transcript by speaker, and get timestamps (as seconds)
    # of the boundaries of each paragraph
    logger.info("Splitting transcript by speaker...")
    paragraphs = []
    times = []
    for paragraph in transcript.split("\n"):
        # paragraphs that begin with an h:mm:ss.t timestamp mark a speaker turn
        catch = re.match("\d:\d+:\d+\.\d", paragraph)
        if catch:
            timestamp = catch.group()
            h, m, s = timestamp.split(":")
            time = int(h) * 60 * 60 + int(m) * 60 + float(s)
            paragraphs.append(paragraph)
            times.append(time)
    # sentinel end time so times[i+1] is valid for the last paragraph
    file_end = get_duration(mp3)
    times.append(file_end)

    total_captures, captures_dur = 0, 0
    # taking one speaker at a time, find unbroken alignments up to max_length
    # and write out corresponding files
    for i, paragraph in enumerate(paragraphs):
        logger.info("Cleaning and trimming paragraph {}: \n{}".format(
            i, paragraph))
        paragraph_start, paragraph_end = times[i], times[i + 1]
        # don't bother with short files
        if paragraph_end - paragraph_start < min_dur:
            logger.info("Skipping paragraph {} (too short)...".format(i))
            continue
        if len(paragraph.split()) < 2:
            logger.info("Skipping paragraph {} (too few words)...".format(i))
            continue
        temp_wav = trim(file_id, wav, paragraph_start, paragraph_end, 0,
                        "/tmp")
        # unique name of json object to read/write
        # NOTE(review): hashlib.sha1 over a str is Python-2-only; Python 3
        # requires bytes -- confirm the intended interpreter.
        paragraph_hash = hashlib.sha1("{}{}{}{}".format(
            file_id, paragraph, paragraph_start, paragraph_end)).hexdigest()
        if use_filename_json is True:
            json_file = os.path.join(
                json_out_dir,
                "{}_{}_{}.json".format(file_id, paragraph_start,
                                       paragraph_end))
        else:
            json_file = os.path.join(json_out_dir,
                                     "{}.json".format(paragraph_hash))
        result = None
        # check if json object has been written from a previous run
        if not os.path.isfile(json_file):
            logger.info(
                "JSON file with hash {} not found.".format(paragraph_hash))
            try:
                logger.info("Resampling paragraph {}...".format(i))
                with gentle.resampled(temp_wav) as wav_file:
                    resources = gentle.Resources()
                    cleaned = clean(paragraph)
                    logger.info(
                        "Aligning paragraph {} with gentle...".format(i))
                    aligner = gentle.ForcedAligner(
                        resources,
                        cleaned,
                        nthreads=multiprocessing.cpu_count(),
                        disfluency=False,
                        conservative=False,
                        disfluencies=set(["uh", "um"]))
                    logger.info(
                        "Transcribing audio segment {} with gentle...".format(
                            i))
                    result = aligner.transcribe(wav_file)
            except:  # NOTE(review): bare except; logs traceback object only
                logger.warning("Paragraph {} - {} ".format(
                    i, sys.exc_info()[2]))
                os.remove(temp_wav)
                continue
            aligned_words = result.to_json()
            with open(json_file, "w") as f:
                f.write(aligned_words)
            if not result:
                logger.info("Empty result for paragraph {}.".format(i))
                os.remove(temp_wav)
                continue
        else:
            logger.info(
                "Found JSON of paragraph {} -- skipping alignment and transcription by gentle"
                .format(i))
        # dictionary of aligned words
        with open(json_file) as f:
            aligned = json.loads(f.read())
        # save all consecutively captured strings
        # and keep track of their start and stop times
        captures = []
        current, start_time, end_time = [], 0, 0
        # loop through every word as returned from gentle
        logger.info("Capturing strings in paragraph {}...".format(i))
        if not "words" in aligned:
            logger.info("No words in paragraph {}.".format(i))
            os.remove(temp_wav)
            continue
        # first two seconds will be skipped even if it contains a capture
        for catch in aligned["words"]:
            # successful capture
            if catch["case"] == "success" and catch[
                    "alignedWord"] != "<unk>" and catch[
                        'start'] > 5 and catch['end'] - catch['start'] > .07:
                # new capture group
                if not current:
                    # begin capturing if it has been two seconds since the last word
                    if catch["start"] - end_time > 1:
                        current = [catch["alignedWord"]]
                        start_time = catch["start"]
                        end_time = catch["end"]
                # continuation of a capture group
                else:
                    # large gap between last capture and this one
                    # likely that something was missing in the transcript
                    if catch["start"] - end_time > 1:
                        save_capture(captures, start_time, end_time, current)
                        current = []
                    # adding this word would equal or exceed max_length
                    elif catch["end"] - start_time >= max_length:
                        save_capture(captures, start_time, end_time, current,
                                     min_dur)
                        current = []
                        if randomize:
                            # re-roll the cap so segment lengths vary
                            max_length = random.randint(max_dur[0], max_dur[1])
                    # continue capturing
                    else:
                        current.append(catch["alignedWord"])
                        end_time = catch["end"]
            # a miss after prior success(es)
            elif current:
                save_capture(captures, start_time, end_time, current, min_dur)
                current = []
        # last word was a success but current capture hasn't been saved yet
        if current:
            save_capture(captures, start_time, end_time, current, min_dur)
        # write strings and split audio into consituent segments
        logger.info(
            "Writing text and audio segments from paragraph {}...".format(i))
        for result in captures:
            # filename encodes absolute start/end in centiseconds
            txt_segment = os.path.join(
                text_out_dir, "{}_{}_{}.txt".format(
                    file_id,
                    "{:07d}".format(int((times[i] + result["start"]) * 100)),
                    "{:07d}".format(int((times[i] + result["end"]) * 100))))
            with open(txt_segment, "w") as f:
                f.write("{}\n".format(result["string"]))
            segment = trim(file_id, temp_wav, result["start"], result["end"],
                           times[i], wav_out_dir)
            # make sure durations match
            segment_dur = get_duration(segment)
            assert segment_dur - result["duration"] <= .01
            total_captures += 1
            captures_dur += segment_dur
        # delete the clip of this speaker
        os.remove(temp_wav)
    # per-file logging
    total_dur = get_duration(mp3)
    logger.info("Wrote {} segments from {}, totalling {} seconds, out of a possible {}, ratio {:.2f}."\
        .format(total_captures,file_id,captures_dur,total_dur,captures_dur/total_dur))
    return
if not os.path.isfile(json_file): temp_wav = trim(file_id, wav, paragraph_start, paragraph_end, 0, "/tmp") if not os.path.isfile(temp_wav): continue try: with gentle.resampled(temp_wav) as wav_file: resources = gentle.Resources() cleaned = clean(paragraph) aligner = gentle.ForcedAligner( resources, cleaned, nthreads=multiprocessing.cpu_count() * args.threads_multiplier, disfluency=False, conservative=False, disfluencies=set(["uh", "um"])) result = aligner.transcribe(wav_file) aligned_words = result.to_json() with open(json_file, "w") as f: f.write(aligned_words) except: exc_type, exc_value, exc_traceback = sys.exc_info() lines = traceback.format_exception(exc_type, exc_value, exc_traceback) print ''.join(line for line in lines) continue
disfluencies = set(['uh', 'um'])


def on_progress(p):
    # Relay gentle's progress events to the debug log.
    for k, v in p.items():
        logging.debug("%s: %s" % (k, v))


with open(args.txtfile) as fh:
    transcript = fh.read()

resources = gentle.Resources()
logging.info("converting audio to 8K sampled wav")

with gentle.resampled(args.audiofile) as wavfile:
    logging.info("starting alignment")
    aligner = gentle.ForcedAligner(resources,
                                   transcript,
                                   nthreads=args.nthreads,
                                   disfluency=args.disfluency,
                                   conservative=args.conservative,
                                   disfluencies=disfluencies)
    result = aligner.transcribe(wavfile,
                                progress_cb=on_progress,
                                logging=logging)

# write to the requested output file, or stdout when no path was given
fh = open(args.output, 'w') if args.output else sys.stdout
try:
    fh.write(result.to_json(indent=2))
finally:
    # BUGFIX: the original never closed the output file; close only the
    # handle we opened -- never sys.stdout
    if args.output:
        fh.close()
        logging.info("output written to %s" % (args.output))
class Transcriber():
    """Drives gentle to align (or fully transcribe) an uploaded audio file.

    All artifacts for one job live under ``<data_dir>/transcriptions/<uid>``:
    the transcript, the raw upload, the resampled wav, the alignment JSON/CSV,
    a sentence-timing file, and a self-contained ``index.html`` viewer.

    NOTE(review): Python 2-era code -- the ``async`` parameter name,
    ``uuid4().get_hex()``, the bare ``print`` statement, and
    ``json.loads(encoding=...)`` are all invalid or removed in modern
    Python 3.
    """

    def __init__(self, data_dir, nthreads=4, ntranscriptionthreads=2):
        self.data_dir = data_dir                            # artifact root dir
        self.nthreads = nthreads                            # forced-alignment threads
        self.ntranscriptionthreads = ntranscriptionthreads  # full-transcription threads
        self.resources = gentle.Resources()
        self.full_transcriber = gentle.FullTranscriber(
            self.resources, nthreads=ntranscriptionthreads)
        # uid -> mutable status dict, shared with in-flight jobs so the
        # progress callback can update it in place.
        self._status_dicts = {}

    def get_status(self, uid):
        """Return the status dict for *uid*, creating an empty one if new."""
        return self._status_dicts.setdefault(uid, {})

    def out_dir(self, uid):
        """Return the output directory for job *uid*."""
        return os.path.join(self.data_dir, 'transcriptions', uid)

    # TODO(maxhawkins): refactor so this is returned by transcribe()
    def next_id(self):
        """Pick a fresh 8-hex-digit uid that does not collide on disk."""
        uid = None
        while uid is None or os.path.exists(os.path.join(self.data_dir, uid)):
            # get_hex() exists only on Python 2 UUID objects (py3: .hex).
            uid = uuid.uuid4().get_hex()[:8]
        return uid

    def transcribe(self, uid, transcript, audio, async, **kwargs):
        """Run one job end to end; return gentle's result (None on error).

        Writes the transcript and raw *audio* bytes under the job directory,
        resamples to the wav gentle expects, then force-aligns (or falls back
        to full transcription when *transcript* is empty). Progress and errors
        are reported by mutating the shared status dict for *uid*.
        ``async`` is accepted but unused here -- presumably consumed by the
        calling endpoint; TODO confirm.
        """
        status = self.get_status(uid)
        status['status'] = 'STARTED'
        output = {'transcript': transcript}

        outdir = os.path.join(self.data_dir, 'transcriptions', uid)

        # Persist inputs so a job can be inspected/recovered later.
        tran_path = os.path.join(outdir, 'transcript.txt')
        with open(tran_path, 'w') as tranfile:
            tranfile.write(transcript)
        audio_path = os.path.join(outdir, 'upload')
        with open(audio_path, 'w') as wavfile:
            wavfile.write(audio)

        status['status'] = 'ENCODING'

        # Non-zero return from gentle.resample means encoding failed
        # (invalid upload).
        wavfile = os.path.join(outdir, 'a.wav')
        if gentle.resample(os.path.join(outdir, 'upload'), wavfile) != 0:
            status['status'] = 'ERROR'
            status[
                'error'] = "Encoding failed. Make sure that you've uploaded a valid media file."
            # Save the status so that errors are recovered on restart of the server
            # XXX: This won't work, because the endpoint will override this file
            with open(os.path.join(outdir, 'status.json'), 'w') as jsfile:
                json.dump(status, jsfile, indent=2)
            return

        #XXX: Maybe we should pass this wave object instead of the
        # file path to align_progress
        wav_obj = wave.open(wavfile, 'r')
        status['duration'] = wav_obj.getnframes() / float(
            wav_obj.getframerate())
        status['status'] = 'TRANSCRIBING'

        def on_progress(p):
            # Fold gentle's progress keys straight into the shared status.
            for k, v in p.items():
                status[k] = v

        # With a transcript we force-align; otherwise fall back to the full
        # transcriber if its language model is available.
        if len(transcript.strip()) > 0:
            trans = gentle.ForcedAligner(self.resources,
                                         transcript,
                                         nthreads=self.nthreads,
                                         **kwargs)
        elif self.full_transcriber.available:
            trans = self.full_transcriber
        else:
            status['status'] = 'ERROR'
            status[
                'error'] = 'No transcript provided and no language model for full transcription'
            return

        output = trans.transcribe(wavfile,
                                  progress_cb=on_progress,
                                  logging=logging)

        # ...remove the original upload
        os.unlink(os.path.join(outdir, 'upload'))

        # Save
        with open(os.path.join(outdir, 'align.json'), 'w') as jsfile:
            jsfile.write(output.to_json(indent=2))
        with open(os.path.join(outdir, 'align.csv'), 'w') as csvfile:
            csvfile.write(output.to_csv())

        # add file datas: derive per-sentence (start, end) times by walking
        # the aligned words against sentence-end character offsets.
        sens_end_index = trans.ms.get_sentences_index()
        res = output.to_json()
        res = json.loads(res, encoding='utf-8', strict=True)
        time_sentences_index = []
        ss_dot = 0    # index of the next sentence boundary to cross
        s_pos = None  # char offset where the current sentence began
        time_pos = 0  # word index where the current sentence began
        try:
            for i, w in enumerate(res['words']):
                if w["case"] != "success":
                    continue  # word gentle could not align; no timing info
                end_v = w['endOffset']
                start_v = w['startOffset']
                if s_pos is None:
                    s_pos = start_v
                    time_pos = i
                # Crossed a sentence-end offset: close out this sentence.
                if end_v >= sens_end_index[ss_dot]:
                    ss_dot += 1
                    time_sentences_index.append(
                        (res['words'][time_pos]["start"],
                         res['words'][i]["end"]))
                    time_pos = i
                    s_pos = end_v
            # Trailing sentence that never crossed its end offset.
            if len(sens_end_index) != len(time_sentences_index):
                time_sentences_index.append(
                    (res['words'][time_pos]["start"],
                     res['words'][-1]["end"]))
            #print sens_end_index, len(sens_end_index)
            #print time_sentences_index, len(time_sentences_index)
            sens_str = trans.ms.get_sentences_string()
            save_ss = ""
            for i, t in enumerate(time_sentences_index):
                #print "{{time}}%s/%s{{end}}" % (str(round(float(t[0]), 2)), str(round(float(t[1]), 2)))
                #print "{{raw}}%s{{end}}" % (str(sens_str[i]))
                save_ss += "{{time}}" + str(round(float(t[0]), 2)) + "/" + str(
                    round(float(t[1]), 2)) + "{{end}}\n"
                save_ss += "{{raw}}" + sens_str[i] + "{{end}}\n"
            with open(os.path.join(outdir, 'time.csv'), 'w') as timefile:
                timefile.write(save_ss)
        except Exception as e:
            # Sentence timing is best-effort; the alignment output above is
            # already saved. (Python 2 print statement.)
            print traceback.format_exc()

        # Inline the alignment into the index.html file.
        htmltxt = open(get_resource('www/view_alignment.html')).read()
        htmltxt = htmltxt.replace("var INLINE_JSON;",
                                  "var INLINE_JSON=%s;" % (output.to_json()))
        open(os.path.join(outdir, 'index.html'), 'w').write(htmltxt)

        status['status'] = 'OK'

        logging.info('done with transcription.')
        return output
# Parse one transcript XML, extract the spoken fragments (skipping the
# DISCLAIMER speaker), then force-align the matching mp3 with gentle and save
# the word timings as JSON next to the text file.
# NOTE(review): target_file_txt below is a write handle opened outside this
# span; it must be flushed/closed before the re-open reads the text back --
# verify against the surrounding code.
parsed_source_xml = ET.parse(source_path_xml)
parsed_root = parsed_source_xml.getroot()
for turn in parsed_root.findall('.//vx:Turn', namespaces):
    # Skip boilerplate spoken by the 'DISCLAIMER' speaker.
    if 'DISCLAIMER' != turn.attrib['Speaker']:
        for fragment in turn.findall('.//vx:Fragment', namespaces):
            target_file_txt.write(fragment.text)
# Read the concatenated fragments back as the alignment transcript.
with open(target_path_txt) as target_file_txt:
    transcript = target_file_txt.read()
# Audio lives in a parallel 'audio' tree with an .mp3 extension.
source_path_mp3 = source_path_xml.replace('transcripts/extracted',
                                          'audio').replace('.xml', '.mp3')
if os.path.isfile(source_path_mp3) and transcript:
    target_path_json = target_path_txt.replace('.txt', '.json')
    with open(target_path_json, 'w') as target_file_json:
        print('converting audio to 8K sampled wav')
        # gentle.resampled() yields a temp 8K wav path for the aligner.
        with gentle.resampled(source_path_mp3) as wavfile:
            print('starting alignment for', source_path_xml, ' and ',
                  source_path_mp3)
            # resources / nthreads / disfluencies / on_progress are defined
            # earlier in this script, outside this span.
            aligner = gentle.ForcedAligner(resources,
                                           transcript,
                                           nthreads=nthreads,
                                           disfluency=False,
                                           conservative=False,
                                           disfluencies=disfluencies)
            result = aligner.transcribe(wavfile,
                                        progress_cb=on_progress,
                                        logging=logging)
            target_file_json.write(result.to_json(indent=2))
    print('finished alignment in', target_path_json)