Example #1
def transcribeAudio(lang, audio_file):
    """
    Call the Sphinx Java module to transcribe the audio file,
    then read the results and convert them into AudioWord objects.

    Args:
        lang (string): language
        audio_file (string): path to the audio file

    Returns:
        list(AudioWord): the start/end of each word
    """
    rword = re.compile(r'^{(?P<token>[^,]+),\s(?P<prob>[^,]+),\s\[(?P<from>\d+):(?P<to>\d+)]}$')
    outfile = os.path.join(config.TEMP_DIR, 'mapping.txt')
    mapping = None
    if os.path.exists(audio_file):
        gu.removeFile(outfile)
        ret = subprocess.call(['java', '-jar', config.AUDIO_TRANSCRIBER, lang, audio_file, outfile], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        if ret == 0: # success
            with open(outfile, 'r') as f:
                lines = f.readlines()
                mapping = []
                for line in lines:
                    m = rword.search(line)
                    if m:
                        mapping.append(AudioWord(m.group('token'), '', m.group('from'), m.group('to')))
    gu.removeFile(outfile)
    return mapping
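# Usage sketch for transcribeAudio's output format: the Java transcriber writes one
# token per line as '{token, prob, [from:to]}' (times presumably in ms); the sample
# line below is made up for illustration.
import re
rword = re.compile(r'^{(?P<token>[^,]+),\s(?P<prob>[^,]+),\s\[(?P<from>\d+):(?P<to>\d+)]}$')
m = rword.search('{hello, 0.87, [120:450]}')
assert m and m.group('token') == 'hello'
assert (m.group('from'), m.group('to')) == ('120', '450') # captured as strings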
Example #2
def alignAudio(lang, audio_file, transcript_file):
    """
    Call the Sphinx Java module to align an audio file with a transcript file,
    then read the results and convert them into AudioWord objects.

    Args:
        lang (string): language
        audio_file (string): path to the audio file
        transcript_file (string): path to the transcript file

    Returns:
        list(AudioWord): alignment status and, if applicable, the start/end of each token
    """
    rmapped = re.compile(r'^(?P<type>MAPPED):\s(?P<token>\S+)\s+\[(?P<from>\d+):(?P<to>\d+)]$')
    rmissing = re.compile(r'^(?P<type>MISSING):\s(?P<token>\S+)\s+$')
    outfile = os.path.join(config.TEMP_DIR, 'mapping.txt')
    mapping = None
    if os.path.exists(audio_file) and os.path.exists(transcript_file):
        gu.removeFile(outfile)
        ret = subprocess.call(['java', '-jar', config.AUDIO_ALIGNER, lang, audio_file, transcript_file, outfile], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        if ret == 0: # success
            with open(outfile, 'r') as f:
                lines = f.readlines()
                mapping = []
                for line in lines:
                    m = rmapped.search(line)
                    if m:
                        mapping.append(AudioWord(m.group('token'), m.group('type'), m.group('from'), m.group('to')))                        
                    else:
                        m = rmissing.search(line)
                        if m:
                            mapping.append(AudioWord(m.group('token'), m.group('type')))
    gu.removeFile(outfile)    
    return mapping
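# The aligner's output contains lines like 'MAPPED: hello  [120:450]' and
# 'MISSING: hello  '. The AudioWord container is not among these examples; a
# minimal sketch consistent with how it is constructed and consumed here (the
# field names, the int conversion, and the -1 defaults are assumptions):
class AudioWord:
    def __init__(self, token, status='', begin=-1, end=-1):
        self.token = token        # recognized/aligned token text
        self.status = status      # 'MAPPED', 'MISSING', or '' for plain transcription
        self.begin = int(begin)   # start of the token in the audio (ms), -1 if unknown
        self.end = int(end)       # end of the token in the audio (ms), -1 if unknown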
Example #3
def findBoundaries(lang, bookid, chapter):
    """
    Find the start and the end of a chapter in its audio file
    """
    audio_file, start_time, stop_time = book_manager.chapterAudio(lang, bookid, chapter)
    audio_segment = AudioSegment.from_mp3(audio_file) # read the audio
    audio_len = len(audio_segment)
    wavfile = os.path.join(config.TEMP_DIR, 'chapter%s.wav' % chapter)
    sp = utils.getSpacy(lang)
    text = book_manager.bookChapter(lang, bookid, chapter)
    doc = sp(text)
    doc_tokens = [tkn for tkn in doc if (not tkn.is_punct) and tkn.text.strip()]
    doc_start = [tok.text.lower() for tok in doc_tokens[:10]]
    doc_end = [tok.text.lower() for tok in doc_tokens[-10:]]
    # start of the chapter: transcribe the first minute
    gu.removeFile(wavfile)
    encodeForSphinx(audio_file, '00:00:00', '00:01:00', wavfile) # encode audio for speech recognition
    words = transcribeAudio(lang, wavfile)
    # drop non-word tokens such as '<sil>' or bracketed noise markers
    trans = [word for word in words if (not word.token.startswith('<')) and (not word.token.startswith('['))]
    start = msec2time(findPosition('end', 200, trans, doc_start))
    gu.removeFile(wavfile)
    # end of the chapter: transcribe the last 30 seconds
    segment_end = msec2time(audio_len)
    msec_start = audio_len - 30000
    segment_start = msec2time(msec_start)
    encodeForSphinx(audio_file, segment_start, segment_end, wavfile) # encode audio for speech recognition
    words = transcribeAudio(lang, wavfile)
    trans = [word for word in words if (not word.token.startswith('<')) and (not word.token.startswith('['))]
    # match backwards from the chapter's final words
    doc_end = list(reversed(doc_end))
    trans = list(reversed(trans))
    end = msec2time(findPosition('begin', msec_start - 300, trans, doc_end))
    gu.removeFile(wavfile)
    return (start, start_time, end, stop_time)
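# msec2time and its counterpart time2msec (used in alignChapter below) are not
# among these examples; a minimal sketch of the conversions they appear to
# perform, assuming the 'HH:MM:SS.mmm' timestamp format seen above:
def msec2time(msec):
    hours, msec = divmod(int(msec), 3600000)
    minutes, msec = divmod(msec, 60000)
    seconds, msec = divmod(msec, 1000)
    return '%02d:%02d:%02d.%03d' % (hours, minutes, seconds, msec)

def time2msec(ts):
    hms, _, frac = ts.partition('.')
    h, m, s = (int(x) for x in hms.split(':'))
    return (h * 3600 + m * 60 + s) * 1000 + (int(frac.ljust(3, '0')) if frac else 0)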
Example #4
def reset(fromLang, toLang):
    """
    Reset the lemma cache for a language pair: clear the in-memory
    mappings, delete the cache file, and restore the defaults.
    """
    dbName = __dbName(fromLang, toLang)
    lemma_mappings[dbName] = {}
    file_to_del = os.path.join(CACHE, dbName)
    try:
        gu.removeFile(file_to_del)
        setDefault(fromLang, toLang)
    except Exception as e:
        print('ERROR: could not remove lemma cache under %s: %s' % (file_to_del, e))
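# Usage sketch (the language codes are illustrative): discard any cached de->en
# lemma lookups and rebuild the cache from its defaults.
reset('de', 'en')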
Example #5
def alignChapter(lang, bookid, chapter):
    """
    Align a chapter of a book

    Args:
        lang (str): language
        bookid (str): identifier of a book
        chapter (int): the chapter to be aligned
    
    Returns:
        list of spacy tokens: the tokens with the added audio alignment information
    """
    bar = IncrementalBar('Processing %s [%s] (%s)' % (bookid, lang, chapter), max=100)
    bar.start()
    outfile = os.path.join(book_manager.chaptersPath(lang, bookid), book_manager.mappingFile(chapter))
    audio_file, start_time, stop_time = book_manager.chapterAudio(lang, bookid, chapter)
    wavfile = os.path.join(config.TEMP_DIR, 'chapter%s.wav' % chapter)
    gu.removeFile(wavfile)
    encodeForSphinx(audio_file, start_time, stop_time, wavfile) # encode audio for speech recognition
    # get spacy models for language processing
    sp = utils.getSpacy(lang)
    text = book_manager.bookChapter(lang, bookid, chapter)
    doc = sp(text)
    # keep only alphabetic, non-punctuation tokens for alignment
    doc_tokens = [tkn for tkn in doc if tkn.is_alpha and (not tkn.is_punct) and tkn.text.strip()]
    token_count = len(doc_tokens)
    audio_segment = AudioSegment.from_wav(wavfile) # read the audio
    audio_len = len(audio_segment)
    begin_tkn = 0
    begin_audio = 0
    startm = time2msec(start_time)
    stopm = time2msec(stop_time)
    chapter_len = stopm - startm # chapter duration in ms, used for progress reporting
    
    while begin_tkn < token_count:
        chunk = doc_tokens[begin_tkn:begin_tkn+50]
        # give the chunk a generous share of the audio (25% more than its proportional length)
        rel_len = 1.25 * len(chunk) / token_count
        end_audio = begin_audio + int(rel_len * audio_len)
        last_idx, begin_audio = alignChunk(lang, audio_segment=audio_segment, audio_begin=begin_audio, audio_end=end_audio, chunk=chunk)
        bar.goto(int(100.0 * begin_audio / chapter_len))
        if last_idx == -1: # could not map anything
            break
        # resume right after the last successfully mapped token
        begin_tkn += last_idx + 1
    gu.removeFile(wavfile)
    bar.finish()
    saveAudioMapping(doc_tokens, startm, stopm, outfile)
    return doc_tokens
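# encodeForSphinx is not among these examples; a minimal sketch of what it
# appears to do, using pydub and the time2msec helper, with the 16 kHz mono
# format mirroring the export parameters in alignChunk below (an assumption):
from pydub import AudioSegment

def encodeForSphinx(audio_file, start_time, stop_time, wavfile):
    audio = AudioSegment.from_mp3(audio_file)                     # load the source mp3
    segment = audio[time2msec(start_time):time2msec(stop_time)]   # cut the requested range
    segment.export(wavfile, format="wav", parameters=["-ac", "1", "-ar", "16000"])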
Example #6
def alignChunk(lang, audio_segment, audio_begin, audio_end, chunk):
    """
    Align a chunk of a longer text/audio

    Args:
        lang (str): language 
        audio_segment (AudioSegment): The entire audio content
        audio_begin (int): The start of the audio range (ms)
        audio_end (int): The end of the audio range (ms)
        chunk (list of spacy Tokens): List of spacy tokens with the transcript
    
    Returns:
        (int, int): index of the last mapped token and the end position (ms) of the
        last mapping in the full audio, or (-1, -1) if nothing could be aligned
    """
    temp_audio = os.path.join(config.TEMP_DIR, 'temp.wav')
    temp_transcript = os.path.join(config.TEMP_DIR, 'transcript.txt')
    audio = audio_segment[audio_begin:audio_end]
    audio.export(temp_audio, format="wav", parameters=["-ac", "1", "-ar", "16000"]) # 16 kHz mono for Sphinx
    trans = " ".join([word.lower_ for word in chunk])
    with open(temp_transcript, 'w') as f:
        f.write(trans)
    alignment = alignAudio(lang, temp_audio, temp_transcript) # try to align the audio with the text
    lastMappedEnd = -1
    lastIdx = -1
    if alignment is None:
        gu.removeFile(temp_audio)
        gu.removeFile(temp_transcript)
        return (lastIdx, lastMappedEnd)
    # add mapping information to the tokens
    for idx, tkn in enumerate(chunk):
        tkn._.spoken = alignment[idx].token
        end = alignment[idx].end
        end = end if end == -1 else end + audio_begin # shift into full-audio coordinates
        # accept the mapping only if it does not jump more than 10 s past the previous one
        if end > 0 and ((lastMappedEnd < 0) or (end < lastMappedEnd + 10000)):
            tkn._.end = end
            lastMappedEnd = end
            lastIdx = idx
            bgn = alignment[idx].begin
            if bgn > 0:
                bgn += audio_begin
                tkn._.begin = bgn
    gu.removeFile(temp_audio)
    gu.removeFile(temp_transcript)
    return (lastIdx, lastMappedEnd)
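# The tkn._.spoken / tkn._.begin / tkn._.end attributes used above are spaCy
# custom token extensions and must be registered once before alignment runs;
# the default values below are assumptions:
from spacy.tokens import Token

Token.set_extension('spoken', default='')
Token.set_extension('begin', default=-1)
Token.set_extension('end', default=-1)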