def make_transcription(self): text = dict() for trs in utils.list_directory(self.transcription_dir, abspath=True): spk_id = os.path.splitext(os.path.basename(trs))[0] lines = utils.open_utf8(trs, 'r').readlines() # add utterence id from even lines starting at line 2 ids = [ spk_id + u'_' + re.sub(ur'\s+|:|;', u'', e) for e in lines[1::2] ] # delete linebreaks on odd lines starting at line 3 # (this does not take into account fancy unicode # linebreaks), see # http://stackoverflow.com/questions/3219014 transcriptions = [
def export(self):
    """Copy the final model files to the output directory.

    Entries of <recipe-dir>/exp/<model-type> whose name begins with
    a digit are skipped, every other entry is copied into
    self.output_dir (directories are copied recursively). The
    parent class export is called afterwards.

    """
    source_dir = os.path.join(self.recipe_dir, 'exp', self.model_type)

    for entry in utils.list_directory(source_dir, abspath=True):
        # names starting with a number are left out, as only the
        # final state is wanted
        if os.path.basename(entry)[0].isdigit():
            continue

        if not os.path.isdir(entry):
            shutil.copy(entry, self.output_dir)
            continue

        # directories (the log subdir) are copied as a whole
        target = os.path.join(self.output_dir, os.path.basename(entry))
        shutil.copytree(entry, target)

    super(AbstractAcousticModel, self).export()
def export(self):
    """Export the decoding results and the graph to the output directory.

    Each entry of <recipe-dir>/decode is copied into <output-dir>
    (directories are copied recursively), then <recipe-dir>/graph is
    copied to <output-dir>/graph. The parent class export is called
    afterwards.

    """
    self.log.debug('exporting results to %s', self.output_dir)

    decode_dir = os.path.join(self.recipe_dir, 'decode')
    for entry in utils.list_directory(decode_dir, abspath=True):
        if not os.path.isdir(entry):
            # plain file: goes directly into the output directory
            shutil.copy(entry, self.output_dir)
            continue

        # directory: replicated under the same name in the output
        target = os.path.join(self.output_dir, os.path.basename(entry))
        shutil.copytree(entry, target)

    graph_source = os.path.join(self.recipe_dir, 'graph')
    graph_target = os.path.join(self.output_dir, 'graph')
    shutil.copytree(graph_source, graph_target)

    super(Decode, self).export()
def correct_transcription(self):
    """Correct problems with the GlobalPhone Vietnamese transcripts

    The corrections are completely ad hoc and the results are stored
    in a temporary folder.

    On every odd line but the first (lines 3, 5, ... of each
    transcription file):

    - replace '_' by a space
    - remove leading and trailing whitespace
    - collapse every run of consecutive spaces into a single space

    Double spacings and '_' are actually only found for speakers 200
    to 208.

    After the call self.transcription_dir points to the temporary
    folder holding the corrected files. That folder is created with
    tempfile.mkdtemp() and is not removed by this method.

    Returns True.

    """
    # generate temporary output folder
    corrected_transcription_dir = tempfile.mkdtemp()

    # get the list of transcription files
    trss = utils.list_directory(self.transcription_dir, abspath=True)

    for trs in trss:
        # read transcript file, closing it as soon as it is loaded
        with utils.open_utf8(trs, 'r') as inp:
            lines = inp.readlines()

        # correct odd lines: splitting on a single space yields empty
        # strings between consecutive spaces, dropping them collapses
        # runs of any length (a single replace pass would leave
        # residue on runs of three or more spaces)
        lines[2::2] = [
            u' '.join(
                word
                for word in line.replace(u'_', u' ').strip().split(u' ')
                if word) + u'\n'
            for line in lines[2::2]]

        # write corrected version to temp
        output_file = os.path.join(
            corrected_transcription_dir, os.path.basename(trs))
        with utils.open_utf8(output_file, 'w') as out:
            for line in lines:
                out.write(line)

    # only switch to the corrected folder once every file is written
    self.transcription_dir = corrected_transcription_dir
    return True