def loadDictionary(langKey, langVal, minLen=0):
    """Build a translation dictionary from `langKey` words to `langVal` words.

    The dictionary asset for the pair `(langKey, langVal)` is used when it is
    available locally; otherwise the asset for the reversed pair is tried and
    its entries are added with key and value swapped.

    Parameters
    ----------
    langKey : str
        3-letter code of the language used for dictionary keys.
    langVal : str
        3-letter code of the language used for dictionary values.
    minLen : int, optional
        Minimum word length, applied to a language only when its `ngrams`
        setting is falsy (otherwise `ngrams` is the minimum).

    Returns
    -------
    gizmo.Dictionary
        Dictionary filled with lower-cased keys and their values.

    Raises
    ------
    Error
        When neither the direct nor the reversed asset is available locally.
    """
    keyInfo = languages.get(code3=langKey)
    valInfo = languages.get(code3=langVal)
    keyMinLen = keyInfo.ngrams or minLen
    valMinLen = valInfo.ngrams or minLen

    dictionary = gizmo.Dictionary()

    def addEntry(key, val):
        # Too short words are dropped before any other processing.
        if len(key) < keyMinLen or len(val) < valMinLen:
            return
        if keyInfo.rightToLeft:
            key = key[::-1]
        if valInfo.rightToLeft:
            val = val[::-1]
        for keyGram in splitNgrams(key, keyInfo.ngrams):
            for valGram in splitNgrams(val, valInfo.ngrams):
                dictionary.add(keyGram.lower(), valGram)

    # Prefer the asset in the requested direction, fall back to the reversed.
    asset = assets.getAsset('dict', (langKey, langVal))
    swapped = not asset.localVersion()
    if swapped:
        asset = assets.getAsset('dict', (langVal, langKey))

    if not asset.localVersion():
        # NOTE(review): "transaltion" looks like a typo, but the string is a
        # translation msgid, so it is left untouched here - confirm before
        # changing it together with the message catalogs.
        msg = _('There is no dictionary for transaltion from {} to {}') \
                .format(langKey, langVal)
        raise Error(msg) \
                .add('language1', langKey) \
                .add('language2', langVal)

    for first, second in asset.readDictionary():
        if swapped:
            addEntry(second, first)
        else:
            addEntry(first, second)

    logger.info('dictionary ready with %u entries', dictionary.size())
    return dictionary
def getLangFromPath(path):
    """Extract the language code embedded at the end of a file name.

    Everything after the last dot is dropped, then the trailing run of
    alphabetic characters is measured. A run of exactly two or three letters
    is treated as a language code and looked up with `languages.get`.

    Parameters
    ----------
    path : str
        File name or path to inspect.

    Returns
    -------
    str
        `code3` of the language entry found for the trailing letters, or
        `None` if the name does not end with a 2- or 3-letter run.

    Examples
    --------
    - `subtitles.eng.srt` - eng
    - `subtitles-fr.srt` - fre
    """
    stem = path.rsplit('.', 1)[0]

    # Count alphabetic characters walking backwards from the end of the stem.
    letters = 0
    while letters < len(stem) and stem[-1 - letters].isalpha():
        letters += 1

    if letters not in (2, 3):
        return None
    return languages.get(stem[-letters:].lower()).code3
def format(self, pattern, sub, ref):
    """Format `pattern` using fields derived from the `sub` and `ref` items.

    The field dictionary (`self.d`) is rebuilt only when the identifying
    attributes (path, no, lang) of `sub` or `ref` differ from those of the
    previous call; the last result is returned directly when the pattern is
    unchanged as well.

    Returns `None` if any of the arguments is `None`, otherwise the value
    produced by `_formatPattern`.
    """
    if any(arg is None for arg in (pattern, sub, ref)):
        return None

    cacheKey = (sub.path, sub.no, sub.lang, ref.path, ref.no, ref.lang)
    sameItems = self.cache[0] == cacheKey

    # Same items and same pattern: nothing to recompute.
    if sameItems and self.cache[1] == pattern:
        return self.cache[2]

    if not sameItems:
        self.d = fields = {}
        for prefix, item in (('sub_', sub), ('ref_', ref)):
            fields[prefix + 'path'] = item.path
            # Stored number is incremented by one for presentation.
            fields[prefix + 'no'] = str(item.no + 1)
            fields[prefix + 'lang'] = item.lang or ''
            fields[prefix + 'lang2'] = languages.get(code3=item.lang).code2 or ''
            fileName = os.path.basename(item.path)
            fields[prefix + 'name'] = os.path.splitext(fileName)[0]
            fields[prefix + 'dir'] = os.path.dirname(item.path)

    result = _formatPattern(pattern, self.d)
    self.cache = (cacheKey, pattern, result)
    return result
def __init__(self, stream, runCb=None):
    ''' Subtitle decoding pipeline:

    Demux --> SubtitleDec --[words]--> {NgramSplitter} --[words]--> ...

    The NgramSplitter stage is inserted only when the language of the stream
    has `ngrams` configured. `self.sink` refers to the last stage of the
    pipeline (the decoder, or the splitter when it is present).
    '''
    super().__init__(stream, runCb)
    self.dec = gizmo.SubtitleDec()
    self.dec.setMinWordLen(settings().minWordLen)
    self.ngramSplitter = None
    self.sink = self.dec

    # No language set on the stream means no language-specific setup.
    langInfo = stream.lang and languages.get(code3=stream.lang.lower())
    if langInfo:
        if langInfo.rightToLeft:
            logger.info('switching to right-to-left for file "%s"', stream.path)
            self.dec.setRightToLeft(True)

        if langInfo.ngrams:
            logger.info('switching to %i-gram for file "%s"',
                    langInfo.ngrams, stream.path)
            # The n-gram size overrides the minimum word length from settings.
            self.dec.setMinWordLen(langInfo.ngrams)
            self.ngramSplitter = gizmo.NgramSplitter(langInfo.ngrams)
            self.dec.addWordsListener(self.ngramSplitter.pushWord)
            self.sink = self.ngramSplitter

    if stream.enc is not None:
        self.dec.setEncoding(stream.enc)

    self.demux.connectDec(self.dec, stream.no)
def detectEncoding(path, lang, probeSize=32 * 1024):
    """Guess the text encoding of a file by trial decoding.

    Candidate encodings are tried in order: UTF-8, the encodings listed for
    `lang` by `languages.get`, and finally the encoding of the default locale
    (when it is known and not already on the list). The first candidate that
    decodes the beginning of the file without `UnicodeError` wins.

    Parameters
    ----------
    path : str
        Path of the file to probe.
    lang : str
        Language code used to select candidate encodings. When empty, the
        language part of the default locale is used instead, if available.
    probeSize : int, optional
        Maximum number of characters read from the file for each candidate.

    Returns
    -------
    str
        Name of the first encoding that decoded the probe, or `None` if no
        candidate succeeded.

    Raises
    ------
    Error
        When the file does not exist.
    """
    try:
        dlang, denc = locale.getdefaultlocale()
    except Exception as e:
        # Best effort only: detection still works without locale hints.
        logger.warning('getdefaultlocale failed, %r', e)
        dlang, denc = None, None

    if not lang and dlang:
        lang = dlang.split('_', 1)[0]

    encs = ['UTF-8'] + languages.get(lang).encodings
    if denc and denc not in encs:
        encs.append(denc)

    try:
        for enc in encs:
            with open(path, 'r', encoding=enc) as fp:
                try:
                    # Honour the caller's probe size instead of a fixed 32 KiB.
                    fp.read(probeSize)
                    logger.info('detected encoding %s for file "%s"', enc, path)
                    return enc
                except UnicodeError:
                    # Wrong guess, move on to the next candidate.
                    pass
    except FileNotFoundError:
        raise Error('File not found').add('path', path)

    logger.info('couldn\'t detect encoding for file "%s", tried %s', path, encs)
    return None
def getLangFromPath(path):
    ''' Looks for a two- or three-letter language code placed right before
    the file extension, e.g. subtitles.eng.srt or subtitles-fr.srt.

    Returns code3 of the language entry obtained from languages.get for these
    letters, or None when the name does not end with 2 or 3 letters.
    '''
    base = path.rsplit('.', 1)[0]

    # Length of the alphabetic run at the very end of the name.
    count = 0
    for ch in base[::-1]:
        if not ch.isalpha():
            break
        count += 1

    if count in (2, 3):
        code = base[-count:].lower()
        return languages.get(code).code3
    return None
def genDefaultFileName(self, path, suffix=None):
    """Compose a default `.srt` file name from the base name of `path`.

    The result is `<name>.<extra>.srt`, where `<extra>` is `suffix` when
    given. Without a suffix, the subtitle language of the task may be
    appended depending on `settings().appendLangCode`: the stored language
    code for `3` or `True`, the 2-letter code (if the language has one) for
    `2`. With no extra part the result is `<name>.srt`.

    Any exception is logged as a warning and `None` is returned.
    """
    try:
        stem = os.path.splitext(os.path.basename(path))[0]
        parts = [stem]

        if suffix:
            parts.append(suffix)
        else:
            mode = settings().appendLangCode
            subLang = mode and self.task.sub.lang
            if subLang:
                if mode in (3, True):
                    parts.append(subLang)
                elif mode == 2:
                    code2 = languages.get(code=subLang).code2
                    if code2:
                        parts.append(code2)

        parts.append('srt')
        return '.'.join(parts)

    except Exception as err:
        logger.warning('%r', err)
        return None
def __init__(self, stream, runCb=None):
    ''' Speech recognition pipeline:

    Demux --> AudioDec --> Resampler --> SpeechRecognition --[words]--> {NgramSplitter} --[words]--> ...

    The NgramSplitter stage is attached only when the language of the stream
    has `ngrams` configured. `self.sink` refers to the last stage producing
    words (the recognizer, or the splitter when it is present).
    '''
    super().__init__(stream, runCb)

    model = speech.loadSpeechModel(stream.lang)
    self.dec = gizmo.AudioDec()

    audioFormat = speech.getSpeechAudioFormat(model)
    logger.info('speech recognition audio format: %s', audioFormat)

    self.speechRec = speech.createSpeechRec(model)
    self.speechRec.setMinWordProb(settings().minWordProb)
    self.speechRec.setMinWordLen(settings().minWordLen)

    self.ngramSplitter = None
    self.sink = self.speechRec

    # Language lookup is skipped entirely when the stream has no language.
    info = languages.get(code3=stream.lang.lower()) if stream.lang else None
    ngramSize = info.ngrams if info else None
    if ngramSize:
        logger.info('switching to %i-gram for audio "%s"',
                ngramSize, stream.path)
        # The n-gram size overrides the minimum word length from settings.
        self.speechRec.setMinWordLen(ngramSize)
        self.ngramSplitter = gizmo.NgramSplitter(ngramSize)
        self.speechRec.addWordsListener(self.ngramSplitter.pushWord)
        self.sink = self.ngramSplitter

    self.resampler = gizmo.Resampler()
    self.channels = stream.channels
    self.resampler.connectFormatChangeCallback(self.onAudioFormatChanged)

    # Wire the stages: demux -> decoder -> resampler -> recognizer.
    self.demux.connectDec(self.dec, stream.no)
    self.dec.connectOutput(self.resampler)
    self.resampler.connectOutput(self.speechRec, audioFormat)