def __init__(self, model_dir=None, conf_file=None):
    """Validate the model location, load the code table and build the classifier.

    :param model_dir: Directory to use; ``self.MODEL_DIR`` is used when this
        is ``None``.
    :param conf_file: Name of the configuration file, resolved against
        ``model_dir``; ``self.CONF_FILE`` is used when this is ``None``.
    :raises ValueError: If ``model_dir`` is not an existing directory, or the
        resolved configuration file is not an existing file.
    """
    chosen_dir = self.MODEL_DIR if model_dir is None else model_dir
    if not path.isdir(chosen_dir):
        raise ValueError('Directory does not exist: %s' % chosen_dir)

    conf_name = self.CONF_FILE if conf_file is None else conf_file
    # The configuration file name is always interpreted relative to the
    # model directory, then made absolute.
    conf_path = path.abspath(path.join(chosen_dir, conf_name))
    if not path.isfile(conf_path):
        raise ValueError('File does not exist: %s' % conf_path)

    # Start from an empty table; _load_config() fills it in.
    self._lang_codes = {}
    self._load_config(conf_path)
    self.ngram = NGram(chosen_dir)
class LanguageIdentifier(object):
    """Identify the language of text and map the result to a language code.

    Classification is delegated to ``NGram``; the classifier's result is then
    looked up in a name-to-code table read from the configuration file.
    """

    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""

    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to ``MODEL_DIR``).
    """

    def __init__(self, model_dir=None, conf_file=None):
        """Validate the model location, load the code table and build the
        classifier.

        :param model_dir: Directory to use; defaults to ``MODEL_DIR``.
        :param conf_file: Name of the configuration file, relative to
            ``model_dir``; defaults to ``CONF_FILE``.
        :raises ValueError: If ``model_dir`` is not an existing directory, or
            the resolved configuration file is not an existing file.
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir,))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file,))

        self._lang_codes = {}
        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in
        the configuration file.

        Each usable line holds a language name followed by a language code,
        separated by whitespace. Empty lines, comment lines (first token
        starting with ``#``) and lines without a code are skipped.

        :param conf_file: Path of the configuration file to read.
        """
        # The context manager closes the file even if reading fails.
        with open(conf_file) as conf:
            lines = conf.read().splitlines()

        for line in lines:
            parts = line.split()
            # Test the first token rather than the raw line so that indented
            # comments are skipped as well.
            if not parts or parts[0].startswith('#'):
                continue  # Skip comment- and empty lines
            if len(parts) < 2:
                continue  # No code on this line, so there is nothing to map

            lname, lcode = parts[0], parts[1]

            # Make sure lname is not prefixed by directory names
            lname = path.split(lname)[-1]
            if extsep in lname:
                lname = lname[:lname.rindex(extsep)]  # Remove extension if it has

            # Remove trailing '[_-]-utf8' from code
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith(('-', '_')):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string.

        :param text: The text to classify.
        :returns: ``None`` for empty text; otherwise the code mapped to the
            classifier's result, or the classifier's result itself when it
            has no entry in the code table.
        """
        if not text:
            return None

        result = self.ngram.classify(text)
        # Fall back to the raw classifier result when no code is known for it.
        return self._lang_codes.get(result, result)

    def _identify_units_lang(self, instore, attr, limit):
        """Identify the language of the ``attr`` text of the first ``limit``
        units in ``instore``.

        Shared implementation of :meth:`identify_source_lang` and
        :meth:`identify_target_lang`.

        :param instore: ``TranslationStore`` or list or tuple of units.
        :param attr: Name of the unit attribute holding the text
            (``'source'`` or ``'target'``).
        :param limit: Number of leading units to sample.
        :returns: The identified language's code or ``None``.
        """
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        # NOTE(review): slicing assumes the store supports ``[:n]``; this is
        # true for list/tuple -- confirm it for ``TranslationStore``.
        sampled = (
            getattr(unit, attr)
            for unit in instore[:limit]
            if unit.istranslatable()
        )
        text = u' '.join(piece for piece in sampled if piece)
        if not text:
            return None
        return self.identify_lang(text)

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
        units.

        Only the first 50 units are sampled.

        :type  instore: ``TranslationStore`` or list or tuple of
            ``TranslationUnit``s.
        :param instore: The translation store to extract source text from.
        :returns: The identified language's code or ``None`` if the language
            could not be identified."""
        return self._identify_units_lang(instore, 'source', 50)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
        units.

        Only the first 200 units are sampled.

        :type  instore: ``TranslationStore`` or list or tuple of
            ``TranslationUnit``s.
        :param instore: The translation store to extract target text from.
        :returns: The identified language's code or ``None`` if the language
            could not be identified."""
        return self._identify_units_lang(instore, 'target', 200)
class LanguageIdentifier(object):
    """Identify the language of text and map the result to a language code.

    Classification is delegated to ``NGram``; the classifier's result is then
    looked up in a name-to-code table read from the configuration file.
    """

    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""

    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to ``MODEL_DIR``).
    """

    def __init__(self, model_dir=None, conf_file=None):
        """Validate the model location, load the code table and build the
        classifier.

        :param model_dir: Directory to use; defaults to ``MODEL_DIR``.
        :param conf_file: Name of the configuration file, relative to
            ``model_dir``; defaults to ``CONF_FILE``.
        :raises ValueError: If ``model_dir`` is not an existing directory, or
            the resolved configuration file is not an existing file.
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir,))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file,))

        self._lang_codes = {}
        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in
        the configuration file.

        Each usable line holds a language name followed by a language code,
        separated by whitespace. Empty lines, comment lines (first token
        starting with ``#``) and lines without a code are skipped.

        :param conf_file: Path of the configuration file to read.
        """
        # The context manager closes the file even if reading fails.
        with open(conf_file) as conf:
            lines = conf.read().splitlines()

        for line in lines:
            parts = line.split()
            # Test the first token rather than the raw line so that indented
            # comments are skipped as well.
            if not parts or parts[0].startswith('#'):
                continue  # Skip comment- and empty lines
            if len(parts) < 2:
                continue  # No code on this line, so there is nothing to map

            lname, lcode = parts[0], parts[1]

            # Make sure lname is not prefixed by directory names
            lname = path.split(lname)[-1]
            if extsep in lname:
                lname = lname[:lname.rindex(extsep)]  # Remove extension if it has

            # Remove trailing '[_-]-utf8' from code
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith(('-', '_')):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string.

        :param text: The text to classify.
        :returns: ``None`` for empty text; otherwise the code mapped to the
            classifier's result, or the classifier's result itself when it
            has no entry in the code table.
        """
        if not text:
            return None

        result = self.ngram.classify(text)
        # Fall back to the raw classifier result when no code is known for it.
        return self._lang_codes.get(result, result)

    def _identify_units_lang(self, instore, attr, limit):
        """Identify the language of the ``attr`` text of the first ``limit``
        units in ``instore``.

        Shared implementation of :meth:`identify_source_lang` and
        :meth:`identify_target_lang`.

        :param instore: ``TranslationStore`` or list or tuple of units.
        :param attr: Name of the unit attribute holding the text
            (``'source'`` or ``'target'``).
        :param limit: Number of leading units to sample.
        :returns: The identified language's code or ``None``.
        """
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        # NOTE(review): slicing assumes the store supports ``[:n]``; this is
        # true for list/tuple -- confirm it for ``TranslationStore``.
        sampled = (
            getattr(unit, attr)
            for unit in instore[:limit]
            if unit.istranslatable()
        )
        text = u' '.join(piece for piece in sampled if piece)
        if not text:
            return None
        return self.identify_lang(text)

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
        units.

        Only the first 50 units are sampled.

        :type  instore: ``TranslationStore`` or list or tuple of
            ``TranslationUnit``s.
        :param instore: The translation store to extract source text from.
        :returns: The identified language's code or ``None`` if the language
            could not be identified."""
        return self._identify_units_lang(instore, 'source', 50)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
        units.

        Only the first 200 units are sampled.

        :type  instore: ``TranslationStore`` or list or tuple of
            ``TranslationUnit``s.
        :param instore: The translation store to extract target text from.
        :returns: The identified language's code or ``None`` if the language
            could not be identified."""
        return self._identify_units_lang(instore, 'target', 200)