예제 #1
0
    def __init__(self, model_dir=None, conf_file=None):
        """Validate the model locations, load the configuration and build
        the ngram classifier.

        :param model_dir: Directory passed to ``NGram``; ``self.MODEL_DIR``
            is used when this is ``None``.
        :param conf_file: Configuration file name, joined onto the model
            directory; ``self.CONF_FILE`` is used when this is ``None``.
        :raises ValueError: If the directory, or the resolved configuration
            file, does not exist.
        """
        models = self.MODEL_DIR if model_dir is None else model_dir
        if not path.isdir(models):
            raise ValueError('Directory does not exist: %s' % (models))

        conf_name = self.CONF_FILE if conf_file is None else conf_file
        conf_path = path.abspath(path.join(models, conf_name))
        if not path.isfile(conf_path):
            raise ValueError('File does not exist: %s' % (conf_path))

        # Start from an empty mapping, then let the config loader run.
        self._lang_codes = {}
        self._load_config(conf_path)
        self.ngram = NGram(models)
예제 #2
0
파일: identify.py 프로젝트: ANKIT-KS/fjord
    def __init__(self, model_dir=None, conf_file=None):
        """Set up the identifier from a model directory and a config file.

        :param model_dir: Model directory; falls back to ``self.MODEL_DIR``
            when ``None``.
        :param conf_file: Config file name resolved against the model
            directory; falls back to ``self.CONF_FILE`` when ``None``.
        :raises ValueError: When the model directory is not a directory or
            the resolved config path is not a file.
        """
        directory = model_dir
        if directory is None:
            directory = self.MODEL_DIR
        directory_found = path.isdir(directory)
        if not directory_found:
            message = 'Directory does not exist: %s' % (directory)
            raise ValueError(message)

        filename = conf_file
        if filename is None:
            filename = self.CONF_FILE
        joined = path.join(directory, filename)
        config_path = path.abspath(joined)
        if not path.isfile(config_path):
            message = 'File does not exist: %s' % (config_path)
            raise ValueError(message)

        # Empty name -> code table, filled in by the config loader below.
        self._lang_codes = dict()
        self._load_config(config_path)
        self.ngram = NGram(directory)
예제 #3
0
class LanguageIdentifier(object):
    """Identify the language of text with ngram language models.

    The value returned by the ngram classifier is translated to a language
    code through the name-code pairs read from a configuration file, when
    such a pair exists.
    """

    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""
    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to ``MODEL_DIR``).
    """

    def __init__(self, model_dir=None, conf_file=None):
        """Validate the model locations, load the configuration and build
        the ngram classifier.

        :param model_dir: Directory passed to ``NGram``; defaults to
            ``MODEL_DIR``.
        :param conf_file: Configuration file name, joined onto
            ``model_dir``; defaults to ``CONF_FILE``.
        :raises ValueError: If the directory or the resolved configuration
            file does not exist.
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file))

        # The mapping has to exist before _load_config() fills it in.
        self._lang_codes = {}
        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in
        the configuration file.

        Every line is split on whitespace. The first field is the language
        name and the second its code. Empty lines, comment lines (first
        field starting with ``#``) and lines with fewer than two fields are
        skipped.

        :param conf_file: Path of the configuration file to read.
        """
        # Context manager so the file handle is closed deterministically.
        with open(conf_file) as conf:
            lines = conf.read().splitlines()

        for line in lines:
            parts = line.split()
            # Look at the first token, not the raw line, so that indented
            # comments are skipped too instead of being stored as entries.
            if not parts or parts[0].startswith('#'):
                continue  # Skip comment- and empty lines
            if len(parts) < 2:
                continue  # No code on this line; nothing to map

            lname, lcode = parts[0], parts[1]

            # Make sure lname is not prefixed by directory names
            lname = path.split(lname)[-1]
            if extsep in lname:
                # Drop everything from the last extension separator on
                lname = lname.rpartition(extsep)[0]

            # Remove a trailing '-utf8', then one trailing '-' or '_'
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith(('-', '_')):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string.

        :param text: The text to classify.
        :returns: ``None`` for empty text; otherwise the classifier result,
            replaced by its configured language code when one is known.
        """
        if not text:
            return None
        result = self.ngram.classify(text)
        return self._lang_codes.get(result, result)

    def _identify_store_lang(self, instore, attr, limit):
        """Identify the language of the ``attr`` text of the first ``limit``
        items of ``instore``.

        Shared implementation of :meth:`identify_source_lang` and
        :meth:`identify_target_lang`.

        :param instore: ``TranslationStore`` or list or tuple of units.
        :param attr: Name of the unit attribute holding the text
            (``'source'`` or ``'target'``).
        :param limit: Number of leading items to sample.
        :returns: The identified language's code, or ``None`` if ``instore``
            has an unsupported type or yields no text.
        """
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        # NOTE(review): slicing assumes TranslationStore supports
        # ``store[:n]`` -- confirm against the storage base class.
        texts = (getattr(unit, attr) for unit in instore[:limit]
                 if unit.istranslatable())
        text = u' '.join(piece for piece in texts if piece)
        if not text:
            return None
        return self.identify_lang(text)

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
            units.

            :type  instore: ``TranslationStore`` or list or tuple of
                ``TranslationUnit``s.
            :param instore: The translation store to extract source text from.
            :returns: The identified language's code or ``None`` if the language
                could not be identified."""
        return self._identify_store_lang(instore, 'source', 50)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
            units.

            :type  instore: ``TranslationStore`` or list or tuple of
                ``TranslationUnit``s.
            :param instore: The translation store to extract target text from.
            :returns: The identified language's code or ``None`` if the language
                could not be identified."""
        return self._identify_store_lang(instore, 'target', 200)
예제 #4
0
파일: identify.py 프로젝트: ANKIT-KS/fjord
class LanguageIdentifier(object):
    """Guess the language of a string or of translation units.

    Classification is delegated to an ``NGram`` built from ``MODEL_DIR``;
    its result is mapped to a language code via the pairs loaded from
    ``CONF_FILE`` when a matching entry exists.
    """

    MODEL_DIR = get_abs_data_filename('langmodels')
    """The directory containing the ngram language model files."""
    CONF_FILE = 'fpdb.conf'
    """
    The name of the file that contains language name-code pairs
    (relative to ``MODEL_DIR``).
    """

    def __init__(self, model_dir=None, conf_file=None):
        """Check that the model directory and config file exist, then load
        the config and create the classifier.

        :param model_dir: Model directory; ``MODEL_DIR`` when ``None``.
        :param conf_file: Config file name relative to ``model_dir``;
            ``CONF_FILE`` when ``None``.
        :raises ValueError: If ``model_dir`` is not a directory or the
            resolved config path is not a file.
        """
        if model_dir is None:
            model_dir = self.MODEL_DIR
        if not path.isdir(model_dir):
            raise ValueError('Directory does not exist: %s' % (model_dir))

        if conf_file is None:
            conf_file = self.CONF_FILE
        conf_file = path.abspath(path.join(model_dir, conf_file))
        if not path.isfile(conf_file):
            raise ValueError('File does not exist: %s' % (conf_file))

        # _load_config() writes into this dict, so create it first.
        self._lang_codes = {}
        self._load_config(conf_file)
        self.ngram = NGram(model_dir)

    def _load_config(self, conf_file):
        """Load the mapping of language names to language codes as given in
        the configuration file.

        Lines are whitespace-separated ``name code`` pairs. Blank lines,
        lines whose first field starts with ``#`` and lines without a
        second field are ignored.

        :param conf_file: Path of the configuration file to read.
        """
        # Read inside ``with`` so the handle is not left open.
        with open(conf_file) as handle:
            content = handle.read()

        for line in content.splitlines():
            fields = line.split()
            if not fields:
                continue  # Empty or whitespace-only line
            if fields[0].startswith('#'):
                # Checking the first field also catches indented comments,
                # which would otherwise be stored as name/code pairs.
                continue
            if len(fields) < 2:
                continue  # Name without a code: nothing to record

            lname, lcode = fields[0], fields[1]

            # Make sure lname is not prefixed by directory names
            lname = path.split(lname)[-1]
            if extsep in lname:
                lname = lname[:lname.rindex(extsep)]  # Remove extension if it has

            # Remove a trailing '-utf8', then one trailing '-' or '_'
            if lcode.endswith('-utf8'):
                lcode = lcode[:-len('-utf8')]
            if lcode.endswith(('-', '_')):
                lcode = lcode[:-1]

            self._lang_codes[lname] = lcode

    def identify_lang(self, text):
        """Identify the language of the text in the given string.

        :param text: The text to classify.
        :returns: ``None`` if ``text`` is empty; otherwise the configured
            code for the classifier result, or the raw result when no code
            is configured for it.
        """
        if not text:
            return None
        result = self.ngram.classify(text)
        return self._lang_codes.get(result, result)

    def _identify_store_lang(self, instore, attr, limit):
        """Join the ``attr`` text of the first ``limit`` items of ``instore``
        and identify its language.

        Common code behind :meth:`identify_source_lang` and
        :meth:`identify_target_lang`.

        :param instore: ``TranslationStore`` or list or tuple of units.
        :param attr: Unit attribute to read, ``'source'`` or ``'target'``.
        :param limit: How many leading items to take text from.
        :returns: The identified language's code, or ``None`` when
            ``instore`` is of another type or no text was collected.
        """
        if not isinstance(instore, (TranslationStore, list, tuple)):
            return None

        # NOTE(review): relies on ``instore[:limit]`` working for a
        # TranslationStore as well as for lists/tuples -- confirm.
        collected = []
        for unit in instore[:limit]:
            if not unit.istranslatable():
                continue
            value = getattr(unit, attr)
            if value:
                collected.append(value)

        text = u' '.join(collected)
        if not text:
            return None
        return self.identify_lang(text)

    def identify_source_lang(self, instore):
        """Identify the source language of the given translation store or
            units.

            :type  instore: ``TranslationStore`` or list or tuple of
                ``TranslationUnit``s.
            :param instore: The translation store to extract source text from.
            :returns: The identified language's code or ``None`` if the language
                could not be identified."""
        return self._identify_store_lang(instore, 'source', 50)

    def identify_target_lang(self, instore):
        """Identify the target language of the given translation store or
            units.

            :type  instore: ``TranslationStore`` or list or tuple of
                ``TranslationUnit``s.
            :param instore: The translation store to extract target text from.
            :returns: The identified language's code or ``None`` if the language
                could not be identified."""
        return self._identify_store_lang(instore, 'target', 200)