예제 #1
0
    def _scan_country(self, country, strict=False):
        """
        Find a country if it is at the start or end of country string.

        The words yielded by ``iter_words`` are concatenated one at a time,
        first from the beginning of the string and then from its end; after
        each added word a strict ``Country`` lookup is attempted on the
        accumulated text.

        :param country: the string to scan (it is lower-cased before scanning)
        :param strict: only used for the final fallback lookup on the whole
            string; the incremental lookups are always strict
        :return: ``(Country, (start, end))`` where the span gives the offsets
            of the matched words inside the lower-cased string, or
            ``(Country(country, strict=strict), None)`` when neither scan
            found a match
        """
        words_match = list(iter_words(country.lower()))

        # Forward scan: grow the candidate from the first word onwards.
        s = ""
        start = None
        for word_match in words_match:
            # Compare against None explicitly: offset 0 is a valid start and
            # must not be overwritten by the start of a later word.
            if start is None:
                start = word_match.start(0)
            s += word_match.group(0)
            try:
                return Country(s, strict=True), (start, word_match.end(0))
            except ValueError:
                # Not recognised yet; try again with one more word.
                continue

        # Backward scan: grow the candidate from the last word backwards.
        s = ""
        end = None
        for word_match in reversed(words_match):
            if end is None:
                end = word_match.end(0)
            s = word_match.group(0) + s
            try:
                return Country(s, strict=True), (word_match.start(0), end)
            except ValueError:
                continue

        # No prefix or suffix matched: defer to a lookup on the raw input.
        return Country(country, strict=strict), None
예제 #2
0
    def __init__(self, language, country=None, strict=False, scheme=None):
        """
        Build a language from a free-form code or name.

        :param language: language code or name; it is stripped and
            lower-cased before any lookup
        :param country: optional value handed to ``Country`` when truthy
        :param strict: when true, raise instead of falling back to ``'und'``
        :param scheme: optional naming scheme; ``'opensubtitles'`` gets
            special handling, any other non-None value is logged and ignored
        :raises ValueError: if ``strict`` is true and nothing matched
        """
        language = u(language.strip().lower())

        # Combined "language + country" forms are split and resolved
        # separately; nothing else applies to them.
        combined = Language._with_country_regexp.match(language)
        if not combined:
            combined = Language._with_country_regexp2.match(language)
        if combined:
            self.lang = Language(combined.group(1)).lang
            self.country = Country(combined.group(2))
            return

        self.lang = None
        self.country = None if not country else Country(country)

        # Scheme-specific codes take precedence over the standard tables.
        if scheme == 'opensubtitles':
            override = {'br': 'bre', 'se': 'sme'}.get(language)
            if override is not None:
                self.lang = override
                return
        elif scheme is not None:
            log.warning(
                'Unrecognized scheme: "%s" - Proceeding with standard one' %
                scheme)

        # Standard lookup: 2-letter code, 3-letter code, then full names.
        size = len(language)
        if size == 2:
            resolved = lng2_to_lng3.get(language)
        elif size == 3:
            if language in lng3:
                resolved = language
            else:
                resolved = lng3term_to_lng3.get(language)
        else:
            resolved = (lng_en_name_to_lng3.get(language)
                        or lng_fr_name_to_lng3.get(language))
        self.lang = resolved

        # Known exceptions map a name to a (language, country) pair.
        if self.lang is None and language in lng_exceptions:
            exc_lang, exc_country = lng_exceptions[language]
            self.lang = Language(exc_lang).alpha3
            self.country = Country(exc_country) if exc_country else None

        if self.lang is None:
            msg = 'The given string "%s" could not be identified as a language' % language
            if strict:
                raise ValueError(msg)
            # Non-strict mode: record the miss and use the 'und' code.
            log.debug(msg)
            self.lang = 'und'
예제 #3
0
def process(mtree):
    """
    Attach a country guess to bracketed leaves that name a country.

    Only unidentified leaves whose ``node_idx`` has length 2 are examined.
    A leaf receives ``Guess(country=..., confidence=1.0)`` when its value is
    enclosed in ``()``, ``[]`` or ``{}`` and a strict ``Country`` lookup of
    the enclosed text succeeds; every other leaf is left untouched.

    :param mtree: object exposing ``unidentified_leaves()``
    """
    for node in mtree.unidentified_leaves():
        if len(node.node_idx) != 2:
            continue

        # only keep explicit groups (enclosed in parentheses/brackets);
        # slicing instead of indexing keeps an empty value from raising
        value = node.value
        if value[:1] + value[-1:] not in ('()', '[]', '{}'):
            continue

        # Keep the try body minimal so that only a failed country lookup is
        # treated as "no country here".
        try:
            country = Country(value[1:-1], strict=True)
        except ValueError:
            continue

        node.guess = Guess(country=country, confidence=1.0)
예제 #4
0
 def process(self, mtree, options=None):
     """
     Guess countries on unidentified leaves, then revisit language leaves.

     First a ``GuessFinder`` built around ``self.guess_country`` is run over
     the unidentified leaves. Then, for every leaf containing a 'language'
     whose lower-cased clean value is listed in ``self.replace_language``,
     the language is reset to None and, if the value is a valid country, a
     country guess with confidence 0.9 is registered through ``found_guess``.
     """
     finder = GuessFinder(self.guess_country, None, self.log, options)
     finder.process_nodes(mtree.unidentified_leaves())

     for leaf in mtree.leaves_containing('language'):
         candidate = leaf.clean_value.lower()
         if candidate not in self.replace_language:
             continue

         # The language guess is dropped even if no country is found below.
         leaf.guess.set('language', None)
         try:
             country = Country(candidate, strict=True)
             if not self.is_valid_country(country):
                 continue
             country_guess = Guess(country=country,
                                   confidence=0.9,
                                   input=leaf.value,
                                   span=leaf.span)
             found_guess(leaf, country_guess)
         except ValueError:
             continue
예제 #5
0
    def process(self, mtree, options=None):
        """
        Attach a country guess to explicit groups that name a country.

        Only unidentified leaves whose ``node_idx`` has length 2 are
        examined. The value with its first and last characters removed,
        lower-cased, is the candidate; it is skipped when listed in
        ``self.country_common_words`` or when the leaf is not explicit.
        A successful strict ``Country`` lookup sets ``leaf.guess`` with
        confidence 1.0.
        """
        for leaf in mtree.unidentified_leaves():
            if len(leaf.node_idx) != 2:
                continue

            candidate = leaf.value[1:-1].lower()
            # Common words are rejected first; after that only explicit
            # groups (enclosed in parentheses/brackets) are kept.
            if candidate in self.country_common_words or not leaf.is_explicit():
                continue

            try:
                found = Country(candidate, strict=True)
            except ValueError:
                continue
            else:
                leaf.guess = Guess(country=found, confidence=1.0, input=leaf.value, span=leaf.span)
예제 #6
0
def process(mtree):
    """
    Attach a country guess to bracketed leaves that name a country.

    Only unidentified leaves whose ``node_idx`` has length 2 are examined.
    The value with its first and last characters removed, lower-cased, is
    the candidate; it is skipped when listed in ``country_common_words`` or
    when the value is not wrapped in ``()``, ``[]`` or ``{}``. A successful
    strict ``Country`` lookup sets ``leaf.guess`` with confidence 1.0 and
    the candidate text as ``raw``.
    """
    explicit_pairs = ('()', '[]', '{}')

    for leaf in mtree.unidentified_leaves():
        if len(leaf.node_idx) != 2:
            continue

        text = leaf.value
        inner = text[1:-1].lower()
        if inner in country_common_words:
            continue

        # only keep explicit groups (enclosed in parentheses/brackets)
        wrapper = text[0] + text[-1]
        if wrapper not in explicit_pairs:
            continue

        try:
            match = Country(inner, strict=True)
        except ValueError:
            continue

        leaf.guess = Guess(country=match, confidence=1.0, raw=inner)