def _scan_country(self, country, strict=False):
    """
    Find a country if it is at the start or end of country string.

    The words yielded by ``iter_words`` for the lowercased string are
    accumulated (concatenated as-is) first from the beginning, then from the
    end, and every accumulated candidate is tried as a strict ``Country``.

    :param country: text to scan for a leading or trailing country
    :param strict: only forwarded to the final fallback lookup performed on
        the whole, unmodified ``country`` string
    :return: ``(Country, (start, end))`` for the first candidate accepted,
        where the offsets come from the word matches; otherwise
        ``(Country(country, strict=strict), None)``
    :raises: whatever the fallback ``Country`` lookup raises is propagated
        (presumably ``ValueError`` when ``strict`` is true -- confirm against
        ``Country``)
    """
    words_match = list(iter_words(country.lower()))

    # Grow a candidate from the first word onwards.
    s = ""
    start = None
    for word_match in words_match:
        # Compare against None: a first word located at offset 0 is falsy and
        # would otherwise have its start overwritten by the following word.
        if start is None:
            start = word_match.start(0)
        s += word_match.group(0)
        try:
            return Country(s, strict=True), (start, word_match.end(0))
        except ValueError:
            continue

    # Grow a candidate from the last word backwards.
    s = ""
    end = None
    for word_match in reversed(words_match):
        if end is None:
            end = word_match.end(0)
        s = word_match.group(0) + s
        try:
            return Country(s, strict=True), (word_match.start(0), end)
        except ValueError:
            continue

    # Nothing matched at either edge: fall back to the whole string.
    return Country(country, strict=strict), None
def __init__(self, language, country=None, strict=False, scheme=None):
    """
    Build a language from a free-form string.

    The input is stripped and lowercased before any lookup. When it matches
    one of the ``Language._with_country_regexp*`` patterns, group 1 is
    resolved as the language and group 2 as the country; the ``country``,
    ``strict`` and ``scheme`` arguments are not consulted in that case.

    :param language: language code or name to identify
    :param country: optional country, wrapped in ``Country`` when truthy
    :param strict: raise instead of falling back to ``'und'`` when the
        language cannot be identified
    :param scheme: optional naming scheme; only ``'opensubtitles'`` is
        handled, any other non-None value logs a warning
    :raises ValueError: if ``strict`` is true and no language was found
    """
    language = u(language.strip().lower())

    combined = Language._with_country_regexp.match(language)
    if not combined:
        combined = Language._with_country_regexp2.match(language)
    if combined:
        self.lang = Language(combined.group(1)).lang
        self.country = Country(combined.group(2))
        return

    self.lang = None
    self.country = Country(country) if country else None

    # first look for scheme specific languages
    if scheme == 'opensubtitles':
        scheme_specific = {'br': 'bre', 'se': 'sme'}.get(language)
        if scheme_specific is not None:
            self.lang = scheme_specific
            return
    elif scheme is not None:
        log.warning(
            'Unrecognized scheme: "%s" - Proceeding with standard one' % scheme)

    # look for ISO language codes
    code_length = len(language)
    if code_length == 2:
        self.lang = lng2_to_lng3.get(language)
    elif code_length == 3:
        if language in lng3:
            self.lang = language
        else:
            self.lang = lng3term_to_lng3.get(language)
    else:
        by_english_name = lng_en_name_to_lng3.get(language)
        self.lang = by_english_name or lng_fr_name_to_lng3.get(language)

    # general language exceptions
    if self.lang is None and language in lng_exceptions:
        exception_lang, exception_country = lng_exceptions[language]
        self.lang = Language(exception_lang).alpha3
        self.country = Country(exception_country) if exception_country else None

    msg = 'The given string "%s" could not be identified as a language' % language

    if self.lang is None:
        if strict:
            raise ValueError(msg)
        # non-strict mode: record the failure and mark the language as 'und'
        log.debug(msg)
        self.lang = 'und'
def process(mtree):
    """
    Attach a country guess to unidentified leaves that are an explicit group.

    Only leaves whose ``node_idx`` has length 2 are considered. The leaf
    value must be wrapped in ``()``, ``[]`` or ``{}`` and the text between
    the delimiters must be accepted by a strict ``Country`` lookup; the guess
    is then stored on ``node.guess`` with confidence 1.0.

    :param mtree: match tree providing ``unidentified_leaves()``
    """
    for node in mtree.unidentified_leaves():
        if len(node.node_idx) != 2:
            continue

        value = node.value

        # only keep explicit groups (enclosed in parentheses/brackets);
        # checked before the lookup because it is cheap, and done with slices
        # so an empty value is simply rejected
        if value[:1] + value[-1:] not in ('()', '[]', '{}'):
            continue

        # keep the try body minimal so that only a failed country lookup is
        # ignored, not an error raised while building the guess
        try:
            country = Country(value[1:-1], strict=True)
        except ValueError:
            continue

        node.guess = Guess(country=country, confidence=1.0)
def process(self, mtree, options=None):
    """
    Look for countries in the match tree.

    First runs a ``GuessFinder`` built around ``self.guess_country`` over the
    unidentified leaves. Then, for every leaf containing a ``language`` guess
    whose lowercased clean value is listed in ``self.replace_language``, the
    language is reset to None and, if the same text is a valid strict
    ``Country``, a country guess with confidence 0.9 is reported through
    ``found_guess``.

    :param mtree: match tree to process
    :param options: passed through to ``GuessFinder``
    """
    finder = GuessFinder(self.guess_country, None, self.log, options)
    finder.process_nodes(mtree.unidentified_leaves())

    for leaf in mtree.leaves_containing('language'):
        lowered = leaf.clean_value.lower()
        if lowered not in self.replace_language:
            continue

        # drop the language guess; it may be replaced by a country below
        leaf.guess.set('language', None)
        try:
            country = Country(lowered, strict=True)
            if not self.is_valid_country(country):
                continue
            guess = Guess(country=country, confidence=0.9,
                          input=leaf.value, span=leaf.span)
            found_guess(leaf, guess)
        except ValueError:
            # any ValueError raised above leaves the leaf without a country
            pass
def process(self, mtree, options=None):
    """
    Attach a country guess to unidentified leaves that are an explicit group.

    Only leaves whose ``node_idx`` has length 2 are considered. The value,
    stripped of its first and last character and lowercased, must not be in
    ``self.country_common_words``, the node must report ``is_explicit()``,
    and the text must be accepted by a strict ``Country`` lookup. The guess
    is stored on ``node.guess`` with confidence 1.0.

    :param mtree: match tree providing ``unidentified_leaves()``
    :param options: not used by this method
    """
    for leaf in mtree.unidentified_leaves():
        if len(leaf.node_idx) != 2:
            continue

        candidate = leaf.value[1:-1].lower()
        if candidate in self.country_common_words:
            continue

        # only keep explicit groups (enclosed in parentheses/brackets)
        if not leaf.is_explicit():
            continue

        try:
            country = Country(candidate, strict=True)
        except ValueError:
            continue
        else:
            leaf.guess = Guess(country=country, confidence=1.0,
                               input=leaf.value, span=leaf.span)
def process(mtree):
    """
    Attach a country guess to unidentified leaves that are an explicit group.

    Only leaves whose ``node_idx`` has length 2 are considered. The value,
    stripped of its first and last character and lowercased, must not be in
    ``country_common_words``, the value must be wrapped in ``()``, ``[]`` or
    ``{}``, and the inner text must be accepted by a strict ``Country``
    lookup. The guess is stored on ``node.guess`` with confidence 1.0 and the
    lowercased inner text as ``raw``.

    :param mtree: match tree providing ``unidentified_leaves()``
    """
    for node in mtree.unidentified_leaves():
        if len(node.node_idx) != 2:
            continue

        value = node.value
        c = value[1:-1].lower()
        if c in country_common_words:
            continue

        # only keep explicit groups (enclosed in parentheses/brackets);
        # slices instead of indexes so an empty value is rejected rather
        # than raising IndexError
        if value[:1] + value[-1:] not in ('()', '[]', '{}'):
            continue

        try:
            country = Country(c, strict=True)
        except ValueError:
            continue

        node.guess = Guess(country=country, confidence=1.0, raw=c)