def guess_language(string, node=None, options=None):
    """Search ``string`` for a language unless its directory looks like a language code.

    Args:
        string: text to scan for a language.
        node: optional object exposing ``ancestors``; each ancestor is read
            for ``category`` and, for the first one whose category is
            ``'path'``, ``clean_value``.
        options: optional mapping; its ``'allowed_languages'`` entry, when
            present, is forwarded to ``search_language``.

    Returns:
        ``None`` when the first ``'path'`` ancestor has a ``clean_value`` of
        three characters or fewer; otherwise whatever ``search_language``
        returns.
    """
    allowed_languages = None
    if options and 'allowed_languages' in options:
        allowed_languages = options.get('allowed_languages')

    # ``node`` defaults to None and a node may have no 'path' ancestor at all;
    # in both cases there is no directory to inspect, so go straight to the
    # language search instead of raising AttributeError / IndexError.
    ancestors = node.ancestors if node is not None else ()
    directory = next(
        (ancestor for ancestor in ancestors if ancestor.category == 'path'),
        None,
    )
    if directory is not None and len(directory.clean_value) <= 3:
        # skip if we have a language code as directory
        return None

    return search_language(string, allowed_languages)
def guess_language(string):
    """Find a language in ``string`` and tag it as audio or subtitle language.

    The match is reported under ``'subtitleLanguage'`` when the cleaned,
    lower-cased text preceding it contains the standalone word ``'sub'``,
    and under ``'language'`` otherwise.

    Returns:
        ``(Guess, span)`` when ``search_language`` finds a language,
        ``(None, None)`` otherwise.
    """
    language, span, confidence = search_language(string)
    if not language:
        return None, None

    # Words located before the match; a lone 'sub' marks a subtitle language.
    preceding_words = clean_string(string[:span[0]]).lower().split(' ')
    key = 'subtitleLanguage' if 'sub' in preceding_words else 'language'
    return (Guess({key: language}, confidence=confidence), span)
def guess_language(string, node, skip=None):
    """Search ``string`` for a language, ignoring regions listed in ``skip``.

    Args:
        string: text to scan.
        node: object providing ``node_idx`` and ``offset``.
        skip: optional iterable of mappings with ``'node_idx'`` and ``'span'``
            keys. Only entries whose ``'node_idx'`` is a prefix of
            ``node.node_idx`` are kept; their spans are shifted by
            ``-node.offset + 1`` before being handed to ``search_language``.

    Returns:
        ``(Guess, span)`` on a match (the guess carries the matched slice of
        ``string`` as ``raw``), ``(None, None)`` otherwise.
    """
    if skip:
        local_spans = []
        for item in skip:
            prefix = item['node_idx']
            absolute = item['span']
            if prefix != node.node_idx[:len(prefix)]:
                continue
            # Re-express the span relative to this node.
            local_spans.append(
                tuple(absolute[i] - node.offset + 1 for i in (0, 1))
            )
        skip = local_spans

    language, span, confidence = search_language(string, skip=skip)
    if not language:
        return None, None

    matched_text = string[span[0]:span[1]]
    guess = Guess({'language': language}, confidence=confidence,
                  raw=matched_text)
    return (guess, span)
def guess_language(string, node, skip=None):
    """Look for a language in ``string`` while honouring ``skip`` regions.

    Entries of ``skip`` (mappings with ``'node_idx'`` and ``'span'``) are
    kept only when their ``'node_idx'`` equals the leading part of
    ``node.node_idx``; each kept span is translated by ``-node.offset + 1``
    and the resulting list is passed to ``search_language``.

    Returns:
        A ``(Guess, span)`` pair when a language is found, where the guess
        records the matched substring as ``raw``; ``(None, None)`` otherwise.
    """
    if skip:
        translated = []
        for record in skip:
            owner_idx = record['node_idx']
            region = record['span']
            belongs_here = owner_idx == node.node_idx[:len(owner_idx)]
            if belongs_here:
                first = region[0] - node.offset + 1
                last = region[1] - node.offset + 1
                translated.append((first, last))
        skip = translated

    language, span, confidence = search_language(string, skip=skip)
    if language:
        found = Guess(
            {'language': language},
            confidence=confidence,
            raw=string[span[0]:span[1]],
        )
        return (found, span)
    return None, None
def guess_language(string):
    """Return whatever ``search_language`` reports for ``string``."""
    return search_language(string)
def guess_language(self, string, node=None, options=None):
    """Search ``string`` for a language, optionally restricted by ``options``.

    Args:
        string: text to scan.
        node: accepted but not used by this implementation.
        options: optional mapping; when it contains ``'allowed_languages'``
            that value is forwarded to ``search_language``, else ``None`` is.

    Returns:
        The result of ``search_language`` unchanged.
    """
    has_restriction = options and 'allowed_languages' in options
    allowed_languages = (
        options.get('allowed_languages') if has_restriction else None
    )
    return search_language(string, allowed_languages)
def guess_language(string):
    """Wrap the language found in ``string`` (if any) into a ``Guess``.

    Returns:
        ``(Guess, span)`` when ``search_language`` finds a language,
        ``(None, None)`` otherwise.
    """
    language, span, confidence = search_language(string)
    if not language:
        return None, None

    found = Guess({'language': language}, confidence=confidence)
    return (found, span)
def guess_language(self, string, node=None, options=None):
    """Return the result of ``search_language(string)``.

    ``node`` and ``options`` are accepted but not used by this
    implementation.
    """
    return search_language(string)
def guess_language(string, node=None, options=None):
    """Search ``string`` for a language, optionally restricted by ``options``.

    Args:
        string: text to scan.
        node: accepted but not used by this implementation.
        options: optional mapping; its ``'allowed_languages'`` entry, when
            present, is forwarded to ``search_language`` (``None`` otherwise).

    Returns:
        The result of ``search_language`` unchanged.
    """
    if options and 'allowed_languages' in options:
        restriction = options.get('allowed_languages')
    else:
        restriction = None
    return search_language(string, restriction)
def guess_language(string):
    """Turn the language detected in ``string`` into a ``Guess``.

    Returns:
        A ``(Guess, span)`` pair when a language is detected, or
        ``(None, None)`` when ``search_language`` reports none.
    """
    language, span, confidence = search_language(string)
    if language:
        payload = {"language": language}
        return (Guess(payload, confidence=confidence), span)
    return None, None
def guess_groups(string, result, filetype):
    """Split ``string`` into groups and attach the guess found for each one.

    Successive detectors (date, year, video/episode regexps, websites,
    release groups, properties, weak episode numbers, languages) are run
    over a working copy of ``string``. Every hit is appended to ``result``
    and its span is blanked in the working copy via ``blank_region`` so
    later detectors do not match it again.

    Args:
        string: the text to analyse.
        result: list that receives every guess found (mutated in place).
        filetype: one of ``movie``, ``moviesubtitle``, ``episode``,
            ``episodesubtitle``; episode types enable the episode
            detectors and disable the year detector.

    Returns:
        ``zip(string_groups, remaining_groups, guesses)`` where the first
        two come from ``split_on_groups`` applied to the original and to
        the blanked text, and ``guesses`` holds the guess whose region
        starts at each group's position (``None`` for leftover groups).
    """
    # add sentinels so we can match a separator char at either end of
    # our groups, even when they are at the beginning or end of the string
    # we will adjust the span accordingly later
    current = " " + string + " "

    regions = []  # list of ((start, end), guess) for matched regions

    def guessed(match_dict, confidence):
        """Format a guess, record it in ``result`` and return it."""
        guess = format_guess(Guess(match_dict, confidence=confidence))
        result.append(guess)
        log.debug("Found with confidence %.2f: %s" % (confidence, guess))
        return guess

    def update_found(text, guess, span, span_adjust=(0, 0)):
        """Register the (adjusted) span for ``guess`` and blank it in ``text``."""
        span = (span[0] + span_adjust[0], span[1] + span_adjust[1])
        regions.append((span, guess))
        return blank_region(text, span)

    # try to find dates first, as they are very specific
    date, span = search_date(current)
    if date:
        guess = guessed({"date": date}, confidence=1.0)
        current = update_found(current, guess, span)

    # for non episodes only, look for year information
    if filetype not in ("episode", "episodesubtitle"):
        year, span = search_year(current)
        if year:
            guess = guessed({"year": year}, confidence=1.0)
            current = update_found(current, guess, span)

    # specific regexps (ie: cd number, season X episode, ...)
    for rexp, confidence, span_adjust in video_rexps:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            # is this the better place to put it? (maybe, as it is at least
            # the soonest that we can catch it)
            if "cdNumberTotal" in metadata and metadata["cdNumberTotal"] is None:
                del metadata["cdNumberTotal"]
            guess = guessed(metadata, confidence=confidence)
            current = update_found(current, guess, match.span(), span_adjust)

    if filetype in ("episode", "episodesubtitle"):
        for rexp, confidence, span_adjust in episode_rexps:
            match = re.search(rexp, current, re.IGNORECASE)
            if match:
                metadata = match.groupdict()
                guess = guessed(metadata, confidence=confidence)
                current = update_found(current, guess, match.span(), span_adjust)

    # Now websites, but as exact string instead of regexps
    # These are exact matches, so give them an explicit full confidence
    # rather than reusing whatever ``confidence`` the regexp loops above
    # happened to leave behind (which is unbound when those loops never ran).
    website_confidence = 1.0
    clow = current.lower()
    for site in websites:
        pos = clow.find(site.lower())
        if pos != -1:
            guess = guessed({"website": site}, confidence=website_confidence)
            current = update_found(current, guess, (pos, pos + len(site)))
            clow = current.lower()

    # release groups have certain constraints, cannot be included in the
    # previous general regexps
    group_names = [
        r"\.(Xvid)-(?P<releaseGroup>.*?)[ \.]",
        r"\.(DivX)-(?P<releaseGroup>.*?)[\. ]",
        r"\.(DVDivX)-(?P<releaseGroup>.*?)[\. ]",
    ]
    for rexp in group_names:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            metadata.update({"videoCodec": match.group(1)})
            guess = guessed(metadata, confidence=0.8)
            # (1, -1) leaves the separator chars matched at both ends
            # outside the recorded region
            current = update_found(current, guess, match.span(), span_adjust=(1, -1))

    # common well-defined words and regexps
    confidence = 1.0  # for all of them
    for prop, value, pos, end in find_properties(current):
        guess = guessed({prop: value}, confidence=confidence)
        current = update_found(current, guess, (pos, end))

    # weak guesses for episode number, only run it if we don't have an
    # estimate already
    if filetype in ("episode", "episodesubtitle"):
        if not any("episodeNumber" in match for match in result):
            for rexp, _, span_adjust in weak_episode_rexps:
                match = re.search(rexp, current, re.IGNORECASE)
                if match:
                    metadata = match.groupdict()
                    epnum = int(metadata["episodeNumber"])
                    if epnum > 100:
                        # e.g. 312 is read as season 3, episode 12
                        guess = guessed(
                            {"season": epnum // 100, "episodeNumber": epnum % 100},
                            confidence=0.6,
                        )
                    else:
                        guess = guessed(metadata, confidence=0.3)
                    current = update_found(current, guess, match.span(), span_adjust)

    # try to find languages now
    language, span, confidence = search_language(current)
    while language:
        # is it a subtitle language?
        if "sub" in clean_string(current[: span[0]]).lower().split(" "):
            guess = guessed({"subtitleLanguage": language}, confidence=confidence)
        else:
            guess = guessed({"language": language}, confidence=confidence)
        current = update_found(current, guess, span)
        language, span, confidence = search_language(current)

    # remove our sentinels now and adjust spans accordingly
    # NOTE(review): relies on blank_region leaving the sentinels (and
    # presumably the string length) untouched -- confirm against blank_region
    assert current[0] == " " and current[-1] == " "
    current = current[1:-1]
    regions = [((start - 1, end - 1), guess) for (start, end), guess in regions]

    # split into '-' separated subgroups (with required separator chars
    # around the dash)
    didx = current.find("-")
    while didx > 0:
        regions.append(((didx, didx), None))
        didx = current.find("-", didx + 1)

    # cut our final groups, and rematch the guesses to the group that created
    # it, None if it is a leftover group
    region_spans = [span for span, guess in regions]
    string_groups = split_on_groups(string, region_spans)
    remaining_groups = split_on_groups(current, region_spans)
    guesses = []

    pos = 0
    for group in string_groups:
        found = False
        for span, guess in regions:
            if span[0] == pos:
                guesses.append(guess)
                found = True
        if not found:
            guesses.append(None)
        pos += len(group)

    return zip(string_groups, remaining_groups, guesses)