def guess_filetype(filename, filetype='autodetect'):
    """Guess the kind of file that ``filename`` designates.

    ``filetype`` is a hint from the caller; ``'autodetect'`` means no hint.

    Returns a ``(filetype, other)`` tuple. ``other`` is a dict holding
    ``'container'`` when the extension is a known subtitle/video one, and
    ``'extension'`` otherwise.
    """
    extension = os.path.splitext(filename)[1][1:].lower()

    # step 1: the extension tells subtitle / video / unknown apart
    if extension in subtitle_exts:
        # a movie/episode hint given by the caller is carried over
        if 'movie' in filetype:
            filetype = 'moviesubtitle'
        elif 'episode' in filetype:
            filetype = 'episodesubtitle'
        else:
            filetype = 'subtitle'
        other = {'container': extension}
    elif extension in video_exts:
        if filetype == 'autodetect':
            filetype = 'video'
        other = {'container': extension}
    else:
        if filetype == 'autodetect':
            filetype = 'unknown'
        other = {'extension': extension}

    # anything more specific than plain video/subtitle is already settled
    if filetype not in ('video', 'subtitle'):
        return filetype, other

    # step 2: look for hints that this is an episode rather than a movie
    has_episode_pattern = any(re.search(rexp, filename, re.IGNORECASE)
                              for rexp, _, _ in episode_rexps)
    has_dvb_property = any(canonical_form(value) == 'DVB'
                           for _, value, _, _ in find_properties(filename))

    if has_episode_pattern or has_dvb_property:
        filetype = 'episode' if filetype == 'video' else 'episodesubtitle'
    else:
        # no episode info found: assume it is a movie
        filetype = 'movie' if filetype == 'video' else 'moviesubtitle'

    return filetype, other
def guess_filetype(mtree, filetype):
    """Guess the kind of file described by the match tree ``mtree``.

    The full name is read from ``mtree.string`` and the path components
    from ``mtree.children`` (each exposing a ``value``). ``filetype`` is
    the caller's hint; ``'autodetect'`` means no hint.

    Returns a ``(filetype, other)`` tuple. ``other`` is a dict holding
    ``'container'`` for a known subtitle/video extension and
    ``'extension'`` otherwise.
    """
    filename = mtree.string

    # the current guess lives in a one-element list so that the nested
    # helper can rebind it (python 2 has no 'nonlocal' keyword)
    kind = [filetype]

    def promote(video_kind, subtitle_kind):
        # only a generic 'video'/'subtitle' can be refined; once a more
        # specific type has been chosen, later hints are ignored
        if kind[0] == 'video':
            kind[0] = video_kind
        elif kind[0] == 'subtitle':
            kind[0] = subtitle_kind

    def mark_episode():
        promote('episode', 'episodesubtitle')

    def mark_movie():
        promote('movie', 'moviesubtitle')

    # look at the extension first
    extension = os.path.splitext(filename)[1][1:].lower()
    if extension in subtitle_exts:
        # keep a movie/episode hint that the caller may have given
        if 'movie' in kind[0]:
            kind[0] = 'moviesubtitle'
        elif 'episode' in kind[0]:
            kind[0] = 'episodesubtitle'
        else:
            kind[0] = 'subtitle'
        other = {'container': extension}
    elif extension in video_exts:
        if kind[0] == 'autodetect':
            kind[0] = 'video'
        other = {'container': extension}
    else:
        if kind[0] == 'autodetect':
            kind[0] = 'unknown'
        other = {'extension': extension}

    # check whether we are in a 'Movies', 'Tv Shows', ... folder
    folder_hints = (
        (r'Movies?', mark_movie),
        (r'Tv[ _-]?Shows?', mark_episode),
        (r'Series', mark_episode),
    )
    for pattern, mark in folder_hints:
        for pathgroup in mtree.children:
            if re.match(pattern, pathgroup.value, re.IGNORECASE):
                mark()

    # a few known titles would confuse the heuristics below (eg: OSS 117
    # looks like season 1, episode 17, when it is in fact a movie)
    cleaned = clean_string(filename).lower()
    if any(title in cleaned for title in MOVIES):
        mark_movie()
    if any(title in cleaned for title in SERIES):
        mark_episode()

    # now look for specific hints for episode vs movie
    if kind[0] in ('video', 'subtitle'):
        # an episode_rexp (eg: s02e13) means it is an episode
        for rexp, _, _ in episode_rexps:
            if re.search(rexp, filename, re.IGNORECASE):
                mark_episode()
                break

        # a 3-4 digit number that is not a year may be season + episode
        number = re.search(r'[^0-9]([0-9]{3,4})[^0-9]', filename)
        if number:
            fullnumber = int(number.group(1))
            is_year = valid_year(fullnumber)
            # the last two digits are the would-be episode number
            if fullnumber % 100 <= 40 and not is_year:
                mark_episode()

        # some properties are characteristic of episodes
        for prop, value, _, _ in find_properties(filename):
            log.debug('prop: %s = %s' % (prop, value))
            if (prop == 'episodeFormat'
                    or compute_canonical_form('format', value) == 'DVB'):
                mark_episode()
                break

        # origin-specific type
        if 'tvu.org.ru' in filename:
            mark_episode()

        # if no episode info was found, assume it is a movie
        mark_movie()

    return kind[0], other
def guess_properties(string):
    """Return the first property found in ``string`` and its span.

    The result is ``({prop: value}, (start, end))``, or ``(None, None)``
    when ``find_properties`` yields nothing.
    """
    try:
        first = find_properties(string)[0]
    except IndexError:
        # nothing was found
        return None, None

    prop, value, start, end = first
    return {prop: value}, (start, end)
def guess_filetype(mtree, filetype):
    """Work out the file type for the match tree ``mtree``.

    ``mtree.string`` gives the full name and ``mtree.children`` the path
    components (each with a ``value``). ``filetype`` is the caller's hint,
    with ``'autodetect'`` meaning that nothing is known yet.

    Returns ``(filetype, other)``, where ``other`` is a dict with either a
    ``'container'`` key (known subtitle/video extension) or an
    ``'extension'`` key (anything else).
    """
    filename = mtree.string

    # python 2 has no 'nonlocal', so the value that the nested helper
    # needs to rebind is kept inside a one-element list
    state = [filetype]

    def refine(video_kind, subtitle_kind):
        # refinement applies to a generic type only: the first hint that
        # fires wins and subsequent ones leave the type untouched
        current = state[0]
        if current == 'video':
            state[0] = video_kind
        elif current == 'subtitle':
            state[0] = subtitle_kind

    def to_episode():
        refine('episode', 'episodesubtitle')

    def to_movie():
        refine('movie', 'moviesubtitle')

    # look at the extension first
    ext = os.path.splitext(filename)[1][1:].lower()
    if ext in subtitle_exts:
        # carry over a movie/episode hint given by the caller
        if 'movie' in state[0]:
            state[0] = 'moviesubtitle'
        elif 'episode' in state[0]:
            state[0] = 'episodesubtitle'
        else:
            state[0] = 'subtitle'
        other = {'container': ext}
    elif ext in video_exts:
        if state[0] == 'autodetect':
            state[0] = 'video'
        other = {'container': ext}
    else:
        if state[0] == 'autodetect':
            state[0] = 'unknown'
        other = {'extension': ext}

    # check whether we are in a 'Movies', 'Tv Shows', ... folder
    folder_hints = (
        (r'Movies?', to_movie),
        (r'Tv ?Shows?', to_episode),
        (r'Series', to_episode),
    )
    for pattern, apply_hint in folder_hints:
        for pathgroup in mtree.children:
            if re.match(pattern, pathgroup.value, re.IGNORECASE):
                apply_hint()

    # special-case a few titles that would mislead the heuristics below
    # (eg: OSS 117 looks like season 1, episode 17, but is a movie)
    lowered = clean_string(filename).lower()
    if any(title in lowered for title in MOVIES):
        to_movie()
    if any(title in lowered for title in SERIES):
        to_episode()

    # now look for specific hints for episode vs movie
    if state[0] in ('video', 'subtitle'):
        # an episode_rexp (eg: s02e13) means it is an episode
        for rexp, _, _ in episode_rexps:
            if re.search(rexp, filename, re.IGNORECASE):
                to_episode()
                break

        # a 3-4 digit number that is not a year may be season + episode
        number = re.search(r'[^0-9]([0-9]{3,4})[^0-9]', filename)
        if number:
            fullnumber = int(number.group(1))
            is_year = valid_year(fullnumber)
            # the last two digits are the would-be episode number
            if fullnumber % 100 <= 40 and not is_year:
                to_episode()

        # some properties are characteristic of episodes
        for prop, value, _, _ in find_properties(filename):
            log.debug('prop: %s = %s' % (prop, value))
            if prop == 'episodeFormat' or canonical_form(value) == 'DVB':
                to_episode()
                break

        # origin-specific type
        if 'tvu.org.ru' in filename:
            to_episode()

        # if no episode info was found, assume it is a movie
        to_movie()

    return state[0], other
def guess_filetype(filename, filetype):
    """Guess the kind of file that ``filename`` designates.

    ``filetype`` is the caller's hint; ``'autodetect'`` means no hint.

    Returns a ``(filetype, other)`` tuple. ``other`` is a dict holding
    ``'container'`` for a known subtitle/video extension and
    ``'extension'`` otherwise.
    """
    # look at the extension first
    extension = os.path.splitext(filename)[1][1:].lower()
    if extension in subtitle_exts:
        # carry over a movie/episode hint given by the caller
        if 'movie' in filetype:
            filetype = 'moviesubtitle'
        elif 'episode' in filetype:
            filetype = 'episodesubtitle'
        else:
            filetype = 'subtitle'
        other = {'container': extension}
    elif extension in video_exts:
        if filetype == 'autodetect':
            filetype = 'video'
        other = {'container': extension}
    else:
        if filetype == 'autodetect':
            filetype = 'unknown'
        other = {'extension': extension}

    # only a generic video/subtitle can still be refined
    if filetype not in ('video', 'subtitle'):
        return filetype, other

    # now look for specific hints for episode vs movie
    is_episode = False
    for rexp, _, _ in episode_rexps:
        if re.search(rexp, filename, re.IGNORECASE):
            is_episode = True
            break

    # properties are scanned (and logged) even after a regexp has matched
    for prop, value, _, _ in find_properties(filename):
        log.debug('prop: %s = %s' % (prop, value))
        if prop == 'episodeFormat' or canonical_form(value) == 'DVB':
            is_episode = True
            break

    if is_episode:
        filetype = 'episode' if filetype == 'video' else 'episodesubtitle'
    else:
        # if no episode info was found, assume it is a movie
        filetype = 'movie' if filetype == 'video' else 'moviesubtitle'

    return filetype, other
def guess_filetype(filename, filetype):
    """Guess the kind of file that ``filename`` designates.

    ``filetype`` is the caller's hint; ``'autodetect'`` means no hint.

    Returns a ``(filetype, other)`` tuple. ``other`` is a dict holding
    ``'container'`` for a known subtitle/video extension and
    ``'extension'`` otherwise.
    """
    # look at the extension first
    suffix = os.path.splitext(filename)[1][1:].lower()
    if suffix in subtitle_exts:
        # carry over a movie/episode hint given by the caller
        if 'movie' in filetype:
            filetype = 'moviesubtitle'
        elif 'episode' in filetype:
            filetype = 'episodesubtitle'
        else:
            filetype = 'subtitle'
        other = {'container': suffix}
    elif suffix in video_exts:
        if filetype == 'autodetect':
            filetype = 'video'
        other = {'container': suffix}
    else:
        if filetype == 'autodetect':
            filetype = 'unknown'
        other = {'extension': suffix}

    # only a generic video/subtitle can still be refined
    if filetype not in ('video', 'subtitle'):
        return filetype, other

    # now look for specific hints for episode vs movie
    episode_hint = False
    for rexp, _, _ in episode_rexps:
        if re.search(rexp, filename, re.IGNORECASE):
            episode_hint = True
            break

    # properties are scanned (and logged) even after a regexp has matched
    for prop, value, _, _ in find_properties(filename):
        log.debug('prop: %s = %s' % (prop, value))
        if prop == 'episodeFormat' or canonical_form(value) == 'DVB':
            episode_hint = True
            break

    # origin-specific type
    if 'tvu.org.ru' in filename:
        episode_hint = True

    if episode_hint:
        filetype = 'episode' if filetype == 'video' else 'episodesubtitle'
    else:
        # if no episode info was found, assume it is a movie
        filetype = 'movie' if filetype == 'video' else 'moviesubtitle'

    return filetype, other
def guess_groups(string, result, filetype):
    """Find known patterns in ``string`` and cut it into groups around them.

    Every guess found is appended to ``result`` (the list is mutated in
    place). ``filetype`` selects which searches run: the year search is
    skipped for ``'episode'``/``'episodesubtitle'``, while the episode
    regexps and the weak episode-number guesses only run for those two.

    Returns ``zip(string_groups, remaining_groups, guesses)``:
    ``string_groups`` comes from splitting the original string,
    ``remaining_groups`` from splitting the working copy in which matched
    regions went through ``blank_region``, and ``guesses`` holds, for each
    group, the guess whose region starts at that group's position (or
    ``None`` for a leftover group).
    """
    # add sentinels so we can match a separator char at either end of
    # our groups, even when they are at the beginning or end of the string
    # we will adjust the span accordingly later
    current = " " + string + " "

    regions = []  # list of ((start, end), guess) of matched regions

    def guessed(match_dict, confidence):
        # build the guess, record it in the caller's result list and log it
        guess = format_guess(Guess(match_dict, confidence=confidence))
        result.append(guess)
        log.debug("Found with confidence %.2f: %s" % (confidence, guess))
        return guess

    def update_found(string, guess, span, span_adjust=(0, 0)):
        # remember the (adjusted) region and return the string with that
        # region passed through blank_region
        span = (span[0] + span_adjust[0], span[1] + span_adjust[1])
        regions.append((span, guess))
        return blank_region(string, span)

    # try to find dates first, as they are very specific
    date, span = search_date(current)
    if date:
        guess = guessed({"date": date}, confidence=1.0)
        current = update_found(current, guess, span)

    # for non episodes only, look for year information
    if filetype not in ("episode", "episodesubtitle"):
        year, span = search_year(current)
        if year:
            guess = guessed({"year": year}, confidence=1.0)
            current = update_found(current, guess, span)

    # specific regexps (ie: cd number, season X episode, ...)
    for rexp, confidence, span_adjust in video_rexps:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            # drop the optional total when its group did not take part in
            # the match; this is the soonest point where it can be caught
            if "cdNumberTotal" in metadata and metadata["cdNumberTotal"] is None:
                del metadata["cdNumberTotal"]
            guess = guessed(metadata, confidence=confidence)
            current = update_found(current, guess, match.span(), span_adjust)

    if filetype in ("episode", "episodesubtitle"):
        for rexp, confidence, span_adjust in episode_rexps:
            match = re.search(rexp, current, re.IGNORECASE)
            if match:
                metadata = match.groupdict()
                guess = guessed(metadata, confidence=confidence)
                current = update_found(current, guess, match.span(), span_adjust)

    # Now websites, but as exact string instead of regexps.
    # The confidence is given explicitly: reusing the loop variable of the
    # regexp loops above would pass whatever value the last regexp carried,
    # or fail on an unbound name if those loops never iterated.
    clow = current.lower()
    for site in websites:
        pos = clow.find(site.lower())
        if pos != -1:
            guess = guessed({"website": site}, confidence=1.0)
            current = update_found(current, guess, (pos, pos + len(site)))
            clow = current.lower()

    # release groups have certain constraints, cannot be included in the
    # previous general regexps
    group_names = [
        r"\.(Xvid)-(?P<releaseGroup>.*?)[ \.]",
        r"\.(DivX)-(?P<releaseGroup>.*?)[\. ]",
        r"\.(DVDivX)-(?P<releaseGroup>.*?)[\. ]",
    ]
    for rexp in group_names:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            metadata.update({"videoCodec": match.group(1)})
            guess = guessed(metadata, confidence=0.8)
            # leave out the separator char matched on either side
            current = update_found(current, guess, match.span(), span_adjust=(1, -1))

    # common well-defined words and regexps
    confidence = 1.0  # for all of them
    for prop, value, pos, end in find_properties(current):
        guess = guessed({prop: value}, confidence=confidence)
        current = update_found(current, guess, (pos, end))

    # weak guesses for episode number, only run it if we don't have an
    # estimate already
    if filetype in ("episode", "episodesubtitle"):
        if not any("episodeNumber" in match for match in result):
            for rexp, _, span_adjust in weak_episode_rexps:
                match = re.search(rexp, current, re.IGNORECASE)
                if match:
                    metadata = match.groupdict()
                    epnum = int(metadata["episodeNumber"])
                    if epnum > 100:
                        # read a number such as 213 as season 2, episode 13
                        guess = guessed({"season": epnum // 100,
                                         "episodeNumber": epnum % 100},
                                        confidence=0.6)
                    else:
                        guess = guessed(metadata, confidence=0.3)
                    current = update_found(current, guess, match.span(), span_adjust)

    # try to find languages now
    language, span, confidence = search_language(current)
    while language:
        # is it a subtitle language? (a "sub" word appears before it)
        if "sub" in clean_string(current[: span[0]]).lower().split(" "):
            guess = guessed({"subtitleLanguage": language}, confidence=confidence)
        else:
            guess = guessed({"language": language}, confidence=confidence)
        current = update_found(current, guess, span)

        language, span, confidence = search_language(current)

    # remove our sentinels now and adjust spans accordingly
    assert current[0] == " " and current[-1] == " "
    current = current[1:-1]
    regions = [((start - 1, end - 1), guess) for (start, end), guess in regions]

    # split into '-' separated subgroups by registering an empty region at
    # each dash. The search starts at index 1 so that a dash in first
    # position is still not registered, but no longer stops the scan
    # before the following dashes are seen.
    didx = current.find("-", 1)
    while didx != -1:
        regions.append(((didx, didx), None))
        didx = current.find("-", didx + 1)

    # cut our final groups, and rematch the guesses to the group that
    # created it, None if it is a leftover group
    region_spans = [span for span, guess in regions]
    string_groups = split_on_groups(string, region_spans)
    remaining_groups = split_on_groups(current, region_spans)
    guesses = []

    pos = 0
    for group in string_groups:
        found = False
        for span, guess in regions:
            if span[0] == pos:
                guesses.append(guess)
                found = True
        if not found:
            guesses.append(None)

        pos += len(group)

    return zip(string_groups, remaining_groups, guesses)