def leftover_valid_groups(match_tree, valid=lambda s: len(s[0]) > 3):
    """Collect the groups of the match tree that are still unidentified.

    Every group yielded by ``iterate_groups`` that carries no guess has its
    remaining text cleaned with ``clean_string``; the resulting
    ``(cleaned_str, group_pos)`` pair is kept when ``valid`` accepts it.

    :param match_tree: the tree handed to ``iterate_groups``.
    :param valid: predicate called with a ``(cleaned_str, group_pos)`` pair;
        by default it keeps strings longer than 3 characters.
    :return: list of ``(cleaned_str, group_pos)`` pairs, in iteration order.
    """
    unmatched = (
        (clean_string(remaining), gpos)
        for gpos, (_group, remaining, guess) in iterate_groups(match_tree)
        if not guess
    )
    return [candidate for candidate in unmatched if valid(candidate)]
def second_pass_options(self, mtree, options=None):
    """Work out which language nodes a second matching pass should ignore.

    For each of the ``language`` and ``subtitleLanguage`` properties, the
    leaves of ``mtree`` holding that property are examined:

    * nodes flagged by ``self._skip_language_on_second_pass`` are skipped,
      unless the file is a subtitle and the language is the second-to-last
      word of the cleaned root string (just before the extension);
    * when the same language is found more than once, only the node with
      the highest confidence *for that property* is kept.

    Finally, every remaining language node whose raw value equals the value
    of a skipped node is skipped as well.

    :param mtree: match tree of the first pass; must provide ``matched()``
        and ``leaves_containing(key)``.
    :param options: unused here, kept for interface compatibility.
    :return: ``{'skip_nodes': [...]}`` when at least one node must be
        skipped, ``None`` otherwise.
    """
    m = mtree.matched()
    to_skip_language_nodes = []

    for lang_key in ('language', 'subtitleLanguage'):
        langs = {}
        lang_nodes = set(mtree.leaves_containing(lang_key))

        for lang_node in lang_nodes:
            lang = lang_node.guess.get(lang_key, None)
            if self._skip_language_on_second_pass(mtree, lang_node):
                # Language probably split the title. Add to skip for 2nd pass.

                # if filetype is subtitle and the language appears last, just
                # before the extension, then it is likely a subtitle language
                parts = clean_string(lang_node.root.value).split()
                # The membership test guards list.index(), which would raise
                # ValueError when the raw node value is not one of the cleaned
                # words.
                if (m.get('type') in ['moviesubtitle', 'episodesubtitle'] and
                        lang_node.value in parts and
                        parts.index(lang_node.value) == len(parts) - 2):
                    continue
                to_skip_language_nodes.append(lang_node)
            elif lang not in langs:
                langs[lang] = lang_node
            else:
                # The same language was found. Keep the more confident one,
                # and add others to skip for 2nd pass. The confidence must be
                # read for the property being processed, not always for
                # 'language'.
                existing_lang_node = langs[lang]
                if (existing_lang_node.guess.confidence(lang_key) >=
                        lang_node.guess.confidence(lang_key)):
                    # lang_node is to remove
                    to_skip = lang_node
                else:
                    # existing_lang_node is to remove
                    langs[lang] = lang_node
                    to_skip = existing_lang_node
                to_skip_language_nodes.append(to_skip)

    if to_skip_language_nodes:
        # Also skip same value nodes
        skipped_values = [skip_node.value for skip_node in to_skip_language_nodes]

        for lang_key in ('language', 'subtitleLanguage'):
            lang_nodes = set(mtree.leaves_containing(lang_key))

            for lang_node in lang_nodes:
                if (lang_node not in to_skip_language_nodes and
                        lang_node.value in skipped_values):
                    to_skip_language_nodes.append(lang_node)
        return {'skip_nodes': to_skip_language_nodes}
    return None
def guess_language(string):
    """Look for a language in ``string`` and wrap it in a ``Guess``.

    The language is reported under the ``subtitleLanguage`` key when the
    word ``sub`` appears (as a space-separated word of the cleaned,
    lower-cased text) before the matched span, and under ``language``
    otherwise.

    :return: ``(Guess, span)`` when ``search_language`` finds a language,
        ``(None, None)`` otherwise.
    """
    language, span, confidence = search_language(string)
    if not language:
        return None, None

    # is it a subtitle language?
    preceding_words = clean_string(string[:span[0]]).lower().split(' ')
    key = 'subtitleLanguage' if 'sub' in preceding_words else 'language'
    return (Guess({key: language}, confidence=confidence), span)
def second_pass_options(self, mtree, options=None):
    """Work out which language nodes a second matching pass should ignore.

    For each of the ``language`` and ``subtitleLanguage`` properties, the
    leaves of ``mtree`` holding that property are examined:

    * nodes flagged by ``self._skip_language_on_second_pass`` are skipped,
      unless the file is a subtitle and the language is the second-to-last
      word of the cleaned root string (just before the extension);
    * when the same language is found more than once, only the node with
      the highest confidence *for that property* is kept.

    Finally, every remaining language node whose raw value equals the value
    of a skipped node is skipped as well.

    :param mtree: match tree of the first pass; must provide ``matched()``
        and ``leaves_containing(key)``.
    :param options: unused here, kept for interface compatibility.
    :return: ``{'skip_nodes': [...]}`` when at least one node must be
        skipped, ``None`` otherwise.
    """
    m = mtree.matched()
    to_skip_language_nodes = []

    for lang_key in ('language', 'subtitleLanguage'):
        langs = {}
        lang_nodes = set(mtree.leaves_containing(lang_key))

        for lang_node in lang_nodes:
            lang = lang_node.guess.get(lang_key, None)
            if self._skip_language_on_second_pass(mtree, lang_node):
                # Language probably split the title. Add to skip for 2nd pass.

                # if filetype is subtitle and the language appears last, just
                # before the extension, then it is likely a subtitle language
                parts = clean_string(lang_node.root.value).split()
                # The membership test guards list.index(), which would raise
                # ValueError when the raw node value is not one of the cleaned
                # words.
                if (m.get('type') in ['moviesubtitle', 'episodesubtitle'] and
                        lang_node.value in parts and
                        parts.index(lang_node.value) == len(parts) - 2):
                    continue
                to_skip_language_nodes.append(lang_node)
            elif lang not in langs:
                langs[lang] = lang_node
            else:
                # The same language was found. Keep the more confident one,
                # and add others to skip for 2nd pass. The confidence must be
                # read for the property being processed, not always for
                # 'language'.
                existing_lang_node = langs[lang]
                if (existing_lang_node.guess.confidence(lang_key) >=
                        lang_node.guess.confidence(lang_key)):
                    # lang_node is to remove
                    to_skip = lang_node
                else:
                    # existing_lang_node is to remove
                    langs[lang] = lang_node
                    to_skip = existing_lang_node
                to_skip_language_nodes.append(to_skip)

    if to_skip_language_nodes:
        # Also skip same value nodes
        skipped_values = [skip_node.value for skip_node in to_skip_language_nodes]

        for lang_key in ('language', 'subtitleLanguage'):
            lang_nodes = set(mtree.leaves_containing(lang_key))

            for lang_node in lang_nodes:
                if (lang_node not in to_skip_language_nodes and
                        lang_node.value in skipped_values):
                    to_skip_language_nodes.append(lang_node)
        return {'skip_nodes': to_skip_language_nodes}
    return None
def format_guess(guess):
    """Convert the values of ``guess`` to their natural type, in place.

    Numbering properties (season, episode number, year, cd numbers) are
    converted with ``int``; every other string value is replaced by its
    canonical form, the ``edition`` value being cleaned first.

    The given dictionary is modified and also returned.
    """
    numeric_props = ("season", "episodeNumber", "year", "cdNumber", "cdNumberTotal")
    for prop, value in guess.items():
        if prop in numeric_props:
            guess[prop] = int(guess[prop])
            continue
        if not isinstance(value, basestring):
            continue
        text = clean_string(value) if prop in ("edition",) else value
        guess[prop] = canonical_form(text)
    return guess
def format_guess(guess):
    """Convert the values of ``guess`` to their natural type, in place.

    Numbering properties (season, episode, year, cd/bonus/film numbers) go
    through ``parse_numeral``. Any other text value is replaced by its
    synonym with backslashes removed; the ``edition`` value is cleaned with
    ``clean_string`` beforehand.

    The given dictionary is modified and also returned.
    """
    numeral_props = ('season', 'episodeNumber', 'year', 'cdNumber',
                     'cdNumberTotal', 'bonusNumber', 'filmNumber')
    for prop, value in guess.items():
        if prop in numeral_props:
            guess[prop] = parse_numeral(guess[prop])
        elif isinstance(value, base_text_type):
            text = clean_string(value) if prop == 'edition' else value
            guess[prop] = get_synonym(text).replace('\\', '')
    return guess
def format_guess(guess):
    """Convert the values of ``guess`` to their natural type, in place.

    Numbering properties (season, episode, year, cd/bonus/film numbers) are
    converted with ``int``. Any other text value is replaced by its
    canonical form with backslashes removed; the ``edition`` value is
    cleaned with ``clean_string`` beforehand.

    The given dictionary is modified and also returned.
    """
    integer_props = ('season', 'episodeNumber', 'year', 'cdNumber',
                     'cdNumberTotal', 'bonusNumber', 'filmNumber')
    # Iterate over a snapshot of the items: values are reassigned in the loop.
    for prop, value in list(guess.items()):
        if prop in integer_props:
            guess[prop] = int(guess[prop])
            continue
        if isinstance(value, base_text_type):
            if prop == 'edition':
                value = clean_string(value)
            canonical = canonical_form(value)
            guess[prop] = canonical.replace('\\', '')
    return guess
def __init__(self, filename, filetype='autodetect', opts=None, transfo_opts=None):
    """Build the match tree for ``filename`` and run every transformer on it.

    :param filename: name (or path) to analyse. On Python 2 a byte string is
        decoded as UTF-8 after a warning is logged.
    :param filetype: one of ``autodetect``, ``subtitle``, ``info``,
        ``video``, ``movie``, ``moviesubtitle``, ``movieinfo``, ``episode``,
        ``episodesubtitle``, ``episodeinfo``.
    :param opts: list of option names (defaults to an empty list).
    :param transfo_opts: dict of ``{transfo_name: (args, kwargs)}``
        (defaults to an empty dict).
    :raises ValueError: if ``opts`` is not a list, ``transfo_opts`` is not a
        dict, or ``filetype`` is not a recognized value.

    A ``TransfoException`` raised while applying the transformers is caught
    and logged at debug level.
    """
    if opts is None:
        opts = []
    if not isinstance(opts, list):
        # The message has to be interpolated explicitly: exceptions, unlike
        # logging calls, do not apply %-formatting to their extra arguments.
        raise ValueError('opts must be a list of option names! Received: type=%s val=%s'
                         % (type(opts), opts))

    if transfo_opts is None:
        transfo_opts = {}
    if not isinstance(transfo_opts, dict):
        raise ValueError('transfo_opts must be a dict of { transfo_name: (args, kwargs) }. '
                         'Received: type=%s val=%s' % (type(transfo_opts), transfo_opts))

    valid_filetypes = ('autodetect', 'subtitle', 'info', 'video',
                       'movie', 'moviesubtitle', 'movieinfo',
                       'episode', 'episodesubtitle', 'episodeinfo')
    if filetype not in valid_filetypes:
        # Wrap the tuple in a 1-tuple, otherwise '%' treats each filetype as
        # a separate format argument and raises TypeError.
        raise ValueError("filetype needs to be one of %s" % (valid_filetypes,))

    if not PY3 and not isinstance(filename, unicode):
        log.warning('Given filename to matcher is not unicode...')
        filename = filename.decode('utf-8')

    filename = normalize_unicode(filename)

    self.filename = filename
    self.match_tree = MatchTree(filename)
    self.filetype = filetype
    self.opts = opts
    self.transfo_opts = transfo_opts
    self._transfo_calls = []

    # sanity check: make sure we don't process a (mostly) empty string
    if clean_string(filename) == '':
        return

    try:
        mtree = self.match_tree
        mtree.guess.set('type', filetype, confidence=1.0)

        for transformer in transformers.extensions.objects():
            self._apply_transfo(transformer)

        log.debug('Found match tree:\n%s' % u(mtree))
    except TransfoException as e:
        log.debug('An error has occurred in Transformer %s: %s' % (e.transformer, e))
def get_seasons_for_showid(sid, title=None):
    """Scrape the tvu.org.ru season page of show ``sid``.

    Each table found under the element with id ``main`` is read as one
    dubbing language: the language comes from the ``alt`` text of the image
    in the first row, and every row from the third one on describes a season.

    :param sid: show id, interpolated into the page URL.
    :param title: optional series name; when a season title starts with it
        (case-insensitively), that prefix is removed and the rest cleaned.
    :return: list of ``(dub_lang, seasons)`` pairs, where ``seasons`` is a
        list of ``(source, season, format, stitle, status, sub_lang, year,
        feedlink)`` tuples.
    """
    response = requests.get('http://tvu.org.ru/index.php?show=season&sid=%s' % sid)
    response.encoding = 'utf-8'

    feeds = []
    soup = BeautifulSoup(response.text)

    for table in soup.find(id='main').find_all('table'):
        rows = table('tr')
        dub_lang = rows[0].find('img')['alt']

        seasons = []
        for row in rows[2:]:
            cells = row('td')
            source = cells[0].text
            season_number = int(cells[1].text)
            media_format = cells[2].text

            anchor = cells[3].find('a')
            stitle = anchor.text.strip()
            # remove series name if it appears in front
            if title and stitle.lower().startswith(title.lower()):
                stitle = clean_string(stitle[len(title):])

            link = 'http://tvu.org.ru/' + anchor['href']
            feedid = link.split('=')[-1]
            feedlink = 'http://tvu.org.ru/rss.php?se_id=%s' % feedid
            status = cells[3].find('i').text

            subflag_td = cells[4].find('img')
            sub_lang = subflag_td['alt'] if subflag_td else None

            year = int(cells[5].text)
            seasons.append((source, season_number, media_format, stitle,
                            status, sub_lang, year, feedlink))

        feeds.append((dub_lang, seasons))

    return feeds
def __init__(self, filename, options=None, **kwargs):
    """Build the match tree for ``filename`` and run all transformers on it.

    :param filename: name (or path) to analyse. On Python 2 a byte string is
        decoded as UTF-8 after a warning is logged.
    :param options: dict of options; it is copied, not modified.
    :param kwargs: extra options, used only for keys that are missing from
        ``options`` or hold a falsy value there.

    The merged options are checked by ``self._validate_options``. Every
    transformer is then run twice: a processing pass followed by a
    post-processing pass. A ``TransformerException`` raised meanwhile is
    caught and logged at debug level.
    """
    options = dict(options or {})
    # options dict has priority over keyword arguments
    for name, value in kwargs.items():
        if not options.get(name):
            options[name] = value

    self._validate_options(options)

    needs_decoding = not PY3 and not isinstance(filename, unicode)
    if needs_decoding:
        log.warning('Given filename to matcher is not unicode...')
        filename = filename.decode('utf-8')
    filename = normalize_unicode(filename)

    self.match_tree = MatchTree(filename)
    self.options = options
    self._transfo_calls = []

    # sanity check: make sure we don't process a (mostly) empty string
    if clean_string(filename) == '':
        return

    from guessit.plugins import transformers

    try:
        mtree = self.match_tree
        if 'type' in self.options:
            mtree.guess.set('type', self.options['type'], confidence=0.0)

        # First the processing pass (False), then the post-processing one (True)
        for post_process in (False, True):
            for transformer in transformers.all_transformers():
                self._process(transformer, post_process)

        log.debug('Found match tree:\n%s' % u(mtree))
    except TransformerException as e:
        log.debug('An error has occurred in Transformer %s: %s' % (e.transformer, e))
def get_seasons_for_showid(sid, title=None):
    """Scrape the tvu.org.ru season page of show ``sid``.

    Each table found under the element with id ``main`` is read as one
    dubbing language: the language comes from the ``alt`` text of the image
    in the first row, and every row from the third one on describes a season.

    :param sid: show id, interpolated into the page URL.
    :param title: optional series name; when a season title starts with it
        (case-insensitively), that prefix is removed and the rest cleaned.
    :return: list of ``(dub_lang, seasons)`` pairs, where ``seasons`` is a
        list of ``(source, season, format, stitle, status, sub_lang, year,
        feedlink)`` tuples.
    """
    page = requests.get("http://tvu.org.ru/index.php?show=season&sid=%s" % sid)
    page.encoding = "utf-8"

    feeds = []
    document = BeautifulSoup(page.text)

    for dub_table in document.find(id="main").find_all("table"):
        table_rows = dub_table("tr")
        dub_lang = table_rows[0].find("img")["alt"]

        entries = []
        for season_row in table_rows[2:]:
            cells = season_row("td")
            source = cells[0].text
            season_number = int(cells[1].text)
            media_format = cells[2].text

            title_link = cells[3].find("a")
            stitle = title_link.text.strip()
            # remove series name if it appears in front
            if title and stitle.lower().startswith(title.lower()):
                stitle = clean_string(stitle[len(title):])

            link = "http://tvu.org.ru/" + title_link["href"]
            feedid = link.split("=")[-1]
            feedlink = "http://tvu.org.ru/rss.php?se_id=%s" % feedid
            status = cells[3].find("i").text

            subflag_td = cells[4].find("img")
            sub_lang = subflag_td["alt"] if subflag_td else None

            year = int(cells[5].text)
            entries.append((source, season_number, media_format, stitle,
                            status, sub_lang, year, feedlink))

        feeds.append((dub_lang, entries))

    return feeds
def __init__(self, filename, options=None, **kwargs):
    """Build the match tree for ``filename`` and run all transformers on it.

    :param filename: name (or path) to analyse. On Python 2 a byte string is
        decoded as UTF-8 after a warning is logged.
    :param options: dict of options; it is copied, not modified.
    :param kwargs: extra options, used only for keys that are missing from
        ``options`` or hold a falsy value there.

    The merged options are checked by ``self._validate_options``. Every
    transformer is then run twice: a processing pass followed by a
    post-processing pass. A ``TransformerException`` raised meanwhile is
    caught and logged at debug level.
    """
    options = dict(options or {})
    for k, v in kwargs.items():
        if k not in options or not options[k]:
            options[k] = v  # options dict has priority over keyword arguments
    self._validate_options(options)

    if not PY3 and not isinstance(filename, unicode):
        log.warning('Given filename to matcher is not unicode...')
        filename = filename.decode('utf-8')

    filename = normalize_unicode(filename)

    self.match_tree = MatchTree(filename)
    self.options = options
    self._transfo_calls = []

    # sanity check: make sure we don't process a (mostly) empty string
    if clean_string(filename) == '':
        return

    from guessit.plugins import transformers

    try:
        mtree = self.match_tree
        if 'type' in self.options:
            mtree.guess.set('type', self.options['type'], confidence=0.0)

        # Process
        for transformer in transformers.all_transformers():
            self._process(transformer, False)

        # Post-process
        for transformer in transformers.all_transformers():
            self._process(transformer, True)

        log.debug('Found match tree:\n%s' % u(mtree))
    except TransformerException as e:
        # Message spelling fixed ("occurred").
        log.debug('An error has occurred in Transformer %s: %s' % (e.transformer, e))
def guess_filetype(self, mtree, options=None):
    """Guess the type of the file described by ``mtree``.

    The heuristics are tried in this order, and the first one that is
    conclusive returns immediately:

    1. the file extension (subtitle / info / video / other);
    2. a path component starting with 'Movies', 'Films', 'Tv Shows',
       'Series' or 'Episodes';
    3. the ``self.MOVIES`` and ``self.SERIES`` exception lists;
    4. strong episode patterns, episode-specific properties, weak episode
       patterns (TV or web sources only) and episode-like website names;
    5. otherwise a subtitle, info or video file is assumed to be a movie,
       and anything still untyped is 'unknown'.

    :param mtree: match tree; its current ``type`` guess is the start point.
    :param options: optional dict; only ``name_only`` is read here.
    :return: ``(filetype, other)`` where ``other`` is a dict holding the
        ``container`` or ``extension`` found, possibly empty.
    """
    options = options or {}

    # The type is kept in a one-element list so that the nested functions
    # below can rebind it: python 2 has no 'nonlocal' keyword.
    filetype_container = [mtree.guess.get("type")]
    filename = mtree.string

    def make_upgrade(refinements, fallback):
        """Build a closure that refines the current type.

        ``refinements`` is a sequence of (current, refined) pairs; when the
        current type matches none of them and is still empty, ``fallback``
        is used.
        """
        def upgrade():
            current = filetype_container[0]
            for known, refined in refinements:
                if current == known:
                    filetype_container[0] = refined
                    return
            if not current:
                filetype_container[0] = fallback
        return upgrade

    upgrade_episode = make_upgrade(
        (("subtitle", "episodesubtitle"), ("info", "episodeinfo")), "episode")
    upgrade_movie = make_upgrade(
        (("subtitle", "moviesubtitle"), ("info", "movieinfo")), "movie")
    upgrade_subtitle = make_upgrade(
        (("movie", "moviesubtitle"), ("episode", "episodesubtitle")), "subtitle")
    upgrade_info = make_upgrade(
        (("movie", "movieinfo"), ("episode", "episodeinfo")), "info")

    # look at the extension first
    other = {}
    fileext = os.path.splitext(filename)[1][1:].lower()
    if fileext in subtitle_exts:
        upgrade_subtitle()
        other = {"container": fileext}
    elif fileext in info_exts:
        upgrade_info()
        other = {"container": fileext}
    elif fileext in video_exts:
        other = {"container": fileext}
    elif fileext and not options.get("name_only"):
        other = {"extension": fileext}

    # check whether we are in a 'Movies', 'Tv Shows', ... folder
    folder_rexps = [
        (r"Movies?", upgrade_movie),
        (r"Films?", upgrade_movie),
        (r"Tv[ _-]?Shows?", upgrade_episode),
        (r"Series?", upgrade_episode),
        (r"Episodes?", upgrade_episode),
    ]
    for pattern, upgrade_func in folder_rexps:
        folder_rexp = re.compile(pattern, re.IGNORECASE)
        if any(folder_rexp.match(pathgroup.value) for pathgroup in mtree.children):
            upgrade_func()
            return filetype_container[0], other

    # check for a few specific cases which will unintentionally make the
    # following heuristics confused (eg: OSS 117 will look like an episode,
    # season 1, epnum 17, when it is in fact a movie)
    fname = clean_string(filename).lower()
    if any(movie in fname for movie in self.MOVIES):
        self.log.debug("Found in exception list of movies -> type = movie")
        upgrade_movie()
        return filetype_container[0], other
    if any(series in fname for series in self.SERIES):
        self.log.debug("Found in exception list of series -> type = episode")
        upgrade_episode()
        return filetype_container[0], other

    # now look whether there are some specific hints for episode vs movie
    # if we have an episode_rexp (eg: s02e13), it is an episode
    episode_transformer = get_transformer("guess_episodes_rexps")
    if episode_transformer:
        guess = episode_transformer.guess_episodes_rexps(filename)
        if guess:
            self.log.debug("Found guess_episodes_rexps: %s -> type = episode", guess)
            upgrade_episode()
            return filetype_container[0], other

    properties_transformer = get_transformer("guess_properties")
    if properties_transformer:
        # if we have certain properties characteristic of episodes, it is an ep
        found = properties_transformer.container.find_properties(filename, mtree, "episodeFormat")
        guess = properties_transformer.container.as_guess(found, filename)
        if guess:
            self.log.debug('Found characteristic property of episodes: %s"', guess)
            upgrade_episode()
            return filetype_container[0], other

        found = properties_transformer.container.find_properties(filename, mtree, "format")
        guess = properties_transformer.container.as_guess(found, filename)
        tv_or_web_formats = ("HDTV", "WEBRip", "WEB-DL", "DVB")
        if guess and guess["format"] in tv_or_web_formats:
            # Use weak episodes only if TV or WEB source
            weak_episode_transformer = get_transformer("guess_weak_episodes_rexps")
            if weak_episode_transformer:
                guess = weak_episode_transformer.guess_weak_episodes_rexps(filename)
                if guess:
                    self.log.debug("Found guess_weak_episodes_rexps: %s -> type = episode", guess)
                    upgrade_episode()
                    return filetype_container[0], other

    website_transformer = get_transformer("guess_website")
    if website_transformer:
        found = website_transformer.container.find_properties(filename, mtree, "website")
        guess = website_transformer.container.as_guess(found, filename)
        # origin-specific type
        if guess and any(namepart in guess["website"]
                         for namepart in ("tv", "serie", "episode")):
            self.log.debug("Found characteristic property of episodes: %s", guess)
            upgrade_episode()
            return filetype_container[0], other

    untyped_video = not filetype_container[0] and fileext in video_exts
    if filetype_container[0] in ("subtitle", "info") or untyped_video:
        # if no episode info found, assume it's a movie
        self.log.debug("Nothing characteristic found, assuming type = movie")
        upgrade_movie()

    if not filetype_container[0]:
        self.log.debug("Nothing characteristic found, assuming type = unknown")
        filetype_container[0] = "unknown"

    return filetype_container[0], other
def __init__(self, filename, filetype="autodetect"):
    """An iterative matcher tries to match different patterns that appear
    in the filename.

    The 'filetype' argument indicates which type of file you want to match.
    If it is 'autodetect', the matcher will try to see whether it can guess
    that the file corresponds to an episode, or otherwise will assume it is
    a movie.

    The recognized 'filetype' values are:
    [ autodetect, subtitle, video, movie, moviesubtitle, episode,
      episodesubtitle ]
    Any other value raises a ValueError.

    The IterativeMatcher works mainly in 2 steps:

    First, it splits the filename into a match_tree, which is a tree of groups
    which have a semantic meaning, such as episode number, movie title,
    etc...

    The match_tree created looks like the following:

    0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111
    0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000
    0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000
    __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___
    xxxxxttttttttttttt               ffffff  vvvv    xxxxxx  ll lll     xx xxx         ccc
    [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv

    The first 3 lines indicates the group index in which a char in the
    filename is located. So for instance, x264 is the group (0, 4, 1), and
    it corresponds to a video codec, denoted by the letter'v' in the 4th line.
    (for more info, see guess.matchtree.tree_to_string)

    Second, it tries to merge all this information into a single object
    containing all the found properties, and does some (basic) conflict
    resolution when they arise.

    On completion, the list of found guesses is stored in self.parts and the
    final tree (with the extension re-appended) in self.match_tree.
    """
    # NOTE(review): this block uses Python 2 only constructs (raise with a
    # comma, 'unicode', indexing/len() on the result of filter()).
    if filetype not in ("autodetect", "subtitle", "video", "movie", "moviesubtitle", "episode", "episodesubtitle"):
        raise ValueError, "filetype needs to be one of ('autodetect', 'subtitle', 'video', 'movie', 'moviesubtitle', 'episode', 'episodesubtitle')"
    if not isinstance(filename, unicode):
        log.debug("WARNING: given filename to matcher is not unicode...")

    match_tree = []
    result = [] # list of found metadata

    def guessed(match_dict, confidence):
        # Wrap the match in a formatted Guess, record it in 'result' and
        # return it.
        guess = format_guess(Guess(match_dict, confidence=confidence))
        result.append(guess)
        log.debug("Found with confidence %.2f: %s" % (confidence, guess))
        return guess

    def update_found(leftover, group_pos, guess):
        # Attach 'guess' to the group at 'group_pos', blanking its remaining
        # text, and return 'leftover' without the entries at that position.
        pidx, eidx, gidx = group_pos
        group = match_tree[pidx][eidx][gidx]
        match_tree[pidx][eidx][gidx] = (group[0], deleted * len(group[0]), guess)
        return [g for g in leftover if g[1] != group_pos]

    # 1- first split our path into dirs + basename + ext
    match_tree = split_path_components(filename)

    # try to detect the file type
    filetype, other = guess_filetype(filename, filetype)
    guessed({"type": filetype}, confidence=1.0)
    extguess = guessed(other, confidence=1.0)

    # guess the mimetype of the filename
    # TODO: handle other mimetypes not found on the default type_maps
    # mimetypes.types_map['.srt']='text/subtitle'
    mime, _ = mimetypes.guess_type(filename, strict=False)
    if mime is not None:
        guessed({"mimetype": mime}, confidence=1.0)

    # remove the extension from the match tree, as all indices relative
    # the the filename groups assume the basename is the last one
    fileext = match_tree.pop(-1)[1:].lower()

    # 2- split each of those into explicit groups, if any
    # note: be careful, as this might split some regexps with more confidence such as
    # Alfleni-Team, or [XCT] or split a date such as (14-01-2008)
    match_tree = [split_explicit_groups(part) for part in match_tree]

    # 3- try to match information in decreasing order of confidence and
    # blank the matching group in the string if we found something
    for pathpart in match_tree:
        for gidx, explicit_group in enumerate(pathpart):
            pathpart[gidx] = guess_groups(explicit_group, result, filetype=filetype)

    # 4- try to identify the remaining unknown groups by looking at their position
    # relative to other known elements
    if filetype in ("episode", "episodesubtitle"):
        eps = find_group(match_tree, "episodeNumber")
        if eps:
            match_tree = match_from_epnum_position(match_tree, eps[0], guessed, update_found)

        leftover = leftover_valid_groups(match_tree)

        if not eps:
            # if we don't have the episode number, but at least 2 groups in the
            # last path group, then it's probably series - eptitle
            title_candidates = filter( lambda g: g[0].lower() not in non_episode_title, filter(lambda g: g[1][0] == len(match_tree) - 1, leftover_valid_groups(match_tree)), )
            if len(title_candidates) >= 2:
                guess = guessed({"series": title_candidates[0][0]}, confidence=0.4)
                leftover = update_found(leftover, title_candidates[0][1], guess)
                guess = guessed({"title": title_candidates[1][0]}, confidence=0.4)
                leftover = update_found(leftover, title_candidates[1][1], guess)

        # if there's a path group that only contains the season info, then the previous one
        # is most likely the series title (ie: .../series/season X/...)
        eps = [ gpos for gpos in find_group(match_tree, "season") if "episodeNumber" not in get_group(match_tree, gpos)[2] ]

        if eps:
            pidx, eidx, gidx = eps[0]
            previous = [group for group in leftover if group[1][0] == pidx - 1]
            if len(previous) == 1:
                guess = guessed({"series": previous[0][0]}, confidence=0.5)
                leftover = update_found(leftover, previous[0][1], guess)

        # reduce the confidence of unlikely series
        for guess in result:
            if "series" in guess:
                if guess["series"].lower() in unlikely_series:
                    guess.set_confidence("series", guess.confidence("series") * 0.5)

    elif filetype in ("movie", "moviesubtitle"):
        leftover_all = leftover_valid_groups(match_tree)

        # specific cases:
        # - movies/tttttt (yyyy)/tttttt.ccc
        # NOTE(review): the bare 'except' below swallows every error raised
        # in this block (an IndexError when the path is too short or no year
        # is found, but also genuine bugs); it is additionally used as
        # control flow through the 'raise Exception' further down.
        try:
            if match_tree[-3][0][0][0].lower() == "movies":
                # Note:too generic, might solve all the unittests as they all contain 'movies'
                # in their path
                #
                # if len(match_tree[-2][0]) == 1:
                #     title = match_tree[-2][0][0]
                #     guess = guessed({ 'title': clean_string(title[0]) }, confidence = 0.7)
                #     update_found(leftover_all, title, guess)

                year_group = filter(lambda gpos: gpos[0] == len(match_tree) - 2, find_group(match_tree, "year"))[0]
                leftover = leftover_valid_groups( match_tree, valid=lambda g: ((g[0] and g[0][0] not in sep) and g[1][0] == len(match_tree) - 2) )
                if len(match_tree[-2]) == 2 and year_group[1] == 1:
                    title = leftover[0]
                    guess = guessed({"title": clean_string(title[0])}, confidence=0.8)
                    # NOTE(review): the filtered list returned by
                    # update_found is discarded here, so 'leftover_all' still
                    # contains this group afterwards -- confirm it is intended.
                    update_found(leftover_all, title[1], guess)
                    raise Exception # to exit the try catch now

                leftover = [ g for g in leftover_all if (g[1][0] == year_group[0] and g[1][1] < year_group[1] and g[1][2] < year_group[2]) ]
                leftover = sorted(leftover, key=lambda x: x[1])
                title = leftover[0]
                guess = guessed({"title": title[0]}, confidence=0.8)
                leftover = update_found(leftover, title[1], guess)
        except:
            pass

        # if we have either format or videoCodec in the folder containing the file
        # or one of its parents, then we should probably look for the title in
        # there rather than in the basename
        props = filter( lambda g: g[0] <= len(match_tree) - 2, find_group(match_tree, "videoCodec") + find_group(match_tree, "format") + find_group(match_tree, "language"), )
        leftover = None
        if props and all(g[0] == props[0][0] for g in props):
            leftover = [g for g in leftover_all if g[1][0] == props[0][0]]

        if props and leftover:
            guess = guessed({"title": leftover[0][0]}, confidence=0.7)
            leftover = update_found(leftover, leftover[0][1], guess)

        else:
            # first leftover group in the last path part sounds like a good candidate for title,
            # except if it's only one word and that the first group before has at least 3 words in it
            # (case where the filename contains an 8 chars short name and the movie title is
            # actually in the parent directory name)
            leftover = [g for g in leftover_all if g[1][0] == len(match_tree) - 1]
            if leftover:
                title, (pidx, eidx, gidx) = leftover[0]
                previous_pgroup_leftover = filter(lambda g: g[1][0] == pidx - 1, leftover_all)

                if ( title.count(" ") == 0 and previous_pgroup_leftover and previous_pgroup_leftover[0][0].count(" ") >= 2 ):
                    guess = guessed({"title": previous_pgroup_leftover[0][0]}, confidence=0.6)
                    leftover = update_found(leftover, previous_pgroup_leftover[0][1], guess)

                else:
                    guess = guessed({"title": title}, confidence=0.6)
                    leftover = update_found(leftover, leftover[0][1], guess)
            else:
                # if there were no leftover groups in the last path part, look in the one before that
                previous_pgroup_leftover = filter(lambda g: g[1][0] == len(match_tree) - 2, leftover_all)
                if previous_pgroup_leftover:
                    guess = guessed({"title": previous_pgroup_leftover[0][0]}, confidence=0.6)
                    leftover = update_found(leftover, previous_pgroup_leftover[0][1], guess)

    # 5- perform some post-processing steps

    # 5.1- try to promote language to subtitle language where it makes sense
    for pidx, eidx, gidx in find_group(match_tree, "language"):
        string, remaining, guess = get_group(match_tree, (pidx, eidx, gidx))

        def promote_subtitle():
            # Move the 'language' value (and its confidence) over to
            # 'subtitleLanguage'.
            guess.set("subtitleLanguage", guess["language"], confidence=guess.confidence("language"))
            del guess["language"]

        # NOTE(review): both conditions below can hold for the same group, in
        # which case promote_subtitle() runs twice and the second call reads
        # guess["language"] after it was deleted -- presumably a KeyError;
        # verify.

        # - if we matched a language in a file with a sub extension and that the group
        # is the last group of the filename, it is probably the language of the subtitle
        # (eg: 'xxx.english.srt')
        if fileext in subtitle_exts and pidx == len(match_tree) - 1 and eidx == len(match_tree[pidx]) - 1:
            promote_subtitle()

        # - if a language is in an explicit group just preceded by "st", it is a subtitle
        # language (eg: '...st[fr-eng]...')
        if eidx > 0:
            previous = get_group(match_tree, (pidx, eidx - 1, -1))
            if previous[0][-2:].lower() == "st":
                promote_subtitle()

    # re-append the extension now
    match_tree.append([[(fileext, deleted * len(fileext), extguess)]])

    self.parts = result
    self.match_tree = match_tree

    # Pad absolute paths with one space before logging them under the tree
    # (presumably to line the filename up with the tree rendering -- confirm
    # against tree_to_string).
    if filename.startswith("/"):
        filename = " " + filename

    log.debug("Found match tree:\n%s\n%s" % (to_utf8(tree_to_string(match_tree)), to_utf8(filename)))
def clean_value(self):
    """Return the matched substring (``self.value``) passed through
    ``clean_string``, i.e. tidied up for presentation (punctuation marks
    removed, duplicate spaces, ...)."""
    raw_value = self.value
    return clean_string(raw_value)
def __init__(self, filename, filetype='autodetect', opts=None):
    """An iterative matcher tries to match different patterns that appear
    in the filename.

    The 'filetype' argument indicates which type of file you want to match.
    If it is 'autodetect', the matcher will try to see whether it can guess
    that the file corresponds to an episode, or otherwise will assume it is
    a movie.

    The recognized 'filetype' values are:
    [ autodetect, subtitle, video, movie, moviesubtitle, episode,
      episodesubtitle ]
    Any other value raises a ValueError.

    'opts' is either a list of option names or a whitespace-separated
    string of them. The options read here are 'nolanguage', 'nocountry'
    and 'skip_first_year'.

    The IterativeMatcher works mainly in 2 steps:

    First, it splits the filename into a match_tree, which is a tree of groups
    which have a semantic meaning, such as episode number, movie title,
    etc...

    The match_tree created looks like the following:

    0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111
    0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000
    0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000
    __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___
    xxxxxttttttttttttt               ffffff  vvvv    xxxxxx  ll lll     xx xxx         ccc
    [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv

    The first 3 lines indicates the group index in which a char in the
    filename is located. So for instance, x264 is the group (0, 4, 1), and
    it corresponds to a video codec, denoted by the letter'v' in the 4th line.
    (for more info, see guess.matchtree.to_string)

    Second, it tries to merge all this information into a single object
    containing all the found properties, and does some (basic) conflict
    resolution when they arise.
    """
    valid_filetypes = ('autodetect', 'subtitle', 'video',
                       'movie', 'moviesubtitle',
                       'episode', 'episodesubtitle')
    if filetype not in valid_filetypes:
        # Wrap the tuple in a 1-tuple, otherwise '%' treats each filetype as
        # a separate format argument and raises TypeError instead of the
        # intended ValueError.
        raise ValueError("filetype needs to be one of %s" % (valid_filetypes,))
    if not PY3 and not isinstance(filename, unicode):
        log.warning('Given filename to matcher is not unicode...')
        filename = filename.decode('utf-8')

    filename = normalize_unicode(filename)

    if opts is None:
        opts = []
    elif isinstance(opts, base_text_type):
        opts = opts.split()

    self.match_tree = MatchTree(filename)

    # sanity check: make sure we don't process a (mostly) empty string
    if clean_string(filename) == '':
        return

    mtree = self.match_tree
    mtree.guess.set('type', filetype, confidence=1.0)

    def apply_transfo(transfo_name, *args, **kwargs):
        # Import guessit.transfo.<transfo_name> and run its process()
        # function on the match tree.
        transfo = __import__('guessit.transfo.' + transfo_name,
                             globals=globals(), locals=locals(),
                             fromlist=['process'], level=0)
        transfo.process(mtree, *args, **kwargs)

    # 1- first split our path into dirs + basename + ext
    apply_transfo('split_path_components')

    # 2- guess the file type now (will be useful later)
    apply_transfo('guess_filetype', filetype)
    if mtree.guess['type'] == 'unknown':
        return

    # 3- split each of those into explicit groups (separated by parentheses
    #    or square brackets)
    apply_transfo('split_explicit_groups')

    # 4- try to match information for specific patterns
    # NOTE: order needs to comply to the following:
    #       - website before language (eg: tvu.org.ru vs russian)
    #       - language before episodes_rexps
    #       - properties before language (eg: he-aac vs hebrew)
    #       - release_group before properties (eg: XviD-?? vs xvid)
    if mtree.guess['type'] in ('episode', 'episodesubtitle'):
        strategy = ['guess_date', 'guess_website', 'guess_release_group',
                    'guess_properties', 'guess_language',
                    'guess_video_rexps',
                    'guess_episodes_rexps', 'guess_weak_episodes_rexps']
    else:
        strategy = ['guess_date', 'guess_website', 'guess_release_group',
                    'guess_properties', 'guess_language',
                    'guess_video_rexps']

    if 'nolanguage' in opts:
        strategy.remove('guess_language')

    for name in strategy:
        apply_transfo(name)

    # more guessers for both movies and episodes
    apply_transfo('guess_bonus_features')
    apply_transfo('guess_year', skip_first_year=('skip_first_year' in opts))

    if 'nocountry' not in opts:
        apply_transfo('guess_country')

    apply_transfo('guess_idnumber')

    # split into '-' separated subgroups (with required separator chars
    # around the dash)
    apply_transfo('split_on_dash')

    # 5- try to identify the remaining unknown groups by looking at their
    #    position relative to other known elements
    if mtree.guess['type'] in ('episode', 'episodesubtitle'):
        apply_transfo('guess_episode_info_from_position')
    else:
        apply_transfo('guess_movie_title_from_position')

    # 6- perform some post-processing steps
    apply_transfo('post_process')

    log.debug('Found match tree:\n%s' % u(mtree))
def guess_filetype(self, mtree, options=None):
    """Guess the file type for ``mtree`` and return ``(filetype, other)``.

    The starting type is whatever ``mtree.guess`` already holds under
    ``'type'`` (possibly nothing). The file extension is looked at first,
    then a series of hints is tried in order: folder names, the ``MOVIES``
    and ``SERIES`` exception lists, the episode regexps, the episode/format
    properties and the website. The first of those hints that matches
    decides the type and ends the search.

    :param mtree: match tree; ``mtree.string``, ``mtree.guess`` and
        ``mtree.children`` are read.
    :param options: optional mapping; only the ``name_only`` key is read.
    :return: ``(filetype, other)`` where ``other`` is a dict that is either
        empty or holds a single ``container`` or ``extension`` entry.
    """
    options = options or {}

    # put the filetype inside a dummy container to be able to have the
    # following functions work correctly as closures
    # this is a workaround for python 2 which doesn't have the
    # 'nonlocal' keyword which we could use here in the upgrade_* functions
    # (python 3 does have it)
    filetype_container = [mtree.guess.get('type')]
    other = {}
    filename = mtree.string

    def upgrade_episode():
        if filetype_container[0] == 'subtitle':
            filetype_container[0] = 'episodesubtitle'
        elif filetype_container[0] == 'info':
            filetype_container[0] = 'episodeinfo'
        elif not filetype_container[0]:
            filetype_container[0] = 'episode'

    def upgrade_movie():
        if filetype_container[0] == 'subtitle':
            filetype_container[0] = 'moviesubtitle'
        elif filetype_container[0] == 'info':
            filetype_container[0] = 'movieinfo'
        elif not filetype_container[0]:
            filetype_container[0] = 'movie'

    def upgrade_subtitle():
        if filetype_container[0] == 'movie':
            filetype_container[0] = 'moviesubtitle'
        elif filetype_container[0] == 'episode':
            filetype_container[0] = 'episodesubtitle'
        elif not filetype_container[0]:
            filetype_container[0] = 'subtitle'

    def upgrade_info():
        if filetype_container[0] == 'movie':
            filetype_container[0] = 'movieinfo'
        elif filetype_container[0] == 'episode':
            filetype_container[0] = 'episodeinfo'
        elif not filetype_container[0]:
            filetype_container[0] = 'info'

    # look at the extension first
    fileext = os.path.splitext(filename)[1][1:].lower()
    if fileext in subtitle_exts:
        upgrade_subtitle()
        other = {'container': fileext}
    elif fileext in info_exts:
        upgrade_info()
        other = {'container': fileext}
    elif fileext in video_exts:
        other = {'container': fileext}
    else:
        if fileext and not options.get('name_only'):
            other = {'extension': fileext}

    # check whether we are in a 'Movies', 'Tv Shows', ... folder
    folder_rexps = [
        (r'Movies?', upgrade_movie),
        (r'Films?', upgrade_movie),
        (r'Tv[ _-]?Shows?', upgrade_episode),
        (r'Series?', upgrade_episode),
        (r'Episodes?', upgrade_episode),
    ]
    for frexp, upgrade_func in folder_rexps:
        frexp = re.compile(frexp, re.IGNORECASE)
        for pathgroup in mtree.children:
            # match() only anchors at the start of the path component
            if frexp.match(pathgroup.value):
                upgrade_func()
                return filetype_container[0], other

    # check for a few specific cases which will unintentionally make the
    # following heuristics confused (eg: OSS 117 will look like an episode,
    # season 1, epnum 17, when it is in fact a movie)
    fname = clean_string(filename).lower()
    for m in self.MOVIES:
        if m in fname:
            self.log.debug(
                'Found in exception list of movies -> type = movie')
            upgrade_movie()
            return filetype_container[0], other
    for s in self.SERIES:
        if s in fname:
            self.log.debug(
                'Found in exception list of series -> type = episode')
            upgrade_episode()
            return filetype_container[0], other

    # now look whether there are some specific hints for episode vs movie
    # if we have an episode_rexp (eg: s02e13), it is an episode
    episode_transformer = get_transformer('guess_episodes_rexps')
    if episode_transformer:
        guess = episode_transformer.guess_episodes_rexps(filename)
        if guess:
            self.log.debug(
                'Found guess_episodes_rexps: %s -> type = episode', guess)
            upgrade_episode()
            return filetype_container[0], other

    properties_transformer = get_transformer('guess_properties')
    if properties_transformer:
        # if we have certain properties characteristic of episodes, it is an ep
        found = properties_transformer.container.find_properties(
            filename, mtree, 'episodeFormat')
        guess = properties_transformer.container.as_guess(found, filename)
        if guess:
            # the format string used to end with a stray double quote
            self.log.debug(
                'Found characteristic property of episodes: %s', guess)
            upgrade_episode()
            return filetype_container[0], other

        found = properties_transformer.container.find_properties(
            filename, mtree, 'format')
        guess = properties_transformer.container.as_guess(found, filename)
        if guess and guess['format'] in ('HDTV', 'WEBRip', 'WEB-DL', 'DVB'):
            # Use weak episodes only if TV or WEB source
            weak_episode_transformer = get_transformer(
                'guess_weak_episodes_rexps')
            if weak_episode_transformer:
                guess = weak_episode_transformer.guess_weak_episodes_rexps(
                    filename)
                if guess:
                    self.log.debug(
                        'Found guess_weak_episodes_rexps: %s -> type = episode',
                        guess)
                    upgrade_episode()
                    return filetype_container[0], other

    website_transformer = get_transformer('guess_website')
    if website_transformer:
        found = website_transformer.container.find_properties(
            filename, mtree, 'website')
        guess = website_transformer.container.as_guess(found, filename)
        if guess:
            for namepart in ('tv', 'serie', 'episode'):
                if namepart in guess['website']:
                    # origin-specific type
                    self.log.debug(
                        'Found characteristic property of episodes: %s',
                        guess)
                    upgrade_episode()
                    return filetype_container[0], other

    if filetype_container[0] in ('subtitle', 'info') or (
            not filetype_container[0] and fileext in video_exts):
        # if no episode info found, assume it's a movie
        self.log.debug(
            'Nothing characteristic found, assuming type = movie')
        upgrade_movie()

    if not filetype_container[0]:
        self.log.debug(
            'Nothing characteristic found, assuming type = unknown')
        filetype_container[0] = 'unknown'

    return filetype_container[0], other
def clean_value(self):
    """Return this object's ``value`` after passing it through ``clean_string``."""
    raw_value = self.value
    return clean_string(raw_value)
def guess_filetype(mtree, filetype):
    """Refine ``filetype`` for the file described by ``mtree``.

    The extension is examined first, then folder names, the ``MOVIES`` and
    ``SERIES`` exception lists and, while the type is still the generic
    ``'video'`` or ``'subtitle'``, a set of episode heuristics. If none of
    the heuristics upgrades the type, it is upgraded to a movie type.

    Returns a ``(filetype, other)`` pair, ``other`` being a one-entry dict
    holding either ``container`` or ``extension``.
    """
    # single-item list so the nested helpers can rebind the value
    # (python 2 has no 'nonlocal')
    state = [filetype]
    filename = mtree.string

    def _retag(pairs):
        # apply the first (old, new) pair whose old value is the current type
        for old, new in pairs:
            if state[0] == old:
                state[0] = new
                return

    def upgrade_episode():
        _retag((('video', 'episode'), ('subtitle', 'episodesubtitle')))

    def upgrade_movie():
        _retag((('video', 'movie'), ('subtitle', 'moviesubtitle')))

    def upgrade_subtitle():
        present = state[0]
        if 'movie' in present:
            state[0] = 'moviesubtitle'
            return
        state[0] = 'episodesubtitle' if 'episode' in present else 'subtitle'

    def resolve_autodetect(kind='unknown'):
        # only an 'autodetect' request may be replaced by a concrete kind
        if state[0] == 'autodetect':
            state[0] = kind

    # the extension comes first
    fileext = os.path.splitext(filename)[1][1:].lower()
    if fileext in subtitle_exts:
        upgrade_subtitle()
        other = {'container': fileext}
    elif fileext in video_exts:
        resolve_autodetect(kind='video')
        other = {'container': fileext}
    else:
        resolve_autodetect(kind='unknown')
        other = {'extension': fileext}

    # are we inside a 'Movies', 'Tv Shows', ... folder?
    folder_hints = (
        (r'Movies?', upgrade_movie),
        (r'Tv[ _-]?Shows?', upgrade_episode),
        (r'Series', upgrade_episode),
    )
    for pattern, apply_upgrade in folder_hints:
        compiled = re.compile(pattern, re.IGNORECASE)
        for pathgroup in mtree.children:
            if compiled.match(pathgroup.value):
                apply_upgrade()

    # a few specific names would confuse the heuristics below (eg: OSS 117
    # looks like season 1, episode 17, when it is in fact a movie)
    fname = clean_string(filename).lower()
    if any(known in fname for known in MOVIES):
        upgrade_movie()
    if any(known in fname for known in SERIES):
        upgrade_episode()

    # nothing more to refine unless the type is still a generic one
    if state[0] not in ('video', 'subtitle'):
        return state[0], other

    # an explicit episode pattern (eg: s02e13) means it is an episode
    if any(re.search(rexp, filename, re.IGNORECASE)
           for rexp, _, _ in episode_rexps):
        upgrade_episode()

    # a 3-4 digit number that is not a year may be a season/episode number
    number_match = re.search(r'[^0-9]([0-9]{3,4})[^0-9]', filename)
    if number_match:
        fullnumber = int(number_match.group(1))
        epnumber = fullnumber % 100
        looks_like_year = valid_year(fullnumber)
        if not (epnumber > 40 or looks_like_year):
            upgrade_episode()

    # properties that are characteristic of episodes
    for prop, value, _, _ in find_properties(filename):
        log.debug('prop: %s = %s' % (prop, value))
        if (prop == 'episodeFormat'
                or compute_canonical_form('format', value) == 'DVB'):
            upgrade_episode()
            break

    # origin-specific type
    if 'tvu.org.ru' in filename:
        upgrade_episode()

    # if no episode info was found, assume it is a movie
    upgrade_movie()

    return state[0], other
def _guess_filename(filename, filetype):
    """Guess the properties of ``filename``, running a second pass if needed.

    A first ``IterativeMatcher`` pass is analysed to decide whether a second
    pass is required:

    * when two or more distinct year values were found, the second pass
      gets the ``skip_first_year`` option;
    * language nodes longer than 3 characters that touch a title/series
      node, and duplicate detections of the same language, are passed to
      ``guess_language`` as spans to skip.

    If the result contains both a language and a title, it is compared with
    a run made with ``nolanguage``/``nocountry``; when the titles differ and
    the first title sits in an explicit group, the second result is
    returned.

    :param filename: name of the file to analyse.
    :param filetype: file type handed to ``IterativeMatcher``.
    :return: the result of ``matched()`` for the retained pass.
    """

    def find_nodes(tree, props):
        """Yields all nodes containing any of the given props."""
        if isinstance(props, base_text_type):
            props = [props]
        for node in tree.nodes():
            if any(prop in node.guess for prop in props):
                yield node

    mtree = IterativeMatcher(filename, filetype=filetype)
    m = mtree.matched()

    second_pass_opts = []
    second_pass_transfo_opts = {}

    # if there are multiple possible years found, we assume the first one is
    # part of the title, reparse the tree taking this into account
    years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))
    if len(years) >= 2:
        second_pass_opts.append('skip_first_year')

    to_skip_language_nodes = []

    # start and end offsets of every title/series node
    title_bounds = set()
    for title_node in find_nodes(mtree.match_tree, ['title', 'series']):
        title_bounds.add(title_node.span[0])
        title_bounds.add(title_node.span[1])

    for lang_key in ('language', 'subtitleLanguage'):
        langs = {}
        lang_nodes = set(n for n in find_nodes(mtree.match_tree, lang_key))

        for lang_node in lang_nodes:
            lang = lang_node.guess.get(lang_key, None)
            touches_title = (lang_node.span[0] in title_bounds or
                             lang_node.span[1] in title_bounds)
            if len(lang_node.value) > 3 and touches_title:
                # Language is next or before title, and is not a language
                # code. Add to skip for 2nd pass.

                # if filetype is subtitle and the language appears last, just
                # before the extension, then it is likely a subtitle language
                parts = clean_string(lang_node.root.value).split()
                # membership is checked first: list.index() raises ValueError
                # when the language text is not a whole token of the name
                if (m.get('type') in ('moviesubtitle', 'episodesubtitle') and
                        lang_node.value in parts and
                        parts.index(lang_node.value) == len(parts) - 2):
                    continue

                to_skip_language_nodes.append(lang_node)
            elif lang not in langs:
                langs[lang] = lang_node
            else:
                # The same language was found. Keep the more confident one,
                # and add others to skip for 2nd pass.
                # NOTE(review): confidence is read under 'language' even when
                # lang_key is 'subtitleLanguage' - confirm this is intended
                existing_lang_node = langs[lang]
                if (existing_lang_node.guess.confidence('language') >=
                        lang_node.guess.confidence('language')):
                    # lang_node is to remove
                    to_skip = lang_node
                else:
                    # existing_lang_node is to remove
                    langs[lang] = lang_node
                    to_skip = existing_lang_node
                to_skip_language_nodes.append(to_skip)

    if to_skip_language_nodes:
        second_pass_transfo_opts['guess_language'] = (
            (), {'skip': [{'node_idx': node.parent.node_idx,
                           'span': node.span}
                          for node in to_skip_language_nodes]})

    if second_pass_opts or second_pass_transfo_opts:
        # 2nd pass is needed
        log.info("Running 2nd pass with options: %s" % second_pass_opts)
        log.info("Transfo options: %s" % second_pass_transfo_opts)
        mtree = IterativeMatcher(filename, filetype=filetype,
                                 opts=second_pass_opts,
                                 transfo_opts=second_pass_transfo_opts)

    m = mtree.matched()

    if (('language' not in m and 'subtitleLanguage' not in m) or
            'title' not in m):
        return m

    # if we found some language, make sure we didn't cut a title or sth...
    mtree2 = IterativeMatcher(filename, filetype=filetype,
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()

    if m.get('title') != m2.get('title'):
        # next() gets a default so a tree without a title node cannot raise
        # StopIteration here
        title = next(find_nodes(mtree.match_tree, 'title'), None)

        # if a node is in an explicit group, then the correct title is
        # probably the other one; every other outcome keeps the first guess
        if (title is not None and
                title.root.node_at(title.node_idx[:2]).is_explicit()):
            return m2

    return m
def guess_groups(string, result, filetype):
    """Find known patterns in ``string`` and split it into groups.

    Detectors are run in a fixed order: date, year (non-episodes only),
    video regexps, episode regexps (episodes only), websites, release
    groups, properties, weak episode regexps (episodes only, and only when
    no ``episodeNumber`` is in ``result`` yet) and languages. Each match is
    appended to ``result`` as a guess and its region is passed to
    ``blank_region`` so later detectors work on the updated string.

    :param string: the text to analyse.
    :param result: list that receives every guess found (mutated in place).
    :param filetype: one of movie, moviesubtitle, episode, episodesubtitle.
    :return: ``zip`` of (group of ``string``, same group after blanking,
        guesses collected for the group's start offset).
    """
    # add sentinels so we can match a separator char at either end of
    # our groups, even when they are at the beginning or end of the string
    # we will adjust the span accordingly later
    current = " " + string + " "

    regions = []  # list of ((start, end), guess) of matched regions

    def guessed(match_dict, confidence):
        guess = format_guess(Guess(match_dict, confidence=confidence))
        result.append(guess)
        log.debug("Found with confidence %.2f: %s" % (confidence, guess))
        return guess

    def update_found(string, guess, span, span_adjust=(0, 0)):
        span = (span[0] + span_adjust[0], span[1] + span_adjust[1])
        regions.append((span, guess))
        return blank_region(string, span)

    # try to find dates first, as they are very specific
    date, span = search_date(current)
    if date:
        guess = guessed({"date": date}, confidence=1.0)
        current = update_found(current, guess, span)

    # for non episodes only, look for year information
    if filetype not in ("episode", "episodesubtitle"):
        year, span = search_year(current)
        if year:
            guess = guessed({"year": year}, confidence=1.0)
            current = update_found(current, guess, span)

    # specific regexps (ie: cd number, season X episode, ...)
    for rexp, confidence, span_adjust in video_rexps:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            # is this the better place to put it?
            # (maybe, as it is at least the soonest that we can catch it)
            if ("cdNumberTotal" in metadata and
                    metadata["cdNumberTotal"] is None):
                del metadata["cdNumberTotal"]

            guess = guessed(metadata, confidence=confidence)
            current = update_found(current, guess, match.span(), span_adjust)

    if filetype in ("episode", "episodesubtitle"):
        for rexp, confidence, span_adjust in episode_rexps:
            match = re.search(rexp, current, re.IGNORECASE)
            if match:
                metadata = match.groupdict()
                guess = guessed(metadata, confidence=confidence)
                current = update_found(current, guess, match.span(),
                                       span_adjust)

    # Now websites, but as exact string instead of regexps.
    # The confidence is set explicitly here: it used to be whatever value
    # the regexp loops above happened to leave behind (and was undefined
    # when those loops never ran).
    website_confidence = 1.0
    clow = current.lower()
    for site in websites:
        pos = clow.find(site.lower())
        if pos != -1:
            guess = guessed({"website": site}, confidence=website_confidence)
            current = update_found(current, guess, (pos, pos + len(site)))
            clow = current.lower()

    # release groups have certain constraints, cannot be included in the
    # previous general regexps
    group_names = [
        r"\.(Xvid)-(?P<releaseGroup>.*?)[ \.]",
        r"\.(DivX)-(?P<releaseGroup>.*?)[\. ]",
        r"\.(DVDivX)-(?P<releaseGroup>.*?)[\. ]",
    ]
    for rexp in group_names:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            metadata.update({"videoCodec": match.group(1)})
            guess = guessed(metadata, confidence=0.8)
            # span_adjust keeps the leading and trailing separators out of
            # the recorded region
            current = update_found(current, guess, match.span(),
                                   span_adjust=(1, -1))

    # common well-defined words and regexps
    confidence = 1.0  # for all of them
    for prop, value, pos, end in find_properties(current):
        guess = guessed({prop: value}, confidence=confidence)
        current = update_found(current, guess, (pos, end))

    # weak guesses for episode number, only run it if we don't have an
    # estimate already
    if filetype in ("episode", "episodesubtitle"):
        if not any("episodeNumber" in match for match in result):
            for rexp, _, span_adjust in weak_episode_rexps:
                match = re.search(rexp, current, re.IGNORECASE)
                if match:
                    metadata = match.groupdict()
                    epnum = int(metadata["episodeNumber"])
                    if epnum > 100:
                        # eg: 117 is read as season 1, episode 17
                        guess = guessed({"season": epnum // 100,
                                         "episodeNumber": epnum % 100},
                                        confidence=0.6)
                    else:
                        guess = guessed(metadata, confidence=0.3)
                    current = update_found(current, guess, match.span(),
                                           span_adjust)

    # try to find languages now
    language, span, confidence = search_language(current)
    while language:
        # is it a subtitle language?
        if "sub" in clean_string(current[: span[0]]).lower().split(" "):
            guess = guessed({"subtitleLanguage": language},
                            confidence=confidence)
        else:
            guess = guessed({"language": language}, confidence=confidence)
        current = update_found(current, guess, span)

        language, span, confidence = search_language(current)

    # remove our sentinels now and adjust spans accordingly
    assert current[0] == " " and current[-1] == " "
    current = current[1:-1]
    regions = [((start - 1, end - 1), guess)
               for (start, end), guess in regions]

    # split into '-' separated subgroups (with required separator chars
    # around the dash)
    # NOTE(review): a dash at index 0 ends this loop before any later dash
    # is recorded - confirm that is intended
    didx = current.find("-")
    while didx > 0:
        regions.append(((didx, didx), None))
        didx = current.find("-", didx + 1)

    # cut our final groups, and rematch the guesses to the group that created
    # it, None if it is a leftover group
    region_spans = [span for span, guess in regions]
    string_groups = split_on_groups(string, region_spans)
    remaining_groups = split_on_groups(current, region_spans)
    guesses = []

    pos = 0
    for group in string_groups:
        found = False
        for span, guess in regions:
            if span[0] == pos:
                guesses.append(guess)
                found = True
        if not found:
            guesses.append(None)

        pos += len(group)

    return zip(string_groups, remaining_groups, guesses)
def _guess_filename(filename, filetype):
    """Guess the properties of ``filename`` and arbitrate language vs. title.

    The name is matched once (or re-matched with ``skip_first_year`` when at
    least two distinct year values are found). If a language was detected,
    a second match without language/country detection is made; when the two
    titles differ, the position of the detected language decides which of
    the two results is returned.
    """

    def find_nodes(tree, props):
        """Yield every node of ``tree`` whose guess holds one of ``props``."""
        wanted = [props] if isinstance(props, base_text_type) else props
        for candidate in tree.nodes():
            if any(prop in candidate.guess for prop in wanted):
                yield candidate

    def warning(title):
        # log the two competing guesses and fall back on the first one
        log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
        return m

    mtree = IterativeMatcher(filename, filetype=filetype)

    # several possible years: the first one is assumed to belong to the
    # title, so the tree is parsed again with that in mind
    distinct_years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))
    if len(distinct_years) >= 2:
        mtree = IterativeMatcher(filename, filetype=filetype,
                                 opts=['skip_first_year'])

    m = mtree.matched()

    if not ('language' in m or 'subtitleLanguage' in m):
        return m

    # a language was found: check that it did not cut a title
    mtree2 = IterativeMatcher(filename, filetype=filetype,
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()

    if m.get('title') is None:
        return m
    if m.get('title') == m2.get('title'):
        return m

    title = next(find_nodes(mtree.match_tree, 'title'))
    title2 = next(find_nodes(mtree2.match_tree, 'title'))

    langs = list(find_nodes(mtree.match_tree,
                            ['language', 'subtitleLanguage']))
    if not langs:
        return warning('A weird error happened with language detection')

    # prefer a language that shows up inside the other potential title,
    # otherwise settle for the first one detected
    lang = next((lng for lng in langs if lng.value in title2.value),
                langs[0])

    # language codes are rarely part of a title, and those should be
    # handled by the Language exceptions anyway
    if len(lang.value) <= 3:
        return m

    # for subtitle files, a language in last position just before the
    # extension is most likely the subtitle language
    parts = clean_string(title.root.value).split()
    if (m['type'] in ['moviesubtitle', 'episodesubtitle']
            and lang.value in parts
            and parts.index(lang.value) == len(parts) - 2):
        return m

    # a language in the middle of the other potential title keeps that
    # title (eg: The Italian Job), except at the very beginning, in which
    # case it is considered an error
    if m2['title'].startswith(lang.value):
        return m
    if lang.value in title2.value:
        return m2

    # a title found inside an explicit group is probably the wrong one
    if title.root.node_at(title.node_idx[:2]).is_explicit():
        return m2
    if title2.root.node_at(title2.node_idx[:2]).is_explicit():
        return m

    return warning('Not sure of the title because of the language position')
def _guess_filename(filename, filetype):
    """Guess the properties of ``filename``, running a second pass if needed.

    A first ``IterativeMatcher`` pass is analysed to decide whether a second
    pass is required:

    * when two or more distinct year values were found, the second pass
      gets the ``skip_first_year`` option;
    * language nodes longer than 3 characters that touch a title/series
      node, and duplicate detections of the same language, are passed to
      ``guess_language`` as spans to skip.

    If the result contains both a language and a title, it is compared with
    a run made with ``nolanguage``/``nocountry``; when the titles differ and
    the first title sits in an explicit group, the second result is
    returned.

    :param filename: name of the file to analyse.
    :param filetype: file type handed to ``IterativeMatcher``.
    :return: the result of ``matched()`` for the retained pass.
    """

    def find_nodes(tree, props):
        """Yields all nodes containing any of the given props."""
        if isinstance(props, base_text_type):
            props = [props]
        for node in tree.nodes():
            if any(prop in node.guess for prop in props):
                yield node

    mtree = IterativeMatcher(filename, filetype=filetype)
    m = mtree.matched()

    second_pass_opts = []
    second_pass_transfo_opts = {}

    # if there are multiple possible years found, we assume the first one is
    # part of the title, reparse the tree taking this into account
    years = set(n.value for n in find_nodes(mtree.match_tree, 'year'))
    if len(years) >= 2:
        second_pass_opts.append('skip_first_year')

    to_skip_language_nodes = []

    # start and end offsets of every title/series node
    title_bounds = set()
    for title_node in find_nodes(mtree.match_tree, ['title', 'series']):
        title_bounds.add(title_node.span[0])
        title_bounds.add(title_node.span[1])

    for lang_key in ('language', 'subtitleLanguage'):
        langs = {}
        lang_nodes = set(n for n in find_nodes(mtree.match_tree, lang_key))

        for lang_node in lang_nodes:
            lang = lang_node.guess.get(lang_key, None)
            touches_title = (lang_node.span[0] in title_bounds or
                             lang_node.span[1] in title_bounds)
            if len(lang_node.value) > 3 and touches_title:
                # Language is next or before title, and is not a language
                # code. Add to skip for 2nd pass.

                # if filetype is subtitle and the language appears last, just
                # before the extension, then it is likely a subtitle language
                parts = clean_string(lang_node.root.value).split()
                # membership is checked first: list.index() raises ValueError
                # when the language text is not a whole token of the name
                if (m.get('type') in ('moviesubtitle', 'episodesubtitle') and
                        lang_node.value in parts and
                        parts.index(lang_node.value) == len(parts) - 2):
                    continue

                to_skip_language_nodes.append(lang_node)
            elif lang not in langs:
                langs[lang] = lang_node
            else:
                # The same language was found. Keep the more confident one,
                # and add others to skip for 2nd pass.
                # NOTE(review): confidence is read under 'language' even when
                # lang_key is 'subtitleLanguage' - confirm this is intended
                existing_lang_node = langs[lang]
                if (existing_lang_node.guess.confidence('language') >=
                        lang_node.guess.confidence('language')):
                    # lang_node is to remove
                    to_skip = lang_node
                else:
                    # existing_lang_node is to remove
                    langs[lang] = lang_node
                    to_skip = existing_lang_node
                to_skip_language_nodes.append(to_skip)

    if to_skip_language_nodes:
        second_pass_transfo_opts['guess_language'] = (
            (), {'skip': [{'node_idx': node.parent.node_idx,
                           'span': node.span}
                          for node in to_skip_language_nodes]})

    if second_pass_opts or second_pass_transfo_opts:
        # 2nd pass is needed
        log.info("Running 2nd pass with options: %s" % second_pass_opts)
        log.info("Transfo options: %s" % second_pass_transfo_opts)
        mtree = IterativeMatcher(filename, filetype=filetype,
                                 opts=second_pass_opts,
                                 transfo_opts=second_pass_transfo_opts)

    m = mtree.matched()

    if (('language' not in m and 'subtitleLanguage' not in m) or
            'title' not in m):
        return m

    # if we found some language, make sure we didn't cut a title or sth...
    mtree2 = IterativeMatcher(filename, filetype=filetype,
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()

    if m.get('title') != m2.get('title'):
        # next() gets a default so a tree without a title node cannot raise
        # StopIteration here
        title = next(find_nodes(mtree.match_tree, 'title'), None)

        # if a node is in an explicit group, then the correct title is
        # probably the other one; every other outcome keeps the first guess
        if (title is not None and
                title.root.node_at(title.node_idx[:2]).is_explicit()):
            return m2

    return m
def _guess_filename(filename, filetype):
    """Guess the properties of ``filename`` and arbitrate language vs. title.

    The name is matched once; if a language was detected it is matched a
    second time with the ``nolanguage`` and ``nocountry`` options. When the
    two runs disagree on the title, the position of the detected language
    decides which of the two results is returned.

    :param filename: name of the file to analyse.
    :param filetype: file type handed to ``IterativeMatcher``.
    :return: the ``matched()`` result of the retained run.
    """

    def find_nodes(tree, props):
        """Yields all nodes containing any of the given props."""
        if isinstance(props, base_text_type):
            props = [props]
        for node in tree.nodes():
            if any(prop in node.guess for prop in props):
                yield node

    def warning(title):
        log.warning('%s, guesses: %s - %s' % (title, m.nice_string(), m2.nice_string()))
        return m

    mtree = IterativeMatcher(filename, filetype=filetype)
    m = mtree.matched()

    if 'language' not in m and 'subtitleLanguage' not in m:
        return m

    # if we found some language, make sure we didn't cut a title or sth...
    mtree2 = IterativeMatcher(filename, filetype=filetype,
                              opts=['nolanguage', 'nocountry'])
    m2 = mtree2.matched()

    # the comparison below needs a title on both sides; without this guard
    # the next() lookups raise StopIteration
    if m.get('title') is None or m2.get('title') is None:
        return m

    if m.get('title') != m2.get('title'):
        title = next(find_nodes(mtree.match_tree, 'title'))
        title2 = next(find_nodes(mtree2.match_tree, 'title'))

        langs = list(find_nodes(mtree.match_tree,
                                ['language', 'subtitleLanguage']))
        if not langs:
            return warning('A weird error happened with language detection')

        # find the language that is likely more relevant
        for lng in langs:
            if lng.value in title2.value:
                # if the language was detected as part of a potential title,
                # look at this one in particular
                lang = lng
                break
        else:
            # pick the first one if we don't have a better choice
            lang = langs[0]

        # language code are rarely part of a title, and those
        # should be handled by the Language exceptions anyway
        if len(lang.value) <= 3:
            return m

        # if filetype is subtitle and the language appears last, just before
        # the extension, then it is likely a subtitle language
        parts = clean_string(title.root.value).split()
        # membership is checked first: list.index() raises ValueError when
        # the language text is not a whole token of the cleaned name
        if (m['type'] in ['moviesubtitle', 'episodesubtitle'] and
                lang.value in parts and
                parts.index(lang.value) == len(parts) - 2):
            return m

        # if the language was in the middle of the other potential title,
        # keep the other title (eg: The Italian Job), except if it is at the
        # very beginning, in which case we consider it an error
        if m2['title'].startswith(lang.value):
            return m
        elif lang.value in title2.value:
            return m2

        # if a node is in an explicit group, then the correct title is
        # probably the other one
        if title.root.node_at(title.node_idx[:2]).is_explicit():
            return m2
        elif title2.root.node_at(title2.node_idx[:2]).is_explicit():
            return m

        return warning('Not sure of the title because of the language position')

    return m
def guess_filetype(mtree, filetype):
    """Refine ``filetype`` for the file described by ``mtree``.

    The extension is examined first, then folder names, the ``MOVIES`` and
    ``SERIES`` exception lists and, while the type is still one of the
    generic ``'video'``, ``'subtitle'`` or ``'info'``, a set of episode
    heuristics. A type that is still generic after those heuristics is
    upgraded to the corresponding movie type.

    :param mtree: match tree; ``mtree.string`` and ``mtree.children`` are
        read.
    :param filetype: requested type; ``'autodetect'`` is replaced by
        ``'video'`` or ``'unknown'`` depending on the extension.
    :return: ``(filetype, other)`` where ``other`` is a one-entry dict
        holding either ``container`` or ``extension``.
    """
    # put the filetype inside a dummy container to be able to have the
    # following functions work correctly as closures
    # this is a workaround for python 2 which doesn't have the
    # 'nonlocal' keyword (python 3 does have it)
    filetype_container = [filetype]
    other = {}
    filename = mtree.string
    generic_types = ('video', 'subtitle', 'info')

    def upgrade_episode():
        if filetype_container[0] == 'video':
            filetype_container[0] = 'episode'
        elif filetype_container[0] == 'subtitle':
            filetype_container[0] = 'episodesubtitle'
        elif filetype_container[0] == 'info':
            filetype_container[0] = 'episodeinfo'

    def upgrade_movie():
        if filetype_container[0] == 'video':
            filetype_container[0] = 'movie'
        elif filetype_container[0] == 'subtitle':
            filetype_container[0] = 'moviesubtitle'
        elif filetype_container[0] == 'info':
            filetype_container[0] = 'movieinfo'

    def upgrade_subtitle():
        if 'movie' in filetype_container[0]:
            filetype_container[0] = 'moviesubtitle'
        elif 'episode' in filetype_container[0]:
            filetype_container[0] = 'episodesubtitle'
        else:
            filetype_container[0] = 'subtitle'

    def upgrade_info():
        if 'movie' in filetype_container[0]:
            filetype_container[0] = 'movieinfo'
        elif 'episode' in filetype_container[0]:
            filetype_container[0] = 'episodeinfo'
        else:
            filetype_container[0] = 'info'

    def upgrade(new_type='unknown'):
        # only an 'autodetect' request may be replaced
        if filetype_container[0] == 'autodetect':
            filetype_container[0] = new_type

    # look at the extension first
    fileext = os.path.splitext(filename)[1][1:].lower()
    if fileext in subtitle_exts:
        upgrade_subtitle()
        other = {'container': fileext}
    elif fileext in info_exts:
        upgrade_info()
        other = {'container': fileext}
    elif fileext in video_exts:
        upgrade(new_type='video')
        other = {'container': fileext}
    else:
        upgrade(new_type='unknown')
        other = {'extension': fileext}

    # check whether we are in a 'Movies', 'Tv Shows', ... folder
    folder_rexps = [
        (r'Movies?', upgrade_movie),
        (r'Tv[ _-]?Shows?', upgrade_episode),
        (r'Series', upgrade_episode),
    ]
    for frexp, upgrade_func in folder_rexps:
        frexp = re.compile(frexp, re.IGNORECASE)
        for pathgroup in mtree.children:
            if frexp.match(pathgroup.value):
                upgrade_func()

    # check for a few specific cases which will unintentionally make the
    # following heuristics confused (eg: OSS 117 will look like an episode,
    # season 1, epnum 17, when it is in fact a movie)
    fname = clean_string(filename).lower()
    for m in MOVIES:
        if m in fname:
            log.debug('Found in exception list of movies -> type = movie')
            upgrade_movie()
    for s in SERIES:
        if s in fname:
            log.debug('Found in exception list of series -> type = episode')
            upgrade_episode()

    # now look whether there are some specific hints for episode vs movie
    if filetype_container[0] in generic_types:
        # if we have an episode_rexp (eg: s02e13), it is an episode
        for rexp, _, _ in episode_rexps:
            match = re.search(rexp, filename, re.IGNORECASE)
            if match:
                log.debug('Found matching regexp: "%s" (string = "%s") -> type = episode',
                          rexp, match.group())
                upgrade_episode()
                break

        # if we have a 3-4 digit number that's not a year, maybe an episode
        match = re.search(r'[^0-9]([0-9]{3,4})[^0-9]', filename)
        if match:
            fullnumber = int(match.group()[1:-1])
            # season would be fullnumber // 100
            epnumber = fullnumber % 100
            possible = True

            # check for validity
            if epnumber > 40:
                possible = False
            if valid_year(fullnumber):
                possible = False

            if possible:
                log.debug('Found possible episode number: %s (from string "%s") -> type = episode',
                          epnumber, match.group())
                upgrade_episode()

        # if we have certain properties characteristic of episodes, it is an ep
        for prop, _ in container.find_properties(filename, 'episodeFormat'):
            # one message per hit is enough; the format string used to end
            # with a stray double quote
            log.debug('Found characteristic property of episodes: %s', prop)
            upgrade_episode()
            break

        for prop, _ in container.find_properties(filename, 'format'):
            if container.compute_canonical_form('format', prop.canonical_form) == 'DVB':
                log.debug('Found characteristic property of episodes: %s', prop)
                upgrade_episode()
                break

        # origin-specific type
        if 'tvu.org.ru' in filename:
            log.debug('Found characteristic property of episodes: %s', 'tvu.org.ru')
            upgrade_episode()

        # if no episode info found, assume it's a movie; the message is only
        # logged when the type is still generic, i.e. when no episode hint
        # above has already upgraded it
        if filetype_container[0] in generic_types:
            log.debug('Nothing characteristic found, assuming type = movie')
            upgrade_movie()

    filetype = filetype_container[0]
    return filetype, other