def checkMinimumFieldsCorrect(self, guesser, filename, removeType=True):
    """Run ``guesser`` over a YAML ground-truth file and log every mismatch.

    The YAML document is expected to map each filename to guess to a mapping
    of the properties that must be found for it. Differences are only
    reported through ``log.warning``; this method asserts nothing itself.

    Args:
        guesser: callable taking a unicode filename and returning a
            dict-like object holding the guessed properties.
        filename: name of the YAML ground-truth file, resolved relative to
            ``currentPath()``.
        removeType: when true, the ``'type'`` property is deleted from each
            guess before the comparison.
    """
    # Read the fixture through a context manager so that the file handle is
    # closed deterministically rather than left to the garbage collector.
    # NOTE(review): yaml.load can instantiate arbitrary objects; this is only
    # acceptable as long as the ground-truth file is a trusted test fixture.
    with open(join(currentPath(), filename)) as truthFile:
        groundTruth = yaml.load(truthFile.read())

    # A dedicated loop variable avoids shadowing the 'filename' parameter.
    for testedName, required in groundTruth.items():
        if isinstance(testedName, str):
            testedName = testedName.decode('utf-8')

        log.debug('\n' + '-' * 120)
        log.info('Guessing information for file: %s' % to_utf8(testedName))

        found = guesser(testedName)

        # no need for this in the unittests
        if removeType:
            del found['type']
        for prop in ('container', 'mimetype'):
            if prop in found:
                del found[prop]

        # props which are list of just 1 elem should be opened for easier
        # writing of the tests
        for prop in ('language', 'subtitleLanguage', 'other'):
            value = found.get(prop, None)
            if isinstance(value, list) and len(value) == 1:
                found[prop] = value[0]

        # compare all properties
        for prop, value in required.items():
            if prop not in found:
                log.warning('Prop \'%s\' not found in: %s' % (prop, to_utf8(testedName)))
                continue

            actual = found[prop]
            if isinstance(value, basestring) and isinstance(actual, basestring):
                # strings are compared case-insensitively
                if value.lower() != actual.lower():
                    log.warning("Wrong prop value str for '%s': expected = '%s' - received = '%s'" % (prop, to_utf8(value), to_utf8(actual)))
            elif isinstance(value, list) and isinstance(actual, list):
                # lists are compared as unordered, case-insensitive sets
                s1 = set(str(s).lower() for s in value)
                s2 = set(str(s).lower() for s in actual)
                if s1 != s2:
                    log.warning("Wrong prop value list for '%s': expected = '%s' - received = '%s'" % (prop, to_utf8(value), to_utf8(actual)))
            else:
                if actual != value:
                    log.warning("Wrong prop value for '%s': expected = '%s' - received = '%s'" % (prop, to_utf8(value), to_utf8(actual)))

        # report everything the guesser found that the ground truth omits
        for prop, value in found.items():
            if prop not in required:
                log.warning("Found additional info for prop = '%s': '%s'" % (prop, to_utf8(value)))
def __init__(self, filename, filetype='autodetect'):
    """An iterative matcher tries to match different patterns that appear
    in the filename.

    The 'filetype' argument indicates which type of file you want to match.
    If it is 'autodetect', the matcher will try to see whether it can guess
    that the file corresponds to an episode, or otherwise will assume it is
    a movie.

    The recognized 'filetype' values are:
    [ autodetect, subtitle, video, movie, moviesubtitle, episode,
      episodesubtitle ]

    The IterativeMatcher works mainly in 2 steps:

    First, it splits the filename into a match_tree, which is a tree of
    groups which have a semantic meaning, such as episode number, movie
    title, etc...

    The match_tree created looks like the following:

    0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111
    0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000
    0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000
    __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___
    xxxxxttttttttttttt               ffffff  vvvv    xxxxxx  ll lll     xx xxx         ccc
    [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv

    The first 3 lines indicates the group index in which a char in the
    filename is located. So for instance, x264 is the group (0, 4, 1), and
    it corresponds to a video codec, denoted by the letter 'v' in the 4th
    line. (for more info, see guess.matchtree.to_string)

    Second, it tries to merge all this information into a single object
    containing all the found properties, and does some (basic) conflict
    resolution when they arise.

    Raises:
        ValueError: if 'filetype' is not one of the recognized values.
    """
    valid_filetypes = ('autodetect', 'subtitle', 'video',
                       'movie', 'moviesubtitle',
                       'episode', 'episodesubtitle')
    if filetype not in valid_filetypes:
        # The tuple has to be wrapped in a 1-tuple: '%' would otherwise
        # unpack it as seven separate arguments for a single '%s' and raise
        # a TypeError instead of the intended ValueError.
        raise ValueError("filetype needs to be one of %s" % (valid_filetypes,))
    if not isinstance(filename, unicode):
        log.debug('WARNING: given filename to matcher is not unicode...')

    self.match_tree = MatchTree(filename)
    mtree = self.match_tree
    mtree.guess.set('type', filetype, confidence=1.0)

    def apply_transfo(transfo_name, *args, **kwargs):
        # Import guessit.transfo.<transfo_name> by name and run its
        # process() function on the match tree. level=-1 is the Python 2
        # "try a relative import first, then an absolute one" mode.
        transfo = __import__('guessit.transfo.' + transfo_name,
                             globals=globals(), locals=locals(),
                             fromlist=['process'], level=-1)
        transfo.process(mtree, *args, **kwargs)

    # 1- first split our path into dirs + basename + ext
    apply_transfo('split_path_components')

    # 2- guess the file type now (will be useful later)
    apply_transfo('guess_filetype', filetype)
    if mtree.guess['type'] == 'unknown':
        # nothing more can be matched for an unknown kind of file
        return

    # 3- split each of those into explicit groups (separated by parentheses
    #    or square brackets)
    apply_transfo('split_explicit_groups')

    # 4- try to match information for specific patterns
    if mtree.guess['type'] in ('episode', 'episodesubtitle'):
        strategy = ['guess_date', 'guess_video_rexps',
                    'guess_episodes_rexps', 'guess_website',
                    'guess_release_group', 'guess_properties',
                    'guess_weak_episodes_rexps', 'guess_language']
    else:
        strategy = ['guess_date', 'guess_year', 'guess_video_rexps',
                    'guess_website', 'guess_release_group',
                    'guess_properties', 'guess_language']

    for name in strategy:
        apply_transfo(name)

    # more guessers for both movies and episodes
    for name in ['guess_bonus_features']:
        apply_transfo(name)

    # split into '-' separated subgroups (with required separator chars
    # around the dash)
    apply_transfo('split_on_dash')

    # 5- try to identify the remaining unknown groups by looking at their
    #    position relative to other known elements
    if mtree.guess['type'] in ('episode', 'episodesubtitle'):
        apply_transfo('guess_episode_info_from_position')
    else:
        apply_transfo('guess_movie_title_from_position')

    # 6- perform some post-processing steps
    apply_transfo('post_process')

    log.debug('Found match tree:\n%s' % (to_utf8(unicode(mtree))))
def __str__(self):
    """Return the unicode representation of this object passed through ``to_utf8``."""
    as_unicode = unicode(self)
    return to_utf8(as_unicode)
def __init__(self, filename, filetype="autodetect"):
    """An iterative matcher tries to match different patterns that appear
    in the filename.

    The 'filetype' argument indicates which type of file you want to match.
    If it is 'autodetect', the matcher will try to see whether it can guess
    that the file corresponds to an episode, or otherwise will assume it is
    a movie.

    The recognized 'filetype' values are:
    [ autodetect, subtitle, video, movie, moviesubtitle, episode,
      episodesubtitle ]

    The IterativeMatcher works mainly in 2 steps:

    First, it splits the filename into a match_tree, which is a tree of
    groups which have a semantic meaning, such as episode number, movie
    title, etc...

    The match_tree created looks like the following:

    0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111
    0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000
    0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000
    __________________(The.Prestige).______.[____.HP.______.{__-___}.St{__-___}.Chaps].___
    xxxxxttttttttttttt               ffffff  vvvv    xxxxxx  ll lll     xx xxx         ccc
    [XCT].Le.Prestige.(The.Prestige).DVDRip.[x264.HP.He-Aac.{Fr-Eng}.St{Fr-Eng}.Chaps].mkv

    The first 3 lines indicates the group index in which a char in the
    filename is located. So for instance, x264 is the group (0, 4, 1), and
    it corresponds to a video codec, denoted by the letter 'v' in the 4th
    line. (for more info, see guess.matchtree.tree_to_string)

    Second, it tries to merge all this information into a single object
    containing all the found properties, and does some (basic) conflict
    resolution when they arise.

    On return, ``self.parts`` holds the list of guesses that were found and
    ``self.match_tree`` holds the annotated match tree.

    Raises:
        ValueError: if 'filetype' is not one of the recognized values.
    """
    if filetype not in ("autodetect", "subtitle", "video", "movie", "moviesubtitle", "episode", "episodesubtitle"):
        # call form of raise: the 'raise Class, "message"' statement form is
        # a Python 2 only spelling
        raise ValueError(
            "filetype needs to be one of ('autodetect', 'subtitle', 'video', 'movie', 'moviesubtitle', 'episode', 'episodesubtitle')"
        )
    if not isinstance(filename, unicode):
        log.debug("WARNING: given filename to matcher is not unicode...")

    match_tree = []
    result = []  # list of found metadata

    def guessed(match_dict, confidence):
        # Wrap the raw properties in a formatted Guess, record it in
        # 'result' and hand it back to the caller.
        guess = format_guess(Guess(match_dict, confidence=confidence))
        result.append(guess)
        log.debug("Found with confidence %.2f: %s" % (confidence, guess))
        return guess

    def update_found(leftover, group_pos, guess):
        # Blank the group at 'group_pos' in the match tree, attach 'guess'
        # to it, and return 'leftover' without that group.
        pidx, eidx, gidx = group_pos
        group = match_tree[pidx][eidx][gidx]
        match_tree[pidx][eidx][gidx] = (group[0], deleted * len(group[0]), guess)
        return [g for g in leftover if g[1] != group_pos]

    # 1- first split our path into dirs + basename + ext
    match_tree = split_path_components(filename)

    # try to detect the file type
    filetype, other = guess_filetype(filename, filetype)
    guessed({"type": filetype}, confidence=1.0)
    extguess = guessed(other, confidence=1.0)

    # guess the mimetype of the filename
    # TODO: handle other mimetypes not found on the default type_maps
    # mimetypes.types_map['.srt']='text/subtitle'
    mime, _ = mimetypes.guess_type(filename, strict=False)
    if mime is not None:
        guessed({"mimetype": mime}, confidence=1.0)

    # remove the extension from the match tree, as all indices relative to
    # the filename groups assume the basename is the last one
    fileext = match_tree.pop(-1)[1:].lower()

    # 2- split each of those into explicit groups, if any
    # note: be careful, as this might split some regexps with more
    #       confidence such as Alfleni-Team, or [XCT] or split a date such
    #       as (14-01-2008)
    match_tree = [split_explicit_groups(part) for part in match_tree]

    # 3- try to match information in decreasing order of confidence and
    #    blank the matching group in the string if we found something
    for pathpart in match_tree:
        for gidx, explicit_group in enumerate(pathpart):
            pathpart[gidx] = guess_groups(explicit_group, result, filetype=filetype)

    # 4- try to identify the remaining unknown groups by looking at their
    #    position relative to other known elements
    if filetype in ("episode", "episodesubtitle"):
        eps = find_group(match_tree, "episodeNumber")
        if eps:
            match_tree = match_from_epnum_position(match_tree, eps[0], guessed, update_found)

        leftover = leftover_valid_groups(match_tree)

        if not eps:
            # if we don't have the episode number, but at least 2 groups in
            # the last path group, then it's probably series - eptitle
            last_part = len(match_tree) - 1
            title_candidates = [
                g
                for g in leftover_valid_groups(match_tree)
                if g[1][0] == last_part and g[0].lower() not in non_episode_title
            ]
            if len(title_candidates) >= 2:
                guess = guessed({"series": title_candidates[0][0]}, confidence=0.4)
                leftover = update_found(leftover, title_candidates[0][1], guess)
                guess = guessed({"title": title_candidates[1][0]}, confidence=0.4)
                leftover = update_found(leftover, title_candidates[1][1], guess)

        # if there's a path group that only contains the season info, then
        # the previous one is most likely the series title
        # (ie: .../series/season X/...)
        eps = [
            gpos
            for gpos in find_group(match_tree, "season")
            if "episodeNumber" not in get_group(match_tree, gpos)[2]
        ]
        if eps:
            pidx, eidx, gidx = eps[0]
            previous = [group for group in leftover if group[1][0] == pidx - 1]
            if len(previous) == 1:
                guess = guessed({"series": previous[0][0]}, confidence=0.5)
                leftover = update_found(leftover, previous[0][1], guess)

        # reduce the confidence of unlikely series
        for guess in result:
            if "series" in guess:
                if guess["series"].lower() in unlikely_series:
                    guess.set_confidence("series", guess.confidence("series") * 0.5)

    elif filetype in ("movie", "moviesubtitle"):
        leftover_all = leftover_valid_groups(match_tree)

        # specific cases:
        #  - movies/tttttt (yyyy)/tttttt.ccc
        #
        # This is a best-effort heuristic: any lookup failure (path too
        # short, no year group, no leftover group, ...) simply abandons it.
        # 'except Exception' keeps that behaviour without also swallowing
        # KeyboardInterrupt / SystemExit like a bare 'except' would.
        try:
            if match_tree[-3][0][0][0].lower() == "movies":
                # Note: taking the title straight from the single group of
                # the parent directory was considered too generic, as it
                # might solve all the unittests since they all contain
                # 'movies' in their path.
                parent_idx = len(match_tree) - 2
                year_group = [gpos for gpos in find_group(match_tree, "year") if gpos[0] == parent_idx][0]
                leftover = leftover_valid_groups(
                    match_tree,
                    valid=lambda g: ((g[0] and g[0][0] not in sep) and g[1][0] == len(match_tree) - 2),
                )
                if len(match_tree[-2]) == 2 and year_group[1] == 1:
                    title = leftover[0]
                    guess = guessed({"title": clean_string(title[0])}, confidence=0.8)
                    update_found(leftover_all, title[1], guess)
                else:
                    # otherwise use the first leftover group located before
                    # the year inside the same path part
                    leftover = [
                        g
                        for g in leftover_all
                        if (g[1][0] == year_group[0] and g[1][1] < year_group[1] and g[1][2] < year_group[2])
                    ]
                    leftover = sorted(leftover, key=lambda x: x[1])
                    title = leftover[0]
                    guess = guessed({"title": title[0]}, confidence=0.8)
                    leftover = update_found(leftover, title[1], guess)
        except Exception:
            pass

        # if we have either format or videoCodec in the folder containing
        # the file or one of its parents, then we should probably look for
        # the title in there rather than in the basename
        props = [
            g
            for g in (
                find_group(match_tree, "videoCodec")
                + find_group(match_tree, "format")
                + find_group(match_tree, "language")
            )
            if g[0] <= len(match_tree) - 2
        ]
        leftover = None
        if props and all(g[0] == props[0][0] for g in props):
            leftover = [g for g in leftover_all if g[1][0] == props[0][0]]

        if props and leftover:
            guess = guessed({"title": leftover[0][0]}, confidence=0.7)
            leftover = update_found(leftover, leftover[0][1], guess)
        else:
            # first leftover group in the last path part sounds like a good
            # candidate for title, except if it's only one word and that the
            # first group before has at least 3 words in it (case where the
            # filename contains an 8 chars short name and the movie title is
            # actually in the parent directory name)
            leftover = [g for g in leftover_all if g[1][0] == len(match_tree) - 1]
            if leftover:
                title, (pidx, eidx, gidx) = leftover[0]
                previous_pgroup_leftover = [g for g in leftover_all if g[1][0] == pidx - 1]

                if (
                    title.count(" ") == 0
                    and previous_pgroup_leftover
                    and previous_pgroup_leftover[0][0].count(" ") >= 2
                ):
                    guess = guessed({"title": previous_pgroup_leftover[0][0]}, confidence=0.6)
                    leftover = update_found(leftover, previous_pgroup_leftover[0][1], guess)
                else:
                    guess = guessed({"title": title}, confidence=0.6)
                    leftover = update_found(leftover, leftover[0][1], guess)
            else:
                # if there were no leftover groups in the last path part,
                # look in the one before that
                previous_pgroup_leftover = [g for g in leftover_all if g[1][0] == len(match_tree) - 2]
                if previous_pgroup_leftover:
                    guess = guessed({"title": previous_pgroup_leftover[0][0]}, confidence=0.6)
                    leftover = update_found(leftover, previous_pgroup_leftover[0][1], guess)

    # 5- perform some post-processing steps

    # 5.1- try to promote language to subtitle language where it makes sense
    for pidx, eidx, gidx in find_group(match_tree, "language"):
        string, remaining, guess = get_group(match_tree, (pidx, eidx, gidx))

        def promote_subtitle():
            # Both heuristics below can match the same group; once the
            # language has been promoted there is nothing left to move.
            # NOTE(review): presumably Guess behaves like a dict, in which
            # case a second promotion would fail on the deleted key.
            if "language" not in guess:
                return
            guess.set("subtitleLanguage", guess["language"], confidence=guess.confidence("language"))
            del guess["language"]

        # - if we matched a language in a file with a sub extension and that
        #   the group is the last group of the filename, it is probably the
        #   language of the subtitle (eg: 'xxx.english.srt')
        if fileext in subtitle_exts and pidx == len(match_tree) - 1 and eidx == len(match_tree[pidx]) - 1:
            promote_subtitle()

        # - if a language is in an explicit group just preceded by "st", it
        #   is a subtitle language (eg: '...st[fr-eng]...')
        if eidx > 0:
            previous = get_group(match_tree, (pidx, eidx - 1, -1))
            if previous[0][-2:].lower() == "st":
                promote_subtitle()

    # re-append the extension now
    match_tree.append([[(fileext, deleted * len(fileext), extguess)]])

    self.parts = result
    self.match_tree = match_tree

    # a leading '/' gets a space prepended before the filename is logged
    # under the tree dump
    if filename.startswith("/"):
        filename = " " + filename

    log.debug("Found match tree:\n%s\n%s" % (to_utf8(tree_to_string(match_tree)), to_utf8(filename)))
def detect_filename(filename, filetype, info=None):
    """Guess the properties of ``filename`` and print them to standard output.

    Args:
        filename: name of the file to analyse; a byte string is decoded as
            UTF-8 first.
        filetype: file type passed through to ``guess_file_info``.
        info: list passed through to ``guess_file_info``; defaults to
            ``['filename']`` when omitted.
    """
    # Build the default per call: a list literal in the signature would be a
    # single object shared (and possibly mutated) across every call.
    if info is None:
        info = ['filename']

    if isinstance(filename, str):
        filename = filename.decode('utf-8')

    # Single-argument print calls emit the same text as the former
    # "print 'label:', value" statements.
    print('For: %s' % to_utf8(filename))
    print('GuessIt found: %s' % guess_file_info(filename, filetype, info).nice_string())