def leftover_valid_groups(match_tree, valid=lambda s: len(s[0]) > 3):
    """Collect the valid string groups of `match_tree` that are still unmatched.

    Every group yielded by iterate_groups() that carries no guess has its
    remaining text cleaned; the resulting (cleaned_str, group_pos) pair is
    kept when `valid` accepts it (by default: cleaned string longer than 3
    characters).

    Returns the list of accepted (cleaned_str, group_pos) pairs.
    """
    unmatched = (
        (clean_string(remaining), gpos)
        for gpos, (_group, remaining, guess) in iterate_groups(match_tree)
        if not guess
    )
    return [candidate for candidate in unmatched if valid(candidate)]
def guess_language(string):
    """Guess the language mentioned in `string`, if any.

    Detection is delegated to search_language(), which yields the language,
    the span of the match and a confidence. When a language is found, the
    text located before the match is cleaned, lower-cased and split on
    spaces: if the word 'sub' is among those words, the language is reported
    under 'subtitleLanguage', otherwise under 'language'.

    Returns:
        A (Guess, span) pair, or (None, None) when no language was found.
    """
    language, span, confidence = search_language(string)
    if not language:
        return None, None

    # is it a subtitle language?
    words_before = clean_string(string[:span[0]]).lower().split(' ')
    prop = 'subtitleLanguage' if 'sub' in words_before else 'language'
    return Guess({prop: language}, confidence=confidence), span
def format_guess(guess):
    """Format all the found values to their natural type.

    For instance, a year would be stored as an int value, etc...

    Values of the numeric properties are converted with int(); any other
    string value is passed through canonical_form() (the 'edition' property
    is additionally cleaned with clean_string() first). Non-string values of
    other properties are left untouched.

    Note that this modifies the dictionary given as input.

    Returns:
        The same `guess` object, for convenience.
    """
    # Python 2 exposes `basestring`; fall back to `str` where it is absent.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    int_props = ("season", "episodeNumber", "year", "cdNumber", "cdNumberTotal")

    # iterate over a snapshot, since the values are rewritten in place
    for prop, value in list(guess.items()):
        if prop in int_props:
            guess[prop] = int(guess[prop])
        elif isinstance(value, string_types):
            if prop in ("edition",):
                value = clean_string(value)
            guess[prop] = canonical_form(value)

    return guess
    def __init__(self, filename, filetype='autodetect', opts=None, transfo_opts=None):
        # Validate the arguments, normalize the filename, build the MatchTree
        # for it and store everything on the instance.
        #
        #   filename:     name to match; on Python 2 a non-unicode value is
        #                 decoded as UTF-8 after logging a warning.
        #   filetype:     must be one of `valid_filetypes` below.
        #   opts:         list of option names (defaults to an empty list).
        #   transfo_opts: dict of { transfo_name: (args, kwargs) } (defaults
        #                 to an empty dict).
        #
        # Raises ValueError when opts, transfo_opts or filetype is invalid.
        #
        # NOTE(review): the end of this method looks truncated in this view
        # (see the notes further down) - confirm against the full file.
        if opts is None:
            opts = []
        if not isinstance(opts, list):
            # NOTE(review): the '%s' placeholders are not interpolated here;
            # the values are passed as extra arguments to ValueError.
            raise ValueError('opts must be a list of option names! Received: type=%s val=%s',
                             type(opts), opts)

        if transfo_opts is None:
            transfo_opts = {}
        if not isinstance(transfo_opts, dict):
            # NOTE(review): same as above, the message is not formatted with
            # the trailing arguments.
            raise ValueError('transfo_opts must be a dict of { transfo_name: (args, kwargs) }. ' +
                             'Received: type=%s val=%s', type(transfo_opts), transfo_opts)

        valid_filetypes = ('autodetect', 'subtitle', 'info', 'video',
                           'movie', 'moviesubtitle', 'movieinfo',
                           'episode', 'episodesubtitle', 'episodeinfo')
        if filetype not in valid_filetypes:
            raise ValueError("filetype needs to be one of %s" % valid_filetypes)
        # Python 2 only: byte strings are decoded as UTF-8
        if not PY3 and not isinstance(filename, unicode):
            log.warning('Given filename to matcher is not unicode...')
            filename = filename.decode('utf-8')

        filename = normalize_unicode(filename)

        self.filename = filename
        self.match_tree = MatchTree(filename)
        self.filetype = filetype
        self.opts = opts
        self.transfo_opts = transfo_opts
        self._transfo_calls = []

        # sanity check: make sure we don't process a (mostly) empty string
        # NOTE(review): this `if` has no body in this view, and no `try:`
        # precedes the `except` below - lines appear to be missing here.
        if clean_string(filename) == '':

            mtree = self.match_tree
            # the requested filetype is recorded on the tree with full confidence
            mtree.guess.set('type', filetype, confidence=1.0)

            # NOTE(review): the body of this loop is not visible in this view.
            for transformer in transformers.extensions.objects():

            log.debug('Found match tree:\n%s' % u(mtree))
        except TransfoException as e:
            # a failing transformer is only logged at debug level
            log.debug('An error has occured in Transformer %s: %s' % (e.transformer, e))
    def __init__(self, filename, options=None, **kwargs):
        """Build the match tree for `filename` and run all transformers on it.

        Args:
            filename: name to analyse. On Python 2 a non-unicode value is
                decoded as UTF-8 after a warning is logged.
            options: optional mapping of options; it is copied, never
                modified in place.
            **kwargs: additional options, used only for keys that are absent
                from `options` or whose value there is falsy.

        Nothing is processed when the cleaned filename is empty. A
        TransformerException raised while processing is logged at debug
        level and not propagated.
        """
        options = dict(options or {})
        for k, v in kwargs.items():
            if k not in options or not options[k]:
                options[k] = v  # options dict has priority over keyword arguments
        if not PY3 and not isinstance(filename, unicode):
            log.warning('Given filename to matcher is not unicode...')
            filename = filename.decode('utf-8')

        filename = normalize_unicode(filename)
        self.match_tree = MatchTree(filename)
        self.options = options
        self._transfo_calls = []

        # sanity check: make sure we don't process a (mostly) empty string
        if clean_string(filename) == '':
            return

        from guessit.plugins import transformers

        try:
            mtree = self.match_tree
            if 'type' in self.options:
                # a user-supplied type is only a hint: zero confidence
                mtree.guess.set('type', self.options['type'], confidence=0.0)

            # Process
            for transformer in transformers.all_transformers():
                self._process(transformer, False)

            # Post-process
            for transformer in transformers.all_transformers():
                self._process(transformer, True)

            log.debug('Found match tree:\n%s' % u(mtree))
        except TransformerException as e:
            log.debug('An error has occurred in Transformer %s: %s' %
                      (e.transformer, e))
    def __init__(self, filename, options=None, **kwargs):
        """Build the match tree for `filename` and run all transformers on it.

        Args:
            filename: name to analyse. On Python 2 a non-unicode value is
                decoded as UTF-8 after a warning is logged.
            options: optional mapping of options; it is copied, never
                modified in place.
            **kwargs: additional options, used only for keys that are absent
                from `options` or whose value there is falsy.

        Nothing is processed when the cleaned filename is empty. A
        TransformerException raised while processing is logged at debug
        level and not propagated.
        """
        options = dict(options or {})
        for k, v in kwargs.items():
            if k not in options or not options[k]:
                options[k] = v  # options dict has priority over keyword arguments
        if not PY3 and not isinstance(filename, unicode):
            log.warning('Given filename to matcher is not unicode...')
            filename = filename.decode('utf-8')

        filename = normalize_unicode(filename)
        self.match_tree = MatchTree(filename)
        self.options = options
        self._transfo_calls = []

        # sanity check: make sure we don't process a (mostly) empty string
        if clean_string(filename) == '':
            return

        from guessit.plugins import transformers

        try:
            mtree = self.match_tree
            if 'type' in self.options:
                # a user-supplied type is only a hint: zero confidence
                mtree.guess.set('type', self.options['type'], confidence=0.0)

            # Process
            for transformer in transformers.all_transformers():
                self._process(transformer, False)

            # Post-process
            for transformer in transformers.all_transformers():
                self._process(transformer, True)

            log.debug('Found match tree:\n%s' % u(mtree))
        except TransformerException as e:
            log.debug('An error has occured in Transformer %s: %s' % (e.transformer, e))
    def guess_filetype(self, mtree, options=None):
        """Guess the type of the file described by `mtree`.

        The starting point is the "type" already stored in `mtree.guess` (if
        any). It is refined by the first of the following hints that applies:
        the file extension, a path group named like a movies / tv shows
        folder, the MOVIES / SERIES exception lists, episode number patterns,
        episode-specific properties, weak episode patterns (TV / WEB sources
        only) and the release website name. Without any hint, subtitle, info
        and video files are assumed to be movies; anything else is "unknown".

        Args:
            mtree: match tree; its `string`, `children` and `guess`
                attributes are read.
            options: optional mapping; only the "name_only" key is consulted.

        Returns:
            A (filetype, other) pair, where `other` is a dict holding either
            a "container" or an "extension" entry, or is empty.
        """
        options = options or {}

        # put the filetype inside a dummy container to be able to have the
        # following functions work correctly as closures
        # this is a workaround for python 2 which doesn't have the
        # 'nonlocal' keyword which we could use here in the upgrade_* functions
        # (python 3 does have it)
        filetype_container = [mtree.guess.get("type")]
        other = {}
        filename = mtree.string

        def upgrade_episode():
            if filetype_container[0] == "subtitle":
                filetype_container[0] = "episodesubtitle"
            elif filetype_container[0] == "info":
                filetype_container[0] = "episodeinfo"
            elif not filetype_container[0]:
                filetype_container[0] = "episode"

        def upgrade_movie():
            if filetype_container[0] == "subtitle":
                filetype_container[0] = "moviesubtitle"
            elif filetype_container[0] == "info":
                filetype_container[0] = "movieinfo"
            elif not filetype_container[0]:
                filetype_container[0] = "movie"

        def upgrade_subtitle():
            if filetype_container[0] == "movie":
                filetype_container[0] = "moviesubtitle"
            elif filetype_container[0] == "episode":
                filetype_container[0] = "episodesubtitle"
            elif not filetype_container[0]:
                filetype_container[0] = "subtitle"

        def upgrade_info():
            if filetype_container[0] == "movie":
                filetype_container[0] = "movieinfo"
            elif filetype_container[0] == "episode":
                filetype_container[0] = "episodeinfo"
            elif not filetype_container[0]:
                filetype_container[0] = "info"

        # look at the extension first
        fileext = os.path.splitext(filename)[1][1:].lower()
        if fileext in subtitle_exts:
            upgrade_subtitle()
            other = {"container": fileext}
        elif fileext in info_exts:
            upgrade_info()
            other = {"container": fileext}
        elif fileext in video_exts:
            other = {"container": fileext}
        elif fileext and not options.get("name_only"):
            # unknown extension: report it as such, unless only a name was given
            other = {"extension": fileext}

        # check whether we are in a 'Movies', 'Tv Shows', ... folder
        folder_rexps = [
            (r"Movies?", upgrade_movie),
            (r"Films?", upgrade_movie),
            (r"Tv[ _-]?Shows?", upgrade_episode),
            (r"Series?", upgrade_episode),
            (r"Episodes?", upgrade_episode),
        ]
        for frexp, upgrade_func in folder_rexps:
            frexp = re.compile(frexp, re.IGNORECASE)
            for pathgroup in mtree.children:
                if frexp.match(pathgroup.value):
                    upgrade_func()
                    return filetype_container[0], other

        # check for a few specific cases which will unintentionally make the
        # following heuristics confused (eg: OSS 117 will look like an episode,
        # season 1, epnum 17, when it is in fact a movie)
        fname = clean_string(filename).lower()
        for m in self.MOVIES:
            if m in fname:
                self.log.debug("Found in exception list of movies -> type = movie")
                upgrade_movie()
                return filetype_container[0], other
        for s in self.SERIES:
            if s in fname:
                self.log.debug("Found in exception list of series -> type = episode")
                upgrade_episode()
                return filetype_container[0], other

        # now look whether there are some specific hints for episode vs movie
        # if we have an episode_rexp (eg: s02e13), it is an episode
        episode_transformer = get_transformer("guess_episodes_rexps")
        if episode_transformer:
            guess = episode_transformer.guess_episodes_rexps(filename)
            if guess:
                self.log.debug("Found guess_episodes_rexps: %s -> type = episode", guess)
                upgrade_episode()
                return filetype_container[0], other

        properties_transformer = get_transformer("guess_properties")
        if properties_transformer:
            # if we have certain properties characteristic of episodes, it is an ep
            found = properties_transformer.container.find_properties(filename, mtree, "episodeFormat")
            guess = properties_transformer.container.as_guess(found, filename)
            if guess:
                self.log.debug("Found characteristic property of episodes: %s", guess)
                upgrade_episode()
                return filetype_container[0], other

            found = properties_transformer.container.find_properties(filename, mtree, "format")
            guess = properties_transformer.container.as_guess(found, filename)
            if guess and guess["format"] in ("HDTV", "WEBRip", "WEB-DL", "DVB"):
                # Use weak episodes only if TV or WEB source
                weak_episode_transformer = get_transformer("guess_weak_episodes_rexps")
                if weak_episode_transformer:
                    guess = weak_episode_transformer.guess_weak_episodes_rexps(filename)
                    if guess:
                        self.log.debug("Found guess_weak_episodes_rexps: %s -> type = episode", guess)
                        upgrade_episode()
                        return filetype_container[0], other

        website_transformer = get_transformer("guess_website")
        if website_transformer:
            found = website_transformer.container.find_properties(filename, mtree, "website")
            guess = website_transformer.container.as_guess(found, filename)
            if guess:
                for namepart in ("tv", "serie", "episode"):
                    if namepart in guess["website"]:
                        # origin-specific type
                        self.log.debug("Found characteristic property of episodes: %s", guess)
                        upgrade_episode()
                        return filetype_container[0], other

        if filetype_container[0] in ("subtitle", "info") or (not filetype_container[0] and fileext in video_exts):
            # if no episode info found, assume it's a movie
            self.log.debug("Nothing characteristic found, assuming type = movie")
            upgrade_movie()

        if not filetype_container[0]:
            self.log.debug("Nothing characteristic found, assuming type = unknown")
            filetype_container[0] = "unknown"

        return filetype_container[0], other
    def __init__(self, filename, filetype="autodetect"):
        # NOTE(review): this method looks damaged in this view: the docstring
        # below has no closing triple-quote and several statements further
        # down are incomplete (each one is flagged in place). The notes only
        # describe what is visible - confirm against the full file.
        """An iterative matcher tries to match different patterns that appear
        in the filename.

        The 'filetype' argument indicates which type of file you want to match.
        If it is 'autodetect', the matcher will try to see whether it can guess
        that the file corresponds to an episode, or otherwise will assume it is
        a movie.

        The recognized 'filetype' values are:
        [ autodetect, subtitle, movie, moviesubtitle, episode, episodesubtitle ]

        The IterativeMatcher works mainly in 2 steps:

        First, it splits the filename into a match_tree, which is a tree of groups
        which have a semantic meaning, such as episode number, movie title,

        The match_tree created looks like the following:

        0000000000000000000000000000000000000000000000000000000000000000000000000000000000 111
        0000011111111111112222222222222233333333444444444444444455555555666777777778888888 000
        0000000000000000000000000000000001111112011112222333333401123334000011233340000000 000
        xxxxxttttttttttttt               ffffff  vvvv    xxxxxx  ll lll     xx xxx         ccc

        The first 3 lines indicates the group index in which a char in the
        filename is located. So for instance, x264 is the group (0, 4, 1), and
        it corresponds to a video codec, denoted by the letter'v' in the 4th line.
        (for more info, see guess.matchtree.tree_to_string)

         Second, it tries to merge all this information into a single object
         containing all the found properties, and does some (basic) conflict
         resolution when they arise.

        if filetype not in ("autodetect", "subtitle", "video", "movie", "moviesubtitle", "episode", "episodesubtitle"):
            # NOTE(review): `raise X, "msg"` is Python 2-only syntax.
            raise ValueError, "filetype needs to be one of ('autodetect', 'subtitle', 'video', 'movie', 'moviesubtitle', 'episode', 'episodesubtitle')"
        if not isinstance(filename, unicode):
            log.debug("WARNING: given filename to matcher is not unicode...")

        match_tree = []
        result = []  # list of found metadata

        # helper: wrap a dict of found properties into a formatted Guess,
        # log it and return it
        def guessed(match_dict, confidence):
            guess = format_guess(Guess(match_dict, confidence=confidence))
            log.debug("Found with confidence %.2f: %s" % (confidence, guess))
            return guess

        # helper: attach `guess` to the group at `group_pos`, blank its
        # remaining text with the `deleted` filler, and return `leftover`
        # without the entries located at that position
        def update_found(leftover, group_pos, guess):
            pidx, eidx, gidx = group_pos
            group = match_tree[pidx][eidx][gidx]
            match_tree[pidx][eidx][gidx] = (group[0], deleted * len(group[0]), guess)
            return [g for g in leftover if g[1] != group_pos]

        # 1- first split our path into dirs + basename + ext
        match_tree = split_path_components(filename)

        # try to detect the file type
        filetype, other = guess_filetype(filename, filetype)
        guessed({"type": filetype}, confidence=1.0)
        extguess = guessed(other, confidence=1.0)

        # guess the mimetype of the filename
        # TODO: handle other mimetypes not found on the default type_maps
        # mimetypes.types_map['.srt']='text/subtitle'
        mime, _ = mimetypes.guess_type(filename, strict=False)
        if mime is not None:
            guessed({"mimetype": mime}, confidence=1.0)

        # remove the extension from the match tree, as all indices relative
        # to the filename groups assume the basename is the last one
        fileext = match_tree.pop(-1)[1:].lower()

        # 2- split each of those into explicit groups, if any
        # note: be careful, as this might split some regexps with more confidence such as
        #       Alfleni-Team, or [XCT] or split a date such as (14-01-2008)
        match_tree = [split_explicit_groups(part) for part in match_tree]

        # 3- try to match information in decreasing order of confidence and
        #    blank the matching group in the string if we found something
        for pathpart in match_tree:
            for gidx, explicit_group in enumerate(pathpart):
                pathpart[gidx] = guess_groups(explicit_group, result, filetype=filetype)

        # 4- try to identify the remaining unknown groups by looking at their position
        #    relative to other known elements

        if filetype in ("episode", "episodesubtitle"):
            eps = find_group(match_tree, "episodeNumber")
            if eps:
                match_tree = match_from_epnum_position(match_tree, eps[0], guessed, update_found)

            leftover = leftover_valid_groups(match_tree)

            if not eps:
                # if we don't have the episode number, but at least 2 groups in the
                # last path group, then it's probably series - eptitle
                # NOTE(review): the outer filter( call below is never closed
                # in this view.
                title_candidates = filter(
                    lambda g: g[0].lower() not in non_episode_title,
                    filter(lambda g: g[1][0] == len(match_tree) - 1, leftover_valid_groups(match_tree)),
                if len(title_candidates) >= 2:
                    guess = guessed({"series": title_candidates[0][0]}, confidence=0.4)
                    leftover = update_found(leftover, title_candidates[0][1], guess)
                    guess = guessed({"title": title_candidates[1][0]}, confidence=0.4)
                    leftover = update_found(leftover, title_candidates[1][1], guess)

            # if there's a path group that only contains the season info, then the previous one
            # is most likely the series title (ie: .../series/season X/...)
            # NOTE(review): this list expression has no element expression and
            # no closing bracket in this view.
            eps = [
                for gpos in find_group(match_tree, "season")
                if "episodeNumber" not in get_group(match_tree, gpos)[2]

            if eps:
                pidx, eidx, gidx = eps[0]
                previous = [group for group in leftover if group[1][0] == pidx - 1]
                if len(previous) == 1:
                    guess = guessed({"series": previous[0][0]}, confidence=0.5)
                    leftover = update_found(leftover, previous[0][1], guess)

            # reduce the confidence of unlikely series
            for guess in result:
                if "series" in guess:
                    if guess["series"].lower() in unlikely_series:
                        guess.set_confidence("series", guess.confidence("series") * 0.5)

        elif filetype in ("movie", "moviesubtitle"):
            leftover_all = leftover_valid_groups(match_tree)

            # specific cases:
            #  - movies/tttttt (yyyy)/tttttt.ccc
            # NOTE(review): the extra indentation below and the "exit the try
            # catch" remark suggest an enclosing try/except that is not
            # visible in this view.
                if match_tree[-3][0][0][0].lower() == "movies":
                    # Note:too generic, might solve all the unittests as they all contain 'movies'
                    # in their path
                    # if len(match_tree[-2][0]) == 1:
                    #    title = match_tree[-2][0][0]
                    #    guess = guessed({ 'title': clean_string(title[0]) }, confidence = 0.7)
                    #    update_found(leftover_all, title, guess)

                    year_group = filter(lambda gpos: gpos[0] == len(match_tree) - 2, find_group(match_tree, "year"))[0]
                    # NOTE(review): the leftover_valid_groups( call below is
                    # never closed in this view.
                    leftover = leftover_valid_groups(
                        match_tree, valid=lambda g: ((g[0] and g[0][0] not in sep) and g[1][0] == len(match_tree) - 2)
                    if len(match_tree[-2]) == 2 and year_group[1] == 1:
                        title = leftover[0]
                        guess = guessed({"title": clean_string(title[0])}, confidence=0.8)
                        update_found(leftover_all, title[1], guess)
                        raise Exception  # to exit the try catch now

                    # NOTE(review): this list expression has no element
                    # expression and no closing bracket in this view.
                    leftover = [
                        for g in leftover_all
                        if (g[1][0] == year_group[0] and g[1][1] < year_group[1] and g[1][2] < year_group[2])
                    leftover = sorted(leftover, key=lambda x: x[1])
                    title = leftover[0]
                    guess = guessed({"title": title[0]}, confidence=0.8)
                    leftover = update_found(leftover, title[1], guess)

            # if we have either format or videoCodec in the folder containing the file
            # or one of its parents, then we should probably look for the title in
            # there rather than in the basename
            # NOTE(review): the filter( call below is never closed in this view.
            props = filter(
                lambda g: g[0] <= len(match_tree) - 2,
                find_group(match_tree, "videoCodec")
                + find_group(match_tree, "format")
                + find_group(match_tree, "language"),
            leftover = None
            if props and all(g[0] == props[0][0] for g in props):
                leftover = [g for g in leftover_all if g[1][0] == props[0][0]]

            if props and leftover:
                guess = guessed({"title": leftover[0][0]}, confidence=0.7)
                leftover = update_found(leftover, leftover[0][1], guess)

                # NOTE(review): the comments below read like alternative
                # branches; the matching `else:` lines are not visible here.
                # first leftover group in the last path part sounds like a good candidate for title,
                # except if it's only one word and that the first group before has at least 3 words in it
                # (case where the filename contains an 8 chars short name and the movie title is
                #  actually in the parent directory name)
                leftover = [g for g in leftover_all if g[1][0] == len(match_tree) - 1]
                if leftover:
                    title, (pidx, eidx, gidx) = leftover[0]
                    previous_pgroup_leftover = filter(lambda g: g[1][0] == pidx - 1, leftover_all)

                    # NOTE(review): this condition is never closed in this view.
                    if (
                        title.count(" ") == 0
                        and previous_pgroup_leftover
                        and previous_pgroup_leftover[0][0].count(" ") >= 2

                        guess = guessed({"title": previous_pgroup_leftover[0][0]}, confidence=0.6)
                        leftover = update_found(leftover, previous_pgroup_leftover[0][1], guess)

                        guess = guessed({"title": title}, confidence=0.6)
                        leftover = update_found(leftover, leftover[0][1], guess)
                    # if there were no leftover groups in the last path part, look in the one before that
                    previous_pgroup_leftover = filter(lambda g: g[1][0] == len(match_tree) - 2, leftover_all)
                    if previous_pgroup_leftover:
                        guess = guessed({"title": previous_pgroup_leftover[0][0]}, confidence=0.6)
                        leftover = update_found(leftover, previous_pgroup_leftover[0][1], guess)

        # 5- perform some post-processing steps

        # 5.1- try to promote language to subtitle language where it makes sense
        for pidx, eidx, gidx in find_group(match_tree, "language"):
            string, remaining, guess = get_group(match_tree, (pidx, eidx, gidx))

            # helper: move the 'language' value of the current guess to
            # 'subtitleLanguage', keeping its confidence
            def promote_subtitle():
                guess.set("subtitleLanguage", guess["language"], confidence=guess.confidence("language"))
                del guess["language"]

            # - if we matched a language in a file with a sub extension and that the group
            #   is the last group of the filename, it is probably the language of the subtitle
            #   (eg: '')
            # NOTE(review): this `if` has no body in this view.
            if fileext in subtitle_exts and pidx == len(match_tree) - 1 and eidx == len(match_tree[pidx]) - 1:

            # - if a language is in an explicit group just preceded by "st", it is a subtitle
            #   language (eg: '[fr-eng]...')
            if eidx > 0:
                previous = get_group(match_tree, (pidx, eidx - 1, -1))
                # NOTE(review): this `if` has no body in this view.
                if previous[0][-2:].lower() == "st":

        # re-append the extension now
        # NOTE(review): the statement below assigns to a call expression,
        # which is not valid Python - it looks like two lines merged together.
        match_tree.append([[(fileext, deleted * len(fileext), extguess)]]) = result
        self.match_tree = match_tree

        # pad absolute paths with a leading space before logging the tree
        if filename.startswith("/"):
            filename = " " + filename

        log.debug("Found match tree:\n%s\n%s" % (to_utf8(tree_to_string(match_tree)), to_utf8(filename)))
