Example #1
def search_for_equivalent_numbers(tokens):
    for token in tokens:
        if parser_helper.is_token_isolated(token) or \
                not is_valid_episode_number(token.content):
            continue

        # Find the first enclosed, non-delimiter token
        next_token = Tokens.find_next(token, TokenFlags.NOT_DELIMITER)
        if next_token is None or next_token.category != TokenCategory.BRACKET:
            continue
        next_token = Tokens.find_next(
            next_token, TokenFlags.ENCLOSED | TokenFlags.NOT_DELIMITER)
        if next_token.category != TokenCategory.UNKNOWN:
            continue

        # Check if it's an isolated number
        if not parser_helper.is_token_isolated(next_token) or \
                not next_token.content.isdigit() or \
                not is_valid_episode_number(next_token.content):
            continue

        episode = min(token, next_token, key=lambda t: int(t.content))
        alt_episode = max(token, next_token, key=lambda t: int(t.content))

        set_episode_number(episode.content, episode, validate=False)
        set_alternative_episode_number(alt_episode.content, alt_episode)

        return True

    return False
Example #2
def parse(filename, options=default_options):
    Elements.clear()
    Tokens.clear()

    # Add missing options
    for key, value in default_options.items():
        options.setdefault(key, value)

    Elements.insert(ElementCategory.FILE_NAME, filename)
    if options['parse_file_extension']:
        filename, extension = remove_extension_from_filename(filename)
        if extension:
            Elements.insert(ElementCategory.FILE_EXTENSION, extension)

    if options['ignored_strings']:
        filename = remove_ignored_strings_from_filename(
            filename, options['ignored_strings'])

    if not filename:
        return None

    tokenizer = Tokenizer(filename, options)
    if not tokenizer.tokenize():
        return None

    parser = Parser(options)
    if not parser.parse():
        return None

    return Elements.get_dictionary()
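
A minimal usage sketch follows, assuming these snippets come from the anitopy package and that the parse function above is its public entry point (both are assumptions here); the filename and options shown are only illustrative.

import anitopy

# Parse a release filename; the result is the elements dictionary, or None
# if tokenization or parsing fails (see the early returns in parse() above)
elements = anitopy.parse(
    '[TaigaSubs]_Toradora!_(2008)_-_01v2_-_Tiger_and_Dragon'
    '_[1280x720_H.264_FLAC][1234ABCD].mkv')
if elements is not None:
    print(elements)

# Missing options are filled in from default_options, so a partial
# dictionary is enough to override a single setting
elements = anitopy.parse('Some_Title_-_02.mkv',
                         options={'parse_file_extension': False})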
Example #3
    def search_for_release_group(self):
        token_end = None
        while True:
            # Find the first enclosed unknown token
            if token_end:
                token_begin = Tokens.find_next(
                    token_end, TokenFlags.ENCLOSED | TokenFlags.UNKNOWN)
            else:
                token_begin = Tokens.find(
                    TokenFlags.ENCLOSED | TokenFlags.UNKNOWN)
            if token_begin is None:
                return

            # Continue until a bracket or identifier is found
            token_end = Tokens.find_next(
                token_begin, TokenFlags.BRACKET | TokenFlags.IDENTIFIER)
            if token_end is None:
                return
            if token_end.category != TokenCategory.BRACKET:
                continue

            # Ignore if it's not the first non-delimiter token in group
            previous_token = Tokens.find_previous(
                token_begin, TokenFlags.NOT_DELIMITER)
            if previous_token is not None and \
                    previous_token.category != TokenCategory.BRACKET:
                continue

            # Build release group, token end is a bracket, so we get the
            # previous token to be included in the element
            token_end = Tokens.find_previous(token_end, TokenFlags.VALID)
            parser_helper.build_element(
                ElementCategory.RELEASE_GROUP, token_begin, token_end,
                keep_delimiters=True)
            return
Example #4
    def search_for_episode_title(self):
        token_end = None
        while True:
            # Find the first non-enclosed unknown token
            if token_end:
                token_begin = Tokens.find_next(
                    token_end, TokenFlags.NOT_ENCLOSED | TokenFlags.UNKNOWN)
            else:
                token_begin = Tokens.find(
                    TokenFlags.NOT_ENCLOSED | TokenFlags.UNKNOWN)
            if token_begin is None:
                return

            # Continue until a bracket or identifier is found
            token_end = Tokens.find_next(
                token_begin, TokenFlags.BRACKET | TokenFlags.IDENTIFIER)

            # Ignore if it's only a dash
            if Tokens.distance(token_begin, token_end) <= 2 and \
                    parser_helper.is_dash_character(token_begin.content):
                continue

            # If token end is a bracket, then we get the previous token to be
            # included in the element
            if token_end and token_end.category == TokenCategory.BRACKET:
                token_end = Tokens.find_previous(token_end, TokenFlags.VALID)
            # Build episode title
            parser_helper.build_element(
                ElementCategory.EPISODE_TITLE, token_begin, token_end,
                keep_delimiters=False)
            return
Example #5
def search_for_last_number(tokens):
    for token in tokens:
        token_index = Tokens.get_index(token)

        # Assuming that episode number always comes after the title, first
        # token cannot be what we're looking for
        if token_index == 0:
            continue

        # An enclosed token is unlikely to be the episode number at this point
        if token.enclosed:
            continue

        # Ignore if it's the first non-enclosed, non-delimiter token
        if all([
                t.enclosed or t.category == TokenCategory.DELIMITER
                for t in Tokens.get_list()[:token_index]
        ]):
            continue

        # Ignore if the previous token is "Movie" or "Part"
        previous_token = Tokens.find_previous(token, TokenFlags.NOT_DELIMITER)
        if previous_token.category == TokenCategory.UNKNOWN:
            if previous_token.content.lower() == 'movie' or \
                    previous_token.content.lower() == 'part':
                continue

        # We'll use this number after all
        if set_episode_number(token.content, token, validate=True):
            return True

    return False
Example #6
def is_token_isolated(token):
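    # A token counts as isolated when the nearest non-delimiter token on
    # each side is a bracket (e.g. a number standing alone inside brackets)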
    previous_token = Tokens.find_previous(token, TokenFlags.NOT_DELIMITER)
    if previous_token.category != TokenCategory.BRACKET:
        return False

    next_token = Tokens.find_next(token, TokenFlags.NOT_DELIMITER)
    if next_token.category != TokenCategory.BRACKET:
        return False

    return True
Example #7
    def search_for_isolated_numbers(self):
        for token in Tokens.get_list(TokenFlags.UNKNOWN):
            if not token.content.isdigit() or \
                    not parser_helper.is_token_isolated(token):
                continue

            number = int(token.content)

            # Anime year
            if number >= parser_number.ANIME_YEAR_MIN and \
                    number <= parser_number.ANIME_YEAR_MAX:
                if not Elements.contains(ElementCategory.ANIME_YEAR):
                    Elements.insert(ElementCategory.ANIME_YEAR, token.content)
                    token.category = TokenCategory.IDENTIFIER
                    continue

            # Video resolution
            if number == 480 or number == 720 or number == 1080:
                # If these numbers are isolated, it's more likely for them to
                # be the video resolution rather than the episode number. Some
                # fansub groups use these without the "p" suffix.
                if not Elements.contains(ElementCategory.VIDEO_RESOLUTION):
                    Elements.insert(
                        ElementCategory.VIDEO_RESOLUTION, token.content)
                    token.category = TokenCategory.IDENTIFIER
                    continue
Example #8
def build_element(category, token_begin=None, token_end=None,
                  keep_delimiters=False):
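    # Concatenate the tokens between token_begin and token_end into a single
    # element value, marking the consumed unknown tokens as identifiers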
    element = ''

    for token in Tokens.get_list(begin=token_begin, end=token_end):
        if token.category == TokenCategory.UNKNOWN:
            element += token.content
            token.category = TokenCategory.IDENTIFIER
        elif token.category == TokenCategory.BRACKET:
            element += token.content
        elif token.category == TokenCategory.DELIMITER:
            delimiter = token.content
            if keep_delimiters:
                element += delimiter
            elif token != token_begin and token != token_end:
                if delimiter == ',' or delimiter == '&':
                    element += delimiter
                else:
                    element += ' '

    if not keep_delimiters:
        element = element.strip(' ' + DASHES)

    if element:
        Elements.insert(category, element)
Example #9
def check_anime_season_keyword(token):
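    # Covers both "<ordinal> <keyword>" (e.g. "2nd Season") and
    # "<keyword> <number>" (e.g. "Season 2") arrangements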
    def set_anime_season(first, second, content):
        Elements.insert(ElementCategory.ANIME_SEASON, content)
        first.category = TokenCategory.IDENTIFIER
        second.category = TokenCategory.IDENTIFIER

    previous_token = Tokens.find_previous(token, TokenFlags.NOT_DELIMITER)
    if previous_token:
        number = get_number_from_ordinal(previous_token.content)
        if number:
            set_anime_season(previous_token, token, number)
            return True

    next_token = Tokens.find_next(token, TokenFlags.NOT_DELIMITER)
    if next_token and next_token.content.isdigit():
        set_anime_season(token, next_token, next_token.content)
        return True

    return False
Example #10
def number_comes_before_another_number(token):
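    # Looks for "<number> & <number>" and "<number> of <number>" patterns;
    # with "&" both numbers are recorded as episode numbers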
    separator_token = Tokens.find_next(token, TokenFlags.NOT_DELIMITER)

    if separator_token:
        separator = separator_token.content
        if separator == '&' or separator == 'of':
            other_token = Tokens.find_next(separator_token,
                                           TokenFlags.NOT_DELIMITER)
            if other_token and other_token.content.isdigit():
                set_episode_number(token.content, token, validate=False)
                if separator == '&':
                    set_episode_number(other_token.content,
                                       token,
                                       validate=False)
                separator_token.category = TokenCategory.IDENTIFIER
                other_token.category = TokenCategory.IDENTIFIER
                return True

    return False
Example #11
def search_for_separated_numbers(tokens):
    for token in tokens:
        previous_token = Tokens.find_previous(token, TokenFlags.NOT_DELIMITER)

        # See if the number has a preceding "-" separator
        if previous_token.category == TokenCategory.UNKNOWN and \
                parser_helper.is_dash_character(previous_token.content):
            if set_episode_number(token.content, token, validate=True):
                previous_token.category = TokenCategory.IDENTIFIER
                return True

    return False
Example #12
    def search_for_keywords(self):
        for token in Tokens.get_list(TokenFlags.UNKNOWN):
            word = token.content
            word = word.strip(' -')

            if not word:
                continue
            # Don't bother if the word is a number that cannot be CRC
            if len(word) != 8 and word.isdigit():
                continue

            category = ElementCategory.UNKNOWN
            keyword = keyword_manager.find(keyword_manager.normalize(word))
            if keyword:
                category = keyword.category
                if not self.options['parse_release_group'] and \
                        category == ElementCategory.RELEASE_GROUP:
                    continue
                if not ElementCategory.is_searchable(category) or \
                        not keyword.options.searchable:
                    continue
                if ElementCategory.is_singular(category) and \
                        Elements.contains(category):
                    continue

                if category == ElementCategory.ANIME_SEASON_PREFIX:
                    parser_helper.check_anime_season_keyword(token)
                    continue
                elif category == ElementCategory.EPISODE_PREFIX:
                    if keyword.options.valid:
                        parser_number.check_extent_keyword(
                            ElementCategory.EPISODE_NUMBER, token)
                    continue
                elif category == ElementCategory.RELEASE_VERSION:
                    word = word[1:]  # number without "v"
                elif category == ElementCategory.VOLUME_PREFIX:
                    parser_number.check_extent_keyword(
                        ElementCategory.VOLUME_NUMBER, token)
                    continue
            else:
                if not Elements.contains(ElementCategory.FILE_CHECKSUM) and \
                        parser_helper.is_crc32(word):
                    category = ElementCategory.FILE_CHECKSUM
                elif not Elements.contains(ElementCategory.VIDEO_RESOLUTION) \
                        and parser_helper.is_resolution(word):
                    category = ElementCategory.VIDEO_RESOLUTION

            if category != ElementCategory.UNKNOWN:
                Elements.insert(category, word)
                if keyword is None or keyword.options.identifiable:
                    token.category = TokenCategory.IDENTIFIER
Example #13
def match_type_and_episode_pattern(word, token):
    number_begin = parser_helper.find_number_in_string(word)
    prefix = word[:number_begin]

    keyword = keyword_manager.find(keyword_manager.normalize(prefix),
                                   ElementCategory.ANIME_TYPE)

    if keyword:
        Elements.insert(ElementCategory.ANIME_TYPE, prefix)
        number = word[number_begin:]
        if match_episode_patterns(number, token) or \
                set_episode_number(number, token, validate=True):
            # Split token (we do this last in order to avoid invalidating our
            # token reference earlier)
            token_index = Tokens.get_index(token)
            token.content = number
            Tokens.insert(
                token_index,
                Token(
                    TokenCategory.IDENTIFIER if keyword.options.identifiable
                    else TokenCategory.UNKNOWN, prefix, token.enclosed))
            return True

    return False
Example #14
    def search_for_episode_number(self):
        # List all unknown tokens that contain a number
        tokens = [token for token in Tokens.get_list(TokenFlags.UNKNOWN)
                  if parser_helper.find_number_in_string(token.content) is not
                  None]

        if not tokens:
            return

        Elements.set_check_alt_number(
            Elements.contains(ElementCategory.EPISODE_NUMBER))

        # If a token matches a known episode pattern, it has to be the episode
        # number
        if parser_number.search_for_episode_patterns(tokens):
            return

        if Elements.contains(ElementCategory.EPISODE_NUMBER):
            return  # We have previously found an episode number via keywords

        # From now on, we're only interested in numeric tokens
        tokens = [token for token in tokens if token.content.isdigit()]

        if not tokens:
            return

        # e.g. "01 (176)", "29 (04)"
        if parser_number.search_for_equivalent_numbers(tokens):
            return

        # e.g. " - 08"
        if parser_number.search_for_separated_numbers(tokens):
            return

        # e.g. "[12]", "(2006)"
        if parser_number.search_for_isolated_numbers(tokens):
            return

        # Consider using the last number as a last resort
        parser_number.search_for_last_number(tokens)
Example #15
def check_extent_keyword(category, token):
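    # The token following the prefix keyword (e.g. an episode or volume
    # prefix) is expected to contain the corresponding number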
    next_token = Tokens.find_next(token, TokenFlags.NOT_DELIMITER)

    if next_token is not None and \
            next_token.category == TokenCategory.UNKNOWN:
        if parser_helper.find_number_in_string(next_token.content) \
                is not None:
            if category == ElementCategory.EPISODE_NUMBER:
                if not match_episode_patterns(next_token.content, next_token):
                    set_episode_number(next_token.content,
                                       next_token,
                                       validate=False)
            elif category == ElementCategory.VOLUME_NUMBER:
                if not match_volume_patterns(next_token.content, next_token):
                    set_volume_number(next_token.content,
                                      next_token,
                                      validate=False)
            else:
                return False
            token.category = TokenCategory.IDENTIFIER
            return True

    return False
Example #16
    def _validate_delimiter_tokens(self):
        def find_previous_valid_token(token):
            return Tokens.find_previous(token, TokenFlags.VALID)

        def find_next_valid_token(token):
            return Tokens.find_next(token, TokenFlags.VALID)

        def is_delimiter_token(token):
            return token is not None and \
                   token.category == TokenCategory.DELIMITER

        def is_unknown_token(token):
            return token is not None and \
                   token.category == TokenCategory.UNKNOWN

        def is_single_character_token(token):
            return is_unknown_token(token) and len(token.content) == 1 and \
                   token.content != '-'

        def append_token_to(token, append_to):
            append_to.content += token.content
            token.category = TokenCategory.INVALID

        for token in Tokens.get_list():
            if token.category != TokenCategory.DELIMITER:
                continue

            delimiter = token.content
            prev_token = find_previous_valid_token(token)
            next_token = find_next_valid_token(token)

            # Check for single-character tokens to prevent splitting group
            # names, keywords, episode number, etc.
            if delimiter != ' ' and delimiter != '_':
                if is_single_character_token(prev_token):
                    append_token_to(token, prev_token)
                    while is_unknown_token(next_token):
                        append_token_to(next_token, prev_token)
                        next_token = find_next_valid_token(next_token)
                        if is_delimiter_token(next_token) and \
                                next_token.content == delimiter:
                            append_token_to(next_token, prev_token)
                            next_token = find_next_valid_token(next_token)
                    continue
                if is_single_character_token(next_token):
                    append_token_to(token, prev_token)
                    append_token_to(next_token, prev_token)
                    continue

            # Check for adjacent delimiters
            if is_unknown_token(prev_token) and is_delimiter_token(next_token):
                next_delimiter = next_token.content
                if delimiter != next_delimiter and delimiter != ',':
                    if next_delimiter == ' ' or next_delimiter == '_':
                        append_token_to(token, prev_token)

            elif is_delimiter_token(prev_token) and \
                    is_delimiter_token(next_token):
                prev_delimiter = prev_token.content
                next_delimiter = next_token.content
                if prev_delimiter == next_delimiter and \
                        prev_delimiter != delimiter:
                    token.category = TokenCategory.UNKNOWN  # e.g. "&" in "_&_"

            # Check for other special cases
            if delimiter == '&' or delimiter == '+':
                if is_unknown_token(prev_token) and \
                        is_unknown_token(next_token):
                    if prev_token.content.isdigit() and \
                            next_token.content.isdigit():
                        append_token_to(token, prev_token)
                        append_token_to(next_token, prev_token)  # e.g. "01+02"

        Tokens.update([
            token for token in Tokens.get_list()
            if token.category != TokenCategory.INVALID
        ])
Example #17
    def search_for_anime_title(self):
        enclosed_title = False

        # Find the first non-enclosed unknown token
        token_begin = Tokens.find(TokenFlags.NOT_ENCLOSED | TokenFlags.UNKNOWN)

        # If that doesn't work, find the first unknown token in the second
        # enclosed group, assuming that the first one is the release group
        if token_begin is None:
            enclosed_title = True
            token_begin = Tokens.get(0)
            skipped_previous_group = False
            while token_begin is not None:
                token_begin = Tokens.find_next(token_begin, TokenFlags.UNKNOWN)
                if token_begin is None:
                    break
                # Ignore groups that are composed of non-Latin characters
                if parser_helper.is_mostly_latin_string(token_begin.content):
                    if skipped_previous_group:
                        break  # Found it
                # Get the first unknown token of the next group
                token_begin = Tokens.find_next(token_begin, TokenFlags.BRACKET)
                skipped_previous_group = True

        if token_begin is None:
            return

        # Continue until an identifier (or a bracket, if the title is enclosed)
        # is found
        token_end = Tokens.find_next(
            token_begin, TokenFlags.IDENTIFIER | (
                TokenFlags.BRACKET if enclosed_title else TokenFlags.NONE
            ))

        # If within the interval there's an open bracket without its matching
        # pair, move the upper endpoint back to the bracket
        if not enclosed_title:
            last_bracket = token_end
            bracket_open = False
            for token in Tokens.get_list(TokenFlags.BRACKET, begin=token_begin,
                                         end=token_end):
                last_bracket = token
                bracket_open = not bracket_open
            if bracket_open:
                token_end = last_bracket

        # If the interval ends with an enclosed group (e.g. "Anime Title
        # [Fansub]"), move the upper endpoint back to the beginning of the
        # group. We ignore parentheses in order to keep certain groups (e.g.
        # "(TV)") intact.
        if not enclosed_title:
            token = Tokens.find_previous(token_end, TokenFlags.NOT_DELIMITER)
            while token.category == TokenCategory.BRACKET and \
                    token.content != ')':
                token = Tokens.find_previous(token, TokenFlags.BRACKET)
                if token is not None:
                    token_end = token
                    token = Tokens.find_previous(
                        token_end, TokenFlags.NOT_DELIMITER)

        # Token end is a bracket, so we get the previous token to be included
        # in the element
        token_end = Tokens.find_previous(token_end, TokenFlags.VALID)
        parser_helper.build_element(ElementCategory.ANIME_TITLE, token_begin,
                                    token_end, keep_delimiters=False)
Example #18
    def _add_token(self, category, content, enclosed):
        Tokens.append(Token(category, content, enclosed))
Example #19
    def tokenize(self):
        self._tokenize_by_brackets()
        return not Tokens.empty()
Example #20
def find_next_valid_token(token):
    return Tokens.find_next(token, TokenFlags.VALID)
Example #21
def find_previous_valid_token(token):
    return Tokens.find_previous(token, TokenFlags.VALID)