示例#1
0
    def unescape(self, name):
        """ Strips CDATA markers, lowercases the string and unescapes all
            HTML entities in it

        Args:
            name (str): String to convert

        Returns:
            str: Lowercased string with HTML entities unescaped
        """
        # HTMLParser().unescape() was removed in Python 3.9; html.unescape()
        # is the supported equivalent. Fall back for Python 2 interpreters.
        try:
            from html import unescape as unescape_entities
        except ImportError:
            from HTMLParser import HTMLParser
            unescape_entities = HTMLParser().unescape

        # NOTE(review): only ']]' is removed, so the '>' of a full ']]>'
        # terminator is kept - confirm callers strip it beforehand.
        name = name.replace('<![CDATA[', '').replace(']]', '')

        # Lowercasing happens before unescaping, as in the original code.
        return unescape_entities(name.lower())
示例#2
0
文件: normalize.py 项目: seuf/burst
def normalize_string(string, charset=None, replacing=False):
    """
    Decode any string to unicode and normalize it

    Non-unicode input goes through a decoding cascade (quoted-printable,
    JSON, Python literal, latin-1) before the common clean-up steps: control
    characters removal, bad unicode fixing, URL unquoting, CDATA marker
    stripping, HTML entity unescaping and lowercasing.

    :param charset: encoding passed to ``json.loads``
    :type charset: str
    :param string: string to convert
    :type string: str or unicode
    :param replacing: Whether apostrophes (') are removed from the result
    :type replacing: bool
    :return: converted, lowercased unicode (``u''`` if a ``LookupError`` is
        raised while decoding)
    :rtype: unicode
    """
    # Only byte strings need decoding; unicode input skips straight to the
    # normalization steps below.
    if not isinstance(string, unicode):
        try:
            # '=' followed by two hex digits looks like quoted-printable data
            if re.search(u'=[0-9a-fA-F]{2}', string):
                string = py2_decode(string, 'Quoted-printable')

            # Try to interpret the text as a JSON document
            string = json.loads(u'%s' % string, encoding=charset)

        except ValueError:
            # Not valid JSON: try to evaluate it as a Python expression
            try:
                # SECURITY NOTE(review): eval() runs arbitrary code if
                # `string` comes from an untrusted source (e.g. scraped
                # content) - consider ast.literal_eval; confirm with callers.
                string = unicode(eval(string), 'raw_unicode_escape')

            except (SyntaxError, NameError):
                # Not a Python expression either: decode as latin-1
                string = py2_decode(string, 'latin-1')
                pass

            except TypeError:
                # unicode() rejected the evaluated value: decode the raw
                # input instead, dropping undecodable bytes
                string = unicode(string, errors='ignore')
                pass

        except LookupError:
            # presumably an unknown codec/charset name - give up on the string
            return u''

        except TypeError:
            # Last resort: decode the raw input, dropping undecodable bytes
            string = unicode(string, errors='ignore')
            pass

    # Common normalization, applied to decoded and already-unicode input alike
    string = remove_control_chars(string)
    string = fix_bad_unicode(string)
    string = unquote(string)
    # Only ']]' is removed, so the '>' of a full ']]>' terminator is kept
    string = string.replace(u'<![CDATA[', u'').replace(u']]', u'')
    string = HTMLParser().unescape(string)
    if replacing:
        string = string.replace(u"'", '')

    string = string.lower()

    return string
示例#3
0
def clean_str(string):
    """Normalize a text string for further processing.

    Steps, in order: strip HTML tags, unify punctuation, unescape HTML
    entities, lowercase, collapse repeated punctuation marks and trim
    surrounding whitespace.

    Args:
        string: Text to clean.

    Returns:
        The cleaned, lowercased and stripped text.
    """
    # HTMLParser().unescape() was removed in Python 3.9; html.unescape() is
    # the supported equivalent. Fall back for Python 2 interpreters.
    try:
        from html import unescape as unescape_entities
    except ImportError:
        from HTMLParser import HTMLParser
        unescape_entities = HTMLParser().unescape

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', string)

    # Unify punctuation: every character of en_punctuation_set found in the
    # text is replaced by what semi_angle_to_sbc returns for it (presumably
    # its full-width counterpart - confirm against the helper).
    for mark in en_punctuation_set:
        if mark in text:
            text = text.replace(mark, semi_angle_to_sbc(mark))

    # Decode HTML entities, then lowercase all letters
    text = unescape_entities(text).lower()

    # Remove repeated punctuation marks
    for mark in ('?', ',', '……', '。'):
        text = clean_redundant(text, mark)

    return text.strip()
示例#4
0
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is HTML-unescaped, stripped of tags, bracketed notes, dates and
    release counters, lowercased, shortened to its leading title part,
    reduced to Latin/Cyrillic letters and spaces, and finally cleaned of
    release keywords and month names.

    Args:
        torrent_name: Raw torrent title.

    Returns:
        The fingerprint: lowercase words separated by single spaces (may be
        empty).
    """
    # HTMLParser().unescape() was removed in Python 3.9; html.unescape() is
    # the supported equivalent. Fall back for Python 2 interpreters.
    try:
        from html import unescape as unescape_entities
    except ImportError:
        from HTMLParser import HTMLParser
        unescape_entities = HTMLParser().unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = unescape_entities(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(
        r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(
        r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$"
    )
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    cleanup_regexes = (
        additional_date_regex,
        date_regex,
        preceding_square_braces_regex,
        square_braces_regex,
        round_braces_regex,
        angle_braces_regex,
        release_counter_regex,
    )

    # Each regex drops one piece of info per pass, so repeat until the name
    # stops changing.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in cleanup_regexes:
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    # Drop everything after a " /" separator
    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(r"^(national\s+geographic\s*:|наука\s+2\.0)\s+", "",
                          torrent_name)

    torrent_name = re.sub(r"^«([^»]{6,})»", r"\1", torrent_name)

    torrent_name = re.sub(r'^"([^»]{6,})"', r"\1", torrent_name)

    # Keep only the leading part up to the first terminator (":", ".", "?",
    # "!", " - ", " — " or "|") that follows at least 6 title characters.
    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*",
        r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(c for c in torrent_name if c in " " + characters)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # NOTE: a former `replace("г.", "")` at this point was a no-op (all
    # periods are already gone by now), so it has been dropped.
    keyword_regex = re.compile(
        r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keyword_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    # Month names are removed as whole words only; one alternation replaces
    # the per-month regex loop.
    months = (
        "январь", "января",
        "февраль", "февраля",
        "март", "марта",
        "апрель", "апреля",
        "май", "мая",
        "июнь", "июня",
        "июль", "июля",
        "август", "августа",
        "сентябрь", "сентября",
        "октябрь", "октября",
        "ноябрь", "ноября",
        "декабрь", "декабря",
    )
    torrent_name = re.sub(r"\b(?:" + "|".join(months) + r")\b", "",
                          torrent_name)
    # <--

    # Drop several spaces
    return re.sub(r"\s+", " ", torrent_name).strip()
示例#5
0
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is HTML-unescaped, stripped of tags, bracketed notes, dates and
    release counters, lowercased, shortened to its leading title part,
    reduced to Latin/Cyrillic letters and spaces, and finally cleaned of
    release keywords and month names.

    Args:
        torrent_name: Raw torrent title.

    Returns:
        The fingerprint: lowercase words separated by single spaces (may be
        empty).
    """
    # HTMLParser().unescape() was removed in Python 3.9; html.unescape() is
    # the supported equivalent. Fall back for Python 2 interpreters.
    try:
        from html import unescape as unescape_entities
    except ImportError:
        from HTMLParser import HTMLParser
        unescape_entities = HTMLParser().unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = unescape_entities(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    cleanup_regexes = (
        additional_date_regex,
        date_regex,
        preceding_square_braces_regex,
        square_braces_regex,
        round_braces_regex,
        angle_braces_regex,
        release_counter_regex,
    )

    # Each regex drops one piece of info per pass, so repeat until the name
    # stops changing.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in cleanup_regexes:
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    # Drop everything after a " /" separator
    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(
        r"^«([^»]{6,})»", r"\1", torrent_name)

    torrent_name = re.sub(
        r'^"([^»]{6,})"', r"\1", torrent_name)

    # Keep only the leading part up to the first terminator (":", ".", "?",
    # "!", " - ", " — " or "|") that follows at least 6 title characters.
    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*", r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(
        c for c in torrent_name if c in " " + characters)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # NOTE: a former `replace("г.", "")` at this point was a no-op (all
    # periods are already gone by now), so it has been dropped.
    keyword_regex = re.compile(
        r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keyword_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    # Remove month names as whole words only. Plain substring replacement
    # turned "марта" into "а" (the shorter "март" was removed first) and
    # damaged unrelated words such as "майор".
    months = (
        "январь",   "января",
        "февраль",  "февраля",
        "март",     "марта",
        "апрель",   "апреля",
        "май",      "мая",
        "июнь",     "июня",
        "июль",     "июля",
        "август",   "августа",
        "сентябрь", "сентября",
        "октябрь",  "октября",
        "ноябрь",   "ноября",
        "декабрь",  "декабря",
    )
    torrent_name = re.sub(
        r"\b(?:" + "|".join(months) + r")\b", "", torrent_name)
    # <--

    # Drop several spaces
    return re.sub(r"\s+", " ", torrent_name).strip()