def unescape(self, name):
    """
    Strip CDATA markers, lowercase the string and unescape its HTML entities

    The string is lowercased *before* entities are unescaped, so characters
    produced by the unescaping itself keep whatever case the entity yields.

    Args:
        name (str): String to convert

    Returns:
        str: Converted string
    """
    try:
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape()
        # is the documented replacement (available since Python 3.4).
        from html import unescape as html_unescape
    except ImportError:
        # Legacy interpreters that have no ``html`` module.
        html_unescape = HTMLParser().unescape

    # Remove the complete ``]]>`` terminator first so that no stray ``>`` is
    # left behind; a bare ``]]`` is still removed as before.
    name = name.replace('<![CDATA[', '').replace(']]>', '').replace(']]', '')
    return html_unescape(name.lower())
def normalize_string(string, charset=None, replacing=False):
    """
    Decode and convert any string to lowercase unicode

    Non-unicode input is first decoded (quoted-printable, JSON, a Python
    literal, or latin-1, in that order of attempts).  The result is then
    passed through ``remove_control_chars``, ``fix_bad_unicode`` and
    ``unquote``, stripped of CDATA markers, HTML-unescaped and lowercased.

    :param string: string to convert
    :type string: str or unicode
    :param charset: encoding handed to ``json.loads``
    :type charset: str
    :param replacing: Whether ``'`` is removed from the result
    :type replacing: bool
    :return: converted unicode, or ``u''`` when decoding raises LookupError
    :rtype: unicode
    """
    try:
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape()
        # is the documented replacement (available since Python 3.4).
        from html import unescape as html_unescape
    except ImportError:
        # Legacy interpreters that have no ``html`` module.
        html_unescape = HTMLParser().unescape

    if not isinstance(string, unicode):
        try:
            # Something that looks like "=XX" is treated as quoted-printable.
            if re.search(u'=[0-9a-fA-F]{2}', string):
                string = py2_decode(string, 'Quoted-printable')

            # NOTE(review): json.loads() dropped the ``encoding`` keyword in
            # Python 3.9; there this call is expected to raise TypeError and
            # end up in the TypeError handler below -- confirm that is the
            # intended behaviour.
            string = json.loads(u'%s' % string, encoding=charset)

        except ValueError:
            try:
                # SECURITY NOTE(review): eval() executes arbitrary
                # expressions.  If ``string`` can come from an untrusted
                # source this must be replaced (e.g. ast.literal_eval);
                # left unchanged here to preserve behaviour.
                string = unicode(eval(string), 'raw_unicode_escape')

            except (SyntaxError, NameError):
                string = py2_decode(string, 'latin-1')

            except TypeError:
                string = unicode(string, errors='ignore')

        except LookupError:
            # Unknown codec / charset: give up on this string.
            return u''

        except TypeError:
            string = unicode(string, errors='ignore')

    string = remove_control_chars(string)
    string = fix_bad_unicode(string)
    string = unquote(string)
    # Remove the complete ``]]>`` terminator first so that no stray ``>`` is
    # left behind; a bare ``]]`` is still removed as before.
    string = string.replace(u'<![CDATA[', u'').replace(u']]>', u'').replace(u']]', u'')
    string = html_unescape(string)
    if replacing:
        string = string.replace(u"'", '')

    string = string.lower()
    return string
def clean_str(string):
    """
    Clean a piece of text: strip tags, normalise punctuation, unescape, lowercase

    Steps, in order: remove HTML tags, convert every character listed in
    ``en_punctuation_set`` with ``semi_angle_to_sbc``, unescape HTML
    entities, lowercase, collapse repeated punctuation via
    ``clean_redundant`` and finally strip surrounding whitespace.

    :param string: text to clean
    :return: cleaned text
    """
    try:
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape()
        # is the documented replacement (available since Python 3.4).
        from html import unescape as html_unescape
    except ImportError:
        # Legacy interpreters that have no ``html`` module.
        html_unescape = HTMLParser().unescape

    # Remove HTML tags
    dr = re.compile(r'<[^>]+>', re.S)
    string = dr.sub('', string)

    # Unify punctuation width (presumably half-width -> full-width; the
    # helper is defined elsewhere -- verify).
    # NOTE(review): this runs before entity unescaping; if
    # en_punctuation_set contains '&', '#' or ';' the entities would be
    # rewritten before they can be unescaped -- confirm.
    for c in en_punctuation_set:
        if c in string:
            string = string.replace(c, semi_angle_to_sbc(c))

    # Unescape HTML entities
    string = html_unescape(string)

    # Lowercase all letters
    string = string.lower()

    # Collapse repeated punctuation marks (order preserved from the original)
    for mark in ('?', ',', '……', '。'):
        string = clean_redundant(string, mark)

    return string.strip()
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is HTML-unescaped, stripped of tags, bracketed blocks, dates and
    release counters, lowercased, shortened to its leading title part,
    reduced to Latin/Cyrillic letters and spaces, and finally cleaned of
    release keywords and month names.

    :param torrent_name: raw torrent title
    :return: normalised fingerprint string (may be empty)
    """
    try:
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape()
        # is the documented replacement (available since Python 3.4).
        from html import unescape as html_unescape
    except ImportError:
        # Legacy interpreters that have no ``html`` module.
        html_unescape = HTMLParser().unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = html_unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(
        r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(
        r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$"
    )
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    # Each regex removes one occurrence per pass, so repeat until the name
    # stops changing.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in (
            additional_date_regex,
            date_regex,
            preceding_square_braces_regex,
            square_braces_regex,
            round_braces_regex,
            angle_braces_regex,
            release_counter_regex,
        ):
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    # Keep only the part before the first " /" separator
    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(
        r"^(national\s+geographic\s*:|наука\s+2\.0)\s+", "", torrent_name)

    torrent_name = re.sub(r"^«([^»]{6,})»", r"\1", torrent_name)
    # NOTE(review): the character class below excludes "»", not '"' --
    # possibly a copy-paste slip; left unchanged to keep fingerprints stable.
    torrent_name = re.sub(r'^"([^»]{6,})"', r"\1", torrent_name)

    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*",
        r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    allowed = set(" " + characters)
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(c for c in torrent_name if c in allowed)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # NOTE(review): every "." has already been replaced above, so this
    # replacement can never match; kept as-is to preserve behaviour.
    torrent_name = torrent_name.replace("г.", "")

    # The group was written "(:?" in the original, a typo for "(?:".  Colons
    # are already filtered out at this point, so the result is unchanged.
    keyword_regex = re.compile(
        r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keyword_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    for month in (
        "январь", "января",
        "февраль", "февраля",
        "март", "марта",
        "апрель", "апреля",
        "май", "мая",
        "июнь", "июня",
        "июль", "июля",
        "август", "августа",
        "сентябрь", "сентября",
        "октябрь", "октября",
        "ноябрь", "ноября",
        "декабрь", "декабря",
    ):
        # Whole words only, so names merely containing a month are kept
        torrent_name = re.sub(r"\b" + month + r"\b", "", torrent_name)
    # <--

    # Drop several spaces
    torrent_name = re.sub(r"\s+", " ", torrent_name).strip()

    return torrent_name.strip()
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is HTML-unescaped, stripped of tags, bracketed blocks, dates and
    release counters, lowercased, shortened to its leading title part,
    reduced to Latin/Cyrillic letters and spaces, and finally cleaned of
    release keywords and month names.

    :param torrent_name: raw torrent title
    :return: normalised fingerprint string (may be empty)
    """
    try:
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape()
        # is the documented replacement (available since Python 3.4).
        from html import unescape as html_unescape
    except ImportError:
        # Legacy interpreters that have no ``html`` module.
        html_unescape = HTMLParser().unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = html_unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    # Each regex removes one occurrence per pass, so repeat until the name
    # stops changing.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in (
            additional_date_regex,
            date_regex,
            preceding_square_braces_regex,
            square_braces_regex,
            round_braces_regex,
            angle_braces_regex,
            release_counter_regex,
        ):
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    # Keep only the part before the first " /" separator
    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(r"^«([^»]{6,})»", r"\1", torrent_name)
    # NOTE(review): the character class below excludes "»", not '"' --
    # possibly a copy-paste slip; left unchanged to keep fingerprints stable.
    torrent_name = re.sub(r'^"([^»]{6,})"', r"\1", torrent_name)

    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*",
        r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    allowed = set(" " + characters)
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(c for c in torrent_name if c in allowed)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # NOTE(review): every "." has already been replaced above, so this
    # replacement can never match; kept as-is to preserve behaviour.
    torrent_name = torrent_name.replace("г.", "")

    # The group was written "(:?" in the original, a typo for "(?:".  Colons
    # are already filtered out at this point, so the result is unchanged.
    keyword_regex = re.compile(
        r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keyword_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    for month in (
        "январь", "января",
        "февраль", "февраля",
        "март", "марта",
        "апрель", "апреля",
        "май", "мая",
        "июнь", "июня",
        "июль", "июля",
        "август", "августа",
        "сентябрь", "сентября",
        "октябрь", "октября",
        "ноябрь", "ноября",
        "декабрь", "декабря",
    ):
        # Match whole words only: a plain substring replace would mangle
        # names that merely contain a month (e.g. "майор" -> "ор").
        torrent_name = re.sub(r"\b" + month + r"\b", "", torrent_name)
    # <--

    # Drop several spaces
    torrent_name = re.sub(r"\s+", " ", torrent_name).strip()

    return torrent_name.strip()