def replace_html_codes(txt):
    """Decode HTML character references in ``txt`` and return the result.

    Processing order:

    1. ``to_utf8`` is applied to the input.
    2. A numeric reference that is not terminated by ``;`` and is followed
       by at least one character other than ``;``, ``^`` or a digit gets the
       missing semicolon inserted (``"&#39s"`` -> ``"&#39;s"``).
    3. All HTML entities are unescaped.
    4. Any ``&quot;`` / ``&amp;`` that is still present after step 3 (which
       happens when the input was escaped twice) is replaced by the literal
       character.

    :param txt: text that may contain HTML entities
    :return: the text with entities decoded
    """
    txt = to_utf8(txt)

    # Repair numeric references that lost their terminating semicolon so the
    # unescape step below recognises them.
    txt = re.sub(r"(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the old API only where the html
    # module does not provide it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape
    txt = unescape(txt)

    # Second pass for double-escaped input. "&quot;" must be handled before
    # "&amp;" to keep the original replacement order.
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt
def normalize(text, emoticon=False, repeat=None):
    """Normalize ``text`` and return the result.

    :param text: input text, possibly containing HTML entities
    :param emoticon: emoticons are kept only when this is not ``False``;
        with the default (exactly ``False``) ``remove_emoticon`` is applied
    :param repeat: when truthy, passed to ``shorten_repeat`` as its second
        argument; otherwise that step is skipped
    :return: the normalized text
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the old API only where the html
    # module does not provide it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    text = unescape(text)
    text = text.replace('\r', '\n')

    # Identity check on purpose: only the literal False triggers removal.
    if emoticon is False:
        text = remove_emoticon(text)

    text = jaconv.h2z(text)

    # Each replacement runs twice so that a doubled trailing vowel
    # (e.g. 'よぉぉ') is collapsed as well.
    text = text.replace('よぉ', 'よ').replace('よぉ', 'よ')
    text = text.replace('よお', 'よ').replace('よお', 'よ')

    if repeat:
        text = shorten_repeat(text, repeat)
    return text
def normalize(text, emoticon=False, repeat=None):
    """Normalize ``text`` and return the result.

    :param text: input text, possibly containing HTML entities
    :param emoticon: with the default (exactly ``False``)
        ``remove_useless_symbol`` is applied and the ``γ⌒ヽ`` fragment is
        dropped; any other value skips that cleanup
    :param repeat: when truthy, passed to ``shorten_repeat`` as its second
        argument; otherwise that step is skipped
    :return: the normalized text
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the old API only where the html
    # module does not provide it (e.g. Python 2).
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    text = unescape(text)
    text = text.replace('\r', '\n')

    # Identity check on purpose: only the literal False triggers removal.
    if emoticon is False:
        text = remove_useless_symbol(text)
        text = text.replace(u'γ⌒ヽ', '')

    text = jctconv.h2z(text)

    # Each replacement runs twice so that a doubled trailing vowel
    # (e.g. u'よぉぉ') is collapsed as well.
    text = text.replace(u'よぉ', u'よ').replace(u'よぉ', u'よ')
    text = text.replace(u'よお', u'よ').replace(u'よお', u'よ')

    if repeat:
        text = shorten_repeat(text, repeat)
    return text
def to_csv_row(category, scraped_regex):
    """Build one CSV row dict from a scraped regex element.

    :param category: value stored under the ``'category'`` key
    :param scraped_regex: indexable collection whose first item exposes the
        scraped regex through a ``text`` attribute
    :return: ``{'category': ..., 'regex': ...}`` or ``None`` when the regex
        contains a newline or any exception occurs while cleaning it (the
        exception is reported on stdout)
    """
    # HTMLParser.unescape() was removed in Python 3.9. Calling it there raises
    # AttributeError, which the broad handler below would swallow, turning
    # every row into None. html.unescape() is the supported replacement.
    from html import unescape

    row = {'category': category}
    try:
        # UTF-8 round trip: a no-op for well-formed text, but it rejects
        # values that cannot be encoded (handled by the except below).
        regex_bytes = bytes(scraped_regex[0].text, encoding='utf-8')
        regex_text = str(regex_bytes, encoding='utf-8')
        unescaped_regex = unescape(regex_text)

        # Data quality check: skip regexes that contain new lines.
        if "\n" in unescaped_regex:
            return None

        clean_regex = unescaped_regex.replace(" ", "")

        # More cleaning: remove optional double quotes surrounding regex.
        if clean_regex.startswith('"') and clean_regex.endswith('"'):
            clean_regex = clean_regex[1:-1]
        row['regex'] = clean_regex
    except Exception as e:
        # Deliberate best-effort: a bad element is reported and skipped
        # instead of aborting the whole export.
        template = 'Exception while escaping regex: type: {0}, args:\n{1!r}'
        msg = template.format(type(e).__name__, e.args)
        print(msg)
        return None
    return row
def get_filename_from_title(title, ext='.m4a'):
    """
    Creates a filename from title

    :param title: track title, may contain HTML entities; when falsy the
        fixed name ``'music'`` is used
    :param ext: extension appended to the result (default ``'.m4a'``)
    :return: ``title`` with HTML entities decoded and every item of
        ``FILENAME_EXCLUDE`` replaced by a space, followed by ``ext``
    """
    if not title:
        return 'music' + ext

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the old API only where the html
    # module does not provide it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    title = unescape(title)
    for excluded in FILENAME_EXCLUDE:
        title = title.replace(excluded, ' ')  # provide readability with space
    return title + ext  # TODO - smart hunt
def normalize_string(string, charset=None, replacing=False):
    """
    Decode any string, convert it to unicode and normalize it

    Values that are not already ``unicode`` are decoded first (see the inline
    comments for the fallback chain). The result is then passed through
    ``remove_control_chars``, ``fix_bad_unicode`` and ``unquote``, stripped of
    CDATA markers, HTML-unescaped and lowercased.

    :param string: string to convert
    :type string: str or unicode
    :param charset: encoding, forwarded to ``json.loads``
    :type charset: str
    :param replacing: whether ``'`` (apostrophe) is removed from the result
    :type replacing: bool
    :return: converted, lowercased unicode; ``u''`` when decoding raises
        ``LookupError``
    :rtype: unicode
    """
    if not isinstance(string, unicode):
        try:
            # "=XX" hex pairs are treated as a sign of quoted-printable data.
            if re.search(u'=[0-9a-fA-F]{2}', string):
                string = py2_decode(string, 'Quoted-printable')
            # First attempt: interpret the value as a JSON document.
            string = json.loads(u'%s' % string, encoding=charset)
        except ValueError:
            # Not valid JSON: fall back to evaluating the value as a Python
            # expression.
            # NOTE(review): eval() executes arbitrary code; if `string` can
            # come from an untrusted source this is a security risk and
            # should be replaced (e.g. with ast.literal_eval) - confirm with
            # the callers before changing.
            try:
                string = unicode(eval(string), 'raw_unicode_escape')
            except (SyntaxError, NameError):
                # Not a Python expression either: decode as latin-1.
                string = py2_decode(string, 'latin-1')
                pass
            except TypeError:
                string = unicode(string, errors='ignore')
                pass
        except LookupError:
            # presumably an unknown codec/charset - the value is given up on
            return u''
        except TypeError:
            string = unicode(string, errors='ignore')
            pass
    string = remove_control_chars(string)
    string = fix_bad_unicode(string)
    string = unquote(string)
    # Only "<![CDATA[" and "]]" are removed; a ">" that followed "]]" stays.
    string = string.replace(u'<![CDATA[', u'').replace(u']]', u'')
    string = HTMLParser().unescape(string)
    if replacing:
        string = string.replace(u"'", '')
    string = string.lower()
    return string
def _ircfy_tweet(self, tweet):
    """Takes a twitter status and outputs irc message.

    :param tweet: status object; the attributes read here are ``text``,
        ``urls``, ``retweeted_status`` and ``user.screen_name``
    :return: single-line message ``<bold>screen_name<bold>: text`` after
        HTML-unescaping and processing by ``self._handle_url_expansion``
    """
    message = tweet.text
    # NOTE(review): the original re-assigned ``urls = tweet.urls`` inside the
    # retweet branch as well, so the URLs always come from the outer tweet.
    urls = tweet.urls

    if tweet.retweeted_status:
        #HACK: because iPhone sucks and does not correctly handle RT
        message = "RT @{0}: {1}".format(
            tweet.retweeted_status.user.screen_name,
            tweet.retweeted_status.text)

    # HTMLParser.unescape() was removed in Python 3.9; with the former bare
    # ``except:`` that failure was logged and entities were never decoded.
    # Prefer html.unescape() and fall back only where it is unavailable.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    try:
        message = unescape(message)
    except Exception:
        # Best effort: keep the raw text, but do not swallow
        # KeyboardInterrupt/SystemExit like a bare except would.
        self.log.exception("Unable to escape message %r", message)

    message = "{surround}{screen_name}{surround}: {message}".format(
        surround=IRC_BOLD,
        screen_name=tweet.user.screen_name,
        message=message)
    # The result must be a single line.
    message = message.replace('\r', '').replace('\n', ' ')

    urls = self._urls_to_dict(urls)
    # 440 is passed through unchanged; presumably a length budget for the
    # message - confirm against _handle_url_expansion.
    message = self._handle_url_expansion(message, urls, 440)
    return message
def clean_html(content):
    """Decode HTML entities in ``content`` and turn non-breaking spaces
    (``\\xa0``) into regular spaces.

    :param content: text that may contain HTML entities
    :return: the decoded text
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the old API only where the html
    # module does not provide it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    content = unescape(content)
    # "&nbsp;" decodes to U+00A0, which is normalised to a plain space.
    content = content.replace('\xa0', ' ')
    return content
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is HTML-unescaped, stripped of tags, bracketed annotations,
    dates and release counters, lowercased, cut at the first sentence-like
    terminator, reduced to latin/cyrillic letters and spaces and finally
    cleared of release keywords and month names.

    :param torrent_name: raw torrent title, may contain HTML entities/tags
    :return: lowercase fingerprint consisting of letters and single spaces
        (may be empty)
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement.
    from html import unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(
        r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(
        r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$"
    )
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    # Every pattern is anchored and removes a single occurrence per call, so
    # the whole set is re-applied until the name stops changing.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in (
            additional_date_regex,
            date_regex,
            preceding_square_braces_regex,
            square_braces_regex,
            round_braces_regex,
            angle_braces_regex,
            release_counter_regex,
        ):
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    # Keep only the part before the first " /" separator.
    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(
        r"^(national\s+geographic\s*:|наука\s+2\.0)\s+", "", torrent_name)
    torrent_name = re.sub(r"^«([^»]{6,})»", r"\1", torrent_name)
    # NOTE(review): the class below excludes "»" rather than '"'; kept as is
    # because changing it would alter existing fingerprints.
    torrent_name = re.sub(r'^"([^»]{6,})"', r"\1", torrent_name)
    # Cut the name after the first terminator that follows at least six
    # allowed characters.
    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*",
        r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    allowed = set(" " + characters)  # built once instead of per character
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(c for c in torrent_name if c in allowed)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # NOTE(review): no dot can be left at this point, so this replace never
    # matches; kept to avoid changing existing fingerprints.
    torrent_name = torrent_name.replace("г.", "")

    # The group used to start with "(:?" (a typo for "(?:"). Colons are
    # already filtered out above, so the match set is unchanged.
    keywords_regex = re.compile(
        r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keywords_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    for month in (
        "январь", "января",
        "февраль", "февраля",
        "март", "марта",
        "апрель", "апреля",
        "май", "мая",
        "июнь", "июня",
        "июль", "июля",
        "август", "августа",
        "сентябрь", "сентября",
        "октябрь", "октября",
        "ноябрь", "ноября",
        "декабрь", "декабря",
    ):
        # Whole words only, so e.g. "майор" is left intact.
        torrent_name = re.sub(r"\b" + month + r"\b", "", torrent_name)
    # <--

    # Drop several spaces
    torrent_name = re.sub(r"\s+", " ", torrent_name).strip()

    return torrent_name.strip()
def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is HTML-unescaped, stripped of tags, bracketed annotations,
    dates and release counters, lowercased, cut at the first sentence-like
    terminator, reduced to latin/cyrillic letters and spaces and finally
    cleared of release keywords and month names.

    :param torrent_name: raw torrent title, may contain HTML entities/tags
    :return: lowercase fingerprint consisting of letters and single spaces
        (may be empty)
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement.
    from html import unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    # Every pattern is anchored and removes a single occurrence per call, so
    # the whole set is re-applied until the name stops changing.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in (
            additional_date_regex,
            date_regex,
            preceding_square_braces_regex,
            square_braces_regex,
            round_braces_regex,
            angle_braces_regex,
            release_counter_regex,
        ):
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    # Keep only the part before the first " /" separator.
    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(
        r"^«([^»]{6,})»", r"\1", torrent_name)
    # NOTE(review): the class below excludes "»" rather than '"'; kept as is
    # because changing it would alter existing fingerprints.
    torrent_name = re.sub(
        r'^"([^»]{6,})"', r"\1", torrent_name)
    # Cut the name after the first terminator that follows at least six
    # allowed characters.
    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*",
        r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    allowed = set(" " + characters)  # built once instead of per character
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(
        c for c in torrent_name if c in allowed)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # NOTE(review): no dot can be left at this point, so this replace never
    # matches; kept to avoid changing existing fingerprints.
    torrent_name = torrent_name.replace("г.", "")

    # The group used to start with "(:?" (a typo for "(?:"). Colons are
    # already filtered out above, so the match set is unchanged.
    keywords_regex = re.compile(
        r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keywords_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    for month in (
        "январь", "января",
        "февраль", "февраля",
        "март", "марта",
        "апрель", "апреля",
        "май", "мая",
        "июнь", "июня",
        "июль", "июля",
        "август", "августа",
        "сентябрь", "сентября",
        "октябрь", "октября",
        "ноябрь", "ноября",
        "декабрь", "декабря",
    ):
        # Match whole words only. A plain substring replace turned "марта"
        # and "августа" into a stray "а" (the shorter form is tried first)
        # and damaged unrelated words such as "майор".
        torrent_name = re.sub(r"\b" + month + r"\b", "", torrent_name)
    # <--

    # Drop several spaces
    torrent_name = re.sub(r"\s+", " ", torrent_name).strip()

    return torrent_name.strip()