Python HTMLParser.replace примеры использования

Язык программирования: Python

Пространство имен/Пакет: html.parser

Класс/Тип: HTMLParser

Метод/Функция: replace

Примеров на hotexamples.com: 11

Python HTMLParser.replace — найдено 11 примеров. Это лучшие примеры кода на Python для html.parser.HTMLParser.replace, полученные из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

HTMLParser(30)

feed(30)

reset(30)

__init__(30)

unescape(30)

close(30)

handle_data(14)

replace(8)

strip(6)

get_starttag_text(5)

handle_starttag(5)

decode(5)

split(4)

lower(4)

handle_endtag(3)

handle_comment(2)

encode(2)

handle_startendtag(2)

error(2)

fed(1)

text(1)

strict(1)

_init_(1)

closer(1)

lstrip(1)

getElementsByTagName(1)

important_tag(1)

hrefs(1)

find(1)

findLinks(1)

handle_decl(1)

convert_charrefs(1)

Пример #1

Показать файл

Файл: utils.py Проект: CYBERxNUKE/xbmc-addon

def replace_html_codes(txt):
    """Decode HTML character references in ``txt`` and return the result.

    Steps, in order:

    1. ``txt`` is passed through ``to_utf8`` (defined elsewhere in the
       project; presumably normalises the text encoding -- verify there).
    2. Numeric references that lack their terminating ``;`` (for example
       ``&#39 s``) get one inserted so they can be decoded.
    3. All HTML entities are unescaped.
    4. Any ``&quot;`` / ``&amp;`` still present afterwards (i.e. input that
       was escaped twice) is replaced by the literal character.

    :param txt: text that may contain HTML entities
    :return: the text with entities replaced by the characters they denote
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the legacy method only where the
    # html module does not offer it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    txt = to_utf8(txt)
    # Repair "&#NNN" followed by a non-digit, non-';' run by inserting ';'.
    txt = re.sub(r"(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", txt)
    txt = unescape(txt)
    # Second pass for entities that were double-escaped in the input.
    txt = txt.replace("&quot;", "\"")
    txt = txt.replace("&amp;", "&")
    return txt

Пример #2

Показать файл

Файл: normalize.py Проект: pombredanne/atango

def normalize(text, emoticon=False, repeat=None):
    """Normalise a piece of text for further processing.

    HTML entities are always unescaped and carriage returns are always
    turned into newlines.

    :param text: input text, possibly containing HTML entities
    :param emoticon: when exactly ``False`` (the default) the text is also
        passed through ``remove_emoticon`` and ``jaconv.h2z`` and the
        drawn-out endings ``よぉ`` / ``よお`` are shortened to ``よ``; any
        other value skips all three steps
    :param repeat: when truthy, forwarded to ``shorten_repeat`` together
        with the text
    :return: the normalised text
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the legacy method only where the
    # html module does not offer it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    text = unescape(text).replace('\r', '\n')
    if emoticon is False:
        text = remove_emoticon(text)
        text = jaconv.h2z(text)
        # Each spelling is replaced twice on purpose: a doubled trailing
        # vowel ("よぉぉ") needs two passes to collapse fully.
        for drawn_out in ('よぉ', 'よお'):
            text = text.replace(drawn_out, 'よ').replace(drawn_out, 'よ')
    if repeat:
        text = shorten_repeat(text, repeat)
    return text

Пример #3

Показать файл

Файл: normalize.py Проект: kuhaku/atango

def normalize(text, emoticon=False, repeat=None):
    """Return ``text`` with HTML entities decoded and line ends unified.

    Entity unescaping and the ``'\\r'`` -> ``'\\n'`` replacement always run.

    :param text: input text, possibly containing HTML entities
    :param emoticon: only when this is exactly ``False`` (the default) is
        the text additionally run through ``remove_emoticon`` and
        ``jaconv.h2z`` and are ``よぉ`` / ``よお`` shortened to ``よ``
    :param repeat: if truthy, the text is handed to ``shorten_repeat`` with
        this value as second argument
    :return: the normalised text
    """
    # html.unescape() replaces HTMLParser.unescape(), which no longer exists
    # on Python 3.9+. The legacy method is used only if html lacks it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    text = unescape(text)
    text = text.replace('\r', '\n')
    if emoticon is False:
        text = remove_emoticon(text)
        text = jaconv.h2z(text)
        # Two passes per pattern so that "よぉぉ" / "よおお" also end up "よ".
        text = text.replace('よぉ', 'よ').replace('よぉ', 'よ')
        text = text.replace('よお', 'よ').replace('よお', 'よ')
    if repeat:
        text = shorten_repeat(text, repeat)
    return text

Пример #4

Показать файл

def normalize(text, emoticon=False, repeat=None):
    """Normalise a piece of text for further processing.

    HTML entities are always unescaped and ``'\\r'`` is always replaced by
    ``'\\n'``.

    :param text: input text, possibly containing HTML entities
    :param emoticon: when exactly ``False`` (the default) the text is also
        cleaned with ``remove_useless_symbol``, the sequence ``γ⌒ヽ`` is
        deleted, ``jctconv.h2z`` is applied and ``よぉ`` / ``よお`` are
        shortened to ``よ``; any other value skips these steps
    :param repeat: when truthy, forwarded to ``shorten_repeat`` together
        with the text
    :return: the normalised text
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Interpreters without html.unescape (including
    # Python 2) keep using the legacy method.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    text = unescape(text).replace('\r', '\n')
    if emoticon is False:
        text = remove_useless_symbol(text)
        text = text.replace(u'γ⌒ヽ', '')
        text = jctconv.h2z(text)
        # Replaced twice per spelling so a doubled trailing vowel collapses.
        for drawn_out in (u'よぉ', u'よお'):
            text = text.replace(drawn_out, u'よ').replace(drawn_out, u'よ')
    if repeat:
        text = shorten_repeat(text, repeat)
    return text

Пример #5

Показать файл

Файл: scraper.py Проект: mitdbg/datascienceclass

def to_csv_row(category, scraped_regex):
    """Build one CSV row dict from a scraped regex element.

    :param category: value stored under the ``'category'`` key
    :param scraped_regex: sequence whose first item exposes the raw,
        HTML-escaped regex through a ``.text`` attribute
    :return: ``{'category': ..., 'regex': ...}`` with the regex unescaped,
        stripped of spaces and of one pair of surrounding double quotes;
        ``None`` when the regex contains a newline or when anything goes
        wrong while extracting it (the error is printed)
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the legacy method only where the
    # html module does not offer it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    row = {'category': category}

    try:
        # Round-trip through UTF-8: a no-op for well-formed text, but it
        # raises (and so rejects the row) for non-string or unencodable text.
        regex_bytes = bytes(scraped_regex[0].text, encoding='utf-8')
        regex_text = str(regex_bytes, encoding='utf-8')
        unescaped_regex = unescape(regex_text)

        # Data quality check: skip regexes that contain new lines.
        if "\n" in unescaped_regex:
            return None

        clean_regex = unescaped_regex.replace(" ", "")
        # More cleaning: remove optional double quotes surrounding regex.
        if clean_regex.startswith('"') and clean_regex.endswith('"'):
            clean_regex = clean_regex[1:-1]
        row['regex'] = clean_regex
    except Exception as e:
        # Deliberately broad: a malformed element must only skip this row.
        # Escaping won't throw exceptions for the included html files.
        template = 'Exception while escaping regex: type: {0}, args:\n{1!r}'
        msg = template.format(type(e).__name__, e.args)
        print(msg)
        return None

    return row

Пример #6

Показать файл

Файл: helpers.py Проект: dessHub/sleek

def get_filename_from_title(title, ext='.m4a'):
    """
    Creates a filename from title

    :param title: title text, possibly containing HTML entities
    :param ext: extension appended to the result (default ``'.m4a'``)
    :return: ``'music' + ext`` when ``title`` is empty or ``None``; otherwise
        the unescaped title with every item of ``FILENAME_EXCLUDE`` replaced
        by a space, followed by ``ext``
    """
    if not title:
        return 'music' + ext

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the legacy method only where the
    # html module does not offer it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    title = unescape(title)
    for unwanted in FILENAME_EXCLUDE:
        title = title.replace(unwanted, ' ')  # provide readability with space
    return title + ext  # TODO - smart hunt

Пример #7

Показать файл

Файл: normalize.py Проект: seuf/burst

def normalize_string(string, charset=None, replacing=False):
    """
    Decode any string, convert it to Unicode and clean it up.

    Non-unicode input is decoded first (see the inline comments for the
    fallback order). The result then has control characters removed, bad
    unicode fixed, URL quoting undone, CDATA markers stripped and HTML
    entities unescaped, and is finally lower-cased.

    :param charset: encoding passed to ``json.loads``
    :type charset: str
    :param string: string to convert
    :type string: str or unicode
    :param replacing: whether ``'`` characters are removed from the result
    :type replacing: bool
    :return: converted, lower-cased unicode; ``u''`` if decoding raised
        ``LookupError``
    :rtype: unicode
    """
    # Only non-unicode input goes through the decoding attempts below.
    if not isinstance(string, unicode):
        try:
            # "=XX" hex pairs are taken as a sign of quoted-printable data.
            if re.search(u'=[0-9a-fA-F]{2}', string):
                string = py2_decode(string, 'Quoted-printable')

            # First choice: interpret the text as a JSON document.
            # NOTE(review): json.loads() no longer accepts ``encoding`` on
            # Python 3.9+; there this call would raise TypeError and end up
            # in the outer TypeError handler -- confirm target interpreter.
            string = json.loads(u'%s' % string, encoding=charset)

        except ValueError:
            # Not valid JSON: try to evaluate it as a Python literal.
            try:
                # NOTE(review): eval() on data that may come from an external
                # source executes arbitrary expressions -- security risk;
                # consider ast.literal_eval if only literals are expected.
                string = unicode(eval(string), 'raw_unicode_escape')

            except (SyntaxError, NameError):
                # Not evaluable either: decode as latin-1.
                string = py2_decode(string, 'latin-1')
                pass

            except TypeError:
                string = unicode(string, errors='ignore')
                pass

        except LookupError:
            # Raised while decoding (presumably an unknown codec name --
            # verify); give up and return an empty string.
            return u''

        except TypeError:
            string = unicode(string, errors='ignore')
            pass

    # Clean-up steps applied to every input, in this order.
    string = remove_control_chars(string)
    string = fix_bad_unicode(string)
    string = unquote(string)
    # NOTE(review): only ']]' is removed, so the '>' of a closing ']]>'
    # stays in the text -- confirm this is intended.
    string = string.replace(u'<![CDATA[', u'').replace(u']]', u'')
    # NOTE(review): HTMLParser.unescape() was removed in Python 3.9;
    # html.unescape() is the replacement on Python 3.4+.
    string = HTMLParser().unescape(string)
    if replacing:
        string = string.replace(u"'", '')

    string = string.lower()

    return string

Пример #8

Показать файл

    def _ircfy_tweet(self, tweet):
        """Takes a twitter status and outputs irc message.

        The message is ``<bold>screen_name<bold>: text`` with HTML entities
        unescaped, carriage returns removed and newlines replaced by two
        spaces. For a retweet the text is rebuilt from the retweeted status
        as ``RT @user: text``. URLs are then expanded through
        ``self._handle_url_expansion``.

        :param tweet: status object exposing ``text``, ``urls``,
            ``retweeted_status`` and ``user.screen_name``
        :return: the formatted IRC message
        """
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is
        # the supported replacement. Fall back to the legacy method only
        # where the html module does not offer it.
        try:
            from html import unescape
        except ImportError:
            unescape = HTMLParser().unescape

        message = tweet.text
        urls = tweet.urls
        if tweet.retweeted_status:
            #HACK: because iPhone sucks and does not correctly handle RT
            message = "RT @{0}: {1}".format(
                tweet.retweeted_status.user.screen_name,
                tweet.retweeted_status.text)
        try:
            message = unescape(message)
        except Exception:
            # Best effort: keep the raw text, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare ``except`` would.
            self.log.exception("Unable to escape message %r", message)

        message = "{surround}{screen_name}{surround}: {message}".format(
            surround=IRC_BOLD,
            screen_name=tweet.user.screen_name,
            message=message)
        message = message.replace('\r', '').replace('\n', '  ')
        urls = self._urls_to_dict(urls)
        # 440 is passed through unchanged; presumably the maximum message
        # length -- verify against _handle_url_expansion.
        message = self._handle_url_expansion(message, urls, 440)
        return message

Пример #9

Показать файл

Файл: webutils.py Проект: Karolucha/magisterka

def clean_html(content):
    """Unescape HTML entities in ``content`` and turn non-breaking spaces
    (``'\\xa0'``) into ordinary spaces.

    :param content: text that may contain HTML entities
    :return: the cleaned text
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the legacy method only where the
    # html module does not offer it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    content = unescape(content)
    # Done after unescaping so that "&nbsp;" is covered as well.
    return content.replace('\xa0', ' ')

Пример #10

Показать файл

def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is unescaped, stripped of tags, bracketed remarks, dates and
    release counters, lower-cased, cut down to its leading title, reduced to
    Latin/Cyrillic letters and spaces, and finally freed from release
    keywords and month names.

    :param torrent_name: raw torrent title, possibly containing HTML
        entities and simple tags
    :return: the lower-case fingerprint with single spaces between words
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the legacy method only where the
    # html module does not offer it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(
        r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(
        r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$"
    )
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    # The order matters: each pattern sees the output of the previous one.
    noise_regexes = (
        additional_date_regex,
        date_regex,
        preceding_square_braces_regex,
        square_braces_regex,
        round_braces_regex,
        angle_braces_regex,
        release_counter_regex,
    )

    # Repeat until a full pass over all patterns changes nothing.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in noise_regexes:
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(r"^(national\s+geographic\s*:|наука\s+2\.0)\s+", "",
                          torrent_name)

    torrent_name = re.sub(r"^«([^»]{6,})»", r"\1", torrent_name)

    torrent_name = re.sub(r'^"([^»]{6,})"', r"\1", torrent_name)

    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*",
        r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    # Built once instead of concatenating the string for every character.
    allowed = set(" " + characters)
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(c for c in torrent_name if c in allowed)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # The former replace("г.", "") at this point was removed: no "." can be
    # left after the filtering above, so it never matched anything.
    #
    # "(:?" in the original pattern was a typo for "(?:". No ":" or brackets
    # survive the filtering above, so the matches are unchanged.
    keyword_regex = re.compile(
        r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keyword_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    for month in (
            "январь",
            "января",
            "февраль",
            "февраля",
            "март",
            "марта",
            "апрель",
            "апреля",
            "май",
            "мая",
            "июнь",
            "июня",
            "июль",
            "июля",
            "август",
            "августа",
            "сентябрь",
            "сентября",
            "октябрь",
            "октября",
            "ноябрь",
            "ноября",
            "декабрь",
            "декабря",
    ):
        # Whole words only, so names merely containing a month stay intact.
        torrent_name = re.sub(r"\b" + month + r"\b", "", torrent_name)
    # <--

    # Drop several spaces
    return re.sub(r"\s+", " ", torrent_name).strip()

Пример #11

Показать файл

Файл: torrents.py Проект: psyvisions/rutracker.rss

def get_fingerprint(torrent_name):
    """
    Tries to obtain a fingerprint from the torrent name that will uniquely
    identify its group (TV show).

    The name is unescaped, stripped of tags, bracketed remarks, dates and
    release counters, lower-cased, cut down to its leading title, reduced to
    Latin/Cyrillic letters and spaces, and finally freed from release
    keywords and month names.

    :param torrent_name: raw torrent title, possibly containing HTML
        entities and simple tags
    :return: the lower-case fingerprint with single spaces between words
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # supported replacement. Fall back to the legacy method only where the
    # html module does not offer it.
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    # Minimize typing differences
    torrent_name = torrent_name.replace("ё", "е")

    # Unescape HTML entities
    torrent_name = unescape(torrent_name)

    # Drop all tags
    torrent_name = re.sub(r"</?[a-z]+>", "", torrent_name)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    square_braces_regex = re.compile(r"^(.+(?:\s+|\)))\[[^\[\]]+?\](.*)$")
    preceding_square_braces_regex = re.compile(r"^(\s*)\[[^\[\]]+?\](.+)$")
    round_braces_regex = re.compile(r"^(.+(?:\s+|\]))\([^()]+?\)(.*)$")
    angle_braces_regex = re.compile(r"^(.+)\s+<<.*?>>(.*)$")
    date_regex = re.compile(r"^(.+)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    # Unable to merge it into date_regex due to some strange behaviour of re
    # module.
    additional_date_regex = re.compile(r"^(.+)\s+(?:по|от)\s+(?:\d{1,2}\.\d{1,2}\.\d{4}|\d{4}\.\d{2}\.\d{2})(.*)$")
    release_counter_regex = re.compile(r"^(.+)\s+\d+\s*(?:в|из)\s*\d+(.*)$")

    # The order matters: each pattern sees the output of the previous one.
    noise_regexes = (
        additional_date_regex,
        date_regex,
        preceding_square_braces_regex,
        square_braces_regex,
        round_braces_regex,
        angle_braces_regex,
        release_counter_regex,
    )

    # Repeat until a full pass over all patterns changes nothing.
    old_torrent_name = None
    while torrent_name != old_torrent_name:
        old_torrent_name = torrent_name

        for regex in noise_regexes:
            torrent_name = regex.sub(r"\1\2", torrent_name.strip(" .,"))

    torrent_name = re.sub(r"\s+/.*", "", torrent_name)
    # <--

    # We need all names in lowercase for easier analysis
    torrent_name = torrent_name.lower()

    # Try to get most possible short fingerprint -->
    torrent_name = re.sub(
        r"^«([^»]{6,})»", r"\1", torrent_name)

    torrent_name = re.sub(
        r'^"([^»]{6,})"', r"\1", torrent_name)

    torrent_name = re.sub(
        r"^([0-9a-zабвгдеёжзийклмнопрстуфхцчшщьъыэюя., \-:]{6,}?(?:[:.?!]| - | — |\|)).*", r"\1", torrent_name)
    # Try to get most possible short fingerprint <--

    # Drop all punctuation and other non-alphabet characters
    characters = "abcdefghijklmnopqrstuvwxyzабвгдеёжзийклмнопрстуфхцчшщьъыэюя"
    # Built once instead of concatenating the string for every character.
    allowed = set(" " + characters)
    torrent_name = torrent_name.replace(".", " ")
    torrent_name = "".join(
        c for c in torrent_name if c in allowed)

    # Drop any additional info: timestamps, release versions, etc.
    # -->
    # The former replace("г.", "") at this point was removed: no "." can be
    # left after the filtering above, so it never matched anything.
    #
    # "(:?" in the original pattern was a typo for "(?:". No ":" or brackets
    # survive the filtering above, so the matches are unchanged.
    keyword_regex = re.compile(r"(?:\s|\()(?:выпуск|выпуски|выпусков|обновлено|передачи за|серия из|сезон|серия|серии|премьера|эфир с|эфир от|эфиры от|satrip)(?:\s|\)|$)")
    while True:
        new_torrent_name = keyword_regex.sub("", torrent_name)
        if new_torrent_name == torrent_name:
            break
        torrent_name = new_torrent_name

    for month in (
        "январь",   "января",
        "февраль",  "февраля",
        "март",     "марта",
        "апрель",   "апреля",
        "май",      "мая",
        "июнь",     "июня",
        "июль",     "июля",
        "август",   "августа",
        "сентябрь", "сентября",
        "октябрь",  "октября",
        "ноябрь",   "ноября",
        "декабрь",  "декабря",
    ):
        # Match whole words only. A plain substring replace turned "августа"
        # into "а" (the shorter "август" matched first) and damaged
        # unrelated words such as "мартин".
        torrent_name = re.sub(r"\b" + month + r"\b", "", torrent_name)
    # <--

    # Drop several spaces
    return re.sub(r"\s+", " ", torrent_name).strip()