def load_episodios(html, item, itemlist, lang_title):
    """Parse episode anchors out of *html* and append one Item per episode.

    html: HTML fragment containing the episode links.
    item: parent Item; its channel/thumbnail/extra/show are propagated.
    itemlist: list of Item, mutated in place (nothing is returned).
    lang_title: language tag appended to every title, e.g. 'ITA' or 'Sub-ITA'.
    """
    patron = '.*?<a href="[^"]+"[^o]+ofollow[^>]+>[^<]+</a><(?:b|/)[^>]+>'
    matches = re.compile(patron).findall(html)
    for data in matches:
        # Strip anchor tags and the hoster name, then clean residual HTML.
        scrapedtitle = scrapertoolsV2.htmlclean(
            re.sub(r'(<a [^>]+>)*(<\/a>.*)*(Speedvideo)*', '', data)).strip()
        if scrapedtitle != 'Categorie':
            # Normalize the 'NxM' season/episode separator and drop stray
            # semicolons.  NOTE(fix): the original performed the identical
            # '×' -> 'x' replacement twice in a row; one call suffices.
            scrapedtitle = scrapedtitle.replace('×', 'x').replace(';', '')
            # Build the display/full title once instead of twice.
            fulltitle = scrapedtitle + " (" + lang_title + ")"
            itemlist.append(
                Item(channel=item.channel,
                     action="findvideos",
                     contentType="episode",
                     title="[COLOR azure]%s[/COLOR]" % fulltitle,
                     url=data,
                     thumbnail=item.thumbnail,
                     extra=item.extra,
                     fulltitle=fulltitle + ' - ' + item.show,
                     show=item.show))
def cleantitle(title):
    """Return *title* decoded, HTML-cleaned and normalized.

    Decodes HTML entities, replaces double quotes with single quotes,
    turns the '×' separator into 'x' and an en-dash into '-', then strips
    surrounding whitespace.

    NOTE(fix): the original bound its result to a local variable named
    `cleantitle`, shadowing the function itself; renamed to `cleaned`.
    """
    cleaned = scrapertoolsV2.htmlclean(
        scrapertoolsV2.decodeHtmlentities(title).replace('"', "'").replace(
            '×', 'x').replace('–', '-')).strip()
    return cleaned
def scrape(item, patron='', listGroups=[], headers="", blacklist="", data="",
           patron_block="", patronNext="", action="findvideos",
           addVideolibrary=True, type_content_dict={}, type_action_dict={}):
    """Generic page scraper: download, match, and build an Item list.

    WARNING(review): `listGroups`, `type_content_dict` and `type_action_dict`
    use mutable default arguments; they are not mutated below, but this is a
    known Python pitfall — consider `None` sentinels.

    # patron: the patron to use for scraping page, all capturing group must match with listGroups
    # listGroups: a list containing the scraping info obtained by your patron, in order
    #             accepted values are: url, title, thumb, quality, year, plot, duration,
    #             genre, rating, episode, lang
    # headers: values to pass to request header
    # blacklist: titles that you want to exclude (service articles for example)
    # data: if you want to pass data manually, maybe because you need some custom replacement
    # patron_block: patron to get parts of the page (to scrape with patron attribute),
    #               if you need a "block inside another block" you can create a list;
    #               please note that all matches will be packed as string
    # patronNext: patron for scraping next page link
    # action: if you want results to perform an action different from "findvideos",
    #         useful when scraping film by genres
    # example usage:
    #     import support
    #     itemlist = []
    #     patron = 'blablabla'
    #     headers = [['Referer', host]]
    #     blacklist = 'Request a TV serie!'
    #     return support.scrape(item, itemlist, patron,
    #                           ['thumb', 'quality', 'url', 'title', 'title2', 'year',
    #                            'plot', 'episode', 'lang'],
    #                           headers=headers, blacklist=blacklist)
    # listGroups semantics (translated from Italian):
    #     thumb = image, quality = quality, url = single or grouped link,
    #     title = film/series title, title2 = extra title,
    #     year = film/series year, plot = description,
    #     episode = season/episode number for series, lang = video language
    # 'type' is a check for typologies of content e.g. Film or TV Series
    # 'episode' is a key to grab episode numbers if it is separated from the title
    # IMPORTANT: 'type' is a special key; to work it needs type_content_dict={}
    # and type_action_dict={}
    """
    itemlist = []
    if not data:
        data = httptools.downloadpage(item.url, headers=headers,
                                      ignore_response_code=True).data.replace("'", '"')
        # Replace all ' with " and eliminate newline/tab, so later patterns
        # don't need to worry about quoting style or line breaks.
        data = re.sub('\n|\t', ' ', data)
    log('DATA =', data)

    block = data
    if patron_block:
        # A single pattern is promoted to a one-element list so nested
        # "block inside a block" extraction works uniformly.
        if type(patron_block) == str:
            patron_block = [patron_block]
        for n, regex in enumerate(patron_block):
            blocks = scrapertoolsV2.find_multiple_matches(block, regex)
            block = ""
            for b in blocks:
                # All matches of this level are re-packed into one string.
                block += "\n" + str(b)
            log('BLOCK ', n, '=', block)
    else:
        block = data

    if patron and listGroups:
        matches = scrapertoolsV2.find_multiple_matches(block, patron)
        log('MATCHES =', matches)

        known_keys = [
            'url', 'title', 'title2', 'episode', 'thumb', 'quality', 'year',
            'plot', 'duration', 'genere', 'rating', 'type', 'lang'
        ]  # by greko: added 'episode'
        # 'lang' handles sites whose series pages mix ITA and Sub-ITA videos.
        lang = ''

        for match in matches:
            if len(listGroups) > len(match):  # to fix a bug: pad short tuples
                match = list(match)
                match.extend([''] * (len(listGroups) - len(match)))

            scraped = {}
            for kk in known_keys:
                val = match[listGroups.index(kk)] if kk in listGroups else ''
                # Relative url/thumb values get the host prefix taken from item.url.
                if val and (kk == "url" or kk == 'thumb') and 'http' not in val:
                    val = scrapertoolsV2.find_single_match(
                        item.url, 'https?://[a-z0-9.-]+') + val
                scraped[kk] = val

            title = scrapertoolsV2.htmlclean(
                scrapertoolsV2.decodeHtmlentities(scraped["title"])).replace(
                    '’', '\'').replace('"', "'").strip()  # fix by greko: " -> '
            plot = scrapertoolsV2.htmlclean(
                scrapertoolsV2.decodeHtmlentities(scraped["plot"]))

            longtitle = typo(title, 'bold')
            if scraped['quality']:
                longtitle = longtitle + typo(scraped['quality'], '_ [] color kod')
            if scraped['episode']:
                # Normalize every separator variant to the canonical 'x'.
                scraped['episode'] = re.sub(r'\s-\s|-|x|–', 'x', scraped['episode'])
                longtitle = typo(scraped['episode'] + ' - ', 'bold') + longtitle
            if scraped['title2']:
                title2 = scrapertoolsV2.htmlclean(
                    scrapertoolsV2.decodeHtmlentities(
                        scraped["title2"])).replace('"', "'").strip()
                longtitle = longtitle + typo(title2, 'bold _ -- _')

            ## Added/modified to handle sites that list both ITA and Sub-ITA
            ## videos of a TV series on the same page.
            if scraped['lang']:
                if 'sub' in scraped['lang'].lower():
                    lang = 'Sub-ITA'
                else:
                    lang = 'ITA'
            # NOTE(review): `lang` persists across loop iterations; once set
            # it tags every following match until overwritten.
            if lang != '':
                longtitle += typo(lang, '_ [] color kod')

            if item.infoLabels[
                    "title"] or item.fulltitle:  # if title is set, probably this is a list of episodes or video sources
                infolabels = item.infoLabels
            else:
                infolabels = {}
                if scraped["year"]:
                    infolabels['year'] = scraped["year"]
                if scraped["plot"]:
                    infolabels['plot'] = plot
                if scraped["duration"]:
                    # Accept "1h 30", "1:30", "1.30", etc.; fall back to a bare
                    # minutes number.  NOTE: this rebinds `matches` (the outer
                    # page-matches list) — harmless only because the outer loop
                    # iterator was already created.
                    matches = scrapertoolsV2.find_multiple_matches(
                        scraped["duration"],
                        r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
                    for h, m in matches:
                        scraped["duration"] = int(h) * 60 + int(m)
                    if not matches:
                        scraped["duration"] = scrapertoolsV2.find_single_match(
                            scraped["duration"], r'(\d+)')
                    infolabels['duration'] = int(scraped["duration"]) * 60
                if scraped["genere"]:
                    genres = scrapertoolsV2.find_multiple_matches(
                        scraped["genere"], '[A-Za-z]+')
                    infolabels['genere'] = ", ".join(genres)
                if scraped["rating"]:
                    infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(
                        scraped["rating"])

            # Map the scraped 'type' group onto a contentType and/or action.
            if type_content_dict:
                for name, variants in type_content_dict.items():
                    if scraped['type'] in variants:
                        item.contentType = name
            if type_action_dict:
                for name, variants in type_action_dict.items():
                    if scraped['type'] in variants:
                        action = name

            # When called from an `episodios` function, force episode type.
            if inspect.stack()[1][3] == 'episodios':
                item.contentType = 'episode'

            # NOTE(review): with a string blacklist this is a substring test,
            # with a list it is an exact-membership test.
            if scraped["title"] not in blacklist:
                it = Item(channel=item.channel,
                          action=action,
                          contentType=item.contentType,
                          title=longtitle,
                          fulltitle=title,
                          show=title,
                          language=lang if lang != '' else '',
                          quality=scraped["quality"],
                          url=scraped["url"],
                          infoLabels=infolabels,
                          thumbnail=scraped["thumb"],
                          args=item.args)
                # Any extra (non-known) capture groups become Item attributes.
                for lg in list(set(listGroups).difference(known_keys)):
                    it.__setattr__(lg, match[listGroups.index(lg)])
                itemlist.append(it)

        checkHost(item, itemlist)
        # Enrich with TMDB metadata except for terminal (playable) listings.
        if (item.contentType == "tvshow" and (action != "findvideos" and action != "play")) \
                or (item.contentType == "episode" and action != "play") \
                or (item.contentType == "movie" and action != "play"):
            tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
        else:
            for it in itemlist:
                it.infoLabels = item.infoLabels

    if patronNext:
        nextPage(itemlist, item, data, patronNext, 2)

    if addVideolibrary and (item.infoLabels["title"] or item.fulltitle):
        item.fulltitle = item.infoLabels["title"]
        videolibrary(itemlist, item)

    return itemlist
def peliculas(item):
    """Build the movie list for *item.url*, with language flags and paging.

    Returns a list of Item objects (action 'findvideos'), plus a trailing
    'next page' Item when the page exposes one.
    """
    logger.info()
    itemlist = []

    data = httptools.downloadpage(item.url).data
    data = re.sub(r"\n|\r|\t|\s{2}| ", "", data)

    patron = 'class="post-item-image btn-play-item".*?'
    patron += 'href="([^"]+)">.*?'
    patron += '<img data-original="([^"]+)".*?'
    patron += 'glyphicon-calendar"></i>([^<]+).*?'
    patron += 'post-item-flags"> (.*?)</div.*?'
    patron += 'text-muted f-14">(.*?)</h3'
    matches = scrapertools.find_multiple_matches(data, patron)

    patron_next_page = 'href="([^"]+)"> »'
    matches_next_page = scrapertools.find_single_match(data, patron_next_page)
    # BUGFIX: `url_next_page` was only assigned inside the `if` below, so the
    # later `if url_next_page:` raised UnboundLocalError on the last page.
    url_next_page = ""
    if len(matches_next_page) > 0:
        url_next_page = item.url + matches_next_page

    for scrapedurl, scrapedthumbnail, year, idiomas, scrapedtitle in matches:
        year = year.strip()
        # The flag images encode the available languages.
        patronidiomas = '<img src="([^"]+)"'
        matchesidiomas = scrapertools.find_multiple_matches(
            idiomas, patronidiomas)
        idiomas_disponibles = []
        for idioma in matchesidiomas:
            if idioma.endswith("/la.png"):
                idiomas_disponibles.append("LAT")
            elif idioma.endswith("/en.png"):
                idiomas_disponibles.append("VO")
            elif idioma.endswith("/en_es.png"):
                idiomas_disponibles.append("VOSE")
            elif idioma.endswith("/es.png"):
                idiomas_disponibles.append("ESP")
        if idiomas_disponibles:
            idiomas_disponibles = "[" + "/".join(idiomas_disponibles) + "]"

        contentTitle = scrapertoolsV2.htmlclean(scrapedtitle.strip())
        title = "%s %s" % (contentTitle, idiomas_disponibles)

        itemlist.append(
            Item(channel=item.channel,
                 action="findvideos",
                 title=title,
                 url=scrapedurl,
                 thumbnail=scrapedthumbnail,
                 contentTitle=contentTitle,
                 infoLabels={"year": year},
                 text_color=color1))

    # Fetch basic metadata for all movies via multithreaded TMDB lookup.
    tmdb.set_infoLabels(itemlist)

    # Append pagination entry when a next page exists.
    if url_next_page:
        itemlist.append(
            Item(channel=item.channel,
                 action="peliculas",
                 title=">> Página siguiente",
                 thumbnail=thumbnail_host,
                 url=url_next_page,
                 folder=True,
                 text_color=color3,
                 text_bold=True))

    return itemlist
def scrape(item, patron='', listGroups=[], headers="", blacklist="", data="",
           patron_block="", patronNext="", action="findvideos", url_host="",
           addVideolibrary=True):
    """Generic page scraper (simpler variant: no episode/lang/type handling).

    WARNING(review): `listGroups` is a mutable default argument; it is not
    mutated below, but this is a known Python pitfall.

    # patron: the patron to use for scraping page, all capturing group must match with listGroups
    # listGroups: a list containing the scraping info obtained by your patron, in order
    #             accepted values are: url, title, thumb, quality, year, plot, duration,
    #             genre, rating
    # headers: values to pass to request header
    # blacklist: titles that you want to exclude (service articles for example)
    # data: if you want to pass data manually, maybe because you need some custom replacement
    # patron_block: patron to get parts of the page (to scrape with patron attribute),
    #               if you need a "block inside another block" you can create a list;
    #               please note that all matches will be packed as string
    # patronNext: patron for scraping next page link
    # action: if you want results to perform an action different from "findvideos",
    #         useful when scraping film by genres
    # url_host: string to prepend to scrapedurl, useful when url doesn't contain host
    # example usage:
    #     import support
    #     itemlist = []
    #     patron = 'blablabla'
    #     headers = [['Referer', host]]
    #     blacklist = 'Request a TV serie!'
    #     return support.scrape(item, itemlist, patron,
    #                           ['thumb', 'quality', 'url', 'title', 'year', 'plot'],
    #                           headers=headers, blacklist=blacklist)
    """
    itemlist = []
    if not data:
        data = httptools.downloadpage(item.url,
                                      headers=headers).data.replace("'", '"')
        # Replace all ' with " and eliminate newline/tab, so later patterns
        # don't need to worry about quoting style or line breaks.
        data = re.sub('\n|\t', ' ', data)
    log('DATA =', data)

    block = data
    if patron_block:
        # A single pattern is promoted to a one-element list so nested
        # "block inside a block" extraction works uniformly.
        if type(patron_block) == str:
            patron_block = [patron_block]
        for n, regex in enumerate(patron_block):
            blocks = scrapertoolsV2.find_multiple_matches(block, regex)
            block = ""
            for b in blocks:
                # All matches of this level are re-packed into one string.
                block += "\n" + b
            log('BLOCK ', n, '=', block)
    else:
        block = data

    if patron and listGroups:
        matches = scrapertoolsV2.find_multiple_matches(block, patron)
        log('MATCHES =', matches)

        known_keys = [
            'url', 'title', 'thumb', 'quality', 'year', 'plot', 'duration',
            'genere', 'rating'
        ]
        for match in matches:
            if len(listGroups) > len(match):  # to fix a bug: pad short tuples
                match = list(match)
                match.extend([''] * (len(listGroups) - len(match)))

            scraped = {}
            for kk in known_keys:
                val = match[listGroups.index(kk)] if kk in listGroups else ''
                # NOTE(review): unlike the richer variant, the prefix is added
                # unconditionally — pass url_host='' when urls are absolute.
                if kk == "url":
                    val = url_host + val
                scraped[kk] = val

            title = scrapertoolsV2.decodeHtmlentities(scraped["title"]).strip()
            plot = scrapertoolsV2.htmlclean(
                scrapertoolsV2.decodeHtmlentities(scraped["plot"]))

            if scraped["quality"]:
                longtitle = '[B]' + title + '[/B] [COLOR blue][' + scraped[
                    "quality"] + '][/COLOR]'
            else:
                longtitle = '[B]' + title + '[/B]'

            if item.infoLabels[
                    "title"] or item.fulltitle:  # if title is set, probably this is a list of episodes or video sources
                infolabels = item.infoLabels
            else:
                infolabels = {}
                if scraped["year"]:
                    infolabels['year'] = scraped["year"]
                if scraped["plot"]:
                    infolabels['plot'] = plot
                if scraped["duration"]:
                    # Accept "1h 30", "1:30", "1.30", etc.; fall back to a bare
                    # minutes number.  NOTE: this rebinds `matches` (the outer
                    # page-matches list) — harmless only because the outer loop
                    # iterator was already created.
                    matches = scrapertoolsV2.find_multiple_matches(
                        scraped["duration"],
                        r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
                    for h, m in matches:
                        scraped["duration"] = int(h) * 60 + int(m)
                    if not matches:
                        scraped["duration"] = scrapertoolsV2.find_single_match(
                            scraped["duration"], r'(\d+)')
                    infolabels['duration'] = int(scraped["duration"]) * 60
                if scraped["genere"]:
                    genres = scrapertoolsV2.find_multiple_matches(
                        scraped["genere"], '[A-Za-z]+')
                    infolabels['genere'] = ", ".join(genres)
                if scraped["rating"]:
                    infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(
                        scraped["rating"])

            # NOTE(review): with a string blacklist this is a substring test,
            # with a list it is an exact-membership test.
            if scraped["title"] not in blacklist:
                it = Item(channel=item.channel,
                          action=action,
                          contentType=item.contentType,
                          title=longtitle,
                          fulltitle=title,
                          show=title,
                          quality=scraped["quality"],
                          url=scraped["url"],
                          infoLabels=infolabels,
                          thumbnail=scraped["thumb"])
                # Any extra (non-known) capture groups become Item attributes.
                for lg in list(set(listGroups).difference(known_keys)):
                    it.__setattr__(lg, match[listGroups.index(lg)])
                itemlist.append(it)

        # Enrich with TMDB metadata except for terminal (playable) listings.
        if (item.contentType == "episode" and (action != "findvideos" and action != "play")) \
                or (item.contentType == "movie" and action != "play"):
            tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
        else:
            for it in itemlist:
                it.infoLabels = item.infoLabels

    if patronNext:
        nextPage(itemlist, item, data, patronNext, 2)

    if addVideolibrary and (item.infoLabels["title"] or item.fulltitle):
        item.fulltitle = item.infoLabels["title"]
        videolibrary(itemlist, item)

    return itemlist