Пример #1
0
def search(item, texto):
    """Search the site for *texto* and return the matching movie items."""
    logger.info("[casacinemaInfo.py] " + item.url + " search " + texto)

    # Build the site search URL and fetch the results page.
    item.url = host + "?s=" + texto
    data = httptools.downloadpage(item.url).data

    patron = '<li class="col-md-12 itemlist">.*?<a href="([^"]+)" title="([^"]+)".*?<img src="([^"]+)".*?Film dell\\\'anno: ([0-9]{4}).*?<p class="text-list">([^<>]+)</p>'

    itemlist = []
    for url, raw_title, thumb, year, plot in scrapertoolsV2.find_multiple_matches(data, patron):
        decoded_title = scrapertoolsV2.decodeHtmlentities(raw_title)
        bare_title = decoded_title.replace('[Sub-ITA]', '').strip()

        itemlist.append(
            Item(channel=item.channel,
                 action="findvideos",
                 contentType="movie",
                 title=decoded_title,
                 url=url,
                 thumbnail=thumb,
                 infoLabels={"plot": scrapertoolsV2.decodeHtmlentities(plot),
                             "year": year},
                 fulltitle=bare_title))

    # Enrich the results with TMDB metadata.
    tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)

    return itemlist
Пример #2
0
def search_peliculas(item):
    """Parse the search-results page into a list of playable items."""
    log()

    # Series entries open the episode list, movies go straight to videos.
    action = 'findvideos' if item.extra == 'movie' else 'episodios'

    raw = httptools.downloadpage(item.url,
                                 headers=headers,
                                 ignore_response_code=True).data
    data = raw.replace('\t', '').replace('\n', '')
    log(data)

    patron = r'<a href="([^"]+)" title="Permalink to\s([^"]+) \(([^<]+)\).*?".*?<img[^s]+src="([^"]+)".*?<div class="calitate">\s*<p>([^<]+)<\/p>'

    itemlist = []
    for url, title, year, thumb, quality in re.compile(patron, re.MULTILINE).findall(data):
        title = scrapertoolsV2.decodeHtmlentities(title)
        quality = scrapertoolsV2.decodeHtmlentities(quality)
        itemlist.append(
            Item(channel=item.channel,
                 action=action,
                 contentType=item.contentType,
                 fulltitle=title,
                 show=title,
                 title=title + support.typo(quality, '_ [] color kod'),
                 url=url,
                 thumbnail=thumb,
                 infoLabels={'year': year},
                 args=item.args))

    tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)

    return itemlist
Пример #3
0
def newep(item):
    """List the newly released episodes, windowed PERPAGE at a time."""
    log()
    itemlist = []

    # Page index travels on the item; default to the first page.
    page = item.page if item.page else 1

    matches = support.match(item, r'<div class="poster"><img src="([^"]+)" alt="([^"]+)">[^>]+><a href="([^"]+)">')[0]

    lower = (page - 1) * PERPAGE
    upper = page * PERPAGE
    for idx, (thumb, title, url) in enumerate(matches):
        if idx < lower:
            continue
        if idx >= upper:
            break
        title = scrapertoolsV2.decodeHtmlentities(title)
        itemlist.append(
            Item(channel=item.channel,
                 action="findvideos",
                 fulltitle=title,
                 show=title,
                 title=support.typo(title, 'bold'),
                 url=url,
                 thumbnail=thumb))

    support.pagination(itemlist, item, page, PERPAGE)

    tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
    return itemlist
Пример #4
0
def newest(categoria):
    """Return the most recently added movies from the updates page."""
    log()
    item = Item()
    item.url = host + '/aggiornamenti/'

    pattern = r'mediaWrapAlt recomended_videos"[^>]+>\s*<a href="([^"]+)" title="([^"]+)" rel="bookmark">\s*<img[^s]+src="([^"]+)"[^>]+>'
    matches = support.match(item, pattern)[0]

    itemlist = []
    for url, title, thumb in matches:
        title = scrapertoolsV2.decodeHtmlentities(title)
        title = title.replace("Permalink to ", "").replace("streaming", "")
        # Drop a trailing " (NNNN)" style suffix.
        title = re.sub(r'\s\(\d+\)', '', title)
        itemlist.append(
            Item(channel=item.channel,
                 action="findvideos",
                 contentType="movie",
                 fulltitle=title,
                 show=title,
                 title=support.typo(title, 'bold'),
                 url=url,
                 thumbnail=thumb,
                 folder=True))

    tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
    return itemlist
Пример #5
0
def cleantitle(scrapedtitle):
    """Decode HTML entities and normalise quote/multiplication characters."""
    cleaned = scrapertoolsV2.decodeHtmlentities(scrapedtitle.strip())
    # Apply the replacements one by one, preserving their original order.
    for old, new in (('’', '\''), ('&#215;', 'x'), ('×', 'x'), ('"', "'")):
        cleaned = cleaned.replace(old, new)
    return cleaned.strip()
Пример #6
0
def serietv(item):
    """Build the TV-series listing from the current page, plus a
    "next page" item when the site exposes one.

    Each matched show becomes an Item routed to the "episodios" action.
    URLs ending in "-<num>/" are rewritten to the "-episodi/" variant so
    they point at the episode index page.
    """

    logger.info("%s serietv log: %s" % (__channel__, item))
    itemlist = []
    # Load the page
    data = httptools.downloadpage(item.url).data

    # Extract the contents (url, title, thumbnail)
    patron = '<div class="post-thumb">\s*<a href="([^"]+)" title="([^"]+)">\s*<img src="([^"]+)"'
    matches = re.compile(patron, re.DOTALL).findall(data)

    for scrapedurl, scrapedtitle, scrapedthumbnail in matches:
        #scrapedplot = ""
        scrapedtitle = scrapertoolsV2.decodeHtmlentities(
            scrapedtitle)  #.replace("Streaming", ""))
        # Site titles carry a "Link to " prefix -- strip it.
        if scrapedtitle.startswith("Link to "):
            scrapedtitle = scrapedtitle[8:]
        # Rewrite ".../-123/" style URLs to the episode-list page ".../-episodi/".
        num = scrapertoolsV2.find_single_match(scrapedurl, '(-\d+/)')
        if num:
            scrapedurl = scrapedurl.replace(num, "-episodi/")
        itemlist.append(
            Item(
                channel=item.channel,
                action="episodios",
                #contentType="tvshow",
                contentSerieName=scrapedtitle,
                title=scrapedtitle,
                #text_color="azure",
                url=scrapedurl,
                thumbnail=scrapedthumbnail,
                #plot=scrapedplot,
                show=item.show,
                extra=item.extra,
                folder=True))

    # Posters, plot and other info from TMDB; a year, when present, improves
    # the lookup accuracy.
    tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True, idioma_busqueda='it')

    # Pagination
    # NOTE(review): the '"?' sequence makes the opening quote optional in the
    # regex -- looks unintentional but is harmless; confirm before changing.
    patronvideos = '<a class="next page-numbers" href="?([^>"]+)">Avanti &raquo;</a>'
    matches = re.compile(patronvideos, re.DOTALL).findall(data)

    if len(matches) > 0:
        scrapedurl = urlparse.urljoin(item.url, matches[0])
        itemlist.append(
            Item(
                channel=item.channel,
                action="serietv",
                title="[COLOR lightgreen]" +
                config.get_localized_string(30992) + "[/COLOR]",
                url=scrapedurl,
                thumbnail=
                "http://2.bp.blogspot.com/-fE9tzwmjaeQ/UcM2apxDtjI/AAAAAAAAeeg/WKSGM2TADLM/s1600/pager+old.png",
                extra=item.extra,
                folder=True))

    return itemlist
Пример #7
0
def peliculas(item):
    """Scrape the anime listing page into tvshow/movie items."""
    log()
    itemlist = []

    blacklist = ['top 10 anime da vedere']
    matches, data = support.match(
        item,
        r'<a class="[^"]+" href="([^"]+)" title="([^"]+)"><img[^s]+src="([^"]+)"[^>]+'
    )

    for url, title, thumb in matches:
        title = scrapertoolsV2.decodeHtmlentities(title.strip()).replace("streaming", "")
        lang = scrapertoolsV2.find_single_match(title, r"((?:SUB ITA|ITA))")

        lowered = title.lower()
        videoType = ''
        if 'movie' in lowered:
            videoType = ' - (MOVIE)'
        if 'ova' in lowered:
            videoType = ' - (OAV)'

        # Strip language / distribution tags to get the bare title.
        cleantitle = title.replace(lang, "")
        for tag in ('(Streaming & Download)', '( Streaming & Download )',
                    'OAV', 'OVA', 'MOVIE'):
            cleantitle = cleantitle.replace(tag, '')
        cleantitle = cleantitle.strip()

        # Plain entries are series; OAV/MOVIE entries go straight to videos.
        if videoType:
            contentType, action = "movie", "findvideos"
        else:
            contentType, action = "tvshow", "episodios"

        if lowered not in blacklist:
            itemlist.append(
                Item(channel=item.channel,
                     action=action,
                     contentType=contentType,
                     title=support.typo(cleantitle + videoType, 'bold') +
                     support.typo(lang, '_ [] color kod'),
                     fulltitle=cleantitle,
                     show=cleantitle,
                     url=url,
                     thumbnail=thumb))

    tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
    autorenumber.renumber(itemlist)
    support.nextPage(itemlist, item, data,
                     r'<a class="next page-numbers" href="([^"]+)">')

    return itemlist
Пример #8
0
def findvideos(item):
    """Resolve the player options on a title page into playable items.

    Series pages whose "quality" field actually carries the airing status
    are routed to episodios() instead.
    """
    if item.quality.lower() in ["ended", "canceled", "returning series"]:
        return episodios(item)

    itemlist = []
    data = scrapertoolsV2.decodeHtmlentities(
        httptools.downloadpage(item.url).data)
    # Server/quality labels shown on the player option buttons.
    btns = re.compile(
        r'data-tplayernv="Opt.*?><span>([^<]+)</span><span>([^<]+)</span>',
        re.DOTALL).findall(data)
    # Embedded player iframes, expected one per button.
    matches = re.compile(r'<iframe.*?src="([^"]+trembed=[^"]+)',
                         re.DOTALL).findall(data)

    # zip() pairs each iframe with its button and ignores any surplus on
    # either side; the original indexed btns[i] and raised IndexError when
    # the page had fewer buttons than iframes.
    for scrapedurl, (server, quality) in zip(matches, btns):
        scrapedurl = scrapertoolsV2.decodeHtmlentities(scrapedurl)
        # Each trembed URL wraps the real player in another iframe.
        link_data = httptools.downloadpage(scrapedurl).data
        url = scrapertoolsV2.find_single_match(link_data,
                                               r'<iframe.*?src="([^"]+)"')

        itemlist.append(
            Item(
                channel=item.channel,
                action="play",
                contentType=item.contentType,
                title="[B]" + server + "[/B] - " + quality,
                fulltitle=server + " " + quality,
                show=server + " " + quality,
                url=url,
                extra=item.extra,
                infoLabels=item.infoLabels,
                server=server,
                contentQuality=quality.replace('Italiano - ', ''),
            ))

    if item.contentType == "movie":
        support.videolibrary(itemlist, item)
    autoplay.start(itemlist, item)

    return itemlist
Пример #9
0
def findvideos(item):
    """Collect streaming links (SD / HD / 3D) from a film page.

    Episode items are delegated to findvid_serie().
    """
    findhost()

    if item.contentType == "episode":
        return findvid_serie(item)

    def load_links(itemlist, re_txt, color, desc_txt, quality=""):
        # Extract one section's link table from `data` (closure) and append
        # a play Item per <a> found.  `color` is currently unused here;
        # `desc_txt` only feeds the debug log line.
        streaming = scrapertoolsV2.find_single_match(data, re_txt).replace('"', '')
        support.log('STREAMING=', streaming)
        patron = '<td><a.*?href=(.*?) (?:target|rel)[^>]+>([^<]+)<'
        matches = re.compile(patron, re.DOTALL).findall(streaming)
        for scrapedurl, scrapedtitle in matches:
            logger.debug("##### findvideos %s ## %s ## %s ##" % (desc_txt, scrapedurl, scrapedtitle))
            itemlist.append(
                Item(channel=item.channel,
                     action="play",
                     title=scrapedtitle,
                     url=scrapedurl,
                     server=scrapedtitle,
                     fulltitle=item.fulltitle,
                     thumbnail=item.thumbnail,
                     show=item.show,
                     quality=quality,
                     contentType=item.contentType,
                     folder=False))

    support.log()

    itemlist = []

    # Load the page and flatten it to a single line.
    data = httptools.downloadpage(item.url).data
    data = re.sub('\n|\t','',data)

    # Extract the quality format.
    # NOTE(review): QualityStr is computed but never used below -- confirm
    # intent before removing.
    patronvideos = '>([^<]+)</strong></div>'
    matches = re.compile(patronvideos, re.DOTALL).finditer(data)
    QualityStr = ""
    for match in matches:
        QualityStr = scrapertoolsV2.decodeHtmlentities(match.group(1))[6:]

    # Extract the contents - Streaming (SD)
    load_links(itemlist, '<strong>Streaming:</strong>(.*?)<tableclass=cbtable height=30>', "orange", "Streaming", "SD")

    # Extract the contents - Streaming HD
    load_links(itemlist, '<strong>Streaming HD[^<]+</strong>(.*?)<tableclass=cbtable height=30>', "yellow", "Streaming HD", "HD")

    # Extract the contents - Streaming 3D
    load_links(itemlist, '<strong>Streaming 3D[^<]+</strong>(.*?)<tableclass=cbtable height=30>', "pink", "Streaming 3D")

    return support.server(item, itemlist=itemlist)
Пример #10
0
def peliculas(item):
    """List the movies on the current page, plus a pagination entry."""
    logger.info("[casacinemaInfo.py] peliculas")

    # Fetch the listing page.
    data = httptools.downloadpage(item.url).data

    patron = '<div class="col-mt-5 postsh">[^<>]+<div class="poster-media-card">[^<>]+<a href="([^"]+)" title="([^"]+)".*?<img src="([^"]+)"'

    itemlist = []
    for url, raw_title, thumb in scrapertoolsV2.find_multiple_matches(data, patron):
        title = scrapertoolsV2.decodeHtmlentities(raw_title)
        itemlist.append(
            Item(channel=item.channel,
                 action="findvideos",
                 contentType="movie",
                 title=title,
                 url=url,
                 thumbnail=thumb,
                 fulltitle=title.replace('[Sub-ITA]', '').strip()))

    tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)

    # Pagination: look for the "next page" chevron link.
    next_page = scrapertoolsV2.find_single_match(
        data, '<a href="([^"]+)"><i class="glyphicon glyphicon-chevron-right"'
    )

    if next_page:
        itemlist.append(
            Item(
                channel=item.channel,
                action="peliculas",
                title="[COLOR lightgreen]" +
                config.get_localized_string(30992) + "[/COLOR]",
                url=next_page,
                extra=item.extra,
                thumbnail=
                "http://2.bp.blogspot.com/-fE9tzwmjaeQ/UcM2apxDtjI/AAAAAAAAeeg/WKSGM2TADLM/s1600/pager+old.png"
            ))

    return itemlist
def series(item):
    """Fetch one page of series; the page index travels in item.extra."""
    logger.info()
    # Default to the first page when no valid page index is carried.
    if not hasattr(item, 'extra') or not isinstance(item.extra, int):
        item.extra = 1

    # Append the page parameter with the appropriate separator.
    merger = '&' if '?' in item.url else '?'
    page_url = "%s%spagina=%s" % (item.url, merger, item.extra)
    logger.info("url = %s" % page_url)

    page_data = scrapertoolsV2.decodeHtmlentities(
        httptools.downloadpage(page_url).data)
    return extract_series_from_data(item, page_data)
Пример #12
0
def peliculas(item):
    """Paginated title listing; '{}' in the url carries the page number."""
    logger.info(item.channel + 'peliculas')
    itemlist = []

    action = 'findvideos' if item.contentType == 'movie' else 'episodios'

    # The page number is appended to the url after a '{}' marker.
    page = 1
    if '{}' in item.url:
        item.url, page = item.url.split('{}')
        page = int(page)

    data = httptools.downloadpage(item.url, headers=headers).data
    block = scrapertoolsV2.find_single_match(
        data, r'id="lcp_instance_0">(.*?)<\/ul>')
    matches = re.compile(r'<a\s*href="([^"]+)" title="([^<]+)">[^<]+</a>',
                         re.DOTALL).findall(block)

    first = (page - 1) * PERPAGE
    last = page * PERPAGE
    for idx, (url, title) in enumerate(matches):
        if idx < first:
            continue
        if idx >= last:
            break
        title = scrapertoolsV2.decodeHtmlentities(title)
        itemlist.append(
            Item(channel=item.channel,
                 action=action,
                 title=title,
                 contentTitle=title,
                 fulltitle=title,
                 url=url,
                 contentType=item.contentType,
                 show=title))

    # More titles remain -> emit a "next page" entry.
    if len(matches) >= last:
        itemlist.append(
            Item(channel=item.channel,
                 action="peliculas",
                 title="[COLOR blue]" + config.get_localized_string(30992) +
                 " >[/COLOR]",
                 url=item.url + '{}' + str(page + 1),
                 thumbnail=thumb(),
                 contentType=item.contentType))

    tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
    return itemlist
Пример #13
0
def findvideos(item):
    """Resolve the server options on a title page, grouped by language.

    Also prepends a "search trailer" entry and appends an "add to
    videolibrary" entry when appropriate.
    """
    logger.info()
    itemlist = list()
    sublist = list()

    # Download the page
    data = httptools.downloadpage(item.url).data

    if not item.plot:
        item.plot = scrapertoolsV2.find_single_match(data, '>Sinopsis</dt> <dd>([^<]+)</dd>')
        item.plot = scrapertoolsV2.decodeHtmlentities(item.plot)

    patron = '<option value="([^"]+)"[^>]+'
    patron += '>([^<]+).*?</i>([^<]+)'
    matches = re.compile(patron, re.DOTALL).findall(data)

    for url, idioma, calidad in matches:
        # yaske links hide the real iframe behind a base64-encoded blob.
        if 'yaske' in url:
            data = httptools.downloadpage(url).data
            url_enc = scrapertoolsV2.find_single_match(data, "eval.*?'(.*?)'")
            url_dec = base64.b64decode(url_enc)
            url = scrapertoolsV2.find_single_match(url_dec, 'iframe src="(.*?)"')
        sublist.append(item.clone(action="play", url=url, folder=False, text_color=color1, quality=calidad.strip(),
                                  language=idioma.strip()))

    sublist = servertools.get_servers_itemlist(sublist, lambda i: "Ver en %s %s" % (i.server, i.quality), True)

    # Add the servers found, grouped by language.
    for k in ["Español", "Latino", "Subtitulado", "Ingles"]:
        # List comprehension instead of filter(): on Python 3 filter()
        # returns a lazy object that is always truthy, so the language
        # header Item was appended even when no servers matched.
        lista_idioma = [i for i in sublist if i.language == k]
        if lista_idioma:
            itemlist.append(Item(channel=item.channel, title=k, fanart=item.fanart, folder=False,
                                 text_color=color2, text_bold=True, thumbnail=thumbnail_host))
            itemlist.extend(lista_idioma)

    # Insert the "search trailer" and "add to videolibrary" items.
    if itemlist and item.extra != "library":
        title = "%s [Buscar trailer]" % (item.contentTitle)
        itemlist.insert(0, item.clone(channel="trailertools", action="buscartrailer",
                                      text_color=color3, title=title, viewmode="list"))

        if config.get_videolibrary_support():
            itemlist.append(Item(channel=item.channel, title="Añadir película a la videoteca",
                                 action="add_pelicula_to_library", url=item.url, text_color="green",
                                 contentTitle=item.contentTitle, extra="library", thumbnail=thumbnail_host))

    return itemlist
Пример #14
0
def peliculas(item):
    """Scrape the movie grid page into Items.

    Fixes: `item.contenType` attribute typo -> `item.contentType`, and
    `duration`/`scrapedplot` were unbound (NameError) when no info block
    matched the scraped title.
    """
    support.log()
    itemlist = []

    data = httptools.downloadpage(item.url, headers=headers).data
    patron = r'<div class="cover_kapsul ml-mask".*?<a href="(.*?)">(.*?)<\/a>.*?<img .*?src="(.*?)".*?<div class="trdublaj">(.*?)<\/div>.(<div class="sub_ita">(.*?)<\/div>|())'
    matches = scrapertoolsV2.find_multiple_matches(data, patron)

    for scrapedurl, scrapedtitle, scrapedthumbnail, scrapedquality, subDiv, subText, empty in matches:
        # Year/duration/plot live in separate "ml-label" blocks; pair one
        # with this entry by comparing its watch-link to scrapedurl.
        info = scrapertoolsV2.find_multiple_matches(
            data,
            r'<span class="ml-label">([0-9]+)+<\/span>.*?<span class="ml-label">(.*?)<\/span>.*?<p class="ml-cat".*?<p>(.*?)<\/p>.*?<a href="(.*?)" class="ml-watch">'
        )
        infoLabels = {}
        duration = '0 min'  # fallbacks so the vars are always bound even
        scrapedplot = ''    # when `info` yields no matches
        for infoLabels['year'], duration, scrapedplot, checkUrl in info:
            if checkUrl == scrapedurl:
                break

        infoLabels['duration'] = int(duration.replace(
            ' min', '')) * 60  # duration in seconds
        scrapedthumbnail = host + scrapedthumbnail
        scrapedtitle = scrapertoolsV2.decodeHtmlentities(scrapedtitle)
        fulltitle = scrapedtitle
        if subDiv:
            fulltitle += support.typo(subText + ' _ () color limegreen')
        fulltitle += support.typo(scrapedquality.strip() + ' _ [] color kod')

        itemlist.append(
            Item(channel=item.channel,
                 action="findvideos",
                 # fixed typo: was item.contenType
                 contentType=item.contentType,
                 contentTitle=scrapedtitle,
                 contentQuality=scrapedquality.strip(),
                 plot=scrapedplot,
                 title=fulltitle,
                 fulltitle=scrapedtitle,
                 show=scrapedtitle,
                 url=scrapedurl,
                 infoLabels=infoLabels,
                 thumbnail=scrapedthumbnail))

    tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
    support.nextPage(itemlist, item, data,
                     '<span>[^<]+</span>[^<]+<a href="(.*?)">')

    return itemlist
Пример #15
0
def scrapeBlock(item, args, block, patron, headers, action, pagination, debug,
                typeContentDict, typeActionDict, blacklist, search, pag,
                function, lang):
    """Scrape one HTML block with *patron* and build the resulting Items.

    Named groups in *patron* are mapped onto the keys listed in
    `known_keys`; any extra group is copied verbatim onto the Item as an
    attribute.  `pagination`/`pag` window the matches, `blacklist` drops
    unwanted titles and `search` filters by (case-insensitive) substring.

    Returns a tuple (itemlist, matches).
    """
    itemlist = []
    log("scrapeBlock qui", block, patron)
    matches = scrapertoolsV2.find_multiple_matches_groups(block, patron)
    log('MATCHES =', matches)

    if debug:
        regexDbg(item, patron, headers, block)

    known_keys = [
        'url', 'title', 'title2', 'season', 'episode', 'thumb', 'quality',
        'year', 'plot', 'duration', 'genere', 'rating', 'type', 'lang'
    ]
    # Legend of known_keys for the groups used in patrons
    # known_keys = ['url', 'title', 'title2', 'season', 'episode', 'thumb', 'quality',
    #                'year', 'plot', 'duration', 'genere', 'rating', 'type', 'lang']
    # url = relative or absolute link to the film/series title page
    # title = title of the Film/Series/Anime/Other
    # title2 = episode title for Series/Anime/Other
    # season = season, in numeric format
    # episode = episode number, in numeric format
    # thumb = relative or absolute link to the poster
    # quality = declared video quality
    # year = year in numeric format (4 digits)
    # duration = running time of the Film/Series/Anime/Other
    # genere = genre, e.g. adventure, comedy
    # rating = score/vote in numeric format
    # type = video type, e.g. movie for films, tvshow for series; usually a discriminator used by the site
    # lang = video language, e.g. ITA, Sub-ITA, Sub, SUB ITA
    # WARNING: if the title is found on TMDB/TVDB/other, the posters and
    # other info will not be the ones scraped from the site!!!!

    stagione = ''  # for sites that carry the season in the block but not in the episodes
    for i, match in enumerate(matches):
        if pagination and (pag - 1) * pagination > i: continue  # pagination
        if pagination and i >= pag * pagination: break  # pagination
        listGroups = match.keys()
        match = match.values()

        if len(listGroups) > len(match):  # to fix a bug
            match = list(match)
            match.extend([''] * (len(listGroups) - len(match)))

        # Map every known key to its captured value; missing keys become ''.
        scraped = {}
        for kk in known_keys:
            val = match[listGroups.index(kk)] if kk in listGroups else ''
            # Make relative url/thumb links absolute using item.url's host.
            if val and (kk == "url" or kk == 'thumb') and 'http' not in val:
                val = scrapertoolsV2.find_single_match(
                    item.url, 'https?://[a-z0-9.-]+') + val
            scraped[kk] = val

        # NOTE(review): scraped values default to '' (never None), so this
        # condition is always true -- confirm intent before changing.
        if scraped['season'] != None:
            season = scraped['season']
        # NOTE(review): `stagione` is never reassigned above, so this branch
        # is currently dead code.
        if stagione:
            episode = season + 'x' + scraped['episode']
        elif item.contentType == 'tvshow' and (scraped['episode'] == ''
                                               and season == ''):
            item.args = 'season_completed'
            episode = ''
        else:
            # Normalise separators (dash / x / HTML entities) to a plain 'x'.
            episode = re.sub(r'\s-\s|-|x|&#8211|&#215;', 'x',
                             scraped['episode']) if scraped['episode'] else ''

        #episode = re.sub(r'\s-\s|-|x|&#8211|&#215;', 'x', scraped['episode']) if scraped['episode'] else ''
        title = cleantitle(scraped['title']) if scraped['title'] else ''
        title2 = cleantitle(scraped['title2']) if scraped['title2'] else ''
        quality = scraped['quality'].strip() if scraped['quality'] else ''
        Type = scraped['type'] if scraped['type'] else ''
        plot = cleantitle(scraped["plot"]) if scraped["plot"] else ''

        # make formatted Title [longtitle]
        s = ' - '
        title = episode + (s if episode and title else '') + title
        longtitle = title + (s if title and title2 else '') + title2
        longtitle = typo(longtitle, 'bold')
        longtitle += (typo(Type, '_ () bold') if Type else '') + (typo(
            quality, '_ [] color kod') if quality else '')

        lang1, longtitle = scrapeLang(scraped, lang, longtitle)

        # if title is set, probably this is a list of episodes or video sources
        # the == scraped["title"] comparison is required, otherwise the groups after the categories are not picked up
        if item.infoLabels["title"] == scraped["title"]:
            infolabels = item.infoLabels
        else:
            infolabels = {}
            if scraped['year']:
                infolabels['year'] = scraped['year']
            if scraped["plot"]:
                infolabels['plot'] = plot
            if scraped['duration']:
                # Parse "1h 30" / "1:30" / "1.30"-style durations into minutes.
                matches = scrapertoolsV2.find_multiple_matches(
                    scraped['duration'],
                    r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
                for h, m in matches:
                    scraped['duration'] = int(h) * 60 + int(m)
                if not matches:
                    scraped['duration'] = scrapertoolsV2.find_single_match(
                        scraped['duration'], r'(\d+)')
                infolabels['duration'] = int(scraped['duration']) * 60
            if scraped['genere']:
                genres = scrapertoolsV2.find_multiple_matches(
                    scraped['genere'], '[A-Za-z]+')
                infolabels['genere'] = ", ".join(genres)
            if scraped["rating"]:
                infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(
                    scraped["rating"])

        # Resolve contentType (CT) and action (AC) from the site's "type"
        # discriminator, falling back to the item's / default values.
        AC = CT = ''
        if typeContentDict:
            for name, variants in typeContentDict.items():
                if str(scraped['type']).lower() in variants:
                    CT = name
                    break
                else:
                    CT = item.contentType
        if typeActionDict:
            for name, variants in typeActionDict.items():
                if str(scraped['type']).lower() in variants:
                    AC = name
                    break
                else:
                    AC = action

        if (scraped["title"] not in blacklist) and (search.lower()
                                                    in longtitle.lower()):
            it = Item(
                channel=item.channel,
                action=AC if AC else action,
                contentType='episode'
                if function == 'episodios' else CT if CT else item.contentType,
                title=longtitle,
                fulltitle=item.fulltitle if function == 'episodios' else title,
                show=item.show if function == 'episodios' else title,
                quality=quality,
                url=scraped["url"],
                infoLabels=infolabels,
                thumbnail=item.thumbnail
                if function == 'episodios' else scraped["thumb"],
                args=item.args,
                contentSerieName=scraped['title'] if item.contentType
                or CT != 'movie' and function != 'episodios' else
                item.fulltitle if function == 'episodios' else '',
                contentTitle=scraped['title']
                if item.contentType or CT == 'movie' else '',
                contentLanguage=lang1,
                contentEpisodeNumber=episode if episode else '')

            # Copy any non-standard capture group verbatim onto the Item.
            for lg in list(set(listGroups).difference(known_keys)):
                it.__setattr__(lg, match[listGroups.index(lg)])

            if 'itemHook' in args:
                it = args['itemHook'](it)
            itemlist.append(it)

    return itemlist, matches
Пример #16
0
def episodios(item):
    """Two-phase episode listing.

    First call (item.lang unset): detect which language blocks (ITA /
    SUB ITA) the page offers and emit one selector Item per language
    found, stashing the downloaded page in Item.data.
    Second call (item.lang set): parse the stashed page and emit one Item
    per episode of the chosen language, plus a videolibrary entry.
    """
    #logger.info("%s episodios log: %s" % (__channel__, item))
    itemlist = []

    if not (item.lang):
        lang_season = {'ITA': 0, 'SUB ITA': 0}
        # Download the page
        data = httptools.downloadpage(item.url).data
        #========
        # Some pages hide the episode list behind an interstitial link.
        if 'clicca qui per aprire' in data.lower():
            logger.info("%s CLICCA QUI PER APRIRE GLI EPISODI log: %s" %
                        (__channel__, item))
            item.url = scrapertoolsV2.find_single_match(
                data, '"go_to":"(.*?)"')
            item.url = item.url.replace("\\", "")
            # Load the page
            data = httptools.downloadpage(item.url).data
            #logger.info("%s FINE CLICCA QUI PER APRIRE GLI EPISODI log: %s" % (__channel__, item))
        elif 'clicca qui</span>' in data.lower():
            logger.info("%s inizio CLICCA QUI</span> log: %s" %
                        (__channel__, item))
            item.url = scrapertoolsV2.find_single_match(
                data, '<h2 style="text-align: center;"><a href="(.*?)">')
            data = httptools.downloadpage(item.url).data
            #logger.info("%s fine CLICCA QUI</span> log: %s" % (__channel__, item))
        #=========
        data = scrapertoolsV2.decodeHtmlentities(data)
        # Count how many season spoilers belong to each language.
        bloque = scrapertoolsV2.find_single_match(
            data, '<div class="su-accordion">(.*?)<div class="clear"></div>')
        patron = '<span class="su-spoiler-icon"></span>(.*?)</div>'
        matches = scrapertoolsV2.find_multiple_matches(bloque, patron)
        for scrapedseason in matches:
            #logger.info("%s scrapedseason log: %s" % (__channel__, scrapedseason))
            if "(SUB ITA)" in scrapedseason.upper():
                lang = "SUB ITA"
                lang_season['SUB ITA'] += 1
            else:
                lang = "ITA"
                lang_season['ITA'] += 1
            #logger.info("%s lang_dict log: %s" % (__channel__, lang_season))

        # One selector Item per language that actually has seasons; the
        # downloaded page travels along in `data` for the second phase.
        for lang in sorted(lang_season):
            if lang_season[lang] > 0:
                itemlist.append(
                    Item(
                        channel=item.channel,
                        action="episodios",
                        #contentType = "episode",
                        contentSerieName=item.title,
                        title='%s (%s)' % (item.title, lang),
                        url=item.url,
                        fulltitle=item.title,
                        data=data,
                        lang=lang,
                        show=item.show,
                        folder=True,
                    ))

        # Posters, plot and other info from TMDB; a year, when present,
        # improves the lookup accuracy.
        tmdb.set_infoLabels_itemlist(itemlist,
                                     seekTmdb=True,
                                     idioma_busqueda='it')

        return itemlist

    else:
        # Second phase: list the season episodes from the stashed page.
        html = item.data
        logger.info("%s else log: [%s]" % (__channel__, item))

        # item.lang is interpolated into a regex below, so escape the parens.
        if item.lang == 'SUB ITA':
            item.lang = '\(SUB ITA\)'
            logger.info("%s item.lang log: %s" % (__channel__, item.lang))
        bloque = scrapertoolsV2.find_single_match(
            html, '<div class="su-accordion">(.*?)<div class="clear"></div>')
        patron = '<span class="su-spoiler-icon"></span>.*?' + item.lang + '</div>(.*?)</div>'  # read all the seasons
        #logger.info("%s patronpatron log: %s" % (__channel__, patron))
        matches = scrapertoolsV2.find_multiple_matches(bloque, patron)
        for scrapedseason in matches:
            #logger.info("%s scrapedseasonscrapedseason log: %s" % (__channel__, scrapedseason))
            scrapedseason = scrapedseason.replace('<strong>',
                                                  '').replace('</strong>', '')
            patron = '(\d+)×(\d+)(.*?)<(.*?)<br />'  # season - episode - title - link group
            matches = scrapertoolsV2.find_multiple_matches(
                scrapedseason, patron)
            for scrapedseason, scrapedpuntata, scrapedtitolo, scrapedgroupurl in matches:
                #logger.info("%s finale log: %s" % (__channel__, patron))
                scrapedtitolo = scrapedtitolo.replace('–', '')
                itemlist.append(
                    Item(
                        channel=item.channel,
                        action="findvideos",
                        contentType="episode",
                        #contentSerieName = item.contentSerieName,
                        contentTitle=scrapedtitolo,
                        title='%sx%s %s' %
                        (scrapedseason, scrapedpuntata, scrapedtitolo),
                        url=scrapedgroupurl,
                        fulltitle=item.fulltitle,
                        #show = item.show,
                        #folder = True,
                    ))

        logger.info("%s itemlistitemlist log: %s" % (__channel__, itemlist))

        # Option "Add this series to the KODI videolibrary"
        if item.extra != "library":
            if config.get_videolibrary_support(
            ) and len(itemlist) > 0 and item.extra != 'findvideos':
                itemlist.append(
                    Item(channel=item.channel,
                         title="%s" % config.get_localized_string(30161),
                         text_color="green",
                         extra="episodios",
                         action="add_serie_to_library",
                         url=item.url,
                         thumbnail=get_thumb('videolibrary', auto=True),
                         contentTitle=item.contentSerieName,
                         lang=item.lang,
                         show=item.show,
                         data=html
                         #, infoLabels = item.infoLabels
                         ))

        return itemlist
Пример #17
0
def cleantitle(title):
    """Return *title* decoded from HTML entities, stripped of markup and
    with a few typographic characters normalised ("" -> ', × -> x, – -> -)."""
    decoded = scrapertoolsV2.decodeHtmlentities(title)
    normalized = decoded.replace('"', "'").replace('×', 'x').replace('–', '-')
    return scrapertoolsV2.htmlclean(normalized).strip()
Пример #18
0
def scrape(item,
           patron='',
           listGroups=[],
           headers="",
           blacklist="",
           data="",
           patron_block="",
           patronNext="",
           action="findvideos",
           url_host="",
           addVideolibrary=True):
    """Generic channel scraper.

    Downloads ``item.url`` (unless *data* is supplied), optionally narrows
    the page with *patron_block*, then applies *patron* and maps each
    capturing group to the key name at the same position in *listGroups*
    to build an itemlist.  Results are enriched via TMDB, a next-page entry
    is appended when *patronNext* matches, and a videolibrary entry is
    added for episode/source listings.

    NOTE(review): ``listGroups=[]`` is a mutable default argument; it is
    only read here, but callers must not mutate it.
    """
    # patron: the patron to use for scraping page, all capturing group must match with listGroups
    # listGroups: a list containing the scraping info obtained by your patron, in order
    # accepted values are: url, title, thumb, quality, year, plot, duration, genre, rating

    # header: values to pass to request header
    # blacklist: titles that you want to exclude(service articles for example)
    # data: if you want to pass data manually, maybe because you need some custom replacement
    # patron_block: patron to get parts of the page (to scrape with patron attribute),
    #               if you need a "block inside another block" you can create a list, please note that all matches
    #               will be packed as string
    # patronNext: patron for scraping next page link
    # action: if you want results perform an action different from "findvideos", useful when scraping film by genres
    # url_host: string to prepend to scrapedurl, useful when url don't contain host
    # example usage:
    #   import support
    #   itemlist = []
    #   patron = 'blablabla'
    #   headers = [['Referer', host]]
    #   blacklist = 'Request a TV serie!'
    #   return support.scrape(item, itemlist, patron, ['thumb', 'quality', 'url', 'title', 'year', 'plot'],
    #                           headers=headers, blacklist=blacklist)

    itemlist = []

    if not data:
        data = httptools.downloadpage(item.url,
                                      headers=headers).data.replace("'", '"')
        data = re.sub('\n|\t', ' ', data)
        # replace all ' with " and eliminate newline, so we don't need to worry about
        log('DATA =', data)

        block = data

        # NOTE(review): this patron_block handling is nested inside
        # "if not data", so a caller-supplied *data* bypasses patron_block
        # entirely — confirm this is intended.
        if patron_block:
            if type(patron_block) == str:
                patron_block = [patron_block]

            # Apply each block pattern in turn; all matches of one pass are
            # joined into a single string before the next pass runs.
            for n, regex in enumerate(patron_block):
                blocks = scrapertoolsV2.find_multiple_matches(block, regex)
                block = ""
                for b in blocks:
                    block += "\n" + b
                log('BLOCK ', n, '=', block)
    else:
        block = data
    if patron and listGroups:
        matches = scrapertoolsV2.find_multiple_matches(block, patron)
        log('MATCHES =', matches)

        for match in matches:
            # NOTE(review): with a single capturing group each match is a
            # plain string, so len(match) is its character count here.
            if len(listGroups) > len(match):  # to fix a bug
                match = list(match)
                match.extend([''] * (len(listGroups) - len(match)))

            # Pull each known field out of the match tuple by the position
            # its key occupies in listGroups; missing keys default to ''.
            scrapedurl = url_host + match[listGroups.index(
                'url')] if 'url' in listGroups else ''
            scrapedtitle = match[listGroups.index(
                'title')] if 'title' in listGroups else ''
            scrapedthumb = match[listGroups.index(
                'thumb')] if 'thumb' in listGroups else ''
            scrapedquality = match[listGroups.index(
                'quality')] if 'quality' in listGroups else ''
            scrapedyear = match[listGroups.index(
                'year')] if 'year' in listGroups else ''
            scrapedplot = match[listGroups.index(
                'plot')] if 'plot' in listGroups else ''
            scrapedduration = match[listGroups.index(
                'duration')] if 'duration' in listGroups else ''
            scrapedgenre = match[listGroups.index(
                'genre')] if 'genre' in listGroups else ''
            scrapedrating = match[listGroups.index(
                'rating')] if 'rating' in listGroups else ''

            title = scrapertoolsV2.decodeHtmlentities(scrapedtitle)
            plot = scrapertoolsV2.decodeHtmlentities(scrapedplot)
            # Kodi label markup: bold title, optional blue quality suffix.
            if scrapedquality:
                longtitle = '[B]' + title + '[/B] [COLOR blue][' + scrapedquality + '][/COLOR]'
            else:
                longtitle = '[B]' + title + '[/B]'

            if item.infoLabels[
                    "title"] or item.fulltitle:  # if title is set, probably this is a list of episodes or video sources
                infolabels = item.infoLabels
            else:
                infolabels = {}
                if scrapedyear:
                    infolabels['year'] = scrapedyear
                if scrapedplot:
                    infolabels['plot'] = plot
                if scrapedduration:
                    # Parse "Hh MM"-style durations into minutes, then
                    # store seconds as Kodi expects.
                    matches = scrapertoolsV2.find_multiple_matches(
                        scrapedduration,
                        r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
                    for h, m in matches:
                        scrapedduration = int(h) * 60 + int(m)
                    infolabels['duration'] = int(scrapedduration) * 60
                if scrapedgenre:
                    genres = scrapertoolsV2.find_multiple_matches(
                        scrapedgenre, '[A-Za-z]+')
                    infolabels['genre'] = ", ".join(genres)
                if scrapedrating:
                    infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(
                        scrapedrating)

            # Blacklist is a substring check ("in") — titles containing a
            # blacklisted string are dropped.
            if not scrapedtitle in blacklist:
                itemlist.append(
                    Item(channel=item.channel,
                         action=action,
                         contentType=item.contentType,
                         title=longtitle,
                         fulltitle=title,
                         show=title,
                         quality=scrapedquality,
                         url=scrapedurl,
                         infoLabels=infolabels,
                         thumbnail=scrapedthumb))

        tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)

        if patronNext:
            nextPage(itemlist, item, data, patronNext, 2)

        if addVideolibrary and (item.infoLabels["title"] or item.fulltitle):
            item.fulltitle = item.infoLabels["title"]
            videolibrary(itemlist, item)

    return itemlist
Пример #19
0
def scrapeBlock(item, args, block, patron, headers, action, pagination, debug,
                typeContentDict, typeActionDict, blacklist, search, pag,
                function, lang):
    """Scrape one HTML *block* with *patron* and build the resulting Items.

    Named capturing groups of *patron* are matched against ``known_keys``
    (url, title, episode, ...); any other named group is attached verbatim
    as an extra attribute on the Item.  Supports optional pagination
    (slice of the matches belonging to page *pag*), a title *blacklist*,
    a *search* substring filter and contentType/action remapping through
    *typeContentDict* / *typeActionDict*.

    Returns:
        tuple: (itemlist, matches) — the built Items and the raw regex
        match dicts.
    """
    itemlist = []
    log("scrapeBlock qui", block, patron)
    matches = scrapertoolsV2.find_multiple_matches_groups(block, patron)
    log('MATCHES =', matches)

    if debug:
        regexDbg(item, patron, headers, block)

    known_keys = [
        'url', 'title', 'title2', 'episode', 'thumb', 'quality', 'year',
        'plot', 'duration', 'genere', 'rating', 'type', 'lang'
    ]
    # lang = ''  # added to handle sites whose TV-series pages mix ITA and Sub-ITA videos
    for i, match in enumerate(matches):
        # Pagination: skip matches before the current page, stop after it.
        if pagination and (pag - 1) * pagination > i: continue  # pagination
        if pagination and i >= pag * pagination: break  # pagination
        listGroups = match.keys()
        match = match.values()

        if len(listGroups) > len(match):  # to fix a bug
            match = list(match)
            match.extend([''] * (len(listGroups) - len(match)))

        scraped = {}
        for kk in known_keys:
            val = match[listGroups.index(kk)] if kk in listGroups else ''
            # Relative url/thumb values get the host of item.url prepended.
            if val and (kk == "url" or kk == 'thumb') and 'http' not in val:
                val = scrapertoolsV2.find_single_match(
                    item.url, 'https?://[a-z0-9.-]+') + val
            scraped[kk] = val

        # Normalise episode separators (dashes, ×, HTML entities) to "x".
        episode = re.sub(r'\s-\s|-|x|&#8211|&#215;', 'x',
                         scraped['episode']) if scraped['episode'] else ''
        title = cleantitle(scraped['title']) if scraped['title'] else ''
        title2 = cleantitle(scraped['title2']) if scraped['title2'] else ''
        quality = scraped['quality'].strip() if scraped['quality'] else ''
        Type = scraped['type'] if scraped['type'] else ''
        plot = cleantitle(scraped["plot"]) if scraped["plot"] else ''

        # make formatted Title [longtitle]
        s = ' - '
        title = episode + (s if episode and title else '') + title
        longtitle = title + (s if title and title2 else '') + title2
        longtitle = typo(longtitle, 'bold')
        longtitle += (typo(Type, '_ () bold') if Type else '') + (typo(
            quality, '_ [] color kod') if quality else '')

        # # to drop the [ITA] tag from lists that are not titles (e.g.: genres)
        # if action != 'peliculas':
        #     lang, longtitle = scrapeLang(scraped, lang, longtitle)
        # else:
        #     longtitle = longtitle.replace('[ITA]','')
        #     lang = ''

        lang, longtitle = scrapeLang(scraped, lang, longtitle)

        # if title is set, probably this is a list of episodes or video sources
        # the "== scraped['title']" comparison is required, otherwise groups after categories are not picked up
        if item.infoLabels["title"] == scraped["title"]:
            infolabels = item.infoLabels
        else:
            infolabels = {}
            if scraped['year']:
                infolabels['year'] = scraped['year']
            if scraped["plot"]:
                infolabels['plot'] = plot
            if scraped['duration']:
                # Parse "Hh MM"-style durations into minutes, falling back
                # to a bare number, then store seconds as Kodi expects.
                matches = scrapertoolsV2.find_multiple_matches(
                    scraped['duration'],
                    r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
                for h, m in matches:
                    scraped['duration'] = int(h) * 60 + int(m)
                if not matches:
                    scraped['duration'] = scrapertoolsV2.find_single_match(
                        scraped['duration'], r'(\d+)')
                infolabels['duration'] = int(scraped['duration']) * 60
            if scraped['genere']:
                genres = scrapertoolsV2.find_multiple_matches(
                    scraped['genere'], '[A-Za-z]+')
                infolabels['genere'] = ", ".join(genres)
            if scraped["rating"]:
                infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(
                    scraped["rating"])

        # Remap contentType / action based on the scraped 'type' value.
        # NOTE(review): the else branch re-assigns CT/AC on EVERY
        # non-matching dict entry, so a match found by an earlier entry can
        # be overwritten by a later one — confirm the dicts hold one entry.
        AC = CT = ''
        if typeContentDict:
            for name, variants in typeContentDict.items():
                if str(scraped['type']).lower() in variants:
                    CT = name
                else:
                    CT = item.contentType
        if typeActionDict:
            for name, variants in typeActionDict.items():
                if str(scraped['type']).lower() in variants:
                    AC = name
                else:
                    AC = action

        if (scraped["title"] not in blacklist) and (search.lower()
                                                    in longtitle.lower()):
            it = Item(
                channel=item.channel,
                action=AC if AC else action,
                contentType='episode'
                if function == 'episodios' else CT if CT else item.contentType,
                title=longtitle,
                fulltitle=item.fulltitle if function == 'episodios' else title,
                show=item.show if function == 'episodios' else title,
                quality=quality,
                url=scraped["url"],
                infoLabels=infolabels,
                thumbnail=item.thumbnail
                if function == 'episodios' else scraped["thumb"],
                args=item.args,
                contentSerieName=title
                if item.contentType != 'movie' and function != 'episodios' else
                item.fulltitle if function == 'episodios' else '',
                contentTitle=title if item.contentType == 'movie' else '',
                contentLanguage=lang,
                ep=episode if episode else '')

            # Attach any non-standard named groups as raw Item attributes.
            for lg in list(set(listGroups).difference(known_keys)):
                it.__setattr__(lg, match[listGroups.index(lg)])

            if 'itemHook' in args:
                it = args['itemHook'](it)
            itemlist.append(it)

    return itemlist, matches
Пример #20
0
def peliculas(item):
    """Build the paginated movie list for the channel.

    The site returns up to 60 results per page; they are shown as two
    virtual pages of 30 entries each, toggled through ``item.next_page``
    ('a' -> first half, 'b' -> second half plus the site's real next link).

    Args:
        item: navigation Item whose ``url`` points to the listing page.

    Returns:
        A list of movie Items, plus a ">> Página siguiente" Item when more
        results are available.
    """
    logger.info()
    itemlist = []
    url_next_page = ""

    data = httptools.downloadpage(item.url).data
    data = re.sub(r"\n|\r|\t|\s{2}|&nbsp;", "", data)

    patron = '<article class.*?'
    patron += '<a href="([^"]+)">.*?'
    patron += '<img src="([^"]+)".*?'
    patron += '<aside class="item-control down">(.*?)</aside>.*?'
    patron += '<small class="pull-right text-muted">([^<]+)</small>.*?'
    patron += '<h2 class.*?>([^<]+)</h2>'

    matches = re.compile(patron, re.DOTALL).findall(data)

    # Virtual pagination: 'a' state -> first 30 matches, 'b' state -> the
    # remaining matches plus the site's own "Siguiente" link.
    if item.next_page != 'b':
        if len(matches) > 30:
            url_next_page = item.url
        matches = matches[:30]
        next_page = 'b'
    else:
        matches = matches[30:]
        next_page = 'a'
        patron_next_page = 'Anteriores</a> <a href="([^"]+)" class="btn btn-default ".*?Siguiente'
        matches_next_page = re.compile(patron_next_page,
                                       re.DOTALL).findall(data)
        if len(matches_next_page) > 0:
            url_next_page = matches_next_page[0]

    # Flag-image filename suffix -> language tag shown in the title.
    lang_tags = (("la_la.png", "LAT"), ("en_en.png", "VO"),
                 ("en_es.png", "VOSE"), ("es_es.png", "ESP"))

    for scrapedurl, scrapedthumbnail, idiomas, year, scrapedtitle in matches:
        patronidiomas = "<img src='([^']+)'"
        matchesidiomas = re.compile(patronidiomas, re.DOTALL).findall(idiomas)

        idiomas_disponibles = []
        for idioma in matchesidiomas:
            for suffix, tag in lang_tags:
                if idioma.endswith(suffix):
                    idiomas_disponibles.append(tag)
                    break

        contentTitle = scrapertoolsV2.decodeHtmlentities(scrapedtitle.strip())
        # FIX: the original formatted the raw empty list when no language
        # flag was recognised, producing titles like "Movie []".
        if idiomas_disponibles:
            title = "%s [%s]" % (contentTitle, "/".join(idiomas_disponibles))
        else:
            title = contentTitle

        itemlist.append(
            Item(channel=item.channel,
                 action="findvideos",
                 title=title,
                 url=scrapedurl,
                 thumbnail=scrapedthumbnail,
                 contentTitle=contentTitle,
                 infoLabels={"year": year},
                 text_color=color1))

    # Fetch basic TMDB data for all movies (multithreaded helper).
    tmdb.set_infoLabels(itemlist)

    # Append the pagination entry when needed.
    if url_next_page:
        itemlist.append(
            Item(channel=item.channel,
                 action="peliculas",
                 title=">> Página siguiente",
                 thumbnail=thumbnail_host,
                 url=url_next_page,
                 next_page=next_page,
                 folder=True,
                 text_color=color3,
                 text_bold=True))

    return itemlist
Пример #21
0
def findvideos(item):
    """Resolve playable video links for a movie.

    Queries the olimpo.link widget with the TMDB id embedded in item.url,
    resolves each /embed/ page to its real hoster url, groups the servers
    by language, and (outside the videolibrary) prepends the trailer
    search and "add to library" helper entries.

    Returns:
        A list of Items: language headers, playable entries, helpers.
    """
    logger.info()
    itemlist = []
    sublist = []

    # Download the playlist page; the TMDB id comes from the channel url.
    url = "http://widget.olimpo.link/playlist/?tmdb=" + scrapertools.find_single_match(
        item.url, 'yaske.ro/([0-9]+)')
    data = httptools.downloadpage(url).data
    if not item.plot:
        item.plot = scrapertoolsV2.find_single_match(
            data, '>Sinopsis</dt> <dd>([^<]+)</dd>')
        item.plot = scrapertoolsV2.decodeHtmlentities(item.plot)

    patron = '(/embed/[^"]+).*?'
    patron += 'quality text-overflow ">([^<]+).*?'
    patron += 'title="([^"]+)'
    matches = scrapertools.find_multiple_matches(data, patron)

    for url, calidad, idioma in matches:
        if 'embed' in url:
            # Each /embed/ page wraps the real hoster url in an iframe.
            url = "http://widget.olimpo.link" + url
            data = httptools.downloadpage(url).data
            url = scrapertools.find_single_match(data, 'iframe src="([^"]+)')
            sublist.append(
                item.clone(channel=item.channel,
                           action="play",
                           url=url,
                           folder=False,
                           text_color=color1,
                           quality=calidad.strip(),
                           language=idioma.strip()))
    sublist = servertools.get_servers_itemlist(
        sublist, lambda i: "Ver en %s %s" % (i.server, i.quality), True)

    # Group the discovered servers by language.
    for k in ["Español", "Latino", "Subtitulado", "Ingles"]:
        # FIX: the original used filter(), whose lazy iterator is always
        # truthy on Python 3, so empty language headers were still added.
        lista_idioma = [i for i in sublist if i.language == k]
        if lista_idioma:
            itemlist.append(
                Item(channel=item.channel,
                     title=k,
                     fanart=item.fanart,
                     folder=False,
                     text_color=color2,
                     text_bold=True,
                     thumbnail=thumbnail_host))
            itemlist.extend(lista_idioma)

    # Insert the "search trailer" and "add to videolibrary" entries.
    if itemlist and item.extra != "library":
        title = "%s [Buscar trailer]" % (item.contentTitle)
        itemlist.insert(
            0,
            item.clone(channel="trailertools",
                       action="buscartrailer",
                       text_color=color3,
                       title=title,
                       viewmode="list"))

        if config.get_videolibrary_support():
            itemlist.append(
                Item(channel=item.channel,
                     title="Añadir película a la videoteca",
                     action="add_pelicula_to_library",
                     url=item.url,
                     text_color="green",
                     contentTitle=item.contentTitle,
                     extra="library",
                     thumbnail=thumbnail_host))

    return itemlist
Пример #22
0
def findvideos(item):
    """Collect streaming links for a movie page; episode items are
    delegated to findvid_serie.  Links are parsed out of the page's
    Streaming / Streaming HD / Streaming 3D tables."""
    findhost()

    if item.contentType == "episode":
        return findvid_serie(item)

    def load_links(itemlist, re_txt, color, desc_txt, quality=""):
        # Extract the link table matched by re_txt and append one playable
        # Item per hoster anchor found inside it.
        streaming = scrapertoolsV2.find_single_match(data,
                                                     re_txt).replace('"', '')
        support.log('STREAMING=', streaming)
        patron = '<td><a.*?href=(.*?) (?:target|rel)[^>]+>([^<]+)<'
        matches = re.compile(patron, re.DOTALL).findall(streaming)
        for scrapedurl, scrapedtitle in matches:
            logger.debug("##### findvideos %s ## %s ## %s ##" %
                         (desc_txt, scrapedurl, scrapedtitle))
            title = "[COLOR " + color + "]" + desc_txt + ":[/COLOR] " + item.fulltitle + " [COLOR grey]" + QualityStr + "[/COLOR] [COLOR blue][" + scrapedtitle + "][/COLOR]"
            itemlist.append(
                Item(channel=item.channel,
                     action="play",
                     title=title,
                     url=scrapedurl,
                     server=scrapedtitle,
                     fulltitle=item.fulltitle,
                     thumbnail=item.thumbnail,
                     show=item.show,
                     quality=quality,
                     contentType=item.contentType,
                     folder=False))

    support.log()

    itemlist = []

    # Download the page
    data = httptools.downloadpage(item.url).data
    data = re.sub('\n|\t', '', data)

    # Extract the quality format
    patronvideos = '>([^<]+)</strong></div>'
    matches = re.compile(patronvideos, re.DOTALL).finditer(data)
    QualityStr = ""
    # Keeps only the last match; [6:] strips a fixed-length prefix.
    for match in matches:
        QualityStr = scrapertoolsV2.decodeHtmlentities(match.group(1))[6:]

    # Extract content - Streaming
    load_links(
        itemlist,
        '<strong>Streaming:</strong>(.*?)<tableclass=cbtable height=30>',
        "orange", "Streaming", "SD")

    # Extract content - Streaming HD
    load_links(
        itemlist,
        '<strong>Streaming HD[^<]+</strong>(.*?)<tableclass=cbtable height=30>',
        "yellow", "Streaming HD", "HD")

    # NOTE(review): autoplay.start is invoked again further below (after
    # the 3D links); calling it twice may duplicate autoplay handling —
    # confirm whether this first call should be removed.
    autoplay.start(itemlist, item)

    # Extract content - Streaming 3D
    load_links(
        itemlist,
        '<strong>Streaming 3D[^<]+</strong>(.*?)<tableclass=cbtable height=30>',
        "pink", "Streaming 3D")

    # Extract content - Download
    # load_links(itemlist, '<strong>Download:</strong>(.*?)<tableclass=cbtable height=30>', "aqua", "Download")

    # Extract content - Download HD
    # load_links(itemlist, '<strong>Download HD[^<]+</strong>(.*?)<tableclass=cbtable width=100% height=20>', "azure", "Download HD")

    # Fallback: let servertools auto-detect links in the raw page.
    if len(itemlist) == 0:
        itemlist = servertools.find_video_items(item=item)

    # Required for link checking

    if __comprueba_enlaces__:
        itemlist = servertools.check_list_links(itemlist,
                                                __comprueba_enlaces_num__)

    # Required for FilterTools

    itemlist = filtertools.get_links(itemlist, item, list_language)

    # Required for AutoPlay

    autoplay.start(itemlist, item)

    support.videolibrary(itemlist, item)

    return itemlist
Пример #23
0
def scrape(item,
           patron='',
           listGroups=[],
           headers="",
           blacklist="",
           data="",
           patron_block="",
           patronNext="",
           action="findvideos",
           addVideolibrary=True,
           type_content_dict={},
           type_action_dict={}):
    """Generic channel scraper (episode/lang-aware variant).

    Downloads ``item.url`` (unless *data* is supplied), optionally narrows
    the page with *patron_block*, then applies *patron* and maps each
    capturing group to the key name at the same position in *listGroups*.
    Supports episode/lang keys, contentType/action remapping through
    *type_content_dict* / *type_action_dict*, TMDB enrichment, next-page
    and videolibrary entries.

    NOTE(review): ``listGroups=[]``, ``type_content_dict={}`` and
    ``type_action_dict={}`` are mutable default arguments; they are only
    read here, but callers must not mutate them.
    """
    # patron: the patron to use for scraping page, all capturing group must match with listGroups
    # listGroups: a list containing the scraping info obtained by your patron, in order
    # accepted values are: url, title, thumb, quality, year, plot, duration, genre, rating, episode, lang

    # header: values to pass to request header
    # blacklist: titles that you want to exclude(service articles for example)
    # data: if you want to pass data manually, maybe because you need some custom replacement
    # patron_block: patron to get parts of the page (to scrape with patron attribute),
    #               if you need a "block inside another block" you can create a list, please note that all matches
    #               will be packed as string
    # patronNext: patron for scraping next page link
    # action: if you want results perform an action different from "findvideos", useful when scraping film by genres
    # url_host: string to prepend to scrapedurl, useful when url don't contain host
    # example usage:
    #   import support
    #   itemlist = []
    #   patron = 'blablabla'
    #   headers = [['Referer', host]]
    #   blacklist = 'Request a TV serie!'
    #   return support.scrape(item, itemlist, patron, ['thumb', 'quality', 'url', 'title', 'title2', 'year', 'plot', 'episode', 'lang'],
    #                           headers=headers, blacklist=blacklist)
    # listGroups
    #    thumb = image, quality = quality, url = single or group link, title = film or series title, title2 = extra title
    #    year = year of the film or series, plot = film or series description, episode = season number - episode number for series,
    #    lang = language of the video
    # 'type' is a check for typologies of content e.g. Film or TV Series
    # 'episode' is a key to grab episode numbers if it is separated from the title
    # IMPORTANT 'type' is a special key, to work need type_content_dict={} and type_action_dict={}

    itemlist = []

    if not data:
        data = httptools.downloadpage(item.url,
                                      headers=headers,
                                      ignore_response_code=True).data.replace(
                                          "'", '"')
        data = re.sub('\n|\t', ' ', data)
        # replace all ' with " and eliminate newline, so we don't need to worry about
        log('DATA =', data)

        block = data

        # NOTE(review): this patron_block handling is nested inside
        # "if not data", so a caller-supplied *data* bypasses patron_block
        # entirely — confirm this is intended.
        if patron_block:
            if type(patron_block) == str:
                patron_block = [patron_block]

            # Apply each block pattern in turn; all matches of one pass are
            # joined into a single string before the next pass runs.
            for n, regex in enumerate(patron_block):
                blocks = scrapertoolsV2.find_multiple_matches(block, regex)
                block = ""
                for b in blocks:
                    block += "\n" + str(b)
                log('BLOCK ', n, '=', block)
    else:
        block = data
    if patron and listGroups:
        matches = scrapertoolsV2.find_multiple_matches(block, patron)
        log('MATCHES =', matches)

        known_keys = [
            'url', 'title', 'title2', 'episode', 'thumb', 'quality', 'year',
            'plot', 'duration', 'genere', 'rating', 'type', 'lang'
        ]  # by greko: added 'episode'
        lang = ''  # added to handle sites whose TV-series pages mix ITA and Sub-ITA videos

        for match in matches:
            # NOTE(review): with a single capturing group each match is a
            # plain string, so len(match) is its character count here.
            if len(listGroups) > len(match):  # to fix a bug
                match = list(match)
                match.extend([''] * (len(listGroups) - len(match)))

            scraped = {}
            for kk in known_keys:
                val = match[listGroups.index(kk)] if kk in listGroups else ''
                # Relative url/thumb values get the host of item.url prepended.
                if val and (kk == "url"
                            or kk == 'thumb') and 'http' not in val:
                    val = scrapertoolsV2.find_single_match(
                        item.url, 'https?://[a-z0-9.-]+') + val
                scraped[kk] = val

            title = scrapertoolsV2.htmlclean(
                scrapertoolsV2.decodeHtmlentities(scraped["title"])).replace(
                    '’', '\'').replace('"',
                                       "'").strip()  # fix by greko: " -> '
            plot = scrapertoolsV2.htmlclean(
                scrapertoolsV2.decodeHtmlentities(scraped["plot"]))

            # Build the formatted display title: bold base title, optional
            # quality tag, episode prefix and secondary title suffix.
            longtitle = typo(title, 'bold')
            if scraped['quality']:
                longtitle = longtitle + typo(scraped['quality'],
                                             '_ [] color kod')
            if scraped['episode']:
                scraped['episode'] = re.sub(r'\s-\s|-|x|&#8211', 'x',
                                            scraped['episode'])
                longtitle = typo(scraped['episode'] + ' - ',
                                 'bold') + longtitle
            if scraped['title2']:
                title2 = scrapertoolsV2.htmlclean(
                    scrapertoolsV2.decodeHtmlentities(
                        scraped["title2"])).replace('"', "'").strip()
                longtitle = longtitle + typo(title2, 'bold _ -- _')

            ##    Added/changed to handle sites that list both ITA and
            ##    Sub-ITA videos of TV series on the same page
            if scraped['lang']:
                if 'sub' in scraped['lang'].lower():
                    lang = 'Sub-ITA'
                else:
                    lang = 'ITA'
            if lang != '':
                longtitle += typo(lang, '_ [] color kod')

            if item.infoLabels[
                    "title"] or item.fulltitle:  # if title is set, probably this is a list of episodes or video sources
                infolabels = item.infoLabels
            else:
                infolabels = {}
                if scraped["year"]:
                    infolabels['year'] = scraped["year"]
                if scraped["plot"]:
                    infolabels['plot'] = plot
                if scraped["duration"]:
                    # Parse "Hh MM"-style durations into minutes, falling
                    # back to a bare number, then store seconds for Kodi.
                    matches = scrapertoolsV2.find_multiple_matches(
                        scraped["duration"],
                        r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
                    for h, m in matches:
                        scraped["duration"] = int(h) * 60 + int(m)
                    if not matches:
                        scraped["duration"] = scrapertoolsV2.find_single_match(
                            scraped["duration"], r'(\d+)')
                    infolabels['duration'] = int(scraped["duration"]) * 60
                if scraped["genere"]:
                    genres = scrapertoolsV2.find_multiple_matches(
                        scraped["genere"], '[A-Za-z]+')
                    infolabels['genere'] = ", ".join(genres)
                if scraped["rating"]:
                    infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(
                        scraped["rating"])

            # Remap contentType / action based on the scraped 'type' value.
            if type_content_dict:
                for name, variants in type_content_dict.items():
                    if scraped['type'] in variants:
                        item.contentType = name
            if type_action_dict:
                for name, variants in type_action_dict.items():
                    if scraped['type'] in variants:
                        action = name

            # When called from an 'episodios' function, force episode type.
            if inspect.stack()[1][3] == 'episodios':
                item.contentType = 'episode'

            if scraped["title"] not in blacklist:
                it = Item(channel=item.channel,
                          action=action,
                          contentType=item.contentType,
                          title=longtitle,
                          fulltitle=title,
                          show=title,
                          language=lang if lang != '' else '',
                          quality=scraped["quality"],
                          url=scraped["url"],
                          infoLabels=infolabels,
                          thumbnail=scraped["thumb"],
                          args=item.args)

                # Attach any non-standard groups as raw Item attributes.
                for lg in list(set(listGroups).difference(known_keys)):
                    it.__setattr__(lg, match[listGroups.index(lg)])

                itemlist.append(it)
        checkHost(item, itemlist)
        # Only ask TMDB when the itemlist is a content listing, not a list
        # of playable sources; otherwise propagate the parent's infoLabels.
        if (item.contentType == "tvshow" and (action != "findvideos" and action != "play")) \
                or (item.contentType == "episode" and action != "play") \
                or (item.contentType == "movie" and action != "play"):
            tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
        else:
            for it in itemlist:
                it.infoLabels = item.infoLabels

        if patronNext:
            nextPage(itemlist, item, data, patronNext, 2)

        if addVideolibrary and (item.infoLabels["title"] or item.fulltitle):
            item.fulltitle = item.infoLabels["title"]
            videolibrary(itemlist, item)

    return itemlist