def load_episodios(html, item, itemlist, lang_title):
    """Parse episode anchors out of *html* and append one Item per episode.

    html: HTML fragment containing the episode links.
    item: parent Item; its channel/thumbnail/extra/show are propagated.
    itemlist: list of Item, mutated in place (nothing is returned).
    lang_title: language tag appended to every title, e.g. 'ITA' or 'Sub-ITA'.
    """
    patron = '.*?<a href="[^"]+"[^o]+ofollow[^>]+>[^<]+</a><(?:b|/)[^>]+>'
    matches = re.compile(patron).findall(html)
    for data in matches:
        # Strip anchor tags and the hoster name, then clean residual HTML.
        scrapedtitle = scrapertoolsV2.htmlclean(
            re.sub(r'(<a [^>]+>)*(<\/a>.*)*(Speedvideo)*', '', data)).strip()
        if scrapedtitle != 'Categorie':
            # Normalize the 'NxM' season/episode separator and drop stray
            # semicolons.  NOTE(fix): the original performed the identical
            # '×' -> 'x' replacement twice in a row; one call suffices.
            scrapedtitle = scrapedtitle.replace('×', 'x').replace(';', '')
            # Build the display/full title once instead of twice.
            fulltitle = scrapedtitle + " (" + lang_title + ")"
            itemlist.append(
                Item(channel=item.channel,
                     action="findvideos",
                     contentType="episode",
                     title="[COLOR azure]%s[/COLOR]" % fulltitle,
                     url=data,
                     thumbnail=item.thumbnail,
                     extra=item.extra,
                     fulltitle=fulltitle + ' - ' + item.show,
                     show=item.show))
def cleantitle(title):
    """Return *title* decoded, HTML-cleaned and normalized.

    Decodes HTML entities, replaces double quotes with single quotes,
    turns the '×' separator into 'x' and an en-dash into '-', then strips
    surrounding whitespace.

    NOTE(fix): the original bound its result to a local variable named
    `cleantitle`, shadowing the function itself; renamed to `cleaned`.
    """
    cleaned = scrapertoolsV2.htmlclean(
        scrapertoolsV2.decodeHtmlentities(title).replace('"', "'").replace(
            '×', 'x').replace('–', '-')).strip()
    return cleaned
def scrape(item, patron='', listGroups=[], headers="", blacklist="", data="",
           patron_block="", patronNext="", action="findvideos",
           addVideolibrary=True, type_content_dict={}, type_action_dict={}):
    """Generic page scraper: download, match, and build an Item list.

    WARNING(review): `listGroups`, `type_content_dict` and `type_action_dict`
    use mutable default arguments; they are not mutated below, but this is a
    known Python pitfall — consider `None` sentinels.

    # patron: the patron to use for scraping page, all capturing group must match with listGroups
    # listGroups: a list containing the scraping info obtained by your patron, in order
    #             accepted values are: url, title, thumb, quality, year, plot, duration,
    #             genre, rating, episode, lang
    # headers: values to pass to request header
    # blacklist: titles that you want to exclude (service articles for example)
    # data: if you want to pass data manually, maybe because you need some custom replacement
    # patron_block: patron to get parts of the page (to scrape with patron attribute),
    #               if you need a "block inside another block" you can create a list;
    #               please note that all matches will be packed as string
    # patronNext: patron for scraping next page link
    # action: if you want results to perform an action different from "findvideos",
    #         useful when scraping film by genres
    # example usage:
    #     import support
    #     itemlist = []
    #     patron = 'blablabla'
    #     headers = [['Referer', host]]
    #     blacklist = 'Request a TV serie!'
    #     return support.scrape(item, itemlist, patron,
    #                           ['thumb', 'quality', 'url', 'title', 'title2', 'year',
    #                            'plot', 'episode', 'lang'],
    #                           headers=headers, blacklist=blacklist)
    # listGroups semantics (translated from Italian):
    #     thumb = image, quality = quality, url = single or grouped link,
    #     title = film/series title, title2 = extra title,
    #     year = film/series year, plot = description,
    #     episode = season/episode number for series, lang = video language
    # 'type' is a check for typologies of content e.g. Film or TV Series
    # 'episode' is a key to grab episode numbers if it is separated from the title
    # IMPORTANT: 'type' is a special key; to work it needs type_content_dict={}
    # and type_action_dict={}
    """
    itemlist = []
    if not data:
        data = httptools.downloadpage(item.url, headers=headers,
                                      ignore_response_code=True).data.replace("'", '"')
        # Replace all ' with " and eliminate newline/tab, so later patterns
        # don't need to worry about quoting style or line breaks.
        data = re.sub('\n|\t', ' ', data)
    log('DATA =', data)

    block = data
    if patron_block:
        # A single pattern is promoted to a one-element list so nested
        # "block inside a block" extraction works uniformly.
        if type(patron_block) == str:
            patron_block = [patron_block]
        for n, regex in enumerate(patron_block):
            blocks = scrapertoolsV2.find_multiple_matches(block, regex)
            block = ""
            for b in blocks:
                # All matches of this level are re-packed into one string.
                block += "\n" + str(b)
            log('BLOCK ', n, '=', block)
    else:
        block = data

    if patron and listGroups:
        matches = scrapertoolsV2.find_multiple_matches(block, patron)
        log('MATCHES =', matches)

        known_keys = [
            'url', 'title', 'title2', 'episode', 'thumb', 'quality', 'year',
            'plot', 'duration', 'genere', 'rating', 'type', 'lang'
        ]  # by greko: added 'episode'
        # 'lang' handles sites whose series pages mix ITA and Sub-ITA videos.
        lang = ''

        for match in matches:
            if len(listGroups) > len(match):  # to fix a bug: pad short tuples
                match = list(match)
                match.extend([''] * (len(listGroups) - len(match)))

            scraped = {}
            for kk in known_keys:
                val = match[listGroups.index(kk)] if kk in listGroups else ''
                # Relative url/thumb values get the host prefix taken from item.url.
                if val and (kk == "url" or kk == 'thumb') and 'http' not in val:
                    val = scrapertoolsV2.find_single_match(
                        item.url, 'https?://[a-z0-9.-]+') + val
                scraped[kk] = val

            title = scrapertoolsV2.htmlclean(
                scrapertoolsV2.decodeHtmlentities(scraped["title"])).replace(
                    '’', '\'').replace('"', "'").strip()  # fix by greko: " -> '
            plot = scrapertoolsV2.htmlclean(
                scrapertoolsV2.decodeHtmlentities(scraped["plot"]))

            longtitle = typo(title, 'bold')
            if scraped['quality']:
                longtitle = longtitle + typo(scraped['quality'], '_ [] color kod')
            if scraped['episode']:
                # Normalize every separator variant to the canonical 'x'.
                scraped['episode'] = re.sub(r'\s-\s|-|x|–', 'x', scraped['episode'])
                longtitle = typo(scraped['episode'] + ' - ', 'bold') + longtitle
            if scraped['title2']:
                title2 = scrapertoolsV2.htmlclean(
                    scrapertoolsV2.decodeHtmlentities(
                        scraped["title2"])).replace('"', "'").strip()
                longtitle = longtitle + typo(title2, 'bold _ -- _')

            ## Added/modified to handle sites that list both ITA and Sub-ITA
            ## videos of a TV series on the same page.
            if scraped['lang']:
                if 'sub' in scraped['lang'].lower():
                    lang = 'Sub-ITA'
                else:
                    lang = 'ITA'
            # NOTE(review): `lang` persists across loop iterations; once set
            # it tags every following match until overwritten.
            if lang != '':
                longtitle += typo(lang, '_ [] color kod')

            if item.infoLabels[
                    "title"] or item.fulltitle:  # if title is set, probably this is a list of episodes or video sources
                infolabels = item.infoLabels
            else:
                infolabels = {}
                if scraped["year"]:
                    infolabels['year'] = scraped["year"]
                if scraped["plot"]:
                    infolabels['plot'] = plot
                if scraped["duration"]:
                    # Accept "1h 30", "1:30", "1.30", etc.; fall back to a bare
                    # minutes number.  NOTE: this rebinds `matches` (the outer
                    # page-matches list) — harmless only because the outer loop
                    # iterator was already created.
                    matches = scrapertoolsV2.find_multiple_matches(
                        scraped["duration"],
                        r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
                    for h, m in matches:
                        scraped["duration"] = int(h) * 60 + int(m)
                    if not matches:
                        scraped["duration"] = scrapertoolsV2.find_single_match(
                            scraped["duration"], r'(\d+)')
                    infolabels['duration'] = int(scraped["duration"]) * 60
                if scraped["genere"]:
                    genres = scrapertoolsV2.find_multiple_matches(
                        scraped["genere"], '[A-Za-z]+')
                    infolabels['genere'] = ", ".join(genres)
                if scraped["rating"]:
                    infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(
                        scraped["rating"])

            # Map the scraped 'type' group onto a contentType and/or action.
            if type_content_dict:
                for name, variants in type_content_dict.items():
                    if scraped['type'] in variants:
                        item.contentType = name
            if type_action_dict:
                for name, variants in type_action_dict.items():
                    if scraped['type'] in variants:
                        action = name

            # When called from an `episodios` function, force episode type.
            if inspect.stack()[1][3] == 'episodios':
                item.contentType = 'episode'

            # NOTE(review): with a string blacklist this is a substring test,
            # with a list it is an exact-membership test.
            if scraped["title"] not in blacklist:
                it = Item(channel=item.channel,
                          action=action,
                          contentType=item.contentType,
                          title=longtitle,
                          fulltitle=title,
                          show=title,
                          language=lang if lang != '' else '',
                          quality=scraped["quality"],
                          url=scraped["url"],
                          infoLabels=infolabels,
                          thumbnail=scraped["thumb"],
                          args=item.args)
                # Any extra (non-known) capture groups become Item attributes.
                for lg in list(set(listGroups).difference(known_keys)):
                    it.__setattr__(lg, match[listGroups.index(lg)])
                itemlist.append(it)

        checkHost(item, itemlist)
        # Enrich with TMDB metadata except for terminal (playable) listings.
        if (item.contentType == "tvshow" and (action != "findvideos" and action != "play")) \
                or (item.contentType == "episode" and action != "play") \
                or (item.contentType == "movie" and action != "play"):
            tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
        else:
            for it in itemlist:
                it.infoLabels = item.infoLabels

    if patronNext:
        nextPage(itemlist, item, data, patronNext, 2)

    if addVideolibrary and (item.infoLabels["title"] or item.fulltitle):
        item.fulltitle = item.infoLabels["title"]
        videolibrary(itemlist, item)

    return itemlist
def peliculas(item):
    """Build the movie list for *item.url*, with language flags and paging.

    Returns a list of Item objects (action 'findvideos'), plus a trailing
    'next page' Item when the page exposes one.
    """
    logger.info()
    itemlist = []

    data = httptools.downloadpage(item.url).data
    data = re.sub(r"\n|\r|\t|\s{2}| ", "", data)

    patron = 'class="post-item-image btn-play-item".*?'
    patron += 'href="([^"]+)">.*?'
    patron += '<img data-original="([^"]+)".*?'
    patron += 'glyphicon-calendar"></i>([^<]+).*?'
    patron += 'post-item-flags"> (.*?)</div.*?'
    patron += 'text-muted f-14">(.*?)</h3'
    matches = scrapertools.find_multiple_matches(data, patron)

    patron_next_page = 'href="([^"]+)"> »'
    matches_next_page = scrapertools.find_single_match(data, patron_next_page)
    # BUGFIX: `url_next_page` was only assigned inside the `if` below, so the
    # later `if url_next_page:` raised UnboundLocalError on the last page.
    url_next_page = ""
    if len(matches_next_page) > 0:
        url_next_page = item.url + matches_next_page

    for scrapedurl, scrapedthumbnail, year, idiomas, scrapedtitle in matches:
        year = year.strip()
        # The flag images encode the available languages.
        patronidiomas = '<img src="([^"]+)"'
        matchesidiomas = scrapertools.find_multiple_matches(
            idiomas, patronidiomas)
        idiomas_disponibles = []
        for idioma in matchesidiomas:
            if idioma.endswith("/la.png"):
                idiomas_disponibles.append("LAT")
            elif idioma.endswith("/en.png"):
                idiomas_disponibles.append("VO")
            elif idioma.endswith("/en_es.png"):
                idiomas_disponibles.append("VOSE")
            elif idioma.endswith("/es.png"):
                idiomas_disponibles.append("ESP")
        if idiomas_disponibles:
            idiomas_disponibles = "[" + "/".join(idiomas_disponibles) + "]"

        contentTitle = scrapertoolsV2.htmlclean(scrapedtitle.strip())
        title = "%s %s" % (contentTitle, idiomas_disponibles)

        itemlist.append(
            Item(channel=item.channel,
                 action="findvideos",
                 title=title,
                 url=scrapedurl,
                 thumbnail=scrapedthumbnail,
                 contentTitle=contentTitle,
                 infoLabels={"year": year},
                 text_color=color1))

    # Fetch basic metadata for all movies via multithreaded TMDB lookup.
    tmdb.set_infoLabels(itemlist)

    # Append pagination entry when a next page exists.
    if url_next_page:
        itemlist.append(
            Item(channel=item.channel,
                 action="peliculas",
                 title=">> Página siguiente",
                 thumbnail=thumbnail_host,
                 url=url_next_page,
                 folder=True,
                 text_color=color3,
                 text_bold=True))

    return itemlist
def scrape(item, patron='', listGroups=[], headers="", blacklist="", data="",
           patron_block="", patronNext="", action="findvideos", url_host="",
           addVideolibrary=True):
    """Generic page scraper (simpler variant: no episode/lang/type handling).

    WARNING(review): `listGroups` is a mutable default argument; it is not
    mutated below, but this is a known Python pitfall.

    # patron: the patron to use for scraping page, all capturing group must match with listGroups
    # listGroups: a list containing the scraping info obtained by your patron, in order
    #             accepted values are: url, title, thumb, quality, year, plot, duration,
    #             genre, rating
    # headers: values to pass to request header
    # blacklist: titles that you want to exclude (service articles for example)
    # data: if you want to pass data manually, maybe because you need some custom replacement
    # patron_block: patron to get parts of the page (to scrape with patron attribute),
    #               if you need a "block inside another block" you can create a list;
    #               please note that all matches will be packed as string
    # patronNext: patron for scraping next page link
    # action: if you want results to perform an action different from "findvideos",
    #         useful when scraping film by genres
    # url_host: string to prepend to scrapedurl, useful when url doesn't contain host
    # example usage:
    #     import support
    #     itemlist = []
    #     patron = 'blablabla'
    #     headers = [['Referer', host]]
    #     blacklist = 'Request a TV serie!'
    #     return support.scrape(item, itemlist, patron,
    #                           ['thumb', 'quality', 'url', 'title', 'year', 'plot'],
    #                           headers=headers, blacklist=blacklist)
    """
    itemlist = []
    if not data:
        data = httptools.downloadpage(item.url,
                                      headers=headers).data.replace("'", '"')
        # Replace all ' with " and eliminate newline/tab, so later patterns
        # don't need to worry about quoting style or line breaks.
        data = re.sub('\n|\t', ' ', data)
    log('DATA =', data)

    block = data
    if patron_block:
        # A single pattern is promoted to a one-element list so nested
        # "block inside a block" extraction works uniformly.
        if type(patron_block) == str:
            patron_block = [patron_block]
        for n, regex in enumerate(patron_block):
            blocks = scrapertoolsV2.find_multiple_matches(block, regex)
            block = ""
            for b in blocks:
                # All matches of this level are re-packed into one string.
                block += "\n" + b
            log('BLOCK ', n, '=', block)
    else:
        block = data

    if patron and listGroups:
        matches = scrapertoolsV2.find_multiple_matches(block, patron)
        log('MATCHES =', matches)

        known_keys = [
            'url', 'title', 'thumb', 'quality', 'year', 'plot', 'duration',
            'genere', 'rating'
        ]
        for match in matches:
            if len(listGroups) > len(match):  # to fix a bug: pad short tuples
                match = list(match)
                match.extend([''] * (len(listGroups) - len(match)))

            scraped = {}
            for kk in known_keys:
                val = match[listGroups.index(kk)] if kk in listGroups else ''
                # NOTE(review): unlike the richer variant, the prefix is added
                # unconditionally — pass url_host='' when urls are absolute.
                if kk == "url":
                    val = url_host + val
                scraped[kk] = val

            title = scrapertoolsV2.decodeHtmlentities(scraped["title"]).strip()
            plot = scrapertoolsV2.htmlclean(
                scrapertoolsV2.decodeHtmlentities(scraped["plot"]))

            if scraped["quality"]:
                longtitle = '[B]' + title + '[/B] [COLOR blue][' + scraped[
                    "quality"] + '][/COLOR]'
            else:
                longtitle = '[B]' + title + '[/B]'

            if item.infoLabels[
                    "title"] or item.fulltitle:  # if title is set, probably this is a list of episodes or video sources
                infolabels = item.infoLabels
            else:
                infolabels = {}
                if scraped["year"]:
                    infolabels['year'] = scraped["year"]
                if scraped["plot"]:
                    infolabels['plot'] = plot
                if scraped["duration"]:
                    # Accept "1h 30", "1:30", "1.30", etc.; fall back to a bare
                    # minutes number.  NOTE: this rebinds `matches` (the outer
                    # page-matches list) — harmless only because the outer loop
                    # iterator was already created.
                    matches = scrapertoolsV2.find_multiple_matches(
                        scraped["duration"],
                        r'([0-9])\s*?(?:[hH]|:|\.|,|\\|\/|\||\s)\s*?([0-9]+)')
                    for h, m in matches:
                        scraped["duration"] = int(h) * 60 + int(m)
                    if not matches:
                        scraped["duration"] = scrapertoolsV2.find_single_match(
                            scraped["duration"], r'(\d+)')
                    infolabels['duration'] = int(scraped["duration"]) * 60
                if scraped["genere"]:
                    genres = scrapertoolsV2.find_multiple_matches(
                        scraped["genere"], '[A-Za-z]+')
                    infolabels['genere'] = ", ".join(genres)
                if scraped["rating"]:
                    infolabels['rating'] = scrapertoolsV2.decodeHtmlentities(
                        scraped["rating"])

            # NOTE(review): with a string blacklist this is a substring test,
            # with a list it is an exact-membership test.
            if scraped["title"] not in blacklist:
                it = Item(channel=item.channel,
                          action=action,
                          contentType=item.contentType,
                          title=longtitle,
                          fulltitle=title,
                          show=title,
                          quality=scraped["quality"],
                          url=scraped["url"],
                          infoLabels=infolabels,
                          thumbnail=scraped["thumb"])
                # Any extra (non-known) capture groups become Item attributes.
                for lg in list(set(listGroups).difference(known_keys)):
                    it.__setattr__(lg, match[listGroups.index(lg)])
                itemlist.append(it)

        # Enrich with TMDB metadata except for terminal (playable) listings.
        if (item.contentType == "episode" and (action != "findvideos" and action != "play")) \
                or (item.contentType == "movie" and action != "play"):
            tmdb.set_infoLabels_itemlist(itemlist, seekTmdb=True)
        else:
            for it in itemlist:
                it.infoLabels = item.infoLabels

    if patronNext:
        nextPage(itemlist, item, data, patronNext, 2)

    if addVideolibrary and (item.infoLabels["title"] or item.fulltitle):
        item.fulltitle = item.infoLabels["title"]
        videolibrary(itemlist, item)

    return itemlist