def __get_episode_url(self, data, hostDict):
    scraper = cfscrape.create_scraper()
    try:
        value = "/seasons/" + cleantitle.geturl(data['tvshowtitle']) + '-season-' + data['season']
        url = self.base_link + value
        # Request the base URL first (warms up the Cloudflare session), then fetch the season page.
        html = scraper.get(self.base_link)
        html = scraper.get(url)
        page_list = BeautifulSoup(html.text, 'html.parser')
        page_list = page_list.find_all('div', {'class': 'episodiotitle'})
        ep_page = ''
        for i in page_list:
            if re.sub(r'\W+', '', data['title'].lower()) in re.sub(r'\W+', '', i.text.lower()):
                ep_page = i.prettify()
        if ep_page == '':
            return ''
        ep_page = BeautifulSoup(ep_page, 'html.parser').find_all('a')[0]['href']
        html = scraper.get(ep_page)
        embed = re.findall('<iframe.+?src=\"(.+?)\"', html.text)[0]
        url = embed
        sources = []
        if 'mehliz' in url:
            html = scraper.get(url, headers={'referer': self.base_link + '/'})
            files = re.findall('file: \"(.+?)\".+?label: \"(.+?)\"', html.text)
            for i in files:
                try:
                    sources.append({
                        'source': 'gvideo',
                        'quality': i[1],
                        'language': 'en',
                        'url': i[0] + "|Referer=https://www.mehlizmovies.is",
                        'direct': True,
                        'debridonly': False
                    })
                except Exception:
                    pass
        else:
            valid, hoster = source_utils.is_host_valid(url, hostDict)
            if not valid:
                return ''
            urls, host, direct = source_utils.check_directstreams(url, hoster)
            sources.append({
                'source': host,
                'quality': urls[0]['quality'],
                'language': 'en',
                'url': url + "|Referer=https://www.mehlizmovies.is",
                'direct': False,
                'debridonly': False
            })
        return sources
    except Exception:
        print("Unexpected error in Mehliz __get_episode_url script:")
        exc_type, exc_obj, exc_tb = sys.exc_info()
        print(exc_type, exc_tb.tb_lineno)
        return ""
def __search(self, titles, year):
    try:
        query = self.search_link % (urllib.quote_plus(cleantitle.getsearch(titles[0])))
        query = urlparse.urljoin(self.base_link, query)
        t = cleantitle.get(titles[0])
        scraper = cfscrape.create_scraper()
        data = scraper.get(query).content
        #data = client.request(query, referer=self.base_link)
        data = client.parseDOM(data, 'div', attrs={'class': 'result-item'})
        r = dom_parser.parse_dom(data, 'div', attrs={'class': 'title'})
        r = zip(dom_parser.parse_dom(r, 'a'), dom_parser.parse_dom(data, 'span', attrs={'class': 'year'}))
        url = []
        for i in range(len(r)):
            title = cleantitle.get(r[i][0][1])
            title = re.sub('(\d+p|4k|3d|hd|season\d+)', '', title)
            y = r[i][1][1]
            link = r[i][0][0]['href']
            if 'season' in title:
                continue
            if t == title and y == year:
                if 'season' in link:
                    url.append(source_utils.strip_domain(link))
                    print url[0]
                    return url[0]
                else:
                    url.append(source_utils.strip_domain(link))
        return url
    except:
        return
def getter(code):
    synonyms = get(code)
    if not synonyms:
        msg = 'no synonyms'
    else:
        msg = '{} synonyms: {}'.format(len(synonyms), synonyms)
    return 'for code {!r} found {}\n'.format(code, msg)
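# Minimal usage sketch for getter() (hypothetical data; assumes get() returns a
# list of synonym strings, as the function above expects):
#
#     >>> getter('abc')          # if get('abc') == ['x', 'y']
#     "for code 'abc' found 2 synonyms: ['x', 'y']\n"
#     >>> getter('zzz')          # if get('zzz') returns an empty list
#     "for code 'zzz' found no synonyms\n"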
def __get_movie_url(self, data, hostDict):
    scraper = cfscrape.create_scraper()
    try:
        html = scraper.get(self.base_link + "/movies/" + cleantitle.geturl(data['title']))
        embeds = re.findall('play-box-iframe.+\s<iframe.+?src=\"(.+?)\"', html.text)[0]
        url = embeds
        sources = []
        if 'mehliz' in url:
            html = scraper.get(url, headers={'referer': self.base_link + '/'})
            files = re.findall('file: \"(.+?)\".+?label: \"(.+?)\"', html.text)
            for i in files:
                try:
                    sources.append({
                        'source': 'gvideo',
                        'quality': i[1],
                        'language': 'en',
                        'url': i[0] + "|Referer=https://www.mehlizmovies.is",
                        'direct': True,
                        'debridonly': False
                    })
                except Exception:
                    pass
        else:
            valid, hoster = source_utils.is_host_valid(url, hostDict)
            if not valid:
                return ''
            urls, host, direct = source_utils.check_directstreams(url, hoster)
            sources.append({
                'source': host,
                'quality': urls[0]['quality'],
                'language': 'en',
                'url': url + "|Referer=https://www.mehlizmovies.is",
                'direct': False,
                'debridonly': False
            })
        return sources
    except Exception:
        print("Unexpected error in Mehliz __get_movie_url script:")
        exc_type, exc_obj, exc_tb = sys.exc_info()
        print(exc_type, exc_tb.tb_lineno)
        return ""
def mz_server(self, url):
    try:
        scraper = cfscrape.create_scraper()
        urls = []
        data = scraper.get(url).content
        data = re.findall('''file:\s*["']([^"']+)",label:\s*"(\d{3,}p)"''', data, re.DOTALL)
        for url, label in data:
            label = source_utils.label_to_quality(label)
            if label == 'SD':
                continue
            urls.append({'url': url, 'quality': label})
        return urls
    except:
        return url
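# Rough illustration of what mz_server() pulls out of the player JavaScript.
# The sample string below is made up, not captured from the real site:
import re

sample_js = 'file: "https://example.test/video.mp4",label: "720p"'
pairs = re.findall('''file:\s*["']([^"']+)",label:\s*"(\d{3,}p)"''', sample_js, re.DOTALL)
# pairs == [('https://example.test/video.mp4', '720p')]; mz_server() then maps each
# label to a quality string and collects {'url': ..., 'quality': ...} dicts.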
def get_current_dogs():
    driver = scraper.get(url)
    dogs = []
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    dogs_info = soup.find_all("div", "wsb-media-carousel")
    for dog in dogs_info[0].find_all("a"):
        dogs.append({
            "id": dog['rel'][0],
            "image": "http:" + dog['href'],
            "info": dog['title']
        })
    driver.close()
    return dogs
def links_found(self, urls):
    try:
        scraper = cfscrape.create_scraper()
        links = []
        if type(urls) is list:
            for item in urls:
                query = urlparse.urljoin(self.base_link, item)
                r = scraper.get(query).content
                data = client.parseDOM(r, 'div', attrs={'id': 'playex'})
                data = client.parseDOM(data, 'div', attrs={'id': 'option-\d+'})
                links += client.parseDOM(data, 'iframe', ret='src')
                print links
        else:
            query = urlparse.urljoin(self.base_link, urls)
            r = scraper.get(query).content
            data = client.parseDOM(r, 'div', attrs={'id': 'playex'})
            data = client.parseDOM(data, 'div', attrs={'id': 'option-\d+'})
            links += client.parseDOM(data, 'iframe', ret='src')
        return links
    except:
        return urls
async def scan():
    while True:
        await bot.wait_until_ready()
        file = json_manager.curent_file("./json/delivery.json")
        for url in file:
            last_event = list(scraper.get(url).get("events").values())[0]
            # "Colis livré au destinataire" is the Mondial Relay event text for
            # "parcel delivered to the recipient".
            if last_event == "Colis livré au destinataire":
                await bot.get_channel(json_manager.get(json_manager.config_file_uri, "relais_channel_id")).send(
                    "Great news! " + file.get(url).get("name") + " has just arrived")
                dict_without_it = json_manager.curent_file("./json/delivery.json")
                del dict_without_it[url]
                a_file = open("./json/delivery.json", "w")
                json.dump(dict_without_it, a_file, indent=4)
                a_file.close()
            else:
                if not file.get(url).get("last_event") == last_event:
                    await bot.get_channel(json_manager.get(json_manager.config_file_uri, "relais_channel_id")).send(
                        file.get(url).get("name") + "```\n" + last_event + "```")
                    json_manager.update("./json/delivery.json", url, {
                        "author": file.get(url).get("author"),
                        "name": file.get(url).get("name"),
                        "last_event": last_event
                    })
        await asyncio.sleep(100)
def get_current_dogs():
    driver = scraper.get(url)
    dogs = []
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for dog in soup.find_all("div", "info-card-grid__item"):
        dog_info = dog.find_all("div")
        dog_details = dog_info[1].find_all("li")
        dogs.append({
            "id": dog_details[4].get_text(strip=True),
            "url": dog.a['href'],
            "image": dog_info[0].img['src'],
            "name": dog_info[2].get_text(strip=True),
            "breed": dog_details[0].get_text(strip=True),
            "gender": dog_details[2].get_text(strip=True),
            "age": dog_details[1].get_text(strip=True)
        })
    driver.close()
    return dogs
async def get(self, ctx, *args):
    url = ""
    name = ""
    for i in args:
        if i.find("mondialrelay.fr/suivi-de-colis/?NumeroExpedition=") >= 0:
            url = i
        else:
            name += i + " "
    if url == "":
        await ctx.send("Invalid URL")
        return False
    result = scraper.get(url)
    last_event = ""
    text = "\n```"
    for i in range(len(result.get("events"))):
        text += list(result.get("events"))[i] + " " + list(result.get("events").values())[i] + "\n"
        last_event = list(result.get("events").values())[i]
    text += "```"
    status = "Your parcel has arrived" if result.get("delivered") else "Your parcel has not arrived yet"
    await ctx.send(status + text)
    json_manager.update("./json/delivery.json", url, {
        "author": ctx.message.author.id,
        "name": name,
        "last_event": last_event
    })
def episode(self, url, imdb, tvdb, title, premiered, season, episode):
    try:
        if not url:
            return
        url = urlparse.urljoin(self.base_link, url)
        scraper = cfscrape.create_scraper()
        data = scraper.get(url).content
        data = client.parseDOM(data, 'ul', attrs={'class': 'episodios'})
        links = client.parseDOM(data, 'div', attrs={'class': 'episodiotitle'})
        sp = zip(client.parseDOM(data, 'div', attrs={'class': 'numerando'}),
                 client.parseDOM(links, 'a', ret='href'))
        # e.g. season 1, episode 3 -> '1x3', matched against the site's 'numerando' labels
        Sea_Epi = '%dx%d' % (int(season), int(episode))
        for i in sp:
            sep = i[0]
            if sep == Sea_Epi:
                url = source_utils.strip_domain(i[1])
        return url
    except:
        return
    return discord_stats(page)  # end of the stats-formatting function defined above this snippet


if __name__ == '__main__':
    import os
    import json
    import discord
    import requests
    from scraper import get
    from discord import Webhook, RequestsWebhookAdapter

    btag = 'LZR#119553'
    webhook = Webhook.from_url(os.environ['OW_WEBHOOK'], adapter=RequestsWebhookAdapter())

    #with open('BEER.html', 'r') as f:
    #with open('Rorschach-11181.html', 'r') as f:
    #with open('Cupnoodle.html', 'r') as f:
    #page = f.read()
    page = get(
        f'https://playoverwatch.com/en-us/career/pc/{btag.replace("#","-")}'
    ).content

    stats = scraping(page, btag)
    with open('ow_scraping_results.json', 'w') as f:
        json.dump(stats, f, indent=2)

    webhook.send(embed=discord_stats(stats))
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        if url is None:
            return sources

        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])

        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        show = True if 'tvshowtitle' in data else False
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) if 'tvshowtitle' in data else data['year']

        query = '%s' % (data['tvshowtitle']) if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)

        url = self.search_link % urllib.quote_plus(query)
        url = urlparse.urljoin(self.base_link, url)

        scraper = cfscrape.create_scraper()
        r = scraper.get(url).content
        u = r

        next_page = True
        num = 1
        while next_page:
            try:
                np = re.findall('<link rel="next" href="([^"]+)', u)[0]
                # client.request was causing a timeout on links for DDL Valley, falling back on cfscrape
                #u = client.request(np, headers=headers, cookie=cookie, timeout=5)
                u = scraper.get(np).content
                r += u
            except:
                next_page = False

        items = dom_parser2.parse_dom(r, 'h2')
        items = [dom_parser2.parse_dom(i.content, 'a', req=['href', 'rel', 'title', 'data-wpel-link']) for i in items]
        items = [(i[0].content, i[0].attrs['href']) for i in items]
        items = [(i[0], i[1]) for i in items
                 if cleantitle.get_simple(title.lower()) in cleantitle.get_simple(i[0].lower())]

        for item in items:
            try:
                name = item[0]
                name = client.replaceHTMLCodes(name)
                # client.request was causing a timeout on links for DDL Valley, falling back on cfscrape
                #r = client.request(item[1], headers=headers, cookie=cookie, timeout=15)
                r = scraper.get(item[1]).content
                links = dom_parser2.parse_dom(r, 'a', req=['href', 'rel', 'data-wpel-link', 'target'])
                links = [i.attrs['href'] for i in links]
                if show:
                    links = [i for i in links if hdlr.lower() in i.lower()]

                for url in links:
                    try:
                        if hdlr in name:
                            fmt = re.sub('(.+)(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*)(\.|\)|\]|\s)', '', name.upper())
                            fmt = re.split('\.|\(|\)|\[|\]|\s|\-', fmt)
                            fmt = [i.lower() for i in fmt]

                            if any(i.endswith(('subs', 'sub', 'dubbed', 'dub')) for i in fmt):
                                raise Exception()
                            if any(i in ['extras'] for i in fmt):
                                raise Exception()

                            if '1080p' in fmt:
                                quality = '1080p'
                            elif '720p' in fmt:
                                quality = '720p'
                            else:
                                quality = 'SD'
                            if any(i in ['dvdscr', 'r5', 'r6'] for i in fmt):
                                quality = 'SCR'
                            elif any(i in ['camrip', 'tsrip', 'hdcam', 'hdts', 'dvdcam', 'dvdts', 'cam', 'telesync', 'ts'] for i in fmt):
                                quality = 'CAM'

                            info = []
                            if '3d' in fmt:
                                info.append('3D')

                            try:
                                # pull a release size such as "1.40 GB" out of the release name
                                size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) (?:GB|GiB|MB|MiB))', name)[-1]
                                div = 1 if size.endswith(('GB', 'GiB')) else 1024
                                size = float(re.sub('[^0-9|/.|/,]', '', size)) / div
                                size = '%.2f GB' % size
                                info.append(size)
                            except:
                                pass

                            if any(i in ['hevc', 'h265', 'x265'] for i in fmt):
                                info.append('HEVC')

                            info = ' | '.join(info)

                            if not any(x in url for x in ['.rar', '.zip', '.iso']):
                                url = client.replaceHTMLCodes(url)
                                url = url.encode('utf-8')
                                host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                                host = client.replaceHTMLCodes(host)
                                host = host.encode('utf-8')
                                if host in hostDict:
                                    sources.append({'source': host, 'quality': quality, 'language': 'en',
                                                    'url': url, 'info': info, 'direct': False, 'debridonly': False})
                                elif host in hostprDict:
                                    sources.append({'source': host, 'quality': quality, 'language': 'en',
                                                    'url': url, 'info': info, 'direct': False, 'debridonly': True})
                    except:
                        pass
            except:
                pass

        # prefer non-CAM sources when any exist
        check = [i for i in sources if not i['quality'] == 'CAM']
        if check:
            sources = check

        return sources
    except:
        return sources
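# Hedged sketch of the querystring payload sources() expects (field names taken
# from the parse_qs handling above; the values here are made-up examples):
#
#     "tvshowtitle=Some+Show&title=Some+Show&season=1&episode=2&year=2018"
#
# parse_qs plus the dict comprehension flatten that into plain strings, so hdlr
# becomes 'S01E02' for episodes (or the bare year for movies), and only release
# names and links containing that tag are kept.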