# Python 2 standard-library and third-party imports these scraper methods rely on.
import datetime
import json
import re
import time
import traceback
import urllib
import urlparse

import cfscrape
import requests

# Add-on internal helpers are assumed importable from the surrounding package,
# e.g. (path is an assumption based on the usual add-on layout):
from resources.lib.modules import cleantitle, client, debrid, dom_parser, \
    dom_parser2, log_utils, source_utils, workers


def tvshow(self, imdb, tvdb, tvshowtitle, localtvshowtitle, aliases, year):
    try:
        simple_title = cleantitle.getsearch(tvshowtitle)
        tvshowtitle = urllib.quote_plus(simple_title)
        search_url = urlparse.urljoin(self.base_link, self.search_link % tvshowtitle)
        r = client.request(search_url)
        if r:
            r = json.loads(r)['series']
            r = [urlparse.urljoin(self.base_link, i['seo_name']) for i in r
                 if simple_title == cleantitle.get_simple(i['original_name'])]
            if r: return r[0]
            else: return
        else:
            # Fall back to the mirror domain when the primary returns nothing.
            search_url2 = urlparse.urljoin(self.base_link2, self.search_link % tvshowtitle)
            r = client.request(search_url2)
            r = json.loads(r)['series']
            r = [urlparse.urljoin(self.base_link2, i['seo_name']) for i in r
                 if simple_title == cleantitle.get_simple(i['original_name'])]
            if r: return r[0]
            else: return
    except BaseException:
        return
def movie(self, imdb, title, localtitle, aliases, year):
    try:
        scrape = cleantitle.get_simple(title)
        google = '%s%s' % (self.goog, scrape.replace(' ', '+'))
        get_page = requests.get(google).content
        log_utils.log('Scraper bobmovies - Movie - title: ' + str(title))
        log_utils.log('Scraper bobmovies - Movie - search_id: ' + str(scrape))
        match = re.compile('<a href="(.+?)"', re.DOTALL).findall(get_page)
        for url1 in match:
            if '/url?q=' not in url1:
                continue
            if self.base_link in url1 and 'google' not in url1:
                # Strip the Google redirect wrapper to recover the target URL.
                url2 = url1.split('/url?q=')[1]
                url2 = url2.split('&')[0]
                headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                                         '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'}
                html = requests.get(url2, headers=headers, timeout=5).content
                results = re.compile(
                    '<div class="page_film_top full_film_top">.+?<h1>(.+?)</h1>.+?'
                    '<td class="name">Quality:</td><td><a href=.+?">(.+?)</a>.+?'
                    '<td class="name">Year:</td><td><a href=.+?">(.+?)</a>',
                    re.DOTALL).findall(html)
                for item_title, qual, date in results:
                    if not scrape == cleantitle.get_simple(item_title):
                        continue
                    if year not in date:
                        continue
                    log_utils.log('Scraper bobmovies - Movie - url2: ' + str(url2))
                    return url2
        return
    except:
        failure = traceback.format_exc()
        log_utils.log('BobMovies - Exception: \n' + str(failure))
        return
def tvshow(self, imdb, tvdb, tvshowtitle, localtvshowtitle, aliases, year):
    try:
        simple_title = cleantitle.get_simple(tvshowtitle)
        tvshowtitle = cleantitle.geturl(tvshowtitle).replace('-', '+')
        search_url = urlparse.urljoin(self.base_link, self.search_link % tvshowtitle)
        r = client.request(search_url)
        r = json.loads(r)['series']
        r = [urlparse.urljoin(self.base_link, i['seo_name']) for i in r
             if simple_title == cleantitle.get_simple(i['original_name'])]
        if r: return r[0]
        else: return
    except:
        return
def __search(self, titles, year):
    try:
        query = self.search_link % urllib.quote_plus(cleantitle.query(titles[0]) + ' ' + year)
        query = urlparse.urljoin(self.base_link, query)
        t = [cleantitle.get(i) for i in set(titles) if i][0]
        r = client.request(query)
        r = client.parseDOM(r, 'div', attrs={'class': 'karatula'})
        for i in r:
            title = client.parseDOM(i, 'a', ret='title')[0]
            y = re.findall('(\d{4})', title)[0]
            title = cleantitle.get_simple(title)
            if t in title and y == year:
                x = dom_parser.parse_dom(i, 'a', req='href')
                return source_utils.strip_domain(x[0][0]['href'])
        return
    except:
        return
def __search(self, titles, year):
    try:
        query = self.search_link % urllib.quote_plus(cleantitle.query(titles[0]))
        query = urlparse.urljoin(self.base_link, query)
        t = [cleantitle.get(i) for i in set(titles) if i][0]
        r = client.request(query)
        r = client.parseDOM(r, 'div', attrs={'class': 'details'})
        for i in r:
            title = client.parseDOM(i, 'div', attrs={'class': 'title'})[0]
            y = client.parseDOM(i, 'span', attrs={'class': 'year'})[0]
            title = re.findall('">(.+?)</a', title, re.DOTALL)[0]
            title = cleantitle.get_simple(title)
            if t in title and y == year:
                x = dom_parser.parse_dom(i, 'a', req='href')
                return source_utils.strip_domain(x[0][0]['href'])
        return
    except:
        return
def sources(self, url, hostDict, hostprDict):
    try:
        self._sources = []
        if url is None: return self._sources
        if debrid.status() is False: raise Exception()
        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) \
            if 'tvshowtitle' in data else data['year']
        query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) \
            if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        query = self.search_link % cleantitle.geturl(query)
        url = urlparse.urljoin(self.base_link, query)
        r = client.request(url)
        posts = dom_parser2.parse_dom(r, 'div', {'class': 'eTitle'})
        posts = [dom_parser2.parse_dom(i.content, 'a', req='href') for i in posts if i]
        posts = [(i[0].attrs['href'], re.sub('<.+?>', '', i[0].content)) for i in posts if i]
        posts = [(i[0], i[1]) for i in posts
                 if cleantitle.get_simple(i[1].split(hdlr)[0]) == cleantitle.get(title)
                 and hdlr.lower() in i[1].lower()]
        self.hostDict = hostDict + hostprDict
        threads = []
        for i in posts:
            threads.append(workers.Thread(self._get_sources, i))
        [i.start() for i in threads]
        # join() blocks until every worker has finished, so no further polling is needed.
        [i.join() for i in threads]
        return self._sources
    except Exception:
        return self._sources
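
# The threaded variants in this section dispatch each candidate post to a
# _get_sources worker that is not included here. A minimal sketch of such a
# worker, assuming the (href, title-text) tuple layout and the self.hostDict
# built above; the body is illustrative, not the original implementation.
def _get_sources(self, item):
    try:
        url, name = item
        r = client.request(url)
        # Collect every outbound link on the post page and keep only known hosts.
        links = client.parseDOM(r, 'a', ret='href')
        for link in links:
            valid, host = source_utils.is_host_valid(link, self.hostDict)
            if not valid: continue
            quality, info = source_utils.get_release_quality(name, link)
            # debridonly=True assumed, since the caller requires debrid.status().
            self._sources.append({'source': host, 'quality': quality, 'language': 'en',
                                  'url': link, 'info': ' | '.join(info),
                                  'direct': False, 'debridonly': True})
    except:
        pass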
def search(self, title, year):
    try:
        content = []
        for link in self.base_link:  # base_link is a list of mirror domains
            try:
                query = urlparse.urljoin(link, self.search_link % (urllib.quote(title), year))
                r = client.request(query)
                r = json.loads(r)
                r = r['data']['children'][0]['data']
                if not cleantitle.get_simple(r['title'].split(year)[0]) == cleantitle.get(title):
                    raise Exception()
                if year not in r['title']:
                    raise Exception()
                content = [(r['title'], r['url'])]
            except BaseException:
                pass
        return content
    except BaseException:
        return
def __search(self, titles, year):
    try:
        query = self.search_link % urllib.quote_plus(cleantitle.getsearch(titles[0] + ' ' + year))
        query = urlparse.urljoin(self.base_link, query)
        t = [cleantitle.get(i) for i in set(titles) if i][0]
        r = client.request(query)
        r = dom_parser.parse_dom(r, 'div', attrs={'class': 'v_pict'})
        for i in r:
            title = re.findall('alt="(.+?)"', i[1], re.DOTALL)[0]
            y = re.findall('(\d{4})', title, re.DOTALL)[0]
            title = re.sub('<\w+>|</\w+>', '', title)
            title = cleantitle.get_simple(title)
            if title in t and year == y:
                url = re.findall('href="(.+?)"', i[1], re.DOTALL)[0]
                return source_utils.strip_domain(url)
        return
    except:
        return
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        if url is None: return sources
        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        hdlr = 'Season %d' % int(data['season']) if 'tvshowtitle' in data else data['year']
        query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) \
            if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        url = self.search_link % urllib.quote_plus(query)
        url = urlparse.urljoin(self.base_link, url)
        # self.scraper is expected to be initialised elsewhere in the class
        # (compare the cfscrape variant below).
        r = self.scraper.get(url).content
        posts = client.parseDOM(r, 'article', attrs={'class': 'latestPost excerpt\s*\w*'})
        for post in posts:
            try:
                t = re.findall('title="([^"]+)"', post)[0]
                t2 = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', t)
                y = re.findall('[\.|\(|\[|\s](S\d*E\d*|Season\s*\d*|\d{4})[\.|\)|\]|\s]', t)[-1]
                if not (cleantitle.get_simple(t2.replace('720p / 1080p', '')) == cleantitle.get(title)
                        and y == hdlr):
                    raise Exception()
                link = client.parseDOM(post, 'a', ret='href')[0]
                if 'Episodes' not in post:
                    u = self.movie_links(link)
                else:
                    sep = 'S%02dE%02d' % (int(data['season']), int(data['episode']))
                    u = self.show_links(link, sep)
                for item in u:
                    quality, info = source_utils.get_release_quality(item[1], None)
                    try:
                        size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [MG]B)', item[3])[-1]
                        div = 1 if size.endswith(' GB') else 1024
                        size = float(re.sub('[^0-9.]', '', size)) / div
                        size = '%.2f GB' % size
                        info.append(size)
                    except:
                        pass
                    info = ' | '.join(info)
                    url = 'http://' + item[2] + item[0].split('//')[-1]
                    url = client.replaceHTMLCodes(url)
                    url = url.encode('utf-8')
                    sources.append({'source': 'popcorn', 'quality': quality, 'language': 'en',
                                    'url': url, 'info': info, 'direct': True, 'debridonly': False})
            except:
                pass
        return sources
    except:
        log_utils.log('>>>> %s TRACE <<<<\n%s' % (__file__.upper().split('\\')[-1].split('.')[0],
                                                  traceback.format_exc()), log_utils.LOGDEBUG)
        return sources
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        if url is None: return sources
        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        hdlr = 'Season %d' % int(data['season']) if 'tvshowtitle' in data else data['year']
        query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) \
            if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        url = self.search_link % urllib.quote_plus(query)
        url = urlparse.urljoin(self.base_link, url)
        r = client.request(url)
        posts = client.parseDOM(r, 'item')
        for post in posts:
            try:
                t = client.parseDOM(post, 'title')[0]
                t2 = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', t)
                y = re.findall('[\.|\(|\[|\s](S\d*E\d*|Season\s*\d*|\d{4})[\.|\)|\]|\s]', t)[-1]
                if not (cleantitle.get_simple(t2.replace('720p / 1080p', '')) == cleantitle.get(title)
                        and y == hdlr):
                    raise Exception()
                link = client.parseDOM(post, 'link')[0]
                if 'Episodes' not in post:
                    u = self.movie_links(link)
                else:
                    sep = 'S%02dE%02d' % (int(data['season']), int(data['episode']))
                    u = self.show_links(link, sep)
                for item in u:
                    # This variant pins quality to SD, so only the info tags are kept.
                    _, info = source_utils.get_release_quality(item[0][0], None)
                    try:
                        size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [MG]B)', item[0][1])[-1]
                        div = 1 if size.endswith(' GB') else 1024
                        size = float(re.sub('[^0-9.]', '', size)) / div
                        size = '%.2f GB' % size
                        info.append(size)
                    except:
                        pass
                    info = ' | '.join(info)
                    url = client.replaceHTMLCodes(item[0][0])
                    url = url.encode('utf-8')
                    sources.append({'source': 'popcorn', 'quality': 'SD', 'language': 'en',
                                    'url': url, 'info': info, 'direct': True, 'debridonly': False})
            except:
                pass
        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        if url is None: return sources
        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        hdlr = 'Season %d' % int(data['season']) if 'tvshowtitle' in data else data['year']
        query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) \
            if 'tvshowtitle' in data else '%s %s' % (data['title'], data['year'])
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        url = self.search_link % urllib.quote_plus(query)
        url = urlparse.urljoin(self.base_link, url)
        self.scraper = cfscrape.create_scraper()
        r = self.scraper.get(url).content
        posts = client.parseDOM(r, 'li')
        for post in posts:
            try:
                # Bind the anchor to its own name; rebinding `data` here would
                # clobber the query dict still needed for season/episode below.
                a = dom_parser2.parse_dom(post, 'a', req='href')[0]
                t = re.findall('title=.+?>\s*(.+?)$', a.content, re.DOTALL)[0]
                t2 = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', t)
                y = re.findall('[\.|\(|\[|\s](S\d*E\d*|Season\s*\d*|\d{4})[\.|\)|\]|\s]', t)[-1]
                if not (cleantitle.get_simple(t2.replace('720p / 1080p', '')) == cleantitle.get(title)
                        and y == hdlr):
                    raise Exception()
                link = client.parseDOM(post, 'a', ret='href')[0]
                if 'Episodes' not in post:
                    u = self.movie_links(link)
                else:
                    sep = 'S%02dE%02d' % (int(data['season']), int(data['episode']))
                    u = self.show_links(link, sep)
                for item in u:
                    quality, info = source_utils.get_release_quality(item[0][0], None)
                    try:
                        size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [MG]B)', item[0][1])[-1]
                        div = 1 if size.endswith(' GB') else 1024
                        size = float(re.sub('[^0-9.]', '', size)) / div
                        size = '%.2f GB' % size
                        info.append(size)
                    except:
                        pass
                    info = ' | '.join(info)
                    url = client.replaceHTMLCodes(item[0][0])
                    url = url.encode('utf-8')
                    sources.append({'source': 'popcorn', 'quality': quality, 'language': 'en',
                                    'url': url, 'info': info, 'direct': True, 'debridonly': False})
            except:
                pass
        return sources
    except:
        return sources
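
# The size-extraction block above is repeated verbatim across several of these
# scrapers. A hedged consolidation into one helper, assuming the same inputs;
# _parse_size is a hypothetical name, not part of the original modules.
def _parse_size(text):
    """Extract a release size like '1.4 GB' or '700 MB' and normalise to GB."""
    try:
        size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) [MG]B)', text)[-1]
        div = 1 if size.endswith(' GB') else 1024
        return '%.2f GB' % (float(re.sub('[^0-9.]', '', size)) / div)
    except:
        return None

# Callers would then replace the inline try/except with, e.g.:
#     size = _parse_size(item[0][1])
#     if size: info.append(size)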
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        if url is None: return sources
        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) \
            if 'tvshowtitle' in data else data['year']
        query = '%s S%02dE%02d' % (data['tvshowtitle'], int(data['season']), int(data['episode'])) \
            if 'tvshowtitle' in data else '%s' % data['title']
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        url = self.search_link % urllib.quote_plus(query)
        url = urlparse.urljoin(self.base_link, url)
        r = client.request(url)
        posts = client.parseDOM(r, 'item')
        items = []
        for post in posts:
            try:
                t = client.parseDOM(post, 'title')[0]
                t2 = re.sub('(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*|3D)(\.|\)|\]|\s|)(.+|)', '', t)
                if not cleantitle.get_simple(t2.replace('Watch Online', '')) == cleantitle.get(title):
                    raise Exception()
                l = client.parseDOM(post, 'link')[0]
                p = client.parseDOM(post, 'pubDate')[0]
                if data['year'] in p:
                    items += [(t, l)]
            except:
                pass
        for item in items:
            try:
                name = client.replaceHTMLCodes(item[0])
                u = client.request(item[1])
                if 'http://www.imdb.com/title/%s/' % data['imdb'] in u:
                    # Pull the embedded player iframe out of the movieplay block.
                    l = client.parseDOM(u, 'div', {'class': 'movieplay'})[0]
                    l = client.parseDOM(l, 'iframe', ret='data-lazy-src')[0]
                    quality, info = source_utils.get_release_quality(name, l)
                    info = ' | '.join(info)
                    url = client.replaceHTMLCodes(l)
                    url = url.encode('utf-8')
                    valid, host = source_utils.is_host_valid(url, hostDict)
                    sources.append({'source': host, 'quality': quality, 'language': 'en',
                                    'url': url, 'info': info, 'direct': False,
                                    'debridonly': False})
            except:
                pass
        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict):
    try:
        sources = []
        if url is None: return sources
        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        show = 'tvshowtitle' in data
        hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) \
            if 'tvshowtitle' in data else data['year']
        query = '%s' % data['tvshowtitle'] if 'tvshowtitle' in data \
            else '%s %s' % (data['title'], data['year'])
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        url = self.search_link % urllib.quote_plus(query)
        url = urlparse.urljoin(self.base_link, url)
        # client.request was timing out on ddlvalley links, so fall back on cfscrape.
        scraper = cfscrape.create_scraper()
        r = scraper.get(url).content
        u = r
        next_page = True
        while next_page:
            try:
                np = re.findall('<link rel="next" href="([^"]+)', u)[0]
                u = scraper.get(np).content
                r += u
            except:
                next_page = False
        items = dom_parser2.parse_dom(r, 'h2')
        items = [dom_parser2.parse_dom(i.content, 'a', req=['href', 'rel', 'title', 'data-wpel-link'])
                 for i in items]
        items = [(i[0].content, i[0].attrs['href']) for i in items]
        items = [(i[0], i[1]) for i in items
                 if cleantitle.get_simple(title.lower()) in cleantitle.get_simple(i[0].lower())]
        for item in items:
            try:
                name = client.replaceHTMLCodes(item[0])
                r = scraper.get(item[1]).content
                links = dom_parser2.parse_dom(r, 'a', req=['href', 'rel', 'data-wpel-link', 'target'])
                links = [i.attrs['href'] for i in links]
                if show:
                    links = [i for i in links if hdlr.lower() in i.lower()]
                for url in links:
                    try:
                        if hdlr not in name: continue
                        fmt = re.sub('(.+)(\.|\(|\[|\s)(\d{4}|S\d*E\d*|S\d*)(\.|\)|\]|\s)', '', name.upper())
                        fmt = re.split('\.|\(|\)|\[|\]|\s|\-', fmt)
                        fmt = [i.lower() for i in fmt]
                        if any(i.endswith(('subs', 'sub', 'dubbed', 'dub')) for i in fmt): raise Exception()
                        if any(i in ['extras'] for i in fmt): raise Exception()
                        if '1080p' in fmt: quality = '1080p'
                        elif '720p' in fmt: quality = '720p'
                        else: quality = 'SD'
                        if any(i in ['dvdscr', 'r5', 'r6'] for i in fmt): quality = 'SCR'
                        elif any(i in ['camrip', 'tsrip', 'hdcam', 'hdts', 'dvdcam', 'dvdts',
                                       'cam', 'telesync', 'ts'] for i in fmt): quality = 'CAM'
                        info = []
                        if '3d' in fmt: info.append('3D')
                        try:
                            # Parse the release size out of the full release name.
                            size = re.findall('((?:\d+\.\d+|\d+\,\d+|\d+) (?:GB|GiB|MB|MiB))', name)[-1]
                            div = 1 if size.endswith(('GB', 'GiB')) else 1024
                            size = float(re.sub('[^0-9.]', '', size)) / div
                            size = '%.2f GB' % size
                            info.append(size)
                        except:
                            pass
                        if any(i in ['hevc', 'h265', 'x265'] for i in fmt): info.append('HEVC')
                        info = ' | '.join(info)
                        if any(x in url for x in ['.rar', '.zip', '.iso']): continue
                        url = client.replaceHTMLCodes(url)
                        url = url.encode('utf-8')
                        host = re.findall('([\w]+[.][\w]+)$', urlparse.urlparse(url.strip().lower()).netloc)[0]
                        host = client.replaceHTMLCodes(host)
                        host = host.encode('utf-8')
                        if host in hostDict:
                            sources.append({'source': host, 'quality': quality, 'language': 'en',
                                            'url': url, 'info': info, 'direct': False,
                                            'debridonly': False})
                        elif host in hostprDict:
                            sources.append({'source': host, 'quality': quality, 'language': 'en',
                                            'url': url, 'info': info, 'direct': False,
                                            'debridonly': True})
                    except:
                        pass
            except:
                pass
        # Prefer non-CAM sources when any exist.
        check = [i for i in sources if not i['quality'] == 'CAM']
        if check: sources = check
        return sources
    except:
        return sources
def sources(self, url, hostDict, hostprDict):
    self._sources = []
    try:
        if url is None: return self._sources
        if debrid.status() is False: raise Exception()
        data = urlparse.parse_qs(url)
        data = dict([(i, data[i][0]) if data[i] else (i, '') for i in data])
        title = data['tvshowtitle'] if 'tvshowtitle' in data else data['title']
        self.show = 'tvshowtitle' in data
        self.hdlr = 'S%02dE%02d' % (int(data['season']), int(data['episode'])) \
            if 'tvshowtitle' in data else data['year']
        query = '%s' % data['tvshowtitle'] if 'tvshowtitle' in data \
            else '%s %s' % (data['title'], data['year'])
        query = re.sub('(\\\|/| -|:|;|\*|\?|"|\'|<|>|\|)', ' ', query)
        url = self.search_link % urllib.quote_plus(query)
        ref = urlparse.urljoin(self.base_link, self.search_link1 % urllib.quote_plus(query))
        url = urlparse.urljoin(self.base_link, url)
        # ddlvalley sits behind Cloudflare, so use cfscrape with browser-like headers.
        self.scraper = cfscrape.create_scraper()
        self.scraper.headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'
        self.scraper.headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
        self.scraper.headers['Referer'] = 'http://www.ddlvalley.me/'
        self.scraper.headers['Host'] = 'www.ddlvalley.me'
        self.scraper.headers['Upgrade-Insecure-Requests'] = '1'
        self.scraper.get(self.base_link)  # prime the Cloudflare session cookies
        self.scraper.headers['Cookie'] = ''
        for key, value in self.scraper.cookies.iteritems():
            self.scraper.headers['Cookie'] += '%s=%s;' % (key, value)
        dts = datetime.datetime.utcnow() + datetime.timedelta(days=1)
        self.scraper.headers['Cookie'] += 'noprpkedvhozafiwrcnt=1; noprpkedvhozafiwrexp=%s' \
            % dts.strftime('%a, %d %b %Y %H:%M:%S GMT')
        self.scraper.headers['Referer'] = ref
        r = self.scraper.get(url).content
        u = r
        next_page = True
        while next_page:
            try:
                np = re.findall('<link rel="next" href="([^"]+)', u)[0]
                u = self.scraper.get(np).content
                r += u
            except:
                next_page = False
        items = dom_parser2.parse_dom(r, 'h2')
        items = [dom_parser2.parse_dom(i.content, 'a', req=['href', 'rel', 'title', 'data-wpel-link'])
                 for i in items]
        items = [(i[0].content, i[0].attrs['href']) for i in items]
        items = [(i[0], i[1]) for i in items
                 if cleantitle.get_simple(i[0].split(self.hdlr)[0].lower()) == cleantitle.get_simple(title.lower())]
        threads = []
        for i in items:
            threads.append(workers.Thread(self._get_sources, i[0], i[1], hostDict, hostprDict))
        for i in threads:
            i.start()
            time.sleep(0.5)  # stagger requests to avoid tripping rate limits
        alive = [x for x in threads if x.is_alive()]
        while alive:
            alive = [x for x in threads if x.is_alive()]
            time.sleep(0.5)
        return self._sources
    except:
        return self._sources
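
# For reference, every scraper in this section appends source dicts of the same
# shape. The field meanings below are inferred from how the dicts are built
# here, not from external documentation:
#   {'source':     host name or provider tag (e.g. 'popcorn'),
#    'quality':    '1080p' | '720p' | 'SD' | 'SCR' | 'CAM',
#    'language':   'en',
#    'url':        resolved link, utf-8 encoded,
#    'info':       ' | '-joined extras such as size or 'HEVC',
#    'direct':     True for direct file links, False for hoster pages,
#    'debridonly': True when the host is only usable through a debrid service}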