def getChannels(page, cookie='', referer=''):
    """Resolve a Fildonet section id into a list of channel elements."""
    elements = []
    key = str(page)
    if key == '0':
        elements = Fildonet.getMainSections()
    elif key == '100artist':
        page = Fildonet.MAIN_URL
        html = Downloader.getContentFromUrl(page, "", cookie, "")
        elements = Fildonet.extractElementsArtist(html)
    elif key == 'topalbums':
        page = Fildonet.MAIN_URL
        html = Downloader.getContentFromUrl(page, "", cookie, "")
        elements = Fildonet.extractElementsAlbum(html)
    elif key == 'lastestplaylists':
        pass  # not implemented; returns the empty list
    elif key.find('search') != -1:
        # ask the user for a search term via the on-screen keyboard
        keyboard = xbmc.Keyboard("")
        keyboard.doModal()
        text = ""
        if keyboard.isConfirmed():
            text = keyboard.getText()
        elements = Fildonet.search(text)
    else:
        # anything else is a base64-encoded target url
        page = base64.standard_b64decode(page)
        logger.info("ELSE --- page is: " + page)
        html = Downloader.getContentFromUrl(page, "", cookie, "")
        if page.find("albumId=") != -1:
            elements = Fildonet.buildFromJSON(json.loads(html))
        else:
            elements = Fildonet.extractElementsPlayer(html)
    return elements
def decodeStreamliveto(html, page=''):
    """Build an rtmp link for a streamlive.to embed found in *html*.

    Solves the simple math captcha when present; if the expected player swf
    is found, assembles the rtmp url (streamer/playpath/token), otherwise
    returns a hard-coded placeholder link.
    """
    iframeUrl = "http://www.streamlive.to/view/" + Decoder.extract('http://www.streamlive.to/embed/', '&width=', html)
    html2 = Downloader.getContentFromUrl(iframeUrl, urllib.urlencode({"captcha": "yes"}), "", iframeUrl)
    if html2.find("Question:") > -1:  # captcha page
        captcha = Decoder.rExtract(': ', '<br /><br />', html2)
        if captcha.find("(") > -1:
            logger.debug("resolving captcha with math..." + captcha)
            try:
                captcha = Decoder.resolveSimpleMath(captcha)
            except Exception:  # narrowed from bare except; keep best-effort behaviour
                logger.error("Could not resolve captcha: " + captcha)
        logger.debug("captcha=" + captcha)
        captchaPost = urllib.urlencode({'captcha': captcha})
        logger.debug(captchaPost)
        time.sleep(3)
        html2 = Downloader.getContentFromUrl(iframeUrl, captchaPost, Downloader.cookie, iframeUrl)
    link = "http://harddevelop.com/2015/11/tv-box.html|Referer=http://gordosyfrikis.com/"  # fallback ;)
    if html2.find("http://www.streamlive.to/ads/ilive_player.swf") > -1:
        # builds the link
        swfUrl = "http://www.streamlive.to/ads/streamlive.swf"
        tokenUrl = Decoder.extractWithRegex("http://www.streamlive.to/server.php?id=", '"', html2)
        tokenUrl = tokenUrl[:-1]  # simplified from tokenUrl[:(len(tokenUrl)-1)]; drops the trailing quote
        token = Downloader.getContentFromUrl(tokenUrl, "", Downloader.cookie, page)
        token = Decoder.extract('{"token":"', '"}', token)
        file = Decoder.extract('file: "', '",', html2).replace('.flv', '')
        streamer = Decoder.extract('streamer: "', '",', html2).replace("\\", "")
        link = streamer + "./" + file + " playpath=" + file + " live=1 token=" + token + " swfUrl=" + swfUrl + " pageUrl=http://www.streamlive.to/view" + (iframeUrl[iframeUrl.rfind("/"):])
    logger.debug("built a link to be used: " + link)
    return link
def getChannels(page, cookie='', referer=''):
    """Return channel elements for the given Fildonet page id."""
    pageId = str(page)
    if pageId == '0':
        return Fildonet.getMainSections()
    if pageId in ('100artist', 'topalbums'):
        # both top lists scrape the main page
        html = Downloader.getContentFromUrl(Fildonet.MAIN_URL, "", cookie, "")
        if pageId == '100artist':
            return Fildonet.extractElementsArtist(html)
        return Fildonet.extractElementsAlbum(html)
    if pageId == 'lastestplaylists':
        return []  # not implemented
    if 'search' in pageId:
        keyboard = xbmc.Keyboard("")
        keyboard.doModal()
        text = keyboard.getText() if keyboard.isConfirmed() else ""
        return Fildonet.search(text)
    # otherwise the page id is a base64-encoded target url
    decoded = base64.standard_b64decode(page)
    logger.info("ELSE --- page is: " + decoded)
    html = Downloader.getContentFromUrl(decoded, "", cookie, "")
    if "albumId=" in decoded:
        return Fildonet.buildFromJSON(json.loads(html))
    return Fildonet.extractElementsPlayer(html)
def decodeOpenload(link):
    """Resolve an openload.io /f/ link through the dl-ticket API."""
    # get cookies
    fileId = Decoder.extract("/f/", "/", link)
    embedUrl = 'https://openload.io/embed/' + fileId
    html = Downloader.getContentFromUrl(embedUrl, "", "", "", False, False)
    logger.info("html is: " + html)
    logger.debug("using cookie 1: " + Downloader.cookie)
    logger.debug("Media id for openload is: " + fileId)
    extra = "&login=f750b26513f64034&key=oaA-MbZo"  # this avoid captcha petition
    ticketUrl = "https://api.openload.io/1/file/dlticket?file=" + fileId + extra
    data = Downloader.getContentFromUrl(ticketUrl, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData: " + data)
    ticketJson = json.loads(data)
    # honour the API-imposed wait before requesting the real link
    logger.info("sleeping... " + str(ticketJson['result']['wait_time']))
    time.sleep(int(ticketJson['result']['wait_time']))
    dlUrl = 'https://api.openload.io/1/file/dl?file=%s&ticket=%s' % (fileId, ticketJson['result']['ticket'])
    logger.debug("using cookie 2: " + Downloader.cookie)
    result = Downloader.getContentFromUrl(dlUrl, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData 2: " + result)
    dlJson = json.loads(result)
    file = dlJson['result']['url'] + '?mime=true'
    logger.info("Built final link: " + file)
    return file
def extractSawlive(scriptSrc, cookie, iframeUrl):
    """Decode a sawlive script url into a final rtmp (or direct http) link."""
    encryptedHtml = Downloader.getContentFromUrl(scriptSrc, "", cookie, iframeUrl)
    decryptedUrl = Decoder.decodeSawliveUrl(encryptedHtml)
    html3 = Downloader.getContentFromUrl(decryptedUrl, "", cookie, scriptSrc)
    # ok, now extract flash script content
    flashContent = Decoder.extract("var so = new SWFObject('", "</script>", html3)
    file = Decoder.extract("'file', '", "');", flashContent)
    rtmpUrl = ""
    # bugfix: was `find(...) > 0.1`, which skipped a match at index 0
    if flashContent.find("'streamer', '") != -1:
        rtmpUrl = Decoder.extract("'streamer', '", "');", flashContent)
    # the swf player url is the first quoted string of the SWFObject call
    # (the previous hard-coded static3.sawlive.tv default was a dead store)
    swfUrl = flashContent[:flashContent.find("'")]
    logger.info("updated swf player to: " + swfUrl)
    if rtmpUrl == "" and file.find("http://") > -1:
        finalRtmpUrl = file  # it's a redirect with an .m3u8, so it's used
    else:
        finalRtmpUrl = (rtmpUrl + " playpath=" + file + " swfUrl=" + swfUrl
                        + " live=1 conn=S:OK pageUrl=" + decryptedUrl + " timeout=12")
    return finalRtmpUrl
def decodeOpenloadUsingOfficialApi(link):
    """Resolve an openload.io file link via the official dl-ticket API.

    NOTE: the API frequently answers 509 (bandwidth exceeded) for all logins.
    """
    # get cookies
    mediaId = Decoder.extract("/f/", "/", link)
    embedUrl = 'https://openload.io/embed/' + mediaId
    html = Downloader.getContentFromUrl(embedUrl, "", "", "", False, False)
    logger.info("html is: " + html)
    logger.debug("using cookie 1: " + Downloader.cookie)
    logger.debug("Media id for openload is: " + mediaId)
    key = "oaA-MbZo"
    login = "******"  # redacted credential; supply a valid API login here
    # bugfix: the redacted literal had been spliced straight into the
    # expression ("&login="******"...), which is invalid syntax; build it explicitly
    extra = "&login=" + login + "&key=" + key  # this avoid captcha petition
    link2 = "https://api.openload.io/1/file/dlticket?file=" + mediaId + extra
    data = Downloader.getContentFromUrl(link2, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData: " + data)
    js_result = json.loads(data)
    logger.info("sleeping... " + str(js_result['result']['wait_time']))
    time.sleep(int(js_result['result']['wait_time']))
    link3 = 'https://api.openload.io/1/file/dl?file=%s&ticket=%s' % (mediaId, js_result['result']['ticket'])
    logger.debug("using cookie 2: " + Downloader.cookie)
    result = Downloader.getContentFromUrl(link3, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData 2: " + result)
    js_result2 = json.loads(result)
    file = js_result2['result']['url'] + '?mime=true'
    logger.info("Built final link: " + file)
    return file
class Engine():
    """Wires the core modules together and drives their lifecycle."""

    def __init__(self):
        # core modules share the same logger
        self.logger = get_logger('Core', True)
        self.Scheduler = Scheduler(self.logger)
        self.Downloader = Downloader(self.logger)
        self.Uploader = Uploader(self.logger)
        self.Monitor = Monitor(self.logger)

    def _do_register(self):
        """Register this deployment; return True on success."""
        user = GlobalConfig.Deploy_dict['user']
        password = GlobalConfig.Deploy_dict['password']
        self.logger.info('registering START: %s' % user)
        RegisterSuccess = do_register(user, password, self.logger)
        self.logger.info('registering END: %s' % str(RegisterSuccess))
        return RegisterSuccess

    def start(self):
        """Start all worker threads once registration succeeds."""
        if not self._do_register():
            self.logger.info('---engine START failed---')
            return
        self.logger.info('---engine START---')
        self.Scheduler.start_threads()
        # Downloader uses spiders which use Status, so Monitor must run first
        self.Monitor.start_threads()
        self.Downloader.start_threads()
        self.Uploader.start_threads()

    def stop(self):
        pass
def getFinalLink(link):
    """Try to resolve an .m3u8 link to its real target and append headers."""
    if ".m3u8" not in link:
        logger.debug("nothing done! " + link)
        return link
    logger.debug("old link: " + link)
    oldLink = link
    m3u8Text = ZonaAppCom.getContentFromUrl(link)
    logger.debug("m3u8 content is: " + m3u8Text)
    if "http" in m3u8Text:
        m3u8Text = m3u8Text[m3u8Text.find("http"):]
        if "\n" not in m3u8Text:
            # single-line playlist: it is really a redirect url
            link = m3u8Text
            logger.debug("1) updated link to: " + link)
            if ".php" in link:
                # trying second time: the redirect points at another script
                m3u8Text = ZonaAppCom.getContentFromUrl(link)
                if "http" in m3u8Text:
                    oldLink = link
                    link = m3u8Text[m3u8Text.find("http"):]
                    logger.debug("2) updated link to: " + link)
                    link += "|" + Downloader.getHeaders(oldLink)
            else:
                link += "|" + Downloader.getHeaders(oldLink)
        else:
            logger.debug("0) Complex link, not changed!" + link)
    return link
def __init__(self):
    """Create the shared logger and instantiate every core module with it."""
    # core modules share the same logger
    self.logger = get_logger('Core', True)
    self.Scheduler = Scheduler(self.logger)
    self.Downloader = Downloader(self.logger)
    self.Uploader = Uploader(self.logger)
    self.Monitor = Monitor(self.logger)
def test_timeout_is_passed(mock_requests):
    """fetch_url must forward timeout_secs to the underlying request."""
    dl = Downloader()
    mock_requests.return_value = create_ok_return_value()
    dl.fetch_url(FAKE_COOKIE, FAKE_URL, timeout_secs=3600)
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=3600)
def decodeVidggTo(link):
    """Resolve a vidgg.to player page to its direct media url."""
    referer = "http://www.vidgg.to/player/cloudplayer.swf"
    html = Downloader.getContentFromUrl(link)
    mediaFile = Decoder.extract('flashvars.file="', '";', html)
    mediaKey = Decoder.extract('flashvars.filekey="', '";', html)
    apiUrl = ("http://www.vidgg.to/api/player.api.php?pass=undefined&key=" + mediaKey
              + "&user=undefined&numOfErrors=0&cid3=undefined&cid=1&file=" + mediaFile
              + "&cid2=undefined")
    bruteResponse = Downloader.getContentFromUrl(apiUrl)
    finalLink = Decoder.extract("url=", "&title", bruteResponse)
    logger.debug("Final link is: " + finalLink)
    return finalLink
def test_redirect_is_enabled(mock_requests):
    """fetch_url must request with redirects enabled by default."""
    dl = Downloader()
    mock_requests.return_value = create_ok_return_value()
    dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          timeout=ANY, allow_redirects=True)
def decodeVidggTo(link):
    """Fetch a vidgg.to page and query its player API for the final url."""
    referer = "http://www.vidgg.to/player/cloudplayer.swf"
    html = Downloader.getContentFromUrl(link)
    file = Decoder.extract('flashvars.file="', '";', html)
    key = Decoder.extract('flashvars.filekey="', '";', html)
    # player API echoes the real url in its response
    url2 = ("http://www.vidgg.to/api/player.api.php?pass=undefined&key=%s"
            "&user=undefined&numOfErrors=0&cid3=undefined&cid=1&file=%s"
            "&cid2=undefined") % (key, file)
    bruteResponse = Downloader.getContentFromUrl(url2)
    finalLink = Decoder.extract("url=", "&title", bruteResponse)
    logger.debug("Final link is: " + finalLink)
    return finalLink
def test_response_is_returned(mock_requests):
    """fetch_url must return the response object produced by requests."""
    dl = Downloader()
    mock_requests.return_value = create_ok_return_value()
    response = dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    assert response == create_ok_return_value()
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
def download_from_url(url, item):
    """Download *url* into the configured folder and report final status."""
    logger.info("Intentando descargar: %s" % (url))
    # streams cannot be downloaded as plain files
    if url.lower().endswith(".m3u8") or url.lower().startswith("rtmp"):
        save_server_statistics(item.server, 0, False)
        return {"downloadStatus": STATUS_CODES.error}
    # build the download path and file name
    item.downloadFilename = item.downloadFilename.replace('/', '-')
    download_path = filetools.dirname(filetools.join(DOWNLOAD_PATH, item.downloadFilename))
    file_name = filetools.basename(filetools.join(DOWNLOAD_PATH, item.downloadFilename))
    # create the folder if it does not exist yet
    if not filetools.exists(download_path):
        filetools.mkdir(download_path)
    # launch the download
    d = Downloader(url, download_path, file_name,
                   max_connections=1 + int(config.get_setting("max_connections", "downloads")),
                   block_size=2 ** (17 + int(config.get_setting("block_size", "downloads"))),
                   part_size=2 ** (20 + int(config.get_setting("part_size", "downloads"))),
                   max_buffer=2 * int(config.get_setting("max_buffer", "downloads")))
    d.start_dialog(config.get_localized_string(60332))
    # download finished/stopped: map the downloader state to a status code
    if d.state == d.states.error:
        logger.info("Error al intentar descargar %s" % (url))
        status = STATUS_CODES.error
    elif d.state == d.states.stopped:
        logger.info("Descarga detenida")
        status = STATUS_CODES.canceled
    elif d.state == d.states.completed:
        logger.info("Descargado correctamente")
        status = STATUS_CODES.completed
        # size mismatch against the expected size is treated as an error
        if item.downloadSize and item.downloadSize != d.size[0]:
            status = STATUS_CODES.error
    # NOTE(review): status is unbound if d.state matches none of the three states
    save_server_statistics(item.server, d.speed[0], d.state != d.states.error)
    dir = os.path.dirname(item.downloadFilename)
    file = filetools.join(dir, d.filename)
    if status == STATUS_CODES.completed:
        move_to_libray(item.clone(downloadFilename=file))
    return {"downloadUrl": d.download_url, "downloadStatus": status,
            "downloadSize": d.size[0], "downloadProgress": d.progress,
            "downloadCompleted": d.downloaded[0], "downloadFilename": file}
def __init__(self, start_monitor=True):
    """Initialise counters, the event loop, the bloom filter and workers."""
    self.init()
    self.number_dict = {core.constant.TOTAL_TASK: 0,
                        core.constant.TOTAL_REQUEST: 0,
                        core.constant.TOTAL_RESPONSE: 0}
    self.color = core.constant.COLOR
    self.close = False
    self.loop = asyncio.get_event_loop()
    # project-wide dedup filter
    self.filter = core.bloomFilter.bloomFilterContext.get_filter(settings.PROJECT_NAME)
    self.scheduler = Scheduler(self)
    self.downloader = Downloader(self, settings.DOWNLOADER_WORKER)
    self.save = Save(self, settings.SAVE_WORKER)
    self.monitor = Monitor(self)
    self.start_monitor = start_monitor
def download_url(url, item, path=None, filename=None, resume=False):
    """Download *url* for *item*; return a dict with size/progress/status.

    Status codes: 1 completed, 2 stopped, 3 error/unsupported.
    """
    logger.trace()
    if url.lower().endswith(".m3u8") or url.lower().startswith(
            "rtmp") or item.server == 'torrent':
        logger.debug('Servidor o tipo de medio no soportado')
        return {"status": 3}
    download_path = settings.get_setting('download_path', __file__)
    if path:
        path = filetools.join(download_path, path)
    else:
        path = download_path
    if not filetools.isdir(path):
        filetools.makedirs(path)
    d = Downloader(
        url=url,
        path=filetools.validate_path(path),
        filename=filetools.validate_path(filename),
        resume=resume,
        max_connections=1 + settings.get_setting("max_connections", __file__),
        block_size=2**(17 + settings.get_setting("block_size", __file__)),
        part_size=2**(20 + settings.get_setting("part_size", __file__)),
        max_buffer=2 * settings.get_setting("max_buffer", __file__))
    # bugfix: `"..." % item.servername or item.server` bound `%` first, so the
    # `or` fallback could never fire; parenthesize so an empty servername
    # falls back to item.server
    d.start_dialog("Descargas [%s]" % (item.servername or item.server))
    result = {
        'download_size': d.size[0],
        'download_progress': d.progress,
        'download_filename': d.filename,
        'download_path': path
    }
    if d.state == d.states.error:
        logger.debug("Error al intentar descargar %s" % url)
        result['download_status'] = 3
    elif d.state == d.states.stopped:
        logger.debug("Descarga detenida")
        result['download_status'] = 2
    elif d.state == d.states.completed:
        logger.debug("Descargado correctamente")
        result['download_status'] = 1
    return result
def test_status_code_different_from_200_causes_exception(mock_requests):
    """A non-200 status from the server must raise RuntimeError."""
    dl = Downloader()
    mock_requests.return_value = create_not_found_return_value()
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    except RuntimeError:
        raised = True
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
    assert raised
def test_empty_returned_text_causes_exception(mock_requests):
    """An OK response with no body must raise RuntimeError."""
    dl = Downloader()
    mock_requests.return_value = create_ok_return_value_without_text()
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    except RuntimeError:
        raised = True
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
    assert raised
def openSpliveLink(url, page, provider):
    """Decode a splive/cineestrenos encrypted channel url and launch it."""
    if url.find(".m3u8") == -1 and url.find("rtmp://") == -1:
        channel = Spliveappcom.decodeUrl(url, provider)
        link = channel[0]["link"]
        if link.find(", referer:") > -1:
            link = link[0:link.find(", referer:")]
        url = link
    else:
        logger.debug("nothing decoded for splive encrypted channels, continue...")
    logger.debug("splive BRUTE logic for url: " + url)
    try:
        if "ponlatv.com" in url or "playerhd1.pw" in url:
            logger.debug("trying to decode cineestrenos script from url: " + url)
            url = Cineestrenostv.extractScriptLevel3(url, referer=Cineestrenostv.MAIN_URL)
            logger.debug("decoded link was: " + url)
        else:
            url = Cineestrenostv.getChannels(url)[0]["link"]
            html = Downloader.getContentFromUrl(url)
            element = Cineestrenostv.extractIframeChannel(html, url)
            # bugfix: dict.has_key() is Python-2 only; `in` works on both 2 and 3
            if element is not None and "link" in element:
                url = element["link"]
                logger.debug("cineestrenos url was decoded to: " + url)
            else:
                logger.debug("nothing was done to decode cineestrenostv url!")
    except Exception:  # narrowed from bare except; decoding is best-effort
        logger.debug("nothing to be decoded with url: " + url)
    link = url
    logger.info("found link: " + link + ", launching...")
    open(link, page)
def drawBbcCoUkNew(url):
    """Extract the body of a BBC article page and render it via drawNew."""
    htmlContent = Downloader.getContentFromUrl(url=url)
    title = Decoder.extract('<p class="story-body__introduction">', "</p><div", htmlContent)
    if 'property="articleBody"' in htmlContent:
        body = Decoder.extract('property="articleBody"', " </div>", htmlContent)
        # strip copyright/caption boilerplate spans
        for junk in ('<span class="off-screen">Image copyright</span>',
                     '<span class="story-image-copyright">AFP</span>',
                     '<span class="story-image-copyright">Reuters</span>',
                     '<span class="off-screen">Image caption</span>',
                     '<span class="off-screen">Media caption</span>'):
            body = body.replace(junk, "")
        while '<span class="media-caption__text">' in body:
            line = Decoder.extractWithRegex('<span class="media-caption__text">', "</span>", body)
            body = body.replace(line, "")
    elif 'class="text-wrapper"' in htmlContent:
        # special content
        body = Decoder.extract('class="text-wrapper"', "</p>\n", htmlContent)
        dates = Decoder.extractWithRegex('<div class="date', "</div>", body)
        lastUpdate = Decoder.extractWithRegex('<p class="date ', "</p>", body)
        body = body.replace(dates, "").replace(lastUpdate, "")
    elif '<figcaption class="sp-media-asset' in htmlContent:
        body = Decoder.extract('<figcaption class="sp-media-asset', "</p><div ", htmlContent)
        if ">" in body:
            body = body[body.find(">") + 1:]
    # NOTE(review): body is unbound if no branch matched, and `title` is unused
    body = Decoder.removeHTML(body).replace(".", ".\n").replace(">", "")
    logger.debug("body is: " + body)
    drawNew(textContent=(body))
def start(self):
    """Bring the crawler up: load seeds, start all worker threads, notify mysql."""
    try:
        self.wait_for_start()
        self._istart = True
        # load seeds from google search
        self.load_seeds()
        # show welcome info
        self.show_welcome()
        self._status._sys_start = time()
        # start threads
        self._downloader = Downloader(self._config._down_num, self._status)
        self._downloader.start()
        self._parser = Parser(self._config._parser_num, self._status)
        self._parser.start()
        self._downloader_pool_checker.start()
        self._parse_pool_checker.start()
        self._status_update.start()
        # notify mysql that we have started
        self.sqlex.write_if_start()
    except Exception as e:
        Log().debug("start failed")
        raise e
    # NOTE(review): returns False even on success — confirm callers ignore this
    return False
def __init__(self):
    """Assemble spiders, middlewares, scheduler, downloader and the thread pool."""
    self.spiders = self._auto_import_cls(SPIDERS, True)
    self.pool = Pool()
    self.pipelines = self._auto_import_cls(PIPELINES)
    self.spider_mids = self._auto_import_cls(SPIDER_MIDDLEWARES)
    self.downloader_mids = self._auto_import_cls(DOWNLOADER_MIDDLEWARES)
    self.scheduler = Scheduler(ROLE, QUEUE_TYPE)
    self.downloader = Downloader()
    # NOTE(review): spider_mids was imported twice in the original; the second
    # call is kept for behavioural parity (side effects of _auto_import_cls)
    self.spider_mids = self._auto_import_cls(SPIDER_MIDDLEWARES)
    self.is_running = True
    self.total_response = 0
    self.executor = BaseThreadPoolExecutor(max_workers=ASYNC_COUNT)
def test_retries_when_service_unavailable_then_ok(mock_requests):
    """A 503 followed by a 200 must be retried and return the OK response."""
    dl = Downloader()
    mock_requests.side_effect = [
        create_service_unavailable_return_value(),
        create_ok_return_value()
    ]
    response = dl.fetch_url(FAKE_COOKIE, FAKE_URL, retries=3)
    assert response == create_ok_return_value()
    expected = call(url=ANY, headers=ANY, allow_redirects=ANY, timeout=ANY)
    mock_requests.assert_has_calls([expected, expected])
def decodeIguide(iframeUrl3, iframeUrl2=''):
    """Build an rtmp link from an iguide.to iframe page."""
    logger.debug("iguide url is: " + iframeUrl3)
    html4 = Downloader.getContentFromUrl(iframeUrl3, "autoplay=true", Downloader.cookie, iframeUrl2)
    logger.debug("part 2 of iguide")
    # same server family as streamlive.to: token + streamer/playpath link
    swfUrl = Decoder.rExtractWithRegex("http://", ".swf", html4)
    logger.debug("using swfUrl: " + swfUrl)
    tokenUrl = Decoder.extractWithRegex("http://www.iguide.to/serverfile.php?id=", '"', html4)
    tokenUrl = tokenUrl[:len(tokenUrl) - 1]  # drop the trailing quote
    token = Downloader.getContentFromUrl(tokenUrl, "", Downloader.cookie)
    token = Decoder.extract('{"token":"', '"}', token)
    file = Decoder.extract("'file': '", "',", html4).replace('.flv', '')
    streamer = Decoder.extract("'streamer': '", "',", html4).replace("\\", "")
    link = (streamer + " playpath=" + file + " live=1 token=" + token
            + " swfUrl=" + swfUrl + " pageUrl=" + iframeUrl3)
    logger.debug("built a link to be used: " + link)
    return link
def test_exceptions_when_internal_server_error(mock_requests):
    """A 500 response must raise RuntimeError."""
    dl = Downloader()
    mock_requests.return_value = create_internal_server_error_return_value()
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    except RuntimeError:
        raised = True
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
    assert raised
def test_timeout_is_propagated_when_retries_are_disabled(mock_requests):
    """Without retries, a Timeout from requests must bubble up unchanged."""
    dl = Downloader()
    mock_requests.side_effect = requests.exceptions.Timeout()
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    except requests.exceptions.Timeout:
        raised = True
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
    assert raised
def test_exceptions_from_get_are_propagated(mock_requests):
    """Unexpected exceptions raised by requests must bubble up unchanged."""
    dl = Downloader()
    mock_requests.side_effect = RuntimeError('Boom')
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    except RuntimeError:
        raised = True
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
    assert raised
def extractDinostreamPart(url, referer=''):
    """Extract a playable element dict from a dinostream wrapper page."""
    element = {}
    logger.debug("url: " + url + ", referer: " + referer)
    html4 = Downloader.getContentFromUrl(url, "", "", referer)
    finalIframeUrl = Decoder.extractWithRegex('http://', '%3D"', html4)
    finalIframeUrl = finalIframeUrl[0:len(finalIframeUrl) - 1]  # drop trailing quote
    logger.debug("proccessing level 4, cookie: " + Downloader.cookie)
    finalHtml = Downloader.getContentFromUrl(finalIframeUrl, "", Downloader.cookie, referer)
    logger.debug("proccessing level 5, cookie: " + Downloader.cookie)
    playerUrl = Decoder.decodeBussinessApp(finalHtml, finalIframeUrl)
    element["title"] = "Watch streaming"
    element["permalink"] = True
    element["link"] = playerUrl
    return element
def parse_relayer(params):
    """Build a playable url (with headers) from Mobdro relayer params.

    Returns "NonE" when params match nothing, "exception" on missing keys.
    """
    import hashlib  # local import: replaces the deprecated md5 module
    url = "NonE"
    try:
        if "url" in params:  # bugfix: dict.has_key() is Python-2 only
            url = params["url"]
            logger.debug("mobdro.directURL: " + url)
        elif "relayer" in params:
            params2 = json.loads(params["relayer"])
            logger.debug("RELAYED: " + repr(params2))
            protocol = "http"  # params2["protocol"]
            app = params2["app"]
            server = params2["server"]
            playpath = params2["playpath"]
            password = params2["password"]
            dire = params2["dir"]
            expiration_time = params2["expiration_time"]
            millis = int(round(time.time() * 1000))
            # bugfix: floor division replaces the py2-only `/ 1000L` long literal
            l = millis // 1000 + expiration_time
            arr = [password, l, dire, playpath]
            url = "%s%d/%s/%s"
            url = url % tuple(arr)
            # sign the url: md5 digest, urlsafe-base64 without padding
            url_md5 = hashlib.md5(url).digest()  # NOTE: py3 would need url.encode()
            url_base64 = base64.b64encode(url_md5)
            url_base64 = url_base64.replace("+", "-").replace("/", "_").replace("=", "")
            arr = [protocol, server, app, playpath, url_base64, l]
            url = "%s://%s/%s/%s?st=%s&e=%d"
            url = url % tuple(arr)
            url += "|" + Downloader.getHeaders(Mobdro.MAIN_URL)
        else:
            logger.debug("REJECTED: " + repr(params))
    except KeyError:
        url = "exception"
    return url
def getChannels(page, cookie='', referer=''):
    """Resolve a Redmp3cc section id into channel elements, or download a track.

    Non-section pages (the final else) are downloaded to the temp folder and
    returned as a single already-downloaded element.
    """
    x = []
    html = ""
    if str(page) == '0':
        x = Redmp3cc.getMainSections()
    elif str(page) == 'songs.html':
        page = Redmp3cc.MAIN_URL + "/"
        html = Downloader.getContentFromUrl(page, "", cookie, "")
        x = Redmp3cc.extractElementsPlayer(html)
    elif str(page).find('search.html') != -1:
        if str(page).find('search.html/') == -1:
            # fresh search: ask the user for the term
            keyboard = xbmc.Keyboard("")
            keyboard.doModal()
            text = ""
            if (keyboard.isConfirmed()):
                text = keyboard.getText()
            x = Redmp3cc.search(text)
        else:
            # paginated search: term and page number are embedded in the id
            text = Decoder.rExtract('search.html/', '/', page)
            page = int(page[page.rfind('/') + 1:])
            x = Redmp3cc.search(text, page)
    elif str(page).find(".html") != -1:
        # bugfix: condition was `str(page) == 'albums.html'!=-1`, a confusing
        # chained comparison whose `!= -1` tail is always True; the plain
        # equality test below is behaviourally identical and readable
        if str(page) == 'albums.html':
            page = Redmp3cc.MAIN_URL
            html = Downloader.getContentFromUrl(page, "", cookie, "")
            x = Redmp3cc.extractElementsAlbum(html)
        else:
            html = Downloader.getContentFromUrl(page, "", cookie, "")
            x = Redmp3cc.extractElementsPlayer(html)
    else:
        logger.info("page is: " + page)
        response = Redmp3cc.getContentFromUrl(page, "", cookie, Redmp3cc.MAIN_URL, True)
        # build download headers from the redirect target's host
        host = response[response.find("://") + len("://"):]
        if host.find("/") > -1:
            host = host[0:host.find("/")]
        cookie = Redmp3cc.cookie
        referer = page
        logger.info("cookie is: " + cookie + ", referer is: " + referer)
        headers = downloadtools.buildMusicDownloadHeaders(host, cookie, referer)
        filename = Decoder.extract('filename=', '&', response)
        ROOT_DIR = xbmc.translatePath('special://temp/')
        logger.info("using special root folder: " + ROOT_DIR)
        downloadtools.downloadfile(response, ROOT_DIR + "/" + filename, headers, False, True)
        x.append(Redmp3cc.buildDownloadedFile(xbmc.makeLegalFilename(ROOT_DIR + "/" + filename)))
    return x
def decodeVidag(link):
    """Extract the direct .mp4 url from a vidag packed-js player page."""
    html = Downloader.getContentFromUrl(link, "", "", "", False, True)
    # bugfix: the bare `except: pass` left encodedMp4File unbound when the
    # extraction failed, turning any failure into a NameError below
    encodedMp4File = ""
    try:
        encodedMp4File = Decoder.extract(
            "<script type='text/javascript'>eval(function(p,a,c,k,e,d)",
            "</script>", html)
    except Exception:
        pass
    mp4File = jsunpack.unpack(encodedMp4File)  # needs un-p,a,c,k,e,t|d
    mp4File = Decoder.extract(',{file:"', '",', mp4File)
    return mp4File
def getListsUrls(url, icon=XBMCUtils.getAddonFilePath('icon.png'), provider='', finalTarget=1):
    """Fetch *url* and dispatch to the right drawer based on the list format."""
    html = Downloader.getContentFromUrl(url)
    if url.endswith(".xml") or '<items>' in html or '<item>' in html:
        # main channels: a browsable xml list
        drawXml(html, icon=icon, finalTarget=finalTarget, provider=provider)
    elif url.endswith(".xspf"):
        drawXspf(html, icon)
    else:
        # final channel list: split it brute-force
        drawBruteChannels(html, icon)
def test_cookie_is_passed_in_headers(mock_requests):
    """fetch_url must send the cookie alongside the fixed browser headers."""
    dl = Downloader()
    mock_requests.return_value = create_ok_return_value()
    dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    wanted_headers = {
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      + '(KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
        'cookie': FAKE_COOKIE
    }
    mock_requests.assert_called_once_with(url=ANY, headers=wanted_headers,
                                          allow_redirects=ANY, timeout=ANY)
def decodeOpenload(link):
    """Decode an openload /f/ link like a browser would (AAEncoded js)."""
    mediaId = Decoder.extract("/f/", "/", link)
    logger.debug("mediaId is: " + mediaId)
    link = link.replace('/f/', '/embed/')
    # must POST: a GET here ends in an infinite redirect loop
    html = Downloader.getContentFromUrl(link, "data=data", "", "", False, True)
    # grab the obfuscated script that follows the <video> tag
    script = re.search(r"<video(?:.|\s)*?<script\s[^>]*?>((?:.|\s)*?)</script",
                       html, re.DOTALL | re.IGNORECASE).group(1)
    url = Decoder.decodeAAScript(script)
    logger.debug("decoded url is: " + url)
    return url
def test_timeout_is_propagated_after_last_retry_failed(mock_requests):
    """When every retry times out, the final Timeout must bubble up."""
    dl = Downloader()
    mock_requests.side_effect = [
        requests.exceptions.Timeout(),
        requests.exceptions.Timeout()
    ]
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL, retries=2)
    except requests.exceptions.Timeout:
        raised = True
    expected = call(url=ANY, headers=ANY, allow_redirects=ANY, timeout=ANY)
    mock_requests.assert_has_calls([expected, expected])
    assert raised
def decodeVidag(link):
    """Unpack the p.a.c.k.e.d player script of a vidag page into an mp4 url."""
    html = Downloader.getContentFromUrl(link, "", "", "", False, True)
    try:
        packed = Decoder.extract(
            "<script type='text/javascript'>eval(function(p,a,c,k,e,d)",
            "</script>", html)
    except:
        pass
    # un-p,a,c,k,e,t|d, then pull the file url out of the player config
    unpacked = jsunpack.unpack(packed)
    return Decoder.extract(',{file:"', '",', unpacked)
def extractSawlive(scriptSrc, cookie, iframeUrl):
    """Decode a sawlive script url into a final rtmp (or direct http) link.

    Handles players where 'file'/'streamer' are js variables that have to be
    resolved against the page source.
    """
    encryptedHtml = Downloader.getContentFromUrl(scriptSrc, "", cookie, iframeUrl)
    decryptedUrl = Decoder.decodeSawliveUrl(encryptedHtml)
    html3 = Downloader.getContentFromUrl(decryptedUrl, "", cookie, scriptSrc)
    logger.debug("decrypted sawlive url content obtained!")
    # ok, now extract flash script content
    flashContent = Decoder.extract("var so = new SWFObject('", "</script>", html3)
    file = Decoder.extract("'file', ", ");", flashContent)
    logger.debug("proccessing brute file: " + file)
    # 'file' can be a js concatenation (a+b+c): resolve each var from the page
    if file.find("+") > 1:
        newFile = ""
        for target in file.split("+"):
            seekedString = "var " + target + " = '"
            if html3.find(seekedString) > -1:
                newFile += Decoder.extract(seekedString, "'", html3)
            else:
                newFile += target
            logger.debug("now file is: " + newFile)
        file = newFile
        logger.debug("updated file to: " + file)
    else:
        file = file.replace("'", "")  # clean
    rtmpUrl = ""
    # bugfix: was `find(...) > .1`, which skipped a match at index 0
    if flashContent.find("'streamer', '") != -1:
        rtmpUrl = Decoder.extract("'streamer', '", "');", flashContent)
    else:
        # streamer given as a js variable; resolve it from the page source
        rtmpVar = Decoder.extract("'streamer', ", ");", flashContent)
        seekedString = "var " + rtmpVar + " = '"
        rtmpUrl = Decoder.extract(seekedString, "';", html3)
    # the swf player url is the first quoted string of the SWFObject call
    # (the previous hard-coded static3.sawlive.tv default was a dead store)
    swfUrl = flashContent[:flashContent.find("'")]
    logger.info("updated swf player to: " + swfUrl)
    if rtmpUrl == '' and file.find("http://") > -1:
        finalRtmpUrl = file  # it's a redirect with an .m3u8, so it's used
    else:
        finalRtmpUrl = (rtmpUrl + " playpath=" + file + " swfUrl=" + swfUrl
                        + " live=1 conn=S:OK pageUrl=" + decryptedUrl + " timeout=12")
    return finalRtmpUrl
def decodeStreamable(link):
    """Decode a streamable.ch flash object into its final base64-wrapped link."""
    html = Downloader.getContentFromUrl(link)
    flashContent = Decoder.extract("<object", "</object", html)
    movie = ""
    flashVars = ""
    # walk the <param> tags to find the player and its FlashVars
    for content in flashContent.split("<param"):
        value = Decoder.extract('value="', '"', content)
        name = Decoder.extract('name="', '"', content)
        if name in ("movie", "player"):
            movie = value
        elif name == "FlashVars":
            flashVars = value
    swfUrl = "http://www.streamable.ch" + movie  # NOTE(review): unused, kept for parity
    flashVars = flashVars[flashVars.find("="):]
    decodedFlashvars = base64.standard_b64decode(flashVars)
    logger.info("decoded url is: " + decodedFlashvars)
    response = Downloader.getContentFromUrl(decodedFlashvars)
    token = Decoder.extract('"token1":"', '"', response)
    finalLink = base64.standard_b64decode(token)
    logger.debug("final link is: " + finalLink)
    return finalLink
def decodeStreamable(link):
    """Follow a streamable.ch flash object down to the real media link."""
    html = Downloader.getContentFromUrl(link)
    objectHtml = Decoder.extract('<object', '</object', html)
    movie, flashVars = "", ""
    for chunk in objectHtml.split('<param'):
        paramValue = Decoder.extract('value="', '"', chunk)
        paramName = Decoder.extract('name="', '"', chunk)
        if paramName == "movie" or paramName == "player":
            movie = paramValue
        elif paramName == "FlashVars":
            flashVars = paramValue
    swfUrl = "http://www.streamable.ch" + movie  # NOTE(review): unused, kept for parity
    # FlashVars holds "<key>=<base64 url>": keep from the '=' on
    flashVars = flashVars[flashVars.find("="):]
    decodedFlashvars = base64.standard_b64decode(flashVars)
    logger.info("decoded url is: " + decodedFlashvars)
    response = Downloader.getContentFromUrl(decodedFlashvars)
    token = Decoder.extract("\"token1\":\"", "\"", response)
    finalLink = base64.standard_b64decode(token)
    logger.debug("final link is: " + finalLink)
    return finalLink
def extractTargetVideo(page):
    """Resolve a Streamgaroo channel page to its final stream url."""
    logger.debug("extracting from page: " + page)
    html = Streamgaroo.getContentFromUrl(url=page, referer=Streamgaroo.MAIN_URL)
    logger.debug("html is: " + html)
    apiKey = Decoder.extract('data-sh="', '"', html)
    bruteJSON = Streamgaroo.getContentFromUrl(Streamgaroo.CHANNEL_API, "h=" + apiKey,
                                              Streamgaroo.cookie, Streamgaroo.MAIN_URL)
    url2 = json.loads(bruteJSON)["link"]
    logger.debug("using url: " + url2)
    html2 = Streamgaroo.getContentFromUrl(url2, "", Streamgaroo.cookie, page)
    logger.debug("html2 is: " + html2)
    if 'playJS("' in html2:
        finalUrl = Decoder.extract('playJS("', '"', html2)
        logger.debug("found final url: " + finalUrl)
        # clean the streamgaroo fetch proxy prefix
        finalUrl = finalUrl.replace("http://www.streamgaroo.com/fetch/r/", "")
        if 'playlist.m3u8' in finalUrl and '==' in finalUrl:
            finalUrl = finalUrl.replace('playlist.m3u8?', 'chunks.m3u8?')
        finalUrl = finalUrl + "|" + urllib.unquote(Downloader.getHeaders())
    elif "playStream('iframe','" in html2:
        iframeUrl = finalUrl = Decoder.extract("playStream('iframe','", "'", html2)
        logger.debug("found iframe link: " + iframeUrl)
        try:
            iframeHtml = Downloader.getContentFromUrl(url=iframeUrl, data=" ", referer=page)
        except:
            # fallback: plain urllib2 request with spoofed browser headers
            logger.debug("trying second way, easy!!")
            import urllib2
            req = urllib2.Request(iframeUrl)
            req.add_header('Referer', page)
            req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0')
            iframeHtml = urllib2.urlopen(req).read()
            logger.debug("done!")
        logger.debug("html iframe is: " + iframeHtml)
        if 'adca.st/broadcast/player' in iframeHtml:
            finalUrl = Decoder.decodeBroadcastst(iframeUrl, page)
        elif 'vaughnlive.tv/embed/video/' in iframeUrl:
            finalUrl = Decoder.decodeVaughnlivetv(iframeUrl, page)
    # NOTE(review): finalUrl is unbound if neither branch matched — confirm callers
    logger.debug("done!")
    return finalUrl
def decrypt(encrypted):
    """Best-effort decrypt of a Splive payload.

    Offline builds decrypt locally with PBEWithMD5AndDES; online builds
    delegate to the remote decoder service. Plain http:// links and empty
    payloads pass through untouched, as does anything that fails to decrypt.
    """
    decrypted = encrypted
    try:
        logger.debug("Encrypted content is: " + encrypted)
        if not ONLINE:
            decrypted = PBEWithMD5AndDES.decrypt(encrypted, Spliveappcom.PASSWORD)
        elif encrypted and "http://" not in encrypted:
            decrypted = Downloader.getContentFromUrl(
                Spliveappcom.DECODER_URL + '?data=' + encrypted +
                "&key=" + Spliveappcom.PASSWORD + "&iterations=1000")
        logger.debug("Decrypted content is: " + decrypted)
    except:
        # Deliberate best-effort: on any failure fall back to the raw input.
        logger.error("Could not be unencrypted: " + encrypted)
    return decrypted
def decodeOpenload(link):
    """Resolve an openload.io file link through the public dlticket API.

    Visits the embed page first (to collect cookies), requests a download
    ticket, honours the mandated wait time, then exchanges the ticket for
    the direct file URL.
    """
    # Warm up cookies through the embed page.
    mediaId = Decoder.extract("/f/", "/", link)
    embedUrl = "https://openload.io/embed/" + mediaId
    html = Downloader.getContentFromUrl(embedUrl, "", "", "", False, False)
    logger.info("html is: " + html)
    logger.debug("using cookie 1: " + Downloader.cookie)
    logger.debug("Media id for openload is: " + mediaId)
    # Anonymous API credentials; sending them skips the captcha step.
    extra = "&login=f750b26513f64034&key=oaA-MbZo" # this avoid captcha petition
    ticketUrl = "https://api.openload.io/1/file/dlticket?file=" + mediaId + extra
    ticketRaw = Downloader.getContentFromUrl(ticketUrl, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData: " + ticketRaw)
    ticketJson = json.loads(ticketRaw)
    # The API enforces a server-side delay before the ticket is valid.
    logger.info("sleeping... " + str(ticketJson["result"]["wait_time"]))
    time.sleep(int(ticketJson["result"]["wait_time"]))
    dlUrl = "https://api.openload.io/1/file/dl?file=%s&ticket=%s" % (mediaId, ticketJson["result"]["ticket"])
    logger.debug("using cookie 2: " + Downloader.cookie)
    dlRaw = Downloader.getContentFromUrl(dlUrl, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData 2: " + dlRaw)
    dlJson = json.loads(dlRaw)
    finalFile = dlJson["result"]["url"] + "?mime=true"
    logger.info("Built final link: " + finalFile)
    return finalFile
def extractSawlive(scriptSrc, cookie, iframeUrl):
    """Build a playable link from a sawlive.tv player script.

    Downloads the obfuscated script, decodes the real player URL, then
    extracts the SWFObject parameters ('file', optional 'streamer', swf
    path) and assembles either a direct http(s) link or an rtmp command
    line with playpath/swfUrl/pageUrl.
    """
    encryptedHtml = Downloader.getContentFromUrl(scriptSrc, "", cookie, iframeUrl)
    decryptedUrl = Decoder.decodeSawliveUrl(encryptedHtml)
    html3 = Downloader.getContentFromUrl(decryptedUrl, "", cookie, scriptSrc)
    # ok, now extract flash script content
    flashContent = Decoder.extract("var so = new SWFObject('", "</script>", html3)
    file = Decoder.extract("'file', '", "');", flashContent)
    rtmpUrl = ""
    # BUG FIX: the original compared find() against .1 (the float 0.1),
    # which wrongly skipped a 'streamer' entry at index 0; str.find()
    # signals absence with -1.
    if flashContent.find("'streamer', '") > -1:
        rtmpUrl = Decoder.extract("'streamer', '", "');", flashContent)
    swfUrl = "http://static3.sawlive.tv/player.swf" #default
    # The swf url is the text preceding the first quote of the SWFObject
    # call; keep the default if the page layout changed and no quote exists.
    quoteIdx = flashContent.find("'")
    if quoteIdx > -1:
        swfUrl = flashContent[:quoteIdx]
    logger.info("updated swf player to: " + swfUrl)
    if rtmpUrl == '' and file.find("http://") > -1:
        finalRtmpUrl = file  # it's a redirect with an .m3u8, so it's used directly
    else:
        finalRtmpUrl = rtmpUrl + " playpath=" + file + " swfUrl=" + swfUrl + " live=1 conn=S:OK pageUrl=" + decryptedUrl + " timeout=12"
    return finalRtmpUrl
def getChannels(page):
    """List Redeneobux entries.

    page == '0' scrapes the channel cards from the listing page; any other
    value is treated as a detail page whose adf.ly-protected m3u list is
    resolved and parsed into final stream entries.
    """
    entries = []
    if page == '0':
        page = RedeneobuxCom.LIST_PAGE
        listingHtml = RedeneobuxCom.getContentFromUrl(page)
        # Skip the prefix before the first card, then parse each card.
        for card in listingHtml.split('<div class="media">')[1:]:
            thumb = Decoder.extract('<img src=\'', "'", card)
            target = Decoder.extract('location.href=\'', "'", card)
            caption = Decoder.extract('\' alt=\'', "'", card)
            if "http" in target:
                logger.debug("appending result: "+caption+", url: "+target)
                entries.append({"title": caption, "link": target, "thumbnail": thumb})
    else:
        content = RedeneobuxCom.getContentFromUrl(url=page, referer=RedeneobuxCom.LIST_PAGE)
        logger.debug("list content is: " + content)
        url = Decoder.extractWithRegex('http', " ", content).replace(" ", "")
        logger.debug("url is: " + url)
        if 'adf' in url:
            listUrl = Decoder.decodeAdfly(url)
            logger.debug("list obtained is: "+listUrl)
            m3uContent = Downloader.getSimpleDownload(listUrl)  # simple urllib2 download
            logger.debug("content: "+m3uContent)
            # Skip the m3u header, then one entry per #EXTINF block.
            for block in m3uContent.split('#EXTINF:')[1:]:
                caption = Decoder.extract(',', '\n', block)
                block = block[block.find("\n"):]
                streamUrl = Decoder.extractWithRegex('http://', "\n", block).replace('\n', '')
                item = {}
                item["title"] = caption
                item["link"] = streamUrl
                item["thumbnail"] = ''
                item["finalLink"] = True
                if "://" in streamUrl:
                    logger.debug("added: " + caption + ", content: " + streamUrl)
                    entries.append(item)
    return entries
def extractElementsSearch(html):
    """Map fildo.net search-suggestion JSON to menu entries.

    Each result becomes a dict with 'title' and 'link'; 'Songs' results
    additionally resolve the mp3 URL and carry a 'thumbnail'.
    """
    jsonContent = json.loads(html)
    x = []
    for jsonValues in jsonContent:
        element = {}
        element["title"] = jsonValues["label"]+" - "+jsonValues["category"]
        # BUG FIX: link was unbound (NameError) for an unknown first
        # category, and silently reused the previous iteration's value
        # for later unknown categories; reset it per element.
        link = ""
        if jsonValues["category"] == 'Artists':
            link = base64.standard_b64encode(Fildonet.ARTIST+str(jsonValues["label"]))
        elif jsonValues["category"] == 'Albums':
            link = base64.standard_b64encode(Fildonet.ALBUM+str(jsonValues["id"]))
        elif jsonValues["category"] == 'Songs':
            songId = jsonValues["id"]
            html2 = Downloader.getContentFromUrl(Fildonet.SONG+str(songId))
            songsJSONS = json.loads(html2)
            # Keeps the last song of the payload, as the original did.
            for songsJSON in songsJSONS:
                link = songsJSON["mp3Url"]
                element["thumbnail"] = songsJSON["picUrl"]
        element["link"] = link
        x.append(element)
    return x
def decodeKeepVid(link):
    """Pick a download link for *link* via keepvid.com.

    Prefers a 1080p entry with audio, then 720p with audio; otherwise
    falls back to the first non-matching candidate. Returns "" when the
    page yields no links at all.
    """
    html = Downloader.getContentFromUrl("http://keepvid.com/?url="+urllib.quote_plus(link))
    tableHtml = Decoder.extract('<ul><li>', "</ul>", html)
    logger.debug("extracting from html: "+tableHtml)
    links = []
    selectedLink = ""
    for liHtml in tableHtml.split('</li>'):
        link = Decoder.extract('a href="', '"', liHtml)
        title = Decoder.extract('alt="', '"', liHtml)
        if "1080p" in title and '(Video Only)' not in title:
            selectedLink = link
        elif len(selectedLink) == 0 and "720p" in title and '(Video Only)' not in title:
            selectedLink = link
        else:
            logger.debug("No link selected with title: "+title)
            logger.debug("url at this moment is (youtube external): " + link)
            links.append(link)
    # BUG FIX: guard the fallback — the original raised IndexError on
    # links[0] when the page layout changed and nothing was collected.
    if len(selectedLink) == 0 and len(links) > 0:
        selectedLink = links[0]
    return selectedLink
class TestDownloader(aiounittest.AsyncTestCase):
    """Async tests for Downloader.

    NOTE(review): every test hits live hosts (example.com and a manhua CDN),
    so they require network access and the CDN URLs/keys may expire —
    presumably the @enter_session decorator supplies a live aiohttp session;
    confirm against its definition.
    """

    def setUp(self):
        # Start without a session; each test injects one via @enter_session.
        self.downloader = Downloader(None)

    @enter_session
    async def test_get_soup(self, session):
        # Happy path: parse a known page and check its <h1>.
        self.downloader.session = session
        soup = await self.downloader.get_soup("https://www.example.com")
        h1_text = soup.find("h1").text
        self.assertEqual(h1_text, "Example Domain")

    @enter_session
    async def test_get_soup_error(self, session):
        # A bad path must surface as RuntimeError.
        self.downloader.session = session
        with self.assertRaises(RuntimeError) as e:
            await self.downloader.get_soup("http://www.example.com/testse")

    @enter_session
    async def test_get_image(self, session):
        # Downloads one image and checks its exact byte size.
        self.downloader.session = session
        img_bytes = await self.downloader.get_img(
            "https://manhua1034-104-250-139-219.cdnmanhua.net/3/2800/1006905/1_1002.jpg?cid=1006905&key=9a12f75785ef4d8dc9fffcfa58f5e406&type=1"
        )
        self.assertEqual(len(img_bytes), 331566)

    @enter_session
    async def test_get_chapter_images(self, session):
        # Streams a two-image chapter and verifies count plus the first
        # image's payload size (results may arrive in any order).
        self.downloader.session = session
        count = 0
        urls = [
            "https://manhua1034-104-250-139-219.cdnmanhua.net/3/2800/1006905/1_1002.jpg?cid=1006905&key=9a12f75785ef4d8dc9fffcfa58f5e406&type=1",
            "https://manhua1034-104-250-139-219.cdnmanhua.net/3/2800/1006905/2_7528.jpg?cid=1006905&key=9a12f75785ef4d8dc9fffcfa58f5e406&type=1"
        ]
        async for img_dict in self.downloader.get_images(urls, ""):
            count += 1
            if img_dict["idx"] == 0:
                self.assertEqual(len(img_dict["message"]), 442088)
        self.assertEqual(count, 2)
def extractTargetVideo(link):
    """Turn a YouTube watch URL into a directly playable stream link.

    Tries the internal decrypter first; on failure mines the watch page
    for an HLS link and finally falls back to keepvid. For .m3u8 results
    the last https entry of the playlist is preferred. The returned link
    has ';'/'=' percent-escaped when needed by the player.
    """
    logger.debug("trying to decode with youtube link decrypter: " + link)
    videoCode = link[link.find("v=") + 2:]
    logger.debug("trying with code: " + videoCode)
    try:
        link = Decoder.downloadY(videoCode)
    except:
        # Fallback: fetch the watch page and mine it ourselves.
        html = Downloader.getContentFromUrl(link, referer=Youtube.MAIN_URL)
        oldLink = link
        if 'ytplayer.config = {' in html:
            logger.debug("trying new way for .m3u8 links...")
            link = urllib.unquote(Decoder.extract(',"hlsvp":"', '"', html).replace('\\', ''))
            logger.debug("new youtube extracted link from json is: " + link)
            # link += "|" + Downloader.getHeaders(oldLink)
        if "http" not in link:
            logger.debug("trying old second way: external resource...")
            link = Youtube.decodeKeepVid(oldLink)
    if ".m3u8" in link:
        playlist = Youtube.getContentFromUrl(link)
        if 'https://' not in playlist:
            logger.debug("no last one link selected :'(")
        else:
            lastEntry = playlist[playlist.rfind('https://'):]
            link = urllib.unquote_plus(lastEntry).strip()
            logger.debug("using the last one inside: "+lastEntry)
    else:
        logger.debug("nothing is transformed for youtube links.")
    logger.debug("final youtube decoded url is: " + link)
    if ";" in link:
        link = link.replace("=", "%3D").replace(";", "%3B")
    else:
        link = link.replace("%3D", "=")
    return link
def extractElementsSearch(html):
    """Convert fildo.net search JSON into menu entries.

    One dict per result with 'title' and 'link'; 'Songs' results also
    resolve the mp3 URL and carry a 'thumbnail'.
    """
    results = []
    for entry in json.loads(html):
        item = {}
        item["title"] = entry["label"] + " - " + entry["category"]
        category = entry["category"]
        if category == 'Artists':
            link = base64.standard_b64encode(Fildonet.ARTIST + str(entry["label"]))
        elif category == 'Albums':
            link = base64.standard_b64encode(Fildonet.ALBUM + str(entry["id"]))
        elif category == 'Songs':
            songId = entry["id"]
            songsPayload = Downloader.getContentFromUrl(Fildonet.SONG + str(songId))
            # Keeps the last song of the payload, matching the original.
            for songInfo in json.loads(songsPayload):
                link = songInfo["mp3Url"]
                item["thumbnail"] = songInfo["picUrl"]
        item["link"] = link
        results.append(item)
    return results
def start_file_downloading(self, source_path: str, path_to_save: str, filename: str):
    """Start an FTP download of *filename* from *source_path* into
    *path_to_save*, or queue the request if a transfer is in flight."""
    # Only one data-transfer process at a time: if one is running (or the
    # client cannot proceed), park the request in the queue and bail out.
    if not self.can_continue or self.dtp is not None:
        self.download_queue.put(
            ('down', (source_path, path_to_save, filename)))
        return
    logger.debug('file %s downloading started. destination: %s' %
                 (filename, path_to_save))
    self.pi.change_dir(source_path)
    downloader = Downloader(path_to_save, filename)
    self.dtp = downloader
    # When passive mode is negotiated, kick off the transfer; queued
    # connection so the slot runs in the receiver's thread.
    self.pi.passive_mode.connect(
        downloader.data_transfer_process.start_transfer,
        QtCore.Qt.QueuedConnection)
    # On completion: refresh the local model, clear the dtp slot, and let
    # Qt dispose of the downloader object.
    downloader.complete.connect(self.update_local_model)
    downloader.complete.connect(self.set_dtp_to_none)
    downloader.complete.connect(downloader.deleteLater)
    self.pi.initiate_passive_mode()
    self.pi.download_file(filename)
def decodeCastalbatv(url, page=''):
    """Build a playable link from a castalba.tv embed URL.

    HLS embeds return the .m3u8 file plus a Referer header; flash embeds
    are stitched into an rtmp command line (playpath/swfUrl/pageUrl).
    """
    channelId = url[url.find('cid=') + len('cid='):]
    if channelId.find("&") > -1:
        channelId = channelId[:channelId.find("&")]
    #iframeUrl = "http://castalba.tv/channel/"+channelId
    iframeUrl = url
    logger.debug("using referer: "+page)
    html = Downloader.getContentFromUrl(iframeUrl, '', "", page)
    streamLink = ""
    if html.find(".m3u8") > -1:
        # HLS variant: the player config carries a plain .m3u8 file.
        streamLink = Decoder.rExtract("'file': '", '.m3u8', html)
        logger.debug("detected castalba file: "+streamLink)
        if len(streamLink) > 0 and page != '':
            streamLink += "|Referer="+page
        else:
            streamLink += "|Referer="+streamLink
    else:
        # RTMP variant: assemble url, playpath and swf from the script.
        streamLink = Decoder.extract("var file = '", "'", html)
        flash = Decoder.extract("'flashplayer': \"", "\"", html)
        rtmpUrl = "rtmp://"+Decoder.extract("return '/", "';", html)
        playpath = streamLink+"?"+Decoder.extract("unescape('?", "'),", html)
        streamLink = rtmpUrl+" playpath="+playpath+" swfUrl="+flash+" live=1 pageUrl=http://castalba.tv/"
    logger.debug("final link from castalba is: "+streamLink)
    return streamLink
def search(text, page=0, cookie=''):
    """Search redmp3.cc for *text* and return the parsed result entries.

    NOTE(review): unquote_plus *decodes* the query before embedding it in
    the URL — presumably callers pass an already URL-encoded string;
    confirm against the callers.
    """
    searchUrl = "http://redmp3.cc/mp3-" + urllib.unquote_plus(text) + "/" + str(page)
    resultsHtml = Downloader.getContentFromUrl(searchUrl, "", cookie, "")
    return Redmp3cc.extractElementsPlayer(resultsHtml)
def download_from_url(url, item):
    """Download *url* to the configured download folder with a progress
    dialog, and return a dict describing the outcome (status, size,
    progress, final filename). Streaming URLs (m3u8/rtmp) are rejected."""
    logger.info("pelisalacarta.channels.descargas download_from_url - Intentando descargar: %s" % (url))
    # Streaming protocols cannot be saved as plain files.
    if url.lower().endswith(".m3u8") or url.lower().startswith("rtmp"):
        save_server_statistics(item.server, 0, False)
        return {"downloadStatus": STATUS_CODES.error}

    # Work out the destination folder and file name.
    download_path = filetools.dirname(filetools.join(config.get_setting("downloadpath"), item.downloadFilename))
    file_name = filetools.basename(filetools.join(config.get_setting("downloadpath"), item.downloadFilename))

    # Create the folder if it does not exist.
    if not filetools.exists(download_path):
        filetools.mkdir(download_path)

    # Show the progress dialog.
    progreso = platformtools.dialog_progress("Descargas", "Iniciando descarga...")

    # Launch the download.
    d = Downloader(url, filetools.encode(download_path), filetools.encode(file_name))
    d.start()

    # Poll the download until it finishes or the user cancels.
    while d.state == d.states.downloading and not progreso.iscanceled():
        time.sleep(0.1)
        line1 = "%s" % (filetools.decode(d.filename))
        line2 = "%.2f%% - %.2f %s de %.2f %s a %.2f %s/s (%d/%d)" % (
            d.progress, d.downloaded[1], d.downloaded[2], d.size[1], d.size[2],
            d.speed[1], d.speed[2], d.connections[0], d.connections[1])
        line3 = "Tiempo restante: %s" % (d.remaining_time)
        progreso.update(int(d.progress), line1, line2, line3)

    # Download stopped — inspect the final state.
    # An error occurred during the download.
    if d.state == d.states.error:
        logger.info("pelisalacarta.channels.descargas download_video - Error al intentar descargar %s" % (url))
        d.stop()
        progreso.close()
        status = STATUS_CODES.error
    # Still in 'downloading' state: the user clicked cancel.
    elif d.state == d.states.downloading:
        logger.info("pelisalacarta.channels.descargas download_video - Descarga detenida")
        d.stop()
        progreso.close()
        status = STATUS_CODES.canceled
    # The download finished.
    elif d.state == d.states.completed:
        logger.info("pelisalacarta.channels.descargas download_video - Descargado correctamente")
        progreso.close()
        status = STATUS_CODES.completed
        # A size mismatch against the expected size counts as an error.
        if item.downloadSize and item.downloadSize != d.size[0]:
            status = STATUS_CODES.error

    save_server_statistics(item.server, d.speed[0], d.state != d.states.error)

    if progreso.iscanceled():
        status = STATUS_CODES.canceled

    dir = os.path.dirname(item.downloadFilename)
    file = filetools.join(dir, filetools.decode(d.filename))

    # Completed downloads are moved into the library.
    if status == STATUS_CODES.completed:
        move_to_libray(item.clone(downloadFilename=file))

    return {"downloadUrl": d.download_url, "downloadStatus": status, "downloadSize": d.size[0],
            "downloadProgress": d.progress, "downloadCompleted": d.downloaded[0], "downloadFilename": file}
def setUp(self):
    # Create the downloader under test without a session — presumably a
    # live session is injected per-test (e.g. by a decorator); confirm.
    self.downloader = Downloader(None)
class Core():
    """Crawler engine: wires spiders, scheduler, downloader, middlewares
    and pipelines together and drives the request/response/item loop."""

    def __init__(self):  #, spider_group, task_gettter):
        # self.spider_group = spider_group
        # self.task_getter = task_gettter
        self.spiders = self._auto_import_cls(SPIDERS, True)      # {spider_name: spider instance}
        self.pool = Pool()
        self.pipelines = self._auto_import_cls(PIPELINES)
        self.spider_mids = self._auto_import_cls(SPIDER_MIDDLEWARES)
        #self.downloader_mids = downloader_mids
        self.downloader_mids = self._auto_import_cls(DOWNLOADER_MIDDLEWARES)
        self.scheduler = Scheduler(ROLE, QUEUE_TYPE)
        self.downloader = Downloader()
        # self.spider_mids = spider_mids
        self.spider_mids = self._auto_import_cls(SPIDER_MIDDLEWARES)
        self.is_running = True
        self.total_response = 0   # count of fully processed responses
        self.executor = BaseThreadPoolExecutor(max_workers=ASYNC_COUNT)

    def _auto_import_cls(self, path_list=[], is_spider=False):
        """Import and instantiate the classes named by dotted paths.

        Returns a {spider.name: instance} dict for spiders, otherwise a
        list of instances (pipelines / middlewares).
        NOTE(review): the mutable default [] is shared across calls —
        harmless here because it is never mutated, but fragile.
        """
        if is_spider:
            instances = {}
        else:
            instances = []
        import importlib
        for path in path_list:
            if is_spider:
                # Spiders live under the crawlers package.
                module_name = 'crawlers.' + path[:path.rfind(".")]
                class_name = path[path.rfind(".") + 1:]
                result = importlib.import_module(module_name)
                cls = getattr(result, class_name)
                instances[cls.name] = cls()
                print(f'爬虫“{cls.name}”已加载')
            else:
                module_name = path[:path.rfind(".")]
                class_name = path[path.rfind(".") + 1:]
                result = importlib.import_module(module_name)
                cls = getattr(result, class_name)
                instances.append(cls())
                print(f'“{cls.__name__}”已加载')
        return instances

    def _start_engine(self):
        # The master role only enqueues requests, so total_request grows,
        # but it does not send them, so total_response does not.
        if ROLE == "master" or ROLE is None:
            # Part 1 of the engine's work: push start_requests into the
            # scheduler (done asynchronously on the pool).
            #self._execute_start_requests()
            self.pool.apply_async(self._execute_start_requests)
        while 1:
            time.sleep(0.01)
            li_req = self.scheduler.get_batch_requests(ASYNC_COUNT)
            if not li_req:
                continue
            # Fan the batch out over the thread pool and wait for it.
            tasks = [
                self.executor.submit(self._execute_request_return_item, req)
                for req in li_req
            ]
            for fu in as_completed(tasks):
                fu.result()
            # All enqueued requests answered -> shut down.
            if self.scheduler.total_request == self.total_response and self.scheduler.total_request != 0:
                self.is_running = False
                break
        print("Main Thread is over!")

    # def _callback(self, _):
    #     if self.is_running:
    #         self.pool.apply_async(self._execute_request_response_item, callback=self._callback)

    def start(self):
        """Run the engine and report wall-clock timing."""
        # start time
        start = datetime.now()
        print("Start time : {}".format(start))
        print("----" * 30)
        self._start_engine()
        # end time
        end = datetime.now()
        print("----" * 30)
        print("End time : {}".format(end))
        # total running time
        print("Useing time : {}".format((end - start).total_seconds()))

    def _execute_start_requests(self):
        # Push the start_urls requests of every spider into one shared
        # scheduler.
        #[("baidu", baidu_spider), ("douban" : douban_spider)]
        for spider_name, spider in self.spiders.items():
            print(spider_name, spider)
            # 1. Take the first batch of requests from the spider and hand
            # them to the scheduler.
            #request = self.spider.start_requests()
            for request in spider.start_requests():
                # Tag the request with its spider name on first handling so
                # the name propagates to requests extracted later.
                request.spider_name = spider_name
                # 1.1 Let the spider middlewares preprocess the request.
                for spider_mid in self.spider_mids:
                    request = spider_mid.process_request(request, spider)
                self.scheduler.add_request(request)

    def _execute_request_response_item(self):
        # Each call processes a single request of a single spider.
        #while True:
        # 2. Pop a request from the scheduler, download it, and hand the
        # response to the spider for parsing.
        request = self.scheduler.get_request()
        if not request:
            #break
            return
        # Look up the spider this request belongs to.
        spider = self.spiders[request.spider_name]
        # 2.1 Downloader middlewares preprocess the request.
        for downloader_mid in self.downloader_mids:
            request = downloader_mid.process_request(request, spider)
        response = self.downloader.send_request(request)
        # 2.2 Downloader middlewares postprocess the response.
        for downloader_mid in self.downloader_mids:
            response = downloader_mid.process_response(response, spider)
        # Hand the response to the spider for parsing.
        # parse_func = spider.parse(response)  # e.g. parse, parse_page
        #getattr(spider, "parse_page")
        # Dynamically resolve the callback named by the request and feed it
        # the response.
        callback_func = getattr(spider, request.callback)
        parse_func = callback_func(response)
        for item_or_request in parse_func:
            # 3. Dispatch the parse result: requests go back to the
            # scheduler; items go through the pipelines.
            if isinstance(item_or_request, LRequest):
                item_or_request.spider_name = spider.name
                for spider_mid in self.spider_mids:
                    item_or_request = spider_mid.process_request(
                        item_or_request, spider)
                self.scheduler.add_request(item_or_request)
            elif isinstance(item_or_request, Item):
                for spider_mid in self.spider_mids:
                    item_or_request = spider_mid.process_item(
                        item_or_request, spider)
                for pipeline in self.pipelines:
                    item_or_request = pipeline.process_item(
                        item_or_request, spider)
            else:
                raise Exception("Not support data type : <{}>".format(
                    type(item_or_request)))
        self.total_response += 1

    def _execute_request_return_item(self, request: LRequest):
        """Download one request and dispatch its parse results; download or
        parse failures are logged via the spider's logger and swallowed."""
        if not request:
            return
        spider = self.spiders[request.spider_name]
        for downloader_mid in self.downloader_mids:
            request = downloader_mid.process_request(request, spider)
        try:
            response = self.downloader.send_request(request)
        except Exception as e:
            spider.logger.error(f'链接{request.url}出错:' + str(e))
            return
        for downloader_mid in self.downloader_mids:
            response = downloader_mid.process_response(response, spider)
        callback_func = getattr(spider, request.callback)
        try:
            parse_func = callback_func(response)
            for item_or_request in parse_func:
                if isinstance(item_or_request, LRequest):
                    item_or_request.spider_name = spider.name
                    for spider_mid in self.spider_mids:
                        item_or_request = spider_mid.process_request(
                            item_or_request, spider)
                    self.scheduler.add_request(item_or_request)
                elif isinstance(item_or_request, Item):
                    for spider_mid in self.spider_mids:
                        item_or_request = spider_mid.process_item(
                            item_or_request, spider)
                    for pipeline in self.pipelines:
                        item_or_request = pipeline.process_item(
                            item_or_request, spider)
                else:
                    raise Exception("Not support data type : <{}>".format(
                        type(item_or_request)))
        except Exception as e:
            spider.logger.error(f'解析{request.url}出错:' + str(e) + f'响应码[{response.status_code}]')
            return
        self.total_response += 1
def getChannels(page):
    """Browse zonasports.me.

    page '0' renders the main menu; any other value is a base64-encoded
    channel URL that is fetched and resolved — via one of several
    embedded-player decoders — into a single playable 'Stream' entry.
    """
    x = []
    logger.debug("page is: "+page)
    if str(page) == '0':
        page = Zonasportsme.MAIN_URL
    else:
        logger.debug("decoding page: "+page)
        page = base64.b64decode(page)
        logger.debug("decoded page: "+page)
    logger.debug("launching web petition to page: "+page)
    html = Zonasportsme.getContentFromUrl(page, "", Zonasportsme.cookie, Zonasportsme.MAIN_URL)
    if page == Zonasportsme.MAIN_URL:
        logger.debug("browsing main menu...")
        menu = Decoder.extract('<ul class="nav" id="main-menu">', "</li></ul></li></ul>", html)
        x = Zonasportsme.extractElements(menu)
    else:
        url = ""
        # decoder part: detect which player the channel page embeds and
        # resolve it to a playable url
        if 'http://www.ustream.tv/' in html:
            uStreamUrl = Decoder.extractWithRegex('http://www.ustream.', '"', html)
            url = Decoder.getUstreamLink(uStreamUrl, page)
        elif 'castamp.com/embed.js' in html:
            channel = Decoder.extract('channel="', '"', html)
            url = Decoder.getCastcampLink(channel, page)
        elif 'adca.st/broadcast/player.js' in html:
            if "<script type='text/javascript'>id='" in html:
                id2 = Decoder.extract("<script type='text/javascript'>id='", "';", html)
                logger.debug("using id = " + id2)
                url4 = "http://bro.adca.st/stream.php?id=" + id2 + "&width=700&height=450&stretching=uniform"
                html4 = Zonasportsme.getContentFromUrl(url4, "", Zonasportsme.cookie, page)
                logger.debug("html4: " + html4)
                # curl is a base64 rtmp prefix; fn names the token endpoint.
                curl = Decoder.rExtract('= "', '=";', html4)+'='
                fn = Decoder.rExtract('"', '.php";', html4)
                token = Zonasportsme.getContentFromUrl('http://bro.adca.st/'+fn+'.php', "", Zonasportsme.cookie, url4, True)
                logger.debug("token: " + token)
                token = Decoder.extract('":"', '"', token)
                file = base64.decodestring(curl) + token + "|" + Downloader.getHeaders('http://cdn.allofme.site/jw/jwplayer.flash.swf')
                logger.debug("final url is: " + file)
                url = file
        elif 'zony.tv/static/scripts/zony.js' in html:
            channel = Decoder.extract("channel='", "'", html)
            url = 'http://www.zony.tv/embedplayer/'+channel+'/1/700/400/'
            html2 = Zonasportsme.getContentFromUrl(url=url, referer=page)
            logger.debug("html2 is: "+html2)
            #newParam = Decoder.extract("so.addParam('FlashVars', '", "'", html2)
            # brute params, needs a sort
            newParam = Decoder.extractParams(html2)
            # NOTE(review): hard-coded rtmp host — presumably zony.tv's
            # streaming server at the time of writing; confirm it still holds.
            rtmp = "rtmp://146.185.16.62/stream playPath="+newParam+" swfVfy=1 timeout=10 conn=S:OK live=true swfUrl=http://www.zony.tv/static/scripts/fplayer.swf flashver=WIN/2019,0,0,226 pageUrl="+page
            url = rtmp
        elif 'http://www.embeducaster.com/static/' in html:
            channel = Decoder.extract("channel='", "'", html)
            url = 'http://www.embeducaster.com/embedplayer/' + channel + '/1/700/400/'
            html2 = Zonasportsme.getContentFromUrl(url=url, referer=page)
            logger.debug("html2 is: " + html2)
            url = Decoder.decodeUcaster(html2, url)
        elif '247bay.tv/static/' in html:
            channel = Decoder.extract("channel='", "'", html)
            url = 'http://www.247bay.tv/embedplayer/'+channel+'/2/750/420'
            url = Decoder.decode247bay(url, page)
        # Single playable entry for the resolved channel.
        element = {}
        element["title"] = "Stream"
        element["link"] = url
        element["permaLink"] = True
        x.append(element)
    return x
class Engine(object):
    """Threaded crawler engine (Python 2).

    Seeds come from a Google search on the configured keywords; two checker
    threads shuttle Html tasks between a download pool and a parse pool,
    and a status thread posts runtime counters to a remote MySQL every
    second. Candidate URLs are filtered by a chain of strategy handlers
    (scheme, bookmark, CGI, nesting level, file type, revisit, robots).
    """

    def __init__( self):
        self._istart = False          # engine run flag checked by worker loops
        self._status = Status()       # shared runtime counters
        """--- load config file----"""
        self._config = Configuration();
        """--- core object ----"""
        self._downloader = None
        self._parser = None
        """--- memory models --- """
        self._download_pool = SafeQueue() #Store the html objects to be downloaded by the downloader
        self._parse_pool = SafeQueue() #Store the html objects to be parsed by the parser
        """--- checker threads --- """
        """The target is the function passed in to run in the thread. Those two threads keep checking and assigning jobs to the two thread pools"""
        self._downloader_pool_checker = Thread( target=self.download_pool_checker)
        self._parse_pool_checker = Thread( target=self.parse_pool_checker)
        """--- threads --- """
        self._status_update = Thread( target=self.status_update) #every second, this thread post runtime info to remote mysql
        """ ---strategies--- """
        self._earlyvisithandler = EarlyVisitHandler()
        self._robothandler = RobotHandler()
        self._cgihandler = CGIHandler()
        self._nestlevelhandler = NestLevelHandler()
        self._schemehandler = SchemeHandler()
        self._filetypehandler = FileTypeHandler()
        self._bookmarkhandler = BookMarkHandler()
        self._omitindex = OmitIndex()
        self._urlextender = URLExtender()
        """ ---init the path for saving data, if the folder don't exist, create it ---"""
        self._path = self._config._down_path+"/"+ strftime('%Y-%m-%d', localtime())+"/"+ strftime('%H-%M-%S', localtime())+"/"
        if not os.path.exists(self._path):
            os.makedirs(self._path)
        self._config._down_path = self._path
        self._keywords_links= []      # seed URLs from the Google search
        """ ---Mysql Manager--- """
        self.sqlex = DatabseManager(self._config)
        #self.f= open("data.txt", 'w')

    def load_seeds(self):
        """Fetch seed URLs from Google and push the ones that pass the
        strategy filters into the download pool."""
        #load seed info from config file
        #print "load_seeds 1"
        #load seed from
        contacter = SearchGoogle(self._config._keywords, self._config._result_num)
        self._keywords_links = contacter.getURLs()
        #append seeds, which from google search result, into download pool
        #print "load_seeds 2"
        #self._keywords_links.insert(0, "https://twitter.com/")
        #self._keywords_links.insert(0, "https://twitter.com/signup?context=login")
        i = 0
        for url in self._keywords_links:
            if i < self._config._result_num:
                #print "@@{0}".format(url)
                html_task = Html(url)
                #print "@@1"
                # Filter chain: each rejected seed bumps its own counter.
                if(self._schemehandler.SchemeChecker(html_task)==False):
                    #print "@@2"
                    self._status._scheme+=1
                    continue
                if(self._bookmarkhandler.BookMarkChecker(html_task)==True):
                    #print "@@3"
                    self._status._bookmark+=1
                    continue
                if(self._cgihandler.FindCGI(html_task)==True):
                    #print "@@4"
                    self._status._cgi+=1
                    continue
                if(self._nestlevelhandler.checknestlevel(html_task,self._config._parser_nlv)==True):
                    self._status._nestlv +=1
                    #print "@@5"
                    continue
                if(self._filetypehandler.FileTypeChecker(html_task)==False):
                    #print "@@6"
                    self._status._file_type +=1
                    continue
                #print "@@7"
                '''
                if(self._earlyvisithandler.check_visited(html_task) == True):
                    self._status._early_visit +=1
                    continue
                '''
                self._omitindex.Omit(html_task)
                """
                print "@@8"
                if(self._robothandler.is_allowed(html_task) == False):
                    print "@@9"
                    self._status._robot +=1
                    continue
                print "@@10"
                """
                # Record the seed as visited and queue it for download.
                self._earlyvisithandler.add_entry(html_task._md5, html_task)
                self._download_pool.append(html_task)
                '''If use the following two line of code, then the program won't run, which means checking for revisit works'''
                '''however, the dic should be safe with a lock'''
                #self._visited_dic[html_task._md5] = html_task._url
                #print(len(self._visited_dic))
                #print "@@11"
            else:
                break
            i+=1
        #print "load_seeds 3"

    def show_welcome(self):
        """Print the run configuration and the loaded seed list."""
        print("download folder:"+self._path)
        print "key words:"+self._config._keywords
        print "donload thread num: {0}".format(self._config._down_num)
        print "parse thread num: {0}".format(self._config._parser_num)
        print "Load " +str(self._config._result_num)+" results from google search:"
        i = 0
        for url in self._keywords_links:
            if i < self._config._result_num:
                print ("[{0}]".format(i)+url)
            i+=1
        print "\n------------------------------------------------------------------------\n"
        #raw_input("press any key to start crawling, press second key to stop")

    def wait_for_start(self):
        """Block until the remote MySQL flag says the run may start."""
        print "ready for start....."
        print "go to http://dengxu.me/crawling/ to input some key words & see the result "
        while( self.sqlex.read_if_start(self._config)!= True):
            sleep(1)
        print "\n------------------------------------------------------------------------\n"
        print "starting crawling engine...."

    def start(self):
        """Wait for the go signal, load seeds, and start all worker threads."""
        try:
            self.wait_for_start()
            self._istart = True
            """load seed """
            self.load_seeds()
            """show welcome info"""
            self.show_welcome()
            self._status._sys_start = time()
            """start threads"""
            self._downloader = Downloader( self._config._down_num, self._status)
            self._downloader.start()
            self._parser = Parser(self._config._parser_num, self._status )
            self._parser.start()
            self._downloader_pool_checker.start()
            self._parse_pool_checker.start()
            self._status_update.start()
            """notify mysql, i am started"""
            self.sqlex.write_if_start()
        except (Exception) as e:
            Log().debug("start failed")
            raise(e)
            # NOTE(review): unreachable after the raise above.
            return False

    def stop(self):
        """Drain the pools and join every worker thread."""
        self._istart = False
        """"clear download and parse popl"""
        self._download_pool.clear()
        self._parse_pool.clear()
        """stop downloader and parser threads"""
        self._downloader.stop()
        self._parser.stop()
        """"Those two checker threads will end when the thread who calls them ends"""
        self._downloader_pool_checker.join()
        self._parse_pool_checker.join()
        self._status_update.join()
        print ("Engine is stopping")

    def pause(self):
        # Not implemented.
        pass

    def finish_download(self, html_task):
        """Downloader callback: forward the downloaded task to the parse pool."""
        sentence = "Downloaded:[No.{0}] time:{1:0.1f} page:depth_parent {2}_{3} http-code: {4} data-size: {5}byes url: {6}"\
            .format(self._status._download_times,time()-self._status._sys_start,html_task._depth,\
            html_task._parent,html_task._return_code, html_task._data_size, html_task._url )
        #if self._status._download_times <= 500 :
        #    self.f.write(sentence+"\n")
        """caculate the path for saving files"""
        full_path = self._path+"[No.{0}]_".format(self._status._download_times)+".html"
        """save html data to files"""
        #f= open(full_path, 'w')
        #f.write(html_task._data)
        #f.close()
        """After downloading, pass the data(still using the html objects) to the parse pool"""
        self._parse_pool.append(html_task)

    def finish_parse(self, html_task):
        """Parser callback: re-queue the task for download unless it was
        already visited or robots.txt forbids it."""
        """After parsing, pass the urls to be downloaded to the download pool"""
        if(self._earlyvisithandler.check_visited(html_task) == True):
            self._status._early_visit +=1
            return
        if(self._robothandler.is_allowed(html_task) == False):
            self._status._robot +=1
            return
        self._earlyvisithandler.add_entry(html_task._md5, html_task)
        self._download_pool.append(html_task)

    def download_pool_checker(self):
        """Thread loop: feed pending tasks from the download pool to the
        downloader, sleeping briefly when the pool is empty."""
        while (self._istart == True):
            new_download_task = self._download_pool.pop_left()
            """If there is no task remain in the download pool, put the thread into sleep"""
            """else pop the new task, and download it"""
            """for the engine to get the result to put into the parse pool, we need to pass the function finish_download down as a callback"""
            if (new_download_task == None):
                #print("No task remaining in download_pool")
                sleep(0.1)
            else:
                self._downloader.queue_download_task(new_download_task , self.finish_download)

    def parse_pool_checker(self):
        """Thread loop: feed downloaded tasks from the parse pool to the
        parser, sleeping briefly when the pool is empty."""
        while (self._istart == True):
            new_parse_task = self._parse_pool.pop_left()
            if (new_parse_task == None):
                #print("sleeping")
                sleep(0.1)
            else:
                self._parser.queue_parse_task(new_parse_task, self.finish_parse)

    #~~~see result at http://dengxu.me/crawling/
    def status_update(self):
        """Thread loop: once per second, print the counters and push status
        plus the recent-download list to the remote MySQL."""
        while (self._istart == True):
            self._status._download_queue = self._downloader.len()
            self._status._parse_queue = self._parser.len()
            sentence = "[time: {0:0.1f}],queue:{8}, down: {1}, total: {2:0.1f}MB | queue:{9}, parsed: {3},scheme:{10}, cig: {4}, bookmark: {11} type {12} visited: {5}, robot: {6},nestlv: {7} | error: 404: {13} , timeout: {14}"\
                .format( time()-self._status._sys_start,\
                self._status._download_times, float(self._status._download_size)/1024/1024, self._status._parse_times\
                ,self._status._cgi, self._status._early_visit, self._status._robot, self._status._nestlv\
                ,self._downloader.len(), self._parser.len(),self._status._scheme_type, self._status._bookmark, self._status._file_type\
                ,self._status._404,self._status._socket_timeout)
            print sentence
            #if( self._status._download_times > 500):
            #    self.f.write( sentence+"\n")
            """update status tp mysql"""
            self.sqlex.write_status(self._status)
            """update recent download url"""
            self.sqlex.write_recent_download(self._status)
            sleep(1)