def getChannels(page, cookie='', referer=''):
    """Resolve a Fildonet section id into a list of channel elements."""
    elements = []
    key = str(page)
    if key == '0':
        elements = Fildonet.getMainSections()
    elif key == '100artist':
        page = Fildonet.MAIN_URL
        html = Downloader.getContentFromUrl(page, "", cookie, "")
        elements = Fildonet.extractElementsArtist(html)
    elif key == 'topalbums':
        page = Fildonet.MAIN_URL
        html = Downloader.getContentFromUrl(page, "", cookie, "")
        elements = Fildonet.extractElementsAlbum(html)
    elif key == 'lastestplaylists':
        pass  # not implemented; returns the empty list
    elif key.find('search') != -1:
        # ask the user for a search term via the on-screen keyboard
        keyboard = xbmc.Keyboard("")
        keyboard.doModal()
        text = ""
        if keyboard.isConfirmed():
            text = keyboard.getText()
        elements = Fildonet.search(text)
    else:
        # anything else is a base64-encoded target url
        page = base64.standard_b64decode(page)
        logger.info("ELSE --- page is: " + page)
        html = Downloader.getContentFromUrl(page, "", cookie, "")
        if page.find("albumId=") != -1:
            elements = Fildonet.buildFromJSON(json.loads(html))
        else:
            elements = Fildonet.extractElementsPlayer(html)
    return elements
def decodeStreamliveto(html, page=''):
    """Build an rtmp link for a streamlive.to embed found in *html*.

    Solves the simple math captcha when present; if the expected player swf
    is found, assembles the rtmp url (streamer/playpath/token), otherwise
    returns a hard-coded placeholder link.
    """
    iframeUrl = "http://www.streamlive.to/view/" + Decoder.extract('http://www.streamlive.to/embed/', '&width=', html)
    html2 = Downloader.getContentFromUrl(iframeUrl, urllib.urlencode({"captcha": "yes"}), "", iframeUrl)
    if html2.find("Question:") > -1:  # captcha page
        captcha = Decoder.rExtract(': ', '<br /><br />', html2)
        if captcha.find("(") > -1:
            logger.debug("resolving captcha with math..." + captcha)
            try:
                captcha = Decoder.resolveSimpleMath(captcha)
            except Exception:  # narrowed from bare except; keep best-effort behaviour
                logger.error("Could not resolve captcha: " + captcha)
        logger.debug("captcha=" + captcha)
        captchaPost = urllib.urlencode({'captcha': captcha})
        logger.debug(captchaPost)
        time.sleep(3)
        html2 = Downloader.getContentFromUrl(iframeUrl, captchaPost, Downloader.cookie, iframeUrl)
    link = "http://harddevelop.com/2015/11/tv-box.html|Referer=http://gordosyfrikis.com/"  # fallback ;)
    if html2.find("http://www.streamlive.to/ads/ilive_player.swf") > -1:
        # builds the link
        swfUrl = "http://www.streamlive.to/ads/streamlive.swf"
        tokenUrl = Decoder.extractWithRegex("http://www.streamlive.to/server.php?id=", '"', html2)
        tokenUrl = tokenUrl[:-1]  # simplified from tokenUrl[:(len(tokenUrl)-1)]; drops the trailing quote
        token = Downloader.getContentFromUrl(tokenUrl, "", Downloader.cookie, page)
        token = Decoder.extract('{"token":"', '"}', token)
        file = Decoder.extract('file: "', '",', html2).replace('.flv', '')
        streamer = Decoder.extract('streamer: "', '",', html2).replace("\\", "")
        link = streamer + "./" + file + " playpath=" + file + " live=1 token=" + token + " swfUrl=" + swfUrl + " pageUrl=http://www.streamlive.to/view" + (iframeUrl[iframeUrl.rfind("/"):])
    logger.debug("built a link to be used: " + link)
    return link
def getChannels(page, cookie='', referer=''):
    """Return channel elements for the given Fildonet page id."""
    pageId = str(page)
    if pageId == '0':
        return Fildonet.getMainSections()
    if pageId in ('100artist', 'topalbums'):
        # both top lists scrape the main page
        html = Downloader.getContentFromUrl(Fildonet.MAIN_URL, "", cookie, "")
        if pageId == '100artist':
            return Fildonet.extractElementsArtist(html)
        return Fildonet.extractElementsAlbum(html)
    if pageId == 'lastestplaylists':
        return []  # not implemented
    if 'search' in pageId:
        keyboard = xbmc.Keyboard("")
        keyboard.doModal()
        text = keyboard.getText() if keyboard.isConfirmed() else ""
        return Fildonet.search(text)
    # otherwise the page id is a base64-encoded target url
    decoded = base64.standard_b64decode(page)
    logger.info("ELSE --- page is: " + decoded)
    html = Downloader.getContentFromUrl(decoded, "", cookie, "")
    if "albumId=" in decoded:
        return Fildonet.buildFromJSON(json.loads(html))
    return Fildonet.extractElementsPlayer(html)
def decodeOpenload(link):
    """Resolve an openload.io /f/ link through the dl-ticket API."""
    # get cookies
    fileId = Decoder.extract("/f/", "/", link)
    embedUrl = 'https://openload.io/embed/' + fileId
    html = Downloader.getContentFromUrl(embedUrl, "", "", "", False, False)
    logger.info("html is: " + html)
    logger.debug("using cookie 1: " + Downloader.cookie)
    logger.debug("Media id for openload is: " + fileId)
    extra = "&login=f750b26513f64034&key=oaA-MbZo"  # this avoid captcha petition
    ticketUrl = "https://api.openload.io/1/file/dlticket?file=" + fileId + extra
    data = Downloader.getContentFromUrl(ticketUrl, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData: " + data)
    ticketJson = json.loads(data)
    # honour the API-imposed wait before requesting the real link
    logger.info("sleeping... " + str(ticketJson['result']['wait_time']))
    time.sleep(int(ticketJson['result']['wait_time']))
    dlUrl = 'https://api.openload.io/1/file/dl?file=%s&ticket=%s' % (fileId, ticketJson['result']['ticket'])
    logger.debug("using cookie 2: " + Downloader.cookie)
    result = Downloader.getContentFromUrl(dlUrl, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData 2: " + result)
    dlJson = json.loads(result)
    file = dlJson['result']['url'] + '?mime=true'
    logger.info("Built final link: " + file)
    return file
def extractSawlive(scriptSrc, cookie, iframeUrl):
    """Decode a sawlive script url into a final rtmp (or direct http) link."""
    encryptedHtml = Downloader.getContentFromUrl(scriptSrc, "", cookie, iframeUrl)
    decryptedUrl = Decoder.decodeSawliveUrl(encryptedHtml)
    html3 = Downloader.getContentFromUrl(decryptedUrl, "", cookie, scriptSrc)
    # ok, now extract flash script content
    flashContent = Decoder.extract("var so = new SWFObject('", "</script>", html3)
    file = Decoder.extract("'file', '", "');", flashContent)
    rtmpUrl = ""
    # bugfix: was `find(...) > 0.1`, which skipped a match at index 0
    if flashContent.find("'streamer', '") != -1:
        rtmpUrl = Decoder.extract("'streamer', '", "');", flashContent)
    # the swf player url is the first quoted string of the SWFObject call
    # (the previous hard-coded static3.sawlive.tv default was a dead store)
    swfUrl = flashContent[:flashContent.find("'")]
    logger.info("updated swf player to: " + swfUrl)
    if rtmpUrl == "" and file.find("http://") > -1:
        finalRtmpUrl = file  # it's a redirect with an .m3u8, so it's used
    else:
        finalRtmpUrl = (rtmpUrl + " playpath=" + file + " swfUrl=" + swfUrl
                        + " live=1 conn=S:OK pageUrl=" + decryptedUrl + " timeout=12")
    return finalRtmpUrl
def decodeOpenloadUsingOfficialApi(link):
    """Resolve an openload.io file link via the official dl-ticket API.

    NOTE: the API frequently answers 509 (bandwidth exceeded) for all logins.
    """
    # get cookies
    mediaId = Decoder.extract("/f/", "/", link)
    embedUrl = 'https://openload.io/embed/' + mediaId
    html = Downloader.getContentFromUrl(embedUrl, "", "", "", False, False)
    logger.info("html is: " + html)
    logger.debug("using cookie 1: " + Downloader.cookie)
    logger.debug("Media id for openload is: " + mediaId)
    key = "oaA-MbZo"
    login = "******"  # redacted credential; supply a valid API login here
    # bugfix: the redacted literal had been spliced straight into the
    # expression ("&login="******"...), which is invalid syntax; build it explicitly
    extra = "&login=" + login + "&key=" + key  # this avoid captcha petition
    link2 = "https://api.openload.io/1/file/dlticket?file=" + mediaId + extra
    data = Downloader.getContentFromUrl(link2, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData: " + data)
    js_result = json.loads(data)
    logger.info("sleeping... " + str(js_result['result']['wait_time']))
    time.sleep(int(js_result['result']['wait_time']))
    link3 = 'https://api.openload.io/1/file/dl?file=%s&ticket=%s' % (mediaId, js_result['result']['ticket'])
    logger.debug("using cookie 2: " + Downloader.cookie)
    result = Downloader.getContentFromUrl(link3, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData 2: " + result)
    js_result2 = json.loads(result)
    file = js_result2['result']['url'] + '?mime=true'
    logger.info("Built final link: " + file)
    return file
class Engine():
    """Wires the core modules together and drives their lifecycle."""

    def __init__(self):
        # core modules share the same logger
        self.logger = get_logger('Core', True)
        self.Scheduler = Scheduler(self.logger)
        self.Downloader = Downloader(self.logger)
        self.Uploader = Uploader(self.logger)
        self.Monitor = Monitor(self.logger)

    def _do_register(self):
        """Register this deployment; return True on success."""
        user = GlobalConfig.Deploy_dict['user']
        password = GlobalConfig.Deploy_dict['password']
        self.logger.info('registering START: %s' % user)
        RegisterSuccess = do_register(user, password, self.logger)
        self.logger.info('registering END: %s' % str(RegisterSuccess))
        return RegisterSuccess

    def start(self):
        """Start all worker threads once registration succeeds."""
        if not self._do_register():
            self.logger.info('---engine START failed---')
            return
        self.logger.info('---engine START---')
        self.Scheduler.start_threads()
        # Downloader uses spiders which use Status, so Monitor must run first
        self.Monitor.start_threads()
        self.Downloader.start_threads()
        self.Uploader.start_threads()

    def stop(self):
        pass
def getFinalLink(link):
    """Try to resolve an .m3u8 link to its real target and append headers."""
    if ".m3u8" not in link:
        logger.debug("nothing done! " + link)
        return link
    logger.debug("old link: " + link)
    oldLink = link
    m3u8Text = ZonaAppCom.getContentFromUrl(link)
    logger.debug("m3u8 content is: " + m3u8Text)
    if "http" in m3u8Text:
        m3u8Text = m3u8Text[m3u8Text.find("http"):]
        if "\n" not in m3u8Text:
            # single-line playlist: it is really a redirect url
            link = m3u8Text
            logger.debug("1) updated link to: " + link)
            if ".php" in link:
                # trying second time: the redirect points at another script
                m3u8Text = ZonaAppCom.getContentFromUrl(link)
                if "http" in m3u8Text:
                    oldLink = link
                    link = m3u8Text[m3u8Text.find("http"):]
                    logger.debug("2) updated link to: " + link)
                    link += "|" + Downloader.getHeaders(oldLink)
            else:
                link += "|" + Downloader.getHeaders(oldLink)
        else:
            logger.debug("0) Complex link, not changed!" + link)
    return link
def __init__(self):
    """Create the shared logger and instantiate every core module with it."""
    # core modules share the same logger
    self.logger = get_logger('Core', True)
    self.Scheduler = Scheduler(self.logger)
    self.Downloader = Downloader(self.logger)
    self.Uploader = Uploader(self.logger)
    self.Monitor = Monitor(self.logger)
def test_timeout_is_passed(mock_requests):
    """fetch_url must forward timeout_secs to the underlying request."""
    dl = Downloader()
    mock_requests.return_value = create_ok_return_value()
    dl.fetch_url(FAKE_COOKIE, FAKE_URL, timeout_secs=3600)
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=3600)
def decodeVidggTo(link):
    """Resolve a vidgg.to player page to its direct media url."""
    referer = "http://www.vidgg.to/player/cloudplayer.swf"
    html = Downloader.getContentFromUrl(link)
    mediaFile = Decoder.extract('flashvars.file="', '";', html)
    mediaKey = Decoder.extract('flashvars.filekey="', '";', html)
    apiUrl = ("http://www.vidgg.to/api/player.api.php?pass=undefined&key=" + mediaKey
              + "&user=undefined&numOfErrors=0&cid3=undefined&cid=1&file=" + mediaFile
              + "&cid2=undefined")
    bruteResponse = Downloader.getContentFromUrl(apiUrl)
    finalLink = Decoder.extract("url=", "&title", bruteResponse)
    logger.debug("Final link is: " + finalLink)
    return finalLink
def test_redirect_is_enabled(mock_requests):
    """fetch_url must request with redirects enabled by default."""
    dl = Downloader()
    mock_requests.return_value = create_ok_return_value()
    dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          timeout=ANY, allow_redirects=True)
def decodeVidggTo(link):
    """Fetch a vidgg.to page and query its player API for the final url."""
    referer = "http://www.vidgg.to/player/cloudplayer.swf"
    html = Downloader.getContentFromUrl(link)
    file = Decoder.extract('flashvars.file="', '";', html)
    key = Decoder.extract('flashvars.filekey="', '";', html)
    # player API echoes the real url in its response
    url2 = ("http://www.vidgg.to/api/player.api.php?pass=undefined&key=%s"
            "&user=undefined&numOfErrors=0&cid3=undefined&cid=1&file=%s"
            "&cid2=undefined") % (key, file)
    bruteResponse = Downloader.getContentFromUrl(url2)
    finalLink = Decoder.extract("url=", "&title", bruteResponse)
    logger.debug("Final link is: " + finalLink)
    return finalLink
def test_response_is_returned(mock_requests):
    """fetch_url must return the response object produced by requests."""
    dl = Downloader()
    mock_requests.return_value = create_ok_return_value()
    response = dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    assert response == create_ok_return_value()
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
def download_from_url(url, item):
    """Download *url* into the configured folder and report final status."""
    logger.info("Intentando descargar: %s" % (url))
    # streams cannot be downloaded as plain files
    if url.lower().endswith(".m3u8") or url.lower().startswith("rtmp"):
        save_server_statistics(item.server, 0, False)
        return {"downloadStatus": STATUS_CODES.error}
    # build the download path and file name
    item.downloadFilename = item.downloadFilename.replace('/', '-')
    download_path = filetools.dirname(filetools.join(DOWNLOAD_PATH, item.downloadFilename))
    file_name = filetools.basename(filetools.join(DOWNLOAD_PATH, item.downloadFilename))
    # create the folder if it does not exist yet
    if not filetools.exists(download_path):
        filetools.mkdir(download_path)
    # launch the download
    d = Downloader(url, download_path, file_name,
                   max_connections=1 + int(config.get_setting("max_connections", "downloads")),
                   block_size=2 ** (17 + int(config.get_setting("block_size", "downloads"))),
                   part_size=2 ** (20 + int(config.get_setting("part_size", "downloads"))),
                   max_buffer=2 * int(config.get_setting("max_buffer", "downloads")))
    d.start_dialog(config.get_localized_string(60332))
    # download finished/stopped: map the downloader state to a status code
    if d.state == d.states.error:
        logger.info("Error al intentar descargar %s" % (url))
        status = STATUS_CODES.error
    elif d.state == d.states.stopped:
        logger.info("Descarga detenida")
        status = STATUS_CODES.canceled
    elif d.state == d.states.completed:
        logger.info("Descargado correctamente")
        status = STATUS_CODES.completed
        # size mismatch against the expected size is treated as an error
        if item.downloadSize and item.downloadSize != d.size[0]:
            status = STATUS_CODES.error
    # NOTE(review): status is unbound if d.state matches none of the three states
    save_server_statistics(item.server, d.speed[0], d.state != d.states.error)
    dir = os.path.dirname(item.downloadFilename)
    file = filetools.join(dir, d.filename)
    if status == STATUS_CODES.completed:
        move_to_libray(item.clone(downloadFilename=file))
    return {"downloadUrl": d.download_url, "downloadStatus": status,
            "downloadSize": d.size[0], "downloadProgress": d.progress,
            "downloadCompleted": d.downloaded[0], "downloadFilename": file}
def __init__(self, start_monitor=True):
    """Initialise counters, the event loop, the bloom filter and workers."""
    self.init()
    self.number_dict = {core.constant.TOTAL_TASK: 0,
                        core.constant.TOTAL_REQUEST: 0,
                        core.constant.TOTAL_RESPONSE: 0}
    self.color = core.constant.COLOR
    self.close = False
    self.loop = asyncio.get_event_loop()
    # project-wide dedup filter
    self.filter = core.bloomFilter.bloomFilterContext.get_filter(settings.PROJECT_NAME)
    self.scheduler = Scheduler(self)
    self.downloader = Downloader(self, settings.DOWNLOADER_WORKER)
    self.save = Save(self, settings.SAVE_WORKER)
    self.monitor = Monitor(self)
    self.start_monitor = start_monitor
def download_url(url, item, path=None, filename=None, resume=False):
    """Download *url* for *item*; return a dict with size/progress/status.

    Status codes: 1 completed, 2 stopped, 3 error/unsupported.
    """
    logger.trace()
    if url.lower().endswith(".m3u8") or url.lower().startswith(
            "rtmp") or item.server == 'torrent':
        logger.debug('Servidor o tipo de medio no soportado')
        return {"status": 3}
    download_path = settings.get_setting('download_path', __file__)
    if path:
        path = filetools.join(download_path, path)
    else:
        path = download_path
    if not filetools.isdir(path):
        filetools.makedirs(path)
    d = Downloader(
        url=url,
        path=filetools.validate_path(path),
        filename=filetools.validate_path(filename),
        resume=resume,
        max_connections=1 + settings.get_setting("max_connections", __file__),
        block_size=2**(17 + settings.get_setting("block_size", __file__)),
        part_size=2**(20 + settings.get_setting("part_size", __file__)),
        max_buffer=2 * settings.get_setting("max_buffer", __file__))
    # bugfix: `"..." % item.servername or item.server` bound `%` first, so the
    # `or` fallback could never fire; parenthesize so an empty servername
    # falls back to item.server
    d.start_dialog("Descargas [%s]" % (item.servername or item.server))
    result = {
        'download_size': d.size[0],
        'download_progress': d.progress,
        'download_filename': d.filename,
        'download_path': path
    }
    if d.state == d.states.error:
        logger.debug("Error al intentar descargar %s" % url)
        result['download_status'] = 3
    elif d.state == d.states.stopped:
        logger.debug("Descarga detenida")
        result['download_status'] = 2
    elif d.state == d.states.completed:
        logger.debug("Descargado correctamente")
        result['download_status'] = 1
    return result
def test_status_code_different_from_200_causes_exception(mock_requests):
    """A non-200 status from the server must raise RuntimeError."""
    dl = Downloader()
    mock_requests.return_value = create_not_found_return_value()
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    except RuntimeError:
        raised = True
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
    assert raised
def test_empty_returned_text_causes_exception(mock_requests):
    """An OK response with no body must raise RuntimeError."""
    dl = Downloader()
    mock_requests.return_value = create_ok_return_value_without_text()
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    except RuntimeError:
        raised = True
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
    assert raised
def openSpliveLink(url, page, provider):
    """Decode a splive/cineestrenos encrypted channel url and launch it."""
    if url.find(".m3u8") == -1 and url.find("rtmp://") == -1:
        channel = Spliveappcom.decodeUrl(url, provider)
        link = channel[0]["link"]
        if link.find(", referer:") > -1:
            link = link[0:link.find(", referer:")]
        url = link
    else:
        logger.debug("nothing decoded for splive encrypted channels, continue...")
    logger.debug("splive BRUTE logic for url: " + url)
    try:
        if "ponlatv.com" in url or "playerhd1.pw" in url:
            logger.debug("trying to decode cineestrenos script from url: " + url)
            url = Cineestrenostv.extractScriptLevel3(url, referer=Cineestrenostv.MAIN_URL)
            logger.debug("decoded link was: " + url)
        else:
            url = Cineestrenostv.getChannels(url)[0]["link"]
            html = Downloader.getContentFromUrl(url)
            element = Cineestrenostv.extractIframeChannel(html, url)
            # bugfix: dict.has_key() is Python-2 only; `in` works on both 2 and 3
            if element is not None and "link" in element:
                url = element["link"]
                logger.debug("cineestrenos url was decoded to: " + url)
            else:
                logger.debug("nothing was done to decode cineestrenostv url!")
    except Exception:  # narrowed from bare except; decoding is best-effort
        logger.debug("nothing to be decoded with url: " + url)
    link = url
    logger.info("found link: " + link + ", launching...")
    open(link, page)
def drawBbcCoUkNew(url):
    """Extract the body of a BBC article page and render it via drawNew."""
    htmlContent = Downloader.getContentFromUrl(url=url)
    title = Decoder.extract('<p class="story-body__introduction">', "</p><div", htmlContent)
    if 'property="articleBody"' in htmlContent:
        body = Decoder.extract('property="articleBody"', " </div>", htmlContent)
        # strip copyright/caption boilerplate spans
        for junk in ('<span class="off-screen">Image copyright</span>',
                     '<span class="story-image-copyright">AFP</span>',
                     '<span class="story-image-copyright">Reuters</span>',
                     '<span class="off-screen">Image caption</span>',
                     '<span class="off-screen">Media caption</span>'):
            body = body.replace(junk, "")
        while '<span class="media-caption__text">' in body:
            line = Decoder.extractWithRegex('<span class="media-caption__text">', "</span>", body)
            body = body.replace(line, "")
    elif 'class="text-wrapper"' in htmlContent:
        # special content
        body = Decoder.extract('class="text-wrapper"', "</p>\n", htmlContent)
        dates = Decoder.extractWithRegex('<div class="date', "</div>", body)
        lastUpdate = Decoder.extractWithRegex('<p class="date ', "</p>", body)
        body = body.replace(dates, "").replace(lastUpdate, "")
    elif '<figcaption class="sp-media-asset' in htmlContent:
        body = Decoder.extract('<figcaption class="sp-media-asset', "</p><div ", htmlContent)
        if ">" in body:
            body = body[body.find(">") + 1:]
    # NOTE(review): body is unbound if no branch matched, and `title` is unused
    body = Decoder.removeHTML(body).replace(".", ".\n").replace(">", "")
    logger.debug("body is: " + body)
    drawNew(textContent=(body))
def start(self):
    """Bring the crawler up: load seeds, start all worker threads, notify mysql."""
    try:
        self.wait_for_start()
        self._istart = True
        # load seeds from google search
        self.load_seeds()
        # show welcome info
        self.show_welcome()
        self._status._sys_start = time()
        # start threads
        self._downloader = Downloader(self._config._down_num, self._status)
        self._downloader.start()
        self._parser = Parser(self._config._parser_num, self._status)
        self._parser.start()
        self._downloader_pool_checker.start()
        self._parse_pool_checker.start()
        self._status_update.start()
        # notify mysql that we have started
        self.sqlex.write_if_start()
    except Exception as e:
        Log().debug("start failed")
        raise e
    # NOTE(review): returns False even on success — confirm callers ignore this
    return False
def __init__(self):
    """Assemble spiders, middlewares, scheduler, downloader and the thread pool."""
    self.spiders = self._auto_import_cls(SPIDERS, True)
    self.pool = Pool()
    self.pipelines = self._auto_import_cls(PIPELINES)
    self.spider_mids = self._auto_import_cls(SPIDER_MIDDLEWARES)
    self.downloader_mids = self._auto_import_cls(DOWNLOADER_MIDDLEWARES)
    self.scheduler = Scheduler(ROLE, QUEUE_TYPE)
    self.downloader = Downloader()
    # NOTE(review): spider_mids was imported twice in the original; the second
    # call is kept for behavioural parity (side effects of _auto_import_cls)
    self.spider_mids = self._auto_import_cls(SPIDER_MIDDLEWARES)
    self.is_running = True
    self.total_response = 0
    self.executor = BaseThreadPoolExecutor(max_workers=ASYNC_COUNT)
def test_retries_when_service_unavailable_then_ok(mock_requests):
    """A 503 followed by a 200 must be retried and return the OK response."""
    dl = Downloader()
    mock_requests.side_effect = [
        create_service_unavailable_return_value(),
        create_ok_return_value()
    ]
    response = dl.fetch_url(FAKE_COOKIE, FAKE_URL, retries=3)
    assert response == create_ok_return_value()
    expected = call(url=ANY, headers=ANY, allow_redirects=ANY, timeout=ANY)
    mock_requests.assert_has_calls([expected, expected])
def decodeIguide(iframeUrl3, iframeUrl2=''):
    """Build an rtmp link from an iguide.to iframe page."""
    logger.debug("iguide url is: " + iframeUrl3)
    html4 = Downloader.getContentFromUrl(iframeUrl3, "autoplay=true", Downloader.cookie, iframeUrl2)
    logger.debug("part 2 of iguide")
    # same server family as streamlive.to: token + streamer/playpath link
    swfUrl = Decoder.rExtractWithRegex("http://", ".swf", html4)
    logger.debug("using swfUrl: " + swfUrl)
    tokenUrl = Decoder.extractWithRegex("http://www.iguide.to/serverfile.php?id=", '"', html4)
    tokenUrl = tokenUrl[:len(tokenUrl) - 1]  # drop the trailing quote
    token = Downloader.getContentFromUrl(tokenUrl, "", Downloader.cookie)
    token = Decoder.extract('{"token":"', '"}', token)
    file = Decoder.extract("'file': '", "',", html4).replace('.flv', '')
    streamer = Decoder.extract("'streamer': '", "',", html4).replace("\\", "")
    link = (streamer + " playpath=" + file + " live=1 token=" + token
            + " swfUrl=" + swfUrl + " pageUrl=" + iframeUrl3)
    logger.debug("built a link to be used: " + link)
    return link
def test_exceptions_when_internal_server_error(mock_requests):
    """A 500 response must raise RuntimeError."""
    dl = Downloader()
    mock_requests.return_value = create_internal_server_error_return_value()
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    except RuntimeError:
        raised = True
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
    assert raised
def test_timeout_is_propagated_when_retries_are_disabled(mock_requests):
    """Without retries, a Timeout from requests must bubble up unchanged."""
    dl = Downloader()
    mock_requests.side_effect = requests.exceptions.Timeout()
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    except requests.exceptions.Timeout:
        raised = True
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
    assert raised
def test_exceptions_from_get_are_propagated(mock_requests):
    """Unexpected exceptions raised by requests must bubble up unchanged."""
    dl = Downloader()
    mock_requests.side_effect = RuntimeError('Boom')
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    except RuntimeError:
        raised = True
    mock_requests.assert_called_once_with(url=ANY, headers=ANY,
                                          allow_redirects=ANY, timeout=ANY)
    assert raised
def extractDinostreamPart(url, referer=''):
    """Extract a playable element dict from a dinostream wrapper page."""
    element = {}
    logger.debug("url: " + url + ", referer: " + referer)
    html4 = Downloader.getContentFromUrl(url, "", "", referer)
    finalIframeUrl = Decoder.extractWithRegex('http://', '%3D"', html4)
    finalIframeUrl = finalIframeUrl[0:len(finalIframeUrl) - 1]  # drop trailing quote
    logger.debug("proccessing level 4, cookie: " + Downloader.cookie)
    finalHtml = Downloader.getContentFromUrl(finalIframeUrl, "", Downloader.cookie, referer)
    logger.debug("proccessing level 5, cookie: " + Downloader.cookie)
    playerUrl = Decoder.decodeBussinessApp(finalHtml, finalIframeUrl)
    element["title"] = "Watch streaming"
    element["permalink"] = True
    element["link"] = playerUrl
    return element
def parse_relayer(params):
    """Build a playable url (with headers) from Mobdro relayer params.

    Returns "NonE" when params match nothing, "exception" on missing keys.
    """
    import hashlib  # local import: replaces the deprecated md5 module
    url = "NonE"
    try:
        if "url" in params:  # bugfix: dict.has_key() is Python-2 only
            url = params["url"]
            logger.debug("mobdro.directURL: " + url)
        elif "relayer" in params:
            params2 = json.loads(params["relayer"])
            logger.debug("RELAYED: " + repr(params2))
            protocol = "http"  # params2["protocol"]
            app = params2["app"]
            server = params2["server"]
            playpath = params2["playpath"]
            password = params2["password"]
            dire = params2["dir"]
            expiration_time = params2["expiration_time"]
            millis = int(round(time.time() * 1000))
            # bugfix: floor division replaces the py2-only `/ 1000L` long literal
            l = millis // 1000 + expiration_time
            arr = [password, l, dire, playpath]
            url = "%s%d/%s/%s"
            url = url % tuple(arr)
            # sign the url: md5 digest, urlsafe-base64 without padding
            url_md5 = hashlib.md5(url).digest()  # NOTE: py3 would need url.encode()
            url_base64 = base64.b64encode(url_md5)
            url_base64 = url_base64.replace("+", "-").replace("/", "_").replace("=", "")
            arr = [protocol, server, app, playpath, url_base64, l]
            url = "%s://%s/%s/%s?st=%s&e=%d"
            url = url % tuple(arr)
            url += "|" + Downloader.getHeaders(Mobdro.MAIN_URL)
        else:
            logger.debug("REJECTED: " + repr(params))
    except KeyError:
        url = "exception"
    return url
def getChannels(page, cookie='', referer=''):
    """Resolve a Redmp3cc section id into channel elements, or download a track.

    Non-section pages (the final else) are downloaded to the temp folder and
    returned as a single already-downloaded element.
    """
    x = []
    html = ""
    if str(page) == '0':
        x = Redmp3cc.getMainSections()
    elif str(page) == 'songs.html':
        page = Redmp3cc.MAIN_URL + "/"
        html = Downloader.getContentFromUrl(page, "", cookie, "")
        x = Redmp3cc.extractElementsPlayer(html)
    elif str(page).find('search.html') != -1:
        if str(page).find('search.html/') == -1:
            # fresh search: ask the user for the term
            keyboard = xbmc.Keyboard("")
            keyboard.doModal()
            text = ""
            if (keyboard.isConfirmed()):
                text = keyboard.getText()
            x = Redmp3cc.search(text)
        else:
            # paginated search: term and page number are embedded in the id
            text = Decoder.rExtract('search.html/', '/', page)
            page = int(page[page.rfind('/') + 1:])
            x = Redmp3cc.search(text, page)
    elif str(page).find(".html") != -1:
        # bugfix: condition was `str(page) == 'albums.html'!=-1`, a confusing
        # chained comparison whose `!= -1` tail is always True; the plain
        # equality test below is behaviourally identical and readable
        if str(page) == 'albums.html':
            page = Redmp3cc.MAIN_URL
            html = Downloader.getContentFromUrl(page, "", cookie, "")
            x = Redmp3cc.extractElementsAlbum(html)
        else:
            html = Downloader.getContentFromUrl(page, "", cookie, "")
            x = Redmp3cc.extractElementsPlayer(html)
    else:
        logger.info("page is: " + page)
        response = Redmp3cc.getContentFromUrl(page, "", cookie, Redmp3cc.MAIN_URL, True)
        # build download headers from the redirect target's host
        host = response[response.find("://") + len("://"):]
        if host.find("/") > -1:
            host = host[0:host.find("/")]
        cookie = Redmp3cc.cookie
        referer = page
        logger.info("cookie is: " + cookie + ", referer is: " + referer)
        headers = downloadtools.buildMusicDownloadHeaders(host, cookie, referer)
        filename = Decoder.extract('filename=', '&', response)
        ROOT_DIR = xbmc.translatePath('special://temp/')
        logger.info("using special root folder: " + ROOT_DIR)
        downloadtools.downloadfile(response, ROOT_DIR + "/" + filename, headers, False, True)
        x.append(Redmp3cc.buildDownloadedFile(xbmc.makeLegalFilename(ROOT_DIR + "/" + filename)))
    return x
def decodeVidag(link):
    """Extract the direct .mp4 url from a vidag packed-js player page."""
    html = Downloader.getContentFromUrl(link, "", "", "", False, True)
    # bugfix: the bare `except: pass` left encodedMp4File unbound when the
    # extraction failed, turning any failure into a NameError below
    encodedMp4File = ""
    try:
        encodedMp4File = Decoder.extract(
            "<script type='text/javascript'>eval(function(p,a,c,k,e,d)",
            "</script>", html)
    except Exception:
        pass
    mp4File = jsunpack.unpack(encodedMp4File)  # needs un-p,a,c,k,e,t|d
    mp4File = Decoder.extract(',{file:"', '",', mp4File)
    return mp4File
def getListsUrls(url, icon=XBMCUtils.getAddonFilePath('icon.png'), provider='', finalTarget=1):
    """Fetch *url* and dispatch to the right drawer based on the list format."""
    html = Downloader.getContentFromUrl(url)
    if url.endswith(".xml") or '<items>' in html or '<item>' in html:
        # main channels: a browsable xml list
        drawXml(html, icon=icon, finalTarget=finalTarget, provider=provider)
    elif url.endswith(".xspf"):
        drawXspf(html, icon)
    else:
        # final channel list: split it brute-force
        drawBruteChannels(html, icon)
def test_cookie_is_passed_in_headers(mock_requests):
    """fetch_url must send the cookie alongside the fixed browser headers."""
    dl = Downloader()
    mock_requests.return_value = create_ok_return_value()
    dl.fetch_url(FAKE_COOKIE, FAKE_URL)
    wanted_headers = {
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      + '(KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36',
        'cookie': FAKE_COOKIE
    }
    mock_requests.assert_called_once_with(url=ANY, headers=wanted_headers,
                                          allow_redirects=ANY, timeout=ANY)
def decodeOpenload(link):
    """Decode an openload /f/ link like a browser would (AAEncoded js)."""
    mediaId = Decoder.extract("/f/", "/", link)
    logger.debug("mediaId is: " + mediaId)
    link = link.replace('/f/', '/embed/')
    # must POST: a GET here ends in an infinite redirect loop
    html = Downloader.getContentFromUrl(link, "data=data", "", "", False, True)
    # grab the obfuscated script that follows the <video> tag
    script = re.search(r"<video(?:.|\s)*?<script\s[^>]*?>((?:.|\s)*?)</script",
                       html, re.DOTALL | re.IGNORECASE).group(1)
    url = Decoder.decodeAAScript(script)
    logger.debug("decoded url is: " + url)
    return url
def test_timeout_is_propagated_after_last_retry_failed(mock_requests):
    """When every retry times out, the final Timeout must bubble up."""
    dl = Downloader()
    mock_requests.side_effect = [
        requests.exceptions.Timeout(),
        requests.exceptions.Timeout()
    ]
    raised = False
    try:
        dl.fetch_url(FAKE_COOKIE, FAKE_URL, retries=2)
    except requests.exceptions.Timeout:
        raised = True
    expected = call(url=ANY, headers=ANY, allow_redirects=ANY, timeout=ANY)
    mock_requests.assert_has_calls([expected, expected])
    assert raised
def decodeVidag(link):
    """Unpack the p.a.c.k.e.d player script of a vidag page into an mp4 url."""
    html = Downloader.getContentFromUrl(link, "", "", "", False, True)
    try:
        packed = Decoder.extract(
            "<script type='text/javascript'>eval(function(p,a,c,k,e,d)",
            "</script>", html)
    except:
        pass
    # un-p,a,c,k,e,t|d, then pull the file url out of the player config
    unpacked = jsunpack.unpack(packed)
    return Decoder.extract(',{file:"', '",', unpacked)
def extractSawlive(scriptSrc, cookie, iframeUrl):
    """Decode a sawlive script url into a final rtmp (or direct http) link.

    Handles players where 'file'/'streamer' are js variables that have to be
    resolved against the page source.
    """
    encryptedHtml = Downloader.getContentFromUrl(scriptSrc, "", cookie, iframeUrl)
    decryptedUrl = Decoder.decodeSawliveUrl(encryptedHtml)
    html3 = Downloader.getContentFromUrl(decryptedUrl, "", cookie, scriptSrc)
    logger.debug("decrypted sawlive url content obtained!")
    # ok, now extract flash script content
    flashContent = Decoder.extract("var so = new SWFObject('", "</script>", html3)
    file = Decoder.extract("'file', ", ");", flashContent)
    logger.debug("proccessing brute file: " + file)
    # 'file' can be a js concatenation (a+b+c): resolve each var from the page
    if file.find("+") > 1:
        newFile = ""
        for target in file.split("+"):
            seekedString = "var " + target + " = '"
            if html3.find(seekedString) > -1:
                newFile += Decoder.extract(seekedString, "'", html3)
            else:
                newFile += target
            logger.debug("now file is: " + newFile)
        file = newFile
        logger.debug("updated file to: " + file)
    else:
        file = file.replace("'", "")  # clean
    rtmpUrl = ""
    # bugfix: was `find(...) > .1`, which skipped a match at index 0
    if flashContent.find("'streamer', '") != -1:
        rtmpUrl = Decoder.extract("'streamer', '", "');", flashContent)
    else:
        # streamer given as a js variable; resolve it from the page source
        rtmpVar = Decoder.extract("'streamer', ", ");", flashContent)
        seekedString = "var " + rtmpVar + " = '"
        rtmpUrl = Decoder.extract(seekedString, "';", html3)
    # the swf player url is the first quoted string of the SWFObject call
    # (the previous hard-coded static3.sawlive.tv default was a dead store)
    swfUrl = flashContent[:flashContent.find("'")]
    logger.info("updated swf player to: " + swfUrl)
    if rtmpUrl == '' and file.find("http://") > -1:
        finalRtmpUrl = file  # it's a redirect with an .m3u8, so it's used
    else:
        finalRtmpUrl = (rtmpUrl + " playpath=" + file + " swfUrl=" + swfUrl
                        + " live=1 conn=S:OK pageUrl=" + decryptedUrl + " timeout=12")
    return finalRtmpUrl
def decodeStreamable(link):
    """Decode a streamable.ch flash object into its final base64-wrapped link."""
    html = Downloader.getContentFromUrl(link)
    flashContent = Decoder.extract("<object", "</object", html)
    movie = ""
    flashVars = ""
    # walk the <param> tags to find the player and its FlashVars
    for content in flashContent.split("<param"):
        value = Decoder.extract('value="', '"', content)
        name = Decoder.extract('name="', '"', content)
        if name in ("movie", "player"):
            movie = value
        elif name == "FlashVars":
            flashVars = value
    swfUrl = "http://www.streamable.ch" + movie  # NOTE(review): unused, kept for parity
    flashVars = flashVars[flashVars.find("="):]
    decodedFlashvars = base64.standard_b64decode(flashVars)
    logger.info("decoded url is: " + decodedFlashvars)
    response = Downloader.getContentFromUrl(decodedFlashvars)
    token = Decoder.extract('"token1":"', '"', response)
    finalLink = base64.standard_b64decode(token)
    logger.debug("final link is: " + finalLink)
    return finalLink
def decodeStreamable(link):
    """Follow a streamable.ch flash object down to the real media link."""
    html = Downloader.getContentFromUrl(link)
    objectHtml = Decoder.extract('<object', '</object', html)
    movie, flashVars = "", ""
    for chunk in objectHtml.split('<param'):
        paramValue = Decoder.extract('value="', '"', chunk)
        paramName = Decoder.extract('name="', '"', chunk)
        if paramName == "movie" or paramName == "player":
            movie = paramValue
        elif paramName == "FlashVars":
            flashVars = paramValue
    swfUrl = "http://www.streamable.ch" + movie  # NOTE(review): unused, kept for parity
    # FlashVars holds "<key>=<base64 url>": keep from the '=' on
    flashVars = flashVars[flashVars.find("="):]
    decodedFlashvars = base64.standard_b64decode(flashVars)
    logger.info("decoded url is: " + decodedFlashvars)
    response = Downloader.getContentFromUrl(decodedFlashvars)
    token = Decoder.extract("\"token1\":\"", "\"", response)
    finalLink = base64.standard_b64decode(token)
    logger.debug("final link is: " + finalLink)
    return finalLink
def extractTargetVideo(page):
    """Resolve a Streamgaroo channel page to its final stream url."""
    logger.debug("extracting from page: " + page)
    html = Streamgaroo.getContentFromUrl(url=page, referer=Streamgaroo.MAIN_URL)
    logger.debug("html is: " + html)
    apiKey = Decoder.extract('data-sh="', '"', html)
    bruteJSON = Streamgaroo.getContentFromUrl(Streamgaroo.CHANNEL_API, "h=" + apiKey,
                                              Streamgaroo.cookie, Streamgaroo.MAIN_URL)
    url2 = json.loads(bruteJSON)["link"]
    logger.debug("using url: " + url2)
    html2 = Streamgaroo.getContentFromUrl(url2, "", Streamgaroo.cookie, page)
    logger.debug("html2 is: " + html2)
    if 'playJS("' in html2:
        finalUrl = Decoder.extract('playJS("', '"', html2)
        logger.debug("found final url: " + finalUrl)
        # clean the streamgaroo fetch proxy prefix
        finalUrl = finalUrl.replace("http://www.streamgaroo.com/fetch/r/", "")
        if 'playlist.m3u8' in finalUrl and '==' in finalUrl:
            finalUrl = finalUrl.replace('playlist.m3u8?', 'chunks.m3u8?')
        finalUrl = finalUrl + "|" + urllib.unquote(Downloader.getHeaders())
    elif "playStream('iframe','" in html2:
        iframeUrl = finalUrl = Decoder.extract("playStream('iframe','", "'", html2)
        logger.debug("found iframe link: " + iframeUrl)
        try:
            iframeHtml = Downloader.getContentFromUrl(url=iframeUrl, data=" ", referer=page)
        except:
            # fallback: plain urllib2 request with spoofed browser headers
            logger.debug("trying second way, easy!!")
            import urllib2
            req = urllib2.Request(iframeUrl)
            req.add_header('Referer', page)
            req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0')
            iframeHtml = urllib2.urlopen(req).read()
            logger.debug("done!")
        logger.debug("html iframe is: " + iframeHtml)
        if 'adca.st/broadcast/player' in iframeHtml:
            finalUrl = Decoder.decodeBroadcastst(iframeUrl, page)
        elif 'vaughnlive.tv/embed/video/' in iframeUrl:
            finalUrl = Decoder.decodeVaughnlivetv(iframeUrl, page)
    # NOTE(review): finalUrl is unbound if neither branch matched — confirm callers
    logger.debug("done!")
    return finalUrl
def decrypt(encrypted):
    """Best-effort decrypt of a Splive payload.

    Offline builds decrypt locally with PBEWithMD5AndDES; online builds
    delegate to the remote decoder service. Plain http:// links and empty
    payloads pass through untouched, as does anything that fails to decrypt.
    """
    decrypted = encrypted
    try:
        logger.debug("Encrypted content is: " + encrypted)
        if not ONLINE:
            decrypted = PBEWithMD5AndDES.decrypt(encrypted, Spliveappcom.PASSWORD)
        elif encrypted and "http://" not in encrypted:
            decrypted = Downloader.getContentFromUrl(
                Spliveappcom.DECODER_URL + '?data=' + encrypted +
                "&key=" + Spliveappcom.PASSWORD + "&iterations=1000")
        logger.debug("Decrypted content is: " + decrypted)
    except:
        # Deliberate best-effort: on any failure fall back to the raw input.
        logger.error("Could not be unencrypted: " + encrypted)
    return decrypted
def decodeOpenload(link):
    """Resolve an openload.io file link through the public dlticket API.

    Visits the embed page first (to collect cookies), requests a download
    ticket, honours the mandated wait time, then exchanges the ticket for
    the direct file URL.
    """
    # Warm up cookies through the embed page.
    mediaId = Decoder.extract("/f/", "/", link)
    embedUrl = "https://openload.io/embed/" + mediaId
    html = Downloader.getContentFromUrl(embedUrl, "", "", "", False, False)
    logger.info("html is: " + html)
    logger.debug("using cookie 1: " + Downloader.cookie)
    logger.debug("Media id for openload is: " + mediaId)
    # Anonymous API credentials; sending them skips the captcha step.
    extra = "&login=f750b26513f64034&key=oaA-MbZo" # this avoid captcha petition
    ticketUrl = "https://api.openload.io/1/file/dlticket?file=" + mediaId + extra
    ticketRaw = Downloader.getContentFromUrl(ticketUrl, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData: " + ticketRaw)
    ticketJson = json.loads(ticketRaw)
    # The API enforces a server-side delay before the ticket is valid.
    logger.info("sleeping... " + str(ticketJson["result"]["wait_time"]))
    time.sleep(int(ticketJson["result"]["wait_time"]))
    dlUrl = "https://api.openload.io/1/file/dl?file=%s&ticket=%s" % (mediaId, ticketJson["result"]["ticket"])
    logger.debug("using cookie 2: " + Downloader.cookie)
    dlRaw = Downloader.getContentFromUrl(dlUrl, "", Downloader.cookie, embedUrl, True, False)
    logger.debug("jsonData 2: " + dlRaw)
    dlJson = json.loads(dlRaw)
    finalFile = dlJson["result"]["url"] + "?mime=true"
    logger.info("Built final link: " + finalFile)
    return finalFile
def extractSawlive(scriptSrc, cookie, iframeUrl):
    """Build a playable link from a sawlive.tv player script.

    Downloads the obfuscated script, decodes the real player URL, then
    extracts the SWFObject parameters ('file', optional 'streamer', swf
    path) and assembles either a direct http(s) link or an rtmp command
    line with playpath/swfUrl/pageUrl.
    """
    encryptedHtml = Downloader.getContentFromUrl(scriptSrc, "", cookie, iframeUrl)
    decryptedUrl = Decoder.decodeSawliveUrl(encryptedHtml)
    html3 = Downloader.getContentFromUrl(decryptedUrl, "", cookie, scriptSrc)
    # ok, now extract flash script content
    flashContent = Decoder.extract("var so = new SWFObject('", "</script>", html3)
    file = Decoder.extract("'file', '", "');", flashContent)
    rtmpUrl = ""
    # BUG FIX: the original compared find() against .1 (the float 0.1),
    # which wrongly skipped a 'streamer' entry at index 0; str.find()
    # signals absence with -1.
    if flashContent.find("'streamer', '") > -1:
        rtmpUrl = Decoder.extract("'streamer', '", "');", flashContent)
    swfUrl = "http://static3.sawlive.tv/player.swf" #default
    # The swf url is the text preceding the first quote of the SWFObject
    # call; keep the default if the page layout changed and no quote exists.
    quoteIdx = flashContent.find("'")
    if quoteIdx > -1:
        swfUrl = flashContent[:quoteIdx]
    logger.info("updated swf player to: " + swfUrl)
    if rtmpUrl == '' and file.find("http://") > -1:
        finalRtmpUrl = file  # it's a redirect with an .m3u8, so it's used directly
    else:
        finalRtmpUrl = rtmpUrl + " playpath=" + file + " swfUrl=" + swfUrl + " live=1 conn=S:OK pageUrl=" + decryptedUrl + " timeout=12"
    return finalRtmpUrl
def getChannels(page):
    """List Redeneobux entries.

    page == '0' scrapes the channel cards from the listing page; any other
    value is treated as a detail page whose adf.ly-protected m3u list is
    resolved and parsed into final stream entries.
    """
    entries = []
    if page == '0':
        page = RedeneobuxCom.LIST_PAGE
        listingHtml = RedeneobuxCom.getContentFromUrl(page)
        # Skip the prefix before the first card, then parse each card.
        for card in listingHtml.split('<div class="media">')[1:]:
            thumb = Decoder.extract('<img src=\'', "'", card)
            target = Decoder.extract('location.href=\'', "'", card)
            caption = Decoder.extract('\' alt=\'', "'", card)
            if "http" in target:
                logger.debug("appending result: "+caption+", url: "+target)
                entries.append({"title": caption, "link": target, "thumbnail": thumb})
    else:
        content = RedeneobuxCom.getContentFromUrl(url=page, referer=RedeneobuxCom.LIST_PAGE)
        logger.debug("list content is: " + content)
        url = Decoder.extractWithRegex('http', " ", content).replace(" ", "")
        logger.debug("url is: " + url)
        if 'adf' in url:
            listUrl = Decoder.decodeAdfly(url)
            logger.debug("list obtained is: "+listUrl)
            m3uContent = Downloader.getSimpleDownload(listUrl)  # simple urllib2 download
            logger.debug("content: "+m3uContent)
            # Skip the m3u header, then one entry per #EXTINF block.
            for block in m3uContent.split('#EXTINF:')[1:]:
                caption = Decoder.extract(',', '\n', block)
                block = block[block.find("\n"):]
                streamUrl = Decoder.extractWithRegex('http://', "\n", block).replace('\n', '')
                item = {}
                item["title"] = caption
                item["link"] = streamUrl
                item["thumbnail"] = ''
                item["finalLink"] = True
                if "://" in streamUrl:
                    logger.debug("added: " + caption + ", content: " + streamUrl)
                    entries.append(item)
    return entries
def extractElementsSearch(html):
    """Map fildo.net search-suggestion JSON to menu entries.

    Each result becomes a dict with 'title' and 'link'; 'Songs' results
    additionally resolve the mp3 URL and carry a 'thumbnail'.
    """
    jsonContent = json.loads(html)
    x = []
    for jsonValues in jsonContent:
        element = {}
        element["title"] = jsonValues["label"]+" - "+jsonValues["category"]
        # BUG FIX: link was unbound (NameError) for an unknown first
        # category, and silently reused the previous iteration's value
        # for later unknown categories; reset it per element.
        link = ""
        if jsonValues["category"] == 'Artists':
            link = base64.standard_b64encode(Fildonet.ARTIST+str(jsonValues["label"]))
        elif jsonValues["category"] == 'Albums':
            link = base64.standard_b64encode(Fildonet.ALBUM+str(jsonValues["id"]))
        elif jsonValues["category"] == 'Songs':
            songId = jsonValues["id"]
            html2 = Downloader.getContentFromUrl(Fildonet.SONG+str(songId))
            songsJSONS = json.loads(html2)
            # Keeps the last song of the payload, as the original did.
            for songsJSON in songsJSONS:
                link = songsJSON["mp3Url"]
                element["thumbnail"] = songsJSON["picUrl"]
        element["link"] = link
        x.append(element)
    return x
def decodeKeepVid(link):
    """Pick a download link for *link* via keepvid.com.

    Prefers a 1080p entry with audio, then 720p with audio; otherwise
    falls back to the first non-matching candidate. Returns "" when the
    page yields no links at all.
    """
    html = Downloader.getContentFromUrl("http://keepvid.com/?url="+urllib.quote_plus(link))
    tableHtml = Decoder.extract('<ul><li>', "</ul>", html)
    logger.debug("extracting from html: "+tableHtml)
    links = []
    selectedLink = ""
    for liHtml in tableHtml.split('</li>'):
        link = Decoder.extract('a href="', '"', liHtml)
        title = Decoder.extract('alt="', '"', liHtml)
        if "1080p" in title and '(Video Only)' not in title:
            selectedLink = link
        elif len(selectedLink) == 0 and "720p" in title and '(Video Only)' not in title:
            selectedLink = link
        else:
            logger.debug("No link selected with title: "+title)
            logger.debug("url at this moment is (youtube external): " + link)
            links.append(link)
    # BUG FIX: guard the fallback — the original raised IndexError on
    # links[0] when the page layout changed and nothing was collected.
    if len(selectedLink) == 0 and len(links) > 0:
        selectedLink = links[0]
    return selectedLink
class TestDownloader(aiounittest.AsyncTestCase):
    """Async tests for Downloader.

    NOTE(review): every test hits live hosts (example.com and a manhua CDN),
    so they require network access and the CDN URLs/keys may expire —
    presumably the @enter_session decorator supplies a live aiohttp session;
    confirm against its definition.
    """

    def setUp(self):
        # Start without a session; each test injects one via @enter_session.
        self.downloader = Downloader(None)

    @enter_session
    async def test_get_soup(self, session):
        # Happy path: parse a known page and check its <h1>.
        self.downloader.session = session
        soup = await self.downloader.get_soup("https://www.example.com")
        h1_text = soup.find("h1").text
        self.assertEqual(h1_text, "Example Domain")

    @enter_session
    async def test_get_soup_error(self, session):
        # A bad path must surface as RuntimeError.
        self.downloader.session = session
        with self.assertRaises(RuntimeError) as e:
            await self.downloader.get_soup("http://www.example.com/testse")

    @enter_session
    async def test_get_image(self, session):
        # Downloads one image and checks its exact byte size.
        self.downloader.session = session
        img_bytes = await self.downloader.get_img(
            "https://manhua1034-104-250-139-219.cdnmanhua.net/3/2800/1006905/1_1002.jpg?cid=1006905&key=9a12f75785ef4d8dc9fffcfa58f5e406&type=1"
        )
        self.assertEqual(len(img_bytes), 331566)

    @enter_session
    async def test_get_chapter_images(self, session):
        # Streams a two-image chapter and verifies count plus the first
        # image's payload size (results may arrive in any order).
        self.downloader.session = session
        count = 0
        urls = [
            "https://manhua1034-104-250-139-219.cdnmanhua.net/3/2800/1006905/1_1002.jpg?cid=1006905&key=9a12f75785ef4d8dc9fffcfa58f5e406&type=1",
            "https://manhua1034-104-250-139-219.cdnmanhua.net/3/2800/1006905/2_7528.jpg?cid=1006905&key=9a12f75785ef4d8dc9fffcfa58f5e406&type=1"
        ]
        async for img_dict in self.downloader.get_images(urls, ""):
            count += 1
            if img_dict["idx"] == 0:
                self.assertEqual(len(img_dict["message"]), 442088)
        self.assertEqual(count, 2)
def extractTargetVideo(link):
    """Turn a YouTube watch URL into a directly playable stream link.

    Tries the internal decrypter first; on failure mines the watch page
    for an HLS link and finally falls back to keepvid. For .m3u8 results
    the last https entry of the playlist is preferred. The returned link
    has ';'/'=' percent-escaped when needed by the player.
    """
    logger.debug("trying to decode with youtube link decrypter: " + link)
    videoCode = link[link.find("v=") + 2:]
    logger.debug("trying with code: " + videoCode)
    try:
        link = Decoder.downloadY(videoCode)
    except:
        # Fallback: fetch the watch page and mine it ourselves.
        html = Downloader.getContentFromUrl(link, referer=Youtube.MAIN_URL)
        oldLink = link
        if 'ytplayer.config = {' in html:
            logger.debug("trying new way for .m3u8 links...")
            link = urllib.unquote(Decoder.extract(',"hlsvp":"', '"', html).replace('\\', ''))
            logger.debug("new youtube extracted link from json is: " + link)
            # link += "|" + Downloader.getHeaders(oldLink)
        if "http" not in link:
            logger.debug("trying old second way: external resource...")
            link = Youtube.decodeKeepVid(oldLink)
    if ".m3u8" in link:
        playlist = Youtube.getContentFromUrl(link)
        if 'https://' not in playlist:
            logger.debug("no last one link selected :'(")
        else:
            lastEntry = playlist[playlist.rfind('https://'):]
            link = urllib.unquote_plus(lastEntry).strip()
            logger.debug("using the last one inside: "+lastEntry)
    else:
        logger.debug("nothing is transformed for youtube links.")
    logger.debug("final youtube decoded url is: " + link)
    if ";" in link:
        link = link.replace("=", "%3D").replace(";", "%3B")
    else:
        link = link.replace("%3D", "=")
    return link
def extractElementsSearch(html):
    """Convert fildo.net search JSON into menu entries.

    One dict per result with 'title' and 'link'; 'Songs' results also
    resolve the mp3 URL and carry a 'thumbnail'.
    """
    results = []
    for entry in json.loads(html):
        item = {}
        item["title"] = entry["label"] + " - " + entry["category"]
        category = entry["category"]
        if category == 'Artists':
            link = base64.standard_b64encode(Fildonet.ARTIST + str(entry["label"]))
        elif category == 'Albums':
            link = base64.standard_b64encode(Fildonet.ALBUM + str(entry["id"]))
        elif category == 'Songs':
            songId = entry["id"]
            songsPayload = Downloader.getContentFromUrl(Fildonet.SONG + str(songId))
            # Keeps the last song of the payload, matching the original.
            for songInfo in json.loads(songsPayload):
                link = songInfo["mp3Url"]
                item["thumbnail"] = songInfo["picUrl"]
        item["link"] = link
        results.append(item)
    return results
def start_file_downloading(self, source_path: str, path_to_save: str, filename: str):
    """Start an FTP download of *filename* from *source_path* into
    *path_to_save*, or queue the request if a transfer is in flight."""
    # Only one data-transfer process at a time: if one is running (or the
    # client cannot proceed), park the request in the queue and bail out.
    if not self.can_continue or self.dtp is not None:
        self.download_queue.put(
            ('down', (source_path, path_to_save, filename)))
        return
    logger.debug('file %s downloading started. destination: %s' %
                 (filename, path_to_save))
    self.pi.change_dir(source_path)
    downloader = Downloader(path_to_save, filename)
    self.dtp = downloader
    # When passive mode is negotiated, kick off the transfer; queued
    # connection so the slot runs in the receiver's thread.
    self.pi.passive_mode.connect(
        downloader.data_transfer_process.start_transfer,
        QtCore.Qt.QueuedConnection)
    # On completion: refresh the local model, clear the dtp slot, and let
    # Qt dispose of the downloader object.
    downloader.complete.connect(self.update_local_model)
    downloader.complete.connect(self.set_dtp_to_none)
    downloader.complete.connect(downloader.deleteLater)
    self.pi.initiate_passive_mode()
    self.pi.download_file(filename)
def decodeCastalbatv(url, page=''):
    """Build a playable link from a castalba.tv embed URL.

    HLS embeds return the .m3u8 file plus a Referer header; flash embeds
    are stitched into an rtmp command line (playpath/swfUrl/pageUrl).
    """
    channelId = url[url.find('cid=') + len('cid='):]
    if channelId.find("&") > -1:
        channelId = channelId[:channelId.find("&")]
    #iframeUrl = "http://castalba.tv/channel/"+channelId
    iframeUrl = url
    logger.debug("using referer: "+page)
    html = Downloader.getContentFromUrl(iframeUrl, '', "", page)
    streamLink = ""
    if html.find(".m3u8") > -1:
        # HLS variant: the player config carries a plain .m3u8 file.
        streamLink = Decoder.rExtract("'file': '", '.m3u8', html)
        logger.debug("detected castalba file: "+streamLink)
        if len(streamLink) > 0 and page != '':
            streamLink += "|Referer="+page
        else:
            streamLink += "|Referer="+streamLink
    else:
        # RTMP variant: assemble url, playpath and swf from the script.
        streamLink = Decoder.extract("var file = '", "'", html)
        flash = Decoder.extract("'flashplayer': \"", "\"", html)
        rtmpUrl = "rtmp://"+Decoder.extract("return '/", "';", html)
        playpath = streamLink+"?"+Decoder.extract("unescape('?", "'),", html)
        streamLink = rtmpUrl+" playpath="+playpath+" swfUrl="+flash+" live=1 pageUrl=http://castalba.tv/"
    logger.debug("final link from castalba is: "+streamLink)
    return streamLink
def search(text, page=0, cookie=''):
    """Search redmp3.cc for *text* and return the parsed result entries.

    NOTE(review): unquote_plus *decodes* the query before embedding it in
    the URL — presumably callers pass an already URL-encoded string;
    confirm against the callers.
    """
    searchUrl = "http://redmp3.cc/mp3-" + urllib.unquote_plus(text) + "/" + str(page)
    resultsHtml = Downloader.getContentFromUrl(searchUrl, "", cookie, "")
    return Redmp3cc.extractElementsPlayer(resultsHtml)
def download_from_url(url, item):
    """Download *url* to the configured download folder with a progress
    dialog, and return a dict describing the outcome (status, size,
    progress, final filename). Streaming URLs (m3u8/rtmp) are rejected."""
    logger.info("pelisalacarta.channels.descargas download_from_url - Intentando descargar: %s" % (url))
    # Streaming protocols cannot be saved as plain files.
    if url.lower().endswith(".m3u8") or url.lower().startswith("rtmp"):
        save_server_statistics(item.server, 0, False)
        return {"downloadStatus": STATUS_CODES.error}

    # Work out the destination folder and file name.
    download_path = filetools.dirname(filetools.join(config.get_setting("downloadpath"), item.downloadFilename))
    file_name = filetools.basename(filetools.join(config.get_setting("downloadpath"), item.downloadFilename))

    # Create the folder if it does not exist.
    if not filetools.exists(download_path):
        filetools.mkdir(download_path)

    # Show the progress dialog.
    progreso = platformtools.dialog_progress("Descargas", "Iniciando descarga...")

    # Launch the download.
    d = Downloader(url, filetools.encode(download_path), filetools.encode(file_name))
    d.start()

    # Poll the download until it finishes or the user cancels.
    while d.state == d.states.downloading and not progreso.iscanceled():
        time.sleep(0.1)
        line1 = "%s" % (filetools.decode(d.filename))
        line2 = "%.2f%% - %.2f %s de %.2f %s a %.2f %s/s (%d/%d)" % (
            d.progress, d.downloaded[1], d.downloaded[2], d.size[1], d.size[2],
            d.speed[1], d.speed[2], d.connections[0], d.connections[1])
        line3 = "Tiempo restante: %s" % (d.remaining_time)
        progreso.update(int(d.progress), line1, line2, line3)

    # Download stopped — inspect the final state.
    # An error occurred during the download.
    if d.state == d.states.error:
        logger.info("pelisalacarta.channels.descargas download_video - Error al intentar descargar %s" % (url))
        d.stop()
        progreso.close()
        status = STATUS_CODES.error
    # Still in 'downloading' state: the user clicked cancel.
    elif d.state == d.states.downloading:
        logger.info("pelisalacarta.channels.descargas download_video - Descarga detenida")
        d.stop()
        progreso.close()
        status = STATUS_CODES.canceled
    # The download finished.
    elif d.state == d.states.completed:
        logger.info("pelisalacarta.channels.descargas download_video - Descargado correctamente")
        progreso.close()
        status = STATUS_CODES.completed
        # A size mismatch against the expected size counts as an error.
        if item.downloadSize and item.downloadSize != d.size[0]:
            status = STATUS_CODES.error

    save_server_statistics(item.server, d.speed[0], d.state != d.states.error)

    if progreso.iscanceled():
        status = STATUS_CODES.canceled

    dir = os.path.dirname(item.downloadFilename)
    file = filetools.join(dir, filetools.decode(d.filename))

    # Completed downloads are moved into the library.
    if status == STATUS_CODES.completed:
        move_to_libray(item.clone(downloadFilename=file))

    return {"downloadUrl": d.download_url, "downloadStatus": status, "downloadSize": d.size[0],
            "downloadProgress": d.progress, "downloadCompleted": d.downloaded[0], "downloadFilename": file}
def setUp(self):
    # Create the downloader under test without a session — presumably a
    # live session is injected per-test (e.g. by a decorator); confirm.
    self.downloader = Downloader(None)
class Core():
    """Crawler engine: wires spiders, scheduler, downloader, middlewares
    and pipelines together and drives the request/response/item loop."""

    def __init__(self):  #, spider_group, task_gettter):
        # self.spider_group = spider_group
        # self.task_getter = task_gettter
        self.spiders = self._auto_import_cls(SPIDERS, True)      # {spider_name: spider instance}
        self.pool = Pool()
        self.pipelines = self._auto_import_cls(PIPELINES)
        self.spider_mids = self._auto_import_cls(SPIDER_MIDDLEWARES)
        #self.downloader_mids = downloader_mids
        self.downloader_mids = self._auto_import_cls(DOWNLOADER_MIDDLEWARES)
        self.scheduler = Scheduler(ROLE, QUEUE_TYPE)
        self.downloader = Downloader()
        # self.spider_mids = spider_mids
        self.spider_mids = self._auto_import_cls(SPIDER_MIDDLEWARES)
        self.is_running = True
        self.total_response = 0   # count of fully processed responses
        self.executor = BaseThreadPoolExecutor(max_workers=ASYNC_COUNT)

    def _auto_import_cls(self, path_list=[], is_spider=False):
        """Import and instantiate the classes named by dotted paths.

        Returns a {spider.name: instance} dict for spiders, otherwise a
        list of instances (pipelines / middlewares).
        NOTE(review): the mutable default [] is shared across calls —
        harmless here because it is never mutated, but fragile.
        """
        if is_spider:
            instances = {}
        else:
            instances = []
        import importlib
        for path in path_list:
            if is_spider:
                # Spiders live under the crawlers package.
                module_name = 'crawlers.' + path[:path.rfind(".")]
                class_name = path[path.rfind(".") + 1:]
                result = importlib.import_module(module_name)
                cls = getattr(result, class_name)
                instances[cls.name] = cls()
                print(f'爬虫“{cls.name}”已加载')
            else:
                module_name = path[:path.rfind(".")]
                class_name = path[path.rfind(".") + 1:]
                result = importlib.import_module(module_name)
                cls = getattr(result, class_name)
                instances.append(cls())
                print(f'“{cls.__name__}”已加载')
        return instances

    def _start_engine(self):
        # The master role only enqueues requests, so total_request grows,
        # but it does not send them, so total_response does not.
        if ROLE == "master" or ROLE is None:
            # Part 1 of the engine's work: push start_requests into the
            # scheduler (done asynchronously on the pool).
            #self._execute_start_requests()
            self.pool.apply_async(self._execute_start_requests)
        while 1:
            time.sleep(0.01)
            li_req = self.scheduler.get_batch_requests(ASYNC_COUNT)
            if not li_req:
                continue
            # Fan the batch out over the thread pool and wait for it.
            tasks = [
                self.executor.submit(self._execute_request_return_item, req)
                for req in li_req
            ]
            for fu in as_completed(tasks):
                fu.result()
            # All enqueued requests answered -> shut down.
            if self.scheduler.total_request == self.total_response and self.scheduler.total_request != 0:
                self.is_running = False
                break
        print("Main Thread is over!")

    # def _callback(self, _):
    #     if self.is_running:
    #         self.pool.apply_async(self._execute_request_response_item, callback=self._callback)

    def start(self):
        """Run the engine and report wall-clock timing."""
        # start time
        start = datetime.now()
        print("Start time : {}".format(start))
        print("----" * 30)
        self._start_engine()
        # end time
        end = datetime.now()
        print("----" * 30)
        print("End time : {}".format(end))
        # total running time
        print("Useing time : {}".format((end - start).total_seconds()))

    def _execute_start_requests(self):
        # Push the start_urls requests of every spider into one shared
        # scheduler.
        #[("baidu", baidu_spider), ("douban" : douban_spider)]
        for spider_name, spider in self.spiders.items():
            print(spider_name, spider)
            # 1. Take the first batch of requests from the spider and hand
            # them to the scheduler.
            #request = self.spider.start_requests()
            for request in spider.start_requests():
                # Tag the request with its spider name on first handling so
                # the name propagates to requests extracted later.
                request.spider_name = spider_name
                # 1.1 Let the spider middlewares preprocess the request.
                for spider_mid in self.spider_mids:
                    request = spider_mid.process_request(request, spider)
                self.scheduler.add_request(request)

    def _execute_request_response_item(self):
        # Each call processes a single request of a single spider.
        #while True:
        # 2. Pop a request from the scheduler, download it, and hand the
        # response to the spider for parsing.
        request = self.scheduler.get_request()
        if not request:
            #break
            return
        # Look up the spider this request belongs to.
        spider = self.spiders[request.spider_name]
        # 2.1 Downloader middlewares preprocess the request.
        for downloader_mid in self.downloader_mids:
            request = downloader_mid.process_request(request, spider)
        response = self.downloader.send_request(request)
        # 2.2 Downloader middlewares postprocess the response.
        for downloader_mid in self.downloader_mids:
            response = downloader_mid.process_response(response, spider)
        # Hand the response to the spider for parsing.
        # parse_func = spider.parse(response)  # e.g. parse, parse_page
        #getattr(spider, "parse_page")
        # Dynamically resolve the callback named by the request and feed it
        # the response.
        callback_func = getattr(spider, request.callback)
        parse_func = callback_func(response)
        for item_or_request in parse_func:
            # 3. Dispatch the parse result: requests go back to the
            # scheduler; items go through the pipelines.
            if isinstance(item_or_request, LRequest):
                item_or_request.spider_name = spider.name
                for spider_mid in self.spider_mids:
                    item_or_request = spider_mid.process_request(
                        item_or_request, spider)
                self.scheduler.add_request(item_or_request)
            elif isinstance(item_or_request, Item):
                for spider_mid in self.spider_mids:
                    item_or_request = spider_mid.process_item(
                        item_or_request, spider)
                for pipeline in self.pipelines:
                    item_or_request = pipeline.process_item(
                        item_or_request, spider)
            else:
                raise Exception("Not support data type : <{}>".format(
                    type(item_or_request)))
        self.total_response += 1

    def _execute_request_return_item(self, request: LRequest):
        """Download one request and dispatch its parse results; download or
        parse failures are logged via the spider's logger and swallowed."""
        if not request:
            return
        spider = self.spiders[request.spider_name]
        for downloader_mid in self.downloader_mids:
            request = downloader_mid.process_request(request, spider)
        try:
            response = self.downloader.send_request(request)
        except Exception as e:
            spider.logger.error(f'链接{request.url}出错:' + str(e))
            return
        for downloader_mid in self.downloader_mids:
            response = downloader_mid.process_response(response, spider)
        callback_func = getattr(spider, request.callback)
        try:
            parse_func = callback_func(response)
            for item_or_request in parse_func:
                if isinstance(item_or_request, LRequest):
                    item_or_request.spider_name = spider.name
                    for spider_mid in self.spider_mids:
                        item_or_request = spider_mid.process_request(
                            item_or_request, spider)
                    self.scheduler.add_request(item_or_request)
                elif isinstance(item_or_request, Item):
                    for spider_mid in self.spider_mids:
                        item_or_request = spider_mid.process_item(
                            item_or_request, spider)
                    for pipeline in self.pipelines:
                        item_or_request = pipeline.process_item(
                            item_or_request, spider)
                else:
                    raise Exception("Not support data type : <{}>".format(
                        type(item_or_request)))
        except Exception as e:
            spider.logger.error(f'解析{request.url}出错:' + str(e) + f'响应码[{response.status_code}]')
            return
        self.total_response += 1
def getChannels(page):
    """Browse zonasports.me.

    page '0' renders the main menu; any other value is a base64-encoded
    channel URL that is fetched and resolved — via one of several
    embedded-player decoders — into a single playable 'Stream' entry.
    """
    x = []
    logger.debug("page is: "+page)
    if str(page) == '0':
        page = Zonasportsme.MAIN_URL
    else:
        logger.debug("decoding page: "+page)
        page = base64.b64decode(page)
        logger.debug("decoded page: "+page)
    logger.debug("launching web petition to page: "+page)
    html = Zonasportsme.getContentFromUrl(page, "", Zonasportsme.cookie, Zonasportsme.MAIN_URL)
    if page == Zonasportsme.MAIN_URL:
        logger.debug("browsing main menu...")
        menu = Decoder.extract('<ul class="nav" id="main-menu">', "</li></ul></li></ul>", html)
        x = Zonasportsme.extractElements(menu)
    else:
        url = ""
        # decoder part: detect which player the channel page embeds and
        # resolve it to a playable url
        if 'http://www.ustream.tv/' in html:
            uStreamUrl = Decoder.extractWithRegex('http://www.ustream.', '"', html)
            url = Decoder.getUstreamLink(uStreamUrl, page)
        elif 'castamp.com/embed.js' in html:
            channel = Decoder.extract('channel="', '"', html)
            url = Decoder.getCastcampLink(channel, page)
        elif 'adca.st/broadcast/player.js' in html:
            if "<script type='text/javascript'>id='" in html:
                id2 = Decoder.extract("<script type='text/javascript'>id='", "';", html)
                logger.debug("using id = " + id2)
                url4 = "http://bro.adca.st/stream.php?id=" + id2 + "&width=700&height=450&stretching=uniform"
                html4 = Zonasportsme.getContentFromUrl(url4, "", Zonasportsme.cookie, page)
                logger.debug("html4: " + html4)
                # curl is a base64 rtmp prefix; fn names the token endpoint.
                curl = Decoder.rExtract('= "', '=";', html4)+'='
                fn = Decoder.rExtract('"', '.php";', html4)
                token = Zonasportsme.getContentFromUrl('http://bro.adca.st/'+fn+'.php', "", Zonasportsme.cookie, url4, True)
                logger.debug("token: " + token)
                token = Decoder.extract('":"', '"', token)
                file = base64.decodestring(curl) + token + "|" + Downloader.getHeaders('http://cdn.allofme.site/jw/jwplayer.flash.swf')
                logger.debug("final url is: " + file)
                url = file
        elif 'zony.tv/static/scripts/zony.js' in html:
            channel = Decoder.extract("channel='", "'", html)
            url = 'http://www.zony.tv/embedplayer/'+channel+'/1/700/400/'
            html2 = Zonasportsme.getContentFromUrl(url=url, referer=page)
            logger.debug("html2 is: "+html2)
            #newParam = Decoder.extract("so.addParam('FlashVars', '", "'", html2)
            # brute params, needs a sort
            newParam = Decoder.extractParams(html2)
            # NOTE(review): hard-coded rtmp host — presumably zony.tv's
            # streaming server at the time of writing; confirm it still holds.
            rtmp = "rtmp://146.185.16.62/stream playPath="+newParam+" swfVfy=1 timeout=10 conn=S:OK live=true swfUrl=http://www.zony.tv/static/scripts/fplayer.swf flashver=WIN/2019,0,0,226 pageUrl="+page
            url = rtmp
        elif 'http://www.embeducaster.com/static/' in html:
            channel = Decoder.extract("channel='", "'", html)
            url = 'http://www.embeducaster.com/embedplayer/' + channel + '/1/700/400/'
            html2 = Zonasportsme.getContentFromUrl(url=url, referer=page)
            logger.debug("html2 is: " + html2)
            url = Decoder.decodeUcaster(html2, url)
        elif '247bay.tv/static/' in html:
            channel = Decoder.extract("channel='", "'", html)
            url = 'http://www.247bay.tv/embedplayer/'+channel+'/2/750/420'
            url = Decoder.decode247bay(url, page)
        # Single playable entry for the resolved channel.
        element = {}
        element["title"] = "Stream"
        element["link"] = url
        element["permaLink"] = True
        x.append(element)
    return x
class Engine(object):
    """Threaded crawler engine (Python 2).

    Seeds come from a Google search on the configured keywords; two checker
    threads shuttle Html tasks between a download pool and a parse pool,
    and a status thread posts runtime counters to a remote MySQL every
    second. Candidate URLs are filtered by a chain of strategy handlers
    (scheme, bookmark, CGI, nesting level, file type, revisit, robots).
    """

    def __init__( self):
        self._istart = False          # engine run flag checked by worker loops
        self._status = Status()       # shared runtime counters
        """--- load config file----"""
        self._config = Configuration();
        """--- core object ----"""
        self._downloader = None
        self._parser = None
        """--- memory models --- """
        self._download_pool = SafeQueue() #Store the html objects to be downloaded by the downloader
        self._parse_pool = SafeQueue() #Store the html objects to be parsed by the parser
        """--- checker threads --- """
        """The target is the function passed in to run in the thread. Those two threads keep checking and assigning jobs to the two thread pools"""
        self._downloader_pool_checker = Thread( target=self.download_pool_checker)
        self._parse_pool_checker = Thread( target=self.parse_pool_checker)
        """--- threads --- """
        self._status_update = Thread( target=self.status_update) #every second, this thread post runtime info to remote mysql
        """ ---strategies--- """
        self._earlyvisithandler = EarlyVisitHandler()
        self._robothandler = RobotHandler()
        self._cgihandler = CGIHandler()
        self._nestlevelhandler = NestLevelHandler()
        self._schemehandler = SchemeHandler()
        self._filetypehandler = FileTypeHandler()
        self._bookmarkhandler = BookMarkHandler()
        self._omitindex = OmitIndex()
        self._urlextender = URLExtender()
        """ ---init the path for saving data, if the folder don't exist, create it ---"""
        self._path = self._config._down_path+"/"+ strftime('%Y-%m-%d', localtime())+"/"+ strftime('%H-%M-%S', localtime())+"/"
        if not os.path.exists(self._path):
            os.makedirs(self._path)
        self._config._down_path = self._path
        self._keywords_links= []      # seed URLs from the Google search
        """ ---Mysql Manager--- """
        self.sqlex = DatabseManager(self._config)
        #self.f= open("data.txt", 'w')

    def load_seeds(self):
        """Fetch seed URLs from Google and push the ones that pass the
        strategy filters into the download pool."""
        #load seed info from config file
        #print "load_seeds 1"
        #load seed from
        contacter = SearchGoogle(self._config._keywords, self._config._result_num)
        self._keywords_links = contacter.getURLs()
        #append seeds, which from google search result, into download pool
        #print "load_seeds 2"
        #self._keywords_links.insert(0, "https://twitter.com/")
        #self._keywords_links.insert(0, "https://twitter.com/signup?context=login")
        i = 0
        for url in self._keywords_links:
            if i < self._config._result_num:
                #print "@@{0}".format(url)
                html_task = Html(url)
                #print "@@1"
                # Filter chain: each rejected seed bumps its own counter.
                if(self._schemehandler.SchemeChecker(html_task)==False):
                    #print "@@2"
                    self._status._scheme+=1
                    continue
                if(self._bookmarkhandler.BookMarkChecker(html_task)==True):
                    #print "@@3"
                    self._status._bookmark+=1
                    continue
                if(self._cgihandler.FindCGI(html_task)==True):
                    #print "@@4"
                    self._status._cgi+=1
                    continue
                if(self._nestlevelhandler.checknestlevel(html_task,self._config._parser_nlv)==True):
                    self._status._nestlv +=1
                    #print "@@5"
                    continue
                if(self._filetypehandler.FileTypeChecker(html_task)==False):
                    #print "@@6"
                    self._status._file_type +=1
                    continue
                #print "@@7"
                '''
                if(self._earlyvisithandler.check_visited(html_task) == True):
                    self._status._early_visit +=1
                    continue
                '''
                self._omitindex.Omit(html_task)
                """
                print "@@8"
                if(self._robothandler.is_allowed(html_task) == False):
                    print "@@9"
                    self._status._robot +=1
                    continue
                print "@@10"
                """
                # Record the seed as visited and queue it for download.
                self._earlyvisithandler.add_entry(html_task._md5, html_task)
                self._download_pool.append(html_task)
                '''If use the following two line of code, then the program won't run, which means checking for revisit works'''
                '''however, the dic should be safe with a lock'''
                #self._visited_dic[html_task._md5] = html_task._url
                #print(len(self._visited_dic))
                #print "@@11"
            else:
                break
            i+=1
        #print "load_seeds 3"

    def show_welcome(self):
        """Print the run configuration and the loaded seed list."""
        print("download folder:"+self._path)
        print "key words:"+self._config._keywords
        print "donload thread num: {0}".format(self._config._down_num)
        print "parse thread num: {0}".format(self._config._parser_num)
        print "Load " +str(self._config._result_num)+" results from google search:"
        i = 0
        for url in self._keywords_links:
            if i < self._config._result_num:
                print ("[{0}]".format(i)+url)
            i+=1
        print "\n------------------------------------------------------------------------\n"
        #raw_input("press any key to start crawling, press second key to stop")

    def wait_for_start(self):
        """Block until the remote MySQL flag says the run may start."""
        print "ready for start....."
        print "go to http://dengxu.me/crawling/ to input some key words & see the result "
        while( self.sqlex.read_if_start(self._config)!= True):
            sleep(1)
        print "\n------------------------------------------------------------------------\n"
        print "starting crawling engine...."

    def start(self):
        """Wait for the go signal, load seeds, and start all worker threads."""
        try:
            self.wait_for_start()
            self._istart = True
            """load seed """
            self.load_seeds()
            """show welcome info"""
            self.show_welcome()
            self._status._sys_start = time()
            """start threads"""
            self._downloader = Downloader( self._config._down_num, self._status)
            self._downloader.start()
            self._parser = Parser(self._config._parser_num, self._status )
            self._parser.start()
            self._downloader_pool_checker.start()
            self._parse_pool_checker.start()
            self._status_update.start()
            """notify mysql, i am started"""
            self.sqlex.write_if_start()
        except (Exception) as e:
            Log().debug("start failed")
            raise(e)
            # NOTE(review): unreachable after the raise above.
            return False

    def stop(self):
        """Drain the pools and join every worker thread."""
        self._istart = False
        """"clear download and parse popl"""
        self._download_pool.clear()
        self._parse_pool.clear()
        """stop downloader and parser threads"""
        self._downloader.stop()
        self._parser.stop()
        """"Those two checker threads will end when the thread who calls them ends"""
        self._downloader_pool_checker.join()
        self._parse_pool_checker.join()
        self._status_update.join()
        print ("Engine is stopping")

    def pause(self):
        # Not implemented.
        pass

    def finish_download(self, html_task):
        """Downloader callback: forward the downloaded task to the parse pool."""
        sentence = "Downloaded:[No.{0}] time:{1:0.1f} page:depth_parent {2}_{3} http-code: {4} data-size: {5}byes url: {6}"\
            .format(self._status._download_times,time()-self._status._sys_start,html_task._depth,\
            html_task._parent,html_task._return_code, html_task._data_size, html_task._url )
        #if self._status._download_times <= 500 :
        #    self.f.write(sentence+"\n")
        """caculate the path for saving files"""
        full_path = self._path+"[No.{0}]_".format(self._status._download_times)+".html"
        """save html data to files"""
        #f= open(full_path, 'w')
        #f.write(html_task._data)
        #f.close()
        """After downloading, pass the data(still using the html objects) to the parse pool"""
        self._parse_pool.append(html_task)

    def finish_parse(self, html_task):
        """Parser callback: re-queue the task for download unless it was
        already visited or robots.txt forbids it."""
        """After parsing, pass the urls to be downloaded to the download pool"""
        if(self._earlyvisithandler.check_visited(html_task) == True):
            self._status._early_visit +=1
            return
        if(self._robothandler.is_allowed(html_task) == False):
            self._status._robot +=1
            return
        self._earlyvisithandler.add_entry(html_task._md5, html_task)
        self._download_pool.append(html_task)

    def download_pool_checker(self):
        """Thread loop: feed pending tasks from the download pool to the
        downloader, sleeping briefly when the pool is empty."""
        while (self._istart == True):
            new_download_task = self._download_pool.pop_left()
            """If there is no task remain in the download pool, put the thread into sleep"""
            """else pop the new task, and download it"""
            """for the engine to get the result to put into the parse pool, we need to pass the function finish_download down as a callback"""
            if (new_download_task == None):
                #print("No task remaining in download_pool")
                sleep(0.1)
            else:
                self._downloader.queue_download_task(new_download_task , self.finish_download)

    def parse_pool_checker(self):
        """Thread loop: feed downloaded tasks from the parse pool to the
        parser, sleeping briefly when the pool is empty."""
        while (self._istart == True):
            new_parse_task = self._parse_pool.pop_left()
            if (new_parse_task == None):
                #print("sleeping")
                sleep(0.1)
            else:
                self._parser.queue_parse_task(new_parse_task, self.finish_parse)

    #~~~see result at http://dengxu.me/crawling/
    def status_update(self):
        """Thread loop: once per second, print the counters and push status
        plus the recent-download list to the remote MySQL."""
        while (self._istart == True):
            self._status._download_queue = self._downloader.len()
            self._status._parse_queue = self._parser.len()
            sentence = "[time: {0:0.1f}],queue:{8}, down: {1}, total: {2:0.1f}MB | queue:{9}, parsed: {3},scheme:{10}, cig: {4}, bookmark: {11} type {12} visited: {5}, robot: {6},nestlv: {7} | error: 404: {13} , timeout: {14}"\
                .format( time()-self._status._sys_start,\
                self._status._download_times, float(self._status._download_size)/1024/1024, self._status._parse_times\
                ,self._status._cgi, self._status._early_visit, self._status._robot, self._status._nestlv\
                ,self._downloader.len(), self._parser.len(),self._status._scheme_type, self._status._bookmark, self._status._file_type\
                ,self._status._404,self._status._socket_timeout)
            print sentence
            #if( self._status._download_times > 500):
            #    self.f.write( sentence+"\n")
            """update status tp mysql"""
            self.sqlex.write_status(self._status)
            """update recent download url"""
            self.sqlex.write_recent_download(self._status)
            sleep(1)