Exemplo n.º 1
0
    def check_useful_task(self):
        """Continuously re-validate proxy records from the temp buffer.

        Pops serialized records from the DB temp buffer and, while their
        http (index 4) / https (index 5) error counters are below
        MAX_ERROR_NUM, schedules re-checks via start_one_check. Finished
        checks are harvested from check_buffer: a successful probe promotes
        the record back into the http/https pool with a reset counter, a
        failed one increments the counter and requeues the record.
        """
        check_buffer = []
        count = 0
        while True:
            count += 1
            data = self.db.pop_temp_buffer()
            if data:
                # SECURITY NOTE(review): eval() executes arbitrary code if the
                # buffer is ever tainted -- ast.literal_eval would be safer.
                eval_data = eval(data)
                if eval_data[4] < self.MAX_ERROR_NUM and eval_data[
                        5] < self.MAX_ERROR_NUM:
                    # BUG FIX: the original tested `count // 2`, which is
                    # truthy for every count >= 2, so https was only probed on
                    # the very first iteration. `count % 2` restores the
                    # intended http/https alternation.
                    if count % 2:
                        msgs = self.start_one_check(check_buffer, eval_data,
                                                    'http')
                    else:
                        msgs = self.start_one_check(check_buffer, eval_data,
                                                    'https')
                else:
                    if eval_data[4] < self.MAX_ERROR_NUM:
                        msgs = self.start_one_check(check_buffer, eval_data,
                                                    'http')
                    elif eval_data[5] < self.MAX_ERROR_NUM:
                        msgs = self.start_one_check(check_buffer, eval_data,
                                                    'https')
                    else:
                        continue
                downloader(msgs)
            else:
                # BUG FIX: iterate over a snapshot -- the original removed
                # items from check_buffer while iterating it, silently
                # skipping the element after each removal.
                for item in list(check_buffer):
                    if not item[1].empty():
                        res = item[1].get()
                        if res:
                            res.encoding = res.apparent_encoding
                            if item[2] == 'http':
                                if re.findall("<title>腾讯首页</title>", res.text):
                                    item[0][4] = self.MAX_ERROR_NUM
                                    self.db.adds_http_pool(
                                        ((*item[0][:4], 0, item[0][6]), ))
                                else:
                                    item[0][4] += 1
                            elif item[2] == 'https':
                                if re.findall("<title>百度一下,你就知道</title>",
                                              res.text):
                                    item[0][5] = self.MAX_ERROR_NUM
                                    self.db.adds_https_pool(
                                        ((*item[0][:4], 0, item[0][6]), ))
                                else:
                                    item[0][5] += 1
                        else:
                            # No response at all: count it against the probed
                            # scheme's error counter.
                            if item[2] == 'http':
                                item[0][4] += 1
                            elif item[2] == 'https':
                                item[0][5] += 1

                        # Still under the error limit on either scheme:
                        # requeue for another round.
                        if item[0][4] < self.MAX_ERROR_NUM or item[0][
                                5] < self.MAX_ERROR_NUM:
                            self.db.adds_temp_buffer((item[0], ))

                        check_buffer.remove(item)
                time.sleep(0.01)
Exemplo n.º 2
0
def engine(initUrls):
    """Drive the crawl for *initUrls* and print a success summary."""
    total_pages = manager(initUrls)

    downloader()

    succeeded = int(redis.get('success'))
    rate = succeeded / total_pages
    print(f'success_page: {succeeded}')
    print(f'total_pages: {total_pages}')
    print('success_rate: %0.2f%%' % (rate * 100))
Exemplo n.º 3
0
 def start(self):
     """Crawl the xicidaili page URLs through an http proxy and yield
     every item parsed from each downloaded page."""
     self.wait_proxy_ready()
     # BUG FIX: get_page_urls returns a generator; the original called
     # print(list(urls)) first, which exhausted it, so the msgs
     # comprehension below always iterated an empty sequence.
     urls = list(self.get_page_urls())
     print(urls)
     _headers = self.headers
     _headers['User-Agent'] = self.ua.random
     msgs = [(requests.get, self.queue, url,
              {'headers': _headers, 'timeout': 10,
               'proxies': {'http': self.get_http_proxy()}}) for url in urls]
     download.downloader(msgs)
     print('xicidaili ---> start')
     # One queue result is expected per submitted request.
     for _ in range(len(msgs)):
         res = self.queue.get()
         if res:
             res.encoding = res.apparent_encoding
             yield from self.handle_html(res.text)
Exemplo n.º 4
0
 def start(self):
     """Fetch every page URL through the download queue and yield each
     item parsed from the responses."""
     urls = self.get_page_urls()
     _headers = self.headers
     _headers['User-Agent'] = self.ua.random
     msgs = [(requests.get, self.queue, url, {
         'headers': _headers,
         'timeout': 10
     }) for url in urls]
     download.downloader(msgs)
     # FIX: the original reused `i` for both the result-count loop and the
     # parsed-item loop, shadowing the outer variable; one queue result is
     # expected per submitted request.
     for _ in range(len(msgs)):
         res = self.queue.get()
         if res:
             res.encoding = res.apparent_encoding
             yield from self.handle_html(res.text)
def download():
    """Flask view: download the file named in the 'download_link' form
    field and return its location, or the failure page for an empty link."""
    url = request.form.get('download_link')
    # BUG FIX: the original called downloader() before validating url, so a
    # missing/empty link still triggered a download attempt (with a falsy
    # URL) before the failure page was returned.
    if not url:
        return render_template('failure.html')
    return downloader(url, DOWNLOADS_FOLDER)
Exemplo n.º 6
0
 def proxy_requests(self, url):
     """Request *url* repeatedly -- through an http proxy when one is
     available -- until a response arrives, reporting failed proxies."""
     while True:
         _headers = self.headers
         _headers['User-Agent'] = self.ua.random
         proxy = self.get_http_proxy()
         # Build the request kwargs once; only add proxies when we got one.
         kwargs = {'headers': _headers, 'timeout': 10}
         if proxy:
             kwargs['proxies'] = {'http': proxy}
         download.downloader([(requests.get, self.queue, url, kwargs)])
         res = self.queue.get()
         if res:
             return res
         print('get_page_urls error')
         self.send_http_proxy_error(proxy)
Exemplo n.º 7
0
def run_by_iid(session, Iid):
    # session : requests.Session() : None
    # Iid     : integer            : illust ID
    """Download every page (_p0 .. _plast) of illust *Iid* in both jpg
    and png form, stopping once neither extension yields a 200 response."""

    # get download url; a 'msg' key signals an error payload instead of data.
    url_and_title = url.get_base_url(session, Iid, True)
    # BUG FIX: the original wrapped the 'msg' access in try/bare-except as
    # control flow, which silently swallowed *every* exception, not just the
    # missing-key case. An explicit membership test is both safer and clearer.
    if 'msg' in url_and_title:
        print(url_and_title['msg'])
        return
    base_url = url_and_title['base_url']
    title = url_and_title['title']
    illuster_name = url.get_illuster_name(title)
    illust_title = url.get_illust_title(title)

    def _fetch_page(counter, ext):
        """Try one page/extension; return True when it existed and was saved."""
        _data = url.get_whole_url(session, base_url, str(counter), ext)
        if _data['status_code'] != 200:
            return False
        file_name = download.downloader(_data['img_url'], illuster_name,
                                        illust_title + '_p' + str(counter),
                                        str(Iid) + ext)
        # The original printed a longer message for png downloads; keep both
        # formats byte-for-byte.
        if ext == '.png':
            print('===>' + illuster_name + ':' + file_name +
                  ' download completed!')
        else:
            print('===>' + file_name + ' download completed!')
        print()
        return True

    ''' _p0 --> _plast '''
    counter = 0
    jpgFlag = True
    pngFlag = True
    while jpgFlag or pngFlag:
        jpgFlag = _fetch_page(counter, '.jpg')
        pngFlag = _fetch_page(counter, '.png')
        counter += 1
Exemplo n.º 8
0
 def get_page_urls(self):
     """Fetch the start page and return the URLs of the pages to crawl.

     Returns a generator of absolute URLs on success and an empty list on
     any failure, so callers can always iterate the result.
     """
     try:
         _headers = self.headers
         _headers['User-Agent'] = self.ua.random
         msgs = [(requests.get, self.queue, self.start_urls, {
             'headers': _headers
         })]
         download.downloader(msgs)
         res = self.queue.get()
         if res:
             res.encoding = res.apparent_encoding
             htmlEmt = etree.HTML(res.text)
             tr_list = htmlEmt.xpath("//ul[@class='textlarge22']/li")[1:]
             return (self.start_urls + i.xpath("a/@href")[0]
                     for i in tr_list)
         else:
             return []
     except Exception as e:
         print('get_page_urls:', e)
         # BUG FIX: the original fell off the end here and returned None,
         # which crashes any caller that iterates the result.
         return []
def downloading(continue_toggle=True):
    """Download all scenes and return the Landsat and MODIS directories.

    Parameters
    ----------
    continue_toggle : bool
        Passed through to download_all; presumably resumes a previous
        run when True -- TODO confirm against download.downloader.

    Returns
    -------
    tuple
        (landsat_dir, modis_dir) as returned by download_all.
    """
    ### Part 1: download images ###
    # Initialize the downloader straight from util's credentials/output dir.
    # (FIX: the original also copied username/password/OUTPUT_DIR into local
    # variables that were never used.)
    dl = download.downloader(username=util.USERNAME,
                             password=util.PASSWORD,
                             OUTPUT_DIR=util.OUTPUT_DIR)

    # download set of scenes:
    landsat_dir, modis_dir = dl.download_all(
        continue_toggle=continue_toggle)
    return landsat_dir, modis_dir
Exemplo n.º 10
0
def gen_dom(url):
    """Download the document at *url* and parse it into a DOM."""
    content = downloader(url).download()
    return parse(content)
Exemplo n.º 11
0
 def __init__(self, prep='http://cms.cern.ch/iCMS/prep/'):
     """Create a client backed by a downloader rooted at *prep*."""
     self.request = None
     self.downloader = downloader(prep)
Exemplo n.º 12
0
print('login successfully!')

# Repeatedly prompt for an Illust ID and download all of its pages in both
# jpg and png form until the user declines to continue.
while True:
    Iid = input("Illust ID:")
    base_url = url.get_base_url(session, Iid)
    counter = 0
    jpgFlag = True
    pngFlag = True
    while jpgFlag or pngFlag:
        _data = url.get_whole_url(session, base_url, str(counter), '.jpg')
        if _data['status_code'] != 200:
            jpgFlag = False
        else:
            jpgFlag = True
            file_name = download.downloader(_data['img_url'], str(counter),
                                            '.jpg')
            print(file_name + ' download completed!')

        _data = url.get_whole_url(session, base_url, str(counter), '.png')
        if _data['status_code'] != 200:
            pngFlag = False
        else:
            pngFlag = True
            file_name = download.downloader(_data['img_url'], str(counter),
                                            '.png')
            print(file_name + ' download completed!')
        counter += 1

    Continue = input('Do you want download next illust?(Y/N)')
    # BUG FIX: the original tested `Continue is 'N' or 'n'`, which is always
    # truthy (the literal 'n' short-circuits the `or`, and `is` is identity,
    # not equality), so the loop always exited after the first illust no
    # matter what the user answered.
    if Continue in ('N', 'n'):
        break
Exemplo n.º 13
0
def main():
    """Run the program: parse command line options (with environment
    variable fallbacks), configure logging, then check out the git repo
    and mirror the rpm repository into it."""
    log = logging.getLogger("main")
    p = optparse.OptionParser(version="%prog " + version)
    p.add_option('-v', '--verbose', action='count',
                 help='Change global log level, increasing log output.',
                 metavar='LOGFILE')
    p.add_option('-q', '--quiet', action='count',
                 help='Change global log level, decreasing log output.',
                 metavar='LOGFILE')
    p.add_option('--branch', action='store',
                 help='set branch to commit rpms to.',
                 metavar='OBSREPOARCH_BRANCH')
    p.add_option('--repo-uri', action='store',
                 help='base uri to download',  # FIX: typo 'downlaod'
                 metavar='OBSREPOARCH_URI')
    p.add_option('--git-master-repo', action='store',
                 help='local shared git pack object store path',
                 metavar='OBSREPOARCH_MASTERREPO')
    p.add_option('--git-origin', action='store',
                 help='upstream git repo.',
                 metavar='OBSREPOARCH_ORIGIN')
    p.add_option('--dir-work', action='store',
                 help='Working directory for checkout of repo.',  # FIX: typo 'fro'
                 metavar='OBSREPOARCH_WORKINGDIR')
    p.add_option('--log-config', action='store',
                 help='Logfile configuration file, (overrides command line).',
                 metavar='LOGFILE')
    options, arguments = p.parse_args()

    # Defaults, then environment overrides, then (below) CLI overrides.
    logFile = None
    workingdir = 'workingdir'
    origin = ''
    shared_clone = ''
    branch = "ibs_product_1.0"
    uri = "http://download.suse.de/ibs/Devel:/Storage:/1.0:/Staging/openSUSE_Factory/"
    if 'OBSREPOARCH_LOG_CONF' in os.environ:
        logFile = os.environ['OBSREPOARCH_LOG_CONF']
    if 'OBSREPOARCH_ORIGIN' in os.environ:
        origin = os.environ['OBSREPOARCH_ORIGIN']
    if 'OBSREPOARCH_WORKINGDIR' in os.environ:
        workingdir = os.environ['OBSREPOARCH_WORKINGDIR']
    if 'OBSREPOARCH_BRANCH' in os.environ:
        branch = os.environ['OBSREPOARCH_BRANCH']
    if 'OBSREPOARCH_URI' in os.environ:
        uri = os.environ['OBSREPOARCH_URI']
    if 'OBSREPOARCH_MASTERREPO' in os.environ:
        shared_clone = os.environ['OBSREPOARCH_MASTERREPO']

    # Map the combined -v/-q counts onto a logging level (2 == WARNING).
    LoggingLevel = logging.WARNING
    LoggingLevelCounter = 2
    if options.verbose:
        LoggingLevelCounter = LoggingLevelCounter - options.verbose
        if options.verbose == 1:
            LoggingLevel = logging.INFO
        if options.verbose == 2:
            LoggingLevel = logging.DEBUG
    if options.quiet:
        LoggingLevelCounter = LoggingLevelCounter + options.quiet
    if LoggingLevelCounter <= 0:
        LoggingLevel = logging.DEBUG
    elif LoggingLevelCounter == 1:
        LoggingLevel = logging.INFO
    elif LoggingLevelCounter == 2:
        LoggingLevel = logging.WARNING
    elif LoggingLevelCounter == 3:
        LoggingLevel = logging.ERROR
    elif LoggingLevelCounter == 4:
        LoggingLevel = logging.FATAL
    else:  # >= 5
        LoggingLevel = logging.CRITICAL

    if options.log_config:
        logFile = options.log_config
    if logFile is not None:
        # BUG FIX: the original tested os.path.isfile(str(options.log_config))
        # here, which is the string "None" when the path came only from the
        # OBSREPOARCH_LOG_CONF environment variable -- an env-supplied config
        # file was never honoured. Use the resolved logFile instead.
        if os.path.isfile(logFile):
            logging.config.fileConfig(logFile)
        else:
            logging.basicConfig(level=LoggingLevel)
            log = logging.getLogger("main")
            log.error("Logfile configuration file '%s' was not found." % (logFile))
            sys.exit(1)
    else:
        logging.basicConfig(level=LoggingLevel)
    log = logging.getLogger("main")

    # CLI options override both defaults and environment variables.
    if options.branch:
        branch = options.branch
    if options.repo_uri:
        uri = options.repo_uri
    if options.dir_work:
        workingdir = options.dir_work
    if options.git_master_repo:
        shared_clone = options.git_master_repo
    if not options.git_origin:
        log.error("No git origin given, use --git-origin!")
        sys.exit(1)
    origin = options.git_origin

    downloader = download.downloader(
        workingdir=workingdir,
        origin=origin,
        shared_clone=shared_clone
        )
    downloader.work_dir_setup(
        branch=branch
        )
    downloader.update(
        uri=uri
        )
    return 0