Example #1
 def download(url):
     proxylist = sqlhelper.select(10)
     if not proxylist:
         proxies = None
     else:
         proxy = random.choice(proxylist)
         proxies = {"http": "http://%s:%s" % (proxy[0], proxy[1]),
                    "https": "http://%s:%s" % (proxy[0], proxy[1])}
     try:
         r = requests.get(url=url, headers=config.get_header(), proxies=proxies, timeout=config.TIMEOUT)
         r.encoding = chardet.detect(r.content)['encoding']
         if (not r.ok) or len(r.content) < 500:
             raise ConnectionError
         else:
             return r.text
     except Exception:
         count = 0  # number of retries so far
         proxylist = sqlhelper.select(10)
         if not proxylist:
             return None
         while count < config.RETRY_TIME:
             try:
                 proxy = random.choice(proxylist)
                 proxies = {"http": "http://%s:%s" % (proxy[0], proxy[1]),
                            "https": "http://%s:%s" % (proxy[0], proxy[1])}
                 r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
                 r.encoding = chardet.detect(r.content)['encoding']
                 if (not r.ok) or len(r.content) < 500:
                     raise ConnectionError
                 else:
                     return r.text
             except Exception:
                 count += 1
     return None
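All of these examples call a shared `sqlhelper.select(count)` helper. A minimal sketch of a stand-in for experimenting with the `download` example above; the class name and the `(ip, port)` row shape are assumptions for illustration, not the project's actual schema:

    # Hypothetical stand-in for the sqlhelper object used in the examples.
    # Assumption: select(count) returns up to `count` (ip, port) tuples.
    class FakeSqlHelper:
        def __init__(self, rows):
            self._rows = rows

        def select(self, count=100, conditions=None):
            return self._rows[:count]

    sqlhelper = FakeSqlHelper([("127.0.0.1", 8118), ("10.0.0.2", 3128)])
    print(sqlhelper.select(10))  # [('127.0.0.1', 8118), ('10.0.0.2', 3128)]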
Example #2
    def download(self, url):
        count = 0  # retry count
        r = ''
        try:
            r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT)
            r.encoding = chardet.detect(r.content)['encoding']
            while count < config.RETRY_TIME:
                if (not r.ok) or len(r.content) < 500:
                    proxylist = sqlhelper.select(10)
                    proxy = random.choice(proxylist)
                    ip = proxy[0]
                    port = proxy[1]
                    proxies = {"http": "http://%s:%s" % (ip, port),
                               "https": "http://%s:%s" % (ip, port)}
                    try:
                        r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
                        r.encoding = chardet.detect(r.content)['encoding']
                        count += 1
                    except Exception:
                        count += 1
                else:
                    return r.text
            return None
        except Exception:
            while count < config.RETRY_TIME:
                if r == '' or (not r.ok) or len(r.content) < 500:
                    try:
                        proxylist = sqlhelper.select(10)
                        proxy = random.choice(proxylist)
                        ip = proxy[0]
                        port = proxy[1]
                        proxies = {"http": "http://%s:%s" % (ip, port),
                                   "https": "http://%s:%s" % (ip, port)}
                        try:
                            r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
                            r.encoding = chardet.detect(r.content)['encoding']
                            count += 1
                        except Exception:
                            count += 1
                    except Exception:
                        return None
                else:
                    return r.text
            return None
Example #3
    def run(self):
        while True:
            self.proxies.clear()
            msg = 'IPProxyPool----->>>>>>>>beginning'
            sys.stdout.write(msg + "\r\n")
            sys.stdout.flush()
            proxylist = sqlhelper.select()
            myip = getMyIP()
            spawns = []
            for proxy in proxylist:
                spawns.append(gevent.spawn(detect_from_db, myip, proxy, self.proxies))
            gevent.joinall(spawns)
            self.db_proxy_num.value = len(self.proxies)
            msg = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)

            if len(self.proxies) < MINNUM:
                msg += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()
                self.crawl_pool.map(self.crawl, parserList)
            else:
                msg += '\r\nIPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()

            time.sleep(UPDATE_TIME)
Example #4
    def run(self):
        while True:
            self.proxies.clear()
            msg = 'IPProxyPool----->>>>>>>>beginning'
            sys.stdout.write(msg + "\r\n")
            sys.stdout.flush()
            proxylist = sqlhelper.select()

            spawns = []
            for proxy in proxylist:
                spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies))
                if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
                    gevent.joinall(spawns)
                    spawns = []
            gevent.joinall(spawns)
            self.db_proxy_num.value = len(self.proxies)
            msg = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)

            if len(self.proxies) < MINNUM:
                msg += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()
                spawns = []
                for p in parserList:
                    spawns.append(gevent.spawn(self.crawl, p))
                    if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
                        gevent.joinall(spawns)
                        spawns = []
                gevent.joinall(spawns)
            else:
                msg += '\r\nIPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()
            print('now sleep')
            time.sleep(UPDATE_TIME)
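Examples #4, #7, #8, and #9 all repeat the same batching idiom: spawn greenlets until a cap is reached, join the full batch, then reset. A sketch of that pattern as a reusable helper; the function name and signature are illustrative, not from the project:

    import gevent

    def run_batched(func, items, max_concurrent):
        # Spawn one greenlet per item, joining each full batch of
        # `max_concurrent` before starting the next one.
        spawns = []
        for item in items:
            spawns.append(gevent.spawn(func, item))
            if len(spawns) >= max_concurrent:
                gevent.joinall(spawns)
                spawns = []
        gevent.joinall(spawns)  # drain the final partial batch

With such a helper, the crawl step above would reduce to run_batched(self.crawl, parserList, MAX_DOWNLOAD_CONCURRENT).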
Example #6
 def GET(self):
     inputs = web.input()
     count = inputs.get('count', None)
     if count is None:
         count = 1
     text = [{
         'ip': i[0],
         'port': str(i[1]),
         'protocol': str(i[3])
     } for i in sqlhelper.select(count, inputs)]
     json_result = json.dumps(text)
     print('select ' + str(json_result))
     return json_result
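The GET handlers in these examples serve the proxy list over HTTP as JSON. A hedged client-side sketch; the host, port, and the bare `count` query parameter are assumptions based on a default local deployment:

    import json
    import requests

    # Assumption: the API server listens on 127.0.0.1:8000 and honors
    # a `count` query parameter, as in the GET handler above.
    resp = requests.get('http://127.0.0.1:8000/', params={'count': 5})
    for entry in json.loads(resp.text):
        print(entry['ip'], entry['port'], entry['protocol'])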
Example #7
 def run(self):
     while True:
         self.proxies.clear()
         msg = 'IPProxyPool---beginning'
         sys.stdout.write(msg + "\r\n")
         sys.stdout.flush()
         proxylist = sqlhelper.select()
         spawns = []
         for proxy in proxylist:
             spawns.append(
                 gevent.spawn(detect_from_db, self.myip, proxy,
                              self.proxies))
             if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
                 gevent.joinall(spawns)
                 spawns = []
         gevent.joinall(spawns)
         self.db_proxy_num.value = len(sqlhelper.select())
         msg = 'db exists ip:%d' % len(sqlhelper.select())
         spawns = []
         start = time.time()
         while True:
             if len(sqlhelper.select()) < MINNUM:
                 msg += '\r\nnow ip num < MINNUM,start crawling...'
             else:
                 msg += '\r\nACCOMPLISHED! ip num meets the requirement!'
             sys.stdout.write(msg + "\r\n")
             sys.stdout.flush()
             p = random.randint(0, len(parserList) - 1)
             spawns.append(gevent.spawn(self.crawl, parserList[p]))
             if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
                 gevent.joinall(spawns)
                 spawns = []
             if time.time() - start >= UPDATE_TIME:
                 gevent.joinall(spawns)  # drain any remaining greenlets before the next cycle
                 break
Example #8
    def run(self):
        while True:
            self.proxies.clear()
            msg = 'IPProxyPool----->>>>>>>>beginning'
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
            sys.stdout.write(now + ":" + msg + "\r\n")
            sys.stdout.flush()
            proxylist = sqlhelper.select()

            spawns = []
            for proxy in proxylist:
                spawns.append(
                    gevent.spawn(detect_from_db, self.myip, proxy,
                                 self.proxies))
                if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
                    gevent.joinall(spawns)
                    spawns = []
            gevent.joinall(spawns)
            self.db_proxy_num.value = len(self.proxies)
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
            msg = now + ':IPProxyPool----->>>>>>>>db exists ip:%d' % len(
                self.proxies)

            if len(self.proxies) < MINNUM:
                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
                msg += '\r\n' + now + ':IPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()
                spawns = []
                for p in parserList:
                    spawns.append(gevent.spawn(self.crawl, p))
                    if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
                        gevent.joinall(spawns)
                        spawns = []
                gevent.joinall(spawns)
            else:
                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
                msg += '\r\n' + now + ':IPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
                sys.stdout.write(msg + "\r\n")
                sys.stdout.flush()

            # wait on a condition variable instead of sleeping a fixed UPDATE_TIME
            self.sleep_condition.acquire()
            self.sleep_condition.wait()
            self.sleep_condition.release()
Example #9
    def run(self):
        logger.info("Starting crawler! Target addresses:")
        for p in parserList:
            logger.info(p['urls'][0])

        while True:
            self.proxies.clear()
            logger.info("IPProxyPool----->>>>>>>>beginning")

            proxylist = sqlhelper.select(count=99999)

            spawns = []
            for proxy in proxylist:
                spawns.append(
                    gevent.spawn(detect_from_db, self.myip, proxy,
                                 self.proxies))
                if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
                    gevent.joinall(spawns)
                    spawns = []
            gevent.joinall(spawns)
            self.db_proxy_num.value = len(self.proxies)
            logger.info('IPProxyPool----->>>>>>>>db exists ip:%d' %
                        len(self.proxies))

            self.check_exists_ip.value = True

            if len(self.proxies) < MINNUM:
                logger.info(
                    'IPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
                )
                spawns = []
                for p in parserList:
                    spawns.append(gevent.spawn(self.crawl, p))
                    if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
                        gevent.joinall(spawns)
                        spawns = []
                gevent.joinall(spawns)
            else:
                logger.info(
                    'IPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
                )

            time.sleep(UPDATE_TIME)
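Example #9 replaces the raw sys.stdout writes of the earlier variants with a logger. A minimal sketch of a logger setup compatible with those calls; the format, level, and logger name are assumptions:

    import logging

    # Assumed setup; the project may configure its logger differently.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s')
    logger = logging.getLogger('IPProxyPool')
    logger.info('IPProxyPool----->>>>>>>>beginning')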
Example #10
    def download(url):
        try:
            r = requests.get(url=url,
                             headers=config.get_header(),
                             timeout=config.TIMEOUT)
            r.encoding = chardet.detect(r.content)['encoding']
            if (not r.ok) or len(r.content) < 500:
                raise ConnectionError
            else:
                return r.text

        except Exception:
            count = 0
            proxylist = sqlhelper.select(10)
            if not proxylist:
                return None

            while count < config.RETRY_TIME:
                try:
                    proxy = random.choice(proxylist)
                    ip = proxy[0]
                    port = proxy[1]
                    proxies = {
                        'http': 'http://%s:%s' % (ip, port),
                        'https': 'https://%s:%s' % (ip, port)
                    }

                    r = requests.get(url=url,
                                     headers=config.get_header(),
                                     timeout=config.TIMEOUT,
                                     proxies=proxies)
                    r.encoding = chardet.detect(r.content)['encoding']
                    if (not r.ok) or len(r.content) < 500:
                        raise ConnectionError
                    else:
                        return r.text
                except Exception:
                    count += 1

            return None
Example #11
 def downimage(url, img_path):
     if os.path.exists(config.image_path_base + img_path):
         return True
     count = 0  # retry count
     proxylist = sqlhelper.select(100)
     if not proxylist:
         return None
     while count < config.RETRY_TIME:
         try:
             proxy = random.choice(proxylist)
             ip = proxy[0]
             port = proxy[1]
             proxies = {
                 "http": "http://%s:%s" % (ip, port),
                 "https": "http://%s:%s" % (ip, port)
             }
             r = requests.get(
                 url=url,
                 headers=config.get_header(),
                 timeout=config.TIMEOUT,
                 proxies=proxies,  # route the download through the chosen proxy
             )
             r.encoding = chardet.detect(r.content)['encoding']
             if r.status_code == 200:
                 if len(r.content) == 0:
                     return None
                 if len(r.content) < 500:
                     count += 1
                     continue
                 with open(config.image_path_base + img_path, 'wb') as file:
                     file.write(r.content)
                 del r
                 return True
         except Exception:
             sleep(0.1)
             count += 1
     return None
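A hedged usage sketch for downimage; the URL and relative image path are made up, and it assumes config.image_path_base points at an existing directory:

    # Hypothetical call: fetches the image through a random proxy and
    # stores it under config.image_path_base + 'pic/0001.jpg'.
    ok = downimage('http://example.com/pic/0001.jpg', 'pic/0001.jpg')
    print('saved' if ok else 'failed')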
Example #12
 def GET(self):
     # web.config.debug = False
     inputs = web.input()
     json_result = json.dumps(
         sqlhelper.select(inputs.get('count', None), inputs))
     return json_result
Example #13
File: geturl.py, Project: zegu/91porn
def refresh_download_url_by_tag():
    res = sqlhelper.select(100, {'downloaded': 0})
    for item in res:
        vid = item[1]
        refresh_download_url(vid)
Example #14
File: geturl.py, Project: zegu/91porn
def refresh_download_url_by_vno(vnos):
    for vno in vnos:
        res = sqlhelper.select(1, {'vno': vno})
        item = res[0]
        vid = item[1]
        refresh_download_url(vid)
Example #15
File: randomip.py, Project: zegu/91porn
def check_vid_exist(view_id):
    res = sqlhelper.select(1, {'view_id': view_id})
    if len(res):
        print('view id %s exist, skip' % view_id)
        return True
    return False
Example #16
File: asyncdl.py, Project: zegu/91porn
def check_vid_exist(view_id):
    res = sqlhelper.select(1, {'view_id': view_id})
    if len(res):
        view_ids_queue.remove(view_id)
        return True
    return False
Example #17
 def GET(self):
     inputs = web.input()
     json_result = json.dumps(
         sqlhelper.select(inputs.get('count', None), inputs))
     return json_result
Example #18
File: rename.py, Project: zegu/91porn
def query_title(vno):
    res = sqlhelper.select(1, {'vno': vno})
    if len(res):
        return res[0][3]
    return None
Example #19
 def GET(self):
     inputs = web.input()
     res = sqlhelper.select(inputs.get('count', None), inputs)
     json_result = json.dumps(list(map(lambda x: x._asdict(), res)))
     return json_result
Example #20
def selectProxies():
    inputs = request.args
    json_result = json.dumps(
        sqlhelper.select(inputs.get('count', None), inputs))
    return json_result
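Example #20 is the Flask counterpart of the web.py GET handlers. A sketch of how such a view might be wired into an app; the route, port, and the stub sqlhelper are assumptions so the snippet runs standalone:

    import json
    from flask import Flask, request

    app = Flask(__name__)

    # Hypothetical stand-in; the real project queries its proxy database here.
    class _StubSqlHelper:
        def select(self, count, conditions):
            return [('127.0.0.1', 8118, 0, 0)]

    sqlhelper = _StubSqlHelper()

    @app.route('/')  # assumed route
    def selectProxies():
        inputs = request.args
        return json.dumps(sqlhelper.select(inputs.get('count', None), inputs))

    if __name__ == '__main__':
        app.run(port=8000)  # assumed port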