def download(url):
    proxylist = sqlhelper.select(10)
    if not proxylist:
        proxies = None
    else:
        proxy = random.choice(proxylist)
        proxies = {"http": "http://%s:%s" % (proxy[0], proxy[1]),
                   "https": "http://%s:%s" % (proxy[0], proxy[1])}
    try:
        r = requests.get(url=url, headers=config.get_header(), proxies=proxies, timeout=config.TIMEOUT)
        r.encoding = chardet.detect(r.content)['encoding']
        if (not r.ok) or len(r.content) < 500:
            raise ConnectionError
        else:
            return r.text
    except Exception:
        count = 0  # retries attempted so far
        proxylist = sqlhelper.select(10)
        if not proxylist:
            return None
        while count < config.RETRY_TIME:
            try:
                proxy = random.choice(proxylist)
                proxies = {"http": "http://%s:%s" % (proxy[0], proxy[1]),
                           "https": "http://%s:%s" % (proxy[0], proxy[1])}
                r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
                r.encoding = chardet.detect(r.content)['encoding']
                if (not r.ok) or len(r.content) < 500:
                    raise ConnectionError
                else:
                    return r.text
            except Exception:
                count += 1
        return None
def download(self, url):
    count = 0  # retry counter
    r = ''
    try:
        r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT)
        r.encoding = chardet.detect(r.content)['encoding']
        while count < config.RETRY_TIME:
            if (not r.ok) or len(r.content) < 500:
                # Response looks bad: retry through a random proxy from the pool.
                proxylist = sqlhelper.select(10)
                proxy = random.choice(proxylist)
                ip = proxy[0]
                port = proxy[1]
                proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
                try:
                    r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
                    r.encoding = chardet.detect(r.content)['encoding']
                    count += 1
                except Exception:
                    count += 1
            else:
                return r.text
        return None
    except Exception:
        # The direct request itself failed: fall back to proxied retries.
        while count < config.RETRY_TIME:
            if r == '' or (not r.ok) or len(r.content) < 500:
                try:
                    proxylist = sqlhelper.select(10)
                    proxy = random.choice(proxylist)
                    ip = proxy[0]
                    port = proxy[1]
                    proxies = {"http": "http://%s:%s" % (ip, port), "https": "http://%s:%s" % (ip, port)}
                    try:
                        r = requests.get(url=url, headers=config.HEADER, timeout=config.TIMEOUT, proxies=proxies)
                        r.encoding = chardet.detect(r.content)['encoding']
                        count += 1
                    except Exception:
                        count += 1
                except Exception:
                    return None
            else:
                return r.text
        return None
def run(self):
    while True:
        self.proxies.clear()
        str = 'IPProxyPool----->>>>>>>>beginning'
        sys.stdout.write(str + "\r\n")
        sys.stdout.flush()
        proxylist = sqlhelper.select()
        myip = getMyIP()
        spawns = []
        for proxy in proxylist:
            spawns.append(gevent.spawn(detect_from_db, myip, proxy, self.proxies))
        gevent.joinall(spawns)
        self.db_proxy_num.value = len(self.proxies)
        str = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)
        if len(self.proxies) < MINNUM:
            str += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
            sys.stdout.write(str + "\r\n")
            sys.stdout.flush()
            self.crawl_pool.map(self.crawl, parserList)
        else:
            str += '\r\nIPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
            sys.stdout.write(str + "\r\n")
            sys.stdout.flush()
        time.sleep(UPDATE_TIME)
def run(self):
    while True:
        self.proxies.clear()
        str = 'IPProxyPool----->>>>>>>>beginning'
        sys.stdout.write(str + "\r\n")
        sys.stdout.flush()
        proxylist = sqlhelper.select()
        spawns = []
        for proxy in proxylist:
            spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies))
            if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
                gevent.joinall(spawns)
                spawns = []
        gevent.joinall(spawns)
        self.db_proxy_num.value = len(self.proxies)
        str = 'IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)
        if len(self.proxies) < MINNUM:
            str += '\r\nIPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
            sys.stdout.write(str + "\r\n")
            sys.stdout.flush()
            spawns = []
            for p in parserList:
                spawns.append(gevent.spawn(self.crawl, p))
                if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
                    gevent.joinall(spawns)
                    spawns = []
            gevent.joinall(spawns)
        else:
            str += '\r\nIPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
            sys.stdout.write(str + "\r\n")
            sys.stdout.flush()
        print('now sleep')
        time.sleep(UPDATE_TIME)
def GET(self):
    inputs = web.input()
    count = inputs.get('count', None)
    if count is None:
        count = 1
    text = [{'ip': i[0], 'port': str(i[1]), 'protocol': str(i[3])}
            for i in sqlhelper.select(count, inputs)]
    json_result = json.dumps(text)
    print('select' + json_result)
    return json_result
def run(self):
    while True:
        self.proxies.clear()
        str = 'IPProxyPool---beginning'
        sys.stdout.write(str + "\r\n")
        sys.stdout.flush()
        proxylist = sqlhelper.select()
        spawns = []
        for proxy in proxylist:
            spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies))
            if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
                gevent.joinall(spawns)
                spawns = []
        gevent.joinall(spawns)
        self.db_proxy_num.value = len(sqlhelper.select())
        str = 'db exists ip:%d' % len(sqlhelper.select())
        spawns = []
        start = time.time()
        while True:
            if len(sqlhelper.select()) < MINNUM:
                str += '\r\nnow ip num < MINNUM,start crawling...'
                sys.stdout.write(str + "\r\n")
                sys.stdout.flush()
            if len(sqlhelper.select()) >= MINNUM:
                str += '\r\nACCOMPLISH!!!ip num meet the requirement!'
                sys.stdout.write(str + "\r\n")
                sys.stdout.flush()
            p = random.randint(0, len(parserList) - 1)
            spawns.append(gevent.spawn(self.crawl, parserList[p]))
            if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
                gevent.joinall(spawns)
                spawns = []
            gevent.joinall(spawns)
            ending = time.time()
            if int(ending - start) >= int(UPDATE_TIME):
                break
def run(self):
    while True:
        self.proxies.clear()
        str = 'IPProxyPool----->>>>>>>>beginning'
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        sys.stdout.write(now + ":" + str + "\r\n")
        sys.stdout.flush()
        proxylist = sqlhelper.select()
        spawns = []
        for proxy in proxylist:
            spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies))
            if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
                gevent.joinall(spawns)
                spawns = []
        gevent.joinall(spawns)
        self.db_proxy_num.value = len(self.proxies)
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
        str = now + ':IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies)
        if len(self.proxies) < MINNUM:
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
            str += '\r\n' + now + ':IPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...'
            sys.stdout.write(str + "\r\n")
            sys.stdout.flush()
            spawns = []
            for p in parserList:
                spawns.append(gevent.spawn(self.crawl, p))
                if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
                    gevent.joinall(spawns)
                    spawns = []
            gevent.joinall(spawns)
        else:
            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")
            str += '\r\n' + now + ':IPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...'
            sys.stdout.write(str + "\r\n")
            sys.stdout.flush()
        # time.sleep(UPDATE_TIME)
        self.sleep_condition.acquire()
        self.sleep_condition.wait()
        self.sleep_condition.release()
def run(self):
    logger.info("Crawler starting! Target URLs:")
    for p in parserList:
        logger.info(p['urls'][0])
    while True:
        self.proxies.clear()
        logger.info("IPProxyPool----->>>>>>>>beginning")
        proxylist = sqlhelper.select(count=99999)
        spawns = []
        for proxy in proxylist:
            spawns.append(gevent.spawn(detect_from_db, self.myip, proxy, self.proxies))
            if len(spawns) >= MAX_CHECK_CONCURRENT_PER_PROCESS:
                gevent.joinall(spawns)
                spawns = []
        gevent.joinall(spawns)
        self.db_proxy_num.value = len(self.proxies)
        logger.info('IPProxyPool----->>>>>>>>db exists ip:%d' % len(self.proxies))
        self.check_exists_ip.value = True
        if len(self.proxies) < MINNUM:
            logger.info('IPProxyPool----->>>>>>>>now ip num < MINNUM,start crawling...')
            spawns = []
            for p in parserList:
                spawns.append(gevent.spawn(self.crawl, p))
                if len(spawns) >= MAX_DOWNLOAD_CONCURRENT:
                    gevent.joinall(spawns)
                    spawns = []
            gevent.joinall(spawns)
        else:
            logger.info('IPProxyPool----->>>>>>>>now ip num meet the requirement,wait UPDATE_TIME...')
        time.sleep(UPDATE_TIME)
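# The run() loops above all throttle their gevent greenlets the same way: spawn up to a
# batch limit, join the batch, then continue spawning. A minimal standalone sketch of that
# batching idiom, assuming gevent is available (check_one, run_in_batches and batch_size
# are illustrative names, not identifiers from the code above):
import gevent


def check_one(item, results):
    # Stand-in for a per-proxy check such as detect_from_db(); keeps items that pass.
    results.append(item)


def run_in_batches(candidates, batch_size=50):
    results = []
    spawns = []
    for item in candidates:
        spawns.append(gevent.spawn(check_one, item, results))
        if len(spawns) >= batch_size:
            gevent.joinall(spawns)  # wait for the current batch before spawning more
            spawns = []
    gevent.joinall(spawns)  # join whatever is left in the final partial batch
    return results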
def download(url):
    try:
        r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT)
        r.encoding = chardet.detect(r.content)['encoding']
        if (not r.ok) or len(r.content) < 500:
            raise ConnectionError
        else:
            return r.text
    except Exception:
        count = 0  # retries attempted so far
        proxylist = sqlhelper.select(10)
        if not proxylist:
            return None
        while count < config.RETRY_TIME:
            try:
                proxy = random.choice(proxylist)
                ip = proxy[0]
                port = proxy[1]
                proxies = {'http': 'http://%s:%s' % (ip, port),
                           'https': 'https://%s:%s' % (ip, port)}
                r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT, proxies=proxies)
                r.encoding = chardet.detect(r.content)['encoding']
                if (not r.ok) or len(r.content) < 500:
                    raise ConnectionError
                else:
                    return r.text
            except Exception:
                count += 1
        return None
def downimage(url, img_path):
    if os.path.exists(config.image_path_base + img_path):
        return True
    count = 0  # retry counter
    proxylist = sqlhelper.select(100)
    if not proxylist:
        return None
    while count < config.RETRY_TIME:
        try:
            proxy = random.choice(proxylist)
            ip = proxy[0]
            port = proxy[1]
            # Note: this proxies dict is built but never passed to requests.get below,
            # so the request is actually made without a proxy.
            proxies = {"http": "http://%s:%s" % (ip, port),
                       "https": "http://%s:%s" % (ip, port)}
            r = requests.get(url=url, headers=config.get_header(), timeout=config.TIMEOUT)
            r.encoding = chardet.detect(r.content)['encoding']
            if r.status_code == 200:
                if len(r.content) == 0:
                    return None
                if len(r.content) < 500:
                    count += 1
                    continue
                with open(config.image_path_base + img_path, 'wb') as file:
                    file.write(r.content)
                del r
                return True
            else:
                count += 1  # non-200 response, count it as a failed attempt
        except Exception:
            sleep(0.1)
            count += 1
    return None
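# The download helpers above (download() and downimage()) each rebuild the requests
# `proxies` mapping inline from an (ip, port) row returned by sqlhelper.select(). A minimal
# sketch of that pattern factored into a helper; build_proxies is a hypothetical name, not
# part of the original code, and it mirrors the originals by routing https traffic through
# the same http://ip:port endpoint:
def build_proxies(ip, port):
    endpoint = 'http://%s:%s' % (ip, port)
    return {'http': endpoint, 'https': endpoint}


# Example: proxies = build_proxies(proxy[0], proxy[1])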
def GET(self):
    # web.config.debug = False
    inputs = web.input()
    json_result = json.dumps(
        sqlhelper.select(inputs.get('count', None), inputs))
    return json_result
def refresh_download_url_by_tag():
    res = sqlhelper.select(100, {'downloaded': 0})
    for item in res:
        vid = item[1]
        refresh_download_url(vid)
def refresh_download_url_by_vno(vnos):
    for vno in vnos:
        res = sqlhelper.select(1, {'vno': vno})
        item = res[0]
        vid = item[1]
        refresh_download_url(vid)
def check_vid_exist(view_id):
    res = sqlhelper.select(1, {'view_id': view_id})
    if len(res):
        print('view id %s exists, skip' % view_id)
        return True
    return False
def check_vid_exist(view_id):
    res = sqlhelper.select(1, {'view_id': view_id})
    if len(res):
        view_ids_queue.remove(view_id)
        return True
    return False
def GET(self):
    inputs = web.input()
    json_result = json.dumps(
        sqlhelper.select(inputs.get('count', None), inputs))
    return json_result
def query_title(vno):
    res = sqlhelper.select(1, {'vno': vno})
    if len(res):
        return res[0][3]
    return None
def GET(self):
    inputs = web.input()
    res = sqlhelper.select(inputs.get('count', None), inputs)
    json_result = json.dumps(list(map(lambda x: x._asdict(), res)))
    return json_result
def selectProxies():
    inputs = request.args
    json_result = json.dumps(
        sqlhelper.select(inputs.get('count', None), inputs))
    return json_result
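# The GET handlers above serialize rows from sqlhelper.select() to JSON, filtered by the
# incoming query parameters (at least `count`). A minimal client-side sketch; the host and
# port below are assumptions, not confirmed by these snippets:
import json

import requests

resp = requests.get('http://127.0.0.1:8000/', params={'count': 5}, timeout=10)
for row in json.loads(resp.text):
    # For the handler that builds dicts, each row looks like
    # {'ip': '...', 'port': '...', 'protocol': '...'}; other handlers return raw DB rows.
    print(row)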