def getProxyLs(self):
    # Fetch the proxy list from the remote proxy pool and store it in the database.
    try:
        html = requests.get('http://47.97.7.119:8080/proxypool/proxys/0', timeout=self.timeout)
        db.conn()
        proxyLs = []
        if html.status_code != requests.codes.ok:
            print('Failed to fetch proxies!')
            return
        proxyDataLs = json.loads(html.text)['data']
        for proxy in proxyDataLs:
            proxyLs.append([proxy['proxyType'], proxy['proxyAddress'], str(proxy['proxyPort']),
                            round(time.time() * 1000), None, self.md5Encode(proxy['proxyAddress'])])
        try:
            db.executemany(
                'replace into proxyls(PROXY_TYPE,PROXY_ADDR,PROXY_PORT,PROXY_ADD_TIME,PROXY_CHECK_TIME,PROXY_MD5) values (%s,%s,%s,%s,%s,%s)',
                proxyLs)
        except Exception:
            traceback.print_exc()
            db.rollback()
        else:
            db.commit()
            time.sleep(3)
            print('Crawling finished')
    except Exception:
        traceback.print_exc()
        time.sleep(2)
    print('Start checking proxies...')
    self.testbythread()
def complete(self, animeid):
    """Mark the given anime's detail page as downloaded (status = 2)."""
    try:
        db.execute(
            'UPDATE anime_home a SET a.ANIME_INFO_DOWNLOAD_STATUS = 2 WHERE a.ANIME_ID = %s',
            (animeid,))
    except Exception as e:
        db.rollback()
        raise e
    finally:
        db.commit()
def getData(self, url):
    # Fetch one page of the timetable API and insert or update rows in anime_timetable.
    content = self.requestbyproxy(url)
    dataJson = json.loads(content.text)
    if dataJson['status'] != 200:
        return
    if not len(dataJson['data']['page']['list']):
        return
    values_insert = []
    for i in dataJson['data']['page']['list']:
        anime_bid = int(i['bid'])
        anime_name = i['title']
        anime_cover = i['cover']
        anime_vertical_cover = i['verticalCover']
        anime_play_date = i['playDate']
        anime_play_time = i['playTime']
        anime_origin_time = i['originTime']
        anime_play_site = i['playSite']
        anime_origin_station = i['originStation']
        anime_play_url = i['playUrl']
        anime_play_episode = i['episode']
        if db.execute('select 1 from anime_timetable a where a.ANIME_BID = %s limit 1', (anime_bid,)) == 0:
            # New entry: queue it for batch insert.
            values_insert.append([
                anime_bid, None, anime_name, anime_cover, anime_vertical_cover,
                anime_play_date, anime_play_time, anime_origin_time,
                anime_play_site, anime_origin_station, anime_play_url, anime_play_episode
            ])
        else:
            # Existing entry: refresh the play date and episode count.
            try:
                db.execute(
                    'update anime_timetable t set t.ANIME_PLAY_DATE = %s, t.ANIME_PLAY_EPISODE = %s where t.ANIME_BID = %s',
                    (anime_play_date, anime_play_episode, anime_bid))
            except Exception:
                db.rollback()
                traceback.print_exc()
    try:
        if len(values_insert):
            db.executemany(
                'insert into anime_timetable(ANIME_BID,ANIME_ID,ANIME_NAME,ANIME_COVER,ANIME_VERTICAL_COVER,ANIME_PLAY_DATE,ANIME_PLAY_TIME,ANIME_ORIGIN_TIME,ANIME_PLAY_SITE,ANIME_ORIGIN_STATION,ANIME_PLAY_URL,ANIME_PLAY_EPISODE) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                values_insert)
    except Exception:
        db.rollback()
        traceback.print_exc()
    finally:
        db.commit()
        time.sleep(1)
def repair(self):
    # Fix inconsistent download flags: anything already present in anime_info is
    # marked as done (2), and stuck "in progress" rows (1) are reset to pending (0).
    try:
        db.execute(
            'SELECT h.* FROM anime_home h LEFT JOIN anime_info i ON (h.ANIME_ID = i.ANIME_ID) WHERE i.ANIME_ID IS NOT NULL',
            None)
        records = db.fetchall()
        if records:
            for r in records:
                db.execute('UPDATE anime_home a SET a.ANIME_INFO_DOWNLOAD_STATUS = 2 WHERE a.ID = %s', (r[0],))
        db.execute(
            'UPDATE anime_home h SET h.ANIME_INFO_DOWNLOAD_STATUS = 0 WHERE h.ANIME_INFO_DOWNLOAD_STATUS = 1',
            None)
    except Exception as e:
        db.rollback()
        raise e
    finally:
        db.commit()
def pop(self):
    # Move all pending rows (status 0) into the work queue and mark them in progress (1).
    try:
        db.execute(
            'SELECT a.ID, a.ANIME_LINE, a.ANIME_INFO_DOWNLOAD_STATUS FROM anime_home a WHERE a.ANIME_INFO_DOWNLOAD_STATUS = 0',
            None)
        records = db.fetchall()
        db.execute(
            'UPDATE anime_home a SET a.ANIME_INFO_DOWNLOAD_STATUS = 1 WHERE a.ANIME_INFO_DOWNLOAD_STATUS = 0',
            None)
        if records:
            for r in records:
                self.queue.put(r)
        return self.queue
        # else:
        #     self.repair()
        #     raise KeyError
    except Exception as e:
        db.rollback()
        raise e
    finally:
        db.commit()
def getxiciProxy(self, i):
    # Scrape page i of xicidaili.com and store the proxies it lists.
    proxyList = []
    proxyLs = []
    print('Fetching page', i)
    html = self.request('http://www.xicidaili.com/nn/' + str(i), i)
    # Grab the IP (td[2]), port (td[3]) and protocol type (td[6]) columns.
    all_proxy = self.xpathResolve(
        html,
        "//table[@id='ip_list']/tr[@class][position()>1]//td[6]"
        "|//table[@id='ip_list']/tr[@class][position()>1]//td[2]"
        "|//table[@id='ip_list']/tr[@class][position()>1]//td[3]")
    if len(all_proxy) > 0:
        # The columns come back flattened; regroup them three at a time per proxy.
        for idx in range(0, len(all_proxy), 3):
            proxyList.append(all_proxy[idx:idx + 3])
    if len(proxyList) > 0:
        for p in proxyList:
            proxyLs.append([p[2].text.lower(), p[0].text, str(p[1].text),
                            round(time.time() * 1000), None, self.md5Encode(p[0].text)])
    self.lock.acquire()
    try:
        db.executemany(
            'insert into proxyls(PROXY_TYPE,PROXY_ADDR,PROXY_PORT,PROXY_ADD_TIME,PROXY_CHECK_TIME,PROXY_MD5) values (%s,%s,%s,%s,%s,%s)',
            proxyLs)
    except Exception:
        traceback.print_exc()
        db.rollback()
    else:
        db.commit()
    self.lock.release()
def checkproxy2(self):
    # Validate every stored proxy sequentially by requesting baidu.com through it.
    start = time.time()
    print(start)
    for result in self.proxyLs:
        try:
            content = requests.get(
                'https://www.baidu.com/',
                proxies={result[1]: 'http://' + result[2] + ':' + result[3]},
                timeout=5)
            print('Checking ' + result[1] + ': http://' + result[2] + ':' + result[3])
        except Exception:
            traceback.print_exc()
            print('false')
            self.delproxy(result[0])
        else:
            if content.status_code == requests.codes.ok:
                self.writeproxy(result[0])
            else:
                self.delproxy(result[0])
        finally:
            db.commit()
    db.close()
    print(str(time.time() - start) + ' seconds')
    print('Check finished')
def testbythread(self):
    # Validate all stored proxies with a pool of 8 worker threads.
    threads = []
    db.execute('select * from proxyls', None)
    proxyLs = db.fetchall()
    start = time.time()
    for p in proxyLs:
        self.queue.put(p)
    for i in range(8):
        thread = threading.Thread(target=self.checkproxy)
        thread.start()  # start a worker thread
        threads.append(thread)
        time.sleep(1)
    # Wait for all workers and queued tasks to finish.
    for thread in threads:
        thread.join()
    self.queue.join()
    db.commit()
    db.close()
    print(str(time.time() - start) + ' seconds')
    print('Check finished')
def testdb(self):
    # Manual DB smoke test: delete one row, then commit.
    db.execute('delete from proxy where ID = %s', (73,))
    db.commit()
    db.close()