def getProxyIp(self):
    """
    Fetch proxy IPs and store them in the database (law_proxy_ip) for later use.
    :return:
    """
    url = self.config['proxy_ip_url']
    # url = "https://nl.tan90.club/test/testHeader.html"
    try:
        data = curlData(url, timeout=5)
    except Exception:
        data = ""
    try:
        data = json.loads(data)
        if str(data['ERRORCODE']) == "10032":
            debug("proxy ip daily extraction quota reached, stopping", True)
            self.ip_over = 1
            return 1
        for k, v in enumerate(data['RESULT']):
            v['id'] = "NEXT VALUE FOR LAW_PROXY_IP_SEQUENCE"
            v['table'] = "proxy_ip"
            v['time_stamp'] = str(getNowTimeStamp())
            tmp = k + 1
            try:
                # lock
                self.ws_db.insert(v, is_close_db=False)
                # unlock
                debug("ip #" + str(tmp) + " inserted successfully")
            except Exception:
                debug("ip #" + str(tmp) + " insert failed")
    except Exception as e:
        debug(e)
        debug("proxy ip fetch failed, sleeping for 5 seconds")
        sleep(5)
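# For reference, the provider response shape getProxyIp() above assumes.
# The ERRORCODE/RESULT keys and the 10032 quota code are taken from the
# parsing code; the contents of each RESULT item are hypothetical (whatever
# columns the law_proxy_ip table expects):
#
#   {"ERRORCODE": "0", "RESULT": [{"ip": "1.2.3.4", "port": "8080"}]}
#   {"ERRORCODE": "10032", ...}   # daily extraction quota exhausted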
def getWsContentHandle(self, thread_num, start_time_stamp, end_time_stamp,
                       start_date, end_date, docid, docid_i, insertRecordArr,
                       start_record):
    """
    :param thread_num: number of ExecuteCurl worker threads to spawn
    :param start_time_stamp: unix timestamp opening the crawl window
    :param end_time_stamp: unix timestamp closing the crawl window
    :param start_date: window start date, "%Y-%m-%d"
    :param end_date: window end date, "%Y-%m-%d"
    :param docid: court entry being crawled (dict with a 'name' key)
    :param docid_i: index of the court entry
    :param insertRecordArr: progress record persisted to ws_docid_record
    :param start_record: year marker stored as court_year
    :return:
    """
    while True:
        if self.ip_over == 1:
            break
        if start_time_stamp > end_time_stamp:
            break
        param = self.getUrlAndCookie(docid['name'], start_date, end_date)
        # Record the query conditions that have already been processed.
        insertArr = {
            "table": "ws_record",
            "id": "NEXT VALUE FOR LAW_WS_RECORD_SEQUENCE",
            "start_date": start_date,
            "end_date": end_date,
            "court_id": str(docid['name'])
        }
        # self.ws_db.insert(insertArr, False)
        insertRecordArr['table'] = "ws_docid_record"
        insertRecordArr['court_name'] = docid['name']
        insertRecordArr['court_num'] = docid_i
        insertRecordArr['start_date'] = start_date
        insertRecordArr['court_year'] = start_record
        self.ws_db.insert(insertRecordArr)
        # Advance the crawl window by the configured interval.
        start_date = end_date
        start_time_stamp = start_time_stamp + self.config['interval_time_stamp']
        end_date = getDateTime(start_time_stamp, "%Y-%m-%d")
        threads = list()
        for i in range(thread_num):
            tmpK = i + 1
            threads.append(
                ExecuteCurl("Thread " + str(tmpK) + ":" + str(tmpK), param,
                            self.proxy_ip, self.table_columns))
        for i in range(thread_num):
            threads[i].start()
        self.proxy_ip = threads[0].getLastProxyIp()
        for i in range(thread_num):
            threads[i].join()
        try:
            # Reset pagination for the next window.
            threads[0].reset()
        except Exception:
            pass
        try:
            status = threads[0].getStatus()
            if status == 2:
                debug("proxy ip daily extraction quota reached")
                break
        except Exception:
            pass
def handle_data(self, page_resource):
    """
    :param page_resource: raw HTML of a list page
    :return:
    """
    bs = BeautifulSoup(page_resource, "html.parser")
    li_list = self.__get_li_list(bs)
    lock.acquire()
    for item in li_list:
        insert_arr = dict()
        insert_arr['recipe_type_id'] = self.category['id']
        insert_arr['status'] = 0
        insert_arr['img_url'] = self.__get_img_url(item)
        insert_arr['url'] = self.__get_url(item)
        insert_arr['introduce'] = self.__get_introduce(item)
        insert_arr['page_views'] = self.__get_page_views(item)
        insert_arr['name'] = self.__get_name(item)
        sql = self.db.getInsertSql(insert_arr, table="list",
                                   table_columns=self.table_columns)
        result = self.db.insert(sql, is_close_db=False)
        if result == 1:
            debug("insert succeeded")
        else:
            debug("insert failed")
    lock.release()
def __get_url_curl(self, post, referer):
    """
    :param post: form payload for the procesar_player endpoint
    :param referer: detail-page URL used as the Referer header
    :return:
    """
    headers = {
        "user-agent": getUserAgent(),
        "origin": "https://www.pelisplay.tv",
        "referer": referer
    }
    url = "https://www.pelisplay.tv/entradas/procesar_player"
    data = curlData(url, value=post, cookie=self.cookie, header=headers)
    try:
        data = json.loads(data)
    except Exception as e:
        # Refresh the cookie under the lock, then retry up to three times.
        lock.acquire()
        self.get_cookie()
        lock.release()
        if self.cookie_get_num < 3:
            return self.__get_url_curl(post, referer=referer)
        data = {"estado": 500}
        debug("failed to fetch play URL, error: {error}".format(error=e))
    if data['estado'] == 200:
        data = data['data']
    else:
        data = ""
    self.cookie_get_num = 0
    return data
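# Note: the retry guard above relies on self.cookie_get_num being incremented
# elsewhere (presumably inside get_cookie()). A minimal iterative sketch of
# the same fetch-parse-retry flow, under that assumption:
#
#   for _ in range(3):
#       raw = curlData(url, value=post, cookie=self.cookie, header=headers)
#       try:
#           data = json.loads(raw)
#           break
#       except ValueError:
#           with lock:
#               self.get_cookie()
#   else:
#       data = {"estado": 500}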
def __get_url(self, bs, page_resource, item):
    """
    :param bs: parsed detail page
    :param page_resource: raw HTML of the detail page
    :param item: list row carrying the detail-page 'url'
    :return:
    """
    url = bs.find("tbody", attrs={"id": "servidores_online"})
    url_str = ""
    try:
        token = re.findall(r'window.laravel_token = "([\w\W]*?)";',
                           str(page_resource))[0]
    except Exception as e:
        debug("failed to get movie play URL _token, error: {error}".format(error=e))
        return url_str
    try:
        url = url.find_all("tr")
        for k, v in enumerate(url):
            if k < 1:
                # Skip the header row.
                continue
            data = v.find("div", attrs={
                "class": "embedplayer"
            }).attrs['data-player']
            post = {"data": data, "tipo": "videohost", "_token": token}
            url_str = self.__get_url_curl(post, referer=item['url'])
            break
    except Exception as e:
        url_str = ""
        debug("failed to get movie play URL, error: {error}".format(error=e))
    return url_str
def __handle_data(self, item):
    update_data = dict()
    update_data['status'] = 1
    update_data['video_src'] = self.__get_video_src(item)
    debug(update_data['video_src'])
    self.__update_data(item['id'], update_data)
    return {"code": 0}
def saveCookie(self):
    cookie = self.getCookie()
    cookie = json.dumps(cookie)
    debug("cookie obtained this time: %s" % cookie, True)
    insertArr = {"id": 1, "table": "ws_docid_cookie", "cookie": cookie}
    self.ws_db.insert(insertArr, is_close_db=False)
def getData(self, docid):
    self.resetProxy()
    url = "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID=" + docid
    referer = "http://wenshu.court.gov.cn/content/content?DocID=" + docid + "&KeyWord="
    header_1 = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    }
    while True:
        try:
            data = curlData(url, referer=referer, header=header_1,
                            proxy_ip=self.proxy_ip, timeout=5)
            break
        except Exception:
            debug("data fetch failed, retrying", True)
            self.resetProxy()
    # Extract the embedded dirData object and hand it to the local parser service.
    tmp = re.findall(r"dirData = ([\w\W]*?)};", data)[0] + "}"
    tmp = curlData("http://127.0.0.1:3000/handleFlfg", {"data": tmp})
    tmp = json.loads(tmp)
    # noinspection PyBroadException
    try:
        tmp['legislative_authority'] = json.dumps(tmp['legislative_authority'])
    except Exception:
        tmp['legislative_authority'] = ""
    return self.handleWsData(data, tmp)
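# A minimal, self-contained demo of the dirData extraction used in getData()
# above; the page payload here is hypothetical, only the "dirData = ...};"
# framing is taken from the real code:
def _demo_extract_dirdata():
    import re
    page = 'var dirData = {"Title": "..."}; var other = 1;'
    # Non-greedy match up to the first "};", then restore the closing brace.
    return re.findall(r"dirData = ([\w\W]*?)};", page)[0] + "}"  # '{"Title": "..."}'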
def getCookie(self):
    url = self.getUrl()
    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
    }
    try:
        cookie = getCookie(url, referer="http://wenshu.court.gov.cn",
                           header=header, proxy_ip=self.proxy_ip, timeout=5)
        try:
            # Ensure the vjkl5 cookie is present; any failure triggers a retry.
            vjkl5 = cookie['vjkl5']
        except Exception as e:
            debug("cookie fetch failed, retrying: %s" % e.__str__(), True)
            sleep(3)
            return self.getCookie()
        return cookie
    except Exception as e:
        # Rotate the proxy after several consecutive failures.
        if self.is_change_proxy > 4:
            self.resetProxyIp()
            self.is_change_proxy = 0
        else:
            self.is_change_proxy = self.is_change_proxy + 1
        if e.__str__().find("HTTPConnectionPool") != -1:
            debug("cookie fetch failed with an HTTPConnectionPool error; fetching a new ip and retrying")
            return self.getCookie()
        else:
            return 0
def __get_name(cls, item):
    name = item.find("h3")
    try:
        name = name.get_text().strip()
    except Exception as e:
        name = ""
        debug("failed to get recipe name, error: {error}".format(error=e))
    return name
def __get_title(cls, item):
    title = item.find("div", attrs={"class": "Title"})
    try:
        title = title.get_text().strip()
    except Exception as e:
        title = ""
        debug("failed to get movie title, error: {error}".format(error=e))
    return title
def __get_introduce(cls, item):
    introduce = item.find_all("span")
    try:
        introduce = introduce[0].get_text().strip()
    except Exception as e:
        introduce = ""
        debug("failed to get introduction, error: {error}".format(error=e))
    return introduce
def __get_url(cls, item):
    url = item.find("a")
    try:
        url = url.attrs['href']
    except Exception as e:
        url = ""
        debug("failed to get detail-page URL, error: {error}".format(error=e))
    return url
def __get_img_url(cls, item):
    img_url = item.find("img")
    try:
        img_url = img_url.attrs['src']
    except Exception as e:
        img_url = ""
        debug("failed to get image URL, error: {error}".format(error=e))
    return img_url
def get_cookie(self):
    header = {
        "User-Agent": getUserAgent(),
        # "Cache-Control": "max-age=0",
        # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
    }
    url = "https://www.pelisplay.tv/"
    self.cookie = getCookie(url, header=header)
    debug(self.cookie)
def handle(self):
    data = self.get_data()
    for item in data:
        result = self.__handle(item)
        if (result['result_1'] == 1 and result['result_2'] == 1
                and result['result_3'] == 1):
            debug(item['img_url'])
        else:
            break
def __move_data(self, data):
    for item in data:
        content = self.get_content_by_list_id(item['id'])
        try:
            content = content[0]
            content['status'] = 0
            self.__insert_data(content)
        except Exception as e:
            debug(e)
def __get_img_url(cls, bs):
    # img_url = bs.find("div", attrs={"class": "ytp-cued-thumbnail-overlay-image"})
    try:
        # img_url = img_url.attrs['style']
        img_url = re.findall(r'"image": "([\w\W]*?)"', str(bs))[0]
    except Exception as e:
        img_url = ""
        debug("failed to get recipe image URL, error: {error}".format(error=e))
    return img_url
def __get_recipe_page(self):
    """
    :return:
    """
    for item in self.category:
        url = CommonFunc().generate_url(category=item['keyword'])
        try:
            self.__get_recipe_page_data(url, item['id'])
        except Exception as e:
            debug("failed to fetch page count, error: {error}".format(error=e))
def run(self):
    global num
    while True:
        # Hold the lock for the whole check-and-increment so the counter
        # cannot be updated by two threads at once.
        with mylock:
            if num > 10:
                break
            num = num + 1
            debug(self.name + ": got num value " + str(num))
def __get_page_views(cls, item):
    page_views = item.find_all("span")
    try:
        page_views = page_views[1].get_text().strip()
        # Strip the thousands separator and the " Plays" suffix.
        page_views = page_views.replace(",", "")
        page_views = page_views.replace(" Plays", "")
    except Exception as e:
        page_views = 0
        debug("failed to get page views, error: {error}".format(error=e))
    return page_views
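# Worked example of the cleanup in __get_page_views() above (value is
# illustrative):
#   "1,234 Plays" -> "1234 Plays" -> "1234"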
def a_test():
    threads = list()
    for k in range(3):
        tmpK = k + 1
        threads.append(TestThread("Thread " + str(tmpK)))
    for k in range(3):
        threads[k].start()
    for k in range(3):
        threads[k].join()
    threads[0].setNum()
    debug("done")
def __get_category_img_src(cls, item):
    """
    :param item:
    :return:
    """
    category_img_src = item.find("img")
    try:
        category_img_src = category_img_src.attrs['src']
    except Exception as e:
        category_img_src = ""
        debug("failed to get image address, error: {error}".format(error=e))
    return category_img_src
def __get_star(cls, item):
    """
    :param item:
    :return:
    """
    star = item.find("span", attrs={"class": "qualification"})
    try:
        star = star.get_text().strip()
    except Exception as e:
        star = ""
        debug("failed to get movie rating, error: {error}".format(error=e))
    return star
def __get_url(cls, item):
    """
    :param item:
    :return:
    """
    url = item.find("a")
    try:
        url = url.attrs['href']
    except Exception as e:
        url = ""
        debug("failed to get movie detail URL, error: {error}".format(error=e))
    return url
def __get_origin_src(cls, item):
    """
    :param item:
    :return:
    """
    origin_src = item.find_all("img")
    try:
        origin_src = origin_src[1].attrs['src']
    except Exception as e:
        origin_src = ""
        debug("failed to get movie origin icon, error: {error}".format(error=e))
    return origin_src
def __get_img_src(cls, item):
    """
    :param item:
    :return:
    """
    img_src = item.find("img")
    try:
        img_src = img_src.attrs['src']
    except Exception as e:
        img_src = ""
        debug("failed to get movie cover image, error: {error}".format(error=e))
    return img_src
def __download(self, data):
    for item in data:
        url = "https://www.youtube.com/watch?v=%s" % item['video_id']
        debug("start fetching --> {video_id}".format(video_id=item['video_id']))
        try:
            youtube = YouTube(url)
            youtube.streams.filter(subtype="mp4").first().download(
                "/Users/cpx/code/py/recipe/data/recipe/",
                filename=item['video_id'])
            self.__update_data(item['id'])
        except Exception as e:
            debug(e)
def __handle(self, item):
    img_url = item['img_url']
    try:
        s = re.findall(r'squarethumbnails/([\w\W]*.)', img_url)[0]
    except Exception as e:
        s = ''
        debug(e)
    if s == '':
        return
    # Swap the square-thumbnail path for the large-thumbnail one.
    s = 'http://www.laurainthekitchen.com/largethumbnails/' + s
    result = self.__update_data(s, item)
    return result
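# Worked example of the thumbnail rewrite in __handle() above (file name is
# illustrative):
#   in : http://www.laurainthekitchen.com/squarethumbnails/abc123.jpg
#   out: http://www.laurainthekitchen.com/largethumbnails/abc123.jpg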
def __get_category_url(cls, item):
    """
    :param item:
    :return:
    """
    category_url = item.find("a")
    try:
        category_url = category_url.attrs['href']
    except Exception as e:
        category_url = ""
        debug("failed to get category URL, error: {error}".format(error=e))
    return category_url