Пример #1
0
 def getProxyIp(self):
     """
     Fetch proxy IPs and store them in the database (``proxy_ip`` table).

     The body returned by ``self.config['proxy_ip_url']`` is parsed as JSON.
     Every entry of its ``RESULT`` list is tagged with an id expression, the
     target table and the current timestamp, then inserted through
     ``self.ws_db``. A failed insert is logged and does not stop the rest.

     Any failure while fetching or parsing is logged and followed by a
     5 second sleep.

     :return: 1 when the response carries ERRORCODE 10032 (logged as "daily
              extraction quota reached"; ``self.ip_over`` is set to 1),
              otherwise None.
     """
     url = self.config['proxy_ip_url']
     try:
         data = curlData(url, timeout=5)
     except Exception:
         # An empty body makes json.loads below raise, which routes a failed
         # fetch into the shared error handler at the bottom.
         data = ""
     try:
         data = json.loads(data)
         if str(data['ERRORCODE']) == "10032":
             debug("proxy ip 今日提取量已达上限,结束程序", True)
             self.ip_over = 1
             return 1
         for position, row in enumerate(data['RESULT'], start=1):
             row['id'] = "NEXT VALUE FOR LAW_PROXY_IP_SEQUENCE"
             row['table'] = "proxy_ip"
             row['time_stamp'] = str(getNowTimeStamp())
             try:
                 # NOTE: the original marked this insert as a place to
                 # lock/unlock, but no lock is actually taken.
                 self.ws_db.insert(row, is_close_db=False)
                 debug("第" + str(position) + "条ip插入成功")
             except Exception as e:
                 debug("第" + str(position) + "条ip插入失败")
                 # Surface the cause instead of discarding it.
                 debug(e)
     except Exception as e:
         debug(e)
         debug("proxy ip 获取出错,睡眠5秒")
         sleep(5)
Пример #2
0
 def getWsContentHandle(self, thread_num, start_time_stamp, end_time_stamp, start_date, end_date, docid, docid_i,
                        insertRecordArr, start_record):
     """
     Crawl a time range window by window using a batch of worker threads.

     Each iteration builds the request parameters for the current
     ``start_date``/``end_date`` window, records progress through
     ``self.ws_db``, advances the window by
     ``self.config['interval_time_stamp']`` and then runs ``thread_num``
     ``ExecuteCurl`` threads to completion.

     The loop stops when ``self.ip_over`` is 1, when ``start_time_stamp``
     passes ``end_time_stamp``, or when the first worker reports status 2.

     :param thread_num: number of ExecuteCurl threads started per window
     :param start_time_stamp: timestamp of the current window start
     :param end_time_stamp: timestamp at which the crawl stops
     :param start_date: start date of the current window
     :param end_date: end date of the current window
     :param docid: mapping whose ``'name'`` entry identifies the court
     :param docid_i: value stored as ``court_num`` in the progress record
     :param insertRecordArr: dict that is updated in place and inserted as
                             the progress record on every iteration
     :param start_record: value stored as ``court_year`` in the progress record
     :return: None
     """
     while True:
         if self.ip_over == 1:
             break
         if start_time_stamp > end_time_stamp:
             break
         param = self.getUrlAndCookie(docid['name'], start_date, end_date)
         # Persist the window that is about to be processed.
         insertRecordArr['table'] = "ws_docid_record"
         insertRecordArr['court_name'] = docid['name']
         insertRecordArr['court_num'] = docid_i
         insertRecordArr['start_date'] = start_date
         insertRecordArr['court_year'] = start_record
         self.ws_db.insert(insertRecordArr)
         # Slide the window forward; `param` above still describes the old one.
         start_date = end_date
         start_time_stamp = start_time_stamp + self.config['interval_time_stamp']
         end_date = getDateTime(start_time_stamp, "%Y-%m-%d")
         threads = [
             ExecuteCurl("线程" + str(n) + ":" + str(n), param, self.proxy_ip, self.table_columns)
             for n in range(1, thread_num + 1)
         ]
         for worker in threads:
             worker.start()
             # Kept inside the loop as in the original: the proxy is re-read
             # from the first worker after every start.
             self.proxy_ip = threads[0].getLastProxyIp()
         for worker in threads:
             worker.join()
         try:
             # Reset the page counter.
             threads[0].reset()
         except Exception as e:
             # Best effort (also covers thread_num == 0); log, don't hide.
             debug(e)
         try:
             status = threads[0].getStatus()
             if status == 2:
                 debug("代理ip今日提取数已达上线")
                 break
         except Exception as e:
             debug(e)
Пример #3
0
    def handle_data(self, page_resource):
        """
        Parse a listing page and insert one row into table ``list`` per entry.

        The module-level ``lock`` is held for the whole insert loop and is
        released in a ``finally`` clause, so an exception raised while
        extracting or inserting can no longer leave it locked.

        :param page_resource: HTML source of the listing page
        :return: None
        """
        bs = BeautifulSoup(page_resource, "html.parser")
        li_list = self.__get_li_list(bs)
        lock.acquire()
        try:
            for item in li_list:
                insert_arr = {
                    'recipe_type_id': self.category['id'],
                    'status': 0,
                    'img_url': self.__get_img_url(item),
                    'url': self.__get_url(item),
                    'introduce': self.__get_introduce(item),
                    'page_views': self.__get_page_views(item),
                    'name': self.__get_name(item),
                }
                sql = self.db.getInsertSql(insert_arr,
                                           table="list",
                                           table_columns=self.table_columns)
                result = self.db.insert(sql, is_close_db=False)
                # A result of 1 is reported as success, anything else as failure.
                if result == 1:
                    debug("插入成功")
                else:
                    debug("插入失败")
        finally:
            lock.release()
Пример #4
0
 def __get_url_curl(self, post, referer):
     """
     POST the player request and return the ``data`` field of the JSON reply.

     When the reply is not valid JSON the cookie is refreshed (under the
     module-level ``lock``) and the request is retried while
     ``self.cookie_get_num`` is below 3; after that the call is treated as
     a failure with ``estado`` 500.

     :param post: form data sent with the request
     :param referer: value sent as the ``referer`` header
     :return: ``data['data']`` when ``estado`` is 200, otherwise ""
     """
     headers = {
         "user-agent": getUserAgent(),
         "origin": "https://www.pelisplay.tv",
         "referer": referer
     }
     url = "https://www.pelisplay.tv/entradas/procesar_player"
     data = curlData(url, value=post, cookie=self.cookie, header=headers)
     try:
         data = json.loads(data)
     except Exception as e:
         lock.acquire()
         try:
             self.get_cookie()
         finally:
             # Always release, even if refreshing the cookie raises.
             lock.release()
         # NOTE(review): nothing in this method increments cookie_get_num;
         # presumably get_cookie or a caller does. Confirm, otherwise this
         # recursion is unbounded.
         if self.cookie_get_num < 3:
             return self.__get_url_curl(post, referer=referer)
         data = {"estado": 500}
         debug("播放链接获取出错,错误信息:{error}".format(error=e))
     if data['estado'] == 200:
         data = data['data']
     else:
         data = ""
     self.cookie_get_num = 0
     return data
Пример #5
0
 def __get_url(self, bs, page_resource, item):
     """
     Resolve the playback URL for a movie page.

     The ``window.laravel_token`` value is extracted from the page source,
     then the ``data-player`` attribute of the second ``<tr>`` in the
     ``servidores_online`` table body is posted via ``__get_url_curl``.

     :param bs: parsed page (object exposing ``find``)
     :param page_resource: raw page source searched for the token
     :param item: mapping whose ``'url'`` is sent as the referer
     :return: the value returned by ``__get_url_curl``, or "" when the
              token or the player row cannot be obtained
     """
     servers = bs.find("tbody", attrs={"id": "servidores_online"})
     url_str = ""
     try:
         # Raw string: same pattern, without invalid "\w" escape warnings.
         token = re.findall(r'window.laravel_token = "([\w\W]*?)";',
                            str(page_resource))[0]
     except Exception as e:
         debug("电影播放链接 _token 获取出错,出错信息:{error}".format(error=e))
         return url_str
     try:
         rows = servers.find_all("tr")
         # Row 0 is skipped (presumably a header row; confirm) and only
         # the first row after it is used.
         if len(rows) > 1:
             data = rows[1].find("div", attrs={
                 "class": "embedplayer"
             }).attrs['data-player']
             post = {"data": data, "tipo": "videohost", "_token": token}
             url_str = self.__get_url_curl(post, referer=item['url'])
     except Exception as e:
         url_str = ""
         debug("电影播放链接获取出错,出错信息:{error}".format(error=e))
     return url_str
Пример #6
0
 def __handle_data(self, item):
     """
     Resolve the video source for ``item`` and store it with status 1.

     :param item: mapping with an ``'id'`` entry; also passed to
                  ``__get_video_src``
     :return: ``{"code": 0}``
     """
     video_src = self.__get_video_src(item)
     update_data = {'status': 1, 'video_src': video_src}
     debug(video_src)
     self.__update_data(item['id'], update_data)
     return {"code": 0}
Пример #7
0
    def saveCookie(self):
        """
        Fetch a cookie, log it and insert it into ``ws_docid_cookie`` as JSON.

        :return: None
        """
        serialized = json.dumps(self.getCookie())
        debug("此次获取到的cookie为: %s" % serialized, True)
        record = {"id": 1, "table": "ws_docid_cookie", "cookie": serialized}
        self.ws_db.insert(record, is_close_db=False)
Пример #8
0
 def getData(self, docid):
     """
     Download one document and hand it to ``handleWsData``.

     The content script is fetched through the current proxy and retried
     (with a fresh proxy each time) until a request succeeds. The
     ``dirData`` object found in the response is sent to the local
     ``handleFlfg`` service and its JSON reply is decoded.

     :param docid: document id appended to the request and referer URLs
     :return: whatever ``self.handleWsData(data, tmp)`` returns
     :raises IndexError: when the response contains no ``dirData`` object
     """
     self.resetProxy()
     url = "http://wenshu.court.gov.cn/CreateContentJS/CreateContentJS.aspx?DocID=" + docid
     referer = "http://wenshu.court.gov.cn/content/content?DocID=" + docid + "&KeyWord="
     header_1 = {
         "User-Agent":
         "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
     }
     # NOTE: unbounded by design in the original; retries until one succeeds.
     while True:
         try:
             data = curlData(url,
                             referer=referer,
                             header=header_1,
                             proxy_ip=self.proxy_ip,
                             timeout=5)
             break
         except Exception:
             debug("数据获取出错,重新获取", True)
             self.resetProxy()
     # Raw string: same pattern, without invalid "\w" escape warnings.
     tmp = re.findall(r"dirData = ([\w\W]*?)};", data)
     # The non-greedy match stops before the closing brace; put it back.
     tmp = tmp[0] + "}"
     tmp = curlData("http://127.0.0.1:3000/handleFlfg", {"data": tmp})
     tmp = json.loads(tmp)
     try:
         tmp['legislative_authority'] = json.dumps(
             tmp['legislative_authority'])
     except Exception:
         # Missing or unserializable value: store an empty string instead.
         tmp['legislative_authority'] = ""
     return self.handleWsData(data, tmp)
Пример #9
0
 def getCookie(self):
     """
     Request cookies for ``self.getUrl()`` through the current proxy.

     A response without a ``vjkl5`` entry is retried after 3 seconds. On a
     request failure ``self.is_change_proxy`` is incremented, and once it
     exceeds 4 the proxy is reset and the counter cleared. Failures whose
     message mentions ``HTTPConnectionPool`` are retried.

     :return: the cookie mapping, or 0 for any other failure
     """
     target = self.getUrl()
     request_header = {
         "User-Agent":
         "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36"
     }
     try:
         jar = getCookie(target,
                         referer="http://wenshu.court.gov.cn",
                         header=request_header,
                         proxy_ip=self.proxy_ip,
                         timeout=5)
         try:
             # Lookup only: verifies that the expected cookie is present.
             _ = jar['vjkl5']
         except Exception as missing:
             debug("cookie获取出错,重新获取 %s" % str(missing), True)
             sleep(3)
             return self.getCookie()
         return jar
     except Exception as failure:
         if self.is_change_proxy > 4:
             self.resetProxyIp()
             self.is_change_proxy = 0
         else:
             self.is_change_proxy += 1
         if "HTTPConnectionPool" in str(failure):
             debug("cookie获取出错,HttpConnect错误,重新获取ip并重新获取")
             return self.getCookie()
         return 0
Пример #10
0
 def __get_name(cls, item):
     """
     Return the stripped text of the first ``h3`` under ``item``.

     :param item: parsed element exposing ``find``
     :return: the text, or "" (after logging) when it cannot be read
     """
     heading = item.find("h3")
     try:
         return heading.get_text().strip()
     except Exception as err:
         debug("菜谱名获取出错,错误信息:{error}".format(error=err))
         return ""
Пример #11
0
 def __get_title(cls, item):
     """
     Return the stripped text of the first ``div`` with class ``Title``.

     :param item: parsed element exposing ``find``
     :return: the text, or "" (after logging) when it cannot be read
     """
     title_node = item.find("div", attrs={"class": "Title"})
     try:
         return title_node.get_text().strip()
     except Exception as err:
         debug("电影标题获取出错,出错信息:{error}".format(error=err))
         return ""
Пример #12
0
 def __get_introduce(cls, item):
     """
     Return the stripped text of the first ``span`` under ``item``.

     :param item: parsed element exposing ``find_all``
     :return: the text, or "" (after logging) when it cannot be read
     """
     spans = item.find_all("span")
     try:
         return spans[0].get_text().strip()
     except Exception as err:
         debug("介绍获取出错,错误信息:{error}".format(error=err))
         return ""
Пример #13
0
 def __get_url(cls, item):
     """
     Return the ``href`` of the first ``a`` under ``item``.

     :param item: parsed element exposing ``find``
     :return: the link, or "" (after logging) when it cannot be read
     """
     anchor = item.find("a")
     try:
         return anchor.attrs['href']
     except Exception as err:
         debug("详情页链接获取出错,错误信息:{error}".format(error=err))
         return ""
Пример #14
0
 def __get_img_url(cls, item):
     """
     Return the ``src`` of the first ``img`` under ``item``.

     :param item: parsed element exposing ``find``
     :return: the image URL, or "" (after logging) when it cannot be read
     """
     image = item.find("img")
     try:
         return image.attrs['src']
     except Exception as err:
         debug("图片链接获取出错,错误信息:{error}".format(error=err))
         return ""
Пример #15
0
 def get_cookie(self):
     """
     Fetch cookies from the pelisplay.tv home page into ``self.cookie``.

     Only a ``User-Agent`` header is sent. The resulting cookie is logged.

     :return: None
     """
     request_header = {"User-Agent": getUserAgent()}
     home_page = "https://www.pelisplay.tv/"
     self.cookie = getCookie(home_page, header=request_header)
     debug(self.cookie)
Пример #16
0
 def handle(self):
     """
     Process the items from ``get_data`` until one is not fully successful.

     An item counts as successful when ``result_1``, ``result_2`` and
     ``result_3`` of its ``__handle`` result all equal 1; its ``img_url``
     is then logged. The first item that does not meet this stops the loop.

     :return: None
     """
     required = ('result_1', 'result_2', 'result_3')
     for item in self.get_data():
         result = self.__handle(item)
         if not all(result[key] == 1 for key in required):
             break
         debug(item['img_url'])
Пример #17
0
 def __move_data(self, data):
     """
     Re-insert the first content row of every list entry with status 0.

     Failures for an entry (for example no content rows) are logged and the
     loop moves on to the next entry.

     :param data: iterable of mappings with an ``'id'`` entry
     :return: None
     """
     for entry in data:
         rows = self.get_content_by_list_id(entry['id'])
         try:
             first = rows[0]
             first['status'] = 0
             self.__insert_data(first)
         except Exception as err:
             debug(err)
Пример #18
0
 def __get_img_url(cls, bs):
     """
     Return the first ``"image": "..."`` value found in the page text.

     :param bs: parsed page; it is converted with ``str`` before searching
     :return: the captured value, or "" (after logging) when nothing matches
     """
     try:
         # Raw string: same pattern, without invalid "\w" escape warnings.
         img_url = re.findall(r'"image": "([\w\W]*?)"', str(bs))[0]
     except Exception as e:
         img_url = ""
         debug("菜谱图片链接获取出错,错误信息:{error}".format(error=e))
     return img_url
Пример #19
0
 def __get_recipe_page(self):
     """
     Fetch page data for every category in ``self.category``.

     A URL is generated from each category's ``'keyword'``. A failure while
     fetching one category is logged and the remaining ones still run.

     :return: None
     """
     for category in self.category:
         page_url = CommonFunc().generate_url(category=category['keyword'])
         try:
             self.__get_recipe_page_data(page_url, category['id'])
         except Exception as err:
             debug("页面数量抓取出错,出错信息:{error}".format(error=err))
Пример #20
0
 def run(self):
     """
     Increment the global ``num`` under ``mylock`` until it exceeds 10.

     Every increment is logged together with ``self.name``. The lock is
     released in a ``finally`` clause so that neither the early exit nor an
     exception while logging can leave it held.

     :return: None
     """
     global num
     while True:
         mylock.acquire()
         try:
             if num > 10:
                 break
             num = num + 1
             debug(self.name + ": 得到num值为" + str(num))
         finally:
             mylock.release()
Пример #21
0
 def __get_page_views(cls, item):
     """
     Return the text of the second ``span`` with commas and " Plays" removed.

     :param item: parsed element exposing ``find_all``
     :return: the cleaned text (a string), or the integer 0 (after logging)
              when it cannot be read
     """
     spans = item.find_all("span")
     try:
         raw = spans[1].get_text().strip()
         # Drop thousands separators, then the trailing unit label.
         return raw.replace(",", "").replace(" Plays", "")
     except Exception as err:
         debug("浏览量获取出错,错误信息:{error}".format(error=err))
         return 0
Пример #22
0
def a_test():
    """
    Start three ``TestThread`` workers, wait for them, then call ``setNum``.

    ``setNum`` is called on the first worker only, followed by a final log
    line.
    """
    workers = [TestThread("线程" + str(index)) for index in range(1, 4)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    workers[0].setNum()
    debug("完毕")
Пример #23
0
 def __get_category_img_src(cls, item):
     """
     Return the ``src`` of the first ``img`` under ``item``.

     :param item: parsed element exposing ``find``
     :return: the image address, or "" (after logging) when it cannot be read
     """
     image = item.find("img")
     try:
         return image.attrs['src']
     except Exception as err:
         debug("图片地址获取失败,错误信息:{error}".format(error=err))
         return ""
Пример #24
0
 def __get_star(cls, item):
     """
     Return the stripped text of the first ``span`` with class ``qualification``.

     :param item: parsed element exposing ``find``
     :return: the text, or "" (after logging) when it cannot be read
     """
     rating = item.find("span", attrs={"class": "qualification"})
     try:
         return rating.get_text().strip()
     except Exception as err:
         debug("电影短述获取出错,出错信息:{error}".format(error=err))
         return ""
Пример #25
0
 def __get_url(cls, item):
     """
     Return the ``href`` of the first ``a`` under ``item``.

     :param item: parsed element exposing ``find``
     :return: the link, or "" (after logging) when it cannot be read
     """
     anchor = item.find("a")
     try:
         return anchor.attrs['href']
     except Exception as err:
         debug("电影详情链接地址获取出错,出错信息:{error}".format(error=err))
         return ""
Пример #26
0
 def __get_origin_src(cls, item):
     """
     Return the ``src`` of the second ``img`` under ``item``.

     :param item: parsed element exposing ``find_all``
     :return: the image address, or "" (after logging) when it cannot be read
     """
     images = item.find_all("img")
     try:
         return images[1].attrs['src']
     except Exception as err:
         debug("电影小图标获取出错,出错信息:{error}".format(error=err))
         return ""
Пример #27
0
 def __get_img_src(cls, item):
     """
     Return the ``src`` of the first ``img`` under ``item``.

     :param item: parsed element exposing ``find``
     :return: the image address, or "" (after logging) when it cannot be read
     """
     image = item.find("img")
     try:
         return image.attrs['src']
     except Exception as err:
         debug("电影封面图获取出错,出错信息:{error}".format(error=err))
         return ""
Пример #28
0
 def __download(self, data, output_dir="/Users/cpx/code/py/recipe/data/recipe/"):
     """
     Download the first mp4 stream of every video and mark the row as done.

     A failure for one video is logged and the remaining videos still run.

     :param data: iterable of mappings with ``'video_id'`` and ``'id'`` entries
     :param output_dir: directory the files are written to; defaults to the
                        path that was previously hard-coded
     :return: None
     """
     for item in data:
         url = "https://www.youtube.com/watch?v=%s" % item['video_id']
         debug("开始抓取:--> {video_id}".format(video_id=item['video_id']))
         try:
             youtube = YouTube(url)
             youtube.streams.filter(subtype="mp4").first().download(
                 output_dir,
                 filename=item['video_id'])
             # Only reached when the download did not raise.
             self.__update_data(item['id'])
         except Exception as e:
             debug(e)
Пример #29
0
 def __handle(self, item):
     """
     Rewrite a square-thumbnail URL to its large-thumbnail counterpart.

     Everything after ``squarethumbnails/`` in ``item['img_url']`` is
     appended to the ``largethumbnails`` base URL and passed to
     ``__update_data``.

     :param item: mapping with an ``'img_url'`` entry
     :return: the result of ``__update_data``, or None when ``img_url``
              contains no ``squarethumbnails/`` part
     """
     img_url = item['img_url']
     try:
         # Raw string: same pattern, without invalid escape warnings.
         s = re.findall(r'squarethumbnails\/([\w\W]*.)', img_url)[0]
     except Exception as e:
         s = ''
         debug(e)
     if s == '':
         return None
     s = 'http://www.laurainthekitchen.com/largethumbnails/' + s
     result = self.__update_data(s, item)
     return result
Пример #30
0
 def __get_category_url(cls, item):
     """
     Return the ``href`` of the first ``a`` under ``item``.

     :param item: parsed element exposing ``find``
     :return: the link, or "" (after logging) when it cannot be read
     """
     anchor = item.find("a")
     try:
         return anchor.attrs['href']
     except Exception as err:
         debug("分类url链接获取失败,错误信息:{error}".format(error=err))
         return ""