Exemplo n.º 1
0
 def executeByInsOrUpdOrDel(self,
                            connection,
                            cursor,
                            sql: str,
                            values=None):
     """
     Execute a single INSERT/UPDATE/DELETE SQL statement.

     :param connection: database connection object
     :param cursor: MySQL cursor
     :param sql: the SQL statement to execute
     :param values: data to bind into the statement; a tuple runs
         ``cursor.execute`` once, a list runs ``cursor.executemany``
     :return: the rows fetched after committing on success, ``[]`` when a
         required argument (or ``values``) is missing, ``None`` on error
     """
     try:
         res = []
         if connection and cursor and sql:
             if values:
                 if isinstance(values, tuple):
                     cursor.execute(sql, values)
                 elif isinstance(values, list):
                     cursor.executemany(sql, values)
                 connection.commit()
                 res = cursor.fetchall()
                 logger.info("操作成功")
                 logger.info("执行结果:{}".format(res))
                 return res
             logger.info("错误:")
             return res
         logger.info("None")
         return res
     except Exception as e:
         logger.error("操作失败:{}".format(e))
         # Bug fix: the pymysql API is ``rollback`` -- the original
         # ``connection.roback()`` raised AttributeError and masked the
         # real error. Also guard against ``connection`` being None.
         if connection:
             connection.rollback()
         return None
Exemplo n.º 2
0
 def get(self):
     """
     Handle GET requests for the search API.

     Dispatches on the ``key`` query parameter; when it equals
     ``"search"``, the ``search`` parameter is fed to ``ZeroCrawler`` and
     its results are returned.

     :return: a dict with ``code``/``msg``/``data`` keys
     """
     try:
         key = request.args.get("key", None)
         if key == "search":
             values = request.args.get("search", None)
             if values:
                 search_res = ZeroCrawler(values).start()
                 res = {"code": 200, "msg": "OK", "data": search_res}
                 logger.info("请求成功:{}".format(res))
                 return res
             res = {"code": 1002, "msg": "error", "data": []}
             logger.info("请求失败:{}".format(res))
             return res
         res = {"code": 1000, "msg": "不做处理", "data": []}
         logger.info("请求成功:{}".format(res))
         return res
     except Exception as e:
         logger.error(e)
         # Bug fix: returning None from a Flask view raises an empty 500;
         # return an explicit error envelope instead. (Also removed the
         # stray debug print of the crawl result.)
         return {"code": 1001, "msg": "server error", "data": []}
Exemplo n.º 3
0
    def download_page(self, item):
        """
        Download the page for *item* (unless already cached) and forward
        the item to the next processing queue.

        Page type markers carried on the item:
        0 = index page, 1 = global list page, 2 = download-link page.
        """
        item = MyDict(item)
        try:
            print(f" download_page info: {item.url}")
            logger.info("开始下载网页!{}。类型:{}".format(item.url, item.type))
            old_type = item.type
            already_have = item.url == "None" or cache.get(item.url)
            if not already_have:
                response = requests.get(item.url, headers=self.headers)
                page_text = response.content.decode("gbk")
                item.content = page_text
                print(len(page_text))
                # Store the raw HTML in redis and attach an expiry so stale
                # pages are re-fetched eventually.
                cache.set(item.url, page_text)
                cache.expire(item.url, self.redis_key_timeout)
            logger.info("下载前类型:{}, 下载后类型:{}".format(old_type, item.type))
            self.push_item_in_redis_list(self.message_b, item)
        except Exception as e:
            logger.error("download_page:  {}".format(e))
Exemplo n.º 4
0
    def executeBySelect(self, connection, cursor, sql: str):
        """
        Run a SELECT statement and return all fetched rows.

        :param connection: database connection object
        :param cursor: database cursor
        :param sql: the SELECT statement to execute
        :return: the fetched rows; ``[]`` when a required argument is
            missing or an error occurs
        """
        rows = []
        try:
            if not (connection and cursor and sql):
                logger.info("缺少关键参数")
                return rows
            cursor.execute(sql)
            rows = cursor.fetchall()
            logger.info("操作成功")
            logger.info("执行结果:{}".format(rows))
            return rows
        except Exception as e:
            logger.error(e)
            return []
Exemplo n.º 5
0
 def get(self):
     """
     GET API endpoint that launches the requested crawler.

     The ``request_type`` query parameter selects the crawler class to
     run via ``self.processes``.

     :return: a dict with ``code``/``msg``/``data`` keys, or an error
         string when the HTTP method is not GET
     """
     if request.method != "GET":
         logger.error("error, method not allowed!")
         return "error, method not allowed!"
     # Dispatch table: request_type -> crawler class.
     crawlers = {
         "download_dytt": DyHeavenCrawler,
         "download_proxy": ProxyCrawler,
     }
     key = request.args.get("request_type", None)
     crawler_cls = crawlers.get(key)
     if crawler_cls is None:
         return {"code": 2001, "msg": "not worker", "data": []}
     number, result = self.processes(crawler_cls, key)
     return {"code": 200, "msg": "succeed", "data": result}
Exemplo n.º 6
0
 def _connect_and_get_cursor(self):
     """
     Open a MySQL connection, create a cursor, and store both on self.

     On success ``self.connection`` and ``self.cursor`` are set and
     nothing is returned; on failure the error is logged and the string
     pair ``("None", "None")`` is returned (sentinel kept for caller
     compatibility).
     """
     try:
         conn = pymysql.connect(host=self.host,
                                port=self.port,
                                user=self.user,
                                password=self.password,
                                db=self.db,
                                charset=self.charset)
         self.connection = conn
         self.cursor = conn.cursor()
     except Exception as e:
         logger.error(e)
         return "None", "None"
Exemplo n.º 7
0
 def connectAndGetCursor(self):
     """
     Open a MySQL connection and return the connection and a cursor.

     :return: ``(connection, cursor)`` on success, or the string pair
         ``("None", "None")`` on failure (sentinel kept for caller
         compatibility)
     """
     try:
         # Security fix: never print the password; log only non-secret
         # connection parameters.
         print(self.host, self.port, self.user, self.db, self.charset)
         connection = pymysql.connect(host=self.host,
                                      port=self.port,
                                      user=self.user,
                                      password=self.password,
                                      db=self.db,
                                      charset=self.charset)
         cursor = connection.cursor()
         return connection, cursor
     except Exception as e:
         logger.error(e)
         return "None", "None"
Exemplo n.º 8
0
 def merger_result(self, item):
     """
     Append the extracted fields of *item* as a plain dict onto
     ``self.result``.
     """
     item = MyDict(item)
     try:
         self.result.append({
             "url": item.url,
             "name": item.name,
             "title": item.title,
             "size": item.size,
             "magnet": item.magnet,
         })
     except Exception as e:
         logger.error("merger_result:  {}".format(e))
Exemplo n.º 9
0
    def primary(self, item):
        """
        Main extraction entry point: parses the downloaded HTML in
        ``item.content`` and extracts links/content depending on the
        item's type marker.

        item.type semantics (as implemented below):
          "0" -- site index page: extract per-category list-page URLs,
                 emitting new items with type "1"
          "1" -- list page: extract per-movie detail items (url, title,
                 name, date, introduction), emitting items with type "2"
          "2" -- detail page: extract download (magnet) links into
                 ``item.magnet`` and forward the item
        Extracted items are pushed onto redis queues via
        ``push_item_in_redis_list``. Errors in each branch are caught and
        logged so one bad page does not stop processing.
        """
        item = MyDict(item)
        try:
            logger.info("开始抽取:{}".format(item.url))
            xpath_obj = lxml.html.fromstring(item.content)
            # current_app.logger.info("开始抽取:{}".format(item.type))
            if item.type == "1":
                try:

                    tables = xpath_obj.xpath(
                        "//div[@class='bd3']//div[@class='co_content8']/ul//table"
                    )
                    print(len(tables))
                    for table in tables:
                        # Extract the detail-page URL; relative URLs are
                        # prefixed with the site base (dyttUrl).
                        url = table.xpath(".//tr[2]//a/@href")
                        if url:
                            new_url = url[0]
                            if not new_url.startswith("http"):
                                new_url = dyttUrl + new_url
                            tr_item = MyDict()
                            tr_item.url = new_url
                            tr_item.type = "2"
                            title = table.xpath(".//tr[2]//a/text()")
                            tr_item.title = title[0] if title else "None"

                            # Movie name: the text inside 《...》 brackets,
                            # re-wrapped in the same brackets.
                            name = table.xpath(
                                ".//tr[2]//a/text()")[0] if table.xpath(
                                    ".//tr[2]//a/text()") else "《None》"
                            tr_item.name = "《{}》".format("".join(
                                re.findall(r"《(.*?)》", name)))

                            date = table.xpath(".//tr[3]//font/text()")
                            tr_item.date = date[0] if date else "None"

                            introduction = table.xpath(".//tr[4]/td/text()")
                            tr_item.introduction = introduction[
                                0] if introduction else "None"
                            self.push_item_in_redis_list(
                                self.message_a, tr_item)
                            # Log a truncated summary of the extracted item.
                            logger.info("{}".format({
                                "url":
                                tr_item.url[:30],
                                "name":
                                tr_item.name[:6],
                                "title":
                                tr_item.title[:6],
                                "introduction":
                                tr_item.introduction[:30],
                                "type":
                                tr_item.type[:6]
                            }))
                except Exception as e:
                    logger.error("列表页抽取:  {}".format(e))
                    # current_app.logger.error("列表页抽取:  {}".format(e))
            elif item.type == "2":
                try:
                    logger.info("开始抽取详情页信息:{}".format(item.url))
                    # current_app.logger.info("开始抽取详情页信息:{}".format(item.url))
                    down_lists = xpath_obj.xpath("//div[@id='downlist']/table")
                    if not down_lists:
                        # Fallback selector for detail pages using the
                        # 'Zoom' layout instead of 'downlist'.
                        down_lists = xpath_obj.xpath("//div[@id='Zoom']/table")
                        # print("再次处理得到的连接:{}".format(len(down_lists)))
                    # print("共有下载连接:{}".format(len(down_lists)))
                    # magnet_info = {}
                    magnet_info = []
                    for downloader in down_lists:
                        magnet = downloader.xpath(
                            ".//a/@href")[0] if downloader.xpath(
                                ".//a/@href") else ""
                        download_name = downloader.xpath(
                            ".//a/text()")[0] if downloader.xpath(
                                ".//a/text()") else "]"
                        # Readable name: last token after '=', ']' or '/'.
                        magnet_name = re.split(r"[=\]/]", download_name)[-1]
                        # magnet_info.update({magnet_name: magnet})
                        magnet_info.append((magnet_name, magnet))
                    item.magnet = magnet_info
                    self.push_item_in_redis_list(self.message_c, item)
                except Exception as e:
                    logger.error("信息页抽取:  {}".format(e))
                    # current_app.logger.error("信息页抽取:  {}".format(e))
            elif item.type == "0":
                try:
                    area2s = xpath_obj.xpath(
                        "//div[@class='bd2']/div[@class='index_list']/div[@class='co_area2']"
                    )
                    for area in area2s:
                        # Category list-page URL from the area header.
                        url = area.xpath(
                            ".//div[@class='title_all']/p/span/a/@href")
                        if url:
                            new_url = url[0]
                            if not new_url.startswith("http"):
                                new_url = dyttUrl + new_url
                            logger.info("首页抽取抽取到的网址:{}".format(new_url))
                            # current_app.logger.info("首页抽取抽取到的网址:{}".format(new_url))
                            # tr_item = DyHeavenCrawler.Default(new_url)

                            tr_item = MyDict()
                            tr_item.url = new_url

                            tr_item.type = "1"
                            tr_item.classify = area.xpath(
                                ".//div[@class='title_all']/p/span/a/text()"
                            )[0]
                            # print("首页抽取抽取到的分类:{}".format(tr_item.classify))
                            self.push_item_in_redis_list(
                                self.message_a, tr_item)
                except Exception as e:
                    logger.error("首页抽取:  {}".format(e))
                    # current_app.logger.error("首页抽取:  {}".format(e))
        except Exception as e:
            logger.error("primary:  {}".format(e))