Example #1
    def parse_arts_search(self, url, keys, response):
        """
        parser, keys: ("arts_search", arts_key)
        """
        _, querys = spider.get_url_params(url)
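        # remember which result page we are on, taken from the "page" query parameter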
        self.current_page = int(querys["page"][0]) if "page" in querys else self.current_page
        logging.debug("WeiXinPublic parse_arts_search: update current page, current_page=%d" % self.current_page)

        soup = BeautifulSoup(spider.get_html_content(response, charset="utf-8"), "html.parser")
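        # if a captcha page was served, reset this class so the request can be repeated later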
        if not self.check_anti_by_captcha(soup):
            self.reset_this_class()
            return

        # current page
        for art_soup in soup.find_all("div", class_="txt-box"):
            art_url = spider.get_url_legal(art_soup.find("a").get("href"), base_url=url)
            user_openid = art_soup.find("a", id="weixin_account").get("i")
            user_name = art_soup.find("a", id="weixin_account").get("title")
            self.fetch_queue.put(item=(art_url, ("get_art", keys[1], user_openid, user_name), 0))

        # next page
        next_page = soup.find("a", id="sogou_next")
        if next_page:
            next_page_url = spider.get_url_legal(next_page.get("href"), base_url=url)
            self.fetch_queue.put(item=(next_page_url, keys, 0))
        return
Example #2
    def working(self, task, spider_url, next_page_url, cookies):

        try:
            url, keys, contents, priority = task
            # contents = urllib.parse.unquote(contents)
            re_group = re.findall(spider_url, contents, flags=re.IGNORECASE)
            url_set = {(spider.get_url_legal(_url, base_url=url)).split("#")[0]
                       for _url in re_group}
            next_page_set = {
                _url
                for _url in url_set if re.search(next_page_url, _url)
            }
            # keep pagination links only in next_page_set
            url_set -= next_page_set
            if next_page_url:
                next_url_re_group = re.findall(next_page_url, contents,
                                               flags=re.IGNORECASE)
                next_page_set.update(
                    (spider.get_url_legal(_url, base_url=url)).split("#")[0]
                    for _url in next_url_re_group)
            state, item = self.htm_parse(url, contents, cookies)
            key = {"type": "parser"}
        except Exception as excep:
            next_page_set = set()
            url_set = set()
            item = None
            state = 0
            key = {"type": "parser"}
            logging.error("parser error: %s", excep)
        return next_page_set, url_set, key, state, item
Example #3
    def parse_user_arts(self, url, keys, response):
        """
        parser, keys: ("user_arts", user_id, user_name)
        """
        html = spider.get_html_content(response, charset="utf-8")
        # the article list is embedded in the page as a javascript variable: msgList = '{...}'
        json_data = spider.get_json_data(html, r"msgList = '(?P<item>\{[\w\W]+?\})'")
        if json_data:
            for item in json_data.get("list", []):
                # queue the main article of each push, then any bundled sub-articles
                item_url = spider.get_url_legal(item["app_msg_ext_info"]["content_url"][1:], self.base_url_weixinqq).replace("&amp;", "&")
                self.fetch_queue.put(item=(item_url, ("get_art", None, keys[1], keys[2]), 0))
                for subitem in item["app_msg_ext_info"]["multi_app_msg_item_list"]:
                    subitem_url = spider.get_url_legal(subitem["content_url"][1:], self.base_url_weixinqq).replace("&amp;", "&")
                    self.fetch_queue.put(item=(subitem_url, ("get_art", None, keys[1], keys[2]), 0))
        logging.debug("WeiXinPublic parse_user_arts: len(fetch_queue)=%d" % self.fetch_queue.qsize())
        return
Example #4
    def htm_parse(self, priority: int, url: str, keys: dict, deep: int,
                  content: object):
        # test error-logging
        assert random.randint(0, 100) != 8, "error-in-parser"

        status_code, url_now, html_text = content

        url_list = []
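        # a negative _max_deep means the crawl depth is unlimited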
        if (self._max_deep < 0) or (deep < self._max_deep):
            re_group = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>",
                                  html_text,
                                  flags=re.IGNORECASE)
            url_list = [(spider.get_url_legal(_url, base_url=url), keys,
                         priority + 1) for _url in re_group]

        # save_list can be list / tuple / dict
        title = re.search(r"<title>(?P<title>.+?)</title>",
                          html_text,
                          flags=re.IGNORECASE)
        # item = (url, title.group("title").strip(), datetime.datetime.now()) if title else []
        item = {
            "url": url,
            "title": title.group("title").strip(),
            "datetime": datetime.datetime.now()
        } if title else {}

        # test multi-processing (deliberately time-consuming)
        [BeautifulSoup(html_text, "lxml") for _ in range(10)]
        return 1, url_list, item
Example #5
    def htm_parse(self, priority: int, url: str, keys: dict, deep: int,
                  content: object):
        """
        Define the parse routine: parse the fetched content, then generate the URLs to fetch next and the item to save.
        """
        status_code, url_now, html_text = content

        url_list = []
        if (self._max_deep < 0) or (deep < self._max_deep):
            re_group = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>",
                                  html_text,
                                  flags=re.IGNORECASE)
            url_list = [(spider.get_url_legal(_url, base_url=url), keys,
                         priority + 1) for _url in re_group]

        title = re.search(r"<title>(?P<title>.+?)</title>",
                          html_text,
                          flags=re.IGNORECASE)
        # item = (url, title.group("title").strip(), datetime.datetime.now()) if title else []
        item = {
            "url": url,
            "title": title.group("title").strip(),
            "datetime": datetime.datetime.now()
        } if title else {}

        # test multi-processing (deliberately time-consuming)
        [BeautifulSoup(html_text, "lxml") for _ in range(10)]
        return 1, url_list, item
Example #6
    def htm_parse(self, task_parse: spider.TaskParse) -> spider.ResultParse:
        """
        Define the parse routine: parse the fetched content, then generate the list of URLs to fetch next and the item to save.
        """
        status_code, url_now, html_text = task_parse.content

        task_fetch_list = []
        if (self._max_deep < 0) or (task_parse.deep < self._max_deep):
            re_group = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>",
                                  html_text,
                                  flags=re.IGNORECASE)
            url_list = [
                spider.get_url_legal(_url, base_url=task_parse.url)
                for _url in re_group
            ]
            task_fetch_list = [
                spider.TaskFetch.from_task_parse(task_parse, url_new=url)
                for url in url_list
            ]

        title = re.search(r"<title>(?P<title>.+?)</title>",
                          html_text,
                          flags=re.IGNORECASE)
        item = {"url": url_now, "title": title.group("title")} if title else {}
        task_save = spider.TaskSave.from_task_parse(task_parse, item=item)

        return spider.ResultParse(state_code=1,
                                  task_fetch_list=task_fetch_list,
                                  task_save=task_save)
Example #7
    def htm_parse(self, priority, url, keys, deep, critical, parse_repeat, content):
        """
        Override the htm_parse() function.
        """
        # parse content (cur_code, cur_url, cur_info, cur_html)
        cur_code, cur_url, cur_info, cur_html = content

        # get url_list and save_list
        url_list = []
        if (self.max_deep < 0) or (deep < self.max_deep):
            a_list = re.findall(r"<a[\w\W]+?href=\"(?P<url>[\w\W]+?)\"[\w\W]*?>[\w\W]+?</a>", cur_html, flags=re.IGNORECASE)
            url_list = [(_url, keys, critical, priority+1) for _url in [spider.get_url_legal(href, url) for href in a_list]]
        title = re.search(r"<title>(?P<title>[\w\W]+?)</title>", cur_html, flags=re.IGNORECASE)
        save_list = [(url, title.group("title"), datetime.datetime.now()), ] if title else []

        # test cpu task
        count = 0
        for i in range(1000):
            for j in range(1000):
                count += ((i*j) / 1000)

        # test parsing error
        if random.randint(0, 5) == 3:
            parse_repeat += (1 / 0)

        # return code, url_list, save_list
        return 1, url_list, save_list
Example #8
    def htm_parse(self, priority: int, url: str, keys: dict, deep: int,
                  content: object):
        status_code, url_now, html_text = content
        # test multi-processing
        [BeautifulSoup(html_text, "lxml") for _ in range(10)]

        url_list = []
        if (self._max_deep < 0) or (deep < self._max_deep):
            for _url in re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>",
                                   html_text,
                                   flags=re.IGNORECASE):
                url_list.append(
                    (spider.get_url_legal(_url,
                                          base_url=url), keys, priority + 1))

        title = re.search(r"<title>(?P<title>.+?)</title>",
                          html_text,
                          flags=re.IGNORECASE)
        save_list = [
            (url, title.group("title").strip(), datetime.datetime.now()),
        ] if title else []

        # test error-logging
        # assert random.randint(0, 100) != 8, "error-in-parser"
        return 1, url_list, save_list
Example #9
    def check_anti_by_captcha(self, html):
        """
        check anti-spider by captcha
        """
        soup = bs4.BeautifulSoup(html, "html.parser")

        cid, code = None, None
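        # keep fetching the captcha image until the yundama service recognizes it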
        while not code:
            captcha_url = soup.find("img", attrs={
                "node-type": "yzm_img"
            }).get("src")
            response = self.opener.open(
                spider.get_url_legal(captcha_url, self.search_url))
            cid, code = self.yundama.get_captcha(response.read(),
                                                 "captcha.jpeg",
                                                 "image/jpeg",
                                                 codetype="1004")

        verified_url = "http://s.weibo.com/ajax/pincode/verified?__rnd=%d" % int(
            time.time() * 1000)
        post_data = spider.make_post_data({
            "secode": code,
            "type": "sass",
            "pageid": "weibo",
            "_t": 0
        })
        temp = json.loads(
            spider.get_html_content(
                self.opener.open(verified_url, data=post_data)))
        if temp["code"] == "100000":
            logging.warning("WeiBoSearch anti-spider succeed")
        else:
            logging.warning("WeiBoSearch anti-spider failed")
            if cid:
                self.yundama.report(cid)
        return
Example #10
    def htm_parse(self, priority: int, url: str, keys: dict, deep: int, content: object):
        status_code, url_now, html_text = content

        url_list = []
        if (self._max_deep < 0) or (deep < self._max_deep):
            tmp_list = re.findall(r"<a.+?href=\"(?P<url>.{5,}?)\".*?>", html_text, flags=re.IGNORECASE)
            url_list = [(_url, keys, priority+1) for _url in [spider.get_url_legal(href, url) for href in tmp_list]]

        title = re.search(r"<title>(?P<title>.+?)</title>", html_text, flags=re.IGNORECASE)
        save_list = [(url, title.group("title").strip(), datetime.datetime.now()), ] if title else []

        return 1, url_list, save_list
Example #11
def url_parse(baseurl, html_doc, keys, priority, deep, MAX_DEEP):
    url_list = []
    soup = BeautifulSoup(html_doc, 'lxml')
    if deep == 0:
        # depth 0: collect the post links from the archive list
        post_nodes = soup.select('#archive .floated-thumb .post-thumb a')
        for node in post_nodes:
            url_list.append(
                (spider.get_url_legal(node['href'],
                                      baseurl), keys, priority + 1))
    elif deep < MAX_DEEP:
        # deeper pages: follow related-post links, skipping comment anchors
        related_nodes = soup.select('.digg-item-updated-title a')
        for node in related_nodes:
            if '/#comments' in node['href']:
                continue
            url_list.append(
                (spider.get_url_legal(node['href'],
                                      baseurl), keys, priority + 1))

    return url_list
Example #12
    def check_anti_by_captcha(self, soup):
        """
        check anti-spider by captcha
        :return: 1 if no captcha page was found (caller can continue), 0 after a captcha was solved (caller should repeat the request)
        """
        if not soup.find("img", id="seccodeImage"):
            return 1

        while True:
            cid, code = None, None
            while not code:
                captcha_url = soup.find("img", id="seccodeImage").get("src")
                response = self.opener.open(spider.get_url_legal(captcha_url, self.base_url_antispider))
                cid, code = self.yundama.get_captcha(response.read(), "captcha.jpeg", "image/jpeg", codetype="1006")

            post_data = urllib.parse.urlencode({
                "c": code,
                "r": soup.find("input", id="from").get("value"),
                "v": 5
            }).encode()
            response = self.opener.open("http://weixin.sogou.com/antispider/thank.php", data=post_data)

            json_data = json.loads(spider.get_html_content(response, charset="utf-8"))
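            # the message "解封成功" indicates the unblock succeeded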
            if json_data["msg"].find("解封成功") >= 0:
                snuid = json_data["id"]
                self.cookie_jar.set_cookie(spider.make_cookie(name="SNUID", value=snuid, domain="weixin.sogou.com"))

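                # send the follow-up statistics beacons to pb.sogou.com/pv.gif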
                post_dict = {
                    "uigs_productid": "webapp",
                    "type": "antispider",
                    "subtype": "",
                    "domain": "weixin",
                    "suv": "",
                    "snuid": snuid,
                    "t": int(time.time() * 1000)
                }
                for cookie in self.cookie_jar:
                    if cookie.name == "SUV":
                        post_dict["suv"] = cookie.value

                post_dict["subtype"] = "0_seccodeInputSuccess"
                post_dict["t"] = int(time.time() * 1000)
                self.opener.open("http://pb.sogou.com/pv.gif?" + urllib.parse.urlencode(post_dict))

                post_dict["subtype"] = "close_refresh"
                post_dict["t"] = int(time.time() * 1000)
                self.opener.open("http://pb.sogou.com/pv.gif?" + urllib.parse.urlencode(post_dict))
                break
            else:
                if cid:
                    self.yundama.report(cid=cid)
        logging.warning("WeiXinPublic check_anti_by_captcha: anti-spider success!")
        return 0