Example #1
    def start(cls):

        root_url = "http://www.mangabz.com/manga-list-p%d/"
        page_num = 0

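        # walk the list pages one by one until a request fails or a page yields no links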
        while True:
            cls.init_thread()

            page_num += 1
            print("Now page " + str(page_num))
            url = root_url % page_num
            resp = HttpUtils.get_with_retry(url, headers=cls.headers)
            if resp is None:
                break

            links = HttpUtils.get_attrs(resp, ".mh-item-detali > .title > a",
                                        "href")
            if len(links) == 0:
                break

            for link in links:
                cls.task_pool.put(link)

        cls.process_thread.join()
        cls.fp.close()
Example #2
    def parse_lvl_one(cls):
        if cls.book_id is None:
            return

        url = "http://www.js518.net/mohuanmanhua/%s/" % cls.book_id
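        # retry the request, giving up after five failed attempts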
        retry = 0
        while True:
            resp = HttpUtils.get(url)
            if resp is not None:
                break
            else:
                retry += 1

            assert retry < 5, "fail to query %s" % url

        cls.comic_name = HttpUtils.get_content(resp, "title").strip()
        links = HttpUtils.get_attrs(resp, "#mh-chapter-list-ol-0 a", "href")

        titles = HttpUtils.get_contents(resp, "#mh-chapter-list-ol-0 a")

        assert len(titles) == len(links)

        cls.init_thread()

        for index in range(len(titles)):
            link = links[index]
            title = titles[index].strip()
            cls.parse_lvl_two((link, title))
        cls.process_thread.join()

        # safety net: this should be unnecessary if everything went well
        while not cls.task_pool.empty():
            print("pool size = " + str(cls.task_pool.qsize()))
            cls.init_thread()
            cls.process_thread.join()
Example #3
    def parse_lvl_two(cls, info):
        url = info[0]
        index = info[1]

        # create folder once
        folder_name = "output/龙珠/" + str(index)
        os.makedirs(folder_name, exist_ok=True)

        retry = 0
        while True:
            resp = HttpUtils.get(url)
            if resp is not None:
                break
            else:
                retry += 1

            assert retry < 5, "fail to query %s" % url

        links = HttpUtils.get_attrs(resp, ".ListContainer .ItemThumb a",
                                    "style")

        assert links is not None

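        # each link is an inline style attribute; extract the image URL from
        # background:url('...') and strip the "_thumb." suffix to get the full-size image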
        for link in links:
            url = re.search(r"background:url\(.*'(.*)'",
                            link).group(1).replace("_thumb.", "")
            file_name = url.split("/")[-1]
            cls.task_pool.put([folder_name + "/" + file_name, url, 0])
Example #4
    def fetch_sub_category(cls, category):
        # go into category to find sub-category info
        category_link = category[1]
        category_text = category[0]

        sub_category_data_list = list()

        if category_text in cls.category_black_list:
            return []

        soup_obj = HttpUtils.get(cls.amazon_base_url + category_link,
                                 headers=cls.amazon_headers)

        sub_category_text_list = HttpUtils.get_contents(
            soup_obj, "div.sg-col-inner li.s-navigation-indent-2 span a span")
        sub_category_link_list = HttpUtils.get_attrs(
            soup_obj, "div.sg-col-inner li.s-navigation-indent-2 span a",
            "href")

        if len(sub_category_link_list) != len(sub_category_text_list):
            print("ERROR: Sub-category number not correct")
            return []

        # no sub-categories: fall back to the category itself
        if len(sub_category_link_list) == 0:
            sub_category_text_list = [category_text]
            sub_category_link_list = [category_link]

        print("find lvl 2 categories for %s" % category_text)
        print(sub_category_text_list)

        # find sub-category page number
        for sub_index in range(0, len(sub_category_link_list)):
            sub_category_link = sub_category_link_list[sub_index]
            sub_category_text = sub_category_text_list[sub_index]
            soup_obj = HttpUtils.get(cls.amazon_base_url + sub_category_link,
                                     headers=cls.amazon_headers)
            page_info = HttpUtils.get_contents(
                soup_obj, "ul.a-pagination li.a-disabled")
            if len(page_info) == 2:
                max_page_num = page_info[1]
            elif len(page_info) == 0:
                # no pagination
                max_page_num = 1
            else:
                # within 5 pages: take the last numbered page link
                max_page_num = HttpUtils.get_contents(
                    soup_obj, "ul.a-pagination li.a-normal a")[-1]

            print("cat=%s, sub-cat=%s, page=%s" %
                  (category_text, sub_category_text, max_page_num))
            sub_category_data_list.append((category_text, sub_category_text,
                                           sub_category_link, max_page_num))

        return sub_category_data_list
Example #5
    def parse_lvl_one(cls):
        if cls.book_id is None:
            print(">>>>> ERROR Cannot Parse Comic ID, QUIT! <<<<<")
            return

        resp = HttpUtils.get_with_retry("%s/%s/" % (cls.root_url, cls.book_id),
                                        headers=cls.headers)
        assert resp is not None

        cls.comic_name = HttpUtils.get_content(resp,
                                               ".detail-info-title").strip()
        cls.root_folder = os.path.join("output", cls.comic_name)
        links = HttpUtils.get_attrs(resp, "div.detail-list-form-con a", "href")

        titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")
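        # each chapter link carries a span such as "36P" giving its page count; keep only the number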
        image_numbers = HttpUtils.get_contents(
            resp, "div.detail-list-form-con a span")
        image_numbers = list(
            map(lambda x: re.search(r"(\d+)P", x).group(1), image_numbers))

        assert len(titles) == len(image_numbers)
        assert len(titles) == len(links)

        cnt = 0
        for index in range(len(titles)):
            cls.init_thread()

            link = links[index].replace("/", "").replace("m", "")
            title = titles[index].strip()
            image_number = image_numbers[index]
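            # chapter_mode 1 keeps chapter-style titles (第 / 话 / 話); mode 2 keeps volume-style titles (卷 / 第)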
            if (cls.chapter_mode == 1 and "第" not in title and "话" not in title
                    and "話" not in title) or (cls.chapter_mode == 2
                                              and "卷" not in title
                                              and "第" not in title):
                print("Skip " + title)
                continue

            is_skip = False
            if cls.inclusion_list is not None:
                for inclusion in cls.inclusion_list:
                    if inclusion not in title:
                        is_skip = True
                        break

            if not is_skip and cls.parse_lvl_two((link, title, image_number)):
                cnt += 1

        if cnt > 0:
            cls.process_thread.join()

        # safety net: this should be unnecessary if everything went well
        while not cls.task_pool.empty():
            print("pool size = " + str(cls.task_pool.qsize()))
            cls.init_thread()
            cls.process_thread.join()
Example #6
    def fetch_meta_data(cls):
        with open("ku_meta.txt", "r", encoding="utf-8") as fp:
            if fp.readline():
                # already exist, skip
                return

        home_url = "https://www.amazon.cn/s?i=digital-text&rh=n%3A116087071%2Cn%3A116089071%2Cn%3A116176071%2Cn%3A1337022071&page=1"

        # find all categories, sub-categories and page numbers
        soup_obj = HttpUtils.get(home_url, headers=cls.amazon_headers)
        if soup_obj is None:
            print("ERROR: Cannot find category")
            return

        category_text_list = HttpUtils.get_contents(
            soup_obj,
            "#leftNav ul:nth-of-type(3) div li span a.s-ref-text-link span")
        category_link_list = HttpUtils.get_attrs(
            soup_obj,
            "#leftNav ul:nth-of-type(3) div li span a.s-ref-text-link", "href")

        if len(category_text_list) != len(category_link_list):
            print("ERROR: Category number not correct")
            return

        print("find lvl 1 categories:")
        print(category_text_list)

        category_list = list()
        for index in range(0, len(category_link_list)):
            category_list.append(
                (category_text_list[index], category_link_list[index]))

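        # fetch sub-category data for every lvl 1 category in parallel (ParallelTemplate(5) presumably runs 5 workers)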
        parallel_template = ParallelTemplate(5)
        sub_category_data_list = parallel_template.run(cls.fetch_sub_category,
                                                       category_list)

        with open("ku_meta.txt", "w", encoding="utf-8") as fp:
            fp.write(json.dumps(sub_category_data_list))
Example #7
    def water(self):
        self.check_in()

        url_prefix = "http://www.miui.com/forum.php?mod=forumdisplay&fid=5&orderby=dateline&filter=author&orderby=dateline&page="
        page = 1
        cnt = 1
        max_cnt = 50
        chinese_char = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]

        id_list = []
        while True:
            soup_obj = HttpUtils.get(url_prefix + str(page))
            print("new page: " + str(page))
            id_list.extend(HttpUtils.get_attrs(soup_obj, "tbody", "id"))

            page += 1

            if len(id_list) > max_cnt:
                break

        id_list = id_list[:max_cnt]
        for id in id_list:
            if not id.startswith("normalthread"):
                continue

            id = id[13:]
            page_url = self.page_url_template.format(id)

            page_soup_obj = HttpUtils.get(page_url)
            assert page_soup_obj is not None

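            # spell the running count and the thread id out digit by digit in Chinese numerals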
            i = str(cnt)
            length = len(i)
            num = ""
            for index in range(length):
                num += chinese_char[int(i[index])]

            id_num = ""
            for index in range(len(id)):
                id_num += chinese_char[int(id[index])]

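            # append a random number (also in Chinese numerals), presumably to keep each reply unique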
            random_id = str(int(random() * 1000000000000000))

            random_id_num = ""
            for index in range(len(random_id)):
                random_id_num += chinese_char[int(random_id[index])]

            title = HttpUtils.get_content(page_soup_obj,
                                          "title").strip().replace(
                                              "_灌者为王_MIUI论坛", "")

            message = "时间{0},帖子ID{1},标题\"{2}\",随机数{3},第{4}个积分,打扰".format(
                time.strftime("%b %d %Y %H:%M:%S", time.localtime()), id_num,
                title, random_id_num, num)
            # form_hash = page_soup_obj.select("input[name='formhash']")[0]["value"]
            post_data = dict()
            post_data["posttime"] = str(int(time.time()))
            post_data["formhash"] = self.form_hash_mirror
            post_data["usesig"] = "1"
            post_data["subject"] = "  "
            post_data["message"] = message

            form_submit_url = "http://www.miui.com/forum.php?mod=post&action=reply&fid=5&tid={0}&extra=page=1&replysubmit=yes&infloat=yes&handlekey=fastpost".format(
                id)

            # print(post_data)

            post_result = HttpUtils.post(form_submit_url,
                                         headers=self.site.login_headers,
                                         data=post_data,
                                         returnRaw=False)
            assert post_result is not None
            time.sleep(int(random() * 60) + 90)
            cnt += 1
Example #8
    def vote(self):
        self.check_in()

        source_list_url_template = "http://www.miui.com/home.php?mod=space&uid=133153462&do=thread&view=me&order=dateline&from=space&page={0}"
        page_num = 1
        max_cnt = 10
        cnt = 0
        stop_flag = False
        while not stop_flag:
            soup = HttpUtils.get(source_list_url_template.format(page_num),
                                 headers=self.site.login_headers)
            assert soup is not None

            page_num += 1

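            # remember the score before voting so we can tell when votes stop earning points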
            current_score = self.get_score()
            previous_score = current_score

            article_urls = HttpUtils.get_attrs(soup, "div.tl th > a", "href")
            for article_url in article_urls:
                try:
                    article_url = "http://www.miui.com/" + article_url
                    article_soup = HttpUtils.get(
                        article_url, headers=self.site.login_headers)
                    assert article_soup is not None
                    title = HttpUtils.get_content(article_soup, "title")
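                    # skip articles that have no poll form or no first option to vote for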
                    form = article_soup.select("#poll", limit=1)
                    option = article_soup.select("#option_1", limit=1)
                    if form is None or len(form) == 0:
                        continue
                    if option is None or len(option) == 0:
                        continue
                    print(title)

                    # do vote here
                    post_url = "http://www.miui.com/" + HttpUtils.get_attr(
                        article_soup, "#poll", "action") + "&inajax=1"

                    post_data = dict()
                    post_data["pollanswers[]"] = HttpUtils.get_attr(
                        article_soup, "#option_1", "value")
                    post_data["formhash"] = self.form_hash_mirror
                    post_result = HttpUtils.post(
                        post_url,
                        headers=self.site.login_headers,
                        data=post_data,
                        returnRaw=False)
                    assert post_result is not None

                    current_score = self.get_score()
                    print(previous_score)
                    print(current_score)

                    cnt += 1
                    if cnt >= max_cnt or previous_score == current_score:
                        stop_flag = True
                        break

                    previous_score = current_score
                    time.sleep(60)
                except Exception:
                    # ignore failures for this article and move on to the next one
                    pass
Example #9
    def zz(self):
        source_url_template = "https://bh.sb/post/category/main/page/{0}/"
        post_url = "http://www.miui.com/forum.php?mod=post&action=newthread&fid=5&extra=&topicsubmit=yes"

        self.check_in()

        max_cnt = 10
        cnt = 0
        page_num = 1
        articles = list()
        stop_flag = False
        while not stop_flag:
            # fetch the article list from bh.sb
            soup = HttpUtils.get(source_url_template.format(page_num))
            article_urls = HttpUtils.get_attrs(soup, "h2 a", "href")
            page_num += 1

            for article_index in range(len(article_urls)):
                article_url = article_urls[article_index]
                if Cache().get(article_url) is not None:
                    continue

                article_soup = HttpUtils.get(article_url)
                titles = HttpUtils.get_contents(article_soup,
                                                ".article-content p")

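                # paragraphs come in pairs: a caption line followed by an image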
                title_cnt = len(titles) // 2

                for title_index in range(0, title_cnt):
                    try:
                        title = titles[title_index * 2].split("】")[1]
                        image = titles[title_index * 2 + 1]

                        if type(image) != Tag:
                            continue

                        src = image.attrs["src"]
                        if src.endswith("jpg"):
                            continue

                        message = "好玩您就点个赞,不好笑请期待下一贴~\n"
                        message += "[img]{0}[/img]".format(src)

                        if Cache().get(title) is not None:
                            continue
                        Cache().set(title, message)

                        articles.append((title, message))

                        cnt += 1

                        if cnt >= max_cnt:
                            stop_flag = True
                            break
                    except Exception:
                        # skip malformed entries and keep processing the rest
                        pass

                if stop_flag:
                    break

                # mark this url as done only after all of its entries have been processed
                Cache().set(article_url, article_url)

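        # rotate through the available typeid values so consecutive posts use different types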
        type_id_list = ["1629", "1631", "1633", "4481", "1641"]
        type_index = 0
        for (title, message) in articles:
            print((title, message))

            post_data = dict()
            post_data["posttime"] = str(int(time.time()))
            post_data["formhash"] = self.form_hash_mirror
            post_data["wysiwyg"] = "1"
            post_data["typeid"] = type_id_list[type_index]
            post_data["allownoticeauthor"] = "1"
            post_data["addfeed"] = "1"
            post_data["usesig"] = "1"
            post_data["save"] = ""
            post_data["uploadalbum"] = "-2"
            post_data["newalbum"] = "请输入相册名称"
            post_data["subject"] = title
            post_data["message"] = message

            post_result = HttpUtils.post(post_url,
                                         headers=self.site.login_headers,
                                         data=post_data,
                                         returnRaw=False)
            assert post_result is not None
            type_index = (type_index + 1) % len(type_id_list)
            time.sleep(int(random() * 300) + 2700)