Example #1
    @classmethod
    def parse_users(cls, url):
        soup_obj = HttpUtils.get(url)
        if soup_obj is None:
            print(">>>>>> Failed to parse " + url)
            return None

        # the user list is embedded as JSON in the data-state attribute
        data_state = HttpUtils.get_attr(soup_obj, "#data", "data-state")
        data_map = json.loads(data_state)
        return data_map["entities"]["users"]
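
All of these examples call into an `HttpUtils` helper that is not shown here. Below is a minimal sketch of the three calls Example #1 uses (`get`, `get_attr`, `get_content`), assuming `requests` and BeautifulSoup underneath; the names and signatures are inferred from the call sites, not from the real project, and `import json` at module level is also assumed.

    import requests
    from bs4 import BeautifulSoup


    class HttpUtils:
        @classmethod
        def get(cls, url, headers=None):
            # fetch a page and return parsed soup, or None on any HTTP error
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                return BeautifulSoup(response.text, "html.parser")
            except requests.RequestException:
                return None

        @classmethod
        def get_attr(cls, soup_obj, selector, attr):
            # attribute of the first element matching a CSS selector, or None
            element = soup_obj.select_one(selector)
            return element.get(attr) if element else None

        @classmethod
        def get_content(cls, soup_obj, selector):
            # text of the first element matching a CSS selector, or None
            element = soup_obj.select_one(selector)
            return element.get_text(strip=True) if element else None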
Example #2
    def read_msg(self, index):
        self.login_if_not()

        soup_obj = HttpUtils.get(self.url + index,
                                 headers=self.site.login_headers)
        assert soup_obj is not None

        tr_list = soup_obj.select("#outer form table tr")

        messages = []
        for cnt, tr in enumerate(tr_list, start=1):
            if cnt == 1:
                # skip the caption tr
                continue

            td_list = tr.select("td.rowfollow")

            if len(td_list) < 4:
                # skip the footer
                continue

            msg = Message()
            msg.read = len(td_list[0].select("img[alt=\"Read\"]")) > 0
            msg.title = HttpUtils.get_content(td_list[1], "a")
            msg.from_user = HttpUtils.get_content(td_list[2], "span a b")
            if msg.from_user is None:
                # ads and system messages have no linked sender; fall back to the raw text
                msg.from_user = td_list[2].contents[0]
            msg.since = HttpUtils.get_content(td_list[3], "span")
            link = HttpUtils.get_attr(td_list[1], "a", "href")
            msg.id = link.split("id=")[1]
            messages.append(msg)

        print("--------------------------------------")
        for i, msg in enumerate(messages, start=1):
            print("{:<2}|".format(i) + str(msg))
        print("--------------------------------------")

        return messages
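
The `Message` container is also not shown. A minimal sketch, assuming only the attributes `read_msg` assigns plus a `__str__` for the summary printout at the end:

    class Message:
        def __init__(self):
            self.read = False      # whether the message has been opened
            self.title = None
            self.from_user = None
            self.since = None
            self.id = None

        def __str__(self):
            status = "read" if self.read else "unread"
            return "{:<6}| {} | from {} | {}".format(
                status, self.title, self.from_user, self.since)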
Example #3
    def parse_page(self, soup_obj):
        items = soup_obj.select("item")
        assert len(items) != 0

        seeds = []
        for item in items:
            try:
                # titles look like "name [size]"; split out the size field
                info = HttpUtils.get_content(item, "title").split("[")

                seed = SeedInfo()

                seed.title = info[0].strip()
                seed.size = HttpUtils.pretty_format(info[1].split("]")[0], "MB")
                seed.url = HttpUtils.get_attr(item, "enclosure", "url")
                seed.id = self.parse_id(seed.url)
                # Cache().set(seed.id, str(seed))

                seeds.append(seed)
            except Exception as e:
                # Python exceptions have no getMessage(); print the exception itself
                print(e)

        return seeds
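
`SeedInfo` is likewise assumed rather than shown; a sketch covering the four fields `parse_page` fills, plus a `__str__` for the commented-out cache line:

    class SeedInfo:
        def __init__(self):
            self.title = None   # torrent name
            self.size = None    # normalized size string, e.g. "123.4MB"
            self.url = None     # enclosure download link
            self.id = None      # site-specific id parsed from the url

        def __str__(self):
            return "{} [{}] {}".format(self.title, self.size, self.url)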
Example #4
    def vote(self):
        self.check_in()

        source_list_url_template = "http://www.miui.com/home.php?mod=space&uid=133153462&do=thread&view=me&order=dateline&from=space&page={0}"
        page_num = 1
        max_cnt = 10
        cnt = 0
        stop_flag = False
        while not stop_flag:
            soup = HttpUtils.get(source_list_url_template.format(page_num),
                                 headers=self.site.login_headers)
            assert soup is not None

            page_num += 1

            current_score = self.get_score()
            previous_score = current_score

            article_urls = HttpUtils.get_attrs(soup, "div.tl th > a", "href")
            for article_url in article_urls:
                try:
                    article_url = "http://www.miui.com/" + article_url
                    article_soup = HttpUtils.get(
                        article_url, headers=self.site.login_headers)
                    assert article_soup is not None
                    title = HttpUtils.get_content(article_soup, "title")
                    form = article_soup.select("#poll", limit=1)
                    option = article_soup.select("#option_1", limit=1)
                    if form is None or len(form) == 0:
                        continue
                    if option is None or len(option) == 0:
                        continue
                    print(title)

                    # do vote here
                    post_url = "http://www.miui.com/" + HttpUtils.get_attr(
                        article_soup, "#poll", "action") + "&inajax=1"

                    post_data = dict()
                    post_data["pollanswers[]"] = HttpUtils.get_attr(
                        article_soup, "#option_1", "value")
                    post_data["formhash"] = self.form_hash_mirror
                    post_result = HttpUtils.post(
                        post_url,
                        headers=self.site.login_headers,
                        data=post_data,
                        returnRaw=False)
                    assert post_result is not None

                    current_score = self.get_score()
                    print("score: {} -> {}".format(previous_score, current_score))

                    cnt += 1
                    if cnt >= max_cnt or previous_score == current_score:
                        stop_flag = True
                        break

                    previous_score = current_score
                    time.sleep(60)
                except Exception:
                    # ignore failures on individual articles and move on
                    pass
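
`vote` additionally relies on `HttpUtils.get_attrs` and `HttpUtils.post`, neither of which is shown. Sketches in the same assumed style as the helper under Example #1; the `returnRaw` keyword mirrors the call site above, and its default is a guess.

    import requests
    from bs4 import BeautifulSoup


    class HttpUtils:  # continuing the sketch from Example #1
        @classmethod
        def get_attrs(cls, soup_obj, selector, attr):
            # attribute of every element matching a CSS selector
            return [element.get(attr) for element in soup_obj.select(selector)]

        @classmethod
        def post(cls, url, headers=None, data=None, returnRaw=False):
            # POST form data; return parsed soup, or the raw response when returnRaw is True
            try:
                response = requests.post(url, headers=headers, data=data, timeout=10)
                response.raise_for_status()
                return response if returnRaw else BeautifulSoup(response.text, "html.parser")
            except requests.RequestException:
                return None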
Example #5

    @classmethod
    def crawl_book(cls):
        tag_source_url = "https://book.douban.com/tag/"
        soup_obj = HttpUtils.get(tag_source_url)

        # scrape the full tag list, then override it with a curated subset below
        tags = HttpUtils.get_contents(soup_obj, "div.article tr td a")

        tags = [
            '小说', '外国文学', '文学', '中国文学', '经典', '日本文学', '古典文学', '王小波', '当代文学',
            '钱钟书', '外国名著', '推理', '绘本', '青春', '东野圭吾', '科幻', '言情', '悬疑', '奇幻',
            '韩寒', '推理小说', '阿加莎·克里斯蒂', '科幻小说', '魔幻', '历史', '心理学', '哲学', '传记',
            '文化', '社会学', '艺术', '设计', '社会', '政治', '建筑', '宗教', '电影', '政治学', '数学',
            '中国历史', '回忆录', '思想', '国学', '人物传记', '人文', '音乐', '艺术史', '绘画', '戏剧',
            '西方哲学', '二战', '军事', '佛教', '近代史', '考古', '自由主义', '美术', '爱情', '旅行',
            '成长', '生活', '心理', '励志', '摄影', '教育', '游记', '灵修', '健康', '情感', '两性',
            '人际关系', '手工', '养生', '家居', '自助游', '经济学', '管理', '经济', '商业', '金融',
            '投资', '营销', '理财', '创业', '广告', '股票', '企业史', '策划', '科普', '互联网', '编程',
            '科学', '交互设计', '用户体验', '算法', '科技', 'web', 'UE', '交互', '通信', 'UCD',
            '神经网络', '程序'
        ]
        print(tags)

        book_shelf = dict()
        for tag in tags:
            for page in range(0, 10):
                url = "https://book.douban.com/tag/%s?start=%d&type=T" % (
                    tag, page * 20)
                soup_obj = HttpUtils.get(url)

                if soup_obj is None:
                    print("blocked?")
                    break

                print(tag, page)
                books_obj = soup_obj.select("#subject_list ul > li")

                if len(books_obj) == 0:
                    break

                for book_obj in books_obj:
                    try:
                        title = HttpUtils.get_attr(book_obj, "h2 a", "title")
                        rating = float(
                            HttpUtils.get_content(book_obj,
                                                  "span.rating_nums"))
                        people = int(
                            HttpUtils.get_content(book_obj,
                                                  "span.pl").strip().replace(
                                                      "人评价", "").replace(
                                                          "(",
                                                          "").replace(")", ""))

                        if people > cls.people_threshold:
                            if title in book_shelf:
                                book_shelf[title].tag.append(tag)
                            else:
                                book_shelf[title] = Book(
                                    title, rating, people, [tag])
                    except Exception:
                        # skip entries with missing ratings or malformed counts
                        pass

                # pause briefly to stay under the per-IP request limit within the time window
                sleep(random() * 0.5 + 0.5)

        books = list(book_shelf.values())

        with open("douban_book_raw.txt", "w") as fp:
            fp.write(json.dumps(books, default=Book.convert))
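
`Book`, its `convert` hook, and the crawler's `people_threshold` class attribute come from elsewhere in the project. A minimal sketch consistent with the constructor call `Book(title, rating, people, [tag])`, the `book_shelf[title].tag.append(tag)` access, and `json.dumps(books, default=Book.convert)`; the fragment also assumes `from time import sleep` and `from random import random` at module level.

    class Book:
        def __init__(self, title, rating, people, tag):
            self.title = title
            self.rating = rating    # Douban average rating
            self.people = people    # number of ratings
            self.tag = tag          # list of tags the book appeared under

        @staticmethod
        def convert(obj):
            # json.dumps default hook: serialize a Book as a plain dict
            return obj.__dict__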