def parse_users(cls, url):
    soup_obj = HttpUtils.get(url)
    if soup_obj is None:
        print(">>>>>> Failed to parse " + url)
        return None

    # the page embeds its state as JSON in the "data-state" attribute of the #data element
    data_state = HttpUtils.get_attr(soup_obj, "#data", "data-state")
    data_map = json.loads(data_state)
    return data_map['entities']['users']
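# Usage sketch (illustrative only, not from the source): the class name "UserCrawler"
# and the URL are placeholders; parse_users is assumed to be exposed as a classmethod
# and returns the "users" dict from the page's embedded JSON state.
#
#   users = UserCrawler.parse_users("https://example.com/some-page")
#   if users is not None:
#       for user_id, user in users.items():
#           print(user_id, user)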
def read_msg(self, index):
    self.login_if_not()

    soup_obj = HttpUtils.get(self.url + index, headers=self.site.login_headers)
    assert soup_obj is not None

    tr_list = soup_obj.select("#outer form table tr")

    messages = []
    cnt = 0
    for tr in tr_list:
        cnt += 1
        if cnt == 1:
            # skip the caption row
            continue

        td_list = tr.select("td.rowfollow")
        if len(td_list) < 4:
            # skip the footer row
            continue

        msg = Message()
        msg.read = len(td_list[0].select("img[alt=\"Read\"]")) > 0
        msg.title = HttpUtils.get_content(td_list[1], "a")
        msg.from_user = HttpUtils.get_content(td_list[2], "span a b")
        if msg.from_user is None:
            # advertisement messages have no sender link, fall back to the plain text
            msg.from_user = td_list[2].contents[0]
        msg.since = HttpUtils.get_content(td_list[3], "span")
        link = HttpUtils.get_attr(td_list[1], "a", "href")
        msg.id = link.split("id=")[1]

        messages.append(msg)

    print("--------------------------------------")
    index = 1
    for msg in messages:
        print("{:<2}|".format(index) + str(msg))
        index += 1
    print("--------------------------------------")

    return messages
def parse_page(self, soup_obj):
    items = soup_obj.select("item")
    assert len(items) != 0

    seeds = []
    for item in items:
        try:
            # the <title> is expected to look like "Name [size]", with the size in MB
            info = HttpUtils.get_content(item, "title").split("[")
            seed = SeedInfo()
            seed.title = info[0].strip()
            seed.size = HttpUtils.pretty_format(info[1].split("]")[0], "MB")
            seed.url = HttpUtils.get_attr(item, "enclosure", "url")
            seed.id = self.parse_id(seed.url)
            # Cache().set(seed.id, str(seed))
            seeds.append(seed)
        except Exception as e:
            print(e)

    return seeds
def vote(self):
    self.check_in()

    source_list_url_template = "http://www.miui.com/home.php?mod=space&uid=133153462&do=thread&view=me&order=dateline&from=space&page={0}"

    page_num = 1
    max_cnt = 10
    cnt = 0
    stop_flag = False
    while not stop_flag:
        soup = HttpUtils.get(source_list_url_template.format(page_num),
                             headers=self.site.login_headers)
        assert soup is not None
        page_num += 1

        current_score = self.get_score()
        previous_score = current_score

        article_urls = HttpUtils.get_attrs(soup, "div.tl th > a", "href")
        for article_url in article_urls:
            try:
                article_url = "http://www.miui.com/" + article_url
                article_soup = HttpUtils.get(article_url, headers=self.site.login_headers)
                assert article_soup is not None

                title = HttpUtils.get_content(article_soup, "title")
                form = article_soup.select("#poll", limit=1)
                option = article_soup.select("#option_1", limit=1)

                # skip articles that have no poll or no first option
                if form is None or len(form) == 0:
                    continue
                if option is None or len(option) == 0:
                    continue

                print(title)

                # submit the vote for the first option
                post_url = "http://www.miui.com/" + HttpUtils.get_attr(article_soup, "#poll", "action") + "&inajax=1"
                post_data = dict()
                post_data["pollanswers[]"] = HttpUtils.get_attr(article_soup, "#option_1", "value")
                post_data["formhash"] = self.form_hash_mirror

                post_result = HttpUtils.post(post_url, headers=self.site.login_headers,
                                             data=post_data, returnRaw=False)
                assert post_result is not None

                current_score = self.get_score()
                print(previous_score)
                print(current_score)

                # stop once the vote limit is reached or the score no longer grows
                cnt += 1
                if cnt >= max_cnt or previous_score == current_score:
                    stop_flag = True
                    break

                previous_score = current_score
                time.sleep(60)
            except Exception:
                pass
def crawl_book(cls):
    tag_source_url = "https://book.douban.com/tag/"
    soup_obj = HttpUtils.get(tag_source_url)
    tags = HttpUtils.get_contents(soup_obj, "div.article tr td a")

    # override the crawled tag list with a fixed selection of tags
    tags = ['小说', '外国文学', '文学', '中国文学', '经典', '日本文学', '古典文学', '王小波',
            '当代文学', '钱钟书', '外国名著', '推理', '绘本', '青春', '东野圭吾', '科幻',
            '言情', '悬疑', '奇幻', '韩寒', '推理小说', '阿加莎·克里斯蒂', '科幻小说', '魔幻',
            '历史', '心理学', '哲学', '传记', '文化', '社会学', '艺术', '设计', '社会',
            '政治', '建筑', '宗教', '电影', '政治学', '数学', '中国历史', '回忆录', '思想',
            '国学', '人物传记', '人文', '音乐', '艺术史', '绘画', '戏剧', '西方哲学', '二战',
            '军事', '佛教', '近代史', '考古', '自由主义', '美术', '爱情', '旅行', '成长',
            '生活', '心理', '励志', '摄影', '教育', '游记', '灵修', '健康', '情感', '两性',
            '人际关系', '手工', '养生', '家居', '自助游', '经济学', '管理', '经济', '商业',
            '金融', '投资', '营销', '理财', '创业', '广告', '股票', '企业史', '策划', '科普',
            '互联网', '编程', '科学', '交互设计', '用户体验', '算法', '科技', 'web', 'UE',
            '交互', '通信', 'UCD', '神经网络', '程序']
    print(tags)

    book_shelf = dict()

    for tag in tags:
        for page in range(0, 10):
            url = "https://book.douban.com/tag/%s?start=%d&type=T" % (tag, page * 20)
            soup_obj = HttpUtils.get(url)
            if soup_obj is None:
                print("blocked?")
                break

            print(tag, page)

            books_obj = soup_obj.select("#subject_list ul > li")
            if len(books_obj) == 0:
                break

            for book_obj in books_obj:
                try:
                    title = HttpUtils.get_attr(book_obj, "h2 a", "title")
                    rating = float(HttpUtils.get_content(book_obj, "span.rating_nums"))
                    people = int(HttpUtils.get_content(book_obj, "span.pl").strip()
                                 .replace("人评价", "").replace("(", "").replace(")", ""))
                    if people > cls.people_threshold:
                        if title in book_shelf:
                            book_shelf[title].tag.append(tag)
                        else:
                            book_shelf[title] = Book(title, rating, people, [tag])
                except Exception:
                    # skip entries that are missing a rating or review count
                    pass

            # pause briefly to stay under the per-IP request limit within the time window
            sleep(random() * 0.5 + 0.5)

    books = list(book_shelf.values())
    with open("douban_book_raw.txt", "w") as fp:
        fp.write(json.dumps(books, default=Book.convert))
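# The sketch below is an assumption, not the repository's actual Book class: it only
# illustrates the shape that Book and the Book.convert hook passed to
# json.dumps(default=...) need to have for crawl_book above to serialize its results.
# Field names follow the Book(title, rating, people, [tag]) constructor call and the
# .tag attribute used in crawl_book.
class Book(object):
    def __init__(self, title, rating, people, tag):
        self.title = title      # book title
        self.rating = rating    # average rating, e.g. 8.7
        self.people = people    # number of ratings
        self.tag = tag          # list of tags the book was found under

    @staticmethod
    def convert(obj):
        # json.dumps calls this hook for objects it cannot serialize natively;
        # return a plain dict so Book instances become JSON objects
        if isinstance(obj, Book):
            return {"title": obj.title, "rating": obj.rating,
                    "people": obj.people, "tag": obj.tag}
        raise TypeError("%r is not JSON serializable" % obj)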