def parse(self):
    res = []
    comment_list = self.soup.select(".txt.J_rptlist")
    if len(comment_list) == 0:
        raise ValueError("The page is not what we want.")
    for comment_block in comment_list:
        j_report = comment_block.find(class_="j_report")
        shop_id = str(j_report["data-sid"])
        shop_url = comment_block.find(class_="J_rpttitle")["href"]
        # Queue the shop page this review belongs to.
        self._next_links.append(
            UrlData(shop_url, self._url_data.url, type=SHOP,
                    collection=SHOP, id=shop_id))
        review_id = j_report["data-id"]
        review = {
            "shop-id": shop_id,
            "member-id": self._url_data.id,
            "id": review_id
        }
        score_spans = comment_block.select(".mode-tc.comm-rst span")
        if len(score_spans) == 0 or not score_spans[0].has_attr('class'):
            # No inline star rating: queue the full review page instead.
            review_url = "http://www.dianping.com/review/%s" % review_id
            self._next_links.append(
                UrlData(review_url, self._url_data.url, type=REVIEW,
                        collection=REVIEW, id=review_id))
            continue
        # The star rating is encoded in the last two digits of the class name
        # (e.g. 40 -> 4.0).
        star_str = score_spans[0]['class'][1][-2:]
        review["star"] = float(star_str) / 10.0
        if len(score_spans) > 1:
            # Cost figure ("pay"), if a second score span is present.
            pay = search_by_regex(r'(\d+)', score_spans[1].text.strip())[0]
            review["pay"] = int(pay)
        review["comment"] = comment_block.find(
            class_="mode-tc comm-entry").text.strip()
        create_time = comment_block.find(
            class_="mode-tc info").span.text.strip()
        review["create-time"] = self.supplement_time_format(
            create_time[3:])
        res.append(review)
    return res
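# search_by_regex is used above (and in the shop-review parser below) but its
# definition is not included here. A minimal sketch of what it presumably
# does, assuming it just returns the capture groups of the first match; the
# behavior on a miss is a guess.
import re


def search_by_regex(pattern, text):
    # e.g. search_by_regex(r'(\d+)', u'40 yuan') -> (u'40',)
    match = re.search(pattern, text)
    if match is None:
        raise ValueError("pattern %r not found in %r" % (pattern, text))
    return match.groups()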
def update_links(self):
    member_reviews_url = self._url_data.url + "/reviews?reviewCityId=2&reviewShopType=10"
    self._next_links.append(
        UrlData(member_reviews_url, self._url_data.url, type=MEMBER,
                collection=REVIEW, id=self._url_data.id))
    member_wish_url = self._url_data.url + "/wishlists?favorTag=s10_c2_t-1"
    self._next_links.append(
        UrlData(member_wish_url, self._url_data.url, type=MEMBER,
                collection=WISH_LIST, id=self._url_data.id))
def crawl(self, url, ignore_exists):
    if not url.startswith('http://'):
        self._dao.update("unfinished", {"url": url},
                         {"url": "http://" + url}, False)
        url = "http://" + url
    CrawlerClass.crawl_num += 1
    if CrawlerClass.crawl_num % 10 == 0:
        print "==============crawl_num: %d success_num: %d==============" % \
            (CrawlerClass.crawl_num, CrawlerClass.success_num)
    page = UrlData(url)
    if not ignore_exists and self._dao.exists(COLL_URL_LIST, url=url):
        print "[Already Crawled] %s" % url
        self._dao.remove(COLL_UNFINISHED, url=url)
        CrawlerClass.success_num += 1
        return
    if self.whether_to_skip(page.collection):
        self._dao.move_to_last(COLL_UNFINISHED, url=url)
        # print "Crawl [%s] %s later..." % (page.collection, url)
        return
    try:
        crawled_data, links = self.crawl_page(page)
        # Insert the parsed records; remember newly seen review ids.
        with open('./newly_review_ids', 'a') as fopen:
            for data in crawled_data:
                self._dao.insert_with_update(page.collection, data)
                if page.collection == "review":
                    fopen.write("%s\n" % data["id"])
                if page.collection in ["wishlist", "review"]:
                    # Mark the related member and shop so their item2vec
                    # vectors get recomputed.
                    for coll in ["member", "shop"]:
                        self._dao.update(coll, {"id": data["%s-id" % coll]},
                                         {"item2vec": False}, upsert=False)
        # Next links: queue only URLs not already crawled or scheduled.
        for link in links:
            if self._dao.exists(COLL_URL_LIST, url=link.url) or \
                    self._dao.exists(COLL_UNFINISHED, url=link.url):
                continue
            self._dao.insert(COLL_UNFINISHED, url=link.url)
        self.done_crawl(page)
        CrawlerClass.success_num += 1
        print "[%s][Crawled][%s] %s" % (
            threading.currentThread().getName(), page.collection, page.url)
    except (urllib2.URLError, urllib2.HTTPError, socket.error):
        # Network error: push the URL back and retry it later.
        self._dao.move_to_last(COLL_UNFINISHED, url=url)
        # print "[%s][Exception][%s] %s: %s" % (threading.currentThread().getName(), page.collection, url, ex)
    except (ValueError, AttributeError, Exception), ex:
        self._dao.move_to_last(COLL_UNFINISHED, url=url)
        print "[%s] %s: %s" % (threading.currentThread().getName(), url, ex)
def parse(self):
    res = []
    # select() returns a (possibly empty) list, never None.
    if len(self.soup.select(".modebox.p-tabs-box")) == 0:
        raise ValueError("The page is not what we want.")
    favor_list = self.soup.select(".pic-txt.favor-list li")
    for favorShop in favor_list:
        if favorShop.find(class_="tag-stop") is not None:
            break
        favor = {
            "member-id": self._url_data.id,
            "shop-id": str(favorShop.find(class_="J_favor")["referid"])
        }
        favor_time = favorShop.find(class_="time").text.strip()
        favor["time"] = self.supplement_str_time(favor_time)
        shop_url = "http://www.dianping.com/shop/%s" % favor["shop-id"]
        self._next_links.append(
            UrlData(shop_url, self._url_data.url, type=SHOP,
                    collection=SHOP, id=favor["shop-id"]))
        res.append(favor)
    return res
def update_links(self):
    if self.skip:
        return
    for review_type in ["good", "middle", "bad"]:
        review_suffix = "queryType=reviewGrade&queryVal=%s" % review_type
        self._next_links.append(
            UrlData("%s/review_all?%s" % (self._url_data.url, review_suffix),
                    self._url_data.url, type=SHOP, collection=REVIEW,
                    id=self._url_data.id, suffix=review_suffix))
    links = self.soup.find_all(attrs={"itemprop": "url"})
    for link in links:
        self._next_links.append(UrlData(link['href'], self._url_data.url))
def expand_by_page(self, max_page, page_format, _type, _col, _id):
    for pg in range(2, max_page + 1):
        pg_url = page_format % pg
        pg_url = urljoin(self._url_data.url, pg_url)
        ref = page_format % (pg - 1)
        ref = urljoin(self._url_data.url, ref)
        self._next_links.append(
            UrlData(pg_url, ref, type=_type, collection=_col, id=_id))
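# A hedged usage sketch for expand_by_page: the "?pg=%d" page format and the
# member URL below are illustrative assumptions, not taken from the original
# call sites. It only demonstrates how urljoin resolves the page format
# against the current URL, which is what expand_by_page relies on.
from urlparse import urljoin  # Python 2; urllib.parse.urljoin in Python 3

base = "http://www.dianping.com/member/12345/reviews"
for pg in range(2, 4):
    print urljoin(base, "?pg=%d" % pg)
# -> http://www.dianping.com/member/12345/reviews?pg=2
# -> http://www.dianping.com/member/12345/reviews?pg=3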
def main():
    start_time = time.time()
    parser = add_parser()
    args = parser.parse_args()
    app_dir = args.path + "\\" + args.app_dir
    global encoding
    encoding = args.encoding
    # line feed
    line_feed = "\n"
    # use console to show information
    global console
    console = Console()
    console.show("Target Path: " + args.path)
    console.show("Webapp Directory: " + app_dir)
    console.show("Testing Website: " + args.website)
    console.show("Output File: " + args.output)
    # start fetching
    console.show("Start fetching url and its parameters in " + args.path)
    global url_data
    url_data = UrlData()
    get_url_list(args.path, app_dir, args.website)
    url_amount = url_data.len()
    # fetch complete
    console.show("Fetched " + str(url_amount) + " url(s).")
    if args.get_status != 1 or args.website == "":
        url_data.export(args.output)
        # exit
        sys.exit()
    console.show("Start testing url status with "
                 + str(args.thread_num) + " thread(s).")
    # init thread pool
    pool = ThreadPool(args.thread_num)
    for url in url_data.get_urls():
        pool.add_task(url_request, url)
    console.show_progress(pool.get_progress(), url_amount)
    while pool.get_progress() != url_amount:
        console.show_progress(pool.get_progress(), url_amount)
    # pool.destroy()
    finish_time = time.time()
    elapsed_time = int(finish_time - start_time)
    # export
    url_data.export(args.output)
    console.show("Task finished in " + str(elapsed_time) + " seconds.")
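# url_request (the task handed to the thread pool above) is not shown. A
# minimal sketch under these assumptions: it fetches the URL, takes the HTTP
# status code, and records it on the global url_data object through a
# hypothetical set_status(url, code) method -- only len/get_urls/export are
# visible above, so that method name is a guess.
import socket
import urllib2


def url_request(url):
    try:
        code = urllib2.urlopen(url, timeout=10).getcode()
    except urllib2.HTTPError, err:
        code = err.code
    except (urllib2.URLError, socket.error):
        code = -1
    url_data.set_status(url, code)  # hypothetical recording method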
def expand_by_page_randomly(self, max_page, page_format, _type, _col, _id):
    # Enqueue a random subset (roughly half to all) of the pages; the same
    # page may be drawn more than once.
    prob = random.uniform(0.5, 1)
    page_limit = int(math.ceil(max_page * prob))
    last_pg = 1
    for i in range(1, page_limit):
        pg = random.randint(2, max_page)
        pg_url = page_format % pg
        pg_url = urljoin(self._url_data.url, pg_url)
        ref = page_format % last_pg
        ref = urljoin(self._url_data.url, ref)
        self._next_links.append(
            UrlData(pg_url, ref, type=_type, collection=_col, id=_id))
        last_pg = pg
def update_links(self):
    if self.skip:
        return
    links = self.soup('a')
    for link in links:
        if link.has_attr('href'):
            url = urljoin(self._url_data.url, link['href'])
            # ignore invalid url
            if url.find("'") != -1:
                continue
            url = url.split('#')[0]
            # ignore picture url
            if url.endswith(('jpg', 'jpeg', 'svg', 'png', 'gif', 'bmp')):
                continue
            url_data = UrlData(url, self._url_data.url)
            if url_data.type == '':
                continue
            if url_data.type in ['shop', 'member', 'review'] and \
                    url_data.collection == '':
                continue
            self._next_links.append(url_data)
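# UrlData is constructed from a raw URL above and then filtered on its .type
# and .collection attributes, so it presumably classifies the URL from its
# path. A minimal standalone sketch of that idea, assuming the path layout
# seen elsewhere in this file ("/shop/<id>", "/member/<id>", "/review/<id>");
# the real UrlData class is not shown and may use different rules.
import re


def classify_dianping_url(url):
    # Returns (type, id), or ('', None) when the URL is not one we track.
    match = re.search(r'dianping\.com/(shop|member|review)/(\d+)', url)
    if match is None:
        return '', None
    return match.group(1), match.group(2)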
def parse(self):
    if self.soup.find(class_="not-found"):
        raise ValueError("Crawler has been captured !!!")
    res = []
    if self.soup.find(class_="errorMessage") is not None:
        self.skip = True
        print "Shop %s is closed. -> %s" % (self._url_data.id,
                                            self._url_data.url)
        return []
    nav_w = self.soup.select(".list-crumb a")
    region = nav_w[0].text.strip()
    print "Shop %s region %s." % (self._url_data.id, region)
    if region not in title_beijing:
        self.skip = True
        print "Review %s is not for shop in Beijing. -> %s" % (
            self._url_data.id, self._url_data.url)
        return []
    comment_list = self.soup.select(".reviews-items > ul > li")
    if len(comment_list) == 0:
        raise ValueError("The page is not what we want.")
    for comment_block in comment_list:
        review = {
            "id": comment_block.find(class_='report')['data-id'],
            "shop-id": self._url_data.id,
            "member-id":
                comment_block.find(class_='dper-photo-aside')['data-user-id']
        }
        member_url = "http://www.dianping.com/member/%s" % review["member-id"]
        self._next_links.append(
            UrlData(member_url, self._url_data.url, type=MEMBER,
                    collection=MEMBER, id=review["member-id"]))
        # Review rank: the star class looks like "sml_strXX",
        # where XX is ten times the score.
        review_rank = comment_block.find(class_="review-rank")
        star_block = review_rank.find(class_="star")
        if star_block is not None:
            review["star"] = float(star_block['class'][1][7:]) / 10
        score_list = review_rank.select(".score .item")
        key_val_pattern = u'(.+)\s*:\s*(.+)'
        if len(score_list) > 0:
            for _ in score_list:
                key_val = search_by_regex(key_val_pattern, _.text.strip())
                if key_val[0] not in key_map and key_val[0] in pay_titles:
                    review["pay"] = int(
                        search_by_regex(r'(\d+)', key_val[1])[0])
                else:
                    review[key_map[key_val[0]]] = des_value[key_val[1]]
        # Review words
        review["comment"] = comment_block.find(
            class_="review-words").get_text(' ', 'br/')
        # Recommended dishes
        recommend_block = comment_block.find(class_="review-recommend")
        if recommend_block:
            review["recommend"] = [
                dish.text.strip()
                for dish in recommend_block.select(".col-exp")
            ]
        # Time: split on "更新于" ("updated at") to separate the create time
        # from an optional update time.
        time_raw_str = comment_block.find(class_="time").text.strip()
        times = time_raw_str.split(u'更新于')
        review["create-time"] = self.supplement_time_format(times[0].strip())
        if len(times) > 1:
            review["update-time"] = self.supplement_time_format(times[1])
        # Heart count
        heart_num_block = comment_block.find(
            class_="reply").find_previous_sibling('em')
        if heart_num_block is not None:
            review["heart-num"] = int(heart_num_block.text.strip("(|)"))
        else:
            review["heart-num"] = 0
        res.append(review)
    return res
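# supplement_time_format (used by the review parsers above) is not shown. A
# minimal sketch of what it presumably does, assuming Dianping omits the year
# for current-year dates ("MM-DD") and abbreviates older ones ("YY-MM-DD");
# the exact rules of the original helper may differ.
import datetime


def supplement_time_format(time_str):
    time_str = time_str.strip()
    parts = time_str.split('-')
    if len(parts) == 2:
        # "MM-DD": assume the current year.
        return "%d-%s" % (datetime.date.today().year, time_str)
    if len(parts) == 3 and len(parts[0]) == 2:
        # "YY-MM-DD": expand to a four-digit year.
        return "20" + time_str
    return time_str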