Example #1
    def parse(self):
        res = []

        comment_list = self.soup.select(".txt.J_rptlist")
        if len(comment_list) == 0:
            raise ValueError("The page is not what we want.")

        for comment_block in comment_list:
            j_report = comment_block.find(class_="j_report")
            shop_id = str(j_report["data-sid"])
            shop_url = comment_block.find(class_="J_rpttitle")["href"]
            self._next_links.append(
                UrlData(shop_url,
                        self._url_data.url,
                        type=SHOP,
                        collection=SHOP,
                        id=shop_id))

            review_id = j_report["data-id"]

            review = {
                "shop-id": shop_id,
                "member-id": self._url_data.id,
                "id": review_id
            }
            score_spans = comment_block.select(".mode-tc.comm-rst span")
            # No inline rating on the list page: queue the standalone review
            # page instead and move on to the next block.
            if len(score_spans) == 0 or not score_spans[0].has_attr('class'):
                review_url = "http://www.dianping.com/review/%s" % review_id
                self._next_links.append(
                    UrlData(review_url,
                            self._url_data.url,
                            type=REVIEW,
                            collection=REVIEW,
                            id=review_id))
                continue

            # The rating is encoded in the span's class name; the last two
            # characters are the score times ten (e.g. "...40" -> 4.0 stars).
            star_str = score_spans[0]['class'][1][-2:]
            review["star"] = float(star_str) / 10.0

            if len(score_spans) > 1:
                pay = search_by_regex(r'(\d+)', score_spans[1].text.strip())[0]
                review["pay"] = int(pay)
            review["comment"] = comment_block.find(
                class_="mode-tc comm-entry").text.strip()

            create_time = comment_block.find(
                class_="mode-tc info").span.text.strip()
            # Drop the three-character label in front of the date before
            # normalising it.
            review["create-time"] = self.supplement_time_format(
                create_time[3:])
            res.append(review)

        return res
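The search_by_regex helper is not defined in these examples; judging from its call sites here and in Example #10, it appears to return the captured groups of the first match. A minimal sketch under that assumption (not the project's actual implementation):

import re

def search_by_regex(pattern, text):
    # Assumed behaviour: return the captured groups of the first match, or an
    # empty tuple when nothing matches; both call sites index into the result.
    match = re.search(pattern, text, re.UNICODE)
    return match.groups() if match else ()

With that shape, search_by_regex(r'(\d+)', u'120')[0] yields '120' for the "pay" field above, and the two-group pattern in Example #10 yields a (label, value) pair.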
Example #2
    def update_links(self):
        member_reviews_url = self._url_data.url + "/reviews?reviewCityId=2&reviewShopType=10"
        self._next_links.append(
            UrlData(member_reviews_url,
                    self._url_data.url,
                    type=MEMBER,
                    collection=REVIEW,
                    id=self._url_data.id))
        member_wish_url = self._url_data.url + "/wishlists?favorTag=s10_c2_t-1"
        self._next_links.append(
            UrlData(member_wish_url,
                    self._url_data.url,
                    type=MEMBER,
                    collection=WISH_LIST,
                    id=self._url_data.id))
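Both query strings are taken verbatim from the code: reviewCityId=2&reviewShopType=10 narrows the member's review list, and favorTag=s10_c2_t-1 the wishlist, to one city and shop type. Given the Beijing-only check in Example #10, these presumably select Beijing restaurants, though the exact meaning of the numeric ids is an assumption.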
Example #3
    def crawl(self, url, ignore_exists):
        if not url.startswith('http://'):
            self._dao.update("unfinished", {"url": url},
                             {"url": "http://" + url}, False)
            url = "http://" + url

        CrawlerClass.crawl_num += 1
        if CrawlerClass.crawl_num % 10 == 0:
            print "==============crawl_num: %d success_num: %d==============" % \
                  (CrawlerClass.crawl_num, CrawlerClass.success_num)

        page = UrlData(url)

        if not ignore_exists and self._dao.exists(COLL_URL_LIST, url=url):
            print "[Already Crawled] %s" % url
            self._dao.remove(COLL_UNFINISHED, url=url)
            CrawlerClass.success_num += 1
            return

        if self.whether_to_skip(page.collection):
            self._dao.move_to_last(COLL_UNFINISHED, url=url)
            # print "Crawl [%s] %s later..." % (page.collection, url)
            return

        try:
            crawled_data, links = self.crawl_page(page)

            # Insert
            with open('./newly_review_ids', 'a') as fopen:
                for data in crawled_data:
                    self._dao.insert_with_update(page.collection, data)
                    if page.collection == "review":
                        fopen.write("%s\n" % data["id"])
                    if page.collection in ["wishlist", "review"]:
                        for coll in ["member", "shop"]:
                            self._dao.update(coll,
                                             {"id": data["%s-id" % coll]},
                                             {"item2vec": False},
                                             upsert=False)

            # Next Links
            for link in links:
                if self._dao.exists(COLL_URL_LIST,
                                    url=link.url) or self._dao.exists(
                                        COLL_UNFINISHED, url=link.url):
                    continue
                self._dao.insert(COLL_UNFINISHED, url=link.url)

            self.done_crawl(page)
            CrawlerClass.success_num += 1
            print "[%s][Crawled][%s] %s" % (
                threading.currentThread().getName(), page.collection, page.url)

        except (urllib2.URLError, urllib2.HTTPError, socket.error):
            self._dao.move_to_last(COLL_UNFINISHED, url=url)
            # print "[%s][Exception][%s] %s: %s" % (threading.currentThread().getName(), page.collection, url, ex)
        except Exception, ex:  # parse errors (ValueError, AttributeError) and anything unexpected
            self._dao.move_to_last(COLL_UNFINISHED, url=url)
            print "[%s] %s: %s" % (threading.currentThread().getName(), url,
                                   ex)
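Two details are easy to miss: transient network failures (urllib2.URLError/HTTPError, socket.error) silently push the URL back to the end of the unfinished queue, while parse failures are requeued and logged; and newly discovered links are queued only if they appear in neither COLL_URL_LIST nor COLL_UNFINISHED, so duplicate links emitted by the parsers are harmless.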
Example #4
    def parse(self):
        res = []
        if len(self.soup.select(".modebox.p-tabs-box")) == 0:
            raise ValueError("The page is not what we want.")

        favor_list = self.soup.select(".pic-txt.favor-list li")

        for favor_shop in favor_list:
            if favor_shop.find(class_="tag-stop") is not None:
                break
            favor = {
                "member-id": self._url_data.id,
                "shop-id": str(favor_shop.find(class_="J_favor")["referid"])
            }

            favor_time = favor_shop.find(class_="time").text.strip()
            favor["time"] = self.supplement_str_time(favor_time)

            shop_url = "http://www.dianping.com/shop/%s" % favor["shop-id"]
            self._next_links.append(
                UrlData(shop_url,
                        self._url_data.url,
                        type=SHOP,
                        collection=SHOP,
                        id=favor["shop-id"]))

            res.append(favor)

        return res
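The loop breaks at the first entry carrying a tag-stop marker rather than filtering entries one by one; the exact meaning of that flag is not shown in these examples. supplement_str_time, like supplement_time_format in Example #1, is a project helper (not shown) that appears to normalise the partial date strings dianping renders.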
Example #5
    def update_links(self):
        if self.skip:
            return

        for review_type in ["good", "middle", "bad"]:
            review_suffix = "queryType=reviewGrade&queryVal=%s" % review_type
            self._next_links.append(
                UrlData("%s/review_all?%s" %
                        (self._url_data.url, review_suffix),
                        self._url_data.url,
                        type=SHOP,
                        collection=REVIEW,
                        id=self._url_data.id,
                        suffix=review_suffix))

        links = self.soup.find_all(attrs={"itemprop": "url"})
        for link in links:
            self._next_links.append(UrlData(link['href'], self._url_data.url))
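The three suffixes queryType=reviewGrade&queryVal=good|middle|bad correspond to the grade filter tabs of a shop's review_all page, so positive, neutral and negative reviews are crawled separately; the itemprop="url" anchors collected afterwards are whatever other shops the page exposes through its microdata markup (presumably nearby or recommended shops).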
Example #6
    def expand_by_page(self, max_page, page_format, _type, _col, _id):
        for pg in range(2, max_page + 1):
            pg_url = page_format % pg
            pg_url = urljoin(self._url_data.url, pg_url)
            ref = page_format % (pg - 1)
            ref = urljoin(self._url_data.url, ref)

            self._next_links.append(
                UrlData(pg_url, ref, type=_type, collection=_col, id=_id))
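A self-contained illustration of the URL arithmetic above; the page_format value is made up, since the real format strings are supplied by callers that are not shown here:

from urlparse import urljoin  # urllib.parse on Python 3

base = "http://www.dianping.com/shop/123456"
page_format = "/shop/123456/p%d"  # hypothetical pagination format
for pg in range(2, 4):
    page_url = urljoin(base, page_format % pg)
    referer = urljoin(base, page_format % (pg - 1))
    print page_url, "<-", referer

Each queued page cites its predecessor as the referer, which is what the ref computation above is for.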
Example #7
def main():
    start_time = time.time()
    parser = add_parser()
    args = parser.parse_args()
    app_dir = os.path.join(args.path, args.app_dir)
    global encoding
    encoding = args.encoding
    # use console to show information
    global console
    console = Console()

    console.show("Target Path:      " + args.path)
    console.show("Webapp Directory: " + app_dir)
    console.show("Testing Website:  " + args.website)
    console.show("Output File:      " + args.output)

    # start fetching
    console.show("Start fetching url and its parameters in " + args.path)

    global url_data
    url_data = UrlData()
    get_url_list(args.path, app_dir, args.website)
    url_amount = url_data.len()

    #fetch complete
    console.show("Fetched " + str(url_amount) + " url(s).")
    if args.get_status != 1 or args.website == "":
        url_data.export(args.output)
        #exit
        sys.exit()

    console.show("Start testing url status with " \
            + str(args.thread_num) + " thread(s).")
    # init thread pool
    pool = ThreadPool(args.thread_num)

    for url in url_data.get_urls():
        pool.add_task(url_request, url)
        console.show_progress(pool.get_progress(), url_amount)

    # poll until every queued task has completed
    while pool.get_progress() != url_amount:
        console.show_progress(pool.get_progress(), url_amount)

    #pool.destroy()
    finish_time = time.time()
    elapsed_time = int(finish_time - start_time)
    #export
    url_data.export(args.output)
    console.show("Task finished in " + str(elapsed_time) + " seconds.")
Example #8
    def expand_by_page_randomly(self, max_page, page_format, _type, _col, _id):
        prob = random.uniform(0.5, 1)
        page_limit = int(math.ceil(max_page * prob))

        last_pg = 1
        for i in range(1, page_limit):
            pg = random.randint(2, max_page)
            pg_url = page_format % pg
            pg_url = urljoin(self._url_data.url, pg_url)
            ref = page_format % last_pg
            ref = urljoin(self._url_data.url, ref)

            self._next_links.append(
                UrlData(pg_url, ref, type=_type, collection=_col, id=_id))

            last_pg = pg
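Because the page numbers are drawn with random.randint, the method queues ceil(max_page * U(0.5, 1)) - 1 pages from the range 2..max_page, possibly with repeats, and never re-queues page 1. The duplicate-link checks in Example #3 (exists against COLL_URL_LIST and COLL_UNFINISHED) filter the repeats out before anything is fetched.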
Example #9
    def update_links(self):
        if self.skip:
            return
        links = self.soup('a')
        for link in links:
            if link.has_attr('href'):
                url = urljoin(self._url_data.url, link['href'])
                # ignore invalid url
                if "'" in url:
                    continue

                url = url.split('#')[0]
                # ignore picture url
                if url.endswith(('jpg', 'jpeg', 'svg', 'png', 'gif', 'bmp')):
                    continue

                url_data = UrlData(url, self._url_data.url)
                if url_data.type == '':
                    continue
                if (url_data.type in ['shop', 'member', 'review']
                        and url_data.collection == ''):
                    continue

                self._next_links.append(url_data)
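UrlData itself is not shown, but the checks above imply it derives a type (shop, member, review, ...) and a collection from the URL. A rough, assumed classifier for orientation only, not the project's real logic:

import re

def guess_type(url):
    # Assumption: the type is simply the dianping path segment, e.g.
    # http://www.dianping.com/member/123456 -> 'member'.
    for t in ('shop', 'member', 'review'):
        if re.search(r'dianping\.com/%s/' % t, url):
            return t
    return ''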
Example #10
    def parse(self):
        if self.soup.find(class_="not-found"):
            raise ValueError("Crawler has been captured !!!")

        res = []
        if self.soup.find(class_="errorMessage") is not None:
            self.skip = True
            print "Shop %s is closed. -> %s" % (self._url_data.id,
                                                self._url_data.url)
            return []

        nav_w = self.soup.select(".list-crumb a")
        region = nav_w[0].text.strip()
        print "Shop %s region %s." % (self._url_data.id, region)
        if region not in title_beijing:
            self.skip = True
            print "Review %s is not for shop in Beijing. -> %s" % (
                self._url_data.id, self._url_data.url)
            return []

        comment_list = self.soup.select(".reviews-items > ul > li")
        if len(comment_list) == 0:
            raise ValueError("The page is not what we want.")

        for comment_block in comment_list:
            review = {
                "id": comment_block.find(class_='report')['data-id'],
                "shop-id": self._url_data.id,
                "member-id": comment_block.find(
                    class_='dper-photo-aside')['data-user-id']
            }

            member_url = "http://www.dianping.com/member/%s" % review[
                "member-id"]
            self._next_links.append(
                UrlData(member_url,
                        self._url_data.url,
                        type=MEMBER,
                        collection=MEMBER,
                        id=review["member-id"]))

            # Review rank
            review_rank = comment_block.find(class_="review-rank")
            star_block = review_rank.find(class_="star")
            if star_block is not None:
                review["star"] = float(
                    star_block['class'][1][7:]) / 10  # sml_strXX

            score_list = review_rank.select(".score .item")
            key_val_pattern = ur'(.+)\s*:\s*(.+)'
            for item in score_list:
                key_val = search_by_regex(key_val_pattern, item.text.strip())
                if key_val[0] not in key_map and key_val[0] in pay_titles:
                    review["pay"] = int(
                        search_by_regex(r'(\d+)', key_val[1])[0])
                else:
                    review[key_map[key_val[0]]] = des_value[key_val[1]]

            # Review words
            review["comment"] = comment_block.find(
                class_="review-words").get_text(' ', 'br/')

            # Commend
            recommend_block = comment_block.find(class_="review-recommend")
            if recommend_block:
                review["recommend"] = [
                    dish.text.strip()
                    for dish in recommend_block.select(".col-exp")
                ]

            # Time
            time_raw_str = comment_block.find(class_="time").text.strip()
            times = time_raw_str.split(u'更新于')
            review["create-time"] = self.supplement_time_format(
                times[0].strip())
            if len(times) > 1:
                review["update-time"] = self.supplement_time_format(times[1])

            # Heart
            heart_num_block = comment_block.find(
                class_="reply").find_previous_sibling('em')
            if heart_num_block is not None:
                review["heart-num"] = int(heart_num_block.text.strip("(|)"))
            else:
                review["heart-num"] = 0

            res.append(review)
        return res
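The score parsing relies on three lookup tables defined elsewhere in the project (key_map, des_value, pay_titles). Their real contents are not part of these examples; the code only implies something of the following shape, so treat this as a guess for orientation:

# Guessed shapes only; the real tables live elsewhere in the project.
key_map = {u"口味": "taste", u"环境": "environment", u"服务": "service"}  # rating label -> field name
des_value = {u"非常好": 5, u"很好": 4, u"好": 3, u"一般": 2, u"差": 1}  # grade text -> numeric value
pay_titles = [u"人均"]  # labels whose value is a per-person price

The star rating itself needs no table: the CSS class encodes the score times ten (the sml_strXX comment above), so "sml_str45"[7:] gives "45" and dividing by 10 yields 4.5.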