Example #1
class MeituanSpider(CrawlSpider):
    name = 'meituan'
    allowed_domains = ['meituan.com']
    start_urls = []

    startUrlsFile = "../hlwdata/data/url/meituan_start_url.txt"
    downLoadUrlsFile = "../hlwdata/data/url/meituan_download_url.txt"

    lst = loadUrl(downLoadUrlsFile)
    rules = (Rule(FilterLinkExtractor(
        allow=r'http://cq.meituan.com/shop/[\d]+\.*[\w]*$', download=lst),
                  callback='parse_meituan',
                  process_links='link_filtering',
                  follow=True), )

    def link_filtering(self, links):
        # Strip a trailing '.html' suffix; rstrip('.html') would remove any
        # trailing '.', 'h', 't', 'm', 'l' characters, not just the suffix.
        for link in links:
            if link.url.endswith('.html'):
                link.url = link.url[:-len('.html')]
        return links

    def start_requests(self):
        self.start_urls += loadUrl(self.startUrlsFile)
        for url in self.start_urls:
            # Each start page is fetched twice: once parsed directly by
            # parse_meituan, and once through CrawlSpider's default parse so
            # the crawl rules are applied to the links it contains.
            yield Request(url, callback=self.parse_meituan)
            yield self.make_requests_from_url(url)

    def parse_meituan(self, response):
        # Record the URL as downloaded, then delegate to the module-level
        # parse_meituan() helper imported from the project (not this method).
        with open(self.downLoadUrlsFile, 'a') as f:
            f.write(response.url + '\n')
        item = parse_meituan(response)
        if item:
            return item
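
These spiders lean on two project helpers that are not shown on this page: loadUrl, which reads a list of URLs from a text file, and FilterLinkExtractor, a LinkExtractor variant whose download argument holds already-fetched URLs to skip. A minimal sketch of what they might look like, assuming exactly that behavior (the real implementations live in the super-sponge/foods project):

import os

from scrapy.linkextractors import LinkExtractor


def loadUrl(path):
    """Return the non-empty lines of *path* as a list of URLs ([] if missing)."""
    if not os.path.exists(path):
        return []
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]


class FilterLinkExtractor(LinkExtractor):
    """LinkExtractor that drops links whose URL is already in *download*."""

    def __init__(self, download=None, **kwargs):
        super(FilterLinkExtractor, self).__init__(**kwargs)
        self.downloaded = set(download or [])

    def extract_links(self, response):
        links = super(FilterLinkExtractor, self).extract_links(response)
        return [link for link in links if link.url not in self.downloaded]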
Example #2
class DianpingreviewSpider(scrapy.Spider):
    name = "dianpingreview"
    allowed_domains = ["dianping.com"]
    # Sample review pages; overridden below by the URLs loaded from file.
    start_urls = (
        'https://www.dianping.com/shop/20919783/review_more',
        'https://www.dianping.com/shop/18506539/review_more',
    )

    start_urls = loadUrl("../hlwdata/data/url/dianping_start_review_url.txt")

    def parse(self, response):

        shop_id = re.search(r'/shop/([\d]+)/review_more', response.url).group(1)
        for comment in response.xpath('//div[@class="comment-list"]/ul/li'):
            item = DianPingReviewItem()
            item['shop_id'] = shop_id
            item['user_id'] = comment.xpath('./div[@class="pic"]/a/@user-id').extract()
            item['user_name'] = comment.xpath('./div[@class="pic"]/p[@class="name"]/a/text()').extract()
            item['review_star'] = comment.xpath('./div[@class="content"]/div[@class="user-info"]/span/@class').extract()
            item['review_Content'] = comment.xpath('./div[@class="content"]/div[@class="comment-txt"]/div/text()').extract()
            item['review_date'] = comment.xpath('./div[@class="content"]/div[@class="misc-info"]/span[@class="time"]/text()').extract()
            yield item

        nextPages = response.xpath('//div[@class="Pages"]/div[@class="Pages"]/a[@class="NextPage"]/@href').extract_first()

        if nextPages:
            url = response.url.split('?')[0] + nextPages
            meta = {'rowkey': url}
            yield Request(url, dont_filter=True, callback=self.parse, meta=meta)
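
DianPingReviewItem is defined in the project's items module and is not shown in this example. A minimal sketch, assuming one Field per key assigned in parse above:

import scrapy


class DianPingReviewItem(scrapy.Item):
    # Assumed definition: one field per key populated by DianpingreviewSpider.parse.
    shop_id = scrapy.Field()
    user_id = scrapy.Field()
    user_name = scrapy.Field()
    review_star = scrapy.Field()
    review_Content = scrapy.Field()
    review_date = scrapy.Field()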
Example #3
class DianpingSpider(CrawlSpider):
    name = 'dianping'
    allowed_domains = ['dianping.com']
    start_urls = []

    startUrlsFile = "../hlwdata/data/url/dianping_start_url.txt"
    downLoadUrlsFile = "../hlwdata/data/url/dianping_download_url.txt"

    lst = loadUrl(downLoadUrlsFile)
    rules = (
        Rule(FilterLinkExtractor(allow=r'/shop/[\d]+$',
                                 deny=r'upload/shop/',
                                 download=lst),
             callback='parse_dianping',
             follow=False),
        Rule(FilterLinkExtractor(allow=r'/search/category/[\d]+/[\d]+/p[\d]+',
                                 download=lst),
             callback='parse_dianping_search',
             follow=True),
        # Rule(FilterLinkExtractor(allow=r'https://www.dianping.com/shop/[\d]+$', download = lst), callback='parse_dianping', follow=True),
    )

    def start_requests(self):
        self.start_urls += loadUrl(self.startUrlsFile)
        for url in self.start_urls:
            yield Request(url, callback=self.parse_dianping)
            yield self.make_requests_from_url(url)

    def parse_dianping(self, response):
        with open(self.downLoadUrlsFile, 'a') as f:
            f.write(response.url + '\n')
        item = parse_dianping(response)
        if item:
            return item

    def parse_dianping_search(self, response):
        pass
Example #4
    def start_requests(self):
        self.start_urls += loadUrl(self.startUrlsFile)
        for url in self.start_urls:
            yield Request(url, callback=self.parse_meituan)
            yield self.make_requests_from_url(url)
Example #5
File: nuomi.py Project: super-sponge/foods
class NuomiSpider(CrawlSpider):
    name = 'nuomi'
    allowed_domains = ['nuomi.com']
    start_urls = []

    startUrlsFile = "../hlwdata/data/url/nuomi_deal_start_url.txt"
    downLoadUrlsFile = "../hlwdata/data/url/nuomi_deal_download_url.txt"
    downshopUrlsFile = "../hlwdata/data/url/nuomi_shop_download_url.txt"
    jsonDir = "../hlwdata/data/json/nuomi/shop/"
    jsonDir = "../hlwdata/data/json/nuomi/city/"

    lst = loadUrl(downLoadUrlsFile)

    rules = (
        Rule(FilterLinkExtractor(allow=r'http://www.nuomi.com/deal/[\w]+',
                                 download=lst),
             callback='parse_nuomi_deal',
             follow=True),
        # Rule(FilterLinkExtractor(allow=r'http://www.nuomi.com/shop/[\d]+$', download = lst), callback='parse_nuomi_shop', follow=True),
    )

    def start_requests(self):
        self.start_urls += loadUrl(self.startUrlsFile)
        for url in self.start_urls:
            yield Request(url, callback=self.parse_nuomi_deal)
            yield self.make_requests_from_url(url)

    # Shop pages already fetched in earlier runs (class-level attribute).
    visitedShop = set(loadUrl(downshopUrlsFile))

    def parse_nuomi_deal(self, response):

        with open(self.downLoadUrlsFile, 'a') as f:
            f.write(response.url + '\n')
        navs = response.xpath(
            '//div[@class="w-bread-crumb"]/ul[@class="crumb-list clearfix"]/li/a/text()'
        ).extract()
        parmeta = dict()
        parmeta['nav'] = True
        parmeta['deal'] = response.url
        for i in range(6):
            parmeta['nav' + str(i)] = ''
        for i in range(len(navs)):
            parmeta['nav' + str(i)] = navs[i].strip('\n')

        dealId = response.xpath(
            '//div[@class="p-item-info"]/@mon').extract_first().split('=')[1]
        dealUrl = 'http://www.nuomi.com/pcindex/main/shopchain?dealId=' + dealId

        # html = requests.get(dealUrl, headers=self.headers)
        # js['data']['city'][shopCity]
        html = requests.get(dealUrl)
        js = json.loads(html.text)
        for shop in js['data']['shop']:
            shopCity = shop['city_id']
            district_id = shop['district_id']
            shopId = shop['merchant_id']
            with open(self.jsonDir + shopId, 'w') as f:
                f.write(json.dumps(shop))
            with open(self.jsonDir + shopId + '.' + shopCity, 'w') as f:
                f.write(json.dumps(js['data']['city'][shopCity]))

            shoplink = shop['link']
            # Only fetch food info for Chongqing
            # if shopId in self.visitedShop or shopCity != u'900010000':
            if shoplink in self.visitedShop:
                continue
            else:
                self.visitedShop.add(shoplink)
            city = js['data']['city'][shopCity]
            shopCityName = city['city_name']
            district = city['district'][district_id]['dist_name']

            parmeta['shopCityName'] = shopCityName
            parmeta['district'] = district

            # Copy parmeta: it is reused across iterations, so sharing one
            # dict would give every request the last shop's city/district.
            yield scrapy.Request(shop['link'],
                                 self.parse_nuomi_shop,
                                 meta=dict(parmeta))

    def parse_nuomi_shop(self, response):
        with open(self.downshopUrlsFile, 'a') as f:
            f.write(response.url + '\n')
        item = parse_nuomi(response, meta=response.meta)
        if item:
            return item
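
parse_nuomi_deal fetches the shopchain JSON with the blocking requests library, which stalls Scrapy's event loop for the duration of every call. A non-blocking alternative (a sketch, not the project's code; parse_shopchain is a hypothetical callback name) is to yield the JSON URL as a normal scrapy.Request and handle the shop list in a second callback:

    def parse_nuomi_deal(self, response):
        # ... build parmeta and dealId as above, then hand the JSON URL back
        # to Scrapy instead of calling requests.get() synchronously.
        dealUrl = 'http://www.nuomi.com/pcindex/main/shopchain?dealId=' + dealId
        yield scrapy.Request(dealUrl, callback=self.parse_shopchain, meta=parmeta)

    def parse_shopchain(self, response):
        js = json.loads(response.text)
        for shop in js['data']['shop']:
            meta = dict(response.meta)  # copy so each request keeps its own values
            meta['shopCityName'] = js['data']['city'][shop['city_id']]['city_name']
            yield scrapy.Request(shop['link'], self.parse_nuomi_shop, meta=meta)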
Example #6
File: nuomi.py Project: super-sponge/foods
    def start_requests(self):
        self.start_urls += loadUrl(self.startUrlsFile)
        for url in self.start_urls:
            yield Request(url, callback=self.parse_nuomi_deal)
            yield self.make_requests_from_url(url)
Example #7
    def start_requests(self):

        for url in loadUrl(self.startUrlsFile):
            yield self.make_requests_from_url(url)
        for url in self.start_urls:
            yield self.make_requests_from_url(url)
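
make_requests_from_url was deprecated in Scrapy 1.4 and has been removed from recent releases, so the pattern above fails on current Scrapy. An equivalent start_requests that does not rely on it (a sketch; the old helper simply returned Request(url, dont_filter=True)):

    def start_requests(self):
        # Build the Requests directly, keeping dont_filter=True as
        # make_requests_from_url did.
        for url in loadUrl(self.startUrlsFile):
            yield Request(url, dont_filter=True)
        for url in self.start_urls:
            yield Request(url, dont_filter=True)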
Example #9
class ShopindexSpider(CrawlSpider):
    name = 'shopindex'
    allowed_domains = ['nuomi.com', 'dianping.com', 'cq.meituan.com']
    start_urls = [
        # 'https://www.dianping.com/shop/24098260'
        # 'http://cq.meituan.com/shop/82458075'
        # ,'http://www.nuomi.com/deal/d3ccslof.html'
        # ,'https://www.dianping.com/shop/32463358'
    ]

    # settings = get_project_settings()

    # downloaded_url.txt holds already-fetched URLs; start_url.txt holds the seeds.
    downLoadUrlsFile = '../hlwdata/data/downloaded_url.txt'
    startUrlsFile = '../hlwdata/data/start_url.txt'
    lst = loadUrl(downLoadUrlsFile)

    rules = (
        Rule(FilterLinkExtractor(allow=r'http://www.nuomi.com/deal/[\w]+',
                                 download=lst),
             callback='parse_nuomi',
             follow=True),
        Rule(FilterLinkExtractor(allow=r'https://www.dianping.com/shop/[\d]+$',
                                 download=lst),
             callback='parse_dianping',
             follow=True),
        Rule(FilterLinkExtractor(
            allow=r'http://cq.meituan.com/shop/[\d]+\.*[\w]*$', download=lst),
             callback='parse_meituan',
             process_links='link_filtering',
             follow=True),
    )

    def link_filtering(self, links):
        # Strip a trailing '.html' suffix; rstrip('.html') would remove any
        # trailing '.', 'h', 't', 'm', 'l' characters, not just the suffix.
        for link in links:
            if link.url.endswith('.html'):
                link.url = link.url[:-len('.html')]
        return links

    visitedShop = set()

    def start_requests(self):

        for url in loadUrl(self.startUrlsFile):
            yield self.make_requests_from_url(url)
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_nuomi(self, response):

        # Only crawl food-category pages
        prdType = response.xpath(
            '//div[@class="w-bread-crumb"]//a[@href="/326"]/text()').extract()
        prdType = "".join(prdType).strip('\n')
        if prdType != u'美食':
            return

        items = []
        sel = response.xpath('//div[@class="p-item-info"]')
        dealId = sel.xpath('@mon').extract_first().split('=')[1]
        shopUrl = 'http://www.nuomi.com/pcindex/main/shopchain?dealId=' + dealId

        html = requests.get(shopUrl, headers=headers)

        js = json.loads(html.text)

        # shopCity = js['data']['city']['900010000']['city_name']

        for shop in js['data']['shop']:

            shopId = shop['merchant_id']
            shopCity = shop['city_id']
            # Only fetch food info for Chongqing
            # if shopId in self.visitedShop or shopCity != u'900010000':
            if shopId in self.visitedShop:
                continue
            else:
                self.visitedShop.add(shopId)
            shopName = shop['name']
            shopCity = js['data']['city'][shopCity]['city_name']
            shopAddr = shop['address']
            shopPhone = shop['phone']
            shopGlat = shop['baidu_latitude']
            shopGlng = shop['baidu_longitude']
            shopUrl = shop['link']
            shopPicSave = ''
            shopScrapWeb = 'nuomi'

            item = ShopIndexItem()
            item['shopId'] = shopId
            item['shopCity'] = shopCity
            item['shopName'] = shopName
            item['shopAddr'] = shopAddr
            item['shopPhone'] = shopPhone
            item['shopGlat'] = shopGlat
            item['shopGlng'] = shopGlng
            item['shopUrl'] = shopUrl
            item['shopPicSave'] = shopPicSave
            item['shopScrapWeb'] = shopScrapWeb

            items.append(item)
        return items

    def parse_dianping(self, response):
        sel = response.xpath('//div[@id="basic-info"]')

        # Only crawl food pages: the #basic-info block above is present only
        # on food listings.
        if not sel:
            print('not meishi ' + response.url)
            return

        shopId = re.search(r'/shop/([\d]+)$', response.url).group(1)

        if shopId in self.visitedShop:
            return
        else:
            self.visitedShop.add(shopId)

        shopCity = response.xpath(
            '//*[@id="page-header"]//a[@class="city J-city"]/text()'
        ).extract_first()
        shopName = sel.xpath('h1[@class="shop-name"]/text()').extract_first()
        shopAddr = sel.xpath(
            './/span[@itemprop="street-address"]/text()').extract_first()
        shopPhone = sel.xpath(
            './/span[@itemprop="tel"]/text()').extract_first()

        # shopDataUrl = 'http://www.dianping.com/ajax/json/shop/wizard/BasicHideInfoAjaxFP?shopId=%s'%shopId
        # htmlshop = requests.get(shopDataUrl, headers= headers)
        # try:
        #     shopJson = json.loads(htmlshop.text)
        #     shopInfo = shopJson['msg']['shopInfo']
        #     shopGlat = str(shopInfo['glat'])
        #     shopGlng = str(shopInfo['glng'])
        #
        # except (ValueError, KeyError, TypeError):
        #     print "JSON format error"
        shopInfo = ''
        lng = re.search(r'lng:([\d]+\.[\d]+)', response.body)
        lat = re.search(r'lat:([\d]+\.[\d]+)', response.body)
        shopGlat = ''
        shopGlng = ''
        if lng and lat:
            shopGlng = lng.group(1)
            shopGlat = lat.group(1)

        shopUrl = response.url
        shopPicSave = ''
        shopScrapWeb = 'dianping'

        item = ShopIndexItem()
        item['shopId'] = shopId
        item['shopCity'] = shopCity
        item['shopName'] = shopName.strip()
        item['shopAddr'] = shopAddr.strip()
        item['shopPhone'] = shopPhone
        item['shopGlat'] = shopGlat
        item['shopGlng'] = shopGlng
        item['shopUrl'] = shopUrl
        item['shopPicSave'] = shopPicSave
        item['shopScrapWeb'] = shopScrapWeb

        yield item

    def parse_meituan(self, response):
        sel = response.xpath('//div[@class="fs-section__left"]')

        # if not response.xpath('//div[@id="meishi-menu"]/h2[@class="content-title"]'):
        #     print 'not meishi ' + response.url
        #     return

        shopId = re.search(r'/shop/([\d]+)$', response.url).group(1)
        if shopId in self.visitedShop:
            return
        else:
            self.visitedShop.add(shopId)

        shopName = sel.xpath(
            './/h2/span[@class="title"]/text()').extract_first()
        shopAddr = sel.xpath('.//p/span[@class="geo"]/text()').extract_first()

        shopJson = json.loads(
            sel.xpath(
                './/p/span[@id="map-canvas"]/@data-params').extract_first())
        shopInfo = shopJson['shops'][shopId]
        shopPhone = shopInfo['phone']
        shopGlat = str(shopInfo['position'][0])
        shopGlng = str(shopInfo['position'][1])

        shopUrl = response.url
        shopPicSave = ''
        shopScrapWeb = 'meituan'

        item = ShopIndexItem()
        item['shopId'] = shopId
        item['shopCity'] = ''
        item['shopName'] = shopName.strip()
        item['shopAddr'] = shopAddr.strip()
        item['shopPhone'] = shopPhone
        item['shopGlat'] = shopGlat
        item['shopGlng'] = shopGlng
        item['shopUrl'] = shopUrl
        item['shopPicSave'] = shopPicSave
        item['shopScrapWeb'] = shopScrapWeb

        yield item
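
ShopIndexItem likewise comes from the project's items module. A sketch of an assumed definition, with one Field per key the spider's callbacks set:

import scrapy


class ShopIndexItem(scrapy.Item):
    # Assumed definition: one field per key assigned in ShopindexSpider.
    shopId = scrapy.Field()
    shopCity = scrapy.Field()
    shopName = scrapy.Field()
    shopAddr = scrapy.Field()
    shopPhone = scrapy.Field()
    shopGlat = scrapy.Field()
    shopGlng = scrapy.Field()
    shopUrl = scrapy.Field()
    shopPicSave = scrapy.Field()
    shopScrapWeb = scrapy.Field()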