示例#1
0
class RestaurantSpider(CrawlSpider):
    """Crawl domiciliosbogota.com and emit one ``Restaurant`` per restaurant page.

    The home page is handled by ``parseMain``, which schedules a request for
    every link matching the ``domicilios-`` URL pattern; each of those pages
    is turned into a ``Restaurant`` by ``parseRestaurants``.
    """

    name = "RestaurantSpider"
    allowed_domains = ["domiciliosbogota.com"]
    start_urls = ('http://www.domiciliosbogota.com/', )
    # Not referenced by any method of this class.
    productLinkGetter = ProductLinkGetter()
    rules = [
        # Only links pointing at the site root are followed by the rule
        # itself; their responses are handed to parseMain.
        Rule(
            LinkExtractor(allow=(r"http://www\.domiciliosbogota\.com/$")),
            callback='parseMain',
        ),
    ]

    def parseMain(self, response):
        """Yield a ``Request`` for every restaurant link found in *response*.

        Also stores a ``RestaurantIDsGetter`` built from *response* on the
        spider; ``parseRestaurants`` reads it, so this callback has to run
        before any restaurant page is parsed.
        """
        self.restaurantIDsGetter = RestaurantIDsGetter(response)
        restaurantLinks = LinkExtractor(
            allow=(r"http\:\/\/www\.domiciliosbogota\.com\/domicilios\-.*")
        ).extract_links(response)
        for restaurantLink in restaurantLinks:
            yield Request(restaurantLink.url, callback=self.parseRestaurants)

    def parseRestaurants(self, response):
        """Build and return a ``Restaurant`` from a restaurant page response.

        The ``id`` field is looked up through ``self.restaurantIDsGetter``
        using the last path segment of the URL prefixed with ``/``; every
        other field except ``url`` comes from a ``RestaurantSelector``
        wrapping *response*.
        """
        selector = RestaurantSelector(response)
        # Last "/"-separated piece of the URL, e.g. ".../foo" -> "foo".
        lastSegment = response.url.rsplit("/", 1)[-1]

        item = Restaurant()
        item["url"] = response.url
        item["name"] = selector.getName()
        item["id"] = self.restaurantIDsGetter.getID("/" + lastSegment)

        # Remaining fields map one-to-one onto selector getters; they are
        # filled in the order listed here.
        fieldGetters = (
            ("deliveryTimeInMinutes", selector.getDeliveryTimeInMinutes),
            ("minOrderPrice", selector.getMinOrderPrice),
            ("deliveryCost", selector.getDeliveryCost),
            ("payMethods", selector.getPayMethods),
            ("menu", selector.getMenuCategories),
            ("tagCategories", selector.getTagCategories),
            ("averagePunctuation", selector.getAveragePunctuation),
            ("quantityOfComments", selector.getQuantityOfComments),
        )
        for fieldName, getter in fieldGetters:
            item[fieldName] = getter()
        return item
示例#2
0
class ProductSpider(CrawlSpider):
    """Crawl domiciliosbogota.com and emit a ``Product`` per product response.

    Two rules are used: one that simply follows every link, and one that
    derives links from the ``id`` attribute of ``<li>`` tags (rewritten by
    ``productLinkGetter.getLink``) and sends the responses to
    ``parseProduct``.
    """

    name = 'Product'
    allowed_domains = ['domiciliosbogota.com']
    start_urls = ['http://www.domiciliosbogota.com/']
    productLinkGetter = ProductLinkGetter()
    rules = (
        # Follow all links without a callback so the whole site is walked.
        Rule(LinkExtractor(allow=()), follow=True),
        # Treat <li id="..."> values as link candidates; each raw value is
        # passed through getLink before being used as a URL.
        # NOTE(review): getLink presumably returns None for values that are
        # not product ids, which makes the extractor drop them - confirm.
        Rule(
            LinkExtractor(
                allow=(),
                tags="li",
                attrs=("id",),
                canonicalize=False,
                process_value=productLinkGetter.getLink,
            ),
            callback='parseProduct',
            follow=True,
        ),
    )

    def parseProduct(self, response):
        """Return a ``Product`` whose ``product`` field is the decoded JSON body.

        Raises whatever ``json.loads`` raises when the body is not valid JSON.
        """
        item = Product()
        decodedBody = json.loads(response.body)
        item["product"] = decodedBody
        return item
 def testReturnNoneWhenLinkIsNotNumeric(self):
     # A URL whose last segment is "cat-12345" (not purely digits) is
     # expected to make getLink return None.
     nonNumericUrl = "http://www.domiciliosbogota.com/cat-12345"
     actualLink = ProductLinkGetter().getLink(nonNumericUrl)
     self.assertEqual(None, actualLink)
 def testGetUUrlFromProductID(self):
     # A URL ending in the numeric id 12345 is expected to be rewritten to
     # the "establecimientos/producto/<id>" URL for that id.
     numericUrl = "http://www.domiciliosbogota.com/12345"
     actualLink = ProductLinkGetter().getLink(numericUrl)
     self.assertEqual(
         "http://www.domiciliosbogota.com/establecimientos/producto/12345",
         actualLink)
 def testReturnNoneWhenLinkIsNotNumeric(self):
     # getLink is expected to return None when the last URL segment
     # ("cat-12345" here) is not purely numeric.
     categoryUrl = "http://www.domiciliosbogota.com/cat-12345"
     linkGetter = ProductLinkGetter()
     resolvedLink = linkGetter.getLink(categoryUrl)
     self.assertEqual(None, resolvedLink)
 def testGetUUrlFromProductID(self):
     # getLink is expected to map a URL ending in a numeric product id onto
     # the corresponding "establecimientos/producto/<id>" URL.
     productUrl = "http://www.domiciliosbogota.com/12345"
     wantedLink = (
         "http://www.domiciliosbogota.com/establecimientos/producto/12345")
     linkGetter = ProductLinkGetter()
     self.assertEqual(wantedLink, linkGetter.getLink(productUrl))