Пример #1
0
    def process_detailpage(self):
        """Read a product price from an image response and store it.

        The response body is opened as an image, written to
        ``self.tmpfile_dir`` under a name derived from the response URL, and
        the saved file is handed to ``gocr`` to obtain the price. The product
        URL, name and category travel in ``self.response.meta``.

        Returns:
            Always 0.
        """
        response = self.response
        prod_url = response.meta["prod_url"]
        prod_name = response.meta["prod_name"]
        cat = response.meta["cat"]

        # The file name is built from the URL-derived uid plus the image's
        # own format (e.g. "png"), lower-cased, as the extension.
        uid = get_uid(response.url)
        image = Image.open(StringIO(response.body))
        extension = image.format.lower()
        image_file = "%s/%s.%s" % (self.tmpfile_dir, uid, extension)
        image.save(image_file)

        # gocr works on the saved file path, not on the in-memory image.
        price = gocr(image_file)
        log.msg("save image:%s, url:%s, price:%s" % (image_file, response.url, price))

        self.save(prod_url, prod_name, cat, price)
        return 0
Пример #2
0
 def __init__(
         self,
         cur_idepth, max_idepth,
         cur_xdepth, max_xdepth,
         content_group, pl_group,
         source, url):
     """Store the crawl parameters and derive identifiers from ``url``.

     All arguments are kept verbatim as attributes of the same name. In
     addition ``uid`` (via ``get_uid``), ``domain`` (via ``get_domain``)
     and ``host`` (the hostname component of the parsed URL) are derived
     from ``url``.
     """
     # Depth counters: current value and limit for the "i" and "x" depths.
     self.cur_idepth, self.max_idepth = cur_idepth, max_idepth
     self.cur_xdepth, self.max_xdepth = cur_xdepth, max_xdepth

     self.content_group, self.pl_group = content_group, pl_group
     self.source, self.url = source, url

     # Values derived from the URL.
     self.uid = get_uid(url)
     self.domain = get_domain(url)
     parsed = urlparse(self.url)
     self.host = parsed.hostname
Пример #3
0
 def get(self, url, collection_name=None):
     """Load the stored record for ``url`` as a ``GoodsItem``.

     The record is looked up through ``self.dbclient.find_one`` by the uid
     derived from ``url``.

     Returns:
         A populated ``GoodsItem``, or ``None`` when no record was found.
     """
     record = self.dbclient.find_one(get_uid(url), collection_name)
     if not record:
         return None

     item = GoodsItem()
     # The database id is exposed as a string under ``oid``.
     item.oid = str(record["_id"])
     # Every other field is copied over under its own name.
     for field in ("url", "uid", "name", "cat", "data",
                   "bottom_price", "domain"):
         setattr(item, field, record[field])
     return item
Пример #4
0
    def process(self, item):
        """Mail a promotion notice when the latest price dropped far enough.

        The newest price in ``item.data`` is divided by the one recorded just
        before it. When that ratio is at most ``self.accept_discount`` the
        recipients returned by ``self.rule.get`` are mailed.

        Args:
            item: A ``GoodsItem``; anything else is logged and ignored. The
                first element of each ``item.data`` entry is read as the
                price.

        Returns:
            None. The method returns early, without mailing, when the item
            has the wrong type, has fewer than two price samples, has a
            previous price of 0, or the price ratio exceeds
            ``self.accept_discount``.
        """
        if not isinstance(item, GoodsItem):
            log.msg('expect a GoodsItem instance, got %s' % type(item))
            return

        # A ratio needs two samples: the latest and the one before it.
        if len(item.data) < 2:
            return

        price = item.data[-1][0]
        previous_price = item.data[-2][0]

        # A previous price of 0 would raise ZeroDivisionError below; no
        # meaningful discount can be computed, so skip the notification.
        if previous_price == 0:
            log.msg('previous price is 0, cannot compute discount for %s' % item.url)
            return

        # float() forces true division even when both prices are integers.
        discount = float(price) / previous_price
        if discount > self.accept_discount:
            return

        recipients = self.rule.get(get_uid(item.url), price, discount)
        # NOTE(review): "$title" is passed through literally here; presumably
        # the mail layer substitutes it — confirm against self.mail.send.
        subject = 'Big Promotion[$title]'
        content = "$title is now ¥%s, discount %s, %s" % (price, discount, item.url)
        self.mail.send(recipients, subject, content)
Пример #5
0
    def put(self, url, name, cat, price, collection_name=None):
        """Record ``price`` for ``url``, creating the record if needed.

        If a record already exists (``self.get`` returns something truthy),
        the price is added through ``add_price`` and, when that reports a
        change, the ``data`` and ``bottom_price`` fields are written back.
        Otherwise a new record is inserted with this price as both the only
        sample and the bottom price. ``signals.item_saved`` is sent after an
        update or an insert, but not when the price was a duplicate.
        """
        uid = get_uid(url)
        domain = get_domain(url)
        crawl_time = int(time.time())

        item = self.get(url, collection_name)
        if not item:
            # First sighting of this URL: build the record from scratch.
            item = {
                "url": url, "uid": uid, "name": name, "cat": cat,
                "data": [(price, crawl_time)],
                "bottom_price": (price, crawl_time),
                "domain": domain,
            }
            self.dbclient.insert(item, uid, collection_name)
        elif item.add_price(price, crawl_time):
            self.dbclient.update_field(
                uid, collection_name,
                data=item.data, bottom_price=item.bottom_price)
        else:
            # add_price reported nothing new, so nothing is stored or signalled.
            log.msg("duplicate price")
            return

        # NOTE(review): the payload is a plain dict after an insert but the
        # object returned by self.get after an update — confirm receivers
        # handle both.
        send_catch_log(signal=signals.item_saved, item=item)