Пример #1
0
 def validate_page(self, url):
     """Request `url` via a `connect(Requester)` client, retrying once.

     The request carries self.request_cookies. If the first attempt
     raises ro.Exception the identical request is issued a second
     time; an error from that second attempt propagates to the caller.

     NOTE(review): the response is bound to `r`, but nothing visible
     here inspects or returns it -- the listing appears to stop early;
     confirm against the full source.
     """
     print 'validating page: %s' % url
     try:
         with connect(Requester) as c:
             r = c.urlopen(ro.Request(url,
                                      cookies=self.request_cookies))
     except ro.Exception, ex:
         # first attempt failed: report it and try exactly once more
         print 'oException validating, retrying: %s %s' % (url,ex.msg)
         with connect(Requester) as c:
             r = c.urlopen(ro.Request(url,
                                      cookies=self.request_cookies))
Пример #2
0
    def GET(self,user_id_string):
        """
        return back the info for the next set of images

        Reads `last_viewed_id` from the redis hash
        '<NS>:user_details:<user_id_string>' (0 when the field is unset)
        and calls `get_images_since` with that id and limit=10.

        user_id_string -- id string of the requesting user

        NOTE(review): the fetched `images` are not used in the visible
        part of this handler -- the listing appears to stop early.
        """

        # make sure we have a user string
        if not user_id_string:
            log.warning('ImageDetails GET [%s]: no user id string' %
                        user_id_string)
            # NOTE(review): the result of web.badrequest() is neither
            # returned nor raised, so execution continues below even
            # without a user id -- confirm this is intended
            web.badrequest()

        # find user's last viewed
        key = '%s:user_details:%s' % (NS, user_id_string)
        last_viewed_id = rc.hget(key, 'last_viewed_id')
        if last_viewed_id:
            # we get back a string
            # (int() raises ValueError if the stored value is not numeric)
            last_viewed_id = int(last_viewed_id)

        # if there is no last viewed, it's 0
        else:
            last_viewed_id = 0

        # find the data on the next set of images
        try:
            with connect(Images) as c:
                images = c.get_images_since(image_id=last_viewed_id,
                                            timestamp=None,
                                            limit=10,
                                            offset=0)
        except io.Exception, ex:
            # log.exception records the active traceback along with
            # the message
            log.exception('ImageDetails GET [%s] [%s]: getting images' %
                          (user_id_string,last_viewed_id))
            # NOTE(review): as with badrequest above, the result of
            # web.internalerror() is not returned or raised here
            web.internalerror()
Пример #3
0
 def populate_image_stats(self, image):
     """Fill in size and content hash on `image`.

     Returns the image untouched when it carries no data. Otherwise
     sets `size` to len() of the data and `shahash` to the value the
     Blobby client's `get_data_bhash` returns for that data.

     Raises o.Exception (with a new message) when the hash call
     raises o.Exception.

     NOTE(review): no return statement is visible on the success
     path; the listing may be cut short -- confirm against the full
     source.
     """
     ti = image
     image_data = ti.data
     # nothing to measure without data
     if not ti.data:
         return ti
     ti.size = len(image_data)
     try:
         with connect(Blobby) as c:
             ti.shahash = c.get_data_bhash(image_data)
     except o.Exception, ex:
         # re-raise with context; only ex.msg of the original is kept
         raise o.Exception('oException getting shahash: %s' % ex.msg)
Пример #4
0
 def download_image_data(self, url, cookies={}):
     """Request `url` over a Requester connection, retrying once.

     url -- address of the image to download
     cookies -- not read by the visible code; the request is sent
                with self.request_cookies

     Returns None when both attempts fail.
     NOTE(review): what is done with the response after a successful
     fetch is not visible in this listing.
     """
     # we want to download the image
     with connect(Requester) as c:
         try:
             img_r = c.urlopen(ro.Request(url,
                                          cookies=self.request_cookies))
         except Exception as ex:
             # fail, try again ?
             print('exception getting img: %s' % ex)
             try:
                 # retry the same url; this used to reference an
                 # undefined `img_url`, so the retry always raised
                 # NameError and was reported as 'refailed'
                 img_r = c.urlopen(ro.Request(url,
                                              cookies=self.request_cookies))
             except Exception:
                 print('refailed')
                 return None
Пример #5
0
        # Walk every page url, validate it, then collect the image urls
        # found on it. `added` and `sync` are bound outside this excerpt.
        for page_url in self.generate_page_urls():

            # make sure it's a valid page
            try:
                if not self.validate_page(page_url):
                    # we've hit an invalid page, done
                    return added
            except ro.Exception, ex:
                print 'oException validating: %s %s' % (page_url,ex.msg)
                # NOTE(review): this returns the result of a second
                # validate_page call, whereas the normal exit above
                # returns `added` -- confirm which one callers expect
                return self.validate_page(page_url)
            except Exception, ex:
                print 'Exception validating: %s %s' % (page_url,ex)
                # NOTE(review): same return-value mismatch as above
                return self.validate_page(page_url)

            # get all the pics on the page
            with connect(Scraper) as c:
                print 'getting page images'
                try:
                    # TODO: be able to re-use cookies
                    img_urls = c.get_images(page_url)
                except so.Exception, ex:
                    print 'oException getting images: %s %s' % (page_url,ex.msg)
                    # errors are only swallowed when `sync` is truthy
                    if not sync:
                        raise ex
                except Exception, ex:
                    print 'Exception getting images: %s %s' % (page_url,ex)
                    if not sync:
                        raise ex

                # NOTE(review): when an error was swallowed above,
                # img_urls was not assigned by this iteration; the line
                # below then raises NameError on the first page or
                # reports the previous page's list
                print 'images: %s' % len(img_urls)
Пример #6
0
 def _set_image_data(self, image):
     if image.data is not None:
         with connect(Blobby) as c:
             image.shahash = c.set_data(image.data)
     return image
Пример #7
0
 def _populate_image_data(self, image):
     if not image.shahash:
         return None
     with connect(Blobby) as c:
         image.data = c.get_data(image.shahash)
     return image
Пример #8
0
class ImagesHandler(object):
    def __init__(self, redis_host='127.0.0.1'):
        """Create the redis client and the revent client for `redis_host`.

        Redis keys this handler works with:

        images:next_id
            incr this for the next image id
        images:datainstances:<shahash>
            ids of all the images holding the given data
        images:ids:timestamps
            sorted (id, timestamp) of when each image was added
        images:page_ids:<page_url>
            all the image ids for the page
        images:pages:timestamps
            sorted page urls, meant to track the last time an image
            was added from each page
        images:<id>
            the image's meta data
        """
        host = redis_host
        self.redis_host = host
        self.rc = Redis(host)
        self.revent = ReventClient(redis_host=host)

    def _image_to_dict(self, image):
        data = {}
        ignored_attrs = ['data']
        for attrs in image.thrift_spec[1:]:
            attr = attrs[2]
            if attr in ignored_attrs:
                continue
            v = getattr(image,attr)
            if v is not None:
                data[attr] = v
        return data

    def _dict_to_image(self, data):
        """Build an o.Image from a dict of field values.

        All values come back from redis as strings, so fields whose
        thrift type code is 4 are converted with float() and those
        with code 8 with int(); every other field is copied as is.
        Fields missing from `data` (or None) are left untouched.
        """
        # thrift type code -> converter
        casts = {4: float, 8: int}
        image = o.Image()
        for spec in image.thrift_spec[1:]:
            name = spec[2]
            raw = data.get(name)
            if raw is None:
                continue
            cast = casts.get(spec[1])
            setattr(image, name, raw if cast is None else cast(raw))
        return image

    def _delete_from_redis(self, image):

        # make these a transaction
        pipe = self.rc.pipeline()

        # remove it from the id set
        pipe.zrem('images:ids:timestamps',image.id)

        # remove it's hash
        pipe.delete('images:%s' % image.id)

        # decriment the count for it's image data
        pipe.srem('images:datainstances:%s' % image.shahash,
                     image.id)

        # remove image from the page's id set
        if image.source_page_url:
            pipe.zrem('images:page_ids:%s' % image.source_page_url,
                      image.id)

        # make it happen
        pipe.execute()

        return True

    def _save_to_redis(self, image):

        # make these a transaction
        pipe = self.rc.pipeline()

        # if our image doesn't have an id, set it up w/ one
        if not image.id:
            print 'got new image: %s' % image.shahash
            image.id = self.rc.incr('images:next_id')
            pipe.sadd('images:datainstances:%s' % image.shahash,
                         image.id)

        # check and see if we used to have a different shahash
        old_shahash = self.rc.hget('images:%s' % image.id,'shahash')
        if old_shahash != image.shahash:
            # remove our id from the old shahash tracker
            pipe.srem('images:datainstances:%s' % old_shahash,
                         image.id)
            # add it to the new tracker
            pipe.sadd('images:datainstances:%s' % image.shahash,
                         image.id)


        # update / set our timestamp
        da = 0.0
        if image.downloaded_at:
            da = image.downloaded_at
        else:
            da = time.time()
        pipe.zadd('images:ids:timestamps',image.id, da)

        # add this image to the page's id set
        if image.source_page_url:
            pipe.zadd('images:page_ids:%s' % image.source_page_url,
                      image.id, da)

            # update our last scrape time for the page
            pipe.zadd('images:pages:timestamps',
                      image.source_page_url, image.id)

        # take our image and make a dict
        image_data = self._image_to_dict(image)

        # set our data to redis
        key = 'images:%s' % image.id
        pipe.hmset(key,image_data)

        # execute our pipe
        pipe.execute()

        return image

    def _get_from_redis(self, image_id):
        # if the image id is in the id set than pull it's details
        if self.rc.zrank('images:ids:timestamps',image_id) is not None:
            # get the image data from redis
            key = 'images:%s' % image_id
            image_data = self.rc.hgetall(key)
            if not image_data:
                print 'redis had no image data'
                return None
            image = self._dict_to_image(image_data)
            return image

        return None

    def _populate_image_data(self, image):
        if not image.shahash:
            return None
        with connect(Blobby) as c:
            image.data = c.get_data(image.shahash)
        return image

    def _set_image_data(self, image):
        if image.data is not None:
            with connect(Blobby) as c:
                image.shahash = c.set_data(image.data)
        return image

    def get_image(self, image_id):
        """ returns Image for given id or blank Image """

        # see if we have an image
        image = self._get_from_redis(image_id)

        if not image:
            raise o.ImageNotFound('Could not get image', image_id)

        # pull the actual image data
        self._populate_image_data(image)

        return image

    def add_image(self, image):
        """ like set but if we already have this image from this
            page we're not going to add it again. will also
            fill out image stats (size, dimension) """

        # we're only for new images, no i'ds allowed
        # if u want to set an id by hand use set_image
        if image.id:
            raise o.Exception('Can not add image with id')

        if not image.data:
            raise o.Exception('Image must have data')

        if not image.source_page_url:
            raise o.Exception('Image must have source page url')

        # update it's stats
        image = self.populate_image_stats(image)

        # only add the image if we haven't seen it beforeQ
        # if we've seen it before there will be an id which
        # the set of images w/ this data and from this page share
        ids = self.rc.sinter('images:datainstance:%s' % image.shahash,
                             'images:page_ids:%s' % image.source_page_url)


        # we don't need to continue
        # we'll return back their original msg, w/o the id set
        if ids:
            print 'image already exists [%s], not setting' % ids
            return image

        # so the image appears to be new, good for it
        return self.set_image(image)

    def set_image(self, image):
        """ sets image data, returns image """

        # would be better if we only saved if it didn't exist
        if image.data:
            # save the images data
            self._set_image_data(image)

        # could be an update, could be new
        image = self._save_to_redis(image)

        # let the world know we have added a new image
        self.revent.fire('image_added',{
            'source_page_url': image.source_page_url,
            'source_url': image.source_url,
            'shahash': image.shahash,
            'vhash': image.vhash,
            'xdim': image.xdim,
            'ydim': image.ydim,
        })

        return image

    def delete_image(self, image_id):
        """ removes an image """

        # get it's image obj
        try:
            image = self.get_image(image_id)
        except o.ImageNotFound, ex:
            return False

        # delete the redis data
        self._delete_from_redis(image)

        # see if we need to remove the image data
        if self.rc.scard('images:datainstances:%s' % image.shahash) == 0:
            # no more images w/ the same data, remove image data
            with connect(Blobby) as c:
                c.delete_data(image.shahash)

        # it's gone, let'm know
        self.revent.fire('image_deleted',{
            'source_page_url': image.source_page_url,
            'source_url': image.source_url,
            'shahash': image.shahash,
            'vhash': image.vhash,
            'xdim': image.xdim,
            'ydim': image.ydim,
        })

        # and we're done!
        return True