Exemplo n.º 1
0
 def __init__(self, session):
     self.session = session
     self.client = None
     self.limit = 20
     self.model = TumblrBlog
     self.posts = TumblrPostManager(self.session)
     self.properties = PropertyManager(self.session)
Exemplo n.º 2
0
class TumblrBlogManager(object):
    def __init__(self, session):
        self.session = session
        self.client = None
        self.limit = 20
        self.model = TumblrBlog
        self.posts = TumblrPostManager(self.session)
        self.properties = PropertyManager(self.session)
        
    def set_client(self, client):
        self.client = client
        self.posts.set_client(client)
        self.client_info = client.info()

    def compile_query(self, query):
        return compile_query(query)

    def _query(self):
        return self.session.query(self.model)
    
    def query(self):
        return self.session.query(self.model)
    
    def get(self, id):
        return self.session.query(self.model).get(id)
    
    def _range_filter(self, query, start, end):
        query = query.filter(self.model.updated_remote >= start)
        query = query.filter(self.model.updated_remote <= end)
        return query
    
    def get_ranged_blogs(self, start, end, timestamps=False):
        if timestamps:
            start, end = convert_range_to_datetime(start, end)
        q = self.session.query(self.model)
        q = self._range_filter(q, start, end)
        return q.all()
    
    
    

    # we are certain name is unique
    def get_by_name(self, name):
        q = self._query().filter_by(name=name)
        rows = q.all()
        if not len(rows):
            return None
        return rows.pop()

    def get_blog_posts_query(self, name, blog=None, type=None):
        if blog is None:
            blog = self.get_by_name(name)
        q = self.session.query(TumblrPost).join(TumblrBlogPost)
        q = q.filter(TumblrBlogPost.blog_id==blog.id)
        if type is not None:
            q = q.filter_by(type=type)
        q = q.order_by(TumblrPost.id)
        return q
            
    def get_blog_posts(self, name, offset=0, limit=20, blog=None, type=None):
        q = self.get_blog_posts_query(name, blog=blog, type=type)
        q = q.offset(offset).limit(limit)
        return q.all()

    def get_post_photos(self, post_id, thumbs=False):
        return self.posts.get_post_photos(post_id, thumbs=thumbs)

    def get_post_photos_and_paths(self, post_id, thumbs=False):
        photos = self.get_post_photos(post_id, thumbs=thumbs)
        repos = self.posts.photos.repos
        for photo in photos:
            basename = os.path.basename(photo.url)
            photo.filename = repos.filename(basename)
        return photos
    
    
    
    
    def get_all_ids_query(self):
        return self.session.query(self.model.id).order_by(self.model.id)
    
    def get_all_ids(self):
        q = self.get_all_ids_query()
        return q.all()

    def get_by_property_query(self, propname):
        prop = self.properties.get_by_name(propname)
        

    def add_blog_object(self, blog):
        with transaction.manager:
            b = self.model()
            for key in BLOGKEYS:
                setattr(b, key, blog[key])
            b.updated_remote = datetime.fromtimestamp(blog['updated'])
            b.updated_local = datetime.now()
            self.session.add(b)
        return self.session.merge(b)

    def update_blog_object(self, blog_id, blogdata):
        object_updated = False
        with transaction.manager:
            b = self._query().get(blog_id)
            if b is None:
                raise RuntimeError, "No object to update"
            for key in BLOGKEYS:
                value = blogdata[key]
                dbvalue = getattr(b, key)
                if value != dbvalue:
                    msg = "%s.%s has changed from %s ------> %s"
                    print msg % (b.name, key, dbvalue, value)
                    setattr(b, key, value)
                    object_updated = True
            updated_remote = datetime.fromtimestamp(blogdata['updated'])
            if b.updated_remote != updated_remote:
                b.updated_remote = updated_remote
                object_updated = True
            b.updated_local = datetime.now()    
            self.session.add(b)
        if object_updated:
            return self.session.merge(b)

    def _get_blog_info(self, blog_name):
        info = self.client.blog_info(blog_name)
        if 'blog' not in info:
            if info['meta']['status'] == 404:
                print "%s not found" % blog_name
                return None
        return info
    
    def add_blog(self, blog_name):
        info = self._get_blog_info(blog_name)
        return self.add_blog_object(info['blog'])

    def _update_blog_info(self, blog_id):
        b = self._query().get(blog_id)
        info = self._get_blog_info(b.name)['blog']
        return self.update_blog_object(b.id, info)
        
    def update_blog_info(self, blog_id):
        b = self._query().get(blog_id)
        info = self._get_blog_info(b.name)['blog']
        newb = self.update_blog_object(b.id, info)
        if newb is None:
            print "%s not updated." % b.name
        self.update_posts_for_blog('ignored', blog_id)
        stmt = ~exists().where(TumblrBlogPost.post_id==TumblrPost.id)
        last_post_query = self.session.query(func.max(TumblrBlogPost.post_id))
        last_post_id = last_post_query.one()[0]
        self.posts.get_all_posts(b.name, blog_id=b.id)
            
    def update_all_blog_info(self):
        for b in self._query():
            self.update_blog_info(b.id)
            
        
    def update_posts_for_blog(self, name, blog_id=None):
        if blog_id is None:
            blog = self.get_by_name(name)
        else:
            blog = self.get(blog_id)
        if blog is None:
            raise RuntimeError, "No blog named %s" % name
        
                        
        q = self.session.query(TumblrPost).filter_by(blog_name=blog.name)
        #blogposts = self.session.query(TumblrBlogPost.post_id)
        #blogposts = blogposts.filter_by(blog_id=blog.id)
        #blogposts = blogposts.subquery('blogposts')
        #q = q.filter(not_(TumblrPost.id.in_(blogposts)))
        stmt = ~exists().where(TumblrBlogPost.post_id==TumblrPost.id)
        q = q.filter(stmt)

        posts = q.all()
        total = len(posts)
        print "total", total
        count = 0
        if not total:
            print "Nothing to update for", blog.name
        for post in posts:
            tbp = self.session.query(TumblrBlogPost).get((blog.id, post.id))
            count += 1
            if tbp is None:
                with transaction.manager:
                    tbp = TumblrBlogPost()
                    tbp.blog_id = blog.id
                    tbp.post_id = post.id
                    self.session.add(tbp)
                    #print "Added %d for %s." % (post.id, blog.name)
            if not count % 100:
                remaining = total - count
                print "%d posts remaining for %s" % (remaining, blog.name)
                
    def get_followed_blogs(self):
        if self.client is None:
            raise RuntimeError, "Need to set client"
        offset = 0
        limit = self.limit
        blogs = self.client.following(offset=offset, limit=limit)
        total_blog_count = blogs['total_blogs']
        current_blogs = blogs['blogs']
        blog_count = len(current_blogs)
        for blog in current_blogs:
            blog_name = blog['name']
            if self.get_by_name(blog_name) is None:
                print "Adding %s" % blog_name
                b = self.add_blog(blog_name)
                if b is not None:
                    self.properties.tag_blog(b.id, 'followed')
        while len(current_blogs):
            offset += limit
            blogs = self.client.following(offset=offset, limit=limit)
            current_blogs = blogs['blogs']
            for blog in current_blogs:
                blog_name = blog['name']
                if self.get_by_name(blog_name) is None:
                    print "Adding %s" % blog_name
                    b = self.add_blog(blog_name)
                    if b is not None:
                        self.properties.tag_blog(b.id, 'followed')
            blog_count += len(current_blogs)
            remaining = total_blog_count - blog_count
            print '%d blogs remaining.' % remaining

    def sample_blogs(self, amount, update_first=False):
        import random
        blogs = self._query().all()
        random.shuffle(blogs)
        for b in blogs:
            if update_first:
                print "updating posts for %s" % b.name
                self.update_posts_for_blog('ignore', blog_id=b.id)
            info = self._get_blog_info(b.name)
            if info is not None:
                info = info['blog']
            else:
                continue
            newb = self.update_blog_object(b.id, info)
            if newb is not None:
                b = newb
                print "Blog %s updated" % b.name
                q = self.session.query(TumblrBlogPost)
                q = q.filter_by(blog_id=b.id)
                print "sampling %d posts from %s" % (amount, b.name)
                self.posts.get_all_posts(b.name, amount, blog_id=b.id)
                self.update_posts_for_blog('ignore', blog_id=b.id)
            else:
                print "Skipping", b.name
                
        
        
    def sample_blog_likes(self, amount):
        import random
        blogs = self._query().all()
        random.shuffle(blogs)
        for b in blogs:
            print "sampling %d likes from %s" % (amount, b.name)
            self.posts.get_blog_likes(b.name, amount)

    def _photoquery(self, post_id, thumbnails=False):
        urlmodel = TumblrPhotoUrl
        postmodel = TumblrPostPhoto
        if thumbnails:
            urlmodel = TumblrThumbnailUrl
            postmodel = TumblrPostThumbnail
        q = self.session.query(urlmodel).join(postmodel)
        return q.filter(postmodel.post_id == post_id)


    def _make_blog_directory(self, blogname, blogpath, thumbnails=False):
        blog = self.get_by_name(blogname)
        if blog is None:
            raise RuntimeError, "%s doesn't exist." % blogname
        if not os.path.isdir(blogpath):
            os.makedirs(blogpath)
        current_blog_files = os.listdir(blogpath)
        _bases = [x.split('.')[0] for x in current_blog_files]
        current_blog_file_ids = [x.split('-')[1] for x in _bases]
        repos = self.posts.photos.repos
        q = self.session.query(TumblrPost).join(TumblrBlogPost)
        q = q.filter(TumblrBlogPost.blog_id == blog.id)
        q = q.filter(not_(TumblrBlogPost.post_id.in_(current_blog_file_ids)))
        q = q.order_by(TumblrPost.id)
        for post in q:
            if post.type != 'photo':
                continue
            photoquery = self._photoquery(post.id, thumbnails=thumbnails)
            for tpu in photoquery:
                url = tpu.url
                basename = os.path.basename(url)
                if repos.file_exists(basename):
                    if len(basename.split('.')) == 2:
                        ext = basename.split('.')[1]
                    else:
                        print "WARNING! BAD GUESS", basename
                        ext = '.jpg'
                    filebase = '%d-%d.%s' % (post.id, tpu.id, ext)
                    filename = os.path.join(blogpath, filebase)
                    #if not os.path.isfile(filename):
                    if filebase not in current_blog_files:
                        print "Linking", filename
                        os.link(repos.filename(basename), filename)


    def make_blog_directory(self, blogname, blogpath):
        self._make_blog_directory(blogname, blogpath, thumbnails=False)
        
                
    def make_thumb_directory(self, blogname, blogpath):
        self._make_blog_directory(blogname, blogpath, thumbnails=True)