Exemplo n.º 1
0
    def _cache(blog, *args, **kwargs):
        """Memoize ``method``'s result on disk, keyed by class, method
        name and slugified arguments.

        Reads a cached pickle when ``blog.use_cache`` is set and the
        cache file exists; otherwise calls ``method`` and writes the
        result back to the cache.
        """
        class_name = blog.__class__.__name__
        # __name__ works on both Python 2 and 3; func_name is Py2-only.
        method_name = method.__name__
        strargs = '%s----%s' % (
            '--'.join(slugify(str(arg)) for arg in args),
            '--'.join('%s=%s' % (
                slugify(key), slugify(str(value)))
                for key, value in kwargs.items()
            ),
        )

        cache_path = os.path.join(
            'cache', class_name, method_name, strargs)

        if blog.use_cache and os.path.exists(cache_path):
            print('        Loading %s.%s(%s, %s) from cache %s' % (
                class_name, method_name, args, kwargs, cache_path
            ))
            # Binary mode and a with-block: pickle data is bytes, and the
            # original leaked the file handle.
            with open(cache_path, 'rb') as f:
                return pickle.load(f)

        result = method(blog, *args, **kwargs)
        dirname = os.path.dirname(cache_path)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        with open(cache_path, 'wb') as f:
            pickle.dump(result, f)
        return result
    def _cache(blog, *args, **kwargs):
        """Memoize ``method``'s result on disk, keyed by class, method
        name and slugified arguments.

        Reads a cached pickle when ``blog.use_cache`` is set and the
        cache file exists; otherwise calls ``method`` and writes the
        result back to the cache.
        """
        class_name = blog.__class__.__name__
        # __name__ works on both Python 2 and 3; func_name is Py2-only.
        method_name = method.__name__
        strargs = '%s----%s' % (
            '--'.join(slugify(str(arg)) for arg in args),
            '--'.join('%s=%s' % (
                slugify(key), slugify(str(value)))
                for key, value in kwargs.items()
            ),
        )

        cache_path = os.path.join(
            'cache', class_name, method_name, strargs)

        if blog.use_cache and os.path.exists(cache_path):
            print('        Loading %s.%s(%s, %s) from cache %s' % (
                class_name, method_name, args, kwargs, cache_path
            ))
            # Binary mode and a with-block: pickle data is bytes, and the
            # original leaked the file handle.
            with open(cache_path, 'rb') as f:
                return pickle.load(f)

        result = method(blog, *args, **kwargs)
        dirname = os.path.dirname(cache_path)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        with open(cache_path, 'wb') as f:
            pickle.dump(result, f)
        return result
Exemplo n.º 3
0
    def from_metaweblog(cls, struct, post_type='post', is_edit=False):
        """Receive metaWeblog RPC struct and initialize a Post.
           Used both by migrate_from_wordpress and when receiving a new or
           edited post from MarsEdit.
        """
        title = struct.get('title', '')

        meta_description = struct.get('mt_excerpt', '')
        excerpt_len = len(meta_description)
        if excerpt_len > 155:
            raise ValueError("Description is %d chars, max 155" % excerpt_len)

        # A present-but-empty mt_keywords yields [], a missing key None.
        tags = None
        if 'mt_keywords' in struct:
            stripped = (t.strip() for t in struct['mt_keywords'].split(','))
            tags = [t for t in stripped if t]

        # Fall back to the title when WordPress supplied no slug.
        slug = slugify.slugify(struct.get('wp_slug') or title)

        description = struct.get('description', '')
        status = (struct.get('post_status')
                  or struct.get('page_status')
                  or 'publish')

        if 'date_modified_gmt' in struct:
            fields = struct['date_modified_gmt'].timetuple()[:6]
            mod = utc_tz.localize(datetime.datetime(*fields))
        else:
            mod = datetime.datetime.utcnow()

        body = markup.markup(description)

        rv = cls(
            title=title,
            # Format for display
            body=body,
            plain=plain.plain(body),
            summary=summarize.summarize(body, 200),
            original=description,
            meta_description=meta_description,
            tags=tags,
            slug=slug,
            type=post_type,
            status=status,
            wordpress_id=struct.get('postid'),
            mod=mod)

        if not is_edit and 'date_created_gmt' in struct:
            # TODO: can fail if two posts created in same second, add random
            #   suffix to ObjectId
            created = datetime.datetime.strptime(
                struct['date_created_gmt'].value, "%Y%m%dT%H:%M:%S")
            rv.id = ObjectId.from_datetime(created)

        return rv
Exemplo n.º 4
0
    def from_metaweblog(
        cls, struct, post_type='post', is_edit=False
    ):
        """Receive metaWeblog RPC struct and initialize a Post.
           Used both by migrate_from_wordpress and when receiving a new or
           edited post from MarsEdit.

           Raises ValueError if the struct carries inline categories;
           MarsEdit is expected to set them via mt_setPostCategories().
        """
        title = struct.get('title', '')

        # We expect MarsEdit to set categories with mt_setPostCategories().
        # An assert would be stripped under "python -O", so validate
        # explicitly; ValueError matches the sibling implementation's style.
        if 'categories' in struct:
            raise ValueError(
                "Categories must be set with mt_setPostCategories()")

        if 'mt_keywords' in struct:
            tags = [
                tag.strip() for tag in struct['mt_keywords'].split(',')
                if tag.strip()
            ]
        else:
            tags = None

        # Prefer the WordPress-provided slug; fall back to the title.
        slug = (
            slugify.slugify(struct['wp_slug'])
            if struct.get('wp_slug')
            else slugify.slugify(title))

        description = struct.get('description', '')
        status = struct.get('post_status', 'publish')
        if 'date_modified_gmt' in struct:
            tup = struct['date_modified_gmt'].timetuple()
            mod = utc_tz.localize(datetime.datetime(*tup[0:6]))
        else:
            mod = datetime.datetime.utcnow()

        body = markup.markup(description)

        rv = cls(
            title=title,
            # Format for display
            body=body,
            summary=summarize.summarize(body, 200),
            original=description,
            tags=tags,
            slug=slug,
            type=post_type,
            status=status,
            wordpress_id=struct.get('postid'),
            mod=mod
        )

        if not is_edit and 'date_created_gmt' in struct:
            # TODO: can fail if two posts created in same second, add random
            #   suffix to ObjectId
            date_created = datetime.datetime.strptime(
                struct['date_created_gmt'].value, "%Y%m%dT%H:%M:%S")
            rv.id = ObjectId.from_datetime(date_created)

        return rv
Exemplo n.º 5
0
    def test_new_post(self):
        """Round-trip a new post through metaWeblog.getPost."""
        start = datetime.datetime.utcnow()
        post_id = self.new_post(
            title='the title',
            description=meta_description,
            body='the body')
        end = datetime.datetime.utcnow()

        post = self.fetch_rpc(
            'metaWeblog.getPost',
            (post_id, tornado_options.user, tornado_options.password))

        expected_url = self.reverse_url_absolute(
            'post', slugify.slugify('the title'))

        self.assertEqual(post_id, post['id'])
        self.assertEqual(expected_url, post['link'])
        self.assertEqual(expected_url, post['permaLink'])
        self.assertEqual('a tag,another tag', post['mt_keywords'])
        self.assertEqual('publish', post['status'])
        self.assertEqual('the title', post['title'])
        self.assertEqual(meta_description, post['mt_excerpt'])
        # 'description' holds the body -- metaWeblog naming is confusing.
        self.assertEqual('the body', post['description'])
        created = post['date_created_gmt']
        self.assertTrue(
            start <= created <= end,
            "Post's date_created_gmt %s isn't between %s and %s" % (
                created, start, end))
def replace_media_links(body, media_library, db, destination_url, source_base_url):
    """Rewrite media links in ``body`` to point at ``destination_url``.

    For each library link found in the post body, ensure the asset is
    stored in ``db.media`` (downloading it if needed, with an on-disk
    pickle cache), then rewrite the link in the body.

    NOTE(review): ``source_base_url`` is accepted for interface
    compatibility but unused here.
    """
    for link in media_library:
        if link not in body:
            continue

        # This is making some big assumptions about the structure
        # of the media URL, that it's like
        # http://emptysquare.net/blog/wp-content/uploads/2011/10/img.png
        url = link.split('/uploads/')[-1]

        media_doc = db.media.find_one({'_id': link})
        if not media_doc:
            # TODO: remove
            cache_path = os.path.join('cache', slugify(link))
            if os.path.exists(cache_path):
                # Binary mode and a with-block: pickle data is bytes, and
                # the original leaked the file handle.
                with open(cache_path, 'rb') as f:
                    content, content_type = pickle.load(f)
            else:
                r = requests.get(link)
                content = r.content
                content_type = r.headers['content-type']
                if not os.path.exists('cache'):
                    os.mkdir('cache')
                with open(cache_path, 'wb') as f:
                    pickle.dump((content, content_type), f)

            db.media.insert({
                'content': bson.Binary(content),
                'length': len(content),
                'type': content_type,
                '_id': url,
                'mod': datetime.datetime.utcnow(),
            })

        body = body.replace(
            link, os.path.join(destination_url, 'media', url))

    return body
Exemplo n.º 7
0
    def test_new_post(self):
        """Create a post and verify every field getPost returns."""
        start = datetime.datetime.utcnow()
        post_id = self.new_post(title='the title',
                                description=meta_description,
                                body='the body')
        end = datetime.datetime.utcnow()

        post = self.fetch_rpc(
            'metaWeblog.getPost',
            (post_id, tornado_options.user, tornado_options.password))

        slug = slugify.slugify('the title')
        url = self.reverse_url_absolute('post', slug)

        self.assertEqual(post_id, post['id'])
        self.assertEqual(url, post['link'])
        self.assertEqual(url, post['permaLink'])
        self.assertEqual('a tag,another tag', post['mt_keywords'])
        self.assertEqual('publish', post['status'])
        self.assertEqual('the title', post['title'])
        self.assertEqual(meta_description, post['mt_excerpt'])
        # 'description' holds the body -- metaWeblog naming is confusing.
        self.assertEqual('the body', post['description'])
        when = post['date_created_gmt']
        self.assertTrue(
            start <= when <= end,
            "Post's date_created_gmt %s isn't between %s and %s" %
            (when, start, end))
Exemplo n.º 8
0
def replace_media_links(body, media_library, db, destination_url,
                        source_base_url):
    """Rewrite media links in ``body`` to point at ``destination_url``.

    For each library link found in the post body, ensure the asset is
    stored in ``db.media`` (downloading it if needed, with an on-disk
    pickle cache), then rewrite the link in the body.

    NOTE(review): ``source_base_url`` is accepted for interface
    compatibility but unused here.
    """
    for link in media_library:
        if link not in body:
            continue

        # This is making some big assumptions about the structure
        # of the media URL, that it's like
        # http://emptysquare.net/blog/wp-content/uploads/2011/10/img.png
        url = link.split('/uploads/')[-1]

        media_doc = db.media.find_one({'_id': link})
        if not media_doc:
            # TODO: remove
            cache_path = os.path.join('cache', slugify(link))
            if os.path.exists(cache_path):
                # Binary mode and a with-block: pickle data is bytes, and
                # the original leaked the file handle.
                with open(cache_path, 'rb') as f:
                    content, content_type = pickle.load(f)
            else:
                r = requests.get(link)
                content = r.content
                content_type = r.headers['content-type']
                if not os.path.exists('cache'):
                    os.mkdir('cache')
                with open(cache_path, 'wb') as f:
                    pickle.dump((content, content_type), f)

            db.media.insert({
                'content': bson.Binary(content),
                'type': content_type,
                '_id': url,
                'mod': datetime.datetime.utcnow(),
            })

        body = body.replace(link,
                            os.path.join(destination_url, 'media', url))

    return body
Exemplo n.º 9
0
 def test_if_modified_since_microseconds(self):
     """If-Modified-Since is rounded down to the second, so a header
     built from the truncated mod time must still yield 304."""
     post_id = self.new_post(title='title')
     doc = self.sync_db.posts.find_one({'_id': ObjectId(post_id)})
     modified = doc['mod']
     url = self.reverse_url('post', slugify.slugify('title'))
     response = self.fetch(
         url, if_modified_since=modified.replace(microsecond=0))
     self.assertEqual(304, response.code)
Exemplo n.º 10
0
 def test_post_page(self):
     """The post page renders with the expected meta description tag."""
     self.new_post()
     slug = slugify.slugify('the title')
     page = self.fetch(self.reverse_url('post', slug))
     self.assertEqual(200, page.code)
     tag = BeautifulSoup(page.body).find(
         'meta', attrs={'name': 'description'})
     self.assertTrue(tag)
     self.assertEqual(self.meta_description, tag['content'])
Exemplo n.º 11
0
 def test_post_page(self):
     # Fetch the new post's page and check its <meta name="description">.
     self.new_post()
     url = self.reverse_url('post', slugify.slugify('the title'))
     response = self.fetch(url)
     self.assertEqual(200, response.code)
     soup = BeautifulSoup(response.body)
     description = soup.find('meta', attrs={'name': 'description'})
     self.assertTrue(description)
     self.assertEqual(self.meta_description, description['content'])
Exemplo n.º 12
0
 def test_if_modified_since_microseconds(self):
     # If-Modified-Since is rounded down to the second, so stripping the
     # microseconds from the stored mod time must still produce a 304.
     post_id = self.new_post(title='title')
     stored = self.sync_db.posts.find_one({'_id': ObjectId(post_id)})
     when = stored['mod'].replace(microsecond=0)
     post_url = self.reverse_url('post', slugify.slugify('title'))
     self.assertEqual(
         304, self.fetch(post_url, if_modified_since=when).code)
Exemplo n.º 13
0
    def test_category_feed(self):
        """A category's feed contains only that category's posts."""
        response = self.fetch(
            self.reverse_url('category-feed', slugify.slugify('category 0')))
        self.assertEqual(200, response.code)
        entries = fromstring(response.body).findall(ns + 'entry')
        self.assertEqual(1, len(entries))

        # Post with 'the title' is in this category, not 'other title'.
        self.assertEqual('the title', entries[0].find(ns + 'title').text)
Exemplo n.º 14
0
    def test_feed(self):
        """The main feed lists both posts, most recent first, with
        absolute post URLs as entry ids."""
        response = self.fetch(self.reverse_url('feed'))
        self.assertEqual(200, response.code)
        entries = fromstring(response.body).findall(ns + 'entry')
        self.assertEqual(2, len(entries))

        # Most recent first.
        for entry, title in zip(entries, ['other title', 'the title']):
            self.assertEqual(title, entry.find(ns + 'title').text)
            self.assertEqual(
                self.reverse_url_absolute('post', slugify.slugify(title)),
                entry.find(ns + 'id').text)
Exemplo n.º 15
0
    def test_category_feed(self):
        # Fetch the feed for "category 0" and check its single entry.
        category_slug = slugify.slugify('category 0')
        feed_response = self.fetch(
            self.reverse_url('category-feed', category_slug))
        self.assertEqual(200, feed_response.code)
        entry_list = list(fromstring(feed_response.body).findall(
            ns + 'entry'))
        self.assertEqual(1, len(entry_list))

        # Post with 'the title' is in this category, not 'other title'.
        self.assertEqual(
            'the title',
            entry_list[0].find(ns + 'title').text)
Exemplo n.º 16
0
    def test_feed(self):
        # The main Atom feed lists both posts, newest first, and each
        # entry's id is the absolute post URL.
        feed_response = self.fetch(self.reverse_url('feed'))
        self.assertEqual(200, feed_response.code)
        entry_list = list(fromstring(feed_response.body).findall(
            ns + 'entry'))
        self.assertEqual(2, len(entry_list))

        newest, oldest = entry_list

        self.assertEqual('other title', newest.find(ns + 'title').text)
        self.assertEqual(
            self.reverse_url_absolute('post', slugify.slugify('other title')),
            newest.find(ns + 'id').text)

        self.assertEqual('the title', oldest.find(ns + 'title').text)
        self.assertEqual(
            self.reverse_url_absolute('post', slugify.slugify('the title')),
            oldest.find(ns + 'id').text)
Exemplo n.º 17
0
    def test_single_post_mod_date(self):
        """A post page's mod date tracks the newest change on the blog."""
        first_id = self.new_post(title='title 1',
                                 created=datetime.datetime(2014, 1, 1))
        self.new_post(title='title 2', created=datetime.datetime(2014, 1, 2))

        url = self.reverse_url('post', slugify.slugify('title 2'))
        self.assert_modified(url, datetime.datetime(2014, 1, 2))

        # Creating another post bumps the page's modified date.
        self.new_post(title='title 3', created=datetime.datetime(2014, 1, 3))
        self.assert_modified(url, datetime.datetime(2014, 1, 3))

        # So does editing an unrelated post.
        self.edit_post(first_id,
                       'title 1',
                       updated=datetime.datetime(2014, 1, 4))
        self.assert_modified(url, datetime.datetime(2014, 1, 4))
Exemplo n.º 18
0
    def test_single_post_mod_date(self):
        # The modified date of a single post's page reflects the most
        # recent change anywhere on the blog, not just that post.
        dt = datetime.datetime
        one_id = self.new_post(title='title 1', created=dt(2014, 1, 1))
        self.new_post(title='title 2', created=dt(2014, 1, 2))

        title_2_slug = slugify.slugify('title 2')
        url = self.reverse_url('post', title_2_slug)
        self.assert_modified(url, dt(2014, 1, 2))

        self.new_post(title='title 3', created=dt(2014, 1, 3))
        self.assert_modified(url, dt(2014, 1, 3))

        self.edit_post(one_id, 'title 1', updated=dt(2014, 1, 4))
        self.assert_modified(url, dt(2014, 1, 4))
Exemplo n.º 19
0
def main(args):
    """Migrate a WordPress blog (reached over XML-RPC at
    ``args.source_url``) into the motorblog MongoDB database.

    Optionally wipes and re-initializes the database, copies all posts
    and pages (converting their bodies to markdown), recreates
    categories, and finally posts a "categories_changed" event.
    """
    start = time.time()

    opts = options.options()
    # Normalize the destination base URL to a single leading slash.
    destination_url = '/' + opts.base_url.lstrip('/')
    parts = urlparse(args.source_url)
    # Source site root: scheme://host plus the path up to /xmlrpc.php.
    source_base_url = urljoin('%s://%s' % (parts[0], parts[1]),
                              parts[2].split('/xmlrpc.php')[0])

    print 'Base URL', source_base_url

    db = pymongo.Connection(safe=True).motorblog
    motordb = motor.MotorClient().open_sync().motorblog
    if args.wipe:
        print 'Wiping motorblog database'
        db.connection.drop_database('motorblog')
        print 'Creating capped collection "events"'
        create_events_collection(motordb)
        print 'Recreating indexes'
        ensure_indexes(db)

    # The on-disk cache is bypassed when --refresh is passed.
    source = Blog(args.source_url,
                  args.source_username,
                  args.source_password,
                  use_cache=not args.refresh,
                  verbose=args.verbose)
    print 'Getting media library'

    media_library = set([m['link'] for m in source.get_media_library()])

    print '    %s assets\n' % len(media_library)

    print 'Getting posts and pages'
    post_structs = source.get_recent_posts(args.nposts)
    print '    %s posts' % len(post_structs)
    page_structs = source.get_pages()
    print '    %s pages' % len(page_structs)
    print

    for structs, post_type in [
        (post_structs, 'post'),
        (page_structs, 'page'),
    ]:
        print '%sS' % post_type.upper()
        for struct in structs:
            # Categories are re-linked below, so pull them out before
            # converting the body.
            categories = struct.pop('categories', [])
            struct['description'] = wordpress_to_markdown(
                struct, media_library, db, destination_url, source_base_url)

            post = Post.from_metaweblog(struct, post_type)

            print '%-34s %s' % (post.title, post.status.upper())
            for category_name in categories:
                doc = db.categories.find_one({'name': category_name})
                if doc:
                    category = Category(**doc)
                else:
                    # First time we've seen this category: create it.
                    category = Category(name=category_name,
                                        slug=slugify(category_name))
                    category.id = db.categories.insert(category.to_python())
                print '    %-30s %s' % (category_name,
                                        ' NEW' if not doc else '')

                post.categories.append(category)

            db.posts.insert(post.to_python())

        print '\nFinished %s %ss' % (len(structs), post_type)

    print 'Posting "categories_changed" event'

    # Notify running server processes that the category set changed.
    db.events.insert(
        {
            'ts': datetime.datetime.utcnow(),
            'name': 'categories_changed'
        },
        manipulate=False)  # No need to add _id

    print '\nFinished in %.2f seconds' % (time.time() - start)
Exemplo n.º 20
0
def media_link(year, month, filename):
    """Build the year/month-prefixed, slugified media path for a file,
    keeping its original extension."""
    stem, extension = os.path.splitext(filename)
    slugged = slugify.slugify(stem)
    return '%04d/%02d/%s%s' % (year, month, slugged, extension)
Exemplo n.º 21
0
    def _from_rpc(cls, struct, name):
        """Build a Category from a metaWeblog RPC struct and a name."""
        if 'categoryId' in struct:
            category_id = ObjectId(struct['categoryId'])
        else:
            category_id = None

        return cls(name=name, slug=slugify.slugify(name), id=category_id)
Exemplo n.º 22
0
 def _from_rpc(cls, struct, name):
     """Create a Category instance from an RPC struct and a name."""
     # Only wrap in ObjectId when the RPC struct actually carried an id.
     if 'categoryId' in struct:
         _id = ObjectId(struct['categoryId'])
     else:
         _id = None
     return cls(name=name, slug=slugify.slugify(name), id=_id)
Exemplo n.º 23
0
def main(args):
    start = time.time()

    opts = options.options()
    destination_url = '/' + opts.base_url.lstrip('/')
    parts = urlparse(args.source_url)
    source_base_url = urljoin(
        '%s://%s' % (parts[0], parts[1]), parts[2].split('/xmlrpc.php')[0])

    print 'Base URL', source_base_url

    db = pymongo.Connection(safe=True).motorblog
    motordb = motor.MotorConnection().open_sync().motorblog
    if args.wipe:
        print 'Wiping motorblog database'
        db.connection.drop_database('motorblog')
        print 'Creating capped collection "events"'
        create_events_collection(motordb)
        print 'Recreating indexes'
        ensure_indexes(db)

    source = Blog(
        args.source_url, args.source_username, args.source_password,
        use_cache=not args.refresh, verbose=args.verbose)
    print 'Getting media library'

    media_library = set([
        m['link'] for m in source.get_media_library()])

    print '    %s assets\n' % len(media_library)

    print 'Getting posts and pages'
    post_structs = source.get_recent_posts(args.nposts)
    print '    %s posts' % len(post_structs)
    page_structs = source.get_pages()
    print '    %s pages' % len(page_structs)
    print

    for structs, type in [
        (post_structs, 'post'),
        (page_structs, 'page'),
    ]:
        print '%sS' % type.upper()
        for struct in structs:
            categories = struct.pop('categories', [])
            struct['description'] = wordpress_to_markdown(
                struct, media_library, db, destination_url, source_base_url)

            post = Post.from_metaweblog(struct, type)

            print '%-34s %s' % (post.title, post.status.upper())
            for category_name in categories:
                doc = db.categories.find_one({'name': category_name})
                if doc:
                    category = Category(**doc)
                else:
                    category = Category(
                        name=category_name, slug=slugify(category_name))
                    category.id = db.categories.insert(category.to_python())
                print '    %-30s %s' % (
                    category_name, ' NEW' if not doc else ''
                )

                post.categories.append(category)

            db.posts.insert(post.to_python())

        print '\nFinished %s %ss' % (len(structs), type)


    print 'Posting "categories_changed" event'
    db.events.insert(
        {'ts': datetime.datetime.utcnow(), 'name': 'categories_changed'},
        manipulate=False) # No need to add _id

    print '\nFinished in %.2f seconds' % (time.time() - start)