Example #1
    def test_archive_transformers(self):
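        # transform_content should run the archive transformers; the
        # transformed markup is expected to contain '>bar<'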
        storage = archival.Storage(self.portal)
        result = storage.transform_content(
            '<html><body><div class="foo">foo</div></body></html>',
            'http://foobar.com')

        self.assertTrue('>bar<' in result)
Example #2
    def __init__(self, site, request):
        self.site = site
        self.request = request
        bucket_name = api.portal.get_registry_record('castle.aws_s3_bucket_name')
        self.configured_bucket_name = bucket_name
        self.s3, self.bucket = aws.get_bucket(bucket_name)
        self.archive_storage = archival.Storage(site)
Example #3
    def crawl_archives(self):
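        # index archived pages that are not yet in Elasticsearch, then drop
        # index entries whose archives have been removed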
        registry = getUtility(IRegistry)
        base_url = registry.get('castle.aws_s3_base_url', None)

        storage = archival.Storage(self.site)
        urls = []
        for key, archive_data in storage.archives.items():
            # archives do not need to be re-indexed ever.
            # see if the key is in ES, if it is move on
            url = archive_data.get('view_url', None) or archive_data['url']
            urls.append(aws.swap_url(url, base_url=base_url))

        query = {"bool": {"filter": {"term": {"sitemap": "archives"}}}}
        existing_urls = self.get_all_from_es(query)
        for _id in set(urls) - set(existing_urls):
            # pages that have not yet been crawled
            try:
                self.crawl_archive_url(_id)
            except Exception:
                logger.error('Error indexing archive url: ' + _id,
                             exc_info=True)

        for _id in set(existing_urls) - set(urls):
            # pages that have been removed from the archive
            self.delete_from_index(_id)
Example #4
    def test_archive_replacement_selector(self):
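        # replacements keyed by CSS selector swap the text of matching elements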
        storage = archival.Storage(self.portal)

        storage.replacements = {'.foobar': 'barfoo'}

        self.assertEqual(
            '<html><body><div class="foobar">barfoo</div></body></html>',
            storage.apply_replacements('<html><body><div class="foobar"></div>'
                                       '</body></html>'))
Example #5
    def test_archive_transformers(self):
        login(self.portal, TEST_USER_NAME)
        setRoles(self.portal, TEST_USER_ID, ('Member', 'Manager'))
        storage = archival.Storage(self.portal)
        result = storage.transform_content(
            '<html><body><div class="foo">foo</div></body></html>',
            'http://foobar.com')

        self.assertTrue('>bar<' in result)
Example #6
    def test_archive_replacement_text(self):
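        # plain-string replacements are substituted directly in the archived markup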
        storage = archival.Storage(self.portal)

        storage.replacements = {'foobar': 'barfoo'}

        self.assertEqual(
            '<html><body><div>barfoo</div></body></html>',
            storage.apply_replacements('<html><body><div>foobar</div>'
                                       '</body></html>'))
Example #7
    def test_archive_replacement_text(self):
        login(self.portal, TEST_USER_NAME)
        setRoles(self.portal, TEST_USER_ID, ('Member', 'Manager'))
        storage = archival.Storage(self.portal)

        storage.replacements = {'foobar': 'barfoo'}

        self.assertEqual(
            '<html><body><div>barfoo</div></body></html>',
            storage.apply_replacements('<html><body><div>foobar</div>'
                                       '</body></html>'))
Example #8
    def test_is_s3_url(self):
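        # a URL on the local test endpoint should not count as an S3 URL,
        # while an s3.amazonaws.com URL should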
        site = api.portal.get()
        storage = archival.Storage(site)
        parsed_endpoint = urlparse(
            storage.s3_conn.meta.client.meta.endpoint_url)

        urlstr = "http://localhost:9000/{}/{}".format(self.test_bucket_name,
                                                      self.test_access_key)
        self.assertFalse(is_s3_url(urlstr, parsed_endpoint))

        urlstr = "https://s3.amazonaws.com/{}/{}".format(
            self.test_bucket_name, self.test_access_key)
        self.assertTrue(is_s3_url(urlstr, parsed_endpoint))
Example #9
    def __call__(self):
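        # not-found view: look for an archived copy of the requested path and
        # redirect to it before falling back to the standard 404 page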
        shield.protect(self.request, recheck=True)
        self.notfound = self.context
        self.context = api.portal.get()
        if '++' in self.request.URL:
            self.request.response.setStatus(404)
            try:
                return self.index()
            except Exception:
                logger.warning(
                    "Failed to render 404 template, had to return simple response"
                )
                return "not found"

        archive_storage = archival.Storage(self.context)
        site_url = self.context.absolute_url()
        path = self.request.ACTUAL_URL[len(site_url):].rstrip('/')

        wants_view = False
        if path.endswith('/view'):
            wants_view = True
            path = path.rsplit('/view', 1)[0]

        new_url = None
        if path.startswith('/resolveuid'):
            uid = path.replace('/resolveuid/', '')
            try:
                new_url = archive_storage.get_archive_url_by_uid(uid)
            except Exception:
                pass
        else:
            try:
                new_url = archive_storage.get_archive_url_by_path(
                    path, wants_view)
            except Exception:
                pass
        if new_url:
            # XXX need to force redirect this way since normal redirect
            # gets overridden with 404
            if self.request.environ.get('QUERY_STRING'):
                new_url += '?' + self.request.environ['QUERY_STRING']
            raise Redirect(aws.swap_url(new_url))

        self.attempt_redirect()

        self.request.response.setStatus(404)
        return self.index()
Example #10
    def test_move_resource(self):
        # this is creating a bucket in the moto/mock s3 service
        s3conn = boto3.resource('s3', endpoint_url=self.test_bucket_endpoint)
        s3conn.create_bucket(Bucket='castletest')
        s3, bucket = aws.get_bucket("castletest")

        storage = archival.Storage(self.portal)

        testcontent = 'this is some content'
        moveresource = api.content.create(type='Document',
                                          id='moveresource',
                                          container=self.portal)
        moveresource.content = testcontent
        api.content.transition(moveresource, 'publish')
        vpath = "/moveresource"
        url = self.portal.absolute_url() + vpath

        new_url = storage.move_resource(url, use_vhm=False)

        self.assertIsNotNone(new_url)

        try:
            # trim off, e.g., 'https://s3.amazonaws.com/bucketname'
            # and then convert the path back from the url escaped version
            droppart = "{}/{}/".format(self.test_bucket_endpoint,
                                       self.test_bucket_name)
            content_path = unquote_plus(new_url[len(droppart):])
            bucket.Object(content_path).load()
        except botocore.exceptions.ClientError:
            self.fail("object does not exist after move")

        # move by url of content again
        new_url2 = storage.move_resource(url, use_vhm=False)
        self.assertEqual(new_url, new_url2)

        # test for existence of content in aws still
        try:
            # trim off, e.g., 'https://s3.amazonaws.com/castletest/'
            # and then convert the path back from the url escaped version
            droppart = "{}/{}/".format(self.test_bucket_endpoint,
                                       self.test_bucket_name)
            content_path = unquote_plus(new_url[len(droppart):])
            bucket.Object(content_path).load()
        except botocore.exceptions.ClientError:
            self.fail("object does not exist after move")
Example #11
    def test_move_to_aws(self):
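        # move_to_aws should store the content under CONTENT_KEY_PREFIX in the bucket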
        # this is creating a bucket in the moto/mock s3 service
        s3conn = boto3.resource('s3', endpoint_url=self.test_bucket_endpoint)
        s3conn.create_bucket(Bucket='castletest')
        s3, bucket = aws.get_bucket("castletest")

        storage = archival.Storage(self.portal)
        content = "this is a test"
        content_path = "a/test/path/for/this/test.html"
        content_type = "text/html; charset=utf-8"

        # the key should not be there before we run through this
        self.assertRaises(botocore.exceptions.ClientError,
                          lambda: bucket.Object(content_path).load())

        storage.move_to_aws(content, content_path, content_type)

        try:
            bucket.Object(archival.CONTENT_KEY_PREFIX + content_path).load()
        except botocore.exceptions.ClientError:
            self.fail("object does not exist after move")
Example #12
    def __call__(self):
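        # not-found view: redirect to an archived copy of the requested path if
        # one exists, otherwise fall back to the plone.app.redirector handler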
        shield.protect(self.request)

        self.notfound = self.context
        self.context = api.portal.get()
        archive_storage = archival.Storage(self.context)
        site_url = self.context.absolute_url()
        path = self.request.ACTUAL_URL[len(site_url):].rstrip('/')

        wants_view = False
        if path.endswith('/view'):
            wants_view = True
            path = path.rsplit('/view', 1)[0]

        new_url = None
        if path.startswith('/resolveuid'):
            uid = path.replace('/resolveuid/', '')
            try:
                new_url = archive_storage.get_archive_url_by_uid(uid)
            except Exception:
                pass
        else:
            try:
                new_url = archive_storage.get_archive_url_by_path(path, wants_view)
            except Exception:
                pass
        if new_url:
            # XXX need to force redirect this way since normal redirect
            # gets overridden with 404
            if self.request.environ.get('QUERY_STRING'):
                new_url += '?' + self.request.environ['QUERY_STRING']
            raise Redirect(aws.swap_url(new_url))

        # seems this overrides plone.app.redirector handler
        redirector = queryMultiAdapter((self.context, self.request),
                                       name=u'plone_redirector_view')
        if redirector:
            redirector.attempt_redirect()

        return self.index()
Example #13
    return '/'.join(parsed.path.split('/')[2:])


if __name__ == '__main__':
    login_as_admin(app)  # noqa
    site = app[args.site_id]  # noqa
    setSite(site)

    toremove = {}  # uid: path
    catalog = api.portal.get_tool('portal_catalog')
    registry = getUtility(IRegistry)
    crawler_settings = registry.forInterface(ICrawlerConfiguration,
                                             prefix='castle')
    es = ElasticSearchCatalog(catalog)
    crawler = Crawler(site, crawler_settings, es)
    storage = archival.Storage(site)
    for key, archive_data in storage.archives.items():
        for url in (archive_data.get('view_url'), archive_data['url']):
            if not url:
                continue
            resp = requests.get(url)
            if 'html' not in resp.headers.get('content-type', ''):
                continue
            print('processing ' + url)
            dom = fromstring(resp.content)
            prop = dom.cssselect('meta[property="og:url"]')

            fix_urls(storage, dom)
            html = tostring(dom)

            if html == resp.content:
Example #14
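# import items (path + uid) from a JSON export into the archive storage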
parser = argparse.ArgumentParser(description='...')
parser.add_argument('--file', dest='file', default=False)
parser.add_argument('--site-id', dest='site_id', default='Plone')
parser.add_argument('--site-url', dest='site_url', default='')
args, _ = parser.parse_known_args()

user = app.acl_users.getUser('admin')  # noqa
newSecurityManager(None, user.__of__(app.acl_users))  # noqa
site = app[args.site_id]  # noqa
setSite(site)

with open(args.file) as fi:
    items = json.load(fi)

storage = archival.Storage(site, UrlOpener=archival.RequestsUrlOpener)
count = 0
for item in items:
    count += 1
    content_path = '/' + '/'.join(item['path'].split('/')[2:])
    url = args.site_url.rstrip('/') + content_path
    # need to export UID also
    new_url = storage.add_url(url, content_path, item['uid'])
    if new_url:
        print('imported %s -> %s' % (url, new_url))
    else:
        print('error importing %s' % (url, ))
    if count % 100 == 0:
        print('done with %i' % count)
        transaction.commit()
Example #15
def _archive_content(obj):
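    # add the object to the archive storage, then sync and store its record by UID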
    storage = archival.Storage(getSite())
    storage.add_content(obj)
    _sync_and_store(storage, IUUID(obj))
Example #16
def archive(site):
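    # archive eligible published content to S3, then email site administrators
    # about content that will be archived in the next 7 days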
    setup_site(site)

    if (not api.portal.get_registry_record('castle.archival_enabled')
            or not api.portal.get_registry_record('castle.aws_s3_bucket_name')
            or not api.portal.get_registry_record('castle.aws_s3_key')
            or not api.portal.get_registry_record('castle.aws_s3_secret')
            or not api.portal.get_registry_record('plone.public_url')):
        logger.error(
            'Cannot archive content: archiving is not enabled, the S3 API is not '
            'configured, or no public URL is set')
        return

    storage = archival.Storage(site)
    for brain in archival.getContentToArchive():
        try:
            ob = brain.getObject()

            container = aq_parent(ob)
            if (IPloneSiteRoot.providedBy(container)
                    and getDefaultPage(container) == ob.getId()):
                continue

            allowed = set(rolesForPermissionOn('View', ob))
            if 'Anonymous' not in allowed:
                # we can *not* archive unpublished content
                continue
            new_url = storage.add_content(ob)

            # resets login creds..
            login_as_admin(app)  # noqa

            if new_url:
                logger.warning('imported %s -> %s' % (ob.absolute_url(), new_url))
                # XXX might need to re-architect... could get conflict errors given
                # how slow archiving is...
                api.content.delete(ob)
                transaction.commit()
            else:
                logger.error('error importing %s' % ob.absolute_url())
        except Exception:
            logger.error('Error archiving %s' % brain.getPath(), exc_info=True)

    content_to_archive = archival.getContentToArchive(7)
    if len(content_to_archive) == 0:
        return

    backend_url = get_backend_url()
    # send out email warning of content about to be archived
    email_text = """
<p>Warning: this content will be archived in 7 days.
Log in to
<a href="{site_url}">{site_title}</a> to extend this content.
</p>
<ul>""".format(site_title=api.portal.get_registry_record('plone.site_title'),
               site_url=backend_url)

    site_url = api.portal.get().absolute_url()
    for brain in content_to_archive:
        url = brain.getURL()
        url = url.replace(site_url, backend_url)
        email_text += """<li>
<a href="{url}">{title}</a></li>""".format(url=url, title=brain.Title)

    email_text += '</ul>'

    for user in api.user.get_users():
        roles = api.user.get_roles(user=user)
        if ('Site Administrator' not in roles and 'Manager' not in roles):
            continue
        email = user.getProperty('email')
        if not email:
            continue

        name = user.getProperty('fullname') or user.getId()
        html = '<p>Hi {name},</p>'.format(name=name) + email_text
        send_email(recipients=email,
                   subject="Content will be archived(Site: %s)" %
                   (api.portal.get_registry_record('plone.site_title')),
                   html=html)
Example #17
def _archive_url(url, path, uid):
    storage = archival.Storage(getSite(), UrlOpener=archival.RequestsUrlOpener)
    storage.add_url(url, path, uid)
    _sync_and_store(storage, uid)