Example #1
def get_seo_description(content, locale=None, strip_markup=True):
    # Create an SEO summary
    # TODO:  Google only takes the first 180 characters, so maybe we find a
    #        logical way to find the end of sentence before 180?
    seo_summary = ''
    if content:
        # Try constraining the search for summary to an explicit "Summary"
        # section, if any.
        summary_section = (parse(content).extractSection('Summary')
                           .serialize())
        if summary_section:
            content = summary_section

        # Need to add a BR to the page content otherwise pyQuery won't find
        # a <p></p> element if it's the only element in the doc_html
        seo_analyze_doc_html = content + '<br />'
        page = pq(seo_analyze_doc_html)

        # Look for the SEO summary class first
        summaryClasses = page.find('.seoSummary')
        if len(summaryClasses):
            if strip_markup:
                seo_summary = summaryClasses.text()
            else:
                seo_summary = ''.join(
                    to_html(item) for item in summaryClasses.items())
        else:
            paragraphs = page.find('p')
            if paragraphs.length:
                for p in range(len(paragraphs)):
                    item = paragraphs.eq(p)
                    if strip_markup:
                        text = item.text()
                    else:
                        text = to_html(item)
                    # Checking for a parent length of 2
                    # because we don't want p's wrapped
                    # in DIVs ("<div class='warning'>") and pyQuery adds
                    # "<html><div>" wrapping to entire document
                    text_match = (
                        text and len(text) and
                        'Redirect' not in text and
                        text.find(u'«') == -1 and
                        text.find('&laquo') == -1 and
                        item.parents().length == 2)
                    if text_match:
                        seo_summary = text.strip()
                        break

    if strip_markup:
        # Post-found cleanup
        # remove markup chars
        seo_summary = seo_summary.replace('<', '').replace('>', '')
        # remove spaces around some punctuation added by PyQuery
        if locale == 'en-US':
            seo_summary = re.sub(r' ([,\)\.])', r'\1', seo_summary)
            seo_summary = re.sub(r'(\() ', r'\1', seo_summary)

    return seo_summary
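
A minimal usage sketch of the helper above, assuming the helpers it relies on (parse, pq, to_html) are imported as in kuma's content module; the HTML inputs are illustrative only and the expected results are shown in comments:

# An explicit .seoSummary element wins over any plain paragraph:
summary = get_seo_description(
    '<p class="seoSummary">JavaScript is a scripting language.</p>',
    locale='en-US')
# -> 'JavaScript is a scripting language.'

# Without .seoSummary, the first top-level <p> that is not a redirect and
# contains no laquo breadcrumb is used:
summary = get_seo_description(
    '<p>First paragraph.</p><p>Second paragraph.</p>',
    locale='en-US')
# -> 'First paragraph.'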
Example #2
def test_xss_file_attachment_title(admin_client, constance_config, root_doc,
                                   wiki_user, editor_client):
    constance_config.WIKI_ATTACHMENT_ALLOWED_TYPES = 'text/plain'

    # use view to create new attachment
    file_for_upload = make_test_file()
    files_url = reverse('attachments.edit_attachment',
                        kwargs={'document_path': root_doc.slug})
    title = '"><img src=x onerror=prompt(navigator.userAgent);>'
    post_data = {
        'title': title,
        'description': 'xss',
        'comment': 'xss',
        'file': file_for_upload,
    }
    response = admin_client.post(files_url, data=post_data)
    assert response.status_code == 302

    # now stick it in/on a document
    attachment = Attachment.objects.get(title=title)
    content = '<img src="%s" />' % attachment.get_file_url()
    root_doc.current_revision = Revision.objects.create(
        document=root_doc, creator=wiki_user, content=content)

    # view it and verify markup is escaped
    response = editor_client.get(root_doc.get_edit_url())
    assert response.status_code == 200
    doc = pq(response.content)
    text = doc('.page-attachments-table .attachment-name-cell').text()
    assert text == ('%s\nxss' % title)
    html = to_html(doc('.page-attachments-table .attachment-name-cell'))
    assert '&gt;&lt;img src=x onerror=prompt(navigator.userAgent);&gt;' in html
    # security bug 1272791
    for script in doc('script'):
        assert title not in script.text_content()
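
The assertions above depend on Django's template auto-escaping turning the injected title into entities before it reaches the attachments table; a sketch with the standard library (not part of the test module) shows the escaped form the test looks for:

from html import escape

title = '"><img src=x onerror=prompt(navigator.userAgent);>'
escape(title)
# -> '&quot;&gt;&lt;img src=x onerror=prompt(navigator.userAgent);&gt;'
# hence the assertion that the escaped '&gt;&lt;img ...&gt;' sequence appears
# in the cell's HTML while the raw title never reaches a <script> element.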
Example #3
    def test_topic_filter(self):
        url = urlparams(reverse('dashboards.revisions', locale='en-US'),
                        topic='article-with-revisions')
        response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
        eq_(response.status_code, 200)

        page = pq(response.content)
        revisions = page.find('.dashboard-row')

        eq_(revisions.length, 7)
        for revision in revisions:
            ok_('lorem' not in to_html(pq(revision).find('.dashboard-title')))
Example #4
File: content.py  Project: mozilla/kuma
def filter_out_noinclude(src):
    """
    Quick and dirty filter to remove <div class="noinclude"> blocks
    """
    # NOTE: This started as an html5lib filter, but it started getting really
    # complex. Seems like pyquery works well enough without corrupting
    # character encoding.
    if not src:
        return ''
    doc = pq(src)
    doc.remove('*[class=noinclude]')
    return to_html(doc)
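
An illustrative call, assuming pq is pyquery.PyQuery and to_html serializes the remaining selection back to markup; the exact wrapping of the output depends on those helpers:

src = '<p>Kept content.</p><div class="noinclude">Editor-only note.</div>'
filter_out_noinclude(src)
# -> roughly '<p>Kept content.</p>' (the class="noinclude" block is removed)
filter_out_noinclude('')
# -> '' (falsy input returns early without any parsing)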
Example #5
File: test_views.py  Project: Elchi3/kuma
def test_revisions_locale_filter(dashboard_revisions, user_client):
    """Revisions can be filtered by locale."""
    url = urlparams(reverse('dashboards.revisions', locale='fr'),
                    locale='fr')
    response = user_client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
    assert response.status_code == 200

    page = pq(response.content)
    revisions = page.find('.dashboard-row')
    assert revisions.length == 1
    locale = to_html(revisions.find('.locale'))
    assert locale == 'fr'
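
The request URL in these dashboard tests is built with kuma's urlparams helper on top of reverse; a rough sketch of what that call produces (the concrete path is an assumption, it comes from the project's URL configuration):

# reverse('dashboards.revisions', locale='fr') resolves to the localized
# dashboard path, and urlparams appends the query string, e.g.:
url = urlparams(reverse('dashboards.revisions', locale='fr'), locale='fr')
# -> something like '/fr/dashboards/revisions?locale=fr'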
Example #6
def test_revisions_locale_filter(dashboard_revisions, user_client):
    """Revisions can be filtered by locale."""
    url = urlparams(reverse("dashboards.revisions", locale="fr"), locale="fr")
    response = user_client.get(url,
                               HTTP_HOST=settings.WIKI_HOST,
                               HTTP_X_REQUESTED_WITH="XMLHttpRequest")
    assert response.status_code == 200

    page = pq(response.content)
    revisions = page.find(".dashboard-row")
    assert revisions.length == 1
    locale = to_html(revisions.find(".locale"))
    assert locale == "fr"
Example #7
    def test_locale_filter(self):
        url = urlparams(reverse('dashboards.revisions', locale='fr'),
                        locale='fr')
        response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
        eq_(200, response.status_code)

        page = pq(response.content)
        revisions = page.find('.dashboard-row')

        ok_(len(revisions))
        eq_(1, revisions.length)

        ok_('fr' in to_html(pq(revisions[0]).find('.locale')))
Example #8
    def test_known_authors_lookup(self):
        # Only testuser01 is in the Known Authors group
        url = urlparams(reverse('dashboards.revisions', locale='en-US'),
                        authors=RevisionDashboardForm.KNOWN_AUTHORS)
        response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
        eq_(200, response.status_code)

        page = pq(response.content)
        revisions = page.find('.dashboard-row')

        for revision in revisions:
            author = to_html(pq(revision).find('.dashboard-author'))
            ok_('testuser01' in author)
            ok_('testuser2' not in author)
Example #9
File: test_views.py  Project: Elchi3/kuma
def test_edit_attachment_post_with_vacant_file(admin_client, root_doc, tmpdir,
                                               mode):
    post_data = {
        'title': 'Test uploaded file',
        'description': 'A test file uploaded into kuma.',
        'comment': 'Initial upload',
    }

    if mode == 'empty-file':
        empty_file = tmpdir.join('empty')
        empty_file.write('')
        post_data['file'] = empty_file
        expected = 'The submitted file is empty.'
    else:
        expected = 'This field is required.'

    url = reverse('attachments.edit_attachment',
                  kwargs={'document_path': root_doc.slug})
    response = admin_client.post(url, data=post_data)
    assert response.status_code == 200
    doc = pq(response.content)
    assert to_html(doc('ul.errorlist a[href="#id_file"]')) == expected
Example #10
def test_xss_file_attachment_title(admin_client, constance_config, root_doc,
                                   wiki_user, editor_client, settings):
    constance_config.WIKI_ATTACHMENT_ALLOWED_TYPES = "text/plain"

    # use view to create new attachment
    file_for_upload = make_test_file()
    files_url = reverse("attachments.edit_attachment",
                        kwargs={"document_path": root_doc.slug})
    title = '"><img src=x onerror=prompt(navigator.userAgent);>'
    post_data = {
        "title": title,
        "description": "xss",
        "comment": "xss",
        "file": file_for_upload,
    }
    response = admin_client.post(files_url,
                                 data=post_data,
                                 HTTP_HOST=settings.WIKI_HOST)
    assert response.status_code == 302

    # now stick it in/on a document
    attachment = Attachment.objects.get(title=title)
    content = '<img src="%s" />' % attachment.get_file_url()
    root_doc.current_revision = Revision.objects.create(document=root_doc,
                                                        creator=wiki_user,
                                                        content=content)

    # view it and verify markup is escaped
    response = editor_client.get(root_doc.get_edit_url(),
                                 HTTP_HOST=settings.WIKI_HOST)
    assert response.status_code == 200
    doc = pq(response.content)
    text = doc(".page-attachments-table .attachment-name-cell").text()
    assert text == ("%s\nxss" % title)
    html = to_html(doc(".page-attachments-table .attachment-name-cell"))
    assert "&gt;&lt;img src=x onerror=prompt(navigator.userAgent);&gt;" in html
    # security bug 1272791
    for script in doc("script"):
        assert title not in script.text_content()
Example #11
def test_edit_attachment_post_with_vacant_file(admin_client, root_doc, tmpdir,
                                               mode):
    post_data = {
        "title": "Test uploaded file",
        "description": "A test file uploaded into kuma.",
        "comment": "Initial upload",
    }

    if mode == "empty-file":
        empty_file = tmpdir.join("empty")
        empty_file.write("")
        post_data["file"] = empty_file
        expected = "The submitted file is empty."
    else:
        expected = "This field is required."

    url = reverse("attachments.edit_attachment",
                  kwargs={"document_path": root_doc.slug})
    response = admin_client.post(url,
                                 data=post_data,
                                 HTTP_HOST=settings.WIKI_HOST)
    assert response.status_code == 200
    doc = pq(response.content)
    assert to_html(doc('ul.errorlist a[href="#id_file"]')) == expected
Example #12
def test_xss_file_attachment_title(admin_client, constance_config, root_doc,
                                   wiki_user, editor_client):
    constance_config.WIKI_ATTACHMENT_ALLOWED_TYPES = 'text/plain'

    # use view to create new attachment
    file_for_upload = make_test_file()
    files_url = reverse('attachments.edit_attachment',
                        kwargs={'document_path': root_doc.slug},
                        locale='en-US')
    title = '"><img src=x onerror=prompt(navigator.userAgent);>'
    post_data = {
        'title': title,
        'description': 'xss',
        'comment': 'xss',
        'file': file_for_upload,
    }
    response = admin_client.post(files_url, data=post_data)
    assert response.status_code == 302

    # now stick it in/on a document
    attachment = Attachment.objects.get(title=title)
    content = '<img src="%s" />' % attachment.get_file_url()
    root_doc.current_revision = Revision.objects.create(
        document=root_doc, creator=wiki_user, content=content)

    # view it and verify markup is escaped
    response = editor_client.get(root_doc.get_edit_url())
    assert response.status_code == 200
    doc = pq(response.content)
    text = doc('.page-attachments-table .attachment-name-cell').text()
    assert text == ('%s\nxss' % title)
    html = to_html(doc('.page-attachments-table .attachment-name-cell'))
    assert '&gt;&lt;img src=x onerror=prompt(navigator.userAgent);&gt;' in html
    # security bug 1272791
    for script in doc('script'):
        assert title not in script.text_content()
Example #13
File: document.py  Project: mozilla/kuma
def _document_api_PUT(request, document_slug, document_locale):
    """
    Handle PUT requests for the document_api view.
    """

    # Try parsing one of the supported content types from the request
    try:
        content_type = request.META.get('CONTENT_TYPE', '')

        if content_type.startswith('application/json'):
            data = json.loads(request.body)

        elif content_type.startswith('multipart/form-data'):
            parser = MultiPartParser(request.META,
                                     StringIO(request.body),
                                     request.upload_handlers,
                                     request.encoding)
            data, files = parser.parse()

        elif content_type.startswith('text/html'):
            # TODO: Refactor this into wiki.content ?
            # First pass: Just assume the request body is an HTML fragment.
            html = request.body
            data = dict(content=html)

            # Second pass: Try parsing the body as a fuller HTML document,
            # and scrape out some of the interesting parts.
            try:
                doc = pq(html)
                head_title = doc.find('head title')
                if head_title.length > 0:
                    data['title'] = head_title.text()
                body_content = doc.find('body')
                if body_content.length > 0:
                    data['content'] = to_html(body_content)
            except Exception:
                pass

        else:
            resp = HttpResponse()
            resp.status_code = 400
            resp.content = ugettext(
                "Unsupported content-type: %s") % content_type
            return resp

    except Exception as e:
        resp = HttpResponse()
        resp.status_code = 400
        resp.content = ugettext("Request parsing error: %s") % e
        return resp

    try:
        # Look for existing document to edit:
        doc = Document.objects.get(locale=document_locale, slug=document_slug)
        section_id = request.GET.get('section', None)
        is_new = False

        # Use ETags to detect mid-air edit collision
        # see: http://www.w3.org/1999/04/Editing/
        if_match = request.META.get('HTTP_IF_MATCH')
        if if_match:
            try:
                expected_etags = parse_etags(if_match)
            except ValueError:
                expected_etags = []
            # Django's parse_etags returns a list of quoted rather than
            # un-quoted ETags starting with version 1.11.
            current_etag = quote_etag(calculate_etag(doc.get_html(section_id)))
            if current_etag not in expected_etags:
                resp = HttpResponse()
                resp.status_code = 412
                resp.content = ugettext('ETag precondition failed')
                return resp

    except Document.DoesNotExist:
        # TODO: There should be a model utility for creating a doc...

        # Let's see if this slug path implies a parent...
        slug_parts = split_slug(document_slug)
        if not slug_parts['parent']:
            # Apparently, this is a root page!
            parent_doc = None
        else:
            # There's a parent implied, so make sure we can find it.
            parent_doc = get_object_or_404(Document, locale=document_locale,
                                           slug=slug_parts['parent'])

        # Create and save the new document; we'll revise it immediately.
        doc = Document(slug=document_slug, locale=document_locale,
                       title=data.get('title', document_slug),
                       parent_topic=parent_doc)
        doc.save()
        section_id = None  # No section editing for new document!
        is_new = True

    new_rev = doc.revise(request.user, data, section_id)
    doc.schedule_rendering('max-age=0')

    request.authkey.log('created' if is_new else 'updated',
                        new_rev, data.get('summary', None))

    resp = HttpResponse()
    if is_new:
        resp['Location'] = request.build_absolute_uri(doc.get_absolute_url())
        resp.status_code = 201
    else:
        resp.status_code = 205

    return resp
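
The view above implements the mid-air-collision check via the If-Match header and ETags; a hypothetical client-side sketch of that flow with the requests library (the URL and auth setup are assumptions, not part of kuma):

import requests

doc_url = 'https://wiki.example.com/en-US/docs/Sandbox'  # hypothetical

# 1. Fetch the current document and remember its ETag (assuming the
#    response carries one).
current = requests.get(doc_url)
etag = current.headers['ETag']

# 2. PUT the edited HTML back, asserting the document is unchanged.
#    201 = new document created, 205 = existing document revised,
#    412 = someone else edited it first (ETag precondition failed).
response = requests.put(
    doc_url,
    data='<h1>Sandbox</h1><p>Updated content.</p>',
    headers={'Content-Type': 'text/html', 'If-Match': etag},
)
if response.status_code == 412:
    print('Mid-air collision: re-fetch, merge, and retry.')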
Example #14
File: content.py  Project: mozilla/kuma
def get_seo_description(content, locale=None, strip_markup=True):
    # Create an SEO summary
    # TODO:  Google only takes the first 180 characters, so maybe we find a
    #        logical way to find the end of sentence before 180?
    seo_summary = ''
    if content:
        # Try constraining the search for summary to an explicit "Summary"
        # section, if any.
        # This line is ~20x slower than doing the PyQuery analysis.
        # Both `parse()` and `.serialize()` are slow and expensive.
        # That's why we're careful to avoid it if we can.
        if 'Summary' in content:
            summary_section = (parse(content).extractSection('Summary')
                               .serialize())
            if summary_section:
                content = summary_section

        # Need to add a BR to the page content otherwise pyQuery won't find
        # a <p></p> element if it's the only element in the doc_html.

        # Note, PyQuery is magically clumsy in that it will try a download
        # if the first and only argument looks like a URL.
        # It does that by looking for args[0] being a string and
        # containing 'http://' or 'https://'.
        # Adding an empty space, no matter what the content is will fool
        # PyQuery.
        seo_analyze_doc_html = ' ' + content + '<br />'
        page = pq(seo_analyze_doc_html)

        # Look for the SEO summary class first
        summaryClasses = page.find('.seoSummary')
        if len(summaryClasses):
            if strip_markup:
                seo_summary = summaryClasses.text()
            else:
                seo_summary = ''.join(
                    to_html(item) for item in summaryClasses.items())
        else:
            paragraphs = page.find('p')
            if paragraphs.length:
                for p in range(len(paragraphs)):
                    item = paragraphs.eq(p)
                    if strip_markup:
                        text = item.text()
                    else:
                        text = to_html(item)
                    # Checking for a parent length of 2
                    # because we don't want p's wrapped
                    # in DIVs ("<div class='warning'>") and pyQuery adds
                    # "<html><div>" wrapping to entire document
                    text_match = (
                        text and len(text) and
                        'Redirect' not in text and
                        text.find(u'«') == -1 and
                        text.find('&laquo') == -1 and
                        item.parents().length == 2)
                    if text_match:
                        seo_summary = text.strip()
                        break

    if strip_markup:
        # Post-found cleanup
        # remove markup chars
        seo_summary = seo_summary.replace('<', '').replace('>', '')
        # remove spaces around some punctuation added by PyQuery
        if locale == 'en-US':
            seo_summary = re.sub(r' ([,\)\.])', r'\1', seo_summary)
            seo_summary = re.sub(r'(\() ', r'\1', seo_summary)

    return seo_summary
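
The leading-space trick commented above can be shown in isolation; a sketch of the behaviour that comment describes, assuming pq is pyquery.PyQuery:

content = 'https://developer.mozilla.org/'
# Passed as-is, PyQuery would treat this URL-looking string as a document to
# download rather than as markup to parse (per the note above), so the helper
# always prepends a space before handing it over:
page = pq(' ' + content + '<br />')  # parsed as plain markup, no fetch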
Example #15
File: content.py  Project: b-xiang/kuma-1
def get_seo_description(content, locale=None, strip_markup=True):
    # Create an SEO summary
    # TODO:  Google only takes the first 180 characters, so maybe we find a
    #        logical way to find the end of sentence before 180?
    seo_summary = ""
    if content:
        # Try constraining the search for summary to an explicit "Summary"
        # section, if any.
        # This line is ~20x slower than doing the PyQuery analysis.
        # Both `parse()` and `.serialize()` are slow and expensive.
        # That's why we're careful to avoid it if we can.
        if "Summary" in content:
            summary_section = parse(content).extractSection("Summary").serialize()
            if summary_section:
                content = summary_section

        # Need to add a BR to the page content otherwise pyQuery won't find
        # a <p></p> element if it's the only element in the doc_html.
        seo_analyze_doc_html = content + "<br />"
        page = pq(seo_analyze_doc_html)

        # Look for the SEO summary class first
        summaryClasses = page.find(".seoSummary")
        if len(summaryClasses):
            if strip_markup:
                seo_summary = summaryClasses.text()
            else:
                seo_summary = "".join(
                    to_html(item) or "" for item in summaryClasses.items()
                )
        else:
            paragraphs = page.find("p")
            if paragraphs.length:
                for p in range(len(paragraphs)):
                    item = paragraphs.eq(p)
                    if strip_markup:
                        text = item.text()
                    else:
                        text = to_html(item)
                    # Checking for a parent length of 2
                    # because we don't want p's wrapped
                    # in DIVs ("<div class='warning'>") and pyQuery adds
                    # "<html><div>" wrapping to entire document
                    text_match = (
                        text
                        and len(text)
                        and "Redirect" not in text
                        and text.find("«") == -1
                        and text.find("&laquo") == -1
                        and item.parents().length == 2
                    )
                    if text_match:
                        seo_summary = text.strip()
                        break

    if strip_markup:
        # Post-found cleanup
        # remove markup chars
        seo_summary = seo_summary.replace("<", "").replace(">", "")
        # remove spaces around some punctuation added by PyQuery
        if locale == "en-US":
            seo_summary = re.sub(r" ([,\)\.])", r"\1", seo_summary)
            seo_summary = re.sub(r"(\() ", r"\1", seo_summary)

    return seo_summary