Example #1
def fetch_data_from_url(url, content):
    data = {"url": url}
    try:
        readable = Readability(url, content)
        data["title"] = reduce_whitespace(unescape_entities(readable.get_article_title()))
        # Try to get abstract from meta description:
        abstract = reduce_whitespace(unescape_entities(strip_tags(readable.get_meta_description()).strip()))
        if not abstract:
            abstract = reduce_whitespace(unescape_entities(strip_tags(readable.get_article_text()).strip()))
        abstract = truncate_words(abstract, 200)
        data["abstract"] = abstract
    except ReadabilityException:
        pass

    if VIDEO_URL_RE.search(url):
        data["media_formats"] = MediaFormat.objects.filter(name="Video")

    urls = URL_RE.findall(content)
    OLD_CC_LICENCES = [l[0] for l in CC_OLD_LICENSES[1:]]

    for url in urls:
        if CC_LICENSE_URL_RE.match(url):
            url = url.lower()
            if url in OLD_CC_LICENCES:
                data["license_type"] = "cc-old"
                data["license_cc_old"] = url
            else:
                data["license_type"] = "cc"
                data["license_cc"] = url

    return data
Example #2
 def test_unescape_entities(self):
     items = [
         ('', ''),
         ('foo', 'foo'),
         ('&amp;', '&'),
         ('&am;', '&am;'),
         ('&#38;', '&'),
         ('foo &amp; bar', 'foo & bar'),
         ('foo & bar', 'foo & bar'),
     ]
     for value, output in items:
         self.assertEqual(text.unescape_entities(value), output)
         self.assertEqual(text.unescape_entities(lazystr(value)), output)
Example #3
 def test_unescape_entities(self):
     items = [
         ("", ""),
         ("foo", "foo"),
         ("&", "&"),
         ("&", "&"),
         ("&", "&"),
         ("foo & bar", "foo & bar"),
         ("foo & bar", "foo & bar"),
     ]
     for value, output in items:
         self.assertEqual(text.unescape_entities(value), output)
         self.assertEqual(text.unescape_entities(lazystr(value)), output)
Example #4
 def parse_django_error(self):
     """Extract the summary part of a Django HTML error."""
     try:
         summary = self.msg.split(u'<body>\n<div id="summary">\n  ', 1)[1]\
                           .split(u'<th>Python Executable:</th>', 1)[0]
         traceback = self.msg.split(u'\n\nTraceback:', 1)[1]\
                             .split(u'</textarea>', 1)[0]
     except IndexError:
         return self.msg
     result = []
     title = None
     for line in strip_tags(summary).split('\n'):
         line_content = unescape_entities(line.strip())
         if line_content:
             if line_content.endswith(':'):
                 title = line_content
             elif title is None:
                 title = "%s:" % line_content
             else:
                 result.append("%s %s\n" % (title, line_content))
     result.append("Status code: %s" % self.status_code)
     indent, indent2 = u'  ', u'    '
     return u"%(summary)s %(traceback)s".strip() % {
         'summary': indent.join(force_unicode(line) for line in result),
         'traceback': indent2.join(force_unicode(line+"\n") \
                                     for line in traceback.split('\n')),
     }
Example #5
    def sync_to_db(self, decoded_documents):
        """
        Sync the given list of documents (decoded fjson files from sphinx) to
        the database. Deletes all the release's documents first then
        reinserts them as needed.
        """
        self.documents.all().delete()

        # Read excluded paths from robots.docs.txt.
        robots_path = settings.BASE_DIR.joinpath('djangoproject', 'static', 'robots.docs.txt')
        with open(str(robots_path), 'r') as fh:
            excluded_paths = [
                line.strip().split('/')[-1] for line in fh
                if line.startswith("Disallow: /%s/%s/" % (self.lang, self.release_id))
            ]

        for document in decoded_documents:
            if ('body' not in document or 'title' not in document or
                    document['current_page_name'].split('/')[0] in excluded_paths):
                # We don't care about indexing documents with no body or title, or only partially translated ones
                continue

            Document.objects.create(
                release=self,
                path=_clean_document_path(document['current_page_name']),
                title=unescape_entities(strip_tags(document['title'])),
            )
Example #6
 def render(self, context):
     length = self.length
     full_value = self.content_node.render(context)
     value = unescape_entities(full_value)
     if len(value) > length - 3:
         value = value[:length - 3] + '...'
     return '<span title="%s">%s</span>' % (full_value, value)
Example #7
    def parse(self, raw_email):
        """
        Fetches the content of the message and populates the available headers
        """
        body = u''
        html_body = u''
        msg = email.parser.Parser().parsestr(raw_email)

        for part in msg.walk():
            #for key, header in part.items():
            #    self.headers[key.lower()] = clean_header(header)

            payload = part.get_payload(decode=1)
            charset = part.get_content_charset()
            if charset is not None:
                payload = payload.decode(charset)

            if part.get_content_type() == 'text/plain':
                body += payload

            if part.get_content_type() == 'text/html':
                html_body += payload

        if not body:
            body = unescape_entities(strip_tags(html_body))
        self.body = body
Example #8
    def get_description(self):
        object = self.object

        ### Assign variables -----------------------
        site_name = get_setting('site','global','sitedisplayname')
        geo_location = get_setting('site','global','sitegeographiclocation')

        if object.description:
            content = object.description
        else:
            content = ''

        content = strip_tags(content) #strips HTML tags
        content = unescape_entities(content)
        content = content.replace("\n","").replace("\r","")
        content = truncate_words(content, 50) # ~ about 250 chars

        ### Build string -----------------------
        value = object.name

        value = '%s : %s' % (value, content)

        value = '%s Photo Sets for %s, %s' % (
            value, site_name, geo_location)

        value = value.strip()

        return value
Example #9
    def render(self, context):
        try:
            html = unescape_entities(self.nodelist.render(context))
            safe_html = self.sanitize(html)
            top_level_elements = fragments_fromstring(safe_html)
            # TODO: We need to remember to patch in whatever pre-save
            #       HTML processing we eventually do here, too.  E.g.
            #       a spam URL blacklist.
            out = []
            for elem in top_level_elements:
                if elem.tag == "iframe":
                    elem = self._process_iframe(elem)
                out.append(etree.tostring(elem, method="html", encoding="UTF-8"))
            return "".join(out)

        except IFrameSrcNotApproved:
            return (
                '<span class="plugin embed">'
                + _(
                    "The embedded URL is not on the list of approved providers.  "
                    "Contact the site administrator to add it."
                )
                + "</span>"
            )
        except:
            return '<span class="plugin embed">' + _("Invalid embed code") + "</span>"
Example #10
def anchorify(anchor):
    """
    Filter which converts a string into a form suitable for use as an anchor id on an HTML element.

    This is useful when you want the anchor id on a heading to match the heading content, which
    can be an arbitrary string.

    Example usage::

        <h1 id="{{ _("My Blog")|anchorify }}">{% trans "My Blog" %}</h1>

    The result would be::

        <h1 id="my-blog">My Blog</h1>
    """

    try:
        anchor = template.defaultfilters.striptags(anchor)
        anchor = text.unescape_entities(anchor)
        anchor = url_tags.slugify2(anchor)
        if not anchor or not anchor[0].isalpha():
            anchor = 'a' + anchor
        return anchor
    except:
        if settings.DEBUG:
            raise
        else:
            return u''
Example #11
 def save(self, fail_silently=False):
     """
     Build and send the email message.
     """
     body = unescape_entities(self.message()) # convert &quot; back to "", etc.
     msg = EmailMessage(self.subject(), body, self.from_email, self.recipient_list,
         headers={'Reply-To': self.reply_email()})
     msg.send(fail_silently=fail_silently)
Example #12
 def _convert_to_plain(self, value):
     if value:
         value = force_unicode(value)
         text = re.sub('<br[^>]*>', u'\n', value)
         text = unescape_entities(text)
         text = strip_tags(text)
         text = text.strip()
         text = unicodedata.normalize('NFKD', text.lower()).encode('ascii', 'ignore')
         return text
     return ''
Example #13
def add_to_corpus(article_id):
    """
    Retrieve an article from the db, clean it, and add it to the corpus.
    """
    t = Article.objects.get(pk=article_id).content
    t = normalize_text(unescape_entities(t))
    t = t.encode("utf-8")
    t = t.decode("string_escape")
    save_to_file("corpus/%s.txt" % article_id, t)
    print normalize_text(t)
Example #14
 def _prepare_plain_text(self, from_field, to_field):
     original_text = getattr(self, from_field, None)
     if original_text:
         original_text = force_unicode(original_text)
         text = re.sub('<br[^>]*>', u'\n', original_text)
         text = unescape_entities(text)
         text = strip_tags(text)
         text = text.strip()
         setattr(self, to_field, text)
     else:
         setattr(self, to_field, original_text)
Example #15
    def get_description(self):
        object = self.object

        ### Assign variables -----------------------
        primary_keywords = get_setting('site','global','siteprimarykeywords')
        category_set = object.category_set
        category = category_set.get('category', '')
        subcategory = category_set.get('sub_category', '')
        site_name = get_setting('site','global','sitedisplayname')
        geo_location = get_setting('site','global','sitegeographiclocation')
        creator_name = ''
        if object.creator:
            creator_name = '%s %s' % (
                object.creator.first_name,
                object.creator.last_name
            )
        creator_name = creator_name.strip()

        if object.summary:
            content = object.summary
        else:
            content = object.body

        content = strip_tags(content) #strips HTML tags
        content = unescape_entities(content)
        content = content.replace("\n","").replace("\r","")
        content = truncate_words(content, 50) # ~ about 250 chars

        ### Build string -----------------------
        value = object.headline

        if creator_name:
            value = '%s %s' % (value, creator_name)

        value = '%s : %s' % (value, content)

        if primary_keywords:
            value = '%s %s' % (value, primary_keywords)
        else:
            if category:
                value = '%s %s' % (value, category)
            if category and subcategory:
                value = '%s : %s' % (value, subcategory)

            value = '%s directory' % value

        value = '%s Directories for %s %s' % (
            value, site_name, geo_location)

        value = value.strip()

        return value
Example #16
def anchorify(anchor):
  """
  Convert a string into a form suitable for use as an anchor id.
  """
  anchor = defaultfilters.striptags(anchor)
  anchor = text.unescape_entities(anchor)
  for a, b in HEADING_REPLACE:
    anchor = anchor.replace(a, b)
  anchor = defaultfilters.slugify(anchor)
  anchor = DASH_START_END_RE.sub('', anchor)
  if not anchor or not anchor[0].isalpha():
    anchor = 'a' + anchor 
  return anchor
Example #17
def cleanHtml(html):
    r"""
    Returns a text version of html by first removing any text in <blockquote>
    tags and then stripping any other tags and replacing html entities. The
    <blockquote> strip is done for the same reasons as in cleanText().

    >>> cleanHtml('foo\n<blockquote some-attr="some">bar</blockquote>&amp;st')
    'foo\n&st'
    >>> cleanHtml('foo\n<blockquote\nsome-attr="some">bar</blockquote>&amp;st')
    'foo\n&st'
    """
    regex = re.compile(r"<blockquote.*</blockquote>", re.DOTALL)
    html = regex.sub("", html)
    return unescape_entities(strip_tags(html))
Example #18
 def check_fk(self, indexes):
     field = self.get_field('fk', indexes)
     parent = field.find_element_by_xpath('parent::*')
     add_related = parent.find_element_by_css_selector('.add-related')
     add_related.click()
     name = self.get_name_for_indexes(indexes)
     with self.switch_to_popup_window():
         self.set_field('name', name)
         self.save_form()
     time.sleep(0.1)
     field_id = field.get_attribute('id')
     current_val = self.selenium.execute_script(
         'return $("#%s").find("option:selected").html()' % field_id)
     self.assertEqual(unescape_entities(current_val), name)
Example #19
def normalize_text(text, language="french"):
    """
    Normalize text: clean, strip tags, etc.
    Tests needed.
    """
    text = strip_tags(unescape_entities(text))
    text = text.replace(u"’", u"'")
    text = text.replace(u"qu'", u"qu' ")#qu' lorsqu', etc.
    text = re.sub(ur'(")([^ \n\.,!?]){1}', u"\xab\g<2>", text, flags=re.U)#replacing opening quotes
    text = re.sub(ur'([^ \n]){1}(")', u"\g<1>\xbb", text, flags=re.U)#replacing closing quotes
    #Replacing inverted pronouns.
    text = re.sub(ur"\-t\-", u" - t - ", text, flags=re.U)
    text = re.sub(ur"\-(je|moi|tu|toi|il|le|elle|la|on|nous|vous|ils|elles|les|ci|là)([\W])", u" - \g<1>\g<2>", text, flags=re.U)
    return text
Example #20
    def get_description(self):
        object = self.object

        ### Assign variables -----------------------
        primary_keywords = get_setting('site', 'global', 'siteprimarykeywords')
        category_set = object.category_set
        category = category_set.get('category', '')
        subcategory = category_set.get('sub_category', '')
        site_name = get_setting('site', 'global', 'sitedisplayname')
        geo_location = get_setting('site', 'global', 'sitegeographiclocation')
        creator_name = ''
        if object.creator:
            creator_name = '%s %s' % (object.creator.first_name,
                                      object.creator.last_name)
        creator_name = creator_name.strip()

        if object.summary:
            content = object.summary
        else:
            content = object.body

        content = strip_tags(content)  #strips HTML tags
        content = unescape_entities(content)
        content = content.replace("\n", "").replace("\r", "")
        content = truncate_words(content, 50)  # ~ about 250 chars

        ### Build string -----------------------
        value = object.headline

        if creator_name:
            value = '%s %s' % (value, creator_name)

        value = '%s : %s' % (value, content)

        if primary_keywords:
            value = '%s %s' % (value, primary_keywords)
        else:
            if category:
                value = '%s %s' % (value, category)
            if category and subcategory:
                value = '%s : %s' % (value, subcategory)

            value = '%s directory' % value

        value = '%s Directories for %s %s' % (value, site_name, geo_location)

        value = value.strip()

        return value
Example #21
    def _get_story_data(cls, story, site=None):

        url = story.get_absolute_url(site=site)
        preview_url = get_preview_url(story) or url

        # See http://codex.wordpress.org/Post_Status_Transitions
        if story.is_published:
            if story.pub_date > datetime.datetime.now():
                post_status = 'future'
            else:
                post_status = 'publish'
        else:
            post_status = 'draft'

        # unescaping as inlines are escaped.
        story_body = unescape_entities(story.raw_body)

        # Add media image items as HTML in the story body.
        # They'll get converted back when saving the story.
        images = list(story.images.all())
        videos = list(story.videos.all())
        story_body = cls._create_media_html(images, videos) + story_body

        return {
            'dateCreated': DateTime(story.pub_date),
            'userid': str(story.author.id),
            'postid': str(story.id),
            'description': story_body,
            'title': story.headline,
            'link': url,
            'permaLink': preview_url,
            'categories': [smart_unicode(cat) for cat in story.categories.all()],
            'mt_excerpt': story.get_short_summary(),
            'mt_text_more': '',
            'wp_more_text': '',
            'mt_allow_comments': int(story.comments.enabled),
            'mt_allow_pings': 0,
            'mt_keywords': ', '.join((smart_unicode(tag) for tag in story.tags)),
            'wp_slug': story.slug,
            'wp_password': '',
            'wp_author_id': str(story.author.id),
            'wp_author_display_name': story.author.username,
            'date_created_gmt': DateTime(to_gmt(story.pub_date)),
            'post_status': post_status,
            'custom_fields': [],
            'wp_post_format': 'standard',
            'date_modified': DateTime(story.updated_date or story.pub_date),
            'date_modified_gmt': DateTime(to_gmt(story.updated_date or story.pub_date)),
        }
Example #22
 def clean(self):
     cleaned_data = super().clean()
     if cleaned_data['open_tag'] in ('if', 'elif'):
         if not cleaned_data['condition']:
             raise ValidationError(
                 _("The evaluation condition is missing or empty."))
         try:
             condition = unescape_entities(cleaned_data['condition'])
             engines['django'].from_string(
                 self.eval_template_string.format(condition))
         except TemplateSyntaxError as err:
             raise ValidationError(
                 _("Unable to evaluate condition: {}").format(str(err)))
     elif cleaned_data['open_tag'] == 'else':
         cleaned_data['condition'] = ''  # empty condition for else-block
     return cleaned_data
Example #23
    def get_description(self):
        object = self.object

        ### Assign variables -----------------------
        primary_keywords = get_setting("site", "global", "siteprimarykeywords")
        category_set = object.category_set
        category = category_set.get("category", "")
        subcategory = category_set.get("sub_category", "")
        site_name = get_setting("site", "global", "sitedisplayname")
        geo_location = get_setting("site", "global", "sitegeographiclocation")
        creator_name = "%s %s" % (object.creator.first_name, object.creator.last_name)
        creator_name = creator_name.strip()

        if object.summary:
            content = object.summary
        else:
            content = object.body

        content = strip_tags(content)  # strips HTML tags
        content = unescape_entities(content)
        content = content.replace("\n", "").replace("\r", "")
        content = truncate_words(content, 50)  # ~ about 250 chars

        ### Build string -----------------------
        value = object.headline

        if creator_name:
            value = "%s %s" % (value, creator_name)

        value = "%s : %s" % (value, content)

        if primary_keywords:
            value = "%s %s" % (value, primary_keywords)
        else:
            if category:
                value = "%s %s" % (value, category)
            if category and subcategory:
                value = "%s : %s" % (value, subcategory)

            value = "%s article" % value

        value = "%s Articles and White Papers for %s %s" % (value, site_name, geo_location)

        value = value.strip()

        return value
Example #24
    def sync_to_db(self, decoded_documents):
        """
        Sync the given list of documents (decoded fjson files from sphinx) to
        the database. Deletes all the release's documents first then
        reinserts them as needed.
        """
        self.documents.all().delete()

        for document in decoded_documents:
            if 'body' not in document or 'title' not in document:
                # We don't care about indexing documents with no body or title
                continue

            Document.objects.create(
                release=self,
                path=_clean_document_path(document['current_page_name']),
                title=unescape_entities(strip_tags(document['title'])),
            )
Example #25
    def sync_to_db(self, decoded_documents):
        """
        Sync the given list of documents (decoded fjson files from sphinx) to
        the database. Deletes all the release's documents first then
        reinserts them as needed.
        """
        self.documents.all().delete()

        for document in decoded_documents:
            if 'body' not in document or 'title' not in document:
                # We don't care about indexing documents with no body or title
                continue

            Document.objects.create(
                release=self,
                path=_clean_document_path(document['current_page_name']),
                title=unescape_entities(strip_tags(document['title'])),
            )
Example #26
 def check_fk(self, indexes):
     field = self.get_field('fk1', indexes)
     parent = field.find_element_by_xpath('parent::*')
     add_related = parent.find_element_by_css_selector('.add-related')
     if self.has_grappelli:
         # Grappelli can be very slow to initialize fk bindings, particularly
         # when run on travis-ci
         time.sleep(1)
     self.click(add_related)
     name = self.get_name_for_indexes(indexes)
     with self.switch_to_popup_window():
         self.set_field('name', name)
         self.save_form()
     time.sleep(0.1)
     field_id = field.get_attribute('id')
     current_val = self.selenium.execute_script(
         'return $("#%s").find("option:selected").html()' % field_id)
     self.assertEqual(unescape_entities(current_val), name)
Example #27
 def check_fk(self, indexes):
     field = self.get_field('fk1', indexes)
     parent = field.find_element_by_xpath('parent::*')
     add_related = parent.find_element_by_css_selector('.add-related')
     if self.has_grappelli:
         # Grappelli can be very slow to initialize fk bindings, particularly
         # when run on travis-ci
         time.sleep(1)
     add_related.click()
     name = self.get_name_for_indexes(indexes)
     with self.switch_to_popup_window():
         self.set_field('name', name)
         self.save_form()
     time.sleep(0.1)
     field_id = field.get_attribute('id')
     current_val = self.selenium.execute_script(
         'return $("#%s").find("option:selected").html()' % field_id)
     self.assertEqual(unescape_entities(current_val), name)
Example #28
def html2tex_bs4(el):
    result = []
    if isinstance(el, NavigableString):
        return str(el)
    for sel in el.children:
        if isinstance(sel, NavigableString):
            result.append(str(sel))
        ## Span styling
        elif sel.name in ["span"]:
            for att in list(sel.attrs.keys()):
                if att == 'style':
                    if 'font-style:italic' in sel.attrs[att]:
                        result.append(u'\\textit{%s}' % (html2tex_bs4(sel)))
                    elif 'font-weight:bold' in sel.attrs[att]:
                        result.append(u'\\textbf{%s}' % (html2tex(sel)))
                elif att == 'class' and 'math-tex' in sel.attrs[att]:
                    if sel.string is not None and sel.string[:2] == '\(':
                        if len(sel.contents) > 1:
                            print('WARNING:', 'Math with nested tags!!')
                            print(sel)
                        result.append(unescape_entities(sel.string))
                elif att == 'class' and 'lang-ltr' in sel.attrs[att]:
                    result.append(u'\\textenglish{%s}' % (html2tex_bs4(sel)))
        ## Bold
        elif sel.name in ["b", "strong"]:
            result.append(u'\\textbf{%s}' % (html2tex_bs4(sel)))
        ## Italic
        elif sel.name in ["i"]:
            result.append(u'\\textit{%s}' % (html2tex_bs4(sel)))
        ## Emph
        elif sel.name in ["em"]:
            result.append(u'\\emph{%s}' % (html2tex_bs4(sel)))
        ## Underline
        elif sel.name in ["u"]:
            result.append(u'\\underline{%s}' % (html2tex_bs4(sel)))
        ## English in RTL
        elif 'dir' in sel.attrs and sel.attrs['dir'] == 'ltr':
            result.append(u'\\begin{english}\n%s\n\\end{english}' % (html2tex_bs4(sel)))

        ## By default just append content
        else:
            result.append(html2tex_bs4(sel))
    return u"".join(result)
Example #29
def html_to_text(html):
    """
    Return formatted text from an HTML source (keeping words separated and
    complete, and preserving paragraphs and new lines). The output is meant to
    be suitable for word and phrase searches.
    """
    text = force_unicode(html)
    text = text.strip()
    if text:
        # Format the HTML source to output readable/searchable text
        text = _re_newline_in_text.sub('\g<1> \g<2>', text)
        text = text.replace('\n', '')
        text = _re_space.sub('\g<1> \g<2>', text)
        text = _re_newline.sub('\g<0>\n', text)
        text = _re_2newlines.sub('\g<0>\n\n', text)
        text = _re_strip_html.sub('', text)
        text = _re_strip_newlines.sub('\n\n', text.strip())
        text = _re_strip_spaces.sub(' ', text)
        text = unescape_entities(text)
        text = text.replace('&amp;', '&')
    return text
Example #30
def html_to_text(html):
    """
    Return formatted text from an HTML source (keeping words separated and
    complete, and preserving paragraphs and new lines). The output is meant to
    be suitable for word and phrase searches.
    """
    text = force_unicode(html)
    text = text.strip()
    if text:
        # Format the HTML source to output readable/searchable text
        text = _re_newline_in_text.sub('\g<1> \g<2>', text)
        text = text.replace('\n', '')
        text = _re_space.sub('\g<1> \g<2>', text)
        text = _re_newline.sub('\g<0>\n', text)
        text = _re_2newlines.sub('\g<0>\n\n', text)
        text = _re_strip_html.sub('', text)
        text = _re_strip_newlines.sub('\n\n', text.strip())
        text = _re_strip_spaces.sub(' ', text)
        text = unescape_entities(text)
        text = text.replace('&amp;', '&')
    return text
Example #31
 def from_django(cls, obj):
     # turns HTML entities into unicode characters again and removes
     # all HTML tags, i.e. the "plain text" version of the document
     content = strip_tags(unescape_entities(obj.body).replace(u'¶', ''))
     doc = cls(path=obj.path,
               title=obj.title,
               content=content,
               meta={'id': obj.id})
     doc.release = {
         'id': obj.release.id,
         'lang': obj.release.lang,
         'version': obj.release.version,
     }
     breadcrumbs = []
     for breadcrumb in cls.model.objects.breadcrumbs(obj):
         breadcrumbs.append({
             'title': breadcrumb.title,
             'path': breadcrumb.path,
         })
     doc.breadcrumbs = breadcrumbs
     return doc
Example #32
 def from_django(cls, obj):
     # turns HTML entities into unicode characters again and removes
     # all HTML tags, i.e. the "plain text" version of the document
     content = strip_tags(unescape_entities(obj.body).replace(u'¶', ''))
     doc = cls(path=obj.path,
               title=obj.title,
               content=content,
               meta={'id': obj.id})
     doc.release = {
         'id': obj.release.id,
         'lang': obj.release.lang,
         'version': obj.release.version,
     }
     breadcrumbs = []
     for breadcrumb in cls.model.objects.breadcrumbs(obj):
         breadcrumbs.append({
             'title': breadcrumb.title,
             'path': breadcrumb.path,
         })
     doc.breadcrumbs = breadcrumbs
     return doc
Example #33
def importFromWp():

    for tmp in Post.objects.all():
        tmp.delete()

    xmlFile = os.getcwd() + "/static/wordpress.2010-06-25.xml"
    dateFmt = "%a, %d %b %Y %H:%M:%S +0000"

    rss = parse(xmlFile)

    for post in rss.getElementsByTagName("item"):

        content = post.getElementsByTagName("content:encoded")[0]

        if content.childNodes.length > 0:

            name = getInnerText(post, "dc:creator")
            dateStr = getInnerText(post, "pubDate")
            date = datetime.datetime.strptime(dateStr, dateFmt)
            slug = urlparse(getInnerText(post, "link")).path.split("/")[-2]
            body = unescape_entities(content.firstChild.wholeText)

            soup = BeautifulSoup(body)
            for img in soup.findAll('img'):
                if re.match("http://techmeetup.co.uk/", img['src']):
                    imgsrc = img["src"]
                    filename = imgsrc.split("/")[-1]
                    img["src"] = "/static/img/wp/" + filename
                    outpath = os.path.join("./static/img/wp", filename)
                    body = str(soup)
                    if not os.path.exists(outpath):
                        print "fetching: %s" % (img["src"])
                        urlretrieve(imgsrc, outpath)

            Post(author=getOrCreate(name),
                 title=getInnerText(post, "title"),
                 slug=slug,
                 body=body,
                 created=date,
                 updated=date).save()
Example #34
def importFromWp(): 

    for tmp in Post.objects.all():
        tmp.delete()

    xmlFile = os.getcwd() + "/static/wordpress.2010-06-25.xml"
    dateFmt = "%a, %d %b %Y %H:%M:%S +0000"

    rss = parse(xmlFile)

    for post in rss.getElementsByTagName("item"):

        content = post.getElementsByTagName("content:encoded")[0]

        if content.childNodes.length > 0:

            name    = getInnerText(post, "dc:creator")
            dateStr = getInnerText(post, "pubDate")
            date    = datetime.datetime.strptime(dateStr, dateFmt)
            slug    = urlparse(getInnerText(post, "link")).path.split("/")[-2]
            body    = unescape_entities(content.firstChild.wholeText)

            soup = BeautifulSoup(body)
            for img in soup.findAll('img'):
                if re.match("http://techmeetup.co.uk/", img['src']):
                    imgsrc = img["src"]
                    filename = imgsrc.split("/")[-1]
                    img["src"] = "/static/img/wp/" + filename
                    outpath = os.path.join("./static/img/wp", filename)
                    body = str(soup)
                    if not os.path.exists(outpath): 
                        print "fetching: %s" % (img["src"])
                        urlretrieve(imgsrc, outpath)

            Post( author  = getOrCreate(name),
                  title   = getInnerText(post, "title"), 
                  slug    = slug, 
                  body    = body,
                  created = date, 
                  updated = date ).save()
Example #35
    def get_description(self):
        object = self.object

        ### Assign variables -----------------------
        primary_keywords = get_setting('site','global','siteprimarykeywords')
        category_set = object.category_set
        category = ', '.join([cat.name for cat in object.cats.all()])
        subcategory = ', '.join([sub_cat.name for sub_cat in object.sub_cats.all()])
        site_name = get_setting('site','global','sitedisplayname')
        geo_location = get_setting('site','global','sitegeographiclocation')

        if object.summary:
            content = object.summary
        else:
            content = object.body

        content = strip_tags(content) #strips HTML tags
        content = unescape_entities(content)
        content = content.replace("\n","").replace("\r","")
        content = truncate_words(content, 50) # ~ about 250 chars

        ### Build string -----------------------
        value = object.headline

        value = '%s : %s' % (value, content)

        if primary_keywords:
            value = '%s %s' % (value, primary_keywords)
        else:
            if category:
                value = '%s %s' % (value, category)
            if category and subcategory:
                value = '%s : %s' % (value, subcategory)

        value = '%s Directories for %s %s' % (
            value, site_name, geo_location)

        value = value.strip()

        return value
Example #36
    def sync_to_db(self, decoded_documents):
        """
        Sync the given list of documents (decoded fjson files from sphinx) to
        the database. Deletes all the release's documents first then
        reinserts them as needed.
        """
        self.documents.all().delete()

        # Read excluded paths from robots.docs.txt.
        robots_path = settings.BASE_DIR.joinpath('djangoproject', 'static',
                                                 'robots.docs.txt')
        with open(str(robots_path), 'r') as fh:
            excluded_paths = [
                line.strip().split('/')[-1] for line in fh
                if line.startswith("Disallow: /%s/%s/" %
                                   (self.lang, self.release_id))
            ]

        for document in decoded_documents:
            if ('body' not in document or 'title' not in document
                    or document['current_page_name'].split('/')[0]
                    in excluded_paths):
                # We don't care about indexing documents with no body or title, or only partially translated ones
                continue

            document_path = _clean_document_path(document['current_page_name'])
            document['slug'] = Path(document_path).parts[-1]
            document['parents'] = ' '.join(Path(document_path).parts[:-1])
            Document.objects.create(
                release=self,
                path=document_path,
                title=unescape_entities(strip_tags(document['title'])),
                metadata=document,
                config=TSEARCH_CONFIG_LANGUAGES.get(
                    self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG),
            )
        for document in self.documents.all():
            document.metadata['breadcrumbs'] = list(
                Document.objects.breadcrumbs(document).values('title', 'path'))
            document.save(update_fields=('metadata', ))
Example #37
    def render(self, context):
        try:
            html = unescape_entities(self.nodelist.render(context))
            safe_html = self.sanitize(html)
            top_level_elements = fragments_fromstring(safe_html)
            # TODO: We need to remember to patch in whatever pre-save
            #       HTML processing we eventually do here, too.  E.g.
            #       a spam URL blacklist.
            out = []
            for elem in top_level_elements:
                if elem.tag == 'iframe':
                    elem = self._process_iframe(elem)
                out.append(
                    etree.tostring(elem, method='html', encoding='UTF-8'))
            return ''.join(out)

        except IFrameSrcNotApproved:
            return ('<span class="plugin embed">' + _(
                'The embedded URL is not on the list of approved providers.  '
                'Contact the site administrator to add it.') + '</span>')
        except:
            return '<span class="plugin embed">' + _(
                'Invalid embed code') + '</span>'
Example #38
    def get_description(self):
        object = self.object

        ### Assign variables -----------------------  
        category = Category.objects.get_for_object(object, 'category')
        subcategory = Category.objects.get_for_object(object, 'subcategory')
        site_name = get_setting('site','global','sitedisplayname')
        geo_location = get_setting('site','global','sitegeographiclocation')
       
        content = object.content

        content = strip_tags(content) #strips HTML tags
        content = unescape_entities(content)
        content = content.replace("\n","").replace("\r","")
        content = truncate_words(content, 50) # ~ about 250 chars

        ### Build string -----------------------
        value = object.title

        value = '%s - %s' % (value, content)

        if site_name:
            value = '%s %s' % (value, site_name)
        else:
            if category:
                value = '%s, %s' % (value, category)
            if category and subcategory:
                value = '%s, %s' % (value, subcategory)

            value = '%s ' % value

        value = '%s %s %s' % (
            value, site_name, geo_location)

        value = value.strip()

        return value
Example #39
def generate_meta_keywords(value):
    """
        Take any string, remove the HTML and HTML entities,
        then run a keyword density analyzer on the text
        to decide the 20 best one-word and two-word
        key phrases
    """
    try:
        from re import compile
        from operator import itemgetter

        from django.utils.text import unescape_entities
        from django.utils.translation import ugettext_lazy as _

        # translate the stop words
        TR_STOP_WORDS = _(' '.join(STOP_WORDS))
        TR_STOP_WORDS = TR_STOP_WORDS.split()

        # get rid of the html tags
        value = strip_tags(value)

        # get rid of the html entities
        value = unescape_entities(value)

        # lower case the value
        value = value.lower()

        # get the one word, two word, and three word patterns
        one_word_pattern = compile(r'\s*(\w+[a-zA-Z0-9:\-]*\w*(\'\w{1,2})?)')
        two_word_pattern = compile(
            r'\s*(\w+[a-zA-Z0-9:\-]*\w*(\'\w{1,2})?)(\s+|_)(\w+[a-zA-Z0-9:\-]*\w*(\'\w{1,2})?)'
        )

        # get the length of the value
        value_length = len(value)

        # make a list of one words
        search_end = 0
        one_words = []
        while search_end < value_length:
            s = one_word_pattern.search(value, search_end)
            if s:
                one_word = s.group(1)
                # remove the : from the word
                if one_word[-1] == ':':
                    one_word = one_word[:-1]

                one_words.append(one_word)
                search_end = s.end()
            else:
                break

        # remove the stop words
        one_words = [word for word in one_words if word not in TR_STOP_WORDS]

        # get the density, and word into a tuple
        one_words_length = len(one_words)
        unique_words = set(word for word in one_words)
        one_words = [
            (word, round((one_words.count(word) * 100.00 / one_words_length),
                         2)) for word in unique_words
        ]

        # sort the tuple by the density
        one_words = sorted(one_words, key=itemgetter(1), reverse=True)

        # get the 10 best keywords
        one_words = [word[0] for word in one_words[:10]]

        # make a list of two words phrases without stop phrases
        search_end = 0
        two_words = []
        while search_end < value_length:
            s = two_word_pattern.search(value, search_end)
            if s:
                word1 = s.group(1)
                word2 = s.group(4)
                # remove the : from the words
                if word1[-1] == ':':
                    word1 = word1[:-1]
                if word2[-1] == ':':
                    word2 = word2[:-1]

                if word1 not in TR_STOP_WORDS:
                    if word2 not in TR_STOP_WORDS:
                        two_word = word1 + ' ' + word2
                        two_words.append(two_word)

                search_start = s.start()
                next_search = one_word_pattern.search(value, search_start)
                search_end = next_search.end()
            else:
                # if no match, advance a word
                s = one_word_pattern.search(value, search_end)
                if s:
                    search_end = s.end()
                else:
                    search_end = value_length

        # get the density, and word into a tuple
        two_words_length = len(two_words)
        unique_words = set(words for words in two_words)
        two_words = [(words,
                      round(
                          (two_words.count(words) * 100.00 / two_words_length),
                          2)) for words in unique_words]

        # sort the tuple by the density
        two_words = sorted(two_words, key=itemgetter(1), reverse=True)

        # get the best 2 word keywords
        two_words = [word[0] for word in two_words[:10]]

        # add the two lists together
        keywords = two_words + one_words

        return ','.join(keywords)
    except AttributeError:
        return ''
Example #40
 def test_unescape_entities_deprecated(self):
     msg = (
         'django.utils.text.unescape_entities() is deprecated in favor of '
         'html.unescape().')
     with self.assertWarnsMessage(RemovedInDjango40Warning, msg):
         text.unescape_entities('foo')
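
The deprecation message above points at the standard-library replacement. A minimal migration sketch, assuming plain Python 3 and no Django-specific behavior, could look like this:

from html import unescape

# html.unescape() covers the same inputs as the deprecated helper:
# named entities plus decimal and hexadecimal character references.
assert unescape('foo &amp; bar') == 'foo & bar'
assert unescape('&#38;') == '&'
assert unescape('&am;') == '&am;'  # unknown entities are left untouched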
Example #41
    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Return a tuple containing the POST and FILES dictionary, respectively.
        """
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        # HTTP spec says that Content-Length >= 0 is valid
        # handling content-length == 0 before continuing
        if self._content_length == 0:
            return QueryDict(encoding=self._encoding), MultiValueDict()

        # See if any of the handlers take care of the parsing.
        # This allows overriding everything if need be.
        for handler in handlers:
            result = handler.handle_raw_input(
                self._input_data,
                self._meta,
                self._content_length,
                self._boundary,
                encoding,
            )
            # Check to see if it was handled
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict(mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        # Number of bytes that have been read.
        num_bytes_read = 0
        # To count the number of keys in the request.
        num_post_keys = 0
        # To limit the amount of data read from the request.
        read_size = None

        try:
            for item_type, meta_data, field_stream in Parser(
                    stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()
                field_name = force_text(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FIELDS.
                    num_post_keys += 1
                    if (settings.DATA_UPLOAD_MAX_NUMBER_FIELDS is not None
                            and settings.DATA_UPLOAD_MAX_NUMBER_FIELDS <
                            num_post_keys):
                        raise TooManyFieldsSent(
                            'The number of GET/POST parameters exceeded '
                            'settings.DATA_UPLOAD_MAX_NUMBER_FIELDS.')

                    # Avoid reading more than DATA_UPLOAD_MAX_MEMORY_SIZE.
                    if settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None:
                        read_size = settings.DATA_UPLOAD_MAX_MEMORY_SIZE - num_bytes_read

                    # This is a post field, we can just set it in the post
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read(size=read_size)
                        num_bytes_read += len(raw_data)
                        try:
                            data = base64.b64decode(raw_data)
                        except binascii.Error:
                            data = raw_data
                    else:
                        data = field_stream.read(size=read_size)
                        num_bytes_read += len(data)

                    # Add two here to make the check consistent with the
                    # x-www-form-urlencoded check that includes '&='.
                    num_bytes_read += len(field_name) + 2
                    if (settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None
                            and num_bytes_read >
                            settings.DATA_UPLOAD_MAX_MEMORY_SIZE):
                        raise RequestDataTooBig(
                            'Request body exceeded settings.DATA_UPLOAD_MAX_MEMORY_SIZE.'
                        )

                    self._post.appendlist(
                        field_name, force_text(data,
                                               encoding,
                                               errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if file_name:
                        file_name = force_text(file_name,
                                               encoding,
                                               errors='replace')
                        file_name = self.IE_sanitize(
                            unescape_entities(file_name))
                    if not file_name:
                        continue

                    content_type, content_type_extra = meta_data.get(
                        'content-type', ('', {}))
                    content_type = content_type.strip()
                    charset = content_type_extra.get('charset')

                    try:
                        content_length = int(
                            meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(
                                    field_name,
                                    file_name,
                                    content_type,
                                    content_length,
                                    charset,
                                    content_type_extra,
                                )
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding
                                # We should always decode base64 chunks by multiple of 4,
                                # ignoring whitespace.

                                stripped_chunk = b"".join(chunk.split())

                                remaining = len(stripped_chunk) % 4
                                while remaining != 0:
                                    over_chunk = field_stream.read(4 -
                                                                   remaining)
                                    stripped_chunk += b"".join(
                                        over_chunk.split())
                                    remaining = len(stripped_chunk) % 4

                                try:
                                    chunk = base64.b64decode(stripped_chunk)
                                except Exception as exc:
                                    # Since this is only a chunk, any error is an unfixable error.
                                    raise MultiPartParserError(
                                        "Could not decode base64 data."
                                    ) from exc

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(
                                    chunk, counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # Don't continue if the chunk received by
                                    # the handler is None.
                                    break

                    except SkipFile:
                        self._close_files()
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD or a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload as e:
            self._close_files()
            if not e.connection_reset:
                exhaust(self._input_data)
        else:
            # Make sure that the request data is all fed
            exhaust(self._input_data)

        # Signal that the upload has completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        self._post._mutable = False
        return self._post, self._files
Example #42
 def content_raw(self):
     return strip_tags(
         unescape_entities(self.metadata['content']).replace(u'¶', ''))
Example #43
    def handle_noargs(self, **kwargs):
        try:
            verbosity = int(kwargs['verbosity'])
        except (KeyError, TypeError, ValueError):
            verbosity = 1

        # Somehow, bizarrely, there's a bug in Sphinx such that if I try to
        # build 1.0 before other versions, things fail in weird ways. However,
        # building newer versions first works. I suspect Sphinx is hanging onto
        # some global state. Anyway, we can work around it by making sure that
        # "dev" builds before "1.0". This is ugly, but oh well.
        for release in DocumentRelease.objects.order_by('-version'):
            self.UncompressHTML(release)

            if verbosity >= 1:
                print "Updating %s..." % release

            zipfilename = os.path.join(settings.OPENRAVE_DOCUMENT_ROOT_PATH,
                                       'openravejson-%s.zip' % release.version)
            if not os.path.exists(zipfilename):
                print 'failed to find zipfile', zipfilename
                continue

            zipfiledir = os.path.splitext(zipfilename)[0]
            docsdir = os.path.join(zipfiledir, release.lang, 'sphinxjson')

            douncompress = True
            if os.path.exists(zipfiledir) and os.path.exists(docsdir):
                # check if timestamps of zipfile and dir match
                douncompress = os.stat(zipfiledir).st_mtime < os.stat(
                    zipfilename).st_mtime

            if douncompress:
                print 'uncompressing', zipfilename
                try:
                    zf = zipfile.ZipFile(zipfilename, 'r')
                except IOError, e:
                    print e
                    continue

                for files in zf.namelist():
                    zf.extract(files, settings.OPENRAVE_DOCUMENT_ROOT_PATH)
                zf.close()
                # have to touch zipfiledir in case the zip file did not overwrite its timestamp
                os.utime(zipfiledir, None)

            # check if the language exists
            if not os.path.exists(docsdir):
                print 'language dir does not exist', docsdir
                continue

            #
            # Rebuild the imported document list and search index.
            #
            if not kwargs['reindex']:
                continue

            if verbosity >= 2:
                print "  reindexing...", release.version

            # Build a dict of {path_fragment: document_object}. We'll pop values
            # out of this dict as we go which'll make sure we know which
            # remaining documents need to be deleted (and unindexed) later on.
            documents = dict(
                (doc.path, doc) for doc in release.documents.all())

            # Walk the tree we've just built looking for ".fjson" documents
            # (just JSON, but Sphinx names them weirdly). Each one of those
            # documents gets a corresponding Document object created which
            # we'll then ask Sphinx to reindex.
            #
            # We have to be a bit careful to reverse-engineer the correct
            # relative path component, especially for "index" documents,
            # otherwise the search results will be incorrect.
            for dirpath, dirnames, filenames in os.walk(docsdir):
                for filename in filenames:
                    basename, ext = os.path.splitext(filename)
                    if ext == '.fjson':
                        # Convert into a relative path for inclusion into the model
                        if basename == 'index':
                            path = os.path.normpath(
                                os.path.relpath(dirpath, docsdir))
                        else:
                            path = os.path.normpath(
                                os.path.relpath(
                                    os.path.join(dirpath, basename), docsdir))
                        with open(os.path.join(dirpath, filename)) as fp:
                            json_doc = json.load(fp)
                            try:
                                json_doc['body']  # Just to make sure it exists.
                                title = unescape_entities(
                                    strip_tags(json_doc['title']))
                            except KeyError, ex:
                                if verbosity >= 2:
                                    print "Skipping: %s (no %s)" % (path,
                                                                    ex.args[0])
                                continue

                        doc = documents.pop(
                            path, Document(path=path, release=release))
                        doc.title = title
                        doc.save()
Example #44
    def _get_story_data(cls, story, site=None):

        url = story.get_absolute_url(site=site)
        preview_url = get_preview_url(story) or url

        # See http://codex.wordpress.org/Post_Status_Transitions
        if story.is_published:
            if story.pub_date > datetime.datetime.now():
                post_status = 'future'
            else:
                post_status = 'publish'
        else:
            post_status = 'draft'

        # unescaping as inlines are escaped.
        story_body = unescape_entities(story.raw_body)

        # Add media image items as HTML in the story body.
        # They'll get converted back when saving the story.
        images = list(story.images.all())
        videos = list(story.videos.all())
        story_body = cls._create_media_html(images, videos) + story_body

        return {
            'dateCreated': DateTime(story.pub_date),
            'userid': str(story.author.id),
            'postid': str(story.id),
            'description': story_body,
            'title': story.headline,
            'link': url,
            'permaLink': preview_url,
            'categories': [smart_unicode(cat) for cat in story.categories.all()],
            'mt_excerpt': story.get_short_summary(),
            'mt_text_more': '',
            'wp_more_text': '',
            'mt_allow_comments': int(story.comments.enabled),
            'mt_allow_pings': 0,
            'mt_keywords': ', '.join(smart_unicode(tag) for tag in story.tags),
            'wp_slug': story.slug,
            'wp_password': '',
            'wp_author_id': str(story.author.id),
            'wp_author_display_name': story.author.username,
            'date_created_gmt': DateTime(to_gmt(story.pub_date)),
            'post_status': post_status,
            'custom_fields': [],
            'wp_post_format': 'standard',
            'date_modified': DateTime(story.updated_date or story.pub_date),
            'date_modified_gmt': DateTime(to_gmt(story.updated_date or story.pub_date)),
        }
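The post_status mapping in the snippet is compact enough to isolate. Below is a minimal, self-contained sketch of the same rule; the helper name and arguments are hypothetical and only serve to illustrate the WordPress-style states:

import datetime

def wp_post_status(is_published, pub_date, now=None):
    # Published stories dated in the future are scheduled ('future'),
    # other published stories are live ('publish'), everything else is a draft.
    now = now or datetime.datetime.now()
    if not is_published:
        return 'draft'
    return 'future' if pub_date > now else 'publish'

now = datetime.datetime.now()
print(wp_post_status(True, now + datetime.timedelta(days=1)))   # future
print(wp_post_status(True, now - datetime.timedelta(days=1)))   # publish
print(wp_post_status(False, now))                                # draft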
Example #45
    def handle(self, **kwargs):
        try:
            verbosity = int(kwargs['verbosity'])
        except (KeyError, TypeError, ValueError):
            verbosity = 1

        default_builders = ['json', 'html']

        # Somehow, bizarrely, there's a bug in Sphinx such that if I try to
        # build 1.0 before other versions, things fail in weird ways. However,
        # building newer versions first works. I suspect Sphinx is hanging onto
        # some global state. Anyway, we can work around it by making sure that
        # "dev" builds before "1.0". This is ugly, but oh well.
        for release in DocumentRelease.objects.order_by('-release'):
            if verbosity >= 1:
                self.stdout.write("Updating %s..." % release)

            # checkout_dir is shared for all languages.
            checkout_dir = settings.DOCS_BUILD_ROOT.joinpath(release.version)
            parent_build_dir = settings.DOCS_BUILD_ROOT.joinpath(
                release.lang, release.version)
            if not checkout_dir.exists():
                checkout_dir.mkdir(parents=True)
            if not parent_build_dir.exists():
                parent_build_dir.mkdir(parents=True)

            #
            # Update the release from SCM.
            #

            # Make a git checkout/update into the destination directory.
            self.update_git(release.scm_url, checkout_dir)

            source_dir = checkout_dir.joinpath('docs')

            if release.lang != 'en':
                scm_url = release.scm_url.replace(
                    'django.git', 'django-docs-translations.git')
                trans_dir = checkout_dir.joinpath('django-docs-translation')
                if not trans_dir.exists():
                    trans_dir.mkdir()
                self.update_git(scm_url, trans_dir)
                if not source_dir.joinpath('locale').exists():
                    source_dir.joinpath('locale').symlink_to(
                        trans_dir.joinpath('translations'))
                subprocess.call("cd %s && make translations" % trans_dir,
                                shell=True)

            if release.is_default:
                # Build the pot files (later retrieved by Transifex)
                builders = default_builders[:] + ['gettext']
            else:
                builders = default_builders

            #
            # Use Sphinx to build the release docs into JSON and HTML documents.
            #
            for builder in builders:
                # Wipe and re-create the build directory. See #18930.
                build_dir = parent_build_dir.joinpath('_build', builder)
                if build_dir.exists():
                    shutil.rmtree(str(build_dir))
                build_dir.mkdir(parents=True)

                if verbosity >= 2:
                    self.stdout.write("  building %s (%s -> %s)" %
                                      (builder, source_dir, build_dir))
                subprocess.check_call([
                    'sphinx-build',
                    '-j',
                    '4',
                    '-b',
                    builder,
                    '-D',
                    'language=%s' % release.lang,
                    '-q',  # Be vewy qwiet
                    str(source_dir),  # Source file directory
                    str(build_dir),  # Destination directory
                ])

            #
            # Create a zip file of the HTML build for offline reading.
            # This gets moved into MEDIA_ROOT for downloading.
            #
            html_build_dir = parent_build_dir.joinpath('_build', 'html')
            zipfile_name = 'django-docs-%s-%s.zip' % (release.version,
                                                      release.lang)
            zipfile_path = Path(settings.MEDIA_ROOT).joinpath(
                'docs', zipfile_name)
            if not zipfile_path.parent.exists():
                zipfile_path.parent.mkdir(parents=True)
            if verbosity >= 2:
                self.stdout.write("  build zip (into %s)" % zipfile_path)

            def zipfile_inclusion_filter(file_path):
                return '.doctrees' not in file_path.parts

            with closing(
                    zipfile.ZipFile(str(zipfile_path),
                                    'w',
                                    compression=zipfile.ZIP_DEFLATED)) as zf:
                for root, dirs, files in os.walk(str(html_build_dir)):
                    for f in files:
                        file_path = Path(os.path.join(root, f))
                        if zipfile_inclusion_filter(file_path):
                            rel_path = str(
                                file_path.relative_to(html_build_dir))
                            zf.write(str(file_path), rel_path)

            #
            # Copy the build results to the directory used for serving
            # the documentation in the least disruptive way possible.
            #
            build_dir = parent_build_dir.joinpath('_build')
            built_dir = parent_build_dir.joinpath('_built')
            subprocess.check_call([
                'rsync', '--archive', '--delete',
                '--link-dest={}'.format(build_dir), '{}/'.format(build_dir),
                str(built_dir)
            ])

            #
            # Rebuild the imported document list and search index.
            #
            if not kwargs['reindex']:
                continue

            if verbosity >= 2:
                self.stdout.write("  reindexing...")

            # Build a dict of {path_fragment: document_object}. We'll pop values
            # out of this dict as we go which'll make sure we know which
            # remaining documents need to be deleted (and unindexed) later on.
            documents = dict(
                (doc.path, doc) for doc in release.documents.all())

            # Walk the tree we've just built looking for ".fjson" documents
            # (just JSON, but Sphinx names them weirdly). Each one of those
            # documents gets a corresponding Document object created which
            # we'll then ask Sphinx to reindex.
            #
            # We have to be a bit careful to reverse-engineer the correct
            # relative path component, especially for "index" documents,
            # otherwise the search results will be incorrect.

            json_built_dir = parent_build_dir.joinpath('_built', 'json')
            for root, dirs, files in os.walk(str(json_built_dir)):
                for f in files:
                    built_doc = Path(root, f)
                    if built_doc.is_file() and built_doc.suffix == '.fjson':

                        # Convert the built_doc path which is now an absolute
                        # path (i.e. "/home/docs/en/1.2/_built/ref/models.json")
                        # into a path component (i.e. "ref/models").
                        path = built_doc.relative_to(json_built_dir)
                        if path.stem == 'index':
                            path = path.parent
                        path = str(path.parent.joinpath(path.stem))

                        # Read out the content and create a new Document object for
                        # it. We'll strip the HTML tags here (for want of a better
                        # place to do it).
                        with open(str(built_doc)) as fp:
                            json_doc = json.load(fp)
                            try:
                                json_doc['body']  # Just to make sure it exists.
                                title = unescape_entities(
                                    strip_tags(json_doc['title']))
                            except KeyError as ex:
                                if verbosity >= 2:
                                    self.stdout.write("Skipping: %s (no %s)" %
                                                      (path, ex.args[0]))
                                continue

                        doc = documents.pop(
                            path, Document(path=path, release=release))
                        doc.title = title
                        doc.save()
                        DocumentDocType.index_object(doc)

            # Clean up any remaining documents.
            for doc in documents.values():
                if verbosity >= 2:
                    self.stdout.write("Deleting:", doc)
                try:
                    DocumentDocType.unindex_object(doc)
                except ElasticsearchException:
                    pass
                doc.delete()
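The dict-pop pattern used during reindexing deserves a closer look: anything not popped while walking the build output was removed from the docs and must be deleted and unindexed. A tiny sketch with hypothetical data:

existing = {'intro/install': 'Document(intro/install)',
            'ref/models': 'Document(ref/models)'}
found_on_disk = ['ref/models', 'topics/http']

for path in found_on_disk:
    doc = existing.pop(path, 'Document(%s)' % path)  # reuse an existing entry or create one
    # ...update and (re)index doc here...

# Whatever is left was not seen on disk, so it is stale and can be removed.
assert list(existing) == ['intro/install']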
Example #46
def adjust_typo(texte, html=True):
    texte = smart_unicode(texte).strip()
    if not texte or (html and re.match(r'(\s*<(/?[^>]*[^>/]|br /)>\s*)+$',
                                       texte, re.UNICODE | re.IGNORECASE)):
        return u''

    # TODO: add unit tests
    # TODO: in regex add code to ignore tags replacement

    if html:
        # remove HTML tags before processing text
        tokens = re.findall(u'<[^>]+>', texte)

        for idx, value in enumerate(tokens):
            texte = texte.replace(value, ']TAG%s[' % idx, 1)

    # replace OE and AE by their correct ligature, Œ and Æ.
    for old, new in ligatures:
        texte = texte.replace(old, new)

# TODO: verify whether these cases are covered
#    s/—/&#151;/g;
#    s/ - / &#151; /g;
#    s/--/—/g;
#    s/—/&#151;/g;
#    s/ — / —&nbsp;/g;
#    s/—/&#151;/g;

    # do some typographic adjustments (mostly putting non-breaking space where needed)
    regexs = [
        (u'  +', u' '),  # remove more than one normal space
        (u'  +', u' '),  # remove more than one special space
        (u'«(\s| )+', u'«&nbsp;'),  # make space non-breaking after «
        (u'(\s| )+»', u'&nbsp;»'),  # make space non-breaking before »
        (u'«([^&])', u'«&nbsp;\g<1>'),  # add non-breaking space after «
        (u'([^;])»', u'\g<1>&nbsp;»'),  # add non-breaking space before »
        (u'(\s| )+(:|;|\?|!|$|%)',
         u'&nbsp;\g<2>'),  # make space non-breaking before :, ?, !, $, %
        (
            u'(\d)(\s| )+(cm)', u'\g<1>&nbsp;\g<3>'
        ),  # put non-breaking space between a number and the cm unit
        (
            u'(\d)(\s| )+(\d{3})', u'\g<1>&nbsp;\g<3>'
        ),  # put non-breaking space between groups in long numbers (ex.: 23 000)
        (u'(\s| )P\.(\s| )',
         u'\g<1>P.&nbsp;'),  # put non-breaking space after Page abbreviation
        (u'(\s| )p\.',
         u'&nbsp;p.'),  # put non-breaking space before page abbreviation
        (u' -- ', u' — '),  # change two hyphens into an em dash
        (u'&(l|g)t;', u'&amp;\g<1>t;'
         ),  # to keep &lt; and &gt; as entities when doing unescape_entities
    ]

    if html:
        regexs.extend([
            (u'(\d)(ème|e|es)(\s| |-)', u'\g<1><sup>\g<2></sup>\g<3>'
             ),  # put the ordinal suffix in superscript (e.g. 2e)
            (u'([IVX])e(\s| )', u'\g<1><sup>e</sup>\g<2>'
             ),  # put the Roman-numeral suffix in superscript (e.g. Xe)
            (u'1er(\s| |-)',
             u'1<sup>er</sup>\g<1>'),  # put the "1er" suffix in superscript
        ])

    for old, new in regexs:
        texte = re.sub(old, new, texte)

    # replace html tags at their good location
    if html:
        for idx, value in enumerate(tokens):
            texte = texte.replace(']TAG%s[' % idx, value, 1)

    # do more typographic adjustments with smartypants
    texte = typogrify.smartypants(texte)
    return unescape_entities(texte).strip()
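To see what a few of these substitutions actually do, here is a stripped-down run on a hypothetical French sentence. It is only a sketch: it skips the tag-token protection, the ligatures and the smartypants pass, and escapes the backslashes explicitly:

import re

rules = [
    (u'«(\\s| )+', u'«&nbsp;'),                 # non-breaking space after «
    (u'(\\s| )+»', u'&nbsp;»'),                 # non-breaking space before »
    (u'(\\s| )+(:|;|\\?|!)', u'&nbsp;\\g<2>'),  # non-breaking space before :, ;, ?, !
    (u' -- ', u' — '),                          # two hyphens become an em dash
]

texte = u'Il a dit : « bonjour » -- vraiment !'
for old, new in rules:
    texte = re.sub(old, new, texte)
print(texte)
# Il a dit&nbsp;: «&nbsp;bonjour&nbsp;» — vraiment&nbsp;!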
Example #47
def _unescape_and_unquote(s):
    if not s: return s
    return unescape_entities(unquote(s).decode('utf-8'))
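This variant of the helper targets Python 2 (note the .decode('utf-8') on the unquoted bytestring). A quick hypothetical round trip, assuming unescape_entities comes from django.utils.text as in the other snippets:

from urllib import unquote  # Python 2; urllib.parse.unquote already returns text on Python 3
from django.utils.text import unescape_entities

s = 'caf%C3%A9%20&amp;%20bar'
print(unescape_entities(unquote(s).decode('utf-8')))  # café & bar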
Example #48
 def render(self, context, instance, placeholder):
     context = super(RevealMarkDownPlugin, self).render(context, instance, placeholder)
     content = unescape_entities(instance.glossary.get('markdown', ''))
     context['html_content'] = mark_safe(markdown.markdown(content))
     return context
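The plugin only unescapes the stored glossary value and runs it through Markdown before marking it safe. The same pipeline can be exercised outside the plugin with a couple of lines; the stored string below is hypothetical:

import markdown
from django.utils.safestring import mark_safe
from django.utils.text import unescape_entities

stored = '# Slide title\n\nStored &ndash; with escaped entities and *emphasis*'
html_content = mark_safe(markdown.markdown(unescape_entities(stored)))
print(html_content)
# <h1>Slide title</h1> followed by a <p> containing an en dash and <em>emphasis</em>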
Example #49
    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Returns a tuple containing the POST and FILES dictionary, respectively.
        """
        # We have to import QueryDict down here to avoid a circular import.
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        limited_input_data = LimitBytes(self._input_data, self._content_length)

        # See if the handler will want to take care of the parsing.
        # This allows overriding everything if somebody wants it.
        for handler in handlers:
            result = handler.handle_raw_input(limited_input_data,
                                              self._meta,
                                              self._content_length,
                                              self._boundary,
                                              encoding)
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict('', mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                field_name = force_unicode(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # This is a post field, we can just set it in the post
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self._post.appendlist(field_name,
                                          force_unicode(data, encoding, errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if not file_name:
                        continue
                    file_name = force_unicode(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type', ('',))[0].strip()
                    try:
                        charset = meta_data.get('content-type', (0,{}))[1].get('charset', None)
                    except:
                        charset = None

                    try:
                        content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(field_name, file_name,
                                                 content_type, content_length,
                                                 charset)
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding
                                try:
                                    chunk = str(chunk).decode('base64')
                                except Exception, e:
                                    # Since this is only a chunk, any error is an unfixable error.
                                    raise MultiPartParserError("Could not decode base64 data: %r" % e)

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk,
                                                                   counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # If the chunk received by the handler is None, then don't continue.
                                    break

                    except SkipFile, e:
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD or a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload, e:
            if not e.connection_reset:
                exhaust(limited_input_data)
        else:
            # Make sure that the request data is all fed
            exhaust(limited_input_data)

        # Signal that the upload has been completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        return self._post, self._files
Example #50
def _unescape_and_unquote(s):
    if not s: return s
    return unescape_entities(unquote(s))
Example #51
 def sanitized_title(self):
     if self.title:
         return unescape_entities(
             bleach.clean(self.title, tags=[], strip=True))
     return _('(No title)')
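bleach.clean() with tags=[] and strip=True removes all markup but leaves entities escaped, which is why the result is passed through unescape_entities afterwards. A hypothetical title shows the two steps:

import bleach
from django.utils.text import unescape_entities

title = '<b>Fish &amp; chips</b>'
cleaned = bleach.clean(title, tags=[], strip=True)  # 'Fish &amp; chips'
print(unescape_entities(cleaned))                   # Fish & chips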
Example #52
    def handle_noargs(self, **kwargs):
        try:
            verbosity = int(kwargs['verbosity'])
        except (KeyError, TypeError, ValueError):
            verbosity = 1

        # Somehow, bizarrely, there's a bug in Sphinx such that if I try to
        # build 1.0 before other versions, things fail in weird ways. However,
        # building newer versions first works. I suspect Sphinx is hanging onto
        # some global state. Anyway, we can work around it by making sure that
        # "dev" builds before "1.0". This is ugly, but oh well.
        for release in DocumentRelease.objects.order_by('-version'):
            if verbosity >= 1:
                print "Updating %s..." % release

            # checkout_dir is shared for all languages.
            checkout_dir = Path(settings.DOCS_BUILD_ROOT).child(
                release.version)
            parent_build_dir = Path(settings.DOCS_BUILD_ROOT).child(
                release.lang, release.version)
            if not checkout_dir.exists():
                checkout_dir.mkdir(parents=True)
            if not parent_build_dir.exists():
                parent_build_dir.mkdir(parents=True)

            #
            # Update the release from SCM.
            #

            # Make an SCM checkout/update into the destination directory.
            # Do this dynamically in case we add other SCM later.
            getattr(self, 'update_%s' % release.scm)(release.scm_url,
                                                     checkout_dir)

            if release.docs_subdir:
                source_dir = checkout_dir.child(
                    *release.docs_subdir.split('/'))
            else:
                source_dir = checkout_dir

            if release.lang != 'en':
                scm_url = release.scm_url.replace(
                    'django.git', 'django-docs-translations.git')
                trans_dir = checkout_dir.child('django-docs-translation')
                if not trans_dir.exists():
                    trans_dir.mkdir()
                getattr(self, 'update_%s' % release.scm)(scm_url, trans_dir)
                if not source_dir.child('locale').exists():
                    source_dir.child('locale').write_link(
                        trans_dir.child('translations'))
                subprocess.call("cd %s && make translations" % trans_dir,
                                shell=True)

            #
            # Use Sphinx to build the release docs into JSON and HTML documents.
            #
            for builder in ('json', 'html'):
                # Wipe and re-create the build directory. See #18930.
                build_dir = parent_build_dir.child('_build', builder)
                if build_dir.exists():
                    shutil.rmtree(build_dir)
                build_dir.mkdir(parents=True)

                # "Shell out" (not exactly, but basically) to sphinx-build.
                if verbosity >= 2:
                    print "  building %s (%s -> %s)" % (builder, source_dir,
                                                        build_dir)
                sphinx.cmdline.main([
                    'sphinx-build',
                    '-b',
                    builder,
                    '-D',
                    'language=%s' % release.lang,
                    '-q',  # Be vewy qwiet
                    source_dir,  # Source file directory
                    build_dir,  # Destination directory
                ])

            #
            # Create a zip file of the HTML build for offline reading.
            # This gets moved into MEDIA_ROOT for downloading.
            #
            html_build_dir = parent_build_dir.child('_build', 'html')
            zipfile_name = 'django-docs-%s-%s.zip' % (release.version,
                                                      release.lang)
            zipfile_path = Path(settings.MEDIA_ROOT).child(
                'docs', zipfile_name)
            if not zipfile_path.parent.exists():
                zipfile_path.parent.mkdir(parents=True)
            if verbosity >= 2:
                print "  build zip (into %s)" % zipfile_path

            def zipfile_inclusion_filter(f):
                return f.isfile() and '.doctrees' not in f.components()

            with closing(zipfile.ZipFile(zipfile_path, 'w')) as zf:
                for f in html_build_dir.walk(filter=zipfile_inclusion_filter):
                    zf.write(f, html_build_dir.rel_path_to(f))

            #
            # Copy the build results to the directory used for serving
            # the documentation in the least disruptive way possible.
            #
            build_dir = parent_build_dir.child('_build')
            built_dir = parent_build_dir.child('_built')
            subprocess.check_call([
                'rsync', '--archive', '--delete', '--link-dest=' + build_dir,
                build_dir + '/', built_dir
            ])

            #
            # Rebuild the imported document list and search index.
            #
            if not kwargs['reindex']:
                continue

            if verbosity >= 2:
                print "  reindexing..."

            # Build a dict of {path_fragment: document_object}. We'll pop values
            # out of this dict as we go which'll make sure we know which
            # remaining documents need to be deleted (and unindexed) later on.
            documents = dict(
                (doc.path, doc) for doc in release.documents.all())

            # Walk the tree we've just built looking for ".fjson" documents
            # (just JSON, but Sphinx names them weirdly). Each one of those
            # documents gets a corresponding Document object created which
            # we'll then ask Sphinx to reindex.
            #
            # We have to be a bit careful to reverse-engineer the correct
            # relative path component, especially for "index" documents,
            # otherwise the search results will be incorrect.
            json_built_dir = parent_build_dir.child('_built', 'json')
            for built_doc in json_built_dir.walk():
                if built_doc.isfile() and built_doc.ext == '.fjson':

                    # Convert the built_doc path which is now an absolute
                    # path (i.e. "/home/docs/en/1.2/_built/ref/models.json")
                    # into a path component (i.e. "ref/models").
                    path = json_built_dir.rel_path_to(built_doc)
                    if path.stem == 'index':
                        path = path.parent
                    path = str(path.parent.child(path.stem))

                    # Read out the content and create a new Document object for
                    # it. We'll strip the HTML tags here (for want of a better
                    # place to do it).
                    with open(built_doc) as fp:
                        json_doc = json.load(fp)
                        try:
                            json_doc['body']  # Just to make sure it exists.
                            title = unescape_entities(
                                strip_tags(json_doc['title']))
                        except KeyError, ex:
                            if verbosity >= 2:
                                print "Skipping: %s (no %s)" % (path,
                                                                ex.args[0])
                            continue

                    doc = documents.pop(path,
                                        Document(path=path, release=release))
                    doc.title = title
                    doc.save()
                    haystack.site.update_object(doc)

            # Clean up any remaining documents.
            for doc in documents.values():
                if verbosity >= 2:
                    print "Deleting:", doc
                haystack.site.remove_object(doc)
                doc.delete()
Example #53
 def sanitized_title(self):
     if self.title:
         return unescape_entities(bleach.clean(self.title, tags=[],
                                               strip=True))
     return _('(No title)')
Example #54
    def handle_raw_input(self,
                         input_data,
                         META,
                         content_length,
                         boundary,
                         encoding=None):
        """
        Parse the raw input from the HTTP request and split items into fields
        and files, executing callback methods as necessary.

        Shamelessly adapted and borrowed from
        django.http.multiparser.MultiPartParser.
        """
        # following suit from the source class, this is imported here to avoid
        # a potential circular import
        from django.http import QueryDict

        # create return values
        self.POST = QueryDict('', mutable=True)
        self.FILES = MultiValueDict()

        # initialize the parser and stream
        stream = LazyStream(ChunkIter(input_data, self.chunk_size))
        # whether or not to signal a file-completion at the beginning
        # of the loop.
        old_field_name = None
        counter = 0

        try:
            for item_type, meta_data, field_stream in Parser(stream, boundary):
                if old_field_name:
                    # we run this test at the beginning of the next loop since
                    # we cannot be sure a file is complete until we hit the
                    # next boundary/part of the multipart content.
                    file_obj = self.file_complete(counter)

                    if file_obj:
                        # if we return a file object, add it to the files dict
                        self.FILES.appendlist(
                            force_text(old_field_name,
                                       encoding,
                                       errors='replace'), file_obj)

                    # wipe it out to prevent havoc
                    old_field_name = None
                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')

                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()

                field_name = force_text(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # this is a POST field
                    if transfer_encoding == "base64":
                        raw_data = field_stream.read()
                        try:
                            data = str(raw_data).decode('base64')
                        except:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self.POST.appendlist(
                        field_name, force_text(data,
                                               encoding,
                                               errors='replace'))

                    # trigger listener
                    self.field_parsed(field_name, self.POST.get(field_name))
                elif item_type == FILE:
                    # this is a file
                    file_name = disposition.get('filename')

                    if not file_name:
                        continue

                    # transform the file name
                    file_name = force_text(file_name,
                                           encoding,
                                           errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type = meta_data.get('content-type',
                                                 ('', ))[0].strip()

                    try:
                        charset = meta_data.get('content-type', (0, {}))[1]\
                            .get('charset', None)
                    except:
                        charset = None

                    try:
                        file_content_length = int(
                            meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        file_content_length = None

                    counter = 0

                    # now, do the important file stuff
                    try:
                        # alert on the new file
                        kwargs = {
                            'content_type': content_type,
                            'content_length': file_content_length,
                            'charset': charset
                        }
                        self.new_file(field_name, file_name, **kwargs)

                        # chubber-chunk it
                        for chunk in field_stream:
                            # we need AES-compatible blocks (multiples of 16 bytes)
                            over_bytes = len(chunk) % 16
                            if over_bytes:
                                over_chunk =\
                                    field_stream.read(16 - over_bytes)
                                chunk += over_chunk

                            if transfer_encoding == "base64":
                                try:
                                    chunk = base64.b64decode(chunk)
                                except Exception as e:
                                    # since this is only a chunk, any
                                    # error is an unfixable error
                                    raise MultiPartParserError(
                                        "Could not decode base64 data: %r" % e)

                            chunk_length = len(chunk)
                            self.receive_data_chunk(chunk, counter)
                            counter += chunk_length

                            if counter > settings.UPLOAD_FILE_SIZE_LIMIT:
                                raise SkipFile('File is too big.')
                            # ... and we're done
                    except SkipFile:
                        # just eat the rest
                        exhaust(field_stream)
                    else:
                        # handle file upload completions on next iteration
                        old_field_name = field_name

        except StopUpload as e:
            # if we get a request to stop the upload, exhaust the
            # remaining input unless the connection was reset
            if not e.connection_reset:
                exhaust(input_data)
        else:
            # make sure that the request data is all fed
            exhaust(input_data)

        # signal the upload has been completed
        self.upload_complete()

        return self.POST, self.FILES
Example #55
    def parse(self):
        """
        Parse the POST data and break it into a FILES MultiValueDict and a POST
        MultiValueDict.

        Returns a tuple containing the POST and FILES dictionary, respectively.
        """
        # We have to import QueryDict down here to avoid a circular import.
        from django.http import QueryDict

        encoding = self._encoding
        handlers = self._upload_handlers

        # HTTP spec says that Content-Length >= 0 is valid
        # handling content-length == 0 before continuing
        if self._content_length == 0:
            return QueryDict('', encoding=self._encoding), MultiValueDict()

        # See if any of the handlers take care of the parsing.
        # This allows overriding everything if need be.
        for handler in handlers:
            result = handler.handle_raw_input(self._input_data,
                                              self._meta,
                                              self._content_length,
                                              self._boundary,
                                              encoding)
            # Check to see if it was handled
            if result is not None:
                return result[0], result[1]

        # Create the data structures to be used later.
        self._post = QueryDict('', mutable=True)
        self._files = MultiValueDict()

        # Instantiate the parser and stream:
        stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))

        # Whether or not to signal a file-completion at the beginning of the loop.
        old_field_name = None
        counters = [0] * len(handlers)

        try:
            for item_type, meta_data, field_stream in Parser(stream, self._boundary):
                if old_field_name:
                    # We run this at the beginning of the next loop
                    # since we cannot be sure a file is complete until
                    # we hit the next boundary/part of the multipart content.
                    self.handle_file_complete(old_field_name, counters)
                    old_field_name = None

                try:
                    disposition = meta_data['content-disposition'][1]
                    field_name = disposition['name'].strip()
                except (KeyError, IndexError, AttributeError):
                    continue

                transfer_encoding = meta_data.get('content-transfer-encoding')
                if transfer_encoding is not None:
                    transfer_encoding = transfer_encoding[0].strip()
                field_name = force_text(field_name, encoding, errors='replace')

                if item_type == FIELD:
                    # This is a post field, we can just set it in the post
                    if transfer_encoding == 'base64':
                        raw_data = field_stream.read()
                        try:
                            data = base64.b64decode(raw_data)
                        except _BASE64_DECODE_ERROR:
                            data = raw_data
                    else:
                        data = field_stream.read()

                    self._post.appendlist(field_name,
                                          force_text(data, encoding, errors='replace'))
                elif item_type == FILE:
                    # This is a file, use the handler...
                    file_name = disposition.get('filename')
                    if not file_name:
                        continue
                    file_name = force_text(file_name, encoding, errors='replace')
                    file_name = self.IE_sanitize(unescape_entities(file_name))

                    content_type, content_type_extra = meta_data.get('content-type', ('', {}))
                    content_type = content_type.strip()
                    charset = content_type_extra.get('charset')

                    try:
                        content_length = int(meta_data.get('content-length')[0])
                    except (IndexError, TypeError, ValueError):
                        content_length = None

                    counters = [0] * len(handlers)
                    try:
                        for handler in handlers:
                            try:
                                handler.new_file(field_name, file_name,
                                                 content_type, content_length,
                                                 charset, content_type_extra)
                            except StopFutureHandlers:
                                break

                        for chunk in field_stream:
                            if transfer_encoding == 'base64':
                                # We only special-case base64 transfer encoding
                                # We should always decode base64 chunks by multiple of 4,
                                # ignoring whitespace.

                                stripped_chunk = b"".join(chunk.split())

                                remaining = len(stripped_chunk) % 4
                                while remaining != 0:
                                    over_chunk = field_stream.read(4 - remaining)
                                    stripped_chunk += b"".join(over_chunk.split())
                                    remaining = len(stripped_chunk) % 4

                                try:
                                    chunk = base64.b64decode(stripped_chunk)
                                except Exception as e:
                                    # Since this is only a chunk, any error is an unfixable error.
                                    msg = "Could not decode base64 data: %r" % e
                                    six.reraise(MultiPartParserError, MultiPartParserError(msg), sys.exc_info()[2])

                            for i, handler in enumerate(handlers):
                                chunk_length = len(chunk)
                                chunk = handler.receive_data_chunk(chunk,
                                                                   counters[i])
                                counters[i] += chunk_length
                                if chunk is None:
                                    # If the chunk received by the handler is None, then don't continue.
                                    break

                    except SkipFile:
                        self._close_files()
                        # Just use up the rest of this file...
                        exhaust(field_stream)
                    else:
                        # Handle file upload completions on next iteration.
                        old_field_name = field_name
                else:
                    # If this is neither a FIELD or a FILE, just exhaust the stream.
                    exhaust(stream)
        except StopUpload as e:
            self._close_files()
            if not e.connection_reset:
                exhaust(self._input_data)
        else:
            # Make sure that the request data is all fed
            exhaust(self._input_data)

        # Signal that the upload has completed.
        for handler in handlers:
            retval = handler.upload_complete()
            if retval:
                break

        return self._post, self._files
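The whitespace stripping and multiple-of-4 padding above are what make it safe to decode an arbitrary chunk boundary on its own. A small sketch with hypothetical data shows the failure this avoids:

import base64

payload = base64.b64encode(b'attachment-bytes')  # b'YXR0YWNobWVudC1ieXRlcw=='
head, tail = payload[:10], payload[10:]          # an arbitrary chunk boundary

try:
    base64.b64decode(head)                       # 10 bytes: not a multiple of 4
except Exception as exc:
    print('partial chunk fails: %r' % exc)

extra = 4 - len(head) % 4                        # read a couple more bytes, as the parser does
print(base64.b64decode(head + tail[:extra]) + base64.b64decode(tail[extra:]))
# b'attachment-bytes'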
Example #56
 def sanitize_file_name(self, file_name):
     file_name = unescape_entities(file_name)
     # Cleanup Windows-style path separators.
     file_name = file_name[file_name.rfind('\\') + 1:].strip()
     return os.path.basename(file_name)
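Older Internet Explorer versions submit the full client-side path, and the name can arrive with HTML entities, so the method unescapes first and then strips everything up to the last backslash. A hypothetical input, assuming unescape_entities lives in django.utils.text as in the other snippets:

import os
from django.utils.text import unescape_entities

file_name = unescape_entities(u'C:\\Users\\alice\\r&eacute;sum&eacute; 2020.pdf')
file_name = file_name[file_name.rfind('\\') + 1:].strip()
print(os.path.basename(file_name))  # résumé 2020.pdf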
Example #57
def adjust_typo(texte, html=True):
    texte = smart_unicode(texte).strip()
    if not texte or (html and re.match(r'(\s*<(/?[^>]*[^>/]|br /)>\s*)+$', texte, re.UNICODE | re.IGNORECASE)):
        return u''

    # TODO: add unit tests
    # TODO: in regex add code to ignore tags replacement

    if html:
        # remove HTML tags before processing text
        tokens = re.findall(u'<[^>]+>', texte)

        for idx, value in enumerate(tokens):
            texte = texte.replace(value, ']TAG%s[' % idx, 1)

    # replace OE and AE by their correct ligature, Œ and Æ.
    for old, new in ligatures:
        texte = texte.replace(old, new)

# TODO: verify whether these cases are covered
#    s/—/&#151;/g;
#    s/ - / &#151; /g;
#    s/--/—/g;
#    s/—/&#151;/g;
#    s/ — / —&nbsp;/g;
#    s/—/&#151;/g;

    # do some typographic adjustments (mostly putting non-breaking space where needed)
    regexs = [
        (u'  +', u' '),  # remove more than one normal space
        (u'  +', u' '),  # remove more than one special space
        (u'«(\s| )+', u'«&nbsp;'),  # make space non-breaking after «
        (u'(\s| )+»', u'&nbsp;»'),  # make space non-breaking before »
        (u'«([^&])', u'«&nbsp;\g<1>'),  # add non-breaking space after «
        (u'([^;])»', u'\g<1>&nbsp;»'),  # add non-breaking space before »
        (u'(\s| )+(:|;|\?|!|$|%)', u'&nbsp;\g<2>'),  # make space non-breaking before :, ?, !, $, %
        (u'(\d)(\s| )+(cm)', u'\g<1>&nbsp;\g<3>'),  # put non-breaking space between a number and the cm unit
        (u'(\d)(\s| )+(\d{3})', u'\g<1>&nbsp;\g<3>'),  # put non-breaking space between groups in long numbers (ex.: 23 000)
        (u'(\s| )P\.(\s| )', u'\g<1>P.&nbsp;'),  # put non-breaking space after Page abbreviation
        (u'(\s| )p\.', u'&nbsp;p.'),  # put non-breaking space before page abbreviation

        (u' -- ', u' — '),  # change two hyphens into an em dash

        (u'&(l|g)t;', u'&amp;\g<1>t;'),  # to keep &lt; and &gt; as entities when doing unescape_entities
    ]

    if html:
        regexs.extend([
            (u'(\d)(ème|e|es)(\s| |-)', u'\g<1><sup>\g<2></sup>\g<3>'),  # put the ordinal suffix in superscript (e.g. 2e)
            (u'([IVX])e(\s| )', u'\g<1><sup>e</sup>\g<2>'),  # put the Roman-numeral suffix in superscript (e.g. Xe)
            (u'1er(\s| |-)', u'1<sup>er</sup>\g<1>'),  # put the "1er" suffix in superscript
        ])

    for old, new in regexs:
        texte = re.sub(old, new, texte)

    # replace html tags at their good location
    if html:
        for idx, value in enumerate(tokens):
            texte = texte.replace(']TAG%s[' % idx, value, 1)

    # do more typographic adjustments with smartypants
    texte = typogrify.smartypants(texte)
    return unescape_entities(texte).strip()
Example #58
def convert_html_to_string(text):
    """ Returns text containing html tags as text without its tags """
    return unescape_entities(strip_tags(force_text(text)))
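Chained together, force_text, strip_tags and unescape_entities turn stored HTML into plain readable text. A quick hypothetical call, with the imports spelled out under the usual django.utils locations used elsewhere in these examples:

from django.utils.encoding import force_text
from django.utils.html import strip_tags
from django.utils.text import unescape_entities

html = '<p>Fish &amp; chips &ndash; only &pound;5</p>'
print(unescape_entities(strip_tags(force_text(html))))
# Fish & chips – only £5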