def fetch_data_from_url(url, content):
    data = {"url": url}
    try:
        readable = Readability(url, content)
        data["title"] = reduce_whitespace(unescape_entities(readable.get_article_title()))
        # Try to get the abstract from the meta description:
        abstract = reduce_whitespace(unescape_entities(strip_tags(readable.get_meta_description()).strip()))
        if not abstract:
            abstract = reduce_whitespace(unescape_entities(strip_tags(readable.get_article_text()).strip()))
        abstract = truncate_words(abstract, 200)
        data["abstract"] = abstract
    except ReadabilityException:
        pass
    if VIDEO_URL_RE.search(url):
        data["media_formats"] = MediaFormat.objects.filter(name="Video")
    urls = URL_RE.findall(content)
    OLD_CC_LICENCES = [l[0] for l in CC_OLD_LICENSES[1:]]
    for url in urls:
        if CC_LICENSE_URL_RE.match(url):
            url = url.lower()
            if url in OLD_CC_LICENCES:
                data["license_type"] = "cc-old"
                data["license_cc_old"] = url
            else:
                data["license_type"] = "cc"
                data["license_cc"] = url
    return data
def test_unescape_entities(self):
    items = [
        ('', ''),
        ('foo', 'foo'),
        ('&amp;', '&'),
        ('&#38;', '&'),
        ('&#x26;', '&'),
        ('foo &amp; bar', 'foo & bar'),
        ('foo & bar', 'foo & bar'),
    ]
    for value, output in items:
        self.assertEqual(text.unescape_entities(value), output)
        self.assertEqual(text.unescape_entities(lazystr(value)), output)
def test_unescape_entities(self): items = [ ("", ""), ("foo", "foo"), ("&", "&"), ("&", "&"), ("&", "&"), ("foo & bar", "foo & bar"), ("foo & bar", "foo & bar"), ] for value, output in items: self.assertEqual(text.unescape_entities(value), output) self.assertEqual(text.unescape_entities(lazystr(value)), output)
def parse_django_error(self):
    """Extract the summary part of a Django HTML error."""
    try:
        summary = self.msg.split(u'<body>\n<div id="summary">\n ', 1)[1]\
                          .split(u'<th>Python Executable:</th>', 1)[0]
        traceback = self.msg.split(u'\n\nTraceback:', 1)[1]\
                            .split(u'</textarea>', 1)[0]
    except IndexError:
        return self.msg
    result = []
    title = None
    for line in strip_tags(summary).split('\n'):
        line_content = unescape_entities(line.strip())
        if line_content:
            if line_content.endswith(':'):
                title = line_content
            elif title is None:
                title = "%s:" % line_content
            else:
                result.append("%s %s\n" % (title, line_content))
    result.append("Status code: %s" % self.status_code)
    indent, indent2 = u' ', u' '
    return u"%(summary)s %(traceback)s".strip() % {
        'summary': indent.join(force_unicode(line) for line in result),
        'traceback': indent2.join(force_unicode(line + "\n")
                                  for line in traceback.split('\n')),
    }
def sync_to_db(self, decoded_documents): """ Sync the given list of documents (decoded fjson files from sphinx) to the database. Deletes all the release's documents first then reinserts them as needed. """ self.documents.all().delete() # Read excluded paths from robots.docs.txt. robots_path = settings.BASE_DIR.joinpath('djangoproject', 'static', 'robots.docs.txt') with open(str(robots_path), 'r') as fh: excluded_paths = [ line.strip().split('/')[-1] for line in fh if line.startswith("Disallow: /%s/%s/" % (self.lang, self.release_id)) ] for document in decoded_documents: if ('body' not in document or 'title' not in document or document['current_page_name'].split('/')[0] in excluded_paths): # We don't care about indexing documents with no body or title, or partially translated continue Document.objects.create( release=self, path=_clean_document_path(document['current_page_name']), title=unescape_entities(strip_tags(document['title'])), )
def render(self, context):
    length = self.length
    full_value = self.content_node.render(context)
    value = unescape_entities(full_value)
    if len(value) > length - 3:
        value = value[:length - 3] + '...'
    return '<span title="%s">%s</span>' % (full_value, value)
def parse(self, raw_email):
    """
    Fetches the content of the message and populates the available headers.
    """
    body = u''
    html_body = u''
    msg = email.parser.Parser().parsestr(raw_email)
    for part in msg.walk():
        # for key, header in part.items():
        #     self.headers[key.lower()] = clean_header(header)
        payload = part.get_payload(decode=1)
        charset = part.get_content_charset()
        if charset is not None:
            payload = payload.decode(charset)
        if part.get_content_type() == 'text/plain':
            body += payload
        if part.get_content_type() == 'text/html':
            html_body += payload
    if not body:
        body = unescape_entities(strip_tags(html_body))
    self.body = body
def get_description(self):
    object = self.object

    ### Assign variables -----------------------
    site_name = get_setting('site', 'global', 'sitedisplayname')
    geo_location = get_setting('site', 'global', 'sitegeographiclocation')

    content = ''  # avoid a NameError when there is no description
    if object.description:
        content = object.description
        content = strip_tags(content)  # strip HTML tags
        content = unescape_entities(content)
        content = content.replace("\n", "").replace("\r", "")
        content = truncate_words(content, 50)  # ~ about 250 chars

    ### Build string -----------------------
    value = object.name
    value = '%s : %s' % (value, content)
    value = '%s Photo Sets for %s, %s' % (value, site_name, geo_location)
    value = value.strip()
    return value
def render(self, context): try: html = unescape_entities(self.nodelist.render(context)) safe_html = self.sanitize(html) top_level_elements = fragments_fromstring(safe_html) # TODO: We need to remember to patch in whatever pre-save # HTML processing we eventually do here, too. E.g. # a spam URL blacklist. out = [] for elem in top_level_elements: if elem.tag == "iframe": elem = self._process_iframe(elem) out.append(etree.tostring(elem, method="html", encoding="UTF-8")) return "".join(out) except IFrameSrcNotApproved: return ( '<span class="plugin embed">' + _( "The embedded URL is not on the list of approved providers. " "Contact the site administrator to add it." ) + "</span>" ) except: return '<span class="plugin embed">' + _("Invalid embed code") + "</span>"
def anchorify(anchor):
    """
    Filter which converts a string into a form suitable for use as an anchor
    id on an HTML element. This is useful when you want the anchor id on a
    heading to match the heading content, which can be an arbitrary string.

    Example usage::

        <h1 id="{{ _("My Blog")|anchorify }}">{% trans "My Blog" %}</h1>

    The result would be::

        <h1 id="my-blog">My Blog</h1>
    """
    try:
        anchor = template.defaultfilters.striptags(anchor)
        anchor = text.unescape_entities(anchor)
        anchor = url_tags.slugify2(anchor)
        if not anchor or not anchor[0].isalpha():
            # Anchor ids must start with a letter.
            anchor = 'a' + anchor
        return anchor
    except:
        if settings.DEBUG:
            raise
        else:
            return u''
def save(self, fail_silently=False):
    """
    Build and send the email message.
    """
    body = unescape_entities(self.message())  # convert &quot; back to '"', etc.
    msg = EmailMessage(self.subject(), body, self.from_email, self.recipient_list,
                       headers={'Reply-To': self.reply_email()})
    msg.send(fail_silently=fail_silently)
def _convert_to_plain(self, value):
    if value:
        value = force_unicode(value)
        text = re.sub('<br[^>]*>', u'\n', value)
        text = unescape_entities(text)
        text = strip_tags(text)
        text = text.strip()
        text = unicodedata.normalize('NFKD', text.lower()).encode('ascii', 'ignore')
        return text
    return ''
def add_to_corpus(article_id):
    """
    Retrieve an article from the db, clean it, and add it to the corpus.
    """
    t = Article.objects.get(pk=article_id).content
    t = normalize_text(unescape_entities(t))
    t = t.encode("utf-8")
    t = t.decode("string_escape")
    save_to_file("corpus/%s.txt" % article_id, t)
    print normalize_text(t)
def _prepare_plain_text(self, from_field, to_field):
    original_text = getattr(self, from_field, None)
    if original_text:
        original_text = force_unicode(original_text)
        text = re.sub('<br[^>]*>', u'\n', original_text)
        text = unescape_entities(text)
        text = strip_tags(text)
        text = text.strip()
        setattr(self, to_field, text)
    else:
        setattr(self, to_field, original_text)
def get_description(self): object = self.object ### Assign variables ----------------------- primary_keywords = get_setting('site','global','siteprimarykeywords') category_set = object.category_set category = category_set.get('category', '') subcategory = category_set.get('sub_category', '') site_name = get_setting('site','global','sitedisplayname') geo_location = get_setting('site','global','sitegeographiclocation') creator_name = '' if object.creator: creator_name = '%s %s' % ( object.creator.first_name, object.creator.last_name ) creator_name = creator_name.strip() if object.summary: content = object.summary else: content = object.body content = strip_tags(content) #strips HTML tags content = unescape_entities(content) content = content.replace("\n","").replace("\r","") content = truncate_words(content, 50) # ~ about 250 chars ### Build string ----------------------- value = object.headline if creator_name: value = '%s %s' % (value, creator_name) value = '%s : %s' % (value, content) if primary_keywords: value = '%s %s' % (value, primary_keywords) else: if category: value = '%s %s' % (value, category) if category and subcategory: value = '%s : %s' % (value, subcategory) value = '%s directory' % value value = '%s Directories for %s %s' % ( value, site_name, geo_location) value = value.strip() return value
def anchorify(anchor):
    """
    Convert a string into a form suitable for use as an anchor id.
    """
    anchor = defaultfilters.striptags(anchor)
    anchor = text.unescape_entities(anchor)
    for a, b in HEADING_REPLACE:
        anchor = anchor.replace(a, b)
    anchor = defaultfilters.slugify(anchor)
    anchor = DASH_START_END_RE.sub('', anchor)
    if not anchor or not anchor[0].isalpha():
        anchor = 'a' + anchor
    return anchor
def cleanHtml(html):
    r"""
    Returns a text version of html, by first removing any text in
    <blockquote> tags and then stripping any other tags and replacing html
    entities. The <blockquote> strip is done for the same reasons as in
    cleanText().

    >>> cleanHtml('foo\n<blockquote some-attr="some">bar</blockquote>&st')
    'foo\n&st'
    >>> cleanHtml('foo\n<blockquote\nsome-attr="some">bar</blockquote>&st')
    'foo\n&st'
    """
    regex = re.compile(r"<blockquote.*</blockquote>", re.DOTALL)
    html = regex.sub("", html)
    return unescape_entities(strip_tags(html))
def check_fk(self, indexes):
    field = self.get_field('fk', indexes)
    parent = field.find_element_by_xpath('parent::*')
    add_related = parent.find_element_by_css_selector('.add-related')
    add_related.click()
    name = self.get_name_for_indexes(indexes)
    with self.switch_to_popup_window():
        self.set_field('name', name)
        self.save_form()
    time.sleep(0.1)
    field_id = field.get_attribute('id')
    current_val = self.selenium.execute_script(
        'return $("#%s").find("option:selected").html()' % field_id)
    self.assertEqual(unescape_entities(current_val), name)
def normalize_text(text, language="french"): """ Normalize text : clean, strip tags... Tests needed. """ text = strip_tags(unescape_entities(text)) text = text.replace(u"’", u"'") text = text.replace(u"qu'", u"qu' ")#qu' lorsqu', etc. text = re.sub(ur'(")([^ \n\.,!?]){1}', u"\xab\g<2>", text, re.U)#replacing opening quotes text = re.sub(ur'([^ \n]){1}(")', u"\g<1>\xbb", text, re.U)#replacing closing quotes #Replacing inverted pronouns. text = re.sub(ur"\-t\-", u" - t - ", text, re.U) text = re.sub(ur"\-(je|moi|tu|toi|il|le|elle|la|on|nous|vous|ils|elles|les|ci|là)([\W])", u" - \g<1>\g<2>", text, re.U) return text
def get_description(self): object = self.object ### Assign variables ----------------------- primary_keywords = get_setting('site', 'global', 'siteprimarykeywords') category_set = object.category_set category = category_set.get('category', '') subcategory = category_set.get('sub_category', '') site_name = get_setting('site', 'global', 'sitedisplayname') geo_location = get_setting('site', 'global', 'sitegeographiclocation') creator_name = '' if object.creator: creator_name = '%s %s' % (object.creator.first_name, object.creator.last_name) creator_name = creator_name.strip() if object.summary: content = object.summary else: content = object.body content = strip_tags(content) #strips HTML tags content = unescape_entities(content) content = content.replace("\n", "").replace("\r", "") content = truncate_words(content, 50) # ~ about 250 chars ### Build string ----------------------- value = object.headline if creator_name: value = '%s %s' % (value, creator_name) value = '%s : %s' % (value, content) if primary_keywords: value = '%s %s' % (value, primary_keywords) else: if category: value = '%s %s' % (value, category) if category and subcategory: value = '%s : %s' % (value, subcategory) value = '%s directory' % value value = '%s Directories for %s %s' % (value, site_name, geo_location) value = value.strip() return value
def _get_story_data(cls, story, site=None): url = story.get_absolute_url(site=site) preview_url = get_preview_url(story) or url # See http://codex.wordpress.org/Post_Status_Transitions if story.is_published: if story.pub_date > datetime.datetime.now(): post_status = 'future' else: post_status = 'publish' else: post_status = 'draft' # unescaping as inlines are escaped. story_body = unescape_entities(story.raw_body) # Add media image items as HTML in the story body. # They'll get converted back when saving the story. images = list(story.images.all()) videos = list(story.videos.all()) story_body = cls._create_media_html(images, videos) + story_body return { 'dateCreated': DateTime(story.pub_date), 'userid': str(story.author.id), 'postid': str(story.id), 'description': story_body, 'title': story.headline, 'link': url, 'permaLink': preview_url, 'categories': [smart_unicode(cat) for cat in story.categories.all()], 'mt_excerpt': story.get_short_summary(), 'mt_text_more': '', 'wp_more_text': '', 'mt_allow_comments': int(story.comments.enabled), 'mt_allow_pings': 0, 'mt_keywords': ', '.join((smart_unicode(tag) for tag in story.tags)), 'wp_slug': story.slug, 'wp_password': '', 'wp_author_id': str(story.author.id), 'wp_author_display_name': story.author.username, 'date_created_gmt': DateTime(to_gmt(story.pub_date)), 'post_status': post_status, 'custom_fields': [], 'wp_post_format': 'standard', 'date_modified': DateTime(story.updated_date or story.pub_date), 'date_modified_gmt': DateTime(to_gmt(story.updated_date or story.pub_date)), }
def clean(self):
    cleaned_data = super().clean()
    if cleaned_data['open_tag'] in ('if', 'elif'):
        if not cleaned_data['condition']:
            raise ValidationError(
                _("The evaluation condition is missing or empty."))
        try:
            condition = unescape_entities(cleaned_data['condition'])
            engines['django'].from_string(
                self.eval_template_string.format(condition))
        except TemplateSyntaxError as err:
            raise ValidationError(
                _("Unable to evaluate condition: {}").format(str(err)))
    elif cleaned_data['open_tag'] == 'else':
        cleaned_data['condition'] = ''  # empty condition for else-block
    return cleaned_data
def get_description(self): object = self.object ### Assign variables ----------------------- primary_keywords = get_setting("site", "global", "siteprimarykeywords") category_set = object.category_set category = category_set.get("category", "") subcategory = category_set.get("sub_category", "") site_name = get_setting("site", "global", "sitedisplayname") geo_location = get_setting("site", "global", "sitegeographiclocation") creator_name = "%s %s" % (object.creator.first_name, object.creator.last_name) creator_name = creator_name.strip() if object.summary: content = object.summary else: content = object.body content = strip_tags(content) # strips HTML tags content = unescape_entities(content) content = content.replace("\n", "").replace("\r", "") content = truncate_words(content, 50) # ~ about 250 chars ### Build string ----------------------- value = object.headline if creator_name: value = "%s %s" % (value, creator_name) value = "%s : %s" % (value, content) if primary_keywords: value = "%s %s" % (value, primary_keywords) else: if category: value = "%s %s" % (value, category) if category and subcategory: value = "%s : %s" % (value, subcategory) value = "%s article" % value value = "%s Articles and White Papers for %s %s" % (value, site_name, geo_location) value = value.strip() return value
def sync_to_db(self, decoded_documents):
    """
    Sync the given list of documents (decoded fjson files from sphinx) to
    the database. Deletes all the release's documents first, then reinserts
    them as needed.
    """
    self.documents.all().delete()
    for document in decoded_documents:
        if 'body' not in document or 'title' not in document:
            # We don't care about indexing documents with no body or title.
            continue
        Document.objects.create(
            release=self,
            path=_clean_document_path(document['current_page_name']),
            title=unescape_entities(strip_tags(document['title'])),
        )
def check_fk(self, indexes): field = self.get_field('fk1', indexes) parent = field.find_element_by_xpath('parent::*') add_related = parent.find_element_by_css_selector('.add-related') if self.has_grappelli: # Grappelli can be very slow to initialize fk bindings, particularly # when run on travis-ci time.sleep(1) self.click(add_related) name = self.get_name_for_indexes(indexes) with self.switch_to_popup_window(): self.set_field('name', name) self.save_form() time.sleep(0.1) field_id = field.get_attribute('id') current_val = self.selenium.execute_script( 'return $("#%s").find("option:selected").html()' % field_id) self.assertEqual(unescape_entities(current_val), name)
def check_fk(self, indexes): field = self.get_field('fk1', indexes) parent = field.find_element_by_xpath('parent::*') add_related = parent.find_element_by_css_selector('.add-related') if self.has_grappelli: # Grappelli can be very slow to initialize fk bindings, particularly # when run on travis-ci time.sleep(1) add_related.click() name = self.get_name_for_indexes(indexes) with self.switch_to_popup_window(): self.set_field('name', name) self.save_form() time.sleep(0.1) field_id = field.get_attribute('id') current_val = self.selenium.execute_script( 'return $("#%s").find("option:selected").html()' % field_id) self.assertEqual(unescape_entities(current_val), name)
def html2tex_bs4(el): result = [] if isinstance(el, NavigableString): return str(el) for sel in el.children: if isinstance(sel, NavigableString): result.append(str(sel)) ## Span styling elif sel.name in ["span"]: for att in list(sel.attrs.keys()): if att == 'style': if 'font-style:italic' in sel.attrs[att]: result.append(u'\\textit{%s}' % (html2tex_bs4(sel))) elif 'font-weight:bold' in sel.attrs[att]: result.append(u'\\textbf{%s}' % (html2tex(sel))) elif att == 'class' and 'math-tex' in sel.attrs[att]: if sel.string is not None and sel.string[:2] == '\(': if len(sel.contents) > 1: print('WARNING:', 'Math with nested tags!!') print(sel) result.append(unescape_entities(sel.string)) elif att == 'class' and 'lang-ltr' in sel.attrs[att]: result.append(u'\\textenglish{%s}' % (html2tex_bs4(sel))) ## Bold elif sel.name in ["b", "strong"]: result.append(u'\\textbf{%s}' % (html2tex_bs4(sel))) ## Italic elif sel.name in ["i"]: result.append(u'\\textit{%s}' % (html2tex_bs4(sel))) ## Emph elif sel.name in ["em"]: result.append(u'\\emph{%s}' % (html2tex_bs4(sel))) ## Underline elif sel.name in ["u"]: result.append(u'\\underline{%s}' % (html2tex_bs4(sel))) ## English in RTL elif 'dir' in sel.attrs and sel.attrs['dir'] == 'ltr': result.append(u'\\begin{english}\n%s\n\\end{english}' % (html2tex_bs4(sel))) ## By default just append content else: result.append(html2tex_bs4(sel)) return u"".join(result)
def html_to_text(html):
    """
    Return formatted text from HTML source (keeping words separated and
    complete, with paragraphs and new lines). The output is meant to be
    suitable for word and phrase searches.
    """
    text = force_unicode(html)
    text = text.strip()
    if text:
        # Format HTML source into readable/searchable text.
        text = _re_newline_in_text.sub('\g<1> \g<2>', text)
        text = text.replace('\n', '')
        text = _re_space.sub('\g<1> \g<2>', text)
        text = _re_newline.sub('\g<0>\n', text)
        text = _re_2newlines.sub('\g<0>\n\n', text)
        text = _re_strip_html.sub('', text)
        text = _re_strip_newlines.sub('\n\n', text.strip())
        text = _re_strip_spaces.sub(' ', text)
        text = unescape_entities(text)
        text = text.replace('&amp;', '&')
    return text
def from_django(cls, obj):
    # Turn HTML entities back into unicode characters and remove all HTML
    # tags, i.e. build a "plain text" version of the document.
    content = strip_tags(unescape_entities(obj.body).replace(u'¶', ''))
    doc = cls(path=obj.path, title=obj.title, content=content,
              meta={'id': obj.id})
    doc.release = {
        'id': obj.release.id,
        'lang': obj.release.lang,
        'version': obj.release.version,
    }
    breadcrumbs = []
    for breadcrumb in cls.model.objects.breadcrumbs(obj):
        breadcrumbs.append({
            'title': breadcrumb.title,
            'path': breadcrumb.path,
        })
    doc.breadcrumbs = breadcrumbs
    return doc
def importFromWp(): for tmp in Post.objects.all(): tmp.delete() xmlFile = os.getcwd() + "/static/wordpress.2010-06-25.xml" dateFmt = "%a, %d %b %Y %H:%M:%S +0000" rss = parse(xmlFile) for post in rss.getElementsByTagName("item"): content = post.getElementsByTagName("content:encoded")[0] if content.childNodes.length > 0: name = getInnerText(post, "dc:creator") dateStr = getInnerText(post, "pubDate") date = datetime.datetime.strptime(dateStr, dateFmt) slug = urlparse(getInnerText(post, "link")).path.split("/")[-2] body = unescape_entities(content.firstChild.wholeText) soup = BeautifulSoup(body) for img in soup.findAll('img'): if re.match("http://techmeetup.co.uk/", img['src']): imgsrc = img["src"] filename = imgsrc.split("/")[-1] img["src"] = "/static/img/wp/" + filename outpath = os.path.join("./static/img/wp", filename) body = str(soup) if not os.path.exists(outpath): print "fetching: %s" % (img["src"]) urlretrieve(imgsrc, outpath) Post(author=getOrCreate(name), title=getInnerText(post, "title"), slug=slug, body=body, created=date, updated=date).save()
def importFromWp(): for tmp in Post.objects.all(): tmp.delete() xmlFile = os.getcwd() + "/static/wordpress.2010-06-25.xml" dateFmt = "%a, %d %b %Y %H:%M:%S +0000" rss = parse(xmlFile) for post in rss.getElementsByTagName("item"): content = post.getElementsByTagName("content:encoded")[0] if content.childNodes.length > 0: name = getInnerText(post, "dc:creator") dateStr = getInnerText(post, "pubDate") date = datetime.datetime.strptime(dateStr, dateFmt) slug = urlparse(getInnerText(post, "link")).path.split("/")[-2] body = unescape_entities(content.firstChild.wholeText) soup = BeautifulSoup(body) for img in soup.findAll('img'): if re.match("http://techmeetup.co.uk/", img['src']): imgsrc = img["src"] filename = imgsrc.split("/")[-1] img["src"] = "/static/img/wp/" + filename outpath = os.path.join("./static/img/wp", filename) body = str(soup) if not os.path.exists(outpath): print "fetching: %s" % (img["src"]) urlretrieve(imgsrc, outpath) Post( author = getOrCreate(name), title = getInnerText(post, "title"), slug = slug, body = body, created = date, updated = date ).save()
def get_description(self): object = self.object ### Assign variables ----------------------- primary_keywords = get_setting('site','global','siteprimarykeywords') category_set = object.category_set category = ', '.join([cat.name for cat in object.cats.all()]) subcategory = ', '.join([sub_cat.name for sub_cat in object.sub_cats.all()]) site_name = get_setting('site','global','sitedisplayname') geo_location = get_setting('site','global','sitegeographiclocation') if object.summary: content = object.summary else: content = object.body content = strip_tags(content) #strips HTML tags content = unescape_entities(content) content = content.replace("\n","").replace("\r","") content = truncate_words(content, 50) # ~ about 250 chars ### Build string ----------------------- value = object.headline value = '%s : %s' % (value, content) if primary_keywords: value = '%s %s' % (value, primary_keywords) else: if category: value = '%s %s' % (value, category) if category and subcategory: value = '%s : %s' % (value, subcategory) value = '%s Directories for %s %s' % ( value, site_name, geo_location) value = value.strip() return value
def sync_to_db(self, decoded_documents): """ Sync the given list of documents (decoded fjson files from sphinx) to the database. Deletes all the release's documents first then reinserts them as needed. """ self.documents.all().delete() # Read excluded paths from robots.docs.txt. robots_path = settings.BASE_DIR.joinpath('djangoproject', 'static', 'robots.docs.txt') with open(str(robots_path), 'r') as fh: excluded_paths = [ line.strip().split('/')[-1] for line in fh if line.startswith("Disallow: /%s/%s/" % (self.lang, self.release_id)) ] for document in decoded_documents: if ('body' not in document or 'title' not in document or document['current_page_name'].split('/')[0] in excluded_paths): # We don't care about indexing documents with no body or title, or partially translated continue document_path = _clean_document_path(document['current_page_name']) document['slug'] = Path(document_path).parts[-1] document['parents'] = ' '.join(Path(document_path).parts[:-1]) Document.objects.create( release=self, path=document_path, title=unescape_entities(strip_tags(document['title'])), metadata=document, config=TSEARCH_CONFIG_LANGUAGES.get( self.lang[:2], DEFAULT_TEXT_SEARCH_CONFIG), ) for document in self.documents.all(): document.metadata['breadcrumbs'] = list( Document.objects.breadcrumbs(document).values('title', 'path')) document.save(update_fields=('metadata', ))
def render(self, context): try: html = unescape_entities(self.nodelist.render(context)) safe_html = self.sanitize(html) top_level_elements = fragments_fromstring(safe_html) # TODO: We need to remember to patch in whatever pre-save # HTML processing we eventually do here, too. E.g. # a spam URL blacklist. out = [] for elem in top_level_elements: if elem.tag == 'iframe': elem = self._process_iframe(elem) out.append( etree.tostring(elem, method='html', encoding='UTF-8')) return ''.join(out) except IFrameSrcNotApproved: return ('<span class="plugin embed">' + _( 'The embedded URL is not on the list of approved providers. ' 'Contact the site administrator to add it.') + '</span>') except: return '<span class="plugin embed">' + _( 'Invalid embed code') + '</span>'
def get_description(self): object = self.object ### Assign variables ----------------------- category = Category.objects.get_for_object(object, 'category') subcategory = Category.objects.get_for_object(object, 'subcategory') site_name = get_setting('site','global','sitedisplayname') geo_location = get_setting('site','global','sitegeographiclocation') content = object.content content = strip_tags(content) #strips HTML tags content = unescape_entities(content) content = content.replace("\n","").replace("\r","") content = truncate_words(content, 50) # ~ about 250 chars ### Build string ----------------------- value = object.title value = '%s - %s' % (value, content) if site_name: value = '%s %s' % (value, site_name) else: if category: value = '%s, %s' % (value, category) if category and subcategory: value = '%s, %s' % (value, subcategory) value = '%s ' % value value = '%s %s %s' % ( value, site_name, geo_location) value = value.strip() return value
def generate_meta_keywords(value): """ Take any string and removes the html and html entities and then runs a keyword density analyizer on the text to decided the 20 best one word and two word key phrases """ try: from re import compile from operator import itemgetter from django.utils.text import unescape_entities from django.utils.translation import ugettext_lazy as _ # translate the stop words TR_STOP_WORDS = _(' '.join(STOP_WORDS)) TR_STOP_WORDS = TR_STOP_WORDS.split() # get rid of the html tags value = strip_tags(value) # get rid of the html entities value = unescape_entities(value) # lower case the value value = value.lower() # get the one word, two word, and three word patterns one_word_pattern = compile(r'\s*(\w+[a-zA-Z0-9:\-]*\w*(\'\w{1,2})?)') two_word_pattern = compile( r'\s*(\w+[a-zA-Z0-9:\-]*\w*(\'\w{1,2})?)(\s+|_)(\w+[a-zA-Z0-9:\-]*\w*(\'\w{1,2})?)' ) # get the length of the value value_length = len(value) # make a list of one words search_end = 0 one_words = [] while search_end < value_length: s = one_word_pattern.search(value, search_end) if s: one_word = s.group(1) # remove the : from the word if one_word[-1] == ':': one_word = one_word[:-1] one_words.append(one_word) search_end = s.end() else: break # remove the stop words one_words = [word for word in one_words if word not in TR_STOP_WORDS] # get the density, and word into a tuple one_words_length = len(one_words) unique_words = set(word for word in one_words) one_words = [ (word, round((one_words.count(word) * 100.00 / one_words_length), 2)) for word in unique_words ] # sort the tuple by the density one_words = sorted(one_words, key=itemgetter(1), reverse=True) # get the 10 best keywords one_words = [word[0] for word in one_words[:10]] # make a list of two words phrases without stop phrases search_end = 0 two_words = [] while search_end < value_length: s = two_word_pattern.search(value, search_end) if s: word1 = s.group(1) word2 = s.group(4) # remove the : from the words if word1[-1] == ':': word1 = word1[:-1] if word2[-1] == ':': word2 = word2[:-1] if word1 not in TR_STOP_WORDS: if word2 not in TR_STOP_WORDS: two_word = word1 + ' ' + word2 two_words.append(two_word) search_start = s.start() next_search = one_word_pattern.search(value, search_start) search_end = next_search.end() else: # if no match, advance a word s = one_word_pattern.search(value, search_end) if s: search_end = s.end() else: search_end = value_length # get the density, and word into a tuple two_words_length = len(two_words) unique_words = set(words for words in two_words) two_words = [(words, round( (two_words.count(words) * 100.00 / two_words_length), 2)) for words in unique_words] # sort the tuple by the density two_words = sorted(two_words, key=itemgetter(1), reverse=True) # get the best 2 word keywords two_words = [word[0] for word in two_words[:10]] # add the two lists together keywords = two_words + one_words return ','.join(keywords) except AttributeError: return ''
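# Hypothetical usage of generate_meta_keywords() above; the input string and
# the resulting keyword order are illustrative only, since the order depends
# on the computed densities and the translated stop-word list.
html_snippet = (
    "<p>Django &amp; Python make web development fast. "
    "Django templates and Python views keep the code organised.</p>"
)
keywords = generate_meta_keywords(html_snippet)
# keywords is a comma-separated string of the top two-word phrases followed
# by the top one-word keywords, e.g. "django templates,python views,...,django,python"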
def test_unescape_entities_deprecated(self):
    msg = (
        'django.utils.text.unescape_entities() is deprecated in favor of '
        'html.unescape().'
    )
    with self.assertWarnsMessage(RemovedInDjango40Warning, msg):
        text.unescape_entities('foo')
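# As the deprecation test above asserts, modern code should call
# html.unescape() from the standard library instead. A minimal sketch of the
# replacement, assuming Python 3 only:
from html import unescape

# html.unescape() resolves named, decimal and hexadecimal character
# references, matching what unescape_entities() used to do.
assert unescape('foo &amp; bar') == 'foo & bar'
assert unescape('&#38;') == '&'
assert unescape('&#x26;') == '&'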
def parse(self): """ Parse the POST data and break it into a FILES MultiValueDict and a POST MultiValueDict. Return a tuple containing the POST and FILES dictionary, respectively. """ from django.http import QueryDict encoding = self._encoding handlers = self._upload_handlers # HTTP spec says that Content-Length >= 0 is valid # handling content-length == 0 before continuing if self._content_length == 0: return QueryDict(encoding=self._encoding), MultiValueDict() # See if any of the handlers take care of the parsing. # This allows overriding everything if need be. for handler in handlers: result = handler.handle_raw_input( self._input_data, self._meta, self._content_length, self._boundary, encoding, ) # Check to see if it was handled if result is not None: return result[0], result[1] # Create the data structures to be used later. self._post = QueryDict(mutable=True) self._files = MultiValueDict() # Instantiate the parser and stream: stream = LazyStream(ChunkIter(self._input_data, self._chunk_size)) # Whether or not to signal a file-completion at the beginning of the loop. old_field_name = None counters = [0] * len(handlers) # Number of bytes that have been read. num_bytes_read = 0 # To count the number of keys in the request. num_post_keys = 0 # To limit the amount of data read from the request. read_size = None try: for item_type, meta_data, field_stream in Parser( stream, self._boundary): if old_field_name: # We run this at the beginning of the next loop # since we cannot be sure a file is complete until # we hit the next boundary/part of the multipart content. self.handle_file_complete(old_field_name, counters) old_field_name = None try: disposition = meta_data['content-disposition'][1] field_name = disposition['name'].strip() except (KeyError, IndexError, AttributeError): continue transfer_encoding = meta_data.get('content-transfer-encoding') if transfer_encoding is not None: transfer_encoding = transfer_encoding[0].strip() field_name = force_text(field_name, encoding, errors='replace') if item_type == FIELD: # Avoid storing more than DATA_UPLOAD_MAX_NUMBER_FIELDS. num_post_keys += 1 if (settings.DATA_UPLOAD_MAX_NUMBER_FIELDS is not None and settings.DATA_UPLOAD_MAX_NUMBER_FIELDS < num_post_keys): raise TooManyFieldsSent( 'The number of GET/POST parameters exceeded ' 'settings.DATA_UPLOAD_MAX_NUMBER_FIELDS.') # Avoid reading more than DATA_UPLOAD_MAX_MEMORY_SIZE. if settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None: read_size = settings.DATA_UPLOAD_MAX_MEMORY_SIZE - num_bytes_read # This is a post field, we can just set it in the post if transfer_encoding == 'base64': raw_data = field_stream.read(size=read_size) num_bytes_read += len(raw_data) try: data = base64.b64decode(raw_data) except binascii.Error: data = raw_data else: data = field_stream.read(size=read_size) num_bytes_read += len(data) # Add two here to make the check consistent with the # x-www-form-urlencoded check that includes '&='. num_bytes_read += len(field_name) + 2 if (settings.DATA_UPLOAD_MAX_MEMORY_SIZE is not None and num_bytes_read > settings.DATA_UPLOAD_MAX_MEMORY_SIZE): raise RequestDataTooBig( 'Request body exceeded settings.DATA_UPLOAD_MAX_MEMORY_SIZE.' ) self._post.appendlist( field_name, force_text(data, encoding, errors='replace')) elif item_type == FILE: # This is a file, use the handler... 
file_name = disposition.get('filename') if file_name: file_name = force_text(file_name, encoding, errors='replace') file_name = self.IE_sanitize( unescape_entities(file_name)) if not file_name: continue content_type, content_type_extra = meta_data.get( 'content-type', ('', {})) content_type = content_type.strip() charset = content_type_extra.get('charset') try: content_length = int( meta_data.get('content-length')[0]) except (IndexError, TypeError, ValueError): content_length = None counters = [0] * len(handlers) try: for handler in handlers: try: handler.new_file( field_name, file_name, content_type, content_length, charset, content_type_extra, ) except StopFutureHandlers: break for chunk in field_stream: if transfer_encoding == 'base64': # We only special-case base64 transfer encoding # We should always decode base64 chunks by multiple of 4, # ignoring whitespace. stripped_chunk = b"".join(chunk.split()) remaining = len(stripped_chunk) % 4 while remaining != 0: over_chunk = field_stream.read(4 - remaining) stripped_chunk += b"".join( over_chunk.split()) remaining = len(stripped_chunk) % 4 try: chunk = base64.b64decode(stripped_chunk) except Exception as exc: # Since this is only a chunk, any error is an unfixable error. raise MultiPartParserError( "Could not decode base64 data." ) from exc for i, handler in enumerate(handlers): chunk_length = len(chunk) chunk = handler.receive_data_chunk( chunk, counters[i]) counters[i] += chunk_length if chunk is None: # Don't continue if the chunk received by # the handler is None. break except SkipFile: self._close_files() # Just use up the rest of this file... exhaust(field_stream) else: # Handle file upload completions on next iteration. old_field_name = field_name else: # If this is neither a FIELD or a FILE, just exhaust the stream. exhaust(stream) except StopUpload as e: self._close_files() if not e.connection_reset: exhaust(self._input_data) else: # Make sure that the request data is all fed exhaust(self._input_data) # Signal that the upload has completed. for handler in handlers: retval = handler.upload_complete() if retval: break self._post._mutable = False return self._post, self._files
def content_raw(self):
    return strip_tags(
        unescape_entities(self.metadata['content']).replace(u'¶', ''))
def handle_noargs(self, **kwargs): try: verbosity = int(kwargs['verbosity']) except (KeyError, TypeError, ValueError): verbosity = 1 # Somehow, bizarely, there's a bug in Sphinx such that if I try to # build 1.0 before other versions, things fail in weird ways. However, # building newer versions first works. I suspect Sphinx is hanging onto # some global state. Anyway, we can work around it by making sure that # "dev" builds before "1.0". This is ugly, but oh well. for release in DocumentRelease.objects.order_by('-version'): self.UncompressHTML(release) if verbosity >= 1: print "Updating %s..." % release zipfilename = os.path.join(settings.OPENRAVE_DOCUMENT_ROOT_PATH, 'openravejson-%s.zip' % release.version) if not os.path.exists(zipfilename): print 'failed to find zipfile', zipfilename continue zipfiledir = os.path.splitext(zipfilename)[0] docsdir = os.path.join(zipfiledir, release.lang, 'sphinxjson') douncompress = True if os.path.exists(zipfiledir) and os.path.exists(docsdir): # check if timestamps of zipfile and dir match douncompress = os.stat(zipfiledir).st_mtime < os.stat( zipfilename).st_mtime if douncompress: print 'uncompressing', zipfilename try: zf = zipfile.ZipFile(zipfilename, 'r') except IOError, e: print e continue for files in zf.namelist(): zf.extract(files, settings.OPENRAVE_DOCUMENT_ROOT_PATH) zf.close() # have to touch zipfiledir incase zip file did not overwrite its timestamp os.utime(zipfiledir, None) # check if the language exists if not os.path.exists(docsdir): print 'language dir does not exist', docsdir continue # # Rebuild the imported document list and search index. # if not kwargs['reindex']: continue if verbosity >= 2: print " reindexing...", release.version # Build a dict of {path_fragment: document_object}. We'll pop values # out of this dict as we go which'll make sure we know which # remaining documents need to be deleted (and unindexed) later on. documents = dict( (doc.path, doc) for doc in release.documents.all()) # Walk the tree we've just built looking for ".fjson" documents # (just JSON, but Sphinx names them weirdly). Each one of those # documents gets a corresponding Document object created which # we'll then ask Sphinx to reindex. # # We have to be a bit careful to reverse-engineer the correct # relative path component, especially for "index" documents, # otherwise the search results will be incorrect. for dirpath, dirnames, filenames in os.walk(docsdir): for filename in filenames: basename, ext = os.path.splitext(filename) if ext == '.fjson': # Convert into a relative path for inclusion into the model if basename == 'index': path = os.path.normpath( os.path.relpath(dirpath, docsdir)) else: path = os.path.normpath( os.path.relpath( os.path.join(dirpath, basename), docsdir)) with open(os.path.join(dirpath, filename)) as fp: json_doc = json.load(fp) try: json_doc[ 'body'] # Just to make sure it exists. title = unescape_entities( strip_tags(json_doc['title'])) except KeyError, ex: if verbosity >= 2: print "Skipping: %s (no %s)" % (path, ex.args[0]) continue doc = documents.pop( path, Document(path=path, release=release)) doc.title = title doc.save()
def handle(self, **kwargs): try: verbosity = int(kwargs['verbosity']) except (KeyError, TypeError, ValueError): verbosity = 1 default_builders = ['json', 'html'] # Somehow, bizarely, there's a bug in Sphinx such that if I try to # build 1.0 before other versions, things fail in weird ways. However, # building newer versions first works. I suspect Sphinx is hanging onto # some global state. Anyway, we can work around it by making sure that # "dev" builds before "1.0". This is ugly, but oh well. for release in DocumentRelease.objects.order_by('-release'): if verbosity >= 1: self.stdout.write("Updating %s..." % release) # checkout_dir is shared for all languages. checkout_dir = settings.DOCS_BUILD_ROOT.joinpath(release.version) parent_build_dir = settings.DOCS_BUILD_ROOT.joinpath( release.lang, release.version) if not checkout_dir.exists(): checkout_dir.mkdir(parents=True) if not parent_build_dir.exists(): parent_build_dir.mkdir(parents=True) # # Update the release from SCM. # # Make a git checkout/update into the destination directory. self.update_git(release.scm_url, checkout_dir) source_dir = checkout_dir.joinpath('docs') if release.lang != 'en': scm_url = release.scm_url.replace( 'django.git', 'django-docs-translations.git') trans_dir = checkout_dir.joinpath('django-docs-translation') if not trans_dir.exists(): trans_dir.mkdir() self.update_git(scm_url, trans_dir) if not source_dir.joinpath('locale').exists(): source_dir.joinpath('locale').symlink_to( trans_dir.joinpath('translations')) subprocess.call("cd %s && make translations" % trans_dir, shell=True) if release.is_default: # Build the pot files (later retrieved by Transifex) builders = default_builders[:] + ['gettext'] else: builders = default_builders # # Use Sphinx to build the release docs into JSON and HTML documents. # for builder in builders: # Wipe and re-create the build directory. See #18930. build_dir = parent_build_dir.joinpath('_build', builder) if build_dir.exists(): shutil.rmtree(str(build_dir)) build_dir.mkdir(parents=True) if verbosity >= 2: self.stdout.write(" building %s (%s -> %s)" % (builder, source_dir, build_dir)) subprocess.check_call([ 'sphinx-build', '-j', '4', '-b', builder, '-D', 'language=%s' % release.lang, '-q', # Be vewy qwiet str(source_dir), # Source file directory str(build_dir), # Destination directory ]) # # Create a zip file of the HTML build for offline reading. # This gets moved into MEDIA_ROOT for downloading. # html_build_dir = parent_build_dir.joinpath('_build', 'html') zipfile_name = 'django-docs-%s-%s.zip' % (release.version, release.lang) zipfile_path = Path(settings.MEDIA_ROOT).joinpath( 'docs', zipfile_name) if not zipfile_path.parent.exists(): zipfile_path.parent.mkdir(parents=True) if verbosity >= 2: self.stdout.write(" build zip (into %s)" % zipfile_path) def zipfile_inclusion_filter(file_path): return '.doctrees' not in file_path.parts with closing( zipfile.ZipFile(str(zipfile_path), 'w', compression=zipfile.ZIP_DEFLATED)) as zf: for root, dirs, files in os.walk(str(html_build_dir)): for f in files: file_path = Path(os.path.join(root, f)) if zipfile_inclusion_filter(file_path): rel_path = str( file_path.relative_to(html_build_dir)) zf.write(str(file_path), rel_path) # # Copy the build results to the directory used for serving # the documentation in the least disruptive way possible. 
# build_dir = parent_build_dir.joinpath('_build') built_dir = parent_build_dir.joinpath('_built') subprocess.check_call([ 'rsync', '--archive', '--delete', '--link-dest={}'.format(build_dir), '{}/'.format(build_dir), str(built_dir) ]) # # Rebuild the imported document list and search index. # if not kwargs['reindex']: continue if verbosity >= 2: self.stdout.write(" reindexing...") # Build a dict of {path_fragment: document_object}. We'll pop values # out of this dict as we go which'll make sure we know which # remaining documents need to be deleted (and unindexed) later on. documents = dict( (doc.path, doc) for doc in release.documents.all()) # Walk the tree we've just built looking for ".fjson" documents # (just JSON, but Sphinx names them weirdly). Each one of those # documents gets a corresponding Document object created which # we'll then ask Sphinx to reindex. # # We have to be a bit careful to reverse-engineer the correct # relative path component, especially for "index" documents, # otherwise the search results will be incorrect. json_built_dir = parent_build_dir.joinpath('_built', 'json') for root, dirs, files in os.walk(str(json_built_dir)): for f in files: built_doc = Path(root, f) if built_doc.is_file() and built_doc.suffix == '.fjson': # Convert the built_doc path which is now an absolute # path (i.e. "/home/docs/en/1.2/_built/ref/models.json") # into a path component (i.e. "ref/models"). path = built_doc.relative_to(json_built_dir) if path.stem == 'index': path = path.parent path = str(path.parent.joinpath(path.stem)) # Read out the content and create a new Document object for # it. We'll strip the HTML tags here (for want of a better # place to do it). with open(str(built_doc)) as fp: json_doc = json.load(fp) try: json_doc[ 'body'] # Just to make sure it exists. title = unescape_entities( strip_tags(json_doc['title'])) except KeyError as ex: if verbosity >= 2: self.stdout.write("Skipping: %s (no %s)" % (path, ex.args[0])) continue doc = documents.pop( path, Document(path=path, release=release)) doc.title = title doc.save() DocumentDocType.index_object(doc) # Clean up any remaining documents. for doc in documents.values(): if verbosity >= 2: self.stdout.write("Deleting:", doc) try: DocumentDocType.unindex_object(doc) except ElasticsearchException: pass doc.delete()
def adjust_typo(texte, html=True): texte = smart_unicode(texte).strip() if not texte or (html and re.match(r'(\s*<(/?[^>]*[^>/]|br /)>\s*)+$', texte, re.UNICODE | re.IGNORECASE)): return u'' # TODO: add unit tests # TODO: in regex add code to ignore tags replacement if html: # remove HTML tags before processing text tokens = re.findall(u'<[^>]+>', texte) for idx, value in enumerate(tokens): texte = texte.replace(value, ']TAG%s[' % idx, 1) # replace OE and AE by their correct ligature, Œ and Æ. for old, new in ligatures: texte = texte.replace(old, new) # TODO: verify if these cases are cover # s/—/—/g; # s/ - / — /g; # s/--/—/g; # s/—/—/g; # s/ — / — /g; # s/—/—/g; # do some typographic adjustments (mostly putting non-breaking space where needed) regexs = [ (u' +', u' '), # remove more then one normal space (u' +', u' '), # remove more then one special space (u'«(\s| )+', u'« '), # make space non-breaking after « (u'(\s| )+»', u' »'), # make space non-breaking before » (u'«([^&])', u'« \g<1>'), # add non-breaking space after « (u'([^;])»', u'\g<1> »'), # add non-breaking space before » (u'(\s| )+(:|;|\?|!|$|%)', u' \g<2>'), # make space non-breaking before :, ?, !, $, % ( u'(\d)(\s| )+(cm)', u'\g<1> \g<3>' ), # put non-breaking space between groups in long numbers (ex.: 23 000) ( u'(\d)(\s| )+(\d{3})', u'\g<1> \g<3>' ), # put non-breaking space between groups in long numbers (ex.: 23 000) (u'(\s| )P\.(\s| )', u'\g<1>P. '), # put non-breaking space after Page abbreviation (u'(\s| )p\.', u' p.'), # put non-breaking space before page abbreviation (u' -- ', u' — '), # changed 2 hyphen in a EM dash (u'&(l|g)t;', u'&\g<1>t;' ), # to keep < and > as entities when doing unescape_entities ] if html: regexs.extend([ (u'(\d)(ème|e|es)(\s| |-)', u'\g<1><sup>\g<2></sup>\g<3>' ), # put number extension in exposant (ex. 2e) (u'([IVX])e(\s| )', u'\g<1><sup>e</sup>\g<2>' ), # put roman number extension in exposant (ex. Xe) (u'1er(\s| |-)', u'1<sup>er</sup>\g<1>'), # put 1 extension in exposant (ex. 1er) ]) for old, new in regexs: texte = re.sub(old, new, texte) # replace html tags at their good location if html: for idx, value in enumerate(tokens): texte = texte.replace(']TAG%s[' % idx, value, 1) # do more typographic adjustments with smartypants texte = typogrify.smartypants(texte) return unescape_entities(texte).strip()
def _unescape_and_unquote(s):
    if not s:
        return s
    return unescape_entities(unquote(s).decode('utf-8'))
def render(self, context, instance, placeholder):
    context = super(RevealMarkDownPlugin, self).render(context, instance, placeholder)
    content = unescape_entities(instance.glossary.get('markdown', ''))
    context['html_content'] = mark_safe(markdown.markdown(content))
    return context
def parse(self): """ Parse the POST data and break it into a FILES MultiValueDict and a POST MultiValueDict. Returns a tuple containing the POST and FILES dictionary, respectively. """ # We have to import QueryDict down here to avoid a circular import. from django.http import QueryDict encoding = self._encoding handlers = self._upload_handlers limited_input_data = LimitBytes(self._input_data, self._content_length) # See if the handler will want to take care of the parsing. # This allows overriding everything if somebody wants it. for handler in handlers: result = handler.handle_raw_input(limited_input_data, self._meta, self._content_length, self._boundary, encoding) if result is not None: return result[0], result[1] # Create the data structures to be used later. self._post = QueryDict('', mutable=True) self._files = MultiValueDict() # Instantiate the parser and stream: stream = LazyStream(ChunkIter(limited_input_data, self._chunk_size)) # Whether or not to signal a file-completion at the beginning of the loop. old_field_name = None counters = [0] * len(handlers) try: for item_type, meta_data, field_stream in Parser(stream, self._boundary): if old_field_name: # We run this at the beginning of the next loop # since we cannot be sure a file is complete until # we hit the next boundary/part of the multipart content. self.handle_file_complete(old_field_name, counters) old_field_name = None try: disposition = meta_data['content-disposition'][1] field_name = disposition['name'].strip() except (KeyError, IndexError, AttributeError): continue transfer_encoding = meta_data.get('content-transfer-encoding') field_name = force_unicode(field_name, encoding, errors='replace') if item_type == FIELD: # This is a post field, we can just set it in the post if transfer_encoding == 'base64': raw_data = field_stream.read() try: data = str(raw_data).decode('base64') except: data = raw_data else: data = field_stream.read() self._post.appendlist(field_name, force_unicode(data, encoding, errors='replace')) elif item_type == FILE: # This is a file, use the handler... file_name = disposition.get('filename') if not file_name: continue file_name = force_unicode(file_name, encoding, errors='replace') file_name = self.IE_sanitize(unescape_entities(file_name)) content_type = meta_data.get('content-type', ('',))[0].strip() try: charset = meta_data.get('content-type', (0,{}))[1].get('charset', None) except: charset = None try: content_length = int(meta_data.get('content-length')[0]) except (IndexError, TypeError, ValueError): content_length = None counters = [0] * len(handlers) try: for handler in handlers: try: handler.new_file(field_name, file_name, content_type, content_length, charset) except StopFutureHandlers: break for chunk in field_stream: if transfer_encoding == 'base64': # We only special-case base64 transfer encoding try: chunk = str(chunk).decode('base64') except Exception, e: # Since this is only a chunk, any error is an unfixable error. raise MultiPartParserError("Could not decode base64 data: %r" % e) for i, handler in enumerate(handlers): chunk_length = len(chunk) chunk = handler.receive_data_chunk(chunk, counters[i]) counters[i] += chunk_length if chunk is None: # If the chunk received by the handler is None, then don't continue. break except SkipFile, e: # Just use up the rest of this file... exhaust(field_stream) else: # Handle file upload completions on next iteration. old_field_name = field_name else: # If this is neither a FIELD or a FILE, just exhaust the stream. 
exhaust(stream) except StopUpload, e: if not e.connection_reset: exhaust(limited_input_data)
def _unescape_and_unquote(s):
    if not s:
        return s
    return unescape_entities(unquote(s))
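# Illustrative only: how a helper like the one above composes URL
# percent-decoding with entity unescaping (the sample string is made up).
from urllib.parse import unquote
from django.utils.text import unescape_entities

raw = 'Tom%20%26amp%3B%20Jerry'
step1 = unquote(raw)              # 'Tom &amp; Jerry'
step2 = unescape_entities(step1)  # 'Tom & Jerry'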
def sanitized_title(self):
    if self.title:
        return unescape_entities(
            bleach.clean(self.title, tags=[], strip=True))
    return _('(No title)')
def handle_noargs(self, **kwargs): try: verbosity = int(kwargs['verbosity']) except (KeyError, TypeError, ValueError): verbosity = 1 # Somehow, bizarely, there's a bug in Sphinx such that if I try to # build 1.0 before other versions, things fail in weird ways. However, # building newer versions first works. I suspect Sphinx is hanging onto # some global state. Anyway, we can work around it by making sure that # "dev" builds before "1.0". This is ugly, but oh well. for release in DocumentRelease.objects.order_by('-version'): if verbosity >= 1: print "Updating %s..." % release # checkout_dir is shared for all languages. checkout_dir = Path(settings.DOCS_BUILD_ROOT).child( release.version) parent_build_dir = Path(settings.DOCS_BUILD_ROOT).child( release.lang, release.version) if not checkout_dir.exists(): checkout_dir.mkdir(parents=True) if not parent_build_dir.exists(): parent_build_dir.mkdir(parents=True) # # Update the release from SCM. # # Make an SCM checkout/update into the destination directory. # Do this dynamically in case we add other SCM later. getattr(self, 'update_%s' % release.scm)(release.scm_url, checkout_dir) if release.docs_subdir: source_dir = checkout_dir.child( *release.docs_subdir.split('/')) else: source_dir = checkout_dir if release.lang != 'en': scm_url = release.scm_url.replace( 'django.git', 'django-docs-translations.git') trans_dir = checkout_dir.child('django-docs-translation') if not trans_dir.exists(): trans_dir.mkdir() getattr(self, 'update_%s' % release.scm)(scm_url, trans_dir) if not source_dir.child('locale').exists(): source_dir.child('locale').write_link( trans_dir.child('translations')) subprocess.call("cd %s && make translations" % trans_dir, shell=True) # # Use Sphinx to build the release docs into JSON and HTML documents. # for builder in ('json', 'html'): # Wipe and re-create the build directory. See #18930. build_dir = parent_build_dir.child('_build', builder) if build_dir.exists(): shutil.rmtree(build_dir) build_dir.mkdir(parents=True) # "Shell out" (not exactly, but basically) to sphinx-build. if verbosity >= 2: print " building %s (%s -> %s)" % (builder, source_dir, build_dir) sphinx.cmdline.main([ 'sphinx-build', '-b', builder, '-D', 'language=%s' % release.lang, '-q', # Be vewy qwiet source_dir, # Source file directory build_dir, # Destination directory ]) # # Create a zip file of the HTML build for offline reading. # This gets moved into MEDIA_ROOT for downloading. # html_build_dir = parent_build_dir.child('_build', 'html') zipfile_name = 'django-docs-%s-%s.zip' % (release.version, release.lang) zipfile_path = Path(settings.MEDIA_ROOT).child( 'docs', zipfile_name) if not zipfile_path.parent.exists(): zipfile_path.parent.mkdir(parents=True) if verbosity >= 2: print " build zip (into %s)" % zipfile_path def zipfile_inclusion_filter(f): return f.isfile() and '.doctrees' not in f.components() with closing(zipfile.ZipFile(zipfile_path, 'w')) as zf: for f in html_build_dir.walk(filter=zipfile_inclusion_filter): zf.write(f, html_build_dir.rel_path_to(f)) # # Copy the build results to the directory used for serving # the documentation in the least disruptive way possible. # build_dir = parent_build_dir.child('_build') built_dir = parent_build_dir.child('_built') subprocess.check_call([ 'rsync', '--archive', '--delete', '--link-dest=' + build_dir, build_dir + '/', built_dir ]) # # Rebuild the imported document list and search index. # if not kwargs['reindex']: continue if verbosity >= 2: print " reindexing..." 
# Build a dict of {path_fragment: document_object}. We'll pop values # out of this dict as we go which'll make sure we know which # remaining documents need to be deleted (and unindexed) later on. documents = dict( (doc.path, doc) for doc in release.documents.all()) # Walk the tree we've just built looking for ".fjson" documents # (just JSON, but Sphinx names them weirdly). Each one of those # documents gets a corresponding Document object created which # we'll then ask Sphinx to reindex. # # We have to be a bit careful to reverse-engineer the correct # relative path component, especially for "index" documents, # otherwise the search results will be incorrect. json_built_dir = parent_build_dir.child('_built', 'json') for built_doc in json_built_dir.walk(): if built_doc.isfile() and built_doc.ext == '.fjson': # Convert the built_doc path which is now an absolute # path (i.e. "/home/docs/en/1.2/_built/ref/models.json") # into a path component (i.e. "ref/models"). path = json_built_dir.rel_path_to(built_doc) if path.stem == 'index': path = path.parent path = str(path.parent.child(path.stem)) # Read out the content and create a new Document object for # it. We'll strip the HTML tags here (for want of a better # place to do it). with open(built_doc) as fp: json_doc = json.load(fp) try: json_doc['body'] # Just to make sure it exists. title = unescape_entities( strip_tags(json_doc['title'])) except KeyError, ex: if verbosity >= 2: print "Skipping: %s (no %s)" % (path, ex.args[0]) continue doc = documents.pop(path, Document(path=path, release=release)) doc.title = title doc.save() haystack.site.update_object(doc) # Clean up any remaining documents. for doc in documents.values(): if verbosity >= 2: print "Deleting:", doc haystack.site.remove_object(doc) doc.delete()
def sanitized_title(self):
    if self.title:
        return unescape_entities(bleach.clean(self.title, tags=[], strip=True))
    return _('(No title)')
def handle_raw_input(self, input_data, META, content_length, boundary, encoding=None):
    """
    Parse the raw input from the HTTP request and split items into fields
    and files, executing callback methods as necessary.

    Shamelessly adapted and borrowed from
    django.http.multipartparser.MultiPartParser.
    """
    # following suit from the source class, this is imported here to avoid
    # a potential circular import
    from django.http import QueryDict

    # create return values
    self.POST = QueryDict('', mutable=True)
    self.FILES = MultiValueDict()

    # initialize the parser and stream
    stream = LazyStream(ChunkIter(input_data, self.chunk_size))

    # whether or not to signal a file-completion at the beginning
    # of the loop.
    old_field_name = None
    counter = 0

    try:
        for item_type, meta_data, field_stream in Parser(stream, boundary):
            if old_field_name:
                # we run this test at the beginning of the next loop since
                # we cannot be sure a file is complete until we hit the
                # next boundary/part of the multipart content.
                file_obj = self.file_complete(counter)
                if file_obj:
                    # if we return a file object, add it to the files dict
                    self.FILES.appendlist(
                        force_text(old_field_name, encoding, errors='replace'),
                        file_obj)

                # wipe it out to prevent havoc
                old_field_name = None

            try:
                disposition = meta_data['content-disposition'][1]
                field_name = disposition['name'].strip()
            except (KeyError, IndexError, AttributeError):
                continue

            transfer_encoding = meta_data.get('content-transfer-encoding')
            if transfer_encoding is not None:
                transfer_encoding = transfer_encoding[0].strip()
            field_name = force_text(field_name, encoding, errors='replace')

            if item_type == FIELD:
                # this is a POST field
                if transfer_encoding == "base64":
                    raw_data = field_stream.read()
                    try:
                        data = base64.b64decode(raw_data)
                    except Exception:
                        data = raw_data
                else:
                    data = field_stream.read()

                self.POST.appendlist(
                    field_name, force_text(data, encoding, errors='replace'))

                # trigger listener
                self.field_parsed(field_name, self.POST.get(field_name))

            elif item_type == FILE:
                # this is a file
                file_name = disposition.get('filename')
                if not file_name:
                    continue

                # transform the file name
                file_name = force_text(file_name, encoding, errors='replace')
                file_name = self.IE_sanitize(unescape_entities(file_name))

                content_type = meta_data.get('content-type', ('',))[0].strip()
                try:
                    charset = meta_data.get('content-type', (0, {}))[1].get('charset', None)
                except Exception:
                    charset = None

                try:
                    file_content_length = int(meta_data.get('content-length')[0])
                except (IndexError, TypeError, ValueError):
                    file_content_length = None

                counter = 0

                # now, do the important file stuff
                try:
                    # alert on the new file
                    kwargs = {
                        'content_type': content_type,
                        'content_length': file_content_length,
                        'charset': charset,
                    }
                    self.new_file(field_name, file_name, **kwargs)

                    # chubber-chunk it
                    for chunk in field_stream:
                        # we need AES-compatible blocks (multiples of 16 bytes)
                        over_bytes = len(chunk) % 16
                        if over_bytes:
                            over_chunk = field_stream.read(16 - over_bytes)
                            chunk += over_chunk

                        if transfer_encoding == "base64":
                            try:
                                chunk = base64.b64decode(chunk)
                            except Exception as e:
                                # since this is only a chunk, any
                                # error is an unfixable error
                                raise MultiPartParserError(
                                    "Could not decode base64 data: %r" % e)

                        chunk_length = len(chunk)
                        self.receive_data_chunk(chunk, counter)
                        counter += chunk_length

                        if counter > settings.UPLOAD_FILE_SIZE_LIMIT:
                            raise SkipFile('File is too big.')

                    # ... and we're done
                except SkipFile:
                    # just eat the rest
                    exhaust(field_stream)
                else:
                    # handle file upload completions on next iteration
                    old_field_name = field_name

    except StopUpload as e:
        # if we get a request to stop the upload,
        # exhaust it if there was no connection reset
        if not e.connection_reset:
            exhaust(input_data)
    else:
        # make sure that the request data is all fed
        exhaust(input_data)

    # signal the upload has been completed
    self.upload_complete()

    return self.POST, self.FILES
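For context, a handler with a handle_raw_input like the one above is normally plugged in through Django's upload handler machinery; a minimal registration sketch, assuming the class is importable as myapp.handlers.EncryptedUploadHandler (dotted path and class name are hypothetical):

# settings.py -- run the custom handler before Django's defaults.
FILE_UPLOAD_HANDLERS = [
    "myapp.handlers.EncryptedUploadHandler",
    "django.core.files.uploadhandler.MemoryFileUploadHandler",
    "django.core.files.uploadhandler.TemporaryFileUploadHandler",
]

# Or per request, before request.POST / request.FILES are first accessed:
# request.upload_handlers.insert(0, EncryptedUploadHandler(request))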
def parse(self):
    """
    Parse the POST data and break it into a FILES MultiValueDict and a POST
    MultiValueDict.

    Returns a tuple containing the POST and FILES dictionary, respectively.
    """
    # We have to import QueryDict down here to avoid a circular import.
    from django.http import QueryDict

    encoding = self._encoding
    handlers = self._upload_handlers

    # HTTP spec says that Content-Length >= 0 is valid
    # handling content-length == 0 before continuing
    if self._content_length == 0:
        return QueryDict('', encoding=self._encoding), MultiValueDict()

    # See if any of the handlers take care of the parsing.
    # This allows overriding everything if need be.
    for handler in handlers:
        result = handler.handle_raw_input(self._input_data, self._meta,
                                          self._content_length, self._boundary,
                                          encoding)
        # Check to see if it was handled
        if result is not None:
            return result[0], result[1]

    # Create the data structures to be used later.
    self._post = QueryDict('', mutable=True)
    self._files = MultiValueDict()

    # Instantiate the parser and stream:
    stream = LazyStream(ChunkIter(self._input_data, self._chunk_size))

    # Whether or not to signal a file-completion at the beginning of the loop.
    old_field_name = None
    counters = [0] * len(handlers)

    try:
        for item_type, meta_data, field_stream in Parser(stream, self._boundary):
            if old_field_name:
                # We run this at the beginning of the next loop
                # since we cannot be sure a file is complete until
                # we hit the next boundary/part of the multipart content.
                self.handle_file_complete(old_field_name, counters)
                old_field_name = None

            try:
                disposition = meta_data['content-disposition'][1]
                field_name = disposition['name'].strip()
            except (KeyError, IndexError, AttributeError):
                continue

            transfer_encoding = meta_data.get('content-transfer-encoding')
            if transfer_encoding is not None:
                transfer_encoding = transfer_encoding[0].strip()
            field_name = force_text(field_name, encoding, errors='replace')

            if item_type == FIELD:
                # This is a post field, we can just set it in the post
                if transfer_encoding == 'base64':
                    raw_data = field_stream.read()
                    try:
                        data = base64.b64decode(raw_data)
                    except _BASE64_DECODE_ERROR:
                        data = raw_data
                else:
                    data = field_stream.read()

                self._post.appendlist(field_name,
                                      force_text(data, encoding, errors='replace'))
            elif item_type == FILE:
                # This is a file, use the handler...
                file_name = disposition.get('filename')
                if not file_name:
                    continue
                file_name = force_text(file_name, encoding, errors='replace')
                file_name = self.IE_sanitize(unescape_entities(file_name))

                content_type, content_type_extra = meta_data.get('content-type', ('', {}))
                content_type = content_type.strip()
                charset = content_type_extra.get('charset')

                try:
                    content_length = int(meta_data.get('content-length')[0])
                except (IndexError, TypeError, ValueError):
                    content_length = None

                counters = [0] * len(handlers)
                try:
                    for handler in handlers:
                        try:
                            handler.new_file(field_name, file_name,
                                             content_type, content_length,
                                             charset, content_type_extra)
                        except StopFutureHandlers:
                            break

                    for chunk in field_stream:
                        if transfer_encoding == 'base64':
                            # We only special-case base64 transfer encoding.
                            # We should always decode base64 chunks by a multiple of 4,
                            # ignoring whitespace.

                            stripped_chunk = b"".join(chunk.split())

                            remaining = len(stripped_chunk) % 4
                            while remaining != 0:
                                over_chunk = field_stream.read(4 - remaining)
                                stripped_chunk += b"".join(over_chunk.split())
                                remaining = len(stripped_chunk) % 4

                            try:
                                chunk = base64.b64decode(stripped_chunk)
                            except Exception as e:
                                # Since this is only a chunk, any error is an unfixable error.
                                msg = "Could not decode base64 data: %r" % e
                                six.reraise(MultiPartParserError,
                                            MultiPartParserError(msg),
                                            sys.exc_info()[2])

                        for i, handler in enumerate(handlers):
                            chunk_length = len(chunk)
                            chunk = handler.receive_data_chunk(chunk, counters[i])
                            counters[i] += chunk_length
                            if chunk is None:
                                # If the chunk received by the handler is None, then don't continue.
                                break

                except SkipFile:
                    self._close_files()
                    # Just use up the rest of this file...
                    exhaust(field_stream)
                else:
                    # Handle file upload completions on next iteration.
                    old_field_name = field_name
            else:
                # If this is neither a FIELD nor a FILE, just exhaust the stream.
                exhaust(stream)
    except StopUpload as e:
        self._close_files()
        if not e.connection_reset:
            exhaust(self._input_data)
    else:
        # Make sure that the request data is all fed
        exhaust(self._input_data)

    # Signal that the upload has completed.
    for handler in handlers:
        retval = handler.upload_complete()
        if retval:
            break

    return self._post, self._files
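The padding loop above works because base64 can only be decoded in complete 4-character groups; a standalone sketch of the same idea, with hypothetical names:

import base64

def iter_decoded_base64(read, chunk_size=8192):
    """Decode a base64 byte stream incrementally.

    `read(n)` is any callable returning up to n bytes (b'' when exhausted).
    Whitespace is ignored and each decode call is fed a multiple of 4 bytes.
    """
    while True:
        chunk = read(chunk_size)
        if not chunk:
            break
        stripped = b"".join(chunk.split())
        # Top the chunk up until it is a whole number of 4-byte groups.
        while len(stripped) % 4:
            extra = read(4 - len(stripped) % 4)
            if not extra:
                break
            stripped += b"".join(extra.split())
        if stripped:
            yield base64.b64decode(stripped)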
def sanitize_file_name(self, file_name):
    file_name = unescape_entities(file_name)
    # Cleanup Windows-style path separators.
    file_name = file_name[file_name.rfind('\\') + 1:].strip()
    return os.path.basename(file_name)
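Roughly what that buys you, assuming `parser` is an instance exposing the method above (behaviour inferred from the code, not tested):

# Only the last path component of a Windows-style name survives...
assert parser.sanitize_file_name("C:\\Users\\alice\\report.pdf") == "report.pdf"
# ...and a plain name passes through untouched.
assert parser.sanitize_file_name("notes.txt") == "notes.txt"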
def adjust_typo(texte, html=True):
    texte = smart_unicode(texte).strip()
    if not texte or (html and re.match(r'(\s*<(/?[^>]*[^>/]|br /)>\s*)+$', texte,
                                       re.UNICODE | re.IGNORECASE)):
        return u''

    # TODO: add unit tests
    # TODO: in regex add code to ignore tags replacement
    if html:
        # remove HTML tags before processing text
        tokens = re.findall(u'<[^>]+>', texte)
        for idx, value in enumerate(tokens):
            texte = texte.replace(value, ']TAG%s[' % idx, 1)

    # replace OE and AE by their correct ligatures, Œ and Æ.
    for old, new in ligatures:
        texte = texte.replace(old, new)

    # TODO: verify that these cases are covered
    # s/—/—/g;
    # s/ - / — /g;
    # s/--/—/g;
    # s/—/—/g;
    # s/ — / — /g;
    # s/—/—/g;

    # do some typographic adjustments (mostly putting non-breaking spaces where needed)
    regexs = [
        (u' +', u' '),  # remove more than one normal space
        (u' +', u' '),  # remove more than one special space
        (u'«(\s| )+', u'« '),  # make the space non-breaking after «
        (u'(\s| )+»', u' »'),  # make the space non-breaking before »
        (u'«([^&])', u'« \g<1>'),  # add a non-breaking space after «
        (u'([^;])»', u'\g<1> »'),  # add a non-breaking space before »
        (u'(\s| )+(:|;|\?|!|$|%)', u' \g<2>'),  # make the space non-breaking before :, ;, ?, !, $, %
        (u'(\d)(\s| )+(cm)', u'\g<1> \g<3>'),  # put a non-breaking space before the cm unit (ex.: 23 cm)
        (u'(\d)(\s| )+(\d{3})', u'\g<1> \g<3>'),  # put a non-breaking space between groups in long numbers (ex.: 23 000)
        (u'(\s| )P\.(\s| )', u'\g<1>P. '),  # put a non-breaking space after the Page abbreviation
        (u'(\s| )p\.', u' p.'),  # put a non-breaking space before the page abbreviation
        (u' -- ', u' — '),  # change two hyphens into an em dash
        (u'&(l|g)t;', u'&amp;\g<1>t;'),  # keep < and > as entities when doing unescape_entities
    ]
    if html:
        regexs.extend([
            (u'(\d)(ème|e|es)(\s| |-)', u'\g<1><sup>\g<2></sup>\g<3>'),  # put number suffixes in superscript (ex. 2e)
            (u'([IVX])e(\s| )', u'\g<1><sup>e</sup>\g<2>'),  # put Roman numeral suffixes in superscript (ex. Xe)
            (u'1er(\s| |-)', u'1<sup>er</sup>\g<1>'),  # put the 1er suffix in superscript
        ])
    for old, new in regexs:
        texte = re.sub(old, new, texte)

    # put the HTML tags back in their original locations
    if html:
        for idx, value in enumerate(tokens):
            texte = texte.replace(']TAG%s[' % idx, value, 1)

    # do more typographic adjustments with smartypants
    texte = typogrify.smartypants(texte)

    return unescape_entities(texte).strip()
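The tag-protection trick used above (swap tags for placeholders, run the substitutions, swap back) can be isolated into a small reusable helper; a sketch with hypothetical names:

import re

def process_outside_tags(text, transform):
    """Apply `transform` to text content while leaving HTML tags untouched."""
    tokens = re.findall(u'<[^>]+>', text)
    for idx, tag in enumerate(tokens):
        text = text.replace(tag, ']TAG%s[' % idx, 1)
    text = transform(text)
    for idx, tag in enumerate(tokens):
        text = text.replace(']TAG%s[' % idx, tag, 1)
    return text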
def convert_html_to_string(text):
    """
    Return the given HTML text with its tags stripped and its entities unescaped.
    """
    return unescape_entities(strip_tags(force_text(text)))
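A quick illustrative call (expected result follows from strip_tags removing the markup first and unescape_entities decoding what is left):

convert_html_to_string("<p>Fish &amp; chips</p>")  # -> "Fish & chips"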