def get_html(url):
    """Return the tag-free text of the ``div#article_body`` element.

    ``url`` is parsed with ``parse_qs`` and the first value of its ``url``
    key is the address handed to ``pq(url=...)``. The selected element is
    passed through ``clean_html`` and ``strip_tags``; finally every
    occurrence of the ``' '`` literal is removed from the text.

    Raises ``KeyError`` if the parsed mapping has no ``url`` key.
    """
    query_params = parse_qs(url)
    target_url = query_params['url'][0]

    page = pq(url=target_url)
    body_markup = clean_html(page('div#article_body'))

    plain_text = strip_tags(body_markup)
    return plain_text.replace(' ', '')
def dump_studentmodules(self, module, display_header, display_prompt, deanonymize):
    """Build an HTML dump of the 'combinedopenended' answers for one module.

    ``module`` is a usage-key string; it is parsed with
    ``UsageKey.from_string`` and used to filter ``StudentModule`` rows of
    type ``combinedopenended``. Each matching row is written into the
    document by ``self.dump_studentmodule_answer``.

    Returns a ``(filename, html)`` tuple. ``filename`` is the parsed key
    formatted as text plus ``.html``, with ``:`` and ``/`` replaced by ``-``.
    ``html`` is the prettified document prefixed with a DOCTYPE line.
    """
    usage_key = UsageKey.from_string(module)
    student_modules = StudentModule.objects.filter(
        module_state_key=usage_key,
        module_type='combinedopenended',
    )

    filename = "{0}.html".format(usage_key)
    filename = filename.replace(':', '-').replace('/', '-')

    with io.StringIO() as handle:
        # Page skeleton: heading and prompt first, then one entry per row.
        handle.write(u'<html><head></head><body>')
        handle.write(u'<h1>Задание "{0}"</h1>\n\n'.format(display_header))
        handle.write(u'<p>{0}</p>\n\n'.format(display_prompt))
        for student_module in student_modules:
            self.dump_studentmodule_answer(student_module, handle, deanonymize)
        handle.write(u'</body></html>')
        # The buffer must be read before the ``with`` block closes it.
        filedata = handle.getvalue()

    soup = BeautifulSoup(clean_html(filedata))
    # Declare the charset explicitly in the document head.
    metatag = soup.new_tag('meta')
    metatag.attrs['charset'] = 'UTF-8'
    soup.head.append(metatag)

    document = u"<!DOCTYPE html>\n" + soup.prettify()
    return (filename, document)
def write_rst(request, rst_template, context, filename=None):
    """Render ``rst_template`` with ``context`` and write it to an .rst file.

    The file is placed in ``<settings.MEDIA_ROOT>/resume_download/`` (created
    if missing) and named ``<filename>.rst``. When ``filename`` is falsy the
    name comes from ``_get_default_filename()``.

    ``request`` is accepted for interface compatibility and is not used here.

    A ``UnicodeEncodeError`` raised while writing is logged (offending span
    plus some surrounding context) and NOT re-raised, so the returned path
    may point at an incomplete file in that case.

    Returns the full path of the .rst file.
    """
    if not filename:
        filename = _get_default_filename()

    destination = os.path.join(settings.MEDIA_ROOT, 'resume_download')
    destination_rst = os.path.join(destination, '%s.rst' % filename)
    if not os.path.exists(destination):
        os.makedirs(destination)

    # Render before opening the target: opening with 'w+' truncates the file,
    # so a template failure would otherwise leave an empty file behind.
    t = loader.get_template(rst_template)
    rst_content = html.clean_html(t.render(Context(context)))
    logger.debug("Writing %s bytes to %s", len(rst_content), destination_rst)
    logger.debug("RST content:\n%s", rst_content)

    # The context manager closes the file on every path; no explicit close().
    with open(destination_rst, 'w+') as f:
        try:
            f.write(rst_content)
        except UnicodeEncodeError as e:
            # Best-effort diagnostics: the bad span, then the span with up to
            # 20 characters of context on each side. Clamp the lower bound so
            # an error near the start does not turn into a negative index.
            context_start = max(0, e.start - 20)
            logger.error(rst_content[e.start:e.end])
            logger.error(rst_content[context_start:e.end + 20])
            logger.error("%d, %d" % (e.start, e.end))

    return destination_rst
def dump_studentmodules(self, module, display_header, display_prompt, deanonymize):
    """Collect the 'combinedopenended' StudentModule rows of a module as HTML.

    The ``module`` string is converted with ``UsageKey.from_string``; rows of
    ``StudentModule`` whose ``module_state_key`` equals that key and whose
    ``module_type`` is ``combinedopenended`` are rendered one by one through
    ``self.dump_studentmodule_answer`` into an in-memory buffer.

    Returns ``(filename, html)``: the file name derived from the key (``:``
    and ``/`` become ``-``, suffix ``.html``) and the prettified document
    preceded by a DOCTYPE declaration.
    """
    parsed_key = UsageKey.from_string(module)
    matching_rows = StudentModule.objects.filter(
        module_state_key=parsed_key, module_type='combinedopenended'
    )
    filename = "{0}.html".format(parsed_key).replace(':', '-').replace('/', '-')

    with io.StringIO() as handle:
        # Header and prompt open the body; answers follow in query order.
        handle.write(u'<html><head></head><body>')
        handle.write(u'<h1>Задание "{0}"</h1>\n\n'.format(display_header))
        handle.write(u'<p>{0}</p>\n\n'.format(display_prompt))
        for row in matching_rows:
            self.dump_studentmodule_answer(row, handle, deanonymize)
        handle.write(u'</body></html>')
        filedata = handle.getvalue()

        cleaned = clean_html(filedata)
        soup = BeautifulSoup(cleaned)

        # Add <meta charset="UTF-8"> to the head of the parsed document.
        metatag = soup.new_tag('meta')
        metatag.attrs['charset'] = 'UTF-8'
        soup.head.append(metatag)

        return (filename, u"<!DOCTYPE html>\n" + soup.prettify())
def products_info():
    """Crawl product listing pages per category and store each product.

    For every ``tb_category_info`` row selected with ``pid__ne='0'`` the
    listing URL is built from ``PRODUCTS_BASE_URL``, ``cid`` and ``url``.
    Listing pages are followed through ``next_pattern`` until it no longer
    matches. Every product URL found by ``product_url_pattern`` is fetched,
    parsed with the module-level patterns and saved as a ``tb_product_info``
    row.

    Failures on a single product are not fatal: the traceback is printed and
    the URL is appended to ``product_error.txt``, then the crawl continues.

    ``print(...)`` is used with a single argument throughout so the function
    parses on both Python 2 and Python 3.
    """
    results = tb_category_info.select().where(pid__ne='0').execute()
    for result in results:
        url_path = PRODUCTS_BASE_URL + result.cid + '/' + result.url
        while url_path:
            print(url_path)
            resp = session.get(url_path)

            # findall() returns a list: the first hit is the next page, an
            # empty list is falsy and ends the while loop after this page.
            url_path = next_pattern.findall(resp.text)
            if url_path:
                url_path = url_path[0]

            urls = product_url_pattern.findall(resp.text)
            for url in urls:
                try:
                    print(url)
                    product_id, product_name = product_id_pattern.findall(url)[0]
                    product_resp = session.get(url)
                    company_id = company_id_pattern.findall(product_resp.text)[0]
                    product_description = product_description_pattern.findall(
                        product_resp.text)[0].strip()

                    product_info = {
                        'cid': result.cid,
                        'product_id': product_id,
                        'company_id': company_id,
                        'product_name': product_name.strip(),
                        'description': clean_html(product_description),
                    }
                    print(product_info)
                    tb_product_info(**product_info).save()
                except Exception:
                    # Deliberate best-effort: record the failing URL and go on
                    # with the next product instead of aborting the crawl.
                    print(traceback.format_exc())
                    with open('product_error.txt', 'a') as error_log:
                        error_log.write(url + '\n')
                    print('出错')
def save(self, force_insert=False, force_update=False):
    """Clean ``self.html`` and fill in a missing announce before saving.

    ``self.html`` is always replaced by ``clean_html(self.html)``. When
    ``self.anounce`` is empty and the cleaned HTML is not, the announce is
    set to the first 100 words of the tag-stripped HTML. The parent class's
    ``save`` is then called with the same two flags.
    """
    from django.utils.html import clean_html
    from django.utils.html import strip_tags
    from django.utils.text import truncate_words

    self.html = clean_html(self.html)

    if not self.anounce:
        if self.html:
            plain_text = strip_tags(self.html)
            self.anounce = truncate_words(plain_text, 100)

    # Delegate to the parent implementation with the caller's flags.
    super(News, self).save(force_insert, force_update)
def save(self, *args, **kwargs):
    """Clean the post content, save it, and ensure its counters exist.

    ``self.content`` is replaced by ``clean_html(self.content)`` and the
    parent ``save`` is called with the given arguments. Afterwards a
    ``PostMeta`` row is fetched or created for each of the ``views`` and
    ``comments_count`` keys; a newly created row gets ``meta_value`` ``'0'``.
    """
    self.content = clean_html(self.content)
    super(Post, self).save(*args, **kwargs)

    # Counters start at '0' only when their PostMeta row did not exist yet;
    # existing rows are left untouched.
    for counter_key in ('views', 'comments_count'):
        meta, is_new = PostMeta.objects.get_or_create(post=self, meta_key=counter_key)
        if not is_new:
            continue
        meta.meta_value = '0'
        meta.save()
def save(self, *args, **kwargs):
    """Clean the post content (best effort), save, and ensure counters exist.

    ``*args`` / ``**kwargs`` are forwarded unchanged to the parent ``save``,
    so callers that pass arguments (for example ``force_insert`` or
    ``using``) work; calling ``save()`` with no arguments behaves as before.

    Cleaning ``self.content`` with ``html.clean_html`` is best-effort: if it
    raises, the failure is logged and the content is saved as it was.

    After saving, a ``PostMeta`` row is fetched or created for the ``views``
    and ``comments_count`` keys; a newly created row gets ``meta_value``
    ``'0'``.
    """
    import logging

    try:
        self.content = html.clean_html(self.content)
    except Exception:
        # Keep the original "never block a save on cleaning" behaviour, but
        # do not swallow KeyboardInterrupt/SystemExit and leave a trace.
        logging.getLogger(__name__).exception(
            "clean_html failed; saving post content unchanged")

    super(Post, self).save(*args, **kwargs)

    # Initialise each counter to '0' only when its PostMeta row is new.
    for meta_key in ('views', 'comments_count'):
        pm, created = PostMeta.objects.get_or_create(post=self, meta_key=meta_key)
        if created:
            pm.meta_value = '0'
            pm.save()
def render_content(content, text_type, images=None):
    """Render ``content`` to markup according to ``text_type``.

    ``text_type`` is converted with ``int()`` and compared against the
    ``MARKUP_*`` constants to pick the renderer: escaped plain text with
    line breaks, cleaned HTML, Textile, Markdown or reStructuredText.

    ``images`` is accepted for interface compatibility; this function does
    not use it.

    Returns the rendered string; ``''`` when ``content`` is falsy;
    ``'UNKNOWN CONTENT <n>'`` for an unrecognised type; ``'ERROR: <msg>'``
    when a renderer raises ``template.TemplateSyntaxError``.
    """
    try:
        if not content:
            return ''
        text_type = int(text_type)
        if text_type == MARKUP_PLAIN_TEXT:
            ret = html.linebreaks(html.escape(content))
        elif text_type == MARKUP_HTML:
            ret = html.clean_html(content)
        elif text_type == MARKUP_TEXTILE:
            ret = markup.textile(content)
        elif text_type == MARKUP_MARKDOWN:
            ret = markup.markdown(content)
        elif text_type == MARKUP_REST:
            ret = markup.restructuredtext(content)
        else:
            # Not expected for valid callers; report instead of failing.
            return 'UNKNOWN CONTENT %d' % text_type
    except template.TemplateSyntaxError as err:
        # "except ... as" is valid on Python 2.6+ and Python 3 alike.
        return 'ERROR: %s' % err
    # Hand the rendered markup back; without this the successful branches
    # would fall off the end of the function and yield None.
    return ret
def changelog_entry(request, slug):
    """Render the ``changelog/change.html`` page for the change with ``slug``.

    ``slug`` is passed through ``clean_html`` before it is used to look up a
    ``models.Change`` via ``get_object_or_404`` (presumably Django's shortcut
    that answers with a 404 when nothing matches -- confirm against the
    file's imports).
    """
    slug = clean_html(slug)
    change = get_object_or_404(models.Change, slug = slug)
    # NOTE: locals() passes every name bound so far (request, slug, change)
    # to PageContext, so renaming or reordering locals here changes what the
    # context receives.
    context = PageContext(request, "Change:%s" %change.title, d = locals())
    return direct_to_template(request, template = 'changelog/change.html', extra_context = context)
def custom_cliente(self, obj):
    """Return ``obj.cliente`` wrapped in a ``#`` anchor, run through clean_html."""
    anchor_markup = "<a href='#'>%s</a>" % obj.cliente
    return clean_html(anchor_markup)