def process_text(self, text):
    """Convert a piece of HTML text through ``html2md``.

    The text is passed through ``clean_html`` and parsed with
    ``fromstring``.  If parsing raises ``etree.XMLSyntaxError`` the raw
    text is handed to ``xhtmlim`` and that result is returned instead.
    Otherwise the parsed tree is processed in place by
    ``self._parse_element``, serialized as XML and fed to ``html2md``.

    :param text: markup to convert.
    :return: the value produced by ``html2md`` (or by ``xhtmlim`` when
        the markup cannot be parsed).
    """
    try:
        root = fromstring(clean_html(text))
    except etree.XMLSyntaxError:
        # Unparseable markup: fall back to the xhtmlim conversion.
        result = xhtmlim(text)
    else:
        self._parse_element(root)
        serialized = etree.tostring(root, method="xml", encoding=unicode)
        result = html2md(serialized)
    return result
def process_text(self, text):
    """Convert HTML text through ``html2md`` after removing ``tags`` blocks.

    The text is passed through ``clean_html`` and parsed with
    ``fromstring``; on ``etree.XMLSyntaxError`` the raw text is returned
    via ``xhtmlim``.  Every element carrying the CSS class ``tags`` is
    dropped from the tree before ``self._parse_element`` runs, and the
    remaining tree is serialized as XML and handed to ``html2md``.

    :param text: markup to convert.
    :return: the value produced by ``html2md`` (or by ``xhtmlim`` when
        the markup cannot be parsed).
    """
    try:
        root = fromstring(clean_html(text))
    except etree.XMLSyntaxError:
        # Unparseable markup: fall back to the xhtmlim conversion.
        return xhtmlim(text)

    # Strip the elements marked with class "tags" out of the tree.
    tag_blocks = root.find_class('tags')
    for tag_block in tag_blocks:
        tag_block.drop_tree()

    self._parse_element(root)
    xml_text = etree.tostring(root, method="xml", encoding=unicode)
    return html2md(xml_text)
def process_node_body(self, html):
    """Clean a node body and attach an ``Image`` record to every ``<img>``.

    The markup is cleaned with ``html_cleaner.clean_html``, passed through
    ``autolink``, has every ``%`` doubled, is parsed and run through
    ``filter_style``.  Each ``<img>`` element is then handled as follows:

    * no (or empty) ``src``: the element is removed from its parent;
    * ``src`` under ``djsettings.MEDIA_URL``: the matching ``Image`` row is
      looked up, first by the numeric id embedded in the element's ``id``
      attribute (when it starts with ``self.image_id_prefix``), then by the
      stored file path; a hit is appended to ``self.existing_images``;
    * anything else (including failed lookups): the file is fetched with
      ``download_image_file``, saved as a new ``Image`` and appended to
      ``self.new_images``.

    In the last two cases ``self.set_image_attributes(el, image)`` is
    called on the element.

    :param html: body markup to process.
    :return: the processed document serialized as unicode HTML.
    :raises ValidationError: re-raised unchanged when raised during
        processing, or raised in place of any other exception.  In both
        cases every image in ``self.new_images`` is deleted first.
    """
    try:
        html = autolink(html_cleaner.clean_html(html))
        # To stay on the safe side escape all % characters
        html = html.replace(u"%", u"%%")
        doc = filter_style(fromstring(html))
        # Iterate over a snapshot: elements may be removed inside the loop.
        for el in list(doc.iter(u'img')):
            # Undo the %-escaping for the URL only.  A missing src attribute
            # (get() returns None) is treated like an empty one.
            src = (el.get(u"src") or u"").replace(u"%%", u"%")
            if not src:
                el.getparent().remove(el)
                # Nothing to look up or download for a removed element.
                continue
            if src.startswith(djsettings.APP_URL):
                # str.replace returns a new string, so the result must be
                # rebound for the MEDIA_URL check below to see a
                # site-relative URL.
                # NOTE(review): assumes APP_URL ends with "/" (the reverse
                # conversion further down relies on the same) - confirm.
                src = src.replace(djsettings.APP_URL, u"/", 1)
            if src.startswith(djsettings.MEDIA_URL):
                elem_id = el.get(u"id")
                # First attempt: resolve the image by the id embedded in the
                # element's id attribute.
                try:
                    if elem_id and elem_id.startswith(self.image_id_prefix):
                        image_id = int(elem_id.replace(self.image_id_prefix, ""))
                        image = Image.objects.get(id=image_id)
                        self.existing_images.append(image)
                        self.set_image_attributes(el, image)
                        continue
                except Exception:
                    # Best effort: log and fall through to the path lookup.
                    logger.error(u'Malformed id (%s)found on our own img url %s' % (elem_id, src))
                # Second attempt: resolve the image by its stored file path.
                try:
                    image = Image.objects.get(image=src.replace(djsettings.MEDIA_URL, u""))
                    self.set_image_attributes(el, image)
                    self.existing_images.append(image)
                    continue
                except Exception:
                    # Best effort: log and fall through to a fresh download.
                    logger.error(u'Unable to locate img stored under url %s in Image table' % src)
            if src.startswith(u"/"):
                # Site-relative URL: make it absolute again for downloading.
                src = src.replace(u"/", djsettings.APP_URL, 1)
            image_file = download_image_file(src)
            image = Image(image=image_file, upload_url=src)
            image.save()
            self.new_images.append(image)
            self.set_image_attributes(el, image)
        return etree.tounicode(doc, method="html")
    except ValidationError:
        # Roll back the images created so far, then let the error through.
        for image in self.new_images:
            image.delete()
        raise
    except Exception:
        logger.exception(u'Unhandled exception while parsing "%s" body' % html)
        for image in self.new_images:
            image.delete()
        raise ValidationError(_(u"Unexpected error happened :("))
def process_entry(self, entry):
    """Extend the parent's processed entry with a ``'tags'`` list.

    The parent ``process_entry`` result is taken as the base.  The entry's
    ``'summary'`` markup is parsed, and for every element with class
    ``tags`` the stripped text of each nested element with class ``tag``
    is collected, in document order.

    :param entry: mapping with at least a ``'summary'`` key holding markup.
    :return: the parent's result with ``'tags'`` set to the collected list.
    """
    result = super(LorFeedProcessor, self).process_entry(entry)
    collected = result['tags'] = []
    summary_root = fromstring(entry['summary'])
    for container in summary_root.find_class('tags'):
        collected.extend(
            tag_el.text.strip() for tag_el in container.find_class('tag')
        )
    return result
def process_text(self, text):
    """Convert HTML text through ``html2md``, discarding content after a cut.

    The text is passed through ``clean_html`` and parsed with
    ``fromstring``; on ``etree.XMLSyntaxError`` the raw text is returned
    via ``xhtmlim``.  The direct children of the parsed tree are scanned
    for an ``<a>`` element whose ``name`` attribute starts with ``cutid``
    (case-insensitive).  That anchor and every child element after it are
    dropped with ``drop_tree()`` before ``self._parse_element`` runs and
    the tree is serialized as XML for ``html2md``.

    :param text: markup to convert.
    :return: the value produced by ``html2md`` (or by ``xhtmlim`` when
        the markup cannot be parsed).
    """
    try:
        tree = fromstring(clean_html(text))
    except etree.XMLSyntaxError:
        return xhtmlim(text)
    cut_reached = False
    # Iterate over a snapshot: elements are dropped inside the loop.
    for el in list(tree.iterchildren()):
        if not cut_reached and el.tag.lower() == 'a':
            # get() returns None for anchors without a name attribute
            # (ordinary links); those are never cut markers.
            anchor_name = el.get('name') or ''
            cut_reached = anchor_name.lower().startswith('cutid')
        if cut_reached:
            el.drop_tree()
    self._parse_element(tree)
    return html2md(etree.tostring(tree, method="xml", encoding=unicode))