def parse_html_lxml(html):
    """Parse an HTML logfile with lxml and return a list of Message objects.

    Every ``div`` directly under ``/html/body`` is examined.  A div is kept
    only when its ``class`` attribute contains ``message`` and, once that
    word is removed and the rest stripped, equals ``incoming`` or
    ``outgoing``.  Any exception raised while extracting a div's fields is
    reported through ``print_exc()`` and that div is skipped.
    """
    parsed = []
    document = lxml.html.document_fromstring(html, parser=lxmlparser())

    for node in document.xpath('//html/body/div'):
        try:
            css_class = node.attrib.get('class', '')
            if 'message' not in css_class:
                continue

            # What is left after dropping 'message' is the direction.
            direction = css_class.replace('message', '').strip()
            if direction not in ('incoming', 'outgoing'):
                continue

            buddy_name = node.find_class('buddy')[0].text

            # The timestamp attribute is optional; None is passed through.
            raw_stamp = node.attrib.get('timestamp')
            when = None if raw_stamp is None else parse_timestamp(raw_stamp)

            body = render_contents(node.find_class('msgcontent')[0])
            is_auto = boolify(node.attrib.get('auto', 'false'))
        except Exception:
            print_exc()
            continue

        # Built outside the try so construction errors are not swallowed.
        parsed.append(Message(
            buddy=S(name=buddy_name),
            timestamp=when,
            message=body,
            type=direction,
            auto=is_auto,
            has_autotext=is_auto,
        ))

    return parsed
def from_xml(self, x):
    """Fill this object's attributes from the Atom element ``x``.

    Children of ``x`` are looked up by attribute access using the
    ``{namespace}tag`` form with ``constants.NS.Atom`` as the namespace.
    The values are written straight into ``self.__dict__`` under the keys
    ``id``, ``title``, ``author_id``, ``author_url``, ``author_name``,
    ``source``, ``url``, ``icon_url``, ``preview_url`` and ``contents``.

    NOTE: attribute values are passed to ``unicode()`` unconditionally, so
    a link without ``href`` (or a content without ``type``) is stored as
    the text ``u'None'`` rather than ``None``.
    """
    def atom_child(tag, fallback=None):
        # Namespace-qualified attribute lookup on x.
        return getattr(x, '{%s}%s' % (constants.NS.Atom, tag), fallback)

    entry_id = unicode(atom_child('id') or '')
    entry_title = unicode(atom_child('title') or '')

    author = atom_child('author')
    if author is not None:
        author_name = unicode(author.name)
        # The author's uri serves as both its id and its url.
        author_id = author_url = unicode(author.uri)
    else:
        author_name = author_id = author_url = None

    source_element = atom_child('source')
    if source_element is not None:
        source = ActivitySource()
        source.populate(source_element, InputType.XML)
    else:
        source = None

    icon_url = preview_url = url = None
    for link in atom_child('link', []):
        rel = link.attrib.get('rel')
        if rel == 'icon':
            icon_url = unicode(link.attrib.get('href'))
        elif rel == 'preview':
            preview_url = unicode(link.attrib.get('href'))
        elif rel == 'alternate' and url is None:
            # Only the first alternate link is kept.
            url = unicode(link.attrib.get('href'))

    contents = [
        (unicode(node.attrib.get('type')), htmlutils.render_contents(node))
        for node in atom_child('content', [])
    ]

    self.__dict__.update({
        'id': entry_id,
        'title': entry_title,
        'author_id': author_id,
        'author_url': author_url,
        'author_name': author_name,
        'source': source,
        'url': url,
        'icon_url': icon_url,
        'preview_url': preview_url,
        'contents': contents,
    })
def from_xml(self, x):
    """Populate this object from the Atom element ``x``.

    Each child is fetched from ``x`` with ``getattr`` using a
    ``{namespace}tag`` name built from ``constants.NS.Atom``.  The id and
    title are converted to unicode (empty when absent or falsy), the author
    and source are optional, link hrefs are sorted by their ``rel`` value,
    and every content child becomes a ``(type, rendered)`` pair.  The
    results are stored directly in ``self.__dict__``.

    NOTE: ``unicode()`` is applied to ``attrib.get(...)`` as-is, so an
    absent ``href`` or ``type`` attribute ends up as the text ``u"None"``.
    """
    def lookup(name, default=None):
        # Resolve the Atom-namespaced child called `name` on x.
        qualified = "{%s}%s" % (constants.NS.Atom, name)
        return getattr(x, qualified, default)

    ident = unicode(lookup("id") or "")
    heading = unicode(lookup("title") or "")

    author_node = lookup("author")
    if author_node is None:
        author_name = None
        author_id = None
        author_url = None
    else:
        author_name = unicode(author_node.name)
        author_uri = unicode(author_node.uri)
        # One uri value is reused for both the id and the url.
        author_id = author_uri
        author_url = author_uri

    source = None
    source_node = lookup("source")
    if source_node is not None:
        source = ActivitySource()
        source.populate(source_node, InputType.XML)

    url = None
    icon_url = None
    preview_url = None
    for link_node in lookup("link", []):
        relation = link_node.attrib.get("rel")
        if relation == "icon":
            icon_url = unicode(link_node.attrib.get("href"))
        elif relation == "preview":
            preview_url = unicode(link_node.attrib.get("href"))
        elif relation == "alternate" and url is None:
            # The first alternate link wins; later ones are ignored.
            url = unicode(link_node.attrib.get("href"))

    contents = []
    for content_node in lookup("content", []):
        content_type = unicode(content_node.attrib.get("type"))
        rendered = htmlutils.render_contents(content_node)
        contents.append((content_type, rendered))

    self.__dict__.update(
        id=ident,
        title=heading,
        author_id=author_id,
        author_url=author_url,
        author_name=author_name,
        source=source,
        url=url,
        icon_url=icon_url,
        preview_url=preview_url,
        contents=contents,
    )
def parse_html_lxml(html):
    """Parse a logfile with lxml, returning its messages as a list.

    Walks the ``div`` elements directly under ``/html/body``.  Only divs
    whose ``class`` contains ``message`` and otherwise reduces to
    ``incoming`` or ``outgoing`` produce a Message.  A div whose fields
    cannot be extracted is reported via ``print_exc()`` and left out.
    """
    results = []
    tree = lxml.html.document_fromstring(html, parser=lxmlparser())
    candidates = tree.xpath('//html/body/div')

    for element in candidates:
        try:
            class_attr = element.attrib.get('class', '')
            if 'message' not in class_attr:
                continue

            # Remove the 'message' marker; the remainder names the kind.
            kind = class_attr.replace('message', '').strip()
            if kind not in ('incoming', 'outgoing'):
                continue

            buddy_element = element.find_class('buddy')[0]
            sender = buddy_element.text

            stamp = element.attrib.get('timestamp')
            if stamp is not None:
                stamp = parse_timestamp(stamp)

            content_element = element.find_class('msgcontent')[0]
            text = render_contents(content_element)

            auto_flag = boolify(element.attrib.get('auto', 'false'))
        except Exception:
            print_exc()
        else:
            # Reached only when every field above was extracted cleanly.
            entry = Message(
                buddy=S(name=sender),
                timestamp=stamp,
                message=text,
                type=kind,
                auto=auto_flag,
                has_autotext=auto_flag,
            )
            results.append(entry)

    return results
def tree_to_string(tree):
    """Return whatever ``render_contents`` produces for ``tree``."""
    rendered = render_contents(tree)
    return rendered