def _fetch_from_cache(language, url):
    """Return a CMS page split into titled sections, using the cache for HTML.

    The page HTML is read from the module-level ``cache`` under the key
    produced by ``utils.get_cms_url(language, url)``; on a miss it is
    fetched with ``utils.get_cms_page(language, url)`` and stored.

    The HTML is then parsed, every ``.toc`` element and the ``.page-title``
    element are detached, and the direct children of the first
    ``.cms-content`` element are grouped into sections: each child that
    contains a ``.section-header`` starts a section, and the children up to
    the next such header form that section's body.

    :param language: passed through to the ``utils`` helpers.
    :param url: passed through to the ``utils`` helpers.
    :returns: ``{"title": <text of .page-title>, "contents": [section, ...]}``
        where each section is a dict with the keys ``is_important``,
        ``title``, ``body`` (serialized HTML) and ``icon`` (the ``src`` of
        the ``.title-icon`` at the same position, or ``""``).
    :raises IndexError: if the page has no ``.page-title`` or
        ``.cms-content`` element, or a section header has no ``a[name]``.
    """
    from . import utils

    cms_url = utils.get_cms_url(language, url)

    # One lookup instead of ``in`` + ``get``: the entry can no longer vanish
    # between the membership test and the read.
    # NOTE(review): assumes ``cache.get`` returns None on a miss (Django
    # cache semantics) -- confirm for the backend in use.
    html = cache.get(cms_url)
    if html is None:
        html = utils.get_cms_page(language, url)
        cache.set(cms_url, html)

    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser).getroot()

    # Removing all table of contents
    for table in CSSSelector('.toc')(tree):
        table.getparent().remove(table)

    title = CSSSelector('.page-title')(tree)[0]
    title.getparent().remove(title)

    elements = list(CSSSelector('.cms-content')(tree)[0])
    has_section_header = CSSSelector('.section-header')
    headers = [i for i, e in enumerate(elements) if has_section_header(e)]

    title_icons = list(CSSSelector('.title-icon')(tree))

    # Strip the ``dir`` attribute from every top-level element once, before
    # anything is serialized (the result does not depend on the section
    # being processed).
    for e in elements:
        if 'dir' in e.attrib:
            del e.attrib['dir']

    find_anchor = CSSSelector('a[name]')
    find_important = CSSSelector('.important')

    page_contents = []
    for i, h in enumerate(headers):
        icon = ""
        if i < len(title_icons) and 'src' in title_icons[i].attrib:
            icon = title_icons[i].attrib['src']

        element = elements[h]
        # A section runs up to the next header, or to the end for the last.
        if (i + 1) == len(headers):
            contents = elements[h + 1:]
        else:
            contents = elements[h + 1:headers[i + 1]]

        section_title = find_anchor(element)[0].text
        section_body = "".join(
            [etree.tostring(c, pretty_print=True, method="html")
             for c in contents]
        )

        page_contents.append({
            "is_important": bool(find_important(element)),
            "title": section_title,
            "body": section_body,
            "icon": icon
        })

    return {
        "title": title.text,
        "contents": page_contents
    }
def get_or_create_head(root):
    """Return the <head> element of `root`, creating it if there is none.

    When no ``head`` element is found, a new one is created and inserted
    as the first child of the parent of the first ``body`` element.

    :raises IndexError: if a <head> has to be created but `root` has no
        ``body`` element.
    """
    matches = CSSSelector('head')(root)
    if matches:
        return matches[0]

    # No <head> yet: build one and place it first next to <body>.
    new_head = etree.Element('head')
    body = CSSSelector('body')(root)[0]
    body.getparent().insert(0, new_head)
    return new_head
def get_or_create_head(root):
    """Make sure `root` has a <head> element and hand it back.

    An existing ``head`` (the first one matched) is returned untouched.
    Otherwise a fresh ``head`` element is inserted at index 0 of the
    parent of the first ``body`` element and returned.

    :raises IndexError: if there is neither a ``head`` nor a ``body``.
    """
    found = CSSSelector('head')(root)
    if not found:
        created = etree.Element('head')
        container = CSSSelector('body')(root)[0].getparent()
        container.insert(0, created)
        found = [created]
    return found[0]
def scrape_divisions(self, html, divisions):
    """Collect one dict per fighter from the table that follows each division.

    For every name in `divisions`, the ``<span>`` whose ``id`` starts with
    the name (spaces written as ``_``, apostrophes as ``.27``) is looked up
    in `html`. Starting at that span's parent, following siblings are
    walked until a ``<table>`` is reached. Every row of that table except
    the first two and the last yields one fighter dict.

    Each fighter dict holds:

    * ``keys.entity_nickname`` -- text of ``td[3]/i``, when it can be read
    * ``keys.entity_gender`` -- ``'Female'`` when the division name
      contains ``"Women's"``, otherwise ``'Male'``
    * ``keys.entity_name`` -- text of the ``td span.vcard span a`` link,
      falling back to the text of the first cell
    * ``'titleholder'`` -- ``'yes'``, only when the row text contains
      ``'(C)'``
    * ``keys.entity_weightclass`` -- the division name (with the
      ``"Women's "`` prefix removed and the remainder capitalized for
      women's divisions)

    Progress is printed to stdout while scraping.

    :param html: parsed document handed to ``CSSSelector``.
    :param divisions: iterable of division names.
    :returns: list of fighter dicts, in document order per division.
    :raises IndexError: if no heading span is found for a division, even
        after the retry with the capitalized name.
    :raises AttributeError: if no ``<table>`` follows the heading
        (``getnext()`` runs off the end of the siblings).
    """
    # Single-argument print calls produce the same output under Python 2
    # and also parse under Python 3.
    print('scrape_division: %s' % (divisions,))
    players = []
    for wc in divisions:
        wccss = wc.replace(" ", "_").replace("'", ".27")

        # Decide the gender per division so that a men's division listed
        # after a women's one is not labelled as female.
        mens = "Women's" not in wc
        if not mens:
            # NOTE(review): reconstructed as applying to women's divisions
            # only -- confirm against the original layout.
            wc = wc.replace("Women's ", "").capitalize()
        print('team: %s %s' % (wc, wccss))

        css_string = 'span[id^="' + wccss + '"]'
        print(css_string)
        try:
            css = CSSSelector(css_string)(html)[0]
        except IndexError:
            # No match: retry with the capitalized division name in the id.
            css_string = css_string.replace(wc.lower(), wc)
            print(css_string)
            css = CSSSelector(css_string)(html)[0]

        # Walk forward from the heading to the table that follows it.
        t = css.getparent()
        while t.tag != 'table':
            t = t.getnext()
        print('finally: %s' % (t.tag,))

        # Skip the first two rows and the last one.
        for tr in t.findall('.//tr')[2:-1]:
            fighter = {}
            try:
                fighter[keys.entity_nickname] = parse.csstext(
                    tr.find('.//td[3]/i'))
            except Exception:
                # Best effort: a row without a readable nickname simply
                # gets no nickname entry.
                pass

            fighter[keys.entity_gender] = 'Male' if mens else 'Female'

            try:
                a = tr.cssselect('td span.vcard span a')[0]
                fighter[keys.entity_name] = parse.csstext(a)
            except Exception:
                # No usable vcard link: use the text of the first cell.
                fighter[keys.entity_name] = parse.csstext(
                    tr.find('.//td[1]'))

            if '(C)' in parse.csstext(tr):
                fighter['titleholder'] = 'yes'
            fighter[keys.entity_weightclass] = wc

            print(fighter)
            players.append(fighter)

    print('done: figther len %s' % (len(players),))
    return players
def handle(self, *args, **options):
    """Replace link-plugin markers in a page's text plugins with <a> tags.

    Expects exactly one positional argument, the id of a ``Page``; returns
    immediately when called without arguments. The draft version of the
    page is used, and only plugins in the ``'en'`` language whose plugin
    class is named ``TextPlugin`` are processed.

    For each child plugin of such a text plugin, the element with
    ``id="plugin_obj_<child id>"`` is looked up in the text body:

    * no such element -- the child plugin is deleted;
    * child is a ``LinkPlugin`` or ``CMSLinkButtonPlugin`` -- the element
      is replaced by an ``<a target="_blank">`` (with
      ``class="link-button"`` for the button variant) pointing at the
      child's ``url`` and showing its ``name``, then the child plugin is
      deleted;
    * any other child type -- left untouched.

    The text plugin's ``body`` is then rewritten from the parsed tree and
    saved.

    :raises ValueError: if more than one positional argument is given.
    """
    if not args:
        return
    page_id, = args

    parser = etree.HTMLParser()
    select_body = CSSSelector('body')

    page = Page.objects.get(id=page_id).get_draft_object()
    for placeholder in page.get_placeholders():
        for plugin in placeholder.get_plugins('en'):
            instance, plugin_class = plugin.get_plugin_instance()
            if type(plugin_class).__name__ != 'TextPlugin':
                continue

            tree = etree.parse(StringIO.StringIO(instance.body),
                               parser).getroot()

            for child in instance.get_children():
                child_instance, child_class = child.get_plugin_instance()
                child_type_name = type(child_class).__name__

                found = CSSSelector(
                    '[id=plugin_obj_{}]'.format(child_instance.id))(tree)
                if not found:
                    # The text no longer references this child plugin.
                    child.delete()
                    continue
                img = found[0]

                if child_type_name == "LinkPlugin":
                    attrib = {
                        "target": "_blank",
                        "href": child_instance.url
                    }
                elif child_type_name == "CMSLinkButtonPlugin":
                    attrib = {
                        "class": "link-button",
                        "target": "_blank",
                        "href": child_instance.url
                    }
                else:
                    # Other plugin types keep their marker and plugin.
                    continue

                element = etree.Element('a', attrib=attrib)
                element.text = child_instance.name
                # lxml removes an element together with its tail text, so
                # carry the tail over to keep the text that followed the
                # marker.
                element.tail = img.tail

                parent = img.getparent()
                parent.insert(parent.index(img), element)
                parent.remove(img)
                child.delete()

            # Serialize the contents of <body> (leading text + children).
            body = select_body(tree)[0]
            out = (body.text or '') + '\n'.join([
                etree.tostring(h, pretty_print=True, method="html")
                for h in body
            ])
            instance.body = out
            instance.save()
def _fetch_from_cache(language, url):
    """Load a CMS page (cached) and break it into header-delimited sections.

    The HTML comes from the module-level ``cache`` keyed by
    ``utils.get_cms_url(language, url)``; when absent it is fetched through
    ``utils.get_cms_page(language, url)`` and written back to the cache.

    After parsing, all ``.toc`` elements and the ``.page-title`` element
    are removed from the tree. The direct children of the first
    ``.cms-content`` element are then split at every child containing a
    ``.section-header``; the children between two headers are serialized
    as that section's body.

    :param language: forwarded to the ``utils`` helpers.
    :param url: forwarded to the ``utils`` helpers.
    :returns: dict with ``"title"`` (text of the ``.page-title`` element)
        and ``"contents"`` (list of dicts with the keys ``is_important``,
        ``title``, ``body`` and ``icon``).
    :raises IndexError: if ``.page-title`` or ``.cms-content`` is missing,
        or a header element contains no ``a[name]``.
    """
    from . import utils

    cms_url = utils.get_cms_url(language, url)

    # Read once rather than testing membership and reading separately; an
    # entry expiring between the two steps would otherwise yield None.
    # NOTE(review): relies on ``cache.get`` returning None for a missing
    # key (as Django's cache does) -- confirm.
    html = cache.get(cms_url)
    if html is None:
        html = utils.get_cms_page(language, url)
        cache.set(cms_url, html)

    tree = etree.parse(StringIO(html), etree.HTMLParser()).getroot()

    # Removing all table of contents
    for toc_node in CSSSelector('.toc')(tree):
        toc_node.getparent().remove(toc_node)

    title = CSSSelector('.page-title')(tree)[0]
    title.getparent().remove(title)

    elements = list(CSSSelector('.cms-content')(tree)[0])

    # Drop ``dir`` from the top-level elements a single time up front; the
    # outcome is the same for every section.
    for node in elements:
        if 'dir' in node.attrib:
            del node.attrib['dir']

    header_selector = CSSSelector('.section-header')
    headers = [
        index for index, node in enumerate(elements) if header_selector(node)
    ]
    # End index of each section: the next header, or the end of the list.
    section_ends = headers[1:] + [len(elements)]

    title_icons = list(CSSSelector('.title-icon')(tree))
    anchor_selector = CSSSelector('a[name]')
    important_selector = CSSSelector('.important')

    page_contents = []
    for position, (start, end) in enumerate(zip(headers, section_ends)):
        # Icons are matched to sections by position.
        icon = ""
        if position < len(title_icons):
            icon = title_icons[position].get('src', "")

        header = elements[start]
        body_parts = [
            etree.tostring(node, pretty_print=True, method="html")
            for node in elements[start + 1:end]
        ]

        page_contents.append({
            "is_important": bool(important_selector(header)),
            "title": anchor_selector(header)[0].text,
            "body": "".join(body_parts),
            "icon": icon
        })

    return {"title": title.text, "contents": page_contents}
def handle(self, *args, **options):
    """Turn embedded link plugins of one page into plain <a> elements.

    Takes a single positional argument, the ``Page`` id, and does nothing
    when no argument is given. Works on the page's draft object and on
    ``'en'`` plugins whose plugin class is named ``TextPlugin``.

    Inside each such text body, the element carrying
    ``id="plugin_obj_<child id>"`` is searched for every child plugin.
    A child without such an element is deleted. A ``LinkPlugin`` or
    ``CMSLinkButtonPlugin`` child has its element swapped for an
    ``<a target="_blank" href=url>name</a>`` (the button variant also gets
    ``class="link-button"``) and is deleted afterwards. Children of any
    other type are left as they are. Finally the body is re-serialized
    from the tree and the text plugin is saved.

    :raises ValueError: if several positional arguments are passed.
    """
    if not args:
        return
    page_id, = args

    parser = etree.HTMLParser()
    selector = CSSSelector('body')

    page = Page.objects.get(id=page_id)
    page = page.get_draft_object()

    for placeholder in page.get_placeholders():
        for plugin in placeholder.get_plugins('en'):
            instance, t = plugin.get_plugin_instance()
            typename = type(t).__name__
            if typename == 'TextPlugin':
                source = StringIO.StringIO(instance.body)
                tree = etree.parse(source, parser).getroot()

                for child in instance.get_children():
                    child_instance, child_type = child.get_plugin_instance()
                    child_type_name = type(child_type).__name__

                    marker_css = '[id=plugin_obj_{}]'.format(child_instance.id)
                    matches = CSSSelector(marker_css)(tree)
                    if not matches:
                        # Nothing in the text refers to this child anymore.
                        child.delete()
                        continue

                    img = matches[0]
                    parent = img.getparent()

                    element = None
                    if child_type_name == "LinkPlugin":
                        element = etree.Element('a', attrib={
                            "target": "_blank",
                            "href": child_instance.url
                        })
                    elif child_type_name == "CMSLinkButtonPlugin":
                        element = etree.Element('a', attrib={
                            "class": "link-button",
                            "target": "_blank",
                            "href": child_instance.url
                        })

                    if element is not None:
                        element.text = child_instance.name
                        # Removing an lxml element also drops its tail, so
                        # hand the trailing text to the replacement first.
                        element.tail = img.tail
                        parent.insert(parent.index(img), element)
                        parent.remove(img)
                        child.delete()

                # New body: <body>'s leading text plus its serialized children.
                body = selector(tree)[0]
                rendered = [
                    etree.tostring(h, pretty_print=True, method="html")
                    for h in body
                ]
                instance.body = (body.text or '') + '\n'.join(rendered)
                instance.save()