예제 #1
0
    def _fetch_from_cache(language, url):
        """Return the CMS page for ``language``/``url`` split into sections.

        The page HTML is looked up in ``cache`` under the URL produced by
        ``utils.get_cms_url``; on a miss it is fetched with
        ``utils.get_cms_page`` and stored. The markup is then parsed, every
        ``.toc`` element and the first ``.page-title`` element are detached,
        and the direct children of the first ``.cms-content`` element are
        grouped into sections. A section starts at each child that contains a
        ``.section-header`` element and runs up to the next such child.

        Returns:
            A dict with ``"title"`` (the text of the ``.page-title`` element)
            and ``"contents"``, a list of dicts with the keys
            ``"is_important"``, ``"title"``, ``"body"`` and ``"icon"``.

        Raises:
            IndexError: if the page has no ``.page-title`` or ``.cms-content``
                element, or a section header has no ``a[name]`` anchor.
        """
        from . import utils

        cms_url = utils.get_cms_url(language, url)

        # One ``get`` instead of ``in`` followed by ``get``: the entry could
        # expire between the two calls and hand ``None`` to the parser.
        # NOTE(review): assumes ``cache.get`` returns None for a missing key
        # (as Django's cache API does) -- confirm for the backend in use.
        html = cache.get(cms_url)
        if html is None:
            html = utils.get_cms_page(language, url)
            cache.set(cms_url, html)

        tree = etree.parse(StringIO(html), etree.HTMLParser()).getroot()

        # Removing all table of contents
        for table in CSSSelector('.toc')(tree):
            table.getparent().remove(table)

        title = CSSSelector('.page-title')(tree)[0]
        title.getparent().remove(title)

        elements = list(CSSSelector('.cms-content')(tree)[0])

        # Strip ``dir`` from the top-level content elements once, before any
        # of them is serialized (descendants are left untouched).
        for e in elements:
            if 'dir' in e.attrib:
                del e.attrib['dir']

        # Compile each selector a single time instead of once per element.
        has_header = CSSSelector('.section-header')
        find_anchor = CSSSelector('a[name]')
        find_important = CSSSelector('.important')

        headers = [i for i, e in enumerate(elements) if has_header(e)]
        # NOTE(review): icons are paired with headers purely by position;
        # presumably every section carries exactly one ``.title-icon``.
        title_icons = list(CSSSelector('.title-icon')(tree))

        # Each section ends where the next header begins; the last one runs
        # to the end of the content.
        section_ends = headers[1:] + [len(elements)]

        page_contents = []
        for i, (h, end) in enumerate(zip(headers, section_ends)):
            icon = ""
            if i < len(title_icons):
                icon = title_icons[i].attrib.get('src', "")

            element = elements[h]
            section_title = find_anchor(element)[0].text
            section_body = "".join(
                etree.tostring(c, pretty_print=True, method="html")
                for c in elements[h + 1:end]
            )

            page_contents.append({
                "is_important": bool(find_important(element)),
                "title": section_title,
                "body": section_body,
                "icon": icon
            })

        return {
            "title": title.text,
            "contents": page_contents
        }
예제 #2
0
def get_or_create_head(root):
    """Ensure that `root` contains a ``head`` element and return it.

    If a ``head`` element already exists, the first match is returned.
    Otherwise a new one is created and inserted as the first child of the
    parent of the first ``body`` element. When the tree has no ``body``, or
    the ``body`` has no parent (it is the root itself), the new element is
    inserted as the first child of `root` instead of raising.
    """
    heads = CSSSelector('head')(root)
    if heads:
        return heads[0]

    head = etree.Element('head')
    bodies = CSSSelector('body')(root)
    # ``getparent()`` yields None for a parentless element, so both the
    # "no body" and the "body is the root" cases fall back to `root`.
    parent = bodies[0].getparent() if bodies else None
    if parent is None:
        parent = root
    parent.insert(0, head)
    return head
예제 #3
0
def get_or_create_head(root):
    """Ensure that `root` contains a ``head`` element and return it.

    An existing ``head`` (the first match) is returned unchanged. When none
    exists, a fresh one is inserted as the first child of the parent of the
    first ``body`` element. If there is no ``body``, or the ``body`` has no
    parent because it is the root itself, the element is inserted as the
    first child of `root` rather than failing.
    """
    existing = CSSSelector('head')(root)
    if existing:
        return existing[0]

    head = etree.Element('head')
    bodies = CSSSelector('body')(root)
    # A parentless element reports None from ``getparent()``; treat that the
    # same as a missing body and attach directly to `root`.
    container = bodies[0].getparent() if bodies else None
    if container is None:
        container = root
    container.insert(0, head)
    return head
예제 #4
0
    def scrape_divisions(self, html, divisions):
        print 'scrape_division:', divisions
        players = []
        mens = True
        for wc in divisions:
            wccss = wc.replace(" ", "_").replace("'", ".27")
            if "Women's" in wc:
                mens = False
            wc = wc.replace("Women's ", "").capitalize()
            print 'team:', wc, wccss
            css_string = 'span[id^="' + wccss + '"]'
            print css_string
            try:
                css = CSSSelector(css_string)(html)[0]
            except:
                css_string = css_string.replace(wc.lower(), wc)
                print css_string
                css = CSSSelector(css_string)(html)[0]
            t = css.getparent()
            while t.tag != 'table':
                t = t.getnext()
            print 'finally:', t.tag
            for tr in t.findall('.//tr')[2:][:-1]:
                #print etree.tostring(tr)
                #country = parse.csstext(tr.findall('.//td')[0])
                fighter = {}
                try:
                    fighter[keys.entity_nickname] = parse.csstext(
                        tr.find('.//td[3]/i'))
                except:
                    pass

                fighter[keys.entity_gender] = 'Male' if mens else 'Female'
                #fighter[keys.entity_origin] = country
                try:
                    a = tr.cssselect('td span.vcard span a')[0]
                    fighter[keys.entity_name] = parse.csstext(a)
                except:
                    fighter[keys.entity_name] = parse.csstext(
                        tr.find('.//td[1]'))
                if '(C)' in parse.csstext(tr):
                    fighter['titleholder'] = 'yes'
                fighter[keys.entity_weightclass] = wc
                print fighter
                players.append(fighter)
        print 'done: figther len', len(players)
        return players
예제 #5
0
    def handle(self, *args, **options):
        """Inline link child plugins into the bodies of a page's text plugins.

        The single positional argument is the id of a ``Page``. For every
        ``TextPlugin`` found through ``get_plugins('en')`` on the placeholders
        of the page's draft, each child plugin is looked up in the text body
        by its ``plugin_obj_<id>`` placeholder element:

        * children with no placeholder in the body are deleted;
        * ``LinkPlugin`` and ``CMSLinkButtonPlugin`` children are replaced by
          an anchor element built from the child's ``url`` and ``name``, and
          the child plugin is then deleted;
        * children of any other type are left untouched.

        The rewritten body is saved back on the text plugin instance.
        Does nothing when called without arguments.

        Raises:
            ValueError: if more than one positional argument is given.
        """
        if not args:
            return

        page_id, = args

        parser = etree.HTMLParser()
        selector = CSSSelector('body')

        # Extra attributes of the anchor built for each supported child type.
        link_attrs = {
            "LinkPlugin": {},
            "CMSLinkButtonPlugin": {"class": "link-button"},
        }

        page = Page.objects.get(id=page_id)
        page = page.get_draft_object()
        for placeholder in page.get_placeholders():
            for plugin in placeholder.get_plugins('en'):
                instance, t = plugin.get_plugin_instance()
                if type(t).__name__ != 'TextPlugin':
                    continue

                tree = etree.parse(StringIO.StringIO(instance.body),
                                   parser).getroot()
                for child in instance.get_children():
                    child_instance, child_type = child.get_plugin_instance()

                    img = CSSSelector('[id=plugin_obj_{}]'.format(
                        child_instance.id))(tree)
                    if not img:
                        child.delete()
                        continue
                    img = img[0]

                    extra = link_attrs.get(type(child_type).__name__)
                    if extra is None:
                        # Not a link-like child: keep it as it is.
                        continue

                    attrib = {"target": "_blank", "href": child_instance.url}
                    attrib.update(extra)
                    element = etree.Element('a', attrib=attrib)
                    element.text = child_instance.name
                    # lxml removes an element together with its tail text, so
                    # carry the tail over or the text that follows the
                    # placeholder would vanish from the body.
                    element.tail = img.tail

                    parent = img.getparent()
                    parent.insert(parent.index(img), element)
                    parent.remove(img)

                    child.delete()

                body = selector(tree)[0]

                out = (body.text or '') + '\n'.join([
                    etree.tostring(h, pretty_print=True, method="html")
                    for h in body
                ])

                instance.body = out
                instance.save()
예제 #6
0
    def _fetch_from_cache(language, url):
        """Return the CMS page for ``language``/``url`` split into sections.

        The HTML is read from ``cache`` under the key given by
        ``utils.get_cms_url``; when it is not cached it is fetched through
        ``utils.get_cms_page`` and stored. After parsing, all ``.toc``
        elements and the first ``.page-title`` element are detached. The
        direct children of the first ``.cms-content`` element are then cut
        into sections: a child containing a ``.section-header`` element opens
        a section, and the children up to the next such child form its body.

        Returns:
            A dict with ``"title"`` (the text of the ``.page-title`` element)
            and ``"contents"``, a list of dicts with the keys
            ``"is_important"``, ``"title"``, ``"body"`` and ``"icon"``.

        Raises:
            IndexError: if the page has no ``.page-title`` or ``.cms-content``
                element, or a section header has no ``a[name]`` anchor.
        """
        from . import utils

        cms_url = utils.get_cms_url(language, url)

        # Read the cache once: checking ``in`` and then calling ``get`` lets
        # the entry expire in between, which would pass ``None`` on.
        # NOTE(review): assumes ``cache.get`` returns None for a missing key
        # (as Django's cache API does) -- confirm for the backend in use.
        html = cache.get(cms_url)
        if html is None:
            html = utils.get_cms_page(language, url)
            cache.set(cms_url, html)

        tree = etree.parse(StringIO(html), etree.HTMLParser()).getroot()

        # Removing all table of contents
        for table in CSSSelector('.toc')(tree):
            table.getparent().remove(table)

        title = CSSSelector('.page-title')(tree)[0]
        title.getparent().remove(title)

        elements = list(CSSSelector('.cms-content')(tree)[0])

        # ``dir`` is dropped from the top-level content elements only, and
        # only once, before anything is serialized.
        for e in elements:
            if 'dir' in e.attrib:
                del e.attrib['dir']

        # Selectors are compiled once and reused for every element.
        has_header = CSSSelector('.section-header')
        find_anchor = CSSSelector('a[name]')
        find_important = CSSSelector('.important')

        headers = [i for i, e in enumerate(elements) if has_header(e)]
        # NOTE(review): icons are matched to headers by position only;
        # presumably each section has exactly one ``.title-icon``.
        title_icons = list(CSSSelector('.title-icon')(tree))

        # A section stops at the next header; the final one runs to the end.
        section_ends = headers[1:] + [len(elements)]

        page_contents = []
        for i, (h, end) in enumerate(zip(headers, section_ends)):
            icon = ""
            if i < len(title_icons):
                icon = title_icons[i].attrib.get('src', "")

            element = elements[h]
            section_title = find_anchor(element)[0].text
            section_body = "".join(
                etree.tostring(c, pretty_print=True, method="html")
                for c in elements[h + 1:end]
            )

            page_contents.append({
                "is_important": bool(find_important(element)),
                "title": section_title,
                "body": section_body,
                "icon": icon
            })

        return {"title": title.text, "contents": page_contents}
예제 #7
0
    def handle(self, *args, **options):
        """Replace link child plugins of a page's text plugins with anchors.

        Expects exactly one positional argument, the id of a ``Page``. The
        page's draft is walked placeholder by placeholder; for each
        ``TextPlugin`` returned by ``get_plugins('en')`` the body is parsed
        and every child plugin is matched against its ``plugin_obj_<id>``
        placeholder element in that body:

        * a child without a placeholder in the body is deleted;
        * a ``LinkPlugin`` or ``CMSLinkButtonPlugin`` child has its
          placeholder swapped for an anchor element carrying the child's
          ``url`` and ``name``, after which the child plugin is deleted;
        * any other child type is left alone.

        The resulting markup is written back to the text plugin and saved.
        Returns immediately when no argument is supplied.

        Raises:
            ValueError: if more than one positional argument is given.
        """
        if not args:
            return

        page_id, = args

        parser = etree.HTMLParser()
        selector = CSSSelector('body')

        # Per-type extra attributes for the generated anchor element.
        link_attrs = {
            "LinkPlugin": {},
            "CMSLinkButtonPlugin": {"class": "link-button"},
        }

        page = Page.objects.get(id=page_id)
        page = page.get_draft_object()
        for placeholder in page.get_placeholders():
            for plugin in placeholder.get_plugins('en'):
                instance, t = plugin.get_plugin_instance()
                if type(t).__name__ != 'TextPlugin':
                    continue

                tree = etree.parse(StringIO.StringIO(instance.body), parser).getroot()
                for child in instance.get_children():
                    child_instance, child_type = child.get_plugin_instance()

                    img = CSSSelector('[id=plugin_obj_{}]'.format(child_instance.id))(tree)
                    if not img:
                        child.delete()
                        continue
                    img = img[0]

                    extra = link_attrs.get(type(child_type).__name__)
                    if extra is None:
                        # Unsupported child type: nothing to inline.
                        continue

                    attrib = {"target": "_blank", "href": child_instance.url}
                    attrib.update(extra)
                    element = etree.Element('a', attrib=attrib)
                    element.text = child_instance.name
                    # Removing an element in lxml also drops its tail text;
                    # hand the tail to the replacement so the text after the
                    # placeholder survives.
                    element.tail = img.tail

                    parent = img.getparent()
                    parent.insert(parent.index(img), element)
                    parent.remove(img)

                    child.delete()

                body = selector(tree)[0]

                out = (body.text or '') + '\n'.join(
                    [etree.tostring(h, pretty_print=True, method="html") for h in body]
                )

                instance.body = out
                instance.save()