Example #1
    def __init__(self, html):
        """Create a parse tree from the given HTML."""
        def really_parse_fragment(parser, html):
            """Parse a possibly multi-rooted HTML fragment, wrapping it in a
            <div> to make it easy to query later.

            As far as I can tell, this is what parseFragment is supposed to do
            (but doesn't). See
            http://code.google.com/p/html5lib/issues/detail?id=161.

            """
            top_level_elements = parser.parseFragment(html)
            container = Element(self.CONTAINER_TAG)

            # Why lxml couldn't just have text nodes, I'll never understand.
            # Any text node other than the first is automatically stuffed
            # into the tail attribute of its preceding element by html5lib.
            if top_level_elements and isinstance(top_level_elements[0],
                                                 basestring):
                container.text = top_level_elements.pop(0)

            container.extend(top_level_elements)
            return container

        p = HTMLParser(tree=getTreeBuilder(self.TREEBUILDER))
        self._root = really_parse_fragment(p, html)
Example #2
def wiki_string_to_tiddlers(content):
    """
    Turn a string that is a TiddlyWiki into individual tiddlers.
    """
    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse(content)
    # minidom will not provide a working getElementById without
    # first having a valid document, which means some very specific
    # doctype hooey. So we traverse the tree manually instead.
    body = doc.getElementsByTagName('body')[0]
    body_divs = body.getElementsByTagName('div')
    is_wiki = False
    for div in body_divs:
        if div.hasAttribute('id') and div.getAttribute('id') == 'storeArea':
            divs = div.getElementsByTagName('div')
            is_wiki = True
            break

    if is_wiki:
        tiddlers = []
        for tiddler_div in divs:
            tiddlers.append(_get_tiddler_from_div(tiddler_div))
        return tiddlers
    else:
        raise ValueError('content not a tiddlywiki 2.x')
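
A hypothetical usage sketch for the function above; the markup shows the storeArea structure it looks for, and _get_tiddler_from_div is assumed to come from the same module:

wiki = (
    "<html><body>"
    "<div id='storeArea'>"
    "<div title='HelloThere'><pre>Hello, world!</pre></div>"
    "</div>"
    "</body></html>"
)
tiddlers = wiki_string_to_tiddlers(wiki)  # one tiddler per <div> in the store area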
Example #3
def extract_text_from_html(html: str,
                           skip_tags: Optional[List[str]] = None) -> str:
    """Extract plain text content from the elements inside an HTML string."""
    def extract_text(element: Element, skip_tags: List[str]) -> Iterator[str]:
        """Extract text recursively from elements, optionally skipping some tags.

        This function is Python's xml.etree.ElementTree.Element.itertext() but with the
        added ability to skip over particular tags and not include the text from inside
        them or any of their children.
        """
        if not isinstance(element.tag, str) and element.tag is not None:
            return

        if element.tag in skip_tags:
            return

        if element.text:
            yield element.text

        for subelement in element:
            yield from extract_text(subelement, skip_tags)

            if subelement.tail:
                yield subelement.tail

    skip_tags = skip_tags or []

    html_tree = HTMLParser(namespaceHTMLElements=False).parseFragment(html)

    # extract the text from all of the HTML elements
    extracted_text = "".join(extract_text(html_tree, skip_tags))

    # sanitize unicode, remove leading/trailing whitespace, etc.
    return simplify_string(extracted_text)
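
A small usage sketch, assuming the function above is importable and that simplify_string normalizes whitespace; the HTML string is illustrative:

html = "<p>Keep this <script>ignore_me()</script>and this too</p>"
text = extract_text_from_html(html, skip_tags=["script", "style"])
# text is roughly "Keep this and this too" once simplify_string has run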
Example #4
def html_parser(html):
    try:
        soup = BeautifulSoup(html)
    except:
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
        soup = parser.parse(html)
    return soup
Example #5
def summary_scrape(urn):
    print " - summary"
    url = "http://www.edubase.gov.uk/establishment/summary.xhtml?urn=" + urn
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))

    keyvaluepairs = table_extract(page)

    raw_address = [x.strip() for x in keyvaluepairs.pop("").split(" / ")]
    if postcode.match(raw_address[-1]):
        keyvaluepairs["Postcode"] = raw_address[-1]
        raw_address = raw_address[:-1]
    keyvaluepairs["Address"] = " / ".join(raw_address)

    for t in page.findall(
            path([
                "body", "div", "div", "div", "div", "table", "tbody", "tr",
                "td", "h1"
            ], pre)):
        x = t.text.split(": ")
        keyvaluepairs[x[0]] = x[1]

    for t in page.findall(
            path([
                "body", "div", "div", "div", "div", "table", "tbody", "tr",
                "td", "div", "p", "b"
            ], pre)):
        keyvaluepairs[t.text.strip().strip(":")] = (t.tail or "").strip()

    return keyvaluepairs
Example #6
def clean_html(input, sanitize=False):
    """
    Takes an HTML fragment and processes it using html5lib to ensure that the HTML is well-formed.

    :param sanitize: Remove unwanted HTML tags and attributes.

    >>> clean_html("<p>Foo<b>bar</b></p>")
    u'<p>Foo<b>bar</b></p>'
    >>> clean_html("<p>Foo<b>bar</b><i>Ooops!</p>")
    u'<p>Foo<b>bar</b><i>Ooops!</i></p>'
    >>> clean_html('<p>Foo<b>bar</b>& oops<a href="#foo&bar">This is a <>link</a></p>')
    u'<p>Foo<b>bar</b>&amp; oops<a href=#foo&amp;bar>This is a &lt;&gt;link</a></p>'
    """
    parser_kwargs = {}
    serializer_kwargs = {}
    if sanitize:
        if HTMLSanitizer is None:
            # new syntax as of 0.99999999/1.0b9 (Released on July 14, 2016)
            serializer_kwargs['sanitize'] = True
        else:
            parser_kwargs['tokenizer'] = HTMLSanitizer

    p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"), **parser_kwargs)
    dom_tree = p.parseFragment(input)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = HTMLSerializer(omit_optional_tags=False, **serializer_kwargs)
    return "".join(s.serialize(stream))
Example #7
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode('ascii')

    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
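
A hypothetical (data, encoding) pair of the kind this test is run with; the bytes below are illustrative, and the meta prescan should report the declared charset:

runParserEncodingTest(b"<meta charset='iso-8859-2'><p>text</p>", b"iso-8859-2")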
Example #8
def runParserEncodingTest(data, encoding):
    p = HTMLParser()
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    assert encoding == p.tokenizer.stream.charEncoding[0], errorMessage(
        data, encoding, p.tokenizer.stream.charEncoding[0])
Example #9
def read_hcard(url):
    try:
        f = urlopen(url)
        content_type = f.info().getheader('content-type', 'text/html')
        value, params = cgi.parse_header(content_type)
        charset = params.get('charset', 'utf-8').replace("'", '')
        dom = HTMLParser().parse(urlopen(url).read(512 * 1024).decode(charset, 'ignore'))
    except IOError:
        return

    def _find(node, class_name):
        # In this tree format (html5lib's legacy default builder), nodes with
        # type == 5 are elements; text nodes (type == 4) are handled below.
        for child in (c for c in node if c.type == 5):
            if re.search(r'\b%s\b' % class_name, child.attributes.get('class', '')):
                return child

    vcard = _find(dom, 'vcard')
    if vcard is None:
        return

    def _parse_property(class_name):
        el = _find(vcard, class_name)
        if el is None:
            return
        if el.name == 'abbr' and 'title' in el.attributes:
            result = el.attributes['title']
        else:
            result = u''.join(s.value for s in el if s.type == 4)
        return result.replace(u'\n', u' ').strip()

    return {
        'nickname': _parse_property('nickname') or _parse_property('fn') or '',
    }
Example #10
    def extract_html_urls(self, html):
        """
        Take all ``<img src="..">`` from the HTML
        """
        p = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        dom = p.parse(html)
        urls = []

        for img in dom.getElementsByTagName("img"):
            src = img.getAttribute("src")
            if src:
                urls.append(unquote_utf8(src))

            srcset = img.getAttribute("srcset")
            if srcset:
                urls += self.extract_srcset(srcset)

        for source in dom.getElementsByTagName("source"):
            srcset = source.getAttribute("srcset")
            if srcset:
                urls += self.extract_srcset(srcset)

        for source in dom.getElementsByTagName("a"):
            href = source.getAttribute("href")
            if href:
                urls.append(unquote_utf8(href))

        return urls
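
The extract_srcset helper is not shown in this snippet; a plausible sketch (hypothetical, not the original implementation) that takes the URL part of each comma-separated srcset candidate:

    def extract_srcset(self, srcset):
        # Drop the optional width/density descriptor and unquote each URL.
        return [unquote_utf8(candidate.strip().split()[0])
                for candidate in srcset.split(",") if candidate.strip()]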
Example #11
def test_parser_encoding(data, encoding):
    p = HTMLParser()
    assert p.documentEncoding is None
    p.parse(data, useChardet=False)
    encoding = encoding.lower().decode("ascii")

    assert encoding == p.documentEncoding, errorMessage(data, encoding, p.documentEncoding)
Example #12
def do_year(y, url):
    pagetext = urllib2.urlopen(url)
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"),
                        tokenizer=sanitizer.HTMLSanitizer)
    page = parser.parse(pagetext)

    for section in page.findall(
            "body/div/div/div/div/div/div/div/div/table[@class='fixture']"):

        matchtype = section.find("caption").text

        for match in section.findall("tbody/tr"):

            l = list(match.getchildren())
            d = {}
            d["Match type"] = matchtype
            d["Match number"] = l[0].text
            d["Date"] = make_date(l[1].text, y)
            d["Team 1"] = flatten_refs(l[3])
            d["Team 2"] = flatten_refs(l[5])
            a = l[4].find("a")
            d["Score"] = a.text
            d["Report"] = "http://www.fifa.com" + a.get("href")
            print "%d (%s) %s - %s" % (y, d["Match type"], d["Team 1"],
                                       d["Team 2"])
            datastore.save(unique_keys=["Date", "Team 1", "Team 2"], data=d)
Example #13
def get_html_parse_tree(url, data=None, headers={}, treetype='beautifulsoup'):
    "Request a URL, parse with html5lib, and return a parse tree from it"

    req = urllib2.Request(iri_to_uri(url), data, headers)
    f = urllib2.urlopen(req)

    if f.info().gettype() not in ('text/html', 'application/xhtml+xml'):
        f.close()
        raise ContentTypeException("Content type isn't HTML, but " +
                                   f.info().gettype())

    data = f.read()
    f.close()

    encoding = None
    contentType = f.headers.get('content-type')
    if contentType:
        (mediaType, params) = cgi.parse_header(contentType)
        encoding = params.get('charset')

    compression = f.headers.get('content-encoding')
    if compression:
        if compression.lower() == "deflate":
            try:
                data = zlib.decompress(data)
            except zlib.error:
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        elif compression.lower() == "gzip":
            compressedstream = StringIO(data)
            gzipper = GzipFile(fileobj=compressedstream)
            data = gzipper.read()

    if treetype == "beautifulsoup":
        return BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    elif treetype == "etree":
        kwargs = {'tree': treebuilders.getTreeBuilder('etree', ElementTree)}
        # http://code.google.com/p/html5lib/issues/detail?id=138
        if ('namespaceHTMLElements'
                in inspect.getargspec(HTMLParser.__init__)[0]):
            kwargs['namespaceHTMLElements'] = False
        parser = HTMLParser(**kwargs)
    else:
        if treetype == "html5lib-beautifulsoup":
            treetype = "beautifulsoup"
        parser = HTMLParser(tree=treebuilders.getTreeBuilder(treetype))

    return parser.parse(data, encoding=encoding)
Example #14
    def parse(fname):
        if fname in etree_cache:
            return etree_cache[fname]
        with (fname).open('rb') as fp:
            etree = HTMLParser(namespaceHTMLElements=False).parse(fp)
            etree_cache.clear()
            etree_cache[fname] = etree
            return etree
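
A hypothetical call site, assuming fname is a pathlib.Path (hence (fname).open('rb')) and etree_cache is a module-level dict:

from pathlib import Path

etree_cache = {}                      # cache assumed by parse()
tree = parse(Path("index.html"))      # parsed with html5lib and cached
tree = parse(Path("index.html"))      # second call is served from the cache
links = [a.get("href") for a in tree.iter("a")]  # plain tag names: namespaceHTMLElements=False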
Example #15
def test_parser_args(expected, data, kwargs):
    stream = _inputstream.HTMLBinaryInputStream(data,
                                                useChardet=False,
                                                **kwargs)
    assert expected == stream.charEncoding[0].name
    p = HTMLParser()
    p.parse(data, useChardet=False, **kwargs)
    assert expected == p.documentEncoding
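
A hypothetical parametrization for the test above; override_encoding is one of the encoding-related keyword arguments HTMLBinaryInputStream (and HTMLParser.parse) accepts in html5lib 1.x, and the expected value is the canonical encoding name:

test_parser_args("utf-8", b"<p>caf\xc3\xa9</p>", {"override_encoding": "utf-8"})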
Example #16
def test_productionlist(app, status, warning):
    app.builder.build_all()

    warnings = warning.getvalue().split("\n")
    assert len(warnings) == 2
    assert warnings[-1] == ''
    assert "Dup2.rst:4: WARNING: duplicate token description of Dup, other instance in Dup1" in warnings[
        0]

    with (app.outdir / 'index.html').open('rb') as f:
        etree = HTMLParser(namespaceHTMLElements=False).parse(f)
    ul = list(etree.iter('ul'))[1]
    cases = []
    for li in list(ul):
        assert len(list(li)) == 1
        p = list(li)[0]
        assert p.tag == 'p'
        text = str(p.text).strip(' :')
        assert len(list(p)) == 1
        a = list(p)[0]
        assert a.tag == 'a'
        link = a.get('href')
        assert len(list(a)) == 1
        code = list(a)[0]
        assert code.tag == 'code'
        assert len(list(code)) == 1
        span = list(code)[0]
        assert span.tag == 'span'
        linkText = span.text.strip()
        cases.append((text, link, linkText))
    assert cases == [
        ('A', 'Bare.html#grammar-token-A', 'A'),
        ('B', 'Bare.html#grammar-token-B', 'B'),
        ('P1:A', 'P1.html#grammar-token-P1-A', 'P1:A'),
        ('P1:B', 'P1.html#grammar-token-P1-B', 'P1:B'),
        ('P2:A', 'P1.html#grammar-token-P1-A', 'P1:A'),
        ('P2:B', 'P2.html#grammar-token-P2-B', 'P2:B'),
        ('Explicit title A, plain', 'Bare.html#grammar-token-A', 'MyTitle'),
        ('Explicit title A, colon', 'Bare.html#grammar-token-A', 'My:Title'),
        ('Explicit title P1:A, plain', 'P1.html#grammar-token-P1-A',
         'MyTitle'),
        ('Explicit title P1:A, colon', 'P1.html#grammar-token-P1-A',
         'My:Title'),
        ('Tilde A', 'Bare.html#grammar-token-A', 'A'),
        ('Tilde P1:A', 'P1.html#grammar-token-P1-A', 'A'),
        ('Tilde explicit title P1:A', 'P1.html#grammar-token-P1-A',
         '~MyTitle'),
        ('Tilde, explicit title P1:A', 'P1.html#grammar-token-P1-A',
         'MyTitle'),
        ('Dup', 'Dup2.html#grammar-token-Dup', 'Dup'),
        ('FirstLine', 'firstLineRule.html#grammar-token-FirstLine',
         'FirstLine'),
        ('SecondLine', 'firstLineRule.html#grammar-token-SecondLine',
         'SecondLine'),
    ]

    text = (app.outdir / 'LineContinuation.html').read_text()
    assert "A</strong> ::=  B C D    E F G" in text
Example #17
    def body_html(self):
        body_html = self.get_part_content(self.mail_pyzmail.html_part)
        if not body_html and self.body_text:
            body_html = self.body_text.replace('\n', '<br />')

        parser = HTMLParser(tokenizer=HTMLSanitizer)
        # Run body_html through the sanitizing tokenizer; note that the parsed
        # result is discarded here and the original body_html is returned.
        parser.parse(body_html)

        return body_html
Example #18
def scrape_pct(link, pct_name):
    """
    Scrapes the data associated with the PCT, and calls functions to scrape
    data associated with the services.
    """

    print
    print
    print pct_name
    print "-" * len(pct_name)

    url = "http://www.nhs.uk" + link
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(scrape(url))
    root = page.getroot()

    d = {}

    # basic contact details
    d["PCT"] = pct_name
    d["type"] = "main"
    d["name"] = pct_name
    address = root.find("body/div/form/div/div/p").text
    d["address"] = address
    postcode = geo.extract_gb_postcode(address)
    d["postcode"] = postcode
    d["latlng"] = geo.gb_postcode_to_latlng(postcode)
    d["info HTML"] = url

    # quality
    for t in root.findall(
            "body/div/form/div/div/div/div/div/div/div[@class='service-feedback clear']"
    ):
        k = t.find("div/h4").text.strip()
        v = t.find("div/img").attrib["alt"]
        d[k] = v

    # head honcho
    for t in root.findall(
            "body/div/form/div/div/div/div/div/div/div/p[@class='profiles-picture-caption']"
    ):
        d["Boss"] = t.text.replace("<br />", ", ")

    # boring text
    for t in root.findall("body/div/form/div/div/div/div/div/div/p"):
        if t.text:
            if t.attrib.get("class", False) == "intro":
                d["intro text"] = t.text
            else:
                d["boilerplate"] = d.get("boilerplate", "") + "\n" + t.text

    datastore.save(unique_keys=["PCT", "type", "name", "address"],
                   data=d,
                   latlng=d.get("latlng"))

    scrape_facilities(pct_name, root)
    scrape_others(pct_name, url)
Example #19
def scrape_others(pct_name, url):
    types = ["doctor", "dentist", "pharmacy", "optician"]
    for facility_type, i in zip(types, range(2, 6)):
        parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
        page = parser.parse(scrape(url + "&v=%d" % i))
        root = page.getroot()

        s = root.find("body/div/form/div/div/div/div/div/dl")
        extract_table_data(pct_name, s, facility_type)
Example #20
def _try_process_source(stream, options):
    """
    Tries to parse the input as XHTML, XML (e.g. SVG) or HTML(5), modifying options
    while figuring out the input.

    Returns a DOM tree.
    """
    parse = xml.dom.minidom.parse
    try:
        dom = parse(stream)
        # Try to second-guess the input type
        # This is _not_ really kosher, but the minidom is not really namespace aware...
        # In practice the goal is to have the system recognize svg content automatically
        # First see if there is a default namespace defined for the document:
        top = dom.documentElement
        if top.hasAttribute("xmlns"):
            key = (top.getAttribute("xmlns"), top.nodeName)
            if key in _HOST_LANG:
                options.host_language = _HOST_LANG[key]
    except:
        # XML Parsing error in the input
        type, value, traceback = sys.exc_info()
        if options.host_language == GENERIC_XML or options.lax == False:
            raise RDFaError('Parsing error in input file: "%s"' % value)

        # XML Parsing error in the input
        msg = "XHTML Parsing error in input file: %s. Falling back on the HTML5 parser" % value
        if options != None and options.warnings:
            options.comment_graph.add_warning(msg)

        # in Ivan's original code he reopened the stream if it was from urllib
        if isinstance(stream, urllib.addinfourl):
            stream = urllib.urlopen(stream.url)

        # Now try to see if an HTML5 parser is an alternative...
        try:
            from html5lib import HTMLParser, treebuilders
        except ImportError:
            # no alternative to the XHTML error, because HTML5 parser not available...
            msg2 = 'XHTML Parsing error in input file: %s. Though parsing is lax, HTML5 parser not available. Try installing html5lib <http://code.google.com/p/html5lib>' % value
            raise RDFaError(msg2)

        parser = HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
        parse = parser.parse
        try:
            dom = parse(stream)
            # The host language has changed
            options.host_language = HTML5_RDFA
        except:
            # Well, even the HTML5 parser could not do anything with this...
            (type, value, traceback) = sys.exc_info()
            msg2 = 'Parsing error in input file as HTML5: "%s"' % value
            msg3 = msg + '\n' + msg2
            raise RDFaError, msg3

    return dom
Example #21
def get_highest_id(floor=0):
    rssfeed_url = 'http://digitalmedia.fws.gov/cdm4/rss.php'
    html = urllib2.urlopen(rssfeed_url).read()
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    last_item = soup.findAll('item')[-0]
    last_id = last_item.description.contents[0].split('CISOPTR=')[1].split(
        '&')[0]
    last_id = int(last_id)
    return last_id
Example #22
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    block = soup.find(
        '',
        {'id': 'photoresult'})  # isolate the table of data on the first result
    block = block.findAll('', {'class': 'photobox'})[0]
    id = block.find('p').find('a').contents[0]
    id = int(id)
    return id
Example #23
def get_first_result_index_from_quick_search_results(html):
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    block = soup.find(border="0", bgcolor="white") # isolate the table of data on the first result
    id_str = block.find('font').contents[0] #contents of first <font>
    # this should look like: 'ID#:11901'
    # parse out the actual id and cast as int
    id = int(id_str.partition(':')[2])
    print id
    return id
Example #24
    def encodingTest(self, data=test['data'],
                     encoding=test['encoding']):
        p = HTMLParser()
        t = p.parse(data, useChardet=False)

        errorMessage = ("Input:\n%s\nExpected:\n%s\nReceived\n%s\n" %
                        (data, repr(encoding.lower()),
                         repr(p.tokenizer.stream.charEncoding)))
        self.assertEquals(encoding.lower(),
                          p.tokenizer.stream.charEncoding[0],
                          errorMessage)
Example #25
def test_parser_reparse():
    data = "<title>Caf\u00E9</title><!--a--><meta charset='utf-8'>".encode('utf-8')
    pad = 10240 - len(data) + 1
    data = data.replace(b"-a-", b"-" + (b"a" * pad) + b"-")
    assert len(data) == 10240  # Sanity
    stream = _inputstream.HTMLBinaryInputStream(data, useChardet=False)
    assert 'windows-1252' == stream.charEncoding[0].name
    p = HTMLParser(namespaceHTMLElements=False)
    doc = p.parse(data, useChardet=False)
    assert 'utf-8' == p.documentEncoding
    assert doc.find(".//title").text == "Caf\u00E9"
Example #26
def sanitize_html(html):
    """Sanitizes an HTML fragment."""
    p = HTMLParser(tokenizer=HTMLSanitizer,
                   tree=treebuilders.getTreeBuilder("dom"))
    dom_tree = p.parseFragment(html)
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)
    s = serializer.HTMLSerializer(omit_optional_tags=False,
                                  quote_attr_values=True)
    output_generator = s.serialize(stream)
    return u''.join(output_generator)
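
A hypothetical call, assuming the pre-1.0 HTMLSanitizer tokenizer this snippet relies on; exactly how disallowed markup is escaped or stripped varies by html5lib version:

dirty = '<p onclick="steal()">Hi <script>alert(1)</script></p>'
print(sanitize_html(dirty))
# The onclick handler is dropped and the <script> element is neutralized.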
Example #27
def parse_img_html_page(html):
    if not html or html == '':
        print "wait, the page appears blank. abort mission!"
        return None
    metadict = init_dict()
    # soupify the html
    parser = HTMLParser(tree=treebuilders.getTreeBuilder("beautifulsoup"))
    soup = parser.parse(html)
    if not soup:
        print "wait, we couldn't make a soup. i don't know WHY..."
        return None
    
    try:
        metadict['id'] = int(soup.find('input', {'type':'hidden', 'name': 'CISOPTR'})['value'])
    except:
        favorite_link_href = soup.find("a", {"title": u"Add to My Favorites"})['href']
        the_split = favorite_link_href.split("'")
        the_split.pop()
        metadict['id'] = int(the_split.pop())

    # TODO: this is kinda hacky but probably fine
    metadict['url_to_thumb_img'] = u'http://digitalmedia.fws.gov/cgi-bin/thumbnail.exe?CISOROOT=/natdiglib&CISOPTR=' + str(metadict['id'])

    hires_link = soup.find(text=lambda str: str.strip() == u'(Full Resolution Image Link)', recursive=True).parent.find('a')
    metadict['url_to_hires_img'] = hires_link['href']
    try:
        metadict['url_to_lores_img'] = u'http://digitalmedia.fws.gov' + soup.find("img", {"id" : "imagexy"})['src']
    except:
        metadict['url_to_lores_img'] = u'http://digitalmedia.fws.gov' + soup.find("input", {"type" : "image"})['src']

    data_table = soup.find("table", {"style": "border-top: 1px solid #cccccc"}).find("tbody")
    parsed_tuples = []
    for data_label_cell in data_table.findAll("td", {"width": "150"}):
        try:
            label = get_text_within(data_label_cell)
            print label
        except:
            continue
        data_cell = data_label_cell.findNextSibling("td")
        if label == 'Subject':
            data = data_cell.findAll(text=True)
        else:
            data = get_text_within(data_cell).strip()
        parsed_tuples.append((label, data))
    # now we have a list of tuples of the parsed metadata

    print parsed_tuples
    for label, data in parsed_tuples:
        field_key = data_schema.get_field_key_by_full_name(label)
        if not field_key:
            continue
        metadict[field_key] = data
        
    return metadict
Example #28
def schoolscrape(categoryurl, name, url):

    print ""
    print name

    parser = HTMLParser(tree=treebuilders.getTreeBuilder("lxml"))
    page = parser.parse(specialscrape(url))

    # pre = "{http://www.w3.org/1999/xhtml}"
    pre = ""

    keyvaluepairs = {}

    def addkeyvaluepair(k, v):
        keyvaluepairs[k] = v
        print k + ": " + v

    data_rows = [
        t
        for t in page.findall(path(["body", "div", "div", "div", "div"], pre))
        if t.attrib.get("class", "") == "detailsRow"
    ]

    for row in data_rows:
        key = [
            t for t in row.findall(path(["span"], pre))
            if t.attrib.get("class", "") == "leftColumn"
        ][0].text.rstrip(": ")
        valuetag = [
            t for t in row.findall(path(["span"], pre))
            if t.attrib.get("class", "") == "rightColumn"
        ][0]
        if valuetag.text:
            if key == "Address":
                raw_address = [valuetag.text] + [
                    br.tail for br in valuetag.findall(path(["br"], pre))
                ]
                addkeyvaluepair("Address", " / ".join(raw_address[:-1]))
                addkeyvaluepair("Postcode", raw_address[-1])
            else:
                addkeyvaluepair(key, valuetag.text)
        else:
            links = valuetag.findall(path(["a"], pre))
            if len(links) == 1:
                addkeyvaluepair(key, links[0].attrib["href"])
            else:
                for link in links:
                    href = link.attrib["href"]
                    if href[:7] != "http://":
                        href = categoryurl + "details/" + href
                    addkeyvaluepair(link.text, href)

    datastore.save(unique_keys=["Name"], data=keyvaluepairs)
Example #29
def from_tiddler(handle):
    """
    generates a tiddler from a Cook-style .tiddler file
    """
    content = handle.read().decode('utf-8', 'replace')
    content = _escape_brackets(content)

    parser = HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    dom = parser.parse(content)
    node = dom.getElementsByTagName('div')[0]

    return _get_tiddler_from_div(node)
Example #30
    def strict_validator(self):
        """
        Strict validation method.

        We just call the html5lib parser with strict=True. Error messages are awful,
        and it complains about many small errors, so it can be annoying.
        """

        strict_parser = HTMLParser(strict=True)
        try:
            strict_parser.parse(self.data)
        except ParseError as ex:
            raise ValidationError(str(ex))
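
A standalone sketch of the behaviour this method relies on: with strict=True, html5lib raises ParseError on the first violation, which the validator above converts into a ValidationError. The sample markup is illustrative:

from html5lib import HTMLParser
from html5lib.html5parser import ParseError

try:
    HTMLParser(strict=True).parse("<p>some text <b>left unclosed")
except ParseError as exc:
    print("invalid HTML:", exc)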