def scrape_committees(self):
    SEARCH_COMMITTEES_URL = "http://cfreports.sbe.virginia.gov/"
    _, resp = self.urlretrieve(SEARCH_COMMITTEES_URL)
    d = etree.fromstring(resp.content, parser=HTMLParser())
    ptp = '//span[@id="PagingTotalPages"]/text()'
    ptr = '//span[@id="PagingTotalRecords"]/text()'
    number_of_result_pages = int(d.xpath(ptp)[0])
    number_of_results = int(d.xpath(ptr)[0])
    committee_list = []
    for index in range(number_of_result_pages):
    # for index in range(2):  # Reduce time for debugging
        _, resp = self.urlretrieve(SEARCH_COMMITTEES_URL + '?page=' +
                                   str(index + 1))
        d = etree.fromstring(resp.content, parser=HTMLParser())
        target_table = d.xpath('//table/tbody')[0]
        committee_list = committee_list + \
            self.parse_committee_table(target_table)
    assert len(committee_list) == number_of_results
    for result in committee_list:
        org = Organization(
            name=result['org_name'],
            classification='political action committee',
        )
        org.add_source(url=SEARCH_COMMITTEES_URL)
        org.source_identified = True
        yield org
def check_same_tpl(html_a, html_b):
    """
    Given html_a and html_b, two HTML pages, check that they contain the same
    structure. Raises an exception if it's not the case. Otherwise, returns
    html_a.
    """
    structa = fromstring(str(html_a), parser=HTMLParser(remove_blank_text=True))
    structb = fromstring(str(html_b), parser=HTMLParser(remove_blank_text=True))
    if not elements_equal(structa, structb):
        raise Exception("The two templates do not contain the same thing!")
    return html_a
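# `check_same_tpl` above depends on an `elements_equal` helper that is not
# shown in this snippet. A minimal sketch of what such a recursive structural
# comparison might look like (hypothetical, not the original helper):
def elements_equal(a, b):
    """Compare two lxml elements by tag, attributes, text, and children."""
    if a.tag != b.tag or a.attrib != b.attrib:
        return False
    if (a.text or '').strip() != (b.text or '').strip():
        return False
    if len(a) != len(b):
        return False
    # recurse pairwise over the child elements
    return all(elements_equal(ca, cb) for ca, cb in zip(a, b))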
def __init__(self, encoding='utf8'):
    # Session object, reused across requests. Needed so the headers can be
    # set once instead of being passed to every request separately.
    self.session = requests.Session()
    self.session.headers.update({
        'Accept': 'text/html,application/xhtml+xml,'
                  'application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'pl,en-US;q=0.7,en;q=0.3',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # 'Host': 'www.krs-online.com.pl',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:28.0) '
                      'Gecko/20100101 Firefox/28.0',
        # 'Referer': 'http://www.krs-online.com.pl/muzeum-slaska-opolskiego-krs-1260077.html',
        # 'Cookie': 'krs_fk45=h5mfc4oblmd1e1nokkpu4694e5; krs_cookie_accepted=true',
        # 'DNT': '1',
    })
    self.parser = HTMLParser(encoding=encoding)
    self.cleaner = Cleaner(
        # remove scripts, styles, and comments
        scripts=True, javascript=True, comments=True, style=True,
        # head and body are kept
        page_structure=False)
def export(html, custom_filename=''):
    try:
        root = fromstring(html, parser=HTMLParser(collect_ids=False))
    except (ParserError, XMLSyntaxError):
        raise SystemExit('Error while parsing content')
    try:
        # unpacking raises ValueError on zero or multiple matches
        content, = root.xpath(CONTENT_PATH)
    except ValueError:
        raise SystemExit('Expected exactly one content element while parsing')
    for xpath in BAD_XPATHS:
        for bad in content.xpath(xpath):
            bad.getparent().remove(bad)
    if custom_filename:
        filename = custom_filename
    else:
        file_options = dict(
            mode='w+b',
            suffix='.html',
            prefix=f'{int(time())}_',
            dir=getcwd(),
            delete=False,
        )
        with NamedTemporaryFile(**file_options) as html_file:
            filename = html_file.name
    ElementTree(content).write(filename, encoding=HTML_ENCODING)
def convert_to_doc(html, domain_for_absolute_link=None):
    """
    Accept the HTML content document and convert it to a doc element.
    To convert relative links to absolute links, pass the domain URL as
    *domain_for_absolute_link*.

    :param html: A response HTML content or string body
    :param domain_for_absolute_link: A domain URL used for creating absolute links
    :return: doc instance of the html.
    """
    parser = HTMLParser(encoding='utf-8')
    try:
        doc = fromstring(html, parser=parser)
    except TypeError:
        doc = etree.parse(html, parser)
    except ValueError:
        doc = fromstring(html.encode('utf-8'))
    if domain_for_absolute_link:
        try:
            doc.make_links_absolute(domain_for_absolute_link)
        except Exception:
            absolute_links_patch(doc, domain_for_absolute_link)
    return doc
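# `convert_to_doc` falls back to an `absolute_links_patch` helper that is not
# shown here. A minimal sketch of such a fallback, assuming it simply rewrites
# href/src attributes with urljoin (hypothetical, not the original helper):
from urllib.parse import urljoin

def absolute_links_patch(doc, domain):
    """Manually absolutize href/src attributes when make_links_absolute fails."""
    for el in doc.iter():
        for attr in ('href', 'src'):
            value = el.get(attr)
            if value:
                el.set(attr, urljoin(domain, value))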
def test_missing_root_empty_string_defusedxml(self):
    parser = HTMLParser()
    with self.assertRaises((XMLSyntaxError, AssertionError)) as cm:
        defusedxml.lxml.parse(BytesIO(b''), parser=parser)
    self.assertTrue(
        cm.exception.args[0] is None
        or 'ElementTree not initialized, missing root' in str(cm.exception))
def translate_html_special_characters(string):
    # Parse the markup and return its text content; lxml decodes HTML
    # entities such as &amp; and &lt; during parsing. (The original version
    # mixed up the stdlib html.parser API with lxml's HTMLParser and called
    # a nonexistent .parse() method.)
    root = fromstring(string, parser=HTMLParser())
    return root.text_content()
def get_etree(fileobj, encoding='utf-8'):
    """
    Get an ElementTree instance from a given file object *fileobj*.
    The encoding defaults to utf-8.
    """
    return parse(fileobj, HTMLParser(encoding=encoding, remove_blank_text=True))
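# A minimal usage sketch for `get_etree`, assuming the lxml.html imports used
# throughout these snippets; BytesIO stands in for a real file object:
from io import BytesIO

tree = get_etree(BytesIO(b'<html><body><p>hi</p></body></html>'))
print(tree.getroot().findtext('.//p'))  # -> 'hi'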
def the_jama_dance(pma, verify=True):
    '''
    :param: pma (PubMedArticle object)
    :param: verify (bool) [default: True]
    :return: url (string)
    :raises: AccessDenied, NoPDFLink
    '''
    if not pma.doi:
        raise NoPDFLink('MISSING: doi needed for JAMA article.')
    baseurl = the_doi_2step(pma.doi)
    res = requests.get(baseurl)
    parser = HTMLParser()
    tree = etree.fromstring(res.content, parser)
    # we're looking for a meta tag like this:
    # <meta name="citation_pdf_url" content="http://archneur.jamanetwork.com/data/Journals/NEUR/13776/NOC40008.pdf" />
    # (the original raised on the first non-matching meta tag; scan them all,
    # then raise only if no citation_pdf_url tag was found)
    pdfurl = None
    for item in tree.findall('head/meta'):
        if item.get('name') == 'citation_pdf_url':
            pdfurl = item.get('content')
            break
    if pdfurl is None:
        raise NoPDFLink('DENIED: JAMA did not provide PDF link in (%s).' % baseurl)
    if verify:
        # TODO: form navigation
        verify_pdf_url(pdfurl, 'JAMA')
    return pdfurl
def processHTML(self):
    """ decodes data; calls callbacks; recodes data """
    zipencoding = None
    try:
        zipencoding = self.father.responseHeaders.getRawHeaders(
            "Content-Encoding")[-1]
        self.data = zips(zipencoding).decompress(self.data)
    except Exception:
        zipencoding = None
    if len(self.data) > 1 \
            and not self.data == "<!-- default response -->":
        # use encoding header if available. default is meta tag but not always present.
        # assumes unicode. could enforce with unicodedammit but very slow and never found necessary
        try:
            encoding = self.father.responseHeaders.getRawHeaders(
                "content-type")[-1].split('charset=')[1].split(";")[0]
        except Exception:
            encoding = "ISO-8859-1"
        try:
            tree = fromstring(self.data, parser=HTMLParser(encoding=encoding))
            events.gotResponseTree(self, tree)
            self.data = tostring(tree, encoding=encoding)
        except Exception:
            log.exception("{id} Content not parseable len={len} enc={enc}\n{data}"
                          .format(id=self.father.id, len=len(self.data),
                                  enc=encoding, data=self.data[:100]))
        # gotResponseTree is used by most plugins, but sometimes we may want
        # to see the raw text
        events.gotResponseText(self)
    if zipencoding:
        self.data = zips(zipencoding).compress(self.data)
def query_initial_packages(search_term):
    """
    Perform an initial package search on PyPI with the given
    :attr:`search_term`, and return a list of :attr:`PypiSearchResult`
    named objects.

    :param str search_term: The initial search query
    :return: The list of search results
    :rtype: list[PypiSearchResult]
    """
    logging.info("Querying initial packages for %s...", search_term)
    result_page = requests.get("https://pypi.python.org/pypi", params={
        ":action": "search", "term": search_term})
    result_tree = etree.fromstring(result_page.content, HTMLParser())
    result_tree.make_links_absolute(result_page.url)
    result_tags = result_tree.xpath("//table[@class='list']/tr[@class][td]")
    results = []
    for lxml_element in result_tags:
        result_obj = PypiJsonSearchResult(
            link="{0}/json".format(lxml_element[0][0].get("href")),
            weight=int(lxml_element[1].text),
            summary=lxml_element[2].text or '')
        if result_obj.is_pip_result(search_term):
            results.append(result_obj)
    return results
def view_page():
    url = flask.request.args.get("url")
    o = urllib.parse.urlparse(url)
    if o.scheme == '':
        print("Invalid url: Scheme error")
        exit(2)
    if o.netloc == '':
        print("Invalid url: Netloc error")
        exit(2)
    req = urllib.request.Request(url)
    req.add_header('User-Agent',
                   'PurdueUniversityClassProject/1.0 '
                   '([email protected] https://goo.gl/dk8u5S)')
    with urllib.request.urlopen(req) as response:
        htm = response.read()
    html = htm.decode("UTF-8")
    # Credit: Adapted from example in Python 3.4 Documentation, urllib.request
    # License: PSFL https://www.python.org/download/releases/3.4.1/license/
    # https://docs.python.org/3.4/library/urllib.request.html
    parser = HTMLParser(encoding="UTF-8")
    root = document_fromstring(html, parser=parser, base_url=url)
    # inject a <base> tag so relative links resolve against the source url
    for node in root.iter():
        if node.tag == 'head':
            newstr = "<base href = {}>".format(url)
            html = html.replace('<head>', '<head>' + '\n' + newstr)
        if node.tag == 'HEAD':
            newstr = "<BASE HREF = {}>".format(url)
            html = html.replace('<TITLE>', '<TITLE>' + '\n' + newstr)
    path = copy_profile_photo_to_static(root)
    static_url = flask.url_for('static', filename=os.path.basename(path),
                               _external=True)
    expr = r"(/home/ecegridfs/a/ee364a13/hpo/static/)([\w]+)(\.[\w]+)"
    o = re.match(expr, path)
    photo = o.group(2)
    # rewrite the profile photo's src to point at the local static copy
    for node in root.iter():
        if node.tag == "img":
            url = node.get("src")
            with urllib.request.urlopen(url) as response:
                type = response.info().get('Content-Type')
            extension = mimetypes.guess_extension(type)
            filename = make_filename(url, extension)
            col = filename.split(".")
            name = col[0]
            match_name = name
            if match_name == photo:
                html = html.replace(node.attrib['src'], static_url)
    return html
def parse_html(fileobj, encoding):
    """
    Given a file object *fileobj*, get an ElementTree instance.
    The HTML is decoded using the given *encoding*.
    """
    parser = HTMLParser(encoding=encoding, remove_blank_text=True)
    return parse(fileobj, parser)
def pre_parse(self):
    http_content_type = self.response.headers.get('content-type', '')
    target = HTMLEncodings(http_content_type)
    # parser will fail on non-ascii unless we set it explicitly
    parser = HTMLParser(target=target, encoding='ISO-8859-1')
    total_bytes = 0
    self.response.seek(0)
    while target:
        chunk = self.response.read(PRE_PARSE_CHUNK_SIZE)
        if not chunk:
            try:
                parser.close()
            except XMLSyntaxError:
                pass
            break
        if self.bom is None:
            assert PRE_PARSE_CHUNK_SIZE >= 4
            self.bom = b''
            for i in range(4, 1, -1):
                if chunk[:i] in BOM_ENC:
                    self.bom = chunk[:i]
                    target.encodings.append(('bom', BOM_ENC[self.bom]))
                    # there can only be one BOM - stop here
                    break
        parser.feed(chunk)
        total_bytes += len(chunk)
        if total_bytes >= MAX_PRE_PARSE_BYTES:
            break
    return target.encodings
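# `pre_parse` drives an `HTMLEncodings` parser target that is not shown. A
# minimal sketch of such a target, assuming it collects candidate encodings
# from the HTTP header and <meta> tags and turns falsy once the <head> has
# been consumed (hypothetical, not the original class):
import re

class HTMLEncodings:
    def __init__(self, http_content_type):
        self.encodings = []
        self._done = False
        match = re.search(r'charset=([\w-]+)', http_content_type)
        if match:
            self.encodings.append(('http', match.group(1).lower()))

    def __bool__(self):
        # the `while target:` loop above keeps feeding chunks while truthy
        return not self._done

    # lxml target-parser interface: start/end/data/close
    def start(self, tag, attrib):
        if tag == 'meta':
            if attrib.get('charset'):
                self.encodings.append(('meta', attrib['charset'].lower()))
            elif attrib.get('http-equiv', '').lower() == 'content-type':
                match = re.search(r'charset=([\w-]+)',
                                  attrib.get('content', ''))
                if match:
                    self.encodings.append(('meta', match.group(1).lower()))
        elif tag == 'body':
            self._done = True  # no declarations expected past <head>

    def end(self, tag):
        pass

    def data(self, data):
        pass

    def close(self):
        self._done = True
        return self.encodings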
def the_wolterskluwer_volta(pma, verify=True):
    '''
    :param: pma (PubMedArticle object)
    :param: verify (bool) [default: True]
    :return: url
    :raises: AccessDenied, NoPDFLink
    '''
    doiurl = 'http://content.wkhealth.com/linkback/openurl?doi=%s'
    volissurl = ('http://content.wkhealth.com/linkback/openurl?'
                 'issn={a.issn}&volume={a.volume}&issue={a.issue}&spage={a.first_page}')
    if pma.doi:
        baseurl = requests.get(doiurl % pma.doi).url
    elif pma.issn:
        pma = rectify_pma_for_vip_links(pma)  # raises NoPDFLink if missing data
        baseurl = requests.get(volissurl.format(a=pma)).url
    else:
        # original left baseurl undefined on this path
        raise NoPDFLink('MISSING: doi or issn needed for wolterskluwer article.')
    res = requests.get(baseurl)
    tree = etree.fromstring(res.content, HTMLParser())
    try:
        item = tree.cssselect('li.ej-box-01-body-li-article-tools-pdf')[0]
    except IndexError:
        raise NoPDFLink(
            'DENIED: wolterskluwer did not provide PDF link for this article')
    link = item.getchildren()[0]
    url = link.get('href')
    if verify:
        verify_pdf_url(url)
    return url
def _load(self):
    """ Load the ElementTree from the source """
    # Convert directional quotation marks to regular quotes
    double_quotes = ur'[\u201c\u201d]'
    self.source = re.sub(double_quotes, u'"', self.source)
    single_quotes = ur'[\u2019\u2018]'
    self.source = re.sub(single_quotes, u"'", self.source)
    # Convert colons
    self.source = self.source.replace(u'\uff1a', u':')
    # Remove line breaks and tabs
    self.source = self.source.replace(u'\n', u'')
    self.source = self.source.replace(u'\t', u'')
    # There are also some "zero width joiners" (\u200d) in random places in
    # the text. Remove them here, since they make string search unreliable.
    zero_width_joiners = u'\u200d'
    self.source = self.source.replace(zero_width_joiners, u'')
    # Also previously had some non-breaking spaces in unicode \u00a0, but this
    # may have been fixed by changing the parser below
    # Use the lxml cleaner
    cleaner = Cleaner()
    parser = HTMLParser(encoding='utf-8')
    # Finally, load the cleaned string to an ElementTree
    self.tree = cleaner.clean_html(
        lxml.html.fromstring(to_string(self.source), parser=parser))
def read_image(recipe_id: int) -> Optional[str]:
    # bind the path to a name so the log messages below can reference it
    filename = f"./data/crawl/{Kind.RECIPE.name.lower()}/{recipe_id}.html"
    tree = parse(filename, parser=HTMLParser(encoding="utf-8"))
    elements = tree.xpath(JSON_LD)
    if len(elements) == 0:
        logging.debug(log(f"{filename}: no recipe"))
        return None
    for element in elements:
        try:
            j = json.loads(element.text)
        except JSONDecodeError:
            logging.debug(log(f"{filename} invalid json: {element.text}"))
            return None
        if "@type" in j.keys():
            if j["@type"] == "Recipe":
                return j["image"].strip()
        if "@graph" in j.keys():
            for node in j["@graph"]:
                if node["@type"] == "Recipe":
                    # the image lives on the Recipe node, not the wrapper
                    return node["image"].strip()
        else:
            logging.debug(log(
                f"{filename}: no @graph or @type=Recipe element in json"))
            logging.debug(log(json.dumps(j, indent=2, ensure_ascii=False)))
            return None
    return None
def parse(self, **kwargs):
    from lxml.html import HTMLParser
    html_parser = HTMLParser()
    etree_root = etree.fromstring(kwargs['root'], parser=html_parser)
    return super().parse(root=etree_root, document_id=kwargs['document_id'])
def __init__(self, resp: Response):
    self.origin: str = resp.text  # raw response data
    utf8_parser = HTMLParser(encoding="utf-8")
    data = PyQuery(fromstring(self.origin, parser=utf8_parser))
    self.raw: List[EHentaiItem] = [
        EHentaiItem(i) for i in data.find(".glcat").parents("tr").items()
    ]
    self.url: str = str(resp.url)
def get_forms(self):
    """Returns a list of form elements available on the page."""
    source, encoding = self.get_source(buffered=True)
    return parse(
        source,
        parser=HTMLParser(encoding=encoding, collect_ids=False)).xpath(
            "descendant-or-self::form|descendant-or-self::x:form",
            namespaces={'x': XHTML_NAMESPACE})
def _make_etree(html, url):
    from lxml.html import HTMLParser, document_fromstring
    parser = HTMLParser(encoding="UTF-8")
    root = document_fromstring(html, parser=parser, base_url=url)
    root.make_links_absolute(root.base_url)
    return root
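# A minimal usage sketch for `_make_etree`; the HTML and URL here are
# illustrative only:
html = '<html><body><a href="/about">About</a></body></html>'
root = _make_etree(html, "https://example.com/")
print(root.xpath('//a/@href'))  # -> ['https://example.com/about']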
def __init__(self, fpath=None):
    if not fpath:
        self.file_path = ('/home/vera/work/aerostat/AEROSTATICA/aerostatica.ru/'
                          '2019/04/28/728-beltain-flook-2019/index.html')
    else:
        self.file_path = fpath
    with open(self.file_path, 'rb') as f:
        self.soup = fromstring(f.read(), parser=HTMLParser(encoding='utf8'))
def fetch_winpython_lib_page():
    """
    Fetch the Windows Python compiled libraries page and return the parsed
    element tree.
    """
    resp = requests.get(WINPYTHON_LIBS_URL, timeout=30)
    tree = etree.fromstring(resp.content, HTMLParser())
    tree.make_links_absolute(resp.url)
    return tree
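# A minimal follow-on sketch: once the tree has absolute links, wheel links
# can be pulled out with a plain xpath. The '.whl' filter is an assumption
# about the page layout, not something the original function guarantees:
tree = fetch_winpython_lib_page()
wheel_links = [href for href in tree.xpath('//a/@href') if '.whl' in href]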
def __init__(self, path: str, dev: bool):
    self.path = path
    self.dev = dev
    kinxfile = load_fm(self.__get_path("Kinxfile"))
    self.markdown = Markdown(
        renderer=Renderer(escape=False, hard_wrap=True),
        inline=InlineLexer,
        block=BlockLexer,
    )
    try:
        try:
            jinja_extensions = kinxfile["extensions"]["jinja"]
        except KeyError:
            jinja_extensions = ()
        self.env = Environment(
            extensions=jinja_extensions,
            autoescape=True,
            loader=FileSystemLoader(self.__get_path(kinxfile["root"])),
        )
        del jinja_extensions
    except KeyError:
        print("Unexpected error: 'root' key not found in Kinxfile")
        raise
    project_dir: dict = {}
    self.headers: List[Union[Tuple[str, int], str]] = [self.DEFAULT_HEADER]
    for i in etree.fromstring(self.markdown(kinxfile.content), HTMLParser())[0]:
        if i.tag == "ul":
            project_dir[self.headers[-1]].extend(self.__get_links_md(i))
        elif (i.tag[:1] == "h"
              and i.tag[1:] in (str(j) for j in range(1, 7))):
            self.headers.append((i.text, i.tag[1:]))
            project_dir[self.headers[-1]] = []
        else:
            project_dir[self.headers[-1]].append(i.tag)
            if i.tag not in ["hr"]:
                print("{} is not read in Kinxfile".format(etree.tostring(i)))
    self.kx: dict = {}
    for i in ["title", "author", "description", "url", "copyright", "theme"]:
        self.kx[i] = kinxfile[i]
    del kinxfile
    self.kx["content"] = project_dir
    del project_dir
    print(self.kx)
    self.pages: Dict[str, str] = {}
def lxmldom(self, url):
    req = self.fetch(url)
    parser = HTMLParser(encoding='utf-8', remove_pis=True,
                        remove_comments=True, remove_blank_text=True)
    dom = lxml.html.fromstring(req.text, base_url=url, parser=parser)
    dom.make_links_absolute(url)
    return dom
def parse_xhtml_file(file: Path) -> HtmlElement:
    """
    Parse an XHTML file into plain HTML, i.e. a tree of HtmlElements without
    namespace annotations.

    :param file: XHTML file
    :return: the 'html' element
    """
    # The HTMLParser seems to do what I want okay...
    return parse(str(file), parser=HTMLParser()).getroot()
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.etree import XMLSyntaxError
    from lxml.html import (
        HTMLParser,
        fromstring,
        parse,
    )

    parser = HTMLParser(recover=True, encoding=self.encoding)

    try:
        if is_url(self.io):
            with urlopen(self.io) as f:
                r = parse(f, parser=parser)
        else:
            # try to parse the input in the simplest way
            r = parse(self.io, parser=parser)
        try:
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, OSError) as e:
        # if the input is a blob of html goop
        if not is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            raise e
    else:
        if not hasattr(r, "text_content"):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

    for br in r.xpath("*//br"):
        br.tail = "\n" + (br.tail or "")

    return r
def test_missing_root_only_comment_defusedxml(self):
    parser = HTMLParser()
    self.assertRaises(AssertionError, defusedxml.lxml.parse,
                      BytesIO(b'<!-- foo -->'), parser=parser)
    try:
        defusedxml.lxml.parse(BytesIO(b'<!-- foo -->'), parser=parser)
    except Exception as ex:
        self.assertTrue('ElementTree not initialized, missing root' in str(ex))
class SESCSchedule:
    parser = HTMLParser(encoding='utf-8')
    baseurl = 'http://www.sescsp.org.br/sesc/programa_new/busca.cfm'
    basepage = parse(baseurl, parser).getroot()
    units = {}
    for option in basepage.cssselect('select[name="unidade_id"] option'):
        value = int(option.get('value'))
        units[value] = option.text_content()

    def __init__(self, unit):
        # checks whether it is a valid unit
        if unit in SESCSchedule.units:
            self.unit = unit
            self.events = []
        else:
            # a bare `raise` outside an except block is an error in itself
            raise ValueError("unit {0} doesn't exist".format(unit))

    def get_page_url(self, page):
        return SESCSchedule.baseurl + '?' + urlencode({'unidade_id': self.unit,
                                                       'page': page})

    def get_events(self, page):
        self.page_url = self.get_page_url(page)
        self.page_tree = parse(self.page_url).getroot()
        entries = self.page_tree.cssselect('#box')
        if entries:
            print 'found {0} entries in page {1}'.format(len(entries), page)
            page_events = []
            for entry_count, entry in enumerate(entries):
                if entry.cssselect('.tit2'):
                    events = SESCProgram(entry, page).events
                elif entry.cssselect('.tit'):
                    events = [SESCEvent(entry, page).dictionaries]
                else:
                    events = []
                page_events.extend(events)
                print '{0} of {1} entries have been scraped'.format(
                    entry_count, len(entries))
            # let the world know the scraper was successful
            return page_events
        else:
            print 'no more events in unit {0} schedule'.format(self.unit)
            return 0

    def __iter__(self, page=1):
        events = self.get_events(page)
        while events:
            yield events
            page += 1
            events = self.get_events(page)
        print 'iteration complete over {0} pages'.format(page)
def _build_doc(self):
    """
    Raises
    ------
    ValueError
        * If a URL that lxml cannot parse is passed.

    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError

    parser = HTMLParser(recover=False, encoding=self.encoding)

    try:
        # try to parse the input in the simplest way
        r = parse(self.io, parser=parser)
        try:
            r = r.getroot()
        except AttributeError:
            pass
    except (UnicodeDecodeError, IOError):
        # if the input is a blob of html goop
        if not _is_url(self.io):
            r = fromstring(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        else:
            # not a url
            scheme = parse_url(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = (('{invalid!r} is not a valid url scheme, valid '
                        'schemes are {valid}')
                       .format(invalid=scheme, valid=_valid_schemes))
                raise ValueError(msg)
            else:
                # something else happened: maybe a faulty connection
                raise
    else:
        if not hasattr(r, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

    return r
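# The two `_build_doc` methods above are internal to pandas' lxml-backed HTML
# parser; end users reach them through `pandas.read_html`. A minimal usage
# sketch (StringIO avoids the deprecation of passing literal HTML directly):
from io import StringIO
import pandas as pd

tables = pd.read_html(
    StringIO("<table><tr><th>a</th></tr><tr><td>1</td></tr></table>"),
    flavor="lxml",
)
print(tables[0])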