Example #1
    def _get_version_filenames(self, session, chamber):
        '''All bills have "versions", but for those lacking html documents,
        the .wpd file is available via ftp. Create a dict of those links
        in advance; any bills lacking html versions will get version info
        from this dict.'''

        chamber_name = {'upper': 'senate', 'lower': 'House'}[chamber]
        ftp_url = 'ftp://www.legis.state.wv.us/publicdocs/%s/RS/%s/'
        ftp_url = ftp_url % (session, chamber_name)

        html = self.urlopen(ftp_url)
        dirs = [' '.join(x.split()[3:]) for x in html.splitlines()]

        split = re.compile(r'\s+').split
        matchwpd = re.compile(r'\.wpd$', re.I).search
        splitext = os.path.splitext
        version_filenames = collections.defaultdict(list)
        for d in dirs:
            url = ('%s%s/' % (ftp_url, d)).replace(' ', '%20')
            html = self.urlopen(url)
            filenames = [split(x, 3)[-1] for x in html.splitlines()]
            filenames = filter(matchwpd, filenames)
            for fn in filenames:
                fn, ext = splitext(fn)
                if ' ' in fn:
                    bill_id, _ = fn.split(' ', 1)
                else:
                    # One bill during 2011 had no spaces
                    # in the filename. Probably a fluke.
                    digits = re.search(r'\d+', fn)
                    bill_id = fn[:digits.end()]

                version_filenames[bill_id.lower()].append((d, fn))

        self._version_filenames = version_filenames
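A sketch of how the collected mapping might be consumed afterwards for a bill that has no HTML versions. The scraper instance, session/chamber values, and bill id below are hypothetical, not part of the original snippet; only the URL layout mirrors the format string used above.

# Hypothetical follow-up step, assuming `scraper` is an instance of the
# class above and that "hb2001" only had .wpd versions on the FTP site.
scraper._get_version_filenames('2011', 'lower')
for directory, name in scraper._version_filenames.get('hb2001', []):
    # Rebuild the FTP link from the subdirectory and bare file name,
    # mirroring the URL-building done while scanning the listing.
    url = ('ftp://www.legis.state.wv.us/publicdocs/2011/RS/House/%s/%s.wpd'
           % (directory, name)).replace(' ', '%20')
    print(url)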
Example #2
    def _get_version_filenames(self, session, chamber):
        '''All bills have "versions", but for those lacking html documents,
        the .wpd file is available via ftp. Create a dict of those links
        in advance; any bills lacking html versions will get version info
        from this dict.'''

        chamber_name = {'upper': 'senate', 'lower': 'House'}[chamber]
        ftp_url = 'ftp://www.legis.state.wv.us/publicdocs/%s/RS/%s/'
        ftp_url = ftp_url % (session, chamber_name)

        html = self.urlopen(ftp_url)
        dirs = [' '.join(x.split()[3:]) for x in html.splitlines()]

        split = re.compile(r'\s+').split
        matchwpd = re.compile(r'\.wpd$', re.I).search
        splitext = os.path.splitext
        version_filenames = collections.defaultdict(list)
        for d in dirs:
            url = ('%s%s/' % (ftp_url, d)).replace(' ', '%20')
            html = self.urlopen(url)
            filenames = [split(x, 3)[-1] for x in html.splitlines()]
            filenames = filter(matchwpd, filenames)
            for fn in filenames:
                fn, ext = splitext(fn)
                if ' ' in fn:
                    bill_id, _ = fn.split(' ', 1)
                else:
                    # One bill during 2011 had no spaces
                    # in the filename. Probably a fluke.
                    digits = re.search(r'\d+', fn)
                    bill_id = fn[:digits.end()]

                version_filenames[bill_id.lower()].append((d, fn))

        self._version_filenames = version_filenames
Example #3
    def _get_version_filenames(self, session, chamber):
        """All bills have "versions", but for those lacking html documents,
        the .wpd file is available via ftp. Create a dict of those links
        in advance; any bills lacking html versions will get version info
        from this dict."""

        chamber_name = {"upper": "senate", "lower": "House"}[chamber]
        ftp_url = "ftp://www.legis.state.wv.us/publicdocs/%s/RS/%s/"
        ftp_url = ftp_url % (session, chamber_name)

        html = self.urlopen(ftp_url).decode("iso-8859-1")
        dirs = [" ".join(x.split()[3:]) for x in html.splitlines()]

        split = re.compile(r"\s+").split
        matchwpd = re.compile(r"\.wpd$", re.I).search
        splitext = os.path.splitext
        version_filenames = collections.defaultdict(list)
        for d in dirs:
            url = ("%s%s/" % (ftp_url, d)).replace(" ", "%20")
            html = self.urlopen(url).decode("iso-8859-1")
            filenames = [split(x, 3)[-1] for x in html.splitlines()]
            filenames = filter(matchwpd, filenames)

            for fn in filenames:
                fn, ext = splitext(fn)
                bill_id, _ = fn.split(" ", 1)
                version_filenames[bill_id.lower()].append((d, fn))

        self._version_filenames = version_filenames
Example #4
def get_cleaned_form_html(form, human_readable=True):
    """
    Return a cleaned up version of <form> HTML contents.
    If ``human_readable`` is True, HTML is cleaned to make
    source code more readable for humans; otherwise it is cleaned to make
    rendered form more safe to render.
    """
    params = dict(
        forms=False,
        javascript=True,
        scripts=True,
        remove_unknown_tags=False,
    )

    if human_readable:
        params.update(
            style=True,
            allow_tags={'form', 'input', 'textarea', 'label', 'option',
                        'select', 'submit', 'a'},
        )
    else:
        params.update(style=False)

    cleaner = Cleaner(**params)
    raw_html = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    html = cleaner.clean_html(raw_html)
    if human_readable:
        lines = [line.strip() for line in html.splitlines(False) if line.strip()]
        html = "\n".join(lines)
    return html
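A minimal usage sketch of the function above. The sample markup, the variable name, and the lxml import are assumptions for illustration only.

# Usage sketch (hypothetical markup; lxml.html and Cleaner must be importable
# as in the module that defines get_cleaned_form_html).
import lxml.html

sample_form = lxml.html.fromstring(
    '<form action="/search">'
    '<script>track();</script>'
    '<label>Query <input type="text" name="q" style="width:12em"></label>'
    '<input type="submit" value="Go">'
    '</form>'
)
# Readable output: scripts dropped, tags outside the whitelist removed,
# blank lines stripped.
print(get_cleaned_form_html(sample_form))
# Safe-to-render output: the tag whitelist is not applied.
print(get_cleaned_form_html(sample_form, human_readable=False))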
Example #5
    def __parse(self, html):
        # remove xml decl and doctype, we will add the correct one before serializing
        # html = re.compile ('^.*<html ', re.I | re.S).sub ('<html ', html)
        # FIXME: do not remove doctype because we need it to load the dtd

        # remove xml declaration because of parser error: "Unicode
        # strings with encoding declaration are not supported. Please
        # use bytes input or XML fragments without declaration."
        re_xml_decl = re.compile(r'^.*?<\?xml.*?\?>', re.S | re.U)
        html = re_xml_decl.sub('', html)
        try:
            return etree.fromstring(html,
                                    lxml.html.XHTMLParser(huge_tree=True),
                                    base_url=self.attribs.url)
        except etree.ParseError as what:
            # cannot try HTML parser because we depend on correct xhtml namespace
            m = re.search(r"Entity '([^']+)'", str(what))
            if m:
                warning("Missing entity: '%s'" % m.group(1))
            else:
                error("Failed to parse file because: %s" % what)
            m = re.search(r'line\s(\d+),', str(what))
            if m:
                lineno = int(m.group(1))
                error("Line %d: %s" % (lineno, html.splitlines()[lineno - 1]))
            raise
Example #6
def get_cleaned_form_html(form, human_readable=True):
    """
    Return a cleaned up version of <form> HTML contents.
    If ``human_readable`` is True, HTML is cleaned to make
    source code more readable for humans; otherwise it is cleaned to make
    rendered form more safe to render.
    """
    params = dict(
        forms=False,
        javascript=True,
        scripts=True,
        remove_unknown_tags=False,
    )

    if human_readable:
        params.update(
            style=True,
            allow_tags={
                'form', 'input', 'textarea', 'label', 'option', 'select',
                'submit', 'a'
            },
        )
    else:
        params.update(style=False)

    cleaner = Cleaner(**params)
    raw_html = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    html = cleaner.clean_html(raw_html)
    if human_readable:
        lines = [
            line.strip() for line in html.splitlines(False) if line.strip()
        ]
        html = "\n".join(lines)
    return html
Example #7
def html2plaintext(html, body_id=None, encoding='utf-8'):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath('//*[@id=%s]' % (body_id,))
    else:
        source = tree.xpath('//body')
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall('.//a'):
        url = link.get('href')
        if url:
            i += 1
            link.tag = 'span'
            link.text = '%s [%s]' % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace('&#13;', '')

    html = html.replace('<strong>', '*').replace('</strong>', '*')
    html = html.replace('<b>', '*').replace('</b>', '*')
    html = html.replace('<h3>', '*').replace('</h3>', '*')
    html = html.replace('<h2>', '**').replace('</h2>', '**')
    html = html.replace('<h1>', '**').replace('</h1>', '**')
    html = html.replace('<em>', '/').replace('</em>', '/')
    html = html.replace('<tr>', '\n')
    html = html.replace('</p>', '\n')
    html = re.sub(r'<br\s*/?>', '\n', html)
    html = re.sub('<.*?>', ' ', html)
    html = html.replace(' ' * 2, ' ')
    html = html.replace('&gt;', '>')
    html = html.replace('&lt;', '<')
    html = html.replace('&amp;', '&')

    # strip all lines
    html = '\n'.join([x.strip() for x in html.splitlines()])
    html = html.replace('\n' * 2, '\n')

    for i, url in enumerate(url_index):
        if i == 0:
            html += '\n\n'
        html += ustr('[%s] %s\n') % (i + 1, url)

    return html
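A small usage sketch of the converter above. The input string is made up; html2plaintext, ustr, etree and re are assumed to come from the module that defines the function.

# Illustrative input only.
sample = (
    '<div><h2>Status</h2>'
    '<p>The build <strong>passed</strong>.</p>'
    '<p>See <a href="http://example.com/logs">the logs</a>.</p></div>'
)
print(html2plaintext(sample))
# Roughly: "**Status**", "*passed*" for the bold text, the link rewritten as
# "the logs [1]", and "[1] http://example.com/logs" appended at the end.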
Example #8
def html2plaintext(html, body_id=None, encoding="utf-8"):
    """ From an HTML text, convert the HTML to plain text.
    If @param body_id is provided then this is the tag where the
    body (not necessarily <body>) starts.
    """
    ## (c) Fry-IT, www.fry-it.com, 2007
    ## <*****@*****.**>
    ## download here: http://www.peterbe.com/plog/html2plaintext

    html = ustr(html)
    tree = etree.fromstring(html, parser=etree.HTMLParser())

    if body_id is not None:
        source = tree.xpath("//*[@id=%s]" % (body_id,))
    else:
        source = tree.xpath("//body")
    if len(source):
        tree = source[0]

    url_index = []
    i = 0
    for link in tree.findall(".//a"):
        url = link.get("href")
        if url:
            i += 1
            link.tag = "span"
            link.text = "%s [%s]" % (link.text, i)
            url_index.append(url)

    html = ustr(etree.tostring(tree, encoding=encoding))
    # \r char is converted into &#13;, must remove it
    html = html.replace("&#13;", "")

    html = html.replace("<strong>", "*").replace("</strong>", "*")
    html = html.replace("<b>", "*").replace("</b>", "*")
    html = html.replace("<h3>", "*").replace("</h3>", "*")
    html = html.replace("<h2>", "**").replace("</h2>", "**")
    html = html.replace("<h1>", "**").replace("</h1>", "**")
    html = html.replace("<em>", "/").replace("</em>", "/")
    html = html.replace("<tr>", "\n")
    html = html.replace("</p>", "\n")
    html = re.sub("<br\s*/?>", "\n", html)
    html = re.sub("<.*?>", " ", html)
    html = html.replace(" " * 2, " ")
    html = html.replace("&gt;", ">")
    html = html.replace("&lt;", "<")
    html = html.replace("&amp;", "&")

    # strip all lines
    html = "\n".join([x.strip() for x in html.splitlines()])
    html = html.replace("\n" * 2, "\n")

    for i, url in enumerate(url_index):
        if i == 0:
            html += "\n\n"
        html += ustr("[%s] %s\n") % (i + 1, url)

    return html
Example #9
def step_HTML_matches_MD(context):
    with codecs.open(context.html_path, encoding='utf-8') as fi:
        html = context.html_text = normalize_html(fi.read())
    assert context.translated_html_text == html, '\nDifferences:\n' + '\n'.join(
        difflib.context_diff([n.encode('ascii', 'replace')
                              for n in context.translated_html_text.splitlines()],
                             [n.encode('ascii', 'replace')
                              for n in html.splitlines()],
                             fromfile='Got', tofile='Expected'))
Example #10
    def _get_version_filenames(self, session, chamber):
        """All bills have "versions", but for those lacking html documents,
        the .wpd file is available via ftp. Create a dict of those links
        in advance; any bills lacking html versions will get version info
        from this dict."""

        chamber_name = {"upper": "senate", "lower": "House"}[chamber]
        ftp_url = "ftp://www.legis.state.wv.us/publicdocs/%s/RS/%s/"
        ftp_url = ftp_url % (session, chamber_name)

        try:
            html = self.urlopen(ftp_url)
        except scrapelib.FTPError:
            # The url doesn't exist. Just set _version_filenames
            # to an empty dict.
            self._version_filenames = {}
            return

        dirs = [" ".join(x.split()[3:]) for x in html.splitlines()]

        split = re.compile(r"\s+").split
        matchwpd = re.compile(r"\.wpd$", re.I).search
        splitext = os.path.splitext
        version_filenames = collections.defaultdict(list)
        for d in dirs:
            url = ("%s%s/" % (ftp_url, d)).replace(" ", "%20")
            html = self.urlopen(url)
            filenames = [split(x, 3)[-1] for x in html.splitlines()]
            filenames = filter(matchwpd, filenames)
            for fn in filenames:
                fn, ext = splitext(fn)
                if " " in fn:
                    bill_id, _ = fn.split(" ", 1)
                else:
                    # One bill during 2011 had no spaces
                    # in the filename. Probably a fluke.
                    digits = re.search(r"\d+", fn)
                    bill_id = fn[: digits.end()]

                version_filenames[bill_id.lower()].append((d, fn))

        self._version_filenames = version_filenames
Example #11
def decode_html(original_html):
    html = urllib2.urlopen(original_html).read()
    # Split the fetched HTML source into lines, because Sina Weibo
    # compresses the page into long script lines.
    lines = html.splitlines()
    decoded_html = ''  # avoid an unbound name if no matching line is found
    for line in lines:
        if line.startswith(
                '<script>STK && STK.pageletM && STK.pageletM.view({"pid":"pl_weibo_direct","js":["apps'
        ):
            n = line.find('"html":"')
            if n > 0:
                decoded_html = line[n + 8:].encode("utf-8").decode(
                    'unicode_escape').encode("utf-8").replace("\\", "")
    return decoded_html
Example #12
def print_form_html(form):
    """ Print a cleaned up version of <form> HTML contents """
    cleaner = Cleaner(
        forms=False,
        javascript=True,
        scripts=True,
        style=True,
        allow_tags={'form', 'input', 'textarea', 'label', 'option', 'select', 'submit', 'a'},

        remove_unknown_tags=False,
    )
    html = cleaner.clean_html(lxml.html.tostring(form, pretty_print=True))
    lines = [line.strip() for line in html.splitlines(False) if line.strip()]
    print("\n".join(lines))
Example #13
def print_form_html(form):
    """ Print a cleaned up version of <form> HTML contents """
    cleaner = Cleaner(
        forms=False,
        javascript=True,
        scripts=True,
        style=True,
        allow_tags={
            'form', 'input', 'textarea', 'label', 'option', 'select', 'submit',
            'a'
        },
        remove_unknown_tags=False,
    )
    raw_html = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    html = cleaner.clean_html(raw_html)
    lines = [line.strip() for line in html.splitlines(False) if line.strip()]
    print("\n".join(lines))
Example #14
def get_form_hash(form, only_visible=True):
    """
    Return a string which is the same for duplicate forms, but different
    for forms which are not the same.

    If only_visible is True, hidden fields are not taken in account.
    """
    if isinstance(form, six.string_types):
        form = lxml.html.fromstring(form)
    else:
        form = deepcopy(form)

    if only_visible:
        remove_by_xpath(form, "input[@type='hidden']")

    html = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    lines = [line.strip() for line in html.splitlines(False) if line.strip()]

    # return the whole string as a hash, for easier debugging
    return "\n".join(lines)
Example #15
    def __parse (self, html):
        # remove xml decl and doctype, we will add the correct one before serializing
        # html = re.compile ('^.*<html ', re.I | re.S).sub ('<html ', html)
        # FIXME: do not remove doctype because we need it to load the dtd

        # remove xml declaration because of parser error: "Unicode
        # strings with encoding declaration are not supported. Please
        # use bytes input or XML fragments without declaration."
        re_xml_decl = re.compile (r'^<\?xml.*?\?>', re.S)
        html = re_xml_decl.sub ('', html)
        try:
            return etree.fromstring (
                html,
                lxml.html.XHTMLParser (),
                base_url = self.url)
        except etree.ParseError, what:
            # cannot try HTML parser because we depend on correct xhtml namespace
            error ("etree.fromstring says: %s" % what)
            m = re.search (r'line\s(\d+),', str (what))
            if m:
                lineno = int (m.group (1))
                error ("Line %d: %s" % (lineno, html.splitlines ()[lineno - 1]))
            raise
Example #16
def _get_form_hash(form):
    # it just returns a full string as a hash, for easier debugging
    html = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    lines = [line.strip() for line in html.splitlines(False) if line.strip()]
    return "\n".join(lines)
Example #17
def ct_session_info():
    html = scrapelib.urlopen("ftp://ftp.cga.ct.gov")
    sessions = [line.split()[-1] for line in html.splitlines()]
    sessions.pop()  # remove pub/
    return sessions, sessions[-1]
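A tiny illustration of the listing-parsing step above, using a made-up FTP directory listing; the entries are assumptions, not real server output.

# Made-up listing lines, only to show the line.split()[-1] parsing step.
listing = (
    "drwxr-xr-x  1 ftp ftp  0 Jan 01  2011 2011\n"
    "drwxr-xr-x  1 ftp ftp  0 Jan 01  2013 2013\n"
    "drwxr-xr-x  1 ftp ftp  0 Jan 01  2015 pub"
)
sessions = [line.split()[-1] for line in listing.splitlines()]
sessions.pop()  # the listing ends with pub/, which is not a session
print(sessions, sessions[-1])  # ['2011', '2013'] 2013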
Example #18
nodes = sel(html)

print len(nodes)
for node in nodes:
    # print lxml.html.tostring(node)
    print node.get('href'), node.text

Web data 2: fetch a web file and load it into a data structure
import urllib2
url='http://archive.ics.uci.edu/ml/machine-learning-databases/horse-colic/horse-colic.data'
res=urllib2.urlopen(url)
html = res.read()
res.close()
print len(html)

lines=html.splitlines()
data=[]
for line in lines:
    data.append(line.split())
print len(data), len(data[0])
print data[0]


Web data 3: search for 'python' on the wiki and print the http urls

from urllib import urlopen
keyword='python'
resp = urlopen('https://www.google.com/search?q='+keyword)
html=resp.read()
len(html)
Example #19
def ct_session_info():
    html = scrapelib.urlopen("ftp://ftp.cga.ct.gov")
    sessions = [line.split()[-1] for line in html.splitlines()]
    sessions.pop()    # remove pub/
    return sessions, sessions[-1]
Example #20
def _get_form_hash(form):
    # it just returns a full string as a hash, for easier debugging
    html = lxml.html.tostring(form, pretty_print=True, encoding="unicode")
    lines = [line.strip() for line in html.splitlines(False) if line.strip()]
    return "\n".join(lines)