def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like
        A URL, an object exposing ``read``, the path of an existing file,
        or the raw text itself.

    Returns
    -------
    raw_text : str
        Whatever was read from `io`, or `io` itself when it is a string
        that is neither a URL nor the path of an existing file.

    Raises
    ------
    ValueError
        If opening the URL raises ``urllib2.URLError``, or if `io` is
        neither a string nor an object with a ``read`` attribute.
    """
    if _is_url(io):
        try:
            with contextlib.closing(urllib2.urlopen(io)) as url:
                raw_text = url.read()
        except urllib2.URLError:
            raise ValueError('Invalid URL: "{0}"'.format(io))
    elif hasattr(io, 'read'):
        raw_text = io.read()
    elif isinstance(io, basestring):
        # Only strings are probed as filesystem paths: os.path.isfile() can
        # raise TypeError for non-string arguments, which would pre-empt
        # the ValueError raised below for unsupported objects.
        if os.path.isfile(io):
            with open(io) as f:
                raw_text = f.read()
        else:
            raw_text = io
    else:
        raise ValueError("Cannot read object of type '{0}'".format(type(io)))
    return raw_text
def _build_doc(self):
    """Parse ``self.io`` with lxml and return the parsed result.

    Raises
    ------
    IOError
        * If ``self.io`` is a URL with an accepted protocol that still
          could not be parsed. This is probably due to a faulty or
          non-existent internet connection.
    ValueError
        * If ``self.io`` is a URL whose protocol is not one of ``http``,
          ``ftp`` or ``file``.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring

    source = self.io
    try:
        # first attempt: hand the input straight to lxml
        return parse(source)
    except (UnicodeDecodeError, IOError):
        if _is_url(source):
            accepted = ('http', 'ftp', 'file')
            if urlparse.urlparse(source).scheme in accepted:
                raise IOError('"{0}" is a valid URL, so you probably are not'
                              ' properly connected to the'
                              ' internet'.format(source))
            raise ValueError('"{0}" does not have a valid URL'
                             ' protocol'.format(source))
        # anything that is not a URL is probably a large blob of markup,
        # so parse it from memory instead
        return fromstring(source)
def _build_doc(self):
    """Parse ``self.io`` with a non-recovering lxml ``HTMLParser``.

    Raises
    ------
    ValueError
        * If ``self.io`` is a URL whose scheme is not in
          ``_valid_schemes``.
    XMLSyntaxError
        * If direct parsing succeeds but the result has no
          ``text_content`` attribute.
    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring, HTMLParser
    from lxml.etree import XMLSyntaxError

    def _unwrap(node):
        # objects exposing getroot() are replaced by its result; anything
        # lacking that attribute is handed back untouched
        try:
            return node.getroot()
        except AttributeError:
            return node

    html_parser = HTMLParser(recover=False)

    try:
        # simplest route first: let lxml open and parse the input itself
        doc = _unwrap(parse(self.io, parser=html_parser))
    except (UnicodeDecodeError, IOError):
        if _is_url(self.io):
            scheme = urlparse.urlparse(self.io).scheme
            if scheme in _valid_schemes:
                # the scheme is fine, so something else went wrong
                # (maybe a faulty connection): propagate the error
                raise
            # a scheme lxml cannot handle
            msg = ('{0} is not a valid url scheme, valid schemes are '
                   '{1}').format(scheme, _valid_schemes)
            raise ValueError(msg)
        # not a url, so treat the input as a blob of html
        doc = _unwrap(fromstring(self.io, parser=html_parser))
    else:
        if not hasattr(doc, 'text_content'):
            raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
    return doc
def _build_doc(self):
    """Read ``self.io`` and parse its ``table`` elements with BeautifulSoup.

    ``self.io`` may be a URL, an object exposing ``read``, the path of an
    existing file, or a string holding the raw markup.

    Returns
    -------
    BeautifulSoup
        The document built from the raw text, restricted to ``table``
        elements through a ``SoupStrainer``.

    Raises
    ------
    ValueError
        If opening the URL raises ``urllib2.URLError``, or if ``self.io``
        is neither a string nor an object with a ``read`` attribute.
    AssertionError
        If the text that was read is empty.
    """
    if _is_url(self.io):
        try:
            with contextlib.closing(urllib2.urlopen(self.io)) as url:
                raw_text = url.read()
        except urllib2.URLError:
            raise ValueError('Invalid URL: "{0}"'.format(self.io))
    elif hasattr(self.io, 'read'):
        raw_text = self.io.read()
    elif isinstance(self.io, basestring):
        # Only strings are probed as filesystem paths: os.path.isfile() can
        # raise TypeError for non-string arguments, which would pre-empt
        # the ValueError raised below for unsupported objects.
        if os.path.isfile(self.io):
            with open(self.io) as f:
                raw_text = f.read()
        else:
            raw_text = self.io
    else:
        raise ValueError("Cannot read object of"
                         " type '{0}'".format(type(self.io)))

    if not raw_text:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped under ``python -O``; the exception type is kept so
        # existing callers see the same error.
        raise AssertionError('No text parsed from document')

    from bs4 import BeautifulSoup, SoupStrainer
    strainer = SoupStrainer('table')
    return BeautifulSoup(raw_text, parse_only=strainer)
def _build_doc(self):
    """Parse ``self.io`` with lxml and return it passed through ``clean_html``.

    Raises
    ------
    ValueError
        * If ``self.io`` is a URL whose scheme is not in
          ``_valid_schemes``.
    Exception
        * Any other ``Exception`` thrown. For example, trying to parse a
          URL that is syntactically correct on a machine with no internet
          connection will fail.

    See Also
    --------
    pandas.io.html._HtmlFrameParser._build_doc
    """
    from lxml.html import parse, fromstring
    from lxml.html.clean import clean_html

    try:
        # try to parse the input in the simplest way
        r = parse(self.io)
    except (UnicodeDecodeError, IOError):
        if not _is_url(self.io):
            # not a url: treat the input as a blob of html goop
            r = fromstring(self.io)
        else:
            scheme = urlparse.urlparse(self.io).scheme
            if scheme not in _valid_schemes:
                # lxml can't parse it
                msg = ('{0} is not a valid url scheme, valid schemes are '
                       '{1}').format(scheme, _valid_schemes)
                raise ValueError(msg)
            # something else happened: maybe a faulty connection.  A bare
            # ``raise`` re-raises the active exception with its original
            # traceback, which ``raise e`` would discard on Python 2.
            raise
    return clean_html(r)
def __init__(self, path_or_buf, encoding=None):
    """Initialise reader state, resolve `path_or_buf` and read the header.

    Parameters
    ----------
    path_or_buf : str or file-like
        A URL (its content is downloaded, decoded and wrapped in a
        ``StringIO``), a filesystem path (opened in ``'rb'`` mode), or any
        other object, which is stored unchanged.
    encoding : optional
        Forwarded to the parent initialiser.
        NOTE(review): presumably that is what sets ``self._encoding``,
        which is read below -- confirm against the parent class.
    """
    super(StataReader, self).__init__(encoding)
    self.col_sizes = ()
    self._has_string_data = False
    self._missing_values = False
    self._data_read = False
    self._value_labels_read = False

    if isinstance(path_or_buf, str) and _is_url(path_or_buf):
        from urllib.request import urlopen
        response = urlopen(path_or_buf)
        # NOTE(review): when py3compat.PY3 is false this branch leaves
        # self.path_or_buf unassigned -- confirm that case is unreachable.
        if py3compat.PY3:  # pragma: no cover
            if self._encoding:
                errors = 'strict'
            else:
                # no encoding requested: fall back to cp1252 and replace
                # undecodable bytes instead of failing
                errors = 'replace'
                self._encoding = 'cp1252'
            try:
                raw = response.read()
            finally:
                # the whole payload is now in memory, so release the
                # connection instead of leaving it open
                response.close()
            self.path_or_buf = StringIO(self._decode_bytes(raw, errors))
    elif isinstance(path_or_buf, str):
        # isinstance (not an exact type check) so str subclasses are
        # treated as paths too, matching the URL test above
        self.path_or_buf = open(path_or_buf, 'rb')
    else:
        self.path_or_buf = path_or_buf

    self._read_header()