Пример #1
0
def _read(obj):
    """Return the raw content behind a url, an open file, a path or a string.

    Parameters
    ----------
    obj : str, unicode, or file-like
        Checked in this order: a URL (as decided by ``_is_url``), an object
        with a ``read`` attribute, then a string. A string naming an existing
        file is opened in binary mode and its contents are returned; any
        other string is returned unchanged.

    Returns
    -------
    raw_text : str
        Whatever ``read()`` produced for URLs and files, or ``obj`` itself
        when it is a plain string.

    Raises
    ------
    TypeError
        If ``obj`` is not a URL, has no ``read`` attribute and is not an
        instance of ``char_types``.
    """
    if _is_url(obj):
        with urlopen(obj) as response:
            return response.read()

    if hasattr(obj, 'read'):
        return obj.read()

    if not isinstance(obj, char_types):
        raise TypeError("Cannot read object of type %r" % type(obj).__name__)

    # The string may be a path; a TypeError/ValueError while probing or
    # reading it means it is treated as literal text instead.
    try:
        if os.path.isfile(obj):
            with open(obj, 'rb') as handle:
                return handle.read()
    except (TypeError, ValueError):
        pass
    return obj
Пример #2
0
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like
        Checked in this order: a URL (as decided by ``_is_url``), an object
        with a ``read`` attribute, a path to an existing file, then a plain
        string, which is returned unchanged.

    Returns
    -------
    raw_text : str
        The result of ``read()`` for URLs and files, or ``io`` itself when
        it is a plain string.

    Raises
    ------
    ValueError
        If ``io`` is a URL and opening or reading it raises
        ``urllib2.URLError``.
    TypeError
        If ``io`` is none of the supported kinds.
    """
    if _is_url(io):
        try:
            with urlopen(io) as url:
                raw_text = url.read()
        except urllib2.URLError:
            raise ValueError('Invalid URL: "{0}"'.format(io))
    elif hasattr(io, "read"):
        raw_text = io.read()
    else:
        # os.path.isfile can itself raise for objects that are not valid
        # path arguments (TypeError) or for strings containing null bytes
        # (ValueError); treat both as "not a file" so the checks below run.
        try:
            is_file = os.path.isfile(io)
        except (TypeError, ValueError):
            is_file = False

        if is_file:
            with open(io) as f:
                raw_text = f.read()
        elif isinstance(io, basestring):
            raw_text = io
        else:
            # {0!r} already adds quotes around the type name
            raise TypeError("Cannot read object of type {0!r}"
                            .format(io.__class__.__name__))
    return raw_text
Пример #3
0
    def __init__(self, io, **kwds):
        """Open an Excel workbook through xlrd and store it on ``self.book``.

        Parameters
        ----------
        io : str, xlrd.Book, or file-like
            A string is treated as a URL when ``_is_url`` says so (its
            content is downloaded) and otherwise handed to
            ``xlrd.open_workbook`` directly. An ``xlrd.Book`` is accepted
            only together with ``engine='xlrd'``. Any other object with a
            ``read`` attribute is read fully and parsed from memory.
        **kwds
            Only ``engine`` is consumed; it must be ``None`` or ``'xlrd'``.

        Raises
        ------
        ImportError
            If xlrd is missing or older than 0.9.
        ValueError
            If ``engine`` is not ``None``/``'xlrd'``, or ``io`` matches none
            of the supported kinds.
        """

        import xlrd  # throw an ImportError if we need to

        ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2]))
        if ver < (0, 9):  # pragma: no cover
            raise ImportError("pandas requires xlrd >= 0.9.0 for excel "
                              "support, current version " + xlrd.__VERSION__)

        self.io = io

        engine = kwds.pop('engine', None)

        if engine is not None and engine != 'xlrd':
            raise ValueError("Unknown engine: %s" % engine)

        if isinstance(io, compat.string_types):
            if _is_url(io):
                # close the response even if read() fails, so the
                # connection is not leaked
                response = _urlopen(io)
                try:
                    data = response.read()
                finally:
                    response.close()
                self.book = xlrd.open_workbook(file_contents=data)
            else:
                self.book = xlrd.open_workbook(io)
        elif engine == 'xlrd' and isinstance(io, xlrd.Book):
            self.book = io
        elif not isinstance(io, xlrd.Book) and hasattr(io, "read"):
            # N.B. xlrd.Book has a read attribute too
            data = io.read()
            self.book = xlrd.open_workbook(file_contents=data)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')
Пример #4
0
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like
        Checked in this order: a URL (as decided by ``_is_url``), an object
        with a ``read`` attribute, a path to an existing file, then a plain
        string, which is returned unchanged.

    Returns
    -------
    raw_text : str
        The result of ``read()`` for URLs and files, or ``io`` itself when
        it is a plain string.

    Raises
    ------
    TypeError
        If ``io`` is none of the supported kinds.
    """
    if _is_url(io):
        with urlopen(io) as url:
            raw_text = url.read()
    elif hasattr(io, 'read'):
        raw_text = io.read()
    else:
        # os.path.isfile can itself raise for objects that are not valid
        # path arguments (TypeError) or for strings containing null bytes
        # (ValueError); treat both as "not a file" so the checks below run.
        try:
            is_file = os.path.isfile(io)
        except (TypeError, ValueError):
            is_file = False

        if is_file:
            with open(io) as f:
                raw_text = f.read()
        elif isinstance(io, compat.string_types):
            raw_text = io
        else:
            # {0!r} already adds quotes around the type name
            raise TypeError("Cannot read object of type {0!r}"
                            .format(io.__class__.__name__))
    return raw_text
Пример #5
0
    def _write_cell(self, s, kind='td', indent=0, tags=None):
        """Write one HTML cell for ``s`` through ``self.write``.

        Parameters
        ----------
        s : object
            Cell value; rendered with ``pprint_thing`` and stripped.
        kind : str, default 'td'
            Tag name used for the opening and closing tag.
        indent : int, default 0
            Passed through to ``self.write``.
        tags : str, optional
            Attribute text placed inside the opening tag.
        """
        if tags is None:
            opening = '<{kind}>'.format(kind=kind)
        else:
            opening = '<{kind} {tags}>'.format(kind=kind, tags=tags)

        if self.escape:
            # '&' is listed first so the entities produced for '<' and '>'
            # are not escaped a second time
            replacements = OrderedDict([('&', r'&amp;'), ('<', r'&lt;'),
                                        ('>', r'&gt;')])
        else:
            replacements = {}

        rs = pprint_thing(s, escape_chars=replacements).strip()

        closing_anchor = ''
        if self.render_links and _is_url(rs):
            # the href receives the unescaped rendering of the value
            href = pprint_thing(s, escape_chars={}).strip()
            opening += '<a href="{url}" target="_blank">'.format(url=href)
            closing_anchor = '</a>'

        cell = u'{start}{rs}{end_a}</{kind}>'.format(
            start=opening, rs=rs, end_a=closing_anchor, kind=kind)
        self.write(cell, indent)
Пример #6
0
    def _build_doc(self):
        """
        Parse ``self.io`` with lxml and return the parsed document.

        URLs are opened with ``urlopen`` and parsed from the response; any
        other input is first handed to ``lxml.html.parse`` and, if that
        raises ``UnicodeDecodeError`` or ``IOError``, parsed as a literal
        HTML string with ``lxml.html.fromstring``.

        Raises
        ------
        XMLSyntaxError
            * If direct parsing succeeds but the result has no
              ``text_content`` attribute.

        UnicodeDecodeError, IOError
            * Re-raised unchanged when they occur while handling a URL.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError
        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if _is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError):
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                # bare raise keeps the original traceback intact
                raise
        else:
            # only reached when direct parsing succeeded; the fromstring
            # fallback above is returned without this check
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r
Пример #7
0
    def _build_doc(self):
        """
        Parse ``self.io`` with lxml and return the parsed document.

        URLs are opened with ``urlopen`` and parsed from the response; any
        other input is first handed to ``lxml.html.parse`` and, if that
        raises ``UnicodeDecodeError`` or ``IOError``, parsed as a literal
        HTML string with ``lxml.html.fromstring``.

        Raises
        ------
        XMLSyntaxError
            * If direct parsing succeeds but the result has no
              ``text_content`` attribute.

        UnicodeDecodeError, IOError
            * Re-raised unchanged when they occur while handling a URL.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError
        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if _is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError):
            # if the input is a blob of html goop
            if not _is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                # bare raise keeps the original traceback intact
                raise
        else:
            # only reached when direct parsing succeeded; the fromstring
            # fallback above is returned without this check
            if not hasattr(r, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return r
Пример #8
0
    def _write_cell(self, s, kind='td', indent=0, tags=None, i=None, j=None):
        """Write one HTML cell for ``s``, optionally next to a miniplot cell.

        Parameters
        ----------
        s : object
            Cell content, written as-is (wrapped in an ``<a>`` element when
            ``self.render_links`` is set and ``_is_url(s)`` is true).
        kind : str, default 'td'
            Tag name used for the opening and closing tags.
        indent : int, default 0
            Passed through to ``self.write``.
        tags : str, optional
            Attribute text placed inside the opening tag.
        i, j : int, optional
            Row and column position in ``self.tr_frame``. A miniplot cell is
            written only when both are given and ``self.plot_type`` is not
            ``None``; it goes before the value cell when ``self.ppos`` is
            ``"left"`` and after it when ``self.ppos`` is ``"right"``.
        """
        # NOTE(review): ``self.escape`` is not consulted here and ``s`` is
        # written without escaping; confirm callers pass pre-escaped text.
        with_plot = (self.plot_type is not None and i is not None
                     and j is not None)

        if with_plot:
            if self.is_multi_c:
                column_code = [c[j] for c in self.tr_frame.columns.codes]
            else:
                column_code = [j]
            if self.is_multi_r:
                row_code = [c[i] for c in self.tr_frame.index.codes]
            else:
                row_code = [i]
            column_code = np.asarray(column_code)
            row_code = np.asarray(row_code)
            # pick the two codes that drive the colormap and shift them by 1
            color = np.hstack((column_code, row_code))[self.color_indices]
            color += np.ones((2, ), dtype=np.uint8)
            color = self.colormap(color[0], color[1])
            miniplot_str = self.miniplot(self.tr_frame.iloc[i, j],
                                         self.rheight, self.pwidth, color,
                                         self.plot_type)

        if tags is not None:
            start_tag = '<{kind} {tags}>'.format(kind=kind, tags=tags)
        else:
            start_tag = '<{kind}>'.format(kind=kind)
            tags = ""

        start_miniplot_tag = '<{kind} {tags}>'.format(
            kind=kind, tags=tags + ' style="padding-right: 1em;"')

        rs = s

        if self.render_links and _is_url(rs):
            rs_unescaped = pprint_thing(s, escape_chars={}).strip()
            rs = '<a href="{url}" target="_blank">{rs}</a>'.format(
                url=rs_unescaped, rs=rs)

        if with_plot and self.ppos == "left":
            self.write(
                '{start}{rs}</{kind}>'.format(start=start_miniplot_tag,
                                              rs=miniplot_str,
                                              kind=kind), indent)
        self.write(
            '{start}{rs}</{kind}>'.format(start=start_tag, rs=rs, kind=kind),
            indent)
        if with_plot and self.ppos == "right":
            self.write(
                '{start}{rs}</{kind}>'.format(start=start_miniplot_tag,
                                              rs=miniplot_str,
                                              kind=kind), indent)
Пример #9
0
    def _build_doc(self):
        """
        Parse ``self.io`` with a non-recovering lxml parser and return it.

        The input is first handed to ``lxml.html.parse``. If that raises
        ``UnicodeDecodeError`` or ``IOError`` and the input is not a URL, it
        is parsed as a literal HTML string with ``lxml.html.fromstring``.

        Raises
        ------
        ValueError
            * If parsing a URL fails and its scheme is not one of
              ``_valid_schemes``.

        XMLSyntaxError
            * If direct parsing succeeds but the result has no
              ``text_content`` attribute.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.html import parse, fromstring, HTMLParser
        from lxml.etree import XMLSyntaxError

        strict_parser = HTMLParser(recover=False, encoding=self.encoding)

        try:
            # simplest route first: let lxml open and parse the input itself
            doc = parse(self.io, parser=strict_parser)

            try:
                doc = doc.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, IOError):
            if _is_url(self.io):
                scheme = parse_url(self.io).scheme
                if scheme in _valid_schemes:
                    # valid scheme, so something else went wrong:
                    # maybe a faulty connection
                    raise
                # lxml can't parse a URL with this scheme
                msg = (('{invalid!r} is not a valid url scheme, valid '
                        'schemes are {valid}')
                       .format(invalid=scheme, valid=_valid_schemes))
                raise ValueError(msg)

            # not a url: the input is a blob of html goop
            doc = fromstring(self.io, parser=strict_parser)

            try:
                doc = doc.getroot()
            except AttributeError:
                pass
        else:
            if not hasattr(doc, 'text_content'):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)
        return doc
Пример #10
0
    def __init__(self, filepath_or_buffer):
        """Reader using xlrd engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed. A URL is downloaded and parsed from memory;
            an ``xlrd.Book`` is used as-is; an object with a ``read``
            attribute is read fully; a string is opened by xlrd as a path.

        Raises
        ------
        ImportError
            If xlrd cannot be imported or its version compares lower than
            1.0.0.
        ValueError
            If the input matches none of the supported kinds.
        """
        err_msg = "Install xlrd >= 1.0.0 for Excel support"

        try:
            import xlrd
        except ImportError:
            raise ImportError(err_msg)
        else:
            if xlrd.__VERSION__ < LooseVersion("1.0.0"):
                raise ImportError(err_msg +
                                  ". Current version " + xlrd.__VERSION__)

        from pandas.io.excel._base import ExcelFile
        # If filepath_or_buffer is a url, want to keep the data as bytes so
        # can't pass to get_filepath_or_buffer(). Read it inside a ``with``
        # block so the response is closed instead of being leaked.
        url_data = None
        if _is_url(filepath_or_buffer):
            with urlopen(filepath_or_buffer) as response:
                url_data = response.read()
        elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
                filepath_or_buffer)

        if url_data is not None:
            self.book = xlrd.open_workbook(file_contents=url_data)
        elif isinstance(filepath_or_buffer, xlrd.Book):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            if hasattr(filepath_or_buffer, 'seek'):
                try:
                    # GH 19779
                    filepath_or_buffer.seek(0)
                except UnsupportedOperation:
                    # some buffers (e.g. HTTPResponse) expose seek() but do
                    # not support it
                    # GH 20434
                    pass

            data = filepath_or_buffer.read()
            self.book = xlrd.open_workbook(file_contents=data)
        elif isinstance(filepath_or_buffer, compat.string_types):
            self.book = xlrd.open_workbook(filepath_or_buffer)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')
Пример #11
0
    def __init__(self, filepath_or_buffer):
        """Reader using xlrd engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed. A URL is downloaded and parsed from memory;
            an ``xlrd.Book`` is used as-is; an object with a ``read``
            attribute is read fully; a string is opened by xlrd as a path.

        Raises
        ------
        ImportError
            If xlrd cannot be imported or its version compares lower than
            1.0.0.
        ValueError
            If the input matches none of the supported kinds.
        """
        err_msg = "Install xlrd >= 1.0.0 for Excel support"

        try:
            import xlrd
        except ImportError:
            raise ImportError(err_msg)
        else:
            if xlrd.__VERSION__ < LooseVersion("1.0.0"):
                raise ImportError(err_msg +
                                  ". Current version " + xlrd.__VERSION__)

        from pandas.io.excel._base import ExcelFile
        # If filepath_or_buffer is a url, want to keep the data as bytes so
        # can't pass to get_filepath_or_buffer(). The response is opened
        # here, so it is also closed here rather than being leaked.
        url_data = None
        if _is_url(filepath_or_buffer):
            response = _urlopen(filepath_or_buffer)
            try:
                url_data = response.read()
            finally:
                response.close()
        elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
                filepath_or_buffer)

        if url_data is not None:
            self.book = xlrd.open_workbook(file_contents=url_data)
        elif isinstance(filepath_or_buffer, xlrd.Book):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            if hasattr(filepath_or_buffer, 'seek'):
                try:
                    # GH 19779
                    filepath_or_buffer.seek(0)
                except UnsupportedOperation:
                    # some buffers (e.g. HTTPResponse) expose seek() but do
                    # not support it
                    # GH 20434
                    pass

            data = filepath_or_buffer.read()
            self.book = xlrd.open_workbook(file_contents=data)
        elif isinstance(filepath_or_buffer, compat.string_types):
            self.book = xlrd.open_workbook(filepath_or_buffer)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')
Пример #12
0
def _read(obj):
    """Read raw content from a url, a file-like object, a path or a string.

    Parameters
    ----------
    obj : str, unicode, or file-like
        A URL (as decided by ``_is_url``) is opened and read; an object with
        a ``read`` attribute is read; a string naming an existing file is
        read in binary mode; any other string is returned unchanged.

    Returns
    -------
    raw_text : str
        The result of ``read()``, or ``obj`` itself for a plain string.

    Raises
    ------
    TypeError
        If ``obj`` is none of the supported kinds.
    """
    if _is_url(obj):
        with urlopen(obj) as remote:
            contents = remote.read()
        return contents

    if hasattr(obj, 'read'):
        return obj.read()

    if isinstance(obj, char_types):
        # A TypeError/ValueError while probing or reading the path means
        # the string is handed back as literal text.
        try:
            if os.path.isfile(obj):
                with open(obj, 'rb') as fh:
                    return fh.read()
        except (TypeError, ValueError):
            pass
        return obj

    raise TypeError("Cannot read object of type %r" % type(obj).__name__)
Пример #13
0
    def __init__(self, filepath_or_buffer):
        """Load a workbook from a URL, buffer, path or workbook object.

        Parameters
        ----------
        filepath_or_buffer : str, file-like, or workbook
            A URL is downloaded into a ``BytesIO``; an instance of
            ``self._workbook_class`` is used as-is; an object with a ``read``
            attribute is rewound and passed to ``self.load_workbook``; a
            string is passed to ``self.load_workbook`` directly.

        Raises
        ------
        ValueError
            If the input matches none of the supported kinds.
        """
        # If filepath_or_buffer is a url, load the data into a BytesIO.
        # The ``with`` block closes the response once it has been read.
        if _is_url(filepath_or_buffer):
            with urlopen(filepath_or_buffer) as response:
                filepath_or_buffer = BytesIO(response.read())
        elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)

        if isinstance(filepath_or_buffer, self._workbook_class):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            filepath_or_buffer.seek(0)
            self.book = self.load_workbook(filepath_or_buffer)
        elif isinstance(filepath_or_buffer, str):
            self.book = self.load_workbook(filepath_or_buffer)
        else:
            raise ValueError(
                "Must explicitly set engine if not passing in buffer or path for io."
            )
Пример #14
0
    def __init__(self, filepath_or_buffer):
        """Load a workbook from a URL, buffer, path or workbook object.

        Parameters
        ----------
        filepath_or_buffer : str, file-like, or workbook
            A URL is downloaded into a ``BytesIO``; an instance of
            ``self._workbook_class`` is used as-is; an object with a ``read``
            attribute is rewound and passed to ``self.load_workbook``; a
            string is passed to ``self.load_workbook`` directly.

        Raises
        ------
        ValueError
            If the input matches none of the supported kinds.
        """
        # If filepath_or_buffer is a url, load the data into a BytesIO.
        # The ``with`` block closes the response once it has been read.
        if _is_url(filepath_or_buffer):
            with urlopen(filepath_or_buffer) as response:
                filepath_or_buffer = BytesIO(response.read())
        elif not isinstance(filepath_or_buffer,
                            (ExcelFile, self._workbook_class)):
            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
                filepath_or_buffer)

        if isinstance(filepath_or_buffer, self._workbook_class):
            self.book = filepath_or_buffer
        elif hasattr(filepath_or_buffer, "read"):
            # N.B. xlrd.Book has a read attribute too
            filepath_or_buffer.seek(0)
            self.book = self.load_workbook(filepath_or_buffer)
        elif isinstance(filepath_or_buffer, str):
            self.book = self.load_workbook(filepath_or_buffer)
        else:
            raise ValueError('Must explicitly set engine if not passing in'
                             ' buffer or path for io.')
Пример #15
0
def filepath_to_buffer(filepath,
                       encoding=None,
                       compression=None,
                       timeout=None,
                       start_byte=0):
    """Turn ``filepath`` into a readable stream plus its metadata.

    Parameters
    ----------
    filepath : str or object
        A non-string object (per ``is_str``) is returned as-is. A string is
        tried, in order, as an HTTP-style URL (``_is_url``), an S3 URL
        (``_is_s3_url``), a buffer URL (``_is_buffer_url``) and finally a
        local path, which gets ``~`` expansion.
    encoding : optional
        Returned unchanged, except that the S3 helper may replace it.
    compression : optional
        Returned unchanged, except that it becomes ``'gzip'`` when a URL
        response carries ``Content-Encoding: gzip``, and the S3 helper may
        replace it.
    timeout : optional
        Passed to ``requests.get`` for URLs only.
    start_byte : int, default 0
        Offset to start reading from. Sent as a ``Range`` header for URLs
        and applied with ``seek`` for local files; not applied on the other
        paths.

    Returns
    -------
    tuple
        ``(stream, encoding, compression, size)``.

    Raises
    ------
    ValueError
        If ``filepath`` is a local path that does not exist.
    """
    if not is_str(filepath):
        # NOTE: start_byte is not applied to an object passed in directly
        return filepath, encoding, compression, filepath.size()

    if _is_url(filepath):
        range_header = ({"Range": "bytes={}-".format(start_byte)}
                        if start_byte else None)
        response = requests.get(filepath,
                                stream=True,
                                headers=range_header,
                                timeout=timeout)
        if response.headers.get('Content-Encoding', None) == 'gzip':
            compression = 'gzip'
        content_length = response.headers.get('Content-Length', 0)
        # the raw stream is returned directly, without any wrapper object
        return response.raw, encoding, compression, int(content_length)

    if _is_s3_url(filepath):
        from pandas.io import s3
        s3_reader, encoding, compression = s3_get_filepath_or_buffer(
            filepath, encoding=encoding, compression=compression)
        return s3_reader, encoding, compression, s3_reader.size

    if _is_buffer_url(filepath):
        buf = _url_to_buffer(filepath)
        return buf, encoding, compression, buf.size()

    local_path = os.path.expanduser(filepath)
    if not os.path.exists(local_path):
        raise ValueError("wrong filepath: {}".format(local_path))
    n_bytes = os.stat(local_path).st_size
    handle = io.FileIO(local_path)
    if start_byte:
        handle.seek(start_byte)
    return handle, encoding, compression, n_bytes
Пример #16
0
def _read(io):
    """Try to read from a url, file or string.

    Parameters
    ----------
    io : str, unicode, or file-like
        Checked in this order: a URL (as decided by ``_is_url``), an object
        with a ``read`` attribute, a path to an existing file, then a plain
        string, which is returned unchanged.

    Returns
    -------
    raw_text : str
        The result of ``read()`` for URLs and files, or ``io`` itself when
        it is a plain string.

    Raises
    ------
    TypeError
        If ``io`` is none of the supported kinds.
    """
    if _is_url(io):
        with urlopen(io) as url:
            raw_text = url.read()
    elif hasattr(io, 'read'):
        raw_text = io.read()
    else:
        # os.path.isfile can itself raise for objects that are not valid
        # path arguments (TypeError) or for strings containing null bytes
        # (ValueError); treat both as "not a file" so the checks below run.
        try:
            is_file = os.path.isfile(io)
        except (TypeError, ValueError):
            is_file = False

        if is_file:
            with open(io) as f:
                raw_text = f.read()
        elif isinstance(io, string_types):
            raw_text = io
        else:
            raise TypeError("Cannot read object of type %r"
                            % type(io).__name__)
    return raw_text