Exemplo n.º 1
0
def _apply_columns_list(collist, colsdesc):
    """
    Translate a list-style `columns` override into column names and rtypes.

    `collist` must have exactly one entry per input column described in
    `colsdesc`. Each entry selects what happens to the column at the same
    position:

      - `None` / `False`: the column keeps the "drop" rtype and gets no name;
      - `True` / `...`: keep the original name, rtype "auto";
      - a string: rename the column to that string, rtype "auto";
      - an `stype`, `ltype` or python type: keep the original name and use
        the rtype that `_rtypes_map` associates with that type;
      - a tuple `(newname, newtype)`: rename and force the type.

    Returns a tuple `(colnames, coltypes)`, where `colnames` lists the names
    of the columns that are kept, and `coltypes` has one rtype value per
    input column.

    Raises ValueError if the lengths of `collist` and `colsdesc` differ, or
    if a tuple entry names a type that is absent from `_rtypes_map`; raises
    TypeError for an entry of any other kind.
    """
    n_input = len(colsdesc)
    n_given = len(collist)
    if n_input != n_given:
        raise ValueError("Input contains %s, whereas `columns` "
                         "parameter specifies only %s"
                         % (plural(n_input, "column"),
                            plural(n_given, "column")))

    colnames = []
    coltypes = [rtype.rdrop.value] * n_input
    for i, entry in enumerate(collist):
        if entry is None or entry is False:
            # Dropped column: its slot in `coltypes` is already "drop"
            continue
        if entry is True or entry is Ellipsis:
            name = colsdesc[i].name
            chosen = rtype.rauto
        elif isinstance(entry, str):
            name = entry
            chosen = rtype.rauto
        elif isinstance(entry, (stype, ltype, type)):
            name = colsdesc[i].name
            chosen = _rtypes_map[entry]
        elif isinstance(entry, tuple):
            name, newtype = entry
            if newtype not in _rtypes_map:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, name))
            chosen = _rtypes_map[newtype]
        else:
            raise TypeError("Entry `columns[%d]` has invalid type %r"
                            % (i, entry.__class__.__name__))
        colnames.append(name)
        coltypes[i] = chosen.value
    return (colnames, coltypes)
Exemplo n.º 2
0
def _apply_columns_dict(colsdict, colsdesc):
    """
    Translate a dict-style `columns` override into column names and rtypes.

    The dictionary `colsdict` may contain two kinds of keys:

      - a column name, mapped to the override for that column;
      - a type (`stype`, `ltype` or a python type), mapped to a column name,
        a list/tuple/set of column names, or a slice/range of column
        indices. Every column named this way is assigned that type, unless
        the column also has its own explicit entry in `colsdict`.

    The special key `...` provides the override used for columns that are
    not mentioned otherwise; when absent, such columns are kept as-is.

    An override for a column is one of:

      - `None`: the column keeps the "drop" rtype and gets no name;
      - `...`: keep the original name, rtype "auto";
      - a string: rename the column, rtype "auto";
      - an `stype`, `ltype` or python type: keep the name, force the type;
      - a tuple `(newname, newtype)`: rename and force the type.

    Returns a tuple `(colnames, coltypes)`, where `colnames` lists the names
    of the columns that are kept, and `coltypes` has one rtype value per
    input column.

    Raises TypeError if a type key maps to something other than column
    names / slice / range, or if a column override has an unsupported kind;
    raises ValueError if a tuple override names a type that is not in
    `_rtypes_map` (or one whose rtype value is falsy).
    """
    default_entry = colsdict.get(..., ...)
    colnames = []
    coltypes = [rtype.rdrop.value] * len(colsdesc)

    # First pass: expand `type -> column(s)` entries into per-column entries.
    new_entries = {}
    for key, val in colsdict.items():
        if not isinstance(key, (type, stype, ltype)):
            continue
        if isinstance(val, str):
            val = [val]
        if isinstance(val, slice):
            val = [
                colsdesc[i].name
                for i in range(*val.indices(len(colsdesc)))
            ]
        if isinstance(val, range):
            val = [colsdesc[i].name for i in val]
        if not isinstance(val, (list, tuple, set)):
            raise TypeError("Unknown entry %r for %s in `columns`" %
                            (val, key))
        for entry in val:
            if not isinstance(entry, str):
                raise TypeError(
                    "Type %s in the `columns` parameter should map"
                    " to a string or list of strings (column names)"
                    "; however it contains an entry %r" % (key, entry))
            if entry in colsdict:
                # An explicit per-column entry wins over the type-wide one
                continue
            new_entries[entry] = key
    if new_entries:
        colsdict = {**colsdict, **new_entries}

    # Second pass: resolve the override for every input column.
    for i, desc in enumerate(colsdesc):
        name = desc.name
        entry = colsdict.get(name, default_entry)
        if entry is None:
            pass  # coltype is already "drop"
        elif entry is Ellipsis:
            colnames.append(name)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, str):
            colnames.append(entry)
            coltypes[i] = rtype.rauto.value
        elif isinstance(entry, (stype, ltype, type)):
            colnames.append(name)
            coltypes[i] = _rtypes_map[entry].value
        elif isinstance(entry, tuple):
            newname, newtype = entry
            # Validate before indexing `_rtypes_map`: otherwise an unknown
            # type surfaces as a bare KeyError instead of the descriptive
            # error below (same check as in `_apply_columns_list`).
            if newtype not in _rtypes_map:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, newname))
            assert isinstance(newname, str)
            newvalue = _rtypes_map[newtype].value
            if not newvalue:
                raise ValueError("Unknown type %r used as an override "
                                 "for column %r" % (newtype, newname))
            colnames.append(newname)
            coltypes[i] = newvalue
        else:
            raise TypeError("Unknown value %r for column '%s' in "
                            "columns descriptor" % (entry, name))
    return (colnames, coltypes)
Exemplo n.º 3
0
def _apply_columns_slice(colslice, colsdesc):
    """
    Translate a slice/range `columns` selector into column names and rtypes.

    `colslice` is either a `slice` (normalized via `normalize_slice`) or
    another object accepted by `normalize_range`; in both cases it is
    reduced to a `(start, count, step)` triple relative to the number of
    columns in `colsdesc`.

    Returns a tuple `(colnames, coltypes)`: `colnames` holds the names of
    the selected columns in selection order, and `coltypes` has one rtype
    value per input column ("auto" for the selected ones, "drop" for the
    rest).

    Raises ValueError if `normalize_range` rejects the selector, or if the
    resulting step is not positive.
    """
    ncols = len(colsdesc)

    if isinstance(colslice, slice):
        start, count, step = normalize_slice(colslice, ncols)
    else:
        triple = normalize_range(colslice, ncols)
        if triple is None:
            raise ValueError("Invalid range iterator for a file with "
                             "%d columns: %r" % (ncols, colslice))
        start, count, step = triple
    if step <= 0:
        raise ValueError("Cannot use slice/range with negative step "
                         "for column filter: %r" % colslice)

    colnames = []
    coltypes = [rtype.rdrop.value] * ncols
    for idx in (start + j * step for j in range(count)):
        colnames.append(colsdesc[idx].name)
        coltypes[idx] = rtype.rauto.value
    return (colnames, coltypes)
Exemplo n.º 4
0
 def _resolve_source(self, anysource, file, text, cmd, url):
     """
     Check that exactly one input source was supplied, then resolve it.

     `anysource` is the unnamed argument; `file`, `text`, `cmd` and `url`
     are the named ones. A ValueError is raised when none of them is given,
     or when more than one is given. Otherwise every `_resolve_source_*`
     method is invoked with its corresponding argument.
     """
     candidates = (
         ("any", anysource),
         ("file", file),
         ("text", text),
         ("cmd", cmd),
         ("url", url),
     )
     given = [label for label, value in candidates if value is not None]
     if not given:
         raise ValueError(
             "No input source for `fread` was given. Please specify one of "
             "the parameters `file`, `text`, `url`, or `cmd`")
     if len(given) > 1:
         if anysource is not None:
             # Report the first named parameter that clashes with the
             # unnamed argument.
             named = [label for label in given if label != "any"]
             raise ValueError(
                 "When an unnamed argument is passed, it is invalid to also "
                 "provide the `%s` parameter." % (named[0], ))
         raise ValueError(
             "Both parameters `%s` and `%s` cannot be passed to fread "
             "simultaneously." % (given[0], given[1]))
     self._resolve_source_any(anysource)
     self._resolve_source_text(text)
     self._resolve_source_file(file)
     self._resolve_source_cmd(cmd)
     self._resolve_source_url(url)
Exemplo n.º 5
0
def _resolve_source_cmd(cmd):
    """
    Run `cmd` through the shell and use its standard output as the input.

    Returns the pair `((cmd, None, None, stdout_bytes), None)`; the inner
    tuple is laid out as (src, file, fileno, text) and the trailing `None`
    is the result slot.

    Raises TypeError if `cmd` is not a string, and ValueError (carrying the
    decoded, stripped stderr) if the command exits with a non-zero code.
    """
    import subprocess
    if not isinstance(cmd, str):
        raise TypeError("Invalid parameter `cmd` in fread: expected str, "
                        "got %r" % type(cmd))
    # The command string is handed to the shell verbatim (shell=True).
    completed = subprocess.run(cmd, shell=True,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
    status = completed.returncode
    if status:
        errtext = completed.stderr.decode("utf-8", errors="replace").strip()
        raise ValueError("Shell command returned error code %r: `%s`"
                         % (status, errtext))
    # src, file, fileno, text, result
    return (cmd, None, None, completed.stdout), None
Exemplo n.º 6
0
def ___new___(cls, value):
    """
    Look up the member of enum `cls` that corresponds to `value`.

    This re-implements Enum.__new__(), which the metaclass' `__call__`
    invokes (for example `stype(5)` or `stype("int64")`) and which pickle
    uses too.

    A `value` that is already a member of `cls` is returned unchanged.
    Booleans are never accepted, even though they compare equal to the
    integers 0 and 1. If a non-integer value is not found and the numpy
    transforms have not been initialized yet, they are initialized and the
    lookup is retried once.

    Raises ValueError if `value` does not correspond to any member.
    """
    if isinstance(value, cls):
        return value
    try:
        is_bool = isinstance(value, bool)
        if not is_bool and value in cls._value2member_map_:
            return cls._value2member_map_[value]
        if not (isinstance(value, int) or _numpy_init_attempted):
            _init_numpy_transforms()
            if value in cls._value2member_map_:
                return cls._value2member_map_[value]
    except TypeError:
        # An unhashable `value` cannot be a key of the member map, hence it
        # is not valid for our enum: fall through to the ValueError below.
        pass
    raise ValueError("`%r` does not map to any %s" % (value, cls.__name__))
Exemplo n.º 7
0
def _resolve_source_file(file, tempfiles):
    """
    Resolve the `file` argument of fread into an input source.

    `file` may be a path (str, bytes, os.PathLike or pathlib.Path) or a
    file-like object with a callable `read` attribute.

    For a file-like object the function returns
    `((src, None, fileno, text), None)`, where `src` is the object's `name`
    (or "<file>" if it has no string/bytes name), and exactly one of
    `fileno` / `text` is set: the file descriptor when a usable one is
    available, otherwise the result of `file.read()`.

    For a path, the return value is whatever `_resolve_archive` returns.
    If the path does not exist but one of its leading components is a
    regular file, the remainder of the path is passed to `_resolve_archive`
    as the sub-path within that file.

    Raises TypeError if `file` is of an unsupported kind, and ValueError if
    the path does not exist or is not a regular file.
    """
    logger = tempfiles._logger  # not used further within this function
    if isinstance(file, _pathlike):
        # `_pathlike` contains (str, bytes), and on Python 3.6 also the
        # os.PathLike interface
        file = os.fsdecode(os.path.expanduser(file))
    elif isinstance(file, pathlib.Path):
        # Only reached on Python 3.5; in Python 3.6 pathlib.Path implements
        # os.PathLike and is therefore covered by `_pathlike`.
        file = str(file.expanduser())
    elif hasattr(file, "read") and callable(file.read):
        # A builtin `file` object, or something similar. Prefer its file
        # descriptor, which gives more direct access to the underlying
        # file; otherwise fall back to reading the whole content.
        out_fileno = None
        out_text = None
        # noinspection PyBroadException
        try:
            # File descriptors are not used on Windows
            if sys.platform != "win32":
                # .fileno can be either a method or a property, and its
                # implementation may raise (meaning that no file descriptor
                # is available)
                fd = file.fileno
                if callable(fd):
                    fd = fd()
                if isinstance(fd, int) and fd > 0:
                    out_fileno = fd
        except Exception:
            # file.fileno is not defined, or raised an error: deliberately
            # ignored, the content is read below instead.
            pass
        if out_fileno is None:
            out_text = file.read()
        name = getattr(file, "name", None)
        if isinstance(name, bytes):
            out_src = os.fsdecode(name)
        elif isinstance(name, str):
            out_src = name
        else:
            out_src = "<file>"
        return (out_src, None, out_fileno, out_text), None
    else:
        raise TypeError("Invalid parameter `file` in fread: expected a "
                        "str/bytes/PathLike, got %r" % type(file))

    # if `file` is not str, then `os.path.join(file, "..")` below will fail
    assert isinstance(file, str)
    if os.path.exists(file):
        if os.path.isfile(file):
            return _resolve_archive(file, None, tempfiles)
        raise ValueError("Path `%s` is not a file" % escape(file))

    # The path does not exist -- walk up the tree to the first component
    # that does. This gives a better error message; and if that component
    # is a file (not a folder), then the user is probably addressing a file
    # within an archive, which is not an error at all.
    fullpath = os.path.abspath(file)
    existing = fullpath
    while not os.path.exists(existing):
        existing = os.path.abspath(os.path.join(existing, ".."))
    remainder = fullpath[len(existing):]
    if os.path.isfile(existing):
        return _resolve_archive(existing, remainder, tempfiles)
    raise ValueError("File %s`%s` does not exist"
                     % (escape(existing), escape(remainder)))
Exemplo n.º 8
0
    def _resolve_archive(self, filename, subpath=None):
        """
        Prepare `filename` for reading, unpacking it first when its
        extension identifies a supported archive or container format.

        The outcome is stored on the instance, depending on the extension:

          - ".zip": one member is extracted into `self.tempdir`; the path
            of the extracted file is appended to `self._tempfiles` and
            stored in `self._file`;
          - ".gz", ".bz2", ".xz": the file is decompressed entirely into
            memory and the bytes are stored in `self._text`;
          - ".xlsx", ".xls": `self._result` receives the value returned by
            `read_xls_workbook(filename, subpath)`;
          - ".jay": `self._result` receives `core.open_jay(filename)`;
          - anything else: `self._file` is set to `filename` itself.

        Parameters
        ----------
        filename: str
            Path to the file on disk.
        subpath: str, optional
            Path of an entry within the archive. A single leading "/" is
            removed. For zip files it selects the member to extract; for
            Excel files it is forwarded to `read_xls_workbook`.

        Raises
        ------
        ValueError
            If the zip archive contains no usable files, or if `subpath`
            is not among them.
        """
        ext = os.path.splitext(filename)[1]
        if subpath and subpath[0] == "/":
            subpath = subpath[1:]

        if ext == ".zip":
            import zipfile
            # The context manager guarantees that the archive handle is
            # closed, including when one of the errors below is raised.
            with zipfile.ZipFile(filename) as zf:
                # MacOS is found guilty of adding extra files into the Zip
                # archives it creates. The files are hidden, and in the
                # directory __MACOSX/. We remove those files from the list,
                # since they are not real user files, and have an unknown
                # binary format. Directory entries are skipped as well.
                zff = [
                    name for name in zf.namelist()
                    if not (name.startswith("__MACOSX/") or
                            name.endswith("/"))
                ]
                if subpath:
                    if subpath in zff:
                        zff = [subpath]
                    else:
                        raise ValueError(
                            "File `%s` does not exist in archive `%s`" %
                            (subpath, filename))
                if len(zff) > 1:
                    warnings.warn(
                        "Zip file %s contains multiple compressed "
                        "files: %r. Only the first of them will be used." %
                        (filename, zff),
                        category=FreadWarning)
                if len(zff) == 0:
                    raise ValueError("Zip file %s is empty" % filename)
                if self._verbose:
                    self._logger.debug(
                        "Extracting %s to temporary directory %s" %
                        (filename, self.tempdir))
                self._tempfiles.append(
                    zf.extract(zff[0], path=self.tempdir))
            self._file = self._tempfiles[-1]

        elif ext in (".gz", ".bz2", ".xz"):
            # All three single-stream formats are handled the same way; only
            # the (lazily imported) module that opens the file differs.
            if ext == ".gz":
                import gzip
                opener = gzip.GzipFile
            elif ext == ".bz2":
                import bz2
                opener = bz2.open
            else:
                import lzma
                opener = lzma.open
            with opener(filename, mode="rb") as zf:
                if self._verbose:
                    self._logger.debug("Extracting %s into memory" % filename)
                self._text = zf.read()
                if self._verbose:
                    self._logger.debug("Extracted: size = %d" %
                                       len(self._text))

        elif ext == ".xlsx" or ext == ".xls":
            self._result = read_xls_workbook(filename, subpath)

        elif ext == ".jay":
            self._result = core.open_jay(filename)

        else:
            self._file = filename