Exemplo n.º 1
0
def open(path):
    """
    Open a Jay file, or a legacy NFF directory, and return the loaded frame.

    Parameters
    ----------
    path: str | bytes
        A `bytes` value is handed directly to `core.open_jay()`. A `str` is
        treated as a filesystem path (with `~` expanded): a non-directory
        path is opened with `core.open_jay()`, while a directory is read as
        an NFF store described by its `_meta.nff` file.

    Returns
    -------
    Whatever `core.open_jay()` or `core.datatable_load()` returns.

    Raises
    ------
    TTypeError
        If `path` is neither `bytes` nor `str`.
    ValueError
        If the path does not exist, or if the `_meta.nff` header is missing,
        has an unknown version tag, does not consist of exactly two comment
        lines, or lacks the `nrows = N` declaration.
    """
    if isinstance(path, bytes):
        return core.open_jay(path)
    if not isinstance(path, str):
        raise TTypeError("Parameter `path` should be a string")
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        msg = "Path %s does not exist" % path
        if not path.startswith("/"):
            # A relative path is only meaningful together with the cwd.
            msg += " (current directory = %s)" % os.getcwd()
        raise ValueError(msg)

    if not os.path.isdir(path):
        return core.open_jay(path)

    # The header is the leading run of "#"-prefixed lines in _meta.nff.
    metafile = os.path.join(path, "_meta.nff")
    info = []
    with _builtin_open(metafile, encoding="utf-8") as inp:
        for line in inp:
            if not line.startswith("#"):
                break
            info.append(line[1:].strip())

    if not (info and info[0].startswith("NFF")):
        raise ValueError("File _meta.nff has invalid format")
    nff_version = {"NFF1": 1, "NFF1+": 1.5, "NFF2": 2}.get(info[0])
    if nff_version is None:
        raise ValueError("Unknown NFF format: %s" % info[0])
    # Explicit check instead of `assert`: asserts vanish under `python -O`,
    # which would turn a malformed header into an IndexError below.
    if len(info) != 2:
        raise ValueError("File _meta.nff should have exactly 2 header lines, "
                         "found %d" % len(info))
    mm = re.match(r"nrows\s*=\s*(\d+)", info[1])
    if not mm:
        raise ValueError("nrows info not found in line %r" % info[1])
    nrows = int(mm.group(1))

    # The first four metadata columns are read as strings; for versions
    # above 1 two more column slots are passed as None.
    # NOTE(review): presumably None tells fread to skip those columns - confirm.
    coltypes = [dt.stype.str32] * 4
    if nff_version > 1:
        coltypes += [None] * 2
    f0 = dt.fread(metafile, sep=",", columns=coltypes)
    f1 = f0(select=["filename", "stype"])
    colnames = f0[:, "colname"].to_list()[0]
    df = core.datatable_load(f1.internal, nrows, path, nff_version < 2,
                             colnames)
    # Sanity check that the loaded frame matches the declared row count.
    assert df.nrows == nrows, "Wrong number of rows read: %d" % df.nrows
    return df
Exemplo n.º 2
0
def open(path):
    """
    Open a Jay file and return the result of `core.open_jay()`.

    Parameters
    ----------
    path: str | bytes
        A `bytes` value is forwarded to `core.open_jay()` unchanged. A `str`
        is treated as a filesystem path (with `~` expanded) that must refer
        to an existing non-directory entry.

    Raises
    ------
    TTypeError
        If `path` is neither `bytes` nor `str`.
    TValueError
        If the path does not exist or points to a directory.
    """
    if isinstance(path, bytes):
        return core.open_jay(path)
    if not isinstance(path, str):
        raise TTypeError("Parameter `path` should be a string")

    resolved = os.path.expanduser(path)
    if os.path.exists(resolved):
        if os.path.isdir(resolved):
            raise TValueError("Path %s is a directory" % resolved)
        return core.open_jay(resolved)

    # Nothing at that location: for relative paths also report the cwd,
    # since that is what the path was resolved against.
    parts = ["Path %s does not exist" % resolved]
    if not resolved.startswith("/"):
        parts.append(" (current directory = %s)" % os.getcwd())
    raise TValueError("".join(parts))
Exemplo n.º 3
0
    def _resolve_archive(self, filename, subpath=None):
        """
        Unpack or load `filename` according to its extension.

        The outcome is stored on the instance rather than returned:

        - `.zip`: one entry is extracted into `self.tempdir`; its path is
          appended to `self._tempfiles` and stored in `self._file`.
        - `.gz`, `.bz2`, `.xz`: the decompressed bytes go to `self._text`.
        - `.xlsx`, `.xls`: `self._result` is set from `read_xls_workbook()`.
        - `.jay`: `self._result` is set from `core.open_jay()`.
        - anything else: `self._file` is set to `filename` unchanged.

        Parameters
        ----------
        filename: str
            Path to the file to resolve.
        subpath: str, optional
            Name of the entry to use inside a zip archive; also forwarded to
            `read_xls_workbook()`. One leading "/" is dropped.

        Raises
        ------
        TValueError
            If `subpath` is not present in the zip archive, or the archive
            contains no usable files.
        """
        ext = os.path.splitext(filename)[1]
        if subpath and subpath[0] == "/":
            subpath = subpath[1:]

        if ext == ".zip":
            import zipfile
            # Context manager ensures the archive handle is closed even when
            # one of the checks below raises.
            with zipfile.ZipFile(filename) as zf:
                # MacOS is found guilty of adding extra files into the Zip
                # archives it creates. The files are hidden, and in the
                # directory __MACOSX/. We remove those files from the list,
                # since they are not real user files, and have an unknown
                # binary format. Directory entries are dropped as well.
                zff = [name for name in zf.namelist()
                       if not (name.startswith("__MACOSX/") or
                               name.endswith("/"))]
                if subpath:
                    if subpath in zff:
                        zff = [subpath]
                    else:
                        raise TValueError("File `%s` does not exist in archive "
                                          "`%s`" % (subpath, filename))
                if len(zff) > 1:
                    warnings.warn("Zip file %s contains multiple compressed "
                                  "files: %r. Only the first of them will be "
                                  "used." % (filename, zff),
                                  category=FreadWarning)
                if not zff:
                    raise TValueError("Zip file %s is empty" % filename)
                if self._verbose:
                    self._logger.debug("Extracting %s to temporary directory %s"
                                       % (filename, self.tempdir))
                self._tempfiles.append(zf.extract(zff[0], path=self.tempdir))
            self._file = self._tempfiles[-1]

        elif ext == ".gz":
            import gzip
            # Use a context manager so the underlying file is always closed.
            with gzip.GzipFile(filename, mode="rb") as zf:
                if self._verbose:
                    self._logger.debug("Extracting %s into memory" % filename)
                self._text = zf.read()
                if self._verbose:
                    self._logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".bz2":
            import bz2
            with bz2.open(filename, mode="rb") as zf:
                if self._verbose:
                    self._logger.debug("Extracting %s into memory" % filename)
                self._text = zf.read()
                if self._verbose:
                    self._logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".xz":
            import lzma
            with lzma.open(filename, mode="rb") as zf:
                if self._verbose:
                    self._logger.debug("Extracting %s into memory" % filename)
                self._text = zf.read()
                if self._verbose:
                    self._logger.debug("Extracted: size = %d" % len(self._text))

        elif ext == ".xlsx" or ext == ".xls":
            self._result = read_xls_workbook(filename, subpath)

        elif ext == ".jay":
            self._result = core.open_jay(filename)

        else:
            self._file = filename
Exemplo n.º 4
0
def _resolve_archive(filename, subpath, tempfiles):
    """
    Unpack or load `filename` according to its extension.

    Parameters
    ----------
    filename: str
        Path to the file or archive to resolve.
    subpath: str | None
        Name of a single entry to pick from a zip / tar.gz archive; also
        forwarded to `read_xls_workbook()`. One leading "/" or "\\" is
        dropped.
    tempfiles: object
        Helper that provides `_logger` (may be falsy to disable logging),
        `tempdir`, `add(path)` and `create_temp_file()`.

    Returns
    -------
    A pair `((src, file, fileno, text), result)`:

    - regular file: `((filename, filename, None, None), None)`;
    - `.gz` / `.bz2` / `.xz`: `text` holds the decompressed bytes;
    - `.xlsx` / `.xls` / `.jay`: `result` holds the reader's return value;
    - zip or tar.gz with exactly one extracted entry: `file` is the path of
      the extracted temporary file;
    - zip or tar.gz with any other number of entries:
      `((None, None, None, None), entries)` where `entries` is a list of
      `((srcname, tempfile_path, None, None), None)` items.

    Raises
    ------
    IOError
        If `subpath` is given but is not present in the archive.
    """
    logger = tempfiles._logger
    ext = os.path.splitext(filename)[1]
    if subpath and subpath[0] in ["/", "\\"]:
        subpath = subpath[1:]

    out_file = None
    out_text = None
    out_result = None
    # TODO: file extraction should be lazy
    if ext == ".zip":
        import zipfile
        # Keep the archive path separate: `filename` may be extended with
        # `subpath` below, and must not leak into per-entry names/messages.
        archive = filename
        with zipfile.ZipFile(archive) as zf:
            # MacOS is found guilty of adding extra files into the Zip archives
            # it creates. The files are hidden, and in the directory __MACOSX/.
            # We remove those files from the list, since they are not real user
            # files, and have an unknown binary format.
            zff = [name for name in zf.namelist()
                   if not (name.startswith("__MACOSX/") or name.endswith("/"))]
            if subpath:
                if subpath not in zff:
                    raise IOError("File `%s` does not exist in archive `%s`"
                                  % (subpath, archive))
                filename = os.path.join(archive, subpath)
                zff = [subpath]
            extracted_files = []
            for zf_file in zff:
                if logger:
                    logger.debug("Extracting %s/%s to temporary directory %s"
                                 % (archive, zf_file, tempfiles.tempdir))
                newfile = zf.extract(zf_file, path=tempfiles.tempdir)
                srcname = os.path.join(archive, zf_file)
                tempfiles.add(newfile)
                extracted_files.append(((srcname, newfile, None, None), None))

        if len(extracted_files) != 1:
            return (None, None, None, None), extracted_files
        out_file = extracted_files[0][0][1]

    elif filename.endswith((".tar.gz", ".tgz")):
        # Must be tested before the plain ".gz" branch, because the extension
        # of "x.tar.gz" is ".gz" as well.
        import shutil
        import tarfile
        archive = filename
        # Context manager guarantees the tar handle is released on all paths.
        with tarfile.open(archive, mode="r:gz") as zf:
            zff = [entry.name for entry in zf.getmembers() if entry.isfile()]
            if subpath:
                if subpath not in zff:
                    raise IOError("File `%s` does not exist in archive `%s`"
                                  % (subpath, archive))
                filename = os.path.join(archive, subpath)
                zff = [subpath]
            extracted_files = []
            for entryname in zff:
                if logger:
                    logger.debug("Extracting %s/%s to temporary directory %s"
                                 % (archive, entryname, tempfiles.tempdir))
                newfile = tempfiles.create_temp_file()
                with zf.extractfile(entryname) as inp, \
                        open(newfile, "wb") as out:
                    # Stream in chunks instead of loading the whole entry.
                    shutil.copyfileobj(inp, out)
                srcname = os.path.join(archive, entryname)
                extracted_files.append(((srcname, newfile, None, None), None))

        if len(extracted_files) != 1:
            return (None, None, None, None), extracted_files
        out_file = extracted_files[0][0][1]

    elif ext == ".gz":
        import gzip
        # Use a context manager so the underlying file is always closed.
        with gzip.GzipFile(filename, mode="rb") as zf:
            if logger:
                logger.debug("Extracting %s into memory" % filename)
            out_text = zf.read()
            if logger:
                logger.debug("Extracted: size = %d" % len(out_text))

    elif ext == ".bz2":
        import bz2
        with bz2.open(filename, mode="rb") as zf:
            if logger:
                logger.debug("Extracting %s into memory" % filename)
            out_text = zf.read()
            if logger:
                logger.debug("Extracted: size = %d" % len(out_text))

    elif ext == ".xz":
        import lzma
        with lzma.open(filename, mode="rb") as zf:
            if logger:
                logger.debug("Extracting %s into memory" % filename)
            out_text = zf.read()
            if logger:
                logger.debug("Extracted: size = %d" % len(out_text))

    elif ext == ".xlsx" or ext == ".xls":
        out_result = read_xls_workbook(filename, subpath)
        if subpath:
            filename = os.path.join(filename, subpath)

    elif ext == ".jay":
        out_result = core.open_jay(filename)

    else:
        out_file = filename
    # (src, file, fileno, text), result
    return (filename, out_file, None, out_text), out_result