Example #1
def install_bids(sourcedata_dir: PathLike, bids_filename: PathLike) -> None:
    from pathlib import Path

    from fsspec.implementations.local import LocalFileSystem

    fs = LocalFileSystem(auto_mkdir=True)

    # The PathLike arguments may be plain strings, but the .stem, .parent
    # and .rglob calls below need real Path objects.
    sourcedata_dir = Path(sourcedata_dir)
    bids_filename = Path(bids_filename)

    source_file = fs.open(fs.ls(sourcedata_dir)[0], mode="rb")
    target_file = fs.open(bids_filename, mode="wb")

    with source_file as sf, target_file as tf:
        tf.write(sf.read())

    # Apply .stem twice to strip a double extension such as ".nii.gz".
    source_basename = Path(Path(fs.ls(sourcedata_dir)[0]).stem).stem
    target_basename = Path(bids_filename.stem).stem

    # Copy any sidecar files (e.g. .tsv or .json) that share the NIfTI's
    # basename. There may be none, in which case the loop does nothing.
    sidecar_dir = sourcedata_dir.parent / "BIDS"
    for source_sidecar in sidecar_dir.rglob(f"{source_basename}*"):
        target_sidecar = (bids_filename.parent
                          / f"{target_basename}{source_sidecar.suffix}")
        source_file = fs.open(source_sidecar, mode="rb")
        target_file = fs.open(target_sidecar, mode="wb")
        with source_file as sf, target_file as tf:
            tf.write(sf.read())
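
The double .stem above strips a two-part extension; a quick illustration with a hypothetical filename:

from pathlib import Path

name = Path("sub-01_ses-M00_T1w.nii.gz")  # hypothetical source file
print(name.stem)             # "sub-01_ses-M00_T1w.nii"
print(Path(name.stem).stem)  # "sub-01_ses-M00_T1w"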
Example #2
def write_bids(
    to: PathLike,
    participants: DataFrame,
    sessions: DataFrame,
    scans: DataFrame,
) -> List[PathLike]:
    from pathlib import Path

    from fsspec.implementations.local import LocalFileSystem

    to = Path(to)
    fs = LocalFileSystem(auto_mkdir=True)

    # Ensure BIDS hierarchy is written first.
    with fs.transaction:
        with fs.open(to / "participants.tsv", "w") as participant_file:
            write_to_tsv(participants, participant_file)

        for participant_id, sessions_group in sessions.groupby(
                "participant_id"):
            sessions_group = sessions_group.droplevel("participant_id")
            sessions_filepath = to / participant_id / f"{participant_id}_sessions.tsv"
            with fs.open(sessions_filepath, "w") as sessions_file:
                write_to_tsv(sessions_group, sessions_file)

    # Perform import of imaging data next.
    for filename, metadata in scans.iterrows():
        if metadata.format == "DCM":
            convert_dicom(sourcedata_dir=metadata.source_dir,
                          bids_filename=to / filename)
        else:
            install_nifti(sourcedata_dir=metadata.source_dir,
                          bids_filename=to / filename)

    return scans.index.to_list()
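
write_bids relies on a write_to_tsv helper that is not shown here; a minimal sketch, assuming it simply serializes a DataFrame as tab-separated text:

from pandas import DataFrame


def write_to_tsv(df: DataFrame, buffer) -> None:
    # Assumed behaviour: keep the index, since the participant and session
    # identifiers live in the index of the frames passed above.
    df.to_csv(buffer, sep="\t", index=True)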
Example #3
def install_nifti(sourcedata_dir: PathLike, bids_filename: PathLike) -> None:
    from fsspec.implementations.local import LocalFileSystem

    fs = LocalFileSystem(auto_mkdir=True)
    source_file = fs.open(fs.ls(sourcedata_dir)[0], mode="rb")
    target_file = fs.open(bids_filename, mode="wb", compression="gzip")

    with source_file as sf, target_file as tf:
        tf.write(sf.read())
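
Because the target is opened with compression="gzip", fsspec compresses the bytes as they are written, so the result must be read back with a matching option (path hypothetical):

from fsspec.implementations.local import LocalFileSystem

fs = LocalFileSystem()
with fs.open("sub-01_T1w.nii.gz", mode="rb", compression="gzip") as f:
    raw_nifti = f.read()  # decompressed bytes of the original image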
Example #4
def test_transaction(tmpdir):
    file = str(tmpdir / "test.txt")
    fs = LocalFileSystem()

    with fs.transaction:
        content = "hello world"
        with fs.open(file, "w") as fp:
            fp.write(content)

    with fs.open(file, "r") as fp:
        read_content = fp.read()

    assert content == read_content
Example #5
    def test_csv_equality(self):
        fs = LocalFileSystem()
        with fs.open(from_root('/test/sample_data/csv_sample.csv')) as f:
            schema1 = from_file(f, {"read_headers": True})
            assert(isinstance(schema1, TextSchema))

        with fs.open(from_root('/test/sample_data/csv_sample_2.csv')) as f:
            schema2 = from_file(f, {"read_headers": True})
            assert(isinstance(schema2, TextSchema))

        schema = find_conflicts([schema1, schema2])[0]
        assert(isinstance(schema, SchemaConflict))
        # Note: 'availabile' matches the column name in the sample data.
        expect = {
            'CountDistinctSchemas': 2,
            'DistinctSchemas': [
                {'SchemaType': 'csv',
                 'Columns': [{'Name': 'type', 'Type': 'object'},
                             {'Name': 'price', 'Type': 'float64'}]},
                {'SchemaType': 'csv',
                 'Columns': [{'Name': 'type', 'Type': 'object'},
                             {'Name': 'price', 'Type': 'float64'},
                             {'Name': 'availabile', 'Type': 'bool'},
                             {'Name': 'date', 'Type': 'object'}]}],
            'NonOverlappingColumns': [{'name': 'availabile', 'type': 'bool'},
                                      {'name': 'date', 'type': 'object'}]}
        assert(schema.to_dict() == {'SchemaConflicts': expect})
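
Many of the test snippets call a project-local from_root helper that is not part of fsspec; a plausible minimal sketch, assuming it resolves paths against the repository root:

import os


def from_root(path: str) -> str:
    # Hypothetical helper: prepend the repository root (here taken from an
    # environment variable, falling back to the working directory).
    root = os.environ.get("REPO_ROOT", os.getcwd())
    return root + path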
Example #6
 def test_jsonl(self):
     fs = LocalFileSystem()
     with fs.open(from_root('/test/sample_data/json_lines.jsonl')) as f:
         schema = from_file(f)
         assert(isinstance(schema, JsonSchema))
         expect = {
             '$schema': 'http://json-schema.org/schema#',
             'properties': {'field': {'type': 'string'},
                            'field2': {'type': 'string'},
                            'field3': {'type': 'string'},
                            'field4': {'type': 'string'},
                            'field5': {'type': 'string'},
                            'field6': {'type': 'string'},
                            'field7': {'type': 'string'}},
             'type': 'object'}
         assert(schema.schema == expect)
Example #7
def test_file_ops(tmpdir):
    tmpdir = str(tmpdir)
    fs = LocalFileSystem()
    with pytest.raises(FileNotFoundError):
        fs.info(tmpdir + "/nofile")
    fs.touch(tmpdir + "/afile")
    i1 = fs.ukey(tmpdir + "/afile")

    assert tmpdir + "/afile" in fs.ls(tmpdir)

    with fs.open(tmpdir + "/afile", "wb") as f:
        f.write(b"data")
    i2 = fs.ukey(tmpdir + "/afile")
    assert i1 != i2  # because file changed

    fs.copy(tmpdir + "/afile", tmpdir + "/afile2")
    assert tmpdir + "/afile2" in fs.ls(tmpdir)

    fs.move(tmpdir + "/afile", tmpdir + "/afile3")
    assert not fs.exists(tmpdir + "/afile")

    fs.rm(tmpdir + "/afile3", recursive=True)
    assert not fs.exists(tmpdir + "/afile3")

    fs.rm(tmpdir, recursive=True)
    assert not fs.exists(tmpdir)
Example #8
def test_pickle(tmpdir):
    fs = LocalFileSystem()
    tmpdir = str(tmpdir)
    fn0 = os.path.join(tmpdir, "target")

    with open(fn0, "wb") as f:
        f.write(b"data")

    f = fs.open(fn0, "rb")
    f.seek(1)
    f2 = pickle.loads(pickle.dumps(f))
    assert f2.read() == f.read()

    f = fs.open(fn0, "wb")
    with pytest.raises(ValueError):
        pickle.dumps(f)
Example #9
 def test_invalid_json(self):
     fs = LocalFileSystem()
     with fs.open(from_root('/test/sample_data/bad_json.json')) as f:
         schema = from_file(f, {})
         assert(isinstance(schema, InvalidSchema))
         message = f"File type not supported for file {from_root('/test/sample_data/bad_json.json')}.  Type: ASCII text, with no line terminators"
         assert(message in schema.reason)
Example #10
 def test_csv_no_header(self):
     fs = LocalFileSystem()
     with fs.open(from_root('/test/sample_data/csv_no_header.csv')) as f:
         schema = from_file(f)
         assert(isinstance(schema, TextSchema))
         assert(list(map(lambda c: c.name, schema.columns)) == [0, 1])
         assert(list(map(lambda c: c.type, schema.columns)) == ["object", "float64"])
Example #11
 def test_file_not_supported(self):
     logger.set_level("error")
     fs = LocalFileSystem()
     with fs.open(from_root('/test/sample_data/unsupported_file_type.usf')) as f:
         schema = from_file(f)
         assert(isinstance(schema, InvalidSchema))
         assert(schema.reason[0:32] == "File type not supported for file")
Example #12
 def test_complex_json(self):
     fs = LocalFileSystem()
     with fs.open(from_root('/test/sample_data/complex_json.json')) as f:
         schema = from_file(f)
         assert(isinstance(schema, JsonSchema))
         expect = {
             '$schema': 'http://json-schema.org/schema#',
             'type': 'object',
             'properties': {
                 'data': {
                     'type': 'array',
                     'items': {
                         'type': 'object',
                         'properties': {
                             'field1': {'type': 'string'},
                             'field2': {'type': ['integer', 'string']},
                             'field3': {'type': 'string'},
                             'field4': {'type': 'string'},
                             'field5': {
                                 'type': 'object',
                                 'properties': {'some_other_stuff': {'type': 'string'}},
                                 'required': ['some_other_stuff']}}}}},
             'required': ['data']}
         assert(schema.schema == expect)
Example #13
 def test_valid_csv(self):
     fs = LocalFileSystem()
     with fs.open(from_root('/test/sample_data/csv_sample.csv')) as f:
         schema = from_file(f, {"read_headers": True})
         assert(isinstance(schema, TextSchema))
         assert(list(map(lambda c: c.name, schema.columns)) == ["type", "price"])
         assert(list(map(lambda c: c.type, schema.columns)) == ["object", "float64"])
Example #14
def test_commit_discard(tmpdir):
    tmpdir = str(tmpdir)
    fs = LocalFileSystem()
    with fs.transaction:
        with fs.open(tmpdir + '/afile', 'wb') as f:
            assert not fs.exists(tmpdir + '/afile')
            f.write(b'data')
        assert not fs.exists(tmpdir + '/afile')
    assert fs.cat(tmpdir + '/afile') == b'data'

    try:
        with fs.transaction:
            with fs.open(tmpdir + '/bfile', 'wb') as f:
                f.write(b'data')
            raise KeyboardInterrupt
    except KeyboardInterrupt:
        assert not fs.exists(tmpdir + '/bfile')
Example #15
def test_infer_compression(tmpdir, opener, ext):
    filename = str(tmpdir / f"test{ext}")
    content = b"hello world"
    with opener(filename, "wb") as fp:
        fp.write(content)

    fs = LocalFileSystem()
    with fs.open(f"file://{filename}", "rb", compression="infer") as fp:
        read_content = fp.read()

    assert content == read_content
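
The opener and ext arguments above come from a pytest parametrization that is not shown; a sketch of what it likely looks like (the exact pairs are an assumption):

import bz2
import gzip

import pytest


@pytest.mark.parametrize(
    "opener, ext",
    [(bz2.open, ".bz2"), (gzip.open, ".gz"), (open, "")],
)
def test_infer_compression(tmpdir, opener, ext):
    ...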
Example #16
 def test_valid_json(self):
     fs = LocalFileSystem()
     with fs.open(from_root('/test/sample_data/json_simple.json')) as f:
         schema = from_file(f)
         assert(isinstance(schema, JsonSchema))
         expect = {'$schema': 'http://json-schema.org/schema#',
                   'properties': {'field': {'type': 'string'}, 'field2': {'type': 'string'}, 'field3': {'type': 'string'}},
                   'required': ['field', 'field2', 'field3'],
                   'type': 'object'}
         assert(schema.schema == expect)
         assert(schema.to_dict() == {'Columns': [], 'SchemaType': 'json'})
         assert(schema.to_pd_dict() == {})
Example #17
 def open(self, key: str):
     if (key == "s3://crawler-poc/catalog_poc_data/test1.csv"
             or key == "s3://crawler-poc/catalog_poc_data/test2.csv"):
         fs = LocalFileSystem()
         return fs.open(
             from_root('/test/sample_data/sample.snappy.parquet'))
     elif key in [
             "s3://tests/in/csv/sample.csv", "s3://tests/in/csv/sample2.csv"
     ]:
         fs = LocalFileSystem()
         return fs.open(from_root('/test/sample_data/csv_sample.csv'))
     elif (key == "s3://test-data/test-path/test1.usf"):
         fs = LocalFileSystem()
         return fs.open(
             from_root('/test/sample_data/unsupported_file_type.usf'))
     elif (key == "s3://test-data/test-path/test2.usf"):
         fs = LocalFileSystem()
         return fs.open(
             from_root('/test/sample_data/unsupported_file_type.usf'))
     elif (key == "s3://test-data/test-path/sample.snappy.parquet"):
         fs = LocalFileSystem()
         return fs.open(
             from_root('/test/sample_data/sample.snappy.parquet'))
     elif (key == "s3://test-data/test-path/sample.snappy.parquet"):
         fs = LocalFileSystem()
         return fs.open(
             from_root('/test/sample_data/sample.snappy.parquet'))
     else:
         raise Exception(f"Unmocked S3 API endpoint: {key}")
Example #18
def test_seekable(tmpdir):
    fs = LocalFileSystem()
    tmpdir = str(tmpdir)
    fn0 = os.path.join(tmpdir, "target")

    with open(fn0, "wb") as f:
        f.write(b"data")

    f = fs.open(fn0, "rt")
    assert f.seekable(), "file is not seekable"
    f.seek(1)
    assert f.read(1) == "a"
    assert f.tell() == 2
Example #19
def test_abs_paths(tmpdir):
    tmpdir = str(tmpdir)
    here = os.getcwd()
    os.chdir(tmpdir)
    with open("tmp", "w") as f:
        f.write("hi")
    out = LocalFileSystem().glob("*")
    assert len(out) == 1
    assert "/" in out[0]
    assert "tmp" in out[0]

    fs = LocalFileSystem()
    os.chdir(here)
    with fs.open(out[0], "r") as f:
        res = f.read()
    assert res == "hi"
Example #20
    def test_check_schemas(self):
        fs = LocalFileSystem()
        with fs.open(from_root('/test/sample_data/complex_json.json')) as f:
            schema1 = from_file(f)
            assert(isinstance(schema1, JsonSchema))

        with fs.open(from_root('/test/sample_data/complex_json_2.json')) as f:
            schema2 = from_file(f)
            assert(isinstance(schema2, JsonSchema))

        with fs.open(from_root('/test/sample_data/json_simple.json')) as f:
            schema3 = from_file(f)
            assert(isinstance(schema3, JsonSchema))

        with fs.open(from_root('/test/sample_data/unsupported_file_type.usf')) as f:
            schema4 = from_file(f)
            assert(isinstance(schema4, InvalidSchema))

        with fs.open(from_root('/test/sample_data/csv_sample.csv')) as f:
            schema5 = from_file(f, {"read_headers": True})
            assert(isinstance(schema5, TextSchema))

        with fs.open(from_root('/test/sample_data/json_lines.jsonl')) as f:
            schema6 = from_file(f)
            assert(isinstance(schema6, JsonSchema))

        with fs.open(from_root('/test/sample_data/json_lines2.jsonl')) as f:
            schema7 = from_file(f)
            assert(isinstance(schema7, JsonSchema))

        schema = find_conflicts([schema1, schema2])[0]
        expect = {
            '$schema': 'http://json-schema.org/schema#',
            'properties': {
                'data': {
                    'items': {
                        'properties': {
                            'field1': {'type': 'string'},
                            'field2': {'type': ['integer', 'string']},
                            'field3': {'type': 'string'},
                            'field4': {'type': 'string'},
                            'field5': {
                                'properties': {'some_other_stuff': {'type': 'string'}},
                                'required': ['some_other_stuff'],
                                'type': 'object'},
                            'field6': {'type': 'string'}},
                        'type': 'object'},
                    'type': 'array'}},
            'required': ['data'],
            'type': 'object'}
        assert(isinstance(schema, JsonSchema))
        assert(schema.schema == expect)

        schema = find_conflicts([schema1, schema2, schema3])[0]
        assert(isinstance(schema, JsonSchema))
        expect = {
            '$schema': 'http://json-schema.org/schema#',
            'properties': {
                'data': {
                    'items': {
                        'properties': {
                            'field1': {'type': 'string'},
                            'field2': {'type': ['integer', 'string']},
                            'field3': {'type': 'string'},
                            'field4': {'type': 'string'},
                            'field5': {
                                'properties': {'some_other_stuff': {'type': 'string'}},
                                'required': ['some_other_stuff'],
                                'type': 'object'},
                            'field6': {'type': 'string'}},
                        'type': 'object'},
                    'type': 'array'},
                'field': {'type': 'string'},
                'field2': {'type': 'string'},
                'field3': {'type': 'string'}},
            'required': [],
            'type': 'object'}
        assert(schema.schema == expect)
        schema = find_conflicts([schema1, schema2, schema3, schema5])[0]
        assert(isinstance(schema, InvalidSchema))
        assert(schema.reason == "Mixed type schemas not supported at this time.  Ensure that files are of one type: ['csv', 'json']")

        schema = find_conflicts([schema6, schema7])[0]
        assert(isinstance(schema, JsonSchema))
        expect = {
            '$schema': 'http://json-schema.org/schema#',
            'properties': {'field': {'type': 'string'},
                           'field2': {'type': 'string'},
                           'field3': {'type': 'string'},
                           'field4': {'type': 'string'},
                           'field5': {'type': 'string'},
                           'field6': {'type': 'string'},
                           'field7': {'type': 'string'},
                           'other': {'type': 'string'},
                           'other2': {'type': 'string'},
                           'other3': {'type': 'string'}},
            'required': ['other'],
            'type': 'object'}
        assert(schema.schema == expect)
Example #21
def test_file_ops(tmpdir):
    tmpdir = make_path_posix(str(tmpdir))
    fs = LocalFileSystem(auto_mkdir=True)
    with pytest.raises(FileNotFoundError):
        fs.info(tmpdir + "/nofile")
    fs.touch(tmpdir + "/afile")
    i1 = fs.ukey(tmpdir + "/afile")

    assert tmpdir + "/afile" in fs.ls(tmpdir)

    with fs.open(tmpdir + "/afile", "wb") as f:
        f.write(b"data")
    i2 = fs.ukey(tmpdir + "/afile")
    assert i1 != i2  # because file changed

    fs.copy(tmpdir + "/afile", tmpdir + "/afile2")
    assert tmpdir + "/afile2" in fs.ls(tmpdir)

    fs.move(tmpdir + "/afile", tmpdir + "/afile3")
    assert not fs.exists(tmpdir + "/afile")

    fs.cp(tmpdir + "/afile3", tmpdir + "/deeply/nested/file")
    assert fs.exists(tmpdir + "/deeply/nested/file")

    fs.rm(tmpdir + "/afile3", recursive=True)
    assert not fs.exists(tmpdir + "/afile3")

    files = [tmpdir + "/afile4", tmpdir + "/afile5"]
    [fs.touch(f) for f in files]

    with pytest.raises(TypeError):
        fs.rm_file(files)
    fs.rm(files)
    assert all(not fs.exists(f) for f in files)

    fs.touch(tmpdir + "/afile6")
    fs.rm_file(tmpdir + "/afile6")
    assert not fs.exists(tmpdir + "/afile6")

    # IsADirectoryError raised on Linux, PermissionError on Windows
    with pytest.raises((IsADirectoryError, PermissionError)):
        fs.rm_file(tmpdir)

    fs.rm(tmpdir, recursive=True)
    assert not fs.exists(tmpdir)
Example #22
class DeltaLake:
    """A Delta Lake table.

    This class provides a read interface to a Delta Lake table through
    Python filesystem objects provided by fsspec.
    """

    def __init__(
        self,
        path: str,
        filesystem: AbstractFileSystem = None,
        time_travel: datetime = None,
    ):
        """Initializes a Delta Lake

        Retrieves rows pertaining to the given keys from the Table instance
        represented by table_handle.  String keys will be UTF-8 encoded.

        Args:
            path: the path to the table on the filesystem
            filesystem: python-like filesystem (If unset, assume local)
            time_travel: set the delta lake to a specific version

        Returns:
            An instance of a delta table.
        """
        if not filesystem:
            self.filesystem = LocalFileSystem()
        else:
            self.filesystem = filesystem
        self.path = path
        self._set_timestamp(time_travel)
        self.checkpoint_info = self._get_checkpoint_info()
        self.fileset = set()

    def _set_timestamp(self, time_travel: datetime):
        if not time_travel:
            self.timestamp = None
        else:
            self.timestamp = round(time.mktime(time_travel.timetuple()))

    def _get_checkpoint_info(self) -> Optional[Dict]:
        try:
            with self.filesystem.open(
                os.path.join(self.path, "_delta_log", "_last_checkpoint")
            ) as last_checkpoint:
                return json.load(last_checkpoint)
        except (FileNotFoundError, OSError):
            return None

    def _replay_log(self, file: TextIO) -> Tuple[Set, Set]:
        actions = ndjson.loads(file.read())

        # Delta log timestamps are in milliseconds since the epoch.
        if not self.timestamp:
            cut_time = round(time.time() * 1000)
        else:
            cut_time = self.timestamp * 1000

        adds = set(
            action["add"]["path"]
            for action in actions
            if "add" in action.keys() and action["add"]["modificationTime"] < cut_time
        )
        removes = set(
            action["remove"]["path"]
            for action in actions
            if "remove" in action.keys()
            and action["remove"]["deletionTimestamp"] < cut_time
        )

        return adds, removes

    def _delta_files(self, version: int = 0) -> Iterator[TextIO]:
        # Yield successive JSON commit files from the delta log until the
        # next version number is missing.
        while True:
            try:
                loc = f"{self.path}/_delta_log/{str(version).zfill(20)}.json"
                file = self.filesystem.open(loc)
                version += 1
                yield file
            except (FileNotFoundError, OSError):
                break

    def _replay_delta_and_update_fileset(self, version: int = 0):
        for file in self._delta_files(version):
            adds, removes = self._replay_log(file)
            self.fileset |= adds
            self.fileset -= removes

    def _get_checkpoint_files(self) -> List[str]:
        if "parts" in self.checkpoint_info.keys():
            checkpoint_files = [
                f"{self.path}/_delta_log/"
                + f"{str(self.checkpoint_info['version']).zfill(20)}"
                + f".checkpoint.{str(i).zfill(10)}"
                + f".{str(self.checkpoint_info['parts']).zfill(10)}.parquet"
                for i in range(1, self.checkpoint_info["parts"] + 1)
            ]
        else:
            checkpoint_files = [
                f"{self.path}/_delta_log/"
                + f"{str(self.checkpoint_info['version']).zfill(20)}.checkpoint.parquet"
            ]
        return checkpoint_files

    def _get_checkpoint(self) -> DataFrame:
        checkpoints = []
        for checkpoint_file in self._get_checkpoint_files():
            with self.filesystem.open(checkpoint_file) as file_handler:
                checkpoints.append(pandas.read_parquet(file_handler))
        return pandas.concat(checkpoints)

    def _replay_checkpoint_and_update_fileset(self):
        checkpoint = self._get_checkpoint()
        self.fileset |= set(
            x["path"] for x in checkpoint[checkpoint["add"].notnull()]["add"]
        )

    def files(self) -> Set:
        """Fetches the parquet file set from the delta lake.

        Provides the parquet files present on the delta lake at the
        time-travel date given at instantiation (or now, if none was given).

        Returns:
            A set of the parquet file paths on the delta lake.
        """
        if self.timestamp or not self.checkpoint_info:
            # time travel needs to replay the whole log
            self._replay_delta_and_update_fileset()
        else:
            self._replay_checkpoint_and_update_fileset()
            self._replay_delta_and_update_fileset(self.checkpoint_info["version"] + 1)
        return self.fileset
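
A minimal usage sketch (path and timestamp are hypothetical):

from datetime import datetime

from fsspec.implementations.local import LocalFileSystem

lake = DeltaLake(
    "/data/my_delta_table",            # hypothetical table location
    filesystem=LocalFileSystem(),
    time_travel=datetime(2021, 6, 1),  # replay the log up to this date
)
print(lake.files())  # parquet files visible at that point in time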
Example #23
 def test_valid_csv_crlf_lf(self):
     fs = LocalFileSystem()
     with fs.open(from_root('/test/sample_data/csv_crlf_sample.csv')) as f:
         schema = from_file(f, {"read_headers": True})
         assert(isinstance(schema, TextSchema))
Example #24
 def test_snappy_parquet_schema_support(self):
     logger.set_level("info")
     fs = LocalFileSystem()
     with fs.open(from_root('/test/sample_data/sample.snappy.parquet')) as f:
         schema = from_file(f)
         assert(isinstance(schema, ParquetSchema))