Example #1
    def setUp(self, MockPoolExecutor):
        """
        Set up temporary test directory and mock S3 bucket connection
        """
        # Magic mocking of multiprocessing
        MockPoolExecutor().__enter__().map = map_mock
        # Mock S3 directory for upload
        self.storage_dir = "raw_frames/SMS-2010-01-01-00-00-00-0001"
        # Create temporary directory and write temp image
        self.tempdir = TempDirectory()
        self.temp_path = self.tempdir.path
        # Temporary frame
        self.im = np.ones((10, 15), dtype=np.uint16)
        self.im[2:5, 3:12] = 10000
        # Save test tif files
        self.channel_names = ['phase', 'brightfield', '666']
        # Write files in dir
        for c in self.channel_names:
            for z in range(2):
                file_name = 'img_{}_t000_p050_z00{}.tif'.format(c, z)
                file_path = os.path.join(self.temp_path, file_name)
                ijmeta = {"Info": json.dumps({"c": c, "z": z})}
                tifffile.imsave(
                    file_path,
                    self.im + 5000 * z,
                    ijmetadata=ijmeta,
                )
        # Write external metadata in dir
        self.meta_dict = {
            'Summary': {
                'Slices': 26,
                'PixelType': 'GRAY16',
                'Time': '2018-11-01 19:20:34 -0700',
                'z-step_um': 0.5,
                'PixelSize_um': 0,
                'BitDepth': 16,
                'Width': 15,
                'Height': 10
            },
        }
        self.json_filename = os.path.join(self.temp_path, 'metadata.txt')
        json_ops.write_json_file(self.meta_dict, self.json_filename)

        # Setup mock S3 bucket
        self.mock = mock_s3()
        self.mock.start()
        self.conn = boto3.resource('s3', region_name='us-east-1')
        self.bucket_name = 'czbiohub-imaging'
        self.conn.create_bucket(Bucket=self.bucket_name)
        # Instantiate file parser class
        storage_class = aux_utils.get_storage_class('s3')
        self.frames_inst = tif_splitter.TifFolderSplitter(
            data_path=self.temp_path,
            storage_dir=self.storage_dir,
            storage_class=storage_class,
        )
        # Upload data
        self.frames_inst.get_frames_and_metadata(
            filename_parser='parse_sms_name',
        )
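
# Note: `map_mock` assigned in Example #1 above is not defined in this
# snippet; it presumably replaces the pool executor's parallel map with a
# serial call so the test runs in-process. A minimal sketch of such a helper
# (the name comes from the snippet, the body is an assumption):
def map_mock(func, *iterables):
    """Serial stand-in for concurrent.futures Executor.map (assumed)."""
    return map(func, *iterables)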
Example #2
    def setUp(self):
        # Setup mock local storage
        # Create temporary directory and write temp image
        self.tempdir = TempDirectory()
        self.temp_path = self.tempdir.path
        self.tempdir.makedir('storage_mount_point')
        mount_point = os.path.join(self.temp_path, 'storage_mount_point')

        self.test_path = "/datapath/testfile.tif"
        self.storage_dir = "raw_frames/ISP-2005-06-09-20-00-00-0001"
        storage_class = aux_utils.get_storage_class('local')
        self.mock_inst = file_splitter.FileSplitter(
            data_path=self.test_path,
            storage_dir=self.storage_dir,
            storage_class=storage_class,
            storage_access=mount_point)
Example #3
    def setUp(self):
        """
        Set up temporary test directory and mock S3 bucket connection
        """
        # Test metadata parameters
        self.nbr_channels = 2
        self.nbr_slices = 3
        # Mock S3 dir
        self.storage_dir = "raw_frames/ML-2005-06-09-20-00-00-1000"
        # Create temporary directory and write temp image
        self.tempdir = TempDirectory()
        self.temp_path = self.tempdir.path
        # Temporary file with 6 frames, tifffile stores channels first
        self.im = 50 * np.ones((6, 10, 15), dtype=np.uint16)
        self.im[0, :5, 3:12] = 50000
        self.im[2, :5, 3:12] = 40000
        self.im[4, :5, 3:12] = 30000
        # Metadata
        self.description = 'ImageJ=1.52e\nimages=6\nchannels=2\nslices=3\nmax=10411.0'
        # Save test tif file
        self.file_path = os.path.join(self.temp_path, "A1_2_PROTEIN_test.tif")
        tifffile.imsave(
            self.file_path,
            self.im,
            description=self.description,
        )
        # Setup mock S3 bucket
        self.mock = mock_s3()
        self.mock.start()
        self.conn = boto3.resource('s3', region_name='us-east-1')
        self.bucket_name = 'czbiohub-imaging'
        self.conn.create_bucket(Bucket=self.bucket_name)
        # Instantiate file parser class
        storage_class = aux_utils.get_storage_class('s3')
        self.frames_inst = tif_id_splitter.TifIDSplitter(
            data_path=self.file_path,
            storage_dir=self.storage_dir,
            storage_class=storage_class,
        )
        # Upload data
        self.frames_inst.get_frames_and_metadata(
            filename_parser="parse_ml_name",
        )
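
    # The S3-backed setUp methods above start moto's mock_s3 and create a
    # testfixtures TempDirectory but do not show the matching cleanup. A
    # hedged sketch of the tearDown such tests would typically pair with the
    # setup (body assumed, not taken from the source):
    def tearDown(self):
        """
        Remove temporary directory and stop the mocked S3 connection
        """
        self.tempdir.cleanup()
        self.mock.stop()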
Example #4
def download_data(dataset_serial,
                  login,
                  dest,
                  storage='local',
                  storage_access=None,
                  metadata=True,
                  download=True,
                  nbr_workers=None,
                  positions=None,
                  times=None,
                  channels=None,
                  slices=None):
    """
    Find all files associated with unique project identifier and
    download them to a local directory.

    :param str dataset_serial: Unique dataset identifier
    :param str login: Full path to json file containing database login
                credentials
    :param str dest: Local destination directory name
    :param str storage: 'local' (default) - data will be stored locally and
                synced to S3 the same day. Or 'S3' - data will be uploaded
                directly to S3 then synced with local storage daily.
    :param str/None storage_access: If not using predefined storage locations,
                this parameter refers to mount_point for local storage and
                bucket_name for S3 storage.
    :param bool metadata: Writes metadata if True (default): global metadata
                as json and metadata for each frame as csv
    :param bool download: Downloads all files associated with the dataset
                (default True). If False, only writes metadata csvs. Only
                applies to datasets split into frames
    :param int, None nbr_workers: Number of workers for parallel download.
                If None, defaults to number of machine processors * 5
    :param list, None positions: Positions (FOVs) as integers (default
                None downloads all)
    :param list, None times: Timepoints as integers (default None downloads all)
    :param list, None channels: Channels as integer indices or strings for channel
                names (default None downloads all)
    :param list, None slices: Slice (z) integer indices (default None downloads all)
    """
    try:
        cli_utils.validate_id(dataset_serial)
    except AssertionError as e:
        raise AssertionError("Invalid ID:", e)

    # Create output directory as a subdirectory in dest named
    # dataset_serial. It stops if the subdirectory already exists to avoid
    # the risk of overwriting existing data
    dest_dir = os.path.join(dest, dataset_serial)
    try:
        os.makedirs(dest_dir, exist_ok=False)
    except FileExistsError as e:
        raise FileExistsError("Folder {} already exists, {}".format(
            dest_dir, e))

    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)
    db_utils.check_connection(db_connection)

    # Instantiate database class
    db_inst = db_ops.DatabaseOperations(dataset_serial=dataset_serial)
    # Import local or S3 storage class
    storage_class = aux_utils.get_storage_class(storage_type=storage)

    if metadata is False:
        # Just download file(s)
        assert download,\
            "You set metadata *and* download to False. You get nothing."
        with db_ops.session_scope(db_connection) as session:
            storage_dir, file_names = db_inst.get_filenames(session=session)
    else:
        # If channels can be converted to ints, they're indices
        if channels is not None:
            if not isinstance(channels, list):
                channels = [channels]
            try:
                channels = [int(c) for c in channels]
            except ValueError:
                # Channels are names, not indices
                assert all([isinstance(c, str) for c in channels]), \
                    "channels must be either all str or int"

        # Get the metadata from the requested frames
        with db_ops.session_scope(db_connection) as session:
            global_meta, frames_meta = db_inst.get_frames_meta(
                session=session,
                positions=positions,
                times=times,
                channels=channels,
                slices=slices,
            )
        # Write global metadata to destination directory
        global_meta_filename = os.path.join(
            dest_dir,
            "global_metadata.json",
        )
        json_ops.write_json_file(
            meta_dict=global_meta,
            json_filename=global_meta_filename,
        )
        # Write info for each frame to destination directory
        local_meta_filename = os.path.join(
            dest_dir,
            "frames_meta.csv",
        )
        frames_meta.to_csv(local_meta_filename, sep=",")
        # Extract folder and file names if we want to download
        storage_dir = global_meta["storage_dir"]
        file_names = frames_meta["file_name"]

    if download:
        if nbr_workers is not None:
            assert nbr_workers > 0,\
                "Nbr of worker must be >0, not {}".format(nbr_workers)
        data_loader = storage_class(
            storage_dir=storage_dir,
            nbr_workers=nbr_workers,
            access_point=storage_access,
        )
        data_loader.download_files(file_names, dest_dir)
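
# A hedged usage sketch for download_data; the dataset ID, credential path and
# destination below are hypothetical placeholders, not values from the source:
download_data(
    dataset_serial='ISP-2005-06-09-20-00-00-0001',  # ID following <ID>-YYYY-MM-DD-HH-MM-SS-<SSSS>
    login='/path/to/db_credentials.json',           # hypothetical json credentials file
    dest='/tmp/downloads',                          # files land in /tmp/downloads/<dataset_serial>
    channels=['phase'],                             # download only the 'phase' channel
    nbr_workers=4,
)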
Example #5
    def setUp(self):
        """
        Set up temporary test directory and mock S3 bucket connection
        """
        # Test metadata parameters
        self.channel_idx = 1
        self.slice_idx = 2
        self.time_idx = 3
        self.channel_name = "TESTCHANNEL"
        # Mock S3 dir
        self.storage_dir = "raw_frames/ISP-2005-06-09-20-00-00-0001"
        # Create temporary directory and write temp image
        self.tempdir = TempDirectory()
        self.temp_path = self.tempdir.path
        # Temporary frame
        self.im = np.ones((10, 15), dtype=np.uint16)
        self.im[2:5, 3:12] = 50000
        # Metadata
        mmmetadata = self._get_mmmeta()
        ijmeta = self._get_ijmeta()
        extra_tags = [('MicroManagerMetadata', 's', 0, mmmetadata, True)]
        # Save test ome tif file
        self.file_path1 = os.path.join(self.temp_path, "test_Pos1.ome.tif")
        tifffile.imsave(
            self.file_path1,
            self.im,
            ijmetadata=ijmeta,
            extratags=extra_tags,
        )
        mmmetadata = self._get_mmmeta(pos_idx=3)
        extra_tags = [('MicroManagerMetadata', 's', 0, mmmetadata, True)]
        # Save test ome tif file
        self.file_path3 = os.path.join(self.temp_path, "test_Pos3.ome.tif")
        tifffile.imsave(
            self.file_path3,
            self.im,
            ijmetadata=ijmeta,
            extratags=extra_tags,
        )
        # Setup mock S3 bucket
        self.mock = mock_s3()
        self.mock.start()
        self.conn = boto3.resource('s3', region_name='us-east-1')
        self.bucket_name = 'czbiohub-imaging'
        self.conn.create_bucket(Bucket=self.bucket_name)
        # Instantiate file parser class
        self.storage_class = aux_utils.get_storage_class('s3')
        self.frames_inst = ometif_splitter.OmeTiffSplitter(
            data_path=self.temp_path,
            storage_dir=self.storage_dir,
            storage_class=self.storage_class,
        )
        # Get path to json schema file
        dir_name = os.path.dirname(__file__)
        self.schema_file_path = os.path.realpath(
            os.path.join(dir_name, '..', '..', 'metadata_schema.json'),
        )
        # Upload data
        self.frames_inst.get_frames_and_metadata(
            schema_filename=self.schema_file_path,
            positions='[1, 3]',
        )
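
    # Example #5 relies on self._get_mmmeta() and self._get_ijmeta(), which
    # are not shown here. A hedged sketch of what such helpers might return,
    # based only on how the values are used above (all keys and values are
    # assumptions, not taken from the source):
    def _get_mmmeta(self, pos_idx=1):
        # The MicroManagerMetadata extra tag is declared as a string ('s'),
        # so return a json-encoded string
        return json.dumps({
            'ChannelIndex': self.channel_idx,
            'Slice': self.slice_idx,
            'FrameIndex': self.time_idx,
            'PositionIndex': pos_idx,
            'Channel': self.channel_name,
        })

    def _get_ijmeta(self):
        # Dict passed to tifffile.imsave's ijmetadata keyword
        return {"Info": json.dumps({"Channel": self.channel_name})}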
Example #6
def test_get_bad_storage_class():
    aux_utils.get_storage_class('no_valid_format')
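
# Example #6 passes an invalid storage type, so it is presumably expected to
# raise. A hedged sketch of how such a negative test is commonly written with
# nose (the expected exception type is an assumption, and nose.tools is
# assumed to be imported as in Example #7):
@nose.tools.raises(AssertionError)
def test_get_bad_storage_class_raises():
    aux_utils.get_storage_class('no_valid_format')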
Example #7
def test_get_storage_class():
    storage_type = 'local'
    class_inst = aux_utils.get_storage_class(storage_type)
    nose.tools.assert_true(inspect.isclass(class_inst))
    nose.tools.assert_equal(class_inst.__name__, 'LocalStorage')
Example #8
def upload_data_and_update_db(csv,
                              login,
                              config,
                              nbr_workers=None,
                              overwrite=False):
    """
    Takes a csv file in which each row represents a dataset, uploads the data
    to storage and the metadata to the database. If 'frames' is selected as upload
    type, each dataset will be split into individual 2D frames before moving
    to storage.
    TODO: Add logging instead of printing

    :param str csv: Full path to csv file containing the following fields
        for each file to be uploaded:
            str dataset_id: Unique dataset ID <ID>-YYYY-MM-DD-HH-MM-SS-<SSSS>
            str file_name: Full path to file to be uploaded
            str description: Short description of file
            str parent_dataset_id: Parent dataset unique ID if there is one
            list positions: Which position files in folder to upload.
                Uploads all if left empty and file_name is a folder.
                Only valid for ome-tiff uploads.
    :param str login: Full path to json file containing login credentials
    :param str config: Full path to json config file containing the fields:
            str upload_type: Specify if the file should be split prior to upload
                Valid options: 'frames' or 'file'
            str frames_format: Which file splitter class to use.
                Valid options:
                'ome_tiff' needs MicroManagerMetadata tag for each frame for metadata
                'tif_folder' when each file is already an individual frame
                and relies on MicroManager metadata
                'tif_id' needs ImageDescription tag in first frame page for metadata
            str storage: 'local' (default) - data will be stored locally and
                synced to S3 the same day. Or 'S3' - data will be uploaded
                directly to S3 then synced with local storage daily.
            str storage_access: If not using predefined storage locations,
                this parameter refers to mount_point for local storage and
                bucket_name for S3 storage (optional)
            str schema_filename: If splitting to frames, full path to json
                metadata schema for reading metadata (optional)
            str filename_parser: Name of the function used to parse metadata
                from file names when splitting to frames (optional)
            str microscope: Name of the microscope used to acquire the data
                (optional)
    :param int, None nbr_workers: Number of workers for parallel uploads
    :param bool overwrite: Use with caution: set True only if your upload was
            interrupted and you want to overwrite existing data in database
            and storage
    """
    # Assert that csv file exists and load it
    assert os.path.isfile(csv), \
        "File doesn't exist: {}".format(csv)
    files_data = pd.read_csv(csv)

    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)
    db_utils.check_connection(db_connection)
    # Read and validate config json
    config_json = json_ops.read_json_file(
        json_filename=config,
        schema_name="CONFIG_SCHEMA",
    )
    # Assert that upload type is valid
    upload_type = config_json['upload_type'].lower()
    assert upload_type in {"file", "frames"}, \
        "upload_type should be 'file' or 'frames', not {}".format(
            upload_type,
        )
    if nbr_workers is not None:
        assert nbr_workers > 0, \
            "Nbr of worker must be >0, not {}".format(nbr_workers)
    # Import local or S3 storage class
    storage = 'local'
    if 'storage' in config_json:
        storage = config_json['storage']
    storage_class = aux_utils.get_storage_class(storage_type=storage)
    storage_access = None
    if 'storage_access' in config_json:
        storage_access = config_json['storage_access']

    # Make sure microscope is a string
    microscope = None
    if 'microscope' in config_json:
        if isinstance(config_json['microscope'], str):
            microscope = config_json['microscope']

    if upload_type == 'frames':
        # If upload type is frames, check for the frames format
        assert 'frames_format' in config_json, \
            'You must specify the type of file(s)'
        splitter_class = aux_utils.get_splitter_class(
            config_json['frames_format'],
        )
    # Upload all files
    for file_nbr, row in files_data.iterrows():
        # Assert that ID is correctly formatted
        dataset_serial = row.dataset_id
        try:
            cli_utils.validate_id(dataset_serial)
        except AssertionError as e:
            raise AssertionError("Invalid ID:", e)

        # Get S3 directory based on upload type
        if upload_type == "frames":
            storage_dir = "/".join([FRAME_FOLDER_NAME, dataset_serial])
        else:
            storage_dir = "/".join([FILE_FOLDER_NAME, dataset_serial])
        # Instantiate database operations class
        db_inst = db_ops.DatabaseOperations(
            dataset_serial=dataset_serial,
        )
        # Make sure dataset is not already in database
        if not overwrite:
            with db_ops.session_scope(db_connection) as session:
                db_inst.assert_unique_id(session)
        # Check for parent dataset
        parent_dataset_id = 'None'
        if 'parent_dataset_id' in row:
            parent_dataset_id = row.parent_dataset_id
        # Check for dataset description
        description = None
        if 'description' in row:
            # NaN != NaN, so this check skips rows with an empty description
            if row.description == row.description:
                description = row.description

        if upload_type == "frames":
            # Instantiate splitter class
            frames_inst = splitter_class(
                data_path=row.file_name,
                storage_dir=storage_dir,
                storage_class=storage_class,
                storage_access=storage_access,
                overwrite=overwrite,
                file_format=FRAME_FILE_FORMAT,
                nbr_workers=nbr_workers,
            )
            # Get kwargs if any
            kwargs = {}
            if 'positions' in row:
                positions = row['positions']
                if not pd.isna(positions):
                    kwargs['positions'] = positions
            if 'schema_filename' in config_json:
                kwargs['schema_filename'] = config_json['schema_filename']
            if 'filename_parser' in config_json:
                filename_parser = config_json['filename_parser']
                kwargs['filename_parser'] = filename_parser
            # Extract metadata and split file into frames
            frames_inst.get_frames_and_metadata(**kwargs)

            # Add frames metadata to database
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_frames(
                        session=session,
                        description=description,
                        frames_meta=frames_inst.get_frames_meta(),
                        frames_json_meta=frames_inst.get_frames_json(),
                        global_meta=frames_inst.get_global_meta(),
                        global_json_meta=frames_inst.get_global_json(),
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                    )
            except AssertionError as e:
                print("Data set {} already in DB".format(dataset_serial), e)
        # File upload
        else:
            # Just upload file without opening it
            assert os.path.isfile(row.file_name), \
                "File doesn't exist: {}".format(row.file_name)
            data_uploader = storage_class(
                storage_dir=storage_dir,
                access_point=storage_access,
            )
            if not overwrite:
                data_uploader.assert_unique_id()
            try:
                data_uploader.upload_file(file_path=row.file_name)
                print("File {} uploaded to S3".format(row.file_name))
            except AssertionError as e:
                print("File already on S3, moving on to DB entry. {}".format(e))

            sha = meta_utils.gen_sha256(row.file_name)
            # Add file entry to DB once I can get it tested
            global_json = {"file_origin": row.file_name}
            file_name = row.file_name.split("/")[-1]
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_file(
                        session=session,
                        description=description,
                        storage_dir=storage_dir,
                        file_name=file_name,
                        global_json_meta=global_json,
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                        sha256=sha,
                    )
                print("File info for {} inserted in DB".format(dataset_serial))
            except AssertionError as e:
                print("File {} already in database".format(dataset_serial))