def get_positions(db_credentials: str, dataset_serial: str):
    """
    Queries the database for a given dataset serial number and returns a
    list of available positions.

    Parameters
    ----------
    db_credentials: str
        Absolute path to the .json credentials file
    dataset_serial: str
        dataset_serial field of a dataset in the database

    Returns
    -------
    List[int] of positions for a given experiment
    """
    credentials_str = db_utils.get_connection_str(db_credentials)
    with db_ops.session_scope(credentials_str) as session:
        frames = session.query(db_ops.Frames) \
            .join(db_ops.FramesGlobal) \
            .join(db_ops.DataSet) \
            .filter(db_ops.DataSet.dataset_serial == dataset_serial)
        positions = set()
        for f in frames:
            positions.add(f.pos_idx)
    return list(positions)
def get_channels(db_credentials: str, dataset_id: str):
    """
    Queries the database for a dataset ID and returns a dict of channel
    assignments.

    Parameters
    ----------
    db_credentials: str
        Absolute path to the .json credentials file
    dataset_id: str
        dataset_serial field of a dataset in the database

    Returns
    -------
    Dict of (channel_idx: channel_name) pairs
    """
    dbops = db_ops.DatabaseOperations(dataset_serial=dataset_id)
    credentials_str = db_utils.get_connection_str(db_credentials)
    with db_ops.session_scope(credentials_str) as session:
        _, frames_meta = dbops.get_frames_meta(session)
    df = frames_meta[['channel_name', 'channel_idx']].drop_duplicates()
    channels = {}
    for idx, row in df.iterrows():
        channels[row['channel_idx']] = row['channel_name']
    return channels
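# Illustrative usage sketch (not part of the library): lists positions and
# channels for one dataset. The credentials path and dataset serial below are
# placeholders; the db_utils/db_ops imports are assumed to be available at
# module level as used above.
def _example_list_positions_and_channels():
    credentials = '/path/to/db_credentials.json'   # hypothetical path
    dataset = 'ML-2019-05-01-10-00-00-0001'        # hypothetical serial
    print("Positions:", sorted(get_positions(credentials, dataset)))
    print("Channels:", get_channels(credentials, dataset))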
def getIDs(db_credentials, string):
    '''
    Return all dataset serials in the database that contain a given substring.
    '''
    credentials_str = db_utils.get_connection_str(db_credentials)
    with db_ops.session_scope(credentials_str) as session:
        datasets = session.query(db_ops.DataSet)
        matching_ids = []
        for d in datasets:
            name = d.dataset_serial
            if string in name:
                matching_ids.append(name)
    return matching_ids
def check_connection(db_connection):
    """
    Make sure you can connect to database before anything else.

    :param str db_connection: URI for connecting to the DB
    :raises IOError: If you can't connect to the DB
    """
    try:
        with db_ops.session_scope(db_connection) as session:
            db_ops.test_connection(session)
    except Exception as e:
        raise IOError("Can't connect to DB: {}".format(e))
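# Minimal sketch of using check_connection as a preflight test before issuing
# any queries; the credentials path is a placeholder.
def _example_check_connection():
    db_connection = db_utils.get_connection_str('/path/to/db_credentials.json')
    # Raises IOError if the database is unreachable
    check_connection(db_connection)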
def getPositions(db_credentials, dataset_identifier):
    '''
    Return the list of position indices available for a dataset.
    '''
    credentials_str = db_utils.get_connection_str(db_credentials)
    with db_ops.session_scope(credentials_str) as session:
        frames = session.query(db_ops.Frames) \
            .join(db_ops.FramesGlobal) \
            .join(db_ops.DataSet) \
            .filter(db_ops.DataSet.dataset_serial == dataset_identifier)
        positions = set()
        for f in frames:
            positions.add(f.pos_idx)
    return list(positions)
def migrate_db(credentials_filename):
    """
    Updates sha256 checksums for all files and frames

    :param credentials_filename: Full path to DB credentials file
    """
    # Edit this depending on where your database credential file is stored
    # This assumes it's stored in dir above imagingDB
    dir_name = os.path.abspath(os.path.join('..'))
    dest_dir = os.path.join(dir_name, 'temp_downloads')
    os.makedirs(dest_dir, exist_ok=True)
    credentials_str = db_utils.get_connection_str(
        credentials_filename=credentials_filename,
    )
    # Get files and compute checksums
    with db_ops.session_scope(credentials_str) as session:
        files = session.query(db_ops.FileGlobal)
        for file in files:
            if file.sha256 is None:
                data_loader = s3_storage.S3Storage(
                    storage_dir=file.storage_dir,
                )
                file_name = file.metadata_json["file_origin"]
                file_name = file_name.split("/")[-1]
                dest_path = os.path.join(dest_dir, file_name)
                data_loader.download_file(
                    file_name=file_name,
                    dest_path=dest_path,
                )
                checksum = meta_utils.gen_sha256(dest_path)
                file.sha256 = checksum
    # Get frames and compute checksums
    with db_ops.session_scope(credentials_str) as session:
        frames = session.query(db_ops.Frames)
        for frame in frames:
            if frame.sha256 is None:
                data_loader = s3_storage.S3Storage(
                    storage_dir=frame.frames_global.storage_dir,
                )
                im = data_loader.get_im(frame.file_name)
                checksum = meta_utils.gen_sha256(im)
                frame.sha256 = checksum
def getNbrPositions(self, dataset_identifier):
    '''
    Return the number of positions recorded for a dataset.
    '''
    importlib.reload(db_session)
    with db_ops.session_scope(self.credentials_filename) as session:
        # Find the FramesGlobal entry of interest
        frames_global = session.query(db_ops.FramesGlobal) \
            .join(db_ops.DataSet) \
            .filter(db_ops.DataSet.dataset_serial == dataset_identifier) \
            .all()
        nbr_positions = frames_global[0].nbr_positions
    return nbr_positions
def getAcqMeta(self, dataset_identifier):
    '''
    Return the acquisition (IJMetadata) metadata for a dataset.
    '''
    importlib.reload(db_session)
    with db_ops.session_scope(self.credentials_filename) as session:
        # Find the Frames of interest
        all_frames = session.query(db_ops.Frames) \
            .join(db_ops.FramesGlobal) \
            .join(db_ops.DataSet) \
            .filter(db_ops.DataSet.dataset_serial == dataset_identifier) \
            .all()
        acq_meta = all_frames[0].frames_global.metadata_json['IJMetadata']
    return acq_meta
def query_data(login,
               project_id=None,
               microscope=None,
               start_date=None,
               end_date=None,
               description=None):
    """
    Provide CLI access to wrappers for common queries.
    Prints the dataset IDs of the datasets returned from the query to the
    standard output device.

    :param str login: Full path to json file containing database login
        credentials
    :param str project_id: First part of dataset_serial containing
        project ID (e.g. ML)
    :param str microscope: Microscope column
    :param str start_date: Format YYYY-MM-DD. Find >= dates in date_time column
    :param str end_date: Format YYYY-MM-DD. Find <= dates in date_time column
    :param str description: Find substring in description column
    """
    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)
    search_dict = {}
    if project_id is not None:
        search_dict['project_id'] = project_id
    if microscope is not None:
        search_dict['microscope'] = microscope
    if start_date is not None:
        search_dict['start_date'] = start_date
    if end_date is not None:
        cli_utils.assert_date_order(start_date, end_date)
        search_dict['end_date'] = end_date
    if description is not None:
        search_dict['description'] = description
    with db_ops.session_scope(db_connection) as session:
        datasets = db_ops.get_datasets(session, search_dict)
        print("Number of datasets matching your query: {}".format(
            len(datasets)))
        for i, d in enumerate(datasets):
            print(i, d.dataset_serial)
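# Hypothetical query example: list datasets from one project and microscope
# within a date range. All argument values below are placeholders.
def _example_query_data():
    query_data(
        login='/path/to/db_credentials.json',
        project_id='ML',
        microscope='dragonfly',
        start_date='2019-01-01',
        end_date='2019-06-30',
    )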
def getImageMeta(self, dataset_identifier):
    '''
    Return metadata for each frame in a list.
    '''
    importlib.reload(db_session)
    with db_ops.session_scope(self.credentials_filename) as session:
        # Find the Frames of interest
        all_frames = session.query(db_ops.Frames) \
            .join(db_ops.FramesGlobal) \
            .join(db_ops.DataSet) \
            .filter(db_ops.DataSet.dataset_serial == dataset_identifier) \
            .all()
        # Get the image metadata
        image_metadata = []
        for im in all_frames:
            image_metadata.append(im.metadata_json['MicroManagerMetadata'])
    return image_metadata
def search_ids(db_credentials: str, string: str):
    """
    Retrieves all datasets in the database whose IDs contain a specified
    string.

    Parameters
    ----------
    db_credentials: str
        Absolute path to the .json credentials file
    string: str
        String to match against dataset IDs

    Returns
    -------
    List[str] of dataset IDs that contain the specified string
    """
    credentials_str = db_utils.get_connection_str(db_credentials)
    with db_ops.session_scope(credentials_str) as session:
        datasets = session.query(db_ops.DataSet)
        matching_ids = []
        for d in datasets:
            name = d.dataset_serial
            if string in name:
                matching_ids.append(name)
    return matching_ids
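# Sketch: find all dataset IDs containing a given substring, e.g. a project
# prefix. The credentials path and prefix are placeholders.
def _example_search_ids():
    for dataset_id in search_ids('/path/to/db_credentials.json', 'ML-2019'):
        print(dataset_id)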
def getFrames(self, dataset_identifier, channels='all', slices='all'):
    '''
    Get particular slices from an imaging dataset.
    Todo: add slicing for pos and time.
    '''
    # Open the session
    importlib.reload(db_session)
    with db_ops.session_scope(self.credentials_filename) as session:
        # Find the Frames of interest
        all_frames = session.query(db_ops.Frames) \
            .join(db_ops.FramesGlobal) \
            .join(db_ops.DataSet) \
            .filter(db_ops.DataSet.dataset_serial == dataset_identifier)
        # Filter by channel
        if channels == 'all':
            pass
        elif isinstance(channels, tuple):
            all_frames = all_frames.filter(
                db_ops.Frames.channel_name.in_(channels))
        else:
            raise ValueError('Invalid channel query')
        # Filter by slice
        if slices == 'all':
            pass
        elif isinstance(slices, tuple):
            all_frames = all_frames.filter(
                db_ops.Frames.slice_idx.in_(slices))
        else:
            raise ValueError('Invalid slice query')
        all_frames = all_frames.all()
        # Get the names of the files
        file_names = [im.file_name for im in all_frames]
        # Get the bit depth
        bit_depth = all_frames[0].frames_global.bit_depth
        # Get the shape of the stack
        # TODO: get the shape from the acq meta
        stack_shape = (
            all_frames[0].frames_global.im_width,
            all_frames[0].frames_global.im_height,
            all_frames[0].frames_global.im_colors,
            len(all_frames),
        )
        # Get the folder
        s3_dir = all_frames[0].frames_global.s3_dir
        # Download the files
        data_loader = s3_storage.DataStorage(s3_dir=s3_dir)
        im_stack = data_loader.get_stack(file_names, stack_shape, bit_depth)
    return im_stack
def getStack(self, dataset_identifier, channel, time_idx=0, pos_idx=0,
             verbose=False):
    '''
    Download a stack at a given set of pos, time, channel indices

    Returns
        im_ordered : np.ndarray containing the image [time, chan, z, x, y]
    '''
    with db_ops.session_scope(self.credentials_filename) as session:
        # Find the Frames of interest
        all_frames = session.query(db_ops.Frames) \
            .join(db_ops.FramesGlobal) \
            .join(db_ops.DataSet) \
            .filter(db_ops.DataSet.dataset_serial == dataset_identifier) \
            .filter(db_ops.Frames.channel_name == channel) \
            .filter(db_ops.Frames.time_idx == time_idx) \
            .filter(db_ops.Frames.pos_idx == pos_idx) \
            .all()
        # Get the names of the files
        file_names = [im.file_name for im in all_frames]
        if len(file_names) == 0:
            raise ValueError('No images match query')
        # Get the bit depth
        bit_depth = all_frames[0].frames_global.bit_depth
        # Get the shape of the stack
        # TODO: get the shape from the acq meta
        stack_shape = (
            all_frames[0].frames_global.im_width,
            all_frames[0].frames_global.im_height,
            all_frames[0].frames_global.im_colors,
            len(all_frames),
        )
        # Get the folder
        s3_dir = all_frames[0].frames_global.s3_dir
        # Download the files
        data_loader = s3_storage.DataStorage(s3_dir=s3_dir)
        im_stack = data_loader.get_stack(file_names, stack_shape, bit_depth)
        # Reorder into [time, chan, z, x, y]
        im_ordered = np.zeros(
            (1, 1, stack_shape[3], stack_shape[0], stack_shape[1]),
            dtype='uint16')
        # Todo update get_stack so this isn't required...
        for im_idx in range(len(all_frames)):
            im_ordered[0, 0, im_idx, :, :] = im_stack[:, :, 0, im_idx]
    return im_ordered
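# Sketch of retrieving frames with the two methods above. db_frames is assumed
# to be an instance of the class that defines getFrames/getStack, constructed
# with a valid credentials_filename; the dataset serial and channel names are
# placeholders.
def _example_get_frames_and_stack(db_frames):
    # All frames for two channels, as a [x, y, color, frame] stack
    im_stack = db_frames.getFrames(
        dataset_identifier='ML-2019-05-01-10-00-00-0001',
        channels=('DAPI', 'GFP'),
    )
    # One channel/position/time as a [time, chan, z, x, y] array
    im_ordered = db_frames.getStack(
        dataset_identifier='ML-2019-05-01-10-00-00-0001',
        channel='DAPI',
        time_idx=0,
        pos_idx=0,
    )
    print(im_stack.shape, im_ordered.shape)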
def make_experiment_csv(db_credentials: str,
                        csv_file: str,
                        image_ids: List[str],
                        channels: List[str],
                        metadata_format: str = 'micromanager',
                        positions: List[int] = [0],
                        time: int = 0,
                        data_path: str = '/Volumes/imaging/czbiohub-imaging'):
    """
    Creates a CSV file mapping imagingDB frames to indices in an ImageStack
    for usage with the spacetx format writer.

    Parameters
    ----------
    db_credentials : str
        Path to the database credentials file
    csv_file : str
        File path of the resulting CSV file
    image_ids : List[str]
        A list of the image ids, in the order of the imaging rounds
    channels : List[str]
        A list of the channels to be downloaded, in index order
    metadata_format : str
        Format for the image metadata on imagingDB. For micromanager,
        set to 'micromanager'. Default value is 'micromanager'
    positions : List[int]
        Indices of the positions to download. The default value is [0].
    time : int
        Index of the time point to download. The default value is 0.
    data_path : str
        Path to the image store volume
    """
    meta_keys = metadata_keys[metadata_format.lower()]
    fov = []
    rnd = []
    channel = []
    z = []
    file_path = []
    xc_min = []
    xc_max = []
    yc_min = []
    yc_max = []
    zc_min = []
    zc_max = []
    tile_width = []
    tile_height = []
    credentials_str = db_utils.get_connection_str(db_credentials)
    with db_ops.session_scope(credentials_str) as session:
        for r, im_id in enumerate(image_ids):
            for fov_idx, p in enumerate(positions):
                for chan_idx, c in enumerate(channels):
                    frames = session.query(db_ops.Frames) \
                        .join(db_ops.FramesGlobal) \
                        .join(db_ops.DataSet) \
                        .filter(db_ops.DataSet.dataset_serial == im_id) \
                        .filter(db_ops.Frames.pos_idx == p) \
                        .filter(db_ops.Frames.channel_name == c) \
                        .filter(db_ops.Frames.time_idx == time)
                    for frame in frames:
                        # Determine pixel size and image dimensions
                        pixel_size = frame.metadata_json[meta_keys['key']][
                            meta_keys['pixel_size']]
                        im_width = frame.frames_global.im_width
                        im_height = frame.frames_global.im_height
                        # Add frame indices
                        fov.append(fov_idx)
                        rnd.append(r)
                        channel.append(chan_idx)
                        z.append(frame.slice_idx)
                        # Clean any windows file path seps before adding path
                        fp = os.path.join(frame.frames_global.storage_dir,
                                          frame.file_name)
                        clean_fp = os.path.join(*fp.split('\\'))
                        file_path.append(clean_fp)
                        xc_min.append(frame.metadata_json[meta_keys['key']][
                            meta_keys['xpos_um']])
                        xc_max.append(xc_min[-1] + im_width * pixel_size)
                        yc_min.append(frame.metadata_json[meta_keys['key']][
                            meta_keys['ypos_um']])
                        yc_max.append(yc_min[-1] + im_height * pixel_size)
                        zc_min.append(frame.metadata_json[meta_keys['key']][
                            meta_keys['zpos_um']])
                        zc_max.append(frame.metadata_json[meta_keys['key']][
                            meta_keys['zpos_um']])
                        tile_width.append(im_width)
                        tile_height.append(im_height)
    sha = _calc_checksums(file_path, data_path)
    data = [
        fov, rnd, channel, z, file_path, sha,
        xc_min, xc_max, yc_min, yc_max, zc_min, zc_max,
        tile_width, tile_height,
    ]
    columns = [
        'fov', 'round', 'ch', 'zplane', 'path', 'sha256',
        'xc_min', 'xc_max', 'yc_min', 'yc_max', 'zc_min', 'zc_max',
        'tile_width', 'tile_height',
    ]
    im_df = pd.DataFrame(dict(zip(columns, data)))
    im_df.to_csv(csv_file)
    return im_width, im_height
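# Illustrative call for writing a spacetx-style tile CSV; dataset IDs, channel
# names and the output path are placeholders.
def _example_make_experiment_csv():
    im_width, im_height = make_experiment_csv(
        db_credentials='/path/to/db_credentials.json',
        csv_file='experiment_tiles.csv',
        image_ids=['ML-2019-05-01-10-00-00-0001'],
        channels=['DAPI', 'GFP'],
        positions=[0],
        time=0,
    )
    print("Tile size: {} x {}".format(im_width, im_height))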
def download_data(dataset_serial,
                  login,
                  dest,
                  storage='local',
                  storage_access=None,
                  metadata=True,
                  download=True,
                  nbr_workers=None,
                  positions=None,
                  times=None,
                  channels=None,
                  slices=None):
    """
    Find all files associated with unique project identifier and
    download them to a local directory.

    :param str dataset_serial: Unique dataset identifier
    :param str login: Full path to json file containing database login
        credentials
    :param str dest: Local destination directory name
    :param str storage: 'local' (default) - data will be stored locally and
        synced to S3 the same day. Or 'S3' - data will be uploaded directly
        to S3 then synced with local storage daily.
    :param str/None storage_access: If not using predefined storage locations,
        this parameter refers to mount_point for local storage and
        bucket_name for S3 storage.
    :param bool metadata: Writes metadata (default True)
        global metadata in json, local for each frame in csv
    :param bool download: Downloads all files associated with dataset (default)
        If False, will only write csvs with metadata. Only for datasets
        split into frames.
    :param int/None nbr_workers: Number of workers for parallel download
        If None, it defaults to number of machine processors * 5
    :param list/None positions: Positions (FOVs) as integers (default None
        downloads all)
    :param list/None times: Timepoints as integers (default None downloads all)
    :param list/None channels: Channels as integer indices or strings for
        channel names (default None downloads all)
    :param list/None slices: Slice (z) integer indices (default None
        downloads all)
    """
    try:
        cli_utils.validate_id(dataset_serial)
    except AssertionError as e:
        raise AssertionError("Invalid ID:", e)

    # Create output directory as a subdirectory in dest named
    # dataset_serial. It stops if the subdirectory already exists to avoid
    # the risk of overwriting existing data
    dest_dir = os.path.join(dest, dataset_serial)
    try:
        os.makedirs(dest_dir, exist_ok=False)
    except FileExistsError as e:
        raise FileExistsError("Folder {} already exists, {}".format(
            dest_dir, e))

    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)
    db_utils.check_connection(db_connection)
    # Instantiate database class
    db_inst = db_ops.DatabaseOperations(
        dataset_serial=dataset_serial,
    )
    # Import local or S3 storage class
    storage_class = aux_utils.get_storage_class(storage_type=storage)

    if metadata is False:
        # Just download file(s)
        assert download, \
            "You set metadata *and* download to False. You get nothing."
        with db_ops.session_scope(db_connection) as session:
            storage_dir, file_names = db_inst.get_filenames(
                session=session,
            )
    else:
        # If channels can be converted to ints, they're indices
        if channels is not None:
            if not isinstance(channels, list):
                channels = [channels]
            try:
                channels = [int(c) for c in channels]
            except ValueError:
                # Channels are names, not indices
                assert all([isinstance(c, str) for c in channels]), \
                    "channels must be either all str or int"
        # Get the metadata from the requested frames
        with db_ops.session_scope(db_connection) as session:
            global_meta, frames_meta = db_inst.get_frames_meta(
                session=session,
                positions=positions,
                times=times,
                channels=channels,
                slices=slices,
            )
        # Write global metadata to destination directory
        global_meta_filename = os.path.join(
            dest_dir,
            "global_metadata.json",
        )
        json_ops.write_json_file(
            meta_dict=global_meta,
            json_filename=global_meta_filename,
        )
        # Write info for each frame to destination directory
        local_meta_filename = os.path.join(
            dest_dir,
            "frames_meta.csv",
        )
        frames_meta.to_csv(local_meta_filename, sep=",")
        # Extract folder and file names if we want to download
        storage_dir = global_meta["storage_dir"]
        file_names = frames_meta["file_name"]

    if download:
        if nbr_workers is not None:
            assert nbr_workers > 0, \
                "Number of workers must be > 0, not {}".format(nbr_workers)
        data_loader = storage_class(
            storage_dir=storage_dir,
            nbr_workers=nbr_workers,
            access_point=storage_access,
        )
        data_loader.download_files(file_names, dest_dir)
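# Sketch: download one channel at one position from a dataset into a new
# subdirectory of dest. Paths, dataset serial and channel name are placeholders.
def _example_download_data():
    download_data(
        dataset_serial='ML-2019-05-01-10-00-00-0001',
        login='/path/to/db_credentials.json',
        dest='/tmp/imaging_downloads',
        storage='local',
        positions=[0],
        channels=['DAPI'],
    )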
def upload_data_and_update_db(csv,
                              login,
                              config,
                              nbr_workers=None,
                              overwrite=False):
    """
    Takes a csv file in which each row represents a dataset, uploads the
    data to storage and metadata to database. If 'frames' is selected as
    upload type, each dataset will be split into individual 2D frames before
    moving to storage.
    TODO: Add logging instead of printing

    :param str csv: Full path to csv file containing the following fields
        for each file to be uploaded:
            str dataset_id: Unique dataset ID <ID>-YYYY-MM-DD-HH-MM-SS-<SSSS>
            str file_name: Full path to file to be uploaded
            str description: Short description of file
            str parent_dataset_id: Parent dataset unique ID if there is one
            list positions: Which position files in folder to upload.
                Uploads all if left empty and file_name is a folder.
                Only valid for ome-tiff uploads.
    :param str login: Full path to json file containing login credentials
    :param str config: Full path to json config file containing the fields:
            str upload_type: Specify if the file should be split prior to
                upload. Valid options: 'frames' or 'file'
            str frames_format: Which file splitter class to use.
                Valid options:
                'ome_tiff' needs MicroManagerMetadata tag for each frame
                    for metadata
                'tif_folder' when each file is already an individual frame
                    and relies on MicroManager metadata
                'tif_id' needs ImageDescription tag in first frame page
                    for metadata
            str storage: 'local' (default) - data will be stored locally and
                synced to S3 the same day. Or 'S3' - data will be uploaded
                directly to S3 then synced with local storage daily.
            str storage_access: If not using predefined storage locations,
                this parameter refers to mount_point for local storage and
                bucket_name for S3 storage. (optional)
            str json_meta: If splitting to frames, give full path to json
                metadata schema for reading metadata (optional)
    :param int/None nbr_workers: Number of workers for parallel uploads
    :param bool overwrite: Use with caution if your upload was interrupted
        and you want to overwrite existing data in database and storage
    """
    # Assert that csv file exists and load it
    assert os.path.isfile(csv), \
        "File doesn't exist: {}".format(csv)
    files_data = pd.read_csv(csv)

    # Get database connection URI
    db_connection = db_utils.get_connection_str(login)
    db_utils.check_connection(db_connection)

    # Read and validate config json
    config_json = json_ops.read_json_file(
        json_filename=config,
        schema_name="CONFIG_SCHEMA",
    )
    # Assert that upload type is valid
    upload_type = config_json['upload_type'].lower()
    assert upload_type in {"file", "frames"}, \
        "upload_type should be 'file' or 'frames', not {}".format(
            upload_type,
        )
    if nbr_workers is not None:
        assert nbr_workers > 0, \
            "Number of workers must be > 0, not {}".format(nbr_workers)
    # Import local or S3 storage class
    storage = 'local'
    if 'storage' in config_json:
        storage = config_json['storage']
    storage_class = aux_utils.get_storage_class(storage_type=storage)
    storage_access = None
    if 'storage_access' in config_json:
        storage_access = config_json['storage_access']
    # Make sure microscope is a string
    microscope = None
    if 'microscope' in config_json:
        if isinstance(config_json['microscope'], str):
            microscope = config_json['microscope']

    if upload_type == 'frames':
        # If upload type is frames, check for frames format
        assert 'frames_format' in config_json, \
            'You must specify the type of file(s)'
        splitter_class = aux_utils.get_splitter_class(
            config_json['frames_format'],
        )

    # Upload all files
    for file_nbr, row in files_data.iterrows():
        # Assert that ID is correctly formatted
        dataset_serial = row.dataset_id
        try:
            cli_utils.validate_id(dataset_serial)
        except AssertionError as e:
            raise AssertionError("Invalid ID:", e)

        # Get storage directory based on upload type
        if upload_type == "frames":
            storage_dir = "/".join([FRAME_FOLDER_NAME, dataset_serial])
        else:
            storage_dir = "/".join([FILE_FOLDER_NAME, dataset_serial])
        # Instantiate database operations class
        db_inst = db_ops.DatabaseOperations(
            dataset_serial=dataset_serial,
        )
        # Make sure dataset is not already in database
        if not overwrite:
            with db_ops.session_scope(db_connection) as session:
                db_inst.assert_unique_id(session)
        # Check for parent dataset
        parent_dataset_id = 'None'
        if 'parent_dataset_id' in row:
            parent_dataset_id = row.parent_dataset_id
        # Check for dataset description
        description = None
        if 'description' in row:
            if row.description == row.description:
                description = row.description

        if upload_type == "frames":
            # Instantiate splitter class
            frames_inst = splitter_class(
                data_path=row.file_name,
                storage_dir=storage_dir,
                storage_class=storage_class,
                storage_access=storage_access,
                overwrite=overwrite,
                file_format=FRAME_FILE_FORMAT,
                nbr_workers=nbr_workers,
            )
            # Get kwargs if any
            kwargs = {}
            if 'positions' in row:
                positions = row['positions']
                if not pd.isna(positions):
                    kwargs['positions'] = positions
            if 'schema_filename' in config_json:
                kwargs['schema_filename'] = config_json['schema_filename']
            if 'filename_parser' in config_json:
                filename_parser = config_json['filename_parser']
                kwargs['filename_parser'] = filename_parser
            # Extract metadata and split file into frames
            frames_inst.get_frames_and_metadata(**kwargs)
            # Add frames metadata to database
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_frames(
                        session=session,
                        description=description,
                        frames_meta=frames_inst.get_frames_meta(),
                        frames_json_meta=frames_inst.get_frames_json(),
                        global_meta=frames_inst.get_global_meta(),
                        global_json_meta=frames_inst.get_global_json(),
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                    )
            except AssertionError as e:
                print("Data set {} already in DB".format(dataset_serial), e)
        # File upload
        else:
            # Just upload file without opening it
            assert os.path.isfile(row.file_name), \
                "File doesn't exist: {}".format(row.file_name)
            data_uploader = storage_class(
                storage_dir=storage_dir,
                access_point=storage_access,
            )
            if not overwrite:
                data_uploader.assert_unique_id()
            try:
                data_uploader.upload_file(file_path=row.file_name)
                print("File {} uploaded to S3".format(row.file_name))
            except AssertionError as e:
                print("File already on S3, moving on to DB entry. {}".format(e))

            sha = meta_utils.gen_sha256(row.file_name)
            # Add file entry to DB
            global_json = {"file_origin": row.file_name}
            file_name = row.file_name.split("/")[-1]
            try:
                with db_ops.session_scope(db_connection) as session:
                    db_inst.insert_file(
                        session=session,
                        description=description,
                        storage_dir=storage_dir,
                        file_name=file_name,
                        global_json_meta=global_json,
                        microscope=microscope,
                        parent_dataset=parent_dataset_id,
                        sha256=sha,
                    )
                print("File info for {} inserted in DB".format(dataset_serial))
            except AssertionError as e:
                print("File {} already in database".format(dataset_serial))
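# Sketch of an upload call. The CSV is expected to contain the columns
# described in the docstring (dataset_id, file_name, description, ...) and the
# config JSON the upload_type/frames_format/storage fields; all paths here are
# placeholders.
def _example_upload():
    upload_data_and_update_db(
        csv='/path/to/upload_manifest.csv',
        login='/path/to/db_credentials.json',
        config='/path/to/upload_config.json',
        nbr_workers=4,
    )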