def distribute_struct_scores_bonus(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/20200911_classifier_features_bonus/manifest_20201007_tg.csv",
    dataset_name="struct_scores_bonus",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/20200911_classifier_features_bonus/README.md",
    )

    # set data path cols, metadata cols, and extra files
    # ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["result_image_path"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_scrnaseq_data(
    test=False,
    csv_loc="scrnaseq_data_raw.csv",
    dataset_name="scrnaseq_data",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    df = pd.read_csv(csv_loc)

    # subsample features to make test
    if test:
        # write test matrix
        make_test_mtx(csv_loc=csv_loc)

        # make test manifest; counts only; no anndata
        df = pd.DataFrame(
            {
                "counts": [
                    "raw_counts_test.mtx",
                    df["counts"][1],
                    "cells_test.csv",
                    "cells_test.csv",
                ]
            }
        )
        dataset_name = f"{dataset_name}_test"

        # create the dataset without supplementary files
        ds = Dataset(
            dataset=df,
            name=dataset_name,
            package_owner=package_owner,
            readme_path="README.md",
        )

        # columns with files to upload
        ds.set_path_columns(["counts"])
    else:
        ds = Dataset(
            dataset=df,
            name=dataset_name,
            package_owner=package_owner,
            readme_path="README.md",
        )

        # columns with files to upload
        ds.set_path_columns(["counts", "anndata"])

        # anndata object (h5ad) as supplementary files
        ds.set_extra_files(
            [
                "/allen/aics/gene-editing/RNA_seq/scRNAseq_SeeligCollaboration/2019_analysis/merged_experiment_1_2/scrnaseq_cardio_20191210.RData"
            ]
        )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_seg_dataset(
    test=False,
    csv_loc="../input_segs_and_tiffs/raw_seg_013_014_images.csv",
    col_name_map={
        "fov_path": "original_fov_location",
        "FOVId": "fov_id",
        "seg_file_name": "2D_fov_tiff_path",
    },
    dataset_name="2d_segmented_fields",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # structure scores as auxiliary file
    score_files = [
        Path(f"../structure_scores/structure_score_55000000{p}.csv") for p in (13, 14)
    ]
    score_dfs = [
        pd.read_csv(f).rename({"mh Score": "mh score"}, axis="columns")
        for f in score_files
    ]
    df_score = pd.concat(score_dfs, axis="rows", ignore_index=True, sort=False)
    df_score.to_csv(Path("../structure_scores/structure_scores.csv"))

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["2D_fov_tiff_path"])
    ds.set_extra_files(
        ["../channel_defs.json", "../structure_scores/structure_scores.csv"]
    )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
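# Usage sketch (illustrative; not part of the original module). Each distributor
# above can be smoke-tested by first pushing a small "<dataset_name>_test" package,
# then the full dataset. Assumes pandas, quilt3distribute, and credentials for
# s3://allencell-internal-quilt are available in the environment.
if __name__ == "__main__":
    distribute_seg_dataset(test=True)  # pushes "2d_segmented_fields_test" with 2 rows
    distribute_seg_dataset()           # pushes the full "2d_segmented_fields" package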
def test_dataset_metadata_numpy_type_casting(example_frame, example_readme):
    # Add numpy column to frame
    example_frame["NumpyTypes"] = np.zeros(9)
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)

    # Add column filled with numpy types to index
    ds.set_metadata_columns(["NumpyTypes"])

    # Just run distribute to make sure that numpy types are cast fine
    ds.distribute()
def distribute_cellprofiler_features(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/cp_20201022/merged_features/features2quilt/features2quilt.csv",
    dataset_name="2d_autocontrasted_single_cell_features_actn2_2",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    df = pd.read_csv(csv_loc)

    # subsample features to make test
    if test:
        # write test feature csv and test image counts csv
        make_test_csv(csv_loc=csv_loc)

        cell_line = df["cell_line"][0]
        cellprofiler_id = df["cellprofiler_id"][0]

        # make test manifest
        df = pd.DataFrame(
            {
                "feature_file": ["cp_features_test.csv"],
                "image_object_count_file": ["image_object_counts_test.csv"],
                "cell_line": [cell_line],
                "cellprofiler_id": [cellprofiler_id],
            }
        )
        dataset_name = f"{dataset_name}_test"

    # Create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/cp_20201022/merged_features/features2quilt/README.md",
    )

    # Optionally add common additional requirements
    ds.add_usage_doc("https://docs.quiltdata.com/walkthrough/reading-from-a-package")
    ds.add_license("https://www.allencell.org/terms-of-use.html")

    # Optionally indicate column values to use for file metadata
    ds.set_metadata_columns(["cell_line", "cellprofiler_id"])

    # Optionally rename the columns on the package level
    ds.set_column_names_map(
        {"feature_file": "features", "image_object_count_file": "object_counts"}
    )

    # add commit hash to message
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )

    # Distribute
    ds.distribute(
        push_uri=s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def test_dataset_metadata_non_json_serializable_type(example_frame, example_readme):
    # Add non json serializable type to dataframe
    example_frame["BadType"] = [SomeDummyObject(i) for i in range(9)]
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)

    # Add column filled with non serializable type to index
    ds.set_metadata_columns(["BadType"])

    # Check non json serializable type check fails
    with pytest.raises(TypeError):
        ds.distribute()
def distribute_struct_scores_actn2_live(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/20201012_actn2_live_classifier_with_metadata/live_manifest.csv",
    dataset_name="struct_scores_actn2_live",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    df["CellPath_x"] = df["CellPath_x"].str.replace(
        "singlecells",
        "/allen/aics/assay-dev/computational/data/cardio_pipeline_datastep/local_staging_pipeline_actn2/singlecells/singlecells",
        regex=False,
    )

    df = df.drop(
        columns=[
            "BackgroundPath",
            "ClassificationPath",
            "MemMaxProjectionPath",
            "MemSegmentationPath",
            "NucMaxProjectionPath",
            "StrMaxIntensitySlicePath",
            "CellPath_y",
            "path",
            "image_name",
            "cell_id_filename",
        ]
    )

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/20200929_classifier_features_actn2/README_actn2_live.md",
    )

    # set data path cols, metadata cols, and extra files
    # ds.set_metadata_columns(["RawFilePath", "BackgroundPath", "ClassificationPath", "MemMaxProjectionPath", "MemSegmentationPath", "NucMaxProjectionPath", "StrMaxIntensitySlicePath"])
    ds.set_path_columns(["CellPath_x"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_autocontrasted_dataset(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/image_manifest_final.csv",
    col_name_map={},
    dataset_name="2d_autocontrasted_fields_and_single_cells_actn2",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols and drop the unused original fov tiff path column
    df = df.rename(col_name_map, axis="columns")
    df = df.drop(["2D_fov_tiff_path"], axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(
        ["rescaled_2D_fov_tiff_path", "rescaled_2D_single_cell_tiff_path"]
    )
    ds.set_extra_files(
        [
            "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/channel_defs.json",
            "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/parameters.json",
        ]
    )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def main():
    try:
        args = Args()

        # Create dataset
        ds = Dataset(
            dataset=args.dataset_path,
            name=args.dataset_name,
            package_owner=args.package_owner,
            readme_path=args.readme_path,
        )

        # Handle optional arguments if provided
        if args.usage_doc_or_link:
            ds.add_usage_doc(args.usage_doc_or_link)
        if args.license_doc_or_link:
            ds.add_license(args.license_doc_or_link)
        if args.metadata_columns:
            ds.set_metadata_columns(args.metadata_columns)
        if args.path_columns:
            ds.set_path_columns(args.path_columns)

        # Distribute
        pkg = ds.distribute(push_uri=args.push_uri, message=args.message)

        log.info(
            f"Completed distribution. "
            f"Package [name: '{args.package_owner}/{args.dataset_name}', version: {pkg.top_hash}]"
        )

    except Exception as e:
        log.error("=============================================")
        if args.debug:
            log.error("\n\n" + traceback.format_exc())
            log.error("=============================================")
        log.error("\n\n" + str(e) + "\n")
        log.error("=============================================")
        sys.exit(1)
def test_dataset_auto_metadata_grouping_repeated_values(
    repeated_values_frame, example_readme
):
    """
    The repeated values dataset has three unique files spread across nine rows of
    data. This test checks that only three files are passed to the package object,
    that each file's metadata carries a list of the unique CellIds that reference
    it, and that, because the structure is the same for every row of a given file,
    the Structure value is reduced to a single value.
    """
    # Create dataset from frame
    ds = Dataset(repeated_values_frame, "test_dataset", "me", example_readme)
    ds.set_metadata_columns(["CellId", "Structure"])

    # Generate package
    pkg = ds.distribute()

    # Check file groupings available
    assert set(pkg.keys()) == {
        "SourceReadPath",
        "README.md",
        "metadata.csv",
        "referenced_files",
    }

    # Check that only three tiffs were attached to package
    assert len(pkg["SourceReadPath"]) == 3

    # Check that CellId is a list because of repeated values, while Structure is a
    # string because its value is constant for each file
    for f in pkg["SourceReadPath"]:
        assert isinstance(pkg["SourceReadPath"][f].meta["CellId"], list)
        assert isinstance(pkg["SourceReadPath"][f].meta["Structure"], str)
def distribute_struct_scores_actn2(
    test=False,
    csv_loc="/allen/aics/assay-dev/MicroscopyOtherData/Viana/projects/fish_morphology_code/fish_morphology_code/processing/structure_organization/results_Fish/AssayDevFishAnalsysis-Handoff-transcript2protein.csv",
    dataset_name="struct_scores_actn2_2",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # only include new actn2 fish in this package -> 5500000322/323 imaged 2020-10
    date = df["original_fov_location"].str.split("/", expand=True)
    df["date"] = date[7]
    df = df[df.date.isin(["20201002", "20201006"])]
    df = df.drop(columns=["date"])

    # update result image dir (moved after processing)
    img_dir = "/allen/aics/assay-dev/MicroscopyOtherData/Viana/projects/fish_morphology_code/fish_morphology_code/processing/structure_organization/output_Fish/"
    new_result_path = [
        img_dir + Path(x).name for x in df["result_image_path"].tolist()
    ]
    df["result_image_path"] = new_result_path

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/20200929_classifier_features_actn2/README.md",
    )

    # set data path cols, metadata cols, and extra files
    # ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["result_image_path"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_nonstructure_dataset(
    test=False,
    csv_loc="nonstructure_fov_manifest_for_quilt.csv",
    col_name_map={"FOVId": "fov_id", "fov_path": "original_fov_location"},
    dataset_name="2d_nonstructure_fields",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["merged_2D_fov_tiff_path"])
    ds.set_extra_files(["channel_defs.json"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_nuclear_masks(
    test=False,
    csv_loc=Path(
        "/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv"
    ),
    dataset_name="2d_nuclear_masks",
    package_owner="calystay",
    s3_bucket="s3://allencell-internal-quilt",
    readme_path="README.md",
):
    # read in original csv
    df_in = pd.read_csv(csv_loc)

    # extract original_fov_location and nuc_mask_path from dataframe
    df = df_in[["original_fov_location", "nuc_mask_path"]]
    df = df.drop_duplicates()

    # drop any cols with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for eg a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path=readme_path,
    )

    # set data path cols and metadata cols; only the two columns kept in the
    # manifest above can be referenced here
    ds.set_metadata_columns(["original_fov_location"])
    ds.set_path_columns(["nuc_mask_path"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def test_dataset_file_grouping_with_matching_names(same_filenames_frame, example_readme):
    # Create dataset from frame
    ds = Dataset(same_filenames_frame, "test_dataset", "me", example_readme)

    # Generate package
    pkg = ds.distribute()

    # Check file groupings available
    assert set(pkg.keys()) == {
        "SourceReadPath",
        "README.md",
        "metadata.csv",
        "referenced_files",
    }

    # Check that 18 unique files were attached to package
    assert len(pkg["SourceReadPath"]) == 18
# Indicate column values to use for file metadata
ds.set_metadata_columns([
    "CellId",
    "CellIndex",
    "CellLine",
    "NucMembSegmentationAlgorithm",
    "NucMembSegmentationAlgorithmVersion",
    "FOVId",
    "Gene",
    "PlateId",
    "WellId",
    "ProteinDisplayName",
    "StructureDisplayName",
    "Workflow",
    "FeatureExplorerURL",
])

# Set produced package directory naming
ds.set_column_names_map({
    "save_feats_path": "cell_features",
    "save_reg_path": "cell_images_3d",
    "save_reg_path_flat": "cell_images_2d",
    "save_reg_path_flat_proj": "cell_images_2d_projections",
})

# Add any extra files
ds.set_extra_files({
    "contact_sheets": list(scp_output_dir.glob("diagnostics_*.png")),
})

# Step 6:
# Distribute the package
ds.distribute(
    push_uri="s3://allencell",
    message="Statistical Integrated Cell Research Data including Controls",
)

print("-" * 80)
print("COMPLETE")
"NucMembSegmentationAlgorithmVersion", "FOVId", "Gene", "PlateId", "WellId", "ProteinDisplayName", "StructureDisplayName", "Workflow", "FeatureExplorerURL" ]) # Set produced package directory naming ds.set_column_names_map({ "MembraneContourReadPath": "membrane_contours", "MembraneSegmentationReadPath": "membrane_segmentations", "NucleusContourReadPath": "dna_contours", "NucleusSegmentationReadPath": "dna_segmentations", "SourceReadPath": "fovs", "StructureContourReadPath": "structure_contours", "StructureSegmentationReadPath": "structure_segmentations" }) # Step 6: # Distribute the package ds.distribute(push_uri="s3://quilt-aics", message="Add feature explorer links to metadata") print("-" * 80) print("COMPLETE")
"### Global structure organization and local structural alignment features\n\n" ) for meta in metadata: for key, value in meta.items(): ftxt.write("- `{0}`: {1}\n".format( value["name"] if value["name"] is not None else key, value["description"], )) # Checking expected shape of the dataframe assert df.shape == (5161, 25) # Save a hand off version for the Modeling team df.to_csv("../results/AssayDevFishAnalsysis-Handoff.csv") # Upload to Quilt ds = Dataset( dataset="../results/AssayDevFishAnalsysis-Handoff.csv", name="assay_dev_fish_analysis", package_owner="matheus", readme_path="assay-dev-fish.md", ) # Set metadata and path columns ds.set_metadata_columns(["CellId"]) ds.set_path_columns(["result_image_path"]) # Send to Quilt pkg = ds.distribute(push_uri="s3://allencell-internal-quilt", message="Fish dataset by assay-dev")
"WellId", "ProteinDisplayName", "StructureDisplayName", "Workflow", "FeatureExplorerURL" ]) # Set produced package directory naming ds.set_column_names_map({ "MembraneContourReadPath": "membrane_contours", "MembraneSegmentationReadPath": "membrane_segmentations", "NucleusContourReadPath": "dna_contours", "NucleusSegmentationReadPath": "dna_segmentations", "SourceReadPath": "fovs", "StructureContourReadPath": "structure_contours", "StructureSegmentationReadPath": "structure_segmentations" }) # Step 6: # Distribute the package ds.distribute( push_uri="s3://allencell", message="Update feature explorer links and documentation for new bucket") print("-" * 80) print("COMPLETE")
"tanyasg/2d_autocontrasted_single_cell_features", "s3://allencell-internal-quilt", ) df_feat_inds = p_feats["features"]["a749d0e2_cp_features.csv"]()[["fov_path"]].rename(columns={"fov_path":"original_fov_location"}) df_feat_inds = df_feat_inds.drop_duplicates() for index, row in df_feat_inds.iterrows(): df_feat_inds.loc[index, 'original_fov_name'] = row['original_fov_location'].split('/')[-1] for index, row in df.iterrows(): df.loc[index, 'original_fov_location'] = df_feat_inds.loc[df_feat_inds['file_name'] == row['original_fov_name'], 'original_fov_location'].values.tolist()[0] # merge df df_new = df.merge(df_feat_inds, how='inner', on=['original_fov_name']) df_new = df_new.set_index('index') # Upload to quilt test_df = df_new[0:2] ds = Dataset( dataset=df_new, name='3d_actn2_segmentation', package_owner='calystay', readme_path=r'C:\Users\calystay\Desktop\README.md', ) ds.set_metadata_columns(["original_fov_location"]) ds.set_path_columns(["struc_seg_path"]) ds.distribute( "s3://allencell-internal-quilt", message="3D actn2 segmentation with original_fov_location" )
    image_name = row['image_name']
    location = list(
        set(df_feat_inds.loc[df_feat_inds['image_name'] == image_name, 'original_fov_location'])
    )[0]
    df.loc[index, 'original_fov_location'] = location

plot_df = plot_ds.merge(
    right=df,
    left_on=['FOV path', 'Cell number'],
    right_on=['original_fov_location', 'napariCell_ObjectNumber'],
)
plot_df = plot_df[[
    'original_fov_location',
    'napariCell_ObjectNumber',
    'seg_561_cell_dist_nuc_per_obj_median',
    'seg_638_cell_dist_nuc_per_obj_median',
]]
plot_df.to_csv('probe_localization_for_plot.csv')

test_df = df.loc[0:2]
ds = Dataset(
    dataset=df,
    name='probe_localization',
    package_owner='calystay',
    readme_path='C:/Users/calystay/Desktop/README.md',
)
ds.set_extra_files(['probe_localization_for_plot.csv'])
ds.set_metadata_columns(["original_fov_location"])
ds.distribute(
    "s3://allencell-internal-quilt",
    message="probe localization with original_fov_location",
)
import pandas as pd
from quilt3distribute import Dataset

df = pd.read_csv(
    '/allen/aics/microscopy/Calysta/test/fish_struc_seg/sarc_classification_for_Rory.csv'
)
df = df.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
df = df.rename(columns={
    'fov_path': 'original_fov_location',
    'cell_num': 'napariCell_ObjectNumber',
})
df = df[['nuc_mask_path', 'original_fov_location']]
df = df.drop_duplicates()

test_df = df.loc[0:2]
ds = Dataset(
    dataset=test_df,
    name='2d_nuclear_masks_test',
    package_owner='calystay',
    readme_path=r'C:\Users\calystay\Desktop\README.md',
)
ds.set_metadata_columns(["original_fov_location"])
ds.set_path_columns(["nuc_mask_path"])
ds.distribute(
    "s3://allencell-internal-quilt",
    message="2D nuclear masks with original_fov_location",
)
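# Read-back sketch (illustrative assumption, not part of the original script): a pushed
# package can be browsed with quilt3 to verify the manifest that quilt3distribute wrote.
import quilt3

pkg = quilt3.Package.browse(
    "calystay/2d_nuclear_masks_test", registry="s3://allencell-internal-quilt"
)
# "metadata.csv" is the manifest quilt3distribute adds to every package; calling the
# entry deserializes it to a pandas DataFrame
print(pkg["metadata.csv"]().head())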