# Shared imports for the helpers below (assumed: Dataset and validate come
# from the quilt3distribute package used throughout this repo).
import subprocess
from pathlib import Path

import pandas as pd

from quilt3distribute import Dataset
from quilt3distribute.validation import validate


def extra_additions_dataset(example_frame, example_readme):
    # build a Dataset exercising every optional configuration hook
    ds = Dataset(example_frame, "test_dataset", "me", example_readme)
    ds.set_path_columns(["2dReadPath"])
    ds.set_extra_files([example_readme])
    ds.set_column_names_map({"2dReadPath": "MappedPath"})
    ds.set_metadata_columns(["Structure"])
    return ds
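

# A minimal sketch of how the helper above might be exercised in a pytest
# suite; the test name and assertion are hypothetical, and example_frame /
# example_readme are assumed to be fixtures defined elsewhere.
def test_extra_additions_dataset(example_frame, example_readme):
    ds = extra_additions_dataset(example_frame, example_readme)
    # the helper should hand back the fully configured Dataset
    assert ds is not None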
def distribute_seg_dataset(
    test=False,
    csv_loc="../input_segs_and_tiffs/raw_seg_013_014_images.csv",
    col_name_map={
        "fov_path": "original_fov_location",
        "FOVId": "fov_id",
        "seg_file_name": "2D_fov_tiff_path",
    },
    dataset_name="2d_segmented_fields",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for, e.g., a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # structure scores as auxiliary file
    score_files = [
        Path(f"../structure_scores/structure_score_55000000{p}.csv") for p in (13, 14)
    ]
    score_dfs = [
        pd.read_csv(f).rename({"mh Score": "mh score"}, axis="columns")
        for f in score_files
    ]
    df_score = pd.concat(score_dfs, axis="rows", ignore_index=True, sort=False)
    df_score.to_csv(Path("../structure_scores/structure_scores.csv"))

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["2D_fov_tiff_path"])
    ds.set_extra_files(
        ["../channel_defs.json", "../structure_scores/structure_scores.csv"]
    )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
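

# A minimal usage sketch (not part of the original script): push the 2-row
# "_test" package first to smoke-test paths and metadata, then the full
# dataset. Assumes the relative CSV and score paths above resolve from the
# directory this script runs in.
def _example_push_seg_dataset():
    distribute_seg_dataset(test=True)   # small 2d_segmented_fields_test package
    distribute_seg_dataset(test=False)  # full 2d_segmented_fields package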
def distribute_scrnaseq_data(
    test=False,
    csv_loc="scrnaseq_data_raw.csv",
    dataset_name="scrnaseq_data",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    df = pd.read_csv(csv_loc)

    # subsample features to make test
    if test:
        # write test matrix
        make_test_mtx(csv_loc=csv_loc)

        # make test manifest; counts only; no anndata
        df = pd.DataFrame(
            {
                "counts": [
                    "raw_counts_test.mtx",
                    df["counts"][1],
                    "cells_test.csv",
                    "cells_test.csv",
                ]
            }
        )
        dataset_name = f"{dataset_name}_test"

        # create the dataset without supplementary files
        ds = Dataset(
            dataset=df,
            name=dataset_name,
            package_owner=package_owner,
            readme_path="README.md",
        )

        # columns with files to upload
        ds.set_path_columns(["counts"])
    else:
        ds = Dataset(
            dataset=df,
            name=dataset_name,
            package_owner=package_owner,
            readme_path="README.md",
        )

        # columns with files to upload
        ds.set_path_columns(["counts", "anndata"])

        # anndata object (h5ad) as supplementary files
        ds.set_extra_files(
            [
                "/allen/aics/gene-editing/RNA_seq/scRNAseq_SeeligCollaboration/2019_analysis/merged_experiment_1_2/scrnaseq_cardio_20191210.RData"
            ]
        )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
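

# Hypothetical invocation sketch: assumes scrnaseq_data_raw.csv sits next to
# this script and contains the "counts" and "anndata" path columns the
# function above expects, and that make_test_mtx is importable here.
def _example_push_scrnaseq():
    distribute_scrnaseq_data(test=True)  # counts-only test package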
def distribute_autocontrasted_dataset(
    test=False,
    csv_loc="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/image_manifest_final.csv",
    col_name_map={},
    dataset_name="2d_autocontrasted_fields_and_single_cells_actn2",
    package_owner="rorydm",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols and drop the raw tiff path col
    # (the original chained a second, redundant rename with the same map)
    df = df.rename(col_name_map, axis="columns")
    df = df.drop(["2D_fov_tiff_path"], axis="columns")

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for, e.g., a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(
        ["rescaled_2D_fov_tiff_path", "rescaled_2D_single_cell_tiff_path"]
    )
    ds.set_extra_files(
        [
            "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/channel_defs.json",
            "/allen/aics/gene-editing/FISH/2019/chaos/data/normalized_2D_tiffs/5500000075_B3/parameters.json",
        ]
    )

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
def distribute_nonstructure_dataset(
    test=False,
    csv_loc="nonstructure_fov_manifest_for_quilt.csv",
    col_name_map={
        "FOVId": "fov_id",
        "fov_path": "original_fov_location",
    },
    dataset_name="2d_nonstructure_fields",
    package_owner="tanyasg",
    s3_bucket="s3://allencell-internal-quilt",
):
    # read in original csv
    df = pd.read_csv(csv_loc)

    # rename some cols
    df = df.rename(col_name_map, axis="columns")

    # drop any rows with missing data
    vds = validate(df, drop_on_error=True)
    df = vds.data.reset_index(drop=True)

    # subsample df for, e.g., a test dataset
    if test:
        df = df.sample(2, random_state=0)
        dataset_name = f"{dataset_name}_test"

    # create the dataset
    ds = Dataset(
        dataset=df,
        name=dataset_name,
        package_owner=package_owner,
        readme_path="README.md",
    )

    # set data path cols, metadata cols, and extra files
    ds.set_metadata_columns(["fov_id", "original_fov_location"])
    ds.set_path_columns(["merged_2D_fov_tiff_path"])
    ds.set_extra_files(["channel_defs.json"])

    # tag with commit hash
    label = (
        subprocess.check_output(["git", "rev-parse", "HEAD"]).strip().decode("utf-8")
    )
    ds.distribute(
        s3_bucket, message=f"git commit hash of fish_morphology_code = {label}"
    )
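

# All of the distribute_* functions above repeat the same "tag with commit
# hash" step; a small helper like this (hypothetical, not in the repo) would
# de-duplicate it.
def current_commit_hash():
    """Return HEAD's commit hash, used as a provenance label in push messages."""
    return (
        subprocess.check_output(["git", "rev-parse", "HEAD"])
        .strip()
        .decode("utf-8")
    )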
# fill in original_fov_location for each row
# (index/row imply this ran inside an iterrows loop; header reconstructed)
for index, row in df.iterrows():
    image_name = row['image_name']
    location = list(
        set(
            df_feat_inds.loc[
                df_feat_inds['image_name'] == image_name, 'original_fov_location'
            ]
        )
    )[0]
    df.loc[index, 'original_fov_location'] = location

# join plotting data onto the feature table by FOV and cell number
plot_df = plot_ds.merge(
    right=df,
    left_on=['FOV path', 'Cell number'],
    right_on=['original_fov_location', 'napariCell_ObjectNumber'],
)
plot_df = plot_df[
    [
        'original_fov_location',
        'napariCell_ObjectNumber',
        'seg_561_cell_dist_nuc_per_obj_median',
        'seg_638_cell_dist_nuc_per_obj_median',
    ]
]
plot_df.to_csv('probe_localization_for_plot.csv')

test_df = df.loc[0:2]

ds = Dataset(
    dataset=df,
    name='probe_localization',
    package_owner='calystay',
    readme_path='C:/Users/calystay/Desktop/README.md',
)
ds.set_extra_files(['probe_localization_for_plot.csv'])
ds.set_metadata_columns(["original_fov_location"])
ds.distribute(
    "s3://allencell-internal-quilt",
    message="probe localization with original_fov_location",
)
# Add a license
ds.add_license("https://www.allencell.org/terms-of-use.html")

# Indicate column values to use for file metadata
ds.set_metadata_columns(
    [
        "CellId",
        "CellIndex",
        "CellLine",
        "NucMembSegmentationAlgorithm",
        "NucMembSegmentationAlgorithmVersion",
        "FOVId",
        "Gene",
        "PlateId",
        "WellId",
        "ProteinDisplayName",
        "StructureDisplayName",
        "Workflow",
        "FeatureExplorerURL",
    ]
)

# Set produced package directory naming
ds.set_column_names_map(
    {
        "save_feats_path": "cell_features",
        "save_reg_path": "cell_images_3d",
        "save_reg_path_flat": "cell_images_2d",
        "save_reg_path_flat_proj": "cell_images_2d_projections",
    }
)

# Add any extra files
ds.set_extra_files(
    {"contact_sheets": list(scp_output_dir.glob("diagnostics_*.png"))}
)

# Step 6:
# Distribute the package
ds.distribute(
    push_uri="s3://quilt-aics",
    message="Add feature explorer links to metadata",
)

print("-" * 80)
print("COMPLETE")
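
# A minimal sketch of sanity-checking the pushed package with quilt3's
# standard read API; the package name below is an assumption for
# illustration, not taken from the code above.
import quilt3

pkg = quilt3.Package.browse(
    "aics/example_package",  # hypothetical owner/name
    registry="s3://quilt-aics",
)
print(list(pkg.keys())[:5])  # peek at the top-level package entries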