def load_single_cell_compartment_csv(compartment_dir, compartment, metadata_cols):
    """
    Load and process columns for CellProfiler output data

    Arguments:
    compartment_dir - path to the directory where the compartment csv files are stored
    compartment - string representing the compartment to load (e.g. cytoplasm)
    metadata_cols - a list of columns to prefix with `Metadata_`. Note that the
        entries should not already be prefixed by compartment (e.g. AreaShape
        and not Cells_AreaShape)

    Output:
    A compartment dataframe with compartment-prefixed column names
    """
    # Setup compartment file
    compartment = compartment.capitalize()
    compartment_file = pathlib.Path(compartment_dir, f"{compartment}.csv")

    # Load compartment data and prefix every column with the compartment name
    compartment_df = read_csvs_with_chunksize(compartment_file)
    compartment_df.columns = [f"{compartment}_{x}" for x in compartment_df.columns]

    # Identify and rename metadata_cols
    metadata_rename = {}
    for col in metadata_cols:
        metadata_col = f"Metadata_{compartment}_{col}"
        metadata_rename[f"{compartment}_{col}"] = metadata_col

    compartment_df = compartment_df.rename(metadata_rename, axis="columns")

    return compartment_df
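# A minimal usage sketch (hypothetical paths and metadata columns; the real
# values come from the pipeline configuration):
#
#   cytoplasm_df = load_single_cell_compartment_csv(
#       compartment_dir="data/batch1/plate1",
#       compartment="cytoplasm",
#       metadata_cols=["ObjectNumber", "Parent_Cells", "Parent_Nuclei"],
#   )
#
# Feature columns come back as, e.g., "Cytoplasm_AreaShape_Area", and the
# listed metadata columns as, e.g., "Metadata_Cytoplasm_Parent_Cells".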
def load_compartments(core, example_dir):
    """Load each configured compartment csv, recode its columns, and return a
    dictionary of dataframes keyed by compartment name."""
    compartments = core["compartments"]
    data = {}
    for compartment in compartments:
        compart_file = get_compartment_file(compartment, example_dir)
        df = read_csvs_with_chunksize(compart_file)
        df = recode_cols(df, core, compartment)
        data[compartment] = df
    return data
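# Sketch of the expected return shape (the compartment names are hypothetical
# and come from core["compartments"] in practice):
#
#   data = load_compartments(core, example_dir="data/example_site")
#   data["Cells"]      # recoded Cells dataframe
#   data["Cytoplasm"]  # recoded Cytoplasm dataframe
#   data["Nuclei"]     # recoded Nuclei dataframe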
print("Starting 2.process-cells.") logging.info(f"Starting 2.process-cells.") cell_quality = CellQuality( quality_func, category_class_name=quality_col, category_col_index=quality_idx ) cell_category_dict = cell_quality.define_cell_quality() empty_cell_category = len(cell_category_dict) + 1 cell_category_dict[empty_cell_category] = "Empty" cell_category_df = pd.DataFrame(cell_category_dict, index=[quality_col]).transpose() # Enables feature filtering by loading the Cell Painting feature file. # 0.prefilter-features.py must be run first try: all_feature_df = read_csvs_with_chunksize(prefilter_file, sep="\t").query( "not prefilter_column" ) except FileNotFoundError: raise FileNotFoundError( "Error", f"{prefilter_file} not found. ", "Perform 0.prefilter-features.py prefilter before continuing...", ) # Load image metadata summary file to extract out important metadata indicators # 1.process-spots.py must be run first try: image_df = read_csvs_with_chunksize(input_image_file, sep="\t") except FileNotFoundError: raise FileNotFoundError( "Error",
split_info, separator="___") # Read and Merge Data cell_quality_list = [] site_stat_list = [] pert_counts_list = [] for data_split_site in site_info_dict: split_sites = site_info_dict[data_split_site] for site in split_sites: # Aggregates cell quality by site into single list cell_count_file = pathlib.Path( f"{input_paintdir}/{site}/cell_counts_{site}.tsv") cell_quality_list.append( read_csvs_with_chunksize(cell_count_file, sep="\t")) # Aggregates site summary stats into a single list site_stat_file = pathlib.Path(input_spotdir, site, f"site_stats.tsv") site_stat_list.append( read_csvs_with_chunksize(site_stat_file, sep="\t")) # Aggregates perturbation counts by site into a single list pert_count_file = pathlib.Path( input_spotdir, site, f"cell_perturbation_category_summary_counts.tsv") pert_counts_list.append( read_csvs_with_chunksize(pert_count_file, sep="\t")) # Creates dataframe from cell quality list cell_count_df = pd.concat(cell_quality_list,
    ),
)

print(
    f"Now performing feature selection for {data_level}...with operations: {feature_select_operations} for split {data_split_site}"
)
logging.info(
    f"Performing feature selection for {data_level} with operations: {feature_select_operations} for split {data_split_site}"
)

output_file = feature_select_output_files[data_level]
output_file = pathlib.Path(
    feature_select_output_files[data_level].parents[0],
    output_file.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"),
)

df = read_csvs_with_chunksize(file_to_feature_select)

feature_select(
    profiles=df,
    features=feature_select_features,
    samples=feature_select_drop_samples,
    operation=feature_select_operations,
    na_cutoff=feature_select_nacutoff,
    corr_threshold=feature_select_corr_threshold,
    output_file=output_file,
    compression_options=compression,
    float_format=float_format,
)

print("Finished 3.feature-select.")
logging.info("Finished 3.feature-select.")
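# For reference, a hedged sketch of how feature_select_operations is typically
# configured for pycytominer's feature_select (the values here are examples,
# not this pipeline's actual config):
#
#   feature_select_operations = [
#       "variance_threshold",     # drop near-zero-variance features
#       "correlation_threshold",  # drop one of each highly correlated pair
#       "drop_na_columns",        # drop features exceeding the NA cutoff
#       "blocklist",              # drop known-problematic CellProfiler features
#   ]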
    normalize_input_dir,
    file_to_normalize.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"),
)

print(
    f"Now normalizing {data_level}...with operation: {normalize_method} for split {data_split_site}"
)
logging.info(
    f"Normalizing {data_level} with operation: {normalize_method} for split {data_split_site}"
)

output_file = normalize_output_files[data_level]
output_file = pathlib.Path(
    normalize_output_files[data_level].parents[0],
    output_file.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"),
)

df = read_csvs_with_chunksize(file_to_normalize)

normalize(
    profiles=df,
    features=normalize_these_features,
    samples=normalize_by_samples,
    method=normalize_method,
    output_file=output_file,
    compression_options=compression,
    float_format=float_format,
)

print("Finished 2.normalize.")
logging.info("Finished 2.normalize.")
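# A hedged sketch of common pycytominer normalize() methods the config may
# select for normalize_method (these are assumptions, not this repo's defaults):
#
#   normalize_method = "standardize"    # z-score each feature
#   normalize_method = "mad_robustize"  # robust z-score using median and MAD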
force = args.force

# If the combined single cell file already exists, warn the user that this
# step will have no effect without --force
if single_file_only:
    if single_file_only_output_file.exists():
        if not force:
            warnings.warn(
                "Combined single cell file exists. Use '--force' to overwrite."
            )
            logging.warning("Combined single cell file already exists.")

print("Starting 0.merge-single-cells.")
logging.info("Starting 0.merge-single-cells.")

# Load preselected features
all_feature_df = read_csvs_with_chunksize(prefilter_file, sep="\t")

if prefilter_features:
    all_feature_df = all_feature_df.query("not prefilter_column")

# Pull out all sites that were measured
sites = [x.name for x in input_spotdir.iterdir() if x.name not in ignore_files]

site_info_dict = get_split_aware_site_info(
    config["experiment"], sites, split_info, separator="___"
)

allowed_skip_counter = 0
for data_split_site in site_info_dict:
    split_sites = site_info_dict[data_split_site]
    sc_df = []
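    # For orientation: each data_split_site key of site_info_dict is expected
    # to map to a list of site names in that split (names here are
    # hypothetical), e.g.
    #   {"train": ["plate1-A01-1", "plate1-A01-2"], "test": ["plate1-B02-1"]}
    # and sc_df accumulates one single cell dataframe per site below.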
for data_split_site in site_info_dict:
    # Define a dataset specific file
    single_cell_dataset_file = pathlib.Path(
        single_cell_output_dir,
        single_cell_file.name.replace(".csv.gz", f"_{data_split_site}.csv.gz"),
    )

    # Input argument flow control
    if aggregate_from_single_file:
        assert (
            single_cell_dataset_file.exists()
        ), "Error! The single cell file does not exist! Check 0.merge-single-cells.py"

    # Load single cell data
    if aggregate_from_single_file:
        print(f"Loading one single cell file: {single_cell_dataset_file}")
        single_cell_df = read_csvs_with_chunksize(single_cell_dataset_file, sep=",")
        logging.info(f"Loaded one single cell file: {single_cell_dataset_file}")
    else:
        sites = site_info_dict[data_split_site]
        print(f"Now loading data from {len(sites)} sites")
        logging.info(f"Loading data from {len(sites)} sites")

        single_cell_df = []
        for site in sites:
            site_file = single_cell_site_files[site]
            if site_file.exists():
                site_df = read_csvs_with_chunksize(site_file, sep=",")
                single_cell_df.append(site_df)
                print(f"Appended {site}")
                logging.info(f"Appended {site}")
            else:
force = plate_summary_config["force_overwrite"] perform = plate_summary_config["perform"] # check if this step should be performed if not perform: sys.exit("Config file set to perform=False, not performing {}".format(__file__)) # Forced overwrite can be achieved in one of two ways. # The command line overrides the config file, check here if it is provided if not force: force = args.force print("Starting 4.image-and-segmentation-qc.") logging.info(f"Started 4.image-and-segmentation-qc.") cell_count_df = read_csvs_with_chunksize(cell_count_file, sep="\t") # Creates x, y coordinates for plotting per-plate views. # Assumes image numbering starts in upper left corner and proceeds down final_order = [] for i in range(1, sites_per_image_grid_side + 1): build_seq = list( zip( ([i] * (sites_per_image_grid_side + 1)), reversed(range(1, (sites_per_image_grid_side + 1))), ) ) final_order += build_seq # Uses sites_list in case there are fewer analyzed sites than acquired sites sites_list = [*range(1, (sites_per_image_grid_side * sites_per_image_grid_side) + 1)]
allowed_skip_counter = 0
for data_split_site in site_info_dict:
    split_sites = site_info_dict[data_split_site]
    for site in split_sites:
        if allowed_skips >= allowed_skip_counter:
            print(f"Now processing spots for {site}...part of set {data_split_site}")
            logging.info(
                f"Now processing spots for {site}...part of set {data_split_site}"
            )

            # Load image metadata per site
            try:
                image_file = pathlib.Path(input_batchdir, site, "Image.csv")
                image_df = read_csvs_with_chunksize(image_file).assign(
                    Metadata_site=site, Metadata_dataset_split=data_split_site
                )
                image_list.append(image_df)

                # Obtain specific metadata info
                well = image_df.loc[:, image_cols["well"]].squeeze()
                plate = image_df.loc[:, image_cols["plate"]].squeeze()
                site_location = image_df.loc[:, image_cols["site"]].squeeze()
            except FileNotFoundError:
                print(f"{site} image metadata does not exist. Skipping...")
                logging.info(f"Skipped {site}. No Image.csv")
                continue
            except Exception:
                print(f"Couldn't parse {site} image metadata. Skipping...")
                logging.warning(f"Couldn't parse {site} image metadata. Skipping...")
                allowed_skip_counter += 1
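# Note on the skip accounting above (a reading of the existing logic, not a
# behavior change): allowed_skip_counter only increments on parse failures, so
# sites with a missing Image.csv do not count against the budget, and once the
# counter exceeds allowed_skips no further sites are processed.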