def set_pros_of_interest(self, protein_ids=[]):
    """
    protein_ids: a (possibly empty) list of protein_ids
    """
    check.t(protein_ids, list)
    self.all(lambda run: run.prep.set_pros_of_interest(protein_ids=protein_ids))
def from_prep_fixture(cls, prep_result, labels: str, n_edmans=5, priors=None):
    """
    Run a (likely small) simulation to make a SimResult fixture for testing.

    labels: a CSV list of aas. Eg: "DE,C"
        Common labels: "DE", "C", "Y", "K", "H"
    """
    from plaster.run.sim_v2.sim_v2_worker import sim_v2
    from plaster.tools.schema import check
    from plaster.run.priors import PriorsMLEFixtures

    check.t(labels, str)
    labels = labels.split(",")

    if priors is None:
        priors = PriorsMLEFixtures.val_defaults()

    sim_v2_params = SimV2Params.from_aa_list_fixture(
        labels,
        priors=priors,
        n_pres=1,
        n_mocks=0,
        n_edmans=n_edmans,
        use_lognormal_model=False,
        n_samples_train=100,
        n_samples_test=100,
        train_includes_radmat=True,
    )
    return sim_v2(sim_v2_params, prep_result)
def mat_flatter(mat):
    """
    Flatten to two dimensions. Eg: from (10, 20, 30) to (10, 600)
    """
    check.t(mat, np.ndarray)
    assert mat.ndim >= 2
    return mat.reshape(mat.shape[0], np.prod(mat.shape[1:]))
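# A minimal sketch exercising the flatten above on a toy array (numpy only;
# demo variables are hypothetical, not part of the module).
import numpy as np

demo_mat = np.arange(10 * 20 * 30).reshape(10, 20, 30)
demo_flat = demo_mat.reshape(demo_mat.shape[0], np.prod(demo_mat.shape[1:]))
assert demo_flat.shape == (10, 600)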
def validate_filters(cls, filters):
    """
    Validates filters against the schema, and fills in defaults where missing.

    This is a classmethod so that higher-level objects like JobResult can make
    use of filtering that seems more logical to group with our filters but is
    applied at a higher level (e.g. objective).
    """
    check.t(filters, Munch)
    cls.survey_filter_schema.apply_defaults(cls.defaults, filters)
    cls.survey_filter_schema.validate(filters)
def _dyemat_sim(sim_v2_params, pcbs, n_samples, progress=None):
    """
    Run a dyemat sim via the C fast_sim module.

    Arguments:
        sim_v2_params: SimV2Params
        pcbs: An encoding of flus. See SimV2Params.pcbs()
            Each peptide has a row per amino-acid with either a channel number
            or np.nan to indicate a label at that position, plus a p_bright
            for that aa.
        n_samples: The number of samples to try... BUT NOT NECESSARILY THE
            NUMBER RETURNED, because all-dark samples are not returned.
            See "Dealing with dark-rows" above.

    Returns:
        dyemat: ndarray(n_uniq_dyetracks, n_channels * n_cycles)
        dyepeps: ndarray(dye_i, pep_i, count)
        pep_recalls: ndarray(n_peps)
    """
    check.t(sim_v2_params, SimV2Params)
    check.array_t(pcbs, shape=(None, 3), dtype=float)

    # TODO: Refactor to use priors correctly.
    # The following assumes that all dyes have the same p_bleach.
    dyemat, dyepeps, pep_recalls = sim_v2_fast.sim(
        pcbs,
        n_samples,
        sim_v2_params.n_channels,
        len(sim_v2_params.labels),
        sim_v2_params.cycles_array(),
        # TODO: Needs to be per-channel and sampled correctly
        sim_v2_params.channel__priors().set_index("ch_i").iloc[0].p_bleach.sample(),
        # TODO: The following two need to be sampled correctly
        sim_v2_params.priors.get_mle("p_detach"),
        sim_v2_params.priors.get_mle("p_edman_failure"),
        sim_v2_params.allow_edman_cterm,
        n_threads=get_cpu_limit(),
        rng_seed=sim_v2_params.random_seed,
        progress=progress,
    )

    # Lex-sort the dyemat rows and remap dyepeps to the new row order
    n_rows, n_cols = dyemat.shape
    lex_cols = tuple(dyemat[:, n_cols - i - 1] for i in range(n_cols))
    sort_args = np.lexsort(lex_cols)
    lut = np.zeros((n_rows,), dtype=int)
    lut[sort_args] = np.arange(n_rows, dtype=int)
    dyepeps[:, 0] = lut[dyepeps[:, 0]]
    return dyemat[sort_args], dyepeps, pep_recalls
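# Sketch of the lex-sort/remap trick above on a toy dyemat (numpy only;
# variable names here are hypothetical, not from the module). The LUT maps
# old row indices to their positions after sorting, so dyepeps keeps
# pointing at the right dyetracks.
import numpy as np

dyemat = np.array([[1, 0], [0, 1], [0, 0]])
dyepeps = np.array([[0, 7, 5], [2, 7, 3]])  # (dyt_i, pep_i, count)

n_rows, n_cols = dyemat.shape
sort_args = np.lexsort(tuple(dyemat[:, n_cols - i - 1] for i in range(n_cols)))
lut = np.zeros((n_rows,), dtype=int)
lut[sort_args] = np.arange(n_rows, dtype=int)
dyepeps[:, 0] = lut[dyepeps[:, 0]]  # dyt indices now refer to sorted rows

assert np.array_equal(dyemat[sort_args], np.array([[0, 0], [0, 1], [1, 0]]))
assert np.array_equal(dyepeps[:, 0], np.array([2, 0]))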
def __init__(self, job_folders, include_manifest=True):
    check.t(job_folders, list)
    self.job_folder = "MultiJobResult has multiple folders in job_folders"
    self.job_folders = []
    self._run_results = {}
    for job_folder in job_folders:
        job_folder = assets.validate_job_folder_return_path(job_folder)
        self.job_folders += [job_folder]
        self._run_results.update({
            run_folder.name: RunResult(run_folder, include_manifest=include_manifest)
            for run_folder in job_folder
            if run_folder.is_dir() and "run_manifest.yaml" in run_folder
        })
def _do_store_get_cache_or_execute(run=None, key=None, inner_fn=None, _args=None, _clear_cache=False):
    """
    If the key is in the run's store, return the cached value; otherwise
    execute inner_fn. The first element of the returned tuple indicates
    whether the value came from the cache.
    """
    check.t(run, RunResult)
    check.t(key, str)
    if _clear_cache:
        run.store.rm(key)
    if key not in run.store:
        return False, inner_fn(*_args)
    else:
        return True, run.store[key]
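# A minimal sketch of the cache-or-execute pattern above, using a plain dict
# as the store (the helper name is hypothetical, standing in for run.store).
def _cached_or_computed(store, key, fn, *args, clear=False):
    if clear:
        store.pop(key, None)
    if key not in store:
        return False, fn(*args)
    return True, store[key]

store = {}
hit, val = _cached_or_computed(store, "answer", lambda: 42)
assert (hit, val) == (False, 42)
store["answer"] = val  # the caller is responsible for writing back
hit, val = _cached_or_computed(store, "answer", lambda: 42)
assert (hit, val) == (True, 42)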
def _label_str_permutate(self, label_str):
    """
    Return a list of permutations of a label_str such as:
        "A,B,C:2" => ("A", "B"), ("A", "C"), ("B", "C")

    A suffix label set may be added to each permutation with +:
        "A,B,C:2+S" => ("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")
        "A,B,C:2+S,T" => ("A", "B", "S", "T"), ("A", "C", "S", "T"), ("B", "C", "S", "T")
    """
    check.t(label_str, str)
    semi_split = label_str.split(":")

    if len(semi_split) > 2:
        raise ValueError(f"Label-set '{label_str}' has >1 colon.")

    suffix_labels = ""
    if len(semi_split) == 2:
        suffix_split = semi_split[1].split("+")
        if len(suffix_split) > 2:
            raise ValueError(f"Label-set '{label_str}' has >1 plus.")
        if len(suffix_split) == 2:
            semi_split = [semi_split[0], suffix_split[0]]
            suffix_labels = suffix_split[1].split(",")
            suffix_labels = [slabel.strip() for slabel in suffix_labels]

    labels = semi_split[0].split(",")
    labels = [label.strip() for label in labels]

    if len(semi_split) == 1:
        perm_count = len(labels)
    else:
        perm_count = int(semi_split[1])
        if not 0 < perm_count < len(labels):
            raise ValueError(
                f"Label-set '{label_str}' has a permutation count "
                f"of {perm_count}; needs to be between 1 and {len(labels) - 1}"
            )

    perms = list(itertools.combinations(labels, perm_count))
    if suffix_labels:
        perms = [p + tuple(suffix_labels) for p in perms]
    return perms
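# Sketch of the expected expansion using itertools directly: a hypothetical
# standalone demo of the "A,B,C:2+S" form described in the docstring.
import itertools

labels, perm_count, suffix = ["A", "B", "C"], 2, ("S",)
perms = [p + suffix for p in itertools.combinations(labels, perm_count)]
assert perms == [("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")]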
def erisyon(generator_name="", sample="", run_name="", **kwargs):
    """
    This method is an example of name-space reservation.
    Certain fields, like proteases and aa_list, are emitted here not because
    all generators have them but to prevent multiple generators from having
    *different* meanings for these fields.
    """
    check.t(generator_name, str)
    check.t(sample, str)
    return Munch(
        _erisyon=Munch(
            run_name=run_name,
            run_pk=utils.random_str(8),
            sample=sample,
            generator_name=generator_name,
            **kwargs,
        )
    )
def ims_import(
    src_dir,
    is_movie=False,
    n_cycles_limit=None,
    start_cycle=0,
    dst_ch_i_to_src_ch_i=None,
):
    if dst_ch_i_to_src_ch_i is not None:
        check.t(dst_ch_i_to_src_ch_i, str)
        dst_ch_i_to_src_ch_i = [int(ch_i) for ch_i in dst_ch_i_to_src_ch_i.split(",")]

    return Munch(
        ims_import=Munch(
            version="1.0",
            inputs=Munch(src_dir=src_dir),
            parameters=Munch(
                is_movie=is_movie,
                n_cycles_limit=n_cycles_limit,
                start_cycle=start_cycle,
                dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
            ),
        )
    )
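# Sketch: what a dst_ch_i_to_src_ch_i string like "2,1,0" is parsed into, and
# how such a mapping can reorder channels with fancy indexing. The reorder
# semantics (dst channel i reads src channel map[i]) is an assumption here;
# the actual consumption of the parameter happens downstream of ims_import.
import numpy as np

dst_ch_i_to_src_ch_i = [int(ch_i) for ch_i in "2,1,0".split(",")]
assert dst_ch_i_to_src_ch_i == [2, 1, 0]

src_chs = np.array([10, 11, 12])  # one value per source channel
dst_chs = src_chs[dst_ch_i_to_src_ch_i]
assert dst_chs.tolist() == [12, 11, 10]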
def save_field(self, field_i, field_chcy_ims, metadata_by_cycle=None, chcy_qualities=None):
    """
    When using parallel field maps we cannot save into the result because
    that will not be serialized back to the main thread. Rather, all
    field-oriented results are written to a temporary pickle file and are
    reduced to a single value in the main thread's result instance.
    """
    check.t(field_chcy_ims, ArrayResult)
    field_chcy_ims.flush()
    if metadata_by_cycle is not None:
        utils.pickle_save(self._field_metadata_filename(field_i), metadata_by_cycle)
    if chcy_qualities is not None:
        utils.pickle_save(self._field_qualities_filename(field_i), chcy_qualities)
def monotonic(bal_sig, beta, lif_len, monotonic_threshold=1.0):
    """
    Examine a cycle-balanced radmat (one channel) for the maximum increase
    in intensity per row and normalize by beta. This puts it roughly into
    units of dye-count.

    Arguments:
        bal_sig: ndarray(n_peaks, n_cycles). Cycle balanced.
        beta: float. Approximate intensity per dye.
        lif_len: ndarray(n_peaks). Lifespan of each row in cycles.
        monotonic_threshold: float. In dye-count units, max increase allowed.

    Returns:
        monotonic_metric: ndarray(n_peaks). Max increase in any cycle for
            each peak, in dye counts.
        good_mask: ndarray(n_peaks, dtype=bool). False where
            monotonic_metric > monotonic_threshold and lif_len > 1 and the
            first cycle is not dark; True (good) everywhere else.
    """
    check.array_t(bal_sig, ndim=2)
    check.t(beta, float)
    check.array_t(lif_len, ndim=1)
    check.t(monotonic_threshold, float)
    assert len(lif_len) == bal_sig.shape[0]

    # Mask out cycles beyond each row's lifespan
    _, col_iz = np.indices(bal_sig.shape)
    sig_lif = np.where(col_iz < lif_len[:, None], bal_sig, np.nan)

    with utils.np_no_warn():
        d = np.diff(sig_lif, append=0.0, axis=1)
        maxs_diff = np.nanmax(d, axis=1)

    monotonic_metric = maxs_diff / beta
    monotonic_metric_exceeds_thresh_mask = monotonic_metric > monotonic_threshold
    lif_gt_1_mask = lif_len > 1
    starts_high_mask = bal_sig[:, 0] > 0.8 * beta
    good_mask = ~(monotonic_metric_exceeds_thresh_mask & lif_gt_1_mask & starts_high_mask)
    return monotonic_metric, good_mask
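# Sketch of the lifespan masking + per-row max-increase metric on toy data
# (numpy only; np_no_warn and check are plaster helpers, omitted here).
import numpy as np

bal_sig = np.array([[2.0, 1.0, 0.0], [1.0, 3.0, 0.0]])
lif_len = np.array([3, 2])
beta = 1.0

_, col_iz = np.indices(bal_sig.shape)
sig_lif = np.where(col_iz < lif_len[:, None], bal_sig, np.nan)
d = np.diff(sig_lif, append=0.0, axis=1)
metric = np.nanmax(d, axis=1) / beta
# Row 0 only ever decreases; row 1 jumps from 1 to 3 (an increase of 2 dyes).
assert metric[0] == 0.0 and metric[1] == 2.0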
def test_nn(test_nn_params, prep_result, sim_result, progress=None, pipeline=None):
    n_channels, n_cycles = sim_result.params.n_channels_and_cycles

    n_phases = 6 if test_nn_params.include_training_set else 3
    if pipeline is not None:
        pipeline.set_phase(0, n_phases)

    shape = sim_result.test_radmat.shape
    assert len(shape) == 4
    test_radmat = sim_result.test_radmat.reshape((shape[0] * shape[1], shape[2], shape[3]))
    test_dyemat = sim_result.test_dyemat.reshape((shape[0] * shape[1], shape[2], shape[3]))

    test_result = nn(
        test_nn_params,
        sim_result,
        radmat=test_radmat,
        true_dyemat=test_dyemat,
        progress=progress,
    )

    test_result.true_pep_iz = ArrayResult(
        filename="test_true_pep_iz",
        shape=(shape[0] * shape[1],),
        dtype=IndexType,
        mode="w+",
    )
    test_result.true_pep_iz[:] = np.repeat(np.arange(shape[0]).astype(IndexType), shape[1])
    check.t(test_result.true_pep_iz, ArrayResult)
    check.t(test_result.pred_pep_iz, ArrayResult)

    call_bag = CallBag(
        true_pep_iz=test_result.true_pep_iz.arr(),
        pred_pep_iz=test_result.pred_pep_iz.arr(),
        scores=test_result.scores.arr(),
        prep_result=prep_result,
        sim_result=sim_result,
    )

    if pipeline is not None:
        pipeline.set_phase(1, n_phases)

    test_result.peps_pr = call_bag.pr_curve_by_pep(progress=progress)

    # If there is abundance information, compute the abundance-adjusted PR.
    # This call returns None if there is no abundance info available.
    if pipeline is not None:
        pipeline.set_phase(2, n_phases)

    test_result.peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(progress=progress)

    if test_nn_params.include_training_set:
        # Permit testing for over-fitting by classifying on the train data
        if pipeline is not None:
            pipeline.set_phase(3, n_phases)

        real_pep_iz = prep_result.peps__no_decoys().pep_i.values
        keep_rows = np.isin(sim_result.train_true_pep_iz, real_pep_iz)
        train_radmat = sim_result.train_radmat[keep_rows]
        train_dyemat = sim_result.train_dyemat[keep_rows]
        assert train_radmat.shape == shape

        train_result = nn(
            test_nn_params,
            sim_result,
            radmat=train_radmat,
            true_dyemat=train_dyemat,
            progress=progress,
        )
        train_result.true_pep_iz = ArrayResult(
            filename="train_true_pep_iz",
            shape=(shape[0] * shape[1],),
            dtype=IndexType,
            mode="w+",
        )
        train_result.true_pep_iz[:] = np.repeat(np.arange(shape[0]).astype(IndexType), shape[1])
        check.t(train_result.true_pep_iz, ArrayResult)
        check.t(train_result.pred_pep_iz, ArrayResult)

        call_bag = CallBag(
            true_pep_iz=train_result.true_pep_iz.arr(),
            pred_pep_iz=train_result.pred_pep_iz.arr(),
            scores=train_result.scores.arr(),
            prep_result=prep_result,
            sim_result=sim_result,
        )

        if pipeline is not None:
            pipeline.set_phase(4, n_phases)

        train_result.peps_pr = call_bag.pr_curve_by_pep(progress=progress)

        if pipeline is not None:
            pipeline.set_phase(5, n_phases)

        train_result.peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(progress=progress)
    else:
        train_result = {k: None for k in test_result.keys()}

    def rename(d, prefix):
        return {f"{prefix}{k}": v for k, v in d.items()}

    return TestNNResult(
        params=test_nn_params,
        **rename(test_result, "test_"),
        **rename(train_result, "train_"),
    )
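# Sketch: how the true_pep_iz vector above is constructed. With a radmat of
# shape (n_peps, n_samples, n_channels, n_cycles) flattened on its first two
# axes, each peptide index repeats once per sample (numpy only; toy sizes).
import numpy as np

n_peps, n_samples = 3, 2
true_pep_iz = np.repeat(np.arange(n_peps), n_samples)
assert true_pep_iz.tolist() == [0, 0, 1, 1, 2, 2]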
def __init__(self, reg_psf: RegPSFPrior, **kws):
    check.t(reg_psf, RegPSFPrior)
    self.reg_psf = reg_psf
    super().__init__(**kws)
def task_rename(task_block, new_name):
    """Expects a task dictionary that has only one root key, then renames that key"""
    check.t(task_block, Munch)
    old_name = utils.get_root_key(task_block)
    task_block[old_name].task = old_name
    utils.ren_key(task_block, old_name, new_name)
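# Sketch of the rename behavior with plain dict/Munch operations (Munch is
# the real munch package; get_root_key/ren_key are plaster utils, crudely
# reimplemented inline here as hypothetical stand-ins).
from munch import Munch

task_block = Munch(sigproc_v2=Munch(parameters=Munch()))
old_name = next(iter(task_block.keys()))              # the single root key
task_block[old_name].task = old_name                  # record the original task type
task_block["sigproc_v2_1"] = task_block.pop(old_name)  # the rename itself
assert task_block.sigproc_v2_1.task == "sigproc_v2"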
def it_checks_type_tuples():
    some_float = 1.0
    some_int = 1
    check.t(some_float, (float, int))
    check.t(some_int, (float, int))
def test_func():
    some_float = 1.0
    check.t(some_float, int)
def it_converts_none_to_type_none_scalar():
    a = None
    check.t(a, None)
def it_converts_none_to_type_none_tuple():
    a = None
    check.t(a, (None,))
def add(self, name, prior, source=None):
    check.t(prior, Prior)
    if source is not None:
        prior.source = source
    assert prior.source is not None
    self.priors[name] = prior
def context(
    train_dyemat,
    train_dyepeps,
    radmat,
    radmat_filter_mask,
    priors,
    n_channels,
    n_neighbors=8,
    run_row_k_fit=True,
    run_against_all_dyetracks=False,
    scoring_verbose=False,
    scoring_verbose_cc=False,
    use_row_k_p_val=True,
    row_k_score_factor=1.0,
):
    """
    with nn_v2.context(...) as ctx:
        zap.work_orders(do_classify_radrows, ...)
    """
    lib = load_lib()

    check.t(priors, Priors)

    output_dtype = NNV2Context.tab_type("output")
    n_radrows = radmat.shape[0]
    output = np.zeros((n_radrows, NNV2ContextOutputFields.n_fields), dtype=output_dtype)

    # This is a possible place to optimize to avoid this conversion to float,
    # but as it is now it is needed because the FLANN needs to look up by
    # float, so it is easier to convert it all here to RadType.
    train_fdyemat = train_dyemat.astype(RadType)
    n_dyts = train_fdyemat.shape[0]

    assert train_fdyemat.shape[1] == radmat.shape[1]
    n_cols = train_fdyemat.shape[1]
    n_cycles = n_cols // n_channels
    assert n_cycles * n_channels == n_cols

    illum_model = priors.helper_illum_model(n_channels)

    # TODO: Clean up legacy gain_model naming conventions (esp. in the C code)
    row_k_beta = 1.0
    row_k_sigma = priors.get_mle("row_k_sigma")

    against_all_dyetracks_output_dtype = None
    against_all_dyetracks_output = None
    if run_against_all_dyetracks:
        against_all_dyetracks_output_dtype = NNV2Context.tab_type("against_all_dyetracks_output")
        against_all_dyetracks_output = np.zeros(
            (n_radrows, 3 * n_dyts), dtype=against_all_dyetracks_output_dtype
        )

    scoring_verbose_output_dtype = None
    scoring_verbose_output = None
    scoring_verbose_cc_output_dtype = None
    scoring_verbose_cc_output = None
    if scoring_verbose:
        scoring_verbose_output_dtype = NNV2Context.tab_type("scoring_verbose_output")
        scoring_verbose_output = np.zeros(
            (n_radrows * n_neighbors, len(NNV2ScoringVerboseFields.col_names)),
            dtype=scoring_verbose_output_dtype,
        )
    if scoring_verbose_cc:
        scoring_verbose_cc_output_dtype = NNV2Context.tab_type("scoring_verbose_cc_output")
        n_chcy = train_fdyemat.shape[1]
        scoring_verbose_cc_output = np.zeros(
            (n_radrows * n_neighbors * n_chcy, 4), dtype=scoring_verbose_cc_output_dtype
        )

    nn_v2_context = NNV2Context(
        train_fdyemat=Tab.from_mat(train_fdyemat, NNV2Context.tab_type("train_fdyemat")),
        train_dyepeps=Tab.from_mat(train_dyepeps, NNV2Context.tab_type("train_dyepeps")),
        radmat=Tab.from_mat(radmat, NNV2Context.tab_type("radmat")),
        radmat_filter_mask=Tab.from_mat(radmat_filter_mask, NNV2Context.tab_type("radmat_filter_mask")),
        _radmat_filter_mask=radmat_filter_mask,
        ch_gain_model=Tab.from_mat(illum_model, NNV2Context.tab_type("ch_gain_model")),
        row_k_beta=row_k_beta,
        row_k_sigma=row_k_sigma,
        row_k_score_factor=row_k_score_factor,
        n_neighbors=n_neighbors,
        run_row_k_fit=run_row_k_fit,
        run_against_all_dyetracks=run_against_all_dyetracks,
        scoring_verbose=scoring_verbose,
        scoring_verbose_cc=scoring_verbose_cc,
        use_row_k_p_val=use_row_k_p_val,
        n_cols=n_cols,
        n_channels=n_channels,
        n_cycles=n_cycles,
        output=Tab.from_mat(output, output_dtype),
        _output=output,
        against_all_dyetracks_output=Tab.from_mat(
            against_all_dyetracks_output, against_all_dyetracks_output_dtype
        ),
        _against_all_dyetracks_output=against_all_dyetracks_output,
        scoring_verbose_output=Tab.from_mat(scoring_verbose_output, scoring_verbose_output_dtype),
        _scoring_verbose_output=scoring_verbose_output,
        scoring_verbose_cc_output=Tab.from_mat(
            scoring_verbose_cc_output, scoring_verbose_cc_output_dtype
        ),
        _scoring_verbose_cc_output=scoring_verbose_cc_output,
    )

    assert (
        ((-1e5 < radmat) & (radmat < 1e6)).sum() > 0.5 * radmat.size
    ), "Too many values are out of bounds for radmat"
    assert radmat.dtype == RadType

    error = lib.context_init(nn_v2_context)
    if error is not None:
        raise CException(error)

    try:
        yield nn_v2_context
    finally:
        lib.context_free(nn_v2_context)
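# Sketch of the init/yield/free pattern context() follows, with a dummy
# resource in place of the C library (contextlib only; names hypothetical).
from contextlib import contextmanager

@contextmanager
def managed_resource():
    handle = {"open": True}     # stands in for lib.context_init(...)
    try:
        yield handle
    finally:
        handle["open"] = False  # stands in for lib.context_free(...)

with managed_resource() as h:
    assert h["open"]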
def get_sample(self, request_name):
    found_prior = super().get(request_name)
    prior = found_prior.prior
    check.t(prior, MLEPrior)
    return prior.sample()
def validate_job_folder(job_folder, allow_run_folders=False):
    """
    job_folder can be:
        * Canonical (relative):
            ./jobs_folder/job_folder
        * Stand-alone, in which case it is assumed to be ./jobs_folder/job_folder:
            job_folder
        * URL-like, in which case it is converted to ./jobs_folder/job_folder:
            //jobs_folder/job_folder
        * Absolute (must be inside jobs_folder):
            ${ERISYON_ROOT}/jobs_folder/job_folder

    Run folders are optionally allowed:
        ./jobs_folder/job_folder/run

    DEPRECATED:
        /path/to/file
        (If there is already a symlink in ./jobs_folder to this file)

    Returns:
        The job_folder alone (without ./jobs_folder) or job_folder/run_folder

    Raises:
        On any unrecognized form.
    """
    root_jobs_folder = jobs_folder()

    # NORMALIZE into string form
    if isinstance(job_folder, LocalPath):
        # If plumbum style, the path must be absolute
        if not str(job_folder).startswith("/"):
            raise ValueError("job_folder passed by plumbum path must be absolute")
        job_folder = str(job_folder)

    check.t(job_folder, str)

    # CONVERT URL-like form into canonical form
    if job_folder.startswith("//jobs_folder/"):
        job_folder = "./jobs_folder/" + job_folder[len("//jobs_folder/"):]

    # CONVERT stand-alone into canonical. Referenced directory must be in the root_jobs_folder
    if "/" not in job_folder:
        job_folder = "./jobs_folder/" + job_folder

    # CONVERT absolute to relative
    if job_folder.startswith(jobs_folder_as_str()):
        job_folder = "./jobs_folder" + job_folder[len(jobs_folder_as_str()):]

    if not job_folder.startswith("./"):
        raise ValueError(
            f"job_folder canonical form starts with './' but found: {job_folder}"
        )

    # Now in canonical form, convert to absolute path
    abs_path = root_jobs_folder / job_folder[len("./jobs_folder/"):]  # Strip "./jobs_folder"

    def check_exists():
        if not abs_path.exists():
            raise FileNotFoundError("Unknown job or run folder")

    parts = job_folder.split("/")[2:]  # Skip the initial ".", "jobs_folder"
    if parts[-1] == "":
        del parts[-1]
    n_parts = len(parts)

    if allow_run_folders:
        if n_parts > 2:
            raise ValueError(f"{job_folder} is too many levels deep for a job_folder spec.")
        if n_parts == 2:
            check_exists()
            return f"{parts[0]}/{parts[1]}"
        if n_parts == 1:
            check_exists()
            return parts[0]
    else:
        if n_parts != 1:
            raise ValueError(f"{job_folder} is too many levels deep for a job_folder spec.")
        check_exists()
        return parts[0]
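# Sketch of the string canonicalization above on the URL-like and stand-alone
# forms (pure string handling; _canonicalize is a hypothetical helper and the
# jobs_folder resolution itself is omitted).
def _canonicalize(job_folder):
    if job_folder.startswith("//jobs_folder/"):
        job_folder = "./jobs_folder/" + job_folder[len("//jobs_folder/"):]
    if "/" not in job_folder:
        job_folder = "./jobs_folder/" + job_folder
    return job_folder

assert _canonicalize("//jobs_folder/my_job") == "./jobs_folder/my_job"
assert _canonicalize("my_job") == "./jobs_folder/my_job"
assert _canonicalize("./jobs_folder/my_job") == "./jobs_folder/my_job"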
def tasks_for_sigproc_v2(self):
    tasks = {}
    if self.sigproc_source:
        ims_import_task = task_templates.ims_import(
            self.sigproc_source,
            is_movie=self.movie,
            n_cycles_limit=self.n_cycles_limit,
            start_cycle=self.start_cycle,
            dst_ch_i_to_src_ch_i=self.dst_ch_i_to_src_ch_i,
        )

        calib_priors = None
        if self.calibration_job is not None:
            calib_src_path = (
                local.path(self.calibration_job) / "sigproc_v2_calib/plaster_output/sigproc_v2"
            )
            calib_result = SigprocV2Result.load_from_folder(
                calib_src_path, prop_list=["calib_priors"]
            )
            calib_priors = calib_result.calib_priors

            if self.calib_dst_ch_i_to_src_ch_i is not None:
                # Convert a string like "2,1,0" and remap
                check.t(self.calib_dst_ch_i_to_src_ch_i, str)
                calib_dst_ch_i_to_src_ch_i = [
                    int(ch_i) for ch_i in self.calib_dst_ch_i_to_src_ch_i.split(",")
                ]

                ch_remapped_priors = Priors.copy(calib_priors)
                ch_remapped_priors.delete_ch_specific_records()

                ch_aln_prior = ch_remapped_priors.get_exact("ch_aln")
                if ch_aln_prior is not None:
                    ch_aln_prior = ChannelAlignPrior.ch_remap(
                        ch_aln_prior.prior, calib_dst_ch_i_to_src_ch_i
                    )

                for dst_ch_i, src_ch_i in enumerate(calib_dst_ch_i_to_src_ch_i):

                    def remap(src_key, dst_key):
                        prior = calib_priors.get_exact(src_key)
                        if prior is not None:
                            ch_remapped_priors.add(dst_key, prior.prior, "remapped channel in gen")

                    remap(f"reg_illum.ch_{src_ch_i}", f"reg_illum.ch_{dst_ch_i}")
                    remap(f"reg_psf.ch_{src_ch_i}", f"reg_psf.ch_{dst_ch_i}")

                calib_priors = ch_remapped_priors

        ch_aln = None
        if self.ch_aln is not None:
            ch_aln = np.array([float(i) for i in self.ch_aln.split(",")])
            assert ch_aln.shape[0] % 2 == 0
            ch_aln = ch_aln.reshape((-1, 2))

        sigproc_v2_task = task_templates.sigproc_v2_analyze(
            calib_priors=calib_priors,
            self_calib=self.self_calib,
            ch_aln=ch_aln,
            ch_for_alignment=self.ch_for_alignment,
        )
        tasks = Munch(**ims_import_task, **sigproc_v2_task)
    return tasks
def call_bag(self):
    check.t(self._call_bag, CallBag)
    return self._call_bag
def sim_v2(sim_v2_params, prep_result, progress=None, pipeline=None):
    test_dyemat = None
    test_radmat = None
    test_true_pep_iz = None
    test_true_dye_iz = None
    test_true_row_ks = None
    train_radmat = None
    train_true_pep_iz = None
    train_true_dye_iz = None
    train_true_row_ks = None

    phase_i = 0
    n_phases = 1
    if sim_v2_params.train_includes_radmat:
        n_phases += 1
    if not sim_v2_params.is_survey:
        n_phases += 2

    # Training data
    #   * always includes decoys
    #   * may include radiometry
    # -----------------------------------------------------------------------

    # debug("gen flus")
    # train_flus, train_pi_brights = _gen_flus(sim_v2_params, prep_result.pepseqs())
    # debug("gen flus done")

    # RANDOM cleanup
    # Make the pipeline have a stub so I don't have to "if pipeline..."
    # Get rid of phases and just pass in a name to display
    if pipeline:
        pipeline.set_phase(phase_i, n_phases)
        phase_i += 1

    n_channels, n_cycles = sim_v2_params.n_channels_and_cycles

    train_dyemat, train_dyepeps, train_pep_recalls = prep_result.get_photobleaching()
    if train_dyemat is None:
        # This is a regular, non-photobleaching run
        pepseqs = prep_result.pepseqs__with_decoys()
        check.t(pepseqs, pd.DataFrame)  # (pep_i, aa, pep_off_in_pro)
        pcbs = sim_v2_params.pcbs(pepseqs)  # (p)ep_i, (c)hannel_i, (b)right_probability

        train_dyemat, train_dyepeps, train_pep_recalls = _dyemat_sim(
            sim_v2_params,
            pcbs,
            sim_v2_params.n_samples_train,
            progress,
        )

    n_dyts = train_dyemat.shape[0]
    check.array_t(
        train_dyemat,
        shape=(n_dyts, n_channels * n_cycles),  # unique dyetracks (n_rows, n_channels * n_cycles)
    )

    # dyepeps are a map between dyetracks and peptides with a count
    # Example:
    #   (2, 5, 110) => dyt_i=2 was generated by pep_i=5 110 times
    #   (2, 7, 50)  => dyt_i=2 was generated by pep_i=7 50 times
    check.array_t(train_dyepeps, shape=(None, 3))  # (dyt_i, pep_i, count)
    assert np.max(train_dyepeps[:, 0]) + 1 == n_dyts

    # SORT dyepeps by dyetrack (col 0) first, then reverse by count (col 2).
    # Note that np.lexsort puts the primary sort key LAST in the argument.
    # Seems like this sorting should be in _dyemat_sim?
    train_dyepeps = train_dyepeps[np.lexsort((-train_dyepeps[:, 2], train_dyepeps[:, 0]))]

    if sim_v2_params.train_includes_radmat:
        if pipeline:
            pipeline.set_phase(phase_i, n_phases)
            phase_i += 1

        (
            train_radmat,
            train_true_pep_iz,
            train_true_dye_iz,
            train_true_row_ks,
        ) = _radmat_sim(
            train_dyemat.reshape(
                (train_dyemat.shape[0], sim_v2_params.n_channels, sim_v2_params.n_cycles)
            ),
            train_dyepeps,
            sim_v2_params.by_channel(),
            sim_v2_params.n_samples_train,
            sim_v2_params.n_channels,
            sim_v2_params.n_cycles,
            sim_v2_params.use_lognormal_model,
            progress,
        )

    # Test data
    #   * does not include decoys
    #   * always includes radiometry
    #   * may include dyetracks
    #   * skipped if is_survey
    # -----------------------------------------------------------------------
    if not sim_v2_params.is_survey:
        # test_flus, test_pi_brights = _gen_flus(
        #     sim_v2_params, prep_result.pepseqs__no_decoys()
        # )

        if pipeline:
            pipeline.set_phase(phase_i, n_phases)
            phase_i += 1

        test_dyemat, test_dyepeps, test_pep_recalls = prep_result.get_photobleaching()
        if test_dyemat is None:
            # This is a regular, non-photobleaching run
            test_dyemat, test_dyepeps, test_pep_recalls = _dyemat_sim(
                sim_v2_params,
                sim_v2_params.pcbs(prep_result.pepseqs__no_decoys()),
                sim_v2_params.n_samples_test,
                progress,
            )

        # SORT dyepeps by dyetrack (col 0) first, then reverse by count (col 2).
        # Note that np.lexsort puts the primary sort key LAST in the argument.
        test_dyepeps = test_dyepeps[np.lexsort((-test_dyepeps[:, 2], test_dyepeps[:, 0]))]

        if pipeline:
            pipeline.set_phase(phase_i, n_phases)
            phase_i += 1

        (
            test_radmat,
            test_true_pep_iz,
            test_true_dye_iz,
            test_true_row_ks,
        ) = _radmat_sim(
            test_dyemat.reshape(
                (test_dyemat.shape[0], sim_v2_params.n_channels, sim_v2_params.n_cycles)
            ),
            test_dyepeps,
            sim_v2_params.channel__priors(),
            sim_v2_params.n_samples_test,
            sim_v2_params.n_channels,
            sim_v2_params.n_cycles,
            sim_v2_params.use_lognormal_model,
            progress,
        )

        if not sim_v2_params.allow_train_test_to_be_identical:
            # Move to a standalone _method
            # TASK: Add a dyepeps check
            # train_dyepeps_df = pd.DataFrame(train_dyepeps, columns=["dye_i", "pep_i", "count"])
            # test_dyepeps_df = pd.DataFrame(test_dyepeps, columns=["dye_i", "pep_i", "count"])
            # joined_df = train_dyepeps_df.set_index("pep_i").join(
            #     test_dyepeps_df.set_index("pep_i")
            # )

            if train_radmat is not None and train_radmat.shape[0] == test_radmat.shape[0]:
                check.affirm(
                    not _any_identical_non_zero_rows(
                        train_radmat.reshape(
                            (train_radmat.shape[0], train_radmat.shape[1] * train_radmat.shape[2])
                        ),
                        test_radmat.reshape(
                            (test_radmat.shape[0], test_radmat.shape[1] * test_radmat.shape[2])
                        ),
                    ),
                    "Train and test sets are identical. Probably an RNG bug.",
                )

        # REMOVE all-zero rows (EXCEPT THE FIRST, which is the nul row)
        # Seems like the remove should go into _dyemat_sim
        non_zero_rows = np.argwhere(test_true_pep_iz != 0).flatten()
        test_radmat = test_radmat[non_zero_rows]
        test_true_pep_iz = test_true_pep_iz[non_zero_rows]
        test_true_dye_iz = test_true_dye_iz[non_zero_rows]
        if test_true_row_ks is not None:
            test_true_row_ks = test_true_row_ks[non_zero_rows]

    sim_result_v2 = SimV2Result(
        params=sim_v2_params,
        train_dyemat=train_dyemat,
        train_radmat=train_radmat,
        train_pep_recalls=train_pep_recalls,
        train_true_pep_iz=train_true_pep_iz,
        train_true_dye_iz=train_true_dye_iz,
        train_dyepeps=train_dyepeps,
        train_true_row_ks=train_true_row_ks,
        test_dyemat=test_dyemat,
        test_radmat=test_radmat,
        test_true_pep_iz=test_true_pep_iz,
        test_true_dye_iz=test_true_dye_iz,
        test_true_row_ks=test_true_row_ks,
        _flus=None,
    )

    if sim_v2_params.generate_flus:  # Why optional? Should it be optimized?
        sim_result_v2._generate_flu_info(prep_result)

    return sim_result_v2
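# Sketch of the dyepeps sort used above: primary key ascending on dyt_i
# (col 0), secondary key descending on count (col 2). Negating the count
# column turns np.lexsort's ascending order into a descending one (numpy
# only; toy data).
import numpy as np

dyepeps = np.array([
    [1, 9, 5],
    [0, 4, 2],
    [1, 3, 8],
])  # (dyt_i, pep_i, count)
dyepeps = dyepeps[np.lexsort((-dyepeps[:, 2], dyepeps[:, 0]))]
assert dyepeps.tolist() == [[0, 4, 2], [1, 3, 8], [1, 9, 5]]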