def it_drop_all_darks():
    """Run sim_v1 with the no-error model and check the resulting dyemats and recalls."""
    with tmp.tmp_folder(chdir=True):
        sequences = [".", "DD", "EE"]
        prep_result = PrepResult.prep_result_fixture(
            pros=list(sequences),
            pro_is_decoys=[False] * 3,
            peps=list(sequences),
            pep_pro_iz=[0, 1, 2],
        )
        n_peptides = 3  # Not referenced by the assertions below

        sim_result = sim_v1_worker.sim_v1(
            _stub_sim_params(no_error_model, n_samples), prep_result
        )

        # The test dyemat is expected to hold zero rows
        expected_test_shape = (0, n_channels, n_cycles)
        assert sim_result.test_dyemat.shape == expected_test_shape
        assert sim_result.test_dyemat.dtype == np.uint8
        assert np.all(sim_result.test_dyemat[:] == 0)  # All dark

        # The train dyemat is expected to hold exactly one row
        expected_train_shape = (1, n_channels, n_cycles)
        assert sim_result.train_dyemat.shape == expected_train_shape
        assert sim_result.train_dyemat.dtype == np.uint8
        assert np.all(sim_result.train_pep_recalls[:] == 0.0)
def it_allows_env_set_for_the_yaml_file():
    """
    Write a logger config into a temp folder, run the example through _run
    with that config path, and check the first stdout line is a JSON record.
    """
    # NOTE(review): YAML nesting depends on the leading spaces inside these
    # literals -- confirm they match the intended config.
    yaml_lines = [
        "version: 1",
        "formatters:",
        " json:",
        " class: plaster.tools.zlog.zlog.TypeAwareJsonFormatter",
        " format: '%(asctime)s %(levelname)s %(message)s %(filename)s %(lineno)d'",
        "handlers:",
        " console:",
        " class: logging.StreamHandler",
        " stream: ext://sys.stdout",
        " formatter: json",
        " level: DEBUG",
        "loggers:",
        " plaster:",
        " level: DEBUG",
        " handlers: [console]",
        " propagate: 0",
    ]

    # remove=False: the folder is left in place after the block exits
    with tmp_folder(chdir=True, remove=False) as t:
        config_path = t / "logger.yaml"
        with open(config_path, "w") as f:
            f.write("\n".join(yaml_lines))

        _run("normal_traces", plaster_zlog_config_path=config_path)

        # `so` is the captured stdout that _run stores in the enclosing scope
        first_record = json.loads(so.split("\n")[0])
        assert "wrote to module logger.debug" in first_record["message"]
def it_generates_flu_info():
    """Simulate a small fixture, generate flu info, then run the nested checks."""
    with tmp.tmp_folder(chdir=True):
        sequences = [".", "XAXCD", "XAXCDXX", "XCCXX"]
        prep_result = PrepResult.prep_result_fixture(
            pros=list(sequences),
            pro_is_decoys=[False] * 4,
            peps=list(sequences),
            pep_pro_iz=[0, 1, 2, 3],
        )
        sim_result = sim_v1_worker.sim_v1(
            _stub_sim_params(some_error_model, n_samples), prep_result
        )
        sim_result._generate_flu_info(prep_result)

        def it_computes_head_and_tail():
            flus = sim_result._flus
            # Peptides 1 and 2 are checked together against the same values
            first_two = flus[flus.pep_i.isin([1, 2])]
            assert np.all(first_two.flu_count == 2)
            assert np.all(first_two.n_head_ch_0 == 1)
            assert np.all(first_two.n_head_ch_1 == 0)
            assert np.all(first_two.n_tail_ch_0 == 0)
            assert np.all(first_two.n_tail_ch_1 == 1)

            third = flus[flus.pep_i == 3]
            assert np.all(third.flu_count == 1)

        def it_peps__flus():
            joined = sim_result.peps__flus(prep_result)
            assert "flustr" in joined
            assert len(joined) == 4

        def it_peps__flus__unique_flus():
            uniques = sim_result.peps__flus__unique_flus(prep_result)
            assert np.all(uniques.pep_i.values == [0, 3])

        zest()
def it_resizes():
    """Reshape an ArrayResult to fewer rows and check the leading data is kept."""
    original_shape = (10, 5)
    smaller_shape = (4, 5)

    with tmp.tmp_folder(chdir=True):
        ar = ArrayResult("test1", shape=original_shape, dtype=np.uint8, mode="w+")
        ar[:] = np.arange(10 * 5).astype(np.uint8).reshape(original_shape)

        ar.reshape(smaller_shape)

        assert ar.shape == smaller_shape
        expected = np.arange(4 * 5).astype(np.uint8).reshape(smaller_shape)
        assert np.all(ar.arr() == expected)
def it_saves_and_loads_array_results():
    """
    Save a result holding an ArrayResult, check the on-disk sizes of the
    pickle and the array file, then reload it from the folder.
    """
    with tmp.tmp_folder() as folder:
        with local.cwd(folder):
            shape = (100, 87)
            n_rows, n_cols = shape

            arr = ArrayResult("arr.arr", dtype=np.float64, shape=shape, mode="w+")
            r = np.random.uniform(size=shape)
            arr[:] = r

            ComplexPropertyResult(foo=3, arr=arr).save()

            # The important part is that it doesn't include the array!
            pickle_size = local.path(ComplexPropertyResult.filename).stat().st_size
            assert pickle_size < 200

            # 8 bytes for a float64
            array_size = local.path("arr.arr").stat().st_size
            assert array_size == n_rows * n_cols * 8

        # It should go back to a different folder
        # but the load_from_folder() should be able
        # to deal with that
        assert local.cwd != folder

        reloaded = ComplexPropertyResult.load_from_folder(folder)
        assert reloaded.foo == 3
        assert np.all(reloaded.arr == r)
def it_returns_the_fraction_of_all_dark_samples():
    """Simulate peptide 1 with the default error model and check its recall is in (0.9, 1.0)."""
    with tmp.tmp_folder(chdir=True):
        n_samples = 5000
        sim_params = _stub_sim_params(
            ErrorModel.from_defaults(n_channels=2), n_samples
        )
        prep_result = PrepResult.prep_result_fixture(
            pros=[".", "ABCDEFGHI"],
            pro_is_decoys=[False, False],
            peps=[".", "ABB"],
            pep_pro_iz=[0, 1],
        )
        pepseqs = prep_result.pepseqs()
        dyemat, radmat, recall = _make_arrays(
            "test1", n_peps=2, n_samples=n_samples
        )

        # Only peptide 1 is simulated
        sim_v1_worker._do_pep_sim(
            pepseqs[pepseqs.pep_i == 1],
            sim_params,
            n_samples=n_samples,
            output_dyemat=dyemat,
            output_radmat=radmat,
            output_recall=recall,
        )

        above_lower_bound = 0.9 < recall[1]
        below_upper_bound = recall[1] < 1.0
        assert np.all(above_lower_bound & below_upper_bound)
def zest_survey_v2_integration():
    """
    Smoke-run survey_v2 on a random prep fixture.

    This needs a lot of work: the metric of success for the survey has not
    been worked out yet. Simpler, brain-dead cases are also needed, e.g.
    peptides that are very clearly separated, to make sure the results
    are sensible.
    """
    with tmp.tmp_folder(chdir=True):
        prep_result = prep_fixtures.result_random_fixture(20)

        sim_v2_result = SimV2Result.from_prep_fixture(prep_result, labels="DE,C,Y")
        sim_v2_result.save()

        # No assertions are made on the result yet
        survey_v2_result = survey_v2_worker.survey_v2(
            SurveyV2Params(), prep_result, sim_v2_result
        )

        # survey_v2_result._survey.to_csv("/erisyon/internal/test.csv")

        # I will need to set the RNG on this to test.
        # There's a weird effect
        # https://docs.google.com/spreadsheets/d/1SrOjdNTpw7uLWU1iS7PFm4kbfNLTnW6Am2t85b-GKww/edit#gid=1462476311
        # Why are 3 peptides with the same flu not all showing each other as the nn?

    zest()
def it_saves_and_loads_a_property_list():
    """Save a SimplePropertyResult to the current folder and load it back."""
    with tmp.tmp_folder(chdir=True):
        SimplePropertyResult(foo=2).save()

        saved_file = local.path(SimplePropertyResult.filename)
        assert saved_file.exists()

        reloaded = SimplePropertyResult.load_from_folder(".")
        assert reloaded.foo == 2
def it_returns_an_open_array_without_overwrite():
    """Check arr() returns the same object on repeated calls and flush writes 50 bytes."""
    n_rows, n_cols = 10, 5

    with tmp.tmp_folder(chdir=True):
        ar = ArrayResult("test1", shape=(n_rows, n_cols), dtype=np.uint8, mode="w+")

        first_handle = ar.arr()
        ar[:] = np.arange(10 * 5).astype(np.uint8).reshape((n_rows, n_cols))
        second_handle = ar.arr()

        # A second arr() call must hand back the very same object
        assert second_handle is first_handle

        ar.flush()
        on_disk_size = local.path("test1").stat().st_size
        assert on_disk_size == 10 * 5
def it_maintains_decoys_for_train():
    """Check the train dyemat has one leading-axis entry per peptide."""
    with tmp.tmp_folder(chdir=True):
        sim_result = sim_worker.sim(
            _stub_sim_params(some_error_model, n_samples), prep_result
        )

        expected_shape = (n_peptides, n_samples, n_channels, n_cycles)
        assert sim_result.train_dyemat.shape == expected_shape
def it_gets_same_result_as_single_threaded():
    """
    The multi-process neighbor-lookup builder must return the same frames,
    dyetrack count and output matrix as the single-process builder when
    both are given the same dyemat.
    """
    # A larger configuration such as (50, 1000, 2, 15) is handy for profiling.
    n_peps, n_samples, n_channels, n_cycles = (20, 100, 2, 15)

    bin_vecs = np.random.randint(
        0, 2, size=(n_peps, n_samples, n_channels, n_cycles)
    )
    # Reversing a cumulative sum along the cycle axis gives non-increasing tracks
    dyemat = np.cumsum(bin_vecs, axis=3)[:, :, :, ::-1]
    dyemat[0, 0] = np.zeros((n_channels, n_cycles), dtype=DyeType)
    # Duplicate along the first axis and shuffle so the input has many repeats
    dyemat = np.repeat(dyemat, 80, 0)
    np.random.shuffle(dyemat)

    with tmp.tmp_folder(chdir=True):
        dt_mat_shape = (n_peps * n_samples, n_channels, n_cycles)
        output_dt_mat_st = ArrayResult(
            "dt_mat_st", shape=dt_mat_shape, dtype=DyeType, mode="w+",
        ).arr()
        output_dt_mat_mt = ArrayResult(
            "dt_mat_mt", shape=dt_mat_shape, dtype=DyeType, mode="w+",
        ).arr()

        # prof()
        (
            dyetracks_df_st,
            dt_pep_sources_df_st,
            dye_to_best_pep_df_st,
            flann_st,
            n_dts_st,
        ) = nn._step_1_create_neighbors_lookup_singleprocess(dyemat, output_dt_mat_st)
        # prof("st")

        # prof()
        (
            dyetracks_df_mt,
            dt_pep_sources_df_mt,
            dye_to_best_pep_df_mt,
            flann_mt,
            n_dts_mt,
        ) = nn._step_1_create_neighbors_lookup_multiprocess(dyemat, output_dt_mat_mt)
        # prof("mt")

        # Every comparison pairs the single-process result with the
        # multi-process one; comparing a frame against itself can never fail.
        assert_frame_equal(dyetracks_df_st, dyetracks_df_mt)
        assert_frame_equal(dt_pep_sources_df_st, dt_pep_sources_df_mt)
        assert_frame_equal(dye_to_best_pep_df_st, dye_to_best_pep_df_mt)
        assert n_dts_st == n_dts_mt
        assert np.all(output_dt_mat_st == output_dt_mat_mt)
def it_removes_decoys_for_test():
    """Check the test dyemat shape, that rows 0 and 4 are all zero, and the radmat dtype."""
    with tmp.tmp_folder(chdir=True):
        sim_result = sim_worker.sim(
            _stub_sim_params(some_error_model, n_samples), prep_result
        )

        expected_shape = (n_peptides, n_samples, n_channels, n_cycles)
        assert sim_result.test_dyemat.shape == expected_shape

        # Nul should be all zero
        assert np.all(sim_result.test_dyemat[0] == 0)
        # Decoy should be all zero
        assert np.all(sim_result.test_dyemat[4] == 0)

        assert sim_result.test_radmat.dtype == np.float32
def it_surveys():
    """With is_survey set and no test samples, only the train dyemat is produced."""
    with tmp.tmp_folder(chdir=True):
        n_samples = 1

        sim_params = _stub_sim_params(some_error_model, n_samples)
        sim_params.is_survey = True
        sim_params.n_samples_train = n_samples
        sim_params.n_samples_test = None

        sim_result = sim_v1_worker.sim_v1(sim_params, prep_result)

        expected_train_shape = (n_peptides * n_samples, n_channels, n_cycles)
        assert sim_result.train_dyemat.shape == expected_train_shape
        assert sim_result.train_dyemat.dtype == np.uint8
        assert sim_result.test_dyemat is None
def _run(func_name, plaster_zlog_config_path=None): with tmp_folder(chdir=True): nonlocal rc, so, se, lc, jl here_folder = local.path(__file__).dirname if plaster_zlog_config_path is None: plaster_zlog_config_path = here_folder / "../example_zlog.yaml" with local.env(PLASTER_ZLOG_CONFIG_PATH=plaster_zlog_config_path): main = here_folder / "./example_main.py" rc, so, se = local.python.run(( "-u", main, func_name, ), retcode=None) log_file = local.path("plaster_example.log") lc, jl = None, None if log_file.exists(): lc = log_file.read() jl = [json.loads(i) for i in lc.split("\n") if i != ""]
def synth_to_ims_import_result(synth: Synth):
    """
    Render the synth images and store them as an ImsImportResult.

    The same rendered channel/cycle images are written into every field.

    Arguments:
        synth: source of the rendered images and of the field, channel,
            cycle and dim values

    Returns:
        The saved ImsImportResult.
    """
    chcy_ims = synth.render_chcy()

    # A tmp folder is needed here because tests can run
    # multi-threaded and we need to avoid collisions.
    # It can't be removed because the file will be opened
    # later outside of this scope, so we assume that
    # tmp will be garbage collected outside of the
    # test system.
    with tmp_folder(remove=False) as folder:
        ims_import_result = ImsImportResult(
            folder=folder,
            params=ImsImportParams(),
            tsv_data=None,
            n_fields=synth.n_fields,
            n_channels=synth.n_channels,
            n_cycles=synth.n_cycles,
            dim=synth.dim[0],
            dtype=np.dtype(OUTPUT_NP_TYPE).name,
            src_dir="",
        )

        for field_i in range(synth.n_fields):
            field_shape = (
                synth.n_channels,
                synth.n_cycles,
                synth.dim[0],
                synth.dim[1],
            )
            field_arr = ims_import_result.allocate_field(
                field_i, field_shape, OUTPUT_NP_TYPE,
            )
            field_ims = field_arr.arr()
            field_ims[:, :, :, :] = chcy_ims
            ims_import_result.save_field(field_i, field_arr, None, None)

        ims_import_result.save()
        return ims_import_result
def it_returns_no_all_dark_samples_on_valid_peps():
    """Simulate peptide 1 ("AAA") and check none of its samples is entirely zero."""
    with tmp.tmp_folder(chdir=True):
        prep_result = PrepResult.prep_result_fixture(
            pros=[".", "ABCDEFGHI"],
            pro_is_decoys=[False, False],
            peps=[".", "AAA"],
            pep_pro_iz=[0, 1],
        )
        pepseqs = prep_result.pepseqs()

        n_samples = 1000
        dyemat, radmat, recall = _make_arrays(
            "test1", n_peps=2, n_samples=n_samples
        )

        sim_v1_worker._do_pep_sim(
            pepseqs[pepseqs.pep_i == 1],
            sim_params,
            n_samples=n_samples,
            output_dyemat=dyemat,
            output_radmat=radmat,
            output_recall=recall,
        )

        # True for each sample of peptide 1 whose entries are all zero
        sample_is_all_dark = np.all(dyemat[1] == 0, axis=(1, 2))
        assert not np.any(sample_is_all_dark)
def _make_dyemat():
    """
    Build a small dyemat fixture plus an empty output dyetrack matrix.

    Peptide 0 is never written to. Peptide 1 gets two distinct tracks and
    peptide 2 gets one track shared with peptide 1 plus one of its own.

    Returns:
        (dyemat, output_dt_mat)
    """
    n_peps, n_samples, n_channels, n_cycles = 3, 7, 2, 5

    shared_track = np.array([[2, 2, 1, 1, 0], [2, 1, 0, 0, 0]])
    pep_1_second_track = np.array([[1, 1, 1, 1, 0], [1, 1, 0, 0, 0]])
    pep_2_unique_track = np.array([[3, 3, 2, 2, 0], [2, 1, 0, 0, 0]])

    with tmp.tmp_folder(chdir=True):
        dyemat = ArrayResult(
            "dyemat",
            shape=(n_peps, n_samples, n_channels, n_cycles),
            dtype=DyeType,
            mode="w+",
        ).arr()

        dyemat[1, 0:5] = shared_track
        dyemat[1, 5:7] = pep_1_second_track
        dyemat[2, 0:1] = shared_track  # Same as dyemat[1][0:5]
        dyemat[2, 1:7] = pep_2_unique_track  # Unique

        # output_dt_mat is big enough to hold every possible dyetrack but would
        # be truncated after this call.
        output_dt_mat = ArrayResult(
            "dt_mat",
            shape=(n_peps * n_samples, n_channels, n_cycles),
            dtype=DyeType,
            mode="w+",
        ).arr()

        return dyemat, output_dt_mat
def it_gives_up_on_hard_peptides_and_returns_none():
    """Simulate peptide 1 ("DDD") and check every recall entry stays at 0.0."""
    with tmp.tmp_folder(chdir=True):
        prep_result = PrepResult.prep_result_fixture(
            pros=[".", "ABCDEFGHI"],
            pro_is_decoys=[False, False],
            peps=[".", "DDD"],
            pep_pro_iz=[0, 1],
        )
        pepseqs = prep_result.pepseqs()

        n_samples = 1000
        dyemat, radmat, recall = _make_arrays(
            "test1", n_peps=2, n_samples=n_samples
        )

        sim_v1_worker._do_pep_sim(
            pepseqs[pepseqs.pep_i == 1],
            sim_params,
            n_samples=n_samples,
            output_dyemat=dyemat,
            output_radmat=radmat,
            output_recall=recall,
        )

        assert np.all(recall[:] == 0.0)
def it_sets_all_output_arrays():
    """
    Run nn.nn() on a tiny hand-built train dyemat and test radmat and
    check every output array of the result.
    """
    n_peps, n_samples, n_channels, n_cycles = (3, 2, 2, 3)

    nn_params = TestNNParams()
    sim_params = SimParams.construct_from_aa_list(
        ["A", "B"], error_model=ErrorModel.no_errors(n_channels)
    )
    sim_params.error_model.dyes[0].gain = 100.0
    sim_params.error_model.dyes[1].gain = 400.0
    sim_params._build_join_dfs()

    # The three distinct dyetracks present in the training data
    dark_track = [[0, 0, 0], [0, 0, 0]]
    shared_track = [[2, 2, 1], [1, 0, 0]]
    unique_track = [[2, 2, 2], [2, 1, 0]]

    train_values = np.array(
        [
            [dark_track, dark_track],  # Pep 0: sample 0, sample 1
            [shared_track, shared_track],  # Pep 1: sample 0, sample 1
            # Pep 2: sample 1 is the same as samples 0 & 1 of pep 1
            [unique_track, shared_track],
        ]
    )

    test_values = np.array(
        [
            # pep 1, sample 0 & 1; pep 2, sample 1
            # Should pred to dt 1, could be pep 1 or pep 2 but pep 1 has more instances
            [[2.1, 1.9, 1.1], [0.9, 0.1, 0.1]],
            # pep 0, sample 0
            # Should pred to dt 0, must be pep 0
            [[0.1, 0.1, 0.1], [0.1, 0.1, 0.1]],
            # Pep 2, sample 0
            # Should pred to dt 2, must be pep 2
            [[2.1, 1.9, 1.9], [2.1, 1.1, 0.1]],
        ]
    )

    with tmp.tmp_folder(chdir=True):
        train_dyemat = ArrayResult(
            "train_dyemat",
            shape=(n_peps, n_samples, n_channels, n_cycles),
            dtype=DyeType,
            mode="w+",
        )
        train_dyemat[:] = train_values

        def _placeholder(name, dtype):
            # One-element array for SimResult fields that nn does not use
            return ArrayResult(name, shape=(1,), dtype=dtype, mode="w+").arr()

        sim_result = SimResult(
            params=sim_params,
            train_dyemat=train_dyemat.arr(),
            # None of the following are used by nn
            train_radmat=_placeholder("train_radmat", RadType),
            train_recalls=_placeholder("train_recalls", RecallType),
            train_flus=_placeholder("train_flus", DyeType),
            train_flu_remainders=_placeholder("train_flu_remainders", DyeType),
        )

        test_radmat = ArrayResult(
            "test_radmat", shape=(3, n_channels, n_cycles), dtype=RadType, mode="w+"
        )
        test_radmat[:] = test_values
        # Scale each channel by its dye gain
        for ch_i in (0, 1):
            test_radmat[:, ch_i, :] *= sim_params.error_model.dyes[ch_i].gain

        nn_result = nn.nn(nn_params, sim_result, test_radmat.arr())

        assert np.all(
            nn_result.dt_mat.arr() == [dark_track, shared_track, unique_track]
        )

        dyetracks_df = nn_result.dyetracks_df
        assert np.all(dyetracks_df.dye_i.values == [0, 1, 2])
        assert np.all(dyetracks_df.weight.values == [2, 3, 1])

        sources_df = nn_result.dt_pep_sources_df
        assert np.all(sources_df.dye_i.values == [0, 1, 1, 2])
        assert np.all(sources_df.pep_i.values == [0, 1, 2, 2])
        assert np.all(sources_df.n_rows.values == [2, 2, 1, 1])

        assert np.all(nn_result.pred_dt_iz.arr() == [1, 0, 2])

        # TODO: Check all the nn_results here
        # Then I need to implement the avoidance of the max calc
        # And then I can profile it on large datasets
        assert np.all(nn_result.pred_pep_iz.arr() == [1, 0, 2])

        scores = nn_result.scores.arr()
        assert np.all((0 <= scores) & (scores <= 1.0))
        assert nn_result.scores.shape == (3,)

        dt_scores = nn_result.dt_scores.arr()
        assert np.all((0 <= dt_scores) & (dt_scores <= 1.0))
        assert nn_result.dt_scores.shape == (3,)
def _before(): nonlocal nn_params, radmat, dt_mat, dt_inv_var_mat, dt_weights, flann nonlocal channel_i_to_gain_inv, dye_to_best_pep_df, dt_scores, scores nonlocal pred_pep_iz, pred_dt_iz, true_dt_iz, true_dyemat nn_params = TestNNParams() dt_mat = np.array( [ [[0, 0, 0], [0, 0, 0]], # Target 0 [[2, 1, 0], [2, 2, 0]], # Target 1 [[1, 1, 0], [1, 0, 0]], # Target 2 ], dtype=DyeType, ) dt_weights = np.array([0, 5, 10], dtype=DyeWeightType) true_dyemat = np.array( [ [[1, 1, 0], [1, 0, 0]], # Target == 2 [[2, 1, 0], [2, 2, 0]], # Target == 1 [[10, 10, 9], [10, 10, 10]], # Target == None ], dtype=DyeType, ) radmat = np.array( [ [[1.1, 0.9, 0.0], [1.1, 0.1, 0.0]], # Target == 2 [[2.1, 1.1, 0.0], [2.1, 1.9, 0.0]], # Target == 1 [[10.0, 10.0, 9.0], [10.0, 10.0, 10.0]], # Target == None ], dtype=RadType, ) channel_i_to_vpd = np.array([1.5, 2.0], dtype=RadType) channel_i_to_gain = np.array([10.0, 100.0], dtype=RadType) radmat = radmat * channel_i_to_gain[None, :, None] channel_i_to_gain_inv = 1.0 / channel_i_to_gain dt_inv_var_mat = nn._step_2_create_inverse_variances( dt_mat, np.array(channel_i_to_vpd) ) flann = nn._create_flann(dt_mat) dye_to_best_pep_df = pd.DataFrame( dict(dye_i=[0, 1, 2], pep_i=[0, 2, 1], score=[1.0, 0.5, 1.0],) ) n_rows = radmat.shape[0] with tmp.tmp_folder(chdir=True): dt_scores = ArrayResult( "dt_scores", nn.ScoreType, (n_rows,), mode="w+" ).arr() scores = ArrayResult("scores", nn.ScoreType, (n_rows,), mode="w+").arr() pred_pep_iz = ArrayResult( "pred_pep_iz", IndexType, (n_rows,), mode="w+" ).arr() pred_dt_iz = ArrayResult( "pred_dt_iz", IndexType, (n_rows,), mode="w+" ).arr() true_dt_iz = ArrayResult( "true_dt_iz", IndexType, (n_rows,), mode="w+" ).arr()
def it_removes_decoys_for_test():
    """Check that no test sample has a true peptide index of 4."""
    with tmp.tmp_folder(chdir=True):
        sim_result = sim_v1_worker.sim_v1(
            _stub_sim_params(some_error_model, n_samples), prep_result
        )

        came_from_pep_4 = sim_result.test_true_pep_iz == 4
        assert not np.any(came_from_pep_4)
def it_raises_if_train_and_test_identical():
    """Check sim_v1 with the no-error model raises with "are identical" in the message."""
    with tmp.tmp_folder(chdir=True), zest.raises(in_message="are identical"):
        sim_v1_worker.sim_v1(
            _stub_sim_params(no_error_model, n_samples), prep_result
        )