Пример #1
0
 def it_limits_slices_with_int():
     """An integer ``_limit_slice`` of 3 leaves three entries in each returned field."""
     inputs = dict(a=np.arange(10), b=np.arange(10))
     res_a, res_b = zap.arrays(test6, inputs, c=3, _batch_size=2, _limit_slice=3)

     assert len(res_a) == 3
     assert len(res_b) == 3
Пример #2
0
 def it_limits_slices():
     """A ``slice(3, 6)`` passed as ``_limit_slice`` leaves three entries per returned field."""
     inputs = dict(a=np.arange(10), b=np.arange(10))
     window = slice(3, 6)
     res_a, res_b = zap.arrays(test4, inputs, c=3, _batch_size=2, _limit_slice=window)

     assert len(res_a) == 3
     assert len(res_b) == 3
Пример #3
0
    def it_stacks_one_field():
        """With ``_stack=True`` a single returned field comes back as one ndarray."""
        expected = np.array([
            [2 * 1, 2 * 3, 2 * 3],
            [2 * 2, 2 * 4, 2 * 3],
        ])

        stacked = zap.arrays(
            test5,
            dict(a=[1, 2], b=[3, 4]),
            c=3,
            _batch_size=2,
            _stack=True,
        )

        assert isinstance(stacked, np.ndarray)
        assert np.all(stacked == expected)
Пример #4
0
    def it_maintains_returned_tuples():
        """Without stacking, a tuple-returning function yields a tuple of per-field lists."""
        expected = ([1 + 1, 2 + 1], [3 + 2, 4 + 2])

        result = zap.arrays(test4, dict(a=[1, 2], b=[3, 4]), c=3, _batch_size=2)

        assert isinstance(result, tuple)
        assert result == expected
Пример #5
0
    def false_rates_all_peps(self, at_prec, n_false=4):
        """
        Run _do_false_rates_by_pep over every pep_i of the prep result and
        concatenate the per-peptide results with a freshly reset index.

        Arguments:
            at_prec: forwarded unchanged to _do_false_rates_by_pep
            n_false: forwarded unchanged to _do_false_rates_by_pep
        """
        all_pep_iz = self._prep_result.peps().pep_i.values

        per_pep_results = zap.arrays(
            _do_false_rates_by_pep,
            dict(pep_i=all_pep_iz),
            bag=self,
            at_prec=at_prec,
            n_false=n_false,
        )

        combined = pd.concat(per_pep_results)
        return combined.reset_index(drop=True)
Пример #6
0
    def it_maintains_array_returns():
        """Without stacking, array returns are kept as a list with one array per input row."""
        expected_first = np.array([2 * 1, 2 * 3, 2 * 3])
        expected_second = np.array([2 * 2, 2 * 4, 2 * 3])

        rows = zap.arrays(test5, dict(a=[1, 2], b=[3, 4]), c=3, _batch_size=2)

        assert isinstance(rows, list)
        assert np.all(rows[0] == expected_first)
        assert np.all(rows[1] == expected_second)
Пример #7
0
    def it_stacks_all_fields():
        """With ``_stack=True`` every field of a tuple return is stacked into an ndarray."""
        expected_a = np.array([[1 + 1, 2 + 1]])
        expected_b = np.array([[3 + 2, 4 + 2]])

        fields = zap.arrays(
            test4,
            dict(a=[1, 2], b=[3, 4]),
            c=3,
            _batch_size=2,
            _stack=True,
        )

        assert isinstance(fields, tuple)
        assert isinstance(fields[0], np.ndarray)
        assert isinstance(fields[1], np.ndarray)
        assert np.all(fields[0] == expected_a)
        assert np.all(fields[1] == expected_b)
Пример #8
0
    def it_stacks_some_fields():
        """A per-field ``_stack`` list stacks only the fields flagged True."""
        expected_stacked = np.array([
            [1 * 2, 3 * 2, 3 * 2],
            [2 * 2, 4 * 2, 3 * 2],
        ])
        stack_flags = [True, False]

        fields = zap.arrays(
            test6,
            dict(a=[1, 2], b=[3, 4]),
            c=3,
            _batch_size=2,
            _stack=stack_flags,
        )

        assert isinstance(fields, tuple)
        assert isinstance(fields[0], np.ndarray)
        assert isinstance(fields[1], list)
        assert np.all(fields[0] == expected_stacked)
        assert fields[1] == ["foo", "foo"]
Пример #9
0
    def it_eliminates_batch_lists():
        """Results come back as one flat list of rows rather than a list per batch."""
        expected_rows = [
            [2 * 1, 2 * 3, 2 * 3],
            [2 * 2, 2 * 4, 2 * 3],
        ]

        rows = zap.arrays(test3, dict(a=[1, 2], b=[3, 4]), c=3, _batch_size=2)

        assert isinstance(rows, list)
        assert rows == expected_rows
Пример #10
0
def _step_4_gmm_classify(
    radmat,
    dyemat,
    dt_mat,
    dt_inv_var_mat,
    dt_weights,
    flann,
    n_neighbors,
    dt_score_mode,
    dt_filter_threshold,
    dt_score_metric,
    dt_score_bias,
    penalty_coefs,
    rare_penalty,
    radius,
    progress,
):
    """
    Zap _do_nn_and_gmm over the paired rows of radmat and dyemat.

    The dyemat is passed so that we can get the true_dt_iz for debugging.

    Returns:
        A tuple (true_dt_iz, pred_dt_iz, scores, vdists). The first three are
        flattened; scores are additionally divided by their maximum. vdists is
        returned exactly as zap.arrays produced it.
    """
    check.array_t(radmat, ndim=3)

    # Everything except the per-row arrays is forwarded unchanged to each call.
    passthrough = dict(
        dt_mat=dt_mat,
        dt_inv_var_mat=dt_inv_var_mat,
        dt_weights=dt_weights,
        flann=flann,
        n_neighbors=n_neighbors,
        dt_score_mode=dt_score_mode,
        dt_filter_threshold=dt_filter_threshold,
        dt_score_metric=dt_score_metric,
        dt_score_bias=dt_score_bias,
        penalty_coefs=penalty_coefs,
        rare_penalty=rare_penalty,
        radius=radius,
    )
    per_row_arrays = dict(unit_radrow=radmat, dyerow=dyemat)

    true_dt_iz, pred_dt_iz, raw_scores, vdists = zap.arrays(
        _do_nn_and_gmm,
        per_row_arrays,
        **passthrough,
        _progress=progress,
        _stack=True,
    )

    # The dt counts are used as a weighting factor on the PDFs, so the
    # scores can exceed 1.0. Dividing every score by the maximum gives
    # all rows the same treatment and puts them into the 0-1 range.
    normalized_scores = raw_scores.flatten()
    normalized_scores /= np.max(normalized_scores)

    flat_true_dt_iz = true_dt_iz.flatten()
    flat_pred_dt_iz = pred_dt_iz.flatten()
    return flat_true_dt_iz, flat_pred_dt_iz, normalized_scores, vdists
Пример #11
0
def psf_fields_one_channel(ims_import_result,
                           sigproc_v2_params,
                           field_iz,
                           channel_i,
                           progress=None) -> priors.RegPSFPrior:
    """
    Build up a regional PSF for one channel on the RAW images.

    Each field in field_iz is processed by _do_psf_one_field_one_channel in a
    zap; the per-field results are summed, normalized, and converted into a
    RegPSFPrior via RegPSFPrior.from_psf_ims.

    Returns:
        The fitted RegPSFPrior. Note that None is returned instead when
        ims_import_result reports zero fields.
    """

    if ims_import_result.n_fields == 0:
        return None

    with zap.Context(progress=progress):
        bandpass_kwargs = dict(
            low_inflection=sigproc_v2_params.low_inflection,
            low_sharpness=sigproc_v2_params.low_sharpness,
            high_inflection=sigproc_v2_params.high_inflection,
            high_sharpness=sigproc_v2_params.high_sharpness,
        )
        per_field_psfs = zap.arrays(
            _do_psf_one_field_one_channel,
            dict(field_i=field_iz),
            _stack=True,
            peak_mea=sigproc_v2_params.peak_mea,
            divs=sigproc_v2_params.divs,
            bandpass_kwargs=bandpass_kwargs,
            ims_import_result=ims_import_result,
            channel_i=channel_i,
            n_cycles_limit=sigproc_v2_params.n_cycles_limit,
        )

    # Collapse the field axis by summation, then normalize.
    summed_over_fields = np.sum(per_field_psfs, axis=0)
    psf_ims = psf_normalize(summed_over_fields)

    # psf_ims is now a pixel image of the PSF at each regional div,
    # ie 4 dimensional: (divs_y, divs_x, n_pixels_h, n_pixels_w).
    # from_psf_ims converts it to Gaussian parameters by fitting so the
    # pixels need not be stored: just sigma_x, sigma_y, and rho.
    # One frame of ims_import_result is used to sort out dimensions.
    reference_im = ims_import_result.ims[0, 0, 0]
    check.array_t(reference_im, is_square=True)
    im_mea = reference_im.shape[-1]
    return priors.RegPSFPrior.from_psf_ims(im_mea, psf_ims)
Пример #12
0
def _step_2_create_pros_and_pro_seqs_dfs(pro_spec_df):
    """
    Create pros_df and pro_seqs_df.

    Converts each protein's sequence string into two normalized DataFrames:
    pros_df has one row per protein (plus a reserved "nul" row at pro_i == 0)
    and pro_seqs_df has one (pro_i, aa) row per sequence element.

    Returns:
        (pros_df, pro_seqs_df)
    """

    # Sort proteins such that the protein(s) being 'reported' are at the top,
    # which means the most interesting peptides start at pep_i==1.
    sorted_spec_df = pro_spec_df.sort_values(by=["report", "name"],
                                             ascending=False)

    aa_lists = zap.arrays(aa_str_to_list,
                          dict(seqstr=sorted_spec_df.sequence.values))

    # Build one long row per sequence element; pro_i is 1-based because
    # pro_i == 0 is reserved for the nul row added below.
    per_protein = zip(
        aa_lists,
        sorted_spec_df.name,
        sorted_spec_df.ptm_locs,
        sorted_spec_df.report,
    )
    rows = []
    for zero_based_i, (aa_list, name, ptm_locs, report) in enumerate(per_protein):
        for aa in aa_list:
            rows.append((aa, zero_based_i + 1, name, ptm_locs, report))

    df = pd.DataFrame(
        rows,
        columns=["aa", "pro_i", "pro_name", "pro_ptm_locs", "pro_report"],
    )

    # ADD reserved nul row
    nul_row = dict(aa=".", pro_i=0, pro_name="nul", pro_ptm_locs="", pro_report=0)
    df = pd.concat((pd.DataFrame([nul_row]), df))

    # Split into the two fully normalized dfs
    pro_columns = df[["pro_i", "pro_name", "pro_ptm_locs", "pro_report"]]
    pros_df = pro_columns.drop_duplicates()
    pros_df = pros_df.reset_index(drop=True)
    pros_df = pros_df.rename(columns=dict(pro_name="pro_id"))
    pros_df["pro_is_decoy"] = False

    pro_seqs_df = df[["pro_i", "aa"]].reset_index(drop=True)

    return pros_df, pro_seqs_df
Пример #13
0
def ims_import(src_dir: Path,
               ims_import_params: ImsImportParams,
               progress=None,
               pipeline=None):
    """
    Import the image files found in src_dir into an ImsImportResult.

    The folder is scanned with _scan_files and the target image size is the
    larger scanned dimension, rounded up to a power of two when needed.
    One of three import paths is then taken:
        * z-stack in a single nd2 file (ims_import_params.is_z_stack_single_file)
        * movie mode (ims_import_params.is_movie) from nd2 or npy files
        * otherwise a scatter phase (nd2 or tif; npy needs none) followed by
          a gather phase over the requested fields.

    When nd2 files were found, metadata from the first one is attached to
    the result. Finally all files matching "__*" in the current working
    directory are deleted.

    Arguments:
        src_dir: folder to scan and import from
        ims_import_params: import parameters; optional limits are read with
            .get() (n_fields_limit, start_field, n_cycles_limit, start_cycle)
        progress: optional progress callback forwarded to zap.Context
        pipeline: optional object whose set_phase() is called around the
            scatter and gather phases

    Returns:
        The populated ImsImportResult.

    Raises:
        NotImplementedError: movie mode with a scan mode other than nd2/npy
        ValueError: unknown scan mode in the scatter/gather path
        FileNotFoundError: an expected scattered tif file is missing
    """
    reference_nd2_file_for_metadata = None

    scan_result = _scan_files(src_dir)
    if len(scan_result.nd2_paths) > 0:
        reference_nd2_file_for_metadata = scan_result.nd2_paths[0]

    target_mea = max(scan_result.dim[0], scan_result.dim[1])

    if not utils.is_power_of_2(target_mea):
        new_dim = utils.next_power_of_2(target_mea)
        _convert_message(target_mea, new_dim)
        target_mea = new_dim

    def clamp_fields(n_fields_true: int) -> Tuple[int, int]:
        """Return (start_field, n_fields) limited by the params and n_fields_true."""
        n_fields = n_fields_true
        n_fields_limit = ims_import_params.get("n_fields_limit")
        if n_fields_limit is not None:
            n_fields = n_fields_limit

        start_field = ims_import_params.get("start_field", 0)
        # .get() only applies its default when the key is absent; a stored
        # None would otherwise break the addition below. This mirrors the
        # guard in clamp_cycles.
        if start_field is None:
            start_field = 0
        if start_field + n_fields > n_fields_true:
            n_fields = n_fields_true - start_field

        return start_field, n_fields

    def clamp_cycles(n_cycles_true: int) -> Tuple[int, int]:
        """Return (start_cycle, n_cycles) limited by the params and n_cycles_true."""
        n_cycles = n_cycles_true
        n_cycles_limit = ims_import_params.get("n_cycles_limit")
        if n_cycles_limit is not None:
            n_cycles = n_cycles_limit

        start_cycle = ims_import_params.get("start_cycle", 0)
        if start_cycle is None:
            start_cycle = 0
        if start_cycle + n_cycles > n_cycles_true:
            n_cycles = n_cycles_true - start_cycle

        return start_cycle, n_cycles

    tsv_data = tsv.load_tsv_for_folder(src_dir)

    # ALLOCATE the ImsImportResult
    ims_import_result = ImsImportResult(params=ims_import_params,
                                        tsv_data=Munch(tsv_data))

    # Default channel mapping is the identity over the scanned channels
    dst_ch_i_to_src_ch_i = ims_import_params.dst_ch_i_to_src_ch_i
    if dst_ch_i_to_src_ch_i is None:
        dst_ch_i_to_src_ch_i = list(range(scan_result.n_channels))

    n_out_channels = len(dst_ch_i_to_src_ch_i)

    # Sanity check that we didn't end up with any src_channels outside of the channel range
    assert all(0 <= src_ch_i < scan_result.n_channels
               for src_ch_i in dst_ch_i_to_src_ch_i)

    if ims_import_params.is_z_stack_single_file:
        field_iz, n_cycles_found = _z_stack_import(
            scan_result.nd2_paths[0],
            target_mea,
            ims_import_result,
            dst_ch_i_to_src_ch_i,
            ims_import_params.z_stack_n_slices_per_field,
        )
        n_cycles = ims_import_params.z_stack_n_slices_per_field

    elif ims_import_params.is_movie:
        if scan_result.mode == ScanFileMode.nd2:
            # "Movie mode" means that there aren't any chemical cycles, but rather we are using "cycles" to represent different images in a zstack
            start_field, n_fields = clamp_fields(len(scan_result.nd2_paths))

            # In movie mode, the n_fields from the .nd2 file is becoming n_cycles
            scan_result.n_cycles = scan_result.n_fields
            start_cycle, n_cycles = clamp_cycles(scan_result.n_cycles)

            with zap.Context(progress=progress):
                field_iz, n_cycles_found = zap.arrays(
                    _do_movie_import_nd2,
                    dict(
                        input_field_i=list(
                            range(start_field, start_field + n_fields)),
                        output_field_i=list(range(n_fields)),
                    ),
                    _stack=True,
                    scan_result=scan_result,
                    start_cycle=start_cycle,
                    n_cycles=n_cycles,
                    target_mea=target_mea,
                    import_result=ims_import_result,
                    dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
                )
        elif scan_result.mode == ScanFileMode.npy:
            start_field, n_fields = clamp_fields(scan_result.n_fields)
            start_cycle, n_cycles = clamp_cycles(scan_result.n_cycles)

            with zap.Context(progress=progress):
                field_iz, n_cycles_found = zap.arrays(
                    _do_movie_import_npy,
                    dict(
                        input_field_i=list(
                            range(start_field, start_field + n_fields)),
                        output_field_i=list(range(n_fields)),
                    ),
                    _stack=True,
                    scan_result=scan_result,
                    start_cycle=start_cycle,
                    n_cycles=n_cycles,
                    target_mea=target_mea,
                    import_result=ims_import_result,
                    dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
                )
        else:
            raise NotImplementedError()

    else:
        start_field, n_fields = clamp_fields(scan_result.n_fields)

        if pipeline:
            pipeline.set_phase(0, 2)

        if scan_result.mode == ScanFileMode.nd2:
            # In this path each nd2 file is treated as one cycle
            scan_result.n_cycles = len(scan_result.nd2_paths)

            # SCATTER
            with zap.Context(mode="thread", progress=progress):
                zap.arrays(
                    _do_nd2_scatter,
                    dict(
                        cycle_i=list(range(len(scan_result.nd2_paths))),
                        src_path=scan_result.nd2_paths,
                    ),
                    _stack=True,
                    start_field=start_field,
                    n_fields=n_fields,
                    n_channels=scan_result.n_channels,
                    target_mea=target_mea,
                )

        elif scan_result.mode == ScanFileMode.tif:
            # SCATTER
            work_orders = [
                Munch(field_i=k[0], channel_i=k[1], cycle_i=k[2], path=path)
                for k, path in
                scan_result.tif_paths_by_field_channel_cycle.items()
            ]
            with zap.Context(trap_exceptions=False):
                results = zap.work_orders(_do_tif_scatter, work_orders)

            # CHECK that every file exists
            for f in range(n_fields):
                for ch in range(scan_result.n_channels):
                    for cy in range(scan_result.n_cycles):
                        expected = f"__{f:03d}-{ch:02d}-{cy:02d}.npy"
                        if expected not in results:
                            raise FileNotFoundError(
                                f"File is missing in tif pattern: {expected}")

        elif scan_result.mode == ScanFileMode.npy:
            # In npy mode there's no scatter as the files are already fully scattered
            pass

        else:
            raise ValueError(f"Unknown im import mode {scan_result.mode}")

        if pipeline:
            pipeline.set_phase(1, 2)

        # GATHER
        start_cycle, n_cycles = clamp_cycles(scan_result.n_cycles)

        with zap.Context(progress=progress):
            field_iz = zap.arrays(
                _do_gather,
                dict(
                    input_field_i=list(
                        range(start_field, start_field + n_fields)),
                    output_field_i=list(range(0, n_fields)),
                ),
                _stack=True,
                start_cycle=start_cycle,
                n_cycles=n_cycles,
                dim=target_mea,
                import_result=ims_import_result,
                mode=scan_result.mode,
                npy_paths_by_field_channel_cycle=scan_result.
                npy_paths_by_field_channel_cycle,
                dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
            )

    if reference_nd2_file_for_metadata:
        with _nd2(reference_nd2_file_for_metadata) as nd2:
            if hasattr(nd2, "metadata"):
                full = Munch(
                    metadata=nd2.metadata,
                    metadata_seq=nd2.metadata_seq,
                )
                ims_import_result._nd2_metadata_full = full

                def me(block_name, default=None):
                    """Look up block_name in the experiment metadata block."""
                    return utils.block_search(full.metadata.SLxExperiment,
                                              block_name, default)

                def mp(block_name, default=None):
                    """Look up block_name in the picture metadata block."""
                    return utils.block_search(
                        full.metadata_seq.SLxPictureMetadata, block_name,
                        default)

                n_channels = mp("sPicturePlanes.uiSampleCount", 1)

                ims_import_result._nd2_metadata = Munch(
                    calibrated_pixel_size=mp("dCalibration"),
                    experiment_type="movie" if me("eType") == 1 else "edman",
                    n_cycles=me("uLoopPars.uiCount"),
                    cmd_before=me("wsCommandBeforeCapture"),
                    cmd_after=me("wsCommandAfterCapture"),
                    n_channels=n_channels,
                )

                per_channel = []
                for ch_i in range(n_channels):
                    # Shared key prefixes for this channel's settings blocks
                    sample = f"sPicturePlanes.sSampleSetting.a{ch_i}"
                    device = f"{sample}.pDeviceSetting"
                    camera = f"{sample}.pCameraSetting"
                    fmt_quality = f"{camera}.FormatQuality"
                    fmt_desc = f"{fmt_quality}.fmtDesc"
                    roi = f"{fmt_quality}.rectSensorUser"
                    props = f"{camera}.PropertiesQuality"
                    objective = f"{sample}.pObjectiveSetting"
                    spec = f"{sample}.sSpecSettings"

                    # If several laser lines are flagged as used, the values
                    # of the last one win.
                    laser_wavelength = None
                    laser_power = None
                    n_lasers = mp(f"{device}.m_uiMultiLaserLines0", 0)
                    for i in range(n_lasers):
                        is_used = mp(
                            f"{device}.m_bMultiLaserLineUsed0-{i:02d}", 0)
                        if is_used == 1:
                            laser_wavelength = mp(
                                f"{device}.m_uiMultiLaserLineWavelength0-{i:02d}",
                                0,
                            )
                            laser_power = mp(
                                f"{device}.m_dMultiLaserLinePower0-{i:02d}",
                                0,
                            )

                    ch_munch = Munch(
                        laser_wavelength=laser_wavelength,
                        laser_power=laser_power,
                        camera_name=mp(f"{camera}.CameraUniqueName"),
                        sensor_pixels_x=mp(f"{fmt_desc}.sizeSensorPixels.cx"),
                        sensor_pixels_y=mp(f"{fmt_desc}.sizeSensorPixels.cy"),
                        sensor_microns_x=mp(
                            f"{fmt_desc}.sizeSensorMicrons.cx"),
                        sensor_microns_y=mp(
                            f"{fmt_desc}.sizeSensorMicrons.cy"),
                        bin_x=mp(f"{fmt_desc}.dBinningX"),
                        bin_y=mp(f"{fmt_desc}.dBinningY"),
                        format=mp(f"{fmt_desc}.wszFormatDesc"),
                        roi_l=mp(f"{roi}.left"),
                        roi_r=mp(f"{roi}.right"),
                        roi_t=mp(f"{roi}.top"),
                        roi_b=mp(f"{roi}.bottom"),
                        averaging=mp(f"{props}.Average"),
                        integration=mp(f"{props}.Integrate"),
                        name=mp(f"{camera}.Metadata.Channels.Channel_0.Name"),
                        dichroic_filter=mp(f"{device}.m_sFilterName0"),
                        emission_filter=mp(f"{device}.m_sFilterName1"),
                        optivar=mp(f"{device}.m_dZoomPosition"),
                        tirf_focus=mp(f"{device}.m_dTIRFPositionFocus"),
                        tirf_align_x=mp(f"{device}.m_dTIRFPositionX"),
                        tirf_align_y=mp(f"{device}.m_dTIRFPositionY"),
                        objective_mag=mp(f"{objective}.dObjectiveMag"),
                        objective_na=mp(f"{objective}.dObjectiveNA"),
                        objective_refractive_index=mp(
                            f"{objective}.dRefractIndex"),
                        settings_name=mp(
                            f"{sample}.sOpticalConfigs.\x02.sOpticalConfigName"
                        ),
                        readout_mode=mp(f"{spec}.Readout Mode"),
                        readout_rate=mp(f"{spec}.Readout Rate"),
                        noise_filter=mp(f"{spec}.Noise Filter"),
                        temperature=mp(f"{spec}.Temperature"),
                        exposure=mp(f"{sample}.dExposureTime"),
                    )
                    per_channel.append(ch_munch)

                ims_import_result._nd2_metadata.update(**Munch(
                    per_channel=per_channel))

                if me("eType") == 1:
                    # Movie mode
                    ims_import_result._nd2_metadata.update(**Munch(
                        movie_start=me("dStart"),
                        movie_period=me("dPeriod"),
                        movie_duration=me("dDuration"),
                        movie_duration_pref=me("bDurationPref"),
                        movie_max_period_diff=me("dMaxPeriodDiff"),
                        movie_min_period_diff=me("dMinPeriodDiff"),
                        movie_avg_period_diff=me("dAvgPeriodDiff"),
                    ))

    ims_import_result.n_fields = len(field_iz)
    ims_import_result.n_channels = n_out_channels
    ims_import_result.n_cycles = n_cycles
    ims_import_result.dim = target_mea
    ims_import_result.dtype = np.dtype(OUTPUT_NP_TYPE).name
    ims_import_result.src_dir = src_dir

    # CLEAN: remove the "__*" files from the working directory
    for file in local.cwd // "__*":
        file.delete()

    return ims_import_result
Пример #14
0
def nn(nn_params, sim_result, radmat, true_dyemat=None, progress=None):
    """
    Main entrypoint for nearest_neighbors.

    Arguments:
        nn_params: TestNNParams
        sim_result: SimResult -- Uses the train_* values
        radmat: The radmat to classify.
        true_dyemat: Optional for debugging -- the dyemat of the radmat
            ie. the dyerow that corresponds to each radrow.
        progress: Optional progress callback

    Returns:
        pred_pep_iz
        scores

    This is composed of the following steps:
        1. Create a unit radmat
        2. Create a unique dyetrack mat (dt_mat); these are the
           "neighbors" that will be searched.
        3. Create inverse variance for each row of dt_mat; inv_var_dt_mat
        4. Classify each row of the unit radmat with the Gaussian Mixture Model.
    """

    # Allocate the dt_mat as large as it COULD possibly be
    # and then after populating it with the unique values
    # we can resize if using dt_mat.base.resize(n_bytes)
    # The max size is the (extremely unlikely) value of
    # n_peps * n_samples
    check.array_t(radmat, ndim=3, dtype=RadType)
    check.array_t(sim_result.train_dyemat, ndim=4)
    shape = sim_result.train_dyemat.shape
    n_dts_max = shape[0] * shape[1]
    n_channels, n_cycles = shape[2:]
    dt_mat = ArrayResult("dt_mat",
                         DyeType,
                         shape=(n_dts_max, n_channels, n_cycles),
                         mode="w+")
    # prof()

    _step_1_create_neighbors_lookup = _step_1_create_neighbors_lookup_multiprocess
    #_step_1_create_neighbors_lookup = _step_1_create_neighbors_lookup_singleprocess

    (
        dyetracks_df,
        dt_pep_sources_df,
        dye_to_best_pep_df,
        flann,
        n_dts,
    ) = _step_1_create_neighbors_lookup(
        sim_result.train_dyemat,
        output_dt_mat=dt_mat.arr(),
    )
    # prof("create neighbors")

    # dyetracks_df: (dye_i, weight)
    # dt_pep_sources_df: (dye_i, pep_i, n_rows)
    assert n_dts <= n_dts_max and n_dts == dyetracks_df.dye_i.max() + 1

    # Collapse the dt_mat to the actual number of rows.
    # This will cause the memmap file to truncate in size.
    dt_mat.reshape((n_dts, n_channels, n_cycles))

    # dt_mat is the dyetrack mat of the TARGETS as build by the training set
    # Not to be confused with dyemat which is the dyemat of the test points
    # There is no guarantee that the dyerow of a test point is even *in*
    # the training set.

    dt_inv_var_mat = _step_2_create_inverse_variances(
        dt_mat.arr(), np.array(sim_result.params.channel_i_to_vpd))

    dt_weights = dyetracks_df.reindex(np.arange(n_dts),
                                      fill_value=0).weight.values

    channel_i_to_gain_inv = (
        1.0 / np.array(sim_result.params.channel_i_to_gain)).astype(RadType)

    # Now classify each radrow
    check.array_t(radmat, ndim=3)
    n_rows = radmat.shape[0]
    if true_dyemat is not None:
        assert true_dyemat.shape == radmat.shape

    pred_dt_scores = ArrayResult("pred_dt_scores",
                                 ScoreType, (n_rows, ),
                                 mode="w+")
    pred_scores = ArrayResult("pred_scores", ScoreType, (n_rows, ), mode="w+")
    pred_pep_iz = ArrayResult("pred_pep_iz", IndexType, (n_rows, ), mode="w+")
    pred_dt_iz = ArrayResult("pred_dt_iz", IndexType, (n_rows, ), mode="w+")
    true_dt_iz = ArrayResult("true_dt_iz", IndexType, (n_rows, ), mode="w+")

    # Score normalization requires knowing about the distribution of
    # scores but I do not want to make two full passes over the dataset.
    # To avoid this, I randomly sample a fraction of the dataset
    # to collect the score distribution and then I pass in a normalization
    # term into the second pass.

    if nn_params.random_seed is None:
        nn_params.random_seed = int(time.time())
    # prof()

    zap.arrays(
        _do_nn,
        dict(i=np.arange(n_rows)),
        nn_params=nn_params,
        radmat=radmat,
        dt_mat=dt_mat.arr(),
        dt_inv_var_mat=dt_inv_var_mat,
        dt_weights=dt_weights,
        flann=flann,
        channel_i_to_gain_inv=channel_i_to_gain_inv,
        score_normalization=1.0,
        dye_to_best_pep_df=dye_to_best_pep_df,
        output_pred_dt_scores=pred_dt_scores.arr(),
        output_pred_scores=pred_scores.arr(),
        output_pred_pep_iz=pred_pep_iz.arr(),
        output_pred_dt_iz=pred_dt_iz.arr(),
        output_true_dt_iz=true_dt_iz.arr(),
        true_dyemat=true_dyemat,
        _progress=progress,
    )

    return Munch(
        dt_mat=dt_mat,
        dyetracks_df=dyetracks_df,
        dt_pep_sources_df=dt_pep_sources_df,
        true_dt_iz=true_dt_iz,
        pred_dt_iz=pred_dt_iz,
        dt_scores=pred_dt_scores,
        scores=pred_scores,
        pred_pep_iz=pred_pep_iz,
    )
Пример #15
0
def ims_import(src_dir, ims_import_params, progress=None, pipeline=None):
    """
    Import the image files found in src_dir into an ImsImportResult.

    The folder is scanned with _scan_files and the target dimension is the
    larger scanned dimension, rounded up to a power of two when needed.
    In movie mode (ims_import_params.is_movie) each nd2 file is imported
    via _do_movie_import. Otherwise a scatter phase runs (nd2 or tif; npy
    needs none) followed by a gather phase over the requested fields.
    Finally all files matching "__*" in the current working directory are
    deleted.

    Arguments:
        src_dir: folder to scan and import from
        ims_import_params: import parameters; optional limits are read with
            .get() (n_fields_limit, start_field, n_cycles_limit, start_cycle)
        progress: optional progress callback forwarded to zap as _progress
        pipeline: optional object whose set_phase() is called around the
            scatter and gather phases

    Returns:
        The populated ImsImportResult.

    Raises:
        ValueError: unknown scan mode in the scatter/gather path
        FileNotFoundError: an expected scattered tif file is missing
    """
    (
        mode,
        nd2_paths,
        tif_paths_by_field_channel_cycle,
        npy_paths_by_field_channel_cycle,
        n_fields_true,
        n_channels,
        n_cycles_true,
        dim,
    ) = _scan_files(src_dir)

    target_dim = max(dim[0], dim[1])

    # Round the target dimension up to a power of two, reporting the change
    if not utils.is_power_of_2(target_dim):
        new_dim = utils.next_power_of_2(target_dim)
        _convert_message(target_dim, new_dim)
        target_dim = new_dim

    src_channels = list(range(n_channels))

    def clamp_fields(n_fields_true):
        """Return (start_field, n_fields) limited by the params and n_fields_true."""
        n_fields = n_fields_true
        n_fields_limit = ims_import_params.get("n_fields_limit")
        if n_fields_limit is not None:
            n_fields = n_fields_limit

        # NOTE(review): a start_field stored as None would break the addition
        # below -- confirm the params never hold None here.
        start_field = ims_import_params.get("start_field", 0)
        if start_field + n_fields > n_fields_true:
            n_fields = n_fields_true - start_field

        return start_field, n_fields

    def clamp_cycles(n_cycles_true):
        """Return (start_cycle, n_cycles) limited by the params and n_cycles_true."""
        n_cycles = n_cycles_true
        n_cycles_limit = ims_import_params.get("n_cycles_limit")
        if n_cycles_limit is not None:
            n_cycles = n_cycles_limit

        # NOTE(review): same None caveat as start_field above -- confirm.
        start_cycle = ims_import_params.get("start_cycle", 0)
        if start_cycle + n_cycles > n_cycles_true:
            n_cycles = n_cycles_true - start_cycle

        return start_cycle, n_cycles

    tsv_data = tsv.load_tsv_for_folder(src_dir)
    ims_import_result = ImsImportResult(params=ims_import_params,
                                        tsv_data=Munch(tsv_data))

    if ims_import_params.is_movie:
        # In movie mode each nd2 file is one field
        start_field, n_fields = clamp_fields(len(nd2_paths))

        # In movie mode, the n_fields from the .nd2 file is becoming n_cycles
        n_cycles_true = n_fields_true
        start_cycle, n_cycles = clamp_cycles(n_cycles_true)

        field_iz, n_cycles_found = zap.arrays(
            _do_movie_import,
            dict(
                nd2_path=nd2_paths[start_field:start_field + n_fields],
                output_field_i=list(range(n_fields)),
            ),
            _process_mode=True,
            _progress=progress,
            _stack=True,
            start_cycle=start_cycle,
            n_cycles=n_cycles,
            target_dim=target_dim,
            nd2_import_result=ims_import_result,
        )

    else:
        start_field, n_fields = clamp_fields(n_fields_true)

        if pipeline:
            pipeline.set_phase(0, 2)

        if mode == "nd2":
            # In this path each nd2 file is treated as one cycle
            n_cycles_true = len(nd2_paths)

            # SCATTER
            zap.arrays(
                _do_nd2_scatter,
                dict(cycle_i=list(range(len(nd2_paths))), src_path=nd2_paths),
                _process_mode=True,
                _progress=progress,
                _stack=True,
                start_field=start_field,
                n_fields=n_fields,
                n_channels=n_channels,
                target_dim=target_dim,
            )

        elif mode == "tif":
            # SCATTER
            work_orders = [
                Munch(field_i=k[0], channel_i=k[1], cycle_i=k[2], path=path)
                for k, path in tif_paths_by_field_channel_cycle.items()
            ]
            results = zap.work_orders(_do_tif_scatter,
                                      work_orders,
                                      _trap_exceptions=False)

            # CHECK that every file exists
            for f in range(n_fields):
                for ch in range(n_channels):
                    for cy in range(n_cycles_true):
                        expected = f"__{f:03d}-{ch:02d}-{cy:02d}.npy"
                        if expected not in results:
                            raise FileNotFoundError(
                                f"File is missing in tif pattern: {expected}")

        elif mode == "npy":
            # In npy mode there's no scatter as the files are already fully scattered
            pass

        else:
            raise ValueError(f"Unknown im import mode {mode}")

        if pipeline:
            pipeline.set_phase(1, 2)

        # GATHER
        # Cycles are clamped only now because the nd2 branch above
        # reassigns n_cycles_true.
        start_cycle, n_cycles = clamp_cycles(n_cycles_true)

        field_iz = zap.arrays(
            _do_gather,
            dict(
                input_field_i=list(range(start_field, start_field + n_fields)),
                output_field_i=list(range(0, n_fields)),
            ),
            _process_mode=True,
            _progress=progress,
            _stack=True,
            src_channels=src_channels,
            start_cycle=start_cycle,
            n_cycles=n_cycles,
            dim=target_dim,
            nd2_import_result=ims_import_result,
            mode=mode,
            npy_paths_by_field_channel_cycle=npy_paths_by_field_channel_cycle,
        )

    # Record the final shape of what was imported
    ims_import_result.n_fields = len(field_iz)
    ims_import_result.n_channels = n_channels
    ims_import_result.n_cycles = n_cycles
    ims_import_result.dim = target_dim

    # CLEAN: remove the "__*" files from the working directory
    for file in local.cwd // "__*":
        file.delete()

    return ims_import_result