Example #1
    def _hash(self):
        """De-duplicate rows before fitting"""
        def hash_series(row):
            """Return a positive hash value"""

            return ctypes.c_size_t(hash(tuple(row))).value

        self._y_dataframe_dup = self.y_dataframe.copy()
        if self.seq_to_fit is not None:
            # filter the seq to fit
            self._seq_to_fit_dup = self.seq_to_fit.copy()
            self.y_dataframe = self.y_dataframe.loc[self.seq_to_fit]
        # find seq to hash mapping
        self._seq_to_hash = self.y_dataframe.apply(hash_series,
                                                   axis=1).to_dict()
        # only keep the first instance of each hash
        self.y_dataframe = self.y_dataframe[~self.y_dataframe.duplicated(
            keep='first')]
        if isinstance(self.sigma, pd.DataFrame):
            # only accept sigma as a pd.DataFrame
            self._sigma_dup = self.sigma.copy()
            # filter sigma table for only the first instance of each hash
            self.sigma = self.sigma.loc[self.y_dataframe.index]
            # convert seq --> hash
            self.sigma.rename(index=self._seq_to_hash, inplace=True)
        # convert seq --> hash
        self.y_dataframe.rename(index=self._seq_to_hash, inplace=True)
        if self.seq_to_fit is not None:
            self.seq_to_fit = [
                self._seq_to_hash[seq] for seq in self.seq_to_fit
            ]
        logging.info(
            'Shrank table by removing duplicate rows: '
            f'{self._y_dataframe_dup.shape[0]} --> {self.y_dataframe.shape[0]}'
        )
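
A minimal standalone sketch of the de-duplication idea above, using plain pandas outside the class (illustrative only):

import ctypes
import pandas as pd

df = pd.DataFrame([[1, 2], [1, 2], [3, 4]], index=['seq_a', 'seq_b', 'seq_c'])

# hash each row so identical rows collapse to the same key
seq_to_hash = df.apply(lambda row: ctypes.c_size_t(hash(tuple(row))).value,
                       axis=1).to_dict()

# fit only the first instance of each duplicated row, keyed by hash
deduped = df[~df.duplicated(keep='first')].rename(index=seq_to_hash)

# duplicated rows share one hash key, so results can be broadcast back
print(seq_to_hash['seq_a'] == seq_to_hash['seq_b'])  # True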
Example #2
    def fit(self, **kwargs):
        """Batch fit simulated result"""

        if self.seed:
            np.random.seed(self.seed)

        if self.y_dataframe is None:
            self.simulate_samples()

        from ..estimate.least_squares_batch import BatchFitter
        fitter = BatchFitter(y_dataframe=self.y_dataframe,
                             x_data=self.x_data,
                             model=self.model,
                             large_dataset=True,
                             **self.batchfitter_kwargs)
        fitter.fit(convergence_test=self.batchfitter_kwargs['conv_reps'] > 0,
                   bootstrap=self.batchfitter_kwargs['bootstrap_num'] > 0,
                   point_estimate=True,
                   stream_to=self.save_to,
                   **kwargs)
        self.results = fitter.results
        self.results.summary.to_csv(self.save_to.joinpath('results.csv'))
        logging.info(f"Result saved to {self.save_to.joinpath('results.csv')}")
        os.system(
            f"cd {self.save_to} && tar -czf seqs.tar.gz seqs && rm -r seqs")
Example #3
File: simu.py Project: ynshen/k-seq
    def __init__(self, dataset_dir, result_dir):
        """Survey estimation results
        - load fitting results from `result_dir/fit_summary.csv`
        - load truth and input count info from `dataset_dir/truth.csv` and `input_counts`

        Optional to include:
        - input_counts: counts of sequences in the input pool
        - mean_counts: mean counts in all samples (input and reacted)

        Return:
            results: table of estimated k, A, kA
            truth: table of true k, A, p0, ka, and input_counts
            seq_list: list of indices of sequences for which estimation succeeded
        """

        allowed_col = [
            'k', 'A', 'kA', 'A_mean', 'k_mean', 'kA_mean', 'A_std', 'k_std',
            'kA_std', 'A_2.5%', 'k_2.5%', 'kA_2.5%', 'A_50%', 'k_50%',
            'kA_50%', 'A_97.5%', 'k_97.5%', 'kA_97.5%', 'bs_A_mean',
            'bs_k_mean', 'bs_kA_mean', 'bs_A_std', 'bs_k_std', 'bs_kA_std',
            'bs_A_2.5%', 'bs_k_2.5%', 'bs_kA_2.5%', 'bs_A_50%', 'bs_k_50%',
            'bs_kA_50%', 'bs_A_97.5%', 'bs_k_97.5%', 'bs_kA_97.5%',
            'rep_A_mean', 'rep_k_mean', 'rep_kA_mean', 'rep_A_std',
            'rep_k_std', 'rep_kA_std'
        ]

        from pathlib import Path
        from ..utility.file_tools import read_pickle

        self.results = pd.read_csv(f'{result_dir}/fit_summary.csv',
                                   index_col=0)
        self.cols = self.results.columns[self.results.columns.isin(
            allowed_col)].values
        self.seq_list = self.results[~self.results[self.cols].isna().any(
            axis=1)].index.values
        self._bs_prefix = 'bs_' if 'bs_kA_2.5%' in self.results.columns else ''

        if Path(dataset_dir).is_file():
            dataset = read_pickle(dataset_dir)
        else:
            dataset = read_pickle(dataset_dir + '/seq_table.pkl')

        self.truth = dataset.truth
        self.truth['input_counts'] = dataset.table.original.reindex(
            self.truth.index).s0
        self.truth['mean_counts'] = dataset.table.original.reindex(
            self.truth.index).mean(axis=1)

        logging.info(f'{self.truth.shape[0]} sequences simulated, '
                     f'{self.results.shape[0]} fitted, '
                     f'{len(self.seq_list)} have valid results')
Example #4
File: simu.py Project: ynshen/k-seq
        def generate_params(param_input):
            """Parse single distribution input and reformat as generated results
            """

            from types import GeneratorType

            if isinstance(param_input, (list, np.ndarray, pd.Series)):
                if len(param_input) == uniq_seq_num:
                    return param_input
                else:
                    logging.info(
                        'Size of input param list does not match expected uniq_seq_num, '
                        'resampling to the given uniq_seq_num with replacement')
                    return np.random.choice(param_input,
                                            replace=True,
                                            size=uniq_seq_num)
            elif isinstance(param_input, GeneratorType):
                # assume the generator yields one realization at a time
                return [next(param_input) for _ in range(uniq_seq_num)]
            elif callable(param_input):
                try:
                    # if there is a uniq_seq_num parameter to pass
                    param_output = param_input(size=uniq_seq_num)
                    if isinstance(param_output, (list, np.ndarray, pd.Series)):
                        return param_output
                    elif isinstance(param_output, GeneratorType):
                        return next(param_output)
                    else:
                        logging.error(
                            "Unknown input to draw a distribution value",
                            error_type=TypeError)
                except TypeError:
                    # if uniq_seq_num cannot be passed, assume the callable generates single samples
                    param_output = param_input()
                    if isinstance(param_output, GeneratorType):
                        return [
                            next(param_output) for _ in range(uniq_seq_num)
                        ]
                    elif isinstance(param_output, (float, int)):
                        return [param_input() for _ in range(uniq_seq_num)]
                    else:
                        logging.error(
                            "Unknown callable return type for distribution",
                            error_type=TypeError)
            else:
                logging.error("Unknown input to draw a distribution value",
                              error_type=TypeError)
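
A self-contained sketch of the inputs this dispatcher accepts (generate_params here is a trimmed re-implementation for illustration, with uniq_seq_num fixed to 5; it is not the library function):

import numpy as np
from types import GeneratorType

uniq_seq_num = 5

def generate_params(param_input):
    """Trimmed re-implementation of the dispatch above (illustration only)."""
    if isinstance(param_input, (list, np.ndarray)):
        if len(param_input) == uniq_seq_num:
            return param_input
        return np.random.choice(param_input, replace=True, size=uniq_seq_num)
    if isinstance(param_input, GeneratorType):
        return [next(param_input) for _ in range(uniq_seq_num)]
    if callable(param_input):
        try:
            return param_input(size=uniq_seq_num)   # vectorized draw
        except TypeError:
            return [param_input() for _ in range(uniq_seq_num)]
    raise TypeError('Unknown input to draw a distribution value')

print(generate_params([1, 2, 3, 4, 5]))                    # list of matching size
print(generate_params(x for x in range(100)))              # generator, one draw each
print(generate_params(lambda size: np.random.rand(size)))  # callable taking size=
print(generate_params(lambda: np.random.uniform(0, 1)))    # no-arg callable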
Example #5
    def get_FitResult(self, seq=None):
        """Get FitResults from a JSON file
        """

        from .least_squares import FitResults
        if self._bs_record is None:
            logging.error('No bootstrap or convergence test record found',
                          error_type=TypeError)
        else:
            seq_to_hash = self._bs_record

        if seq is None:
            return seq_to_hash

        if isinstance(seq_to_hash[seq], (list, tuple)):
            # new hierarchical format
            tg_ix, hash_ix = seq_to_hash[seq]
            result = FitResults.from_json(json_path=f'{hash_ix}.json',
                                          tarfile=self.result_path.joinpath(
                                              'seqs', f'{tg_ix}.tar.gz'))
        else:
            # old format
            if self.result_path.joinpath('seqs',
                                         f"{seq_to_hash[seq]}.json").exists():
                logging.info(f"load result from {seq_to_hash[seq]}.json")
                result = FitResults.from_json(
                    self.result_path.joinpath('seqs',
                                              f'{seq_to_hash[seq]}.json'))
            elif self.result_path.joinpath('seqs.tar.gz').exists():
                try:
                    result = FitResults.from_json(
                        json_path=f'seqs/{seq_to_hash[seq]}.json',
                        tarfile=self.result_path.joinpath('seqs.tar.gz'))
                except Exception:
                    # fall back to the older archive layout nested under results/
                    result = FitResults.from_json(
                        json_path=f'results/seqs/{seq_to_hash[seq]}.json',
                        tarfile=self.result_path.joinpath('seqs.tar.gz'))

        if result.data.x_data is None and self.data.y_dataframe is not None:
            # add from data attribute
            result.data.x_data = self.data.x_data
            result.data.y_data = self.data.y_dataframe.loc[seq]

        return result
Example #6
    def __init__(self, data, data_unit=None, sample_list=None, seq_list=None, data_note=None, use_sparse=True,
                 seq_metadata=None, sample_metadata=None,
                 grouper=None, x_values=None, x_unit=None, note=None, dataset_metadata=None):

        # initialize metadata
        from datetime import datetime
        self.metadata = AttrScope(created_time=datetime.now(), note=note)
        # add metadata
        if dataset_metadata is not None:
            self.metadata.add(dataset_metadata)
        if sample_metadata is not None:
            self.metadata.samples = AttrScope(sample_metadata)
        if seq_metadata is not None:
            self.metadata.seqs = AttrScope(seq_metadata)
        logging.info('SeqData created')

        # add original seq_table
        self.table = AttrScope(original=SeqTable(data, columns=sample_list, index=seq_list,
                                                 unit=data_unit, note=data_note, use_sparse=use_sparse))
        # add x values
        if x_values is None:
            self.x_values = None
            self.x_unit = None
        elif isinstance(x_values, (dict, pd.Series)):
            self.x_values = pd.Series(x_values)
            self.x_unit = x_unit
        elif isinstance(x_values, (list, np.ndarray)):
            self.x_values = pd.Series(x_values, index=self.table.original.samples)
            self.x_unit = x_unit
        else:
            logging.error('Unknown type for x_values', error_type=TypeError)
        if self.x_values is not None:
            # align x_values to samples and drop samples with missing x value
            self.x_values = self.x_values[self.table.original.samples]
            self.x_values = self.x_values[~self.x_values.isna()]

        if grouper is not None:
            from .grouper import GrouperCollection
            self.grouper = GrouperCollection()
            self.grouper.add(**grouper)

        self.update_analysis()
Example #7
    def _update_norm_factor(self):
        """Update norm factor value based on current seq_data.total_amounts and seq_data.full_table"""
        if (self.full_table is not None) and (self.total_amounts is not None):
            for sample in self.full_table.columns:
                if sample not in self.total_amounts:
                    logging.info(
                        f'Notice: {sample} is not in total_amount, skipping this sample'
                    )

            self.norm_factor = {}
            from ..utility.func_tools import is_sparse

            for sample, amount in self.total_amounts.items():
                if sample in self.full_table.columns:
                    col = self.full_table[sample]
                    if is_sparse(col):
                        col = col.sparse.to_dense()
                    self.norm_factor[sample] = amount / col.sum()
                else:
                    logging.info(f"Notice: {sample} is not in full_table")
Example #8
    def _hash_inv(self):
        """Recover the hashed results"""

        logging.info('Recovering original table from hash...')

        def get_summary(seq):
            return self.results.summary.loc[self._seq_to_hash[seq]]

        # map hash --> seq for results summary
        self.results.summary = pd.Series(
            data=list(self._seq_to_hash.keys()),
            index=list(self._seq_to_hash.keys())).apply(get_summary)

        # recover the original y_dataframe
        self.y_dataframe = self._y_dataframe_dup.copy()
        del self._y_dataframe_dup
        # recover the original sigma if exists
        if hasattr(self, '_sigma_dup'):
            self.sigma = self._sigma_dup.copy()
            del self._sigma_dup
        # recover the original seq_to_fit if exists
        if hasattr(self, '_seq_to_fit_dup'):
            self.seq_to_fit = self._seq_to_fit_dup.copy()
            del self._seq_to_fit_dup
Example #9
def run_subprocess(cmd, name=None, **kwargs):
    if name is None:
        name = cmd[0]
    logging.info(f"Running {name}...")
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, env=os.environ, **kwargs)
    logging.info('Output:')
    while True:
        output = p.stdout.readline()
        if not output:
            break
        logging.info(output.decode().rstrip())
    return p.wait()
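
Example usage (assumes a POSIX system where the command exists on PATH):

# stream `ls -l` output into the log, line by line
run_subprocess(['ls', '-l'], name='list-dir')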
Example #10
    def fit(self,
            parallel_cores=1,
            point_estimate=True,
            replicates=False,
            bootstrap=False,
            convergence_test=False,
            stream_to=None,
            overwrite=False):
        """Run the estimation
        Args:
            parallel_cores (int): number of parallel cores to use. Default 1
            point_estimate (bool): whether to perform point estimation, default True
            replicates (bool): whether to use replicates for uncertainty estimation, default False
            bootstrap (bool): whether to perform bootstrap uncertainty estimation, default False
            convergence_test (bool): whether to perform a convergence test, default False
            stream_to (str): if an output path is given, fitting results are streamed
                directly to disk: a folder is created containing pickled dicts of
                fitting results named by seq/hash
            overwrite (bool): whether to overwrite existing results when streaming to disk. Default False.
        """

        from yutility.log import Timer
        logging.info('Batch fitting starting...')

        with Timer():
            if self.large_dataset and stream_to is None:
                logging.error(
                    'You are working with a large dataset; stream_to needs to be specified',
                    error_type=ValueError)
            if not self.large_dataset and stream_to is not None:
                self.large_dataset = True
                logging.warning(
                    "You provided `stream_to` so the large_dataset method is used"
                )

            if self.large_dataset:
                self._hash()
                self.results.result_path = Path(stream_to)
                check_dir(self.results.result_path.joinpath('seqs'))
                dump_json(obj=self._seq_to_hash,
                          path=self.results.result_path.joinpath(
                              'seqs', 'seq_to_hash.json'))

            from functools import partial
            work_fn = partial(_work_fn,
                              point_estimate=point_estimate,
                              replicates=replicates,
                              bootstrap=bootstrap,
                              convergence_test=convergence_test)
            worker_generator = self._worker_generator(stream_to=stream_to,
                                                      overwrite=overwrite)
            if parallel_cores > 1:
                import multiprocessing as mp
                pool = mp.Pool(processes=int(parallel_cores))
                logging.info(
                    f'Using multiprocessing to fit in {parallel_cores} parallel processes...')
                workers = pool.map(work_fn, worker_generator)
            else:
                # single thread
                logging.info('Fitting in a single thread...')
                workers = [work_fn(worker) for worker in worker_generator]

            # print(workers[0].summary())
            self.results.summary = pd.DataFrame(
                {worker.name: worker.summary()
                 for worker in workers}).transpose()
            self.results.summary.index.name = 'seq'
            # record result
            if self.bootstrap:
                if self.large_dataset:
                    self.results._bs_record = self._seq_to_hash
                else:
                    self.results._bs_record = {
                        worker.name: worker.results.uncertainty.records
                        for worker in workers
                    }
            if convergence_test:
                if self.large_dataset:
                    self.results._conv_record = self._seq_to_hash
                else:
                    self.results._conv_record = {
                        worker.name: worker.results.convergence.records
                        for worker in workers
                    }

            if self.large_dataset:
                self._hash_inv()
                self.results.to_json(output_dir=stream_to)

            logging.info('Fitting finished')
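
The parallel dispatch above is the standard functools.partial + multiprocessing.Pool pattern; a self-contained sketch with a stand-in work function:

from functools import partial
import multiprocessing as mp

def work_fn(worker, point_estimate=True):
    # stand-in for the per-sequence fitting job
    return worker ** 2 if point_estimate else worker

if __name__ == '__main__':
    fn = partial(work_fn, point_estimate=True)  # freeze the fitting options
    with mp.Pool(processes=4) as pool:
        results = pool.map(fn, range(10))       # one call per worker item
    print(results)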
Example #11
def main():
    """Main function for fitting"""

    from k_seq.estimate import BatchFitter
    from k_seq.model.kinetic import BYOModel

    work_table, x_data, sigma, seq_data = read_table()
    if args.bs_method.lower() == 'stratified':
        try:
            grouper = getattr(seq_data.grouper, args.stratified_grouper).group
        except AttributeError:
            logging.error('Can not find grouper for stratified bootstrapping',
                          error_type=ValueError)
            sys.exit(1)
    else:
        grouper = None

    logging.info(f'exclude_zero: {args.exclude_zero}')
    logging.info(f'inverse_weight: {args.inverse_weight}')
    logging.info(f'fit_top_n: {args.fit_top_n}')
    logging.info(f'large_data: {args.large_data}')
    logging.info(f'convergence: {args.convergence_num > 0}')
    logging.info(f'bootstrap: {args.bootstrap_num > 0}')

    batch_fitter = BatchFitter(y_dataframe=work_table,
                               x_data=x_data,
                               sigma=sigma,
                               bounds=[[0, 0], [np.inf, 1]],
                               metrics={'kA': kA},
                               model=BYOModel.reacted_frac(broadcast=False),
                               exclude_zero=args.exclude_zero,
                               grouper=grouper,
                               bootstrap_num=args.bootstrap_num,
                               bs_record_num=args.bs_record_num,
                               bs_method=args.bs_method,
                               bs_stats={},
                               conv_reps=args.convergence_num,
                               conv_init_range=((0, 10), (0, 1)),
                               conv_stats={},
                               large_dataset=True,
                               note=args.note,
                               rnd_seed=args.seed)
    stream_to = args.output_dir if args.large_data else None
    batch_fitter.fit(parallel_cores=args.core_num,
                     point_estimate=True,
                     bootstrap=args.bootstrap_num > 0,
                     convergence_test=args.convergence_num > 0,
                     stream_to=stream_to,
                     overwrite=args.overwrite)

    batch_fitter.summary(save_to=f'{args.output_dir}/fit_summary.csv')
    batch_fitter.save_model(output_dir=args.output_dir,
                            results=True,
                            bs_record=False,
                            tables=True)

    # zip seq info
    os.system(
        f"cd {str(args.output_dir)} && tar -czf seq.tar.gz seqs && rm -r seqs")
Example #12
        '--inverse_weight',
        dest='inverse_weight',
        default=False,
        action='store_true',
        help='Use counts (with pseudo counts 0.5) as the sigma in fitting')

    parser.add_argument('--seed', type=int, default=23, help='Random seed')

    args = parser.parse_args()
    check_dir(args.output_dir)
    dump_json(obj=vars(args), path=f"{args.output_dir}/config.json")
    args.output_dir = Path(args.output_dir).resolve()

    return args


if __name__ == '__main__':

    args = parse_args()
    # pkg_path = args.pkg_path
    # if pkg_path is not None and pkg_path not in sys.path:
    #     sys.path.insert(0, pkg_path)

    logging.add_console_handler()
    logging.add_file_handler(args.output_dir / "LOG")
    logging.set_level('info')
    logging.info(f"Log stream to {args.output_dir/'LOG'})")
    info = logging.info
    with Timer():
        main()
Example #13
    def __init__(self,
                 model,
                 sample_n,
                 x_data,
                 save_to,
                 param1_name,
                 param1_range,
                 param2_name,
                 param2_range,
                 param1_log=False,
                 param2_log=False,
                 model_kwargs=None,
                 bootstrap_num=100,
                 bs_record_num=50,
                 bs_method='data',
                 bs_stats=None,
                 grouper=None,
                 record_full=False,
                 conv_reps=20,
                 conv_stats=None,
                 conv_init_range=None,
                 fitting_kwargs=None,
                 seed=23):
        from ..utility.func_tools import AttrScope
        from ..utility.file_tools import dump_json
        from pathlib import Path

        # assign model
        self.model = model
        self.parameters = None
        self.model_kwargs = model_kwargs if model_kwargs is not None else {}

        # assign parameter
        self.param1 = AttrScope(name=param1_name,
                                range=param1_range,
                                log=param1_log)
        self.param2 = AttrScope(name=param2_name,
                                range=param2_range,
                                log=param2_log)
        self.sample_n = sample_n

        # assign fitting specs
        fitting_kwargs = fitting_kwargs if fitting_kwargs is not None else {}
        self.batchfitter_kwargs = dict(
            bootstrap_num=bootstrap_num,
            bs_record_num=bs_record_num,
            bs_method=bs_method,
            bs_stats=bs_stats,
            grouper=grouper,
            record_full=record_full,
            conv_reps=conv_reps,
            conv_init_range=conv_init_range,
            conv_stats=conv_stats,
        )
        self.batchfitter_kwargs.update(fitting_kwargs)

        # assign x_data
        if not isinstance(x_data, pd.Series):
            x_data = pd.Series(x_data)
        self.x_data = x_data
        self.y_dataframe = None
        self.seed = seed
        self.results = None

        config = dict(sample_n=sample_n,
                      x_data=list(x_data),
                      param1_name=param1_name,
                      param1_range=param1_range,
                      param1_log=param1_log,
                      param2_name=param2_name,
                      param2_range=param2_range,
                      param2_log=param2_log,
                      bootstrap_num=bootstrap_num,
                      bs_record_num=bs_record_num,
                      bs_method=bs_method,
                      bs_stats=list(bs_stats.keys()) if isinstance(
                          bs_stats, dict) else bs_stats,
                      conv_reps=conv_reps,
                      conv_stats=list(conv_stats.keys()) if isinstance(
                          conv_stats, dict) else conv_stats,
                      conv_init_range=conv_init_range
                      if conv_init_range is None else list(conv_init_range),
                      seed=seed)

        # create saving path and save config
        self.save_to = Path(save_to)
        if not self.save_to.exists():
            self.save_to.mkdir(parents=True)
            logging.info(f'Output dir {str(self.save_to)} created')
        dump_json(config, path=self.save_to.joinpath('config.json'))
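
AttrScope (from k_seq.utility.func_tools) is used here as a plain attribute namespace; a minimal stand-in, assuming it behaves roughly like types.SimpleNamespace:

from types import SimpleNamespace

# what the AttrScope calls above provide: dot access to grouped values
param1 = SimpleNamespace(name='k', range=(1e-2, 1e2), log=True)
print(param1.name, param1.range, param1.log)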
Example #14
    def simulate_samples(self,
                         grid=True,
                         const_err=None,
                         rel_err=None,
                         y_enforce_positive=True):
        """Simulate a set of samples (param1 and param2)"""

        logging.info(
            f"Simulating dataset with const_err: {const_err}, rel_err: {rel_err}, "
            f"y_enforce_positive: {y_enforce_positive}...")

        if self.seed is not None:
            np.random.seed(self.seed)

        if grid:
            n_cell = int(np.sqrt(self.sample_n)) + 1
            if self.param1.log:
                param1 = np.logspace(np.log10(self.param1.range[0]),
                                     np.log10(self.param1.range[1]), n_cell)
            else:
                param1 = np.linspace(self.param1.range[0],
                                     self.param1.range[1], n_cell)
            if self.param2.log:
                param2 = np.logspace(np.log10(self.param2.range[0]),
                                     np.log10(self.param2.range[1]), n_cell)
            else:
                param2 = np.linspace(self.param2.range[0],
                                     self.param2.range[1], n_cell)

            self.parameters = pd.DataFrame({
                self.param1.name:
                np.repeat(np.expand_dims(param1, -1), n_cell,
                          -1).T.reshape(-1),
                self.param2.name:
                np.repeat(param2, n_cell)
            })
        else:
            self.parameters = pd.DataFrame({
                self.param1.name:
                _parameter_gen(self.param1.range,
                               self.param1.log,
                               size=self.sample_n),
                self.param2.name:
                _parameter_gen(self.param2.range,
                               self.param2.log,
                               size=self.sample_n)
            })

        def partial_model(param):

            y = self.model(self.x_data, **param.to_dict(), **self.model_kwargs)
            if not isinstance(y, pd.Series) and isinstance(
                    self.x_data, pd.Series):
                y = pd.Series(y, index=self.x_data.index)
            return y

        self.y_dataframe = self.parameters.apply(partial_model, axis=1)

        if const_err is not None:
            self.y_dataframe += np.random.normal(loc=0,
                                                 scale=const_err,
                                                 size=self.y_dataframe.shape)
        if rel_err is not None:
            self.y_dataframe += np.random.normal(loc=0,
                                                 scale=self.y_dataframe *
                                                 rel_err,
                                                 size=self.y_dataframe.shape)

        if y_enforce_positive:
            self.y_dataframe[self.y_dataframe < 0] = 0

        logging.info('Simulation done.')

        self.save_to.joinpath('data').mkdir(exist_ok=True)
        self.parameters.to_csv(self.save_to.joinpath('data', 'parameters.csv'))
        self.y_dataframe.to_csv(self.save_to.joinpath('data', 'y.csv'))
        self.x_data.to_csv(self.save_to.joinpath('data', 'x.csv'))
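
The grid branch above enumerates every (param1, param2) combination; the repeat/expand_dims/reshape dance is equivalent to np.meshgrid, as this small sketch (illustration only) shows:

import numpy as np
import pandas as pd

n_cell = 3
param1 = np.linspace(0.0, 1.0, n_cell)
param2 = np.logspace(-2, 0, n_cell)

# one row per (param1, param2) pair, in the same order as the code above
g1, g2 = np.meshgrid(param1, param2)
parameters = pd.DataFrame({'param1': g1.reshape(-1),
                           'param2': g2.reshape(-1)})
print(parameters)  # 9 rows: param1 tiled, param2 repeated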
Example #15
File: simu.py Project: ynshen/k-seq
def simulate_counts(uniq_seq_num,
                    x_values,
                    total_reads,
                    p0_generator=None,
                    kinetic_model=None,
                    count_model=None,
                    total_amount_error=None,
                    param_sample_from_df=None,
                    weights=None,
                    replace=True,
                    reps=1,
                    seed=None,
                    note=None,
                    save_to=None,
                    **param_generators):
    from ..model import pool

    if seed is not None:
        np.random.seed(seed)

    # default models
    # kinetic_model: BYO first-order model returns absolute amount
    # count_model: multinomial
    if kinetic_model is None:
        from ..model import kinetic
        kinetic_model = kinetic.BYOModel.amount_first_order(broadcast=False)
        logging.info(
            'No kinetic model provided, use BYOModel.amount_first_order')
    if count_model is None:
        from ..model import count
        count_model = count.multinomial
        logging.info('No count model provided, use multinomial distribution')

    # compose sequence parameter from (with priority high to low)
    # 1. p0
    # 2. param_generator
    # 3. param_sample_from_df

    param_table = pd.DataFrame(index=np.arange(uniq_seq_num))
    logging.info(f'param_table created, param_table shape {param_table.shape}')
    # if sample p0 from a generator
    if p0_generator is not None:
        param_table['p0'] = PoolParamGenerator.sample_from_iid_dist(
            p0=p0_generator, uniq_seq_num=uniq_seq_num)['p0']
        logging.info(
            f'p0 added from distribution, param_table shape {param_table.shape}'
        )
    # if extra param_generator detected
    if param_generators:
        temp_table = PoolParamGenerator.sample_from_iid_dist(
            uniq_seq_num=uniq_seq_num, **param_generators)
        col_name = temp_table.columns[~temp_table.columns.isin(
            param_table.columns)]
        # note: concat on axis=1 must keep column names, so no ignore_index here
        param_table = pd.concat([param_table, temp_table[col_name]], axis=1)
        logging.info(
            f'{list(param_generators.keys())} added from distribution, param_table shape {param_table.shape}'
        )

    # if a param dataframe is provided
    if param_sample_from_df is not None:
        temp_table = PoolParamGenerator.sample_from_dataframe(
            df=param_sample_from_df,
            uniq_seq_num=uniq_seq_num,
            replace=replace,
            weights=weights)
        col_name = temp_table.columns[~temp_table.columns.isin(
            param_table.columns)]
        param_table = pd.concat([param_table, temp_table[col_name]], axis=1)
        logging.info(
            f'{col_name} added from dataframe, param_table shape {param_table.shape}'
        )
    param_table.index.name = 'seq'

    # get pool model
    pool_model = pool.PoolModel(count_model=count_model,
                                kinetic_model=kinetic_model,
                                param_table=param_table)
    x = {}
    Y = {}
    total_amount = {}

    if is_numeric(total_reads):
        total_reads = np.repeat(total_reads, len(x_values))
    for sample_ix, (c, n) in enumerate(zip(x_values, total_reads)):
        if reps is None or reps == 1:
            sample_names = [f"s{sample_ix}"]
        else:
            sample_names = [f"s{sample_ix}-{rep}" for rep in range(reps)]
        for name in sample_names:
            total_amount[name], Y[name] = pool_model.predict(c=c, N=n)
            x[name] = {'c': c, 'N': n}
    # return x, Y, total_amounts, param_table
    x = pd.DataFrame.from_dict(x, orient='columns')
    Y = pd.DataFrame.from_dict(Y, orient='columns')
    total_amount = pd.Series(total_amount)

    if total_amount_error is not None:
        if is_numeric(total_amount_error):
            total_amount += np.random.normal(loc=0,
                                             scale=total_amount_error,
                                             size=len(total_amount))
        elif callable(total_amount_error):
            total_amount = total_amount.apply(total_amount_error)
        else:
            logging.error('Unknown total_amount_error type',
                          error_type=TypeError)

    x.index.name = 'param'
    Y.index.name = 'seq'
    total_amount.index.name = 'amount'

    # return x, Y, total_amount, param_table, None
    from .seq_data import SeqData

    # by convention, input (unreacted) samples are marked by a negative concentration
    input_samples = x.loc['c']
    input_samples = list(input_samples[input_samples < 0].index)

    seq_table = SeqData(
        data=Y,
        x_values=x.loc['c'],
        note=note,
        data_unit='counts',
        grouper={
            'input':
            input_samples,
            'reacted':
            [sample for sample in x.columns if sample not in input_samples]
        })
    seq_table.add_sample_total(total_amounts=total_amount.to_dict(),
                               full_table=seq_table.table.original)
    seq_table.table.abs_amnt = seq_table.sample_total.apply(
        target=seq_table.table.original)

    from .transform import ReactedFractionNormalizer
    reacted_frac = ReactedFractionNormalizer(input_samples=input_samples,
                                             reduce_method='median',
                                             remove_empty=True)
    seq_table.table.reacted_frac = reacted_frac.apply(seq_table.table.abs_amnt)
    from .filters import DetectedTimesFilter
    seq_table.table.seq_in_all_smpl_reacted_frac = DetectedTimesFilter(
        min_detected_times=seq_table.table.reacted_frac.shape[1])(
            seq_table.table.reacted_frac)
    seq_table.truth = param_table

    if save_to is not None:
        from pathlib import Path
        save_path = Path(save_to)
        if save_path.suffix == '':
            save_path.mkdir(parents=True, exist_ok=True)
            total_amount.to_csv(f'{save_path}/dna_amount.csv')
            x.to_csv(f'{save_path}/x.csv')
            Y.to_csv(f'{save_path}/Y.csv')
            param_table.to_csv(f'{save_path}/truth.csv')
            seq_table.to_pickle(f"{save_path}/seq_table.pkl")
        else:
            logging.error('save_to should be a directory',
                          error_type=TypeError)
    return x, Y, total_amount, param_table, seq_table
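
A hedged usage sketch based on the signature above (the parameter names k and A assume the default BYO first-order kinetic model, and the module path is an assumption; sizes and concentrations are illustrative):

import numpy as np
from k_seq.data.simu import simulate_counts   # import path is an assumption

x, Y, total_amount, truth, seq_table = simulate_counts(
    uniq_seq_num=100,
    x_values=[-1, 2e-6, 10e-6, 50e-6],   # a negative c marks the input sample
    total_reads=10**6,
    p0_generator=(p for p in np.random.dirichlet(np.ones(100))),
    k=lambda size: np.random.lognormal(mean=0, sigma=2, size=size),
    A=lambda size: np.random.uniform(0, 1, size=size),
    reps=3,
    seed=23,
)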
Example #16
    def __init__(self,
                 y_dataframe,
                 x_data,
                 model,
                 x_label=None,
                 y_label=None,
                 seq_to_fit=None,
                 sigma=None,
                 bounds=None,
                 init_guess=None,
                 opt_method='trf',
                 exclude_zero=False,
                 metrics=None,
                 rnd_seed=None,
                 curve_fit_kwargs=None,
                 replicates=None,
                 bootstrap_num=0,
                 bs_record_num=0,
                 bs_method='pct_res',
                 bs_stats=None,
                 grouper=None,
                 record_full=False,
                 conv_reps=0,
                 conv_init_range=None,
                 conv_stats=None,
                 note=None,
                 large_dataset=False,
                 verbose=1,
                 result_path=None):

        from ..utility.func_tools import AttrScope, get_func_params

        super().__init__()

        logging.info('Creating the BatchFitter...')

        self.model = model
        self.note = note

        # parse y_dataframe
        from ..utility.file_tools import table_object_to_dataframe
        self.y_dataframe = table_object_to_dataframe(y_dataframe)

        # process seq_to_fit
        if seq_to_fit is not None:
            if isinstance(seq_to_fit, (list, np.ndarray, pd.Series)):
                self.seq_to_fit = list(seq_to_fit)
            elif isinstance(seq_to_fit, int):
                self.seq_to_fit = y_dataframe.index[:seq_to_fit].values
            else:
                logging.error(
                    'Unknown seq_to_fit type, is it list-like or int?',
                    error_type=TypeError)
        else:
            self.seq_to_fit = seq_to_fit

        # prep fitting params shared by all fittings
        if isinstance(x_data, pd.Series):
            self.x_data = x_data[y_dataframe.columns.values]
        elif len(x_data) != y_dataframe.shape[1]:
            logging.error(
                'x_data length and table column number does not match',
                error_type=ValueError)
        else:
            self.x_data = np.array(x_data)

        if sigma is not None:
            if np.shape(sigma) != np.shape(self.y_dataframe):
                logging.error(
                    'Shape of sigma does not match the shape of y_dataframe',
                    error_type=ValueError)
        self.sigma = sigma

        if bounds is None:
            bounds = (-np.inf, np.inf)

        if len(x_data) <= 1:
            logging.warning(
                "Fewer than 2 data points; bootstrap will not be performed"
            )
            bootstrap_num = 0
        self.bootstrap = bootstrap_num > 0

        # arguments to be passed to each single estimator
        self.fit_params = AttrScope(
            x_data=self.x_data,
            x_label=x_label,
            y_label=y_label,
            model=self.model,
            bounds=bounds,
            init_guess=init_guess,
            opt_method=opt_method,
            exclude_zero=exclude_zero,
            metrics=metrics,
            rnd_seed=rnd_seed,
            curve_fit_kwargs=curve_fit_kwargs,
            replicates=replicates,
            bootstrap_num=bootstrap_num,
            bs_record_num=bs_record_num,
            bs_method=bs_method,
            bs_stats=bs_stats,
            grouper=grouper if bs_method == 'stratified' else None,
            record_full=record_full,
            conv_reps=conv_reps,
            conv_init_range=conv_init_range,
            conv_stats=conv_stats,
            verbose=verbose,
        )
        if result_path is None:
            self.results = BatchFitResults(estimator=self)
        else:
            self.results = BatchFitResults.load_result(result_path)
        self.large_dataset = large_dataset
        self.results.large_dataset = large_dataset
        self.workers = None

        logging.info('BatchFitter created')
Example #17
def test_logging_can_log():
    logging.info('Some info')
    logging.warning("Some warning")
    with raises(ValueError):
        logging.error("let's get some ValueError", error_type=ValueError)