Exemplo n.º 1
0
def _get_dataset_fits(data, dataset, fitter, k_of_n, n_correlation_iterations, correlations_k_of_n, allow_new_computation):
    def arg_mapper(gr,f_proxy):
        g,r = gr
        series = dataset.get_one_series(g,r)
        return f_proxy(series,fitter)
        
    # sharding is done by gene, so plots.plot_and_save_all_genes can work on a shard
    # this also requires that the list of all genes be taken from the whole data
    # and not from each dataset. Otherwise we can get a mismatch between the genes 
    # in the shard for different datasets.
    dataset_fits = job_splitting.compute(
        name = 'fits',
        f = _compute_fit,
        arg_mapper = arg_mapper,
        all_keys = list(product(dataset.gene_names,dataset.region_names)),
        all_sharding_keys = data.gene_names,
        f_sharding_key = lambda gr: gr[0],
        k_of_n = k_of_n,
        base_filename = fit_results_relative_path(dataset,fitter),
        allow_new_computation = allow_new_computation,
    )
    
    if n_correlation_iterations > 0:
        # The problem is that if we're using a shard for the basic fits we won't have theta for all genes in a region
        # which is necessary for computing correlations in that region.
        assert k_of_n is None, "Can't perform correlation computations when sharding is enabled at the basic fit level" 
        _add_dataset_correlation_fits(dataset, fitter, dataset_fits, n_correlation_iterations, correlations_k_of_n, allow_new_computation)

    if cfg.verbosity > 0:
        print 'Adding fit scores... ',
    _add_scores(dataset, dataset_fits)
    if cfg.verbosity > 0:
        print 'done!'
    
    return dataset_fits
Exemplo n.º 2
0
def _add_dataset_correlation_fits(dataset, fitter, ds_fits, n_iterations,
                                  k_of_n, allow_new_computation):
    """Compute the correlation fits of `dataset` and attach them to `ds_fits`.

    One job is created per region for the full fit (leave-one-out point is
    None), plus one job per (age index, gene index) point of that region.
    The jobs are run through job_splitting.compute and the resulting
    dictionary is handed to
    _add_dataset_correlation_fits_from_results_dictionary.

    dataset -- supplies region_names, gene_names and get_several_series
    fitter -- handed to the fit function and used to build the results path
    ds_fits -- basic fits, indexed by (gene, region); each entry has a theta
    n_iterations -- passed to the fit function and embedded in the filename
    k_of_n -- forwarded to job_splitting.compute
    allow_new_computation -- forwarded to job_splitting.compute
    """
    def arg_mapper(key, f_proxy):
        # A key is (region index, leave-one-out point or None).
        region_index, loo_point = key
        region = dataset.region_names[region_index]
        region_series = dataset.get_several_series(dataset.gene_names, region)
        thetas = [ds_fits[gene, region].theta for gene in dataset.gene_names]
        return f_proxy(region_series, fitter, thetas, loo_point, n_iterations)

    def f_sharding_key(key):
        # All x points of the same region and gene index share one shard.
        region_index, loo_point = key
        if loo_point is None:
            return (region_index, None)
        _ix, iy = loo_point
        return (region_index, iy)

    all_keys = []
    for region_index, region in enumerate(dataset.region_names):
        # The full-data fit of the region comes first ...
        all_keys.append((region_index, None))
        region_series = dataset.get_several_series(dataset.gene_names, region)
        # ... followed by every leave-one-out point, gene index outermost.
        all_keys.extend(
            (region_index, (ix, iy))
            for iy, _gene in enumerate(dataset.gene_names)
            for ix in xrange(len(region_series.ages))
        )

    basic_path = fit_results_relative_path(dataset, fitter)
    correlations_path = basic_path + '-correlations-{}'.format(n_iterations)

    dct_results = job_splitting.compute(
        name='fits-correlations',
        f=_compute_fit_with_correlations,
        arg_mapper=arg_mapper,
        all_keys=all_keys,
        f_sharding_key=f_sharding_key,
        k_of_n=k_of_n,
        base_filename=correlations_path,
        allow_new_computation=allow_new_computation,
    )
    _add_dataset_correlation_fits_from_results_dictionary(
        dataset, ds_fits, dct_results)
Exemplo n.º 3
0
def _get_dataset_fits(data, dataset, fitter, k_of_n, n_correlation_iterations,
                      correlations_k_of_n, allow_new_computation):
    def arg_mapper(gr, f_proxy):
        g, r = gr
        series = dataset.get_one_series(g, r)
        return f_proxy(series, fitter)

    # sharding is done by gene, so plots.plot_and_save_all_genes can work on a shard
    # this also requires that the list of all genes be taken from the whole data
    # and not from each dataset. Otherwise we can get a mismatch between the genes
    # in the shard for different datasets.
    dataset_fits = job_splitting.compute(
        name='fits',
        f=_compute_fit,
        arg_mapper=arg_mapper,
        all_keys=list(product(dataset.gene_names, dataset.region_names)),
        all_sharding_keys=data.gene_names,
        f_sharding_key=lambda gr: gr[0],
        k_of_n=k_of_n,
        base_filename=fit_results_relative_path(dataset, fitter),
        allow_new_computation=allow_new_computation,
    )

    if n_correlation_iterations > 0:
        # The problem is that if we're using a shard for the basic fits we won't have theta for all genes in a region
        # which is necessary for computing correlations in that region.
        assert k_of_n is None, "Can't perform correlation computations when sharding is enabled at the basic fit level"
        _add_dataset_correlation_fits(dataset, fitter, dataset_fits,
                                      n_correlation_iterations,
                                      correlations_k_of_n,
                                      allow_new_computation)

    if cfg.verbosity > 0:
        print 'Adding fit scores... ',
    _add_scores(dataset, dataset_fits)
    if cfg.verbosity > 0:
        print 'done!'

    return dataset_fits
Exemplo n.º 4
0
def _add_dataset_correlation_fits(dataset, fitter, ds_fits, n_iterations, k_of_n, allow_new_computation):
    """Run the correlation fits for `dataset` and merge them into `ds_fits`.

    Job keys have the form (region index, loo_point), where loo_point is
    either None or an (age index, gene index) pair. The keys are run through
    job_splitting.compute with _compute_fit_with_correlations, and the
    results dictionary is passed on to
    _add_dataset_correlation_fits_from_results_dictionary.

    Nothing is returned; the final helper receives `ds_fits` together with
    the results (presumably updating it in place -- confirm in that helper).
    """
    def iter_keys():
        # Per region: the None key first, then every (ix, iy) point with the
        # gene index varying slowest and the age index varying fastest.
        for region_idx, region in enumerate(dataset.region_names):
            yield (region_idx, None)
            region_series = dataset.get_several_series(dataset.gene_names, region)
            for gene_idx, _gene in enumerate(dataset.gene_names):
                for age_idx in xrange(len(region_series.ages)):
                    yield (region_idx, (age_idx, gene_idx))

    def arg_mapper(key, f_proxy):
        # Expand a job key into the arguments of the fit function: the
        # region's series for all genes plus the basic theta of each gene.
        region_idx, loo_point = key
        region = dataset.region_names[region_idx]
        region_series = dataset.get_several_series(dataset.gene_names, region)
        gene_thetas = [ds_fits[(gene, region)].theta for gene in dataset.gene_names]
        return f_proxy(region_series, fitter, gene_thetas, loo_point, n_iterations)

    def f_sharding_key(key):
        # Keep all x points of the same region and gene index in one shard.
        region_idx, loo_point = key
        if loo_point is not None:
            _ix, iy = loo_point
            return (region_idx, iy)
        return (region_idx, None)

    all_keys = list(iter_keys())
    filename_suffix_template = '-correlations-{}'

    dct_results = job_splitting.compute(
        name='fits-correlations',
        f=_compute_fit_with_correlations,
        arg_mapper=arg_mapper,
        all_keys=all_keys,
        f_sharding_key=f_sharding_key,
        k_of_n=k_of_n,
        base_filename=fit_results_relative_path(dataset, fitter) + filename_suffix_template.format(n_iterations),
        allow_new_computation=allow_new_computation,
    )
    _add_dataset_correlation_fits_from_results_dictionary(dataset, ds_fits, dct_results)