def resolved_data(self):
    data = self.data
    log = (
        log_durations(logger.debug, "resolving values")
        if self._enable_parametrization
        else nullcontext()
    )
    with log:
        data = self.resolver.resolve()
    return data.get("stages", {})

def resolved_data(self):
    data = self.data
    if self._enable_parametrization:
        wdir = PathInfo(self.dvcfile.path).parent
        with log_durations(logger.debug, "resolving values"):
            resolver = DataResolver(self.repo, wdir, data)
            data = resolver.resolve()
    return data.get("stages", {})

def stages(self):
    data, _ = self._load()
    if self.repo.config["feature"]["parametrization"]:
        with log_durations(logger.debug, "resolving values"):
            resolver = DataResolver(data)
            data = resolver.resolve()
    lockfile_data = self._lockfile.load()
    return StageLoader(self, data.get("stages", {}), lockfile_data)

def get_balanced_permutations(balanced, permutations):
    balanced_permutations = balanced
    if not permutations.empty:
        logger.info('Estimating significance by permutation for %s' % analysis.analysis_name)
        with log_durations(logger.debug,
                           'Estimating significance by permutation for %s' % analysis.analysis_name):
            recs = []
            permutations.sort_index(inplace=True)  # to speed lookups
            # permutations.to_csv("permutations.csv")
            for mygene in balanced.index:
                perms = permutations.ix[mygene]

                random_TE = balanced.ix[mygene].random_TE
                random_side = 'right' if random_TE > 0 else 'left'
                random_perms = perms.random_TE.order()
                random_nperms = random_perms.count()
                random_rank = random_perms.searchsorted(random_TE, side=random_side)[0]
                if random_side == 'right':
                    random_rank = random_nperms - random_rank
                random_pval_perm = float(random_rank) / random_nperms

                fixed_TE = balanced.ix[mygene].fixed_TE
                fixed_side = 'right' if fixed_TE > 0 else 'left'
                fixed_perms = perms.fixed_TE.order()
                fixed_nperms = fixed_perms.count()
                fixed_rank = fixed_perms.searchsorted(fixed_TE, side=fixed_side)[0]
                if fixed_side == 'right':
                    fixed_rank = fixed_nperms - fixed_rank
                fixed_pval_perm = float(fixed_rank) / fixed_nperms

                rec = dict(random_rank=random_rank,
                           random_nperms=random_nperms,
                           random_pval_perm=random_pval_perm,
                           fixed_rank=fixed_rank,
                           fixed_nperms=fixed_nperms,
                           fixed_pval_perm=fixed_pval_perm)
                recs.append(rec)
            df = pd.DataFrame(recs)
            df.index = balanced.index
            balanced_permutations = balanced.join(df)
    return balanced_permutations

def get_balanced_permutations(analysis, balanced, permutations):
    balanced_permutations = balanced
    if not permutations.empty:
        logger.info("Estimating significance by permutation for %s" % analysis.analysis_name)
        with log_durations(
            logger.debug, "Estimating significance by permutation for %s" % analysis.analysis_name
        ):
            recs = []
            permutations.sort_index(inplace=True)  # to speed lookups
            # permutations.to_csv("permutations.csv")
            for mygene in balanced.index:
                perms = permutations.ix[mygene]

                random_TE = balanced.ix[mygene].random_TE
                random_side = "right" if random_TE > 0 else "left"
                random_perms = perms.random_TE.order()
                random_nperms = random_perms.count()
                random_rank = random_perms.searchsorted(random_TE, side=random_side)[0]
                if random_side == "right":
                    random_rank = random_nperms - random_rank
                random_pval_perm = float(random_rank) / random_nperms

                fixed_TE = balanced.ix[mygene].fixed_TE
                fixed_side = "right" if fixed_TE > 0 else "left"
                fixed_perms = perms.fixed_TE.order()
                fixed_nperms = fixed_perms.count()
                fixed_rank = fixed_perms.searchsorted(fixed_TE, side=fixed_side)[0]
                if fixed_side == "right":
                    fixed_rank = fixed_nperms - fixed_rank
                fixed_pval_perm = float(fixed_rank) / fixed_nperms

                rec = dict(
                    random_rank=random_rank,
                    random_nperms=random_nperms,
                    random_pval_perm=random_pval_perm,
                    fixed_rank=fixed_rank,
                    fixed_nperms=fixed_nperms,
                    fixed_pval_perm=fixed_pval_perm,
                )
                recs.append(rec)
            df = pd.DataFrame(recs)
            df.index = balanced.index
            balanced_permutations = balanced.join(df)
    return balanced_permutations

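The per-gene rank arithmetic above turns an observed effect size (TE) into a one-sided permutation p-value. As a minimal illustration of that step on a plain NumPy array (the original operates on pandas Series per gene), here is a sketch; the perm_pval name and the sample inputs are illustrative and not part of the original code.

# Minimal sketch of the one-sided permutation p-value computed per gene above,
# using NumPy instead of pandas. `perm_pval` and the sample data are illustrative.
import numpy as np

def perm_pval(observed_te, permuted_tes):
    perms = np.sort(np.asarray(permuted_tes, dtype=float))
    side = "right" if observed_te > 0 else "left"
    rank = np.searchsorted(perms, observed_te, side=side)
    if side == "right":
        # count permutations at least as extreme as the observed (positive) effect
        rank = len(perms) - rank
    return float(rank) / len(perms)

rng = np.random.default_rng(0)
print(perm_pval(1.2, rng.normal(size=1000)))  # small p-value for a large observed TE
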
def perform_analysis(analysis, debug=False):
    logger.info('Started %s analysis', analysis.analysis_name)

    with log_durations(logger.debug, 'Loading dataframe for %s' % analysis.analysis_name):
        df = get_analysis_df(analysis.case_query, analysis.control_query, analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)

    logger.info('Matching sources: %d' % df.groupby(['series_id', 'platform_id']).ngroups)

    # Remove single-class sources
    query = df.groupby(['series_id', 'platform_id']).sample_class.agg(lambda x: set(x)) >= {0, 1}
    df = filter_sources(df, query, 'as single-class')

    # Check for minimum number of samples
    if analysis.min_samples:
        counts = df.groupby(['series_id', 'platform_id']).sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >= analysis.min_samples)
        df = filter_sources(df, query, 'by min samples')

    # Check number of sources
    sources = df.groupby(['series_id', 'platform_id']).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s"
                     % ('single source' if sources else 'no data'))
        return

    # Calculating stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info('Stats: %d sources, %d series, %d platforms, %d samples'
                % (sources, analysis.series_count, analysis.platform_count, analysis.sample_count))

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info('Loading data and calculating fold changes for %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Load/fold for %s' % analysis.analysis_name):
        gses = (load_gse(df, series_id) for series_id in sorted(df.series_id.unique()))
        fold_changes = pd.concat(imap(get_fold_change_analysis, gses))
        debug and fold_changes.to_csv("%s.fc.csv" % debug)

    logger.info('Meta-Analyzing %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Meta analysis for %s' % analysis.analysis_name):
        balanced = getFullMetaAnalysis(fold_changes, debug=debug).reset_index()
    debug and balanced.to_csv("%s.meta.csv" % debug)

    # logger.info('Inserting %s analysis results', analysis.analysis_name)
    # with log_durations(logger.debug, 'Saving results of %s' % analysis.analysis_name):  # , \
    #     # transaction.atomic():
    #     balanced['analysis'] = analysis
    #     balanced.columns = balanced.columns.map(lambda x: x.replace(".", "_").lower())
    #     field_names = [f.name for f in MetaAnalysis._meta.fields if f.name != 'id']
    #     rows = balanced[field_names].T.to_dict().values()
    #     # Delete old values in case we are recalculating the analysis
    #     MetaAnalysis.objects.filter(analysis=analysis).delete()
    #     MetaAnalysis.objects.bulk_create(MetaAnalysis(**row) for row in rows)

    logger.info('DONE %s analysis', analysis.analysis_name)
    return balanced

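The perform_analysis variants in this collection all call a filter_sources helper that is not shown here. Below is a hypothetical sketch of what such a helper could do, assuming query is a boolean Series indexed by (series_id, platform_id); it is an assumption for illustration, not the project's actual implementation.

# Hypothetical filter_sources: keep only rows whose (series_id, platform_id) group
# passes the boolean `query` Series, logging why the rest were dropped.
# Assumed behaviour, not the original implementation.
def filter_sources(df, query, reason):
    passed = query[query].index  # (series_id, platform_id) pairs that pass the check
    dropped = query.size - len(passed)
    if dropped:
        logger.info('Dropped %d sources %s', dropped, reason)
    keys = pd.MultiIndex.from_arrays([df.series_id, df.platform_id])
    return df[keys.isin(passed)]
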
def perform_analysis(conn, analysis, debug=False):
    cursor = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    logger.info('Started %s analysis', analysis.analysis_name)

    with log_durations(logger.debug, 'Loading dataframe for %s' % analysis.analysis_name):
        df = get_analysis_df(conn, analysis.case_query, analysis.control_query, analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)

    logger.info('Matching sources: %d' % df.groupby(['series_id', 'platform_id']).ngroups)

    # Remove single-class sources
    query = df.groupby(['series_id', 'platform_id']).sample_class.agg(lambda x: set(x)) >= {0, 1}
    df = filter_sources(df, query, 'as single-class')

    # Check for minimum number of samples
    if analysis.min_samples:
        counts = df.groupby(['series_id', 'platform_id']).sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >= analysis.min_samples)
        df = filter_sources(df, query, 'by min samples')

    # Check number of sources
    sources = df.groupby(['series_id', 'platform_id']).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s"
                     % ('single source' if sources else 'no data'))
        return

    # Calculating stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info('Stats: %d sources, %d series, %d platforms, %d samples'
                % (sources, analysis.series_count, analysis.platform_count, analysis.sample_count))

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info('Loading data and calculating fold changes for %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Load/fold for %s' % analysis.analysis_name):
        gses = (load_gse(cursor, df, series_id) for series_id in sorted(df.series_id.unique()))
        fold_changes = pd.concat(imap(get_fold_change_analysis, gses))
        debug and fold_changes.to_csv("%s.fc.csv" % debug)

    logger.info('Meta-Analyzing %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Meta analysis for %s' % analysis.analysis_name):
        balanced = getFullMetaAnalysis(fold_changes, debug=debug).reset_index()
    debug and balanced.to_csv("%s.meta.csv" % debug)

    # logger.info('Inserting %s analysis results', analysis.analysis_name)
    # with log_durations(logger.debug, 'Saving results of %s' % analysis.analysis_name):  # , \
    #     # transaction.atomic():
    #     balanced['analysis'] = analysis
    #     balanced.columns = balanced.columns.map(lambda x: x.replace(".", "_").lower())
    #     field_names = [f.name for f in MetaAnalysis._meta.fields if f.name != 'id']
    #     rows = balanced[field_names].T.to_dict().values()
    #     # Delete old values in case we are recalculating the analysis
    #     MetaAnalysis.objects.filter(analysis=analysis).delete()
    #     MetaAnalysis.objects.bulk_create(MetaAnalysis(**row) for row in rows)

    logger.info('DONE %s analysis', analysis.analysis_name)
    return balanced

def perform_analysis(analysis, debug=False, impute=False, nperm=0, mygene_filter=None):
    """
    Returns a tuple of sample_df, fold_change, balanced_permutations, permutations
    """
    logger.info('Started %s analysis', analysis.analysis_name)
    # from multiprocessing import Pool
    # pool = Pool(processes=4)

    with log_durations(logger.debug, 'Loading dataframe for %s' % analysis.analysis_name):
        df = get_analysis_df(analysis.case_query, analysis.control_query, analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)

    logger.info('Matching sources: %d' % df.groupby(['series_id', 'platform_id']).ngroups)

    # Remove single-class sources
    query = df.groupby(['series_id', 'platform_id']).sample_class \
              .agg(lambda x: set(x)).map(lambda x: x >= {0, 1})
    df = filter_sources(df, query, 'as single-class')

    # Check for minimum number of samples
    if analysis.min_samples:
        counts = df.groupby(['series_id', 'platform_id']).sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >= analysis.min_samples)
        df = filter_sources(df, query, 'by min samples')

    # Check number of sources
    sources = df.groupby(['series_id', 'platform_id']).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s"
                     % ('single source' if sources else 'no data'))
        return df, None, None, None

    # Calculating stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info('Stats: %d sources, %d series, %d platforms, %d samples'
                % (sources, analysis.series_count, analysis.platform_count, analysis.sample_count))

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info('Loading data and calculating fold change for %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Load/fold for %s' % analysis.analysis_name):
        gses = (load_gse(df, series_id, impute) for series_id in sorted(df.series_id.unique()))
        debugs = [debug] * df.series_id.nunique()
        nperms = [nperm] * df.series_id.nunique()
        mygene_filters = [mygene_filter] * df.series_id.nunique()
        # start a pool with 4 processes
        fold_change = pd.concat(imap(get_gene_fold_change, gses, debugs, nperms, mygene_filters))
        # fold_change = pd.concat(pool.imap(multi_run_wrapper, zip(gses, debugs, nperms)))
        debug and fold_change.to_csv("%s.fc.csv" % debug)

    # Start meta-analysis
    logger.info('Meta-Analyzing %s', analysis.analysis_name)
    with log_durations(logger.debug, 'Meta analysis for %s' % analysis.analysis_name):
        # logger.info('Meta analysis of real data for %s' % analysis.analysis_name)
        with log_durations(logger.debug, 'meta analysis of real data for %s' % analysis.analysis_name):
            balanced = get_full_meta(fold_change.query("""perm == 0"""), debug=debug)
            debug and balanced.to_csv("%s.meta.csv" % debug)

        # logger.info('Meta-Analyzing of permutations for %s', analysis.analysis_name)
        with log_durations(logger.debug, 'meta analysis of permutations for %s' % analysis.analysis_name):
            permutations = pd.DataFrame()
            fold_change = fold_change.reset_index().sort('perm').set_index('perm')
            for i in range(nperm):
                perm = i + 1
                # logger.info('Meta analysis of permutation %s for %s' % (perm, analysis.analysis_name))
                with log_durations(logger.debug,
                                   'meta analysis of permutation %s / %s for %s'
                                   % (perm, nperm, analysis.analysis_name)):
                    # balanced_perm = get_full_meta(fold_change.query("""perm == %s""" % perm), debug=debug)
                    balanced_perm = get_full_meta(fold_change.ix[perm], debug=debug)
                    permutation = balanced_perm[['random_TE', 'fixed_TE']]
                    permutation['perm'] = perm
                    permutations = pd.concat([permutations, permutation])
            balanced_permutations = get_balanced_permutations(balanced, permutations)

    logger.info('DONE %s analysis', analysis.analysis_name)
    return df, fold_change, balanced_permutations, permutations

def identifier(self) -> str:
    """Unique identifier for the index.

    We can use this to optimize and skip opening some indices
    eg: on push/pull/fetch/gc --all-commits.

    Currently, it is unique to the platform (windows vs posix).
    """
    return dict_md5(self.dumpd())


if __name__ == "__main__":
    from funcy import log_durations

    from dvc.repo import Repo

    repo = Repo()
    index = Index(repo, repo.fs)
    print(index)
    with log_durations(print, "collecting stages"):
        # pylint: disable=pointless-statement
        print("no of stages", len(index.stages))
    with log_durations(print, "building graph"):
        index.build_graph()
    with log_durations(print, "calculating hash"):
        print(index.identifier)
    with log_durations(print, "updating"):
        index2 = index.update(index.stages)
    with log_durations(print, "calculating hash"):
        print(index2.identifier)

def perform_analysis(analysis, debug=False, impute=False, nperm=0, mygene_filter=None):
    """
    Returns a tuple of sample_df, fold_change, balanced_permutations, permutations
    """
    logger.info("Started %s analysis", analysis.analysis_name)
    # from multiprocessing import Pool
    # pool = Pool(processes=4)

    with log_durations(logger.debug, "Loading dataframe for %s" % analysis.analysis_name):
        df = get_analysis_df(analysis.case_query, analysis.control_query, analysis.modifier_query)
    debug and df.to_csv("%s.analysis_df.csv" % analysis.analysis_name)

    logger.info("Matching sources: %d" % df.groupby(["series_id", "platform_id"]).ngroups)

    # Remove single-class sources
    query = df.groupby(["series_id", "platform_id"]).sample_class.agg(lambda x: set(x)).map(lambda x: x >= {0, 1})
    df = filter_sources(df, query, "as single-class")

    # Check for minimum number of samples
    if not df.empty and analysis.min_samples:
        counts = df.groupby(["series_id", "platform_id"]).sample_class.value_counts().unstack()
        query = (counts[0] >= analysis.min_samples) & (counts[1] >= analysis.min_samples)
        df = filter_sources(df, query, "by min samples")

    # Check number of sources
    sources = df.groupby(["series_id", "platform_id"]).ngroups
    if sources <= 1:
        logger.error("FAIL Can't perform meta-analysis on %s" % ("single source" if sources else "no data"))
        return df, None, None, None

    # Calculating stats
    analysis.series_count = len(df.series_id.unique())
    analysis.platform_count = len(df.platform_id.unique())
    analysis.sample_count = len(df.sample_id.unique())
    analysis.series_ids = df.series_id.unique().tolist()
    analysis.platform_ids = df.platform_id.unique().tolist()
    analysis.sample_ids = df.sample_id.unique().tolist()
    # analysis.save(update_fields=['series_count', 'platform_count', 'sample_count',
    #                              'series_ids', 'platform_ids', 'sample_ids'])
    logger.info(
        "Stats: %d sources, %d series, %d platforms, %d samples"
        % (sources, analysis.series_count, analysis.platform_count, analysis.sample_count)
    )

    # Load GSE data, make and concat all fold change analyses results.
    # NOTE: we are doing load_gse() lazily here to avoid loading all matrices at once.
    logger.info("Loading data and calculating fold change for %s", analysis.analysis_name)
    with log_durations(logger.debug, "Load/fold for %s" % analysis.analysis_name):
        gses = (load_gse(df, series_id, impute) for series_id in sorted(df.series_id.unique()))
        debugs = [debug] * df.series_id.nunique()
        nperms = [nperm] * df.series_id.nunique()
        mygene_filters = [mygene_filter] * df.series_id.nunique()
        # start a pool with 4 processes
        fold_change = pd.concat(imap(get_gene_fold_change, gses, debugs, nperms, mygene_filters))
        # fold_change = pd.concat(pool.imap(multi_run_wrapper, zip(gses, debugs, nperms)))
        debug and fold_change.to_csv("%s.fc.csv" % debug)

    # Start meta-analysis
    logger.info("Meta-Analyzing %s", analysis.analysis_name)
    with log_durations(logger.debug, "Meta analysis for %s" % analysis.analysis_name):
        # logger.info('Meta analysis of real data for %s' % analysis.analysis_name)
        with log_durations(logger.debug, "meta analysis of real data for %s" % analysis.analysis_name):
            balanced = get_full_meta(fold_change.query("""perm == 0"""), debug=debug)
            if balanced is None:
                logger.error("FAIL Got empty meta-analysis")
                return df, fold_change, None, None
            debug and balanced.to_csv("%s.meta.csv" % debug)

        # logger.info('Meta-Analyzing of permutations for %s', analysis.analysis_name)
        with log_durations(logger.debug, "meta analysis of permutations for %s" % analysis.analysis_name):
            permutations = pd.DataFrame()
            fold_change = fold_change.reset_index().sort("perm").set_index("perm")
            for i in range(nperm):
                perm = i + 1
                # logger.info('Meta analysis of permutation %s for %s' % (perm, analysis.analysis_name))
                with log_durations(
                    logger.debug,
                    "meta analysis of permutation %s / %s for %s" % (perm, nperm, analysis.analysis_name),
                ):
                    # balanced_perm = get_full_meta(fold_change.query("""perm == %s""" % perm), debug=debug)
                    balanced_perm = get_full_meta(fold_change.ix[perm], debug=debug)
                    permutation = balanced_perm[["random_TE", "fixed_TE"]]
                    permutation["perm"] = perm
                    permutations = pd.concat([permutations, permutation])
            balanced_permutations = get_balanced_permutations(analysis, balanced, permutations)

    logger.info("DONE %s analysis", analysis.analysis_name)
    return df, fold_change, balanced_permutations, permutations

def resolved_data(self):
    data = self.data
    with log_durations(logger.debug, "resolving values"):
        data = self.resolver.resolve()
    return data.get("stages", {})

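All of the snippets above follow the same pattern: wrap a slow step in funcy's log_durations so its wall-clock duration is reported through an existing logger, either as a context manager around a block or as a decorator on a function. A minimal self-contained sketch of that pattern; slow_step and the label strings are illustrative, not taken from the snippets above.

# Minimal sketch of the shared funcy.log_durations pattern.
import logging
import time

from funcy import log_durations

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

@log_durations(logger.debug)  # decorator form: logs the duration of every call
def slow_step():
    time.sleep(0.1)

with log_durations(logger.debug, "whole batch"):  # context-manager form with a label
    for _ in range(3):
        slow_step()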