def get_corrs(data, adjust=identity, corr_func='pearson'):
    """Correlate each slice of every embryo with the nearest slice of every other embryo.

    Column names must contain ``_sl``: the text before it names the embryo
    and the two characters after it are parsed as the slice number.  A
    slice's position is its slice number divided by the largest slice
    number seen for its embryo.  For every ordered pair of distinct
    embryos, each column of the first embryo is correlated with the column
    of the second embryo whose position is closest (ties are resolved by
    the smaller column name, because ``(distance, name)`` tuples are
    compared).

    Parameters
    ----------
    data : DataFrame
        One column per slice, named as described above.
    adjust : callable
        Applied element-wise (``applymap``) to an embryo's columns before
        correlating.  NOTE: the adjusted frame of each embryo is computed
        once and reused, so ``adjust`` is assumed to be a pure function.
    corr_func :
        Passed as the second positional argument of ``Series.corr``.

    Returns
    -------
    list
        ``[corrs_diff, corrs_same]``; both are ``defaultdict(list)`` keyed
        by the first embryo's genotype (text before the first ``_``).
        ``corrs_same`` collects correlations against embryos of the same
        genotype, ``corrs_diff`` against embryos of a different genotype.
    """
    # Largest slice number observed for each embryo.
    max_slice = defaultdict(int)
    for column in data.columns:
        parts = column.split('_sl')
        emb = parts[0]
        max_slice[emb] = max(max_slice[emb], int(parts[1][0:2]))

    # Relative position of every slice within its embryo.  Every column is
    # known to contain '_sl' here (the loop above would have failed
    # otherwise), so no filtering is needed.
    xs = pd.Series(
        index=data.columns,
        data=[
            int(column.split('_sl')[1][:2]) / max_slice[column.split('_sl')[0]]
            for column in data.columns
        ],
    )

    # Per-embryo (adjusted columns, slice positions), built lazily so the
    # expensive ``applymap`` runs once per embryo instead of once per pair.
    # NOTE(review): ``select`` matches by prefix, so an embryo whose name is
    # a prefix of another embryo's name would also pick up that embryo's
    # columns -- same as before this change; confirm names cannot collide.
    per_embryo = {}

    def embryo_view(name):
        if name not in per_embryo:
            per_embryo[name] = (
                data.select(**sel_startswith(name)).applymap(adjust),
                xs.select(startswith(name)),
            )
        return per_embryo[name]

    corrs_same = defaultdict(list)
    corrs_diff = defaultdict(list)
    # Indexed with a bool below: False -> corrs_diff, True -> corrs_same.
    all_corrs = [corrs_diff, corrs_same]

    for emb1_name in pb()(max_slice):
        emb1, xs1 = embryo_view(emb1_name)
        genotype = emb1_name.split('_')[0]
        for emb2_name in max_slice:
            if emb1_name == emb2_name:
                continue
            emb2, xs2 = embryo_view(emb2_name)
            # Column of emb2 positioned closest to each column of emb1.
            closest = {
                column: min((abs(x2 - x1), c2) for c2, x2 in xs2.items())[1]
                for column, x1 in xs1.items()
            }
            same = genotype == emb2_name.split('_')[0]
            for col in emb1.columns:
                all_corrs[same][genotype].append(
                    emb1.loc[:, col].corr(emb2.loc[:, closest[col]], corr_func)
                )
    return all_corrs
def get_corrs(data, adjust=identity, corr_func='pearson'):
    """Correlate each slice of every embryo with the nearest slice of every other embryo.

    Column names must contain ``_sl``: the text before it names the embryo
    and the two characters after it are parsed as the slice number.  A
    slice's position is its slice number divided by the largest slice
    number seen for its embryo.  For every ordered pair of distinct
    embryos, each column of the first embryo is correlated with the column
    of the second embryo whose position is closest (ties are resolved by
    the smaller column name, because ``(distance, name)`` tuples are
    compared).

    Parameters
    ----------
    data : DataFrame
        One column per slice, named as described above.
    adjust : callable
        Applied element-wise (``applymap``) to an embryo's columns before
        correlating.  NOTE: the adjusted frame of each embryo is computed
        once and reused, so ``adjust`` is assumed to be a pure function.
    corr_func :
        Passed as the second positional argument of ``Series.corr``.

    Returns
    -------
    list
        ``[corrs_diff, corrs_same]``; both are ``defaultdict(list)`` keyed
        by the first embryo's genotype (text before the first ``_``).
        ``corrs_same`` collects correlations against embryos of the same
        genotype, ``corrs_diff`` against embryos of a different genotype.
    """
    # Largest slice number observed for each embryo.
    max_slice = defaultdict(int)
    for column in data.columns:
        parts = column.split('_sl')
        emb = parts[0]
        max_slice[emb] = max(max_slice[emb], int(parts[1][0:2]))

    # Relative position of every slice within its embryo.  Every column is
    # known to contain '_sl' here (the loop above would have failed
    # otherwise), so no filtering is needed.
    xs = pd.Series(
        index=data.columns,
        data=[
            int(column.split('_sl')[1][:2]) / max_slice[column.split('_sl')[0]]
            for column in data.columns
        ],
    )

    # Per-embryo (adjusted columns, slice positions), built lazily so the
    # expensive ``applymap`` runs once per embryo instead of once per pair.
    # NOTE(review): ``select`` matches by prefix, so an embryo whose name is
    # a prefix of another embryo's name would also pick up that embryo's
    # columns -- same as before this change; confirm names cannot collide.
    per_embryo = {}

    def embryo_view(name):
        if name not in per_embryo:
            per_embryo[name] = (
                data.select(**sel_startswith(name)).applymap(adjust),
                xs.select(startswith(name)),
            )
        return per_embryo[name]

    corrs_same = defaultdict(list)
    corrs_diff = defaultdict(list)
    # Indexed with a bool below: False -> corrs_diff, True -> corrs_same.
    all_corrs = [corrs_diff, corrs_same]

    for emb1_name in pb()(max_slice):
        emb1, xs1 = embryo_view(emb1_name)
        genotype = emb1_name.split('_')[0]
        for emb2_name in max_slice:
            if emb1_name == emb2_name:
                continue
            emb2, xs2 = embryo_view(emb2_name)
            # Column of emb2 positioned closest to each column of emb1.
            closest = {
                column: min((abs(x2 - x1), c2) for c2, x2 in xs2.items())[1]
                for column, x1 in xs1.items()
            }
            same = genotype == emb2_name.split('_')[0]
            for col in emb1.columns:
                all_corrs[same][genotype].append(
                    emb1.loc[:, col].corr(emb2.loc[:, closest[col]], corr_func)
                )
    return all_corrs
def is_directionally_biased(ase, gene, bias_direction=None, style='ttest',
                            ase_level=0.33, min_slices=10,
                            too_few_slices_val=99, frac_for_biased=0.65,
                            two_tailed=False, alpha=.05):
    """Test, per genotype, whether one gene's ASE values are biased in one direction.

    The genotype of a column is the text before its first ``_``.  The row
    ``ase.loc[gene]`` is multiplied element-wise by ``bias_direction`` and
    then restricted to the columns of each genotype in turn.

    Parameters
    ----------
    ase : DataFrame
        ASE values, one row per gene and one column per slice.
    gene :
        Row label of the gene to test.
    bias_direction : sequence, optional
        Per-column multiplier; defaults to 1 for every column.
    style : {'ttest', 'cutoff'}
        ``'ttest'``: one-sample t-test against 0 (NaNs omitted); the
        p-value is multiplied by ``len(ase)`` before comparison with
        ``alpha``.  ``'cutoff'``: count slices whose value exceeds
        ``ase_level`` in a given direction.
    ase_level, min_slices, frac_for_biased :
        Used by ``'cutoff'`` only.  A direction wins when more than
        ``max(frac_for_biased * n_valid, min_slices)`` slices exceed
        ``ase_level`` in that direction.
    too_few_slices_val :
        Value stored when a genotype has too little data: a masked p-value
        for ``'ttest'``, fewer than ``min_slices`` non-NaN values for
        ``'cutoff'``.
    two_tailed : bool
        ``'ttest'`` only.  If true the stored value is
        ``sign(tstat) * significant``; otherwise it is a boolean that is
        true only for a significant *positive* bias.
    alpha : float
        Significance threshold for ``'ttest'``.

    Returns
    -------
    dict
        Maps genotype to the result described above (for ``'cutoff'``:
        -1, 0 or 1, with -1 tested first).

    Raises
    ------
    NotImplementedError
        If ``style`` is not one of the supported values.
    """
    if bias_direction is None:
        bias_direction = [1 for col in ase.columns]
    genotypes = {col.split('_')[0] for col in ase.columns}
    biases = {}
    for genotype in genotypes:
        # NOTE: ``Series.select`` is deprecated in newer pandas; kept
        # because ``startswith`` is a helper defined elsewhere in this file.
        genease = (ase.loc[gene] * bias_direction).select(startswith(genotype))
        if style == 'ttest':
            tstat, pval = ttest_1samp(genease, 0, nan_policy='omit')
            if isinstance(pval, np.ma.core.MaskedConstant):
                # No usable values for this genotype.
                biases[genotype] = too_few_slices_val
                continue
            if two_tailed:
                biases[genotype] = np.sign(tstat) * (pval * len(ase) < alpha)
            else:
                # Convert the two-sided p-value into a one-sided one for
                # the "greater than zero" alternative.
                pval = pval / 2 if tstat > 0 else 1 - pval / 2
                biases[genotype] = pval * len(ase) < alpha
        elif style == 'cutoff':
            slices_with_aseval = genease.count()
            if slices_with_aseval < min_slices:
                biases[genotype] = too_few_slices_val
                continue
            biases[genotype] = 0
            needed = max(frac_for_biased * slices_with_aseval, min_slices)
            for direction in [-1, 1]:
                if (direction * genease > ase_level).sum() > needed:
                    biases[genotype] = direction
                    break
        else:
            raise NotImplementedError(
                "Don't know how to use test style '{}'".format(style))
    return biases
def get_class(gene, ase, subset='', slices_with_expr=None, expr=None):
    """Classify one gene's allele-specific expression across a subset of slices.

    Parameters
    ----------
    gene :
        Row label of the gene in ``ase`` (and in ``slices_with_expr`` /
        ``expr`` when those are given).
    ase : DataFrame
        ASE values, one row per gene.
    subset : str
        Only columns selected by ``startswith(subset)`` are considered.
    slices_with_expr : optional
        Precomputed number of expressing slices, indexed by gene.
    expr : DataFrame, optional
        Used only when ``slices_with_expr`` is None: the number of
        expressing slices is then the count of subset columns whose value
        exceeds ``EXPR_MIN``.

    Returns
    -------
    One of:
        ``nan`` -- the gene is missing from whichever of ``slices_with_expr``
        / ``expr`` is consulted, or neither was supplied;
        ``99``  -- fewer than 90% of the subset's slices express the gene;
        ``999`` -- fewer than half of the expressing slices have a finite
        ASE value;
        ``1`` / ``-1`` -- more than ``FRAC_FOR_MATERNAL`` of the slices with
        finite ASE have ``|ASE| > ASE_MIN`` with positive / negative sign
        (the same threshold constant is used for both signs);
        ``0``   -- none of the above.
    """
    # NOTE: ``.select`` is deprecated in newer pandas; kept because
    # ``startswith`` is a helper defined elsewhere in this file.
    sample = ase.loc[gene].select(startswith(subset))

    if slices_with_expr is not None and gene in slices_with_expr.index:
        slices_with_expr = slices_with_expr.loc[gene]
    elif slices_with_expr is None and expr is not None and gene in expr.index:
        slices_with_expr = (
            expr.loc[gene].select(startswith(subset)) > EXPR_MIN
        ).sum()
    else:
        return nan

    # +1 / -1 where the ASE magnitude clears ASE_MIN, 0 otherwise.
    ase_vals = (abs(sample) > ASE_MIN) * sign(sample)
    slices_with_ase = isfinite(sample).sum()

    if slices_with_expr < len(sample) * .90:
        return 99
    if slices_with_ase < .5 * slices_with_expr:
        return 999
    if (ase_vals == 1).sum() > slices_with_ase * FRAC_FOR_MATERNAL:
        return 1
    if (ase_vals == -1).sum() > slices_with_ase * FRAC_FOR_MATERNAL:
        return -1
    return 0
def get_diffs(expr, mel_spline, sim_spline, col_headers, offset=EXPR_MIN):
    """Compute earth-mover differences between parental and hybrid expression.

    ``expr`` is split by label prefix into ``melXmel_``, ``simXsim_``,
    ``melXsim_`` and ``simXmel_`` groups (plus the two hybrid groups
    combined).  ``offset`` is added to each group before it is handed to
    ``dd.earth_mover_multi_rep`` / ``dd.earth_mover_within``; the combined
    hybrids are passed without the offset.

    NOTE(review): ``xs`` is neither a parameter nor assigned in this
    function, so it is resolved from the enclosing (module) scope when the
    splines are evaluated -- confirm it is defined before calling.

    Returns
    -------
    tuple
        ``(hyb_hyb_diffs, parental_diffs, mel_hyb_diffs, sim_hyb_diffs,
        avgs, avg_hyb_diffs, avg_level, hyb_level, within_melXsim_diff,
        within_simXmel_diff)`` where ``avgs`` is the mean of the two
        splines indexed by ``col_headers``, ``avg_level`` is its maximum
        and ``hyb_level`` is a list with the maximum of the hybrids for
        each of five hard-coded ``cyc14C`` replicate prefixes.
    """
    def by_prefix(prefix):
        return expr.select(startswith(prefix))

    def shifted(values):
        return values + offset

    mel = by_prefix('melXmel_')
    sim = by_prefix('simXsim_')
    melXsim = by_prefix('melXsim_')
    simXmel = by_prefix('simXmel_')
    hybrids = by_prefix(('melXsim', 'simXmel'))

    parental_diffs = dd.earth_mover_multi_rep(shifted(mel), shifted(sim))
    mel_hyb_diffs = dd.earth_mover_multi_rep(shifted(mel), shifted(melXsim))
    sim_hyb_diffs = dd.earth_mover_multi_rep(shifted(sim), shifted(simXmel))
    hyb_hyb_diffs = dd.earth_mover_multi_rep(shifted(melXsim),
                                             shifted(simXmel))

    within_melXsim_diff = dd.earth_mover_within(shifted(melXsim))
    within_simXmel_diff = dd.earth_mover_within(shifted(simXmel))

    # Point-wise mean of the two parental splines.
    spline_mean = (mel_spline(xs) + sim_spline(xs)) / 2
    avgs = pd.Series(spline_mean, index=col_headers)
    clipped_avgs = avgs.astype(float).clip(0, 1e6)
    avg_hyb_diffs = dd.earth_mover_multi_rep(
        clipped_avgs,
        hybrids,
        normer=lambda x: expr.max(),
    )

    avg_level = avgs.max()
    replicate_prefixes = (
        'melXsim_cyc14C_rep1',
        'melXsim_cyc14C_rep2',
        'melXsim_cyc14C_rep3',
        'simXmel_cyc14C_rep1',
        'simXmel_cyc14C_rep2',
    )
    hyb_level = []
    for prefix in replicate_prefixes:
        hyb_level.append(hybrids.select(startswith(prefix)).max())

    return (
        hyb_hyb_diffs,
        parental_diffs,
        mel_hyb_diffs,
        sim_hyb_diffs,
        avgs,
        avg_hyb_diffs,
        avg_level,
        hyb_level,
        within_melXsim_diff,
        within_simXmel_diff,
    )
def is_directionally_biased(ase, gene, bias_direction=None, style='ttest',
                            ase_level=0.33, min_slices=10,
                            too_few_slices_val=99, frac_for_biased=0.65,
                            two_tailed=False, alpha=.05):
    """Test, per genotype, whether one gene's ASE values are biased in one direction.

    The genotype of a column is the text before its first ``_``.  The row
    ``ase.loc[gene]`` is multiplied element-wise by ``bias_direction`` and
    then restricted to the columns of each genotype in turn.

    Parameters
    ----------
    ase : DataFrame
        ASE values, one row per gene and one column per slice.
    gene :
        Row label of the gene to test.
    bias_direction : sequence, optional
        Per-column multiplier; defaults to 1 for every column.
    style : {'ttest', 'cutoff'}
        ``'ttest'``: one-sample t-test against 0 (NaNs omitted); the
        p-value is multiplied by ``len(ase)`` before comparison with
        ``alpha``.  ``'cutoff'``: count slices whose value exceeds
        ``ase_level`` in a given direction.
    ase_level, min_slices, frac_for_biased :
        Used by ``'cutoff'`` only.  A direction wins when more than
        ``max(frac_for_biased * n_valid, min_slices)`` slices exceed
        ``ase_level`` in that direction.
    too_few_slices_val :
        Value stored when a genotype has too little data: a masked p-value
        for ``'ttest'``, fewer than ``min_slices`` non-NaN values for
        ``'cutoff'``.
    two_tailed : bool
        ``'ttest'`` only.  If true the stored value is
        ``sign(tstat) * significant``; otherwise it is a boolean that is
        true only for a significant *positive* bias.
    alpha : float
        Significance threshold for ``'ttest'``.

    Returns
    -------
    dict
        Maps genotype to the result described above (for ``'cutoff'``:
        -1, 0 or 1, with -1 tested first).

    Raises
    ------
    NotImplementedError
        If ``style`` is not one of the supported values.
    """
    if bias_direction is None:
        bias_direction = [1 for col in ase.columns]
    genotypes = {col.split('_')[0] for col in ase.columns}
    biases = {}
    for genotype in genotypes:
        # NOTE: ``Series.select`` is deprecated in newer pandas; kept
        # because ``startswith`` is a helper defined elsewhere in this file.
        genease = (ase.loc[gene] * bias_direction).select(startswith(genotype))
        if style == 'ttest':
            tstat, pval = ttest_1samp(genease, 0, nan_policy='omit')
            if isinstance(pval, np.ma.core.MaskedConstant):
                # No usable values for this genotype.
                biases[genotype] = too_few_slices_val
                continue
            if two_tailed:
                biases[genotype] = np.sign(tstat) * (pval * len(ase) < alpha)
            else:
                # Convert the two-sided p-value into a one-sided one for
                # the "greater than zero" alternative.
                pval = pval / 2 if tstat > 0 else 1 - pval / 2
                biases[genotype] = pval * len(ase) < alpha
        elif style == 'cutoff':
            slices_with_aseval = genease.count()
            if slices_with_aseval < min_slices:
                biases[genotype] = too_few_slices_val
                continue
            biases[genotype] = 0
            needed = max(frac_for_biased * slices_with_aseval, min_slices)
            for direction in [-1, 1]:
                if (direction * genease > ase_level).sum() > needed:
                    biases[genotype] = direction
                    break
        else:
            raise NotImplementedError(
                "Don't know how to use test style '{}'".format(style))
    return biases
path.join(cwd, 'analysis_godot/ase_summary_by_read.tsv'), **pd_kwargs) .select(**sel_startswith(('melXsim', 'simXmel'))) ) n_slices = slices_per_embryo(ase) actual = [] computed = [] for embryo, n in n_slices.items(): if n not in virtual_slices: virtual_slices[n] = make_virtual_slices( mel_expr_at_stage, sim_expr_at_matching, mel_atlas_pos.ix[:, :, mel_stage].T, n ) actual.extend(ase.ix[target].select(startswith(embryo))) computed.extend(virtual_slices[n][1][0]) vslice_25 = virtual_slices[25][1][0].copy() vslice_25[13:19] = np.nan vslice_25 = pd.Series(index=['virtual_sl{}'.format(i+1) for i in range(25)], data=vslice_25) kw = pu.kwargs_ase_heatmap.copy() kw.pop('draw_row_labels') kw.pop('draw_name') kw['box_height'] = 60 kw['total_width'] = 200 pu.svg_heatmap(vslice_25, 'analysis/results/hb_atlas_ase_slice_25_pu_M{}S{}.svg' .format(mel_atlas_expr.minor_axis.get_loc(mel_stage), sim_atlas_expr.minor_axis.get_loc(sim_stage)),
'analysis/results/{prefix}peak{suffix}_fits'.format( prefix=args.prefix, suffix=args.suffix), ) if args.print_keggs: synonyms = get_synonyms() wnt_genes = [line.strip() for line in open('prereqs/wnt.kegg.genes')] wnt_scores = pd.Series(index=synonyms[wnt_genes], data=best_r2[synonyms[wnt_genes]]) wnt_scores.index = ['dme:Dmel_' + CG for CG in wnt_genes] wnt_scores.index.name = '#dme' wnt_scores.name = 'svASE' (wnt_scores.sort_values(na_position='first').to_csv( 'analysis/results/wnt_scores.tsv', sep='\t', header=True)) all_cgs = synonyms.select( startswith(('CG1', 'CG2', 'CG3', 'CG4', 'CG5', 'CG6', 'CG7', 'CG8', 'CG9'))) all_scores = pd.Series(index=all_cgs, data=best_r2[all_cgs]) all_scores.index = ['dme:Dmel_' + CG for CG in all_cgs.index] all_scores.index.name = '#dme' all_scores.name = 'svASE' all_scores2 = all_scores.copy() all_scores2.index = [ix.split(':')[1] for ix in all_scores2.index] (all_scores.sort_values(na_position='first').dropna().to_csv( 'analysis/results/all_svase_scores_cg.tsv', sep='\t', header=True)) keggs = { line.split()[0]: line.split()[1].strip().strip(',').split(',') for line in open('prereqs/kegg_database.txt') } kegg_stats = Counter() kegg_pvals = Counter() for pathway in ProgressBar()(keggs):
from GetASEStats import slices_per_embryo virtual_slices = {} ase = (pd.read_table( path.join(cwd, 'analysis_godot/ase_summary_by_read.tsv'), **pd_kwargs).select(**sel_startswith(('melXsim', 'simXmel')))) n_slices = slices_per_embryo(ase) actual = [] computed = [] for embryo, n in n_slices.items(): if n not in virtual_slices: virtual_slices[n] = make_virtual_slices( mel_expr_at_stage, sim_expr_at_matching, mel_atlas_pos.ix[:, :, mel_stage].T, n) actual.extend(ase.ix[target].select(startswith(embryo))) computed.extend(virtual_slices[n][1][0]) vslice_25 = virtual_slices[25][1][0].copy() vslice_25[13:19] = np.nan vslice_25 = pd.Series( index=['virtual_sl{}'.format(i + 1) for i in range(25)], data=vslice_25) kw = pu.kwargs_ase_heatmap.copy() kw.pop('draw_row_labels') kw.pop('draw_name') kw['box_height'] = 60 kw['total_width'] = 200 pu.svg_heatmap( vslice_25, 'analysis/results/hb_atlas_ase_slice_25_pu_M{}S{}.svg'.format(
if args.print_keggs: synonyms = get_synonyms() wnt_genes = [line.strip() for line in open('prereqs/wnt.kegg.genes')] wnt_scores = pd.Series(index=synonyms[wnt_genes], data=best_r2[synonyms[wnt_genes]]) wnt_scores.index = ['dme:Dmel_' + CG for CG in wnt_genes] wnt_scores.index.name = '#dme' wnt_scores.name = 'svASE' (wnt_scores .sort_values(na_position='first') .to_csv('analysis/results/wnt_scores.tsv', sep='\t', header=True) ) all_cgs = synonyms.select(startswith(('CG1', 'CG2', 'CG3', 'CG4', 'CG5', 'CG6', 'CG7', 'CG8', 'CG9'))) all_scores = pd.Series(index=all_cgs, data=best_r2[all_cgs]) all_scores.index = ['dme:Dmel_' + CG for CG in all_cgs.index] all_scores.index.name = '#dme' all_scores.name = 'svASE' all_scores2 = all_scores.copy() all_scores2.index = [ix.split(':')[1] for ix in all_scores2.index] (all_scores .sort_values(na_position='first') .dropna() .to_csv('analysis/results/all_svase_scores_cg.tsv', sep='\t', header=True) ) keggs = {line.split()[0]:line.split()[1].strip().strip(',').split(',')
def get_diffs(expr, mel_spline, sim_spline, col_headers, offset=EXPR_MIN):
    """Compute earth-mover differences between parental and hybrid expression.

    ``expr`` is split by label prefix into ``melXmel_``, ``simXsim_``,
    ``melXsim_`` and ``simXmel_`` groups (plus the two hybrid groups
    combined).  ``offset`` is added to each group before it is handed to
    ``dd.earth_mover_multi_rep`` / ``dd.earth_mover_within``; the combined
    hybrids are passed without the offset.

    NOTE(review): ``xs`` is neither a parameter nor assigned in this
    function, so it is resolved from the enclosing (module) scope when the
    splines are evaluated -- confirm it is defined before calling.

    Returns
    -------
    tuple
        ``(hyb_hyb_diffs, parental_diffs, mel_hyb_diffs, sim_hyb_diffs,
        avgs, avg_hyb_diffs, avg_level, hyb_level, within_melXsim_diff,
        within_simXmel_diff)`` where ``avgs`` is the mean of the two
        splines indexed by ``col_headers``, ``avg_level`` is its maximum
        and ``hyb_level`` is a list with the maximum of the hybrids for
        each of five hard-coded ``cyc14C`` replicate prefixes.
    """
    def by_prefix(prefix):
        return expr.select(startswith(prefix))

    def shifted(values):
        return values + offset

    mel = by_prefix('melXmel_')
    sim = by_prefix('simXsim_')
    melXsim = by_prefix('melXsim_')
    simXmel = by_prefix('simXmel_')
    hybrids = by_prefix(('melXsim', 'simXmel'))

    parental_diffs = dd.earth_mover_multi_rep(shifted(mel), shifted(sim))
    mel_hyb_diffs = dd.earth_mover_multi_rep(shifted(mel), shifted(melXsim))
    sim_hyb_diffs = dd.earth_mover_multi_rep(shifted(sim), shifted(simXmel))
    hyb_hyb_diffs = dd.earth_mover_multi_rep(shifted(melXsim),
                                             shifted(simXmel))

    within_melXsim_diff = dd.earth_mover_within(shifted(melXsim))
    within_simXmel_diff = dd.earth_mover_within(shifted(simXmel))

    # Point-wise mean of the two parental splines.
    spline_mean = (mel_spline(xs) + sim_spline(xs)) / 2
    avgs = pd.Series(spline_mean, index=col_headers)
    clipped_avgs = avgs.astype(float).clip(0, 1e6)
    avg_hyb_diffs = dd.earth_mover_multi_rep(
        clipped_avgs,
        hybrids,
        normer=lambda x: expr.max(),
    )

    avg_level = avgs.max()
    replicate_prefixes = (
        'melXsim_cyc14C_rep1',
        'melXsim_cyc14C_rep2',
        'melXsim_cyc14C_rep3',
        'simXmel_cyc14C_rep1',
        'simXmel_cyc14C_rep2',
    )
    hyb_level = []
    for prefix in replicate_prefixes:
        hyb_level.append(hybrids.select(startswith(prefix)).max())

    return (
        hyb_hyb_diffs,
        parental_diffs,
        mel_hyb_diffs,
        sim_hyb_diffs,
        avgs,
        avg_hyb_diffs,
        avg_level,
        hyb_level,
        within_melXsim_diff,
        within_simXmel_diff,
    )