def get_corrs(data, adjust=identity, corr_func='pearson'):
    """Correlate each slice of every embryo with the nearest slice of every other embryo.

    Column names of *data* are split on ``'_sl'``: the part before it names
    the embryo and the first two characters after it are parsed as the slice
    number.  Each column gets a relative position equal to its slice number
    divided by the largest slice number seen for its embryo.

    For every ordered pair of distinct embryos, each column of the first
    embryo is correlated (``Series.corr`` with *corr_func*) against the column
    of the second embryo whose relative position is closest.

    Parameters
    ----------
    data : DataFrame whose columns follow the ``<embryo>_sl<NN>...`` naming.
    adjust : element-wise function applied (via ``applymap``) to each embryo's
        columns before correlating.
    corr_func : correlation method passed through to ``Series.corr``.

    Returns
    -------
    list
        ``[corrs_diff, corrs_same]``: two ``defaultdict(list)`` objects keyed
        by the genotype of the first embryo (text before the first ``'_'`` of
        its name).  ``corrs_same`` collects correlations against embryos with
        the same genotype prefix, ``corrs_diff`` against all others.
    """
    # Largest slice number observed for each embryo.
    max_slice = defaultdict(int)
    for sl in data.columns:
        sl = sl.split('_sl')
        emb = sl[0]
        max_slice[emb] = max(max_slice[emb], int(sl[1][0:2]))

    # Relative position of every slice along its embryo.
    xs = pd.Series(index=data.columns,
                   data=[int(a.split('_sl')[1][:2]) / max_slice[a.split('_sl')[0]]
                         for a in data.columns if 'sl' in a])

    # The adjusted columns and slice positions of an embryo do not depend on
    # which embryo it is paired with, so compute them once per embryo rather
    # than once per pair.
    adjusted = {}
    positions = {}
    for emb_name in max_slice:
        adjusted[emb_name] = (data.select(**sel_startswith(emb_name))
                              .applymap(adjust))
        positions[emb_name] = xs.select(startswith(emb_name))

    corrs_same = defaultdict(list)
    corrs_diff = defaultdict(list)
    # Indexed with a bool below: False -> corrs_diff, True -> corrs_same.
    all_corrs = [corrs_diff, corrs_same]

    for emb1_name in pb()(max_slice):
        emb1 = adjusted[emb1_name]
        genotype = emb1_name.split('_')[0]
        xs1 = positions[emb1_name]
        for emb2_name in max_slice:
            if emb1_name == emb2_name:
                continue
            emb2 = adjusted[emb2_name]
            xs2 = positions[emb2_name]
            # For each emb1 column, the emb2 column at the nearest relative
            # position (ties resolved by the smaller column name).
            closest = {
                column: min((abs(x2 - x1), c2) for c2, x2 in xs2.items())[1]
                for column, x1 in xs1.items()
            }
            same = genotype == emb2_name.split('_')[0]
            target = all_corrs[same][genotype]
            for col in emb1.columns:
                target.append(emb1.ix[:, col].corr(
                    emb2.ix[:, closest[col]],
                    corr_func,
                ))
    return all_corrs
def get_corrs(data, adjust=identity, corr_func='pearson'):
    """Collect slice-to-nearest-slice correlations between all pairs of embryos.

    Every column name of *data* is split on ``'_sl'``; the prefix identifies
    the embryo and the first two characters of the remainder are read as an
    integer slice number.  A slice's relative position is its number divided
    by the maximum slice number of its embryo.

    Each column of one embryo is correlated with the column of another embryo
    whose relative position is nearest, for every ordered pair of distinct
    embryos.

    Parameters
    ----------
    data : DataFrame with ``<embryo>_sl<NN>...`` column names.
    adjust : function applied to every element (``applymap``) of an embryo's
        columns before the correlation is taken.
    corr_func : method argument handed to ``Series.corr``.

    Returns
    -------
    list
        ``[corrs_diff, corrs_same]``.  Both are ``defaultdict(list)`` keyed by
        the first embryo's genotype (its name up to the first ``'_'``); the
        second holds correlations between embryos sharing that genotype
        prefix, the first holds the rest.
    """
    max_slice = defaultdict(int)
    for sl in data.columns:
        sl = sl.split('_sl')
        emb = sl[0]
        max_slice[emb] = max(max_slice[emb], int(sl[1][0:2]))

    xs = pd.Series(index=data.columns, data=[
        int(a.split('_sl')[1][:2]) / max_slice[a.split('_sl')[0]]
        for a in data.columns if 'sl' in a
    ])

    # Per-embryo work is independent of the partner embryo: do it once here
    # instead of repeating it for every pair inside the nested loop.
    per_embryo = {
        emb_name: (
            data.select(**sel_startswith(emb_name)).applymap(adjust),
            xs.select(startswith(emb_name)),
        )
        for emb_name in max_slice
    }

    corrs_same = defaultdict(list)
    corrs_diff = defaultdict(list)
    # A bool selects the bucket: all_corrs[False] is diff, all_corrs[True] is same.
    all_corrs = [corrs_diff, corrs_same]

    for emb1_name in pb()(max_slice):
        emb1, xs1 = per_embryo[emb1_name]
        genotype = emb1_name.split('_')[0]
        for emb2_name in max_slice:
            if emb1_name == emb2_name:
                continue
            emb2, xs2 = per_embryo[emb2_name]
            # Nearest emb2 column (by relative position) for each emb1 column.
            closest = {
                column: min((abs(x2 - x1), c2) for c2, x2 in xs2.items())[1]
                for column, x1 in xs1.items()
            }
            # Whether the two embryos share a genotype is fixed for the pair.
            same = genotype == emb2_name.split('_')[0]
            for col in emb1.columns:
                all_corrs[same][genotype].append(emb1.ix[:, col].corr(
                    emb2.ix[:, closest[col]],
                    corr_func,
                ))
    return all_corrs
def get_randomized_scores(ase):
    """Return bias thresholds computed from a randomly sign-flipped copy of *ase*.

    Every entry of *ase* is multiplied by an independent random +1 or -1.
    The flipped table is split into the columns selected by the ``melXsim``
    and ``simXmel`` prefixes, and for each row the ``FRAC_FOR_MATERNAL`` and
    ``1 - FRAC_FOR_MATERNAL`` quantiles across those columns are taken.

    Parameters
    ----------
    ase : DataFrame of allele-specific expression values with ``melXsim*``
        and ``simXmel*`` columns.

    Returns
    -------
    tuple
        ``(weaker_mel_bias, weaker_sim_bias)``: the element-wise maximum of
        the two lower quantiles and the element-wise minimum of the two upper
        quantiles, one value per row of *ase*.
    """
    # Size the +/-1 mask from the argument itself.  The mask is multiplied
    # element-wise with `ase`, so it must have that shape; previously it was
    # taken from the unrelated module-level name `ase_expr`.
    rand = np.random.randint(2, size=ase.shape) * 2 - 1
    flipped = ase * rand

    melXsim = flipped.select(**sel_startswith('melXsim'))
    simXmel = flipped.select(**sel_startswith('simXmel'))

    # Transposing puts rows of `ase` on the columns, so quantile() works per row.
    weaker_sim_bias = np.min([melXsim.T.quantile(1 - FRAC_FOR_MATERNAL),
                              simXmel.T.quantile(1 - FRAC_FOR_MATERNAL)],
                             axis=0)
    weaker_mel_bias = np.max([melXsim.T.quantile(FRAC_FOR_MATERNAL),
                              simXmel.T.quantile(FRAC_FOR_MATERNAL)],
                             axis=0)
    return (weaker_mel_bias, weaker_sim_bias)
def plot_expr_comparison(expr, gene, prefix=None, smoothed=0):
    """Write an SVG heatmap comparing one gene across parental and hybrid samples.

    The row for *gene* is pulled from three column groups of *expr*
    (``melXmel_``, ``simXsim_``, and the two hybrid prefixes ``melXsim`` /
    ``simXmel``) and handed to ``pu.svg_heatmap`` together with a leading
    ``None`` placeholder.  The file is written to
    ``analysis_godot/results/spatial_diffs/<gene>.svg``.

    Parameters
    ----------
    expr : expression DataFrame indexed by gene.
    gene : row label to plot; also used in the output file name.
    prefix : accepted but not used by this function.
    smoothed : if truthy, window size of a centred rolling mean
        (``min_periods=1``) applied to each of the three rows first.
    """
    def gene_row(startings):
        # Row for `gene` restricted to columns matching the given prefix(es).
        return expr.select(**sel_startswith(startings)).ix[gene]

    tracks = [
        gene_row('melXmel_'),
        gene_row('simXsim_'),
        gene_row(('melXsim', 'simXmel')),
    ]
    if smoothed:
        tracks = [
            pd.rolling_mean(track, smoothed, min_periods=1, center=True)
            for track in tracks
        ]
    mel, sim, hyb = tracks

    out_name = 'analysis_godot/results/spatial_diffs/{}.svg'.format(gene)
    # One leading entry for the None placeholder, then 'maxall' repeated.
    normers = (gene,) + ('maxall',) * 7
    pu.svg_heatmap((None, mel, sim, hyb),
                   out_name,
                   cmap=(gene, cm.Reds, cm.Blues, pu.ISH),
                   norm_rows_by=normers,
                   **pu_kwargs)
def get_randomized_scores(ase):
    """Compute per-row bias cutoffs after randomly flipping the sign of each ASE value.

    A random matrix of +1/-1 values is multiplied element-wise into *ase*.
    The result is divided into its ``melXsim`` and ``simXmel`` column groups,
    and the ``FRAC_FOR_MATERNAL`` and ``1 - FRAC_FOR_MATERNAL`` quantiles are
    taken across each group's columns for every row.

    Parameters
    ----------
    ase : DataFrame of allele-specific expression values whose columns start
        with ``melXsim`` or ``simXmel``.

    Returns
    -------
    tuple
        ``(weaker_mel_bias, weaker_sim_bias)``.  The first is the element-wise
        maximum of the two groups' lower quantiles; the second is the
        element-wise minimum of their upper quantiles.
    """
    # The sign mask has to match `ase` element for element, so its shape comes
    # from the parameter (it used to come from the global `ase_expr`).
    signs = np.random.randint(2, size=ase.shape) * 2 - 1
    flipped = ase * signs

    melXsim = flipped.select(**sel_startswith('melXsim'))
    simXmel = flipped.select(**sel_startswith('simXmel'))

    upper = 1 - FRAC_FOR_MATERNAL
    lower = FRAC_FOR_MATERNAL
    # .T turns rows of `ase` into columns so that quantile() is per original row.
    weaker_sim_bias = np.min([
        melXsim.T.quantile(upper),
        simXmel.T.quantile(upper),
    ], axis=0)
    weaker_mel_bias = np.max([
        melXsim.T.quantile(lower),
        simXmel.T.quantile(lower),
    ], axis=0)
    return (weaker_mel_bias, weaker_sim_bias)
def plot_expr_comparison(expr, gene, prefix=None, smoothed=0):
    """Draw the expression of *gene* in mel, sim and hybrid samples as an SVG heatmap.

    Three series are extracted from *expr* — the *gene* row within the
    ``melXmel_`` columns, within the ``simXsim_`` columns, and within the
    ``melXsim``/``simXmel`` columns — optionally smoothed, and then passed to
    ``pu.svg_heatmap`` after a ``None`` placeholder.  Output goes to
    ``analysis_godot/results/spatial_diffs/<gene>.svg``.

    Parameters
    ----------
    expr : DataFrame of expression values, indexed by gene.
    gene : index label of the row to draw (and stem of the output file name).
    prefix : unused; kept for interface compatibility.
    smoothed : window of the centred rolling mean (``min_periods=1``); a falsy
        value skips smoothing.
    """
    column_groups = ('melXmel_', 'simXsim_', ('melXsim', 'simXmel'))

    series = []
    for group in column_groups:
        series.append(expr.select(**sel_startswith(group)).ix[gene])

    if smoothed:
        for pos, values in enumerate(series):
            series[pos] = pd.rolling_mean(values, smoothed,
                                          min_periods=1, center=True)

    # The None placeholder comes first, followed by mel, sim and hybrid rows.
    datasets = (None,) + tuple(series)
    colormaps = (gene, cm.Reds, cm.Blues, pu.ISH)
    pu.svg_heatmap(
        datasets,
        'analysis_godot/results/spatial_diffs/{}.svg'.format(gene),
        cmap=colormaps,
        norm_rows_by=tuple([gene] + ['maxall'] * 7),
        **pu_kwargs
    )
parser.add_argument('--fit-ymax', default=1, type=float) args = parser.parse_args() args.male_samples = tuple(args.male_samples) return args if __name__ == "__main__": from Utils import fbgns args = parse_args() expr = pd.read_table(args.expression_file, **pd_kwargs).drop('---', axis=1, errors='ignore') ase = (pd.read_table(args.ase_file, **pd_kwargs).dropna( how='all', axis=1).dropna(how='all', axis=0).drop( args.overlapping_genes, errors='ignore').select(**sel_startswith(('melXsim', 'simXmel')))) ase_perm = pd.DataFrame( data=np.random.permutation(ase.T).T, index=ase.index, columns=ase.columns, ) chrom_of = get_chroms() if args.male_samples and 'keep' not in args.male_samples: on_x = chrom_of[ase.index] == 'X' is_male = [col.startswith(args.male_samples) for col in ase.columns] ase.ix[on_x, is_male] = np.nan ase = ase.ix[ase.T.count() >= args.min_samples] if args.min_var: ase = ase.ix[ase.T.var() >= args.min_var] ase_perm = ase_perm.ix[ase.index]
ase_cdt = ase.ix[cdt.index] exp_cdt = expr.ix[cdt.index] columns = ( 'melXsim', 'simXmel', ) ranges = { 'meldominant': ('FBgn0034816', 'FBgn0250755'), 'simdominant': ('FBgn0004087', 'FBgn0038934'), } if 'sparse' in argv[1]: pu.svg_heatmap( data=exp_cdt.select(**sel_startswith(columns)), filename='analysis/results/all_sparse.svg', norm_rows_by='max', progress_bar=True, col_sep='_sl', total_width=120, box_height=1, split_columns=True, draw_box=True, draw_row_labels=False, draw_name=True, cmap_by_prefix=cmap_by_prefix, make_hyperlinks=True, convert=True, ) from sys import exit
make_hyperlinks=True, convert=True, vspacer=0, max_width=200, cmap=cm.RdBu, ) if __name__ == "__main__": ase = (pd.read_table( 'analysis_godot/ase_summary_by_read.tsv', index_col=0, keep_default_na=False, na_values=['---'], ).dropna(how='all', axis=1).dropna(how='all', axis=0).select(**sel_startswith(('melXsim', 'simXmel')))) paris = pd.read_table( 'prereqs/GSE68062_Gene_CLASS_after_FPKM_normalization.txt', index_col=1)['mel.CLASS'] pzyg = paris[paris == 'zyg'] melXsim = ase.select(**sel_startswith('melXsim')).select(pzyg.__contains__) simXmel = ase.select(**sel_startswith('simXmel')).select(pzyg.__contains__) fbgns = pd.read_table('prereqs/gene_map_table_fb_2016_01.tsv', index_col=1, skiprows=5).ix[:, 0] max_slice = defaultdict(int) for sl in ase.columns: sl = sl.split('_sl')
startswith, get_chroms) import pandas as pd import numpy as np import PlotUtils as pu from FitASEFuncs import (logistic, peak, fit_all_ase, calculate_variance_explained) if __name__ == "__main__": expr = pd.read_table('analysis_godot/summary.tsv', **pd_kwargs).dropna(how='all', axis=1) ase = (pd .read_table('analysis_godot/ase_summary_by_read.tsv', **pd_kwargs ) .dropna(how='all', axis=1) .dropna(how='all', axis=0) .select(**sel_startswith(('melXsim', 'simXmel'))) ) ase_limited = ase.select(**sel_startswith('melXsim')) chrom_of = get_chroms() males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2') on_x = chrom_of[ase.index] == 'X' is_male = [col.startswith(males) for col in ase.columns] ase.ix[on_x, is_male] = np.nan xs = get_xs(ase) xs_ltd = get_xs(ase_limited) colnames = ['Amp', 'width', 'center', 'y_offset'] recalc_ase = locals().get('recalc_ase', True) if recalc_ase:
return (weaker_mel_bias, weaker_sim_bias) if __name__ == "__main__": ase = locals().get('ase', None) expr = locals().get('expr', None) if ase is None or expr is None or not np.all(ase.index == expr.index): print("reloading files") expr = pd.read_table('analysis_godot/summary.tsv', **pd_kwargs).dropna(how='all', axis=1) ase = (pd .read_table('analysis_godot/ase_summary_by_read.tsv', **pd_kwargs ) .dropna(how='all', axis=1) .dropna(how='all', axis=0) .select(**sel_startswith(('melXsim', 'simXmel'))) ) chrom_of = get_chroms() males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2') on_x = [chrom_of[gene] == 'X' if gene in chrom_of else False for gene in ase.index] is_male = [col.startswith(males) for col in ase.columns] ase.ix[on_x, is_male] = np.nan melXsim_expr = expr.select(**sel_startswith('melXsim')) simXmel_expr = expr.select(**sel_startswith('simXmel')) melXsim_ase = ase.select(**sel_startswith('melXsim')) simXmel_ase = ase.select(**sel_startswith('simXmel')) melXsim_is_expr = (melXsim_expr > EXPR_MIN) simXmel_is_expr = (simXmel_expr > EXPR_MIN) all_is_expr = expr > EXPR_MIN
return r2 EXPR_MIN = 10 if __name__ == "__main__": print("Reading data") expr = (pd.read_table('analysis_godot/summary_wasp.tsv', **pd_kwargs).dropna(how='all', axis=0).dropna(how='all', axis=1)) if expr.index[0].startswith('FBgn'): expr.index = fbgns[expr.index] ase = ( pd.read_table('analysis_godot/ase_summary_by_read_with_wasp.tsv', **pd_kwargs).select( **sel_startswith(('melXsim', 'simXmel'))).dropna( how='all', axis=0).rename_axis(lambda x: x.split('_ase')[0], axis=1) #.replace(pd.np.nan, 0) ) ase = ase.ix[expr.index] read_counts = pd.read_table('analysis_godot/map_stats.tsv', index_col='LongName') chrom_of = get_chroms() males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2') on_x = chrom_of[ase.index] == 'X' is_male = [col.startswith(males) for col in ase.columns]
def svg_heatmap(data, filename, row_labels=None, box_size=4,
                index=None,
                all_indices=None, all_colnames=None, internal_datanames=None,
                cmap=ISH, norm_rows_by=None, draw_row_labels=False,
                color_row_labels=False,
                col_sep='', box_height=None, total_width=None,
                draw_box=False, draw_name=False, data_names=None,
                make_hyperlinks = False,
                progress_bar = False,
                max_width=np.inf,
                x_min=10, y_min=10,
                spacers=None,
                convert=False,
                squeeze_rows=None,
                cmap_by_prefix=None,
                draw_average=False,
                draw_average_only=False,
                average_scale=1,
                split_columns=False,
                vspacer=30,
                hatch_nan=True, hatch_size=20,
                figure_title=None,
                nan_replace=None,
                first_col='', last_col=''):
    """
    Draw heatmap as an SVG file stored in filename

    *data* can be either a 2D array-like type (list of lists, numpy array,
    pandas DataFrame, etc), or a tuple of 2D array-likes, in which case a
    separator will be added between each one in the output.  A ``None`` entry
    in the tuple draws no heatmap: the matching *norm_rows_by* entry is
    written as a text label in its place.

    *cmap* is a matplotlib-like colormap (i.e. a callable that expects floats
    in the range 0.0-1.0.), or an iterable of the same length as the tuple
    *data* containing colormaps.  If *cmap_by_prefix* is given it is called
    with the part of a column label before *col_sep* and its result replaces
    the colormap for those columns.

    *norm_rows_by* selects how each row is scaled before colouring.  It may be
    a single value applied to every dataset or a tuple with one entry per
    dataset.  Recognised values: ``None`` (no scaling), ``'mean'``, ``'max'``
    and ``'maxall'`` (divide by the row mean / max / max over all ``'maxall'``
    datasets, plus 10), ``'fullvar'`` (rescale each row by its own min and
    max), ``'center0'``, ``'center0min<x>'``, ``'center0all'``,
    ``'center0allmin<x>'`` and ``'center0pre'`` (map values around 0 onto
    values around 0.5), a number (divide by it), or a per-row sequence of
    divisors.

    *row_labels* can be supplied, otherwise they will be detected from
    *index* or from the first item in *data*, if available, and if not they
    will be blank.

    If *total_width* is supplied, width of each dataset in *data* will be
    scaled to that constant. If *box_height* is supplied, the height of each
    row will be *box_height*, otherwise it will be equal to *box_size*.
    IT IS STRONGLY RECOMMENDED that if supplying *total_width*, *box_height*
    also be specified, but this is not enforced.

    *draw_row_labels*, if True, will label the rows on the right hand side. As
    of 2013/09/03, this won't scale the SVG properly, so including the
    resulting file in an html element won't display properly.

    *spacers* is the distance between adjacent datasets.  Can either be a
    number, in which case it will apply to all datasets, or an iterable for
    different distances. If the iterable is shorter than the number of
    datasets, the last value will be repeated.

    *split_columns* splits each dataset into one sub-dataset per distinct
    column-name prefix (the text before *col_sep*).

    *draw_average* adds a row with the column means of the normalized data
    below each dataset; *draw_average_only* draws only that row.

    *hatch_nan* overlays a hatch pattern on cells whose normalized value is
    not finite.  Such cells are coloured with *nan_replace* when it is given,
    otherwise with the mean of the nearest finite neighbours in the same row.

    *convert*, if True, launches the external ``convert`` program on the
    written file to produce a PNG (presumably ImageMagick; not waited for).

    The function writes *filename* and returns None.
    """
    import svgwrite as svg
    try:
        import pandas as pd
        has_pandas = True
    except:
        # NOTE(review): bare except treats any failure while importing pandas
        # as "pandas unavailable", not only ImportError.
        has_pandas = False
        # Without pandas the caller has to supply row/column names explicitly.
        assert all_indices
        assert all_colnames

    if not isinstance(data, tuple):
        data = (data,)

    # A single normalizer is recycled for every dataset.
    # (repeat is presumably itertools.repeat, imported at file level.)
    if not isinstance(norm_rows_by, tuple):
        norm_rows_by = repeat(norm_rows_by)

    old_data = data
    colname_tuple = repeat(None)
    if split_columns and has_pandas:
        # Break every dataset into one DataFrame per column-name prefix,
        # duplicating its normalizer and colormap for each piece.
        from Utils import sel_startswith
        data = []
        new_normers = []
        new_cmaps = []
        if isinstance(cmap, tuple):
            cmaps = cmap
        else:
            cmaps = repeat(cmap)
        for dataset, normer, c_cmap in zip(old_data, norm_rows_by, cmaps):
            if dataset is None:
                # Placeholders are passed through unchanged.
                data.append(dataset)
                new_normers.append(normer)
                new_cmaps.append(c_cmap)
                continue

            if not isinstance(dataset, pd.DataFrame):
                dataset = pd.DataFrame(dataset).T
            colnames = list(sorted(
                {col.split(col_sep)[0] for col in dataset.columns}))
            data.extend(
                dataset.select(**sel_startswith(colname)) for colname in colnames
            )
            new_normers.extend(normer for colname in colnames)
            new_cmaps.extend(c_cmap for colname in colnames)
        data = tuple(data)
        norm_rows_by = tuple(new_normers)
        cmap = tuple(new_cmaps)
    elif split_columns and all_colnames:
        # Non-pandas path: split a raw array by matching column-name prefixes.
        colnames = list(sorted(
            {col.split(col_sep)[0] for col in all_colnames}))
        colname = colnames[0]
        data = tuple([
            data[:, array([c.startswith(colname) for c in internal_datanames])]
            for colname in colnames
        ])
        colname_tuple = tuple(
            [c for c in all_colnames if c.startswith(colname)]
            for colname in colnames
        )
    elif not split_columns and all_colnames:
        colname_tuple = tuple(
            [c for c in all_colnames if c.startswith(dataname)]
            for dataname in internal_datanames
        )

    # Shape is taken from the first dataset that is not a None placeholder.
    rows, cols = np.shape([ds for ds in data if ds is not None][0])
    if index is not None:
        rows = len(index)
    if box_height is None:
        box_height = box_size

    if row_labels is None:
        if index is not None:
            row_labels = list(index)
        elif hasattr(data[0], 'index'):
            row_labels = list(data[0].index)
        else:
            row_labels = ['' for row in range(rows)]

    # Create the drawing, sizing the canvas when enough layout information
    # is available.
    if total_width is not None and max_width is not np.inf:
        # Datasets wrap onto several plotted rows once max_width is reached.
        boxes_per_row = max_width // (1.1 * total_width)
        if ((boxes_per_row + 1) * 1.1 * total_width - .1 * total_width
                < max_width):
            boxes_per_row += 1
        num_plotted_rows = np.ceil(len(data) / boxes_per_row
                                   + (draw_average or draw_average_only))
        if figure_title is None:
            fig_title_height = 0
        elif isinstance(figure_title, tuple):
            fig_title_height = len(figure_title)
        else:
            fig_title_height = 1
        dwg = svg.Drawing(filename,
                          size=(max_width + 2 * x_min + 200 * draw_row_labels,
                                2 * y_min
                                + (num_plotted_rows * (rows) * box_height)
                                + 80 * (fig_title_height)
                                + 80 * draw_name
                                + (num_plotted_rows - 1) * vspacer),
                         )
    elif total_width is not None:
        width = len(data) * total_width * 1.1 - .1 * total_width
        height = rows * box_height
        max_row_label_len = max(len(str(i)) for i in row_labels)
        dwg = svg.Drawing(filename,
                          size=(width + 2 * x_min + 20 * draw_row_labels * max_row_label_len,
                                height + 2 * y_min + 80 * draw_name
                                + (80 * (figure_title is not None)))
                         )
    else:
        dwg = svg.Drawing(filename)
    dwg.add(svg.base.Title(path.basename(filename)))

    # Diagonal-line pattern referenced later as url(#hatch) for non-finite cells.
    pat = dwg.pattern(id='hatch', insert=(0, 0), size=(hatch_size, hatch_size),
                      patternUnits='userSpaceOnUse')
    g = pat.add(dwg.g(style="fill:none; stroke:#B0B0B0; stroke-width:1"))
    g.add(dwg.path(('M0,0', 'l{hatch},{hatch}'.format(hatch=hatch_size))))
    g.add(dwg.path(('M{hatch2},0 l{hatch2},{hatch2}'.format(hatch2=hatch_size/2).split())))
    g.add(dwg.path(('M0,{hatch2} l{hatch2},{hatch2}'.format(hatch2=hatch_size/2).split())))

    dwg.add(pat)

    # Redundant with the identical check above; box_height is already set.
    if box_height is None:
        box_height = box_size

    # A single colormap is replicated once per dataset.
    if not hasattr(cmap, "__len__"):
        cmap = [cmap for frame in data]

    if data_names is None:
        data_names = ["" for frame in data]

    if len(cmap) != len(data):
        raise ValueError("cmap and data should be the same length ({} vs {})"
                         .format(len(cmap), len(data)))

    # Pad spacers by repeating the last value until there is one per dataset.
    if not hasattr(spacers, "__len__"):
        spacers = [spacers]
    else:
        spacers = list(spacers)
    while len(spacers) < len(data):
        spacers.append(spacers[-1])

    # The 'center0all*' normalizers need the row maxima over all datasets.
    # all_data is only bound when such a normalizer is requested.
    if ((isinstance(norm_rows_by, repeat)
         and isinstance(next(norm_rows_by), str)
         and next(norm_rows_by).startswith('center0all'))
            or (not isinstance(norm_rows_by, repeat)
                and isinstance(norm_rows_by[0], str)
                and np.any([i.startswith('center0all') for i in norm_rows_by]))):
        all_data = pd.concat(data, axis=1)

    if squeeze_rows is not None:
        # Collapse each dataset to a single column named after the common
        # prefix of its column labels.
        data = [
            pd.DataFrame(d.apply(squeeze_rows, axis=1),
                         columns=[path.commonprefix(list(d.columns))])
            for d in data
        ]

    x_start = x_min
    y_start = y_min
    y_diff = 0  # not read anywhere below
    iterator = zip(data, cmap, data_names, norm_rows_by, spacers,
                   colname_tuple)
    if figure_title:
        if isinstance(figure_title, tuple):
            # First line is drawn large, following lines smaller.
            font_size = '3em'
            for title_line in figure_title:
                dwg.add(dwg.text(title_line, (x_start, y_start+75,),
                                 style="font-size:{};font-family:sans-serif".format(font_size)))
                y_start += 80
                font_size = '1.5em'
        else:
            dwg.add(dwg.text(figure_title, (x_start, y_start+75,),
                             style="font-size:3em;font-family:sans-serif"))
            y_start += 80
    if progress_bar:
        from progressbar import ProgressBar
        pbar = ProgressBar(maxval=len(data)*rows).start()
        pbar_val = 0

    for frame, c_cmap, name, normer, spacer, colnames in iterator:
        if frame is None:
            # Placeholder dataset: write its normalizer entry as a text label
            # and advance to the next slot.
            dwg.add(dwg.text(normer, (x_start, y_start + box_height/2)))
            if total_width is not None:
                if spacer is None:
                    x_start += total_width * 1.1
                else:
                    x_start += total_width + spacer
            else:
                if spacer is None:
                    x_start += box_size
                else:
                    x_start += spacer
            if x_start > max_width:
                x_start = x_min
                y_start += box_height + vspacer
            continue
        if has_pandas:
            frame = pd.DataFrame(frame)
        if index is not None:
            if has_pandas:
                frame = frame.ix[index]
            else:
                # Row selection for plain arrays is disabled (see the
                # commented-out lines); setix is currently unused.
                setix = set(index)
                #selector = [i for i, name in enumerate(all_indices) if name in setix]
                #frame = frame[selector, :]

        # --- Row normalization: produce norm_data as a plain array ---
        if normer is None:
            norm_data = array(frame.copy())
        # NOTE(review): `is 'mean'` is an identity comparison with a string
        # literal; it relies on string interning, unlike the `==` tests below.
        elif normer is 'mean':
            if has_pandas:
                norm_data = array(frame.divide(frame.dropna(axis=1, how='all').mean(axis=1)+10, axis=0))
            else:
                norm_data = frame / (frame[:,isfinite(frame[0,:])].mean(axis=1) + 10).reshape((rows, 1))
        elif normer == 'max':
            if has_pandas:
                norm_data = array(frame.divide(frame.dropna(axis=1, how='all').max(axis=1)+10, axis=0))
            else:
                norm_data = frame / (frame[:,isfinite(frame[0,:])].max(axis=1) + 10).reshape((rows, 1))
        elif normer == 'maxall':
            if has_pandas:
                # Row maximum across every dataset whose normalizer is 'maxall'.
                # NOTE(review): new_normers is only bound in the
                # split_columns-with-pandas branch near the top.
                maxall = frame.max(axis=1)
                assert len(data) == len(new_normers)
                for old_frame, norm_type in zip(data, new_normers):
                    if norm_type != 'maxall': continue
                    if old_frame is not None:
                        old_frame = old_frame.max(axis=1).ix[index if index is not None else old_frame.index]
                        maxall = maxall.where(maxall > old_frame, old_frame)
                norm_data = array(frame.divide(maxall + 10, axis=0))
            else:
                norm_data = frame / (old_data[:, isfinite(old_data[0, :])]
                                     .max(axis=1) + 10).reshape((rows, 1))
        elif normer == 'fullvar':
            # Shift each row by its minimum (less 1e-6), then divide by the
            # resulting row maximum.
            norm_data = frame.subtract(frame
                                       .dropna(axis=1, how='all')
                                       .min(axis=1)-1e-6,
                                       axis=0)
            norm_data = array(norm_data.divide(norm_data
                                               .dropna(axis=1, how='all')
                                               .max(axis=1),
                                               axis=0))
        elif normer == 'center0':
            # Scale by the row's largest absolute value so that 0 maps to 0.5.
            norm_data = array(0.5 +
                              0.5 * frame.divide(frame.dropna(axis=1).abs().max(axis=1),
                                                 axis=0)
                              )
        elif isinstance(normer, str) and normer.startswith('center0min'):
            # The text after 'center0min' is a lower bound on the divisor.
            min_norm = (
                frame.dropna(axis=1).abs()
                .max(axis=1).clip(float(normer[10:]), 1e6)
            )
            norm_data = array(0.5+
                              0.5 * frame.divide(min_norm, axis=0))
        elif isinstance(normer, str) and normer.startswith('center0allmin'):
            # As above, but the divisor comes from all datasets combined.
            min_norm = (
                all_data.dropna(axis=1).abs()
                .max(axis=1).clip(float(normer[13:]), 1e6)
            )
            norm_data = array(0.5+
                              0.5 * frame.divide(min_norm, axis=0))
        elif normer == 'center0all':
            norm_data = array(0.5 +
                              0.5 * frame.divide(all_data.dropna(how='all', axis=1).abs().max(axis=1),
                                                 axis=0)
                              )
        elif normer == 'center0pre':
            # Values are used as given, only shifted and halved.
            norm_data = array(0.5 + 0.5 * frame)
        elif isinstance(normer, (int, float)):
            norm_data = array(frame / normer)
            # Rebound to a string so the str methods used on normer below work.
            normer = 'constant'
        elif index is not None and hasattr(normer, "ix"):
            norm_data = array(frame.divide(normer.ix[index], axis=0))
        elif hasattr(normer, "__len__") and len(normer) == rows:
            if has_pandas:
                norm_data = array(frame.divide(normer, axis=0))
            else:
                norm_data = array(frame / np.reshape(normer, (rows, -1)))
        elif hasattr(normer, "__len__"):
            # Sized normalizer of the wrong length: print diagnostics, then fail.
            print('\n'*5)
            print(len(normer), normer, normer=='max')
            print(frame.shape)
            raise TypeError("norm_rows_by should be the same shape "
                            "as the number of rows")
        else:
            norm_data = array(frame / normer)

        if not c_cmap or str(c_cmap).lower() == 'default':
            c_cmap = ISH

        new_rows, new_cols = np.shape(frame)
        if hasattr(frame, 'index'):
            col_labels = frame.columns
        elif colnames:
            col_labels = colnames
        else:
            col_labels = ['' for col in range(new_cols)]
        if new_rows != rows:
            raise ValueError("All input elements must have the same number of"
                             " rows (and same row meanings --unchecked)")

        # With a fixed dataset width, the box width depends on the column count.
        if total_width is not None:
            box_size = total_width / float(new_cols)

        # i is read after the loops below, so it needs a value even when no
        # per-row loop runs.
        i = 0
        if not draw_average_only:
            for i in range(rows):
                if progress_bar:
                    pbar.update(pbar_val)
                    pbar_val += 1
                prefix = col_labels[0][:col_labels[0].find(col_sep)]
                if cmap_by_prefix:
                    c_cmap = cmap_by_prefix(prefix)
                for j in range(new_cols):
                    g = dwg.g()
                    val = frame.ix[i,j] if has_pandas else frame[i,j]
                    # Title element carrying the row, column and raw value.
                    g.add(svg.base.Title("{}, {}: {:.2f}".format(row_labels[i], col_labels[j], val)))
                    hatch = not isfinite(norm_data[i,j])
                    if hatch and nan_replace is not None:
                        if isinstance(nan_replace, float):
                            norm_data[i, j] = nan_replace
                        else:
                            # Non-float nan_replace: use the neutral value of
                            # the normalization in effect.
                            if normer.startswith('center0'):
                                norm_data[i, j] = 0.5
                            else:
                                norm_data[i, j] = 0.0
                    elif hatch:
                        # Fill with the mean of the nearest finite values to
                        # the left and right in the same row.
                        n = 0
                        norm_data[i, j] = 0
                        left = j - 1
                        while left >= 0:
                            if isfinite(norm_data[i, left]):
                                norm_data[i, j] += norm_data[i, left]
                                n += 1
                                break
                            left -= 1
                        right = j + 1
                        while right < norm_data.shape[1]:
                            if isfinite(norm_data[i, right]):
                                norm_data[i, j] += norm_data[i, right]
                                n+= 1
                                break
                            right += 1
                        if n == 0:
                            norm_data[i, j] = .5 if 'center' in normer else 0
                        else:
                            norm_data[i, j] /= n
                    g.add(dwg.rect((x_start + box_size*j, y_start + i*box_height),
                                   (box_size, box_height),
                                   style="fill:#{:02x}{:02x}{:02x}"
                                   .format(*[int(255*x) for x in
                                             c_cmap(norm_data[i, j])])))
                    dwg.add(g)
                    if hatch_nan and hatch:
                        g.add(dwg.rect((x_start + box_size*j,
                                        y_start + i*box_height),
                                       (box_size, box_height),
                                       style="fill:url(#hatch)"
                                      )
                             )
                    col_base = col_labels[j][:col_labels[j].find(col_sep)]
                    if col_base != prefix:
                        # Column prefix changed: switch colormap if requested
                        # and draw a vertical separator line.
                        prefix = col_base
                        if cmap_by_prefix:
                            c_cmap = cmap_by_prefix(prefix)
                        g.add(dwg.line((x_start + box_size * j,
                                        y_start + i * box_height),
                                       (x_start + box_size * j,
                                        y_start + (i + 1) * box_height),
                                       style="stroke-width:{}; stroke:#000000"
                                       .format(.1 * box_size)))
        else:
            # Average-only mode: fill columns whose first row is non-finite
            # from the adjacent columns so the column means are defined.
            for j in range(new_cols):
                hatch = not isfinite(norm_data[0, j])
                if hatch:
                    n = 0
                    norm_data[:, j] = 0
                    if j > 0 and isfinite(norm_data[0,j-1]):
                        norm_data[:, j] += norm_data[:, j-1]
                        n += 1
                    if (j + 1 < norm_data.shape[1]
                            and isfinite(norm_data[0, j+1])):
                        norm_data[:, j] += norm_data[:, j+1]
                        n += 1
                    norm_data[:, j] /= n
        # Optional labels at the first and last column of this dataset.
        dwg.add(dwg.text(first_col, (x_start,
                                     y_start + (i + 1) * box_height)))
        dwg.add(dwg.text(last_col, (x_start + (new_cols - 1) * box_size,
                                    y_start + (i + 1) * box_height)))
        if draw_box and not draw_average_only:
            dwg.add(dwg.rect((x_start, y_start + 0),
                             (new_cols*box_size, rows*box_height),
                             style="stroke-width:1; "
                             "stroke:#000000; fill:none"))
        if draw_average or draw_average_only:
            # One extra row with the column means of the normalized data.
            avg_frame = norm_data.mean(axis=0)
            for j in range(new_cols):
                col_base = col_labels[j][:col_labels[j].find(col_sep)]
                prefix = col_base
                if cmap_by_prefix:
                    c_cmap = cmap_by_prefix(prefix)
                g = dwg.g()
                g.add(svg.base.Title("Average, {}: {:.2f}".format(col_labels[j], avg_frame[j])))
                g.add(dwg.rect((x_start + box_size*j,
                                y_start + (i+(not draw_average_only))*box_height),
                               (box_size, box_height),
                               style="fill:#{:02x}{:02x}{:02x}"
                               .format(*[int(255*x) for x in
                                         c_cmap(average_scale*avg_frame[j])])))
                if not isfinite(norm_data[0, j]) and hatch_nan:
                    g.add(dwg.rect((x_start + box_size*j,
                                    y_start + (i+(not draw_average_only))*box_height),
                                   (box_size, box_height),
                                   style="fill:url(#hatch)"
                                  )
                         )
                dwg.add(g)
            dwg.add(dwg.rect((x_start,
                              y_start + (i+(not draw_average_only))*box_height),
                             (new_cols*box_size, 1*box_height),
                             style="stroke-width:1; stroke:#000000; fill:none"
                            ))

        if draw_name:
            # Unnamed split datasets are labelled with their column prefix.
            if name == "" and split_columns:
                name = col_base
            xpos = x_start + box_size * new_cols / 2.0
            text = dwg.text('',
                            (xpos,
                             y_start
                             + box_height * (rows) * (1-draw_average_only)
                             + box_height * (draw_average or draw_average_only)
                             + 13),
                            style="text-anchor: middle;font-family:sans-serif;")
            text.add(dwg.tspan("", dy=["-1.5em"]))
            # Each underscore-separated part of the name goes on its own line.
            for line in name.split('_'):
                text.add(dwg.tspan(line,
                                   dy=["1.5em"],
                                   x=[xpos],
                                   style="text-anchor: middle;",
                                   ))
            dwg.add(text)

        # Advance to the next dataset's horizontal position.
        if total_width is not None:
            if spacer is None:
                x_start += total_width * 1.1
            else:
                x_start += total_width + spacer
        else:
            if spacer is None:
                x_start += new_cols * box_size + box_size
            else:
                x_start += new_cols * box_size + spacer

        #y_diff = new_rows * box_height + vspacer
        # Wrap to a new plotted row once the next dataset would pass max_width.
        # NOTE(review): total_width defaults to None, which would make this
        # addition fail on Python 3 -- confirm callers always pass it.
        if x_start + total_width >= max_width:
            x_start = x_min
            y_start += new_rows*box_height*(not draw_average_only) + vspacer
            y_start += box_height*(draw_average_only or draw_average)

    if draw_row_labels and isinstance(row_labels[0], tuple):
        # Tuple labels are laid out in columns: find the widest entry per
        # position, then turn the widths into cumulative offsets.
        lwidths = Counter()
        for r in row_labels:
            for i, l in enumerate(r):
                lwidths[i] = max(lwidths[i], len(str(l)))
        cum_len = 0
        for i in range(len(lwidths)):
            old_width = lwidths[i]
            lwidths[i] += cum_len
            cum_len += old_width

    if draw_row_labels and not draw_average_only:
        for i in range(rows):
            if color_row_labels:
                # Labels contained in color_row_labels are drawn in red.
                style = "font-family:sans-serif; font-size: {size}; fill: {color};".format(
                    size=box_height,
                    color='red' if row_labels[i] in color_row_labels else 'black',
                )
            else:
                style = "font-family:sans-serif; font-size: {}".format(box_height)
            if isinstance(row_labels[i], tuple):
                labeltext = dwg.g()
                for lnum, ltext in enumerate(row_labels[i]):
                    labeltext.add(dwg.text(ltext,
                                           (x_start + lwidths[lnum-1] * 10 + lnum * 50,
                                            y_start + i * box_height + box_height),
                                           style=style,
                                          ))
            else:
                labeltext = (dwg.text(row_labels[i],
                                      (x_start, y_start + i*box_height+box_height),
                                      style=style,
                                     ))
            if make_hyperlinks:
                if make_hyperlinks is True:
                    # Default link target; ftype is 2 for labels starting
                    # with 'FBgn' and 1 otherwise.
                    link = dwg.a('http://insitu.fruitfly.org/cgi-bin/ex/report.pl?ftype={}&ftext={}'
                                 .format(2 if (isinstance(row_labels[i], str)
                                               and (row_labels[i].startswith('FBgn'))
                                              )
                                         else 1,
                                         row_labels[i]),
                                 target='_replace',
                                )
                else:
                    # Otherwise make_hyperlinks is a format string filled with
                    # the row's index label from the last dataset drawn.
                    link = dwg.a(make_hyperlinks.format(frame.index[i]))
                link.add(labeltext)
                dwg.add(link)
            else:
                dwg.add(labeltext)
    if progress_bar:
        pbar.finish()
    dwg.saveas(filename)
    if convert:
        # Start the external converter without waiting for it to finish.
        # The output name replaces every 'svg' in filename with 'png'.
        cmd = [
            'convert',
            filename,
            '-units', 'PixelsPerInch',
            '+antialias',
            '-density', '600',
            '-background', 'none',
            '-transparent', 'white',
            filename.replace('svg', 'png'),
        ]
        subprocess.Popen(cmd)
parser.add_argument('--no-multi', dest='multi', action='store_false') return parser.parse_args() if __name__ == "__main__": synonyms = get_synonyms() args = parse_args() expr = pd.read_table('analysis_godot/summary.tsv', **pd_kwargs).drop('---', axis='columns') ase = pd.read_table('analysis_godot/ase_summary_by_read.tsv', **pd_kwargs).drop('---', axis='columns') mel = expr.select(**sel_startswith('melXmel_')) sim = expr.select(**sel_startswith('simXsim_')) hybrids = expr.select(**sel_startswith(('melXsim', 'simXmel'))) melXsim = expr.select(**sel_startswith('melXsim')) simXmel = expr.select(**sel_startswith('simXmel')) expr_in_mel = (mel.max(axis=1) > EXPR_MIN) expr_in_sim = sim.max(axis=1) > EXPR_MIN expr_in_hybrids = (hybrids.max(axis=1) > EXPR_MIN) expr_in_all = (expr_in_mel & expr_in_sim & expr_in_hybrids) expr = expr.ix[expr_in_all] ase = ase.ix[expr.index] ase_classes = hu.get_classes(ase, pbar=pbar, style='cutoff') not_maternal = ase_classes.index[~((ase_classes.melXsim == 0) & (ase_classes.simXmel == 0))]
], axis=0) return (weaker_mel_bias, weaker_sim_bias) if __name__ == "__main__": ase = locals().get('ase', None) expr = locals().get('expr', None) if ase is None or expr is None or not np.all(ase.index == expr.index): print("reloading files") expr = pd.read_table('analysis_godot/summary.tsv', **pd_kwargs).dropna(how='all', axis=1) ase = (pd.read_table('analysis_godot/ase_summary_by_read.tsv', **pd_kwargs).dropna(how='all', axis=1).dropna( how='all', axis=0).select(**sel_startswith(('melXsim', 'simXmel')))) chrom_of = get_chroms() males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2') on_x = [ chrom_of[gene] == 'X' if gene in chrom_of else False for gene in ase.index ] is_male = [col.startswith(males) for col in ase.columns] ase.ix[on_x, is_male] = np.nan melXsim_expr = expr.select(**sel_startswith('melXsim')) simXmel_expr = expr.select(**sel_startswith('simXmel')) melXsim_ase = ase.select(**sel_startswith('melXsim')) simXmel_ase = ase.select(**sel_startswith('simXmel')) melXsim_is_expr = (melXsim_expr > EXPR_MIN)
make_hyperlinks=True, convert=True, vspacer=0, max_width=200, cmap=cm.RdBu, ) if __name__ == "__main__": ase = (pd .read_table('analysis_godot/ase_summary_by_read.tsv', index_col=0, keep_default_na=False, na_values=['---'],) .dropna(how='all', axis=1) .dropna(how='all', axis=0) .select(**sel_startswith(('melXsim', 'simXmel'))) ) paris = pd.read_table('prereqs/GSE68062_Gene_CLASS_after_FPKM_normalization.txt', index_col=1)['mel.CLASS'] pzyg = paris[paris == 'zyg'] melXsim = ase.select(**sel_startswith('melXsim')).select(pzyg.__contains__) simXmel = ase.select(**sel_startswith('simXmel')).select(pzyg.__contains__) fbgns = pd.read_table('prereqs/gene_map_table_fb_2016_01.tsv', index_col=1,skiprows=5).ix[:, 0] max_slice = defaultdict(int) for sl in ase.columns: sl = sl.split('_sl') emb = sl[0]
male_hybrid_embryos = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2') female_hybrid_embryos = ('melXsim_cyc14C_rep1', 'melXsim_cyc14C_rep2', 'simXmel_cyc14C_rep1') if __name__ == "__main__": expr = pd.read_table('godot/summary_fb.tsv', **pd_kwargs) ase = (pd.read_table('godot/ase_summary.tsv', **pd_kwargs) .dropna(how='all', axis=0) ) chrom_of = get_chroms() ase = ase.select(lambda x: chrom_of[x] != 'X') expr_males = expr.select(**sel_startswith(male_hybrid_embryos)) expr_females = expr.select(**sel_startswith(female_hybrid_embryos)) ase_males = ase.select(**sel_startswith(male_hybrid_embryos)) ase_females = ase.select(**sel_startswith(female_hybrid_embryos)) ase_xs = get_xs(ase) ase_maternals = pd.Series( index=ase_xs.index, data=[1 if col.startswith('simXmel') else -1 for col in ase_xs.index] ) if 'logistic_females' in locals() and locals().get('recalculate', True): with Pool() as p: logistic_females = fit_all_ase(ase_females, logistic, ase_xs.ix[ase_females.columns],
parser = ArgumentParser() parser.add_argument('--multi', default=True, action='store_true') parser.add_argument('--no-multi', dest='multi', action='store_false') return parser.parse_args() if __name__ == "__main__": synonyms = get_synonyms() args = parse_args() expr = pd.read_table('analysis_godot/summary.tsv', **pd_kwargs).drop('---', axis='columns') ase = pd.read_table('analysis_godot/ase_summary_by_read.tsv', **pd_kwargs).drop('---', axis='columns') mel = expr.select(**sel_startswith('melXmel_')) sim = expr.select(**sel_startswith('simXsim_')) hybrids = expr.select(**sel_startswith(('melXsim', 'simXmel'))) melXsim = expr.select(**sel_startswith('melXsim')) simXmel = expr.select(**sel_startswith('simXmel')) expr_in_mel = (mel.max(axis=1) > EXPR_MIN) expr_in_sim = sim.max(axis=1) > EXPR_MIN expr_in_hybrids = (hybrids.max(axis=1) > EXPR_MIN) expr_in_all = (expr_in_mel & expr_in_sim & expr_in_hybrids) expr = expr.ix[expr_in_all] ase = ase.ix[expr.index] ase_classes = hu.get_classes(ase, pbar=pbar, style='cutoff') not_maternal = ase_classes.index[~((ase_classes.melXsim == 0) & (ase_classes.simXmel == 0))]
def svg_heatmap(data, filename, row_labels=None, box_size=4, index=None,
                all_indices=None, all_colnames=None, internal_datanames=None,
                cmap=ISH, norm_rows_by=None, draw_row_labels=False,
                color_row_labels=False, col_sep='', box_height=None,
                total_width=None, draw_box=False, draw_name=False,
                data_names=None, make_hyperlinks=False, progress_bar=False,
                max_width=np.inf, x_min=10, y_min=10, spacers=None,
                convert=False, squeeze_rows=None, cmap_by_prefix=None,
                draw_average=False, draw_average_only=False,
                average_scale=1, split_columns=False, vspacer=30,
                hatch_nan=True, hatch_size=20, figure_title=None,
                nan_replace=None, first_col='', last_col=''):
    """
    Draw heatmap as an SVG file stored in filename

    *data* can be either a 2D array-like type (list of lists, numpy array,
    pandas DataFrame, etc), or a tuple of 2D array-likes, in which case a
    separator will be added between each one in the output.  ``None``
    entries in the tuple are drawn as a text placeholder (the matching
    *norm_rows_by* entry is used as the text).

    *cmap* is a matplotlib-like colormap (i.e. a callable that expects floats
    in the range 0.0-1.0.), or an iterable of the same length as the tuple
    *data* containing colormaps.  A falsy entry or the string ``'default'``
    selects the module-level ``ISH`` colormap.  If *cmap_by_prefix* is given,
    it is called with each column-name prefix (the text before *col_sep*) and
    its result is used as the colormap instead.

    *row_labels* can be supplied, otherwise they will be detected from
    *index* or from the first item in *data*, if available, and if not they
    will be blank.

    If *total_width* is supplied, width of each dataset in *data* will be
    scaled to that constant. If *box_height* is supplied, the height of each
    row will be *box_height*, otherwise it will be equal to *box_size*.
    IT IS STRONGLY RECOMMENDED that if supplying *total_width*, *box_height*
    also be specified, but this is not enforced.

    *norm_rows_by* selects the per-row normalization.  It is either a single
    value applied to every dataset or a tuple with one value per dataset.
    Recognized strings are ``'mean'``, ``'max'``, ``'maxall'``,
    ``'fullvar'``, ``'center0'``, ``'center0min<float>'``,
    ``'center0allmin<float>'``, ``'center0all'`` and ``'center0pre'``; a
    number divides every value by that constant; a row-length sequence
    divides each row by the matching element; ``None`` leaves the data
    untouched.

    *draw_row_labels*, if True, will label the rows on the right hand side.
    As of 2013/09/03, this won't scale the SVG properly, so including the
    resulting file in an html element won't display properly.  If
    *color_row_labels* is a container, labels found in it are drawn in red.
    *make_hyperlinks* wraps each label in a link: ``True`` uses the built-in
    insitu.fruitfly.org URL, any other truthy value is used as a format
    string that receives the row index label.

    *spacers* is the distance between adjacent datasets.  Can either be a
    number, in which case it will apply to all datasets, or an iterable for
    different distances. If the iterable is shorter than the number of
    datasets, the last value will be repeated.

    *max_width* wraps datasets onto a new band of rows (separated by
    *vspacer*) once the running x position would reach it.  *x_min* and
    *y_min* are the left and top margins.

    *split_columns* splits each dataset into one dataset per column-name
    prefix (text before *col_sep*); *col_sep* must then be non-empty.

    *squeeze_rows*, if given, is applied to every row of every dataset
    (``DataFrame.apply(..., axis=1)``), collapsing each dataset to a single
    column.

    *draw_average* adds a row holding the column means of the normalized
    data (scaled by *average_scale*) under each dataset;
    *draw_average_only* draws only that row.

    Non-finite cells are filled with *nan_replace* when it is given (a float
    is used verbatim, any other value means "0.5 for centered
    normalizations, otherwise 0"), else with the mean of the nearest finite
    neighbours in the same row.  When *hatch_nan* is true those cells are
    additionally overlaid with a hatch pattern of size *hatch_size*.

    *figure_title* is a string, or a tuple of strings drawn one per line.
    *first_col* and *last_col* are text labels written under the first and
    last column of every dataset.  *draw_box* outlines each dataset and
    *draw_name* writes its name (from *data_names*) underneath.

    If *convert* is true, ImageMagick's ``convert`` is launched (without
    waiting for it) to rasterize the SVG to a PNG next to it.

    When pandas cannot be imported, *all_indices* and *all_colnames* must be
    supplied.

    NOTE(review): this function relies on ``DataFrame.ix`` and
    ``DataFrame.select``, which only exist in old pandas releases.
    """
    import svgwrite as svg
    try:
        import pandas as pd
        has_pandas = True
    except ImportError:
        has_pandas = False
        assert all_indices
        assert all_colnames

    if not isinstance(data, tuple):
        data = (data, )

    if not isinstance(norm_rows_by, tuple):
        norm_rows_by = repeat(norm_rows_by)

    old_data = data
    colname_tuple = repeat(None)
    if split_columns and has_pandas:
        from Utils import sel_startswith
        data = []
        new_normers = []
        new_cmaps = []
        if isinstance(cmap, tuple):
            cmaps = cmap
        else:
            cmaps = repeat(cmap)
        for dataset, normer, c_cmap in zip(old_data, norm_rows_by, cmaps):
            if dataset is None:
                data.append(dataset)
                new_normers.append(normer)
                new_cmaps.append(c_cmap)
                continue

            if not isinstance(dataset, pd.DataFrame):
                dataset = pd.DataFrame(dataset).T
            colnames = list(
                sorted({col.split(col_sep)[0] for col in dataset.columns}))
            data.extend(
                dataset.select(**sel_startswith(colname))
                for colname in colnames)
            new_normers.extend(normer for colname in colnames)
            new_cmaps.extend(c_cmap for colname in colnames)
        data = tuple(data)
        norm_rows_by = tuple(new_normers)
        cmap = tuple(new_cmaps)
    elif split_columns and all_colnames:
        colnames = list(
            sorted({col.split(col_sep)[0] for col in all_colnames}))
        # NOTE(review): *data* is a tuple at this point, so the 2D slice
        # below looks like it cannot work as written -- confirm before
        # relying on the pandas-free code path.
        data = tuple([
            data[:, array([c.startswith(colname)
                           for c in internal_datanames])]
            for colname in colnames
        ])
        colname_tuple = tuple(
            [c for c in all_colnames if c.startswith(colname)]
            for colname in colnames)
    elif not split_columns and all_colnames:
        colname_tuple = tuple(
            [c for c in all_colnames if c.startswith(dataname)]
            for dataname in internal_datanames)

    # One normalizer per dataset, as a concrete list.  Needed by the
    # 'maxall' and 'center0all*' modes, which look across all datasets.
    if isinstance(norm_rows_by, repeat):
        all_normers = [next(norm_rows_by)] * len(data)
    else:
        all_normers = list(norm_rows_by)

    rows, _cols = np.shape([ds for ds in data if ds is not None][0])
    if index is not None:
        rows = len(index)
    if box_height is None:
        box_height = box_size

    if row_labels is None:
        if index is not None:
            row_labels = list(index)
        elif hasattr(data[0], 'index'):
            row_labels = list(data[0].index)
        else:
            row_labels = ['' for row in range(rows)]

    if total_width is not None and max_width is not np.inf:
        # Datasets wrap onto several bands; size the canvas accordingly.
        boxes_per_row = max_width // (1.1 * total_width)
        if ((boxes_per_row + 1) * 1.1 * total_width - .1 * total_width <
                max_width):
            boxes_per_row += 1
        num_plotted_rows = np.ceil(
            len(data) / boxes_per_row + (draw_average or draw_average_only))
        if figure_title is None:
            fig_title_height = 0
        elif isinstance(figure_title, tuple):
            fig_title_height = len(figure_title)
        else:
            fig_title_height = 1
        dwg = svg.Drawing(
            filename,
            size=(max_width + 2 * x_min + 200 * draw_row_labels,
                  2 * y_min + (num_plotted_rows * (rows) * box_height) +
                  80 * (fig_title_height) + 80 * draw_name +
                  (num_plotted_rows - 1) * vspacer),
        )
    elif total_width is not None:
        width = len(data) * total_width * 1.1 - .1 * total_width
        height = rows * box_height
        max_row_label_len = max(len(str(i)) for i in row_labels)
        dwg = svg.Drawing(
            filename,
            size=(width + 2 * x_min +
                  20 * draw_row_labels * max_row_label_len,
                  height + 2 * y_min + 80 * draw_name +
                  (80 * (figure_title is not None))))
    else:
        dwg = svg.Drawing(filename)
    dwg.add(svg.base.Title(path.basename(filename)))

    # Diagonal hatch pattern used to mark non-finite cells.
    pat = dwg.pattern(id='hatch', insert=(0, 0),
                      size=(hatch_size, hatch_size),
                      patternUnits='userSpaceOnUse')
    g = pat.add(dwg.g(style="fill:none; stroke:#B0B0B0; stroke-width:1"))
    g.add(dwg.path(('M0,0', 'l{hatch},{hatch}'.format(hatch=hatch_size))))
    g.add(
        dwg.path(('M{hatch2},0 l{hatch2},{hatch2}'.format(
            hatch2=hatch_size / 2).split())))
    g.add(
        dwg.path(('M0,{hatch2} l{hatch2},{hatch2}'.format(
            hatch2=hatch_size / 2).split())))
    dwg.add(pat)

    if not hasattr(cmap, "__len__"):
        cmap = [cmap for frame in data]

    if data_names is None:
        data_names = ["" for frame in data]

    if len(cmap) != len(data):
        raise ValueError(
            "cmap and data should be the same length ({} vs {})".format(
                len(cmap), len(data)))

    if not hasattr(spacers, "__len__"):
        spacers = [spacers]
    else:
        spacers = list(spacers)
    while len(spacers) < len(data):
        spacers.append(spacers[-1])

    # The 'center0all*' modes normalize against every dataset at once.
    if any(isinstance(n, str) and n.startswith('center0all')
           for n in all_normers):
        all_data = pd.concat(data, axis=1)

    if squeeze_rows is not None:
        data = [
            pd.DataFrame(d.apply(squeeze_rows, axis=1),
                         columns=[path.commonprefix(list(d.columns))])
            for d in data
        ]

    x_start = x_min
    y_start = y_min
    iterator = zip(data, cmap, data_names, norm_rows_by, spacers,
                   colname_tuple)
    if figure_title:
        if isinstance(figure_title, tuple):
            # First line is large, subsequent lines are smaller.
            font_size = '3em'
            for title_line in figure_title:
                dwg.add(
                    dwg.text(
                        title_line,
                        (
                            x_start,
                            y_start + 75,
                        ),
                        style="font-size:{};font-family:sans-serif".format(
                            font_size)))
                y_start += 80
                font_size = '1.5em'
        else:
            dwg.add(
                dwg.text(figure_title, (
                    x_start,
                    y_start + 75,
                ), style="font-size:3em;font-family:sans-serif"))
            y_start += 80
    if progress_bar:
        from progressbar import ProgressBar
        pbar = ProgressBar(maxval=len(data) * rows).start()
        pbar_val = 0

    for frame, c_cmap, name, normer, spacer, colnames in iterator:
        if frame is None:
            # Placeholder slot: write the normer as text and advance.
            dwg.add(dwg.text(normer, (x_start, y_start + box_height / 2)))
            if total_width is not None:
                if spacer is None:
                    x_start += total_width * 1.1
                else:
                    x_start += total_width + spacer
            else:
                if spacer is None:
                    x_start += box_size
                else:
                    x_start += spacer
            if x_start > max_width:
                x_start = x_min
                y_start += box_height + vspacer
            continue
        if has_pandas:
            frame = pd.DataFrame(frame)
        if index is not None and has_pandas:
            # Without pandas, rows are NOT reordered/subset by *index*.
            frame = frame.ix[index]

        # Only compare against mode names when the normer really is a
        # string: `is` on string literals is unreliable, and `==` on an
        # array-like normer yields an element-wise result with no single
        # truth value.
        normer_name = normer if isinstance(normer, str) else None

        if normer is None:
            norm_data = array(frame.copy())
        elif normer_name == 'mean':
            if has_pandas:
                norm_data = array(
                    frame.divide(
                        frame.dropna(axis=1, how='all').mean(axis=1) + 10,
                        axis=0))
            else:
                norm_data = frame / (
                    frame[:, isfinite(frame[0, :])].mean(axis=1) +
                    10).reshape((rows, 1))
        elif normer_name == 'max':
            if has_pandas:
                norm_data = array(
                    frame.divide(
                        frame.dropna(axis=1, how='all').max(axis=1) + 10,
                        axis=0))
            else:
                norm_data = frame / (
                    frame[:, isfinite(frame[0, :])].max(axis=1) +
                    10).reshape((rows, 1))
        elif normer_name == 'maxall':
            if has_pandas:
                # Row-wise maximum over every dataset that also uses
                # 'maxall'.
                maxall = frame.max(axis=1)
                for old_frame, norm_type in zip(data, all_normers):
                    if not (isinstance(norm_type, str)
                            and norm_type == 'maxall'):
                        continue
                    if old_frame is not None:
                        old_frame = old_frame.max(axis=1).ix[
                            index if index is not None else old_frame.index]
                        maxall = maxall.where(maxall > old_frame, old_frame)
                norm_data = array(frame.divide(maxall + 10, axis=0))
            else:
                norm_data = frame / (
                    old_data[:, isfinite(old_data[0, :])].max(axis=1) +
                    10).reshape((rows, 1))
        elif normer_name == 'fullvar':
            # Rescale each row to span (0, 1].
            norm_data = frame.subtract(
                frame.dropna(axis=1, how='all').min(axis=1) - 1e-6, axis=0)
            norm_data = array(
                norm_data.divide(
                    norm_data.dropna(axis=1, how='all').max(axis=1),
                    axis=0))
        elif normer_name == 'center0':
            norm_data = array(
                0.5 + 0.5 * frame.divide(
                    frame.dropna(axis=1).abs().max(axis=1), axis=0))
        elif normer_name is not None and normer_name.startswith(
                'center0min'):
            # The float after the prefix is a floor on the divisor.
            min_norm = (frame.dropna(axis=1).abs().max(axis=1).clip(
                float(normer_name[10:]), 1e6))
            norm_data = array(0.5 + 0.5 * frame.divide(min_norm, axis=0))
        elif normer_name is not None and normer_name.startswith(
                'center0allmin'):
            min_norm = (all_data.dropna(axis=1).abs().max(axis=1).clip(
                float(normer_name[13:]), 1e6))
            norm_data = array(0.5 + 0.5 * frame.divide(min_norm, axis=0))
        elif normer_name == 'center0all':
            norm_data = array(0.5 + 0.5 * frame.divide(
                all_data.dropna(how='all', axis=1).abs().max(axis=1),
                axis=0))
        elif normer_name == 'center0pre':
            norm_data = array(0.5 + 0.5 * frame)
        elif isinstance(normer, (int, float)):
            norm_data = array(frame / normer)
        elif index is not None and hasattr(normer, "ix"):
            norm_data = array(frame.divide(normer.ix[index], axis=0))
        elif hasattr(normer, "__len__") and len(normer) == rows:
            if has_pandas:
                norm_data = array(frame.divide(normer, axis=0))
            else:
                norm_data = array(frame / np.reshape(normer, (rows, -1)))
        elif hasattr(normer, "__len__"):
            print('\n' * 5)
            print(len(normer), normer, normer == 'max')
            print(frame.shape)
            raise TypeError("norm_rows_by should be the same shape "
                            "as the number of rows")
        else:
            norm_data = array(frame / normer)

        # Value used to fill non-finite cells depends on whether the
        # normalization is centered on 0.5.
        centered0 = (normer_name is not None
                     and normer_name.startswith('center0'))
        centered = normer_name is not None and 'center' in normer_name

        if not c_cmap or str(c_cmap).lower() == 'default':
            c_cmap = ISH

        new_rows, new_cols = np.shape(frame)
        if hasattr(frame, 'index'):
            col_labels = frame.columns
        elif colnames:
            col_labels = colnames
        else:
            col_labels = ['' for col in range(new_cols)]
        if new_rows != rows:
            raise ValueError("All input elements must have the same number of"
                             " rows (and same row meanings --unchecked)")

        if total_width is not None:
            box_size = total_width / float(new_cols)

        i = 0
        if not draw_average_only:
            for i in range(rows):
                if progress_bar:
                    pbar.update(pbar_val)
                    pbar_val += 1
                prefix = col_labels[0][:col_labels[0].find(col_sep)]
                if cmap_by_prefix:
                    c_cmap = cmap_by_prefix(prefix)
                for j in range(new_cols):
                    g = dwg.g()
                    val = frame.ix[i, j] if has_pandas else frame[i, j]
                    g.add(
                        svg.base.Title("{}, {}: {:.2f}".format(
                            row_labels[i], col_labels[j], val)))
                    hatch = not isfinite(norm_data[i, j])
                    if hatch and nan_replace is not None:
                        if isinstance(nan_replace, float):
                            norm_data[i, j] = nan_replace
                        else:
                            if centered0:
                                norm_data[i, j] = 0.5
                            else:
                                norm_data[i, j] = 0.0
                    elif hatch:
                        # Average the nearest finite neighbour on each
                        # side of the gap.
                        n = 0
                        norm_data[i, j] = 0
                        left = j - 1
                        while left >= 0:
                            if isfinite(norm_data[i, left]):
                                norm_data[i, j] += norm_data[i, left]
                                n += 1
                                break
                            left -= 1
                        right = j + 1
                        while right < norm_data.shape[1]:
                            if isfinite(norm_data[i, right]):
                                norm_data[i, j] += norm_data[i, right]
                                n += 1
                                break
                            right += 1
                        if n == 0:
                            norm_data[i, j] = .5 if centered else 0
                        else:
                            norm_data[i, j] /= n
                    g.add(
                        dwg.rect(
                            (x_start + box_size * j,
                             y_start + i * box_height),
                            (box_size, box_height),
                            style="fill:#{:02x}{:02x}{:02x}".format(*[
                                int(255 * x)
                                for x in c_cmap(norm_data[i, j])
                            ])))
                    dwg.add(g)
                    if hatch_nan and hatch:
                        g.add(
                            dwg.rect((x_start + box_size * j,
                                      y_start + i * box_height),
                                     (box_size, box_height),
                                     style="fill:url(#hatch)"))
                    col_base = col_labels[j][:col_labels[j].find(col_sep)]
                    if col_base != prefix:
                        # New column group: switch colormap and draw a
                        # vertical separator.
                        prefix = col_base
                        if cmap_by_prefix:
                            c_cmap = cmap_by_prefix(prefix)
                        g.add(
                            dwg.line(
                                (x_start + box_size * j,
                                 y_start + i * box_height),
                                (x_start + box_size * j,
                                 y_start + (i + 1) * box_height),
                                style="stroke-width:{}; stroke:#000000"
                                .format(.1 * box_size)))
        else:
            # Only the average row is drawn: patch whole NaN columns from
            # their immediate neighbours so the mean stays finite.
            for j in range(new_cols):
                hatch = not isfinite(norm_data[0, j])
                if hatch:
                    n = 0
                    norm_data[:, j] = 0
                    if j > 0 and isfinite(norm_data[0, j - 1]):
                        norm_data[:, j] += norm_data[:, j - 1]
                        n += 1
                    if (j + 1 < norm_data.shape[1]
                            and isfinite(norm_data[0, j + 1])):
                        norm_data[:, j] += norm_data[:, j + 1]
                        n += 1
                    norm_data[:, j] /= n
        dwg.add(
            dwg.text(first_col, (x_start, y_start + (i + 1) * box_height)))
        dwg.add(
            dwg.text(last_col, (x_start + (new_cols - 1) * box_size,
                                y_start + (i + 1) * box_height)))
        if draw_box and not draw_average_only:
            dwg.add(
                dwg.rect((x_start, y_start + 0),
                         (new_cols * box_size, rows * box_height),
                         style="stroke-width:1; "
                         "stroke:#000000; fill:none"))
        if draw_average or draw_average_only:
            avg_frame = norm_data.mean(axis=0)
            for j in range(new_cols):
                col_base = col_labels[j][:col_labels[j].find(col_sep)]
                prefix = col_base
                if cmap_by_prefix:
                    c_cmap = cmap_by_prefix(prefix)
                g = dwg.g()
                g.add(
                    svg.base.Title("Average, {}: {:.2f}".format(
                        col_labels[j], avg_frame[j])))
                g.add(
                    dwg.rect(
                        (x_start + box_size * j, y_start +
                         (i + (not draw_average_only)) * box_height),
                        (box_size, box_height),
                        style="fill:#{:02x}{:02x}{:02x}".format(*[
                            int(255 * x)
                            for x in c_cmap(average_scale * avg_frame[j])
                        ])))
                if not isfinite(norm_data[0, j]) and hatch_nan:
                    g.add(
                        dwg.rect(
                            (x_start + box_size * j, y_start +
                             (i + (not draw_average_only)) * box_height),
                            (box_size, box_height),
                            style="fill:url(#hatch)"))
                dwg.add(g)
            dwg.add(
                dwg.rect((x_start, y_start +
                          (i + (not draw_average_only)) * box_height),
                         (new_cols * box_size, 1 * box_height),
                         style="stroke-width:1; stroke:#000000; fill:none"))

        if draw_name:
            if name == "" and split_columns:
                name = col_base
            xpos = x_start + box_size * new_cols / 2.0
            text = dwg.text(
                '', (xpos, y_start + box_height *
                     (rows) * (1 - draw_average_only) + box_height *
                     (draw_average or draw_average_only) + 13),
                style="text-anchor: middle;font-family:sans-serif;")
            text.add(dwg.tspan("", dy=["-1.5em"]))
            # Underscores in the name become line breaks.
            for line in name.split('_'):
                text.add(
                    dwg.tspan(
                        line,
                        dy=["1.5em"],
                        x=[xpos],
                        style="text-anchor: middle;",
                    ))
            dwg.add(text)

        if total_width is not None:
            if spacer is None:
                x_start += total_width * 1.1
            else:
                x_start += total_width + spacer
        else:
            if spacer is None:
                x_start += new_cols * box_size + box_size
            else:
                x_start += new_cols * box_size + spacer

        # Wrap to a new band when the next dataset would not fit.  When
        # *total_width* is unset, fall back to this dataset's drawn width
        # instead of adding None.
        if total_width is not None:
            next_width = total_width
        else:
            next_width = new_cols * box_size
        if x_start + next_width >= max_width:
            x_start = x_min
            y_start += new_rows * box_height * (
                not draw_average_only) + vspacer
            y_start += box_height * (draw_average_only or draw_average)

    if draw_row_labels and isinstance(row_labels[0], tuple):
        # Multi-part labels: compute cumulative character widths so each
        # part can be placed in its own column.
        lwidths = Counter()
        for r in row_labels:
            for i, l in enumerate(r):
                lwidths[i] = max(lwidths[i], len(str(l)))
        cum_len = 0
        for i in range(len(lwidths)):
            old_width = lwidths[i]
            lwidths[i] += cum_len
            cum_len += old_width

    if draw_row_labels and not draw_average_only:
        for i in range(rows):
            if color_row_labels:
                style = ("font-family:sans-serif; font-size: {size}; "
                         "fill: {color};").format(
                             size=box_height,
                             color=('red'
                                    if row_labels[i] in color_row_labels
                                    else 'black'),
                         )
            else:
                style = "font-family:sans-serif; font-size: {}".format(
                    box_height)
            if isinstance(row_labels[i], tuple):
                labeltext = dwg.g()
                for lnum, ltext in enumerate(row_labels[i]):
                    labeltext.add(
                        dwg.text(
                            ltext,
                            (x_start + lwidths[lnum - 1] * 10 + lnum * 50,
                             y_start + i * box_height + box_height),
                            style=style,
                        ))
            else:
                labeltext = (dwg.text(
                    row_labels[i],
                    (x_start, y_start + i * box_height + box_height),
                    style=style,
                ))
            if make_hyperlinks:
                if make_hyperlinks is True:
                    link = dwg.a(
                        'http://insitu.fruitfly.org/cgi-bin/ex/report.pl?ftype={}&ftext={}'
                        .format(
                            2 if (isinstance(row_labels[i], str) and
                                  (row_labels[i].startswith('FBgn'))) else 1,
                            row_labels[i]),
                        target='_replace',
                    )
                else:
                    link = dwg.a(make_hyperlinks.format(frame.index[i]))
                link.add(labeltext)
                dwg.add(link)
            else:
                dwg.add(labeltext)

    if progress_bar:
        pbar.finish()
    dwg.saveas(filename)
    if convert:
        # Swap only the extension; str.replace would also rewrite any
        # 'svg' appearing in a directory name.
        png_name = path.splitext(filename)[0] + '.png'
        cmd = [
            'convert',
            filename,
            '-units',
            'PixelsPerInch',
            '+antialias',
            '-density',
            '600',
            '-background',
            'none',
            '-transparent',
            'white',
            png_name,
        ]
        subprocess.Popen(cmd)
else: step = 1 expr_min = 5 eps = 1 read_table_args = dict(index_col=0, keep_default_na=False, na_values=['---', '']) if 'all_expr' not in locals(): all_expr = (pd.read_table('analysis/summary.tsv', **read_table_args).sort_index()) top_expr = all_expr.max(axis=1) all_expr = all_expr.ix[top_expr > expr_min] all_expr = all_expr.ix[::step] wt = all_expr.select(**sel_startswith('WT')) bcd = all_expr.select(**sel_startswith('bcd')) zld = all_expr.select(**sel_startswith('zld')) g20 = all_expr.select(**sel_startswith('G20')) hb = all_expr.select(**sel_startswith('hb')) wts = bcds = zlds = g20s = hbs = 0 for sub_df_name in 'wt bcd zld g20 hb'.split(): sub_df = locals()[sub_df_name] cycs = { col.split('_sl')[0].split('_', 1)[1] for col in sub_df.columns } cyc_embs = {} for cyc in cycs: cyc_embs[cyc] = sub_df.select(**sel_contains(cyc))
def svg_heatmap(data, filename, row_labels=None, box_size=4,
                index=None,
                cmap=ISH, norm_rows_by=None, draw_row_labels=False,
                col_sep='', box_height=None, total_width=None,
                draw_box=False, draw_name=False, data_names=None,
                progress_bar=False,
                max_width=np.inf,
                spacers=None,
                cmap_by_prefix=None,
                split_columns=False,
                vspacer=30,
                hatch_nan=True, hatch_size=20,
                first_col='', last_col=''):
    """
    Draw heatmap as an SVG file stored in filename

    *data* can be either a 2D array-like type (list of lists, numpy array,
    pandas DataFrame, etc), or a tuple of 2D array-likes, in which case a
    separator will be added between each one in the output.  ``None``
    entries in the tuple only leave an empty slot.

    *cmap* is a matplotlib-like colormap (i.e. a callable that expects floats
    in the range 0.0-1.0.), or an iterable of the same length as the tuple
    *data* containing colormaps.  A falsy entry or the string ``'default'``
    selects the module-level ``ISH`` colormap.  If *cmap_by_prefix* is given,
    it is called with each column-name prefix (the text before *col_sep*) and
    its result is used as the colormap instead.

    *row_labels* can be supplied, otherwise they will be detected from
    *index* or from the first item in *data*, if available, and if not they
    will be blank.

    If *total_width* is supplied, width of each dataset in *data* will be
    scaled to that constant. If *box_height* is supplied, the height of each
    row will be *box_height*, otherwise it will be equal to *box_size*.
    IT IS STRONGLY RECOMMENDED that if supplying *total_width*, *box_height*
    also be specified, but this is not enforced.

    *norm_rows_by* is a single normalizer applied to every dataset or a
    tuple with one per dataset: ``None`` (no normalization), ``'mean'``,
    ``'max'``, ``'center0'``, or a value / row-length sequence that each row
    is divided by.

    *draw_row_labels*, if True, will label the rows on the right hand side.
    As of 2013/09/03, this won't scale the SVG properly, so including the
    resulting file in an html element won't display properly.

    *spacers* is the distance between adjacent datasets.  Can either be a
    number, in which case it will apply to all datasets, or an iterable for
    different distances. If the iterable is shorter than the number of
    datasets, the last value will be repeated.

    *max_width* wraps datasets onto a new band of rows (separated by
    *vspacer*) once the running x position would reach it.

    *split_columns* splits a DataFrame *data* into one dataset per
    column-name prefix (text before *col_sep*); *col_sep* must then be
    non-empty.

    Non-finite cells are coloured with the mean of their immediate left and
    right neighbours and, when *hatch_nan* is true, overlaid with a hatch
    pattern of size *hatch_size*.

    *first_col* and *last_col* are text labels written under the first and
    last column of every dataset.  *draw_box* outlines each dataset and
    *draw_name* writes its name (from *data_names*) underneath.

    NOTE(review): this function relies on ``DataFrame.ix`` and
    ``DataFrame.select``, which only exist in old pandas releases.
    """
    import svgwrite as svg
    import pandas as pd

    if split_columns and isinstance(data, pd.DataFrame):
        from Utils import sel_startswith
        colnames = list(sorted(
            {col.split(col_sep)[0] for col in data.columns}))
        data = tuple(
            data.select(**sel_startswith(colname)) for colname in colnames
        )
    elif not isinstance(data, tuple):
        data = (data,)

    rows, _cols = np.shape(data[0])
    if index is not None:
        rows = len(index)
    if box_height is None:
        box_height = box_size

    if total_width is not None and max_width is not np.inf:
        dwg = svg.Drawing(
            filename,
            size=(max_width,
                  np.ceil((len(data) * total_width) / max_width)
                  * (box_height + vspacer)))
    else:
        dwg = svg.Drawing(filename)
    dwg.add(svg.base.Title(path.basename(filename)))

    # Diagonal hatch pattern used to mark non-finite cells.
    pat = dwg.pattern(id='hatch', insert=(0, 0),
                      size=(hatch_size, hatch_size),
                      patternUnits='userSpaceOnUse')
    g = pat.add(dwg.g(style="fill:none; stroke:#B0B0B0; stroke-width:1"))
    g.add(dwg.path(('M0,0', 'l{hatch},{hatch}'.format(hatch=hatch_size))))
    g.add(dwg.path(('M{hatch2},0 l{hatch2},{hatch2}'.format(
        hatch2=hatch_size/2).split())))
    g.add(dwg.path(('M0,{hatch2} l{hatch2},{hatch2}'.format(
        hatch2=hatch_size/2).split())))
    dwg.add(pat)

    if row_labels is None:
        if index is not None:
            row_labels = index
        elif hasattr(data[0], 'index'):
            row_labels = data[0].index
        else:
            row_labels = ['' for row in range(rows)]

    if not hasattr(cmap, "__len__"):
        cmap = [cmap for frame in data]

    if data_names is None:
        data_names = ["" for frame in data]

    if len(cmap) != len(data):
        raise ValueError("cmap and data should be the same length")

    if not hasattr(spacers, "__len__"):
        spacers = [spacers]
    else:
        spacers = list(spacers)
    while len(spacers) < len(data):
        spacers.append(spacers[-1])

    if not isinstance(norm_rows_by, tuple):
        norm_rows_by = repeat(norm_rows_by)

    x_start = 0
    y_start = 0
    y_diff = 0
    iterator = zip(data, cmap, data_names, norm_rows_by, spacers)
    pbar = None
    pbar_val = 0
    if progress_bar:
        from progressbar import ProgressBar
        # zip objects have no len(); *data* bounds the number of frames.
        pbar = ProgressBar(maxval=len(data) * rows).start()

    for frame, c_cmap, name, normer, spacer in iterator:
        if frame is None:
            # Empty slot: just advance the cursor.
            if total_width is not None:
                if spacer is None:
                    x_start += total_width * 1.1
                else:
                    x_start += total_width + spacer
            else:
                if spacer is None:
                    x_start += box_size
                else:
                    x_start += spacer
            if x_start > max_width:
                x_start = 0
                y_start += y_diff
            continue

        frame = pd.DataFrame(frame)

        # Compare mode names by value, but only for real strings: `==`
        # on an array-like normer would be element-wise.
        normer_name = normer if isinstance(normer, str) else None

        if normer is None:
            norm_data = frame.copy()
        elif normer_name == 'mean':
            norm_data = frame.divide(
                frame.dropna(axis=1).mean(axis=1) + 10, axis=0)
        elif normer_name == 'max':
            norm_data = frame.divide(
                frame.dropna(axis=1).max(axis=1) + 10, axis=0)
        elif normer_name == 'center0':
            norm_data = (0.5 + 0.5 * frame.divide(
                frame.dropna(axis=1).abs().max(axis=1), axis=0))
        elif index is not None and hasattr(normer, "ix"):
            norm_data = frame.divide(normer.ix[index], axis=0)
        elif hasattr(normer, "__len__") and len(normer) == rows:
            norm_data = frame.divide(normer, axis=0)
        elif hasattr(normer, "__len__"):
            raise TypeError("norm_rows_by should be the same shape "
                            "as the number of rows")
        else:
            norm_data = frame.divide(normer, axis=0)

        if not c_cmap or str(c_cmap).lower() == 'default':
            c_cmap = ISH

        new_rows, new_cols = np.shape(frame)
        if hasattr(frame, 'index'):
            col_labels = frame.columns
        else:
            col_labels = ['' for col in range(new_cols)]
        if new_rows != rows:
            raise ValueError("All input elements must have the same number of"
                             " rows (and same row meanings --unchecked)")

        if total_width is not None:
            box_size = total_width / float(new_cols)

        for i in range(rows):
            if pbar is not None:
                pbar.update(pbar_val)
                pbar_val += 1
            prefix = col_labels[0][:col_labels[0].find(col_sep)]
            if cmap_by_prefix:
                c_cmap = cmap_by_prefix(prefix)
            for j in range(new_cols):
                g = dwg.g()
                g.add(svg.base.Title("{}, {}: {:.2f}".format(
                    row_labels[i], col_labels[j], frame.ix[i, j])))
                hatch = not isfinite(norm_data.ix[i, j])
                if hatch:
                    # Fill with the mean of the immediate neighbours.
                    n = 0
                    norm_data.ix[i, j] = 0
                    if j > 0:
                        norm_data.ix[i, j] += norm_data.ix[i, j-1]
                        n += 1
                    if j + 1 < len(norm_data.columns):
                        norm_data.ix[i, j] += norm_data.ix[i, j+1]
                        n += 1
                    if n:
                        # A single-column frame has no neighbours; leave
                        # the cell at 0 rather than dividing by zero.
                        norm_data.ix[i, j] /= n
                g.add(dwg.rect((x_start + box_size*j,
                                y_start + i*box_height),
                               (box_size, box_height),
                               style="fill:#{:02x}{:02x}{:02x}"
                               .format(*[int(255*x) for x in
                                         c_cmap(norm_data.ix[i, j])])))
                dwg.add(g)
                if hatch_nan and hatch:
                    g.add(dwg.rect((x_start + box_size*j,
                                    y_start + i*box_height),
                                   (box_size, box_height),
                                   style="fill:url(#hatch)"
                                   )
                          )
                col_base = col_labels[j][:col_labels[j].find(col_sep)]
                if col_base != prefix:
                    # New column group: switch colormap and draw a
                    # vertical separator.
                    prefix = col_base
                    if cmap_by_prefix:
                        c_cmap = cmap_by_prefix(prefix)
                    g.add(dwg.line((x_start + box_size * j,
                                    y_start + i * box_height),
                                   (x_start + box_size * j,
                                    y_start + (i + 1) * box_height),
                                   style="stroke-width:{}; stroke:#000000"
                                   .format(.1 * box_size)))
        dwg.add(dwg.text(first_col,
                         (x_start, y_start + (i + 1) * box_height)))
        dwg.add(dwg.text(last_col,
                         (x_start + (new_cols - 1) * box_size,
                          y_start + (i + 1) * box_height)))
        if draw_box:
            dwg.add(dwg.rect((x_start, y_start + 0),
                             (new_cols*box_size, rows*box_height),
                             style="stroke-width:1; "
                             "stroke:#000000; fill:none"))
        if draw_name:
            dwg.add(dwg.text(name,
                             (x_start + box_size * new_cols / 2.0,
                              y_start + box_height * (rows) + 13),
                             style="text-anchor: middle;"))

        if total_width is not None:
            if spacer is None:
                x_start += total_width * 1.1
            else:
                x_start += total_width + spacer
        else:
            if spacer is None:
                x_start += new_cols * box_size + box_size
            else:
                x_start += new_cols * box_size + spacer

        y_diff = new_rows * box_height + 30
        # Wrap to a new band when the next dataset would not fit.  When
        # *total_width* is unset, fall back to this dataset's drawn width
        # instead of adding None.
        if total_width is not None:
            next_width = total_width
        else:
            next_width = new_cols * box_size
        if x_start + next_width >= max_width:
            x_start = 0
            y_start += new_rows*box_height + vspacer

    if draw_row_labels:
        for i in range(rows):
            dwg.add(dwg.text(row_labels[i],
                             (x_start, y_start + i*box_height+box_height),
                             style='font-size:{}'.format(box_height),
                             ))
    if pbar is not None:
        pbar.finish()
    dwg.saveas(filename)
# Script entry point: read the expression and allele-specific expression
# (ASE) tables named on the command line, then filter the ASE table.
# NOTE(review): the nesting below was reconstructed from whitespace-collapsed
# source; confirm which statements belong inside each `if`.
if __name__ == "__main__":
    from Utils import fbgns
    args = parse_args()
    # Drop the '---' column when present (errors='ignore' tolerates absence).
    expr = pd.read_table(args.expression_file, **pd_kwargs).drop('---', axis=1, errors='ignore')
    ase = (pd
           .read_table(args.ase_file, **pd_kwargs
                       )
           # Remove columns, then rows, that are entirely NaN.
           .dropna(how='all', axis=1)
           .dropna(how='all', axis=0)
           # Remove the row labels listed in args.overlapping_genes, if any
           # are present.
           .drop(args.overlapping_genes, errors='ignore')
           # Keep only columns selected by the 'melXsim'/'simXmel' prefixes.
           .select(**sel_startswith(('melXsim', 'simXmel')))
           )
    # Same shape and labels as `ase`, with values shuffled by permuting the
    # rows of the transposed table (i.e. the columns of `ase`).
    ase_perm = pd.DataFrame(
        data=np.random.permutation(ase.T).T,
        index=ase.index,
        columns=ase.columns,
    )
    chrom_of = get_chroms()

    if args.male_samples and 'keep' not in args.male_samples:
        # presumably chrom_of supports vectorized lookup by gene label
        # (e.g. a pandas Series) -- TODO confirm
        on_x = chrom_of[ase.index] == 'X'
        # str.startswith needs a str or tuple of str; verify the type that
        # parse_args produces for male_samples.
        is_male = [col.startswith(args.male_samples) for col in ase.columns]
        # Blank out X-chromosome values in the male-sample columns.
        ase.ix[on_x, is_male] = np.nan
    # Keep rows with at least args.min_samples non-NaN values.
    ase = ase.ix[ase.T.count() >= args.min_samples]
    if args.min_var:
        # Keep rows whose variance across samples reaches args.min_var.
        ase = ase.ix[ase.T.var() >= args.min_var]
default='analysis_godot/ase_summary_by_read.tsv', type=str) return parser.parse_args() if __name__ == "__main__": filterwarnings("ignore", ".*Covariance of the parameters.*",) filterwarnings("ignore", ".*overflow encountered in exp.*",) #expr = pd.read_table('analysis_godot/summary_fb.tsv', **pd_kwargs).dropna(how='all', axis=1) args = parse_args() ase = (pd .read_table(args.data_to_fit, **pd_kwargs ) .dropna(how='all', axis=1) .dropna(how='all', axis=0) .select(**sel_startswith(('melXsim', 'simXmel'))) ) chrom_of = get_chroms() males = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2') on_x = [chrom_of[gene] == 'X' if gene in chrom_of else False for gene in ase.index] is_male = [col.startswith(males) for col in ase.columns] ase.ix[on_x, is_male] = np.nan ase = ase.loc[ase.T.count() > len(ase.columns) / 2.0] hours = len(ase) / 1e4 * 1.5 + 2 cluster_args['time'] = '{}:{}:00'.format(int(hours), int((hours % 1)*60)) print("Estimate {} per iteration".format(cluster_args['time'])) #cluster_args['queue'] = fyrd.Queue(user='******', #qtype=fyrd.queue.get_cluster_environment()) print(cluster_args)
ax.set_xlim(mel_atlas_pos.ix[:, 'X', mel_stage].min() - 15, mel_atlas_pos.ix[:, 'X', mel_stage].max() + 15) pu.minimize_ink(ax) savefig(path.join( cwd, 'analysis/results/{}_atlas_sim_M{}S{}'.format( target, mel_atlas_expr.minor_axis.get_loc(both_stage), sim_atlas_expr.minor_axis.get_loc(both_stage), )), transparent=True) from GetASEStats import slices_per_embryo virtual_slices = {} ase = (pd.read_table( path.join(cwd, 'analysis_godot/ase_summary_by_read.tsv'), **pd_kwargs).select(**sel_startswith(('melXsim', 'simXmel')))) n_slices = slices_per_embryo(ase) actual = [] computed = [] for embryo, n in n_slices.items(): if n not in virtual_slices: virtual_slices[n] = make_virtual_slices( mel_expr_at_stage, sim_expr_at_matching, mel_atlas_pos.ix[:, :, mel_stage].T, n) actual.extend(ase.ix[target].select(startswith(embryo))) computed.extend(virtual_slices[n][1][0]) vslice_25 = virtual_slices[25][1][0].copy() vslice_25[13:19] = np.nan vslice_25 = pd.Series(
from progressbar import ProgressBar as pb male_hybrid_embryos = ('melXsim_cyc14C_rep3', 'simXmel_cyc14C_rep2') female_hybrid_embryos = ('melXsim_cyc14C_rep1', 'melXsim_cyc14C_rep2', 'simXmel_cyc14C_rep1') if __name__ == "__main__": expr = pd.read_table('godot/summary_fb.tsv', **pd_kwargs) ase = (pd.read_table('godot/ase_summary.tsv', **pd_kwargs).dropna(how='all', axis=0)) chrom_of = get_chroms() ase = ase.select(lambda x: chrom_of[x] != 'X') expr_males = expr.select(**sel_startswith(male_hybrid_embryos)) expr_females = expr.select(**sel_startswith(female_hybrid_embryos)) ase_males = ase.select(**sel_startswith(male_hybrid_embryos)) ase_females = ase.select(**sel_startswith(female_hybrid_embryos)) ase_xs = get_xs(ase) ase_maternals = pd.Series( index=ase_xs.index, data=[1 if col.startswith('simXmel') else -1 for col in ase_xs.index]) if 'logistic_females' in locals() and locals().get('recalculate', True): with Pool() as p: logistic_females = fit_all_ase(ase_females, logistic, ase_xs.ix[ase_females.columns],
# Load the expression summary table and split it by genotype prefix and by
# the cycle label embedded in each column name.
# NOTE(review): the nesting below was reconstructed from whitespace-collapsed
# source; confirm how much of this belongs under the `all_expr` guard.
if "-sparse" in sys.argv:
    # Sparse mode: keep only every 10th row (see the `::step` slice below).
    is_sparse = "sparse_"
    step = 10
else:
    step = 1

expr_min = 5
eps = 1

# First column is the index; only '---' and the empty string count as NaN.
read_table_args = dict(index_col=0, keep_default_na=False, na_values=["---", ""])

# Skip the load when `all_expr` is already defined in this namespace.
if "all_expr" not in locals():
    all_expr = pd.read_table("analysis/summary.tsv", **read_table_args).sort_index()
    # Keep rows whose maximum value exceeds expr_min.
    top_expr = all_expr.max(axis=1)
    all_expr = all_expr.ix[top_expr > expr_min]
    all_expr = all_expr.ix[::step]
    # One sub-table per column-name prefix.
    wt = all_expr.select(**sel_startswith("WT"))
    bcd = all_expr.select(**sel_startswith("bcd"))
    zld = all_expr.select(**sel_startswith("zld"))
    g20 = all_expr.select(**sel_startswith("G20"))
    hb = all_expr.select(**sel_startswith("hb"))
    # Placeholders; each is overwritten through locals() in the loop below.
    wts = bcds = zlds = g20s = hbs = 0

    for sub_df_name in "wt bcd zld g20 hb".split():
        sub_df = locals()[sub_df_name]
        # Text between the first '_' and '_sl' in each column name.
        cycs = {col.split("_sl")[0].split("_", 1)[1] for col in sub_df.columns}
        cyc_embs = {}
        for cyc in cycs:
            cyc_embs[cyc] = sub_df.select(**sel_contains(cyc))
        # Writing through locals() only rebinds names at module scope;
        # inside a function this assignment would have no effect.
        locals()[sub_df_name + "s"] = cyc_embs
print("Read expression in")