refseq_down_slowd = refseq[ refseq['slow_diabetic_balb_nod_notx_0h_fc'] <= -1] if False: #print set(grapher.get_gene_list(refseq_up_nond)) & set(grapher.get_gene_list(refseq_up_d)) #print set(grapher.get_gene_list(refseq_down_nond)) & set(grapher.get_gene_list(refseq_down_d)) print grapher.get_gene_names(refseq_up_nond) if True: # non-d ax = grapher.scatterplot(refseq_up_nond, 'balb_notx_0h_tag_count', 'nod_notx_0h_tag_count_norm', log=True, color='blue', master_dataset=refseq, title='BALBc vs. NOD BMDC Refseq Transcripts', show_2x_range=True, show_legend=False, show_count=True, show_correlation=True, show_plot=False) grapher.save_plot( os.path.join(dirpath, 'nondiabetic_balbc_v_nod_up_scatterplot.png')) grapher.show_plot() if False: # diabetic ax = grapher.scatterplot( refseq, 'diabetic_balb_notx_0h_tag_count',
yzer.get_filename(dirpath, 'all_expressed_refseq.txt')) refseq_with_runoff = refseq[refseq['id'].isin(data['gene_id'])] refseq_no_runoff = refseq[~refseq['id'].isin(data['gene_id'])] if False: print len(refseq_no_runoff) print refseq_no_runoff.tail(100).to_string() # Calculate length of runoff data[ 'length'] = data['transcription_end'] - data['transcription_start'] + 1 data['gene_length'] = data['gene_end'] - data['gene_start'] + 1 # What might be correlated with length of runoff? if False: yzer.scatterplot(data, 'gene_length', 'length', log=True) yzer.scatterplot(data, 'gene_score', 'length', log=True) yzer.scatterplot(data, 'score', 'length', log=True) yzer.scatterplot(data, 'gene_score', 'score', log=True) yzer.scatterplot(data, 'gene_rpkm', 'rpkm', log=True) yzer.scatterplot(data, 'gene_rpkm', 'length', log=True) yzer.scatterplot(data, 'rpkm', 'length', log=True) yzer.boxplot([data['gene_score'], refseq_no_runoff['score']]) yzer.boxplot([data['gene_rpkm'], refseq_no_runoff['rpkm']]) if True: for subset in (refseq_no_runoff, refseq_with_runoff): subset['percent_covered'] = (subset['transcription_end'] - subset['transcription_start'] + 1)\ /(subset['transcription_end(2)'] - subset['transcription_start(2)'] + 1) print 'Total: ', len(subset) print 'Percentage: ', len(subset) / len(refseq)
# Export the value set as a tab-separated .cdt file.
# We want 'id weight enhancer_lfc gene_lfc'
val_set['weight'] = 1.0
cdt_path = yzer.get_filename(
    change_subdir, '{0}_for_{1}_pairs.cdt'.format(me2_col, key))
# Write inside a context manager so the handle is flushed and closed even if
# to_csv raises; previously the file object was opened and never closed.
# NOTE(review): `cols` is the keyword this call has always used; newer pandas
# releases spell it `columns` -- confirm against the installed version.
with open(cdt_path, 'w') as f:
    val_set.to_csv(f, sep='\t', header=False, index=True,
                   cols=(['weight'] + col_set))

# Disabled diagnostic plot, kept as a manual toggle.
if False:
    ax = yzer.scatterplot(
        val_set,
        xcolname=me2_col + '_2',
        ycolname=me2_col,
        log=False,
        color='blue',
        title=(
            'Log fold change of genes and interacting enhancers in {0}: {1}, enhancer 1.5x fold changed'
            .format(key.replace('_', ' '), kla_col)),
        xlabel='Enhancer LFC',
        ylabel='Gene LFC',
        show_2x_range=False,
        plot_regression=True,
        show_count=True,
        show_correlation=True,
        show_legend=False,
        save_dir=img_dirpath,
        show_plot=False)
shared['th2_tag_count'] = nonzero(shared['p2_tag_count']) only_th2['th1_tag_count'] = nonzero(only_th2['p2_tag_count']) only_th2['th2_tag_count'] = nonzero(only_th2['tag_count']) data = shared.append(only_th1, ignore_index=True) data = data.append(only_th2, ignore_index=True) if False: # Scatterplots of tag counts ax = yzer.scatterplot( data, 'th1_tag_count', 'th2_tag_count', log=True, color='blue', title='Th1 versus Th2 {0} Tag Counts at Peaks'.format(peak_pretty), show_2x_range=True, show_legend=False, plot_regression=False, show_count=True, show_correlation=True, save_dir=img_dirpath, show_plot=True) if True: # Motif finding. yzer = MotifAnalyzer() motifs_dirpath = yzer.get_and_create_path(dirpath, 'motifs') data['id'] = data.index if False:
all_data = yzer.import_file(yzer.get_filename(dirpath,'refseq_all.txt')) all_data = all_data[~all_data['id'].isin(data['id'])] data = pandas.concat([data, all_data]) data = data.reset_index().fillna(0) notx = data[data['sequencing_run_id'] == 765] kla_30m = data[data['sequencing_run_id'] == 766] kla_4h = data[data['sequencing_run_id'] == 773] no_intxns = data[data['sequencing_run_id'] == 0] # Zero won't show up in a log plot, so add one. no_intxns['count'] = 1 ax = yzer.scatterplot(no_intxns, xcolname='length', ycolname='count', log=True, color='#CCCCCC', label='No {0}s'.format(counted), show_2x_range=False, plot_regression=False, show_count=False, show_correlation=False, show_legend=False, show_plot=False) ax = yzer.scatterplot(notx, xcolname='length', ycolname='count', log=True, color='#B5D8EB', label='Notx {0}s'.format(counted), show_2x_range=False, plot_regression=False, show_count=False, show_correlation=False, show_legend=False, show_plot=False, ax=ax) ax = yzer.scatterplot(kla_30m, xcolname='length', ycolname='count', log=True, color='#FFBDD8', label='KLA 30m {0}s'.format(counted), show_2x_range=False, plot_regression=False, show_count=False, show_correlation=False, show_legend=False, show_plot=False, ax=ax) ax = yzer.scatterplot(kla_4h, xcolname='length', ycolname='count', log=True, color='#E3AAD6', title='{0} counts as a function of gene length'.format(counted.title()), xlabel='Transcript length', ylabel='Distal {0} count'.format(counted), label='KLA 4h {0}s'.format(counted), show_2x_range=False, plot_regression=False, show_count=False, show_correlation=True, show_legend=True,
trans = data[(data['kla_1_lfc'] >= 1) & (data['dex_over_kla_1_lfc'] <= -.58)] rest = data[(data['kla_1_lfc'] < 1) | (data['dex_over_kla_1_lfc'] > -.58)] key = 'gr_dex_tag_count' datasets = [rest[key],trans[key]] datasets = [d['gr_kla_dex_tag_count'] - d[key] for d in [rest, trans]] title = 'Tags in p65 peaks in KLA 1h + Dex 2h: Distal' title = 'Diff in tags in GR peaks in KLA 1h + Dex 2h vs Dex 1h' ax = grapher.boxplot(datasets, ['Not transrepressed in KLA 1h + Dex 2h','Transrepressed in KLA 1h + Dex 2h',], title=title, xlabel='Condition', ylabel='Total tags in all peaks overlapping transcript', show_outliers=False, show_plot=False) grapher.save_plot(grapher.get_filename(base_dirpath, 'boxplots', 'kla_dex', title.replace(' ','_'))) grapher.show_plot() for sub in datasets: print sub.mean() if True: #data = data[data['has_refseq'] == 1] data = data[data['distal'] == 't'] data['gr_diff'] = data['gr_kla_dex_tag_count'] - data['gr_dex_tag_count'] data['p65_diff'] = data['p65_kla_dex_tag_count'] - data['p65_kla_tag_count'] data['gr_by_length'] = data['gr_kla_dex_tag_count']/data['length']*10000 data['p65_by_length'] = data['p65_kla_dex_tag_count']/data['length']*10000 grapher.scatterplot(data, 'gr_kla_dex_tag_count', 'p65_diff',log=True)
enhancers = enhancers[enhancers['total_interactions'] > 0] enhancers[ 'dmso_tags_per_bp'] = enhancers['dmso_tag_count'] / enhancers['length'] enhancers[ 'kla_tags_per_bp'] = enhancers['kla_tag_count'] / enhancers['length'] # Plot tag counts versus interactions. ax = yzer.scatterplot( enhancers, xcolname='dmso_tags_per_bp', ycolname='notx_interactions', log=True, title= 'Interactions in Notx as a function of GRO-seq tag counts in DMSO', xlabel='GRO-seq tags per bp in DMSO', ylabel='Number of interactions {0}in Notx'.format( tss_only and 'with gene TSSs ' or ''), show_2x_range=True, plot_regression=False, show_count=True, show_correlation=True, show_legend=False, save_dir=img_dirpath, show_plot=True) ax = yzer.scatterplot( enhancers, xcolname='kla_tags_per_bp', ycolname='kla_4h_interactions', log=True, title= 'Interactions in KLA 4h as a function of GRO-seq tag counts in KLA 1h',
evidence_f = os.path.join(dirpath, 'refseq_evidence.orf') data = grapher.import_file(filename) evidence = grapher.import_file(evidence_f) data['score_orf'] = evidence['score'] data = data[data['score_orf'] < 200] data_coding = data[data['score'] >= 0] data_noncoding = data[data['score'] < 0] ax = grapher.scatterplot(data_coding, 'score_orf', 'score', log=False, color='blue', label='Predicted Coding', add_noise=False, show_2x_range=False, plot_regression=False, show_count=False, show_correlation=False, show_legend=False, show_plot=False) ax = grapher.scatterplot( data_noncoding, 'score_orf', 'score', log=False, color='green', title='CPC-derived Coding Potential Predictions for RefSeq mRNA', xlabel='ORF score', ylabel='Coding score',
yzer.get_filename(dirpath, 'dp_with_thiomac_ctcf.txt')).fillna(0) thio = yzer.import_file( yzer.get_filename(dirpath, 'thiomac_with_dp_ctcf.txt')).fillna(0) # Get venn-diagram sets only_dp = dp[dp['thiomac_ctcf_tag_count'] == 0] only_thio = thio[thio['dp_ctcf_tag_count'] == 0] shared = dp[dp['thiomac_ctcf_tag_count'] != 0] shared_check = thio[thio['dp_ctcf_tag_count'] != 0] print len(only_dp), len(only_thio), len(shared), len(shared_check) data = shared.append(only_dp, ignore_index=True) data = data.append(only_thio, ignore_index=True) data['dp_nonzero'] = nonzero(data['dp_ctcf_tag_count']) data['thio_nonzero'] = nonzero(data['thiomac_ctcf_tag_count']) ax = yzer.scatterplot( data, 'dp_nonzero', 'thio_nonzero', xlabel='DP Thymocyte CTCF Tag Count', ylabel='ThioMac CTCF Tag Count', log=True, color='blue', title='Tags in CTCF Peaks in DP Thymocytes versus ThioMacs', show_2x_range=False, show_legend=False, show_count=True, show_correlation=True, save_dir=img_dirpath, show_plot=True)
print sum(stat1['foxp3_id'] > 0) print sum(stat1['foxp3_id'] > 0) / len(stat1) foxp3_enh = foxp3[(foxp3['tss_me2_id'] == 0) & (foxp3['tss_id'] == 0)] foxp3_tss = foxp3[(foxp3['tss_me2_id'] > 0) | (foxp3['tss_id'] > 0)] print len(foxp3_enh) print sum(foxp3_enh['stat1_id'] > 0) / len(foxp3_enh) print len(foxp3_tss) print sum(foxp3_tss['stat1_id'] > 0) / len(foxp3_tss) foxp3_with_stat = foxp3[foxp3['stat1_id'] > 0] if False: grapher = SeqGrapher() grapher.scatterplot(foxp3_with_stat, xcolname='foxp3_tag_count', ycolname='stat1_tag_count', log=True, show_plot=True) if False: subsets = [ ('all', foxp3_with_stat), ('enh', foxp3_enh[foxp3_enh['stat1_id'] > 0]), ('tss', foxp3_tss[foxp3_tss['stat1_id'] > 0]), ] for k, subset in subsets: first_peak = 'foxp3' subset['id'] = subset[first_peak + '_id'] subset['start'] = subset[first_peak + '_start'] subset['end'] = subset[first_peak + '_end']
# Resolve the working directory and make sure the image folder exists.
dirpath = yzer.get_path(dirpath)
img_dirpath = yzer.get_and_create_path(
    dirpath, 'with_me3', 'basic_scatterplots')

# Load the transcript vectors with missing values replaced by zero, and keep
# only rows whose naive_me3 + act_me3 tag counts sum to more than zero.
vectors_filename = yzer.get_filename(dirpath, 'transcript_vectors.txt')
data = yzer.import_file(vectors_filename).fillna(0)
me3_tag_total = data['naive_me3_tag_count'] + data['act_me3_tag_count']
data = data[me3_tag_total > 0]

# One scatterplot per (x key, y key, normalization factor) triple.
for key1, key2, norm_factor in comparison_sets:
    # Display name: pretty name of the key minus its final character, with
    # that final character appended again.
    name1 = pretty_names[key1[:-1]] + key1[-1:]
    name2 = pretty_names[key2[:-1]] + key2[-1:]

    x_column = key1 + '_tag_count'
    y_column = key2 + '_tag_count'
    # NOTE(review): normalize presumably adds the '<column>_norm' column that
    # is plotted on the y axis below -- confirm against the helper.
    data_normed = yzer.normalize(data, y_column, norm_factor)

    ax = yzer.scatterplot(
        data_normed,
        x_column,
        y_column + '_norm',
        log=True,
        color='blue',
        title='{0} versus {1} Normalized Tag Counts'.format(name1, name2),
        xlabel='{0} tags in RefSeq transcripts'.format(name1),
        ylabel='{0} tags in RefSeq transcripts, normalized'.format(name2),
        add_noise=False,
        show_2x_range=True,
        show_legend=False,
        plot_regression=False,
        show_count=True,
        show_correlation=True,
        save_dir=img_dirpath,
        show_plot=False)
"""Plot H3K4me2 tag count as a function of transcript score."""
from __future__ import division

from glasslab.dataanalysis.graphing.seq_grapher import SeqGrapher

if __name__ == '__main__':
    yzer = SeqGrapher()

    # Resolve the data directory and make sure the output folder exists.
    dirpath = yzer.get_path(
        'karmel/Desktop/Projects/GlassLab/Notes_and_Reports/Glass Atlas/Demo-data')
    img_dirpath = yzer.get_and_create_path(dirpath, 'scatterplots')

    # Load the peak/transcript table with missing values replaced by zero.
    peaks_filename = yzer.get_filename(
        dirpath, 'me2_peaks_with_transcripts.txt')
    data = yzer.import_file(peaks_filename).fillna(0)

    # Collapse to one row per 'id' by averaging, then expose the 'score(2)'
    # column under the name used for the x axis.
    data = data.groupby(by='id', as_index=True).mean()
    data['transcript_score'] = data['score(2)']

    ax = yzer.scatterplot(
        data,
        xcolname='transcript_score',
        ycolname='tag_count',
        log=True,
        title='H3K4me2 Tag Count as a Function of Transcript Score',
        xlabel='Glass Atlas Transcript Score',
        ylabel='Normalized H3K4me2 tag count',
        show_2x_range=True,
        plot_regression=True,
        show_count=True,
        show_correlation=True,
        show_legend=False,
        save_dir=img_dirpath,
        show_plot=True)
data = data[data['transcript_score'] >= 4] data = data[data[['balb_notx_1h_tag_count','nod_notx_1h_tag_count_norm', 'balb_kla_1h_tag_count','nod_kla_1h_tag_count_norm']].max(axis=1) >= 10] refseq = yzer.get_refseq(data) # Remove low tag counts refseq = refseq[refseq['transcript_score'] >= 4] if False: # Non-diabetic balbc vs. nod ax = yzer.scatterplot(refseq, 'balb_notx_1h_tag_count', 'nod_notx_1h_tag_count_norm', log=True, color='blue', xlabel='BALBc notx 1h tag count',ylabel='NOD notx 1h tag count', title='Non-Diabetic BALBc vs. NOD Notx 1h Refseq Transcripts', show_2x_range=True, show_legend=False, show_count=True, show_correlation=True, save_dir=img_dirpath, show_plot=True) # Non-diabetic balbc vs. nod ax = yzer.scatterplot(refseq, 'balb_kla_1h_tag_count', 'nod_kla_1h_tag_count_norm', log=True, color='blue', xlabel='BALBc KLA 1h tag count',ylabel='NOD KLA 1h tag count', title='Non-Diabetic BALBc vs. NOD KLA 1h Refseq Transcripts', show_2x_range=True, show_legend=False, show_count=True, show_correlation=True, save_dir=img_dirpath, show_plot=True) if False: # Non-diabetic balbc vs. nod ax = yzer.scatterplot(data, 'balb_notx_1h_tag_count', 'nod_notx_1h_tag_count_norm',
'balb2_pu_1_tag_count'] data['nod_with_bl6'] = data['nod_sv_id'] <= .1 nod_with_bl6 = data[data['nod_with_bl6'] == True] nod_with_balb = data[data['nod_with_bl6'] == False] if False: ax = grapher.scatterplot( nod_with_bl6, 'wt_pu_1_tag_count', 'nod_pu_1_tag_count', subplot=121, log=True, color='blue', xlabel='C57Bl6 PU.1 tag counts', ylabel='NOD PU.1 tag counts', title= 'C57Bl6 vs. NOD PU.1 peaks\nwhere C57Bl6 has a PU.1 motif and BALBc does not', label='NOD SNP == C57Bl6 SNP', add_noise=False, show_2x_range=False, show_legend=True, show_count=True, show_correlation=True, text_shift=False, text_color=True, show_plot=False) #grapher.save_plot(os.path.join(dirpath, 'bl6_vs_nod_pu_1_peak_tag_counts_bl6_gt_balb_no_balb_motif_nod_eq_bl6.png')) #grapher.show_plot() ax = grapher.scatterplot( nod_with_balb, 'wt_pu_1_tag_count', 'nod_pu_1_tag_count',
scatter_dirpath = grapher.get_filename(dirpath, 'scatterplots') ############################################# # One color tag counts ############################################# if False: for dataset, label in ((data, 'all transcripts'), (refseq, 'RefSeq')): slug_label = label.lower().replace(' ', '_') # All DMSO vs. all KLA ax = grapher.scatterplot( dataset, 'dmso_tag_count', 'kla_tag_count_norm', log=True, color='blue', title='DMSO vs. KLA tag counts: All runs, {0}'.format(label), xlabel='DMSO 2h tags', ylabel='KLA 1h + DMSO 2h tags', show_2x_range=True, show_legend=True, show_count=True, show_correlation=True, show_plot=False) grapher.save_plot( grapher.get_filename( scatter_dirpath, 'dmso_vs_kla_all_runs_{0}.png'.format(slug_label))) grapher.show_plot() for x in xrange(1, 5): # By group ax = grapher.scatterplot(
grouped['kla_ratio'] = grouped['up_in_kla'] / grouped['count'] grouped = grouped.sort(['kla_ratio']).reset_index(drop=True) grouped['idx'] = grouped.index shuffled_grouped['kla_ratio'] = shuffled_grouped[ 'up_in_kla'] / shuffled_grouped['count'] shuffled_grouped = shuffled_grouped.sort(['kla_ratio' ]).reset_index(drop=True) shuffled_grouped['idx'] = shuffled_grouped.index ax = yzer.scatterplot(shuffled_grouped, 'idx', 'kla_ratio', color='green', label='Shuffled Data'.format(rep), show_2x_range=False, plot_regression=False, show_count=False, show_correlation=False, show_legend=False, show_plot=False) ax = yzer.scatterplot( grouped, 'idx', 'kla_ratio', title='Up in KLA {0} Percentage by HiC Domain'.format(rep), xlabel='Ordered Index', ylabel='Percent of transcripts up in KLA', color='blue', label='Replicate {0} Data'.format(rep),
yzer.get_filename(dirpath, 'RNA_GroSeq_CountsGenes.txt')) homer_data['sequence_identifier'] = homer_data['Gene ID'] homer_data['homer_tag_count'] = nonzero(homer_data[ 'ThioMac-GroSeq-notx-110513/ genes (Total: 12166480.0) normFactor 0.82'] .fillna(0)) homer_data = homer_data[['sequence_identifier', 'homer_tag_count']] merged = data.merge(homer_data, how='inner', on='sequence_identifier') merged = merged.fillna(1) if True: ax = yzer.scatterplot(merged, xcolname='homer_tag_count', ycolname='sum', log=True, title='RefSeq Tag Count via Homer and Vespucci', xlabel='Tag Count in Homer', ylabel='Tag Count in Vespucci', show_2x_range=True, plot_regression=False, set_limits=True, show_count=True, show_correlation=True, show_legend=False, save_dir=img_dirpath, show_plot=True) merged['ratio'] = merged['sum'] / merged['homer_tag_count'] merged = merged.sort('ratio') print merged.head(10) print merged.tail(20)
# Make sure the figure folder exists, then load the binding table with
# missing values replaced by zero.
img_dirpath = yzer.get_and_create_path(dirpath, 'figures')
binding_filename = yzer.get_filename(dirpath, 'ctcf_with_stat1_binding.txt')
data = yzer.import_file(binding_filename).fillna(0)

# Partition rows by whether the second peak set (p2) has any tags.
p2_tags = data['p2_tag_count']
with_stat1 = data[p2_tags > 0]
without_stat1 = data[p2_tags == 0]

# NOTE(review): indentation was lost in this snippet, so the extent of this
# always-true toggle is a guess; behaviour is the same while it stays True.
if True:
    slice_sizes = [len(with_stat1), len(without_stat1)]
    slice_labels = ['CTCF sites with STAT1', 'CTCF sites without STAT1']
    ax = yzer.piechart(
        slice_sizes,
        slice_labels,
        title='DP Thymocyte CTCF Sites with STAT1 in Th1 Cells',
        save_dir=img_dirpath,
        show_plot=True)

# Pass both tag-count columns through nonzero() before the log-scale plot.
for source_column in ('tag_count', 'p2_tag_count'):
    data[source_column + '_nonzero'] = nonzero(data[source_column])

ax = yzer.scatterplot(
    data,
    'tag_count_nonzero',
    'p2_tag_count_nonzero',
    xlabel='CTCF Tag Count',
    ylabel='Stat1 Tag Count',
    log=True,
    color='blue',
    title='Tags in CTCF Peaks versus Overlapping Stat1 Peaks',
    show_2x_range=False,
    show_legend=False,
    show_count=True,
    show_correlation=True,
    save_dir=img_dirpath,
    show_plot=True)
cond_3 = (data['tag_count_3'] > 0) & (data['tag_count_3'] >= data['tag_count_4']) ax = None for show_points in (True, False): ax = yzer.scatterplot( data[cond_1], xcolname, ycolname, log=True, color=show_points and '#333333' or 'grey', master_dataset=data, xlabel='{0} {1} tag count'.format(main, basal_cond), ylabel='{0} KLA+Dex tag count'.format(main), label='No {0} in KLA+Dex {1}'.format( compare, show_points and ' ({0})'.format(len(data[cond_1])) or ''), add_noise=show_points, show_points=show_points, show_2x_range=False, show_legend=False, plot_regression=(not show_points), show_count=False, show_correlation=False, set_limits=True, show_plot=False, ax=ax) ax = yzer.scatterplot( data[cond_2], xcolname, ycolname,
data['h4k8ac_kla_dex_tag_count']) / nonzero( data['h4k8ac_kla_tag_count']) for subgroup, suffix, dataset in (('RefSeq Transcripts', '_trans', data.groupby( by='nearest_refseq_transcript_id', as_index=False).mean()), ): ax = yzer.scatterplot( dataset[(dataset['kla_1_lfc_trans'] >= 1)], 'dmso_1_rpkm', 'dex_over_kla_1_lfc_trans', log=True, title= 'GR transrepression by DMSO expression for Up-regulated genes', xlabel='DMSO 2h RPKM', ylabel='log2(KLA+Dex GRO-seq/DMSO GRO-seq)', show_2x_range=False, plot_regression=True, show_count=True, show_correlation=True, save_dir=img_dirpath, show_plot=True) ax = yzer.scatterplot( dataset[(dataset['kla_1_lfc_trans'] >= 1)], 'h4k8ac_kla_ratio', 'dex_over_kla_1_lfc_trans', log=True, title= 'GR transrepression by KLA to DMSO H4K8ac tag ratio for Up-regulated genes', xlabel='KLA Tags/DMSO Tags',