def new_TE_distribution():
    """Compute, plot and return log2 translational efficiency (TE) estimates.

    Each TE is log2(footprint column) - log2(mRNA column), taken column-wise
    from the table returned by ``filtered_low_counts(0)``.  Several variants
    are built: one per Ingolia replicate, the two Ingolia replicates pooled,
    Weinberg RPF against each of the three Weinberg mRNA columns, and two
    'artificial' variants in which the Dynabeads column is divided or
    multiplied by CDS length before the log is taken.

    Side effects:
        * draws mean-centered step histograms of a subset of the TEs on the
          current matplotlib axes,
        * calls ``explore_UTRs.scatter_with_hists_colors`` on the
          mean-centered RiboZero and Unselected TEs,
        * prints the Pearson and Spearman correlations between the
          (uncentered) RiboZero and Unselected TEs.

    Returns:
        dict mapping each TE name to its array of log2 ratios.
    """
    # NOTE(review): the meaning of the 0 argument is not visible here --
    # presumably a minimum-count threshold; confirm against
    # filtered_low_counts.
    fields, filtered = filtered_low_counts(0)

    def column(field):
        # One named column of the filtered table.
        return filtered[:, fields[field]]

    def log2_ratio(numerator, denominator):
        # Kept as a difference of logs, exactly as originally computed.
        return np.log2(numerator) - np.log2(denominator)

    def centered(values):
        # Shift so the mean is zero, making distributions comparable.
        return values - np.mean(values)

    ingolia_RPF_1 = column('ingolia:RPF_1')
    ingolia_RPF_2 = column('ingolia:RPF_2')
    ingolia_mRNA_1 = column('ingolia:mRNA_1')
    ingolia_mRNA_2 = column('ingolia:mRNA_2')
    weinberg_RPF = column('weinberg:RPF')
    weinberg_Dynabeads = column('weinberg:Dynabeads')
    CDS_lengths = np.asarray(column('CDS_length'), dtype=float)

    TEs = {
        'ingolia_1': log2_ratio(ingolia_RPF_1, ingolia_mRNA_1),
        'ingolia_2': log2_ratio(ingolia_RPF_2, ingolia_mRNA_2),
        # Pool the replicates before taking logs.
        'ingolia_both': log2_ratio(ingolia_RPF_1 + ingolia_RPF_2,
                                   ingolia_mRNA_1 + ingolia_mRNA_2),
        'weinberg_RiboZero': log2_ratio(weinberg_RPF,
                                        column('weinberg:RiboZero')),
        'weinberg_Dynabeads': log2_ratio(weinberg_RPF, weinberg_Dynabeads),
        'weinberg_Unselected': log2_ratio(weinberg_RPF,
                                          column('weinberg:Unselected')),
        # Deliberately distorted denominators: Dynabeads scaled down / up by
        # CDS length.
        'artificial': log2_ratio(weinberg_RPF,
                                 weinberg_Dynabeads / CDS_lengths),
        'artificial2': log2_ratio(weinberg_RPF,
                                  weinberg_Dynabeads * CDS_lengths),
    }

    names_to_plot = [
        'weinberg_RiboZero',
        'weinberg_Dynabeads',
        'weinberg_Unselected',
        'ingolia_both',
        'artificial',
        'artificial2',
    ]
    for name in names_to_plot:
        plt.hist(centered(TEs[name]),
                 histtype='step',
                 bins=100,
                 range=(-4, 4),
                 label=name,
                )

    plt.legend()
    # The closing parenthesis was missing from this label.
    plt.xlabel('log2(RPF RPKM / mRNA RPKM)')
    plt.ylabel('Number of genes')

    # Label spelling now matches the 'weinberg_RiboZero' key / legend entry.
    explore_UTRs.scatter_with_hists_colors(centered(TEs['weinberg_RiboZero']),
                                           centered(TEs['weinberg_Unselected']),
                                           'weinberg_RiboZero',
                                           'weinberg_Unselected',
                                           'Joint distribution of TEs',
                                          )

    # Single-argument parenthesised print produces the same output as the
    # Python 2 print statement and is also valid Python 3.
    print(scipy.stats.pearsonr(TEs['weinberg_RiboZero'],
                               TEs['weinberg_Unselected']))
    print(scipy.stats.spearmanr(TEs['weinberg_RiboZero'],
                                TEs['weinberg_Unselected']))

    return TEs
def mRNA_RPKM_length_bias(gtf_fn=None, genome_dir=None):
    """Scatter coding sequence length against a log2 Weinberg/Ingolia RPF ratio.

    For every name in the module-level ``gene_names``, the coding sequence is
    fetched and its length taken.  These lengths are plotted (via
    ``explore_UTRs.scatter_with_hists_colors``) against
    ``log2(arrays['weinberg']['RPF']) - log2(arrays['ingolia']['RPF'])``.
    The y-axis is then limited to [-7, 7] and the current figure resized to
    12 x 8 inches.

    Args:
        gtf_fn: path to the GTF annotation file.  Defaults to the original
            hard-coded S. cerevisiae EF4 ``genes.gtf`` location.
        genome_dir: path to the genome directory.  Defaults to the original
            hard-coded S. cerevisiae EF4 genome location.

    NOTE(review): the function name and the previous y-axis label
    ('log2(Ingolia mRNA RPKM / Weinberg mRNA RPKM)') both refer to mRNA, but
    the plotted quantity is an RPF ratio with Weinberg in the numerator.  The
    label has been changed to describe what is actually plotted; confirm that
    the RPF comparison (rather than an mRNA one) is what is intended.
    """
    if gtf_fn is None:
        gtf_fn = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/EF4/transcriptome/genes.gtf'
    if genome_dir is None:
        genome_dir = '/home/jah/projects/arlen/data/organisms/saccharomyces_cerevisiae/EF4/genome'

    coding_sequence_fetcher = gtf.make_coding_sequence_fetcher(gtf_fn, genome_dir)
    lengths = np.asarray([len(coding_sequence_fetcher(name)) for name in gene_names])

    # Alternative y-value previously tried here (left disabled in the
    # original):
    #   np.log2(arrays['weinberg']['Dynabeads']) - np.log2(arrays['weinberg']['Unselected'])
    log2_RPF_ratio = np.log2(arrays['weinberg']['RPF']) - np.log2(arrays['ingolia']['RPF'])

    # 'RPKM' in the label follows the original label's wording; the units of
    # ``arrays`` are not visible from this function -- TODO confirm.
    explore_UTRs.scatter_with_hists_colors(lengths,
                                           log2_RPF_ratio,
                                           'coding sequence length',
                                           'log2(Weinberg RPF RPKM / Ingolia RPF RPKM)',
                                           '',
                                          )
    plt.ylim(-7, 7)
    plt.gcf().set_size_inches(12, 8)