def test_calculate_psi(event_annotation, reads2d, isoform1_junctions,
                       isoform2_junctions, psi_df, summary_df):
    """Check calculate_psi output against the expected psi/summary frames.

    ``psi_df`` and ``summary_df`` are the expected results; the remaining
    arguments are passed to ``calculate_psi`` positionally, in order.
    """
    from outrigger.psi.compute import calculate_psi

    observed_psi, observed_summary = calculate_psi(
        event_annotation, reads2d, isoform1_junctions, isoform2_junctions)

    # The expected psi went through a CSV round trip, which keeps only the
    # index name and loses the columns' name, so clear it on the computed
    # frame before comparing.
    observed_psi.columns.name = None

    pdt.assert_frame_equal(observed_psi, psi_df)
    pdt.assert_frame_equal(observed_summary, summary_df)
def execute(self):
    """Calculate percent spliced in (psi) of splicing events

    Steps, in order:

    1. Read the junction reads via ``self.csv()`` and hand them to
       ``self.junction_metadata`` together with the metadata CSV path
       inside ``self.junctions_folder``.
    2. Pivot the reads into a table indexed by ``self.sample_id_col`` with
       one column per ``self.junction_id_col`` holding ``self.reads_col``;
       missing entries become 0 and the table is cast to ``int``.
    3. For every splice type in ``outrigger.common.SPLICE_TYPES`` whose
       validated events file exists, compute psi and a summary with
       ``compute.calculate_psi`` and write them to
       ``<psi_folder>/<abbrev>/psi.csv`` and
       ``<psi_folder>/<abbrev>/summary.csv``. Splice types without an
       events file are skipped with a progress message.
    4. Concatenate the per-type psi tables column-wise, transpose, and
       write the result to ``<psi_folder>/outrigger_psi.csv``.

    Raises
    ------
    ValueError
        If no splice type had an events file, so there is nothing to
        concatenate. (``pd.concat`` would raise the same exception type
        with a much less helpful message.)
    """
    logger = logging.getLogger('outrigger.psi')
    if self.debug:
        logger.setLevel(logging.DEBUG)

    junction_reads = self.csv()

    metadata_csv = os.path.join(self.junctions_folder, METADATA_CSV)
    self.junction_metadata(junction_reads, metadata_csv)

    junction_reads_2d = junction_reads.pivot(index=self.sample_id_col,
                                             columns=self.junction_id_col,
                                             values=self.reads_col)
    # Sample/junction pairs absent from the long table have no reads
    junction_reads_2d.fillna(0, inplace=True)
    junction_reads_2d = junction_reads_2d.astype(int)

    logger.debug('\n--- Splice Junction reads ---')
    logger.debug(repr(junction_reads.head()))

    psis = []
    for splice_name, splice_abbrev in outrigger.common.SPLICE_TYPES:
        filename = self.maybe_get_validated_events(splice_abbrev)
        if not os.path.exists(filename):
            util.progress('No {name} ({abbrev}) events found, '
                          'skipping.'. format(name=splice_name,
                                              abbrev=splice_abbrev))
            continue

        util.progress('Reading {name} ({abbrev}) events from {filename}'
                      ' ...'.format(name=splice_name, abbrev=splice_abbrev,
                                    filename=filename))
        event_annotation = pd.read_csv(filename, index_col=0,
                                       low_memory=self.low_memory)
        util.done()

        isoform_junctions = outrigger.common.ISOFORM_JUNCTIONS[
            splice_abbrev]
        logger.debug('\n--- Splicing event annotation ---')
        logger.debug(repr(event_annotation.head()))

        util.progress(
            'Calculating percent spliced-in (Psi) scores on '
            '{name} ({abbrev}) events ...'.format(
                name=splice_name, abbrev=splice_abbrev))

        # Splice type percent spliced-in (psi) and summary
        type_psi, summary = compute.calculate_psi(
            event_annotation, junction_reads_2d,
            min_reads=self.min_reads, n_jobs=self.n_jobs,
            method=self.method,
            uneven_coverage_multiplier=self.uneven_coverage_multiplier,
            **isoform_junctions)

        # Write this event's percent spliced-in matrix
        psi_csv = os.path.join(self.psi_folder, splice_abbrev, 'psi.csv')
        util.progress('Writing {name} ({abbrev}) Psi values to {filename}'
                      ' ...'.format(name=splice_name, abbrev=splice_abbrev,
                                    filename=psi_csv))
        self.maybe_make_folder(os.path.dirname(psi_csv))
        type_psi.to_csv(psi_csv, na_rep='NA')

        # Write this event's summary of events and why they weren't or were
        # calculated Psi on
        summary_csv = os.path.join(self.psi_folder, splice_abbrev,
                                   'summary.csv')
        util.progress('Writing {name} ({abbrev}) event summaries (e.g. '
                      'number of reads, why an event does not have a Psi '
                      'score) to {filename} ...'
                      ''.format(name=splice_name, abbrev=splice_abbrev,
                                filename=summary_csv))
        self.maybe_make_folder(os.path.dirname(summary_csv))
        summary.to_csv(summary_csv, na_rep='NA', index=False)
        psis.append(type_psi)
        util.done()

    # pd.concat raises an opaque "No objects to concatenate" ValueError on
    # an empty list; fail with the same exception type but say why.
    if not psis:
        raise ValueError(
            'No validated events files were found for any splice type, so '
            'there are no Psi scores to concatenate.')

    util.progress('Concatenating all calculated psi scores '
                  'into one big matrix...')
    splicing = pd.concat(psis, axis=1)
    util.done()

    splicing = splicing.T
    combined_csv = os.path.join(self.psi_folder, 'outrigger_psi.csv')
    util.progress('Writing a samples x features matrix of Psi '
                  'scores to {} ...'.format(combined_csv))
    splicing.to_csv(combined_csv, na_rep='NA')
    util.done()