def main(cmdline=None):
    """Collect Picard MarkDuplicates metrics for each library and report them.

    Reads one or more library tables, looks for each library's
    ``*_picard_markdup.metrics`` file in its analysis directory, and writes
    the combined table to ``--output`` (tab separated) or stdout.

    :param list cmdline: argument list for testing; defaults to sys.argv
    """
    parser = ArgumentParser()
    parser.add_argument('-l', '--library', required=True, action='append',
                        help="library table to load")
    parser.add_argument('-o', '--output',
                        help='filename to write report to')
    args = parser.parse_args(cmdline)

    libraries = load_library_tables(args.library)

    collected = []
    for library_id, library in libraries.iterrows():
        genome_triple = genome_name_from_library(library)
        metric_name = '{}-{}_picard_markdup.metrics'.format(
            library.analysis_name, genome_triple)
        metric_path = Path(library.analysis_dir) / metric_name
        if not metric_path.exists():
            # Best effort: report and continue rather than abort the run.
            print('{} is missing. Skipping'.format(metric_path))
            continue
        collected.append(parse_picard_metric(metric_path, library_id=library_id))

    report = pandas.DataFrame(collected)
    report.set_index('LIBRARY', inplace=True)
    if args.output:
        report.to_csv(args.output, sep='\t')
    else:
        print(report)
def test_genome_name_from_library_dict(self):
    """A plain dict with genome/annotation/sex keys yields a triple name."""
    attributes = {
        'genome': 'mm10',
        'annotation': 'M21_minimal',
        'sex': 'male',
    }
    self.assertEqual(
        models.genome_name_from_library(attributes),
        'mm10-M21_minimal-male')
    # Non-mapping input is rejected with ValueError.
    self.assertRaises(ValueError, models.genome_name_from_library, 10)
def build_hash_tree(library_filename):
    """Compute alignment hashes for every library in a library table.

    :param str library_filename: filename of a library table to load
    :returns: dict mapping library_id to the hash of its ``*_genome.bam``
              alignment file
    """
    libraries = load_library_tables([library_filename])
    hash_by_library = {}
    for library_id, library in libraries.iterrows():
        bam_name = '{}-{}_genome.bam'.format(
            library.analysis_name, genome_name_from_library(library))
        bam_pathname = os.path.join(library.analysis_dir, bam_name)
        hash_by_library[library_id] = hash_alignments(bam_pathname)
    return hash_by_library
def link_genome_bams(libraries, output_dir):
    """Symlink each library's genome BAM into a per-library output directory.

    :param DataFrame libraries: library table (rows iterated via iterrows)
    :param str output_dir: directory under which one sanitized per-library
        subdirectory is created per library
    """
    for library_id, library in libraries.iterrows():
        clean_library_id = sanitize_library_suffix(library_id)
        target_dir = os.path.join(output_dir, clean_library_id)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

        name = make_bam_track_name(library, library.analysis_dir)
        source_pathname = os.path.join(library.analysis_dir, name)
        # NOTE(review): sibling code (link_rsem, build_hash_tree) joins with
        # '-' before the genome triple; the separator is absent here.
        # Confirm whether that is intentional before renaming the links.
        target_name = clean_library_id + genome_name_from_library(
            library) + '_genome.bam'

        cur_dir = os.getcwd()
        os.chdir(target_dir)
        try:
            if os.path.exists(source_pathname) and not os.path.exists(target_name):
                print(source_pathname, '->', target_name)
                os.symlink(source_pathname, target_name)
        finally:
            # Always restore the working directory, even if symlink raises,
            # so one bad library doesn't break path resolution for the rest.
            os.chdir(cur_dir)
def make_bigwig_track_name(library, signal_type, analysis_root):
    """Locate the bigWig signal track for a library.

    :param Series library: row from a library table DataFrame
    :param str signal_type: either uniq or all to specify bigwig type.
    :param str analysis_root: root directory to be searching for track files
    :returns: path of the bigWig file relative to analysis_root, or None
              (with a warning logged) when no candidate file exists
    """
    assert signal_type in ('uniq', 'all')

    track_name = '{}-{}_{}.bw'.format(
        library.analysis_name,
        genome_name_from_library(library),
        signal_type)

    # Prefer the library's own analysis directory, then the shared root.
    candidates = (
        os.path.join(library.analysis_dir, track_name),
        os.path.join(analysis_root, track_name),
    )
    for candidate in candidates:
        if os.path.exists(candidate):
            return return_subpath(candidate, analysis_root)

    logger.warning("Couldn't find track file %s", track_name)
def link_rsem(libraries, output_dir):
    """Symlink RSEM gene/isoform result files into per-library directories.

    :param DataFrame libraries: library table (rows iterated via iterrows)
    :param str output_dir: directory under which one sanitized per-library
        subdirectory is created per library
    """
    for library_id, library in libraries.iterrows():
        clean_library_id = sanitize_library_suffix(library_id)
        target_dir = os.path.join(output_dir, clean_library_id)
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)

        source_dir = library.analysis_dir
        cur_dir = os.getcwd()
        os.chdir(target_dir)
        try:
            for extension in ['_anno_rsem.genes.results',
                              '_anno_rsem.isoforms.results']:
                suffix = '-' + genome_name_from_library(library) + extension
                source_name = library_id + suffix
                target_name = clean_library_id + suffix
                source_pathname = os.path.join(source_dir, source_name)
                if os.path.exists(
                        source_pathname) and not os.path.exists(target_name):
                    print(source_pathname, '->', target_name)
                    os.symlink(source_pathname, target_name)
        finally:
            # Restore cwd even on error so later libraries resolve their
            # relative link names in the right directory.
            os.chdir(cur_dir)
def make_bam_track_name(library, analysis_root=None):
    """Generate the base path where the bam track is.

    :param Series library: row from a library table DataFrame
    :param str analysis_root: root directory to be searching for track files
        (optional; when None only the library's analysis_dir is searched)
    :returns: path of bam file relative to analysis_root, or None (with a
        warning logged) when no candidate file exists
    """
    genome_triplet = genome_name_from_library(library)
    track_name = library.analysis_name + '-' + genome_triplet + '_genome.bam'
    # Fallback for runs that left the STAR default output name in place.
    old_name = 'Aligned.sortedByCoord.out.bam'

    to_check = [os.path.join(library.analysis_dir, track_name)]
    if analysis_root is not None:
        # os.path.join raises TypeError on None; the default argument is
        # None, so only add this candidate when a root was provided.
        to_check.append(os.path.join(analysis_root, track_name))
    to_check.append(os.path.join(library.analysis_dir, old_name))

    for pathname in to_check:
        if os.path.exists(pathname):
            bai = pathname + '.bai'
            if not os.path.exists(bai):
                logger.warning('Missing index file for {}'.format(pathname))
            return return_subpath(pathname, analysis_root)

    logger.warning("Couldn't find track file %s", track_name)
def test_genome_name_from_library_series(self):
    """Rows loaded from a library table produce genome-annotation-sex names."""
    mm10tsv = resource_filename(__name__, 'library-mm10-se.tsv')
    mm10 = models.load_library_tables([mm10tsv])
    expected = {
        '12304': 'mm10-M4-female',
        '12309': 'mm10-M4-male',
    }
    for library_id, triple in expected.items():
        self.assertEqual(
            models.genome_name_from_library(mm10.loc[library_id]), triple)