def testValid(self):
    """A well-formed Shop version string validates and parses into major, minor, and date."""
    vs = 'v0030.2__2018_05_02'
    self.assertTrue(ShopVersion.isValidVersionString(vs))
    parsed = ShopVersion(vs)
    # zero-padded major ('0030') parses to the integer 30
    self.assertEqual(30, parsed.major)
    self.assertEqual(2, parsed.minor)
    # underscores in the date are collapsed to YYYYMMDD
    self.assertEqual('20180502', parsed.date_string)
def testValidBigYorubaString(self):
    """A version string with a letter suffix ('5b') is valid only when letters are allowed."""
    vs = 'v0029.5b__2018_02_27__bigyoruba'
    # default (and explicit False) disallows the letter suffix
    self.assertFalse(ShopVersion.isValidVersionString(vs))
    self.assertFalse(ShopVersion.isValidVersionString(vs, False))
    # explicitly allowing letters accepts it
    self.assertTrue(ShopVersion.isValidVersionString(vs, True))
def testValidMT(self):
    """An MT-prefixed version string validates and parses like a plain one."""
    vs = 'MT.v0002.0__2018_03_08'
    self.assertTrue(ShopVersion.isValidVersionString(vs))
    parsed = ShopVersion(vs)
    self.assertEqual(2, parsed.major)
    self.assertEqual(0, parsed.minor)
    self.assertEqual('20180308', parsed.date_string)
# Script entry: for each library in the bam list, look up the latest previously
# released bam and append it (with its date string) to the library's inputs,
# bumping the version number past the existing release.
parser = argparse.ArgumentParser(description="Augment the bam list for a release with a prior existing version of the library")
parser.add_argument("bam_list", help="Each line contains the parameters to build a library bam for release. This includes the library ID, the individual ID, experiment, read group description (sequencing run name with experiment type and udg treatment), experiment, and (bam, sequencing run date) pairs ")
args = parser.parse_args()

with open(args.bam_list) as f:
    library_parameters = [LibraryParameters(line) for line in f]

for x in library_parameters:
    # any 1240k-family experiment is searched under the plain '1240k' label
    experiment = x.experiment
    if '1240k' in experiment:
        experiment = '1240k'
    # MT libraries live in a separate directory tree
    search_directory = MT_default_dir if x.reference == 'rsrs' else library_default_dir
    existingBAM = getBamPath(x.library_id, experiment=experiment, reference=x.reference, version_policy='latest', shop_parent_directory=search_directory)
    bam = str(existingBAM)
    if len(bam) > 0:
        # New-pipeline bams end in 'v<N>.bam'; Shop-era bams do not.
        # Branch on the match explicitly instead of a bare except, which
        # previously swallowed real errors (e.g. from read_group_checks)
        # and silently misclassified the bam as Shop-era.
        match = re.search(r'v([0-9]+)\.bam', bam)
        if match is not None:
            # new pipeline bam: next release is one past the existing version
            new_version = int(match.group(1)) + 1
            has_read_groups, has_real_library_name, date_string = read_group_checks(bam)
        else:
            # Shop-era bam: releases restart at version 1; date comes from the path
            new_version = 1
            shop = ShopVersion(bam)
            date_string = shop.date_string
        x.version = new_version
        x.bam_filenames.append(str(existingBAM))
        # the bam date string is used for generating read groups, which the existing bam does not need
        x.bam_date_strings.append(date_string)
    print(x)
def build_release_library(adna_jar_filename, picard_jar, working_directory, library_parameters, jvm_mem_string, leniency):
    """Build a single released library bam by merging and deduplicating its component bams.

    For each component bam with aligned reads: filter to aligned reads only, then
    ensure it has proper read groups (adding or rewriting them as needed). The
    prepared components are merged with Picard MergeSamFiles and deduplicated with
    Picard MarkDuplicates. If no component has reads, an empty aligned-reads-only
    copy of the first input bam is produced instead.

    Args:
        adna_jar_filename: path to the adna tools jar (read-group operations).
        picard_jar: path to the Picard jar (merge / mark duplicates).
        working_directory: per-library scratch directory; created if absent.
        library_parameters: LibraryParameters with library_id, experiment,
            reference, bam_filenames, bam_date_strings, etc.
        jvm_mem_string: JVM memory flag string passed to every java invocation.
        leniency: if truthy, run Picard with VALIDATION_STRINGENCY=LENIENT.

    Returns:
        The release library bam filename (relative to working_directory).
    """
    # make a working directory for this library
    pathlib.Path(working_directory).mkdir(exist_ok=True)
    library_id = library_parameters.library_id
    experiment = library_parameters.experiment
    reference = library_parameters.reference

    # Prepare each component bam, adding read groups where needed
    count = 0
    library_component_bams = []
    component_bam_missing_duplicate_tag = False
    for input_bam, bam_date_string in zip(library_parameters.bam_filenames, library_parameters.bam_date_strings):
        # only bams with reads need to be merged
        if bam_has_aligned_reads(input_bam):
            if not bam_has_XD_tag(input_bam):
                component_bam_missing_duplicate_tag = True
            count += 1
            # filter aligned reads only
            component_bam_filename = '{}_{:d}.bam'.format(Path(input_bam).stem, count)
            component_bam_path = '{}/{}'.format(working_directory, component_bam_filename)
            aligned_reads_only(input_bam, component_bam_path)
            output_bam_filename = "{0}_{1:d}.{2}.{3}.bam".format(library_id, count, experiment, reference)
            has_read_groups, has_real_library_name, date_string = read_group_checks(component_bam_path)
            if not has_read_groups:
                # Demultiplexed, but unreleased bams need read groups added
                label = "{}_{}".format(library_parameters.read_group_description, library_id)
                add_read_groups(adna_jar_filename, component_bam_filename, output_bam_filename, bam_date_string, label, library_id, library_parameters.individual_id, working_directory, jvm_mem_string, leniency)
            elif not has_real_library_name:
                # Shop's bams need read groups rewritten; date comes from the Shop version string
                shop_version = ShopVersion(component_bam_filename)
                bam_date_string = shop_version.date_string
                label = "{}_{}".format(library_parameters.read_group_description, library_id)
                # FIX: pass leniency here too — it was previously omitted, so
                # Shop-era bams were never processed with lenient validation.
                add_read_groups(adna_jar_filename, component_bam_filename, output_bam_filename, bam_date_string, label, library_id, library_parameters.individual_id, working_directory, jvm_mem_string, leniency)
            else:
                # Previously released libraries already have read groups, and do not need them added
                # Simply include these in the merge
                os.symlink(component_bam_filename, '{}/{}'.format(working_directory, output_bam_filename))
            library_component_bams.append(output_bam_filename)

    # use stderr by library
    with open('{}/stdout_build_release_library'.format(working_directory), 'w') as stdout_build, \
            open('{}/stderr_build_release_library'.format(working_directory), 'w') as stderr_build:
        leniency_string = "VALIDATION_STRINGENCY=LENIENT" if leniency else ""
        library_filename = library_parameters.get_release_library_name()
        if len(library_component_bams) > 0:
            # merge any bams with reads and mark duplicates
            library_with_duplicates_filename = "{0}.{1}.{2}.duplicates.bam".format(library_id, experiment, reference)
            subprocess.run(
                "java {} -jar {} MergeSamFiles I={} O={} SORT_ORDER=coordinate {}".format(
                    jvm_mem_string, picard_jar, ' I='.join(library_component_bams),
                    library_with_duplicates_filename, leniency_string),
                shell=True, check=True, cwd=working_directory,
                stdout=stdout_build, stderr=stderr_build)
            # if any component bam is missing the XD tag, we cannot use the tag to deduplicate properly with barcodes
            # We treat this conservatively by removing all barcode information and deduplicating based on position and length
            if component_bam_missing_duplicate_tag:
                library_with_duplicates_tag_rewritten_filename = "{0}.{1}.{2}.duplicates.tagxd.bam".format(library_id, experiment, reference)
                subprocess.run([
                    'java', jvm_mem_string, '-jar', adna_jar_filename,
                    'DuplicatesTagRewrite',
                    '-i', library_with_duplicates_filename,
                    '-o', library_with_duplicates_tag_rewritten_filename
                ], check=True, cwd=working_directory)
                to_deduplicate_filename = library_with_duplicates_tag_rewritten_filename
            else:
                to_deduplicate_filename = library_with_duplicates_filename
            # deduplicate
            subprocess.run(
                "java {0} -jar {1} MarkDuplicates I={2} O={3} M={3}.dedup_stats REMOVE_DUPLICATES=true BARCODE_TAG=XD ADD_PG_TAG_TO_READS=false MAX_FILE_HANDLES=1000 COMPRESSION_LEVEL=9 {4}".format(
                    jvm_mem_string, picard_jar, to_deduplicate_filename,
                    library_filename, leniency_string),
                shell=True, check=True, cwd=working_directory,
                stdout=stdout_build, stderr=stderr_build)
        else:
            # There are no reads, so use an empty bam. First bam should exist and be empty,
            # so return a copy of that, ensuring aligned reads only
            aligned_reads_only(library_parameters.bam_filenames[0], working_directory + '/' + library_filename)
    return library_filename
def testValidAllowLetter(self):
    """Allowing a letter suffix must not reject a string that has none."""
    vs = 'v0030.2__2018_05_02'
    self.assertTrue(ShopVersion.isValidVersionString(vs, True))
def testMTDirectoryVersion(self):
    """ShopVersion extracts the date from a version component embedded in a full path."""
    bam_path = '/n/data1/hms/genetics/reich/1000Genomes/amh_samples/ancientMergeSets__MT/B-per_library_versions/S1137.E1.L4/MT.v0001.5__2016_06_27/merged/aln.sort.mapped.rmdupse_adna_v2.md.bam'
    parsed = ShopVersion(bam_path)
    self.assertEqual('20160627', parsed.date_string)
def testInvalid(self):
    """A bare path component with no version pattern is rejected."""
    not_a_version = 'merged'
    self.assertFalse(ShopVersion.isValidVersionString(not_a_version))