示例#1
0
	def testValid(self):
		# A plain 'vMMMM.m__YYYY_MM_DD' string must validate and then parse
		# into its major, minor and date fields.
		raw_version = 'v0030.2__2018_05_02'
		self.assertTrue(ShopVersion.isValidVersionString(raw_version))
		parsed = ShopVersion(raw_version)
		# checked in order: major, minor, date
		expected_fields = (
			('major', 30),
			('minor', 2),
			('date_string', '20180502'),
		)
		for field_name, expected_value in expected_fields:
			self.assertEqual(expected_value, getattr(parsed, field_name))
示例#2
0
	def testValidBigYorubaString(self):
		# This version string has a letter after the minor number ('5b')
		# and a trailing '__bigyoruba' label.
		candidate = 'v0029.5b__2018_02_27__bigyoruba'
		is_valid = ShopVersion.isValidVersionString
		# rejected both by default and when letters are explicitly disallowed
		self.assertFalse(is_valid(candidate))
		self.assertFalse(is_valid(candidate, False))
		# accepted once letters are allowed
		self.assertTrue(is_valid(candidate, True))
示例#3
0
	def testValidMT(self):
		# A version string carrying the 'MT.' prefix must validate and parse
		# into the same major / minor / date fields as an unprefixed one.
		raw_version = 'MT.v0002.0__2018_03_08'
		self.assertTrue(ShopVersion.isValidVersionString(raw_version))
		parsed = ShopVersion(raw_version)
		# checked in order: major, minor, date
		expected_fields = (
			('major', 2),
			('minor', 0),
			('date_string', '20180308'),
		)
		for field_name, expected_value in expected_fields:
			self.assertEqual(expected_value, getattr(parsed, field_name))
	# Read the bam list, look up the latest existing bam for each library, and
	# print each library's parameters with that bam (if any) appended and the
	# version number advanced.
	parser = argparse.ArgumentParser(description="Augment the bam list for a release with a prior existing version of the library")
	parser.add_argument("bam_list", help="Each line contains the parameters to build a library bam for release. This includes the library ID, the individual ID, experiment, read group description (sequencing run name with experiment type and udg treatment), experiment, and (bam, sequencing run date) pairs ")
	args = parser.parse_args()

	with open(args.bam_list) as f:
		library_parameters = [LibraryParameters(line) for line in f]

	for x in library_parameters:
		# any experiment name containing '1240k' is searched for as plain '1240k'
		experiment = x.experiment
		if '1240k' in experiment:
			experiment = '1240k'
		# the 'rsrs' reference is looked up under the MT directory
		search_directory = MT_default_dir if x.reference == 'rsrs' else library_default_dir
		existingBAM = getBamPath(x.library_id, experiment=experiment, reference=x.reference, version_policy='latest', shop_parent_directory=search_directory)
		bam = str(existingBAM)
		if bam:
			try: # this will match a new pipeline bam
				# raw string with an escaped dot so only a literal '.bam' suffix matches
				match = re.search(r'v([0-9]+)\.bam', bam)
				# match is None for other filenames; the resulting AttributeError selects the fallback below
				new_version = int(match.group(1)) + 1
				_, _, date_string = read_group_checks(bam)
			except Exception: # if the existing version is Shop's
				# Exception rather than a bare except, so Ctrl-C and SystemExit still propagate
				new_version = 1
				shop = ShopVersion(bam)
				date_string = shop.date_string
			x.version = new_version
			x.bam_filenames.append(str(existingBAM))
			x.bam_date_strings.append(date_string) # the bam date string is used for generating read groups, which the existing bam does not need
		print(x)
def build_release_library(adna_jar_filename, picard_jar, working_directory,
                          library_parameters, jvm_mem_string, leniency):
    """Build the release bam for one library inside ``working_directory``.

    Every input bam that has aligned reads is filtered to aligned reads only
    and given read groups where needed. The components are then merged with
    Picard MergeSamFiles and deduplicated with Picard MarkDuplicates. When no
    input bam has aligned reads, the first input bam is filtered to aligned
    reads and used as the release library.

    Args:
        adna_jar_filename: jar passed to ``add_read_groups`` and used to run
            ``DuplicatesTagRewrite``.
        picard_jar: Picard jar used for MergeSamFiles and MarkDuplicates.
        working_directory: directory for all intermediate and output files;
            created if it does not exist (its parent must already exist).
        library_parameters: object supplying ``library_id``, ``experiment``,
            ``reference``, ``bam_filenames``, ``bam_date_strings``,
            ``read_group_description``, ``individual_id`` and
            ``get_release_library_name()``.
        jvm_mem_string: JVM option string placed directly after ``java``.
        leniency: when truthy, Picard runs with
            ``VALIDATION_STRINGENCY=LENIENT``; also forwarded to
            ``add_read_groups``.

    Returns:
        The release library filename, which is created inside
        ``working_directory``.

    Raises:
        subprocess.CalledProcessError: if any java command exits non-zero.
    """
    # make a working directory for this library
    pathlib.Path(working_directory).mkdir(exist_ok=True)

    library_id = library_parameters.library_id
    experiment = library_parameters.experiment
    reference = library_parameters.reference
    # add read groups for each library component
    count = 0
    library_component_bams = []
    component_bam_missing_duplicate_tag = False
    for input_bam, bam_date_string in zip(library_parameters.bam_filenames,
                                          library_parameters.bam_date_strings):
        # only bams with reads need to be merged
        if not bam_has_aligned_reads(input_bam):
            continue
        if not bam_has_XD_tag(input_bam):
            component_bam_missing_duplicate_tag = True
        count += 1
        # filter aligned reads only
        component_bam_filename = '{}_{:d}.bam'.format(
            Path(input_bam).stem, count)
        component_bam_path = '{}/{}'.format(working_directory,
                                            component_bam_filename)
        aligned_reads_only(input_bam, component_bam_path)

        output_bam_filename = "{0}_{1:d}.{2}.{3}.bam".format(
            library_id, count, experiment, reference)
        has_read_groups, has_real_library_name, _ = read_group_checks(
            component_bam_path)
        if has_read_groups and has_real_library_name:
            # Previously released libraries already have read groups, and do not need them added
            # Simply include these in the merge. The link target is a bare
            # filename, so it resolves inside working_directory, next to the link.
            os.symlink(
                component_bam_filename,
                '{}/{}'.format(working_directory, output_bam_filename))
        else:
            if has_read_groups:
                # Shop's bams need read groups rewritten, dated from the Shop version
                # NOTE(review): component_bam_filename is only a basename built from
                # the input bam's stem; if ShopVersion needs the versioned directory
                # from the full path, this should probably be input_bam -- confirm.
                shop_version = ShopVersion(component_bam_filename)
                bam_date_string = shop_version.date_string
            # Otherwise: demultiplexed, but unreleased bams need read groups added
            label = "{}_{}".format(
                library_parameters.read_group_description, library_id)
            # leniency is forwarded in both cases so that Shop bams are
            # handled the same way as demultiplexed ones
            add_read_groups(adna_jar_filename, component_bam_filename,
                            output_bam_filename, bam_date_string, label,
                            library_id, library_parameters.individual_id,
                            working_directory, jvm_mem_string, leniency)
        library_component_bams.append(output_bam_filename)

    # use stdout/stderr files by library
    stdout_path = '{}/stdout_build_release_library'.format(working_directory)
    stderr_path = '{}/stderr_build_release_library'.format(working_directory)
    with open(stdout_path, 'w') as stdout_build, \
            open(stderr_path, 'w') as stderr_build:

        leniency_string = "VALIDATION_STRINGENCY=LENIENT" if leniency else ""

        library_filename = library_parameters.get_release_library_name()
        if library_component_bams:  # merge any bams with reads and mark duplicates
            # merge
            library_with_duplicates_filename = "{0}.{1}.{2}.duplicates.bam".format(
                library_id, experiment, reference)
            subprocess.run(
                "java {} -jar {} MergeSamFiles I={} O={} SORT_ORDER=coordinate {}"
                .format(jvm_mem_string, picard_jar,
                        ' I='.join(library_component_bams),
                        library_with_duplicates_filename, leniency_string),
                shell=True,
                check=True,
                cwd=working_directory,
                stdout=stdout_build,
                stderr=stderr_build)

            # if any component bam is missing the XD tag, we cannot use the tag to deduplicate properly with barcodes
            # We treat this conservatively by removing all barcode information and deduplicating based on position and length
            if component_bam_missing_duplicate_tag:
                library_with_duplicates_tag_rewritten_filename = "{0}.{1}.{2}.duplicates.tagxd.bam".format(
                    library_id, experiment, reference)
                # output goes to the same per-library log files as the Picard steps
                subprocess.run(
                    [
                        'java', jvm_mem_string, '-jar', adna_jar_filename,
                        'DuplicatesTagRewrite', '-i',
                        library_with_duplicates_filename, '-o',
                        library_with_duplicates_tag_rewritten_filename
                    ],
                    check=True,
                    cwd=working_directory,
                    stdout=stdout_build,
                    stderr=stderr_build)
                to_deduplicate_filename = library_with_duplicates_tag_rewritten_filename
            else:
                to_deduplicate_filename = library_with_duplicates_filename

            # deduplicate
            subprocess.run(
                "java {0} -jar {1} MarkDuplicates I={2} O={3} M={3}.dedup_stats REMOVE_DUPLICATES=true BARCODE_TAG=XD ADD_PG_TAG_TO_READS=false MAX_FILE_HANDLES=1000 COMPRESSION_LEVEL=9 {4}"
                .format(jvm_mem_string, picard_jar, to_deduplicate_filename,
                        library_filename, leniency_string),
                shell=True,
                check=True,
                cwd=working_directory,
                stdout=stdout_build,
                stderr=stderr_build)
        else:  # There are no reads, so use an empty bam. First bam should exist and be empty, so return a copy of that, ensuring aligned reads only
            aligned_reads_only(library_parameters.bam_filenames[0],
                               working_directory + '/' + library_filename)

    return library_filename
示例#6
0
	def testValidAllowLetter(self):
		# A version string with no letter in it must still validate when the
		# allow-letter argument is True.
		allow_letter = True
		candidate = 'v0030.2__2018_05_02'
		self.assertTrue(ShopVersion.isValidVersionString(candidate, allow_letter))
示例#7
0
	def testMTDirectoryVersion(self):
		# ShopVersion is handed a full bam path; the expected date matches the
		# 'MT.v0001.5__2016_06_27' directory component of that path.
		bam_path = '/n/data1/hms/genetics/reich/1000Genomes/amh_samples/ancientMergeSets__MT/B-per_library_versions/S1137.E1.L4/MT.v0001.5__2016_06_27/merged/aln.sort.mapped.rmdupse_adna_v2.md.bam'
		self.assertEqual('20160627', ShopVersion(bam_path).date_string)
示例#8
0
	def testInvalid(self):
		# A string with no version structure at all must be rejected.
		candidate = 'merged'
		is_valid = ShopVersion.isValidVersionString(candidate)
		self.assertFalse(is_valid)