def _do_validate_hg_prefix(makefile, prefix, contigs, fatal):
    """Complain about a human-genome prefix with lexically sorted contigs.

    Nothing happens unless `_is_invalid_hg_prefix(contigs)` is true. When it
    is, a message naming the makefile (makefile["Statistics"]["Filename"])
    and the prefix (prefix["Path"]) is built and then either

      - raised as a MakefileError, if `fatal` is true, or
      - printed to stderr as a warning via `print_warn` otherwise.
    """
    if not _is_invalid_hg_prefix(contigs):
        return

    template = (
        "Prefix appears to be a human genome, but chromosomes are ordered\n"
        "lexically (chr1, chr10, chr11, ...), rather than numerically\n"
        "(chr1, chr2, chr3, ...):\n\n"
        " Makefile = %s\n"
        " Prefix = %s\n\n"
        "GATK requires that human chromosomes are ordered numerically;\n%s\n"
        "See the documentation at the GATK website for more information:\n "
        "http://www.broadinstitute.org/gatk/guide/article?id=1204\n"
    )

    prefix_path = prefix["Path"]
    mkfile_path = makefile["Statistics"]["Filename"]

    # Only the final piece of advice differs between the two outcomes
    if fatal:
        details = "Either disable GATK in the makefile, or fix the prefix."
    else:
        details = \
            "You will not be able to use the resulting BAM file with GATK."

    message = template % (mkfile_path, prefix_path, details)
    if fatal:
        raise MakefileError(message)

    print_warn("\nWARNING:\n", message, file=sys.stderr, sep="")
def _update_filtering(mkfile):
    """Replace the 'FilterSingletons' section with resolved sample selections.

    Each key of mkfile["Project"]["FilterSingletons"] must name a sample (not
    a "<group>") and must not list itself among the samples it is filtered
    by. The associated value is expanded using `_select_samples`; if the
    expansion contains the target itself (e.g. through a group), the target
    is dropped from the selection and a warning is printed.

    The resulting {target: selection} mapping is written back to
    mkfile["Project"]["FilterSingletons"].

    Raises MakefileError for group targets, unknown samples, self-references,
    and selections that end up empty.
    """
    project = mkfile["Project"]
    samples = project["Samples"]
    groups = project["Groups"]

    filtering = {}
    for (target, filter_by) in project["FilterSingletons"].iteritems():
        if target.startswith("<") and target.endswith(">"):
            raise MakefileError("Singleton-filtering must be specified per "
                                "sample, not by groups: %r" % (target,))
        if target not in samples:
            raise MakefileError("Unknown/Invalid sample specifed for "
                                "singleton filtering: %r" % (target,))
        if target in filter_by:
            raise MakefileError("Attempting to filter singleton in sample "
                                "using itself as comparison: %r" % (target,))

        path = "Project:FilterSingletons:%s" % (target,)
        selection = _select_samples(filter_by, groups, samples, path)

        # Implicit inclusion (via a group the sample belongs to) is allowed,
        # but the sample itself can never be part of its own comparison set.
        if target in selection:
            selection = selection - set((target,))
            print_warn("Warning: Sample %r is singleton-filtered using a "
                       "group it is also a member of; this may be by mistake."
                       % (target,))

        if not selection:
            raise MakefileError("No samples specified by which to "
                                "singleton-filter by for %r" % (target,))

        filtering[target] = selection

    mkfile["Project"]["FilterSingletons"] = filtering
def _check_genders(mkfile):
    """Validate gender declarations of regions against those of samples.

    For every entry in mkfile["Project"]["Regions"], the keys of its
    "HomozygousContigs" mapping are taken as the genders known for that
    region; all regions are required to declare the same genders, and at
    least one gender must be declared. Every sample's "Gender" must be one
    of those genders.

    Contigs listed under "HomozygousContigs" that are not reported by
    `_collect_fasta_contigs` only trigger a warning.

    Raises MakefileError on mismatching, missing, or unknown genders.
    """
    known_contigs = set()
    homozygous_contigs = set()
    expected_genders = set()

    for regions in mkfile["Project"]["Regions"].itervalues():
        known_contigs.update(_collect_fasta_contigs(regions))

        for contigs in regions["HomozygousContigs"].itervalues():
            homozygous_contigs.update(contigs)

        observed_genders = set(regions["HomozygousContigs"])
        if expected_genders:
            if expected_genders != observed_genders:
                raise MakefileError("List of genders for regions %r does not "
                                    "match other regions" % (regions["Name"],))
        else:
            # First (non-empty) set of genders becomes the reference
            expected_genders = observed_genders

    if not expected_genders:
        raise MakefileError("No genders have been specified in makefile; "
                            "please list all sample genders and assosiated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].itervalues():
        if sample["Gender"] in expected_genders:
            continue

        genders = ", ".join(map(repr, expected_genders))
        message = "Sample %r has unknown gender %r; known genders are %s" \
            % (sample["Name"], sample["Gender"], genders)
        raise MakefileError(message)

    unknown_contigs = homozygous_contigs - known_contigs
    if not unknown_contigs:
        return

    print_warn("WARNING: Unknown contig(s) in 'HomozygousContigs':\n - "
               + "\n - ".join(unknown_contigs))
    print_warn("Please verify that the list(s) of contigs is correct!")
def _check_genders(mkfile):
    """Check that regions and samples agree on the set of genders.

    The genders of a region are the keys of its "HomozygousContigs" mapping.
    All regions in mkfile["Project"]["Regions"] must list the same genders,
    at least one gender must be listed, and each sample in
    mkfile["Project"]["Samples"] must use one of them; otherwise a
    MakefileError is raised.

    A warning (not an error) is printed for homozygous contigs that
    `_collect_fasta_contigs` does not report for any of the regions.
    """
    fasta_contigs = set()
    listed_contigs = set()
    reference_genders = set()

    for regions in mkfile["Project"]["Regions"].itervalues():
        fasta_contigs.update(_collect_fasta_contigs(regions))

        for contigs in regions["HomozygousContigs"].itervalues():
            listed_contigs.update(contigs)

        region_genders = set(regions["HomozygousContigs"])
        if not reference_genders:
            reference_genders = region_genders
        elif reference_genders != region_genders:
            message = "List of genders for regions %r does not " \
                "match other regions" % (regions["Name"],)
            raise MakefileError(message)

    if not reference_genders:
        raise MakefileError("No genders have been specified in makefile; "
                            "please list all sample genders and assosiated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].itervalues():
        if sample["Gender"] not in reference_genders:
            known = ", ".join(map(repr, reference_genders))
            raise MakefileError(
                "Sample %r has unknown gender %r; known genders are %s"
                % (sample["Name"], sample["Gender"], known))

    # Contigs named in the makefile but not found among the FASTA contigs
    missing = listed_contigs - fasta_contigs
    if missing:
        listing = "\n - ".join(missing)
        print_warn("WARNING: Unknown contig(s) in 'HomozygousContigs':\n - "
                   + listing)
        print_warn("Please verify that the list(s) of contigs is correct!")
def _validate_makefiles_duplicate_files(makefiles):
    """Detect input files that are listed by more than one record.

    Every filename found in the records yielded by `_iterate_over_records`
    is resolved with os.path.realpath and mapped to the
    (target, sample, library, barcode) tuple of each record listing it.
    For records of type "Raw", the values of record["Data"] are iterables of
    filenames; for all other types the values are the filenames themselves.

    Paths listed more than once are grouped by the distinct records that
    list them. For each group:

      - a MakefileError is raised if two distinct records in the group
        belong to the same target;
      - otherwise a warning describing the group is printed to stderr.

    Raises:
        MakefileError: If a path is included multiple times in one target.
    """
    filenames = collections.defaultdict(list)
    for makefile in makefiles:
        iterator = _iterate_over_records(makefile)
        for (target, sample, library, barcode, record) in iterator:
            current_filenames = []
            if record["Type"] == "Raw":
                for raw_filenames in record["Data"].itervalues():
                    current_filenames.extend(raw_filenames)
            else:
                current_filenames.extend(record["Data"].values())

            for realpath in map(os.path.realpath, current_filenames):
                filenames[realpath].append((target, sample, library, barcode))

    has_overlap = {}
    for (filename, records) in filenames.iteritems():
        if len(records) > 1:
            # The distinct records are sorted so that equal sets of records
            # always yield equal lists; set iteration order is arbitrary, and
            # differently ordered lists would not compare equal below, which
            # would split a single group into several.
            has_overlap[filename] = sorted(set(records))

    # groupby only merges consecutive items with equal keys, hence the sort
    by_records = sorted(zip(has_overlap.values(), has_overlap.keys()))
    for (records, pairs) in itertools.groupby(by_records, lambda x: x[0]):
        pairs = list(pairs)
        description = _describe_files_in_multiple_records(records, pairs)

        # Fewer distinct targets than distinct records means that at least
        # two different records of the same target share the file(s).
        if len(set(record[0] for record in records)) != len(records):
            message = "Path included multiple times in target:\n"
            raise MakefileError(message + description)
        else:
            print_warn("WARNING: Path included in multiple targets:",
                       file=sys.stderr)
            print_warn(description, file=sys.stderr)
            print_warn(file=sys.stderr)
def _update_filtering(mkfile):
    """Expand singleton-filtering settings into explicit sample selections.

    Reads mkfile["Project"]["FilterSingletons"], a mapping from a sample
    name to a specification of what to filter it by, validates each entry,
    resolves the specification with `_select_samples`, and stores the
    resulting {sample: selection} mapping back under the same key.

    A sample reached implicitly through its own group is removed from its
    selection with a warning; naming it explicitly is an error.

    Raises MakefileError if a key is a "<group>", is not a known sample,
    is listed in its own specification, or ends up with an empty selection.
    """
    samples = mkfile["Project"]["Samples"]
    groups = mkfile["Project"]["Groups"]

    resolved = {}
    settings = mkfile["Project"]["FilterSingletons"]
    for (target, filter_by) in settings.iteritems():
        # Pick the applicable complaint (if any), in order of precedence
        problem = None
        if target.startswith("<") and target.endswith(">"):
            problem = "Singleton-filtering must be specified per " \
                "sample, not by groups: %r"
        elif target not in samples:
            problem = "Unknown/Invalid sample specifed for singleton " \
                "filtering: %r"
        elif target in filter_by:
            problem = "Attempting to filter singleton in sample using " \
                "itself as comparison: %r"

        if problem is not None:
            raise MakefileError(problem % (target,))

        path = "Project:FilterSingletons:%s" % (target,)
        resolved[target] = _select_samples(filter_by, groups, samples, path)

        # Being included by way of a group is tolerated, since filtering a
        # sample against its own group is useful, but the sample itself is
        # never a valid comparison and is therefore taken out again.
        if target in resolved[target]:
            resolved[target] = resolved[target] - set((target,))
            print_warn(
                "Warning: Sample %r is singleton-filtered using a "
                "group it is also a member of; this may be by mistake."
                % (target,))

        if not resolved[target]:
            raise MakefileError("No samples specified by which to "
                                "singleton-filter by for %r" % (target,))

    mkfile["Project"]["FilterSingletons"] = resolved
def _validate_makefile_adapters(makefile):
    """Warn if a default adapter is given in the wrong orientation.

    Looks at the "AdapterRemoval" options of every record yielded by
    `_iterate_over_records`, and at those set on the makefile itself, and
    prints a warning if --pcr2 or --adapter2 is set to the default mate 2
    adapter in the orientation that the option does not expect. This is a
    typical mistake when using the --pcr2 option.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    # Option -> the value that is suspicious for that option
    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2),
    }

    def adapterrm_options():
        # Per-record options first, followed by the makefile-wide options
        for (_, _, _, _, record) in _iterate_over_records(makefile):
            yield record.get("Options", {}).get("AdapterRemoval", {})
        yield makefile.get("Options", {}).get("AdapterRemoval", {})

    results = dict.fromkeys(tests, False)
    for options in adapterrm_options():
        for key, value in tests.iteritems():
            if options.get(key) == value:
                results[key] = True

    if not any(results.itervalues()):
        return

    print_warn("WARNING: An adapter specified for AdapterRemoval "
               "corresponds to the default sequence, but is reverse "
               "complemented. Please make sure that this is intended! ",
               end="")

    if results["--pcr2"]:
        print_warn("For --pcr2, the sequence given should be the "
                   "reverse complement of the sequence observed in the "
                   "mate 2 FASTQ file.\n")

    if results["--adapter2"]:
        print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                   "should be exactly as observed in the FASTQ reads.\n")
def _validate_makefile_adapters(makefile):
    """Flag default adapter sequences passed to AdapterRemoval backwards.

    The "AdapterRemoval" options of each record (as yielded by
    `_iterate_over_records`) and of the makefile as a whole are compared
    against the default mate 2 adapter in the orientation that is wrong for
    the option in question; a match for --pcr2 and/or --adapter2 results in
    a warning explaining what the option expects. Mixing up the orientation
    is a typical mistake when using the --pcr2 option.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2),
    }

    # Names of the options found to be set to their suspicious value
    flagged = set()

    def collect(options):
        for option, suspicious in tests.iteritems():
            if options.get(option) == suspicious:
                flagged.add(option)

    for (_, _, _, _, record) in _iterate_over_records(makefile):
        collect(record.get("Options", {}).get("AdapterRemoval", {}))

    collect(makefile.get("Options", {}).get("AdapterRemoval", {}))

    if flagged:
        print_warn("WARNING: An adapter specified for AdapterRemoval "
                   "corresponds to the default sequence, but is reverse "
                   "complemented. Please make sure that this is intended! ",
                   end="")

        if "--pcr2" in flagged:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if "--adapter2" in flagged:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")