def check_fastq_files(filenames, required_offset, allow_empty=False):
    for filename in filenames:
        qualities = _read_sequences(filename)
        offsets = fastq.classify_quality_strings(qualities)
        if offsets == fastq.OFFSET_BOTH:
            raise NodeError("FASTQ file contains quality scores with both "
                            "quality offsets (33 and 64); file may be in an "
                            "unexpected format or corrupt. Please ensure "
                            "that this file contains valid FASTQ reads from "
                            "a single source.\n    Filename = %r"
                            % (filename,))
        elif offsets == fastq.OFFSET_MISSING:
            if allow_empty and not qualities:
                return

            raise NodeError("FASTQ file did not contain quality scores; "
                            "file may be in an unexpected format or corrupt. "
                            "Ensure that the file is a FASTQ file.\n"
                            "    Filename = %r" % (filename,))
        # NB: 'AMBIGIOUS' matches the spelling of the constant in the fastq
        # module.
        elif offsets not in (fastq.OFFSET_AMBIGIOUS, required_offset):
            raise NodeError("FASTQ file contains quality scores with the "
                            "wrong quality score offset (%i); expected reads "
                            "with quality score offset %i. Ensure that the "
                            "'QualityOffset' specified in the makefile "
                            "corresponds to the input.\n    Filename = %s"
                            % (offsets, required_offset, filename))
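# A minimal, hypothetical sketch of the classification consumed above; the
# actual 'fastq.classify_quality_strings' implementation may differ. It uses
# the standard heuristic: Phred+33 scores occupy ASCII 33-74 and Phred+64
# scores occupy ASCII 59-104, so characters below 59 imply offset 33, while
# characters above 74 imply offset 64; strings confined to the overlap are
# ambiguous. All names below are illustrative, not paleomix's own.
_OFFSET_33, _OFFSET_64 = 33, 64
_OFFSET_BOTH, _OFFSET_MISSING, _OFFSET_AMBIGIOUS = "BOTH", "MISSING", "AMBIGIOUS"


def _classify_quality_strings_sketch(qualities):
    has_33 = has_64 = False
    for quality in qualities:
        for char in quality.rstrip("\n"):
            if ord(char) < 59:    # only representable with Phred+33
                has_33 = True
            elif ord(char) > 74:  # only representable with Phred+64
                has_64 = True

    if not qualities:
        return _OFFSET_MISSING
    elif has_33 and has_64:
        return _OFFSET_BOTH
    elif has_33:
        return _OFFSET_33
    elif has_64:
        return _OFFSET_64

    return _OFFSET_AMBIGIOUS  # every character fits either offset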
def _collect_qualities(handle, filename):
    header = handle.readline()
    while header:
        sequence = handle.readline()
        separator = handle.readline()
        qualities = handle.readline()

        if not header.startswith("@"):
            if header.startswith(">"):
                raise NodeError("Input file appears to be in FASTA format "
                                "(header starts with '>', expected '@'), "
                                "but only FASTQ files are supported\n"
                                "    Filename = %r" % (filename,))

            raise NodeError("Input file lacks FASTQ header (expected '@', "
                            "found %r), but only FASTQ files are supported\n"
                            "    Filename = %r" % (header[:1], filename))
        elif not qualities:
            raise NodeError("Partial record found; record is not 4 lines "
                            "long:\n    Filename = %r\n    Record = '%s'"
                            % (filename, header.rstrip()))
        elif not separator.startswith("+"):
            raise NodeError("Input file lacks FASTQ separator (expected '+', "
                            "found %r), but only FASTQ files are supported\n"
                            "    Filename = %r" % (separator[:1], filename))
        elif len(sequence) != len(qualities):
            raise NodeError("Input file contains malformed FASTQ records; "
                            "lengths of sequence / qualities are not the "
                            "same.\n    Filename = %r\n    Record = '%s'"
                            % (filename, header.rstrip()))

        yield qualities
        header = handle.readline()
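# Example usage (a sketch): _collect_qualities yields the raw quality line of
# each well-formed 4-line FASTQ record. Python 2's StringIO stands in for a
# file handle here, matching the Python 2 idioms used throughout this code.
from StringIO import StringIO

_EXAMPLE_FASTQ = ("@read_1\nACGT\n+\nIIII\n"
                  "@read_2\nTTGA\n+\nHHHH\n")

for _quality in _collect_qualities(StringIO(_EXAMPLE_FASTQ), "example.fastq"):
    print(repr(_quality))  # prints 'IIII\n', then 'HHHH\n'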
def _validate_fasta_line(filename, linenum, line):
    invalid_chars = frozenset(line) - _VALID_CHARS
    if invalid_chars:
        if invalid_chars == frozenset('\r'):
            raise NodeError("FASTA file contains carriage-returns ('\\r')!\n"
                            "Please convert file to unix format, using e.g. "
                            "dos2unix.\n    Filename = %r\n" % (filename,))

        raise NodeError("FASTA sequence contains invalid characters\n"
                        "    Filename = %r\n    Line = %r\n"
                        "    Invalid characters = %r"
                        % (filename, linenum, "".join(invalid_chars)))
def _read_sequences(filename):
    """Collects the sequences from a PHYLIP file, and returns the header,
    the names of the sequences, and the sequences themselves. The parser
    supports interleaved sequences (as produced by the pipeline), or simple
    sequential sequences (each paired name and sequence on a single line),
    as produced by RAxML's reduce functionality. PHYLIP files containing
    multiple entries are not supported."""
    line, header = " ", None
    with open(filename) as handle:
        # Find header
        num_sequences = num_bases = 0
        while line:
            line = handle.readline()
            if line.strip():
                header = line
                num_sequences, num_bases = map(int, line.split())
                break

        names = [None for _ in xrange(num_sequences)]
        sequences = [[] for _ in xrange(num_sequences)]

        line_num = 0
        while line:
            line = handle.readline()
            line_strip = line.strip()
            if line_strip:
                # The first N sequences are expected to contain sample names
                index = line_num % num_sequences
                if line_num < num_sequences:
                    name, line_strip = line_strip.split(None, 1)
                    names[index] = name

                sequences[index].extend(line_strip.split())
                line_num += 1

    if len(sequences) != num_sequences:
        message = ("Expected %i sequences, but found %i in PHYLIP file:\n"
                   "    Filename = %r") % (num_sequences,
                                           len(sequences),
                                           filename)
        raise NodeError(message)

    for (index, fragments) in enumerate(sequences):
        sequences[index] = "".join(fragments)
        if len(sequences[index]) != num_bases:
            message = ("Expected %ibp sequences, found %ibp sequence for %r\n"
                       "    Filename = %r") % (num_bases,
                                               len(sequences[index]),
                                               names[index],
                                               filename)
            raise NodeError(message)

    return header, names, sequences
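# Example input (a sketch): a sequential PHYLIP file with two samples of 8 bp
# each; an interleaved file differs only in that blocks after the first omit
# the sample names. The temporary file is used purely for illustration.
import tempfile

_EXAMPLE_PHYLIP = ("2 8\n"
                   "sample_1  ACGTACGT\n"
                   "sample_2  ACGTTGCA\n")

with tempfile.NamedTemporaryFile(suffix=".phy") as _phylip:
    _phylip.write(_EXAMPLE_PHYLIP)
    _phylip.flush()

    _header, _names, _sequences = _read_sequences(_phylip.name)
    assert _names == ["sample_1", "sample_2"]
    assert _sequences == ["ACGTACGT", "ACGTTGCA"]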
def _validate_fasta_header(filename, linenum, line, cache):
    name = line.split(" ", 1)[0][1:]
    if not name:
        raise NodeError("FASTA sequence must have a non-empty name\n"
                        "    Filename = %r\n    Line = %r\n"
                        % (filename, linenum))
    elif not _RE_REF_NAME.match(name):
        raise NodeError("Invalid name for FASTA sequence: %r\n"
                        "    Filename = %r\n    Line = %r\n"
                        % (name, filename, linenum))
    elif name in cache:
        raise NodeError("FASTA sequences have identical names\n"
                        "    Filename = %r\n    Name = %r\n"
                        "    Line 1 = %r\n    Line 2 = %r\n"
                        % (filename, name, linenum, cache[name]))

    cache[name] = linenum
def _teardown(self, config, temp):
    # Validate output from MAFFT
    output_file = reroot_path(temp, self._output_file)
    try:
        MSA.from_file(output_file)
    except MSAError, error:
        raise NodeError("Invalid MSA produced by MAFFT:\n%s" % (error,))
def _read_partitions(filename):
    """Reads a partition file, as produced by the pipeline itself, and
    returns a list of tuples containing the (start, end) coordinates;
    each line is expected to follow the format

        DNA, Name = Start-End

    Multiple regions and skips are not supported."""
    partitions = []
    with open(filename) as handle:
        # 1-based line numbers, for use in error messages
        for (line_num, line) in enumerate(handle, start=1):
            result = _RE_PARTITION.match(line.rstrip())
            if result:
                start, end = result.groups()
            else:
                result = _RE_PARTITION_SINGLE.match(line.rstrip())
                if not result:
                    message = ("Line %i in partitions file does not follow "
                               "expected format:\n"
                               "  Expected, either = 'DNA, Name = Start-End'\n"
                               "            or     = 'DNA, Name = Start'\n"
                               "  Found = %r") % (line_num, line.rstrip())
                    raise NodeError(message)

                start, = result.groups()
                end = start

            partitions.append((int(start) - 1, int(end)))

    return partitions
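# The _RE_PARTITION regexes are defined elsewhere in the module; the patterns
# below are plausible stand-ins (an assumption, not the actual definitions),
# shown to illustrate the two accepted line formats and the conversion to
# 0-based, half-open coordinates:
#
#     'DNA, Chr1 = 1-100'  ->  (0, 100)
#     'DNA, Chr2 = 101'    ->  (100, 101)
import re

_RE_PARTITION = re.compile(r"DNA, \w+ = (\d+)-(\d+)$")
_RE_PARTITION_SINGLE = re.compile(r"DNA, \w+ = (\d+)$")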
def _report_failure(cls, bed, fragment):
    message = "Failed to extract region from " \
              "reference sequence at %s:%i-%i; got " \
              "%i bp, but expected %i bp." \
              % (bed.contig, bed.start, bed.end,
                 len(fragment), (bed.end - bed.start))
    raise NodeError(message)
def _read_sequences(filename):
    cat_call = factory.new("cat")
    cat_call.add_multiple_values((filename,))
    cat_call = cat_call.finalized_call

    cat = None
    try:
        cat = subprocess.Popen(cat_call,
                               bufsize=io.DEFAULT_BUFFER_SIZE,
                               stderr=subprocess.PIPE,
                               stdout=subprocess.PIPE)
        qualities = _collect_qualities(cat.stdout, filename)

        return sampling.reservoir_sampling(qualities, 100000)
    except:
        if cat:
            cat.kill()
            cat.wait()
            cat = None
        raise
    finally:
        rc_cat = cat.wait() if cat else 0
        if rc_cat:
            message = "Error running 'paleomix cat':\n" \
                      "  Unicat return-code = %i\n\n%s" \
                      % (rc_cat, cat.stderr.read())
            raise NodeError(message)
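# For context, a minimal sketch of reservoir sampling (Algorithm R), the
# technique 'sampling.reservoir_sampling' is presumed to implement: keep the
# first k items, then replace a uniformly random slot with decreasing
# probability, so every item ends up in the sample with probability k/n.
# The name and signature below are illustrative, not paleomix's own.
import random

def _reservoir_sampling_sketch(items, downsample_to, rng=random):
    reservoir = []
    for (index, item) in enumerate(items):
        if index < downsample_to:
            # Fill the reservoir with the first k items
            reservoir.append(item)
        else:
            # Replace an existing item with probability k / (index + 1)
            slot = rng.randint(0, index)
            if slot < downsample_to:
                reservoir[slot] = item

    return reservoir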
def _check_bwa_prefix(prefix):
    try:
        bwa_version = BWA_VERSION.version
    except versions.VersionRequirementError:
        return  # Ignored here, reported elsewhere

    if bwa_version >= (0, 6, 0):
        for extension in (".rbwt", ".rpac", ".rsa"):
            if os.path.exists(prefix + extension):
                raise NodeError(
                    "BWA version is v%s, but prefix appears to be created "
                    "using v0.5.x!\n"
                    "\tPlease remove '%s.*' and rebuild index using "
                    "'bwa index %s'"
                    % (".".join(map(str, bwa_version)), prefix, prefix))
def _read_sequences(filenames):
    expected_groups = None
    for filename in sorted(filenames):
        msa = read_msa(filename)

        if not expected_groups:
            expected_groups = set(msa)
        elif set(msa) != expected_groups:
            difference = expected_groups.symmetric_difference(msa)
            raise NodeError("Unexpected/missing groups for sequence (%s): %s"
                            % (filename, ", ".join(difference)))

        yield (filename, msa)
def _run(self, config, temp):
    try:
        CommandNode._run(self, config, temp)
    except NodeError, error:
        err_message = "DNA damage levels are too low"
        if self._command.join() == [1]:
            fpath = os.path.join(temp, "pipe_mapDamage.stdout")
            with open(fpath) as handle:
                for line in handle:
                    if err_message in line:
                        line = line.strip().replace("Warning:", "ERROR:")
                        error = NodeError("%s\n\n%s" % (error, line))
                        break

        raise error
def test_run__exceptions():
    cfg_mock = flexmock(temp_root="/tmp")

    def build_tests(key, exception, expectation):
        @nose.tools.raises(expectation)
        def test_function():
            node_mock = flexmock(Node())
            node_mock.should_receive(key).and_raise(exception).once
            with MonkeypatchCreateTempDir():
                node_mock.run(cfg_mock)  # pylint: disable=E1103

        return test_function

    for key in ('_setup', '_run', '_teardown'):
        yield build_tests(key, TypeError("The castle AAARGH!"),
                          NodeUnhandledException)
        yield build_tests(key, NodeError("He's a very naughty boy!"),
                          NodeError)
def _check_bwa_prefix(prefix):
    """Checks that a given prefix is compatible with the currently installed
    version of BWA. This is required in order to allow auto-indexing of
    prefixes, as indexes produced by v0.5.x and by v0.6+ are not only
    incompatible, but differ in the files produced, with v0.5.x producing a
    handful of additional files.

    As a consequence, simply using normal input-file dependencies would
    result in prefixes being re-indexed if the version of BWA was changed
    from v0.6+ to v0.5.x, and in failures during runtime if the version was
    changed from v0.5.x to v0.6+.

    This function treats a difference between the version of BWA installed
    and the version implied by the prefix files as an error, and therefore
    requires user intervention."""
    if prefix in _PREFIXES_CHECKED:
        return
    _PREFIXES_CHECKED.add(prefix)

    try:
        bwa_version = BWA_VERSION.version
    except versions.VersionRequirementError:
        return  # Ignored here, reported elsewhere

    # Files unique to v0.5.x
    v05x_files = set((prefix + ext) for ext in (".rbwt", ".rpac", ".rsa"))
    # Files common to v0.5.x, v0.6.x, and v0.7.x
    common_files = set((prefix + ext)
                       for ext in (".amb", ".ann", ".bwt", ".pac", ".sa"))
    all_files = v05x_files | common_files
    current_files = all_files - set(missing_files(all_files))

    expected_version = None
    if (current_files & common_files):
        if bwa_version >= (0, 6, 0):
            if (current_files & v05x_files):
                expected_version = "v0.5.x"
        elif bwa_version < (0, 6, 0):
            if not (current_files & v05x_files):
                expected_version = "v0.6.x or later"

    if expected_version:
        raise NodeError(
            "BWA version is v%s, but prefix appears to be created using %s!\n"
            "  Your copy of BWA may have changed, or you may be using the wrong\n"
            "  prefix. To resolve this issue, either change your prefix, re-install\n"
            "  BWA %s, or remove the prefix files at\n"
            "    $ ls %s.*"
            % (".".join(map(str, bwa_version)),
               expected_version, expected_version, prefix))
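# For reference (an informal summary of the sets built above): 'bwa index'
# always writes .amb/.ann/.bwt/.pac/.sa, while v0.5.x additionally writes the
# reverse-complement index files .rbwt/.rpac/.rsa. Hence:
#
#     BWA >= 0.6 and .r* files present  ->  prefix built by v0.5.x  -> error
#     BWA <  0.6 and .r* files missing  ->  prefix built by v0.6.x+ -> error
#
# A quick manual inspection of a prefix might use a (hypothetical) helper:
import glob

def _list_prefix_files(prefix):
    # e.g. _list_prefix_files("indexes/hg19.fa")
    #   -> ['indexes/hg19.fa.amb', 'indexes/hg19.fa.ann', ...]
    return sorted(glob.glob(prefix + ".*"))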
def _setup(self, _config, _temp):
    for filename in self._infiles.itervalues():
        with open(filename + ".fai") as handle:
            sequences = set()
            for line in handle:
                sequences.add(line.split("\t", 1)[0])

        missing_sequences = list(self._sequences - sequences)
        if missing_sequences:
            if len(missing_sequences) >= 4:
                missing_sequences = missing_sequences[:3]
                missing_sequences.append("...")

            message = ("FASTA file does not contain expected sequences:\n"
                       "  File      = %r\n"
                       "  Sequences = %s\n") \
                % (filename, ", ".join(missing_sequences))
            raise NodeError(message)
def test_run__error_log__node_error():
    @with_temp_folder
    def _do_test_run__error_log__node_error(temp_folder, exception):
        temp = os.path.join(temp_folder, "xTMPx")
        cfg_mock = flexmock(temp_root=temp_folder)
        node_mock = flexmock(Node())
        node_mock.should_receive("_create_temp_dir").with_args(cfg_mock) \
            .and_return(temp).ordered.once
        node_mock.should_receive("_run").and_raise(exception).ordered.once

        os.mkdir(temp)
        assert_raises(NodeError, node_mock.run, cfg_mock)  # pylint: disable=E1103
        log_file = os.path.join(temp_folder, "xTMPx", "pipe.errors")
        assert os.path.exists(log_file)
        assert_in("Errors =", get_file_contents(log_file))

    yield _do_test_run__error_log__node_error, NodeError("ARGH!")
    yield _do_test_run__error_log__node_error, OSError("ARGH!")
def customize(cls, input_file_1, input_file_2, output_file, reference,
              prefix, threads=2, dependencies=()):
    aln = _bowtie2_template(("bowtie2",), prefix,
                            IN_FILE_1=input_file_1,
                            # Setting IN_FILE_2 to None makes AtomicCmd
                            # ignore this key
                            IN_FILE_2=input_file_2 or None,
                            OUT_STDOUT=AtomicCmd.PIPE,
                            CHECK_VERSION=BOWTIE2_VERSION)
    aln.set_option("-x", prefix)

    if input_file_1 and not input_file_2:
        aln.set_option("-U", "%(IN_FILE_1)s")
    elif input_file_1 and input_file_2:
        aln.set_option("-1", "%(IN_FILE_1)s")
        aln.set_option("-2", "%(IN_FILE_2)s")
    else:
        raise NodeError("Input 1, OR both input 1 and input 2 must be "
                        "specified for Bowtie2 node")

    max_threads = _get_max_threads(reference, threads)
    aln.set_option("--threads", max_threads)

    order, commands = _process_output(aln, output_file, reference,
                                      run_fixmate=(input_file_1
                                                   and input_file_2))

    commands["aln"] = aln
    return {"commands": commands,
            "order": ["aln"] + order,
            "threads": max_threads,
            "dependencies": dependencies}
def _read_coverage_tables(cls, key, filenames):
    hits = nts = 0
    for filename in filenames:
        subtable = {}
        read_coverage_table(subtable, filename)
        contigtables = get_in(subtable, key)

        if contigtables is None:
            raise NodeError("Error reading table %r; row not found:\n"
                            "    %s ...\n\n"
                            "If files have been renamed during the run, then "
                            "please remove this file so that it may be "
                            "re-generated.\nHowever, note that read-group "
                            "tags in the BAM files may not be correct!"
                            % (filename, "    ".join(key)))

        for contigtable in contigtables.itervalues():
            hits += contigtable["Hits"]
            nts += contigtable["M"]

    return hits, nts
def test_run__error_log__node_error():
    @with_temp_folder
    def _do_test_run__error_log__node_error(temp_folder, exception):
        cfg_mock = flexmock(temp_root=temp_folder)
        node_mock = flexmock(Node())
        node_mock.should_receive("_run").and_raise(exception).once

        try:
            os.mkdir(os.path.join(temp_folder, "xTMPx"))
            with MonkeypatchCreateTempDir(root=temp_folder,
                                          subfolder="xTMPx"):
                # pylint: disable=E1103
                node_mock.run(cfg_mock)  # pragma: no coverage
        except NodeError:
            log_file = os.path.join(temp_folder, "xTMPx", "pipe.errors")
            assert os.path.exists(log_file)
            assert_in("Errors =", get_file_contents(log_file))
            return

        assert False  # pragma: no coverage

    yield _do_test_run__error_log__node_error, NodeError("ARGH!")
    yield _do_test_run__error_log__node_error, OSError("ARGH!")
def test_run__exceptions():
    cfg_mock = flexmock(temp_root=_DUMMY_TEMP_ROOT)

    def build_tests(key, exception, expectation):
        def test_function():
            node_mock = flexmock(Node())
            node_mock.should_receive('_create_temp_dir').with_args(cfg_mock) \
                .and_return(_DUMMY_TEMP).ordered.once
            node_mock.should_receive(key).and_raise(exception).ordered.once
            node_mock.should_receive('_remove_temp_dir').never

            # pylint: disable=E1103
            assert_raises(expectation, node_mock.run, cfg_mock)

        return test_function

    for key in ('_setup', '_run', '_teardown'):
        yield build_tests(key, TypeError("The castle AAARGH!"),
                          NodeUnhandledException)
        yield build_tests(key, NodeError("He's a very naughty boy!"),
                          NodeError)
def _process_reads(cls, observed_reads, output_files):
    for ((name, _, _), fpaths) in observed_reads.iteritems():
        if len(fpaths) > 1:
            message = ["Read %r found in multiple files:" % (name,)]
            for fpath in fpaths:
                message.append("  - %r" % (fpath,))
            message.append("")
            message.append("This indicates that the same data files have "
                           "been included multiple times in the project. "
                           "Please review the input files used in this "
                           "project, to ensure that each set of data is "
                           "included only once.\n\n"
                           "If this is not the case, then execute the "
                           "following command(s) to mark this test as having "
                           "succeeded:")

            for fpath in output_files:
                message.append("$ touch '%s'" % (fpath,))

            raise NodeError("\n".join(message))
def check_fasta_file(filename):
    with open(filename) as handle:
        namecache = {}
        state, linelength, linelengthchanged = _NA, None, False
        for linenum, line in enumerate(handle, start=1):
            line = line.rstrip('\n\r')

            if not line:
                if state in (_NA, _IN_WHITESPACE):
                    continue
                elif state == _IN_HEADER:
                    raise NodeError("Expected FASTA sequence, found empty "
                                    "line\n    Filename = %r\n    Line = %r"
                                    % (filename, linenum))
                elif state == _IN_SEQUENCE:
                    state = _IN_WHITESPACE
                else:
                    assert False
            elif line.startswith(">"):
                if state in (_NA, _IN_SEQUENCE, _IN_WHITESPACE):
                    _validate_fasta_header(filename, linenum, line, namecache)
                    state = _IN_HEADER
                    linelength = None
                    linelengthchanged = False
                elif state == _IN_HEADER:
                    raise NodeError("Empty sequences not allowed\n"
                                    "    Filename = %r\n    Line = %r"
                                    % (filename, linenum - 1))
                else:
                    assert False
            else:
                if state == _NA:
                    raise NodeError("Expected FASTA header, found %r\n"
                                    "    Filename = %r\n    Line = %r"
                                    % (line, filename, linenum))
                elif state == _IN_HEADER:
                    _validate_fasta_line(filename, linenum, line)
                    linelength = len(line)
                    state = _IN_SEQUENCE
                elif state == _IN_SEQUENCE:
                    _validate_fasta_line(filename, linenum, line)
                    # If the length has changed, then that line must be the
                    # last line in the record, which may be shorter due to the
                    # sequence length. This is because the FAI index format
                    # expects that each line has the same length.
                    if linelengthchanged or (linelength < len(line)):
                        raise NodeError("Lines in FASTA files must be of "
                                        "the same length\n"
                                        "    Filename = %r\n"
                                        "    Line = %r" % (filename, linenum))
                    elif linelength != len(line):
                        linelengthchanged = True
                elif state == _IN_WHITESPACE:
                    raise NodeError("Empty lines not allowed in sequences\n"
                                    "    Filename = %r\n    Line = %r"
                                    % (filename, linenum))
                else:
                    assert False

    if state in (_NA, _IN_HEADER):
        raise NodeError("File does not contain any sequences\n"
                        "    Filename = %r" % (filename,))
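# Illustration (a sketch): the line-length rule enforced above mirrors the
# FAI index format, where every line of a record must share one length except
# the last, which may be shorter. The helper and inputs below are purely for
# demonstration; they rely on the definitions above being importable.
import tempfile

_GOOD_FASTA = ">ok\nACGTACGT\nACGTACGT\nACG\n"
_BAD_FASTA = ">bad\nACGTACGT\nACGT\nACGTACGT\n"


def _check_fasta_string(text):
    with tempfile.NamedTemporaryFile(suffix=".fasta") as handle:
        handle.write(text)
        handle.flush()

        try:
            check_fasta_file(handle.name)
            return "ok"
        except NodeError, error:
            return "error: %s" % (error,)


print(_check_fasta_string(_GOOD_FASTA))  # ok (short final line is allowed)
print(_check_fasta_string(_BAD_FASTA))   # error: Lines in FASTA files ...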