def _generate_demultiplexed_fastq_demux(self, mtime):
    """Write one gzipped fastq file per sample from the demux file

    Parameters
    ----------
    mtime : float or None
        Forwarded to gzip.GzipFile as the timestamp of every file written

    Returns
    -------
    set
        The samples for which at least one sequence was written

    Raises
    ------
    EBISubmissionError
        If the opened demux file is not a `File` instance

    Notes
    -----
    Samples that yield no sequences are dropped from self.samples,
    self.samples_prep and self.sample_demux_fps, and the file that was
    created for them is deleted.
    """
    # The artifact is expected to carry a single `preprocessed_demux`
    # filepath, so the first match is the one we want
    demux_fps = [fp for _, fp, fp_type in self.artifact.filepaths
                 if fp_type == 'preprocessed_demux']
    demux_fp = demux_fps[0]

    written_samples = set()
    with open_file(demux_fp) as demux_fh:
        if not isinstance(demux_fh, File):
            error_msg = "'%s' doesn't look like a demux file" % demux_fp
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        per_sample = to_per_sample_ascii(demux_fh,
                                         self.prep_template.keys())
        for sample_name, records in per_sample:
            out_fp = self.sample_demux_fps[sample_name]

            n_records = 0
            with GzipFile(out_fp, mode='w', mtime=mtime) as gz_fh:
                for rec in records:
                    gz_fh.write(rec)
                    n_records += 1

            if n_records > 0:
                written_samples.add(sample_name)
            else:
                # Nothing was written for this sample: forget about it and
                # discard the file that GzipFile created
                del self.samples[sample_name]
                del self.samples_prep[sample_name]
                del self.sample_demux_fps[sample_name]
                remove(out_fp)

    return written_samples
def _generate_demultiplexed_fastq_demux(self, mtime):
    """Split the artifact's demux file into per-sample fastq.gz files

    Parameters
    ----------
    mtime : float or None
        Timestamp handed to gzip.GzipFile for each output file

    Returns
    -------
    set
        Samples that had at least one sequence written to disk

    Raises
    ------
    EBISubmissionError
        If opening the demux filepath does not produce a `File` object

    Notes
    -----
    A sample without sequences is removed from self.samples,
    self.samples_prep and self.sample_demux_fps, and its output file is
    deleted right after being closed.
    """
    artifact = self.artifact
    # Only one `preprocessed_demux` filepath is expected per artifact;
    # take the first (and only) one
    demux_path = [
        path for _, path, kind in artifact.filepaths
        if kind == 'preprocessed_demux'
    ][0]

    demux_samples = set()
    with open_file(demux_path) as demux_fh:
        if not isinstance(demux_fh, File):
            error_msg = "'%s' doesn't look like a demux file" % demux_path
            LogEntry.create('Runtime', error_msg)
            raise EBISubmissionError(error_msg)

        for sample, sequences in to_per_sample_ascii(
                demux_fh, self.prep_template.keys()):
            gz_path = self.sample_demux_fps[sample]

            is_empty = True
            with GzipFile(gz_path, mode='w', mtime=mtime) as gz_fh:
                for seq in sequences:
                    gz_fh.write(seq)
                    is_empty = False

            if not is_empty:
                demux_samples.add(sample)
                continue

            # No sequences for this sample: drop every reference to it and
            # delete the file that was just created
            del self.samples[sample]
            del self.samples_prep[sample]
            del self.sample_demux_fps[sample]
            remove(gz_path)

    return demux_samples
def test_to_per_sample_ascii(self):
    """to_per_sample_ascii yields (sample, fastq records) per sample"""
    # delete=False keeps the file on disk after the context manager closes
    # it, so to_hdf5 can read it by name; the path is queued in
    # self.to_remove (presumably cleaned up by the test teardown)
    fastq_file = tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                             delete=False)
    with fastq_file:
        fastq_file.write(fqdata)
    self.to_remove.append(fastq_file.name)

    to_hdf5(fastq_file.name, self.hdf5_file)

    records_a = [
        b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
        b"ABC\n",
    ]
    records_b = [
        b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
        b"DFG\n",
        b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
        b"DEF\n",
    ]
    exp = [(b'a', records_a), (b'b', records_b)]

    obs = []
    for sample, records in to_per_sample_ascii(self.hdf5_file):
        obs.append((sample, list(records)))

    self.assertEqual(obs, exp)
def test_fetch_qual_length_bug(self):
    """Quality strings are trimmed to the length of their sequence"""
    # Regression test: fetch was not trimming qual to the length of the
    # sequence, resulting in qual scores for positions beyond the length of
    # the sequence.
    fastq_file = tempfile.NamedTemporaryFile('r+', suffix='.fq',
                                             delete=False)
    with fastq_file:
        fastq_file.write(fqdata_variable_length)
    # The file is kept on disk (delete=False) so it can be read by name;
    # register it for later removal
    self.to_remove.append(fastq_file.name)

    to_hdf5(fastq_file.name, self.hdf5_file)

    records_a = [
        b"@a_0 orig_bc=abc new_bc=abc bc_diffs=0\nxyz\n+\n"
        b"ABC\n",
    ]
    records_b = [
        b"@b_0 orig_bc=abw new_bc=wbc bc_diffs=4\nqwe\n+\n"
        b"DFG\n",
        b"@b_1 orig_bc=abw new_bc=wbc bc_diffs=4\nqwexx\n+\n"
        b"DEF#G\n",
    ]
    exp = [(b'a', records_a), (b'b', records_b)]

    obs = []
    for sample, records in to_per_sample_ascii(self.hdf5_file):
        obs.append((sample, list(records)))

    self.assertEqual(obs, exp)
def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
    """Generates demultiplexed fastq

    Parameters
    ----------
    rewrite_fastq : bool, optional
        If true, it forces the rewrite of the fastq files
    mtime : float, optional
        The time to use when creating the gz files. If None, the current
        time will be used by gzip.GzipFile. This is useful for testing.

    Returns
    -------
    demux_samples
        Set of successfully demultiplexed samples

    Notes
    -----
    - As a performance feature, this method will check if
      self.full_ebi_dir already exists and, if it does, the script will
      assume that in a previous execution this step was performed
      correctly and will simply read the file names from
      self.full_ebi_dir
    - When the object is created (init), samples, samples_prep and
      sample_demux_fps hold values for all available samples in the
      database. Here some of those values will be deleted (del's, within
      the loops) for those cases where the fastq.gz files weren't written
      or don't exist. This is an indication that they had no sequences and
      this kind of files are not accepted in EBI
    - When rewrite_fastq is true and self.full_ebi_dir already exists, the
      directory is reused: per-sample files are overwritten in place and
      files left over from a previous run are not removed.

    Raises
    ------
    EBISubmissionError
        - The demux file couldn't be read
        - All samples are removed
    """
    ar = self.artifact
    demux_samples = set()

    dir_not_exists = not isdir(self.full_ebi_dir)
    if dir_not_exists or rewrite_fastq:
        # Only create the directory when it is missing: makedirs raises if
        # the path already exists, which made every forced rewrite over an
        # existing directory fail before any file was written
        if dir_not_exists:
            makedirs(self.full_ebi_dir)

        # An artifact will hold only one file of type `preprocessed_demux`
        # Thus, we only use the first one (the only one present)
        demux = [path for _, path, ftype in ar.filepaths
                 if ftype == 'preprocessed_demux'][0]

        with open_file(demux) as demux_fh:
            if not isinstance(demux_fh, File):
                error_msg = "'%s' doesn't look like a demux file" % demux
                LogEntry.create('Runtime', error_msg)
                raise EBISubmissionError(error_msg)

            for s, i in to_per_sample_ascii(demux_fh,
                                            self.prep_template.keys()):
                sample_fp = self.sample_demux_fps[s]
                wrote_sequences = False
                with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
                    for record in i:
                        fh.write(record)
                        wrote_sequences = True

                if wrote_sequences:
                    demux_samples.add(s)
                else:
                    # Samples without sequences are dropped, together with
                    # the file that was just created for them
                    del self.samples[s]
                    del self.samples_prep[s]
                    del self.sample_demux_fps[s]
                    remove(sample_fp)
    else:
        # Trust the previous run: every `<sample>.fastq.gz` file found in
        # the directory counts as a demultiplexed sample
        extension = '.fastq.gz'
        extension_len = len(extension)
        for f in listdir(self.full_ebi_dir):
            fpath = join(self.full_ebi_dir, f)
            if isfile(fpath) and f.endswith(extension):
                demux_samples.add(f[:-extension_len])

        # The key set is materialized first so the mappings can be
        # modified safely while looping
        missing_samples = set(self.samples.keys()) - demux_samples
        for ms in missing_samples:
            del self.samples[ms]
            del self.samples_prep[ms]
            del self.sample_demux_fps[ms]

    if not demux_samples:
        error_msg = ("All samples were removed from the submission "
                     "because the demux file is empty or the sample names "
                     "do not match.")
        LogEntry.create('Runtime', error_msg)
        raise EBISubmissionError(error_msg)

    return demux_samples
def generate_demultiplexed_fastq(self, rewrite_fastq=False, mtime=None):
    """Generates demultiplexed fastq

    Parameters
    ----------
    rewrite_fastq : bool, optional
        If true, it forces the rewrite of the fastq files
    mtime : float, optional
        The time to use when creating the gz files. If None, the current
        time will be used by gzip.GzipFile. This is useful for testing.

    Returns
    -------
    demux_samples
        Set of successfully demultiplexed samples

    Notes
    -----
    - As a performance feature, this method will check if
      self.full_ebi_dir already exists and, if it does, the script will
      assume that in a previous execution this step was performed
      correctly and will simply read the file names from
      self.full_ebi_dir
    - When the object is created (init), samples, samples_prep and
      sample_demux_fps hold values for all available samples in the
      database. Here some of those values will be deleted (del's, within
      the loops) for those cases where the fastq.gz files weren't written
      or don't exist. This is an indication that they had no sequences and
      this kind of files are not accepted in EBI
    - A forced rewrite over an existing self.full_ebi_dir reuses the
      directory: per-sample files are overwritten in place and files left
      over from a previous run are not removed.

    Raises
    ------
    EBISubmissionError
        - The demux file couldn't be read
        - All samples are removed
    """
    ar = self.artifact
    demux_samples = set()

    dir_not_exists = not isdir(self.full_ebi_dir)
    if dir_not_exists or rewrite_fastq:
        if dir_not_exists:
            # makedirs raises when the path already exists, so it must not
            # run on a forced rewrite over an existing directory
            makedirs(self.full_ebi_dir)

        # An artifact will hold only one file of type `preprocessed_demux`
        # Thus, we only use the first one (the only one present)
        demux = [path for _, path, ftype in ar.filepaths
                 if ftype == 'preprocessed_demux'][0]

        with open_file(demux) as demux_fh:
            if not isinstance(demux_fh, File):
                error_msg = "'%s' doesn't look like a demux file" % demux
                LogEntry.create('Runtime', error_msg)
                raise EBISubmissionError(error_msg)

            for s, i in to_per_sample_ascii(demux_fh,
                                            self.prep_template.keys()):
                sample_fp = self.sample_demux_fps[s]
                wrote_sequences = False
                with GzipFile(sample_fp, mode='w', mtime=mtime) as fh:
                    for record in i:
                        fh.write(record)
                        wrote_sequences = True

                if wrote_sequences:
                    demux_samples.add(s)
                else:
                    # No sequences: forget the sample and delete the file
                    # that was created for it
                    del self.samples[s]
                    del self.samples_prep[s]
                    del self.sample_demux_fps[s]
                    remove(sample_fp)
    else:
        # Reuse the output of a previous run: each `<sample>.fastq.gz`
        # file in the directory names a demultiplexed sample
        extension = '.fastq.gz'
        extension_len = len(extension)
        for f in listdir(self.full_ebi_dir):
            fpath = join(self.full_ebi_dir, f)
            if isfile(fpath) and f.endswith(extension):
                demux_samples.add(f[:-extension_len])

        # Build the set of keys up front so deleting entries below does not
        # interfere with the iteration
        missing_samples = set(self.samples.keys()) - demux_samples
        for ms in missing_samples:
            del self.samples[ms]
            del self.samples_prep[ms]
            del self.sample_demux_fps[ms]

    if not demux_samples:
        error_msg = ("All samples were removed from the submission "
                     "because the demux file is empty or the sample names "
                     "do not match.")
        LogEntry.create('Runtime', error_msg)
        raise EBISubmissionError(error_msg)

    return demux_samples