class CasavaOneEightLanelessPerSampleDirFmt(model.DirectoryFormat):
    """Directory of per-sample ``.fastq.gz`` files named without a lane field.

    Member files are matched by the ``sequences`` collection, whose names
    follow ``<sample>_<barcode>_R<1|2>_001.fastq.gz``.
    """

    sequences = model.FileCollection(
        r'.+_.+_R[12]_001\.fastq\.gz',
        format=FastqGzFormat,
    )

    @sequences.set_path_maker
    def sequences_path_maker(self, sample_id, barcode_id, read_number):
        """Return the file name for one sample/barcode/read combination.

        ``read_number`` is rendered with ``%d``, so it must be numeric.
        """
        name_fields = (sample_id, barcode_id, read_number)
        return '%s_%s_R%d_001.fastq.gz' % name_fields
class JSONDirectory(model.DirectoryFormat):
    """Directory format made of a ``manifest.csv`` and a set of JSON files."""

    manifest = model.File("manifest.csv", format=ModelManifest)
    json_files = model.FileCollection(r".+\.json", format=JSONFormat)

    # NOTE(review): the "sbml" prefix does not match the ``json_files``
    # collection this path maker serves; the name is kept as-is so existing
    # callers are unaffected.
    @json_files.set_path_maker
    def sbml_path_maker(self, model_id):
        """Return the file name used to store the JSON file for ``model_id``."""
        filename = "%s.json" % model_id
        return filename
class SBMLDirectory(model.DirectoryFormat):
    """Directory format made of a ``manifest.csv`` and a set of ``.xml`` files."""

    manifest = model.File("manifest.csv", format=ModelManifest)
    sbml_files = model.FileCollection(r".+\.xml", format=SBMLFormat)

    @sbml_files.set_path_maker
    def sbml_path_maker(self, model_id):
        """Return the file name used to store the SBML file for ``model_id``."""
        filename = "%s.xml" % model_id
        return filename
class CommunityModelDirectory(model.DirectoryFormat):
    """Directory format made of a ``manifest.csv`` and ``.pickle`` model files."""

    manifest = model.File("manifest.csv", format=CommunityModelManifest)
    model_files = model.FileCollection(
        r".+\.pickle",
        format=CommunityModelFormat,
    )

    @model_files.set_path_maker
    def model_path_maker(self, model_id):
        """Return the file name used to store the model for ``model_id``."""
        filename = "%s.pickle" % model_id
        return filename
class CasavaOneEightSingleLanePerSampleDirFmt(model.DirectoryFormat):
    """Directory of per-sample, single-lane ``.fastq.gz`` files.

    File names follow
    ``<sample>_<barcode>_L<3 digits>_R<1|2>_001.fastq.gz``.
    """

    # When true, forward and reverse reads may coexist but their sample ids
    # must match up; when false, finding both directions is an error.
    _CHECK_PAIRED = True
    # When true, validation fails unless both directions are present.
    _REQUIRE_PAIRED = False

    sequences = model.FileCollection(
        r'.+_.+_L[0-9][0-9][0-9]_R[12]_001\.fastq\.gz',
        format=FastqGzFormat)

    @sequences.set_path_maker
    def sequences_path_maker(self, sample_id, barcode_id, lane_number,
                             read_number):
        """Return the file name for one sample/barcode/lane/read combination.

        ``lane_number`` is zero-padded to three digits.
        """
        name_fields = (sample_id, barcode_id, lane_number, read_number)
        return '%s_%s_L%03d_R%d_001.fastq.gz' % name_fields

    def _find_duplicates(self, ids):
        """Return the set of values that occur more than once in ``ids``."""
        occurrences = collections.Counter(ids)
        return {value for value, seen in occurrences.items() if seen > 1}

    def _validate_(self, level):
        """Check the directory layout and the forward/reverse pairing.

        ``level`` is accepted but not consulted.

        Raises
        ------
        ValidationError
            If the directory contains a subdirectory, a sample id is repeated
            within one read direction, or the forward/reverse reads violate
            the ``_CHECK_PAIRED`` / ``_REQUIRE_PAIRED`` settings.
        """
        forward_ids = []
        reverse_ids = []

        for entry in self.path.iterdir():
            if entry.is_dir():
                # A path such as Human_Kneecap/S1_L001_R1_001.fastq.gz
                # technically satisfies the collection regex. Rejecting any
                # subdirectory outright is simpler than a more elaborate
                # pattern and gives a clearer error message.
                subdir = entry.relative_to(self.path)
                raise ValidationError("Contains a subdirectory: %s" % subdir)

            filename = entry.name
            if not filename.endswith('_001.fastq.gz'):
                continue

            # Drop the last four underscore-separated fields; whatever is
            # left is treated as the sample id.
            sample_id = filename.rsplit('_', maxsplit=4)[0]
            if filename.endswith('R1_001.fastq.gz'):
                forward_ids.append(sample_id)
            else:
                reverse_ids.append(sample_id)

        unique_forward = set(forward_ids)
        unique_reverse = set(reverse_ids)

        if len(unique_forward) != len(forward_ids):
            raise ValidationError('Duplicate samples in forward reads: %r'
                                  % self._find_duplicates(forward_ids))
        if len(unique_reverse) != len(reverse_ids):
            raise ValidationError('Duplicate samples in reverse reads: %r'
                                  % self._find_duplicates(reverse_ids))

        if not (forward_ids and reverse_ids):
            # At most one read direction is present.
            if self._REQUIRE_PAIRED:
                raise ValidationError("Reads are not paired end.")
            return

        if not self._CHECK_PAIRED:
            raise ValidationError("Forward and reverse reads found.")

        unmatched = unique_forward ^ unique_reverse
        if unmatched:
            raise ValidationError(
                "These samples do not have matching pairs of forward and "
                "reverse reads: %r" % unmatched)
class FourIntsDirectoryFormat(model.DirectoryFormat):
    """
    Exactly four integers, one per file, spread over several files.

    Files 1 and 2 sit at the top level and files 3 and 4 live in a nested
    directory. Because this models a sequence, the integers are ordered (by
    file name) and the same value may appear more than once.

    """

    single_ints = model.FileCollection(
        r'file[1-2]\.txt|nested/file[3-4]\.txt',
        format=SingleIntFormat)

    @single_ints.set_path_maker
    def single_ints_path_maker(self, num):
        """Return the relative path of the file holding integer number ``num``.

        Raises ``ValueError`` unless ``0 < num < 5``.
        """
        if not 0 < num < 5:
            raise ValueError("`num` must be 1-4, not %r." % num)

        # Positions 3 and 4 are stored inside the nested directory.
        template = 'nested/file%d.txt' if num > 2 else 'file%d.txt'
        return template % num
class MinHashSigJsonDirFormat(model.DirectoryFormat):
    """Directory format holding a collection of ``.sig`` files."""

    signatures = model.FileCollection(r'.*\.sig', format=MinHashSigJson)

    @signatures.set_path_maker
    def signature_path_maker(self, name):
        """Return the file name for the signature called ``name``."""
        extension = '.sig'
        return name + extension
class CasavaOneEightSingleLanePerSampleDirFmt(model.DirectoryFormat):
    """Directory of per-sample, single-lane ``.fastq.gz`` files.

    File names follow
    ``<sample>_<barcode>_L<3 digits>_R<1|2>_001.fastq.gz``.
    """

    # When true, forward and reverse reads may coexist but their sample ids
    # must match up; when false, finding both directions is an error.
    _CHECK_PAIRED = True
    # When true, validation fails unless both directions are present.
    _REQUIRE_PAIRED = False

    sequences = model.FileCollection(
        r'.+_.+_L[0-9][0-9][0-9]_R[12]_001\.fastq\.gz',
        format=FastqGzFormat)

    @sequences.set_path_maker
    def sequences_path_maker(self, sample_id, barcode_id, lane_number,
                             read_number):
        """Return the file name for one sample/barcode/lane/read combination.

        ``lane_number`` is zero-padded to three digits.
        """
        name_fields = (sample_id, barcode_id, lane_number, read_number)
        return '%s_%s_L%03d_R%d_001.fastq.gz' % name_fields

    def _find_duplicates(self, ids):
        """Return the set of values that occur more than once in ``ids``."""
        occurrences = collections.Counter(ids)
        return {value for value, seen in occurrences.items() if seen > 1}

    @property
    def manifest(self):
        """Build a manifest table describing the files in ``sequences``.

        A temporary ``FastqManifestFormat`` is written with one
        ``sample-id,filename,direction`` row per file and converted with
        ``_manifest_to_df``. The result is guaranteed to have ``forward`` and
        ``reverse`` columns, and every non-``None`` entry in them is rewritten
        to a string path located under ``self.path``.

        Returns the object produced by ``_manifest_to_df`` (presumably a
        pandas DataFrame -- confirm against that helper).
        """
        scratch_manifest = FastqManifestFormat()
        with scratch_manifest.open() as handle:
            handle.write('sample-id,filename,direction\n')
            for fastq_path, _ in self.sequences.iter_views(FastqGzFormat):
                parsed = _parse_casava_filename(fastq_path)
                sample_id, _, _, _, direction = parsed
                row = (sample_id, fastq_path.name, direction)
                handle.write('%s,%s,%s\n' % row)

        table = _manifest_to_df(scratch_manifest, self.path.parent)

        # Make sure both direction columns exist, even when only one read
        # direction is present in the directory.
        for column in ('reverse', 'forward'):
            if column not in table:
                table[column] = None

        def munge_fn_closure(val):
            # Leave missing entries alone; otherwise keep only the file name
            # and anchor it to this directory.
            if val is None:
                return val
            return str(self.path / pathlib.Path(val).name)

        for column in ('forward', 'reverse'):
            table[column] = table[column].apply(munge_fn_closure)

        return table

    def _validate_(self, level):
        """Check the directory layout and the forward/reverse pairing.

        ``level`` is accepted but not consulted.

        Raises
        ------
        ValidationError
            If the directory contains a subdirectory, a sample id is repeated
            within one read direction, or the forward/reverse reads violate
            the ``_CHECK_PAIRED`` / ``_REQUIRE_PAIRED`` settings.
        """
        forward_ids = []
        reverse_ids = []

        for entry in self.path.iterdir():
            if entry.is_dir():
                # A path such as Human_Kneecap/S1_L001_R1_001.fastq.gz
                # technically satisfies the collection regex. Rejecting any
                # subdirectory outright is simpler than a more elaborate
                # pattern and gives a clearer error message.
                subdir = entry.relative_to(self.path)
                raise ValidationError("Contains a subdirectory: %s" % subdir)

            filename = entry.name
            if not filename.endswith('_001.fastq.gz'):
                continue

            # Drop the last four underscore-separated fields; whatever is
            # left is treated as the sample id.
            sample_id = filename.rsplit('_', maxsplit=4)[0]
            if filename.endswith('R1_001.fastq.gz'):
                forward_ids.append(sample_id)
            else:
                reverse_ids.append(sample_id)

        unique_forward = set(forward_ids)
        unique_reverse = set(reverse_ids)

        if len(unique_forward) != len(forward_ids):
            raise ValidationError('Duplicate samples in forward reads: %r'
                                  % self._find_duplicates(forward_ids))
        if len(unique_reverse) != len(reverse_ids):
            raise ValidationError('Duplicate samples in reverse reads: %r'
                                  % self._find_duplicates(reverse_ids))

        if not (forward_ids and reverse_ids):
            # At most one read direction is present.
            if self._REQUIRE_PAIRED:
                raise ValidationError("Reads are not paired end.")
            return

        if not self._CHECK_PAIRED:
            raise ValidationError("Forward and reverse reads found.")

        unmatched = unique_forward ^ unique_reverse
        if unmatched:
            raise ValidationError(
                "These samples do not have matching pairs of forward and "
                "reverse reads: %r" % unmatched)
class FASTAFilesDirFmt(model.DirectoryFormat):
    """Directory format holding a collection of ``.fasta`` files."""

    fastas = model.FileCollection(r'.+\.fasta', format=DNAFASTAFormat)

    @fastas.set_path_maker
    def fastas_path_maker(self, name):
        """Return the file name for the FASTA file called ``name``."""
        extension = '.fasta'
        return name + extension
class SAMFilesDirFmt(model.DirectoryFormat):
    """Directory format holding a collection of ``.sam`` files."""

    sams = model.FileCollection(r'.+\.sam', format=SAMFormat)

    @sams.set_path_maker
    def sams_path_maker(self, name):
        """Return the file name for the SAM file called ``name``."""
        extension = '.sam'
        return name + extension
class BAMFilesDirFmt(model.DirectoryFormat):
    """Directory format holding a collection of ``.bam`` files."""

    bams = model.FileCollection(r'.+\.bam', format=BAMFormat)

    @bams.set_path_maker
    def bams_path_maker(self, name):
        """Return the file name for the BAM file called ``name``."""
        extension = '.bam'
        return name + extension
class PileUpFilesDirFmt(model.DirectoryFormat):
    """Directory format holding a collection of pile-up ``.tsv`` files."""

    pileups = model.FileCollection(r'.+\.tsv', format=PileUpTSVFormat)

    @pileups.set_path_maker
    def pileups_path_maker(self, name):
        """Return the file name for the pile-up table called ``name``."""
        extension = '.tsv'
        return name + extension