def test_find_files_search(self, mock_walk):
    """Find files with ``find_files`` in search mode.

    The mocked walk is primed with ``self.walk``; the pattern ".fastq" is
    passed with ``search=True`` and both fastq files are expected back.
    """
    mock_walk.return_value = self.walk
    expected = [
        './1_121023_FLOWCELL_FOO.fastq.gz',
        './foo/1_121023_FLOWCELL_BAR.fastq.gz',
    ]
    found = find_files(regexp=".fastq", search=True)
    self.assertListEqual(found, expected)
def generic_target_generator(tgt_re, src_re=None, samples=[], runs=[], sample_column_map={}, sampleinfo="", target_suffix="", filter_suffix="", **kwargs):
    """Generic target generator.

    Targets are generated from the first of the following sources that
    applies:

    1. ``samples`` and ``runs``, when both are non-empty and of equal
       length (a length mismatch is logged and the next source is tried);
    2. ``sampleinfo``, when it is an existing file name or a
       ``csv.DictReader`` (a missing file is logged and the next source
       is tried);
    3. input files located with ``find_files`` using the source regular
       expression.

    Args:
      tgt_re (RegexpDict): RegexpDict object corresponding to the target
                           regular expression
      src_re (RegexpDict): RegexpDict object corresponding to the source
                           regular expression; defaults to ``tgt_re``
      samples: list of sample names
      runs: list of runs
      sample_column_map: dictionary that maps sampleinfo column names to
                         regexp group names, e.g. {'SampleID':'SM', 'Lane':'PU1'}
      sampleinfo: sample information file name, or a ``csv.DictReader``
      target_suffix: suffix of generated targets
      filter_suffix: suffix to use for filtering when generating target
                     names based on input files
      **kwargs: accepted but not used by this function

    Returns:
      sorted list of target names; an empty list if no targets could be
      generated

    Raises:
      AssertionError: if ``tgt_re``/``src_re`` is not a RegexpDict, or if
                      ``sampleinfo`` is neither a string nor a
                      ``csv.DictReader``
    """
    # NOTE: the list/dict defaults are only ever read here, never mutated.
    assert isinstance(tgt_re, RegexpDict), "tgt_re argument must be of type {}".format(RegexpDict)
    if src_re is None:
        src_re = tgt_re
    assert isinstance(src_re, RegexpDict), "src_re argument must be of type {}".format(RegexpDict)

    # 1. Generate targets from command line options
    if samples and runs:
        smllogger.debug("trying to gather target information based on configuration keys 'samples' and 'runs'")
        if len(samples) == len(runs):
            basename_pattern = src_re.basename_pattern
            mlist = []
            for (s, r) in zip(samples, runs):
                # Use basename searches for samples and runs; search once
                # per run and reuse the match object
                match = re.search(basename_pattern, r)
                m = match.groupdict() if match is not None else {}
                # Runs that do not match (or yield no named groups) are skipped
                if m:
                    m.update({'SM': s})
                    mlist.append(m)
            tgts = [tgt_re.fmt.format(**m) + target_suffix for m in mlist]
            return sorted(tgts)
        else:
            # NOTE(review): if smllogger is a stdlib logging.Logger, .warn is
            # a deprecated alias of .warning -- confirm before switching.
            smllogger.warn("if samples and runs are provided, they must be of equal lengths")

    # 2. Generate targets from information in samplesheet
    if sampleinfo != "":
        smllogger.debug("trying to gather target information from configuration key 'sampleinfo'")
        if isinstance(sampleinfo, str) and not os.path.exists(sampleinfo):
            smllogger.debug("no such sample information file '{sampleinfo}'; trying to deduct targets from existing files".format(sampleinfo=sampleinfo))
        else:
            smllogger.debug("Reading sample information from '{sampleinfo}'".format(sampleinfo=sampleinfo))
            if isinstance(sampleinfo, str):
                # Read all lines up front so the reader stays usable after
                # the file has been closed
                with open(sampleinfo, 'r') as fh:
                    reader = csv.DictReader(fh.readlines())
            else:
                reader = sampleinfo
                assert isinstance(reader, csv.DictReader), "sampleinfo is not a 'csv.DictReader'; if not a file name, must be a 'csv.DictReader'"
            # Rename columns to regexp group names; unmapped columns keep
            # their original name
            reader.fieldnames = [sample_column_map.get(fn, fn) for fn in reader.fieldnames]
            if samples:
                tgts = [tgt_re.fmt.format(**row) + target_suffix for row in reader if row['SM'] in samples]
            else:
                tgts = [tgt_re.fmt.format(**row) + target_suffix for row in reader]
            return sorted(tgts)

    # 3. Generate targets from input files
    smllogger.debug("Getting sample information from input files")
    inputs = find_files(regexp=src_re.basename_pattern + filter_suffix, limit={'SM': samples} if samples else {})
    if inputs:
        tgts = [tgt_re.fmt.format(**src_re.parse(f)) + target_suffix for f in inputs]
        return sorted(tgts)
    smllogger.warn("No targets could be generated!")
    return []
def test_find_fastq_files(self, mock_walk):
    """Find fastq files using match.

    The mocked walk is primed with ``self.walk`` and ``find_files`` is
    called with a regular expression only; both fastq files are expected
    back.
    """
    mock_walk.return_value = self.walk
    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a warning at compile time. The pattern value
    # itself is unchanged.
    f = find_files(regexp=r"\w+.fastq.gz")
    self.assertListEqual(
        f,
        ['./1_121023_FLOWCELL_FOO.fastq.gz', './foo/1_121023_FLOWCELL_BAR.fastq.gz'],
    )