def _input(wildcards):
    try:
        _prefix = wildcards[prefix]
    except:
        _prefix = prefix
    try:
        _midfix = wildcards[midfix]
    except:
        _midfix = midfix
    try:
        _suffix = wildcards[suffix]
    except:
        _suffix = suffix
    if agg:
        level, _samples = self.find_level(_prefix)
        _sampleList = pd.DataFrame(_samples).to_dict('list')
        # Combine midfix and suffix and expand out any format strings
        _suffix = expand(_midfix + _suffix, **_sampleList, **self.config)
        # Return a list of sample ids by combining the format string for the
        # higher level with the full suffix.
        return list(set(expand(self.config[self.levelMap[level]] + '{suffix}',
                               suffix=_suffix, **_sampleList, **self.config)))
    else:
        return expand('{prefix}{midfix}{suffix}',
                      prefix=_prefix, midfix=_midfix, suffix=_suffix)

def generate_sensor_file_lists(configs):
    # Go through the configs and select those sensors with COMPUTE = True.
    # Also get TIME_SEGMENTS, and optionally TYPES, then create the expected
    # files. Return a list of (actual, expected) file path pairs for each
    # sensor listed in the config file. Added for Travis.

    # Initialize the file path strings for both expected and actual metric values
    segment = configs['TIME_SEGMENTS']['TYPE'].lower()
    print(segment)
    act_str = "data/processed/features/" + segment + "/{pid}/{sensor_key}.csv"
    exp_str = "tests/data/processed/features/" + segment + "/{pid}/{sensor_key}.csv"

    # List of available sensors that can be tested by the testing suite
    TESTABLE_SENSORS = ['PHONE_MESSAGES', 'PHONE_CALLS', 'PHONE_SCREEN',
                        'PHONE_BATTERY', 'PHONE_BLUETOOTH', 'PHONE_WIFI_VISIBLE',
                        'PHONE_WIFI_CONNECTED', 'PHONE_LIGHT',
                        'PHONE_APPLICATIONS_FOREGROUND',
                        'PHONE_ACTIVITY_RECOGNITION', 'PHONE_CONVERSATION']

    # Build the list of sensors to be tested.
    sensors = []
    for sensor in TESTABLE_SENSORS:
        if sensor in configs.keys():
            for provider in configs[sensor]["PROVIDERS"]:
                if configs[sensor]["PROVIDERS"][provider]["COMPUTE"]:
                    sensors.append(sensor.lower())

    act_file_list = expand(act_str, pid=configs["PIDS"], sensor_key=sensors)
    exp_file_list = expand(exp_str, pid=configs["PIDS"], sensor_key=sensors)
    sensor_file_lists = list(zip(act_file_list, exp_file_list))
    # sensor_file_lists[sensor] = list(zip(act_file_list, exp_file_list))

    return sensor_file_lists

def generate_file_list(configs, sensor):
    # Generates the list of files that would be produced for one sensor,
    # i.e. the sensor passed into the function.

    # Initialize the file path strings for both expected and actual metric values
    act_str = "data/processed/features/{pid}/{sensor}_{sensor_type}{time_segment}.csv"
    exp_str = "tests/data/processed/features/period/{pid}/{sensor}_{sensor_type}{time_segment}.csv"

    sensor_cap = sensor.upper()
    if 'TIME_SEGMENTS' in configs[sensor_cap] and 'FEATURES' in configs[sensor_cap]:
        sensor_type = []
        if 'TYPES' in configs[sensor_cap]:
            for each in configs[sensor_cap]['TYPES']:
                sensor_type.append(each + '_')

        act_file_list = expand(act_str, pid=configs["PIDS"], sensor=sensor,
                               sensor_type=sensor_type,
                               time_segment=configs[sensor_cap]["TIME_SEGMENTS"])
        exp_file_list = expand(exp_str, pid=configs["PIDS"], sensor=sensor,
                               sensor_type=sensor_type,
                               time_segment=configs[sensor_cap]["TIME_SEGMENTS"])

    return zip(act_file_list, exp_file_list)

def test_simple_expand():
    # single filepattern
    assert expand("{a}.out", a="test") == ["test.out"]
    # multiple filepatterns
    assert expand(["{a}.out", "{b}.out"], a="a", b="b") == ["a.out", "b.out"]
    # multiple wildcards
    assert expand("{a}.out", a=["1", "2", "3"]) == ["1.out", "2.out", "3.out"]
    # multiple wildcards and patterns
    assert expand(["{a}_{b}.ab", "{b}.b"], a="1 2".split(), b="3 4".split()) == [
        "1_3.ab",
        "1_4.ab",
        "2_3.ab",
        "2_4.ab",
        "3.b",
        "4.b",
    ]
    # replace product
    assert expand(["{a}_{b}.ab", "{b}.b"], zip, a="1 2".split(), b="3 4".split()) == [
        "1_3.ab",
        "2_4.ab",
        "3.b",
        "4.b",
    ]

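def test_expand_allow_missing_sketch():
    # Hedged sketch, not part of the original suite: allow_missing=True keeps
    # wildcards that were given no value, so a pattern can be partially filled
    # now and expanded again later. The expected list is an assumption based on
    # Snakemake's documented expand() semantics.
    assert expand("{sample}_{n}.fastq", n=[1, 2], allow_missing=True) == [
        "{sample}_1.fastq",
        "{sample}_2.fastq",
    ]
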
def samples2metadata_local(samples: List[str], config: dict, logger) -> dict:
    """
    (try to) get the metadata of local samples
    """
    sampledict = dict()
    for sample in samples:
        if os.path.exists(expand(f'{{fastq_dir}}/{sample}.{{fqsuffix}}.gz', **config)[0]):
            sampledict[sample] = dict()
            sampledict[sample]["layout"] = "SINGLE"
        elif all(os.path.exists(path) for path in
                 expand(f'{{fastq_dir}}/{sample}_{{fqext}}.{{fqsuffix}}.gz', **config)):
            sampledict[sample] = dict()
            sampledict[sample]["layout"] = "PAIRED"
        elif sample.startswith(('GSM', 'SRX', 'SRR', 'ERR', 'DRR')):
            continue
        else:
            logger.error(
                f"\nsample {sample} was not found..\n"
                f"We checked for SE file:\n"
                f"\t{config['fastq_dir']}/{sample}.{config['fqsuffix']}.gz \n"
                f"and for PE files:\n"
                f"\t{config['fastq_dir']}/{sample}_{config['fqext1']}.{config['fqsuffix']}.gz \n"
                f"\t{config['fastq_dir']}/{sample}_{config['fqext2']}.{config['fqsuffix']}.gz \n"
                f"and since the sample did not start with either GSM, SRX, SRR, ERR, and DRR we "
                f"couldn't find it online..\n")
            raise TerminatedException

    return sampledict

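# Hedged illustration of the config-driven expansion above (the config values are
# invented): **config fills {fastq_dir}, {fqext} and {fqsuffix}, while the sample
# name is already baked into the f-string, so a single call covers both mates.
#
#   cfg = {"fastq_dir": "fastq", "fqext": ["R1", "R2"], "fqsuffix": "fastq"}
#   expand('{fastq_dir}/sampleA_{fqext}.{fqsuffix}.gz', **cfg)
#   # -> ["fastq/sampleA_R1.fastq.gz", "fastq/sampleA_R2.fastq.gz"]
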
def get_output_files_phy_names(branchlbl_dir, mod_dir, phy_dir, labelled_models,
                               mod_ext='.mod', phy_ext='.phy', out_ext='.out',
                               blbl_ext='.txt'):
    """Return the control filenames for a set of models and phy alignment files.
    If a model requires a labelled branch (i.e. the model name is in the
    labelled_models list), a control file for each species group present in
    branchlbl_dir will be generated."""
    # create and populate the lists of labelled and unlabelled models
    lbl_mod_wcs = []
    mod_wcs = []
    for mod in os.listdir(mod_dir):
        name, ext = os.path.splitext(mod)
        if ext == mod_ext and name in labelled_models:
            lbl_mod_wcs.append(name)
        elif ext == mod_ext:
            mod_wcs.append(name)

    # get the list of phy file names (no extension)
    phy_wcs = [os.path.splitext(f)[0] for f in os.listdir(phy_dir)
               if os.path.splitext(f)[1] == phy_ext]

    # get the phy file name to group mapping, where only phy files that contain
    # at least one species are present in the map
    phy_group_map = get_phy_to_group_mapping(branchlbl_dir, phy_dir, phy_ext)

    # return the list of output filenames for each model and each phy file,
    # for both labelled and unlabelled models
    labelled_model_outputs = expand('{mod}/{phy_group_map}.{mod}{ext}',
                                    mod=lbl_mod_wcs, phy_group_map=phy_group_map,
                                    ext=out_ext)
    unlabelled_model_outputs = expand('{mod}/{phy}/{phy}.{mod}{ext}',
                                      mod=mod_wcs, phy=phy_wcs, ext=out_ext)
    return labelled_model_outputs + unlabelled_model_outputs

def generate_sensor_file_lists(self):
    # Go through the configs and select those sensors with COMPUTE = True.
    # Also get TIME_SEGMENTS, then create the expected files. Return a list of
    # (actual, expected) file path pairs for each sensor listed in the config
    # file. Added for Travis.

    # Initialize the file path strings for both expected and actual metric values
    segment = self.configs['TIME_SEGMENTS']['TYPE'].lower()
    act_str = "data/processed/features/{pid}/{sensor_key}.csv"
    exp_str = "tests/data/processed/features/" + segment + "/{pid}/{sensor_key}.csv"

    # Build the list of sensors to be tested.
    sensors = []
    for sensor in self.configs:
        if "PROVIDERS" in self.configs[sensor] and self.configs[sensor]["PROVIDERS"] is not None:
            for provider in self.configs[sensor]["PROVIDERS"]:
                if self.configs[sensor]["PROVIDERS"][provider]["COMPUTE"]:
                    sensors.append(sensor.lower())

    act_file_list = expand(act_str, pid=self.configs["PIDS"], sensor_key=sensors)
    exp_file_list = expand(exp_str, pid=self.configs["PIDS"], sensor_key=sensors)
    sensor_file_lists = list(zip(act_file_list, exp_file_list))

    return sensor_file_lists

def generate_sensor_file_lists(configs):
    # Go through the configs and select those sensors with DAY_SEGMENTS and
    # optionally TYPES, then create the expected files. Return a dictionary
    # mapping each sensor listed in the config file to its list of
    # (actual, expected) file path pairs. Added for Travis.

    # Initialize the file path strings for both expected and actual metric values
    act_str = "data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
    exp_str = "tests/data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"

    # Get all the SENSORS in the config.yaml files
    sensors = configs['SENSORS']
    sensor_file_lists = {}

    # Loop through all sensors and create the actual and expected file paths
    for sensor in sensors:
        if sensor == 'messages':
            sensor = 'sms'
            sensor_cap = 'SMS'
        else:
            sensor_cap = sensor.upper()
        if 'DAY_SEGMENTS' in configs[sensor_cap]:
            sensor_type = []
            if 'TYPES' in configs[sensor_cap]:
                for each in configs[sensor_cap]['TYPES']:
                    sensor_type.append(each + '_')

            if sensor_type:
                act_file_list = expand(act_str, pid=configs["PIDS"], sensor=sensor,
                                       sensor_type=sensor_type,
                                       day_segment=configs[sensor_cap]["DAY_SEGMENTS"])
                exp_file_list = expand(exp_str, pid=configs["PIDS"], sensor=sensor,
                                       sensor_type=sensor_type,
                                       day_segment=configs[sensor_cap]["DAY_SEGMENTS"])
            else:
                act_file_list = expand(act_str, pid=configs["PIDS"], sensor=sensor,
                                       sensor_type='',
                                       day_segment=configs[sensor_cap]["DAY_SEGMENTS"])
                exp_file_list = expand(exp_str, pid=configs["PIDS"], sensor=sensor,
                                       sensor_type='',
                                       day_segment=configs[sensor_cap]["DAY_SEGMENTS"])

            sensor_file_lists[sensor_cap] = list(zip(act_file_list, exp_file_list))

    return sensor_file_lists

def generate_database_targets(config, also_return_database_names=False):
    database_targets, database_names = [], []

    ## integrate user settings and inputs
    config = integrate_user_config(config)

    ## What alphabets are we using? ##
    alphabet_info = config["alphabet_info"]

    ## What databases are we using? ##
    databases = config["databases"]
    # get all database details
    database_info = config["database_info"]

    # default filenaming for each database
    # variables: db_name, alphabet, ksize, db_type, suffix
    db_target_templates = config["database_target_template"]
    info_templates = db_target_templates["info_csv"]
    db_templates = db_target_templates["database"]

    # iterate through dbinfo and build targets for the alphabets we're using
    for db in databases:
        db_targs, db_names = [], []
        db_info = config["database_info"][db]
        for db_alphabet, db_alpha_info in db_info["alphabets"].items():
            if db_alphabet in alphabet_info.keys():
                for db_ksize, dbs in db_alpha_info.items():
                    ksize_int = int(db_ksize[1:])  # db_ksize is a string with format: k{ksize}
                    # only build target if db has matching ksize available.
                    # todo: also handle scaled here???
                    if ksize_int in alphabet_info[db_alphabet]["ksizes"]:
                        for db_type in dbs.keys():
                            suffix = config["database_suffixes"][db_type]
                            db_filenames = expand(db_templates, db_name=db,
                                                  alphabet=db_alphabet,
                                                  ksize=db_ksize,
                                                  db_type=db_type, suffix=suffix)
                            # generate db_name, needed for workflow targets.
                            # sigh, don't like this - do it better.
                            end = f".{db_type}.{suffix}"
                            names = [fn.rsplit(end)[0] for fn in db_filenames]
                            db_targs += db_filenames
                            db_names += names

        # if we have any targets for this database name, also grab the info csv target
        if db_targs:
            # also get the db info csv
            db_info = expand(info_templates, db_name=db)
            db_targs += db_info

        # add targets for this database
        database_targets += db_targs
        database_names += db_names

    database_dir = config["database_dir"]
    final_db_targs = [os.path.join(database_dir, x) for x in database_targets]

    if also_return_database_names:
        return final_db_targs, database_names
    return final_db_targs

def generate_targets(config, samples, output_dir="", generate_db_targets=False):
    pipeline_targets = []

    ## integrate user settings and inputs
    config = integrate_user_config(config)

    ## What alphabets are we using? ##
    alphabet_info = config["alphabet_info"]

    ## set run basename
    basename = config.get("basename", "thumper-output")

    ## What databases are we using? ##
    # Pipeline:: find steps in this pipeline
    pipeline = config["pipeline"]
    db_required = config["pipelines"][pipeline]["databases_required"]
    if db_required:
        database_targets, database_names = generate_database_targets(
            config, also_return_database_names=True)
    else:
        generate_db_targets = False
        database_targets, database_names = [], []

    index_names = []
    if pipeline == "generate_index":
        for alpha, alphaInfo in alphabet_info.items():
            index_names += expand("{basename}.{alpha}-k{ksize}.scaled{scaled}",
                                  basename=basename, alpha=alpha,
                                  ksize=alphaInfo["ksizes"],
                                  scaled=alphaInfo["scaled"])

    # generate targets for each step
    steps = config["pipelines"][pipeline]["steps"]
    for step in steps:
        step_outdir = config[step]["output_dir"]
        step_files = config[step]["output_files"]

        # fill variables in the output filenames
        for stepF in step_files:
            pipeline_targets += expand(os.path.join(output_dir, step_outdir, stepF),
                                       sample=samples, database=database_names,
                                       basename=basename,
                                       db_name=config.get("databases", []),
                                       index=index_names)

    if generate_db_targets:
        targets = database_targets + pipeline_targets
        return targets

    return pipeline_targets

def update(d, u, c):
    for k, v in u.items():
        if isinstance(v, collections.Mapping):
            r = update(d.get(k, {}), v, c)
            d[k] = r
        else:
            if isinstance(fill, pd.DataFrame):
                d[k] = list(set(expand(u[k], zip, **fill.to_dict("list"))))
            else:
                d[k] = list(set(expand(u[k], c, **fill)))
            if not d[k]:
                d[k] = [u[k]]
    return d

def generate_sensor_file_lists(config):
    # Go through the configs and select those sensors with COMPUTE = True.
    # Also get DAY_SEGMENTS, and optionally TYPES, then create the expected
    # files. Return a dictionary mapping each sensor listed in the config file
    # to its list of (actual, expected) file path pairs. Added for Travis.

    # Initialize the file path strings for both expected and actual metric values
    act_str = "data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
    exp_str = "tests/data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"

    # List of available sensors that can be tested by the testing suite
    TESTABLE_SENSORS = ['MESSAGES', 'CALLS', 'SCREEN', 'BATTERY', 'BLUETOOTH',
                        'WIFI', 'LIGHT', 'APPLICATIONS_FOREGROUND',
                        'ACTIVITY_RECOGNITION', 'CONVERSATION']

    # Build the list of sensors to be tested.
    sensors = []
    for sensor in TESTABLE_SENSORS:
        if config[sensor]["COMPUTE"]:
            sensors.append(sensor)

    sensor_file_lists = {}

    # Loop through all sensors and create the actual and expected file paths
    for sensor in sensors:
        if 'DAY_SEGMENTS' in config[sensor]:
            sensor_type = []
            if 'TYPES' in config[sensor]:
                for each in config[sensor]['TYPES']:
                    sensor_type.append(each + '_')

            lower_sensor = sensor.lower()
            if sensor_type:
                act_file_list = expand(act_str, pid=config["PIDS"],
                                       sensor=lower_sensor, sensor_type=sensor_type,
                                       day_segment=config[sensor]["DAY_SEGMENTS"])
                exp_file_list = expand(exp_str, pid=config["PIDS"],
                                       sensor=lower_sensor, sensor_type=sensor_type,
                                       day_segment=config[sensor]["DAY_SEGMENTS"])
            else:
                act_file_list = expand(act_str, pid=config["PIDS"],
                                       sensor=lower_sensor, sensor_type='',
                                       day_segment=config[sensor]["DAY_SEGMENTS"])
                exp_file_list = expand(exp_str, pid=config["PIDS"],
                                       sensor=lower_sensor, sensor_type='',
                                       day_segment=config[sensor]["DAY_SEGMENTS"])

            sensor_file_lists[sensor] = list(zip(act_file_list, exp_file_list))

    return sensor_file_lists

def getFilePerSample(samples, sampleSheet, form1, form2, **kwargs):
    """
    This function generates a list of filenames.
    For each sample a filename will be generated according to form1.
    If a sample is paired-end, form2 is used instead and a filename
    will be generated for both ends.
    """
    out = []
    for x in samples:
        if isSingleEnd(x, sampleSheet):
            out += expand(form1, sample=x, **kwargs)
        else:
            out += expand(form2, sample=x, group=[1, 2], **kwargs)
    return out

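# Hypothetical usage sketch for getFilePerSample (the sample names, sheet and the
# two filename patterns are invented): a single-end sample yields one file from
# form1, while a paired-end sample yields one file per mate from form2.
#
#   getFilePerSample(["s1", "s2"], sampleSheet,
#                    "trimmed/{sample}.fastq.gz",
#                    "trimmed/{sample}_R{group}.fastq.gz")
#   # -> ["trimmed/s1.fastq.gz",
#   #     "trimmed/s2_R1.fastq.gz", "trimmed/s2_R2.fastq.gz"]   (if s1 is SE, s2 is PE)
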
def aggregate_input(wildcards):
    ops = base_checkpoint_obj.get(**wildcards).output
    checkpoint_output = _output_accessor(ops, output_key)
    expand_base_rule = os.path.join(checkpoint_output, base_rule)
    expand_target_rule = target_rule or expand_base_rule
    return expand(expand_target_rule,
                  **glob_wildcards(expand_base_rule)._asdict())

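# Minimal sketch of the checkpoint-aggregation idiom above (the paths are
# invented): glob_wildcards() recovers the wildcard values present in the
# checkpoint's output directory, and expand() maps them onto the target pattern
# via the **_asdict() keyword arguments.
#
#   wc = glob_wildcards("clustering/{cluster}/{sample}.txt")
#   expand("summaries/{cluster}/{sample}.csv", **wc._asdict())
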
def get_filt_logs(sample, unit, seq_type, config, d):
    if not config["preprocessing"]["phix_filter"]:
        return []
    files = expand(opj(config["paths"]["results"], "intermediate", "preprocess",
                       "{sample}_{unit}_PHIX_{seq_type}{s}.log"),
                   sample=sample, unit=unit, seq_type=seq_type, s=d["phixfilt"])
    return files

def _expand(self, template, wc=None):
    if wc is None:
        wc = {}
    if isinstance(template, str):
        template = [template]
    names = set()
    for item in template:
        names |= get_wildcard_names(item)

    sources = [wc]
    try:
        ds = self.getDatasetFromDir(wc.dir)
        sources += [ds]
    except:
        pass
    sources += [self]

    fields = {}
    for name in names:
        for source in sources:
            if name in dir(source):
                fields[name] = getattr(source, name)
                break
        if name not in fields:
            fields[name] = "{{{}}}".format(name)

    res = expand(template, **fields)
    return res

def get_sortmerna_logs(sample, unit, seq_type, config):
    if not config["preprocessing"]["sortmerna"]:
        return []
    files = expand(opj(config["paths"]["results"], "intermediate", "preprocess",
                       "{sample}_{unit}_{seq_type}.sortmerna.log"),
                   sample=sample, unit=unit, seq_type=seq_type)
    return files

def get_fns_analysis(wildcards):
    fns = []
    re_fn = re.compile(regex(str(source_pattern)))
    for fn in source_fkt(wildcards):
        match = re.match(re_fn, fn).groupdict()
        pattern = strip_wildcard_constraints(str(target_pattern))
        fns.append(expand(pattern, **match, **extra_wildcards,
                          allow_missing=True)[0])
    return fns

def check_classifiers(config):
    """
    Set paths and params specific to classifiers

    :param config: Snakemake config
    :return: Updated config dict
    """
    # Add read-based config info
    config["centrifuge"]["index_path"] = ""
    config["centrifuge"]["base"] = ""
    config["centrifuge"]["dir"] = ""

    if config["classification"]["centrifuge"]:
        # Check if custom database exists
        custom = expand("{b}.{i}.cf", b=config["centrifuge"]["custom"], i=[1, 2, 3])
        if list(set([os.path.exists(x) for x in custom]))[0]:
            config["centrifuge"]["index_path"] = config["centrifuge"]["custom"]
        # If not, use prebuilt default
        else:
            p = config["centrifuge"]["prebuilt"]
            config["centrifuge"]["index_path"] = opj("resources", "centrifuge", p)
        # Set centrifuge index config variables
        index_path = config["centrifuge"]["index_path"]
        config["centrifuge"]["dir"] = os.path.dirname(index_path)
        config["centrifuge"]["base"] = bn(index_path)

    config["kraken"]["index_path"] = ""
    config["kraken"]["mem"] = ""
    if config["classification"]["kraken"]:
        # Check if custom database exists
        custom = expand(opj(config["kraken"]["custom"], "{n}.k2d"),
                        n=["hash", "opts", "taxo"])
        if list(set(os.path.exists(x) for x in custom))[0]:
            config["kraken"]["index_path"] = config["kraken"]["custom"]
        # If not, use prebuilt or standard
        elif config["kraken"]["standard_db"]:
            config["kraken"]["index_path"] = opj("resources", "kraken", "standard")
        else:
            config["kraken"]["index_path"] = opj("resources", "kraken", "prebuilt",
                                                 config["kraken"]["prebuilt"])
        if config["kraken"]["reduce_memory"]:
            config["kraken"]["mem"] += "--memory-mapping"
    return config

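# Hedged illustration of the existence check above (the index basename is
# invented): expanding the basename against the three centrifuge index suffixes
# yields the files whose presence decides between the custom and prebuilt paths.
#
#   expand("{b}.{i}.cf", b="/db/my_index", i=[1, 2, 3])
#   # -> ["/db/my_index.1.cf", "/db/my_index.2.cf", "/db/my_index.3.cf"]
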
def get_fastqc_files(sample, unit, pairs, config, pre):
    """Get all fastqc output"""
    if config["preprocessing"]["fastqc"]:
        files = expand(opj(config["paths"]["results"], "intermediate", "fastqc",
                           "{sample}_{unit}_{pair}{PREPROCESS}_fastqc.zip"),
                       sample=sample, unit=unit, pair=pairs, PREPROCESS=pre)
        return files
    return []

def download_file(workflow, data, install_dir):
    if workflow in data.keys():
        for file_name, url_string in data[workflow].items():
            try:
                url = urlparse(url_string)
            except Exception as e:
                # we don't care since some of the JSONs are not URLs
                pass
            if file_name == 'sbttar':
                # sourmash files from the taxonomic classification workflow.
                tar_file = data[workflow]['sbttar']
                db = data[workflow]['databases']
                kv = data[workflow]['kvalue']
                sbturl = data[workflow]['sbturl']
                sourmash_files = expand(tar_file, database=db, kvalue=kv)
                for file in sourmash_files:
                    if not os.path.isfile(install_dir + "/" + file):
                        print("\nDownloading " + file + " from " + sbturl)
                        try:
                            urllib.request.urlretrieve(
                                "http://" + sbturl + '/' + file,
                                install_dir + "/" + file, reporthook)
                        except SocketError as e:
                            print("Error downloading file " + file + ". Retry script.")
                            print(e)
                            try:
                                os.remove(install_dir + "/" + file)
                            except OSError:
                                pass
            elif url.scheme == "http" or url.scheme == "https":
                # download via http
                if not os.path.isfile(os.path.join(install_dir, file_name)):
                    print("Downloading " + file_name + " from " + url_string)
                    try:
                        urllib.request.urlretrieve(
                            url_string, install_dir + "/" + file_name, reporthook)
                    except SocketError as e:
                        print("Error downloading file " + file_name + ". Retry script.")
                        print(e)
                        try:
                            os.remove(install_dir + "/" + file_name)
                        except OSError:
                            pass
            elif url.scheme == 'docker':
                # download singularity image
                if not os.path.isfile("../container_images/" + file_name):
                    print("Downloading singularity image " + file_name)
                    sing_command = "singularity pull " + url_string
                    subprocess.run([sing_command], shell=True)
                    # TODO: Error handling for sing pull
                    os.rename(file_name, "../container_images/" + file_name)
            elif url.scheme == "file":
                # copy file from local location
                if not os.path.isfile(os.path.join(install_dir, file_name)):
                    print("Copying " + file_name)
                    copyfile(".." + url.path, install_dir + "/" + file_name)

def concat_and_save(suffix, out_p):
    print(suffix)
    print(out_p)
    fps_ser = metadata_table.apply(
        lambda ser: expand(prefix + suffix, **ser)[0], axis=1
    )
    concat_df = pd.concat([pd.read_pickle(fp) for fp in fps_ser])
    concat_df.to_pickle(out_p)
    concat_df.to_csv(fp_to_tsv(out_p, "p"), sep="\t", header=True, index=False)
    concat_df.columns = concat_df.columns.astype(str)
    concat_df.to_parquet(fp_to_parquet(out_p, "p"))

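# Hedged illustration of the per-row expand() call above (the column names are
# invented): because **ser passes a scalar value for every wildcard, expand()
# returns a single, fully resolved path, hence the trailing [0].
#
#   ser = pd.Series({"pid": "p01", "sensor": "light"})
#   expand("data/{pid}/{sensor}.p", **ser)[0]   # -> "data/p01/light.p"
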
def func(wc):
    try:
        wc.sample
    except AttributeError:
        raise ValueError('Need "{{sample}}" in pattern '
                         '"{pattern}"'.format(pattern=pattern))
    n = [1]
    if is_paired_end(sampletable, wc.sample) and not r1_only:
        n = [1, 2]
    res = expand(pattern, sample=wc.sample, n=n)
    return res

def getNonSplitCountFiles(self, dataset):
    """
    Get all dummy count filenames for non-split counts

    :param dataset: DROP group name from wildcard
    :return: list of files
    """
    ids = self.sa.getIDsByGroup(dataset, assay="RNA")
    file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / \
        "cache" / f"raw-{dataset}" / "sample_tmp" / "nonSplitCounts"
    done_files = str(file_stump / "sample_{sample_id}.done")
    return expand(done_files, sample_id=ids)

def get_trim_logs(sample, unit, pairs, config, d):
    if not config["preprocessing"]["trimmomatic"] and not \
            config["preprocessing"]["cutadapt"]:
        return []
    if config["preprocessing"]["trimmomatic"]:
        trimmer = "trimmomatic"
    else:
        trimmer = "cutadapt"
    files = expand(opj(config["paths"]["results"], "intermediate", "preprocess",
                       "{sample}_{unit}_{pair}{s}.{trimmer}.log"),
                   sample=sample, unit=unit, pair=pairs, s=d["trimming"],
                   trimmer=trimmer)
    return files

def getCountFiles(self, annotation, group):
    """
    Get all count files from DROP (counted from BAM file) and external count matrices

    :param annotation: annotation name from wildcard
    :param group: DROP group name from wildcard
    :return: list of files
    """
    bam_IDs = self.sa.getIDsByGroup(group, assay="RNA")
    file_stump = self.processedDataDir / "aberrant_expression" / annotation / \
        "counts" / "{sampleID}.Rds"
    count_files = expand(str(file_stump), sampleID=bam_IDs)
    extCountFiles = self.sa.getImportCountFiles(annotation, group,
                                                file_type="GENE_COUNTS_FILE")
    count_files.extend(extCountFiles)
    return count_files

def download_sourmash_files(data, workflow, install_dir):
    tar_file = data[workflow]['sbttar']
    db = data[workflow]['databases']
    kv = data[workflow]['kvalue']
    sbturl = data[workflow]['sbturl']
    sourmash_files = expand(tar_file, database=db, kvalue=kv)
    for file in sourmash_files:
        if not os.path.isfile(install_dir + "/" + file):
            print("\nDownloading " + file + " from " + sbturl)
            try:
                urllib.request.urlretrieve("http://" + sbturl + '/' + file,
                                           install_dir + "/" + file, reporthook)
            except SocketError as e:
                print("Error downloading file " + file + ". Retry script.")
                print(e)

def dynamic_branch(self, wildcards, input=True):
    def get_io(rule):
        return (rule.input, rule.dynamic_input) if input \
            else (rule.output, rule.dynamic_output)

    io, dynamic_io = get_io(self)
    branch = Rule(self)
    io_, dynamic_io_ = get_io(branch)

    expansion = defaultdict(list)
    for i, f in enumerate(io):
        if f in dynamic_io:
            try:
                for e in reversed(expand(f, zip, **wildcards)):
                    expansion[i].append(IOFile(e, rule=branch))
            except KeyError:
                return None

    # replace the dynamic files with the expanded files
    replacements = [(i, io[i], e)
                    for i, e in reversed(list(expansion.items()))]
    for i, old, exp in replacements:
        dynamic_io_.remove(old)
        io_.insert_items(i, exp)

    if not input:
        for i, old, exp in replacements:
            if old in branch.temp_output:
                branch.temp_output.discard(old)
                branch.temp_output.update(exp)
            if old in branch.protected_output:
                branch.protected_output.discard(old)
                branch.protected_output.update(exp)
            if old in branch.touch_output:
                branch.touch_output.discard(old)
                branch.touch_output.update(exp)

        branch.wildcard_names.clear()
        non_dynamic_wildcards = dict((name, values[0])
                                     for name, values in wildcards.items()
                                     if len(set(values)) == 1)
        # TODO have a look into how to concretize dependencies here
        (branch._input, branch._output, branch._params, branch._log,
         branch._benchmark, _,
         branch.dependencies) = branch.expand_wildcards(
             wildcards=non_dynamic_wildcards)
        return branch, non_dynamic_wildcards
    return branch

def getFiles(self, filename, datasets=None, **kwargs):
    """
    Determine files for export count groups.

    :param filename: name of file
    :return: list of export files
    """
    if datasets is None:
        datasets = self.getExportGroups()
    file_pattern = str(self.pattern / f"{filename}")
    return expand(file_pattern, dataset=datasets,
                  annotation=self.geneAnnotations,
                  genomeAssembly=self.genomeAssembly, **kwargs)

def test_expand_call_arguments():
    target_folder = nj("/data/riksdagen_corpus_data/riksdagen-corpus-exports/speech_xml")
    source_folder = nj("/data/riksdagen_corpus_data/riksdagen-corpus/corpus/")
    extension = "xml"
    years, basenames = glob_wildcards(
        jj(source_folder, "{year}", f"{{file}}.{extension}"))
    filenames = expand(jj(target_folder, '{year}', f'{{basename}}.{extension}'),
                       zip, year=years, basename=basenames)
    assert len(filenames) == len(years)

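def test_expand_zip_pairs_sketch():
    # Hedged sketch, not from the original suite: with the zip combinator the
    # wildcards are paired element-wise instead of forming a Cartesian product,
    # so the number of filenames equals the number of (year, basename) pairs.
    assert expand("{year}/{basename}.xml", zip,
                  year=["1920", "1921"], basename=["a", "b"]) == [
        "1920/a.xml",
        "1921/b.xml",
    ]
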
def annotation_input(config, assemblies):
    input = []
    if not config["assembly"]["megahit"] and not config["assembly"]["metaspades"]:
        return input
    for group in assemblies.keys():
        # Add orfcalling results
        input.append(opj(config["paths"]["results"], "annotation", group,
                         "final_contigs.gff"))
        if config["annotation"]["infernal"]:
            input.append(opj(config["paths"]["results"], "annotation", group,
                             "final_contigs.cmscan"))
        if config["annotation"]["tRNAscan"]:
            input.append(opj(config["paths"]["results"], "annotation", group,
                             "tRNA.out"))
        # Add EGGNOG annotation
        if config["annotation"]["eggnog"]:
            input += expand(opj(config["paths"]["results"], "annotation", group,
                                "{db}.parsed.{fc}.tsv"),
                            db=["enzymes", "pathways", "kos", "modules"],
                            fc=["raw", "tpm"])
        # Add PFAM annotation
        if config["annotation"]["pfam"]:
            input += expand(opj(config["paths"]["results"], "annotation", group,
                                "pfam.parsed.{fc}.tsv"),
                            fc=["tpm", "raw"])
        # Add taxonomic annotation
        if config["annotation"]["taxonomy"]:
            input += expand(opj(config["paths"]["results"], "annotation", group,
                                "taxonomy", "tax.{fc}.tsv"),
                            fc=["tpm", "raw"])
        # Add Resistance Gene Identifier output
        if config["annotation"]["rgi"]:
            input += expand(opj(config["paths"]["results"], "annotation", group,
                                "rgi.{fc}.tsv"),
                            fc=["raw", "tpm"])
            input.append(opj(config["paths"]["results"], "annotation", group,
                             "rgi.out.txt"))
    return input

def dynamic_branch(self, wildcards, input=True):
    def get_io(rule):
        return (rule.input, rule.dynamic_input) if input \
            else (rule.output, rule.dynamic_output)

    def partially_expand(f, wildcards):
        """Expand the wildcards in f from the ones present in wildcards

        This is done by replacing all wildcard delimiters by `{{` or `}}`
        that are not in `wildcards.keys()`.
        """
        # perform the partial expansion from f's string representation
        s = str(f).replace('{', '{{').replace('}', '}}')
        for key in wildcards.keys():
            s = s.replace('{{{{{}}}}}'.format(key), '{{{}}}'.format(key))
        # build result
        anno_s = AnnotatedString(s)
        anno_s.flags = f.flags
        return IOFile(anno_s, f.rule)

    io, dynamic_io = get_io(self)
    branch = Rule(self)
    io_, dynamic_io_ = get_io(branch)

    expansion = defaultdict(list)
    for i, f in enumerate(io):
        if f in dynamic_io:
            f = partially_expand(f, wildcards)
            try:
                for e in reversed(expand(f, zip, **wildcards)):
                    # need to clone the flags so intermediate
                    # dynamic remote file paths are expanded and
                    # removed appropriately
                    ioFile = IOFile(e, rule=branch)
                    ioFile.clone_flags(f)
                    expansion[i].append(ioFile)
            except KeyError:
                return None

    # replace the dynamic files with the expanded files
    replacements = [(i, io[i], e)
                    for i, e in reversed(list(expansion.items()))]
    for i, old, exp in replacements:
        dynamic_io_.remove(old)
        io_.insert_items(i, exp)

    if not input:
        for i, old, exp in replacements:
            if old in branch.temp_output:
                branch.temp_output.discard(old)
                branch.temp_output.update(exp)
            if old in branch.protected_output:
                branch.protected_output.discard(old)
                branch.protected_output.update(exp)
            if old in branch.touch_output:
                branch.touch_output.discard(old)
                branch.touch_output.update(exp)

        branch.wildcard_names.clear()
        non_dynamic_wildcards = dict((name, values[0])
                                     for name, values in wildcards.items()
                                     if len(set(values)) == 1)
        # TODO have a look into how to concretize dependencies here
        (branch._input, branch._output, branch._params, branch._log,
         branch._benchmark, _,
         branch.dependencies) = branch.expand_wildcards(
             wildcards=non_dynamic_wildcards)
        return branch, non_dynamic_wildcards
    return branch