Example #1
        def _input(wildcards):

            try:
                _prefix = wildcards[prefix]
            except Exception:
                _prefix = prefix

            try:
                _midfix = wildcards[midfix]
            except Exception:
                _midfix = midfix

            try:
                _suffix = wildcards[suffix]
            except Exception:
                _suffix = suffix

            if agg:
                level, _samples = self.find_level(_prefix)
                _sampleList = pd.DataFrame(_samples).to_dict('list')

                # Combine midfix and suffix and expand out any format strings
                _suffix = expand(_midfix + _suffix, **_sampleList, **self.config)

                # Return a list of sample ids by combining the format string for the higher level with the full suffix.
                return list(set(expand(self.config[self.levelMap[level]] + '{suffix}', suffix=_suffix, **_sampleList, **self.config)))
            else:
                return expand('{prefix}{midfix}{suffix}', prefix=_prefix, midfix=_midfix, suffix=_suffix)
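A minimal sketch (with hypothetical prefix/midfix/suffix values) of what the non-aggregate branch above returns: expand fills each keyword wildcard and yields a list of concrete paths.
from snakemake.io import expand

# hypothetical values, for illustration only
print(expand('{prefix}{midfix}{suffix}',
             prefix='results/sampleA', midfix='.sorted', suffix='.bam'))
# ['results/sampleA.sorted.bam']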
Example #2
def generate_sensor_file_lists(configs):
    # Go through the configs and select those sensors with COMPUTE = True.
    # Also get TIME_SEGMENTS, and optionally TYPES, then build the expected
    # files. Return a list of (actual, expected) file path pairs for each
    # sensor enabled in the config file. Added for Travis.

    # Initialize string of file path for both expected and actual metric values
    segment = configs['TIME_SEGMENTS']['TYPE'].lower()
    print(segment)
    act_str = "data/processed/features/"+segment+"/{pid}/{sensor_key}.csv"
    exp_str = "tests/data/processed/features/"+segment+"/{pid}/{sensor_key}.csv"

    # List of available sensors that can be tested by the testing suite
    TESTABLE_SENSORS = ['PHONE_MESSAGES', 'PHONE_CALLS', 'PHONE_SCREEN', 'PHONE_BATTERY', 'PHONE_BLUETOOTH', 'PHONE_WIFI_VISIBLE', 'PHONE_WIFI_CONNECTED', 'PHONE_LIGHT', 'PHONE_APPLICATIONS_FOREGROUND', 'PHONE_ACTIVITY_RECOGNITION', 'PHONE_CONVERSATION']

    # Build list of sensors to be tested. 
    sensors = []
    for sensor in TESTABLE_SENSORS:
        if sensor in configs.keys():
            for provider in configs[sensor]["PROVIDERS"]:
                if configs[sensor]["PROVIDERS"][provider]["COMPUTE"]:
                    sensors.append(sensor.lower())

    act_file_list = expand(act_str, pid=configs["PIDS"], sensor_key=sensors)
    exp_file_list = expand(exp_str, pid=configs["PIDS"], sensor_key=sensors)
    sensor_file_lists = list(zip(act_file_list, exp_file_list))
    #sensor_file_lists[sensor] = list(zip(act_file_list,exp_file_list))

    return sensor_file_lists
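For illustration (hypothetical PIDS, segment and sensor list), the two expand calls above produce parallel lists that zip into (actual, expected) pairs:
from snakemake.io import expand

act = expand("data/processed/features/daily/{pid}/{sensor_key}.csv",
             pid=["p01", "p02"], sensor_key=["phone_calls"])
exp = expand("tests/data/processed/features/daily/{pid}/{sensor_key}.csv",
             pid=["p01", "p02"], sensor_key=["phone_calls"])
print(list(zip(act, exp)))
# [('data/processed/features/daily/p01/phone_calls.csv',
#   'tests/data/processed/features/daily/p01/phone_calls.csv'),
#  ('data/processed/features/daily/p02/phone_calls.csv',
#   'tests/data/processed/features/daily/p02/phone_calls.csv')]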
Example #3
def generate_file_list(configs, sensor):
    # Generates the list of files that would be produced for one sensor,
    # i.e., the sensor passed into the function.

    # Initialize string of file path for both expected and actual metric values
    act_str = "data/processed/features/{pid}/{sensor}_{sensor_type}{time_segment}.csv"
    exp_str = "tests/data/processed/features/period/{pid}/{sensor}_{sensor_type}{time_segment}.csv"
    
    sensor_cap = sensor.upper()
    sensor_type = []
    if 'TIME_SEGMENTS' in configs[sensor_cap] and 'FEATURES' in configs[sensor_cap]:
        if 'TYPES' in configs[sensor_cap]:
            for each in configs[sensor_cap]['TYPES']:
                sensor_type.append(each + '_')

    act_file_list = expand(act_str,pid=configs["PIDS"],
                                   sensor = sensor,
                                   sensor_type = sensor_type,
                                   time_segment = configs[sensor_cap]["TIME_SEGMENTS"])
    
    exp_file_list = expand(exp_str,pid=configs["PIDS"],
                                   sensor = sensor,
                                   sensor_type = sensor_type,
                                   time_segment = configs[sensor_cap]["TIME_SEGMENTS"])

    return zip(act_file_list, exp_file_list)
Example #4
def test_simple_expand():
    # single filepattern
    assert expand("{a}.out", a="test") == ["test.out"]
    # multiple filepatterns
    assert expand(["{a}.out", "{b}.out"], a="a", b="b") == ["a.out", "b.out"]
    # multiple wildcards
    assert expand("{a}.out", a=["1", "2", "3"]) == ["1.out", "2.out", "3.out"]
    # multiple wildcards and patterns
    assert expand(["{a}_{b}.ab", "{b}.b"], a="1 2".split(),
                  b="3 4".split()) == [
                      "1_3.ab",
                      "1_4.ab",
                      "2_3.ab",
                      "2_4.ab",
                      "3.b",
                      "4.b",
                  ]
    # replace product
    assert expand(["{a}_{b}.ab", "{b}.b"],
                  zip,
                  a="1 2".split(),
                  b="3 4".split()) == [
                      "1_3.ab",
                      "2_4.ab",
                      "3.b",
                      "4.b",
                  ]
Example #5
def samples2metadata_local(samples: List[str], config: dict, logger) -> dict:
    """
    (try to) get the metadata of local samples
    """
    sampledict = dict()
    for sample in samples:
        if os.path.exists(
                expand(f'{{fastq_dir}}/{sample}.{{fqsuffix}}.gz',
                       **config)[0]):
            sampledict[sample] = dict()
            sampledict[sample]["layout"] = "SINGLE"
        elif all(
                os.path.exists(path) for path in expand(
                    f'{{fastq_dir}}/{sample}_{{fqext}}.{{fqsuffix}}.gz',
                    **config)):
            sampledict[sample] = dict()
            sampledict[sample]["layout"] = "PAIRED"
        elif sample.startswith(('GSM', 'SRX', 'SRR', 'ERR', 'DRR')):
            continue
        else:
            logger.error(
                f"\nsample {sample} was not found..\n"
                f"We checked for SE file:\n"
                f"\t{config['fastq_dir']}/{sample}.{config['fqsuffix']}.gz \n"
                f"and for PE files:\n"
                f"\t{config['fastq_dir']}/{sample}_{config['fqext1']}.{config['fqsuffix']}.gz \n"
                f"\t{config['fastq_dir']}/{sample}_{config['fqext2']}.{config['fqsuffix']}.gz \n"
                f"and since the sample did not start with either GSM, SRX, SRR, ERR, and DRR we "
                f"couldn't find it online..\n")
            raise TerminatedException

    return sampledict
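A minimal sketch, assuming hypothetical config values, of the paired-end check above: {fqext} expands over both read suffixes while the sample name is already baked into the f-string.
from snakemake.io import expand

config = {"fastq_dir": "fastq", "fqext": ["R1", "R2"], "fqsuffix": "fastq"}
print(expand("{fastq_dir}/sampleA_{fqext}.{fqsuffix}.gz", **config))
# ['fastq/sampleA_R1.fastq.gz', 'fastq/sampleA_R2.fastq.gz']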
Example #6
def get_output_files_phy_names(branchlbl_dir, mod_dir, phy_dir, labelled_models, \
 mod_ext='.mod', phy_ext='.phy', out_ext='.out', blbl_ext='.txt'):
    """Return the control filenames for a set of models and phy alignment files. If a model 
		requires a labelled branch (i.e. the model name is in the labelled_models list), a 
		control file for each species group present in branchlbl_dir will be generated."""
    # create and populate the lists of labelled and unlabelled models
    lbl_mod_wcs = []
    mod_wcs = []
    for mod in os.listdir(mod_dir):
        name, ext = os.path.splitext(mod)
        if ext == mod_ext and name in labelled_models:
            lbl_mod_wcs.append(name)
        elif ext == mod_ext:
            mod_wcs.append(name)
    # get the list of phy file names (no extension)
    phy_wcs = [
        os.path.splitext(f)[0] for f in os.listdir(phy_dir)
        if os.path.splitext(f)[1] == phy_ext
    ]
    # get the phy file name to group mapping, where only phy files that contain at least one species are
    # present in the map
    phy_group_map = get_phy_to_group_mapping(branchlbl_dir, phy_dir, phy_ext)
    # return the list of output filenames for each model and each phy file, for both labelled
    # and unlabelled models
    labelled_model_outputs = expand('{mod}/{phy_group_map}.{mod}{ext}', mod=lbl_mod_wcs, \
     phy_group_map=phy_group_map, ext=out_ext)
    unlabelled_model_outputs = expand('{mod}/{phy}/{phy}.{mod}{ext}', mod=mod_wcs, \
     phy=phy_wcs, ext=out_ext)
    return labelled_model_outputs + unlabelled_model_outputs
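As a hypothetical illustration of the unlabelled-model expansion above, one model and one alignment give:
from snakemake.io import expand

print(expand('{mod}/{phy}/{phy}.{mod}{ext}', mod=['m0'], phy=['aln1'], ext='.out'))
# ['m0/aln1/aln1.m0.out']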
Example #7
    def generate_sensor_file_lists(self):
        # Go through the configs and select those sensors with COMPUTE = True.
        # Also get TIME_SEGMENTS, then create the expected files.
        # Return a list of (actual, expected) file path pairs for each
        # sensor listed in the config file. Added for Travis.

        # Initialize string of file path for both expected and actual metric values
        segment = self.configs['TIME_SEGMENTS']['TYPE'].lower()
        act_str = "data/processed/features/{pid}/{sensor_key}.csv"
        exp_str = "tests/data/processed/features/" + segment + "/{pid}/{sensor_key}.csv"

        # Build list of sensors to be tested.
        sensors = []
        for sensor in self.configs:
            if "PROVIDERS" in self.configs[sensor] and self.configs[sensor][
                    "PROVIDERS"] is not None:
                for provider in self.configs[sensor]["PROVIDERS"]:
                    if self.configs[sensor]["PROVIDERS"][provider]["COMPUTE"]:
                        sensors.append(sensor.lower())

        act_file_list = expand(act_str,
                               pid=self.configs["PIDS"],
                               sensor_key=sensors)
        exp_file_list = expand(exp_str,
                               pid=self.configs["PIDS"],
                               sensor_key=sensors)
        sensor_file_lists = list(zip(act_file_list, exp_file_list))

        return sensor_file_lists
Example #8
def generate_sensor_file_lists(configs):
    # Go through the configs and select those sensors with DAY_SEGMENTS and,
    # optionally, TYPES, then create the expected files. Return a dictionary
    # with the list of file paths of expected and actual files for each
    # sensor listed in the config file. Added for Travis.

    # Initialize string of file path for both expected and actual metric values
    act_str = "data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
    exp_str = "tests/data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"

    # Get all the SENSORS in the config.yaml files
    sensors = configs['SENSORS']
    sensor_file_lists = {}

    # Loop though all sensors and create the actual and expected file paths
    for sensor in sensors:
        if sensor == 'messages':
            sensor = 'sms'
            sensor_cap = 'SMS'
        else:
            sensor_cap = sensor.upper()
        if 'DAY_SEGMENTS' in configs[sensor_cap]:
            sensor_type = []
            if 'TYPES' in configs[sensor_cap]:
                for each in configs[sensor_cap]['TYPES']:
                    sensor_type.append(each + '_')

            if sensor_type:
                act_file_list = expand(
                    act_str,
                    pid=configs["PIDS"],
                    sensor=sensor,
                    sensor_type=sensor_type,
                    day_segment=configs[sensor_cap]["DAY_SEGMENTS"])
                exp_file_list = expand(
                    exp_str,
                    pid=configs["PIDS"],
                    sensor=sensor,
                    sensor_type=sensor_type,
                    day_segment=configs[sensor_cap]["DAY_SEGMENTS"])
            else:
                act_file_list = expand(
                    act_str,
                    pid=configs["PIDS"],
                    sensor=sensor,
                    sensor_type='',
                    day_segment=configs[sensor_cap]["DAY_SEGMENTS"])
                exp_file_list = expand(
                    exp_str,
                    pid=configs["PIDS"],
                    sensor=sensor,
                    sensor_type='',
                    day_segment=configs[sensor_cap]["DAY_SEGMENTS"])

            sensor_file_lists[sensor_cap] = list(
                zip(act_file_list, exp_file_list))

    return sensor_file_lists
Example #9
File: utils.py Project: ctb/thumper
def generate_database_targets(config, also_return_database_names=False):
    database_targets, database_names = [], []
    ## integrate user settings and inputs
    config = integrate_user_config(config)
    ## What alphabets are we using? ##
    alphabet_info = config["alphabet_info"]
    ## What databases are we using? ##
    databases = config["databases"]
    # get all database details
    database_info = config["database_info"]

    # default filenaming for each database
    # variables: db_name, alphabet, ksize, db_type, suffix
    db_target_templates = config["database_target_template"]
    info_templates = db_target_templates["info_csv"]
    db_templates = db_target_templates["database"]

    # iterate through dbinfo and build targets for the alphabets we're using
    for db in databases:
        db_targs, db_names = [], []
        db_info = config["database_info"][db]
        for db_alphabet, db_alpha_info in db_info["alphabets"].items():
            if db_alphabet in alphabet_info.keys():
                for db_ksize, dbs in db_alpha_info.items():
                    ksize_int = int(
                        db_ksize[1:])  # db_ksize is string w/format: k{ksize}
                    # only build target if db has matching ksize available.
                    # todo: also handle scaled here???
                    if ksize_int in alphabet_info[db_alphabet]["ksizes"]:

                        for db_type in dbs.keys():
                            suffix = config["database_suffixes"][db_type]
                            db_filenames = expand(db_templates,
                                                  db_name=db,
                                                  alphabet=db_alphabet,
                                                  ksize=db_ksize,
                                                  db_type=db_type,
                                                  suffix=suffix)
                            # generate db_name, needed for workflow targets. sigh, don't like this - do it better.
                            end = f".{db_type}.{suffix}"
                            names = [fn.rsplit(end)[0] for fn in db_filenames]
                            db_targs += db_filenames
                            db_names += names
        # if we have any targets for this database name, also grab the info csv target
        if db_targs:
            # also get the db info csv
            db_info = expand(info_templates, db_name=db)
            db_targs += db_info
        # add targets for this database
        database_targets += db_targs
        database_names += db_names

    database_dir = config["database_dir"]
    final_db_targs = [os.path.join(database_dir, x) for x in database_targets]
    if also_return_database_names:
        return final_db_targs, database_names

    return final_db_targs
Example #10
File: utils.py Project: ctb/thumper
def generate_targets(config,
                     samples,
                     output_dir="",
                     generate_db_targets=False):
    pipeline_targets = []
    ## integrate user settings and inputs
    config = integrate_user_config(config)
    ## What alphabets are we using? ##
    alphabet_info = config["alphabet_info"]
    ## set run basename
    basename = config.get("basename", "thumper-output")

    ## What databases are we using? ##
    # Pipeline:: find steps in this pipeline
    pipeline = config["pipeline"]
    db_required = config["pipelines"][pipeline]["databases_required"]
    if db_required:
        database_targets, database_names = generate_database_targets(
            config, also_return_database_names=True)
    else:
        generate_db_targets = False
        database_targets, database_names = [], []
    index_names = []
    if pipeline == "generate_index":
        for alpha, alphaInfo in alphabet_info.items():
            index_names += expand("{basename}.{alpha}-k{ksize}.scaled{scaled}",
                                  basename=basename,
                                  alpha=alpha,
                                  ksize=alphaInfo["ksizes"],
                                  scaled=alphaInfo["scaled"])

    # generate targets for each step
    steps = config["pipelines"][pipeline]["steps"]
    for step in steps:
        step_outdir = config[step]["output_dir"]
        step_files = config[step]["output_files"]

        # fill variables in the output filenames
        for stepF in step_files:
            pipeline_targets += expand(os.path.join(output_dir, step_outdir,
                                                    stepF),
                                       sample=samples,
                                       database=database_names,
                                       basename=basename,
                                       db_name=config.get("databases", []),
                                       index=index_names)

    if generate_db_targets:
        targets = database_targets + pipeline_targets
        return targets

    return pipeline_targets
Example #11
 def update(d, u, c):
     for k, v in u.items():
         if isinstance(v, collections.abc.Mapping):
             r = update(d.get(k, {}), v, c)
             d[k] = r
         else:
             # `fill` comes from the enclosing scope: a DataFrame is expanded
             # row-wise (zip combinator), anything else with combinator `c`.
             if isinstance(fill, pd.DataFrame):
                 d[k] = list(set(expand(u[k], zip, **fill.to_dict("list"))))
             else:
                 d[k] = list(set(expand(u[k], c, **fill)))
         if not d[k]:
             d[k] = [u[k]]
     return d
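A short sketch of the two expansion modes used above (hypothetical DataFrame): rows are paired with the zip combinator, while plain lists fall back to the default product.
import pandas as pd
from snakemake.io import expand

fill = pd.DataFrame({"sample": ["a", "b"], "lane": ["L1", "L2"]})
print(expand("{sample}_{lane}.fastq", zip, **fill.to_dict("list")))
# ['a_L1.fastq', 'b_L2.fastq']
print(expand("{sample}_{lane}.fastq", **fill.to_dict("list")))
# ['a_L1.fastq', 'a_L2.fastq', 'b_L1.fastq', 'b_L2.fastq']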
Example #12
def generate_sensor_file_lists(config):
    # Go through the configs and select those sensors with COMPUTE = True.
    # Also get DAY_SEGMENTS, and optionally TYPES then create expected 
    # files. Return dictionary with list of file paths of expected and 
    # actual files for each sensor listed in the config file. Added for Travis.

    # Initialize string of file path for both expected and actual metric values
    act_str = "data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"
    exp_str = "tests/data/processed/{pid}/{sensor}_{sensor_type}{day_segment}.csv"

    # List of available sensors that can be tested by the testing suite
    TESTABLE_SENSORS = ['MESSAGES', 'CALLS', 'SCREEN', 'BATTERY', 'BLUETOOTH', 'WIFI', 'LIGHT', 'APPLICATIONS_FOREGROUND', 'ACTIVITY_RECOGNITION', 'CONVERSATION']

    # Build list of sensors to be tested. 
    sensors = []
    for sensor in TESTABLE_SENSORS:
        if config[sensor]["COMPUTE"]:
            sensors.append(sensor)

    sensor_file_lists = {}
    
    # Loop though all sensors and create the actual and expected file paths
    for sensor in sensors:
        if 'DAY_SEGMENTS' in config[sensor]:
            sensor_type = []
            if 'TYPES' in config[sensor]:
                for each in config[sensor]['TYPES']:
                    sensor_type.append(each+'_')
            lower_sensor = sensor.lower()
            if sensor_type:
                act_file_list = expand(act_str, pid=config["PIDS"], 
                                                sensor = lower_sensor, 
                                                sensor_type = sensor_type, 
                                                day_segment = config[sensor]["DAY_SEGMENTS"])
                exp_file_list = expand(exp_str, pid=config["PIDS"], 
                                                sensor = lower_sensor, 
                                                sensor_type = sensor_type, 
                                                day_segment = config[sensor]["DAY_SEGMENTS"])
            else:
                act_file_list = expand(act_str, pid=config["PIDS"], 
                                                sensor = lower_sensor, 
                                                sensor_type = '', 
                                                day_segment = config[sensor]["DAY_SEGMENTS"])
                exp_file_list = expand(exp_str, pid=config["PIDS"], 
                                                sensor = lower_sensor, 
                                                sensor_type = '', 
                                                day_segment = config[sensor]["DAY_SEGMENTS"])

            sensor_file_lists[sensor] = list(zip(act_file_list,exp_file_list))

    return sensor_file_lists
Example #13
def getFilePerSample(samples, sampleSheet, form1, form2, **kwargs):
    """
    This function generates a list of filenames. For each sample
    a filename will be generated according to form1. If a sample
    is paired-end, form2 is used instead and a filename will be
    generated for both ends.
    """
    out = []
    for x in samples:
        if isSingleEnd(x, sampleSheet):
            out += expand(form1, sample=x, **kwargs)
        else:
            out += expand(form2, sample=x, group=[1,2], **kwargs)
    return out
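For a hypothetical paired-end sample, the form2 branch above expands {group} over both mates:
from snakemake.io import expand

print(expand("fastq/{sample}_R{group}.fastq.gz", sample="s1", group=[1, 2]))
# ['fastq/s1_R1.fastq.gz', 'fastq/s1_R2.fastq.gz']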
Example #14
 def aggregate_input(wildcards):
     ops = base_checkpoint_obj.get(**wildcards).output
     checkpoint_output = _output_accessor(ops, output_key)
     expand_base_rule = os.path.join(checkpoint_output, base_rule)
     expand_target_rule = target_rule or expand_base_rule
     return expand(expand_target_rule,
                   **glob_wildcards(expand_base_rule)._asdict())
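A hedged sketch of the checkpoint-aggregation idiom above (hypothetical paths): glob_wildcards recovers wildcard values from files already on disk, and expand re-applies them to the target pattern.
from snakemake.io import expand, glob_wildcards

wc = glob_wildcards("results/clusters/{cluster}.fasta")  # e.g. cluster=['c1', 'c2']
targets = expand("results/aligned/{cluster}.bam", **wc._asdict())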
Example #15
def get_filt_logs(sample, unit, seq_type, config, d):
    if not config["preprocessing"]["phix_filter"]:
        return []
    files = expand(opj(config["paths"]["results"], "intermediate", "preprocess",
                       "{sample}_{unit}_PHIX_{seq_type}{s}.log"), sample=sample,
                   unit=unit, seq_type=seq_type, s=d["phixfilt"])
    return files
Example #16
    def _expand(self, template, wc=None):
        if wc is None:
            wc = {}
        if isinstance(template, str):
            template = [template]
        names = set()
        for item in template:
            names |= get_wildcard_names(item)

        sources = [wc]
        try:
            ds = self.getDatasetFromDir(wc.dir)
            sources += [ds]
        except Exception:
            pass
        sources += [self]

        fields = {}
        for name in names:
            for source in sources:
                if name in dir(source):
                    fields[name] = getattr(source, name)
                    break
            if name not in fields:
                fields[name] = "{{{}}}".format(name)

        res = expand(template, **fields)
        return res
Example #17
def get_sortmerna_logs(sample, unit, seq_type, config):
    if not config["preprocessing"]["sortmerna"]:
        return []
    files = expand(opj(config["paths"]["results"], "intermediate", "preprocess",
                       "{sample}_{unit}_{seq_type}.sortmerna.log"),
                   sample=sample, unit=unit, seq_type=seq_type)
    return files
Example #18
 def get_fns_analysis(wildcards):
     fns = []
     re_fn = re.compile(regex(str(source_pattern)))
     for fn in source_fkt(wildcards):
         match = re.match(re_fn, fn).groupdict()
         pattern = strip_wildcard_constraints(str(target_pattern))
         fns.append(expand(pattern, **match, **extra_wildcards, allow_missing=True)[0])
     return fns
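The allow_missing flag used above (available in recent Snakemake versions) keeps wildcards that have no value instead of raising an error; a minimal sketch:
from snakemake.io import expand

print(expand("{sample}/{unit}.bam", sample="s1", allow_missing=True))
# ['s1/{unit}.bam']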
Example #19
def check_classifiers(config):
    """
    Set paths and params specific to classifiers

    :param config: Snakemake config
    :return: Updated config dict
    """
    # Add read-based config info
    config["centrifuge"]["index_path"] = ""
    config["centrifuge"]["base"] = ""
    config["centrifuge"]["dir"] = ""
    if config["classification"]["centrifuge"]:
        # Check if custom database exists
        custom = expand("{b}.{i}.cf", b=config["centrifuge"]["custom"],
                        i=[1, 2, 3])
        if all(os.path.exists(x) for x in custom):
            config["centrifuge"]["index_path"] = config["centrifuge"]["custom"]
        # If not, use prebuilt default
        else:
            p = config["centrifuge"]["prebuilt"]
            config["centrifuge"]["index_path"] = opj("resources", "centrifuge",
                                                     p)
        # Set centrifuge index config variables
        index_path = config["centrifuge"]["index_path"]
        config["centrifuge"]["dir"] = os.path.dirname(index_path)
        config["centrifuge"]["base"] = bn(index_path)

    config["kraken"]["index_path"] = ""
    config["kraken"]["mem"] = ""
    if config["classification"]["kraken"]:
        # Check if custom database exists
        custom = expand(opj(config["kraken"]["custom"], "{n}.k2d"),
                        n=["hash", "opts", "taxo"])
        if all(os.path.exists(x) for x in custom):
            config["kraken"]["index_path"] = config["kraken"]["custom"]
        # If not, use prebuilt or standard
        elif config["kraken"]["standard_db"]:
            config["kraken"]["index_path"] = opj("resources", "kraken",
                                                 "standard")
        else:
            config["kraken"]["index_path"] = opj("resources", "kraken",
                                                 "prebuilt",
                                                 config["kraken"]["prebuilt"])
        if config["kraken"]["reduce_memory"]:
            config["kraken"]["mem"] += "--memory-mapping"
    return config
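As a hypothetical illustration, the custom-index check above expands to the three centrifuge index files that must all exist:
from snakemake.io import expand

print(expand("{b}.{i}.cf", b="/dbs/custom_cf", i=[1, 2, 3]))
# ['/dbs/custom_cf.1.cf', '/dbs/custom_cf.2.cf', '/dbs/custom_cf.3.cf']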
Example #20
def get_fastqc_files(sample, unit, pairs, config, pre):
    """Get all fastqc output"""
    if config["preprocessing"]["fastqc"]:
        files = expand(opj(config["paths"]["results"], "intermediate", "fastqc",
                           "{sample}_{unit}_{pair}{PREPROCESS}_fastqc.zip"),
                       sample=sample, unit=unit, pair=pairs, PREPROCESS=pre)
        return files
    return []
Example #21
def download_file(workflow, data, install_dir):
    if workflow in data.keys():
        for file_name, url_string in data[workflow].items():
            try:
                url = urlparse(url_string)
            except Exception:  # we don't care, since some of the JSON values are not URLs
                pass
            if file_name == 'sbttar':  # sourmash files from the taxonomic classification workflow
                tar_file = data[workflow]['sbttar']
                db = data[workflow]['databases']
                kv = data[workflow]['kvalue']
                sbturl = data[workflow]['sbturl']
                sourmash_files = expand(tar_file, database=db, kvalue=kv)
                for file in sourmash_files:
                    if not (os.path.isfile(install_dir + "/" + file)):
                        print("\nDownloading " + file + " from " + sbturl)
                        try:
                            urllib.request.urlretrieve(
                                "http://" + sbturl + '/' + file,
                                install_dir + "/" + file, reporthook)
                        except SocketError as e:
                            print("Error downloading file " + file +
                                  "Retry script.")
                            print(e)
                            try:
                                os.remove(install_dir + "/" + file)
                            except OSError:
                                pass
            elif (url.scheme == "http"
                  or url.scheme == "https"):  #download via http
                if not (os.path.isfile(os.path.join(install_dir, file_name))):
                    print("Downloading " + file_name + " from " + url_string)
                    try:
                        urllib.request.urlretrieve(
                            url_string, install_dir + "/" + file_name,
                            reporthook)
                    except SocketError as e:
                        print("Error downloading file " + file_name +
                              " Retry script.")
                        print(e)
                        try:
                            os.remove(install_dir + "/" + file_name)
                        except OSError:
                            pass
            elif (url.scheme == 'docker'):  #download singularity image
                if not (os.path.isfile("../container_images/" + file_name)):
                    print("Downloading singularity image " + file_name)
                    sing_command = "singularity pull " + url_string
                    subprocess.run(
                        [sing_command],
                        shell=True)  #TODO: Error handling for sing pull
                    os.rename(file_name, "../container_images/" + file_name)
            elif (url.scheme == "file"):  #copy file from local location
                if not (os.path.isfile(os.path.join(install_dir, file_name))):
                    print("Copying " + file_name)
                    copyfile(".." + url.path, install_dir + "/" + file_name)
Example #22
 def concat_and_save(suffix, out_p):
     print(suffix)
     print(out_p)
     fps_ser = metadata_table.apply(
         lambda ser: expand(prefix + suffix, **ser)[0], axis=1
     )
     concat_df = pd.concat([pd.read_pickle(fp) for fp in fps_ser])
     concat_df.to_pickle(out_p)
     concat_df.to_csv(fp_to_tsv(out_p, "p"), sep="\t", header=True, index=False)
     concat_df.columns = concat_df.columns.astype(str)
     concat_df.to_parquet(fp_to_parquet(out_p, "p"))
Example #23
 def func(wc):
     try:
         wc.sample
     except AttributeError:
         raise ValueError('Need "{{sample}}" in pattern '
                          '"{pattern}"'.format(pattern=pattern))
     n = [1]
     if is_paired_end(sampletable, wc.sample) and not r1_only:
         n = [1, 2]
     res = expand(pattern, sample=wc.sample, n=n)
     return res
Example #24
 def getNonSplitCountFiles(self, dataset):
     """
     Get all dummy count filenames for non-split counts
     :param dataset: DROP group name from wildcard
     :return: list of files
     """
     ids = self.sa.getIDsByGroup(dataset, assay="RNA")
     file_stump = self.processedDataDir / "aberrant_splicing" / "datasets" / "cache" / f"raw-{dataset}" / \
                  "sample_tmp" / "nonSplitCounts"
     done_files = str(file_stump / "sample_{sample_id}.done")
     return expand(done_files, sample_id=ids)
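Note how the pathlib Path above is converted to a string before expansion; a minimal sketch with hypothetical sample IDs:
from pathlib import Path
from snakemake.io import expand

stump = Path("cache") / "nonSplitCounts" / "sample_{sample_id}.done"
print(expand(str(stump), sample_id=["s1", "s2"]))
# ['cache/nonSplitCounts/sample_s1.done', 'cache/nonSplitCounts/sample_s2.done']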
Example #25
def get_trim_logs(sample, unit, pairs, config, d):
    if not config["preprocessing"]["trimmomatic"] and not \
    config["preprocessing"]["cutadapt"]:
        return []
    if config["preprocessing"]["trimmomatic"]:
        trimmer = "trimmomatic"
    else:
        trimmer = "cutadapt"
    files = expand(opj(config["paths"]["results"], "intermediate", "preprocess",
                       "{sample}_{unit}_{pair}{s}.{trimmer}.log"),
                   sample=sample, unit=unit, pair=pairs, s=d["trimming"],
                   trimmer=trimmer)
    return files
Example #26
 def getCountFiles(self, annotation, group):
     """
     Get all count files from DROP (counted from BAM file) and external count matrices
     :param annotation: annotation name from wildcard
     :param group: DROP group name from wildcard
     :return: list of files
     """
     bam_IDs = self.sa.getIDsByGroup(group, assay="RNA")
     file_stump = self.processedDataDir / "aberrant_expression" / annotation / "counts" / "{sampleID}.Rds"
     count_files = expand(str(file_stump), sampleID=bam_IDs)
     extCountFiles = self.sa.getImportCountFiles(annotation, group, file_type="GENE_COUNTS_FILE")
     count_files.extend(extCountFiles)
     return count_files
Example #27
def download_sourmash_files(data, workflow, install_dir):
    tar_file = data[workflow]['sbttar']
    db = data[workflow]['databases']
    kv = data[workflow]['kvalue']
    sbturl = data[workflow]['sbturl']
    sourmash_files = expand(tar_file, database=db, kvalue=kv)
    for file in sourmash_files:
        if not os.path.isfile(install_dir + "/" + file):
            print("\nDownloading " + file + " from " + sbturl)
            try:
                urllib.request.urlretrieve("http://" + sbturl + '/' + file, install_dir + "/" + file, reporthook)
            except SocketError as e:
                print("Error downloading file " + file + ". Retry script.")
                print(e)
Example #28
    def dynamic_branch(self, wildcards, input=True):
        def get_io(rule):
            return (rule.input,
                    rule.dynamic_input) if input else (rule.output,
                                                       rule.dynamic_output)

        io, dynamic_io = get_io(self)

        branch = Rule(self)
        io_, dynamic_io_ = get_io(branch)

        expansion = defaultdict(list)
        for i, f in enumerate(io):
            if f in dynamic_io:
                try:
                    for e in reversed(expand(f, zip, **wildcards)):
                        expansion[i].append(IOFile(e, rule=branch))
                except KeyError:
                    return None

        # replace the dynamic files with the expanded files
        replacements = [(i, io[i], e)
                        for i, e in reversed(list(expansion.items()))]
        for i, old, exp in replacements:
            dynamic_io_.remove(old)
            io_.insert_items(i, exp)

        if not input:
            for i, old, exp in replacements:
                if old in branch.temp_output:
                    branch.temp_output.discard(old)
                    branch.temp_output.update(exp)
                if old in branch.protected_output:
                    branch.protected_output.discard(old)
                    branch.protected_output.update(exp)
                if old in branch.touch_output:
                    branch.touch_output.discard(old)
                    branch.touch_output.update(exp)

            branch.wildcard_names.clear()
            non_dynamic_wildcards = dict((name, values[0])
                                         for name, values in wildcards.items()
                                         if len(set(values)) == 1)
            # TODO have a look into how to concretize dependencies here
            (branch._input, branch._output, branch._params, branch._log,
             branch._benchmark, _,
             branch.dependencies) = branch.expand_wildcards(
                 wildcards=non_dynamic_wildcards)
            return branch, non_dynamic_wildcards
        return branch
Example #29
 def getFiles(self, filename, datasets=None, **kwargs):
     """
     Determine files for export count groups.
     :param filename: name of file
     :return: list of export files
     """
     if datasets is None:
         datasets = self.getExportGroups()
     file_pattern = str(self.pattern / f"{filename}")
     return expand(file_pattern,
                   dataset=datasets,
                   annotation=self.geneAnnotations,
                   genomeAssembly=self.genomeAssembly,
                   **kwargs)
Example #30
    def dynamic_branch(self, wildcards, input=True):
        def get_io(rule):
            return (rule.input, rule.dynamic_input) if input else (
                rule.output, rule.dynamic_output
            )

        io, dynamic_io = get_io(self)

        branch = Rule(self)
        io_, dynamic_io_ = get_io(branch)

        expansion = defaultdict(list)
        for i, f in enumerate(io):
            if f in dynamic_io:
                try:
                    for e in reversed(expand(f, zip, **wildcards)):
                        expansion[i].append(IOFile(e, rule=branch))
                except KeyError:
                    return None

        # replace the dynamic files with the expanded files
        replacements = [(i, io[i], e)
                        for i, e in reversed(list(expansion.items()))]
        for i, old, exp in replacements:
            dynamic_io_.remove(old)
            io_.insert_items(i, exp)

        if not input:
            for i, old, exp in replacements:
                if old in branch.temp_output:
                    branch.temp_output.discard(old)
                    branch.temp_output.update(exp)
                if old in branch.protected_output:
                    branch.protected_output.discard(old)
                    branch.protected_output.update(exp)
                if old in branch.touch_output:
                    branch.touch_output.discard(old)
                    branch.touch_output.update(exp)

            branch.wildcard_names.clear()
            non_dynamic_wildcards = dict((name, values[0])
                                         for name, values in wildcards.items()
                                         if len(set(values)) == 1)
            # TODO have a look into how to concretize dependencies here
            (branch._input, branch._output, branch._params, branch._log,
             branch._benchmark, _, branch.dependencies
             ) = branch.expand_wildcards(wildcards=non_dynamic_wildcards)
            return branch, non_dynamic_wildcards
        return branch
Example #31
def test_expand_call_arguments():
    target_folder = nj(
        "/data/riksdagen_corpus_data/riksdagen-corpus-exports/speech_xml")
    source_folder = nj("/data/riksdagen_corpus_data/riksdagen-corpus/corpus/")
    extension = "xml"
    years, basenames = glob_wildcards(
        jj(source_folder, "{year}", f"{{file}}.{extension}"))

    filenames = expand(jj(target_folder, '{year}',
                          f'{{basename}}.{extension}'),
                       zip,
                       year=years,
                       basename=basenames)

    assert len(filenames) == len(years)
Example #32
def annotation_input(config, assemblies):
    input = []
    if not config["assembly"]["megahit"] and not config["assembly"]["metaspades"]:
        return input
    for group in assemblies.keys():
        # Add orfcalling results
        input.append(opj(config["paths"]["results"], "annotation", group,
                         "final_contigs.gff"))
        if config["annotation"]["infernal"]:
            input.append(opj(config["paths"]["results"], "annotation", group,
                             "final_contigs.cmscan"))
        if config["annotation"]["tRNAscan"]:
            input.append(opj(config["paths"]["results"], "annotation", group,
                             "tRNA.out"))
        # Add EGGNOG annotation
        if config["annotation"]["eggnog"]:
            input += expand(opj(config["paths"]["results"], "annotation", group,
                                "{db}.parsed.{fc}.tsv"),
                            db=["enzymes", "pathways", "kos", "modules"],
                            fc=["raw", "tpm"])
        # Add PFAM annotation
        if config["annotation"]["pfam"]:
            input += expand(opj(config["paths"]["results"], "annotation", group,
                                "pfam.parsed.{fc}.tsv"), fc=["tpm", "raw"])
        # Add taxonomic annotation
        if config["annotation"]["taxonomy"]:
            input += expand(
                opj(config["paths"]["results"], "annotation", group, "taxonomy",
                    "tax.{fc}.tsv"), fc=["tpm", "raw"])
        # Add Resistance Gene Identifier output
        if config["annotation"]["rgi"]:
            input += expand(opj(config["paths"]["results"], "annotation", group,
                                "rgi.{fc}.tsv"), fc=["raw", "tpm"])
            input.append(opj(config["paths"]["results"], "annotation", group,
                             "rgi.out.txt"))
    return input
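For illustration, the eggnog block above yields one parsed table per (db, fc) combination (hypothetical results path and a subset of the databases):
from snakemake.io import expand

print(expand("results/annotation/g1/{db}.parsed.{fc}.tsv",
             db=["enzymes", "pathways"], fc=["raw", "tpm"]))
# ['results/annotation/g1/enzymes.parsed.raw.tsv',
#  'results/annotation/g1/enzymes.parsed.tpm.tsv',
#  'results/annotation/g1/pathways.parsed.raw.tsv',
#  'results/annotation/g1/pathways.parsed.tpm.tsv']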
Example #33
    def dynamic_branch(self, wildcards, input=True):
        def get_io(rule):
            return (rule.input, rule.dynamic_input) if input else (
                rule.output, rule.dynamic_output
            )

        def partially_expand(f, wildcards):
            """Expand the wildcards in f from the ones present in wildcards

            This is done by replacing all wildcard delimiters by `{{` or `}}`
            that are not in `wildcards.keys()`.
            """
            # perform the partial expansion from f's string representation
            s = str(f).replace('{', '{{').replace('}', '}}')
            for key in wildcards.keys():
                s = s.replace('{{{{{}}}}}'.format(key),
                              '{{{}}}'.format(key))
            # build result
            anno_s = AnnotatedString(s)
            anno_s.flags = f.flags
            return IOFile(anno_s, f.rule)

        io, dynamic_io = get_io(self)

        branch = Rule(self)
        io_, dynamic_io_ = get_io(branch)

        expansion = defaultdict(list)
        for i, f in enumerate(io):
            if f in dynamic_io:
                f = partially_expand(f, wildcards)
                try:
                    for e in reversed(expand(f, zip, **wildcards)):
                        # need to clone the flags so intermediate
                        # dynamic remote file paths are expanded and
                        # removed appropriately
                        ioFile = IOFile(e, rule=branch)
                        ioFile.clone_flags(f)
                        expansion[i].append(ioFile)
                except KeyError:
                    return None

        # replace the dynamic files with the expanded files
        replacements = [(i, io[i], e)
                        for i, e in reversed(list(expansion.items()))]
        for i, old, exp in replacements:
            dynamic_io_.remove(old)
            io_.insert_items(i, exp)

        if not input:
            for i, old, exp in replacements:
                if old in branch.temp_output:
                    branch.temp_output.discard(old)
                    branch.temp_output.update(exp)
                if old in branch.protected_output:
                    branch.protected_output.discard(old)
                    branch.protected_output.update(exp)
                if old in branch.touch_output:
                    branch.touch_output.discard(old)
                    branch.touch_output.update(exp)

            branch.wildcard_names.clear()
            non_dynamic_wildcards = dict((name, values[0])
                                         for name, values in wildcards.items()
                                         if len(set(values)) == 1)
            # TODO have a look into how to concretize dependencies here
            (branch._input, branch._output, branch._params, branch._log,
             branch._benchmark, _, branch.dependencies
             ) = branch.expand_wildcards(wildcards=non_dynamic_wildcards)
            return branch, non_dynamic_wildcards
        return branch