def extract_contigs_bin_cov(file_path, key, data_dict):
    """Load binned-coverage results and summarize raw length per depth bin.

    Stores the loaded yaml under data_dict["results"][key] and, for every
    depth bin in GLOBAL_BIN_VALUES, records the corresponding binned_depth
    entry as "raw_length_at_{bin}x" in the summary.
    """
    loaded = datahandling.load_yaml(file_path)
    data_dict["results"][key] = loaded
    summary = data_dict["summary"]
    for depth_bin in GLOBAL_BIN_VALUES:
        # binned_depth is 0-indexed; bins are 1-based depth values.
        summary["raw_length_at_{}x".format(depth_bin)] = (
            loaded["binned_depth"][depth_bin - 1])
    return data_dict
def extract_contig_variants(file_path, key, data_dict):
    """Load variant-calling results and lift key SNP-filter metrics into summary.

    Stores the loaded yaml under data_dict["results"][key] and copies the
    indel/deletion counts plus one variant_table cell into the summary.
    """
    variants = datahandling.load_yaml(file_path)
    data_dict["results"][key] = variants
    summary = data_dict["summary"]
    # Cell [9][9] presumably corresponds to the 10x depth / 10% frequency
    # filter (matching the summary key name) — TODO confirm table layout.
    summary["snp_filter_10x_10%"] = variants["variant_table"][9][9]
    summary["snp_filter_indels"] = variants["indels"]
    summary["snp_filter_deletions"] = variants["deletions"]
    return data_dict
def script__datadump(folder, sample, sample_file_name, component_file_name):
    """Collect cge_mlst results for every MLST scheme mapped to the sample's
    species, summarize them, and persist both the sample-component document
    and the updated sample document.

    Args:
        folder: working folder (coerced to str; not otherwise used here).
        sample: sample-component identifier used for load/save.
        sample_file_name: path/name of the sample document.
        component_file_name: path/name of the component document.

    Returns:
        0 on completion.
    """
    db_sample = datahandling.load_sample(sample_file_name)
    db_component = datahandling.load_component(component_file_name)
    folder = str(folder)
    sample = str(sample)

    datadump_dict = datahandling.load_sample_component(sample)
    datadump_dict["summary"] = datadump_dict.get("summary", {})
    datadump_dict["results"] = datadump_dict.get("results", {})

    species = db_sample["properties"]["species"]
    datadump_dict["summary"]["db"] = []
    datadump_dict["summary"]["strain"] = []
    datadump_dict["summary"]["alleles"] = []
    datadump_dict["summary"]["component"] = {
        "id": db_component["_id"],
        # NOTE(review): naive UTC timestamp kept as-is for db compatibility.
        "date": datetime.datetime.utcnow(),
    }

    mlst_species = db_component["mlst_species_mapping"][species]
    for mlst_entry in mlst_species:
        mlst_entry_db = datahandling.load_yaml(
            "cge_mlst/" + mlst_entry + "/data.json")
        datadump_dict["results"][mlst_entry] = mlst_entry_db
        datadump_dict["summary"]["db"].append(mlst_entry)
        # Hoist the repeated deep lookup; also drop the original's redundant
        # inner list comprehension ([i for i in d] was a pointless copy of
        # the key iteration).
        mlst_results = mlst_entry_db["mlst"]["results"]
        datadump_dict["summary"]["strain"].append(
            mlst_results.get("sequence_type", "NA"))
        datadump_dict["summary"]["alleles"].append(",".join(
            mlst_results["allele_profile"][locus]["allele_name"]
            for locus in mlst_results["allele_profile"]))

    db_sample["properties"]["mlst"] = datadump_dict["summary"]
    datahandling.save_sample_component(datadump_dict, sample)
    datahandling.save_sample(db_sample, sample_file_name)
    return 0
def passes_check_reads_pipeline(sample, requirements_file, log_err):
    """Return False (after logging) when a pipeline component would run on a
    sample that has no reads; otherwise return True.

    Pipeline-type components require reads to be present in the sample db.
    """
    sample_db = datahandling.load_yaml(sample)
    # Only pipeline components carry the reads requirement.
    if requirements_file["type"] != "pipeline":
        return True
    if "reads" in sample_db:
        return True
    datahandling.log(
        log_err,
        "Pipeline component can't run on a sample with no reads. db:{}"
        .format(sample_db))
    return False
def extract_contigs_sum_cov(file_path, key, data_dict):
    """Summarize contig count, total length and mean coverage per depth bin.

    For every depth bin in GLOBAL_BIN_VALUES, sums total_length and
    total_depth over all contigs whose coverage is at least that depth and
    records "bin_contigs_at_{x}x", "bin_length_at_{x}x" and
    "bin_coverage_at_{x}x" in the summary.
    """
    yaml = datahandling.load_yaml(file_path)
    data_dict["results"][key] = yaml
    contig_depth = yaml["contig_depth"]
    for bin_value in GLOBAL_BIN_VALUES:
        total_length = 0
        total_depth = 0
        total_contigs = 0
        threshold = float(bin_value)
        for contig in contig_depth:
            if contig_depth[contig]["coverage"] >= threshold:
                total_length += contig_depth[contig]["total_length"]
                total_depth += contig_depth[contig]["total_depth"]
                total_contigs += 1
        data_dict["summary"]["bin_contigs_at_{}x".format(
            bin_value)] = total_contigs
        data_dict["summary"]["bin_length_at_{}x".format(
            bin_value)] = total_length
        # Fix: guard the division — the original raised ZeroDivisionError
        # whenever no contig reached this depth bin (total_length == 0).
        data_dict["summary"]["bin_coverage_at_{}x".format(bin_value)] = (
            float(total_depth / total_length) if total_length else 0.0)
    return data_dict
def requirements_met(requirements_file, sample, log_out, log_err):
    """Check a component's requirements yaml against the sample/run/component
    db documents.

    Each flattened requirement key ("components.<name>.<k>...", "sample.<k>..."
    or "run.<k>...") is mapped to a yaml file plus a key path. A requirement
    passes when the key path resolves to a value and, if a desired value is
    given, the actual value is among the desired ones.

    Args:
        requirements_file: path to the component requirements yaml.
        sample: path to the sample yaml (must contain "name").
        log_out: output log handle/path (currently unused here).
        log_err: log destination for both successes and failures.

    Returns:
        True when every requirement is met, False otherwise.
    """
    requirements_file = datahandling.load_yaml(requirements_file)
    sample_name = datahandling.load_yaml(sample)["name"]
    if not passes_check_reads_pipeline(sample, requirements_file, log_err):
        return False

    no_failures = True
    if requirements_file.get('requirements', None) is not None:
        # Flatten nested requirements into dotted keys while keeping values:
        # https://stackoverflow.com/a/41801708
        # NOTE(review): pandas.io.json.json_normalize is deprecated in newer
        # pandas (use pandas.json_normalize); kept for the pinned version.
        df = pandas.io.json.json_normalize(
            requirements_file.get('requirements'))
        requirements_dict = df.to_dict(orient='records')[0]

        requirements = []
        for key in requirements_dict:
            values = key.split('.')
            if values[0] == 'components':
                file_location = sample_name + "__" + values[1] + ".yaml"
                requirement = values[2:]  # TODO: add check for no requirements
                expected_value = requirements_dict[key]
                requirements.append(
                    [file_location, requirement, expected_value])
            elif values[0] == 'sample':
                file_location = "sample.yaml"
                requirement = values[1:]  # TODO: add check for no requirements
                expected_value = requirements_dict[key]
                requirements.append(
                    [file_location, requirement, expected_value])
            elif values[0] == 'run':
                file_location = "run.yaml"
                requirement = values[1:]  # TODO: add check for no requirements
                expected_value = requirements_dict[key]
                requirements.append(
                    [file_location, requirement, expected_value])
            else:
                datahandling.log(log_err, "Improper requirement {}".format(key))

        for requirement in requirements:
            # requirement is [file_path, [key1, key2, ...], desired value];
            # desired value may be None when only presence is required.
            file_location = requirement[0]
            keys = requirement[1]
            desired_value = requirement[2]
            db = datahandling.load_yaml(file_location)

            # Walk the key path, yielding None as soon as a level is missing.
            # Fix: the original functools.reduce(dict.get, keys, db) raised
            # TypeError on a missing intermediate key (dict.get applied to
            # None) instead of returning None as its docstring promised.
            actual_value = db
            for k in keys:
                if isinstance(actual_value, dict):
                    actual_value = actual_value.get(k)
                else:
                    actual_value = None
                    break

            # Normalize to a list so a requirement may accept several values.
            if not isinstance(desired_value, list):
                desired_value = [desired_value]

            if actual_value is not None:
                # [None] means "key must merely exist"; value is not checked.
                if desired_value != [None]:
                    if actual_value in desired_value:
                        datahandling.log(
                            log_err,
                            "Found required entry (value checked) for\ndb: {}\nentry: {}\n"
                            .format(":".join(keys), db))
                    else:
                        datahandling.log(
                            log_err,
                            "Requirements not met for\ndb: {}\nentry: {}\ndesired_entry: {}\n"
                            .format(":".join(keys), db, desired_value))
                        no_failures = False
                else:
                    datahandling.log(
                        log_err,
                        "Found required entry (value not checked) for\ndb: {}\nentry: {}\n"
                        .format(":".join(keys), db))
            else:
                datahandling.log(
                    log_err,
                    "Requirements not met for\ndb: {}\nentry: {}\n".format(
                        file_location, ":".join(keys)))
                no_failures = False
    return no_failures
#!/usr/bin/env python3 """ Launcher file for accessing dockerfile commands """ import argparse import json import subprocess import os import sys import traceback from bifrostlib import datahandling COMPONENT: dict = datahandling.load_yaml( os.path.join(os.path.dirname(__file__), 'config.yaml')) def parse_args(): """ Arg parsing via argparse """ description: str = ( f"-Description------------------------------------\n" f"{COMPONENT['details']['description']}" f"------------------------------------------------\n\n" f"*Run command************************************\n" f"docker run \ \n" f" -e BIFROST_DB_KEY=mongodb://<user>:<password>@<server>:<port>/<db_name> \ \n" f" -v <input_path>:/input \ \n" f" -v <output_path>:/output \ \n" f" {COMPONENT['dockerfile']} \ \n" f" -id <sample_id>\n"