def write_sample_file(gen_db_path, fastq_list, analysis_name, execution_folder, analysis_id):

    GEN_DB = gendb_utils.DB()

    # update status
    gendb_utils.add_analysis_metadata(analysis_id, "airflow_execution_status", "running", update=True)

    if not isinstance(fastq_list, list):
        fastq_list = fastq_list.split(",")

    # id,fastq_prefix,R1,R2,species_name
    fastq_df = GEN_DB.get_fastq_and_sample_data(fastq_list)

    run_execution_folder = os.path.join(execution_folder, analysis_name)

    header = ["SampleName", "ScientificName", "R1", "R2", "fastq_id"]

    with open(os.path.join(run_execution_folder, f'{analysis_name}.tsv'), 'w') as f:
        f.write("\t".join(header) + '\n')
        for n, row in fastq_df.iterrows():
            # append the fastq_id to deal with multiple fastq sharing the same sample name
            R1 = row["R1"]
            R2 = row["R2"]
            fastq_id = row["fastq_id"]
            species = row["taxonomy"]
            sample_name = f'{row["sample_name"]}_{fastq_id}'
            f.write(f"{sample_name}\t{species}\t{R1}\t{R2}\t{fastq_id}\n")

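# Minimal usage sketch for write_sample_file (all ids and paths below are
# hypothetical). Given fastq ids "41,42", it writes
# <execution_folder>/<analysis_name>/<analysis_name>.tsv, tab-separated,
# with one row per fastq:
#
#   SampleName  ScientificName         R1                    R2                    fastq_id
#   S1_41       Staphylococcus aureus  /data/S1_R1.fastq.gz  /data/S1_R2.fastq.gz  41
#   S2_42       Escherichia coli       /data/S2_R1.fastq.gz  /data/S2_R2.fastq.gz  42
#
# write_sample_file(gen_db_path="gen.db",
#                   fastq_list="41,42",
#                   analysis_name="2021_01_15_typing",
#                   execution_folder="/scratch/runs",
#                   analysis_id=12)
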
def backup_output_files_samples(metadata_name2path_template, fastq_list, analysis_name, analysis_id, backup_folder):

    GEN_DB = gendb_utils.DB()

    fastq_df = GEN_DB.get_fastq_and_sample_data(fastq_list)

    qc_data = []
    for metadata_name in metadata_name2path_template:
        path_template = metadata_name2path_template[metadata_name]
        for n, sample in fastq_df.iterrows():
            sample_name = f'{sample["sample_name"]}_{sample["fastq_id"]}'
            # assume structure: {workflow}/{analysis_name}/{filepath}
            backup_path_format_relative = path_template.format(analysis_name=analysis_name, sample=sample_name)
            # strip the leading {workflow} component: files live relative to backup_folder
            backup_path_format_absolute = os.path.join(backup_folder, '/'.join(backup_path_format_relative.split("/")[1:]))
            print("backup_path_format", backup_path_format_relative)
            if not os.path.exists(backup_path_format_absolute):
                print(f"WARNING: {backup_path_format_absolute} does not exist, skipping")
                continue
            qc_data.append({"fastq_id": sample["fastq_id"],
                            "metrics_name": metadata_name,
                            "metrics_value": backup_path_format_relative,
                            "pipeline_version": ""})

    for qc in qc_data:
        print("inserting", analysis_id, qc["fastq_id"], qc["metrics_name"], qc["metrics_value"])
        GEN_DB.add_fastq_metadata(fastq_id=qc["fastq_id"],
                                  term_name=qc["metrics_name"],
                                  value=qc["metrics_value"],
                                  analysis_id=analysis_id)

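# Worked example of the template resolution above (template is hypothetical).
# With
#   path_template  = "typing/{analysis_name}/samples/{sample}/qc/report.html"
#   analysis_name  = "2021_01_15_typing", sample_name = "S1_41"
# the relative path stored in the LIMS is
#   "typing/2021_01_15_typing/samples/S1_41/qc/report.html"
# and, after stripping the leading workflow component, the file is expected at
#   <backup_folder>/2021_01_15_typing/samples/S1_41/qc/report.html
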
def parse_molis_xml(XML_TABLE):
    '''
    Parse a MOLIS SpreadsheetML (Excel XML) export into a pandas DataFrame.
    The first <Row> contains the column headers:

    <Cell ss:StyleID="th1"><Data ss:Type="String">N° de demande</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Période</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Numéro de demande</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Patient hospitalisé</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Demandeur</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Sexe</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Date de naissance</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Numéro de patient</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Unité de soins</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Numéro de projet</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Patient</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Numéro alias</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Référence externe</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Numéro patient externe</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Date saisie dem.</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Heure de saisie de la demande</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Date de réception</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Heure de réception</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Date prélèvement</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Heure de prélèvement</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Numéro de séjour</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Date dern. édition</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">L'heure du compte-rendu</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Remarque interne</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Remarque sur compte rendu</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Renseignements cliniques</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Statut validation niv. 2 (Méd.)</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Matériel</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Adresse</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">CodePostal</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">Ville</Data></Cell>
    <Cell ss:StyleID="th4"><Data ss:Type="String">Canton-Pays</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">COVTYP</Data></Cell>
    <Cell ss:StyleID="th3"><Data ss:Type="String">PREL</Data></Cell>
    '''
    from xml.dom import minidom

    GEN_DB = gendb_utils.DB()

    xmldoc = minidom.parse(XML_TABLE)
    itemlist = xmldoc.getElementsByTagName('Row')

    row_list = []
    for n, rows in enumerate(itemlist):
        item = rows.getElementsByTagName('Cell')
        if n == 0:
            # first row: column headers (empty cells become '')
            columns = [cells.childNodes[0].childNodes[0].nodeValue
                       if len(cells.childNodes[0].childNodes) > 0 else ''
                       for cells in item]
        else:
            row_list.append([cells.childNodes[0].childNodes[0].nodeValue
                             if len(cells.childNodes[0].childNodes) > 0 else ''
                             for cells in item])

    df = pandas.DataFrame(row_list)
    df.columns = columns

    return df

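# Minimal usage sketch (file name is hypothetical): parse a MOLIS export and
# select a few columns by the header names documented above.
#
# df = parse_molis_xml("molis_export.xml")
# print(df[["Numéro de demande", "Date prélèvement"]].head())
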
def backup(execution_folder,
           backup_folder,
           file_or_folder_list,
           analysis_id=False,
           analysis_name=False,
           fastq_list=False,
           output_selection=False,
           config=False,
           workflow_name=False,
           compress_ext=["fna", "faa", "gbk", "gbff", "vcf", "tsv", "csv", "gff"]):
    '''
    Analysis name: folder within execution_folder (generally the execution date)
    Analysis metadata: dictionary of statuses to add to the LIMS: {"analysis_id": <id>, "value": <value>}
    '''
    import shutil
    import glob
    import re

    if analysis_id:
        # update status
        print("analysis_id", analysis_id)
        gendb_utils.add_analysis_metadata(analysis_id, "airflow_execution_status", "running", update=True)

    print("backup_folder", backup_folder)

    # copy files and folders to the backup directory
    for n, output in enumerate(file_or_folder_list):
        print("BACKUP LIST", n, output)
        if isinstance(output, list):
            # list with source glob and target path:
            # copy matching files to the specified target directory
            print("LIST...")
            file_list = glob.glob(os.path.join(execution_folder, analysis_name, output[0]))
            target_dir = os.path.join(backup_folder, analysis_name, output[1])
            for one_file in file_list:
                target_abs_path = os.path.join(target_dir, os.path.basename(one_file))
                print("copy---", one_file, target_abs_path)
                copy_and_compress(one_file, target_abs_path, compress_ext)
        elif isinstance(output, dict):
            # more complex copy with renaming, e.g.:
            # {glob: samples/*/mapping/bwa/*_assembled_genome.bam,
            #  regex: .*/samples/(.*)/mapping/bwa/(.*)_assembled_genome.bam,
            #  vars: {1: 'sample', 2: 'reference'},
            #  target: "mapping/{sample}-vs-{reference}.bam",
            #  term: bam_file}
            print("DICT...")
            GEN_DB = gendb_utils.DB()
            file_list = glob.glob(os.path.join(execution_folder, analysis_name, output["glob"]))
            for one_file in file_list:
                # extract named variables from the file path with the regex groups
                s = re.search(output["regex"], one_file)
                term2value = {output["vars"][index]: s.group(index) for index in output["vars"]}
                term2value.update({'analysis_name': analysis_name})
                target_format = output["target"].format_map(term2value)
                # strip the leading {workflow} component: files live relative to backup_folder
                target_path_full = os.path.join(backup_folder, '/'.join(target_format.split("/")[1:]))
                # copy file to target location
                if not os.path.exists(os.path.dirname(target_path_full)):
                    os.makedirs(os.path.dirname(target_path_full))
                copy_and_compress(one_file, target_path_full, compress_ext)
                # save path in db
                if "term" in output:
                    fastq_id = term2value["sample"].split("_")[-1]
                    GEN_DB.add_fastq_metadata(fastq_id=fastq_id,
                                              term_name=output["term"],
                                              value=target_format,
                                              analysis_id=analysis_id)
        else:
            # simplest case: mirror an identical path
            print("MIRROR...")
            output = output.format(analysis_name=analysis_name)
            original = os.path.join(execution_folder, analysis_name, output)
            target = os.path.join(backup_folder, analysis_name, output)
            # copy and compress what can be compressed
            copy_and_compress(original, target, compress_ext)

    # save file paths into the database
    # can be either nested dictionaries or a single dictionary
    print("CONF", config, fastq_list)
    if config:
        if output_selection:
            output_selection = output_selection.split(",")
            metadata_lst = [value for key, value in config["WORKFLOW"][workflow_name]["PIPELINE_OUTPUT"]["ANALYSIS"].items()
                            if key in output_selection]
            analysis_metadata_name2template = {k: v for d in metadata_lst for k, v in d.items()}
            if fastq_list:
                metadata_lst = [value for key, value in config["WORKFLOW"][workflow_name]["PIPELINE_OUTPUT"]["INDIVIDUAL_SAMPLES"].items()
                                if key in output_selection]
                sample_metadata_name2template = {k: v for d in metadata_lst for k, v in d.items()}
        else:
            analysis_metadata_name2template = config["WORKFLOW"][workflow_name]["PIPELINE_OUTPUT"]["ANALYSIS"]
            if fastq_list:
                sample_metadata_name2template = config["WORKFLOW"][workflow_name]["PIPELINE_OUTPUT"]["INDIVIDUAL_SAMPLES"]

        print("backup path analysis", analysis_metadata_name2template)
        backup_output_files_analysis(analysis_metadata_name2template, analysis_name, analysis_id, backup_folder)

        if fastq_list:
            print("fastq_list", fastq_list)
            print("sample_metadata_name2template", sample_metadata_name2template)
            fastq_list = fastq_list.split(",")
            backup_output_files_samples(sample_metadata_name2template, fastq_list, analysis_name, analysis_id, backup_folder)

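# Worked example for the dict case above (values are hypothetical; note the
# target includes a leading workflow component, which the copy step strips).
# With
#   output = {"glob": "samples/*/mapping/bwa/*_assembled_genome.bam",
#             "regex": r".*/samples/(.*)/mapping/bwa/(.*)_assembled_genome.bam",
#             "vars": {1: "sample", 2: "reference"},
#             "target": "typing/{analysis_name}/mapping/{sample}-vs-{reference}.bam",
#             "term": "bam_file"}
# a file .../samples/S1_41/mapping/bwa/refX_assembled_genome.bam matches with
# sample="S1_41" and reference="refX", is copied to
#   <backup_folder>/<analysis_name>/mapping/S1_41-vs-refX.bam
# and the relative target path is stored in the LIMS under the term "bam_file"
# for fastq_id 41 (the suffix of the sample name).
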
def write_snakemake_config_file(analysis_name,
                                fastq_list,
                                execution_folder,
                                snakemake_config,
                                gen_db_path,
                                analysis_id,
                                reference_list=False,
                                check_single_species=False,
                                reference_docx=False,
                                additional_args=False):

    GEN_DB = gendb_utils.DB()

    # update status
    gendb_utils.add_analysis_metadata(analysis_id, "airflow_execution_status", "running", update=True)

    run_execution_folder = os.path.join(execution_folder, analysis_name)

    species_list = list(set(GEN_DB.get_fastq_id2species(fastq_list.split(",")).values()))
    print("species_list", species_list)

    if check_single_species:
        if len(species_list) > 1:
            raise IOError("More than one species in the dataset: %s" % ','.join(species_list))

    # if only one species, set scientific_name; otherwise "Mixed"
    if len(species_list) == 1:
        scientific_name = species_list[0]
    else:
        scientific_name = 'Mixed'

    print("reference list:", reference_list)
    # if references were provided, prepare the reference list
    if reference_list:
        reference_list = reference_list.split(",")
        fastq_df = GEN_DB.get_fastq_and_sample_data(reference_list)
        # check for external references (ids without a fastq entry in the database)
        ref_list = [ref for ref in reference_list if str(ref) not in fastq_df["fastq_id"].astype(str).to_list()]
        if len(ref_list) != 0:
            print(f"WARNING: external reference genome -- {ref_list[0]}")
        ref_list += [f'{row["sample_name"]}_{row["fastq_id"]}' for n, row in fastq_df.iterrows()]
        #if 'cgMLST' in reference_list:
        #    ref_list.append("cgMLST")

    print("additional_args", additional_args)

    with open(os.path.join(run_execution_folder, f'{analysis_name}.config'), 'w') as f:
        # update sample table name
        snakemake_config["local_samples"] = f'{analysis_name}.tsv'
        if reference_list:
            print("ref list", ref_list)
            snakemake_config["reference"] = f'{",".join(ref_list)}'
        snakemake_config["species"] = f'{scientific_name}'
        if reference_docx:
            snakemake_config["reference_docx"] = f'{reference_docx}'
        if additional_args:
            for arg in additional_args:
                snakemake_config[arg] = additional_args[arg]
        yaml.dump(snakemake_config, f)

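# Sketch of the resulting <analysis_name>.config (YAML). Keys beyond those set
# above come from the snakemake_config dict passed in; values here are
# hypothetical:
#
#   local_samples: 2021_01_15_typing.tsv
#   reference: ref_genome_7,S1_41
#   species: Staphylococcus aureus
#   reference_docx: /data/templates/report_template.docx
#   cores: 8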