def main(args):
    """Write and submit one "evaluete" job per assembler output of each sample.

    Every sub-directory of ``args.assembly_dir`` is treated as a sample. Its
    ``<sample>_assemble.yaml`` file is read back, a directory with the same
    name is created under the current working directory and, inside it, one
    directory per assembler sub-directory found in the sample's assembly
    folder. A ``<sample>_evaluete.yaml`` configuration is written there and
    passed to ``submit_job``. The working directory is restored to the
    starting directory after each sample.

    Args:
        args: parsed command-line options. ``assembly_dir``, ``lineage`` and
            ``threads`` are read here; the whole object is also forwarded to
            ``submit_job``.
    """
    projectFolder = os.getcwd()
    # Made absolute because the loops below change directory before the
    # assembly folders are read again.
    assemblies_dir = os.path.abspath(args.assembly_dir)
    lineage = args.lineage
    pipeline = "evaluete"
    busco_lineages = ['eukaryota', 'bacteria', 'vertebrata', 'fungi',
                      'metazoa', 'plant_early_release', 'arthropoda']

    sample_dirs = [entry for entry in os.listdir(assemblies_dir)
                   if os.path.isdir(os.path.join(assemblies_dir, entry))]
    for sample_dir_name in sample_dirs:
        # in this folder all the assemblies are stored
        assemblies_folder = os.path.join(assemblies_dir, sample_dir_name)
        # in this folder the validation is computed
        validation_folder = os.path.join(projectFolder, sample_dir_name)
        if not os.path.exists(validation_folder):
            os.makedirs(validation_folder)
        os.chdir(validation_folder)

        # Restore all the information stored in the sample yaml file that
        # lives in the assembly folder.
        sample_config_assembly_file = os.path.join(
            assemblies_folder, "{}_assemble.yaml".format(sample_dir_name))
        with open(sample_config_assembly_file) as sample_config_handle:
            # safe_load instead of yaml.load: the file only needs plain
            # scalars, lists and mappings, and yaml.load without an explicit
            # Loader can build arbitrary Python objects (and is rejected by
            # recent PyYAML releases).
            sample_config_assembly = yaml.safe_load(sample_config_handle)

        # One validation job per assembler folder; the sample sheet is not
        # consulted, the assembly file name is assumed from the "output" key.
        assembler_dirs = [
            entry for entry in os.listdir(assemblies_folder)
            if os.path.isdir(os.path.join(assemblies_folder, entry))]
        for assembler in assembler_dirs:
            if not os.path.exists(assembler):
                os.makedirs(assembler)
            os.chdir(assembler)
            assembly_name = os.path.join(
                assemblies_folder, assembler,
                "{}.scf.fasta".format(sample_config_assembly["output"]))
            sample_YAML_name = "{}_{}.yaml".format(sample_dir_name, pipeline)

            # The context manager guarantees the file is flushed and closed
            # before the job that reads it is submitted.
            with open(sample_YAML_name, 'w') as sample_YAML:
                sample_YAML.write("pipeline:\n")
                sample_YAML.write(" {}\n".format(pipeline))
                sample_YAML.write("tools:\n")
                if lineage == 'none':
                    sample_YAML.write(" [align, qaTools, FRC]\n")
                elif lineage in busco_lineages:
                    sample_YAML.write(" [align, qaTools, FRC, BUSCO]\n")
                    sample_YAML.write(
                        "BUSCODataPath: "
                        "/sw/apps/bioinfo/BUSCO/lineage_sets/{}\n".format(
                            lineage))
                # Any other lineage value leaves the "tools" entry empty.
                sample_YAML.write(
                    "output: {}\n".format(sample_config_assembly["output"]))
                sample_YAML.write(
                    "projectName: {}_validate\n".format(
                        sample_config_assembly["output"]))
                sample_YAML.write("kmer: \n")
                sample_YAML.write("threads: {}\n".format(args.threads))
                sample_YAML.write(
                    "genomeSize: {}\n".format(
                        sample_config_assembly["genomeSize"]))
                sample_YAML.write("minCtgLength: 1000\n")
                sample_YAML.write("reference: {}\n".format(assembly_name))
                sample_YAML.write("libraries:\n")
                libraries = sample_config_assembly["libraries"]
                for library, libraryData in libraries.items():
                    sample_YAML.write(" {}:\n".format(library))
                    # Fields are indented deeper than the library key so
                    # that YAML nests them under it.
                    sample_YAML.write(
                        "  pair1: {}\n".format(libraryData["pair1"]))
                    sample_YAML.write(
                        "  pair2: {}\n".format(libraryData["pair2"]))
                    sample_YAML.write(
                        "  orientation: {}\n".format(
                            libraryData["orientation"]))
                    sample_YAML.write(
                        "  insert: {}\n".format(libraryData["insert"]))
                    sample_YAML.write(
                        "  std: {}\n".format(libraryData["std"]))

            # Run the job
            extramodules = [
                "module load samtools\nmodule load bwa\n"
                "module load BUSCO/1.22\nsource $BUSCO_SETUP\n"]
            jobname = "{}_{}_{}".format(sample_dir_name, pipeline, assembler)
            submit_job(sample_YAML_name, jobname, os.getcwd(), args,
                       extramodules)
            os.chdir(validation_folder)
        os.chdir(projectFolder)
def main(args):
    """Write and submit one "QCcontrol" job for every sample directory.

    Every sub-directory of ``args.sample_data_dir`` is treated as a sample.
    A directory with the same name is created under the current working
    directory, the sample's gzipped read files are collected from its
    flowcell directories (either directly inside the sample directory or one
    level down inside single-capital-letter library-prep directories), paired
    by file name, and described in ``<sample>_QCcontrol.yaml``, which is then
    passed to ``submit_job``.

    Args:
        args: parsed command-line options. ``sample_data_dir``,
            ``reference``, ``threads``, ``adapter``, ``orientation``,
            ``insert`` and ``std`` are read here; the whole object is also
            forwarded to ``submit_job``.

    Raises:
        SystemExit: if a read file matches none of the supported naming
            schemes, or if its mate file is not among the collected files.
    """
    projectFolder = os.getcwd()
    # Made absolute because the loop changes directory before the sample
    # data directories are listed.
    samples_data_dir = os.path.abspath(args.sample_data_dir)
    projectName = os.path.basename(os.path.normpath(samples_data_dir))
    pipeline = "QCcontrol"
    # helper patterns for collecting flowcell and library-prep directories
    fc_pat, prep_pat = (r'^\d{6}_.*_?.*$', r'^[A-Z]$')
    # (read 1 tag, read 2 tag) for each supported naming scheme, in the
    # order in which they are tried
    read_tags = (("_1.fastq.gz", "_2.fastq.gz"),
                 ("R1_001.fastq.gz", "R2_001.fastq.gz"))

    def _get_expected_dir(path, pat):
        """Return the sub-directories of *path* whose name matches *pat*."""
        return [os.path.join(path, d) for d in os.listdir(path)
                if re.match(pat, d) and os.path.isdir(os.path.join(path, d))]

    def _read_pair(fastq):
        """Return (pair1, pair2) paths for *fastq*, or None if unrecognised."""
        folder, name = os.path.split(fastq)
        for tag1, tag2 in read_tags:
            if tag1 in name:
                return fastq, os.path.join(folder, name.replace(tag1, tag2))
            if tag2 in name:
                return os.path.join(folder, name.replace(tag2, tag1)), fastq
        return None

    sample_dirs = [entry for entry in os.listdir(samples_data_dir)
                   if os.path.isdir(os.path.join(samples_data_dir, entry))]
    for sample_dir_name in sample_dirs:
        sample_folder = os.path.join(projectFolder, sample_dir_name)
        if not os.path.exists(sample_folder):
            os.makedirs(sample_folder)
        os.chdir(sample_folder)

        # QC and alignment run in the same job; alignment needs a reference
        tools = ["trimmomatic", "fastqc", "abyss"]
        if args.reference is not None:
            tools.append("align")
        sample_YAML_name = os.path.join(
            sample_folder, "{}_{}.yaml".format(sample_dir_name, pipeline))

        sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)
        # collect flowcell directories
        flowcells_dirs = _get_expected_dir(sample_data_dir, fc_pat)
        # also support the layout where flowcells sit inside lib-prep dirs
        for prep_dir in _get_expected_dir(sample_data_dir, prep_pat):
            flowcells_dirs.extend(_get_expected_dir(prep_dir, fc_pat))

        sample_files = []
        for flowcell in flowcells_dirs:
            sample_files.extend(
                os.path.join(flowcell, f) for f in os.listdir(flowcell)
                if f.endswith(".gz") and os.path.isfile(
                    os.path.realpath(os.path.join(flowcell, f))))
        # os.listdir order is arbitrary; sort so library numbers are stable
        sample_files.sort()

        # Pair up the files before anything is written, so a naming problem
        # does not leave a half-written configuration behind.
        libraries = []
        while sample_files:
            current = sample_files[0]
            pair = _read_pair(current)
            if pair is None:
                sys.exit("file {} does not respect naming convection. "
                         "Exit!".format(current))
            for mate in pair:
                if mate not in sample_files:
                    sys.exit("file {} (mate of {}) is missing. "
                             "Exit!".format(mate, current))
                sample_files.remove(mate)
            libraries.append(pair)

        # The context manager guarantees the file is flushed and closed
        # before the job that reads it is submitted.
        with open(sample_YAML_name, 'w') as sample_YAML:
            sample_YAML.write("pipeline:\n")
            sample_YAML.write(" {}\n".format(pipeline))
            sample_YAML.write("tools:\n")
            sample_YAML.write(" {}\n".format(tools))
            # TODO: output must become sampleName
            sample_YAML.write("output: {}\n".format(sample_dir_name))
            sample_YAML.write("projectName: {}\n".format(projectName))
            sample_YAML.write("kmer: 35\n")
            sample_YAML.write("threads: {}\n".format(args.threads))
            sample_YAML.write("genomeSize: \n")
            sample_YAML.write("adapters: {}\n".format(args.adapter))
            if args.reference is not None:
                sample_YAML.write("reference: {}\n".format(args.reference))
            sample_YAML.write("libraries:\n")
            for library, (pair1_file, pair2_file) in enumerate(libraries, 1):
                sample_YAML.write(" lib{}:\n".format(library))
                # Fields are indented deeper than the library key so that
                # YAML nests them under it.
                sample_YAML.write("  pair1: {}\n".format(pair1_file))
                sample_YAML.write("  pair2: {}\n".format(pair2_file))
                sample_YAML.write(
                    "  orientation: {}\n".format(args.orientation))
                sample_YAML.write("  insert: {}\n".format(args.insert))
                sample_YAML.write("  std: {}\n".format(args.std))

        # Run the job
        extramodules = []
        if "abyss" in tools:
            extramodules.append("module load abyss/1.3.5\n")
        if "align" in tools:
            extramodules.append("module load samtools\nmodule load bwa\n")
        jobname = "{}_{}".format(sample_dir_name, pipeline)
        submit_job(sample_YAML_name, jobname, os.getcwd(), args,
                   extramodules)
        os.chdir(projectFolder)
def main(args):
    """Write and submit one "assemble" job for every sample directory.

    Every sub-directory of ``args.sample_data_dir`` is treated as a sample.
    A directory with the same name is created under the current working
    directory, the sample's read files are collected and paired by file
    name, and ``<sample>_assemble.yaml`` is written and passed to
    ``submit_job``.

    With ``args.afterqc`` set, the project name is taken from the sample's
    ``<sample>_QCcontrol.yaml`` and reads are taken from its
    ``results/fastq_trimmed`` directory; otherwise every sub-directory of the
    sample directory is scanned for gzipped files.

    Args:
        args: parsed command-line options. ``sample_data_dir``, ``afterqc``,
            ``assemblers``, ``kmer``, ``threads``, ``genomesize``,
            ``keep_tmp_files``, ``orientation``, ``insert`` and ``std`` are
            read here; the whole object is also forwarded to ``submit_job``.

    Raises:
        SystemExit: if the QC yaml file is required but missing, if a read
            file matches none of the supported naming schemes, or if its
            mate file is not among the collected files.
        KeyError: if an assembler has no entry in the module table.
    """
    projectFolder = os.getcwd()
    # Made absolute because the loop changes directory before the sample
    # data directories are read. (UPPMAX assumption: the directory name is
    # the project name.)
    samples_data_dir = os.path.abspath(args.sample_data_dir)
    projectName = os.path.basename(os.path.normpath(samples_data_dir))
    pipeline = "assemble"
    # str() so that the list repr written to the yaml has no u'' prefixes
    tools = [str(tool) for tool in args.assemblers]
    # NOTE(review): the masurca entry names two modules on one line --
    # confirm this is intended.
    all_modules = {
        "abyss": "module load abyss/1.3.5\n",
        "soapdenovo": "module load soapdenovo/2.04-r240\n",
        "spades": "module load spades/3.6.0\n",
        "cabog": "module load cabog/8.1\n",
        "allpaths": "module unload gcc\nmodule load allpathslg/52485\n",
        "masurca": "module load masurca MaSuRCA/2.3.2\n",
    }
    # (read 1 tag, read 2 tag) for each supported naming scheme, in the
    # order in which they are tried
    read_tags = (("_1.fastq.gz", "_2.fastq.gz"),
                 ("R1_001.fastq.gz", "R2_001.fastq.gz"))

    def _read_pair(fastq):
        """Return (pair1, pair2) paths for *fastq*, or None if unrecognised."""
        folder, name = os.path.split(fastq)
        for tag1, tag2 in read_tags:
            if tag1 in name:
                return fastq, os.path.join(folder, name.replace(tag1, tag2))
            if tag2 in name:
                return os.path.join(folder, name.replace(tag2, tag1)), fastq
        return None

    sample_dirs = [entry for entry in os.listdir(samples_data_dir)
                   if os.path.isdir(os.path.join(samples_data_dir, entry))]
    for sample_dir_name in sample_dirs:
        sample_folder = os.path.join(projectFolder, sample_dir_name)
        if not os.path.exists(sample_folder):
            os.makedirs(sample_folder)
        os.chdir(sample_folder)
        sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)

        # After QC the project name has to be retrieved from the QC yaml.
        if args.afterqc:
            QC_YAML_file = os.path.join(
                sample_data_dir, "{}_QCcontrol.yaml".format(sample_dir_name))
            if not os.path.exists(QC_YAML_file):
                sys.exit("Error file {} must exists!".format(QC_YAML_file))
            with open(QC_YAML_file) as QC_YAML_file_handle:
                # safe_load instead of yaml.load: only plain scalars, lists
                # and mappings are needed, and yaml.load without an explicit
                # Loader can build arbitrary Python objects (and is rejected
                # by recent PyYAML releases).
                QC_sample_config = yaml.safe_load(QC_YAML_file_handle)
            # TODO: use the sample sheet that must be present in the QC
            # folder to extract the project name
            projectName = QC_sample_config["projectName"]

        # Collect the read files; the layout differs after QC.
        sample_files = []
        if args.afterqc:
            fastq_files = os.path.join(
                sample_data_dir, "results", "fastq_trimmed")
            sample_files = [
                os.path.join(fastq_files, f) for f in os.listdir(fastq_files)
                if os.path.isfile(os.path.join(fastq_files, f))
                and re.search(r'[12]\.fastq\.gz$', f)]
        else:
            # every sub-directory is taken to be a flowcell
            flowcells_dirs = [
                os.path.join(sample_data_dir, flowcell)
                for flowcell in os.listdir(sample_data_dir)
                if os.path.isdir(os.path.join(sample_data_dir, flowcell))]
            for flowcell in flowcells_dirs:
                sample_files.extend(
                    os.path.join(flowcell, f) for f in os.listdir(flowcell)
                    if f.endswith(".gz")
                    and os.path.isfile(os.path.join(flowcell, f)))
        # os.listdir order is arbitrary; sort so library numbers are stable
        sample_files.sort()

        # Consume the list from the front instead of iterating over it while
        # removing entries, which skipped libraries.
        libraries = []
        while sample_files:
            current = sample_files[0]
            pair = _read_pair(current)
            if pair is None:
                sys.exit("file {} does not respect naming convection. "
                         "Exit!".format(current))
            for mate in pair:
                if mate not in sample_files:
                    sys.exit("file {} (mate of {}) is missing. "
                             "Exit!".format(mate, current))
                sample_files.remove(mate)
            libraries.append(pair)

        # Looked up before anything is written so an unknown assembler does
        # not leave a configuration without a job.
        extramodules = [all_modules[tool] for tool in tools]

        sample_YAML_name = os.path.join(
            sample_folder, "{}_{}.yaml".format(sample_dir_name, pipeline))
        # The context manager guarantees the file is flushed and closed
        # before the job that reads it is submitted.
        with open(sample_YAML_name, 'w') as sample_YAML:
            sample_YAML.write("pipeline:\n")
            sample_YAML.write(" {}\n".format(pipeline))
            sample_YAML.write("tools:\n")
            sample_YAML.write(" {}\n".format(tools))
            sample_YAML.write("output: {}\n".format(sample_dir_name))
            sample_YAML.write("projectName: {}\n".format(projectName))
            sample_YAML.write("kmer: {}\n".format(args.kmer))
            sample_YAML.write("threads: {}\n".format(args.threads))
            sample_YAML.write("genomeSize: {}\n".format(args.genomesize))
            if args.keep_tmp_files:
                # TODO: generalize if more flags are added
                sample_YAML.write("flags: ['keep_tmp_files']\n")
            sample_YAML.write("libraries:\n")
            for library, (pair1_file, pair2_file) in enumerate(libraries, 1):
                sample_YAML.write(" lib{}:\n".format(library))
                # Fields are indented deeper than the library key so that
                # YAML nests them under it.
                sample_YAML.write("  pair1: {}\n".format(pair1_file))
                sample_YAML.write("  pair2: {}\n".format(pair2_file))
                sample_YAML.write(
                    "  orientation: {}\n".format(args.orientation))
                sample_YAML.write("  insert: {}\n".format(args.insert))
                sample_YAML.write("  std: {}\n".format(args.std))

        # Run the job
        jobname = "{}_{}".format(sample_dir_name, pipeline)
        submit_job(sample_YAML_name, jobname, os.getcwd(), args,
                   extramodules)
        os.chdir(projectFolder)
def main(args):
    """Write and submit one "assemble" job for every sample directory.

    Every sub-directory of ``args.sample_data_dir`` is treated as a sample.
    A directory with the same name is created under the current working
    directory, the sample's read files are collected and paired by file
    name, and ``<sample>_assemble.yaml`` is written and passed to
    ``submit_job``.

    With ``args.afterqc`` set, the project name is taken from the sample's
    ``<sample>_QCcontrol.yaml`` and reads are taken from its ``Trimmomatic``
    directory; otherwise every sub-directory of the sample directory is
    scanned for gzipped files.

    Args:
        args: parsed command-line options. ``sample_data_dir``, ``afterqc``,
            ``assemblers``, ``kmer``, ``threads``, ``genomesize``,
            ``keep_tmp_files``, ``orientation``, ``insert`` and ``std`` are
            read here; the whole object is also forwarded to ``submit_job``.

    Raises:
        SystemExit: if the QC yaml file is required but missing, if a read
            file matches none of the supported naming schemes, or if its
            mate file is not among the collected files.
        KeyError: if an assembler has no entry in the module table.
    """
    projectFolder = os.getcwd()
    # Made absolute because the loop changes directory before the sample
    # data directories are read. (UPPMAX assumption: the directory name is
    # the project name.)
    samples_data_dir = os.path.abspath(args.sample_data_dir)
    projectName = os.path.basename(os.path.normpath(samples_data_dir))
    pipeline = "assemble"
    # str() so that the list repr written to the yaml has no u'' prefixes
    tools = [str(tool) for tool in args.assemblers]
    all_modules = {
        "abyss": "module load abyss/1.3.5\n",
        "soapdenovo": "module load soapdenovo/2.04-r240\n",
        "spades": "module load spades/3.6.0\n",
        "cabog": "module load cabog/8.1\n",
        "allpaths": "module unload gcc\nmodule load allpathslg/52485\n",
        "masurca": "module load MaSuRCA/2.3.2\n",
    }
    # (read 1 tag, read 2 tag) for each supported naming scheme, in the
    # order in which they are tried
    read_tags = (("_1.fastq.gz", "_2.fastq.gz"),
                 ("R1_001.fastq.gz", "R2_001.fastq.gz"))

    def _read_pair(fastq):
        """Return (pair1, pair2) paths for *fastq*, or None if unrecognised."""
        folder, name = os.path.split(fastq)
        for tag1, tag2 in read_tags:
            if tag1 in name:
                return fastq, os.path.join(folder, name.replace(tag1, tag2))
            if tag2 in name:
                return os.path.join(folder, name.replace(tag2, tag1)), fastq
        return None

    sample_dirs = [entry for entry in os.listdir(samples_data_dir)
                   if os.path.isdir(os.path.join(samples_data_dir, entry))]
    for sample_dir_name in sample_dirs:
        sample_folder = os.path.join(projectFolder, sample_dir_name)
        if not os.path.exists(sample_folder):
            os.makedirs(sample_folder)
        os.chdir(sample_folder)
        sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)

        # After QC the project name has to be retrieved from the QC yaml.
        if args.afterqc:
            QC_YAML_file = os.path.join(
                sample_data_dir, "{}_QCcontrol.yaml".format(sample_dir_name))
            if not os.path.exists(QC_YAML_file):
                sys.exit("Error file {} must exists!".format(QC_YAML_file))
            with open(QC_YAML_file) as QC_YAML_file_handle:
                # safe_load instead of yaml.load: only plain scalars, lists
                # and mappings are needed, and yaml.load without an explicit
                # Loader can build arbitrary Python objects (and is rejected
                # by recent PyYAML releases).
                QC_sample_config = yaml.safe_load(QC_YAML_file_handle)
            # TODO: use the sample sheet that must be present in the QC
            # folder to extract the project name
            projectName = QC_sample_config["projectName"]

        # Collect the read files; the layout differs after QC.
        sample_files = []
        if args.afterqc:
            fastq_files = os.path.join(sample_data_dir, "Trimmomatic")
            sample_files = [
                os.path.join(fastq_files, f) for f in os.listdir(fastq_files)
                if os.path.isfile(os.path.join(fastq_files, f))
                and re.search(r'[12]\.fastq\.gz$', f)]
        else:
            # every sub-directory is taken to be a flowcell
            flowcells_dirs = [
                os.path.join(sample_data_dir, flowcell)
                for flowcell in os.listdir(sample_data_dir)
                if os.path.isdir(os.path.join(sample_data_dir, flowcell))]
            for flowcell in flowcells_dirs:
                sample_files.extend(
                    os.path.join(flowcell, f) for f in os.listdir(flowcell)
                    if f.endswith(".gz")
                    and os.path.isfile(os.path.join(flowcell, f)))
        # os.listdir order is arbitrary; sort so library numbers are stable
        sample_files.sort()

        # Consume the list from the front instead of iterating over it while
        # removing entries, which skipped libraries.
        libraries = []
        while sample_files:
            current = sample_files[0]
            pair = _read_pair(current)
            if pair is None:
                sys.exit("file {} does not respect naming convection. "
                         "Exit!".format(current))
            for mate in pair:
                if mate not in sample_files:
                    sys.exit("file {} (mate of {}) is missing. "
                             "Exit!".format(mate, current))
                sample_files.remove(mate)
            libraries.append(pair)

        # Looked up before anything is written so an unknown assembler does
        # not leave a configuration without a job.
        extramodules = [all_modules[tool] for tool in tools]

        sample_YAML_name = os.path.join(
            sample_folder, "{}_{}.yaml".format(sample_dir_name, pipeline))
        # The context manager guarantees the file is flushed and closed
        # before the job that reads it is submitted.
        with open(sample_YAML_name, 'w') as sample_YAML:
            sample_YAML.write("pipeline:\n")
            sample_YAML.write(" {}\n".format(pipeline))
            sample_YAML.write("tools:\n")
            sample_YAML.write(" {}\n".format(tools))
            sample_YAML.write("output: {}\n".format(sample_dir_name))
            sample_YAML.write("projectName: {}\n".format(projectName))
            sample_YAML.write("kmer: {}\n".format(args.kmer))
            sample_YAML.write("threads: {}\n".format(args.threads))
            sample_YAML.write("genomeSize: {}\n".format(args.genomesize))
            if args.keep_tmp_files:
                # TODO: generalize if more flags are added
                sample_YAML.write("flags: ['keep_tmp_files']\n")
            sample_YAML.write("libraries:\n")
            for library, (pair1_file, pair2_file) in enumerate(libraries, 1):
                sample_YAML.write(" lib{}:\n".format(library))
                # Fields are indented deeper than the library key so that
                # YAML nests them under it.
                sample_YAML.write("  pair1: {}\n".format(pair1_file))
                sample_YAML.write("  pair2: {}\n".format(pair2_file))
                sample_YAML.write(
                    "  orientation: {}\n".format(args.orientation))
                sample_YAML.write("  insert: {}\n".format(args.insert))
                sample_YAML.write("  std: {}\n".format(args.std))

        # Run the job
        jobname = "{}_{}".format(sample_dir_name, pipeline)
        submit_job(sample_YAML_name, jobname, os.getcwd(), args,
                   extramodules)
        os.chdir(projectFolder)
def main(args):
    """Write and submit one "QCcontrol" job for every sample directory.

    Every sub-directory of ``args.sample_data_dir`` is treated as a sample.
    A directory with the same name is created under the current working
    directory, the sample's gzipped read files are collected from its
    flowcell directories (either directly inside the sample directory or one
    level down inside single-capital-letter library-prep directories), paired
    by file name, and described in ``<sample>_QCcontrol.yaml``, which is then
    passed to ``submit_job``.

    Args:
        args: parsed command-line options. ``sample_data_dir``,
            ``reference``, ``threads``, ``adapter``, ``orientation``,
            ``insert`` and ``std`` are read here; the whole object is also
            forwarded to ``submit_job``.

    Raises:
        SystemExit: if a read file matches none of the supported naming
            schemes, or if its mate file is not among the collected files.
    """
    projectFolder = os.getcwd()
    # Made absolute because the loop changes directory before the sample
    # data directories are listed.
    samples_data_dir = os.path.abspath(args.sample_data_dir)
    projectName = os.path.basename(os.path.normpath(samples_data_dir))
    pipeline = "QCcontrol"
    # helper patterns for collecting flowcell and library-prep directories
    fc_pat, prep_pat = (r'^\d{6}_.*_?.*$', r'^[A-Z]$')
    # (read 1 tag, read 2 tag) for each supported naming scheme, in the
    # order in which they are tried
    read_tags = (("_1.fastq.gz", "_2.fastq.gz"),
                 ("R1_001.fastq.gz", "R2_001.fastq.gz"))

    def _get_expected_dir(path, pat):
        """Return the sub-directories of *path* whose name matches *pat*."""
        return [os.path.join(path, d) for d in os.listdir(path)
                if re.match(pat, d) and os.path.isdir(os.path.join(path, d))]

    def _read_pair(fastq):
        """Return (pair1, pair2) paths for *fastq*, or None if unrecognised."""
        folder, name = os.path.split(fastq)
        for tag1, tag2 in read_tags:
            if tag1 in name:
                return fastq, os.path.join(folder, name.replace(tag1, tag2))
            if tag2 in name:
                return os.path.join(folder, name.replace(tag2, tag1)), fastq
        return None

    sample_dirs = [entry for entry in os.listdir(samples_data_dir)
                   if os.path.isdir(os.path.join(samples_data_dir, entry))]
    for sample_dir_name in sample_dirs:
        sample_folder = os.path.join(projectFolder, sample_dir_name)
        if not os.path.exists(sample_folder):
            os.makedirs(sample_folder)
        os.chdir(sample_folder)

        # QC and alignment run in the same job; alignment needs a reference
        tools = ["trimmomatic", "fastqc", "abyss"]
        if args.reference is not None:
            tools.append("align")
        sample_YAML_name = os.path.join(
            sample_folder, "{}_{}.yaml".format(sample_dir_name, pipeline))

        sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)
        # collect flowcell directories
        flowcells_dirs = _get_expected_dir(sample_data_dir, fc_pat)
        # also support the layout where flowcells sit inside lib-prep dirs
        for prep_dir in _get_expected_dir(sample_data_dir, prep_pat):
            flowcells_dirs.extend(_get_expected_dir(prep_dir, fc_pat))

        sample_files = []
        for flowcell in flowcells_dirs:
            sample_files.extend(
                os.path.join(flowcell, f) for f in os.listdir(flowcell)
                if f.endswith(".gz") and os.path.isfile(
                    os.path.realpath(os.path.join(flowcell, f))))
        # os.listdir order is arbitrary; sort so library numbers are stable
        sample_files.sort()

        # Pair up the files before anything is written, so a naming problem
        # does not leave a half-written configuration behind.
        libraries = []
        while sample_files:
            current = sample_files[0]
            pair = _read_pair(current)
            if pair is None:
                sys.exit("file {} does not respect naming convection. "
                         "Exit!".format(current))
            for mate in pair:
                if mate not in sample_files:
                    sys.exit("file {} (mate of {}) is missing. "
                             "Exit!".format(mate, current))
                sample_files.remove(mate)
            libraries.append(pair)

        # The context manager guarantees the file is flushed and closed
        # before the job that reads it is submitted.
        with open(sample_YAML_name, 'w') as sample_YAML:
            sample_YAML.write("pipeline:\n")
            sample_YAML.write(" {}\n".format(pipeline))
            sample_YAML.write("tools:\n")
            sample_YAML.write(" {}\n".format(tools))
            # TODO: output must become sampleName
            sample_YAML.write("output: {}\n".format(sample_dir_name))
            sample_YAML.write("projectName: {}\n".format(projectName))
            sample_YAML.write("kmer: 35\n")
            sample_YAML.write("threads: {}\n".format(args.threads))
            sample_YAML.write("genomeSize: \n")
            sample_YAML.write("adapters: {}\n".format(args.adapter))
            if args.reference is not None:
                sample_YAML.write("reference: {}\n".format(args.reference))
            sample_YAML.write("libraries:\n")
            for library, (pair1_file, pair2_file) in enumerate(libraries, 1):
                sample_YAML.write(" lib{}:\n".format(library))
                # Fields are indented deeper than the library key so that
                # YAML nests them under it.
                sample_YAML.write("  pair1: {}\n".format(pair1_file))
                sample_YAML.write("  pair2: {}\n".format(pair2_file))
                sample_YAML.write(
                    "  orientation: {}\n".format(args.orientation))
                sample_YAML.write("  insert: {}\n".format(args.insert))
                sample_YAML.write("  std: {}\n".format(args.std))

        # Run the job
        extramodules = []
        if "abyss" in tools:
            extramodules.append("module load abyss/1.3.5\n")
        if "align" in tools:
            extramodules.append("module load samtools\nmodule load bwa\n")
        jobname = "{}_{}".format(sample_dir_name, pipeline)
        submit_job(sample_YAML_name, jobname, os.getcwd(), args,
                   extramodules)
        os.chdir(projectFolder)