def runAssembly(fastq_frw, fastq_rev, sample_name, parent_dir, read_length, unicycler, cores):
    """Assemble a paired-end read set with Unicycler or SPAdes.

    Args:
        fastq_frw: Path to the forward-read FASTQ file.
        fastq_rev: Path to the reverse-read FASTQ file.
        sample_name: Sample name handed to ``FastqPaired``.
        parent_dir: Directory prefix under which the "Assembly" workspace is
            set up. The completion marker is written to
            ``parent_dir + "ASSEMBLY.txt"`` (plain concatenation, so the value
            presumably ends with a path separator -- confirm with callers).
        read_length: Forwarded to ``run_spades``; not used for Unicycler.
        unicycler: When truthy run Unicycler, otherwise SPAdes.
        cores: Forwarded to the selected assembler call.

    Raises:
        RuntimeError: If either FASTQ path is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (fastq_frw and fastq_rev
            and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev)):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "Assembly")
    logObject = uF.createLoggerObject(workspace + 'Assembly.log')

    FastqPairedObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)
    if unicycler:
        FastqPairedObj.run_unicycler(workspace, cores=cores)
    else:
        FastqPairedObj.run_spades(workspace, read_length=read_length, cores=cores)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "ASSEMBLY.txt", 'w') as conf_file:
        conf_file.write("Assembly: Module Completed Succesfully!")
def setup(sample_name, parent_dir, meta):
    """Create the sample workspace and store its meta information as JSON.

    ``meta`` is either a path to a two-column, tab-separated file (key, value
    per line) or an encoded JSON string in which ``QUOTES`` stands for ``"``
    and ``-COMMA-`` stands for ``,``.

    Args:
        sample_name: Name of the workspace set up under ``parent_dir``.
        parent_dir: Directory handed to ``uF.setupDirectory``.
        meta: Meta file path or encoded JSON string; surrounding double quotes
            are stripped first.

    Raises:
        RuntimeError: If ``meta`` is neither an existing file nor accepted by
            ``is_json`` after decoding.
        ValueError: If a line of the meta file does not split into exactly
            two tab-separated fields.
    """
    meta = meta.strip('"')
    decoded_meta = meta.replace("QUOTES", "\"").replace("-COMMA-", ",")

    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (os.path.isfile(meta) or is_json(decoded_meta)):
        sys.stderr.write("ERROR: Meta file does not exist! Exiting now ...\n")
        raise RuntimeError
    meta = decoded_meta

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, sample_name)

    # Build the payload before opening the output so a parse failure does not
    # leave an empty meta_information.json behind.
    if os.path.isfile(meta):
        meta_information = {}
        with open(meta) as om:
            for line in om:
                key, value = line.rstrip('\n').split('\t')
                meta_information[key] = value
        payload = json.dumps(meta_information)
    else:
        payload = meta

    # The original never closed this handle; ``with`` guarantees the flush.
    with open(workspace + "meta_information.json", 'w') as meta_info_file:
        meta_info_file.write(payload)

    # Written last so the marker only exists when every step above finished.
    with open(workspace + "SETUP.txt", 'w') as conf_file:
        conf_file.write("Setup: Module Completed Succesfully!")
def runBayesHammer(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir):
    """Run the ``error_correction`` step on single-end or paired-end reads.

    A valid single-end file takes precedence over a valid pair.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        sample_name: Sample name handed to ``Fastq`` / ``FastqPaired``.
        parent_dir: Directory prefix for the "BayesHammer" workspace and the
            ``BAYESHAMMER.txt`` marker (joined by plain concatenation).

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
    """
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "BayesHammer")
    logObject = uF.createLoggerObject(workspace + 'BayesHammer.log')

    if single_ok:
        # single-end error correction
        FastqObj = Fastq(fastq_sin, sample_name, logObject)
        FastqObj.error_correction(workspace)
    else:
        # paired-end error correction
        FastqPairedObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)
        FastqPairedObj.error_correction(workspace)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "BAYESHAMMER.txt", 'w') as conf_file:
        conf_file.write("BayesHammer: Module Completed Succesfully!")
def runNanoSample(nanopore_fastq, sample_dir, sample_name, fastqfilter_options, no_gzip):
    """Subsample Nanopore reads via ``Nanopore.run_fastqfilter``.

    Args:
        nanopore_fastq: Path to the Nanopore FASTQ file.
        sample_dir: Directory prefix for the "NanoSample" workspace and the
            ``NANOSAMPLE.txt`` marker (joined by plain concatenation).
        sample_name: Sample name handed to ``Nanopore``.
        fastqfilter_options: Option string; surrounding double quotes are
            stripped before it is forwarded.
        no_gzip: When truthy the step is called with ``compress=False``.

    Raises:
        RuntimeError: If ``nanopore_fastq`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (nanopore_fastq and os.path.isfile(nanopore_fastq)):
        raise RuntimeError(
            "ERROR: FASTQ input(s) were not provided. Please provide. Raising exception\n"
        )

    fastqfilter_options = fastqfilter_options.strip('"')

    # set up directory structure and logging
    workspace = uF.setupDirectory(sample_dir, "NanoSample")
    logObject = uF.createLoggerObject(workspace + 'NanoSample.log')

    # Subsample Nanopore reads using fastqfilter
    NanoporeObj = Nanopore(nanopore_fastq, sample_name, logObject)
    NanoporeObj.run_fastqfilter(workspace, options=fastqfilter_options,
                                compress=(not no_gzip))

    # Written last so the marker only exists when every step above finished.
    with open(sample_dir + "NANOSAMPLE.txt", 'w') as conf_file:
        conf_file.write("NanoSample: Module Completed Succesfully!")
def runPilon(bam_file, reference_fasta, sample_name, parent_dir, pilon_options, cores):
    """Run Pilon on an alignment through ``Alignment.run_pilon``.

    Args:
        bam_file: Path to the BAM file.
        reference_fasta: Reference FASTA forwarded to ``run_pilon``.
        sample_name: Sample name handed to ``Alignment``.
        parent_dir: Directory prefix for the "Pilon" workspace and the
            ``PILON.txt`` marker (joined by plain concatenation).
        pilon_options: Forwarded to ``run_pilon`` as ``options``.
        cores: Accepted for interface compatibility; not used by this function.

    Raises:
        RuntimeError: If ``bam_file`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (bam_file and os.path.isfile(bam_file)):
        sys.stderr.write(
            "ERROR: BAM input was not provided. Please provide. Exiting now ...\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "Pilon")
    logObject = uF.createLoggerObject(workspace + 'Pilon.log')

    # run Pilon on the alignment
    AlignmentObj = Alignment(bam_file, sample_name, logObject)
    AlignmentObj.run_pilon(workspace, reference_fasta, options=pilon_options)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "PILON.txt", 'w') as conf_file:
        conf_file.write("Pilon: Module Completed Succesfully!")
def runMLST(assembly, sample_name, parent_dir, identifier):
    """Run MLST on an assembly through ``Assembly.run_mlst``.

    NOTE(review): a second ``runMLST(fastq_frw, fastq_rev, ...)`` is visible in
    this source; if both live in the same module the later definition shadows
    this one -- confirm.

    Args:
        assembly: Path to the assembly file.
        sample_name: Sample name handed to ``Assembly``.
        parent_dir: Directory prefix for the workspace and the marker file
            (joined by plain concatenation).
        identifier: Optional suffix; when truthy it is appended (after ``_``)
            to both the workspace name and the marker file name.

    Raises:
        RuntimeError: If ``assembly`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (assembly and os.path.isfile(assembly)):
        sys.stderr.write("ERROR: Assembly does not seem to exist.\n")
        raise RuntimeError

    suffix = '_' + identifier if identifier else ''

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "Assembly_MLST" + suffix)
    logObject = uF.createLoggerObject(workspace + 'Assembly_MLST.log')

    # Run MLST
    AssemblyObj = Assembly(assembly, sample_name, logObject)
    AssemblyObj.run_mlst(workspace)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "ASSEMBLY_MLST" + suffix + ".txt", 'w') as conf_file:
        conf_file.write("Assembly MLST: Module Completed Succesfully!")
def runNanoQC(nanopore_fastq, nanopore_seqsum, nanopore_barcode, sample_dir, cores):
    """Run Nanopore QC: NanoPlot always, MinION QC when a summary file exists.

    Args:
        nanopore_fastq: Path to the Nanopore FASTQ file.
        nanopore_seqsum: Optional path to a sequencing summary file. When it
            is an existing file it is filtered with
            ``npF.filter_sequence_summary`` and passed to ``npF.run_minion_qc``.
        nanopore_barcode: Forwarded to ``npF.filter_sequence_summary``.
        sample_dir: Directory prefix for the "NanoQC" workspace and the
            ``NANOQC.txt`` marker (joined by plain concatenation).
        cores: Forwarded to ``npF.run_nanoplot_qc``.

    Raises:
        RuntimeError: If ``nanopore_fastq`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (nanopore_fastq and os.path.isfile(nanopore_fastq)):
        sys.stderr.write(
            "ERROR: FASTQ input(s) were not provided. Please provide. Raising exception\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(sample_dir, "NanoQC")
    logObject = uF.createLoggerObject(workspace + 'NanoQC.log')

    npF.run_nanoplot_qc(nanopore_fastq, workspace, logObject, cores=cores)

    # Truthiness guard first: os.path.isfile(None) raises TypeError, which
    # used to crash the run whenever no summary file was supplied.
    if nanopore_seqsum and os.path.isfile(nanopore_seqsum):
        sample_seqsum = npF.filter_sequence_summary(
            nanopore_seqsum, nanopore_barcode, workspace, logObject)
        npF.run_minion_qc(sample_seqsum, workspace, logObject)

    # Written last so the marker only exists when every step above finished.
    with open(sample_dir + "NANOQC.txt", 'w') as conf_file:
        conf_file.write("NanoQC: Module Completed Succesfully!")
def runStrainGST(fastq_sin, fastq_frw, fastq_rev, db, sample_name, parent_dir,
                 options_kmerize, options_straingst):
    """Kmerize reads and run StrainGST against a pan-genome database.

    A valid single-end file takes precedence over a valid pair.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        db: Path to the StrainGST database file.
        sample_name: Sample name handed to the read and ``Kmer`` objects.
        parent_dir: Directory prefix for the "StrainGST" workspace and the
            ``STRAINGST.txt`` marker (joined by plain concatenation).
        options_kmerize: Forwarded to ``kmerize`` as ``options``.
        options_straingst: Forwarded to ``run_straingst`` as ``options``.

    Raises:
        RuntimeError: If no usable FASTQ input exists or ``db`` is not a file.
    """
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit checks rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError
    if not (db and os.path.isfile(db)):
        sys.stderr.write(
            "ERROR: StrainGST pangenome database file is not available.")
        raise RuntimeError()

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "StrainGST")
    logObject = uF.createLoggerObject(workspace + 'StrainGST.log')

    # Build the read object for whichever input mode is available.
    if single_ok:
        FastqObj = Fastq(fastq_sin, sample_name, logObject)
    else:
        FastqObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # run strainge kmerize, then straingst on the resulting k-mer file
    kmer_file = FastqObj.kmerize(workspace, options=options_kmerize)
    KmerObj = Kmer(kmer_file, sample_name, logObject)
    KmerObj.run_straingst(workspace, db, options=options_straingst)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "STRAINGST.txt", 'w') as conf_file:
        conf_file.write("StrainGST: Module Completed Succesfully!")
def runAssemblyAdapterRemoval(assembly, sample_name, parent_dir, run_guinan,
                              gaemr_options, guinan_options, size_filter, identifier):
    """Format an assembly, optionally remove adapters, and size-filter contigs.

    Steps, in order: ``run_gaemr_formatter`` into an "Assembly_Formatted/"
    sub-workspace; when ``run_guinan`` is truthy, ``detect_adapters`` followed
    by ``remove_adapters``; finally ``filter_contigs_by_size``.

    Args:
        assembly: Path to the assembly file.
        sample_name: Sample name handed to ``Assembly``.
        parent_dir: Directory prefix for the workspace and the marker file
            (joined by plain concatenation).
        run_guinan: When truthy run adapter detection and removal.
        gaemr_options: Forwarded to ``detect_adapters`` as ``options``.
        guinan_options: Forwarded to ``remove_adapters`` as ``options``.
        size_filter: Forwarded to ``filter_contigs_by_size``.
        identifier: Optional suffix; when truthy it is appended (after ``_``)
            to both the workspace name and the marker file name.

    Raises:
        RuntimeError: If ``assembly`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (assembly and os.path.isfile(assembly)):
        sys.stderr.write("ERROR: Assembly does not seem to exist.\n")
        raise RuntimeError

    suffix = '_' + identifier if identifier else ''

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "Assembly_Adapter_Removal" + suffix)
    logObject = uF.createLoggerObject(workspace + 'Assembly_Adapter_Removal.log')

    AssemblyObj = Assembly(assembly, sample_name, logObject)

    # Run GAEMR formatting program in its own sub-workspace
    workspace_a = uF.setupDirectory(workspace, "Assembly_Formatted/")
    AssemblyObj.run_gaemr_formatter(workspace_a, reference_change=True)

    if run_guinan:
        # Detect adapters, then remove them using the generated commands file
        guinan_commands_file = AssemblyObj.detect_adapters(
            workspace, options=gaemr_options)
        AssemblyObj.remove_adapters(guinan_commands_file, workspace,
                                    options=guinan_options)

    # Run assembly filter by contig size
    AssemblyObj.filter_contigs_by_size(workspace, size_filter=size_filter)

    # Written last so the marker only exists when every step above finished.
    conf_file_name = parent_dir + "ASSEMBLY_ADAPTER_REMOVAL" + suffix + ".txt"
    with open(conf_file_name, 'w') as conf_file:
        conf_file.write("Assembly Adapter Removal: Module Completed Succesfully!")
def runFastQC(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir, cores):
    """Validate FASTQ input and run ``run_qc`` on it.

    A valid single-end file takes precedence over a valid pair.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        sample_name: Sample name handed to ``Fastq`` / ``FastqPaired``.
        parent_dir: Directory prefix for the "FastQC" workspace and the
            ``FASTQC.txt`` marker (joined by plain concatenation).
        cores: Forwarded to ``run_qc``.

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
        SystemExit: With status 1 if ``validate()`` reports invalid input.
    """
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "FastQC")
    logObject = uF.createLoggerObject(workspace + 'FastQC.log')

    if single_ok:
        # single-end QC
        FastqObj = Fastq(fastq_sin, sample_name, logObject)
        if not FastqObj.validate():
            sys.stderr.write(
                "ERROR: FASTQ file %s seems to be in invalid format. Exiting now...\n"
                % FastqObj.fastq)
            sys.exit(1)
        FastqObj.run_qc(workspace, cores=cores)
    else:
        # paired-end QC
        FastqPairedObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)
        if not FastqPairedObj.validate():
            sys.stderr.write(
                "ERROR: At least one of the FASTQ files seems to be in an invalid format. Exiting now ...\n"
            )
            sys.exit(1)
        FastqPairedObj.run_qc(workspace, cores=cores)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "FASTQC.txt", 'w') as conf_file:
        conf_file.write("FastQC: Module Completed Succesfully!")
def runRefAlignment(fastq_sin, fastq_frw, fastq_rev, reference_fasta, sample_name,
                    parent_dir, bwa_options, cores):
    """Align reads to a reference and post-process the alignment.

    After ``align_to_reference`` the result is passed through, in order:
    ``compress_sam``, ``sort_bam``, ``index_bam``, ``mark_dups`` and a second
    ``index_bam``. A valid single-end file takes precedence over a valid pair.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        reference_fasta: Path to the reference FASTA file.
        sample_name: Sample name handed to the read and ``Alignment`` objects.
        parent_dir: Directory prefix for the "ReferenceAlignment" workspace
            and the ``REFALIGNMENT.txt`` marker (plain concatenation).
        bwa_options: Forwarded to ``align_to_reference`` as ``options``.
        cores: Forwarded to ``align_to_reference``.

    Raises:
        RuntimeError: If no usable FASTQ input exists or ``reference_fasta``
            is not an existing file.
    """
    # Truthiness guards matter: os.path.isfile(None) raises TypeError, which
    # previously rejected valid paired input whenever fastq_sin was None.
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit checks rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError
    if not (reference_fasta and os.path.isfile(reference_fasta)):
        sys.stderr.write("ERROR: Reference FASTA file does not have the correct format.\n")
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "ReferenceAlignment")
    logObject = uF.createLoggerObject(workspace + 'ReferenceAlignment.log')

    # Build the read object for whichever input mode is available.
    if single_ok:
        reads_obj = Fastq(fastq_sin, sample_name, logObject)
    else:
        reads_obj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # Align reads to reference genome
    sam_file = reads_obj.align_to_reference(workspace, reference_fasta,
                                            options=bwa_options, cores=cores)

    AlignmentObj = Alignment(sam_file, sample_name, logObject)
    AlignmentObj.compress_sam(workspace, clean=True)   # compress SAM to BAM
    AlignmentObj.sort_bam(workspace, clean=True)       # sort BAM file
    AlignmentObj.index_bam(workspace)                  # index sorted BAM
    AlignmentObj.mark_dups(workspace, clean=True)      # mark duplicates
    AlignmentObj.index_bam(workspace)                  # index again after marking

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "REFALIGNMENT.txt", 'w') as conf_file:
        conf_file.write("Reference Alignment: Module Completed Succesfully!")
def runAMRP(fastq_frw, fastq_rev, sample_name, parent_dir, shortbred_markers,
            ariba_database, ariba_names):
    """Run AMR prediction on paired-end reads with ARIBA and/or ShortBRED.

    Args:
        fastq_frw: Path to the forward-read FASTQ file.
        fastq_rev: Path to the reverse-read FASTQ file.
        sample_name: Sample name handed to ``FastqPaired``.
        parent_dir: Directory prefix for the "AMRP" workspace and the
            ``AMRP.txt`` marker (joined by plain concatenation).
        shortbred_markers: When truthy, forwarded to ``shortbred_amrp``.
        ariba_database: Indexable collection of ARIBA databases; each entry is
            run with the name at the same index in ``ariba_names``.
        ariba_names: Names matching ``ariba_database`` position by position.

    Raises:
        RuntimeError: If the FASTQ pair is missing, if neither
            ``shortbred_markers`` nor ``ariba_database`` is given, or if only
            one of ``ariba_database`` / ``ariba_names`` is given.
    """
    # Explicit checks rather than ``assert`` so validation survives ``python -O``.
    if not (fastq_frw and fastq_rev
            and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev)):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError
    if not (shortbred_markers or ariba_database):
        sys.stderr.write(
            "ERROR: Some issue occurred with provided databases/options. Please check the input and retry.\n"
        )
        raise RuntimeError
    # Databases and names must be supplied together.
    if (ariba_database or ariba_names) and not (ariba_database and ariba_names):
        sys.stderr.write(
            "ERROR: ARIBA database provided without names or visa versa, either way please check the input and retry!.\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "AMRP")
    # NOTE(review): the log name is spelled 'AMPR' (not 'AMRP'); kept as-is
    # because the path is runtime behaviour other tooling may rely on.
    logObject = uF.createLoggerObject(workspace + 'AMPR.log')

    FastqPairedObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # Run AMR prediction analysis
    if ariba_database:
        for i, adb in enumerate(ariba_database):
            FastqPairedObj.ariba(workspace, name=ariba_names[i], ariba_db=adb)
    if shortbred_markers:
        FastqPairedObj.shortbred_amrp(workspace, shortbred_markers)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "AMRP.txt", 'w') as conf_file:
        conf_file.write("AMRP: Module Completed Succesfully!")
def runNanoCanu(nanopore_fastq, sample_dir, sample_name, canu_options, memory, cores):
    """Assemble Nanopore reads through ``Nanopore.run_canu``.

    For ``.gz`` input, ``Fastq.create_new_instance(..., compress=False,
    change_reference=True)`` is called first and the path it leaves in
    ``FastqObj.fastq`` is used as input and deleted afterwards (presumably an
    uncompressed copy in the workspace -- confirm against ``Fastq``).

    Args:
        nanopore_fastq: Path to the Nanopore FASTQ file (plain or ``.gz``).
        sample_dir: Directory prefix for the "Canu_Assembly" workspace and the
            ``CANU_ASSEMBLY.txt`` marker (joined by plain concatenation).
        sample_name: Sample name handed to ``Fastq`` / ``Nanopore``.
        canu_options: Option string; surrounding double quotes are stripped
            before it is forwarded to ``run_canu``.
        memory: Forwarded to ``run_canu``.
        cores: Forwarded to ``run_canu``.

    Raises:
        RuntimeError: If ``nanopore_fastq`` is empty or not an existing file.
    """
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (nanopore_fastq and os.path.isfile(nanopore_fastq)):
        raise RuntimeError(
            "ERROR: FASTQ input(s) were not provided properly. Please fix. Raising exception\n"
        )

    # The stripped value used to be stored in an unused ``unicycler_options``
    # local, so Canu received the still-quoted option string.
    canu_options = canu_options.strip('"')

    # set up directory structure and logging
    workspace = uF.setupDirectory(sample_dir, "Canu_Assembly")
    logObject = uF.createLoggerObject(workspace + 'Canu_Assembly.log')

    input_fastq = nanopore_fastq
    temp_fastq = None
    if nanopore_fastq.endswith('.gz'):
        FastqObj = Fastq(nanopore_fastq, sample_name, logObject)
        FastqObj.create_new_instance(workspace, compress=False, change_reference=True)
        input_fastq = temp_fastq = FastqObj.fastq

    try:
        # Run Canu for assembly
        NanoporeObj = Nanopore(input_fastq, sample_name, logObject)
        NanoporeObj.run_canu(workspace, options=canu_options, memory=memory, cores=cores)
    finally:
        if temp_fastq:
            # Best-effort cleanup of the temporary FASTQ, like ``rm -f``, but
            # without building a shell command from a path.
            try:
                os.remove(temp_fastq)
            except OSError:
                pass

    # Written last so the marker only exists when every step above finished.
    with open(sample_dir + "CANU_ASSEMBLY.txt", 'w') as conf_file:
        conf_file.write("Canu Assembly: Module Completed Succesfully!")
def runSortMeRNA(fastq_sin, fastq_frw, fastq_rev, database_dir, sample_name,
                 parent_dir, cores, no_gzip):
    """Split reads into ribosomal and non-ribosomal RNA via ``filter_ribo_rna``.

    A valid single-end file takes precedence over a valid pair.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        database_dir: Forwarded to ``filter_ribo_rna``.
        sample_name: Sample name handed to ``Fastq`` / ``FastqPaired``.
        parent_dir: Directory prefix for the "SortMeRNA" workspace and the
            ``SORTMERNA.txt`` marker (joined by plain concatenation).
        cores: Forwarded to ``filter_ribo_rna``.
        no_gzip: When truthy the step is called with ``compress=False``.

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
    """
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "SortMeRNA")
    logObject = uF.createLoggerObject(workspace + 'SortMeRNA.log')

    # Build the read object for whichever input mode is available.
    if single_ok:
        reads_obj = Fastq(fastq_sin, sample_name, logObject)
    else:
        reads_obj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # split up ribosomal and non-ribosomal RNA data
    reads_obj.filter_ribo_rna(workspace, database_dir, cores=cores,
                              compress=(not no_gzip))

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "SORTMERNA.txt", 'w') as conf_file:
        conf_file.write("SortMeRNA: Module Completed Succesfully!")
def runKneadData(fastq_sin, fastq_frw, fastq_rev, kneaddata_options, sample_name,
                 parent_dir, cores, no_gzip):
    """Run KneadData on single-end or paired-end reads via ``run_kneaddata``.

    A valid single-end file takes precedence over a valid pair.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        kneaddata_options: Forwarded to ``run_kneaddata`` as ``options``.
        sample_name: Sample name handed to ``Fastq`` / ``FastqPaired``.
        parent_dir: Directory prefix for the "KneadData" workspace and the
            ``KNEADDATA.txt`` marker (joined by plain concatenation).
        cores: Forwarded to ``run_kneaddata``.
        no_gzip: When truthy the step is called with ``compress=False``.

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
    """
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "KneadData")
    logObject = uF.createLoggerObject(workspace + 'KneadData.log')

    # Build the read object for whichever input mode is available.
    if single_ok:
        reads_obj = Fastq(fastq_sin, sample_name, logObject)
    else:
        reads_obj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # run KneadData on the reads
    reads_obj.run_kneaddata(workspace, options=kneaddata_options, cores=cores,
                            compress=(not no_gzip))

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "KNEADDATA.txt", 'w') as conf_file:
        conf_file.write("KneadData: Module Completed Succesfully!")
def storeInput(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir):
    """Copy the input FASTQ file(s) into ``LSARP_Results/Input/``.

    Copies are named ``<sample_name>_R1.processed.fastq`` (and ``_R2`` for the
    reverse read), with ``.gz`` appended when the source ends in ``.gz``. A
    valid single-end file takes precedence over a valid pair.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        sample_name: Prefix of the stored file names.
        parent_dir: Directory prefix for the "LSARP_Results/" workspace and
            the ``STOREINPUT.txt`` marker (joined by plain concatenation).

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
        OSError: If a copy fails (the shell ``cp`` used previously failed
            silently and the success marker was still written).
    """
    import shutil

    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace = uF.setupDirectory(parent_dir, "LSARP_Results/")
    workspace_input = uF.setupDirectory(workspace, 'Input/')

    # (source, read label) pairs for whichever input mode is available.
    if single_ok:
        sources = [(fastq_sin, '_R1')]
    else:
        sources = [(fastq_frw, '_R1'), (fastq_rev, '_R2')]

    for source, read_label in sources:
        target = workspace_input + sample_name + read_label + '.processed.fastq'
        if source.endswith('.gz'):
            target += '.gz'
        # shutil.copy instead of a shell ``cp`` string: safe for paths with
        # spaces or shell metacharacters, and failures are not ignored.
        shutil.copy(source, target)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "STOREINPUT.txt", 'w') as conf_file:
        conf_file.write("StoreInput: Module Completed Succesfully!")
def processGpDirectory(bam_location, sample_name, parent_dir, no_gzip):
    """Extract FASTQ reads from a BAM file or from a directory holding one.

    When ``bam_location`` is a directory, every regular entry that is not a
    ``.pdf``, ``.bam`` or ``.bai`` file is copied into the workspace
    (best-effort), and the directory must contain exactly one ``.bam`` file.

    Args:
        bam_location: Path to a BAM file or to a directory containing one.
        sample_name: Sample name handed to ``Alignment``.
        parent_dir: Directory prefix for the "ProcessGPDirectory" workspace
            and the ``GPPROCESS.txt`` marker (joined by plain concatenation).
        no_gzip: When truthy ``extract_fastqs`` is called with
            ``compress=False``.

    Raises:
        RuntimeError: If ``bam_location`` does not exist, or if a directory
            does not contain exactly one ``.bam`` file.
    """
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (bam_location
            and (os.path.isdir(bam_location) or os.path.isfile(bam_location))):
        sys.stderr.write("ERROR: BAM/GP directory does not exist! Exiting now ...\n")
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "ProcessGPDirectory")
    logObject = uF.createLoggerObject(workspace + 'ProcessGPDirectory.log')

    bam_location = os.path.abspath(bam_location)
    logObject.info('*' * 70)
    logObject.info("Beginning to convert BAM location %s to FASTQ(s)"
                   % (bam_location + '/'))

    input_bam = bam_location
    if os.path.isdir(bam_location):
        gp_directory = bam_location + '/'
        entries = os.listdir(gp_directory)

        # Copy metric files. A failed copy is deliberately non-fatal, but it
        # is now logged instead of being silently swallowed.
        for entry in entries:
            if (entry.endswith(('.pdf', '.bam', '.bai'))
                    or os.path.isdir(gp_directory + entry)):
                continue
            try:
                shutil.copy(gp_directory + entry, workspace + entry)
            except (IOError, OSError, shutil.Error) as err:
                logObject.warning("Could not copy metric file %s: %s",
                                  gp_directory + entry, err)

        bam_files = [gp_directory + f for f in entries if f.endswith('.bam')]
        if len(bam_files) != 1:
            # The message-less logObject.error() call used here before raised
            # TypeError and masked the intended RuntimeError.
            logObject.error("Expected exactly one BAM file in %s but found %d."
                            % (gp_directory, len(bam_files)))
            raise RuntimeError
        input_bam = bam_files[0]

    # extract reads from BAM
    AlignmentObj = Alignment(input_bam, sample_name, logObject)
    AlignmentObj.extract_fastqs(workspace, compress=(not no_gzip))

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "GPPROCESS.txt", 'w') as conf_file:
        conf_file.write("ProcessGPDirectory: Module Completed Succesfully!")
def runNanoUnicycler(nanopore_fastq, illumina_forward, illumina_reverse, sample_dir,
                     sample_name, unicycler_options, identifier, cores):
    """Run a Unicycler assembly from Nanopore reads plus an Illumina pair.

    Args:
        nanopore_fastq: Path to the Nanopore FASTQ file.
        illumina_forward: Path to the forward Illumina FASTQ file.
        illumina_reverse: Path to the reverse Illumina FASTQ file.
        sample_dir: Directory prefix for the workspace and the marker file
            (joined by plain concatenation).
        sample_name: Sample name handed to ``Nanopore``.
        unicycler_options: Option string; surrounding double quotes are
            stripped before it is forwarded to ``run_unicycler``.
        identifier: Optional suffix; when truthy it is appended (after ``_``)
            to both the workspace name and the marker file name.
        cores: Forwarded to ``run_unicycler``.

    Raises:
        RuntimeError: If the Nanopore file or either Illumina file is missing.
    """
    # Explicit checks rather than ``assert`` so validation survives ``python -O``.
    if not (nanopore_fastq and os.path.isfile(nanopore_fastq)):
        raise RuntimeError(
            "ERROR: FASTQ input(s) were not provided properly. Please fix. Raising exception\n"
        )
    if not (illumina_forward and illumina_reverse
            and os.path.isfile(illumina_forward)
            and os.path.isfile(illumina_reverse)):
        raise RuntimeError(
            "ERROR: Optional Illumina FASTQ input(s) / assembly were not provided properly. Please fix. Raising exception\n"
        )

    unicycler_options = unicycler_options.strip('"')
    suffix = '_' + identifier if identifier else ''

    # set up directory structure and logging
    workspace = uF.setupDirectory(sample_dir, "Unicycler_Assembly" + suffix)
    logObject = uF.createLoggerObject(workspace + 'Unicycler_Assembly.log')

    # Run Unicycler for assembly
    NanoporeObj = Nanopore(nanopore_fastq, sample_name, logObject)
    NanoporeObj.run_unicycler(illumina_forward, illumina_reverse, workspace,
                              options=unicycler_options, cores=cores)

    # Written last so the marker only exists when every step above finished.
    with open(sample_dir + "UNICYCLER_ASSEMBLY" + suffix + ".txt", 'w') as conf_file:
        conf_file.write("Unicycler Assembly: Module Completed Succesfully!")
def runCentrifuge(fastq_sin, fastq_frw, fastq_rev, centrifuge_index, sample_name,
                  parent_dir, cores):
    """Bin reads taxonomically with Centrifuge via ``bin_taxonomically``.

    A valid single-end file takes precedence over a valid pair.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        centrifuge_index: Forwarded to ``bin_taxonomically``.
        sample_name: Sample name handed to ``Fastq`` / ``FastqPaired``.
        parent_dir: Directory prefix for the "Centrifuge" workspace and the
            ``CENTRIFUGE.txt`` marker (joined by plain concatenation).
        cores: Forwarded to ``bin_taxonomically``.

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
    """
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "Centrifuge")
    logObject = uF.createLoggerObject(workspace + 'Centrifuge.log')

    # Build the read object for whichever input mode is available.
    if single_ok:
        reads_obj = Fastq(fastq_sin, sample_name, logObject)
    else:
        reads_obj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # bin reads taxonomically using Centrifuge
    reads_obj.bin_taxonomically(workspace, centrifuge_index, cores=cores)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "CENTRIFUGE.txt", 'w') as conf_file:
        conf_file.write("Centrifuge: Module Completed Succesfully!")
def runSymlinkInput(fastq_sin, fastq_frw, fastq_rev, parent_dir, sample_name):
    """Create symlinks to the input FASTQ file(s) via ``create_symlink``.

    A valid single-end file takes precedence over a valid pair. Note that
    ``parent_dir`` comes before ``sample_name`` in this signature.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        parent_dir: Directory prefix for the "Symlink_Input" workspace and the
            ``SYMLINK_INPUT.txt`` marker (joined by plain concatenation).
        sample_name: Sample name handed to ``Fastq`` / ``FastqPaired``.

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
    """
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "Symlink_Input")
    logObject = uF.createLoggerObject(workspace + 'Symlink.log')

    # Build the read object for whichever input mode is available.
    if single_ok:
        reads_obj = Fastq(fastq_sin, sample_name, logObject)
    else:
        reads_obj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # create symlink(s) inside the workspace
    reads_obj.create_symlink(workspace)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "SYMLINK_INPUT.txt", 'w') as conf_file:
        conf_file.write("SymlinkInput: Module Completed Succesfully!")
def runNanoMerge(nanopore_fastq, sample_dir, sample_name, barcode, no_gzip):
    """Stage Nanopore reads as one ``<sample_name>.fastq[.gz]`` file.

    A directory is handed to ``npF.concat_fastqs``. A single file is copied
    into the workspace and then compressed or decompressed so that the result
    is gzip-compressed unless ``no_gzip`` is truthy.

    Args:
        nanopore_fastq: Path to a FASTQ file or to a directory of FASTQ files.
        sample_dir: Directory prefix for the "NanoMerge" workspace and the
            ``NANOMERGE.txt`` marker (joined by plain concatenation).
        sample_name: Base name of the staged file.
        barcode: Forwarded to ``npF.concat_fastqs``.
        no_gzip: When truthy the staged file is left or made uncompressed.

    Raises:
        RuntimeError: If ``nanopore_fastq`` is empty or does not exist.
        subprocess.CalledProcessError: If ``gzip`` / ``gunzip`` fails (such
            failures were previously ignored).
    """
    import shutil
    import subprocess

    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (nanopore_fastq
            and (os.path.isfile(nanopore_fastq) or os.path.isdir(nanopore_fastq))):
        sys.stderr.write(
            "ERROR: FASTQ input(s) were not provided. Please provide. Raising exception\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(sample_dir, "NanoMerge")
    logObject = uF.createLoggerObject(workspace + 'NanoMerge.log')

    if os.path.isdir(nanopore_fastq):
        npF.concat_fastqs(nanopore_fastq, workspace, sample_name, logObject,
                          barcode=barcode, compress=(not no_gzip))
    elif os.path.isfile(nanopore_fastq):
        is_gz = nanopore_fastq.endswith('.gz')
        staged = workspace + sample_name + ('.fastq.gz' if is_gz else '.fastq')
        # Copy and (de)compress without a shell so paths with spaces or shell
        # metacharacters are handled and failures are not ignored.
        shutil.copy(nanopore_fastq, staged)
        # ``-f`` lets a rerun overwrite output left by an earlier run instead
        # of refusing to replace it.
        if is_gz and no_gzip:
            subprocess.check_call(['gunzip', '-f', staged])
        elif not is_gz and not no_gzip:
            subprocess.check_call(['gzip', '-f', staged])

    # Written last so the marker only exists when every step above finished.
    with open(sample_dir + "NANOMERGE.txt", 'w') as conf_file:
        conf_file.write("NanoMerge: Module Completed Succesfully!")
def runNanoTrim(nanopore_fastq, sample_dir, sample_name, no_gzip):
    """Trim adapters from Nanopore reads via ``Nanopore.run_nanotrim``.

    Args:
        nanopore_fastq: Path to a Nanopore FASTQ file; an existing directory
            also passes validation and is handed to ``Nanopore`` unchanged.
        sample_dir: Directory prefix for the "NanoTrim" workspace and the
            ``NANOTRIM.txt`` marker (joined by plain concatenation).
        sample_name: Sample name handed to ``Nanopore``.
        no_gzip: When truthy the step is called with ``compress=False``.

    Raises:
        RuntimeError: If ``nanopore_fastq`` is empty or does not exist.
    """
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (nanopore_fastq
            and (os.path.isfile(nanopore_fastq) or os.path.isdir(nanopore_fastq))):
        sys.stderr.write(
            "ERROR: FASTQ input(s) were not provided. Please provide. Raising exception\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(sample_dir, "NanoTrim")
    logObject = uF.createLoggerObject(workspace + 'NanoTrim.log')

    # Trim any adapters with PoreChop
    NanoporeObj = Nanopore(nanopore_fastq, sample_name, logObject)
    NanoporeObj.run_nanotrim(workspace, compress=(not no_gzip))

    # Written last so the marker only exists when every step above finished.
    with open(sample_dir + "NANOTRIM.txt", 'w') as conf_file:
        conf_file.write("NanoTrim: Module Completed Succesfully!")
def runTrimmomatic(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir,
                   trimmomatic_options, cores, no_gzip):
    """Quality-trim single-end or paired-end reads via ``quality_trim``.

    A valid single-end file takes precedence over a valid pair.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        sample_name: Sample name handed to ``Fastq`` / ``FastqPaired``.
        parent_dir: Directory prefix for the "QualityTrim" workspace and the
            ``QUALITYTRIM.txt`` marker (joined by plain concatenation).
        trimmomatic_options: Option string; surrounding double quotes are
            stripped before it is forwarded to ``quality_trim``.
        cores: Forwarded to ``quality_trim``.
        no_gzip: When truthy the step is called with ``compress=False``.

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
    """
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    trimmomatic_options = trimmomatic_options.strip('"')

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "QualityTrim")
    logObject = uF.createLoggerObject(workspace + 'QualityTrim.log')

    # Build the read object for whichever input mode is available.
    if single_ok:
        reads_obj = Fastq(fastq_sin, sample_name, logObject)
    else:
        reads_obj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # quality-trim the reads
    reads_obj.quality_trim(workspace, options=trimmomatic_options, cores=cores,
                           compress=(not no_gzip))

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "QUALITYTRIM.txt", 'w') as conf_file:
        conf_file.write("QualityTrim: Module Completed Succesfully!")
def runMLST(fastq_frw, fastq_rev, sample_name, parent_dir, ariba_db_dir):
    """Detect sequence types in paired-end reads with ARIBA.

    NOTE(review): a ``runMLST(assembly, ...)`` with a different signature is
    also visible in this source; if both live in the same module this later
    definition shadows it -- confirm.

    Args:
        fastq_frw: Path to the forward-read FASTQ file.
        fastq_rev: Path to the reverse-read FASTQ file.
        sample_name: Sample name handed to ``FastqPaired``.
        parent_dir: Directory prefix for the "MLST" workspace and the
            ``MLST.txt`` marker (joined by plain concatenation).
        ariba_db_dir: Existing directory forwarded to ``ariba`` as
            ``ariba_db``.

    Raises:
        RuntimeError: If the FASTQ pair is missing or ``ariba_db_dir`` is not
            an existing directory.
    """
    # Explicit checks rather than ``assert`` so validation survives ``python -O``.
    if not (fastq_frw and fastq_rev
            and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev)):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError
    if not (ariba_db_dir and os.path.isdir(ariba_db_dir)):
        sys.stderr.write(
            "ERROR: Some issue occurred with provided databases/options. Please check the input and retry.\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "MLST")
    logObject = uF.createLoggerObject(workspace + 'MLST.log')

    # Run ARIBA analysis to detect STs in raw reads
    FastqPairedObj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)
    FastqPairedObj.ariba(workspace, name='ariba_mlst', ariba_db=ariba_db_dir)

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "MLST.txt", 'w') as conf_file:
        conf_file.write("MLST: Module Completed Succesfully!")
def runSubsample(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir, reads,
                 bases, no_gzip):
    """Subsample reads by read count or downsample them by base count.

    ``reads`` wins over ``bases``; when neither is given, 100,000 reads are
    subsampled. A valid single-end file takes precedence over a valid pair.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        sample_name: Sample name handed to ``Fastq`` / ``FastqPaired``.
        parent_dir: Directory prefix for the "Subsample" workspace and the
            ``SUBSAMPLE.txt`` marker (joined by plain concatenation).
        reads: When truthy, forwarded to ``subsample``.
        bases: When truthy and ``reads`` is not, forwarded to ``downsample``.
        no_gzip: When truthy the step is called with ``compress=False``.

    Raises:
        RuntimeError: If neither a single file nor a complete pair exists.
    """
    default_reads = 100000

    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw and fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit check rather than ``assert`` so validation survives ``python -O``.
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    # set up directory structure and logging
    workspace = uF.setupDirectory(parent_dir, "Subsample")
    logObject = uF.createLoggerObject(workspace + 'Subsample.log')

    # Build the read object; the fallback log text differs per input mode.
    if single_ok:
        reads_obj = Fastq(fastq_sin, sample_name, logObject)
        fallback_msg = "No subsampling quantity specified defaulting to 100K reads being subsampled!"
    else:
        reads_obj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)
        fallback_msg = "No subsampling quantity specified defaulting to 100K "

    if reads:
        reads_obj.subsample(workspace, reads=reads, compress=(not no_gzip))
    elif bases:
        reads_obj.downsample(workspace, bases=bases, compress=(not no_gzip))
    else:
        logObject.error(fallback_msg)
        # Pass the announced default explicitly; the falsy ``reads`` value
        # used to be forwarded here despite the log message.
        reads_obj.subsample(workspace, reads=default_reads, compress=(not no_gzip))

    # Written last so the marker only exists when every step above finished.
    with open(parent_dir + "SUBSAMPLE.txt", 'w') as conf_file:
        conf_file.write("Subsample: Module Completed Succesfully!")
def runAdapterTrim(fastq_sin, fastq_frw, fastq_rev, sample_name, parent_dir,
                   trimgalore_options, cutadapt_options):
    """Trim adapters from FASTQ input with either cutadapt or Trim Galore.

    Works on a single FASTQ file or a forward/reverse pair; the single file
    takes precedence when it exists. cutadapt is used when
    ``cutadapt_options`` is non-empty, Trim Galore otherwise. Output goes to
    an ``AdapterTrim`` workspace under ``parent_dir`` and the
    ``ADAPTERTRIM.txt`` checkpoint file is written to ``parent_dir``.

    Args:
        fastq_sin: Path to a single-end FASTQ file, or a falsy value.
        fastq_frw: Path to the forward-read FASTQ file, or a falsy value.
        fastq_rev: Path to the reverse-read FASTQ file, or a falsy value.
        sample_name: Sample name handed to ``Fastq`` / ``FastqPaired``.
        parent_dir: Directory prefix for the workspace and checkpoint file
            (concatenated directly with file names).
        trimgalore_options: Option string for Trim Galore; surrounding double
            quotes are stripped.
        cutadapt_options: Option string for cutadapt; surrounding double
            quotes are stripped.

    Raises:
        RuntimeError: If no usable FASTQ input is given, or if options for
            both trimmers are supplied at once.
    """
    single_ok = bool(fastq_sin) and os.path.isfile(fastq_sin)
    paired_ok = (bool(fastq_frw) and bool(fastq_rev)
                 and os.path.isfile(fastq_frw) and os.path.isfile(fastq_rev))
    # Explicit check instead of ``assert`` (assertions vanish under -O).
    if not (single_ok or paired_ok):
        sys.stderr.write(
            "ERROR: FASTQ inputs were not provided. Please provide either a pair (frw and rev) or a single FASTQ file.\n"
        )
        raise RuntimeError

    trimgalore_options = trimgalore_options.strip('"')
    cutadapt_options = cutadapt_options.strip('"')

    if trimgalore_options and cutadapt_options:
        sys.stderr.write(
            "ERROR: Both filtering options with cutadapt and trim galore provided. Can only use one adapter trimmer. Exiting now ...\n"
        )
        raise RuntimeError

    # set up directory structure
    workspace_name = "AdapterTrim"
    workspace = uF.setupDirectory(parent_dir, workspace_name)

    # create logging object
    log_file = workspace + 'AdapterTrim.log'
    logObject = uF.createLoggerObject(log_file)

    if single_ok:
        # single-end adapter trimming
        fastq_obj = Fastq(fastq_sin, sample_name, logObject)
    else:
        # paired-end adapter trimming
        fastq_obj = FastqPaired(fastq_frw, fastq_rev, sample_name, logObject)

    # Both classes expose the same two trimming entry points.
    if cutadapt_options:
        fastq_obj.cutadapt_adapter_trim(workspace, options=cutadapt_options)
    else:
        fastq_obj.trim_galore_adapter_trim(workspace,
                                           options=trimgalore_options)

    # create successful completion file if steps completed!
    with open(parent_dir + "ADAPTERTRIM.txt", 'w') as conf_file:
        conf_file.write("AdapterTrim: Module Completed Succesfully!")
def reorganize(sample_dir):
    """Gather a sample's per-module outputs into ``LSARP_Results/``.

    Each section below is best-effort: when a module's results are missing
    the problem is logged and the remaining sections still run. Sections, in
    order: FastQC tables, Centrifuge table, ARIBA AMR reports, MLST report,
    de novo assembly, GAEMR assembly QC, Pilon output and StrainGST output.
    On completion the logger is closed and ``LSARP.txt`` is written to the
    sample directory.

    Args:
        sample_dir: Path to the sample directory. Its last path component is
            used as the sample name.

    Raises:
        RuntimeError: If ``sample_dir`` is not an existing directory.
    """
    # Explicit check instead of ``assert`` (assertions vanish under -O).
    if not (sample_dir and os.path.isdir(sample_dir)):
        sys.stderr.write(
            "ERROR: Sample directory doesn't seem to exist! Exiting now ...\n")
        raise RuntimeError
    sample_dir = os.path.abspath(sample_dir) + '/'

    # set up directory structure
    workspace_name = "LSARP_Results/"
    workspace = sample_dir + workspace_name
    if not os.path.isdir(workspace):
        workspace = uF.setupDirectory(sample_dir, workspace_name)

    # create logging object
    log_file = workspace + 'LSARP_Table_Creation.log'
    logObject = uF.createLoggerObject(log_file)

    sample = sample_dir.split('/')[-2]
    logObject.info("Creating easy upload formats for sample %s", sample)
    logObject.info("-" * 80)

    # ------------------------------------------------------------------
    # FASTQC Tables
    # ------------------------------------------------------------------
    logObject.info('Creating FastQC Data Tables.')
    logObject.info('-' * 80)
    FastQC_results = 'FastQC/'
    FastQC_results_workspace = workspace + FastQC_results
    fastqc_modules = [
        'Per base sequence quality', 'Per tile sequence quality',
        'Per sequence quality scores', 'Per base sequence content',
        'Per sequence GC content', 'Per base N content',
        'Sequence Length Distribution', 'Sequence Duplication Levels',
        'Overrepresented sequences', 'Adapter Content'
    ]
    try:
        # Sorted so the row order of the tables does not depend on the
        # arbitrary order os.listdir returns.
        fastqc_zipped_data_dirs = sorted(
            sample_dir + 'FastQC/' + zd
            for zd in os.listdir(sample_dir + 'FastQC/')
            if zd.endswith('.zip'))
        if not fastqc_zipped_data_dirs:
            raise IOError('no FastQC zip archives found')
        if not all(os.path.isfile(zd) for zd in fastqc_zipped_data_dirs):
            raise IOError('FastQC zip entry is not a regular file')
        if not os.path.isdir(FastQC_results_workspace):
            FastQC_results_workspace = uF.setupDirectory(
                workspace, FastQC_results)
    except Exception:
        logObject.error(
            'No FastQC results available or path is unable to be determined!')
    else:
        tmp_data_file = FastQC_results_workspace + 'tmp.txt'
        # Tables already written during this call. The first archive
        # truncates the table and writes the header; later archives append,
        # so rows of every read file are kept instead of being overwritten.
        tables_started = set()
        for zd in fastqc_zipped_data_dirs:
            with zipfile.ZipFile(zd) as z:
                for filename in z.namelist():
                    if filename.split('/')[-1] != 'fastqc_data.txt':
                        continue
                    # Fadapa is given a path, so extract the member to disk.
                    with z.open(filename) as fh, \
                            open(tmp_data_file, 'wb') as tmp_out:
                        for line in fh:
                            tmp_out.write(line)
                    fadapa = Fadapa(tmp_data_file)
                    for module in fastqc_modules:
                        try:
                            table_file = '_'.join(module.split())
                            cleaned_module_data = fadapa.clean_data(module)
                            if not cleaned_module_data:
                                continue
                            # Read label: archive name with the
                            # '<sample>_' prefix, the '_fastqc.zip' suffix
                            # and any extension removed.
                            read_label = zd.split('/')[-1].split(
                                sample + '_')[1].split(
                                    '_fastqc.zip')[0].split('.')[0]
                            table_path = (FastQC_results_workspace +
                                          table_file + '.table.txt')
                            first_write = table_path not in tables_started
                            mode = 'w' if first_write else 'a'
                            with open(table_path, mode) as table_handle:
                                tables_started.add(table_path)
                                for i, split_line in enumerate(
                                        cleaned_module_data):
                                    if i == 0:
                                        if not first_write:
                                            # header is already in the table
                                            continue
                                        row = ['sample', 'read'] + split_line
                                    else:
                                        row = [sample, read_label
                                               ] + split_line
                                    table_handle.write('\t'.join(row) + '\n')
                        except Exception:
                            # Best-effort per module: report it, keep going.
                            logObject.warning(
                                'Unable to create FastQC table for module %s from %s',
                                module, zd)
    # Remove the scratch copy of fastqc_data.txt, if one was written.
    FastQC_tmp_file = FastQC_results_workspace + 'tmp.txt'
    if os.path.isfile(FastQC_tmp_file):
        os.remove(FastQC_tmp_file)
    logObject.info('*' * 80)

    # ------------------------------------------------------------------
    # Centrifuge Tables
    # ------------------------------------------------------------------
    logObject.info('Creating Centrifuge Data Tables.')
    logObject.info('-' * 80)
    Centrifuge_results = 'Centrifuge/'
    Centrifuge_results_workspace = workspace + Centrifuge_results
    centrifuge_report_file = (sample_dir + 'Centrifuge/' + sample +
                              '_centrifuge_report.tsv')
    kraken_report_file = (sample_dir + 'Centrifuge/' + sample +
                          '_centrifuge_kraken_report.txt')
    try:
        if not (os.path.isfile(centrifuge_report_file)
                and os.path.isfile(kraken_report_file)):
            raise IOError('Centrifuge report files not found')
        if not os.path.isdir(Centrifuge_results_workspace):
            Centrifuge_results_workspace = uF.setupDirectory(
                workspace, Centrifuge_results)
        centrifuge_report_table_file = (Centrifuge_results_workspace +
                                        'centrifuge_report.table.txt')
        header = [
            'sample', 'taxonomy_name', 'taxonomy_level', 'taxonomy_rank',
            'taxonomy_id', 'genome_size', 'centrifuge_abundance',
            'percentage_of_fragments_recursively_covered',
            'number_of_fragments_recursively_included',
            'number_of_fragments_direct'
        ]
        with open(centrifuge_report_table_file, 'w') as table_handle:
            # Taxa missing from the Centrifuge report get 'NA' placeholders.
            centrifuge_report_data = defaultdict(lambda: ['NA'] * 6)
            with open(centrifuge_report_file) as report_handle:
                for i, line in enumerate(report_handle):
                    if i == 0:
                        continue  # skip the header row
                    (name, taxID, taxRank, genomeSize, numReads,
                     numUniqueReads,
                     abundance) = line.rstrip('\n').split('\t')
                    centrifuge_report_data[name] = [
                        taxID, taxRank, genomeSize, numReads, numUniqueReads,
                        abundance
                    ]
            table_handle.write('\t'.join(header) + '\n')
            with open(kraken_report_file) as kraken_handle:
                for line in kraken_handle:
                    line = line.rstrip('\n')
                    fields = line.split()
                    (prop, frag_recurse, frag_direct, tax_level,
                     tax_id) = fields[:5]
                    tax = ' '.join(fields[5:]).strip()
                    (taxID, taxRank, genomeSize, numReads, numUniqueReads,
                     abundance) = centrifuge_report_data[tax]
                    table_handle.write('\t'.join([
                        sample, tax, tax_level, taxRank, taxID, genomeSize,
                        abundance, prop, frag_recurse, frag_direct
                    ]) + '\n')
    except Exception:
        logObject.error('No Centrifuge results available!')
    logObject.info('*' * 80)

    # ------------------------------------------------------------------
    # AMRP Tables
    # ------------------------------------------------------------------
    logObject.info('Moving Results from ARIBA and ShortBRED AMR Searches.')
    logObject.info('-' * 80)
    AMRP_results = 'AMRP_Searches/'
    AMRP_results_workspace = workspace + AMRP_results
    try:
        AMRP_dir = sample_dir + 'AMRP/'
        if not os.path.isdir(AMRP_dir):
            raise IOError('AMRP directory not found')
        # The original tested ``workspace + AMRP_dir`` (workspace joined with
        # an absolute path), which never exists, so the results directory
        # was re-created on every call.
        if not os.path.isdir(AMRP_results_workspace):
            AMRP_results_workspace = uF.setupDirectory(workspace, AMRP_results)
        for sd in os.listdir(AMRP_dir):
            ariba_dir = AMRP_dir + sd + '/'
            ariba_report = ariba_dir + 'report.tsv'
            if os.path.isfile(ariba_report):
                ariba_result = (AMRP_results_workspace + sample + '_' + sd +
                                '_ariba_results.txt')
                os.system('cp %s %s' % (ariba_report, ariba_result))
    except Exception:
        logObject.error('Unable to create AMR prediction data tables.')
    logObject.info('*' * 80)

    # ------------------------------------------------------------------
    # MLST Tables
    # ------------------------------------------------------------------
    logObject.info('Creating MLST Data Tables.')
    logObject.info('-' * 80)
    MLST_results = 'MLST/'
    MLST_results_workspace = workspace + MLST_results
    try:
        MLST_dir = sample_dir + 'MLST/'
        MLST_result_file = MLST_dir + 'ariba_mlst/mlst_report.tsv'
        # Like the other sections, report a missing input instead of
        # creating an empty results directory and letting ``cp`` fail.
        if not os.path.isfile(MLST_result_file):
            raise IOError('MLST report not found')
        if not os.path.isdir(MLST_results_workspace):
            MLST_results_workspace = uF.setupDirectory(workspace, MLST_results)
        os.system('cp %s %s' % (MLST_result_file, MLST_results_workspace))
    except Exception:
        logObject.error('Unable to create MLST call data tables.')
    logObject.info('*' * 80)

    # ------------------------------------------------------------------
    # De Novo Assembly Storage
    # ------------------------------------------------------------------
    logObject.info('Moving de novo assembly to results directory.')
    logObject.info('-' * 80)
    Assembly_results = 'Assembly/'
    Assembly_results_workspace = workspace + Assembly_results
    try:
        Assembly_dir = sample_dir + 'Assembly/'
        Assembly_original_location = Assembly_dir + 'assembly.fasta'
        if not os.path.isfile(Assembly_original_location):
            # fall back to the other assembly file name
            Assembly_original_location = Assembly_dir + 'scaffolds.fasta'
        if not os.path.isfile(Assembly_original_location):
            raise IOError('assembly FASTA not found')
        if not os.path.isdir(Assembly_results_workspace):
            Assembly_results_workspace = uF.setupDirectory(
                workspace, Assembly_results)
        Assembly_new_location = (Assembly_results_workspace + sample +
                                 '.genome.fa')
        os.system('cp %s %s' %
                  (Assembly_original_location, Assembly_new_location))
    except Exception:
        logObject.error('Unable to move assembly to results directory.')
    logObject.info('*' * 80)

    # ------------------------------------------------------------------
    # Assembly QC Storage
    # ------------------------------------------------------------------
    logObject.info('Moving GAEMR assembly QC to results directory.')
    logObject.info('-' * 80)
    try:
        Assembly_QC_new_location = workspace + 'Assembly_QC/'
        Assembly_QC_original_dir = sample_dir + 'GAEMR/QC/'
        if not os.path.isdir(Assembly_QC_original_dir):
            raise IOError('GAEMR QC directory not found')
        os.system('cp -r %s %s' %
                  (Assembly_QC_original_dir, Assembly_QC_new_location))
    except Exception:
        logObject.error(
            'Unable to move GAEMR assembly QC to results directory.')
    logObject.info('*' * 80)

    # ------------------------------------------------------------------
    # Pilon Results Storage
    # ------------------------------------------------------------------
    logObject.info('Moving Pilon output to results directory.')
    logObject.info('-' * 80)
    try:
        Pilon_new_dir = workspace + 'Reference_Assembly_and_Variant_Calling/'
        Pilon_original_dir = sample_dir + 'Pilon/results/'
        if not os.path.isdir(Pilon_original_dir):
            raise IOError('Pilon results directory not found')
        os.system('cp -r %s %s' % (Pilon_original_dir, Pilon_new_dir))
        os.system('gzip %s*' % Pilon_new_dir)
    except Exception:
        logObject.error('Unable to move Pilon output to results directory.')
    logObject.info('*' * 80)

    # ------------------------------------------------------------------
    # StrainGST Results Storage
    # ------------------------------------------------------------------
    logObject.info('Moving StrainGST output to results directory.')
    logObject.info('-' * 80)
    try:
        Straingst_result_file = (sample_dir + 'StrainGST/' + sample +
                                 '.straingst_result.tsv')
        if not os.path.isfile(Straingst_result_file):
            raise IOError('StrainGST result file not found')
        Straingst_new_dir = 'StrainGST/'
        Straingst_results_workspace = workspace + Straingst_new_dir
        if not os.path.isdir(Straingst_results_workspace):
            Straingst_results_workspace = uF.setupDirectory(
                workspace, Straingst_new_dir)
        os.system('cp %s %s' %
                  (Straingst_result_file, Straingst_results_workspace))
    except Exception:
        logObject.error(
            'Unable to move StrainGST output to results directory.')
    logObject.info('*' * 80)

    uF.closeLoggerObject(logObject)

    # create successful completion file if steps completed!
    with open(sample_dir + "LSARP.txt", 'w') as conf_file:
        conf_file.write("LSARP Table Creation: Module Completed Succesfully!")
def _gaemr_mean_read_length(fastq_path, max_lines=None):
    """Return the mean sequence-line length of a FASTQ file, or None if empty.

    Only the second line of every four-line record is measured. When
    ``max_lines`` is given, reading stops after the line with that 0-based
    index. Files ending in ``.gz`` are opened with ``gzip``.
    """
    opener = gzip.open if fastq_path.endswith(".gz") else open
    total_length = 0
    read_count = 0
    with opener(fastq_path, 'rt') as handle:
        for i, line in enumerate(handle):
            if max_lines is not None and i > max_lines:
                # nothing past this point is used; stop reading the file
                break
            if i % 4 == 1:
                total_length += len(line.strip())
                read_count += 1
    if read_count == 0:
        return None
    return total_length / float(read_count)


def _gaemr_insert_size_from_metrics(metrics_path):
    """Return the insert size parsed from an insert-size metrics file.

    The value is the sixth tab-separated field of the line that follows the
    line starting with ``MEDIAN_INSERT_SIZE``. Returns None when no such
    header line (or no line after it) exists.
    NOTE(review): which metric sits in the sixth column depends on the
    metrics file layout - confirm against the files produced upstream.
    """
    header_observed = False
    with open(metrics_path) as handle:
        for line in handle:
            fields = line.strip().split('\t')
            if fields[0].startswith("MEDIAN_INSERT_SIZE"):
                header_observed = True
                continue
            if header_observed:
                return int(float(fields[5]))
    return None


def runAssemblyQC(assembly, sample_name, sample_dir, format_options,
                  qc_options, cores, illumina_frw, illumina_rev,
                  picard_insert_file, nanopore_fastq, gaemr_ont, identifier):
    """Format an assembly and run GAEMR QC on it.

    Creates a ``GAEMR`` (or ``GAEMR_<identifier>``) workspace under
    ``sample_dir``, runs the GAEMR formatter, writes ``read_list.txt``
    describing the available read sets, runs GAEMR QC (the ONT variant when
    both ``nanopore_fastq`` and ``gaemr_ont`` are truthy) and writes the
    matching ``GAEMR[_<identifier>].txt`` checkpoint file to ``sample_dir``.

    Illumina reads come from ``illumina_frw``/``illumina_rev`` when both
    exist, otherwise from the ``_R1.``/``_R2.`` FASTQ files in
    ``<sample_dir>Subsample``. The insert size comes from
    ``picard_insert_file`` when it exists, otherwise from the single
    ``*.insert_size_metrics`` file in ``<sample_dir>ProcessGPDirectory``.
    Defaults of 250 (read length) and 400 (insert size) apply when nothing
    can be measured.

    Args:
        assembly: Path to the assembly file.
        sample_name: Sample name handed to ``AssemblyAnalyzer.Assembly``.
        sample_dir: Sample directory prefix (concatenated directly with
            sub-directory names).
        format_options: Option string for the GAEMR formatter; surrounding
            double quotes are stripped.
        qc_options: Option string for GAEMR QC; surrounding double quotes are
            stripped.
        cores: Number of cores passed to GAEMR QC.
        illumina_frw: Path to the forward Illumina FASTQ, or a falsy value.
        illumina_rev: Path to the reverse Illumina FASTQ, or a falsy value.
        picard_insert_file: Path to an insert-size metrics file, or falsy.
        nanopore_fastq: Path to a Nanopore FASTQ, or a falsy value.
        gaemr_ont: Whether to include the Nanopore reads in QC.
        identifier: Optional suffix for the workspace and checkpoint names.

    Raises:
        RuntimeError: If the assembly does not exist, or a Nanopore FASTQ
            path is given but does not exist.
    """
    # Explicit checks instead of ``assert`` (assertions vanish under -O).
    if not (assembly and os.path.isfile(assembly)):
        sys.stderr.write("ERROR: Assembly does not exist. Raising exception\n")
        raise RuntimeError

    if nanopore_fastq and not os.path.isfile(nanopore_fastq):
        sys.stderr.write(
            "ERROR: Nanopore FASTQ provided but the path does not exist. Raising exception\n"
        )
        raise RuntimeError

    format_options = format_options.strip('"')
    qc_options = qc_options.strip('"')

    # set up directory structure
    workspace_name = "GAEMR"
    if identifier:
        workspace_name += '_' + identifier
    workspace = uF.setupDirectory(sample_dir, workspace_name)

    # create logging object
    log_file = workspace + 'GAEMR.log'
    logObject = uF.createLoggerObject(log_file)

    # Format Assembly for GAEMR QC Analysis
    workspace_a = uF.setupDirectory(workspace, "Formatting")
    AssemblyObj = AssemblyAnalyzer.Assembly(assembly, sample_name, logObject)
    AssemblyObj.run_gaemr_formatter(workspace_a,
                                    options=format_options,
                                    reference_change=True)

    # ---- locate Illumina reads and estimate their mean length ----
    frw_read = None
    rev_read = None
    illumina_avg_read_length = 250
    illumina_avg_insert_length = 400
    if illumina_frw and illumina_rev and os.path.isfile(
            illumina_frw) and os.path.isfile(illumina_rev):
        frw_read = illumina_frw
        rev_read = illumina_rev
    elif os.path.isdir(sample_dir + 'Subsample'):
        subsample_fastqs = [
            x for x in os.listdir(sample_dir + 'Subsample')
            if x.endswith('.fastq.gz') or x.endswith('.fastq')
        ]
        frw_read = [
            sample_dir + 'Subsample/' + x for x in subsample_fastqs
            if '_R1.' in x
        ][0]
        rev_read = [
            sample_dir + 'Subsample/' + x for x in subsample_fastqs
            if '_R2.' in x
        ][0]
    if frw_read:
        # Sample the start of the forward file only; keep the default when
        # the file holds no reads (previously a ZeroDivisionError).
        measured_length = _gaemr_mean_read_length(frw_read, max_lines=40000)
        if measured_length is not None:
            illumina_avg_read_length = measured_length

    # ---- locate insert-size metrics ----
    insert_stats_file = None
    if picard_insert_file and os.path.isfile(picard_insert_file):
        insert_stats_file = picard_insert_file
    elif os.path.isdir(sample_dir + 'ProcessGPDirectory'):
        insert_stats_file_query = [
            sample_dir + 'ProcessGPDirectory/' + x
            for x in os.listdir(sample_dir + 'ProcessGPDirectory')
            if x.endswith('.insert_size_metrics')
        ]
        if len(insert_stats_file_query) == 1:
            insert_stats_file = insert_stats_file_query[0]
    if insert_stats_file:
        # The shared parser initialises its header flag; the original left
        # it unset on the picard_insert_file path (UnboundLocalError).
        measured_insert = _gaemr_insert_size_from_metrics(insert_stats_file)
        if measured_insert is not None:
            illumina_avg_insert_length = measured_insert

    # ---- estimate Nanopore read length over the whole file ----
    use_nanopore = bool(nanopore_fastq and gaemr_ont)
    nanopore_avg_read_length = 2000
    if use_nanopore:
        measured_length = _gaemr_mean_read_length(nanopore_fastq)
        if measured_length is not None:
            nanopore_avg_read_length = measured_length

    # Generate read list file for QC
    read_list = workspace + 'read_list.txt'
    with open(read_list, 'w') as outf:
        outf.write("#name,lib_type,mean_read_length,dir,insert_size,files\n")
        if frw_read and rev_read:
            outf.write('Fragments,fragment,%d,fr,%d,%s,%s\n' %
                       (illumina_avg_read_length, illumina_avg_insert_length,
                        frw_read, rev_read))
        if use_nanopore:
            # unpaired reads: the insert size is reported as the read length
            nanopore_avg_insert_size = nanopore_avg_read_length
            outf.write('Long,unpaired,%d,,%d,%s\n' %
                       (nanopore_avg_read_length, nanopore_avg_insert_size,
                        nanopore_fastq))

    # Run GAEMR QC
    workspace_b = uF.setupDirectory(workspace, "QC")
    if use_nanopore:
        AssemblyObj.run_gaemr_qc_ont(read_list,
                                     workspace_b,
                                     options=qc_options,
                                     cores=cores)
    else:
        AssemblyObj.run_gaemr_qc(read_list,
                                 workspace_b,
                                 options=qc_options,
                                 cores=cores)

    # create successful completion file if steps completed!
    conf_file_name = sample_dir + "GAEMR"
    if identifier:
        conf_file_name += '_' + identifier
    conf_file_name += ".txt"
    with open(conf_file_name, 'w') as conf_file:
        conf_file.write("GAEMR: Module Completed Succesfully!")
def reorganize(sample_dir, reorganize, sample_name):
    """Optionally tidy a finished sample directory, then mark completion.

    When ``reorganize`` is truthy:
      * the ``GAEMR_<run>`` and ``Assembly_MLST_<run>`` directories of each
        known run are moved under ``Assembly_Results/<run label>``;
      * every other sub-directory of the sample directory is moved into
        ``Intermediate_Subdirectories/``;
      * every ``.txt`` file directly inside the sample directory is moved
        into ``Checkpoint_Files/``;
      * ``REORGANIZATION.txt`` is written to the sample directory.
    ``COMPLETION.txt`` is then written to the sample directory.

    Args:
        sample_dir: Path to the sample directory.
        reorganize: Whether to perform the reorganization steps.
        sample_name: Sample name, used in log messages only.

    Raises:
        RuntimeError: If ``sample_dir`` is not an existing directory, or if
            listing the sample directory fails while moving intermediate
            directories or checkpoint files.
    """
    # Explicit check instead of ``assert`` (assertions vanish under -O).
    if not (sample_dir and os.path.isdir(sample_dir)):
        sys.stderr.write(
            "ERROR: Sample directory doesn't seem to exist! Exiting now ...\n")
        raise RuntimeError
    sample_dir = os.path.abspath(sample_dir) + '/'

    if reorganize:
        # set up directory structure
        workspace_name = "Assembly_Results/"
        workspace = uF.setupDirectory(sample_dir,
                                      workspace_name,
                                      panic_if_exists=False)

        # create logging object
        log_file = workspace + 'Reorganization.log'
        logObject = uF.createLoggerObject(log_file)

        logObject.info("Creating easy upload formats for sample %s",
                       sample_name)
        logObject.info("-" * 80)

        # (run suffix used in directory names, label used in the results)
        run_labels = [
            ('full-np', 'Unicycler_All-ONT'),
            ('sub-np', 'Unicycler_Subsampled-ONT'),
            ('canu', 'Canu_Pure-ONT'),
        ]
        for run, run_name in run_labels:
            new_location = os.path.abspath(workspace + run_name)

            # De Novo Assembly + QC Storage
            logObject.info(
                'Moving GAEMR folder for run %s to results directory.' % run)
            logObject.info('-' * 80)
            original_dir = os.path.abspath(sample_dir + 'GAEMR_' + run) + '/'
            if os.path.isdir(original_dir):
                os.system('mv %s %s' % (original_dir, new_location))
            else:
                logObject.warning(
                    'Unable to move GAEMR directory for run %s to results directory.'
                    % run_name)

            # MLST Results Storage
            logObject.info(
                'Moving MLST folder for run %s to results directory.' % run)
            logObject.info('-' * 80)
            original_dir = os.path.abspath(sample_dir + 'Assembly_MLST_' +
                                           run) + '/'
            if os.path.isdir(original_dir):
                os.system('mv %s %s' % (original_dir, new_location))
            else:
                logObject.warning(
                    'Unable to move Assembly_MLST directory for run %s to results directory.'
                    % run_name)
        logObject.info('*' * 80)

        intermediate_workspace_name = 'Intermediate_Subdirectories/'
        intermediate_workspace = uF.setupDirectory(
            sample_dir, intermediate_workspace_name)
        logObject.info(
            "Moving intermediate subdirectories of workflow to directory %s" %
            intermediate_workspace)
        # The freshly created intermediate directory is itself listed by
        # os.listdir; skip it so it is not moved into itself.
        kept_in_place = ('Assembly_Results',
                         intermediate_workspace_name.rstrip('/'))
        try:
            for sub in os.listdir(sample_dir):
                sub_dir = os.path.abspath(sample_dir + sub) + '/'
                if os.path.isdir(sub_dir) and sub not in kept_in_place:
                    os.system('mv %s %s' % (sub_dir, intermediate_workspace))
        except Exception:
            logObject.error(
                "Something went wrong when moving intermediate directories.")
            raise RuntimeError()
        logObject.info('*' * 80)

        checkpoint_workspace_name = 'Checkpoint_Files/'
        checkpoint_workspace = uF.setupDirectory(sample_dir,
                                                 checkpoint_workspace_name)
        logObject.info("Moving checkpoint files of workflow to directory %s" %
                       checkpoint_workspace)
        try:
            for f in os.listdir(sample_dir):
                checkpoint_file = os.path.abspath(sample_dir + f)
                if os.path.isfile(checkpoint_file) and f.endswith('.txt'):
                    os.system('mv %s %s' %
                              (checkpoint_file, checkpoint_workspace))
        except Exception:
            logObject.error(
                "Something went wrong when moving checkpoint files.")
            raise RuntimeError()

        uF.closeLoggerObject(logObject)

        # create successful reorganization file if steps completed!
        with open(sample_dir + "REORGANIZATION.txt", 'w') as conf_file:
            conf_file.write("Reorganization was Completed Successfully!")

    # create successful completion file if steps completed!
    with open(sample_dir + "COMPLETION.txt", 'w') as conf_file:
        conf_file.write("Completion: Module Completed Successfully!")