def quast(self):
    """
    Create the quast system call for each sample with an assembly, and run the analyses in threads.

    One daemon thread targeting :self.runquast is started for every sample whose best assembly file is
    not 'NA'. The system call is stored in sample.commands.quast ('NA' for samples without an assembly),
    and a (sample, output directory) tuple is added to :self.quastqueue for every call that was created.
    The method finishes by calling join() on :self.quastqueue.
    """
    printtime('Performing Quast analyses', self.start)
    # Start one worker thread for every sample that has an assembly
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            # Daemon threads do not prevent the interpreter from exiting. The daemon keyword replaces the
            # deprecated Thread.setDaemon() call
            worker = Thread(target=self.runquast, args=(), daemon=True)
            worker.start()
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            # Create the quast output directory
            quastoutputdirectory = '{}/quast_results/'.format(sample.general.outputdirectory)
            make_path(quastoutputdirectory)
            # Set the quast system call
            quastcall = 'quast.py {} -o {}'.format(sample.general.filteredfile, quastoutputdirectory)
            # Add the command to the metadata
            sample.commands.quast = quastcall
            self.quastqueue.put((sample, quastoutputdirectory))
        else:
            sample.commands.quast = 'NA'
    # Block until every queued item has been marked as done
    self.quastqueue.join()
def test_sistr(variables):
    """
    Run the sistr method on the NC_003198 assembly, and confirm the reported cgMLST genome match
    :param variables: object with a sequencepath attribute pointing to the folder containing the FASTA file
    """
    strain = MetadataObject()
    # Start with an empty list of samples
    method.runmetadata.samples = []
    assembly = os.path.join(variables.sequencepath, 'NC_003198.fasta')
    # The sample name is the file name up to the first '.'
    strain.name = os.path.basename(assembly).split('.')[0]
    # Initialise the general and run categories
    strain.general = GenObject()
    strain.run = GenObject()
    strain.general.fastqfiles = []
    # Create the sample-specific destination folder
    destination = os.path.join(variables.sequencepath, strain.name)
    make_path(destination)
    # Both categories use the same output directory
    strain.general.outputdirectory = destination
    strain.run.outputdirectory = destination
    strain.general.bestassemblyfile = True
    # Initialise an attribute to store commands
    strain.commands = GenObject()
    # Assume that all samples are Salmonella
    strain.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    strain.general.bestassemblyfile = assembly
    method.runmetadata.samples.append(strain)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match == 'SAL_BA2732AA'
    variable_update()
def basic(self):
    """
    Create a metadata object for every sample with FASTQ files in :self.path.

    For each sample name returned by filer(), a sample-specific folder is created in :self.path, relative
    symlinks to the FASTQ files of the sample are made in that folder, and the general/run categories of
    the metadata object are populated. Once all samples have been added to :self.samples, metadata from
    previous runs (if any) replaces :self.samples, and :self.readlength is called.

    :raises OSError: if a symlink cannot be created for a reason other than the link already existing
    """
    # Substrings that mark files derived from the raw reads; these files are not treated as inputs
    derived = ('trimmed', 'normalised', 'corrected', 'paired', 'unpaired')
    # Grab any .fastq files in the path
    fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
    # Extract the base name of the globbed name + path provided
    fastqnames = sorted(os.path.split(name)[1] for name in filer(fastqfiles))
    # Iterate through the names of the fastq files
    for fastqname in fastqnames:
        # Set the name
        metadata = MetadataObject()
        metadata.name = fastqname
        # Set the destination folder
        outputdir = os.path.join(self.path, fastqname)
        # Make the destination folder
        make_path(outputdir)
        # Get the fastq files specific to the fastqname
        specificfastq = glob(os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
        # Link the files to the output folder. Each link is handled separately, so that an existing link
        # does not prevent the remaining files from being linked
        for fastq in specificfastq:
            basename = os.path.basename(fastq)
            try:
                os.symlink('../{}'.format(basename), '{}/{}'.format(outputdir, basename))
            except OSError as exception:
                # If there is an exception other than the file exists, raise it
                if exception.errno != errno.EEXIST:
                    raise
        # Initialise the general and run categories
        metadata.general = GenObject()
        metadata.run = GenObject()
        # Populate the .fastqfiles category of :self.metadata
        metadata.general.fastqfiles = [
            fastq for fastq in sorted(glob(os.path.join(outputdir, '{}*.fastq*'.format(metadata.name))))
            if not any(term in fastq for term in derived)
        ]
        # Add the output directory to the metadata
        metadata.general.outputdirectory = outputdir
        metadata.general.logout = os.path.join(self.path, metadata.name, '{}_log_out.txt'.format(metadata.name))
        metadata.general.logerr = os.path.join(self.path, metadata.name, '{}_log_err.txt'.format(metadata.name))
        # Append the metadata to the list of samples
        self.samples.append(metadata)
    # Grab metadata from previous runs
    previousmetadata = metadataReader.MetadataReader(self)
    # Update self.samples (if required)
    if previousmetadata.samples:
        self.samples = previousmetadata.samples
    # Run the read length method
    self.readlength()
def test_sistr(variables):
    """
    Run the sistr method on the NC_003198 assembly, and confirm the reported cgMLST genome match
    :param variables: object with a sequencepath attribute pointing to the folder containing the FASTA file
    """
    strain = MetadataObject()
    # Start with an empty list of samples
    method.runmetadata.samples = []
    assembly = os.path.join(variables.sequencepath, 'NC_003198.fasta')
    # The sample name is the file name up to the first '.'
    strain.name = os.path.basename(assembly).split('.')[0]
    # Initialise the general and run categories
    strain.general = GenObject()
    strain.run = GenObject()
    strain.general.fastqfiles = []
    # Create the sample-specific destination folder
    destination = os.path.join(variables.sequencepath, strain.name)
    make_path(destination)
    # Output directory and log files for the general category
    strain.general.outputdirectory = destination
    strain.general.logout = os.path.join(destination, 'out')
    strain.general.logerr = os.path.join(destination, 'err')
    strain.run.outputdirectory = destination
    strain.general.bestassemblyfile = True
    # Initialise an attribute to store commands
    strain.commands = GenObject()
    # Assume that all samples are Salmonella
    strain.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    strain.general.bestassemblyfile = assembly
    method.runmetadata.samples.append(strain)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match == 'SAL_BA2732AA'
    variable_update()
def reporter(self):
    """
    Write the serotyping results of every sample with an assembly to :reportpath/:analysistype.csv
    """
    printtime('Creating {} report'.format(self.analysistype), self.starttime)
    # Create the path in which the reports are stored
    make_path(self.reportpath)
    report_name = os.path.join(self.reportpath, '{}.csv'.format(self.analysistype))
    # Each entry is one complete line of the report body
    rows = []
    with open(report_name, 'w') as report:
        for sample in self.runmetadata.samples:
            # Samples without an assembly are not included in the report
            if sample.general.bestassemblyfile == 'NA':
                continue
            row = sample.name + ','
            # Only samples with results have a serotype entry; all others get an empty second column
            if sample[self.analysistype].results:
                row += '{oset} ({opid}):{hset} ({hpid}),'.format(
                    oset=';'.join(sample.serosippr.o_set),
                    opid=sample.serosippr.best_o_pid,
                    hset=';'.join(sample.serosippr.h_set),
                    hpid=sample.serosippr.best_h_pid)
            rows.append(row + '\n')
        report.write('Strain,Serotype\n')
        report.write(''.join(rows))
def allelealigner(self):
    """
    Perform a multiple sequence alignment of the allele sequences.

    :self.cpus daemon threads targeting :self.alignthreads are started. For every allele file of every
    sample, a Clustal Omega command is created, and a (sample, command, input file, output file) tuple
    is added to :self.queue. The method finishes by calling join() on the queue.
    """
    logging.info('Aligning alleles')
    # Create the threads for the analysis
    for _ in range(self.cpus):
        # The daemon keyword replaces the deprecated Thread.setDaemon() call
        worker = Thread(target=self.alignthreads, args=(), daemon=True)
        worker.start()
    for sample in self.samples:
        sample.alignpath = os.path.join(self.path, 'alignedalleles', sample.organism)
        make_path(sample.alignpath)
        # Create a list to store the paths of the aligned allele files
        sample.alignedalleles = list()
        for outputfile in sample.allelefiles:
            # The aligned file has the same name as the input file, but is located in the alignment folder
            aligned = os.path.join(sample.alignpath, os.path.basename(outputfile))
            sample.alignedalleles.append(aligned)
            # Create the command line call
            clustalomega = ClustalOmegaCommandline(infile=outputfile,
                                                   outfile=aligned,
                                                   threads=4,
                                                   auto=True)
            # Only the command for the last allele file is retained in this attribute
            sample.clustalomega = str(clustalomega)
            self.queue.put((sample, clustalomega, outputfile, aligned))
    self.queue.join()
def getrmlsthelper(self):
    """
    Makes a system call to rest_auth.py, a Python script modified from
    https://github.com/kjolley/BIGSdb/tree/develop/scripts/test
    And downloads the most up-to-date rMLST profile and alleles

    The downloaded .tfa allele files in :self.path/:self.analysistype are passed to :self.combinealleles
    """
    # Imported locally, as the arguments only need a simple attribute container
    from argparse import Namespace
    printtime('Downloading {} alleles'.format(self.analysistype), self.start)
    # Extract the path of the current script from the full path + file name
    homepath = os.path.split(os.path.abspath(__file__))[0]
    # Set the path/name of the folder to contain the new alleles and profile
    newfolder = os.path.join(self.path, self.analysistype)
    # Create the path
    make_path(newfolder)
    # Create arguments to feed into the rest_auth_class script. A fresh namespace is used rather than the
    # ArgumentParser class itself, as setting attributes on the class would modify it for the whole program
    args = Namespace(secret_file=os.path.join(homepath, 'secret.txt'),
                     file_path=homepath,
                     output_path=newfolder,
                     start=self.start)
    rmlst = rest_auth_class.REST(args)
    # Download the profile and alleles
    rmlst.main()
    # Get the new alleles into a list, and create the combinedAlleles file
    alleles = glob(os.path.join(newfolder, '*.tfa'))
    self.combinealleles(newfolder, alleles)
def predict(self):
    """
    Worker loop that runs prodigal on the samples retrieved from :self.predictqueue.

    For each sample, the prodigal attributes and system call are populated, and the report folder is
    created. The system call is only run if the results file is missing or empty. The command and its
    outputs are written to the log files while holding the module-level threadlock.
    """
    while True:
        sample = self.predictqueue.get()
        # Populate attributes
        sample.prodigal.reportdir = os.path.join(sample.general.outputdirectory, 'prodigal')
        sample.prodigal.results_file = os.path.join(sample.prodigal.reportdir,
                                                    '{}_prodigalresults.sco'.format(sample.name))
        sample.prodigal.results = sample.prodigal.results_file
        sample.commands.prodigal = 'prodigal -i {in1} -o {out1} -f sco -d {genes}'\
            .format(in1=sample.general.bestassemblyfile,
                    out1=sample.prodigal.results_file,
                    genes=os.path.join(sample.prodigal.reportdir, '{}_genes.fa'.format(sample.name)))
        # Create the folder to store the reports
        make_path(sample.prodigal.reportdir)
        # Determine if the report already exists, and that it is not empty. The size remains 0 if the
        # file is missing
        size = 0
        if os.path.isfile(sample.prodigal.results_file):
            size = os.stat(sample.prodigal.results_file).st_size
        if size == 0:
            # Run the command
            out, err = run_subprocess(sample.commands.prodigal)
            # The context manager releases the lock even if writing to the log files raises an exception
            with threadlock:
                write_to_logfile(sample.commands.prodigal, sample.commands.prodigal, self.logfile,
                                 sample.general.logout, sample.general.logerr, None, None)
                write_to_logfile(out, err, self.logfile, sample.general.logout, sample.general.logerr,
                                 None, None)
        self.predictqueue.task_done()
def __init__(self, args, pipelinecommit, startingtime, scriptpath):
    """
    Store the supplied arguments as attributes, validate the supplied paths, and run the main method
    :param args: command line arguments
    :param pipelinecommit: pipeline commit or version
    :param startingtime: time the script was started
    :param scriptpath: home path of the script
    """
    # Details of the current run
    self.commit = str(pipelinecommit)
    self.starttime = startingtime
    self.homepath = scriptpath
    self.args = args
    # Joining a path with '' adds a trailing separator (if required)
    self.path = os.path.join(args.path, '')
    assert os.path.isdir(self.path), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
    self.sequencepath = os.path.join(args.sequencepath, '')
    self.seqpath = self.sequencepath
    self.targetpath = os.path.join(args.targetpath, '')
    # ref file path is used to work with submodule code with a different naming scheme
    self.reffilepath = self.targetpath
    self.reportpath = os.path.join(self.path, 'reports')
    make_path(self.reportpath)
    assert os.path.isdir(self.targetpath), \
        u'Target path is not a valid directory {0!r:s}'.format(self.targetpath)
    # Attributes taken directly from the arguments
    self.bcltofastq = args.bcl2fastq
    self.miseqpath = args.miseqpath
    self.miseqfolder = args.miseqfolder
    self.fastqdestination = args.destinationfastq
    self.forwardlength = args.readlengthforward
    self.reverselength = args.readlengthreverse
    # A reverse read length of zero means that only a single read is used
    self.numreads = 1 if self.reverselength == 0 else 2
    self.customsamplesheet = args.customsamplesheet
    # Set the custom cutoff value
    self.cutoff = args.customcutoffs
    # Use the argument for the number of threads to use, or default to the number of cpus in the system
    if args.numthreads:
        cpus = args.numthreads
    else:
        cpus = multiprocessing.cpu_count()
    self.cpus = int(cpus)
    self.threads = 0
    self.runmetadata = MetadataObject()
    self.taxonomy = {
        'Escherichia': 'coli',
        'Listeria': 'monocytogenes',
        'Salmonella': 'enterica'
    }
    self.analysistype = 'GeneSippr'
    self.copy = args.copy
    self.pipeline = False
    # Empty placeholders that are populated during the analyses
    self.forward = ''
    self.reverse = ''
    self.index = ''
    self.header = {}
    self.rundata = {}
    self.completed = []
    self.incomplete = []
    self.analysescomplete = False
    self.final = False
    self.sum = 0
    self.completemetadata = []
    self.samplesheetpath = ''
    self.samples = []
    self.logfile = os.path.join(self.path, 'log')
    self.reports = ''
    # Run the method
    self.main()
def samplesheet(self):
    """
    Create a custom sample sheet based on the original sample sheet for the run, but only including the
    samples that did not pass the quality threshold on the previous iteration

    The sheet is written to SampleSheet.csv in :self.samplesheetpath, and its path is stored in
    :self.customsamplesheet. Values that are exactly 'NA' are written as empty fields.

    :raises KeyError: if a required entry is missing from :self.header or from the data of a sample
    """
    make_path(self.samplesheetpath)
    self.customsamplesheet = os.path.join(self.samplesheetpath, 'SampleSheet.csv')
    header = ['Sample_ID', 'Sample_Name', 'Sample_Plate', 'Sample_Well', 'I7_Index_ID', 'index', 'I5_Index_ID',
              'index2', 'Sample_Project', 'Description']
    # Build the sheet before opening the file, so that a missing entry does not leave an empty sheet behind
    lines = [
        '[Header]',
        'IEMFileVersion,{}'.format(self.header['IEMFileVersion']),
        'Investigator Name,{}'.format(self.header['InvestigatorName']),
        'Experiment Name,{}'.format(self.header['ExperimentName']),
        'Date,{}'.format(self.header['Date']),
        'Workflow,{}'.format(self.header['Workflow']),
        'Application,{}'.format(self.header['Application']),
        'Assay,{}'.format(self.header['Assay']),
        'Description,{}'.format(self.header['Description']),
        'Chemistry,{}'.format(self.header['Chemistry']),
        '',
        '[Reads]',
        str(self.forward),
        str(self.reverse),
        '',
        '[Settings]',
        'ReverseComplement,{}'.format(self.header['ReverseComplement']),
        'Adapter,{}'.format(self.header['Adapter']),
        '',
        '[Data]',
        ','.join(header),
    ]
    # Correlate all the samples added to the list of incomplete samples with their metadata
    for incomplete in self.incomplete:
        for sample in self.rundata:
            if incomplete == sample['SampleID']:
                # Use each entry in the header list as a key for the rundata dictionary. The key is modified
                # to be consistent with how the dictionary was populated
                values = [sample[data.replace('_', '')] for data in header]
                # Only blank out values that are exactly 'NA'. Removing every 'NA' substring would corrupt
                # legitimate values that happen to contain these letters
                lines.append(','.join('' if value == 'NA' else value for value in values))
    # Write the string to the sample sheet
    with open(self.customsamplesheet, 'w') as samplesheet:
        samplesheet.write('\n'.join(lines) + '\n')
def primers(self):
    """
    Setup and create threads for ePCR

    One daemon thread targeting :self.epcr is started per sample with an assembly. For each of these
    samples, the genus-specific primer and probe files are located, the BLAST database of the probes is
    created, and the famap, fahash, and re-PCR system calls are stored in the metadata. A
    (sample, output file base name) tuple is added to :self.epcrqueue for every sample with primers.
    """
    # Create the threads for the ePCR analysis
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            # The daemon keyword replaces the deprecated Thread.setDaemon() call
            worker = Thread(target=self.epcr, args=(), daemon=True)
            worker.start()
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            setattr(sample, self.analysistype, GenObject())
            # Get the primers ready
            try:
                sample[self.analysistype].primers = glob(
                    os.path.join(self.reffilepath, self.analysistype, sample.general.referencegenus, 'primers',
                                 '*.txt'))[0]
                # Find the name of the probe file
                sample[self.analysistype].probes = glob(
                    os.path.join(self.reffilepath, self.analysistype, sample.general.referencegenus, 'probes',
                                 '*.fa'))[0]
                # Create the BLAST database of the probes (if necessary)
                self.makeblastdb(sample[self.analysistype].probes)
                # Initialise a list to store the names of the targets
                sample[self.analysistype].targets = list()
                # Open the primer file, and read the names of the targets into a list
                with open(sample[self.analysistype].primers, 'r') as primerfile:
                    for line in primerfile:
                        sample[self.analysistype].targets.append(line.split('\t')[0])
            # Organisms without primer/probe files will fail. Populate metadata with 'NA' values
            except IndexError:
                sample[self.analysistype].primers = 'NA'
                sample[self.analysistype].probes = 'NA'
            # Only try to process organisms with primer files
            if sample[self.analysistype].primers != 'NA':
                # Make the output path
                sample[self.analysistype].reportdir = os.path.join(sample.general.outputdirectory,
                                                                   self.analysistype)
                make_path(sample[self.analysistype].reportdir)
                # Set the base name of the output file. The report directory has no trailing separator, so
                # the path must be joined rather than concatenated to place the outputs inside the folder
                outfile = os.path.join(sample[self.analysistype].reportdir, sample.name)
                # Set the hashing and mapping commands
                sample.commands.famap = 'famap -b {}.famap {}.fasta'.format(outfile, sample.general.filenoext)
                sample.commands.fahash = 'fahash -b {}.hash {}.famap'.format(outfile, outfile)
                # re-PCR uses the subtyping primers list to search the contigs file using the following parameters
                # -S {hash file} (Perform STS lookup using hash-file), -r + (Enable/disable reverse STS lookup)
                # -m 10000 (Set variability for STS size for lookup),
                # -n 2 (Set max allowed mismatches per primer for lookup)
                # -g 0 (Set max allowed indels per primer for lookup),
                # -G (Print alignments in comments), -q (quiet), -o {output file}
                sample.commands.epcr = 're-PCR -S {}.hash -r + -m 10000 -n 2 -g 0 -G -q -o {}.txt {}' \
                    .format(outfile, outfile, sample[self.analysistype].primers)
                # Add the variables to the queue
                self.epcrqueue.put((sample, outfile))
    self.epcrqueue.join()
def __init__(self, args):
    """
    Set up the database folder, the location of the CLARK scripts, and the log file
    :param args: object with databasepath and start attributes
    """
    database_folder = os.path.join(args.databasepath)
    self.databasepath = database_folder
    make_path(database_folder)
    self.start = args.start
    # Determine the location of the CLARK scripts from the location of the CLARK executable
    clark_executable = shutil.which('CLARK')
    self.clarkpath = os.path.dirname(clark_executable)
    self.logfile = os.path.join(database_folder, 'logfile')
    # Delete log files from previous iterations of the script in this folder
    clear_logfile(self.logfile)
def extract_rmlst_reads(self):
    """
    rMLST read extraction. Should be the first thing called after parsing the fastq directory.

    For each sample, an analysis-specific object is created and populated with the output folder, the
    log files, the name of the baited FASTQ file, and the bbduk system call. The call is only run if
    :self.analyse is set; the command and its outputs are written to the log files.
    """
    for sample in self.metadata:
        # Create the object to store the variables
        setattr(sample, self.analysistype, GenObject())
        analysis = sample[self.analysistype]
        # Initialise variables
        analysis.snv_count = []
        # Initialise a starting value for the number of unique kmers found in each sample
        analysis.unique_kmers = -1
        # Use the output directory of the run category if possible, otherwise use the general category
        try:
            parentdirectory = sample.run.outputdirectory
        except KeyError:
            parentdirectory = sample.general.outputdirectory
        analysis.outputdir = os.path.join(parentdirectory, self.analysistype)
        make_path(analysis.outputdir)
        analysis.logout = os.path.join(analysis.outputdir, 'logout.txt')
        analysis.logerr = os.path.join(analysis.outputdir, 'logerr.txt')
        analysis.baitedfastq = os.path.join(analysis.outputdir,
                                            '{}_targetMatches.fastq.gz'.format(self.analysistype))
        # Create the command to run the baiting - paired inputs and a single, zipped output
        forward, reverse = sample.general.trimmedcorrectedfastqfiles[0], \
            sample.general.trimmedcorrectedfastqfiles[1]
        analysis.bbdukcmd = 'bbduk.sh ref={} in1={} in2={} threads={} outm={}'.format(
            self.database, forward, reverse, str(self.threads), analysis.baitedfastq)
        try:
            # Run the call (if requested), and write the command and any outputs to the log files
            command = analysis.bbdukcmd
            out, err = run_subprocess(command) if self.analyse else (str(), str())
            logfiles = (self.logfile, sample.general.logout, sample.general.logerr,
                        analysis.logout, analysis.logerr)
            write_to_logfile(command, command, *logfiles)
            write_to_logfile(out, err, *logfiles)
        except TimeoutExpired:
            print('ERROR: Could not extract rMLST reads from sample {}'.format(sample.name))
def createobject(self):
    """
    Create a metadata object for every sample with FASTQ files in :self.path.

    For each sample name returned by filer(), a sample-specific folder is created in :self.path, relative
    symlinks to the FASTQ files of the sample are made in that folder, and the general, run, and commands
    categories of the metadata object are populated before the object is appended to :self.samples.

    :raises OSError: if a symlink cannot be created for a reason other than the link already existing
    """
    # Grab any .fastq files in the path
    fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
    # Extract the base name of the globbed name + path provided
    fastqnames = sorted(os.path.split(name)[1] for name in filer(fastqfiles))
    # Iterate through the names of the fastq files
    for fastqname in fastqnames:
        # Set the name
        metadata = MetadataObject()
        metadata.name = fastqname
        # Set the destination folder
        outputdir = os.path.join(self.path, fastqname)
        # Make the destination folder
        make_path(outputdir)
        # Get the fastq files specific to the fastqname
        specificfastq = glob(os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
        # Make relative symlinks to the files in :self.path. Each link is handled separately, so that an
        # existing link does not prevent the remaining files from being linked
        for fastq in specificfastq:
            # Get the basename of the file
            fastqfile = os.path.split(fastq)[-1]
            # Set the destination fastq path as the base name plus the destination folder
            destinationfastq = os.path.join(outputdir, fastqfile)
            try:
                # Symlink the files
                os.symlink('../{}'.format(fastqfile), destinationfastq)
            except OSError as exception:
                # If there is an exception other than the file exists, raise it
                if exception.errno != errno.EEXIST:
                    raise
        # Initialise the general and run categories
        metadata.general = GenObject()
        metadata.run = GenObject()
        # Populate the .fastqfiles category of :self.metadata
        metadata.general.fastqfiles = [
            fastq for fastq in glob(os.path.join(outputdir, '{}*.fastq*'.format(fastqname)))
            if 'trimmed' not in fastq
        ]
        # Add the output directory to the metadata
        metadata.general.outputdirectory = outputdir
        metadata.run.outputdirectory = outputdir
        metadata.general.bestassemblyfile = True
        metadata.general.trimmedcorrectedfastqfiles = metadata.general.fastqfiles
        metadata.general.logout = os.path.join(metadata.general.outputdirectory, 'logout')
        metadata.general.logerr = os.path.join(metadata.general.outputdirectory, 'logerr')
        # Initialise an attribute to store commands
        metadata.commands = GenObject()
        # Append the metadata to the list of samples
        self.samples.append(metadata)
def create_database_folder(self, database):
    """
    Create a folder named after the database within :self.databasepath
    :param database: the name of the database folder to create
    :return: the path of the folder (:self.databasepath joined with the database name)
    """
    message = 'Setting up {} database'.format(database)
    printtime(message, self.start)
    # The folder is located within the main database path, and is created as required
    folder = os.path.join(self.databasepath, database)
    make_path(folder)
    return folder
def objects(self):
    """
    Create the run metadata, and link the best assembly of every sample to the BestAssemblies folder
    """
    self.runmetadata = ObjectCreation(inputobject=self)
    assemblyfolder = os.path.join(self.path, 'BestAssemblies')
    make_path(assemblyfolder)
    for sample in self.runmetadata.samples:
        # Link the assemblies to the BestAssemblies folder - necessary for GenomeQAML
        relative_symlink(sample.general.bestassemblyfile, assemblyfolder)
        # Create attributes required for downstream analyses
        assemblies = [sample.general.bestassemblyfile]
        sample.general.trimmedcorrectedfastqfiles = assemblies
def __init__(self, seqids, outdir, copyflag, filetype, verboseflag):
    """
    Store the arguments, create the output directory, and describe the expected NAS folder structure
    :param seqids: list of SEQ IDs provided
    :param outdir: Directory in which sequence files are to be copied/linked
    :param copyflag: Boolean for whether files are to be copied or relatively symbolically linked
    :param filetype: File type to process: either FASTQ or FASTA
    :param verboseflag: Boolean for whether debug messages should be printed
    """
    # Configure the logging
    SetupLogging(verboseflag)
    # Class variables from arguments
    self.seqids = seqids
    self.outdir = outdir
    # Make output directory if it doesn't exist.
    make_path(self.outdir)
    self.copyflag = copyflag
    self.filetype = filetype
    # Global setup of expected NAS folder structure: all the folders are located within the NAS directory
    self.nas_dir = os.path.join('/mnt', 'nas2')
    self.processed_sequence_data = os.path.join(self.nas_dir, 'processed_sequence_data')
    self.raw_sequence_data = os.path.join(self.nas_dir, 'raw_sequence_data')
    # The merged sequences are stored within the raw sequence data folder
    self.merge_backup = os.path.join(self.raw_sequence_data, 'merged_sequences')
    # Dictionaries storing the path, the file type present in the folder, and the nested folder structure
    self.nas_folders = {
        self.raw_sequence_data: {'fastq': ['*/*']},
        self.merge_backup: {'fastq': ['']},
        self.processed_sequence_data: {'fasta': ['*/*/BestAssemblies']},
    }
    # List of all the folders
    self.folders = list(self.nas_folders)
    # Glob patterns for each file type
    self.extensions = {
        'fastq': '*.fastq.gz',
        'fasta': '*.fasta',
    }
    # As FASTQ files are (usually) paired, only print a warning about finding duplicate copies if more than
    # two files are found; print the warning if more than one FASTA file is found
    if self.filetype == 'fastq':
        self.lengths = 2
    else:
        self.lengths = 1
    # Set the term to use depending on whether files are copied or linked
    if copyflag:
        self.verb = 'Copying'
    else:
        self.verb = 'Linking'
    # Dictionary to store sequence files on the related NAS
    self.new_file_dict = {}
    # A list to store SEQ IDs for which sequence files cannot be located
    self.missing = []
def runblast(self):
    """
    Worker loop that runs BLASTn for the (assembly, target, sample) tuples retrieved from :self.blastqueue.

    The report directory of the sample is created, and an existing raw results file for the genome is
    reused if one is found; otherwise a new, time-stamped report name is created. The BLAST command is
    stored in the metadata, and is only run if the report is missing or empty. The report is then parsed
    with :self.blastparser, and the queue is notified that the task is complete.

    If the BLAST call raises, the task is marked as done, join() is called on the queue, an attempt is
    made to delete the report, and the original exception is re-raised.
    """
    while True:  # while daemon
        (assembly, target, sample) = self.blastqueue.get()  # grabs fastapath from dqueue
        # The genome name is the file name of the assembly up to the first '.'
        genome = os.path.split(assembly)[1].split('.')[0]
        # Run the BioPython BLASTn module with the genome as query, fasta(target gene) as db.
        # Do not re-perform the BLAST search each time
        make_path(sample[self.analysistype].reportdir)
        # The size remains 0 if no previous report is found
        size = 0
        try:
            # NOTE(review): the report directory and genome are concatenated without a separator, so the
            # report directory presumably ends with a path separator - confirm against the caller
            report = glob('{}{}*rawresults*'.format(sample[self.analysistype].reportdir, genome))[0]
            size = os.path.getsize(report)
        except IndexError:
            # No previous report: create a new time-stamped report name
            report = '{}{}_rawresults_{:}.csv'.format(sample[self.analysistype].reportdir, genome,
                                                      time.strftime("%Y.%m.%d.%H.%M.%S"))
        # The database name is the target up to the first '.'
        db = target.split('.')[0]
        # BLAST command line call. Note the mildly restrictive evalue, and the high number of alignments.
        # Due to the fact that all the targets are combined into one database, this is to ensure that all potential
        # alignments are reported. Also note the custom outfmt: the doubled quotes are necessary to get it work
        blastn = NcbiblastnCommandline(query=assembly, db=db, reward=1, penalty=-5, gapopen=3,
                                       gapextend=3, dust="yes", soft_masking="true",
                                       evalue=0.1, num_alignments=1000000, num_threads=24,
                                       outfmt="'6 qseqid sacc stitle positive mismatch gaps "
                                              "evalue bitscore slen length'",
                                       out=report)
        # Save the blast command in the metadata
        sample[self.analysistype].blastcommand = str(blastn)
        # Only run BLAST if the report is missing or empty
        if not os.path.isfile(report) or size == 0:
            try:
                blastn()
            except:
                # Mark the task as done before calling join() on the queue
                self.blastqueue.task_done()
                self.blastqueue.join()
                # Try to remove the report of the failed analysis
                try:
                    os.remove(report)
                except IOError:
                    pass
                # Re-raise the original exception
                raise
        # Run the blast parsing module
        self.blastparser(report, sample)
        self.blastqueue.task_done()  # signals to dqueue job is done
def movefastq(self):
    """
    Find .fastq files for each sample and move them to an appropriately named folder

    FASTQ files in :self.path are matched to the sample name with progressively less specific patterns
    ('name_*.fastq*', 'name.fastq*', 'name*.fastq*'); the first pattern with matches is used. Matching
    files are relatively symlinked into :self.path/:sample.name. The FASTQ files present in the sample
    folder, excluding files derived from the raw reads, are stored in sample.general.fastqfiles.
    """
    printtime('Moving FASTQ files', self.start)
    # Substrings that mark files derived from the raw reads; these files are not treated as inputs
    derived = ('trimmed', 'normalised', 'corrected', 'paired', 'unpaired')
    # Iterate through each sample
    for sample in self.metadata.runmetadata.samples:
        # Retrieve the output directory
        outputdir = os.path.join(self.path, sample.name)
        # Find any fastq files with the sample name, using the first pattern that yields matches
        fastqfiles = list()
        for pattern in ('{}_*.fastq*', '{}.fastq*', '{}*.fastq*'):
            fastqfiles = sorted(glob(os.path.join(self.path, pattern.format(sample.name))))
            if fastqfiles:
                break
        # Only try and move the files if the files exist
        if fastqfiles:
            make_path(outputdir)
            # Symlink the fastq files to the directory. Linking is best-effort: a failure (e.g. the link
            # already exists) is ignored, and does not prevent the remaining files from being linked
            for fastq in fastqfiles:
                try:
                    os.symlink(os.path.join('..', os.path.basename(fastq)),
                               os.path.join(outputdir, os.path.basename(fastq)))
                except OSError:
                    pass
        # Find any fastq files with the sample name in the output directory. This also locates files that
        # were placed in the folder previously when no files remain in :self.path. The pattern must be
        # formatted with the sample name only; formatting it with the output directory finds nothing
        sample.general.fastqfiles = [
            fastq for fastq in sorted(glob(os.path.join(outputdir, '{}*.fastq*'.format(sample.name))))
            if not any(term in fastq for term in derived)
        ]
def alleleretriever(self):
    """
    Retrieve the required alleles from a file of all alleles, and create organism-specific allele files

    For every organism in :self.alleledict, the output folder is deleted and recreated, one .tfa file is
    written per gene, and all the alleles are also written to gdcs_alleles.fasta. A metadata object
    describing the files is appended to :self.samples.
    """
    logging.info('Retrieving alleles')
    # Index all the records in the allele file
    logging.info('Loading rMLST records')
    records = SeqIO.index(self.allelefile, 'fasta')
    logging.info('Creating allele output files')
    # Create the organism-specific files of alleles
    for organism in sorted(self.alleledict):
        # Make an object to store information for each strain
        strain = MetadataObject()
        strain.organism = organism
        strain.path = self.path
        strain.outpath = os.path.join(self.path, 'outputalleles', organism, '')
        # Delete and recreate the output path - as the files are appended to each time, they will be too
        # large if this script is run more than once
        try:
            shutil.rmtree(strain.outpath)
        except OSError:
            pass
        make_path(strain.outpath)
        strain.combined = os.path.join(strain.outpath, 'gdcs_alleles.fasta')
        strain.allelefiles = []
        with open(strain.combined, 'w') as combined:
            for gene, alleles in sorted(self.alleledict[organism].items()):
                # Each gene has its own file of alleles
                genefile = os.path.join(strain.outpath, '{}.tfa'.format(gene))
                strain.allelefiles.append(genefile)
                # Open the file to append
                with open(genefile, 'a') as allelefile:
                    for allele in sorted(alleles):
                        # Skip adding alleles that are no longer in the database
                        try:
                            record = records['{}_{}'.format(gene, allele)]
                            # Write each allele record to the gene-specific file and to the combined file
                            SeqIO.write(record, allelefile, 'fasta')
                            SeqIO.write(record, combined, 'fasta')
                        except KeyError:
                            pass
        # Add the populated metadata to the list
        self.samples.append(strain)
def __init__(self, args):
    """
    Initialises the variables required for this class
    :param args: list of arguments passed to the script
    :raises AssertionError: if the custom sample sheet, the sequence path, or the reference file path
    is not valid
    """
    # The commit is supplied as bytes; decode it once
    self.commit = args.commit.decode('utf-8')
    printtime('Welcome to the CFIA de novo bacterial assembly pipeline {}'.format(self.commit),
              args.startingtime, '\033[1;94m')
    # Define variables from the arguments - there may be a more streamlined way to do this
    self.args = args
    self.path = os.path.join(args.sequencepath)
    self.reffilepath = os.path.join(args.referencefilepath)
    self.numreads = args.numreads
    self.preprocess = args.preprocess
    # Define the start time
    self.starttime = args.startingtime
    self.customsamplesheet = args.customsamplesheet
    if self.customsamplesheet:
        assert os.path.isfile(self.customsamplesheet), 'Cannot find custom sample sheet as specified {}'\
            .format(self.customsamplesheet)
    self.basicassembly = args.basicassembly
    # Without a sample sheet, only a basic assembly is possible
    if not self.customsamplesheet and not os.path.isfile(os.path.join(self.path, 'SampleSheet.csv')):
        self.basicassembly = True
        printtime('Could not find a sample sheet. Performing basic assembly (no run metadata captured)',
                  self.starttime)
    # Use the argument for the number of threads to use, or default to one fewer than the number of cpus in
    # the system. At least one thread is always used, as the default would otherwise be zero on a system
    # with a single cpu
    self.cpus = args.threads if args.threads else max(multiprocessing.cpu_count() - 1, 1)
    # Assertions to ensure that the provided variables are valid
    make_path(self.path)
    assert os.path.isdir(self.path), 'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
    self.reportpath = os.path.join(self.path, 'reports')
    assert os.path.isdir(self.reffilepath), 'Reference file path is not a valid directory {0!r:s}'\
        .format(self.reffilepath)
    self.homepath = args.homepath
    self.logfile = os.path.join(self.path, 'logfile')
    self.runinfo = str()
    self.pipeline = True
    self.qualityobject = MetadataObject()
    # Initialise the metadata object
    self.runmetadata = MetadataObject()
def fastqc(self, threadlock=threading.Lock()):
    """
    Run fastqc system calls

    Worker loop that retrieves (sample, system call, output directory, reads system call) tuples from
    :self.qcqueue. The system calls are only run if the output directory does not already contain an
    HTML file. The commands and their outputs are written to the log files, and the 'stdin' outputs are
    renamed to include the name of the sample.

    :param threadlock: lock held while writing to the log files. The default is intentionally created
    once, when the function is defined, so that every worker thread shares the same lock; a lock created
    inside the loop would be private to each thread, and would not provide any mutual exclusion
    """
    while True:  # while daemon
        # Unpack the variables from the queue
        (sample, systemcall, outputdir, fastqcreads) = self.qcqueue.get()
        # Check to see if the output HTML file already exists
        try:
            _ = glob(os.path.join(outputdir, '*.html'))[0]
        except IndexError:
            # Make the output directory
            make_path(outputdir)
            # Run the system calls, and combine their outputs
            outstr = str()
            errstr = str()
            out, err = run_subprocess(systemcall)
            outstr += out
            errstr += err
            out, err = run_subprocess(fastqcreads)
            outstr += out
            errstr += err
            # Acquire thread lock, and write the logs to file. The context manager releases the lock even
            # if writing to the log files raises an exception
            with threadlock:
                write_to_logfile(systemcall, systemcall, self.logfile, sample.general.logout,
                                 sample.general.logerr, None, None)
                write_to_logfile(fastqcreads, fastqcreads, self.logfile, sample.general.logout,
                                 sample.general.logerr, None, None)
                write_to_logfile(outstr, errstr, self.logfile, sample.general.logout,
                                 sample.general.logerr, None, None)
            # Rename the outputs
            try:
                shutil.move(os.path.join(outputdir, 'stdin_fastqc.html'),
                            os.path.join(outputdir, '{}_fastqc.html'.format(sample.name)))
                shutil.move(os.path.join(outputdir, 'stdin_fastqc.zip'),
                            os.path.join(outputdir, '{}_fastqc.zip'.format(sample.name)))
            except IOError:
                pass
        # Signal to qcqueue that job is done
        self.qcqueue.task_done()
def reporter(self):
    """
    Create mash.csv in the report path, with one row of mash results for every sample that has an
    assembly file (bestassemblyfile is not 'NA')
    """
    make_path(self.reportpath)
    header = 'Strain,ReferenceGenus,ReferenceFile,ReferenceGenomeMashDistance,Pvalue,NumMatchingHashes\n'
    rows = list()
    for sample in self.metadata:
        # Samples without an assembly do not appear in the report
        if sample.general.bestassemblyfile == 'NA':
            continue
        results = sample[self.analysistype]
        rows.append('{},{},{},{},{},{}\n'.format(sample.name,
                                                 results.closestrefseqgenus,
                                                 results.closestrefseq,
                                                 results.mashdistance,
                                                 results.pvalue,
                                                 results.nummatches))
    # Create the report file
    with open('{}/mash.csv'.format(self.reportpath), 'w') as report:
        report.write(header)
        report.write(''.join(rows))
def run_qaml(self):
    """
    Create and run the GenomeQAML system call. The call is only run if the report file (self.qaml_report)
    does not already exist; the command and its outputs are then written to self.logfile
    """
    printtime('Running GenomeQAML quality assessment', self.start)
    qaml_call = 'classify.py -t {tf} -r {rf}'.format(tf=self.qaml_path,
                                                     rf=self.qaml_report)
    make_path(self.reportpath)
    # Only attempt to assess assemblies if the report doesn't already exist
    if os.path.isfile(self.qaml_report):
        return
    # Run the system calls
    out, err = run_subprocess(qaml_call)
    # Acquire thread lock, and write the logs to file. The release is in a finally block, so that a
    # failed write cannot leave the lock held
    self.threadlock.acquire()
    try:
        write_to_logfile(qaml_call, qaml_call, self.logfile)
        write_to_logfile(out, err, self.logfile)
    finally:
        self.threadlock.release()
def vtyper(self):
    """
    Set up the ePCR analyses: start one daemon worker (self.epcr) per sample with an assembly, build the
    famap, fahash, and re-PCR commands for every assembled sample with 'stx' in its datastore, queue
    these samples, wait for the queue to empty, and parse the results (self.epcrparse)
    """
    printtime('Running ePCR', self.start)
    # Start a worker thread for each sample with an assembly
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            worker = Thread(target=self.epcr, args=())
            worker.setDaemon(True)
            worker.start()
    # Build the system calls for famap, fahash, and ePCR
    for sample in self.metadata:
        if sample.general.bestassemblyfile == 'NA':
            continue
        if 'stx' not in sample.general.datastore:
            continue
        setattr(sample, self.analysistype, GenObject())
        # Use the primers from the reference file path if it is set; otherwise, the supplied primer file
        if self.reffilepath:
            sample[self.analysistype].primers = '{}{}/vtx_subtyping_primers.txt'\
                .format(self.reffilepath, self.analysistype)
        else:
            sample[self.analysistype].primers = self.primerfile
        # Create the output path
        sample[self.analysistype].reportdir = '{}/{}/'.format(sample.general.outputdirectory,
                                                              self.analysistype)
        make_path(sample[self.analysistype].reportdir)
        outfile = sample[self.analysistype].reportdir + sample.name
        # Hashing and mapping commands
        sample.commands.famap = 'famap -b {}.famap {}.fasta'.format(outfile, sample.general.filenoext)
        sample.commands.fahash = 'fahash -b {}.hash {}.famap'.format(outfile, outfile)
        # re-PCR searches the contigs with the subtyping primers using the following parameters
        # -S {hash file} (Perform STS lookup using hash-file),
        # -r + (Enable/disable reverse STS lookup)
        # -m 10000 (Set variability for STS size for lookup),
        # -n 1 (Set max allowed mismatches per primer for lookup)
        # -g 0 (Set max allowed indels per primer for lookup),
        # -G (Print alignments in comments),
        # -q quiet
        # -o {output file},
        sample.commands.epcr = 're-PCR -S {}.hash -r + -m 10000 -n 1 -g 0 -G -q -o {}.txt {}'\
            .format(outfile, outfile, sample[self.analysistype].primers)
        sample[self.analysistype].resultsfile = '{}.txt'.format(outfile)
        self.epcrqueue.put((sample, outfile))
    # Wait for the workers to finish, then parse the outputs
    self.epcrqueue.join()
    self.epcrparse()
def __init__(self, inputobject, samplebasestarget=700000):
    """
    Copy the run settings from the input object, locate the rMLST database, decide whether the analyses
    need to be performed, and call self.main()
    :param inputobject: object with the attributes runmetadata.samples, reffilepath, logfile, cpus,
    starttime, and reportpath
    :param samplebasestarget: value stored as self.samplebasestarget (default 700000)
    """
    self.metadata = inputobject.runmetadata.samples
    # The first .fasta file found in the rMLST folder of the reference file path is the database
    rmlst_files = glob(os.path.join(inputobject.reffilepath, 'rMLST', '*.fasta'))
    self.database = rmlst_files[0]
    self.logfile = inputobject.logfile
    self.threads = inputobject.cpus
    self.analysistype = 'confinder'
    self.number_subsamples = 5
    self.start = inputobject.starttime
    self.reportpath = inputobject.reportpath
    make_path(self.reportpath)
    self.samplebasestarget = samplebasestarget
    self.reportfile = os.path.join(self.reportpath, '{}.csv'.format(self.analysistype))
    # Only flag the analyses to be run if the report does not already exist
    self.analyse = not os.path.isfile(self.reportfile)
    self.main()
def reporter(self):
    """
    Create mash.csv in the report path. Every sample receives a row: the full mash results if they are
    available, or only the sample name if reading the results raises an AttributeError
    """
    make_path(self.reportpath)
    header = 'Strain,ReferenceGenus,ReferenceFile,ReferenceGenomeMashDistance,Pvalue,NumMatchingHashes\n'
    rows = list()
    for sample in self.metadata:
        try:
            row = '{},{},{},{},{},{}\n'.format(
                sample.name,
                sample[self.analysistype].closestrefseqgenus,
                sample[self.analysistype].closestrefseq,
                sample[self.analysistype].mashdistance,
                sample[self.analysistype].pvalue,
                sample[self.analysistype].nummatches)
        except AttributeError:
            # No results for this sample - only record its name
            row = '{}\n'.format(sample.name)
        rows.append(row)
    # Create the report file
    with open(os.path.join(self.reportpath, 'mash.csv'), 'w') as report:
        report.write(header)
        report.write(''.join(rows))
def probes(self):
    """
    Find the 'best' probes for each gene by evaluating the percent identity of the probe to the best
    recorded percent identity for that organism + gene pair. For each window of a gene, the first
    qualifying probe is written to both the gene-specific file and the combined file of the organism
    """
    logging.info('Determining optimal probe sequences')
    for sample in self.samples:
        # Set the names of the folder and the combined file in which the probes are stored
        sample.gdcsoutputpath = os.path.join(self.gdcsoutputpath, sample.organism)
        combined_name = '{}_gdcs_combined.fasta'.format(sample.organism)
        sample.gdcscombined = os.path.join(sample.gdcsoutputpath, combined_name)
        make_path(sample.gdcsoutputpath)
        with open(sample.gdcscombined, 'w') as combined:
            for gene in sample.gene:
                # Each gene has its own output file
                gene.gdcsoutputfile = os.path.join(sample.gdcsoutputpath,
                                                   '{}_gdcs.tfa'.format(gene.name))
                with open(gene.gdcsoutputfile, 'w') as allelefile:
                    for window in gene.windows:
                        # Records whether a probe has already been written for this window
                        passed = False
                        for sliding in window.sliding:
                            # A probe qualifies if the sliding object has data, its mean identity equals
                            # the highest observed identity for the window, its mean identity is at least
                            # the lowest observed identity, and no probe has been written yet
                            if sliding.datastore and sliding.mean == window.max \
                                    and sliding.mean >= window.min and not passed:
                                # Create a sequence record using BioPython. The gene name is the header;
                                # the empty description keeps the header properly formatted
                                record = SeqRecord(Seq(sliding.sequence, IUPAC.unambiguous_dna),
                                                   description='',
                                                   id=gene.name)
                                # Write the probe to both files
                                for handle in (allelefile, combined):
                                    SeqIO.write(record, handle, 'fasta')
                                passed = True
def __init__(self):
    """
    Parse the command line arguments, initialise the attributes and metadata, run the Vtyper analyses,
    create the report, print the metadata to file, and print the elapsed time
    """
    from argparse import ArgumentParser
    from time import time
    # Parser for arguments
    parser = ArgumentParser(
        description='Performs ePCR using a supplied primer file. The primers must be in the format: '
                    '<name>\t<forward primer>\t<reverse primer>\t<max size allowed between primers>\n.'
                    'Sequence files must be stored in <path>/sequences')
    parser.add_argument('path',
                        help='Specify path in which reports are to be stored')
    parser.add_argument('-s', '--sequencepath',
                        required=True,
                        help='Path to assembly files')
    parser.add_argument('-f', '--primerfile',
                        required=True,
                        help='The name and path of the file containing the primers')
    # Get the arguments into an object
    arguments = parser.parse_args()
    self.starttime = time()
    # Joining with an empty string adds a trailing slash to the path variables for consistent formatting
    self.path = os.path.join(arguments.path, '')
    self.sequencepath = os.path.join(arguments.sequencepath, '')
    self.primerfile = arguments.primerfile
    # Initialise variables
    self.runmetadata = MetadataObject()
    self.reffilepath = False
    self.analysistype = 'ePCR'
    self.reportpath = os.path.join(self.path, 'reports')
    make_path(self.reportpath)
    # Initialise metadata
    self.runmetadata.samples = self.setup()
    self.logfile = os.path.join(self.path, 'vtyper_logfile.txt')
    # Run the analyses
    Vtyper(self, self.analysistype)
    # Create a report
    self.reporter()
    # Print the metadata to file
    printtime('Printing metadata to file', self.starttime)
    metadataprinter.MetadataPrinter(self)
    # Print a bold, green exit statement
    elapsed = u'\nElapsed Time: %0.2f seconds' % (time() - self.starttime)
    print(u''.join([u'\033[92m', u'\033[1m', elapsed, u'\033[0m']))
def sketching(self):
    """
    Start one daemon worker (self.sketch) per sample with an assembly, set the mash attributes and the
    sketch command for each of these samples, queue them, wait for the queue to empty, and call
    self.mashing()
    """
    printtime('Indexing assemblies for mash analysis', self.starttime)
    # Create the threads for the analysis
    for sample in self.metadata:
        if sample.general.bestassemblyfile != 'NA':
            worker = Thread(target=self.sketch, args=())
            worker.setDaemon(True)
            worker.start()
    # Populate the queue
    for sample in self.metadata:
        # Every sample receives the analysis type-specific GenObject, even without an assembly
        setattr(sample, self.analysistype, GenObject())
        if sample.general.bestassemblyfile == 'NA':
            continue
        analysis = sample[self.analysistype]
        # Set attributes
        analysis.reportdir = os.path.join(sample.general.outputdirectory, self.analysistype)
        analysis.targetpath = os.path.join(self.referencefilepath, self.analysistype)
        analysis.refseqsketch = analysis.targetpath + '/RefSeqSketchesDefaults.msh'
        analysis.sketchfilenoext = '{}/{}'.format(analysis.reportdir, sample.name)
        analysis.sketchfile = analysis.sketchfilenoext + '.msh'
        # Make the mash output directory if necessary
        make_path(analysis.reportdir)
        # Create a file containing the path/name of the filtered, corrected fastq files
        analysis.filelist = '{}/{}_fastqfiles.txt'.format(analysis.reportdir, sample.name)
        with open(analysis.filelist, 'w') as filelist:
            filelist.write('\n'.join(sample.general.trimmedcorrectedfastqfiles))
        # Create the system call
        sample.commands.sketch = 'mash sketch -m 2 -p {} -l {} -o {}' \
            .format(self.cpus, analysis.filelist, analysis.sketchfilenoext)
        # Add the sample to the queue
        self.sketchqueue.put(sample)
    # Join the threads
    self.sketchqueue.join()
    self.mashing()
def sketching(self):
    """
    Start self.cpus daemon workers (self.sketch), set the analysis attributes and the sketch command for
    every sample, queue the samples, wait for the queue to empty, and call self.mashing()
    """
    printtime('Indexing files for {} analysis'.format(self.analysistype), self.starttime)
    # Create the threads for the analysis
    for _ in range(self.cpus):
        worker = Thread(target=self.sketch, args=())
        worker.setDaemon(True)
        worker.start()
    # Populate the queue
    for sample in self.metadata:
        # Create the analysis type-specific GenObject
        setattr(sample, self.analysistype, GenObject())
        analysis = sample[self.analysistype]
        # Set attributes
        analysis.reportdir = os.path.join(sample.general.outputdirectory, self.analysistype)
        make_path(analysis.reportdir)
        # Within the pipeline, the targets are in an analysis type-specific folder of the reference path
        if self.pipeline:
            analysis.targetpath = os.path.join(self.referencefilepath, self.analysistype)
        else:
            analysis.targetpath = self.referencefilepath
        analysis.refseqsketch = os.path.join(analysis.targetpath, 'RefSeqSketchesDefaults.msh')
        analysis.sketchfilenoext = os.path.join(analysis.reportdir, sample.name)
        analysis.sketchfile = analysis.sketchfilenoext + '.msh'
        # Make the mash output directory if necessary
        make_path(analysis.reportdir)
        # Create a file containing the path/name of the filtered, corrected fastq files
        analysis.filelist = os.path.join(analysis.reportdir, '{}_fastqfiles.txt'.format(sample.name))
        with open(analysis.filelist, 'w') as filelist:
            filelist.write('\n'.join(sample.general.trimmedcorrectedfastqfiles))
        # Create the system call
        sample.commands.sketch = 'mash sketch -m 2 -p {} -l {} -o {}' \
            .format(self.cpus, analysis.filelist, analysis.sketchfilenoext)
        # Add the sample to the queue
        try:
            self.sketchqueue.put(sample)
        except (KeyboardInterrupt, SystemExit):
            printtime('Received keyboard interrupt, quitting threads', self.starttime)
            quit()
    # Join the threads
    self.sketchqueue.join()
    self.mashing()
def __init__(self, args):
    """
    Initialises the variables required for this class
    :param args: object of arguments passed to the script. The attributes sequencepath, referencefilepath,
    numreads, preprocess, startingtime, customsamplesheet, basicassembly, threads, and homepath are read
    :raises AssertionError: if the custom sample sheet, the sequence path, or the reference file path
    cannot be found
    """
    SetupLogging()
    logging.info('Welcome to the CFIA de novo bacterial assembly pipeline {}'.format(__version__))
    # Define variables from the arguments - there may be a more streamlined way to do this
    self.args = args
    self.path = os.path.join(args.sequencepath)
    self.reffilepath = os.path.join(args.referencefilepath)
    self.numreads = args.numreads
    self.preprocess = args.preprocess
    # Define the start time
    self.starttime = args.startingtime
    self.customsamplesheet = args.customsamplesheet
    if self.customsamplesheet:
        assert os.path.isfile(self.customsamplesheet), 'Cannot find custom sample sheet as specified {}'\
            .format(self.customsamplesheet)
    self.basicassembly = args.basicassembly
    # Without any sample sheet, fall back to a basic assembly
    if not self.customsamplesheet and not os.path.isfile(os.path.join(self.path, 'SampleSheet.csv')):
        self.basicassembly = True
        logging.warning('Could not find a sample sheet. Performing basic assembly (no run metadata captured)')
    # Use the argument for the number of threads to use, or default to the number of cpus in the system
    # minus one. The default is clamped to one, as cpu_count() - 1 is zero on a single-core system, and
    # zero workers would never consume anything placed on a queue
    self.cpus = args.threads if args.threads else max(multiprocessing.cpu_count() - 1, 1)
    # Assertions to ensure that the provided variables are valid
    make_path(self.path)
    assert os.path.isdir(self.path), 'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
    self.reportpath = os.path.join(self.path, 'reports')
    assert os.path.isdir(self.reffilepath), 'Reference file path is not a valid directory {0!r:s}'\
        .format(self.reffilepath)
    self.commit = __version__
    self.homepath = args.homepath
    self.logfile = os.path.join(self.path, 'logfile')
    self.runinfo = str()
    self.pipeline = True
    self.qualityobject = MetadataObject()
    # Initialise the metadata object
    self.runmetadata = MetadataObject()
def __init__(self, inputobject, extension='fasta', light=False):
    """
    Create an object that mimics the command line arguments of the CLARK script, and run CLARK with it
    :param inputobject: object with the attributes path, reffilepath, cpus, runmetadata, commit,
    starttime, and homepath
    :param extension: value stored as the extension attribute of the run metadata (default 'fasta')
    :param light: value stored as the light attribute of the arguments (default False)
    """
    args = MetadataObject()
    # The sequences are in the same folder as the analysis path
    args.path = inputobject.path
    args.sequencepath = inputobject.path
    # The database is in the 'clark' folder of the reference file path
    args.databasepath = os.path.join(inputobject.reffilepath, 'clark')
    make_path(args.databasepath)
    # The CLARK path is set relative to the folder of the CLARK executable
    args.clarkpath = os.path.dirname(which('CLARK')) + '/../opt/clark/'
    # Analysis settings
    args.cutoff = 0.005
    args.database = 'bacteria'
    args.rank = 'species'
    args.filter = False
    args.threads = inputobject.cpus
    args.runmetadata = inputobject.runmetadata
    args.clean_seqs = False
    args.reffilepath = inputobject.reffilepath
    # The run metadata object is shared with the input object, so the extension is set on both
    args.runmetadata.extension = extension
    args.light = light
    # Run CLARK
    CLARK(args, inputobject.commit, inputobject.starttime, inputobject.homepath)
def main(args):
    """
    Download the MLST profile and the allele files of a single species from the repository into args.path.
    Each allele file is saved individually, and also appended to a combined '<species>.fasta' file; a log
    file recording the sources is written alongside. If no species, or more than one species, match the
    query, a message is printed, and nothing is downloaded
    :param args: object with the attributes path, genus, repository_url, and force_scheme_name (the
    species attribute is read if it is present)
    """
    # Create the path to store the schemes (if necessary)
    make_path(args.path)
    # Allow for Shigella to use the Escherichia MLST profile/alleles
    args.genus = args.genus if args.genus != 'Shigella' else 'Escherichia'
    # As there are multiple profiles for certain organisms, this dictionary has the schemes I use as values
    organismdictionary = {
        'Escherichia': 'Escherichia coli#1',
        'Vibrio': 'Vibrio parahaemolyticus',
        'Campylobacter': 'Campylobacter jejuni',
        'Listeria': 'Listeria monocytogenes',
        'Bacillus': 'Bacillus cereus',
        'Staphylococcus': "Staphylococcus aureus",
        'Salmonella': 'Salmonella enterica',
    }
    # Set the appropriate profile based on the dictionary key:value pairs
    # NOTE(review): the lookup uses args.species, although the dictionary is keyed by genus - confirm
    # that this is intended
    try:
        args.genus = organismdictionary[args.species]
    except (KeyError, AttributeError):
        pass
    with url.urlopen(args.repository_url) as docfile:
        doc = xml.parse(docfile)
    root = doc.childNodes[0]
    # Collect every species in the repository matching the query
    found_species = []
    for species_node in root.getElementsByTagName('species'):
        info = getspeciesinfo(species_node, args.genus, args.force_scheme_name)
        if info is not None:
            found_species.append(info)
    if not found_species:
        print("No species matched your query.")
        return
    if len(found_species) > 1:
        print("The following {} species match your query, please be more specific:".format(len(found_species)))
        for info in found_species:
            print(info.name)
        return
    # Output information for the single matching species
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_').replace('/', '_')
    species_all_fasta_filename = species_name_underscores + '.fasta'
    log_filename = "mlst_data_download_{}_{}.log".format(species_name_underscores, species_info.retrieved)
    # Context managers ensure that the combined fasta and the log file are closed if a download fails
    with open('{}/{}'.format(args.path, species_all_fasta_filename), 'w') as species_all_fasta_file, \
            open('{}/{}'.format(args.path, log_filename), "w") as log_file:
        log_file.write(species_info.retrieved + '\n')
        profile_path = urlparse(species_info.profiles_url).path
        profile_filename = profile_path.split('/')[-1]
        log_file.write("definitions: {}\n".format(profile_filename))
        log_file.write("{} profiles\n".format(species_info.profiles_count))
        log_file.write("sourced from: {}\n\n".format(species_info.profiles_url))
        # Download the profile to a temporary file, and copy its contents to the destination
        localfile, _ = url.urlretrieve(species_info.profiles_url)
        with open(localfile, 'r') as profile_doc, \
                open(os.path.join(args.path, profile_filename), 'w') as profile_file:
            profile_file.write(profile_doc.read())
        for locus in species_info.loci:
            locus_path = urlparse(locus.url).path
            locus_filename = locus_path.split('/')[-1]
            log_file.write("locus {}\n".format(locus.name))
            log_file.write(locus_filename + '\n')
            log_file.write("Sourced from {}\n\n".format(locus.url))
            # Download the alleles, and write them to both the locus file and the combined file
            local_locus_doc, _ = url.urlretrieve(locus.url)
            with open(local_locus_doc, 'r') as locus_doc, \
                    open(os.path.join(args.path, locus_filename), 'w') as locus_file:
                locus_fasta_content = locus_doc.read()
                locus_file.write(locus_fasta_content)
                species_all_fasta_file.write(locus_fasta_content)
        log_file.write("all loci: {}\n".format(species_all_fasta_filename))