def basic(self):
    # Grab any .fastq files in the path
    fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
    # Extract the base name of the globbed name + path provided
    fastqnames = map(lambda x: os.path.split(x)[1], filer(fastqfiles))
    # Iterate through the names of the fastq files
    for fastqname in sorted(fastqnames):
        # Set the name
        metadata = MetadataObject()
        metadata.name = fastqname
        # Set the destination folder
        outputdir = os.path.join(self.path, fastqname)
        # Make the destination folder
        make_path(outputdir)
        # Get the fastq files specific to the fastqname
        specificfastq = glob(os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
        # Link the files to the output folder
        try:
            # Link the .gz files to :self.path/:filename
            list(map(lambda x: os.symlink('../{}'.format(os.path.basename(x)),
                                          '{}/{}'.format(outputdir, os.path.basename(x))), specificfastq))
        # Except os errors
        except OSError as exception:
            # If there is an exception other than the file exists, raise it
            if exception.errno != errno.EEXIST:
                raise
        # Initialise the general and run categories
        metadata.general = GenObject()
        metadata.run = GenObject()
        # Populate the .fastqfiles category of :self.metadata
        metadata.general.fastqfiles = [
            fastq for fastq in sorted(glob(os.path.join(outputdir, '{}*.fastq*'.format(metadata.name))))
            if 'trimmed' not in fastq and 'normalised' not in fastq and 'corrected' not in fastq
            and 'paired' not in fastq and 'unpaired' not in fastq
        ]
        # Add the output directory to the metadata
        metadata.general.outputdirectory = outputdir
        metadata.general.logout = os.path.join(self.path, metadata.name, '{}_log_out.txt'.format(metadata.name))
        metadata.general.logerr = os.path.join(self.path, metadata.name, '{}_log_err.txt'.format(metadata.name))
        # Append the metadata to the list of samples
        self.samples.append(metadata)
    # Grab metadata from previous runs
    previousmetadata = metadataReader.MetadataReader(self)
    # Update self.samples (if required)
    if previousmetadata.samples:
        self.samples = previousmetadata.samples
    # Run the read length method
    self.readlength()
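Every example in this section leans on MetadataObject and GenObject from accessoryFunctions. A minimal sketch of the contract they appear to satisfy, inferred from the usage above rather than taken from the library itself (the real classes do more):

class GenObject(object):
    """Dot-notation container: attributes can be set and read freely, and dumped to a dict."""
    def __init__(self, x=None):
        # Optionally seed the object from a dictionary, e.g. GenObject(jsondata[attr])
        seed = x if x is not None else {}
        for key, value in seed.items():
            setattr(self, key, value)

    def dump(self):
        # Return a plain-dict view suitable for json.dump
        return {key: value for key, value in vars(self).items()}


class MetadataObject(GenObject):
    """Top-level sample record; nested GenObjects hold the general/run/commands categories."""
    def dump(self):
        # Recurse into nested categories so the whole record serialises cleanly
        return {key: value.dump() if isinstance(value, GenObject) else value
                for key, value in vars(self).items()}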
def test_sistr(variables):
    metadata = MetadataObject()
    method.runmetadata.samples = list()
    fasta = os.path.join(variables.sequencepath, 'NC_003198.fasta')
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = list()
    # Set the destination folder
    outputdir = os.path.join(variables.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.general.logout = os.path.join(outputdir, 'out')
    metadata.general.logerr = os.path.join(outputdir, 'err')
    metadata.run.outputdirectory = outputdir
    metadata.general.bestassemblyfile = True
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match == 'SAL_BA2732AA'
    variable_update()
def __init__(self, args):
    """
    Initialises the variables required for this class
    :param args: list of arguments passed to the script
    """
    printtime('Welcome to the CFIA de novo bacterial assembly pipeline {}'
              .format(args.commit.decode('utf-8')), args.startingtime, '\033[1;94m')
    # Define variables from the arguments - there may be a more streamlined way to do this
    self.args = args
    self.path = os.path.join(args.sequencepath)
    self.reffilepath = os.path.join(args.referencefilepath)
    self.numreads = args.numreads
    self.preprocess = args.preprocess
    # Define the start time
    self.starttime = args.startingtime
    self.customsamplesheet = args.customsamplesheet
    if self.customsamplesheet:
        assert os.path.isfile(self.customsamplesheet), \
            'Cannot find custom sample sheet as specified {}'.format(self.customsamplesheet)
    self.basicassembly = args.basicassembly
    if not self.customsamplesheet and not os.path.isfile(os.path.join(self.path, 'SampleSheet.csv')):
        self.basicassembly = True
        printtime('Could not find a sample sheet. Performing basic assembly (no run metadata captured)',
                  self.starttime)
    # Use the argument for the number of threads to use, or default to the number of cpus in the system
    self.cpus = args.threads if args.threads else multiprocessing.cpu_count() - 1
    # Assertions to ensure that the provided variables are valid
    make_path(self.path)
    assert os.path.isdir(self.path), \
        'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
    self.reportpath = os.path.join(self.path, 'reports')
    assert os.path.isdir(self.reffilepath), \
        'Reference file path is not a valid directory {0!r:s}'.format(self.reffilepath)
    self.commit = args.commit.decode('utf-8')
    self.homepath = args.homepath
    self.logfile = os.path.join(self.path, 'logfile')
    self.runinfo = str()
    self.pipeline = True
    self.qualityobject = MetadataObject()
    # Initialise the metadata object
    self.runmetadata = MetadataObject()
def __init__(self, inputobject):
    self.path = inputobject.path
    self.starttime = inputobject.starttime
    self.sequencepath = inputobject.sequencepath
    try:
        self.customsamplesheet = inputobject.customsamplesheet
        self.bcltofastq = inputobject.bcltofastq
        self.miseqpath = inputobject.miseqpath
        self.miseqfolder = inputobject.miseqfolder
        self.fastqdestination = inputobject.fastqdestination
        self.forwardlength = inputobject.forwardlength
        self.reverselength = inputobject.reverselength
        self.numreads = 2 if self.reverselength != 0 else 1
        self.homepath = inputobject.homepath
        self.commit = inputobject.commit
        self.copy = inputobject.copy
    except AttributeError:
        self.bcltofastq = False
    try:
        self.debug = inputobject.debug
    except AttributeError:
        self.debug = False
    try:
        self.portallog = inputobject.portallog
    except AttributeError:
        self.portallog = ''
    self.samples = MetadataObject()
    self.forward = str()
    self.reverse = str()
    self.index = str()
    self.header = dict()
    self.run = dict()
def helper(self):
    """Helper function for file creation (if desired), manipulation, quality assessment, and trimming,
    as well as the assembly"""
    # Simple assembly without requiring accessory files (SampleSheet.csv, etc.)
    if self.basicassembly:
        self.runmetadata = Basic(self)
    else:
        # Populate the runmetadata object by parsing the SampleSheet.csv, GenerateFASTQRunStatistics.xml,
        # and RunInfo.xml files
        self.runinfo = os.path.join(self.path, 'RunInfo.xml')
        self.runmetadata = runMetadata.Metadata(self)
        # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
        self.runmetadata.parseruninfo()
        # Extract PhiX mapping information from the run
        phi = phix.PhiX(self)
        phi.main()
        # Populate the lack of bclcall and nohup call into the metadata sheet
        for sample in self.runmetadata.samples:
            sample.commands = GenObject()
            sample.commands.nohupcall = 'NA'
            sample.commands.bclcall = 'NA'
        # Move/link the FASTQ files to strain-specific working directories
        fastqmover.FastqMover(self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def __init__(self, inputobject):
    self.starttime = inputobject.starttime
    try:
        self.samples = inputobject.samples
    except AttributeError:
        self.samples = inputobject.runmetadata.samples
    try:
        self.completemetadata = inputobject.completemetadata
    except AttributeError:
        self.completemetadata = inputobject.runmetadata.samples
    self.path = inputobject.path
    try:
        self.analysescomplete = inputobject.analysescomplete
    except AttributeError:
        self.analysescomplete = True
    self.reportpath = inputobject.reportpath
    self.runmetadata = MetadataObject()
    try:
        self.runmetadata.samples = inputobject.runmetadata.samples
    except AttributeError:
        self.runmetadata.samples = inputobject.runmetadata
    try:
        self.portallog = inputobject.portallog
    except AttributeError:
        self.portallog = ''
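The try/except AttributeError ladders above (and in the earlier __init__) are this codebase's idiom for optional attributes; getattr() with a default expresses the same fallback in one line. A self-contained demonstration of the equivalence:

class _Args(object):
    pass

args = _Args()
# Equivalent to: try: debug = args.debug / except AttributeError: debug = False
debug = getattr(args, 'debug', False)
assert debug is False

The explicit try/except form remains preferable when the fallback itself can raise (e.g. falling back to inputobject.runmetadata.samples), which getattr's default cannot express.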
def helper(self):
    """Helper function for file creation (if desired), manipulation, quality assessment, and trimming,
    as well as the assembly"""
    # Simple assembly without requiring accessory files (SampleSheet.csv, etc.)
    if self.basicassembly:
        self.runmetadata = Basic(inputobject=self)
    else:
        # Populate the runmetadata object by parsing the SampleSheet.csv, GenerateFASTQRunStatistics.xml,
        # and RunInfo.xml files
        self.runinfo = os.path.join(self.path, 'RunInfo.xml')
        self.runmetadata = runMetadata.Metadata(passed=self)
        # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
        self.runmetadata.parseruninfo()
        # Extract PhiX mapping information from the run
        phi = phix.PhiX(inputobject=self)
        phi.main()
        # Populate the lack of bclcall and nohup call into the metadata sheet
        for sample in self.runmetadata.samples:
            sample.commands = GenObject()
            sample.commands.nohupcall = 'NA'
            sample.commands.bclcall = 'NA'
        # Move/link the FASTQ files to strain-specific working directories
        fastqmover.FastqMover(inputobject=self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(inputobject=self)
def __init__(self, args, pipelinecommit, startingtime, scriptpath):
    """
    :param args: command line arguments
    :param pipelinecommit: pipeline commit or version
    :param startingtime: time the script was started
    :param scriptpath: home path of the script
    """
    # Initialise variables
    self.commit = str(pipelinecommit)
    self.starttime = startingtime
    self.homepath = scriptpath
    self.args = args
    # Define variables based on supplied arguments
    self.path = os.path.join(args.path, '')
    assert os.path.isdir(self.path), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
    self.sequencepath = os.path.join(args.sequencepath, '')
    self.seqpath = self.sequencepath
    self.targetpath = os.path.join(args.targetpath, '')
    # The reference file path is used to work with submodule code with a different naming scheme
    self.reffilepath = self.targetpath
    self.reportpath = os.path.join(self.path, 'reports')
    make_path(self.reportpath)
    assert os.path.isdir(self.targetpath), \
        u'Target path is not a valid directory {0!r:s}'.format(self.targetpath)
    self.bcltofastq = args.bcl2fastq
    self.miseqpath = args.miseqpath
    self.miseqfolder = args.miseqfolder
    self.fastqdestination = args.destinationfastq
    self.forwardlength = args.readlengthforward
    self.reverselength = args.readlengthreverse
    self.numreads = 2 if self.reverselength != 0 else 1
    self.customsamplesheet = args.customsamplesheet
    # Set the custom cutoff value
    self.cutoff = args.customcutoffs
    # Use the argument for the number of threads to use, or default to the number of cpus in the system
    self.cpus = int(args.numthreads if args.numthreads else multiprocessing.cpu_count())
    self.threads = int()
    self.runmetadata = MetadataObject()
    self.taxonomy = {'Escherichia': 'coli', 'Listeria': 'monocytogenes', 'Salmonella': 'enterica'}
    self.analysistype = 'GeneSippr'
    self.copy = args.copy
    self.pipeline = False
    self.forward = str()
    self.reverse = str()
    self.index = str()
    self.header = dict()
    self.rundata = dict()
    self.completed = list()
    self.incomplete = list()
    self.analysescomplete = False
    self.final = False
    self.sum = int()
    self.completemetadata = list()
    self.samplesheetpath = str()
    self.samples = list()
    self.logfile = os.path.join(self.path, 'log')
    self.reports = str()
    # Run the method
    self.main()
def reader(self):
    import os
    import json
    from accessoryFunctions.accessoryFunctions import GenObject, MetadataObject
    for sample in self.metadata:
        metadatafile = '{}{}/{}_metadata.json'.format(self.path, sample.name, sample.name)
        if os.path.isfile(metadatafile):
            size = os.stat(metadatafile).st_size
            if size != 0:
                try:
                    with open(metadatafile) as metadatareport:
                        jsondata = json.load(metadatareport)
                    # Create the metadata object
                    metadata = MetadataObject()
                    # Initialise the metadata categories as GenObjects created using the appropriate key
                    for attr in jsondata:
                        if not isinstance(jsondata[attr], dict):
                            setattr(metadata, attr, jsondata[attr])
                        else:
                            setattr(metadata, attr, GenObject(jsondata[attr]))
                    # As files often need to be reanalysed after being moved, test to see if it is possible
                    # to use the metadata from the previous assembly
                    jsonfile = '{}/{}_metadata.json'.format(metadata.general.outputdirectory, sample.name)
                    try:
                        # Open the metadata file to write; a distinct handle name avoids shadowing the
                        # metadatafile path variable above. Open with 'w', not 'wb': this is text in Python 3
                        with open(jsonfile, 'w') as jsonreport:
                            # Write the json dump of the object dump to the metadata file
                            json.dump(sample.dump(), jsonreport, sort_keys=True, indent=4,
                                      separators=(',', ': '))
                        # Set the name
                        metadata.name = sample.name
                        self.samples.append(metadata)
                    except IOError:
                        self.samples.append(sample)
                except ValueError:
                    self.samples.append(sample)
            else:
                self.samples.append(sample)
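reader() depends on round-tripping metadata through JSON. A hedged sketch of that round trip, assuming (as reader() itself does) that .dump() returns a JSON-serialisable dict and that GenObject accepts a seed dictionary; the file name and attributes here are illustrative, not taken from a real run:

import json
from accessoryFunctions.accessoryFunctions import GenObject, MetadataObject

metadata = MetadataObject()
metadata.name = 'sample1'
metadata.general = GenObject()
metadata.general.outputdirectory = '/tmp/sample1'

# Serialise the record exactly the way reader() writes it
with open('/tmp/sample1_metadata.json', 'w') as handle:
    json.dump(metadata.dump(), handle, sort_keys=True, indent=4, separators=(',', ': '))

# Rebuild nested categories as GenObjects, mirroring the loop in reader()
with open('/tmp/sample1_metadata.json') as handle:
    jsondata = json.load(handle)
rebuilt = MetadataObject()
for attr, value in jsondata.items():
    setattr(rebuilt, attr, GenObject(value) if isinstance(value, dict) else value)
assert rebuilt.general.outputdirectory == '/tmp/sample1'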
def rmlst(self):
    """
    Get the most up-to-date profiles and alleles from pubmlst. Note that you will need the necessary
    access token and secret for this to work
    """
    printtime('Downloading rMLST database', self.start)
    # Set the name of the file to be used to determine if the database download and set-up was successful
    completefile = os.path.join(self.databasepath, 'rMLST', 'complete')
    if not os.path.isfile(completefile):
        # Create an object to send to the rMLST download script
        args = MetadataObject()
        # Add the path and start time attributes
        args.path = self.databasepath
        args.start = self.start
        # Run the rMLST download
        get_rmlst.Get(args)
        # Create and populate the complete.txt file
        with open(completefile, 'w') as complete:
            complete.write('\n'.join(glob(os.path.join(self.databasepath, 'rMLST', '*'))))
def test_sistr(variables):
    metadata = MetadataObject()
    method.runmetadata.samples = list()
    fasta = os.path.join(variables.sequencepath, 'NC_003198.fasta')
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = list()
    # Set the destination folder
    outputdir = os.path.join(variables.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.run.outputdirectory = outputdir
    metadata.general.bestassemblyfile = True
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match == 'SAL_BA2732AA'
    variable_update()
def setup(self):
    """
    Set up the metadata object to be passed to Vtyper()
    """
    from glob import glob
    files = sorted(glob('{}*.fasta'.format(self.sequencepath)))
    samples = list()
    # Create the metadata for each file
    for fasta in files:
        # Create a metadata object to store all metadata associated with each strain
        metadata = MetadataObject()
        metadata.general = GenObject()
        metadata.commands = GenObject()
        # Set the name
        metadata.name = os.path.basename(fasta).split('.')[0]
        metadata.general.bestassemblyfile = fasta
        metadata.general.stx = True
        metadata.general.outputdirectory = self.path
        metadata.general.filenoext = fasta.split('.')[0]
        metadata.general.fastqfiles = list()
        samples.append(metadata)
    return samples
def createobject(self):
    # Grab any .fastq files in the path
    fastqfiles = glob(os.path.join(self.path, '*.fastq*'))
    # Extract the base name of the globbed name + path provided
    fastqnames = map(lambda x: os.path.split(x)[1], filer(fastqfiles))
    # Iterate through the names of the fastq files
    for fastqname in sorted(fastqnames):
        # Set the name
        metadata = MetadataObject()
        metadata.name = fastqname
        # Set the destination folder
        outputdir = os.path.join(self.path, fastqname)
        # Make the destination folder
        make_path(outputdir)
        # Get the fastq files specific to the fastqname
        specificfastq = glob(os.path.join(self.path, '{}*.fastq*'.format(fastqname)))
        # Make relative symlinks to the files in :self.path
        try:
            for fastq in specificfastq:
                # Get the basename of the file
                fastqfile = os.path.split(fastq)[-1]
                # Set the destination fastq path as the base name plus the destination folder
                destinationfastq = os.path.join(outputdir, fastqfile)
                # Symlink the files
                os.symlink('../{}'.format(fastqfile), destinationfastq)
        # Except os errors
        except OSError as exception:
            # If there is an exception other than the file exists, raise it
            if exception.errno != errno.EEXIST:
                raise
        # Initialise the general and run categories
        metadata.general = GenObject()
        metadata.run = GenObject()
        # Populate the .fastqfiles category of :self.metadata
        metadata.general.fastqfiles = [
            fastq for fastq in glob(os.path.join(outputdir, '{}*.fastq*'.format(fastqname)))
            if 'trimmed' not in fastq
        ]
        # Add the output directory to the metadata
        metadata.general.outputdirectory = outputdir
        metadata.run.outputdirectory = outputdir
        metadata.general.bestassemblyfile = True
        metadata.general.trimmedcorrectedfastqfiles = metadata.general.fastqfiles
        metadata.general.logout = os.path.join(metadata.general.outputdirectory, 'logout')
        metadata.general.logerr = os.path.join(metadata.general.outputdirectory, 'logerr')
        # Initialise an attribute to store commands
        metadata.commands = GenObject()
        # Append the metadata to the list of samples
        self.samples.append(metadata)
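Both basic() and createobject() call filer() from accessoryFunctions without showing it. Judging from the usage, it collapses paired-end FASTQ file names down to a common sample stem; a minimal stand-in under that assumption (the real implementation handles more naming schemes):

import os
import re

def filer_sketch(filelist):
    # Strip Illumina-style suffixes (_S1, _L001, _R1/_R2) and the .fastq(.gz) extension,
    # returning the set of unique sample stems
    return {re.split(r'(_S\d+)?(_L\d+)?_R[12]', os.path.basename(name))[0].split('.fastq')[0]
            for name in filelist}

# e.g. filer_sketch(['sample1_R1.fastq.gz', 'sample1_R2.fastq.gz']) == {'sample1'}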
def alleleretriever(self):
    """
    Retrieve the required alleles from a file of all alleles, and create organism-specific allele files
    """
    logging.info('Retrieving alleles')
    # Index all the records in the allele file
    logging.info('Loading rMLST records')
    recorddict = SeqIO.index(self.allelefile, 'fasta')
    logging.info('Creating allele output files')
    # Create the organism-specific files of alleles
    for organism in sorted(self.alleledict):
        # Make an object to store information for each strain
        metadata = MetadataObject()
        metadata.organism = organism
        metadata.path = self.path
        metadata.outpath = os.path.join(self.path, 'outputalleles', organism, '')
        # Delete and recreate the output path - as the files are appended to each time, they will be too
        # large if this script is run more than once
        try:
            shutil.rmtree(metadata.outpath)
        except OSError:
            pass
        make_path(metadata.outpath)
        metadata.combined = os.path.join(metadata.outpath, 'gdcs_alleles.fasta')
        metadata.allelefiles = list()
        with open(metadata.combined, 'w') as combined:
            for gene, alleles in sorted(self.alleledict[organism].items()):
                # Open the file to append
                allelefiles = os.path.join(metadata.outpath, '{}.tfa'.format(gene))
                metadata.allelefiles.append(allelefiles)
                with open(allelefiles, 'a') as allelefile:
                    # Write each allele record to the file
                    for allele in sorted(alleles):
                        # Skip adding alleles that are no longer in the database
                        try:
                            SeqIO.write(recorddict['{}_{}'.format(gene, allele)], allelefile, 'fasta')
                            SeqIO.write(recorddict['{}_{}'.format(gene, allele)], combined, 'fasta')
                        except KeyError:
                            pass
        # Add the populated metadata to the list
        self.samples.append(metadata)
def __init__(self):
    from argparse import ArgumentParser
    from time import time
    # Parser for arguments
    parser = ArgumentParser(
        description='Performs ePCR using a supplied primer file. The primers must be in the format: '
                    '<name>\t<forward primer>\t<reverse primer>\t<max size allowed between primers>\n. '
                    'Sequence files must be stored in <path>/sequences')
    parser.add_argument('path',
                        help='Specify path in which reports are to be stored')
    parser.add_argument('-s', '--sequencepath',
                        required=True,
                        help='Path to assembly files')
    parser.add_argument('-f', '--primerfile',
                        required=True,
                        help='The name and path of the file containing the primers')
    # Get the arguments into an object
    arguments = parser.parse_args()
    self.starttime = time()
    # Add trailing slashes to the path variables to ensure consistent formatting (os.path.join)
    self.path = os.path.join(arguments.path, '')
    self.sequencepath = os.path.join(arguments.sequencepath, '')
    self.primerfile = arguments.primerfile
    # Initialise variables
    self.runmetadata = MetadataObject()
    self.reffilepath = False
    self.analysistype = 'ePCR'
    self.reportpath = os.path.join(self.path, 'reports')
    make_path(self.reportpath)
    # Initialise metadata
    self.runmetadata.samples = self.setup()
    self.logfile = os.path.join(self.path, 'vtyper_logfile.txt')
    # Run the analyses
    Vtyper(self, self.analysistype)
    # Create a report
    self.reporter()
    # Print the metadata to file
    printtime('Printing metadata to file', self.starttime)
    metadataprinter.MetadataPrinter(self)
    # Print a bold, green exit statement
    print(u'\033[92m' + u'\033[1m' + u'\nElapsed Time: %0.2f seconds' % (time() - self.starttime) + u'\033[0m')
def __init__(self, args):
    """
    Initialises the variables required for this class
    :param args: list of arguments passed to the script
    """
    SetupLogging()
    logging.info('Welcome to the CFIA de novo bacterial assembly pipeline {}'.format(__version__))
    # Define variables from the arguments - there may be a more streamlined way to do this
    self.args = args
    self.path = os.path.join(args.sequencepath)
    self.reffilepath = os.path.join(args.referencefilepath)
    self.numreads = args.numreads
    self.preprocess = args.preprocess
    # Define the start time
    self.starttime = args.startingtime
    self.customsamplesheet = args.customsamplesheet
    if self.customsamplesheet:
        assert os.path.isfile(self.customsamplesheet), \
            'Cannot find custom sample sheet as specified {}'.format(self.customsamplesheet)
    self.basicassembly = args.basicassembly
    if not self.customsamplesheet and not os.path.isfile(os.path.join(self.path, 'SampleSheet.csv')):
        self.basicassembly = True
        logging.warning('Could not find a sample sheet. Performing basic assembly (no run metadata captured)')
    # Use the argument for the number of threads to use, or default to the number of cpus in the system
    self.cpus = args.threads if args.threads else multiprocessing.cpu_count() - 1
    # Assertions to ensure that the provided variables are valid
    make_path(self.path)
    assert os.path.isdir(self.path), \
        'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
    self.reportpath = os.path.join(self.path, 'reports')
    assert os.path.isdir(self.reffilepath), \
        'Reference file path is not a valid directory {0!r:s}'.format(self.reffilepath)
    self.commit = __version__
    self.homepath = args.homepath
    self.logfile = os.path.join(self.path, 'logfile')
    self.runinfo = str()
    self.pipeline = True
    self.qualityobject = MetadataObject()
    # Initialise the metadata object
    self.runmetadata = MetadataObject()
def __init__(self, inputobject):
    self.start = inputobject.starttime
    self.commit = inputobject.commit
    self.starttime = inputobject.starttime
    self.homepath = inputobject.homepath
    self.path = inputobject.path
    self.cpus = inputobject.cpus
    self.metadata = inputobject.runmetadata.samples
    self.runmetadata = MetadataObject()
    self.runmetadata.samples = list()
    self.reffilepath = inputobject.reffilepath
    self.reportpath = inputobject.reportpath
    self.logfile = inputobject.logfile
    self.analysistype = 'coregenome'
    self.cutoff = 90
    self.coregenomes = list()
    # Fields used for custom outfmt 6 BLAST output:
    self.fieldnames = ['query_id', 'subject_id', 'positives', 'mismatches', 'gaps', 'evalue', 'bit_score',
                       'subject_length', 'alignment_length', 'query_start', 'query_end', 'query_sequence',
                       'subject_start', 'subject_end', 'subject_sequence']
    # Run the analyses
    self.annotatedcore()
def methodreporter(self):
    """
    Create final reports collating results from all the individual iterations through the method pipeline
    """
    # Ensure that the analyses are set to complete
    self.analysescomplete = True
    # Reset the report path to its original value
    self.reportpath = os.path.join(self.path, 'reports')
    # Clear the runmetadata - it will be populated with all the metadata from completemetadata
    self.runmetadata = MetadataObject()
    self.runmetadata.samples = list()
    # As the samples were entered into self.completemetadata depending on when they passed the quality
    # threshold, this list is not ordered numerically/alphabetically like the original runmetadata.
    # Reset the order.
    for strain in self.samples:
        for sample in self.completemetadata:
            if sample.name == strain:
                # Append the sample to the ordered list of objects
                self.runmetadata.samples.append(sample)
    # Create the reports
    self.reporter()
    self.genusspecific()
    self.sixteensreporter()
    self.gdcsreporter()
def mlst(self, genera={'Escherichia', 'Vibrio', 'Campylobacter', 'Listeria', 'Bacillus', 'Staphylococcus',
                       'Salmonella'}):
    """
    Download the necessary up-to-date MLST profiles and alleles
    """
    printtime('Downloading MLST databases', self.start)
    for genus in genera:
        # Create an object to pass to the get_mlst script
        args = MetadataObject()
        # Populate the object with the necessary attributes
        args.species = genus
        args.repository_url = 'http://pubmlst.org/data/dbases.xml'
        args.force_scheme_name = False
        args.path = os.path.join(self.databasepath, 'MLST', genus)
        # Create the name of the file to be used to determine if the database download and setup was successful
        completefile = os.path.join(args.path, 'complete')
        # Only download the files if the download was not previously successful
        if not os.path.isfile(completefile):
            # Run the download
            get_mlst.main(args)
            # Create and populate the complete.txt file
            with open(completefile, 'w') as complete:
                complete.write('\n'.join(glob(os.path.join(args.path, '*'))))
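rmlst() and mlst() share a sentinel-file pattern: a 'complete' file listing the downloaded contents marks a finished download and suppresses re-downloading on later runs. The pattern in isolation (path layout as above):

import os
from glob import glob

def database_ready(databasepath):
    # The download is considered successful once the sentinel file exists
    return os.path.isfile(os.path.join(databasepath, 'complete'))

def mark_complete(databasepath):
    # Record the downloaded files in the sentinel, so a partial download that died before
    # this point is distinguishable from a finished one
    with open(os.path.join(databasepath, 'complete'), 'w') as complete:
        complete.write('\n'.join(glob(os.path.join(databasepath, '*'))))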
class RunAssemble(object):

    def main(self):
        """
        Run the methods in the correct order
        """
        # Start the assembly
        self.helper()
        # Create the quality object
        self.create_quality_object()
        # Run the quality analyses
        self.quality()
        # Perform assembly
        self.assemble()
        # Perform genus-agnostic typing
        self.agnostictyping()
        # Perform typing
        self.typing()
        # Create a report
        reporter.Reporter(self)
        # Compress or remove all large, temporary files created by the pipeline
        compress.Compress(self)
        metadataprinter.MetadataPrinter(self)

    def helper(self):
        """Helper function for file creation (if desired), manipulation, quality assessment, and trimming,
        as well as the assembly"""
        # Simple assembly without requiring accessory files (SampleSheet.csv, etc.)
        if self.basicassembly:
            self.runmetadata = Basic(self)
        else:
            # Populate the runmetadata object by parsing the SampleSheet.csv, GenerateFASTQRunStatistics.xml,
            # and RunInfo.xml files
            self.runinfo = os.path.join(self.path, 'RunInfo.xml')
            self.runmetadata = runMetadata.Metadata(self)
            # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
            self.runmetadata.parseruninfo()
            # Extract PhiX mapping information from the run
            phi = phix.PhiX(self)
            phi.main()
            # Populate the lack of bclcall and nohup call into the metadata sheet
            for sample in self.runmetadata.samples:
                sample.commands = GenObject()
                sample.commands.nohupcall = 'NA'
                sample.commands.bclcall = 'NA'
            # Move/link the FASTQ files to strain-specific working directories
            fastqmover.FastqMover(self)
        # Print the metadata to file
        metadataprinter.MetadataPrinter(self)

    def create_quality_object(self):
        """
        Create the quality object
        """
        self.qualityobject = quality.Quality(self)

    def quality(self):
        """
        Creates quality objects and runs quality assessments and quality processes on the supplied sequences
        """
        # Validate that the FASTQ files are in the proper format, and that there are no issues e.g. different
        # numbers of forward and reverse reads, read length longer than quality score length, proper extension
        self.fastq_validate()
        # Run FastQC on the unprocessed fastq files
        self.fastqc_raw()
        # Perform quality trimming and FastQC on the trimmed files
        self.quality_trim()
        # Run FastQC on the trimmed files
        self.fastqc_trimmed()
        # Perform error correcting on the reads
        self.error_correct()
        # Detect contamination in the reads
        self.contamination_detection()
        # Run FastQC on the processed fastq files
        self.fastqc_trimmedcorrected()
        # Exit if only pre-processing of data is requested
        metadataprinter.MetadataPrinter(self)
        if self.preprocess:
            printtime('Pre-processing complete', self.starttime)
            quit()

    def fastq_validate(self):
        """
        Attempt to detect and fix issues with the FASTQ files
        """
        self.qualityobject.validate_fastq()
        metadataprinter.MetadataPrinter(self)

    def fastqc_raw(self):
        """
        Run FastQC on the unprocessed FASTQ files
        """
        self.qualityobject.fastqcthreader('Raw')
        metadataprinter.MetadataPrinter(self)

    def quality_trim(self):
        """
        Perform quality trimming and FastQC on the trimmed files
        """
        self.qualityobject.trimquality()
        metadataprinter.MetadataPrinter(self)

    def fastqc_trimmed(self):
        """
        Run FastQC on the quality trimmed FASTQ files
        """
        self.qualityobject.fastqcthreader('Trimmed')
        metadataprinter.MetadataPrinter(self)

    def error_correct(self):
        """
        Perform error correcting on the reads
        """
        self.qualityobject.error_correction()
        metadataprinter.MetadataPrinter(self)

    def contamination_detection(self):
        """
        Calculate the levels of contamination in the reads
        """
        self.qualityobject.contamination_finder()
        metadataprinter.MetadataPrinter(self)

    def fastqc_trimmedcorrected(self):
        """
        Run FastQC on the processed fastq files
        """
        self.qualityobject.fastqcthreader('trimmedcorrected')
        metadataprinter.MetadataPrinter(self)

    def assemble(self):
        """
        Assemble genomes and perform some basic quality analyses
        """
        # Assemble genomes
        self.assemble_genomes()
        # Calculate assembly metrics on raw assemblies
        self.quality_features('raw')
        # Calculate the depth of coverage as well as other quality metrics using Qualimap
        self.qualimap()
        # Calculate assembly metrics on polished assemblies
        self.quality_features('polished')
        # ORF detection
        self.prodigal()
        # Assembly quality determination
        self.genome_qaml()
        # CLARK analyses
        self.clark()

    def assemble_genomes(self):
        """
        Use skesa to assemble genomes
        """
        assembly = skesa.Skesa(self)
        assembly.main()
        metadataprinter.MetadataPrinter(self)

    def qualimap(self):
        """
        Calculate the depth of coverage as well as other quality metrics using Qualimap
        """
        qual = depth.QualiMap(self)
        qual.main()
        metadataprinter.MetadataPrinter(self)

    def quality_features(self, analysis):
        """
        Extract features from assemblies such as total genome size, longest contig, and N50
        """
        features = quality.QualityFeatures(self, analysis)
        features.main()
        metadataprinter.MetadataPrinter(self)

    def prodigal(self):
        """
        Use prodigal to detect open reading frames in the assemblies
        """
        prodigal.Prodigal(self)
        metadataprinter.MetadataPrinter(self)

    def genome_qaml(self):
        """
        Use GenomeQAML to determine the quality of the assemblies
        """
        g_qaml = quality.GenomeQAML(self)
        g_qaml.main()
        metadataprinter.MetadataPrinter(self)

    def clark(self):
        """
        Run CLARK metagenome analyses on the raw reads and assemblies if the system has adequate resources
        """
        # Determine the amount of physical memory in the system
        mem = virtual_memory()
        # If the total amount of memory is greater than 100GB (this could probably be lowered), run CLARK
        if mem.total >= 100000000000:
            # Run CLARK typing on the .fastq and .fasta files
            automateCLARK.PipelineInit(self)
            automateCLARK.PipelineInit(self, 'fastq')
        else:
            # Run CLARK typing on the .fastq and .fasta files in light mode
            automateCLARK.PipelineInit(self, light=True)
            automateCLARK.PipelineInit(self, 'fastq', light=True)
        metadataprinter.MetadataPrinter(self)

    def agnostictyping(self):
        """
        Perform typing that does not require the genus of the organism to be known
        """
        # Run mash
        self.mash()
        # Run rMLST
        self.rmlst()
        # Run the 16S analyses
        self.sixteens()
        # Calculate the presence/absence of GDCS
        self.run_gdcs()
        # Find genes of interest
        self.genesippr()
        # Plasmid finding
        self.plasmids()
        # Plasmid extracting
        self.plasmid_extractor()
        # Resistance finding - raw reads
        self.ressippr()
        # Resistance finding - assemblies
        self.resfinder()
        # Prophage detection
        self.prophages()
        # Univec contamination search
        self.univec()
        # Virulence
        self.virulence()

    def mash(self):
        """
        Run mash to determine closest refseq genome
        """
        mash.Mash(self, 'mash')
        metadataprinter.MetadataPrinter(self)

    def rmlst(self):
        """
        Run rMLST analyses
        """
        MLSTSippr(self, self.commit, self.starttime, self.homepath, 'rMLST', 1.0, True)
        metadataprinter.MetadataPrinter(self)

    def sixteens(self):
        """
        Run the 16S analyses
        """
        SixteensFull(self, self.commit, self.starttime, self.homepath, 'sixteens_full', 0.95)
        metadataprinter.MetadataPrinter(self)

    def run_gdcs(self):
        """
        Determine the presence of genomically-dispersed conserved sequences for Escherichia, Listeria,
        and Salmonella strains
        """
        # Run the GDCS analysis
        GDCS(self)
        metadataprinter.MetadataPrinter(self)

    def genesippr(self):
        """
        Find genes of interest
        """
        GeneSippr(self, self.commit, self.starttime, self.homepath, 'genesippr', 0.95, False, False)
        metadataprinter.MetadataPrinter(self)

    def plasmids(self):
        """
        Plasmid finding
        """
        Plasmids(self, self.commit, self.starttime, self.homepath, 'plasmidfinder', 0.8, False, True)
        metadataprinter.MetadataPrinter(self)

    def plasmid_extractor(self):
        """
        Extracts and types plasmid sequences
        """
        plasmids = PlasmidExtractor(self)
        plasmids.main()
        metadataprinter.MetadataPrinter(self)

    def ressippr(self):
        """
        Resistance finding - raw reads
        """
        res = Resistance(self, self.commit, self.starttime, self.homepath, 'resfinder', 0.8, False, True)
        res.main()
        metadataprinter.MetadataPrinter(self)

    def resfinder(self):
        """
        Resistance finding - assemblies
        """
        ResFinder(self)
        metadataprinter.MetadataPrinter(self)

    def prophages(self, cutoff=90):
        """
        Prophage detection
        :param cutoff: cutoff value to be used in the analyses
        """
        pro = GeneSeekrMethod.PipelineInit(self, 'prophages', False, cutoff, True)
        Prophages(pro)
        metadataprinter.MetadataPrinter(self)

    def univec(self):
        """
        Univec contamination search
        """
        uni = univec.PipelineInit(self, 'univec', False, 80, True)
        Univec(uni)
        metadataprinter.MetadataPrinter(self)

    def virulence(self):
        """
        Virulence gene detection
        """
        vir = Virulence(self, self.commit, self.starttime, self.homepath, 'virulence', 0.95, False, True)
        vir.reporter()
        metadataprinter.MetadataPrinter(self)

    def typing(self):
        """
        Perform analyses that use genera-specific databases
        """
        # Run modules and print metadata to file
        # MLST
        self.mlst()
        # Serotyping
        self.serosippr()
        # Virulence typing
        self.vtyper()
        # Core genome calculation
        self.coregenome()
        # Sistr
        self.sistr()

    def mlst(self):
        """
        MLST analyses
        """
        MLSTSippr(self, self.commit, self.starttime, self.homepath, 'MLST', 1.0, True)
        metadataprinter.MetadataPrinter(self)

    def serosippr(self):
        """
        Serotyping analyses
        """
        Serotype(self, self.commit, self.starttime, self.homepath, 'serosippr', 0.95, True)
        metadataprinter.MetadataPrinter(self)

    def vtyper(self):
        """
        Virulence typing
        """
        vtype = vtyper.PrimerFinder(self, 'vtyper')
        vtype.main()
        metadataprinter.MetadataPrinter(self)

    def coregenome(self):
        """
        Core genome calculation
        """
        coregen = GeneSeekrMethod.PipelineInit(self, 'coregenome', True, 70, False)
        core.CoreGenome(coregen)
        core.AnnotatedCore(self)
        metadataprinter.MetadataPrinter(self)

    def sistr(self):
        """
        Sistr
        """
        sistr.Sistr(self, 'sistr')
        metadataprinter.MetadataPrinter(self)

    def __init__(self, args):
        """
        Initialises the variables required for this class
        :param args: list of arguments passed to the script
        """
        printtime('Welcome to the CFIA de novo bacterial assembly pipeline {}'
                  .format(args.commit.decode('utf-8')), args.startingtime, '\033[1;94m')
        # Define variables from the arguments - there may be a more streamlined way to do this
        self.args = args
        self.path = os.path.join(args.sequencepath)
        self.reffilepath = os.path.join(args.referencefilepath)
        self.numreads = args.numreads
        self.preprocess = args.preprocess
        # Define the start time
        self.starttime = args.startingtime
        self.customsamplesheet = args.customsamplesheet
        if self.customsamplesheet:
            assert os.path.isfile(self.customsamplesheet), \
                'Cannot find custom sample sheet as specified {}'.format(self.customsamplesheet)
        self.basicassembly = args.basicassembly
        if not self.customsamplesheet and not os.path.isfile(os.path.join(self.path, 'SampleSheet.csv')):
            self.basicassembly = True
            printtime('Could not find a sample sheet. Performing basic assembly (no run metadata captured)',
                      self.starttime)
        # Use the argument for the number of threads to use, or default to the number of cpus in the system
        self.cpus = args.threads if args.threads else multiprocessing.cpu_count() - 1
        # Assertions to ensure that the provided variables are valid
        make_path(self.path)
        assert os.path.isdir(self.path), \
            'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
        self.reportpath = os.path.join(self.path, 'reports')
        assert os.path.isdir(self.reffilepath), \
            'Reference file path is not a valid directory {0!r:s}'.format(self.reffilepath)
        self.commit = args.commit.decode('utf-8')
        self.homepath = args.homepath
        self.logfile = os.path.join(self.path, 'logfile')
        self.runinfo = str()
        self.pipeline = True
        self.qualityobject = MetadataObject()
        # Initialise the metadata object
        self.runmetadata = MetadataObject()
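How this class is presumably driven: a MetadataObject stands in for an argparse Namespace, and every path and value below is a placeholder, so this is a sketch of the wiring rather than the project's actual entry point:

import os
from time import time
from accessoryFunctions.accessoryFunctions import MetadataObject

if __name__ == '__main__':
    args = MetadataObject()
    args.sequencepath = '/sequences'        # placeholder paths - must exist for the asserts to pass
    args.referencefilepath = '/databases'
    args.numreads = 2
    args.preprocess = False
    args.startingtime = time()
    args.customsamplesheet = None
    args.basicassembly = True
    args.threads = 0                        # falsy: fall back to cpu_count() - 1
    args.commit = b'0.0.1'                  # bytes, since __init__ calls .decode('utf-8')
    args.homepath = os.path.dirname(os.path.abspath(__file__))
    assembler = RunAssemble(args)
    assembler.main()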
def __init__(self, inputobject, extension='fasta', light=False):
    # Create an object to mimic the command line arguments necessary for the script
    args = MetadataObject()
    args.path = inputobject.path
    args.sequencepath = inputobject.path
    args.databasepath = os.path.join(inputobject.reffilepath, 'clark')
    make_path(args.databasepath)
    args.clarkpath = os.path.dirname(which('CLARK'))
    args.clarkpath += '/../opt/clark/'
    args.cutoff = 0.005
    args.database = 'bacteria'
    args.rank = 'species'
    args.filter = False
    args.threads = inputobject.cpus
    args.runmetadata = inputobject.runmetadata
    args.clean_seqs = False
    args.reffilepath = inputobject.reffilepath
    args.runmetadata.extension = extension
    args.light = light
    # Run CLARK
    CLARK(args, inputobject.commit, inputobject.starttime, inputobject.homepath)
def probefinder(self):
    """
    Find the longest probe sequences
    """
    logging.info('Finding and filtering probe sequences')
    for sample in self.samples:
        # A list to store the metadata object for each alignment
        sample.gene = list()
        for align in sample.alignedalleles:
            # Create an object to store all the information for each alignment file
            metadata = GenObject()
            metadata.name = os.path.splitext(os.path.basename(align))[0]
            metadata.alignmentfile = align
            # Create an alignment object from the alignment file
            try:
                metadata.alignment = AlignIO.read(align, 'fasta')
            except ValueError:
                # If a "ValueError: Sequences must all be the same length" is raised, pad the shorter
                # sequences to be the length of the longest sequence
                # https://stackoverflow.com/questions/32833230/biopython-alignio-valueerror-says-strings-must-be-same-length
                records = SeqIO.parse(align, 'fasta')
                # Make a copy, otherwise our generator is exhausted after calculating maxlen
                records = list(records)
                # Calculate the length of the longest sequence
                maxlen = max(len(record.seq) for record in records)
                # Pad sequences so that they all have the same length
                for record in records:
                    if len(record.seq) != maxlen:
                        sequence = str(record.seq).ljust(maxlen, '.')
                        record.seq = Seq(sequence)
                assert all(len(record.seq) == maxlen for record in records)
                # Write to file and do alignment
                metadata.alignmentfile = '{}_padded.tfa'.format(os.path.splitext(align)[0])
                with open(metadata.alignmentfile, 'w') as padded:
                    SeqIO.write(records, padded, 'fasta')
                # Align the padded sequences
                metadata.alignment = AlignIO.read(metadata.alignmentfile, 'fasta')
            metadata.summaryalign = AlignInfo.SummaryInfo(metadata.alignment)
            # The dumb consensus is a very simple consensus sequence calculated from the alignment.
            # Default parameters of threshold=.7 and ambiguous='X' are used
            consensus = metadata.summaryalign.dumb_consensus()
            metadata.consensus = str(consensus)
            # The position-specific scoring matrix (PSSM) stores the frequency of each base observed at
            # each location along the entire consensus sequence
            metadata.pssm = metadata.summaryalign.pos_specific_score_matrix(consensus)
            metadata.identity = list()
            # Find the prevalence of each base for every location along the sequence
            for line in metadata.pssm:
                try:
                    bases = [line['A'], line['C'], line['G'], line['T'], line['-']]
                    # Calculate the frequency of the most common base - don't count gaps
                    metadata.identity.append(float('{:.2f}'.format(max(bases[:4]) / sum(bases) * 100)))
                except KeyError:
                    bases = [line['A'], line['C'], line['G'], line['T']]
                    # Calculate the frequency of the most common base - don't count gaps
                    metadata.identity.append(float('{:.2f}'.format(max(bases) / sum(bases) * 100)))
            # List to store metadata objects
            metadata.windows = list()
            # Variable to store whether a suitable probe has been found for the current organism + gene
            # pair. As the probe sizes are evaluated in descending size, as soon as a probe has been
            # discovered, the search for more probes can stop, and subsequent probes will be smaller than
            # the one(s) already found
            passing = False
            # Create sliding windows of size self.max - self.min from the list of identities for each
            # column of the alignment
            for i in reversed(range(self.min, self.max + 1)):
                if not passing:
                    windowdata = MetadataObject()
                    windowdata.size = i
                    windowdata.max = 0
                    # Initialise the minimum at the highest possible percent identity so the first
                    # passing window can only lower it
                    windowdata.min = 100
                    windowdata.sliding = list()
                    # Create a counter to store the starting location of the window in the sequence
                    n = 0
                    # Create sliding windows from the range of sizes for the list of identities
                    windows = self.window(metadata.identity, i)
                    # Go through each window from the collection of sliding windows to determine which
                    # window(s) has (have) the best results
                    for window in windows:
                        # Create another object to store all the data for the window
                        slidingdata = MetadataObject()
                        # Only consider the window if every position has a percent identity greater than
                        # the cutoff
                        if min(window) > self.cutoff:
                            # Populate the object with the necessary variables
                            slidingdata.location = '{}:{}'.format(n, n + i)
                            slidingdata.min = min(window)
                            slidingdata.mean = float('{:.2f}'.format(numpy.mean(window)))
                            slidingdata.sequence = str(consensus[n:n + i])
                            # Create attributes for evaluating windows. A greater windowdata.max/lesser
                            # windowdata.min means a better/worse overall percent identity, respectively
                            windowdata.max = slidingdata.mean if slidingdata.mean >= windowdata.max \
                                else windowdata.max
                            windowdata.min = slidingdata.mean if slidingdata.mean <= windowdata.min \
                                else windowdata.min
                            # Add the object to the list of objects
                            windowdata.sliding.append(slidingdata)
                            passing = True
                        n += 1
                    # Add the object to the list of objects
                    metadata.windows.append(windowdata)
            # Add the object to the list of objects
            sample.gene.append(metadata)
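self.window() is not shown in this excerpt; a minimal generator consistent with how it is called above (an assumption about the real helper, not its source):

def window(iterable, size):
    # Yield every contiguous run of 'size' elements from the list of per-column identities
    for start in range(len(iterable) - size + 1):
        yield iterable[start:start + size]

# e.g. list(window([99.0, 98.5, 100.0], 2)) == [[99.0, 98.5], [98.5, 100.0]]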
parser.add_argument('path',
                    help='Specify input directory')
parser.add_argument('-s', '--sequencepath',
                    required=True,
                    help='Path of .fastq(.gz) files to process.')
# Get the arguments into an object
arguments = parser.parse_args()
# Define the start time
arguments.starttime = time.time()
# Find the files
fastas = sorted(glob(os.path.join(arguments.sequencepath, '*.fa*')))
# Create a metadata object
arguments.runmetadata = MetadataObject()
arguments.runmetadata.samples = list()
for fasta in fastas:
    metadata = MetadataObject()
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    # Set the destination folder
    outputdir = os.path.join(arguments.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.run.outputdirectory = outputdir
    metadata.general.bestassemblyfile = True
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
def __init__(self, args, pipelinecommit, startingtime, scriptpath, analysistype, cutoff, pipeline):
    """
    :param args: command line arguments
    :param pipelinecommit: pipeline commit or version
    :param startingtime: time the script was started
    :param scriptpath: home path of the script
    :param analysistype: name of the analysis being performed - allows the program to find databases
    :param cutoff: percent identity cutoff for matches
    :param pipeline: boolean of whether this script needs to run as part of a particular assembly pipeline
    """
    import multiprocessing
    # Initialise variables
    self.commit = str(pipelinecommit)
    self.starttime = startingtime
    self.homepath = scriptpath
    # Define variables based on supplied arguments
    self.path = os.path.join(args.path, '')
    assert os.path.isdir(self.path), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
    try:
        self.sequencepath = os.path.join(args.sequencepath, '')
    except AttributeError:
        self.sequencepath = self.path
    assert os.path.isdir(self.sequencepath), \
        u'Sequence path is not a valid directory {0!r:s}'.format(self.sequencepath)
    try:
        self.targetpath = os.path.join(args.reffilepath)
    except AttributeError:
        self.targetpath = os.path.join(args.targetpath)
    self.reportpath = os.path.join(self.path, 'reports')
    assert os.path.isdir(self.targetpath), \
        u'Target path is not a valid directory {0!r:s}'.format(self.targetpath)
    try:
        self.bcltofastq = args.bcltofastq
    except AttributeError:
        self.bcltofastq = False
    try:
        self.miseqpath = args.miseqpath
    except AttributeError:
        self.miseqpath = str()
    try:
        self.miseqfolder = args.miseqfolder
    except AttributeError:
        self.miseqfolder = str()
    try:
        self.fastqdestination = args.fastqdestination
    except AttributeError:
        self.fastqdestination = str()
    try:
        self.forwardlength = args.forwardlength
    except AttributeError:
        self.forwardlength = 'full'
    try:
        self.reverselength = args.reverselength
    except AttributeError:
        self.reverselength = 'full'
    self.numreads = 2 if self.reverselength != 0 else 1
    self.customsamplesheet = args.customsamplesheet
    self.taxonomy = {'Escherichia': 'coli', 'Listeria': 'monocytogenes', 'Salmonella': 'enterica'}
    self.logfile = args.logfile
    # Set the custom cutoff value
    self.cutoff = float(cutoff)
    try:
        self.averagedepth = int(args.averagedepth)
    except AttributeError:
        self.averagedepth = 10
    try:
        self.copy = args.copy
    except AttributeError:
        self.copy = False
    self.pipeline = pipeline
    if not self.pipeline:
        self.runmetadata = MetadataObject()
        # Create the objects to be used in the analyses
        objects = Objectprep(self)
        objects.objectprep()
        self.runmetadata = objects.samples
    else:
        self.runmetadata = args.runmetadata
    # Use the argument for the number of threads to use, or default to the number of cpus in the system
    try:
        self.cpus = int(args.cpus)
    except AttributeError:
        self.cpus = multiprocessing.cpu_count()
    # Divide the CPU budget evenly between the samples, with a floor of one thread each
    try:
        self.threads = int(self.cpus / len(self.runmetadata.samples)) \
            if self.cpus / len(self.runmetadata.samples) > 1 else 1
    except TypeError:
        self.threads = self.cpus
    self.analysistype = analysistype
    # Run the analyses
    self.runner()
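The per-sample thread arithmetic above, extracted for clarity: split the CPU budget evenly across the samples and never drop below one thread each.

def threads_per_sample(cpus, sample_count):
    # Integer division of the CPU budget across samples, with a floor of one thread each
    return max(1, cpus // sample_count) if sample_count else cpus

assert threads_per_sample(8, 3) == 2
assert threads_per_sample(2, 8) == 1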
def __init__(self, args, pipelinecommit, startingtime, scriptpath):
    # Initialise variables
    self.commit = str(pipelinecommit)
    self.start = startingtime
    self.homepath = scriptpath
    # Define variables based on supplied arguments
    self.args = args
    self.path = os.path.join(args.path, '')
    assert os.path.isdir(self.path), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
    self.sequencepath = os.path.join(args.sequencepath, '')
    assert os.path.isdir(self.sequencepath), \
        u'Supplied sequence path is not a valid directory {0!r:s}'.format(self.sequencepath)
    self.databasepath = os.path.join(args.databasepath, '')
    assert os.path.isdir(self.databasepath), \
        u'Supplied database path is not a valid directory {0!r:s}'.format(self.databasepath)
    # There seems to be an issue with CLARK when running with a very high number of cores. Limit self.cpus to 1
    self.cpus = 1
    # Set variables from the arguments
    self.database = args.database
    self.rank = args.rank
    self.clarkpath = args.clarkpath
    self.cutoff = float(args.cutoff) * 100
    # Initialise variables for the analysis
    self.targetcall = str()
    self.classifycall = str()
    self.devnull = open(os.devnull, 'wb')
    self.filelist = os.path.join(self.path, 'sampleList.txt')
    self.reportlist = os.path.join(self.path, 'reportList.txt')
    self.abundancequeue = Queue()
    self.datapath = str()
    self.reportpath = os.path.join(self.path, 'reports')
    self.clean_seqs = args.clean_seqs
    self.light = args.light
    if self.clean_seqs:
        try:
            self.reffilepath = args.reffilepath
        except AttributeError:
            self.clean_seqs = False
    # If run as part of the assembly pipeline, a few modifications are necessary to ensure that the
    # metadata objects and variables play nice
    try:
        if args.runmetadata:
            self.runmetadata = args.runmetadata
            self.extension = self.runmetadata.extension
            # Create the name of the final report
            self.report = os.path.join(self.reportpath, 'abundance{}.xlsx'.format(self.extension))
            # Only re-run the CLARK analyses if the CLARK report doesn't exist
            if not os.path.isfile(self.report):
                printtime('Performing CLARK analysis on {} files'.format(self.extension), self.start)
                if self.extension != 'fastq':
                    for sample in self.runmetadata.samples:
                        sample.general.combined = sample.general.bestassemblyfile
                    # Run the pipeline
                    self.main()
                else:
                    # Only perform FASTQ analyses if the sample is declared to be a metagenome
                    metagenome = False
                    for sample in self.runmetadata.samples:
                        try:
                            status = sample.run.Description
                        except KeyError:
                            status = 'unknown'
                        if status == 'metagenome':
                            metagenome = True
                    # If any of the samples are metagenomes, run the CLARK analysis on the raw files
                    if metagenome:
                        fileprep.Fileprep(self)
                        # Run the pipeline
                        self.main()
            # Clean up the files and create/delete attributes to be consistent with pipeline Metadata objects
            for sample in self.runmetadata.samples:
                if sample.general.bestassemblyfile != 'NA':
                    # Create a GenObject to store metadata when this script is run as part of the pipeline
                    clarkextension = 'clark{}'.format(self.extension)
                    setattr(sample, clarkextension, GenObject())
                    # Create a folder to store all the CLARK files
                    sample[clarkextension].outputpath = os.path.join(sample.general.outputdirectory, 'CLARK')
                    make_path(sample[clarkextension].outputpath)
                    # Move the files to the CLARK folder
                    try:
                        move(sample.general.abundance,
                             os.path.join(sample[clarkextension].outputpath,
                                          os.path.basename(sample.general.abundance)))
                        move(sample.general.classification,
                             os.path.join(sample[clarkextension].outputpath,
                                          os.path.basename(sample.general.classification)))
                    except (KeyError, FileNotFoundError):
                        pass
                    # Set the CLARK-specific attributes
                    try:
                        sample[clarkextension].abundance = sample.general.abundance
                        sample[clarkextension].classification = sample.general.classification
                        sample[clarkextension].combined = sample.general.combined
                    except KeyError:
                        pass
                    if self.extension == 'fastq':
                        # Remove the combined .fastq files
                        try:
                            if type(sample[clarkextension].combined) is list:
                                os.remove(sample[clarkextension].combined)
                        except (OSError, KeyError):
                            pass
                    # Remove the attributes from .general - an explicit loop, since a bare map() is
                    # lazily evaluated in Python 3 and would never run
                    for attribute in ['abundance', 'classification', 'combined']:
                        try:
                            delattr(sample.general, attribute)
                        except AttributeError:
                            pass
                    # Remove the text file lists of files and reports created by CLARK
                    for clarkfile in ['reportList.txt', 'sampleList.txt']:
                        try:
                            os.remove(os.path.join(self.path, clarkfile))
                        except OSError:
                            pass
        else:
            self.runmetadata = MetadataObject()
            self.report = os.path.join(self.reportpath, 'abundance.xlsx')
            # Create the objects
            self.objectprep()
            self.main()
    except AttributeError:
        self.runmetadata = MetadataObject()
        self.report = os.path.join(self.reportpath, 'abundance.xlsx')
        # Create the objects
        self.objectprep()
        self.main()
    # Optionally filter the .fastq reads based on taxonomic assignment
    if args.filter:
        filtermetagenome.PipelineInit(self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
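A note on the delattr/os.remove loops in the clean-up above: in Python 3, map() is lazy, so the original map(lambda ...) calls built iterators that were never consumed and therefore had no side effect. A self-contained demonstration of why the explicit loops are needed:

attrs = {'abundance': 1, 'classification': 2}
map(attrs.pop, ['abundance'])          # no effect: the map object is never iterated
assert 'abundance' in attrs
for key in ['abundance']:              # the explicit loop actually runs
    attrs.pop(key)
assert 'abundance' not in attrs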
parser.add_argument('-a', '--averagedepth',
                    default=10,
                    help='Supply an integer of the minimum mapping depth in order to return a positive result')
parser.add_argument('-C', '--copy',
                    action='store_true',
                    help='Normally, the program will create symbolic links of the files into the sequence '
                         'path; however, there are occasions when it is necessary to copy the files instead')
# Get the arguments into an object
arguments = parser.parse_args()
arguments.pipeline = False
# Initialise the metadata object the downstream code expects (setting .samples on a
# not-yet-existing arguments.runmetadata would raise an AttributeError)
arguments.runmetadata = MetadataObject()
arguments.analysistype = 'genesippr'
arguments.logfile = os.path.join(arguments.path, 'logfile')
# Define the start time
start = time.time()
# Run the script
GeneSippr(arguments, commit, start, homepath, arguments.analysistype, arguments.cutoff, arguments.pipeline,
          False)
# Print a bold, green exit statement
print('\033[92m' + '\033[1m' + "\nElapsed Time: %0.2f seconds" % (time.time() - start) + '\033[0m')
class RunAssemble(object): def main(self): """ Run the methods in the correct order """ # Start the assembly self.helper() # Create the quality object self.create_quality_object() # Run the quality analyses self.quality() # Perform assembly self.assemble() # Perform genus-agnostic typing self.agnostictyping() # Perform typing self.typing() # Create a report reporter.Reporter(self) # Compress or remove all large, temporary files created by the pipeline compress.Compress(self) metadataprinter.MetadataPrinter(inputobject=self) def helper(self): """Helper function for file creation (if desired), manipulation, quality assessment, and trimming as well as the assembly""" # Simple assembly without requiring accessory files (SampleSheet.csv, etc). if self.basicassembly: self.runmetadata = Basic(inputobject=self) else: # Populate the runmetadata object by parsing the SampleSheet.csv, GenerateFASTQRunStatistics.xml, and # RunInfo.xml files self.runinfo = os.path.join(self.path, 'RunInfo.xml') self.runmetadata = runMetadata.Metadata(passed=self) # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided self.runmetadata.parseruninfo() # Extract PhiX mapping information from the run phi = phix.PhiX(inputobject=self) phi.main() # Populate the lack of bclcall and nohup call into the metadata sheet for sample in self.runmetadata.samples: sample.commands = GenObject() sample.commands.nohupcall = 'NA' sample.commands.bclcall = 'NA' # Move/link the FASTQ files to strain-specific working directories fastqmover.FastqMover(inputobject=self) # Print the metadata to file metadataprinter.MetadataPrinter(inputobject=self) def create_quality_object(self): """ Create the quality object """ self.qualityobject = quality.Quality(inputobject=self) def quality(self): """ Creates quality objects and runs quality assessments and quality processes on the supplied sequences """ # Validate that the FASTQ files are in the proper format, and that there are no issues e.g. 
different numbers # of forward and reverse reads, read length longer than quality score length, proper extension self.fastq_validate() # Run FastQC on the unprocessed fastq files self.fastqc_raw() # Perform quality trimming and FastQC on the trimmed files self.quality_trim() # Run FastQC on the trimmed files self.fastqc_trimmed() # Perform error correcting on the reads self.error_correct() # Detect contamination in the reads self.contamination_detection() # Run FastQC on the processed fastq files self.fastqc_trimmedcorrected() # Exit if only pre-processing of data is requested metadataprinter.MetadataPrinter(inputobject=self) if self.preprocess: logging.info('Pre-processing complete') quit() def fastq_validate(self): """ Attempt to detect and fix issues with the FASTQ files """ self.qualityobject.validate_fastq() metadataprinter.MetadataPrinter(inputobject=self) def fastqc_raw(self): """ Run FastQC on the unprocessed FASTQ files """ self.qualityobject.fastqcthreader(level='Raw') metadataprinter.MetadataPrinter(inputobject=self) def quality_trim(self): """ Perform quality trimming and FastQC on the trimmed files """ self.qualityobject.trimquality() metadataprinter.MetadataPrinter(inputobject=self) def fastqc_trimmed(self): """ Run FastQC on the quality trimmed FASTQ files """ self.qualityobject.fastqcthreader(level='Trimmed') metadataprinter.MetadataPrinter(inputobject=self) def error_correct(self): """ Perform error correcting on the reads """ self.qualityobject.error_correction() metadataprinter.MetadataPrinter(inputobject=self) def contamination_detection(self): """ Calculate the levels of contamination in the reads """ self.qualityobject.contamination_finder(report_path=self.reportpath) metadataprinter.MetadataPrinter(inputobject=self) def fastqc_trimmedcorrected(self): """ Run FastQC on the processed fastq files """ self.qualityobject.fastqcthreader(level='trimmedcorrected') metadataprinter.MetadataPrinter(inputobject=self) def assemble(self): """ Assemble genomes and perform some basic quality analyses """ # Assemble genomes self.assemble_genomes() # Calculate assembly metrics on raw assemblies self.quality_features(analysis='raw') # Calculate the depth of coverage as well as other quality metrics using Qualimap self.qualimap() # Calculate assembly metrics on polished assemblies self.quality_features(analysis='polished') # ORF detection self.prodigal() # Assembly quality determination self.genome_qaml() # CLARK analyses self.clark() def assemble_genomes(self): """ Use skesa to assemble genomes """ assembly = skesa.Skesa(inputobject=self) assembly.main() metadataprinter.MetadataPrinter(inputobject=self) def qualimap(self): """ Calculate the depth of coverage as well as other quality metrics using Qualimap """ qual = depth.QualiMap(inputobject=self) qual.main() metadataprinter.MetadataPrinter(inputobject=self) def quality_features(self, analysis): """ Extract features from assemblies such as total genome size, longest contig, and N50 """ features = quality.QualityFeatures(inputobject=self, analysis=analysis) features.main() metadataprinter.MetadataPrinter(self) def prodigal(self): """ Use prodigal to detect open reading frames in the assemblies """ prodigal.Prodigal(self) metadataprinter.MetadataPrinter(self) def genome_qaml(self): """ Use GenomeQAML to determine the quality of the assemblies """ g_qaml = quality.GenomeQAML(inputobject=self) g_qaml.main() metadataprinter.MetadataPrinter(inputobject=self) def clark(self): """ Run CLARK metagenome analyses on the raw reads and assemblies 
    def clark(self):
        """
        Run CLARK metagenome analyses on the raw reads and assemblies if the
        system has adequate resources
        """
        # Run CLARK typing on the .fastq and .fasta files
        automateCLARK.PipelineInit(inputobject=self,
                                   extension='fasta',
                                   light=True)
        automateCLARK.PipelineInit(inputobject=self,
                                   extension='fastq',
                                   light=True)

    def agnostictyping(self):
        """
        Perform typing that does not require the genus of the organism to be known
        """
        # Run MASH
        self.mash()
        # Run rMLST
        self.rmlst()
        # Run the 16S analyses
        self.sixteens()
        # Calculate the presence/absence of GDCS
        self.run_gdcs()
        # Find genes of interest
        self.genesippr()
        # Resistance finding - raw reads
        self.ressippr()
        # Resistance finding - assemblies
        self.resfinder()
        # Run MOB-suite
        self.mob_suite()
        # Prophage detection
        self.prophages()
        # UniVec contamination search
        self.univec()
        # Virulence gene detection
        self.virulence()

    def mash(self):
        """
        Run MASH to determine the closest RefSeq genome
        """
        mash.Mash(inputobject=self, analysistype='mash')
        metadataprinter.MetadataPrinter(inputobject=self)

    def rmlst(self):
        """
        Run rMLST analyses
        """
        rmlst = MLSTSippr(args=self,
                          pipelinecommit=self.commit,
                          startingtime=self.starttime,
                          scriptpath=self.homepath,
                          analysistype='rMLST',
                          pipeline=True,
                          cutoff=1.0)
        rmlst.runner()
        metadataprinter.MetadataPrinter(inputobject=self)

    def sixteens(self):
        """
        Run the 16S analyses
        """
        SixteensFull(args=self,
                     pipelinecommit=self.commit,
                     startingtime=self.starttime,
                     scriptpath=self.homepath,
                     analysistype='sixteens_full',
                     cutoff=0.95)
        metadataprinter.MetadataPrinter(inputobject=self)

    def run_gdcs(self):
        """
        Determine the presence of genomically-dispersed conserved sequences (GDCS)
        for Escherichia, Listeria, and Salmonella strains
        """
        # Run the GDCS analysis
        GDCS(inputobject=self)
        metadataprinter.MetadataPrinter(inputobject=self)

    def genesippr(self):
        """
        Find genes of interest
        """
        GeneSippr(args=self,
                  pipelinecommit=self.commit,
                  startingtime=self.starttime,
                  scriptpath=self.homepath,
                  analysistype='genesippr',
                  cutoff=0.95,
                  pipeline=False,
                  revbait=False)
        metadataprinter.MetadataPrinter(inputobject=self)

    def mob_suite(self):
        """
        Run MOB-suite to reconstruct and type plasmids from the assemblies
        """
        mob = MobRecon(metadata=self.runmetadata.samples,
                       analysistype='mobrecon',
                       databasepath=self.reffilepath,
                       threads=self.cpus,
                       logfile=self.logfile,
                       reportpath=self.reportpath)
        mob.mob_recon()
        metadataprinter.MetadataPrinter(inputobject=self)

    def ressippr(self):
        """
        Resistance finding - raw reads
        """
        res = Resistance(args=self,
                         pipelinecommit=self.commit,
                         startingtime=self.starttime,
                         scriptpath=self.homepath,
                         analysistype='resfinder',
                         cutoff=0.7,
                         pipeline=False,
                         revbait=True)
        res.main()
        metadataprinter.MetadataPrinter(inputobject=self)

    def resfinder(self):
        """
        Resistance finding - assemblies
        """
        resfinder = BLAST(args=self, analysistype='resfinder_assembled')
        resfinder.seekr()
        metadataprinter.MetadataPrinter(inputobject=self)

    def prophages(self, cutoff=90):
        """
        Prophage detection
        :param cutoff: cutoff value to be used in the analyses
        """
        prophages = Prophages(args=self,
                              analysistype='prophages',
                              cutoff=cutoff,
                              unique=True)
        prophages.seekr()
        metadataprinter.MetadataPrinter(inputobject=self)

    def univec(self):
        """
        UniVec contamination search
        """
        univec = Univec(args=self,
                        analysistype='univec',
                        cutoff=80,
                        unique=True)
        univec.seekr()
        metadataprinter.MetadataPrinter(inputobject=self)
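    # Illustrative aside: note the two cutoff conventions above. The
    # read-based sippr analyses (rmlst, sixteens, genesippr, ressippr)
    # express their cutoffs as fractions (0.7-1.0), while the BLAST-based
    # seekr analyses (resfinder, prophages, univec) use percent identity
    # (80-90). Below is a hypothetical helper that normalises either
    # convention to a fraction; it is not part of the original pipeline:
    @staticmethod
    def _cutoff_as_fraction_sketch(cutoff):
        """Sketch: accept a cutoff as either a fraction (0.7) or a percentage (70)"""
        return cutoff / 100 if cutoff > 1 else cutoff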
    def virulence(self):
        """
        Virulence gene detection
        """
        vir = Virulence(args=self,
                        pipelinecommit=self.commit,
                        startingtime=self.starttime,
                        scriptpath=self.homepath,
                        analysistype='virulence',
                        cutoff=0.95,
                        pipeline=False,
                        revbait=True)
        if not os.path.isfile(os.path.join(self.reportpath, 'virulence.csv')):
            vir.reporter()
        metadataprinter.MetadataPrinter(inputobject=self)

    def typing(self):
        """
        Perform analyses that use genus-specific databases
        """
        # Run the modules, and print the metadata to file after each
        # MLST
        self.mlst()
        # Serotyping
        self.serosippr()
        # Assembly-based vtyper
        self.legacy_vtyper()
        # Core genome calculation
        self.coregenome()
        # SISTR
        self.sistr()

    def mlst(self):
        """
        MLST analyses
        """
        mlst = MLSTSippr(args=self,
                         pipelinecommit=self.commit,
                         startingtime=self.starttime,
                         scriptpath=self.homepath,
                         analysistype='MLST',
                         cutoff=1.0,
                         pipeline=True)
        mlst.runner()
        metadataprinter.MetadataPrinter(inputobject=self)

    def serosippr(self):
        """
        Serotyping analyses
        """
        Serotype(args=self,
                 pipelinecommit=self.commit,
                 startingtime=self.starttime,
                 scriptpath=self.homepath,
                 analysistype='serosippr',
                 cutoff=0.90,
                 pipeline=True)
        metadataprinter.MetadataPrinter(inputobject=self)

    def legacy_vtyper(self):
        """
        Legacy vtyper - uses ePCR
        """
        legacy_vtyper = LegacyVtyper(inputobject=self,
                                     analysistype='legacy_vtyper',
                                     mismatches=2)
        legacy_vtyper.vtyper()
        metadataprinter.MetadataPrinter(inputobject=self)

    def coregenome(self):
        """
        Core genome calculation
        """
        coregen = core.CoreGenome(args=self,
                                  analysistype='coregenome',
                                  genus_specific=True)
        coregen.seekr()
        core.AnnotatedCore(inputobject=self)
        metadataprinter.MetadataPrinter(inputobject=self)

    def sistr(self):
        """
        Run SISTR serovar typing on Salmonella samples
        """
        sistr.Sistr(inputobject=self, analysistype='sistr')
        metadataprinter.MetadataPrinter(inputobject=self)

    def __init__(self, args):
        """
        Initialises the variables required for this class
        :param args: list of arguments passed to the script
        """
        SetupLogging()
        logging.info('Welcome to the CFIA de novo bacterial assembly pipeline {}'.format(__version__))
        # Define variables from the arguments - there may be a more streamlined way to do this
        self.args = args
        self.path = os.path.join(args.sequencepath)
        self.reffilepath = os.path.join(args.referencefilepath)
        self.numreads = args.numreads
        self.preprocess = args.preprocess
        # Define the start time
        self.starttime = args.startingtime
        self.customsamplesheet = args.customsamplesheet
        if self.customsamplesheet:
            assert os.path.isfile(self.customsamplesheet), \
                'Cannot find custom sample sheet as specified {}'.format(self.customsamplesheet)
        self.basicassembly = args.basicassembly
        if not self.customsamplesheet and not os.path.isfile(os.path.join(self.path, 'SampleSheet.csv')):
            self.basicassembly = True
            logging.warning('Could not find a sample sheet. Performing basic assembly (no run metadata captured)')
        # Use the argument for the number of threads, or default to the number of CPUs in the system minus one
        self.cpus = args.threads if args.threads else multiprocessing.cpu_count() - 1
        # Assertions to ensure that the provided variables are valid
        make_path(self.path)
        assert os.path.isdir(self.path), \
            'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
        self.reportpath = os.path.join(self.path, 'reports')
        assert os.path.isdir(self.reffilepath), \
            'Reference file path is not a valid directory {0!r:s}'.format(self.reffilepath)
        self.commit = __version__
        self.homepath = args.homepath
        self.logfile = os.path.join(self.path, 'logfile')
        self.runinfo = str()
        self.pipeline = True
        self.qualityobject = MetadataObject()
        # Initialise the metadata object
        self.runmetadata = MetadataObject()
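# The attributes consumed by __init__ above imply the command-line interface
# sketched below. This is a hedged reconstruction: the flag names are
# assumptions inferred from the attribute names, not the pipeline's actual
# argument parser.
def _example_argument_parser():
    import argparse
    import time
    parser = argparse.ArgumentParser(description='CFIA de novo bacterial assembly pipeline (illustrative parser)')
    parser.add_argument('-s', '--sequencepath', required=True,
                        help='Path to the folder containing the FASTQ files')
    parser.add_argument('-r', '--referencefilepath', required=True,
                        help='Path to the reference database folder')
    parser.add_argument('-n', '--numreads', type=int, default=2,
                        help='Number of reads per sample (1 or 2)')
    parser.add_argument('-t', '--threads', type=int,
                        help='Number of threads; defaults to the CPU count minus one')
    parser.add_argument('-c', '--customsamplesheet',
                        help='Path to a custom SampleSheet.csv')
    parser.add_argument('-b', '--basicassembly', action='store_true',
                        help='Assemble without capturing run metadata')
    parser.add_argument('-p', '--preprocess', action='store_true',
                        help='Stop after read pre-processing')
    args = parser.parse_args()
    # __init__ also expects attributes supplied by the caller rather than by argparse
    args.startingtime = time.time()
    args.homepath = os.path.split(os.path.abspath(__file__))[0]
    return args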
    def parsesamplesheet(self):
        """
        Parse the sample sheet (SampleSheet.csv) to determine certain values
        important for the creation of the assembly report
        """
        # Open the sample sheet
        with open(self.samplesheet, "r") as samplesheet:
            # Iterate through the sample sheet
            samples, prev, header = False, 0, []
            for count, line in enumerate(samplesheet):
                # Remove newlines, and split on commas
                data = line.rstrip().split(",")
                if any(data):
                    if "[Settings]" in line:
                        samples = False
                    if not line.startswith("[") and not samples and data != ['']:
                        # Grab any data not in the [Data] section
                        setattr(self.header, data[0].replace(" ", ""), "".join(data[1:]))
                    elif "[Data]" in line or "[Reads]" in line:
                        samples = True
                    elif samples and "Sample_ID" in line:
                        header.extend([x.replace("_", "").replace(' ', "") for x in data])
                        prev = count
                    elif header:
                        # Try to replicate the Illumina rules for creating file names from
                        # "Sample_Name" (a hypothetical sketch of these rules follows this method)
                        samplename = samplenamer(data)
                        # Create an object for storing nested static variables
                        strainmetadata = MetadataObject()
                        # Set the sample name in the object
                        strainmetadata.name = samplename
                        # Add the header object to strainmetadata
                        strainmetadata.run = GenObject(copy.copy(self.header.datastore))
                        # Alias the run object, so it is easier to populate
                        # (e.g. run.SampleName = ... instead of strainmetadata.run.SampleName = ...)
                        run = strainmetadata.run
                        # Capture Sample_ID, Sample_Name, I7_Index_ID, index1, I5_Index_ID,
                        # index2, and Sample_Project
                        for idx, item in enumerate(data):
                            setattr(run, header[idx], item if item else "NA")
                        # Add the sample number
                        run.SampleNumber = count - prev
                        # Create the 'General' category for strainmetadata, and add the
                        # output directory and the pipeline commit to it
                        strainmetadata.general = GenObject({
                            'outputdirectory': os.path.join(self.path, samplename),
                            'pipelinecommit': self.commit
                        })
                        strainmetadata.general.logout = os.path.join(
                            self.path, samplename, '{}_log_out.txt'.format(samplename))
                        strainmetadata.general.logerr = os.path.join(
                            self.path, samplename, '{}_log_err.txt'.format(samplename))
                        # Append the strainmetadata object to the list of samples
                        self.samples.append(strainmetadata)
                    elif samples:
                        # The [Reads] section lists the forward read length first,
                        # and the reverse read length second
                        if 'forwardlength' not in self.header.datastore:
                            setattr(self.header, 'forwardlength', data[0])
                        else:
                            setattr(self.header, 'reverselength', data[0])
                        self.totalreads += int(data[0])
        self.date = self.header.Date if "Date" in self.header.datastore else self.date
        for sample in self.samples:
            if 'InvestigatorName' not in sample.run.datastore:
                sample.run.InvestigatorName = 'NA'
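# parsesamplesheet relies on a samplenamer helper (imported elsewhere) to
# replicate the file names that Illumina's bcl2fastq derives from the
# Sample_Name column of the sample sheet. Below is a minimal, hypothetical
# sketch of that sanitisation, shown only to illustrate the idea; the real
# rules are more involved, and this function is not part of the pipeline:
def _samplenamer_sketch(data):
    """Sketch: derive a file-system-safe sample name from a sample sheet row"""
    import re
    # The first column of a [Data] row is assumed to hold the sample name
    name = data[0]
    # Replace anything outside letters, digits, and hyphens with underscores
    return re.sub(r'[^A-Za-z0-9-]', '_', name)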