def quality_features(self, analysis):
    """
    Extract features from assemblies such as total genome size, longest
    contig, and N50, then write the updated metadata to file
    """
    quality.QualityFeatures(self, analysis).main()
    # Persist the metadata gathered by the feature extraction
    metadataprinter.MetadataPrinter(self)
def helper(self):
    """
    Helper function for file creation (if desired), manipulation, quality
    assessment, and trimming as well as the assembly
    """
    if self.basicassembly:
        # Simple assembly without requiring accessory files (SampleSheet.csv, etc)
        self.runmetadata = Basic(self)
    else:
        # Populate the runmetadata object by parsing the SampleSheet.csv,
        # GenerateFASTQRunStatistics.xml, and RunInfo.xml files
        self.runinfo = os.path.join(self.path, 'RunInfo.xml')
        self.runmetadata = runMetadata.Metadata(self)
        # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
        self.runmetadata.parseruninfo()
        # Extract PhiX mapping information from the run
        phix.PhiX(self).main()
        # Record the absence of bcl2fastq/nohup calls in each sample's metadata
        for sample in self.runmetadata.samples:
            sample.commands = GenObject()
            sample.commands.nohupcall = 'NA'
            sample.commands.bclcall = 'NA'
        # Move/link the FASTQ files to strain-specific working directories
        fastqmover.FastqMover(self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def qualimap(self):
    """
    Calculate the depth of coverage as well as other quality metrics using
    Qualimap, then write the updated metadata to file
    """
    depth.QualiMap(self).main()
    metadataprinter.MetadataPrinter(self)
def assemble_genomes(self):
    """
    Use skesa to assemble genomes, then write the updated metadata to file
    """
    skesa.Skesa(self).main()
    metadataprinter.MetadataPrinter(self)
def quality(self):
    """
    Creates quality objects and runs quality assessments and quality
    processes on the supplied sequences.

    Exits the pipeline (SystemExit) after the pre-processing steps when
    self.preprocess is set.
    """
    # Validate that the FASTQ files are in the proper format, and that there are no issues e.g. different
    # numbers of forward and reverse reads, read length longer than quality score length, proper extension
    self.fastq_validate()
    # Run FastQC on the unprocessed fastq files
    self.fastqc_raw()
    # Perform quality trimming and FastQC on the trimmed files
    self.quality_trim()
    # Run FastQC on the trimmed files
    self.fastqc_trimmed()
    # Perform error correcting on the reads
    self.error_correct()
    # Detect contamination in the reads
    self.contamination_detection()
    # Run FastQC on the processed fastq files
    self.fastqc_trimmedcorrected()
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
    # Exit if only pre-processing of data is requested
    if self.preprocess:
        printtime('Pre-processing complete', self.starttime)
        # quit() is an interactive helper injected by the site module and may be absent
        # (e.g. python -S, frozen executables); raise SystemExit to terminate reliably
        raise SystemExit
def genome_qaml(self):
    """
    Use GenomeQAML to determine the quality of the assemblies, then write
    the updated metadata to file
    """
    quality.GenomeQAML(self).main()
    metadataprinter.MetadataPrinter(self)
def sixteens(self):
    """
    Run the 16S analyses
    """
    # Positional arguments appear to be: pipeline object, commit, start time, home path,
    # analysis type name, and cutoff (0.95) - confirm against the SixteensFull signature
    SixteensFull(self, self.commit, self.starttime, self.homepath, 'sixteens_full', 0.95)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def mlst(self):
    """
    MLST analyses
    """
    # Positional arguments appear to be: pipeline object, commit, start time, home path,
    # analysis type name, cutoff (1.0), and a boolean flag - confirm against MLSTSippr
    MLSTSippr(self, self.commit, self.starttime, self.homepath, 'MLST', 1.0, True)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def vtyper(self):
    """
    Virulence typing with the primer finder, followed by a metadata dump
    """
    vtyper.PrimerFinder(self, 'vtyper').main()
    metadataprinter.MetadataPrinter(self)
def univec(self):
    """
    Univec contamination search, followed by a metadata dump
    """
    Univec(univec.PipelineInit(self, 'univec', False, 80, True))
    metadataprinter.MetadataPrinter(self)
def serosippr(self):
    """
    Serotyping analyses
    """
    # Positional arguments appear to be: pipeline object, commit, start time, home path,
    # analysis type name, cutoff (0.95), and a boolean flag - confirm against Serotype
    Serotype(self, self.commit, self.starttime, self.homepath, 'serosippr', 0.95, True)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def plasmids(self):
    """
    Plasmid finding
    """
    # Positional arguments appear to be: pipeline object, commit, start time, home path,
    # analysis type name, cutoff (0.8), and two boolean flags - confirm against Plasmids
    Plasmids(self, self.commit, self.starttime, self.homepath, 'plasmidfinder', 0.8, False, True)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def plasmid_extractor(self):
    """
    Extracts and types plasmid sequences, then writes the metadata to file
    """
    PlasmidExtractor(self).main()
    metadataprinter.MetadataPrinter(self)
def genesippr(self):
    """
    Find genes of interest
    """
    # Positional arguments appear to be: pipeline object, commit, start time, home path,
    # analysis type name, cutoff (0.95), and two boolean flags - confirm against GeneSippr
    GeneSippr(self, self.commit, self.starttime, self.homepath, 'genesippr', 0.95, False, False)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def ressippr(self):
    """
    Resistance finding on the raw reads, followed by a metadata dump
    """
    Resistance(self, self.commit, self.starttime, self.homepath, 'resfinder', 0.8, False, True).main()
    metadataprinter.MetadataPrinter(self)
def run_gdcs(self): """ Determine the presence of genomically-dispersed conserved sequences for Escherichia, Listeria, and Salmonella strains """ # Run the GDCS analysis GDCS(self) metadataprinter.MetadataPrinter(self)
def virulence(self):
    """
    Virulence gene detection, followed by a metadata dump
    """
    # Run the analysis and generate its report in one chained call
    Virulence(self, self.commit, self.starttime, self.homepath, 'virulence', 0.95, False, True).reporter()
    metadataprinter.MetadataPrinter(self)
def prophages(self, cutoff=90):
    """
    Prophage detection
    :param cutoff: cutoff value to be used in the analyses (default 90)
    """
    Prophages(GeneSeekrMethod.PipelineInit(self, 'prophages', False, cutoff, True))
    metadataprinter.MetadataPrinter(self)
def coregenome(self):
    """
    Core genome calculation, followed by a metadata dump
    """
    # Core genome from the GeneSeekr pipeline initialiser, then the annotated core
    core.CoreGenome(GeneSeekrMethod.PipelineInit(self, 'coregenome', True, 70, False))
    core.AnnotatedCore(self)
    metadataprinter.MetadataPrinter(self)
def clark(self):
    """
    Run CLARK metagenome analyses on the raw reads and assemblies if the
    system has adequate resources
    """
    # If the total amount of physical memory is at least 100GB (this could
    # probably be lowered), run CLARK with its full database; otherwise use
    # the light variant
    sufficient_memory = virtual_memory().total >= 100000000000
    # Run CLARK typing on the .fasta and .fastq files
    if sufficient_memory:
        automateCLARK.PipelineInit(self)
        automateCLARK.PipelineInit(self, 'fastq')
    else:
        automateCLARK.PipelineInit(self, light=True)
        automateCLARK.PipelineInit(self, 'fastq', light=True)
    metadataprinter.MetadataPrinter(self)
def __init__(self):
    """
    Parse command-line arguments, set up paths and metadata, run the ePCR
    analyses, create a report, and print the metadata to file
    """
    from argparse import ArgumentParser
    from time import time
    # Parser for arguments
    parser = ArgumentParser(
        description='Performs ePCR using a supplied primer file. The primers must be in the format: '
                    '<name>\t<forward primer>\t<reverse primer>\t<max size allowed between primers>\n.'
                    'Sequence files must be stored in <path>/sequences')
    parser.add_argument('path',
                        help='Specify path in which reports are to be stored')
    parser.add_argument('-s', '--sequencepath',
                        required=True,
                        help='Path to assembly files')
    parser.add_argument('-f', '--primerfile',
                        required=True,
                        help='The name and path of the file containing the primers')
    args = parser.parse_args()
    self.starttime = time()
    # Add trailing slashes to the path variables to ensure consistent formatting (os.path.join)
    self.path = os.path.join(args.path, '')
    self.sequencepath = os.path.join(args.sequencepath, '')
    self.primerfile = args.primerfile
    # Initialise variables
    self.runmetadata = MetadataObject()
    self.reffilepath = False
    self.analysistype = 'ePCR'
    self.reportpath = os.path.join(self.path, 'reports')
    make_path(self.reportpath)
    # Initialise metadata
    self.runmetadata.samples = self.setup()
    self.logfile = os.path.join(self.path, 'vtyper_logfile.txt')
    # Run the analyses
    Vtyper(self, self.analysistype)
    # Create a report
    self.reporter()
    # Print the metadata to file
    printtime('Printing metadata to file', self.starttime)
    metadataprinter.MetadataPrinter(self)
    # Print a bold, green exit statement
    print(u'\033[92m' + u'\033[1m' + u'\nElapsed Time: %0.2f seconds' % (time() - self.starttime) + u'\033[0m')
def main(self): """ Run the methods in the correct order """ # Start the assembly self.helper() # Create the quality object self.create_quality_object() # Run the quality analyses self.quality() # Perform assembly self.assemble() # Perform genus-agnostic typing self.agnostictyping() # Perform typing self.typing() # Create a report reporter.Reporter(self) # Compress or remove all large, temporary files created by the pipeline compress.Compress(self) metadataprinter.MetadataPrinter(self)
def __init__(self, inputobject):
    """
    Initialise class variables from the supplied input object, create the
    required output folders, and launch the annotation threads
    """
    from queue import Queue
    # Attributes copied from the calling object
    self.path = inputobject.path
    self.sequencepath = inputobject.databasesequencepath
    self.start = inputobject.start
    self.cpus = inputobject.cpus
    self.genus = inputobject.genus
    self.species = inputobject.species
    self.runmetadata = MetadataObject()
    self.dockerimage = inputobject.dockerimage
    # Set and create necessary folders
    self.coregenelocation = os.path.join(self.path, 'coregenes', self.genus)
    self.profilelocation = os.path.join(self.path, 'profile', self.genus)
    make_path(self.profilelocation)
    make_path(self.coregenelocation)
    # Lookup tables populated by the analyses
    self.genes = {}
    self.genenames = {}
    self.genesequence = {}
    self.cdsset = {}
    self.coresequence = {}
    self.geneset = set()
    self.corealleles = {}
    self.coreset = set()
    self.profiles = {}
    # Queues used by the threaded workers
    self.queue = Queue()
    self.corequeue = Queue()
    self.codingqueue = Queue()
    self.headerqueue = Queue()
    self.logfile = inputobject.logfile
    # Run the analyses
    self.annotatethreads()
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def validate_fastq(self):
    """
    Runs reformat.sh on the FASTQ files. If a CalledProcessError arises, do not proceed with the assembly of
    these files. Samples that pass validation (or are successfully repaired) replace self.metadata at the end
    of the method; failed samples have an error recorded via self.error
    """
    printtime('Validating FASTQ files', self.start)
    # Collects the sample objects that pass (or are repaired past) validation
    validated_reads = list()
    for sample in self.metadata:
        # Tiny files can pass the validation tests - ensure that they don't
        size = os.path.getsize(sample.general.fastqfiles[0])
        if size >= 1000000:
            # Try to run reformat.sh on the reads - on any errors try to run repair.sh
            try:
                out, err, cmd = bbtools.validate_reads(forward_in=sample.general.fastqfiles[0],
                                                       returncmd=True)
                write_to_logfile(out, err, self.logfile, sample.general.logout, sample.general.logerr,
                                 None, None)
                # Add the sample to the list of samples with FASTQ files that pass this validation step
                validated_reads.append(sample)
            except CalledProcessError:
                # Set the file names for the reformatted and repaired files
                outputfile1 = os.path.join(sample.general.outputdirectory,
                                           '{}_reformatted_R1.fastq.gz'.format(sample.name))
                repair_file1 = os.path.join(sample.general.outputdirectory,
                                            '{}_repaired_R1.fastq.gz'.format(sample.name))
                if len(sample.general.fastqfiles) == 2:
                    outputfile2 = os.path.join(sample.general.outputdirectory,
                                               '{}_reformatted_R2.fastq.gz'.format(sample.name))
                    repair_file2 = os.path.join(sample.general.outputdirectory,
                                                '{}_repaired_R2.fastq.gz'.format(sample.name))
                else:
                    # Single-end reads: no reverse output files required
                    outputfile2 = str()
                    repair_file2 = str()
                # Try to use reformat.sh to repair the reads - if this fails, discard the sample from the analyses
                try:
                    printtime('Errors detected in FASTQ files for sample {sample}. Please check the following files'
                              ' for details {log} {logout} {logerr}. '
                              'Using reformat.sh to attempt to repair issues'
                              .format(sample=sample.name,
                                      log=self.logfile,
                                      logout=sample.general.logout,
                                      logerr=sample.general.logerr),
                              self.start)
                    if not os.path.isfile(outputfile1):
                        # Run reformat.sh
                        out, err, cmd = bbtools.reformat_reads(forward_in=sample.general.fastqfiles[0],
                                                               forward_out=outputfile1,
                                                               returncmd=True)
                        write_to_logfile(out, err, self.logfile, sample.general.logout, sample.general.logerr,
                                         None, None)
                        # Run repair.sh (if necessary)
                        if outputfile2:
                            out, err, cmd = bbtools.repair_reads(forward_in=outputfile1,
                                                                 forward_out=repair_file1,
                                                                 returncmd=True)
                            write_to_logfile(out, err, self.logfile, sample.general.logout, sample.general.logerr,
                                             None, None)
                    # Ensure that the output file(s) exist before declaring this a success
                    if os.path.isfile(outputfile1):
                        # Update the fastqfiles attribute to point to the repaired files
                        sample.general.fastqfiles = [repair_file1, repair_file2] if repair_file2 else [outputfile1]
                        # Add the sample object to the list of samples passing the FASTQ validation step
                        validated_reads.append(sample)
                except CalledProcessError:
                    # The file(s) can be created even if there is STDERR from reformat.sh
                    if os.path.isfile(outputfile1) and outputfile2:
                        try:
                            out, err, cmd = bbtools.repair_reads(forward_in=outputfile1,
                                                                 forward_out=repair_file1,
                                                                 returncmd=True)
                            write_to_logfile(out, err, self.logfile, sample.general.logout, sample.general.logerr,
                                             None, None)
                            # Update the fastqfiles attribute to point to the repaired files
                            sample.general.fastqfiles = [repair_file1, repair_file2] if repair_file2 else \
                                [repair_file1]
                            # Add the sample object to the list of samples passing the FASTQ validation step
                            validated_reads.append(sample)
                        except CalledProcessError:
                            # Write in the logs that there was an error detected in the FASTQ files
                            write_to_logfile('An error was detected in the FASTQ files for sample {}. '
                                             'These files will not be processed further'.format(sample.name),
                                             'An error was detected in the FASTQ files for sample {}. '
                                             'These files will not be processed further'.format(sample.name),
                                             self.logfile, sample.general.logout, sample.general.logerr,
                                             None, None)
                            # Update metadata objects with error
                            self.error(sample, 'fastq_error')
                    else:
                        # Write in the logs that there was an error detected in the FASTQ files
                        write_to_logfile('An error was detected in the FASTQ files for sample {}. '
                                         'These files will not be processed further'.format(sample.name),
                                         'An error was detected in the FASTQ files for sample {}. '
                                         'These files will not be processed further'.format(sample.name),
                                         self.logfile, sample.general.logout, sample.general.logerr,
                                         None, None)
                        # Update metadata objects with error
                        self.error(sample, 'fastq_error')
        else:
            # Update metadata objects with error
            self.error(sample, 'files_too_small')
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
    # Overwrite self.metadata with objects that do not fail the validation
    self.metadata = validated_reads
def fastqc_trimmedcorrected(self):
    """
    Run FastQC on the processed fastq files
    """
    # 'trimmedcorrected' selects the trimmed, error-corrected read set
    self.qualityobject.fastqcthreader('trimmedcorrected')
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def prodigal(self):
    """
    Use prodigal to detect open reading frames in the assemblies
    """
    prodigal.Prodigal(self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def sistr(self):
    """
    Run SISTR (Salmonella in-silico typing) analyses
    """
    sistr.Sistr(self, 'sistr')
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def mash(self):
    """
    Run mash to determine closest refseq genome
    """
    mash.Mash(self, 'mash')
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def contamination_detection(self):
    """
    Calculate the levels of contamination in the reads
    """
    self.qualityobject.contamination_finder()
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def resfinder(self):
    """
    Resistance finding on the assemblies
    """
    ResFinder(self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)