def query_prep(self):
     """
     Build one metadata object per FASTA file found in self.query_path

     Files are processed in sorted order, and the file named 'combinedtargets.fasta' is skipped. For every other
     file, a sample-specific output directory is created, any BLAST report left in that directory is deleted, the
     FASTA file is linked into the directory, and the metadata object is appended to self.runmetadata.samples
     """
     logging.info('Preparing query files')
     for fasta in sorted(glob(os.path.join(self.query_path, '*.fasta'))):
         name = os.path.splitext(os.path.basename(fasta))[0]
         # The combined targets file is not treated as a sample
         if name == 'combinedtargets':
             continue
         # Create the metadata object and populate it with the required attributes
         metadata = MetadataObject()
         metadata.samples = list()
         metadata.name = name
         metadata.general = GenObject()
         metadata.commands = GenObject()
         metadata.alleles = GenObject()
         # Every sample gets its own folder within the query path
         output_dir = os.path.join(self.query_path, metadata.name)
         metadata.alleles.outputdirectory = output_dir
         # The BLAST output file is named after the sample
         report = os.path.join(output_dir, '{seq_id}.tsv'.format(seq_id=metadata.name))
         metadata.alleles.blast_report = report
         # Discard the report from any previous run; a missing report is fine
         try:
             os.remove(report)
         except FileNotFoundError:
             pass
         make_path(output_dir)
         metadata.general.bestassemblyfile = relative_symlink(src_file=fasta,
                                                              output_dir=output_dir,
                                                              export_output=True)
         # The object is stored in its own samples list as well as in the run-level list
         metadata.samples.append(metadata)
         self.runmetadata.samples.append(metadata)
 def helper(self):
     """
     Populate self.runmetadata and print the metadata to file

     When self.basicassembly is set, the metadata come from Basic() alone. Otherwise, they are built from the
     run files (RunInfo.xml etc.), the PhiX analysis is run, placeholder commands are added to every sample, and
     FastqMover is invoked
     """
     if not self.basicassembly:
         # Populate the runmetadata object by parsing the SampleSheet.csv, GenerateFASTQRunStatistics.xml, and
         # RunInfo.xml files
         self.runinfo = os.path.join(self.path, 'RunInfo.xml')
         self.runmetadata = runMetadata.Metadata(passed=self)
         # Extract the flowcell ID and the instrument name if the RunInfo.xml file was provided
         self.runmetadata.parseruninfo()
         # Extract PhiX mapping information from the run
         phix.PhiX(inputobject=self).main()
         # No bcl or nohup calls are made here, so record 'NA' for both in the metadata
         for strain in self.runmetadata.samples:
             strain.commands = GenObject()
             strain.commands.nohupcall = 'NA'
             strain.commands.bclcall = 'NA'
         # Move/link the FASTQ files to strain-specific working directories
         fastqmover.FastqMover(inputobject=self)
     else:
         # Simple assembly without requiring accessory files (SampleSheet.csv, etc).
         self.runmetadata = Basic(inputobject=self)
     # Print the metadata to file
     metadataprinter.MetadataPrinter(inputobject=self)
예제 #3
0
 def typing_reports(self):
     """
     Add placeholder attributes for analyses that were not performed, then create the run reports

     For every sample, the set of genes whose genesippr BLAST percent identity exceeds 95 is stored (sorted) in
     sample.genesippr.report_output; the list is empty if the BLAST results cannot be read
     :return:
     """
     for strain in self.runmetadata.samples:
         # Empty objects for the analyses that were skipped
         for analysis in ('confindr', 'mapping', 'quast', 'qualimap', 'verotoxin'):
             setattr(strain, analysis, GenObject())
         # Existing sistr results are left untouched
         if not GenObject.isattr(strain, 'sistr'):
             strain.sistr = GenObject()
         strain.mapping.MeanInsertSize = 0
         strain.mapping.MeanCoveragedata = 0
         strain.genesippr.report_output = set()
         strain.genesippr.results = {}
         strain.verotoxin.verotoxin_subtypes_set = strain.legacy_vtyper.toxinprofile
         try:
             for allele, identity in strain.genesippr.blastresults.items():
                 if identity > 95:
                     # Only the text before the first underscore is reported
                     strain.genesippr.report_output.add(allele.split('_')[0])
         except AttributeError:
             strain.genesippr.report_output = []
         strain.genesippr.report_output = sorted(strain.genesippr.report_output)
     # Create the standard and legacy reports
     summary = reporter.Reporter(self)
     summary.metadata_reporter()
     summary.legacy_reporter()
 def run_blast(self):
     """
     BLAST the alleles against the genomes

     A metadata object is created for every entry in self.query_files and appended to
     self.runmetadata.samples; the path of its BLAST report is appended to self.blast_reports. BLAST is only
     run when the report does not already exist. If BLAST raises an ApplicationError, the analysis is repeated
     once with a single thread, using the same program (tBLASTn when self.amino_acid is set, BLASTn otherwise)
     as the first attempt
     """
     logging.info('BLASTing alleles against sequence files')

     def blast_command(db, report, threads):
         """Create the BLAST command line matching the molecule type of the targets"""
         shared = dict(db=db,
                       query=self.target_file,
                       num_alignments=100000000,
                       evalue=0.001,
                       num_threads=threads,
                       outfmt=self.outfmt,
                       out=report)
         # tBLASTn for aa targets against translated nt; BLASTn for nt targets
         if self.amino_acid:
             return NcbitblastnCommandline(task='tblastn', word_size=3, **shared)
         return NcbiblastnCommandline(task='blastn', **shared)

     for query_file in self.query_files:
         # Create a metadata object to store all the sample-specific information
         sample = MetadataObject()
         sample.alleles = GenObject()
         local_db = os.path.splitext(query_file)[0]
         sample.name = os.path.basename(local_db)
         # Set the name of the BLAST output file
         report = os.path.join(self.reportpath, '{seq_id}.tsv'.format(seq_id=sample.name))
         sample.alleles.blast_report = report
         # Update the list of metadata objects with this sample
         self.runmetadata.samples.append(sample)
         self.blast_reports.append(report)
         # Reports from previous runs are reused
         if os.path.isfile(report):
             continue
         try:
             blast_command(db=local_db, report=report, threads=self.cpus)()
         # BLAST can have issues with genomes that have very large contigs. Retry the analysis using only one
         # thread
         except ApplicationError:
             # The failed attempt may or may not have created a (partial) report
             try:
                 os.remove(report)
             except FileNotFoundError:
                 pass
             blast_command(db=local_db, report=report, threads=1)()
예제 #5
0
def test_sistr_seqsero():
    """
    Run the sistr and seqsero methods on a single sample built around NC_003198.fasta, and check the cgMLST
    genome match and the predicted serotype
    """
    # Start from an empty sample list
    method.runmetadata.samples = []
    fasta = os.path.join(var.sequencepath, 'NC_003198.fasta')
    metadata = MetadataObject()
    # The sample name is the file name up to the first period
    metadata.name = os.path.basename(fasta).split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = []
    seqsero_reads = os.path.join(var.sequencepath, 'seqsero', '2014-SEQ-1049_seqsero.fastq.gz')
    metadata.general.trimmedcorrectedfastqfiles = [seqsero_reads]
    # Create the destination folder, and record it in the metadata
    sample_dir = os.path.join(var.sequencepath, metadata.name)
    make_path(sample_dir)
    metadata.general.outputdirectory = sample_dir
    metadata.general.logout = os.path.join(sample_dir, 'out')
    metadata.general.logerr = os.path.join(sample_dir, 'err')
    metadata.run.outputdirectory = sample_dir
    metadata.general.bestassemblyfile = True
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    # Either of two genome matches is accepted
    accepted_matches = ('ERR586739', 'SAL_BA2732AA')
    for strain in method.runmetadata.samples:
        assert strain.sistr.cgmlst_genome_match in accepted_matches
    method.seqsero()
    for strain in method.runmetadata.samples:
        assert strain.seqsero.predicted_serotype == '- 9:f,g,t:-'
    variable_update()
예제 #6
0
    def sketch_reads(self):
        """
        Queue a mash sketch job for every sample, and wait until the queue has been processed

        self.cpus daemon threads running self.sketch are started first. Each sample in self.metadata then
        receives a 'paratyper' GenObject holding the sketch file names, its sketch command is stored in
        sample.commands.sketch, and the sample is placed on self.sketchqueue
        """
        # Create the threads for the analysis
        for _ in range(self.cpus):
            worker = Thread(target=self.sketch, args=())
            # Thread.setDaemon() is deprecated since Python 3.10; the daemon attribute is the supported form
            worker.daemon = True
            worker.start()
        for sample in self.metadata:
            # Create the analysis type-specific GenObject
            setattr(sample, 'paratyper', GenObject())
            sample.paratyper.sketchfilenoext = os.path.join(
                sample.general.outputdirectory, sample.name)
            sample.paratyper.sketchfile = sample.paratyper.sketchfilenoext + '.msh'
            # The output is given without extension; sketchfile above is the same name with '.msh' appended
            sample.commands.sketch = 'mash sketch -m 2 -r {reads} -o {output_file}' \
                .format(reads=sample.general.normalised_reads,
                        output_file=sample.paratyper.sketchfilenoext)
            self.sketchqueue.put(sample)
        # Block until every queued sample has been marked as done
        self.sketchqueue.join()
예제 #7
0
 def taxids(self):
     """
     Record the taxIDs of interest for every sample, then load the assignment file

     For each sample, sample.general.abundancefile is read, and every TaxID whose 'Proportion_All(%)' value is
     greater than self.cutoff is appended to sample.general.taxids. An attribute named after each taxID,
     holding an empty readlist, is created on the sample
     """
     for sample in self.runmetadata.samples:
         # Initialise a list to store the taxIDs of interest
         sample.general.taxids = list()
         # Read the abundance file; the context manager ensures that the file handle is closed
         with open(sample.general.abundancefile) as abundance:
             # Filter abundance to taxIDs with at least self.cutoff% of the total proportion
             for row in DictReader(abundance):
                 # The UNKNOWN category doesn't contain a 'Lineage' column, and therefore, subsequent columns
                 # are shifted out of proper alignment, and do not contain the appropriate data
                 try:
                     if float(row['Proportion_All(%)']) > self.cutoff:
                         sample.general.taxids.append(row['TaxID'])
                 # ValueError: the cell is not numeric. TypeError: DictReader fills the columns missing from
                 # a short row with None
                 except (ValueError, TypeError):
                     pass
         for taxid in sample.general.taxids:
             # Create an attribute for each taxID
             setattr(sample, taxid, GenObject())
             sample[taxid].readlist = list()
     # Print the metadata to file
     metadataprinter.MetadataPrinter(self)
     # Load the assignment file to memory
     self.loadassignment()
예제 #8
0
    def estimateabundance(self):
        """
        Estimate the abundance of taxonomic groups

        self.cpus daemon threads running self.estimate are started. For every sample whose
        sample.general.combined is not 'NA', the name of the abundance report is derived, the system calls
        are stored in sample.commands, and the sample is placed on self.abundancequeue. The method returns
        once the queue has been processed
        """
        logging.info('Estimating abundance of taxonomic groups')
        # Create and start threads
        for _ in range(self.cpus):
            # Send the threads to the appropriate destination function
            worker = Thread(target=self.estimate, args=())
            # Daemon threads do not keep the interpreter alive at exit. Thread.setDaemon() is deprecated
            # since Python 3.10, so the daemon attribute is set directly
            worker.daemon = True
            # Start the threading
            worker.start()
        with progressbar(self.runmetadata.samples) as bar:
            for sample in bar:
                try:
                    if sample.general.combined != 'NA':
                        # Set the name of the abundance report: everything before the first period of the
                        # combined file path, plus '_abundance.csv'
                        sample.general.abundance = sample.general.combined.split(
                            '.')[0] + '_abundance.csv'
                        # Start with a fresh commands object if the current one holds no data
                        if not sample.commands.datastore:
                            sample.commands = GenObject()

                        # Define system calls
                        sample.commands.target = self.targetcall
                        sample.commands.classify = self.classifycall
                        sample.commands.abundancecall = \
                            'cd {} && ./estimate_abundance.sh -D {} -F {} > {}'.format(self.clarkpath,
                                                                                       self.databasepath,
                                                                                       sample.general.classification,
                                                                                       sample.general.abundance)
                        self.abundancequeue.put(sample)
                except KeyError:
                    pass
        # Block until every queued sample has been marked as done
        self.abundancequeue.join()
예제 #9
0
 def __init__(self, args, pipelinecommit, startingtime, scriptpath):
     """
     Configure the CLARK analysis from the supplied arguments, and run it

     Three modes are supported:
       * args.runmetadata exists and is truthy: pipeline mode, re-using the supplied metadata
       * args.runmetadata exists but is falsy: stand-alone mode
       * args has no runmetadata attribute: stand-alone mode, with every sample flagged as a metagenome
     :param args: argument object. path, sequencepath, databasepath, database, rank, clarkpath, cutoff,
     clean_seqs, light, extension and filter are read here; runmetadata and reffilepath are optional
     :param pipelinecommit: stored as a string in self.commit
     :param startingtime: stored unchanged in self.start
     :param scriptpath: stored unchanged in self.homepath
     :raises AssertionError: if path, sequencepath or databasepath is not an existing directory
     """
     # Initialise variables
     self.commit = str(pipelinecommit)
     self.start = startingtime
     self.homepath = scriptpath
     # Define variables based on supplied arguments
     self.args = args
     self.path = os.path.join(args.path)
     assert os.path.isdir(
         self.path
     ), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
     self.sequencepath = os.path.join(args.sequencepath, '')
     assert os.path.isdir(self.sequencepath), u'Supplied sequence path is not a valid directory {0!r:s}' \
         .format(self.sequencepath)
     self.databasepath = os.path.join(args.databasepath, '')
     assert os.path.isdir(self.databasepath), u'Supplied database path is not a valid directory {0!r:s}' \
         .format(self.databasepath)
     # There seems to be an issue with CLARK when running with a very high number of cores, so a small fixed
     # number of threads is used
     self.cpus = 4
     # Set variables from the arguments
     self.database = args.database
     self.rank = args.rank
     self.clarkpath = args.clarkpath
     # The supplied cutoff is multiplied by 100 before being stored
     self.cutoff = float(args.cutoff) * 100
     # Initialise variables for the analysis
     self.targetcall = str()
     self.classifycall = str()
     self.devnull = open(os.devnull, 'wb')
     self.filelist = os.path.join(self.path, 'sampleList.txt')
     self.reportlist = os.path.join(self.path, 'reportList.txt')
     self.abundancequeue = Queue()
     self.datapath = str()
     self.reportpath = os.path.join(self.path, 'reports')
     self.clean_seqs = args.clean_seqs
     self.light = args.light
     self.extension = args.extension
     # Cleaning the sequences is impossible without a reference file path
     if self.clean_seqs:
         try:
             self.reffilepath = args.reffilepath
         except AttributeError:
             self.clean_seqs = False
     # If run as part of the assembly pipeline, a few modifications are necessary to ensure that the metadata
     # objects and variables play nice. Only the attribute lookup is guarded: an AttributeError raised during
     # the analysis itself must propagate instead of silently restarting the analysis in stand-alone mode
     try:
         runmetadata = args.runmetadata
     except AttributeError:
         self._standalone_run(force_metagenome=True)
     else:
         if runmetadata:
             self._pipeline_run(runmetadata)
         else:
             self._standalone_run(force_metagenome=False)
     # Optionally filter the .fastq reads based on taxonomic assignment
     if args.filter:
         filtermetagenome.PipelineInit(self)
     # Print the metadata to file
     metadataprinter.MetadataPrinter(self)

 def _standalone_run(self, force_metagenome):
     """Create fresh metadata objects, optionally flag every sample as a metagenome, and run the analysis"""
     self.runmetadata = MetadataObject()
     self.report = os.path.join(self.reportpath, 'abundance.xlsx')
     # Create the objects
     self.objectprep()
     if force_metagenome:
         # Set the run description to 'metagenome' in order to process the samples
         for sample in self.runmetadata.samples:
             sample.run.Description = 'metagenome'
     self.main()

 def _pipeline_run(self, runmetadata):
     """Run CLARK on the metadata supplied by the pipeline, unless the final report already exists"""
     self.runmetadata = runmetadata
     # Create the name of the final report
     self.report = os.path.join(self.reportpath, 'abundance_{ft}.xlsx'.format(ft=self.extension))
     # Only re-run the CLARK analyses if the CLARK report doesn't exist
     if os.path.isfile(self.report):
         return
     logging.info('Performing CLARK analysis on {ft} files'.format(ft=self.extension))
     if self.extension != 'fastq':
         for sample in self.runmetadata.samples:
             sample.general.combined = sample.general.bestassemblyfile
         # Run the pipeline
         self.main()
     else:
         # Only perform FASTQ analyses if the sample is declared to be a metagenome
         metagenome = False
         for sample in self.runmetadata.samples:
             try:
                 status = sample.run.Description
             except AttributeError:
                 status = 'unknown'
             if status == 'metagenome':
                 metagenome = True
         # If any of the samples are metagenomes, run the CLARK analysis on the raw files
         if metagenome:
             fileprep.Fileprep(self)
             # Run the pipeline
             self.main()
     # Clean up the files and create/delete attributes to be consistent with pipeline Metadata objects
     clarkextension = 'clark{}'.format(self.extension)
     for sample in self.runmetadata.samples:
         self._store_clark_outputs(sample, clarkextension)
     # Remove the text file lists of files and reports created by CLARK. An explicit loop is required: map()
     # is lazy in Python 3, so a bare map() call never deletes anything
     for listing in ('reportList.txt', 'sampleList.txt'):
         try:
             os.remove(os.path.join(self.path, listing))
         except OSError:
             pass

 def _store_clark_outputs(self, sample, clarkextension):
     """Move the CLARK files of one sample to its CLARK folder, and record them under clarkextension"""
     # Create a GenObject to store metadata when this script is run as part of the pipeline
     setattr(sample, clarkextension, GenObject())
     clark = sample[clarkextension]
     # Create a folder to store all the CLARK files
     clark.outputpath = os.path.join(sample.general.outputdirectory, 'CLARK')
     make_path(clark.outputpath)
     if sample.general.bestassemblyfile == 'NA':
         return
     # Move the files to the CLARK folder
     try:
         move(sample.general.abundance,
              os.path.join(clark.outputpath, os.path.basename(sample.general.abundance)))
         move(sample.general.classification,
              os.path.join(clark.outputpath, os.path.basename(sample.general.classification)))
     except (AttributeError, FileNotFoundError):
         pass
     # Set the CLARK-specific attributes
     # NOTE(review): these are the paths from before the move above - confirm that this is intended
     try:
         clark.abundance = sample.general.abundance
         clark.classification = sample.general.classification
         clark.combined = sample.general.combined
     except AttributeError:
         pass
     if self.extension == 'fastq':
         # Remove the combined .fastq files. Only a list of files is removed; os.remove() cannot take the
         # list itself, so the entries are deleted one at a time
         try:
             combined = clark.combined
         except AttributeError:
             combined = None
         if isinstance(combined, list):
             for combined_file in combined:
                 try:
                     os.remove(combined_file)
                 except OSError:
                     pass
예제 #10
0
    def probefinder(self):
        """
        Find the longest probe sequences

        For every alignment file in sample.alignedalleles, the alignment is read (sequences of unequal length
        are padded with '.' and written to a '_padded.tfa' file first), a consensus sequence is calculated,
        and the percent identity of the most common base is determined for every column. Sliding windows of
        decreasing size (self.max down to self.min) are then evaluated; a window passes when every column
        has an identity greater than self.cutoff. The search stops at the largest size with a passing window.
        The results for each alignment are appended to sample.gene
        """
        logging.info('Finding and filtering probe sequences')
        for sample in self.samples:
            # A list to store the metadata object for each alignment
            sample.gene = list()
            for align in sample.alignedalleles:
                # Create an object to store all the information for each alignment file
                metadata = GenObject()
                metadata.name = os.path.splitext(os.path.basename(align))[0]
                metadata.alignmentfile = align
                # Create an alignment object from the alignment file
                try:
                    metadata.alignment = AlignIO.read(align, 'fasta')
                except ValueError:
                    # If a ValueError: Sequences must all be the same length is raised, pad the shorter sequences
                    # to be the length of the longest sequence
                    # https://stackoverflow.com/q/32833230
                    # A list is required, as the records are iterated more than once
                    records = list(SeqIO.parse(align, 'fasta'))
                    # Calculate the length of the longest sequence
                    maxlen = max(len(record.seq) for record in records)
                    # Pad sequences so that they all have the same length
                    for record in records:
                        if len(record.seq) != maxlen:
                            record.seq = Seq(str(record.seq).ljust(maxlen, '.'))
                    assert all(len(record.seq) == maxlen for record in records)
                    # Write the padded sequences to file, and read the alignment from that file
                    metadata.alignmentfile = '{}_padded.tfa'.format(
                        os.path.splitext(align)[0])
                    with open(metadata.alignmentfile, 'w') as padded:
                        SeqIO.write(records, padded, 'fasta')
                    metadata.alignment = AlignIO.read(metadata.alignmentfile,
                                                      'fasta')

                metadata.summaryalign = AlignInfo.SummaryInfo(
                    metadata.alignment)
                # The dumb consensus is a very simple consensus sequence calculated from the alignment. Default
                # parameters of threshold=.7, and ambiguous='X' are used
                consensus = metadata.summaryalign.dumb_consensus()
                metadata.consensus = str(consensus)
                # The position-specific scoring matrix (PSSM) stores the frequency of each base observed at each
                # location along the entire consensus sequence
                metadata.pssm = metadata.summaryalign.pos_specific_score_matrix(
                    consensus)
                metadata.identity = list()
                # Find the prevalence of each base for every location along the sequence
                for line in metadata.pssm:
                    bases = [line['A'], line['C'], line['G'], line['T']]
                    # Alignments without any gaps have no '-' entry
                    try:
                        gaps = line['-']
                    except KeyError:
                        gaps = 0
                    # Gaps count towards the total, but not towards the most common base
                    total = sum(bases) + gaps
                    # A column without any A, C, G, T or gap (e.g. only padding) has an identity of zero,
                    # rather than raising a ZeroDivisionError
                    percent = max(bases) / total * 100 if total else 0.0
                    metadata.identity.append(float('{:.2f}'.format(percent)))
                # List to store metadata objects
                metadata.windows = list()
                # Variable to store whether a suitable probe has been found for the current organism + gene pair.
                # As the probe sizes are evaluated in descending size, as soon as a probe has been discovered, the
                # search for more probes can stop, and subsequent probes will be smaller than the one(s) already found
                passing = False
                # Create sliding windows of every size from self.max down to self.min from the list of identities
                # for each column of the alignment
                for size in reversed(range(self.min, self.max + 1)):
                    if passing:
                        break
                    windowdata = MetadataObject()
                    windowdata.size = size
                    windowdata.max = 0
                    windowdata.sliding = list()
                    # The lowest mean identity of the passing windows; windowdata.min is only set once a
                    # window has passed
                    lowest_mean = None
                    # Go through each window from the collection of sliding windows to determine which window(s)
                    # has (have) the best results. start is the location of the window in the sequence
                    for start, window in enumerate(self.window(metadata.identity, size)):
                        # Create another object to store all the data for the window
                        slidingdata = MetadataObject()
                        # Only consider the window if every position has a percent identity greater than the cutoff
                        lowest_identity = min(window)
                        if lowest_identity > self.cutoff:
                            mean_identity = float('{:.2f}'.format(numpy.mean(window)))
                            # Populate the object with the necessary variables
                            slidingdata.location = '{}:{}'.format(start, start + size)
                            slidingdata.min = lowest_identity
                            slidingdata.mean = mean_identity
                            slidingdata.sequence = str(consensus[start:start + size])
                            # Track the highest and the lowest mean identity of the passing windows
                            if mean_identity >= windowdata.max:
                                windowdata.max = mean_identity
                            if lowest_mean is None or mean_identity < lowest_mean:
                                lowest_mean = mean_identity
                            windowdata.min = lowest_mean
                            # Add the object to the list of objects
                            windowdata.sliding.append(slidingdata)
                            passing = True
                    # Add the object to the list of objects
                    metadata.windows.append(windowdata)
                # Add the object to the list of objects
                sample.gene.append(metadata)