Example #1
 def __init__(self, inputobject):
     self.metadata = inputobject.runmetadata.samples
     self.start = inputobject.starttime
     self.commit = inputobject.commit
     # Determine the versions of the software used
     printtime('Populating metadata', self.start)
     self.python = sys.version.replace('\n', '')
     self.arch = ", ".join(os.uname())
     self.blast = get_version(['blastn', '-version']).decode('utf-8').split('\n')[0].split()[1]
     self.spades = get_version(['spades.py', '-v']).decode('utf-8').split('\n')[0].split()[1]
     self.bowversion = Bowtie2CommandLine(version=True)()[0].split('\n')[0].split()[-1]
     self.samversion = get_version(['samtools', '--version']).decode('utf-8').split('\n')[0].split()[1]
     # Qualimap can print a Java warning message that doesn't necessarily show up on every system
     # Only capture the line that starts with 'Qualimap'
     qualimaplist = get_version(['qualimap', '--help']).decode('utf-8').split('\n')
     for line in qualimaplist:
         if 'QualiMap' in line:
             self.qualimap = line.split()[1]
     self.mash = get_version(['mash']).decode('utf-8').split('\n')[1].split()[2]
     self.prodigal = get_version(['prodigal', '-v']).decode('utf-8').split('\n')[1].split()[1]
     self.bbmap = get_version(['bbversion.sh']).decode('utf-8')
     self.fastqc = get_version(['fastqc', '--version']).decode('utf-8').split('\n')[0].split()[1]
     # Update this once you figure out where the bcl2fastq version is stored
     self.bcl2fastq = "2"
     self.perl = get_version(['perl', '-v']).decode('utf-8').split('\n')[1].split('This is ')[1]
     self.biopython = Bio.__version__
     self.java = get_version(['java', '-showversion']).decode('utf-8').split('\n')[0].split()[2].replace('"', '')
     # self.docker = get_version(['docker', 'version']).split('\n')[1].split()[1]
     self.versions()
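Note: get_version is not defined in these snippets. Judging from how it is called above (a command list in, raw bytes out that are later decoded), it is presumably a thin subprocess wrapper along these lines - a minimal sketch, not the actual helper:

 import subprocess

 def get_version(command):
     # Run the version command, merging stderr into stdout, and return the raw bytes
     return subprocess.Popen(command,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT).communicate()[0]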
Example #2
    def getrmlsthelper(self):
        """
        Uses rest_auth_class, a Python script modified from
        https://github.com/kjolley/BIGSdb/tree/develop/scripts/test,
        and downloads the most up-to-date rMLST profile and alleles
        """

        printtime('Downloading {} alleles'.format(self.analysistype),
                  self.start)
        # Extract the path of the current script from the full path + file name
        homepath = os.path.split(os.path.abspath(__file__))[0]
        # Set the path/name of the folder to contain the new alleles and profile
        newfolder = os.path.join(self.path, self.analysistype)
        # Create the path
        make_path(newfolder)
        # Create arguments to feed into the rest_auth_class script
        args = ArgumentParser().parse_args([])  # yields an empty Namespace to hold the attributes below
        args.secret_file = os.path.join(homepath, 'secret.txt')
        args.file_path = homepath
        args.output_path = newfolder
        args.start = self.start
        rmlst = rest_auth_class.REST(args)
        # Download the profile and alleles
        rmlst.main()

        # Get the new alleles into a list, and create the combinedAlleles file
        alleles = glob(os.path.join(newfolder, '*.tfa'))
        self.combinealleles(newfolder, alleles)
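Note on the args fix above: the original code assigned the ArgumentParser class itself to args and then set attributes on the class object. An argparse.Namespace states the intent directly; a sketch equivalent to the four assignments above:

        from argparse import Namespace

        args = Namespace(secret_file=os.path.join(homepath, 'secret.txt'),
                         file_path=homepath,
                         output_path=newfolder,
                         start=self.start)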
Example #3
 def remove(self):
     """Removes unnecessary temporary files generated by the pipeline"""
     import shutil
     printtime('Removing large and/or temporary files', self.start)
     removefolder = list()
     for sample in self.metadata:
         # Use os.walk to iterate through all the files in the sample output directory
         for path, dirs, files in os.walk(sample.general.outputdirectory):
             # Queue the core genome and prophage folders for removal below
             if 'coregenome' in path or 'prophages' in path:
                 removefolder.append(path)
             for item in files:
                 # Use regex to find files to remove - note the escaped dots in the extension patterns
                 if re.search(r"\.fastq$", item) or re.search(r"\.fastq\.gz$", item) or re.search(r"\.bam$", item) \
                         or re.search(r"\.bt2$", item) or re.search(r"\.tab$", item) or re.search(r"^before", item) \
                         or re.search(r"^baitedtargets", item) or re.search(r"_combined\.csv$", item) \
                         or re.search(r"^scaffolds", item) or re.search(r"\.fastg$", item) or re.search(r"\.gfa$", item) \
                         or re.search(r"\.bai$", item) or 'coregenome' in path or 'prophages' in path:
                     # Keep the baitedtargets.fa, core genome, and merged metagenome files
                     if item != 'baitedtargets.fa' and not re.search("coregenome", item) \
                             and not re.search("paired", item):
                         # Remove the unnecessary files
                         try:
                             os.remove(os.path.join(path, item))
                         except IOError:
                             pass
     # Clear out the folders
     for folder in removefolder:
         try:
             shutil.rmtree(folder)
         except (OSError, TypeError):
             pass
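The chain of re.search calls above can be written more plainly with suffix/prefix tuples; a sketch of the same matching logic, assuming simple suffix and prefix tests are all that is required:

 remove_suffixes = ('.fastq', '.fastq.gz', '.bam', '.bt2', '.tab',
                    '_combined.csv', '.fastg', '.gfa', '.bai')
 remove_prefixes = ('before', 'baitedtargets', 'scaffolds')
 for path, dirs, files in os.walk(sample.general.outputdirectory):
     for item in files:
         if item.endswith(remove_suffixes) or item.startswith(remove_prefixes) \
                 or 'coregenome' in path or 'prophages' in path:
             # Keep the baitedtargets.fa, core genome, and merged metagenome files
             if item != 'baitedtargets.fa' and 'coregenome' not in item and 'paired' not in item:
                 try:
                     os.remove(os.path.join(path, item))
                 except OSError:
                     pass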
Example #4
 def reporter(self):
     """
     Creates a report of the results
     """
     printtime('Creating {} report'.format(self.analysistype),
               self.starttime)
     # Create the path in which the reports are stored
     make_path(self.reportpath)
     header = 'Strain,Serotype\n'
     data = ''
     with open(
             os.path.join(self.reportpath,
                          '{}.csv'.format(self.analysistype)),
             'w') as report:
         for sample in self.runmetadata.samples:
             if sample.general.bestassemblyfile != 'NA':
                 data += sample.name + ','
                 if sample[self.analysistype].results:
                     serotype = '{oset} ({opid}):{hset} ({hpid})' \
                         .format(oset=';'.join(sample.serosippr.o_set),
                                 opid=sample.serosippr.best_o_pid,
                                 hset=';'.join(sample.serosippr.h_set),
                                 hpid=sample.serosippr.best_h_pid)
                     data += '{}\n'.format(serotype)
                 else:
                     data += '\n'
         report.write(header)
         report.write(data)
Example #5
 def parse_qaml(self):
     """
     Parse the GenomeQAML report, and populate metadata objects
     """
     printtime('Parsing GenomeQAML outputs', self.start)
     # A dictionary to store the parsed excel file in a more readable format
     nesteddictionary = dict()
     # Use pandas to read in the CSV file, and convert the pandas data frame to a dictionary (.to_dict())
     dictionary = pandas.read_csv(self.qaml_report).to_dict()
     # Iterate through the dictionary - each header from the CSV file
     for header in dictionary:
         # Sample is the primary key, and value is the value of the cell for that primary key + header combination
         for sample, value in dictionary[header].items():
             # Update the dictionary with the new data
             try:
                 nesteddictionary[sample].update({header: value})
             # Create the nested dictionary if it hasn't been created yet
             except KeyError:
                 nesteddictionary[sample] = dict()
                 nesteddictionary[sample].update({header: value})
     # Get the results into the metadata object
     for sample in self.metadata:
         # Initialise the analysis-specific GenObject
         setattr(sample, self.analysistype, GenObject())
         # Initialise the prediction attribute
         sample[self.analysistype].prediction = str()
         # Iterate through the dictionary of results
         for line in nesteddictionary:
             # Extract the sample name from the dictionary
             name = nesteddictionary[line]['Sample']
             # Ensure that the names match
             if name == sample.name:
                 # Set the predicted class extracted from the dictionary
                 sample[self.analysistype].prediction = nesteddictionary[
                     line]['Predicted_Class']
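The same parsing can skip the header/sample nesting entirely by reading the report row-wise; a sketch assuming the GenomeQAML report has the 'Sample' and 'Predicted_Class' columns used above:

 for sample in self.metadata:
     setattr(sample, self.analysistype, GenObject())
     sample[self.analysistype].prediction = str()
 for row in pandas.read_csv(self.qaml_report).to_dict(orient='records'):
     for sample in self.metadata:
         if row['Sample'] == sample.name:
             sample[self.analysistype].prediction = row['Predicted_Class']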
Example #6
    def mashing(self):
        printtime('Performing {} analyses'.format(self.analysistype), self.starttime)
        # Create the threads for the analysis
        for i in range(self.cpus):
            threads = Thread(target=self.mash, args=())
            threads.daemon = True
            threads.start()
        # Populate threads for each gene, genome combination
        for sample in self.metadata:
            sample[self.analysistype].mashresults = os.path.join(sample[self.analysistype].reportdir, '{}.tab'.format(
                sample.name))

            sample.commands.mash = \
                'mash dist -p {} {} {} | sort -gk3 > {}'.format(self.threads,
                                                                sample[self.analysistype].refseqsketch,
                                                                sample[self.analysistype].sketchfile,
                                                                sample[self.analysistype].mashresults)
            try:
                self.mashqueue.put(sample)
            except (KeyboardInterrupt, SystemExit):
                printtime('Received keyboard interrupt, quitting threads', self.starttime)
                quit()
        # Join the threads
        self.mashqueue.join()
        self.parse()
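The self.mash worker that consumes self.mashqueue is not shown. A minimal sketch of the consumer side of this queue pattern, assuming the object also carries the self.devnull handle and subprocess import used elsewhere in these examples:

    def mash(self):
        while True:
            sample = self.mashqueue.get()
            # Only run the analysis if the results file does not already exist
            if not os.path.isfile(sample[self.analysistype].mashresults):
                subprocess.call(sample.commands.mash,
                                shell=True,
                                stdout=self.devnull,
                                stderr=self.devnull)
            self.mashqueue.task_done()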
Example #7
 def targets(self):
     """
     Using the data from the BLAST analyses, set the targets folder, and create the 'mapping file'. This is the
     genera-specific FASTA file that will be used for all the reference mapping; it replaces the 'bait file' in the
     code
     """
     printtime('Performing analysis with {} targets folder'.format(
         self.analysistype),
               self.start,
               output=self.portallog)
     for sample in self.runmetadata:
         if sample.general.bestassemblyfile != 'NA':
             sample[self.analysistype].targetpath = \
                 os.path.join(self.targetpath, 'genera', sample[self.analysistype].genus, '')
             # There is a relatively strict databasing scheme necessary for the custom targets. Eventually,
             # there will be a helper script to combine individual files into a properly formatted combined file
             try:
                 sample[self.analysistype].mappingfile = glob(
                     '{}*.fa'.format(
                         sample[self.analysistype].targetpath))[0]
             # If the fasta file is missing, raise a custom error
             except IndexError as e:
                 # noinspection PyPropertyAccess
                 e.args = [
                     'Cannot find the combined fasta file in {}. Please note that the file must have a '
                     '.fa extension'.format(
                         sample[self.analysistype].targetpath)
                 ]
                 if os.path.isdir(sample[self.analysistype].targetpath):
                     raise
                 else:
                     sample.general.bestassemblyfile = 'NA'
Example #8
 def download_profile(self):
     """
     Download the profile from the database
     """
     printtime('Downloading profile', self.start)
     # Set the name of the profile file
     profile_file = os.path.join(self.output_path, 'profile.txt')
     size = 0
     # Check whether the file exists and whether it is large enough; a small file likely indicates a failed download
     try:
         stats = os.stat(profile_file)
         size = stats.st_size
     except FileNotFoundError:
         pass
     # Only download the profile if the file doesn't exist, or is likely truncated
     if not os.path.isfile(profile_file) or size <= 100:
         # Create a new session
         session = OAuth1Session(self.consumer_key,
                                 self.consumer_secret,
                                 access_token=self.session_token,
                                 access_token_secret=self.session_secret)
         # The profile file is called profiles_csv on the server. Update the URL appropriately
         r = session.get(self.profile + '/1/profiles_csv')
         # On a successful GET request, parse the returned data appropriately
         if r.status_code == 200 or r.status_code == 201:
             if re.search('json', r.headers['content-type'], flags=0):
                 decoded = r.json()
             else:
                 decoded = r.text
             # Write the profile file to disk
             with open(profile_file, 'w') as profile:
                 profile.write(decoded)
Example #9
 def numberofsamples(self):
     """Count the number of samples is the samplesheet"""
     # Initialise variables to store line data
     idline = 0
     linenumber = 0
     # Parse the sample sheet to find the number of samples
     with open(self.samplesheet, "rb") as ssheet:
         # Use enumerate to iterate through the lines in the sample sheet to retrieve the line number and the data
         for linenumber, entry in enumerate(ssheet):
             # Once Sample_ID is encountered
             if "Sample_ID" in entry:
                 # Set the id line as the current line number
                 idline = linenumber
     # The sample count is the last line number in the file minus the line number of the Sample_ID header
     self.samplecount = linenumber - idline
     printtime(
         'There are {} samples in this run. '
         'Running off-hours module with the following parameters:\n'
         'MiSeqPath: {},\n'
         'MiSeqFolder: {},\n'
         'SampleSheet: {}'.format(self.samplecount, self.miseqpath,
                                  self.miseqfolder, self.samplesheet),
         self.start)
     # Run the fastqmover module now that the number of sequences is known
     self.fastqlinker()
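For reference, the arithmetic above assumes a MiSeq SampleSheet.csv laid out roughly like this hypothetical excerpt, where every line after the Sample_ID header row describes one sample:

 [Data]
 Sample_ID,Sample_Name,I7_Index_ID,index
 2017-SEQ-0001,sample1,N701,TAAGGCGA
 2017-SEQ-0002,sample2,N702,CGTACTAG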
Example #10
 def clean_sequences(self):
     """Removes reads/contigs that contain plasmids, and masks phage sequences."""
     printtime('Removing plasmids and masking phages', self.start)
     plasmid_db = os.path.join(self.reffilepath, 'plasmidfinder',
                               'plasmid_database.fa')
     phage_db = os.path.join(self.reffilepath, 'prophages',
                             'combinedtargets.tfa')
     for sample in self.runmetadata.samples:
         plasmid_removal = 'bbduk.sh ref={} in={} out={} overwrite'\
             .format(plasmid_db, sample.general.combined, sample.general.combined.replace('.f', '_noplasmid.f'))
         subprocess.call(plasmid_removal,
                         shell=True,
                         stdout=self.devnull,
                         stderr=self.devnull)
         phage_masking = 'bbduk.sh ref={} in={} out={} kmask=N overwrite'\
             .format(phage_db, sample.general.combined.replace('.f', '_noplasmid.f'),
                     sample.general.combined.replace('.f', '_clean.f'))
         subprocess.call(phage_masking,
                         shell=True,
                         stdout=self.devnull,
                         stderr=self.devnull)
         os.remove(sample.general.combined)
         os.rename(sample.general.combined.replace('.f', '_clean.f'),
                   sample.general.combined)
         os.remove(sample.general.combined.replace('.f', '_noplasmid.f'))
Example #11
 def classifymetagenome(self):
     """Run the classify metagenome of the CLARK package on the samples"""
     printtime('Classifying metagenomes', self.start)
     # Define the system call
     self.classifycall = 'cd {} && ./classify_metagenome.sh -O {} -R {} -n {} --light'\
         .format(self.clarkpath,
                 self.filelist,
                 self.reportlist,
                 self.cpus)
     # Variable to store classification state
     classify = True
     for sample in self.runmetadata.samples:
         try:
             # Define the name of the .csv classification file
             sample.general.classification = os.path.splitext(
                 sample.general.combined)[0] + '.csv'
             # If the file exists, then set classify to False
             if os.path.isfile(sample.general.classification):
                 classify = False
         except KeyError:
             pass
     # Run the system call if the samples have not been classified
     if classify:
         # Run the call
         subprocess.call(self.classifycall,
                         shell=True,
                         stdout=self.devnull,
                         stderr=self.devnull)
Example #12
 def normalise_reads(self):
     """
     Use bbnorm from the bbmap suite of tools to perform read normalisation
     """
     printtime('Normalising reads to a kmer depth of 100', self.start)
     for sample in self.metadata:
         # Set the name of the normalised read files
         sample.general.normalisedreads = [
             fastq.split('.fastq.gz')[0] + '_normalised.fastq.gz'
             for fastq in sorted(sample.general.fastqfiles)
         ]
         try:
             # Run the normalisation command
             out, err, cmd = bbtools.bbnorm(
                 forward_in=sorted(
                     sample.general.trimmedcorrectedfastqfiles)[0],
                 forward_out=sample.general.normalisedreads[0],
                 returncmd=True,
                 threads=self.cpus)
             sample[self.analysistype].normalisecmd = cmd
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
         except CalledProcessError:
             sample.general.normalisedreads = sample.general.trimmedfastqfiles
         except IndexError:
             sample.general.normalisedreads = list()
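bbtools here is an external wrapper module; its real signature may differ, but the call above relies on behaviour roughly like this sketch:

 import subprocess

 def bbnorm(forward_in, forward_out, returncmd=False, threads=1, **kwargs):
     # bbnorm.sh normalises reads to a target kmer depth (100 by default)
     cmd = 'bbnorm.sh in={} out={} threads={}'.format(forward_in, forward_out, threads)
     # check=True raises CalledProcessError on a non-zero exit, matching the except clause above
     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
     return (result.stdout, result.stderr, cmd) if returncmd else (result.stdout, result.stderr)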
Example #14
 def quality(self):
     """
     Creates quality objects and runs quality assessments and quality processes on the
     supplied sequences
     """
     # Validate that the FASTQ files are in the proper format, and that there are no issues e.g. mismatched
     # numbers of forward and reverse reads, read lengths longer than quality score lengths, or improper extensions
     self.fastq_validate()
     # Run FastQC on the unprocessed fastq files
     self.fastqc_raw()
     # Perform quality trimming and FastQC on the trimmed files
     self.quality_trim()
     # Run FastQC on the trimmed files
     self.fastqc_trimmed()
     # Perform error correcting on the reads
     self.error_correct()
     # Detect contamination in the reads
     self.contamination_detection()
     # Run FastQC on the processed fastq files
     self.fastqc_trimmedcorrected()
     # Print the metadata to file
     metadataprinter.MetadataPrinter(self)
     # Exit if only pre-processing of data is requested
     if self.preprocess:
         printtime('Pre-processing complete', self.starttime)
         quit()
Example #15
    def estimateabundance(self):
        """
        Estimate the abundance of taxonomic groups
        """
        printtime('Estimating abundance of taxonomic groups', self.start)
        # Create and start threads
        for i in range(self.cpus):
            # Send the threads to the appropriate destination function
            threads = Thread(target=self.estimate, args=())
            # Set daemon to True so the threads are terminated when the main process exits
            threads.daemon = True
            # Start the threading
            threads.start()
        for sample in self.runmetadata.samples:
            try:
                if sample.general.combined != 'NA':
                    # Set the name of the abundance report
                    sample.general.abundance = os.path.splitext(
                        sample.general.combined)[0] + '_abundance.csv'
                    # Create the commands GenObject if it is missing
                    if not sample.commands.datastore:
                        sample.commands = GenObject()

                    # Define system calls
                    sample.commands.target = self.targetcall
                    sample.commands.classify = self.classifycall
                    sample.commands.abundancecall = \
                        'cd {} && ./estimate_abundance.sh -D {} -F {} > {}'.format(self.clarkpath,
                                                                                   self.databasepath,
                                                                                   sample.general.classification,
                                                                                   sample.general.abundance)
                    self.abundancequeue.put(sample)
            except KeyError:
                pass
        self.abundancequeue.join()
Example #16
 def reporter(self):
     """
     Runs the necessary methods to parse raw read outputs
     """
     printtime('Preparing reports', self.starttime)
     # Populate self.plusdict in order to reuse parsing code from an assembly-based method
     for sample in self.runmetadata.samples:
         if sample.general.bestassemblyfile != 'NA':
             for gene in sample[self.analysistype].allelenames:
                 for allele, percentidentity in sample[
                         self.analysistype].results.items():
                     if gene in allele:
                         # Split the allele number from the gene name using the appropriate delimiter
                         if '_' in allele:
                             splitter = '_'
                         elif '-' in allele:
                             splitter = '-'
                         else:
                             splitter = ''
                         # Create the plusdict dictionary as in the assembly-based (r)MLST method. Allows all the
                         # parsing and sequence typing code to be reused.
                         try:
                             self.plusdict[sample.name][gene][allele.split(splitter)[1]][percentidentity] \
                                 = sample[self.analysistype].avgdepth[allele]
                         except (IndexError, ValueError):
                             pass
     self.profiler()
     self.sequencetyper()
     self.mlstreporter()
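For orientation, the nested plusdict populated above has this shape (illustrative values only):

 # plusdict[sample name][gene][allele number][percent identity] = average depth
 # {'2017-SEQ-0001': {'BACT000001': {'12': {100.0: 34.2}}}}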
Example #17
    def extract_rmlst_reads(self, fastq_pairs, fastq_singles):
        """
        Extracts rmlst reads and puts them in a folder.
        :param fastq_pairs: List of fastqpairs in nested array [[forward1, reverse1], [forward2, reverse2]]
        :param fastq_singles: List of fastq singles.
        :return: Zip, zilch, nada.
        """
        for pair in fastq_pairs:
            cmd = 'bbduk.sh ref={} in1={} in2={} outm={}' \
              ' outm2={}'.format(self.database, pair[0], pair[1], self.output_file + 'rmlsttmp/' + pair[0].split('/')[-1],
                                 self.output_file + 'rmlsttmp/' + pair[1].split('/')[-1])
            with open(self.output_file + 'tmp/junk.txt', 'w') as outjunk:
                try:  # This should give bbduk more than enough time to run, unless the user's computer is super slow.
                    # Maybe adjust the value later.
                    subprocess.call(cmd, shell=True, stderr=outjunk, timeout=3600)
                except subprocess.TimeoutExpired:
                    printtime(pair[0] + ' appears to be making BBDUK run forever. Killing...', self.start)
                    os.remove(self.output_file + 'rmlsttmp/' + pair[0].split('/')[-1])
                    os.remove(self.output_file + 'rmlsttmp/' + pair[1].split('/')[-1])

        for single in fastq_singles:
            cmd = 'bbduk.sh ref={} in={} outm={}' \
              ''.format(self.database, single, self.output_file + 'rmlsttmp/' + single.split('/')[-1])
            with open(self.output_file + 'tmp/junk.txt', 'w') as outjunk:
                try:  # This should give bbduk more than enough time to run, unless the user's computer is super slow.
                    # Maybe adjust the value later.
                    subprocess.call(cmd, shell=True, stderr=outjunk, timeout=3600)
                except subprocess.TimeoutExpired:
                    printtime(single + ' appears to be making BBDUK run forever. Killing...', self.start)
                    os.remove(self.output_file + 'rmlsttmp/' + single.split('/')[-1])
Example #18
 def blast(self):
     """
     Run BLAST analyses of the FASTA files created from the subsampled FASTQ reads against the NCBI 16S
     reference database
     """
     printtime('BLASTing FASTA files against {} database'.format(
         self.analysistype),
               self.starttime,
               output=self.portallog)
     for _ in range(self.cpus):
         threads = Thread(target=self.blastthreads, args=())
         threads.daemon = True
         threads.start()
     for sample in self.runmetadata.samples:
         if sample.general.bestassemblyfile != 'NA':
             # Set the name of the BLAST report
             sample[self.analysistype].blastreport = os.path.join(
                 sample[self.analysistype].outputdir,
                 '{}_{}_blastresults.csv'.format(sample.name,
                                                 self.analysistype))
             # Use the NCBI BLASTn command line wrapper module from BioPython to set the parameters of the search
             blastn = NcbiblastnCommandline(
                 query=sample[self.analysistype].fasta,
                 db=os.path.splitext(sample[self.analysistype].baitfile)[0],
                 max_target_seqs=1,
                 num_threads=self.threads,
                 outfmt="'6 qseqid sseqid positive mismatch gaps "
                 "evalue bitscore slen length qstart qend qseq sstart send sseq'",
                 out=sample[self.analysistype].blastreport)
             # Add a string of the command to the metadata object
             sample[self.analysistype].blastcall = str(blastn)
             # Add the object and the command to the BLAST queue
             self.blastqueue.put((sample, blastn))
     self.blastqueue.join()
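str(blastn) renders the BioPython wrapper as a runnable command line; for a hypothetical sample and database it comes out roughly like:

 blastn -out /path/2017-SEQ-0001_sixteens_full_blastresults.csv -outfmt '6 qseqid sseqid positive mismatch gaps evalue bitscore slen length qstart qend qseq sstart send sseq' -query /path/2017-SEQ-0001.fasta -db /path/sixteens_full -max_target_seqs 1 -num_threads 4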
Example #19
 def quast(self):
     printtime('Performing Quast analyses', self.start)
     # Create a thread for each sample with a valid assembly
     for i in range(sum(1 for sample in self.metadata
                        if sample.general.bestassemblyfile != 'NA')):
         # Send the threads to the runquast method; args is empty
         threads = Thread(target=self.runquast, args=())
         # Set daemon to True so the threads are terminated when the main process exits
         threads.daemon = True
         # Start the threading
         threads.start()
     for sample in self.metadata:
         if sample.general.bestassemblyfile != 'NA':
             # Create the quast output directory
             quastoutputdirectory = '{}/quast_results/'.format(
                 sample.general.outputdirectory)
             make_path(quastoutputdirectory)
             # Set the quast system call
             quastcall = 'quast.py {} -o {}'.format(
                 sample.general.filteredfile, quastoutputdirectory)
             # Add the command to the metadata
             sample.commands.quast = quastcall
             self.quastqueue.put((sample, quastoutputdirectory))
         else:
             sample.commands.quast = 'NA'
     self.quastqueue.join()
Example #20
 def get_session_token(self):
     """
     Use the access token to request a new session token
     """
     printtime('Getting session token', self.start)
     # Rather than testing any previous session tokens to see if they are still valid, simply delete old tokens in
     # preparation of the creation of new ones
     try:
         os.remove(os.path.join(self.file_path, 'session_token'))
     except FileNotFoundError:
         pass
     # Create a new session
     session_request = OAuth1Session(self.consumer_key,
                                     self.consumer_secret,
                                     access_token=self.access_token,
                                     access_token_secret=self.access_secret)
     # Set the URL appropriately
     url = self.test_rest_url + '/oauth/get_session_token'
     # Perform a GET request with the appropriate keys and tokens
     r = session_request.get(url)
     # If the status code is '200' (OK), proceed
     if r.status_code == 200:
         # Save the JSON-decoded token secret and token
         self.session_token = r.json()['oauth_token']
         self.session_secret = r.json()['oauth_token_secret']
         # Write the token and secret to file
         self.write_token('session_token', self.session_token, self.session_secret)
     # Any other status than 200 is considered a failure
     else:
         print('Failed:')
         print(r.json()['message'])
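write_token is not shown in these examples; a minimal sketch consistent with the call above (the one-value-per-line file format is an assumption):

 def write_token(self, token_type, token, secret):
     # Store the token and its secret for reuse by later sessions
     with open(os.path.join(self.file_path, token_type), 'w') as token_file:
         token_file.write('{}\n{}\n'.format(token, secret))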
Example #21
 def error_correction(self):
     """
     Use tadpole from the bbmap suite of tools to perform error correction of the reads
     """
     printtime('Error correcting reads', self.start)
     for sample in self.metadata:
         sample.general.trimmedcorrectedfastqfiles = [
             fastq.split('.fastq.gz')[0] + '_trimmed_corrected.fastq.gz'
             for fastq in sorted(sample.general.fastqfiles)
         ]
         try:
             out, err, cmd = bbtools.tadpole(
                 forward_in=sorted(sample.general.trimmedfastqfiles)[0],
                 forward_out=sample.general.trimmedcorrectedfastqfiles[0],
                 returncmd=True,
                 mode='correct',
                 threads=self.cpus)
             # Set the command in the object
             sample[self.analysistype].errorcorrectcmd = cmd
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
         except CalledProcessError:
             sample.general.trimmedcorrectedfastqfiles = sample.general.trimmedfastqfiles
         except KeyError:
             sample.general.trimmedcorrectedfastqfiles = list()
Example #22
    def profiler(self):
        """Creates a dictionary from the profile scheme(s)"""
        printtime('Loading profiles', self.starttime)
        from csv import DictReader
        # Initialise variables
        profiledata = defaultdict(make_dict)
        profileset = set()
        genedict = dict()
        # Find all the unique profiles to use with a set
        for sample in self.runmetadata.samples:
            if sample.general.bestassemblyfile != 'NA':
                if sample[self.analysistype].profile != 'NA':
                    profileset.add(sample[self.analysistype].profile)

        # Extract the profiles for each set
        for sequenceprofile in profileset:
            # Clear the list of genes
            genelist = list()
            for sample in self.runmetadata.samples:
                if sample.general.bestassemblyfile != 'NA':
                    if sequenceprofile == sample[self.analysistype].profile:
                        genelist = list(sample[self.analysistype].alleles)
            try:
                # Open the sequence profile file as a dictionary
                profile = DictReader(open(sequenceprofile),
                                     dialect='excel-tab')
            # Revert to standard comma separated values
            except KeyError:
                # Open the sequence profile file as a dictionary
                profile = DictReader(open(sequenceprofile))
            # Iterate through the rows
            for row in profile:
                # Iterate through the genes
                for gene in genelist:
                    # Add the sequence profile, and type, the gene name and the allele number to the dictionary
                    try:
                        profiledata[sequenceprofile][
                            row['ST']][gene] = row[gene]
                    except KeyError:
                        try:
                            profiledata[sequenceprofile][
                                row['rST']][gene] = row[gene]
                        except KeyError:
                            raise
            # Add the gene list to a dictionary
            genedict[sequenceprofile] = sorted(genelist)
            # Add the profile data, and gene list to each sample
            for sample in self.runmetadata.samples:
                if sample.general.bestassemblyfile != 'NA':
                    if sequenceprofile == sample[self.analysistype].profile:
                        # Populate the metadata with the profile data
                        sample[self.analysistype].profiledata = profiledata[
                            sample[self.analysistype].profile]
                        dotter()
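The profile files parsed above are expected to look like this hypothetical tab-separated excerpt: an 'ST' (or 'rST' for rMLST) column followed by one column per gene, one row per sequence type:

 rST	BACT000001	BACT000002	BACT000003
 1	5	12	3
 2	5	14	3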
Example #23
 def create_database_folder(self, database):
     """
     Create an appropriately named folder in which the database is to be stored
     :param database: the name of the database folder to create
     :return: the absolute path of the folder
     """
     printtime('Setting up {} database'.format(database), self.start)
     # Define the path to store the database files
     databasepath = os.path.join(self.databasepath, database)
     # Create the path as required
     make_path(databasepath)
     return databasepath
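A hypothetical call from a download method:

 databasepath = self.create_database_folder('plasmidfinder')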
Example #24
 def settargets(self):
     """Set the targets to be used in the analyses. Involves the path of the database files, the database files to
      use, and the level of classification for the analysis"""
     # Define the set targets call. Include the path to the script, the database path and files, as well
     # as the taxonomic rank to use
     printtime('Setting up database', self.start)
     self.targetcall = 'cd {} && ./set_targets.sh {} {} --{}'.format(
         self.clarkpath, self.databasepath, self.database, self.rank)
     # Run the system call
     subprocess.call(self.targetcall,
                     shell=True,
                     stdout=self.devnull,
                     stderr=self.devnull)
Example #25
 def reporter(self):
     """
     Create a report of the results
     """
     printtime('Writing report', self.starttime)
     data = 'Strain,Profile\n'
     for sample in self.runmetadata.samples:
         # Only add to the string if there are results
         if sample[self.analysistype].toxinprofile:
             data += '{},{}\n'.format(sample.name, sample[self.analysistype].toxinprofile)
     # Create the report, and write to it
     with open(os.path.join(self.reportpath, '{}.csv'.format(self.analysistype)), 'w') as report:
         report.write(data)
Example #26
 def main(self):
     """
     Run the necessary methods in the correct order
     """
     printtime('Starting {} analysis pipeline'.format(self.analysistype), self.starttime)
     # Create the objects to be used in the analyses
     objects = Objectprep(self)
     objects.objectprep()
     self.runmetadata = objects.samples
     self.threads = int(self.cpus / len(self.runmetadata.samples)) if self.cpus / len(self.runmetadata.samples) > 1 \
         else 1
     # Run the genesippr analyses
     self.analysistype = 'genesippr'
     self.targetpath = os.path.join(self.reffilepath, self.analysistype, '')
     Sippr(self, 0.90)
     # Create the reports
     self.reports = Reports(self)
     Reports.reporter(self.reports)
     # Run the 16S analyses using the filtered database
     self.targetpath = self.reffilepath
     # Run the 16S analyses
     self.analysistype = 'sixteens_full'
     SixteensFull(self, self.commit, self.starttime, self.homepath, 'sixteens_full', 0.985)
     # ResFinding
     Resistance(self, self.commit, self.starttime, self.homepath, 'resfinder', 0.90, False, True)
     # Run the GDCS analysis
     self.analysistype = 'GDCS'
     self.pipeline = True
     self.targetpath = os.path.join(self.targetpath, self.analysistype)
     Sippr(self, 0.95)
     # Create the reports
     Reports.gdcsreporter(self.reports)
     # Perform serotyping for samples classified as Escherichia
     for sample in self.runmetadata.samples:
         # Create the mash attribute for every sample to avoid attribute errors below
         sample.mash = GenObject()
         if sample.general.bestassemblyfile != 'NA':
             try:
                 sample.mash.closestrefseqgenus = sample.general.closestrefseqgenus
                 for genus, species in self.taxonomy.items():
                     if genus == sample.mash.closestrefseqgenus:
                         sample.mash.closestrefseqspecies = species
             except KeyError:
                 sample.mash.closestrefseqgenus = 'NA'
                 sample.mash.closestrefseqspecies = 'NA'
         else:
             sample.mash.closestrefseqgenus = 'NA'
             sample.mash.closestrefseqspecies = 'NA'
     SeroSippr(self, self.commit, self.starttime, self.homepath, 'serosippr', 0.95, True)
     # Print the metadata
     printer = MetadataPrinter(self)
     printer.printmetadata()
Example #27
 def sistr(self):
     """Perform sistr analyses on Salmonella"""
     printtime('Performing sistr analyses', self.start)
     for sample in self.metadata:
         # Create the analysis-type specific attribute
         setattr(sample, self.analysistype, GenObject())
         if sample.general.bestassemblyfile != 'NA':
             try:
                 # Only process strains that have been determined to be Salmonella
                 if sample.general.referencegenus == 'Salmonella':
                     # Set and create the path of the directory to store the strain-specific reports
                     sample[self.analysistype].reportdir = os.path.join(
                         sample.general.outputdirectory, self.analysistype)
                     # Name of the .json output file
                     sample[self.analysistype].jsonoutput = os.path.join(
                         sample[self.analysistype].reportdir,
                         '{}.json'.format(sample.name))
                     # Set the sistr system call
                     sample.commands.sistr = \
                         'sistr -f json -o {} -t {} -T {} {}'\
                         .format(sample[self.analysistype].jsonoutput,
                                 self.cpus,
                                 os.path.join(sample[self.analysistype].reportdir, 'tmp'),
                                 sample.general.bestassemblyfile)
                     # Set the paths of the log files
                     sample[self.analysistype].logout = os.path.join(
                         sample[self.analysistype].reportdir, 'logout')
                     sample[self.analysistype].logerr = os.path.join(
                         sample[self.analysistype].reportdir, 'logerr')
                     # Only run the analyses if the output json file does not exist
                     if not os.path.isfile(
                             sample[self.analysistype].jsonoutput):
                         out, err = run_subprocess(sample.commands.sistr)
                         write_to_logfile(sample.commands.sistr,
                                          sample.commands.sistr,
                                          self.logfile,
                                          sample.general.logout,
                                          sample.general.logerr,
                                          sample[self.analysistype].logout,
                                          sample[self.analysistype].logerr)
                         write_to_logfile(out, err, self.logfile,
                                          sample.general.logout,
                                          sample.general.logerr,
                                          sample[self.analysistype].logout,
                                          sample[self.analysistype].logerr)
                     self.queue.task_done()
             except (ValueError, KeyError):
                 pass
     self.queue.join()
     self.report()
Example #28
 def objectprep(self):
     """Create objects to store data and metadata for each sample. Also, perform necessary file manipulations"""
     # Move the files to subfolders and create objects
     self.runmetadata = createobject.ObjectCreation(self)
     if self.runmetadata.extension == 'fastq':
         # To streamline the CLARK process, decompress and combine .gz and paired end files as required
         printtime(
             'Decompressing and combining .fastq files for CLARK analysis',
             self.start)
         fileprep.Fileprep(self)
     else:
         printtime('Using .fasta files for CLARK analysis', self.start)
         for sample in self.runmetadata.samples:
             sample.general.combined = sample.general.fastqfiles[0]
Example #29
 def predictthreads(self):
     printtime('Performing gene predictions', self.start)
     # Create the threads for the analyses
     for sample in self.metadata:
         if sample.general.bestassemblyfile != 'NA':
             threads = Thread(target=self.predict, args=())
             threads.daemon = True
             threads.start()
     for sample in self.metadata:
         # Create the .prodigal attribute
         sample.prodigal = GenObject()
         if sample.general.bestassemblyfile != 'NA':
             self.predictqueue.put(sample)
     self.predictqueue.join()
Example #30
 def movefastq(self):
     """Find .fastq files for each sample and move them to an appropriately named folder"""
     printtime('Moving FASTQ files', self.start)
     # Iterate through each sample
     for sample in self.metadata.runmetadata.samples:
         # Retrieve the output directory
         outputdir = os.path.join(self.path, sample.name)
         # Find any fastq files with the sample name, preferring the most specific pattern
         fastqfiles = sorted(glob(os.path.join(self.path, '{}_*.fastq*'.format(sample.name))))
         if not fastqfiles:
             fastqfiles = sorted(glob(os.path.join(self.path, '{}.fastq*'.format(sample.name))))
         if not fastqfiles:
             fastqfiles = sorted(glob(os.path.join(self.path, '{}*.fastq*'.format(sample.name))))
         # Only try and move the files if the files exist
         if fastqfiles:
             make_path(outputdir)
             # Symlink the fastq files to the directory
             try:
                 list(
                     map(
                         lambda x: os.symlink(
                             os.path.join('..', os.path.basename(x)),
                             os.path.join(outputdir, os.path.basename(x))),
                         fastqfiles))
             except OSError:
                 pass
             # Find any fastq files with the sample name
             fastqfiles = [
                 fastq for fastq in sorted(
                     glob(
                         os.path.join(outputdir, '{}*.fastq*'.format(
                             sample.name)))) if 'trimmed' not in fastq
                 and 'normalised' not in fastq and 'corrected' not in fastq
                 and 'paired' not in fastq and 'unpaired' not in fastq
             ]
         else:
             # The files may already be in the output directory; find any fastq files with the sample name
             fastqfiles = [
                 fastq for fastq in sorted(
                     glob(os.path.join(outputdir, '{}*.fastq*'.format(sample.name))))
                 if 'trimmed' not in fastq and 'normalised' not in fastq
                 and 'corrected' not in fastq and 'paired' not in fastq
                 and 'unpaired' not in fastq
             ]
         sample.general.fastqfiles = fastqfiles
Example #31
 def __init__(self, inputobject):
     self.metadata = inputobject.runmetadata.samples
     self.start = inputobject.starttime
     self.kmers = inputobject.kmers
     self.cpus = inputobject.cpus
     try:
         self.threads = int(self.cpus / len(
             self.metadata)) if self.cpus / len(self.metadata) > 1 else 1
     except TypeError:
         self.threads = self.cpus
     self.path = inputobject.path
     self.logfile = inputobject.logfile
     self.assemblequeue = Queue(maxsize=self.threads)
     printtime('Assembling sequences', self.start)
     self.spades()
Example #32
 def combinealleles(self, allelepath, alleles):
     printtime('Creating combined rMLST allele file', self.start)
     with open(os.path.join(allelepath, 'rMLST_combined.fasta'), 'w') as combinedfile:
         # Open each allele file
         for allele in sorted(alleles):
             for record in SeqIO.parse(open(allele, "r"), "fasta"):
                 # Extract the sequence record from each entry in the multifasta
                 # Replace any dashes in the record.id with underscores
                 record.id = record.id.replace('-', '_')
                 # Remove any dashes or 'N's from the sequence data - makeblastdb can't handle sequences
                 # with gaps
                 # noinspection PyProtectedMember
                 record.seq._data = record.seq._data.replace('-', '').replace('N', '')
                 # Clear the name and description attributes of the record
                 record.name = ''
                 record.description = ''
                 # Write each record to the combined file
                 SeqIO.write(record, combinedfile, 'fasta')
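Mutating record.seq._data relies on a private Biopython attribute and breaks on newer releases; an equivalent that uses only the public API:

 from Bio.Seq import Seq

 record.seq = Seq(str(record.seq).replace('-', '').replace('N', ''))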
Example #33
 def find_loci(self):
     """
     Finds the URLs for all allele files
     """
     printtime('Downloading alleles', self.start)
     session = OAuth1Session(self.consumer_key,
                             self.consumer_secret,
                             access_token=self.session_token,
                             access_token_secret=self.session_secret)
     # Use the URL for all loci determined above
     r = session.get(self.loci)
     if r.status_code == 200 or r.status_code == 201:
         if re.search('json', r.headers['content-type'], flags=0):
             decoded = r.json()
         else:
             decoded = r.text
         # Extract all the URLs in the decoded dictionary under the key 'loci'
         for locus in decoded['loci']:
             # Add each URL to the list
             self.loci_url.append(locus)