Example #1
 def epcr(self):
     # Collecting output from several processes and writing it to a logfile is verbose;
     # a shared helper in accessoryFunctions (sketched after this example) would tidy it up.
     # Set up a thread lock so multiple threads don't all try to write their output at once.
     # NOTE: the lock only serialises writes if it is shared between the worker threads
     # (e.g. created once in __init__), so create it before the loop, not inside it.
     threadlock = threading.Lock()
     while True:
         # Initialise strings to accumulate stdout and stderr from the system calls
         outstr = ''
         errstr = ''
         sample, linkfile = self.epcrqueue.get()
         if not os.path.isfile('{}.famap'.format(linkfile)):
             # Run the subprocess, then get the stdout in outstr and stderr in errstr
             out, err = run_subprocess(sample.commands.famap)
             outstr += out
             errstr += err
         if not os.path.isfile('{}.hash'.format(linkfile)):
             out, err = run_subprocess(sample.commands.fahash)
             outstr += out
             errstr += err
         if not os.path.isfile('{}.txt'.format(linkfile)):
             out, err = run_subprocess(sample.commands.epcr)
             outstr += out
             errstr += err
         # Once processes are finished running, get the threadlock, because now it's output writing time.
         threadlock.acquire()
         # Write stuff to the logfile.
         write_to_logfile(sample.commands.famap, sample.commands.famap, self.logfile)
         write_to_logfile(sample.commands.fahash, sample.commands.fahash, self.logfile)
         write_to_logfile(sample.commands.epcr, sample.commands.epcr, self.logfile)
         write_to_logfile(outstr, errstr, self.logfile)
         # Release the threadlock so that other threads can write their output
         threadlock.release()
         self.epcrqueue.task_done()
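All of the examples on this page lean on two helpers from the accessoryFunctions module of OLCTools. Their implementation is not shown here; the sketch below is a minimal stand-in inferred from the call sites above, so treat the bodies (and the treatment of logfile as a path prefix) as assumptions rather than the project's actual code.

 import subprocess


 def run_subprocess(command):
     """Run a shell command, returning its decoded stdout and stderr (sketch)."""
     process = subprocess.Popen(command,
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
     out, err = process.communicate()
     return out.decode('utf-8'), err.decode('utf-8')


 def write_to_logfile(out, err, logfile, samplelog=None, sampleerr=None,
                      analysislog=None, analysiserr=None):
     """Append the out and err strings to the master logs, plus any per-sample
     or per-analysis logs that were supplied (sketch)."""
     # Master logs: logfile is treated here as a path prefix (assumption)
     with open(logfile + '_out.txt', 'a') as master_out:
         master_out.write(out + '\n')
     with open(logfile + '_err.txt', 'a') as master_err:
         master_err.write(err + '\n')
     # Optional per-sample and per-analysis log destinations
     for path, data in ((samplelog, out), (sampleerr, err),
                        (analysislog, out), (analysiserr, err)):
         if path:
             with open(path, 'a') as handle:
                 handle.write(data + '\n')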
Example #2
 def run_jellyfish(self):
     """
     Runs jellyfish to split subsampled reads into kmers, then runs the kmers through a Bloom filter to
     remove singletons that are likely just sequencing errors. Should be run after subsampling reads.
     """
     for sample in self.metadata:
         # Set the name of the jellyfish count file
         sample[self.analysistype].jellyfish_file = os.path.join(
             sample[self.analysistype].outputdir,
             sample.name + '_jellyfish')
         # Set the jellyfish count call: count canonical 31-mers (-m 31 -C) with a 100M hash
         # (-s) and a 100M Bloom filter (--bf-size) so single-occurrence kmers are filtered out
         sample[self.analysistype].jellyfishcountcmd \
             = 'jellyfish count -m 31 -s 100M --bf-size 100M -C -F 2 {} -o {} -t {}'\
             .format(sample[self.analysistype].subsampledreads,
                     sample[self.analysistype].jellyfish_file,
                     str(self.threads))
         # Run the call, and write any errors to the logfile
         command = sample[self.analysistype].jellyfishcountcmd
         if self.analyse:
             out, err = run_subprocess(command)
         else:
             out = str()
             err = str()
         write_to_logfile(command, command, self.logfile,
                          sample.general.logout, sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
         write_to_logfile(out, err, self.logfile, sample.general.logout,
                          sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
Example #3
 def bbduker(self):
     """Run bbduk system calls"""
     while True:  # while daemon
         # Unpack the variables from the queue
         (sample, systemcall, reversename) = self.trimqueue.get()
         # Check to see if the forward file already exists
         if systemcall:
             threadlock = threading.Lock()
             if not os.path.isfile(reversename) and not os.path.isfile(
                     '{}.bz2'.format(reversename)):
                 # Run the call
                 out, err = run_subprocess(systemcall)
                 threadlock.acquire()
                 write_to_logfile(systemcall, systemcall, self.logfile,
                                  sample.general.logout,
                                  sample.general.logerr, None, None)
                 write_to_logfile(out, err, self.logfile,
                                  sample.general.logout,
                                  sample.general.logerr, None, None)
                 threadlock.release()
             # Define the output directory
             outputdir = sample.general.outputdirectory
             # Add the trimmed fastq files to a list
             trimmedfastqfiles = sorted(
                 glob(os.path.join(outputdir, '*trimmed.fastq.gz')))
             # Populate the metadata if the files exist
             sample.general.trimmedfastqfiles = trimmedfastqfiles if trimmedfastqfiles else 'NA'
         # Signal to trimqueue that job is done
         self.trimqueue.task_done()
Example #4
 def predict(self):
     # threadlock is used below but never created in this example; create it before the
     # loop so the acquire()/release() calls work (ideally shared between threads via __init__)
     threadlock = threading.Lock()
     while True:
         sample = self.predictqueue.get()
         # Populate attributes
         sample.prodigal.reportdir = os.path.join(
             sample.general.outputdirectory, 'prodigal')
         sample.prodigal.results_file = os.path.join(
             sample.prodigal.reportdir,
             '{}_prodigalresults.sco'.format(sample.name))
         sample.prodigal.results = sample.prodigal.results_file
         sample.commands.prodigal = 'prodigal -i {in1} -o {out1} -f sco -d {genes}'\
             .format(in1=sample.general.bestassemblyfile,
                     out1=sample.prodigal.results_file,
                     genes=os.path.join(sample.prodigal.reportdir, '{}_genes.fa'.format(sample.name)))
         # Create the folder to store the reports
         make_path(sample.prodigal.reportdir)
         # Determine if the report already exists, and that it is not empty
         size = 0
         if os.path.isfile(sample.prodigal.results_file):
             size = os.stat(sample.prodigal.results_file).st_size
         if not os.path.isfile(sample.prodigal.results_file) or size == 0:
             # Run the command
             out, err = run_subprocess(sample.commands.prodigal)
             threadlock.acquire()
             write_to_logfile(sample.commands.prodigal,
                              sample.commands.prodigal, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
             threadlock.release()
         self.predictqueue.task_done()
Example #5
 def assemble(self):
     """Run the assembly command in a multi-threaded fashion"""
     threadlock = threading.Lock()
     while True:
         (sample, command) = self.assemblequeue.get()
         if command and not os.path.isfile(
                 os.path.join(sample.general.spadesoutput,
                              'contigs.fasta')):
             out, err = run_subprocess(command)
             threadlock.acquire()
             write_to_logfile(command, command, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
             threadlock.release()
         dotter()
         # Signal to the queue that the job is done
         self.assemblequeue.task_done()
Example #6
 def subsample_reads(self):
     """
     Subsampling of reads to 20X coverage of rMLST genes (roughly).
     To be called after rMLST extraction and read trimming, in that order.
     """
     for sample in self.metadata:
         # Create the name of the subsampled read file
         sample[self.analysistype].subsampledreads = os.path.join(
             sample[self.analysistype].outputdir,
             '{}_targetMatches_subsampled.fastq'.format(self.analysistype))
         # Set the reformat.sh command - as this command will be run multiple times, overwrite previous iterations
         # each time. Use samplebasestarget to provide an approximation of the number of bases to include in the
         # subsampled reads e.g. for rMLST: 700000 (approx. 35000 bp total length of genes x 20X coverage)
         sample[self.analysistype].subsamplecmd = 'reformat.sh in={} out={} overwrite samplebasestarget={}' \
             .format(sample[self.analysistype].baitedfastq,
                     sample[self.analysistype].subsampledreads,
                     self.samplebasestarget)
         # Run the call, and write any errors to the logfile
         command = sample[self.analysistype].subsamplecmd
         if self.analyse:
             out, err = run_subprocess(command)
         else:
             out = str()
             err = str()
         write_to_logfile(command, command, self.logfile,
                          sample.general.logout, sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
         write_to_logfile(out, err, self.logfile, sample.general.logout,
                          sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
Example #7
 def run_bbmap(self):
     """
     Runs bbmap on the kmer fasta file, against the kmer fasta file itself, to generate a SAM file
     which can then be parsed to find low-frequency kmers that have one mismatch to high-frequency
     kmers, indicating that they come from contaminating alleles (a parsing sketch follows this example).
     """
     for sample in self.metadata:
         # Create the name for the output bam file
         sample[self.analysistype].bamfile = sample[
             self.analysistype].mer_fasta.replace('.fasta', '.bam')
         # Set the bbmap call - use the overwrite option to overwrite previous files that were created on previous
         # iterations, ambig=all to use all highest scoring mappings, nodisk to build index in memory, and only write
         # output to disk, local to allow soft-clipping
         sample[self.analysistype].bbmapcmd = \
             'bbmap.sh ref={} in={} outm={} overwrite ambig=all nodisk local threads={}'\
             .format(sample[self.analysistype].solid_mers,
                     sample[self.analysistype].solid_mers,
                     sample[self.analysistype].bamfile,
                     str(self.threads))
         # Run the call, and write any errors to the logfile
         command = sample[self.analysistype].bbmapcmd
         if self.analyse:
             out, err = run_subprocess(command)
         else:
             out = str()
             err = str()
         write_to_logfile(command, command, self.logfile,
                          sample.general.logout, sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
         write_to_logfile(out, err, self.logfile, sample.general.logout,
                          sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
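The SAM/BAM parsing the docstring describes is not part of this example. Below is a minimal sketch of how that pass might look, assuming pysam is installed, that the records carry an NM (edit distance) tag, and that kmer frequencies are encoded in record names as '<count>_<index>' (the convention write_mer_file uses later on this page); the function name and the 10x frequency ratio are illustrative, not the project's actual parser.

 import pysam


 def flag_contaminating_kmers(bamfile, min_ratio=10):
     """Sketch: report low-frequency kmers that map with exactly one mismatch
     to a much higher-frequency kmer, suggesting a contaminating allele."""
     contaminants = list()
     bam = pysam.AlignmentFile(bamfile, 'rb')
     for record in bam:
         # Skip unmapped records and kmers aligned to themselves
         if record.is_unmapped or record.query_name == record.reference_name:
             continue
         # Record names look like '<count>_<index>', so the observed kmer
         # frequency is the integer before the underscore
         query_freq = int(record.query_name.split('_')[0])
         ref_freq = int(record.reference_name.split('_')[0])
         # The NM tag holds the edit distance; a single mismatch combined with
         # a large frequency imbalance is the contamination signal
         if record.has_tag('NM') and record.get_tag('NM') == 1 \
                 and ref_freq >= min_ratio * query_freq:
             contaminants.append(record.query_name)
     bam.close()
     return contaminants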
Example #8
 def makeblastdb(self):
     """
     Makes blast database files from targets as necessary
     """
     # Iterate through the samples to set the bait file.
     for sample in self.runmetadata.samples:
         if sample.general.bestassemblyfile != 'NA':
             # Remove the file extension
             db = os.path.splitext(sample[self.analysistype].baitfile)[0]
             # Add '.nhr' for searching below
             nhr = '{}.nhr'.format(db)
             # Check for already existing database files
             if not os.path.isfile(str(nhr)):
                 # Create the databases
                 command = 'makeblastdb -in {} -parse_seqids -max_file_sz 2GB -dbtype nucl -out {}'\
                     .format(sample[self.analysistype].baitfile, db)
                 out, err = run_subprocess(command)
                 write_to_logfile(command, command, self.logfile,
                                  sample.general.logout,
                                  sample.general.logerr,
                                  sample[self.analysistype].logout,
                                  sample[self.analysistype].logerr)
                 write_to_logfile(out, err, self.logfile,
                                  sample.general.logout,
                                  sample.general.logerr,
                                  sample[self.analysistype].logout,
                                  sample[self.analysistype].logerr)
Example #9
 def fastqc(self):
     """Run fastqc system calls"""
     # Create the thread lock once, before entering the daemon loop
     threadlock = threading.Lock()
     while True:  # while daemon
         # Unpack the variables from the queue
         (sample, systemcall, outputdir, fastqcreads) = self.qcqueue.get()
         # Check to see if the output HTML file already exists
         try:
             _ = glob(os.path.join(outputdir, '*.html'))[0]
         except IndexError:
             # Make the output directory
             make_path(outputdir)
             # Run the system calls
             outstr = str()
             errstr = str()
             out, err = run_subprocess(systemcall)
             outstr += out
             errstr += err
             out, err = run_subprocess(fastqcreads)
             outstr += out
             errstr += err
             # Acquire thread lock, and write the logs to file
             threadlock.acquire()
             write_to_logfile(systemcall, systemcall, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             write_to_logfile(fastqcreads, fastqcreads, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             write_to_logfile(outstr, errstr, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             threadlock.release()
             # Rename the outputs
             try:
                 shutil.move(
                     os.path.join(outputdir, 'stdin_fastqc.html'),
                     os.path.join(outputdir,
                                  '{}_fastqc.html'.format(sample.name)))
                 shutil.move(
                     os.path.join(outputdir, 'stdin_fastqc.zip'),
                     os.path.join(outputdir,
                                  '{}_fastqc.zip'.format(sample.name)))
             except IOError:
                 pass
         # Signal to qcqueue that job is done
         self.qcqueue.task_done()
Example #10
 def write_mer_file(self):
     """
     Writes the mer file created by jellyfish in fasta format to be used downstream.
     Only writes kmers that have been seen at least twice, to attempt to get rid of sequencing errors.
     """
     for sample in self.metadata:
         # Set the name of the kmer file dumped from jellyfish
         sample[self.analysistype].mer_fasta = sample[
             self.analysistype].jellyfish_file + '.fasta'
         sample[self.analysistype].solid_mers = sample[
             self.analysistype].jellyfish_file + '_solid.fasta'
         # Set the system call
         sample[self.analysistype].jellyfishdumpcmd =\
             'jellyfish dump {} > {}'\
             .format(sample[self.analysistype].jellyfish_file,
                     sample[self.analysistype].mer_fasta)
         # Run the system call
         command = sample[self.analysistype].jellyfishdumpcmd
         if self.analyse:
             out, err = run_subprocess(command)
         else:
             out = str()
             err = str()
         write_to_logfile(command, command, self.logfile,
                          sample.general.logout, sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
         write_to_logfile(out, err, self.logfile, sample.general.logout,
                          sample.general.logerr,
                          sample[self.analysistype].logout,
                          sample[self.analysistype].logerr)
         # Read in the dumped file to a list
         with open(sample[self.analysistype].mer_fasta, 'r') as mers:
             fastas = mers.readlines()
         # Initialise variables for use in parsing outputs
         num_mers = 0
         sequences = list()
         # Iterate through the list of the fasta outputs. Output is a multifasta e.g.:
         # >8
         # GCCTGGAAAACTGGCCACCGGCAAGCATCGC
         # where the header, >8, indicates that the sequence is present 8 times in the sample
         for i in range(len(fastas)):
             # Find the headers
             if '>' in fastas[i]:
                 # If the number of times the sequence is present in the sample is greater than one, increment
                 # the total number of kmers observed
                 if int(fastas[i].replace('>', '')) > 1:
                     num_mers += 1
                     # Append a string of the header plus the total number of mers, and the sequence information
                     # to the list of sequences e.g. ['>8_1\nGCCTGGAAAACTGGCCACCGGCAAGCATCGC\n']
                     sequences.append(fastas[i].rstrip() + '_' +
                                      str(num_mers) + '\n' + fastas[i + 1])
         # Write out our solid kmers to file to be used later.
         with open(sample[self.analysistype].solid_mers, 'w') as solidmers:
             solidmers.write(''.join(sequences))
         # Update the number of unique kmers
         if num_mers > sample[self.analysistype].unique_kmers:
             sample[self.analysistype].unique_kmers = num_mers
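For concreteness, here is the same filtering logic run standalone on a two-record hypothetical dump (the sequences are made up):

 # Hypothetical jellyfish dump: one kmer seen 8 times, plus a singleton
 fastas = ['>8\n', 'GCCTGGAAAACTGGCCACCGGCAAGCATCGC\n',
           '>1\n', 'TTTTGGAAAACTGGCCACCGGCAAGCATCGC\n']
 num_mers = 0
 sequences = list()
 for i in range(len(fastas)):
     if '>' in fastas[i]:
         # Keep only kmers observed more than once
         if int(fastas[i].replace('>', '')) > 1:
             num_mers += 1
             sequences.append(fastas[i].rstrip() + '_' +
                              str(num_mers) + '\n' + fastas[i + 1])
 print(sequences)  # ['>8_1\nGCCTGGAAAACTGGCCACCGGCAAGCATCGC\n']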
Example #11
File: CHAS.py Project: carden24/OLCTools
 def epcr(self):
     # Create the thread lock once, before entering the daemon loop
     threadlock = threading.Lock()
     while True:
         sample, linkfile = self.epcrqueue.get()
         # Set the names of the output files
         sample[self.analysistype].famap = '{}.famap'.format(linkfile)
         sample[self.analysistype].hash = '{}.hash'.format(linkfile)
         sample[self.analysistype].output = '{}.txt'.format(linkfile)
         # Initialise a list to store the results
         sample[self.analysistype].epcrresults = list()
         # If the output files do not already exist, run the necessary system calls
         # Initialise strings to accumulate stdout and stderr from the system calls
         outstr = ''
         errstr = ''
         if not os.path.isfile(sample[self.analysistype].famap):
             # Run the subprocess, then get the stdout in outstr and stderr in errstr
             out, err = run_subprocess(sample.commands.famap)
             outstr += out
             errstr += err
         if not os.path.isfile(sample[self.analysistype].hash):
             out, err = run_subprocess(sample.commands.fahash)
             outstr += out
             errstr += err
         if not os.path.isfile(sample[self.analysistype].output):
             out, err = run_subprocess(sample.commands.epcr)
             outstr += out
             errstr += err
         # Once processes are finished running, get the threadlock, because now it's output writing time.
         threadlock.acquire()
         # Write stuff to the logfile.
         write_to_logfile(sample.commands.famap, sample.commands.famap, self.logfile)
         write_to_logfile(sample.commands.fahash, sample.commands.fahash, self.logfile)
         write_to_logfile(sample.commands.epcr, sample.commands.epcr, self.logfile)
         write_to_logfile(outstr, errstr, self.logfile)
         threadlock.release()
         # Read the results into a list
         with open(sample[self.analysistype].output, 'r') as results:
             for line in results:
                 sample[self.analysistype].epcrresults.append(line.strip())
         self.epcrqueue.task_done()
Example #12
 def extract_rmlst_reads(self):
     """
     rMLST read extraction. Should be the first thing called after parsing the fastq directory.
     """
     for sample in self.metadata:
         # Create the object to store the variables
         setattr(sample, self.analysistype, GenObject())
         # Initialise variables
         sample[self.analysistype].snv_count = list()
         # Initialise a starting value for the number of unique kmers found in each sample
         sample[self.analysistype].unique_kmers = -1
         # Set and create the output directory
         try:
             sample[self.analysistype].outputdir = os.path.join(
                 sample.run.outputdirectory, self.analysistype)
         except KeyError:
             sample[self.analysistype].outputdir = os.path.join(
                 sample.general.outputdirectory, self.analysistype)
         make_path(sample[self.analysistype].outputdir)
         sample[self.analysistype].logout = os.path.join(
             sample[self.analysistype].outputdir, 'logout.txt')
         sample[self.analysistype].logerr = os.path.join(
             sample[self.analysistype].outputdir, 'logerr.txt')
         sample[self.analysistype].baitedfastq = os.path.join(
             sample[self.analysistype].outputdir,
             '{}_targetMatches.fastq.gz'.format(self.analysistype))
         # Create the command to run the baiting - paired inputs and a single, zipped output
         sample[self.analysistype].bbdukcmd = 'bbduk.sh ref={} in1={} in2={} threads={} outm={}'\
             .format(self.database,
                     sample.general.trimmedcorrectedfastqfiles[0],
                     sample.general.trimmedcorrectedfastqfiles[1],
                     str(self.threads),
                     sample[self.analysistype].baitedfastq)
         # Sometimes bbduk hangs forever, so that needs to be handled; note that the except clause
         # below only fires if the helper itself enforces a (very generous) timeout, as in the
         # sketch after this example
         try:
             # Run the call, and write any errors to the logfile
             command = sample[self.analysistype].bbdukcmd
             if self.analyse:
                 out, err = run_subprocess(command)
             else:
                 out = str()
                 err = str()
             write_to_logfile(command, command, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
         except TimeoutExpired:
             print('ERROR: Could not extract rMLST reads from sample {}'.
                   format(sample.name))
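As written, the except clause above only fires if run_subprocess itself enforces a timeout. A minimal timeout-aware variant is sketched here (an assumption about how the helper could look, not the OLCTools implementation; the 24-hour default is illustrative):

 import subprocess
 from subprocess import TimeoutExpired


 def run_subprocess_with_timeout(command, timeout=86400):
     """Run a command, raising TimeoutExpired if it runs longer than `timeout` seconds."""
     process = subprocess.Popen(command,
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
     try:
         out, err = process.communicate(timeout=timeout)
     except TimeoutExpired:
         # Kill the hung process (e.g. a stalled bbduk) before propagating the error
         process.kill()
         process.communicate()
         raise
     return out.decode('utf-8'), err.decode('utf-8')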
Example #13
File: mash.py Project: carden24/OLCTools
 def mash(self):
     # Create the thread lock once, before entering the daemon loop
     threadlock = threading.Lock()
     while True:
         sample = self.mashqueue.get()
         if not os.path.isfile(sample[self.analysistype].mashresults):
             out, err = run_subprocess(sample.commands.mash)
             threadlock.acquire()
             write_to_logfile(sample.commands.mash, sample.commands.mash,
                              self.logfile)
             write_to_logfile(out, err, self.logfile)
             threadlock.release()
         self.mashqueue.task_done()
Example #14
 def sistr(self):
     """Perform sistr analyses on Salmonella"""
     printtime('Performing sistr analyses', self.start)
     for sample in self.metadata:
         # Create the analysis-type specific attribute
         setattr(sample, self.analysistype, GenObject())
         if sample.general.bestassemblyfile != 'NA':
             try:
                 # Only process strains that have been determined to be Salmonella
                 if sample.general.referencegenus == 'Salmonella':
                     # Set and create the path of the directory to store the strain-specific reports
                     sample[self.analysistype].reportdir = os.path.join(
                         sample.general.outputdirectory, self.analysistype)
                     # Name of the .json output file
                     sample[self.analysistype].jsonoutput = os.path.join(
                         sample[self.analysistype].reportdir,
                         '{}.json'.format(sample.name))
                     # Set the sistr system call
                     sample.commands.sistr = \
                         'sistr -f json -o {} -t {} -T {} {}'\
                         .format(sample[self.analysistype].jsonoutput,
                                 self.cpus,
                                 os.path.join(sample[self.analysistype].reportdir, 'tmp'),
                                 sample.general.bestassemblyfile)
                     # Set the paths to the analysis-specific log files
                     sample[self.analysistype].logout = os.path.join(
                         sample[self.analysistype].reportdir, 'logout')
                     sample[self.analysistype].logerr = os.path.join(
                         sample[self.analysistype].reportdir, 'logerr')
                     # Only run the analyses if the output json file does not exist
                     if not os.path.isfile(
                             sample[self.analysistype].jsonoutput):
                         out, err = run_subprocess(sample.commands.sistr)
                         write_to_logfile(sample.commands.sistr,
                                          sample.commands.sistr,
                                          self.logfile,
                                          sample.general.logout,
                                          sample.general.logerr,
                                          sample[self.analysistype].logout,
                                          sample[self.analysistype].logerr)
                         write_to_logfile(out, err, self.logfile,
                                          sample.general.logout,
                                          sample.general.logerr,
                                          sample[self.analysistype].logout,
                                          sample[self.analysistype].logerr)
                     self.queue.task_done()
             except (ValueError, KeyError):
                 pass
     self.queue.join()
     self.report()
Example #15
 def assemble(self):
     while True:
         sample = self.assemblequeue.get()
         if not os.path.isfile(sample.general.assemblyfile):
             # Run the assembly
             out, err = run_subprocess(sample.commands.assemble)
             self.threadlock.acquire()
             write_to_logfile(sample.commands.assemble,
                              sample.commands.assemble, self.logfile,
                              sample.general.logout, sample.general.logerr,
                              None, None)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
             self.threadlock.release()
         self.assemblequeue.task_done()
Example #16
File: CHAS.py Project: carden24/OLCTools
 def makeblastdb(self, fastapath):
     """
     Makes blast database files from targets as necessary
     """
     # Remove the file extension for easier future globbing
     db = os.path.splitext(fastapath)[0]
     nhr = '{}.nhr'.format(db)  # add .nhr for searching
     if not os.path.isfile(str(nhr)):  # check for already existing databases
         # Create the databases
         threadlock = threading.Lock()
         command = 'makeblastdb -in {} -parse_seqids -max_file_sz 2GB -dbtype nucl -out {}'.format(fastapath, db)
         out, err = run_subprocess(command)
         threadlock.acquire()
         write_to_logfile(out, err, self.logfile)
         threadlock.release()
     dotter()
Example #17
 def run_qaml(self):
     """
     Create and run the GenomeQAML system call
     """
     printtime('Running GenomeQAML quality assessment', self.start)
     qaml_call = 'classify.py -t {tf} -r {rf}'\
         .format(tf=self.qaml_path,
                 rf=self.qaml_report)
     make_path(self.reportpath)
     # Only attempt to assess assemblies if the report doesn't already exist
     if not os.path.isfile(self.qaml_report):
         # Run the system calls
         out, err = run_subprocess(qaml_call)
         # Acquire thread lock, and write the logs to file
         self.threadlock.acquire()
         write_to_logfile(qaml_call, qaml_call, self.logfile)
         write_to_logfile(out, err, self.logfile)
         self.threadlock.release()
Example #18
 def fastathreads(self):
     while True:
         sample = self.fastaqueue.get()
         # Check to see if the FASTA file already exists
         if not os.path.isfile(sample[self.analysistype].fasta):
             # Run the system call
             out, err = run_subprocess(sample[self.analysistype].fastxcall)
             write_to_logfile(sample[self.analysistype].fastxcall,
                              sample[self.analysistype].fastxcall,
                              self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
         self.fastaqueue.task_done()
Example #19
 def subsamplethreads(self):
     while True:
         sample = self.samplequeue.get()
         # Check to see if the subsampled FASTQ file has already been created
         if not os.path.isfile(sample[self.analysistype].subsampledfastq):
             # Run the system call
             out, err = run_subprocess(sample[self.analysistype].seqtkcall)
             write_to_logfile(sample[self.analysistype].seqtkcall,
                              sample[self.analysistype].seqtkcall,
                              self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr,
                              sample[self.analysistype].logout,
                              sample[self.analysistype].logerr)
         self.samplequeue.task_done()
Example #20
 def make_db(self):
     """
     Makes the blast database if it isn't present. Doesn't do anything if we already have database files.
     """
     db_files = ['.nhr', '.nin', '.nsq']
     db_present = True
     for db_file in db_files:
         if not os.path.isfile(self.database + db_file):
             db_present = False
     if not db_present:
         printtime('Making database!', self.start)
         command = 'makeblastdb -dbtype nucl -in ' + self.database
         if self.analyse:
             out, err = run_subprocess(command)
         else:
             out = str()
             err = str()
         write_to_logfile(command, command, self.logfile, None, None, None,
                          None)
         write_to_logfile(out, err, self.logfile, None, None, None, None)
Example #21
 def database_download(self, targetcall, databasepath, complete=True):
     """
     Checks to see if the database has already been downloaded. If not, downloads the database, and writes stdout
     and stderr to the logfile
     :param targetcall: system call to download, and possibly set-up the database
     :param databasepath: absolute path of the database
     :param complete: boolean variable to determine whether the complete file should be created
     """
     # Create a file to store the logs; it will be used to determine if the database was downloaded and set up
     completefile = os.path.join(databasepath, 'complete')
     # Run the system call if the database is not already downloaded
     if not os.path.isfile(completefile):
         out, err = run_subprocess(targetcall)
         print(out, err)
         # Write the out and err streams to the master files
         write_to_logfile(out, err, self.logfile, None, None, None, None)
         if complete:
             # Create the database completeness assessment file and populate it with the out and
             # err streams; name the handle distinctly so the boolean argument isn't shadowed
             with open(completefile, 'w') as completelog:
                 completelog.write(out)
                 completelog.write(err)
Example #22
 def runquast(self):
     # Create the thread lock once, before entering the daemon loop
     threadlock = threading.Lock()
     while True:
         sample, quastoutputdirectory = self.quastqueue.get()
         make_path(quastoutputdirectory)
         # Don't re-perform the analysis if the report file exists
         if not os.path.isfile(
                 '{}/report.tsv'.format(quastoutputdirectory)):
             out, err = run_subprocess(sample.commands.quast)
             threadlock.acquire()
             write_to_logfile(sample.commands.quast, sample.commands.quast,
                              self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
             write_to_logfile(out, err, self.logfile, sample.general.logout,
                              sample.general.logerr, None, None)
             threadlock.release()
         # Following the analysis, parse the report (if it exists) into the metadata object
         if os.path.isfile('{}/report.tsv'.format(quastoutputdirectory)):
             self.metaparse(sample, quastoutputdirectory)
         self.quastqueue.task_done()
Example #23
 def createfastq(self):
     """Uses bcl2fastq to create .fastq files from a MiSeqRun"""
     # Initialise samplecount
     samplecount = 0
     # If the fastq destination folder is not provided, make the default value of :path/:miseqfoldername
     self.fastqdestination = self.fastqdestination if self.fastqdestination else self.path + self.miseqfoldername
     # Make the path
     make_path(self.fastqdestination)
     # Initialise variables for storing index information
     index = ''
     indexlength = int()
     # bcl2fastq requires an older version of the sample sheet; this recreates the required version
     # Create the new sample sheet
     with open('{}/SampleSheet_modified.csv'.format(self.fastqdestination),
               "w") as modifiedsamplesheet:
         # Write the required headings to the file
         modifiedsamplesheet.write(
             "FCID,Lane,SampleID,SampleRef,Index,Description,Control,Recipe,Operator,SampleProject\n"
         )
         for strain in self.samples:
             # Create a combined index of index1-index2
             try:
                 strain.run.modifiedindex = '{}-{}'.format(
                     strain.run.index, strain.run.index2)
                 indexlength = 16
                 index = 'I8,I8'
             except KeyError:
                 strain.run.modifiedindex = strain.run.index
                 indexlength = 6
                 index = 'I6'
             # The list of items to print to each line of the modified sample sheet
             printlist = [
                 self.flowcell, '1', strain.name,
                 str(strain.run.SampleNumber), strain.run.modifiedindex,
                 strain.run.Description, 'N', 'NA',
                 strain.run.InvestigatorName, self.projectname
             ]
             modifiedsamplesheet.write('{}\n'.format(",".join(printlist)))
             samplecount += 1
     # Set :forward/reverse length to :header.forward/reverse length if the argument is not provided
     # or is 'full'; otherwise use the supplied argument
     self.forwardlength = self.metadata.header.forwardlength if self.forwardlength.lower()\
         == 'full' else self.forwardlength
     # Set :reverselength to :header.reverselength
     self.reverselength = self.metadata.header.reverselength if self.reverselength.lower() \
         == 'full' else self.reverselength
     # The number of cycles required is the forward read length + the index length (8 bp per index)
     # + the reverse read length for paired-end runs
     # Also set the basemask variable as required
     if self.reverselength != '0':
         self.readsneeded = int(self.forwardlength) + int(
             self.reverselength) + indexlength
         basemask = "Y{}n*,{},Y{}n*".format(self.forwardlength, index,
                                            self.reverselength)
         nohup = "nohup make -j 16 > nohup.out"
     else:
         self.readsneeded = int(self.forwardlength) + indexlength
         basemask = "Y{}n*,{},n*".format(self.forwardlength, index)
         nohup = "nohup make -j 16 r1 > nohup.out"
     # Handle plurality appropriately
     samples = 'samples' if samplecount > 1 else 'sample'
     number = 'are' if samplecount > 1 else 'is'
     printtime(
         'There {} {} {} in this run. '
         'Running fastq creating module with the following parameters:\n'
         'MiSeqPath: {},\n'
         'MiSeqFolder: {},\n'
         'Fastq destination: {},\n'
         'SampleSheet: {}'.format(
             number, samplecount, samples, self.miseqpath, self.miseqfolder,
             self.fastqdestination,
             '{}/SampleSheet_modified.csv'.format(self.fastqdestination)),
         self.start)
     # Count the number of completed cycles in the run of interest
     cycles = glob('{}Data/Intensities/BaseCalls/L001/C*'.format(
         self.miseqfolder))
     while len(cycles) < self.readsneeded:
         printtime(
             'Currently at {} cycles. Waiting until the MiSeq reaches cycle {}'
             .format(len(cycles), self.readsneeded), self.start)
         sleep(1800)
         cycles = glob('{}Data/Intensities/BaseCalls/L001/C*'.format(
             self.miseqfolder))
     # configureBclToFastq.pl requires :self.miseqfolder/Data/Intensities/BaseCalls/config.xml in order to work
     # When you download runs from BaseSpace, this file is not provided. There is an empty config.xml file that
     # can be populated with run-specific values and moved to the appropriate folder
     if not os.path.isfile('{}Data/Intensities/BaseCalls/config.xml'.format(
             self.miseqfolder)):
         self.configfilepopulator()
     # Define the bcl2fastq system call
     bclcall = "configureBclToFastq.pl --input-dir {}Data/Intensities/BaseCalls " \
               "--output-dir {} --force --sample-sheet {}/SampleSheet_modified.csv " \
               "--mismatches 1 --no-eamss --fastq-cluster-count 0 --compression none --use-bases-mask {}"\
         .format(self.miseqfolder, self.fastqdestination, self.fastqdestination, basemask)
     # Define the nohup system call
     nohupcall = "cd {} && {}".format(self.fastqdestination, nohup)
     if not os.path.isdir("{}/Project_{}".format(self.fastqdestination,
                                                 self.projectname)):
         # Call configureBclToFastq.pl
         printtime('Running bcl2fastq', self.start)
         # Run the commands
         threadlock = threading.Lock()
         outstr = ''
         outerr = ''
         out, err = run_subprocess(bclcall)
         outstr += out
         outerr += err
         out, err = run_subprocess(nohupcall)
         outstr += out
         outerr += err
         threadlock.acquire()
         write_to_logfile(bclcall, bclcall, self.logfile)
         write_to_logfile(nohupcall, nohupcall, self.logfile)
         write_to_logfile(outstr, outerr, self.logfile)
         threadlock.release()
     # Populate the metadata
     for sample in self.metadata.samples:
         sample.commands = GenObject()
         sample.commands.nohup = nohupcall
         sample.commands.bcl = bclcall
         sample.run.forwardlength = self.forwardlength
         sample.run.reverselength = self.reverselength
     # Copy the fastq files to a central folder so they can be processed
     self.fastqmover()
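To make the cycle arithmetic and base mask above concrete, a worked example with hypothetical run parameters:

 # Hypothetical 2x251 bp paired-end run with dual 8 bp indices
 forwardlength, reverselength, indexlength, index = 251, 251, 16, 'I8,I8'
 readsneeded = forwardlength + reverselength + indexlength  # 251 + 251 + 16 = 518
 basemask = "Y{}n*,{},Y{}n*".format(forwardlength, index, reverselength)
 print(readsneeded, basemask)  # 518 Y251n*,I8,I8,Y251n*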