def alleleretriever(self):
    """
    Retrieve the required alleles from a file of all alleles, and create organism-specific allele files.

    For every organism in self.alleledict, the folder <self.path>/outputalleles/<organism>/ is deleted and
    recreated, then populated with one '<gene>.tfa' file per gene and a combined 'gdcs_alleles.fasta' file
    holding every allele written for that organism. A metadata object recording the organism, the paths and the
    list of per-gene files is appended to self.samples.

    Alleles listed in self.alleledict that are absent from the indexed allele file are silently skipped.
    """
    logging.info('Retrieving alleles')
    # Index all the records in the allele file
    logging.info('Loading rMLST records')
    recorddict = SeqIO.index(self.allelefile, 'fasta')
    try:
        logging.info('Creating allele output files')
        # Create the organism-specific files of alleles
        for organism in sorted(self.alleledict):
            # Make an object to store information for each strain
            metadata = MetadataObject()
            metadata.organism = organism
            metadata.path = self.path
            metadata.outpath = os.path.join(self.path, 'outputalleles', organism, '')
            # Delete and recreate the output path - as the files are appended to each time, they will be too
            # large if this script is run more than once
            try:
                shutil.rmtree(metadata.outpath)
            except OSError:
                # Nothing to remove (e.g. first run) - the folder is created below
                pass
            make_path(metadata.outpath)
            metadata.combined = os.path.join(metadata.outpath, 'gdcs_alleles.fasta')
            metadata.allelefiles = list()
            with open(metadata.combined, 'w') as combined:
                for gene, alleles in sorted(self.alleledict[organism].items()):
                    # Open the file to append
                    allelefiles = os.path.join(metadata.outpath, '{}.tfa'.format(gene))
                    metadata.allelefiles.append(allelefiles)
                    with open(allelefiles, 'a') as allelefile:
                        # Write each allele record to the file
                        for allele in sorted(alleles):
                            # Skip adding alleles that are no longer in the database. Only the lookup is
                            # guarded, so a failure while writing is not mistaken for a missing allele
                            try:
                                record = recorddict['{}_{}'.format(gene, allele)]
                            except KeyError:
                                continue
                            SeqIO.write(record, allelefile, 'fasta')
                            SeqIO.write(record, combined, 'fasta')
            # Add the populated metadata to the list
            self.samples.append(metadata)
    finally:
        # The index keeps the allele file open for random access; release the handle once finished
        recorddict.close()
def rmlst(self):
    """
    Get the most up-to-date profiles and alleles from pubmlst. Note that you will need the necessary access token
    and secret for this to work.

    The download is skipped when the marker file <self.databasepath>/rMLST/complete already exists. After a
    download, the marker file is written with a newline-separated listing of the contents of the rMLST folder.
    """
    printtime('Downloading rMLST database', self.start)
    rmlst_folder = os.path.join(self.databasepath, 'rMLST')
    # The presence of this file signals that a previous download and set-up finished
    marker = os.path.join(rmlst_folder, 'complete')
    if os.path.isfile(marker):
        return
    # Bundle the path and start time attributes expected by the rMLST download script
    download_args = MetadataObject()
    download_args.path = self.databasepath
    download_args.start = self.start
    # Run the rMLST download
    get_rmlst.Get(download_args)
    # Record the folder contents in the marker file; the listing is taken after the marker has been opened
    with open(marker, 'w') as marker_handle:
        folder_contents = glob(os.path.join(rmlst_folder, '*'))
        marker_handle.write('\n'.join(folder_contents))
def __init__(self, inputobject, extension='fasta', light=False):
    """
    Build an argument object mimicking the command line arguments of the CLARK script, then run CLARK.

    :param inputobject: object supplying path, reffilepath, cpus, runmetadata, commit, starttime and homepath
    :param extension: value stored on inputobject.runmetadata.extension (default 'fasta')
    :param light: value stored as the 'light' attribute of the argument object
    """
    settings = MetadataObject()
    # The same folder is used as both the working path and the sequence path
    settings.path = inputobject.path
    settings.sequencepath = inputobject.path
    # The CLARK database lives in a 'clark' sub-folder of the reference file path; make sure it exists
    settings.databasepath = os.path.join(inputobject.reffilepath, 'clark')
    make_path(settings.databasepath)
    # Path built relative to the folder holding the CLARK executable
    settings.clarkpath = os.path.dirname(which('CLARK')) + '/../opt/clark/'
    # Remaining options, assigned in a fixed order
    remaining_options = (
        ('cutoff', 0.005),
        ('database', 'bacteria'),
        ('rank', 'species'),
        ('filter', False),
        ('threads', inputobject.cpus),
        ('runmetadata', inputobject.runmetadata),
        ('clean_seqs', False),
        ('reffilepath', inputobject.reffilepath),
    )
    for option, value in remaining_options:
        setattr(settings, option, value)
    # Note: this sets the extension on the run metadata object shared with inputobject
    settings.runmetadata.extension = extension
    settings.light = light
    # Run CLARK
    CLARK(settings, inputobject.commit, inputobject.starttime, inputobject.homepath)
def mlst(self, genera=None):
    """
    Download the necessary up-to-date MLST profiles and alleles.

    For each genus, the download is skipped when the marker file <self.databasepath>/MLST/<genus>/complete
    already exists. After a download, the marker file is written with a newline-separated listing of the
    contents of that genus folder.

    :param genera: iterable of genus names to process. When None (the default), Escherichia, Vibrio,
    Campylobacter, Listeria, Bacillus, Staphylococcus and Salmonella are processed. An empty iterable results
    in no downloads.
    """
    if genera is None:
        # A fixed tuple replaces the former mutable set default, so the default processing order is the same
        # on every run
        genera = ('Escherichia', 'Vibrio', 'Campylobacter', 'Listeria', 'Bacillus', 'Staphylococcus',
                  'Salmonella')
    printtime('Downloading MLST databases', self.start)
    for genus in genera:
        # Create an object to pass to the get_mlst script
        args = MetadataObject()
        # Populate the object with the necessary attributes
        args.species = genus
        args.repository_url = 'http://pubmlst.org/data/dbases.xml'
        args.force_scheme_name = False
        args.path = os.path.join(self.databasepath, 'MLST', genus)
        # Create the name of the file to be used to determine if the database download and setup was successful
        completefile = os.path.join(args.path, 'complete')
        # Only download the files if the download was not previously successful
        if os.path.isfile(completefile):
            continue
        # Run the download
        get_mlst.main(args)
        # Create and populate the marker file; the listing is taken after the marker has been opened
        with open(completefile, 'w') as complete:
            complete.write('\n'.join(glob(os.path.join(args.path, '*'))))