def __init__(self, allelepath, targetpath, reportpath, gene):
    logging.info('Welcome to the CFIA Allele Attributer')
    # Allow for ~ expansion in the supplied paths. Note that expanduser must be applied
    # before abspath, or the leading '~' is never resolved
    if allelepath.startswith('~'):
        self.allelepath = os.path.abspath(os.path.expanduser(os.path.join(allelepath)))
    else:
        self.allelepath = os.path.abspath(os.path.join(allelepath))
    self.allelefiles = glob(os.path.join(self.allelepath, '*.tfa'))
    self.alleleset = set()
    if targetpath.startswith('~'):
        self.targetpath = os.path.abspath(os.path.expanduser(os.path.join(targetpath)))
    else:
        self.targetpath = os.path.abspath(os.path.join(targetpath))
    self.targetfile = sorted(glob(os.path.join(self.targetpath, '*.fasta')))[0]
    self.targetrecords = dict()
    if reportpath.startswith('~'):
        self.reportpath = os.path.abspath(os.path.expanduser(os.path.join(reportpath, 'reports')))
    else:
        self.reportpath = os.path.abspath(os.path.join(reportpath, 'reports'))
    make_path(self.reportpath)
    self.gene = gene
    self.unaligned_alleles = os.path.join(
        self.reportpath, '{gene}_unaligned_alleles.fasta'.format(gene=self.gene))
    self.aligned_alleles = os.path.join(
        self.reportpath, '{gene}_aligned_alleles.fasta'.format(gene=self.gene))
    self.attributedalleles = list()
    self.complete = set()
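# The '~' expansion idiom above recurs in nearly every constructor in this module.
# A minimal helper sketch (hypothetical; not part of the original classes) that
# captures the same behaviour in one place:
def normalise_path(path, *extra):
    """Expand a leading '~' and return an absolute path, joining any extra components."""
    import os
    return os.path.abspath(os.path.join(os.path.expanduser(path), *extra))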
def listread(self):
    while True:
        sample = self.listqueue.get()
        # Set and create the path of the sorted fastq files
        sample.general.sortedfastqpath = os.path.join(sample.general.outputdirectory, 'sortedFastq')
        make_path(sample.general.sortedfastqpath)
        # Initialise dictionaries to hold data
        sample.general.fastqlist = dict()
        sample.general.filteredfastq = dict()
        # Iterate through the taxIDs
        for taxid in sample.general.taxids:
            # Set the name of the list that will store all the reads associated with the taxID
            sample.general.fastqlist[taxid] = os.path.join(
                sample.general.sortedfastqpath, '{sn}_{taxid}.txt'.format(sn=sample.name, taxid=taxid))
            # Set the name of the .fastq file that will store the filtered reads
            sample.general.filteredfastq[taxid] = os.path.join(
                sample.general.sortedfastqpath, '{sn}_{taxid}.fastq.gz'.format(sn=sample.name, taxid=taxid))
            # Open the list, and write the deduplicated set of reads, one per line
            with open(sample.general.fastqlist[taxid], 'w') as binned:
                binned.write('\n'.join(set(sample[taxid].readlist)))
        self.listqueue.task_done()
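# A minimal sketch of how a worker like listread is typically driven (hypothetical
# driver; the thread count and queue name are assumptions based on the attributes
# the method uses):
from queue import Queue
from threading import Thread

def run_listread(instance, samples, threads=4):
    instance.listqueue = Queue()
    # Daemon threads are killed automatically when the main thread exits
    for _ in range(threads):
        worker = Thread(target=instance.listread, daemon=True)
        worker.start()
    for sample in samples:
        instance.listqueue.put(sample)
    # Block until every queued sample has been processed
    instance.listqueue.join()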
def getrmlsthelper(self):
    """
    Makes a system call to rest_auth.py, a Python script modified from
    https://github.com/kjolley/BIGSdb/tree/develop/scripts/test
    and downloads the most up-to-date rMLST profile and alleles
    """
    printtime('Downloading {} alleles'.format(self.analysistype), self.start)
    # Extract the path of the current script from the full path + file name
    homepath = os.path.split(os.path.abspath(__file__))[0]
    # Set the path/name of the folder to contain the new alleles and profile
    newfolder = os.path.join(self.path, self.analysistype)
    # Create the path
    make_path(newfolder)
    # Create an empty Namespace to hold the arguments required by the rest_auth_class script
    # (assigning the bare ArgumentParser class would set attributes on the class itself)
    args = ArgumentParser().parse_args([])
    args.secret_file = os.path.join(homepath, 'secret.txt')
    args.file_path = homepath
    args.output_path = newfolder
    args.start = self.start
    rmlst = rest_auth_class.REST(args)
    # Download the profile and alleles
    rmlst.main()
    # Get the new alleles into a list, and create the combinedAlleles file
    alleles = glob(os.path.join(newfolder, '*.tfa'))
    self.combinealleles(newfolder, alleles)
def __init__(self, sequencepath, reportpath):
    # Allow for relative paths
    if sequencepath.startswith('~'):
        self.sequencepath = os.path.abspath(os.path.expanduser(os.path.join(sequencepath)))
    else:
        self.sequencepath = os.path.abspath(os.path.join(sequencepath))
    assert os.path.isdir(self.sequencepath), 'Cannot locate supplied sequence path: {seq_path}' \
        .format(seq_path=self.sequencepath)
    if reportpath.startswith('~'):
        self.reportpath = os.path.abspath(os.path.expanduser(os.path.join(reportpath)))
    else:
        self.reportpath = os.path.abspath(os.path.join(reportpath))
    make_path(self.reportpath)
    assert os.path.isdir(self.reportpath), 'Could not create the requested report directory: {rep_path}' \
        .format(rep_path=self.reportpath)
    # Initialise class variables
    self.max_allele = int()
    self.samples = list()
    self.allele_dict = dict()
    self.record_dict = dict()
    self.output_dict = dict()
    self.new_alleles = dict()
    self.primer_sequences = dict()
    self.json_report = os.path.join(self.reportpath, 'virus_typer_outputs.json')
    # Extract the path of this file - will be used to find the necessary accessory files
    self.homepath = os.path.split(os.path.abspath(__file__))[0]
    self.forward_primers = os.path.join(self.homepath, 'forward_typing_primers.fasta')
    self.reverse_primers = os.path.join(self.homepath, 'reverse_typing_primers.fasta')
    self.allele_database = os.path.join(self.homepath, 'virus_typer_alleles.fasta')
def query_prep(self):
    """
    Create metadata objects for each sample
    """
    logging.info('Preparing query files')
    # Find all the sequence files in the path
    fastas = sorted(glob(os.path.join(self.query_path, '*.fasta')))
    for fasta in fastas:
        name = os.path.splitext(os.path.basename(fasta))[0]
        if name != 'combinedtargets':
            # Create a metadata object for each sample
            metadata = MetadataObject()
            metadata.samples = list()
            # Populate the metadata object with the required attributes
            metadata.name = name
            metadata.general = GenObject()
            metadata.commands = GenObject()
            metadata.alleles = GenObject()
            metadata.alleles.outputdirectory = os.path.join(self.query_path, metadata.name)
            # Set the name of the BLAST output file
            metadata.alleles.blast_report = os.path.join(
                metadata.alleles.outputdirectory, '{seq_id}.tsv'.format(seq_id=metadata.name))
            # Remove any report left over from a previous analysis
            try:
                os.remove(metadata.alleles.blast_report)
            except FileNotFoundError:
                pass
            make_path(metadata.alleles.outputdirectory)
            metadata.general.bestassemblyfile = relative_symlink(
                src_file=fasta,
                output_dir=metadata.alleles.outputdirectory,
                export_output=True)
            metadata.samples.append(metadata)
            self.runmetadata.samples.append(metadata)
def __init__(self, profile, names):
    logging.info('Welcome to the profile reducer!')
    if profile.startswith('~'):
        self.profile = os.path.abspath(os.path.expanduser(os.path.join(profile)))
    else:
        self.profile = os.path.abspath(os.path.join(profile))
    assert os.path.isfile(self.profile), f'Cannot find the supplied profile: {self.profile}'
    self.report_path = os.path.join(os.path.dirname(self.profile), 'reports')
    make_path(self.report_path)
    self.reduced_profile = os.path.join(self.report_path, 'profile.txt')
    self.notes_file = os.path.join(self.report_path, 'reducing_notes.txt')
    if names.startswith('~'):
        self.name_file = os.path.abspath(os.path.expanduser(os.path.join(names)))
    else:
        self.name_file = os.path.abspath(os.path.join(names))
    assert os.path.isfile(self.name_file), \
        f'Cannot find the supplied file with gene names: {self.name_file}'
    self.names = list()
    self.profile_dict = dict()
    self.allele_dict = dict()
def allelealigner(self):
    """
    Perform a multiple sequence alignment of the allele sequences
    """
    logging.info('Aligning alleles')
    # Create the threads for the analysis
    for _ in range(self.cpus):
        threads = Thread(target=self.alignthreads, args=())
        # Use the daemon attribute rather than the deprecated setDaemon() method
        threads.daemon = True
        threads.start()
    for sample in self.samples:
        sample.alignpath = os.path.join(self.path, 'alignedalleles')
        make_path(sample.alignpath)
        # Create a list to store the aligned allele files
        sample.alignedalleles = list()
        for outputfile in sample.allelefiles:
            aligned = os.path.join(sample.alignpath, os.path.basename(outputfile))
            sample.alignedalleles.append(aligned)
            # Create the command line call
            clustalomega = ClustalOmegaCommandline(infile=outputfile,
                                                   outfile=aligned,
                                                   threads=4,
                                                   auto=True)
            sample.clustalomega = str(clustalomega)
            self.queue.put((sample, clustalomega, outputfile, aligned))
    self.queue.join()
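# A minimal sketch of what the alignthreads worker might do with the queued command
# objects (hypothetical; the real worker is not shown here). Biopython command-line
# wrappers are callable, and raise ApplicationError on a non-zero exit:
import os
from Bio.Application import ApplicationError

def alignthreads(self):
    while True:
        sample, clustalomega, outputfile, aligned = self.queue.get()
        # Skip alignments that have already been performed
        if not os.path.isfile(aligned):
            try:
                stdout, stderr = clustalomega()
            except ApplicationError:
                # Leave the failed alignment for later inspection
                pass
        self.queue.task_done()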
def __init__(self, path, outputpath, accessiontable, threads, sleeptime):
    if path.startswith('~'):
        self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
    else:
        self.path = os.path.abspath(os.path.join(path))
    if outputpath:
        if outputpath.startswith('~'):
            self.outputpath = os.path.abspath(os.path.expanduser(os.path.join(outputpath)))
        else:
            self.outputpath = os.path.abspath(os.path.join(outputpath))
    else:
        self.outputpath = os.path.join(self.path, 'downloads')
    make_path(self.outputpath)
    self.metadatatable = os.path.join(self.path, accessiontable)
    assert os.path.isfile(self.metadatatable), 'Cannot find supplied pathogen metadata table {at} in ' \
                                               'supplied path {sp}'.format(at=self.metadatatable, sp=self.path)
    self.threads = threads
    self.sleeptime = sleeptime
    if self.sleeptime:
        assert self.sleeptime >= 10, 'Must sleep at least 10 seconds'
        assert self.sleeptime < 86400, 'Cannot sleep for more than 24 hours'
    self.assembly_dict = dict()
    self.queue = Queue(maxsize=self.threads)
    logging.info('Starting pathogen assembly download using {at}'.format(at=self.metadatatable))
def __init__(self, sequencepath, reportpath):
    # Allow for relative paths
    if sequencepath.startswith('~'):
        self.path = os.path.abspath(os.path.expanduser(os.path.join(sequencepath)))
    else:
        self.path = os.path.abspath(os.path.join(sequencepath))
    assert os.path.isdir(self.path), 'Cannot locate supplied sequence path: {seq_path}' \
        .format(seq_path=self.path)
    self.sequencepath = self.path
    if reportpath.startswith('~'):
        self.reportpath = os.path.abspath(os.path.expanduser(os.path.join(reportpath)))
    else:
        self.reportpath = os.path.abspath(os.path.join(reportpath))
    make_path(self.reportpath)
    assert os.path.isdir(self.reportpath), 'Could not create the requested report directory: {rep_path}' \
        .format(rep_path=self.reportpath)
    # Define the start time for legacy code compatibility
    self.starttime = time.time()
    self.logfile = os.path.join(self.path, 'log')
    self.cpus = multiprocessing.cpu_count() - 1
    self.sketchqueue = Queue(maxsize=self.cpus)
    self.mashqueue = Queue(maxsize=self.cpus)
    # Extract the path of this file - will be used to find the necessary accessory files
    self.homepath = os.path.split(os.path.abspath(__file__))[0]
    # self.reference_mash_sketch_file = os.path.join(self.homepath, 'toxoplasma.msh')
    self.reference_mash_sketch_file = '/mnt/nas2/redmine/bio_requests/17874/reference_sequences/toxoplasma.msh'
    self.metadata = list()
    self.output_dict = dict()
    self.json_report = os.path.join(self.reportpath, 'para_typer_outputs.json')
def objects(self):
    """
    Create the run metadata objects, and link the best assemblies to the BestAssemblies folder
    """
    self.runmetadata = ObjectCreation(inputobject=self)
    make_path(os.path.join(self.path, 'BestAssemblies'))
    for sample in self.runmetadata.samples:
        # Link the assemblies to the BestAssemblies folder - necessary for GenomeQAML
        relative_symlink(sample.general.bestassemblyfile,
                         os.path.join(self.path, 'BestAssemblies'))
        # Create attributes required for downstream analyses
        sample.general.trimmedcorrectedfastqfiles = [sample.general.bestassemblyfile]
def __init__(self, spectra_path, filename, start_time, outputpath, classic, extensions):
    """
    :param spectra_path: Path to .spa/.spc files
    :param filename: Path to .xls(x) file with renaming information
    :param start_time: Time the analyses started
    :param outputpath: Path to folder in which the renamed files are to be stored
    :param classic: BOOL whether to use the "classic" method of file renaming
    :param extensions: BOOL whether the file extension is .spc
    """
    SetupLogging()
    # Define variables based on supplied arguments
    if spectra_path.startswith('~'):
        self.spectra_path = os.path.abspath(os.path.expanduser(os.path.join(spectra_path)))
    else:
        self.spectra_path = os.path.abspath(os.path.join(spectra_path))
    assert os.path.isdir(self.spectra_path), 'Supplied sequence path is not a valid directory {0!r:s}' \
        .format(self.spectra_path)
    if filename.startswith('~'):
        self.file = os.path.abspath(os.path.expanduser(os.path.join(filename)))
    else:
        self.file = os.path.abspath(os.path.join(filename))
    # If the path to the file wasn't provided, check the spectra folder
    if not os.path.isfile(self.file):
        self.file = os.path.join(self.spectra_path, filename)
    # If the file still can't be found, check the parent folder of the spectra folder
    if not os.path.isfile(self.file):
        self.file = os.path.join(os.path.dirname(self.spectra_path), filename)
    self.start = start_time
    assert os.path.isfile(self.file), 'Cannot find the supplied Excel file ({0!r:s}) with the file ' \
                                      'information. Please ensure that this file is in the path, and that ' \
                                      'there are no spelling mistakes'.format(self.file)
    # Set the output path
    self.outputpath = os.path.join(outputpath)
    # Create the output path as required
    make_path(self.outputpath)
    # Determine the naming scheme
    self.classic = classic
    # Variable for extensions of files to rename
    self.extensions = extensions
    # Create class variable
    self.metadata = list()
def aa_allele_prep(self):
    """
    Create (first time only) and read the amino acid allele database file
    """
    # Create the amino acid allele database file path as required
    make_path(self.aa_allele_path)
    # Iterate through all the genes in the analysis
    for gene in self.gene_names:
        # Attempt to find the database file
        try:
            allele_file = glob(os.path.join(self.aa_allele_path, f'{gene}*.*fa*'))[0]
        # Create the file if it doesn't exist
        except IndexError:
            allele_file = self.initialise_aa_alleles(gene=gene)
        # Read in and store all the amino acid records in the allele database file
        for record in SeqIO.parse(allele_file, 'fasta'):
            self.aa_allele_dict[record.id] = str(record.seq)
def __init__(self, path, amino_acid):
    if path.startswith('~'):
        self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
    else:
        self.path = os.path.abspath(os.path.join(path))
    self.allele_path = os.path.join(self.path, 'alleles')
    self.aa_allele_path = os.path.join(self.path, 'aa_alleles')
    self.profile_path = os.path.join(self.path, 'profile')
    self.aa_profile_path = os.path.join(self.path, 'aa_profile')
    make_path(self.profile_path)
    self.profile_file = os.path.join(self.profile_path, 'profile.txt')
    self.aa_profile_file = os.path.join(self.aa_profile_path, 'aa_profile.txt')
    self.query_path = os.path.join(self.path, 'query')
    self.report_path = os.path.join(self.path, 'reports')
    self.aa_report_path = os.path.join(self.path, 'aa_reports')
    make_path(self.report_path)
    make_path(self.aa_report_path)
    # Remove any novel allele FASTA files created by previous analyses
    novel_alleles = glob(os.path.join(self.report_path, '*.fasta'))
    for novel_allele in novel_alleles:
        os.remove(novel_allele)
    self.aa_notes_path = os.path.join(self.path, 'aa_notes')
    make_path(self.aa_notes_path)
    self.aa_profile_notes = os.path.join(self.aa_notes_path, 'aa_profile_notes.tsv')
    self.amino_acid = amino_acid
    if not self.amino_acid:
        self.combined_targets = os.path.join(self.allele_path, 'combinedtargets.fasta')
    else:
        self.combined_targets = os.path.join(self.aa_allele_path, 'combinedtargets.fasta')
    self.gene_names = list()
    self.runmetadata = MetadataObject()
    self.runmetadata.samples = list()
    self.cpus = multiprocessing.cpu_count() - 1
    self.profile_report = os.path.join(self.report_path, 'profiles.tsv')
    self.aa_profile_report = os.path.join(self.aa_report_path, 'aa_profiles.tsv')
    try:
        os.remove(self.profile_report)
    except FileNotFoundError:
        pass
    # Fields used for custom outfmt 6 BLAST output:
    self.fieldnames = ['query_id', 'subject_id', 'identical', 'mismatches', 'gaps',
                       'evalue', 'bit_score', 'query_length', 'subject_length',
                       'alignment_length', 'query_start', 'query_end', 'subject_start',
                       'subject_end', 'query_sequence', 'subject_sequence']
    self.extended_fieldnames = self.fieldnames.copy()
    self.extended_fieldnames.insert(14, 'percent_match')
    self.outfmt = '6 qseqid sseqid nident mismatch gaps evalue bitscore qlen slen length ' \
                  'qstart qend sstart send qseq sseq'
    # A string of the header to use for formatting the profile file, and the report headers
    self.data = str()
    self.aa_allele_dict = dict()
    self.aa_nt_allele_link_dict = dict()
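# A minimal sketch of how a BLAST report created with the custom outfmt string above
# can be parsed (hypothetical helper; the tab delimiter matches the .tsv reports used
# elsewhere in this module, and the percent calculation is one plausible definition
# of the percent_match field inserted above):
from csv import DictReader

def parse_blast_report(blast_report, fieldnames):
    """Yield one dictionary per BLAST hit, keyed by the custom field names."""
    with open(blast_report, 'r') as report:
        for row in DictReader(report, fieldnames=fieldnames, delimiter='\t'):
            # Identical positions over alignment length gives the percent identity
            percent_match = 100 * int(row['identical']) / int(row['alignment_length'])
            yield row, percent_match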
def __init__(self, path, profile, one_based):
    logging.info('Welcome to the allele translator!')
    if path.startswith('~'):
        self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
    else:
        self.path = os.path.abspath(os.path.join(path))
    if profile:
        self.profile_file = os.path.join(self.path, 'profile', 'profile.txt')
        assert os.path.isfile(self.profile_file), 'Cannot locate the required profile file: {profile}. ' \
                                                  'Please ensure that the name and path of your file ' \
                                                  'are correct'.format(profile=self.profile_file)
    else:
        self.profile_file = None
    self.one_based = one_based
    self.sequence_files = glob(os.path.join(self.path, '*.fasta'))
    self.translated_path = os.path.join(self.path, 'aa_alleles')
    self.notes_path = os.path.join(self.path, 'notes')
    make_path(inpath=self.translated_path)
    make_path(inpath=self.notes_path)
    self.allele_dict = dict()
    self.profile_data = dict()
    self.allele_links = dict()
    self.aa_profile_path = os.path.join(self.path, 'aa_profile')
    make_path(self.aa_profile_path)
    self.aa_profile_file = os.path.join(self.aa_profile_path, 'aa_profile.txt')
    self.gene_names = set()
    self.gene_name_file = os.path.join(self.aa_profile_path, 'gene_names.txt')
    self.aa_profile_data = dict()
    self.profile_matches = dict()
    self.aa_nt_profile_link_file = os.path.join(self.aa_profile_path, 'reports', 'aa_nt_profile_links.tsv')
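# A minimal sketch of the nucleotide-to-amino acid translation this class performs
# (assumed to use Biopython; trimming trailing partial codons is an illustrative
# guess, not the class's confirmed behaviour):
from Bio.Seq import Seq

def translate_allele(nt_sequence):
    """Translate a nucleotide allele, stopping at the first stop codon."""
    # Trim the sequence to a multiple of three to avoid partial-codon warnings
    trimmed = nt_sequence[:len(nt_sequence) - len(nt_sequence) % 3]
    return str(Seq(trimmed).translate(to_stop=True))

# e.g. translate_allele('ATGGCCTAA') returns 'MA'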
def __init__(self, inputobject, extension='fasta', light=False):
    # Create an object to mimic the command line arguments necessary for the script
    args = MetadataObject()
    args.path = inputobject.path
    args.sequencepath = inputobject.path
    args.databasepath = os.path.join(inputobject.reffilepath, 'clark')
    make_path(args.databasepath)
    args.clarkpath = os.path.dirname(which('CLARK'))
    args.clarkpath += '/../opt/clark/'
    args.cutoff = 0.005
    args.database = 'bacteria'
    args.rank = 'species'
    args.filter = False
    args.threads = inputobject.cpus
    args.runmetadata = inputobject.runmetadata
    args.clean_seqs = False
    args.reffilepath = inputobject.reffilepath
    args.extension = extension
    args.light = light
    # Run CLARK
    CLARK(args, inputobject.commit, inputobject.starttime, inputobject.homepath)
def __init__(self, path, targetfile, min_length, max_length, cutoff, perc_gc, blast, one_based):
    # Determine the path in which the sequence files are located. Allow for ~ expansion
    if path.startswith('~'):
        self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
    else:
        self.path = os.path.abspath(os.path.join(path))
    self.file = os.path.join(self.path, targetfile)
    assert os.path.isfile(self.file), 'Cannot find the supplied FASTA file: {fn}'.format(fn=self.file)
    self.reportpath = os.path.join(self.path, 'reports')
    self.probepath = os.path.join(self.path, 'probes')
    make_path(self.reportpath)
    self.min = min_length
    self.max = max_length
    self.cutoff = cutoff
    self.perc_gc = perc_gc
    self.blast = blast
    self.one_based = one_based
    self.cpus = multiprocessing.cpu_count()
    self.queue = Queue()
    self.samples = list()
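# The perc_gc argument above presumably bounds the GC content of candidate probes.
# A minimal GC% helper sketch (hypothetical; the class's own filtering code is not
# shown here):
def gc_percent(sequence):
    """Return the percent GC content of a nucleotide sequence."""
    sequence = sequence.upper()
    return 100 * (sequence.count('G') + sequence.count('C')) / len(sequence)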
def test_sistr_seqsero():
    metadata = MetadataObject()
    method.runmetadata.samples = list()
    fasta = os.path.join(var.sequencepath, 'NC_003198.fasta')
    metadata.name = os.path.split(fasta)[1].split('.')[0]
    # Initialise the general and run categories
    metadata.general = GenObject()
    metadata.run = GenObject()
    metadata.general.fastqfiles = list()
    metadata.general.trimmedcorrectedfastqfiles = [
        os.path.join(var.sequencepath, 'seqsero', '2014-SEQ-1049_seqsero.fastq.gz')
    ]
    # Set the destination folder
    outputdir = os.path.join(var.sequencepath, metadata.name)
    make_path(outputdir)
    # Add the output directory to the metadata
    metadata.general.outputdirectory = outputdir
    metadata.general.logout = os.path.join(outputdir, 'out')
    metadata.general.logerr = os.path.join(outputdir, 'err')
    metadata.run.outputdirectory = outputdir
    # Initialise an attribute to store commands
    metadata.commands = GenObject()
    # Assume that all samples are Salmonella
    metadata.general.referencegenus = 'Salmonella'
    # Set the .fasta file as the best assembly
    metadata.general.bestassemblyfile = fasta
    method.runmetadata.samples.append(metadata)
    method.sistr()
    for sample in method.runmetadata.samples:
        assert sample.sistr.cgmlst_genome_match in ('ERR586739', 'SAL_BA2732AA')
    method.seqsero()
    for sample in method.runmetadata.samples:
        assert sample.seqsero.predicted_serotype == '- 9:f,g,t:-'
    variable_update()
def reports(self):
    """
    Create reports from the abundance estimation
    """
    logging.info('Creating CLARK report for {ft} files'.format(ft=self.extension))
    # Create a workbook to store the report. Using xlsxwriter rather than a simple csv format, as I want to be
    # able to have appropriately sized, multi-line cells
    workbook = xlsxwriter.Workbook(self.report)
    make_path(self.reportpath)
    # New worksheet to store the data
    worksheet = workbook.add_worksheet()
    # Add a bold format for header cells. Using a monotype font size 8
    bold = workbook.add_format({'bold': True, 'font_name': 'Courier New', 'font_size': 8})
    bold.set_align('center')
    # Format for data cells. Monotype, size 8, top vertically justified
    courier = workbook.add_format({'font_name': 'Courier New', 'font_size': 8})
    courier.set_align('top')
    # Set custom widths for columns 5 (15) and 6 (20)
    worksheet.set_column(5, 5, 15)
    worksheet.set_column(6, 6, 20)
    # Initialise the position within the worksheet to be (0, 0)
    row = 0
    col = 0
    # List of the headers to use
    headers = ['Strain', 'Name', 'TaxID', 'Lineage', 'Count', 'Proportion_All(%)',
               'Proportion_Classified(%)']
    # Add an additional header for .fasta analyses
    if self.extension == 'fasta':
        headers.insert(4, 'TotalBP')
    # Populate the headers
    for category in headers:
        # Write the data in the specified cell (row, col) using the bold format
        worksheet.write(row, col, category, bold)
        # Move to the next column to write the next category
        col += 1
    # Data starts in row 1
    row = 1
    # Initialise variables to hold the longest names; used in setting the column widths
    longeststrain = 0
    longestname = 0
    longestlineage = 0
    # Extract all the taxonomic groups that pass the cutoff from the abundance file
    for sample in self.runmetadata.samples:
        # Every record starts at column 0
        col = 0
        # Write the strain name
        worksheet.write(row, col, sample.name, courier)
        col += 1
        # Initialise a list to store the species above the cutoff in the sample
        sample.general.passfilter = list()
        try:
            # Read the abundance file as a dictionary
            abundancedict = DictReader(open(sample.general.abundance))
            # Filter abundance to taxIDs with at least self.cutoff% of the total proportion
            for result in abundancedict:
                # The UNKNOWN category doesn't contain a 'Lineage' column, and therefore, subsequent columns
                # are shifted out of proper alignment, and do not contain the appropriate data
                try:
                    if float(result['Proportion_All(%)']) > self.cutoff:
                        sample.general.passfilter.append(result)
                except ValueError:
                    pass
            # Determine the longest name of all the strains, and use it to set the width of column 0
            if len(sample.name) > longeststrain:
                longeststrain = len(sample.name)
                worksheet.set_column(0, 0, longeststrain)
            # Sort the abundance results based on the highest count
            sortedabundance = sorted(sample.general.passfilter,
                                     key=lambda x: int(x['Count']),
                                     reverse=True)
            # Set of contigs from the classification file. For some reason, certain contigs are represented
            # multiple times in the classification file. As far as I can tell, these multiple representations
            # are always classified the same, and, therefore, should be treated as duplicates, and ignored
            contigset = set()
            for result in sortedabundance:
                # Add the total number of base pairs classified for each TaxID. As only the total number of
                # contigs classified as a particular TaxID are in the report, it can be misleading if a large
                # number of small contigs are classified to a particular TaxID e.g. 56 contigs map to
                # TaxID 28901, and 50 contigs map to TaxID 630; however, added together, those 56 contigs are
                # 4705838 bp, while the 50 contigs added together are only 69602 bp. While this is unlikely a
                # pure culture, only 69602 / (4705838 + 69602) = 1.5% of the total bp map to TaxID 630,
                # compared to 45% of the contigs
                if self.extension == 'fasta':
                    # Initialise a variable to store the total bp mapped to the TaxID
                    result['TotalBP'] = int()
                    # Read the classification file into a dictionary
                    classificationdict = DictReader(open(sample.general.classification))
                    # Read through each contig classification in the dictionary
                    for contig in classificationdict:
                        # Pull out each contig with a TaxID that matches the TaxID of the result of interest,
                        # and is not present in the set of contigs that have already been processed
                        if result['TaxID'] == contig[' Assignment'] and contig['Object_ID'] not in contigset:
                            # Increment the total bp mapping to the TaxID by the length of each contig
                            result['TotalBP'] += int(contig[' Length'])
                            # Avoid duplicates by adding the contig name to the set of contigs
                            contigset.add(contig['Object_ID'])
                # Print the results to file
                # Ignore the first header, as it is the strain name, which has already been added to the report
                dictionaryheaders = headers[1:]
                for header in dictionaryheaders:
                    data = result[header]
                    worksheet.write(row, col, data, courier)
                    col += 1
                # Determine the longest name of all the matches, and use it to set the width of column 1
                if len(result['Name']) > longestname:
                    longestname = len(result['Name'])
                    worksheet.set_column(1, 1, longestname)
                # Do the same for the lineages in column 3
                if len(result['Lineage']) > longestlineage:
                    longestlineage = len(result['Lineage'])
                    worksheet.set_column(3, 3, longestlineage)
                # Increment the row
                row += 1
                # Reset the column to 1
                col = 1
        except (KeyError, AttributeError):
            # Increment the row
            row += 1
    # Close the workbook
    workbook.close()
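# The loop above re-reads the classification file once per passing TaxID. A sketch
# of an equivalent single-pass aggregation (hypothetical refactor; the leading-space
# column names come from the CLARK classification format used above):
from csv import DictReader

def total_bp_per_taxid(classification_file):
    """Sum contig lengths per TaxID, ignoring duplicated contig records."""
    totals, seen = {}, set()
    with open(classification_file) as handle:
        for contig in DictReader(handle):
            if contig['Object_ID'] not in seen:
                seen.add(contig['Object_ID'])
                taxid = contig[' Assignment']
                totals[taxid] = totals.get(taxid, 0) + int(contig[' Length'])
    return totals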
def main(args):
    # Create the path to store the schemes (if necessary)
    make_path(args.path)
    # Allow for Shigella to use the Escherichia MLST profile/alleles
    args.genus = args.genus if args.genus != 'Shigella' else 'Escherichia'
    # As there are multiple profiles for certain organisms, this dictionary has the schemes I use as values
    organismdictionary = {'Escherichia': 'Escherichia coli#1',
                          'Vibrio': 'Vibrio parahaemolyticus',
                          'Campylobacter': 'Campylobacter jejuni',
                          'Listeria': 'Listeria monocytogenes',
                          'Bacillus': 'Bacillus cereus',
                          'Staphylococcus': 'Staphylococcus aureus',
                          'Salmonella': 'Salmonella enterica'}
    # Set the appropriate profile based on the dictionary key:value pairs
    try:
        args.genus = organismdictionary[args.species]
    except (KeyError, AttributeError):
        pass
    with url.urlopen(args.repository_url) as docfile:
        doc = xml.parse(docfile)
    root = doc.childNodes[0]
    found_species = []
    for species_node in root.getElementsByTagName('species'):
        info = getspeciesinfo(species_node, args.genus, args.force_scheme_name)
        if info is not None:
            found_species.append(info)
    if len(found_species) == 0:
        print('No species matched your query.')
        return
    if len(found_species) > 1:
        print('The following {} species match your query, please be more specific:'.format(len(found_species)))
        for info in found_species:
            print(info.name)
        return
    # Output information for the single matching species
    assert len(found_species) == 1
    species_info = found_species[0]
    species_name_underscores = species_info.name.replace(' ', '_')
    species_name_underscores = species_name_underscores.replace('/', '_')
    species_all_fasta_filename = species_name_underscores + '.fasta'
    species_all_fasta_file = open('{}/{}'.format(args.path, species_all_fasta_filename), 'w')
    log_filename = 'mlst_data_download_{}_{}.log'.format(species_name_underscores, species_info.retrieved)
    log_file = open('{}/{}'.format(args.path, log_filename), 'w')
    log_file.write(species_info.retrieved + '\n')
    profile_path = urlparse(species_info.profiles_url).path
    profile_filename = profile_path.split('/')[-1]
    log_file.write('definitions: {}\n'.format(profile_filename))
    log_file.write('{} profiles\n'.format(species_info.profiles_count))
    log_file.write('sourced from: {}\n\n'.format(species_info.profiles_url))
    # Download the profile definitions, and write them to the local file
    localfile, headers = url.urlretrieve(species_info.profiles_url)
    with open(localfile, 'r') as profile_doc:
        with open(os.path.join(args.path, profile_filename), 'w') as profile_file:
            profile_file.write(profile_doc.read())
    for locus in species_info.loci:
        locus_path = urlparse(locus.url).path
        locus_filename = locus_path.split('/')[-1]
        log_file.write('locus {}\n'.format(locus.name))
        log_file.write(locus_filename + '\n')
        log_file.write('Sourced from {}\n\n'.format(locus.url))
        # Download each locus, and write the sequence to both the locus-specific and combined FASTA files
        local_locus_doc, headers = url.urlretrieve(locus.url)
        with open(local_locus_doc, 'r') as locus_doc:
            with open(os.path.join(args.path, locus_filename), 'w') as locus_file:
                locus_fasta_content = locus_doc.read()
                locus_file.write(locus_fasta_content)
                species_all_fasta_file.write(locus_fasta_content)
    log_file.write('all loci: {}\n'.format(species_all_fasta_filename))
    log_file.close()
    species_all_fasta_file.close()
def main(self):
    # Create metadata objects for all files in the query folder
    self.query_prep()
    for sample in self.runmetadata.samples:
        logging.warning('Processing sample {sn}'.format(sn=sample.name))
        if not self.amino_acid:
            records, gene_names, self.data = \
                allele_prep(allele_path=self.allele_path,
                            gene_names=self.gene_names,
                            combined_targets=self.combined_targets,
                            amino_acid=self.amino_acid)
        else:
            records, gene_names, self.data = \
                allele_prep(allele_path=self.aa_allele_path,
                            gene_names=self.gene_names,
                            combined_targets=self.combined_targets,
                            amino_acid=self.amino_acid)
        logging.info('Loading profile')
        if not self.amino_acid:
            profile_data = read_profile(profile_file=self.profile_file)
        else:
            profile_data = read_profile(profile_file=self.aa_profile_file)
        self.blast_alleles(runmetadata=sample, amino_acid=self.amino_acid)
        parseable_blast_outputs(runmetadata=sample,
                                fieldnames=self.fieldnames,
                                extended_fieldnames=self.extended_fieldnames,
                                records=records)
        sample = parse_results(runmetadata=sample,
                               fieldnames=self.fieldnames,
                               extended_fieldnames=self.extended_fieldnames,
                               amino_acid=self.amino_acid,
                               genome_query=True)
        if not self.amino_acid:
            profile_dict, profile_set = profile_alleles(runmetadata=sample,
                                                        profile_dict=dict(),
                                                        profile_set=list(),
                                                        records=self.gene_names,
                                                        novel_alleles=True,
                                                        genome_query=True,
                                                        allele_path=self.allele_path,
                                                        report_path=self.report_path)
        else:
            profile_dict, profile_set = profile_alleles(runmetadata=sample,
                                                        profile_dict=dict(),
                                                        profile_set=list(),
                                                        records=self.gene_names,
                                                        novel_alleles=True,
                                                        genome_query=True,
                                                        allele_path=self.aa_allele_path,
                                                        report_path=self.aa_report_path)
        profile_matches = match_profile(profile_data=profile_data,
                                        profile_dict=profile_dict,
                                        profile_matches=dict())
        profile_matches, profile_data, new_profiles = \
            create_profile(profile_data=profile_data,
                           profile_set=profile_set,
                           new_profiles=list(),
                           profile_dict=profile_dict,
                           profile_matches=profile_matches)
        if not self.amino_acid:
            sample = sequence_typer(profile_report=self.profile_report,
                                    data=self.data,
                                    runmetadata=sample,
                                    profile_matches=profile_matches,
                                    profile_data=profile_data,
                                    update=True)
            append_profiles(new_profiles=new_profiles,
                            profile_file=self.profile_file,
                            data=self.data,
                            novel_profiles=True,
                            profile_path=self.profile_path,
                            gene_names=self.gene_names)
        else:
            sample = sequence_typer(profile_report=self.aa_profile_report,
                                    data=self.data,
                                    runmetadata=sample,
                                    profile_matches=profile_matches,
                                    profile_data=profile_data,
                                    update=True)
            append_profiles(new_profiles=new_profiles,
                            profile_file=self.aa_profile_file,
                            data=self.data,
                            novel_profiles=True,
                            profile_path=self.aa_profile_path,
                            gene_names=self.gene_names)
        if not self.amino_acid:
            # Repeat the analyses on the translated (amino acid) alleles
            sample = self.translate(runmetadata=sample)
            self.aa_allele_prep()
            aa_profile_dict, aa_profile_set = self.aa_allele_match(runmetadata=sample,
                                                                   profile_dict=dict(),
                                                                   profile_set=list(),
                                                                   gene_names=gene_names)
            aa_profile_data = read_profile(profile_file=self.aa_profile_file)
            aa_profile_matches = match_profile(profile_data=aa_profile_data,
                                               profile_dict=aa_profile_dict,
                                               profile_matches=dict())
            aa_profile_matches, aa_profile_data, aa_new_profiles = \
                create_profile(profile_data=aa_profile_data,
                               profile_set=aa_profile_set,
                               new_profiles=list(),
                               profile_dict=aa_profile_dict,
                               profile_matches=aa_profile_matches)
            sample = sequence_typer(profile_report=self.aa_profile_report,
                                    data=self.data,
                                    runmetadata=sample,
                                    profile_matches=aa_profile_matches,
                                    profile_data=aa_profile_data,
                                    update=True,
                                    amino_acid=True)
            make_path(self.aa_profile_path)
            append_profiles(new_profiles=aa_new_profiles,
                            profile_file=self.aa_profile_file,
                            data=self.data,
                            novel_profiles=True,
                            profile_path=self.aa_profile_path,
                            gene_names=self.gene_names)
            self.aa_notes(runmetadata=sample)
    # Clean up the combined and custom target files once all the samples have been processed
    clear_alleles(combined_targets_db=glob(os.path.join(self.allele_path, 'combinedtargets*')),
                  custom_targets=os.path.join(self.allele_path, 'custom.tfa'))
def __init__(self, args):
    """
    Initialises the variables required for this class
    :param args: list of arguments passed to the script
    """
    self.debug = args.debug
    SetupLogging(self.debug)
    logging.info('Welcome to the CFIA OLC Workflow for Bacterial Assembly and Typing (COWBAT) '
                 'version {version}'.format(version=__version__))
    # Define variables from the arguments - there may be a more streamlined way to do this
    self.args = args
    if args.sequencepath.startswith('~'):
        self.path = os.path.abspath(os.path.expanduser(os.path.join(args.sequencepath)))
    else:
        self.path = os.path.abspath(os.path.join(args.sequencepath))
    self.sequencepath = self.path
    if args.referencefilepath.startswith('~'):
        self.reffilepath = os.path.abspath(os.path.expanduser(os.path.join(args.referencefilepath)))
    else:
        self.reffilepath = os.path.abspath(os.path.join(args.referencefilepath))
    self.numreads = args.numreads
    self.preprocess = args.preprocess
    # Define the start time
    self.starttime = args.startingtime
    if args.customsamplesheet:
        if args.customsamplesheet.startswith('~'):
            self.customsamplesheet = os.path.abspath(os.path.expanduser(os.path.join(args.customsamplesheet)))
        else:
            self.customsamplesheet = os.path.abspath(os.path.join(args.customsamplesheet))
    else:
        self.customsamplesheet = args.customsamplesheet
    if self.customsamplesheet:
        assert os.path.isfile(self.customsamplesheet), 'Cannot find custom sample sheet as specified {css}' \
            .format(css=self.customsamplesheet)
    self.basicassembly = args.basicassembly
    if not self.customsamplesheet and not os.path.isfile(os.path.join(self.path, 'SampleSheet.csv')):
        self.basicassembly = True
        logging.warning('Could not find a sample sheet. Performing basic assembly (no run metadata captured)')
    # Use the argument for the number of threads to use, or default to the number of CPUs in the system
    self.cpus = args.threads if args.threads else multiprocessing.cpu_count() - 1
    # Assertions to ensure that the provided variables are valid
    make_path(self.path)
    assert os.path.isdir(self.path), 'Supplied path location is not a valid directory {0!r:s}'.format(self.path)
    self.reportpath = os.path.join(self.path, 'reports')
    make_path(self.reportpath)
    assert os.path.isdir(self.reffilepath), 'Reference file path is not a valid directory {0!r:s}' \
        .format(self.reffilepath)
    self.commit = __version__
    self.homepath = args.homepath
    self.logfile = os.path.join(self.path, 'logfile')
    self.runinfo = str()
    self.pipeline = True
    self.qualityobject = MetadataObject()
    # Initialise the metadata object
    self.runmetadata = MetadataObject()
def __init__(self, path, fasta_path, records, amino_acid):
    if path.startswith('~'):
        self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
    else:
        self.path = os.path.abspath(os.path.join(path))
    if fasta_path.startswith('~'):
        self.fasta_path = os.path.abspath(os.path.expanduser(os.path.join(fasta_path)))
    else:
        self.fasta_path = os.path.abspath(os.path.join(fasta_path))
    self.working_path = os.path.join(self.path, 'strain_profiles')
    self.sequencepath = os.path.join(self.working_path, 'query')
    make_path(self.sequencepath)
    # Exclude the combined and custom target files from the query files
    target_files = [
        fasta for fasta in sorted(glob(os.path.join(self.fasta_path, '*.fasta')))
        if os.path.basename(fasta) not in ('combinedtargets.fasta', 'custom.tfa')
    ]
    self.query_files = list()
    # Create symlinks of the target files in the local path
    for target in target_files:
        try:
            query_file = os.path.join(self.sequencepath,
                                      os.path.basename(target).replace('.tfa', '.fasta'))
            self.query_files.append(query_file)
            os.symlink(target, query_file)
        except FileExistsError:
            pass
    self.targetpath = os.path.join(self.working_path, 'targets')
    make_path(self.targetpath)
    self.profilepath = os.path.join(self.working_path, 'sequence_profile')
    make_path(self.profilepath)
    self.profile_file = os.path.join(self.profilepath, 'profile.txt')
    self.target_file = os.path.join(self.targetpath, 'combinedtargets.fasta')
    shutil.copyfile(src=os.path.join(self.path, 'alleles', 'combinedtargets.fasta'),
                    dst=self.target_file)
    self.reportpath = os.path.join(self.working_path, 'reports')
    make_path(self.reportpath)
    self.strain_profile_path = os.path.join(self.working_path, 'strain_profiles')
    make_path(self.strain_profile_path)
    self.profile_report = os.path.join(self.strain_profile_path, 'profiles.tsv')
    self.cpus = multiprocessing.cpu_count() - 1
    self.starttime = time()
    self.start = self.starttime
    self.runmetadata = MetadataObject()
    self.runmetadata.samples = list()
    self.records = records
    # Set the BLAST program appropriate for the analysis type
    self.amino_acid = amino_acid if amino_acid else None
    if amino_acid:
        self.program = 'tblastn'
    else:
        self.program = 'blastn'
    # Fields used for custom outfmt 6 BLAST output:
    self.fieldnames = ['query_id', 'subject_id', 'identical', 'mismatches', 'gaps',
                       'evalue', 'bit_score', 'query_length', 'subject_length',
                       'alignment_length', 'query_start', 'query_end', 'subject_start',
                       'subject_end', 'query_sequence', 'subject_sequence']
    self.extended_fieldnames = self.fieldnames.copy()
    self.extended_fieldnames.insert(14, 'percent_match')
    self.outfmt = '6 qseqid sseqid nident mismatch gaps evalue bitscore qlen slen length ' \
                  'qstart qend sstart send qseq sseq'
    self.blast_reports = list()
    self.profile_dict = dict()
    self.profile_data = dict()
    self.profile_set = list()
    self.sequence_profile = dict()
    self.profile_matches = dict()
    self.new_profiles = list()
    # A string of the header to use for formatting the profile file, and the report headers
    genes = '\t'.join(sorted(self.records))
    self.data = 'ST\t{genes}\n'.format(genes=genes.rstrip())
    self.gene_names = list()
def __init__(self, args, pipelinecommit, startingtime, scriptpath):
    # Initialise variables
    self.commit = str(pipelinecommit)
    self.start = startingtime
    self.homepath = scriptpath
    # Define variables based on supplied arguments
    self.args = args
    self.path = os.path.join(args.path)
    assert os.path.isdir(self.path), u'Supplied path is not a valid directory {0!r:s}'.format(self.path)
    self.sequencepath = os.path.join(args.sequencepath, '')
    assert os.path.isdir(self.sequencepath), u'Supplied sequence path is not a valid directory {0!r:s}' \
        .format(self.sequencepath)
    self.databasepath = os.path.join(args.databasepath, '')
    assert os.path.isdir(self.databasepath), u'Supplied database path is not a valid directory {0!r:s}' \
        .format(self.databasepath)
    # There seems to be an issue with CLARK when running with a very high number of cores. Limit self.cpus to 4
    self.cpus = 4
    # Set variables from the arguments
    self.database = args.database
    self.rank = args.rank
    self.clarkpath = args.clarkpath
    self.cutoff = float(args.cutoff) * 100
    # Initialise variables for the analysis
    self.targetcall = str()
    self.classifycall = str()
    self.devnull = open(os.devnull, 'wb')
    self.filelist = os.path.join(self.path, 'sampleList.txt')
    self.reportlist = os.path.join(self.path, 'reportList.txt')
    self.abundancequeue = Queue()
    self.datapath = str()
    self.reportpath = os.path.join(self.path, 'reports')
    self.clean_seqs = args.clean_seqs
    self.light = args.light
    self.extension = args.extension
    if self.clean_seqs:
        try:
            self.reffilepath = args.reffilepath
        except AttributeError:
            self.clean_seqs = False
    # If run as part of the assembly pipeline, a few modifications are necessary to ensure that the metadata
    # objects and variables play nice
    try:
        if args.runmetadata:
            self.runmetadata = args.runmetadata
            # Create the name of the final report
            self.report = os.path.join(self.reportpath, 'abundance_{ft}.xlsx'.format(ft=self.extension))
            # Only re-run the CLARK analyses if the CLARK report doesn't exist. All files created by CLARK
            # are moved to sample-specific CLARK folders below
            if not os.path.isfile(self.report):
                logging.info('Performing CLARK analysis on {ft} files'.format(ft=self.extension))
                if self.extension != 'fastq':
                    for sample in self.runmetadata.samples:
                        sample.general.combined = sample.general.bestassemblyfile
                    # Run the pipeline
                    self.main()
                else:
                    # Only perform FASTQ analyses if the sample is declared to be a metagenome
                    metagenome = False
                    for sample in self.runmetadata.samples:
                        try:
                            status = sample.run.Description
                        except AttributeError:
                            status = 'unknown'
                        if status == 'metagenome':
                            metagenome = True
                    # If any of the samples are metagenomes, run the CLARK analysis on the raw files
                    if metagenome:
                        fileprep.Fileprep(self)
                        # Run the pipeline
                        self.main()
            # Clean up the files and create/delete attributes to be consistent with pipeline metadata objects
            for sample in self.runmetadata.samples:
                # Create a GenObject to store metadata when this script is run as part of the pipeline
                clarkextension = 'clark{}'.format(self.extension)
                setattr(sample, clarkextension, GenObject())
                # Create a folder to store all the CLARK files
                sample[clarkextension].outputpath = os.path.join(sample.general.outputdirectory, 'CLARK')
                make_path(sample[clarkextension].outputpath)
                if sample.general.bestassemblyfile != 'NA':
                    # Move the files to the CLARK folder
                    try:
                        move(sample.general.abundance,
                             os.path.join(sample[clarkextension].outputpath,
                                          os.path.basename(sample.general.abundance)))
                        move(sample.general.classification,
                             os.path.join(sample[clarkextension].outputpath,
                                          os.path.basename(sample.general.classification)))
                    except (AttributeError, FileNotFoundError):
                        pass
                    # Set the CLARK-specific attributes
                    try:
                        sample[clarkextension].abundance = sample.general.abundance
                        sample[clarkextension].classification = sample.general.classification
                        sample[clarkextension].combined = sample.general.combined
                    except AttributeError:
                        pass
                    if self.extension == 'fastq':
                        # Remove the combined .fastq files
                        try:
                            if type(sample[clarkextension].combined) is list:
                                os.remove(sample[clarkextension].combined)
                        except (OSError, AttributeError):
                            pass
                # Remove the text file lists of files and reports created by CLARK. Note that a bare map()
                # call is lazy in Python 3, and would never actually execute os.remove
                for textfile in ['reportList.txt', 'sampleList.txt']:
                    try:
                        os.remove(os.path.join(self.path, textfile))
                    except OSError:
                        pass
        else:
            self.runmetadata = MetadataObject()
            self.report = os.path.join(self.reportpath, 'abundance.xlsx')
            # Create the objects
            self.objectprep()
            self.main()
    except AttributeError:
        self.runmetadata = MetadataObject()
        self.report = os.path.join(self.reportpath, 'abundance.xlsx')
        # Create the objects
        self.objectprep()
        # Set the run description to 'metagenome' in order to process the samples
        for sample in self.runmetadata.samples:
            sample.run.Description = 'metagenome'
        self.main()
    # Optionally filter the .fastq reads based on taxonomic assignment
    if args.filter:
        filtermetagenome.PipelineInit(self)
    # Print the metadata to file
    metadataprinter.MetadataPrinter(self)
def __init__(self, path, targetfile, analysis_type, fasta_path, genesippr, metadata_file, cutoff,
             amino_acid, one_based, target_alleles=True, allele_hashing=False):
    logging.info('Welcome to the CFIA Allele Finder (CAlF)')
    # Determine the path in which the sequence files are located. Allow for ~ expansion
    if path.startswith('~'):
        self.path = os.path.abspath(os.path.expanduser(os.path.join(path)))
    else:
        self.path = os.path.abspath(os.path.join(path))
    self.targetfile = os.path.join(self.path, targetfile)
    assert os.path.isfile(self.targetfile), 'Cannot find the supplied FASTA file: {fn}' \
        .format(fn=self.targetfile)
    self.reportpath = os.path.join(self.path, 'reports')
    self.allelepath = os.path.join(self.path, 'alleles')
    make_path(self.reportpath)
    make_path(self.allelepath)
    self.analysistype = analysis_type
    if self.analysistype != 'remote':
        if fasta_path.startswith('~'):
            self.fasta_path = os.path.abspath(os.path.expanduser(os.path.join(fasta_path)))
        else:
            self.fasta_path = os.path.abspath(os.path.join(fasta_path))
    else:
        self.fasta_path = None
    self.gensippr = genesippr
    # Set the metadata file in the genesippr branch only; a second, unconditional assignment would
    # overwrite the supplied file name
    if self.gensippr:
        self.metadata_file = os.path.join(self.path, metadata_file)
    else:
        self.metadata_file = str()
    self.cutoff = cutoff
    # If the supplied target allele is to be included in the output allele file, set self.target_alleles to
    # 0 - this will later be used to set the starting index when iterating over the list of alleles (the
    # target allele is stored at index 0)
    if target_alleles:
        self.target_alleles = 0
    else:
        self.target_alleles = 1
    # Set whether the allele identifiers will be generic (_0) or computed hashes of the allele sequence
    self.allele_hashing = bool(allele_hashing)
    self.amino_acid = amino_acid if amino_acid else None
    self.one_based = one_based
    if self.one_based and self.allele_hashing:
        logging.error('Only one of allele_hashing (-a) and one_based (-o) may be specified')
        raise SystemExit
    self.records = dict()
    self.record_parameters = dict()
    self.expect = dict()
    self.word_size = dict()
    self.filter_low_complexity = dict()
    self.blast_outputs = dict()
    self.alleleset = dict()
    self.illegal_alleleset = dict()
    self.strain_genera = dict()
    self.all_alleles = list()
    self.devnull = open(os.devnull, 'wb')
    self.cpus = multiprocessing.cpu_count()
    self.queue = Queue()
    # Fields used for custom outfmt 6 BLAST output:
    self.fieldnames = ['query_id', 'subject_id', 'positives', 'mismatches', 'gaps',
                       'evalue', 'bit_score', 'subject_length', 'alignment_length',
                       'query_start', 'query_end', 'subject_start', 'subject_end',
                       'query_sequence', 'subject_sequence']
    self.outfmt = '6 qseqid sseqid positive mismatch gaps evalue bitscore slen length qstart qend sstart send ' \
                  'qseq sseq'
    # TODO self.strain_genera needs to be populated properly
    self.local_dict = dict()
    self.genera = str()
    self.gene_dict = dict()
    self.mismatches = dict()
    self.genus_alleles = dict()
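# A minimal sketch of how the custom outfmt string defined above might be passed to
# BLAST (hypothetical; the database, query, and thread count are placeholders, and
# the class's actual BLAST invocation is not shown here):
from Bio.Blast.Applications import NcbiblastnCommandline

def run_blast(query_file, database, outfmt, report):
    """Run blastn with a custom tabular output format, writing hits to report."""
    blastn = NcbiblastnCommandline(query=query_file,
                                   db=database,
                                   outfmt=outfmt,
                                   num_threads=4,
                                   out=report)
    # The command-line wrapper is callable; stdout/stderr are returned on success
    stdout, stderr = blastn()
    return report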