def run_mash(seqids, output_dir):
    """
    Use MASH to determine the genus of strains when the requested analysis has a genus-specific database
    :param seqids: List of SeqIDs to process
    :param output_dir: Path to the directory in which the MASH screen outputs are to be stored
    :return: dictionary of MASH-calculated genera
    """
    # Dictionary to store the MASH results
    genus_dict = dict()
    # Run mash screen on each of the assemblies
    for seqid in seqids:
        screen_file = os.path.join(output_dir, '{seqid}_screen.tab'.format(seqid=seqid))
        # The assembly is assumed to have been retrieved into output_dir as {seqid}.fasta; adjust this path if the
        # assemblies are stored elsewhere
        fasta = os.path.join(output_dir, '{seqid}.fasta'.format(seqid=seqid))
        mash.screen('/mnt/nas2/databases/confindr/databases/refseq.msh',
                    fasta,
                    threads=8,
                    w='',
                    i='0.95',
                    output_file=screen_file,
                    returncmd=True)
        screen_output = mash.read_mash_screen(screen_file)
        # Determine the genus from the screen output file
        for screen in screen_output:
            # Extract the genus from the mash results
            mash_organism = screen.query_id.split('/')[-3]
            # Populate the dictionary with the seqid, and the calculated genus
            genus_dict[seqid] = mash_organism
    return genus_dict
def find_genus(files, database, threads=12):
    """
    Uses MASH to find the genus of fasta files.
    :param files: File dictionary returned by filer method.
    :param database: Path to reduced refseq database sketch.
    :param threads: Number of threads to run mash with.
    :return: genus_dict: Dictionary of genus for each sample. Will return NA if genus could not be found.
    """
    genus_dict = dict()
    tmpdir = str(time.time()).split('.')[-1]
    if not os.path.isdir(tmpdir):
        os.makedirs(tmpdir)
    for file_name, fasta in files.items():
        mash.screen(database,
                    fasta,
                    threads=threads,
                    w='',
                    i=0.95,
                    output_file=os.path.join(tmpdir, 'screen.tab'))
        screen_output = mash.read_mash_screen(os.path.join(tmpdir, 'screen.tab'))
        try:
            os.remove(os.path.join(tmpdir, 'screen.tab'))
        except IOError:
            pass
        try:
            genus = screen_output[0].query_id.split('/')[-3]
            if genus == 'Shigella':
                genus = 'Escherichia'
            genus_dict[file_name] = genus
        except IndexError:
            genus_dict[file_name] = 'NA'
    shutil.rmtree(tmpdir)
    return genus_dict
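# Example usage sketch for find_genus (hypothetical paths; the file dictionary is assumed to have been produced by
# the filer method referenced in the docstring, and the sketch path mirrors the refseq.msh database used elsewhere
# in this module):
#
#   files = {'2019-SEQ-0001': '/path/to/2019-SEQ-0001.fasta'}
#   genera = find_genus(files=files,
#                       database='/mnt/nas2/databases/confindr/databases/refseq.msh',
#                       threads=12)
#   # genera -> {'2019-SEQ-0001': 'Salmonella'}, or 'NA' if MASH screen returned no hits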
def pointfinder_redmine(redmine_instance, issue, work_dir, description):
    """
    Run PointFinder on the assemblies for the SeqIDs provided in the Redmine issue description, summarise the
    outputs, and upload the zipped results back to the issue.
    """
    sentry_sdk.init(SENTRY_DSN, before_send=before_send)
    # Unpickle Redmine objects
    redmine_instance = pickle.load(open(redmine_instance, 'rb'))
    issue = pickle.load(open(issue, 'rb'))
    description = pickle.load(open(description, 'rb'))
    # Parse description to get list of SeqIDs
    seqids = list()
    for item in description:
        item = item.upper()
        # Minimal check to make sure IDs provided somewhat resemble a valid sample ID
        if not item.isalpha():
            seqids.append(item)
    # Write the SeqIDs to file in the working directory
    with open(os.path.join(work_dir, 'seqid.txt'), 'w') as f:
        for seqid in seqids:
            f.write(seqid + '\n')
    # Drop FASTA files into the working directory
    retrieve_nas_files(seqids=seqids,
                       outdir=work_dir,
                       filetype='fasta',
                       copyflag=False)
    # Create output directory
    output_dir = os.path.join(work_dir, 'output')
    make_path(output_dir)
    # Get all of the FASTA files
    fasta_list = sorted(glob.glob(os.path.join(work_dir, '*.fasta')))
    # Set the folder to store all the PointFinder outputs
    pointfinder_output_dir = os.path.join(work_dir, 'pointfinder_outputs')
    # Initialise dictionaries to store the MASH-calculated, and PointFinder-formatted genus outputs for each strain
    genus_dict = dict()
    organism_dict = dict()
    # Create lists to store missing and unprocessed seqids
    unprocessed_seqs = list()
    missing_seqs = list()
    mash_fails = list()
    # Dictionary to convert the MASH-calculated genus to the PointFinder format
    pointfinder_org_dict = {
        'Campylobacter': 'campylobacter',
        'Escherichia': 'e.coli',
        'Shigella': 'e.coli',
        'Mycobacterium': 'tuberculosis',
        'Neisseria': 'gonorrhoeae',
        'Salmonella': 'salmonella'
    }
    # Reverse look-up dictionary
    rev_org_dict = {
        'campylobacter': 'Campylobacter',
        'e.coli': 'Escherichia',
        'tuberculosis': 'Mycobacterium',
        'gonorrhoeae': 'Neisseria',
        'salmonella': 'Salmonella'
    }
    # Nested dictionary of report headers and summary file paths for each supported genus
    summary_dict = {
        'Salmonella': {
            'prediction': {
                'header': 'Strain,Colitsin,Colistin,Spectinomycin,Quinolones,\n',
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir, 'Salmonella_prediction_summary.csv')
            },
            'table': {
                'header': 'Strain,parE,parC,gyrA,pmrB,pmrA,gyrB,16S_rrsD,23S,\n',
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir, 'Salmonella_table_summary.csv')
            },
            'results': {
                'header': 'Strain,Genus,Mutation,NucleotideChange,AminoAcidChange,Resistance,PMID,\n',
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir, 'PointFinder_results_summary.csv')
            }
        },
        'Escherichia': {
            'prediction': {
                'header': 'Strain,Colistin,GentamicinC,gentamicinC,Streptomycin,Macrolide,Sulfonamide,'
                          'Tobramycin,Neomycin,Fluoroquinolones,Aminocoumarin,Tetracycline,KanamycinA,'
                          'Spectinomycin,B-lactamResistance,Paromomycin,Kasugamicin,Quinolones,G418,'
                          'QuinolonesAndfluoroquinolones,\n',
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir, 'Escherichia_prediction_summary.csv')
            },
            'table': {
                'header': 'Strain,parE,parC,folP,gyrA,pmrB,pmrA,16S_rrsB,16S_rrsH,gyrB,ampC,16S_rrsC,23S,\n',
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir, 'Escherichia_table_summary.csv')
            },
            'results': {
                'header': 'Strain,Genus,Mutation,NucleotideChange,AminoAcidChange,Resistance,PMID,\n',
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir, 'PointFinder_results_summary.csv')
            }
        },
        'Campylobacter': {
            'prediction': {
                'header': 'Strain,LowLevelIncreaseMIC,AssociatedWithT86Mutations,Macrolide,Quinolone,'
                          'Streptinomycin,Erythromycin,IntermediateResistance,HighLevelResistance_'
                          'nalidixic_and_ciprofloxacin,\n',
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir, 'Campylobacter_prediction_summary.csv')
            },
            'table': {
                'header': 'Strain,L22,rpsL,cmeR,gyrA,23S,\n',
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir, 'Campylobacter_table_summary.csv')
            },
            'results': {
                'header': 'Strain,Genus,Mutation,NucleotideChange,AminoAcidChange,Resistance,PMID,\n',
                'output': str(),
                'summary': os.path.join(pointfinder_output_dir, 'PointFinder_results_summary.csv')
            }
        }
    }
    # Run mash screen on each of the assemblies
    for item in fasta_list:
        seqid = os.path.splitext(os.path.basename(item))[0]
        screen_file = os.path.join(output_dir, '{seqid}_screen.tab'.format(seqid=seqid))
        mash.screen('/mnt/nas2/databases/confindr/databases/refseq.msh',
                    item,
                    threads=8,
                    w='',
                    i='0.95',
                    output_file=screen_file,
                    returncmd=True)
        screen_output = mash.read_mash_screen(screen_file)
        # Determine the genus from the screen output file
        for screen in screen_output:
            # Extract the genus from the mash results
            mash_organism = screen.query_id.split('/')[-3]
            # Use the organism as a key in the PointFinder database name conversion dictionary
            try:
                mash_genus = pointfinder_org_dict[mash_organism]
            except KeyError:
                mash_genus = 'NA'
            # Populate the dictionaries with the seqid, and the calculated genus/PointFinder name
            genus_dict[seqid] = mash_genus
            organism_dict[seqid] = mash_organism
    # Delete all of the FASTA files
    for fasta in fasta_list:
        os.remove(fasta)
    # # Delete the output folder
    # shutil.rmtree(output_dir)
    # PointFinder
    # These unfortunate hard-coded paths appear to be necessary
    activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/pointfinder'
    pointfinder_py = '/mnt/nas2/virtual_environments/pointfinder/pointfinder-3.0/pointfinder-3.0.py'
    # Database locations
    pointfinder_db = '/mnt/nas2/databases/assemblydatabases/0.3.4/pointfinder'
    # List of organisms in the PointFinder database
    pointfinder_list = ['campylobacter', 'e.coli', 'tuberculosis', 'gonorrhoeae', 'salmonella']
    try:
        os.mkdir(pointfinder_output_dir)
    except FileExistsError:
        pass
    # PointFinder cannot handle an entire folder of sequences; each sample must be processed independently
    for seqid in sorted(seqids):
        # If the seqid isn't present in the dictionary, it is because the assembly could not be found - or because
        # MASH screen failed
        try:
            # Look up the PointFinder and the MASH-calculated genera
            pointfinder_genus = genus_dict[seqid]
            genus = rev_org_dict[pointfinder_genus]
            # If the genus isn't in the PointFinder database, do not attempt to process it
            if pointfinder_genus in pointfinder_list:
                # Create folder to drop FASTA files
                assembly_folder = os.path.join(work_dir, seqid)
                make_path(assembly_folder)
                # Extract FASTA files
                retrieve_nas_files(seqids=[seqid],
                                   outdir=assembly_folder,
                                   filetype='fasta',
                                   copyflag=False)
                fasta = os.path.join(assembly_folder, '{seqid}.fasta'.format(seqid=seqid))
                # Prepare command
                cmd = 'python {py} -i {fasta} -s {orgn} -p {db} -o {output} -m blastn -m_p {blast_path}' \
                    .format(py=pointfinder_py,
                            fasta=fasta,
                            orgn=pointfinder_genus,
                            db=pointfinder_db,
                            output=pointfinder_output_dir,
                            blast_path='/mnt/nas2/virtual_environments/pointfinder/bin/blastn')
                # Create a shell script to execute within the PointFinder conda environment
                template = "#!/bin/bash\n{} && {}".format(activate, cmd)
                pointfinder_script = os.path.join(work_dir, 'run_pointfinder.sh')
                with open(pointfinder_script, 'w+') as file:
                    file.write(template)
                # Modify the permissions of the script to allow it to be run on the node
                make_executable(pointfinder_script)
                # Run shell script
                os.system(pointfinder_script)
                # Find the PointFinder outputs
                summary_dict[genus]['prediction']['output'] = \
                    glob.glob(os.path.join(pointfinder_output_dir, '{seq}*prediction.txt'.format(seq=seqid)))[0]
                summary_dict[genus]['table']['output'] = \
                    glob.glob(os.path.join(pointfinder_output_dir, '{seq}*table.txt'.format(seq=seqid)))[0]
                summary_dict[genus]['results']['output'] = \
                    glob.glob(os.path.join(pointfinder_output_dir, '{seq}*results.txt'.format(seq=seqid)))[0]
                # Process the predictions
                write_report(summary_dict=summary_dict,
                             seqid=seqid,
                             genus=genus,
                             key='prediction')
                # Process the results summary
                write_report(summary_dict=summary_dict,
                             seqid=seqid,
                             genus=genus,
                             key='results')
                # Process the table summary
                write_table_report(summary_dict=summary_dict,
                                   seqid=seqid,
                                   genus=genus)
            else:
                unprocessed_seqs.append(seqid)
        except KeyError:
            if not os.path.isfile(os.path.join(output_dir, '{seq}_screen.tab'.format(seq=seqid))):
                missing_seqs.append(seqid)
            else:
                mash_fails.append(seqid)
    # Attempt to clear out the tmp folder from the pointfinder_output_dir
    try:
        shutil.rmtree(os.path.join(pointfinder_output_dir, 'tmp'))
    except FileNotFoundError:
        pass
    # Zip output
    output_filename = 'pointfinder_output'
    zip_filepath = zip_folder(results_path=pointfinder_output_dir,
                              output_dir=work_dir,
                              output_filename=output_filename)
    zip_filepath += '.zip'
    # Prepare upload
    output_list = [
        {
            'filename': os.path.basename(zip_filepath),
            'path': zip_filepath
        }
    ]
    # Create a note to add to the updated Redmine issue
    notes = 'PointFinder process complete!'
    # If there are missing, or unprocessed sequences, add details to the note
    if unprocessed_seqs:
        seq_list = list()
        for sequence in unprocessed_seqs:
            seq_list.append('{seqid} ({organism})'.format(seqid=sequence,
                                                          organism=organism_dict[sequence]))
        if len(unprocessed_seqs) > 1:
            notes += '\n The following sequences were not processed, as they were determined to be genera not ' \
                     'present in the PointFinder database: {seqs}'.format(seqs=', '.join(seq_list))
        else:
            notes += '\n The following sequence was not processed, as it was determined to be a genus not ' \
                     'present in the PointFinder database: {seqs}'.format(seqs=', '.join(seq_list))
    if missing_seqs:
        if len(missing_seqs) > 1:
            notes += '\n The following sequences were not processed, as they could not be located in the strain ' \
                     'database: {seqs}'.format(seqs=', '.join(missing_seqs))
        else:
            notes += '\n The following sequence was not processed, as it could not be located in the strain ' \
                     'database: {seqs}'.format(seqs=', '.join(missing_seqs))
    if mash_fails:
        if len(mash_fails) > 1:
            notes += '\n The following sequences could not be processed by MASH screen: {seqs}' \
                .format(seqs=', '.join(mash_fails))
        else:
            notes += '\n The following sequence could not be processed by MASH screen: {seqs}' \
                .format(seqs=', '.join(mash_fails))
    # Create a list of all the folders - will be used to clean up the working directory
    folders = glob.glob(os.path.join(work_dir, '*/'))
    # Remove all the folders
    for folder in folders:
        if os.path.isdir(folder):
            shutil.rmtree(folder)
    # Wrap up issue
    redmine_instance.issue.update(resource_id=issue.id,
                                  uploads=output_list,
                                  status_id=4,
                                  notes=notes)
def staramr_redmine(redmine_instance, issue, work_dir, description):
    """
    Run StarAMR on the assemblies for the SeqIDs provided in the Redmine issue description, and upload the zipped
    results back to the issue.
    """
    # Unpickle Redmine objects
    redmine_instance = pickle.load(open(redmine_instance, 'rb'))
    issue = pickle.load(open(issue, 'rb'))
    description = pickle.load(open(description, 'rb'))
    # Parse description to get list of SeqIDs
    seqids = list()
    for item in description:
        item = item.upper()
        # Minimal check to make sure IDs provided somewhat resemble a valid sample ID
        if not item.isalpha():
            seqids.append(item)
    # Write the SeqIDs to file in the working directory
    with open(os.path.join(work_dir, 'seqid.txt'), 'w') as f:
        for seqid in seqids:
            f.write(seqid + '\n')
    # Drop FASTA files into the working directory
    retrieve_nas_files(seqids=seqids,
                       outdir=work_dir,
                       filetype='fasta',
                       copyflag=False)
    # Create output directory
    output_dir = os.path.join(work_dir, 'output')
    make_path(output_dir)
    # Get all of the FASTA files
    fasta_list = sorted(glob.glob(os.path.join(work_dir, '*.fasta')))
    # Set the folder to store all the StarAMR outputs
    staramr_output_dir = os.path.join(work_dir, 'staramr_outputs')
    # Initialise dictionaries to store the MASH-calculated, and PointFinder-formatted genus outputs for each strain
    genus_dict = dict()
    organism_dict = dict()
    # Create lists to store missing and unprocessed seqids
    unprocessed_seqs = list()
    missing_seqs = list()
    mash_fails = list()
    # Dictionary to convert the MASH-calculated genus to the PointFinder format
    pointfinder_org_dict = {
        'Campylobacter': 'campylobacter',
        'Escherichia': 'e.coli',
        'Shigella': 'e.coli',
        'Mycobacterium': 'tuberculosis',
        'Neisseria': 'gonorrhoeae',
        'Salmonella': 'salmonella'
    }
    # Reverse look-up dictionary
    rev_org_dict = {
        'campylobacter': 'Campylobacter',
        'e.coli': 'Escherichia',
        'tuberculosis': 'Mycobacterium',
        'gonorrhoeae': 'Neisseria',
        'salmonella': 'Salmonella'
    }
    # Run mash screen on each of the assemblies
    for item in fasta_list:
        seqid = os.path.splitext(os.path.basename(item))[0]
        screen_file = os.path.join(output_dir, '{seqid}_screen.tab'.format(seqid=seqid))
        mash.screen('/mnt/nas2/databases/confindr/databases/refseq.msh',
                    item,
                    threads=8,
                    w='',
                    i='0.95',
                    output_file=screen_file,
                    returncmd=True)
        screen_output = mash.read_mash_screen(screen_file)
        # Determine the genus from the screen output file
        for screen in screen_output:
            # Extract the genus from the mash results
            mash_organism = screen.query_id.split('/')[-3]
            # Use the organism as a key in the PointFinder database name conversion dictionary
            try:
                mash_genus = pointfinder_org_dict[mash_organism]
            except KeyError:
                mash_genus = 'NA'
            # Populate the dictionaries with the seqid, and the calculated genus/PointFinder name
            genus_dict[seqid] = mash_genus
            organism_dict[seqid] = mash_organism
    # Delete all of the FASTA files
    for fasta in fasta_list:
        os.remove(fasta)
    # # Delete the output folder
    # shutil.rmtree(output_dir)
    # StarAMR
    # These unfortunate hard-coded paths appear to be necessary
    activate = 'source /home/ubuntu/miniconda3/bin/activate /mnt/nas2/virtual_environments/staramr'
    staramr_py = '/mnt/nas2/virtual_environments/staramr/bin/staramr'
    # List of organisms supported by the StarAMR PointFinder database
    staramr_list = ['campylobacter', 'salmonella']
    try:
        os.mkdir(staramr_output_dir)
    except FileExistsError:
        pass
    # Group the SeqIDs by their MASH-calculated genus
    genus_seqid_dict = dict()
    for seqid in sorted(seqids):
        try:
            seqid_genus = genus_dict[seqid]
            if seqid_genus not in genus_seqid_dict:
                genus_seqid_dict[seqid_genus] = [seqid]
            else:
                genus_seqid_dict[seqid_genus].append(seqid)
        except KeyError:
            # MASH sometimes doesn't find a genus!
            mash_fails.append(seqid)
    for genus in genus_seqid_dict:
        if genus in staramr_list:
            # Create folder to drop FASTA files
            assembly_folder = os.path.join(work_dir, genus)
            make_path(assembly_folder)
            retrieve_nas_files(seqids=genus_seqid_dict[genus],
                               outdir=assembly_folder,
                               filetype='fasta',
                               copyflag=False)
            fastas = sorted(glob.glob(os.path.join(assembly_folder, '*.fasta')))
            outdir = os.path.join(staramr_output_dir, genus)
            # Prepare command
            cmd = '{py} search --pointfinder-organism {orgn} -o {output} '.format(py=staramr_py,
                                                                                  orgn=genus,
                                                                                  output=outdir)
            for fasta in fastas:
                cmd += fasta + ' '
            # Create a shell script to execute within the StarAMR conda environment
            template = "#!/bin/bash\n{} && {}".format(activate, cmd)
            staramr_script = os.path.join(work_dir, 'run_staramr.sh')
            with open(staramr_script, 'w+') as f:
                f.write(template)
            # Modify the permissions of the script to allow it to be run on the node
            make_executable(staramr_script)
            # Run shell script
            os.system(staramr_script)
        else:
            for seqid in genus_seqid_dict[genus]:
                unprocessed_seqs.append(seqid)
    # Zip output
    output_filename = 'staramr_output'
    zip_filepath = zip_folder(results_path=staramr_output_dir,
                              output_dir=work_dir,
                              output_filename=output_filename)
    zip_filepath += '.zip'
    # Prepare upload
    output_list = [
        {
            'filename': os.path.basename(zip_filepath),
            'path': zip_filepath
        }
    ]
    # Create a note to add to the updated Redmine issue
    notes = 'StarAMR process complete!'
    # If there are missing, or unprocessed sequences, add details to the note
    if unprocessed_seqs:
        seq_list = list()
        for sequence in unprocessed_seqs:
            seq_list.append('{seqid} ({organism})'.format(seqid=sequence,
                                                          organism=organism_dict[sequence]))
        if len(unprocessed_seqs) > 1:
            notes += '\n The following sequences were not processed, as they were determined to be genera not ' \
                     'present in the StarAMR database: {seqs}'.format(seqs=', '.join(seq_list))
        else:
            notes += '\n The following sequence was not processed, as it was determined to be a genus not ' \
                     'present in the StarAMR database: {seqs}'.format(seqs=', '.join(seq_list))
    if missing_seqs:
        if len(missing_seqs) > 1:
            notes += '\n The following sequences were not processed, as they could not be located in the strain ' \
                     'database: {seqs}'.format(seqs=', '.join(missing_seqs))
        else:
            notes += '\n The following sequence was not processed, as it could not be located in the strain ' \
                     'database: {seqs}'.format(seqs=', '.join(missing_seqs))
    if mash_fails:
        if len(mash_fails) > 1:
            notes += '\n The following sequences could not be processed by MASH screen: {seqs}' \
                .format(seqs=', '.join(mash_fails))
        else:
            notes += '\n The following sequence could not be processed by MASH screen: {seqs}' \
                .format(seqs=', '.join(mash_fails))
    # Create a list of all the folders - will be used to clean up the working directory
    folders = glob.glob(os.path.join(work_dir, '*/'))
    # Remove all the folders
    for folder in folders:
        if os.path.isdir(folder):
            shutil.rmtree(folder)
    # Wrap up issue
    redmine_instance.issue.update(resource_id=issue.id,
                                  uploads=output_list,
                                  status_id=4,
                                  notes=notes)
def mash_for_potential_plasmids(plasmid_db, forward_reads, output_dir, reverse_reads=None, threads=1, logfile=None,
                                identity_cutoff=0.95):
    """
    Uses mash to find a list of potential plasmids in a set of forward (and optionally reverse) reads.
    :param plasmid_db: Path to a multi-Fasta-formatted file that has plasmid sequences of interest.
    :param forward_reads: Path to forward reads.
    :param output_dir: Path to output directory where mash sketch/screen result file will be stored.
    :param reverse_reads: Path to reverse reads. If not specified, things will work in unpaired mode.
    :param threads: Number of threads to run mash analyses on.
    :param logfile: Path to logfile you want to use.
    :param identity_cutoff: Mash screen identity cutoff. Values lower than this won't be reported.
    :return: potential_plasmids: A list where each entry is a putatively present plasmid, identified by the
    fasta header.
    """
    potential_plasmids = list()
    # Make sure the output dir specified gets created if it doesn't exist.
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    # Make a sketch of the plasmid db.
    out, err = mash.sketch(plasmid_db,
                           output_sketch=os.path.join(output_dir, 'plasmid_sketch.msh'),
                           threads=threads,
                           i='')
    if logfile:
        accessoryFunctions.write_to_logfile(out, err, logfile)
    # Now it's time to use mash screen to try to figure out what plasmids might be present in our sample.
    if reverse_reads:
        # As usual, do things slightly differently for paired vs unpaired reads.
        out, err = mash.screen(os.path.join(output_dir, 'plasmid_sketch.msh'),
                               forward_reads,
                               reverse_reads,
                               output_file=os.path.join(output_dir, 'screen_results.tsv'),
                               threads=threads,
                               i=identity_cutoff)
        if logfile:
            accessoryFunctions.write_to_logfile(out, err, logfile)
    else:
        # Unpaired read mode.
        out, err = mash.screen(os.path.join(output_dir, 'plasmid_sketch.msh'),
                               forward_reads,
                               output_file=os.path.join(output_dir, 'screen_results.tsv'),
                               threads=threads,
                               i=identity_cutoff)
        if logfile:
            accessoryFunctions.write_to_logfile(out, err, logfile)
    # Now need to read through the list of potential plasmids generated by the mash screen.
    results = mash.read_mash_screen(screen_result=os.path.join(output_dir, 'screen_results.tsv'))
    for item in results:
        potential_plasmids.append(item.query_id)
    return potential_plasmids
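# Example usage sketch for mash_for_potential_plasmids (illustrative paths only; the plasmid database, read files,
# and output locations below are hypothetical):
#
#   hits = mash_for_potential_plasmids(plasmid_db='plasmid_database.fasta',
#                                      forward_reads='sample_R1.fastq.gz',
#                                      reverse_reads='sample_R2.fastq.gz',
#                                      output_dir='mash_output',
#                                      threads=4,
#                                      logfile='mash.log')
#   # hits is a list of plasmid FASTA headers with screen identity >= 0.95 (the default cutoff)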