def tblastn(input_fasta, blastdb, min_ident, min_covs, evalue, out_dir, blast_results_file, logging, num_threads=1, min_covhsp=25, seq_id_file=None): blast_runner = BlastRunner(input_fasta, out_dir) blast_runner.run_tblastn(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb, db_type='protein', min_cov=min_covs, min_ident=min_ident, evalue=evalue, blast_outfile=blast_results_file, num_threads=num_threads, seq_id_file=seq_id_file, logging=logging) if os.path.getsize(blast_results_file) == 0: os.remove(blast_results_file) return False blast_df = BlastReader(blast_results_file, logging).df blast_df = blast_df.loc[blast_df['pident'] >= min_ident] blast_df = blast_df.loc[blast_df['qcovs'] >= min_covs] blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_covhsp] blast_df = blast_df.loc[blast_df['evalue'] <= evalue] blast_df = fixStart(blast_df) blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False]) blast_df = blast_df.reset_index(drop=True) blast_df.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False) return True
def blastn(input_fasta, blastdb, min_ident, min_cov, evalue, min_length, out_dir, blast_results_file, logging, seq_filterfile=None, num_threads=1, max_length=400000, min_hsp_cov=1): blast_runner = BlastRunner(input_fasta, out_dir) blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb, db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue, blast_outfile=blast_results_file, logging=logging, num_threads=num_threads, word_size=11, seq_id_file=seq_filterfile) if os.path.getsize(blast_results_file) == 0: os.remove(blast_results_file) return False blast_df = BlastReader(blast_results_file, logging).df blast_df = blast_df.loc[blast_df['length'] >= min_length] blast_df = blast_df.loc[blast_df['qlen'] <= max_length] blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov] blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_hsp_cov] blast_df = blast_df.loc[blast_df['evalue'] <= evalue] blast_df = blast_df.loc[blast_df['pident'] >= min_ident] blast_df = blast_df.reset_index(drop=True) blast_df = fixStart(blast_df) blast_df.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False) return True
def repetitive_blast(input_fasta, ref_db, min_ident, min_cov, evalue, min_length, tmp_dir, blast_results_file, num_threads=1): blast_runner = BlastRunner(input_fasta, tmp_dir) #blast_runner.makeblastdb(ref_db, 'nucl') blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=ref_db, db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue, blast_outfile=blast_results_file, num_threads=num_threads) if os.path.getsize(blast_results_file) == 0: return dict() blast_df = BlastReader(blast_results_file).df blast_df = blast_df.loc[blast_df['length'] >= min_length] blast_df = blast_df.loc[blast_df['pident'] >= min_ident] blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov] blast_df = blast_df.loc[blast_df['qcovhsp'] >= 25] blast_df = fixStart(blast_df) blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False]) blast_df = blast_df.reset_index(drop=True) contig_list = dict() for index, row in blast_df.iterrows(): if not row['qseqid'] in contig_list: contig_list[row['qseqid']] = { 'id': row['sseqid'], 'score': row['bitscore'], 'contig_start': row['sstart'], 'contig_end': row['send'] } else: if contig_list[row['qseqid']]['score'] > row['bitscore']: contig_list[row['qseqid']] = { 'id': row['sseqid'], 'score': row['bitscore'], 'contig_start': row['sstart'], 'contig_end': row['send'] } return contig_list
def main(): logging = init_console_logger(2) logging.info('Initilizating databases...this will take some time') #Find available threads and use the maximum number available for mash sketch but cap it at 32 num_threads = multiprocessing.cpu_count() if num_threads > 32: num_threads = 32 database_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)),'databases/') zip_file = os.path.join(database_directory,'data.zip') plasmid_database_fasta_file = os.path.join(database_directory,'ncbi_plasmid_full_seqs.fas') repetitive_fasta_file = os.path.join(database_directory,'repetitive.dna.fas') mash_db_file = os.path.join(database_directory,'ncbi_plasmid_full_seqs.fas.msh') download_to_file('https://ndownloader.figshare.com/articles/5841882?private_link=a4c92dd84f17b2cefea6',zip_file) extract(zip_file,database_directory) os.remove(zip_file) files = [f for f in listdir(database_directory) if isfile(join(database_directory, f))] for file in files: if file.endswith('gz'): extract(os.path.join(database_directory,file), database_directory) #Initilize blast and mash daatabases blast_runner = BlastRunner(repetitive_fasta_file, database_directory) blast_runner.makeblastdb(repetitive_fasta_file, 'nucl') blast_runner = BlastRunner(plasmid_database_fasta_file, database_directory) blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl') mObj = mash() mObj.mashsketch(plasmid_database_fasta_file,mash_db_file,num_threads=num_threads) status_file = os.path.join(database_directory,'status.txt') with gzip.open(status_file, 'w') as f: f.write("Download date: {}".format(datetime.datetime.today().strftime('%Y-%m-%d'))) f.close()
def contig_blast(input_fasta, plasmid_db, min_ident, min_cov, evalue, min_length, tmp_dir, blast_results_file, num_threads=1): blast_runner = None filtered_blast = os.path.join(tmp_dir, 'filtered_blast.txt') blast_runner = BlastRunner(input_fasta, tmp_dir) blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=plasmid_db, db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue, blast_outfile=blast_results_file, num_threads=num_threads, word_size=11) if os.path.getsize(blast_results_file) == 0: fh = open(filtered_blast, 'w', encoding="utf-8") fh.write('') fh.close() return dict() blast_df = BlastReader(blast_results_file).df blast_df = blast_df.loc[blast_df['length'] >= min_length] blast_df = blast_df.loc[blast_df['qlen'] <= 400000] blast_df = blast_df.loc[blast_df['qlen'] >= min_length] blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov] blast_df = blast_df.reset_index(drop=True) blast_df.to_csv(filtered_blast, sep='\t', header=False, line_terminator='\n', index=False)
def mob_blast(input_fasta, ref_db, min_ident, min_cov, evalue, tmp_dir, blast_results_file, overlap=5, num_threads=1): num_threads = 1 blast_runner = BlastRunner(input_fasta, tmp_dir) blast_runner.makeblastdb(ref_db, 'nucl') blast_runner.run_tblastn(query_fasta_path=input_fasta, blast_task='megablast', db_path=ref_db, db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue, blast_outfile=blast_results_file, num_threads=num_threads) if os.path.getsize(blast_results_file) == 0: return dict() blast_df = BlastReader(blast_results_file).df blast_df = blast_df.loc[blast_df['pident'] >= min_ident] blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov] blast_df = blast_df.loc[blast_df['qcovhsp'] >= 25] blast_df = fixStart(blast_df) blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False]) blast_df = blast_df.reset_index(drop=True) blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid', 'sstart', 'send', 'bitscore') prev_size = 0 size = str(len(blast_df)) while size != prev_size: blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid', 'sstart', 'send', 'bitscore') prev_size = size size = str(len(blast_df)) #print(blast_df) return blast_df
def run_blast(self, input_fasta, output_path, blast_results_file, logging, min_cov=1, min_ident=1, evalue=1, num_threads=1, min_length=25): blast_runner = BlastRunner(input_fasta, output_path) blast_runner.makeblastdb(input_fasta, 'nucl', logging) blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=input_fasta, db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue, blast_outfile=blast_results_file, num_threads=num_threads, word_size=11, logging=logging) if os.path.getsize(blast_results_file) == 0: fh = open(blast_results_file, 'w', encoding="utf-8") fh.write('') fh.close() return dict() blast_df = BlastReader(blast_results_file, logging).df blast_df = blast_df.loc[blast_df['length'] >= min_length] blast_df = blast_df.reset_index(drop=True) blast_df.to_csv(blast_results_file, sep='\t', header=False, line_terminator='\n', index=False)
def main(): args = arguments() database_directory = os.path.abspath(args.database_directory) if os.path.exists(database_directory) == False: os.mkdir(database_directory) else: logger.info("Database directory folder already exists at {}".format( database_directory)) # Helper function to simplify adding database_directory to everything prepend_db_dir = functools.partial(os.path.join, database_directory) lockfilepath = os.path.join(database_directory, ".lock") status_file = prepend_db_dir('status.txt') if os.path.exists(lockfilepath) == False: try: open(file=lockfilepath, mode="w").close() logger.info("Placed lock file at {}".format(lockfilepath)) except Exception as e: logger.error( "Failed to place a lock file at {}. Database diretory can not be accessed. Wrong path?" .format(lockfilepath)) logger.error("{}".format(e)) pass else: while os.path.exists(lockfilepath): elapsed_time = time.time() - os.path.getmtime(lockfilepath) logger.info( "Lock file found at {}. Waiting for other processes to finish database init ..." .format(lockfilepath)) logger.info( "Elapsed time {} min. Will continue processing after 16 min mark." .format(int(elapsed_time / 60))) if elapsed_time >= 1000: logger.info( "Elapsed time {} min. Assuming previous process completed all init steps. Continue ..." .format(int(elapsed_time / 60))) try: #if previous process failed, no processes are running and > 16 min passed since the lock was created os.remove(lockfilepath) except: #continue if file was removed by other process pass break time.sleep(60) #recheck every 1 min if lock file was removed logger.info( "Lock file no longer exists. Assuming init process completed successfully" ) return 0 logger.info('Initializing databases...this will take some time') # Find available threads and use the maximum number available for mash sketch but cap it at 32 num_threads = min(multiprocessing.cpu_count(), 32) if not os.path.exists(database_directory): os.makedirs(database_directory) zip_file = prepend_db_dir('data.tar.gz') plasmid_database_fasta_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas') repetitive_fasta_file = prepend_db_dir('repetitive.dna.fas') mash_db_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas.msh') logger.info('Downloading databases...this will take some time') for db_mirror in config['db_mirrors']: try: logger.info('Trying mirror {}'.format(db_mirror)) download_to_file(db_mirror, zip_file) break except Exception as e: logger.error( "Download failed with error {}. Removing lock file".format( str(e))) os.remove(lockfilepath) sys.exit(-1) logger.info( "Downloading databases successful, now building databases at {}". format(database_directory)) extract(zip_file, database_directory) files = [ prepend_db_dir(f) for f in os.listdir(database_directory) if f.endswith('.gz') ] for file in files: extract(file, database_directory) #Initialize blast and mash databases try: logger.info('Building repetitive mask database') blast_runner = BlastRunner(repetitive_fasta_file, database_directory) blast_runner.makeblastdb(repetitive_fasta_file, 'nucl', logger) logger.info('Building complete plasmid database') blast_runner = BlastRunner(plasmid_database_fasta_file, database_directory) blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl', logger, True) logger.info('Sketching complete plasmid database') mObj = mash() mObj.mashsketch(plasmid_database_fasta_file, mash_db_file, num_threads=num_threads) except Exception as e: logger.error( 'Downloading databases failed, please check your internet connection and retry' ) logger.error( "Process failed with error {}. Removing lock file".format(e)) os.remove(lockfilepath) sys.exit(-1) try: logger.info("Init ete3 library ...") ete3taxadbpath = os.path.abspath( os.path.join(database_directory, "taxa.sqlite")) ncbi = NCBITaxa() ncbi.dbfile = ete3taxadbpath ncbi.update_taxonomy_database() except Exception as e: logger.error( "Init of ete3 library failed with error {}. Removing lock file". format(e)) os.remove(lockfilepath) sys.exit(-1) try: os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz")) logger.info( "Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job." ) except: pass with open(status_file, 'w') as f: download_date = datetime.datetime.today().strftime('%Y-%m-%d') f.write("Download date: {}. Removing lock file.".format(download_date)) try: os.remove(lockfilepath) except: logger.warning( "Lock file is already removed by some other process.") pass logger.info("MOB init completed successfully") return 0
def main(): args = parse_args() logging = init_console_logger(3) logging.info('Running Mob-Suite Clustering toolkit v. {}'.format(__version__)) logging.info('Processing fasta file {}'.format(args.infile)) logging.info('Analysis directory {}'.format(args.outdir)) input_fasta = args.infile if not os.path.isfile(input_fasta): logging.error('Error, input fasta specified does not exist: {}'.format(input_fasta )) sys.exit() out_dir = args.outdir num_threads = args.num_threads if not os.path.isdir(out_dir): os.mkdir(out_dir, 0o755) tmp_dir = os.path.join(out_dir, '__tmp') if not os.path.isdir(tmp_dir): os.mkdir(tmp_dir, 0o755) mode = str(args.mode).lower() if mode not in ('update','build'): logging.error('Error you have not entered a valid mode of build or update, you entered: {}'.format(mode)) print(('Error you have not entered a valid mode of build or update, you entered: {}'.format(mode))) sys.exit() header = ('id', 0.05, 0.0001) tmp_cluster_file = os.path.join(out_dir, 'clusters.txt') tmp_ref_fasta_file = os.path.join(tmp_dir, 'references_tmp.fasta') update_fasta = os.path.join(out_dir, 'references_updated.fasta') if mode == 'update': if args.ref_cluster_file is None: logging.error('Reference fasta file must be specified, please check help for parameter reference') sys.exit() ref_fasta = args.ref_fasta_file if not os.path.isfile(ref_fasta ): logging.error('Reference fasta file specified does not exist: {}'.format(ref_fasta)) sys.exit() if args.ref_cluster_file is None: logging.error('Reference cluster file must be specified, please check help for parameter reference') sys.exit() ref_cluster_file = args.ref_cluster_file if not os.path.isfile(ref_cluster_file): logging.error('Reference cluster file specified does not exist: {}'.format(ref_cluster_file)) sys.exit() if args.ref_mash_db is None: logging.error('Reference mash sketch file must be specified, please check help for parameter reference') sys.exit() ref_mash_db = args.ref_mash_db if not os.path.isfile(ref_mash_db): logging.error('Reference mash file specified does not exist: {}'.format(ref_mash_db)) sys.exit() logging.info('Running mob-cluster in update mode with input file: {}'.format(input_fasta)) logging.info('Running mob-cluster in update mode with output directory: {}'.format(out_dir)) logging.info('Running mob-cluster in update mode on reference fasta file: {}'.format(ref_fasta)) logging.info('Reading previous cluster reference assignments from : {}'.format(ref_cluster_file)) shutil.copy(ref_cluster_file, tmp_cluster_file) shutil.copy(ref_fasta, tmp_ref_fasta_file) update_existing(input_fasta, tmp_dir, ref_mash_db, tmp_cluster_file, header, tmp_ref_fasta_file, update_fasta) if args.overwrite: shutil.move(update_fasta,ref_fasta) shutil.move(tmp_cluster_file,ref_cluster_file) mash_db_file = "{}.msh".format(input_fasta) mObj = mash() mObj.mashsketch(input_fasta, mash_db_file, num_threads=num_threads) blast_runner = BlastRunner(ref_fasta, '') blast_runner.makeblastdb(ref_fasta, 'nucl') else: mashObj = mash() mashObj.mashsketch(input_fasta,input_fasta+".msh",num_threads=num_threads) distance_matrix_file = os.path.join(tmp_dir,'mash_dist_matrix.txt') mashfile_handle = open(distance_matrix_file,'w') mashObj.run_mash(input_fasta+'.msh', input_fasta+'.msh', mashfile_handle,table=True,num_threads=num_threads) clust_assignments = build_cluster_db(distance_matrix_file, (0.05, 0.0001)) writeClusterAssignments(tmp_cluster_file, header, clust_assignments) clust_dict = selectCluster(clust_assignments, 1) shutil.copy(input_fasta, tmp_ref_fasta_file) updateFastaFile(tmp_ref_fasta_file ,update_fasta, clust_dict)
def main(): default_database_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'databases') parser = argparse.ArgumentParser() parser.add_argument('-d', '--database_directory', default=default_database_dir, help='Directory to download databases to. Defaults to {}'.format(default_database_dir)) args = parser.parse_args() logging = init_console_logger(2) logging.info('Initilizating databases...this will take some time') #Find available threads and use the maximum number available for mash sketch but cap it at 32 num_threads = multiprocessing.cpu_count() if num_threads > 32: num_threads = 32 # For some reason absolute paths don't work - enforce absolute path. database_directory = os.path.abspath(args.database_directory) if not os.path.exists(database_directory): os.makedirs(database_directory) zip_file = os.path.join(database_directory,'data.zip') plasmid_database_fasta_file = os.path.join(database_directory,'ncbi_plasmid_full_seqs.fas') repetitive_fasta_file = os.path.join(database_directory,'repetitive.dna.fas') mash_db_file = os.path.join(database_directory,'ncbi_plasmid_full_seqs.fas.msh') logging.info('Downloading databases...this will take some time') db_mirrors = ['https://share.corefacility.ca/index.php/s/oeufkw5HyKz0X5I/download', 'https://ndownloader.figshare.com/articles/5841882/versions/1'] for db_mirror in db_mirrors: logging.info('Trying mirror {}'.format(db_mirror)) download_to_file(db_mirror, zip_file) if os.path.exists(zip_file) and os.path.getsize(zip_file) > 50000: break #do not try other mirror if (not os.path.isfile(zip_file)): logging.error('Downloading databases failed, please check your internet connection and retry') sys.exit(-1) else: logging.info('Downloading databases successful, now building databases') extract(zip_file,database_directory) os.remove(zip_file) files = [f for f in listdir(database_directory) if isfile(join(database_directory, f))] for file in files: if file.endswith('gz'): extract(os.path.join(database_directory,file), database_directory) #Initilize blast and mash daatabases logging.info('Building repetive mask database') blast_runner = BlastRunner(repetitive_fasta_file, database_directory) blast_runner.makeblastdb(repetitive_fasta_file, 'nucl') logging.info('Building complete plasmid database') blast_runner = BlastRunner(plasmid_database_fasta_file, database_directory) blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl') logging.info('Sketching complete plasmid database') mObj = mash() mObj.mashsketch(plasmid_database_fasta_file,mash_db_file,num_threads=num_threads) status_file = os.path.join(database_directory,'status.txt') with open(status_file, 'w') as f: f.write("Download date: {}".format(datetime.datetime.today().strftime('%Y-%m-%d'))) f.close()
def main(): args = parse_args() if args.debug: logger = init_console_logger(3) else: logger = init_console_logger(2) logger.info('Running Mob-typer version {}'.format(__version__)) logger.info('Processing fasta file {}'.format(args.infile)) if not os.path.isfile(args.infile): logger.info('Error, fasta file does not exist {}'.format(args.infile)) sys.exit() if not args.analysis_dir: tmp_dir = tempfile.TemporaryDirectory(dir=tempfile.gettempdir()).name else: tmp_dir = args.analysis_dir if not os.path.isdir(tmp_dir): os.mkdir(tmp_dir, 0o755) if not isinstance(args.num_threads, int): logger.info( 'Error number of threads must be an integer, you specified "{}"'. format(args.num_threads)) database_dir = os.path.abspath(args.database_directory) if args.sample_id is None: sample_id = re.sub(r"\.(fasta|fa|fas){1,1}", "", os.path.basename(args.infile)) else: sample_id = args.sample_id # Script arguments input_fasta = args.infile report_file = args.out_file num_threads = int(args.num_threads) keep_tmp = args.keep_tmp if args.multi: multi = True else: multi = False if not (args.primary_cluster_dist >= 0 and args.primary_cluster_dist <= 1): logging.error( 'Error distance thresholds must be between 0 - 1: {}'.format( args.primary_cluster_dist)) sys.exit() else: primary_distance = float(args.primary_cluster_dist) if not (args.secondary_cluster_dist >= 0 and args.secondary_cluster_dist <= 1): logging.error( 'Error distance thresholds must be between 0 - 1: {}'.format( args.secondary_cluster_dist)) sys.exit() else: secondary_distance = float(args.secondary_cluster_dist) if database_dir == default_database_dir: mob_ref = args.plasmid_mob mash_db = args.plasmid_mash_db replicon_ref = args.plasmid_replicons plasmid_meta = args.plasmid_meta mpf_ref = args.plasmid_mpf plasmid_orit = args.plasmid_orit verify_init(logger, database_dir) else: mob_ref = os.path.join(database_dir, 'mob.proteins.faa') mash_db = os.path.join(database_dir, 'ncbi_plasmid_full_seqs.fas.msh') replicon_ref = os.path.join(database_dir, 'rep.dna.fas') plasmid_meta = os.path.join(database_dir, 'clusters.txt') mpf_ref = os.path.join(database_dir, 'mpf.proteins.faa') plasmid_orit = os.path.join(database_dir, 'orit.fas') LIT_PLASMID_TAXONOMY_FILE = os.path.join( database_dir, "host_range_literature_plasmidDB.txt") NCBI_PLASMID_TAXONOMY_FILE = plasmid_meta fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta') replicon_blast_results = os.path.join(tmp_dir, 'replicon_blast_results.txt') mob_blast_results = os.path.join(tmp_dir, 'mobtyper_blast_results.txt') mpf_blast_results = os.path.join(tmp_dir, 'mpf_blast_results.txt') orit_blast_results = os.path.join(tmp_dir, 'orit_blast_results.txt') repetitive_blast_results = os.path.join(tmp_dir, 'repetitive_blast_results.txt') if os.path.isfile(mob_blast_results): os.remove(mob_blast_results) if os.path.isfile(mpf_blast_results): os.remove(mpf_blast_results) if os.path.isfile(orit_blast_results): os.remove(orit_blast_results) if os.path.isfile(replicon_blast_results): os.remove(replicon_blast_results) # Input numeric params min_rep_ident = float(args.min_rep_ident) min_mob_ident = float(args.min_mob_ident) min_ori_ident = float(args.min_rep_ident) min_mpf_ident = float(args.min_mob_ident) idents = { 'min_rep_ident': min_rep_ident, 'min_mob_ident': min_mob_ident, 'min_ori_ident': min_ori_ident } for param in idents: value = float(idents[param]) if value < 60: logger.error( "Error: {} is too low, please specify an integer between 70 - 100" .format(param)) sys.exit(-1) if value > 100: logger.error( "Error: {} is too high, please specify an integer between 70 - 100" .format(param)) sys.exit(-1) min_rep_cov = float(args.min_rep_cov) min_mob_cov = float(args.min_mob_cov) min_ori_cov = float(args.min_rep_cov) min_mpf_cov = float(args.min_mob_cov) covs = { 'min_rep_cov': min_rep_cov, 'min_mob_cov': min_mob_cov, 'min_con_cov': min_ori_cov, 'min_rpp_cov': min_ori_cov } for param in covs: value = float(covs[param]) if value < 60: logger.error( "Error: {} is too low, please specify an integer between 50 - 100" .format(param)) sys.exit(-1) if value > 100: logger.error( "Error: {} is too high, please specify an integer between 50 - 100" .format(param)) sys.exit(-1) min_rep_evalue = float(args.min_rep_evalue) min_mob_evalue = float(args.min_mob_evalue) min_ori_evalue = float(args.min_rep_evalue) min_mpf_evalue = float(args.min_mob_evalue) evalues = { 'min_rep_evalue': min_rep_evalue, 'min_mob_evalue': min_mob_evalue, 'min_con_evalue': min_ori_evalue } for param in evalues: value = float(evalues[param]) if value > 1: logger.error( "Error: {} is too high, please specify an float evalue between 0 to 1" .format(param)) sys.exit(-1) check_dependencies(logger) needed_dbs = [replicon_ref, mob_ref, mash_db, mpf_ref] for db in needed_dbs: if (not os.path.isfile(db)): logger.info('Warning! Needed database missing "{}"'.format(db)) mob_suite.mob_init.main() if not os.path.isdir(tmp_dir): os.mkdir(tmp_dir, 0o755) # Test that ETE3 db is ok and lock process check dbstatus = ETE3_db_status_check(1, ETE3_LOCK_FILE, ETE3DBTAXAFILE, logging) if dbstatus == False: logging.error( "Exiting due to lock file not removed: {}".format(ETE3_LOCK_FILE)) sys.exit(-1) # Get cluster information reference_sequence_meta = read_sequence_info(plasmid_meta, MOB_CLUSTER_INFO_HEADER) # initilize master record tracking fix_fasta_header(input_fasta, fixed_fasta) contig_seqs = read_fasta_dict(fixed_fasta) contig_info = {} for id in contig_seqs: seq = contig_seqs[id] contig_info[id] = {} for feature in MOB_TYPER_REPORT_HEADER: contig_info[id][feature] = '' contig_info[id]['md5'] = calc_md5(seq) contig_info[id]['gc'] = GC(seq) contig_info[id]['size'] = len(seq) contig_info[id]['contig_id'] = id contig_info[id]['sample_id'] = sample_id # Makeblastdb blast_runner = BlastRunner(fixed_fasta, tmp_dir) build_success = blast_runner.makeblastdb(fixed_fasta, 'nucl', logging=logging) if build_success == False: logging.error( "Could not build blast database, check error messages..cannot continue" ) sys.exit() # run individual marker blasts contig_info = identify_biomarkers(contig_info, fixed_fasta, tmp_dir, 25, logging, \ replicon_ref, min_rep_ident, min_rep_cov, min_rep_evalue, replicon_blast_results, \ mob_ref, min_mob_ident, min_mob_cov, min_mob_evalue, mob_blast_results, \ mpf_ref, min_mpf_ident, min_mpf_cov, min_mpf_evalue, mpf_blast_results, \ None, None, None, None, \ plasmid_orit, orit_blast_results, repetitive_blast_results, \ num_threads=1) m = mash() mobtyper_results = [] mash_input_fasta = fixed_fasta + '.msh' ncbi = dict_from_alt_key_list( read_file_to_dict(NCBI_PLASMID_TAXONOMY_FILE, MOB_CLUSTER_INFO_HEADER, separater="\t"), "sample_id") lit = dict_from_alt_key_list( read_file_to_dict(LIT_PLASMID_TAXONOMY_FILE, LIT_PLASMID_TAXONOMY_HEADER, separater="\t"), "sample_id") if multi: m.mashsketch(input_fasta=fixed_fasta, output_path=mash_input_fasta, sketch_ind=True, num_threads=num_threads) mash_results = parseMash( m.run_mash(reference_db=mash_db, input_fasta=mash_input_fasta, table=False, num_threads=num_threads)) for seq_id in mash_results: record = {} for field in MOB_TYPER_REPORT_HEADER: if field in contig_info[seq_id]: record[field] = contig_info[seq_id][field] else: record[field] = '' record['sample_id'] = seq_id record['num_contigs'] = 1 distances = OrderedDict( sorted(mash_results[seq_id].items(), key=itemgetter(1), reverse=False)) for mash_neighbor_id in distances: dist = distances[mash_neighbor_id] if mash_neighbor_id not in reference_sequence_meta: continue else: record['mash_nearest_neighbor'] = mash_neighbor_id record['mash_neighbor_distance'] = dist record['primary_cluster_id'] = reference_sequence_meta[ mash_neighbor_id]['primary_cluster_id'] record['secondary_cluster_id'] = reference_sequence_meta[ mash_neighbor_id]['secondary_cluster_id'] record[ 'mash_neighbor_identification'] = reference_sequence_meta[ mash_neighbor_id]['organism'] break mobtyper_results.append(record) else: m.mashsketch(input_fasta=fixed_fasta, output_path=mash_input_fasta, sketch_ind=False, num_threads=num_threads) mash_results = parseMash( m.run_mash(reference_db=mash_db, input_fasta=mash_input_fasta, table=False, num_threads=num_threads)) record = {} for field in MOB_TYPER_REPORT_HEADER: record[field] = '' record['sample_id'] = sample_id fastaSeqStats = calcFastaStats(fixed_fasta) record['md5'] = fastaSeqStats['md5'] record['total_length'] = fastaSeqStats['size'] record['num_contigs'] = fastaSeqStats['num_seq'] record['gc'] = fastaSeqStats['gc_content'] record['mash_nearest_neighbor'] = '-' record['mash_neighbor_distance'] = 1 record['primary_cluster_id'] = '-' record['secondary_cluster_id'] = '-' record['mash_neighbor_identification'] = '-' for seq_id in mash_results: distances = OrderedDict( sorted(mash_results[seq_id].items(), key=itemgetter(1), reverse=False)) mash_neighbor_id = next(iter(distances)) dist = distances[mash_neighbor_id] if mash_neighbor_id not in reference_sequence_meta: continue record['mash_nearest_neighbor'] = mash_neighbor_id record['mash_neighbor_distance'] = dist record['primary_cluster_id'] = reference_sequence_meta[ mash_neighbor_id]['primary_cluster_id'] record['secondary_cluster_id'] = reference_sequence_meta[ mash_neighbor_id]['secondary_cluster_id'] record['mash_neighbor_identification'] = reference_sequence_meta[ mash_neighbor_id]['organism'] record['rep_type(s)'] = [] record['rep_type_accession(s)'] = [] record['relaxase_type(s)'] = [] record['relaxase_type_accession(s)'] = [] record['mpf_type'] = [] record['mpf_type_accession(s)'] = [] record['orit_type(s)'] = [] record['orit_accession(s)'] = [] for seq_id in contig_info: record['rep_type(s)'].append(contig_info[seq_id]['rep_type(s)']) record['rep_type_accession(s)'].append( contig_info[seq_id]['rep_type_accession(s)']) record['relaxase_type(s)'].append( contig_info[seq_id]['relaxase_type(s)']) record['relaxase_type_accession(s)'].append( contig_info[seq_id]['relaxase_type_accession(s)']) record['mpf_type'].append(contig_info[seq_id]['mpf_type']) record['mpf_type_accession(s)'].append( contig_info[seq_id]['mpf_type_accession(s)']) record['orit_type(s)'].append(contig_info[seq_id]['orit_type(s)']) record['orit_accession(s)'].append( contig_info[seq_id]['orit_accession(s)']) for field in record: tmp = [] if record[field] == None: continue if isinstance(record[field], list): length = len(record[field]) for i in range(0, length): tmp += record[field][i].split(',') elif isinstance(record[field], str) and len(record[field]) > 0: tmp += record[field].split(',') if len(tmp) > 0: record[field] = [] for d in tmp: if len(d) > 0: record[field].append(d) mobtyper_results.append(record) for i in range(0, len(mobtyper_results)): record = mobtyper_results[i] bio_markers = sort_biomarkers({ 0: { 'types': record['rep_type(s)'], 'acs': record['rep_type_accession(s)'] }, 1: { 'types': record['relaxase_type(s)'], 'acs': record['relaxase_type_accession(s)'] }, 2: { 'types': record['mpf_type'], 'acs': record['mpf_type_accession(s)'] }, 3: { 'types': record['orit_type(s)'], 'acs': record['orit_accession(s)'] }, }) record['rep_type(s)'] = bio_markers[0]['types'] record['rep_type_accession(s)'] = bio_markers[0]['acs'] record['relaxase_type(s)'] = bio_markers[1]['types'] record['relaxase_type_accession(s)'] = bio_markers[1]['acs'] record['mpf_type'] = bio_markers[2]['types'] record['mpf_type_accession(s)'] = bio_markers[2]['acs'] record['orit_type(s)'] = bio_markers[3]['types'] record['orit_accession(s)'] = bio_markers[3]['acs'] if (isinstance(record['mash_neighbor_distance'], float) or isinstance( record['mash_neighbor_distance'], int)) and record['mash_neighbor_distance'] <= primary_distance: mob_cluster_id = record['primary_cluster_id'] else: mob_cluster_id = None #Patches that sometimes results are concatonated into strings if contigs are merged into a single results if isinstance(record['rep_type(s)'], list): record['rep_type(s)'] = ",".join(record['rep_type(s)']) if isinstance(record['relaxase_type_accession(s)'], list): record['relaxase_type_accession(s)'] = ",".join( record['relaxase_type_accession(s)']) host_range = hostrange(record['rep_type(s)'].split(','), record['relaxase_type_accession(s)'].split(','), mob_cluster_id, ncbi, lit) for field in host_range: record[field] = host_range[field] if isinstance(record['mpf_type'], list): record['mpf_type'] = determine_mpf_type(record['mpf_type']) elif isinstance(record['mpf_type'], str): record['mpf_type'] = determine_mpf_type( record['mpf_type'].split(',')) for field in record: if isinstance(record[field], list): record[field] = ",".join(record[field]) record['predicted_mobility'] = 'non-mobilizable' if len(record['relaxase_type(s)']) > 0 and len(record['mpf_type']): record['predicted_mobility'] = 'conjugative' elif len(record['relaxase_type(s)']) > 0 or len( record['orit_type(s)']) > 0: record['predicted_mobility'] = 'mobilizable' mobtyper_results[i] = record writeReport(mobtyper_results, MOB_TYPER_REPORT_HEADER, report_file) if not keep_tmp: shutil.rmtree(tmp_dir) logger.info( "MOB-typer completed and results written to {}".format(report_file))
def main(): args = parse_args() if args.debug: logging = init_console_logger(3) else: logging = init_console_logger(2) logging.info( 'Running Mob-Suite Clustering toolkit v. {}'.format(__version__)) logging.info('Processing fasta file {}'.format(args.infile)) logging.info('Analysis directory {}'.format(args.outdir)) check_dependencies(logging) input_fasta = args.infile if not os.path.isfile(input_fasta): logging.error('Error, input fasta specified does not exist: {}'.format( input_fasta)) sys.exit() mob_typer_report_file = args.mob_typer_file if not os.path.isfile(mob_typer_report_file): logging.error( 'Error, input metadata file specified does not exist: {}'.format( mob_typer_report_file)) sys.exit() mode = str(args.mode).lower() if mode not in ('update', 'build'): logging.error( 'Error you have not entered a valid mode of build or update, you entered: {}' .format(mode)) sys.exit() out_dir = args.outdir num_threads = args.num_threads if not (args.primary_cluster_dist >= 0 and args.primary_cluster_dist <= 1): logging.error( 'Error distance thresholds must be between 0 - 1: {}'.format( args.primary_cluster_dist)) sys.exit() else: primary_distance = args.primary_cluster_dist if not (args.secondary_cluster_dist >= 0 and args.secondary_cluster_dist <= 1): logging.error( 'Error distance thresholds must be between 0 - 1: {}'.format( args.secondary_cluster_dist)) sys.exit() else: secondary_distance = args.secondary_cluster_dist if not os.path.isdir(out_dir): logging.info('Creating directory {}'.format(args.outdir)) os.mkdir(out_dir, 0o755) tmp_dir = os.path.join(out_dir, '__tmp') if not os.path.isdir(tmp_dir): logging.info('Creating directory {}'.format(args.outdir)) os.mkdir(tmp_dir, 0o755) taxonomy_file = args.taxonomy records = read_file_to_dict(mob_typer_report_file, MOB_TYPER_REPORT_HEADER, separater="\t") seq_ids = [] new_seq_info = {} duplicate_keys = [] for record in records: seq_ids.append(record['sample_id']) if not record['sample_id'] in new_seq_info: new_seq_info[record['sample_id']] = record else: duplicate_keys.append(record['sample_id']) if len(duplicate_keys) > 0: logging.error( "Duplicate sequence identifiers in fasta file. Please make every sequence id unique in the input file before using this tool" ) logging.error("Duplicate sequence ids: {}".format( ",".join(duplicate_keys))) sys.exit() record_identifications = read_file_to_dict(taxonomy_file, ['sample_id', 'organism'], separater="\t") organisms = [] for record in record_identifications: organism = record['organism'] if organism == 'unknown' or organism == '' or organism == 'Unknown': organism = 'Bacteria' organisms.append(organism) seq_id = record['sample_id'] if seq_id in new_seq_info: new_seq_info[seq_id]['organism'] = organism taxids = NamesToTaxIDs(organisms) del (organisms) for seq_id in new_seq_info: organism = new_seq_info[seq_id]['organism'] if organism in taxids: new_seq_info[seq_id]['taxid'] = taxids[organism][0] else: new_seq_info[seq_id]['taxid'] = 2 if len(new_seq_info) == 0: logging.error( 'Error no MOB-typer results for sequences. Sequences must be typed with MOB-typer first' ) sys.exit() fasta_dict = read_fasta_dict(input_fasta) if len(fasta_dict) == 0: logging.error( 'Error no sequences found in input fasta: {}..cannot continue'. format(input_fasta)) sys.exit() key_set_1 = set(seq_ids) key_set_2 = set(list(fasta_dict.keys())) if len(list(key_set_1 ^ key_set_2)) > 0: logging.error( 'Error MOB-typer results: {} and input fasta: {} do not have the same set of identifiers, these must match in order to proceed' .format(mob_typer_report_file, input_fasta)) logging.error( 'Keys present in MOB-typer results: {} and not in input fasta: {} are: {}' .format(mob_typer_report_file, input_fasta, list(key_set_1 - key_set_2))) logging.error( 'Keys present in MOB-typer results: {} and not in input fasta: {} are: {}' .format(mob_typer_report_file, input_fasta, list(key_set_2 - key_set_1))) sys.exit() tmp_cluster_file = os.path.join(out_dir, 'clusters.txt') tmp_ref_fasta_file = os.path.join(tmp_dir, 'references_tmp.fasta') update_fasta = os.path.join(out_dir, 'references_updated.fasta') # Sketch and calculate distances within update sequences if len(fasta_dict) > 1: mashObj = mash() mashObj.mashsketch(input_fasta, input_fasta + ".msh", num_threads=num_threads) distance_matrix_file = os.path.join(tmp_dir, 'mash_dist_matrix.txt') mashfile_handle = open(distance_matrix_file, 'w', encoding="utf-8") mashfile_handle.write( mashObj.run_mash(input_fasta + '.msh', input_fasta + '.msh', table=True, num_threads=num_threads).decode()) mashfile_handle.close() clust_assignments = build_cluster_db( distance_matrix_file, (primary_distance, secondary_distance)) else: seq_id = next(iter(fasta_dict)) clust_assignments = {seq_id: [0, 1]} logging.info('Running MOB-cluster in {} mode'.format(mode)) if mode == 'update': if args.ref_cluster_file is None: logging.error( 'Reference fasta file must be specified, please check help for parameter reference' ) sys.exit() ref_fasta = args.ref_fasta_file if not os.path.isfile(ref_fasta): logging.error( 'Reference fasta file specified does not exist: {}'.format( ref_fasta)) sys.exit() if args.ref_cluster_file is None: logging.error( 'Reference cluster file must be specified, please check help for parameter reference' ) sys.exit() ref_cluster_file = args.ref_cluster_file if not os.path.isfile(ref_cluster_file): logging.error( 'Reference cluster file specified does not exist: {}'.format( ref_cluster_file)) sys.exit() mob_cluster_seq_info = read_sequence_info(ref_cluster_file, MOB_CLUSTER_INFO_HEADER) logging.info( 'Running mob-cluster in update mode with input file: {}'.format( input_fasta)) logging.info( 'Running mob-cluster in update mode with output directory: {}'. format(out_dir)) logging.info( 'Running mob-cluster in update mode on reference fasta file: {}'. format(ref_fasta)) logging.info( 'Reading previous cluster reference assignments from : {}'.format( ref_cluster_file)) shutil.copy(ref_cluster_file, tmp_cluster_file) shutil.copy(ref_fasta, tmp_ref_fasta_file) logging.info('Creating new cluster assignments') new_seq_info = update_existing_db(new_seq_info, mob_cluster_seq_info, clust_assignments, primary_distance, secondary_distance, num_threads) cluster_assignments = {**mob_cluster_seq_info, **new_seq_info} logging.info( 'Writting cluster assignments to : {}'.format(tmp_cluster_file)) writeClusterAssignments(tmp_cluster_file, MOB_CLUSTER_INFO_HEADER, cluster_assignments) shutil.copy(tmp_ref_fasta_file, os.path.join(out_dir, update_fasta)) else: cluster_acs = convert_num_to_acs(clust_assignments) for id in cluster_acs: primary_key = cluster_acs[id][0] secondary_key = cluster_acs[id][1] new_seq_info[id]['primary_cluster_id'] = primary_key new_seq_info[id]['primary_dist'] = primary_distance new_seq_info[id]['secondary_cluster_id'] = secondary_key new_seq_info[id]['secondary_dist'] = secondary_distance writeClusterAssignments(tmp_cluster_file, MOB_CLUSTER_INFO_HEADER, new_seq_info) shutil.copy(input_fasta, update_fasta) logging.info("Sketching new fasta {}".format(update_fasta)) mash_db_file = "{}.msh".format(update_fasta) mObj = mash() mObj.mashsketch(update_fasta, mash_db_file, num_threads=num_threads) logging.info("Building blastdb {}".format(update_fasta)) blast_runner = BlastRunner(update_fasta, '') blast_runner.makeblastdb(update_fasta, 'nucl', logging=logging) logging.info("Removing temporary directory") shutil.rmtree(tmp_dir) logging.info( "MOB-cluster completed, analysis results written to {}".format( out_dir))