示例#1
0
def tblastn(input_fasta, blastdb, min_ident, min_covs, evalue, out_dir, blast_results_file, logging, num_threads=1,
            min_covhsp=25, seq_id_file=None):
    """Run a tblastn search and overwrite the results file with the filtered hits.

    Args:
        input_fasta: Path of the query fasta file.
        blastdb: Path of the BLAST database to search.
        min_ident: Minimum ``pident`` a hit must have to be kept.
        min_covs: Minimum ``qcovs`` a hit must have to be kept.
        evalue: Maximum ``evalue`` a hit may have to be kept.
        out_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: File BLAST writes to; it is rewritten in place
            as a tab-separated table with a header row.
        logging: Logger object forwarded to the BLAST helpers.
        num_threads: Thread count forwarded to the BLAST run.
        min_covhsp: Minimum ``qcovhsp`` a hit must have to be kept.
        seq_id_file: Optional sequence-id file forwarded to the BLAST run.

    Returns:
        bool: ``False`` when BLAST produced an empty file (the file is
        deleted), ``True`` once the filtered table has been written.
    """
    runner = BlastRunner(input_fasta, out_dir)
    runner.run_tblastn(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb,
                       db_type='protein', min_cov=min_covs, min_ident=min_ident, evalue=evalue,
                       blast_outfile=blast_results_file,
                       num_threads=num_threads, seq_id_file=seq_id_file, logging=logging)

    # A zero-byte output means there is nothing to parse: clean up and report it.
    if not os.path.getsize(blast_results_file):
        os.remove(blast_results_file)
        return False

    hits = BlastReader(blast_results_file, logging).df

    # Lower-bound filters, applied one after another in this order.
    lower_bounds = (
        ('pident', min_ident),
        ('qcovs', min_covs),
        ('qcovhsp', min_covhsp),
    )
    for column, minimum in lower_bounds:
        hits = hits.loc[hits[column] >= minimum]
    hits = hits.loc[hits['evalue'] <= evalue]

    # fixStart is defined elsewhere; presumably it normalises sstart/send
    # ordering before sorting -- confirm against its definition.
    hits = fixStart(hits)
    sort_columns = ['sseqid', 'sstart', 'send', 'bitscore']
    sort_ascending = [True, True, True, False]
    hits = hits.sort_values(sort_columns, ascending=sort_ascending).reset_index(drop=True)

    # NOTE(review): newer pandas releases spell this keyword `lineterminator`;
    # confirm which pandas version this project pins.
    hits.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)
    return True
示例#2
0
def blastn(input_fasta, blastdb, min_ident, min_cov, evalue, min_length, out_dir, blast_results_file, logging,
           seq_filterfile=None, num_threads=1, max_length=400000, min_hsp_cov=1):
    """Run a megablast search and overwrite the results file with the filtered hits.

    Args:
        input_fasta: Path of the query fasta file.
        blastdb: Path of the nucleotide BLAST database to search.
        min_ident: Minimum ``pident`` a hit must have to be kept.
        min_cov: Minimum ``qcovs`` a hit must have to be kept.
        evalue: Maximum ``evalue`` a hit may have to be kept.
        min_length: Minimum alignment ``length`` a hit must have to be kept.
        out_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: File BLAST writes to; it is rewritten in place
            as a tab-separated table with a header row.
        logging: Logger object forwarded to the BLAST helpers.
        seq_filterfile: Optional sequence-id file forwarded to the BLAST run.
        num_threads: Thread count forwarded to the BLAST run.
        max_length: Maximum ``qlen`` a hit's query may have to be kept.
        min_hsp_cov: Minimum ``qcovhsp`` a hit must have to be kept.

    Returns:
        bool: ``False`` when BLAST produced an empty file (the file is
        deleted), ``True`` once the filtered table has been written.
    """
    runner = BlastRunner(input_fasta, out_dir)
    runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb,
                     db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                     blast_outfile=blast_results_file, logging=logging, num_threads=num_threads, word_size=11,
                     seq_id_file=seq_filterfile)

    # A zero-byte output means there is nothing to parse: clean up and report it.
    if not os.path.getsize(blast_results_file):
        os.remove(blast_results_file)
        return False

    # Each .loc step receives the frame left over by the previous one, so the
    # filters run sequentially in the order written.
    hits = (
        BlastReader(blast_results_file, logging).df
        .loc[lambda df: df['length'] >= min_length]
        .loc[lambda df: df['qlen'] <= max_length]
        .loc[lambda df: df['qcovs'] >= min_cov]
        .loc[lambda df: df['qcovhsp'] >= min_hsp_cov]
        .loc[lambda df: df['evalue'] <= evalue]
        .loc[lambda df: df['pident'] >= min_ident]
        .reset_index(drop=True)
    )

    # fixStart is defined elsewhere; it is applied after the index reset here.
    hits = fixStart(hits)

    # NOTE(review): newer pandas releases spell this keyword `lineterminator`;
    # confirm which pandas version this project pins.
    hits.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)
    return True
示例#3
0
def repetitive_blast(input_fasta,
                     ref_db,
                     min_ident,
                     min_cov,
                     evalue,
                     min_length,
                     tmp_dir,
                     blast_results_file,
                     num_threads=1):
    """Megablast the input against ``ref_db`` and pick one hit per query sequence.

    Args:
        input_fasta: Path of the query fasta file.
        ref_db: Path of the nucleotide BLAST database. ``makeblastdb`` is not
            called here, so the database presumably has to exist already.
        min_ident: Minimum ``pident`` a hit must have to be kept.
        min_cov: Minimum ``qcovs`` a hit must have to be kept.
        evalue: E-value threshold forwarded to the BLAST run.
        min_length: Minimum alignment ``length`` a hit must have to be kept.
        tmp_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: File the BLAST run writes its output to.
        num_threads: Thread count forwarded to the BLAST run.

    Returns:
        dict: Maps each ``qseqid`` to a dict with the keys ``id``, ``score``,
        ``contig_start`` and ``contig_end`` taken from the selected hit.
        Empty when BLAST produced no output.
    """
    runner = BlastRunner(input_fasta, tmp_dir)
    runner.run_blast(query_fasta_path=input_fasta,
                     blast_task='megablast',
                     db_path=ref_db,
                     db_type='nucl',
                     min_cov=min_cov,
                     min_ident=min_ident,
                     evalue=evalue,
                     blast_outfile=blast_results_file,
                     num_threads=num_threads)
    if not os.path.getsize(blast_results_file):
        return {}

    hits = BlastReader(blast_results_file).df
    for column, minimum in (('length', min_length),
                            ('pident', min_ident),
                            ('qcovs', min_cov),
                            ('qcovhsp', 25)):
        hits = hits.loc[hits[column] >= minimum]

    hits = fixStart(hits)
    hits = hits.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                            ascending=[True, True, True, False])
    hits = hits.reset_index(drop=True)

    selected = {}
    for _, hit in hits.iterrows():
        query_id = hit['qseqid']
        current = selected.get(query_id)
        # NOTE(review): an existing entry is replaced when its score is
        # HIGHER than the new hit's bitscore, i.e. the lowest-scoring hit
        # wins. That looks inverted -- confirm the intended behaviour.
        if current is None or current['score'] > hit['bitscore']:
            selected[query_id] = {
                'id': hit['sseqid'],
                'score': hit['bitscore'],
                'contig_start': hit['sstart'],
                'contig_end': hit['send']
            }

    return selected
示例#4
0
def main():
    """Download the reference data next to this script and build its databases.

    Steps, in order: download ``data.zip`` into the ``databases/`` directory
    beside this file, extract it, extract every file in that directory whose
    name ends in ``gz``, build BLAST databases for the repetitive and plasmid
    fasta files, sketch the plasmid fasta with mash, and finally write the
    download date to ``status.txt``.
    """
    logging = init_console_logger(2)
    logging.info('Initilizating databases...this will take some time')

    # Use every available core for mash sketch, but never more than 32.
    num_threads = min(multiprocessing.cpu_count(), 32)

    database_directory = os.path.join(os.path.dirname(os.path.realpath(__file__)),'databases/')
    zip_file = os.path.join(database_directory,'data.zip')
    plasmid_database_fasta_file = os.path.join(database_directory,'ncbi_plasmid_full_seqs.fas')
    repetitive_fasta_file = os.path.join(database_directory,'repetitive.dna.fas')
    mash_db_file = os.path.join(database_directory,'ncbi_plasmid_full_seqs.fas.msh')

    download_to_file('https://ndownloader.figshare.com/articles/5841882?private_link=a4c92dd84f17b2cefea6',zip_file)
    extract(zip_file,database_directory)
    os.remove(zip_file)

    # Snapshot the directory before extracting so files produced by the
    # extraction itself are not picked up again.
    compressed_files = [
        name for name in os.listdir(database_directory)
        if os.path.isfile(os.path.join(database_directory, name)) and name.endswith('gz')
    ]
    for name in compressed_files:
        extract(os.path.join(database_directory, name), database_directory)

    # Initialize blast and mash databases
    blast_runner = BlastRunner(repetitive_fasta_file, database_directory)
    blast_runner.makeblastdb(repetitive_fasta_file, 'nucl')
    blast_runner = BlastRunner(plasmid_database_fasta_file, database_directory)
    blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl')
    mObj = mash()
    mObj.mashsketch(plasmid_database_fasta_file,mash_db_file,num_threads=num_threads)

    # The status file is plain text. gzip.open(..., 'w') opens in binary
    # mode, so writing a str raised TypeError on Python 3 and would have
    # produced compressed data under a .txt name.
    status_file = os.path.join(database_directory,'status.txt')
    with open(status_file, 'w') as f:
        f.write("Download date: {}".format(datetime.datetime.today().strftime('%Y-%m-%d')))
示例#5
0
def contig_blast(input_fasta, plasmid_db, min_ident, min_cov, evalue, min_length, tmp_dir, blast_results_file,
                 num_threads=1):
    """Megablast the contigs against the plasmid database and save the filtered hits.

    The filtered table is written, without a header row, to
    ``filtered_blast.txt`` inside ``tmp_dir``.

    Args:
        input_fasta: Path of the query fasta file.
        plasmid_db: Path of the nucleotide BLAST database to search.
        min_ident: Identity threshold forwarded to the BLAST run.
        min_cov: Coverage threshold forwarded to the BLAST run and used as
            the minimum ``qcovs`` of a kept hit.
        evalue: E-value threshold forwarded to the BLAST run.
        min_length: Minimum for both the alignment ``length`` and the query
            length ``qlen`` of a kept hit.
        tmp_dir: Working directory; also receives ``filtered_blast.txt``.
        blast_results_file: File the BLAST run writes its raw output to.
        num_threads: Thread count forwarded to the BLAST run.

    Returns:
        An empty dict when BLAST produced no output (an empty filtered file
        is still created). The code visible here returns nothing explicitly
        on the non-empty path.
    """
    filtered_path = os.path.join(tmp_dir, 'filtered_blast.txt')

    runner = BlastRunner(input_fasta, tmp_dir)
    runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=plasmid_db,
                     db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                     blast_outfile=blast_results_file, num_threads=num_threads, word_size=11)

    if not os.path.getsize(blast_results_file):
        # No hits at all: leave an empty filtered file behind.
        with open(filtered_path, 'w', encoding="utf-8") as handle:
            handle.write('')
        return {}

    hits = BlastReader(blast_results_file).df
    hits = hits.loc[hits['length'] >= min_length]
    # Queries longer than 400000 are discarded, as are ones shorter than min_length.
    hits = hits.loc[hits['qlen'] <= 400000]
    hits = hits.loc[hits['qlen'] >= min_length]
    hits = hits.loc[hits['qcovs'] >= min_cov]
    hits = hits.reset_index(drop=True)

    # NOTE(review): newer pandas releases spell this keyword `lineterminator`;
    # confirm which pandas version this project pins.
    hits.to_csv(filtered_path, sep='\t', header=False, line_terminator='\n', index=False)
示例#6
0
def mob_blast(input_fasta,
              ref_db,
              min_ident,
              min_cov,
              evalue,
              tmp_dir,
              blast_results_file,
              overlap=5,
              num_threads=1):
    """Run tblastn against ``ref_db`` and return the hits left after overlap filtering.

    Args:
        input_fasta: Path of the query fasta file.
        ref_db: Fasta file that is turned into a nucleotide BLAST database
            and then searched.
        min_ident: Minimum ``pident`` a hit must have to be kept.
        min_cov: Minimum ``qcovs`` a hit must have to be kept.
        evalue: E-value threshold forwarded to the BLAST run.
        tmp_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: File the BLAST run writes its output to.
        overlap: Overlap argument forwarded to ``filter_overlaping_records``.
        num_threads: Accepted for interface compatibility; see the note in
            the body -- the value is currently overridden.

    Returns:
        The filtered hit table, or an empty dict when BLAST produced no
        output.
    """
    # NOTE(review): the caller's value is discarded and BLAST always runs
    # single-threaded. Confirm whether this override is deliberate.
    num_threads = 1

    runner = BlastRunner(input_fasta, tmp_dir)
    runner.makeblastdb(ref_db, 'nucl')
    runner.run_tblastn(query_fasta_path=input_fasta,
                       blast_task='megablast',
                       db_path=ref_db,
                       db_type='nucl',
                       min_cov=min_cov,
                       min_ident=min_ident,
                       evalue=evalue,
                       blast_outfile=blast_results_file,
                       num_threads=num_threads)
    if not os.path.getsize(blast_results_file):
        return {}

    hits = BlastReader(blast_results_file).df
    for column, minimum in (('pident', min_ident),
                            ('qcovs', min_cov),
                            ('qcovhsp', 25)):
        hits = hits.loc[hits[column] >= minimum]

    hits = fixStart(hits)
    hits = hits.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                            ascending=[True, True, True, False])
    hits = hits.reset_index(drop=True)

    overlap_columns = ('sseqid', 'sstart', 'send', 'bitscore')
    hits = filter_overlaping_records(hits, overlap, *overlap_columns)

    # Keep filtering until a pass no longer changes the number of rows.
    # The None sentinel guarantees at least one pass inside the loop.
    previous_count = None
    current_count = len(hits)
    while current_count != previous_count:
        hits = filter_overlaping_records(hits, overlap, *overlap_columns)
        previous_count = current_count
        current_count = len(hits)

    return hits
示例#7
0
    def run_blast(self,
                  input_fasta,
                  output_path,
                  blast_results_file,
                  logging,
                  min_cov=1,
                  min_ident=1,
                  evalue=1,
                  num_threads=1,
                  min_length=25):
        """Megablast ``input_fasta`` against itself and keep hits of sufficient length.

        A nucleotide BLAST database is built from ``input_fasta`` and the same
        file is used as the query. The results file is then rewritten in
        place as a headerless tab-separated table.

        Args:
            input_fasta: Fasta file used as both query and database.
            output_path: Working directory handed to ``BlastRunner``.
            blast_results_file: File BLAST writes to and that is rewritten.
            logging: Logger object forwarded to the BLAST helpers.
            min_cov: Coverage threshold forwarded to the BLAST run.
            min_ident: Identity threshold forwarded to the BLAST run.
            evalue: E-value threshold forwarded to the BLAST run.
            num_threads: Thread count forwarded to the BLAST run.
            min_length: Minimum alignment ``length`` a hit must have.

        Returns:
            An empty dict when BLAST produced no output; nothing is returned
            explicitly otherwise.
        """
        runner = BlastRunner(input_fasta, output_path)
        runner.makeblastdb(input_fasta, 'nucl', logging)
        runner.run_blast(query_fasta_path=input_fasta,
                         blast_task='megablast',
                         db_path=input_fasta,
                         db_type='nucl',
                         min_cov=min_cov,
                         min_ident=min_ident,
                         evalue=evalue,
                         blast_outfile=blast_results_file,
                         num_threads=num_threads,
                         word_size=11,
                         logging=logging)

        if not os.path.getsize(blast_results_file):
            # Nothing was found: (re)create the results file as empty.
            with open(blast_results_file, 'w', encoding="utf-8") as handle:
                handle.write('')
            return {}

        hits = BlastReader(blast_results_file, logging).df
        long_enough = hits['length'] >= min_length
        hits = hits.loc[long_enough].reset_index(drop=True)

        # NOTE(review): newer pandas releases spell this keyword
        # `lineterminator`; confirm which pandas version this project pins.
        hits.to_csv(blast_results_file,
                    sep='\t',
                    header=False,
                    line_terminator='\n',
                    index=False)
示例#8
0
def main():
    """Download the reference data and build the BLAST, mash and ete3 databases.

    A ``.lock`` file inside the database directory coordinates concurrent
    runs. The process that creates the lock performs the initialisation. Any
    other process polls once a minute until the lock disappears, or removes
    it once it is at least 1000 seconds old, and then returns without doing
    any work itself.

    Every mirror in ``config['db_mirrors']`` is tried in order; the run is
    aborted only when none of them could be downloaded.

    Returns:
        int: 0 after a successful initialisation, and also after waiting for
        another process's lock.

    Raises:
        SystemExit: With status -1 when the download, the BLAST/mash database
            build or the ete3 taxonomy update fails. The lock file is removed
            before exiting.
    """
    args = arguments()

    database_directory = os.path.abspath(args.database_directory)

    if os.path.exists(database_directory):
        logger.info("Database directory folder already exists at {}".format(
            database_directory))
    else:
        # makedirs rather than mkdir so a nested, not-yet-existing path works.
        os.makedirs(database_directory)

    # Helper function to simplify adding database_directory to everything
    prepend_db_dir = functools.partial(os.path.join, database_directory)

    lockfilepath = os.path.join(database_directory, ".lock")
    status_file = prepend_db_dir('status.txt')

    if not os.path.exists(lockfilepath):
        try:
            open(file=lockfilepath, mode="w").close()
            logger.info("Placed lock file at {}".format(lockfilepath))
        except Exception as e:
            # Best effort: the failure is reported and the run continues.
            logger.error(
                "Failed to place a lock file at {}. Database diretory can not be accessed. Wrong path?"
                .format(lockfilepath))
            logger.error("{}".format(e))
    else:
        while os.path.exists(lockfilepath):
            elapsed_time = time.time() - os.path.getmtime(lockfilepath)
            logger.info(
                "Lock file found at {}. Waiting for other processes to finish database init ..."
                .format(lockfilepath))
            logger.info(
                "Elapsed time {} min. Will continue processing after 16 min mark."
                .format(int(elapsed_time / 60)))
            if elapsed_time >= 1000:
                logger.info(
                    "Elapsed time {} min. Assuming previous process completed all init steps. Continue ..."
                    .format(int(elapsed_time / 60)))
                # If the previous process failed, nothing is running any more
                # and the stale lock has to be cleared by hand.
                try:
                    os.remove(lockfilepath)
                except OSError:
                    # Another process removed the file first.
                    pass
                break
            time.sleep(60)  # recheck every 1 min if lock file was removed
        logger.info(
            "Lock file no longer exists. Assuming init process completed successfully"
        )
        return 0

    logger.info('Initializing databases...this will take some time')
    # Find available threads and use the maximum number available for mash sketch but cap it at 32
    num_threads = min(multiprocessing.cpu_count(), 32)

    zip_file = prepend_db_dir('data.tar.gz')
    plasmid_database_fasta_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas')
    repetitive_fasta_file = prepend_db_dir('repetitive.dna.fas')
    mash_db_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas.msh')

    logger.info('Downloading databases...this will take some time')

    # Try each mirror in turn. A failing mirror must not abort the run while
    # other mirrors are still untried.
    downloaded = False
    last_error = "no database mirrors configured"
    for db_mirror in config['db_mirrors']:
        try:
            logger.info('Trying mirror {}'.format(db_mirror))
            download_to_file(db_mirror, zip_file)
            downloaded = True
            break
        except Exception as e:
            last_error = e
            logger.warning(
                "Mirror {} failed with error {}. Trying next mirror if available"
                .format(db_mirror, str(e)))

    if not downloaded:
        logger.error(
            "Download failed with error {}. Removing lock file".format(
                str(last_error)))
        os.remove(lockfilepath)
        sys.exit(-1)

    logger.info(
        "Downloading databases successful, now building databases at {}".
        format(database_directory))
    extract(zip_file, database_directory)

    # Snapshot of the .gz files present after the main archive is unpacked.
    files = [
        prepend_db_dir(f) for f in os.listdir(database_directory)
        if f.endswith('.gz')
    ]

    for file in files:
        extract(file, database_directory)

    # Initialize blast and mash databases
    try:
        logger.info('Building repetitive mask database')
        blast_runner = BlastRunner(repetitive_fasta_file, database_directory)
        blast_runner.makeblastdb(repetitive_fasta_file, 'nucl', logger)

        logger.info('Building complete plasmid database')
        blast_runner = BlastRunner(plasmid_database_fasta_file,
                                   database_directory)
        blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl', logger,
                                 True)

        logger.info('Sketching complete plasmid database')
        mObj = mash()
        mObj.mashsketch(plasmid_database_fasta_file,
                        mash_db_file,
                        num_threads=num_threads)
    except Exception as e:
        logger.error(
            'Downloading databases failed, please check your internet connection and retry'
        )
        logger.error(
            "Process failed with error {}. Removing lock file".format(e))
        os.remove(lockfilepath)
        sys.exit(-1)

    try:
        logger.info("Init ete3 library ...")
        ete3taxadbpath = os.path.abspath(
            os.path.join(database_directory, "taxa.sqlite"))
        ncbi = NCBITaxa()
        ncbi.dbfile = ete3taxadbpath
        ncbi.update_taxonomy_database()
    except Exception as e:
        logger.error(
            "Init of ete3 library failed with error {}. Removing lock file".
            format(e))
        os.remove(lockfilepath)
        sys.exit(-1)

    # The taxonomy update can leave taxdump.tar.gz in the working directory.
    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logger.info(
            "Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job."
        )
    except OSError:
        # Nothing to clean up.
        pass

    with open(status_file, 'w') as f:
        download_date = datetime.datetime.today().strftime('%Y-%m-%d')
        f.write("Download date: {}. Removing lock file.".format(download_date))
        try:
            os.remove(lockfilepath)
        except OSError:
            logger.warning(
                "Lock file is already removed by some other process.")

    logger.info("MOB init completed successfully")
    return 0
示例#9
0
def main():
    """Entry point of the clustering tool: build a cluster database or update one.

    In ``build`` mode the input fasta is sketched with mash, an all-vs-all
    distance table is written to the temporary directory, and cluster
    assignments plus an updated fasta file are produced in the output
    directory.

    In ``update`` mode the reference fasta, cluster file and mash sketch
    named on the command line are validated and copied, then
    ``update_existing`` is run on the copies. With ``--overwrite`` the
    results replace the reference files and the databases are rebuilt.

    Raises:
        SystemExit: With status -1 when the input file, the mode or any
            reference file required in update mode is missing or invalid.
    """
    args = parse_args()
    logging = init_console_logger(3)
    logging.info('Running Mob-Suite Clustering toolkit v. {}'.format(__version__))
    logging.info('Processing fasta file {}'.format(args.infile))
    logging.info('Analysis directory {}'.format(args.outdir))

    input_fasta = args.infile
    if not os.path.isfile(input_fasta):
        logging.error('Error, input fasta specified does not exist: {}'.format(input_fasta ))
        sys.exit(-1)

    out_dir = args.outdir
    num_threads = args.num_threads
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir, 0o755)
    tmp_dir = os.path.join(out_dir, '__tmp')
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    mode = str(args.mode).lower()

    if mode not in ('update','build'):
        logging.error('Error you have not entered a valid mode of build or update, you entered: {}'.format(mode))
        print(('Error you have not entered a valid mode of build or update, you entered: {}'.format(mode)))
        sys.exit(-1)

    header = ('id', 0.05, 0.0001)
    tmp_cluster_file = os.path.join(out_dir, 'clusters.txt')
    tmp_ref_fasta_file = os.path.join(tmp_dir, 'references_tmp.fasta')
    update_fasta = os.path.join(out_dir, 'references_updated.fasta')

    if mode == 'update':
        # This check used to test ref_cluster_file, so a missing reference
        # fasta slipped through to os.path.isfile(None).
        if args.ref_fasta_file is None:
            logging.error('Reference fasta file must be specified, please check help for parameter reference')
            sys.exit(-1)

        ref_fasta = args.ref_fasta_file

        if not os.path.isfile(ref_fasta ):
            logging.error('Reference fasta file specified does not exist: {}'.format(ref_fasta))
            sys.exit(-1)

        if args.ref_cluster_file is None:
            logging.error('Reference cluster file must be specified, please check help for parameter reference')
            sys.exit(-1)

        ref_cluster_file = args.ref_cluster_file

        if not os.path.isfile(ref_cluster_file):
            logging.error('Reference cluster file specified does not exist: {}'.format(ref_cluster_file))
            sys.exit(-1)

        if args.ref_mash_db is None:
            logging.error('Reference mash sketch file must be specified, please check help for parameter reference')
            sys.exit(-1)

        ref_mash_db = args.ref_mash_db
        if not os.path.isfile(ref_mash_db):
            logging.error('Reference mash file specified does not exist: {}'.format(ref_mash_db))
            sys.exit(-1)

        logging.info('Running mob-cluster in update mode with input file: {}'.format(input_fasta))
        logging.info('Running mob-cluster in update mode with output directory: {}'.format(out_dir))
        logging.info('Running mob-cluster in update mode on reference fasta file: {}'.format(ref_fasta))
        logging.info('Reading previous cluster reference assignments from : {}'.format(ref_cluster_file))

        # Work on copies so the reference files stay untouched unless
        # --overwrite is given.
        shutil.copy(ref_cluster_file, tmp_cluster_file)
        shutil.copy(ref_fasta, tmp_ref_fasta_file)
        update_existing(input_fasta, tmp_dir, ref_mash_db, tmp_cluster_file, header, tmp_ref_fasta_file, update_fasta)

        if args.overwrite:
            shutil.move(update_fasta,ref_fasta)
            shutil.move(tmp_cluster_file,ref_cluster_file)
            # NOTE(review): the sketch is rebuilt from input_fasta, not from
            # the freshly updated ref_fasta -- confirm this is intended.
            mash_db_file = "{}.msh".format(input_fasta)
            mObj = mash()
            mObj.mashsketch(input_fasta, mash_db_file, num_threads=num_threads)
            blast_runner = BlastRunner(ref_fasta, '')
            blast_runner.makeblastdb(ref_fasta, 'nucl')
    else:
        mashObj = mash()
        mashObj.mashsketch(input_fasta,input_fasta+".msh",num_threads=num_threads)
        distance_matrix_file = os.path.join(tmp_dir,'mash_dist_matrix.txt')

        # Close the handle before the matrix is read back so that all output
        # is on disk and the descriptor is not leaked.
        with open(distance_matrix_file,'w') as mashfile_handle:
            mashObj.run_mash(input_fasta+'.msh', input_fasta+'.msh', mashfile_handle,table=True,num_threads=num_threads)

        clust_assignments = build_cluster_db(distance_matrix_file, (0.05, 0.0001))
        writeClusterAssignments(tmp_cluster_file, header, clust_assignments)
        clust_dict = selectCluster(clust_assignments, 1)
        shutil.copy(input_fasta, tmp_ref_fasta_file)
        updateFastaFile(tmp_ref_fasta_file ,update_fasta, clust_dict)
示例#10
0
def main():
    """Download the reference data into a chosen directory and build its databases.

    The target directory comes from ``-d/--database_directory`` and defaults
    to ``databases`` beside this file. Each mirror is tried in turn until one
    yields an archive larger than 50000 bytes. The archive and any ``gz``
    files it contained are extracted, BLAST databases and a mash sketch are
    built, and the download date is written to ``status.txt``.

    Raises:
        SystemExit: With status -1 when no mirror delivered a usable archive.
    """
    default_database_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'databases')
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--database_directory',
                        default=default_database_dir,
                        help='Directory to download databases to. Defaults to {}'.format(default_database_dir))
    args = parser.parse_args()
    logging = init_console_logger(2)
    logging.info('Initilizating databases...this will take some time')

    # Use every available core for mash sketch, but never more than 32.
    num_threads = min(multiprocessing.cpu_count(), 32)

    # NOTE(review): the original comment said absolute paths don't work, yet
    # the code enforces an absolute path -- presumably relative paths were
    # the problem.
    database_directory = os.path.abspath(args.database_directory)
    if not os.path.exists(database_directory):
        os.makedirs(database_directory)
    zip_file = os.path.join(database_directory,'data.zip')
    plasmid_database_fasta_file = os.path.join(database_directory,'ncbi_plasmid_full_seqs.fas')
    repetitive_fasta_file = os.path.join(database_directory,'repetitive.dna.fas')
    mash_db_file = os.path.join(database_directory,'ncbi_plasmid_full_seqs.fas.msh')
    logging.info('Downloading databases...this will take some time')

    db_mirrors = ['https://share.corefacility.ca/index.php/s/oeufkw5HyKz0X5I/download',
                  'https://ndownloader.figshare.com/articles/5841882/versions/1']

    # A download counts as successful only when the archive exists and is
    # larger than 50000 bytes. The same test decides both whether to stop
    # trying mirrors and whether to carry on at all, so a truncated file
    # from the last mirror is never extracted.
    downloaded = False
    for db_mirror in db_mirrors:
        logging.info('Trying mirror {}'.format(db_mirror))
        try:
            download_to_file(db_mirror, zip_file)
        except Exception as e:
            # One broken mirror must not prevent the next one being tried.
            logging.error('Download from mirror {} failed with error {}'.format(db_mirror, e))
            continue
        if os.path.exists(zip_file) and os.path.getsize(zip_file) > 50000:
            downloaded = True
            break   # do not try other mirror

    if not downloaded:
        logging.error('Downloading databases failed, please check your internet connection and retry')
        sys.exit(-1)
    logging.info('Downloading databases successful, now building databases')

    extract(zip_file,database_directory)
    os.remove(zip_file)

    # Snapshot the directory before extracting so files produced by the
    # extraction itself are not picked up again.
    compressed_files = [
        name for name in os.listdir(database_directory)
        if os.path.isfile(os.path.join(database_directory, name)) and name.endswith('gz')
    ]
    for name in compressed_files:
        extract(os.path.join(database_directory, name), database_directory)

    # Initialize blast and mash databases
    logging.info('Building repetive mask database')
    blast_runner = BlastRunner(repetitive_fasta_file, database_directory)
    blast_runner.makeblastdb(repetitive_fasta_file, 'nucl')
    logging.info('Building complete plasmid database')
    blast_runner = BlastRunner(plasmid_database_fasta_file, database_directory)
    blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl')
    logging.info('Sketching complete plasmid database')
    mObj = mash()
    mObj.mashsketch(plasmid_database_fasta_file,mash_db_file,num_threads=num_threads)

    status_file = os.path.join(database_directory,'status.txt')
    with open(status_file, 'w') as f:
        f.write("Download date: {}".format(datetime.datetime.today().strftime('%Y-%m-%d')))
示例#11
0
def main():
    args = parse_args()

    if args.debug:
        logger = init_console_logger(3)
    else:
        logger = init_console_logger(2)

    logger.info('Running Mob-typer version {}'.format(__version__))

    logger.info('Processing fasta file {}'.format(args.infile))

    if not os.path.isfile(args.infile):
        logger.info('Error, fasta file does not exist {}'.format(args.infile))
        sys.exit()

    if not args.analysis_dir:
        tmp_dir = tempfile.TemporaryDirectory(dir=tempfile.gettempdir()).name
    else:
        tmp_dir = args.analysis_dir

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    if not isinstance(args.num_threads, int):
        logger.info(
            'Error number of threads must be an integer, you specified "{}"'.
            format(args.num_threads))

    database_dir = os.path.abspath(args.database_directory)

    if args.sample_id is None:
        sample_id = re.sub(r"\.(fasta|fa|fas){1,1}", "",
                           os.path.basename(args.infile))
    else:
        sample_id = args.sample_id

    # Script arguments
    input_fasta = args.infile
    report_file = args.out_file
    num_threads = int(args.num_threads)
    keep_tmp = args.keep_tmp

    if args.multi:
        multi = True
    else:
        multi = False

    if not (args.primary_cluster_dist >= 0 and args.primary_cluster_dist <= 1):
        logging.error(
            'Error distance thresholds must be between 0 - 1: {}'.format(
                args.primary_cluster_dist))
        sys.exit()
    else:
        primary_distance = float(args.primary_cluster_dist)

    if not (args.secondary_cluster_dist >= 0
            and args.secondary_cluster_dist <= 1):
        logging.error(
            'Error distance thresholds must be between 0 - 1: {}'.format(
                args.secondary_cluster_dist))
        sys.exit()
    else:
        secondary_distance = float(args.secondary_cluster_dist)

    if database_dir == default_database_dir:
        mob_ref = args.plasmid_mob
        mash_db = args.plasmid_mash_db
        replicon_ref = args.plasmid_replicons
        plasmid_meta = args.plasmid_meta
        mpf_ref = args.plasmid_mpf
        plasmid_orit = args.plasmid_orit
        verify_init(logger, database_dir)
    else:
        mob_ref = os.path.join(database_dir, 'mob.proteins.faa')
        mash_db = os.path.join(database_dir, 'ncbi_plasmid_full_seqs.fas.msh')
        replicon_ref = os.path.join(database_dir, 'rep.dna.fas')
        plasmid_meta = os.path.join(database_dir, 'clusters.txt')
        mpf_ref = os.path.join(database_dir, 'mpf.proteins.faa')
        plasmid_orit = os.path.join(database_dir, 'orit.fas')

    LIT_PLASMID_TAXONOMY_FILE = os.path.join(
        database_dir, "host_range_literature_plasmidDB.txt")
    NCBI_PLASMID_TAXONOMY_FILE = plasmid_meta

    fixed_fasta = os.path.join(tmp_dir, 'fixed.input.fasta')
    replicon_blast_results = os.path.join(tmp_dir,
                                          'replicon_blast_results.txt')
    mob_blast_results = os.path.join(tmp_dir, 'mobtyper_blast_results.txt')
    mpf_blast_results = os.path.join(tmp_dir, 'mpf_blast_results.txt')
    orit_blast_results = os.path.join(tmp_dir, 'orit_blast_results.txt')
    repetitive_blast_results = os.path.join(tmp_dir,
                                            'repetitive_blast_results.txt')

    if os.path.isfile(mob_blast_results):
        os.remove(mob_blast_results)
    if os.path.isfile(mpf_blast_results):
        os.remove(mpf_blast_results)
    if os.path.isfile(orit_blast_results):
        os.remove(orit_blast_results)
    if os.path.isfile(replicon_blast_results):
        os.remove(replicon_blast_results)

    # Input numeric params

    min_rep_ident = float(args.min_rep_ident)
    min_mob_ident = float(args.min_mob_ident)
    min_ori_ident = float(args.min_rep_ident)
    min_mpf_ident = float(args.min_mob_ident)

    idents = {
        'min_rep_ident': min_rep_ident,
        'min_mob_ident': min_mob_ident,
        'min_ori_ident': min_ori_ident
    }

    for param in idents:

        value = float(idents[param])

        if value < 60:
            logger.error(
                "Error: {} is too low, please specify an integer between 70 - 100"
                .format(param))
            sys.exit(-1)
        if value > 100:
            logger.error(
                "Error: {} is too high, please specify an integer between 70 - 100"
                .format(param))
            sys.exit(-1)

    min_rep_cov = float(args.min_rep_cov)
    min_mob_cov = float(args.min_mob_cov)
    min_ori_cov = float(args.min_rep_cov)
    min_mpf_cov = float(args.min_mob_cov)

    covs = {
        'min_rep_cov': min_rep_cov,
        'min_mob_cov': min_mob_cov,
        'min_con_cov': min_ori_cov,
        'min_rpp_cov': min_ori_cov
    }

    for param in covs:

        value = float(covs[param])

        if value < 60:
            logger.error(
                "Error: {} is too low, please specify an integer between 50 - 100"
                .format(param))
            sys.exit(-1)
        if value > 100:
            logger.error(
                "Error: {} is too high, please specify an integer between 50 - 100"
                .format(param))
            sys.exit(-1)

    min_rep_evalue = float(args.min_rep_evalue)
    min_mob_evalue = float(args.min_mob_evalue)
    min_ori_evalue = float(args.min_rep_evalue)
    min_mpf_evalue = float(args.min_mob_evalue)

    evalues = {
        'min_rep_evalue': min_rep_evalue,
        'min_mob_evalue': min_mob_evalue,
        'min_con_evalue': min_ori_evalue
    }

    for param in evalues:

        value = float(evalues[param])

        if value > 1:
            logger.error(
                "Error: {} is too high, please specify an float evalue between 0 to 1"
                .format(param))
            sys.exit(-1)

    check_dependencies(logger)

    needed_dbs = [replicon_ref, mob_ref, mash_db, mpf_ref]

    for db in needed_dbs:
        if (not os.path.isfile(db)):
            logger.info('Warning! Needed database missing "{}"'.format(db))
            mob_suite.mob_init.main()

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir, 0o755)

    # Test that ETE3 db is ok and lock process check
    dbstatus = ETE3_db_status_check(1, ETE3_LOCK_FILE, ETE3DBTAXAFILE, logging)
    if dbstatus == False:
        logging.error(
            "Exiting due to lock file not removed: {}".format(ETE3_LOCK_FILE))
        sys.exit(-1)

    # Get cluster information
    reference_sequence_meta = read_sequence_info(plasmid_meta,
                                                 MOB_CLUSTER_INFO_HEADER)

    # initilize master record tracking
    fix_fasta_header(input_fasta, fixed_fasta)
    contig_seqs = read_fasta_dict(fixed_fasta)
    contig_info = {}
    for id in contig_seqs:
        seq = contig_seqs[id]
        contig_info[id] = {}
        for feature in MOB_TYPER_REPORT_HEADER:
            contig_info[id][feature] = ''
        contig_info[id]['md5'] = calc_md5(seq)
        contig_info[id]['gc'] = GC(seq)
        contig_info[id]['size'] = len(seq)
        contig_info[id]['contig_id'] = id
        contig_info[id]['sample_id'] = sample_id

    # Makeblastdb
    blast_runner = BlastRunner(fixed_fasta, tmp_dir)
    build_success = blast_runner.makeblastdb(fixed_fasta,
                                             'nucl',
                                             logging=logging)
    if build_success == False:
        logging.error(
            "Could not build blast database, check error messages..cannot continue"
        )
        sys.exit()

    # run individual marker blasts

    contig_info = identify_biomarkers(contig_info, fixed_fasta, tmp_dir, 25, logging, \
                                      replicon_ref, min_rep_ident, min_rep_cov, min_rep_evalue, replicon_blast_results, \
                                      mob_ref, min_mob_ident, min_mob_cov, min_mob_evalue, mob_blast_results, \
                                      mpf_ref, min_mpf_ident, min_mpf_cov, min_mpf_evalue, mpf_blast_results, \
                                      None, None, None, None, \
                                      plasmid_orit, orit_blast_results, repetitive_blast_results, \
                                      num_threads=1)

    m = mash()
    mobtyper_results = []

    mash_input_fasta = fixed_fasta + '.msh'

    ncbi = dict_from_alt_key_list(
        read_file_to_dict(NCBI_PLASMID_TAXONOMY_FILE,
                          MOB_CLUSTER_INFO_HEADER,
                          separater="\t"), "sample_id")
    lit = dict_from_alt_key_list(
        read_file_to_dict(LIT_PLASMID_TAXONOMY_FILE,
                          LIT_PLASMID_TAXONOMY_HEADER,
                          separater="\t"), "sample_id")

    if multi:
        m.mashsketch(input_fasta=fixed_fasta,
                     output_path=mash_input_fasta,
                     sketch_ind=True,
                     num_threads=num_threads)
        mash_results = parseMash(
            m.run_mash(reference_db=mash_db,
                       input_fasta=mash_input_fasta,
                       table=False,
                       num_threads=num_threads))

        for seq_id in mash_results:
            record = {}
            for field in MOB_TYPER_REPORT_HEADER:
                if field in contig_info[seq_id]:
                    record[field] = contig_info[seq_id][field]
                else:
                    record[field] = ''
            record['sample_id'] = seq_id
            record['num_contigs'] = 1
            distances = OrderedDict(
                sorted(mash_results[seq_id].items(),
                       key=itemgetter(1),
                       reverse=False))

            for mash_neighbor_id in distances:
                dist = distances[mash_neighbor_id]
                if mash_neighbor_id not in reference_sequence_meta:
                    continue
                else:
                    record['mash_nearest_neighbor'] = mash_neighbor_id
                    record['mash_neighbor_distance'] = dist
                    record['primary_cluster_id'] = reference_sequence_meta[
                        mash_neighbor_id]['primary_cluster_id']
                    record['secondary_cluster_id'] = reference_sequence_meta[
                        mash_neighbor_id]['secondary_cluster_id']
                    record[
                        'mash_neighbor_identification'] = reference_sequence_meta[
                            mash_neighbor_id]['organism']
                    break
            mobtyper_results.append(record)

    else:
        m.mashsketch(input_fasta=fixed_fasta,
                     output_path=mash_input_fasta,
                     sketch_ind=False,
                     num_threads=num_threads)
        mash_results = parseMash(
            m.run_mash(reference_db=mash_db,
                       input_fasta=mash_input_fasta,
                       table=False,
                       num_threads=num_threads))
        record = {}

        for field in MOB_TYPER_REPORT_HEADER:
            record[field] = ''

        record['sample_id'] = sample_id
        fastaSeqStats = calcFastaStats(fixed_fasta)
        record['md5'] = fastaSeqStats['md5']
        record['total_length'] = fastaSeqStats['size']
        record['num_contigs'] = fastaSeqStats['num_seq']
        record['gc'] = fastaSeqStats['gc_content']
        record['mash_nearest_neighbor'] = '-'
        record['mash_neighbor_distance'] = 1
        record['primary_cluster_id'] = '-'
        record['secondary_cluster_id'] = '-'
        record['mash_neighbor_identification'] = '-'

        for seq_id in mash_results:
            distances = OrderedDict(
                sorted(mash_results[seq_id].items(),
                       key=itemgetter(1),
                       reverse=False))
            mash_neighbor_id = next(iter(distances))
            dist = distances[mash_neighbor_id]
            if mash_neighbor_id not in reference_sequence_meta:
                continue
            record['mash_nearest_neighbor'] = mash_neighbor_id
            record['mash_neighbor_distance'] = dist
            record['primary_cluster_id'] = reference_sequence_meta[
                mash_neighbor_id]['primary_cluster_id']
            record['secondary_cluster_id'] = reference_sequence_meta[
                mash_neighbor_id]['secondary_cluster_id']
            record['mash_neighbor_identification'] = reference_sequence_meta[
                mash_neighbor_id]['organism']

        record['rep_type(s)'] = []
        record['rep_type_accession(s)'] = []
        record['relaxase_type(s)'] = []
        record['relaxase_type_accession(s)'] = []
        record['mpf_type'] = []
        record['mpf_type_accession(s)'] = []
        record['orit_type(s)'] = []
        record['orit_accession(s)'] = []

        for seq_id in contig_info:
            record['rep_type(s)'].append(contig_info[seq_id]['rep_type(s)'])
            record['rep_type_accession(s)'].append(
                contig_info[seq_id]['rep_type_accession(s)'])
            record['relaxase_type(s)'].append(
                contig_info[seq_id]['relaxase_type(s)'])
            record['relaxase_type_accession(s)'].append(
                contig_info[seq_id]['relaxase_type_accession(s)'])
            record['mpf_type'].append(contig_info[seq_id]['mpf_type'])
            record['mpf_type_accession(s)'].append(
                contig_info[seq_id]['mpf_type_accession(s)'])
            record['orit_type(s)'].append(contig_info[seq_id]['orit_type(s)'])
            record['orit_accession(s)'].append(
                contig_info[seq_id]['orit_accession(s)'])

        for field in record:
            tmp = []
            if record[field] == None:
                continue
            if isinstance(record[field], list):
                length = len(record[field])
                for i in range(0, length):
                    tmp += record[field][i].split(',')
            elif isinstance(record[field], str) and len(record[field]) > 0:
                tmp += record[field].split(',')
            if len(tmp) > 0:
                record[field] = []
                for d in tmp:
                    if len(d) > 0:
                        record[field].append(d)

        mobtyper_results.append(record)

    for i in range(0, len(mobtyper_results)):
        record = mobtyper_results[i]
        bio_markers = sort_biomarkers({
            0: {
                'types': record['rep_type(s)'],
                'acs': record['rep_type_accession(s)']
            },
            1: {
                'types': record['relaxase_type(s)'],
                'acs': record['relaxase_type_accession(s)']
            },
            2: {
                'types': record['mpf_type'],
                'acs': record['mpf_type_accession(s)']
            },
            3: {
                'types': record['orit_type(s)'],
                'acs': record['orit_accession(s)']
            },
        })

        record['rep_type(s)'] = bio_markers[0]['types']
        record['rep_type_accession(s)'] = bio_markers[0]['acs']
        record['relaxase_type(s)'] = bio_markers[1]['types']
        record['relaxase_type_accession(s)'] = bio_markers[1]['acs']
        record['mpf_type'] = bio_markers[2]['types']
        record['mpf_type_accession(s)'] = bio_markers[2]['acs']
        record['orit_type(s)'] = bio_markers[3]['types']
        record['orit_accession(s)'] = bio_markers[3]['acs']

        if (isinstance(record['mash_neighbor_distance'], float) or isinstance(
                record['mash_neighbor_distance'],
                int)) and record['mash_neighbor_distance'] <= primary_distance:
            mob_cluster_id = record['primary_cluster_id']
        else:
            mob_cluster_id = None

        #Patches that sometimes results are concatonated into strings if contigs are merged into a single results
        if isinstance(record['rep_type(s)'], list):
            record['rep_type(s)'] = ",".join(record['rep_type(s)'])
        if isinstance(record['relaxase_type_accession(s)'], list):
            record['relaxase_type_accession(s)'] = ",".join(
                record['relaxase_type_accession(s)'])

        host_range = hostrange(record['rep_type(s)'].split(','),
                               record['relaxase_type_accession(s)'].split(','),
                               mob_cluster_id, ncbi, lit)

        for field in host_range:
            record[field] = host_range[field]

        if isinstance(record['mpf_type'], list):
            record['mpf_type'] = determine_mpf_type(record['mpf_type'])
        elif isinstance(record['mpf_type'], str):
            record['mpf_type'] = determine_mpf_type(
                record['mpf_type'].split(','))

        for field in record:
            if isinstance(record[field], list):
                record[field] = ",".join(record[field])

        record['predicted_mobility'] = 'non-mobilizable'
        if len(record['relaxase_type(s)']) > 0 and len(record['mpf_type']):
            record['predicted_mobility'] = 'conjugative'
        elif len(record['relaxase_type(s)']) > 0 or len(
                record['orit_type(s)']) > 0:
            record['predicted_mobility'] = 'mobilizable'

        mobtyper_results[i] = record

    writeReport(mobtyper_results, MOB_TYPER_REPORT_HEADER, report_file)

    if not keep_tmp:
        shutil.rmtree(tmp_dir)
    logger.info(
        "MOB-typer completed and results written to {}".format(report_file))
示例#12
0
def main():
    """Run the MOB-cluster command-line workflow.

    Steps, in order:

    1. Parse arguments, set up console logging and validate the inputs
       (input fasta, MOB-typer report, mode, distance thresholds).
    2. Load the MOB-typer records and the taxonomy file and attach an
       organism / taxid to every record.
    3. Verify the MOB-typer report and the input fasta share the same ids.
    4. Compute mash distances between the input sequences and derive
       cluster assignments.
    5. In ``update`` mode merge the new assignments into an existing
       reference cluster file; in ``build`` mode write a fresh one.
    6. Mash-sketch and makeblastdb the resulting reference fasta, then
       remove the temporary directory.

    Any validation failure is logged and terminates the process through
    ``sys.exit(-1)`` (raises ``SystemExit``) so callers and shell scripts
    see a non-zero exit status.
    """
    args = parse_args()
    logger = init_console_logger(3 if args.debug else 2)

    def fail(*messages):
        # Log every message at error level and abort with a failure status.
        for message in messages:
            logger.error(message)
        sys.exit(-1)

    logger.info(
        'Running Mob-Suite Clustering toolkit v. {}'.format(__version__))
    logger.info('Processing fasta file {}'.format(args.infile))
    logger.info('Analysis directory {}'.format(args.outdir))

    check_dependencies(logger)

    input_fasta = args.infile
    if not os.path.isfile(input_fasta):
        fail('Error, input fasta specified does not exist: {}'.format(
            input_fasta))

    mob_typer_report_file = args.mob_typer_file
    if not os.path.isfile(mob_typer_report_file):
        fail('Error, input metadata file specified does not exist: {}'.format(
            mob_typer_report_file))

    mode = str(args.mode).lower()
    if mode not in ('update', 'build'):
        fail(
            'Error you have not entered a valid mode of build or update, you entered: {}'
            .format(mode))

    out_dir = args.outdir
    num_threads = args.num_threads

    primary_distance = args.primary_cluster_dist
    if not 0 <= primary_distance <= 1:
        fail('Error distance thresholds must be between 0 - 1: {}'.format(
            primary_distance))

    secondary_distance = args.secondary_cluster_dist
    if not 0 <= secondary_distance <= 1:
        fail('Error distance thresholds must be between 0 - 1: {}'.format(
            secondary_distance))

    if not os.path.isdir(out_dir):
        logger.info('Creating directory {}'.format(out_dir))
        os.mkdir(out_dir, 0o755)
    tmp_dir = os.path.join(out_dir, '__tmp')
    if not os.path.isdir(tmp_dir):
        logger.info('Creating directory {}'.format(tmp_dir))
        os.mkdir(tmp_dir, 0o755)

    taxonomy_file = args.taxonomy

    records = read_file_to_dict(mob_typer_report_file,
                                MOB_TYPER_REPORT_HEADER,
                                separater="\t")

    # Index the MOB-typer records by sample id, remembering repeated ids.
    seq_ids = []
    new_seq_info = {}
    duplicate_keys = []
    for record in records:
        sample_id = record['sample_id']
        seq_ids.append(sample_id)
        if sample_id in new_seq_info:
            duplicate_keys.append(sample_id)
        else:
            new_seq_info[sample_id] = record

    if duplicate_keys:
        fail(
            "Duplicate sequence identifiers in fasta file. Please make every sequence id unique in the input file before using this tool",
            "Duplicate sequence ids: {}".format(",".join(duplicate_keys)))

    record_identifications = read_file_to_dict(taxonomy_file,
                                               ['sample_id', 'organism'],
                                               separater="\t")
    organisms = []
    for record in record_identifications:
        organism = record['organism']
        # Unknown / blank identifications are recorded as 'Bacteria'.
        if organism in ('unknown', '', 'Unknown'):
            organism = 'Bacteria'
        organisms.append(organism)
        seq_id = record['sample_id']
        if seq_id in new_seq_info:
            new_seq_info[seq_id]['organism'] = organism

    taxids = NamesToTaxIDs(organisms)
    del organisms

    # NOTE(review): this lookup assumes every MOB-typer record already has an
    # 'organism' entry or was listed in the taxonomy file -- confirm; a
    # sequence missing from the taxonomy file would raise KeyError here.
    for seq_id in new_seq_info:
        organism = new_seq_info[seq_id]['organism']
        if organism in taxids:
            new_seq_info[seq_id]['taxid'] = taxids[organism][0]
        else:
            # Fallback taxid used when the organism name was not resolved.
            new_seq_info[seq_id]['taxid'] = 2

    if not new_seq_info:
        fail(
            'Error no MOB-typer results for sequences. Sequences must be typed with MOB-typer first'
        )

    fasta_dict = read_fasta_dict(input_fasta)

    if len(fasta_dict) == 0:
        fail('Error no sequences found in input fasta: {}..cannot continue'.
             format(input_fasta))

    key_set_1 = set(seq_ids)
    key_set_2 = set(fasta_dict.keys())

    if key_set_1 != key_set_2:
        fail(
            'Error MOB-typer results: {} and input fasta: {} do not have the same set of identifiers, these must match in order to proceed'
            .format(mob_typer_report_file, input_fasta),
            'Keys present in  MOB-typer results: {} and not in input fasta: {} are: {}'
            .format(mob_typer_report_file, input_fasta,
                    list(key_set_1 - key_set_2)),
            'Keys present in input fasta: {} and not in MOB-typer results: {} are: {}'
            .format(input_fasta, mob_typer_report_file,
                    list(key_set_2 - key_set_1)))

    tmp_cluster_file = os.path.join(out_dir, 'clusters.txt')
    tmp_ref_fasta_file = os.path.join(tmp_dir, 'references_tmp.fasta')
    update_fasta = os.path.join(out_dir, 'references_updated.fasta')

    # Sketch and calculate distances within update sequences
    if len(fasta_dict) > 1:
        mashObj = mash()
        mashObj.mashsketch(input_fasta,
                           input_fasta + ".msh",
                           num_threads=num_threads)
        distance_matrix_file = os.path.join(tmp_dir, 'mash_dist_matrix.txt')
        distance_table = mashObj.run_mash(input_fasta + '.msh',
                                          input_fasta + '.msh',
                                          table=True,
                                          num_threads=num_threads).decode()
        # Context manager guarantees the handle is closed even on error.
        with open(distance_matrix_file, 'w',
                  encoding="utf-8") as mashfile_handle:
            mashfile_handle.write(distance_table)
        clust_assignments = build_cluster_db(
            distance_matrix_file, (primary_distance, secondary_distance))
    else:
        # A lone sequence has nothing to be compared against, so it is given
        # a fixed assignment instead of running mash.
        seq_id = next(iter(fasta_dict))
        clust_assignments = {seq_id: [0, 1]}

    logger.info('Running MOB-cluster in {} mode'.format(mode))
    if mode == 'update':

        # Both reference inputs are mandatory in update mode.
        ref_fasta = args.ref_fasta_file
        if ref_fasta is None:
            fail(
                'Reference fasta file must be specified, please check help for parameter reference'
            )
        if not os.path.isfile(ref_fasta):
            fail('Reference fasta file specified does not exist: {}'.format(
                ref_fasta))

        ref_cluster_file = args.ref_cluster_file
        if ref_cluster_file is None:
            fail(
                'Reference cluster file must be specified, please check help for parameter reference'
            )
        if not os.path.isfile(ref_cluster_file):
            fail('Reference cluster file specified does not exist: {}'.format(
                ref_cluster_file))

        mob_cluster_seq_info = read_sequence_info(ref_cluster_file,
                                                  MOB_CLUSTER_INFO_HEADER)

        logger.info(
            'Running mob-cluster in update mode with input file: {}'.format(
                input_fasta))
        logger.info(
            'Running mob-cluster in update mode with output directory: {}'.
            format(out_dir))
        logger.info(
            'Running mob-cluster in update mode on reference fasta file: {}'.
            format(ref_fasta))
        logger.info(
            'Reading previous cluster reference assignments from : {}'.format(
                ref_cluster_file))

        shutil.copy(ref_cluster_file, tmp_cluster_file)
        shutil.copy(ref_fasta, tmp_ref_fasta_file)
        logger.info('Creating new cluster assignments')
        new_seq_info = update_existing_db(new_seq_info, mob_cluster_seq_info,
                                          clust_assignments, primary_distance,
                                          secondary_distance, num_threads)

        # Entries from the new sequences win over existing reference entries
        # that share the same key.
        cluster_assignments = {**mob_cluster_seq_info, **new_seq_info}
        logger.info(
            'Writting cluster assignments to : {}'.format(tmp_cluster_file))
        writeClusterAssignments(tmp_cluster_file, MOB_CLUSTER_INFO_HEADER,
                                cluster_assignments)
        # update_fasta already includes out_dir; joining it with out_dir again
        # would produce out_dir/out_dir/... for a relative output directory.
        shutil.copy(tmp_ref_fasta_file, update_fasta)

    else:
        cluster_acs = convert_num_to_acs(clust_assignments)
        for seq_id in cluster_acs:
            primary_key, secondary_key = cluster_acs[seq_id][0], cluster_acs[
                seq_id][1]
            new_seq_info[seq_id]['primary_cluster_id'] = primary_key
            new_seq_info[seq_id]['primary_dist'] = primary_distance
            new_seq_info[seq_id]['secondary_cluster_id'] = secondary_key
            new_seq_info[seq_id]['secondary_dist'] = secondary_distance

        writeClusterAssignments(tmp_cluster_file, MOB_CLUSTER_INFO_HEADER,
                                new_seq_info)
        shutil.copy(input_fasta, update_fasta)

    logger.info("Sketching new fasta {}".format(update_fasta))
    mash_db_file = "{}.msh".format(update_fasta)
    mObj = mash()
    mObj.mashsketch(update_fasta, mash_db_file, num_threads=num_threads)
    logger.info("Building blastdb {}".format(update_fasta))
    blast_runner = BlastRunner(update_fasta, '')
    blast_runner.makeblastdb(update_fasta, 'nucl', logging=logger)
    logger.info("Removing temporary directory")
    shutil.rmtree(tmp_dir)
    logger.info(
        "MOB-cluster completed, analysis results written to {}".format(
            out_dir))