Exemplo n.º 1
0
Arquivo: bins.py Projeto: chuym726/CAT
def run():
    """Run the BAT workflow: optional protein prediction and alignment,
    followed by bin classification.

    All settings are read from the object returned by ``parse_arguments()``;
    the function takes no parameters and returns None.

    Which steps run depends on the supplied input files:

    * no protein fasta, no alignment file: predict proteins, align, classify;
    * protein fasta only: align, classify;
    * protein fasta and alignment file: classify only;
    * alignment file without protein fasta: an error is reported and the
      process exits.

    Two tab-separated files are written, both derived from
    ``args.out_prefix``: ``<out_prefix>.bin2classification.txt`` (one or more
    rows per bin) and ``<out_prefix>.ORF2LCA.txt`` (one row per ORF).

    Exits via ``sys.exit(1)`` when an alignment file is given without a
    protein fasta, or when ``True`` is found among the collected pre-flight
    check results.
    """
    args = parse_arguments()

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)
    
    # Check at which state to start.
    # step_list records the steps to execute; it is consulted below both by
    # the pre-flight checks and by the actual workflow.
    step_list = []
    if not args.proteins_fasta and not args.alignment_file:
        message = (
                '\n'
                'BAT is running. Protein prediction, alignment, and bin '
                'classification are carried out.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)

        step_list.append('predict_proteins')
        step_list.append('align')
    elif args.proteins_fasta and not args.alignment_file:
        message = (
                '\n'
                'BAT is running. Since a predicted protein fasta is supplied, '
                'only alignment and bin classification are carried out.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)

        step_list.append('align')
    elif args.proteins_fasta and args.alignment_file:
        message = (
                '\n'
                'BAT is running. Since a predicted protein fasta and '
                'alignment file are supplied, only bin classification is '
                'carried out.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)
    elif not args.proteins_fasta and args.alignment_file:
        # An alignment file alone is rejected: the protein fasta is needed
        # further down (shared.import_ORFs reads args.proteins_fasta).
        message = (
                'if you want BAT to directly classify a set of bins, you '
                'should not only supply a DIAMOND alignment table but also a '
                'concatenated predicted protein fasta file with argument '
                '[-p / --proteins].')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                error=True)

        sys.exit(1)

    # Classification is part of every run that gets this far.
    step_list.append('classify')

    # Print variables.
    # r and f are converted to int / float for display only; args itself is
    # not modified here.
    message = (
            'Rarw!\n\n'
            'Supplied command: {0}\n\n'
            'Bin folder: {1}\n'
            'Taxonomy folder: {2}\n'
            'Database folder: {3}\n'
            'Parameter r: {4}\n'
            'Parameter f: {5}\n'
            'Log file: {6}\n\n'
            '-----------------\n'.format(
                ' '.join(sys.argv),
                args.bin_folder,
                args.taxonomy_folder,
                args.database_folder,
                int(args.r),
                float(args.f),
                args.log_file))
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    # Check binaries, output files, taxonomy folder and database folder, and
    # set variables.
    message = 'Doing some pre-flight checks first.'
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)

    # Each check's return value is collected; the run is aborted further down
    # if True is among them. All checks run before that decision is made.
    errors = []

    errors.append(
            check.check_bin_folder(
                args.bin_folder, args.bin_suffix, args.log_file, args.quiet))
    
    errors.append(
            check.check_out_prefix(args.out_prefix, args.log_file, args.quiet))
    
    if 'predict_proteins' in step_list:
        errors.append(
                check.check_prodigal_binaries(
                    args.path_to_prodigal, args.log_file, args.quiet))

        # The intermediate files of the prediction step are named after the
        # output prefix and stored on args for the later steps.
        setattr(args,
                'concatenated_fasta',
                '{0}.concatenated.fasta'.format(args.out_prefix))
        setattr(args,
                'proteins_fasta',
                '{0}.concatenated.predicted_proteins.faa'.format(
                    args.out_prefix))
        setattr(args,
                'proteins_gff',
                '{0}.concatenated.predicted_proteins.gff'.format(
                    args.out_prefix))

        # NOTE(review): check_output_file presumably flags already existing
        # files; these checks are skipped when args.force is set -- confirm
        # in the check module.
        if not args.force:
            errors.append(
                    check.check_output_file(
                        args.concatenated_fasta, args.log_file, args.quiet))
            errors.append(
                    check.check_output_file(
                        args.proteins_fasta, args.log_file, args.quiet))
            errors.append(
                    check.check_output_file(
                        args.proteins_gff, args.log_file, args.quiet))
            
    if 'align' in step_list:
        errors.append(
                check.check_diamond_binaries(
                    args.path_to_diamond, args.log_file, args.quiet))

        # The alignment output path is derived from the output prefix as well.
        setattr(args,
                'alignment_file',
                '{0}.concatenated.alignment.diamond'.format(args.out_prefix))

        if not args.force:
            errors.append(
                    check.check_output_file(
                        args.alignment_file, args.log_file, args.quiet))

    # step_list is passed along so the check can take the planned steps into
    # account.
    errors.append(
            check.check_folders_for_run(
                args.taxonomy_folder,
                args.nodes_dmp,
                args.names_dmp,
                args.database_folder,
                args.diamond_database,
                args.fastaid2LCAtaxid_file,
                args.taxids_with_multiple_offspring_file,
                step_list,
                args.log_file,
                args.quiet))

    # Paths of the two final result files.
    setattr(args,
            'bin2classification_output_file',
            '{0}.bin2classification.txt'.format(args.out_prefix))
    setattr(args,
            'ORF2LCA_output_file',
            '{0}.ORF2LCA.txt'.format(args.out_prefix))

    if not args.force:
        errors.append(
                check.check_output_file(
                    args.bin2classification_output_file,
                    args.log_file,
                    args.quiet))
        errors.append(
                check.check_output_file(
                    args.ORF2LCA_output_file, args.log_file, args.quiet))
        
    # A user-supplied protein fasta is checked; a fasta that still has to be
    # produced by the prediction step is not.
    if 'predict_proteins' not in step_list:
        errors.append(
                check.check_fasta(
                    args.proteins_fasta, args.log_file, args.quiet))

    if 'align' in step_list:
        errors.append(
                check.check_top(args.top, args.r, args.log_file, args.quiet))

    # Print all variables.
    shared.print_variables(args, step_list)

    # Abort only now, after every check above has been executed.
    if True in errors:
        sys.exit(1)

    message = 'Ready to fly!\n\n-----------------\n'
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)
    
    # Start BAT.
    (bin2contigs, contig_names) = import_bins(
            args.bin_folder, args.bin_suffix, args.log_file, args.quiet)

    if 'predict_proteins' in step_list:
        make_concatenated_fasta(
                args.concatenated_fasta,
                bin2contigs,
                args.bin_folder,
                args.log_file,
                args.quiet)

        shared.run_prodigal(
                args.path_to_prodigal,
                args.concatenated_fasta,
                args.proteins_fasta,
                args.proteins_gff,
                args.log_file,
                args.quiet)
        
    # args.proteins_fasta is either the user-supplied file or the path set
    # above for the prediction step.
    contig2ORFs = shared.import_ORFs(
            args.proteins_fasta, args.log_file, args.quiet)
    
    check.check_whether_ORFs_are_based_on_contigs(
            contig_names, contig2ORFs, args.log_file, args.quiet)
    
    if 'align' in step_list:
        shared.run_diamond(args)

    # NOTE(review): args.one_minus_r is not set anywhere in this function;
    # presumably parse_arguments() provides it -- confirm.
    (ORF2hits,
            all_hits) = shared.parse_tabular_alignment(
                    args.alignment_file,
                    args.one_minus_r,
                    args.log_file,
                    args.quiet)

    # taxid2rank is unpacked but not used further in this function.
    (taxid2parent,
            taxid2rank) = tax.import_nodes(
            args.nodes_dmp, args.log_file, args.quiet)
    fastaid2LCAtaxid = tax.import_fastaid2LCAtaxid(
            args.fastaid2LCAtaxid_file, all_hits, args.log_file, args.quiet)
    taxids_with_multiple_offspring = tax.import_taxids_with_multiple_offspring(
            args.taxids_with_multiple_offspring_file,
            args.log_file,
            args.quiet)
    
    message = 'BAT is flying! Files {0} and {1} are created.'.format(
        args.bin2classification_output_file, args.ORF2LCA_output_file)
    shared.give_user_feedback(message, args.log_file, args.quiet)

    # Counts the bins that end up with at least one lineage written.
    n_classified_bins = 0

    # outf1: per-bin classification table; outf2: per-ORF table.
    with open(args.bin2classification_output_file, 'w') as outf1, open(args.ORF2LCA_output_file, 'w') as outf2:
        outf1.write('# bin\tclassification\treason\tlineage\tlineage scores\n')

        outf2.write('# ORF\tbin\tnumber of hits\tlineage\ttop bit-score\n')
        
        # Bins and contigs are visited in sorted order.
        for bin_ in sorted(bin2contigs):
            # (taxid, top_bitscore) for every ORF of this bin that has hits.
            LCAs_ORFs = []

            for contig in sorted(bin2contigs[bin_]):
                # Contigs without an entry in contig2ORFs are skipped.
                if contig not in contig2ORFs:
                    continue

                for ORF in contig2ORFs[contig]:
                    if ORF not in ORF2hits:
                        # NOTE: this row has three fields, fewer than the
                        # five named in the header.
                        outf2.write('{0}\t{1}\tORF has no hit to database\n'
                                ''.format(ORF, bin_))

                        continue

                    n_hits = len(ORF2hits[ORF])

                    (taxid,
                            top_bitscore) = tax.find_LCA_for_ORF(
                                    ORF2hits[ORF],
                                    fastaid2LCAtaxid,
                                    taxid2parent)

                    # A taxid starting with 'no taxid found' is a message
                    # rather than a taxid; it is written as-is in the lineage
                    # column.
                    if taxid.startswith('no taxid found'):
                        outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            ORF, bin_, n_hits, taxid, top_bitscore))
                    else:
                        lineage = tax.find_lineage(taxid, taxid2parent)

                        if not args.no_stars:
                            lineage = tax.star_lineage(
                                    lineage, taxids_with_multiple_offspring)

                        # The lineage is written in reversed order.
                        # NOTE(review): presumably find_lineage runs from the
                        # taxid up to the root, so this prints root first --
                        # confirm in the tax module.
                        outf2.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                            ORF,
                            bin_,
                            n_hits,
                            ';'.join(lineage[::-1]),
                            top_bitscore))

                    # Reached from both branches above, so 'no taxid found'
                    # results are passed on to find_weighted_LCA as well.
                    LCAs_ORFs.append((taxid, top_bitscore),)
                    
            # No ORF of this bin had any hit.
            # NOTE: the three "no taxid assigned" rows below have three
            # fields, fewer than the five named in the header.
            if len(LCAs_ORFs) == 0:
                outf1.write('{0}\tno taxid assigned\tno hits to database\n'
                        ''.format(bin_))

                continue

            (lineages,
                    lineages_scores,
                    based_on_n_ORFs) = tax.find_weighted_LCA(
                            LCAs_ORFs, taxid2parent, args.f)

            # find_weighted_LCA signals failure by returning one of two
            # message strings in place of the lineages.
            if lineages == 'no ORFs with taxids found.':
                outf1.write('{0}\tno taxid assigned\t'
                        'hits not found in taxonomy files\n'.format(bin_))

                continue

            if lineages == 'no lineage whitelisted.':
                outf1.write(
                        '{0}\tno taxid assigned\t'
                        'no lineage reached minimum bit-score support\n'
                        ''.format(bin_))

                continue
            
            # The bin has a valid classification.
            n_classified_bins += 1

            # Denominator for the "based on x/y ORFs" column: all ORFs on the
            # bin's contigs, whether or not they had hits.
            total_n_ORFs = sum([len(contig2ORFs[contig]) for
                contig in bin2contigs[bin_] if contig in contig2ORFs])
            
            # One output row per lineage; lineages_scores[i] belongs to
            # lineages[i].
            for (i, lineage) in enumerate(lineages):
                if not args.no_stars:
                    lineage = tax.star_lineage(
                            lineage, taxids_with_multiple_offspring)
                
                scores = ['{0:.2f}'.format(score) for
                        score in lineages_scores[i]]
                
                # Lineage and scores are both reversed, so the columns stay
                # aligned with each other.
                if len(lineages) == 1:
                    # There is only one classification.
                    outf1.write(
                            '{0}\t'
                            'taxid assigned\t'
                            'based on {1}/{2} ORFs\t'
                            '{3}\t'
                            '{4}\n'.format(
                                bin_,
                                based_on_n_ORFs,
                                total_n_ORFs,
                                ';'.join(lineage[::-1]),
                                ';'.join(scores[::-1])))
                else:
                    # There are multiple classifications.
                    # The rows are numbered 1..len(lineages).
                    outf1.write(
                            '{0}\t'
                            'taxid assigned ({1}/{2})\t'
                            'based on {3}/{4} ORFs\t'
                            '{5}\t'
                            '{6}\n'.format(
                                bin_,
                                i + 1,
                                len(lineages),
                                based_on_n_ORFs,
                                total_n_ORFs,
                                ';'.join(lineage[::-1]),
                                ';'.join(scores[::-1])))
                                   
    message = ('\n-----------------\n\n'
            '{0} BAT is done! {1:,d}/{2:,d} bins have taxonomy assigned.'
            ''.format(shared.timestamp(), n_classified_bins, len(bin2contigs)))
    shared.give_user_feedback(message, args.log_file, args.quiet,
            show_time=False)
  
    # Point out that the chosen f allows multiple classifications per bin.
    if args.f < 0.5:
        message = ('\nWARNING: since f is set to smaller than 0.5, one bin '
                'may have multiple classifications.')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                show_time=False)

    return
Exemplo n.º 2
0
def prepare(step_list, args):
    """Carry out the database-preparation steps named in ``step_list``.

    The taxonomy and database folders are created when they do not exist.
    After that, each step whose name is present in ``step_list`` is run in a
    fixed order: downloading the taxonomy files, the prot.accession2taxid
    file and nr, building the DIAMOND database, and writing the
    fastaid2LCAtaxid and taxids_with_multiple_offspring files. The paths of
    the files a step produces are stored as attributes on ``args``. The
    function ends by printing which arguments to supply to CAT or BAT.

    Args:
        step_list: container with the names of the steps to execute.
        args: settings object; read for folders, date, binaries and logging
            options, and extended with the derived file paths.

    Returns:
        None.

    File paths are built by putting the file name directly behind the folder
    string -- assumes both folder arguments end with a path separator (TODO
    confirm against the argument parser). ``args.nodes_dmp`` and
    ``args.nr_file`` are only assigned here by their download steps, so they
    presumably already exist on ``args`` when those steps are skipped --
    verify against the caller.
    """
    shared.print_variables(args, step_list)

    log_file = args.log_file
    quiet = args.quiet

    def report(text, **options):
        # All feedback in this function goes to the same log file with the
        # same quiet setting.
        shared.give_user_feedback(text, log_file, quiet, **options)

    # Create the target folders that are missing, taxonomy folder first.
    folder_notices = (
        (args.taxonomy_folder, 'Taxonomy folder {0} is created.'),
        (args.database_folder, 'Database folder {0} is created.'),
    )
    for folder, notice in folder_notices:
        if os.path.isdir(folder):
            continue

        os.mkdir(folder)
        report(notice.format(folder))

    if 'download_taxonomy_files' in step_list:
        download_taxonomy_files(
            args.taxonomy_folder, args.date, log_file, quiet)

        args.nodes_dmp = '{0}nodes.dmp'.format(args.taxonomy_folder)

    if 'download_prot_accession2taxid_file' in step_list:
        args.prot_accession2taxid_file = (
            '{0}{1}.prot.accession2taxid.FULL.gz'.format(
                args.taxonomy_folder, args.date))

        download_prot_accession2taxid_file(
            args.prot_accession2taxid_file, args.date, log_file, quiet)

    if 'download_nr' in step_list:
        args.nr_file = '{0}{1}.nr.gz'.format(args.database_folder, args.date)

        download_nr(args.nr_file, log_file, quiet)

    if 'make_diamond_database' in step_list:
        args.diamond_database_prefix = '{0}{1}.nr'.format(
            args.database_folder, args.date)

        make_diamond_database(
            args.path_to_diamond,
            args.nr_file,
            args.diamond_database_prefix,
            args.nproc,
            log_file,
            quiet,
            args.verbose)

    wants_fastaid2lca = 'make_fastaid2LCAtaxid_file' in step_list
    wants_offspring = 'make_taxids_with_multiple_offspring_file' in step_list

    # Both remaining steps need the parent mapping, so nodes.dmp is imported
    # once when at least one of them is requested. The rank mapping that
    # comes with it is not used here.
    if wants_fastaid2lca or wants_offspring:
        (taxid2parent, _taxid2rank) = tax.import_nodes(
            args.nodes_dmp, log_file, quiet)

    if wants_fastaid2lca:
        args.fastaid2LCAtaxid_file = '{0}{1}.nr.fastaid2LCAtaxid'.format(
            args.database_folder, args.date)

        make_fastaid2LCAtaxid_file(
            args.nodes_dmp,
            args.fastaid2LCAtaxid_file,
            args.nr_file,
            args.prot_accession2taxid_file,
            taxid2parent,
            log_file,
            quiet)

    if wants_offspring:
        args.taxids_with_multiple_offspring_file = (
            '{0}{1}.nr.taxids_with_multiple_offspring'.format(
                args.database_folder, args.date))

        taxid2offspring = find_offspring(
            args.nodes_dmp,
            args.fastaid2LCAtaxid_file,
            taxid2parent,
            log_file,
            quiet)
        write_taxids_with_multiple_offspring_file(
            args.taxids_with_multiple_offspring_file,
            taxid2offspring,
            log_file,
            quiet)

    report(
        '\n-----------------\n\n'
        '{0} CAT prepare is done!'.format(shared.timestamp()),
        show_time=False)

    if args.nr_file:
        report(
            'You may remove {0} now.'.format(args.nr_file),
            show_time=False)

    report(
        '\nSupply the following arguments to CAT or BAT if you want to '
        'use this database:\n'
        '-d / --database_folder {0}\n'
        '-t / --taxonomy_folder {1}'.format(
            args.database_folder, args.taxonomy_folder),
        show_time=False)

    return