def summarise_bins(args):
    """Summarise a BAT classification file per official taxonomic rank.

    Counts, for every official rank, how many bins are assigned to each
    clade in ``args.input_file`` and writes the counts as a tab-separated
    table to ``args.output_file``, largest clade first within each rank.

    Args:
        args: namespace-like object providing ``input_file``,
            ``output_file``, ``log_file``, ``quiet`` and ``force``.

    The process is terminated with ``sys.exit(1)`` when an input/output
    check reports an error, when no header line is found, when the header
    is not a BAT header or lacks a 'superkingdom' column, or when a bin
    occurs on more than one line.
    """
    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, args.log_file, args.quiet,
                              show_time=False)

    errors = [
        check.check_input_file(args.input_file, args.log_file, args.quiet)
    ]
    # The output check is skipped when the user passed the force flag.
    if not args.force:
        errors.append(
            check.check_output_file(args.output_file, args.log_file,
                                    args.quiet))

    if True in errors:
        sys.exit(1)

    message = 'Summarising...'
    shared.give_user_feedback(message, args.log_file, args.quiet)

    # First pass: locate the first header line and the column where the
    # official ranks start.
    with open(args.input_file, 'r') as f1:
        for line in f1:
            if not line.startswith('#'):
                continue

            line = line.split('\t')

            if line[0] != '# bin':
                message = '{0} is not a BAT classification file.'.format(
                    args.input_file)
                shared.give_user_feedback(message, args.log_file, args.quiet,
                                          error=True)

                if line[0] == '# contig':
                    message = (
                        '{0} appears to be a CAT classification file. '
                        'If you want to summarise contig '
                        'classifications, supply a contigs fasta with '
                        'argument [-c / --contigs_fasta].'.format(
                            args.input_file))
                    shared.give_user_feedback(message, args.log_file,
                                              args.quiet, error=True)

                sys.exit(1)

            try:
                superkingdom_index = line.index('superkingdom')
            except ValueError:
                # list.index only raises ValueError; anything else should
                # not be silently turned into this message.
                message = (
                    'official ranks not found in header of {0}. Make '
                    'sure that the BAT classification file is named '
                    'with official ranks with \'CAT add_names '
                    '--only_official\'.'.format(args.input_file))
                shared.give_user_feedback(message, args.log_file, args.quiet,
                                          error=True)

                sys.exit(1)

            break
        else:
            message = 'input file does not have a recognisable header.'
            shared.give_user_feedback(message, args.log_file, args.quiet,
                                      error=True)

            sys.exit(1)

    official_ranks = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]

    # n_bins[rank][clade] -> number of bins; the extra key counts bins
    # without a taxid.
    n_bins = {rank: {} for rank in official_ranks}
    n_bins['no taxid assigned'] = 0

    n = 0
    bin_trace = set()
    doubles = set()

    # Second pass: tally every non-header line.
    with open(args.input_file, 'r') as f1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                continue

            n += 1

            line = line.split('\t')

            bin_ = line[0]
            if bin_ in bin_trace:
                doubles.add(bin_)
            bin_trace.add(bin_)

            if line[1] == 'no taxid assigned':
                n_bins['no taxid assigned'] += 1
                continue

            for (i, classification) in enumerate(line[superkingdom_index:]):
                # Drop the part after the last ': ' and any trailing
                # '*' characters from the clade name.
                classification = classification.rsplit(': ', 1)[0].rstrip('*')

                counts = n_bins[official_ranks[i]]
                counts[classification] = counts.get(classification, 0) + 1

    if doubles:
        # Sorted so that the message is reproducible between runs.
        message = ('some bins have multiple classifications. CAT summarise '
                   'currently does not allow for this. Bins with multiple '
                   'classifications: {0}.'.format(', '.join(sorted(doubles))))
        shared.give_user_feedback(message, args.log_file, args.quiet,
                                  error=True)

        sys.exit(1)

    n_classified_bins = n - n_bins['no taxid assigned']
    # A file with a header but no bins must not crash on a division by zero.
    classified_percentage = n_classified_bins / n * 100 if n else 0.0

    with open(args.output_file, 'w') as outf1:
        outf1.write('# total number of bins is {0:,d}, of which {1:,d} '
                    '({2:.2f}%) have taxonomy assigned.\n'.format(
                        n, n_classified_bins, classified_percentage))
        outf1.write('#\n')
        outf1.write('# rank\tclade\tnumber of bins\n')

        for rank in official_ranks:
            for clade in sorted(n_bins[rank],
                                key=lambda x: n_bins[rank][x],
                                reverse=True):
                outf1.write('{0}\t{1}\t{2}\n'.format(rank, clade,
                                                     n_bins[rank][clade]))

    message = '{0} is created!'.format(args.output_file)
    shared.give_user_feedback(message, args.log_file, args.quiet)

    return
def summarise_contigs(args):
    """Summarise a CAT classification file per official taxonomic rank.

    For every official rank and clade, reports the number of contigs, the
    summed number of ORFs on those contigs and the summed contig length,
    written as a tab-separated table to ``args.output_file``. Clades are
    ordered by summed length, largest first, within each rank.

    Args:
        args: namespace-like object providing ``input_file``,
            ``output_file``, ``contigs_fasta``, ``log_file``, ``quiet``
            and ``force``.

    The process is terminated with ``sys.exit(1)`` when an input/output
    check reports an error, when no header line is found, when the header
    is not a CAT header or lacks a 'superkingdom' column, when a contig is
    missing from the contig lengths, when a contig occurs on more than one
    line, or when the number of lines differs from the number of contigs.
    """
    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, args.log_file, args.quiet,
                              show_time=False)

    errors = [
        check.check_input_file(args.input_file, args.log_file, args.quiet)
    ]
    # The output check is skipped when the user passed the force flag.
    if not args.force:
        errors.append(
            check.check_output_file(args.output_file, args.log_file,
                                    args.quiet))

    if True in errors:
        sys.exit(1)

    # Used below as a mapping from contig name to contig length.
    contig2length = import_contig_lengths(args.contigs_fasta, args.log_file,
                                          args.quiet)

    message = 'Summarising...'
    shared.give_user_feedback(message, args.log_file, args.quiet)

    # First pass: locate the first header line and the column where the
    # official ranks start.
    with open(args.input_file, 'r') as f1:
        for line in f1:
            if not line.startswith('#'):
                continue

            line = line.split('\t')

            if line[0] != '# contig':
                message = '{0} is not a CAT classification file.'.format(
                    args.input_file)
                shared.give_user_feedback(message, args.log_file, args.quiet,
                                          error=True)

                if line[0] == '# bin':
                    message = (
                        '{0} appears to be a BAT classification file. '
                        'If you want to summarise bin '
                        'classifications, simply don\'t supply a '
                        'contigs fasta and everything should be fine.'
                        ''.format(args.input_file))
                    shared.give_user_feedback(message, args.log_file,
                                              args.quiet, error=True)

                sys.exit(1)

            try:
                superkingdom_index = line.index('superkingdom')
            except ValueError:
                # list.index only raises ValueError; anything else should
                # not be silently turned into this message.
                message = (
                    'official ranks not found in header of {0}. Make '
                    'sure that the CAT classification file is named '
                    'with official ranks with \'CAT add_names '
                    '--only_official\'.'.format(args.input_file))
                shared.give_user_feedback(message, args.log_file, args.quiet,
                                          error=True)

                sys.exit(1)

            break
        else:
            message = 'input file does not have a recognisable header.'
            shared.give_user_feedback(message, args.log_file, args.quiet,
                                      error=True)

            sys.exit(1)

    official_ranks = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]

    # length[rank][clade] and ORFs[rank][clade] hold one entry per contig.
    length = {rank: {} for rank in official_ranks}
    length['no taxid assigned'] = []
    ORFs = {rank: {} for rank in official_ranks}

    n = 0
    contig_trace = set()
    doubles = set()

    # Second pass: tally every non-header line.
    with open(args.input_file, 'r') as f1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                continue

            n += 1

            line = line.split('\t')

            contig = line[0]
            if contig in contig_trace:
                doubles.add(contig)
            contig_trace.add(contig)

            if contig not in contig2length:
                message = (
                    'contig {0} in CAT classification file is not found '
                    'in supplied contigs fasta file. Are you sure the CAT '
                    'classification file is based on the contigs fasta?'
                    ''.format(contig))
                shared.give_user_feedback(message, args.log_file, args.quiet,
                                          error=True)

                sys.exit(1)

            contig_length = contig2length[contig]

            if line[1] == 'no taxid assigned':
                length['no taxid assigned'].append(contig_length)
                continue

            # NOTE that the total number of ORFs on a contig is reported,
            # not only the number of ORFs a classification is based on.
            # The value is the same for every rank, so parse it once.
            ORFs_on_contig = int(line[2].split('/')[1].split(' ')[0])

            for (i, classification) in enumerate(line[superkingdom_index:]):
                # Drop the part after the last ': ' and any trailing
                # '*' characters from the clade name.
                classification = classification.rsplit(': ', 1)[0].rstrip('*')

                rank = official_ranks[i]

                length[rank].setdefault(classification, []).append(
                    contig_length)
                ORFs[rank].setdefault(classification, []).append(
                    ORFs_on_contig)

    if doubles:
        # Sorted so that the message is reproducible between runs.
        message = ('some contigs have multiple classifications. CAT summarise '
                   'currently does not allow for this. Contigs with multiple '
                   'classifications: {0}.'.format(', '.join(sorted(doubles))))
        shared.give_user_feedback(message, args.log_file, args.quiet,
                                  error=True)

        sys.exit(1)

    if n != len(contig2length):
        message = ('the number of classified contigs is not the same as the '
                   'number of contigs in contigs fasta. Are you sure the CAT '
                   'classification file is based on the contigs fasta?')
        shared.give_user_feedback(message, args.log_file, args.quiet,
                                  error=True)

        sys.exit(1)

    n_contigs = len(contig2length)
    total_length = sum(contig2length.values())
    n_classified_contigs = n_contigs - len(length['no taxid assigned'])
    total_classified_length = total_length - sum(length['no taxid assigned'])

    # Empty input (no contigs, or only zero-length contigs) must not crash
    # on a division by zero.
    contigs_percentage = (
        n_classified_contigs / n_contigs * 100 if n_contigs else 0.0)
    length_percentage = (
        total_classified_length / total_length * 100 if total_length else 0.0)

    with open(args.output_file, 'w') as outf1:
        outf1.write('# total number of contigs in {0} is {1:,d} representing '
                    '{2:,d} positions.\n'.format(args.contigs_fasta,
                                                 n_contigs, total_length))
        outf1.write(
            '# {0:,d} contigs have taxonomy assigned ({1:.2f}%) '
            'representing {2:,d} positions ({3:.2f}%) in {4}.\n'.format(
                n_classified_contigs, contigs_percentage,
                total_classified_length, length_percentage,
                args.input_file))
        outf1.write('#\n')
        outf1.write('# rank\t'
                    'clade\t'
                    'number of contigs\t'
                    'number of ORFs\t'
                    'number of positions\n')

        for rank in official_ranks:
            for clade in sorted(length[rank],
                                key=lambda x: sum(length[rank][x]),
                                reverse=True):
                outf1.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(
                    rank, clade, len(length[rank][clade]),
                    sum(ORFs[rank][clade]), sum(length[rank][clade])))

    message = '{0} is created!'.format(args.output_file)
    shared.give_user_feedback(message, args.log_file, args.quiet)

    return
def run():
    """Append taxonomic names to a classification file.

    Parses the command line with ``parse_arguments()``, reads the taxonomy
    from ``args.nodes_dmp`` and ``args.names_dmp``, and copies
    ``args.input_file`` to ``args.output_file`` with extra name columns
    appended to every line that has as many columns as the header. Shorter
    or longer lines, and lines marked 'no taxid found' in their second or
    third column, are copied without extra columns.

    The process is terminated with ``sys.exit(1)`` when an input/output
    check reports an error, or when the input has no header line or the
    header lacks a 'lineage' column.
    """
    args = parse_arguments()

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, args.log_file, args.quiet,
                              show_time=False)

    errors = [
        check.check_input_file(args.input_file, args.log_file, args.quiet)
    ]
    # The output check is skipped when the user passed the force flag.
    if not args.force:
        errors.append(
            check.check_output_file(args.output_file, args.log_file,
                                    args.quiet))

    if True in errors:
        sys.exit(1)

    # Only the rank mapping is needed here; the parent mapping is unused.
    (_, taxid2rank) = tax.import_nodes(args.nodes_dmp, args.log_file,
                                       args.quiet)
    taxid2name = tax.import_names(args.names_dmp, args.log_file, args.quiet)

    message = 'Appending names...'
    shared.give_user_feedback(message, args.log_file, args.quiet)

    # First pass: read the first header line to find the relevant columns.
    with open(args.input_file, 'r') as f1:
        for line in f1:
            if not line.startswith('#'):
                continue

            line = line.rstrip().split('\t')

            if 'lineage' in line:
                lineage_index = line.index('lineage')
            else:
                # Use args.input_file here: a bare `input_file` is not
                # defined in this function and would raise NameError.
                message = ('{0} is not a supported classification file.'
                           ''.format(args.input_file))
                shared.give_user_feedback(message, args.log_file, args.quiet,
                                          error=True)

                sys.exit(1)

            # The scores column is optional.
            try:
                scores_index = line.index('lineage scores')
            except ValueError:
                scores_index = None

            full_length = len(line)

            break
        else:
            message = ('{0} is not a supported classification file.'.format(
                args.input_file))
            # Use args.log_file / args.quiet: bare `log_file` and `quiet`
            # are not defined in this function and would raise NameError.
            shared.give_user_feedback(message, args.log_file, args.quiet,
                                      error=True)

            sys.exit(1)

    # Second pass: copy every line, appending names where possible.
    with open(args.input_file, 'r') as f1, \
            open(args.output_file, 'w') as outf1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                if args.only_official:
                    outf1.write('{0}\tsuperkingdom\tphylum\tclass\torder\t'
                                'family\tgenus\tspecies\n'.format(line))
                else:
                    outf1.write('{0}\tfull lineage names\n'.format(line))

                continue

            line = line.split('\t')

            if len(line) != full_length:
                # Entry does not have a full annotation.
                outf1.write('{0}\n'.format('\t'.join(line)))

                continue

            if (line[1].startswith('no taxid found')
                    or line[2].startswith('no taxid found')):
                # ORF has database hits but the accession number is not found
                # in the taxonomy files.
                outf1.write('{0}\n'.format('\t'.join(line)))

                continue

            lineage = line[lineage_index].split(';')

            if scores_index is not None and not args.exclude_scores:
                scores = line[scores_index].split(';')
            else:
                scores = None

            if args.only_official:
                names = tax.convert_to_official_names(lineage, taxid2rank,
                                                      taxid2name, scores)
            else:
                names = tax.convert_to_names(lineage, taxid2rank, taxid2name,
                                             scores)

            outf1.write('{0}\t{1}\n'.format('\t'.join(line),
                                            '\t'.join(names)))

    message = 'Names written to {0}!'.format(args.output_file)
    shared.give_user_feedback(message, args.log_file, args.quiet)

    return
def summarise_bins(input_file, output_file, force, quiet):
    """Summarise a BAT classification file per official taxonomic rank.

    Counts, for every official rank, how many bins are assigned to each
    clade in ``input_file`` and writes the counts as a tab-separated table
    to ``output_file``, largest clade first within each rank.

    Args:
        input_file: path of the BAT classification file, named with
            official ranks.
        output_file: path of the summary to write.
        force: when true, the output file check is skipped.
        quiet: passed through to ``shared.give_user_feedback``.

    The process is terminated with ``sys.exit(1)`` when an input/output
    check reports an error, when no header line is found, when the header
    is not a BAT header or lacks a 'superkingdom' column, or when a bin
    occurs on more than one line.
    """
    # Currently summarise does not allow for a log file.
    log_file = None

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = [check.check_input_file(input_file, log_file, quiet)]
    if not force:
        errors.append(check.check_output_file(output_file, log_file, quiet))

    if True in errors:
        sys.exit(1)

    message = 'Summarising...'
    shared.give_user_feedback(message, log_file, quiet)

    # First pass: locate the first header line and the column where the
    # official ranks start.
    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            if not line.startswith('#'):
                continue

            line = line.split('\t')

            if line[0] != '# bin':
                message = ('ERROR: {0} is not a BAT classification file.'
                           ''.format(input_file))
                shared.give_user_feedback(message, log_file, quiet,
                                          error=True)

                if line[0] == '# contig':
                    message = ('ERROR: {0} appears to be a CAT '
                               'classification file. If you want to '
                               'summarise contig classifications, please '
                               'supply a contigs fasta.'
                               ''.format(input_file))
                    shared.give_user_feedback(message, log_file, quiet,
                                              error=True)

                sys.exit(1)

            try:
                superkingdom_index = line.index('superkingdom')
            except ValueError:
                # list.index only raises ValueError; anything else should
                # not be silently turned into this message.
                message = ('ERROR: official ranks not found in header of '
                           '{0}. Make sure that the BAT classification '
                           'file is named with official ranks with \'CAT '
                           'add_names --only_official\'.'
                           ''.format(input_file))
                shared.give_user_feedback(message, log_file, quiet,
                                          error=True)

                sys.exit(1)

            break
        else:
            message = 'ERROR: input file does not have a recognisable header.'
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    official_ranks = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]

    # number_of_bins[rank][clade] -> number of bins; the extra key counts
    # unclassified bins.
    number_of_bins = {rank: {} for rank in official_ranks}
    number_of_bins['unclassified'] = 0

    n = 0
    bin_trace = set()
    doubles = set()

    # Second pass: tally every non-header line. The file is opened the same
    # way as in the first pass so both passes read the same content.
    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                continue

            n += 1

            line = line.split('\t')

            bin_ = line[0]
            if bin_ in bin_trace:
                doubles.add(bin_)
            bin_trace.add(bin_)

            if line[1] == 'unclassified':
                number_of_bins['unclassified'] += 1
                continue

            for (i, classification) in enumerate(line[superkingdom_index:]):
                # Drop the part after the last ': ' and any trailing
                # '*' characters from the clade name.
                classification = classification.rsplit(': ', 1)[0].rstrip('*')

                counts = number_of_bins[official_ranks[i]]
                counts[classification] = counts.get(classification, 0) + 1

    if doubles:
        # Sorted so that the message is reproducible between runs.
        message = ('ERROR: some bins have multiple classifications. CAT '
                   'summarise currently does not allow for this. Bins with '
                   'multiple classifications: {0}.'
                   ''.format(', '.join(sorted(doubles))))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    number_of_classified_bins = n - number_of_bins['unclassified']
    # A file with a header but no bins must not crash on a division by zero.
    classified_percentage = (
        number_of_classified_bins / n * 100 if n else 0.0)

    with shared.open_maybe_gzip(output_file, 'wt') as outf1:
        outf1.write('# total number of bins is {0}, of which {1} ({2:.2f}%) '
                    'are classified.\n'
                    ''.format(n, number_of_classified_bins,
                              classified_percentage))
        outf1.write('#\n')
        outf1.write('# rank\tclade\tnumber of bins\n')

        for rank in official_ranks:
            for clade in sorted(number_of_bins[rank],
                                key=lambda x: number_of_bins[rank][x],
                                reverse=True):
                outf1.write('{0}\t{1}\t{2}\n'
                            ''.format(rank, clade,
                                      number_of_bins[rank][clade]))

    message = '{0} is created!'.format(output_file)
    shared.give_user_feedback(message, log_file, quiet)
def add_names(args):
    """Append taxonomic names to a classification file.

    Unpacks the settings from ``args`` with ``check.convert_arguments``,
    reads the taxonomy found in the taxonomy folder, and copies the input
    file to the output file with extra name columns appended to every line
    that has as many columns as the header. Shorter or longer lines, and
    lines marked 'no taxid found' in their second or third column, are
    copied without extra columns.

    Args:
        args: parsed command-line arguments, as accepted by
            ``check.convert_arguments``.

    The process is terminated with ``sys.exit(1)`` when an input/output
    check reports an error, or when the input has no header line or the
    header lacks a 'lineage' column.
    """
    (input_file, output_file, taxonomy_folder, only_official, exclude_scores,
     force, quiet) = check.convert_arguments(args)

    # Currently add_names does not allow for a log file.
    log_file = None

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = [check.check_input_file(input_file, log_file, quiet)]
    if not force:
        errors.append(check.check_output_file(output_file, log_file, quiet))

    if True in errors:
        sys.exit(1)

    # Only nodes.dmp and names.dmp are needed; the third path is unused.
    (nodes_dmp, names_dmp, _) = check.inspect_taxonomy_folder(taxonomy_folder)

    # Only the rank mapping is needed here; the parent mapping is unused.
    (_, taxid2rank) = tax.import_nodes(nodes_dmp, log_file, quiet)
    taxid2name = tax.import_names(names_dmp, log_file, quiet)

    message = 'Appending names...'
    shared.give_user_feedback(message, log_file, quiet)

    # First pass: read the first header line to find the relevant columns.
    with shared.open_maybe_gzip(input_file, 'rt') as f1:
        for line in f1:
            if not line.startswith('#'):
                continue

            line = line.rstrip().split('\t')

            try:
                lineage_index = line.index('lineage')
            except ValueError:
                # list.index only raises ValueError; anything else should
                # not be silently turned into this message.
                message = ('ERROR: {0} is not a supported classification '
                           'file.'.format(input_file))
                shared.give_user_feedback(message, log_file, quiet,
                                          error=True)

                sys.exit(1)

            # The scores column is optional.
            try:
                scores_index = line.index('lineage scores')
            except ValueError:
                scores_index = None

            full_length = len(line)

            break
        else:
            message = ('ERROR: {0} is not a supported classification file.'
                       ''.format(input_file))
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    # Second pass: copy every line, appending names where possible.
    with shared.open_maybe_gzip(input_file, 'rt') as f1, \
            shared.open_maybe_gzip(output_file, 'wt') as outf1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                if only_official:
                    outf1.write('{0}\tsuperkingdom\tphylum\tclass\torder\t'
                                'family\tgenus\tspecies\n'.format(line))
                else:
                    outf1.write('{0}\tfull lineage names\n'.format(line))

                continue

            line = line.split('\t')

            if len(line) != full_length:
                # Entry does not have a full annotation.
                outf1.write('{0}\n'.format('\t'.join(line)))

                continue

            if (line[1].startswith('no taxid found')
                    or line[2].startswith('no taxid found')):
                # ORF has database hits but the accession number is not found
                # in the taxonomy files.
                outf1.write('{0}\n'.format('\t'.join(line)))

                continue

            lineage = line[lineage_index].split(';')

            # Compare against None explicitly so that a column index of 0
            # is not mistaken for "no scores column".
            if scores_index is not None and not exclude_scores:
                scores = line[scores_index].split(';')
            else:
                scores = None

            if only_official:
                names = tax.convert_to_official_names(lineage, taxid2rank,
                                                      taxid2name, scores)
            else:
                names = tax.convert_to_names(lineage, taxid2rank, taxid2name,
                                             scores)

            outf1.write('{0}\t{1}\n'.format('\t'.join(line),
                                            '\t'.join(names)))

    message = 'Names written to {0}!'.format(output_file)
    shared.give_user_feedback(message, log_file, quiet)
def summarise_contigs(input_file, output_file, contigs_fasta, force, quiet):
    """Summarise a CAT classification file per official taxonomic rank.

    For every official rank and clade, reports the number of contigs, the
    summed number of ORFs on those contigs and the summed contig length,
    written as a tab-separated table to ``output_file``. Clades are ordered
    by summed length, largest first, within each rank.

    Args:
        input_file: path of the CAT classification file, named with
            official ranks.
        output_file: path of the summary to write.
        contigs_fasta: path handed to ``import_contig_lengths``.
        force: when true, the output file check is skipped.
        quiet: passed through to ``shared.give_user_feedback``.

    The process is terminated with ``sys.exit(1)`` when an input/output
    check reports an error, when no header line is found, when the header
    is not a CAT header or lacks a 'superkingdom' column, when a contig is
    missing from the contig lengths, when a contig occurs on more than one
    line, or when the number of lines differs from the number of contigs.
    """
    # Currently summarise does not allow for a log file.
    log_file = None

    message = '# CAT v{0}.'.format(about.__version__)
    shared.give_user_feedback(message, log_file, quiet, show_time=False)

    errors = [check.check_input_file(input_file, log_file, quiet)]
    if not force:
        errors.append(check.check_output_file(output_file, log_file, quiet))

    if True in errors:
        sys.exit(1)

    # Used below as a mapping from contig name to contig length.
    contig2length = import_contig_lengths(contigs_fasta, log_file, quiet)

    message = 'Summarising...'
    shared.give_user_feedback(message, log_file, quiet)

    # First pass: locate the first header line and the column where the
    # official ranks start.
    with open(input_file, 'r') as f1:
        for line in f1:
            if not line.startswith('#'):
                continue

            line = line.split('\t')

            if line[0] != '# contig':
                message = ('ERROR: {0} is not a CAT classification file.'
                           ''.format(input_file))
                shared.give_user_feedback(message, log_file, quiet,
                                          error=True)

                if line[0] == '# bin':
                    message = ('ERROR: {0} appears to be a BAT '
                               'classification file. If you want to '
                               'summarise bin classifications, just '
                               'don\'t supply a contigs fasta and '
                               'everything should be fine!'
                               ''.format(input_file))
                    shared.give_user_feedback(message, log_file, quiet,
                                              error=True)

                sys.exit(1)

            try:
                superkingdom_index = line.index('superkingdom')
            except ValueError:
                # list.index only raises ValueError; anything else should
                # not be silently turned into this message.
                message = ('ERROR: official ranks not found in header of '
                           '{0}. Make sure that the CAT classification '
                           'file is named with official ranks with \'CAT '
                           'add_names --only_official\'.'
                           ''.format(input_file))
                shared.give_user_feedback(message, log_file, quiet,
                                          error=True)

                sys.exit(1)

            break
        else:
            message = 'ERROR: input file does not have a recognisable header.'
            shared.give_user_feedback(message, log_file, quiet, error=True)

            sys.exit(1)

    official_ranks = ['superkingdom', 'phylum', 'class', 'order', 'family',
                      'genus', 'species']

    # length[rank][clade] and ORFs[rank][clade] hold one entry per contig.
    length = {rank: {} for rank in official_ranks}
    length['unclassified'] = []
    ORFs = {rank: {} for rank in official_ranks}

    n = 0
    contig_trace = set()
    doubles = set()

    # Second pass: tally every non-header line.
    with open(input_file, 'r') as f1:
        for line in f1:
            line = line.rstrip()

            if line.startswith('#'):
                continue

            n += 1

            line = line.split('\t')

            contig = line[0]
            if contig in contig_trace:
                doubles.add(contig)
            contig_trace.add(contig)

            if contig not in contig2length:
                message = ('ERROR: contig {0} in CAT classification file is '
                           'not found in supplied contigs fasta file. Are you '
                           'sure the CAT classification file is based on the '
                           'contigs fasta?'.format(contig))
                shared.give_user_feedback(message, log_file, quiet,
                                          error=True)

                sys.exit(1)

            contig_length = contig2length[contig]

            if line[1].startswith('unclassified'):
                length['unclassified'].append(contig_length)
                continue

            # The third column is the same for every rank, so parse it once.
            ORFs_on_contig = int(line[2])

            for (i, classification) in enumerate(line[superkingdom_index:]):
                # Drop the part after the last ': ' and any trailing
                # '*' characters from the clade name.
                classification = classification.rsplit(': ', 1)[0].rstrip('*')

                rank = official_ranks[i]

                length[rank].setdefault(classification, []).append(
                    contig_length)
                ORFs[rank].setdefault(classification, []).append(
                    ORFs_on_contig)

    if doubles:
        # Sorted so that the message is reproducible between runs.
        message = ('ERROR: some contigs have multiple classifications. CAT '
                   'summarise currently does not allow for this. Contigs with '
                   'multiple classifications: {0}.'
                   ''.format(', '.join(sorted(doubles))))
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    if n != len(contig2length):
        message = ('ERROR: the number of classified contigs is not the same '
                   'as the number of contigs in contigs fasta. Are you sure '
                   'the CAT classification file is based on the contigs '
                   'fasta?')
        shared.give_user_feedback(message, log_file, quiet, error=True)

        sys.exit(1)

    number_of_contigs = len(contig2length)
    total_length = sum(contig2length.values())
    number_of_classified_contigs = (
        number_of_contigs - len(length['unclassified']))
    total_classified_length = total_length - sum(length['unclassified'])

    # Empty input (no contigs, or only zero-length contigs) must not crash
    # on a division by zero.
    contigs_percentage = (
        number_of_classified_contigs / number_of_contigs * 100
        if number_of_contigs else 0.0)
    length_percentage = (
        total_classified_length / total_length * 100
        if total_length else 0.0)

    with open(output_file, 'w') as outf1:
        outf1.write('# total number of contigs in {0} is {1} representing {2} '
                    'positions.\n'
                    ''.format(contigs_fasta, number_of_contigs, total_length))
        outf1.write('# {0} contigs are classified ({1:.2f}%) representing {2} '
                    'positions ({3:.2f}%) in {4}.\n'
                    ''.format(number_of_classified_contigs,
                              contigs_percentage,
                              total_classified_length,
                              length_percentage,
                              input_file))
        outf1.write('#\n')
        outf1.write('# rank\t'
                    'clade\t'
                    'number of contigs\t'
                    'number of ORFs\t'
                    'number of positions\n')

        for rank in official_ranks:
            for clade in sorted(length[rank],
                                key=lambda x: sum(length[rank][x]),
                                reverse=True):
                outf1.write('{0}\t{1}\t{2}\t{3}\t{4}\n'
                            ''.format(rank, clade,
                                      len(length[rank][clade]),
                                      sum(ORFs[rank][clade]),
                                      sum(length[rank][clade])))

    message = '{0} is created!'.format(output_file)
    shared.give_user_feedback(message, log_file, quiet)