def _write_fasta_records(report_fh, seq):
    """Write each non-empty (header, sequence) pair of ``seq`` to ``report_fh``.

    Every pair is emitted as the key on one line followed by the value on the
    next line. Pairs whose key or value is empty are skipped.
    """
    for header, sequence in seq.items():
        if len(header) != 0 and len(sequence) != 0:
            report_fh.write('{0:s}\n{1:s}\n'.format(header, sequence))


def _extract_sequences(gff, seq_type, dline, embedded_fasta, splicer_stype):
    """Return the {header: sequence} dict for one built-in sequence type.

    ``seq_type`` selects the extraction routine; ``splicer_stype`` is the
    value forwarded as the ``stype`` argument of ``splicer`` (it is not used
    for the types handled by ``extract_start_end``).
    """
    if seq_type in ('pre_trans', 'gene', 'exon'):
        return extract_start_end(gff, seq_type, dline, embedded_fasta)
    if seq_type == 'trans':
        return splicer(gff, ['exon', 'pseudogenic_exon'], dline,
                       splicer_stype, embedded_fasta)
    if seq_type == 'cds':
        return splicer(gff, ['CDS'], dline, splicer_stype, embedded_fasta)
    if seq_type == 'pep':
        spliced = splicer(gff, ['CDS'], dline, splicer_stype, embedded_fasta)
        peptides = dict()
        for header, sequence in spliced.items():
            # Relabel the spliced CDS header and translate the sequence.
            header = header.replace("|mRNA(CDS)|", "|peptide|")
            peptides[header] = translator(sequence)
        return peptides
    return dict()


def _load_gff(gff_file, fasta_file, embedded_fasta, qc, logger, logger_null):
    """Build the Gff3 object and, when ``qc`` is true, run and report the checks.

    Exits with status 1 when ``embedded_fasta`` is requested but the parsed
    GFF3 carries no embedded FASTA, regardless of ``qc``.
    """
    gff = Gff3(gff_file=gff_file, fasta_external=fasta_file,
               logger=logger if qc else logger_null)
    if embedded_fasta and len(gff.fasta_embedded) == 0:
        logger.error('There is no embedded fasta in the GFF3 file.')
        # Without any sequence source nothing can be extracted, so stop here
        # in both the QC and the non-QC mode.
        sys.exit(1)
    if not qc:
        return gff

    logger.info('Checking errors...')
    initial_phase = False
    gff.check_parent_boundary()
    gff.check_phase(initial_phase)
    gff.check_reference()

    # Copy into a fresh list and tolerate a falsy return, the same way
    # script_main() treats the result of this call.
    error_set = list(function4gff.extract_internal_detected_errors(gff) or [])
    for checker in (intra_model, single_feature):
        found = checker.main(gff, logger=logger)
        if found:
            error_set.extend(found)

    # These two error codes are not reported in the warning table below.
    escaped_error = ('Esf0012', 'Esf0033')
    reportable = [e for e in error_set if e['eCode'] not in escaped_error]
    if reportable:
        logger.warning(
            'The extracted sequences might be wrong for the following features which have formatting errors...'
        )
        print('ID\tError_Code\tError_Tag')
        for e in reportable:
            tag = '[{0:s}]'.format(e['eTag'])
            print(e['ID'], e['eCode'], tag)
    return gff


def main(gff_file=None, fasta_file=None, embedded_fasta=False, stype=None,
         user_defined=None, dline=None, qc=True, output_prefix=None,
         logger=None):
    """Extract sequences described by a GFF3 file and write them as FASTA.

    Args:
        gff_file: GFF3 input handed to ``Gff3``. Required.
        fasta_file: External FASTA handed to ``Gff3``. Required unless
            ``embedded_fasta`` is true.
        embedded_fasta: When true, the GFF3 must contain an embedded FASTA
            section; the flag is also forwarded to the extraction helpers.
        stype: One of 'gene', 'exon', 'pre_trans', 'trans', 'cds', 'pep',
            'all' or 'user_defined'. 'all' writes one file per built-in type.
        user_defined: Two-item sequence (parent feature type, child feature
            type); required when ``stype`` is 'user_defined', ignored with a
            warning otherwise.
        dline: Forwarded unchanged to ``extract_start_end`` / ``splicer``
            (presumably the defline style; confirm against those helpers).
        qc: When true, run the GFF3 checks and print the features whose
            errors may affect the extracted sequences.
        output_prefix: Prefix of the output files, which are named
            ``<output_prefix>_<type>.fa``. Required.
        logger: Logger for progress and error messages. When omitted, a
            stderr logger at INFO level is created.

    Raises:
        SystemExit: with status 1 on any invalid argument combination or when
            the requested embedded FASTA is missing.
    """
    logger_null = logging.getLogger(__name__ + 'null')
    logger_null.addHandler(logging.NullHandler())

    if logger is None:
        # The default used to be dereferenced as None; fall back to stderr.
        logger = logging.getLogger(__name__ + 'stderr')
        logger.setLevel(logging.INFO)
        if not logger.handlers:
            stderr_handler = logging.StreamHandler()
            stderr_handler.setFormatter(
                logging.Formatter('%(levelname)-8s %(message)s'))
            logger.addHandler(stderr_handler)

    if not gff_file or (not fasta_file and not embedded_fasta) or not stype:
        print(
            'Gff file, fasta file, and type of extracted sequences need to be specified'
        )
        sys.exit(1)

    type_set = [
        'gene', 'exon', 'pre_trans', 'trans', 'cds', 'pep', 'all',
        'user_defined'
    ]
    if stype not in type_set:
        logger.error(
            'Your sequence type is "{0:s}". Sequence type must be one of {1:s}!'
            .format(stype, str(type_set)))
        sys.exit(1)

    if not output_prefix:
        print('[Error] Please specify the prefix of output file name...')
        sys.exit(1)

    # A single-type run opens its output up front so that an unwritable
    # destination fails before the (potentially slow) GFF3 parsing starts.
    report_fh = None
    if stype != 'all':
        logger.info('Specifying prefix of output file name: (%s)...',
                    output_prefix)
        report_fh = open('{0:s}_{1:s}.fa'.format(output_prefix, stype), 'w')

    try:
        if user_defined is not None:
            if stype != 'user_defined':
                logger.warning(
                    'Your sequence type is "{0:s}", -u argument will be ignored.'
                    .format(stype))
            elif len(user_defined) != 2:
                logger.error(
                    'Please specify parent and child feature via the -u argument. Format: [parent feature type],[child feature type]'
                )
                sys.exit(1)
        elif stype == 'user_defined':
            logger.error('-u is needed in combination with -st user_defined.')
            sys.exit(1)

        # %-style arguments keep this working when fasta_file is None
        # (embedded FASTA only); '{:s}'.format(None) raises on Python 3.
        logger.info('Reading files: %s, %s...', gff_file, fasta_file)
        gff = _load_gff(gff_file, fasta_file, embedded_fasta, qc, logger,
                        logger_null)

        logger.info('Extract sequences for {0:s}...'.format(stype))
        if stype == 'all':
            logger.info('Specifying prefix of output file name: (%s)...',
                        output_prefix)
            for tmp_stype in ('pre_trans', 'gene', 'exon', 'trans', 'cds',
                              'pep'):
                logger.info(
                    '\t- Extract sequences for {0:s}...'.format(tmp_stype))
                # NOTE(review): the pre-existing code forwards 'all' to
                # splicer() for 'trans' and 'cds' but 'pep' for 'pep'; kept
                # as-is because splicer() is not visible here -- confirm that
                # this asymmetry is intended.
                splicer_stype = tmp_stype if tmp_stype == 'pep' else stype
                seq = _extract_sequences(gff, tmp_stype, dline,
                                         embedded_fasta, splicer_stype)
                if seq:
                    fname = '{0:s}_{1:s}.fa'.format(output_prefix, tmp_stype)
                    with open(fname, 'w') as type_fh:
                        logger.info(
                            '\t\tPrint out extracted sequences: {0:s}_{1:s}.fa...'
                            .format(output_prefix, tmp_stype))
                        _write_fasta_records(type_fh, seq)
        else:
            if stype == 'user_defined':
                feature_type = [user_defined[0], user_defined[1]]
                seq = splicer(gff, feature_type, dline, stype, embedded_fasta)
            else:
                seq = _extract_sequences(gff, stype, dline, embedded_fasta,
                                         stype)
            if seq:
                logger.info(
                    'Print out extracted sequences: {0:s}_{1:s}.fa...'.format(
                        output_prefix, stype))
                _write_fasta_records(report_fh, seq)
    finally:
        if report_fh is not None:
            report_fh.close()
def script_main():
    """Command-line entry point: run the GFF3 QC checks and write two reports.

    Parses the command line, loads the GFF3 (``-g``) and FASTA (``-f``)
    inputs, falling back to STDIN for either one when it is omitted and STDIN
    is not a terminal. It then runs the built-in, intra-model, inter-model and
    single-feature checks and writes:

    * the error report (``-o``, default ``report.txt``) with one
      ``Line_num / Error_code / Error_tag`` row per detected error;
    * the statistic report (``-s``, default ``statistic.txt``) with the
      number of occurrences of each error code.

    Raises:
        SystemExit: status 1 (after printing help) when an input is missing;
            also raised, without an explicit status, when
            ``check_parent_boundary()`` returns a falsy value.
    """
    import argparse
    from textwrap import dedent

    logger_stderr = logging.getLogger(__name__ + 'stderr')
    logger_stderr.setLevel(logging.INFO)
    stderr_handler = logging.StreamHandler()
    stderr_handler.setFormatter(
        logging.Formatter('%(levelname)-8s %(message)s'))
    logger_stderr.addHandler(stderr_handler)
    logger_null = logging.getLogger(__name__ + 'null')
    logger_null.addHandler(logging.NullHandler())

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=dedent("""\
    Testing environment:
    1. Python 2.7

    Inputs:
    1. GFF3: Specify the file name with the -g or --gff argument; Please note that this program requires gene/pseudogene and mRNA/pseudogenic_transcript to have an ID attribute in column 9.
    2. fasta file: Specify the file name with the -f or --fasta argument

    Outputs:
    1. Error report for the input GFF3 file
        * Line_num: Line numbers of the found problematic models in the input GFF3 file.
        * Error_code: Error codes for the found problematic models. Please refer to lib/ERROR/ERROR.py to see the full list of Error_code and the corresponding Error_tag.
        * Error_tag: Detail of the found errors for the problematic models. Please refer to lib/ERROR/ERROR.py to see the full list of Error_code and the corresponding Error_tag.

    Quick start:
    gff3_QC -g example_file/example.gff3 -f example_file/reference.fa -o test
    or
    gff3_QC --gff example_file/example.gff3 --fasta example_file/reference.fa --output test
    """))
    parser.add_argument('-g', '--gff', type=str,
                        help='Genome annotation file, gff3 format')
    parser.add_argument('-f', '--fasta', type=str,
                        help='Genome sequences, fasta format')
    parser.add_argument(
        '-noncg', '--noncanonical_gene', action="store_true",
        help='gff3 file is not formatted in the canonical gene model format.')
    parser.add_argument(
        '-i', '--initial_phase', action="store_true",
        help='Check whether initial CDS phase is 0 (default: no check)')
    parser.add_argument(
        '-n', '--allowed_num_of_n', type=int, default=0,
        help='Max number of Ns allowed in a feature, anything more will be reported as an error (default: 0)'
    )
    parser.add_argument(
        '-t', '--check_n_feature_types', nargs='*', default=['CDS'],
        help='Count the number of Ns in each feature with the type specified, multiple types may be specified, ex: -t CDS exon (default: "CDS")'
    )
    parser.add_argument('-o', '--output', type=str,
                        help='output file name (default: report.txt)')
    parser.add_argument('-s', '--statistic', type=str,
                        help='statistic file name (default: statistic.txt)')
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s ' + __version__)

    args = parser.parse_args()

    if args.gff:
        logger_stderr.info('Checking gff file (%s)...', args.gff)
    elif not sys.stdin.isatty():  # STDIN is connected to a pipe or a file
        args.gff = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:  # no input at all
        parser.print_help()
        sys.exit(1)

    if args.fasta:
        logger_stderr.info('Checking genome fasta (%s)...', args.fasta)
    elif not sys.stdin.isatty():  # STDIN is connected to a pipe or a file
        args.fasta = sys.stdin
        logger_stderr.info('Reading from STDIN...')
    else:  # no input at all
        parser.print_help()
        sys.exit(1)

    # True whenever an N threshold or at least one feature type is set; with
    # the default feature types (['CDS']) this is always true.
    check_n = bool(args.allowed_num_of_n or args.check_n_feature_types)

    logger_stderr.info('Reading gff files: (%s)...\n', args.gff)
    gff3 = Gff3(gff_file=args.gff, fasta_external=args.fasta,
                logger=logger_null)

    logger_stderr.info('Checking errors in the gff files: (%s)...\n',
                       args.gff)
    if not gff3.check_parent_boundary():
        # NOTE(review): a bare sys.exit() terminates with status 0 -- confirm
        # that a failed boundary check should not signal an error.
        sys.exit()
    gff3.check_unresolved_parents()
    if not args.noncanonical_gene:
        gff3.check_phase(args.initial_phase)
    gff3.check_reference(fasta_external=args.fasta, check_n=check_n,
                         allowed_num_of_n=args.allowed_num_of_n,
                         feature_types=args.check_n_feature_types)
    logger_stderr.info('\t- Checking missing attributes: (%s)...\n',
                       'function4gff.FIX_MISSING_ATTR()')
    function4gff.FIX_MISSING_ATTR(gff3, logger=logger_stderr)

    # Collect the errors of every checker; each may return a falsy value.
    error_set = list()
    cmd = function4gff.extract_internal_detected_errors(gff3)
    if cmd:
        error_set.extend(cmd)

    logger_stderr.info('\t- Checking intra-model errors: (%s)...\n', args.gff)
    cmd = intra_model.main(gff3, logger=logger_stderr,
                           noncanonical_gene=args.noncanonical_gene)
    if cmd:
        error_set.extend(cmd)

    logger_stderr.info('\t- Checking inter-model errors: (%s)...\n', args.gff)
    cmd = inter_model.main(gff3, args.gff, args.fasta, logger=logger_stderr,
                           noncanonical_gene=args.noncanonical_gene)
    if cmd:
        error_set.extend(cmd)

    logger_stderr.info('\t- Checking single-feature errors: (%s)...\n',
                       args.gff)
    cmd = single_feature.main(gff3, logger=logger_stderr)
    if cmd:
        error_set.extend(cmd)

    report_path = args.output if args.output else 'report.txt'
    statistic_path = args.statistic if args.statistic else 'statistic.txt'

    # Both reports iterate the errors in the same order, so sort only once.
    # The key is each error's sorted list of dict keys; the sort is stable.
    sorted_errors = sorted(error_set, key=lambda x: sorted(x.keys()))

    # Nested "with" blocks keep the original open order (report first, then
    # statistic) and guarantee both files are flushed and closed.
    logger_stderr.info('Print QC report at {0:s}'.format(report_path))
    with open(report_path, 'w') as report_fh:
        logger_stderr.info(
            'Print QC statistic report at {0:s}'.format(statistic_path))
        with open(statistic_path, 'w') as statistic_fh:
            report_fh.write('Line_num\tError_code\tError_tag\n')
            for e in sorted_errors:
                tag = '[{0:s}]'.format(e['eTag'])
                report_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(
                    str(e['line_num']), str(e['eCode']), str(tag)))

            # Statistic file: number of occurrences of each error code.
            error_counts = dict()
            ERROR_INFO = ERROR.INFO
            statistic_fh.write(
                'Error_code\tNumber_of_problematic_models\tError_tag\n')
            for s in sorted_errors:
                if s['eCode'] not in error_counts:
                    error_counts[s['eCode']] = {
                        'count': 0,
                        'etag': ERROR_INFO[s['eCode']]
                    }
                error_counts[s['eCode']]['count'] += 1
            for a in error_counts:
                statistic_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(
                    str(a), str(error_counts[a]['count']),
                    str(error_counts[a]['etag'])))