def _collect_qc_errors(gff, logger):
    """Run the QC checks on *gff* and return the list of detected error records."""
    gff.check_parent_boundary()
    gff.check_phase()
    gff.check_reference()
    # Fall back to an empty list so the extend() calls below cannot fail when
    # the helper reports nothing.
    error_set = function4gff.extract_internal_detected_errors(gff) or []
    for checker in (intra_model, single_feature):
        found = checker.main(gff, logger=logger)
        if found:
            error_set.extend(found)
    return error_set


def _report_qc_errors(error_set, logger):
    """Print every error in *error_set* whose code is not in the escaped list."""
    escaped_error = ('Esf0012', 'Esf0033')
    reportable = [e for e in error_set if e['eCode'] not in escaped_error]
    if not reportable:
        return
    logger.warning('The extracted sequences might be wrong for the following features which have formatting errors...')
    print('ID\tError_Code\tError_Tag')
    for e in reportable:
        tag = '[{0:s}]'.format(e['eTag'])
        # One pre-formatted string gives the same space-separated row as the
        # old print statement while being valid on both Python 2 and 3.
        print('{0} {1} {2}'.format(e['ID'], e['eCode'], tag))


def _extract_sequences(gff, stype, dline):
    """Return a dict of header -> sequence for the requested sequence type."""
    if stype in ('pre_trans', 'gene', 'exon'):
        return extract_start_end(gff, stype, dline)
    if stype == 'trans':
        return splicer(gff, ['exon', 'pseudogenic_exon'], dline)
    if stype == 'cds':
        return splicer(gff, ['CDS'], dline)
    if stype == 'pep':
        seq = dict()
        for k, v in splicer(gff, ['CDS'], dline).items():
            # Relabel the spliced CDS header as a peptide entry before translating.
            k = k.replace("|mRNA(CDS)|", "|peptide|").replace("-RA", "-PA")
            seq[k] = translator(v)
        return seq
    return dict()


def main(gff_file=None, fasta_file=None, stype=None, dline=None, qc=True, output_prefix=None, logger=None):
    """Extract sequences of one type from a GFF3/FASTA pair into '<output_prefix>_<stype>.fa'.

    Args:
        gff_file: path handed to ``Gff3(gff_file=...)``.
        fasta_file: path handed to ``Gff3(fasta_external=...)``.
        stype: sequence type; one of 'gene', 'exon', 'pre_trans', 'trans',
            'cds' or 'pep'.
        dline: passed through unchanged to ``extract_start_end`` / ``splicer``
            (its meaning is defined by those helpers).
        qc: when true, run the QC checks and print the detected errors before
            extracting.
        output_prefix: prefix of the output file name; required.
        logger: logger to report progress on; when omitted, a logger writing
            to stderr at INFO level is used.

    Returns:
        None. When a required input is missing or ``stype`` is not recognised,
        a message is emitted and the function returns without creating the
        output file.

    Raises:
        SystemExit: with status 1 when ``output_prefix`` is not given.
    """
    if logger is None:
        logger = logging.getLogger(__name__+'stderr')
        logger.setLevel(logging.INFO)
        # getLogger() returns the same object on every call; only attach the
        # handler once so repeated calls do not duplicate each log line.
        if not logger.handlers:
            stderr_handler = logging.StreamHandler()
            stderr_handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
            logger.addHandler(stderr_handler)
    # Not used by this function; kept so the '<module>null' logger stays
    # configured as before (again attaching its handler only once).
    logger_null = logging.getLogger(__name__+'null')
    if not logger_null.handlers:
        logger_null.addHandler(logging.NullHandler())

    if not output_prefix:
        # 'parser' only exists as a module global when the file runs as a
        # script; look it up defensively so library callers get a clean exit
        # instead of a NameError.
        cli_parser = globals().get('parser')
        if cli_parser is not None:
            cli_parser.print_help()
        else:
            logger.error('An output prefix must be specified.')
        sys.exit(1)
    logger.info('Specifying prefix of output file name: (%s)...', output_prefix)

    if not gff_file or not fasta_file or not stype:
        print('All of Gff file, fasta file, and type of extracted seuqences need to be specified')
        return
    type_set=['gene','exon','pre_trans', 'trans', 'cds', 'pep']
    if stype not in type_set:
        logger.error('Your sequence type is "{0:s}". Sequence type must be one of {1:s}!'.format(stype, str(type_set)))
        return

    # Open the output only after the arguments are validated (no stray empty
    # file on bad input) but before the expensive parsing (an unwritable path
    # still fails fast). Text mode, because only str is written; the context
    # manager guarantees the handle is closed on every exit path.
    fname = '{0:s}_{1:s}.fa'.format(output_prefix, stype)
    with open(fname, 'w') as report_fh:
        logger.info('Reading files: {0:s}, {1:s}...'.format(gff_file, fasta_file))
        gff = Gff3(gff_file=gff_file, fasta_external=fasta_file, logger=logger)

        if qc:
            logger.info('Checking errors...')
            _report_qc_errors(_collect_qc_errors(gff, logger), logger)

        logger.info('Extract seqeunces for {0:s}...'.format(stype))
        seq = _extract_sequences(gff, stype, dline)

        if seq:
            logger.info('Print out extracted sequences: {0:s}_{1:s}.fa...'.format(output_prefix, stype))
            for k, v in seq.items():
                report_fh.write('{0:s}\n{1:s}\n'.format(k, v))
#ERROR_CODE = ['Esf0001', 'Esf0002', 'Ema0005', 'Emr0001']
#ERROR_TAG = ['pseudogene or not?', 'Negative/Zero start/end coordinate', 'unusual child features in the type of pseudogene found', 'Duplicate transcripts found']
#ERROR_INFO = dict(zip(ERROR_CODE, ERROR_TAG))

# Load the GFF3 (with its external FASTA) and run the parser-level checks.
# 'args', 'logger_stderr', 'logger_null' and 'report_fh' are expected to be
# defined earlier in the script.
logger_stderr.info('Reading gff files: (%s)...\n', args.gff)
gff3 = Gff3(gff_file=args.gff, fasta_external=args.fasta, logger=logger_null)
gff3.check_unresolved_parents()
gff3.check_parent_boundary()
gff3.check_phase()
gff3.check_reference()
logger_stderr.info('Checking missing attributes: (%s)...\n', 'single_feature.FIX_MISSING_ATTR()')

# Run every checker exactly once and merge whatever it reports. Each result
# is guarded by its own truthiness, so single_feature errors no longer depend
# on inter_model having found something, and an empty/None result is never
# passed to extend().
error_set = list()
for found in (
    function4gff.extract_internal_detected_errors(gff3),
    intra_model.main(gff3, logger=logger_stderr),
    inter_model.main(gff3, logger=logger_stderr),
    single_feature.main(gff3, logger=logger_stderr),
):
    if found:
        error_set.extend(found)

# Announce the report destination; 'report.txt' is the name logged when no
# output path was supplied.
logger_stderr.info('Print QC report at {0:s}'.format(args.output if args.output else 'report.txt'))

# Write the tab-separated report: a header row, then one row per error.
report_fh.write('ID\tError_code\tError_tag\n')
for e in error_set:
    tag = '[{0:s}]'.format(e['eTag'])
    report_fh.write('{0:s}\t{1:s}\t{2:s}\n'.format(str(e['ID']), str(e['eCode']), str(tag)))