Exemplo n.º 1
0
def main(gff_file=None, fasta_file=None, stype=None, dline=None, qc=True, output_prefix=None, logger=None):
    """Extract sequences of one feature type and write them to a FASTA file.

    The output file is named ``<output_prefix>_<stype>.fa``. It is only
    created once all arguments have been validated and the sequences have
    been extracted, so a rejected call leaves no empty file behind.

    Args:
        gff_file: Path of the GFF3 file, handed to ``Gff3``.
        fasta_file: Path of the FASTA file, handed to ``Gff3`` as
            ``fasta_external``.
        stype: Kind of sequence to extract; one of ``gene``, ``exon``,
            ``pre_trans``, ``trans``, ``cds`` or ``pep``.
        dline: Forwarded unchanged to ``extract_start_end`` / ``splicer``.
            NOTE(review): presumably selects the defline format - confirm
            against those helpers.
        qc: When true, run the GFF3 checks first and print the features
            whose formatting errors may make the extracted sequences wrong.
        output_prefix: Prefix of the output file name (required).
        logger: Logger to report progress on. When ``None`` a logger that
            writes INFO and above to stderr is used.

    Returns:
        None. The function also returns early (after reporting the problem)
        when ``gff_file``, ``fasta_file`` or ``stype`` is missing or
        ``stype`` is not a supported type.

    Raises:
        SystemExit: With exit status 1 when ``output_prefix`` is not given.
    """
    if logger is None:
        logger = logging.getLogger(__name__+'stderr')
        logger.setLevel(logging.INFO)
        # Attach the stderr handler only once; otherwise every repeated call
        # would add another handler and each message would be printed again.
        if not logger.handlers:
            stderr_handler = logging.StreamHandler()
            stderr_handler.setFormatter(logging.Formatter('%(levelname)-8s %(message)s'))
            logger.addHandler(stderr_handler)

    if not output_prefix:
        # `parser` is a module-level name that is not defined in this
        # function; it may be unbound when main() is called as a library
        # function, so look it up defensively instead of risking NameError.
        cli_parser = globals().get('parser')
        if cli_parser is not None:
            cli_parser.print_help()
        else:
            logger.error('A prefix for the output file name must be specified')
        sys.exit(1)
    logger.info('Specifying prefix of output file name: (%s)...', output_prefix)

    if not gff_file or not fasta_file or not stype:
        print('All of Gff file, fasta file, and type of extracted seuqences need to be specified')
        return
    type_set = ['gene', 'exon', 'pre_trans', 'trans', 'cds', 'pep']
    if stype not in type_set:
        logger.error('Your sequence type is "%s". Sequence type must be one of %s!', stype, type_set)
        return

    # Built only after validation: formatting a missing stype with '{:s}'
    # would fail before the user ever saw the message above.
    fname = '{0:s}_{1:s}.fa'.format(output_prefix, stype)

    logger.info('Reading files: %s, %s...', gff_file, fasta_file)
    gff = Gff3(gff_file=gff_file, fasta_external=fasta_file, logger=logger)

    if qc:
        logger.info('Checking errors...')
        gff.check_parent_boundary()
        gff.check_phase()
        gff.check_reference()
        # Tolerate a falsy (None/empty) result so the extend() calls below
        # always operate on a list.
        error_set = list(function4gff.extract_internal_detected_errors(gff) or [])
        for checker in (intra_model, single_feature):
            found = checker.main(gff, logger=logger)
            if found:
                error_set.extend(found)

        # These two error codes are deliberately left out of the report.
        escaped_error = ('Esf0012', 'Esf0033')
        eSet = [e for e in error_set if e['eCode'] not in escaped_error]
        if eSet:
            logger.warning('The extracted sequences might be wrong for the following features which have formatting errors...')
            print('ID\tError_Code\tError_Tag')
            for e in eSet:
                # Tab-separated so the rows line up with the header above.
                print('{0}\t{1}\t[{2}]'.format(e['ID'], e['eCode'], e['eTag']))

    logger.info('Extract seqeunces for %s...', stype)
    seq = dict()
    if stype in ('pre_trans', 'gene', 'exon'):
        seq = extract_start_end(gff, stype, dline)
    elif stype == 'trans':
        seq = splicer(gff, ['exon', 'pseudogenic_exon'], dline)
    elif stype == 'cds':
        seq = splicer(gff, ['CDS'], dline)
    elif stype == 'pep':
        # Peptides are the translated CDS sequences; the header is rewritten
        # to mark the record as a peptide.
        for k, v in splicer(gff, ['CDS'], dline).items():
            k = k.replace("|mRNA(CDS)|", "|peptide|").replace("-RA", "-PA")
            seq[k] = translator(v)

    if seq:
        logger.info('Print out extracted sequences: %s...', fname)
    # Text mode: the records are str, which a binary-mode handle rejects on
    # Python 3. The file is still created when nothing was extracted, and
    # the with-block guarantees it is closed.
    with open(fname, 'w') as report_fh:
        for k, v in seq.items():
            report_fh.write('{0:s}\n{1:s}\n'.format(k, v))
Exemplo n.º 2
0

    #ERROR_CODE = ['Esf0001', 'Esf0002', 'Ema0005', 'Emr0001'] 
    #ERROR_TAG = ['pseudogene or not?', 'Negative/Zero start/end coordinate', 'unusual child features in the type of pseudogene found', 'Duplicate transcripts found']
    #ERROR_INFO = dict(zip(ERROR_CODE, ERROR_TAG))

    logger_stderr.info('Reading gff files: (%s)...\n', args.gff)
    gff3 = Gff3(gff_file=args.gff, fasta_external=args.fasta, logger=logger_null)
    gff3.check_unresolved_parents()
    gff3.check_parent_boundary()
    gff3.check_phase()
    gff3.check_reference()
    logger_stderr.info('Checking missing attributes: (%s)...\n', 'single_feature.FIX_MISSING_ATTR()')

    error_set = list()
    if function4gff.extract_internal_detected_errors(gff3):
        error_set.extend(function4gff.extract_internal_detected_errors(gff3))
    if intra_model.main(gff3, logger=logger_stderr):
        error_set.extend(intra_model.main(gff3, logger=logger_stderr))
    if inter_model.main(gff3, logger=logger_stderr):
        error_set.extend(inter_model.main(gff3, logger=logger_stderr))
    if inter_model.main(gff3, logger=logger_stderr):
        error_set.extend(single_feature.main(gff3, logger=logger_stderr))

    if args.output:
        logger_stderr.info('Print QC report at {0:s}'.format(args.output))
    else:
        logger_stderr.info('Print QC report at {0:s}'.format('report.txt'))
    report_fh.write('ID\tError_code\tError_tag\n')
    for e in error_set:
        tag = '[{0:s}]'.format(e['eTag'])