Exemplo n.º 1
0
def splicer(gff, ftype, dline):
    """Assemble one spliced sequence per second-level feature of a parsed GFF.

    Root features are the entries of ``gff.lines`` whose ``line_type`` is
    ``'feature'`` and whose attributes carry no ``Parent``.  For every child
    of a root, the grandchildren whose ``type`` is listed in ``ftype`` are
    sorted with ``function4gff.featureSort`` and their sub-sequences (from
    ``get_subseq``) are concatenated in that order.

    Args:
        gff: parsed GFF object exposing ``lines``; each line is a mapping with
            at least ``line_type``, ``attributes`` and ``children``.
        ftype: non-empty sequence of feature type names to splice.
            ``ftype[0]`` also labels the definition line.
        dline: ``'complete'`` selects the verbose definition line (location,
            type, parent ID, ID and Name); any other value gives ``>ID`` or
            ``>ID-CDS``.

    Returns:
        dict mapping each definition line to its spliced sequence.  Children
        with no usable segments are skipped after printing a warning.
    """
    seq = dict()
    roots = [
        line for line in gff.lines
        if line['line_type'] == 'feature' and 'Parent' not in line['attributes']
    ]
    for root in roots:
        # NOTE: pseudogene roots are not skipped, even when CDS is requested.
        rid = root['attributes'].get('ID', 'NA')

        for child in root['children']:
            attrs = child['attributes']
            cid = attrs.get('ID', 'NA')
            cname = attrs.get('Name', cid)  # Name falls back to the ID

            if dline == 'complete':
                defline = '>{0:s}:{1:d}..{2:d}:{3:s}|{4:s}({8:s})|Parent={5:s}|ID={6:s}|Name={7:s}'.format(child['seqid'], child['start'], child['end'], child['strand'], child['type'], rid, cid, cname, ftype[0])
            elif ftype[0] == 'CDS':
                defline = '>{0:s}-CDS'.format(cid)
            else:
                defline = '>{0:s}'.format(cid)

            gchildren = child['children']
            segments = [g for g in gchildren if g['type'] in ftype]

            # When none of the requested types is present, fall back to CDS.
            used_cds_fallback = False
            if not segments:
                used_cds_fallback = True
                segments = [g for g in gchildren if g['type'] == 'CDS']

            if not segments:
                if ftype[0] == 'CDS':
                    print("WARNING  There is no CDS feature for {0:s} in the input gff. The sequence of {0:s} is not generated.".format(cid))
                else:
                    print("WARNING  There is no exon, nor CDS feature for {0:s} in the input gff. The sequence of {0:s} is not generated.".format(cid))
                continue

            if used_cds_fallback:
                print("WARNING  There is no exon feature for {0:s} in the input gff. CDS features are used for splicing instead.".format(cid))

            # Orientation comes from the segments actually being spliced, not
            # from whichever grandchild happened to be iterated last.
            on_minus_strand = segments[0]['strand'] == '-'
            sort_seg = function4gff.featureSort(segments, reverse=on_minus_strand)

            tmpseq = ''
            for s in sort_seg:
                tmpseq = tmpseq + get_subseq(gff, s)

            seq[defline] = tmpseq

    return seq
Exemplo n.º 2
0
def _core_columns(line):
    """Return the seven GFF columns (seqid through strand) that identify a feature's placement."""
    # Phase is deliberately left out: the original comparison never rendered it.
    return (
        line['seqid'], line['source'], line['type'],
        line['start'], line['end'], line['score'], line['strand'],
    )


def check_duplicate(gff, linelist):
    '''
    Report pairs of features in ``linelist`` that duplicate each other.

    Two features form a duplicate pair when their seqid, source, type, start,
    end, score and strand are all equal, both have the same non-zero number
    of children, and their children (sorted with function4gff.featureSort)
    match pairwise on the same seven columns.

    This function assumes that,
    1. Each gene is unique
    2. Children features such as Exons/CDSs do not contain multiple Parent IDs

    Note: If there are additional transcript types in the input gff, the
    FEATURECODE dict used by featureSort may need the new type added. (The
    original note pointed at intra_model.featureSort, while this function
    calls function4gff.featureSort -- confirm where FEATURECODE lives.)

    Args:
        gff: parsed GFF object; ``gff.add_line_error(line, error_dict)`` is
            called for both members of every duplicate pair.
        linelist: list of feature line mappings to compare with each other.

    Returns:
        A list of result dicts (keys ``ID``, ``eCode``, ``eLines``, ``eTag``),
        or None when no duplicate is found.
    '''

    eCode = 'Emr0001'
    eSet = list()

    # Compare raw column values instead of formatted strings, so a numeric
    # score cannot trip a '{:s}' format specifier.
    keys = [_core_columns(line) for line in linelist]
    pairs = list()
    for i in range(len(linelist) - 1):
        for j in range(i + 1, len(linelist)):
            if keys[i] == keys[j]:
                pairs.append((linelist[i], linelist[j]))

    for source, target in pairs:
        if 'children' not in source or 'children' not in target:
            continue
        schildren = source['children']
        tchildren = target['children']
        if len(schildren) != len(tchildren):
            continue

        # Both features share a strand here, since strand is part of the key.
        reverse = source['strand'] == '-'
        sort_schildren = function4gff.featureSort(schildren, reverse=reverse)
        sort_tchildren = function4gff.featureSort(tchildren, reverse=reverse)

        # Features without any children are never reported as duplicates.
        same_target = len(sort_schildren) > 0 and all(
            _core_columns(s) == _core_columns(t)
            for s, t in zip(sort_schildren, sort_tchildren)
        )
        if not same_target:
            continue

        sid = source['attributes']['ID']
        tid = target['attributes']['ID']
        message = 'Duplicate transcripts found between {0:s} and {1:s}'.format(sid, tid)

        result = dict()
        result['ID'] = [sid, tid]
        result['eCode'] = eCode
        result['eLines'] = [source, target]
        result['eTag'] = message
        eSet.append(result)
        gff.add_line_error(source, {'message': message, 'error_type': 'INTER_MODEL', 'eCode': eCode})
        gff.add_line_error(target, {'message': message, 'error_type': 'INTER_MODEL', 'eCode': eCode})

    if len(eSet):
        return eSet
    return None