Пример #1
0
def _format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate gene records based on the parsed file contents

    Only parent features whose 'type' contains "gene" are converted. For each
    of them the transcript IDs and the per-transcript exon coordinates are
    collected; when a transcript has no explicit 'exon' sub-feature the exons
    are built from its CDS/UTR sub-features, or taken from any sub-feature
    type whose name ends with "exon".

    parent_map: parent features with source and chromosome information,
        keyed by a tuple whose first item is stored as the chromosome, the
        second as the source and the last as the gene name
    child_map: transcript and exon information are encoded; transcripts are
        looked up by the parent key, sub-features by
        (chromosome, source, transcript ID)

    Returns a NumPy structured array (dtype from utils.init_gene_DE()) with
    one record per accepted gene, in the iteration order of parent_nf_map.

    Side effect: a parent entry without 'location' gets one inferred from
    its transcripts, and that value is written back into parent_nf_map.
    """
    g_cnt = 0
    gene = np.zeros((len(parent_nf_map), ), dtype=utils.init_gene_DE())

    for pkey, pdet in parent_nf_map.items():
        # considering only gene features
        if not re.search(r'gene', pdet.get('type', '')):
            continue

        transcripts = child_nf_map[pkey]

        # infer the gene start and stop from the transcripts when the parent
        # feature carries no location of its own
        if not pdet.get('location', []):
            starts = sorted(tr.get('location', [])[0] for tr in transcripts)
            ends = sorted(tr.get('location', [])[1] for tr in transcripts)
            pdet['location'] = [starts[0], ends[-1]]
        location = pdet.get('location', [])

        # indexing a structured array with an int yields a record that
        # writes through to the array, exactly like gene[g_cnt][...] = ...
        rec = gene[g_cnt]
        rec['id'] = g_cnt + 1
        rec['chr'] = pkey[0]
        rec['source'] = pkey[1]
        rec['name'] = pkey[-1]
        rec['start'] = location[0]
        rec['stop'] = location[1]
        rec['strand'] = pdet.get('strand', '')

        # more than one transcript means the gene is alternatively spliced
        rec['is_alt_spliced'] = 1 if len(transcripts) > 1 else 0

        # complete sub-feature for all transcripts
        # (plain `object`: the np.object alias was removed from NumPy)
        dim = len(transcripts)
        TRS = np.zeros((dim, ), dtype=object)
        EXON = np.zeros((dim, ), dtype=object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(transcripts):
            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])

            orient = Lv1.get('strand', '')

            # fetching different sub-features, grouped by feature type
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                child_feat[Lv2.get('type', '')].append(Lv2.get('location'))

            # make exon coordinate from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    child_feat['exon'] = utils.make_Exon_cod(
                        orient,
                        NonetoemptyList(child_feat.get('five_prime_UTR')),
                        NonetoemptyList(child_feat.get('CDS')),
                        NonetoemptyList(child_feat.get('three_prime_UTR')))
                else:
                    # searching through keys to find a pattern describing
                    # exon feature; a transcript may have none at all
                    # TODO only UTR's
                    ex_key_pattern = [
                        k for k in child_feat if k.endswith("exon")
                    ]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # make general ascending order of coordinates
            if orient == '-':
                for excod in child_feat.values():
                    if len(excod) > 1 and excod[0][0] > excod[-1][0]:
                        # in-place, so the mapping already sees the new order
                        excod.reverse()

            # add sub-feature # make array for export to different out
            exons = child_feat.get('exon')
            if exons is None:
                # no exon information: store an empty table, not a NaN scalar
                exons = []
            EXON[xq] = np.array(exons, np.float64)

        # add sub-features to the parent gene feature
        rec['transcripts'] = TRS
        rec['exons'] = EXON

        rec['gene_info'] = dict(ID=pkey[-1],
                                Name=pdet.get('name'),
                                Source=pkey[1])
        g_cnt += 1

    # Records are filled front to back, so only the first g_cnt entries are
    # real; drop the unused tail left behind by skipped non-gene features.
    if g_cnt < len(gene):
        gene = gene[:g_cnt].copy()

    return gene
Пример #2
0
def format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate gene records based on the parsed file contents

    Every entry of parent_nf_map becomes one record holding the gene
    coordinates plus, per transcript, its exon/UTR/CDS coordinates and the
    derived sites stored as 'tss', 'tis', 'cdsStop' and 'cleave'.

    @args parent_nf_map: parent features with source and chromosome information
    @type parent_nf_map: collections defaultdict
    @args child_nf_map: transcript and exon information are encoded
    @type child_nf_map: collections defaultdict
    @return: structured array (dtype from utils.init_gene()), one record per
        parent feature in the iteration order of parent_nf_map
    @rtype: numpy.ndarray

    Side effect: a parent entry without 'location' gets one inferred from
    its transcripts, and that value is written back into parent_nf_map.
    """
    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype=utils.init_gene())

    for pkey, pdet in parent_nf_map.items():
        # NOTE: parent features are not filtered by type; every entry is
        # converted into a gene record.
        transcripts = child_nf_map[pkey]

        # infer the gene start and stop from the transcripts when the parent
        # feature carries no location of its own
        if not pdet.get('location', []):
            starts = sorted(tr.get('location', [])[0] for tr in transcripts)
            ends = sorted(tr.get('location', [])[1] for tr in transcripts)
            pdet['location'] = [starts[0], ends[-1]]
        location = pdet.get('location', [])

        # indexing a structured array with an int yields a record that
        # writes through to the array, exactly like gene[g_cnt][...] = ...
        rec = gene[g_cnt]
        rec['id'] = g_cnt + 1
        rec['chr'] = pkey[0]
        rec['source'] = pkey[1]
        rec['name'] = pkey[-1]
        rec['start'] = location[0]
        rec['stop'] = location[1]
        rec['strand'] = pdet.get('strand', '')
        rec['score'] = pdet.get('score', '')

        # more than one transcript means the gene is alternatively spliced
        rec['is_alt_spliced'] = rec['is_alt'] = (
            1 if len(transcripts) > 1 else 0)

        # complete sub-feature for all transcripts
        # (plain `object`: the np.object alias was removed from NumPy)
        dim = len(transcripts)
        TRS = np.zeros((dim,), dtype=object)
        TR_TYP = np.zeros((dim,), dtype=object)
        EXON = np.zeros((dim,), dtype=object)
        UTR5 = np.zeros((dim,), dtype=object)
        UTR3 = np.zeros((dim,), dtype=object)
        CDS = np.zeros((dim,), dtype=object)
        TISc = np.zeros((dim,), dtype=object)
        TSSc = np.zeros((dim,), dtype=object)
        CLV = np.zeros((dim,), dtype=object)
        CSTOP = np.zeros((dim,), dtype=object)
        TSTAT = np.zeros((dim,), dtype=object)
        TSCORE = np.zeros((dim,), dtype=object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(transcripts):
            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])

            TYPE = Lv1.get('type', '')
            TR_TYP[xq] = np.array(TYPE) if TYPE else np.array('')

            orient = Lv1.get('strand', '')
            tr_score = Lv1.get('score', '')

            # fetching different sub-features, grouped by feature type
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                child_feat[Lv2.get('type', '')].append(Lv2.get('location'))

            # make general ascending order of coordinates
            if orient == '-':
                for excod in child_feat.values():
                    if len(excod) > 1 and excod[0][0] > excod[-1][0]:
                        # in-place, so the mapping already sees the new order
                        excod.reverse()

            # make exon coordinate from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    child_feat['exon'] = utils.make_Exon_cod(
                        orient,
                        NonetoemptyList(child_feat.get('five_prime_UTR')),
                        NonetoemptyList(child_feat.get('CDS')),
                        NonetoemptyList(child_feat.get('three_prime_UTR')))
                else:
                    # TODO only UTR's
                    # searching through keys to find a pattern describing
                    # exon feature
                    ex_key_pattern = [
                        k for k in child_feat if k.endswith("exon")
                    ]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # stop_codon are separated from CDS, add the coordinates based on
            # strand. Skipped when the transcript has no CDS at all, which
            # would otherwise fail on indexing a missing CDS list.
            stop_codons = child_feat.get('stop_codon')
            if stop_codons and child_feat.get('CDS'):
                stop = stop_codons[0]
                cds_parts = child_feat['CDS']
                if orient == '+':
                    if stop[0] - cds_parts[-1][1] == 1:
                        # stop codon directly follows the last CDS: extend it
                        cds_parts[-1] = [cds_parts[-1][0], stop[1]]
                    else:
                        cds_parts.append(stop)
                elif orient == '-':
                    if cds_parts[0][0] - stop[1] == 1:
                        # stop codon directly precedes the first CDS
                        cds_parts[0] = [stop[0], cds_parts[0][1]]
                    else:
                        cds_parts.insert(0, stop)

            # transcript signal sites
            TIS, cdsStop, TSS, cleave = [], [], [], []
            cds_status, exon_status, utr_status = 0, 0, 0

            exons = child_feat.get('exon')
            if exons:
                if orient == '+':
                    TSS = [exons[0][0]]
                    cleave = [exons[-1][1]]
                else:
                    TSS = [exons[-1][1]]
                    cleave = [exons[0][0]]
                exon_status = 1

            cds_parts = child_feat.get('CDS')
            if cds_parts:
                if orient == '+':
                    TIS = [cds_parts[0][0]]
                    cdsStop = [cds_parts[-1][1] - 3]
                else:
                    TIS = [cds_parts[-1][1]]
                    cdsStop = [cds_parts[0][0] + 3]
                cds_status = 1
                # cds phase calculation
                child_feat['CDS'] = utils.add_CDS_phase(orient, cds_parts)

            # sub-feature status
            if child_feat.get('three_prime_UTR') or child_feat.get(
                    'five_prime_UTR'):
                utr_status = 1

            # a transcript counts as complete only with UTR, CDS and exons
            t_status = 1 if utr_status == cds_status == exon_status == 1 else 0

            # add sub-feature # make array for export to different out
            exons = child_feat.get('exon')
            if exons is None:
                # no exon information: store an empty table, not a NaN scalar
                exons = []
            TSTAT[xq] = t_status
            EXON[xq] = np.array(exons, np.float64)
            UTR5[xq] = np.array(
                NonetoemptyList(child_feat.get('five_prime_UTR')))
            UTR3[xq] = np.array(
                NonetoemptyList(child_feat.get('three_prime_UTR')))
            CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
            TISc[xq] = np.array(TIS)
            CSTOP[xq] = np.array(cdsStop)
            TSSc[xq] = np.array(TSS)
            CLV[xq] = np.array(cleave)
            TSCORE[xq] = tr_score

        # add sub-features to the parent gene feature
        rec['transcript_status'] = TSTAT
        rec['transcripts'] = TRS
        rec['exons'] = EXON
        rec['utr5_exons'] = UTR5
        rec['cds_exons'] = CDS
        rec['utr3_exons'] = UTR3
        rec['transcript_type'] = TR_TYP
        rec['tis'] = TISc
        rec['cdsStop'] = CSTOP
        rec['tss'] = TSSc
        rec['cleave'] = CLV
        rec['transcript_score'] = TSCORE

        rec['gene_info'] = dict(ID=pkey[-1],
                                Name=pdet.get('name'),
                                Source=pkey[1])

        # few empty fields // TODO fill this:
        for field in ('anno_id', 'confgenes_id', 'name2', 'chr_num',
                      'paralogs', 'transcript_valid', 'exons_confirmed',
                      'tis_conf', 'tis_info', 'cdsStop_conf', 'cdsStop_info',
                      'tss_info', 'tss_conf', 'cleave_info', 'cleave_conf',
                      'polya_info', 'polya_conf', 'is_valid',
                      'transcript_complete', 'is_complete', 'splicegraph'):
            rec[field] = []
        rec['alias'] = ''
        rec['is_correctly_gff3_referenced'] = ''
        g_cnt += 1

    # Records are filled front to back, so only the first g_cnt entries are
    # real. Slicing by the counter replaces an index range that reached one
    # past the end of the array.
    if g_cnt < len(gene):
        gene = gene[:g_cnt].copy()

    return gene
Пример #3
0
def format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate gene records based on the parsed file contents

    Every entry of parent_nf_map becomes one record holding the gene
    coordinates plus, per transcript, its exon/UTR/CDS coordinates and the
    derived sites stored as 'tss', 'tis', 'cdsStop' and 'cleave'.

    @args parent_nf_map: parent features with source and chromosome information
    @type parent_nf_map: collections defaultdict
    @args child_nf_map: transcript and exon information are encoded
    @type child_nf_map: collections defaultdict
    @return: structured array (dtype from utils.init_gene()), one record per
        parent feature in the iteration order of parent_nf_map
    @rtype: numpy.ndarray

    Side effect: a parent entry without 'location' gets one inferred from
    its transcripts, and that value is written back into parent_nf_map.
    """
    g_cnt = 0
    gene = np.zeros((len(parent_nf_map), ), dtype=utils.init_gene())

    for pkey, pdet in parent_nf_map.items():
        # NOTE: parent features are not filtered by type; every entry is
        # converted into a gene record.
        transcripts = child_nf_map[pkey]

        # infer the gene start and stop from the transcripts when the parent
        # feature carries no location of its own
        if not pdet.get('location', []):
            starts = sorted(tr.get('location', [])[0] for tr in transcripts)
            ends = sorted(tr.get('location', [])[1] for tr in transcripts)
            pdet['location'] = [starts[0], ends[-1]]
        location = pdet.get('location', [])

        # indexing a structured array with an int yields a record that
        # writes through to the array, exactly like gene[g_cnt][...] = ...
        rec = gene[g_cnt]
        rec['id'] = g_cnt + 1
        rec['chr'] = pkey[0]
        rec['source'] = pkey[1]
        rec['name'] = pkey[-1]
        rec['start'] = location[0]
        rec['stop'] = location[1]
        rec['strand'] = pdet.get('strand', '')
        rec['score'] = pdet.get('score', '')

        # more than one transcript means the gene is alternatively spliced
        rec['is_alt_spliced'] = rec['is_alt'] = (
            1 if len(transcripts) > 1 else 0)

        # complete sub-feature for all transcripts
        # (plain `object`: the np.object alias was removed from NumPy)
        dim = len(transcripts)
        TRS = np.zeros((dim, ), dtype=object)
        TR_TYP = np.zeros((dim, ), dtype=object)
        EXON = np.zeros((dim, ), dtype=object)
        UTR5 = np.zeros((dim, ), dtype=object)
        UTR3 = np.zeros((dim, ), dtype=object)
        CDS = np.zeros((dim, ), dtype=object)
        TISc = np.zeros((dim, ), dtype=object)
        TSSc = np.zeros((dim, ), dtype=object)
        CLV = np.zeros((dim, ), dtype=object)
        CSTOP = np.zeros((dim, ), dtype=object)
        TSTAT = np.zeros((dim, ), dtype=object)
        TSCORE = np.zeros((dim, ), dtype=object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(transcripts):
            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])

            TYPE = Lv1.get('type', '')
            TR_TYP[xq] = np.array(TYPE) if TYPE else np.array('')

            orient = Lv1.get('strand', '')
            tr_score = Lv1.get('score', '')

            # fetching different sub-features, grouped by feature type
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                child_feat[Lv2.get('type', '')].append(Lv2.get('location'))

            # make general ascending order of coordinates
            if orient == '-':
                for excod in child_feat.values():
                    if len(excod) > 1 and excod[0][0] > excod[-1][0]:
                        # in-place, so the mapping already sees the new order
                        excod.reverse()

            # make exon coordinate from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    child_feat['exon'] = utils.make_Exon_cod(
                        orient,
                        NonetoemptyList(child_feat.get('five_prime_UTR')),
                        NonetoemptyList(child_feat.get('CDS')),
                        NonetoemptyList(child_feat.get('three_prime_UTR')))
                else:
                    # TODO only UTR's
                    # searching through keys to find a pattern describing
                    # exon feature
                    ex_key_pattern = [
                        k for k in child_feat if k.endswith("exon")
                    ]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # stop_codon are separated from CDS, add the coordinates based on
            # strand. Skipped when the transcript has no CDS at all, which
            # would otherwise fail on indexing a missing CDS list.
            stop_codons = child_feat.get('stop_codon')
            if stop_codons and child_feat.get('CDS'):
                stop = stop_codons[0]
                cds_parts = child_feat['CDS']
                if orient == '+':
                    if stop[0] - cds_parts[-1][1] == 1:
                        # stop codon directly follows the last CDS: extend it
                        cds_parts[-1] = [cds_parts[-1][0], stop[1]]
                    else:
                        cds_parts.append(stop)
                elif orient == '-':
                    if cds_parts[0][0] - stop[1] == 1:
                        # stop codon directly precedes the first CDS
                        cds_parts[0] = [stop[0], cds_parts[0][1]]
                    else:
                        cds_parts.insert(0, stop)

            # transcript signal sites
            TIS, cdsStop, TSS, cleave = [], [], [], []
            cds_status, exon_status, utr_status = 0, 0, 0

            exons = child_feat.get('exon')
            if exons:
                if orient == '+':
                    TSS = [exons[0][0]]
                    cleave = [exons[-1][1]]
                else:
                    TSS = [exons[-1][1]]
                    cleave = [exons[0][0]]
                exon_status = 1

            cds_parts = child_feat.get('CDS')
            if cds_parts:
                if orient == '+':
                    TIS = [cds_parts[0][0]]
                    cdsStop = [cds_parts[-1][1] - 3]
                else:
                    TIS = [cds_parts[-1][1]]
                    cdsStop = [cds_parts[0][0] + 3]
                cds_status = 1
                # cds phase calculation
                child_feat['CDS'] = utils.add_CDS_phase(orient, cds_parts)

            # sub-feature status
            if child_feat.get('three_prime_UTR') or child_feat.get(
                    'five_prime_UTR'):
                utr_status = 1

            # a transcript counts as complete only with UTR, CDS and exons
            t_status = 1 if utr_status == cds_status == exon_status == 1 else 0

            # add sub-feature # make array for export to different out
            exons = child_feat.get('exon')
            if exons is None:
                # no exon information: store an empty table, not a NaN scalar
                exons = []
            TSTAT[xq] = t_status
            EXON[xq] = np.array(exons, np.float64)
            UTR5[xq] = np.array(
                NonetoemptyList(child_feat.get('five_prime_UTR')))
            UTR3[xq] = np.array(
                NonetoemptyList(child_feat.get('three_prime_UTR')))
            CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
            TISc[xq] = np.array(TIS)
            CSTOP[xq] = np.array(cdsStop)
            TSSc[xq] = np.array(TSS)
            CLV[xq] = np.array(cleave)
            TSCORE[xq] = tr_score

        # add sub-features to the parent gene feature
        rec['transcript_status'] = TSTAT
        rec['transcripts'] = TRS
        rec['exons'] = EXON
        rec['utr5_exons'] = UTR5
        rec['cds_exons'] = CDS
        rec['utr3_exons'] = UTR3
        rec['transcript_type'] = TR_TYP
        rec['tis'] = TISc
        rec['cdsStop'] = CSTOP
        rec['tss'] = TSSc
        rec['cleave'] = CLV
        rec['transcript_score'] = TSCORE

        rec['gene_info'] = dict(ID=pkey[-1],
                                Name=pdet.get('name'),
                                Source=pkey[1])

        # few empty fields // TODO fill this:
        for field in ('anno_id', 'confgenes_id', 'name2', 'chr_num',
                      'paralogs', 'transcript_valid', 'exons_confirmed',
                      'tis_conf', 'tis_info', 'cdsStop_conf', 'cdsStop_info',
                      'tss_info', 'tss_conf', 'cleave_info', 'cleave_conf',
                      'polya_info', 'polya_conf', 'is_valid',
                      'transcript_complete', 'is_complete', 'splicegraph'):
            rec[field] = []
        rec['alias'] = ''
        rec['is_correctly_gff3_referenced'] = ''
        g_cnt += 1

    # Records are filled front to back, so only the first g_cnt entries are
    # real. Slicing by the counter replaces an index range that reached one
    # past the end of the array.
    if g_cnt < len(gene):
        gene = gene[:g_cnt].copy()

    return gene
Пример #4
0
def _format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate gene records based on the parsed file contents

    Only parent features whose 'type' contains "gene" are converted. For each
    of them the transcript IDs and the per-transcript exon coordinates are
    collected; when a transcript has no explicit 'exon' sub-feature the exons
    are built from its CDS/UTR sub-features, or taken from any sub-feature
    type whose name ends with "exon".

    parent_map: parent features with source and chromosome information,
        keyed by a tuple whose first item is stored as the chromosome, the
        second as the source and the last as the gene name
    child_map: transcript and exon information are encoded; transcripts are
        looked up by the parent key, sub-features by
        (chromosome, source, transcript ID)

    Returns a NumPy structured array (dtype from utils.init_gene_DE()) with
    one record per accepted gene, in the iteration order of parent_nf_map.

    Side effect: a parent entry without 'location' gets one inferred from
    its transcripts, and that value is written back into parent_nf_map.
    """
    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype=utils.init_gene_DE())

    for pkey, pdet in parent_nf_map.items():
        # considering only gene features
        if not re.search(r'gene', pdet.get('type', '')):
            continue

        transcripts = child_nf_map[pkey]

        # infer the gene start and stop from the transcripts when the parent
        # feature carries no location of its own
        if not pdet.get('location', []):
            starts = sorted(tr.get('location', [])[0] for tr in transcripts)
            ends = sorted(tr.get('location', [])[1] for tr in transcripts)
            pdet['location'] = [starts[0], ends[-1]]
        location = pdet.get('location', [])

        # indexing a structured array with an int yields a record that
        # writes through to the array, exactly like gene[g_cnt][...] = ...
        rec = gene[g_cnt]
        rec['id'] = g_cnt + 1
        rec['chr'] = pkey[0]
        rec['source'] = pkey[1]
        rec['name'] = pkey[-1]
        rec['start'] = location[0]
        rec['stop'] = location[1]
        rec['strand'] = pdet.get('strand', '')

        # more than one transcript means the gene is alternatively spliced
        rec['is_alt_spliced'] = 1 if len(transcripts) > 1 else 0

        # complete sub-feature for all transcripts
        # (plain `object`: the np.object alias was removed from NumPy)
        dim = len(transcripts)
        TRS = np.zeros((dim,), dtype=object)
        EXON = np.zeros((dim,), dtype=object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(transcripts):
            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])

            orient = Lv1.get('strand', '')

            # fetching different sub-features, grouped by feature type
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                child_feat[Lv2.get('type', '')].append(Lv2.get('location'))

            # make exon coordinate from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    child_feat['exon'] = utils.make_Exon_cod(
                        orient,
                        NonetoemptyList(child_feat.get('five_prime_UTR')),
                        NonetoemptyList(child_feat.get('CDS')),
                        NonetoemptyList(child_feat.get('three_prime_UTR')))
                else:
                    # searching through keys to find a pattern describing
                    # exon feature; a transcript may have none at all
                    # TODO only UTR's
                    ex_key_pattern = [
                        k for k in child_feat if k.endswith("exon")
                    ]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # make general ascending order of coordinates
            if orient == '-':
                for excod in child_feat.values():
                    if len(excod) > 1 and excod[0][0] > excod[-1][0]:
                        # in-place, so the mapping already sees the new order
                        excod.reverse()

            # add sub-feature # make array for export to different out
            exons = child_feat.get('exon')
            if exons is None:
                # no exon information: store an empty table, not a NaN scalar
                exons = []
            EXON[xq] = np.array(exons, np.float64)

        # add sub-features to the parent gene feature
        rec['transcripts'] = TRS
        rec['exons'] = EXON

        rec['gene_info'] = dict(ID=pkey[-1],
                                Name=pdet.get('name'),
                                Source=pkey[1])
        g_cnt += 1

    # Records are filled front to back, so only the first g_cnt entries are
    # real; drop the unused tail left behind by skipped non-gene features.
    # Slicing by the counter replaces an index range that reached one past
    # the end of the array.
    if g_cnt < len(gene):
        gene = gene[:g_cnt].copy()

    return gene