def _format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate GeneObject records based on the parsed file contents.

    Only parent features whose 'type' matches the pattern "gene" are
    converted; every other parent entry is skipped.

    @args parent_nf_map: parent features with source and chromosome
        information. Keys are indexable: element 0 is stored as 'chr',
        element 1 as 'source' and the last element as 'name'.
    @type parent_nf_map: dict-like (presumably collections defaultdict)
    @args child_nf_map: transcript and exon information. It is indexed
        with the parent key to get the transcripts and with
        (key[0], key[1], transcript ID) to get their sub-features.
    @type child_nf_map: dict-like (presumably collections defaultdict)

    Returns the populated records as a NumPy structured array whose dtype
    comes from utils.init_gene_DE() (defined outside this block).

    Side effect: a parent entry without a 'location' gets one written back,
    inferred from the locations of its transcripts.
    """
    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype=utils.init_gene_DE())

    for pkey, pdet in parent_nf_map.items():
        # considering only gene features
        if not re.search(r'gene', pdet.get('type', '')):
            continue

        transcripts = child_nf_map[pkey]

        # infer the gene start and stop from the transcripts when the
        # parent record carries no location of its own
        if not pdet.get('location', []):
            starts, stops = [], []
            for trans in transcripts:
                starts.append(trans.get('location', [])[0])
                stops.append(trans.get('location', [])[1])
            starts.sort()
            stops.sort()
            pdet['location'] = [starts[0], stops[-1]]

        gene[g_cnt]['id'] = g_cnt + 1
        gene[g_cnt]['chr'] = pkey[0]
        gene[g_cnt]['source'] = pkey[1]
        gene[g_cnt]['name'] = pkey[-1]
        gene[g_cnt]['start'] = pdet.get('location', [])[0]
        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
        gene[g_cnt]['strand'] = pdet.get('strand', '')

        # more than one transcript marks the gene as alternatively spliced
        gene[g_cnt]['is_alt_spliced'] = 1 if len(transcripts) > 1 else 0

        # one slot per transcript; the builtin `object` replaces the
        # `np.object` alias, which no longer exists in NumPy >= 1.24
        dim = len(transcripts)
        TRS = np.zeros((dim,), dtype=object)
        EXON = np.zeros((dim,), dtype=object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(transcripts):
            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])
            orient = Lv1.get('strand', '')

            # group the sub-feature locations by feature type
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                child_feat[Lv2.get('type', '')].append(Lv2.get('location'))

            # make exon coordinates from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    child_feat['exon'] = utils.make_Exon_cod(
                        orient,
                        NonetoemptyList(child_feat.get('five_prime_UTR')),
                        NonetoemptyList(child_feat.get('CDS')),
                        NonetoemptyList(child_feat.get('three_prime_UTR')))
                else:
                    # TODO only UTR's
                    # search the keys for a type name ending in "exon";
                    # a transcript may have none, so do not index blindly
                    ex_key_pattern = [
                        k for k in child_feat if k.endswith("exon")
                    ]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # make general ascending order of coordinates
            if orient == '-':
                for excod in child_feat.values():
                    if len(excod) > 1 and excod[0][0] > excod[-1][0]:
                        excod.reverse()

            # add sub-feature
            # make array for export to different out
            EXON[xq] = np.array(child_feat.get('exon'), np.float64)

        # add sub-features to the parent gene feature
        gene[g_cnt]['transcripts'] = TRS
        gene[g_cnt]['exons'] = EXON
        gene[g_cnt]['gene_info'] = dict(ID=pkey[-1],
                                        Name=pdet.get('name'),
                                        Source=pkey[1])
        g_cnt += 1

    # Records are filled strictly in order, so the first g_cnt entries are
    # exactly the populated ones. Slicing drops the unused tail without
    # touching the last valid record and also works for an empty input.
    return gene[:g_cnt]
def format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate GeneObject records based on the parsed file contents.

    Every entry of parent_nf_map is converted; no feature-type filter is
    applied in this function.

    @args parent_nf_map: parent features with source and chromosome
        information. Keys are indexable: element 0 is stored as 'chr',
        element 1 as 'source' and the last element as 'name'.
    @type parent_nf_map: collections defaultdict
    @args child_nf_map: transcript and exon information. It is indexed
        with the parent key to get the transcripts and with
        (key[0], key[1], transcript ID) to get their sub-features.
    @type child_nf_map: collections defaultdict

    Returns the populated records as a NumPy structured array whose dtype
    comes from utils.init_gene() (defined outside this block).

    Side effect: a parent entry without a 'location' gets one written back,
    inferred from the locations of its transcripts.
    """
    # fields that are not derived from the annotation yet // TODO fill this
    empty_list_fields = (
        'anno_id', 'confgenes_id', 'name2', 'chr_num', 'paralogs',
        'transcript_valid', 'exons_confirmed', 'tis_conf', 'tis_info',
        'cdsStop_conf', 'cdsStop_info', 'tss_info', 'tss_conf',
        'cleave_info', 'cleave_conf', 'polya_info', 'polya_conf',
        'is_valid', 'transcript_complete', 'is_complete', 'splicegraph',
    )
    empty_text_fields = ('alias', 'is_correctly_gff3_referenced')

    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype=utils.init_gene())

    for pkey, pdet in parent_nf_map.items():
        transcripts = child_nf_map[pkey]

        # infer the gene start and stop from the transcripts when the
        # parent record carries no location of its own
        if not pdet.get('location', []):
            starts, stops = [], []
            for trans in transcripts:
                starts.append(trans.get('location', [])[0])
                stops.append(trans.get('location', [])[1])
            starts.sort()
            stops.sort()
            pdet['location'] = [starts[0], stops[-1]]

        gene[g_cnt]['id'] = g_cnt + 1
        gene[g_cnt]['chr'] = pkey[0]
        gene[g_cnt]['source'] = pkey[1]
        gene[g_cnt]['name'] = pkey[-1]
        gene[g_cnt]['start'] = pdet.get('location', [])[0]
        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
        gene[g_cnt]['strand'] = pdet.get('strand', '')
        gene[g_cnt]['score'] = pdet.get('score', '')

        # more than one transcript marks the gene as alternatively spliced
        alt_flag = 1 if len(transcripts) > 1 else 0
        gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = alt_flag

        # one slot per transcript for every sub-feature; the builtin
        # `object` replaces the `np.object` alias, which no longer exists
        # in NumPy >= 1.24
        dim = len(transcripts)
        TRS = np.zeros((dim,), dtype=object)
        TR_TYP = np.zeros((dim,), dtype=object)
        EXON = np.zeros((dim,), dtype=object)
        UTR5 = np.zeros((dim,), dtype=object)
        UTR3 = np.zeros((dim,), dtype=object)
        CDS = np.zeros((dim,), dtype=object)
        TISc = np.zeros((dim,), dtype=object)
        TSSc = np.zeros((dim,), dtype=object)
        CLV = np.zeros((dim,), dtype=object)
        CSTOP = np.zeros((dim,), dtype=object)
        TSTAT = np.zeros((dim,), dtype=object)
        TSCORE = np.zeros((dim,), dtype=object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(transcripts):
            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])
            TYPE = Lv1.get('type', '')
            TR_TYP[xq] = np.array(TYPE) if TYPE else np.array('')
            orient = Lv1.get('strand', '')
            tr_score = Lv1.get('score', '')

            # group the sub-feature locations by feature type
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                child_feat[Lv2.get('type', '')].append(Lv2.get('location'))

            # make general ascending order of coordinates
            if orient == '-':
                for excod in child_feat.values():
                    if len(excod) > 1 and excod[0][0] > excod[-1][0]:
                        excod.reverse()

            # make exon coordinates from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    child_feat['exon'] = utils.make_Exon_cod(
                        orient,
                        NonetoemptyList(child_feat.get('five_prime_UTR')),
                        NonetoemptyList(child_feat.get('CDS')),
                        NonetoemptyList(child_feat.get('three_prime_UTR')))
                else:
                    # TODO only UTR's
                    # search the keys for a type name ending in "exon"
                    ex_key_pattern = [
                        k for k in child_feat if k.endswith("exon")
                    ]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # stop_codon is separated from CDS; merge it based on strand.
            # Without any CDS there is nothing to extend, so the merge is
            # skipped (indexing the missing CDS used to raise TypeError).
            stop_codon = child_feat.get('stop_codon')
            if stop_codon and child_feat.get('CDS'):
                cds_segs = child_feat['CDS']
                stop = stop_codon[0]
                if orient == '+':
                    if stop[0] - cds_segs[-1][1] == 1:
                        # directly adjacent: extend the last CDS segment
                        cds_segs[-1] = [cds_segs[-1][0], stop[1]]
                    else:
                        cds_segs.append(stop)
                elif orient == '-':
                    if cds_segs[0][0] - stop[1] == 1:
                        # directly adjacent: extend the first CDS segment
                        cds_segs[0] = [stop[0], cds_segs[0][1]]
                    else:
                        cds_segs.insert(0, stop)

            # transcript signal sites
            TIS, cdsStop, TSS, cleave = [], [], [], []
            cds_status, exon_status, utr_status = 0, 0, 0

            exons = child_feat.get('exon')
            if exons:
                if orient == '+':
                    TSS, cleave = [exons[0][0]], [exons[-1][1]]
                else:
                    TSS, cleave = [exons[-1][1]], [exons[0][0]]
                exon_status = 1

            cds_segs = child_feat.get('CDS')
            if cds_segs:
                if orient == '+':
                    TIS = [cds_segs[0][0]]
                    cdsStop = [cds_segs[-1][1] - 3]
                else:
                    TIS = [cds_segs[-1][1]]
                    cdsStop = [cds_segs[0][0] + 3]
                cds_status = 1
                # cds phase calculation
                child_feat['CDS'] = utils.add_CDS_phase(orient, cds_segs)

            # sub-feature status
            if (child_feat.get('three_prime_UTR')
                    or child_feat.get('five_prime_UTR')):
                utr_status = 1

            # status is 1 only when UTR, CDS and exon are all present
            if utr_status == cds_status == exon_status == 1:
                t_status = 1
            else:
                t_status = 0

            # add sub-feature
            # make array for export to different out
            TSTAT[xq] = t_status
            EXON[xq] = np.array(child_feat.get('exon'), np.float64)
            UTR5[xq] = np.array(
                NonetoemptyList(child_feat.get('five_prime_UTR')))
            UTR3[xq] = np.array(
                NonetoemptyList(child_feat.get('three_prime_UTR')))
            CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
            TISc[xq] = np.array(TIS)
            CSTOP[xq] = np.array(cdsStop)
            TSSc[xq] = np.array(TSS)
            CLV[xq] = np.array(cleave)
            TSCORE[xq] = tr_score

        # add sub-features to the parent gene feature
        gene[g_cnt]['transcript_status'] = TSTAT
        gene[g_cnt]['transcripts'] = TRS
        gene[g_cnt]['exons'] = EXON
        gene[g_cnt]['utr5_exons'] = UTR5
        gene[g_cnt]['cds_exons'] = CDS
        gene[g_cnt]['utr3_exons'] = UTR3
        gene[g_cnt]['transcript_type'] = TR_TYP
        gene[g_cnt]['tis'] = TISc
        gene[g_cnt]['cdsStop'] = CSTOP
        gene[g_cnt]['tss'] = TSSc
        gene[g_cnt]['cleave'] = CLV
        gene[g_cnt]['transcript_score'] = TSCORE
        gene[g_cnt]['gene_info'] = dict(ID=pkey[-1],
                                        Name=pdet.get('name'),
                                        Source=pkey[1])

        # placeholders; a fresh list is created for every field
        for field in empty_list_fields:
            gene[g_cnt][field] = []
        for field in empty_text_fields:
            gene[g_cnt][field] = ''

        g_cnt += 1

    # Records are filled strictly in order, so the first g_cnt entries are
    # exactly the populated ones. Slicing avoids handing np.delete an index
    # one past the end of the array.
    return gene[:g_cnt]
def format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate GeneObject records based on the parsed file contents.

    Every entry of parent_nf_map is converted; no feature-type filter is
    applied in this function.

    @args parent_nf_map: parent features with source and chromosome
        information. Keys are indexable: element 0 is stored as 'chr',
        element 1 as 'source' and the last element as 'name'.
    @type parent_nf_map: collections defaultdict
    @args child_nf_map: transcript and exon information. It is indexed
        with the parent key to get the transcripts and with
        (key[0], key[1], transcript ID) to get their sub-features.
    @type child_nf_map: collections defaultdict

    Returns the populated records as a NumPy structured array whose dtype
    comes from utils.init_gene() (defined outside this block).

    Side effect: a parent entry without a 'location' gets one written back,
    inferred from the locations of its transcripts.
    """
    # fields that are not derived from the annotation yet // TODO fill this
    empty_list_fields = (
        'anno_id', 'confgenes_id', 'name2', 'chr_num', 'paralogs',
        'transcript_valid', 'exons_confirmed', 'tis_conf', 'tis_info',
        'cdsStop_conf', 'cdsStop_info', 'tss_info', 'tss_conf',
        'cleave_info', 'cleave_conf', 'polya_info', 'polya_conf',
        'is_valid', 'transcript_complete', 'is_complete', 'splicegraph',
    )
    empty_text_fields = ('alias', 'is_correctly_gff3_referenced')

    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype=utils.init_gene())

    for pkey, pdet in parent_nf_map.items():
        transcripts = child_nf_map[pkey]

        # infer the gene start and stop from the transcripts when the
        # parent record carries no location of its own
        if not pdet.get('location', []):
            starts, stops = [], []
            for trans in transcripts:
                starts.append(trans.get('location', [])[0])
                stops.append(trans.get('location', [])[1])
            starts.sort()
            stops.sort()
            pdet['location'] = [starts[0], stops[-1]]

        gene[g_cnt]['id'] = g_cnt + 1
        gene[g_cnt]['chr'] = pkey[0]
        gene[g_cnt]['source'] = pkey[1]
        gene[g_cnt]['name'] = pkey[-1]
        gene[g_cnt]['start'] = pdet.get('location', [])[0]
        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
        gene[g_cnt]['strand'] = pdet.get('strand', '')
        gene[g_cnt]['score'] = pdet.get('score', '')

        # more than one transcript marks the gene as alternatively spliced
        alt_flag = 1 if len(transcripts) > 1 else 0
        gene[g_cnt]['is_alt_spliced'] = gene[g_cnt]['is_alt'] = alt_flag

        # one slot per transcript for every sub-feature; the builtin
        # `object` replaces the `np.object` alias, which no longer exists
        # in NumPy >= 1.24
        dim = len(transcripts)
        TRS = np.zeros((dim,), dtype=object)
        TR_TYP = np.zeros((dim,), dtype=object)
        EXON = np.zeros((dim,), dtype=object)
        UTR5 = np.zeros((dim,), dtype=object)
        UTR3 = np.zeros((dim,), dtype=object)
        CDS = np.zeros((dim,), dtype=object)
        TISc = np.zeros((dim,), dtype=object)
        TSSc = np.zeros((dim,), dtype=object)
        CLV = np.zeros((dim,), dtype=object)
        CSTOP = np.zeros((dim,), dtype=object)
        TSTAT = np.zeros((dim,), dtype=object)
        TSCORE = np.zeros((dim,), dtype=object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(transcripts):
            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])
            TYPE = Lv1.get('type', '')
            TR_TYP[xq] = np.array(TYPE) if TYPE else np.array('')
            orient = Lv1.get('strand', '')
            tr_score = Lv1.get('score', '')

            # group the sub-feature locations by feature type
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                child_feat[Lv2.get('type', '')].append(Lv2.get('location'))

            # make general ascending order of coordinates
            if orient == '-':
                for excod in child_feat.values():
                    if len(excod) > 1 and excod[0][0] > excod[-1][0]:
                        excod.reverse()

            # make exon coordinates from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    child_feat['exon'] = utils.make_Exon_cod(
                        orient,
                        NonetoemptyList(child_feat.get('five_prime_UTR')),
                        NonetoemptyList(child_feat.get('CDS')),
                        NonetoemptyList(child_feat.get('three_prime_UTR')))
                else:
                    # TODO only UTR's
                    # search the keys for a type name ending in "exon"
                    ex_key_pattern = [
                        k for k in child_feat if k.endswith("exon")
                    ]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # stop_codon is separated from CDS; merge it based on strand.
            # Without any CDS there is nothing to extend, so the merge is
            # skipped (indexing the missing CDS used to raise TypeError).
            stop_codon = child_feat.get('stop_codon')
            if stop_codon and child_feat.get('CDS'):
                cds_segs = child_feat['CDS']
                stop = stop_codon[0]
                if orient == '+':
                    if stop[0] - cds_segs[-1][1] == 1:
                        # directly adjacent: extend the last CDS segment
                        cds_segs[-1] = [cds_segs[-1][0], stop[1]]
                    else:
                        cds_segs.append(stop)
                elif orient == '-':
                    if cds_segs[0][0] - stop[1] == 1:
                        # directly adjacent: extend the first CDS segment
                        cds_segs[0] = [stop[0], cds_segs[0][1]]
                    else:
                        cds_segs.insert(0, stop)

            # transcript signal sites
            TIS, cdsStop, TSS, cleave = [], [], [], []
            cds_status, exon_status, utr_status = 0, 0, 0

            exons = child_feat.get('exon')
            if exons:
                if orient == '+':
                    TSS, cleave = [exons[0][0]], [exons[-1][1]]
                else:
                    TSS, cleave = [exons[-1][1]], [exons[0][0]]
                exon_status = 1

            cds_segs = child_feat.get('CDS')
            if cds_segs:
                if orient == '+':
                    TIS = [cds_segs[0][0]]
                    cdsStop = [cds_segs[-1][1] - 3]
                else:
                    TIS = [cds_segs[-1][1]]
                    cdsStop = [cds_segs[0][0] + 3]
                cds_status = 1
                # cds phase calculation
                child_feat['CDS'] = utils.add_CDS_phase(orient, cds_segs)

            # sub-feature status
            if (child_feat.get('three_prime_UTR')
                    or child_feat.get('five_prime_UTR')):
                utr_status = 1

            # status is 1 only when UTR, CDS and exon are all present
            if utr_status == cds_status == exon_status == 1:
                t_status = 1
            else:
                t_status = 0

            # add sub-feature
            # make array for export to different out
            TSTAT[xq] = t_status
            EXON[xq] = np.array(child_feat.get('exon'), np.float64)
            UTR5[xq] = np.array(
                NonetoemptyList(child_feat.get('five_prime_UTR')))
            UTR3[xq] = np.array(
                NonetoemptyList(child_feat.get('three_prime_UTR')))
            CDS[xq] = np.array(NonetoemptyList(child_feat.get('CDS')))
            TISc[xq] = np.array(TIS)
            CSTOP[xq] = np.array(cdsStop)
            TSSc[xq] = np.array(TSS)
            CLV[xq] = np.array(cleave)
            TSCORE[xq] = tr_score

        # add sub-features to the parent gene feature
        gene[g_cnt]['transcript_status'] = TSTAT
        gene[g_cnt]['transcripts'] = TRS
        gene[g_cnt]['exons'] = EXON
        gene[g_cnt]['utr5_exons'] = UTR5
        gene[g_cnt]['cds_exons'] = CDS
        gene[g_cnt]['utr3_exons'] = UTR3
        gene[g_cnt]['transcript_type'] = TR_TYP
        gene[g_cnt]['tis'] = TISc
        gene[g_cnt]['cdsStop'] = CSTOP
        gene[g_cnt]['tss'] = TSSc
        gene[g_cnt]['cleave'] = CLV
        gene[g_cnt]['transcript_score'] = TSCORE
        gene[g_cnt]['gene_info'] = dict(ID=pkey[-1],
                                        Name=pdet.get('name'),
                                        Source=pkey[1])

        # placeholders; a fresh list is created for every field
        for field in empty_list_fields:
            gene[g_cnt][field] = []
        for field in empty_text_fields:
            gene[g_cnt][field] = ''

        g_cnt += 1

    # Records are filled strictly in order, so the first g_cnt entries are
    # exactly the populated ones. Slicing avoids handing np.delete an index
    # one past the end of the array.
    return gene[:g_cnt]
def _format_gene_models(parent_nf_map, child_nf_map):
    """
    Generate GeneObject records based on the parsed file contents.

    Only parent features whose 'type' matches the pattern "gene" are
    converted; every other parent entry is skipped.

    @args parent_nf_map: parent features with source and chromosome
        information. Keys are indexable: element 0 is stored as 'chr',
        element 1 as 'source' and the last element as 'name'.
    @type parent_nf_map: dict-like (presumably collections defaultdict)
    @args child_nf_map: transcript and exon information. It is indexed
        with the parent key to get the transcripts and with
        (key[0], key[1], transcript ID) to get their sub-features.
    @type child_nf_map: dict-like (presumably collections defaultdict)

    Returns the populated records as a NumPy structured array whose dtype
    comes from utils.init_gene_DE() (defined outside this block).

    Side effect: a parent entry without a 'location' gets one written back,
    inferred from the locations of its transcripts.
    """
    g_cnt = 0
    gene = np.zeros((len(parent_nf_map),), dtype=utils.init_gene_DE())

    for pkey, pdet in parent_nf_map.items():
        # considering only gene features
        if not re.search(r'gene', pdet.get('type', '')):
            continue

        transcripts = child_nf_map[pkey]

        # infer the gene start and stop from the transcripts when the
        # parent record carries no location of its own
        if not pdet.get('location', []):
            starts, stops = [], []
            for trans in transcripts:
                starts.append(trans.get('location', [])[0])
                stops.append(trans.get('location', [])[1])
            starts.sort()
            stops.sort()
            pdet['location'] = [starts[0], stops[-1]]

        gene[g_cnt]['id'] = g_cnt + 1
        gene[g_cnt]['chr'] = pkey[0]
        gene[g_cnt]['source'] = pkey[1]
        gene[g_cnt]['name'] = pkey[-1]
        gene[g_cnt]['start'] = pdet.get('location', [])[0]
        gene[g_cnt]['stop'] = pdet.get('location', [])[1]
        gene[g_cnt]['strand'] = pdet.get('strand', '')

        # more than one transcript marks the gene as alternatively spliced
        gene[g_cnt]['is_alt_spliced'] = 1 if len(transcripts) > 1 else 0

        # one slot per transcript; the builtin `object` replaces the
        # `np.object` alias, which no longer exists in NumPy >= 1.24
        dim = len(transcripts)
        TRS = np.zeros((dim,), dtype=object)
        EXON = np.zeros((dim,), dtype=object)

        # fetching corresponding transcripts
        for xq, Lv1 in enumerate(transcripts):
            TID = Lv1.get('ID', '')
            TRS[xq] = np.array([TID])
            orient = Lv1.get('strand', '')

            # group the sub-feature locations by feature type
            child_feat = defaultdict(list)
            for Lv2 in child_nf_map[(pkey[0], pkey[1], TID)]:
                child_feat[Lv2.get('type', '')].append(Lv2.get('location'))

            # make exon coordinates from cds and utr regions
            if not child_feat.get('exon'):
                if child_feat.get('CDS'):
                    child_feat['exon'] = utils.make_Exon_cod(
                        orient,
                        NonetoemptyList(child_feat.get('five_prime_UTR')),
                        NonetoemptyList(child_feat.get('CDS')),
                        NonetoemptyList(child_feat.get('three_prime_UTR')))
                else:
                    # TODO only UTR's
                    # search the keys for a type name ending in "exon";
                    # a transcript may have none, so do not index blindly
                    ex_key_pattern = [
                        k for k in child_feat if k.endswith("exon")
                    ]
                    if ex_key_pattern:
                        child_feat['exon'] = child_feat[ex_key_pattern[0]]

            # make general ascending order of coordinates
            if orient == '-':
                for excod in child_feat.values():
                    if len(excod) > 1 and excod[0][0] > excod[-1][0]:
                        excod.reverse()

            # add sub-feature
            # make array for export to different out
            EXON[xq] = np.array(child_feat.get('exon'), np.float64)

        # add sub-features to the parent gene feature
        gene[g_cnt]['transcripts'] = TRS
        gene[g_cnt]['exons'] = EXON
        gene[g_cnt]['gene_info'] = dict(ID=pkey[-1],
                                        Name=pdet.get('name'),
                                        Source=pkey[1])
        g_cnt += 1

    # Records are filled strictly in order, so the first g_cnt entries are
    # exactly the populated ones. Slicing drops the unused tail without
    # handing np.delete an index one past the end of the array.
    return gene[:g_cnt]