def start(self):
    """Split the input table into frequency-pass and frequency-fail files.

    The header row (the line starting with ``Scale``) is copied to both
    outputs and used to build the column index.  For every other row, each
    column named by ``self.get_freq_item()`` is marked ``'pass'`` when it
    holds a number below ``self.freq`` and ``'fail'`` otherwise (the ``'-'``
    placeholder counts as ``'fail'``).  ``self.judge_freq_tag`` receives the
    list of marks and decides which output gets the unchanged row.

    Outputs are ``self.suffix + '.filter_freq_pass'`` and
    ``self.suffix + '.filter_freq_fail'``.

    Raises:
        ValueError: if a frequency cell is neither ``'-'`` nor a number.
    """
    freq_item = self.get_freq_item()
    with open(self.infile, 'r') as fr, \
            open(self.suffix + '.filter_freq_pass', 'w') as fw_pass, \
            open(self.suffix + '.filter_freq_fail', 'w') as fw_fail:
        for line in fr:
            if line.startswith('Scale'):
                fw_pass.write(line)
                fw_fail.write(line)
                head_index = utils.get_head_index(line)
                continue
            # Drop the line terminator before splitting; otherwise the last
            # column keeps its '\n' and a trailing '-' placeholder is not
            # recognised (it would be handed to float() and raise).
            linelist = line.rstrip('\n').split('\t')
            # A fresh list per row, so nothing has to be cleared afterwards.
            freq_status_list = []
            for item in freq_item:  # e.g. item = gnomAD_SAS_AF
                value = linelist[head_index[item.lower()]]
                if value != '-' and float(value) < self.freq:
                    freq_status_list.append('pass')
                else:
                    freq_status_list.append('fail')
            if self.judge_freq_tag(freq_status_list):
                fw_pass.write(line)
            else:
                fw_fail.write(line)
def start(self):
    """Route each variant to the ``_pass`` or ``_fail`` file using local controls.

    Variants are keyed as ``chromosome_start_ref_alt``.  When the key is in
    the mapping returned by ``self.get_localcontrol_info()``, the case
    frequency is compared with element ``[0]`` of the stored entry: a
    strictly larger case frequency passes, a smaller-or-equal one fails.
    In both cases the written row is prefixed with ``NOControl<entry[1]>;``.
    Variants without a control entry pass unchanged.
    """
    localcontrol_info = self.get_localcontrol_info()
    key_columns = ('chromosome', 'start_position', 'reference_allele',
                   'allele')
    with open(self.infile, 'r') as fr, \
            open(self.suffix + '_pass', 'w') as fw_pass, \
            open(self.suffix + '_fail', 'w') as fw_fail:
        for line in fr:
            if line.startswith('Scale'):
                # Header row: copy it to both outputs and index the columns.
                fw_pass.write(line)
                fw_fail.write(line)
                head_index = utils.get_head_index(line)
                continue

            fields = line.strip('\n').split('\t')
            case_freq = float(fields[head_index['case_var_freq']])
            # The start position is used as the variant position in the key.
            key = '_'.join(fields[head_index[name]] for name in key_columns)

            if key not in localcontrol_info:
                fw_pass.write(line)
                continue

            control = localcontrol_info[key]
            if case_freq > control[0]:
                fw_pass.write('NOControl{0};{1}'.format(control[1], line))
            elif case_freq <= control[0]:
                fw_fail.write('NOControl{0};{1}'.format(control[1], line))
            else:
                # Neither comparison held; the row passes without a prefix.
                fw_pass.write(line)
def start(self):
    """Split variants into ``_pass`` / ``_fail`` files by functional class.

    A row passes when either

    * its ``bgi_function`` is in ``self.get_save_function_list()`` and
      ``self.save_span(chgvs, bgi_func, vep_func)`` is truthy, or
    * its ``<gene>:<chgvs>`` key is itself in that list (the special key
      built for the TERT promoter variants).

    The header row (starting with ``Scale``) is copied to both outputs.
    Rows are written unchanged.
    """
    func_list = self.get_save_function_list()
    with utils.safe_open(self.infile, 'r') as fr, \
            open(self.suffix + '_pass', 'w') as fw_pass, \
            open(self.suffix + '_fail', 'w') as fw_fail:
        for line in fr:
            if line.startswith('Scale'):
                fw_pass.write(line)
                fw_fail.write(line)
                head_index = utils.get_head_index(line)
                continue
            # Remove the line terminator first: strip('') removes nothing,
            # which left '\n' glued to the last column and made it unable to
            # match any entry of func_list.
            linelist = line.rstrip('\n').split('\t')
            bgi_func = linelist[head_index['bgi_function']]  # e.g. nonsense
            vep_func = linelist[head_index['consequence']]  # raw VEP annotation
            gene = linelist[head_index['hugo_symbol']]  # e.g. TERT
            chgvs = linelist[head_index['hgvsc']]  # e.g. c.-146C>T
            # Special key for the TERT promoter variants.
            tert_promter = '{}:{}'.format(gene, chgvs)
            keep_by_function = (
                bgi_func in func_list
                and self.save_span(chgvs, bgi_func, vep_func)
            )
            if keep_by_function or tert_promter in func_list:
                fw_pass.write(line)
            else:
                fw_fail.write(line)
def start(self):
    """Split variants into ``_pass`` / ``_fail`` files by supporting reads.

    The total, positive-strand and negative-strand variant read counts of
    each row are handed to ``self.pass_read_threshold``.  Rows it accepts
    are copied unchanged to ``self.suffix + '_pass'``; rejected rows go to
    ``self.suffix + '_fail'`` with an extra ``ReadNum`` column.  The fail
    file's header gains a matching ``filter_reason`` column.
    """
    read_columns = ('case_var_readsnum', 'case_var_positive_readsnum',
                    'case_var_negative_readsnum')
    with open(self.infile, 'r') as fr, \
            open(self.suffix + '_pass', 'w') as fw_pass, \
            open(self.suffix + '_fail', 'w') as fw_fail:
        for line in fr:
            record = line.strip('\n')
            if line.startswith('Scale'):
                fw_pass.write(line)
                fw_fail.write(record + '\tfilter_reason\n')
                head_index = utils.get_head_index(line)
                continue

            fields = record.split('\t')
            case_read, case_read_pos, case_read_neg = (
                fields[head_index[name]] for name in read_columns)
            if not self.pass_read_threshold(case_read, case_read_pos,
                                            case_read_neg):
                fw_fail.write(record + '\tReadNum\n')
                continue
            fw_pass.write(line)
def start(self):
    """Fill the UniProt columns of every row (``start`` is the common entry point).

    ``self.get_uniport_info()`` supplies two mappings: per-position records
    keyed by ``<gene>_<position>`` and a per-gene value used as the default
    for the ``bgi_uniport_position(s)`` column.  The position is the first
    run of digits in ``hgvsp_short`` (``'*'`` when there is none).  When a
    per-position record exists, its ``length``, ``feature_key`` and
    ``description`` values overwrite the three UniProt columns, with ``'*'``
    substituted for empty values.
    """
    uniport_info, uniport_gene_length = self.get_uniport_info()
    # NOTE(review): the attribute is spelled ``resullt`` here - confirm it
    # matches the name assigned elsewhere in the class.
    with open(self.infile, 'r') as fr, open(self.resullt, 'w') as fw:
        for line in fr:
            if line.startswith('Scale'):
                fw.write(line)
                head_index = utils.get_head_index(line)
                continue

            fields = line.strip('\n').split('\t')
            gene = fields[head_index['hugo_symbol']]
            phgvs = fields[head_index['hgvsp_short']]

            # Position of the amino-acid change: first number in the pHGVS.
            digits = re.search(r'(\d+)', phgvs)
            phgvs_pos = digits.group(1) if digits else '*'
            key = '{}_{}'.format(gene, phgvs_pos)

            # Gene-level default first; a position-level hit overrides it.
            fields[head_index['bgi_uniport_position(s)']] = \
                uniport_gene_length.get(gene, '*')
            record = uniport_info.get(key)
            if record:
                fields[head_index['bgi_uniport_position(s)']] = \
                    record['length'] or '*'
                fields[head_index['bgi_uniport_feature_key']] = \
                    record['feature_key'] or '*'
                fields[head_index['bgi_uniport_description']] = \
                    record['description'] or '*'
            fw.write('\t'.join(fields) + '\n')
def start(self):
    """Rewrite the ``funcregion`` and ``exon`` columns of every row.

    ``funcregion`` is looked up in ``self.get_func_relation_info()`` under
    the key ``<transcript_id>_<exon>`` and defaults to ``'Nan'``.  ``exon``
    is replaced by the result of ``self.final_exon`` called with the data
    from ``self.get_final_exon_info()``, the transcript and the original
    exon value.  The header row is copied through unchanged.
    """
    final_exon_info = self.get_final_exon_info()
    func_relation_info = self.get_func_relation_info()
    with open(self.infile, 'r') as fr, open(self.result, 'w') as fw:
        for line in fr:
            if line.startswith('Scale'):
                fw.write(line)
                head_index = utils.get_head_index(line)
                continue

            fields = line.strip('\n').split('\t')
            # Not used below; the lookup is retained so that a missing
            # ``hugo_symbol`` column still fails here.
            gene = fields[head_index['hugo_symbol']]
            tran = fields[head_index['transcript_id']]
            exon = fields[head_index['exon']]
            relation_key = tran + '_' + exon

            # Update the Funcregion column.
            fields[head_index['funcregion']] = func_relation_info.get(
                relation_key, 'Nan')
            # Update the notation used for the final exon.
            fields[head_index['exon']] = self.final_exon(
                final_exon_info, tran, exon)
            fw.write('\t'.join(fields) + '\n')
def start(self):
    """Set the ``tmb_type`` column of every row.

    Rules, applied in order:

    1. ``case_var_freq`` below ``self.tmb_freq``            -> ``noTMB``
    2. function ``span`` with an exon value ending ``-EX1`` -> ``noTMB``
    3. ``self.is_driver_gene(...)`` is truthy               -> ``Driver``
    4. anything else                                        -> ``TMB``

    The header row is copied through unchanged.
    """
    driver_info = self.get_driver_info()
    special_driver_info = self.get_special_driver_info()
    with open(self.infile, 'r') as fr, open(self.result, 'w') as fw:
        for line in fr:
            if line.startswith('Scale'):
                fw.write(line)
                head_index = utils.get_head_index(line)
                continue

            fields = line.strip('\n').split('\t')
            gene = fields[head_index['hugo_symbol']]
            chgvs = fields[head_index['hgvsc']]
            phgvs = fields[head_index['hgvsp_short']]
            func = fields[head_index['bgi_function']]
            exon = fields[head_index['exon']]
            case_freq = fields[head_index['case_var_freq']]

            # Keyword arguments forwarded to the driver-gene check.
            kw = dict(
                gene=gene,
                chgvs=chgvs,
                phgvs=phgvs,
                func=func,
                exon=exon,
                driver_info=driver_info,
                special_driver_info=special_driver_info,
            )

            # Decide the TMB category; the driver check only runs when
            # neither noTMB rule applies.
            if float(case_freq) < self.tmb_freq or (
                    func == 'span' and re.search(r'-EX1$', exon)):
                tmb_type = 'noTMB'
            elif self.is_driver_gene(**kw):
                tmb_type = 'Driver'
            else:
                tmb_type = 'TMB'

            fields[head_index['tmb_type']] = tmb_type
            fw.write('\t'.join(fields) + '\n')
def start(self):
    """Fill the ``target_gene`` column with ``YES(...)`` or ``NO``.

    Lookups are tried in this order and the first non-empty hit wins:

    1. ``<gene>_<bgi_function>`` in the function table
    2. ``<gene>_<hgvsp_short>`` in the pHGVS table
    3. ``<gene>_<hgvsc>`` in the cHGVS table
    4. the gene in the exon table, only when the exon value starts
       with ``EX``

    For hits 1-3 the matched mapping is rendered as ``key:value;`` pairs;
    for hit 4 the stored value is used as is.  Doubled semicolons in the
    final ``YES(...)`` text are collapsed to one.
    """
    func_info = self.get_gene_func_info()
    chgvs_info, phgvs_info = self.get_gene_pos_info()
    exon_info = self.get_gene_exon_info()
    with open(self.infile, 'r') as fr, open(self.result, 'w') as fw:
        for line in fr:
            if line.startswith('Scale'):
                fw.write(line)
                head_index = utils.get_head_index(line)
                continue

            fields = line.strip('\n').split('\t')
            gene = fields[head_index['hugo_symbol']]
            chgvs = fields[head_index['hgvsc']]
            phgvs = fields[head_index['hgvsp_short']]
            func = fields[head_index['bgi_function']]
            exon = fields[head_index['exon']]

            # (table, key) pairs in priority order.
            candidates = (
                (func_info, gene + '_' + func),
                (phgvs_info, gene + '_' + phgvs),
                (chgvs_info, gene + '_' + chgvs),
            )

            # Build the Target_gene annotation.
            detail = ''
            for table, lookup_key in candidates:
                matched = table.get(lookup_key)
                if matched:
                    detail = ''.join(
                        '{}:{};'.format(name, content)
                        for name, content in matched.items())
                    break
            else:
                # No table matched: fall back to the exon-level entry.
                if exon_info.get(gene) and exon.startswith('EX'):
                    detail = exon_info[gene]

            if detail:
                target = 'YES({})'.format(detail).replace(';;', ';')
            else:
                target = 'NO'
            fields[head_index['target_gene']] = target
            fw.write('\t'.join(str(cell) for cell in fields) + '\n')
def get_tran_relation(self):
    """Read the transcript table and return ``gene=transcript`` pairs.

    The file at ``self.transript_database`` is opened with
    ``utils.safe_open``.  The line starting with ``#`` is the header and
    supplies the column index; every other line contributes one entry built
    from its ``#gene`` and ``transcript`` columns.

    Returns:
        list[str]: entries of the form ``'<gene>=<transcript>'`` in file
        order.
    """
    # NOTE(review): the attribute is spelled ``transript_database`` - confirm
    # it matches the name assigned elsewhere in the class.
    tran_relation = []
    with utils.safe_open(self.transript_database, 'r') as fr:
        for line in fr:
            if line.startswith('#'):
                head_index = utils.get_head_index(line)
                continue
            # Drop the line terminator before splitting: strip('') removes
            # nothing, so a transcript in the last column used to carry a
            # trailing '\n' into the returned entry.
            linelist = line.rstrip('\n').split('\t')
            gene = linelist[head_index['#gene']]
            tran = linelist[head_index['transcript']]
            tran_relation.append('{}={}'.format(gene, tran))
    return tran_relation
def start(self):
    """Fill the ``maploc`` column of every row.

    For each variant the chromosome plus integer start and end positions
    are passed, together with the data from ``self.get_maploc_info()``, to
    ``self.get_variant_maploc``; its return value is stored in ``maploc``.
    The header row is copied through unchanged.

    Raises:
        ValueError: if a start or end position is not an integer.
    """
    maploc_info = self.get_maploc_info()
    with open(self.infile, 'r') as fr, open(self.result, 'w') as fw:
        for line in fr:
            if line.startswith('Scale'):
                fw.write(line)
                head_index = utils.get_head_index(line)
                continue

            fields = line.strip('\n').split('\t')
            chrom = fields[head_index['chromosome']]
            first = int(fields[head_index['start_position']])
            last = int(fields[head_index['end_position']])
            fields[head_index['maploc']] = self.get_variant_maploc(
                chrom, first, last, maploc_info)
            fw.write('\t'.join(fields) + '\n')
def start(self):
    """Move synonymous ``noTMB`` variants to the ``_fail`` file.

    A row fails when ``bgi_function`` is ``coding-synon`` and ``tmb_type``
    is ``noTMB``; it is then written with an extra ``synonymy`` column.  All
    other rows are copied unchanged to the ``_pass`` file.  The fail file's
    header gains a matching ``filter_reason`` column.
    """
    with open(self.infile, 'r') as fr, \
            open(self.suffix + '_pass', 'w') as fw_pass, \
            open(self.suffix + '_fail', 'w') as fw_fail:
        for line in fr:
            record = line.strip('\n')
            if line.startswith('Scale'):
                fw_pass.write(line)
                fw_fail.write(record + '\tfilter_reason\n')
                head_index = utils.get_head_index(line)
                continue

            fields = record.split('\t')
            tmb_tag = fields[head_index['tmb_type']]
            func = fields[head_index['bgi_function']]
            if (func, tmb_tag) == ('coding-synon', 'noTMB'):
                fw_fail.write('\t'.join(fields) + '\tsynonymy\n')
                continue
            fw_pass.write(line)
def get_tran_relation(self, **args):
    """Collect the transcript IDs listed in the transcript table.

    Args:
        **args: must contain ``transcript_data``, the path handed to
            ``utils.safe_open``.

    Returns:
        list[str]: the ``transcript`` column of every non-header line, in
        file order.  The header is the line starting with ``#``.
    """
    transcripts = []
    with utils.safe_open(args['transcript_data'], 'r') as fr:
        for raw in fr:
            record = raw.strip('\n')
            if record.startswith('#'):
                head_index = utils.get_head_index(record)
                continue
            columns = record.split('\t')
            transcripts.append(columns[head_index['transcript']])
    return transcripts
def start(self, **args):
    """Run the exon-end, gene-extension and splice checks on every row.

    Reads ``args['infile']`` and writes ``args['result']``.  For each
    variant, ``EndExonCheck``, ``GeneExtendCheck`` and ``SpliceAffectCheck``
    are run in that order, and their results are then combined by
    ``NewFunction``.  The four results are stored in the
    ``bgi_end_exon_check``, ``bgi_gene_extend_check``,
    ``bgi_splice_affect_check`` and ``bgi_newfunction`` columns.

    Args:
        **args: must contain ``infile`` and ``result``; the whole mapping is
            also forwarded to the three check classes.
    """
    with open(args['infile'], 'r') as fr, open(args['result'], 'w') as fw:
        for line in fr:
            if line.startswith('Scale'):
                fw.write(line)
                head_index = utils.get_head_index(line)
                continue

            fields = line.strip('\n').split('\t')

            def cell(column):
                """Return this row's value for the named column."""
                return fields[head_index[column]]

            func = cell('bgi_function')
            tran = cell('transcript_id')
            start = cell('start_position')
            end = cell('end_position')
            phgvs = cell('hgvsp_short')
            chgvs = cell('hgvsc')
            flank = cell('flank')
            strand = cell('strand')
            gene = cell('hugo_symbol')
            exon_id = cell('exon')

            end_exon_tag = EndExonCheck(args, tran, start, end, func).start()
            gene_extend_tag = GeneExtendCheck(
                args, tran, func, phgvs).start()
            splice_affect_tag = SpliceAffectCheck(
                args, func, chgvs, flank, strand).start()
            newfun_tag = NewFunction(
                func, gene, exon_id, end_exon_tag, gene_extend_tag,
                splice_affect_tag).start()

            results = (
                ('bgi_end_exon_check', end_exon_tag),
                ('bgi_gene_extend_check', gene_extend_tag),
                ('bgi_splice_affect_check', splice_affect_tag),
                ('bgi_newfunction', newfun_tag),
            )
            for column, outcome in results:
                fields[head_index[column]] = outcome
            fw.write('\t'.join(fields) + '\n')
def start(self):
    """Move variants found in the database to the ``_fail`` file.

    Variants are keyed as ``chromosome_start_ref_alt``.  A row whose key is
    in the mapping from ``self.get_database_info()`` is written to the fail
    file with the stored value appended as an extra column.  All other rows
    are copied unchanged to the pass file.  The fail file's header gains a
    matching ``fail_reason`` column.
    """
    database_info = self.get_database_info()
    key_columns = ('chromosome', 'start_position', 'reference_allele',
                   'allele')
    with open(self.infile, 'r') as fr, \
            open(self.suffix + '_pass', 'w') as fw_pass, \
            open(self.suffix + '_fail', 'w') as fw_fail:
        for line in fr:
            record = line.strip('\n')
            if line.startswith('Scale'):
                fw_pass.write(line)
                fw_fail.write(record + '\tfail_reason\n')
                head_index = utils.get_head_index(line)
                continue

            fields = record.split('\t')
            key = '_'.join(fields[head_index[name]] for name in key_columns)
            if key not in database_info:
                fw_pass.write(line)
                continue
            fw_fail.write('{}\t{}\n'.format(record, database_info[key]))
def start(self):
    """Fill the ``cosmic`` column of every row from the COSMIC lookup table.

    ``self.get_cosmic_info()`` maps keys to tab-separated COSMIC records.
    Each variant is looked up twice: by
    ``<gene>_<chr>_<start>_<ref>_<alt>`` and by ``<gene>_<hgvsc>``.

    * Both lookups return the same record: plain annotation.
    * Only the positional lookup hits: annotation flagged as a gene/cHGVS
      difference.
    * Only the cHGVS lookup hits: annotation flagged as a position/allele
      difference.
    * Otherwise (no hit, or two different records): ``'*'``.

    A record whose field 11 is ``'-'`` is reported as excluded from the
    website.  The header row is copied through unchanged.
    """

    def describe(by_position, by_hgvs):
        """Return the annotation text for the two lookup results."""
        # Message wording (including its spelling) is kept as is, since
        # downstream consumers may match on it.  The only change is that the
        # two long messages no longer embed source-code indentation: they
        # were written with a backslash continuation inside the literal.
        if by_position and by_position == by_hgvs:
            rec = by_position.split('\t')
            if rec[11] == '-':
                return ('the mutation {0} has been exclude from the '
                        'website.:{1}').format(rec[1], rec[10])
            return '{0}:{1};{2}'.format(rec[1], rec[10], rec[11])

        if by_position and not by_hgvs:
            rec = by_position.split('\t')
            if rec[11] == '-':
                return ('the mutation {0} has beed excluded from '
                        'the website and cosmic gene or chgvs diff '
                        '{1}_{2}:{3}').format(
                            rec[1], rec[0], rec[2], rec[10])
            return 'Cosmic gene or cHGVS diff {0}_{1}: {2}:{3};{4}'.format(
                rec[0],   # gene
                rec[2],   # hgvs
                rec[1],   # COSMIC id
                rec[10],
                rec[11])

        if by_hgvs and not by_position:
            rec = by_hgvs.split('\t')
            if rec[11] == '-':
                return ('The mutation {0} has beed exclude from the '
                        'website and cosmic pos or alt diff '
                        '{1}:{2} {3}/{4}: {5}').format(
                            rec[1],
                            rec[4],   # chr
                            rec[5], rec[6], rec[7], rec[10])
            return ('Cosmic Pos or alt diff '
                    '{0}:{1} {2}/{3}:{4}:{5};{6}').format(
                        rec[4],   # chr
                        rec[5], rec[6], rec[7], rec[1], rec[10], rec[11])

        return '*'

    cosmic_info = self.get_cosmic_info()
    with open(self.infile, 'r') as fr, open(self.result, 'w') as fw:
        for line in fr:
            if line.startswith('Scale'):
                fw.write(line)
                head_index = utils.get_head_index(line)
                continue
            linelist = line.strip('\n').split('\t')
            gene = linelist[head_index['hugo_symbol']]
            _chr = linelist[head_index['chromosome']]
            # The start position is used as the variant position.
            pos = linelist[head_index['start_position']]
            ref = linelist[head_index['reference_allele']]
            alt = linelist[head_index['allele']]
            chgvs = linelist[head_index['hgvsc']]
            key1 = '_'.join((gene, _chr, pos, ref, alt))
            key2 = '{}_{}'.format(gene, chgvs)
            # Update the cosmic column.
            linelist[head_index['cosmic']] = describe(
                cosmic_info.get(key1, ''), cosmic_info.get(key2, ''))
            fw.write('{0}\n'.format('\t'.join(linelist)))
def start(self, **args):
    """Main entry point: convert a VEP annotation file into the result table.

    The output starts with the column names from
    ``headers.HEAD().update_head()``.  Lines starting with ``##`` are
    skipped, and the ``#Uploaded_variation`` line supplies the column index.
    Only rows whose ``feature`` (transcript) is listed by
    ``self.get_tran_relation(**args)`` are kept.  For each kept row a
    ``headers.HEAD`` object is populated from the VEP columns and from
    values derived through ``utils`` and ``TransverFunction``.  When a VCF
    is given, the per-variant values from ``get_vcf_info.HandleVcf`` are
    merged in.  One output line is written per kept row.

    Args:
        **args: reads ``vcf``, ``vcftype``, ``vep_annotation``, ``result``
            and ``hg19``; the whole mapping is also forwarded to
            ``get_tran_relation`` and ``TransverFunction``.
    """
    # Columns copied verbatim onto the row attribute of the same name.
    # Driving both the read and the write from one list keeps each attribute
    # bound to its own column (``ea_af`` used to be read from ``eas_af``).
    frequency_columns = (
        'af', 'afr_af', 'amr_af', 'eas_af', 'eur_af', 'sas_af',
        'aa_af', 'ea_af',
        'gnomad_af', 'gnomad_afr_af', 'gnomad_amr_af', 'gnomad_asj_af',
        'gnomad_eas_af', 'gnomad_fin_af', 'gnomad_nfe_af', 'gnomad_oth_af',
        'gnomad_sas_af',
    )
    other_info = {}  # extra fields used to update the row
    if args['vcf']:
        vcf_info = get_vcf_info.HandleVcf(args['vcf'], args['vcftype']).start()
    # A set makes the per-line transcript check a constant-time lookup.
    tran_relation = set(self.get_tran_relation(**args))
    with open(args['vep_annotation'], 'r') as fr, open(args['result'], 'w') as fw:
        fw.write('{}\n'.format('\t'.join(headers.HEAD().update_head().keys())))
        for line in fr:
            if line.startswith('##'):
                continue
            elif line.startswith('#Uploaded_variation'):
                head_index = utils.get_head_index(line)
                continue
            # Drop the line terminator before splitting: strip('') removes
            # nothing and left '\n' attached to the last column.
            linelist = line.rstrip('\n').split('\t')
            gene = linelist[head_index['symbol']]  # e.g. TERT
            transcript = linelist[head_index['feature']]  # e.g. NM_198253.3
            # Keep only the annotation for the transcripts selected per gene.
            if transcript not in tran_relation:
                continue

            row = headers.HEAD()

            # --- values taken directly from the VEP columns ---
            # e.g. chr5_1295229_-/A
            upload_variation = linelist[head_index['#Uploaded_variation'.lower()]]
            location = linelist[head_index['location']]  # e.g. chr5:1295187-1295188
            vep_function = linelist[head_index['consequence']]  # e.g. missense_variant
            strand = linelist[head_index['strand']]  # e.g. -1
            strand = '+' if strand == '1' else '-'
            protein = linelist[head_index['ensp']]  # e.g. NP_937983.2
            sift = linelist[head_index['sift']]  # e.g. tolerated(0.05)
            polyphen = linelist[head_index['polyphen']]
            exon_info = linelist[head_index['exon']]  # e.g. 2/19 or -
            intro_info = linelist[head_index['intron']]
            chgvs = linelist[head_index['hgvsc']]  # e.g. NM_198253.3:c.77C>T
            phgvs = linelist[head_index['hgvsp']]  # e.g. NP_937983.2:p.Thr26Met
            tert = linelist[head_index['tert']]  # set only for the TERT promoter region
            clinvar = linelist[head_index['clinvar_clnsig']]
            rs = linelist[head_index['existing_variation']]
            bl_muttype = linelist[head_index['variant_class']]
            for column in frequency_columns:
                setattr(row, column, linelist[head_index[column]])

            # --- values that need processing ---
            hgvsc = utils.simplify_hgvsc(gene, chgvs, tert)
            hgvsp = utils.simplify_hgvsp(phgvs)  # e.g. p.Lys872_Thr874delinsAsnTer
            hgvsp_short = utils.get_oneletter_hgvsp(hgvsp)  # e.g. p.K872_T874delinsN*
            exon_id = utils.get_exon_id(exon_info, intro_info)
            _chr, start, end = utils.get_chr_start_end_from_location(location)
            ref, alt = utils.get_ref_alt_from_upload_variation(upload_variation)
            muttype = utils.get_muttype(ref, alt)
            genotype = utils.get_genotype(ref, alt, strand)
            flank = utils.get_flank_according_upload_variation(
                upload_variation, args['hg19'])
            vep_simple_function = TransverFunction(**args).simplify_function(
                vep_function, tert, gene)
            vep2bgicg_function = TransverFunction(**args).vep2bgi(
                vep_simple_function, hgvsc, hgvsp, ref, alt, exon_id)
            # Special case: a ``span`` variant covers a whole intron, so a
            # pHGVS annotation on it is wrong and is blanked out.
            if vep2bgicg_function == 'span' and hgvsp != '-':
                hgvsp = '-'
                hgvsp_short = '-'

            # --- update the row ---
            row.gene = gene
            row.chgvs = hgvsc
            row.phgvs = hgvsp
            # NOTE(review): attribute name ``phgvs_shoft`` is kept as is -
            # confirm it matches the field defined by headers.HEAD.
            row.phgvs_shoft = hgvsp_short
            row.exon_id = exon_id
            row.vep_function = vep_function
            row.vep_simple_function = vep_simple_function
            row.vep2bgicg_function = vep2bgicg_function
            row.sift = sift
            row.polyphen2 = polyphen
            row.chr = _chr
            row.start = start
            row.end = end
            row.ref = ref
            row.alt = alt
            row.muttype = muttype
            row.genotype = genotype
            row.transcript = transcript
            row.protein = protein
            row.strand = strand
            row.flank = flank
            row.rs = rs
            row.bl_muttype = bl_muttype
            row.clinvar = clinvar

            if args['vcf']:
                freq_tag = vcf_info[upload_variation]
                other_info.update(freq_tag)
            info = row.update_head(**other_info)
            fw.write('\t'.join(map(str, info.values())) + '\n')
def start(self):
    """Main entry point: convert the VEP file ``self.vep`` into a table.

    Lines starting with ``##`` are skipped, and the ``#Uploaded_variation``
    line supplies the column index.  For every other line a
    ``headers.HEAD`` object is populated from the VEP columns and from
    values derived through ``utils``.  When ``self.vcf`` is set, the
    per-variant values from ``get_vcf_info.HandleVcf`` are merged in.

    Output goes to a file literally named ``test`` in the working
    directory.  The column names are written before every data row.
    """
    if self.vcf:
        vcf_info = get_vcf_info.HandleVcf(self.vcf, self.vcftype).start()
    with open(self.vep, 'r') as fr, open('test', 'w') as fw:
        for line in fr:
            if line.startswith('##'):
                continue
            elif line.startswith('#Uploaded_variation'):
                head_index = utils.get_head_index(line)
                continue
            # Drop the line terminator before splitting: strip('') removes
            # nothing and left '\n' attached to the last column.
            linelist = line.rstrip('\n').split('\t')

            row = headers.HEAD()

            # --- values taken directly from the VEP columns ---
            # e.g. chr5_1295229_-/A
            upload_variation = linelist[head_index['#Uploaded_variation'.lower()]]
            location = linelist[head_index['location']]  # e.g. chr5:1295187-1295188
            transcript = linelist[head_index['feature']]  # e.g. NM_198253.3
            function = linelist[head_index['consequence']]  # e.g. missense_variant
            strand = linelist[head_index['strand']]  # e.g. -1
            gene = linelist[head_index['symbol']]  # e.g. TERT
            protein = linelist[head_index['ensp']]  # e.g. NP_937983.2
            sift = linelist[head_index['sift']]  # e.g. tolerated(0.05)
            polyphen = linelist[head_index['polyphen']]
            exon_id = linelist[head_index['exon']]  # e.g. 2/19 or -
            chgvs = linelist[head_index['hgvsc']]  # e.g. NM_198253.3:c.77C>T
            phgvs = linelist[head_index['hgvsp']]  # e.g. NP_937983.2:p.Thr26Met
            tert = linelist[head_index['tert']]  # set only for the TERT promoter region
            clinvar = linelist[head_index['clinvar_clnsig']]
            rs = linelist[head_index['existing_variation']]

            # --- values that need processing ---
            hgvsc = utils.simplify_hgvsc(chgvs)
            hgvsp2 = utils.simplify_hgvsp(phgvs)
            hgvsp = utils.get_oneletter_hgvsp(hgvsp2)
            exon_id = utils.get_exon_id(exon_id)
            _chr, start, end = utils.get_chr_start_end_from_location(location)
            ref, alt = utils.get_ref_alt_from_upload_variation(upload_variation)
            muttype = utils.get_muttype(ref, alt)
            genotype = utils.get_genotype(ref, alt, strand)
            flank = utils.get_flank_according_upload_variation(
                upload_variation, self.hg19)
            bl_muttype = utils.get_bl_muttype()

            # --- update the row ---
            row.gene = gene
            row.chgvs = hgvsc
            row.phgvs = hgvsp
            row.phgvs2 = hgvsp2
            row.exon_id = exon_id
            row.vep_function = function
            row.sift = sift
            row.polyphen2 = polyphen
            row.chr = _chr
            row.start = start
            row.end = end
            row.ref = ref
            row.alt = alt
            row.muttype = muttype
            row.genotype = genotype
            row.transcript = transcript
            row.protein = protein
            row.strand = strand
            row.flank = flank
            row.rs = rs
            row.bl_muttype = bl_muttype
            row.clinvar = clinvar

            # Without a VCF there is nothing to merge; an empty mapping keeps
            # ``freq_tag`` defined so the row is still written.
            freq_tag = vcf_info[upload_variation] if self.vcf else {}
            info = row.update_head(**freq_tag)
            # NOTE(review): the header is written before every data row -
            # confirm whether it should be written once only.
            fw.write('\t'.join(info.keys()) + '\n')
            fw.write('\t'.join(map(str, info.values())) + '\n')