def train_conservation_coeff(tr_varant_filt_vcf, include_hgmd):
    """Collect CADD and GERP training sets from an annotated, filtered VCF.

    A record is used when one of its ``CLINSIG_CLASS`` entries contains
    'HGMD' (checked only when ``include_hgmd`` is true) or contains
    'CLINVARDB' or '1kMAF'. Every other record is skipped.

    Args:
        tr_varant_filt_vcf: path handed to ``vcf.VCFParser``.
        include_hgmd: when true, HGMD-tagged records are accepted as well.

    Returns:
        The tuple ``(cadd_trset, mnp_cadd_trset, gerp_trset)`` as accumulated
        by ``vcf.get_CADD_scores_tr`` and ``vcf.get_GERP_scores_tr``.
    """
    cadd_trset, mnp_cadd_trset, gerp_trset = {}, {}, {}

    parser = vcf.VCFParser(tr_varant_filt_vcf)
    for record in parser:
        parser.parseinfo(record)

        # Decide whether this record belongs to the training set.
        selected = False
        if include_hgmd:
            selected = any(
                'HGMD' in tag for tag in record.info.CLINSIG_CLASS)
        if not selected:
            selected = any(
                'CLINVARDB' in tag or '1kMAF' in tag
                for tag in record.info.CLINSIG_CLASS)
        if not selected:
            continue

        # The mutation type is the last field of the first normalized
        # variant; an id containing ':' overrides it with 'mnp'.
        mut_type = normalize_variant(
            record.chrom, record.pos, record.ref, record.alt)[0][-1]
        if ':' in record.id[0]:
            mut_type = 'mnp'

        aaconv = '.'
        if record.info.CADD_raw:
            # CADD_raw (average); this also yields the aaconv value that
            # the GERP call below receives.
            cadd_trset, mnp_cadd_trset, aaconv = vcf.get_CADD_scores_tr(
                mut_type, record.info.CADD_aa, record.info.CADD_raw,
                cadd_trset, mnp_cadd_trset)

        if record.info.GerpConserve:
            gerp_trset = vcf.get_GERP_scores_tr(
                mut_type, aaconv, record.info.GerpRSScore, gerp_trset)

    parser.stream.close()
    return cadd_trset, mnp_cadd_trset, gerp_trset
def predict_gender_from_VCF(single_vcf, sample_id):
    """Guess the gender of one sample from its chrX / chrY records.

    Counts every chrX record, every chrY record whose genotype for the
    sample is not './.', and among those the ones for which
    ``Filter.in_gene`` is false for the pseudoautosomal gene list.

    Args:
        single_vcf: path handed to ``vcf.VCFParser``.
        sample_id: sample whose genotypes are parsed.

    Returns:
        0 (unknown), 1 (male) or 2 (female).
    """
    from gcn.data import pseudoautosomal_genes

    UNKNOWN, MALE, FEMALE = range(3)

    gene_filter = Filter()
    gene_filter.geneincl = pseudoautosomal_genes.PSEUDO_AUTO_GENES
    parser = vcf.VCFParser(single_vcf, sampleids=[sample_id])

    print("predicting gender from the sample [%s]" % single_vcf)

    x_records = 0        # all chrX records
    y_called = 0         # chrY records with a called genotype
    y_called_nonpar = 0  # ... of which in_gene() is false for the gene list
    for record in parser:
        chrom = record['chrom']
        if chrom in ('chrY', 'Y'):
            parser.parsegenotypes(record)
            parser.parseinfo(record)
            if record[parser.samples[0]].GT != './.':
                y_called += 1
                if not gene_filter.in_gene(record, gene_filter.geneincl):
                    y_called_nonpar += 1
        elif chrom in ('chrX', 'X'):
            x_records += 1

    if x_records > 0:
        y_to_x_rate = 1. * (y_called + y_called_nonpar) / x_records
        gender = MALE if y_to_x_rate > 0.01 else FEMALE
    elif y_called_nonpar > 0:
        gender = MALE
    else:
        gender = UNKNOWN

    parser.stream.close()

    print("gender identified [%d], Done." % gender)
    return gender
def ranking_vcf(self):
    '''
    this function is obsolete and replaced by vcf2xls_varant()

    Rewrites self.vcf in place, adding to every record a DVN INFO field
    holding the largest self.gene_dmg score among the genes returned by
    vp.parse() for that record (0. when none of them has a score).
    '''
    import gcn.lib.io.vcf as vcf
    job_name = 'ranking_vcf'
    msg = 'annotating Divine prediction score into filtered VCF ... [%s;%s]' % (
        job_name, self.vcf)
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    ranked_vcf = '%s.ranked' % self.vcf
    ostream = open(ranked_vcf, 'w')
    try:
        v = vcf.VCFParser(self.vcf)
        try:
            v.add_meta_info("DVN", "1", "Float",
                "Gene damage score predicted by Divine:%s" % self.command)
            v.writeheader(ostream)

            for rec in v:
                v.parseinfo(rec)
                vpop = vp.parse(rec.info)

                # One name for the running maximum, reset per record. The
                # previous code initialised one variable and updated another,
                # so the maximum was never tracked and a record without any
                # scored gene reused a stale value or raised NameError.
                max_dmg_score = 0.
                for val in vpop.values():
                    for gene in val:
                        if gene in self.gene_dmg:
                            if self.gene_dmg[gene] > max_dmg_score:
                                max_dmg_score = self.gene_dmg[gene]
                rec.info.DVN = max_dmg_score
                v.write(ostream, rec)
        finally:
            v.stream.close()
    finally:
        ostream.close()

    # replace the input only after the ranked copy was written completely
    os.rename(ranked_vcf, self.vcf)

    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
import gcn.lib.utils.fileutils as fileutils
#import gcn.lib.io.snpeff as snpeff
import os
import gcn.lib.io.vcf as vcf

# Command-line driver: copy the records of the input VCF that the Filter
# retains into the output file.
parser = argparse.ArgumentParser(description='Filter variants in the vcf')
parser.add_argument('-i', dest='infile', help='input vcf file')
parser.add_argument('-f', dest='filterconf', default="", help='filterfile')
parser.add_argument('-o', dest='outfile', default=None, help='output file')
parser.add_argument('-l', dest="filterlist", default="",
                    help='Comma separated input list to include')
options = parser.parse_args()

fl = []
if options.filterlist:
    # Each line of the list file becomes one comma-split entry. The file is
    # closed as soon as it has been read.
    with open(options.filterlist, 'rU') as filterlist_stream:
        for el in filterlist_stream:
            fl.append(el.strip().split(','))
    f = Filter(fl)
else:
    f = Filter(options.filterconf)

v = vcf.VCFParser(options.infile)
try:
    with open(options.outfile, 'w') as ostream:
        v.writeheader(ostream)
        for rec in v:
            v.parseinfo(rec)
            v.parsegenotypes(rec)
            if f.retain(rec):
                v.write(ostream, rec)
finally:
    # release the input stream even when parsing or writing fails
    v.stream.close()
def _store_variants(self, beta_fits):
    '''
    Gather, per record and per gene, the values used later to score
    genetic damage.

    Returns a list of
    [gene, INDEL, CLASS_TAG, protein_len, px, maf_offset]
    with at most one entry per gene for each VCF record.
    '''
    job_name = '_store_variants'
    msg = 'collecting variant information and class label to determine genetic damage [%s;%s]...' % (
        job_name, self.vcf)
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)

    collected = []
    parser = vcf.VCFParser(self.vcf)
    for record in parser:
        parser.parseinfo(record)

        mut_type = normalize_variant(
            record.chrom, record.pos, record.ref, record.alt)[0][-1]
        if ':' in record.id[0]:
            mut_type = 'mnp'

        # conservation scores: CADD first, then GERP++ (which receives the
        # cadd_aa value produced by the CADD call, or './.')
        cadd_aa = './.'
        cadd_score = None
        if record.info.CADD_raw:
            cadd_score, cadd_aa = vcf.get_CADD_scores(
                mut_type, record.info.CADD_aa, record.info.CADD_raw,
                beta_fits)

        gerp_score = None
        if record.info.GerpConserve:
            gerp_score = vcf.get_GERP_scores(
                mut_type, cadd_aa, record.info.GerpRSScore, beta_fits)

        # prefer CADD when enabled and available, else GERP, else 0.5
        if self.cadd > 0 and cadd_score is not None:
            px = cadd_score
        elif gerp_score is not None:
            px = gerp_score
        else:
            px = 0.5

        annotations = vp.parse(record.info)
        seen_genes = []

        # MAF taken from the first available source: ExAC, ESP, then 1K
        if record.info.EXACDB:
            maf = float(record.info.EXACAF[0])
        elif record.info.ESPDB:
            maf = float(record.info.ESPAF[0])
        elif record.info.KGDB:
            maf = float(record.info.KGAF[0])
        else:
            maf = 0.

        # MAF significance offset, floored at zero
        maf_offset = 0.
        if maf > 0:
            maf_offset = max(
                (1. - self.dm.beta1 * math.exp(1000. * maf)) / self.dm.beta2,
                0.)

        for per_gene in annotations.values():
            for gene, gene_annot in per_gene.items():
                # first transcript with a protein length wins; otherwise
                # fall back to the model's average protein length
                protein_len = self.dm.avg_protein_len
                if gene_annot:
                    for transcript in gene_annot['TRANSCRIPTS']:
                        if transcript.protein_len:
                            protein_len = float(transcript.protein_len)
                            break

                if gene in seen_genes:
                    continue
                collected.append([
                    gene, record.info.INDEL, record.info.CLASS_TAG,
                    protein_len, px, maf_offset
                ])
                seen_genes.append(gene)

    parser.stream.close()

    msg = 'done. [%s]' % job_name
    lib_utils.msgout('notice', msg)
    self.logger.info(msg)
    return collected
#create filter class f = Filter(options.filterconf) #read gene list file to tell which genes are to be included/excluded f.geneincl = f.store_genelist('incl', options.genelist) f.geneexcl = f.store_genelist('excl', options.genelist) min_depth = 0 if 'min_depth' in f.dconf: min_depth = f.dconf['min_depth'] f.get_clinvar_pathogenes() if options.sample_id: v = vcf.VCFParser(options.infile, sampleids=[options.sample_id]) else: v = vcf.VCFParser(options.infile) if options.skip_parse_genotype: gt_sample = False else: if v._sampleids: sidx = v._sampleids[0] gt_sample = True else: gt_sample = False ostream = open(options.outfile, 'w') v.add_meta_info( "CLASS_TAG", "1", "String",
def reformat_to_lite(self, infile, vtype, outfile, min_cnt=1):
    """Collapse consecutive records sharing chrom/pos/ref/alt into one.

    For each run of records with the same key, the first record of the run
    is written with id '.', the collected ids in INFO/COSMIC_ID, ``vtype``
    in INFO/REG, and the GENE/STRAND/CDS/AA/SNP INFO keys removed. The
    output is appended to ``outfile`` when it already exists; otherwise it
    is created and a header is written first.

    ``min_cnt`` is accepted but not used by this method.
    """
    jobname = "reformat_to_lite"
    print("working on vcf file [%s] ..." % infile)

    drop_keys = ['GENE', 'STRAND', 'CDS', 'AA', 'SNP']

    parser = vcf.VCFParser(infile)
    parser.add_meta_info('COSMIC_ID', '.', 'String', 'cosmic ID')
    parser.add_meta_info('REG', '2', 'Integer', '1:coding, 0:noncoding')

    if os.path.exists(outfile):
        ostream = open(outfile, 'a')
    else:
        ostream = open(outfile, 'w')
        parser.writeheader(ostream, to_del_info=drop_keys)

    def flush(record, ids):
        # write the pending merged record carrying all ids of its run
        record.id = '.'
        record.info['COSMIC_ID'] = ids
        record.info['REG'] = vtype
        for info_key in drop_keys:
            parser.delete_info(record, info_key)
        if vtype == NONCODING:
            record.info['CNT'] = '1'
        parser.write(ostream, record)

    last_key = 'NA'
    cosmic_ids = []
    pending = None
    for record in parser:
        parser.parseinfo(record)
        key = lib_utils.joined(
            [record.chrom, record.pos, record.ref, record.alt], '_')
        if key == last_key:
            # same variant as the pending record: only remember its id
            cosmic_ids.append(record.id[0])
            continue
        last_key = key
        if pending:
            flush(pending, cosmic_ids)
        cosmic_ids = [record.id[0]]
        pending = record

    # the final run is still pending once the input is exhausted
    if pending:
        flush(pending, cosmic_ids)

    ostream.close()
    parser.stream.close()

    print("Done [%s]." % jobname)
def get_gene_data(vcffile, pedigree, GQ_THRES):
    """Retrieves gene_transcript wise variants where there exists at least
    one frameshift/stopgain mutation.

    Args:
        - vcffile(str): Input VCF file.
                        Note - VCF should be VARANT annotated.
        - pedigree(list): [Father SampleID, Mother SampleID,
                          Child SampleID]. Expects the order in which
                          the SampleIDs are mentioned above.
        - GQ_THRES(int): Threshold Genotype Quality

    Returns:
        - gene_data_phased(dictionary): Genotype Phased gene_transcript
                                        wise variants where there is
                                        at least one Frameshift/
                                        Stopgain mutation.
        - gene_data_unphased(dictionary): Genotype Unphased
                                          gene_transcript wise variants
                                          where there is at least one
                                          Frameshift/Stopgain mutation
                                          in homozygous state.
    """
    # data1: (gene, trans_id) -> [variants keyed for allele slot 0,
    #                             variants keyed for allele slot 1]
    # data2: (gene, trans_id) -> [variants]  (genotype separator not '|')
    data1 = {}
    data2 = {}
    # only records whose filter field contains one of these values are used
    FILTER = ['PASS', 'VQSRTrancheSNP99.00to99.90']
    v = vcf.VCFParser(vcffile)
    for rec in v:
        v.parseinfo(rec)
        v.parsegenotypes(rec)
        varfltr = rec['filter']
        if len([True for flt in FILTER if flt in varfltr]) > 0:
            genotypes = check_genotype(rec, pedigree, GQ_THRES)
            if genotypes:
                # pg is indexed as a genotype string: allele, separator,
                # allele (pg[0], pg[1], pg[-1])
                pg = phase(*genotypes)
                if pg[1] == '|':
                    # phased: keep the two alleles apart by their position
                    # (idx) in the genotype
                    c1, c2 = int(pg[0]), int(pg[-1])
                    va = vp.parse(rec.info)
                    for idx, altid in enumerate([c1, c2]):
                        if altid != 0:
                            if altid in va:
                                # only the first gene annotated for this
                                # allele is considered
                                gene = va[altid].keys()[0]
                                if len(va[altid][gene]) > 0:
                                    for ta in va[altid][gene]['TRANSCRIPTS']:
                                        if ta.region == 'CodingExonic':
                                            trans_id = ta.trans_id
                                            key = (rec.chrom, rec.pos, \
                                                   ','.join(rec.id), rec.ref, \
                                                   rec.alt[altid - 1], altid)
                                            gi = (gene, trans_id)
                                            # both branches store the same
                                            # value; the first one only
                                            # creates the entry beforehand
                                            if gi not in data1:
                                                data1[gi] = [{}, {}]
                                                data1[gi][idx][key] = \
                                                    [ta.mutation, pg, genotypes[0], genotypes[1]]
                                            else:
                                                data1[gi][idx][key] = \
                                                    [ta.mutation, pg, genotypes[0], genotypes[1]]
                else:
                    # not phased: both alleles go into a single dictionary
                    c1, c2 = int(pg[0]), int(pg[-1])
                    va = vp.parse(rec.info)
                    for altid in [c1, c2]:
                        if altid != 0:
                            if altid in va:
                                gene = va[altid].keys()[0]
                                if len(va[altid][gene]) > 0:
                                    for ta in va[altid][gene]['TRANSCRIPTS']:
                                        if ta.region == 'CodingExonic':
                                            trans_id = ta.trans_id
                                            key = (rec.chrom, rec.pos, \
                                                   ','.join(rec.id), rec.ref, \
                                                   rec.alt[altid - 1], altid)
                                            gi = (gene, trans_id)
                                            if gi not in data2:
                                                data2[gi] = [{}]
                                                data2[gi][0][key] = \
                                                    [ta.mutation, pg, genotypes[0], genotypes[1]]
                                            else:
                                                data2[gi][0][key] = \
                                                    [ta.mutation, pg, genotypes[0], genotypes[1]]

    # Keep, per (gene, transcript), every allele dictionary that holds more
    # than one variant and at least one FrameShift*/StopGain mutation.
    # NOTE(review): the loop variable `v` below rebinds the parser name, and
    # the parser stream is never closed in this function.
    gene_data_phased = {}
    for k, v in data1.items():
        for e in v:
            if len(e) > 0:
                if len(e.values()) > 1:
                    if len([True for mut in [x[0] for x in e.values()] \
                            if mut.startswith('FrameShift') \
                            or mut == 'StopGain']) > 0:
                        if k not in gene_data_phased:
                            gene_data_phased[k] = [e]
                        else:
                            gene_data_phased[k].append(e)
    del data1

    # Same selection for the unphased data, with the extra requirement that
    # the FrameShift*/StopGain variant has identical alleles in pg
    # (pg[0] == pg[2]).
    gene_data_unphased = {}
    for k, v in data2.items():
        for e in v:
            if len(e) > 0:
                if len(e.values()) > 1:
                    if len([True for y in [(x[0], x[1]) for x in e.values()] \
                            if (y[0].startswith('FrameShift') or \
                                y[0] == 'StopGain') and \
                            int(y[1][0]) == int(y[1][2])]) > 0:
                        if k not in gene_data_unphased:
                            gene_data_unphased[k] = [e]
                        else:
                            gene_data_unphased[k].append(e)
    del data2
    return gene_data_phased, gene_data_unphased
def _write_column_definitions(sheet, col_map, header_names, colnames):
    """Write key/description rows into the column-definition sheet.

    A key is written when its lower-cased form occurs inside one of
    ``header_names`` or inside the name part of one of ``colnames``.
    Rows start at index 1.
    """
    rownum = 1
    for key, value in sorted(col_map.items()):
        lkey = key.lower()
        if any(lkey in name.lower() for name in header_names) or \
                any(lkey in col[0].lower() for col in colnames):
            sheet.row(rownum).set_cell_text(0, key)
            sheet.row(rownum).set_cell_text(1, value)
            rownum += 1


def run(infile, outfile, hdrfile, genefile, vknown, samples=None, fl=None):
    """Convert an annotated VCF into one or more xls workbooks.

    Args:
        infile: input VCF path.
        outfile: first workbook path; follow-up workbooks are named
            ``<outfile minus its last 4 characters>_<n>.xls``.
        hdrfile: optional tab-separated file (key, description) used to
            seed the column definitions.
        genefile: optional tab-separated file whose rows are passed on to
            ``write_varant_links``.
        vknown: passed through to ``write_varant_links``.
        samples: optional sample ids; defaults to the samples reported by
            the parser.
        fl: filter list handed to ``Filter``; defaults to four empty lists.
    """
    if fl is None:
        # fresh default per call instead of a shared mutable default
        fl = [[], [], [], []]

    logger.info('Running vcf to xls conversion script')

    genescores = []
    if genefile:
        with open(genefile, 'r') as g:
            genescores = [i.strip('\n').split('\t') for i in g.readlines()]

    if samples:
        parser = vcf.VCFParser(infile, samples)
    else:
        parser = vcf.VCFParser(infile)
        samples = parser.samples

    # Column definitions: header file entries first, then the INFO
    # descriptions from the VCF header (which win on a key clash).
    col_map = defaultdict(str)
    ikeys = parser.meta['INFO'].keys()
    if hdrfile:
        with open(hdrfile, 'r') as hf:
            col_map = {
                i.strip('\n').split('\t')[0]: i.strip('\n').split('\t')[1]
                for i in hf
            }
    for key in ikeys:
        col_map[key] = parser.meta['INFO'][key][-1]

    colnames, infokeys, formkeys, samplehdrs = columnnames(parser, samples)

    opthdrs = []
    if 'VARANT_IMPACTCODE' in infokeys:
        opthdrs.append('VARANT_IMPACTCODE')
    if genefile:
        opthdrs.append('Comments_on_Genes')

    header_names = IN_TRANS_HDRS + opthdrs + ['Gene_Name'] + IN_VAR_HDRS

    book, sheets = create_book(colnames, samplehdrs, opthdrs)
    logger.info('Writing the column definitions in Column Def sheet')
    _write_column_definitions(sheets['coldef'], col_map, header_names,
                              colnames)

    f = Filter(fl)
    rowidx = 1
    excelrows = 0
    basename = outfile[:-4]
    booknum = 2
    logger.info('Writing data to VCF tab')
    logger.info('The fields from the varant annotation are '
                + ','.join(IN_TRANS_HDRS + IN_VAR_HDRS))

    for rec in parser:
        parser.parseinfo(rec)

        if excelrows > 60000:
            # start a new workbook before the sheet grows too large
            logger.info('Write out book and create a new one as the number '
                        'of rows reached 60000')
            write(book, outfile)
            outfile = basename + '_' + str(booknum) + '.xls'
            book, sheets = create_book(colnames, samplehdrs, opthdrs)
            _write_column_definitions(sheets['coldef'], col_map,
                                      header_names, colnames)
            excelrows = 0
            booknum += 1
            rowidx = 1

        if not f.retain(rec):
            continue

        excelrows += 1
        if samples:
            parser.parsegenotypes(rec)

        row = sheets['vcf'].row(rowidx)
        idx = 0
        for e, t in colnames[:7]:
            # compare by value; identity tests on strings only work by
            # accident of interning
            if e == 'FILTER' and rec[e.lower()][0] == '.':
                write_cell(row, idx, e.lower(), None, t, ';')
            else:
                write_cell(row, idx, e.lower(), rec[e.lower()], t, ';')
            idx += 1

        info = rec.info
        for e, t in infokeys:
            if e not in IN_EFF:
                write_cell(row, idx, e, info.get(e, None), t)
                idx += 1

        # NOTE(review): the result of parse() is not used here; the call is
        # kept in case callers rely on it raising for malformed annotation.
        parse(info)
        ga = get_prior_geneannot(info, alltrans=True)
        idx = write_varant_links(row, rec, ga, samples, formkeys, colnames,
                                 idx, vknown, genescores)

        sheets['vcf'].panes_frozen = True
        sheets['vcf'].remove_splits = True
        sheets['vcf'].vert_split_pos = 2
        sheets['vcf'].horz_split_pos = 1

        for s in samples:
            ss = getattr(rec, s)
            for e, t in formkeys:
                write_cell(row, idx, e, ss.get(e, None), t)
                idx += 1

        rowidx += 1
        if not excelrows % 500:
            sheets['vcf'].flush_row_data()

    write(book, outfile)
def _extract_mutation_info(self,beta_fits):
    '''
    objective: to extract, for every record and every gene annotated on it,
        [gene, variant_type, CLASS_TAG, protein_length, in-silico score,
         MAF offset, pathogenic-domain p, pathogenic-domain density p]
    from the annotated/filtered VCF file.

    When a proband id is set, the VCF is also rewritten in place: the
    Exonic/Annotation/Compounds INFO keys are removed and the genmod
    inheritance model is appended to CLASS_TAG.

    beta_fits is passed through to vcf.get_CADD_scores / get_GERP_scores.
    '''
    job_name = '_extract_mutation_info'
    msg='collecting variant information and class label to determine genetic damage [%s;%s]...'%(job_name,self.vcf)
    lib_utils.msgout('notice',msg);self.logger.info(msg)

    mutation_info = []
    rewrite_vcf = bool(self.proband_id)
    rmInfo = ['Exonic','Annotation','Compounds']

    # first pass over the VCF: pathogenic-domain scores
    if rewrite_vcf:
        v = vcf.VCFParser(self.vcf,sampleids=[self.proband_id])
    else:
        v = vcf.VCFParser(self.vcf)
    try:
        pdom,pdom0 = self.gather_pdomain_scores(v)
    finally:
        v.stream.close()

    msg = 'Importing max transcript length for each gene ...'
    lib_utils.msgout('notice', msg); self.logger.info(msg)
    refgene = Refgene()
    cds_lens = refgene.get_max_cds_length()
    # CDS length in bases -> length in codons
    tx_lens = {}
    for gene, cds_len in cds_lens.items():
        tx_lens[gene] = int(cds_len/3.)

    # second pass: collect per-gene information (and rewrite if requested)
    vcf_tmp = self.vcf+'.tmp'
    ostream = None
    v = vcf.VCFParser(self.vcf)
    try:
        if rewrite_vcf:
            ostream = open(vcf_tmp, 'w')
            v.writeheader(ostream,to_del_info = rmInfo)

        ridx = 0
        for rec in v:
            v.parseinfo(rec)

            # to remove redundant gene symbols annotated by genmod but add
            # the inheritance model to the class tag
            if rewrite_vcf:
                for rkey in rmInfo:
                    v.delete_info(rec, rkey)
                if rec.info.GeneticModels:
                    genmod_tag = lib_ped.parse_genmod_inherit_model(\
                        rec.info.GeneticModels[0].split(':')[1])
                    rec.info.CLASS_TAG += genmod_tag
                v.write(ostream, rec)

            varlist = normalize_variant(rec.chrom, rec.pos, rec.ref, rec.alt)
            mut_type = varlist[0][-1]
            if ':' in rec.id[0]:
                mut_type = 'mnp'

            # collect conservation prediction score (CADD and GERP++)
            cadd_aa = './.'
            px_cadd = None
            if rec.info.CADD_raw:
                # to get CADD_raw (average)
                px_cadd, cadd_aa = vcf.get_CADD_scores(mut_type, rec.info.CADD_aa, rec.info.CADD_raw, beta_fits)

            # to get GERP++ score
            px_gerp = None
            if rec.info.GerpConserve:
                px_gerp = vcf.get_GERP_scores(mut_type, cadd_aa, rec.info.GerpRSScore, beta_fits)

            # which score can be chosen
            px = 0.5
            if self.cadd>0 and px_cadd is not None:
                px = px_cadd
            elif px_gerp is not None:
                px = px_gerp

            vpop = vp.parse(rec.info)
            genes = []

            # to get MAF in the order of ExAC, ESP, and 1K
            if rec.info.EXACDB:
                maf = get_min_maf(rec.info.EXACAF[0])
            elif rec.info.ESPDB:
                maf = get_min_maf(rec.info.ESPAF[0])
            elif rec.info.KGDB:
                maf = get_min_maf(rec.info.KGAF[0])
            else:
                maf = 0.

            # to compute a significance of MAF
            maf_offset = self.dm.get_maf_xoffset(maf)

            # pathogenic-domain values are looked up by record index
            pdom_idx = pdom.index[pdom.ridx == ridx].tolist()
            if pdom_idx:
                patho_p = pdom.phat_lo[pdom_idx[0]]
                patho_pden = pdom.patho_dens_p[pdom_idx[0]]
            else:
                # assign a default pathogenic domain value (15% quantile value)
                patho_p = pdom0.phat_lo
                patho_pden = pdom0.patho_dens_p

            vartype = get_var_type(rec.ref,rec.alt)

            for altnum, val in vpop.items():
                # for each gene involved with the variant
                for gene, gd in val.items():
                    protein_len = self.dm.avg_protein_len
                    if gene in tx_lens:
                        protein_len = tx_lens[gene]

                    # store a set of essential annotation to be used for
                    # genetic damage, once per gene and record
                    if gene not in genes:
                        mutation_info.append([gene, vartype, rec.info.CLASS_TAG,
                            protein_len, px, maf_offset, patho_p, patho_pden])
                        genes.append(gene)
            ridx += 1
    finally:
        # done reading filtered VCF file; the parser stream is released in
        # both modes (it used to be closed only when rewriting)
        v.stream.close()
        if ostream is not None:
            ostream.close()

    if rewrite_vcf:
        os.rename(vcf_tmp,self.vcf)

    msg = 'done. [%s]'%job_name
    lib_utils.msgout('notice',msg); self.logger.info(msg)

    return mutation_info
def __init__(self, uargs):
    '''
    Copy the user arguments into member variables, locate the Divine
    configuration file through the DIVINE environment variable, assign the
    output file names and work out which sample is the proband.

    Raises:
        EnvironmentError: DIVINE is not set.
        IOError: the configuration file is not valid.
        RuntimeError: a ped file is given without a proband id, or the VCF
            holds more than one sample and no ped file is given.
    '''
    #transferring user input arguments to class member variables
    self.to_delete_fns = []
    self.exp_tag = uargs.exp_tag
    self.vknown = uargs.vknown
    self.cadd = uargs.cadd
    self.top_k_disease = uargs.top_k_disease
    self.excl_non_coding = False
    self.sparser = SafeConfigParser()

    self.omim = None
    self.pheno_dmg = {}
    self.gt_dmg = {}
    self.gene_dmg = {}
    self.vknown_genes = {}

    lib_utils.msgout('notice','initializing Divine ...','Divine')

    divine_root_dir = os.environ.get("DIVINE")
    if not divine_root_dir:
        raise EnvironmentError("set DIVINE variable properly!")

    config_fn = os.path.join(divine_root_dir,'gcn','config','divine.conf')
    if not lib_utils.check_if_file_valid(config_fn):
        raise IOError("check if the configuration file[%s] is valid!" % config_fn)

    self.config_fn = config_fn
    self.entries = {'divine_root':divine_root_dir}
    self._set_args(uargs)

    self.hpo_query = uargs.hpo_query
    if self.hpo_query is None:
        self.hpo2disease_fn = None
        self.pheno_dmg_fn = None
        self.disease_rank_fn = None
    else:
        self.hpo2disease_fn = self._assign_out_fn('hpo_to_diseases','tsv')
        self.pheno_dmg_fn = self._assign_out_fn('pheno_gene_rank','tsv')
        self.disease_rank_fn = self._assign_out_fn('diseases_rank','tsv')

    self.gene_rank_fn = self._assign_out_fn('gene_rank', 'tsv')
    self.vcf = uargs.vcf
    self.ped = None
    self.proband_id = None
    self.genotype = True

    if self.vcf:
        self.is_family_vcf = False
        if uargs.ped:
            self.is_family_vcf = True
            if uargs.proband_id:
                # the return value is not needed; the call is kept for the
                # consistency check itself
                lib_ped.check_consistency_ped_vcf(\
                    self.vcf,uargs.ped,uargs.proband_id)
                self.ped = uargs.ped
                self.proband_id = uargs.proband_id
            else:
                msg = "A family file [%s] was provided but you didn't provide a proband ID to examine. Specify the probrand ID available in the VCF [%s] using an option -p."\
                    %(uargs.ped,self.vcf)
                print(msg)
                raise RuntimeError(msg)
        else:
            #get sample_ids contained into VCF file; the parser is only
            #needed for the sample count, so close it right away
            v = vcf.VCFParser(self.vcf)
            try:
                num_samples = len(v.samples)
            finally:
                v.stream.close()

            if num_samples > 1:
                # message now matches the check (it used to say "two")
                raise RuntimeError('VCF file [%s] contains more than one sample. Let me know which sample is a proband to diagnose!'%self.vcf)
            elif num_samples == 1:
                #search sample_id and create a temp ped for the proband
                self.ped = os.path.join(self.out_dir,'proband_tmp.ped')
                self.proband_id = lib_ped.create_proband_ped(self.vcf,self.ped)
                self.to_delete_fns.append(self.ped)
            else:
                # no sample column in the VCF
                self.genotype = False

    self.xls = None
    self.hgmd = uargs.hgmd
    self.cosmic = uargs.cosmic
    self.dblink = uargs.dblink

    # damage factor w.r.t the location of variant within the transcript
    self.dm = damaging_model.DmgCoeff(\
        uargs.indel_fidel,uargs.go_seed_k,self.logger)

    if uargs.ref_exon_only==1:
        msg = 'VCF is going to be masked by RefGene coding region'
        lib_utils.msgout('notice',msg);self.logger.info(msg)

    self.ref_exon_only = uargs.ref_exon_only

    lib_utils.msgout('notice','done. initialization')
def append_annotation_to_vcf2(vcf_fn, vars_to_summuary, submissions, out_vcf):
    """Copy a ClinVar VCF to ``out_vcf`` with extra INFO annotation.

    For each record, the CLNACC accessions are split on '|' and
    de-duplicated in order. The first accession (version suffix removed)
    found in ``vars_to_summuary`` supplies REFTX, HGVSc, HGVSp, DATE and
    REV, and CLNMETHOD when its variation id is in ``submissions``. SPLOC
    is set when the HGVSc string carries an intronic offset of magnitude
    below 3. Escaped commas in CLNDBN are restored.
    """
    print('appending annotation to clinvar VCF file ...')

    parser = vcf.VCFParser(vcf_fn)
    ostream = open2(out_vcf, 'w')

    for meta in (
            ("REFTX", "1", "String", "RefSeq Transcript Name"),
            ("HGVSc", "1", "String", "HGVSc change in HGVS nomenclature"),
            ("HGVSp", "1", "String", "AA change in HGVS nomenclature"),
            ("SPLOC", "1", "Integer",
             "Distance from the predicted splice site"),
            ("DATE", "1", "String", "Last evaluated date"),
            ("REV", "1", "String", "Review status"),
            ("CLNMETHOD", "1", "String", "Collection methods")):
        parser.add_meta_info(*meta)
    parser.writeheader(ostream)

    for record in parser:
        parser.parseinfo(record)

        # unique accessions in first-seen order
        accessions = []
        for joined_ids in record.info.CLNACC:
            for accession in joined_ids.split('|'):
                if accession not in accessions:
                    accessions.append(accession)

        for accession in accessions:
            base_id = accession.split('.')[0]
            if base_id not in vars_to_summuary:
                continue
            summary = vars_to_summuary[base_id]

            record.info.REFTX = summary.REFTX
            if summary.HGVSc:
                record.info.HGVSc = summary.HGVSc
                match = re.search(r'c\.(.*)([\+\-]\d+)\D+',
                                  record.info.HGVSc)
                if match:
                    offset = match.group(2)
                    if abs(int(offset)) < 3:
                        record.info.SPLOC = offset
            if summary.HGVSp:
                record.info.HGVSp = summary.HGVSp
            if summary.DATE:
                record.info.DATE = summary.DATE
            if summary.REV:
                record.info.REV = summary.REV
            if summary.variation_id in submissions:
                methods = set(
                    submissions[summary.variation_id].collection_methods)
                record.info.CLNMETHOD = '|'.join(methods)
            # only the first known accession annotates the record
            break

        record.info.CLNACC = accessions
        for pos, disease_name in enumerate(record.info.CLNDBN):
            record.info.CLNDBN[pos] = disease_name.replace(
                '\\x2c_', ',').replace('\\x2c', ',')
        parser.write(ostream, record)

    ostream.close()
    parser.stream.close()
    print('Done.')