Пример #1
0
def parse_gene_prior(genePriorFile, mode):
    '''
    Load per-gene prior values from a tab-separated file.

    The first line is skipped as a header. Every following line must hold
    exactly three tab-separated fields: gene, rank, score (the original
    note describes the last column as a p-value).

    input:
      -genePriorFile: path handed to lib_utils.open2()
      -mode: 1 keeps the rank column and rescales it after loading
             2 keeps the score column unchanged
    output: dict mapping gene -> float
      -mode 1: (max_rank - rank + 1) / sum_of_ranks
      -mode 2: the score as read

    NOTE: any other mode stores nothing, so the first data row raises
    KeyError when its value is read back.
    '''
    dGene = {}
    scoreSum = 0.
    maxScore = 0.

    fp = lib_utils.open2(genePriorFile, 'r')
    try:
        next(fp)  # skip the header line
        for line in fp:
            gene, rank, score = line.rstrip().split('\t')
            if mode == 1:
                dGene[gene] = float(rank)
            elif mode == 2:
                dGene[gene] = float(score)

            value = dGene[gene]
            scoreSum += value
            if value > maxScore:
                maxScore = value
    finally:
        # release the handle even when a malformed row raises
        fp.close()

    # normalize: invert ranks so that the best (smallest) rank gets the
    # largest weight; only existing keys are reassigned while iterating
    if mode == 1:
        for gene in dGene:
            dGene[gene] = (maxScore - dGene[gene] + 1.) / scoreSum

    return dGene
Пример #2
0
    def disease_to_genes_sum(self, gene_norm=True):
        '''
        objective: from hit scores of query hpo to disease, associate disease to genes
        input:
          -self.hpo2disease_fn: tab-separated file with four columns per
            row (query, omim, comma-separated genes, score); lines starting
            with '#' are skipped. When the attribute is None,
            self.hpo_to_diseases() is called first (presumably it fills the
            attribute in -- confirm).
          -gene_norm: want to normalize accumulated phenotype score per gene? [True]
        output: nothing is returned; the result of
          lib_utils.normalize_dic(..., 'sum') is stored in self.pheno_dmg
          and self.rank_pheno_gene() is called afterwards.
        '''

        job_name = 'disease_to_genes'
        if self.hpo2disease_fn is None:
            self.hpo_to_diseases()

        msg = 'aggregating HPO hit scores of disease to each gene [%s;%s]...' % \
             (job_name,self.hpo2disease_fn)
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)

        # accumulate the phenotype-matching score of each disease into every
        # gene associated with that disease
        pheno_genes = {}
        pheno_genes_cnt = {}
        fp = lib_utils.open2(self.hpo2disease_fn, 'r')
        try:
            for line in fp:  # for each disease
                if line.startswith('#'):
                    continue
                _, omim, geneStr, funsimMatAvg = line.rstrip().split('\t')
                funsimMatAvg = float(funsimMatAvg)

                # only positive scores contribute; written as "not >" so a
                # NaN score is skipped exactly as before
                if not funsimMatAvg > 0.:
                    continue

                for gene in geneStr.split(','):  # for each gene
                    if gene not in pheno_genes:
                        pheno_genes[gene] = 0.
                        pheno_genes_cnt[gene] = 0.
                    pheno_genes[gene] += funsimMatAvg
                    pheno_genes_cnt[gene] += 1.
        finally:
            # release the handle even when a malformed row raises
            fp.close()

        if gene_norm:
            msg = 'normalizing a bipartite graph between diseases and genes...'
            lib_utils.msgout('notice', msg, job_name)
            self.logger.info(msg)
            # turn the per-gene sum into a mean over contributing diseases
            for gene in pheno_genes:
                pheno_genes[gene] /= pheno_genes_cnt[gene]

        self.pheno_dmg = lib_utils.normalize_dic(pheno_genes, 'sum')

        #print phenotypic damage scores
        self.rank_pheno_gene()

        msg = 'done. [%s]' % job_name
        lib_utils.msgout('notice', msg)
        self.logger.info(msg)
Пример #3
0
def protein_to_gene(esp2geneFile):
    '''
    Build a protein -> gene lookup from a two-column tab-separated file.

    The first line is skipped as a header. Each remaining line must be
    gene<TAB>protein. Rows where either field is empty are ignored, and the
    first gene seen for a protein is the one kept.

    input: esp2geneFile - path handed to lib_utils.open2()
    output: dict mapping protein -> gene
    '''
    dProtein2gene = {}

    fp = lib_utils.open2(esp2geneFile, 'r')
    try:
        next(fp)  # skip the header line
        for line in fp:
            # Strip only the line terminator. Blindly dropping the last
            # character cut a real letter off the protein id when the final
            # line had no newline, and left a carriage return on CRLF files.
            gene, protein = line.rstrip('\r\n').split('\t')
            if gene and protein:
                dProtein2gene.setdefault(protein, gene)
    finally:
        # release the handle even when a malformed row raises
        fp.close()

    return dProtein2gene
Пример #4
0
    def gene_to_protein(self):
        '''
        to create a map from ref gene symbol to protein

        Reads the file named by self.dv.entries['esp_to_gene']: the first
        line is skipped as a header and each remaining line must be
        gene<TAB>protein. Rows with an empty field are ignored; the first
        protein seen for a gene is the one kept.

        output: dict mapping gene -> protein
        '''
        dGene2Prot = {}

        fp = lib_utils.open2(self.dv.entries['esp_to_gene'], 'r')
        try:
            next(fp)  # skip the header line
            for line in fp:
                # Strip only the line terminator. Blindly dropping the last
                # character cut a real letter off the protein id when the
                # final line had no newline, and left a carriage return on
                # CRLF files.
                gene, protein = line.rstrip('\r\n').split('\t')
                if gene and protein:
                    dGene2Prot.setdefault(gene, protein)
        finally:
            # release the handle even when a malformed row raises
            fp.close()

        return dGene2Prot
Пример #5
0
    def protein_to_gene(self):
        '''
        to create a map from protein to ref gene symbol

        Reads the file named by self.dv.entries['esp_to_gene']: the first
        line is skipped as a header and each remaining line must be
        gene<TAB>protein. Rows with an empty field are ignored; the first
        gene seen for a protein is the one kept.

        output: dict mapping protein -> gene
        '''
        dProt2Gene = {}

        fp = lib_utils.open2(self.dv.entries['esp_to_gene'], 'r')
        try:
            next(fp)  # skip the header line
            for line in fp:
                # Strip only the line terminator. Blindly dropping the last
                # character cut a real letter off the protein id when the
                # final line had no newline, and left a carriage return on
                # CRLF files.
                gene, protein = line.rstrip('\r\n').split('\t')
                if gene and protein:
                    dProt2Gene.setdefault(protein, gene)
        finally:
            # release the handle even when a malformed row raises
            fp.close()

        return dProt2Gene
Пример #6
0
def get_sparse_elements(proteinLinkFile, min_edge_weight):
    '''
    to store ppi network as coordinate lists
    input:
      -proteinLinkFile: ppi link file; the first line is skipped as a
        header, every other line is "protein1 protein2 weight"
      -min_edge_weight: links whose weight is below this value are dropped
    output: (nNodes, ppi, lnkProteins)
      -nNodes: number of distinct proteins that were registered
      -ppi: [from-node list, to-node list, weight list], one entry per
        kept link
      -lnkProteins: protein ids; the list position is the node number
    '''
    #read string DB and assign an integer to each protein symbol
    fp = lib_utils.open2(proteinLinkFile, 'r')

    dProtein2num = {}
    lnkProteins = []
    ppi = [[], [], []]  #from protein, to protein, link weight

    lib_utils.msgout(
        'notice',
        'preparing a genetic network matrix. Please, be patient......',
        'pagerank|heat_diffusion')

    try:
        next(fp)  # skip the header line
        for line in fp:
            # split on any whitespace run, like the class-based variant of
            # this routine does
            src, dst, weight = line.rstrip().split()
            weight = float(weight)
            if weight < min_edge_weight:
                continue

            for c, raw_id in enumerate((src, dst)):
                protein = extract_ensembl_protein(raw_id)

                #to register a protein node
                if protein not in dProtein2num:
                    # next free node number == current length of the list
                    dProtein2num[protein] = len(lnkProteins)
                    lnkProteins.append(protein)

                ppi[c].append(dProtein2num[protein])
            ppi[2].append(weight)
    finally:
        # release the handle even when a malformed row raises
        fp.close()

    return len(lnkProteins), ppi, lnkProteins
Пример #7
0
    def get_sparse_elements(self):
        '''
        to store ppi network
        input: the link file named by self.dv.entries['string_link']
          (first line skipped as a header, then whitespace-separated
          "protein1 protein2 weight" rows) and self.min_edge_weight (links
          with a smaller weight are dropped)
        output: nothing is returned; the method
          -resets self.nNodes, self.Prots and self.dProt2Idx and refills
           them (position in self.Prots == node number of the protein)
          -appends from-node, to-node and weight of every kept link to
           self.ppi[0], self.ppi[1] and self.ppi[2]
        NOTE(review): self.ppi is only appended to here, never reset --
        presumably it is created empty elsewhere; confirm before calling
        this method twice on the same instance.
        '''
        #read string DB and assign an integer to each protein symbol
        fp = lib_utils.open2(self.dv.entries['string_link'], 'r')

        self.nNodes = 0
        self.Prots = []
        self.dProt2Idx = {}

        lib_utils.msgout(
            'notice',
            'preparing a genetic network matrix. Please, be patient ...',
            'pagerank|heat_diffusion')

        #store col,row,weight from ppi file
        try:
            next(fp)  # skip the header line
            for line in fp:
                src, dst, weight = line.rstrip().split()
                weight = float(weight)
                if weight < self.min_edge_weight:
                    continue

                for c, raw_id in enumerate((src, dst)):
                    protein = extract_ensembl_protein(raw_id)

                    #to register a protein node
                    if protein not in self.dProt2Idx:
                        self.dProt2Idx[protein] = self.nNodes

                        # item index corresponds to a node number of the protein
                        self.Prots.append(protein)
                        self.nNodes += 1

                    self.ppi[c].append(self.dProt2Idx[protein])
                self.ppi[2].append(weight)
        finally:
            # release the handle even when a malformed row raises
            fp.close()
Пример #8
0
def convert_node2gene(FinalNodeScores, PerturbedGenes, dProtein2gene,
                      lnkProteins, rank_fn):
    '''
    Write a gene ranking file from per-node scores.

    input:
      -FinalNodeScores: pair (nodeScores, dangledScores); nodeScores is
        indexed like lnkProteins, dangledScores like the dangled gene list
      -PerturbedGenes: pair (cPerturbedGenes, dangledGenes); entries of
        cPerturbedGenes expose .score and .gdmg
      -dProtein2gene: protein -> gene map
      -lnkProteins: protein ids, position == node number
      -rank_fn: path of the final, sorted ranking file
    output: nothing is returned. Rows "gene, predicted score, seed score"
      are written to a temporary file for every gene whose .gdmg is > 0,
      the file is sorted into rank_fn by lib_utils.sort_tsv_by_col2 and
      the temporary file is removed.
    '''
    nodeScores, dangledScores = FinalNodeScores
    cPerturbedGenes, dangledGenes = PerturbedGenes

    def _pred_score(raw):
        # -1/log10(raw) for a positive raw score, 0 otherwise.
        # NOTE(review): raw == 1 makes log10 zero and raises
        # ZeroDivisionError; presumably raw scores stay below 1 -- confirm.
        if raw > 0:
            return -1. / math.log10(raw)
        return 0.

    rank_fn2 = lib_utils.file_tag2(rank_fn, 'tmp', None)
    fp2 = lib_utils.open2(rank_fn2, 'w')
    try:
        fp2.write('#gene\tpredicted_score[-1/log10(x)]\tseed_score\n')

        for n, protein in enumerate(lnkProteins):
            pred_score = _pred_score(nodeScores[n])

            # a row is only possible for a protein that maps to a
            # perturbed gene carrying genetic damage
            if protein not in dProtein2gene:
                continue
            gene = dProtein2gene[protein]
            if gene not in cPerturbedGenes:
                continue
            hit = cPerturbedGenes[gene]
            if hit.gdmg > 0.:
                fp2.write('%s\t%g\t%g\n' % (gene, pred_score, hit.score))

        #add dangled node score
        for n, gene in enumerate(dangledGenes):
            pred_score = _pred_score(dangledScores[n])
            hit = cPerturbedGenes[gene]
            if hit.gdmg > 0.:
                fp2.write('%s\t%g\t%g\n' % (gene, pred_score, hit.score))
    finally:
        # release the handle even when scoring or a lookup raises
        fp2.close()

    #sort by score
    lib_utils.sort_tsv_by_col2(rank_fn2, [2], ['gr'], False, rank_fn)
    os.unlink(rank_fn2)
Пример #9
0
def _run_shell_cmd(cmd):
    '''
    Run cmd through the shell, wait for it and return its exit status.
    A non-zero status is reported on stderr instead of being dropped.
    '''
    p = sp.Popen(cmd, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
    _, err = p.communicate()
    if p.returncode != 0:
        sys.stderr.write('warning: command failed (rc=%d): %s\n%s\n' %
                         (p.returncode, cmd, err))
    return p.returncode


def _write_training_vcf(training, clitag, tr_vcf):
    '''
    Write the records of training whose clinical-significance field
    contains clitag to tr_vcf (header followed by the body sorted by
    chromosome and position). Only the first record of each
    chrom/pos/ref/alt combination is kept.
    '''
    tr_vcf_body = tr_vcf + '.body'
    fp2 = lib_utils.open2(tr_vcf_body, 'w')
    try:
        printed = set()
        for chrom, pos, var_id, ref, alt, clisig in training:
            if clitag not in clisig:
                continue
            prim_key = '%s_%s_%s_%s' % (chrom, pos, ref, alt)
            if prim_key in printed:
                continue
            printed.add(prim_key)
            fp2.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\tCLINSIG_CLASS=%s\n' %
                      (chrom, pos, var_id, ref, alt, '100', 'PASS', clisig))
    finally:
        fp2.close()

    #sort
    tr_vcf_body_so = tr_vcf_body + '.sorted'
    _run_shell_cmd('sort -k1,1 -k2,2n %s > %s' % (tr_vcf_body, tr_vcf_body_so))

    #append header
    tr_vcf_header = tr_vcf + '.head'
    fp2 = lib_utils.open2(tr_vcf_header, 'w')
    try:
        fp2.write('##fileformat=VCFv4.2\n')
        fp2.write('##INFO=<ID=CLINSIG_CLASS,Number=7,Type=String,Description="benign_CLINVARDB,pathogenic_CLINVARDB,vus_CLINVARDB,benign_HGMDDB,pathogenic_HGMDDB,vus_HGMDDB,benign_1kMAF",Source="CLINVAR,HGMD",Version="03/01/2015">\n')
        fp2.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n')
    finally:
        fp2.close()
    _run_shell_cmd('cat %s %s > %s' % (tr_vcf_header, tr_vcf_body_so, tr_vcf))

    os.unlink(tr_vcf_header)
    os.unlink(tr_vcf_body)
    os.unlink(tr_vcf_body_so)


def main():
    '''
    Command-line entry point: build benign/pathogenic training VCFs, run
    the annotation and filter scripts found under $GCN on them, fit the
    conservation-score training sets and pickle the fitted parameters to
    <out_dir>/clin_tr.pyv.

    With --debug, intermediate files that already exist are reused instead
    of being regenerated.
    '''
    parser = argparse.ArgumentParser(description="training cadd cli")
    parser.add_argument('--hgmd', action='store_const', dest = 'hgmd', required=False, default = False, const = True, help='want to include hgmd for training? It requires license. [False]')
    parser.add_argument('-r', action='store', dest='kg_sample_rate', required=False, default = .3, type=float, help='sampling rate for benign variants in 1kG')
    parser.add_argument('-o', action='store', dest='out_dir', required=True, help='output dir')
    parser.add_argument('--debug', action='store_const', dest='debug', required=False, default=False, const=True, help='debug?[False]')
    args = parser.parse_args()

    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)

    # Start from an empty list: without --hgmd nothing else assigned this
    # name, so the VCF export below died with a NameError.
    training = []

    #load HgmdDB (requires license!)
    if args.hgmd:
        try:
            hgmd = HgmdDB()
            training = hgmd.select_all_hgmd()
        except Exception:
            print('error: check if HGMD database is avail!')
            sys.exit(1)

    if False:  # ClinVar / 1kG loading is switched off in this version
        #load ClinvarDB
        clnvar = ClinvarDB()
        training.extend(clnvar.select_all_clinvar())

        #load KGDB
        kgdb = KGDB()
        training.extend(kgdb.select_snps(0.1, 0.5, sample_rate=args.kg_sample_rate , snp_tag='benign_1kMAF'))

    #convert to vcf
    clitags = ['benign', 'pathogenic']
    tr_vcfs = []
    for clitag in clitags:
        tr_vcf = os.path.join(args.out_dir, 'clin_%s_tr.vcf' % clitag)
        if not args.debug or not os.path.exists(tr_vcf):
            _write_training_vcf(training, clitag, tr_vcf)
        tr_vcfs.append(tr_vcf)

    # the records are no longer needed once the VCFs exist
    training = None

    #run gcn
    gcn_dir = os.environ.get('GCN', None)
    if gcn_dir:
        annotpipe_bin = os.path.join(gcn_dir, 'gcn', 'bin', 'annotpipe.py')
    else:
        print('error: cannot find annotpipe.py!')
        sys.exit(1)

    cadd_trset_params = []
    mnp_cadd_trset_params = []
    gerp_trset_params = []

    filter_bin = os.path.join(gcn_dir, 'gcn', 'lib', 'utils', 'filter_cj.py')
    for clitag, tr_vcf in zip(clitags, tr_vcfs):
        tr_varant_vcf = os.path.join(args.out_dir, 'clin_%s_tr_varant.vcf' % clitag)
        cmd = 'python %s -i %s -o %s' % (annotpipe_bin, tr_vcf, tr_varant_vcf)
        if not args.debug or not os.path.exists(tr_varant_vcf):
            print(cmd)
            _run_shell_cmd(cmd)

        #filter varant vcf
        tr_varant_filt_vcf = os.path.join(args.out_dir, 'clin_%s_tr_varant_filt.vcf' % clitag)
        filterconf_tr = os.path.join(gcn_dir, 'gcn', 'config', 'filter_tr_%s.conf' % clitag)

        cmd = 'python %s -i %s -o %s -f %s --no_genotype' % (filter_bin, tr_varant_vcf, tr_varant_filt_vcf, filterconf_tr)
        if not args.debug or not os.path.exists(tr_varant_filt_vcf):
            print(cmd)
            _run_shell_cmd(cmd)

        cadd_trset, mnp_cadd_trset, gerp_trset = train_conservation_coeff(tr_varant_filt_vcf, args.hgmd)

        #to get parameters fitted in beta dist
        cadd_trset_param, mnp_cadd_trset_param, gerp_trset_param = run_beta_fit(cadd_trset, mnp_cadd_trset, gerp_trset)

        cadd_trset_params.append(cadd_trset_param)
        mnp_cadd_trset_params.append(mnp_cadd_trset_param)
        gerp_trset_params.append(gerp_trset_param)

        print('done.')

    pyv = os.path.join(args.out_dir, 'clin_tr.pyv')
    with open(pyv, 'wb') as fp2:
        pickle.dump([cadd_trset_params, mnp_cadd_trset_params, gerp_trset_params], fp2)
Пример #10
0
    def convert_node2gene(self):
        '''
        for only gt_dmg genes, print out gene, harmonic score, and seed score

        Rows are written to '<self.dv.gene_rank_fn>.tmp', sorted into
        self.dv.gene_rank_fn by lib_utils.sort_tsv_by_col2 and the
        temporary file is removed. As a side effect the predicted score is
        stored in slot [1] of the matching self.dv.gene_dmg entry.

        A gene is reported when self.dv.gt_dmg is empty, or when the gene
        has a gt_dmg score > 0. With an empty gt_dmg the gt_dmg column is
        written as 0 (it used to crash on the lookup).
        '''
        dv = self.dv
        row_fmt = '%s\t%g\t%g\t%g\t%g\t%s\n'

        def _reportable(gene):
            # no genotype damage table at all -> report every gene
            if not dv.gt_dmg:
                return True
            return gene in dv.gt_dmg and dv.gt_dmg[gene].score > 0.

        def _trailing_cols(gene):
            # (gt_dmg score, phenotype score, known-pathogenic flag)
            gt_sc = dv.gt_dmg[gene].score if dv.gt_dmg else 0.
            pheno_sc = dv.pheno_dmg[gene].score if gene in dv.pheno_dmg else 0.
            if dv.vknown:
                is_vknown = 'Y' if gene in dv.vknown_genes else 'N'
            else:
                is_vknown = 'NA'
            return gt_sc, pheno_sc, is_vknown

        rank_fn_tmp = '%s.tmp' % dv.gene_rank_fn
        fp2 = lib_utils.open2(rank_fn_tmp, 'w')
        try:
            fp2.write(
                '#gene\tpredicted_score\tseed_score\tgt_dmg_score\tpheno_score\tcontain_known_pathogenic\n'
            )
            genes_printed = {}

            #browsing each node in the whole (original) ppi network
            for n, protein in enumerate(self.Prots):
                seed_score = 0.
                gene = protein

                #check if this node (restart value) was assigned previously
                if protein in self.dProt2Gene:
                    gene = self.dProt2Gene[protein]
                    if gene in dv.gene_dmg:
                        seed_score = dv.gene_dmg[gene][0]

                #to get harmonic score and save into dv.gene_dmg
                pred_score = 0.
                if self.harmonic_sc[n][0] > 0.:
                    pred_score = self.harmonic_sc[n][0]
                    if gene in dv.gene_dmg:
                        dv.gene_dmg[gene][1] = pred_score

                if not _reportable(gene):
                    continue

                gt_sc, pheno_sc, is_vknown = _trailing_cols(gene)

                # a gene reached through a second protein is labelled
                # "gene|protein" so the first column stays unique
                if gene in genes_printed:
                    gene2 = '%s|%s' % (gene, protein)
                else:
                    gene2 = gene
                    genes_printed[gene] = True

                fp2.write(row_fmt % (gene2, pred_score, seed_score,
                                     gt_sc, pheno_sc, is_vknown))

            #repeat the same procedure to dangled nodes
            for n, gene in enumerate(self.dangledGenes):
                dv.gene_dmg[gene][1] = self.harmonic_dng_sc[n][0]

                if not _reportable(gene):
                    continue

                gt_sc, pheno_sc, is_vknown = _trailing_cols(gene)
                # same numeric format as the ppi-node rows above (the
                # gt_dmg column used '%s' here only)
                fp2.write(row_fmt % (gene, dv.gene_dmg[gene][1],
                                     dv.gene_dmg[gene][0],
                                     gt_sc, pheno_sc, is_vknown))
        finally:
            # release the handle even when a lookup raises
            fp2.close()

        #sort by score
        lib_utils.sort_tsv_by_col2(\
         rank_fn_tmp, [2], ['gr'], False, dv.gene_rank_fn)
        os.unlink(rank_fn_tmp)
Пример #11
0
def append_annotation_to_vcf2(vcf_fn, vars_to_summuary, submissions, out_vcf):
    '''
    Copy a ClinVar VCF to out_vcf, adding INFO annotations to each record.

    input:
      -vcf_fn: input VCF, read through vcf.VCFParser
      -vars_to_summuary: mapping keyed by accession id without its version
        suffix; entries expose REFTX, HGVSc, HGVSp, DATE, REV and
        variation_id
      -submissions: mapping keyed by variation_id; entries expose
        collection_methods
      -out_vcf: output path, opened with open2()
    output: nothing is returned.

    Per record: the ids in CLNACC are split on '|' and de-duplicated in
    order; the first id found in vars_to_summuary supplies REFTX, HGVSc,
    HGVSp, DATE, REV and CLNMETHOD. SPLOC is set only when the HGVSc
    string carries a signed offset whose absolute value is below 3.
    '''
    print('appending annotation to clinvar VCF file ...')
    v = vcf.VCFParser(vcf_fn)
    ostream = open2(out_vcf, 'w')

    try:
        v.add_meta_info("REFTX", "1", "String", "RefSeq Transcript Name")
        v.add_meta_info("HGVSc", "1", "String",
                        "HGVSc change in HGVS nomenclature")
        v.add_meta_info("HGVSp", "1", "String", "AA change in HGVS nomenclature")
        v.add_meta_info("SPLOC", "1", "Integer",
                        "Distance from the predicted splice site")
        v.add_meta_info("DATE", "1", "String", "Last evaluated date")
        v.add_meta_info("REV", "1", "String", "Review status")
        v.add_meta_info("CLNMETHOD", "1", "String", "Collection methods")
        v.writeheader(ostream)

        for rec in v:
            v.parseinfo(rec)

            # flatten '|'-joined accession ids, keeping first-seen order
            uniq_rcv_ids = []
            for rcv_id_str in rec.info.CLNACC:
                for rcv_id in rcv_id_str.split('|'):
                    if rcv_id not in uniq_rcv_ids:
                        uniq_rcv_ids.append(rcv_id)

            for rcv_id in uniq_rcv_ids:
                # the lookup table is keyed without the '.<version>' suffix
                accession = rcv_id.split('.')[0]
                if accession not in vars_to_summuary:
                    continue
                summary = vars_to_summuary[accession]

                rec.info.REFTX = summary.REFTX
                if summary.HGVSc:
                    rec.info.HGVSc = summary.HGVSc
                    mObj = re.search(r'c\.(.*)([\+\-]\d+)\D+', rec.info.HGVSc)
                    if mObj:
                        SPLOC = mObj.group(2)
                        if abs(int(SPLOC)) < 3:
                            rec.info.SPLOC = SPLOC

                if summary.HGVSp:
                    rec.info.HGVSp = summary.HGVSp
                if summary.DATE:
                    rec.info.DATE = summary.DATE
                if summary.REV:
                    rec.info.REV = summary.REV
                if summary.variation_id in submissions:
                    cmethods = list(
                        set(submissions[summary.variation_id].
                            collection_methods))
                    rec.info.CLNMETHOD = '|'.join(cmethods)

                # only the first matching accession annotates the record
                break

            rec.info.CLNACC = uniq_rcv_ids
            # undo the escaped commas in the disease names
            for j, clndbn in enumerate(rec.info.CLNDBN):
                rec.info.CLNDBN[j] = clndbn.replace('\\x2c_',
                                                    ',').replace('\\x2c', ',')

            v.write(ostream, rec)
    finally:
        # close both streams even when a record fails to parse
        ostream.close()
        v.stream.close()

    print('Done.')
Пример #12
0
def reformat_go_sim_fns(go_sim_fns,
                        out_fn,
                        method_id=1):  #method_id (1) means SimRel
    '''
    Merge per-category GO similarity files into one dense table.

    input:
      -go_sim_fns: dict mapping a category key to a tab-separated file
        whose first line is a header and whose rows are
        uniprot1, uniprot2[, score]; a missing score is stored as '-1.'
      -out_fn: path of the resulting file
      -method_id: integer written as the last column of every row [1]
    output: nothing is returned. out_fn ends up with one row per protein
      pair: uniprot1, uniprot2, BP, MF, CC, method_id ('-1' = N/A).

    Each file is labelled with its dict key when the key is one of
    'BP', 'MF', 'CC'. Other keys fall back to the position of the entry in
    the dict iteration, which is what the code did for every entry before
    and which is not a reliable order on Python 2 dicts.
    '''
    suflabs = ['BP', 'MF', 'CC']

    fp2 = lib_utils.open2(out_fn, 'w')
    try:
        for v, (key, go_sim_fn) in enumerate(go_sim_fns.items()):
            print('appending root node at the end of [%s]' % go_sim_fn)
            suflab = key if key in suflabs else suflabs[v]
            fp = lib_utils.open2(go_sim_fn, 'r')
            try:
                next(fp)  #strip off head
                for line in fp:
                    fields = line.rstrip().split('\t')
                    if len(fields) == 2:
                        fields.append('-1.')
                    #uniprot1,uniprot2,score,BP
                    fp2.write('%s\t%s\n' % ('\t'.join(fields), suflab))
            finally:
                fp.close()
            print('done.')
    finally:
        fp2.close()

    print('sorting...')
    #to get temporary file to sort
    out_fn2 = lib_utils.file_tag2(out_fn, 'sort', None)
    temp_sort_dir, _, _, _ = lib_utils.separateDirFn2(out_fn)
    lib_utils.sort_tsv_by_col2(out_fn, [1, 2, 4], ['V', 'V', 'V'],
                               False,
                               out_fn2,
                               temp_dir=temp_sort_dir)
    os.rename(out_fn2, out_fn)
    print('done.')

    #grouping: rows of the same protein pair are adjacent after the sort
    print('collapsing GO sim scores to make the format easier to import SQL [%s] ...' % out_fn)
    out_fn2 = lib_utils.file_tag2(out_fn, 'dense', None)

    idx = {'BP': 0, 'MF': 1, 'CC': 2}
    prev_key = None
    gosim_holder = ['-1', '-1', '-1']  #-1 means N/A

    fp2 = lib_utils.open2(out_fn2, 'w')
    try:
        fp = lib_utils.open2(out_fn, 'r')
        try:
            for line in fp:
                prot1, prot2, score, go_class = line.rstrip().split('\t')
                key = '%s\t%s' % (prot1, prot2)
                if key != prev_key:
                    if prev_key is not None:  #wrap up the previous pair
                        fp2.write('%s\t%s\t%d\n' %
                                  (prev_key, '\t'.join(gosim_holder),
                                   method_id))
                        gosim_holder = ['-1', '-1', '-1']
                    prev_key = key
                gosim_holder[idx[go_class]] = score
        finally:
            fp.close()

        #don't forget the last entry; with no input rows there is none, so
        #nothing is written (a bogus 'None' row used to be emitted)
        if prev_key is not None:
            fp2.write('%s\t%s\t%d\n' %
                      (prev_key, '\t'.join(gosim_holder), method_id))
    finally:
        fp2.close()
    os.rename(out_fn2, out_fn)

    print('done.')