Exemplo n.º 1
0
def composeTMP_nonTMP(ftmp, fnontmp, fpair, num):
    '''
    Write ``num`` randomly combined (TMP, non-TMP) ID pairs to ``fpair``.

    If both ``ftmp`` and ``fnontmp`` exist, the two ID lists are read from
    them with ``readIDlist``. Otherwise both lists come from
    ``generateCriterLists(ftmp, fnontmp)`` (per the original notes, that
    helper queries them from the database).

    :param ftmp: path of the TMP ID list
    :param fnontmp: path of the non-TMP ID list
    :param fpair: output path; one tab-separated pair per line
    :param num: total number of pairs to write
    :return: None
    :raises ValueError: if pairs are requested but either ID list is empty
    '''
    if not (os.access(ftmp, os.F_OK) and os.access(fnontmp, os.F_OK)):
        tmplist, nontmplist = generateCriterLists(ftmp, fnontmp)
    else:
        tmplist, nontmplist = readIDlist(ftmp), readIDlist(fnontmp)
    sampL1 = min(len(tmplist), num)
    sampL2 = min(len(nontmplist), num)
    # An empty list would make the modulo below divide by zero; fail with a
    # clear message before the output file is truncated.
    if num > 0 and (sampL1 == 0 or sampL2 == 0):
        raise ValueError(
            'cannot compose pairs: TMP list has %d IDs, non-TMP list has %d IDs'
            % (len(tmplist), len(nontmplist)))
    # Shuffled index samples; at most ``num`` distinct entries of each list
    # take part in the pairing.
    L1 = random.sample(range(0, len(tmplist)), sampL1)
    L2 = random.sample(range(0, len(nontmplist)), sampL2)
    with open(fpair, 'w') as fo:
        for idx in range(num):
            # A random offset is added on each side so the two samples are
            # not walked in lockstep; the modulo wraps around the sample.
            tmp_id = tmplist[L1[(idx + random.randint(2, 9)) % sampL1]]
            nontmp_id = nontmplist[L2[(idx + random.randint(1, 5)) % sampL2]]
            fo.write('%s\t%s\n' % (tmp_id, nontmp_id))
            fo.flush()
            print(idx)
Exemplo n.º 2
0
def getSingleInfo(fin, fout, fin_type='single', col=[0, 1]):
    """
    Query protein records for the IDs in ``fin`` and write their info rows.

    Each protein is fetched with ``queryProtein``; entries with no result, or
    whose sequence is rejected by ``Protein.checkProtein(seq, 50, 2000,
    uncomm=True)``, are skipped. For every kept protein the values of
    ``ensomblePortein(pro)`` are written to ``fout`` on one line, each value
    followed by a tab.

    :param fin: input path; a pair table when ``fin_type='pair'``, otherwise
        an ID list readable by ``readIDlist``
    :param fout: output path (overwritten)
    :param fin_type: ``'single'`` or ``'pair'``
    :param col: columns of the pair table that hold the protein IDs; only
        used for ``fin_type='pair'`` (the default list is never mutated)
    :return: None
    :raises ValueError: if ``fin_type`` is neither ``'pair'`` nor ``'single'``
    """
    # Validate first: previously an unknown type fell through and failed much
    # later with an unbound ``proteins``, after ``fout`` had been truncated.
    if fin_type not in ('pair', 'single'):
        raise ValueError(
            "fin_type must be 'pair' or 'single', got %r" % (fin_type,))
    if fin_type == 'pair':
        df = pd.read_table(fin, header=None)[col]
        # Flatten the selected columns and de-duplicate the IDs.
        proteins = set(df.to_numpy().reshape(1, -1)[0])
    else:
        proteins = readIDlist(fin)
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {
        '_id': True,
        'sequence.@length': True,
        'sequence.#text': True,
        'keyword.@id': True,
        'comment.subcellularLocation.location': True
    }
    prod = Protein()
    with open(fout, 'w') as fo:
        for AC in proteins:
            pro = queryProtein(AC, do, projection=projection)
            # Same guard as getFasta: IDs without a query result are skipped
            # instead of crashing on the item assignment below.
            if not pro:
                continue
            pro['accession'] = AC
            if not prod.checkProtein(
                    pro['sequence']['#text'], 50, 2000, uncomm=True):
                continue
            proinfo = ensomblePortein(pro)
            for v in proinfo.values():
                fo.write(str(v))
                fo.write('\t')
            fo.write('\n')
            fo.flush()
Exemplo n.º 3
0
def queryPathway_Gene(foutPathway_Gene,
                      fpathway=None,
                      fpathwayInfo=None,
                      hsa='hsa',
                      start_idx=65):
    '''
    Append the genes of each pathway to ``foutPathway_Gene``.

    :param foutPathway_Gene: output table, appended to once per pathway.
        Columns (example row):
        0     hsa00010  pathway id
        1         3101  gene id
        2          HK3  gene symbol
        3     [K00844]  KO id
        4    [2.7.1.1]  EC number
    :param fpathway: pathway ID list; read with ``readIDlist`` when the file
        exists, otherwise the pathways come from ``queryAllPathway``
    :param fpathwayInfo: forwarded to ``queryAllPathway``
    :param hsa: forwarded to ``queryAllPathway``
    :param start_idx: index of the first pathway to process; earlier ones are
        skipped. The default of 65 keeps the previously hard-coded resume
        point - pass 0 to process every pathway.
    :return: None
    '''
    # os.access(None, ...) raises TypeError, so the default fpathway=None has
    # to be treated as "no list file available".
    if fpathway is not None and os.access(fpathway, os.F_OK):
        repair_pathways = readIDlist(fpathway)
    else:
        repair_pathways = queryAllPathway(fpathway=fpathway,
                                          fpathwayInfo=fpathwayInfo,
                                          hsa=hsa)
    for idx, pathway in enumerate(repair_pathways):
        # Progress marker; printed for skipped pathways too.
        print(idx, end='.')
        if idx < start_idx:
            continue
        geneID_geneName_KO_EC = [
            (pathway, gene_id, gene_symbol, KO, EC)
            for gene_id, gene_symbol, KO, EC in extractGeneFromPathway(pathway)
        ]
        # Append mode: results of earlier pathways/runs are kept.
        saveList(geneID_geneName_KO_EC, foutPathway_Gene, file_mode='a')
Exemplo n.º 4
0
 def extractFasta(self,
                  fin_fasta,
                  fin_idlist,
                  fout_fasta,
                  in_multi=True,
                  out_multi=True):
     """
     Write the subset of ``fin_fasta`` entries named in ``fin_idlist``.

     The input is loaded with ``self.getDict(fin_fasta, multi=in_multi)``,
     every ID returned by ``readIDlist(fin_idlist)`` is looked up in it, and
     the selected entries are written with ``self.dict2fasta(...,
     multi=out_multi)``. An ID missing from the input is not handled here;
     the lookup error propagates to the caller.
     """
     source = self.getDict(fin_fasta, multi=in_multi)
     wanted = readIDlist(fin_idlist)
     selected = {key: source[key] for key in wanted}
     self.dict2fasta(selected, fout_fasta, multi=out_multi)
Exemplo n.º 5
0
def fullyComposeTMP_nonTMP(ftmp, fnontmp, fpair):
    '''
    Write every (TMP, non-TMP) ID combination to ``fpair``.

    If both ``ftmp`` and ``fnontmp`` exist, the two ID lists are read from
    them; otherwise a notice is printed and both lists come from
    ``generateCriterLists(ftmp, fnontmp)``.

    :param ftmp: path of the TMP ID list
    :param fnontmp: path of the non-TMP ID list
    :param fpair: output path; one tab-separated pair per line
    :return: None
    '''
    both_present = os.access(ftmp, os.F_OK) and os.access(fnontmp, os.F_OK)
    if both_present:
        tmp_ids, nontmp_ids = readIDlist(ftmp), readIDlist(fnontmp)
    else:
        print('Cretira not found, qury from Mongodb...')
        tmp_ids, nontmp_ids = generateCriterLists(ftmp, fnontmp)
    with open(fpair, 'w') as out:
        # Full cross product, TMP ID varying slowest.
        combos = ((left, right) for left in tmp_ids for right in nontmp_ids)
        for left, right in combos:
            print(left, right)
            out.write('%s\t%s\n' % (left, right))
            out.flush()
Exemplo n.º 6
0
def findKeyProtein(fin, fout, keyword):
    """
    Save the accessions from ``fin`` whose record carries ``keyword``.

    For each accession the 'uniprot'/'uniprot_sprot' store is queried for a
    document matching both the accession and ``keyword.@id == keyword``
    (e.g. 'KW-0297'). The accession is appended once per returned document,
    each document is printed, and the collected list is written with
    ``saveList``.

    :param fin: path of the accession list
    :param fout: output path handed to ``saveList``
    :param keyword: keyword id to match
    """
    candidates = readIDlist(fin)
    matched = []
    do = DataOperation('uniprot', 'uniprot_sprot')
    only_id = {'_id': True}
    for accession in candidates:
        query = {'accession': accession, "keyword.@id": keyword}
        for record in do.QueryObj(query, projection=only_id):
            matched.append(accession)
            print(record)
    saveList(matched, fout)
Exemplo n.º 7
0
def getFasta(fin_all_protein, fout):
    """
    Write the sequences of the listed proteins to ``fout`` in FASTA format.

    Each accession from ``readIDlist(fin_all_protein)`` is fetched with
    ``queryProtein``; accessions without a result are skipped. The record id
    is the accession and the description is left empty.

    :param fin_all_protein: path of the accession list
    :param fout: output FASTA path (overwritten)
    :return: None
    """
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    proteins = readIDlist(fin_all_protein)
    do = DataOperation('uniprot', 'uniprot_sprot')
    projection = {'sequence.#text': True}

    records = []
    for AC in proteins:
        pro = queryProtein(AC, do, projection=projection)
        if not pro:
            continue
        records.append(
            SeqRecord(Seq(pro['sequence']['#text']), id=AC, description=''))

    # Open the output only once the records are collected, and write through
    # that single handle. Previously the path was opened (and truncated)
    # before querying, never written through, and opened a second time by
    # SeqIO.write.
    with open(fout, 'w') as fo:
        SeqIO.write(records, fo, 'fasta')
Exemplo n.º 8
0
def findGProtein(fin, fout):
    """
    Save the accessions from ``fin`` whose protein name looks like a G protein.

    A record matches when its recommended or alternative full name matches
    the regex 'G protein ' or 'Guanine nucleotide-binding protein'. The
    accession is appended once per returned document, a running count and the
    document are printed, and the collected list is written with ``saveList``.

    :param fin: path of the accession list
    :param fout: output path handed to ``saveList``
    """
    candidates = readIDlist(fin)
    matched = []
    do = DataOperation('uniprot', 'uniprot_sprot')
    only_id = {'_id': True}
    name_fields = ("protein.recommendedName.fullName",
                   "protein.alternativeName.fullName")
    name_patterns = ('G protein ', 'Guanine nucleotide-binding protein')
    hits = 0
    for accession in candidates:
        # Every field is tried against every pattern, fields varying slowest.
        name_clauses = [{field: {'$regex': pattern}}
                        for field in name_fields
                        for pattern in name_patterns]
        query = {'accession': accession, '$or': name_clauses}
        for record in do.QueryObj(query, projection=only_id):
            hits += 1
            matched.append(accession)
            print(hits, record)
    saveList(matched, fout)
Exemplo n.º 9
0
def _write_pair(handle, first, second, message):
    # Append one tab-separated pair to ``handle``, flush it, and echo ``message``.
    handle.write(first + '\t' + second + '\n')
    handle.flush()
    print(message)


def getTmp_SpPair(tmpf,spf,finPair,foutPair,type1='TMP',type2='SP',crossover = False):
    """
    Split the pairs in ``finPair`` into category files by list membership.

    Each input line holds two tab-separated IDs ``a`` and ``b``. Membership of
    each ID in the ``tmpf`` list (``type1``) and the ``spf`` list (``type2``)
    decides where the pair is written. All outputs except ``foutPair`` are
    created in the directory of ``foutPair``:

    * ``foutPair``: type1/type2 pairs, written with the type1 member first
    * ``<type2>_<type2>.txt`` and ``<type1>_<type1>.txt``
    * ``non<type1>_<type2>.txt`` and ``<type1>_non<type2>.txt``
    * ``<type1>_non<type1>.txt`` (type1 member first) and
      ``non<type1>_non<type1>.txt``
    * ``non<type2>_<type2>.txt`` (non-type2 member first) and
      ``non<type2>_non<type2>.txt``
    * ``drop.txt``: created but left empty

    A pair can land in several files because the four groups of checks are
    independent. Lines with fewer than two columns are printed and skipped.
    Afterwards ``handledir(<output dir>, countpair)`` is run.

    Example list files (20200701), under /home/jjhnenu/data/PPI/release/criteria/:
    allcession_KW-0812_131609.list and allcession_soluble_614454.list

    :param tmpf: path of the type1 ID list
    :param spf: path of the type2 ID list
    :param finPair: input pair file
    :param foutPair: output file for type1/type2 pairs; must contain a
        directory separator (otherwise ``str.rindex`` raises ValueError)
    :param type1: label used in file names and messages for ``tmpf``
    :param type2: label used in file names and messages for ``spf``
    :param crossover: currently unused
    :return: None
    """

    tmplist = readIDlist(tmpf)
    splist = readIDlist(spf)
    print(len(tmplist),len(splist))
    # Membership is tested four times per input line; sets make every test
    # O(1) instead of a scan over the whole list.
    # NOTE(review): assumes the IDs are hashable (strings) - confirm readIDlist.
    tmpset = set(tmplist)
    spset = set(splist)

    # Directory prefix of foutPair, including the trailing separator.
    splitmark = '\\'
    if '/' in foutPair:
        splitmark = '/'
    mypath = foutPair[:foutPair.rindex(splitmark)+1]
    fSP_SP = mypath+'%s_%s.txt'%(type2,type2)
    fTMP_TMP = mypath+'%s_%s.txt'%(type1,type1)

    fnonTMP_SP = mypath + 'non%s_%s.txt' % (type1, type2)
    fTmp_nonSP = mypath + '%s_non%s.txt' % (type1, type2)

    fTMP_nonTmp = mypath+'%s_non%s.txt'%(type1,type1)
    fnonTmp_nonTmp = mypath+'non%s_non%s.txt'%(type1,type1)

    fnonsp_sp = mypath+'non%s_%s.txt'%(type2,type2)
    fnonsp_nonsp = mypath+'non%s_non%s.txt'%(type2,type2)

    fdrop = mypath+'drop.txt'
    # fout_drop is never written to; it is still opened so that drop.txt
    # keeps being created next to the other outputs.
    with open(finPair, 'r') as fin, \
            open(foutPair, 'w') as fout,\
            open(fSP_SP,'w') as foutsp,\
            open(fTMP_TMP,'w') as fouttmp,\
            open(fnonTMP_SP,'w') as foutnonTMP_SP,\
            open(fTmp_nonSP,'w') as foutTmp_nonSP,\
            open(fTMP_nonTmp,'w') as foutTMP_nonTmp,\
            open(fnonTmp_nonTmp,'w') as foutnonTmp_nonTmp,\
            open(fnonsp_sp, 'w') as fout_nonsp_sp,\
            open(fnonsp_nonsp, 'w') as fout_nonsp_nonsp,\
            open(fdrop, 'w') as fout_drop\
            :
        for line in fin:
            pair = line.split('\t')
            if len(pair) < 2:
                # Malformed line: report and skip it rather than classifying
                # the IDs left over from the previous line.
                print(pair)
                continue
            a = pair[0]
            # Strip only the newline; slicing off the last character would
            # corrupt the ID on a final line without a trailing newline or
            # on lines with extra columns.
            b = pair[1].rstrip('\n')

            Atmp = a in tmpset
            Asp = a in spset
            Btmp = b in tmpset
            Bsp = b in spset

            # Group 1: type1/type2, type2/type2 and type1/type1 pairs.
            if Atmp and Bsp:
                _write_pair(fout, a, b, '%s %s save this pair' % (a, b))
            elif Asp and Btmp:
                # Reordered so the type1 member comes first.
                _write_pair(fout, b, a, '%s %s save this pair' % (a, b))
            elif Asp and Bsp:
                _write_pair(foutsp, a, b,
                            '%s %s save this pair %s_%s' % (a, b,type2,type2))
            elif Atmp and Btmp:
                _write_pair(fouttmp, a, b,
                            '%s\t%s save this pair %s_%s' % (a, b,type1,type1))

            # Group 2: exactly one of "a is type1" / "b is type2" holds.
            if not Atmp and Bsp:
                # Label corrected: this pair goes to the non<type1>_<type2> file.
                _write_pair(foutnonTMP_SP, a, b,
                            '%s %s save this pair non%s_%s' % (a, b, type1, type2))
            elif Atmp and not Bsp:
                _write_pair(foutTmp_nonSP, a, b,
                            '%s %s save this pair %s_non%s' % (a, b, type1, type2))

            # Group 3: type1 membership only.
            if Atmp and not Btmp:
                _write_pair(foutTMP_nonTmp, a, b,
                            '%s %s save this pair %s_non%s' % (a, b,type1,type1))
            elif not Atmp and Btmp:
                _write_pair(foutTMP_nonTmp, b, a,
                            '%s %s save this pair %s_non%s' % (a, b,type1,type1))
            elif not Atmp and not Btmp:
                _write_pair(foutnonTmp_nonTmp, a, b,
                            '%s %s save this pair non%s_non%s' % (a, b,type1,type1))

            # Group 4: type2 membership only.
            if not Asp and Bsp:
                _write_pair(fout_nonsp_sp, a, b,
                            '%s %s save this pair non%s_%s' % (a, b,type2,type2))
            elif Asp and not Bsp:
                _write_pair(fout_nonsp_sp, b, a,
                            '%s %s save this pair non%s_%s' % (a, b,type2,type2))
            elif not Asp and not Bsp:
                _write_pair(fout_nonsp_nonsp, a, b,
                            '%s %s save this pair non%s_non%s' % (a, b,type2,type2))

    handledir(mypath, countpair)
    print('get%s_%sPair end'%(type1,type2))
Exemplo n.º 10
0
# Title     : tesKEGGAPI.py
# Created by: [email protected]
# Created on: 2021/2/4 15:23
# des : TODO
import os

from common import readIDlist

if __name__ == '__main__':
    # Download the KEGG record of every pathway ID listed in the local
    # pathway table and store each one as <pathway>.txt under ``dirout``.
    from Bio.KEGG import REST
    dirout = 'file/6bioAnalysis/keggDB/pathwayInfo'
    # Make sure the target directory exists before the first file is opened.
    os.makedirs(dirout, exist_ok=True)
    # The former REST.kegg_list("pathway", "hsa") call was dropped: its result
    # was never used, so it only cost a network round trip.
    repair_pathways = readIDlist('file/6bioAnalysis/keggDB/1pathway_human.tsv')

    for idx, pathway in enumerate(repair_pathways):
        print(idx, pathway)
        pathway_file = REST.kegg_get(pathway).read()  # query and read each pathway
        with open(os.path.join(dirout, '%s.txt' % pathway), 'w') as fo:
            fo.write(pathway_file)
Exemplo n.º 11
0
def handlePair(foutdir,
               sep=',',
               dbname=None,
               checkTMP=True,
               jumpStep=None,
               fin=None,
               f2tmp_nonTtmp_info_qualified=None,
               keepOne=False):
    '''
    Run the pair-processing pipeline, writing all outputs under ``foutdir``.

    Original note: the data volume is small, so rows are queried one by one
    and many proteins end up being queried more than once.

    Steps (each can be skipped by listing its number in ``jumpStep``):
      1. build the TMP/non-TMP pair info table from ``fin`` and simplify it
      2. keep the qualified pairs
      3. derive the pair table, FASTA files and protein lists
      4. save the proteins of ``2all.list`` to the database
      5. compute subcellular information

    :param foutdir: output directory
    :param sep: separator of the ``fin`` file
    :param dbname: name of the mongodb database used in step 4
    :param checkTMP: forwarded to ``getPairInfo_TMP_nonTMP``
    :param jumpStep: collection of step numbers (1-5) to skip; None runs all
    :param fin: input pair file; ignored when 1 is in ``jumpStep``
    :param f2tmp_nonTtmp_info_qualified: path of the qualified-pair table;
        defaults to ``<foutdir>/2tmp_nontmp_info_qualified.tsv``
    :param keepOne: forwarded to ``getPairInfo_TMP_nonTMP``
    :return: None

    Example::

        fin = 'file/1intAct_pair_norepeat.txt'
        foutdir = 'file/1positive'
        handlePair(foutdir, fin=fin)
    '''

    def step_enabled(step):
        # A step runs unless the caller listed it in jumpStep.
        return jumpStep is None or step not in jumpStep

    # ---- configure paths ----
    f1tmp_nontmp_info = os.path.join(foutdir, '1tmp_nontmp_info.tsv')
    f1TMP_nonTMP = os.path.join(foutdir, '1tmp_nontmp.tsv')
    if not f2tmp_nonTtmp_info_qualified:
        f2tmp_nonTtmp_info_qualified = os.path.join(
            foutdir, '2tmp_nontmp_info_qualified.tsv')
    fout_fasta = os.path.join(foutdir, '2pair.fasta')
    fout_tmp_fasta = os.path.join(foutdir, '2tmp.fasta')
    fout_nontmp_fasta = os.path.join(foutdir, '2nontmp.fasta')
    f2positive = os.path.join(foutdir, '2pair.tsv')
    f2tmp = os.path.join(foutdir, '2tmp.list')
    f2nontmp = os.path.join(foutdir, '2nontmp.list')
    f2all = os.path.join(foutdir, '2all.list')
    f2tmp_info = os.path.join(foutdir, '2tmp_info.tsv')
    f2nontmp_info = os.path.join(foutdir, '2nontmp_info.tsv')
    f2all_info = os.path.join(foutdir, '2all_info.tsv')
    f3subcell = os.path.join(foutdir, '3subcellular.tsv')

    # ---- 1. get tmp nontmp pair (one recorded run: time 1766.4131457805634) ----
    if step_enabled(1):
        getPairInfo_TMP_nonTMP(fin,
                               f1tmp_nontmp_info,
                               sep=sep,
                               checkTMP=checkTMP,
                               keepOne=keepOne)
        simplifyTable(f1tmp_nontmp_info, f1TMP_nonTMP)

    # ---- 2. get qualified tmp nontmp pair ----
    if step_enabled(2):
        saveQualified(f1tmp_nontmp_info, f2tmp_nonTtmp_info_qualified)

    # ---- 3. get related list, fasta, pair ----
    if step_enabled(3):
        simplifyTable(f2tmp_nonTtmp_info_qualified, f2positive)

        extractPairAndFasta(f2tmp_nonTtmp_info_qualified,
                            fout_fasta,
                            fout_tmp_fasta=fout_tmp_fasta,
                            fout_nontmp_fasta=fout_nontmp_fasta)
        getproteinlist(f2tmp_nonTtmp_info_qualified,
                       ftmp=f2tmp,
                       fnontmp=f2nontmp,
                       fall=f2all,
                       ftmp_info=f2tmp_info,
                       ftmp_nontmp_info=f2nontmp_info,
                       fall_info=f2all_info)

    # ---- 4. save to mongodb ----
    # NOTE(review): when jumpStep is None this runs even if dbname is None;
    # dbname is only checked when a jumpStep collection is given. Kept as-is -
    # confirm whether save() is meant to accept dbname=None.
    if jumpStep is None or (4 not in jumpStep and dbname):
        notsvaelist = save(readIDlist(f2all), dbname)
        print('those protein not save in the mongodb', notsvaelist)

    # ---- 5. calcu subcellular ----
    if step_enabled(5):
        handleRow(f2tmp_nonTtmp_info_qualified, f3subcell, calcuSubcell)