def getPairTag(fin, f1pairWithTag_info, sep='\t'):
    '''
    Write protein information plus a pair tag for every pair listed in ``fin``.

    Each pair read from ``fin`` is printed and classified with ``tagPair``;
    pairs for which ``tagPair`` returns ``None`` are skipped.  For every other
    pair one tab-separated line is written: all values of the first
    ``ensomblePortein`` record, all values of the second one, then the tag.

    :param fin: input file of protein pairs
    :param f1pairWithTag_info: output file (overwritten)
    :param sep: field separator handed to ``getPairs``
    :return: None.  Tag codes as noted by the original author
        (not verifiable from this function): 0 = TMP_SP, 1 = TMP_TMP, 2 = SP_SP
    '''
    do = DataOperation('uniprot', 'uniprot_sprot')
    with open(f1pairWithTag_info, 'w') as fo:
        for pa, pb in getPairs(fin, sep=sep, title=False):
            print('%s\t%s' % (pa, pb))
            result = tagPair(pa, pb, do)
            # Identity check: "== None" can be hijacked by a custom __eq__.
            if result is None:
                continue
            proA = ensomblePortein(result[0])
            proB = ensomblePortein(result[1])
            fields = [str(v) for v in proA.values()]
            fields.extend(str(v) for v in proB.values())
            fields.append(str(result[2]))
            fo.write('\t'.join(fields) + '\n')
            # Flushed per pair so partial results survive an interrupted run.
            fo.flush()
def getPairInfo_TMP_nonTMP(fin, fout, sep='\t', checkTMP=True, keepOne=False):
    '''
    Write the protein records of every (TMP, nonTMP) pair listed in ``fin``.

    Each pair read from ``fin`` is printed and resolved with ``getTaN``; pairs
    for which ``getTaN`` returns ``None`` are skipped.  For every other pair
    one line is written: each value of the first ``ensomblePortein`` record
    and then each value of the second, every value followed by a tab (so the
    line ends with a tab before the newline).

    According to the original author's notes each record carries the columns
    ['accession', 'name', 'length', 'noX', 'inlenRange',
    'subcellularLocations', 'seq'], e.g. (sequence shortened here)::

        Q7BCK4  ICSA_SHIFL  1102  True  True  ['Periplasm', 'Secreted', ...]  MNQIHKFF...

    :param fin: input pair file, one pair per line, e.g. ``Q7BCK4 B6JN06``
    :param fout: output file (overwritten), TMP record followed by nonTMP record
    :param sep: field separator handed to ``getPairs``
    :param checkTMP: forwarded unchanged to ``getTaN``
    :param keepOne: forwarded unchanged to ``getTaN``
    :return: None

    Usage::

        fin = 'file/_1pair.txt'
        fout = 'file/_2pair_info.txt'
        getPairInfo_TMP_nonTMP(fin, fout)
    '''
    do = DataOperation('uniprot', 'uniprot_sprot')
    with open(fout, 'w') as fo:
        for pa, pb in getPairs(fin, sep=sep, title=False):
            print('%s\t%s' % (pa, pb))
            result = getTaN(pa, pb, do, checkTMP=checkTMP, keepOne=keepOne)
            # Identity check: "== None" can be hijacked by a custom __eq__.
            if result is None:
                continue
            tmp = ensomblePortein(result[0])
            nontmp = ensomblePortein(result[1])
            fields = [str(v) for v in tmp.values()]
            fields.extend(str(v) for v in nontmp.values())
            # Every field keeps its trailing tab to preserve the file format.
            fo.write(''.join(field + '\t' for field in fields) + '\n')
            # Flushed per pair so partial results survive an interrupted run.
            fo.flush()
def loadTest(self, fin_pair, dir_in, onehot=False, is_shuffle=False, limit=0):
    """
    Load the features and labels of the pairs listed in ``fin_pair``.

    Every pair is loaded through ``self.loadPpair``; the collected features
    and labels are converted to arrays and handed to ``self.subprocess``.

    :param fin_pair: pair file read with ``getPairs`` (no title line)
    :param dir_in: directory passed to ``self.loadPpair``
    :param onehot: forwarded to ``self.subprocess``
    :param is_shuffle: forwarded to ``self.subprocess``
    :param limit: stop after this many pairs; the default 0 never matches
        the 1-based counter, so all pairs are loaded
    :return: whatever ``self.subprocess`` returns for (data, label)
    """
    features, labels = [], []
    for seen, proteins in enumerate(getPairs(fin_pair, title=False), start=1):
        feature, target = self.loadPpair(dir_in, proteins)
        features.append(feature)
        labels.append(target)
        if seen == limit:
            break
    return self.subprocess(
        np.array(features),
        np.array(labels),
        test_size=0,
        random_state=123,
        onehot=onehot,
        is_shuffle=is_shuffle,
    )
def loadTest(self, fin_pair, dir_in, onehot=False, is_shuffle=False):
    """
    Load the pair features found in ``dir_in`` for the pairs in ``fin_pair``.

    For each pair the file ``<dir_in>/<A>_<B>.npy`` is loaded.  A pair is
    labelled 1 when its record has fewer than three fields or its third field
    is ``'1'``, otherwise 0.  Pairs whose feature file cannot be loaded are
    reported on stdout and skipped.

    :param fin_pair: pair file read with ``getPairs`` (no title line)
    :param dir_in: directory holding the ``<A>_<B>.npy`` feature files
    :param onehot: forwarded to ``self.subprocess``
    :param is_shuffle: forwarded to ``self.subprocess``
    :return: whatever ``self.subprocess`` returns for (data, label)
    """
    x_test = []
    y_test = []
    for proteins in getPairs(fin_pair, title=False):
        # dir_in is joined exactly once; joining it a second time produced
        # "<dir_in>/<dir_in>/..." whenever dir_in was a relative path.
        eachfile = os.path.join(dir_in, '%s_%s.npy' % (proteins[0], proteins[1]))
        try:
            elem = np.load(eachfile)
        except (OSError, ValueError, EOFError):
            # Missing or unreadable feature file: report and skip the pair.
            print('not find feature of this pair', str(proteins))
            continue
        x_test.append(elem)
        # A record without a label column is treated as positive.
        y_test.append(1 if len(proteins) < 3 or proteins[2] == '1' else 0)
    data = np.array(x_test)
    label = np.array(y_test)
    return self.subprocess(data, label, test_size=0, random_state=123,
                           onehot=onehot, is_shuffle=is_shuffle)
def loadPair(self, fin_pair, dir_in, limit=0):
    """
    Load pair features and store them split by label on the instance.

    For each pair in ``fin_pair`` the file ``<dir_in>/<A>_<B>.npy`` is loaded.
    A pair counts as positive when its record has fewer than three fields or
    its third field is ``'1'``; every other pair is negative.  Pairs whose
    feature file cannot be loaded are reported on stdout and skipped.

    :param fin_pair: pair file read with ``getPairs`` (no title line)
    :param dir_in: directory holding the ``<A>_<B>.npy`` feature files
    :param limit: when non-zero and smaller than both class sizes, each class
        is truncated to its first ``limit`` entries
    :return: None; results are stored in ``self.positive`` and
        ``self.negative`` (stacked arrays, or ``[]`` for an empty class)
    """
    positive = []
    negative = []
    for proteins in getPairs(fin_pair, title=False):
        # dir_in is joined exactly once; joining it a second time produced
        # "<dir_in>/<dir_in>/..." whenever dir_in was a relative path.
        eachfile = os.path.join(dir_in, '%s_%s.npy' % (proteins[0], proteins[1]))
        try:
            elem = np.load(eachfile)
        except (OSError, ValueError, EOFError):
            # Missing or unreadable feature file: report and skip the pair.
            print('not find feature of this pair', str(proteins))
            continue
        # A record without a label column is treated as positive.
        if len(proteins) < 3 or proteins[2] == '1':
            positive.append(elem)
        else:
            negative.append(elem)
    if limit != 0 and limit < min(len(positive), len(negative)):
        positive, negative = positive[:limit], negative[:limit]
    print('positive : ', len(positive))
    print('negative : ', len(negative))
    # np.stack rejects an empty sequence, so an empty class stays a plain list.
    self.positive = np.stack(positive) if positive else []
    self.negative = np.stack(negative) if negative else []
def getPairseq(self, fin_fasta, fin_ID_pair, fout_seq_pair, saveID=False, num=0, multi=True):
    """
    Write the sequences of every ID pair in ``fin_ID_pair`` to numbered files.

    Sequences are looked up in the dict returned by
    ``self.getDict(fin_fasta, multi=multi)``.  Output goes to
    ``<prefix>_<k>.txt`` (sequences, one per line) and ``<prefix>_ID_<k>.txt``
    (``A<TAB>B<TAB>label`` lines), where ``<prefix>`` is ``fout_seq_pair`` cut
    at its first ``'.'`` and ``k`` starts at 0.

    Example arguments noted by the original author::

        fin_fasta = '/home/jjhnenu/data/PPI/release/featuredb/positiveV1.fasta'
        fin_ID_pair = '/home/jjhnenu/data/PPI/release/pairdata/positive_2049.txt'
        fout_seq_pair = '/home/jjhnenu/data/PPI/release/pairdata/positive_2049_seq.txt'

    :param fin_fasta: fasta file handed to ``self.getDict``
    :param fin_ID_pair: tab-separated pair file; an optional third field is
        used as the label (empty string when the record does not have exactly
        three fields)
    :param fout_seq_pair: output path used to derive the file name prefix
    :param saveID: when true, a ``>ID`` line precedes each sequence
    :param num: when non-zero, start a new pair of output files after every
        ``num`` pairs
    :param multi: forwarded to ``self.getDict``
    :return: None
    :raises KeyError: if an ID of a pair is missing from the sequence dict
    """
    mydict = self.getDict(fin_fasta, multi=multi)
    # NOTE: the path is cut at its FIRST '.', so dots in directory names or a
    # leading './' shorten the prefix as well (kept for compatibility).
    prefix = fout_seq_pair.split('.')[0]

    def open_outputs(index):
        # Open the sequence file and its ID file for chunk ``index``.
        seq_file = open(prefix + '_%d.txt' % index, 'w')
        try:
            id_file = open(prefix + '_ID_%d.txt' % index, 'w')
        except Exception:
            seq_file.close()
            raise
        return seq_file, id_file

    file_num = 0
    count = 0
    fo, fo_ID = open_outputs(file_num)
    try:
        for record in getPairs(fin_ID_pair, sep='\t', title=False):
            a = record[0]
            b = record[1]
            c = record[2] if len(record) == 3 else ''
            if saveID:
                fo.write('>%s\n' % a)
            fo.write(mydict[a] + '\n')
            if saveID:
                fo.write('>%s\n' % b)
                # NOTE(review): the collapsed original does not show whether
                # this ID-file write belongs inside ``if saveID`` - confirm.
                fo_ID.write('%s\t%s\t%s\n' % (a, b, c))
            fo.write(mydict[b] + '\n')
            fo.flush()
            count = count + 1
            if num != 0 and count == num:
                # Chunk is full: rotate to the next pair of output files.
                file_num = file_num + 1
                count = 0
                fo.close()
                fo_ID.close()
                fo, fo_ID = open_outputs(file_num)
    finally:
        # Always release the handles, also when a lookup or write fails.
        fo.close()
        fo_ID.close()
    print()
def base_compose(self, dirout_feature, fin_pair, dir_feature_db, feature_type='V_PSSM', fout_pair=''):
    """
    Build one padded feature file per protein pair.

    For each pair (A, B) from ``fin_pair`` the single-protein features
    ``<dir_feature_db>/A.npy`` and ``<dir_feature_db>/B.npy`` are loaded,
    combined by the padding method selected through ``feature_type`` and
    saved as ``<dirout_feature>/A_B.npy``.  Pairs with a missing feature file,
    a length outside 50..2000 or a maximum value above 20 are reported on
    stdout and skipped.

    :param dirout_feature: output directory (passed to ``check_path`` first)
    :param fin_pair: pair file read with ``getPairs``
    :param dir_feature_db: directory holding the per-protein ``.npy`` files
    :param feature_type: one of ``Feature_type.V_PSSM``, ``H_PSSM``,
        ``SEQ_1D`` or ``SEQ_2D``; any other value prints a message and stops
        processing at the first pair that passes the checks
    :param fout_pair: optional file receiving the ``A<TAB>B`` line of every
        pair that passed the checks; nothing is written when empty
    :return: None
    """
    check_path(dirout_feature)
    fo = open(fout_pair, 'w') if fout_pair != '' else None
    try:
        for row, pairs in enumerate(getPairs(fin_pair), start=1):
            a = pairs[0]
            b = pairs[1]
            fa = os.path.join(dir_feature_db, a + '.npy')
            fb = os.path.join(dir_feature_db, b + '.npy')
            print('loading %d th feature pair' % row)
            has_a = os.access(fa, os.F_OK)
            has_b = os.access(fb, os.F_OK)
            if not (has_a and has_b):
                print(
                    '===============features of pairs not found %s %s================' % (a, b),
                    has_a, has_b)
                continue
            # NOTE(review): allow_pickle=True executes pickled payloads; only
            # safe while the feature database comes from a trusted source.
            pa = np.load(fa, allow_pickle=True)
            pb = np.load(fb, allow_pickle=True)
            if (len(pa) < 50 or len(pa) > 2000 or max(pa) > 20) or (len(pb) < 50 or len(pb) > 2000 or max(pb) > 20):
                print('wrong length or x')
                continue
            # The pair is recorded before padding, as in the original flow.
            if fo is not None:
                fo.write('%s\t%s\n' % (a, b))
                fo.flush()
            # padding
            if feature_type == Feature_type.V_PSSM:
                pc = self.padding_PSSM(pa, pb, vstack=True)
            elif feature_type == Feature_type.H_PSSM:
                pc = self.padding_PSSM(pa, pb, vstack=False)
            elif feature_type == Feature_type.SEQ_1D:
                pc = self.padding_seq1D(pa, pb, vstack=False)
            elif feature_type == Feature_type.SEQ_2D:
                pc = self.padding_seq2D(pa, pb)
            else:
                print('incoreect feature_type')
                return
            # Save the padded pair feature.
            fout = os.path.join(dirout_feature, "%s_%s.npy" % (a, b))
            np.save(fout, pc)
            del pc, pa, pb
    finally:
        # Close the pair list on every exit path, including the early return.
        if fo is not None:
            fo.close()