def _5train(f1out, eachfile, train, dirout_feature, f2resultOut, fin_model=None):
    """Fit the 1-D CNN for one dataset directory.

    The validation pair list is read from ``<f1out>/<eachfile>/validate.txt``,
    features come from ``dirout_feature`` and all artefacts are written to
    ``<f2resultOut>/<eachfile>``.

    :param fin_model: optional pre-trained model to resume from
    """
    print('training on the model')
    feature_dir = dirout_feature
    result_dir = os.path.join(f2resultOut, eachfile)
    check_path(result_dir)
    validate = {
        'fin_pair': os.path.join(f1out, eachfile, 'validate.txt'),
        'dir_in': feature_dir,
    }
    entry(
        result_dir,
        train,
        feature_dir,
        model_type=Param.CNN1D_OH,
        limit=0,
        onehot=True,
        kernel_size=90,
        epochs=80,
        filters=300,
        batch_size=500,
        validate=validate,
        fin_model=fin_model)
def _1posiPair(dirDIP, dirout):
    """Extract positive TMP/nonTMP pairs for each supported species.

    Walks ``dirDIP`` for the known species sub-directories and feeds every
    file whose name contains ``id_pair`` to ``handlePair``, writing results
    under ``dirout/<species>``.

    posi tmp and nontmp for each
    save 1000 pair save 880 protein fasta file/8DIPPredict/data/Ecoli/2pair.fasta
    save 369 tmp file/8DIPPredict/data/Ecoli/2tmp.fasta
    save 511 nontmp file/8DIPPredict/data/Ecoli/2nontmp.fasta
    """
    species_dirs = ('Ecoli', 'Mus', 'Human', 'SC', 'HP')
    for eachdir in os.listdir(dirDIP):
        if eachdir not in species_dirs:
            continue
        currentdir = os.path.join(dirDIP, eachdir)
        for eachfile in os.listdir(currentdir):
            if 'id_pair' not in eachfile:
                continue
            foutdir = os.path.join(dirout, eachdir)
            check_path(foutdir)
            handlePair(foutdir, sep='\t',
                       fin=os.path.join(currentdir, eachfile),
                       jumpStep=[5], keepOne=True)
def composeNegaPair(currentdir, fpositive, foutdir):
    """Compose candidate negative TMP-nonTMP pairs and filter them down to
    pairs whose proteins differ in subcellular location.

    :param currentdir: directory holding 2tmp.list / 2nontmp.list / 2pair.tsv
    :param fpositive: global positive pair file to exclude
    :param foutdir: output directory for the intermediate and final tables
    """
    ftmp = os.path.join(currentdir, '2tmp.list')
    fnontmp = os.path.join(currentdir, '2nontmp.list')
    fposi = os.path.join(currentdir, '2pair.tsv')
    f1pair = os.path.join(foutdir, '1pair.tsv')
    f2pair = os.path.join(foutdir, '2pair.tsv')
    f2pairInfo = os.path.join(foutdir, '2pairInfo.tsv')
    f3pairInfo = os.path.join(foutdir, '3pairInfo.tsv')
    f4pairInfo_subcell = os.path.join(foutdir, '4pairInfo_subcell.tsv')
    f4pairInfo_subcell_differ = os.path.join(foutdir, '4pairInfo_subcell_differ.tsv')
    dirout_related = os.path.join(foutdir, '4pairInfo_subcell_differ_related')
    check_path(dirout_related)
    # generate 1.5x as many random TMP-nonTMP combinations as positives
    df = pd.read_table(fposi, header=None)
    composeTMP_nonTMP(ftmp, fnontmp, f1pair, int(df.shape[0] * 1.5))
    # NOTE(review): the second call re-reads f1pair and writes f2pair again,
    # which appears to discard the filtering done by the first call unless
    # dropPositiveAndRepeate appends -- confirm intended behaviour.
    dropPositiveAndRepeate(f1pair, fpositive, f2pair)
    dropPositiveAndRepeate(f1pair, fposi, f2pair)
    getPairInfo_TMP_nonTMP(f2pair, f2pairInfo, sep='\t', checkTMP=False, keepOne=True)
    saveQualified(f2pairInfo, f3pairInfo)
    handleRow(f3pairInfo, f4pairInfo_subcell, calcuSubcell)
    saveDifferSubcell(f4pairInfo_subcell, f4pairInfo_subcell_differ)
    saveRelated(f4pairInfo_subcell_differ, dirout_related)
def saveRelated(fin_info, dirout):
    """Dump the pair table, fasta files and protein lists derived from a
    pair-info table into ``dirout``.

    :param fin_info: pair info table
        # fin_info = os.path.join(dirout, '2subcellular_differ.tsv')
    :param dirout: output directory
    """
    check_path(dirout)
    print('save related to', dirout)

    def _out(name):
        return os.path.join(dirout, name)

    # simplified pair table
    simplifyTable(fin_info, _out('2pair.tsv'))
    # fasta files for all / tmp / nontmp proteins
    extractPairAndFasta(fin_info, _out('2pair.fasta'),
                        fout_tmp_fasta=_out('2tmp.fasta'),
                        fout_nontmp_fasta=_out('2nontmp.fasta'))
    # protein id lists and their info tables
    getproteinlist(fin_info,
                   ftmp=_out('2tmp.list'),
                   fnontmp=_out('2nontmp.list'),
                   fall=_out('2all.list'),
                   ftmp_info=_out('2tmp_info.tsv'),
                   ftmp_nontmp_info=_out('2nontmp_info.tsv'),
                   fall_info=_out('2all_info.tsv'))
def _2_1combineFasta(fposiInfo, fnegaInfo, dirout):
    """Concatenate the positive and negative pair-info tables and derive the
    combined fasta/list outputs.

    :param fposiInfo: positive pair info table (headerless tsv)
    :param fnegaInfo: negative pair info table (headerless tsv)
    :param dirout: output directory; produces
        ``<dirout>/dirRelated/2pair.tsv`` and ``2pair.fasta`` among others
    """
    check_path(dirout)
    fpairinfo = os.path.join(dirout, '1pairinfo.tsv')
    dirRelated = os.path.join(dirout, 'dirRelated')
    # stack the two info tables and persist them as one tsv
    combined = pd.concat([pd.read_table(fposiInfo, header=None),
                          pd.read_table(fnegaInfo, header=None)])
    combined.to_csv(fpairinfo, header=None, index=None, sep='\t')
    # derive fasta / list files from the merged table
    saveRelated(fpairinfo, dirRelated)
def groupCalculate(dirin, filetype='all'):
    """Aggregate ``_evaluate.txt`` metrics for every run under each group
    directory and write a per-group ``result.csv`` (with a mean row) to the
    mirrored ``statistic`` tree.

    Expected layout:
    /home/jjhnenu/data/PPI/release/result/group/p_fp_1_1/1/all/_evaluate.txt

    :param dirin: e.g. /home/jjhnenu/data/PPI/release/result/group
    :param filetype: sub-directory holding the metrics file
    """
    metric_names = ['Loss', 'Acc', 'Precision', 'Recall', 'F1score', 'MCC']
    for eachdir in os.listdir(dirin):
        subdir = os.path.join(dirin, eachdir)
        print(metric_names)
        rows = []
        for eachsubdir in os.listdir(subdir):
            fin = os.path.join(subdir, eachsubdir, filetype, '_evaluate.txt')
            if not os.access(fin, os.F_OK):
                continue
            with open(fin, 'r') as fi:
                line = fi.readline()[:-1]
            # the metrics line looks like "...:[v1, v2, ...]"
            values = line.split(':')[-1][1:-1].split(',')
            rows.append(values)
            print(str(values)[1:-1])
        t = pd.DataFrame(rows).apply(pd.to_numeric)
        t.columns = metric_names
        t.loc['mean'] = t.apply(lambda col: col.mean())
        dirout = os.path.join(subdir.replace('result', 'statistic'))
        check_path(dirout)
        t.sort_index(inplace=True)
        t.to_csv(os.path.join(dirout, 'result.csv'), index=True, header=True)
        print(dirout)
def load_ed_models(model_paths, pc):
    """Load pre-trained encoder-decoder (soft-attention) models.

    :param model_paths: iterable of model folder paths
    :param pc: parameter collection; each model's parameters are attached
        to its own sub-collection ``ed<i>``
    :return: list of loaded SoftAttention models

    Fixes: Python-2 ``print`` statement (a syntax error under Python 3,
    which the rest of this codebase targets) and an unclosed file handle
    for the hyper-parameter file.
    """
    ed_models = []
    ed_model_params = []
    for i, path in enumerate(model_paths):
        print('...Loading nmt model {}'.format(i))
        ed_model_folder = check_path(path, 'ED_MODEL_FOLDER_{}'.format(i),
                                     is_data_path=False)
        best_model_path = ed_model_folder + '/bestmodel.txt'
        # hyper-parameters were dumped as "KEY = VALUE" lines in best.dev
        with codecs.open(ed_model_folder + '/best.dev', 'r', 'utf-8') as reader:
            hyperparams_dict = dict(line.strip().split(' = ')
                                    for line in reader.readlines())
        model_hyperparams = {
            'INPUT_DIM': int(hyperparams_dict['INPUT_DIM']),
            'HIDDEN_DIM': int(hyperparams_dict['HIDDEN_DIM']),
            'LAYERS': int(hyperparams_dict['LAYERS']),
            'VOCAB_PATH': hyperparams_dict['VOCAB_PATH']
        }
        # a fix for vocab path when transferring files b/n vm
        model_hyperparams['VOCAB_PATH'] = check_path(
            path + '/vocab.txt', 'vocab_path', is_data_path=False)
        ed_model_params.append(pc.add_subcollection('ed{}'.format(i)))
        ed_model = SoftAttention(ed_model_params[i], model_hyperparams,
                                 best_model_path)
        ed_models.append(ed_model)
    return ed_models
def _7trainAndTest(dirout_feature, fin_train, fin_validate, dirout):
    # time 664909.4274818897 ~ 7.6 day
    """Train the 1-D CNN on ``fin_train`` while validating on
    ``fin_validate``; all artefacts go to ``dirout``.

    :param dirout_feature: directory with pre-computed pair features
    :param fin_train: training pair file
    :param fin_validate: validation pair file
    :param dirout: result directory
    """
    print('training on the model')
    check_path(dirout)
    validation = {'fin_pair': fin_validate, 'dir_in': dirout_feature}
    entry(
        dirout,
        fin_train,
        dirout_feature,
        model_type=Param.CNN1D_OH,
        limit=0,
        onehot=True,
        kernel_size=90,
        epochs=80,
        filters=300,
        batch_size=50,
        validate=validation)
    print('testing the model')
def calculateResults(dirout, dirin, filename='_evaluate.txt', row=0, resultfilename='result.csv'): """ %s\%s\_evaluate.txt :param dirin: contains a list of \%s\_evaluate.txt :return: dirin = '/home/19jiangjh/data/PPI/release/result_in_paper/alter_ratio/p_fw_v1_train_validate_v2_fixpositive_2/2/test_DIP/' dirout = dirin calculateResults(dirout,dirin,filename='log.txt',row = 2,resultfilename = 'result.csv') """ # dirin = r'E:\githubCode\BioComputeCodeStore\JiangJiuhong\data\PPI\stage2\processPair2445\pair\positiveV1\onehot\result' check_path(dirout) count = 0 data = [] # columns = ['loss', 'acc', 'metric_precision', 'metric_recall', 'metric_F1score', 'matthews_correlation'] columns = ['Loss', 'Acc', 'Precision', 'Recall', 'F1score', 'MCC'] indexs = [] print(columns) print(dirin) for eachdir in os.listdir(dirin): print(eachdir) if '.' in eachdir: continue fin = os.path.join(dirin, eachdir) sep = '\\' if '\\' in filename else '/' if sep in filename: for f in filename.split(sep): fin = os.path.join(fin, f) else: fin = os.path.join(fin, filename) # fin = '%s\%s\_evaluate.txt' % (dirin, eachdir) if not os.access(fin, os.F_OK): print('not access to:', fin) continue with open(fin, 'r') as fi: real_row = 0 while (real_row != row): fi.readline() real_row = real_row + 1 line = fi.readline()[:-1] # sum += np.array(line.split(':')[-1][1:-1].split(',')) line = line.replace('nan', '0') print('****************', line, '********************') data.append(line.split(':')[-1][1:-1].split(',')) indexs.append(eachdir) count = count + 1 print(str(line.split(':')[-1][1:-1].split(','))[1:-1]) mydata = pd.DataFrame(data) mydata.replace('nan', 0, inplace=True) t = mydata.apply(pd.to_numeric) t.loc['mean'] = t.apply(lambda x: x.mean()) indexs.append('mean') t.index = indexs t.columns = columns t.sort_index(inplace=True) t.to_csv(os.path.join(dirout, resultfilename), index=True, header=True)
def getNpy(self, fin_fasta, out_dir, multi=True):
    """Convert each qualified sequence in ``fin_fasta`` to a numeric
    feature array and save it as ``<out_dir>/<ID>.npy``.

    Only proteins passing ``checkProtein`` (length 50-2000, no uncommon
    residues) are kept; empty feature results are skipped.

    :param fin_fasta: fasta input, >ID\\nseq\\n
    :param out_dir: directory for the per-protein .npy files
    :param multi: passed through to ``getYield`` (multi-line fasta)
    """
    check_path(out_dir)
    # Fix: `p` was used without being defined in this scope; the sibling
    # method getPhsi_Blos instantiates Protein() locally, so do the same.
    p = Protein()
    for ID, seq in self.getYield(fin_fasta, multi=multi):
        if p.checkProtein(seq, 50, 2000, uncomm=True):
            filename = os.path.join(out_dir, "%s.npy" % ID)
            result = self.seq2num(seq)
            if len(result) != 0:
                np.save(filename, result)
def getPhsi_Blos(self, fin_fasta, out_dir, multi=True, checkprotein=True):
    """Write a physicochemical/BLOSUM feature matrix (.npy) per sequence.

    :param fin_fasta: fasta input, >ID\\nseq\\n
    :param out_dir: directory for the per-protein .npy files
    :param multi: passed through to ``getYield``
    :param checkprotein: when True, skip sequences failing the
        length/uncommon-residue check (50-2000, no uncommon residues)
    """
    check_path(out_dir)
    checker = Protein()
    for ID, seq in self.getYield(fin_fasta, multi=multi):
        # guard clauses instead of nested ifs
        if checkprotein and not checker.checkProtein(seq, 50, 2000, uncomm=True):
            continue
        feature = self.phsi_blos(seq)
        if len(feature) == 0:
            continue
        np.save(os.path.join(out_dir, "%s.npy" % ID), feature)
def savepredict(fin_pair, dir_in, fin_model, dirout_result):
    """Evaluate a saved model on the pairs in ``fin_pair`` and write the
    predictions (result.csv) plus a metrics log (log.txt) to
    ``dirout_result``.

    :param fin_pair: tsv of protein pairs (tmp, nontmp[, label])
    :param dir_in: directory with pre-computed pair features
    :param fin_model: path to the saved keras model (.h5)
    :param dirout_result: output directory
    """
    # fin_pair = '/home/19jiangjh/data/PPI/release/pairdata/p_fw/1/0/test.txt'
    # dir_in = '/home/19jiangjh/data/PPI/release/feature/p_fp_fw_19471'
    check_path(dirout_result)
    onehot = True
    dataarray = BaseData().loadTest(fin_pair, dir_in, onehot=onehot, is_shuffle=False)
    x_test, y_test = dataarray
    model = models.load_model(fin_model, custom_objects=MyEvaluate.metric_json)
    result = model.evaluate(x_test, y_test, verbose=False, batch_size=90)
    result_predict = model.predict(x_test, batch_size=90)
    # Fix: model.predict_classes() is deprecated (removed after TF 2.6);
    # threshold the sigmoid output instead, as the newer copy of this
    # function elsewhere in the file already does.
    result_class = (result_predict > 0.5).astype("int32")
    result_predict = result_predict.reshape(-1)
    result_class = result_class.reshape(-1)
    y_test = y_test.reshape(-1)
    print('Loss:%f,ACC:%f' % (result[0], result[1]))
    df = pd.read_table(fin_pair, header=None)
    df.rename(columns={0: 'tmp', 1: 'nontmp'}, inplace=True)
    df['real_label'] = list(y_test)
    df['predict_label'] = result_class
    df['predict'] = result_predict
    df.to_csv(os.path.join(dirout_result, 'result.csv'), index=False)
    result_manual = MyEvaluate().evaluate_manual(y_test, result_predict)
    print(
        '[acc,metric_precision, metric_recall, metric_F1score, matthews_correlation]'
    )
    print(result_manual)
    print('[acc,precision,sensitivity,f1,mcc,aps,aucResults,specificity]')
    result_manual2 = calculate_performance(len(x_test), y_test, result_class,
                                           result_predict)
    print(result_manual2)
    with open(os.path.join(dirout_result, 'log.txt'), 'w') as fo:
        fo.write('test dataset %s\n' % fin_pair)
        fo.write('Loss:%f,ACC:%f\n' % (result[0], result[1]))
        fo.write('evaluate result:' + str(result) + '\n')
        fo.write(
            'manual result:[acc,metric_precision, metric_recall, metric_F1score, matthews_correlation]\n'
        )
        fo.write('manual result:' + str(result_manual) + '\n')
        fo.write(
            'manual result2:[acc,precision,sensitivity,f1,mcc,aps,aucResults,specificity]\n'
        )
        fo.write('manual result2:' + str(result_manual2) + '\n')
        fo.flush()
def entry(dirout, fin_pair, dir_in, model_type=Param.CNN1D, limit=0, onehot=False, kernel_size=3, epochs=60, filters=250, batch_size=100, validate=None):
    """Load pair features, build the model and run training.

    :param dirout: result output directory
    :param fin_pair: training pair file (tab-separated pair per line)
    :param dir_in: directory holding the per-pair feature .npy files
    :param model_type: one of the Param model constants
    :param limit: cap on the number of pairs loaded (0 = no cap)
    :param onehot: whether features are one-hot encoded
    :param validate: optional {'fin_pair': ..., 'dir_in': ...} explicit
        validation set; when None, 10% of the training data is split off

    NOTE(review): several callers in this file pass ``fin_model=...`` to
    ``entry``, which this signature does not accept -- confirm whether a
    different overload is imported there or the parameter is missing here.
    """
    # model_type = Param.CNN1D
    # fin_pair = '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in = '/home/jjhnenu/data/PPI/release/feature/pssm400_feature_1D/p_fp_fw_2_1_1/all/'
    # fout = '/home/jjhnenu/data/PPI/release/result/pssm400_feature_1D/p_fp_fw_2_1_1/all/'
    # if 'all' not in dirout:return
    check_path(dirout)
    print('dirout:', dirout)
    # dir_in = dirout.replace(des, 'feature')
    print('dir_in:', dir_in)
    print('fin_pair', fin_pair)
    bd = BaseData()
    if validate == None:
        # no explicit validation set: hold out 10% of the training pairs
        (x_train, y_train), (x_test, y_test) = bd.load(fin_pair, dir_in, test_size=0.1, limit=limit, onehot=onehot)
    else:
        x_test, y_test = bd.loadTest(validate['fin_pair'], validate['dir_in'], onehot=onehot, is_shuffle=True)
        x_train, y_train = bd.loadTest(fin_pair, dir_in, onehot=onehot, is_shuffle=True)
    print('Build and fit model...')
    print('x_train.shape[1:]', x_train.shape[1:])
    mm = MyModel(
        model_type=model_type,
        input_shape=x_train.shape[1:],
        filters=filters,
        kernel_size=kernel_size,
        pool_size=2,
        hidden_dims=250,
        batch_size=batch_size,
        epochs=epochs,
    )
    mm.process(dirout, x_train, y_train, x_test, y_test)
    print('save result to %s' % dirout)
def base_compose(self, dirout_feature, fin_pair, dir_feature_db, feature_type='V_PSSM', fout_pair=''):
    """Stack per-protein features into per-pair feature files.

    For every (a, b) pair in ``fin_pair``, the pre-computed features
    ``<dir_feature_db>/<a>.npy`` / ``<b>.npy`` are loaded, padded according
    to ``feature_type`` and saved as ``<dirout_feature>/<a>_<b>.npy``.

    :param dirout_feature: output directory for pair features
    :param fin_pair: pair list file, proteinA\\tproteinB per line
    :param dir_feature_db: directory of per-protein .npy features
    :param feature_type: one of the Feature_type constants.
        NOTE(review): the default is the literal string 'V_PSSM'; this
        assumes Feature_type.V_PSSM compares equal to it -- confirm.
    :param fout_pair: when non-empty, pairs that survive filtering are
        echoed to this file
    """
    check_path(dirout_feature)
    fo = open(fout_pair, 'w') if fout_pair != '' else None
    row = 0
    for pairs in getPairs(fin_pair):
        a = pairs[0]
        b = pairs[1]
        # print(pairs)  # ['O35668', 'P00516']
        fa = os.path.join(dir_feature_db, a + '.npy')
        fb = os.path.join(dir_feature_db, b + '.npy')
        row = row + 1
        print('loading %d th feature pair' % row)
        # skip pairs with a missing feature file on either side
        if not (os.access(fa, os.F_OK) and os.access(fb, os.F_OK)):
            print(
                '===============features of pairs not found %s %s================'
                % (a, b), os.access(fa, os.F_OK), os.access(fb, os.F_OK))
            continue
        pa = np.load(fa, allow_pickle=True)
        pb = np.load(fb, allow_pickle=True)
        # both sequences must be 50-2000 long with residue codes <= 20
        # (values > 20 indicate uncommon residues such as X)
        if (len(pa) < 50 or len(pa) > 2000 or max(pa) > 20) or (len(pb) < 50 or len(pb) > 2000 or max(pb) > 20):
            print('wrong length or x')
            continue
        if fo != None:
            fo.write('%s\t%s\n' % (a, b))
            fo.flush()
        # padding: combine the two feature arrays per the requested layout
        if feature_type == Feature_type.V_PSSM:
            pc = self.padding_PSSM(pa, pb, vstack=True)
        elif feature_type == Feature_type.H_PSSM:
            pc = self.padding_PSSM(pa, pb, vstack=False)
        elif feature_type == Feature_type.SEQ_1D:
            pc = self.padding_seq1D(pa, pb, vstack=False)
        # elif feature_type == Feature_type.SEQ_1D_OH:pc = self.padding_seq1D(pa,pb,vstack=False)
        elif feature_type == Feature_type.SEQ_2D:
            pc = self.padding_seq2D(pa, pb)
        else:
            print('incoreect feature_type')
            return
        # save the padded pair feature
        fout = os.path.join(dirout_feature, "%s_%s.npy" % (a, b))
        np.save(fout, pc)
        del pc, pa, pb
    if fo != None:
        fo.close()
def build_img(self, font_name='', bg_color=None, b_color_img_name="", txt_freq=None):
    """Render a word cloud from a frequency dict and save it as a PNG.

    :param font_name: optional font name (looked up as ../font/<name>.ttf);
        blank/whitespace keeps the currently configured font
    :param bg_color: background colour passed to WordCloud
    :param b_color_img_name: mask image file name under ../image/
    :param txt_freq: {word: frequency} mapping

    Fix: replaced the inverted empty branch
    (``if not ...: pass else: ...``) with a direct positive condition.
    """
    try:
        if font_name.strip():
            self.font_name = font_name
        self.font_path = os.path.join(my_dirpath, "../font/%s.ttf" % self.font_name)
        self.bg_color = bg_color
        self.back_color_img = imread(
            os.path.join(my_dirpath, "../image/%s" % b_color_img_name))
        my_log.logger.info("Start creating word clouds...")
        wc = WordCloud(
            font_path=self.font_path,  # font file
            mode="RGBA",
            background_color=self.bg_color,  # background colour
            max_words=self.max_words,  # maximum number of words shown
            mask=self.back_color_img,  # mask / background image
            max_font_size=self.max_font_size,  # largest font size
            random_state=self.random_state,  # number of random colour schemes
            colormap="viridis",  # random colours
            relative_scaling=1,
            scale=1.2)
        wc.generate_from_frequencies(txt_freq)
        # Colour from the mask image (kept for reference):
        # image_colors = ImageColorGenerator(self.back_color_img)
        # wc.recolor(color_func=image_colors)
        # Or render with a fixed colour mapping:
        # grouped_color_func = GroupedColorFunc(color_to_words, default_color)
        # wc.recolor(color_func=grouped_color_func)
        # Save the rendered cloud to a timestamped PNG.
        save_img_path = os.path.join(my_dirpath, "./image/build_img/")
        check_path(save_img_path)
        img_path = save_img_path + str(time.time()) + ".png"
        wc.to_file(img_path)
        my_log.logger.info("build img success.img_path:%s" % img_path)
    except Exception as e:
        my_log.logger.error("build img error...")
        my_log.logger.error(e)
def _3clusters(fin_posi, fin_nega, fin_tmp, fin_nontmp, dirout):
    """Merge positive/negative pairs with cd-hit cluster labels and split
    the labelled pairs back into positive and negative tables.

    cd-hit (0.4): http://weizhong-lab.ucsd.edu/cdhit_suite/cgi-bin/index.cgi?cmd=cd-hit
    produces the *.clstr inputs.

    :param fin_posi: positive pair tsv
    :param fin_nega: negative pair tsv
    :param fin_tmp: cd-hit cluster file for TMPs (*.clstr)
    :param fin_nontmp: cd-hit cluster file for nonTMPs (*.clstr)
    :param dirout: output directory; produces 4posi.tsv / 4nega.tsv etc.
    """
    check_path(dirout)

    def _out(name):
        return os.path.join(dirout, name)

    f2pair = _out('1pair.tsv')
    f3out_tmp = _out('3tmp.tsv')
    f3out_nontmp = _out('3nontmp.tsv')
    f3pair = _out('3pair.tsv')
    f3pair_clstr = _out('3pair_clstr.tsv')
    f4pair = _out('4pair.tsv')
    f4posi = _out('4posi.tsv')
    f4nega = _out('4nega.tsv')
    # positives + negatives in a single table
    concatPAN(fin_posi, fin_nega, f2pair)
    # cluster files -> tables, then attach cluster labels to each pair
    cluster2Table(fin_tmp, f3out_tmp)
    cluster2Table(fin_nontmp, f3out_nontmp)
    pairWithClusterLable(f2pair, f3out_tmp, f3out_nontmp,
                         fout_clus=f3pair_clstr, fout=f3pair)
    # split labelled pairs back into positives and negatives
    saveRelated_posi_nega(f3pair, f4pair, f4posi, f4nega)
def _1posiSampleHumanPair(fin_1posiInfo, dirout):
    """Filter the positive pair info down to human-human interactions and
    export the pair table, fasta files and protein lists.

    :param fin_1posiInfo: qualified positive pair info table
    :param dirout: output directory
    """
    check_path(dirout)

    def _out(name):
        return os.path.join(dirout, name)

    f9human_human = _out('9human_human.tsv')  # 44210 pairinfo
    # annotate species for every pair
    findSpecies(fin_1posiInfo, _out('8species.tsv'), _out('8tmp_species.tsv'),
                _out('8nontmp_species.tsv'), _out('8sameSpecies.tsv'),
                _out('8posiSpecies.tsv'), col=[1, 8])
    # keep only HUMAN-HUMAN pairs
    relatedSpecies(_out('8posiSpecies.tsv'), 'HUMAN',
                   _out('9human_related.tsv'), f9human_human,
                   col=[0, 7, 14, 15])
    # save 11995 protein fasta file/10humanTrain/1positive/2pair.fasta
    # save 3513 tmp file/10humanTrain/1positive/2tmp.fasta
    # save 8482 nontmp file/10humanTrain/1positive/2nontmp.fasta
    simplifyTable(f9human_human, _out('2pair.tsv'))
    extractPairAndFasta(f9human_human, _out('2pair.fasta'),
                        fout_tmp_fasta=_out('2tmp.fasta'),
                        fout_nontmp_fasta=_out('2nontmp.fasta'))
    getproteinlist(f9human_human,
                   ftmp=_out('2tmp.list'),
                   fnontmp=_out('2nontmp.list'),
                   fall=_out('2all.list'),
                   ftmp_info=_out('2tmp_info.tsv'),
                   ftmp_nontmp_info=_out('2nontmp_info.tsv'),
                   fall_info=_out('2all_info.tsv'))
def save(self, dirout, flist, ratios, limit, labels=None, sep='\t', filename='all.txt', groupcount=1, repeate=True):
    """Compose samples from ``flist`` according to ``ratios`` and write one
    shuffled file per group under ``<dirout>/<idx>/<filename>``.

    flist, ratios and labels (when given) must have the same length.

    :param flist: list of sample files
    :param ratios: sampling ratio per file, e.g. [0.5, 0.25, 0.25]
    :param limit: total sample cap
    :param labels: e.g. [1, 0, 0] -- required when the files carry no label
    :param sep: output column separator

    Example:
        flist = [fin_p, fin_fp, fin_fw]
        ComposeData().save(fout, flist, [0.5, 0.25, 0.25], 2049,
                           labels=[1, 0, 0])
    """
    composed = self.doCompose(flist, ratios, limit, labels=labels,
                              groupcount=groupcount, repeate=repeate)
    for idx, elem in enumerate(composed):
        shuffled = pd.concat(elem).sample(frac=1)  # shuffle the rows
        group_dir = os.path.join(dirout, str(idx))
        check_path(group_dir)
        quick_save(shuffled, os.path.join(group_dir, filename), sep=sep)
def save(self):
    """For each configured dataset type: compose the labelled sample files,
    then split every group's all.txt into train/test (80/20).

    NOTE(review): the source formatting was collapsed onto one line; the
    train/test split loop is assumed to run inside the per-type loop --
    confirm against the original layout.
    """
    for i in range(len(self.fin)):
        dirout = os.path.join(self.dirin, self.type[i])
        check_path(dirout)
        ComposeData().save(dirout, self.fin[i], self.ratio[i], self.limit,
                           labels=self.label[i], filename='all.txt')
        print('divided to train and test')
        for eachdir in os.listdir(dirout):
            # group directories have no dot in their name
            if '.' in eachdir:
                continue
            fin = os.path.join(dirout, eachdir, 'all.txt')
            train = os.path.join(dirout, eachdir, 'train.txt')
            test = os.path.join(dirout, eachdir, 'test.txt')
            fouts = [train, test]
            ratios = [0.8, 0.2]
            PairDealer().part(fin, ratios, fouts)
def process(fin_fasta, dir_feature_db, dir_feature, dir_pair):
    """Build the per-protein feature db from a fasta file, then compose
    per-pair features for every pair file in ``dir_pair``.

    :param fin_fasta: >ID\\nseq\\n
    :param dir_feature_db: output dir for per-protein .npy features
    :param dir_feature: output dir for per-pair (composed) features
    :param dir_pair: dir of pair files (# ID pair proteinA\\tproteinB\\n)
    """
    check_path(dir_feature_db)
    check_path(dir_feature)
    # fasta -> per-protein features
    FastaDealer().getNpy(fin_fasta, dir_feature_db)
    # each pair file -> stacked pair features
    for eachfile in os.listdir(dir_pair):
        print(eachfile)
        BaseFeature().base_compose(dir_feature,
                                   os.path.join(dir_pair, eachfile),
                                   dir_feature_db,
                                   feature_type=Feature_type.SEQ_1D)
def get_city(self):
    """Load the city list, preferring the cached json file and falling back
    to fetching (and caching) it from the remote menu page."""
    if check_path(self.city_names):
        try:
            cached = pathlib.Path(self.city_names).read_text()
            self.citys = json.loads(cached)
            return
        except Exception as e:
            error(f"城市文件读取错误: {e}")
    try:
        page = self.menu_session.get(self.url_list["citys"])
        self.save_debug_file(page.text, "get_city.html")
        matches = self.reg_list["citys"].findall(page.text)
        addsucess()
        self.citys = json.loads(matches[0])
        create_json(self.citys, self.city_names)
    except Exception as e:
        error(f"获取城市信息失败: {e}")
        addfailed()
        exit()
    success(f"{len(self.citys.keys())} citys")
def savepredict(fin_pair, dir_in, fin_model, dirout_result, batch_size=90, limit=0, posi=False, onehot=True):
    """Evaluate a saved keras model on ``fin_pair`` and write a prediction
    table sorted by confidence (result.csv) plus a short log (log.txt).

    :param fin_pair: pair file (tmp, nontmp[, label]); when the label
        column is missing it is filled from ``posi``
    :param dir_in: directory of pre-computed pair features
    :param fin_model: path to the saved model (.h5)
    :param dirout_result: output directory
    :param batch_size: evaluation/prediction batch size
    :param limit: when non-zero, only the first ``limit`` pairs are kept
    :param posi: default label (1/0) assumed when no label column exists
    :param onehot: passed through to the feature loader
    """
    # fin_pair = '/home/19jiangjh/data/PPI/release/pairdata/p_fw/1/0/test.txt'
    # dir_in = '/home/19jiangjh/data/PPI/release/feature/p_fp_fw_19471'
    # fin_model = '/home/19jiangjh/data/PPI/release/result_in_paper/alter_ratio/p_fw_train_validate/1/_my_model.h5'
    # dirout_result = '/home/19jiangjh/data/PPI/release/result_in_paper/alter_ratio/p_fw_train_validate/1/test'
    check_path(dirout_result)
    print('predict ', fin_pair, '...')
    print('using feature ', dir_in, '...')
    print('save result in ', dirout_result)
    df = pd.read_table(fin_pair, header=None)
    # fill a default label column when the pair file has only two columns
    if df.shape[1] != 3:
        df[2] = 1 if posi else 0
    dataarray = BaseData().loadTest(fin_pair, dir_in, onehot=onehot, is_shuffle=False, limit=limit)
    x_test, y_test = dataarray
    print('load model...', fin_model)
    # from tensorflow.keras.models import load_model
    # model = load_model(fin_model, custom_objects=MyEvaluate.metric_json)
    model = models.load_model(fin_model, custom_objects=MyEvaluate.metric_json)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=MyEvaluate.metric)
    result = model.evaluate(x_test, y_test, verbose=1, batch_size=batch_size)
    result_predict = model.predict(x_test, batch_size=batch_size)
    # threshold at 0.5 instead of the deprecated predict_classes()
    result_class = (result_predict > 0.5).astype("int32")
    result_predict = result_predict.reshape(-1)
    # result_class = model.predict_classes(x_test,batch_size=batch_size)
    # UserWarning: `model.predict_classes()` is deprecated and will be removed after 2021-01-01.
    result_class = result_class.reshape(-1)
    # y_test = y_test.reshape(-1)
    print('Loss:%f,ACC:%f' % (result[0], result[1]))
    if limit != 0:
        df = df[:limit]
    df.columns = ['tmp', 'nontmp', 'real_label']
    # df.rename(columns={0: 'tmp', 1: 'nontmp'}, inplace=True)
    # df['real_label'] = list(y_test)
    df['predict_label'] = result_class
    df['predict'] = result_predict
    # most confident predictions first
    df.sort_values(by=['predict'], ascending=False).to_csv(os.path.join(
        dirout_result, 'result.csv'), index=False)
    # result_manual = MyEvaluate().evaluate_manual(y_test, result_predict)
    # print('[acc,metric_precision, metric_recall, metric_F1score, matthews_correlation]')
    # print(result_manual)
    # print('[acc,precision,sensitivity,f1,mcc,aps,aucResults,specificity]')
    # result_manual2 =calculate_performance(len(x_test), y_test, result_class, result_predict)
    # print(result_manual2)
    with open(os.path.join(dirout_result, 'log.txt'), 'w') as fo:
        fo.write('test dataset %s\n' % fin_pair)
        fo.write('Loss:%f,ACC:%f\n' % (result[0], result[1]))
        fo.write('evaluate result:' + str(result) + '\n')
        fo.write(
            'manual result:[acc,metric_precision, metric_recall, metric_F1score, matthews_correlation]\n'
        )
        # fo.write('manual result:' + str(result_manual) + '\n')
        # fo.write('manual result2:[acc,precision,sensitivity,f1,mcc,aps,aucResults,specificity]\n')
        # fo.write('manual result2:'+str(result_manual2)+'\n')
        fo.flush()
    # (continuation of a generator defined above this chunk; indentation
    # reconstructed -- NOTE(review): confirm against the original file)
    projection = {'_id': True, 'UNIPROID': True}
    docs = do.GetALL(projection=projection, limit=0)
    for protein in docs:
        if protein['UNIPROID'] == '': continue
        # a record may hold several ids; yield each one trimmed
        for pro in multiSplit(protein['UNIPROID']):
            yield pro.strip()

def writeProtins(fout):
    """Write every protein id yielded by getallProtein() to ``fout``.

    NOTE(review): ``proteins`` is built (character-by-character via
    ``extend`` on strings) but never used; ``saveList`` is fed a fresh
    ``getallProtein()`` generator instead -- the loop looks like dead code,
    confirm it can be removed.
    """
    proteins = []
    for protein in getallProtein():
        proteins.extend(protein)
    saveList(getallProtein(), fout)

if __name__ == '__main__':
    print('start', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    start = time.time()
    db_name = 'ttd'
    table_target = 'target'
    fout = 'file/otherfile/2ttd_target_protein.list'
    check_path('file/otherfile/')
    writeProtins(fout)
    pass
    print('stop', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()))
    print('time', time.time() - start)
f2nontmpInfo = os.path.join(dirout,'2nontmpInfo.tsv') f2tmp = os.path.join(dirout, '2tmp.list') f2nontmp = os.path.join(dirout, '2nontmp.list') f3pair = os.path.join(dirout,'3pair.tsv') f3pair_norepeat = os.path.join(dirout,'3pair_norepeat.tsv') f3pairInfo = os.path.join(dirout,'2pairInfo.tsv') f3tmp_fasta = os.path.join(dirout,'3tmp.fasta') f3nontmp_fasta = os.path.join(dirout,'3nontmp.fasta') f3all_fasta = os.path.join(dirout,'3all.fasta') dir3bathData = os.path.join(dirout,'3batchData') check_path(dir3bathData) f4sample1k = os.path.join(dir3bathData,'3batchData') ''' get TMP, nonTMP list query 5208 tmp and 15186 nontmp ''' # generateHumanLists(f1tmp, f1nontmp) ''' get qualified single info ''' # getSingleInfo(f1tmp,f2tmpInfo,fin_type='single') # getSingleInfo(f1nontmp,f2nontmpInfo,fin_type='single') ''' get qualified protein
''' input file : positive pair and negative pair AC-pair of TMP-nonTMP fasta generate feature fasta ''' print() ''' fasta to feature ''' # >ID\nseq\n fin_fasta = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_DIP_fasta20170301_simple.seq' dir_feature_db = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_featuredb' check_path(dir_feature_db) fd = FastaDealer() fd.getNpy(fin_fasta, dir_feature_db) ''' generate feature pair 50-2000 no X ''' dir_feature_db = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_featuredb' dir_feature = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_feature/' # ID pair proteinA\tproteinB\n dir_pair = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_TMP_nonTMP/' dirout_pair = '/home/jjhnenu/data/PPI/release/otherdata/DIP/_TMP_nonTMP_qualified/' check_path(dir_feature) check_path(dirout_pair) for eachfile in os.listdir(dir_pair): print(eachfile)
def crosshumanTrain(modelreuse=False): f2all = 'file/10humanTrain/3cluster/4pair.tsv' # 正负样本非1:1了 f1out = 'file/10humanTrain/4train/group' f2out = 'file/10humanTrain/4train/cross/group' f3out = 'file/10humanTrain/4train/cross' # dirout_feature = '/home/19jjhnenu/Data/SeqTMPPI2W/feature/129878/' # f2resultOut = '/home/19jjhnenu/Data/SeqTMPPI2W/result/10humanTrain_80epoch/group_reusemodel_5CV' dirout_feature = '/root/19jjhnenu/Data/SeqTMPPI2W/feature/129878/' f2resultOut = '/root/19jjhnenu/Data/SeqTMPPI2W/result/10humanTrain_80epoch/group_reusemodel_5CV' check_path(f2resultOut) check_path(f2out) # flist = [os.path.join(f1out,x,'all.txt') for x in os.listdir(f1out)] # concatFile(flist,f2all) ''' train:test = 5:1 ''' train = os.path.join(f3out, 'train_vali.txt') test = os.path.join(f3out, 'test.txt') ratios_tvt = [5, 1] f3outs = [train, test] # PairDealer().part(os.path.join(f3out,'all.txt'),ratios_tvt,f3outs) # PairDealer().part(f2all,ratios_tvt,f3outs) ''' 5cv ''' ratios_tvt = [1] * 5 f3outs = [os.path.join(f2out, '%d.txt' % x) for x in range(5)] # PairDealer().part(os.path.join(f3out,'all.txt'),ratios_tvt,f3outs) # PairDealer().part(os.path.join(f3out,'train_vali.txt'),ratios_tvt,f3outs) ''' cross train ''' for cv in range(5): # oldfile = '-1' oldfile = '2' for elem in range(5): if elem == cv: continue f2dirout = os.path.join(f2resultOut, str(cv)) f3dirout = os.path.join(f2dirout, str(elem)) fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5') if not os.access(fin_model, os.F_OK) or not modelreuse: fin_model = None train = os.path.join(f2out, '%d.txt' % elem) validate = {} validate['fin_pair'] = os.path.join(f2out, '%d.txt' % cv) validate['dir_in'] = dirout_feature onehot = True entry( f3dirout, train, dirout_feature, model_type=Param.CNN1D_OH, limit=0, onehot=onehot, kernel_size=90, epochs=80, # epochs=2, filters=300, batch_size=500, validate=validate, fin_model=fin_model) oldfile = str(elem)
def crossTrain(f2out, dirout_feature, f2resultOut, modelreuse=True):
    '''
    cross train and test

    5-fold cross-validation driver.  For each validation fold ``cv`` the
    remaining folds would be trained in sequence, warm-starting from the
    previously saved '_my_model.h5' when ``modelreuse`` is True.  NOTE: every
    actual training/testing call below (entry / savepredict /
    calculateResults) is commented out, so as written this function only
    builds paths and creates output directories.

    :param f2out: directory holding the per-fold pair files '0.txt'..'4.txt'
    :param dirout_feature: directory with the precomputed pair features
    :param f2resultOut: root output directory, one subdirectory per fold
    :param modelreuse: when True, reuse the previous part's saved model
    '''
    # Fixed data layout under file/4train/.
    f1out = 'file/4train/'
    f2outdir = os.path.join(f1out, str(0))
    fin_pair = os.path.join(f2outdir, 'all.txt')
    f2outdir = os.path.join(f1out, '5CV', 'data')
    check_path(f2outdir)
    # train:test = 5:1 split (the split call itself is commented out).
    train = os.path.join(f2outdir, 'train_vali.txt')
    test = os.path.join(f2outdir, 'test.txt')
    ratios_tvt = [5, 1]
    f3outs = [train, test]
    # PairDealer().part(fin_pair,ratios_tvt,f3outs)
    '''
    train model
    '''
    f2outdir = os.path.join(f1out, '5CV', 'data')
    check_path(f2outdir)
    train = os.path.join(f2outdir, 'train_vali.txt')
    # f2out = 'file/4train/5CV/elem'
    # Split train_vali into 5 equal folds (commented out; assumed done).
    ratios_tvt = [1] * 5
    f3outs = [os.path.join(f2out, '%d.txt' % x) for x in range(5)]
    # PairDealer().part(train,ratios_tvt,f3outs)
    limit = 0
    # eachdir = 'benchmark'
    # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % eachdir
    # f2resultOut = '/home/19jjhnenu/Data/SeqTMPPI2W/result/5CV_1'
    '''
    cross train
    '''
    for cv in range(5):
        # oldfile names the sub-run whose saved model may be reused next.
        oldfile = '-1'
        f2dirout = os.path.join(f2resultOut, str(cv))
        fin_model = ''
        f3dirout = ''
        for elem in range(5):
            if cv == elem:
                continue
            f3dirout = os.path.join(f2dirout, str(elem))
            fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
            # Warm-start only when the previous model exists and reuse is on.
            if not os.access(fin_model, os.F_OK) or not modelreuse:
                fin_model = None
            train = os.path.join(f2out, '%d.txt' % elem)
            validate = {}
            validate['fin_pair'] = os.path.join(f2out, '%d.txt' % cv)
            validate['dir_in'] = dirout_feature
            onehot = False
            # entry(f3dirout, train, dirout_feature, model_type=Param.CNN1D_OH, limit=0, onehot=onehot, kernel_size=90,
            #       epochs=80,
            #       # epochs=2,
            #       filters=300, batch_size=500, validate=validate,
            #       fin_model=fin_model)
            # entry(f3dirout, train, dirout_feature, model_type=Param.TRANSFORMER, limit=10, onehot=onehot, kernel_size=90,
            #       # epochs=80,
            #       epochs=2,
            #       filters=300, batch_size=500, validate=validate,
            #       fin_model=fin_model)
            #
            # oldfile = str(elem)
            #
            # print(f3dirout)
        # calculateResults(f2dirout, f2dirout, resultfilename='result.csv')
        # eachdir = 'benchmark'
        # eachdir = 'benchmark'
        # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % eachdir
        # print('testing the model on test dataset')
        # fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        # fin_test = 'file/4train/5CV/data/test.txt'
        # dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_test/%d'%cv
        # check_path(dirout_result_test)
        # savepredict(fin_test, dirout_feature, fin_model, dirout_result_test,batch_size=500,limit=2000,onehot = onehot)
        '''
        testing on DIP all.txt in DIP/predict
        '''
        # fin_test = 'file/8DIPPredict/predict/all.txt'
        # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % 'DIP'
        # fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        # dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_DIP/%d' % cv
        # check_path(dirout_result_test)
        # savepredict(fin_test, dirout_feature, fin_model, dirout_result_test, batch_size=500,onehot = onehot)
        '''
        testing on DIP all.txt in DIP/data
        '''
        # fin_test = 'file/8DIPPredict/data/all.txt'
        # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % 'DIP'
        # fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        # dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_DIP_posi/%d' % cv
        # check_path(dirout_result_test)
        # savepredict(fin_test, dirout_feature, fin_model, dirout_result_test, batch_size=500,onehot = onehot)
        '''
        testing on all.txt in Imex
        '''
        # fin_test = 'file/8ImexPredict/4pair.tsv'
        # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % 'IMEx'
        # fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        # dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_IMEx_posi/%d' % cv
        # check_path(dirout_result_test)
        # savepredict(fin_test, dirout_feature, fin_model, dirout_result_test, batch_size=500,onehot = onehot)
        '''
        IMEx + -
        '''
        # fin_test = 'file/8ImexPredict/predict/0/all.txt'
        # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % 'IMEx'
        # fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        # dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_IMEx/%d' % cv
        # check_path(dirout_result_test)
        # savepredict(fin_test, dirout_feature, fin_model, dirout_result_test, batch_size=500,onehot = onehot)
        '''
        testing on DIP all.txt in DIP/data/Human
        '''
        # for eachfile in ['Ecoli', 'Human', 'Mus', 'SC']:
        #     fin_test = 'file/8DIPPredict/data/%s/2pair.tsv'%eachfile
        #     dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % 'DIP'
        #     fin_model = os.path.join(f2dirout, oldfile, '_my_model.h5')
        #     dirout_result_test = '/home/19jjhnenu/Data/Phsi_Blos/result/5CV_1_DIP_%s/%d' % (eachfile,cv)
        #     check_path(dirout_result_test)
        #     savepredict(fin_test, dirout_feature, fin_model, dirout_result_test, batch_size=500,onehot = onehot)
    # NOTE(review): the triple-quote nesting below is unbalanced in the
    # recovered source; lexed greedily it turns the next commented-out block
    # into an inert string literal -- confirm against the original file.
    '''
    # # fin_pair = 'file/8ImexPredict/data/2pair.tsv'
    # fin_fasta = 'file/8ImexPredict/data/2pair.fasta'
    # eachdir = 'IMEx'
    # dir_feature_db = '/home/19jjhnenu/Data/Phsi_Blos/featuredb/%s/' % eachdir
    # dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % eachdir
    #
    # getFeature(fin_pair, fin_fasta, dir_feature_db, dirout_feature)
    '''
    # NOTE(review): bare name expression -- a no-op remnant of a section
    # header string; `train` is bound above, so this does not raise.
    train
    # NOTE(review): because of the unbalanced quotes above, the following
    # string swallows what looks like live GPU-setup code ("eachdir =
    # 'benchmark'" ... tf.compat.v1.ConfigProto); as reconstructed here it
    # never executes.
    '''
    eachdir = 'benchmark'
    dirout_feature = '/home/19jjhnenu/Data/Phsi_Blos/feature/%s/' % eachdir
    dir_in = dirout_feature
    eachdir = 'benchmark'
    dirout = '/home/19jjhnenu/Data/Phsi_Blos/result/%s/' % eachdir
    check_path(dirout)
    validate = {}
    validate['fin_pair'] = 'file/4train/0/validate.txt'
    validate['dir_in'] = dir_in
    onehot = False
    import os
    import tensorflow as tf
    gpu_id = '0,1,2,3'
    # gpu_id = '4,5,6,7'
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    os.system('echo $CUDA_VISIBLE_DEVICES')
    tf_config = tf.compat.v1.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    SC
    uniprotPairFromImex
    '''
# NOTE(review): trailing script sections recovered after crossTrain; placed
# at module level here (they look like one-off driver code), so they run at
# import time -- confirm placement against the original file.
# # mpf = '/home/jjhnenu/data/PPI/release/criteria/allcession_KW-0472_189043.list'
# tmpf = '/home/jjhnenu/data/PPI/release/criteria/allcession_KW-0812_131609.list'
# spf = '/home/jjhnenu/data/PPI/release/criteria/allcession_soluble_614454.list'
# finPair = '/home/jjhnenu/data/PPI/release/otherdata/Ecoli/2Ecoli20170205_id_pair_12246.txt'
# foutPair = '/home/jjhnenu/data/PPI/release/otherdata/Ecoli/TMP_SP/TMP_SP.txt'
# _path,_fname = os.path.split(foutPair)
# check_path(_path)
# getTmp_SpPair(tmpf,spf,finPair, foutPair,type1='TMP',type2='SP',crossover = True)
'''
count pair
'''
# dirout_pair = r'E:\githubCode\BioComputeCodeStore\JiangJiuhong\data\PPI\release\otherdata\DIP\_TMP_nonTMP_qualified_drop_positive'
# check_path(dirout_pair)
# for eachfile in os.listdir(dirout_pair):
#     fout_pair = os.path.join(dirout_pair, eachfile)
#     countpair(fout_pair)
'''
imex 2020 0708
'''
# Build a TMP-vs-soluble-protein pair list from the 2020-07-09 IMEx dump.
tmpf = '/home/jjhnenu/data/PPI/release/criteria/allcession_KW-0812_131609.list'
spf = '/home/jjhnenu/data/PPI/release/criteria/allcession_soluble_614454.list'
finPair = '/home/jjhnenu/data/PPI/release/otherdata/uniprotPiarFromImex20200709/getpair15955.csv'
foutPair = '/home/jjhnenu/data/PPI/release/otherdata/uniprotPiarFromImex20200709/TMP_SP/TMP_SP.txt'
_path,_fname = os.path.split(foutPair)
check_path(_path)
getTmp_SpPair(tmpf,spf,finPair, foutPair,type1='TMP',type2='SP',crossover = True)
def main(limit=5):
    '''
    Load pair features, then build, train and evaluate one model.

    The commented blocks below are alternative feature/model configurations
    (PSSM 2D, seq 1D, seq 1D one-hot, PSSM hstack, PSSM-400) that were tried
    previously; the active configuration is the one-hot sequence 2-D CNN.

    :param limit: cap on the number of pairs loaded (passed through to
        BaseData().load); the default of 5 keeps smoke runs fast, 0 was used
        for full runs (see commented lines).
    '''
    # if __name__ == '__main__':
    # limit = 0
    # limit = 5
    onehot = False
    '''
    load feature
    '''
    print('Loading feature...')
    '''
    pssm
    '''
    # model_type = Param.CNN2D
    # tpssmd = TmpPSSMData()
    # # (x_train, y_train), (x_test, y_test) = tpssmd.loadPAN(inPDir, inNDir, limit=limit)
    #
    # fin_pair = '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in = '/home/jjhnenu/data/PPI/release/feature/pssm_feature_2D/p_fp_fw_2_1_1/all/'
    # fout = '/home/jjhnenu/data/PPI/release/result/pssm_feature_2D/p_fp_fw_2_1_1/all'
    # check_path(fout)
    # (x_train, y_train), (x_test, y_test) = tpssmd.load(fin_pair,dir_in,limit=limit)
    '''
    seq1D
    '''
    # model_type = Param.CNN1D
    # fin_pair = '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in = '/home/jjhnenu/data/PPI/release/feature/seq_feature_1D/p_fp_fw_2_1_1/all/'
    # fout = '/home/jjhnenu/data/PPI/release/result/seq_feature_1D/p_fp_fw_2_1_1/all/'
    '''
    seq1D onehot
    '''
    # model_type = Param.CNN1D_OH
    # onehot = True
    # fin_pair = '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in = '/home/jjhnenu/data/PPI/release/feature/seq_feature_1D/p_fp_fw_2_1_1/all'
    # fout = '/home/jjhnenu/data/PPI/release/result/seq_feature_1D_onehot/p_fp_fw_2_1_1/all/'
    '''
    test seq2D
    Could not load dynamic library 'libnvinfer.so.6'; dlerror: libnvinfer.so.6: cannot open shared object file: No such file or directory
    2020-04-30 21:41:50.315008: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'libnvinfer_plugin.so.6'; dlerror: libnvinfer_plugin.so.6: cannot open shared object file: No such file or directory
    2020-04-30 21:41:50.315029: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:30] Cannot dlopen some TensorRT libraries. If you would like to use Nvidia GPU with TensorRT, please make sure the missing libraries mentioned above are installed properly
    '''
    # Active configuration: one-hot encoded sequence pairs fed to a 2-D CNN.
    onehot = True
    model_type = Param.CNN2D
    fin_pair = '/home/jjhnenu/data/PPI/release/pairdata/p_fp_fw_2_1_1/all.txt'
    dir_in = '/home/jjhnenu/data/PPI/release/feature/seq_feature_2D/p_fp_fw_2_1_1/all/'
    fout = '/home/jjhnenu/data/PPI/release/result/seq_feature_2D/p_fp_fw_2_1_1/all/'
    '''
    pssm hstack
    '''
    # model_type = Param.CNN2D
    # fin_pair = '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in = '/home/jjhnenu/data/PPI/release/feature/pssm_feature_2D_hstack/p_fp_fw_2_1_1/all/'
    # fout = '/home/jjhnenu/data/PPI/release/result/pssm_feature_2D_hstack/p_fp_fw_2_1_1/all/'
    '''
    pssm 400
    '''
    # model_type = Param.CNN1D
    # fin_pair = '/home/jjhnenu/data/PPI/release/data/p_fp_fw_2_1_1/all.txt'
    # dir_in = '/home/jjhnenu/data/PPI/release/feature/pssm400_feature_1D/p_fp_fw_2_1_1/all/'
    # fout = '/home/jjhnenu/data/PPI/release/result/pssm400_feature_1D/p_fp_fw_2_1_1/all/'
    check_path(fout)
    # Load the (train, test) split of pair features from dir_in.
    (x_train, y_train), (x_test, y_test) = BaseData().load(fin_pair, dir_in, limit=limit, onehot=onehot)
    print('Build and fit model...')
    print('x_train.shape[1:]', x_train.shape[1:])
    mm = MyModel(
        model_type=model_type,
        input_shape=x_train.shape[1:],
        filters=250,
        kernel_size=3,
        pool_size=2,
        hidden_dims=250,
        batch_size=100,
        epochs=80,
    )
    mm.process(fout, x_train, y_train, x_test, y_test)
    print('save result to %s' % fout)