def main(para):
    """Predict interface residues for a hard-coded list of protein pairs.

    Steps performed, in order:

    1. Fill in default options on ``para`` (the dict is modified in place).
    2. If ``para['ModelFile']`` does not exist on disk, run
       ``cross_validation.main`` on a copy of ``para`` with ``SplitFold``
       set to ``'1'`` (presumably this trains and saves the model -- confirm
       against cross_validation).
    3. Write the pair list to ``list_from_user.txt``, build features with
       ``prepare_feature.main`` and score them with ``model_predict``.
    4. Write the per-residue values to ``para['ExeFile'] + 'data.txt'``,
       read them back, group them per protein pair and report, via
       ``show``, the residue numbers whose value is at least
       ``PredictCutoff``.

    Args:
        para: dict of string options.  ``'ExeFile'`` has no default and must
            be supplied by the caller; it is used as a path prefix.
    """
    ## Default options (only applied when the caller did not set them)
    for key, value in (('ListSize', '-1'),
                       ('ThreadNum', '1'),
                       ('RandomSeed', '2014'),
                       ('ModelName', 'RF-bin'),
                       ('ModelFile', 'set1_all_new.txt.fea.max.RF-bin'),
                       ('SolutionNum', '10'),
                       ('PredictCutoff', '0.5')):
        para.setdefault(key, value)
    # Guarding on 'DockMethod' alone would overwrite a FeatureType chosen by
    # the caller, so the default is applied only when neither key is given.
    if 'DockMethod' not in para and 'FeatureType' not in para:
        para['FeatureType'] = 'SaveResidue'

    ## Step 1: docking all pdb chain pairs and train
    if not os.path.exists(para['ModelFile']):
        import cross_validation
        para1 = para.copy()
        para1['SplitFold'] = '1'
        cross_validation.main(para1)

    ## Step 2: docking new pairs and predict
    ## prepare pdb pair list
    pdblistfile = 'list_from_user.txt'
    with open(pdblistfile, 'w') as listfh:
        listfh.write('Hhp1\tTas3\t4HOK\tA\t3D1D\tA\n')
        #listfh.write('Hhp1\tMoc3\t4HOK\tA\tMOC3_modbase\t \n')
        listfh.write('Hhp1\tPpc89\t4HOK\tA\tPPC89_Modbase\t \n')

    ## docking them and generate features
    ## (always regenerated, even when the feature file already exists)
    feature_file = 'features_for_predicting.txt'
    import prepare_feature
    para2 = para.copy()
    para2['ListFile'] = pdblistfile
    para2['OutFile'] = feature_file
    para2['ListFormat'] = 'p1/p2/pdb1/ch1/pdb2/ch2'
    prepare_feature.main(para2)

    ## 2.c: predict their contact probabilities
    from cross_validation import add_residue_label, model_predict
    add_residue_label(feature_file)
    predfile = model_predict(feature_file,
                             model=para['ModelName'],
                             mfile=para['ModelFile'])

    ## Step 3: get predicted values for each residue
    from cross_validation import map_pdb_residue
    residue_value = map_pdb_residue(predfile)
    datafile = para['ExeFile'] + 'data.txt'
    with open(datafile, 'w') as outfile:
        for g1g2, res, val in residue_value:
            # 'a=b' pair key becomes two tab-separated columns; the third
            # value column is written as the constant -1.
            outfile.write('%s\t%s\t-1\t%s\n'
                          % ('\t'.join(g1g2.split('=')), res, val))

    from evaluate_perform import read_residue_data, group_residue
    idx, val1, val2 = read_residue_data(datafile)
    pp_val = group_residue(idx, val2)  ## using predicted value

    cutoff = float(para['PredictCutoff'])  # loop-invariant, parse once
    for p1, p2 in pp_val:
        res = pp_val[(p1, p2)]
        # Residue keys look like '<protein>:...:<number>'; collect the
        # distinct protein prefixes seen for this pair.
        pp = list(set(r.split(':')[0] for r in res))

        ## reformat protein names
        if p1 == p2 and len(pp) == 1:
            # Take the single name itself, not the one-element list, so the
            # prefix comparisons below can match.
            p1 = p2 = pp[0]
        elif len(pp) == 2:
            p1, p2 = pp
        else:
            # Single pre-formatted argument: same text on Python 2 and 3.
            print('Failed to map %s %s %s' % (p1, p2, pp))

        show(p1)
        show(p2)
        int1 = [r for r in res
                if r.split(':')[0] == p1 and res[r] >= cutoff]
        int2 = [r for r in res
                if r.split(':')[0] == p2 and res[r] >= cutoff]
        ord1 = sorted(int(r.split(':')[-1]) for r in int1)
        ord2 = sorted(int(r.split(':')[-1]) for r in int2)
        show(','.join(str(i) for i in ord1))
        show(','.join(str(i) for i in ord2))
        show()
def main(para):
    """Run k-fold cross-validation of the residue model and report scores.

    For every (train, test) split produced by ``split_data`` the function
    builds features with ``prepare_feature.main``, trains a model on the
    training list, predicts on the test list and collects the mapped
    per-residue values.  After all folds, predictions (and the extra feature
    columns selected by ``FeatureType``) are intersected with the known
    residue labels, written to ``cv_cocry_<name>_<RandomSeed>.txt`` and
    scored with ``performance`` for FPR/TPR and TPR/PPV.

    When ``SplitFold`` is ``'1'`` the full ``ListFile`` is used for both
    training and testing and no split files are removed.

    Args:
        para: dict of string options; missing options are filled in place.
            ``'DataPath'`` is required only when ``'ListFile'`` is absent.
            ``'OutFile'`` and ``'MapFile'`` are read from the per-fold copy
            after ``prepare_feature.main`` runs -- presumably that call
            sets them; confirm against prepare_feature.
    """
    ## Default options
    if 'ListFile' not in para:
        # Kept as an explicit test so 'DataPath' is only read when needed.
        para['ListFile'] = para['DataPath'] + '/set1_all_new.txt'
    for key, value in (('SplitFold', '10'),
                       ('SolutionNum', '10'),
                       ('RandomSeed', '2014'),
                       ('ThreadNum', '1'),
                       ('FeatureType', 'SaveResidue'),
                       ('ModelName', 'RF-bin')):
        para.setdefault(key, value)

    pred_value = []  ## from learning model

    ## Extra feature columns (by index) evaluated next to the model output.
    ## Any other feature type, including the disabled 'SaveZDOCK', gets none.
    feature_type = para['FeatureType']
    if feature_type == 'SaveResidue':
        other_vals = {2: [], 3: [], 4: [], 5: []}
    elif feature_type in ('SavePatchDock', 'SaveSequence'):
        other_vals = {2: [], 3: [], 4: []}
    else:
        other_vals = {}

    all_res, pp_int = get_res_labels(para)

    import prepare_feature
    single_fold = para['SplitFold'] == '1'
    folds = split_data(para['ListFile'],
                       int(para['SplitFold']),
                       int(para['RandomSeed']))
    for train, test in folds:
        if single_fold:
            train = para['ListFile']
            test = para['ListFile']

        # NOTE: a disabled experiment comparing DDI networks of the train
        # and test sets (generate_hSIN / domain_map / add_more_info) used
        # to sit here under `if False:`; it never ran and was dropped.

        ###############################################################
        ## Train
        show(train, False)
        para2 = para.copy()  ## copy parameters
        para2['ListFile'] = train
        para2['ListSize'] = '-1'
        para2['ListFormat'] = 'p1/p2/pdb1/ch1/pdb2/ch2'
        prepare_feature.main(para2)
        resfile = combine_pdb_residue(para2['OutFile'])
        show(add_residue_label(resfile, pp_int), False)
        mfile = train_model(resfile, model=para['ModelName'])

        ###############################################################
        ## Test
        show(test, False)
        para2['ListFile'] = test
        prepare_feature.main(para2)
        resfile = combine_pdb_residue(para2['OutFile'])
        show(add_residue_label(resfile, None), False)
        outfile = model_predict(resfile, model=para['ModelName'], mfile=mfile)

        ## Save values
        if not os.path.exists(outfile):
            # Skip this fold.  The cleanup below is skipped as well, so the
            # split files of such a fold stay on disk.
            continue
        pred_value.extend(map_pdb_residue(outfile, para2['MapFile']))
        for idx in other_vals:
            other_vals[idx].extend(
                map_pdb_residue(resfile, para2['MapFile'], useidx=idx))
        show()

        if not single_fold:  ## clean files
            _remove_files_with_prefix(train)
            _remove_files_with_prefix(test)

    ## Get the real labels of residues in predicted protein pairs
    pred_pp = set(pp for pp, res, val in pred_value)
    show('Performance based on %s protein pairs' % len(pred_pp))
    real_value = [(pp, res, val) for pp, res, val in all_res
                  if pp in pred_pp]  ## from cocrystal

    ## Compare the known and predicted values against real_value
    save_list = [
        ('cocry', real_value),
        ('pred' + '-' + para['FeatureType'] + '-' + para['ModelName'],
         pred_value),
    ]
    for idx in other_vals:
        save_list.append((para['FeatureType'] + '-idx%s' % idx,
                          other_vals[idx]))

    for name, value in save_list:
        head = []
        real = []
        pred = []
        for idx, res, val1, val2 in intersect_poslist(real_value, value):
            p1, p2 = idx.split('=')
            head.append([p1, p2, res])
            real.append(val1)
            pred.append(val2)

        report = 'cv_cocry_' + name + '_' + para['RandomSeed'] + '.txt'
        with open(report, 'w') as reportfh:
            for cols, real_val, pred_val in zip(head, real, pred):
                reportfh.write('%s\t%s\t%s\n'
                               % ('\t'.join(cols), real_val, pred_val))

        ## evaluate
        show(name, False)
        area, px, py, pc = performance(real, pred, x='FPR', y='TPR')
        show(area, False)
        area, px, py, pc = performance(real, pred, x='TPR', y='PPV')
        show(area, False)
        show()


def _remove_files_with_prefix(prefix):
    """Delete every path matching ``prefix + '*'``, ignoring failures."""
    import glob
    for path in glob.glob(prefix + '*'):
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup: a path that cannot be removed (already
            # gone, a directory, no permission) must not abort the run.
            pass