# Imports inferred from usage in this module. The module names behind the CS,
# GS, and utils aliases are assumed to be EPIC's CalculateCoElutionScores,
# GoldStandard, and utils; the unqualified helpers used further down
# (get_fs_comb, feature_selector, bench_clf, get_FA_data, make_predictions,
# get_network_edges, predict_clusters) are assumed to come from utils as well.
import argparse
import copy
import os
import sys

import numpy as np

import CalculateCoElutionScores as CS
import GoldStandard as GS
import utils


def Goldstandard_from_cluster_File(gsF, foundprots=""):
    clusters = GS.Clusters(need_to_be_mapped=False)
    clusters.read_file(gsF)
    # Filter the reference clusters against the set of observed proteins
    if foundprots != "":
        clusters.remove_proteins(foundprots)
    gs = GS.Goldstandard_from_Complexes("All")
    gs.complexes = clusters
    gs.make_pos_neg_ppis()
    return gs
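
# Minimal usage sketch for Goldstandard_from_cluster_File; the file name and
# protein set are illustrative placeholders, not files shipped with EPIC.
#
#   gs = Goldstandard_from_cluster_File("reference_complexes.txt", foundprots)
#   print len(gs.positive), len(gs.negative)
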
def create_goldstandard(clusters, target_taxid, valprots):
    # Orthology-map the reference unless the target species is already human
    if target_taxid != "9606" and target_taxid != "":
        orthmap = GS.Inparanoid(taxid=target_taxid)
    else:
        orthmap = ""
    gs = GS.Goldstandard_from_Complexes("Goldstandard")
    gs.make_reference_data(clusters, orthmap, found_prots=valprots)
    return gs
def orth_map(args):
    clusterF, taxid, outF = args
    clust = GS.Clusters(False)
    clust.read_file(clusterF)
    # Map complexes to the target species via Inparanoid orthology
    orthmap = GS.Inparanoid(taxid=taxid)
    orthmap.mapComplexes(clust)
    clust.merge_complexes()
    clust.filter_complexes()
    outFH = open(outF, "w")
    outFH.write(clust.to_string())
    outFH.close()
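
# Example invocation of orth_map; the arguments are passed as a single tuple.
# The taxid "6239" (C. elegans) and the file names are hypothetical placeholders.
#
#   orth_map(("worm_clusters.txt", "6239", "worm_clusters.mapped.txt"))
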
def make_eval(args):
    pred_clust_F, ref_clust_F = args
    pred_clusters = GS.Clusters(False)
    pred_clusters.read_file(pred_clust_F)
    ref_clusters = GS.Clusters(False)
    ref_clusters.read_file(ref_clust_F)
    scores, head = utils.clustering_evaluation(ref_clusters, pred_clusters, "", True)
    # Return the metrics so callers can report or store them
    return scores, head
def load_data(data, scores, orthmap="", fc=2, mfc=1):
    # `data` is either a list of elution file paths or a directory of them
    if type(data) is list:
        paths = data
    else:
        paths = [os.path.join(data, fn) for fn in next(os.walk(data))[2]]
    elutionDatas = []
    elutionProts = set([])
    for elutionFile in paths:
        # Skip hidden files such as .DS_Store
        if elutionFile.rsplit(os.sep, 1)[-1].startswith("."):
            continue
        elutionFile = elutionFile.rstrip()
        elutionData = CS.ElutionData(elutionFile, frac_count=fc, max_frac_count=mfc)
        if orthmap != "" and orthmap != False:
            mapper = GS.Inparanoid("", inparanoid_cutoff=1)
            mapper.readTable(orthmap, direction=0)
            elutionData.orthmap(mapper)
        elutionDatas.append(elutionData)
        elutionProts = elutionProts | set(elutionData.prot2Index.keys())
        for score in scores:
            score.init(elutionData)
    return elutionProts, elutionDatas
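
# Usage sketch for load_data, mirroring the call in main() below; the directory
# name is a placeholder.
#
#   this_scores = utils.get_fs_comb("11101001")
#   foundprots, elution_datas = load_data("elution_files/", this_scores, fc=2, mfc=1)
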
def rf_cutoff(args):
    pred_clust_F, ref_clust_F, ppiF, cutoff, outF = args
    num_ppis = CS.lineCount(ppiF)
    pred_clusters = GS.Clusters(False)
    pred_clusters.read_file(pred_clust_F)
    ref_clusters = GS.Clusters(False)
    ref_clusters.read_file(ref_clust_F)
    scores, head = utils.clustering_evaluation(ref_clusters, pred_clusters, "", True)
    outFH = open(outF, "w")
    outFH.write("%s\t%i\t%i\t%s\n" % (cutoff, num_ppis, len(pred_clusters.complexes), scores))
    outFH.close()
def Goldstandard_from_PPI_File(gsF, foundprots=""):
    out = GS.Goldstandard_from_Complexes("gs")
    gsFH = open(gsF)
    for line in gsFH:
        line = line.rstrip()
        ida, idb, class_label = line.split("\t")[0:3]
        # Skip pairs with proteins that were not observed in the elution data
        if foundprots != "" and (ida not in foundprots or idb not in foundprots):
            continue
        edge = "\t".join(sorted([ida, idb]))
        if class_label == "positive":
            out.positive.add(edge)
        else:
            out.negative.add(edge)
    gsFH.close()
    return out
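
# Expected input format for Goldstandard_from_PPI_File, as implied by the
# parser above: one interaction per line, tab-separated, with a "positive" or
# "negative" class label in the third column. The protein IDs are illustrative.
#
#   P04637	Q00987	positive
#   P04637	P12345	negative
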
def get_reference_from_net(target_taxid):
    # The boolean flag appears to mark references that still need orthology
    # mapping (compare GS.Clusters(need_to_be_mapped=...) above)
    if target_taxid != "9606":
        reference_clusters = [
            GS.Intact_clusters(True),
            GS.CORUM(True),
            GS.QuickGO("9606", True),
            GS.QuickGO(target_taxid, False),
        ]
    else:
        reference_clusters = [
            GS.Intact_clusters(False),
            GS.CORUM(False),
            GS.QuickGO("9606", False),
        ]
    return reference_clusters
def cut(args):
    fc, scoreF, outF = args
    if fc == "00000000":
        sys.exit()
    this_scores = get_fs_comb(fc)
    scoreCalc = CS.CalculateCoElutionScores("", "", "", "", cutoff=0.5)
    # Read the precomputed score table against an empty gold standard
    empty_gs = GS.Goldstandard_from_Complexes()
    empty_gs.positive = set([])
    empty_gs.negative = set([])
    scoreCalc.readTable(scoreF, empty_gs)
    print scoreCalc.to_predict
    feature_comb = feature_selector([fs.name for fs in this_scores], scoreCalc)
    feature_comb.open()
    outFH = open(outF, "w")
    print >> outFH, "\t".join(feature_comb.scoreCalc.header)
    for i in range(feature_comb.to_predict):
        edge, edge_scores = feature_comb.get_next()
        if edge == "" or edge_scores == []:
            continue
        print >> outFH, "%s\t%s" % (edge, "\t".join(map(str, edge_scores)))
    outFH.close()
    feature_comb.close()
def stability_evaluation(n_fold, all_gs, scoreCalc, clf, output_dir, mode, anno_source, anno_F):
    tmp_train_eval_container = (all_gs.split_into_n_fold2(
        n_fold, set(scoreCalc.ppiToIndex.keys()))["turpleKey"])
    # Dictionary to store the predicted PPIs for each fold
    PPIs_dict_for_each_fold = {}
    # Dictionary to store the predicted complexes for each fold
    complexes_dict_for_each_fold = {}
    for index in range(n_fold):
        train, eval = tmp_train_eval_container[index]
        print "All comp:%i" % len(all_gs.complexes.complexes)
        print "Train comp:%i" % len(train.complexes.complexes)
        print "Eval comp:%i" % len(eval.complexes.complexes)
        print "Num valid ppis in training pos: %i" % len(train.positive)
        print "Num valid ppis in training neg: %i" % len(train.negative)
        print "Num valid ppis in eval pos: %i" % len(eval.positive)
        print "Num valid ppis in eval neg: %i" % len(eval.negative)
        # Evaluate classifier
        bench_clf(scoreCalc, train, eval, clf, output_dir, verbose=True)
        functionalData = ""
        if mode != "exp":
            functionalData = get_FA_data(anno_source, anno_F)
            print "The functional evidence data shape is:"
            print functionalData.scores.shape
        # Predict protein interactions based on n-fold cross validation
        network = make_predictions(scoreCalc, "exp", clf, train, fun_anno="", verbose=False)
        # Write the network to a file for the complex prediction step below
        outFH = open("%s.%s.pred.txt" % (output_dir, mode + anno_source), "w")
        print >> outFH, "\n".join(network)
        outFH.close()
        PPIs_dict_for_each_fold[index] = set(get_network_edges(network))
        # Predict clusters from the predicted PPI network
        predict_clusters("%s.%s.pred.txt" % (output_dir, mode + anno_source),
                         "%s.%s.clust.txt" % (output_dir, mode + anno_source))
        pred_clusters = GS.Clusters(False)
        pred_clusters.read_file("%s.%s.clust.txt" % (output_dir, mode + anno_source))
        complexes_dict_for_each_fold[index] = pred_clusters
        print "fold " + str(index + 1) + " is done"
    # Matrices for the pairwise overlap ratios, initialized to zero
    overlapped_ratio_matrix_PPIs = np.zeros((n_fold, n_fold))
    overlapped_ratio_matrix_complexes = np.zeros((n_fold, n_fold))
    for i in range(0, n_fold):
        for j in range(0, n_fold):
            # Use float division; under Python 2, "/" on ints would floor the ratios
            overlapped_ratio_matrix_PPIs[i, j] = (
                len(PPIs_dict_for_each_fold[i] & PPIs_dict_for_each_fold[j]) /
                ((len(PPIs_dict_for_each_fold[i]) + len(PPIs_dict_for_each_fold[j])) / 2.0))
            # Count the overlapping complexes in both directions and average them
            overlapped_no1 = complexes_dict_for_each_fold[i].getOverlapp(
                complexes_dict_for_each_fold[j], cutoff=0.25)
            overlapped_no2 = complexes_dict_for_each_fold[j].getOverlapp(
                complexes_dict_for_each_fold[i], cutoff=0.25)
            averaged_overlapped_complexes_no = (overlapped_no1 + overlapped_no2) / 2.0
            overlapped_ratio_matrix_complexes[i, j] = (
                averaged_overlapped_complexes_no /
                ((len(complexes_dict_for_each_fold[i].get_complexes()) +
                  len(complexes_dict_for_each_fold[j].get_complexes())) / 2.0))
    print overlapped_ratio_matrix_PPIs
    print overlapped_ratio_matrix_complexes
    # Save the overlap matrices for stability testing
    filename1 = output_dir + " n_fold_cross_validation_PPIs overlap matrix.txt"
    filename2 = output_dir + " n_fold_cross_validation_complexes overlap matrix.txt"
    np.savetxt(filename1, overlapped_ratio_matrix_PPIs, delimiter='\t')
    np.savetxt(filename2, overlapped_ratio_matrix_complexes, delimiter='\t')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--feature_selection", type=str,
                        help="Select which features to use. This is an 8-position array of 0s and 1s, "
                             "where each position determines which co-elution feature to use. "
                             "Features sorted by position are: MI, Bayes, Euclidean, WCC, Jaccard, "
                             "PCCN, PCC, and Apex. default=11101001",
                        default="11101001")
    parser.add_argument("input_dir", type=str,
                        help="Directory containing the elution files for each experiment")
    parser.add_argument("-t", "--taxid", type=str,
                        help="TAXID to automatically download reference from GO, CORUM, and IntAct",
                        default="")
    parser.add_argument("-c", "--cluster", type=str,
                        help="Path to file containing protein cluster reference",
                        default="")
    parser.add_argument("-p", "--ppi", type=str,
                        help="Path to PPI file",
                        default="")
    parser.add_argument("output_dir", type=str,
                        help="Directory containing the output files")
    parser.add_argument("-o", "--output_prefix", type=str,
                        help="Prefix name for all output files",
                        default="Out")
    parser.add_argument("-M", "--classifier", type=str,
                        help="Select which classifier to use. Values: RF SVM, default RF",
                        default="RF")
    parser.add_argument("-n", "--num_cores", type=int,
                        help="Number of cores to be used, default 1",
                        default=1)
    parser.add_argument("-m", "--mode", type=str,
                        help="Run EPIC with experimental, functional, or both evidences. "
                             "Values: EXP, FA, COMB, default: EXP",
                        default="EXP")
    parser.add_argument("-f", "--fun_anno_source", type=str,
                        help="Where to get functional annotation from. Values: STRING or GM or FILE, default=GM",
                        default="GM")
    parser.add_argument("-F", "--fun_anno_file", type=str,
                        help="Path to file containing functional annotation. This flag needs to be set "
                             "when using FILE as fun_anno_source.")
    parser.add_argument("-r", "--co_elution_cutoff", type=float,
                        help="Co-elution score cutoff. default 0.5",
                        default=0.5)
    parser.add_argument("-R", "--classifier_cutoff", type=float,
                        help="Classifier confidence value cutoff. default = 0.5",
                        default=0.5)
    parser.add_argument("-e", "--elution_max_count", type=int,
                        help="Removes proteins that have a maximal peptide count less than the given value. default = 1",
                        default=1)
    parser.add_argument("-E", "--frac_count", type=int,
                        help="Number of fractions a protein needs to be measured in. default = 2",
                        default=2)
    parser.add_argument("-P", "--precalcualted_score_file", type=str,
                        help="Path to precalculated score file to read scores from for faster rerunning of EPIC. default = None",
                        default="NONE")
    args = parser.parse_args()

    args.mode = args.mode.upper()
    args.fun_anno_source = args.fun_anno_source.upper()

    # Create feature combination
    if args.feature_selection == "00000000":
        print "Select at least one feature"
        sys.exit()
    this_scores = utils.get_fs_comb(args.feature_selection)
    print "\t".join([fs.name for fs in this_scores])

    # Initialize classifier
    use_rf = args.classifier == "RF"
    clf = CS.CLF_Wrapper(args.num_cores, use_rf)

    # Load elution data
    foundprots, elution_datas = utils.load_data(args.input_dir, this_scores,
                                                fc=args.frac_count,
                                                mfc=args.elution_max_count)

    # Generate reference data set
    gs = ""
    if ((args.taxid != "" and args.ppi != "")
            or (args.cluster != "" and args.ppi != "")):
        print "Cluster reference and PPI reference are not compatible. Please supply ppi or complex reference, not both!"
        sys.exit()
    if args.taxid == "" and args.ppi == "" and args.cluster == "":
        print "Please supply a reference by setting the taxid, cluster, or ppi flag"
        sys.exit()

    gs_clusters = []
    if args.taxid != "" and args.cluster == "" and args.ppi == "":
        print "Loading clusters from GO, CORUM, and IntAct"
        gs_clusters.extend(utils.get_reference_from_net(args.taxid))
    if args.cluster != "":
        print "Loading complexes from file"
        if args.mode == "FA":
            gs_clusters.append(GS.FileClusters(args.cluster, "all"))
        else:
            gs_clusters.append(GS.FileClusters(args.cluster, foundprots))
    if args.ppi != "":
        print "Reading PPI file from %s" % args.ppi
        gs = Goldstandard_from_PPI_File(args.ppi, foundprots)
    print gs_clusters
    if len(gs_clusters) > 0:
        gs = utils.create_goldstandard(gs_clusters, args.taxid, foundprots)

    output_dir = args.output_dir + os.sep + args.output_prefix

    # Write the reference complexes to disk
    refFH = open(output_dir + ".ref_complexes.txt", "w")
    for comp in gs.complexes.complexes:
        print >> refFH, "%s\t%s" % (",".join(comp), ",".join(gs.complexes.complexes[comp]))
    refFH.close()

    # Calculate co-elution scores, or read them from a precalculated table
    scoreCalc = CS.CalculateCoElutionScores(this_scores, elution_datas,
                                            output_dir + ".scores.txt",
                                            num_cores=args.num_cores,
                                            cutoff=args.co_elution_cutoff)
    if args.precalcualted_score_file == "NONE":
        scoreCalc.calculate_coelutionDatas(gs)
    else:
        scoreCalc.readTable(args.precalcualted_score_file, gs)
    print scoreCalc.scores.shape

    functionalData = ""
    # Restrict the gold standard to PPIs with computed scores
    gs.positive = set(gs.positive & set(scoreCalc.ppiToIndex.keys()))
    gs.negative = set(gs.negative & set(scoreCalc.ppiToIndex.keys()))
    gs.rebalance()
    print len(gs.positive)
    print len(gs.negative)

    if args.mode != "EXP":
        print "Loading functional data"
        functionalData = utils.get_FA_data(args.fun_anno_source, args.taxid, args.fun_anno_file)
        print "Dimension of fun anno " + str(functionalData.scores.shape)

    print "Start benchmarking"
    if args.mode == "EXP":
        utils.cv_bench_clf(scoreCalc, clf, gs, output_dir, format="pdf", verbose=True, folds=5)
    if args.mode == "COMB":
        tmp_sc = copy.deepcopy(scoreCalc)
        tmp_sc.add_fun_anno(functionalData)
        utils.cv_bench_clf(tmp_sc, clf, gs, output_dir, format="pdf", verbose=True, folds=5)
    if args.mode == "FA":
        utils.cv_bench_clf(functionalData, clf, gs, output_dir, format="pdf", verbose=True, folds=5)

    # PPI evaluation
    print utils.cv_bench_clf(scoreCalc, clf, gs, args.output_dir, verbose=False, format="pdf", folds=5)

    # Predict protein interactions and keep those above the confidence cutoff
    network = utils.make_predictions(scoreCalc, args.mode, clf, gs, fun_anno=functionalData)
    outFH = open("%s.pred.txt" % (output_dir), "w")
    final_network = []
    for PPI in network:
        items = PPI.split("\t")
        if float(items[2]) >= args.classifier_cutoff:
            final_network.append(PPI)
    print >> outFH, "\n".join(final_network)
    outFH.close()

    # Predict clusters
    utils.predict_clusters("%s.pred.txt" % (output_dir), "%s.clust.txt" % (output_dir))

    # Evaluate predicted clusters
    pred_clusters = GS.Clusters(False)
    pred_clusters.read_file("%s.clust.txt" % (output_dir))
    overlapped_complexes_with_reference = gs.get_complexes().get_overlapped_complexes_set(pred_clusters)
    print "# of complexes in reference dataset: " + str(len(overlapped_complexes_with_reference))
    clust_scores, header, composite_score = utils.clustering_evaluation(
        gs.complexes, pred_clusters, "", False)
    outFH = open("%s.eval.txt" % (output_dir), "w")
    header = header.split("\t")
    clust_scores = clust_scores.split("\t")
    for i, head in enumerate(header):
        print "%s\t%s" % (head, clust_scores[i])
        print >> outFH, "%s\t%s" % (head, clust_scores[i])
    outFH.close()
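
# Example command line, assembled from the argparse flags defined in main().
# The script name, directories, and reference file are placeholders.
#
#   python main.py elution_data/ results/ -s 11101001 -c reference_complexes.txt \
#       -M RF -n 4 -m EXP -o MyRun
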
def n_fold_cross_validation(n_fold, all_gs, scoreCalc, clf, output_dir, overlap, local):
    out_scores = []
    out_head = []
    header = ["Num_pred_PPIS", "NUM_pred_CLUST", "mmr", "overlapp", "simcoe",
              "mean_simcoe_overlap", "sensetivity", "ppv", "accuracy", "sep"]
    train_eval_container = all_gs.n_fols_split(n_fold, overlap)
    # Matrix to store the computed complex evaluation metrics per fold
    complex_eval_score_vector = np.zeros((n_fold, 10))
    val_ppis = set(scoreCalc.ppiToIndex.keys())
    print "Number of ppis with e-score>0.5: %i" % len(val_ppis)
    for index in range(n_fold):
        print "processing fold " + str(index + 1)
        train, eval = train_eval_container[index]
        train.positive = train.positive & val_ppis
        train.negative = train.negative & val_ppis
        train.rebalance()
        print "All comp:%i" % len(all_gs.complexes.complexes)
        print "Train comp:%i" % len(train.complexes.complexes)
        print "Eval comp:%i" % len(eval.complexes.complexes)
        print "Num valid ppis in training pos: %i" % len(train.positive & val_ppis)
        print "Num valid ppis in training neg: %i" % len(train.negative & val_ppis)
        print "Num valid ppis in eval pos: %i" % len(eval.positive)
        print "Num valid ppis in eval neg: %i" % len(eval.negative)
        print "Overlap positive %i" % (len(train.positive & eval.positive))
        print "Overlap negative %i" % (len(train.negative & eval.negative))
        network = []
        if local:
            # Predict protein interactions based on n-fold cross validation
            network = utils.make_predictions_cross_validation(scoreCalc, train, eval, clf)
        else:
            network = utils.predictInteractions(scoreCalc, clf, train, verbose=True)
        netF = "%s.fold_%s.pred.txt" % (output_dir, index)
        clustF = "%s.fold_%s.clust.txt" % (output_dir, index)
        fold_head = []
        if len(network) == 0:
            print "No edges were predicted"
            tmp_scores = [0] * 10
            fold_head = "\t".join(["%s%s" % ("Fold %i " % (index + 1), h) for h in header])
            out_head.append(fold_head)
            out_scores.append("\t".join(map(str, tmp_scores)))
            complex_eval_score_vector[index, :] = tmp_scores
            continue
        tmp = []
        for ppi in network:
            prota, protb, score = ppi.split("\t")
            if float(score) > 0.5:  # random forest confidence cutoff
                tmp.append(ppi)
        network = tmp
        outFH = open(netF, "w")
        print >> outFH, "\n".join(network)
        outFH.close()
        # Predict clusters
        utils.predict_clusters(netF, clustF)
        # Evaluate predicted clusters
        pred_clusters = GS.Clusters(False)
        pred_clusters.read_file(clustF)
        print "number of complexes"
        print len(pred_clusters.get_complexes())
        print "number of ppis"
        print len(network)
        fold_scores, fold_head = utils.clustering_evaluation(
            eval.complexes, pred_clusters, "Fold %i " % (index + 1), True)
        out_scores.append("%i\t%i\t%s" % (len(network), len(pred_clusters.get_complexes()), fold_scores))
        out_head.append("\t".join(["%s%s" % ("Fold %i " % (index + 1), h) for h in header]))
        tmp_scores = [len(network), len(pred_clusters.get_complexes())]
        tmp_scores.extend(map(float, fold_scores.split("\t")))
        tmp_scores = np.array(tmp_scores)
        complex_eval_score_vector[index, :] = tmp_scores
    averaged_complex_eval_metrics_vector = np.mean(complex_eval_score_vector, axis=0)
    out_scores.append("\t".join(map(str, averaged_complex_eval_metrics_vector)))
    mean_head = "\t".join(["%s%s" % ("Mean ", h) for h in header])
    out_head.append(mean_head)
    return "\t".join(out_scores), "\t".join(out_head)
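
# Standard entry-point guard so the module can be imported without side
# effects; running it as a script is assumed to go through main() above.
if __name__ == "__main__":
    main()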