示例#1
0
文件: utils.py 项目: shaoxiuma/EPIC
def get_FA_data(anno_source, taxid, file="", datadir=""):
    """Load functional-annotation evidence from the requested source.

    Args:
        anno_source: Source tag; one of "GM", "STRING" or "FILE".
        taxid: Taxonomy id passed to the GeneMANIA and STRING loaders.
        file: Path to a flat file with functional evidence. Required when
            ``anno_source`` is "FILE". The parameter name is kept so that
            keyword callers keep working.
        datadir: Directory passed to the STRING loader only.

    Returns:
        Whatever the selected loader's ``getScoreCalc()`` returns, or an
        empty string when ``anno_source`` is not one of the known tags.

    Raises:
        SystemExit: With status 1 when "FILE" is requested without a path.
    """
    if anno_source == "GM":
        return CS.Genemania(taxid).getScoreCalc()

    if anno_source == "STRING":
        return CS.STRING(taxid, datadir).getScoreCalc()

    if anno_source == "FILE":
        # Treat None like "" so an unset command-line option is also caught.
        if not file:
            print("When using FILE tag please suppy path to file containing functional annotation using -F file+path")
            # A missing input is a failure; signal it through the exit status.
            sys.exit(1)
        # The supplied functional evidence file needs the correct header row.
        return CS.ExternalEvidence(file).getScoreCalc()

    print("EPIC only support GeneMane, STRING, and flat file input please use the followign tags for anno_source GM, STRING, FILE. Returning empty string object.")
    return ""
示例#2
0
def calc_feature_combination(args):
    """Cross-validate one feature combination and write the evaluation.

    ``args`` is a 15-item sequence (command-line style strings), in order:
    feature_combination, se, input_dir, use_rf, num_folds, overlap, local,
    cutoff, num_cores, scoreF, mode, anno, faF, ref_complexes, output_dir.

    ``cutoff`` is divided by 100 before use. ``use_rf``, ``overlap`` and
    ``local`` are true only for the exact string "True". ``mode`` is one of
    "exp", "comb" or "fa".

    The header and score lines are printed and also written to
    ``output_dir + ".eval.txt"``.

    Raises:
        SystemExit: Silently when ``feature_combination`` is "00000000";
            with status 1 when ``mode`` is not supported.
    """
    (feature_combination, se, input_dir, use_rf, num_folds, overlap, local,
     cutoff, num_cores, scoreF, mode, anno, faF, ref_complexes,
     output_dir) = args

    cutoff = float(cutoff) / 100
    num_folds = int(num_folds)

    if feature_combination == "00000000":
        sys.exit()
    this_scores = get_fs_comb(feature_combination)
    num_cores = int(num_cores)
    use_rf = use_rf == "True"
    overlap = overlap == "True"
    local = local == "True"

    clf_name = "RF" if use_rf else "SVM"
    clf = CS.CLF_Wrapper(num_cores, use_rf)

    ref_gs = Goldstandard_from_cluster_File(ref_complexes)

    scoreCalc = CS.CalculateCoElutionScores(this_scores,
                                            "",
                                            scoreF,
                                            num_cores=num_cores,
                                            cutoff=cutoff)
    scoreCalc.readTable(scoreF, ref_gs)
    feature_comb = feature_selector([fs.name for fs in this_scores], scoreCalc)

    print(feature_comb.scoreCalc.scores.shape)
    print(scoreCalc.scores.shape)
    # NOTE(review): faF is passed as the second positional argument of
    # utils.get_FA_data -- confirm that matches the signature in use.
    if mode == "comb":
        fa = utils.get_FA_data(anno, faF)
        feature_comb.add_fun_anno(fa)
    elif mode == "fa":
        feature_comb = utils.get_FA_data(anno, faF)
        print(type(feature_comb))
    elif mode != "exp":
        print("not support this mode")
        # An unsupported mode is an error; report it via the exit status.
        sys.exit(1)

    scores, head = n_fold_cross_validation(num_folds, ref_gs, feature_comb,
                                           clf, output_dir, overlap, local)

    header_line = "FS\tSE\tCLF\t" + head
    result_line = "%s\t%s\t%s\t" % (feature_combination, se, clf_name) + scores
    print(header_line)
    print(result_line)

    # The context manager closes the file even if a write fails.
    with open(output_dir + ".eval.txt", "w") as outFH:
        outFH.write(header_line + "\n")
        outFH.write(result_line + "\n")
def main():
    """Train on known pairs, score candidate pairs and write confident hits.

    Command line: scoreF refF elutionF geneNameF outF

    ``geneNameF`` is a tab-separated file with three columns per line
    (id A, id B, species). Pairs whose second prediction value exceeds 0.5
    are written to ``outF`` together with their gene names and species; the
    ARFF dump of the recalculated scores goes to ``outF + ".arff"``.
    """
    (scoreF, refF, elutionF, geneNameF, outF) = sys.argv[1:]

    geneName = {}
    species = {}
    with open(geneNameF) as geneNameFH:
        for line in geneNameFH:
            ida, idb, spec = line.rstrip().split("\t")
            geneName.setdefault(ida, set()).add(idb)
            species[ida] = spec
            species[idb] = spec

    toLearn, toPred = calcS.loadScoreData(scoreF, refF)

    rfc = calcS.trainML(toLearn)
    print(rfc.getValScores())

    ref, eluD, calc = calcS.loadData(refF, elutionF)

    calc.calculate2DScores(ref)
    with open(outF + ".arff", "w") as outFH:
        outFH.write(calc.toArffData())
    print("Calculated scores")

    rfc2 = calcS.trainML(calc)
    print(rfc2.getValScores())

    data, targets = toPred.toSklearnData()
    # NOTE(review): this result is unused; the call is kept in case the
    # conversion has side effects -- confirm and drop.
    dataL, targetsL = toLearn.toSklearnData()
    preds = rfc.predict(data)
    prots = [(protA, protB) for protA, protB, label in toPred.scores]

    with open(outF, "w") as outFH:
        for (protA, protB), pred in zip(prots, preds):
            if not pred[1] > 0.5:
                continue
            # Resolve names per pair. Previously a protein without an entry
            # silently reused the name from the preceding pair (or raised
            # NameError on the first pair); it now gets an empty name.
            geneA = ",".join(sorted(geneName.get(protA, ())))
            geneB = ",".join(sorted(geneName.get(protB, ())))
            spec = species[protA]
            outFH.write("%s\t%s\t%s\t%s\t%s\t%f\n" %
                        (protA, protB, geneA, geneB, spec, pred[1]))
示例#4
0
def main():
    """Score every contiguous window of the 48 IEX fractions.

    Command line: elutionF refF outD

    For each left bound (1..48) the window initially spans up to fraction
    48 and is then shrunk from the right one fraction at a time. For every
    window the elution data is subset, Euclidean co-elution scores are
    calculated and a random forest is evaluated; ``scores[1]`` of that
    evaluation is stored in a 48x48 table (cells never visited stay -1.00).
    The table is written to ``outD + ".iex.dat"``.
    """
    (elutionF, refF, outD) = sys.argv[1:]
    reference, elutionData, _ = calcS.loadData(refF, elutionF)
    num_fracs = 48

    out = np.array([[-1.00] * num_fracs] * num_fracs)
    for removeLeft in range(1, num_fracs + 1):
        # Fractions below the left bound are dropped up front.
        tmpFracs = list(range(removeLeft, num_fracs + 1))
        # The first right bound (num_fracs + 1) is not in the list, so the
        # first pass evaluates the full window before it starts shrinking.
        for removeRight in range(num_fracs + 1, removeLeft, -1):
            if removeRight in tmpFracs:
                tmpFracs.remove(removeRight)
            print(tmpFracs)
            fractions = getIEXFracs(tmpFracs)
            tmpElution = copy.copy(elutionData)
            tmpElution.getSubset(fractions)
            scoreCalc = calcS.CalculateCoElutionScores(tmpElution)
            scoreCalc.calculateAllScores([calcS.Euclidiean()], reference)
            data, targets = scoreCalc.toSklearnData()
            clf = calcS.RandomForest(data, targets)
            scores = clf.getValScores()
            row = removeLeft - 1
            col = num_fracs + 1 - removeRight
            out[row][col] = scores[1]
            print("%i\t%i\t%.2f" % (row, col, scores[1]))

    # The context manager closes the file even if formatting a row fails.
    with open(outD + ".iex.dat", "w") as outFH:
        outFH.write("\t" + "\t".join(map(str, range(num_fracs))) + "\n")
        for i in range(num_fracs):
            cells = "\t".join(map("{0:.2f}".format, out[i]))
            outFH.write("%i\t%s\n" % (i, cells))
示例#5
0
文件: utils.py 项目: nrnb/EPIC
def load_data(data, scores, orthmap="", fc=2, mfc=1):
    """Load elution profiles and initialise the given scores with them.

    Args:
        data: Either a list/tuple of elution file paths, or the path of a
            directory whose direct files are all loaded.
        scores: Score objects; ``init(elutionData)`` is called on each one
            for every loaded elution file.
        orthmap: Path of an orthology table. When it is neither "" nor
            False, every elution data set is mapped through it.
        fc: Passed to ``CS.ElutionData`` as ``frac_count``.
        mfc: Passed to ``CS.ElutionData`` as ``max_frac_count``.

    Returns:
        A tuple ``(elutionProts, elutionDatas)``: the set of all proteins
        found across the files and the list of loaded elution data objects.
        Files whose base name starts with "." are skipped.
    """
    # isinstance also accepts tuples and list subclasses, which the former
    # exact-type check wrongly sent down the directory branch.
    if isinstance(data, (list, tuple)):
        paths = list(data)
    else:
        paths = [os.path.join(data, fn) for fn in next(os.walk(data))[2]]

    # Same truth table as `orthmap != "" and orthmap != False`.
    use_orthmap = orthmap not in ("", False)

    elutionDatas = []
    elutionProts = set()
    for elutionFile in paths:
        if elutionFile.rsplit(os.sep, 1)[-1].startswith("."):
            continue
        elutionData = CS.ElutionData(elutionFile.rstrip(),
                                     frac_count=fc,
                                     max_frac_count=mfc)
        if use_orthmap:
            # A fresh mapper is built per file, as before; whether
            # elutionData.orthmap() mutates it is not visible here.
            mapper = GS.Inparanoid("", inparanoid_cutoff=1)
            mapper.readTable(orthmap, direction=0)
            elutionData.orthmap(mapper)
        elutionDatas.append(elutionData)
        elutionProts.update(elutionData.prot2Index.keys())
        for score in scores:
            score.init(elutionData)
    return elutionProts, elutionDatas
示例#6
0
def main():
    """Write, per species, the protein pairs that should be predicted.

    Command line: elutionF geneNameF outPrefix

    ``geneNameF`` is a tab-separated file with three columns per line
    (id A, id B, species). Every pair returned by ``getAllPairs()`` whose
    two proteins are known and belong to the same species is written to
    ``<outPrefix>.<species>.topred.txt``.
    """
    (elutionF, geneNameF, outPrefix) = sys.argv[1:]

    species = {}
    with open(geneNameF) as geneNameFH:
        for line in geneNameFH:
            ida, idb, spec = line.rstrip().split("\t")
            species[ida] = spec
            species[idb] = spec

    elutionData, scoreCalc = calcS.loadEData(elutionF)
    out = {}
    for protA, protB, _ in scoreCalc.getAllPairs():
        if protA not in species or protB not in species:
            continue
        spec = species[protA]
        if spec != species[protB]:
            continue
        out.setdefault(spec, set()).add("\t".join(sorted([protA, protB])))

    # Use a separate loop name (the old loop rebound the `species` dict) and
    # sort so that repeated runs produce identical files.
    for spec in sorted(out):
        with open("%s.%s.topred.txt" % (outPrefix, spec), "w") as outFH:
            outFH.write("ProtA\tProtB\n")
            outFH.write("\n".join(sorted(out[spec])) + "\n")
示例#7
0
def ppi_fs(args):
    """Evaluate one feature combination by PPI-level cross validation.

    ``args`` is a 7-item sequence (command-line style strings), in order:
    fsc, scoreF, use_rf, se, num_cores, refComplexesF, output_dir.
    ``use_rf`` is true only for the exact string "True".

    The evaluation results are printed and written to
    ``<output_dir>.ppi_eva.txt`` under the header FS, SE, CLF, FM, auPR,
    auROC.
    """
    fsc, scoreF, use_rf, se, num_cores, refComplexesF, output_dir = args
    num_cores = int(num_cores)
    use_rf = use_rf == "True"

    clf_name = "RF" if use_rf else "SVM"
    clf = CS.CLF_Wrapper(num_cores, use_rf)

    this_fs = get_fs_comb(fsc)
    all_gs = Goldstandard_from_cluster_File(refComplexesF)
    # NOTE(review): valprots is never used below -- confirm and drop.
    valprots = all_gs.get_proteins()

    scoreCalc = CS.CalculateCoElutionScores(this_fs,
                                            "",
                                            scoreF,
                                            num_cores=num_cores,
                                            cutoff=-1)
    scoreCalc.readTable(scoreF, all_gs)
    print(scoreCalc.scores.shape)

    test_scoreCalc = feature_selector([fs.name for fs in this_fs], scoreCalc)

    print("The size of chopped matrix for selected features")
    print(np.shape(test_scoreCalc.get_scoreCalc().get_all_scores()))

    # Build the key set once; it is needed for the count and the lookup.
    train_ppis = set(test_scoreCalc.ppiToIndex.keys())
    print("training ppis: %i" % len(train_ppis))

    train_gold_complexes = all_gs.return_gold_standard_complexes(train_ppis)

    print("Train_gold comp:%i" % len(train_gold_complexes.complexes.complexes))

    print("Num valid ppis in pos: %i" % len(train_gold_complexes.positive))
    print("Num valid ppis in neg: %i" % len(train_gold_complexes.negative))

    # Evaluate classifier
    evaluation_results = utils.bench_by_PPI_clf(10, test_scoreCalc,
                                                train_gold_complexes, clf)

    print(evaluation_results)

    # The context manager closes the file even if the write fails.
    with open("%s.ppi_eva.txt" % (output_dir), "w") as outFH:
        outFH.write("FS\tSE\tCLF\tFM\tauPR\tauROC\n%s\t%s\t%s\t%s" % (
            fsc, se, clf_name, "\t".join(map(str, evaluation_results))) + "\n")
示例#8
0
def main():
    """Calculate mutual-information scores for all pairs and save the table.

    Command line: elutionF outF
    """
    (elutionF, outF) = sys.argv[1:]

    elutionData, scoreCalc = calcS.loadEData(elutionF)
    scoreCalc.calculateAllPairs([calcS.MutualInformation(2)])

    # The context manager closes the file even if building the table fails.
    with open(outF, "w") as outFH:
        outFH.write(scoreCalc.toTable(False))
示例#9
0
文件: utils.py 项目: nrnb/EPIC
def get_fs_comb(comb_string):
    """Translate a 0/1 selection string into the list of score objects.

    Position ``k`` of ``comb_string`` switches on the ``k``-th entry of the
    fixed feature list (MutualInformation, Bayes, Euclidiean, Wcc, Jaccard,
    Poisson, Pearson, Apex) when the character at that position is "1".
    The selected objects are returned in list order.
    """
    available = [
        CS.MutualInformation(2),
        CS.Bayes(3),
        CS.Euclidiean(),
        CS.Wcc(),
        CS.Jaccard(),
        CS.Poisson(5),
        CS.Pearson(),
        CS.Apex(),
    ]
    return [
        available[position]
        for position, flag in enumerate(comb_string)
        if flag == "1"
    ]
示例#10
0
def main():
    """Tabulate entropy-vs-precision lines for one elution experiment.

    Command line: elutionF refF windowSize outF

    The experiment name is the part of ``elutionF`` between the first
    "Ce_" and the following ".". Each result yielded by ``getFracEvals``
    fills one of three text tables (Entropy, Prot-prob, Num-prots), which
    are written with ``printTable`` only when the first table is non-empty.
    """
    elutionF, refF, windowSize, outF = sys.argv[1:]
    windowSize = int(windowSize)
    tables = ["", "", ""]
    reference, elutionData, scoreCalc = calcS.loadData(refF, elutionF)
    name = elutionF.split("Ce_")[1].split(".")[0]

    for slot, resultScore in enumerate(getFracEvals(elutionData.elutionMat)):
        data_lines = entropyVSprecision(elutionData, reference, resultScore,
                                        windowSize)
        for pos in range(len(data_lines)):
            tables[slot] += "\n%s\t%i\t%s" % (name, windowSize,
                                              data_lines[pos])

    if tables[0]:
        labels = ("Entropy", "Prot-prob", "Num-prots")
        for label, table in zip(labels, tables):
            file_name = "%s_%s_" % (outF, name) + label + "_%i.dat" % windowSize
            printTable(file_name, label, table)
示例#11
0
def rf_cutoff(args):
    """Evaluate predicted clusters against a reference for one cutoff.

    ``args`` is a 5-item sequence, in order: pred_clust_F, ref_clust_F,
    ppiF, cutoff, outF.

    Writes a single tab-separated line to ``outF`` holding the cutoff, the
    line count of ``ppiF``, the number of predicted complexes and the
    clustering evaluation scores.
    """
    pred_clust_F, ref_clust_F, ppiF, cutoff, outF = args

    num_ppis = CS.lineCount(ppiF)
    pred_clusters = GS.Clusters(False)
    pred_clusters.read_file(pred_clust_F)

    ref_clusters = GS.Clusters(False)
    ref_clusters.read_file(ref_clust_F)

    # Only the score string (first element) is needed. Indexing instead of
    # unpacking keeps this working whether the helper returns two values
    # or three, as other callers in this code base expect.
    evaluation = utils.clustering_evaluation(ref_clusters, pred_clusters, "",
                                             True)
    scores = evaluation[0]

    # The context manager closes the file even if formatting fails.
    with open(outF, "w") as outFH:
        outFH.write("%s\t%i\t%i\t%s\n" %
                    (cutoff, num_ppis, len(pred_clusters.complexes), scores))
示例#12
0
def cut(args):
    """Write the score columns of one feature combination to a new table.

    ``args`` is a 3-item sequence, in order: fc (0/1 feature string),
    scoreF (input score table), outF (output path).

    Raises:
        SystemExit: Silently when ``fc`` is "00000000".
    """
    fc, scoreF, outF = args
    if fc == "00000000":
        sys.exit()
    this_scores = get_fs_comb(fc)
    scoreCalc = CS.CalculateCoElutionScores("", "", "", "", cutoff=0.5)
    # An empty gold standard: no pair is treated as reference data.
    empty_gs = GS.Goldstandard_from_Complexes()
    empty_gs.positive = set([])
    empty_gs.negative = set([])
    scoreCalc.readTable(scoreF, empty_gs)
    print(scoreCalc.to_predict)
    feature_comb = feature_selector([fs.name for fs in this_scores], scoreCalc)

    feature_comb.open()
    try:
        with open(outF, "w") as outFH:
            outFH.write("\t".join(feature_comb.scoreCalc.header) + "\n")
            for _ in range(feature_comb.to_predict):
                edge, edge_scores = feature_comb.get_next()
                if edge == "" or edge_scores == []:
                    continue
                outFH.write("%s\t%s\n" %
                            (edge, "\t".join(map(str, edge_scores))))
    finally:
        # Release the selector even when reading or writing fails.
        feature_comb.close()
示例#13
0
def calc_scores(args):
    """Calculate co-elution scores for reference pairs or for all pairs.

    ``args`` is either ``["-ref", refF, fs, numcores, cutoff, e_dir, outF]``
    or ``[fs, numcores, cutoff, e_dir, outF]``. With "-ref" the pairs to
    score are the positive and negative pairs of the gold standard read
    from ``refF``; when that list is empty (or "-ref" is absent) all pairs
    reported by the score calculator are used.
    """
    pairs = []
    if args[0] == "-ref":
        refF, fs, numcores, cutoff, e_dir, outF = args[1:]
        gs = Goldstandard_from_cluster_File(refF)
        pairs = list(gs.positive | gs.negative)
        print(len(pairs))
    else:
        fs, numcores, cutoff, e_dir, outF = args

    numcores, cutoff = int(numcores), float(cutoff)

    this_fs = get_fs_comb(fs)
    prots, edatas = utils.load_data(e_dir, this_fs)
    scoreCalc = CS.CalculateCoElutionScores(this_fs,
                                            edatas,
                                            outF,
                                            num_cores=numcores,
                                            cutoff=cutoff)
    if not pairs:
        pairs = scoreCalc.getAllPairs()
    scoreCalc.calculateScores(pairs)
示例#14
0
 def filter_scoreCalc(self, scoreCalc):
     """Build a reduced score calculator from the rows and columns kept.

     Rows of ``scoreCalc`` are visited in index order. A row is skipped
     when ``self.valprots`` differs from the empty list and either protein
     of the pair is missing from it, or when ``self.filter_score`` returns
     something equal to ``[]`` for the row. Kept rows are renumbered
     consecutively from zero in the returned object.
     """
     reduced = CS.CalculateCoElutionScores("", "", "", 1)
     reduced.scoreF = scoreCalc.scoreF
     kept_header = np.array(scoreCalc.header)[self.to_keep_header]
     reduced.header = list(kept_header)
     n_pairs = len(scoreCalc.ppiToIndex.keys())
     n_columns = len(self.to_keep_score)
     reduced.scores = np.zeros((n_pairs, n_columns))

     kept = 0
     for row in range(scoreCalc.scores.shape[0]):
         pair = scoreCalc.IndexToPpi[row]
         protA, protB = pair.split("\t")
         unknown_prot = (protA not in self.valprots
                         or protB not in self.valprots)
         if unknown_prot and self.valprots != []:
             continue
         row_scores = self.filter_score(scoreCalc.scores[row, :])
         if row_scores == []:
             continue
         reduced.ppiToIndex[pair] = kept
         reduced.IndexToPpi[kept] = pair
         reduced.scores[kept, :] = row_scores
         kept += 1

     # Trim the rows that were allocated but never filled.
     reduced.scores = reduced.scores[:kept, :]
     return reduced
示例#15
0
	def read_scores(scoreF, cutoff):
		"""Read a tab-separated score table, keeping rows that pass the cutoff.

		The first line of ``scoreF`` is the header; the first two columns of
		every other line name the pair and the remaining columns are its
		scores. A row is kept when at least one score is >= ``cutoff``.
		Blank lines are ignored. The number of kept rows is printed.

		Returns:
			A ``CS.CalculateCoElutionScores`` object with ``header``,
			``scores``, ``ppiToIndex`` and ``IndexToPpi`` filled in.
		"""
		num_prots = CS.lineCount(scoreF)
		out = CS.CalculateCoElutionScores("", "", "", 4)
		i = 0
		# The file was previously never closed; the context manager fixes it.
		with open(scoreF) as scoreFH:
			header = scoreFH.readline().rstrip().split("\t")
			out.scores = np.zeros((num_prots, len(header[2:])))
			out.header = header
			for line in scoreFH:
				line = line.rstrip()
				if line == "":
					continue
				fields = line.split("\t")
				edge = "\t".join(fields[:2])
				this_score = np.array([float(value) for value in fields[2:]])
				if (this_score >= cutoff).any():
					out.ppiToIndex[edge] = i
					out.IndexToPpi[i] = edge
					out.scores[i, :] = this_score
					i += 1
		# Trim the rows that were allocated but never filled.
		out.scores = out.scores[0:i, :]
		print(i)
		return out
示例#16
0
def main():
    """Collect per-fraction removal scores for several experiments.

    Command line: elutionFiles refF direction outF

    ``elutionFiles`` lists one elution file path per line. For each one the
    scores returned by ``removeFracs`` are stored under the experiment
    name, taken from between the second "Ce_" in the path and the
    following ".". ``outF`` receives one row per experiment, padded with
    "NA" up to the longest score list.
    """
    (elutionFiles, refF, direction, outF) = sys.argv[1:]

    outData = {}
    maxSize = 0
    with open(elutionFiles) as elutionFilesFH:
        for line in elutionFilesFH:
            line = line.rstrip()
            if not line:
                # Tolerate blank (e.g. trailing) lines in the list file.
                continue
            reference, elutionData, scoreCalc = calcS.loadData(refF, line)
            scores = removeFracs(elutionData, reference, scoreCalc, direction)
            name = line.split("Ce_")[2].split(".")[0]
            outData[name] = scores
            maxSize = max(len(scores), maxSize)

    with open(outF, "w") as outFH:
        fraction_ids = map(str, range(1, maxSize + 1))
        outFH.write("Experiment_name\tFraction_%s\n" %
                    ("\tFraction_".join(fraction_ids)))
        for dataset, scores in outData.items():
            # Pad inside one cell list so every row has the same number of
            # columns, including rows whose score list is empty.
            cells = [str(score) for score in scores]
            cells.extend(["NA"] * (maxSize - len(scores)))
            outFH.write("%s\t%s\n" % (dataset, "\t".join(cells)))
示例#17
0
def exp_comb(args):
    """Benchmark random combinations of IEX and bead experiments.

    ``args`` is a 12-item sequence (command-line style strings), in order:
    FS, i, j, num_iter, input_dir, num_cores, ref_complexes, scoreF, mode,
    fun_anno_F, ppi, output_dir.

    For each of ``num_iter`` iterations, ``i`` experiments whose file name
    starts with "all" and ``j`` other experiments are sampled from the
    "*.txt" files under ``input_dir``. The restricted feature set is then
    evaluated by PPI cross validation (when ``ppi`` is the string "True")
    or by cluster cross validation. One line per iteration is written to
    ``<output_dir>.<i>_<j>.all.eval.txt``.

    ``input_dir`` is used as a plain prefix for the glob pattern, and the
    search-engine name is its second-to-last path component.

    Raises:
        SystemExit: Silently when ``FS`` is "00000000" or when both ``i``
            and ``j`` are 0; with status 1 when more experiments are
            requested than are available.
    """
    (FS, i, j, num_iter, input_dir, num_cores, ref_complexes, scoreF, mode,
     fun_anno_F, ppi, output_dir) = args
    i, j, num_iter, num_cores = map(int, [i, j, num_iter, num_cores])
    # This used to be a bare comparison whose result was discarded, so any
    # non-empty string (including "False") selected the PPI evaluation.
    ppi = ppi == "True"

    search_engine = input_dir.split(os.path.sep)[-2]

    def get_eData_comb(data_dir, num_iex, num_beads):
        """Sample num_iex "all*" files and num_beads other files."""
        all_exp = [str(f) for f in glob.glob(data_dir + "*.txt")]
        iex_exp = [
            f for f in all_exp if f.split(os.sep)[-1].startswith("all")
        ]
        beads_exp = [
            f for f in all_exp if not f.split(os.sep)[-1].startswith("all")
        ]
        if num_iex > len(iex_exp):
            print("i is to large")
            sys.exit(1)
        if num_beads > len(beads_exp):
            print("j is to large")
            sys.exit(1)

        sel_iex = rnd.sample(iex_exp, num_iex)
        sel_beads = rnd.sample(beads_exp, num_beads)
        return sel_iex + sel_beads

    # EPIC parameters
    if FS == "00000000":
        sys.exit()
    this_scores = get_fs_comb(FS)
    clf = CS.CLF_Wrapper(num_cores, True)

    ref_gs = Goldstandard_from_cluster_File(ref_complexes)

    scoreCalc = CS.CalculateCoElutionScores(this_scores,
                                            "",
                                            scoreF,
                                            num_cores=num_cores,
                                            cutoff=0.5)
    scoreCalc.readTable(scoreF, ref_gs)

    # The supplied functional evidence data needs the correct header row.
    functionalData = ""
    if mode == "comb":
        # NOTE(review): fun_anno_F is passed as the second positional
        # argument -- confirm that matches the get_FA_data signature in use.
        functionalData = utils.get_FA_data("FILE", fun_anno_F)

    if i == 0 and j == 0:
        sys.exit()

    out_head = ""
    all_scores = []

    for _ in range(num_iter):
        # Draw a fresh random sample, then restore a fixed seed so the
        # evaluation that follows runs from the same random state.
        rnd.seed()
        this_eprofiles = get_eData_comb(input_dir, i, j)
        this_eprofiles_fnames = [
            f.rsplit(os.sep, 1)[1] for f in this_eprofiles
        ]
        rnd.seed(1)

        print(this_eprofiles_fnames)

        this_foundprots, _unused = utils.load_data(this_eprofiles, [])
        print(len(this_foundprots))

        feature_comb = feature_selector(
            [fs.name for fs in this_scores],
            scoreCalc,
            valprots=this_foundprots,
            elution_file_names=this_eprofiles_fnames)
        if mode == "comb":
            feature_comb.add_fun_anno(functionalData)

        if ppi:
            print("Running PPI cross fold")
            ppi_ref = ref_gs.return_gold_standard_complexes(
                set(feature_comb.scoreCalc.ppiToIndex.keys()))
            fmeasure, auc_pr, auc_roc = utils.bench_by_PPI_clf(
                10, feature_comb, ppi_ref, clf)
            scores = "\t".join(map(str, [fmeasure, auc_pr, auc_roc]))
            head = "\tFM\taucPR\taucROC"
        else:
            print("Running Cluster cross fold")
            scores, head = n_fold_cross_validation(2,
                                                   ref_gs,
                                                   feature_comb,
                                                   clf,
                                                   "%s_%i_%i" %
                                                   (output_dir, i, j),
                                                   overlap=True,
                                                   local=False)

        out_head = head
        all_scores.append(
            "%s\t%s\t%i\t%i\t%s\t%i\t%s" %
            (FS, mode, i, j, search_engine, len(this_foundprots), scores))
        print(head)
        print(scores)

    with open(output_dir + ".%i_%i.all.eval.txt" % (i, j), "w") as outFH:
        # The rows carry the mode as their second field, so the header needs
        # a matching column; a leading tab in the evaluation header would
        # otherwise add an empty column.
        outFH.write(
            "FS\tMode\tNum_iex\tNum_beads\tSearch_engine\tNum_Prots\t%s\n" %
            out_head.lstrip("\t"))
        for score in all_scores:
            outFH.write("%s\n" % (score))
示例#18
0
文件: main.py 项目: shaoxiuma/EPIC
def _build_arg_parser():
    """Return the argparse parser describing EPIC's command line."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-s",
        "--feature_selection",
        type=str,
        help=
        "Select which features to use. This is an 8 position long array of 0 and 1, where each position determines which co-elution feature to use. Features sorted by position are: MI, Bayes, Euclidean, WCC, Jaccard, PCCN, PCC, and Apex.  Each default=11101001",
        default="11101001")
    parser.add_argument(
        "input_dir",
        type=str,
        help="Directory containing the elution files for each experiment")

    parser.add_argument(
        "-t",
        "--taxid",
        type=str,
        help="TAXID to automatically download reference from GO,CORUM,INtACT",
        default="")
    parser.add_argument(
        "-c",
        "--cluster",
        type=str,
        help="Path to file containing protein clsuter reference",
        default="")
    parser.add_argument("-p",
                        "--ppi",
                        type=str,
                        help="path to ppi File",
                        default="")

    parser.add_argument("output_dir",
                        type=str,
                        help="Directory containing the output files")
    parser.add_argument("-o",
                        "--output_prefix",
                        type=str,
                        help="Prefix name for all output Files",
                        default="Out")

    parser.add_argument(
        "-M",
        "--classifier",
        type=str,
        help="Select which classifier to use. Values: RF SVM, default RF",
        default="RF")
    parser.add_argument("-n",
                        "--num_cores",
                        type=int,
                        help="Number of cores to be used, default 1",
                        default=1)

    parser.add_argument(
        "-m",
        "--mode",
        type=str,
        help=
        "Run EPIC with experimental, functional, or both evidences. Values: EXP, FA, COMB, default: EXP  ",
        default="EXP")
    parser.add_argument(
        "-f",
        "--fun_anno_source",
        type=str,
        help=
        "Where to get functional annotaiton from. Values: STRING or GM or FILE, default= GM",
        default="GM")
    # The default is "" (not None) so that the "no file given" check in
    # utils.get_FA_data, which compares against "", actually triggers.
    parser.add_argument(
        "-F",
        "--fun_anno_file",
        type=str,
        help=
        "Path to File containing functional annotation. This flag needs to be set when using FILE as fun_anno_source.",
        default="")
    parser.add_argument("-r",
                        "--co_elution_cutoff",
                        type=float,
                        help="Co-elution score cutoff. default 0.5",
                        default=0.5)
    parser.add_argument(
        "-R",
        "--classifier_cutoff",
        type=float,
        help="Classifier confidence valye cutoff. default = 0.5",
        default=0.5)
    parser.add_argument(
        "-e",
        "--elution_max_count",
        type=int,
        help=
        "Removies protein that have a maximal peptide count less than the given value. default = 1",
        default=1)
    parser.add_argument(
        "-E",
        "--frac_count",
        type=int,
        help=
        "Number of fracrions a protein needs to be measured in. default = 2",
        default=2)

    parser.add_argument(
        "-P",
        "--precalcualted_score_file",
        type=str,
        help=
        "Path to precalulated scorefile to read scores from for faster rerunning of EPIC. default = None",
        default="NONE")
    return parser


def main():
    """Run the EPIC pipeline from the command line.

    Steps: parse arguments, load the elution data, build the gold standard
    (from a taxid download, a cluster file or a PPI file), calculate or
    read co-elution scores, optionally load functional annotation,
    benchmark the classifier, predict the interaction network, cluster it
    and evaluate the clusters against the reference.

    Files written (``<prefix>`` is output_dir/output_prefix):
    ``<prefix>.ref_complexes.txt``, ``<prefix>.scores.txt``,
    ``<prefix>.pred.txt``, ``<prefix>.clust.txt`` and ``<prefix>.eval.txt``.

    Raises:
        SystemExit: With status 1 when no feature is selected or the
            reference options are missing or contradictory.
    """
    args = _build_arg_parser().parse_args()

    args.mode = args.mode.upper()
    args.fun_anno_source = args.fun_anno_source.upper()

    # Validate the arguments before any expensive loading starts.
    if args.feature_selection == "00000000":
        print("Select at least one feature")
        sys.exit(1)

    if args.ppi != "" and (args.taxid != "" or args.cluster != ""):
        print("Refernce from cluster and PPI are nor compatiple. Please supply ppi or complex reference, not both!")
        sys.exit(1)

    if args.taxid == "" and args.ppi == "" and args.cluster == "":
        print("Please supply a reference by setting taxid, cluster, or ppi tag")
        sys.exit(1)

    # Create feature combination
    this_scores = utils.get_fs_comb(args.feature_selection)
    print("\t".join([fs.name for fs in this_scores]))

    # Initialize CLF
    use_rf = args.classifier == "RF"
    clf = CS.CLF_Wrapper(args.num_cores, use_rf)

    # Load elution data
    foundprots, elution_datas = utils.load_data(args.input_dir,
                                                this_scores,
                                                fc=args.frac_count,
                                                mfc=args.elution_max_count)

    # Generate reference data set
    gs = ""
    gs_clusters = []
    if args.taxid != "" and args.cluster == "" and args.ppi == "":
        print("Loading clusters from GO, CORUM, and Intact")
        gs_clusters.extend(utils.get_reference_from_net(args.taxid))

    if args.cluster != "":
        print("Loading complexes from file")
        if args.mode == "FA":
            gs_clusters.append(GS.FileClusters(args.cluster, "all"))
        else:
            gs_clusters.append(GS.FileClusters(args.cluster, foundprots))

    if args.ppi != "":
        # This message used to read args.reference, which the parser never
        # defines, so supplying --ppi raised AttributeError.
        print("Reading PPI file from %s" % args.ppi)
        gs = Goldstandard_from_PPI_File(args.ppi, foundprots)

    print(gs_clusters)
    if len(gs_clusters) > 0:
        gs = utils.create_goldstandard(gs_clusters, args.taxid, foundprots)

    output_dir = args.output_dir + os.sep + args.output_prefix

    with open(output_dir + ".ref_complexes.txt", "w") as refFH:
        for comp in gs.complexes.complexes:
            refFH.write("%s\t%s\n" % (",".join(comp), ",".join(
                gs.complexes.complexes[comp])))

    scoreCalc = CS.CalculateCoElutionScores(this_scores,
                                            elution_datas,
                                            output_dir + ".scores.txt",
                                            num_cores=args.num_cores,
                                            cutoff=args.co_elution_cutoff)
    if args.precalcualted_score_file == "NONE":
        scoreCalc.calculate_coelutionDatas(gs)
    else:
        scoreCalc.readTable(args.precalcualted_score_file, gs)

    print(scoreCalc.scores.shape)

    # Keep only reference pairs that actually have scores, then rebalance.
    scored_ppis = set(scoreCalc.ppiToIndex.keys())
    gs.positive = set(gs.positive & scored_ppis)
    gs.negative = set(gs.negative & scored_ppis)
    gs.rebalance()

    print(len(gs.positive))
    print(len(gs.negative))

    functionalData = ""
    if args.mode != "EXP":
        print("Loading functional data")
        functionalData = utils.get_FA_data(args.fun_anno_source, args.taxid,
                                           args.fun_anno_file)
        print("Dimension of fun anno " + str(functionalData.scores.shape))

    print("Start benchmarking")

    # Pick the evidence to benchmark for the selected mode.
    bench_data = None
    if args.mode == "EXP":
        bench_data = scoreCalc
    elif args.mode == "COMB":
        bench_data = copy.deepcopy(scoreCalc)
        bench_data.add_fun_anno(functionalData)
    elif args.mode == "FA":
        bench_data = functionalData

    if bench_data is not None:
        utils.cv_bench_clf(bench_data,
                           clf,
                           gs,
                           output_dir,
                           format="pdf",
                           verbose=True,
                           folds=5)

    # PPI evaluation
    # NOTE(review): this second benchmark always uses the experimental
    # scores and args.output_dir (without the prefix) -- confirm intended.
    print(utils.cv_bench_clf(scoreCalc,
                             clf,
                             gs,
                             args.output_dir,
                             verbose=False,
                             format="pdf",
                             folds=5))

    network = utils.make_predictions(scoreCalc,
                                     args.mode,
                                     clf,
                                     gs,
                                     fun_anno=functionalData)

    # Predict protein interaction: keep edges at or above the cutoff.
    pred_path = "%s.pred.txt" % (output_dir)
    final_network = [
        PPI for PPI in network
        if float(PPI.split("\t")[2]) >= args.classifier_cutoff
    ]
    with open(pred_path, "w") as outFH:
        outFH.write("\n".join(final_network) + "\n")

    # Predicting clusters
    clust_path = "%s.clust.txt" % (output_dir)
    utils.predict_clusters(pred_path, clust_path)

    # Evaluating predicted clusters
    pred_clusters = GS.Clusters(False)
    pred_clusters.read_file(clust_path)
    overlapped_complexes_with_reference = gs.get_complexes(
    ).get_overlapped_complexes_set(pred_clusters)
    print("# of complexes in reference dataset: " + str(
        len(overlapped_complexes_with_reference)))

    # Only the first two return values are used. Indexing instead of
    # unpacking keeps this working whether the helper returns two values
    # or three, as other callers in this code base expect.
    evaluation = utils.clustering_evaluation(gs.complexes, pred_clusters, "",
                                             False)
    clust_scores = evaluation[0].split("\t")
    header = evaluation[1].split("\t")
    with open("%s.eval.txt" % (output_dir), "w") as outFH:
        for head, score in zip(header, clust_scores):
            line = "%s\t%s" % (head, score)
            print(line)
            outFH.write(line + "\n")