def kfold(mleft, mright, nmleft, nmright, numtree, biniter, seed):
    """Run a leave-one-pair-out cross-validation of the match classifier.

    ``mleft[i]`` / ``mright[i]`` form the i-th matching pair and
    ``nmleft[i]`` / ``nmright[i]`` the i-th non-matching pair. For every
    fold ``i`` a fresh classifier is trained on the remaining pairs and is
    then asked to classify the held-out match and the held-out non-match.
    The out-of-bag value of every fold and both predictions are printed.

    The number of folds is the number of positions for which both a match
    pair and a non-match pair exist; pairs are formed positionally, so
    surplus entries of a longer list are ignored.

    Args:
        mleft: left members of the matching pairs.
        mright: right members of the matching pairs.
        nmleft: left members of the non-matching pairs.
        nmright: right members of the non-matching pairs.
        numtree: number of trees handed to ``createClassifier``.
        biniter: passed through unchanged to every ``featureList`` call.
        seed: seed handed to ``createClassifier``.

    Returns:
        The 4-tuple ``("The performance is:", performance,
        "and the mean oob is:", moob)``. ``performance`` is the fraction of
        held-out pairs classified as expected (first element of the
        prediction equal to 1.0 for matches and 0.0 for non-matches) and
        ``moob`` is the mean of the per-fold out-of-bag values.

    Raises:
        ValueError: if there is not at least one match pair and one
            non-match pair, i.e. no fold can be evaluated.
    """
    matches = list(zip(mleft, mright))
    nonmatches = list(zip(nmleft, nmright))

    # Every fold needs one held-out pair of each kind. The bound is derived
    # from the arguments instead of a module-level name.
    folds = min(len(matches), len(nonmatches))
    if folds == 0:
        raise ValueError(
            "kfold needs at least one match pair and one non-match pair")

    matchresult = []
    nonmatchresult = []
    oob = 0.0

    for fold in range(folds):
        held_match = matches[fold]
        held_nonmatch = nonmatches[fold]

        train_left, train_right = _without_held_out(matches, held_match)
        matching = featureList(train_left, train_right, biniter)
        train_left, train_right = _without_held_out(nonmatches, held_nonmatch)
        nonmatching = featureList(train_left, train_right, biniter)

        # list() keeps the values indexable on Python 3 as well.
        match_features = list(matching.values())
        nonmatch_features = list(nonmatching.values())
        num_features = len(match_features[0])

        # +1 to include the class
        classifier = createClassifier(numtree, num_features + 1, seed)
        outofbag = trainClassifier(classifier, match_features,
                                   nonmatch_features)
        oob += float(outofbag)

        # Classify each held-out pair once and reuse the result for both
        # the log line and the score.
        match_prediction = _classify_pair(
            classifier, held_match[0], held_match[1], biniter)
        nonmatch_prediction = _classify_pair(
            classifier, held_nonmatch[0], held_nonmatch[1], biniter)

        # "%s" formatting yields the same text as the Python 2 print
        # statement while staying valid syntax on Python 3. The labels are
        # kept verbatim (including the "raining" spelling) in case existing
        # logs are parsed.
        print("oob = %s" % (outofbag,))
        print("match vs raining set: %s" % (match_prediction,))
        print("nonmatch vs training set: %s" % (nonmatch_prediction,))

        matchresult.append(match_prediction)
        nonmatchresult.append(nonmatch_prediction)

    correct = sum(1 for result in matchresult if result[0] == 1.0)
    correct += sum(1 for result in nonmatchresult if result[0] == 0.0)

    performance = float(correct) / float(2 * folds)
    moob = oob / float(folds)
    return "The performance is:", performance, "and the mean oob is:", moob


def _without_held_out(pairs, held):
    """Return (lefts, rights) of the pairs that share no member with held.

    A pair is dropped as a whole when its left member equals the held-out
    left member or its right member equals the held-out right member, so
    the two returned lists always stay aligned pair by pair.
    """
    held_left, held_right = held
    kept = [(left, right) for left, right in pairs
            if not (left == held_left or right == held_right)]
    return [left for left, _ in kept], [right for _, right in kept]


def _classify_pair(classifier, left, right, biniter):
    """Classify the single pair (left, right) with a trained classifier."""
    features = list(featureList([left], [right], biniter).values())[0]
    return classify(classifier, [features])