예제 #1
0
def _drop_held_out_pair(left, right, held_out):
  """Return copies of `left` and `right` without the held-out pair.

  A position is dropped when BOTH its left and right items compare equal to
  the items at index `held_out`, so exact duplicates of the held-out pair are
  removed as well. Filtering the two lists together keeps them the same
  length and index-aligned.
  """
  target_left = left[held_out]
  target_right = right[held_out]
  kept_left = []
  kept_right = []
  for left_item, right_item in zip(left, right):
    if left_item == target_left and right_item == target_right:
      continue
    kept_left.append(left_item)
    kept_right.append(right_item)
  return kept_left, kept_right


def kfold(mleft, mright, nmleft, nmright, numtree, biniter, seed):
  """Leave-one-out cross-validation of the match / non-match classifier.

  For every fold index one matching pair (mleft[i], mright[i]) and one
  non-matching pair (nmleft[i], nmright[i]) are held out, a classifier is
  trained on the remaining pairs, and both held-out pairs are classified.

  Parameters:
    mleft, mright    -- index-aligned sequences forming the matching pairs.
    nmleft, nmright  -- index-aligned sequences forming the non-matching pairs.
    numtree          -- passed to createClassifier as the number of trees.
    biniter          -- passed unchanged to every featureList call.
    seed             -- passed to createClassifier.

  The number of folds is the length of the shortest of the four sequences,
  so every fold index is valid for all of them.

  Returns the 4-tuple
    ("The performance is:", performance, "and the mean oob is:", moob)
  where `performance` is the fraction of held-out samples classified as
  expected (first element of the classify result equal to 1.0 for matches
  and 0.0 for non-matches) and `moob` is the mean over folds of the value
  returned by trainClassifier (presumably the out-of-bag error -- confirm
  against trainClassifier).

  Raises ZeroDivisionError when there are no folds (any input is empty).
  """
  # Derive the fold count from the data actually passed in instead of a
  # module-level list, so the indices below can never run past the inputs.
  n_folds = min(len(mleft), len(mright), len(nmleft), len(nmright))

  matchresult = []
  nonmatchresult = []
  oob = 0.0

  for fold in range(n_folds):
    # Training data: everything except the held-out pairs.
    train_ml, train_mr = _drop_held_out_pair(mleft, mright, fold)
    train_nl, train_nr = _drop_held_out_pair(nmleft, nmright, fold)

    # list(...) keeps the feature vectors indexable on Python 2 and 3 alike.
    matching = list(featureList(train_ml, train_mr, biniter).values())
    nonmatching = list(featureList(train_nl, train_nr, biniter).values())

    numFeatures = len(matching[0])
    classifier = createClassifier(numtree, numFeatures + 1, seed)  # +1 to include the class
    outofbag = trainClassifier(classifier, matching, nonmatching)
    oob += float(outofbag)

    # Featurise and classify each held-out pair once; reuse the result for
    # both the progress output and the final tally.
    held_match = list(featureList([mleft[fold]], [mright[fold]], biniter).values())[0]
    held_nonmatch = list(featureList([nmleft[fold]], [nmright[fold]], biniter).values())[0]
    match_pred = classify(classifier, [held_match])
    nonmatch_pred = classify(classifier, [held_nonmatch])

    # Single-argument print(...) emits the same text on Python 2 and 3.
    # Message wording is kept verbatim from the original output.
    print("oob = %s" % (outofbag,))
    print("match vs raining set: %s" % (match_pred,))
    print("nonmatch vs training set: %s" % (nonmatch_pred,))

    matchresult.append(match_pred)
    nonmatchresult.append(nonmatch_pred)

  # A match is correct when predicted 1.0, a non-match when predicted 0.0.
  correct = sum(1 for pred in matchresult if pred[0] == 1.0)
  correct += sum(1 for pred in nonmatchresult if pred[0] == 0.0)

  # Two held-out samples (one match, one non-match) are scored per fold.
  performance = float(correct) / float(2 * n_folds)
  moob = oob / float(n_folds)
  return "The performance is:", performance, "and the mean oob is:", moob