def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True
        self.label = util.mode(Y)
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; what labels
            # would go left and right?
            leftY = Y[X[:, d] < 0.5]
            rightY = Y[X[:, d] >= 0.5]

            # we'll classify the left points as their most
            # common class and ditto right points. our error
            # is how many are not their mode.
            error = sum(leftY != util.mode(leftY)) + sum(rightY != util.mode(rightY))

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})

            # recurse on our children with the split data, prepending the
            # chosen feature to 'used' so it isn't reused on this path
            self.left.trainDT(X[X[:, bestFeature] < 0.5, :],
                              Y[X[:, bestFeature] < 0.5],
                              maxDepth - 1, [bestFeature] + used)
            self.right.trainDT(X[X[:, bestFeature] >= 0.5, :],
                               Y[X[:, bestFeature] >= 0.5],
                               maxDepth - 1, [bestFeature] + used)
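# All of the trainDT variants in this collection lean on two small helpers
# from the course's util module. Their real implementations aren't shown
# here; this is a minimal sketch of the behavior the call sites assume
# (uniq: the distinct values; mode: the most common value). The empty-input
# default and the loss of ordering are guesses, not the module's contract.
from collections import Counter

def uniq(seq):
    # distinct values, e.g. uniq([1, 1, -1]) -> [1, -1] (order not guaranteed)
    return list(set(seq))

def mode(seq):
    # most common value, e.g. mode([1, 1, -1]) -> 1
    if len(seq) == 0:
        return 1   # assumed default for an empty split
    return Counter(seq).most_common(1)[0][0]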
def trainDT(self, X, Y, maxDepth, used):
    # size of the data set
    N, D = X.shape

    # stopping criteria
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        self.isLeaf = True
        self.label = util.mode(Y)
    else:
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue
            # split strictly below 0.5 to the left so no point lands in both branches
            leftY = Y[X[:, d] < 0.5]
            rightY = Y[X[:, d] >= 0.5]
            leftYmode = util.mode(leftY)
            rightYmode = util.mode(rightY)
            leftYerror = (leftY != leftYmode).sum()
            rightYerror = (rightY != rightYmode).sum()
            error = leftYerror + rightYerror
            if error <= bestError:
                bestFeature = d
                bestError = error

        # error check
        if bestFeature < 0:
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature
            # pass a fresh list so sibling subtrees don't see each other's features
            childUsed = used + [bestFeature]
            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})
            self.left.trainDT(X[X[:, bestFeature] < 0.5, :],
                              Y[X[:, bestFeature] < 0.5],
                              maxDepth - 1, childUsed)
            self.right.trainDT(X[X[:, bestFeature] >= 0.5, :],
                               Y[X[:, bestFeature] >= 0.5],
                               maxDepth - 1, childUsed)
def handle_agent_action(sock, agent, action, data):
    if action == 'register':
        agent = data['agent']
        inputs = uniq(data['inputs'])
        cleanses = uniq(data['cleanses'])
        outputs = uniq(data['outputs'])
        assert all(e in outputs for e in inputs + cleanses)
        agents[agent] = dict(time=time.time(), socket=sock, inputs=inputs,
                             cleanses=cleanses, outputs=outputs)
        add_agent(agent)
        return dict(status='ok', agent=agent)
    elif action == 'ping':
        agents[agent]['time'] = time.time()
        return dict(status='ok', result='pong')
    elif action == 'reply':
        agents[agent]['time'] = time.time()
        job = data['id']
        data = dict(status=data['status'], data=data['data'])
        forward(agent, job, data)
        return dict(status='ok')
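# A hypothetical 'register' payload that would satisfy the assertion above
# (every input and cleanse must also appear in outputs). The field names come
# from the handler; the concrete values are made up for illustration.
example_register = {
    'agent': 'crawler-1',
    'inputs': ['url'],
    'cleanses': ['html'],
    'outputs': ['url', 'html', 'text'],
}
# handle_agent_action(sock, None, 'register', example_register)
# -> {'status': 'ok', 'agent': 'crawler-1'}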
def search_page(company):
    feeders = get_company_feeders(company).values()
    feeder_names = [feeder['group'].partition('-')[2] for feeder in feeders]
    agents = [agent for feeder in feeders
              for agent in feeder['agent2data'].keys()]
    inputs = [inp for feeder in feeders
              for agent_data in feeder['agent2data'].values()
              for inp in agent_data['inputs']]
    inputs = uniq(inputs)
    job = None
    reqvalues = {k: v for k, v in request.values.items()}  # flatten request values into a plain dict
    msg = reqvalues.get('msg')
    job = reqvalues.get('job')
    if msg:
        pass
    elif job:
        pass
    elif reqvalues:
        job = create_job()
        print(reqvalues)
        data = dict(action='find', job=job, data=dict(reqvalues))
        print('sending search:', data)
        send2company(company, data)
    return render_template('search.html', current_user=current_user,
                           company=company, feeders=feeder_names,
                           agents=agents, inputs=inputs, job=job,
                           reqvalues=reqvalues, usrmsg=msg)
def files(self, extension='*', newer=True):
    """Returns a list of existing files matching our basenames and the
    given extension.

    First the files basename + extension are returned, then
    basename + '-[0-9]+' + extension, then basename + '-.+' + extension.

    If newer is True (the default), only files that are newer than the
    jobfile() are returned.

    """
    jobfile = self.jobfile()
    if jobfile:
        files = util.files(self.basenames(), extension)
        if newer:
            try:
                mtime = os.path.getmtime(jobfile)
                files = filter(lambda fname: os.path.getmtime(fname) >= mtime, files)
            except (OSError, IOError):
                pass
        return list(util.uniq(files))
    return []
def trainDT(self, X, Y, maxDepth, criterion, used):
    """ recursively build the decision tree """

    # get the size of the data set: N rows (examples), D columns (features)
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True
        self.label = util.mode(Y)   # the most common class becomes the leaf label
    else:
        if criterion == 'ig':   # information gain
            # compute the entropy at this node
            def entropy(y):
                # binary entropy of a +/-1 label vector
                P = np.count_nonzero(y == 1)
                N = np.count_nonzero(y == -1)
                S = P + N
                a = -(P / S) * math.log(P / S, 2) if P > 0 else 0
                b = -(N / S) * math.log(N / S, 2) if N > 0 else 0
                return a + b

            self.entropy = entropy(Y)

        # we need to find a feature to split on
        bestFeature = -1                 # which feature gives the best split
        # use error stats or gain stats (not both) depending on criterion
        bestError = np.finfo('d').max    # initialize error stats (largest double)
        bestGain = np.finfo('d').min     # initialize gain stats (smallest double)

        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; values <= 0.5 go left,
            # values > 0.5 go right
            leftY = Y[X[:, d] <= 0.5]
            rightY = Y[X[:, d] > 0.5]

            # misclassification rate
            if criterion == 'mr':
                # we'll classify the left points as their most common class
                # and ditto right points; the error is how many are not their mode
                count_left = util.mode(leftY)
                count_right = util.mode(rightY)
                error_lefttree = len(leftY[leftY != count_left])
                error_righttree = len(rightY[rightY != count_right])
                error = error_lefttree + error_righttree

                # update min, max, bestFeature
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            # information gain
            elif criterion == 'ig':
                # branch sizes, not nonzero counts, so a 0 label can't break this
                Total = len(Y)
                N1 = len(leftY)
                P1 = len(rightY)
                gain = entropy(Y) - (N1 / Total) * entropy(leftY) \
                                  - (P1 / Total) * entropy(rightY)

                # update min, max, bestFeature
                if gain >= bestGain:
                    bestFeature = d
                    bestGain = gain

        self.gain = bestGain   # information gain corresponding to this split

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature

            self.left = DT({'maxDepth': maxDepth - 1, 'criterion': criterion})   # left subtree
            self.right = DT({'maxDepth': maxDepth - 1, 'criterion': criterion})  # right subtree

            # recurse on our children: split the rows by the chosen feature
            # and mark it used (fresh list, so siblings aren't affected)
            child_used = used + [bestFeature]
            Y_left = Y[X[:, bestFeature] <= 0.5]
            Y_right = Y[X[:, bestFeature] > 0.5]
            X_left = X[X[:, bestFeature] <= 0.5, :]
            X_right = X[X[:, bestFeature] > 0.5, :]
            self.left.trainDT(X_left, Y_left, maxDepth - 1, criterion, child_used)
            self.right.trainDT(X_right, Y_right, maxDepth - 1, criterion, child_used)
def from_corpus(cls, corpus, unk=None):
    vocab = util.uniq(reduce(lambda x, y: x + y, corpus))
    return cls(vocab, unk)
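# A hypothetical invocation of from_corpus: the corpus is a list of token
# lists, reduce concatenates them, and util.uniq collapses the result into
# a vocabulary. 'Vocab' is an assumed name for the receiving class; note
# that on Python 3, reduce must be imported from functools.
# corpus = [['the', 'cat'], ['the', 'dog']]
# v = Vocab.from_corpus(corpus, unk='<unk>')   # vocabulary: the, cat, dog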
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = util.raiseNotDefined()    ### TODO: YOUR CODE HERE
        self.label = util.raiseNotDefined()     ### TODO: YOUR CODE HERE
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; what labels
            # would go left and right?
            leftY = util.raiseNotDefined()    ### TODO: YOUR CODE HERE
            rightY = util.raiseNotDefined()   ### TODO: YOUR CODE HERE

            # we'll classify the left points as their most
            # common class and ditto right points. our error
            # is how many are not their mode.
            error = util.raiseNotDefined()    ### TODO: YOUR CODE HERE

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = util.raiseNotDefined()    ### TODO: YOUR CODE HERE
            self.feature = util.raiseNotDefined()   ### TODO: YOUR CODE HERE

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})

            # recurse on our children by calling
            #   self.left.trainDT(...)
            # and
            #   self.right.trainDT(...)
            # with appropriate arguments
            ### TODO: YOUR CODE HERE
            util.raiseNotDefined()
def trainDT(self, X, Y, maxDepth, criterion, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = util.raiseNotDefined()    ### TODO: YOUR CODE HERE
        self.label = util.raiseNotDefined()     ### TODO: YOUR CODE HERE
    else:
        if criterion == 'ig':   # information gain
            # compute the entropy at this node
            ### TODO: YOUR CODE HERE
            self.entropy = util.raiseNotDefined()

        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        # use error stats or gain stats (not both) depending on criterion
        bestError = np.finfo('d').max   # initialize error stats
        bestGain = np.finfo('d').min    # initialize gain stats

        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; what labels
            # would go left and right?
            leftY = util.raiseNotDefined()    ### TODO: YOUR CODE HERE
            rightY = util.raiseNotDefined()   ### TODO: YOUR CODE HERE

            # misclassification rate
            if criterion == 'mr':
                # we'll classify the left points as their most
                # common class and ditto right points. our error
                # is how many are not their mode.
                error = util.raiseNotDefined()   ### TODO: YOUR CODE HERE

                # update min, max, bestFeature
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            # information gain
            elif criterion == 'ig':
                # now use information gain
                gain = util.raiseNotDefined()    ### TODO: YOUR CODE HERE

                # update min, max, bestFeature
                if gain >= bestGain:
                    bestFeature = d
                    bestGain = gain

        self.gain = bestGain   # information gain corresponding to this split

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = util.raiseNotDefined()    ### TODO: YOUR CODE HERE
            self.feature = util.raiseNotDefined()   ### TODO: YOUR CODE HERE

            self.left = DT({'maxDepth': maxDepth - 1, 'criterion': criterion})
            self.right = DT({'maxDepth': maxDepth - 1, 'criterion': criterion})

            # recurse on our children by calling
            #   self.left.trainDT(...)
            # and
            #   self.right.trainDT(...)
            # with appropriate arguments
            ### TODO: YOUR CODE HERE
            util.raiseNotDefined()
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True          # well, that's a leaf
        self.label = util.mode(Y)   # and return the mode of the labels
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; what labels
            # would go left and right?
            leftY = Y[X[:, d] < 0.5]     # labels whose feature value is less than .5
            rightY = Y[X[:, d] >= 0.5]   # labels whose feature value is at least .5

            # count, in each branch, how many labels differ from their mode
            error = size(nonzero(leftY != util.mode(leftY))) + \
                    size(nonzero(rightY != util.mode(rightY)))

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False           # not a leaf: it's a whole branch
            self.feature = bestFeature    # which carries its own feature

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})

            # split X and Y by the best feature and recurse on both children
            used = used + [self.feature]   # anti infinite loop
            leftX = X[X[:, self.feature] < 0.5]
            rightX = X[X[:, self.feature] >= 0.5]
            leftY = Y[X[:, self.feature] < 0.5]
            rightY = Y[X[:, self.feature] >= 0.5]
            self.left.trainDT(leftX, leftY, maxDepth - 1, used)
            self.right.trainDT(rightX, rightY, maxDepth - 1, used)
def hosts(self):
    return sorted(util.uniq([row.host for row in self]))
def pools(self):
    return sorted(util.uniq([row.cell for row in self]))
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True
        self.label = util.mode(Y)
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # split Y according to the value of feature d
            left_vy = Y[X[:, d] <= 0]
            right_vy = Y[X[:, d] > 0]

            # majority class on each side
            leftY = util.mode(left_vy)
            rightY = util.mode(right_vy)

            # count errors: points that disagree with their side's majority
            error = len(left_vy[left_vy != leftY]) + len(right_vy[right_vy != rightY])

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature

            # update used features; give each child its own copy
            used.append(bestFeature)
            left_used = deepcopy(used)
            right_used = deepcopy(used)

            # split the training data according to the selected feature
            left_vx = X[X[:, self.feature] <= 0]
            left_vy = Y[X[:, self.feature] <= 0]
            right_vx = X[X[:, self.feature] > 0]
            right_vy = Y[X[:, self.feature] > 0]

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})
            self.left.trainDT(left_vx, left_vy, maxDepth - 1, left_used)
            self.right.trainDT(right_vx, right_vy, maxDepth - 1, right_used)
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True
        self.label = util.mode(Y)
    else:
        # get the size of the data set
        N, D = X.shape

        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # put negative values on the left and positive on the right
            negInd = [i for i, x in enumerate(X) if x[d] < 0.5]    # indices
            negVal = [x for i, x in enumerate(X) if x[d] < 0.5]    # entire x rows
            posInd = [i for i, x in enumerate(X) if x[d] >= 0.5]   # indices
            posVal = [x for i, x in enumerate(X) if x[d] >= 0.5]   # entire x rows

            leftY = [Y[i] for i in negInd]
            rightY = [Y[i] for i in posInd]

            # guess each side's label from the mean of its +/-1 labels
            left_guess = 0
            if len(leftY) != 0:
                left_guess = 1 if np.mean(leftY) >= 0 else -1
            right_guess = 0
            if len(rightY) != 0:
                right_guess = 1 if np.mean(rightY) >= 0 else -1

            # calculate the error by counting mislabeled points
            num_errors = 0.0
            for y in leftY:
                if y != left_guess:
                    num_errors += 1
            for y in rightY:
                if y != right_guess:
                    num_errors += 1
            error = num_errors / N

            # check to see if this is a better error rate
            if error <= bestError:
                permNeg = array(negVal)
                permPos = array(posVal)
                permLeft = array(leftY)
                permRight = array(rightY)
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})

            # recurse on our children with the saved splits; pass a fresh
            # 'used' list so the two subtrees don't share mutations
            self.left.trainDT(permNeg, permLeft, maxDepth - 1, used + [bestFeature])
            self.right.trainDT(permPos, permRight, maxDepth - 1, used + [bestFeature])
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    if len(X) <= 0:
        N = D = 0
    else:
        N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True
        self.label = util.mode(Y)
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; tally the labels that would
            # go left (feature < 0.5) and right (feature >= 0.5)
            counterno = util.Counter()
            counteryes = util.Counter()
            for i, x in enumerate(X):
                if x[d] < 0.5:
                    counterno['NO' if Y[i] < 0 else 'YES'] += 1
                else:
                    counteryes['NO' if Y[i] < 0 else 'YES'] += 1
            leftY = 1 if counterno['YES'] >= counterno['NO'] else -1
            rightY = 1 if counteryes['YES'] >= counteryes['NO'] else -1

            # we'll classify the left points as their most common class and
            # ditto right points; the error is each side's minority count
            error = counterno['YES' if counterno['YES'] < counterno['NO'] else 'NO'] + \
                    counteryes['YES' if counteryes['YES'] < counteryes['NO'] else 'NO']

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature

            # copy 'used' so sibling subtrees are unaffected
            new_used = used[:]
            new_used.append(bestFeature)

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})

            # split the data by the best feature and recurse on both children
            nos = [[], []]
            yess = [[], []]
            for i, x in enumerate(X):
                if x[bestFeature] < 0.5:
                    nos[0].append(x)
                    nos[1].append(Y[i])
                else:
                    yess[0].append(x)
                    yess[1].append(Y[i])
            self.left.trainDT(array(nos[0]), nos[1], maxDepth - 1, new_used)
            self.right.trainDT(array(yess[0]), yess[1], maxDepth - 1, new_used)
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True
        self.label = util.mode(Y)   # no decision left to make
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; what labels
            # would go left and right?
            leftY = Y[X[:, d] < 0.5]    # left for feature values less than 0.5
            rightY = Y[X[:, d] >= 0.5]

            # we'll classify the left points as their most
            # common class and ditto right points. our error
            # is how many are not their mode.
            error = size((leftY != util.mode(leftY)).nonzero()) + \
                    size((rightY != util.mode(rightY)).nonzero())

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})

            # recurse on our children with the split data
            self.left.trainDT(X[X[:, self.feature] < 0.5],
                              Y[X[:, self.feature] < 0.5],
                              self.left.opts['maxDepth'],
                              used + [self.feature])
            self.right.trainDT(X[X[:, self.feature] >= 0.5],
                               Y[X[:, self.feature] >= 0.5],
                               self.right.opts['maxDepth'],
                               used + [self.feature])

            # for chi-square pruning: 2x2 table of class counts
            # (rows: left/right branch; columns: label +1/-1)
            self.split = array(
                [[size((Y[X[:, self.feature] < 0.5] == 1).nonzero()),
                  size((Y[X[:, self.feature] < 0.5] == -1).nonzero())],
                 [size((Y[X[:, self.feature] >= 0.5] == 1).nonzero()),
                  size((Y[X[:, self.feature] >= 0.5] == -1).nonzero())]])
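# A minimal sketch of how the self.split table above could drive chi-square
# pruning. Only the 2x2 count layout comes from the snippet; the statistic,
# the helper name, and the threshold are standard but assumed here. The test
# statistic is the usual sum over cells of (observed - expected)^2 / expected
# under branch/label independence (assumes no all-zero row or column).
import numpy as np

def chi_square_stat(split):
    # split: 2x2 array, rows = left/right branch, cols = +1/-1 label counts
    total = split.sum()
    expected = np.outer(split.sum(axis=1), split.sum(axis=0)) / total
    return ((split - expected) ** 2 / expected).sum()

# e.g. one might prune a split when chi_square_stat(node.split) < 3.841,
# the 95% critical value of chi-square with one degree of freedom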
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = util.raiseNotDefined()    ### TODO: YOUR CODE HERE
        self.label = util.raiseNotDefined()     ### TODO: YOUR CODE HERE
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; what labels
            # would go left and right?
            leftY = util.raiseNotDefined()    ### TODO: YOUR CODE HERE
            rightY = util.raiseNotDefined()   ### TODO: YOUR CODE HERE

            # we'll classify the left points as their most
            # common class and ditto right points. our error
            # is how many are not their mode.
            error = util.raiseNotDefined()    ### TODO: YOUR CODE HERE

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = util.raiseNotDefined()    ### TODO: YOUR CODE HERE
            self.feature = util.raiseNotDefined()   ### TODO: YOUR CODE HERE

            self.left = DT({"maxDepth": maxDepth - 1})
            self.right = DT({"maxDepth": maxDepth - 1})

            # recurse on our children by calling
            #   self.left.trainDT(...)
            # and
            #   self.right.trainDT(...)
            # with appropriate arguments
            ### TODO: YOUR CODE HERE
            util.raiseNotDefined()
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True
        self.label = util.mode(Y)
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; what labels
            # would go left and right?
            leftY = Y[X[:, d] < 0.5]
            rightY = Y[X[:, d] >= 0.5]

            # we'll classify the left points as their most
            # common class and ditto right points. our error
            # is how many are not their mode.
            error = size((leftY != util.mode(leftY)).nonzero()) + \
                    size((rightY != util.mode(rightY)).nonzero())

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})

            # split the data and labels with the best feature, then recurse
            leftD = X[X[:, self.feature] < 0.5]
            rightD = X[X[:, self.feature] >= 0.5]
            leftY = Y[X[:, self.feature] < 0.5]
            rightY = Y[X[:, self.feature] >= 0.5]
            used = used + [self.feature]
            self.left.trainDT(leftD, leftY, self.left.opts['maxDepth'], used)
            self.right.trainDT(rightD, rightY, self.right.opts['maxDepth'], used)
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape
    bestLeftY = []
    bestRightY = []

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True
        self.label = util.mode(Y)
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; what labels
            # would go left and right?
            leftY = Y[X[:, d] < 0.5]
            rightY = Y[X[:, d] >= 0.5]

            # we'll classify the left points as their most
            # common class and ditto right points. our error
            # is how many are not their mode.
            error = self.sizeWithoutK(util.mode(leftY), leftY) + \
                    self.sizeWithoutK(util.mode(rightY), rightY)

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error
                bestLeftY = leftY
                bestRightY = rightY

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})

            # recurse on our children with the saved label splits, passing a
            # fresh 'used' list so sibling subtrees stay independent
            used = used + [self.feature]
            self.left.trainDT(X[X[:, self.feature] < 0.5, :],
                              bestLeftY, maxDepth - 1, used)
            self.right.trainDT(X[X[:, self.feature] >= 0.5, :],
                               bestRightY, maxDepth - 1, used)
def gen_phases(start, end, minn=0, maxx=4):
    for phase in ['{:0>5}'.format(str(x)) for x in range(start, end)]:
        if len([ch for ch in phase if not between(int(ch), minn, maxx)]) == 0:
            if util.uniq(phase):
                yield phase
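# Note that util.uniq acts as a predicate here, so in this project it is
# presumably truthy only when all characters of the phase are distinct
# (unlike the list-returning uniq in the other snippets); an equivalent
# check would be len(set(phase)) == len(phase). Assuming between() is an
# inclusive bounds test, a small illustrative call:
# list(gen_phases(1230, 1235)) -> ['01234']
# ('01230' through '01233' each repeat a digit and are filtered out)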
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True
        self.label = util.mode(Y)
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        leftY = []
        rightY = []
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            # suppose we split on this feature; count how each side's
            # labels fall out
            countLeftpos = 0
            countLeftneg = 0
            countRightpos = 0
            countRightneg = 0
            for i in range(N):
                if X[i, d] < 0.5 and Y[i] == 1:
                    countLeftpos += 1
                if X[i, d] < 0.5 and Y[i] == -1:
                    countLeftneg += 1
                if X[i, d] >= 0.5 and Y[i] == 1:
                    countRightpos += 1
                if X[i, d] >= 0.5 and Y[i] == -1:
                    countRightneg += 1

            # we'll classify each side as its most common class, so each
            # side's error is its minority count
            errorLeft = min(countLeftpos, countLeftneg)
            errorRight = min(countRightpos, countRightneg)
            error = errorLeft + errorRight

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature
            used = used + [bestFeature]   # fresh list, so siblings stay independent

            leftX = X[X[:, bestFeature] < 0.5, :]
            rightX = X[X[:, bestFeature] >= 0.5, :]
            for i in range(N):
                if X[i, bestFeature] < 0.5:
                    leftY.append(Y[i])
                else:
                    rightY.append(Y[i])

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})
            self.left.trainDT(leftX, leftY, maxDepth - 1, used)
            self.right.trainDT(rightX, rightY, maxDepth - 1, used)
def trainDT(self, X, Y, maxDepth, used):
    """ recursively build the decision tree """

    # get the size of the data set
    N, D = X.shape

    # check to see if we're either out of depth or no longer
    # have any decisions to make
    if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
        # we'd better end at this point. need to figure
        # out the label to return
        self.isLeaf = True
        self.label = util.mode(Y)
    else:
        # we need to find a feature to split on
        bestFeature = -1   # which feature has lowest error
        bestError = N      # the number of errors for this feature
        for d in range(D):
            # have we used this feature yet
            if d in used:
                continue

            xFiltered = X[:, d]
            leftY = Y[xFiltered < 0.5]
            rightY = Y[xFiltered >= 0.5]

            # we'll classify the left points as their most
            # common class and ditto right points. our error
            # is how many are not their mode.
            error = size((leftY != util.mode(leftY)).nonzero()) + \
                    size((rightY != util.mode(rightY)).nonzero())

            # check to see if this is a better error rate
            if error <= bestError:
                bestFeature = d
                bestError = error

        if bestFeature < 0:
            # this shouldn't happen, but just in case...
            self.isLeaf = True
            self.label = util.mode(Y)
        else:
            self.isLeaf = False
            self.feature = bestFeature

            self.left = DT({'maxDepth': maxDepth - 1})
            self.right = DT({'maxDepth': maxDepth - 1})

            # recurse on our children with the split data
            xFiltered = X[:, self.feature]
            leftY = Y[xFiltered < 0.5]
            rightY = Y[xFiltered >= 0.5]
            self.left.trainDT(X[xFiltered < 0.5], leftY,
                              maxDepth - 1, used + [self.feature])
            self.right.trainDT(X[xFiltered >= 0.5], rightY,
                               maxDepth - 1, used + [self.feature])
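# A hypothetical top-level call for the trainDT variants above. The wrapper
# class, the option name, and the data shapes follow the snippets (binary
# features, +/-1 labels), but this exact driver is not part of any of them.
import numpy as np

X = np.array([[1, 0], [1, 1], [0, 0], [0, 1]])   # N=4 examples, D=2 binary features
Y = np.array([1, 1, -1, -1])                     # labels are +/-1

tree = DT({'maxDepth': 2})
tree.trainDT(X, Y, maxDepth=2, used=[])          # 'used' starts empty at the root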