Example #1
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N,D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label  = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1     # which feature has lowest error
            bestError   = N      # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY  = Y[X[:,d] <  0.5]
                rightY = Y[X[:,d] >= 0.5]

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is how many are not their mode.
                error = sum(leftY != util.mode(leftY)) + sum(rightY != util.mode(rightY))

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError   = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label  = util.mode(Y)

            else:
                self.isLeaf  = False
                self.feature = bestFeature

                self.left  = DT({'maxDepth': maxDepth-1})
                self.right = DT({'maxDepth': maxDepth-1})
                # recurse on our children by calling
                #   self.left.trainDT(...) 
                # and
                #   self.right.trainDT(...) 
                # with appropriate arguments
                self.left.trainDT( X[ X[:,bestFeature] <  0.5, :], Y[ X[:,bestFeature] <  0.5 ], maxDepth-1, [bestFeature] + used)
                self.right.trainDT(X[ X[:,bestFeature] >= 0.5, :], Y[ X[:,bestFeature] >= 0.5 ], maxDepth-1, [bestFeature] + used)
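A minimal usage sketch for the solution above, assuming binary 0/1 features, +1/-1 labels, and the DT class/API visible in these examples (util and DT are not shown here, so this is illustrative rather than definitive):

import numpy as np

X = np.array([[0, 1],
              [1, 1],
              [0, 0],
              [1, 0]])          # four points, two binary features
Y = np.array([1, 1, -1, -1])    # labels agree exactly with feature 1

tree = DT({'maxDepth': 2})
tree.trainDT(X, Y, 2, [])       # signature: (X, Y, maxDepth, used)
# the root should split on feature 1, yielding two pure leaf children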
Example #2
    def trainDT(self, X, Y, maxDepth, used):
        # size of the data set
        N, D = X.shape

        # stopping criteria
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature

            for d in range(D):

                # have we used this feature yet
                if d in used:
                    continue

                leftY = Y[X[:, d] < 0.5]
                rightY = Y[X[:, d] >= 0.5]

                leftYmode = util.mode(leftY)
                rightYmode = util.mode(rightY)

                leftYerror = (leftY != leftYmode).sum()
                rightYerror = (rightY != rightYmode).sum()

                error = leftYerror + rightYerror
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            # Error check.
            if bestFeature < 0:
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False
                self.feature = bestFeature
                used = used + [bestFeature]  # new list, so sibling subtrees don't share mutations

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})

                self.left.trainDT(X[X[:, bestFeature] < 0.5, :],
                                  Y[X[:, bestFeature] < 0.5], maxDepth - 1,
                                  used)
                self.right.trainDT(X[X[:, bestFeature] >= 0.5, :],
                                   Y[X[:, bestFeature] >= 0.5], maxDepth - 1,
                                   used)
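One subtlety in this variant: the original called used.append(bestFeature) on the caller's list before recursing, so features consumed inside the left subtree also leaked into the right subtree's used list. Rebinding with used = used + [bestFeature], as above, creates a fresh list per node. A tiny illustration of the difference:

used = [0]
left_used = used + [1]   # new list [0, 1]; `used` itself is still [0]
used.append(2)           # in-place: every alias of `used` now sees [0, 2]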
Example #3
def handle_agent_action(sock, agent, action, data):
    if action == 'register':
        agent = data['agent']
        inputs = uniq(data['inputs'])
        cleanses = uniq(data['cleanses'])
        outputs = uniq(data['outputs'])
        assert all(e in outputs for e in inputs + cleanses)
        agents[agent] = dict(time=time.time(),
                             socket=sock,
                             inputs=inputs,
                             cleanses=cleanses,
                             outputs=outputs)
        add_agent(agent)
        return dict(status='ok', agent=agent)
    elif action == 'ping':
        agents[agent]['time'] = time.time()
        return dict(status='ok', result='pong')
    elif action == 'reply':
        agents[agent]['time'] = time.time()
        job = data['id']
        data = dict(status=data['status'], data=data['data'])
        forward(agent, job, data)
        return dict(status='ok')
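For reference, a hypothetical 'register' payload that satisfies the assertion above (every input and cleanse must also appear in outputs); the field values are purely illustrative:

payload = {
    'agent': 'scraper-1',
    'inputs': ['name', 'email'],
    'cleanses': ['email'],
    'outputs': ['name', 'email', 'phone'],
}
# handle_agent_action(sock, None, 'register', payload)
# would record the agent and return {'status': 'ok', 'agent': 'scraper-1'}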
Example #4
def search_page(company):
    feeders = get_company_feeders(company).values()
    feeder_names = [feeder['group'].partition('-')[2] for feeder in feeders]
    agents = [agent for feeder in feeders for agent in feeder['agent2data'].keys()]
    inputs = [inp for feeder in feeders for agent_data in feeder['agent2data'].values() for inp in agent_data['inputs']]
    inputs = uniq(inputs)
    job = None
    reqvalues = {k: v for k, v in request.values.items()}  # copy request.values into a plain dict
    msg = reqvalues.get('msg')
    job = reqvalues.get('job')
    if msg:
        pass
    elif job:
        pass
    elif reqvalues:
        job = create_job()
        print(reqvalues)
        data = dict(action='find', job=job, data=dict(reqvalues))
        print('sending search:', data)
        send2company(company, data)
    return render_template('search.html', current_user=current_user, company=company, feeders=feeder_names, agents=agents, inputs=inputs, job=job, reqvalues=reqvalues, usrmsg=msg)
Example #5
 def files(self, extension = '*', newer = True):
     """Returns a list of existing files matching our basenames and the given extension.
     
     First the files basename + extension are returned,
     then basename + '-[0-9]+' + extension,
     then basename + '-.+' + extension.
     
     If newer is True (the default), only files that are newer than the jobfile() are returned.
     
     """
     jobfile = self.jobfile()
     if jobfile:
         files = util.files(self.basenames(), extension)
         if newer:
             try:
                 mtime = os.path.getmtime(jobfile)
                 files = filter(lambda fname: os.path.getmtime(fname) >= mtime, files)
             except (OSError, IOError):
                 pass
         return list(util.uniq(files))
     return []
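A usage sketch, assuming job is an instance of the surrounding class and that its basenames() include 'score' (the filenames are illustrative):

pdfs = job.files('.pdf')                   # only PDFs newer than the jobfile
all_pdfs = job.files('.pdf', newer=False)  # e.g. ['score.pdf', 'score-1.pdf', 'score-full.pdf']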
Example #6
    def trainDT(self, X, Y, maxDepth, criterion, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape  ### N and D are number of rows and columns of X data matrix

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label = util.mode(Y)  # the most common class is the leaf's label

        else:
            if criterion == 'ig':  # information gain
                # compute the entropy at this node
                def entropy(y):
                    # binary entropy (base 2) of a vector of +1/-1 labels
                    P = np.count_nonzero(y == 1)
                    N = np.count_nonzero(y == -1)
                    S = N + P
                    a = -(P / S) * math.log(P / S, 2) if P > 0 else 0
                    b = -(N / S) * math.log(N / S, 2) if N > 0 else 0
                    return a + b

                self.entropy = entropy(Y)

            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error -- split feature

            # use error stats or gain stats (not both) depending on criterion

            # initialize error stats
            bestError = np.finfo('d').max  # largest double, so any error improves on it

            # initialize gain stats
            bestGain = np.finfo('d').min  # smallest double, so any gain improves on it

            for d in range(D):  # d indexes the candidate feature
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels would go
                # left and right?  feature values at most 0.5 go left,
                # larger values go right (X[:, d] slices out column d)
                leftY = Y[X[:, d] <= 0.5]
                rightY = Y[X[:, d] > 0.5]

                # misclassification rate
                if criterion == 'mr':
                    # we'll classify the left points as their most common
                    # class and ditto right points; the error is how many
                    # points disagree with their side's mode
                    leftMode = util.mode(leftY)    # most common class on the left (+1 or -1)
                    rightMode = util.mode(rightY)  # most common class on the right (+1 or -1)

                    error_lefttree = len(leftY[leftY != leftMode])
                    error_righttree = len(rightY[rightY != rightMode])
                    error = error_lefttree + error_righttree

                    # update min, max, bestFeature
                    if error <= bestError:
                        bestFeature = d
                        bestError = error

                # information gain
                elif criterion == 'ig':
                    # weight each child's entropy by the fraction of points
                    # it receives (labels are +1/-1, so every entry is nonzero)
                    Total = len(Y)
                    N1 = len(leftY)   # points going left
                    P1 = len(rightY)  # points going right
                    gain = entropy(Y) - (N1 / Total) * entropy(leftY) \
                                      - (P1 / Total) * entropy(rightY)

                    # update min, max, bestFeature
                    if gain >= bestGain:
                        bestFeature = d
                        bestGain = gain

            self.gain = bestGain  # information gain corresponding to this split
            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False
                self.feature = bestFeature

                self.left = DT({'maxDepth': maxDepth - 1, 'criterion': criterion})   # left subtree
                self.right = DT({'maxDepth': maxDepth - 1, 'criterion': criterion})  # right subtree
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                # divide the rows between the two subtrees
                used.append(bestFeature)  # so this feature does not get used again
                Y_left = Y[X[:, bestFeature] <= 0.5]
                Y_right = Y[X[:, bestFeature] > 0.5]
                X_left = X[X[:, bestFeature] <= 0.5, :]
                X_right = X[X[:, bestFeature] > 0.5, :]

                self.left.trainDT(X_left, Y_left, maxDepth - 1, criterion,
                                  used)
                self.right.trainDT(X_right, Y_right, maxDepth - 1, criterion,
                                   used)
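To make the 'ig' branch concrete, here is a standalone worked example of the same entropy and information-gain formulas on a toy +1/-1 label vector (entropy is redefined here so the snippet runs on its own):

import math
import numpy as np

def entropy(y):
    P = np.count_nonzero(y == 1)
    N = np.count_nonzero(y == -1)
    S = N + P
    a = -(P / S) * math.log(P / S, 2) if P > 0 else 0
    b = -(N / S) * math.log(N / S, 2) if N > 0 else 0
    return a + b

Y = np.array([1, 1, 1, -1])        # 3 positives, 1 negative
leftY, rightY = Y[:2], Y[2:]       # a candidate split, half the points each way
gain = entropy(Y) - 0.5 * entropy(leftY) - 0.5 * entropy(rightY)
print(round(entropy(Y), 3))        # 0.811
print(round(gain, 3))              # 0.811 - 0.5*0.0 - 0.5*1.0 = 0.311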
Example #7
 def from_corpus(cls, corpus, unk=None):
     vocab = util.uniq(reduce(lambda x, y : x + y, corpus))
     return cls(vocab, unk)
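A small illustration of what this classmethod computes. On Python 3, reduce must be imported from functools; util.uniq is assumed to drop duplicates while preserving first-seen order:

from functools import reduce

corpus = [['the', 'cat'], ['the', 'dog']]
flat = reduce(lambda x, y: x + y, corpus)  # ['the', 'cat', 'the', 'dog']
# util.uniq(flat) -> ['the', 'cat', 'dog'], which becomes the vocab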
Example #8
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

            self.label = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                rightY = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is how many are not their mode.
                error = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                self.feature = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                ### TODO: YOUR CODE HERE
                util.raiseNotDefined()
Example #9
    def trainDT(self, X, Y, maxDepth, criterion, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

            self.label = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

        else:
            if criterion == 'ig':  # information gain
                # compute the entropy at this node
                ### TODO: YOUR CODE HERE
                self.entropy = util.raiseNotDefined()

            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error

            # use error stats or gain stats (not both) depending on criterion

            # initialize error stats
            bestError = np.finfo('d').max

            # initialize gain stats
            bestGain = np.finfo('d').min

            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                rightY = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                # misclassification rate
                if criterion == 'mr':
                    # we'll classify the left points as their most
                    # common class and ditto right points.  our error
                    # is how many are not their mode.
                    error = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                    # update min, max, bestFeature
                    if error <= bestError:
                        bestFeature = d
                        bestError = error

                # information gain
                elif criterion == 'ig':
                    # now use information gain
                    gain = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                    # update min, max, bestFeature
                    if gain >= bestGain:
                        bestFeature = d
                        bestGain = gain

            self.gain = bestGain  # information gain corresponding to this split
            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                self.feature = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                self.left = DT({
                    'maxDepth': maxDepth - 1,
                    'criterion': criterion
                })
                self.right = DT({
                    'maxDepth': maxDepth - 1,
                    'criterion': criterion
                })
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                ### TODO: YOUR CODE HERE
                util.raiseNotDefined()
Example #10
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True  # this node is a leaf

            self.label = util.mode(Y)  # and returns the mode of the labels

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature

            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = Y[X[:, d] < 0.5]    # labels whose feature value is below 0.5
                rightY = Y[X[:, d] >= 0.5]  # labels whose feature value is at least 0.5

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is how many are not their mode.
                error = size(nonzero(leftY != util.mode(leftY))) + size(
                    nonzero(rightY != util.mode(rightY)))

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False  # not a leaf: this node splits
                self.feature = bestFeature  # on its own feature

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments

                # define X and Y for the left and right subtrees and recurse;
                # extending `used` with a new list prevents infinite recursion
                used = used + [self.feature]

                leftX = X[X[:, self.feature] < 0.5]
                rightX = X[X[:, self.feature] >= 0.5]

                leftY = Y[X[:, self.feature] < 0.5]
                rightY = Y[X[:, self.feature] >= 0.5]

                self.left.trainDT(leftX, leftY, maxDepth - 1, used)
                self.right.trainDT(rightX, rightY, maxDepth - 1, used)
Example #11
	def hosts(self):
		return sorted(util.uniq([row.host for row in self]))
Example #12
	def pools(self):
		return sorted(util.uniq([row.cell for row in self]))
Example #13
	def trainDT(self, X, Y, maxDepth, used):
		"""
		recursively build the decision tree
		"""

		# get the size of the data set
		N,D = X.shape

		# check to see if we're either out of depth or no longer
		# have any decisions to make
		if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
			# we'd better end at this point.  need to figure
			# out the label to return
			self.isLeaf = True
			self.label  = util.mode(Y)


		else:
			# we need to find a feature to split on
			bestFeature = -1	 # which feature has lowest error
			bestError   = N	  # the number of errors for this feature
			for d in range(D):
				# have we used this feature yet
				if d in used:
					continue
					
				# suppose we split on this feature; what labels
				# would go left and right?  (features are 0/1 here,
				# so the test is <= 0 versus > 0)
				left_vy  = Y[X[:,d]<=0]
				right_vy = Y[X[:,d]>0]

				# we'll classify the left points as their most
				# common class and ditto right points.  our error
				# is how many are not their mode.
				leftY  = util.mode( left_vy )
				rightY = util.mode( right_vy )
				error = len( left_vy[ left_vy!=leftY ] ) + len( right_vy[ right_vy!=rightY ] )


				# check to see if this is a better error rate
				if error <= bestError:
					bestFeature = d
					bestError   = error

			if bestFeature < 0:
				# this shouldn't happen, but just in case...
				self.isLeaf = True
				self.label  = util.mode(Y)

			else:
				self.isLeaf  = False
				self.feature = bestFeature

				# update used features and prepare arrays to pass child nodes.
				used.append( bestFeature )
				right_used = deepcopy( used )
				left_used  = deepcopy( used )

				# set split training data according to the feature selected.
				left_vx = X[X[:,self.feature]<=0]
				left_vy = Y[X[:,self.feature]<=0]
				right_vx = X[X[:,self.feature]>0]
				right_vy = Y[X[:,self.feature]>0]

				self.left  = DT({'maxDepth': maxDepth-1})
				self.right = DT({'maxDepth': maxDepth-1})
				# recurse on our children by calling
				#   self.left.trainDT(...) 
				# and
				#   self.right.trainDT(...) 
				# with appropriate arguments
				self.left.trainDT(left_vx, left_vy, maxDepth-1, left_used)
				self.right.trainDT(right_vx, right_vy, maxDepth-1, right_used)
Example #14
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            # get the size of the data set
            N, D = X.shape

            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # put negative values on the left and positive on the right
                negInd = [i for i, x in enumerate(X) if x[d] < 0.5]   # indices
                negVal = [x for i, x in enumerate(X) if x[d] < 0.5]   # entire x rows
                posInd = [i for i, x in enumerate(X) if x[d] >= 0.5]  # indices
                posVal = [x for i, x in enumerate(X) if x[d] >= 0.5]  # entire x rows
                leftY = [Y[i] for i in negInd]
                rightY = [Y[i] for i in posInd]

                #calculating guesses
                left_guess = 0
                if len(leftY) != 0:
                    if np.mean(leftY) >= 0:
                        left_guess = 1
                    else:
                        left_guess = -1
                right_guess = 0
                if len(rightY) != 0:
                    if np.mean(rightY) >= 0:
                        right_guess = 1
                    else:
                        right_guess = -1

                # calculating error by looking at mislabeled points
                num_errors = 0.0
                for y in leftY:
                    if y != left_guess:
                        num_errors = num_errors + 1
                for y in rightY:
                    if y != right_guess:
                        num_errors = num_errors + 1
                error = num_errors / N

                # check to see if this is a better error rate
                if error <= bestError:
                    permNeg = array(negVal)   # remember the best split's data
                    permPos = array(posVal)
                    permLeft = array(leftY)
                    permRight = array(rightY)
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False

                self.feature = bestFeature

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                used.append(bestFeature)
                self.left.trainDT(permNeg, permLeft, maxDepth - 1, used)
                self.right.trainDT(permPos, permRight, maxDepth - 1, used)
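The index-collecting list comprehensions above can also be written with numpy boolean masks; a behavior-equivalent sketch (assuming X and Y are numpy arrays):

mask = X[:, d] < 0.5
negVal, posVal = X[mask], X[~mask]  # rows going left / right
leftY, rightY = Y[mask], Y[~mask]   # and their labels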
Example #15
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        if len(X) <= 0:
            N = D = 0
        else:
            N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                counterno = util.Counter()
                counteryes = util.Counter()
                for i, x in enumerate(X):
                    if x[d] < 0.5:
                        counterno['NO' if Y[i] < 0 else 'YES'] += 1
                    else:
                        counteryes['NO' if Y[i] < 0 else 'YES'] += 1
                leftY = 1 if counterno['YES'] >= counterno['NO'] else -1     # majority guess, left
                rightY = 1 if counteryes['YES'] >= counteryes['NO'] else -1  # majority guess, right

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is how many are not their mode.
                error = counterno['YES' if counterno['YES'] < counterno['NO'] else 'NO'] +\
                        counteryes['YES' if counteryes['YES'] < counteryes['NO'] else 'NO']

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False
                self.feature = bestFeature
                new_used = used[:]
                new_used.append(bestFeature)

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                nos = [[], []]
                yess = [[], []]
                for i, x in enumerate(X):
                    if x[bestFeature] < 0.5:
                        nos[0].append(x)
                        nos[1].append(Y[i])
                    else:
                        yess[0].append(x)
                        yess[1].append(Y[i])

                self.left.trainDT(array(nos[0]), nos[1], maxDepth - 1,
                                  new_used)
                self.right.trainDT(array(yess[0]), yess[1], maxDepth - 1,
                                   new_used)
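This variant leans on util.Counter returning 0 for missing keys. The util module is not shown in these examples, but a minimal stand-in with that behavior might look like:

class Counter(dict):
    """A dict whose missing keys read as 0, so `c['YES'] += 1` just works."""
    def __getitem__(self, key):
        return dict.get(self, key, 0)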
Example #16
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True

            self.label = util.mode(Y)  # do not have to make any decision

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = Y[X[:, d] < 0.5]   # left takes feature values below 0.5
                rightY = Y[X[:, d] >= 0.5]

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is how many are not their mode.
                error = size((leftY != util.mode(leftY)).nonzero()) + size(
                    (rightY != util.mode(rightY)).nonzero())

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False

                self.feature = bestFeature

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                self.left.trainDT(X[X[:, self.feature] < 0.5],
                                  Y[X[:, self.feature] < 0.5],
                                  self.left.opts['maxDepth'],
                                  used + [self.feature])
                self.right.trainDT(X[X[:, self.feature] >= 0.5],
                                   Y[X[:, self.feature] >= 0.5],
                                   self.right.opts['maxDepth'],
                                   used + [self.feature])
                # For Chi-square pruning
                self.split = array(
                    [[
                        size((Y[X[:, self.feature] < 0.5] == 1).nonzero()),
                        size((Y[X[:, self.feature] < 0.5] == -1).nonzero())
                    ],
                     [
                         size((Y[X[:, self.feature] >= 0.5] == 1).nonzero()),
                         size((Y[X[:, self.feature] >= 0.5] == -1).nonzero())
                     ]])
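self.split stores a 2x2 (branch x class) contingency table for the chi-square pruning mentioned in the comment. As one hedged sketch (scipy is not part of the original code), such a table could be tested like this:

from scipy.stats import chi2_contingency

split = [[30, 10],   # left branch:  30 positive, 10 negative
         [5, 25]]    # right branch:  5 positive, 25 negative
chi2, p, dof, expected = chi2_contingency(split)
keep_split = p < 0.05  # prune when the branch/class association is not significant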
Example #17
File: dt.py Project: Ruhjkg/ciml
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

            self.label = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                rightY = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is how many are not their mode.
                error = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                self.feature = util.raiseNotDefined()  ### TODO: YOUR CODE HERE

                self.left = DT({"maxDepth": maxDepth - 1})
                self.right = DT({"maxDepth": maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                ### TODO: YOUR CODE HERE
                util.raiseNotDefined()
Example #18
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """
        # get the size of the data set
        N, D = X.shape
        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?

                leftY = Y[X[:, d] < 0.5]
                rightY = Y[X[:, d] >= 0.5]
                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is how many are not their mode.
                error = size((leftY != util.mode(leftY)).nonzero()) + size(
                    (rightY != util.mode(rightY)).nonzero())

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False
                self.feature = bestFeature

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                leftD = X[X[:, self.feature] < 0.5]
                rightD = X[X[:, self.feature] >= 0.5]
                # redefine labels with the best feature
                leftY = Y[X[:, self.feature] < 0.5]
                rightY = Y[X[:, self.feature] >= 0.5]
                used = used + [self.feature]
                # print "best feature found is ", self.feature
                # print "updated used:", used, " maxDepth:", self.left.opts['maxDepth']
                # print "leftY:", leftY
                # print "rightY:", rightY
                #                pdb.set_trace()
                self.left.trainDT(leftD, leftY, self.left.opts['maxDepth'],
                                  used)

                self.right.trainDT(rightD, rightY, self.right.opts['maxDepth'],
                                   used)
Example #19
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N, D = X.shape
        bestLeftY = []
        bestRightY = []

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return

            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature

            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue

                # suppose we split on this feature; what labels
                # would go left and right?
                leftY = Y[X[:, d] < 0.5]
                rightY = Y[X[:, d] >= 0.5]


                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is how many are not their mode.
                error = self.sizeWithoutK(util.mode(leftY), leftY) + \
                    self.sizeWithoutK(util.mode(rightY), rightY)

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error
                    bestLeftY = leftY
                    bestRightY = rightY

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False

                self.feature = bestFeature

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments
                used.append(self.feature)

                self.left.trainDT(X[X[:, self.feature] < 0.5, :], bestLeftY,
                                  maxDepth - 1, used)
                self.right.trainDT(X[X[:, self.feature] >= 0.5, :], bestRightY,
                                   maxDepth - 1, used)
Example #20
def gen_phases(start, end, minn=0, maxx=4):
    for phase in ['{:0>5}'.format(str(x)) for x in range(start, end)]:
        if len([ch for ch in phase if not between(int(ch), minn, maxx)]) == 0:
            if util.uniq(phase):
                yield phase
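gen_phases depends on two helpers not shown here: between, presumably an inclusive bounds check, and util.uniq, which in this call appears to act as an all-characters-distinct test. Under those assumptions, a driving sketch:

def between(x, lo, hi):
    # assumed semantics: inclusive range check
    return lo <= x <= hi

# '01234' zero-pads to five digits, all within [0, 4] and all distinct:
print(list(gen_phases(1234, 1235)))  # ['01234'] under the assumptions above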
Example #21
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """

        # get the size of the data set
        N,D = X.shape
          

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True

            self.label  = util.mode(Y)


        else:
            # we need to find a feature to split on
            bestFeature = -1     # which feature has lowest error
            bestError   = N      # the number of errors for this feature           
            leftY = []
            rightY = []
            for d in range(D):
                # have we used this feature yet
                if d in used:
                    continue
                countLeftpos = 0
                countLeftneg = 0
                countRightpos = 0
                countRightneg = 0
                errorLeft = 0
                errorRight = 0

                # suppose we split on this feature; what labels
                # would go left and right?
                            
                for i in range (N):
                    if X[i,d] < 0.5 and Y[i] == 1:
                        countLeftpos += 1
                    if X[i,d] < 0.5 and Y[i] == -1:
                        countLeftneg += 1
                    if X[i,d] >= 0.5 and Y[i] == 1:
                        countRightpos += 1
                    if X[i,d] >= 0.5 and Y[i] == -1:
                        countRightneg += 1
                        
                # each side's error is the count of its minority class
                errorLeft = min(countLeftpos, countLeftneg)
                errorRight = min(countRightpos, countRightneg)


                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is how many are not their mode.
                error = errorLeft + errorRight


                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError   = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label  = util.mode(Y)

            else:
                self.isLeaf  = False

                self.feature = bestFeature
                used.append(bestFeature)

                leftX = X[X[:,bestFeature] < 0.5, :]
                rightX = X[X[:,bestFeature] >= 0.5, :]
                
                for i in range (N):
                    if X[i,bestFeature] < 0.5:
                        leftY.append(Y[i])
                    else:
                        rightY.append(Y[i])

                self.left  = DT({'maxDepth': maxDepth-1})
                self.right = DT({'maxDepth': maxDepth-1})
                self.left.trainDT(leftX, leftY, maxDepth-1, used)
                self.right.trainDT(rightX, rightY, maxDepth-1, used)
Example #22
    def trainDT(self, X, Y, maxDepth, used):
        """
        recursively build the decision tree
        """
        # get the size of the data set
        N, D = X.shape

        # check to see if we're either out of depth or no longer
        # have any decisions to make
        if maxDepth <= 0 or len(util.uniq(Y)) <= 1:
            # we'd better end at this point.  need to figure
            # out the label to return
            self.isLeaf = True
            self.label = util.mode(Y)

        else:
            # we need to find a feature to split on
            bestFeature = -1  # which feature has lowest error
            bestError = N  # the number of errors for this feature
            for d in range(D):  # Index
                # have we used this feature yet
                if d in used:
                    continue

                xFiltered = X[:, d]
                leftY = Y[xFiltered < 0.5]
                rightY = Y[xFiltered >= 0.5]
                # we'll classify the left points as their most
                # common class and ditto right points.  our error
                # is how many are not their mode.
                error = size((leftY != util.mode(leftY)).nonzero()) + size(
                    (rightY != util.mode(rightY)).nonzero())

                # check to see if this is a better error rate
                if error <= bestError:
                    bestFeature = d
                    bestError = error

            if bestFeature < 0:
                # this shouldn't happen, but just in case...
                self.isLeaf = True
                self.label = util.mode(Y)

            else:
                self.isLeaf = False
                self.feature = bestFeature  # the lowest-error feature found above

                self.left = DT({'maxDepth': maxDepth - 1})
                self.right = DT({'maxDepth': maxDepth - 1})
                # recurse on our children by calling
                #   self.left.trainDT(...)
                # and
                #   self.right.trainDT(...)
                # with appropriate arguments

                xFiltered = X[:, self.feature]
                leftY = Y[xFiltered < 0.5]
                rightY = Y[xFiltered >= 0.5]

                self.left.trainDT(X[X[:, self.feature] < 0.5], leftY,
                                  (maxDepth - 1), used + [self.feature])
                self.right.trainDT(X[X[:, self.feature] >= 0.5], rightY,
                                   (maxDepth - 1), used + [self.feature])
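All of these trainDT variants build the same node structure (isLeaf, label, feature, left, right), so classifying a new point is the same short walk in every case. A hedged sketch, since the actual predict method in these projects may differ:

def predictDT(node, x):
    # walk from the root: a feature value below 0.5 goes left, else right
    if node.isLeaf:
        return node.label
    child = node.left if x[node.feature] < 0.5 else node.right
    return predictDT(child, x)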