def predict(self, tupla, prediction=None, w=1):
    """Propagate a tuple down the tree and return per-class probability mass.

    tupla: mapping holding '<feature>.mean', '<feature>.std', '<feature>.l'
        and '<feature>.r' entries for every feature the tree splits on.
    prediction: per-class accumulator; when omitted or empty it is seeded
        with a 0.0 entry per class found in the training data.
    w: probability mass that reached this node.
    Returns a new dict (the accumulator passed in is never mutated).
    """
    # Root-node case: seed the accumulator. Using None instead of a shared
    # mutable {} default avoids the mutable-default-argument pitfall while
    # remaining backward compatible (callers passing {} behave the same).
    if not prediction:
        prediction = {c: 0.0 for c in self.data['class'].unique()}

    if self.is_leaf:
        # Deposit all mass that reached this leaf on the leaf's class.
        aux = deepcopy(prediction)
        aux[self.clase] += w
        return aux
    # NOTE(review): some edge cases may still be unchecked here, as they
    # are in get_menores / get_mayores (original author's note).
    else:
        feature_name = self.feat_name.replace('.mean', '')
        mean = tupla[feature_name + '.mean']
        std = tupla[feature_name + '.std']
        l = tupla[feature_name + '.l']
        r = tupla[feature_name + '.r']
        pivote = self.feat_value

        # Evaluate the truncated-gaussian CDF once instead of twice.
        cum_prob = pyRF_prob.cdf(pivote, mean, std, l, r)
        w_left = min(w * cum_prob, 1)
        w_right = min(w * (1 - cum_prob), 1)

        a = self.right.predict(tupla, prediction, w_right)
        b = self.left.predict(tupla, prediction, w_left)

        # Element-wise sum of the two subtree predictions.
        return {key: a[key] + b[key] for key in a}
def predict(self, tupla, prediction=None, w=1):
    """Propagate a tuple down the tree and return per-class probability mass.

    tupla: mapping holding '<feature>.mean', '<feature>.std', '<feature>.l'
        and '<feature>.r' entries for every feature the tree splits on.
    prediction: per-class accumulator; when omitted or empty it is seeded
        with a 0.0 entry per class found in the training data.
    w: probability mass that reached this node.
    Returns a new dict (the accumulator passed in is never mutated).
    """
    # Root-node case: seed the accumulator. None instead of a shared
    # mutable {} default avoids the mutable-default-argument pitfall.
    if not prediction:
        prediction = {c: 0.0 for c in self.data['class'].unique()}

    if self.is_leaf:
        # Deposit all mass that reached this leaf on the leaf's class.
        aux = deepcopy(prediction)
        aux[self.clase] += w
        return aux
    # NOTE(review): some edge cases may still be unchecked here, as they
    # are in get_menores / get_mayores (original author's note).
    else:
        # BUGFIX: feature_name was used below without ever being assigned,
        # raising NameError on every non-leaf node. Derive it from the
        # split feature as the sibling implementation does.
        feature_name = self.feat_name.replace('.mean', '')
        mean = tupla[feature_name + '.mean']
        std = tupla[feature_name + '.std']
        l = tupla[feature_name + '.l']
        r = tupla[feature_name + '.r']
        pivote = self.feat_value

        # Evaluate the truncated-gaussian CDF once instead of twice.
        cum_prob = pyRF_prob.cdf(pivote, mean, std, l, r)
        w_left = min(w * cum_prob, 1)
        w_right = min(w * (1 - cum_prob), 1)

        a = self.right.predict(tupla, prediction, w_right)
        b = self.left.predict(tupla, prediction, w_left)

        # Element-wise sum of the two subtree predictions.
        return {key: a[key] + b[key] for key in a}
def test_split_at_right_border(self):
    """A pivot placed exactly on the right bound must capture the full mass (1)."""
    cases = [
        (8, 5, 1, 2, 8),
        (5, 5, 1, 2, 5),
        (4, 5, 1, 2, 4),
    ]
    for pivote, mean, std, left, right in cases:
        self.assertEqual(pyRF_prob.cdf(pivote, mean, std, left, right), 1)
def test_split_at_left_border(self):
    """A pivot placed exactly on the left bound must capture no mass (0)."""
    # Signature reminder: pyRF_prob.cdf(pivote, mean, std, left_bound, right_bound)
    cases = [
        (2, 5, 1, 2, 8),
        (5, 5, 1, 5, 8),
        (6, 5, 1, 6, 8),
    ]
    for pivote, mean, std, left, right in cases:
        self.assertEqual(pyRF_prob.cdf(pivote, mean, std, left, right), 0)
def split_tuples_by_pivot(w_list, mean_list, std_list, left_bound_list, right_bound_list, class_list, pivote):
    """Divide a group of data according to a pivot.

    Operates along all the data, then returns two dictionaries with the
    total probability mass summed by class.

    Returns:
        menores: dictionary with the mass below the pivot, keyed by class
        mayores: dictionary with the mass above the pivot, keyed by class
    """
    # Clamp a value into [l, r]; note a NaN passes through unchanged,
    # matching the original behavior.
    clip = lambda x, l, r: l if x < l else r if x > r else x

    clases = set(class_list)
    menores = {c: 0.0 for c in clases}
    mayores = {c: 0.0 for c in clases}

    # Iterate the parallel lists with zip instead of an xrange index loop:
    # idiomatic, and portable to Python 3 (xrange is Python-2 only).
    rows = zip(w_list, mean_list, std_list, left_bound_list, right_bound_list, class_list)
    for w, mean, std, left, right, clase in rows:
        cum_prob = pyRF_prob.cdf(pivote, mean, std, left, right)
        cum_prob = clip(cum_prob, 0, 1)  # keep numeric noise inside [0, 1]
        menores[clase] += w * cum_prob
        mayores[clase] += w * (1 - cum_prob)
    return menores, mayores
def get_weight(self, tupla, pivote, feature_name, how):
    """Accumulated truncated-gaussian probability mass on one side of a cut.

    pivote: cut value.
    how: selects which side of the pivot the mass is computed for:
        -> 'mayor': probability of falling between pivote and the right bound
        -> 'menor': probability of falling between the left bound and pivote
    Returns the tuple, mutated in place: 'weight' is rescaled by the mass on
    the requested side, and the corresponding bound is moved to the pivot.
    """
    left_bound = tupla[feature_name + '.l']
    right_bound = tupla[feature_name + '.r']
    # The tuple's support already lies entirely on the requested side of
    # the pivot: nothing to trim, return it untouched.
    if left_bound >= pivote and how == 'mayor' or right_bound <= pivote and how == 'menor':
        return tupla
    else:
        w = tupla['weight']
        mean = tupla[feature_name + '.mean']
        std = tupla[feature_name + '.std']
        feature_mass = pyRF_prob.cdf(pivote, mean, std, left_bound, right_bound)
        # pyRF_prob.cdf can come back NaN; resolve it to full or zero mass
        # depending on which side of the support the pivot fell.
        if math.isnan(feature_mass):
            if pivote > right_bound:
                # Pivot beyond the right bound: everything is below it.
                if how == 'menor':
                    feature_mass = 1.0
                else:
                    feature_mass = 0.0
            else:
                # Pivot below the left bound: nothing is below it.
                if how == 'menor':
                    feature_mass = 0.0
                else:
                    feature_mass = 1.0
        if how == 'menor':
            # Keep the tuple only if it retains a meaningful amount of mass.
            if (feature_mass >= self.min_mass_threshold):
                tupla['weight'] = min(w * feature_mass, 1)
            else:
                tupla['weight'] = 0
            # tupla[feature_name+'.r'] = min(pivote, tupla[feature_name + '.r'])
            tupla[feature_name + '.r'] = pivote
            return tupla
        elif how == 'mayor':
            # Mass above the pivot is the complement of the CDF value.
            feature_mass = 1 - feature_mass
            if (feature_mass >= self.min_mass_threshold):
                tupla['weight'] = min(w * feature_mass, 1)
            else:
                tupla['weight'] = 0
            # tupla[feature_name+'.l'] = max(pivote, tupla[feature_name + '.l'])
            tupla[feature_name + '.l'] = pivote
            return tupla
def predict(self, tupla, prediction=None, w=1.0):
    """Propagate a tuple down the tree and return per-class probability mass.

    tupla: mapping holding '<feature>.mean', '<feature>.std', '<feature>.l'
        and '<feature>.r' entries for every feature the tree splits on.
    prediction: per-class accumulator; when omitted or empty it is seeded
        with a 0.0 entry per class in self.classes.
    w: probability mass that reached this node.
    Returns a new dict (the accumulator passed in is never mutated).
    """
    # Root-node case: seed the accumulator. None instead of a shared
    # mutable {} default avoids the mutable-default-argument pitfall.
    if not prediction:
        prediction = {c: 0.0 for c in self.classes}

    if self.is_leaf:
        # Deposit all mass that reached this leaf on the leaf's class.
        aux = deepcopy(prediction)
        aux[self.clase] += w
        return aux
    # NOTE(review): some edge cases may still be unchecked here, as they
    # are in get_menores / get_mayores (original author's note).
    else:
        feature_name = self.feat_name.replace('.mean', '')
        mean = tupla[feature_name + '.mean']
        std = tupla[feature_name + '.std']
        l = tupla[feature_name + '.l']
        r = tupla[feature_name + '.r']
        pivote = self.feat_value

        aux_mass = pyRF_prob.cdf(pivote, mean, std, l, r)
        # pyRF_prob.cdf can come back NaN when the pivot lies outside the
        # truncated support; resolve to full or zero mass accordingly.
        # (Was flagged "MAL FIX" by the original author — kept, but the
        # leftover debug print statement has been removed.)
        if math.isnan(aux_mass):
            aux_mass = 1.0 if pivote > r else 0.0
        # Clamp numeric noise into a valid probability.
        aux_mass = min(1.0, max(0.0, aux_mass))

        w_left = w * aux_mass
        w_right = w * (1.0 - aux_mass)

        a = self.right.predict(tupla, prediction, w_right)
        b = self.left.predict(tupla, prediction, w_left)

        # Element-wise sum of the two subtree predictions.
        return {key: a[key] + b[key] for key in a}
def get_weight(self, tupla, pivote, feature_name, how):
    """Accumulated truncated-gaussian probability mass on one side of a cut.

    pivote: cut value.
    how: selects which side of the pivot the mass is computed for:
        'mayor' -> mass between pivote and the right bound,
        'menor' -> mass between the left bound and pivote.
    Returns the tuple, mutated in place: 'weight' is rescaled by the mass
    on the requested side, and the corresponding bound moves to the pivot.
    """
    left_bound = tupla[feature_name + '.l']
    right_bound = tupla[feature_name + '.r']
    # The tuple's support already lies entirely on the requested side of
    # the pivot: nothing to trim, return it untouched.
    if left_bound >= pivote and how == 'mayor' or right_bound <= pivote and how == 'menor':
        return tupla
    else:
        w = tupla['weight']
        mean = tupla[feature_name + '.mean']
        std = tupla[feature_name + '.std']
        feature_mass = pyRF_prob.cdf(pivote, mean, std, left_bound, right_bound)
        # ROBUSTNESS FIX: pyRF_prob.cdf can come back NaN when the pivot
        # lies outside the truncated support; previously that NaN was
        # multiplied straight into the weight. Resolve it to full or zero
        # mass depending on which side of the support the pivot fell.
        if math.isnan(feature_mass):
            if pivote > right_bound:
                feature_mass = 1.0 if how == 'menor' else 0.0
            else:
                feature_mass = 0.0 if how == 'menor' else 1.0
        if how == 'menor':
            # Keep the tuple only if it retains a meaningful amount of mass.
            if (feature_mass >= self.min_mass_threshold):
                tupla['weight'] = min(w * feature_mass, 1)
            else:
                tupla['weight'] = 0
            # tupla[feature_name+'.r'] = min(pivote, tupla[feature_name + '.r'])
            tupla[feature_name + '.r'] = pivote
            return tupla
        elif how == 'mayor':
            # Mass above the pivot is the complement of the CDF value.
            feature_mass = 1 - feature_mass
            if (feature_mass >= self.min_mass_threshold):
                tupla['weight'] = min(w * feature_mass, 1)
            else:
                tupla['weight'] = 0
            # tupla[feature_name+'.l'] = max(pivote, tupla[feature_name + '.l'])
            tupla[feature_name + '.l'] = pivote
            return tupla
def split_tuples_by_pivot(self, w_list, mean_list, std_list, left_bound_list, right_bound_list, class_list, pivote, menores, mayores):
    """Divide a group of data according to a pivot.

    Operates along all the data and accumulates the probability mass of each
    tuple into the per-class dictionaries passed in (mutated in place).

    Returns:
        menores: dictionary with the mass below the pivot, keyed by class
        mayores: dictionary with the mass above the pivot, keyed by class
    """
    # Clamp a value into [l, r]; note a NaN passes through unchanged,
    # matching the original behavior.
    clip = lambda x, l, r: l if x < l else r if x > r else x

    # Hot loop — the original note marks it as the key candidate for
    # parallelization. Iterate the parallel lists with zip instead of an
    # xrange index loop: idiomatic, and portable to Python 3.
    rows = zip(w_list, mean_list, std_list, left_bound_list, right_bound_list, class_list)
    for w, mean, std, left, right, clase in rows:
        cum_prob = pyRF_prob.cdf(pivote, mean, std, left, right)
        cum_prob = clip(cum_prob, 0, 1)  # keep numeric noise inside [0, 1]
        menores[clase] += w * cum_prob
        mayores[clase] += w * (1 - cum_prob)
    return menores, mayores