def generate_conditions(data, attr, cls, default_conditions=None):
    """Build the candidate split conditions for attribute ``attr``.

    Args:
        data: Mapping from column name to a sequence of values. The
            sequences for ``attr`` and ``cls`` are indexed in parallel.
        attr: Name of the column to build conditions for.
        cls: Name of the class-label column.
        default_conditions: Optional mapping from column name to a
            ready-made list of conditions. When it has an entry for
            ``attr`` that entry is returned unchanged. ``None`` (the
            default) means "no overrides".

    Returns:
        * ``[]`` when the ``attr`` column is empty;
        * the ``default_conditions`` entry for ``attr`` when there is one;
        * one equality ``Condition`` per distinct value when
          ``continuous`` reports the first value as not continuous;
        * otherwise a ``<=`` / ``>`` pair of ``Condition`` objects around
          the distinct value with the lowest size-weighted score from
          ``entropy_from_counter``.
    """
    values = data[attr]
    if not values:
        return []
    # ``None`` sentinel instead of a ``{}`` default: a mutable default is a
    # single object shared by every call of the function.
    if default_conditions is not None and attr in default_conditions:
        return default_conditions[attr]

    if not continuous(values[0]):
        return [
            Condition('=="%s"' % (value), partial((lambda v, x: x == v), value))
            for value in set(values)
        ]

    # Every distinct value is a candidate threshold; tally the class labels
    # that fall on each side of it.
    tallies = {
        base: {'<=': Counter(), '>': Counter()}
        for base in set(values)
    }
    labels = data[cls]
    for i, value in enumerate(values):
        cls_value = labels[i]
        for base, sides in tallies.items():
            side = '<=' if value <= base else '>'
            sides[side][cls_value] += 1

    # Score each threshold by the entropy of both sides, each weighted by
    # the share of rows on that side.
    count = float(len(values))
    scores = {}
    for base, sides in tallies.items():
        below, above = sides['<='], sides['>']
        weighted_below = sum(below.values()) * entropy_from_counter(below) / count
        weighted_above = sum(above.values()) * entropy_from_counter(above) / count
        scores[base] = weighted_below + weighted_above

    number = min(scores, key=scores.get)
    return [
        Condition('<=' + str(number), lambda x: x <= number),
        Condition('>' + str(number), lambda x: x > number),
    ]
def extract_comp(tup):
    """Return one two-argument comparison function per element of ``tup``.

    For an element that ``continuous`` reports as continuous the function
    is the plain difference ``x - y``. For any other element it is a
    mismatch indicator: ``0`` when the arguments are equal, ``1`` otherwise.
    The returned list is in the same order as ``tup``.
    """
    return [
        (lambda x, y: x - y)
        if continuous(element)
        else (lambda x, y: 0 if x == y else 1)
        for element in tup
    ]
def _make_range_scaler(lowest, highest):
    """Return a scaler that owns its own ``lowest`` / ``highest`` bounds."""
    def scale(x):
        # NOTE(review): divides by the maximum, not by (max - min), so this
        # is not classic min-max scaling and fails when the maximum is 0.
        # Kept as-is to preserve existing results; confirm the intent.
        return (float(x) - lowest) / highest
    return scale


def extract_normalizer(data, cls):
    """Build a per-attribute normalizing function from the training data.

    Args:
        data: Mapping from column name to a non-empty sequence of values.
        cls: Name of the class-label column, which is skipped.

    Returns:
        Dict mapping every column name except ``cls`` to a one-argument
        function. Columns whose first value ``continuous`` reports as not
        continuous get the identity function. Continuous columns get a
        function computing ``(float(x) - min) / max`` using that column's
        own minimum and maximum.
    """
    result = {}
    for attr, values in data.items():
        if attr == cls:
            continue
        if not continuous(values[0]):
            result[attr] = lambda x: x
        else:
            # Bind the bounds through a factory. A lambda that reads loop
            # variables from this scope would see the bounds of whichever
            # continuous column the loop visited last.
            result[attr] = _make_range_scaler(
                float(min(values)), float(max(values))
            )
    return result
def naive_bayes(data, cls, new_tup, adjust=False):
    """Pick the class label with the highest naive-Bayes score for ``new_tup``.

    Args:
        data: Mapping from column name to a sequence of values, including
            the class column ``cls``.
        cls: Name of the class-label column.
        new_tup: Mapping from attribute name to the observed value of the
            tuple being classified.
        adjust: When true, an attribute whose ``count_x_and_y`` result for
            the current class is falsy is given ``1.0`` instead of ``0.0``
            as the last argument of its probability helper (presumably a
            smoothing term -- confirm against ``p_continuous`` /
            ``p_categorical``).

    Returns:
        The class label whose prior multiplied by the product of the
        per-attribute helper results is largest.
    """
    labels = data[cls]
    scores = Counter()
    for label, label_count in Counter(labels).items():
        prior = float(label_count) / float(len(labels))

        # First pass: choose the probability helper for each attribute and
        # decide which attributes get the non-zero base value.
        bases = dict.fromkeys(new_tup, 0.0)
        estimators = {}
        for name, observed in new_tup.items():
            joint_count = count_x_and_y(data[name], labels, observed, label)
            if continuous(observed):
                estimators[name] = p_continuous
            else:
                estimators[name] = p_categorical
            if adjust and not joint_count:
                bases[name] = 1.0

        # Second pass: multiply the per-attribute results together.
        likelihood = 1.0
        for name, observed in new_tup.items():
            estimate = estimators[name]
            likelihood *= estimate(
                data[name], labels, observed, label, label_count, bases[name]
            )

        scores[label] = likelihood * prior
    return scores.most_common(1)[0][0]