def split(self, max_depth, depth):
    """Grow the subtree rooted at this node by one step, within a depth limit.

    Args:
        max_depth: Deepest level that may still be split. Any falsy value
            (``False``, ``0`` or ``None``) means "no limit".
        depth: Depth of this node.

    Behaviour:
        * A pure node (``impurity == 0``) or a node at the depth limit is
          only marked as no longer pending (``unsplit = False``).
        * A pending node is partitioned on ``best_split`` into ``low`` and
          ``high`` children.
        * A node that already has children forwards the call to every
          impure child, provided the limit allows going one level deeper.
    """
    # ``not max_depth`` also accepts None; the former ``max_depth == False``
    # test crashed on ``None > depth`` before it was ever reached.
    unlimited = not max_depth

    if self.impurity == 0 or not (unlimited or max_depth > depth):
        self.unsplit = False
        return

    if self.unsplit:
        # best_split is (feature name, threshold); 'x' is column 0 and any
        # other feature name is read from column 1.
        axis = 0 if self.best_split[0] == 'x' else 1
        threshold = self.best_split[1]

        low_points = []
        high_points = []
        for point in self.df.to_array():
            if point[axis] < threshold:
                low_points.append(point)
            elif point[axis] >= threshold:
                high_points.append(point)

        self.low = Node(
            DataFrame.from_array(low_points, self.df.columns),
            self.split_metric)
        self.high = Node(
            DataFrame.from_array(high_points, self.df.columns),
            self.split_metric)
        self.unsplit = False
    elif unlimited or max_depth > depth + 1:
        for child in (self.low, self.high):
            if child.impurity != 0:
                child.split(max_depth, depth + 1)
def fit(self, dataframe, dependent_variable):
    """Build the polynomial design table for a single-feature regression.

    The feature column is expanded into its powers ``1 .. self.degree`` and
    stored, together with the dependent column, in ``self.df``.

    Args:
        dataframe: Table exposing ``columns`` and ``to_array()``.
        dependent_variable: Name of the column to be predicted.

    Raises:
        ValueError: If ``dependent_variable`` is not one of the columns.

    Side effects:
        Sets ``self.first_variable``, ``self.dependent_variable`` and
        ``self.df``.
    """
    columns = list(dataframe.columns)
    # Locate both columns by name instead of assuming the layout
    # [feature, dependent]; the feature is the first column that is not
    # the dependent one.
    target_index = columns.index(dependent_variable)
    feature_index = 1 if target_index == 0 else 0

    self.first_variable = columns[feature_index]
    self.dependent_variable = dependent_variable

    if self.degree == 0:
        # Degree 0 has no feature terms at all.
        new_columns = [self.dependent_variable]
    else:
        new_columns = [self.first_variable]
        new_columns += [
            self.first_variable + '^' + str(power)
            for power in range(2, self.degree + 1)
        ]
        new_columns.append(self.dependent_variable)

    new_dataset = []
    for row in dataframe.to_array():
        powers = [
            row[feature_index]**power
            for power in range(1, self.degree + 1)
        ]
        powers.append(row[target_index])
        new_dataset.append(powers)

    self.df = DataFrame.from_array(new_dataset, new_columns)
def __init__(self,
             data_class,
             prediction_column,
             max_value,
             delta,
             constant=True):
    """Prepare the training tables and solve for the coefficients.

    Args:
        data_class: Source table (exposes ``to_array``, ``columns``,
            ``apply`` and ``append_columns``).
        prediction_column: Name of the column being predicted.
        max_value: Stored as ``self.max_val``.
        delta: Forwarded to ``self.set_bound_replacements`` for every value
            of the prediction column.
        constant: When true, a leading ``'constant'`` column of ones is
            added to the transformed data.

    Note:
        ``self.original_data`` always receives the ``'constant'`` column,
        independently of ``constant``. Three debug lines (``#0``, ``#1``,
        ``#2``) are printed.
    """
    super().__init__(data_class, prediction_column)
    self.prediction = prediction_column
    self.current_input = None
    self.max_val = max_value

    # Untransformed copy of the input, extended with a column of ones.
    self.original_data = DataFrame.from_array(data_class.to_array(),
                                              data_class.columns)
    print(f"#0{self.original_data.to_array()}")
    original_ones = [1] * len(data_class.to_array())
    self.original_data = self.original_data.append_columns(
        {'constant': original_ones}, ['constant'] + data_class.columns)

    # Transformed copy: every prediction value goes through
    # set_bound_replacements before the fit.
    self.data = data_class.apply(
        self.prediction_column,
        lambda x: self.set_bound_replacements(delta, x))
    if constant:
        data_ones = [1] * len(self.data.to_array())
        self.data = self.data.append_columns(
            {'constant': data_ones}, ['constant'] + self.data.columns)

    self.multipliers = self.solve_coefficients()
    print(f"#1{self.multipliers}")
    print(f"#2{self.original_data.to_array()}")
def nearest_neighbors(self, observation):
    """Return the distance table for ``observation`` in sorted order.

    Rows are removed one at a time from the distance table at the index
    chosen by ``self.sort_closest_cookie``; the removal order is then
    reversed to form the result.

    Args:
        observation: Passed unchanged to ``self.compute_distances``.

    Returns:
        A DataFrame with the columns ``'distance'`` and ``'Cookie Type'``.
    """
    remaining = self.compute_distances(observation).to_array()
    picked = []
    while remaining:
        # sort_closest_cookie is expected to return an index into `remaining`.
        chosen_index = self.sort_closest_cookie(remaining)
        picked.append(remaining.pop(chosen_index))
    picked.reverse()
    return DataFrame.from_array(picked, columns=['distance', 'Cookie Type'])
def calc_goodness(self, split, axis):
    """Return how much a candidate split reduces this node's impurity.

    The rows are partitioned on ``row[axis] < split``. Each partition's
    impurity, weighted by its share of this node's rows, is subtracted
    from ``self.impurity``.

    Args:
        split: Threshold value.
        axis: Column position the threshold is compared against.

    Returns:
        The impurity reduction rounded to 3 decimals.
    """
    partitions = ([], [])
    for row in self.df.to_array():
        value = row[axis]
        if value < split:
            partitions[0].append(row)
        elif value >= split:
            partitions[1].append(row)

    # Build both children (low first) before any weighting is done.
    children = [
        Node(DataFrame.from_array(rows, self.df.columns), self.split_metric)
        for rows in partitions
    ]

    goodness = self.impurity
    for child in children:
        share = len(child.row_indices) / len(self.row_indices)
        goodness -= share * child.impurity
    return round(goodness, 3)
def compute_distances(self, observation):
    """Compute the Euclidean distance from ``observation`` to every row.

    Args:
        observation: Mapping of column name to value; only these columns
            take part in the distance.

    Returns:
        A DataFrame with the columns ``'Distance'`` and ``'Cookie Type'``;
        the second column is the first entry of each stored row.
    """
    rows = self.dataframe.to_array()
    values_by_column = self.dataframe.data_dict

    table = []
    for index, row in enumerate(rows):
        squared_error = sum(
            [(observation[name] - values_by_column[name][index])**2
             for name in observation])
        table.append([squared_error**(0.5), row[0]])
    return DataFrame.from_array(table, ['Distance', 'Cookie Type'])
def split(self, if_once=False, depth_needed=None):
    """Split this node, or forward the request to its existing children.

    Args:
        if_once: When true, a freshly split node does not go on to split
            the children it just created.
        depth_needed: Nodes whose ``depth`` is not below this value are
            left untouched; ``None`` disables the limit.

    Returns:
        Always ``None``.
    """
    # Depth limit reached: nothing to do.
    if depth_needed is not None and not (self.depth < depth_needed):
        return

    # Already split: pass the request down to whichever children exist.
    if self.low is not None or self.high is not None:
        if self.low is not None:
            self.low.split(if_once, depth_needed=depth_needed)
        if self.high is not None:
            self.high.split(if_once, depth_needed=depth_needed)
        return

    if self.final_split is not False:
        return

    self.possible_splits = self.get_possible_splits()
    self.get_best_split()
    if self.best_split is None:
        return

    # Record the chosen split on the tree, grouped by depth (string key).
    self.tree.splits.setdefault(str(self.depth), []).append(self.best_split)

    column = self.best_split_index
    threshold = self.best_split[1]
    low = []
    high = []
    for entry in self.df.to_array():
        if entry[column] < threshold:
            low.append(entry)
        elif entry[column] >= threshold:
            high.append(entry)

    self.low = Node(DataFrame.from_array(low, self.df.columns),
                    self.split_metric, (self.depth + 1),
                    tree=self.tree)
    self.high = Node(DataFrame.from_array(high, self.df.columns),
                     self.split_metric, (self.depth + 1),
                     tree=self.tree)

    if not if_once:
        self.low.split(depth_needed=depth_needed)
        self.high.split(depth_needed=depth_needed)
def run_tests(training_set, testing_set, decision_tree, forest = False):
    """Fit a model on the training rows and score it on the testing rows.

    Args:
        training_set: Rows of ``[bmi, weight, class]``.
        testing_set: Rows of ``[bmi, weight, class]`` to be classified.
        decision_tree: Model exposing ``fit`` and either ``classify`` or,
            when ``forest`` is true, ``predict``.
        forest: Selects ``predict`` instead of ``classify``.

    Returns:
        A ``(correct, total)`` tuple: the number of matching predictions
        and the number of tested rows.
    """
    column_names = ['bmi', 'weight', 'class']
    decision_tree.fit(DataFrame.from_array(training_set, column_names))

    correct = 0
    for row in testing_set:
        features = {'bmi' : row[0], 'weight' : row[1]}
        if forest:
            predicted = decision_tree.predict(features)
        else:
            predicted = decision_tree.classify(features)
        if predicted == row[2]:
            correct += 1
    return correct, len(testing_set)
def compute_distances(self, observation):
    """Pair every stored row's distance to ``observation`` with its label.

    Args:
        observation: Passed to ``self.compute_distance`` together with the
            row's values, the prediction column excluded.

    Returns:
        A DataFrame with the columns ``'distance'`` and ``'Cookie Type'``,
        one row per stored row and in the stored order.
    """
    output_columns = ['distance', 'Cookie Type']

    # Materialise the table once; it used to be rebuilt for every row.
    rows = self.df.to_array()
    if not rows:
        return DataFrame.from_array([], columns=output_columns)

    # The label position is constant, so it is looked up a single time.
    label_index = self.df.columns.index(self.prediction_column)

    result = []
    for row in rows:
        features = [
            value for position, value in enumerate(row)
            if position != label_index
        ]
        distance = self.compute_distance(observation, features)
        result.append([distance, row[label_index]])
    return DataFrame.from_array(result, columns=output_columns)
def calc_goodness(self, split, axis_index):
    """Return the impurity reduction obtained by a candidate split.

    The rows are partitioned on ``row[axis_index] < split`` into two
    temporary child nodes (built with ``check_splits=False``). Each
    child's impurity, weighted by its share of this node's rows, is
    subtracted from ``self.impurity``.

    Args:
        split: Threshold value.
        axis_index: Column position the threshold is compared against.

    Returns:
        The unrounded impurity reduction.
    """
    below = []
    at_or_above = []
    for row in self.df.to_array():
        value = row[axis_index]
        if value < split:
            below.append(row)
        elif value >= split:
            at_or_above.append(row)

    low_node = Node(DataFrame.from_array(below, self.df.columns),
                    self.split_metric,
                    depth=int(self.depth) + 1,
                    check_splits=False,
                    tree=self.tree)
    high_node = Node(DataFrame.from_array(at_or_above, self.df.columns),
                     self.split_metric,
                     depth=(self.depth + 1),
                     check_splits=False,
                     tree=self.tree)

    goodness = self.impurity
    for child in (low_node, high_node):
        share = len(child.row_indices) / len(self.row_indices)
        goodness -= share * child.impurity
    return goodness
def calc_possible_splits(self):
    """List every candidate split on the 'x' and 'y' features.

    For each feature the distinct values are collected in order of first
    appearance, and the midpoint of every pair of consecutive distinct
    values becomes a candidate, scored with ``self.calc_goodness``.

    Returns:
        A DataFrame with the columns ``'feature'``, ``'value'`` and
        ``'goodness of split'``; all 'x' candidates precede the 'y' ones.
    """
    features = ('x', 'y')

    # Distinct values per feature, keeping first-appearance order.
    distinct = []
    for feature in features:
        seen = []
        for value in self.df.ordered_dict[feature]:
            if value not in seen:
                seen.append(value)
        distinct.append(seen)

    splits = []
    for axis_index, feature in enumerate(features):
        values = distinct[axis_index]
        for left, right in zip(values, values[1:]):
            midpoint = (left + right) / 2
            splits.append(
                [feature, midpoint,
                 self.calc_goodness(midpoint, axis_index)])
    return DataFrame.from_array(splits,
                                ['feature', 'value', 'goodness of split'])
def get_possible_splits(self):
    """Enumerate the candidate splits for this node.

    Every feature column (all columns except ``'class'`` and
    ``'indices'``) contributes the midpoints of consecutive entries of
    its ``self.distinct_values`` list, each scored with
    ``self.calc_goodness``.

    When ``self.split_metric`` is ``'random'``, one feature is drawn at
    random and only its candidates are kept.

    Returns:
        A DataFrame with the columns ``'axis'``, ``'split_value'`` and
        ``'goodness of split'``. With the ``'random'`` metric and no
        candidate at all, a plain empty list is returned instead.
    """
    feature_names = [
        name for name in self.df.columns
        if name != 'class' and name != 'indices'
    ]

    all_splits = []
    for position, values in enumerate(self.distinct_values):
        for index in range(len(values) - 1):
            split_value = (values[index] + values[index + 1]) / 2
            goodness = self.calc_goodness(split_value, position)
            all_splits.append(
                [feature_names[position], split_value, goodness])

    if self.split_metric == 'random':
        candidate_axes = list(set([split[0] for split in all_splits]))
        if len(candidate_axes) == 0:
            return []
        random_choice = random.choice(candidate_axes)
        all_splits = [
            split for split in all_splits if split[0] == random_choice
        ]

    return DataFrame.from_array(
        all_splits, ['axis', 'split_value', 'goodness of split'])
data_dict = {'Pete': [1, 0, 1, 0], 'John': [2, 1, 0, 2], 'Sarah': [3, 1, 4, 0]} df1 = DataFrame(data_dict, column_order=['Pete', 'John', 'Sarah']) df2 = df1.apply('John', lambda x: 7 * x) print('Testing method "apply"...') assert df2.data_dict == { 'Pete': [1, 0, 1, 0], 'John': [14, 7, 0, 14], 'Sarah': [3, 1, 4, 0] } print('PASSED') columns = ['firstname', 'lastname', 'age'] arr = [['Kevin', 'Fray', 5], ['Charles', 'Trapp', 17], ['Anna', 'Smith', 13], ['Sylvia', 'Mendez', 9]] df = DataFrame.from_array(arr, columns) print('Testing method "select_rows_where"...') assert df.where(lambda row: len(row['firstname']) >= len(row['lastname']) and row['age'] > 10).to_array() == [['Charles', 'Trapp', 17]] print('PASSED') print('Testing method "order_by"...') assert df.order_by('age', order="ASC").to_array() == [['Kevin', 'Fray', 5], ['Sylvia', 'Mendez', 9], ['Anna', 'Smith', 13], ['Charles', 'Trapp', 17]] assert df.order_by('firstname', order="DESC").to_array() == [['Sylvia', 'Mendez', 9], ['Kevin', 'Fray', 5],
import sys
sys.path.append('src')
from matrix import Matrix
from dataframe import DataFrame
from linear_regressor import LinearRegressor
from logistic_regressor import LogisticRegressor

# (x, y) samples to be fitted with a cubic polynomial.
dataset = [
    (0.0, 4.0), (0.2, 8.9), (0.4, 17.2), (0.6, 28.3), (0.8, 41.6),
    (1.0, 56.5), (1.2, 72.4), (1.4, 88.7), (1.6, 104.8), (1.8, 120.1),
    (2.0, 134.0), (2.2, 145.9), (2.4, 155.2), (2.6, 161.3), (2.8, 163.6),
    (3.0, 161.5), (3.2, 154.4), (3.4, 141.7), (3.6, 122.8), (3.8, 97.1),
    (4.0, 64.0), (4.2, 22.9), (4.4, -26.8), (4.6, -85.7), (4.8, -154.4),
]

# Expand x into x, x^2 and x^3 so a linear fit yields cubic coefficients.
new_columns = ['x', 'x^2', 'x^3', 'y']
new_dataset = [(x, x**2, x**3, y) for x, y in dataset]

df = DataFrame.from_array(new_dataset, new_columns)
polynomial_regressor = LinearRegressor(df, 'y')
polynomial_regressor_coefficients = polynomial_regressor.coefficients
print("polynomial_regressor_coefficients:",
      polynomial_regressor_coefficients)
import sys
sys.path.append('src')
from matrix import Matrix
from dataframe import DataFrame
from linear_regressor import LinearRegressor
import math
import matplotlib.pyplot as plt

# (x, y) samples to be fitted with sine/cosine terms.
df = DataFrame.from_array(
    [(0.0, 7.0), (0.2, 5.6), (0.4, 3.56), (0.6, 1.23), (0.8, -1.03),
     (1.0, -2.89), (1.2, -4.06), (1.4, -4.39), (1.6, -3.88), (1.8, -2.64),
     (2.0, -0.92), (2.2, 0.95), (2.4, 2.63), (2.6, 3.79), (2.8, 4.22),
     (3.0, 3.8), (3.2, 2.56), (3.4, 0.68), (3.6, -1.58), (3.8, -3.84),
     (4.0, -5.76), (4.2, -7.01), (4.4, -7.38), (4.6, -6.76), (4.8, -5.22)],
    columns=['x', 'y'])

# Add one derived column per trigonometric term, in this order.
trig_terms = [
    ('sin(x)', lambda x: math.sin(x)),
    ('cos(x)', lambda x: math.cos(x)),
    ('sin(2*x)', lambda x: math.sin(2 * x)),
    ('cos(2*x)', lambda x: math.cos(2 * x)),
]
for term_name, term_function in trig_terms:
    df = df.apply_add('x', term_function, term_name)

# Keep copies of the raw x and y values before x is dropped.
x_values = list(df.data_dict['x'])
y_values = list(df.data_dict['y'])

# The regression uses only the derived terms, so x itself is removed.
df = df.del_column('x')

# Solve for the coefficients of the trigonometric terms.
linear_regressor = LinearRegressor(df, dependent_variable='y')
# # [ 5, 0, 1, 0, 0, 5, 0, 0, 0, 0, 1, 8], # # [ 5, 0, 0, 1, 0, 0, 5, 0, 0, 0, 1, 1], # # [ 5, 0, 1, 1, 0, 5, 5, 0, 0, 1, 1, 0], # # [ 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 1, 5], # # [ 0, 5, 1, 0, 0, 0, 0, 5, 0, 0, 1, 0], # # [ 0, 5, 0, 1, 0, 0, 0, 0, 5, 0, 1, 9], # # [ 0, 5, 1, 1, 0, 0, 0, 5, 5, 1, 1, 0], # # [ 5, 5, 0, 0, 25, 0, 0, 0, 0, 0, 1, 0], # # [ 5, 5, 1, 0, 25, 5, 0, 5, 0, 0, 1, 0], # # [ 5, 5, 0, 1, 25, 0, 5, 0, 5, 0, 1, 0], # # [ 5, 5, 1, 1, 25, 5, 5, 5, 5, 1, 1, 0]] columns = ['firstname', 'lastname', 'age'] arr = [['Kevin', 'Fray', 5], ['Charles', 'Trapp', 17], ['Anna', 'Smith', 13], ['Sylvia', 'Mendez', 9]] df = DataFrame.from_array(arr, columns) print(df.to_array()) print(df.select_columns(['firstname', 'age']).to_array()) # [['Kevin', 5], # ['Charles', 17], # ['Anna', 13], # ['Sylvia', 9]] print(df.select_rows([1, 3]).to_array()) # [['Charles', 'Trapp', 17], # ['Sylvia', 'Mendez', 9]] print( df.select_rows_where(lambda row: len(row['firstname']) >= len(row[
(1.0, 56.5), (1.2, 72.4), (1.4, 88.7), (1.6, 104.8), (1.8, 120.1), (2.0, 134.0), (2.2, 145.9), (2.4, 155.2), (2.6, 161.3), (2.8, 163.6), (3.0, 161.5), (3.2, 154.4), (3.4, 141.7), (3.6, 122.8), (3.8, 97.1), (4.0, 64.0), (4.2, 22.9), (4.4, -26.8), (4.6, -85.7), (4.8, -154.4)] df = DataFrame.from_array(arr, ['x', 'y']) df = df.create_interaction_terms('x', 'x') df = df.create_interaction_terms('x * x', 'x') regressor = LinearRegressor(df, 'y') print(regressor.coefficients)
import matplotlib.pyplot as plt

list_data = [[1, 0], [2, 0], [3, 0], [2, 1], [3, 1], [4, 1]]
delta_table = [0.1, 0.01, 0.001, 0.0001]

# One [x values, predictions] pair per delta.
all_coords = []
for delta_low in delta_table:
    # A manual replacement of the 0/1 labels by delta / 1 - delta used to
    # live here; the regressor now receives delta directly.
    df = DataFrame.from_array(list_data, columns=['x', 'y'])
    regressor = LogisticRegressor(df,
                                  prediction_column='y',
                                  max_value=1,
                                  delta=delta_low)

    # NOTE(review): the plotted abscissa is x / 100 while the prediction
    # is evaluated at x itself; confirm that this is intended.
    coords = [[], []]
    for x in range(20):
        coords[0].append(x / 100)
        coords[1].append(regressor.predict({'constant': 1, 'x': x}))
    all_coords.append(coords)

print(all_coords)

plt.style.use('bmh')
for coords in all_coords:
    x_points, y_points = coords[0], coords[1]
    plt.plot(x_points, y_points, linewidth=2.5)
import sys sys.path.append('src') from dataframe import DataFrame sys.path.append('kaggle/titanic') from parse_line import parse_line df = DataFrame.from_array([['Kevin', 'Fray', 5], ['Charles', 'Trapp', 17], ['Anna', 'Smith', 13], ['Sylvia', 'Mendez', 9]], columns=['firstname', 'lastname', 'age']) assert df.query( "SELECT lastname, firstname, age ORDER BY age DESC").to_array() == [[ 'Trapp', 'Charles', 17 ], ['Smith', 'Anna', 13], ['Mendez', 'Sylvia', 9], ['Fray', 'Kevin', 5]] print("\npassed test 1") assert df.query("SELECT firstname ORDER BY lastname ASC").to_array() == [[ 'Kevin' ], ['Sylvia'], ['Anna'], ['Charles']] print("\npassed test 2") df = DataFrame.from_array( [['Kevin', 'Fray', 5], ['Melvin', 'Fray', 5], ['Charles', 'Trapp', 17], ['Carl', 'Trapp', 17], ['Anna', 'Smith', 13], ['Hannah', 'Smith', 13], ['Sylvia', 'Mendez', 9], ['Cynthia', 'Mendez', 9]], columns=['firstname', 'lastname', 'age']) assert df.query( "SELECT lastname, firstname, age ORDER BY age ASC, firstname DESC"
import math data = [(0.0, 7.0), (0.2, 5.6), (0.4, 3.56), (0.6, 1.23), (0.8, -1.03), (1.0, -2.89), (1.2, -4.06), (1.4, -4.39), (1.6, -3.88), (1.8, -2.64), (2.0, -0.92), (2.2, 0.95), (2.4, 2.63), (2.6, 3.79), (2.8, 4.22), (3.0, 3.8), (3.2, 2.56), (3.4, 0.68), (3.6, -1.58), (3.8, -3.84), (4.0, -5.76), (4.2, -7.01), (4.4, -7.38), (4.6, -6.76), (4.8, -5.22)] columns = ['y', 'sin(x)', 'cos(x)', 'sin(2x)', 'cos(2x)'] new_data = [[y, math.sin(x), math.cos(x), math.sin(2 * x), math.cos(2 * x)] for (x, y) in data] df = DataFrame.from_array(new_data, columns) regressor = LinearRegressor(df, 'y') print(regressor.coefficients) ''' import matplotlib.pyplot as plt plt.style.use('bmh') x_points = [] predicted_points = [] x = 0 while x <= 5 : data_dict = {'sin(x)' : math.sin(x), 'cos(x)' : math.cos(x), 'sin(2x)' : math.sin(2 * x), 'cos(2x)' : math.cos(2 * x)} x_points.append(x)
import sys
sys.path.append('src')
from polynomial_regressor import PolynomialRegressor
from dataframe import DataFrame

# (time, distance) measurements.
data = [(1, 3.1), (2, 10.17), (3, 20.93), (4, 38.71), (5, 60.91),
        (6, 98.87), (7, 113.92), (8, 146.95), (9, 190.09), (10, 232.65)]

# Degree-2 fit, then predictions at three times.
df = DataFrame.from_array(data, ['time', 'distance'])
quadratic_regressor = PolynomialRegressor(degree=2)
quadratic_regressor.fit(df, 'distance')
print('Quadratic Regressor:')
print(quadratic_regressor.coefficients)
for t in [5, 10, 200]:
    quadratic_prediction = quadratic_regressor.predict({'time': t})
    print(f'Distance after {t} seconds:', quadratic_prediction)

# Degree-3 fit on a freshly built table, same three predictions.
df = DataFrame.from_array(data, ['time', 'distance'])
cubic_regressor = PolynomialRegressor(degree=3)
cubic_regressor.fit(df, 'distance')
print('Cubic Regressor:')
print(cubic_regressor.coefficients)
for t in [5, 10, 200]:
    cubic_prediction = cubic_regressor.predict({'time': t})
    print(f'Distance after {t} seconds:', cubic_prediction)
import sys
sys.path.append('src')
from matrix import Matrix
from dataframe import DataFrame
from linear_regressor import LinearRegressor

# Sandwich ratings: [slices of roast beef, tbsp of peanut butter, rating].
sandwich_rows = [[0, 0, 1], [1, 0, 2], [2, 0, 4], [4, 0, 8], [6, 0, 9],
                 [0, 2, 2], [0, 4, 5], [0, 6, 7], [0, 8, 6]]
df = DataFrame.from_array(
    sandwich_rows,
    columns=['slices of roast beef', 'tbsp of peanut butter', 'rating'])

regressor = LinearRegressor(df, dependent_variable='rating')
print(regressor.coefficients)

# Predicted rating for roast beef alone ...
beef_only = {'slices of roast beef': 5, 'tbsp of peanut butter': 0}
print(regressor.predict(beef_only))

# ... and for roast beef combined with peanut butter.
beef_and_pb = {'slices of roast beef': 5, 'tbsp of peanut butter': 5}
print(regressor.predict(beef_and_pb))
import sys
sys.path.append('src')
from dataframe import DataFrame
from decision_tree import DecisionTree
from random_forest import RandomForest

# Labelled points: [x, y, class].
data = [
    [2, 13, 'B'], [2, 13, 'B'], [2, 13, 'B'],
    [2, 13, 'B'], [2, 13, 'B'], [2, 13, 'B'],
    [3, 13, 'B'], [3, 13, 'B'], [3, 13, 'B'],
    [3, 13, 'B'], [3, 13, 'B'], [3, 13, 'B'],
    [2, 12, 'B'], [2, 12, 'B'],
    [3, 12, 'A'], [3, 12, 'A'],
    [3, 11, 'A'], [3, 11, 'A'],
    [3, 11.5, 'A'], [3, 11.5, 'A'],
    [4, 11, 'A'], [4, 11, 'A'],
    [4, 11.5, 'A'], [4, 11.5, 'A'],
    [2, 10.5, 'A'], [2, 10.5, 'A'],
    [3, 10.5, 'B'],
    [4, 10.5, 'A'],
    [3, 9.5, 'A'],
    [2, 10, 'A'],
]
df = DataFrame.from_array(data, columns=['x', 'y', 'class'])

# RandomForest is constructed with 10 as its only argument, then fitted
# and asked about a single new point.
r = RandomForest(10)
r.fit(df)
query_point = {'x': 3, 'y': 10}
print(r.predict(query_point))
import sys
sys.path.append('src')
from matrix import Matrix
from dataframe import DataFrame
from linear_regressor import LinearRegressor
from logistic_regressor import LogisticRegressor

# Sandwich ratings: [beef, pb, rating, interactive].
sandwich_rows = [
    [0, 0, 1, 0], [1, 0, 2, 0], [2, 0, 4, 0], [4, 0, 8, 0], [6, 0, 9, 0],
    [0, 2, 2, 0], [0, 4, 5, 0], [0, 6, 7, 0], [0, 8, 6, 0],
    [2, 2, 0.1, 4], [3, 4, 0.1, 12],
]
df = DataFrame.from_array(
    sandwich_rows, columns=['beef', 'pb', 'rating', 'interactive'])

log_reg = LogisticRegressor(df, 10, dependent_variable='rating')

# Predicted ratings for three sandwiches, printed in this order.
for sandwich in (
        {'beef': 5, 'pb': 0, 'interactive': 0},
        {'beef': 12, 'pb': 0, 'interactive': 0},
        {'beef': 5, 'pb': 5, 'interactive': 25},
):
    print(log_reg.predict(sandwich))
df = DataFrame.from_array([[1,0.2], [2,0.25], [3,0.5]], columns = ['hours worked', 'progress']) regressor = LinearRegressor(df, dependent_variable='progress') print('Does all the linear_regressor stuff work') assert regressor.coefficients == [0.01667, 0.15], 'No, coefficients does not work' assert regressor.predict({'hours worked': 4}) == 0.61667, 'No, predict does not work' print('Yes they do', "\n") ''' df = DataFrame.from_array( [[0, 0, 0.1], [1, 0, 0.2], [0, 2, 0.5], [4, 5, 0.6]], columns=['scoops of chocolate', 'scoops of vanilla', 'taste rating']) regressor = LinearRegressor(df, dependent_variable='taste rating') print('Does all the linear_regressor stuff work') reg_coeff = regressor.coefficients.copy() for (key, value) in reg_coeff.items(): reg_coeff[key] = round(value, 8) assert reg_coeff == { 'constant': 0.19252336, 'scoops of chocolate': -0.05981308, 'scoops of vanilla': 0.13271028 }, 'No, coefficients does not work'
import sys
sys.path.append('src')
from k_nearest_neighbors_classifier import KNearestNeighborsClassifier
from dataframe import DataFrame

# Cookie recipes: type followed by the four ingredient portions.
cookie_columns = [
    'Cookie Type', 'Portion Eggs', 'Portion Butter', 'Portion Sugar',
    'Portion Flour'
]
cookie_rows = [
    ['Shortbread', 0.14, 0.14, 0.28, 0.44],
    ['Shortbread', 0.10, 0.18, 0.28, 0.44],
    ['Shortbread', 0.12, 0.10, 0.33, 0.45],
    ['Shortbread', 0.10, 0.25, 0.25, 0.40],
    ['Sugar', 0.00, 0.10, 0.40, 0.50],
    ['Sugar', 0.00, 0.20, 0.40, 0.40],
    ['Sugar', 0.10, 0.08, 0.35, 0.47],
    ['Sugar', 0.00, 0.05, 0.30, 0.65],
    ['Fortune', 0.20, 0.00, 0.40, 0.40],
    ['Fortune', 0.25, 0.10, 0.30, 0.35],
    ['Fortune', 0.22, 0.15, 0.50, 0.13],
    ['Fortune', 0.15, 0.20, 0.35, 0.30],
    ['Fortune', 0.22, 0.00, 0.40, 0.38],
]
df = DataFrame.from_array(cookie_rows, columns=cookie_columns)

knn = KNearestNeighborsClassifier(k=5)
knn.fit(df, dependent_variable='Cookie Type')

# Recipe of unknown type whose distances to every known recipe are shown.
observation = {
    'Portion Eggs': 0.10,
    'Portion Butter': 0.15,
    'Portion Sugar': 0.30,
    'Portion Flour': 0.45
}
distance_table = knn.compute_distances(observation)
print(distance_table.to_array())
# Returns a dataframe representation of the following array:
# for i in range(len(pos_neg)):
#     correct_class = pos_neg[i][3]
#     observation = into_new_observation(pos_neg[i])
#     prediction = r.predict(observation)
#     if prediction == correct_class:
#         correct += 1
# assert correct/len(pos_neg) * 100 == 100, 'WRONG ACCURACY BRUH'

# Class 'A': every lattice point of [-5, 5]^3 with no zero coordinate.
points = [[x, y, z, 'A']
          for z in range(-5, 6)
          for y in range(-5, 6)
          for x in range(-5, 6)
          if x * y * z != 0]

# Class 'B': the all-positive octant, added twice on purpose so that it
# outnumbers the 'A' points occupying the same coordinates.
for _ in range(2):
    points.extend([[x, y, z, 'B']
                   for z in range(1, 6)
                   for y in range(1, 6)
                   for x in range(1, 6)
                   if x * y * z != 0])

df = DataFrame.from_array(points, columns=['x', 'y', 'z', 'class'])
r = RandomForest(100, depth=None)
r.fit(df)

# Score the forest on its own training points.
correct = 0
for point in points:
    correct_class = point[3]
    observation = into_new_observation(point)
    prediction = r.predict(observation)
    if prediction == correct_class:
        correct += 1

assert correct / len(points) * 100 == 90, 'WRONG ACCURACY BRUH'
print('passed')
import sys
sys.path.append('src')
from matrix import Matrix
from dataframe import DataFrame
from linear_regressor import LinearRegressor
from logistic_regressor import LogisticRegressor

# Sandwich ratings: [beef, pb, condiments, rating].
sandwich_rows = [
    [0, 0, [], 1], [0, 0, ['mayo'], 1],
    [0, 0, ['jelly'], 4], [0, 0, ['mayo', 'jelly'], 0],
    [5, 0, [], 4], [5, 0, ['mayo'], 8],
    [5, 0, ['jelly'], 1], [5, 0, ['mayo', 'jelly'], 0],
    [0, 5, [], 5], [0, 5, ['mayo'], 0],
    [0, 5, ['jelly'], 9], [0, 5, ['mayo', 'jelly'], 0],
    [5, 5, [], 0], [5, 5, ['mayo'], 0],
    [5, 5, ['jelly'], 0], [5, 5, ['mayo', 'jelly'], 0],
]
df = DataFrame.from_array(sandwich_rows,
                          columns=['beef', 'pb', 'condiments', 'rating'])

# Turn the condiment lists into dummy columns, then add every pairwise
# interaction term in a fixed order.
df = df.create_dummy_variables('condiments')
interaction_pairs = [
    ('beef', 'pb'),
    ('beef', 'mayo'),
    ('beef', 'jelly'),
    ('pb', 'mayo'),
    ('pb', 'jelly'),
    ('mayo', 'jelly'),
]
for first_term, second_term in interaction_pairs:
    df = df.create_interaction_terms(first_term, second_term)

log_df = DataFrame(df.data_dict, df.columns)
logistic_regressor = LogisticRegressor(log_df,
                                       10,
                                       dependent_variable='rating')

# test 8 slices of beef + mayo
observation = {'beef': 8, 'mayo': 1}
predicted_rating = logistic_regressor.predict(observation)
assert round(predicted_rating, 2) == 9.72
import sys
sys.path.append('src')
from dataframe import DataFrame
from polynomial_regressor import PolynomialRegressor

df = DataFrame.from_array(
    [(0, 1), (1, 2), (2, 5), (3, 10), (4, 20), (5, 30)],
    columns=['x', 'y'])

# The values noted after each print were bare literals in the script;
# presumably they are the expected outputs (to be confirmed).

# Degree 0: constant model.
constant_regressor = PolynomialRegressor(degree=0)
constant_regressor.fit(df, dependent_variable='y')
print(constant_regressor.coefficients)
# {'constant': 11.3333}
print(constant_regressor.predict({'x': 2}))
# 11.3333

# Degree 1: straight line.
linear_regressor = PolynomialRegressor(degree=1)
linear_regressor.fit(df, dependent_variable='y')
print(linear_regressor.coefficients)
# {'constant': -3.2381, 'x': 5.8286}
print(linear_regressor.predict({'x': 2}))
# 8.4190

# Degree 2: parabola.
quadratic_regressor = PolynomialRegressor(degree=2)
quadratic_regressor.fit(df, dependent_variable='y')
print(quadratic_regressor.coefficients)
# {'constant': 1.1071, 'x': -0.6893, 'x^2': 1.3036}
# print('\nTesting root low high indices')
# assert dt.root.low.high.row_indices == [6]
# print('passed')
# print('\nTesting root low low impurity')
# assert dt.root.low.low.impurity == 0
# print('passed')
# print('\nTesting root low high impurity')
# assert dt.root.low.high.impurity == 0
# print('passed')

print('Splitting Tests')
labelled_points = [[1, 11, 'A'], [1, 12, 'A'], [2, 11, 'A'], [1, 13, 'B'],
                   [2, 13, 'B'], [3, 13, 'B'], [3, 11, 'B']]
df = DataFrame.from_array(labelled_points, columns=['x', 'y', 'class'])

# Two manual split() calls after initialize().
dt = DecisionTree(split_metric='gini')
dt.initialize(df)
for _ in range(2):
    dt.split()
root = dt.root
assert root.high.row_indices == [3, 4, 5]
assert root.low.low.row_indices == [0, 1, 2]
assert root.low.high.row_indices == [6]
print('passed')

# The same partition is checked after a single fit() call.
dt = DecisionTree(split_metric='gini')
dt.fit(df)
root = dt.root
assert root.high.row_indices == [3, 4, 5]
assert root.low.low.row_indices == [0, 1, 2]