Пример #1
0
 def split(self, max_depth, depth):
     """Split this node once, or forward the request to its children.

     An unsplit node is partitioned on ``self.best_split`` (index 0 names
     the feature, index 1 holds the threshold): rows below the threshold
     go to ``self.low``, rows at or above it go to ``self.high``.  A node
     that has already been split asks each impure child to split at
     ``depth + 1`` instead.

     Args:
         max_depth: Depth limit; a value equal to ``False`` disables it.
         depth: Depth of this node.
     """
     # Pure nodes, and nodes at the depth limit, are simply marked as done.
     if self.impurity == 0 or not (max_depth > depth or max_depth == False):
         self.unsplit = False
         return

     if not self.unsplit:
         # Already split here: recurse only while one more level is allowed.
         if max_depth > depth + 1 or max_depth == False:
             for child_name in ('low', 'high'):
                 child = getattr(self, child_name)
                 if child.impurity != 0:
                     child.split(max_depth, depth + 1)
         return

     # Feature 'x' lives in column 0; anything else is read from column 1.
     axis = 0 if self.best_split[0] == 'x' else 1
     rows = self.df.to_array()
     low_points = [row for row in rows if row[axis] < self.best_split[1]]
     high_points = [row for row in rows if row[axis] >= self.best_split[1]]
     self.low = Node(DataFrame.from_array(low_points, self.df.columns),
                     self.split_metric)
     self.high = Node(DataFrame.from_array(high_points, self.df.columns),
                      self.split_metric)
     self.unsplit = False
    def fit(self, dataframe, dependent_variable):
        """Build the polynomial-feature dataframe stored in ``self.df``.

        The first column of ``dataframe`` is taken as the single input
        variable.  Each row ``pair`` becomes
        ``[pair[0]**1, ..., pair[0]**degree, pair[1]]`` and the columns are
        named ``<first>``, ``<first>^2``, ..., followed by the dependent
        variable.

        Args:
            dataframe: Source data; ``columns`` and ``to_array()`` are used.
            dependent_variable: Name given to the last output column.
        """
        first = dataframe.columns[0]
        self.first_variable = first
        self.dependent_variable = dependent_variable
        degree = self.degree

        if degree == 0:
            # A constant model has no input columns at all.
            new_columns = [dependent_variable]
        else:
            higher_powers = [
                first + '^' + str(power) for power in range(2, degree + 1)
            ]
            new_columns = [first, *higher_powers, dependent_variable]

        new_dataset = [
            [pair[0]**power for power in range(1, degree + 1)] + [pair[1]]
            for pair in dataframe.to_array()
        ]

        self.df = DataFrame.from_array(new_dataset, new_columns)
Пример #3
0
 def __init__(self,
              data_class,
              prediction_column,
              max_value,
              delta,
              constant=True):
     """Prepare the training data and solve for the multipliers.

     Args:
         data_class: Source dataframe; ``to_array()``, ``columns`` and
             ``apply()`` are used.
         prediction_column: Name of the column being predicted.
         max_value: Stored as ``self.max_val``.
         delta: Forwarded to ``self.set_bound_replacements`` for every
             value of the prediction column.
         constant: When true, a ``'constant'`` column of ones is added to
             the transformed data, listed first in the column order.
     """
     super().__init__(data_class, prediction_column)
     self.prediction = prediction_column
     self.current_input = None
     self.max_val = max_value
     # Keep an untransformed copy of the input, rebuilt from its rows.
     self.original_data = DataFrame.from_array(data_class.to_array(),
                                               data_class.columns)
     # Debug output: rows of the untransformed copy.
     print("#0" + str(self.original_data.to_array()))
     # NOTE(review): the copy always gets a 'constant' column, even when
     # ``constant`` is False -- confirm this is intended.
     self.original_data = self.original_data.append_columns(
         {'constant': [1 for _ in range(len(data_class.to_array()))]},
         ['constant'] + data_class.columns)
     # NOTE(review): ``self.prediction_column`` is presumably set by the
     # superclass constructor -- it is not assigned in this method.
     self.data = data_class.apply(
         self.prediction_column,
         lambda x: self.set_bound_replacements(delta, x))
     if constant:
         self.data = self.data.append_columns(
             {'constant': [1 for _ in range(len(self.data.to_array()))]},
             ['constant'] + self.data.columns)
     self.multipliers = self.solve_coefficients()
     # Debug output: solved multipliers and the untransformed rows.
     print("#1" + str(self.multipliers))
     print("#2" + str(self.original_data.to_array()))
Пример #4
0
 def nearest_neighbors(self, observation):
     """Return the distance table for ``observation`` in ranked order.

     Rows are removed one at a time at the index chosen by
     ``self.sort_closest_cookie``; the removal order is then reversed and
     wrapped in a dataframe with columns ``distance`` and ``Cookie Type``.
     """
     remaining = self.compute_distances(observation).to_array()
     picked = []
     while remaining:
         chosen_index = self.sort_closest_cookie(remaining)
         picked.append(remaining.pop(chosen_index))
     picked.reverse()
     return DataFrame.from_array(picked,
                                 columns=['distance', 'Cookie Type'])
Пример #5
0
 def calc_goodness(self, split, axis):
     """Score a candidate split by how much weighted impurity it removes.

     Args:
         split: Threshold value; rows below it form the low child, rows
             at or above it form the high child.
         axis: Index of the row entry compared against ``split``.

     Returns:
         This node's impurity minus each child's impurity weighted by its
         share of this node's rows, rounded to three decimals.
     """
     rows = self.df.to_array()
     below = [row for row in rows if row[axis] < split]
     at_or_above = [row for row in rows if row[axis] >= split]
     children = [
         Node(DataFrame.from_array(part, self.df.columns), self.split_metric)
         for part in (below, at_or_above)
     ]

     goodness = self.impurity
     for child in children:
         share = len(child.row_indices) / len(self.row_indices)
         goodness -= share * child.impurity
     return round(goodness, 3)
 def compute_distances(self, observation):
     """Return each stored row's Euclidean distance to ``observation``.

     Args:
         observation: Mapping from column name to value; only these
             columns take part in the distance.

     Returns:
         A dataframe with columns ``Distance`` and ``Cookie Type``, where
         the second entry is the first value of the corresponding row.
     """
     rows = self.dataframe.to_array()
     column_values = self.dataframe.data_dict
     table = []
     for row_number, row in enumerate(rows):
         squared_gaps = [
             (observation[feature] - column_values[feature][row_number])**2
             for feature in observation
         ]
         table.append([sum(squared_gaps)**(0.5), row[0]])
     return DataFrame.from_array(table, ['Distance', 'Cookie Type'])
Пример #7
0
 def split(self, if_once=False, depth_needed=None):
     """Grow the tree below this node.

     A leaf picks its best split, records it in ``self.tree.splits`` under
     its depth, and creates ``self.low`` / ``self.high``.  A node that
     already has a child forwards the call to the children it has.

     Args:
         if_once: When true, a freshly split leaf does not split its new
             children in the same call.
         depth_needed: Nodes at this depth or deeper are left alone;
             ``None`` means no limit.
     """
     if depth_needed is not None and not self.depth < depth_needed:
         return

     # Interior node: hand the request down to whichever children exist.
     if self.low is not None or self.high is not None:
         for child_name in ('low', 'high'):
             child = getattr(self, child_name)
             if child is not None:
                 child.split(if_once, depth_needed=depth_needed)
         return

     if self.final_split is not False:
         return

     self.possible_splits = self.get_possible_splits()
     self.get_best_split()
     if self.best_split is None:
         return

     # Splits are grouped by the (stringified) depth they were made at.
     self.tree.splits.setdefault(str(self.depth), []).append(self.best_split)

     rows = self.df.to_array()
     low = [
         row for row in rows
         if row[self.best_split_index] < self.best_split[1]
     ]
     high = [
         row for row in rows
         if row[self.best_split_index] >= self.best_split[1]
     ]
     self.low = Node(DataFrame.from_array(low, self.df.columns),
                     self.split_metric, (self.depth + 1),
                     tree=self.tree)
     self.high = Node(DataFrame.from_array(high, self.df.columns),
                      self.split_metric, (self.depth + 1),
                      tree=self.tree)

     if not if_once:
         self.low.split(depth_needed=depth_needed)
         self.high.split(depth_needed=depth_needed)
Пример #8
0
def run_tests(training_set, testing_set, decision_tree, forest=False):
    """Fit a model on ``training_set`` and count its hits on ``testing_set``.

    Args:
        training_set: Rows of ``[bmi, weight, class]``.
        testing_set: Rows of ``[bmi, weight, class]`` to score against.
        decision_tree: Model exposing ``fit`` plus ``classify`` (or
            ``predict`` when ``forest`` is true).
        forest: Selects ``predict`` instead of ``classify``.

    Returns:
        A ``(correct, total)`` tuple.
    """
    decision_tree.fit(
        DataFrame.from_array(training_set, ['bmi', 'weight', 'class']))

    def predicted_class(features):
        if forest:
            return decision_tree.predict(features)
        return decision_tree.classify(features)

    hits = sum(
        1 for sample in testing_set
        if predicted_class({'bmi': sample[0], 'weight': sample[1]}) ==
        sample[2])
    return hits, len(testing_set)
Пример #9
0
 def compute_distances(self, observation):
     """Return each stored row's distance to ``observation`` with its label.

     Every row is stripped of its ``self.prediction_column`` entry and the
     remaining values are passed to ``self.compute_distance``.

     Args:
         observation: Passed through to ``self.compute_distance`` as its
             first argument.

     Returns:
         A dataframe with columns ``distance`` and ``Cookie Type``; the
         second entry is the row's value in the prediction column.
     """
     # Hoisted: the original rebuilt the row array once per row and looked
     # up the label column once per cell.
     rows = self.df.to_array()
     label_index = self.df.columns.index(self.prediction_column)

     result = []
     for row in rows:
         features = [
             value for position, value in enumerate(row)
             if position != label_index
         ]
         result.append(
             [self.compute_distance(observation, features), row[label_index]])
     return DataFrame.from_array(result,
                                 columns=['distance', 'Cookie Type'])
Пример #10
0
 def calc_goodness(self, split, axis_index):
     """Score a candidate split by how much weighted impurity it removes.

     Args:
         split: Threshold value; rows below it form the low child, rows
             at or above it form the high child.
         axis_index: Index of the row entry compared against ``split``.

     Returns:
         This node's impurity minus each child's impurity weighted by its
         share of this node's rows (not rounded).
     """
     rows = self.df.to_array()
     below = [row for row in rows if row[axis_index] < split]
     at_or_above = [row for row in rows if row[axis_index] >= split]

     # Trial children are built with check_splits=False.
     # NOTE(review): only the low child coerces depth with int(); kept as-is.
     low_node = Node(DataFrame.from_array(below, self.df.columns),
                     self.split_metric,
                     depth=int(self.depth) + 1,
                     check_splits=False,
                     tree=self.tree)
     high_node = Node(DataFrame.from_array(at_or_above, self.df.columns),
                      self.split_metric,
                      depth=(self.depth + 1),
                      check_splits=False,
                      tree=self.tree)

     goodness = self.impurity
     for child in (low_node, high_node):
         share = len(child.row_indices) / len(self.row_indices)
         goodness -= share * child.impurity
     return goodness
Пример #11
0
 def calc_possible_splits(self):
     """List every candidate split on ``x`` and ``y`` with its goodness.

     For each feature the distinct values are taken in order of first
     appearance, and the midpoint of every neighbouring pair becomes a
     candidate threshold.

     Returns:
         A dataframe with columns ``feature``, ``value`` and
         ``goodness of split``.
     """

     def distinct(values):
         # Order-preserving de-duplication using equality comparison.
         seen = []
         for value in values:
             if value not in seen:
                 seen.append(value)
         return seen

     per_feature = [(name, distinct(self.df.ordered_dict[name]))
                    for name in ('x', 'y')]

     splits = []
     for axis, (name, values) in enumerate(per_feature):
         for position in range(len(values) - 1):
             midpoint = (values[position] + values[position + 1]) / 2
             splits.append(
                 [name, midpoint,
                  self.calc_goodness(midpoint, axis)])
     return DataFrame.from_array(splits,
                                 ['feature', 'value', 'goodness of split'])
Пример #12
0
 def get_possible_splits(self):
     """List candidate splits for this node with their goodness.

     Candidate thresholds are the midpoints of neighbouring entries in
     each list of ``self.distinct_values``; the i-th list is paired with
     the i-th column that is neither ``class`` nor ``indices``.  With the
     ``'random'`` split metric only the candidates of one randomly chosen
     feature are kept.

     Returns:
         A dataframe with columns ``axis``, ``split_value`` and
         ``goodness of split``, or a plain empty list when the metric is
         ``'random'`` and there are no candidates.
     """
     feature_names = [
         name for name in self.df.columns
         if name not in ('class', 'indices')
     ]

     candidates = []
     for feature_index, values in enumerate(self.distinct_values):
         for lower, upper in zip(values, values[1:]):
             midpoint = (lower + upper) / 2
             candidates.append([
                 feature_names[feature_index], midpoint,
                 self.calc_goodness(midpoint, feature_index)
             ])

     if self.split_metric == 'random':
         features_with_splits = list(set([row[0] for row in candidates]))
         if not features_with_splits:
             return []
         chosen = random.choice(features_with_splits)
         candidates = [row for row in candidates if row[0] == chosen]

     return DataFrame.from_array(
         candidates, ['axis', 'split_value', 'goodness of split'])
Пример #13
0
# Smoke tests for DataFrame.apply, DataFrame.where and DataFrame.order_by.
data_dict = {'Pete': [1, 0, 1, 0], 'John': [2, 1, 0, 2], 'Sarah': [3, 1, 4, 0]}

df1 = DataFrame(data_dict, column_order=['Pete', 'John', 'Sarah'])
# Only the 'John' column is expected to change; the others stay as given.
df2 = df1.apply('John', lambda x: 7 * x)
print('Testing method "apply"...')
assert df2.data_dict == {
    'Pete': [1, 0, 1, 0],
    'John': [14, 7, 0, 14],
    'Sarah': [3, 1, 4, 0]
}
print('PASSED')

columns = ['firstname', 'lastname', 'age']
arr = [['Kevin', 'Fray', 5], ['Charles', 'Trapp', 17], ['Anna', 'Smith', 13],
       ['Sylvia', 'Mendez', 9]]
df = DataFrame.from_array(arr, columns)

# NOTE(review): the message names "select_rows_where" but the call below is
# df.where -- confirm which method name is current.
print('Testing method "select_rows_where"...')
assert df.where(lambda row: len(row['firstname']) >= len(row['lastname']) and
                row['age'] > 10).to_array() == [['Charles', 'Trapp', 17]]
print('PASSED')

# Ascending sort by age must reorder whole rows, not just the age column.
print('Testing method "order_by"...')
assert df.order_by('age', order="ASC").to_array() == [['Kevin', 'Fray', 5],
                                                      ['Sylvia', 'Mendez', 9],
                                                      ['Anna', 'Smith', 13],
                                                      ['Charles', 'Trapp', 17]]

assert df.order_by('firstname',
                   order="DESC").to_array() == [['Sylvia', 'Mendez', 9],
                                                ['Kevin', 'Fray', 5],
import sys
sys.path.append('src')
from matrix import Matrix
from dataframe import DataFrame
from linear_regressor import LinearRegressor
from logistic_regressor import LogisticRegressor

# Fit a cubic in x by running a linear regression on x, x^2 and x^3.
# Each dataset entry is an (x, y) pair.
dataset = [(0.0, 4.0), (0.2, 8.9), (0.4, 17.2), (0.6, 28.3), (0.8, 41.6),
           (1.0, 56.5), (1.2, 72.4), (1.4, 88.7), (1.6, 104.8), (1.8, 120.1),
           (2.0, 134.0), (2.2, 145.9), (2.4, 155.2), (2.6, 161.3),
           (2.8, 163.6), (3.0, 161.5), (3.2, 154.4), (3.4, 141.7),
           (3.6, 122.8), (3.8, 97.1), (4.0, 64.0), (4.2, 22.9), (4.4, -26.8),
           (4.6, -85.7), (4.8, -154.4)]

# Expand every pair into (x, x^2, x^3, y) so the column names line up.
new_columns = ['x', 'x^2', 'x^3', 'y']
new_dataset = [(pair[0], pair[0]**2, pair[0]**3, pair[1]) for pair in dataset]

df = DataFrame.from_array(new_dataset, new_columns)
polynomial_regressor = LinearRegressor(df, 'y')
polynomial_regressor_coefficients = polynomial_regressor.coefficients
print("polynomial_regressor_coefficients:", polynomial_regressor_coefficients)
Пример #15
0
import sys
sys.path.append('src')
from matrix import Matrix
from dataframe import DataFrame
from linear_regressor import LinearRegressor
import math
import matplotlib.pyplot as plt

# Fit y against sin(x), cos(x), sin(2x) and cos(2x) with a linear regressor.
df = DataFrame.from_array([(0.0, 7.0), (0.2, 5.6), (0.4, 3.56), (0.6, 1.23),
                           (0.8, -1.03), (1.0, -2.89), (1.2, -4.06),
                           (1.4, -4.39), (1.6, -3.88), (1.8, -2.64),
                           (2.0, -0.92), (2.2, 0.95), (2.4, 2.63), (2.6, 3.79),
                           (2.8, 4.22), (3.0, 3.8), (3.2, 2.56), (3.4, 0.68),
                           (3.6, -1.58), (3.8, -3.84), (4.0, -5.76),
                           (4.2, -7.01), (4.4, -7.38), (4.6, -6.76),
                           (4.8, -5.22)],
                          columns=['x', 'y'])
# Derive the four trigonometric feature columns from 'x'.
df = df.apply_add('x', lambda x: math.sin(x), 'sin(x)')
df = df.apply_add('x', lambda x: math.cos(x), 'cos(x)')
df = df.apply_add('x', lambda x: math.sin(2 * x), 'sin(2*x)')
df = df.apply_add('x', lambda x: math.cos(2 * x), 'cos(2*x)')
# Keep copies of the raw x and y values before 'x' is dropped.
x_values = list(df.data_dict['x'])
y_values = list(df.data_dict['y'])

# Drop the raw 'x' column so only the derived features are regressed on.

df = df.del_column('x')
# Solve for the coefficients of the derived features.
linear_regressor = LinearRegressor(df, dependent_variable='y')
Пример #16
0
#   #   [ 5,  0,  1,  0,  0,  5,  0,  0,  0,  0,  1,  8],
#   #   [ 5,  0,  0,  1,  0,  0,  5,  0,  0,  0,  1,  1],
#   #   [ 5,  0,  1,  1,  0,  5,  5,  0,  0,  1,  1,  0],
#   #   [ 0,  5,  0,  0,  0,  0,  0,  0,  0,  0,  1,  5],
#   #   [ 0,  5,  1,  0,  0,  0,  0,  5,  0,  0,  1,  0],
#   #   [ 0,  5,  0,  1,  0,  0,  0,  0,  5,  0,  1,  9],
#   #   [ 0,  5,  1,  1,  0,  0,  0,  5,  5,  1,  1,  0],
#   #   [ 5,  5,  0,  0, 25,  0,  0,  0,  0,  0,  1,  0],
#   #   [ 5,  5,  1,  0, 25,  5,  0,  5,  0,  0,  1,  0],
#   #   [ 5,  5,  0,  1, 25,  0,  5,  0,  5,  0,  1,  0],
#   #   [ 5,  5,  1,  1, 25,  5,  5,  5,  5,  1,  1,  0]]

# Demonstrates column and row selection on a small people table.
columns = ['firstname', 'lastname', 'age']
arr = [['Kevin', 'Fray', 5], ['Charles', 'Trapp', 17], ['Anna', 'Smith', 13],
       ['Sylvia', 'Mendez', 9]]
df = DataFrame.from_array(arr, columns)

print(df.to_array())

# Expected output of the column selection:
print(df.select_columns(['firstname', 'age']).to_array())
# [['Kevin', 5],
# ['Charles', 17],
# ['Anna', 13],
# ['Sylvia', 9]]

# Expected output of selecting rows 1 and 3 (zero-based):
print(df.select_rows([1, 3]).to_array())
# [['Charles', 'Trapp', 17],
# ['Sylvia', 'Mendez', 9]]

print(
    df.select_rows_where(lambda row: len(row['firstname']) >= len(row[
 (1.0, 56.5),
 (1.2, 72.4),
 (1.4, 88.7),
 (1.6, 104.8),
 (1.8, 120.1),
 (2.0, 134.0),
 (2.2, 145.9),
 (2.4, 155.2),
 (2.6, 161.3),
 (2.8, 163.6),
 (3.0, 161.5),
 (3.2, 154.4),
 (3.4, 141.7),
 (3.6, 122.8),
 (3.8, 97.1),
 (4.0, 64.0),
 (4.2, 22.9),
 (4.4, -26.8),
 (4.6, -85.7),
 (4.8, -154.4)]

# Build higher-order terms of x through repeated interaction columns, then
# regress y on the result.  ``arr`` is the (x, y) list defined above.
df = DataFrame.from_array(arr, ['x', 'y'])

# x with itself; the next call refers to the new column as 'x * x'.
df = df.create_interaction_terms('x', 'x')

# 'x * x' with x.
df = df.create_interaction_terms('x * x', 'x')

regressor = LinearRegressor(df, 'y')

print(regressor.coefficients)
Пример #18
0
import matplotlib.pyplot as plt

# Plot logistic-regression predictions for several values of ``delta``.
list_data = [[1, 0], [2, 0], [3, 0], [2, 1], [3, 1], [4, 1]]

delta_table = [0.1, 0.01, 0.001, 0.0001]
all_coords = []

for delta_low in delta_table:
    # new_list=[]
    # for pair in list_data:
    #     if pair[1] == 0:
    #         new_list.append([pair[0],delta])
    #     else:
    #         new_list.append([pair[0],1-delta])

    df = DataFrame.from_array(list_data, columns=['x', 'y'])

    regressor = LogisticRegressor(df,
                                  prediction_column='y',
                                  max_value=1,
                                  delta=delta_low)

    # coords[0] collects plotted x values, coords[1] the predictions.
    coords = [[], []]
    for x in range(20):
        # NOTE(review): the plotted x is x / 100 but the prediction is made
        # at x -- confirm which scale is intended.
        coords[0].append(x / 100)
        coords[1].append(regressor.predict({'constant': 1, 'x': x}))
    all_coords.append(coords)
print(all_coords)
plt.style.use('bmh')
# One curve per delta value.
for coords in all_coords:
    plt.plot(coords[0], coords[1], linewidth=2.5)
import sys
sys.path.append('src')
from dataframe import DataFrame
sys.path.append('kaggle/titanic')
from parse_line import parse_line

# Tests for DataFrame.query with SELECT ... ORDER BY statements.
df = DataFrame.from_array([['Kevin', 'Fray', 5], ['Charles', 'Trapp', 17],
                           ['Anna', 'Smith', 13], ['Sylvia', 'Mendez', 9]],
                          columns=['firstname', 'lastname', 'age'])

# Columns come back in the SELECT order, rows sorted by age descending.
assert df.query(
    "SELECT lastname, firstname, age ORDER BY age DESC").to_array() == [[
        'Trapp', 'Charles', 17
    ], ['Smith', 'Anna', 13], ['Mendez', 'Sylvia', 9], ['Fray', 'Kevin', 5]]

print("\npassed test 1")

# Ordering by a column that is not selected must still work.
assert df.query("SELECT firstname ORDER BY lastname ASC").to_array() == [[
    'Kevin'
], ['Sylvia'], ['Anna'], ['Charles']]

print("\npassed test 2")

# Each age now appears twice, so a second sort key is needed to break ties.
df = DataFrame.from_array(
    [['Kevin', 'Fray', 5], ['Melvin', 'Fray', 5], ['Charles', 'Trapp', 17],
     ['Carl', 'Trapp', 17], ['Anna', 'Smith', 13], ['Hannah', 'Smith', 13],
     ['Sylvia', 'Mendez', 9], ['Cynthia', 'Mendez', 9]],
    columns=['firstname', 'lastname', 'age'])

assert df.query(
    "SELECT lastname, firstname, age ORDER BY age ASC, firstname DESC"
import math

# Regress y on sin(x), cos(x), sin(2x) and cos(2x).  Entries are (x, y).
data = [(0.0, 7.0), (0.2, 5.6), (0.4, 3.56), (0.6, 1.23), (0.8, -1.03),
        (1.0, -2.89), (1.2, -4.06), (1.4, -4.39), (1.6, -3.88), (1.8, -2.64),
        (2.0, -0.92), (2.2, 0.95), (2.4, 2.63), (2.6, 3.79), (2.8, 4.22),
        (3.0, 3.8), (3.2, 2.56), (3.4, 0.68), (3.6, -1.58), (3.8, -3.84),
        (4.0, -5.76), (4.2, -7.01), (4.4, -7.38), (4.6, -6.76), (4.8, -5.22)]

columns = ['y', 'sin(x)', 'cos(x)', 'sin(2x)', 'cos(2x)']

# Row layout matches ``columns``: the target first, then the four features.
new_data = [[y, math.sin(x),
             math.cos(x),
             math.sin(2 * x),
             math.cos(2 * x)] for (x, y) in data]

df = DataFrame.from_array(new_data, columns)

regressor = LinearRegressor(df, 'y')

print(regressor.coefficients)
'''
import matplotlib.pyplot as plt
plt.style.use('bmh')

x_points = []
predicted_points = []

x = 0
while x <= 5 :
    data_dict = {'sin(x)' : math.sin(x), 'cos(x)' : math.cos(x), 'sin(2x)' : math.sin(2 * x), 'cos(2x)' : math.cos(2 * x)}
    x_points.append(x)
import sys
sys.path.append('src')
from polynomial_regressor import PolynomialRegressor
from dataframe import DataFrame

# Fit quadratic and cubic models of distance against time and compare
# their predictions.  Entries are (time, distance) pairs.
data = [(1, 3.1), (2, 10.17), (3, 20.93), (4, 38.71), (5, 60.91), (6, 98.87),
        (7, 113.92), (8, 146.95), (9, 190.09), (10, 232.65)]

df = DataFrame.from_array(data, ['time', 'distance'])

quadratic_regressor = PolynomialRegressor(degree=2)
quadratic_regressor.fit(df, 'distance')
print('Quadratic Regressor:')
print(quadratic_regressor.coefficients)

# 5 and 10 lie inside the observed time range; 200 is far outside it.
for t in [5, 10, 200]:
    print('Distance after ' + str(t) + ' seconds:',
          quadratic_regressor.predict({'time': t}))

# A fresh dataframe is built from the same data for the cubic fit.
df = DataFrame.from_array(data, ['time', 'distance'])

cubic_regressor = PolynomialRegressor(degree=3)
cubic_regressor.fit(df, 'distance')
print('Cubic Regressor:')
print(cubic_regressor.coefficients)

for t in [5, 10, 200]:
    print('Distance after ' + str(t) + ' seconds:',
          cubic_regressor.predict({'time': t}))
import sys
sys.path.append('src')
from matrix import Matrix
from dataframe import DataFrame
from linear_regressor import LinearRegressor

# Linear regression of a sandwich rating on two ingredient amounts.
# In the training rows at most one ingredient is non-zero.
df = DataFrame.from_array(
    [[0, 0, 1], [1, 0, 2], [2, 0, 4], [4, 0, 8], [6, 0, 9], [0, 2, 2],
     [0, 4, 5], [0, 6, 7], [0, 8, 6]],
    columns=['slices of roast beef', 'tbsp of peanut butter', 'rating'])

regressor = LinearRegressor(df, dependent_variable='rating')
print(regressor.coefficients)
# Prediction for roast beef only.
print(
    regressor.predict({
        'slices of roast beef': 5,
        'tbsp of peanut butter': 0
    }))
# Prediction for both ingredients combined, which no training row covers.
print(
    regressor.predict({
        'slices of roast beef': 5,
        'tbsp of peanut butter': 5
    }))
Пример #23
0
import sys
sys.path.append('src')
from dataframe import DataFrame
from decision_tree import DecisionTree
from random_forest import RandomForest

# Train a random forest on labelled (x, y) points and classify one point.
# Rows are [x, y, class]; many points are deliberately repeated.
data = [[2,13,'B'],[2,13,'B'],[2,13,'B'],[2,13,'B'],[2,13,'B'],[2,13,'B'],
        [3,13,'B'],[3,13,'B'],[3,13,'B'],[3,13,'B'],[3,13,'B'],[3,13,'B'],
        [2,12,'B'],[2,12,'B'],
        [3,12,'A'],[3,12,'A'],
        [3,11,'A'],[3,11,'A'],
        [3,11.5,'A'],[3,11.5,'A'],
        [4,11,'A'],[4,11,'A'],
        [4,11.5,'A'],[4,11.5,'A'],
        [2,10.5,'A'],[2,10.5,'A'],
        [3,10.5,'B'],
        [4,10.5,'A'],
        [3, 9.5, 'A'],
        [2,10,'A']]

df = DataFrame.from_array(data, columns = ['x', 'y', 'class'])

# NOTE(review): 10 is presumably the number of trees -- confirm against
# the RandomForest constructor.
r = RandomForest(10)
r.fit(df)
print(r.predict({'x': 3, 'y': 10}))
Пример #24
0
import sys
sys.path.append('src')
from matrix import Matrix
from dataframe import DataFrame
from linear_regressor import LinearRegressor
from logistic_regressor import LogisticRegressor

 df = DataFrame.from_array(
[[0, 0, 1, 0], 
[1, 0, 2, 0], 
[2, 0, 4, 0], 
[4, 0, 8, 0], 
[6, 0, 9, 0], 
[0, 2, 2, 0], 
[0, 4, 5, 0], 
[0, 6, 7, 0], 
[0, 8, 6, 0],
[2, 2, 0.1, 4],
[3, 4, 0.1, 12]],
columns = ['beef', 'pb', 'rating', 'interactive']
)
log_reg = LogisticRegressor(df,10, dependent_variable = 'rating')
print(log_reg.predict({'beef': 5, 'pb': 0 , 'interactive':0}))
print(log_reg.predict({'beef': 12, 'pb': 0 , 'interactive':0}))
print(log_reg.predict({'beef': 5, 'pb': 5 , 'interactive':25}))
df = DataFrame.from_array([[1,0.2], [2,0.25], [3,0.5]], columns = ['hours worked', 'progress'])

regressor = LinearRegressor(df, dependent_variable='progress')

print('Does all the linear_regressor stuff work')

assert regressor.coefficients == [0.01667, 0.15], 'No, coefficients does not work'

assert regressor.predict({'hours worked': 4}) == 0.61667, 'No, predict does not work'

print('Yes they do', "\n")
'''

# Checks LinearRegressor coefficients on a small two-feature data set.
df = DataFrame.from_array(
    [[0, 0, 0.1], [1, 0, 0.2], [0, 2, 0.5], [4, 5, 0.6]],
    columns=['scoops of chocolate', 'scoops of vanilla', 'taste rating'])

regressor = LinearRegressor(df, dependent_variable='taste rating')

print('Does all the linear_regressor stuff work')

# Round a copy so the comparison below tolerates floating-point noise
# without touching the regressor's own coefficients.
reg_coeff = regressor.coefficients.copy()
for (key, value) in reg_coeff.items():
    reg_coeff[key] = round(value, 8)

assert reg_coeff == {
    'constant': 0.19252336,
    'scoops of chocolate': -0.05981308,
    'scoops of vanilla': 0.13271028
}, 'No, coefficients does not work'
import sys
sys.path.append('src')
from k_nearest_neighbors_classifier import KNearestNeighborsClassifier
from dataframe import DataFrame

# k-nearest-neighbours demo: cookie type from four ingredient portions.
# Row layout: [cookie type, eggs, butter, sugar, flour].
df = DataFrame.from_array(
    [['Shortbread', 0.14, 0.14, 0.28, 0.44],
     ['Shortbread', 0.10, 0.18, 0.28, 0.44],
     ['Shortbread', 0.12, 0.10, 0.33, 0.45],
     ['Shortbread', 0.10, 0.25, 0.25, 0.40], ['Sugar', 0.00, 0.10, 0.40, 0.50],
     ['Sugar', 0.00, 0.20, 0.40, 0.40], ['Sugar', 0.10, 0.08, 0.35, 0.47],
     ['Sugar', 0.00, 0.05, 0.30, 0.65], ['Fortune', 0.20, 0.00, 0.40, 0.40],
     ['Fortune', 0.25, 0.10, 0.30, 0.35], ['Fortune', 0.22, 0.15, 0.50, 0.13],
     ['Fortune', 0.15, 0.20, 0.35, 0.30], ['Fortune', 0.22, 0.00, 0.40, 0.38]],
    columns=[
        'Cookie Type', 'Portion Eggs', 'Portion Butter', 'Portion Sugar',
        'Portion Flour'
    ])

knn = KNearestNeighborsClassifier(k=5)
knn.fit(df, dependent_variable='Cookie Type')
# The observation carries the four feature columns only, not the label.
observation = {
    'Portion Eggs': 0.10,
    'Portion Butter': 0.15,
    'Portion Sugar': 0.30,
    'Portion Flour': 0.45
}

print(knn.compute_distances(observation).to_array())
# Returns a dataframe representation of the following array:
Пример #27
0
# for i in range(len(pos_neg)):
#     correct_class = pos_neg[i][3]
#     observation = into_new_observation(pos_neg[i])
#     prediction = r.predict(observation)
#     if prediction == correct_class:
#         correct += 1

# assert correct/len(pos_neg) * 100 == 100, 'WRONG ACCURACY BRUH'

points = [[x, y, z, 'A'] for z in range(-5, 6) for y in range(-5, 6)
          for x in range(-5, 6) if x * y * z != 0]
points.extend([[x, y, z, 'B'] for z in range(1, 6) for y in range(1, 6)
               for x in range(1, 6) if x * y * z != 0])
points.extend([[x, y, z, 'B'] for z in range(1, 6) for y in range(1, 6)
               for x in range(1, 6) if x * y * z != 0])

df = DataFrame.from_array(points, columns=['x', 'y', 'z', 'class'])
r = RandomForest(100, depth=None)
r.fit(df)
correct = 0

for i in range(len(points)):
    correct_class = points[i][3]
    observation = into_new_observation(points[i])
    prediction = r.predict(observation)
    if prediction == correct_class:
        correct += 1

assert correct / len(points) * 100 == 90, 'WRONG ACCURACY BRUH'

print('passed')
Пример #28
0
import sys
sys.path.append('src')
from matrix import Matrix
from dataframe import DataFrame
from linear_regressor import LinearRegressor
from logistic_regressor import LogisticRegressor

# Logistic regression on sandwich ratings with dummy-coded condiments and
# pairwise interaction terms.  Rows are [beef, pb, condiments, rating].
df = DataFrame.from_array(
    [[0, 0, [], 1], [0, 0, ['mayo'], 1], [0, 0, ['jelly'], 4],
     [0, 0, ['mayo', 'jelly'], 0], [5, 0, [], 4], [5, 0, ['mayo'], 8],
     [5, 0, ['jelly'], 1], [5, 0, ['mayo', 'jelly'], 0], [0, 5, [], 5],
     [0, 5, ['mayo'], 0], [0, 5, ['jelly'], 9], [0, 5, ['mayo', 'jelly'], 0],
     [5, 5, [], 0], [5, 5, ['mayo'], 0], [5, 5, ['jelly'], 0],
     [5, 5, ['mayo', 'jelly'], 0]],
    columns=['beef', 'pb', 'condiments', 'rating'])
# The later calls refer to 'mayo' and 'jelly' columns, which presumably
# come from this dummy-variable step.
df = df.create_dummy_variables('condiments')
df = df.create_interaction_terms('beef', 'pb')

# One interaction column for every remaining pair of features.
df = df.create_interaction_terms('beef', 'mayo')
df = df.create_interaction_terms('beef', 'jelly')
df = df.create_interaction_terms('pb', 'mayo')
df = df.create_interaction_terms('pb', 'jelly')
df = df.create_interaction_terms('mayo', 'jelly')
log_df = DataFrame(df.data_dict, df.columns)

logistic_regressor = LogisticRegressor(log_df, 10, dependent_variable='rating')

# test 8 slices of beef + mayo
# NOTE(review): the observation omits pb, jelly and every interaction
# column -- presumably predict() fills those in; confirm.
observation = {'beef': 8, 'mayo': 1}

assert round(logistic_regressor.predict(observation), 2) == 9.72
Пример #29
0
import sys
sys.path.append('src')
from dataframe import DataFrame
from polynomial_regressor import PolynomialRegressor

# Fit polynomial regressors of degree 0, 1 and 2 to the same six points.
# The values that followed each print() were live expression statements;
# they are kept as comments recording the expected output.
df = DataFrame.from_array(
    [(0,1), (1,2), (2,5), (3,10), (4,20), (5,30)],
    columns = ['x', 'y']
)

constant_regressor = PolynomialRegressor(degree=0)
constant_regressor.fit(df, dependent_variable='y')
print(constant_regressor.coefficients)
# Expected: {'constant': 11.3333}
print(constant_regressor.predict({'x': 2}))
# Expected: 11.3333

linear_regressor = PolynomialRegressor(degree=1)

linear_regressor.fit(df, dependent_variable='y')
print(linear_regressor.coefficients)
# Expected: {'constant': -3.2381, 'x': 5.8286}
print(linear_regressor.predict({'x': 2}))
# Expected: 8.4190



quadratic_regressor = PolynomialRegressor(degree=2)
quadratic_regressor.fit(df, dependent_variable='y')
print(quadratic_regressor.coefficients)
# Expected: {'constant': 1.1071, 'x': -0.6893, 'x^2': 1.3036}
Пример #30
0
# print('\nTesting root low high indices')
# assert dt.root.low.high.row_indices == [6]
# print('passed')

# print('\nTesting root low low impurity')
# assert dt.root.low.low.impurity == 0
# print('passed')

# print('\nTesting root low high impurity')
# assert dt.root.low.high.impurity == 0
# print('passed')

# Checks that two manual split() calls and a single fit() call partition the
# rows the same way.  Rows are [x, y, class]; row_indices refer to the
# positions in this list.
print('Splitting Tests')
df = DataFrame.from_array(
    [[1, 11, 'A'], [1, 12, 'A'], [2, 11, 'A'], [1, 13, 'B'], [2, 13, 'B'],
     [3, 13, 'B'], [3, 11, 'B']],
    columns=['x', 'y', 'class'])

dt = DecisionTree(split_metric='gini')
dt.initialize(df)
# Two calls are needed before the grandchildren checked below exist.
dt.split()
dt.split()

assert dt.root.high.row_indices == [3, 4, 5]
assert dt.root.low.low.row_indices == [0, 1, 2]
assert dt.root.low.high.row_indices == [6]
print('passed')
# The same structure is expected when the tree is grown with fit().
dt = DecisionTree(split_metric='gini')
dt.fit(df)
assert dt.root.high.row_indices == [3, 4, 5]
assert dt.root.low.low.row_indices == [0, 1, 2]