Exemplo n.º 1
0
    "PassengerId": int,
    "Survived": int,
    "Pclass": int,
    "Name": str,
    "Sex": str,
    "Age": float,
    "SibSp": int,
    "Parch": int,
    "Ticket": str,
    "Fare": float,
    "Cabin": str,
    "Embarked": str
}

# Load the labelled Titanic passengers. `data_types` and `parse_line` are
# defined outside this span.
df = DataFrame.from_csv("kaggle/titanic/dataset_of_knowns.csv",
                        data_types=data_types,
                        parser=parse_line)


def _surname(name):
    """Return the text before the first comma of *name*, minus its first character."""
    # NOTE(review): the dropped first character is presumably a leading quote
    # left in the field by the parser -- confirm against parse_line.
    return name.split(",")[0][1:]


def _cabin_type(cabin):
    """Return the first character of the first cabin listed, or None if blank."""
    if not cabin:
        return None
    return cabin.split(" ")[0][0]


def _cabin_number(cabin):
    """Return the number of the first cabin listed, or None if there is none.

    Blank/None cabins and entries that are only a single letter (e.g. "F")
    yield None instead of reaching int("") and raising ValueError.
    """
    if not cabin:
        return None
    first = cabin.split(" ")[0]
    # str.split(" ") never returns an empty list, so the emptiness test has to
    # be made on the first token itself rather than on the list length.
    if len(first) < 2:
        return None
    return int(first[1:])


def _ticket_type(ticket):
    """Return the leading token of a multi-token ticket, else None."""
    if ticket is None:
        return None
    tokens = ticket.split(" ")
    return None if len(tokens) == 1 else tokens[0]


def _ticket_number(ticket):
    """Return the trailing token of *ticket* as an int, or None if not a number."""
    # Same None guard as _ticket_type, so a missing ticket does not raise.
    if ticket is None:
        return None
    last = ticket.split(" ")[-1]
    # isdecimal() accepts exactly the digit strings int() can parse, whereas
    # isnumeric() also accepts characters such as fractions that int() rejects.
    return int(last) if last.isdecimal() else None


# Each step returns a new frame with one derived column added.
df2 = df.generate_new_column("Name", "Surname", _surname)
df3 = df2.generate_new_column("Cabin", "CabinType", _cabin_type)
df4 = df3.generate_new_column("Cabin", "CabinNumber", _cabin_number)
df5 = df4.generate_new_column("Ticket", "TicketType", _ticket_type)
df6 = df5.generate_new_column("Ticket", "TicketNumber", _ticket_number)
df6.filter_columns([
Exemplo n.º 2
0
from dataframe import DataFrame
sys.path.pop(-1)

# Declared type of each CSV column: five categorical text columns followed by
# three integer score columns (insertion order matches the listing below).
data_types = {
    **dict.fromkeys(
        ("gender", "race/ethnicity", "parental level of education", "lunch",
         "test preparation course"), str),
    **dict.fromkeys(("math score", "reading score", "writing score"), int),
}

dataframe = DataFrame.from_csv(
    "quiz_2-5/StudentsPerformance.csv",
    data_types=data_types,
    parser=parse_line,
)

# NOTE(review): the return values of apply are discarded below, so this code
# relies on apply modifying `dataframe` in place -- confirm.

# Encode gender numerically: 'male' -> 1.0, anything else -> 0.0.
dataframe.apply('gender', lambda sex: float(sex == 'male'))

# Parallel lists: the label at position i is replaced by the code at position i.
race_enthnicity = ['group A', 'group B', 'group C', 'group D', 'group E']
race_enthnicity_2 = [4.0, 2.0, 0.0, 1.0, 3.0]
dataframe.apply(
    'race/ethnicity',
    lambda race: race_enthnicity_2[race_enthnicity.index(race)],
)

# Education labels paired with the codes 0.0 through 5.0, in the order listed.
# NOTE(review): the labels carry no apostrophes ('associates degree'); .index
# raises ValueError for any value not listed, so confirm the parsed CSV
# produces exactly these spellings.
level_of_education = [
    'some high school',
    'high school',
    'associates degree',
    'some college',
    'bachelors degree',
    'masters degree',
]
level_of_education_2 = [
    float(rank) for rank in range(len(level_of_education))
]
dataframe.apply(
    'parental level of education',
    lambda edu: level_of_education_2[level_of_education.index(edu)],
)
Exemplo n.º 3
0
import sys
sys.path.append('src')
from dataframe import DataFrame
from decision_tree import DecisionTree
from node import Node
from random_forest import RandomForest
import math

# path_to_datasets = 'C:/Users/mezag/Documents/Github/machine_learning/datasets/'
# Dataset location, relative to the current working directory.
path_to_datasets = './datasets/'
filename = 'freshman_lbs.csv'
filepath = path_to_datasets + filename

df = DataFrame.from_csv(filepath)

# Keep three columns, move them with two swaps, then rename them.
# NOTE(review): assuming filter_columns keeps the order given here, the swaps
# yield (weight, BMI, sex), matching the new names -- confirm.
df = df.filter_columns(['Sex', 'Weight (lbs, Sep)', 'BMI (Sep)'])
df = df.swap_columns(0, 2)
df = df.swap_columns(0, 1)
df = df.rename_columns(['weight', 'bmi', 'class'])

# Convert the two measurement columns to floats.
df = df.apply('weight', float)
df = df.apply('bmi', float)

# Strip double quotes from the class labels. The result is rebound like every
# other step here; previously it was discarded, which loses the change if
# apply returns a new frame instead of mutating in place.
df = df.apply('class', lambda x: x.strip('"'))

# def split_sets(data, num_sets):
#     training_sets = []
#     testing_sets = []
#     interval = math.ceil(len(data)/num_sets)
#     for i in range(num_sets):
#         training = []
#         testing = []
#         starter = i*interval
#         cutoff = i*interval + interval
Exemplo n.º 4
0
# `path_to_datasets` and `filename` are defined outside this span.
filepath = path_to_datasets + filename

# Column name -> Python type, listed in the same order as the expected header.
data_types = dict(
    PassengerId=int,
    Survived=int,
    Pclass=int,
    Name=str,
    Sex=str,
    Age=float,
    SibSp=int,
    Parch=int,
    Ticket=str,
    Fare=float,
    Cabin=str,
    Embarked=str,
)

# NOTE(review): the second positional argument (True) is presumably the
# header flag -- confirm against DataFrame.from_csv's signature.
df = DataFrame.from_csv(filepath, True, data_types, parse_line)

# The loaded frame must expose exactly these column names, in this order.
assert df.columns == [
    'PassengerId',
    'Survived',
    'Pclass',
    'Name',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Ticket',
    'Fare',
    'Cabin',
    'Embarked',
]
assert df.to_array()[:3] == [
    [
        1, 0, 3, '"Braund, Mr. Owen Harris"', "male", 22, 1, 0, "A/5 21171",
        7.25, "", "S"
    ],
    [
        2, 1, 1, '"Cumings, Mrs. John Bradley (Florence Briggs Thayer)"',
        "female", 38, 1, 0, "PC 17599", 71.2833, "C85", "C"
    ],
    [
        3, 1, 3, '"Heikkinen, Miss. Laina"', "female", 26, 0, 0,
Exemplo n.º 5
0
import sys
sys.path.append('src/models')
from logistic_regressor import LogisticRegressor
from linear_regressor import LinearRegressor
from random_forest import RandomForest
from decision_tree import DecisionTree
from naive_bayes_classifier import NaiveBayesClassifier
from k_nearest_neighbors_classifier import KNearestNeighborsClassifier
from dataframe import DataFrame

path_to_datasets = 'C:/Users/colbi/VSCode/Computational Math/machine-learning/kaggle/titanic/'

filename = 'dataset_of_knowns.csv'
filepath = path_to_datasets + filename
dataframe = DataFrame.from_csv(filepath, header=True)


def _as_float(value):
    """Return *value* unchanged if it is already a float, else float(value)."""
    if isinstance(value, float):
        return value
    return float(value)


def _age_or_zero(value):
    """Like _as_float, except that an empty string becomes the integer 0."""
    if isinstance(value, float):
        return value
    if value != '':
        return float(value)
    return 0


# NOTE(review): return values of apply / append_columns / remove_columns are
# discarded throughout, so this relies on them mutating `dataframe` in place
# -- confirm.
dataframe.apply('Survived', _as_float)

# Add a running row index, then capture it and the labels before both columns
# are removed from the frame.
dataframe.append_columns({'indices': list(range(len(dataframe)))})
dataframe_indices = dataframe['indices']
survived_people = dataframe['Survived']
dataframe.remove_columns(
    ['PassengerId', 'Survived', 'Ticket', 'Fare', 'Cabin', 'Name', 'indices'])

# Numeric encodings for the remaining columns.
dataframe.apply('Sex', lambda sex: int(sex != 'male'))  # male -> 0, else 1
dataframe.apply('Age', _age_or_zero)
dataframe.apply('Pclass', _as_float)
dataframe.apply('SibSp', _as_float)
dataframe.apply('Parch', _as_float)
# 'S' -> 0, 'C' -> 1, any other value -> 2.
dataframe.apply('Embarked', lambda s: {'S': 0, 'C': 1}.get(s, 2))

dataframe.append_pairwise_interactions()
Exemplo n.º 6
0
], 'No, order_by does not work for "age" and ascending = True'

# Sorting by 'firstname' with ascending=False must produce this row order.
rows_by_firstname_desc = df.order_by('firstname', ascending=False).to_array()
assert rows_by_firstname_desc == [
    ['Sylvia', 'Mendez', 9],
    ['Kevin', 'Fray', 5],
    ['Charles', 'Trapp', 17],
    ['Anna', 'Smith', 13],
], 'No, order_by does not work for "firstname" and ascending = False'
print('Yes it does!', "\n")

import csv

# NOTE(review): `test` is not used anywhere in the visible part of the file.
test = []

# Load the air-travel CSV, treating its first row as the header.
path_to_datasets = '/home/runner/machine-learning/datasets/'
filename = 'airtravel.csv'
filepath = path_to_datasets + filename
df = DataFrame.from_csv(filepath, header=True)

print('Does from_csv work?')

# The expected names include literal double-quote characters.
expected_header = [
    '"Month"',
    '"1958"',
    '"1959"',
    '"1960"',
]
assert df.columns == expected_header, 'No, the columns are not correct'

assert df.to_array() == [['"JAN"', '340', '360', '417'],
                         ['"FEB"', '318', '342', '391'],
                         ['"MAR"', '362', '406', '419'],
                         ['"APR"', '348', '396', '461'],
                         ['"MAY"', '363', '420', '472'],
                         ['"JUN"', '435', '472', '535'],
                         ['"JUL"', '491', '548', '622'],
                         ['"AUG"', '505', '559', '606'],
                         ['"SEP"', '404', '463', '508'],