    def test_load_CSV_into_numpy(self):
        file_loader = FileLoader()
        file_path = "test_load_csv_into_numpy.csv"
        result = file_loader.load_file(file_path)
        expected = np.array([[1, 2], [3, 4]])
        self.assertTrue(np.array_equal(result, expected))
    def test_load_MergedCSV_into_numpy(self):
        file_loader = FileLoader()
        file_path = "../Datasets/HIV_37_Samples/MergedDataset.csv"
        result = file_loader.load_file(file_path)
        expected = np.zeros((37, 397))
        self.assertTrue(result.shape == expected.shape)
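For context, a minimal sketch of the load_file behaviour these tests assume: a thin wrapper that reads a comma-separated file into a 2-D NumPy array (hypothetical, not the project's actual implementation).

import numpy as np

class FileLoader:
    def load_file(self, file_path):
        # Hypothetical sketch: parse a comma-separated file into a 2-D array.
        return np.genfromtxt(file_path, delimiter=",")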
Example #3
    def __init__(self, path, part_size, stream_id):
        self.__part_size = part_size
        dp = DataProcessor()
        fl = FileLoader()
        data = fl.load_file(path)
        self.__size = len(data)
        self.__chunks = dp.split_data(data, part_size)
        debug('FileStreamer.init(): len(self.__chunks) == %d' % len(self.__chunks))
        self.__stream_id = stream_id
    def test_split_merge_csv_4_25_8(self):
        file_loader = FileLoader()
        data_manager = DataManager()
        file_path = "../Datasets/HIV_37_Samples/MergedDataset.csv"
        result = file_loader.load_file(file_path)
        data_manager.set_data(result)
        data_manager.split_data(test_split=0.11, train_split=0.22)

        test_shapes = np.zeros((4, 397)).shape
        valid_shapes = np.zeros((25, 397)).shape
        train_shapes = np.zeros((8, 397)).shape
        expected = np.array([test_shapes, valid_shapes, train_shapes])
        result = np.array([data_manager.datum[SplitTypes.Test].shape,
                           data_manager.datum[SplitTypes.Valid].shape,
                           data_manager.datum[SplitTypes.Train].shape])
        self.assertTrue(np.array_equal(result, expected))
    def test_split_merge_csv_7_7_23(self):
        file_loader = FileLoader()
        data_manager = DataManager()
        file_path = "../Datasets/HIV_37_Samples/MergedDataset.csv"
        result = file_loader.load_file(file_path)
        data_manager.set_data(result)
        data_manager.split_data(test_split=0.19, train_split=0.62)

        valid_and_test_shapes = (7, 397)
        train_shapes = (23, 397)
        expected = np.array([valid_and_test_shapes, valid_and_test_shapes, train_shapes])
        result = np.array([data_manager.datum[SplitTypes.Test].shape,
                           data_manager.datum[SplitTypes.Valid].shape,
                           data_manager.datum[SplitTypes.Train].shape])
        self.assertTrue(np.array_equal(result, expected))
    def test_split_into_target_and_input(self):
        file_loader = FileLoader()
        data_manager = DataManager()
        file_path = "../Datasets/HIV_37_Samples/MergedDataset.csv"
        result = file_loader.load_file(file_path)
        data_manager.set_data(result)
        data_manager.split_data(test_split=0.11, train_split=0.22)
        test_shapes_input = np.zeros((4, 396)).shape
        valid_shapes_input = np.zeros((25, 396)).shape
        train_shapes_input = np.zeros((8, 396)).shape
        test_shapes_target = np.zeros((4,)).shape
        valid_shapes_target = np.zeros((25,)).shape
        train_shapes_target = np.zeros((8,)).shape
        expected = np.array([test_shapes_input, valid_shapes_input, train_shapes_input,
                             test_shapes_target, valid_shapes_target, train_shapes_target])
        result = np.array([data_manager.inputs[SplitTypes.Test].shape,
                           data_manager.inputs[SplitTypes.Valid].shape,
                           data_manager.inputs[SplitTypes.Train].shape,
                           data_manager.targets[SplitTypes.Test].shape,
                           data_manager.targets[SplitTypes.Valid].shape,
                           data_manager.targets[SplitTypes.Train].shape])
        self.assertTrue(np.array_equal(result, expected))
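The split tests above index data_manager.datum, inputs and targets with SplitTypes members; a minimal sketch of the enum they appear to assume (hypothetical names and values, inferred from the tests):

from enum import Enum

class SplitTypes(Enum):
    # Hypothetical definition inferred from how the tests index the split dictionaries.
    Train = 0
    Valid = 1
    Test = 2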
class CoreApplication(object):
  
    def __init__(self):
        self.iterator = True
        self.logger = logging.getLogger('SecurityMetricIDS')
        self.__is_file_valid(config.logname)
        self.__log_loader = FileLoader()

    def start_core(self):
        core_app = threading.Thread(target=self.__start_core_thread, args=())
        core_app.daemon = True
        core_app.start()

    def __is_file_valid(self, filename):
        if not os.path.exists(filename):
            try:
                raise Exception("""Process terminated at {}. Selected path of logging file is not valid.
                 Please specify correct authentication log path.""".format(datetime.datetime.now()))
            except Exception as err:
                self.logger.error(err)
                sys.exit(1)
        else:
            self.logger.info("File validation successful, file {} is available".format(filename))

    def __start_core_thread(self):
        self.logger.info("Starting core of the application")
        self.iterator = True

        metrics_computer = MetricsComputer()
        last_modified_config = os.stat('config.py').st_mtime
        reload(config)
        while self.iterator:
            self.logger.info("New analyse iteration started.")
            self.__log_loader.read_file()
            metrics_computer.compute_metrics()
            time.sleep(config.analyse_time*60)
            if not last_modified_config == os.stat('config.py').st_mtime:
                self.logger.info("Config file was changed. Reloading config.")
                last_modified_config = os.stat('config.py').st_mtime
                reload(config)
        self.logger.info("Reading log data finished.")
        return
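Note that a bare reload() is a builtin only in Python 2; if the CoreApplication module targets Python 3 (an assumption, since the imports are not shown here), it would also need an explicit import:

# Assumption: running under Python 3, where reload lives in importlib.
from importlib import reload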
    def test_experiment(self):
        output_filename_header = FileLoader.create_output_file()
        time.sleep(1)
        loaded_algorithm_combinations = FileLoader.read_csv_file("../Datasets/test.csv")
        file_path = "../Datasets/HIV_37_Samples/MergedDataset.csv"
        loaded_data = FileLoader.load_file(file_path)
        # feature_eliminator = SelectKBest(f_regression,k=k_value)

        print (loaded_algorithm_combinations[0])
        output_filename = FileLoader.create_output_file()

        for i in range(0, 80):
            normalizer = self.getnormalizer(loaded_algorithm_combinations[i][0])

            feature_eliminator = self.getfeature_eliminator(loaded_algorithm_combinations[i][1])
            the_model = self.get_model(loaded_algorithm_combinations[i][2])

            print "taking ", type(normalizer).__name__, "and feature selector ", type(
                feature_eliminator
            ).__name__, "model", type(the_model).__name__
            FileLoader.write_model_in_file(
                output_filename_header,
                type(normalizer).__name__,
                type(feature_eliminator).__name__,
                type(the_model).__name__,
                "",
                "",
                "",
                "",
                "",
            )

            the_data_manager = DataManager(feature_eliminator, normalizer=normalizer)
            the_data_manager.set_data(loaded_data)
            the_data_manager.split_data(test_split=0.15, train_split=0.70)
            exp = Experiment(the_data_manager, the_model)

            exp.run_experiment()
            # arr_selected = feature_eliminator.get_support(indices=True)

            # if(exp.get_r2(SplitTypes.Train) > 0 and exp.get_r2(SplitTypes.Valid) > 0 and exp.get_r2(SplitTypes.Test) >  0):
            FileLoader.write_model_in_file(
                output_filename,
                type(normalizer).__name__,
                type(feature_eliminator).__name__,
                type(the_model).__name__,
                "",
                exp.fitness_matrix[0],
                exp.get_r2(SplitTypes.Train),
                exp.get_r2(SplitTypes.Valid),
                exp.get_r2(SplitTypes.Test),
            )
    def test_experiment_not_transformed_test(self):
        file_path = "../Datasets/HIV_37_Samples/MergedDataset.csv"
        loaded_data = FileLoader.load_file(file_path)
        data_manager = DataManager()
        data_manager.set_data(loaded_data)
        data_manager.split_data(test_split=0.19, train_split=0.62)
        learning_model = FakePredictionModel()
        exp = Experiment(data_manager, learning_model)

        exp.run_experiment()

        self.assertEqual(0, exp.get_r2(SplitTypes.Test))
    def test_experiment_svm_svr_37dataset_r2_train(self):
        file_path = "../Datasets/HIV_37_Samples/MergedDataset.csv"
        loaded_data = FileLoader.load_file(file_path)
        the_data_manager = DataManager()
        the_data_manager.set_data(loaded_data)
        the_data_manager.split_data(test_split=0.19, train_split=0.62)
        the_model = svm.SVR()
        exp = Experiment(the_data_manager, the_model)
        exp.run_experiment()

        r2_train = exp.get_r2(SplitTypes.Train)
        expected_svm_r2_value = 0.93994377385638073
        self.assertEqual(r2_train, expected_svm_r2_value)
Example #11
    def create(self, fname, varnames, plottype, opts=None):
        opts = opts or {}  # avoid sharing a mutable default argument between calls
        f = FileLoader.get_cached_reader(fname)
        var = [f.read(name) for name in varnames]

        vis = visualizers[plottype]()
        vis.loadVariable(var, opts)
        vis.render(opts)
        k = plottype + '_' + fname + '_' + '_'.join(varnames)
        self._active[k] = vis

        view = vis.getView()
        view.Render()
        return self.getGlobalId(view)
    def test_experiment_svr_37dataset_r2_test(self):
        file_path = "../Datasets/HIV_37_Samples/MergedDataset.csv"
        loaded_data = FileLoader.load_file(file_path)
        the_data_manager = DataManager()
        the_data_manager.set_data(loaded_data)
        the_data_manager.split_data(test_split=0.19, train_split=0.62)
        the_model = svm.SVR()
        exp = Experiment(the_data_manager, the_model)

        exp.run_experiment()

        r2_test = exp.get_r2(SplitTypes.Test)
        expected_svm_r2_value = -0.33005242525900247
        self.assertEqual(r2_test, expected_svm_r2_value)
    def test_experiment_sum_of_squares_real37_test(self):
        file_path = "../Datasets/HIV_37_Samples/MergedDataset.csv"
        loaded_data = FileLoader.load_file(file_path)
        the_data_manager = DataManager()
        the_data_manager.set_data(loaded_data)
        the_model = svm.SVR()
        the_data_manager.split_data(test_split=0.19, train_split=0.62)
        exp = Experiment(the_data_manager, the_model)

        exp.run_experiment()
        sum_of_squares_test = exp.get_sum_of_squares(SplitTypes.Test)

        expected = 6.708898437500002

        self.assertAlmostEqual(expected, sum_of_squares_test)
Example #14
    def setUp(self):
        unittest.TestCase.setUp(self)
        self.project = FileLoader.read_from_json('houseproject.json')
Example #15
    def test_read_from_json(self):
        self.project = FileLoader.read_from_json(self.json_file)
        self.assertTrue(type(self.project) == Project)  # should return a Project
        self.assertEqual(len(self.project.task_list), 22)  # This file has 22 tasks
        self.assertEqual(len(self.project.engr_list), 6)   # and 6 engineers
Example #16
from FileLoader import FileLoader

fl = FileLoader()
data = fl.load("../athlete_events.csv")
fl.display(data, 10)
fl.display(data, -10)
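Most of the snippets below call a FileLoader with load and display methods; a minimal, hypothetical sketch of that interface, inferred from how it is used (pandas-based, not the exercises' actual implementation):

import pandas as pd

class FileLoader:
    def load(self, path, sep=','):
        # Hypothetical: read the CSV and report its dimensions.
        df = pd.read_csv(path, sep=sep)
        print("Loading dataset of dimensions {} x {}".format(df.shape[0], df.shape[1]))
        return df

    def display(self, df, n):
        # Show the first n rows when n >= 0, otherwise the last |n| rows.
        print(df.head(n) if n >= 0 else df.tail(-n))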
Example #17
import pandas as pd
from FileLoader import FileLoader

def youngestFellah(df, year):
    new_dic = {}
    test = df.loc[df['Year'] == year]
    new_dic['f'] = test.loc[test['Sex'] == 'F', 'Age'].min()
    new_dic['m'] = test.loc[test['Sex'] == 'M', 'Age'].min()
    print(new_dic)

if __name__ == "__main__":
    path = "./athlete_events.csv"
    fl = FileLoader()
    df = fl.load(path)
    youngestFellah(df, 2004) 
Example #18
import pandas as pd
from FileLoader import FileLoader


def youngestFellah(data, year):
    dic = {'f': None, 'm': None}
    if (isinstance(data, pd.DataFrame) and isinstance(year, int)):
        crop = data[data["Year"] == year]
        crop_m = crop[crop["Sex"] == 'M']
        crop_f = crop[crop["Sex"] == 'F']
        dic['f'] = crop_f.Age.min()
        dic['m'] = crop_m.Age.min()
        return (dic)
    else:
        print("ERROR: year is not an int or data is not good")


fl = FileLoader()
df = fl.load("athlete_events.csv")
dic = youngestFellah(df, 1992)
print(dic)
Example #19
import pandas
from FileLoader import FileLoader


def youngestFellah(df, year):
    #print(df[['Year', 'Age', 'Sex']])
    ages = {}
    youngest_male = df.loc[(df['Year'] == year)
                           & (df['Sex'] == "M")].sort_values(by='Age').head(1)
    youngest_female = df.loc[(df['Year'] == year)
                             & (df['Sex'] == "F")].sort_values(by='Age').head(1)
    ages['M'] = youngest_male.iloc[0, 3]    # column 3 of athlete_events.csv is 'Age'
    ages['F'] = youngest_female.iloc[0, 3]
    return ages


file = FileLoader()
df = file.load("athlete_events.csv", ',')
year_1992 = youngestFellah(df, 1992)
print(year_1992)
Example #20
from FileLoader import FileLoader


def howManyMedals(df, name):
    medals = {"Gold": "G", "Silver": "S", "Bronze": "B"}
    how_many_medals = {}
    athlete_records = df.loc[df["Name"] == name]

    for index, row in athlete_records.iterrows():
        if row["Year"] not in how_many_medals:
            how_many_medals[row["Year"]] = {"G": 0, "S": 0, "B": 0}
        if row["Medal"] in medals:
            how_many_medals[row["Year"]][medals[row["Medal"]]] += 1

    return how_many_medals


loader = FileLoader()
data = loader.load("athlete_events.csv")

print(howManyMedals(data, "Kjetil Andr Aamodt"))
Example #21
from FileLoader import FileLoader
from itertools import combinations
fl = FileLoader()

transactions = fl.loadTransactions()


def validateInput():
    while True:

        try:
            userInput = int(input("Please enter a number: "))
            if userInput >= 0 and userInput < 15:
                return userInput
            print("Please enter a number between 0 and 14.")
            continue
        except ValueError:
            print("Please enter a number.\n")
            continue


def calculateSubset(size, position, transactionNumber):
    calculatedSubset = []
    transaction = transactions[transactionNumber].getTransaction()

    for value in range(size):
        calculatedSubset.append(transaction[position + value])

    return calculatedSubset
Example #22
def main():
    loader = FileLoader()
    data = loader.load('./day04/athlete_events.csv')
    print(proportionBySport(data, 2004, 'Tennis', 'F'))
Example #23
#!/usr/bin/env python3
from FileLoader import FileLoader

loader = FileLoader()
data = loader.load('../resources/athlete_events.csv')
loader.display(data, 12)
Example #24
from FileLoader import FileLoader

loader = FileLoader()
df = loader.load("athlete_events.csv")
loader.display(df, -2)
Example #25
from FileLoader import FileLoader


def __proportionBySport__(dataFrame, year, sport, gender):
    y_df = dataFrame[dataFrame['Year'] == year]
    g_df = y_df[y_df['Sex'] == gender]
    non_dup_tot = g_df.drop_duplicates('Name')
    nb_tot = len(non_dup_tot.index)
    sport_g_df = g_df[g_df['Sport'] == sport]
    non_dup_sport = sport_g_df.drop_duplicates('Name')
    nb_sport = len(non_dup_sport.index)
    print(non_dup_sport.head(20))
    print(nb_tot, nb_sport)
    return nb_sport * 100 / nb_tot


if __name__ == '__main__':
    ld = FileLoader()
    dt = ld.load('../ex00/athlete_events.csv')
    print(__proportionBySport__(dt, 2004, 'Tennis', 'F'))
Example #26
from FileLoader import FileLoader

fl = FileLoader()
df = fl.load("../resources/athlete_events.csv")
fl.display(df, -5)
fl.display(df, 5)
Example #28
from FileLoader import FileLoader
from HowManyMedalsByCountry import howManyMedalsByCountry
import matplotlib.pyplot as plt
import seaborn

class MyPlotLib():
    def histogram(self, df, features):
        df.hist(features)
        plt.show()

    def density(self, df, features):
        df[features].plot.kde()
        plt.show()

    def pair_plot(self, data, features):
        seaborn.pairplot(data[features])
        plt.show()

    def box_plot(self, data, features):
        data.boxplot(features)
        plt.show()

f = FileLoader()
df = f.load('../athlete_events.csv')

mpl = MyPlotLib()
#h = mpl.histogram(df, ['Year', 'ID','Height'])
#i = mpl.density(df, ['Weight','Height'])
#j = mpl.pair_plot(df, ['Weight', 'Height'])
#k = mpl.box_plot(df, ['Weight','Height'])
Example #29
from FileLoader import FileLoader

file = FileLoader()

df = file.load('../resources/athlete_events.csv')
print('5 first rows\n')
file.display(df, 5)
print('\n\n5 last rows')
file.display(df, -5)
Example #30
# **************************************************************************** #
#                                                                              #
#                                                         :::      ::::::::    #
#    test.py                                            :+:      :+:    :+:    #
#                                                     +:+ +:+         +:+      #
#    By: lboukrou <*****@*****.**>          +#+  +:+       +#+         #
#                                                 +#+#+#+#+#+   +#+            #
#    Created: 2020/06/27 20:36:59 by lboukrou          #+#    #+#              #
#    Updated: 2020/06/28 19:25:49 by lboukrou         ###   ########.fr        #
#                                                                              #
# **************************************************************************** #

from HowManyMedals import HowManyMedals
from FileLoader import FileLoader
import pandas as pd

loader = FileLoader()
df = loader.load('../resources/athlete_events.csv')
name = 'Kjetil Andr Aamodt'
data_years = df[df.Name == name]
print(data_years)
data_years = data_years.dropna()
dic = {}
dic = data_years.to_dict()
print(dic)
gold, silver, bronze = 0, 0, 0
def main():
    loader = FileLoader()
    data = loader.load("../resources/athlete_events.csv")
    std = howManyMedalsByCountry(data, 'London')
    print(std)
Example #32
from FileLoader import FileLoader
from DataManager import DataManager
from src.Population import Population

file_path = "../Dataset/00-91-Drugs-All-In-One-File.csv"
loaded_data = FileLoader.load_file(file_path)

data_manager = DataManager(normalizer=None)
data_manager.create_first_population(loaded_data)
data_manager.split_data_into_train_valid_test_sets(test_split=0.15,
                                                   train_split=0.70)

population = Population()
population.create_first_population()
for i in range(1, 50):
    print("row", i, population.population_matrix[i].sum())
Example #33
#!/usr/bin/env python

import importlib.util
import sys
sys.path.append('../ex00')
from FileLoader import FileLoader


def proportionBySport(df, year, sport, sex):
    p = df[(df.Year == year) & (df.Sex == sex)] \
        .drop_duplicates(subset=['Name', 'ID']).shape[0]
    p_s = df[(df.Year == year) & (df.Sex == sex) & (df.Sport == sport)] \
        .drop_duplicates(subset=['Name', 'ID']).shape[0]
    return p_s / p


if __name__ == '__main__':
    ld = FileLoader()
    df = ld.load('../resources/athlete_events.csv')
    ld.display(df, 3)

    print(proportionBySport(df, 2004, 'Tennis', 'F'))
Example #34
from FileLoader import FileLoader
loader = FileLoader()

path2 = r"C:\Users\Gabriel\Desktop\Mes documents - Google Drive\DATA\19\day04\athlete_events.csv"
data = loader.load(path2)

from SpatioTemporalData import SpatioTemporalData
sp = SpatioTemporalData(data)
sp.where(1896)

sp.where(2016)

sp.when('Athina')

sp.when('Paris')
Example #35
from FileLoader import FileLoader


class SpatioTemporalData:
    def __init__(self, df):
        self.df = df

    def when(self, location):
        years = []
        for i in range(len(self.df)):
            if self.df.loc[i, "City"] == location:
                year = self.df.loc[i, "Year"]
                if year not in years:
                    years.append(year)
        return years

    def where(self, year):
        for i in range(len(self.df)):
            if self.df.loc[i, "Year"] == year:
                return self.df.loc[i, "City"]
        return None


fl = FileLoader()
std = SpatioTemporalData(fl.load("../athlete_events.csv"))
print(std.where(1896))
print(std.where(2016))
print(std.when("Athina"))
print(std.when("Paris"))
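The when/where loops above scan the frame row by row; a vectorized sketch of the same queries (assuming the usual City/Year columns of athlete_events.csv):

import pandas as pd

def when_vectorized(df: pd.DataFrame, location: str) -> list:
    # Unique years recorded for the given host city, in order of appearance.
    return list(df.loc[df["City"] == location, "Year"].unique())

def where_vectorized(df: pd.DataFrame, year: int):
    # First city recorded for the given year, or None if the year is absent.
    cities = df.loc[df["Year"] == year, "City"]
    return cities.iloc[0] if not cities.empty else None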
import numpy as np
import pandas as pd
import sys
sys.path.append('../ex00')
from FileLoader import FileLoader


def howManyMedalsByCountry(df, country):
    df['Medal'].replace('', np.nan, inplace=True)
    data = df.loc[df.Team == country, ['Year', 'Medal', 'Team', 'Event']]
    data.dropna(subset=['Medal'], inplace=True)
    data = data.drop_duplicates()
    res = {}
    for index, row in data.iterrows():
        if row['Year'] not in res.keys():
            res[row['Year']] = {'G': 0, 'S': 0, 'B': 0}
        if row['Medal'] == 'Gold':
            res[row['Year']]['G'] += 1
        elif row['Medal'] == 'Silver':
            res[row['Year']]['S'] += 1
        elif row['Medal'] == 'Bronze':
            res[row['Year']]['B'] += 1
    return res


if __name__ == '__main__':
    fl = FileLoader()
    data = fl.load('../resources/athlete_events.csv')
    print(howManyMedalsByCountry(data, 'France'))
Example #37
from FileLoader import FileLoader
import pandas as pd


def howManyMedalsByCountry(dataFrame, name):
    name_df = dataFrame[dataFrame['Country'] == name]
    won_md = name_df[pd.notnull(name_df['Medal'])]
    year_arr = won_md.loc[:, 'Year'].drop_duplicates()
    dico2 = {}
    for year in year_arr:
        dico2[year] = {}
        b_y = won_md[won_md['Year'] == year]
        print(b_y)
        gold_nb = len(b_y[b_y['Medal'] == 'Gold'])
        silv_nb = len(b_y[b_y['Medal'] == 'Silver'])
        bro_nb = len(b_y[b_y['Medal'] == 'Bronze'])
        dico2[year]['Gold'] = gold_nb
        dico2[year]['Silver'] = silv_nb
        dico2[year]['Bronze'] = bro_nb
    return dico2


if __name__ == '__main__':
    fl = FileLoader()
    df = fl.load('../ex00/athlete_events.csv')
    howManyMedalsByCountry(df, 'France')
Example #38
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
from FileLoader import FileLoader


class MyPlotLib:
    @staticmethod
    def histogram(data, features):
        data[features].hist()
        plt.show()

    @staticmethod
    def density(data, features):
        sns.distplot(data[features])
        plt.show()

    @staticmethod
    def pair_plot(data, features):
        pass

    @staticmethod
    def box_plot(data, features):
        pass


if __name__ == "__main__":
    df = FileLoader.load("../data/athlete_events.csv")
    MyPlotLib.histogram(df.drop_duplicates(['ID']), ['Height', 'Weight'])
    MyPlotLib.density(df.drop_duplicates(['ID']), ['Height', 'Weight'])
Example #39
import matplotlib.pyplot as plt
import seaborn as sns
from FileLoader import FileLoader


class Komparator:
    @staticmethod
    def compare_box_plots(categorical_var, numerical_var):
        sns.boxplot(x=categorical_var, y=numerical_var, palette="Set2")

        plt.show()

    @staticmethod
    def density(categorical_var, numerical_var):
        """displays the density of the numerical variable. Each
subpopulation should be represented by a separate curve on the graph."""
        #print(categorical_var)
        cat_list = categorical_var.astype('category').cat.categories
        for cat in cat_list:
            sns.kdeplot(numerical_var[categorical_var == cat],
                        label=cat)

        plt.show()

    @staticmethod
    def compare_histograms(categorical_var, numerical_var):
        """plots the numerical variable in a s"""
        pass


if __name__ == "__main__":
    loader = FileLoader()
    df = loader.load("../assets/athlete_events.csv").dropna()
    h = df[df.Sex == "M"]
    f = df[df.Sex == "F"]
    k = Komparator()
    k.compare_box_plots(df.Sex, df.Weight)
    k.density(df.Sex, df.Weight)
    k.compare_histograms(df.Sex.head(), df.Weight.head())
import pandas as pd
from matplotlib import pyplot as plt
from FileLoader import FileLoader


class Komparator:
    def __init__(self, df):
        self.data = df
        pass

    def compare_box_plots(self, categorical_var, numerical_var):
        self.data[categorical_var].plot.box(column=numerical_var)
        plt.show()

    def density(self, categorical_var, numerical_var):
        pass

    def compare_histograms(self, categorical_var, numerical_var):
        pass


# to end

f = FileLoader()
r = f.load("athlete_events.csv")
k = Komparator(r)
k.compare_box_plots(["Team", "Year"], [1, 2, 3])
Example #41
from FileLoader import FileLoader
import pandas as pd


def proportionBySport(df: pd.DataFrame, year: int, sport: str,
                      gender: str) -> float:
    all_sport_df = df[(df['Sex'] == gender) & (df['Year'] == year)]\
           .drop_duplicates(subset='Name', keep='first')
    sport_df = df[(df['Sex'] == gender) & (df['Year'] == year) & (df['Sport'] == sport)]\
           .drop_duplicates(subset='Name', keep='first')
    return sport_df['Sport'].count() / all_sport_df['Sport'].count()


loader = FileLoader()
data = loader.load("../resources/athlete_events.csv")
print(proportionBySport(data, 2004, 'Tennis', 'F'))
Example #42
from FileLoader import FileLoader
from YoungestFellah import YoungestFellah as yf

fl = FileLoader()
data = fl.load("../athlete_events.csv")
print(yf.youngestFellah(data, 2004))
from FileLoader import FileLoader
import pandas as pd

class SpatioTemporalData:
    def __init__(self, data):
        self.df = data
    
    def when(self, location):
        places = []
        years = self.df[self.df.City == location]
        for index, row in years.iterrows():
            if row['Year'] not in places:
                places.append(row['Year'])
        return places
    
    def where(self, year):
        year = int(year)
        city = self.df[self.df.Year == year]
        return city['City'].iloc[0]
            
loader = FileLoader()
data = loader.load('../athlete_events.csv')
sp = SpatioTemporalData(data)
print(sp.when('Atlanta'))
print(sp.where('2016'))
Example #44
import sys
import numpy as np
from FileLoader import FileLoader


def accuracy_score_(y_true, y_pred, normalize=True):
    """Computes the accuracy score.
    Args:
        y_true: a scalar or a numpy ndarray for the correct labels
        y_pred: a scalar or a numpy ndarray for the predicted labels
        normalize: if False, return the number of correct predictions instead of the ratio
    Returns:
        The accuracy score as a float.
        None on any error.
    Raises:
        This function should not raise any Exception.
    """
    n = 0
    for pred, true in zip(y_pred, y_true):
        if pred == true:
            n += 1
    if normalize == False:
        return n
    if y_pred.shape[0] > 0:
        return n / y_pred.shape[0]
    return None


if __name__ == "__main__":
    if len(sys.argv) == 3:
        loader = FileLoader()
        data1 = loader.load(str(sys.argv[1]))
        data2 = loader.load(str(sys.argv[2]))
        y_true = np.array(data1['Hogwarts House'])
        y_pred = np.array(data2['Hogwarts House'])
        print("score : ", accuracy_score_(y_true, y_pred))
    else:
        print("Usage : python accuracy_score.py path.csv path.csv")
Example #45
from FileLoader import FileLoader
from YoungestFellah import youngestFellah

fl = FileLoader()
df = fl.load("../resources/athlete_events.csv")
print(youngestFellah(df, 2004))
Example #46
import pandas as pd
import matplotlib.pyplot as plt


class MyPlotLib:
    @staticmethod
    def histogram(data: pd.DataFrame, features: list):
        data.hist(column=features)
        plt.show()

    @staticmethod
    def density(data: pd.DataFrame, features: list):
        ...

    @staticmethod
    def pair_plot(data: pd.DataFrame, features: list):
        ...

    @staticmethod
    def box_plot(data: pd.DataFrame, features: list):
        ...


if __name__ == '__main__':
    from FileLoader import FileLoader
    loader = FileLoader()
    data_csv = loader.load('../resources/athlete_events.csv')
    MyPlotLib.histogram(data_csv, ['Height', 'Weight'])
Example #47
from FileLoader import FileLoader
from DataManager import DataManager
from src.Population import Population

file_path = "../Dataset/00-91-Drugs-All-In-One-File.csv"
loaded_data = FileLoader.load_file(file_path)

data_manager = DataManager(normalizer=None)
data_manager.create_first_population(loaded_data)
data_manager.split_data_into_train_valid_test_sets(test_split=0.15, train_split=0.70)

population = Population()
population.create_first_population()
for i in range(1, 50):
    print("row", i, population.population_matrix[i].sum())

the_models = [
    linear_model.BayesianRidge()
    # ,svm.SVR()
    # ,RandomForestRegressor()
    # , LinearRegression()
]

normalizers = [
    None
    # , StandardScaler()
    # ,NumpyNormalizer(), ScikitNormalizer()
    # , MinMaxScaler() ,Binarizer()
    # ,Imputer(), KernelCenterer()
    # ,Normalizer()
]

output_filename = FileLoader.create_output_file()

for normalizer in normalizers:
    for feature_eliminator in feature_eliminators:
        # for k_value in range(5, 20):
        # for k_value in range(13, 14):

        for the_model in the_models:
            print(
                "taking care of ",
                type(normalizer).__name__,
                "and feature selector ",
                type(feature_eliminator).__name__,
                "model",
                type(the_model).__name__,
            )