def test():
    """Load the dataset, split it, and benchmark several classifiers against model.py."""
    banner = "============================================================="
    rule = "_____________________________________________________________"
    blank = " "

    # Dataset loading and display
    print(banner)
    print(banner)
    print(" Test de model.py ")
    print(" et comparaison avec d'autres modèles ")
    print(blank)
    print(blank)
    print(" Chargement des données..")
    print(blank)
    data = read_as_df(data_dir + '/' + data_name)
    print(data['target'].value_counts())
    print(blank)
    print(rule)
    print(blank)
    print(data.head())
    print(blank)
    manager = DataManager(data_name, data_dir, replace_missing=True)
    print(rule)
    print(blank)
    print(manager)
    print(blank)
    print(rule)
    print(blank)
    print(blank)

    # Flatten the label column so sklearn estimators accept it
    features = manager.data['X_train']
    labels = manager.data['Y_train'].ravel()

    print(rule)
    print(blank)
    print(" Division des données en deux ensembles (training et validation)")
    X_train, X_valid, Y_train, Y_valid = train_test_split(
        features, labels, test_size=0.33, random_state=42)
    print("Dimensions de Y_train")
    print(Y_train.shape)
    print("Dimensions de Y_valid")
    print(Y_valid.shape)
    print("DONE")
    print(blank)
    print(blank)
    print(rule)
    print(" Comparaison des modèles : ")
    print(blank)
    print(blank)

    # Register the candidate classifiers next to the pipeline under test
    comparer = modelComparer("Pipeline RandomForestClassifier avec Preprocessor()", model())
    comparer.addClassifier("OneVsOneClassifier", OneVsOneClassifier(SGDClassifier(random_state=42)))
    comparer.addClassifier("AdaBoostClassifier", AdaBoostClassifier(n_estimators=100))
    comparer.addClassifier("RandomForestClassifier",
                           RandomForestClassifier(n_estimators=180, max_depth=None, max_features='auto'))
    comparer.addClassifier("KNeighborsClassifier", neighbors.KNeighborsClassifier(n_neighbors=7))

    # Fit every registered model, then report the comparison metrics
    comparer.fitAll(X_train, Y_train)
    comparer.comparingFunction(X_train, Y_train, X_valid, Y_valid, 5)
def __init__(self):
    """Load the crime dataset, expose its columns as attributes, and prepare a scaler.

    Side effects: reads the training data and solution files from disk via
    np.loadtxt / data_io.read_as_df, and calls self.outliers() at the end.
    """
    self.datadir = 'public_data/'
    self.dataname = 'labels'
    # Bug fix: the original computed `datadir + dataname`, which reads
    # (likely undefined) module-level globals instead of the instance
    # attributes assigned on the two lines above.
    self.basename = self.datadir + self.dataname
    reload(data_io)
    self.data = data_io.read_as_df(self.basename)
    # Load each file ONCE and derive the per-column attributes from the same
    # array; the original re-read both files with unpack=True, which is
    # equivalent to transposing the already-loaded array.
    features = np.loadtxt("starting_kit/public_data/crime_train.data")
    self.features = features
    (self.year, self.month, self.day, self.day_num, self.minute, self.hour,
     self.X, self.Y, self.PdDistrict, self.address,
     self.resolution) = features.T
    labels = np.loadtxt("starting_kit/public_data/crime_train.solution")
    self.labels = labels
    (self.drug, self.larceny, self.missing_person, self.prostitution,
     self.vehicule_theft, self.warrants) = labels.T
    # RobustScaler scales with median/IQR, so it tolerates the outliers
    # handled below.
    self.scaler = preprocessing.RobustScaler()
    self.outliers()
def run_visualization(data_dir, data_name):
    '''Show a bunch of exploratory graphs for a dataset.

    Reads the dataset `data_name` from `data_dir`, then displays in turn:
    a heat map of the standardized data, pairwise scatter plots of up to
    five variables colored by target, and the correlation matrix of those
    variables with the target. Blocks on each plt.show().
    '''
    # Read data
    print('Read data')
    data = read_as_df(data_dir + '/' + data_name)
    # Standardize data and change target value to numeric
    print('Standardize data')
    data_num = standardize_df(data)
    # Make a heat map ('Make hear map' typo fixed in the progress message)
    print('Make heat map')
    sns.heatmap(data_num)
    plt.show()
    # Make scatter plots (cap at 5 variables to keep the pair plot readable)
    print('Make scatter plots')
    var_num = min(data.shape[1] - 1, 5)
    chosen_columns = list(data.columns[0:var_num])
    sns.pairplot(data, vars=chosen_columns, diag_kind="hist", hue="target")
    plt.show()
    print('Show correlation matrix')
    # Pearson correlation of the chosen variables against the numeric target
    corr_mat = data_num[chosen_columns + ['target']].corr(method='pearson')
    sns.heatmap(corr_mat, annot=True, center=0)
    plt.show()
# Script-level setup for the crime-data exploration.
from sklearn import datasets
from sklearn import linear_model
import os

codedir = 'sample_code/'
from sys import path
path.append(codedir)  # make the sample code importable
import seaborn as sns
sns.set()
# NOTE(review): scipy.misc.imread was removed in SciPy >= 1.2 — confirm the
# pinned SciPy version, or switch to imageio.imread.
from scipy.misc import imread

datadir = '../public_data/'  # Change this to the directory where you put the input data
dataname = 'crime'
basename = datadir + dataname
#print basename
#reload(data_io)
# NOTE(review): `data_io` is not imported in this chunk — presumably imported
# earlier in the file; verify.
data = data_io.read_as_df(basename)
categories = data.groupby('target')


class dataRepresentation:
    """Visual representation of the crime dataset.

    Attributes: crime coordinates X and Y, crime date (day/month/year),
    resolution (0, 1 or 2), crime location (district number), and crime
    categories.
    """
    """ function that create and display a map with the different location of each crime in terms of the coordinates X and Y and the crime category """
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import models

data_dir = 'input_data'
data_name = 'microscopy'
# NOTE(review): `read_as_df` and `DataManager` are not imported in this
# chunk — presumably imported earlier in the file; verify.
data = read_as_df(data_dir + '/' + data_name)
D = DataManager(data_name, data_dir, replace_missing=True)
# print(D)


def reshape(X, height=40, width=40, channels=3, scale=255.0):
    """Reshape flat sample rows into image tensors scaled to [0, 1].

    Generalized from the original hard-coded (40, 40, 3) / 255.0 version;
    the defaults keep the previous behavior for existing callers.

    Args:
        X: array of shape (num_samples, height * width * channels).
        height, width, channels: target image dimensions.
        scale: divisor applied to map pixel values into [0, 1].

    Returns:
        Float array of shape (num_samples, height, width, channels).
    """
    num = X.shape[0]
    X = X.reshape((num, height, width, channels))
    return X / scale


X_train = reshape(D.data['X_train'])
X_test = reshape(D.data['X_test'])
X_valid = reshape(D.data['X_valid'])
# Suppress sklearn DeprecationWarnings while importing; the warning filters
# are restored when the `with` block exits.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    from sklearn.base import BaseEstimator
    from data_manager import DataManager  # The class provided by binome 1
# Note: if zDataManager is not ready, use the mother class DataManager

# NOTE(review): absolute Windows path, only valid on the author's machine;
# it is also unused below (data_dir is used instead) — confirm and remove.
input_dir = "C:\\Users\\isabe\\Downloads\\monet-master\\starting_kit\\c1_input_data"
data_name = 'perso'
data_dir = '../../public_data'

D = DataManager(data_name, data_dir, replace_missing=True)
#print(D)
print(data_name)

from data_io import read_as_df
data = read_as_df(data_dir + '/' + data_name)  # The perso_data is loaded as a Pandas Data Frame

model_dir = '../sample_code_submission/'  # Change the model to a better one once you have one!
result_dir = '../sample_result_submission/'
problem_dir = '../ingestion_program/'
score_dir = '../scoring_program/'

# Make the ingestion/scoring programs and the model importable.
from sys import path
path.append(model_dir)
path.append(problem_dir)
path.append(score_dir)

# NOTE(review): numpy.core.umath_tests is a private API removed in newer
# NumPy releases — confirm the pinned NumPy version.
from numpy.core.umath_tests import inner1d
from data_io import write
from model import model

M = model()
# NOTE(review): `data_num` is not defined in this chunk — these first lines
# presumably belong to a function or script section starting earlier in the
# file; verify their enclosing scope before moving them.
# Encode the target as numeric category codes so it can be plotted.
data_num['target'] = data_num['target'].astype('category')
data_num['target'] = data_num['target'].cat.codes
b = sns.heatmap(data_num)
plt.show()
plt.close()

if __name__ == "__main__":
    import matplotlib.pyplot as plt  # so that the submission passes on CodaLab
    import seaborn as sns
    sns.set(font_scale=1.4, style="whitegrid")  # set styling preferences
    # Full dataset (features + labels) loaded through the project's data_io.
    data = data_io.read_as_df('C:\\Users\\isabe\\Downloads\\monet-master\\starting_kit\\c1_input_data\\perso')
    url = "C:\\Users\\isabe\\Downloads\\monet-master\\starting_kit\\c1_input_data\\perso_train.data"
    df = pd.read_csv(url, delimiter=' ')  # all the data, without header or labels
    labels = data.iloc[1:, -1]  # label: true or false?
    # NOTE(review): the PCA result `x_reduced` is computed but never used
    # (the reassignment to `data` is commented out) — confirm intent.
    pca = PCA(n_components=200)
    x_reduced = pca.fit_transform(data)
    # print(data)
    x_reduced = pd.DataFrame(x_reduced)
    #print (x_reduced.shape)
    #data = x_reduced
    funcPCA(data)