def transform(self, X_df):
    # X_df_new = X_df[0].copy()
    # data_new = X_df[1].copy()
    X_df_new = X_df.copy()
    train, _ = get_train_data()
    test, _ = get_test_data()
    data_new = pd.concat([train, test])
    # Replace missing values (NaN) with the sentinel '-1'
    X_df_new = X_df_new.fillna('-1')
    data_new = data_new.fillna('-1')
    # Single-valued categorical features
    one_hot_feature = ['LBS', 'age', 'carrier', 'consumptionAbility',
                       'education', 'gender', 'house', 'os', 'ct',
                       'marriageStatus', 'advertiserId', 'campaignId',
                       'creativeId', 'adCategoryId', 'productId',
                       'productType']
    # Multi-valued (vector) features
    vector_feature = ['appIdAction', 'appIdInstall', 'interest1', 'interest2',
                      'interest3', 'interest4', 'interest5', 'kw1', 'kw2',
                      'kw3', 'topic1', 'topic2', 'topic3']
    # Integer-encode the categorical features, fitting on train + test
    X_df_new = labelEncoder(data_new, X_df_new, one_hot_feature)
    data_new = labelEncoder(data_new, data_new, one_hot_feature)
    # One-hot encode the categorical features, then append the vectorized
    # multi-valued features as extra sparse columns
    X_sparse = OneHot(data_new, X_df_new, one_hot_feature)
    X_sparse = Vectorize(data_new, X_df_new, vector_feature, X_sparse)
    return X_sparse.tocsr()
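# The labelEncoder / OneHot / Vectorize helpers used above are defined
# elsewhere in the submission; a minimal sketch of what they could look like,
# assuming labelEncoder fits one sklearn LabelEncoder per column on the full
# data and transforms the target frame (names and behavior are assumptions,
# not the actual helpers):
from scipy import sparse
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer


def labelEncoder(fit_df, transform_df, features):
    for feat in features:
        enc = LabelEncoder().fit(fit_df[feat].astype(str))
        transform_df[feat] = enc.transform(transform_df[feat].astype(str))
    return transform_df


def OneHot(fit_df, transform_df, features):
    blocks = []
    for feat in features:
        enc = OneHotEncoder(handle_unknown='ignore').fit(fit_df[[feat]])
        blocks.append(enc.transform(transform_df[[feat]]))
    return sparse.hstack(blocks)


def Vectorize(fit_df, transform_df, features, X_sparse):
    # Each vector feature is a space-separated token list; count-vectorize it
    for feat in features:
        vec = CountVectorizer().fit(fit_df[feat].astype(str))
        X_sparse = sparse.hstack(
            [X_sparse, vec.transform(transform_df[feat].astype(str))])
    return X_sparse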
def load_train_test_prediction(submission_name):
    """Load the true and predicted labels for a given submission.

    Parameters
    ----------
    submission_name : str
        The name of the submission (e.g. 'abethe_anatomy').

    Returns
    -------
    y_true_train : ndarray, shape (n_train_samples,)
        The true labels on the training set.
    y_pred_train : ndarray, shape (n_train_samples,)
        The predicted labels on the training set.
    y_true_test : ndarray, shape (n_test_samples,)
        The true labels on the testing set.
    y_pred_test : ndarray, shape (n_test_samples,)
        The predicted labels on the testing set.
    """
    path_store_pred = os.path.join('../submissions', submission_name,
                                   'training_output')
    y_pred_train = np.load(os.path.join(path_store_pred, 'y_pred_train.npy'))
    y_pred_test = np.load(os.path.join(path_store_pred, 'y_pred_test.npy'))
    _, y_true_train = get_train_data('..')
    _, y_true_test = get_test_data('..')
    return (y_true_train, y_pred_train, y_true_test, y_pred_test)
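# Example usage: score one submission with scikit-learn's roc_auc_score,
# assuming the stored predictions are 1-D scores (if they are saved as
# (n_samples, 2) probabilities, take the positive-class column first).
from sklearn.metrics import roc_auc_score

y_true_train, y_pred_train, y_true_test, y_pred_test = \
    load_train_test_prediction('abethe_anatomy')
print('train ROC-AUC:', roc_auc_score(y_true_train, y_pred_train))
print('test ROC-AUC:', roc_auc_score(y_true_test, y_pred_test))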
def _get_data_rdb_out():
    """Split the data to provide the true label and data with only RDB as test.

    Returns
    -------
    X_train : ndarray, shape (n_train_samples,)
        The training data without RDB.
    X_test : ndarray, shape (n_test_samples,)
        The testing data corresponding to the RDB subjects.
    y_train : ndarray, shape (n_train_samples,)
        The labels of the training set.
    y_test : ndarray, shape (n_test_samples,)
        The labels of the testing set.
    """
    rdb_idx = np.load('rdb_idx.npy')
    X_test, y_test = get_test_data('..')
    X_train, y_train = get_train_data('..')
    # Build a boolean mask flagging the test samples whose index is in rdb_idx
    X_test_idx = X_test.index.values
    X_rdb_idx = [X_test_idx == ii for ii in rdb_idx]
    X_rdb_idx = np.vstack(X_rdb_idx)
    X_rdb_idx = np.sum(X_rdb_idx, axis=0).astype(bool)
    return (pd.concat([X_train, X_test[~X_rdb_idx]], axis=0),
            X_test[X_rdb_idx],
            np.concatenate([y_train, y_test[~X_rdb_idx]]),
            y_test[X_rdb_idx])
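# The vstack/sum mask construction above is equivalent to a single np.isin
# call; a tiny self-contained check of that equivalence:
idx = np.array([10, 11, 12, 13])
wanted = np.array([11, 13])
mask_loop = np.vstack([idx == ii for ii in wanted]).sum(axis=0).astype(bool)
assert (mask_loop == np.isin(idx, wanted)).all()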
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

from problem import get_train_data, get_cv
# NB: the FeatureExtractor import path below is assumed to mirror the
# classifier's location in the submission folder.
from submissions.my_model.feature_extractor import FeatureExtractor
from submissions.my_model.classifier import Classifier


def evaluation(X, y):
    pipe = make_pipeline(FeatureExtractor(), Classifier())
    cv = get_cv(X, y)
    results = cross_validate(pipe, X, y, scoring=['roc_auc', 'accuracy'],
                             cv=cv, verbose=1, return_train_score=True,
                             n_jobs=1)
    return results


data_train, labels_train = get_train_data()
results = evaluation(data_train, labels_train)
print("Training score ROC-AUC: {:.3f} +- {:.3f}".format(
    np.mean(results['train_roc_auc']), np.std(results['train_roc_auc'])))
print("Validation score ROC-AUC: {:.3f} +- {:.3f}\n".format(
    np.mean(results['test_roc_auc']), np.std(results['test_roc_auc'])))
print("Training score accuracy: {:.3f} +- {:.3f}".format(
    np.mean(results['train_accuracy']), np.std(results['train_accuracy'])))
print("Validation score accuracy: {:.3f} +- {:.3f}".format(
    np.mean(results['test_accuracy']), np.std(results['test_accuracy'])))
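# Optional: inspect the per-fold scores behind the mean +- std summary
# (a small sketch; pandas is an extra dependency here).
import pandas as pd

fold_scores = pd.DataFrame({k: v for k, v in results.items()
                            if k.startswith(('train_', 'test_'))})
print(fold_scores)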
import problem
import torch
from torch.nn.functional import relu
from torch.nn.utils.rnn import pad_sequence

X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()


class Regressor():
    """A PyTorch MLP model consisting of an MLP for each module type.

    The model is learnt only on single modules. It takes as input the input
    power and the metadata of the corresponding cascade. To predict the
    output power, the model simply cascades the different MLPs matching the
    input module cascade.
    """

    def __init__(self):
        super().__init__()
        # Since the model needs metadata present in the data,
        # we only instantiate it when calling the fit function.
        self.Model = PyTorchModel  # PyTorch model class
        self.model = None          # PyTorch model instance
        self.mod_id = None         # Module IDs

    def fit(self, X, y):
        # Retrieve some information about the modules from the data
        all_mods = set([(("type", mod[0]), ("nb_feat", len(mod[1])))
                        for seq, _, _ in X for mod in seq])
        mod_info = [dict(m) for m in all_mods]
        self.mod_id = {mod["type"]: i for i, mod in enumerate(mod_info)}
        # Instantiate the PyTorch model
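# PyTorchModel is referenced above but not shown; a minimal sketch of what a
# per-module-type MLP could look like (layer sizes and the forward signature
# are assumptions, not the actual model):
class PyTorchModel(torch.nn.Module):
    def __init__(self, n_modules, n_feat, hidden=64):
        super().__init__()
        # One small MLP per module type; input = power + module metadata
        self.mlps = torch.nn.ModuleList([
            torch.nn.Sequential(
                torch.nn.Linear(1 + n_feat, hidden),
                torch.nn.ReLU(),
                torch.nn.Linear(hidden, 1),
            )
            for _ in range(n_modules)
        ])

    def forward(self, power, mod_ids, mod_feats):
        # Cascade the MLPs matching the module sequence
        for mod_id, feat in zip(mod_ids, mod_feats):
            power = self.mlps[mod_id](torch.cat([power, feat], dim=-1))
        return power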
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.colors as colors
from nilearn import datasets, input_data, plotting, image
from sklearn.svm import SVC
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import roc_auc_score

from problem import get_train_data, get_test_data

X_train, y_train = get_train_data("..")
X_test, y_test = get_test_data("..")

all_submissions = [
    "abethe_functional_blast",
    "amicie_functional_blast",
    "ayoub.ghriss_functional_blast",
    "mk_functional_blast",
    "nguigui_functional_blast",
    "pearrr_functional_blast",
    "Slasnista_functional_blast",
    "vzantedeschi_functional_blast",
    "wwwwmmmm_functional_blast",
]

results = {"0%": [], "25%": [], "50%": [], "75%": []}
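# A hedged sketch of how results could be filled: score each submission's
# stored test predictions with load_train_test_prediction (defined earlier).
# The "0%"/"25%"/"50%"/"75%" keys presumably index a degradation level applied
# elsewhere; here every level receives the plain score as a placeholder.
for submission in all_submissions:
    _, _, y_true_test, y_pred_test = load_train_test_prediction(submission)
    auc = roc_auc_score(y_true_test, y_pred_test)
    for level in results:
        results[level].append(auc)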
    X.loc[:, 'day'] = X['DateOfDeparture'].dt.day
    X.loc[:, 'weekday'] = X['DateOfDeparture'].dt.weekday
    # NB: Series.dt.week is deprecated in recent pandas;
    # X['DateOfDeparture'].dt.isocalendar().week is the replacement.
    X.loc[:, 'week'] = X['DateOfDeparture'].dt.week
    X.loc[:, 'n_days'] = X['DateOfDeparture'].apply(
        lambda date: (date - pd.to_datetime("1970-01-01")).days)
    # Finally we can drop the original column from the dataframe
    return X.drop(columns=["DateOfDeparture"])


def _todense(X):
    X = X.copy()
    return X.todense()


# Loading training data
X, y = problem.get_train_data()

# Preprocessing training data
data_merger = FunctionTransformer(_merge_external_data)
date_encoder = FunctionTransformer(_encode_dates)
dense_matrix = FunctionTransformer(_todense)

categorical_encoder_ohe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"))
categorical_cols_ohe = [
    "Arrival", "Departure", "day", "weekday", "holidays", "week", "n_days"
]
categorical_encoder_oe = OrdinalEncoder()
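# A hedged sketch of how these pieces could be assembled into a full model
# (the ordinal column list and the regressor are illustrative assumptions):
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor

categorical_cols_oe = ["Arrival", "Departure"]  # hypothetical choice
preprocessor = make_column_transformer(
    (categorical_encoder_ohe, categorical_cols_ohe),
    (categorical_encoder_oe, categorical_cols_oe),
    remainder="passthrough",
)
pipeline = make_pipeline(data_merger, date_encoder, preprocessor,
                         RandomForestRegressor(n_estimators=50))
pipeline.fit(X, y)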
import numpy as np
import scipy.ndimage as nd
import matplotlib.pylab as plt
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from keras.layers import (Input, MaxPooling3D, UpSampling3D, Conv3D, Reshape,
                          Conv3DTranspose)

import problem
import submissions.starting_kit.keras_segmentation_classifier as classifier

module_path = '.'
train_ids = problem.get_train_data()
print(train_ids)

# spl = problem.ImageLoader([1, 2, 3, 4])
simp = problem.SimplifiedSegmentationClassifier()
clf = simp.train_submission(module_path=module_path, patient_ids=train_ids)

# n_classes = [0, 1]
# img_loader = problem.ImageLoader(patient_ids=train_ids, n_classes=n_classes)
# clf.fit(img_loader)

# test_ids = problem.get_test_data()
# score = simp.test_submission(module_path=module_path, trained_model=clf,
#                              patient_idxs=test_ids)
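# The keras layers imported above suggest a small 3D encoder/decoder; a
# minimal sketch, assuming single-channel 64^3 patches and binary masks
# (shapes and filter counts are illustrative, not the starting kit's model):
from keras.models import Model

inputs = Input(shape=(64, 64, 64, 1))
x = Conv3D(8, 3, activation='relu', padding='same')(inputs)
x = MaxPooling3D(pool_size=2)(x)   # downsample 64 -> 32
x = Conv3D(16, 3, activation='relu', padding='same')(x)
x = UpSampling3D(size=2)(x)        # upsample 32 -> 64
outputs = Conv3D(1, 1, activation='sigmoid', padding='same')(x)
model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='binary_crossentropy')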
import os
import importlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.preprocessing import (OrdinalEncoder, OneHotEncoder,
                                   StandardScaler, FunctionTransformer)
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (cross_val_score, RandomizedSearchCV,
                                     GridSearchCV, train_test_split)
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.svm import SVR

import problem


def _merge_external_data(X):
    # filepath = os.path.join(
    #     os.path.dirname(__file__), 'external_data.csv'
    # )
    # Make sure that DateOfDeparture is of dtype datetime
    X = X.copy()  # modify a copy of X
    X.loc[:, "DateOfDeparture"] = pd.to_datetime(X['DateOfDeparture'])
    # Parse the weather dates so they are also of dtype datetime
    data_weather = pd.read_csv("external_data.csv",
                               parse_dates=["DateOfDeparture"])
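    # The function is truncated here; a hedged continuation, assuming the
    # usual air-passengers join keys (the column names are assumptions):
    X_merged = pd.merge(X, data_weather, how='left',
                        on=['DateOfDeparture', 'Arrival'], sort=False)
    return X_merged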
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 26 15:27:47 2020

@author: dorian
"""
import os
import importlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
from sklearn.preprocessing import (OrdinalEncoder, OneHotEncoder,
                                   FunctionTransformer)
from sklearn.compose import make_column_transformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import (cross_val_score, RandomizedSearchCV,
                                     train_test_split)
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance

import problem


def _merge_external_data(X):
    filepath = os.path.join(
        os.path.dirname(__file__), 'external_data.csv'
    )
    # Make sure that DateOfDeparture is of dtype datetime
    X = X.copy()  # modify a copy of X
    X.loc[:, "DateOfDeparture"] = pd.to_datetime(X['DateOfDeparture'])
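# shap is imported above but unused in this excerpt; a minimal sketch of the
# intended kind of analysis, assuming a fitted tree model on an encoded
# feature frame (`X_enc` and `y` are hypothetical names):
reg = ExtraTreesRegressor(n_estimators=100).fit(X_enc, y)
explainer = shap.TreeExplainer(reg)
shap_values = explainer.shap_values(X_enc)
shap.summary_plot(shap_values, X_enc)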
""" Simple pipeline to test the naive submission. """ import numpy as np import imp from sklearn.pipeline import Pipeline import problem # Import corpus ------------------------------------------------------------------------ X_train_full, y_train_full = problem.get_train_data(sep="|") cv_list = list(problem.get_cv(X_train_full, y_train_full)) fold = np.random.randint(0, 6) print("fold: " + str(fold)) X_train = X_train_full.iloc[cv_list[fold][0], :] X_test = X_train_full.iloc[cv_list[fold][1], :] y_train = y_train_full[cv_list[fold][0]] y_test = y_train_full[cv_list[fold][1]] # Test submission on 1 fold ------------------------------------------------------------ scorer = problem.F1Score() submission_name = "starting_kit"
import sys
from math import floor

# Object detector
from object_detector_Unet import *
# from object_detector_SSD import *

# Ramp
sys.path.append('../')
import problem

# =============================================================================
# ================================ 1. DATA
print("\nLoading data", end='...')
data_path = '../'
Xtrain, Ytrain = problem.get_train_data(data_path)

# Keep only the first half of the training set
SIZE = floor(Xtrain.shape[0] / 2.)
Xtrain = Xtrain[:SIZE]
Ytrain = Ytrain[:SIZE]
print('done')
print('>>', Xtrain.shape, Ytrain.shape, '\n')

# =============================================================================
# ================================ 2. TRAINING
object_detector = ObjectDetector()
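# A hedged sketch of the training step that presumably follows (the fit and
# predict signatures assume the RAMP object-detection workflow interface):
object_detector.fit(Xtrain, Ytrain)
y_pred = object_detector.predict(Xtrain[:10])
print(y_pred[:2])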