示例#1
0
    def transform(self, X_df):
        """Encode ``X_df`` and return the result in CSR format.

        The train and test sets are reloaded through ``get_train_data`` and
        ``get_test_data`` and concatenated; that combined frame is the
        reference data passed to ``labelEncoder``, ``OneHot`` and
        ``Vectorize`` alongside the frame being transformed. Missing values
        in both frames are replaced by the string ``'-1'`` beforehand.

        Parameters
        ----------
        X_df : object exposing ``copy()`` and ``fillna()``
            Presumably a pandas DataFrame holding the columns listed
            below -- confirm against the caller. It is not modified.

        Returns
        -------
        The output of ``.tocsr()`` called on the matrix built by ``OneHot``
        and extended by ``Vectorize``.
        """
        # Features holding a single scalar per row.
        one_hot_feature = ['LBS', 'age', 'carrier', 'consumptionAbility',
            'education','gender', 'house', 'os', 'ct', 'marriageStatus',
            'advertiserId', 'campaignId', 'creativeId', 'adCategoryId',
            'productId', 'productType']

        # Vector-valued features.
        vector_feature = ['appIdAction', 'appIdInstall', 'interest1',
            'interest2', 'interest3', 'interest4', 'interest5', 'kw1',
            'kw2', 'kw3', 'topic1', 'topic2', 'topic3']

        frame = X_df.copy()
        train_part, _ = get_train_data()
        test_part, _ = get_test_data()
        reference = pd.concat([train_part, test_part])

        # Replace missing values (NaN) by the placeholder string.
        frame = frame.fillna('-1')
        reference = reference.fillna('-1')

        # Encode the input first: it must see the reference frame before the
        # reference itself is re-encoded (original note: "normalize features").
        frame = labelEncoder(reference, frame, one_hot_feature)
        reference = labelEncoder(reference, reference, one_hot_feature)

        features = OneHot(reference, frame, one_hot_feature)
        features = Vectorize(reference, frame, vector_feature, features)

        return features.tocsr()
示例#2
0
def load_train_test_prediction(submission_name):
    """Fetch the true and the stored predicted labels of one submission.

    Predictions are read from ``y_pred_train.npy`` and ``y_pred_test.npy``
    located in ``../submissions/<submission_name>/training_output``; the
    true labels are the second item returned by ``get_train_data('..')``
    and ``get_test_data('..')``.

    Parameters
    ----------
    submission_name : str
        The name of the submission (e.g. 'abethe_anatomy').

    Returns
    -------
    y_true_train : ndarray, shape (n_train_samples, )
        The true labels on the training set.
    y_pred_train : ndarray, shape (n_train_samples, )
        The predicted labels on the training set.
    y_true_test : ndarray, shape (n_test_samples, )
        The true labels on the testing set.
    y_pred_test : ndarray, shape (n_test_samples, )
        The predicted labels on the testing set.

    """
    output_dir = os.path.join('../submissions', submission_name,
                              'training_output')
    train_file = os.path.join(output_dir, 'y_pred_train.npy')
    test_file = os.path.join(output_dir, 'y_pred_test.npy')

    # Stored predictions first, then the ground truth (features discarded).
    y_pred_train = np.load(train_file)
    y_pred_test = np.load(test_file)

    _features_train, y_true_train = get_train_data('..')
    _features_test, y_true_test = get_test_data('..')

    return y_true_train, y_pred_train, y_true_test, y_pred_test
示例#3
0
def _get_data_rdb_out():
    """Split the data so that only the RDB subjects form the test set.

    The index labels of the RDB subjects are read from ``rdb_idx.npy`` in
    the current working directory. Test samples whose index label appears
    in that file are kept as the new test set; every other test sample is
    appended to the training set.

    Returns
    -------
    X_train : pandas object, n_train_samples rows
        The training data extended with the non-RDB test samples
        (result of ``pd.concat`` along axis 0).
    X_test : pandas object, n_test_samples rows
        The testing data restricted to the RDB subjects.
    y_train : ndarray, shape (n_train_samples, )
        The labels of the training set, in the same order as ``X_train``.
    y_test : ndarray, shape (n_test_samples, )
        The labels of the testing set, in the same order as ``X_test``.

    """
    rdb_idx = np.load('rdb_idx.npy')
    X_test, y_test = get_test_data('..')
    X_train, y_train = get_train_data('..')

    # One vectorised membership test instead of stacking one boolean row per
    # RDB index: no (n_rdb, n_test) temporary, and an empty ``rdb_idx``
    # yields an all-False mask rather than making ``np.vstack`` raise.
    is_rdb = np.isin(X_test.index.values, rdb_idx)

    return (pd.concat([X_train, X_test[~is_rdb]], axis=0),
            X_test[is_rdb],
            np.concatenate([y_train, y_test[~is_rdb]]),
            y_test[is_rdb])
示例#4
0
from submissions.my_model.classifier import Classifier


def evaluation(X, y):
    """Cross-validate the feature-extractor + classifier pipeline.

    Parameters
    ----------
    X, y
        Data and labels, forwarded unchanged to ``get_cv`` and
        ``cross_validate``.

    Returns
    -------
    The value returned by ``cross_validate``, requested with the
    ``'roc_auc'`` and ``'accuracy'`` scorers and with train scores included.
    """
    model = make_pipeline(FeatureExtractor(), Classifier())
    folds = get_cv(X, y)

    options = dict(
        scoring=['roc_auc', 'accuracy'],
        cv=folds,
        verbose=1,
        return_train_score=True,
        n_jobs=1,  # run the folds sequentially
    )
    return cross_validate(model, X, y, **options)


# Cross-validate on the training data, then report mean +- std per metric.
data_train, labels_train = get_train_data()
results = evaluation(data_train, labels_train)

# (message template, key in ``results``), printed in this order.
for template, key in (
        ("Training score ROC-AUC: {:.3f} +- {:.3f}", 'train_roc_auc'),
        ("Validation score ROC-AUC: {:.3f} +- {:.3f} \n", 'test_roc_auc'),
        ("Training score accuracy: {:.3f} +- {:.3f}", 'train_accuracy'),
        ("Validation score accuracy: {:.3f} +- {:.3f}", 'test_accuracy'),
):
    scores = results[key]
    print(template.format(np.mean(scores), np.std(scores)))
示例#5
0
import problem
import torch
from torch.nn.functional import relu
from torch.nn.utils.rnn import pad_sequence

# Load the train and test splits once, at import time, through the
# project's ``problem`` module.
X_train, y_train = problem.get_train_data()
X_test, y_test = problem.get_test_data()


class Regressor():
    """A PyTorch MLP model consisting of an MLP for each module type.
    The model is learnt only on single module.
    The model takes as input the input power and the meta data of the
    corresponding cascade. To predict the output power the model
    simply cascades the different MLPs matching the input module cascade."""
    def __init__(self):
        super().__init__()
        # Since the model need meta data present in the data
        # we will only instantiate the model when calling the fit function
        self.Model = PyTorchModel  # PyTorch model class
        self.model = None  # PyTorch model instance
        self.mod_id = None  # Module IDs

    def fit(self, X, y):
        # Retrieve some information about the modules from the data
        all_mods = set([(("type", mod[0]), ("nb_feat", len(mod[1])))
                        for seq, _, _ in X for mod in seq])
        mod_info = [dict(m) for m in all_mods]
        self.mod_id = {mod["type"]: i for i, mod in enumerate(mod_info)}

        # Instantiate the PyTorch model
示例#6
0
from nilearn import datasets, input_data, plotting, image
from sklearn.svm import SVC

import seaborn as sns
import numpy as np
import pandas as pd
import scipy as sp
from nilearn import datasets, input_data, plotting, image
from sklearn.preprocessing import QuantileTransformer
from sklearn.metrics import roc_auc_score

import matplotlib.colors as colors
from problem import get_train_data, get_test_data

# Load the train and test splits from the parent directory.
X_train, y_train = get_train_data("..")
X_test, y_test = get_test_data("..")

# Names of the submissions considered below.
all_submissions = [
    "abethe_functional_blast",
    "amicie_functional_blast",
    "ayoub.ghriss_functional_blast",
    "mk_functional_blast",
    "nguigui_functional_blast",
    "pearrr_functional_blast",
    "Slasnista_functional_blast",
    "vzantedeschi_functional_blast",
    "wwwwmmmm_functional_blast",
]

# One independent, initially empty list per percentage label.
results = {label: [] for label in ("0%", "25%", "50%", "75%")}
    X.loc[:, 'day'] = X['DateOfDeparture'].dt.day
    X.loc[:, 'weekday'] = X['DateOfDeparture'].dt.weekday
    X.loc[:, 'week'] = X['DateOfDeparture'].dt.week
    X.loc[:, 'n_days'] = X['DateOfDeparture'].apply(
        lambda date: (date - pd.to_datetime("1970-01-01")).days)
    # Finally we can drop the original columns from the dataframe
    return X.drop(columns=["DateOfDeparture"])


def _todense(X):
    """Return a dense version of ``X``.

    ``X`` only needs to expose ``todense()`` (e.g. a SciPy sparse matrix).
    No defensive copy is taken first: for SciPy sparse matrices
    ``todense()`` allocates a new dense matrix and leaves ``X`` untouched,
    so copying the sparse input beforehand only cost time and memory.

    Parameters
    ----------
    X : object exposing ``todense()``
        The matrix to densify; it is not modified.

    Returns
    -------
    The value returned by ``X.todense()``.
    """
    return X.todense()


# Load the training data.
X, y = problem.get_train_data()

# ---------------------------------------------------------------------------
# Preprocessing of the training data
# ---------------------------------------------------------------------------
# Wrap the module-level helper functions as transformers.
data_merger = FunctionTransformer(_merge_external_data)
date_encoder = FunctionTransformer(_encode_dates)

# Converts its input with ``todense()`` (see ``_todense``).
dense_matrix = FunctionTransformer(_todense)

# One-hot branch: fill missing entries with the constant "missing", then
# one-hot encode, ignoring categories unseen during fit.
categorical_encoder_ohe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"))
# Columns meant for the one-hot branch (judging by the name -- the wiring is
# not visible in this part of the file).
categorical_cols_ohe = [
    "Arrival", "Departure", "day", "weekday", "holidays", "week", "n_days"
]

# Ordinal-encoding alternative for categorical columns.
categorical_encoder_oe = OrdinalEncoder()
示例#8
0
import numpy as np
import scipy.ndimage as nd
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier

from keras.layers import Input, MaxPooling3D, UpSampling3D, Conv3D, Reshape, Conv3DTranspose

import problem
import numpy as np
import matplotlib.pylab as plt
import submissions.starting_kit.keras_segmentation_classifier as classifier 

# Train the submission located in the current directory on the training ids.
module_path = '.'
# NOTE(review): the result is used as a collection of patient ids below;
# confirm that ``problem.get_train_data()`` returns ids rather than (X, y).
train_ids = problem.get_train_data() 
print(train_ids)

# Disabled experiment: build an image loader directly.
#spl = problem.ImageLoader([1,2,3,4])

simp = problem.SimplifiedSegmentationClassifier() 
clf = simp.train_submission(module_path=module_path,patient_ids=train_ids)  


# Disabled alternative: fit the classifier through an explicit image loader.
# n_classes=[0,1]
# img_loader = problem.ImageLoader(patient_ids=train_ids, n_classes=n_classes) 
# clf.fit(img_loader)  

# Disabled: score the trained model on the test ids.
#test_ids = problem.get_test_data()
#score = simp.test_submission(module_path = module_path,trained_model = clf, patient_idxs = test_ids)  
示例#9
0
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import problem
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.inspection import permutation_importance
def _merge_external_data(X):
    """
    filepath = os.path.join(
        os.path.dirname(__file__), 'external_data.csv'
    )
    """
    # Make sure that DateOfDeparture is of dtype datetime
    X = X.copy()  # modify a copy of X
    X.loc[:, "DateOfDeparture"] = pd.to_datetime(X['DateOfDeparture'])
    # Parse date to also be of dtype datetime
    data_weather = pd.read_csv("external_data.csv", parse_dates=["DateOfDeparture"])
示例#10
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 26 15:27:47 2020
@author: dorian
"""
import os
import importlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
import problem
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
import shap
def _merge_external_data(X):
    filepath = os.path.join(
        os.path.dirname(__file__), 'external_data.csv'
    )
    # Make sure that DateOfDeparture is of dtype datetime
    X = X.copy()  # modify a copy of X
    X.loc[:, "DateOfDeparture"] = pd.to_datetime(X['DateOfDeparture'])
示例#11
0
"""
Simple pipeline to test the naive submission.
"""

import numpy as np
import imp

from sklearn.pipeline import Pipeline

import problem


# Load the corpus and materialise the cross-validation folds ---------------------------

X_train_full, y_train_full = problem.get_train_data(sep="|")
cv_list = list(problem.get_cv(X_train_full, y_train_full))

# Draw one fold at random.
# NOTE(review): the upper bound 6 is hard-coded; presumably ``get_cv`` yields
# at least 6 folds -- confirm, otherwise ``cv_list[fold]`` can raise
# IndexError (and folds beyond the sixth are never drawn).
fold = np.random.randint(0, 6)
print("fold: " + str(fold))

# Row positions of the selected fold: item 0 is used for training, item 1
# for testing.
train_rows = cv_list[fold][0]
test_rows = cv_list[fold][1]

X_train = X_train_full.iloc[train_rows, :]
X_test = X_train_full.iloc[test_rows, :]
y_train = y_train_full[train_rows]
y_test = y_train_full[test_rows]


# Test the submission on that single fold ----------------------------------------------

scorer = problem.F1Score()

submission_name = "starting_kit"
# Object detector
from object_detector_Unet import *
# from object_detector_SSD import *

# Ramp
sys.path.append('../')
import problem

# =============================================================================
# 1. DATA
# =============================================================================

print("\nLoading data", end='...')

# The data is loaded relative to the parent directory.
data_path = '../'
Xtrain, Ytrain = problem.get_train_data(data_path)

# Keep only the first half of the training samples.
# NOTE(review): ``floor`` is not imported in the visible part of the file;
# presumably ``math.floor`` (an int is required for slicing) -- confirm.
SIZE = floor(Xtrain.shape[0] / 2.)

Xtrain = Xtrain[:SIZE]
Ytrain = Ytrain[:SIZE]

print('done')

# Show the shapes of the retained data.
print('>>', Xtrain.shape, Ytrain.shape, '\n')

# =============================================================================
# 2. TRAINING
# =============================================================================

object_detector = ObjectDetector()