Exemplo n.º 1
0
 def __init__(self):
     '''
     Set every data member to its starting value.

     After construction:
         debug and num_train_samples are 0,
         num_feat and num_labels are 1,
         is_trained is False,
         preproc holds a newly built Preprocessor.
     '''
     # Integer bookkeeping fields: the two zero-valued ones, then the two
     # that start at one.
     self.debug = self.num_train_samples = 0
     self.num_feat = self.num_labels = 1

     # Nothing has been fitted yet.
     self.is_trained = False

     # Built last, exactly as before, so a failing Preprocessor() leaves
     # the same attributes set as it used to.
     self.preproc = Preprocessor()
Exemplo n.º 2
0
 def __init__(self):
     '''Assemble the soft-voting ensemble and store it in ``self.clf``.

     Two named estimators take part in the vote:
       * 'basic' -- a BasicPredictor instance;
       * 'fancy' -- a Pipeline that applies a Preprocessor and then a
         BaggingClassifier whose base estimator is GaussianNB.
     '''
     # Steps are created in the same order as before: preprocessing first,
     # then the bagged naive-Bayes classifier.
     prepro_step = ('preprocessing', Preprocessor())
     bagging_step = ('classification',
                     BaggingClassifier(base_estimator=GaussianNB()))
     fancy_pipeline = Pipeline([prepro_step, bagging_step])

     voters = [('basic', BasicPredictor()), ('fancy', fancy_pipeline)]
     self.clf = VotingClassifier(estimators=voters, voting='soft')
Exemplo n.º 3
0
 def predict(self, X):
     '''
     Return the predictions of ``self.clf`` for the samples in X.

     Before predicting, the dimensions of X are printed and a warning is
     printed when the number of feature columns differs from
     ``self.num_feat``.

     Args:
         X: array exposing ``ndim``, ``len()`` and row indexing
            (e.g. a numpy array); one row per sample.

     Returns:
         Whatever ``self.clf.predict(X)`` returns.
     '''
     # NOTE(review): this local Preprocessor is configured and fitted on the
     # test data but its output is discarded -- self.clf receives the raw X.
     # Kept as-is because pip0/fit_transform may have side effects that are
     # not visible from here; confirm and remove if it is dead code.
     Prepro = Preprocessor()
     Prepro.pip0(10)
     Prepro.fit_transform(X, y=None)

     num_test_samples = len(X)
     # A 1-D X is treated as a single feature column. Previously num_feat
     # was left unbound in that case and the print below raised
     # UnboundLocalError.
     num_feat = len(X[0]) if X.ndim > 1 else 1
     print("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
     if self.num_feat != num_feat:
         print("ARRGH: number of features in X does not match training data!")
     print("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))

     return self.clf.predict(X)
Exemplo n.º 4
0
 def __init__(self):
     '''
     Create the counters, the trained flag and the processing pipeline.

     ``self.mod`` is a two-step Pipeline: a Preprocessor named
     'preprocessing' followed by a Predictor named 'predictor'. Its string
     form is printed once it has been built.
     '''
     # Counters and flag start from their "nothing seen yet" values.
     self.num_train_samples = 0
     self.num_feat = self.num_labels = 1
     self.is_trained = False

     # The model is defined here, in the constructor.
     steps = [('preprocessing', Preprocessor()),
              ('predictor', Predictor())]
     self.mod = Pipeline(steps)
     print("MODEL=" + str(self.mod))
Exemplo n.º 5
0
    def fit(self, X, y):
        '''
        Record the training dimensions and, on the first call only, obtain
        the estimator from ``self.selection_hyperparam``.

        Args:
            X: training data; ``len(X)`` is stored as the sample count and,
               when X has more than one dimension, ``len(X[0])`` as the
               feature count.
            y: training labels; when y has more than one dimension,
               ``len(y[0])`` is stored as the label count.

        Side effects:
            Prints the dimensions of X and y, plus a warning when they
            disagree on the number of samples.
            Sets ``self.clf`` while ``self.is_trained`` is still False.
            Always leaves ``self.is_trained`` set to True.
        '''
        # A local preprocessor is configured and fitted; its output is not
        # used further down.
        scratch_prepro = Preprocessor()
        scratch_prepro.pip0(10)
        scratch_prepro.fit_transform(X, y)

        # Dimensions of X.
        self.num_train_samples = len(X)
        if X.ndim > 1:
            self.num_feat = len(X[0])
        print("FIT: dim(X)= [{:d}, {:d}]".format(self.num_train_samples, self.num_feat))

        # Dimensions of y, cross-checked against X.
        label_rows = len(y)
        if y.ndim > 1:
            self.num_labels = len(y[0])
        print("FIT: dim(y)= [{:d}, {:d}]".format(label_rows, self.num_labels))
        if label_rows != self.num_train_samples:
            print("ARRGH: number of samples in X and y do not match!")

        ###### Baseline models ######
        # Candidate estimators, importable here so that one of them can be
        # assigned to self.clf by hand instead of running the search below.
        from sklearn.naive_bayes import GaussianNB
        from sklearn.linear_model import LinearRegression
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.neighbors import KNeighborsRegressor
        from sklearn.svm import SVR

        # The hyper-parameter search runs only while the model is untrained.
        if self.is_trained == False:
            self.clf = self.selection_hyperparam(X, y)

        self.is_trained = True
Exemplo n.º 6
0
Another style is to incorporate the test as a main function in the Data manager class itself.
"""
from sys import path
path.append("../ingestion_program")  # Contains libraries you will need
from data_manager import DataManager  # such as DataManager

from prepro import Preprocessor
# Location of the input data and of the output directory.
input_dir = "../sample_data"
# NOTE(review): "../resuts" looks like a typo for "../results"; left as-is
# because the directory name may be relied upon elsewhere.
output_dir = "../resuts"

basename = 'credit'
D = DataManager(basename, input_dir)  # Load data
print("*** Original data ***")
# print(D) behaves the same under Python 2 and 3, whereas the statement form
# `print D` is a syntax error under Python 3.
print(D)

Prepro = Preprocessor()

# Preprocess the data and load it back into D: the preprocessor is fitted on
# the training split only, then applied unchanged to validation and test.
D.data['X_train'] = Prepro.fit_transform(D.data['X_train'], D.data['Y_train'])
D.data['X_valid'] = Prepro.transform(D.data['X_valid'])
D.data['X_test'] = Prepro.transform(D.data['X_test'])

# Here show something that proves that the preprocessing worked fine
print("*** Transformed data ***")
print(D)

# Preprocessing gives you opportunities of visualization:
# Scatter-plots of the 2 first principal components
# Scatter plots of pairs of features that are most relevant
import matplotlib.pyplot as plt
X = D.data['X_train']
Exemplo n.º 7
0
class model:
    '''
    Random-forest regressor preceded by a Preprocessor.

    The class offers ``fit`` and ``predict`` plus pickle-based ``save`` and
    ``load``. Set ``debug`` to a truthy value to print the data dimensions
    seen by ``fit`` and ``predict``.
    '''

    def __init__(self):
        '''
        Initialise the bookkeeping attributes and the preprocessor.
        '''
        self.debug = 0
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        self.preproc = Preprocessor()

    def cross_validation_simple(self, j, k, X, Y):
        '''
        Return the 3-fold cross-validation scores of a 100-tree forest.

        Args:
            j: value used for ``min_samples_leaf``.
            k: value used for ``max_features``.
            X: data matrix.
            Y: targets.
        '''
        # Keyword arguments replace the former positional call; the values
        # are the same, but their meaning no longer depends on the order of
        # the constructor's parameters.
        forest = RandomForestRegressor(
            n_estimators=100,
            criterion="mse",
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=j,
            min_weight_fraction_leaf=0.0,
            max_features=k,
        )
        return cross_val_score(forest, X, Y, cv=3)

    def selection_hyperparam(self, X, Y):
        '''
        Grid-search the best parameters to give RandomForestRegressor.

        Because this method is slow, it was used from model_param.py and the
        optimal parameters were then given directly to the random forest;
        they turned out to be the default parameters.

        Every combination of ``min_samples_leaf`` in 1..10 and
        ``max_features`` in [0.3, 0.6, 0.9, 'auto'] is scored by the mean of
        ``cross_validation_simple``; the first combination with the highest
        mean wins.

        Returns:
            dict with keys 'param2' (best min_samples_leaf) and 'param3'
            (best max_features). Empty only if no score was comparable
            (e.g. every score was NaN).
        '''
        # Start below any real score: cross-validation scores may all be
        # negative, and starting at 0 used to leave `param` empty and crash
        # on the final print.
        best_score = float("-inf")
        param = dict()
        max_features_grid = [0.3, 0.6, 0.9, 'auto']

        for j in range(1, 11):
            for k, max_features in enumerate(max_features_grid):
                # cross_val_score fits its own clones, so no separate fit of
                # a throwaway forest is needed here.
                error = self.cross_validation_simple(j, max_features, X, Y)
                score = mean(error)
                print(" j: "+str(j)+" k :"+str(k))

                if score > best_score:
                    best_score = score
                    param = {'param2': j, 'param3': max_features}
                    print('first param '+str(param['param2'])+' second param '+str(param['param3']))

        if param:
            print('first param final '+str(param['param2'])+' second param final '+str(param['param3']))

        return param

    def fit(self, X, y):
        '''
        Fit the preprocessor on X, then train a RandomForestRegressor (default
        hyper-parameters) on the transformed data.

        Args:
            X: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.

        Side effects:
            Sets ``self.model`` and ``self.is_trained``. When ``self.debug``
            is truthy, also records and prints the data dimensions.
        '''
        # Fit and transform once; the result is reused for the debug output
        # and for training instead of re-fitting the preprocessor each time.
        X_prep = self.preproc.fit_transform(X)

        if self.debug:
            self.num_train_samples = X_prep.shape[0]
            if X_prep.ndim > 1:
                self.num_feat = X_prep.shape[1]
            # .format() belongs to the string, not to the result of print().
            print("FIT: dim(X)= [{:d}, {:d}]".format(self.num_train_samples, self.num_feat))
            num_train_samples = y.shape[0]
            if y.ndim > 1:
                self.num_labels = y.shape[1]
            print("FIT: dim(y)= [{:d}, {:d}]".format(num_train_samples, self.num_labels))
            if self.num_train_samples != num_train_samples:
                print("ARRGH: number of samples in X and y do not match!")

        ###### Baseline models ######
        # Alternative estimators are imported so that one of them can be
        # swapped in for the random forest below.
        from sklearn.naive_bayes import GaussianNB
        from sklearn.linear_model import LinearRegression
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.neighbors import KNeighborsRegressor

        self.model = RandomForestRegressor()
        self.model.fit(X_prep, y)
        self.is_trained = True

    def predict(self, X):
        '''
        Return the predictions of the trained model for X.

        X goes through the already-fitted preprocessor (``transform``, not
        ``fit_transform``) so that test data is processed with the parameters
        learned during ``fit``.

        Args:
            X: data matrix with the same columns as the training data.

        Returns:
            The output of ``self.model.predict`` on the transformed data.
        '''
        X_prep = self.preproc.transform(X)

        if self.debug:
            num_test_samples = X_prep.shape[0]
            # 1-D data counts as one feature; num_feat used to be unbound.
            num_feat = X_prep.shape[1] if X_prep.ndim > 1 else 1
            print("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
            if self.num_feat != num_feat:
                print("ARRGH: number of features in X does not match training data!")
            print("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))

        return self.model.predict(X_prep)

    def save(self, path="./"):
        '''
        Pickle this object to ``path + '_model.pickle'``.
        '''
        # The context manager closes the file even if pickling fails.
        with open(path + '_model.pickle', "wb") as f:
            pickle.dump(self, f)

    def load(self, path="./"):
        '''
        Return the model pickled at ``path + '_model.pickle'``.

        Returns:
            The unpickled object when the file exists, otherwise this
            instance unchanged.
        '''
        modelfile = path + '_model.pickle'
        if not isfile(modelfile):
            return self

        # SECURITY: unpickling executes code stored in the file; only load
        # model files from a trusted source.
        with open(modelfile, "rb") as f:
            loaded = pickle.load(f)
        print("Model reloaded from: " + modelfile)
        return loaded
Exemplo n.º 8
0
                        default='./prepro_vocab.json',
                        help='file path for saved preprocessor')
    return parser.parse_args()


# Training entry point: parse arguments, build and preprocess the dataset,
# split it, wrap it in DataLoaders, then create the model and its optimizer.
if __name__ == '__main__':
    # Get arguments
    print('Getting arguments...')
    args = get_args()

    # Build the dataset from the training file given on the command line
    print('Importing dataset...')
    data = SentimentDataset(data=args.train_path)

    # Fit the preprocessor on the training data, transform the data with it,
    # and save the fitted preprocessor to args.prepro_save_path
    preprocessor = Preprocessor(max_vocab=args.max_vocab)
    data = preprocessor.fit_transform(dataset=data)
    preprocessor.save(args.prepro_save_path)

    # Validation split: args.validation_count is forwarded to split_data,
    # then to_dataset() yields the training and validation datasets
    data.split_data(validation_count=args.validation_count)
    train_ds, val_ds = data.to_dataset()

    # Wrap both datasets in DataLoaders; only the training loader shuffles
    train_set = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
    val_set = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False)

    print('Initializing model...')
    # First argument is the vocabulary size plus 3.
    # NOTE(review): the +3 presumably reserves ids for special tokens -- TODO confirm
    mod = SentimentModel(
        len(preprocessor.vocab2enc) + 3, args.embedding_dim, args.hidden_dim)
    # Adam optimizer over all model parameters with the requested learning rate
    opt = Adam(mod.parameters(), lr=args.lr)
Exemplo n.º 9
0
    parser.add_argument('--prepro_path', type=str, default='./prepro_vocab.json', help='path to fit preprocessor')
    return parser.parse_args()


# Evaluation entry point: parse arguments, load the test data and the saved
# preprocessor, load the trained model and run it on the test set.
if __name__ == '__main__':
    # Get arguments
    print('Getting arguments...')
    args = get_args()

    # Build the dataset from the test file given on the command line
    print('Importing dataset...')
    data = SentimentDataset(data=args.test_path)

    # Load the saved preprocessor state and apply it to the test data
    # (transform only -- nothing is fitted here).
    # NOTE(review): load() is called without a path although --prepro_path is
    # parsed above; confirm that load() defaults to the intended file.

    preprocessor = Preprocessor(max_vocab=args.max_vocab)
    preprocessor.load()
    data = preprocessor.transform(dataset=data)

    # to_dataset() returns a pair; only its first element is used as test set
    test_ds, _ = data.to_dataset()

    # Wrap the test dataset in a DataLoader (fixed batch size 16, no shuffling)
    test_set = DataLoader(test_ds, batch_size=16, shuffle=False)

    # load saved model and switch it to evaluation mode
    print('Loading trained model...')
    model = torch.load(args.model_path)
    model.eval()

    test(test_set, model, val=False)
Exemplo n.º 10
0
    # Interesting point: the M2 prepared challenges using sometimes AutoML challenge metrics
    # not scikit-learn metrics. For example:
    from libscores import bac_metric
    from libscores import auc_metric

    from data_manager import DataManager
    from data_converter import convert_to_num

    basename = 'credit'
    D = DataManager(basename, input_dir)  # Load data
    # print(D) gives the same output under Python 2 and 3; the statement
    # form `print D` is a syntax error under Python 3.
    print(D)

    # Four classifiers are compared: a preprocessing pipeline around
    # BasicPredictor, plus the three stand-alone predictors.
    classifier_dict = {
        'Pipeline':
        Pipeline([('prepro', Preprocessor()), ('classif', BasicPredictor())]),
        'RandomPred':
        RandomPredictor(),
        'BasicPred':
        BasicPredictor(),
        'FancyPred':
        FancyPredictor()
    }

    # Header of the tab-separated results table.
    print("Classifier\tAUC\tBAC\tACC\tError bar")
    for key in classifier_dict:
        myclassifier = classifier_dict[key]

        # Train
        Yonehot_tr = D.data['Y_train']
        # Attention pour les utilisateurs de problemes multiclasse,
from sklearn.preprocessing import Imputer
from sklearn.cluster import FeatureAgglomeration
from sys import path
path.append("../ingestion_program")  # Contains libraries you will need
from data_manager import DataManager  # such as DataManager

from prepro import Preprocessor
# Location of the input data and of the output directory.
input_dir = "../sample_data"
# NOTE(review): "../resuts" looks like a typo for "../results"; left as-is
# because the directory name may be relied upon elsewhere.
output_dir = "../resuts"

basename = 'Housing'
D = DataManager(basename, input_dir)  # Load data
print("*** Original data ***")
# print(D) works under both Python 2 and 3, unlike the statement `print D`.
print(D)

Prepro = Preprocessor()

# Work on copies so that the arrays stored in D are not modified in place.
X = np.copy(D.data['X_train'])
y = np.copy(D.data['Y_train'])
x_valid = np.copy(D.data['X_valid'])
# The test split must come from 'X_test'; it was previously copied from
# 'X_valid', which overwrote the test data with validation data below.
x_test = np.copy(D.data['X_test'])

# Feature selection is fitted on the training split only and then applied
# to every split.
model_selection = Prepro.selectFeatures(X, y)
D.data['X_train'] = model_selection.transform(X)
D.data['X_valid'] = model_selection.transform(x_valid)
D.data['X_test'] = model_selection.transform(x_test)

# Second stage: imputation, min-max scaling, then feature agglomeration.
estimators = [('imputer', Imputer()), ('scaler', MinMaxScaler()),
              ('clustring', FeatureAgglomeration())]
pipe = Pipeline(estimators)
D.data['X_train'] = pipe.fit_transform(D.data['X_train'], D.data['Y_train'])
D.data['X_valid'] = pipe.transform(D.data['X_valid'])
Exemplo n.º 12
0
    from sklearn.metrics import accuracy_score      
    # Interesting point: the M2 prepared challenges using sometimes AutoML challenge metrics
    # not scikit-learn metrics. For example:
    from libscores import bac_metric
    from libscores import auc_metric
                 
    from data_manager import DataManager 
    from data_converter import convert_to_num 
    
    basename = 'credit'
    D = DataManager(basename, input_dir)  # Load data
    # print(D) gives the same output under Python 2 and 3; the statement
    # form `print D` is a syntax error under Python 3.
    print(D)

    # Four classifiers are compared: a preprocessing pipeline around
    # FancyPredictor, plus the three stand-alone predictors.
    classifier_dict = {
        'Pipeline': Pipeline([('prepro', Preprocessor()), ('classif', FancyPredictor())]),
        'RandomPred': RandomPredictor(),
        'BasicPred': BasicPredictor(),
        'FancyPred': FancyPredictor(),
    }

    # Header of the tab-separated results table.
    print("Classifier\tAUC\tBAC\tACC\tError bar")
    for key in classifier_dict:
        myclassifier = classifier_dict[key]
 
        # Train
        Yonehot_tr = D.data['Y_train']
        # Attention pour les utilisateurs de problemes multiclasse,
        # mettre convert_to_num DANS la methode fit car l'ingestion program
        # fournit Yonehot_tr a la methode "fit"
        # Ceux qui resolvent des problemes a 2 classes ou des problemes de