def predict(self, X):
    '''
    This function should provide predictions of labels on (test) data.
    Make sure that the predicted values are in the correct format for the
    scoring metric. For example, binary classification problems often expect
    predictions in the form of a discriminant value (if the area under the
    ROC curve is the metric) rather than predictions of the class labels
    themselves. For multi-class or multi-label problems, class probabilities
    are often expected if the metric is cross-entropy.
    Scikit-learn also has a predict_proba method; we do not require it.
    The predict function can eventually return probabilities.
    '''
    # Reuse the preprocessor fitted in fit(); refitting a new one on test
    # data would leak and could produce an inconsistent feature space.
    X = self.preproc.transform(X)
    num_test_samples = len(X)
    if X.ndim > 1:
        num_feat = len(X[0])
    print("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
    if self.num_feat != num_feat:
        print("ARRGH: number of features in X does not match training data!")
    print("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))
    output = self.clf.predict(X)
    return output
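# The docstring above notes that AUC-style metrics expect discriminant values
# and cross-entropy expects class probabilities. A minimal sketch of that idea
# follows; it is illustrative only (not part of the starter kit) and assumes
# self.clf is a fitted scikit-learn classifier.
def predict_scores(self, X):
    # Return probabilities when the estimator supports them, otherwise
    # fall back to hard label predictions.
    X = self.preproc.transform(X)
    if hasattr(self.clf, "predict_proba"):
        proba = self.clf.predict_proba(X)
        # For binary problems, AUC scoring expects the positive-class column.
        return proba[:, 1] if proba.shape[1] == 2 else proba
    return self.clf.predict(X)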
def fit(self, X, y):
    '''
    This function should train the model parameters.
    Args:
        X: Training data matrix of dim num_train_samples * num_feat.
        y: Training label matrix of dim num_train_samples * num_labels.
    Both inputs are numpy arrays.
    For classification, labels could be either numbers 0, 1, ... c-1 for c
    classes or a one-hot encoded vector of zeros, with a 1 at the kth
    position for class k. The AutoML format supports one-hot encoding, which
    also works for multi-label problems. Use data_converter.convert_to_num()
    to convert to the category number format.
    For regression, labels are continuous values.
    '''
    # Fit the preprocessor on the training data and keep it so predict()
    # can apply the same transformation to test data.
    self.preproc = Preprocessor()
    self.preproc.pip0(10)
    X = self.preproc.fit_transform(X, y)
    self.num_train_samples = len(X)
    if X.ndim > 1:
        self.num_feat = len(X[0])
    print("FIT: dim(X)= [{:d}, {:d}]".format(self.num_train_samples, self.num_feat))
    num_train_samples = len(y)
    if y.ndim > 1:
        self.num_labels = len(y[0])
    print("FIT: dim(y)= [{:d}, {:d}]".format(num_train_samples, self.num_labels))
    if self.num_train_samples != num_train_samples:
        print("ARRGH: number of samples in X and y do not match!")

    ###### Baseline models ######
    from sklearn.naive_bayes import GaussianNB
    from sklearn.linear_model import LinearRegression
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.svm import SVR
    # Comment and uncomment the right lines below to choose the model
    #self.clf = GaussianNB()
    #self.clf = LinearRegression()
    #self.clf = DecisionTreeRegressor()
    #self.clf = RandomForestRegressor()
    #self.clf = KNeighborsRegressor()
    #self.clf = SVR(C=1.0, epsilon=0.2)
    if not self.is_trained:
        self.clf = self.selection_hyperparam(X, y)
        # self.clf = self.selection_hyperparam__(X, y)
        self.is_trained = True
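# Instead of manually commenting and uncommenting baseline models as above,
# one could select among them with cross-validation. A minimal sketch under
# that assumption follows; pick_baseline is a hypothetical helper, not part
# of the starter kit, and assumes X, y are numpy arrays as in fit().
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

def pick_baseline(X, y):
    # Score each candidate with 3-fold CV and refit the best on all data.
    candidates = {
        'linear': LinearRegression(),
        'tree': DecisionTreeRegressor(),
        'forest': RandomForestRegressor(),
    }
    scores = {name: cross_val_score(est, X, y, cv=3).mean()
              for name, est in candidates.items()}
    best = max(scores, key=scores.get)
    print(scores)
    return candidates[best].fit(X, y)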
from sys import path
path.append("../ingestion_program")  # Contains libraries you will need
from data_manager import DataManager  # such as DataManager
from prepro import Preprocessor

input_dir = "../sample_data"
output_dir = "../results"
basename = 'credit'
D = DataManager(basename, input_dir)  # Load data
print("*** Original data ***")
print(D)

Prepro = Preprocessor()
# Preprocess the data and load it back into D
D.data['X_train'] = Prepro.fit_transform(D.data['X_train'], D.data['Y_train'])
D.data['X_valid'] = Prepro.transform(D.data['X_valid'])
D.data['X_test'] = Prepro.transform(D.data['X_test'])
# Show something that proves that the preprocessing worked fine
print("*** Transformed data ***")
print(D)

# Preprocessing gives you opportunities for visualization:
# Scatter plots of the first 2 principal components
# Scatter plots of pairs of features that are most relevant
import matplotlib.pyplot as plt
X = D.data['X_train']
Y = D.data['Y_train']
plt.scatter(X[:, 0], X[:, 1], c=Y)
plt.xlabel('PC1')
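# The 'PC1' axis label above assumes the Preprocessor already projects onto
# principal components. If it does not, a quick sketch with scikit-learn's
# PCA (an assumption here, not part of prepro.py) produces the same plot:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X2 = pca.fit_transform(D.data['X_train'])
plt.scatter(X2[:, 0], X2[:, 1], c=D.data['Y_train'])
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()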
class model:
    def __init__(self):
        '''
        This constructor is supposed to initialize data members.
        Use triple quotes for function documentation.
        '''
        self.debug = 0
        self.num_train_samples = 0
        self.num_feat = 1
        self.num_labels = 1
        self.is_trained = False
        self.preproc = Preprocessor()

    def cross_validation_simple(self, j, k, X, Y):
        # Positional args: n_estimators=100, criterion="mse", max_depth=None,
        # min_samples_split=2, min_samples_leaf=j,
        # min_weight_fraction_leaf=0.0, max_features=k
        return cross_val_score(
            RandomForestRegressor(100, "mse", None, 2, j, 0.0, k), X, Y, cv=3)

    # Search for the best parameters to pass to RandomForestRegressor.
    # Because this method is slow, we used it in model_param.py and passed
    # the optimal parameters directly to Random Forest; they turn out to be
    # the default parameters.
    def selection_hyperparam(self, X, Y):
        SMax = 0
        param = dict()
        tab = [0.3, 0.6, 0.9, 'auto']
        for j in range(1, 11):
            for k in range(0, 4):
                # cross_val_score fits its own copies of the estimator, so
                # no separate fit is needed here.
                error = self.cross_validation_simple(j, tab[k], X, Y)
                score = mean(error)
                print(" j: " + str(j) + " k: " + str(k))
                if score > SMax:
                    SMax = score
                    param = {'param2': j, 'param3': tab[k]}
                    print('first param ' + str(param['param2']) +
                          ' second param ' + str(param['param3']))
        print('first param final ' + str(param['param2']) +
              ' second param final ' + str(param['param3']))
        return param

    def fit(self, X, y):
        '''
        This function should train the model parameters.
        Args:
            X: Training data matrix of dim num_train_samples * num_feat.
            y: Training label matrix of dim num_train_samples * num_labels.
        Both inputs are numpy arrays.
        For classification, labels could be either numbers 0, 1, ... c-1 for
        c classes or a one-hot encoded vector of zeros, with a 1 at the kth
        position for class k. The AutoML format supports one-hot encoding,
        which also works for multi-label problems. Use
        data_converter.convert_to_num() to convert to the category number
        format.
        For regression, labels are continuous values.
        '''
        # Fit the preprocessor once and reuse the transformed data, rather
        # than calling fit_transform repeatedly.
        X = self.preproc.fit_transform(X)
        if self.debug:
            self.num_train_samples = X.shape[0]
            if X.ndim > 1:
                self.num_feat = X.shape[1]
            print("FIT: dim(X)= [{:d}, {:d}]".format(self.num_train_samples, self.num_feat))
            num_train_samples = y.shape[0]
            if y.ndim > 1:
                self.num_labels = y.shape[1]
            print("FIT: dim(y)= [{:d}, {:d}]".format(num_train_samples, self.num_labels))
            if self.num_train_samples != num_train_samples:
                print("ARRGH: number of samples in X and y do not match!")

        ###### Baseline models ######
        from sklearn.naive_bayes import GaussianNB
        from sklearn.linear_model import LinearRegression
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.neighbors import KNeighborsRegressor
        # Comment and uncomment the right lines below to choose the model
        #self.model = GaussianNB()
        #self.model = LinearRegression()
        #self.model = DecisionTreeRegressor()
        self.model = RandomForestRegressor()
        #self.model = KNeighborsRegressor()
        self.model.fit(X, y)
        self.is_trained = True

    def predict(self, X):
        '''
        This function should provide predictions of labels on (test) data.
        Make sure that the predicted values are in the correct format for the
        scoring metric. For example, binary classification problems often
        expect predictions in the form of a discriminant value (if the area
        under the ROC curve is the metric) rather than predictions of the
        class labels themselves. For multi-class or multi-label problems,
        class probabilities are often expected if the metric is
        cross-entropy.
        Scikit-learn also has a predict_proba method; we do not require it.
        The predict function can eventually return probabilities.
        '''
        # Transform with the preprocessor fitted during fit(); calling
        # fit_transform here would refit on test data and leak.
        X = self.preproc.transform(X)
        if self.debug:
            num_test_samples = X.shape[0]
            if X.ndim > 1:
                num_feat = X.shape[1]
            print("PREDICT: dim(X)= [{:d}, {:d}]".format(num_test_samples, num_feat))
            if self.num_feat != num_feat:
                print("ARRGH: number of features in X does not match training data!")
            print("PREDICT: dim(y)= [{:d}, {:d}]".format(num_test_samples, self.num_labels))
        y = self.model.predict(X)
        return y

    def save(self, path="./"):
        pickle.dump(self, open(path + '_model.pickle', "wb"))

    def load(self, path="./"):
        modelfile = path + '_model.pickle'
        if isfile(modelfile):
            with open(modelfile, "rb") as f:
                self = pickle.load(f)
            print("Model reloaded from: " + modelfile)
        return self
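# Usage sketch for the save/load round trip above. The paths and the
# X_train/Y_train/X_test names are illustrative, assumed to be loaded as in
# the DataManager snippet elsewhere in this kit.
M = model()
M.fit(X_train, Y_train)
M.save("./res/credit")            # writes ./res/credit_model.pickle
M = model().load("./res/credit")  # load() returns the unpickled model
predictions = M.predict(X_test)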
                        help='file path for saved preprocessor')
    return parser.parse_args()


if __name__ == '__main__':
    # Get arguments
    print('Getting arguments...')
    args = get_args()

    # Make a dataset
    print('Importing dataset...')
    data = SentimentDataset(data=args.train_path)

    # Preprocess and save word encodings
    preprocessor = Preprocessor(max_vocab=args.max_vocab)
    data = preprocessor.fit_transform(dataset=data)
    preprocessor.save(args.prepro_save_path)

    # Validation split
    data.split_data(validation_count=args.validation_count)
    train_ds, val_ds = data.to_dataset()

    # To DataLoaders
    train_set = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True)
    val_set = DataLoader(val_ds, batch_size=args.batch_size, shuffle=False)

    print('Initializing model...')
    mod = SentimentModel(
        len(preprocessor.vocab2enc) + 3,
        args.embedding_dim,
        args.hidden_dim)
    opt = Adam(mod.parameters(), lr=args.lr)
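    # The script above stops after creating the optimizer. A minimal epoch
    # loop sketch follows (not from the original script): it assumes the
    # model returns raw logits, labels are class indices, and each batch
    # yields (inputs, labels); n_epochs is a hypothetical setting.
    import torch
    import torch.nn as nn

    criterion = nn.CrossEntropyLoss()
    n_epochs = 5
    for epoch in range(n_epochs):
        mod.train()
        for inputs, labels in train_set:
            opt.zero_grad()
            logits = mod(inputs)
            loss = criterion(logits, labels)
            loss.backward()
            opt.step()
        # Quick validation pass on the held-out split
        mod.eval()
        with torch.no_grad():
            correct = sum((mod(x).argmax(dim=1) == y).sum().item()
                          for x, y in val_set)
            total = sum(len(y) for _, y in val_set)
        print(f"epoch {epoch}: val accuracy {correct / total:.3f}")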