def main():
    """Randomized hyper-parameter search of a random forest, one per user.

    Loads the training dataframe, and for every distinct ``user_email``
    relabels it with ``user_to_binary``. Users with fewer than 10 rows where
    ``user == 1`` are skipped. For the rest, a ``RandomizedSearchCV``
    (100 sampled candidates, 5-fold CV) is fitted on the training split and
    the best parameters are recorded.

    The collected results are written as JSON to
    ``randomForest_GridSearch.txt`` inside ``logs_path``. The working
    directory is ``checkpoint_path`` while searching and ``basedir`` when the
    function returns (also when a fit raises).
    """
    # Function-scope imports, executed once instead of once per user.
    from pprint import pprint

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import RandomizedSearchCV

    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    # 'auto' is intentionally not listed for max_features: for classifiers it
    # was only an alias of 'sqrt' and newer scikit-learn releases reject it.
    hyperparameters = {
        'max_features': [None, 'sqrt', 'log2'],
        'max_depth': [None, 1, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 4, 6],
        'min_samples_split': [2, 4, 6, 8, 10],
        'n_estimators': [int(x) for x in np.linspace(start=10, stop=100, num=10)],
        'criterion': ['gini', 'entropy'],
    }

    data = []
    # The search runs with the checkpoints folder as working directory.
    os.chdir(checkpoint_path)
    try:
        for user in users:
            # Keep `df` untouched: every user must be relabelled from the
            # original dataframe, not from the previous user's result.
            user_df = user_to_binary(df, user)
            if np.count_nonzero(user_df.user == 1) < 10:
                continue

            X_train, _, Y_train, _ = obtain_features(dataframe=user_df,
                                                     random_state=42)

            # Show the estimator defaults for reference in the log output.
            pprint(RandomForestClassifier().get_params())

            search = RandomizedSearchCV(RandomForestClassifier(),
                                        hyperparameters,
                                        n_iter=100,
                                        cv=5,
                                        verbose=2,
                                        random_state=42,
                                        n_jobs=-1)
            search.fit(X_train, Y_train)
            pprint(search.best_params_)

            # Store the winning configuration for this user.
            data.append({'user': user,
                         'hyperparameters': search.best_params_})
            print('Best score for training_data:', search.best_score_)
    finally:
        os.chdir(basedir)

    # An absolute path avoids another round of chdir calls.
    results_file = os.path.join(logs_path, "randomForest_GridSearch.txt")
    with open(results_file, "w", encoding="utf-8") as myfile:
        json.dump(data, myfile)
def main():
    """Exhaustive hyper-parameter search of an SVC, one per user.

    Loads the training dataframe, and for every distinct ``user_email``
    relabels it with ``user_to_binary``. Users with fewer than 10 rows where
    ``user == 1`` are skipped. For the rest, the training split is passed
    through ``save_scaling`` and a ``GridSearchCV`` over linear and RBF
    kernels is fitted on it.

    The best parameters per user are written as JSON to
    ``svc_GridSearch.txt`` inside ``logs_path``. The working directory is
    ``checkpoint_path`` while searching and ``basedir`` when the function
    returns (also when a fit raises).
    """
    # Function-scope imports, executed once instead of once per user.
    from pprint import pprint

    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC

    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    c_values = [1, 2, 5, 10, 20, 30, 100, 1000]
    parameter_candidates = [
        {
            'C': c_values,
            'kernel': ['linear'],
        },
        {
            'C': c_values,
            'gamma': [20., 10., 5., 1., 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf'],
        },
    ]

    data = []
    # The search runs with the checkpoints folder as working directory;
    # presumably the scaling helpers read/write there — TODO confirm.
    os.chdir(checkpoint_path)
    try:
        for user in users:
            # Keep `df` untouched: every user must be relabelled from the
            # original dataframe, not from the previous user's result.
            user_df = user_to_binary(df, user)
            if np.count_nonzero(user_df.user == 1) < 10:
                continue

            X_train, X_test, Y_train, _ = obtain_features(dataframe=user_df,
                                                          random_state=42)
            # Standardize the training split via the project helper.
            X_train = save_scaling(X_train)
            # The test split is not used by the search; the call is kept
            # because the helper's side effects are not visible from here.
            X_test = load_scaling(X_test)

            # Show the estimator defaults for reference in the log output.
            pprint(SVC().get_params())

            # Grid search for the parameters producing the highest score.
            search = GridSearchCV(estimator=SVC(),
                                  param_grid=parameter_candidates,
                                  verbose=2,
                                  n_jobs=-1)
            search.fit(X_train, Y_train)
            pprint(search.best_params_)

            # Store the winning configuration for this user.
            data.append({'user': user,
                         'hyperparameters': search.best_params_})
            print('Best score for training_data:', search.best_score_)
    finally:
        os.chdir(basedir)

    # An absolute path avoids another round of chdir calls.
    results_file = os.path.join(logs_path, "svc_GridSearch.txt")
    with open(results_file, "w", encoding="utf-8") as myfile:
        json.dump(data, myfile)
def main():
    """Exhaustive hyper-parameter search of a logistic regression per user.

    Loads the training dataframe, and for every distinct ``user_email``
    relabels it with ``user_to_binary``. Users with fewer than 10 rows where
    ``user == 1`` are skipped. For the rest, the training split is passed
    through ``save_scaling`` and a ``GridSearchCV`` over ``C``, ``solver``
    and ``class_weight`` (L2 penalty) is fitted on it.

    The best parameters per user are written as JSON to
    ``logRegr_GridSearch.txt`` inside ``logs_path``. The working directory is
    ``checkpoint_path`` while searching and ``basedir`` when the function
    returns (also when a fit raises).
    """
    # Function-scope imports, executed once instead of once per user.
    from pprint import pprint

    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV

    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    c_values = [1, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    # 'warn' is intentionally not listed as a solver: it was only a
    # placeholder default in old scikit-learn releases (behaving like
    # 'liblinear') and is not a valid solver name in current ones.
    parameter_candidates = [
        {
            'C': c_values,
            'solver': ['newton-cg', 'lbfgs', 'sag'],
            'penalty': ['l2'],
            'class_weight': [None, 'balanced'],
        },
        {
            'C': c_values,
            'solver': ['liblinear', 'saga'],
            'penalty': ['l2'],
            'class_weight': [None, 'balanced'],
        },
    ]

    data = []
    # The search runs with the checkpoints folder as working directory;
    # presumably the scaling helpers read/write there — TODO confirm.
    os.chdir(checkpoint_path)
    try:
        for user in users:
            # Keep `df` untouched: every user must be relabelled from the
            # original dataframe, not from the previous user's result.
            user_df = user_to_binary(df, user)
            if np.count_nonzero(user_df.user == 1) < 10:
                continue

            X_train, X_test, Y_train, _ = obtain_features(dataframe=user_df,
                                                          random_state=42)
            # Standardize the training split via the project helper.
            X_train = save_scaling(X_train)
            # The test split is not used by the search; the call is kept
            # because the helper's side effects are not visible from here.
            X_test = load_scaling(X_test)

            # Show the estimator defaults for reference in the log output.
            pprint(LogisticRegression().get_params())

            # Grid search for the parameters producing the highest score.
            search = GridSearchCV(estimator=LogisticRegression(),
                                  param_grid=parameter_candidates,
                                  verbose=2,
                                  n_jobs=-1)
            search.fit(X_train, Y_train)
            pprint(search.best_params_)

            # Store the winning configuration for this user.
            data.append({'user': user,
                         'hyperparameters': search.best_params_})
            print('Best score for training_data:', search.best_score_)
    finally:
        os.chdir(basedir)

    # Persist everything to a text file that can be read back later.
    results_file = os.path.join(logs_path, "logRegr_GridSearch.txt")
    with open(results_file, "w", encoding="utf-8") as myfile:
        json.dump(data, myfile)
# MongoDB connection string: the MONGODB_URI environment variable wins over
# the built-in fallback.
# NOTE(review): the fallback URI contains a user:password slot — confirm that
# no real credentials are ever committed here.
MONGO_URI = os.environ.get(
    'MONGODB_URI',
    'mongodb://*****:*****@ds143070.mlab.com:43070/cubeauth',
)

# Directory layout, anchored at the folder that contains this file.
basedir = os.path.abspath(os.path.dirname(__file__))
checkpoint_path = os.path.join(basedir, 'checkpoints')
(
    logistic_regression_path,
    support_vector_classifier_path,
    random_forest_path,
) = (
    os.path.join(checkpoint_path, subdir)
    for subdir in (
        'logistic_regression',
        'support_vector_classifier',
        'random_forest',
    )
)
logs_path = os.path.join(basedir, 'logs')

# Model selected for this run; the names listed as supported are
# 'logRegr', 'svc' and 'RandomForest'.
model = 'svc'

# Load the training dataframe from the database and list the distinct users
# that have taken part in the experiment so far.
df = training_dataframe(mongodb_uri=MONGO_URI)
users = df['user_email'].unique()

n_loops = 5

# From here on the working directory is the checkpoints folder.
os.chdir(checkpoint_path)
for user in users:
    # Per-user accumulators — presumably precision / recall / F1 scores;
    # TODO confirm against the code that fills them.
    p, r, f1 = [], [], []
    # Binary relabelling of the dataframe for the current user.
    data = user_to_binary(df, user)
def callback(ch, method, properties, body):
    """Run grid search and training for every model, then flag the user.

    For each entry of the module-level ``models`` the matching grid-search
    script is executed, its JSON result file is read from ``logs_path``, and
    ``model_training`` is launched for every user that has hyperparameters in
    that file. Finally the MongoDB ``users`` document matching the JSON in
    ``body`` is updated with ``authenticable: True``.

    Args:
        ch: Unused by this function.
        method: Unused by this function.
        properties: Unused by this function.
        body: JSON text used as the MongoDB filter for the final update.
    """
    # Shell command and result file of the grid search for each model.
    grid_search_jobs = {
        'logRegr': ("python gridsearch_logRegr.py",
                    'logRegr_GridSearch.txt'),
        'svc': ("python gridsearch_svc.py",
                'svc_GridSearch.txt'),
        'RandomForest': ("python gridsearch_randomForest.py",
                         'randomForest_GridSearch.txt'),
    }

    # Data used for training every model.
    df = training_dataframe(mongodb_uri=MONGO_URI)
    users = df['user_email'].unique()

    for model in models:
        print('Lanzando GridSearch de Hiperparámetros para modelo ', model)
        if model not in grid_search_jobs:
            # Without this guard an unknown name would reuse the previous
            # model's results (or fail with NameError on the first one).
            print('No grid search available for model ', model,
                  '; skipping it')
            continue

        command, results_name = grid_search_jobs[model]
        os.system(command)

        # Load the best parameters found for each user.
        results_file = os.path.join(logs_path, results_name)
        with open(results_file, mode='r', encoding='utf-8') as f:
            grid_search = json.load(f)
        hyperparameters_by_user = {
            item['user']: item['hyperparameters'] for item in grid_search
        }

        # Training runs with the checkpoints folder as working directory.
        os.chdir(checkpoint_path)
        try:
            for user in users:
                print('Comenzando entrenamiento del algortimo ', model,
                      ' para usuario ', user)
                if user not in hyperparameters_by_user:
                    # Training here would silently reuse another user's
                    # hyperparameters, so the user is skipped instead.
                    print('No hyperparameters found for user ', user,
                          '; skipping training')
                    continue
                info = hyperparameters_by_user[user]

                # Binary classification problem for this user.
                data = user_to_binary(df, user)
                X_train, X_test, Y_train, Y_test = obtain_features(
                    dataframe=data, random_state=42)

                if model != 'RandomForest':
                    # Standardize the training split via the project helper,
                    # then scale the test split with the companion helper.
                    X_train = save_scaling(X_train)
                    X_test = load_scaling(X_test)

                # The training is launched for this user.
                model_training(x_train=X_train,
                               x_test=X_test,
                               y_train=Y_train,
                               y_test=Y_test,
                               user=user,
                               model=model,
                               info=info)
        finally:
            os.chdir(basedir)

    # Connection to MongoDB. There must be no trailing comma here: with one,
    # `client` becomes a tuple and the database lookup below raises TypeError.
    client = MongoClient(MONGO_URI)
    # The database name is the URI path without its leading slash.
    db_name = urlparse(MONGO_URI).path[1:]
    db = client[db_name]
    # Once training has finished, the user that triggered it is set to
    # authenticable.
    db.users.update_one(json.loads(body), {'$set': {'authenticable': True}})