def columns(self):
    """Return the column names of the selected dataset.

    When ``self.is_db`` is truthy the names are taken from the table loaded
    with ``get_table(self.db_url, self.table)``; otherwise they are read from
    the header of the CSV file at ``self.dataset.file``.

    Returns:
        list: The column labels, in dataset order.
    """
    if self.is_db:
        ds = get_table(self.db_url, self.table)
    else:
        # Only the header row is needed, so do not parse the data rows.
        ds = pd.read_csv(self.dataset.file, nrows=0)
    return ds.columns.to_list()
def index(request):
    """Drive the multi-step workflow rendered by ``msp/index.html``.

    The current position is kept in ``request.session['step']`` and every
    POST is dispatched on the form field it carries:

    * ``uploaded_file`` / ``db_connection`` / ``table`` -- step 1, choose the
      dataset (an uploaded file or a DBMS table).
    * ``mode`` -- step 2, choose ``'test'`` or ``'train'`` and, for training,
      where the labels come from.
    * ``transform_type`` / ``delete_transforms`` -- step 3, edit the list of
      transforms stored in the session.
    * ``model_type`` -- step 4, fit and save a model, or run a prediction.

    Args:
        request: The incoming request; its session is mutated in place.

    Returns:
        The response built by ``render`` for ``msp/index.html``; rejected
        input is reported through an ``*_error`` entry in the context.
    """
    if 'step' not in request.session:
        request.session['step'] = 1

    # Step 1: upload file directly
    if request.method == 'POST' and 'uploaded_file' in request.FILES:
        file = request.FILES['uploaded_file']
        print("get file: {}".format(file.name))
        print("type: {}".format(type(file)))
        # A missing or empty separator field falls back to a comma.
        sep = request.POST.get('sep') or ','
        ds = get_dataframe(file, sep=sep)
        if ds is None:
            print('dataframe {} isn\'t valid'.format(file.name))
            return render(request, 'msp/index.html', {
                'upload_error': 'dataframe {} isn\'t valid'.format(file.name)
            })

        print("dataframe:")
        print(ds.head())
        request.session['dataset_name'] = file.name
        request.session['ds_data'] = ds.to_json()
        request.session['sep'] = sep
        request.session['is_db'] = False
        if 'db_url' in request.session:
            del request.session['db_url']
        request.session['step'] = 2
        return render(request, 'msp/index.html')

    # Step 1: get database connection
    if request.method == 'POST' and 'db_connection' in request.POST:
        db_connection = request.POST['db_connection']
        print("get connection: {}".format(db_connection))
        engine = get_connector(db_connection)
        if engine is None:
            return render(
                request, 'msp/index.html',
                {'upload_error': 'url {} isn\'t valid'.format(db_connection)})

        request.session['db_url'] = db_connection
        request.session['is_db'] = True
        tables = get_tables(db_connection)
        return render(request, 'msp/index.html', {'tables': tables})

    # Step 1: get table in DBMS
    if request.method == 'POST' and 'table' in request.POST:
        table = request.POST['table']
        print("get table: {}".format(table))
        request.session['dataset_name'] = table
        request.session['is_db'] = True
        if 'ds_data' in request.session:
            del request.session['ds_data']
            # Tolerate a session that holds data but no separator.
            request.session.pop('sep', None)
        request.session['step'] = 2
        return render(request, 'msp/index.html')

    # Step 2: select modality
    if request.method == 'POST' and 'mode' in request.POST:
        mode = request.POST['mode']
        print("select modality: {}".format(mode))
        request.session['mode'] = mode

        if mode == 'test':
            request.session['step'] = 3
            return render(request, 'msp/index.html')

        elif mode == 'train':
            # The label fields are optional: read them with .get() so that a
            # form carrying only one of them still reaches its own case.
            label_table = request.POST.get('label_table')
            label_column = request.POST.get('label_column')

            # case 1: passing another table inside the DBMS
            if request.session['is_db'] and label_table:
                print('get label table: {}'.format(label_table))
                if not check_table(request.session['db_url'], label_table):
                    return render(
                        request, 'msp/index.html', {
                            'label_error':
                            'Table {} doesn\'t exist'.format(label_table)
                        })
                # NOTE(review): an existing label table is accepted but
                # nothing is stored and the step is not advanced.

            # case 2
            elif label_column:
                print('get label column: {}'.format(label_column))

                # check column on table in DBMS
                if request.session['is_db']:
                    print('check column in table {}'.format(
                        request.session['dataset_name']))
                    res = check_column(request.session['db_url'],
                                       request.session['dataset_name'],
                                       label_column)
                    if res:
                        labels = get_column(request.session['db_url'],
                                            request.session['dataset_name'],
                                            label_column)
                        print(labels)
                        request.session['labels'] = labels
                        request.session['is_label_column'] = label_column
                        request.session['step'] = 3
                        return render(request, 'msp/index.html')

                # check column on dataframe
                else:
                    print('check column in dataframe {}'.format(
                        request.session['dataset_name']))
                    ds = pd.read_json(request.session['ds_data'])
                    if label_column in ds.columns:
                        labels = ds[label_column].to_list()
                        print(ds[label_column])
                        request.session['labels'] = labels
                        request.session['is_label_column'] = label_column
                        request.session['step'] = 3
                        return render(request, 'msp/index.html')

                return render(request, 'msp/index.html', {
                    'label_error':
                    'Column {} doesnt exists'.format(label_column)
                })

            # case 3: upload label file
            elif 'label_dataframe' in request.FILES:
                file = request.FILES['label_dataframe']
                print("get label dataframe: {}".format(file.name))
                print(type(file))
                ds = get_dataframe(file, sep=',')
                if ds is not None:
                    # Inspect the frame only once it is known to exist.
                    print(ds.head())
                    labels = ds.iloc[:, 0].to_list()
                    print(ds.iloc[:, 0])
                    request.session['labels'] = labels
                    request.session['is_label_column'] = False
                    request.session['step'] = 3
                    return render(request, 'msp/index.html')

                return render(request, 'msp/index.html',
                              {'label_error': 'Label Dataframe error'})

        else:
            return render(
                request, 'msp/index.html',
                {'label_error': 'Mode {} doesnt exists'.format(mode)})

    # Step 3: add transformations
    if request.method == 'POST' and 'transform_type' in request.POST:
        transform_type = request.POST['transform_type']
        transform_column = request.POST['transform_column']

        if request.session['is_db']:
            print('check column {} in table {}'.format(
                transform_column, request.session['dataset_name']))
            error = not check_column(request.session['db_url'],
                                     request.session['dataset_name'],
                                     transform_column)
        else:
            print('check column {} in dataframe'.format(transform_column))
            ds = pd.read_json(request.session['ds_data'])
            error = transform_column not in ds.columns

        if error:
            return render(
                request, 'msp/index.html', {
                    'transform_error':
                    'Impossible apply transform {} {}'.format(
                        transform_type, transform_column)
                })

        if 'transforms' not in request.session:
            print('create new transform variable')
            request.session['transforms'] = []
        print(request.session['transforms'])
        # Re-assign the list so the session registers the modification.
        transforms = request.session['transforms']
        transforms.append({
            'transform_type': transform_type,
            'transform_column': transform_column
        })
        request.session['transforms'] = transforms
        print(request.session['transforms'])

    # Step 3: remove transformations
    if request.method == 'POST' and 'delete_transforms' in request.POST:
        print('delete all transforms')
        if 'transforms' in request.session:
            del request.session['transforms']

    # Step 4: execute
    if request.method == 'POST' and 'model_type' in request.POST:
        print('run model')
        model_type = request.POST['model_type']
        model_file = None
        if request.session['mode'] == 'test':
            if 'uploaded_model' not in request.FILES:
                return render(request, 'msp/index.html',
                              {'model_error': 'No pre trained model uploaded'})
            # The uploaded model is used directly; it is not put in the
            # session.
            model_file = request.FILES['uploaded_model']

        run_db = False
        if request.session['mode'] == 'test' and request.session['is_db']:
            # An absent field is treated as "do not run in the DBMS".
            run_db = request.POST.get('run_db', False)

        request.session['model_type'] = model_type
        request.session['run_db'] = run_db
        request.session['step'] = 4

        manager = MLManager()
        # NOTE(review): the model is fixed here regardless of model_type.
        manager.select_model('GradientBoostingRegressor')

        if request.session['mode'] == 'train':
            if request.session['is_db']:
                ds = get_table(request.session['db_url'],
                               request.session['dataset_name'])
            else:
                ds = pd.read_json(request.session['ds_data'])
            print(ds.head(10))
            print(request.session['is_label_column'])
            # When the labels came from a dataset column, drop that column
            # before fitting.
            if request.session['is_label_column']:
                del ds[request.session['is_label_column']]

            model = manager.fit(ds, request.session['labels'])
            save_model(model, "data/model_{}.joblib".format(model_type))
            print('save_model')

        elif request.session['mode'] == 'test':
            if request.session['is_db']:
                print(
                    'runjdfisbgsijkbgjsdlgosdbgoisdhgilsdhgodspngfposifdòlhgaofdis'
                )
                columns = list(
                    get_table(request.session['db_url'],
                              request.session['dataset_name']).columns)
                print(columns)
                query = manager.generate_query(model_file,
                                               request.session['dataset_name'],
                                               columns)
                print(query)
                ds = execute_query(request.session['db_url'], query)
                print(ds)
            else:
                ds = pd.read_json(request.session['ds_data'])
                y_pred = manager.predict(ds, model_file)
                y_pred = pd.Series(y_pred, )
                y_pred.to_csv("data/{}_prediction.csv".format(model_type),
                              index=False)
                print('save_result')

    return render(
        request,
        'msp/index.html',
    )
def main():
    """Run the training workflow for the scenario from ``get_train_scenario``.

    Reads the dataset and its labels, fits the selected model with the
    configured transforms, prints the elapsed time and (when a test split
    and a metric are available) the evaluation, then saves the fitted model
    to ``../data/result/train.joblib``.

    Raises:
        ValueError: If the dataset or the labels cannot be read, if the label
            column is missing from the dataset, or if
            ``scenario['labels_type']`` is not ``'file'``, ``'table'`` or
            ``'column'``.
    """
    print('Workflow Training')
    t_start = datetime.now()

    scenario = get_train_scenario()
    check_train_consistency(scenario)

    # Dataset
    print('Get Dataset')
    ds = get_dataset(scenario)
    if ds is None:
        raise ValueError('Impossible read the selected dataset')
    features = ds.columns.to_list()

    # Label
    print('Get Label')
    labels_type = scenario['labels_type']
    if labels_type == 'file':
        # Get labels from file
        labels = get_dataframe(scenario['labels'])
        if labels is None:
            raise ValueError('Impossible read label from selected file')
        # Get first column from file
        labels = labels.iloc[:, 0].to_list()

    elif labels_type == 'table':
        # Get labels from table
        labels = get_table(scenario['db_url'], scenario['labels'])
        # Check the object that was just loaded, not the dataset.
        if labels is None:
            raise ValueError(
                'Impossible read labels from DBMS, please check DBMS input')
        # Get first column from table
        labels = labels.iloc[:, 0].to_list()

    elif labels_type == 'column':
        # Get labels from column
        if scenario['labels'] not in features:
            raise ValueError(
                'The selected column {} isn\'t valid for label'.format(
                    scenario['labels']))
        labels = ds[scenario['labels']].to_list()
        # Remove label column from dataset
        features.remove(scenario['labels'])

    else:
        raise ValueError('Selected the wrong label type')

    # Get train and test from dataset
    x_train, y_train, x_test, y_test = get_train_test(ds[features], labels,
                                                      scenario['validation'])

    # ML Manager
    print('Create ML Manager')
    manager = MLManager()
    # Set ML model
    manager.select_model(scenario['model'])
    # Set ML transforms
    manager.set_transforms(scenario['transforms'])

    # Training
    print('Training...')
    model = manager.fit(x_train, y_train)

    # Finish training
    t_end = datetime.now()
    print('Execution Time: ', t_end - t_start)

    # Compute evaluation
    if y_test and scenario['metric']:
        y_pred = model.predict(x_test)
        res_evaluation = manager.evaluate(scenario['metric'], y_test, y_pred)
        print('Evaluation: ', res_evaluation)

    print('Save Result Model: train.joblib')
    trained_pipeline_name = '../data/result/train.joblib'
    save_model(model, trained_pipeline_name)

    print(':)')
def post(self, request):
    """Time prediction through the ML library and through the DBMS.

    Builds a simulation scenario from ``request.data``, loads the scenario
    table and then, for up to ``scenario['batch_number']`` batches, measures
    the duration of ``manager.predict`` and of the generated queries.

    Returns:
        Response: ``200`` with the mean ML time (plus the table loading
        time) and the mean DBMS time; ``400`` when the scenario is
        inconsistent, the table cannot be read, or no batch was processed.
    """
    # Params
    scenario = create_simulation_scenario(request.data)

    try:
        check_simulation_consistency(scenario)
    except ValueError as e:
        return Response({
            'detail': str(e),
        },
                        status=status.HTTP_400_BAD_REQUEST)

    # Get Dataset
    t_load_start = datetime.now()
    ds = get_table(scenario['db_url'], scenario['table'])
    if ds is None:
        return Response(
            {
                'detail': 'Impossible read the selected table',
            },
            status=status.HTTP_400_BAD_REQUEST)

    features = ds.columns.to_list()
    # Remove Label column if exists
    if scenario['labels_type'] == 'column' and scenario['labels'] in features:
        features.remove(scenario['labels'])
    t_load_end = datetime.now()

    # ML Manager
    manager = MLManager()

    t_ml = []
    t_db = []

    # Testing Phase
    for i in range(scenario['batch_number']):
        ds_batch = get_batch(ds, i, scenario['batch_size'])
        if ds_batch.empty:
            break

        # Execute predict using MLManager and ML Library
        # NOTE(review): this predicts on the whole dataset, and the query
        # below targets scenario['table'], although a per-batch frame and a
        # 'batch' table are prepared -- confirm which one is intended.
        t_start = datetime.now()
        _ = manager.predict(ds[features], scenario['pipeline'].file)
        t_end = datetime.now()
        t_ml.append(t_end - t_start)

        # Create Batch for DBMS
        ds_batch.to_sql('batch',
                        con=scenario["db_url"],
                        if_exists="replace",
                        index=False)

        # Generate query using MLManager
        dbms = DBMSUtils.get_dbms_from_str_connection(scenario['db_url'])
        queries, _ = manager.generate_query(scenario['pipeline'].file,
                                            scenario['table'], features, dbms,
                                            scenario['optimizer'])

        # Execute query
        t_start = datetime.now()
        _ = execute_multi_queries(scenario["db_url"], queries)
        t_end = datetime.now()
        t_db.append(t_end - t_start)

    # Without any measurement the means below are undefined.
    if not t_ml:
        return Response(
            {
                'detail': 'No batch has been processed',
            },
            status=status.HTTP_400_BAD_REQUEST)

    # Finish Simulation
    return Response(
        {
            'detail': 'Successfully predicted result',
            'ml_results': {
                'execution_time':
                (np.mean(t_ml) + (t_load_end - t_load_start))
            },
            'dbms_results': {
                'execution_time': np.mean(t_db)
            },
        },
        status=status.HTTP_200_OK)
def get_dataset(scenario):
    """Load the dataset a scenario points at.

    Returns the result of ``get_table`` for ``scenario.db_url`` and
    ``scenario.table`` when ``scenario.is_db`` is truthy, otherwise the
    result of ``get_dataframe`` for ``scenario.dataset.file``.
    """
    if not scenario.is_db:
        return get_dataframe(scenario.dataset.file)
    return get_table(scenario.db_url, scenario.table)
def post(self, request):
    """Run a test scenario and store its predictions and result.

    Loads the pipeline file, predicts either through the generated queries
    (``scenario.run_db``) or through ``MLManager.predict``, optionally
    evaluates the predictions against the labels, saves the predictions as a
    ``Document`` and records a ``ResultScenario``.

    Returns:
        Response: ``201`` with the prediction file name and the scenario id,
        or ``400`` when the scenario, the pipeline file, the dataset or the
        labels are not valid.
    """
    scenario = create_test_scenario(request.data)
    inference_time = 0

    try:
        check_test_consistency(scenario)
    except ValueError as e:
        return Response({
            'detail': str(e),
        },
                        status=status.HTTP_400_BAD_REQUEST)

    # Load pipeline model from pipeline file
    pipeline = load_model(scenario.pipeline.file)
    if not pipeline:
        return Response(
            {'detail': 'The selected file isn\'t a pipeline objects'},
            status=status.HTTP_400_BAD_REQUEST)

    # Extract pipeline information from loaded model
    pipeline = MLManager.extract_pipeline_components(pipeline)
    if not pipeline:
        return Response(
            {'detail': 'The selected file isn\'t a pipeline objects'},
            status=status.HTTP_400_BAD_REQUEST)

    # Model params
    scenario.model = pipeline.get('model')
    scenario.transforms = json.dumps(pipeline.get('transforms', []))

    # Dataset
    if scenario.run_db:
        # Get features from table
        features = get_columns(scenario.db_url, scenario.table)
    else:
        data_extractor_start = datetime.now()
        # Get Dataset
        ds = get_dataset(scenario)
        if ds is None:
            return Response(
                {
                    'detail': 'Impossible read the selected dataset',
                },
                status=status.HTTP_400_BAD_REQUEST)
        features = ds.columns.to_list()
        data_extractor_end = datetime.now()
        data_extractor_time = (data_extractor_end -
                               data_extractor_start).total_seconds()
        inference_time += data_extractor_time

    # Remove Label column if exists
    if scenario.labels_type == 'column' and scenario.labels in features:
        features.remove(scenario.labels)

    # ML Manager
    manager = MLManager()

    # Testing Phase
    query = None
    inference_start = datetime.now()
    if scenario.run_db:
        # Generate query using MLManager
        dbms = DBMSUtils.get_dbms_from_str_connection(scenario.db_url)
        queries, query = manager.generate_query(scenario.pipeline.file,
                                                scenario.table, features,
                                                dbms, scenario.optimizer)

        # Execute query
        y_pred = execute_multi_queries(scenario.db_url, queries)
        y_pred = pd.Series(y_pred.iloc[:, 0], name='Label')
    else:
        # Execute predict using MLManager and ML Library
        y_pred = manager.predict(ds[features], scenario.pipeline.file)
        y_pred = pd.Series(y_pred, name='Label')
    inference_end = datetime.now()
    inference_time += (inference_end - inference_start).total_seconds()

    # Label
    labels = []

    # Compute evaluation
    if scenario.labels_type:
        if scenario.labels_type == 'file':
            # Get labels from file
            labels_document = get_document_object(scenario.labels)
            labels = get_dataframe(labels_document.file)
            if labels is None:
                return Response(
                    {
                        'detail':
                        'The selected labels file {} isn\'t valid'.format(
                            labels_document.filename)
                    },
                    status=status.HTTP_400_BAD_REQUEST)

            # Get first column from file
            labels = labels.iloc[:, 0].to_list()

        elif scenario.labels_type == 'table':
            # Get labels from table
            labels = get_table(scenario.db_url, scenario.labels)
            # `not labels` would raise on a DataFrame (ambiguous truth
            # value), so test for a missing or empty table explicitly.
            if labels is None or labels.empty:
                return Response(
                    {
                        'detail':
                        'The selected table {} isn\'t valid for label'.
                        format(scenario.labels)
                    },
                    status=status.HTTP_400_BAD_REQUEST)

            # Get first column from table
            labels = labels.iloc[:, 0].to_list()

        elif scenario.labels_type == 'column' and not scenario.run_db:
            # Get labels from column
            labels = ds[scenario.labels].to_list()

        elif scenario.labels_type == 'column' and scenario.run_db:
            # Get labels from table
            labels = get_column(scenario.db_url, scenario.table,
                                scenario.labels)
        else:
            return Response({'detail': 'Select the wrong labels type'},
                            status=status.HTTP_400_BAD_REQUEST)

    # Compute evaluation
    res_evaluation = None
    if labels and scenario.metric:
        res_evaluation = manager.evaluate(scenario.metric, labels, y_pred)

    # Create predictions file
    test_result_name = "test_{}_{}.csv".format(scenario.model,
                                               datetime.now())
    test_result_name = test_result_name.replace(' ', '_')
    test_result_name = test_result_name.replace(':', '_')
    y_pred.to_csv(test_result_name, index=False, header=True)

    try:
        # Save predictions in Document model
        with open(test_result_name, 'rb') as f:
            document = Document(file=File(f), filename=test_result_name)
            document.save()
    finally:
        # Remove the temporary predictions file even if saving failed.
        os.remove(test_result_name)

    # Save Scenario model
    scenario.save()

    # Save ResultScenario
    n_predictions = len(y_pred)
    result_scenario = ResultScenario()
    result_scenario.scenario = scenario
    result_scenario.execution_time = inference_time
    # Avoid a division by zero when nothing was predicted.
    result_scenario.throughput = (inference_time / n_predictions
                                  if n_predictions else 0)
    result_scenario.score = res_evaluation
    result_scenario.file_result = document.filename
    result_scenario.query = query
    result_scenario.save()

    return Response(
        {
            'detail': 'Successfully predicted result',
            'filename': test_result_name,
            'scenario_id': scenario.id
        },
        status=status.HTTP_201_CREATED)
def post(self, request):
    """Train a pipeline for the submitted scenario and store the result.

    Reads the dataset and labels, splits them with ``get_train_test``, fits
    the selected model with the scenario transforms, optionally evaluates it
    on the test split, saves the fitted pipeline as a ``Document`` and
    records a ``ResultScenario``.

    Returns:
        Response: ``201`` with the pipeline file name and the scenario id,
        or ``400`` when the scenario, the dataset or the labels are not
        valid.
    """
    t_start = datetime.now()

    scenario: Scenario = create_train_scenario(request.data)

    try:
        check_train_consistency(scenario)
    except ValueError as e:
        return Response({
            'detail': str(e),
        },
                        status=status.HTTP_400_BAD_REQUEST)

    # Dataset
    print('Get Dataset')
    ds = get_dataset(scenario)
    if ds is None:
        return Response(
            {
                'detail': 'Impossible read the selected dataset',
            },
            status=status.HTTP_400_BAD_REQUEST)

    features = ds.columns.to_list()

    # Label
    print('Get Label')
    if scenario.labels_type == 'file':
        # Get labels from file
        labels_document = get_document_object(scenario.labels)
        labels = get_dataframe(labels_document.file)
        if labels is None:
            return Response(
                {
                    'detail':
                    'The selected labels file {} isn\'t valid'.format(
                        labels_document.filename)
                },
                status=status.HTTP_400_BAD_REQUEST)

        # Get first column from file
        labels = labels.iloc[:, 0].to_list()

    elif scenario.labels_type == 'table':
        # Get labels from table
        labels = get_table(scenario.db_url, scenario.labels)
        # `not labels` would raise on a DataFrame (ambiguous truth value),
        # so test for a missing or empty table explicitly.
        if labels is None or labels.empty:
            return Response(
                {
                    'detail':
                    'The selected table {} isn\'t valid for label'.format(
                        scenario.labels)
                },
                status=status.HTTP_400_BAD_REQUEST)

        # Get first column from table
        labels = labels.iloc[:, 0].to_list()

    elif scenario.labels_type == 'column':
        # Get labels from column
        if scenario.labels not in ds.columns:
            return Response(
                {
                    'detail':
                    'The selected column {} isn\'t valid for label'.format(
                        scenario.labels)
                },
                status=status.HTTP_400_BAD_REQUEST)

        labels = ds[scenario.labels].to_list()

        # Remove label column from dataset
        features.remove(scenario.labels)

    else:
        return Response({'detail': 'Select the wrong labels type'},
                        status=status.HTTP_400_BAD_REQUEST)

    # Get train and test from dataset
    x_train, y_train, x_test, y_test = get_train_test(
        ds[features], labels, scenario.validation)

    # ML Manager
    manager = MLManager()
    # Set ML model
    manager.select_model(scenario.model)
    # Set ML transforms
    manager.set_transforms(json.loads(scenario.transforms))

    # Training
    model = manager.fit(x_train, y_train)

    # Finish training
    t_end = datetime.now()

    # Compute evaluation
    res_evaluation = None
    if y_test and scenario.metric:
        y_pred = model.predict(x_test)
        res_evaluation = manager.evaluate(scenario.metric, y_test, y_pred)

    # Create joblib file
    trained_pipeline_name = "train_{}_{}.joblib".format(
        scenario.model, datetime.now())
    trained_pipeline_name = trained_pipeline_name.replace(' ', '_')
    trained_pipeline_name = trained_pipeline_name.replace(':', '_')
    save_model(model, trained_pipeline_name)

    try:
        # Save trained pipeline in Document model
        with open(trained_pipeline_name, 'rb') as f:
            document = Document(file=File(f), filename=trained_pipeline_name)
            document.save()
    finally:
        # Remove the temporary joblib file even if saving failed.
        os.remove(trained_pipeline_name)

    # Save Scenario model
    scenario.save()

    # Save ResultScenario
    n_train = len(x_train)
    result_scenario = ResultScenario()
    result_scenario.scenario = scenario
    result_scenario.execution_time = (t_end - t_start).total_seconds()
    # Avoid a division by zero on an empty training split.
    result_scenario.throughput = (result_scenario.execution_time / n_train
                                  if n_train else 0)
    result_scenario.score = res_evaluation
    result_scenario.file_result = document.filename
    result_scenario.save()

    return Response(
        {
            'detail': 'Pipeline trained correctly',
            'filename': trained_pipeline_name,
            'scenario_id': scenario.id
        },
        status=status.HTTP_201_CREATED)
def main():
    """Run the testing workflow for the scenario from ``get_test_scenario``.

    Predicts either through a generated query (``scenario['run_db']``) or
    through ``MLManager.predict``, writes the predictions to
    ``../data/result/test.csv`` and, when a label type is configured, prints
    the evaluation of the predictions against the labels.

    Raises:
        ValueError: If the dataset or the labels cannot be read, or if the
            configured label type is not supported.
    """
    print('Workflow Testing')
    t_start = datetime.now()

    scenario = get_test_scenario()
    check_test_consistency(scenario)

    # Dataset
    if scenario['run_db']:
        # Get features from table
        print('Extract Features')
        features = get_columns(scenario['db_url'], scenario['table'])
    else:
        # Get Dataset
        print('Get Dataset and Features')
        ds = get_dataset(scenario)
        if ds is None:
            raise ValueError('Impossible read the selected dataset')
        features = ds.columns.to_list()

    # Remove Label column if exists
    if scenario['labels_type'] == 'column' and scenario['labels'] in features:
        features.remove(scenario['labels'])

    # ML Manager
    print('Create ML Manager')
    manager = MLManager()

    # Testing Phase
    print('Testing...')
    if scenario['run_db']:
        # Generate query using MLManager
        print('Query Generation...')
        query = manager.generate_query(scenario['pipeline'],
                                       scenario['table'], features)
        # Execute query
        print('Query Execution...')
        y_pred = execute_query(scenario['db_url'], query)
        y_pred = pd.Series(y_pred.iloc[:, 0], name='Label')
    else:
        # Execute predict using MLManager and ML Library
        y_pred = manager.predict(ds[features], scenario['pipeline'])
        y_pred = pd.Series(y_pred.flatten(), name='Label')

    # Finish testing
    t_end = datetime.now()
    print('Execution Time: ', t_end - t_start)

    print('Save prediction: test.csv')
    test_result_name = '../data/result/test.csv'
    y_pred.to_csv(test_result_name, index=False, header=True)

    # Compute evaluation
    if scenario['labels_type']:
        # Label
        print('Get Label')
        if scenario['labels_type'] == 'file':
            # Get labels from file
            labels = get_dataframe(scenario['labels'])
            if labels is None:
                raise ValueError('Impossible read label from selected file')
            # Get first column from file
            labels = labels.iloc[:, 0].to_list()

        elif scenario['labels_type'] == 'table':
            # Get labels from table
            labels = get_table(scenario['db_url'], scenario['labels'])
            # Check the object that was just loaded; the dataset variable is
            # not even bound when the prediction ran in the DBMS.
            if labels is None:
                raise ValueError(
                    'Impossible read labels from DBMS, please check DBMS input'
                )
            # Get first column from table
            labels = labels.iloc[:, 0].to_list()

        elif scenario['labels_type'] == 'column' and not scenario['is_db']:
            # Get labels from column
            labels = ds[scenario['labels']].to_list()

        elif scenario['labels_type'] == 'column' and scenario['is_db']:
            # Get labels from table
            labels = get_column(scenario['db_url'], scenario['table'],
                                scenario['labels'])
        else:
            raise ValueError('Select the wrong label type')

        res_evaluation = manager.evaluate(scenario['metric'], labels, y_pred)
        print('Evaluation: ', res_evaluation)

    print(':)')