Example #1
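# NOTE: this snippet assumes `from datetime import datetime` and `import numpy as np`;
# get_simulation_scenario, get_dataset, get_batch, execute_query and MLManager are
# project-local helpers whose import paths depend on the repo layout.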
def main():
    print('Workflow Simulation')
    scenario = get_simulation_scenario()

    # Dataset
    print('Get Dataset and Features')
    t_load_start = datetime.now()
    ds = get_dataset(scenario)
    features = ds.columns.to_list()
    t_load_end = datetime.now()

    # ML Manager
    print('Create ML Manager')
    manager = MLManager()

    t_ml = []
    t_db = []

    # Testing Phase
    print('Simulation...')
    for i in range(scenario['batch_number']):
        ds_batch = get_batch(ds, i, scenario['batch_size'])
        if ds_batch.empty:
            break

        # Execute predict on the current batch using MLManager and the ML library
        print('ML Prediction...')
        t_start = datetime.now()
        _ = manager.predict(ds_batch[features], scenario['pipeline'])
        t_end = datetime.now()

        t_ml.append(t_end - t_start)

        # Write the current batch to the DBMS
        ds_batch.to_sql('batch',
                        con=scenario["db_url"],
                        if_exists="replace",
                        index=False)

        # Generate query using MLManager against the 'batch' table just written
        print('Query Generation...')
        query = manager.generate_query(scenario['pipeline'], 'batch', features)

        # Execute query
        print('Query Execution...')
        t_start = datetime.now()
        _ = execute_query(scenario['db_url'], query)
        t_end = datetime.now()

        t_db.append(t_end - t_start)

    # Finish Simulation
    print('ML Execution Time: ', np.mean(t_ml) + (t_load_end - t_load_start))
    print('DB Execution Time: ', np.mean(t_db))

    print(':)')
Example #2
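    # NOTE: `get` is a method of a Django REST Framework APIView subclass (class
    # definition omitted); it assumes `from django.http import Http404`, DRF's
    # `Response` and `status`, plus the project-local helpers get_document_object,
    # load_model and MLManager.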
    def get(self, request, filename):
        # Retrieve pipeline document object
        document = get_document_object(filename)
        if not document:
            raise Http404

        # Load pipeline model from pipeline file
        pipeline = load_model(document.file)
        if not pipeline:
            return Response(
                {'detail': 'The selected file isn\'t a pipeline object'},
                status=status.HTTP_400_BAD_REQUEST)

        # Extract pipeline information from loaded model
        pipeline = MLManager.extract_pipeline_components(pipeline)
        if not pipeline:
            return Response(
                {'detail': 'The selected file isn\'t a pipeline object'},
                status=status.HTTP_400_BAD_REQUEST)

        return Response(pipeline, status=status.HTTP_200_OK)
Example #3
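# NOTE: this Django view assumes `from django.shortcuts import render` and
# `import pandas as pd`; get_dataframe, get_connector, get_tables, check_table,
# check_column, get_column, get_table, save_model, execute_query and MLManager
# are project-local helpers.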
def index(request):
    if 'step' not in request.session:
        request.session['step'] = 1

    # Step 1: upload file directly
    if request.method == 'POST' and 'uploaded_file' in request.FILES:
        file = request.FILES['uploaded_file']
        print("get file: {}".format(file.name))
        print("type: {}".format(type(file)))
        sep = request.POST.get('sep') or ','
        ds = get_dataframe(file, sep=sep)

        if ds is not None:
            print("dataframe:")
            print(ds.head())

            request.session['dataset_name'] = file.name
            request.session['ds_data'] = ds.to_json()
            request.session['sep'] = sep

            request.session['is_db'] = False

            if 'db_url' in request.session:
                del request.session['db_url']

            request.session['step'] = 2
            return render(request, 'msp/index.html')
        else:
            print('dataframe {} isn\'t valid'.format(file.name))
            return render(request, 'msp/index.html', {
                'upload_error':
                'dataframe {} isn\'t valid'.format(file.name)
            })

    # Step 1: get database connection
    if request.method == 'POST' and 'db_connection' in request.POST:
        db_connection = request.POST['db_connection']
        print("get connection: {}".format(db_connection))

        engine = get_connector(db_connection)
        if engine is not None:
            request.session['db_url'] = db_connection
            request.session['is_db'] = True

            tables = get_tables(db_connection)
            return render(request, 'msp/index.html', {'tables': tables})
        else:
            return render(
                request, 'msp/index.html',
                {'upload_error': 'url {} isn\'t valid'.format(db_connection)})

    # Step 1: get table in DBMS
    if request.method == 'POST' and 'table' in request.POST:
        table = request.POST['table']
        print("get table: {}".format(table))

        request.session['dataset_name'] = table
        request.session['is_db'] = True

        if 'ds_data' in request.session:
            del request.session['ds_data']
            del request.session['sep']

        request.session['step'] = 2
        return render(request, 'msp/index.html')

    # Step 2: select modality
    if request.method == 'POST' and 'mode' in request.POST:
        mode = request.POST['mode']
        print("select modality: {}".format(mode))

        request.session['mode'] = mode

        if mode == 'test':
            request.session['step'] = 3
            return render(request, 'msp/index.html')

        elif mode == 'train':

            # case 1: labels come from another table inside the DBMS
            if request.session['is_db'] and request.POST.get('label_table'):
                label_table = request.POST['label_table']
                print('get label table: {}'.format(label_table))

                if not check_table(request.session['db_url'], label_table):
                    return render(
                        request, 'msp/index.html', {
                            'label_error':
                            'Table {} doesn\'t exist'.format(label_table)
                        })

                # Load labels from the first column of the label table,
                # mirroring the other label cases below
                labels = get_table(request.session['db_url'],
                                   label_table).iloc[:, 0].to_list()
                request.session['labels'] = labels
                request.session['is_label_column'] = False

                request.session['step'] = 3
                return render(request, 'msp/index.html')

            # case 2
            elif request.POST.get('label_column'):
                label_column = request.POST['label_column']
                print('get label column: {}'.format(label_column))

                # check column on table in DBMS
                if request.session['is_db']:
                    print('check column in table {}'.format(
                        request.session['dataset_name']))
                    res = check_column(request.session['db_url'],
                                       request.session['dataset_name'],
                                       label_column)
                    if res:
                        labels = get_column(request.session['db_url'],
                                            request.session['dataset_name'],
                                            label_column)
                        print(labels)
                        request.session['labels'] = labels
                        request.session['is_label_column'] = label_column

                        request.session['step'] = 3
                        return render(request, 'msp/index.html')

                # check column on dataframe
                else:
                    print('check column in dataframe {}'.format(
                        request.session['dataset_name']))

                    ds = pd.read_json(request.session['ds_data'])
                    if label_column in ds.columns:
                        labels = ds[label_column].to_list()
                        print(ds[label_column])

                        request.session['labels'] = labels
                        request.session['is_label_column'] = label_column

                        request.session['step'] = 3
                        return render(request, 'msp/index.html')

                return render(request, 'msp/index.html', {
                    'label_error':
                    'Column {} doesn\'t exist'.format(label_column)
                })

            # case 3: upload label file
            elif 'label_dataframe' in request.FILES:
                file = request.FILES['label_dataframe']
                print("get label dataframe: {}".format(file.name))
                print(type(file))

                ds = get_dataframe(file, sep=',')

                if ds is not None:
                    print(ds.head())
                    labels = ds.iloc[:, 0].to_list()
                    print(ds.iloc[:, 0])

                    request.session['labels'] = labels
                    request.session['is_label_column'] = False

                    request.session['step'] = 3
                    return render(request, 'msp/index.html')

                return render(request, 'msp/index.html',
                              {'label_error': 'Label Dataframe error'})

        else:
            return render(
                request, 'msp/index.html',
                {'label_error': 'Mode {} doesn\'t exist'.format(mode)})

    # Step 3: add transformations
    if request.method == 'POST' and 'transform_type' in request.POST:
        transform_type = request.POST['transform_type']
        transform_column = request.POST['transform_column']
        error = False

        if request.session['is_db']:
            print('check column {} in table {}'.format(
                transform_column, request.session['dataset_name']))
            res = check_column(request.session['db_url'],
                               request.session['dataset_name'],
                               transform_column)
            if not res:
                error = True
        else:
            print('check column {} in dataframe'.format(transform_column))
            ds = pd.read_json(request.session['ds_data'])
            if transform_column not in ds.columns:
                error = True

        if error:
            return render(
                request, 'msp/index.html', {
                    'transform_error':
                    'Unable to apply transform {} on column {}'.format(
                        transform_type, transform_column)
                })
        else:

            if 'transforms' not in request.session:
                print('create new transform variable')
                request.session['transforms'] = []

            print(request.session['transforms'])
            transforms = request.session['transforms']
            transforms.append({
                'transform_type': transform_type,
                'transform_column': transform_column
            })
            # Reassign so Django marks the session as modified
            request.session['transforms'] = transforms
            print(request.session['transforms'])

    # Step 3: remove transformations
    if request.method == 'POST' and 'delete_transforms' in request.POST:
        print('delete all transforms')
        if 'transforms' in request.session:
            del request.session['transforms']

    # Step 4: execute
    if request.method == 'POST' and 'model_type' in request.POST:
        print('run model')

        model_type = request.POST['model_type']

        model_file = None
        if request.session['mode'] == 'test':
            if 'uploaded_model' not in request.FILES:
                return render(request, 'msp/index.html',
                              {'model_error': 'No pre-trained model uploaded'})

            model_file = request.FILES['uploaded_model']
            # request.session['model_file'] = model_file

        run_db = False
        if request.session['mode'] == 'test' and request.session['is_db']:
            run_db = request.POST.get('run_db', False)

        request.session['model_type'] = model_type
        request.session['run_db'] = run_db
        request.session['step'] = 4

        manager = MLManager()
        # Use the model type selected by the user
        manager.select_model(model_type)

        if request.session['mode'] == 'train':
            if request.session['is_db']:
                ds = get_table(request.session['db_url'],
                               request.session['dataset_name'])
            else:
                ds = pd.read_json(request.session['ds_data'])

            print(ds.head(10))
            print(request.session['is_label_column'])
            if request.session['is_label_column']:
                del ds[request.session['is_label_column']]

            model = manager.fit(ds, request.session['labels'])
            save_model(model, "data/model_{}.joblib".format(model_type))
            print('save_model')

        elif request.session['mode'] == 'test':
            if request.session['is_db']:
                columns = list(
                    get_table(request.session['db_url'],
                              request.session['dataset_name']).columns)
                print(columns)
                query = manager.generate_query(model_file,
                                               request.session['dataset_name'],
                                               columns)
                print(query)
                ds = execute_query(request.session['db_url'], query)
                print(ds)

            else:
                ds = pd.read_json(request.session['ds_data'])
                y_pred = manager.predict(ds, model_file)
                y_pred = pd.Series(y_pred)
                y_pred.to_csv("data/{}_prediction.csv".format(model_type),
                              index=False)
                print('save_result')

    return render(request, 'msp/index.html')
Example #4
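# NOTE: this snippet assumes `from datetime import datetime`; get_train_scenario,
# check_train_consistency, get_dataset, get_dataframe, get_table, get_train_test,
# save_model and MLManager are project-local helpers.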
def main():
    print('Workflow Training')
    t_start = datetime.now()

    scenario = get_train_scenario()
    check_train_consistency(scenario)

    # Dataset
    print('Get Dataset')
    ds = get_dataset(scenario)
    features = ds.columns.to_list()

    # Label
    print('Get Label')

    if scenario['labels_type'] == 'file':
        # Get labels from file
        labels = get_dataframe(scenario['labels'])

        if labels is None:
            raise ValueError('Unable to read labels from the selected file')

        # Get first column from file
        labels = labels.iloc[:, 0].to_list()

    elif scenario['labels_type'] == 'table':
        # Get labels from table
        labels = get_table(scenario['db_url'], scenario['labels'])
        if labels is None:
            raise ValueError(
                'Unable to read labels from the DBMS, please check the DBMS input')

        # Get first column from table
        labels = labels.iloc[:, 0].to_list()

    elif scenario['labels_type'] == 'column':
        # Get labels from column
        labels = ds[scenario['labels']].to_list()
        # Remove label column from dataset
        features.remove(scenario['labels'])

    else:
        raise ValueError('Unknown label type selected')

    # Get train and test from dataset
    x_train, y_train, x_test, y_test = get_train_test(ds[features], labels,
                                                      scenario['validation'])

    # ML Manager
    print('Create ML Manager')
    manager = MLManager()
    # Set ML model
    manager.select_model(scenario['model'])

    # Set ML transforms
    manager.set_transforms(scenario['transforms'])

    # Training
    print('Training...')
    model = manager.fit(x_train, y_train)

    # Finish training
    t_end = datetime.now()
    print('Execution Time: ', t_end - t_start)

    # Compute evaluation
    if y_test and scenario['metric']:
        y_pred = model.predict(x_test)
        res_evaluation = manager.evaluate(scenario['metric'], y_test, y_pred)
        print('Evaluation: ', res_evaluation)

    print('Save Result Model: train.joblib')
    trained_pipeline_name = '../data/result/train.joblib'
    save_model(model, trained_pipeline_name)
    print(':)')
Example #5
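    # NOTE: `post` is a method of a Django REST Framework APIView subclass (class
    # definition omitted); it assumes `from datetime import datetime`,
    # `import numpy as np`, DRF's `Response` and `status`, plus project-local
    # helpers (create_simulation_scenario, check_simulation_consistency, get_table,
    # get_batch, execute_multi_queries, DBMSUtils and MLManager).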
    def post(self, request):
        # Params
        scenario = create_simulation_scenario(request.data)

        try:
            check_simulation_consistency(scenario)
        except ValueError as e:
            return Response({
                'detail': str(e),
            },
                            status=status.HTTP_400_BAD_REQUEST)

        # Get Dataset
        t_load_start = datetime.now()
        ds = get_table(scenario['db_url'], scenario['table'])
        features = ds.columns.to_list()

        if scenario['labels_type'] == 'column':
            # Remove Label column if exists
            features.remove(scenario['labels'])

        t_load_end = datetime.now()

        # ML Manager
        manager = MLManager()

        t_ml = []
        t_db = []

        # Testing Phase
        for i in range(scenario['batch_number']):
            ds_batch = get_batch(ds, i, scenario['batch_size'])
            if ds_batch.empty:
                break

            # Execute predict on the current batch using MLManager and the ML library
            t_start = datetime.now()
            _ = manager.predict(ds_batch[features], scenario['pipeline'].file)
            t_end = datetime.now()

            t_ml.append(t_end - t_start)

            # Write the current batch to the DBMS
            ds_batch.to_sql('batch',
                            con=scenario["db_url"],
                            if_exists="replace",
                            index=False)

            # Generate query using MLManager against the 'batch' table just written
            dbms = DBMSUtils.get_dbms_from_str_connection(scenario['db_url'])
            queries, query = manager.generate_query(scenario['pipeline'].file,
                                                    'batch', features, dbms,
                                                    scenario['optimizer'])

            # Execute query
            t_start = datetime.now()
            _ = execute_multi_queries(scenario["db_url"], queries)
            t_end = datetime.now()

            t_db.append(t_end - t_start)

        # Finish Simulation
        return Response(
            {
                'detail': 'Successfully predicted result',
                'ml_results': {
                    'execution_time':
                    (np.mean(t_ml) + (t_load_end - t_load_start))
                },
                'dbms_results': {
                    'execution_time': np.mean(t_db)
                },
            },
            status=status.HTTP_200_OK)
Example #6
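    # NOTE: `post` is a method of a Django REST Framework APIView subclass (class
    # definition omitted); it assumes `import os`, `import json`, `import pandas as pd`,
    # `from datetime import datetime`, `from django.core.files import File`, DRF's
    # `Response` and `status`, plus project-local helpers and models
    # (create_test_scenario, load_model, get_dataset, get_columns, get_column,
    # get_table, get_document_object, get_dataframe, execute_multi_queries,
    # DBMSUtils, MLManager, Document, ResultScenario).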
    def post(self, request):

        scenario = create_test_scenario(request.data)
        inference_time = 0

        try:
            check_test_consistency(scenario)
        except ValueError as e:
            return Response({
                'detail': str(e),
            },
                            status=status.HTTP_400_BAD_REQUEST)

        # Load pipeline model from pipeline file
        pipeline = load_model(scenario.pipeline.file)
        if not pipeline:
            return Response(
                {'detail': 'The selected file isn\'t a pipeline object'},
                status=status.HTTP_400_BAD_REQUEST)

        # Extract pipeline information from loaded model
        pipeline = MLManager.extract_pipeline_components(pipeline)
        if not pipeline:
            return Response(
                {'detail': 'The selected file isn\'t a pipeline object'},
                status=status.HTTP_400_BAD_REQUEST)

        # Model params
        scenario.model = pipeline.get('model')
        scenario.transforms = json.dumps(pipeline.get('transforms', []))

        # Dataset
        if scenario.run_db:
            # Get features from table
            features = get_columns(scenario.db_url, scenario.table)

        else:
            data_extractor_start = datetime.now()
            # Get Dataset
            ds = get_dataset(scenario)
            features = ds.columns.to_list()

            data_extractor_end = datetime.now()
            data_extractor_time = (data_extractor_end -
                                   data_extractor_start).total_seconds()
            inference_time += data_extractor_time

        if scenario.labels_type == 'column':
            # Remove Label column if exists
            features.remove(scenario.labels)

        # ML Manager
        manager = MLManager()

        # Testing Phase
        query = None
        if scenario.run_db:

            inference_start = datetime.now()

            # Generate query using MLManager
            dbms = DBMSUtils.get_dbms_from_str_connection(scenario.db_url)
            queries, query = manager.generate_query(scenario.pipeline.file,
                                                    scenario.table, features,
                                                    dbms, scenario.optimizer)

            # Execute query
            y_pred = execute_multi_queries(scenario.db_url, queries)
            y_pred = pd.Series(y_pred.iloc[:, 0], name='Label')

            inference_end = datetime.now()
            inference_time += (inference_end - inference_start).total_seconds()

        else:

            inference_start = datetime.now()

            # Execute predict using MLManager and ML Library
            y_pred = manager.predict(ds[features], scenario.pipeline.file)
            y_pred = pd.Series(y_pred, name='Label')

            inference_end = datetime.now()
            inference_time += (inference_end - inference_start).total_seconds()

        # Label
        labels = []
        # Compute evaluation
        if scenario.labels_type:
            if scenario.labels_type == 'file':
                # Get labels from file
                labels_document = get_document_object(scenario.labels)
                labels = get_dataframe(labels_document.file)

                if labels is None:
                    return Response(
                        {
                            'detail':
                            'The selected labels file {} isn\'t valid'.format(
                                labels_document.filename)
                        },
                        status=status.HTTP_400_BAD_REQUEST)

                # Get first column from file
                labels = labels.iloc[:, 0].to_list()

            elif scenario.labels_type == 'table':
                # Get labels from table
                labels = get_table(scenario.db_url, scenario.labels)
                if labels is None:
                    return Response(
                        {
                            'detail':
                            'The selected table {} isn\'t valid for label'.
                            format(scenario.labels)
                        },
                        status=status.HTTP_400_BAD_REQUEST)

                # Get first column from table
                labels = labels.iloc[:, 0].to_list()

            elif scenario.labels_type == 'column' and not scenario.run_db:
                # Get labels from column
                labels = ds[scenario.labels].to_list()

            elif scenario.labels_type == 'column' and scenario.run_db:
                # Get labels from table
                labels = get_column(scenario.db_url, scenario.table,
                                    scenario.labels)
            else:
                return Response({'detail': 'Invalid labels type selected'},
                                status=status.HTTP_400_BAD_REQUEST)

        # Compute evaluation
        res_evaluation = None
        if labels and scenario.metric:
            res_evaluation = manager.evaluate(scenario.metric, labels, y_pred)

        # Create predictions file
        test_result_name = "test_{}_{}.csv".format(scenario.model,
                                                   datetime.now())
        test_result_name = test_result_name.replace(' ', '_')
        test_result_name = test_result_name.replace(':', '_')
        y_pred.to_csv(test_result_name, index=False, header=True)

        # Save predictions in Document model
        f = open(test_result_name, 'rb')
        document = Document(file=File(f), filename=test_result_name)
        document.save()
        f.close()

        # Remove temporally predictions file
        os.remove(test_result_name)

        # Save Scenario model
        scenario.save()

        # Save ResultScenario
        result_scenario = ResultScenario()
        result_scenario.scenario = scenario
        result_scenario.execution_time = inference_time
        result_scenario.throughput = result_scenario.execution_time / len(
            y_pred)
        result_scenario.score = res_evaluation
        result_scenario.file_result = document.filename
        result_scenario.query = query
        result_scenario.save()

        return Response(
            {
                'detail': 'Successfully predicted result',
                'filename': test_result_name,
                'scenario_id': scenario.id
            },
            status=status.HTTP_201_CREATED)
Example #7
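    # NOTE: `post` is a method of a Django REST Framework APIView subclass (class
    # definition omitted); it relies on the same assumed imports as the test view
    # above (os, json, datetime, File, Response, status) plus project-local helpers
    # and models (create_train_scenario, check_train_consistency, get_dataset,
    # get_document_object, get_dataframe, get_table, get_train_test, save_model,
    # MLManager, Document, ResultScenario).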
    def post(self, request):
        t_start = datetime.now()
        scenario: Scenario = create_train_scenario(request.data)

        try:
            check_train_consistency(scenario)
        except ValueError as e:
            return Response({
                'detail': str(e),
            },
                            status=status.HTTP_400_BAD_REQUEST)

        # Dataset
        print('Get Dataset')
        ds = get_dataset(scenario)
        if ds is None:
            return Response(
                {
                    'detail': 'Unable to read the selected dataset',
                },
                status=status.HTTP_400_BAD_REQUEST)

        features = ds.columns.to_list()

        # Label
        print('Get Label')

        if scenario.labels_type == 'file':
            # Get labels from file
            labels_document = get_document_object(scenario.labels)
            labels = get_dataframe(labels_document.file)

            if labels is None:
                return Response(
                    {
                        'detail':
                        'The selected labels file {} isn\'t valid'.format(
                            labels_document.filename)
                    },
                    status=status.HTTP_400_BAD_REQUEST)

            # Get first column from file
            labels = labels.iloc[:, 0].to_list()

        elif scenario.labels_type == 'table':
            # Get labels from table
            labels = get_table(scenario.db_url, scenario.labels)
            if labels is None:
                return Response(
                    {
                        'detail':
                        'The selected table {} isn\'t valid for label'.format(
                            scenario.labels)
                    },
                    status=status.HTTP_400_BAD_REQUEST)

            # Get first column from table
            labels = labels.iloc[:, 0].to_list()

        elif scenario.labels_type == 'column':
            # Get labels from column
            if scenario.labels not in ds.columns:
                return Response(
                    {
                        'detail':
                        'The selected column {} isn\'t valid for label'.format(
                            scenario.labels)
                    },
                    status=status.HTTP_400_BAD_REQUEST)

            labels = ds[scenario.labels].to_list()
            # Remove label column from dataset
            features.remove(scenario.labels)

        else:
            return Response({'detail': 'Invalid labels type selected'},
                            status=status.HTTP_400_BAD_REQUEST)

        # Get train and test from dataset
        x_train, y_train, x_test, y_test = get_train_test(
            ds[features], labels, scenario.validation)

        # ML Manager
        manager = MLManager()
        # Set ML model
        manager.select_model(scenario.model)

        # Set ML transforms
        manager.set_transforms(json.loads(scenario.transforms))

        # Training
        model = manager.fit(x_train, y_train)

        # Finish training
        t_end = datetime.now()

        # Compute evaluation
        res_evaluation = None
        if y_test and scenario.metric:
            y_pred = model.predict(x_test)
            res_evaluation = manager.evaluate(scenario.metric, y_test, y_pred)

        # Create joblib file
        trained_pipeline_name = "train_{}_{}.joblib".format(
            scenario.model, datetime.now())
        trained_pipeline_name = trained_pipeline_name.replace(' ', '_')
        trained_pipeline_name = trained_pipeline_name.replace(':', '_')
        save_model(model, trained_pipeline_name)

        # Save trained pipeline in Document model
        f = open(trained_pipeline_name, 'rb')
        document = Document(file=File(f), filename=trained_pipeline_name)
        document.save()
        f.close()

        # Remove joblib file
        os.remove(trained_pipeline_name)

        # Save Scenario model
        scenario.save()

        # Save ResultScenario
        result_scenario = ResultScenario()
        result_scenario.scenario = scenario
        result_scenario.execution_time = (t_end - t_start).total_seconds()
        result_scenario.throughput = result_scenario.execution_time / len(
            x_train)
        result_scenario.score = res_evaluation
        result_scenario.file_result = document.filename
        result_scenario.save()

        return Response(
            {
                'detail': 'Pipeline trained correctly',
                'filename': trained_pipeline_name,
                'scenario_id': scenario.id
            },
            status=status.HTTP_201_CREATED)
Example #8
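# NOTE: this snippet assumes `from datetime import datetime` and `import pandas as pd`;
# get_test_scenario, check_test_consistency, get_dataset, get_columns, get_column,
# get_table, get_dataframe, execute_query and MLManager are project-local helpers.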
def main():
    print('Workflow Testing')
    t_start = datetime.now()

    scenario = get_test_scenario()
    check_test_consistency(scenario)

    # Dataset
    if scenario['run_db']:
        # Get features from table
        print('Extract Features')
        features = get_columns(scenario['db_url'], scenario['table'])
    else:
        # Get Dataset
        print('Get Dataset and Features')
        ds = get_dataset(scenario)
        features = ds.columns.to_list()

    if scenario['labels_type'] == 'column':
        # Remove Label column if exists
        features.remove(scenario['labels'])

    # ML Manager
    print('Create ML Manager')
    manager = MLManager()

    # Testing Phase
    print('Testing...')

    if scenario['run_db']:
        # Generate query using MLManager
        print('Query Generation...')
        query = manager.generate_query(scenario['pipeline'], scenario['table'],
                                       features)

        # Execute query
        print('Query Execution...')
        y_pred = execute_query(scenario['db_url'], query)
        y_pred = pd.Series(y_pred.iloc[:, 0], name='Label')

    else:
        # Execute predict using MLManager and ML Library
        y_pred = manager.predict(ds[features], scenario['pipeline'])
        y_pred = pd.Series(y_pred.flatten(), name='Label')

    # Finish testing
    t_end = datetime.now()
    print('Execution Time: ', t_end - t_start)

    print('Save prediction: test.csv')
    # test_result_name = '../data/result/test_run_db{}.csv'.format(scenario['run_db'])
    test_result_name = '../data/result/test.csv'
    y_pred.to_csv(test_result_name, index=False, header=True)

    # Compute evaluation
    if scenario['labels_type']:

        # Label
        print('Get Label')
        if scenario['labels_type'] == 'file':
            # Get labels from file
            labels = get_dataframe(scenario['labels'])

            if labels is None:
                raise ValueError('Unable to read labels from the selected file')

            # Get first column from file
            labels = labels.iloc[:, 0].to_list()

        elif scenario['labels_type'] == 'table':
            # Get labels from table
            labels = get_table(scenario['db_url'], scenario['labels'])
            if labels is None:
                raise ValueError(
                    'Unable to read labels from the DBMS, please check the DBMS input'
                )

            # Get first column from table
            labels = labels.iloc[:, 0].to_list()

        elif scenario['labels_type'] == 'column' and not scenario['run_db']:
            # Get labels from column
            labels = ds[scenario['labels']].to_list()

        elif scenario['labels_type'] == 'column' and scenario['run_db']:
            # Get labels from table
            labels = get_column(scenario['db_url'], scenario['table'],
                                scenario['labels'])

        else:
            raise ValueError('Unknown label type selected')

        res_evaluation = manager.evaluate(scenario['metric'], labels, y_pred)
        print('Evaluation: ', res_evaluation)

    print(':)')
Example #9
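# NOTE: this snippet assumes `import logging` and `import pandas as pd`;
# check_data_config, check_pipeline_config, create_pipeline, extract_pipeline,
# create_query, the evaluate_*_results helpers, DBMSUtils and MLManager are
# project-local helpers.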
def main(data_conf, pipeline_conf, str_db_conn, task, optimizer, debug=False):
    data_conf['str_db_conn'] = str_db_conn
    data_conf = check_data_config(data_conf)
    train = data_conf['train']
    y_train = data_conf['y_train']
    test = data_conf['test']
    y_test = data_conf['y_test']
    test_table_name = data_conf['test_table_name']
    features = list(data_conf['train'].columns)
    conn = data_conf['db_conn']

    tasks = ['regression', 'binary_classification', 'multi_classification']
    if task not in tasks:
        raise ValueError(f"Wrong task {task}. Available tasks: {tasks}")

    if task == 'regression':
        eval_fn = evaluate_regression_results
    elif task == 'binary_classification':
        eval_fn = evaluate_binary_classification_results
    else:
        eval_fn = evaluate_multi_classification_results

    check_pipeline_config(pipeline_conf, features)
    model_name = pipeline_conf['model']['name']

    mlmanager = MLManager()
    pipeline = create_pipeline(pipeline_conf)

    # fit
    print("\nStarting training...")
    pipeline.fit(train, y_train)
    _check_fitted_pipeline(pipeline, model_name, train)

    print("Training completed.\n")

    fitted_model = pipeline.steps[1][1]

    # ML predict
    print("\nStarting the ML inference...")
    ml_preds = pipeline.predict(test)
    ml_preds = pd.Series(ml_preds)
    print(ml_preds[:10])
    eval_fn(model_name, y_test, ml_preds)
    print("ML inference completed.\n")

    # SQL conversion
    print("\nStarting the SQL conversion...")
    pipeline = extract_pipeline(pipeline)
    dbms = DBMSUtils.get_dbms_from_str_connection(data_conf['str_db_conn'])
    queries, all_query = create_query(pipeline, mlmanager, features,
                                      test_table_name, optimizer, dbms, debug)
    print("SQL Conversion completed.\n")

    # SQL predict
    print("\nStarting the SQL inference...")
    for q in queries[:-1]:
        try:
            for qq in q.split(';'):
                conn.execute(qq)
        except Exception as e:
            # Log failing intermediate statements instead of silently swallowing them
            logging.warning(e)

    try:
        sql_preds = pd.read_sql(queries[-1], conn)
    except Exception as e:
        logging.error(e.args[0])
        return

    sql_preds = pd.Series(sql_preds.iloc[:, 0])
    print(sql_preds[:10])
    null_val = False
    if sql_preds.isnull().sum() == 0:
        eval_fn(f"{model_name} SQL", y_test, sql_preds)
    else:
        null_val = True
    print("SQL inference completed.\n")

    # Null value test
    if null_val:
        print("\nNull value test")
        null_val_cnt = 0
        for sample_id in sql_preds[sql_preds.isnull()].index:
            print(sample_id)
            for (attr, val) in zip(test.columns,
                                   test.iloc[sample_id, :].values):
                print("\t", attr, '=', val)
            null_val_cnt += 1
        print(f"Found {null_val_cnt} null predictions.")

    # Accuracy test
    print("\nAccuracy test")
    equals = False
    for prec in range(10, 0, -1):
        ml_preds = ml_preds.map(lambda x: round(x, prec))
        sql_preds = sql_preds.map(lambda x: round(x, prec))
        if ml_preds.equals(sql_preds):
            print(
                f"The prediction scores are equal with {prec} decimal precision."
            )
            print(":)")
            equals = True
            break
    if not equals:
        print("The prediction scores are not equal.")
        print(":(\n")

        ne_preds = 0
        for i in range(len(ml_preds)):
            ml_pred = ml_preds.iloc[i]
            sql_pred = sql_preds.iloc[i]

            if ml_pred != sql_pred:
                if debug:
                    print(i, ml_pred, sql_pred)
                    for (attr, val) in zip(test.columns,
                                           test.iloc[i, :].values):
                        print("\t", attr, '=', val)
                ne_preds += 1
        print(f"Found {ne_preds} incorrect predictions.")