def test_preparedata_runs_without_crashing(self):
    df_test = pd.read_csv(
        'restapi/machine_learning/testfiles/df_minimal.csv')
    target_variable = 'G3_class'
    factor_list = ['failures', 'C(sex)']
    y_expected = np.array([0., 0., 1., 0., 1., 1., 0., 1., 1.])
    y, X = preparedata(df_test, target_variable, factor_list)
    np.testing.assert_array_equal(y, y_expected)
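The preparedata helper is not listed among these examples. Below is a minimal sketch of what it might look like, assuming it builds a patsy design matrix from the factor list (the 'C(sex)' entry is patsy's categorical notation) and returns the flattened target vector plus the predictor DataFrame; this is an illustration only, not the project's actual implementation.

# Hypothetical sketch of preparedata (an assumption, not the project's code).
import numpy as np
from patsy import dmatrices


def preparedata_sketch(df, target_variable, factor_list):
    # e.g. 'G3_class ~ failures + C(sex)'
    formula = target_variable + ' ~ ' + ' + '.join(factor_list)
    y_dm, X_dm = dmatrices(formula, data=df, return_type='dataframe')
    y = np.asarray(y_dm).ravel()
    # LogisticRegression fits its own intercept, so drop patsy's Intercept column.
    X = X_dm.drop(columns='Intercept')
    return y, X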
Example #2
def retrain_model(request):
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        if "model_id" not in data or data["model_id"] is None:
            model_id = 1
        else:
            model_id = data["model_id"]
        mlmodel = MlModel.objects.get(pk=model_id)
        dataset = mlmodel.dataset_id
        target_variable = mlmodel.target_variable
        dataFilePath = os.path.join(settings.MEDIA_ROOT, dataset.file.name)
        numeric_columns = get_numeric_columns(model_id)
        factor_list_wo_categories = get_factor_list_from_file(
            dataFilePath, target_variable, numeric_columns)
        model, model_description = retrain(factor_list_wo_categories,
                                           target_variable,
                                           data["factors"],
                                           dataFile=dataFilePath)
        # note: no error is raised here if more than one factor is marked as
        # balanced; the last balanced factor in the list wins
        protected_attr = None
        for factor in data["factors"]:
            if factor["is_balanced"]:
                protected_attr = factor["name"]
        df_data = pd.read_csv(dataFilePath)
        y, X = preparedata(df_data, target_variable, factor_list_wo_categories)
        X = drop_disabled_factors(X, data["factors"])
        if protected_attr is not None:
            if protected_attr not in X:
                raise ValueError(
                    'missing protected_attr in X, it may have been disabled')
            thresholds = get_fair_thresholds(model, X, y, protected_attr)
            model_description["negative_threshold"] = thresholds[0]
            model_description["positive_threshold"] = thresholds[1]
            print(thresholds)
            accuracy, confusion_matrices = test_logreg_model(
                model, X, y, thresholds, protected_attr)
        else:
            accuracy, confusion_matrices = test_logreg_model(model, X, y)
        model_description['accuracy'] = accuracy
        model_description['confusion_matrices'] = confusion_matrices
        return Response(model_description, status=status.HTTP_200_OK)
    else:
        return Response('HTTP_400_BAD_REQUEST',
                        status=status.HTTP_400_BAD_REQUEST)
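An illustrative request for the view above; the field names (model_id, factors, name, is_enabled, is_balanced) are taken from the code, while the values and the endpoint path are placeholders invented for the example.

# Hypothetical client call; the URL and the payload values are placeholders.
import requests

payload = {
    "model_id": 1,
    "factors": [
        {"name": "failures", "is_enabled": True, "is_balanced": False},
        {"name": "C(sex)[T.M]", "is_enabled": True, "is_balanced": True},
    ],
}
resp = requests.post("http://localhost:8000/api/retrain_model/", json=payload)
# The response carries the model description: factors with refitted weights,
# fair thresholds (since one factor is balanced), accuracy and confusion matrices.
print(resp.json())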
Example #3
def test_model(request):
    if request.method == 'POST':
        data = json.loads(request.body.decode('utf-8'))
        model_id = data["model_id"]
        mlmodel = MlModel.objects.get(pk=int(model_id))
        target_variable = mlmodel.target_variable
        dataset = mlmodel.dataset_id
        dataFilePath = os.path.join(settings.MEDIA_ROOT, dataset.file.name)
        protected_attr = None
        for factor in data["factors"]:
            if factor["is_balanced"]:
                protected_attr = factor["name"]
        thresholds = None
        if protected_attr is not None:
            if ("positive_threshold" in data and "negative_threshold" in data
                    and data["negative_threshold"] is not None
                    and data["positive_threshold"] is not None):
                thresholds = (data["negative_threshold"],
                              data["positive_threshold"])
                print(thresholds)
            else:
                return Response('Retrain needed, thresholds missing',
                                status=status.HTTP_400_BAD_REQUEST)
        numeric_columns = get_numeric_columns(model_id)
        factor_list_wo_categories = get_factor_list_from_file(
            dataFilePath, target_variable, numeric_columns)
        df_data = pd.read_csv(dataFilePath)
        y, X = preparedata(df_data, target_variable, factor_list_wo_categories)
        X = drop_disabled_factors(X, data["factors"])
        for factor in data["factors"]:
            if isinstance(factor["weight"], str):
                raise ValueError(
                    'factor weights cannot be strings, but are for ' +
                    factor["name"])
        model = build_model_from_factors(data["factors"], data["intercept"], y,
                                         X)
        accuracy, confusion_matrices = test_logreg_model(
            model, X, y, thresholds, protected_attr)
        res = {'accuracy': accuracy, 'confusion_matrices': confusion_matrices}
        return Response(res, status=status.HTTP_200_OK)
    return Response('HTTP_400_BAD_REQUEST', status=status.HTTP_400_BAD_REQUEST)
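build_model_from_factors is not shown on this page either. Here is a hedged sketch under the assumption that it rebuilds a scikit-learn LogisticRegression from the user-supplied weights and intercept instead of refitting, so that coefficients edited in the UI are evaluated as-is.

# Hypothetical sketch of build_model_from_factors (an assumption, not the project's code).
import numpy as np
from sklearn.linear_model import LogisticRegression


def build_model_from_factors_sketch(factors, intercept, y, X):
    weights = {f["name"]: float(f["weight"]) for f in factors if f["is_enabled"]}
    model = LogisticRegression()
    # Coefficients must follow the column order of X (disabled factors are already dropped).
    model.coef_ = np.array([[weights[col] for col in X.columns]])
    model.intercept_ = np.array([float(intercept)])
    model.classes_ = np.unique(y)
    return model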
Example #4
def retrain(factor_list_wo_categories, target_variable, factors, dataFile):
    df_math = pd.read_csv(dataFile)
    y, X = preparedata(df_math, target_variable, factor_list_wo_categories)
    for factor in factors:
        if not factor["is_enabled"]:
            X = X.drop(factor["name"], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        random_state=42)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    # write the learned coefficients back onto their enabled factors
    col_list = X.columns.tolist()
    for factor in factors:
        factor_name = factor['name']
        if factor_name in col_list:
            col_num_in_X = col_list.index(factor_name)
            factor["weight"] = model.coef_[0][col_num_in_X]
    return model, {'factors': factors}
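drop_disabled_factors, used in the two views above, is presumably the same drop loop as in retrain() factored out into a helper; a short sketch based on that assumption:

# Hypothetical sketch mirroring the drop loop in retrain() above.
def drop_disabled_factors_sketch(X, factors):
    for factor in factors:
        if not factor["is_enabled"] and factor["name"] in X.columns:
            X = X.drop(factor["name"], axis=1)
    return X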
Example #5
def new_model_with_factor_creation(request):
    # expects JSON with name, description, and dataset_id
    if request.method != 'POST':
        return Response('HTTP_400_BAD_REQUEST',
                        status=status.HTTP_400_BAD_REQUEST)
    data = json.loads(request.body.decode('utf-8'))
    # Save new model
    dataset_id = data['dataset_id']
    non_categorical_columns = data['non_categorical_columns']
    target_variable = data['target_variable']
    target_var_alias = data['target_var_alias']
    newModel = MlModel(name=data['name'],
                       description=data['description'],
                       dataset_id=DataSet.objects.get(pk=dataset_id),
                       modified=timezone.now(),
                       non_categorical_columns=non_categorical_columns,
                       target_variable=target_variable,
                       target_var_alias=target_var_alias)
    newModel.save()
    # Get factor names from file
    dataset = DataSet.objects.get(pk=dataset_id)
    datafile = dataset.file
    dataFilePath = 'datasets/' + datafile.name
    numeric_columns = non_categorical_columns.split(',')
    factor_list = get_factor_list_from_file(dataFilePath, target_variable,
                                            numeric_columns)
    y, X = preparedata(pd.read_csv(dataFilePath), target_variable, factor_list)
    factors_to_save = X.columns.values.tolist()
    # Save factors
    for factor in factors_to_save:
        newFactor = Factor(name=factor,
                           alias=factor,
                           weight=0,
                           model_id=newModel)
        newFactor.save()
    return Response(newModel.id, status=status.HTTP_200_OK)
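get_factor_list_from_file is referenced throughout but not listed here. The sketch below assumes that every column except the target becomes a factor and that columns not declared numeric are wrapped in patsy's C(...) notation, which would match entries such as 'C(sex)' in the first example; it is an assumption, not the project's code.

# Hypothetical sketch of get_factor_list_from_file (an assumption, not the project's code).
import pandas as pd


def get_factor_list_from_file_sketch(data_file_path, target_variable, numeric_columns):
    columns = pd.read_csv(data_file_path, nrows=0).columns
    factor_list = []
    for col in columns:
        if col == target_variable:
            continue
        factor_list.append(col if col in numeric_columns else 'C(' + col + ')')
    return factor_list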