예제 #1
0
def load_reviews(filepath):
    """Read a one-JSON-object-per-line review file.

    Each line must be a JSON object with a ``reviewText`` string and an
    ``overall`` rating; returns ``(texts, ratings)`` where ratings are
    floats.
    """
    texts, ratings = [], []
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(filepath) as f:
        for line in f:
            content = json.loads(line)
            texts.append(content["reviewText"])
            ratings.append(float(content["overall"]))
    return texts, ratings


def main():
    """Fit a text pipeline on Amazon Electronics reviews and test whether
    Books reviews are detected as a data shift relative to the train set.
    """
    resource_path = get_resource_path()
    folder = os.path.join(resource_path, 'data/amazon')
    X_train, y_train = load_reviews(
        os.path.join(folder, 'Electronics_5.json'))
    X_test, y_test = load_reviews(
        os.path.join(folder, 'Books_5.json'))

    # Number of samples used both for fitting and for each detector pass
    # (previously 100 was hard-coded again below, risking divergence).
    size = 100

    # pipeline = FullTextPipeline(RandomForest())
    pipeline = HashingPipeline(RandomForest())
    # pipeline = TfIdfPipeline(RandomForest())
    model = pipeline.fit(X_train[:size], y_train[:size])

    shift_detector = SklearnDataShiftDetector(model, n_bins=1000)
    shift_detector.iteration(X_train[:size])
    shift_detector.iteration(X_test[:size])
    print(shift_detector.data_is_shifted())
예제 #2
0
def main():
    """Load the credit-g CSV dataset and run the DataFrameAnalyzer on it."""
    import os
    from settings import get_resource_path

    csv_path = os.path.join(get_resource_path(), "data",
                            'dataset_31_credit-g.csv')
    frame = pd.read_csv(csv_path)
    DataFrameAnalyzer().on(frame)
예제 #3
0
 def setUp(self):
     """Prepare the credit-g fixtures: pipeline, splits, profile, suite."""
     self.resource_folder = get_resource_path()
     self.pipeline = CreditGPipeline()
     # data = credit.dataset_31_credit_g()
     csv_path = os.path.join(self.resource_folder, 'data',
                             'credit-g/dataset_31_credit-g.csv')
     data = pd.read_csv(csv_path)
     target = 'class'
     # I guess it will work only if the target value is the last one.
     self.features = [col for col in data.columns if col != target]
     self.X_train, self.X_test, self.y_train, self.y_test = split(
         data[self.features], data[target], test_size=0.2, random_state=0)
     self.data_profile = DataFrameProfiler().on(self.X_train)
     self.automated_suite = AutomatedTestSuite()
예제 #4
0
    def __init__(self):
        """Set up the benchmark grid: datasets with their pipelines,
        classifiers, error generators, corruption fractions, column
        predicates, and the results table that indexes all of them.
        """
        self.resource_folder = get_resource_path()
        # for dataset_name in sorted(os.listdir(folder)):
        #     if dataset_name.endswith('.csv'):
        #         print(dataset_name[:-4])
        # dataset key -> (csv path relative to resource data dir,
        #                 target column name, pipeline instance)
        self.pipelines = {
            'credit-g': (
                'credit-g/dataset_31_credit-g.csv', 'class',
                CreditGPipeline()),
            'wine-quality': (
                'wine-quality/wine-quality-red.csv', 'class',
                WineQualityPipeline()),
            'wq-missing': (
                'wine-quality/wine-quality-red.csv', 'class',
                WineQualityMissingPipeline()),
            'abalone': (
                'abalone/abalone.csv', 'Rings',
                AbalonePipeline()),
            'adult': (
                'adult/adult.csv', 'class',
                AdultPipeline()),
            'adult-missing': (
                'adult/adult.csv', 'class',
                AdultMissingPipeline()),
            'heart': (
                'heart/heart.csv', 'class',
                HeartPipeline())}

        # Short label -> classifier wrapper instance.
        self.classifiers = {
            'dtc': DecisionTree(),
            'rfc40': RandomForest(size=40),
            'ertc40': ExtremelyRandomizedTrees(size=40),
            'xgb': XGB(),
            'svm': SVM(),
            'lsvm': LinearSVM(),
            'knn': KNN(n_neighbors=7),
            'logreg': LogRegression(),
            'gaus': GausNB(),
            'brfc40': BaggingRandomForest(size=40),
            'mlpc': MLPC(input_size=[16, 32, 16, 8])
        }

        # Label -> (error generator, predicate deciding which columns it
        # applies to, based on the column profile's dtype).
        self.error_gens = {
            'numeric anomalies': (
                Anomalies(), lambda x: x.dtype in [DataType.INTEGER,
                                                   DataType.FLOAT]),
            'typos': (
                Typos(), lambda x: x.dtype == DataType.STRING),
            'explicit misvals': (
                ExplicitMissingValues(), lambda x: True),
            'implicit misvals': (
                ImplicitMissingValues(), lambda x: True),
            'swap fields': (
                SwapFields(), lambda x: True)}

        # Fractions of rows to corrupt in each experiment run.
        self.params = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8]

        # Column-profile predicates used as result sub-rows.
        # NOTE(review): 'num disc' and 'num cont' have identical predicates
        # (both require DataScale.NOMINAL); 'num cont' presumably should
        # test a continuous/ordinal scale instead -- confirm intent.
        self.tests = {'num disc': lambda x: (x.scale == DataScale.NOMINAL
                                             and x.dtype in [DataType.INTEGER,
                                                             DataType.FLOAT]),
                      'num cont': lambda x: (x.scale == DataScale.NOMINAL
                                             and x.dtype in [DataType.INTEGER,
                                                             DataType.FLOAT]),
                      'string': lambda x: x.dtype == DataType.STRING}

        # Results grid: dataset x classifier, subdivided by column-test
        # and error-generator labels.
        self.results = Table(rows=sorted(self.pipelines.keys()),
                             columns=sorted(self.classifiers.keys()),
                             subrows=self.tests.keys(),
                             subcolumns=self.error_gens.keys())
def main():
    """Benchmark meta-classifiers that predict model performance on
    corrupted data.

    For every dataset listed in datasets.csv and every hyper-parameter
    state: train a black-box model, report validation and test accuracy,
    generate corrupted copies of the test set with the configured error
    generators, fit a meta classifier on them, then compare its predicted
    scores against ground-truth scores on a corrupted held-out target
    split (plotted, and summarised with a distance metric).
    """
    path = get_resource_path()

    classifiers = [
        # DecisionTree(),
        # RandomForest(size=40),
        # ExtremelyRandomizedTrees(size=40),
        # XGB(),
        # SVM(),
        # LinearSVM(),
        # KNN(n_neighbors=7),
        LogRegression(),
        # GausNB(),
        # BaggingRandomForest(size=40),
        # MLPC(input_size=[16, 32, 16, 8])
    ]

    error_generators = [
        Anomalies(),
        Typos(),
        ExplicitMissingValues(),
        ImplicitMissingValues(),
        SwapFields()
    ]

    # TODO: dataset size as a hyperparameter
    # TODO: random_state as a hyperparameter
    # NOTE: the four *_ratio values sum to 1.0 (train/val/test/target).
    hyperparams = {
        'train_ratio': .7,
        'val_ratio': .1,
        'test_ratio': .1,
        'target_ratio': .1,
        'random_state': [0],
        # 'row_fraction': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8],
        'row_fraction': [0.2],
        'classifier': classifiers,
        # Ordering of error generators
        # 'mask': [(0, 0, 1, 0, 0), (0, 0, 0, 1, 0), (0, 0, 0, 0, 1),
        #          (0, 2, 0, 0, 1)],
        'mask': [(0, 0, 0, 1, 0)],
        'testset_size': 100
    }

    # datasets.csv rows: (file path, name, target column, task type).
    datasets = pd.read_csv(os.path.join(path, 'datasets.csv'))

    for dataset_info in datasets.values:
        filepath, name, target_feature, task = tuple(dataset_info)
        data = pd.read_csv(os.path.join(path, 'data', filepath))

        # HyperParameterHolder enumerates the cartesian product of the
        # list-valued hyperparameters above.
        for state in HyperParameterHolder(hyperparams):
            print("HyperParam : %s" % str(state))
            # Dataset Split
            (X_train, y_train, X_val, y_val, X_test, y_test, X_target,
             y_target) = split_dataset(data, target_feature, state)

            # NOTE: this "tuning" loop currently runs exactly once; it is
            # a placeholder for a real hyper-parameter search.
            tuning_done = False
            while not tuning_done:
                # ML Pipeline Training Procedure
                model = BlackBox().train(state['classifier'], X_train, y_train)

                # ML Pipeline Validation Procedures
                predicted = model.predict(X_val)
                score = performance_metric(y_val, predicted)
                print("Validation : accuracy = %.4f" % round(score, 4))
                tuning_done = True

            # ML Pipeline final performance score
            predicted = model.predict(X_test)
            score = performance_metric(y_test, predicted)
            print("Test       : accuracy = %.4f" % round(score, 4))

            # Meta Classifier Training Procedure
            error_gen_strat = ErrorGenerationStrategy(error_generators, state)
            # TODO: so far, X_test/y_test is used for training

            # prepare a dataset based on X_test and repeated error generation
            # NB: returns a python list, not a numpy array or pandas dataframe
            list_of_corrupted_X_test = error_gen_strat.on(X_test, state)

            try:
                meta_classifier = MetaClassifier(model, LinearRegression())
                print(str(meta_classifier))
                meta_classifier.fit(list_of_corrupted_X_test, y_test)

                # Meta Classifier Evaluation Procedure
                list_of_corrupted_X_target = error_gen_strat.on(
                    X_target, state)
                predicted_scores = meta_classifier.predict(
                    list_of_corrupted_X_target)
                # Ground truth: the base model's actual metric on each
                # corrupted copy of the target split.
                actual_scores = [
                    performance_metric(y_target, model.predict(x))
                    for x in list_of_corrupted_X_target
                ]
                plt.plot(range(len(actual_scores)), actual_scores, 'g^')
                plt.plot(range(len(predicted_scores)), predicted_scores, 'ro')
                plt.gca().legend(('ground truth', 'predicted scores'))
                plt.grid(True)
                plt.show()
                result = distance_metric(actual_scores, predicted_scores)

                print("Evaluation : distance metric = %.4f" % round(result, 4))
                print()
            except Exception as e:
                # Best-effort: a failing generator/meta-classifier combo is
                # reported and the loop continues with the next state.
                print("\nException  : %s\n%s\n" % (str(error_gen_strat), e))