def main():
    """Train a review-score pipeline on Amazon Electronics reviews and check
    whether Books reviews are flagged as shifted data."""
    resource_path = get_resource_path()
    folder = os.path.join(resource_path, 'data/amazon')
    datafile_train = 'Electronics_5.json'
    datafile_test = 'Books_5.json'

    X_train, y_train = [], []
    X_test, y_test = [], []

    # Each line of the Amazon review dumps is a standalone JSON object.
    with open(os.path.join(folder, datafile_train)) as f:
        for line in f:
            content = json.loads(line)
            X_train.append(content["reviewText"])
            y_train.append(float(content["overall"]))

    with open(os.path.join(folder, datafile_test)) as f:
        for line in f:
            content = json.loads(line)
            X_test.append(content["reviewText"])
            y_test.append(float(content["overall"]))

    size = 100
    # pipeline = FullTextPipeline(RandomForest())
    pipeline = HashingPipeline(RandomForest())
    # pipeline = TfIdfPipeline(RandomForest())
    model = pipeline.fit(X_train[:size], y_train[:size])

    # Feed the detector one batch from the training domain and one batch from
    # the other domain, then ask whether a shift was observed.
    shift_detector = SklearnDataShiftDetector(model, n_bins=1000)
    shift_detector.iteration(X_train[:size])
    shift_detector.iteration(X_test[:size])
    print(shift_detector.data_is_shifted())
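# As a rough, standalone sketch of the same idea (not the
# SklearnDataShiftDetector implementation), one can compare the model's
# predicted scores on two batches with a two-sample Kolmogorov-Smirnov test.
# The helper name and threshold below are illustrative assumptions only.
def scores_look_shifted(model, batch_a, batch_b, alpha=0.05):
    """Hypothetical helper: flag a shift when score distributions differ."""
    from scipy.stats import ks_2samp
    scores_a = model.predict(batch_a)
    scores_b = model.predict(batch_b)
    # A small p-value means the two score samples are unlikely to come from
    # the same distribution.
    _, p_value = ks_2samp(scores_a, scores_b)
    return p_value < alpha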
def main():
    """Profile the credit-g dataset with the DataFrameAnalyzer."""
    from settings import get_resource_path
    import os

    resource_folder = get_resource_path()
    filename = 'dataset_31_credit-g.csv'
    data = pd.read_csv(os.path.join(resource_folder, "data", filename))

    analyzer = DataFrameAnalyzer()
    analyzer.on(data)
def setUp(self):
    self.resource_folder = get_resource_path()
    self.pipeline = CreditGPipeline()
    # data = credit.dataset_31_credit_g()
    data = pd.read_csv(os.path.join(self.resource_folder, 'data',
                                    'credit-g/dataset_31_credit-g.csv'))
    target = 'class'
    # Use every column except the target as a feature.
    self.features = [col for col in data.columns if col != target]
    X = data[self.features]
    y = data[target]

    sets = split(X, y, test_size=0.2, random_state=0)
    self.X_train, self.X_test, self.y_train, self.y_test = sets

    self.data_profile = DataFrameProfiler().on(self.X_train)
    self.automated_suite = AutomatedTestSuite()
def __init__(self):
    self.resource_folder = get_resource_path()
    # for dataset_name in sorted(os.listdir(folder)):
    #     if dataset_name.endswith('.csv'):
    #         print(dataset_name[:-4])

    # Datasets: name -> (relative csv path, target column, pipeline).
    self.pipelines = {
        'credit-g': ('credit-g/dataset_31_credit-g.csv', 'class',
                     CreditGPipeline()),
        'wine-quality': ('wine-quality/wine-quality-red.csv', 'class',
                         WineQualityPipeline()),
        'wq-missing': ('wine-quality/wine-quality-red.csv', 'class',
                       WineQualityMissingPipeline()),
        'abalone': ('abalone/abalone.csv', 'Rings', AbalonePipeline()),
        'adult': ('adult/adult.csv', 'class', AdultPipeline()),
        'adult-missing': ('adult/adult.csv', 'class',
                          AdultMissingPipeline()),
        'heart': ('heart/heart.csv', 'class', HeartPipeline())}

    self.classifiers = {
        'dtc': DecisionTree(),
        'rfc40': RandomForest(size=40),
        'ertc40': ExtremelyRandomizedTrees(size=40),
        'xgb': XGB(),
        'svm': SVM(),
        'lsvm': LinearSVM(),
        'knn': KNN(n_neighbors=7),
        'logreg': LogRegression(),
        'gaus': GausNB(),
        'brfc40': BaggingRandomForest(size=40),
        'mlpc': MLPC(input_size=[16, 32, 16, 8])}

    # Error generators, each paired with a predicate that decides which
    # column profiles the generator may corrupt.
    self.error_gens = {
        'numeric anomalies': (
            Anomalies(),
            lambda x: x.dtype in [DataType.INTEGER, DataType.FLOAT]),
        'typos': (Typos(), lambda x: x.dtype == DataType.STRING),
        'explicit misvals': (ExplicitMissingValues(), lambda x: True),
        'implicit misvals': (ImplicitMissingValues(), lambda x: True),
        'swap fields': (SwapFields(), lambda x: True)}

    self.params = [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8]

    # NOTE: 'num disc' and 'num cont' currently share the same predicate.
    self.tests = {
        'num disc': lambda x: (x.scale == DataScale.NOMINAL
                               and x.dtype in [DataType.INTEGER,
                                               DataType.FLOAT]),
        'num cont': lambda x: (x.scale == DataScale.NOMINAL
                               and x.dtype in [DataType.INTEGER,
                                               DataType.FLOAT]),
        'string': lambda x: x.dtype == DataType.STRING}

    self.results = Table(rows=sorted(self.pipelines.keys()),
                         columns=sorted(self.classifiers.keys()),
                         subrows=self.tests.keys(),
                         subcolumns=self.error_gens.keys())
def main():
    """Train a pipeline per dataset and hyperparameter state, then train and
    evaluate a meta-classifier that predicts the pipeline's performance on
    corrupted data."""
    path = get_resource_path()

    classifiers = [
        # DecisionTree(),
        # RandomForest(size=40),
        # ExtremelyRandomizedTrees(size=40),
        # XGB(),
        # SVM(),
        # LinearSVM(),
        # KNN(n_neighbors=7),
        LogRegression(),
        # GausNB(),
        # BaggingRandomForest(size=40),
        # MLPC(input_size=[16, 32, 16, 8])
    ]

    error_generators = [
        Anomalies(),
        Typos(),
        ExplicitMissingValues(),
        ImplicitMissingValues(),
        SwapFields()
    ]

    # TODO: dataset size as a hyperparameter
    # TODO: random_state as a hyperparameter
    hyperparams = {
        'train_ratio': .7,
        'val_ratio': .1,
        'test_ratio': .1,
        'target_ratio': .1,
        'random_state': [0],
        # 'row_fraction': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5, 0.8],
        'row_fraction': [0.2],
        'classifier': classifiers,
        # Ordering of error generators
        # 'mask': [(0, 0, 1, 0, 0), (0, 0, 0, 1, 0), (0, 0, 0, 0, 1),
        #          (0, 2, 0, 0, 1)],
        'mask': [(0, 0, 0, 1, 0)],
        'testset_size': 100
    }

    datasets = pd.read_csv(os.path.join(path, 'datasets.csv'))
    for dataset_info in datasets.values:
        filepath, name, target_feature, task = tuple(dataset_info)
        data = pd.read_csv(os.path.join(path, 'data', filepath))

        for state in HyperParameterHolder(hyperparams):
            print("HyperParam : %s" % str(state))

            # Dataset split
            (X_train, y_train, X_val, y_val,
             X_test, y_test, X_target, y_target) = split_dataset(
                data, target_feature, state)

            # ML pipeline training and validation (a single pass for now;
            # the loop is a placeholder for hyperparameter tuning).
            tuning_done = False
            while not tuning_done:
                model = BlackBox().train(state['classifier'],
                                         X_train, y_train)

                predicted = model.predict(X_val)
                score = performance_metric(y_val, predicted)
                print("Validation : accuracy = %.4f" % round(score, 4))
                tuning_done = True

            # ML pipeline final performance score
            predicted = model.predict(X_test)
            score = performance_metric(y_test, predicted)
            print("Test : accuracy = %.4f" % round(score, 4))

            # Meta-classifier training procedure
            error_gen_strat = ErrorGenerationStrategy(error_generators,
                                                      state)
            # TODO: so far, X_test/y_test is used for training
            # prepare a dataset based on X_test and repeated error generation
            # NB: returns a python list, not a numpy array or pandas dataframe
            list_of_corrupted_X_test = error_gen_strat.on(X_test, state)

            try:
                meta_classifier = MetaClassifier(model, LinearRegression())
                print(str(meta_classifier))
                meta_classifier.fit(list_of_corrupted_X_test, y_test)

                # Meta-classifier evaluation procedure
                list_of_corrupted_X_target = error_gen_strat.on(
                    X_target, state)
                predicted_scores = meta_classifier.predict(
                    list_of_corrupted_X_target)
                actual_scores = [
                    performance_metric(y_target, model.predict(x))
                    for x in list_of_corrupted_X_target]

                plt.plot(range(len(actual_scores)), actual_scores, 'g^')
                plt.plot(range(len(predicted_scores)), predicted_scores, 'ro')
                plt.gca().legend(('ground truth', 'predicted scores'))
                plt.grid(True)
                plt.show()

                result = distance_metric(actual_scores, predicted_scores)
                print("Evaluation : distance metric = %.4f"
                      % round(result, 4))
                print()
            except Exception as e:
                print("\nException : %s\n%s\n" % (str(error_gen_strat), e))
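# The helpers used above (split_dataset, performance_metric, distance_metric)
# live elsewhere in the repo. Below is a minimal sketch of what they could
# look like, assuming performance is plain accuracy, the distance between
# score vectors is mean absolute error, and the split follows the
# train/val/test/target ratios from `hyperparams`. The `_sketch` names are
# illustrative assumptions, not the repo's actual implementations.
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split


def performance_metric_sketch(y_true, y_pred):
    """Assumed: fraction of correctly predicted labels."""
    return accuracy_score(y_true, y_pred)


def distance_metric_sketch(actual_scores, predicted_scores):
    """Assumed: mean absolute error between actual and predicted scores."""
    return mean_absolute_error(actual_scores, predicted_scores)


def split_dataset_sketch(data, target_feature, state):
    """Assumed: split rows 70/10/10/10 into train/val/test/target sets."""
    X = data.drop(columns=[target_feature])
    y = data[target_feature]
    seed = state['random_state']
    # First carve off the 70% training portion ...
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, train_size=0.7, random_state=seed)
    # ... then split the remaining 30% evenly into val/test/target.
    X_val, X_rest, y_val, y_rest = train_test_split(
        X_rest, y_rest, train_size=1 / 3, random_state=seed)
    X_test, X_target, y_test, y_target = train_test_split(
        X_rest, y_rest, train_size=0.5, random_state=seed)
    return (X_train, y_train, X_val, y_val,
            X_test, y_test, X_target, y_target)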