Example #1
File: Generator.py  Project: BigDaMa/DFS
    def __init__(self, data, y):
        self.transformations = []
        self.number_attributes = data.shape[1]
        self.data = data
        self.y = y
        self.unary_output_dimension_size = -1

        self.transformer = Transformer(self.data,
                                       self.y,
                                       map=False,
                                       number_clusters_for_target=1)
        self.transformer.create_train_test_valid_stratified()

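        # ids_parts holds the row indices of each split; the indexing below
        # implies the order [0]=train, [1]=test, [2]=valid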
        self.train_data = self.data.iloc[self.transformer.ids_parts[0], :]
        self.valid_data = self.data.iloc[self.transformer.ids_parts[2], :]
        self.test_data = self.data.iloc[self.transformer.ids_parts[1], :]

        self.train_data_bin = None
        self.valid_data_bin = None
        self.test_data_bin = None

        self.train_data_bin_una = None
        self.valid_data_bin_una = None
        self.test_data_bin_una = None

        self.train_data = self.train_data.reset_index(drop=True)
        self.valid_data = self.valid_data.reset_index(drop=True)
        self.test_data = self.test_data.reset_index(drop=True)

        print(self.train_data)
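
A minimal usage sketch for this constructor (the import path, file name, and toy target index are assumptions, not from the source):

import pandas as pd
from Generator import Generator  # assumed import path within BigDaMa/DFS

df = pd.read_csv("some_dataset.csv")  # any tabular dataset
gen = Generator(df, df.shape[1] - 1)  # y: index of the target column (last here)
gen.generate_unary_transformations()  # see Example #9 for the method bodies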
Example #2
                if fraction < 0.05:
                    transformers.extend(categorical_transformers)

                new_dataframe = pandas_table[[
                    pandas_table.columns[col_rep],
                    pandas_table.columns[target_column]
                ]]

                print(new_dataframe.shape)

                #new_dataframe = new_dataframe.sample(n=1500)
                #new_dataframe.reset_index(inplace=True, drop=True)

                for current_transformer in transformers:

                    transformer = Transformer(new_dataframe, 1, map=False)
                    transformer.create_train_test(25, 2000)

                    transformer.transformers = [current_transformer]
                    transformer.fit()

                    datasets, targets, feature_names = transformer.transform()

                    if datasets[0] is None:
                        log_file.write(
                            str(col_rep) + ": " +
                            str(pandas_table.columns[col_rep]) + ": " +
                            str(transformer.transformers[0]) + ": " +
                            str(0.0) + "\n")
                        log_file.flush()
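
The pairing step above builds, for each candidate attribute, a two-column frame of that attribute plus the target. A self-contained sketch of just that step (toy column names and data, not from the source):

import pandas as pd

pandas_table = pd.DataFrame({"age": [23, 45, 31],
                             "income": [40, 80, 60],
                             "label": [0, 1, 0]})
target_column = 2

for col_rep in range(pandas_table.shape[1]):
    if col_rep != target_column:
        new_dataframe = pandas_table[[pandas_table.columns[col_rep],
                                      pandas_table.columns[target_column]]]
        print(new_dataframe.shape)  # (3, 2) for each candidate attribute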
Example #3
    for line in f:
        tokens = line.split("#")
        user = tokens[0]
        project = tokens[1]
        csv_file = tokens[2]
        type_var = tokens[3]
        task = tokens[4]
        target_column = int(tokens[5])
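        # each metadata line is '#'-delimited:
        # user#project#csv_file#type_var#task#target_column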

        mypath = "/home/felix/.kaggle/datasets/" + str(user) + "/" + str(
            project) + "/" + csv_file

        pandas_table = pd.read_csv(mypath, encoding="utf-8", parse_dates=True)
        #pandas_table = pandas_table.fillna('0.0')

        transformer = Transformer(pandas_table, target_column)

        for counter_it in range(3):

            transformer.fit()

            datasets, targets, feature_names = transformer.transform()

            regr = xgb.XGBClassifier(objective='multi:softprob', nthread=4)
            regr.fit(datasets[0], targets[0])

            #from sklearn import svm
            #regr = svm.SVC()
            #regr.fit(X_train, y_train)

            #get feature importance
Example #4
target_column = 8
'''
pandas_table = pd.read_csv("/home/felix/datasets/ExploreKit/csv/dataset_31_credit-g_german_credit.csv")
target_column = 20

pandas_table = pd.read_csv("/home/felix/datasets/ExploreKit/csv/dataset_23_cmc_contraceptive.csv")  # 3 classes
target_column = 9

pandas_table = pd.read_csv("/home/felix/datasets/ExploreKit/csv/phpkIxskf_bank_data.csv")
target_column = 16

pandas_table = pd.read_csv("/home/felix/datasets/ExploreKit/csv/vehicleNorm.csv")
target_column = 100
'''

transformer = Transformer(pandas_table, target_column, map=False, number_clusters_for_target=1)


number_runs = 10

fscore = []
fscore_best = []

transformations = get_all_transformations()

rand_state = np.random.RandomState(seed=42)

Example #5
    def run(self):

        print(self.pandas_dataframe.shape)

        transformer = Transformer(self.pandas_dataframe,
                                  self.target_column,
                                  map=False,
                                  number_clusters_for_target=1)
        transformer.create_train_test_valid_stratified(
            train_fraction=[0.66, 1000000],
            valid_fraction=0.0,
            test_fraction=0.44,
            seed=42)

        transformations = get_all_transformations_per_column(
            self.pandas_dataframe, self.target_column)

        rand_state = np.random.RandomState(seed=42)

        my_Score = self.Score(transformer.number_classes)
        classifier = self.Classifier(transformer.number_classes, my_Score)

        apply_hyperparameter_optimization = False
        cross_val_folds = 10

        self.fscore = []
        self.fscore_best = []

        N_runs = 0
        while True:

            transformers = []

            #we randomly choose one feature representation per attribute or none
            for col_i in range(self.pandas_dataframe.shape[1]):
                if col_i != self.target_column:
                    best_i = rand_state.randint(
                        len(transformations[col_i]) + 1)
                    if best_i != len(transformations[col_i]):
                        transformer_default = copy.deepcopy(
                            transformations[col_i][best_i])
                        transformer_default.column_id = col_i
                        transformers.append(transformer_default)
                    else:  # skip attribute
                        print("skip attribute")

            transformer.transformers = transformers
            failed_transformation = False
            try:
                transformer.fit()
                datasets, targets, feature_names = transformer.transform()

                if apply_hyperparameter_optimization:
                    best_params = classifier.run_cross_validation(
                        datasets[0], targets[0], cross_val_folds)
                    model_hyperparameter_optimized = classifier.fit(
                        datasets[0], targets[0], best_params)

                default_model = classifier.fit(datasets[0], targets[0])
            except Exception as e:
                print(e)
                failed_transformation = True
            if failed_transformation:
                continue

            #apply hyperparameter tuning
            '''
                # get feature importance
                b = model_hyperparameter_optimized.get_booster()
                fs = b.get_score('', importance_type='gain')
                all_features = [fs.get(f, 0.) for f in b.feature_names]
                all_features = np.array(all_features, dtype=np.float32)
                sorted = np.argsort(-all_features)
                
                number_of_features = 10
                show_features = np.array(feature_names)[sorted][0:number_of_features]
                
                # Visualize model
                fig, ax = plt.subplots()
                y_pos = np.arange(len(show_features))
                performance = all_features[sorted][0:number_of_features]
                ax.barh(y_pos, performance, align='center', color='green', ecolor='black')
                ax.set_yticks(y_pos)
                ax.set_yticklabels(show_features)
                ax.invert_yaxis()  # labels read top-to-bottom
                ax.set_xlabel('Gain')
                plt.show()
        
                '''
            for t_i in transformers:
                try:
                    print(t_i.__class__.__name__ + ": " +
                          str(len(t_i.get_feature_names(self.pandas_dataframe))))
                except Exception:
                    print(t_i.__class__.__name__ + ": exception")

            assert datasets[0].shape[1] == len(feature_names), \
                "Feature names do not fit the data dimensions"

            try:
                y_pred = default_model.predict(datasets[1])  #test
                current_score = my_Score.score(targets[1], y_pred)  #test

                if apply_hyperparameter_optimization:
                    y_pred_best = model_hyperparameter_optimized.predict(
                        datasets[1])  #test
                    current_score_best = my_Score.score(
                        targets[1], y_pred_best)  #test

                self.fscore.append(current_score)

                if apply_hyperparameter_optimization:
                    self.fscore_best.append(current_score_best)

                print(transformer.print_config())

                print "default F1: " + str(current_score)
                print "max: " + str(np.max(self.fscore))
                if apply_hyperparameter_optimization:
                    print "optimzed F1: " + str(current_score_best)
                N_runs += 1

            except Exception as e:
                print(e)

            if N_runs == self.number_of_valid_configs:
                break
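
The per-attribute sampling in this run loop draws one candidate representation per attribute, with one extra index meaning "skip the attribute". A self-contained sketch of that choice rule (toy candidate lists; the names are illustrative):

import numpy as np

rand_state = np.random.RandomState(seed=42)
transformations = {0: ["log", "scale"], 1: ["one-hot"], 2: ["bin", "scale", "rank"]}
target_column = 1

chosen = {}
for col_i, candidates in transformations.items():
    if col_i == target_column:
        continue
    best_i = rand_state.randint(len(candidates) + 1)  # last slot == skip
    if best_i != len(candidates):
        chosen[col_i] = candidates[best_i]
print(chosen)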
Example #6
        print(mypath)
        print(target_column)

        pandas_table = pd.read_csv(mypath, encoding="utf-8", parse_dates=True)
        #pandas_table = pandas_table.fillna('0.0')

        for col_rep in range(pandas_table.shape[1]):
            if col_rep != target_column:

                new_dataframe = pandas_table[[
                    pandas_table.columns[col_rep],
                    pandas_table.columns[target_column]
                ]]

                transformer = Transformer(new_dataframe, 1)

                while True:

                    transformer.fit()

                    datasets, targets, feature_names = transformer.transform()

                    if datasets[0] is None:
                        break

                    print(type(datasets[0]))
                    print(datasets[0].shape)

                    # try the next candidate transformation for this attribute
                    transformer.next_transformation_for_attribute(0)
Example #7
    for i_feature in range(len(importance_scores)):
        attribute_id = int(feature_names[i_feature].split("#")[0])
        if attribute_id not in attribute_importance:
            attribute_importance[attribute_id] = importance_scores[i_feature]
        else:
            attribute_importance[attribute_id] += importance_scores[i_feature]

    return attribute_importance
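
The helper above sums importance scores per original attribute by parsing the numeric prefix that transformed feature names carry (the part before the first '#'). A tiny self-contained illustration with toy names and scores:

feature_names = ["0#log", "0#scale", "2#one-hot"]
importance_scores = [0.4, 0.1, 0.5]

attribute_importance = {}
for name, score in zip(feature_names, importance_scores):
    attribute_id = int(name.split("#")[0])
    attribute_importance[attribute_id] = attribute_importance.get(attribute_id, 0.0) + score
print(attribute_importance)  # {0: 0.5, 2: 0.5}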


pandas_table = pd.read_csv(
    "/home/felix/.kaggle/datasets/dansbecker/melbourne-housing-snapshot/melb_data.csv",
    encoding="utf-8",
    parse_dates=True)
transformer = Transformer(pandas_table, 4)

fscore = []

while True:

    transformers = []
    skip_columns = []

    for result in results:
        if result.column_id not in skip_columns:

            #best_i = np.random.randint(len(result.transformers) + 1)
            best_i = 0
            if best_i != len(result.transformers):
                transformers.append(result.get_best_transformer(best_i))
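            # note: skip_columns is never filled in this excerpt, and with the random
            # choice commented out, best_i = 0 always takes each column's first transformer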
Example #8
    def run(self):

        transformer = Transformer(self.pandas_dataframe, self.target_column, map=False, number_clusters_for_target=1)

        self.fscore_list = []


        transformations = get_all_transformations()
        rand_state = np.random.RandomState(seed=42)


        my_Score = self.Score(transformer.number_classes)
        classifier = self.Classifier(transformer.number_classes, my_Score)

        apply_hyperparameter_optimization = False
        cross_val_folds = 10


        for run_i in range(self.number_runs):
            transformer.create_train_test_valid(train_fraction=[0.6, 1000], valid_fraction=0.2, test_fraction=0.2, seed=42 + run_i)

            final_transformers = []
            transformed_columns = []

            for attribute_i in range(self.pandas_dataframe.shape[1]):
                if attribute_i != self.target_column:
                    attribute_scores = []
                    attribute_transformation = []
                    for transformation_i in range(len(transformations)):

                        transformers = []
                        transformer_default = copy.deepcopy(transformations[transformation_i])
                        transformer_default.column_id = attribute_i
                        transformers.append(transformer_default)


                        transformer.transformers = transformers
                        failed_transformation = False
                        try:
                            transformer.fit()
                            datasets, targets, feature_names = transformer.transform()

                            if apply_hyperparameter_optimization:
                                best_params = classifier.run_cross_validation(datasets[0], targets[0], cross_val_folds)
                                model_hyperparameter_optimized = classifier.fit(datasets[0], targets[0], best_params)

                            default_model = classifier.fit(datasets[0], targets[0])
                        except Exception as e:
                            print(e)
                            failed_transformation = True
                        if failed_transformation:
                            continue

                        assert datasets[0].shape[1] == len(feature_names), \
                            "Feature names do not fit the data dimensions"

                        try:
                            y_pred = default_model.predict(datasets[2]) #check validation
                            current_score = my_Score.score(targets[2], y_pred)

                            if apply_hyperparameter_optimization:
                                y_pred_best = model_hyperparameter_optimized.predict(datasets[2])#check validation
                                current_score_best = my_Score.score(targets[2], y_pred_best)

                            attribute_scores.append(current_score)
                            attribute_transformation.append(transformation_i)

                        except Exception as e:
                            print(e)

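                    # caveat: if every candidate transformation failed above,
                    # attribute_scores stays empty and np.argmax below raises a ValueError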
                    #add best to final transformation
                    max_id = np.argmax(np.array(attribute_scores))
                    final_transformers.append(attribute_transformation[max_id])
                    transformed_columns.append(attribute_i)

            #apply best single configurations together
            transformer_indices = final_transformers
            transformed_column_indices = transformed_columns

            transformers = []
            all_transformations = get_all_transformations()
            for t_i in range(len(transformer_indices)):
                transformer_default = copy.deepcopy(all_transformations[transformer_indices[t_i]])
                transformer_default.column_id = transformed_column_indices[t_i]
                transformers.append(transformer_default)

            transformer.transformers = transformers

            transformer.fit()
            datasets, targets, feature_names = transformer.transform()

            if apply_hyperparameter_optimization:
                best_params = classifier.run_cross_validation(datasets[0], targets[0], cross_val_folds)
                model_hyperparameter_optimized = classifier.fit(datasets[0], targets[0], best_params)

            default_model = classifier.fit(datasets[0], targets[0])

            try:
                y_pred = default_model.predict(datasets[1])  # test
                current_score = my_Score.score(targets[1], y_pred)  # test

                if apply_hyperparameter_optimization:
                    y_pred_best = model_hyperparameter_optimized.predict(datasets[1])  # test
                    current_score_best = my_Score.score(targets[1], y_pred_best)  # test


                self.fscore_list.append(current_score)
            except Exception:
                self.fscore_list.append(0.0)
Example #9
File: Generator.py  Project: BigDaMa/DFS
class Generator:
    def __init__(self, data, y):
        self.transformations = []
        self.number_attributes = data.shape[1]
        self.data = data
        self.y = y
        self.unary_output_dimension_size = -1

        self.transformer = Transformer(self.data,
                                       self.y,
                                       map=False,
                                       number_clusters_for_target=1)
        self.transformer.create_train_test_valid_stratified()

        self.train_data = self.data.iloc[self.transformer.ids_parts[0], :]
        self.valid_data = self.data.iloc[self.transformer.ids_parts[2], :]
        self.test_data = self.data.iloc[self.transformer.ids_parts[1], :]

        self.train_data_bin = None
        self.valid_data_bin = None
        self.test_data_bin = None

        self.train_data_bin_una = None
        self.valid_data_bin_una = None
        self.test_data_bin_una = None

        self.train_data = self.train_data.reset_index(drop=True)
        self.valid_data = self.valid_data.reset_index(drop=True)
        self.test_data = self.test_data.reset_index(drop=True)

        print(self.train_data)

    def generate_unary_transformations(self):  # Fu,i
        unary_transformations = get_unary_transformations()

        applied_unary_transformations = []

        for attribute_i in range(self.number_attributes):
            for transformation_i in range(len(unary_transformations)):
                transformation = copy.deepcopy(
                    unary_transformations[transformation_i])
                transformation.column_id = attribute_i
                applied_unary_transformations.append(transformation)

        applied_unary_transformations_output_dim = []

        for transformation_i in range(len(applied_unary_transformations)):
            self.transformer.transformers = [
                applied_unary_transformations[transformation_i]
            ]

            #if transformer.transformers[0].output_space_size == None:
            self.transformer.fit()

            error = False
            try:
                transformed_data, target_data, feature_names = self.transformer.transform()
            except Exception:
                error = True

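            # transformed_data follows the same split order as the datasets tuple:
            # [0]=train, [1]=test, [2]=valid (matching the concatenations below)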
            if not error:
                print(self.train_data.columns)
                new_train = pd.DataFrame(data=transformed_data[0],
                                         index=range(
                                             transformed_data[0].shape[0]),
                                         columns=feature_names)

                self.train_data = pd.concat([self.train_data, new_train],
                                            axis=1)

                new_test = pd.DataFrame(data=transformed_data[1],
                                        index=range(
                                            transformed_data[1].shape[0]),
                                        columns=feature_names)

                self.test_data = pd.concat([self.test_data, new_test], axis=1)

                new_valid = pd.DataFrame(data=transformed_data[2],
                                         index=range(
                                             transformed_data[2].shape[0]),
                                         columns=feature_names)

                self.valid_data = pd.concat([self.valid_data, new_valid],
                                            axis=1)

            #print applied_unary_transformations[transformation_i]
            #print self.test_data.columns
            #print self.train_data.columns
            #print self.train_data
            assert self.train_data.shape[1] == self.test_data.shape[1], \
                "test != train: " + str(self.train_data.shape[1]) + \
                " != " + str(self.test_data.shape[1])

            print("all: " + str(self.train_data.shape))

            applied_unary_transformations_output_dim.append(
                self.transformer.transformers[0].output_space_size)

            assert (new_train is None
                    and self.transformer.transformers[0].output_space_size == 0) \
                or new_train.shape[1] == self.transformer.transformers[0].output_space_size, \
                str(self.transformer.transformers[0]) + ": " + \
                str(new_train.shape[1]) + " != " + \
                str(self.transformer.transformers[0].output_space_size)

        print(str(applied_unary_transformations_output_dim))
        print(str(np.sum(applied_unary_transformations_output_dim)))
        self.unary_output_dimension_size = applied_unary_transformations_output_dim

        assert self.train_data.shape[1] == self.test_data.shape[1], \
            "test != train: " + str(self.train_data.shape[1]) + \
            " != " + str(self.test_data.shape[1])
        assert self.train_data.shape[1] == self.valid_data.shape[1], "valid != train"
        assert self.test_data.shape[1] == self.valid_data.shape[1], "valid != test"

    def generate_binary_transformations(self):  # Fo,i
        binary_transformations = get_binary_transformations()

        applied_binary_transformations = []

        for attribute_a in range(self.train_data.shape[1]):
            for attribute_b in range(attribute_a + 1,
                                     self.train_data.shape[1]):
                for transformation_i in range(len(binary_transformations)):
                    transformation = copy.deepcopy(
                        binary_transformations[transformation_i])
                    transformation.column_a = attribute_a
                    transformation.column_b = attribute_b
                    applied_binary_transformations.append(transformation)

        print "hello"
        print len(applied_binary_transformations)

        for transformation_i in range(len(applied_binary_transformations)):
            current_transformation = applied_binary_transformations[
                transformation_i]

            train_data_column_a = self.train_data[self.train_data.columns[current_transformation.column_a]]
            train_data_column_b = self.train_data[self.train_data.columns[current_transformation.column_b]]

            test_data_column_a = self.test_data[self.test_data.columns[current_transformation.column_a]]
            test_data_column_b = self.test_data[self.test_data.columns[current_transformation.column_b]]

            valid_data_column_a = self.valid_data[self.valid_data.columns[current_transformation.column_a]]
            valid_data_column_b = self.valid_data[self.valid_data.columns[current_transformation.column_b]]

            current_transformation.fit1(train_data_column_a,
                                        train_data_column_b)

            transformed_train = current_transformation.transform1(
                train_data_column_a, train_data_column_b)
            transformed_test = current_transformation.transform1(
                test_data_column_a, test_data_column_b)
            transformed_valid = current_transformation.transform1(
                valid_data_column_a, valid_data_column_b)

            if self.train_data_bin is None:
                self.train_data_bin = np.matrix(transformed_train).T  # check if different
                self.test_data_bin = np.matrix(transformed_test).T
                self.valid_data_bin = np.matrix(transformed_valid).T
            else:
                self.train_data_bin = np.concatenate(
                    (self.train_data_bin, np.matrix(transformed_train).T),
                    axis=1)
                self.test_data_bin = np.concatenate(
                    (self.test_data_bin, np.matrix(transformed_test).T),
                    axis=1)
                self.valid_data_bin = np.concatenate(
                    (self.valid_data_bin, np.matrix(transformed_valid).T),
                    axis=1)

                #print str(self.train_data_bin.shape) + " vs " + str(np.matrix(transformed_train).T.shape)

        print "binary size:" + str(self.train_data_bin.shape)

    def generate_binary_unary_transformations(self):  # Fu,i
        unary_transformations = get_unary_transformations()

        applied_unary_transformations = []

        for attribute_i in range(self.train_data_bin.shape[1]):
            for transformation_i in range(len(unary_transformations)):
                transformation = copy.deepcopy(
                    unary_transformations[transformation_i])
                transformation.column_id = attribute_i
                applied_unary_transformations.append(transformation)

        applied_unary_transformations_output_dim = []

        for transformation_i in range(len(applied_unary_transformations)):
            current_transformer = applied_unary_transformations[
                transformation_i]

            train_data_column = self.train_data_bin[:, current_transformer.column_id]
            test_data_column = self.test_data_bin[:, current_transformer.column_id]
            valid_data_column = self.valid_data_bin[:, current_transformer.column_id]

            current_transformer.fit1(train_data_column)

            error = False
            try:
                transformed_train = current_transformer.transform1(
                    train_data_column)
                transformed_test = current_transformer.transform1(
                    test_data_column)
                transformed_valid = current_transformer.transform1(
                    valid_data_column)
            except Exception:
                error = True

            if not error:
                if self.train_data_bin_una is None:
                    self.train_data_bin_una = np.matrix(
                        transformed_train)  # check if different
                    self.test_data_bin_una = np.matrix(transformed_test)
                    self.valid_data_bin_una = np.matrix(transformed_valid)
                else:
                    print(str(self.train_data_bin_una.shape) + " vs " +
                          str(np.matrix(transformed_train).T.shape))

                    try:
                        self.train_data_bin_una = np.concatenate(
                            (self.train_data_bin_una,
                             np.matrix(transformed_train).T),
                            axis=1)
                        self.test_data_bin_una = np.concatenate(
                            (self.test_data_bin_una,
                             np.matrix(transformed_test).T),
                            axis=1)
                        self.valid_data_bin_una = np.concatenate(
                            (self.valid_data_bin_una,
                             np.matrix(transformed_valid).T),
                            axis=1)
                    except Exception:
                        self.train_data_bin_una = np.concatenate(
                            (self.train_data_bin_una,
                             np.matrix(transformed_train)),
                            axis=1)
                        self.test_data_bin_una = np.concatenate(
                            (self.test_data_bin_una,
                             np.matrix(transformed_test)),
                            axis=1)
                        self.valid_data_bin_una = np.concatenate(
                            (self.valid_data_bin_una,
                             np.matrix(transformed_valid)),
                            axis=1)

        print(self.train_data_bin_una.shape[1])
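
A side note on the column stacking used throughout this class: np.matrix is deprecated in current NumPy. A behavior-equivalent sketch with plain 2-D arrays (toy data, not from the project):

import numpy as np

transformed_train = [1.0, 2.0, 3.0]                    # one derived feature
column = np.asarray(transformed_train).reshape(-1, 1)  # same shape as np.matrix(...).T

train_data_bin = None
if train_data_bin is None:
    train_data_bin = column
else:
    train_data_bin = np.concatenate((train_data_bin, column), axis=1)
print(train_data_bin.shape)  # (3, 1)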