示例#1
0

class Cleaner(object):
    """Separate features from labels in the training set and report gaps.

    The instance keeps both stratified sets; ``clean`` derives
    ``self.housing`` (features) and ``self.housing_label`` (target) from
    the training set and prints a short missing-value report.
    """

    def __init__(self, strat_train_set, strat_test_set):
        """Store the stratified training and test sets on the instance."""
        self.strat_train_set = strat_train_set
        self.strat_test_set = strat_test_set

    def clean(self):
        """Split the training set into features/labels and print diagnostics.

        Sets ``self.housing`` to the training set without the
        "median_house_value" column and ``self.housing_label`` to a copy of
        that column. Prints the frame summary, the first 20 and the first 30
        rows containing at least one null, and the spot-check rows labelled
        4629 and 6068 when they are present. Returns ``None``.
        """
        # Read the set stored on the instance; relying on a module-level
        # name only works when the file is executed as a script.
        train_set = self.strat_train_set
        self.housing = train_set.drop("median_house_value", axis=1)
        self.housing_label = train_set["median_house_value"].copy()
        housing = self.housing

        # DataFrame.info() writes its summary itself and returns None, so
        # wrapping it in print() would only add a stray "None" line.
        housing.info()

        # Build the "row has at least one null" selection once and reuse it.
        incomplete_rows = housing[housing.isnull().any(axis=1)]
        sample_incomplete_rows = incomplete_rows.head(20)
        print(sample_incomplete_rows)
        print(incomplete_rows.head(30))

        # The spot-check labels are hard-coded; keep only those that exist
        # in this particular split so .loc cannot raise KeyError.
        spot_check = [label for label in (4629, 6068)
                      if label in housing.index]
        print(housing.loc[spot_check])


if __name__ == "__main__":
    # "show" as the first command-line argument triggers plt.show() below.
    show = sys.argv[1:2] == ['show']

    # Run the analysis pipeline and fetch the stratified split it produced.
    analysis = Analysis(HousingData()).pipeline()
    strat_train_set, strat_test_set = analysis.get_stratified_train_test_set()

    print("\n\n --------- Cleaning ----------- \n\n")
    cleaner = Cleaner(strat_train_set, strat_test_set)
    cleaner.clean()

    if show:
        plt.show()
示例#2
0
        print(test_set.head())
        return train_set, test_set

    def add_custom_id_as_function_and_split_training_set(self, func):
        """Derive an "id" column with ``func`` and split the data on it.

        Args:
            func: Callable that receives ``self.data`` and returns the
                values to store in the new "id" column.

        Returns:
            The ``(train_set, test_set)`` pair obtained from
            ``self.split_train_test_by_id`` called with the id-augmented
            frame, ``0.2`` and ``"id"``.
        """
        print("\n\n add_custom_id_as_function_and_split_training_set \n\n")
        # Work on a copy: assigning the column through an alias of self.data
        # would permanently add "id" to the shared frame and leak it into
        # every later split made from self.data.
        data_with_id = self.data.copy()
        data_with_id["id"] = func(self.data)
        train_set, test_set = self.split_train_test_by_id(
            data_with_id, 0.2, "id")
        print(test_set.head())
        return train_set, test_set

    def split_train_set_sklearn(self, test_size, random_state):
        """Split ``self.data`` with ``train_test_split`` and preview the result.

        Args:
            test_size: Forwarded as the ``test_size`` keyword.
            random_state: Forwarded as the ``random_state`` keyword.

        Returns:
            A ``(train_set, test_set)`` tuple; the head of the test part is
            printed before returning.
        """
        print("\n\n se sklearn and split train set \n\n")
        split_options = {"test_size": test_size, "random_state": random_state}
        train_part, test_part = train_test_split(self.data, **split_options)

        print(test_part.head())
        return train_part, test_part


if __name__ == "__main__":
    # Demonstrate the three splitting strategies on the housing data frame.
    housing = HousingData().get_data_frame()
    training_set = GenerateTrainingSet(housing)
    training_set.add_index_as_id_and_split_training_set()
    # Build the id from both coordinates. Using longitude twice gave every
    # row that shares a longitude the same id.
    # NOTE(review): assumes the frame has a 'latitude' column - confirm.
    training_set.add_custom_id_as_function_and_split_training_set(
        lambda data: data['longitude'] * 1000 + data['latitude'])
    # NOTE(review): test_ratio and random_state are not defined in the part
    # of the file visible here - confirm they exist at module level.
    training_set.split_train_set_sklearn(test_ratio, random_state)
示例#3
0
        s = compare_props["Stratified"]

        compare_props["Rand. %error"] = ((r - o) / o) * 100
        compare_props["Strat. %error"] = ((s - o) / o) * 100
        print("\n Comparison Matrix \n", compare_props)
        self.comparison_matrix = compare_props

    def drop_stratified_columns(self):
        """Remove the "income_cat" column from both stratified sets in place."""
        # Resolve both frames first, then drop from the train set before the
        # test set, mirroring the original iteration order.
        train_part, test_part = self.strat_train_set, self.strat_test_set
        train_part.drop("income_cat", axis="columns", inplace=True)
        test_part.drop("income_cat", axis="columns", inplace=True)

    def get_stratified_train_test_set(self):
        """Return the stored stratified sets as a ``(train, test)`` tuple."""
        train_part = self.strat_train_set
        test_part = self.strat_test_set
        return train_part, test_part

    def pipeline(self):
        """Run every analysis step in order and return ``self`` for chaining.

        The steps are ``base_graph``, ``stratified_sampling``,
        ``comparison_matrix`` and ``drop_stratified_columns``.
        """
        # NOTE(review): another method of this class assigns
        # ``self.comparison_matrix``; if that is the step invoked here, a
        # second pipeline() call on the same instance would find the stored
        # value instead of the method - confirm.
        step_names = (
            "base_graph",
            "stratified_sampling",
            "comparison_matrix",
            "drop_stratified_columns",
        )
        # Each step is looked up right before it runs, exactly as a sequence
        # of plain ``self.step()`` statements would do.
        for step_name in step_names:
            getattr(self, step_name)()

        return self


if __name__ == "__main__":
    # True only when the first command-line argument is exactly "show".
    show = sys.argv[1:2] == ['show']
    print("show = ", show)

    # Run the whole analysis, then report the size of the training split.
    analysis = Analysis(HousingData(), show).pipeline()
    strat_train_set, strat_test_set = analysis.get_stratified_train_test_set()
    print(len(strat_train_set))