class Cleaner(object):
    """Split the stratified training set into features and labels and
    print a quick overview of rows with missing values.

    Attributes set by ``__init__``:
        strat_train_set: training set; ``clean()`` reads its
            "median_house_value" column, so it is expected to be a pandas
            DataFrame containing that column.
        strat_test_set: test set; stored but not used by ``clean()``.

    Attributes set by ``clean()``:
        housing: the training set without the "median_house_value" column.
        housing_label: a copy of the "median_house_value" column.
    """

    # Row labels printed at the end of clean() for a closer look. Labels that
    # are not present in the training set are skipped instead of raising.
    _SAMPLE_ROW_LABELS = (4629, 6068)

    def __init__(self, strat_train_set, strat_test_set):
        self.strat_train_set = strat_train_set
        self.strat_test_set = strat_test_set

    def clean(self):
        """Build ``self.housing`` / ``self.housing_label`` and print diagnostics.

        Prints, in order: the frame summary from ``DataFrame.info()``, the
        first 20 rows containing at least one null, the first 30 such rows,
        and the rows whose labels are listed in ``_SAMPLE_ROW_LABELS``.

        Returns:
            None. Results are stored on the instance.
        """
        # Use the instance's own training set rather than a module-level
        # global that only exists when this file is run as a script.
        train_set = self.strat_train_set
        self.housing = train_set.drop("median_house_value", axis=1)
        self.housing_label = train_set["median_house_value"].copy()
        housing = self.housing

        # info() writes its report itself and returns None, so wrapping it in
        # print() would only add a stray "None" line.
        housing.info()

        # Compute the "has any null" selection once and reuse it.
        incomplete_rows = housing[housing.isnull().any(axis=1)]
        print(incomplete_rows.head(20))
        print(incomplete_rows.head(30))

        # Keep the configured order, but only look up labels that exist so a
        # differently sampled training set does not raise KeyError.
        present_labels = [
            label for label in self._SAMPLE_ROW_LABELS
            if label in housing.index
        ]
        print(housing.loc[present_labels])


if __name__ == "__main__":
    show = len(sys.argv) > 1 and sys.argv[1] == 'show'
    analysis = Analysis(HousingData()).pipeline()
    strat_train_set, strat_test_set = analysis.get_stratified_train_test_set()
    print("\n\n --------- Cleaning ----------- \n\n")
    cleaner = Cleaner(strat_train_set, strat_test_set)
    cleaner.clean()
    if show:
        plt.show()
print(test_set.head()) return train_set, test_set def add_custom_id_as_function_and_split_training_set(self, func): print("\n\n add_custom_id_as_function_and_split_training_set \n\n") data_with_id = self.data data_with_id["id"] = func(self.data) train_set, test_set = self.split_train_test_by_id( data_with_id, 0.2, "id") print(test_set.head()) return train_set, test_set def split_train_set_sklearn(self, test_size, random_state): print("\n\n se sklearn and split train set \n\n") train_set, test_set = train_test_split(self.data, test_size=test_size, random_state=random_state) print(test_set.head()) return train_set, test_set if __name__ == "__main__": housing = HousingData().get_data_frame() training_set = GenerateTrainingSet(housing) training_set.add_index_as_id_and_split_training_set() training_set.add_custom_id_as_function_and_split_training_set( lambda data: data['longitude'] * 1000 + data['longitude']) training_set.split_train_set_sklearn(test_ratio, random_state)
s = compare_props["Stratified"] compare_props["Rand. %error"] = ((r - o) / o) * 100 compare_props["Strat. %error"] = ((s - o) / o) * 100 print("\n Comparison Matrix \n", compare_props) self.comparison_matrix = compare_props def drop_stratified_columns(self): for set_ in (self.strat_train_set, self.strat_test_set): set_.drop("income_cat", axis=1, inplace=True) def get_stratified_train_test_set(self): return self.strat_train_set, self.strat_test_set def pipeline(self): self.base_graph() self.stratified_sampling() self.comparison_matrix() self.drop_stratified_columns() return self if __name__ == "__main__": show = len(sys.argv) > 1 and sys.argv[1] == 'show' print("show = ", show) analysis = Analysis(HousingData(), show).pipeline() strat_train_set, strat_test_set = analysis.get_stratified_train_test_set() print(len(strat_train_set))