if fraction < 0.05:
    transformers.extend(categorical_transformers)

new_dataframe = pandas_table[[pandas_table.columns[col_rep], pandas_table.columns[target_column]]]

print new_dataframe.shape
#new_dataframe = new_dataframe.sample(n=1500)
#new_dataframe.reset_index(inplace=True, drop=True)

for current_transformer in transformers:
    transformer = Transformer(new_dataframe, 1, map=False)
    transformer.create_train_test(25, 2000)
    transformer.transformers = [current_transformer]
    transformer.fit()
    datasets, targets, feature_names = transformer.transform()

    if type(datasets[0]) == type(None):
        log_file.write(str(col_rep) + ": " + str(pandas_table.columns[col_rep]) + ": " + str(transformer.transformers[0]) + ": " + str(0.0) + "\n")
        log_file.flush()
for line in f:
    tokens = line.split("#")
    user = tokens[0]
    project = tokens[1]
    csv_file = tokens[2]
    type_var = tokens[3]
    task = tokens[4]
    target_column = int(tokens[5])

    mypath = "/home/felix/.kaggle/datasets/" + str(user) + "/" + str(project) + "/" + csv_file

    pandas_table = pd.read_csv(mypath, encoding="utf-8", parse_dates=True)
    #pandas_table = pandas_table.fillna('0.0')

    transformer = Transformer(pandas_table, target_column)

    for counter_it in range(3):
        transformer.fit()
        datasets, targets, feature_names = transformer.transform()

        regr = xgb.XGBClassifier(objective='multi:softprob', nthread=4)
        regr.fit(datasets[0], targets[0])

        #from sklearn import svm
        #regr = svm.SVC()
        #regr.fit(X_train, y_train)

        #get feature importance
target_colum = 8

pandas_table = pd.read_csv("/home/felix/datasets/ExploreKit/csv/dataset_31_credit-g_german_credit.csv")
target_colum = 20

pandas_table = pd.read_csv("/home/felix/datasets/ExploreKit/csv/dataset_23_cmc_contraceptive.csv")  # 3 classes
target_colum = 9

pandas_table = pd.read_csv("/home/felix/datasets/ExploreKit/csv/phpkIxskf_bank_data.csv")
target_colum = 16

pandas_table = pd.read_csv("/home/felix/datasets/ExploreKit/csv/vehicleNorm.csv")
target_colum = 100
'''

transformer = Transformer(pandas_table, target_colum, map=False, number_clusters_for_target=1)

number_runs = 10
fscore = []
fscore_best = []

transformations = get_all_transformations()
rand_state = np.random.RandomState(seed=42)
def run(self):
    """Random search over per-attribute feature representations.

    Each iteration samples one representation (or none) per attribute, fits a
    classifier on the transformed training split, and records the test score
    until number_of_valid_configs valid configurations have been evaluated.
    """
    print self.pandas_dataframe.shape

    transformer = Transformer(self.pandas_dataframe, self.target_column, map=False, number_clusters_for_target=1)
    transformer.create_train_test_valid_stratified(train_fraction=[0.66, 1000000], valid_fraction=0.0, test_fraction=0.44, seed=42)

    transformations = get_all_transformations_per_column(self.pandas_dataframe, self.target_column)

    rand_state = np.random.RandomState(seed=42)

    my_Score = self.Score(transformer.number_classes)
    classifier = self.Classifier(transformer.number_classes, my_Score)

    apply_hyperparameter_optimization = False
    cross_val_folds = 10

    self.fscore = []
    self.fscore_best = []

    N_runs = 0
    while True:
        transformers = []

        # we randomly choose one feature representation per attribute or none
        for col_i in range(self.pandas_dataframe.shape[1]):
            if col_i != self.target_column:
                best_i = rand_state.randint(len(transformations[col_i]) + 1)
                if best_i != len(transformations[col_i]):
                    transformer_default = copy.deepcopy(transformations[col_i][best_i])
                    transformer_default.column_id = col_i
                    transformers.append(transformer_default)
                else:
                    # skip attribute
                    print "skip attribute"

        transformer.transformers = transformers

        failed_transformation = False
        try:
            transformer.fit()
            datasets, targets, feature_names = transformer.transform()

            if apply_hyperparameter_optimization:
                best_params = classifier.run_cross_validation(datasets[0], targets[0], cross_val_folds)
                model_hyperparameter_optimized = classifier.fit(datasets[0], targets[0], best_params)

            default_model = classifier.fit(datasets[0], targets[0])
        except Exception as e:
            print e
            failed_transformation = True

        if failed_transformation:
            continue

        #apply hyperparameter tuning
        '''
        # get feature importance
        b = model_hyperparameter_optimized.get_booster()
        fs = b.get_score('', importance_type='gain')
        all_features = [fs.get(f, 0.) for f in b.feature_names]
        all_features = np.array(all_features, dtype=np.float32)
        sorted = np.argsort(-all_features)

        number_of_features = 10
        show_features = np.array(feature_names)[sorted][0:number_of_features]

        # Visualize model
        fig, ax = plt.subplots()
        y_pos = np.arange(len(show_features))
        performance = all_features[sorted][0:number_of_features]

        ax.barh(y_pos, performance, align='center', color='green', ecolor='black')
        ax.set_yticks(y_pos)
        ax.set_yticklabels(show_features)
        ax.invert_yaxis()  # labels read top-to-bottom
        ax.set_xlabel('Gain')
        plt.show()
        '''

        for t_i in transformers:
            try:
                print t_i.__class__.__name__ + ": " + str(len(t_i.get_feature_names(self.pandas_dataframe)))
            except:
                print t_i.__class__.__name__ + ": " + "exception"

        assert datasets[0].shape[1] == len(feature_names), "Feature names do not fit the data dimensions"

        try:
            y_pred = default_model.predict(datasets[1])  # test
            current_score = my_Score.score(targets[1], y_pred)  # test

            if apply_hyperparameter_optimization:
                y_pred_best = model_hyperparameter_optimized.predict(datasets[1])  # test
                current_score_best = my_Score.score(targets[1], y_pred_best)  # test

            self.fscore.append(current_score)
            if apply_hyperparameter_optimization:
                self.fscore_best.append(current_score_best)

            print transformer.print_config()
            print "default F1: " + str(current_score)
            print "max: " + str(np.max(self.fscore))
            if apply_hyperparameter_optimization:
                print "optimized F1: " + str(current_score_best)

            N_runs += 1
        except Exception as e:
            print e

        if N_runs == self.number_of_valid_configs:
            break
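# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original script): the core of the loop above
# is the per-attribute sampling step, which draws either one candidate
# representation or "skip" for every non-target column. The stand-alone
# snippet below isolates that step with plain numpy and placeholder
# transformation names, so it can be read and run independently of the
# Transformer class and the surrounding runner.
import numpy as np

def sample_configuration(candidates_per_column, target_column, rand_state):
    # candidates_per_column: one list of candidate transformations per column
    chosen = {}
    for col_i, candidates in enumerate(candidates_per_column):
        if col_i == target_column:
            continue
        # index len(candidates) encodes "skip this attribute"
        pick = rand_state.randint(len(candidates) + 1)
        if pick != len(candidates):
            chosen[col_i] = candidates[pick]
    return chosen

# toy example with placeholder names instead of real transformation objects
rs = np.random.RandomState(seed=42)
demo_candidates = [["onehot", "frequency"], ["log", "zscore", "bucketize"], ["identity"]]
print sample_configuration(demo_candidates, 2, rs)
# ---------------------------------------------------------------------------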
print mypath
print target_column

pandas_table = pd.read_csv(mypath, encoding="utf-8", parse_dates=True)
#pandas_table = pandas_table.fillna('0.0')

for col_rep in range(pandas_table.shape[1]):
    if col_rep != target_column:
        new_dataframe = pandas_table[[pandas_table.columns[col_rep], pandas_table.columns[target_column]]]

        transformer = Transformer(new_dataframe, 1)

        while True:
            transformer.fit()
            datasets, targets, feature_names = transformer.transform()

            if type(datasets[0]) == type(None):
                break

            transformer.next_transformation_for_attribute(0)

            print str(type(datasets[0]))
            print str(datasets[0].shape)
for i_feature in range(len(importance_scores)):
    attribute_id = int(feature_names[i_feature].split("#")[0])
    if not attribute_id in attribute_importance:
        attribute_importance[attribute_id] = importance_scores[i_feature]
    else:
        attribute_importance[attribute_id] += importance_scores[i_feature]

return attribute_importance


pandas_table = pd.read_csv(
    "/home/felix/.kaggle/datasets/dansbecker/melbourne-housing-snapshot/melb_data.csv",
    encoding="utf-8",
    parse_dates=True)

transformer = Transformer(pandas_table, 4)

fscore = []

while True:
    transformers = []
    skip_columns = []
    for result in results:
        if not result.column_id in skip_columns:
            #best_i = np.random.randint(len(result.transformers) + 1)
            best_i = 0
            if best_i != len(result.transformers):
                transformers.append(result.get_best_transformer(best_i))
def run(self):
    """Greedy per-attribute baseline.

    For every attribute, each transformation is evaluated in isolation on the
    validation split; the best-scoring one per attribute is kept, and the
    combined winners are then applied together and scored on the test split.
    """
    transformer = Transformer(self.pandas_dataframe, self.target_column, map=False, number_clusters_for_target=1)

    self.fscore_list = []

    transformations = get_all_transformations()
    rand_state = np.random.RandomState(seed=42)

    my_Score = self.Score(transformer.number_classes)
    classifier = self.Classifier(transformer.number_classes, my_Score)

    apply_hyperparameter_optimization = False
    cross_val_folds = 10

    for run_i in range(self.number_runs):
        transformer.create_train_test_valid(train_fraction=[0.6, 1000], valid_fraction=0.2, test_fraction=0.2, seed=42 + run_i)

        final_transformers = []
        transformed_columns = []

        for attribute_i in range(self.pandas_dataframe.shape[1]):
            if attribute_i != self.target_column:
                attribute_scores = []
                attribute_transformation = []

                for transformation_i in range(len(transformations)):
                    transformers = []
                    transformer_default = copy.deepcopy(transformations[transformation_i])
                    transformer_default.column_id = attribute_i
                    transformers.append(transformer_default)

                    transformer.transformers = transformers

                    failed_transformation = False
                    try:
                        transformer.fit()
                        datasets, targets, feature_names = transformer.transform()

                        if apply_hyperparameter_optimization:
                            best_params = classifier.run_cross_validation(datasets[0], targets[0], cross_val_folds)
                            model_hyperparameter_optimized = classifier.fit(datasets[0], targets[0], best_params)

                        default_model = classifier.fit(datasets[0], targets[0])
                    except Exception as e:
                        print e
                        failed_transformation = True

                    if failed_transformation:
                        continue

                    assert datasets[0].shape[1] == len(feature_names), "Feature names do not fit the data dimensions"

                    try:
                        y_pred = default_model.predict(datasets[2])  # check validation
                        current_score = my_Score.score(targets[2], y_pred)

                        if apply_hyperparameter_optimization:
                            y_pred_best = model_hyperparameter_optimized.predict(datasets[2])  # check validation
                            current_score_best = my_Score.score(targets[2], y_pred_best)

                        attribute_scores.append(current_score)
                        attribute_transformation.append(transformation_i)
                    except Exception as e:
                        print e

                # add best to final transformation
                max_id = np.argmax(np.array(attribute_scores))
                final_transformers.append(attribute_transformation[max_id])
                transformed_columns.append(attribute_i)

        # apply best single configurations together
        transformer_indices = final_transformers
        transformed_column_indices = transformed_columns

        transformers = []
        all_transformations = get_all_transformations()
        for t_i in range(len(transformer_indices)):
            transformer_default = copy.deepcopy(all_transformations[transformer_indices[t_i]])
            transformer_default.column_id = transformed_column_indices[t_i]
            transformers.append(transformer_default)

        transformer.transformers = transformers
        transformer.fit()
        datasets, targets, feature_names = transformer.transform()

        if apply_hyperparameter_optimization:
            best_params = classifier.run_cross_validation(datasets[0], targets[0], cross_val_folds)
            model_hyperparameter_optimized = classifier.fit(datasets[0], targets[0], best_params)

        default_model = classifier.fit(datasets[0], targets[0])

        try:
            y_pred = default_model.predict(datasets[1])  # test
            current_score = my_Score.score(targets[1], y_pred)  # test

            if apply_hyperparameter_optimization:
                y_pred_best = model_hyperparameter_optimized.predict(datasets[1])  # test
                current_score_best = my_Score.score(targets[1], y_pred_best)  # test

            self.fscore_list.append(current_score)
        except:
            self.fscore_list.append(0.0)
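# ---------------------------------------------------------------------------
# Hedged sketch (not part of the original script): the method above evaluates
# every transformation for one attribute in isolation on the validation split
# and keeps the argmax. The stand-alone snippet below shows just that
# selection logic with a dummy scoring function, so the greedy step can be
# followed without the Transformer / classifier machinery.
import numpy as np

def pick_best_transformation(transformation_names, score_fn):
    # score_fn(name) -> validation score of applying that single transformation
    scores = [score_fn(name) for name in transformation_names]
    best = int(np.argmax(np.array(scores)))
    return transformation_names[best], scores[best]

# toy example: pretend longer transformation names score higher
demo_names = ["identity", "log", "onehot", "bucketize"]
print pick_best_transformation(demo_names, lambda name: len(name))
# ---------------------------------------------------------------------------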
class Generator:
    """Generates candidate features: unary transformations per attribute (Fu,i),
    binary transformations over attribute pairs (Fo,i), and unary
    transformations applied on top of the binary outputs."""

    def __init__(self, data, y):
        self.transformations = []
        self.number_attributes = data.shape[1]
        self.data = data
        self.y = y

        self.unary_output_dimension_size = -1

        self.transformer = Transformer(self.data, self.y, map=False, number_clusters_for_target=1)
        self.transformer.create_train_test_valid_stratified()

        self.train_data = self.data.iloc[self.transformer.ids_parts[0], :]
        self.valid_data = self.data.iloc[self.transformer.ids_parts[2], :]
        self.test_data = self.data.iloc[self.transformer.ids_parts[1], :]

        self.train_data_bin = None
        self.valid_data_bin = None
        self.test_data_bin = None

        self.train_data_bin_una = None
        self.valid_data_bin_una = None
        self.test_data_bin_una = None

        self.train_data = self.train_data.reset_index(drop=True)
        self.valid_data = self.valid_data.reset_index(drop=True)
        self.test_data = self.test_data.reset_index(drop=True)

        print self.train_data
        print self.train_data

    def generate_unary_transformations(self):  # Fu,i
        # apply every unary transformation to every attribute and append the
        # resulting columns to the train/test/valid frames
        unary_transformations = get_unary_transformations()

        applied_unary_transformations = []
        for attribute_i in range(self.number_attributes):
            for transformation_i in range(len(unary_transformations)):
                transformation = copy.deepcopy(unary_transformations[transformation_i])
                transformation.column_id = attribute_i
                applied_unary_transformations.append(transformation)

        applied_unary_transformations_output_dim = []
        for transformation_i in range(len(applied_unary_transformations)):
            self.transformer.transformers = [applied_unary_transformations[transformation_i]]

            #if transformer.transformers[0].output_space_size == None:
            self.transformer.fit()

            error = False
            try:
                transformed_data, target_data, feature_names = self.transformer.transform()
            except:
                error = True

            if not error:
                print self.train_data.columns

                new_train = pd.DataFrame(data=transformed_data[0], index=range(transformed_data[0].shape[0]), columns=feature_names)
                self.train_data = pd.concat([self.train_data, new_train], axis=1)

                new_test = pd.DataFrame(data=transformed_data[1], index=range(transformed_data[1].shape[0]), columns=feature_names)
                self.test_data = pd.concat([self.test_data, new_test], axis=1)

                new_valid = pd.DataFrame(data=transformed_data[2], index=range(transformed_data[2].shape[0]), columns=feature_names)
                self.valid_data = pd.concat([self.valid_data, new_valid], axis=1)

                #print applied_unary_transformations[transformation_i]
                #print self.test_data.columns
                #print self.train_data.columns
                #print self.train_data

                assert self.train_data.shape[1] == self.test_data.shape[1], "test != train: " + str(self.train_data.shape[1]) + " != " + str(self.test_data.shape[1])

                print "all: " + str(self.train_data.shape)

                applied_unary_transformations_output_dim.append(self.transformer.transformers[0].output_space_size)

                assert (type(new_train) == type(None) and self.transformer.transformers[0].output_space_size == 0) or (new_train.shape[1] == self.transformer.transformers[0].output_space_size), str(self.transformer.transformers[0]) + ": " + str(new_train.shape[1]) + " != " + str(self.transformer.transformers[0].output_space_size)

        print str(applied_unary_transformations_output_dim)
        print str(np.sum(applied_unary_transformations_output_dim))

        self.unary_output_dimension_size = applied_unary_transformations_output_dim

        assert self.train_data.shape[1] == self.test_data.shape[1], "test != train: " + str(self.train_data.shape[1]) + " != " + str(self.test_data.shape[1])
        assert self.train_data.shape[1] == self.valid_data.shape[1], "valid != train"
        assert self.test_data.shape[1] == self.valid_data.shape[1], "valid != test"

    def generate_binary_transformations(self):  # Fo,i
        # apply every binary transformation to every unordered pair of columns
        # and collect the results in the *_data_bin matrices
        binary_transformations = get_binary_transformations()

        applied_binary_transformations = []
        for attribute_a in range(self.train_data.shape[1]):
            for attribute_b in range(attribute_a + 1, self.train_data.shape[1]):
                for transformation_i in range(len(binary_transformations)):
                    transformation = copy.deepcopy(binary_transformations[transformation_i])
                    transformation.column_a = attribute_a
                    transformation.column_b = attribute_b
                    applied_binary_transformations.append(transformation)

        print "hello"
        print len(applied_binary_transformations)

        for transformation_i in range(len(applied_binary_transformations)):
            current_transformation = applied_binary_transformations[transformation_i]

            train_data_column_a = self.train_data[self.train_data.columns[applied_binary_transformations[transformation_i].column_a]]
            train_data_column_b = self.train_data[self.train_data.columns[applied_binary_transformations[transformation_i].column_b]]

            test_data_column_a = self.test_data[self.test_data.columns[applied_binary_transformations[transformation_i].column_a]]
            test_data_column_b = self.test_data[self.test_data.columns[applied_binary_transformations[transformation_i].column_b]]

            valid_data_column_a = self.valid_data[self.valid_data.columns[applied_binary_transformations[transformation_i].column_a]]
            valid_data_column_b = self.valid_data[self.valid_data.columns[applied_binary_transformations[transformation_i].column_b]]

            current_transformation.fit1(train_data_column_a, train_data_column_b)

            transformed_train = current_transformation.transform1(train_data_column_a, train_data_column_b)
            transformed_test = current_transformation.transform1(test_data_column_a, test_data_column_b)
            transformed_valid = current_transformation.transform1(valid_data_column_a, valid_data_column_b)

            if type(self.train_data_bin) == type(None):
                self.train_data_bin = np.matrix(transformed_train).T  # check if different
                self.test_data_bin = np.matrix(transformed_test).T
                self.valid_data_bin = np.matrix(transformed_valid).T
            else:
                self.train_data_bin = np.concatenate((self.train_data_bin, np.matrix(transformed_train).T), axis=1)
                self.test_data_bin = np.concatenate((self.test_data_bin, np.matrix(transformed_test).T), axis=1)
                self.valid_data_bin = np.concatenate((self.valid_data_bin, np.matrix(transformed_valid).T), axis=1)

            #print str(self.train_data_bin.shape) + " vs " + str(np.matrix(transformed_train).T.shape)

        print "binary size:" + str(self.train_data_bin.shape)

    def generate_binary_unary_transformations(self):  # Fu,i
        # apply unary transformations to the columns produced by the binary step
        unary_transformations = get_unary_transformations()

        applied_unary_transformations = []
        for attribute_i in range(self.train_data_bin.shape[1]):
            for transformation_i in range(len(unary_transformations)):
                transformation = copy.deepcopy(unary_transformations[transformation_i])
                transformation.column_id = attribute_i
                applied_unary_transformations.append(transformation)

        applied_unary_transformations_output_dim = []
        for transformation_i in range(len(applied_unary_transformations)):
            current_transformer = applied_unary_transformations[transformation_i]

            train_data_column = self.train_data_bin[:, applied_unary_transformations[transformation_i].column_id]
            test_data_column = self.test_data_bin[:, applied_unary_transformations[transformation_i].column_id]
            valid_data_column = self.valid_data_bin[:, applied_unary_transformations[transformation_i].column_id]

            current_transformer.fit1(train_data_column)

            error = False
            try:
                transformed_train = current_transformer.transform1(train_data_column)
                transformed_test = current_transformer.transform1(test_data_column)
                transformed_valid = current_transformer.transform1(valid_data_column)
            except:
                error = True

            if not error:
                if type(self.train_data_bin_una) == type(None):
                    self.train_data_bin_una = np.matrix(transformed_train)  # check if different
                    self.test_data_bin_una = np.matrix(transformed_test)
                    self.valid_data_bin_una = np.matrix(transformed_valid)
                else:
                    print str(self.train_data_bin_una.shape) + " vs " + str((np.matrix(transformed_train).T).shape)
                    try:
                        self.train_data_bin_una = np.concatenate((self.train_data_bin_una, np.matrix(transformed_train).T), axis=1)
                        self.test_data_bin_una = np.concatenate((self.test_data_bin_una, np.matrix(transformed_test).T), axis=1)
                        self.valid_data_bin_una = np.concatenate((self.valid_data_bin_una, np.matrix(transformed_valid).T), axis=1)
                    except:
                        self.train_data_bin_una = np.concatenate((self.train_data_bin_una, np.matrix(transformed_train)), axis=1)
                        self.test_data_bin_una = np.concatenate((self.test_data_bin_una, np.matrix(transformed_test)), axis=1)
                        self.valid_data_bin_una = np.concatenate((self.valid_data_bin_una, np.matrix(transformed_valid)), axis=1)

        print self.train_data_bin_una.shape[1]
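# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original file): how the Generator above
# might be driven end to end. The CSV path and target column index are
# placeholders, and this assumes the module's existing imports (pandas as pd)
# and that Generator is run from this file; the real datasets and entry point
# live elsewhere in the project.
if __name__ == "__main__":
    demo_table = pd.read_csv("/path/to/some_dataset.csv")  # placeholder path
    demo_target_column = demo_table.shape[1] - 1  # placeholder: last column as target

    generator = Generator(demo_table, demo_target_column)
    generator.generate_unary_transformations()         # Fu,i: per-attribute unary features
    generator.generate_binary_transformations()        # Fo,i: pairwise binary features
    generator.generate_binary_unary_transformations()  # unary features on top of the binary ones

    print generator.train_data.shape
    print generator.train_data_bin.shape
    print generator.train_data_bin_una.shape
# ---------------------------------------------------------------------------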