def fit(self, X, Y, learning_rate=1e-8, reg=1e-12, epochs=10000, show_fig=False):
    D = X.shape[1]  # number of features
    K = len(set(Y))  # number of classes

    X, Y = shuffle(X, Y)
    X_valid, Y_valid = X[-1000:], Y[-1000:]
    T_valid = one_hot_encoder(Y_valid)
    X, Y = X[:-1000], Y[:-1000]
    T = one_hot_encoder(Y)

    self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
    self.b2 = np.zeros(K)

    costs = []
    best_validation_error = 1
    for epoch in range(epochs):
        Y_hat, Z = self.forward(X)

        # Weight updates ----------------------
        Y_hat_T = Y_hat - T
        self.W2 -= learning_rate * (Z.T.dot(Y_hat_T) + reg * self.W2)
        self.b2 -= learning_rate * (Y_hat_T.sum(axis=0) + reg * self.b2)  # sum over samples, per class
        val = Y_hat_T.dot(self.W2.T) * (1 - Z * Z)  # tanh
        self.W1 -= learning_rate * (X.T.dot(val) + reg * self.W1)
        self.b1 -= learning_rate * (val.sum(axis=0) + reg * self.b1)  # sum over samples, per hidden unit
        # -------------------------------------

        if epoch % 10 == 0:
            Y_hat_valid, _ = self.forward(X_valid)
            c = cross_entropy(T_valid, Y_hat_valid)
            costs.append(c)
            e = error_rate(Y_valid, np.argmax(Y_hat_valid, axis=1))
            print("epoch:", epoch, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.title('Validation cost')
        plt.show()

    print("Final train classification_rate:", self.score(Y, self.predict(Y_hat)))
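The fit routines in this collection assume a one_hot_encoder(Y) helper that turns an integer label vector into an N x K indicator matrix. The sketch below is an assumption about that helper, not the original utils code.

# Hedged sketch (assumption, not the original helper): label vector -> indicator matrix.
import numpy as np

def one_hot_encoder_sketch(Y):
    Y = np.asarray(Y, dtype=int)
    K = Y.max() + 1                      # assumes labels are 0..K-1
    T = np.zeros((len(Y), K))
    T[np.arange(len(Y)), Y] = 1.0
    return T

# Example: one_hot_encoder_sketch([0, 2, 1]) -> 3x3 indicator matrix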
def generate_batch_hot(self):
    start = self.start
    end = self.end
    self.texts_train = []
    self.labels_train = []
    data_split = self.ids[start:end]
    for i in range(0, len(data_split)):
        ids_index = data_split[i][0].split(" ")
        id = int(ids_index[0])
        index = int(ids_index[1])

        labels = self.labels[index][0]
        split_labels = labels.split(" ")
        labels_temp = np.zeros(config.label_size)
        for j in range(1, len(split_labels)):
            try:
                label_index = utils.find_label_index(split_labels[j])
                labels_temp[label_index] = 1.0
            except ValueError:
                print("Label not found:", split_labels[j])
        self.labels_train.append(labels_temp)

        text_name = str(id) + "text.txt"
        with open('data/bibtex/over200/train/' + text_name, 'r') as f:
            temp_text = f.read()
        # Strip whitespace before one-hot encoding the characters
        temp_text = temp_text.replace(" ", "").replace("\t", "")
        matrix = utils.one_hot_encoder(temp_text)
        self.texts_train.append(matrix)
def generic_visit(self, node):
    '''Called upon visiting every node.'''
    if not hasattr(node, 'visited'):
        if not ast_utils.should_filter(node):
            self.collect_metadata(node)
            self.nodes_stack.append(node)

            token_id = ast_utils.get_token_id(node)
            if token_id == -1:
                print("[WARNING] --- Found unknown token", node)

            if self.include_vectorized_tokens:
                ft = feature_utils.token2vec(node, slot=self.slot)
                if np.count_nonzero(np.isnan(ft)) > 0:
                    print("[WARNING] Found nan feature for node", node)
                    ft = np.zeros(64)

            one_hot_token_type = utils.one_hot_encoder(
                token_id, 1, min=0, max=max(AST_SYMBOL_DICT.values()))

            if self.include_vectorized_tokens:
                self.feature_list.append(
                    np.concatenate([ft, one_hot_token_type[0]]))
            else:
                self.feature_list.append(one_hot_token_type[0])

            self.classes_list.append(ast_utils.get_token_class_id(node))
        node.visited = True
    ast.NodeVisitor.generic_visit(self, node)
def kmeans_for_img(kmeans, img):
    h, w, ch = img.shape
    img = np.reshape(img, (h * w, ch))
    img = kmeans.predict(img)
    img = one_hot_encoder(img, 64)
    img = np.reshape(img, (h, w, 64))
    return img
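A hedged usage sketch for kmeans_for_img above: it assumes a scikit-learn KMeans fitted with 64 clusters on per-pixel colour vectors, plus the one_hot_encoder(ids, 64) helper used inside the function; pixels and img below are stand-ins.

# Usage sketch (assumption): fit KMeans on pixel vectors, then encode an image.
import numpy as np
from sklearn.cluster import KMeans

pixels = np.random.rand(1000, 3)              # stand-in training pixels
kmeans = KMeans(n_clusters=64, n_init=10).fit(pixels)
img = np.random.rand(32, 32, 3)               # stand-in H x W x C image
encoded = kmeans_for_img(kmeans, img)         # -> array of shape (32, 32, 64)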
def installments_payments(num_rows=None, nan_as_category=True):
    ins = pd.read_csv('../input/installments_payments.csv', nrows=num_rows)
    ins, cat_cols = utils.one_hot_encoder(ins, nan_as_category=nan_as_category)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values)
    ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = ins['DPD'].apply(lambda x: x if x > 0 else 0)
    ins['DBD'] = ins['DBD'].apply(lambda x: x if x > 0 else 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(
        ['INS_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INS_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg
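The Home Credit feature functions in this collection all call utils.one_hot_encoder(df, nan_as_category=...) and expect back the encoded frame plus the list of newly created dummy columns. The sketch below is an assumption about that helper based on how it is used, not the original utils module.

# Hedged sketch (assumption): dummy-encode object columns with pandas.get_dummies.
import pandas as pd

def one_hot_encoder_df(df, nan_as_category=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    new_columns = [col for col in df.columns if col not in original_columns]
    return df, new_columns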
def credit_card_balance(num_rows=None, nan_as_category=True):
    """Load credit_card_balance.csv and aggregate its features per SK_ID_CURR."""
    cc = pd.read_csv('../input/credit_card_balance.csv', nrows=num_rows)
    # NEW CLEANING
    #cc['AMT_DRAWINGS_ATM_CURRENT'][cc['AMT_DRAWINGS_ATM_CURRENT'] < 0] = np.nan
    #cc['AMT_DRAWINGS_CURRENT'][cc['AMT_DRAWINGS_CURRENT'] < 0] = np.nan
    cc['AMT_DRAWINGS_ATM_CURRENT'] = cc['AMT_DRAWINGS_ATM_CURRENT'].apply(
        lambda x: np.nan if x < 0 else x)
    cc['AMT_DRAWINGS_CURRENT'] = cc['AMT_DRAWINGS_CURRENT'].apply(
        lambda x: np.nan if x < 0 else x)

    cc, cat_cols = utils.one_hot_encoder(cc, nan_as_category=nan_as_category)
    # General aggregations
    cc.drop(columns=['SK_ID_PREV'], inplace=True)
    # Aggregate everything by customer
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(
        ['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg
def get_feature_from_pre_app(prev):
    prev, cat_cols = utils.one_hot_encoder(prev, nan_as_category=True)
    # Days 365.243 values -> nan
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)
    # Add feature: value received / value ask
    prev['APP_CREDIT_PERC'] = prev['AMT_CREDIT'] / prev['AMT_APPLICATION']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['max', 'mean'],
        'AMT_APPLICATION': ['max', 'mean'],
        'AMT_CREDIT': ['max', 'mean'],
        'APP_CREDIT_PERC': ['max', 'mean'],
        'AMT_DOWN_PAYMENT': ['max', 'mean'],
        'AMT_GOODS_PRICE': ['max', 'mean'],
        'RATE_DOWN_PAYMENT': ['max', 'mean'],
        'RATE_INTEREST_PRIMARY': ['max', 'mean'],
        'RATE_INTEREST_PRIVILEGED': ['max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {}
    for cat in cat_cols:
        cat_aggregations[cat] = ['mean']

    prev_agg = prev.groupby('SK_ID_CURR').agg({
        **num_aggregations,
        **cat_aggregations
    })
    prev_agg.columns = pd.Index([
        'PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()
    ])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index([
        'APPROVED_' + e[0] + "_" + e[1].upper()
        for e in approved_agg.columns.tolist()
    ])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index([
        'REFUSED_' + e[0] + "_" + e[1].upper()
        for e in refused_agg.columns.tolist()
    ])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg
def fit(self, X, Y, learning_rate=1e-8, reg=1e-12, epochs=10000, show_fig=False):
    D = X.shape[1]  # number of features
    K = len(set(Y))  # number of classes

    X, Y = shuffle(X, Y)
    X_valid, Y_valid = X[-1000:], Y[-1000:]
    T_valid = one_hot_encoder(Y_valid)
    X, Y = X[:-1000], Y[:-1000]
    T = one_hot_encoder(Y)

    self.W = np.random.randn(D, K) / np.sqrt(D)
    self.b = np.zeros(K)

    costs = []
    best_validation_error = 1
    for epoch in range(epochs):
        Y_hat = self.forward(X)

        self.W -= learning_rate * (self.dJ_dw(T, Y_hat, X) + reg * self.W)
        self.b -= learning_rate * (self.dJ_db(T, Y_hat) + reg * self.b)

        if epoch % 100 == 0:
            Y_hat_valid = self.forward(X_valid)
            c = cross_entropy(T_valid, Y_hat_valid)
            costs.append(c)
            e = error_rate(Y_valid, np.argmax(Y_hat_valid, axis=1))
            print("epoch:", epoch, "cost:", c, "error:", e)
            if e < best_validation_error:
                best_validation_error = e
    print("best_validation_error:", best_validation_error)

    if show_fig:
        plt.plot(costs)
        plt.title('Validation cost')
        plt.show()

    print("Final train classification_rate:", self.score(X, Y))
def get_pos_cash(path, num_rows=None):
    """Preprocess and extract features from the POS_CASH_balance file.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to read; None reads all rows (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    pos = pd.read_csv(os.path.join(path, 'POS_CASH_balance.csv'), nrows=num_rows)
    pos, categorical_cols = utils.one_hot_encoder(pos, nan_as_category=False)
    # Flag months with late payment
    pos['LATE_PAYMENT'] = pos['SK_DPD'].apply(lambda x: 1 if x > 0 else 0)
    # Aggregate by SK_ID_CURR
    categorical_agg = {key: ['mean'] for key in categorical_cols}
    pos_agg = utils.group(pos, 'POS_', {**config.POS_CASH_AGG, **categorical_agg})
    # Sort and group by SK_ID_PREV
    sort_pos = pos.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'])
    gp = sort_pos.groupby('SK_ID_PREV')
    df = pd.DataFrame()
    df['SK_ID_CURR'] = gp['SK_ID_CURR'].first()
    df['MONTHS_BALANCE_MAX'] = gp['MONTHS_BALANCE'].max()
    # Percentage of previous loans completed and completed before initial term
    df['POS_LOAN_COMPLETED_MEAN'] = gp['NAME_CONTRACT_STATUS_Completed'].mean()
    df['POS_COMPLETED_BEFORE_MEAN'] = gp['CNT_INSTALMENT'].first() - gp['CNT_INSTALMENT'].last()
    df['POS_COMPLETED_BEFORE_MEAN'] = df.apply(
        lambda x: 1 if x['POS_COMPLETED_BEFORE_MEAN'] > 0
        and x['POS_LOAN_COMPLETED_MEAN'] > 0 else 0, axis=1)
    # Number of remaining installments (future installments) and percentage from total
    df['POS_REMAINING_INSTALMENTS'] = gp['CNT_INSTALMENT_FUTURE'].last()
    df['POS_REMAINING_INSTALMENTS_RATIO'] = gp['CNT_INSTALMENT_FUTURE'].last() / gp['CNT_INSTALMENT'].last()
    # Group by SK_ID_CURR and merge
    df_gp = df.groupby('SK_ID_CURR').sum().reset_index()
    df_gp.drop(['MONTHS_BALANCE_MAX'], axis=1, inplace=True)
    pos_agg = pd.merge(pos_agg, df_gp, on='SK_ID_CURR', how='left')
    del df, gp, df_gp, sort_pos
    gc.collect()

    # Percentage of late payments for the 3 most recent applications
    pos = utils.do_sum(pos, ['SK_ID_PREV'], 'LATE_PAYMENT', 'LATE_PAYMENT_SUM')
    # Last month of each application
    last_month_df = pos.groupby('SK_ID_PREV')['MONTHS_BALANCE'].idxmax()
    # Most recent applications (3)
    sort_pos = pos.sort_values(by=['SK_ID_PREV', 'MONTHS_BALANCE'])
    gp = sort_pos.iloc[last_month_df].groupby('SK_ID_CURR').tail(3)
    gp_mean = gp.groupby('SK_ID_CURR').mean().reset_index()
    pos_agg = pd.merge(pos_agg, gp_mean[['SK_ID_CURR', 'LATE_PAYMENT_SUM']],
                       on='SK_ID_CURR', how='left')

    # Drop some useless categorical features
    drop_features = [
        'POS_NAME_CONTRACT_STATUS_Canceled_MEAN',
        'POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN',
        'POS_NAME_CONTRACT_STATUS_XNA_MEAN']
    pos_agg.drop(drop_features, axis=1, inplace=True)
    return pos_agg
def get_feature_from_credit_card_balance(cc):
    cc, cat_cols = utils.one_hot_encoder(cc, nan_as_category=True)
    # General aggregations
    cc.drop(['SK_ID_PREV'], axis=1, inplace=True)
    cc_agg = cc.groupby('SK_ID_CURR').agg(['max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(
        ['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg
def previous_applications(num_rows=None, nan_as_category=True):
    prev = pd.read_csv('../input/previous_application.csv', nrows=num_rows)
    prev, cat_cols = utils.one_hot_encoder(prev, nan_as_category=nan_as_category)
    # Days 365.243 values -> nan
    keys = [
        'DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
        'DAYS_LAST_DUE', 'DAYS_TERMINATION'
    ]
    prev[keys] = prev[keys].replace(365243, np.nan)

    # Add feature: value ask / value received percentage
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

    # Previous applications categorical features
    cat_aggregations = {cat: 'mean' for cat in cat_cols}
    prev_agg = aggregate_data(prev, 'PA', cat_aggregations)

    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous Applications: Total Applications - only numerical features
    prev_subset = aggregate_data(prev, prefix='PREV', aggregates=num_aggregations)
    prev_agg = prev_agg.join(prev_subset, how='left', on='SK_ID_CURR')
    # Previous Applications: Approved Applications - only numerical features
    prev_subset = aggregate_data(
        prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1],
        prefix='APR',
        aggregates=num_aggregations)
    prev_agg = prev_agg.join(prev_subset, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    prev_subset = aggregate_data(
        prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1],
        prefix='REF',
        aggregates=num_aggregations)
    prev_agg = prev_agg.join(prev_subset, how='left', on='SK_ID_CURR')
    return prev_agg
def cache(self, node_attribute, num_nodes):
    path_len_2 = pickle.load(open(self.dataset + "/path_len_2.p", "rb"))
    path_len_3 = pickle.load(open(self.dataset + "/path_len_3.p", "rb"))
    path_len_4 = pickle.load(open(self.dataset + "/path_len_4.p", "rb"))
    paths = np.array(path_len_2 + path_len_3 + path_len_4)

    g = np.random.Generator(np.random.PCG64())
    sampled_paths = paths[g.choice(len(paths), 16400, replace=False)]

    real_walk_data = []
    for path in sampled_paths:
        temp_walk = utils.one_hot_encoder(np.array(path), num_nodes + 1)
        temp_type = utils.one_hot_encoder(
            np.array([node_attribute[i] for i in path]), self.num_classes)
        # Padding random walks to the max length
        if temp_type.shape[0] < self.max_path_len:
            temp_walk = utils.pad_along_axis(temp_walk, self.max_path_len, axis=0)
            temp_type = utils.pad_along_axis(temp_type, self.max_path_len, axis=0)
        real_walk_data.append((temp_type, temp_walk))
    print("Done!")
    return real_walk_data
def generate_batch_hot(self):
    start = self.start
    end = self.end
    self.texts_train = []
    self.labels_train = []
    #print(self.data[start:end, 2])
    titles = list(self.texts[start:end, 1])
    texts = list(self.texts[start:end, 2])
    for i in range(0, len(texts)):
        text = titles[i] + texts[i]
        text = text.replace(" ", "")
        matrix = utils.one_hot_encoder(text)
        self.texts_train.append(matrix)

    labels = list(self.texts[start:end, 0])
    for i in range(0, len(labels)):
        temp = np.zeros(config.label_size)
        temp[int(labels[i]) - 1] = 1
        self.labels_train.append(temp)
def get_feature_from_pos_cash(pos):
    pos, cat_cols = utils.one_hot_encoder(pos, nan_as_category=True)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(
        ['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
def pos_cash(num_rows=None, nan_as_category=True):
    pos = pd.read_csv('../input/POS_CASH_balance.csv', nrows=num_rows)
    pos, cat_cols = utils.one_hot_encoder(pos, nan_as_category=nan_as_category)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(
        ['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
def get_bureau_balance(path, num_rows=None):
    """Preprocess and extract features from bureau balance.

    Aggregations are done by SK_ID_BUREAU.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to read; None reads all rows (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    bb = pd.read_csv(os.path.join(path, 'bureau_balance.csv'), nrows=num_rows)
    bb, categorical_cols = utils.one_hot_encoder(bb, nan_as_category=False)
    bb_processed = bb.groupby('SK_ID_BUREAU')[categorical_cols].mean().reset_index()
    # Min, max, count and mean duration of payments (months)
    agg = {'MONTHS_BALANCE': ['min', 'max', 'mean', 'size']}
    bb_processed = utils.group_and_merge(bb, bb_processed, '', agg, 'SK_ID_BUREAU')
    del bb
    gc.collect()
    return bb_processed
def build_data_sets(file_name, name="no_name", avg_group_size=None,
                    derivation=None, random_state=42, test_proportion=0.2):
    eeg = EEG(data_reader=matlab_data_reader).read(file_name)
    n_channels = eeg.n_channels

    if avg_group_size:
        eeg.average_trials(avg_group_size, inplace=True)

    derivation = derivation or 'potential'
    if derivation.lower() == "electric_field":
        eeg.get_electric_field(inplace=True)
        eeg.data = eeg.data.reshape(eeg.n_channels, eeg.trial_size, -1,
                                    3).transpose((2, 0, 1, 3))
    elif derivation.lower() == 'laplacian':
        eeg.get_laplacian(inplace=True)

    n_classes = len(np.unique(eeg.trial_labels))
    labels = one_hot_encoder(eeg.trial_labels)
    X_train, X_test, y_train, y_test = train_test_split(
        eeg.data, labels, test_size=test_proportion, random_state=random_state)

    return type(
        'DataSet', (), {
            'train': EEGDataSetBatch(X_train, y_train),
            'test': type('Dataset', (), {
                'samples': X_test,
                'labels': y_test
            }),
            'trial_size': eeg.trial_size,
            'name': name,
            'derivation': derivation,
            'avg_group_size': avg_group_size,
            'random_state': random_state,
            'test_proportion': test_proportion,
            'n_channels': n_channels,
            'n_comps': eeg.n_comps,
            'n_classes': n_classes
        })
def get_credit_card(path, num_rows=None):
    """Preprocess and extract features from credit_card_balance.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to read; None reads all rows (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    cc = pd.read_csv(os.path.join(path, 'credit_card_balance.csv'), nrows=num_rows)
    cc, _ = utils.one_hot_encoder(cc, nan_as_category=False)
    cc.rename(columns={'AMT_RECIVABLE': 'AMT_RECEIVABLE'}, inplace=True)
    # Amount used from limit
    cc['LIMIT_USE'] = cc['AMT_BALANCE'] / cc['AMT_CREDIT_LIMIT_ACTUAL']
    # Current payment / Min payment
    cc['PAYMENT_DIV_MIN'] = cc['AMT_PAYMENT_CURRENT'] / cc['AMT_INST_MIN_REGULARITY']
    # Late payment
    cc['LATE_PAYMENT'] = cc['SK_DPD'].apply(lambda x: 1 if x > 0 else 0)
    # How much of the limit was drawn
    cc['DRAWING_LIMIT_RATIO'] = cc['AMT_DRAWINGS_ATM_CURRENT'] / cc['AMT_CREDIT_LIMIT_ACTUAL']
    # Aggregations by SK_ID_CURR
    cc_agg = cc.groupby('SK_ID_CURR').agg(config.CREDIT_CARD_AGG)
    cc_agg.columns = pd.Index(
        ['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    cc_agg.reset_index(inplace=True)
    # Last month balance of each credit card application
    last_ids = cc.groupby('SK_ID_PREV')['MONTHS_BALANCE'].idxmax()
    last_months_df = cc[cc.index.isin(last_ids)]
    cc_agg = utils.group_and_merge(last_months_df, cc_agg, 'CC_LAST_',
                                   {'AMT_BALANCE': ['mean', 'max']})
    # Aggregations for the last x months
    for months in [12, 24, 48]:
        cc_prev_id = cc[cc['MONTHS_BALANCE'] >= -months]['SK_ID_PREV'].unique()
        cc_recent = cc[cc['SK_ID_PREV'].isin(cc_prev_id)]
        prefix = 'CC_{}M_'.format(months)
        cc_agg = utils.group_and_merge(cc_recent, cc_agg, prefix, config.CREDIT_CARD_TIME_AGG)
    return cc_agg
def generate_batch_hot(self):
    start = self.start
    end = self.end
    self.texts_train = []
    self.labels_train = []
    #print(self.data[start:end, 2])
    titles = list(self.data[start:end, 1])
    texts = list(self.data[start:end, 2])
    for i in range(0, len(texts)):
        text = titles[i] + texts[i]
        text = text.replace(" ", "")
        matrix = utils.one_hot_encoder(text)
        self.texts_train.append(matrix)

    labels = list(self.data[start:end, 0])
    for i in range(0, len(labels)):
        if labels[i] == '1':
            self.labels_train.append([1, 0, 0, 0])
        if labels[i] == '2':
            self.labels_train.append([0, 1, 0, 0])
        if labels[i] == '3':
            self.labels_train.append([0, 0, 1, 0])
        if labels[i] == '4':
            self.labels_train.append([0, 0, 0, 1])
def oneHotGenerate(dataSet, labSet, batchSize):
    dataSet = list(dataSet)
    labSet = list(labSet)
    data_list = np.zeros((batchSize, 500, 25, 1), dtype=np.float32)
    label_list = np.zeros((batchSize))
    setNum = len(dataSet)
    batchFlag = 0
    setFlag = 0
    while True:
        data = dataSet[setFlag]
        label = labSet[setFlag]
        data = one_hot_encoder(data)
        data = normalization_processing(data)
        data_list[batchFlag, :, :, 0] = data
        label_list[batchFlag] = label
        batchFlag += 1
        setFlag += 1
        if setFlag >= setNum:
            setFlag = 0
        if batchFlag >= batchSize:
            oneHotLab = to_categorical(label_list, num_classes=2)
            yield [data_list], [oneHotLab]
            batchFlag = 0
            data_list = np.zeros((batchSize, 500, 25, 1), dtype=np.float32)
            label_list = np.zeros((batchSize))  # reset labels (was mistakenly assigned to 'lab_list')
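A hedged usage sketch for oneHotGenerate above: it assumes a Keras model whose input shape is (500, 25, 1) with 2 output classes; the model architecture, train_seqs and train_labels below are hypothetical placeholders, not taken from the original project.

# Usage sketch (assumption): feed the generator to a small Keras model.
from tensorflow.keras import layers, models

model = models.Sequential([
    layers.Conv2D(16, (3, 3), activation='relu', input_shape=(500, 25, 1)),
    layers.GlobalAveragePooling2D(),
    layers.Dense(2, activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy')

gen = oneHotGenerate(train_seqs, train_labels, batchSize=32)   # hypothetical data
model.fit(gen, steps_per_epoch=len(train_seqs) // 32, epochs=5)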
def generate_batch_hot(self):
    start = self.start
    end = self.end
    self.texts_train = []
    self.labels_train = []
    data_split = self.ids[start:end]
    for i in range(0, len(data_split)):
        #print(data_split[i])
        ids_index = data_split[i][0].split(" ")
        id = int(ids_index[0])
        index = int(ids_index[1])

        labels = self.labels[index][0]
        split_labels = labels.split(" ")
        labels_temp = np.zeros(config.label_size)
        for j in range(1, len(split_labels)):
            try:
                label_index = utils.find_label_index(split_labels[j])
                labels_temp[label_index] = 1.0
            except ValueError:
                print("Label not found:", split_labels[j])
        self.labels_train.append(labels_temp)

        text_name = str(id) + "newsML.xml"
        #reuters = et.parse("data/rcv1-2/train-text/" + text_name, et.XMLParser(encoding='ISO-8859-1')).getroot()
        reuters = et.parse("data/rcv1-2/test-text0/" + text_name,
                           et.XMLParser(encoding='ISO-8859-1')).getroot()
        temp_text = ""
        for text in reuters.findall("title"):
            #print(text.text)
            temp_text = temp_text + text.text.replace(" ", "")
        for text in reuters.findall("text"):
            for p in text.findall("p"):
                temp_text = temp_text + p.text.replace(" ", "").replace("\t", "")
        #print("ID TExt: ", id)
        #print(temp_text)
        matrix = utils.one_hot_encoder(temp_text)
        self.texts_train.append(matrix)
def fit(self, X, Y, activation=th.nnet.relu, learning_rate=1e-8, reg=1e-12,
        epochs=10000, n_batches=10, decay_rate=0.9, show_fig=False):
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    X, Y = shuffle(X, Y)
    X_valid, Y_valid = X[-1000:], Y[-1000:]
    T_valid = one_hot_encoder(Y_valid)
    X, Y = X[:-1000], Y[:-1000]
    T = one_hot_encoder(Y)

    self.rng = theano.tensor.shared_randomstreams.RandomStreams()

    eps = 1e-10
    D = X.shape[1]  # number of features
    K = len(set(Y))  # number of classes
    batch_size = X.shape[0] // n_batches
    print_time = n_batches // 1

    M1 = D
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, activation_fn=activation)
        self.layers.append(h)
        M1 = M2
    # the final layer
    h = HiddenLayer(M1, K, activation_fn=th.nnet.softmax)
    self.layers.append(h)

    for layer in self.layers:
        self.params += layer.params

    dparams = [
        theano.shared(np.zeros_like(p.get_value())) for p in self.params
    ]
    cache = [
        theano.shared(np.zeros_like(p.get_value())) for p in self.params
    ]

    thX = th.matrix('X')
    thT = th.matrix('T')
    thY_train = self.forward_train(thX)

    # Cost
    regularization_cost = reg * th.mean([(p * p).sum() for p in self.params])
    #cost = -th.mean(th.log(thY[th.arange(thT.shape[0]), thT]))  #+ regularization_cost
    cost_train = -th.mean(thT * th.log(thY_train)) + regularization_cost

    # Gradient
    grads = th.grad(cost_train, self.params)
    update_params = [(p, p - learning_rate * (decay_rate * v +
                                              (1 - decay_rate) * g + reg * p))
                     for g, v, p in zip(grads, dparams, self.params)]
    update_velocity = [(v, decay_rate * v + (1 - decay_rate) * g)
                       for g, v in zip(grads, dparams)]
    # updates = [(p, p - learning_rate*g) for g, p in zip(grads, self.params)]
    updates = update_params + update_velocity

    train_op = theano.function(inputs=[thX, thT], updates=updates)

    thY_predict = self.forward_predict(thX)
    cost_predict = -th.mean(thT * th.log(thY_predict)) + regularization_cost

    # Predictions
    prediction = th.argmax(thY_predict, axis=1)
    # use cost_predict here; the plain 'cost' variable was never defined
    cost_predict_op = theano.function(inputs=[thX, thT],
                                      outputs=[cost_predict, prediction])

    costs = []
    for epoch in range(epochs):
        X_shuffled, T_shuffled = shuffle(X, T)
        for batch in range(n_batches):
            # Get the batch
            X_batch = X_shuffled[batch * batch_size:(batch + 1) * batch_size, :]
            Y_batch = T_shuffled[batch * batch_size:(batch + 1) * batch_size, :]

            train_op(X_batch, Y_batch)

            if batch % print_time == 0:
                test_cost, prediction_valid = cost_predict_op(X_valid, T_valid)
                err = error_rate(Y_valid, prediction_valid)
                # print(prediction_valid.shape)
                print("epoch [%d], batch [%d] : cost=[%.3f], error=[%.3f]" %
                      (epoch, batch, test_cost, err))
                costs.append(test_cost)

    plt.plot(costs)
    plt.title('Validation cost')
    plt.show()
def get_bureau(path, num_rows=None):
    """Preprocess and extract features from bureau and bureau balance.

    Get bureau balance features grouped by SK_ID_BUREAU and append them to the
    bureau data. After that, perform aggregations for each customer (unique
    SK_ID_CURR) and return a DataFrame.

    Arguments:
        path: Path to the folder where files are saved (string).
        num_rows: Number of rows to read; None reads all rows (int, default: None).

    Returns:
        df: DataFrame with processed data.
    """
    bureau = pd.read_csv(os.path.join(path, 'bureau.csv'), nrows=num_rows)
    # Credit duration and credit/account end date difference
    bureau['CREDIT_DURATION'] = -bureau['DAYS_CREDIT'] + bureau['DAYS_CREDIT_ENDDATE']
    bureau['ENDDATE_DIF'] = bureau['DAYS_CREDIT_ENDDATE'] - bureau['DAYS_ENDDATE_FACT']
    # Credit to debt ratio and difference
    bureau['DEBT_PERCENTAGE'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_CREDIT_SUM_DEBT']
    bureau['DEBT_CREDIT_DIFF'] = bureau['AMT_CREDIT_SUM'] - bureau['AMT_CREDIT_SUM_DEBT']
    bureau['CREDIT_TO_ANNUITY_RATIO'] = bureau['AMT_CREDIT_SUM'] / bureau['AMT_ANNUITY']
    # One-hot encoder
    bureau, _ = utils.one_hot_encoder(bureau, nan_as_category=False)
    # Join bureau balance features
    bureau = bureau.merge(get_bureau_balance(path, num_rows), how='left', on='SK_ID_BUREAU')
    # Flag months with late payments (days past due)
    bureau['STATUS_12345'] = 0
    for i in range(1, 6):
        bureau['STATUS_12345'] += bureau['STATUS_{}'.format(i)]

    # Aggregate by number of months in balance and merge with bureau (loan length agg)
    features = ['AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_CREDIT_SUM',
                'AMT_CREDIT_SUM_DEBT', 'DEBT_PERCENTAGE', 'DEBT_CREDIT_DIFF',
                'STATUS_0', 'STATUS_12345']
    agg_length = bureau.groupby('MONTHS_BALANCE_SIZE')[features].mean().reset_index()
    agg_length.rename({feat: 'LL_' + feat for feat in features}, axis=1, inplace=True)
    bureau = bureau.merge(agg_length, how='left', on='MONTHS_BALANCE_SIZE')
    del agg_length
    gc.collect()

    # General loans aggregations
    agg_bureau = utils.group(bureau, 'BUREAU_', config.BUREAU_AGG)
    # Active and closed loans aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    agg_bureau = utils.group_and_merge(active, agg_bureau, 'BUREAU_ACTIVE_', config.BUREAU_ACTIVE_AGG)
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    agg_bureau = utils.group_and_merge(closed, agg_bureau, 'BUREAU_CLOSED_', config.BUREAU_CLOSED_AGG)
    del active, closed
    gc.collect()
    # Aggregations for the main loan types
    for credit_type in ['Consumer credit', 'Credit card', 'Mortgage', 'Car loan', 'Microloan']:
        type_df = bureau[bureau['CREDIT_TYPE_' + credit_type] == 1]
        prefix = 'BUREAU_' + credit_type.split(' ')[0].upper() + '_'
        agg_bureau = utils.group_and_merge(type_df, agg_bureau, prefix, config.BUREAU_LOAN_TYPE_AGG)
        del type_df
        gc.collect()
    # Time based aggregations: last x months
    for time_frame in [6, 12, 24, 36]:
        prefix = "BUREAU_LAST{}M_".format(time_frame)
        time_frame_df = bureau[bureau['DAYS_CREDIT'] >= -30 * time_frame]
        agg_bureau = utils.group_and_merge(time_frame_df, agg_bureau, prefix, config.BUREAU_TIME_AGG)
        del time_frame_df
        gc.collect()
    # Last loan max overdue
    sort_bureau = bureau.sort_values(by=['DAYS_CREDIT'])
    gr = sort_bureau.groupby('SK_ID_CURR')['AMT_CREDIT_MAX_OVERDUE'].last().reset_index()
    # rename the column (axis=1), not the index
    gr.rename({'AMT_CREDIT_MAX_OVERDUE': 'BUREAU_LAST_LOAN_MAX_OVERDUE'}, axis=1, inplace=True)
    agg_bureau = agg_bureau.merge(gr, on='SK_ID_CURR', how='left')
    # Ratios: total debt / total credit and active loans debt / active loans credit
    agg_bureau['BUREAU_DEBT_OVER_CREDIT'] = \
        agg_bureau['BUREAU_AMT_CREDIT_SUM_DEBT_SUM'] / agg_bureau['BUREAU_AMT_CREDIT_SUM_SUM']
    agg_bureau['BUREAU_ACTIVE_DEBT_OVER_CREDIT'] = \
        agg_bureau['BUREAU_ACTIVE_AMT_CREDIT_SUM_DEBT_SUM'] / agg_bureau['BUREAU_ACTIVE_AMT_CREDIT_SUM_SUM']
    return agg_bureau
# test set
x_test1 = feature_all[i]
y_test = target_all[i]['reason']

# remove foursquare data
# x_train = x_train.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7'],axis=1)
# x_test = x_test.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7'],axis=1)

# train (layer 1)
#eta_list = np.array([0.05]*200+[0.02]*200+[0.01]*200)
gbm1 = xgb.XGBClassifier(max_depth=3, n_estimators=20, learning_rate=0.01, nthread=12,
                         subsample=1, max_delta_step=0).fit(x_train1, y_train1)
y_pred1 = gbm1.predict(x_train1)

# train (layer 2)
y_pred1_code = pd.DataFrame(
    columns=['loc {}'.format(j) for j in range(len(location_top))])
for j in range(x_train1.shape[0]):
    y_pred1_code.loc[j, :] = one_hot_encoder(y_pred1[j], np.array(location_top))
x_train2 = pd.concat([x_train1, y_pred1_code], axis=1)
gbm2 = xgb.XGBClassifier(max_depth=3, n_estimators=20, learning_rate=0.01, nthread=12,
                         subsample=1, max_delta_step=0).fit(x_train2, y_train2)

# train performance
# y_pred = gbm.predict(x_train)
# conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

# test (layer 1)
y_pred1 = gbm1.predict(x_test1)
y_pred1_code = pd.DataFrame(
    columns=['loc {}'.format(j) for j in range(len(location_top))])

# test (layer 2)
for j in range(x_test1.shape[0]):
    y_pred1_code.loc[j, :] = one_hot_encoder(y_pred1[j], np.array(location_top))
x_test2 = pd.concat([x_test1, y_pred1_code], axis=1)
# reading the dataset
df = utils.read_dataset(path, separator)

# global analysis of the dataset
utils.broad_analysis(df)
utils.missing_values_table(df)
features_list = df.columns

# dropping duplicates so duplicated data points do not carry more weight than they should
df = df.drop_duplicates()

# removing columns containing only one value, which only bring useless noise to the dataset
df = utils.remove_unique_feature(df)

# removing proper nouns from the dataset
name_features = utils.remove_name(nlp, df)
df = df.drop(name_features, axis=1)

# one hot encoding of categorical data
df = utils.one_hot_encoder(df)

# processing of NaN values
df = utils.missing_values(df, 'drop')

# visualizing the correlation matrix (linear correlation only)
utils.visualise_correlation(df)

features = df.columns.tolist()
del features[features.index(str(target))]

# converting float values to log(min(x)+1) if the distribution is skewed;
# this allows correcting the distribution towards gaussian for better outlier removal
for feature in features:
    if df[feature].dtypes == 'float64':
        if df[feature].skew() == 0:
            pass
        else:
            print(df.columns)
            df[feature] = df[feature].apply(
def main():
    #file_loc = '/media/avemuri/DEV/Data/deeplearning/mnist/train.csv'
    file_loc = 'D:/dev/data/face_emotion_recognizer/fer2013.csv'
    X_train, Y_train, X_test, Y_test = get_data(file_name=file_loc)

    pca = PCA(n_components=400)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)

    T_train = one_hot_encoder(Y_train)
    T_test = one_hot_encoder(Y_test)

    D = X_train.shape[1]  # number of features
    K = len(set(Y_train))  # number of classes

    decay_rate = 0.999
    eps = 1e-10
    epochs = 100
    n_batches = 10
    batch_size = X_train.shape[0] // n_batches
    print_time = n_batches
    M = 300
    learning_rate = 1e-6
    reg = 1e-8

    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    thX = th.matrix('X')
    thT = th.matrix('Y')
    W1 = theano.shared(W1_init, 'W1')
    b1 = theano.shared(b1_init, 'b1')
    W2 = theano.shared(W2_init, 'W2')
    b2 = theano.shared(b2_init, 'b2')
    # RMSprop caches are shared variables that must be updated on every step,
    # otherwise the running average of squared gradients never accumulates
    cache_W1 = theano.shared(np.ones_like(W1_init), 'cache_w1')
    cache_b1 = theano.shared(np.ones_like(b1_init), 'cache_b1')
    cache_W2 = theano.shared(np.ones_like(W2_init), 'cache_w2')
    cache_b2 = theano.shared(np.ones_like(b2_init), 'cache_b2')

    # forward model
    thZ = th.nnet.relu(thX.dot(W1) + b1)
    #thZ[thZ < 0] = 0  # Z = np.tanh(X.dot(self.W1) + self.b1)
    thY = th.nnet.softmax(thZ.dot(W2) + b2)

    # Cost: negative log-likelihood plus L2 regularization (the penalty is added,
    # not negated together with the likelihood term)
    cost = -(thT * th.log(thY)).sum() + reg * ((W1 * W1).sum() + (b1 * b1).sum() +
                                               (W2 * W2).sum() + (b2 * b2).sum())

    # Prediction
    prediction = th.argmax(thY, axis=1)

    # Updates
    dJ_dW1 = th.grad(cost, W1)
    dJ_db1 = th.grad(cost, b1)
    dJ_dW2 = th.grad(cost, W2)
    dJ_db2 = th.grad(cost, b2)

    new_cache_W1 = decay_rate * cache_W1 + (1 - decay_rate) * dJ_dW1 * dJ_dW1
    new_cache_b1 = decay_rate * cache_b1 + (1 - decay_rate) * dJ_db1 * dJ_db1
    new_cache_W2 = decay_rate * cache_W2 + (1 - decay_rate) * dJ_dW2 * dJ_dW2
    new_cache_b2 = decay_rate * cache_b2 + (1 - decay_rate) * dJ_db2 * dJ_db2

    update_W1 = W1 - learning_rate * dJ_dW1 / (th.sqrt(new_cache_W1) + eps)
    update_b1 = b1 - learning_rate * dJ_db1 / (th.sqrt(new_cache_b1) + eps)
    update_W2 = W2 - learning_rate * dJ_dW2 / (th.sqrt(new_cache_W2) + eps)
    update_b2 = b2 - learning_rate * dJ_db2 / (th.sqrt(new_cache_b2) + eps)

    train = theano.function(
        inputs=[thX, thT],
        updates=[(W1, update_W1), (b1, update_b1),
                 (W2, update_W2), (b2, update_b2),
                 (cache_W1, new_cache_W1), (cache_b1, new_cache_b1),
                 (cache_W2, new_cache_W2), (cache_b2, new_cache_b2)])
    get_prediction = theano.function(inputs=[thX, thT], outputs=[cost, prediction])

    costs = []
    for epoch in range(epochs):
        X_shuffled, T_shuffled = shuffle(X_train, T_train)
        for batch in range(n_batches):
            # Get the batch
            X_batch = X_shuffled[batch * batch_size:(batch + 1) * batch_size, :]
            Y_batch = T_shuffled[batch * batch_size:(batch + 1) * batch_size, :]

            train(X_batch, Y_batch)

            if batch % print_time == 0:
                c, pred = get_prediction(X_test, T_test)
                err = error_rate(Y_test, pred)
                print("epoch [%d], batch [%d] : cost=[%.3f], error=[%.3f]" %
                      (epoch, batch, c, err))
                costs.append(c)

    plt.plot(costs)
    plt.title('Validation cost')
    plt.show()
def fit(self, X, Y, learning_rate=1e-8, reg=1e-12, epochs=10000, n_batches=10, show_fig=False):
    D = X.shape[1]  # number of features
    K = len(set(Y))  # number of classes

    X, Y = shuffle(X, Y)
    X_valid, Y_valid = X[-1000:], Y[-1000:]
    T_valid = one_hot_encoder(Y_valid)
    X, Y = X[:-1000], Y[:-1000]
    batch_size = X.shape[0] // n_batches
    T = one_hot_encoder(Y)

    self.W1 = np.random.randn(D, self.M) / np.sqrt(D)
    self.b1 = np.zeros(self.M)
    self.W2 = np.random.randn(self.M, K) / np.sqrt(self.M)
    self.b2 = np.zeros(K)

    # 1st moment
    mW1 = 0
    mb1 = 0
    mW2 = 0
    mb2 = 0

    # 2nd moment
    vW1 = 0
    vb1 = 0
    vW2 = 0
    vb2 = 0

    # hyperparams
    beta1 = 0.9
    beta2 = 0.999
    eps = 1e-8

    costs = []
    t = 1
    for epoch in range(epochs):
        X_shuffled, T_shuffled = shuffle(X, T)
        for ibatch in range(n_batches):
            # Get the batch
            X_batch = X_shuffled[ibatch * batch_size:(ibatch + 1) * batch_size, :]
            Y_batch = T_shuffled[ibatch * batch_size:(ibatch + 1) * batch_size, :]

            Y_hat, Z = self.forward(X_batch)

            # Weight updates ----------------------
            Y_hat_T = Y_hat - Y_batch
            dJ_dW2 = Z.T.dot(Y_hat_T) + reg * self.W2
            dJ_db2 = Y_hat_T.sum(axis=0) + reg * self.b2  # sum over samples, per class

            val = (Y_hat - Y_batch).dot(self.W2.T) * (Z > 0)  # Relu
            #val = Y_hat_T.dot(self.W2.T) * (1-Z*Z)  # tanh
            dJ_dW1 = X_batch.T.dot(val) + reg * self.W1
            dJ_db1 = val.sum(axis=0) + reg * self.b1  # sum over samples, per hidden unit

            # Mean
            mW2 = beta1 * mW2 + (1 - beta1) * dJ_dW2
            mb2 = beta1 * mb2 + (1 - beta1) * dJ_db2
            mW1 = beta1 * mW1 + (1 - beta1) * dJ_dW1
            mb1 = beta1 * mb1 + (1 - beta1) * dJ_db1

            # Velocity terms
            vW2 = beta2 * vW2 + (1 - beta2) * dJ_dW2 * dJ_dW2
            vb2 = beta2 * vb2 + (1 - beta2) * dJ_db2 * dJ_db2
            vW1 = beta2 * vW1 + (1 - beta2) * dJ_dW1 * dJ_dW1
            vb1 = beta2 * vb1 + (1 - beta2) * dJ_db1 * dJ_db1

            # Bias correction
            correction1 = 1 - beta1**t
            hat_mW2 = mW2 / correction1
            hat_mb2 = mb2 / correction1
            hat_mW1 = mW1 / correction1
            hat_mb1 = mb1 / correction1

            correction2 = 1 - beta2**t
            hat_vW2 = vW2 / correction2
            hat_vb2 = vb2 / correction2
            hat_vW1 = vW1 / correction2
            hat_vb1 = vb1 / correction2

            self.W2 -= learning_rate * hat_mW2 / (np.sqrt(hat_vW2) + eps)
            self.b2 -= learning_rate * hat_mb2 / (np.sqrt(hat_vb2) + eps)
            self.W1 -= learning_rate * hat_mW1 / (np.sqrt(hat_vW1) + eps)
            self.b1 -= learning_rate * hat_mb1 / (np.sqrt(hat_vb1) + eps)
            # -------------------------------------

            Y_hat_valid, _ = self.forward(X_valid)
            c = cross_entropy(T_valid, Y_hat_valid)
            costs.append(c)
            if ibatch % n_batches == 0:
                e = error_rate(Y_valid, np.argmax(Y_hat_valid, axis=1))
                print("epoch:", epoch, " cost:", c, " error:", e)

            t += 1  # Adam time step increments once per parameter update

    if show_fig:
        plt.plot(costs)
        plt.title('Validation cost')
        plt.show()

    print("Final train classification_rate:", self.score(X, Y))
def LogReg2D_classification(dataset, filename):
    """Classification of data with 2D logistic regression, followed by
    plotting of ROC and PR curves.

    Parameters
    ---
    dataset: the input dataset, containing training and test split data, and
        the corresponding labels for binding- and non-binding sequences.
    filename: an identifier to distinguish different plots from each other.

    Returns
    ---
    stats: array containing classification accuracy, precision and recall
    """

    # Import training/test set
    X_train = dataset.train.loc[:, 'AASeq'].values
    X_test = dataset.test.loc[:, 'AASeq'].values

    # One hot encode the sequences in 2D
    X_train = [one_hot_encoder(s=x, alphabet=IUPAC.protein) for x in X_train]
    X_train_2D_list = []
    for x in range(0, len(X_train)):
        X_train_2D = np.empty([20, 0])
        for y in range(0, X_train[x].shape[1] - 1):
            for z in range(0, X_train[x].shape[0]):
                X_train_2D = np.concatenate(
                    (X_train_2D, X_train[x][z, y] * X_train[x][:, y + 1:]), axis=1)
        X_train_2D_list.append(X_train_2D)
    X_train = [x.flatten('F') for x in X_train_2D_list]

    X_test = [one_hot_encoder(s=x, alphabet=IUPAC.protein) for x in X_test]
    X_test_2D_list = []
    for x in range(0, len(X_test)):
        X_test_2D = np.empty([20, 0])
        for y in range(0, X_test[x].shape[1] - 1):
            for z in range(0, X_test[x].shape[0]):
                X_test_2D = np.concatenate(
                    (X_test_2D, X_test[x][z, y] * X_test[x][:, y + 1:]), axis=1)
        X_test_2D_list.append(X_test_2D)
    X_test = [x.flatten('F') for x in X_test_2D_list]

    # Extract labels of training/test set
    y_train = dataset.train.loc[:, 'AgClass'].values
    y_test = dataset.test.loc[:, 'AgClass'].values

    # Fitting Logistic Regression to the training set
    LR_classifier = LogisticRegression(random_state=0)
    LR_classifier.fit(X_train, y_train)

    # Predicting the test set results
    y_pred = LR_classifier.predict(X_test)
    y_score = LR_classifier.predict_proba(X_test)

    # ROC curve
    title = '2D Logistic Regression ROC curve (Train={})'.format(filename)
    plot_ROC_curve(y_test, y_score[:, 1], plot_title=title,
                   plot_dir='figures/2DLR_ROC_Test_{}.png'.format(filename))

    # Precision-recall curve
    title = '2D Logistic Regression Precision-Recall curve (Train={})'.format(filename)
    plot_PR_curve(y_test, y_score[:, 1], plot_title=title,
                  plot_dir='figures/2DLR_P-R_Test_{}.png'.format(filename))

    # Calculate statistics
    stats = calc_stat(y_test, y_pred)

    # Return statistics
    return stats
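A hedged sketch of the sequence-level one_hot_encoder(s, alphabet=...) the function above relies on: judging from how the matrices are indexed (20 rows, one column per residue), each amino acid becomes a one-hot column over a 20-letter alphabet. This is an assumption, not the original helper.

# Hedged sketch (assumption): protein sequence -> 20 x len(s) one-hot matrix.
import numpy as np

def one_hot_encoder_seq(s, alphabet='ACDEFGHIKLMNPQRSTVWY'):
    index = {aa: i for i, aa in enumerate(alphabet)}
    m = np.zeros((len(alphabet), len(s)))
    for j, aa in enumerate(s):
        m[index[aa], j] = 1.0
    return m

# Example: one_hot_encoder_seq('ACD') -> 20 x 3 matrix with a single 1 per column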
def fit(self, X, Y, activation=tf.nn.relu, learning_rate=1e-8, reg=1e-12,
        epochs=10000, n_batches=10, decay_rate=0.9, show_fig=False):
    X = X.astype(np.float32)
    Y = Y.astype(np.int32)
    X, Y = shuffle(X, Y)
    X_valid, Y_valid = X[-1000:], Y[-1000:]
    T_valid = one_hot_encoder(Y_valid)
    X, Y = X[:-1000], Y[:-1000]
    T = one_hot_encoder(Y)

    eps = 1e-10
    D = X.shape[1]  # number of features
    K = len(set(Y))  # number of classes
    batch_size = X.shape[0] // n_batches
    print_time = n_batches // 1

    M1 = D
    for M2 in self.hidden_layer_sizes:
        h = HiddenLayer(M1, M2, activation_fn=activation)
        self.layers.append(h)
        M1 = M2
    # the final layer outputs raw logits: softmax_cross_entropy_with_logits_v2
    # applies the softmax itself, so it must not be applied twice
    h = HiddenLayer(M1, K, activation_fn=lambda x: x)
    self.layers.append(h)

    for layer in self.layers:
        self.params += layer.params

    tfX = tf.placeholder(tf.float32, shape=(None, D), name='tfX')
    tfT = tf.placeholder(tf.float32, shape=(None, K), name='tfT')
    tfY = self.forward(tfX)

    predict_op = tf.argmax(tfY, axis=1)
    cost = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=tfY, labels=tfT))
    train_op = tf.train.RMSPropOptimizer(learning_rate, decay=0.99,
                                         momentum=0.9).minimize(cost)

    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for epoch in range(epochs):
            X_shuffled, T_shuffled = shuffle(X, T)
            for batch in range(n_batches):
                # Get the batch
                X_batch = X_shuffled[batch * batch_size:(batch + 1) * batch_size, :]
                Y_batch = T_shuffled[batch * batch_size:(batch + 1) * batch_size, :]

                session.run(train_op, feed_dict={tfX: X_batch, tfT: Y_batch})

                if batch % print_time == 0:
                    test_cost = session.run(cost, feed_dict={tfX: X_valid, tfT: T_valid})
                    prediction = session.run(predict_op, feed_dict={tfX: X_valid})
                    err = error_rate(Y_valid, prediction)
                    # print(prediction.shape)
                    print("epoch [%d], batch [%d] : cost=[%.3f], error=[%.3f]" %
                          (epoch, batch, test_cost, err))
                    costs.append(test_cost)

    plt.plot(costs)
    plt.title('Validation cost')
    plt.show()
# x_test = x_test.drop(['fsq 0','fsq 1','fsq 2','fsq 3','fsq 4','fsq 5','fsq 6','fsq 7'],axis=1)

# train (layer 1)
#eta_list = np.array([0.05]*200+[0.02]*200+[0.01]*200)
gbm1 = xgb.XGBClassifier(max_depth=3, n_estimators=20, learning_rate=0.05, nthread=12,
                         subsample=1, max_delta_step=0).fit(x_train1, y_train1)
y_pred1 = gbm1.predict(x_train1)

# train (layer 2)
y_pred1_code = pd.DataFrame(
    columns=['loc {}'.format(j) for j in range(len(location_top))])
for j in range(x_train1.shape[0]):
    y_pred1_code.loc[j, :] = one_hot_encoder(y_pred1[j], np.array(location_top))
x_train2 = pd.concat([x_train1, y_pred1_code], axis=1)
gbm2 = xgb.XGBClassifier(max_depth=3, n_estimators=20, learning_rate=0.05, nthread=12,
                         subsample=1, max_delta_step=0).fit(x_train2, y_train2)

# train performance
# y_pred = gbm.predict(x_train)
# conf_train, roc_auc_train = calculate_confusion_matrix(y_pred, y_train)

# test (layer 1)
y_pred1 = gbm1.predict(x_test1)
def main():
    #file_loc = '/media/avemuri/DEV/Data/deeplearning/mnist/train.csv'
    file_loc = 'D:/dev/data/mnist/train.csv'
    X_train, Y_train, X_test, Y_test = get_data(file_name=file_loc, split_train_test=True)

    pca = PCA(n_components=400)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    #Y = Y_train
    T_train = one_hot_encoder(Y_train)

    X_test = pca.transform(X_test)
    T_test = one_hot_encoder(Y_test)

    #######################################################
    D = X_train.shape[1]  # number of features
    K = len(set(Y_train))  # number of classes
    M = 300
    reg = 0.00001
    batch_size = 500
    n_batches = X_train.shape[0] // batch_size
    learning_rate = 0.00004
    epochs = 10

    W1_init = np.random.randn(D, M) / np.sqrt(D)
    b1_init = np.zeros(M)
    W2_init = np.random.randn(M, K) / np.sqrt(M)
    b2_init = np.zeros(K)

    # Define all variables
    X = tf.placeholder(tf.float32, shape=(None, D), name='X')
    T = tf.placeholder(tf.float32, shape=(None, K), name='Y')
    W1 = tf.Variable(W1_init.astype(np.float32))
    b1 = tf.Variable(b1_init.astype(np.float32))
    W2 = tf.Variable(W2_init.astype(np.float32))
    b2 = tf.Variable(b2_init.astype(np.float32))

    # Model definition
    Z = tf.nn.relu(tf.matmul(X, W1) + b1)
    Y_hat = tf.matmul(Z, W2) + b2

    # Cost
    cost = tf.reduce_sum(
        tf.nn.softmax_cross_entropy_with_logits_v2(logits=Y_hat, labels=T))

    # Optimization
    train = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=0.99,
                                      momentum=0.9).minimize(cost)

    # Predictions
    predic_op = tf.argmax(Y_hat, axis=1)

    costs = []
    init = tf.global_variables_initializer()
    with tf.Session() as session:
        session.run(init)
        for epoch in range(epochs):
            X_shuffled, T_shuffled = shuffle(X_train, T_train)
            for batch in range(n_batches):
                # Get the batch
                X_batch = X_shuffled[batch * batch_size:(batch + 1) * batch_size, :]
                Y_batch = T_shuffled[batch * batch_size:(batch + 1) * batch_size, :]

                session.run(train, feed_dict={X: X_batch, T: Y_batch})

                if batch % 10 == 0:
                    c = session.run(cost, feed_dict={X: X_test, T: T_test})
                    Y_test_predictions = session.run(predic_op, feed_dict={X: X_test})
                    err = error_rate(Y_test, Y_test_predictions)
                    print("epoch [%d], batch [%d] : cost=[%.3f], error=[%.3f]" %
                          (epoch, batch, c, err))
                    costs.append(c)

    plt.plot(costs)
    plt.title('Validation cost')
    plt.show()