def make_train_loader(small=False):
    #TODO update with new amounts of tensors
    if small:
        # equal amount of patients from each class = |smallest class|
        train_indices = list(range(26750, 29876))                        # class 1 & 2
        train_indices.extend(random.sample(range(0, 26750), 1600))       # class 0
        train_indices.extend(random.sample(range(29876, 32750), 1600))   # class 3
        train_indices.extend(random.sample(range(32750, 36798), 1400))   # class 4
        train_data = Dataset(train_indices, train_path)
        return DataLoader(train_data, batch_size=args.batch, shuffle=True, collate_fn=collate_fn)
    else:
        # equal amount of patients from each class = |largest class|
        train_indices = list(range(26750, 29876)) * 3                    # class 1 & 2
        train_indices.extend(random.sample(range(0, 26750), 2505))       # class 0
        train_indices.extend(list(range(29876, 32750)))                  # class 3
        train_indices.extend(random.sample(range(29876, 32750), 762))    # class 3
        train_indices.extend(list(range(32750, 36798)))                  # class 4
        train_data = Dataset(train_indices, train_path)
        return DataLoader(train_data, batch_size=args.batch, shuffle=True, collate_fn=collate_fn)
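# The collate_fn and args objects referenced above are defined elsewhere and are not
# part of these snippets. The function below is only a hypothetical sketch of what a
# collate_fn for variable-length patient sequences could look like, assuming each
# Dataset item is a (sequence_tensor, label_tensor) pair; the real implementation may differ.
import torch

def pad_collate_fn(batch):
    # zero-pad sequences to the longest one in the batch and keep the true lengths
    seqs, labels = zip(*batch)
    lengths = torch.tensor([s.shape[0] for s in seqs])
    padded = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True)
    return padded, torch.stack(labels), lengths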
def validation(database_img, query_img, config):
    model = DVSQ(config)
    img_database = Dataset(database_img, config['output_dim'],
                           config['n_subspace'] * config['n_subcenter'])
    img_query = Dataset(query_img, config['output_dim'],
                        config['n_subspace'] * config['n_subcenter'])
    model.validation(img_query, img_database, config['R'])
    return
def load_x(ds, preset):
    feature_parts = [Dataset.load_part(ds, part) for part in preset.get('features', [])]
    prediction_parts = [load_prediction(ds, p, mode=preset.get('predictions_mode', 'fulltrain'))
                        for p in preset.get('predictions', [])]
    prediction_parts = [p.clip(lower=0.1).values.reshape((p.shape[0], 1)) for p in prediction_parts]

    if 'prediction_transform' in preset:
        prediction_parts = map(preset['prediction_transform'], prediction_parts)

    return hstack(feature_parts + prediction_parts)
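# A minimal sketch of the kind of preset dict that load_x() above expects. The keys
# ('features', 'predictions', 'predictions_mode', 'prediction_transform') come from the
# function itself; the concrete part and prediction names below are made up for
# illustration and are not taken from the real configuration.
import numpy as np

example_preset = {
    'features': ['numeric', 'categorical_counts'],   # dataset parts loaded via Dataset.load_part
    'predictions': ['xgb-base'],                     # first-level model outputs, clipped to >= 0.1
    'predictions_mode': 'fulltrain',
    'prediction_transform': np.log,                  # optional, applied to each prediction column
}
# x_train = load_x('train', example_preset)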
def experiment(p, trainData, validData, testData, eps, hparams, run, resultRep):
    trainS, trainX1, trainX2, trainY = copy.deepcopy(trainData)
    validS, validX1, validX2, validY = copy.deepcopy(validData)
    testS, testX1, testX2, testY = copy.deepcopy(testData)

    dataset = Dataset(trainS, trainX1, trainX2, trainY)
    dataset.add_validdata(validS, validX1, validX2, validY)
    dataset.add_testdata(testS, testX1, testX2, testY)

    # mark column j as available only when it takes exactly the values {0, 1}
    # across the train, valid and test splits
    avails = []
    for j in range(len(testS[0])):
        vals = set()
        vals = vals.union(set([s[j] for s in trainS]))
        #print "trainS=", trainS
        vals = vals.union(set([s[j] for s in validS]))
        #print "validS=", validS
        vals = vals.union(set([s[j] for s in testS]))
        #print "testS=", testS
        avails.append(vals == set([0, 1]))
        #print "j,vals=", j, vals

    result_unfair_train, result_unfair_valid, result_unfair_test = dataset.Unfair_Prediction(
        p.kernel or p.rff, hparams["lmd"], hparams["gamma"], avails)

    title = {}
    result_train, result_valid, result_test = dataset.EpsFair_Prediction(
        p.dataset, eps, hparams, avails, p)

    title["hparam"] = hparams
    title["preprocessing"] = p.preprocessing
    title["run"] = run
    if p.kernel:
        title["kernel"] = "kernel"
    elif p.rff and p.nonlinears:
        title["kernel"] = "rff-ns"
    elif p.rff:
        title["kernel"] = "rff"
    else:
        title["kernel"] = "no"
    title["eps"] = eps

    title["dataset"] = "train"
    resultRep.add_run(copy.deepcopy(title), result_train)
    title["dataset"] = "valid"
    resultRep.add_run(copy.deepcopy(title), result_valid)
    title["dataset"] = "test"
    resultRep.add_run(copy.deepcopy(title), result_test)

    title["eps"] = "unfair"
    title["dataset"] = "train"
    resultRep.add_run(copy.deepcopy(title), result_unfair_train)
    title["dataset"] = "valid"
    resultRep.add_run(copy.deepcopy(title), result_unfair_valid)
    title["dataset"] = "test"
    resultRep.add_run(copy.deepcopy(title), result_unfair_test)
def extract_feature_names(preset):
    x = []

    for part in preset.get('features', []):
        x += Dataset.get_part_features(part)

    lp = 1
    for pred in preset.get('predictions', []):
        if type(pred) is list:
            x.append('pred_%d' % lp)
            lp += 1
        else:
            x.append(pred)

    return x
X_train = data['X'][:split_train_test]
Y_train = data['Y'][:split_train_test]
X_test = data['X'][split_train_test:last_divisible_index]
Y_test = data['Y'][split_train_test:last_divisible_index]
#Y_train = np.expand_dims(Y_train, axis=-1)
#Y_test = np.expand_dims(Y_test, axis=-1)
print('Y_test:', Y_test[0:10])

n_samples = X_train.shape[0]
n_timesteps = X_train.shape[1]
n_input = X_train.shape[2]
n_output = 2  # rise or fall

n_iters = int(n_epochs * n_samples / batch_size)
print('number of iterations %d' % n_iters)

# Convert to Dataset instance
train_dataset = Dataset(X_train, Y_train, batch_size)


def dense(x, n_out):
    return tf.contrib.layers.fully_connected(x, n_out, activation_fn=tf.nn.relu)


def dense2d(x, n_out):
    x_sequence = tf.unstack(x, n_timesteps, 1)
    #print(tf.unstack(x_sequence[0], batch_size, 0)[0].get_shape())
    timesteps = [dense(x_sequence[t], n_out) for t in range(n_timesteps)]
    return timesteps


def model(is_train):
    x = tf.placeholder('float', [None, n_timesteps, n_input])
    y = tf.placeholder('float', [None, n_output])
    if is_train:
# Save column names
num_columns = [
    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'
]
cat_columns = [
    c for c in data.columns if (c not in [*num_columns, target, id])
]

cData_mode = data[cat_columns].copy()
cData_mode.Credit_History.fillna(1.0, inplace=True)
cData_mode.fillna("X", inplace=True)
cData_mode = cData_mode.apply(LabelEncoder().fit_transform)

Dataset.save_part_features('categorical_na_new', Dataset.get_part_features('categorical_mode'))
Dataset(categorical_na_new=cData_mode.values).save(name)

Dataset(id=data[id]).save(name)

if target in data.columns:
    le = LabelEncoder()
    le.fit(data[target])
    print(le.transform(data[target]))
    Dataset(target=le.transform(data[target])).save(name)
    Dataset(target_labels=le.classes_).save(name)

print("Done.")
import pandas as pd
import numpy as np

from util import Dataset

for name in ['train', 'test']:
    print "Processing %s..." % name

    data = pd.read_csv('../input/lin_%s.csv' % name)

    # Save column names
    if name == 'train':
        num_columns = [c for c in data.columns if c.startswith('cont')]
        Dataset.save_part_features('numeric_lin', num_columns)

    Dataset(numeric_lin=data[num_columns].values.astype(np.float32)).save(name)

print "Done."
p_loc[p_loc < 0.2] = 0.3
polar_labels(theta, groups, 1.3)
polar_labels(theta, probs1, p_loc)

plt.subplots_adjust(wspace=0.35)
plt.show()


if __name__ == '__main__':
    path = '/home/osvald/Projects/Diagnostics/github/srtr_data/multi_label/backup/n_train_tensors/'
    groups = ['survival/', 'cardiac/', 'graft/', 'cancer/', 'infection/']

    # load model
    indices = list(range(4804))
    data = Dataset(indices, path)
    loader = DataLoader(data, batch_size=1, shuffle=True, collate_fn=collate_fn)

    model_folder = '/home/osvald/Projects/Diagnostics/github/models/TCN/normalized/search/[64, 64]_fcl32_att0/lr0.0007787054002686635_b1_0.7982063652094732_b2_0.6107009808891577_gamma0.8319009616491654_drop0.2357377200377962_l2_0.006814594805294124/'
    model = DynamicTCN(input_size=267, output_size=10, num_channels=[64, 64],
                       fcl=32, attention=0, kernel_size=2, dropout=0)
    model.load_state_dict(torch.load(model_folder + '/best_auc_model'))
    model.eval()
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm
from util import Dataset

print("Loading data...")

min_freq = 10

train_cat = Dataset.load_part('train', 'cat_manual')
test_cat = Dataset.load_part('test', 'cat_manual')

train_cat_enc = []
test_cat_enc = []

cats = Dataset.get_part_features('categorical')
features = []

with tqdm(total=len(cats), desc=' Encoding', unit='cols') as pbar:
    for col, cat in enumerate(cats):
        value_counts = dict(zip(*np.unique(train_cat[:, col], return_counts=True)))

        train_rares = np.zeros(train_cat.shape[0], dtype=np.uint8)
        test_rares = np.zeros(test_cat.shape[0], dtype=np.uint8)

        for val in value_counts:
            if value_counts[val] >= min_freq:
                features.append('%s_%s' % (cat, val))
                train_cat_enc.append(
import numpy as np

from scipy.stats import skew, boxcox
from tqdm import tqdm
from util import Dataset

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        values = np.hstack((train_num[:, col], test_num[:, col]))
        print(values)

        sk = skew(values)
        if sk > 0.25:
            values_enc, lam = boxcox(values + 1)
            train_num_enc[:, col] = values_enc[:train_num.shape[0]]
            test_num_enc[:, col] = values_enc[train_num.shape[0]:]
        else:
            train_num_enc[:, col] = train_num[:, col]
            test_num_enc[:, col] = test_num[:, col]

        pbar.update(1)
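# A small self-contained check of the skew rule used above: columns with skew > 0.25
# are Box-Cox transformed (shifted by +1 so all values are strictly positive).
# The toy array below is invented purely for illustration.
import numpy as np
from scipy.stats import skew, boxcox

toy = np.array([0.0, 0.1, 0.1, 0.2, 5.0], dtype=np.float64)
if skew(toy) > 0.25:
    toy_enc, lam = boxcox(toy + 1)   # lam is the fitted Box-Cox lambda
    print(lam, toy_enc)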
data = pd.read_csv('input/%s.csv' % name)

target = "Loan_Status"
id = "Loan_ID"

# Save column names
if name == 'train':
    num_columns = [
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term'
    ]
    cat_columns = [
        c for c in data.columns if (c not in [*num_columns, target, id])
    ]
    print(cat_columns)
    print(num_columns)

    Dataset.save_part_features('categorical_mode', cat_columns)
    Dataset.save_part_features('numeric_mean', num_columns)
    Dataset.save_part_features('numeric_median', num_columns)

cData_mode = data[cat_columns].copy()
cat_mode = cData_mode.mode().iloc[0]   # .iloc replaces the deprecated .ix indexer
cData_mode.fillna(cat_mode, inplace=True)
cData_mode = cData_mode.apply(LabelEncoder().fit_transform)
#print(cat_data.isnull().sum())

numData_mean = data[num_columns].copy()
num_mean = numData_mean.mean()
numData_mean.fillna(num_mean, inplace=True)

numData_median = data[num_columns].copy()
def calculate_error_with_real_price(sess, predict_op, X, Y, real_last_one, real_last_two, statistcs):
    # Predict prices (uses the predict_op passed in; x, y and batch_size are module-level globals)
    dataset = Dataset(X, Y, batch_size)
    n_batch = int(X.shape[0] / batch_size)
    for i in range(n_batch):
        next_x, next_y = dataset.next_batch()
        if i == 0:
            predict_log_return = sess.run(predict_op, feed_dict={x: next_x, y: next_y}).tolist()
        else:
            predict_log_return = np.append(predict_log_return,
                                           sess.run(predict_op, feed_dict={x: next_x, y: next_y}).tolist(),
                                           axis=0)

    predict_last_one = np.exp(predict_log_return + np.log(real_last_two))

    # Export prices to statistcs_file
    statistcs['real_price'] = real_last_one.tolist()
    statistcs['predict_price'] = predict_last_one.tolist()

    # Show last 5 statistics
    print('last price is: ', real_last_two[-5:])
    print('predict price is: ', predict_last_one[-5:])
    print('real price is: ', real_last_one[-5:])

    # Calculate MSE
    MSE = np.mean((real_last_one - predict_last_one) ** 2)
    statistcs['MSE'] = MSE
    print('MSE = %.10f' % MSE)

    # Calculate variance
    mean_real = np.mean(real_last_one)
    var_real = np.var(real_last_one)
    print('Real mean=%.4f, var=%.5f' % (mean_real, var_real))
    mean_predict = np.mean(predict_last_one)
    var_predict = np.var(predict_last_one)
    print('Predict mean=%.4f, var=%.5f' % (mean_predict, var_predict))

    length = real_last_one.shape[0]
    real_last_one = np.reshape(real_last_one, length)
    real_last_two = np.reshape(real_last_two, length)
    predict_last_one = np.reshape(predict_last_one, length)

    # Calculate sign accuracy
    real_diff = real_last_one - real_last_two
    predict_diff = predict_last_one - real_last_two
    real_diff_sign = np.sign(real_diff)
    predict_diff_sign = np.sign(predict_diff)
    print('real_diff: ', real_diff[-20:])
    print('predict_diff: ', predict_diff[-20:])
    num_correct_sign = np.count_nonzero(np.equal(real_diff_sign, predict_diff_sign))
    sign_accuracy = num_correct_sign / real_diff_sign.shape[0]
    statistcs['sign_accuracy'] = sign_accuracy
    print('Sign Accuracy = %.10f' % sign_accuracy)
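# Worked example of the sign-accuracy metric computed at the end of
# calculate_error_with_real_price(): the direction of the predicted move
# (predicted price minus the previous real price) is compared against the real move.
# The numbers below are invented for illustration only.
import numpy as np

real_last_two = np.array([10.0, 11.0, 12.0])      # prices at t-1
real_last_one = np.array([11.0, 10.5, 12.5])      # real prices at t
predict_last_one = np.array([10.8, 10.0, 11.9])   # predicted prices at t

real_sign = np.sign(real_last_one - real_last_two)     # [ 1, -1,  1]
pred_sign = np.sign(predict_last_one - real_last_two)  # [ 1, -1, -1]
sign_accuracy = np.count_nonzero(real_sign == pred_sign) / real_sign.shape[0]
print(sign_accuracy)  # 2 of 3 directions correct -> 0.666...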
import pandas as pd
import numpy as np

from tqdm import tqdm
from util import Dataset

print "Loading data..."

train_cat = Dataset.load_part('train', 'categorical')
test_cat = Dataset.load_part('test', 'categorical')

train_cat_enc = np.zeros(train_cat.shape, dtype=np.uint8)
test_cat_enc = np.zeros(test_cat.shape, dtype=np.uint8)

with tqdm(total=train_cat.shape[1], desc=' Encoding', unit='cols') as pbar:
    for col in xrange(train_cat.shape[1]):
        values = np.hstack((train_cat[:, col], test_cat[:, col]))
        values = np.unique(values)
        # order labels by length first, then lexicographically (A..Z, then AA, AB, ...)
        values = sorted(values, key=lambda x: (len(x), x))

        encoding = dict(zip(values, range(len(values))))

        train_cat_enc[:, col] = pd.Series(train_cat[:, col]).map(encoding).values
        test_cat_enc[:, col] = pd.Series(test_cat[:, col]).map(encoding).values

        pbar.update(1)

print "Saving..."

Dataset.save_part_features('categorical_encoded',
import numpy as np

from scipy.stats.mstats import rankdata
from scipy.special import erfinv
from sklearn.preprocessing import scale, minmax_scale
from tqdm import tqdm
from util import Dataset

print "Loading data..."

lim = 0.999

train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for col in xrange(train_num.shape[1]):
        values = np.hstack((train_num[:, col], test_num[:, col]))

        # Apply rank transformation
        values = rankdata(values).astype(np.float64)

        # Scale into range (-1, 1)
        values = minmax_scale(values, feature_range=(-lim, lim))
import numpy as np
import scipy.sparse as sp
from scipy.stats import boxcox
import pandas as pd
from sklearn.preprocessing import scale
from tqdm import tqdm
from util import Dataset

print("Loading data...")

idx = Dataset.load_part("train", 'id')
train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'),
                         columns=Dataset.get_part_features('numeric_mean'),
                         index=idx)

idx = Dataset.load_part("test", 'id')
test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'),
                        columns=Dataset.get_part_features('numeric_mean'),
                        index=idx)

all_nData = pd.concat([train_num, test_num])   # DataFrame.append is removed in recent pandas
print(all_nData.head())

all_num_norm = pd.DataFrame()
all_num_norm["ApplicantIncome"] = np.log1p(all_nData.ApplicantIncome)
all_num_norm["CoapplicantIncome"] = np.log1p(all_nData.CoapplicantIncome)
all_num_norm["LoanAmount"] = np.log1p(all_nData.LoanAmount)
all_num_norm["Loan_Amount_Term"] = np.log1p(all_nData.Loan_Amount_Term)

train_custom = all_num_norm[:train_num.shape[0]]
test_custom = all_num_norm[train_num.shape[0]:]
def evaluate(self, x, y, batch_size):
    '''
    input:
        FloatTensor/ndarray x: data
        list[str] y: targets
        int batch_size: batch size; the number of examples in x and y should be divisible by this
    output:
        seq_acc: mean accuracy, per sequence
        dig_acc: mean accuracy, per digit
        loss: mean ctc loss
        dist: mean Levenshtein edit distance
    '''
    print("evaluating...", end="\r")
    with torch.no_grad():
        loss = 0
        dist = 0
        seq_acc = 0
        dig_acc = 0

        if type(x) is not torch.FloatTensor:
            x = torch.FloatTensor(x)

        eval_set = Dataset(x, y, normalize=True, augment=False)
        params = {'batch_size': batch_size, 'shuffle': False, 'num_workers': 6}
        eval_generator = data.DataLoader(eval_set, **params)

        for x_batch, y_batch in eval_generator:
            x_batch = x_batch.to(self.device)
            y, p, out_sizes = self.predict(x_batch)
            y_batch_no_pad, target_sizes = self.remove_padding(y_batch, p.shape[0])

            # ------ LOSS CALCULATION ------
            loss += self.loss_fcn(p, y_batch_no_pad, out_sizes, target_sizes).numpy()

            # ------ ACCURACY CALCULATION ------
            y_batch = self.decoder.convert_to_strings(y_batch)
            for i in range(len(y_batch)):
                chars_pred = list(y[i][0])
                chars_true = list(y_batch[i][0])
                len_diff = len(chars_pred) - len(chars_true)
                if len_diff < 0:
                    chars_pred += ["_"] * (-len_diff)
                elif len_diff > 0:
                    chars_true += ["_"] * len_diff

                dist += levenshtein(chars_pred, chars_true)
                seq_acc += accuracy_score(y[i], y_batch[i])
                dig_acc += accuracy_score(chars_pred, chars_true)

            torch.cuda.empty_cache()

    # average over sequences
    loss /= x.shape[0]
    dist /= x.shape[0]
    seq_acc /= x.shape[0]
    dig_acc /= x.shape[0]

    return seq_acc, dig_acc, loss, dist
def fit(self, training_data, validation_data, optimizer, epochs=100, batch_size=10, logger=None):
    '''
    Training loop
    input:
        tuple training_data: (features, labels), as numpy.ndarray
        tuple validation_data: (features, labels), as numpy.ndarray
        optimizer: a PyTorch optimizer object
        logger: a HistoryLogger object, for visualizing training
    '''
    # Learning-rate is reduced on plateau, parameters might need tweaking depending on data
    scheduler = lr_reducer(optimizer, factor=0.5, patience=5, min_lr=1e-6)
    logger.on_train_begin()
    self.to(self.device)

    # Data to tensors
    x_tr = torch.tensor(training_data[0]).type(torch.FloatTensor)
    y_tr = torch.IntTensor(training_data[1])
    x_te = torch.tensor(validation_data[0]).type(torch.FloatTensor)
    y_te = torch.IntTensor(validation_data[1])

    print("\nTraining with {} examples, validating with {} examples"
          .format(x_tr.shape[0], x_te.shape[0]))

    best_acc = 0  # for determining best checkpoint

    training_set = Dataset(x_tr, y_tr, normalize=True, augment=True)
    params = {'batch_size': batch_size, 'shuffle': True, 'num_workers': 6}
    train_generator = data.DataLoader(training_set, **params)

    for e in range(self.last_epoch, epochs):
        print("\nStarting epoch {}/{} ...".format(e + 1, epochs))
        loss = 0

        # Progress bar
        with tqdm(train_generator,
                  total=(x_tr.shape[0] // batch_size) + 1,
                  unit_scale=batch_size,
                  postfix="loss: {}".format(loss)) as t:
            for x_batch, y_batch in t:
                # one batch at a time to GPU, uses less GPU memory
                x_batch = x_batch.to(self.device)

                optimizer.zero_grad()
                y = self(x_batch)  # predict on batch

                # Output strings are all the same length, but warp-ctc needs
                # these lengths as a tensor to work.
                out_sizes = torch.IntTensor([y.shape[0]] * batch_size)
                out_sizes = torch.autograd.Variable(out_sizes, requires_grad=False)
                y_batch, target_sizes = self.remove_padding(y_batch, y.shape[0])

                # predictions are needed without softmax in the loss function
                loss = self.loss_fcn(y, y_batch, out_sizes, target_sizes)
                loss.backward()
                optimizer.step()

                t.postfix = "loss: {:8.4f}".format(loss.cpu().detach().numpy()[0])
                torch.cuda.empty_cache()

        if logger is not None:
            seq_acc, _, loss, _ = logger.on_epoch_end(e)
        else:
            seq_acc, _, loss, _ = self.evaluate(x_te, y_te, batch_size)
        scheduler.step(loss)

        # Save checkpoint
        most_accurate = False
        if seq_acc > best_acc:
            best_acc = seq_acc
            most_accurate = True
        checkpointer({'epoch': e + 1,
                      'model_states': self.state_dict(),
                      'optimizer': optimizer.state_dict(),
                      'accuracy': best_acc,
                      'model_params': self.modelparams},
                     most_accurate, self.out_path)

        print("validation accuracy: {:4.2f}, validation loss: {:6.4f}"
              .format(float(seq_acc), float(loss)))
# # all_s["app_s"] = all_s["ApplicantIncome"] * all_s["Self_Employed"]
# all_s["ci_s"] = all_s["CoapplicantIncome"] * all_s["Self_Employed"]
# all_s["la_s"] = all_s["LoanAmount"] * all_s["Self_Employed"]
# all_s["lat_s"] = all_s["Loan_Amount_Term"] * all_s["Self_Employed"]

features_to_drop = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']
all_filtered = all_s.drop(features_to_drop, axis=1)

print(train.columns)
# ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
#  'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
#  'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']

train_cust = all_filtered[:ntrain]
test_cust = all_filtered[ntrain:]

print(all_s.columns)

# # train_cat_enc = sp.hstack(train_cat_enc, format='csr')
# test_cat_enc = sp.hstack(test_cat_enc, format='csr')

Dataset.save_part_features('fSelect', list(all_filtered.columns))

Dataset(fSelect=train_cust.values).save('train')
Dataset(fSelect=test_cust.values).save('test')

# # print("Done.")
import numpy as np

from util import Dataset
from sklearn.preprocessing import scale

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

print("Scaling...")

all_scaled = scale(np.vstack((train_num, test_num)))

print("Saving...")

# feature names belong to the 'numeric_mean' part loaded above
Dataset.save_part_features('numeric_mean_scaled', Dataset.get_part_features('numeric_mean'))
Dataset(numeric_mean_scaled=all_scaled[:train_num.shape[0]]).save('train')
Dataset(numeric_mean_scaled=all_scaled[train_num.shape[0]:]).save('test')

print("Done.")
                  help='Resume from the given checkpoint.')
opt = args.parse_args()

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

if opt.checkpoint is not None:
    print('Load model from checkpoints')
    model = BertForNextSentencePrediction.from_pretrained(join('checkpoints', opt.checkpoint))
else:
    model = BertForNextSentencePrediction.from_pretrained('bert-base-chinese')

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

dataset = Dataset(tokenizer)
dataloader = data.DataLoader(dataset=dataset,
                             batch_size=opt.batch_size,
                             shuffle=True,
                             collate_fn=custom_collate(tokenizer))

task_name = '{}-{}'.format(opt.name, time.strftime("%Y-%m-%d-%H-%M-%S"))
writer = SummaryWriter('runs/{}'.format(task_name))
if not os.path.exists(join('checkpoints', task_name)):
    os.mkdir(join('checkpoints', task_name))

model.to('cuda')
model.train()

acc_loss = []  # accumulated loss, refreshed when it is plotted
import numpy as np

from util import Dataset, vstack, hstack
from sklearn.preprocessing import scale
from sklearn.decomposition import TruncatedSVD

n_components = 500  # 500 components explain 99.8% of variance

print "Loading data..."

train_num = Dataset.load_part('train', 'numeric')
train_cat = Dataset.load_part('train', 'categorical_dummy')

test_num = Dataset.load_part('test', 'numeric')
test_cat = Dataset.load_part('test', 'categorical_dummy')

train_cnt = train_num.shape[0]

print "Combining data..."

all_data = hstack((scale(vstack((train_num, test_num)).astype(np.float64)).astype(np.float32),
                   vstack((train_cat, test_cat))))

del train_num, train_cat, test_num, test_cat

print "Fitting svd..."

svd = TruncatedSVD(n_components)
res = svd.fit_transform(all_data)
import numpy as np
import pandas as pd

from scipy.stats import skew, boxcox
from sklearn.preprocessing import scale
from tqdm import tqdm
from util import Dataset
import itertools

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

ntrain = train_num.shape[0]
train_test = np.vstack([train_num, test_num])

# feature names of the 'numeric_mean' part loaded above
num_features = Dataset.get_part_features('numeric_mean')

num_comb_df = pd.DataFrame()

with tqdm(total=train_num.shape[1], desc=' Transforming', unit='cols') as pbar:
    for comb in itertools.combinations(num_features, 2):
        feat = comb[0] + "_" + comb[1]
        num_comb_df[feat] = (train_test[:, num_features.index(comb[0]) - 1]
                             + train_test[:, num_features.index(comb[1]) - 1])
        print('Combining Columns:', feat)
# Replace category labels with their counts
import numpy as np
import pandas as pd

from tqdm import tqdm
from util import Dataset

print("Loading data...")

train_cat = Dataset.load_part('train', 'categorical_mode')
test_cat = Dataset.load_part('test', 'categorical_mode')

train_cat_counts = np.zeros(train_cat.shape, dtype=np.float32)
test_cat_counts = np.zeros(test_cat.shape, dtype=np.float32)

with tqdm(total=train_cat.shape[1], desc=' Counting', unit='cols') as pbar:
    for col in range(train_cat.shape[1]):
        train_series = pd.Series(train_cat[:, col])
        test_series = pd.Series(test_cat[:, col])

        counts = pd.concat((train_series, test_series)).value_counts()

        train_cat_counts[:, col] = train_series.map(counts).values
        test_cat_counts[:, col] = test_series.map(counts).values

        pbar.update(1)

print("Saving...")
print(train_cat_counts)

# feature names belong to the 'categorical_mode' part loaded above
Dataset.save_part_features('categorical_counts', Dataset.get_part_features('categorical_mode'))
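# Tiny illustration of the count encoding performed above: each category value is
# replaced by how often it occurs across train + test combined. The column below is
# fabricated for the example (Property_Area-style values from the loan data).
import pandas as pd

col = pd.Series(['Urban', 'Rural', 'Urban', 'Semiurban', 'Urban'])
counts = col.value_counts()        # Urban: 3, Rural: 1, Semiurban: 1
print(col.map(counts).values)      # [3 1 3 1 3]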
from keras.optimizers import SGD, Adam, Adadelta
from keras.callbacks import ModelCheckpoint
from keras import regularizers
#from keras_util import ExponentialMovingAverage, batch_generator

from statsmodels.regression.quantile_regression import QuantReg
from pylightgbm.models import GBMRegressor

from scipy.stats import boxcox
from bayes_opt import BayesianOptimization

from util import Dataset, load_prediction, hstack

categoricals = Dataset.get_part_features('categorical')


class DenseTransformer(BaseEstimator):
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self


class BaseAlgo(object):
import numpy as np
import scipy.sparse as sp
import pandas as pd

from sklearn.preprocessing import scale
from tqdm import tqdm
from util import Dataset

print("Loading data...")

idx = Dataset.load_part("train", 'id')
train_cat = pd.DataFrame(Dataset.load_part("train", 'categorical_mode'),
                         columns=Dataset.get_part_features('categorical_mode'),
                         index=idx)
train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'),
                         columns=Dataset.get_part_features('numeric_mean'),
                         index=idx)
train = pd.concat([train_cat, train_num], axis=1)

idx = Dataset.load_part("test", 'id')
test_cat = pd.DataFrame(Dataset.load_part("test", 'categorical_mode'),
                        columns=Dataset.get_part_features('categorical_mode'),
                        index=idx)
test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'),
                        columns=Dataset.get_part_features('numeric_mean'),
                        index=idx)
test = pd.concat([test_cat, test_num], axis=1)
def y_decode(y):
    og = Dataset.load_part("train", "target_labels")
    le = LabelEncoder()
    le.classes_ = og
    z = [int(i) for i in y]
    return le.inverse_transform(z)
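# Hedged usage sketch for y_decode() above: the 'target_labels' part holds the
# LabelEncoder classes saved at preprocessing time (see the script that saves
# Dataset(target_labels=le.classes_)), so integer predictions can be mapped back
# to the original class names. The prediction values here are invented.
# preds = [0, 1, 1, 0]
# print(y_decode(preds))   # e.g. ['N', 'Y', 'Y', 'N'] for Loan_Status-style labels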
    'discriminator_loss': [],
    'encoded_feature_vector': [],
    'original_images': [],
    'encoded_images': [],
    'reconstruct_images': [],
    'reconstruct_from_random': [],
}
statistics_file = 'statistics/' + eid

print('id: ', eid)
print('number of epochs = {:d}'.format(n_epochs))
print('batch_size = {:d}'.format(batch_size))

# Load data
X_train = load_data('../data/data.npy')        # (2000, 784)
label_train = load_data('../data/label.npy')   # (2000,)
train_dataset = Dataset(X_train, label_train, batch_size)
n_train_samples = X_train.shape[0]
n_iters = int(n_epochs * n_train_samples / batch_size)
print('number of iterations = {:d}'.format(n_iters))


def weight_variable(shape):
    initial = tf.random_normal(shape, stddev=0.01)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.fill(shape, 0.1)
    return tf.Variable(initial)
import pandas as pd

from util import Dataset

for name in ['train', 'test']:
    print "Processing %s..." % name

    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))

    df = pd.DataFrame(index=num.index)
    df['diff_1_6'] = num['cont1'] - num['cont6']
    df['diff_1_9'] = num['cont1'] - num['cont9']
    df['diff_1_10'] = num['cont1'] - num['cont10']
    df['diff_6_9'] = num['cont6'] - num['cont9']
    df['diff_6_10'] = num['cont6'] - num['cont10']
    df['diff_6_11'] = num['cont6'] - num['cont11']
    df['diff_6_12'] = num['cont6'] - num['cont12']
    df['diff_6_13'] = num['cont6'] - num['cont13']
    df['diff_7_11'] = num['cont7'] - num['cont11']
    df['diff_7_12'] = num['cont7'] - num['cont12']
    df['diff_11_12'] = num['cont11'] - num['cont12']

    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)

print "Done."
import numpy as np
import pandas as pd

from util import Dataset
from sklearn.preprocessing import minmax_scale

print "Loading data..."

train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

print "Scaling..."

numeric = pd.DataFrame(np.vstack((train_num, test_num)),
                       columns=Dataset.get_part_features('numeric'))

df = pd.DataFrame(index=numeric.index)

df["cont1"] = np.sqrt(minmax_scale(numeric["cont1"]))
df["cont4"] = np.sqrt(minmax_scale(numeric["cont4"]))
df["cont5"] = np.sqrt(minmax_scale(numeric["cont5"]))
df["cont8"] = np.sqrt(minmax_scale(numeric["cont8"]))
df["cont10"] = np.sqrt(minmax_scale(numeric["cont10"]))
df["cont11"] = np.sqrt(minmax_scale(numeric["cont11"]))
df["cont12"] = np.sqrt(minmax_scale(numeric["cont12"]))

df["cont6"] = np.log(minmax_scale(numeric["cont6"]) + 0.1)
df["cont7"] = np.log(minmax_scale(numeric["cont7"]) + 0.1)
df["cont9"] = np.log(minmax_scale(numeric["cont9"]) + 0.1)
df["cont13"] = np.log(minmax_scale(numeric["cont13"]) + 0.1)
df["cont14"] = (np.maximum(numeric["cont14"] - 0.179722, 0) / 0.665122) ** 0.25

print "Saving..."