def make_train_loader(small=False):
    # TODO: update with the new tensor counts
    if small:
        # equal amount of patients from each class = |smallest class|
        train_indices = list(range(26750, 29876))  # class 1 & 2
        train_indices.extend(random.sample(range(0, 26750), 1600))  # class 0
        train_indices.extend(random.sample(range(29876, 32750),
                                           1600))  # class 3
        train_indices.extend(random.sample(range(32750, 36798),
                                           1400))  # class 4

        train_data = Dataset(train_indices, train_path)
        return DataLoader(train_data,
                          batch_size=args.batch,
                          shuffle=True,
                          collate_fn=collate_fn)
    else:
        # equal amount of patients from each class = |largest class|
        train_indices = list(range(26750, 29876)) * 3  # class 1 & 2
        train_indices.extend(random.sample(range(0, 26750), 2505))  # class 0
        train_indices.extend(list(range(29876, 32750)))  # class 3
        train_indices.extend(random.sample(range(29876, 32750),
                                           762))  # class 3
        train_indices.extend(list(range(32750, 36798)))  # class 4

        train_data = Dataset(train_indices, train_path)
        return DataLoader(train_data,
                          batch_size=args.batch,
                          shuffle=True,
                          collate_fn=collate_fn)
def validation(database_img, query_img, config):
    model = DVSQ(config)
    img_database = Dataset(database_img, config['output_dim'],
                           config['n_subspace'] * config['n_subcenter'])
    img_query = Dataset(query_img, config['output_dim'],
                        config['n_subspace'] * config['n_subcenter'])
    model.validation(img_query, img_database, config['R'])
    return
def load_x(ds, preset):
    feature_parts = [Dataset.load_part(ds, part) for part in preset.get('features', [])]
    prediction_parts = [load_prediction(ds, p, mode=preset.get('predictions_mode', 'fulltrain')) for p in preset.get('predictions', [])]
    prediction_parts = [p.clip(lower=0.1).values.reshape((p.shape[0], 1)) for p in prediction_parts]

    if 'prediction_transform' in preset:
        prediction_parts = [preset['prediction_transform'](p) for p in prediction_parts]  # build a list (map() is lazy under Python 3)

    return hstack(feature_parts + prediction_parts)
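# Hypothetical, self-contained illustration of the clip + reshape step above:
# each loaded prediction (a pandas Series with made-up values here) is floored
# at 0.1 and turned into an (n, 1) column ready for hstack.
import pandas as pd

p = pd.Series([0.02, 0.5, 1.7])
col = p.clip(lower=0.1).values.reshape((p.shape[0], 1))
print(col)  # a 3x1 column: 0.1, 0.5, 1.7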
def experiment(p, trainData, validData, testData, eps, hparams, run,
               resultRep):
    trainS, trainX1, trainX2, trainY = copy.deepcopy(trainData)
    validS, validX1, validX2, validY = copy.deepcopy(validData)
    testS, testX1, testX2, testY = copy.deepcopy(testData)
    dataset = Dataset(trainS, trainX1, trainX2, trainY)
    dataset.add_validdata(validS, validX1, validX2, validY)
    dataset.add_testdata(testS, testX1, testX2, testY)
    # Mark which sensitive-attribute columns take exactly the values {0, 1}
    # across the train, validation and test splits.
    avails = []
    for j in range(len(testS[0])):
        vals = {s[j] for s in trainS}
        vals |= {s[j] for s in validS}
        vals |= {s[j] for s in testS}
        avails.append(vals == {0, 1})

    result_unfair_train, result_unfair_valid, result_unfair_test = dataset.Unfair_Prediction(
        p.kernel or p.rff, hparams["lmd"], hparams["gamma"], avails)
    title = {}
    result_train, result_valid, result_test = dataset.EpsFair_Prediction(
        p.dataset, eps, hparams, avails, p)
    title["hparam"] = hparams
    title["preprocessing"] = p.preprocessing
    title["run"] = run
    if p.kernel:
        title["kernel"] = "kernel"
    elif p.rff and p.nonlinears:
        title["kernel"] = "rff-ns"
    elif p.rff:
        title["kernel"] = "rff"
    else:
        title["kernel"] = "no"
    title["eps"] = eps
    title["dataset"] = "train"
    resultRep.add_run(copy.deepcopy(title), result_train)
    title["dataset"] = "valid"
    resultRep.add_run(copy.deepcopy(title), result_valid)
    title["dataset"] = "test"
    resultRep.add_run(copy.deepcopy(title), result_test)
    title["eps"] = "unfair"
    title["dataset"] = "train"
    resultRep.add_run(copy.deepcopy(title), result_unfair_train)
    title["dataset"] = "valid"
    resultRep.add_run(copy.deepcopy(title), result_unfair_valid)
    title["dataset"] = "test"
    resultRep.add_run(copy.deepcopy(title), result_unfair_test)
def extract_feature_names(preset):
    x = []

    for part in preset.get('features', []):
        x += Dataset.get_part_features(part)

    lp = 1
    for pred in preset.get('predictions', []):
        if type(pred) is list:
            x.append('pred_%d' % lp)
            lp += 1
        else:
            x.append(pred)

    return x
Example #6
X_train = data['X'][:split_train_test]
Y_train = data['Y'][:split_train_test]
X_test  = data['X'][split_train_test:last_divisible_index]
Y_test  = data['Y'][split_train_test:last_divisible_index]

#Y_train = np.expand_dims(Y_train, axis=-1)
#Y_test = np.expand_dims(Y_test, axis=-1)
print('Y_train:', Y_train[0:10])
n_samples = X_train.shape[0]
n_timesteps = X_train.shape[1]
n_input = X_train.shape[2]
n_output = 2  # rise or fall
n_iters = int(n_epochs * n_samples / batch_size)
print('number of iterations %d' % n_iters)
# Convert to Dataset instance 
train_dataset = Dataset(X_train, Y_train, batch_size)


def dense(x, n_out):
    return tf.contrib.layers.fully_connected(x, n_out, activation_fn=tf.nn.relu)

def dense2d(x, n_out):
    x_sequence = tf.unstack(x, n_timesteps, 1)
    #print(tf.unstack(x_sequence[0], batch_size, 0)[0].get_shape())
    timesteps = [dense(x_sequence[t], n_out) for t in range(n_timesteps)]
    return timesteps

def model(is_train):
    x = tf.placeholder('float', [None, n_timesteps, n_input])
    y = tf.placeholder('float', [None, n_output])
    if is_train:
Example #7
    # Save column names

    num_columns = [
        'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
        'Loan_Amount_Term'
    ]
    cat_columns = [
        c for c in data.columns if (c not in [*num_columns, target, id])
    ]

    cData_mode = data[cat_columns].copy()
    cData_mode.Credit_History.fillna(1.0, inplace=True)
    cData_mode.fillna("X", inplace=True)

    cData_mode = cData_mode.apply(LabelEncoder().fit_transform)

    Dataset.save_part_features('categorical_na_new',
                               Dataset.get_part_features('categorical_mode'))
    Dataset(categorical_na_new=cData_mode.values).save(name)

    Dataset(id=data[id]).save(name)

    if target in data.columns:
        le = LabelEncoder()
        le.fit(data[target])
        print(le.transform(data[target]))
        Dataset(target=le.transform(data[target])).save(name)
        Dataset(target_labels=le.classes_).save(name)

print("Done.")
import pandas as pd
import numpy as np

from util import Dataset

for name in ['train', 'test']:
    print "Processing %s..." % name
    data = pd.read_csv('../input/lin_%s.csv' % name)

    # Save column names
    if name == 'train':
        num_columns = [c for c in data.columns if c.startswith('cont')]

        Dataset.save_part_features('numeric_lin', num_columns)

    Dataset(numeric_lin=data[num_columns].values.astype(np.float32)).save(name)

print "Done."
Example #9
    p_loc[p_loc < 0.2] = 0.3
    polar_labels(theta, groups, 1.3)
    polar_labels(theta, probs1, p_loc)
    plt.subplots_adjust(wspace=0.35)

    plt.show()


if __name__ == '__main__':

    path = '/home/osvald/Projects/Diagnostics/github/srtr_data/multi_label/backup/n_train_tensors/'
    groups = ['survival/', 'cardiac/', 'graft/', 'cancer/', 'infection/']

    # load data
    indices = list(range(4804))
    data = Dataset(indices, path)
    loader = DataLoader(data,
                        batch_size=1,
                        shuffle=True,
                        collate_fn=collate_fn)

    # load model
    model_folder = '/home/osvald/Projects/Diagnostics/github/models/TCN/normalized/search/[64, 64]_fcl32_att0/lr0.0007787054002686635_b1_0.7982063652094732_b2_0.6107009808891577_gamma0.8319009616491654_drop0.2357377200377962_l2_0.006814594805294124/'
    model = DynamicTCN(input_size=267,
                       output_size=10,
                       num_channels=[64, 64],
                       fcl=32,
                       attention=0,
                       kernel_size=2,
                       dropout=0)
    model.load_state_dict(torch.load(model_folder + '/best_auc_model'))
    model.eval()
import numpy as np
import scipy.sparse as sp

from tqdm import tqdm
from util import Dataset

print("Loading data...")

min_freq = 10

train_cat = Dataset.load_part('train', 'cat_manual')
test_cat = Dataset.load_part('test', 'cat_manual')

train_cat_enc = []
test_cat_enc = []

cats = Dataset.get_part_features('categorical')
features = []

with tqdm(total=len(cats), desc='  Encoding', unit='cols') as pbar:
    for col, cat in enumerate(cats):
        value_counts = dict(
            zip(*np.unique(train_cat[:, col], return_counts=True)))

        train_rares = np.zeros(train_cat.shape[0], dtype=np.uint8)
        test_rares = np.zeros(test_cat.shape[0], dtype=np.uint8)

        for val in value_counts:
            if value_counts[val] >= min_freq:
                features.append('%s_%s' % (cat, val))
                train_cat_enc.append(
Example #11
import numpy as np

from scipy.stats import skew, boxcox

from tqdm import tqdm
from util import Dataset

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc='  Transforming', unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        values = np.hstack((train_num[:, col], test_num[:, col]))
        print(values)
        sk = skew(values)

        if sk > 0.25:
            values_enc, lam = boxcox(values+1)

            train_num_enc[:, col] = values_enc[:train_num.shape[0]]
            test_num_enc[:, col] = values_enc[train_num.shape[0]:]
        else:
            train_num_enc[:, col] = train_num[:, col]
            test_num_enc[:, col] = test_num[:, col]

        pbar.update(1)
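# Standalone sketch of the per-column rule used in the loop above (skew test,
# then Box-Cox on shifted values), on made-up data:
import numpy as np
from scipy.stats import skew, boxcox

values = np.array([0.1, 0.2, 0.3, 5.0, 9.0])
if skew(values) > 0.25:                    # right-skewed column
    values_enc, lam = boxcox(values + 1)   # +1 keeps every input strictly positive
    print(lam, values_enc)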
Example #12
    data = pd.read_csv('input/%s.csv' % name)
    target = "Loan_Status"
    id = "Loan_ID"
    # Save column names
    if name == 'train':
        num_columns = [
            'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
            'Loan_Amount_Term'
        ]
        cat_columns = [
            c for c in data.columns if (c not in [*num_columns, target, id])
        ]

        print(cat_columns)
        print(num_columns)
        Dataset.save_part_features('categorical_mode', cat_columns)
        Dataset.save_part_features('numeric_mean', num_columns)
        Dataset.save_part_features('numeric_median', num_columns)

    cData_mode = data[cat_columns].copy()
    cat_mode = cData_mode.mode().iloc[0]
    cData_mode.fillna(cat_mode, inplace=True)

    cData_mode = cData_mode.apply(LabelEncoder().fit_transform)

    #print(cat_data.isnull().sum())
    numData_mean = data[num_columns].copy()
    num_mean = numData_mean.mean()
    numData_mean.fillna(num_mean, inplace=True)

    numData_median = data[num_columns].copy()
Example #13
def calculate_error_with_real_price(sess, predict_op, X, Y, real_last_one,
                                    real_last_two, statistcs):
    # Predict prices
    dataset = Dataset(X, Y, batch_size)
    n_batch = int(X.shape[0] / batch_size)
    for i in range(n_batch):
        next_x, next_y = dataset.next_batch()
        # run the prediction op passed in as predict_op on this batch
        batch_pred = sess.run(predict_op, feed_dict={x: next_x, y: next_y}).tolist()
        if i == 0:
            predict_log_return = batch_pred
        else:
            predict_log_return = np.append(predict_log_return, batch_pred, axis=0)

    predict_last_one = np.exp(predict_log_return + np.log(real_last_two))
    # Export prices to statistcs_file
    statistcs['real_price'] = real_last_one.tolist()
    statistcs['predict_price'] = predict_last_one.tolist()

    # Show last 5 statistics
    print('last price is: ', real_last_two[-5:])
    print('predict price is: ', predict_last_one[-5:])
    print('real price is: ', real_last_one[-5:])

    # Calculate MSE
    MSE = np.mean((real_last_one - predict_last_one)**2)
    statistcs['MSE'] = MSE
    print('MSE = %.10f' % MSE)

    # Calculate variance
    mean_real = np.mean(real_last_one)
    var_real = np.var(real_last_one)
    print('Real mean=%.4f, var=%.5f' % (mean_real, var_real))
    mean_predict = np.mean(predict_last_one)
    var_predict = np.var(predict_last_one)
    print('Predict mean=%.4f, var=%.5f' % (mean_predict, var_predict))

    length = real_last_one.shape[0]
    real_last_one = np.reshape(real_last_one, length)
    real_last_two = np.reshape(real_last_two, length)
    predict_last_one = np.reshape(predict_last_one, length)

    # Calculate sign accuracy
    real_diff = real_last_one - real_last_two
    predict_diff = predict_last_one - real_last_two
    real_diff_sign = np.sign(real_diff)
    predict_diff_sign = np.sign(predict_diff)
    print('real_diff: ', real_diff[-20:])
    print('predict_diff: ', predict_diff[-20:])
    num_correct_sign = np.count_nonzero(
        np.equal(real_diff_sign, predict_diff_sign))
    sign_accuracy = num_correct_sign / real_diff_sign.shape[0]
    statistcs['sign_accuracy'] = sign_accuracy
    print('Sign Accuracy = %.10f' % sign_accuracy)
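# A minimal, self-contained sketch of the sign-accuracy metric computed above,
# on made-up prices: a prediction counts as correct when the predicted move and
# the real move from the previous price point in the same direction.
import numpy as np

real_last_two = np.array([10.0, 11.0, 12.0])      # previous prices
real_last_one = np.array([10.5, 10.8, 12.3])      # actual next prices
predict_last_one = np.array([10.2, 11.4, 12.1])   # predicted next prices

real_sign = np.sign(real_last_one - real_last_two)        # [ 1. -1.  1.]
predict_sign = np.sign(predict_last_one - real_last_two)  # [ 1.  1.  1.]
accuracy = np.count_nonzero(real_sign == predict_sign) / real_sign.shape[0]
print(accuracy)  # 2 of 3 directions match -> 0.666...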
Example #14
import pandas as pd
import numpy as np

from tqdm import tqdm
from util import Dataset

print "Loading data..."

train_cat = Dataset.load_part('train', 'categorical')
test_cat = Dataset.load_part('test', 'categorical')

train_cat_enc = np.zeros(train_cat.shape, dtype=np.uint8)
test_cat_enc = np.zeros(test_cat.shape, dtype=np.uint8)

with tqdm(total=train_cat.shape[1], desc='  Encoding', unit='cols') as pbar:
    for col in range(train_cat.shape[1]):
        values = np.hstack((train_cat[:, col], test_cat[:, col]))
        values = np.unique(values)
        values = sorted(values, key=lambda x: (len(x), x))

        encoding = dict(zip(values, range(len(values))))

        train_cat_enc[:, col] = pd.Series(train_cat[:, col]).map(encoding).values
        test_cat_enc[:, col] = pd.Series(test_cat[:, col]).map(encoding).values

        pbar.update(1)

print "Saving..."

Dataset.save_part_features('categorical_encoded',
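# A toy, self-contained illustration of the encoding above: unique values are
# sorted by (length, value) so that e.g. 'B' sorts before 'AA', then mapped to
# consecutive integers.
import numpy as np
import pandas as pd

values = np.array(['B', 'A', 'AA', 'C', 'AB', 'A'])
uniq = sorted(np.unique(values), key=lambda v: (len(v), v))  # ['A', 'B', 'C', 'AA', 'AB']
encoding = dict(zip(uniq, range(len(uniq))))
print(pd.Series(values).map(encoding).values)  # [1 0 3 2 4 0]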
Example #15
import numpy as np

from scipy.stats.mstats import rankdata
from scipy.special import erfinv

from sklearn.preprocessing import scale, minmax_scale

from tqdm import tqdm
from util import Dataset

print "Loading data..."

lim = 0.999

train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

train_num_enc = np.zeros(train_num.shape, dtype=np.float32)
test_num_enc = np.zeros(test_num.shape, dtype=np.float32)

with tqdm(total=train_num.shape[1], desc='  Transforming',
          unit='cols') as pbar:
    for col in range(train_num.shape[1]):
        values = np.hstack((train_num[:, col], test_num[:, col]))

        # Apply rank transformation
        values = rankdata(values).astype(np.float64)

        # Scale into range (-1, 1)
        values = minmax_scale(values, feature_range=(-lim, lim))
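# The snippet is cut off here; given the erfinv import above, the scaled ranks
# are presumably passed through the inverse error function to get an
# approximately Gaussian feature (the "rank-Gauss" trick). A minimal sketch of
# that idea, under that assumption:
import numpy as np
from scipy.special import erfinv
from scipy.stats.mstats import rankdata
from sklearn.preprocessing import minmax_scale

lim = 0.999
values = np.array([3.0, 1.0, 4.0, 1.5, 9.0, 2.6])

ranks = rankdata(values).astype(np.float64)              # 1..n ranks
scaled = minmax_scale(ranks, feature_range=(-lim, lim))  # squash into (-1, 1)
gauss = erfinv(scaled)                                   # roughly normal values
print(gauss)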
Example #16
import numpy as np
import scipy.sparse as sp
from scipy.stats import boxcox
import pandas as pd
from sklearn.preprocessing import scale

from tqdm import tqdm
from util import Dataset

print("Loading data...")

idx = Dataset.load_part("train", 'id')

train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'), columns=Dataset.get_part_features('numeric_mean'), index=idx)

idx = Dataset.load_part("test", 'id')

test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'), columns=Dataset.get_part_features('numeric_mean'), index=idx)


all_nData = pd.concat([train_num, test_num])
print(all_nData.head())

all_num_norm = pd.DataFrame()
all_num_norm["ApplicantIncome"] = np.log1p(all_nData.ApplicantIncome)
all_num_norm["CoapplicantIncome"] = np.log1p(all_nData.CoapplicantIncome)
all_num_norm["LoanAmount"] = (np.log1p(all_nData.LoanAmount))
all_num_norm["Loan_Amount_Term"] = np.log1p(all_nData.Loan_Amount_Term)

train_custom = all_num_norm[:train_num.shape[0]]
test_custom = all_num_norm[train_num.shape[0]:]
Example #17
    def evaluate(self, x, y, batch_size):
        '''
        input:
          FloatTensor/ndarray x: data
          list[str] y: targets
          int batch_size: batch size; the number of examples in x and y should be divisible by this

        output:
          seq_acc: mean accuracy, per sequence
          dig_acc: mean accuracy, per digit
          loss:    mean ctc loss
          dist:    mean Levenshtein edit distance
        '''

        print ("evaluating...", end="\r")

        with torch.no_grad():
            
            loss = 0
            dist = 0
            seq_acc = 0
            dig_acc = 0

            if type(x) is not torch.FloatTensor:
                x = torch.FloatTensor(x)

            eval_set = Dataset(x, y, normalize=True, augment=False)

            params = {'batch_size':batch_size, 'shuffle':False, 'num_workers':6}
            eval_generator = data.DataLoader(eval_set, **params)
            
            for x_batch, y_batch in eval_generator:

                x_batch = x_batch.to(self.device)

                y, p, out_sizes = self.predict(x_batch)

                y_batch_no_pad, target_sizes = self.remove_padding(y_batch, p.shape[0])

                # ------ LOSS CALCULATION ------          

                loss += self.loss_fcn(p, y_batch_no_pad, out_sizes, target_sizes).numpy()   

                # ------ ACCURACY CALCULATION ------

                y_batch = self.decoder.convert_to_strings(y_batch)

                for i in range(len(y_batch)):
                    chars_pred = list(y[i][0])
                    chars_true = list(y_batch[i][0])
                    
                    len_diff = len(chars_pred) - len(chars_true)
                    if len_diff < 0:
                        chars_pred += ["_"]*(-len_diff)
                    elif len_diff > 0:
                        chars_true += ["_"]*len_diff

                    dist    += levenshtein(chars_pred , chars_true)
                    seq_acc += accuracy_score(y[i], y_batch[i])
                    dig_acc += accuracy_score(chars_pred, chars_true)

                torch.cuda.empty_cache()

            # average over sequences
            loss    /= x.shape[0]
            dist    /= x.shape[0]
            seq_acc /= x.shape[0]
            dig_acc /= x.shape[0]

            return seq_acc, dig_acc, loss, dist
Example #18
    def fit(self, training_data, validation_data, optimizer, 
            epochs=100, batch_size=10, logger=None):
        '''
        Training loop

        input:
          tuple training_data:   (features, labels), as numpy.ndarray
          tuple validation_data: (features, labels), as numpy.ndarray
          optimizer: a PyTorch optimizer object
          logger:    a HistoryLogger object, for visualizing training
        '''

        # Learning-rate is reduced on plateau, parameters might need tweaking depending on data
        scheduler = lr_reducer(optimizer, factor=0.5, patience=5, min_lr=1e-6)

        logger.on_train_begin()

        self.to(self.device)
        
        # Data to tensors
        x_tr = torch.tensor(training_data[0]).type(torch.FloatTensor)
        y_tr = torch.IntTensor(training_data[1])
        x_te = torch.tensor(validation_data[0]).type(torch.FloatTensor)
        y_te = torch.IntTensor(validation_data[1])

        print("\nTraining with {} examples, validating with {} examples"\
              .format(x_tr.shape[0], x_te.shape[0]))

        best_acc = 0 # for determining best checkpoint

        training_set = Dataset(x_tr, y_tr, normalize=True, augment=True)

        params = {'batch_size':batch_size, 'shuffle':True, 'num_workers':6}
        train_generator = data.DataLoader(training_set, **params)

        for e in range(self.last_epoch, epochs):

            print("\nStarting epoch {}/{} ...".format(e+1, epochs))

            loss = 0

            # Progress bar
            with tqdm(train_generator,
                      total = (x_tr.shape[0]//batch_size)+1, 
                      unit_scale = batch_size,
                      postfix = "loss: {}".format(loss)) as t:

                for x_batch, y_batch in t:

                    # one batch at a time to GPU, uses less GPU memory
                    x_batch = x_batch.to(self.device)

                    optimizer.zero_grad()

                    y = self(x_batch) # predict on batch

                    # Output strings are all the same length, but warp-ctc needs 
                    # these lengths as a tensor to work.
                    out_sizes = torch.IntTensor([y.shape[0]]*batch_size)
                    out_sizes = torch.autograd.Variable(out_sizes, requires_grad=False)
                    
                    y_batch, target_sizes = self.remove_padding(y_batch, y.shape[0])

                    # predictions are needed without softmax in the loss function
                    loss = self.loss_fcn(y, y_batch, out_sizes, target_sizes)

                    loss.backward() 
                    optimizer.step()

                    t.postfix = "loss: {:8.4f}".format(loss.item())

                    torch.cuda.empty_cache()

            if logger is not None:
                seq_acc, _, loss, _ = logger.on_epoch_end(e)
            else:
                seq_acc, _, loss, _ = self.evaluate(x_te, y_te, batch_size)    

            scheduler.step(loss)

            # Save checkpoint
            most_accurate = False
            if seq_acc > best_acc:
                best_acc = seq_acc 
                most_accurate = True

            checkpointer({'epoch'        : e+1,
                          'model_states' : self.state_dict(),
                          'optimizer'    : optimizer.state_dict(),
                          'accuracy'     : best_acc,
                          'model_params' : self.modelparams}, 
                          most_accurate, self.out_path)

            print("validation accuracy: {:4.2f}, validation loss: {:6.4f}"\
                  .format(float(seq_acc), float(loss)))
Example #19
#


# all_s["app_s"] = all_s["ApplicantIncome"] * all_s["Self_Employed"]
# all_s["ci_s"] = all_s["CoapplicantIncome"] * all_s["Self_Employed"]
# all_s["la_s"] = all_s["LoanAmount"] * all_s["Self_Employed"]
# all_s["lat_s"] = all_s["Loan_Amount_Term"] * all_s["Self_Employed"]

features_to_drop = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
                    'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
                    'Loan_Amount_Term']

all_filtered = all_s.drop(features_to_drop, axis=1)

print(train.columns)
# ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
#        'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
#        'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status']

train_cust = all_filtered[:ntrain]
test_cust = all_filtered[ntrain:]

print(all_s.columns)
#
# train_cat_enc = sp.hstack(train_cat_enc, format='csr')
# test_cat_enc = sp.hstack(test_cat_enc, format='csr')

Dataset.save_part_features('fSelect', list(all_filtered.columns))
Dataset(fSelect=train_cust.values).save('train')
Dataset(fSelect=test_cust.values).save('test')
#
# print("Done.")
Example #20
import numpy as np

from util import Dataset
from sklearn.preprocessing import scale

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')

print("Scaling...")

all_scaled = scale(np.vstack((train_num, test_num)))

print("Saving...")

Dataset.save_part_features('numeric_mean_scaled',
                           Dataset.get_part_features('numeric'))
Dataset(numeric_mean_scaled=all_scaled[:train_num.shape[0]]).save('train')
Dataset(numeric_mean_scaled=all_scaled[train_num.shape[0]:]).save('test')

print("Done.")
Example #21
                  help='Resume from the given checkpoint.')

opt = args.parse_args()

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

if opt.checkpoint is not None:
    print('Load model from checkpoints')
    model = BertForNextSentencePrediction.from_pretrained(
        join('checkpoints', opt.checkpoint))
else:
    model = BertForNextSentencePrediction.from_pretrained('bert-base-chinese')

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

dataset = Dataset(tokenizer)
dataloader = data.DataLoader(dataset=dataset,
                             batch_size=opt.batch_size,
                             shuffle=True,
                             collate_fn=custom_collate(tokenizer))

task_name = '{}-{}'.format(opt.name, time.strftime("%Y-%m-%d-%H-%M-%S"))
writer = SummaryWriter('runs/{}'.format(task_name))

if not os.path.exists(join('checkpoints', task_name)):
    os.mkdir(join('checkpoints', task_name))

model.to('cuda')
model.train()

acc_loss = []  # accumulated loss, refreshed when it is plotted
import numpy as np

from util import Dataset, vstack, hstack

from sklearn.preprocessing import scale
from sklearn.decomposition import TruncatedSVD

n_components = 500  # 500 components explain 99.8% of variance

print "Loading data..."

train_num = Dataset.load_part('train', 'numeric')
train_cat = Dataset.load_part('train', 'categorical_dummy')

test_num = Dataset.load_part('test', 'numeric')
test_cat = Dataset.load_part('test', 'categorical_dummy')

train_cnt = train_num.shape[0]

print "Combining data..."

all_data = hstack((scale(vstack(
    (train_num, test_num)).astype(np.float64)).astype(np.float32),
                   vstack((train_cat, test_cat))))

del train_num, train_cat, test_num, test_cat

print "Fitting svd..."

svd = TruncatedSVD(n_components)
res = svd.fit_transform(all_data)
import numpy as np
import pandas as pd
from scipy.stats import skew, boxcox
from sklearn.preprocessing import scale

from tqdm import tqdm
from util import Dataset

import itertools

print("Loading data...")

train_num = Dataset.load_part('train', 'numeric_mean')
test_num = Dataset.load_part('test', 'numeric_mean')
ntrain = train_num.shape[0]

train_test = np.vstack([train_num, test_num])
num_features = Dataset.get_part_features('numeric')
num_comb_df = pd.DataFrame()

with tqdm(total=train_num.shape[1], desc='  Transforming',
          unit='cols') as pbar:
    for comb in itertools.combinations(num_features, 2):
        feat = comb[0] + "_" + comb[1]

        num_comb_df[feat] = (train_test[:, num_features.index(comb[0]) - 1] +
                             train_test[:, num_features.index(comb[1]) - 1])
        print('Combining Columns:', feat)
Example #24
# Replace category label with their counts

import numpy as np
import pandas as pd

from tqdm import tqdm
from util import Dataset

print("Loading data...")

train_cat = Dataset.load_part('train', 'categorical_mode')
test_cat = Dataset.load_part('test', 'categorical_mode')

train_cat_counts = np.zeros(train_cat.shape, dtype=np.float32)
test_cat_counts = np.zeros(test_cat.shape, dtype=np.float32)

with tqdm(total=train_cat.shape[1], desc='  Counting', unit='cols') as pbar:
    for col in range(train_cat.shape[1]):
        train_series = pd.Series(train_cat[:, col])
        test_series = pd.Series(test_cat[:, col])

        counts = pd.concat((train_series, test_series)).value_counts()
        train_cat_counts[:, col] = train_series.map(counts).values
        test_cat_counts[:, col] = test_series.map(counts).values
        pbar.update(1)

print("Saving...")

print(train_cat_counts)
Dataset.save_part_features('categorical_counts',
                           Dataset.get_part_features('categorical'))
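# Toy, self-contained illustration of the count encoding above: every category
# value is replaced by how often it occurs across train and test combined.
import pandas as pd

train_series = pd.Series(['A', 'B', 'A', 'C'])
test_series = pd.Series(['A', 'C', 'C'])
counts = pd.concat((train_series, test_series)).value_counts()
print(train_series.map(counts).values)  # [3 1 3 3]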
from keras.optimizers import SGD, Adam, Adadelta
from keras.callbacks import ModelCheckpoint
from keras import regularizers
#from keras_util import ExponentialMovingAverage, batch_generator

from statsmodels.regression.quantile_regression import QuantReg

from pylightgbm.models import GBMRegressor

from scipy.stats import boxcox

from bayes_opt import BayesianOptimization

from sklearn.base import BaseEstimator

from util import Dataset, load_prediction, hstack

categoricals = Dataset.get_part_features('categorical')


class DenseTransformer(BaseEstimator):
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self


class BaseAlgo(object):
Example #26
import numpy as np
import scipy.sparse as sp
import pandas as pd
from sklearn.preprocessing import scale

from tqdm import tqdm
from util import Dataset

print("Loading data...")

idx = Dataset.load_part("train", 'id')

train_cat = pd.DataFrame(Dataset.load_part("train", 'categorical_mode'),
                         columns=Dataset.get_part_features('categorical_mode'),
                         index=idx)
train_num = pd.DataFrame(Dataset.load_part("train", 'numeric_mean'),
                         columns=Dataset.get_part_features('numeric_mean'),
                         index=idx)

train = pd.concat([train_cat, train_num], axis=1)

idx = Dataset.load_part("test", 'id')

test_cat = pd.DataFrame(Dataset.load_part("test", 'categorical_mode'),
                        columns=Dataset.get_part_features('categorical_mode'),
                        index=idx)
test_num = pd.DataFrame(Dataset.load_part("test", 'numeric_mean'),
                        columns=Dataset.get_part_features('numeric_mean'),
                        index=idx)

test = pd.concat([test_cat, test_num], axis=1)
def y_decode(y):
    og = Dataset.load_part("train", "target_labels")
    le = LabelEncoder()
    le.classes_ = og
    z = [int(i) for i in y]
    return le.inverse_transform(z)
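# Standalone illustration of the trick in y_decode: a LabelEncoder can be
# rebuilt by assigning classes_ directly (made-up label set here; the real one
# comes from the saved 'target_labels' part).
import numpy as np
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.classes_ = np.array(['N', 'Y'])
print(le.inverse_transform([1, 0, 1]))  # ['Y' 'N' 'Y']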
Example #28
    'discriminator_loss': [],
    'encoded_feature_vector': [],
    'original_images': [],
    'encoded_images': [],
    'reconstruct_images': [],
    'reconstruct_from_random': [],
}
statistics_file = 'statistics/' + eid
print('id: ', eid)
print('number of epochs = {:d}'.format(n_epochs))
print('batch_size = {:d}'.format(batch_size))

# Load data
X_train = load_data('../data/data.npy')  # (2000, 784)
label_train = load_data('../data/label.npy')  # (2000,)
train_dataset = Dataset(X_train, label_train, batch_size)
n_train_samples = X_train.shape[0]
n_iters = int(n_epochs * n_train_samples / batch_size)
print('number of iterations = {:d}'.format(n_iters))


def weight_variable(shape):
    initial = tf.random_normal(shape, stddev=0.01)
    return tf.Variable(initial)


def bias_variable(shape):
    initial = tf.fill(shape, 0.1)
    return tf.Variable(initial)

import pandas as pd

from util import Dataset

for name in ['train', 'test']:
    print "Processing %s..." % name

    num = pd.DataFrame(Dataset.load_part(name, 'numeric'),
                       columns=Dataset.get_part_features('numeric'))
    df = pd.DataFrame(index=num.index)

    df['diff_1_6'] = num['cont1'] - num['cont6']
    df['diff_1_9'] = num['cont1'] - num['cont9']
    df['diff_1_10'] = num['cont1'] - num['cont10']
    df['diff_6_9'] = num['cont6'] - num['cont9']
    df['diff_6_10'] = num['cont6'] - num['cont10']
    df['diff_6_11'] = num['cont6'] - num['cont11']
    df['diff_6_12'] = num['cont6'] - num['cont12']
    df['diff_6_13'] = num['cont6'] - num['cont13']
    df['diff_7_11'] = num['cont7'] - num['cont11']
    df['diff_7_12'] = num['cont7'] - num['cont12']
    df['diff_11_12'] = num['cont11'] - num['cont12']

    if name == 'train':
        Dataset.save_part_features('numeric_combinations', list(df.columns))

    Dataset(numeric_combinations=df.values).save(name)

print "Done."
Example #30
import numpy as np
import pandas as pd

from util import Dataset
from sklearn.preprocessing import minmax_scale

print "Loading data..."

train_num = Dataset.load_part('train', 'numeric')
test_num = Dataset.load_part('test', 'numeric')

print "Scaling..."

numeric = pd.DataFrame(np.vstack((train_num, test_num)),
                       columns=Dataset.get_part_features('numeric'))

df = pd.DataFrame(index=numeric.index)
df["cont1"] = np.sqrt(minmax_scale(numeric["cont1"]))
df["cont4"] = np.sqrt(minmax_scale(numeric["cont4"]))
df["cont5"] = np.sqrt(minmax_scale(numeric["cont5"]))
df["cont8"] = np.sqrt(minmax_scale(numeric["cont8"]))
df["cont10"] = np.sqrt(minmax_scale(numeric["cont10"]))
df["cont11"] = np.sqrt(minmax_scale(numeric["cont11"]))
df["cont12"] = np.sqrt(minmax_scale(numeric["cont12"]))
df["cont6"] = np.log(minmax_scale(numeric["cont6"]) + 0000.1)
df["cont7"] = np.log(minmax_scale(numeric["cont7"]) + 0000.1)
df["cont9"] = np.log(minmax_scale(numeric["cont9"]) + 0000.1)
df["cont13"] = np.log(minmax_scale(numeric["cont13"]) + 0000.1)
df["cont14"] = (np.maximum(numeric["cont14"] - 0.179722, 0) / 0.665122)**0.25

print "Saving..."