Example #1
def train(config, Model):

    # Load the data and create X and Y matrices
    data = get_data(config)
    num_features = data.shape[1] - 1
    X = data[:, :num_features]
    Y = data[:, -1]

    # split the data into training and test set
    X_train, Y_train, X_test, Y_test = split_data(X,
                                                  Y,
                                                  0.80,
                                                  balance_dist=True)
    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)
    Y_train = to_categorical(Y_train)
    Y_test = to_categorical(Y_test)

    # instantiate the CNN model and train on the data
    model = Model(num_features, Y_train.shape[1])
    history = model.fit(X_train,
                        Y_train,
                        batch_size=128,
                        epochs=100,
                        verbose=2)

    # Evaluate the trained model on test data and print the accuracy
    score = model.model.evaluate(X_test, Y_test)
    print("\nTest accuracy: ", round(score[1] * 100, 2))
    print("Test loss: ", round(score[0], 2))

    return history
Example #2
def main():

    import config

    from model import load_model
    model = load_model()
    while not model:
        config.model_path = input('valid model: ')
        model = load_model()

    from data import load_data, split_data
    d = load_data(with_meta=True)
    d, _ = split_data(d)

    # from random import shuffle
    # shuffle(d)
    d = d[:config.hm_output_file]
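    # keep only the first hm_output_file sequences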

    for i, (seq, meta) in enumerate(d):

        from model import respond_to
        _, seq = respond_to(model, [seq[:config.hm_extra_steps]],
                            training_run=False,
                            extra_steps=config.hm_extra_steps)
        seq = seq.detach()
        if config.use_gpu:
            seq = seq.cpu()
        seq = seq.numpy()

        from data import data_to_audio, write
        seq = data_to_audio(seq, meta)
        write(f'{config.output_file}{i}.wav', config.sample_rate, seq)
Example #3
File: run.py Project: Petlja/PSIML
def main():
    # Generate and split data
    # Try playing with the arguments
    all_data = data.generate_data_gauss(numSamples=1000, noise=0.5)
    train_data, valid_data = data.split_data(all_data, val_factor=0.3)
    # Set show to True if you want to see generated dataset
    data.plot_data(train_data, valid_data, show=False)

    # Directory to save summaries to
    # From your conda environment run
    # tensorboard --logdir ../tf_playground/output
    # to see training details
    output = utils.get_output_dir()

    # Create model
    # Go to model.py file to make changes to the model
    model = Model()

    # Let's train
    # Try changing the number of epochs and batch_size
    trainer = Trainer(train_data=train_data,
                      valid_data=valid_data,
                      model=model,
                      epochs=10,
                      batch_size=2,
                      output=output)
    trainer.train()

    trainer.save_final_accuracy()
Example #4
def main(unused_argv):

  if len(unused_argv) != 1:  # raise if extra command-line arguments were passed
    raise Exception('Problem with flags: %s' % unused_argv)

  # choose what level of logging you want
  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) 

  # user_rating has the elements in the following order
  # user_id, item_id, rating, time, num_words, review
  user_rating, user_id_to_idx, item_id_to_idx = read_file(FLAGS.data_path)
  num_users = len(user_id_to_idx)
  num_items = len(item_id_to_idx)
  num_reviews = len(user_rating)
  print('Number of total users / items / reviews: %d / %d / %d' % 
          (num_users, num_items, num_reviews))
  users_ratings = [ur for ur in user_rating]
  train_ratings, test_ratings, valid_ratings = split_data(users_ratings)

  # build vocabulary
  id_to_word, word_to_id = build_vocab(users_ratings, FLAGS.vocab_size)
  train_item_doc = token_to_id(train_ratings, word_to_id)
  valid_item_doc = token_to_id(valid_ratings, word_to_id)

  current_datetime = datetime.now()
  subfolder_timestamp = datetime.strftime(current_datetime, '%Y%m%d-%H%M%S')
  subfolder_dataname = os.path.basename(FLAGS.data_path)
  log_folder = os.path.join(FLAGS.log_root, subfolder_dataname + '-' + subfolder_timestamp)
  # save vocab to output folder
  pathlib.Path(log_folder).mkdir(parents=True, exist_ok=True) 
  with open(os.path.join(log_folder, 'vocab.csv'), 'w') as f:    
    for idx, token in id_to_word.items():
      f.write('%s,%s\n' % (idx, token))
  
  # Try offset model
  offset_model = offsetModel(train_ratings, valid_ratings, test_ratings)
  offset_model.train()

  # Make a namedtuple hps, containing the values of the hyperparameters that the model needs
  hparam_list = ['init_stddev', 'emb_dim', 'min_kappa', 'max_kappa', 
                 'vocab_size', 'mu', 'max_iter_steps', 'num_iter_steps',
                 'threshold']
  hps_dict = {}
  for key,val in FLAGS.flag_values_dict().items(): # for each flag
    if key in hparam_list: # if it's in the list
      hps_dict[key] = val # add it to the dict
  hps = namedtuple('HParams', hps_dict.keys())(**hps_dict)

  hft_model = HFTModel(hps, train_ratings, valid_ratings, test_ratings,
                       train_item_doc, valid_item_doc,
                       num_users, num_items, num_reviews, log_folder)
  hft_model.build_graph()  
  hft_model.train()
Example #5
    def test_precision(self):
        df = pd.read_pickle('../data/final/df_final.pkl')
        data = d.split_data(df, True)
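        # data holds the train, test and validation splits, in that order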

        data_train = data[0]
        data_test = data[1]
        data_val = data[2]

        b = base.baseline(df, False)

        als_result = als_precision(data_train, data_val, b)
        assert 1 == 1
Example #6
def main(disp_text=True):

    if config.fresh_model:
        config.all_losses = []
        save_model(make_model())
        model = load_model()
        if disp_text: print('created model.', end=' ')
    else:
        model = load_model()
        if not model:
            save_model(make_model())
            model = load_model()
            if disp_text: print('created model.', end=' ')
        else:
            if disp_text: print('loaded model.', end=' ')

    data = load_data()
    data, data_dev = split_data(data)

    data = [d for i,d in enumerate(data) if i in [8,10,13,14]]
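    # keep only a hand-picked subset of sequences (indices 8, 10, 13, 14)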
    print()
    seq_lens = [len(d) for d in data]
    print(f'seq lens: {seq_lens}')
    min_seq_len = min(seq_lens)
    print(f'min seq len: {min_seq_len}')
    if not config.max_seq_len or config.max_seq_len > min_seq_len:
        config.max_seq_len = min_seq_len
    data = [d[:config.max_seq_len] for d in data]

    # from random import choice
    # from torch import randn
    # data = [[randn(config.in_size) for _ in range(choice(range(config.max_seq_len//2,config.max_seq_len)))] for _ in range(10)]
    # data_dev = []
    # for d in data: print(len(d))

    if not config.batch_size or config.batch_size >= len(data):
        config.batch_size = len(data)
    elif config.batch_size < 1:
        config.batch_size = int(len(data)*config.batch_size)

    if disp_text: print(f'hm data: {len(data)}, hm dev: {len(data_dev)}, bs: {config.batch_size}, lr: {config.learning_rate}, \ntraining started @ {now()}')

    for ep in range(config.hm_epochs):

        for i, batch in enumerate(batchify_data(data)):

            train_on(model, batch)

    return model
Example #7
File: dna.py Project: hhcho/harness
def load_data(paths, report=sys.stdout, **kwargs):
    import data
    dtypes = [parse_type(path) for path in paths]
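    # all input files must be of the same type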
    if not all(t == dtypes[0] for t in dtypes[1:]):
        print("Error: all files must be of same type but were {}".format(dtypes), file=report)
        raise Exception()
    dtype = dtypes[0]
    labels = dtype == labeled
    Xt, Yt, Xv, Yv = data.split_data(read_files(paths, report=report, **kwargs),
                                     labels=labels, report=sys.stdout, **kwargs)
    if labels:
        labels = max(np.max(Yt), np.max(Yv))+1
    else:
        labels = 0
    return Xt, Yt, Xv, Yv, labels
def main(args):
    x, fx = get_data(args)

    device = torch.device("cuda" if args.cuda else "cpu")
    train_data, val_data = split_data(args, x, fx)

    if args.save_splits:
        save_splits(train_data, val_data)

    train_loader, val_loader = get_loaders(train_data, val_data)

    model = get_model(args)

    trainer = get_trainer(model, train_loader, val_loader, device, args)
    trainer.train()
def main():

    import config

    from model import load_model
    model = load_model(config.model_path + '_final')
    while not model:
        config.model_path = input('valid model: ')
        model = load_model()

    from data import load_data, split_data
    d = load_data()
    d, _ = split_data(d)

    # from random import shuffle
    # shuffle(d)
    #d = d[:config.hm_output_file]
    d = [d[8]]  # [8,10,13,14]]
    config.polyphony = True

    for i, seq in enumerate(d):

        from model import respond_to
        seq = respond_to(model, seq[:1])
        seq = [t.detach() for t in seq]
        if config.use_gpu:
            seq = [t.cpu() for t in seq]
        seq = [t.numpy() for t in seq]

        from data import note_reverse_dict, convert_to_midi
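        # map output activations back to note names (with octave), 'R' marking rests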
        seq_converted = []
        for timestep in seq:
            if config.act_fn == 't': timestep = (timestep + 1) / 2
            if config.polyphony:
                t_converted = ''
                for i, e in enumerate(timestep[0]):
                    if e > config.pick_threshold:
                        t_converted += note_reverse_dict[i % 12] + str(
                            int(i / 12) + config.min_octave
                        ) if i != config.out_size - 1 else 'R'
                        t_converted += ','
                t_converted = t_converted[:-1] if len(t_converted) else 'R'
            else:
                i = timestep[0].argmax()
                t_converted = note_reverse_dict[i % 12] + str(
                    int(i / 12) + config.min_octave)
            seq_converted.append(t_converted)
        convert_to_midi(seq_converted).show()
Example #10
def get_stats():
    config = {
        'unknown_freq': 2,
        'gold_ratio': 0.1,
        'inc_option': 'auxiliary',
        'auxiliary_option': 'detection',
        'seed': 66
    }
    dir_path = '/path/to/working/dir'
    set_random_seed(config['seed'])
    train_file = dir_path + '/data/ontonotes.development.ner'
    print('load data')
    train_data = get_data(train_file)
    gold_data, inc_data = split_data(train_data, config)
    print('get vocabulary')
    word_to_ix, pos_to_ix, ner_to_ix = get_vocabulary(train_data, config)
    config['ner_to_ix'] = ner_to_ix
    config['pos_to_ix'] = pos_to_ix
    config['word_to_ix'] = word_to_ix
    config['output_size'] = len(ner_to_ix)
    print('ner_to_ix', ner_to_ix)
    print('word_to_ix', len(word_to_ix))
    print('process data')
    inc_input_ids, inc_sent_ids, inc_pos_ids, inc_ner_ids = process_data(
        inc_data, word_to_ix, pos_to_ix, ner_to_ix)
    inc_ner_ids = get_incidental_data(inc_sent_ids, inc_input_ids, inc_pos_ids,
                                      inc_ner_ids, config)
    inc_label_counter = Counter()
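    # relative frequency of each incidental NER label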
    for label in inc_ner_ids:
        # if label[0] == 'B' or label[0] == 'I':
        #    label = label[2:]
        inc_label_counter[label] += 1 / len(inc_ner_ids)
    print('inc label counter', inc_label_counter)
    inputs, sent_ids, pos_labels, ner_labels = inc_data
    word_seqs = generate_sent_seqs(inputs, sent_ids)
    pos_seqs = generate_sent_seqs(pos_labels, sent_ids)
    ner_seqs = generate_sent_seqs(ner_labels, sent_ids)
    inc_data = []
    sent_counter = Counter()
    for x in range(len(word_seqs)):
        inc_data.append((word_seqs[x], pos_seqs[x], ner_seqs[x]))
        sent_counter[len(word_seqs[x])] += 1 / len(word_seqs)
    print('average sent length', len(sent_ids) / len(word_seqs))
    print('sent length distribution', sent_counter.items())
Example #11
def main(images_path, labels_path):
    keras.backend.clear_session()

    data_df = get_data(images_path, labels_path)

    raw_train, valid = split_data(data_df)

    model = create_model(num_classes=28, input_shape=input_shape)
    model.compile(loss="binary_crossentropy",
                  optimizer=Adam(),
                  metrics=["acc", f1])
    # model.compile(loss=[_focal_loss(gamma=2,alpha=0.75)], optimizer=Adam(), metrics=["acc", f1])

    epochs = 50
    batch_size = 64
    checkpointer = ModelCheckpoint("../working/InceptionResNetV2.model",
                                   verbose=2,
                                   save_best_only=True)
    early_stopping = EarlyStopping(monitor="val_loss", patience=2)
    reduce_lr = ReduceLROnPlateau(monitor="val_loss", patience=1, factor=0.1)

    train_generator = DataGenerator.create_train(raw_train,
                                                 batch_size,
                                                 DEFAULT_IMG_SIZE_WHC,
                                                 augument=True)
    validation_generator = DataGenerator.create_train(valid,
                                                      100,
                                                      DEFAULT_IMG_SIZE_WHC,
                                                      augument=False)

    train_steps = raw_train.shape[0] // batch_size
    valid_steps = valid.shape[0] // batch_size

    # train model
    history = model.fit_generator(
        train_generator,
        steps_per_epoch=train_steps,
        validation_data=next(validation_generator),
        validation_steps=valid_steps,
        epochs=epochs,
        verbose=1,
        callbacks=[checkpointer, reduce_lr],
    )
Example #12
def main():
    try:
        feature1 = request.form["feature1"]
        feature2 = request.form["feature2"]

        classifier = request.form["classifier"]
    except KeyError:
        error = "Warning! Missing selections. Please select two features from the dataset, and one classifier!"
        return render_template('select.html', error=error)

    df = read_diabetes()
    x_train, x_test, y_train, y_test = split_data(df)

    x_train, x_test = select_features(x_train, x_test, [feature1, feature2])

    clf = eval(classifier + "()")
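    # instantiate the selected classifier class by name (note: eval on form input is unsafe)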
    clf.fit(x_train, y_train)

    plot_data = build_plot(clf, x_test, y_test)

    accuracy = clf.score(x_test, y_test)

    return render_template('plot.html', accuracy=accuracy, plot_url=plot_data)
Example #13
def preproc_data():
    from data import split_data
    split_data('../data/hin-eng/hin.txt', '../data/hin-eng')
Example #14

def CNN(X_train, X_test, y_train, y_test):
    X_train, X_test, y_train, y_test = reshape_data(X_train, X_test, y_train,
                                                    y_test)
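    # two conv layers -> flatten -> 10-way softmax on 28x28 single-channel inputs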
    model = Sequential()
    model.add(
        Conv2D(64, kernel_size=3, activation='relu', input_shape=(28, 28, 1)))
    model.add(Conv2D(32, kernel_size=3, activation='relu'))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    history = model.fit(X_train,
                        y_train,
                        validation_data=(X_test, y_test),
                        epochs=10,
                        batch_size=100)
    plt.plot(history.history['loss'])
    # plt.plot(history.history['val_loss'])
    plt.title('Loss Function')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    # plt.legend(['train', 'test'], loc='upper left')
    plt.show()


if __name__ == "__main__":
    train_x, test_x, train_y, test_y = split_data()
    CNN(train_x, test_x, train_y, test_y)
Example #15
        output, hidden = model(input, hidden)
        output = output.squeeze()
        output = softmax(output, dim=0)
        p = output[current_idx].data  # probability of the current token
        total_p += math.log(p)  # log base e
    return math.exp(-total_p * (1 / sentence_len))


def evaluate(model, test_dataset, dict):
    ppl = 0
    for sentence in test_dataset:
        ppl += evaluate_iter(model, sentence, dict)
    ppl = ppl / len(test_dataset)
    print("evaluation ppl:", ppl)
    return ppl


if __name__ == '__main__':
    dataset = data.get_dataset(file_path)
    dict = data.build_dict(dataset)
    config.vocab_size = len(dict)
    train_dataset, test_dataset = data.split_data(
        dataset, train_proportion=config.train_proportion)
    train_tokens = data.tokenize(train_dataset, dict)
    model = RNNModel(config)
    train_batch_source = data.batchify(train_tokens,
                                       config.batch_size)  # train directly on the batchified data
    train(model, batch_source=train_batch_source)

    #test
    evaluate(model, test_dataset, dict)
Example #16
def train(gpu: int, args: Namespace):
    """Implements the training loop for PyTorch a model.

    Args:
        gpu: the GPU device
        args: user defined arguments
    """

    # setup process groups
    rank = args.nr * args.gpus + gpu
    setup(rank, args)
    
    # define the model
    model = ResNext().architecture
    model.cuda(gpu)
    # Wrap the model
    model = DDP(model, device_ids=[gpu])

    # define loss function (criterion) and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = Adam(model.parameters(), args.lr)

    # split data
    train_df = split_data(args.folds)

    for fold in range(args.folds):
        losses = []
        scores = []
        train_loader, valid_loader = get_data(args, train_df, fold, rank)
        
        if gpu == 0:
            print(f"Training started using fold {fold} for validation") 
        
        # train
        model.train()
        for epoch in range(args.epochs):
            for i, (images, labels) in enumerate(train_loader):
                images = images.cuda(gpu)
                labels = labels.cuda(gpu)
                output = model(images)
                loss = criterion(output, labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                if i % args.log_interval == 0 and gpu == 0:
                    print("Train Epoch: {} [{}/{} ({:.0f}%)]\tloss={:.4f}".format(
                          epoch+1, i, len(train_loader),
                          100. * i / len(train_loader), loss.item()))
        
        # evaluate
        model.eval()
        with torch.no_grad():
            for i, (images, labels) in enumerate(valid_loader):
                images = images.cuda(gpu)
                labels = labels.cuda(gpu)
                output = model(images)
                loss = criterion(output, labels).item()
                score = get_score(labels.detach().cpu(), output.detach().cpu())
                losses.append(loss)
                scores.append(score)

            if gpu == 0:
                print("Validation loss={:.4f}\tAUC score={:.4f}".format(
                      statistics.mean(losses), statistics.mean(scores)))
                
        # checkpoint model
        model = checkpoint(model, gpu, fold)
            
    if args.save_model and gpu == 0:
        torch.save(model.module.state_dict(), "model.pt")
        
    cleanup()
Example #17
from data import get_train_data, get_vocab, split_data, response_len, post_len, padding
import random
import os

from pprint import pprint
import numpy as np
import time

id2w, w2id, freq = get_vocab()

from emo_cls.classification import Classification
from seq2seq_attention_9emo import Seq2SeqAttentionMinDis, Seq2SeqAttentionMaxDis, Seq2SeqAttentionEmoContent
from seq2seq_attention_9emo import Seq2SeqAttentionHappy, Seq2SeqAttentionSad, Seq2SeqAttentionAnger, Seq2SeqAttentionDisgust
from seq2seq_attention_9emo import Seq2SeqAttentionLike  #,Seq2SeqAttentionSurprise,Seq2SeqAttentionFear

train_datas, val_datas, test_datas = split_data()

keys = ['posts', 'postLen', 'resps', 'respLen', 'resp_tfidf']
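# keep only these fields, in this order, for the train and validation sets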
train_datas = [train_datas[k] for k in keys]
val_datas = [val_datas[k] for k in keys]
print("train num:%s" % len(train_datas[0]))

seq_len = 20
batch_size = 128
D_step = 5
G_step = 1
is_debug = True

# Emotion Classifier
emo_clas = Classification(sequence_length=20, num_classes=6, l2_reg_lambda=0.1)
emo_clas.restore_last_session(base_path="./emo_cls")
Example #18
import data
import utils
import log
import Model  # assumed module providing the Model class used below
log = log.log
log.struct_log(log)

pre_method = 'Rescaling'
train_method = 'RandomForest'
time_train = 300
time_test = 10
n_stock_select = 10
seed = 41

data = data.data
data.read_data(data)
data.split_data(data)
data.pre_process(data,pre_method)

model = Model.Model
model.read_data(model)
model.roll_train(model,train_method,time_train,time_test)

utils = utils.utils
utils.parameter(utils,n_stock_select,seed)
utils.struct_strategy(utils)
utils.merging_index(utils)

log.logger.info(utils.strategy)

utils.print_winrate(utils)
utils.plot_value(utils)
Example #19
model_files = glob.glob('models/*.hdf5')
other_models = glob.glob('models/*/*-0.6*hdf5')
model_files.extend(other_models)

public_test_dict = {}
private_test_dict = {}
results = {}

for model_file in model_files:
    model = load_model(model_file, compile=False)
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    input_shape = model.input_shape[1:4]
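    # load FER2013 once per input shape and cache its public / private test splits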
    if input_shape not in public_test_dict.keys():
        faces, emotions = load_emotion_data('data/fer2013/fer2013.csv', input_shape)
        train, test = split_data(faces, emotions, 0.2)
        public_test_dict[input_shape], private_test_dict[input_shape] = split_data(test[0], test[1], 0.5)


    start = time.time()
    public_test_result = model.evaluate(public_test_dict[input_shape][0], public_test_dict[input_shape][1])
    private_test_result = model.evaluate(private_test_dict[input_shape][0], private_test_dict[input_shape][1])
    duration = time.time() - start
    print(model_file)
    print('public  test', public_test_result)
    print('private test', private_test_result)
    results[model_file] = {'public_acc': public_test_result[1], 'private_acc': private_test_result[1], 'time': duration}
print(results)
import json
json.dump(results, open('test.json', 'w'))
Example #20
File: run.py Project: pmmf/SI-SLR
def main():
    global ADV_WEIGHT, TRANSFER_WEIGHT

    # set random seed
    np.random.seed(42)
    torch.manual_seed(42)
    torch.backends.cudnn.deterministic = True

    # Parsing arguments
    parser = argparse.ArgumentParser(description='signer-independent project')
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--dataset', type=str, required=True)
    parser.add_argument('--mode', type=str, default='test')
    parser.add_argument('--gpu', type=int, required=True)
    parser.add_argument('--adv_weight', type=float, required=True)
    parser.add_argument('--transf_weight', type=float, required=True)
    parser.add_argument('--output', default='./output_cnn/')

    args = parser.parse_args()

    # set adversarial and transfer weights
    TRANSFER_WEIGHT = args.transf_weight
    ADV_WEIGHT = args.adv_weight

    # Make output directory if it does not exist
    if not os.path.isdir(args.output):
        os.mkdir(args.output)

    # select gpu
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # dataset
    dataset = DATASETS_LIST[args.dataset](model=args.model)
    X_to_split = np.zeros((len(dataset), 1))
    print(len(dataset[0]))

    # evaluation protocol
    (IM_SIZE, MODE, SPLITS, n_signers,
     n_classes) = get_evaluation_protocol(args.dataset)

    # get data splitter
    dataSplitter = getSplitter(dataset,
                               n_splits=SPLITS,
                               mode=MODE,
                               test_size=.10)

    results = []
    split = 0

    for split, (tr_indexes, test_indexes) in enumerate(dataSplitter):
        output_fn = os.path.join(args.output, 'split_' + str(split))

        if not os.path.isdir(output_fn):
            os.mkdir(output_fn)

        # split data
        (train_loader, valid_loader,
         test_loader) = split_data(dataset, (tr_indexes, test_indexes),
                                   BATCH_SIZE,
                                   dataAug=True,
                                   mode=MODE)

        # Initialize the model
        model = MODEL_LIST[args.model](input_shape=IM_SIZE,
                                       output_signers=n_signers,
                                       output_classes=n_classes,
                                       hasAdversial=True).to(device)
        print(model)

        # Train or test
        if args.mode == 'train':
            # Fit model
            model, train_history, valid_loader = fit(model=model,
                                                     data=(train_loader,
                                                           valid_loader),
                                                     device=device,
                                                     output=output_fn,
                                                     n_signers=n_signers)

            # save train history
            res_fn = os.path.join(*(output_fn, '_history.pckl'))
            pickle.dump(train_history, open(res_fn, "wb"))

        elif args.mode == 'test':
            model.load_state_dict(
                torch.load(os.path.join(*(output_fn, 'cnn.pth'))))

            # load train history
            res_fn = os.path.join(*(output_fn, '_history.pckl'))
            train_history = pickle.load(open(res_fn, "rb"))
            plot_fn = os.path.join(*(output_fn, 'cnn_history.png'))
            plot_train_history(train_history, plot_fn=plot_fn)

        # Test results
        (_, test_loss, _, _, _, test_acc, test_acc_3,
         test_acc_5) = eval_model(model,
                                  test_loader,
                                  n_signers,
                                  device,
                                  debug=True)
        print('##!!!! Test loss: {:.5f} |'.format(test_loss.item()) +
              ' Test Acc: {:.5f}'.format(test_acc))

        results.append((test_loss.item(), test_acc, test_acc_3, test_acc_5))

        # TSNE maps
        # tsne(model, test_loader, device,
        #      plot_fn=os.path.join(*(output_fn, 'tsne.png')))

    # save results
    print(results)
    res_fn = os.path.join(args.output, 'res.pckl')
    pickle.dump(results, open(res_fn, "wb"))
    results = pickle.load(open(res_fn, "rb"))

    # Compute average and std
    print(results)
    acc_array = np.array([i[1] for i in results])
    acc3_array = np.array([i[2] for i in results])
    acc5_array = np.array([i[3] for i in results])
    print('Average acc: ', np.mean(acc_array))
    print('Average acc3: ', np.mean(acc3_array))
    print('Average acc5: ', np.mean(acc5_array))
    print('Std acc: ', np.std(acc_array))
Example #21
    plt.ylabel(test_x.iloc[:, 1].name)

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xticks(())
    plt.yticks(())

    model_accuracy = trained_model.score(test_x, test_y)
    print("train_accuracy::", model_accuracy)

    return fig


if __name__ == '__main__':

    doctors = read_diabetes()
    train_x, test_x, train_y, test_y = split_data(doctors)

    features = ['glucose', 'mass']
    train_x, test_x = select_features(train_x, test_x, features)

    # Training Logistic regression model
    clf = train_logistic_regression(train_x, train_y)
    #clf = eval(classifier+"()")
    #clf = MLPClassifier()
    #clf.fit(train_x, train_y)

    fig = visualize_plot(clf, test_x, test_y)
    plt.savefig('./static/visualize_plot.png')
    plt.show()
def preproc_data():
    from data import split_data
    split_data('/content/itr/itr/data/hin-eng/hin.txt',
               '/content/itr/itr/data/hin-eng')
import data
import baseline
import numpy as np
x_data, y_data = data.get_keras_data()
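# truncate each document to its first 100 tokens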
x_data = [" ".join(x.split()[:100]) for x in x_data]


X_train, X_val, y_train, y_val = data.split_data(x_data,y_data)
X_train, X_val = data.tf_idf(X_train,X_val)
#get baseline accuracy
models = baseline.Base_models(X_train, y_train, X_val, y_val)
print(models.BinaryRe())
print(powerset())
print(mlknn())
Example #24
    Muraro = data.read_dataset(_path+"../data/Muraro/data.h5")
    Enge = data.read_dataset(_path+"../data/Enge/data.h5")
    Segerstolpe = data.read_dataset(_path+"../data/Segerstolpe/data.h5")
    Xin_2016 = data.read_dataset(_path+"../data/Xin_2016/data.h5")
    Lawlor = data.read_dataset(_path+"../data/Lawlor/data.h5")
    merge = {'Baron_human':Baron_human, 'Muraro':Muraro, 'Enge':Enge, 'Segerstolpe':Segerstolpe, 
    'Xin_2016':Xin_2016, 'Lawlor':Lawlor}
    mergedexpr, mergedl = data.merge_datasets(merge)

    s = mergedexpr.sum(axis=1)
    x = (mergedexpr.T/s).T
    x = x*10000
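    # each row (cell) is now normalized to sum to 10,000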
    #x = x[: ,:1000]
    whole_set = dataset.Single(x, mergedl)
    
    x,y,z,w = data.split_data(x, mergedl) 
    whole_set.print_info()

    # for ...  (incomplete loop in the original snippet)
    
    exit()
    
    x = np.load("./data/train15720data.npy")
    z = np.load("./data/train15720label.npy")
    y = np.load("./data/test15720data.npy")
    w = np.load("./data/test15720label.npy")
    

    train_set = dataset.Single(x, z)
    test_set = dataset.Single(y, w)
    dl = DataLoader(train_set, batch_size=60, shuffle=True)
Example #25
def main(disp_text=True):

    if config.fresh_model:
        config.all_losses = []
        save_model(make_model())
        model = load_model()
        if disp_text: print('created model.', end=' ')
    else:
        model = load_model()
        if not model:
            save_model(make_model())
            model = load_model()
            if disp_text: print('created model.', end=' ')
        else:
            if disp_text: print('loaded model.', end=' ')

    data = load_data()
    data, data_dev = split_data(data)
    # from random import choice
    # from torch import randn
    # data = [[randn(config.in_size) for _ in range(choice(range(config.max_seq_len//2,config.max_seq_len)))] for _ in range(40)]
    # data_dev = []
    # for d in data: print(len(d))
    if config.max_seq_len: data = [d[:config.max_seq_len] for d in data]

    if not config.batch_size or config.batch_size >= len(data):
        config.batch_size = len(data)
        one_batch = True
    elif config.batch_size < 1:
        config.batch_size = int(len(data) * config.batch_size)
        one_batch = False
    else:
        one_batch = False

    if disp_text:
        print(
            f'hm data: {len(data)}, hm dev: {len(data_dev)}, bs: {config.batch_size}, lr: {config.learning_rate}, \ntraining started @ {now()}'
        )

    data_losss, dev_losss = [], []
    if not one_batch:
        if not config.all_losses:
            config.all_losses.append(dev_loss(model, data))
        data_losss.append(config.all_losses[-1])
    if config.dev_ratio:
        dev_losss.append(dev_loss(model, data_dev))

    if data_losss or dev_losss:
        if disp_text:
            print(
                f'initial loss(es): {data_losss[-1] if data_losss else ""} {dev_losss[-1] if dev_losss else ""}'
            )

    for ep in range(config.hm_epochs):

        loss = 0

        for i, batch in enumerate(batchify_data(data)):

            loss += respond_to(model, batch)

            sgd(model) if config.optimizer == 'sgd' else adaptive_sgd(model)

        loss /= len(data)

        if not one_batch: loss = dev_loss(model, data)
        data_losss.append(loss)
        config.all_losses.append(loss)
        if config.dev_ratio: dev_losss.append(dev_loss(model, data_dev))

        if disp_text:
            print(
                f'epoch {ep}, loss {loss}, dev loss {dev_losss[-1] if config.dev_ratio else ""}, completed @ {now()}',
                flush=True)
        if config.ckp_per_ep and ((ep + 1) % config.ckp_per_ep == 0):
            save_model(model, config.model_path + f'_ckp{ep}')

    if one_batch: data_losss.append(dev_loss(model, data))

    if disp_text:
        print(
            f'training ended @ {now()} \nfinal losses: {data_losss[-1]}, {dev_losss[-1] if config.dev_ratio else ""}',
            flush=True)
    show(plot(data_losss))
    if config.dev_ratio:
        show(plot(dev_losss))
    if not config.fresh_model: show(plot(config.all_losses))

    return model, [data_losss, dev_losss]
	plt.show()


################################################################################
################################################################################
################################################################################


################################################################################
## MAIN ########################################################################
################################################################################


if __name__ == '__main__':

	X,Y = load_data_from_csv('../data/binary.csv', -1, float)
	X,Y = bootstrap_data(X, Y, 25)
	X = X[:,2:]
	Xtr,Xte,Ytr,Yte = split_data(X, Y, .8)
	knn = KNNClassify(Xtr, Ytr)

	print(cols((X,knn.predict(X))))
	
	plot_classify_2D(knn, X, Y)


################################################################################
################################################################################
################################################################################
Example #27
def main():

    if config.attention_only:
        from model2 import make_model_higher, respond_to
    else: from model import make_model_higher, respond_to

    if config.fresh_model:
        save_model(make_model_higher())
        model = load_model()
        print('created model.',end=' ')
    else:
        model = load_model()
        if not model:
            save_model(make_model_higher())
            model = load_model()
            print('created model.',end=' ')
        else:
            print('loaded model.',end=' ')
    print(f'info: {config.creation_info}')

    data = load_data(frames=not config.attention_only)
    data, data_dev = split_data(data)

    if not config.batch_size or config.batch_size >= len(data):
        config.batch_size = len(data)
        one_batch = True
    elif config.batch_size < 1:
        config.batch_size = int(len(data)*config.batch_size)
        one_batch = False
    else: one_batch = False

    print(f'hm data: {len(data)}, hm dev: {len(data_dev)}, bs: {config.batch_size}, lr: {config.learning_rate}, \ntraining started @ {now()}')

    data_losss, dev_losss = [], []
    if config.batch_size != len(data):
        data_losss.append(dev_loss(model, data))
    if config.dev_ratio:
        dev_losss.append(dev_loss(model, data_dev))

    if data_losss or dev_losss:
        print(f'initial loss(es): {data_losss[-1] if data_losss else ""} {dev_losss[-1] if dev_losss else ""}')

    for ep in range(config.hm_epochs):

        loss = 0

        for i, batch in enumerate(batchify_data(data, do_shuffle=not one_batch)):

            # print(f'\tbatch {i}, started @ {now()}', flush=True)

            batch_size = sum(len(sequence) for sequence in batch)

            loss += respond_to(model, batch)
            sgd(model, batch_size=batch_size) if config.optimizer == 'sgd' else \
                adaptive_sgd(model, batch_size=batch_size)

        # loss /= sum(len(sequence) for sequence in data)
        if not one_batch: loss = dev_loss(model, data)
        data_losss.append(loss)
        if config.dev_ratio:
            dev_losss.append(dev_loss(model, data_dev))

        print(f'epoch {ep}, loss {loss}, dev loss {dev_losss[-1] if config.dev_ratio else ""}, completed @ {now()}', flush=True)
        if config.ckp_per_ep and ((ep+1)%config.ckp_per_ep==0):
                save_model(model,config.model_path+f'_ckp{ep}')

    # data_losss.append(dev_loss(model, data))
    # if config.dev_ratio:
    #     dev_losss.append(dev_loss(model, data_dev))

    print(f'training ended @ {now()} \nfinal losses: {data_losss[-1]}, {dev_losss[-1] if config.dev_ratio else ""}', flush=True)
    show(plot(data_losss))
    if config.dev_ratio:
        show(plot(dev_losss))

    # if input(f'Save model as {config.model_path}? (y/n): ').lower() == 'y':
    #     save_model(load_model(), config.model_path + '_prev')
    #     save_model(model)

    return model, [data_losss, dev_losss]
Example #28
File: backup.py Project: kimiyoung/ssl
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus', help = 'the name of the input corpus file')
    parser.add_argument('--seeds', help = 'percentage of seeds', type = float, default = 0.01)
    parser.add_argument('--epochs', help = 'number of epochs', type = int, default = 40)
    parser.add_argument('--learning_rate', help = 'learning rate', type = float, default = 1.0)
    parser.add_argument('--param_reg', help = 'the regularization factor of the parameters', type = float, default = 0.001)
    parser.add_argument('--ent_reg', help = 'the factor of entropy regularization', type = float, default = 0.0)
    args = parser.parse_args()

    lasagne.random.set_rng(np.random)
    np.random.seed(0)

    features, labels, label_set = data.read_content_citeseer(args.corpus)
    split = data.split_data(labels, args.seeds)
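    # split[0]: seed (labeled) nodes used for training; split[1]: held-out nodes for testing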
    maxf = get_maxf(features)

    trainx, trainy = constuct_dataset(features, labels, label_set, split[0], maxf)
    testx, testy = constuct_dataset(features, labels, label_set, split[1], maxf)
    allx, ally = constuct_dataset(features, labels, label_set, features.keys(), maxf)

    input_var = sparse.csr_matrix(name = 'x', dtype = 'float32')
    un_var = sparse.csr_matrix(name = 'ux', dtype = 'float32')
    target_var = T.imatrix('targets')
    ent_target = T.ivector('ent_targets')
    network, l_entropy = build_model(input_var, maxf + 1, trainy.shape[1], args.ent_reg > 0, un_var)

    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean() + regularize_layer_params(network, l2) * args.param_reg
Example #29
File: model.py Project: icapucap/bert
    def prepare_data(self):
        from data import split_data
        split_data('/content/itr/hin.txt', '/content/itr/')
Example #30
def main(FLAGS):

    # set seed
    np.random.seed(FLAGS.seed)
    tf.set_random_seed(FLAGS.seed)

    with tf.device('/cpu:0'), tf.name_scope('input'):

        # load data
        data, meta = load_data(FLAGS.dataset_root,
                               FLAGS.dataset,
                               is_training=True)
        train_data, val_data = split_data(data, FLAGS.validate_rate)
        batch_size = FLAGS.n_class_per_iter * FLAGS.n_img_per_class
        img_shape = train_data[0].shape[1:]

        # build DataSampler
        train_data_sampler = DataSampler(train_data, meta['n_class'],
                                         FLAGS.n_class_per_iter,
                                         FLAGS.n_img_per_class)

        val_data_sampler = DataSampler(val_data, meta['n_class'],
                                       FLAGS.n_class_per_iter,
                                       FLAGS.n_img_per_class)

        # build tf_dataset for training
        train_dataset = (tf.data.Dataset.from_generator(
            lambda: train_data_sampler, (tf.float32, tf.int32),
            ([batch_size, *img_shape
              ], [batch_size])).take(FLAGS.n_iter_per_epoch).flat_map(
                  lambda x, y: tf.data.Dataset.from_tensor_slices((x, y))).map(
                      preprocess_for_train, 8).batch(batch_size).prefetch(1))

        # build tf_dataset for val
        val_dataset = (tf.data.Dataset.from_generator(
            lambda: val_data_sampler, (tf.float32, tf.int32),
            ([batch_size, *img_shape], [batch_size])).take(100).flat_map(
                lambda x, y: tf.data.Dataset.from_tensor_slices((x, y))).map(
                    preprocess_for_eval, 8).batch(batch_size).prefetch(1))

        # clean up
        del data, train_data, val_data

        # construct data iterator
        data_iterator = tf.data.Iterator.from_structure(
            train_dataset.output_types, train_dataset.output_shapes)

        # construct iterator initializer for training and validation
        train_data_init = data_iterator.make_initializer(train_dataset)
        val_data_init = data_iterator.make_initializer(val_dataset)

        # get data from data iterator
        images, labels = data_iterator.get_next()
        tf.summary.image('images', images)

    # define useful scalars
    learning_rate = tf.placeholder(tf.float32, shape=(), name='learning_rate')
    tf.summary.scalar('lr', learning_rate)
    is_training = tf.placeholder(tf.bool, [], name='is_training')
    global_step = tf.train.create_global_step()

    # define optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate)

    # build the net
    model = importlib.import_module('models.{}'.format(FLAGS.model))
    net = model.Net(n_feats=FLAGS.n_feats, weight_decay=FLAGS.weight_decay)

    if net.data_format == 'channels_first' or net.data_format == 'NCHW':
        images = tf.transpose(images, [0, 3, 1, 2])

    # get features
    features = net(images, is_training)
    tf.summary.histogram('features', features)

    # summary variable defined in net
    for w in net.global_variables:
        tf.summary.histogram(w.name, w)

    with tf.name_scope('losses'):
        # compute the loss; if the features are l2-normalized, then
        # 2 * cosine_distance equals the squared l2 distance.
        distance = 2 * custom_ops.cosine_distance(features)
        # hard mining
        arch_idx, pos_idx, neg_idx = custom_ops.semi_hard_mining(
            distance, FLAGS.n_class_per_iter, FLAGS.n_img_per_class,
            FLAGS.threshold)

        # triplet loss
        N_pair_lefted = tf.shape(arch_idx)[0]

        def true_fn():
            pos_distance = tf.gather_nd(distance,
                                        tf.stack([arch_idx, pos_idx], 1))
            neg_distance = tf.gather_nd(distance,
                                        tf.stack([arch_idx, neg_idx], 1))
            return custom_ops.triplet_distance(pos_distance, neg_distance,
                                               FLAGS.threshold)

        loss = tf.cond(N_pair_lefted > 0, true_fn, lambda: 0.)
        pair_rate = N_pair_lefted / (FLAGS.n_class_per_iter *
                                     FLAGS.n_img_per_class**2)

        # compute l2 regularization
        l2_reg = tf.losses.get_regularization_loss()

    with tf.name_scope('metrics') as scope:

        mean_loss, mean_loss_update_op = tf.metrics.mean(loss,
                                                         name='mean_loss')

        mean_pair_rate, mean_pair_rate_update_op = tf.metrics.mean(
            pair_rate, name='mean_pair_rate')

        reset_metrics = tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.LOCAL_VARIABLES, scope))
        metrics_update_op = tf.group(mean_loss_update_op,
                                     mean_pair_rate_update_op)

        # collect the metric summaries separately, because they need to
        # be written after the metrics update
        metric_summary = [
            tf.summary.scalar('loss', mean_loss, collections=[]),
            tf.summary.scalar('pair_rate', mean_pair_rate, collections=[])
        ]

    # compute grad
    grads_and_vars = optimizer.compute_gradients(loss + l2_reg)

    # summary grads
    for g, v in grads_and_vars:
        tf.summary.histogram(v.name + '/grad', g)

    # run train_op and update_op together
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    train_op = tf.group(train_op, *update_ops)

    # build summary
    jpg_img_str = tf.placeholder(tf.string, shape=[], name='jpg_img_str')
    emb_summary_str = tf.summary.image(
        'emb',
        tf.expand_dims(tf.image.decode_image(jpg_img_str, 3), 0),
        collections=[])
    train_summary_str = tf.summary.merge_all()
    metric_summary_str = tf.summary.merge(metric_summary)

    # init op
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # prepare for the logdir
    if not tf.gfile.Exists(FLAGS.logdir):
        tf.gfile.MakeDirs(FLAGS.logdir)

    # saver
    saver = tf.train.Saver(max_to_keep=FLAGS.n_epoch)

    # summary writer
    train_writer = tf.summary.FileWriter(os.path.join(FLAGS.logdir, 'train'),
                                         tf.get_default_graph())
    val_writer = tf.summary.FileWriter(os.path.join(FLAGS.logdir, 'val'),
                                       tf.get_default_graph())

    # session
    config = tf.ConfigProto(allow_soft_placement=True,
                            log_device_placement=False,
                            intra_op_parallelism_threads=8,
                            inter_op_parallelism_threads=0)
    config.gpu_options.allow_growth = True

    sess = tf.Session(config=config)

    # do initialization
    sess.run(init_op)

    # restore
    if FLAGS.restore:
        saver.restore(sess, FLAGS.restore)

    lr_boundaries = list(map(int, FLAGS.boundaries.split(',')))
    lr_values = list(map(float, FLAGS.values.split(',')))
    lr_manager = LRManager(lr_boundaries, lr_values)
    time_meter = TimeMeter()

    # start to train
    for e in range(FLAGS.n_epoch):
        print('-' * 40)
        print('Epoch: {:d}'.format(e))

        # training loop
        try:
            i = 0
            sess.run([train_data_init, reset_metrics])
            while True:

                lr = lr_manager.get(e)
                fetch = [train_summary_str] if i % FLAGS.log_every == 0 else []

                time_meter.start()
                result = sess.run([train_op, metrics_update_op] + fetch, {
                    learning_rate: lr,
                    is_training: True
                })
                time_meter.stop()

                if i % FLAGS.log_every == 0:
                    # fetch summary str
                    t_summary = result[-1]
                    t_metric_summary = sess.run(metric_summary_str)

                    t_loss, t_pr = sess.run([mean_loss, mean_pair_rate])
                    sess.run(reset_metrics)

                    spd = batch_size / time_meter.get_and_reset()

                    print(
                        'Iter: {:d}, LR: {:g}, Loss: {:.4f}, PR: {:.2f}, Spd: {:.2f} i/s'
                        .format(i, lr, t_loss, t_pr, spd))

                    train_writer.add_summary(t_summary,
                                             global_step=sess.run(global_step))
                    train_writer.add_summary(t_metric_summary,
                                             global_step=sess.run(global_step))

                i += 1
        except tf.errors.OutOfRangeError:
            pass

        # save checkpoint
        saver.save(sess,
                   '{}/{}'.format(FLAGS.logdir, FLAGS.model),
                   global_step=sess.run(global_step),
                   write_meta_graph=False)

        # val loop
        try:
            sess.run([val_data_init, reset_metrics])
            v_flist, v_llist = [], []
            v_iter = 0
            while True:
                v_feats, v_labels, _ = sess.run(
                    [features, labels, metrics_update_op],
                    {is_training: False})
                if v_iter < FLAGS.n_iter_for_emb:
                    v_flist.append(v_feats)
                    v_llist.append(v_labels)
                v_iter += 1
        except tf.errors.OutOfRangeError:
            pass

        v_loss, v_pr = sess.run([mean_loss, mean_pair_rate])
        print('[VAL]Loss: {:.4f}, PR: {:.2f}'.format(v_loss, v_pr))

        v_jpg_str = feat2emb(
            np.concatenate(v_flist, axis=0), np.concatenate(v_llist, axis=0),
            TSNE_transform if int(FLAGS.n_feats) > 2 else None)

        val_writer.add_summary(sess.run(metric_summary_str),
                               global_step=sess.run(global_step))
        val_writer.add_summary(sess.run(emb_summary_str,
                                        {jpg_img_str: v_jpg_str}),
                               global_step=sess.run(global_step))

    print('-' * 40)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--job-dir",
                        default="",
                        help="Job directory for output plots")
    parser.add_argument("--data-path",
                        default="diabetes.csv",
                        help="Data directory for PIMA")
    parser.add_argument("--n-folds",
                        type=int,
                        default=10,
                        help="Number of Folds for K-Fold Cross Validation")
    parser.add_argument("--n-trees",
                        type=int,
                        default=100,
                        help="Number of Trees")
    parser.add_argument("--n-neighbors",
                        type=int,
                        default=10,
                        help="Number of neighbors for critical set")
    parser.add_argument("--max-depth",
                        type=int,
                        default=10,
                        help="Depth of search for random forest")
    parser.add_argument("--min-size",
                        type=int,
                        default=1,
                        help="Minimum size for random forest")
    parser.add_argument("--n-features",
                        type=int,
                        default=2,
                        help="Number of features for random forest")
    parser.add_argument("--sample-size",
                        type=float,
                        default=1.0,
                        help="Sample size for random forest")
    parser.add_argument("--p-critical",
                        type=float,
                        default=0.5,
                        help="Percentage of forest size using critical set")
    parser.add_argument("--seed", type=int, default=0, help="Random seed")
    args = parser.parse_args()

    # Set seeds for reproducibility
    np.random.seed(seed=args.seed)
    seed(args.seed)

    # Prep data for model training
    data = load_data(args.data_path)
    raw_data_train, raw_data_test = split_data(data)
    data_train, scaler, medians = pima_training_data_transformation(
        raw_data_train)

    # Evaluate algorithm on training data using K-Fold cross validation
    _ = evaluate_algorithm(data_train, biased_random_forest, args.n_folds,
                           args.n_neighbors, args.p_critical, args.max_depth,
                           args.min_size, args.sample_size, args.n_trees,
                           args.n_features)

    # Train tree model on full training dataset
    trees = train_biased_random_forest(data_train, args.n_neighbors,
                                       args.max_depth, args.min_size,
                                       args.sample_size, args.n_trees,
                                       args.n_features, args.p_critical)

    # Evaluate model on test data
    # Prepare test data
    data_test = pima_test_data_transformation(raw_data_test, scaler,
                                              medians).to_numpy()

    test_set = list()
    for row in data_test:
        row_copy = list(row)
        test_set.append(row_copy)
        row_copy[-1] = None

    # Run inference on test set
    test_predictions, test_probs = test_random_forest(trees, test_set)
    test_actual = data_test[:, -1]

    # Evaluate test data performance
    print('Test Data Performance')
    fp_rates, tp_rates, recalls, precisions = display_metrics(
        test_actual, test_predictions, test_probs)

    # Plot final
    outname = "Test Data"
    save_prc_curve(recalls, precisions, name=outname)
    save_roc_curve(fp_rates, tp_rates, name=outname)

    # LIME
    df_features = data_train.iloc[:, :-1]
    feature_cols = df_features.columns
    data_features = df_features.values
    data_labels = data_train.iloc[:, -1].values

    explainer = lime.lime_tabular.LimeTabularExplainer(
        data_features,
        mode='classification',
        training_labels=data_labels,
        feature_names=feature_cols)

    model = BiasedRandomForestModel(trees)

    # ipdb is useful here for further exploration in LIME. This can also be moved to a follow-up notebook.
    # ipdb.set_trace()
    idx = 0
    exp = explainer.explain_instance(data_features[idx],
                                     model.get_probs,
                                     num_features=7)
    exp.save_to_file('lime_rf_example0.html')
Example #32
def main():
    load_dotenv('.env.general')
    config = load_config('config.yml')
    Path(config.logging.handlers.debug_file_handler.filename).parent.mkdir(
        parents=True, exist_ok=True)
    Path(config.logging.handlers.info_file_handler.filename).parent.mkdir(
        parents=True, exist_ok=True)
    logging.config.dictConfig(config.logging)

    _logger.info("Loading the data")
    x, y = load_training_data()
    x_train, x_test, y_train, y_test = split_data(x, y)

    with tempfile.TemporaryDirectory() as td:
        temp_dir = Path(td)
        mlflow.set_experiment(config.experiment.name)

        params = {}
        tags = {}
        metrics = {}
        artifacts = {}

        with mlflow.start_run():
            _logger.info("Fitting the preprocessor")
            preprocessor = get_preprocessor()
            preprocessor.fit(x_train, y_train)

            _logger.info("Preprocessing the training data")
            x_train_prep = preprocessor.transform(x_train)
            x_test_prep = preprocessor.transform(x_test)

            estimator_params, search_space = get_params()

            if search_space is None:
                estimator, estimator_tags, estimator_metrics, estimator_artifacts = train_run(
                    estimator_params=estimator_params,
                    x_train_prep=x_train_prep,
                    y_train=y_train,
                    x_test_prep=x_test_prep,
                    y_test=y_test,
                    temp_dir=temp_dir)

                model = make_pipeline(preprocessor, estimator)
                params.update(
                    {f"estimator_{k}": v
                     for k, v in estimator_params.items()})
                tags.update(
                    {f"estimator_{k}": v
                     for k, v in estimator_tags.items()})
                metrics.update(estimator_metrics)
                artifacts.update(estimator_artifacts)

            else:

                def hyperopt_objective(search_params):
                    # This function is called for each set of hyper-parameters being tested by HyperOpt.
                    run_name = str(len(trials) - 1)
                    ho_params = {}
                    ho_tags = {}
                    ho_metrics = {}
                    ho_artifacts = {}

                    search_params = flatten_params(search_params)
                    search_params = prep_params(search_params)
                    ho_estimator_params = estimator_params.copy()
                    ho_estimator_params.update(search_params)

                    with mlflow.start_run(nested=True, run_name=run_name):
                        ho_estimator, ho_estimator_tags, ho_estimator_metrics, ho_estimator_artifacts = train_run(
                            estimator_params=ho_estimator_params,
                            x_train_prep=x_train_prep,
                            y_train=y_train,
                            x_test_prep=x_test_prep,
                            y_test=y_test,
                            temp_dir=temp_dir / run_name)

                        ho_model = make_pipeline(preprocessor, ho_estimator)
                        ho_params.update({
                            f"estimator_{k}": v
                            for k, v in ho_estimator_params.items()
                        })
                        ho_tags.update({
                            f"estimator_{k}": v
                            for k, v in ho_estimator_tags.items()
                        })
                        ho_metrics.update(ho_estimator_metrics)
                        ho_artifacts.update(ho_estimator_artifacts)

                        ho_tags['hyperopt'] = True

                        log_sk_model(ho_model,
                                     registered_model_name=None,
                                     params=ho_params,
                                     tags=ho_tags,
                                     metrics=ho_metrics,
                                     artifacts=ho_artifacts)

                    loss = 1 - ho_metrics[config.evaluation.primary_metric]

                    return {
                        'loss': loss,
                        'status': STATUS_OK,
                        'model': ho_model,
                        'params': ho_params,
                        'tags': ho_tags,
                        'metrics': ho_metrics,
                        'artifacts': ho_artifacts
                    }

                trials = Trials()
                fmin(fn=hyperopt_objective,
                     space=search_space,
                     algo=tpe.suggest,
                     trials=trials,
                     max_evals=config.training.max_evals,
                     rstate=np.random.RandomState(1),
                     show_progressbar=False)

                model = trials.best_trial['result']['model']
                params = trials.best_trial['result']['params']
                tags = trials.best_trial['result']['tags']
                metrics = trials.best_trial['result']['metrics']
                artifacts = trials.best_trial['result']['artifacts']

            if config.evaluation.shap_analysis:
                _logger.info("Starting shap analysis")
                shap_tags, shap_artifacts = shap_analyse(
                    model=model, x=x_train, temp_dir=Path(temp_dir) / 'shap')
                tags.update(shap_tags)
                artifacts.update(shap_artifacts)
            else:
                _logger.info("Shap analysis skipped")

            log_sk_model(model,
                         registered_model_name=None,
                         params=params,
                         tags=tags,
                         metrics=metrics,
                         artifacts=artifacts)

    return (x_train, y_train, x_test,
            y_test), model, params, tags, metrics, artifacts
Example #33
    args = parser.parse_args()

    # for TPU
    os.environ["WANDB_API_KEY"] = "0"  # to silence warning
    device = xm.xla_device()
    print('Found TPU at: {}'.format(device))

    # For reproducibility
    np.random.seed(args.seed)

    # Open train and test csv files using pandas library
    train_df = pd.read_csv(args.train_file)
    test_df = pd.read_csv(args.test_file)

    # Split training dataset into two parts - the data we will train the model with and a validation set.
    train_df, validation_df = data.split_data(train_df)

    # Check the number of rows and columns in the subsets after split
    print("Train data shape after split: {} \n".format(train_df.shape))
    print("Validation data shape after split: {} \n".format(
        validation_df.shape))

    # Augment training data
    train_df = data.augment_data(train_df,
                                 test_df,
                                 use_xnli=args.load_xnli,
                                 use_mnli=args.load_mnli,
                                 use_bt=args.back_translate,
                                 bt_filepath=args.bt_file)

    # Define the tokenizer to preprocess the input data