Example #1
def process():
    #print("use:",layerfile()," info:",open(layerfile().replace(".pickle",".info"),'r').read().rstrip());
    N, X, Target = dataset.get()
    Em, Dm = diff_minimax(layers())
    Ep, Dp = perf()
    print(layerfile(),
          len(Ep) + len(Dp), ": minimax:", percent(Ep, Dp), "%", "Jall=",
          layer.J(X, Target, layers()), " ME:", percent(Em, Dm), "%")
Example #2
def eval(net):
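    # evaluate the network on the validation split and report top-1 / top-5 accuracy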
    data_root="~/dataset/"
    val_loader = get(
                batch_size=256, 
                data_root=data_root, 
                train=False, 
                val=True, 
                shuffle=True)
    acc1, acc5 = eval_model(net,val_loader)
    print("acc1:{}, acc5:{}".format(acc1,acc5))
Example #3
def get_latent(args):
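    # load the trained model and encode each input file into its latent mean vector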
    model = torch.load(args.model[0])
    model.eval()

    latents = []
    file_names = []
    for data_file in args.data_files:
        file_names.append(str(os.path.basename(data_file)))
        seq = torch.FloatTensor([dataset.get(data_file)])
        seq_len = [len(s) for s in seq]
        with torch.no_grad():
            mu, ln_var = model.encode(seq, seq_len)
            latents.append(mu[0].tolist())
    return latents, file_names
Example #4
    def get_datasets(self, *dataset_names, use_local=False):
        datasets = {name: dataset_desc.get(name) for name in dataset_names}
        if use_local:
            logger.info('use local dataset')
        else:
            # we use remote dataset by default
            logger.info('use remote dataset')
            datasets = {
                name: create_remote_dataset(
                    ds.servable_name,
                    ds.nr_minibatch_in_epoch,
                )
                for name, ds in datasets.items()
            }
        return datasets
Example #5
def main():
    # Read world map
    world_map = imread('world_map2.jpg', as_gray=True)

    # Set waves dataset
    ds = get(lat, lon, level)
    ds = resize_with_extender(ds, world_map.shape, extender=ds.min())

    # Form color map
    plt.axis('off')
    plt.title('Color map', fontsize=20, pad=30)
    plt.imshow(world_map, cmap='gray')
    plt.imshow(ds, alpha=0.7, cmap='gist_heat')
    plt.colorbar(norm=Normalize(vmin=ds.min(), vmax=ds.max()), orientation='horizontal')
    plt.savefig(output_path, bbox_inches='tight', pad_inches=0.2)

    print(f"Congratulations! The color map was created. Check out {output_path}.")
Example #6
def plot(preprocess):
    labels, data = dataset.get(subset="train",
                               preprocess=preprocess,
                               categories=categories,
                               verbose=True)
    labels = np.array(labels)

    print "Getting TF IDF weights"

    vec = TfidfVectorizer(max_df=0.5,
                          max_features=10000,
                          min_df=2,
                          stop_words='english',
                          use_idf=True,
                          ngram_range=(1, 1))
    X = vec.fit_transform(data)

    print(repr(X))

    print "Reducing dimensions to 50"

    X_reduced = TruncatedSVD(n_components=50, random_state=0).fit_transform(X)

    X_embedded = PCA(n_components=2).fit_transform(X_reduced)

    names = np.unique(labels)
    print(names)
    num_clusters = len(names)
    fig = plt.figure(frameon=False)

    colors = iter(cm.Spectral(np.linspace(0, 1, num_clusters)))

    for name in names:
        X = X_embedded[labels == name]
        plt.scatter(X[:, 0], X[:, 1], marker='x', label=name)

    plt.title("PCA (Preprocessed)" if preprocess else "PCA")
    plt.xticks([])
    plt.yticks([])
    plt.legend()
Example #7
def learn(nhidden):
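    # build and train a network with a single hidden layer of size nhidden,
    # checkpointing every 50 iterations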
    X, Target = dataset.get(50)
    N = []
    # input layer
    N.append(X.shape[0])

    # hidden layer(s)
    N.append(nhidden)

    # output layer
    N.append(Target.shape[0])

    T = X.shape[1]
    # init
    layers = []
    for i in range(1, len(N)):
        layers.append(
            layer.Layer(N[i - 1], N[i], layer.activation('sigma'), 0.001))

    dirname = "{}/{}".format("results", nhidden)
    if not os.path.exists(dirname):
        os.makedirs(dirname)

    scores = []
    iter = 0
    while not scores or (decreasing(scores) and iter < 200):
        scores.append(layer.J(X, Target, layers))
        if iter % 50 == 0:
            pfile = "{}/{}.pickle".format(dirname, iter)
            pickle.dump(layers, open(pfile, 'wb'))
            ifile = "{}/{}.info".format(dirname, iter)
            open(ifile + ".info", 'w').write("J=" + str(scores[-1]) + "\n")
            print("J=", scores[-1], pfile, ifile)
        layer.learn(X, Target, layers)
        iter = iter + 1
    print("J=", scores[-1])
    return scores[-1]
Example #8
    def __init__(self, params):
        """
        Initialize trainer.
        """
        self.params = params

        # epoch / iteration size
        assert isinstance(config.epoch_size, int)
        assert config.epoch_size >= 1
        self.epoch_size = config.epoch_size

        # network and criterion
        net, criterion = model.get()
        self.net = net
        self.criterion = criterion

        # data iterators
        self.iterators = {}
        train_iter, valid_iter, SRC_TEXT, TGT_TEXT = dataset.get(params)
        self.iterators["train"] = train_iter
        self.iterators["valid"] = valid_iter
        self.num_train = len(train_iter)
        self.SRC_TEXT = SRC_TEXT
        self.TGT_TEXT = TGT_TEXT

        # Multi-GPU
        assert config.amp >= 1 or not config.fp16
        if config.multi_gpu and not config.fp16:
            logger.info("Using nn.parallel.DistributedDataParallel ...")
            self.net = nn.parallel.DistributedDataParallel(
                    self.net, device_ids=[params.local_rank], output_device=params.local_rank
                    )

        # set optimizers
        self.opt = optimizer.get(self.net)

        # Float16 / distributed
        if config.fp16:
            self.init_amp()
            if config.multi_gpu:
                logger.info("Using apex.parallel.DistributedDataParallel ...")
                self.net = apex.parallel.DistributedDataParallel(self.net, delay_allreduce=True)

        # validation metrics
        self.best_metrics = {}
        for k in config.valid_metrics.keys():
            factor = config.valid_metrics[k]
            self.best_metrics[k] = [config.init_metric * factor, factor]

        # early stopping metrics
        self.early_stopping_metrics = {}
        for k in self.best_metrics:
            self.early_stopping_metrics[k] = self.best_metrics[k]

        self.decrease_counts = 0
        self.decrease_counts_max = config.decrease_counts_max
        self.stopping_criterion = config.stopping_criterion
        if config.multi_gpu:
            self.should_terminate = torch.tensor(0).byte()
            self.should_terminate = self.should_terminate.cuda()
        else:
            self.should_terminate = False
        assert ( self.stopping_criterion in self.best_metrics ) or ( self.stopping_criterion is None )

        # training statistics
        self.epoch = 0
        self.n_iter = 0
        self.n_total_iter = 0
        self.n_sentences = 0
        self.stats = OrderedDict(
            [('processed_s', 0), ('processed_w', 0)] +
            [('MT-%s-%s-loss' % (config.SRC_LAN, config.TGT_LAN), [])] +
            [('MT-%s-%s-ppl' % (config.SRC_LAN, config.TGT_LAN), [])]
        )
        self.last_time = time.time()

        # reload potential checkpoints
        self.reload_checkpoint(network_only=config.reload_network_only)
Example #9
# Author: Marius Maaland
#         Jonas Palm

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import numpy as np

import dataset

preprocess = True  #("emails", "headers")
y_train, X_train_data = dataset.get(subset="train",
                                    preprocess=preprocess,
                                    verbose=True)
y_test, X_test_data = dataset.get(subset="test", preprocess=preprocess)

VEC_MAX_DF = 1.0
VEC_MIN_DF = 1
VEC_STOP_WORDS = 'english'


def print_dominant_words(vec, n):
    X_train = vec.fit_transform(X_train_data)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)

    labels = np.unique(y_train)
    coefs = clf.coef_.argsort()[:, ::-1]
    terms = vec.get_feature_names()
    for i in range(len(labels)):
        # plausible completion (the original snippet is cut off here):
        # print the n highest-weighted terms for each class
        top_terms = [terms[j] for j in coefs[i, :n]]
        print("{}: {}".format(labels[i], ", ".join(top_terms)))
Example #10
from time import time

import numpy as np
import dataset

#print "Loading features"


def make_string_label_dict(unique_string_labels):
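    # map each unique string label to a consecutive integer index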
    label_dict = dict()
    for i in range(unique_string_labels.size):
        label_dict[unique_string_labels[i]] = i
    return label_dict


# trunc_label specifies whether to truncate the label
# to the least common denominator for each usenet group
labels, data = dataset.get(truncate_label=False)
datapoints = len(data)
#print "Number of datapoints: ", datapoints
unique_labels, _ = np.unique(labels, return_inverse=True)
#print "- Labels:", unique_labels
# Create a dictionary with enumerated label names
label_dict = make_string_label_dict(unique_labels)
#print label_dict
# true_k holds the true number of clusters
true_k = np.unique(labels).shape[0]


# Calculate and print metrics to assess k-means
def assess_birch(estimator, num_clusters, data, labels):
    t0 = time()
    estimator.fit(data)
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--task', required=True)
    parser.add_argument('--model', required=True)
    parser.add_argument('--preprocess-train-frac', type=float)
    parser.add_argument('--preprocess-test-frac', type=float)
    parser.add_argument('--model-train-frac', type=float)
    parser.add_argument('--model-valid-frac', type=float)
    parser.add_argument('--result-dir', default='./result')
    parser.add_argument('--n-trials', type=int)
    parser.add_argument('--n-jobs', type=int, default=1)
    parser.add_argument('--seed', type=int, default=1)
    args = parser.parse_args()

    logger = autogbt.logging.get_logger()
    logger.info(args)

    model = args.model
    task = args.task
    n_trials = args.n_trials
    seed = args.seed
    n_jobs = args.n_jobs
    model_train_frac = args.model_train_frac
    model_valid_frac = args.model_valid_frac
    with open('../.git/refs/heads/master') as fp:
        commit = next(fp).strip()

    res_dir = Path(args.result_dir) / commit
    res_dir.mkdir(parents=True, exist_ok=True)

    name = '-'.join(map(str, [model, task, n_trials, model_train_frac, seed]))
    result_path = res_dir / ('%s.csv' % (name))

    if result_path.exists():
        return

    res = []
    logger.info('load dataset %s' % task)
    logger.info('model %s' % model)
    cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    train_X, train_y, test_X = dataset.get(task)
    start = time.time()
    prep = autogbt.Preprocessor(
        train_frac=args.preprocess_train_frac,
        test_frac=args.preprocess_test_frac,
        random_state=seed,
    )
    train_X, valid_X, train_y = prep.transform(train_X, test_X, train_y)
    if model == 'auto':
        sampler = TrainDataSampler(
            train_frac=model_train_frac,
            valid_frac=model_valid_frac,
            random_state=seed,
        )
        est = autogbt.AutoGBTClassifier(
            n_trials=n_trials,
            sampler=sampler,
            n_jobs=n_jobs,
            cv=cv,
            random_state=seed,
        )
        est.fit(train_X, train_y)
        score = est.best_score
    else:
        n_trials = 1
        model_train_frac = 1.0
        model_valid_frac = 1.0
        if model == 'xgb':
            import xgboost as xgb
            est = xgb.XGBClassifier(n_jobs=n_jobs, random_state=seed)
            pred = cross_val_predict(est,
                                     train_X,
                                     train_y,
                                     cv=cv,
                                     method='predict_proba')[:, 1]
            score = roc_auc_score(train_y, pred)
        if model == 'lgb':
            import lightgbm as lgb
            est = lgb.LGBMClassifier(n_jobs=n_jobs, random_state=seed)
            pred = cross_val_predict(est,
                                     train_X,
                                     train_y,
                                     cv=cv,
                                     method='predict_proba')[:, 1]
            score = roc_auc_score(train_y, pred)

    end = time.time()
    duration = end - start

    logger.info('CV AUC: %.6f' % score)
    res = pd.DataFrame([[
        task,
        model,
        n_trials,
        args.preprocess_train_frac,
        args.preprocess_test_frac,
        model_train_frac,
        model_valid_frac,
        duration,
        score,
        commit,
    ]],
                       columns=[
                           'dataset',
                           'model',
                           'n_trials',
                           'preprocess_train_frac',
                           'preprocess_test_frac',
                           'model_train_frac',
                           'model_valid_frac',
                           'duration[s]',
                           'CV AUC',
                           'commit',
                       ])

    res.to_csv(result_path, index=False)
Example #12
def test_incremental():
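    # sanity-check that incremental (cached) decoding produces the same logits
    # as full-sequence decoding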
    from common import config
    import model
    from utils import get_batch
    net, _ = model.get()
    net.eval()
    
    ckpt = torch.load("checkpoints/checkpoint_best_ppl.pth", map_location='cpu')

    # reload model parameters
    s_dict = {}
    for k in ckpt["net"]:
        new_k = k[7:]
        s_dict[new_k] = ckpt["net"][k]

    net.load_state_dict(s_dict)
    
    import dataset
    train_iter, _, SRC_TEXT, TGT_TEXT = dataset.get()
    #data_iter = iter(train_iter.get_iterator(True, True))
    #raw_batch = next(data_iter)
    src = np.arange(4, 4+2000).reshape(80, 25)
    tgt = np.arange(4, 4+2400).reshape(80, 30)
    raw_batch = dataset.Batch(
            torch.from_numpy(src).long(),
            torch.from_numpy(tgt).long()
            )

    batch = get_batch(
            raw_batch.src, raw_batch.tgt,
            SRC_TEXT.vocab, TGT_TEXT.vocab
            )
    for k, v in batch.items():
        try:
            print(k, v.size())
        except AttributeError:
            pass

    with torch.no_grad():
        enc_out = net.encode(src=batch['src'], src_mask=batch['src_mask'])
        # No incremental
        logits1 = net.decode(enc_out, batch['src_mask'], batch['tgt'], batch['tgt_mask'])
        logits1 = net.generator(logits1, log_prob=True)

        # Incremental
        print("Incremental encoding finished!")
        tlen = batch['tgt'].size(1)
        cache = {'cur_len':0}
        logits2 = []
        for i in range(tlen):
            x = batch['tgt'][:, i].unsqueeze(-1)

            logit = net.decode(
                    enc_out, batch['src_mask'], x,
                    batch['tgt_mask'][:, i, :(i+1)].unsqueeze(-2), cache
                    )

            logit = net.generator(logit, log_prob=True)
            
            if i >= 0:
                ref = logits1[:, i, :]
                sys = logit.squeeze()
                
                ref_words = torch.topk(ref, 1)[1].squeeze()
                sys_words = torch.topk(sys, 1)[1].squeeze()

                print("Diff  = {}".format(torch.sum(ref - sys).item()))
                print("Logits sys size : {}, Logits sys : {}".format(sys.size(), sys.sum().item()))
                print("Logits ref size : {}, Logits ref : {}".format(ref.size(), ref.sum().item()))
                if not (ref_words == sys_words).all():
                    print("Mismatch between incremental and full decoding!")
                print("\n")
            
            logits2.append(logit)
            cache['cur_len'] = i + 1
        logits2 = torch.cat(logits2, dim=1).contiguous()

        print("Logits1: {}".format(torch.sum(logits1).item()))
        print("Logits2: {}".format(torch.sum(logits2).item()))
Example #13
import pandas as pd

import dataset

src = dataset.uci_root / 'abalone'
dst = dataset.data_folder / 'dataset1'


if __name__ == '__main__':

    dataset.get(src / 'abalone.data', dst / 'abalone.data')
    dataset.get(src / 'abalone.names', dst / 'abalone.names')

    df = (pd
          .read_csv(dst / 'abalone.data', header=None)
          .pipe(lambda x: x[x[0].isin({'F', 'M'})]))

    X = df[[1, 2, 3, 4, 5, 6, 7]].values
    t = (df[0] == 'M').values
    y = df[8] <= 10

    dataset.save('dataset1', X, t, y)

Example #14
def test_dropout(wdecay,
                 lr,
                 route_iter,
                 model_name='dynamic_capsules',
                 epoch_stuff=[30, 60],
                 reconstruct=False,
                 loss_weights=None,
                 exp=False,
                 model_to_test=None,
                 res=False,
                 dropout=0.5):

    out_dirs = []
    out_dir_meta = '../experiments/' + model_name + '_' + str(route_iter)
    num_epochs = epoch_stuff[1]
    if model_to_test is None:
        model_to_test = num_epochs - 1

    epoch_start = 0
    if exp:
        dec_after = ['exp', 0.96, epoch_stuff[0], 1e-6]
    else:
        dec_after = ['step', epoch_stuff[0], 0.1]

    lr = lr

    criterion = 'margin'
    criterion_str = criterion
    n_classes = 10
    save_after = 10
    init = False
    pre_pend = 'mnist'
    strs_append_list = [
        'reconstruct', reconstruct, 'shift', criterion_str, init, 'wdecay',
        wdecay, num_epochs
    ] + dec_after + lr + [dropout]
    if loss_weights is not None:
        strs_append_list = strs_append_list + ['lossweights'] + loss_weights
    strs_append = '_' + '_'.join([str(val) for val in strs_append_list])

    out_dir_train = os.path.join(out_dir_meta, pre_pend + strs_append)
    final_model_file = os.path.join(out_dir_train,
                                    'model_' + str(num_epochs - 1) + '.pt')
    print(out_dir_train)

    if os.path.exists(final_model_file):
        print('skipping', final_model_file)
        input()

    model_file = None
    batch_size = 256
    batch_size_val = 256
    num_workers = 0

    data_transforms = {}
    data_transforms['train'] = transforms.Compose([
        transforms.RandomCrop(28, padding=2),
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, ))
    ])
    data_transforms['val'] = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.1307, ), (0.3081, ))])

    train_data = dataset.get(
        'mnist',
        dict(dir_data='../data/mnist_downloaded',
             train=True,
             transform=data_transforms['train']))
    test_data = dataset.get(
        'mnist',
        dict(dir_data='../data/mnist_downloaded',
             train=False,
             transform=data_transforms['val']))

    train_dataloader = torch.utils.data.DataLoader(train_data,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   num_workers=num_workers)

    test_dataloader = torch.utils.data.DataLoader(test_data,
                                                  batch_size=batch_size_val,
                                                  shuffle=False,
                                                  num_workers=num_workers)

    network_params = dict(n_classes=n_classes,
                          r=route_iter,
                          init=init,
                          reconstruct=reconstruct,
                          loss_weights=loss_weights,
                          dropout=dropout)

    util.makedirs(out_dir_train)

    train_params = dict(out_dir_train=out_dir_train,
                        train_data=train_data,
                        test_data=test_data,
                        batch_size=batch_size,
                        batch_size_val=batch_size_val,
                        num_epochs=num_epochs,
                        save_after=save_after,
                        disp_after=1,
                        plot_after=100,
                        test_after=1,
                        lr=lr,
                        dec_after=dec_after,
                        model_name=model_name,
                        criterion=criterion,
                        gpu_id=0,
                        num_workers=0,
                        model_file=model_file,
                        epoch_start=epoch_start,
                        network_params=network_params,
                        weight_decay=wdecay)
    test_params = dict(out_dir_train=out_dir_train,
                       model_num=model_to_test,
                       train_data=train_data,
                       test_data=test_data,
                       gpu_id=0,
                       model_name=model_name,
                       batch_size_val=batch_size_val,
                       criterion=criterion,
                       network_params=network_params)

    print(train_params)
    param_file = os.path.join(out_dir_train, 'params.txt')
    all_lines = []
    for k in train_params.keys():
        str_print = '%s: %s' % (k, train_params[k])
        print(str_print)
        all_lines.append(str_print)
    util.writeFile(param_file, all_lines)

    train_model_recon(**train_params)
Example #15
misc.ensure_dir(args.logdir)
print("=================FLAGS==================")
for k, v in args.__dict__.items():
    print('{}: {}'.format(k, v))
print("========================================")

# seed
args.cuda = torch.cuda.is_available()
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# data loader and model
train_loader, test_loader = dataset.get(batch_size=args.batch_size,
                                        num_workers=1,
                                        num_samples=args.num_samples,
                                        data_augment=args.data_augment,
                                        validation=args.validation)

if args.validation or (args.num_samples != 500):
    Ntrain = len(train_loader.sampler.indices)
else:
    Ntrain = len(train_loader.dataset)

if args.validation:
    Ntest = len(test_loader.sampler.indices)
else:
    Ntest = len(test_loader.dataset)

model = model.stl10(n_channel=args.channel)
model = torch.nn.DataParallel(model, device_ids=range(args.ngpu))
Example #16
                                             #Deltas[0, -50, 50],
                                             #sigma[20, 0.001, 100],
                                             #alpha[-1.5, -10, 0],
                                             n[1.5, 0.1, 10] )""")

## Categories
catTitleCut = {
    'highR9_eb' : ('high R9 barrel' , 'subdet == subdet::Barrel  && r9 == r9::High'),
    'highR9_ee' : ('high R9 endcaps', 'subdet == subdet::Endcaps && r9 == r9::High'),
    'lowR9_eb'  : ('low R9 barrel'  , 'subdet == subdet::Barrel  && r9 == r9::Low' ),
    'lowR9_ee'  : ('low R9 endcaps' , 'subdet == subdet::Endcaps && r9 == r9::Low' ),
}

## Get data
data = dataset.get( tree = esChains.getChains('v4')['data'],
                    variable = x,
                    weight = w,
                    categories = myCategories )
data.SetName('realData')
data.SetTitle('scale real data 750/pb')
ws1.Import(data)

## Get data in categories
realData = {}
for cat, (title, cut) in catTitleCut.items():
    realData[cat] = data.reduce( Cut(cut), Name('data_' + cat), Title(title) )

## Get MC
w.SetTitle('pileup.weightOOT')
data = dataset.get( tree = esChains.getChains('v4')['z'],
                    variable = x,
                    weight = w )
Example #17
                                             #Deltas[0, -50, 50],
                                             #sigma[20, 0.001, 100],
                                             #alpha[-1.5, -10, 0],
                                             n[1.5, 0.1, 10] )""")

## Categories
catTitleCut = {
    'highR9_eb' : ('high R9 barrel' , 'subdet == subdet::Barrel  && r9 == r9::High'),
    'highR9_ee' : ('high R9 endcaps', 'subdet == subdet::Endcaps && r9 == r9::High'),
    'lowR9_eb'  : ('low R9 barrel'  , 'subdet == subdet::Barrel  && r9 == r9::Low' ),
    'lowR9_ee'  : ('low R9 endcaps' , 'subdet == subdet::Endcaps && r9 == r9::Low' ),
}

## Get data
data = dataset.get( tree = esChains.getChains('v4')['data'],
                    variable = x,
                    weight = w )
data.SetName('realData')
data.SetTitle('scale real data 750/pb')
ws1.Import(data)

## Get data in categories
realData = {}
for cat, (title, cut) in catTitleCut.items():
    realData[cat] = data.reduce( Cut(cut), Name('data_' + cat), Title(title) )

## Get MC
w.SetTitle('pileup.weightOOT')
data = dataset.get( tree = esChains.getChains('v4')['z'],
                    variable = x,
                    weight = w )
Example #18
    def __init__(self, params):
        """
        Initialize trainer.
        """
        self.params = params

        # Initialize tensorboard writer
        train_log = SummaryWriter(
            os.path.join(config.tensorboard_log_path, "log", "train"))
        valid_log = SummaryWriter(
            os.path.join(config.tensorboard_log_path, "log", "valid"))
        self._tensorboard = TensorboardWriter(train_log, valid_log)

        # epoch / iteration size
        assert isinstance(config.epoch_size, int)
        assert config.epoch_size >= 1
        self.epoch_size = config.epoch_size

        # network and criterion
        net, criterion = model.get()
        self.net = net
        self.criterion = criterion

        # data iterators
        self.iterators = {}
        train_iter, valid_iter, SRC_TEXT, TGT_TEXT = dataset.get()
        self.iterators["train"] = train_iter
        self.iterators["valid"] = valid_iter
        self.num_train = len(train_iter)
        self.SRC_TEXT = SRC_TEXT
        self.TGT_TEXT = TGT_TEXT

        # Multi-GPU
        if config.multi_gpu:
            logger.info("Using nn.parallel.DistributedDataParallel ...")
            self.net = nn.parallel.DistributedDataParallel(
                self.net,
                device_ids=[params.local_rank],
                output_device=params.local_rank)
            """
            self.criterion = nn.parallel.DistributedDataParallel(
                    self.criterion, device_ids=[params.local_rank], output_device=params.local_rank
                    )
            """

        # set optimizers
        self.opt = optimizer.get(self.net)

        # validation metrics
        self.best_metrics = {}
        for k in config.valid_metrics.keys():
            factor = config.valid_metrics[k]
            self.best_metrics[k] = [config.init_metric * factor, factor]

        # training statistics
        self.epoch = 0
        self.n_iter = 0
        self.n_total_iter = 0
        self.n_sentences = 0
        self.stats = OrderedDict([('processed_s', 0), ('processed_w', 0)] +
                                 [('MT-%s-%s-loss' %
                                   (config.SRC_LAN, config.TGT_LAN), [])] +
                                 [('MT-%s-%s-ppl' %
                                   (config.SRC_LAN, config.TGT_LAN), [])])
        self.last_time = time.time()

        # reload potential checkpoints
        self.reload_checkpoint()
Example #19
def evaluate_kmeans():
    """ run kmeans implementation on dataset """
    import dataset
    from sklearn.feature_extraction.text import TfidfVectorizer

    def kmeans_args(k):
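        # parameter sets for the four k-means variants compared below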
        return {
            "KM": {
                "n_clusters": k,
                "init": "k-means",
                "minibatch": False
            },
            "KM++": {
                "n_clusters": k,
                "init": "k-means",
                "minibatch": False
            },
            "MBKM": {
                "n_clusters": k,
                "init": "k-means++",
                "minibatch": True
            },
            "MBKM++": {
                "n_clusters": k,
                "init": "k-means++",
                "minibatch": True
            },
        }

    # get all unique categories in the dataset and shuffle the order
    labels, _ = dataset.get(subset="all")
    categories = np.unique(labels)
    np.random.shuffle(categories)

    print(categories)

    names = ["K"]
    for name in kmeans_args(2):
        names.append(name + " (mr)")
        names.append(name + " (time)")
        names.append(name + " (it)")

    print ", ".join(names)
    for k in range(2, 21):
        n = 0
        args = kmeans_args(k)

        # select k first categories from the list of all categories we
        # prepared above
        y, Xdata = dataset.get(categories=categories[:k], subset="all")
        vec = TfidfVectorizer(max_df=0.5,
                              max_features=1000,
                              min_df=2,
                              stop_words="english",
                              use_idf=True)
        X = vec.fit_transform(Xdata)

        print "{},".format(k),
        for name in args:
            n += 1
            km = KMeans(**args[name])
            km.fit(X)

            mr = mistake_rate(km, k, y)
            time = km.avg_time
            iters = km.avg_iterations

            str = "{:.2f}, {:.2f}, {:.2f}".format(mr, time, iters)
            if n != len(args):
                str += ","
            print str,
            sys.stdout.flush()
        print ""
Example #20
#!/usr/bin/env mdl
from megbrain.config import set_default_device
from megskull.graph import Function
from neupeak.utils.cli import load_network

import dataset
import cv2
import numpy as np

set_default_device('cpu0')
net = load_network(
    '/home/zhaojing/vehicle_pose/config/xception145/train_log/models/latest')
classify = Function().compile(net.outputs[0])

test_dataset = dataset.get('test')
x = test_dataset.get_epoch_minibatch_iter()

correct = [0, 0]
total_label = [0, 0]
total_pred = [0, 0]
for data in x:
    out = classify(data.data)

    #total += data.label.size
    for i in range(0, data.label.size):
        total_pred[out[i].argmax()] += 1
        total_label[data.label[i]] += 1
        if out[i].argmax() == data.label[i]:
            correct[data.label[i]] += 1

accuracy = [0, 0]
Example #21
import pandas as pd

import dataset

src = dataset.uci_root / 'adult'
dst = dataset.data_folder / 'dataset2'

if __name__ == '__main__':

    dataset.get(src / 'adult.data', dst / 'adult.data')
    dataset.get(src / 'adult.test', dst / 'adult.test')
    dataset.get(src / 'adult.names', dst / 'adult.names')

    df = (pd.concat([
        pd.read_csv(dst / 'adult.data', header=None),
        pd.read_csv(dst / 'adult.test', header=None, skiprows=1)
    ]))

    higher_ed = {
        ' Assoc-acdm', ' Assoc-voc', ' Bachelors', ' Doctorate', ' Masters',
        ' Some-college'
    }

    high_income = {' >50K', ' >50K.'}

    X = pd.get_dummies(df[[0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]])
    t = df[3].isin(higher_ed)
    y = df[14].isin(high_income)

    dataset.save('dataset2', X, t, y)
Example #22
args.ngpu = len(args.gpu)

# logger
misc.ensure_dir(args.loaddir)
misc.ensure_dir(args.savedir)
print("=================FLAGS==================")
for k, v in args.__dict__.items():
    print('{}: {}'.format(k, v))
print("========================================")

args.cuda = torch.cuda.is_available()
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

train_loader, test_loader = dataset.get(batch_size=args.batch_size, data_root=args.data_root, num_workers=4)

algo = {'fgsm': fgsm_gt, 'bim': ifgsm_gt, 'pgd': pgd_gt, 'wrm': wrm_gt}
# attack_algo = algo[args.attack_algo]

attack_algo = algo[args.attack_algo] if args.attack_algo is not None else None
defend_algo = algo[args.defend_algo] if args.defend_algo is not None else None

defend_name = "None" if args.defend_algo is None else args.defend_algo

if args.prune_algo == "l0proj":
    prune_algo = l0proj
elif args.prune_algo is None:
    prune_algo = None
elif args.prune_algo == "baseline":
    prune_algo = l0proj
Example #23
# logger
misc.ensure_dir(args.logdir)
print("=================FLAGS==================")
for k, v in args.__dict__.items():
    print('{}: {}'.format(k, v))
print("========================================")

# seed
args.cuda = torch.cuda.is_available()
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# data loader and model
train_loader, test_loader = dataset.get(batch_size=args.batch_size,
                                        num_workers=1)
model = model.stl10(n_channel=args.channel)
model = torch.nn.DataParallel(model, device_ids=range(args.ngpu))
if args.cuda:
    model.cuda()

# optimizer
optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.wd)
decreasing_lr = list(map(int, args.decreasing_lr.split(',')))
print('decreasing_lr: ' + str(decreasing_lr))
best_acc, old_file = 0, None
t_begin = time.time()
try:
    # ready to go
    for epoch in range(args.epochs):
        model.train()
Example #24

if args.premodel:
    model.load_weights(args.premodel)
#train
adam = optimizers.Adam(lr=args.lr)
model.compile(optimizer=adam,
              loss='mean_squared_error',
              metrics=[true_num, pred_num, mae, mse])

# serialize model to JSON
model_json = model.to_json()
with open("./logs/model.json", "w") as json_file:
    json_file.write(model_json)

checkpoint = ModelCheckpoint('./logs/models/weights_{epoch:02d}.hdf5',
                             verbose=1,
                             save_best_only=False,
                             save_weights_only=True,
                             mode='auto',
                             period=1)

data_generator = get('train')
valid_data = get_test('test')

model.fit_generator(data_generator,
                    validation_data=valid_data,
                    steps_per_epoch=config.per_epoch,
                    epochs=config.nr_epoch,
                    callbacks=[checkpoint])
Example #25
def init_data(data_files, device):
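    # load each data file as a FloatTensor and move it to the given device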
    data_set = []
    for file_name in data_files:
        data_set.append(torch.FloatTensor(dataset.get(file_name)).to(device))
    return data_set