def load_data_fashion_mnist(batch_size, resize=None):
    """Download the Fashion-MNIST dataset and then load into memory."""
    dataset = gluon.data.vision
    trans = [dataset.transforms.Resize(resize)] if resize else []
    trans.append(dataset.transforms.ToTensor())
    trans = dataset.transforms.Compose(trans)
    mnist_train = dataset.FashionMNIST(train=True).transform_first(trans)
    mnist_test = dataset.FashionMNIST(train=False).transform_first(trans)
    return (gluon.data.DataLoader(mnist_train,
                                  batch_size,
                                  shuffle=True,
                                  num_workers=d2l.get_dataloader_workers()),
            gluon.data.DataLoader(mnist_test,
                                  batch_size,
                                  shuffle=False,
                                  num_workers=d2l.get_dataloader_workers()))
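A minimal usage sketch (not part of the original listing), assuming the usual mxnet and d2l imports that this helper relies on; the batch size is illustrative:

from mxnet import gluon, npx
from d2l import mxnet as d2l

npx.set_np()

# Without resize, Fashion-MNIST batches come back as 28x28 single-channel tensors.
train_iter, test_iter = load_data_fashion_mnist(256)
for X, y in train_iter:
    print(X.shape, y.shape)  # (256, 1, 28, 28) (256,)
    break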
Example No. 2
def load_data(file_path, batch_size):
    """Load tabular feature data."""
    invalid_thd = analyze_valid_threshold(file_path)
    log.info('Invalid throughput is set to %.1f GFLOP/s', invalid_thd)

    log.info('Parsing file...')
    with open(file_path, 'r') as filep:
        next(filep)  # Get rid of headers

        # Parse features to sequence.
        features = []
        valids = []
        for line in tqdm.tqdm(filep):
            tokens = line.replace('\n', '').split(',')
            features.append([float(v) for v in tokens[:-1]])
            valids.append(1 if float(tokens[-1]) > invalid_thd else 0)

    log.info('Total data size %d', len(features))

    # 70% for training.
    # 10% for validation.
    # 20% for testing.
    splitter1 = int(len(features) * 0.7)
    splitter2 = int(len(features) * 0.8)
    train_feats = np.array(features[:splitter1])
    train_valid = np.array(valids[:splitter1])

    # Calculate imbalance weight.
    num_valid = len(train_valid.nonzero()[0])
    num_invalid = len(train_valid) - num_valid
    pos_weight = num_invalid / num_valid

    train_iter = gluon.data.DataLoader(
        gluon.data.ArrayDataset(train_feats, train_valid),
        batch_size,
        shuffle=True,
        num_workers=d2l.get_dataloader_workers())
    validate_feats = np.array(features[splitter1:splitter2])
    validate_valids = np.array(valids[splitter1:splitter2])
    test_feats = np.array(features[splitter2:])
    test_valids = np.array(valids[splitter2:])
    return train_iter, pos_weight, validate_feats, validate_valids, test_feats, test_valids
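The returned pos_weight is the invalid-to-valid ratio of the training split and is meant to compensate for class imbalance in a binary cross-entropy loss. A hedged sketch of such a weighted loss, written from scratch here rather than taken from the original repository:

from mxnet import np, npx
npx.set_np()

def weighted_bce(logits, labels, pos_weight):
    """Binary cross-entropy that up-weights the positive (valid) class."""
    p = 1 / (1 + np.exp(-logits))  # sigmoid
    return -(pos_weight * labels * np.log(p + 1e-12)
             + (1 - labels) * np.log(1 - p + 1e-12)).mean()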
Example No. 3
npx.set_np()

d2l.use_svg_display()

mnist_train = gluon.data.vision.FashionMNIST(train=True)
mnist_test = gluon.data.vision.FashionMNIST(train=False)

print("train length : {}, test length: {}".format(len(mnist_train),
                                                  len(mnist_test)))

X, y = mnist_train[:18]
# d2l.show_images(X.squeeze(axis=-1), 2, 9, titles=d2l.get_fashion_mnist_labels(y))
# plt.show()

batch_size = 256
transformer = gluon.data.vision.transforms.ToTensor()
train_iter = gluon.data.DataLoader(mnist_train.transform_first(transformer),
                                   batch_size,
                                   shuffle=True,
                                   num_workers=d2l.get_dataloader_workers())

timer = d2l.Timer()
for X, y in train_iter:
    continue
print("loading dada takes {:.2f} sec".format(timer.stop()))

train_iter, test_iter = d2l.load_data_fashion_mnist(32, (64, 64))
for X, y in train_iter:
    print(X.shape)
    break
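With resize=(64, 64), the batch printed above has shape (32, 1, 64, 64): ToTensor moves the single channel axis in front of the spatial dimensions and scales pixel values to [0, 1].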
Example No. 4
def load_data(file_path, batch_size, num_hiddens):
    """Load tabular feature data."""
    log.info('Parsing file...')
    with open(file_path, 'r') as filep:
        next(filep)  # Get rid of headers

        # Parse features to sequence. num_seq = num_feature + 1
        features = []
        thrpts = []
        for line in tqdm.tqdm(filep):
            tokens = line.replace('\n', '').split(',')

            # Prepend the <CLS> token position, initialized to 0
            features.append([0] + [float(v) for v in tokens[:-1]])
            thrpts.append(float(tokens[-1]))

    # Expand features to (batch, sequence, hidden)
    log.info('Expanding features...')
    with ProcessPoolExecutor(max_workers=8) as pool:
        expand_features = []
        for start in tqdm.tqdm(
                range(0, len(features), 8),
                bar_format='{desc}{percentage:3.0f}%|{bar:50}{r_bar}'):
            futures = [
                pool.submit(expand_hidden,
                            feature=feature,
                            num_hiddens=num_hiddens)
                for feature in features[start:min(start + 8, len(features))]
            ]
            # Collect results in submission order so the expanded features
            # stay aligned with the corresponding thrpts entries.
            for future in futures:
                expand_features.append(future.result())
        features = expand_features

    log.info('Total data size %d', len(features))

    # 70% for training.
    # 10% for validation.
    # 20% for testing.
    splitter1 = int(len(features) * 0.7)
    splitter2 = int(len(features) * 0.8)
    train_feats = np.array(features[:splitter1])
    train_thrpts = np.array(thrpts[:splitter1])

    # Make valid labels
    labels = np.array(
        [1 if thrpt >= INVALID_THD else 0 for thrpt in train_thrpts])

    # Standardize training thrpts
    thrpt_avg = train_thrpts.mean().tolist()
    thrpt_std = train_thrpts.std().tolist()
    # log.info('Train thrpt avg std: %.2f %.2f', thrpt_avg, thrpt_std)
    # train_thrpts = (train_thrpts - thrpt_avg) / thrpt_std
    # log.info('Standardized Thrpt range %.2f, %.2f', min(train_thrpts), max(train_thrpts))

    # Normalize training thrpts
    thrpt_avg = train_thrpts[train_thrpts.nonzero()[0]].min().tolist()
    thrpt_std = train_thrpts.max().tolist() - thrpt_avg
    log.info('Train thrpt min range: %.2f %.2f', thrpt_avg, thrpt_std)
    train_thrpts = (train_thrpts - thrpt_avg) / thrpt_std

    # Statistics
    buckets = [0 for _ in range(11)]
    for thrpt in train_thrpts:
        buckets[int(thrpt * 10)] += 1
    log.info('Training thrpt distributions')
    for idx, bucket in enumerate(buckets):
        print('%d: %d' % (idx, bucket))

    # Calculate imbalance weight.
    num_valid = sum(labels).tolist()
    num_invalid = len(train_thrpts) - num_valid
    pos_weight = num_invalid / num_valid
    log.info('Valid %.2f : Invalid %.2f', num_valid / len(train_thrpts),
             num_invalid / len(train_thrpts))

    # Make log to training outputs.
    # train_thrpts = np.log(train_thrpts + 1e-6)

    train_iter = gluon.data.DataLoader(
        gluon.data.ArrayDataset(train_feats, train_thrpts, labels),
        batch_size,
        shuffle=True,
        num_workers=d2l.get_dataloader_workers())
    validate_feats = np.array(features[splitter1:splitter2])
    validate_thrpts = np.array(thrpts[splitter1:splitter2])
    test_feats = np.array(features[splitter2:])
    test_thrpts = np.array(thrpts[splitter2:])
    return (train_iter, pos_weight, validate_feats, validate_thrpts,
            test_feats, test_thrpts, thrpt_avg, thrpt_std)
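Because the training targets are min-max scaled, the returned thrpt_avg (minimum) and thrpt_std (range) are needed to map predictions back to raw throughput. A hedged sketch, where model is a hypothetical regressor trained on train_iter:

pred = model(np.array(validate_feats))     # predictions on the normalized scale
pred_thrpt = pred * thrpt_std + thrpt_avg  # back to raw GFLOP/s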
Example No. 5
def load_data(file_path, batch_size, num_cls):
    """Load tabular feature data."""
    log.info('Parsing file...')
    with open(file_path, 'r') as filep:
        next(filep)  # Get rid of headers

        # Parse features to sequence. num_seq = num_feature + 1
        features = []
        thrpts = []
        for line in tqdm.tqdm(filep):
            tokens = line.replace('\n', '').split(',')

            # Filter out invalid records
            thrpt = float(tokens[-1])
            if thrpt <= INVALID_THD:
                continue

            # Prepend the <CLS> token position, initialized to 0
            features.append([0] + [float(v) for v in tokens[:-1]])
            thrpts.append(thrpt)

    log.info('Total data size %d', len(features))

    # Data balancing

    # 70% for training.
    # 10% for validation.
    # 20% for testing.
    splitter1 = int(len(features) * 0.7)
    splitter2 = int(len(features) * 0.8)
    train_feats = np.array(features[:splitter1])
    train_thrpts = np.array(thrpts[:splitter1])

    log.info('Train thrpt min max: %.2f %.2f',
             min(train_thrpts).tolist(),
             max(train_thrpts).tolist())

    # Identify throughput class boundaries.
    sorted_thrpts = [e.tolist() for e in sorted(train_thrpts)]  #np.unique(
    cls_size = len(sorted_thrpts) // num_cls
    boundaries = [sorted_thrpts[-1]]
    for ridx in range(num_cls - 1, 0, -1):
        boundaries.append(sorted_thrpts[ridx * cls_size])
    boundaries.reverse()

    # Transform throughputs to classes.
    log.info('Transforming throughput to class...')
    cls_thrpts = []
    with ProcessPoolExecutor(max_workers=8) as pool:
        for start in tqdm.tqdm(
                range(0, len(thrpts), 8),
                bar_format='{desc}{percentage:3.0f}%|{bar:50}{r_bar}'):
            futures = [
                pool.submit(find_class, thrpt=thrpt, boundaries=boundaries)
                for thrpt in thrpts[start:min(start + 8, len(thrpts))]
            ]
            # Collect results in submission order so the class labels stay
            # aligned with the corresponding features.
            for future in futures:
                cls_thrpts.append(future.result())
    train_thrpts = np.array(cls_thrpts[:splitter1], dtype='int32')

    # Statistics
    buckets = [0 for _ in range(num_cls)]
    for thrpt_cls in train_thrpts:
        buckets[thrpt_cls.tolist()] += 1
    log.debug('Training throughput distributions')
    for idx, (boundary, bucket) in enumerate(zip(boundaries, buckets)):
        log.debug('\t%02d (<=%.2f): %d', idx, boundary, bucket)

    train_iter = gluon.data.DataLoader(
        gluon.data.ArrayDataset(train_feats, train_thrpts),
        batch_size,
        shuffle=True,
        num_workers=d2l.get_dataloader_workers())
    validate_feats = np.array(features[splitter1:splitter2])
    validate_thrpts = np.array(cls_thrpts[splitter1:splitter2], dtype='int32')
    test_feats = np.array(features[splitter2:])
    test_thrpts = np.array(cls_thrpts[splitter2:], dtype='int32')
    return (train_iter, validate_feats, validate_thrpts, test_feats,
            test_thrpts, boundaries)
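The find_class helper submitted to the process pool is not shown in this listing. A plausible sketch, assuming a class index is the position of the smallest boundary that is greater than or equal to the throughput (matching the '<=' convention in the distribution log above):

import bisect

def find_class(thrpt, boundaries):
    """Hedged guess at the helper: index of the first boundary >= thrpt."""
    return min(bisect.bisect_left(boundaries, thrpt), len(boundaries) - 1)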