Example #1
def sequential_synthetic_dataset(root_path, dataset_name, data_type):
    """Generate a synthetic dataset for regression."""
    if data_type == "dense":
        X, y = make_classification(
            n_samples=10000,
            n_features=100,
            n_informative=90,
            n_classes=2,
            random_state=42,
        )
    else:
        raise NotImplementedError(
            "{} synthetic dataset is not supported.".format(data_type)
        )

    data = LIBSVMDataset(X, y, False)  # third argument: is_sparse (cf. Example #3)
    lmdb_file_path = os.path.join(
        root_path, "{}_{}.lmdb".format(dataset_name, data_type)
    )

    ds1 = PrefetchDataZMQ(data)
    LMDBSerializer.save(ds1, lmdb_file_path)

    print("Dumped dataflow to {} for {}".format(lmdb_file_path, dataset_name))
Example #2
def sequential_epsilon_or_rcv1(root_path, name, data_type, is_sparse):
    data = LIBSVMDataset(root_path, name, data_type, is_sparse)
    lmdb_file_path = join(root_path, '{}_{}.lmdb'.format(name, data_type))

    print('Dumping dataflow to {}'.format(lmdb_file_path))
    ds1 = PrefetchDataZMQ(data, nr_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
Example #3
def sequential_epsilon_or_rcv1(root_path, name, data_type):
    features, labels, is_sparse = _load_libsvm_data(root_path, name, data_type)
    data = LIBSVMDataset(features, labels, is_sparse)
    lmdb_file_path = os.path.join(root_path, "{}_{}.lmdb".format(name, data_type))

    ds1 = PrefetchDataZMQ(data)
    LMDBSerializer.save(ds1, lmdb_file_path)

    print("Dumped dataflow to {} for {}".format(lmdb_file_path, name))
Example #4
def sequential_downsampled_imagenet(args):
    data = DownsampledImageNet(args.data_dir, args.data_type, args.img_size)
    lmdb_file_path = os.path.join(
        args.data_dir, f"imagenet{args.img_size}_{args.data_type}.lmdb")

    # delete the existing file if force-delete was requested.
    if os.path.exists(lmdb_file_path) and args.force_delete == 1:
        os.remove(lmdb_file_path)

    # serialize to the target path.
    ds1 = PrefetchDataZMQ(data, num_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
Example #5
def sequential_synthetic_dataset(root_path, dataset_name):
    """Generate a synthetic dataset for regression."""
    if 'dense' in dataset_name:
        X, y = make_classification(
            n_samples=10000, n_features=100, n_informative=90,
            n_classes=2, random_state=42)
    else:
        raise NotImplementedError("{} synthetic dataset is not supported.".format(dataset_name))

    data = SyntheticLIBSVMDataset(X, y)
    lmdb_file_path = join(root_path, '{}.lmdb'.format(dataset_name))

    print('Dumping dataflow to {}'.format(lmdb_file_path))
    ds1 = PrefetchDataZMQ(data, nr_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
Example #6
def compute_mean_std(db, fname):
    ds = LMDBSerializer.load(db, shuffle=False)
    ds.reset_state()
    o = OnlineMoments()
    for dp in get_tqdm(ds):
        feat = dp[0]  # len x dim
        for f in feat:
            o.feed(f)
    logger.info("Writing to {} ...".format(fname))
    with open(fname, 'wb') as f:
        f.write(serialize.dumps([o.mean, o.std]))
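
OnlineMoments (from tensorpack.utils.stats) accumulates the mean and standard deviation one sample at a time, so the feature stream never has to fit in memory. A minimal sketch of the same idea via Welford's algorithm; the class below is an illustration, not tensorpack's implementation:

import numpy as np

class RunningMoments:
    """Streaming mean/std via Welford's algorithm (illustrative)."""
    def __init__(self):
        self.n = 0
        self.mean = 0.0
        self.m2 = 0.0  # running sum of squared deviations

    def feed(self, x):
        # Works elementwise, so x may be a scalar or a feature vector.
        x = np.asarray(x, dtype=np.float64)
        self.n += 1
        delta = x - self.mean
        self.mean = self.mean + delta / self.n
        self.m2 = self.m2 + delta * (x - self.mean)

    @property
    def std(self):
        return np.sqrt(self.m2 / self.n)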
Example #8
File: r.py  Project: voidiak/MTRE
def getdata(path, isTrain):
    ds = LMDBSerializer.load(path, shuffle=isTrain)

    # Graph Benchmark
    # ds=FakeData([[10,10],[10,10],[10,10],[10,10],[10],[10],[10,10],[1],[1],[1]], 1000, random=False,dtype=['int32', 'int32', 'int32', 'int32', 'int32', 'int32',\
    #     'int32', 'int32', 'int32', 'int32'], domain=[(0, 100), (0, 120),(0,120),(0,1),(0,100),(0,100),(0,100),(0,52),(0,115),(0,115)])

    ds = getbatch(ds, 32, isTrain)
    if isTrain:
        ds = MultiProcessRunnerZMQ(ds, 4)
    return ds
Example #9
def __init__(self, file_location, batch_size, train=True, shuffle=True,
             full=False, batch_from_disk=150):
    self.batch_size = batch_size
    self.train = train
    if train:
        self.ds = MyLMDBSerializer.load(file_location, shuffle=shuffle,
                                        batch_from_disk=batch_from_disk)
        self.ds = MyLocallyShuffleData(self.ds, buffer_size=10000,
                                       shuffle_interval=500)
        self.ds = MultiProcessRunnerZMQ(self.ds, num_proc=1, hwm=10000)
        self.len_ = 1281167  # ImageNet train set size
    else:
        self.ds = LMDBSerializer.load(file_location, shuffle=False)
        self.ds = MultiProcessRunnerZMQ(self.ds, num_proc=1, hwm=10000)
        self.len_ = 50000  # ImageNet validation set size
    self.ds.reset_state()
    self.batches_in_epoch = int(math.ceil(self.len_ / self.batch_size))
Example #10
def __init__(self,
             mode,
             batch_size=256,
             shuffle=False,
             num_workers=25,
             cache=50000,
             device='cuda'):
    # enumerate standard imagenet augmentors
    imagenet_augmentors = fbresnet_augmentor(mode == 'train')

    # load the lmdb if we can find it
    base_dir = '/userhome/cs/u3003679/'
    lmdb_loc = os.path.join(base_dir, 'ILSVRC-{}.lmdb'.format(mode))
    # alternatively: lmdb_loc = os.path.join(os.environ['IMAGENET'], 'ILSVRC-%s.lmdb' % mode)
    ds = LMDBSerializer.load(lmdb_loc, shuffle=shuffle)
    ds = LocallyShuffleData(ds, cache)

    def f(dp):
        # decode the stored JPEG bytes, then apply each augmentor in order
        x, label = dp
        x = cv2.imdecode(x, cv2.IMREAD_COLOR)
        for aug in imagenet_augmentors:
            x = aug.augment(x)
        return x, label

    ds = MultiProcessMapDataZMQ(ds, num_proc=8, map_func=f)
    self.ds = BatchData(ds, batch_size)
    self.ds.reset_state()

    self.batch_size = batch_size
    self.num_workers = num_workers
    self.device = device
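
After reset_state(), the wrapped dataflow is an ordinary Python iterable, so a training loop can consume either loader directly. A minimal usage sketch; the class name ImageNetLoader and train_step are hypothetical:

# ImageNetLoader is a hypothetical name for the class whose __init__
# is shown above.
loader = ImageNetLoader(mode='train', batch_size=256, shuffle=True)

for epoch in range(90):  # illustrative epoch count
    for images, labels in loader.ds:
        train_step(images, labels)  # train_step is a placeholder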
Example #11
def getdata(path, isTrain):
    ds = LMDBSerializer.load(path, shuffle=isTrain)
    ds = getbatch(ds, 64, isTrain)
    if isTrain:
        ds = MultiProcessRunnerZMQ(ds, 4)
    return ds
Example #12
                TailLabel,
            ]
            yield output


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    parser_db = subparsers.add_parser("build",
                                      help="build train/test database")
    parser_db.add_argument("--dataset",
                           help="path to train/test data",
                           required=True)
    parser_db.add_argument("--db", help="output lmdb file", required=True)
    parser_eval = subparsers.add_parser("eval", help="bulid p@n eval database")
    parser_eval.add_argument("--dataset",
                             help="path to eval data",
                             required=True)
    parser_eval.add_argument("--db",
                             help="output eval lmdb file",
                             required=True)
    args = parser.parse_args()
    if args.command in ("build", "eval"):
        data = pickle.load(open(args.dataset, "rb"))
        ds = Raw(data)
        LMDBSerializer.save(ds, args.db)
Example #13
    ds.reset_state()
    o = OnlineMoments()
    for dp in get_tqdm(ds):
        feat = dp[0]  # len x dim
        for f in feat:
            o.feed(f)
    logger.info("Writing to {} ...".format(fname))
    with open(fname, 'wb') as f:
        f.write(serialize.dumps([o.mean, o.std]))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(title='command', dest='command')
    parser_db = subparsers.add_parser('build', help='build a LMDB database')
    parser_db.add_argument('--dataset',
                           help='path to TIMIT TRAIN or TEST directory', required=True)
    parser_db.add_argument('--db', help='output lmdb file', required=True)

    parser_stat = subparsers.add_parser('stat', help='compute statistics (mean/std) of dataset')
    parser_stat.add_argument('--db', help='input lmdb file', required=True)
    parser_stat.add_argument('-o', '--output',
                             help='output statistics file', default='stats.data')

    args = parser.parse_args()
    if args.command == 'build':
        ds = RawTIMIT(args.dataset)
        LMDBSerializer.save(ds, args.db)
    elif args.command == 'stat':
        compute_mean_std(args.db, args.output)
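
The statistics file written by compute_mean_std can be read back with the matching deserializer. A short sketch, assuming tensorpack's serialize module and the default output name stats.data; feat is a placeholder feature frame:

from tensorpack.utils import serialize

# Recover the [mean, std] pair dumped by compute_mean_std.
with open('stats.data', 'rb') as f:
    mean, std = serialize.loads(f.read())

normalized = (feat - mean) / std  # feat is a placeholder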
Example #14
        'char': sentiment_c,
        'num': m
    }, open('./data/sentiment.pkl', 'wb'))
    # make train/dev/test
    for i in range(10):
        train_ori = get_train(data, 10, i)
        test_ori = get_test(data, 10, i)

        train = []
        dev = []
        test = []
        for j in range(2):
            random.shuffle(train_ori[j])
            x = len(train_ori[j]) * 9 // 10
            train += train_ori[j][:x]
            dev += train_ori[j][x:]
        test += test_ori
        random.shuffle(train)
        random.shuffle(dev)
        random.shuffle(test)
        train_ = process_data(train, word2id, char2id)
        dev_ = process_data(dev, word2id, char2id)
        test_ = process_data(test, word2id, char2id)
        train_data = MEANdata(train_)
        dev_data = MEANdata(dev_)
        test_data = MEANdata(test_)
        os.makedirs('mdb%s' % i, exist_ok=True)
        LMDBSerializer.save(train_data, './mdb{}/train.mdb'.format(i))
        LMDBSerializer.save(dev_data, './mdb{}/dev.mdb'.format(i))
        LMDBSerializer.save(test_data, './mdb{}/test.mdb'.format(i))
Example #15
    # --------------------
    parser.add_argument('--imagenet_folder')
    parser.add_argument('--val', action='store_true')
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--lmdb_file', type=str)

    args = parser.parse_args()

    if args.val and args.train:
        parser.error(
            "Train and validation options are mutually exclusive! Choose only one."
        )

    if args.val:
        print(
            "We are generating the lmdb file containing validation images of imagenet."
        )
        print(f"The file will be saved at {args.lmdb_file}.lmdb")

        ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), 'val')
        ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
        LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")
    elif args.train:
        print(
            "We are generating the lmdb file containing training images of imagenet."
        )
        print(f"The file will be saved at {args.lmdb_file}.lmdb")

        ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), 'train')
        ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
        LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")
Example #16
        self.count = 0

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        for bag in self.data:
            # X = bag['X']
            # Pos1 = bag['Pos1']
            # Pos2 = bag['Pos2']
            # DepMask = bag['DepMask']
            HeadPos = bag["HeadPos"]
            TailPos = bag["TailPos"]
            if max(max(bag["HeadPos"]), max(bag["TailPos"])) > 100:
                self.count += 1
            # DepLabel = bag['Dep']
            # ReLabel = bag['Y']
            # HeadLabel = bag['HeadLabel']
            # TailLabel = bag['TailLabel']
            # output = [X, Pos1, Pos2, DepMask, HeadPos, TailPos, DepLabel, ReLabel, HeadLabel, TailLabel]
            output = [HeadPos, TailPos]
            yield output


if __name__ == "__main__":

    data = pickle.load(open("/data/PKL/train.pkl", "rb"))
    ds = Raw(data)
    LMDBSerializer.save(ds, "/data/MLRE/testpkl")
    print(ds.count)
Example #17
def getdata(path, batchsize, isTrain):
    ds = LMDBSerializer.load(path, shuffle=isTrain)
    ds = getbatch(ds, batchsize, isTrain)
    # if isTrain:
    #     ds = MultiProcessRunnerZMQ(ds, 2)
    return ds
Example #18
    parser.add_argument('database_dir',
                        type=str,
                        default=None,
                        help='location to save output database')
    args = parser.parse_args()

    class BinaryILSVRC12(dataset.ILSVRC12Files):
        def __iter__(self):
            for fname, label in super(BinaryILSVRC12, self).__iter__():
                with open(fname, 'rb') as f:
                    jpeg = f.read()
                jpeg = np.asarray(bytearray(jpeg), dtype='uint8')
                yield [jpeg, label]

    if args.database_dir is None:
        lmdb_path = args.imagenet
    else:
        lmdb_path = args.database_dir
    os.environ['TENSORPACK_DATASET'] = os.path.join(lmdb_path,
                                                    "tensorpack_data")
    if not os.path.exists(os.environ['TENSORPACK_DATASET']):
        os.mkdir(os.environ['TENSORPACK_DATASET'])

    for name in ['train', 'val']:
        db_filename = 'ILSVRC-%s.lmdb' % name
        db_loc = os.path.join(lmdb_path, db_filename)
        print(f"Processing {args.imagenet} {name} to {db_loc}...")
        ds0 = BinaryILSVRC12(args.imagenet, name)
        ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
        LMDBSerializer.save(ds1, db_loc)