Example #1
def sequential_synthetic_dataset(root_path, dataset_name, data_type):
    """Generate a synthetic dataset for regression."""
    if data_type == "dense":
        X, y = make_classification(
            n_samples=10000,
            n_features=100,
            n_informative=90,
            n_classes=2,
            random_state=42,
        )
    else:
        raise NotImplementedError(
            "{} synthetic dataset is " "not supported.".format(data_type)
        )

    data = LIBSVMDataset(X, y, False)
    lmdb_file_path = os.path.join(
        root_path, "{}_{}.lmdb".format(dataset_name, data_type)
    )

    ds1 = PrefetchDataZMQ(data)
    LMDBSerializer.save(ds1, lmdb_file_path)

    print("Dumped dataflow to {} for {}".format(lmdb_file_path, dataset_name))
Example #2
def sequential_epsilon_or_rcv1(root_path, name, data_type, is_sparse):
    data = LIBSVMDataset(root_path, name, data_type, is_sparse)
    lmdb_file_path = join(root_path, '{}_{}.lmdb'.format(name, data_type))

    print('dump_dataflow_to_lmdb for {}'.format(lmdb_file_path))
    ds1 = PrefetchDataZMQ(data, nr_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
Example #3
def sequential_epsilon_or_rcv1(root_path, name, data_type):
    features, labels, is_sparse = _load_libsvm_data(root_path, name, data_type)
    data = LIBSVMDataset(features, labels, is_sparse)
    lmdb_file_path = os.path.join(root_path, "{}_{}.lmdb".format(name, data_type))

    ds1 = PrefetchDataZMQ(data)
    LMDBSerializer.save(ds1, lmdb_file_path)

    print("Dumped dataflow to {} for {}".format(lmdb_file_path, name))
Example #4

def sequential_downsampled_imagenet(args):
    data = DownsampledImageNet(args.data_dir, args.data_type, args.img_size)
    lmdb_file_path = os.path.join(
        args.data_dir, f"imagenet{args.img_size}_{args.data_type}.lmdb")

    # delete file if exists.
    if os.path.exists(lmdb_file_path) and args.force_delete == 1:
        os.remove(lmdb_file_path)

    # serialize to the target path.
    ds1 = PrefetchDataZMQ(data, num_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
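
Every example so far follows the same wrap-and-save pattern, so a self-contained toy version may be useful. This sketch assumes nothing beyond tensorpack itself; the datapoints are made up:

from tensorpack.dataflow import DataFromList, LMDBSerializer

# Each datapoint is a list of components, here two small integers.
points = [[i, i * i] for i in range(100)]
df = DataFromList(points, shuffle=False)
LMDBSerializer.save(df, "/tmp/toy.lmdb")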
Example #5
def sequential_synthetic_dataset(root_path, dataset_name):
    """Generate a synthetic dataset for regression."""
    if 'dense' in dataset_name:
        X, y = make_classification(
            n_samples=10000,
            n_features=100,
            n_informative=90,
            n_classes=2,
            random_state=42,
        )
    else:
        raise NotImplementedError("{} synthetic dataset is not supported.".format(dataset_name))

    data = SyntheticLIBSVMDataset(X, y)
    lmdb_file_path = join(root_path, '{}.lmdb'.format(dataset_name))

    print('dump_dataflow_to_lmdb for {}'.format(lmdb_file_path))
    ds1 = PrefetchDataZMQ(data, nr_proc=1)
    LMDBSerializer.save(ds1, lmdb_file_path)
Example #6
        self.count = 0

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        for bag in self.data:
            # X = bag['X']
            # Pos1 = bag['Pos1']
            # Pos2 = bag['Pos2']
            # DepMask = bag['DepMask']
            HeadPos = bag["HeadPos"]
            TailPos = bag["TailPos"]
            if max(max(HeadPos), max(TailPos)) > 100:
                self.count += 1
            # DepLabel = bag['Dep']
            # ReLabel = bag['Y']
            # HeadLabel = bag['HeadLabel']
            # TailLabel = bag['TailLabel']
            # output = [X, Pos1, Pos2, DepMask, HeadPos, TailPos, DepLabel, ReLabel, HeadLabel, TailLabel]
            output = [HeadPos, TailPos]
            yield output


if __name__ == "__main__":

    with open("/data/PKL/train.pkl", "rb") as f:
        data = pickle.load(f)
    ds = Raw(data)
    LMDBSerializer.save(ds, "/data/MLRE/testpkl")
    print(ds.count)
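
The snippet above starts mid-class, so the enclosing definition is missing. A hypothetical reconstruction of what Raw presumably looks like, reduced to the parts visible here:

from tensorpack.dataflow import DataFlow

class Raw(DataFlow):
    """Assumed shape of the Raw dataflow used in this example."""

    def __init__(self, data):
        self.data = data
        self.count = 0  # bags whose max head/tail position exceeds 100

    def __len__(self):
        return len(self.data)

    def __iter__(self):
        for bag in self.data:
            if max(max(bag["HeadPos"]), max(bag["TailPos"])) > 100:
                self.count += 1
            yield [bag["HeadPos"], bag["TailPos"]]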
Example #7
                TailLabel,
            ]
            yield output


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    parser_db = subparsers.add_parser("build",
                                      help="build train/test database")
    parser_db.add_argument("--dataset",
                           help="path to train/test data",
                           required=True)
    parser_db.add_argument("--db", help="output lmdb file", required=True)
    parser_eval = subparsers.add_parser("eval", help="build p@n eval database")
    parser_eval.add_argument("--dataset",
                             help="path to eval data",
                             required=True)
    parser_eval.add_argument("--db",
                             help="output eval lmdb file",
                             required=True)
    args = parser.parse_args()
    if args.command == "build":
        with open(args.dataset, "rb") as f:
            data = pickle.load(f)
        ds = Raw(data)
        LMDBSerializer.save(ds, args.db)
    elif args.command == "eval":
        with open(args.dataset, "rb") as f:
            data = pickle.load(f)
        ds = Raw(data)
        LMDBSerializer.save(ds, args.db)
Example #8
        'char': sentiment_c,
        'num': m
    }, open('./data/sentiment.pkl', 'wb'))
    # make train/dev/test
    for i in range(10):
        train_ori = get_train(data, 10, i)
        test_ori = get_test(data, 10, i)

        train = []
        dev = []
        test = []
        for j in range(2):
            random.shuffle(train_ori[j])
            x = len(train_ori[j]) * 9 // 10
            train += train_ori[j][:x]
            dev += train_ori[j][x:]
        test += test_ori
        random.shuffle(train)
        random.shuffle(dev)
        random.shuffle(test)
        train_ = process_data(train, word2id, char2id)
        dev_ = process_data(dev, word2id, char2id)
        test_ = process_data(test, word2id, char2id)
        train_data = MEANdata(train_)
        dev_data = MEANdata(dev_)
        test_data = MEANdata(test_)
        os.makedirs('mdb%s' % i, exist_ok=True)
        LMDBSerializer.save(train_data, './mdb{}/train.mdb'.format(i))
        LMDBSerializer.save(dev_data, './mdb{}/dev.mdb'.format(i))
        LMDBSerializer.save(test_data, './mdb{}/test.mdb'.format(i))
Example #9
    # --------------------
    parser.add_argument('--imagenet_folder')
    parser.add_argument('--val', action='store_true')
    parser.add_argument('--train', action='store_true')
    parser.add_argument('--lmdb_file', type=str)

    args = parser.parse_args()

    if args.val and args.train:
        parser.error("--val and --train are mutually exclusive; choose only one.")

    if args.val:
        print(
            "We are generating the lmdb file containing validation images of imagenet."
        )
        print(f"The file will be saved at {args.lmdb_file}.lmdb")

        ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), 'val')
        ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
        LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")
    elif args.train:
        print(
            "We are generating the lmdb file containing training images of imagenet."
        )
        print(f"The file will be saved at {args.lmdb_file}.lmdb")

        ds0 = BinaryILSVRC12(os.path.expanduser(args.imagenet_folder), 'train')
        ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
        LMDBSerializer.save(ds1, f"{os.path.expanduser(args.lmdb_file)}.lmdb")
Example #10
    ds.reset_state()
    o = OnlineMoments()
    for dp in get_tqdm(ds):
        feat = dp[0]  # len x dim
        for f in feat:
            o.feed(f)
    logger.info("Writing to {} ...".format(fname))
    with open(fname, 'wb') as f:
        f.write(serialize.dumps([o.mean, o.std]))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(title='command', dest='command')
    parser_db = subparsers.add_parser('build', help='build a LMDB database')
    parser_db.add_argument('--dataset',
                           help='path to TIMIT TRAIN or TEST directory', required=True)
    parser_db.add_argument('--db', help='output lmdb file', required=True)

    parser_stat = subparsers.add_parser('stat', help='compute statistics (mean/std) of dataset')
    parser_stat.add_argument('--db', help='input lmdb file', required=True)
    parser_stat.add_argument('-o', '--output',
                             help='output statistics file', default='stats.data')

    args = parser.parse_args()
    if args.command == 'build':
        ds = RawTIMIT(args.dataset)
        LMDBSerializer.save(ds, args.db)
    elif args.command == 'stat':
        compute_mean_std(args.db, args.output)
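
The 'stat' command serializes [mean, std] with tensorpack's serialize.dumps; a sketch of reading the statistics back, mirroring that call:

from tensorpack.utils import serialize

with open("stats.data", "rb") as f:  # the default --output path above
    mean, std = serialize.loads(f.read())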
Example #11
    parser.add_argument('database_dir',
                        type=str,
                        nargs='?',
                        default=None,
                        help='location to save output database')
    args = parser.parse_args()

    class BinaryILSVRC12(dataset.ILSVRC12Files):
        def __iter__(self):
            for fname, label in super(BinaryILSVRC12, self).__iter__():
                with open(fname, 'rb') as f:
                    jpeg = f.read()
                jpeg = np.asarray(bytearray(jpeg), dtype='uint8')
                yield [jpeg, label]

    if args.database_dir is None:
        lmdb_path = args.imagenet
    else:
        lmdb_path = args.database_dir
    os.environ['TENSORPACK_DATASET'] = os.path.join(lmdb_path,
                                                    "tensorpack_data")
    if not os.path.exists(os.environ['TENSORPACK_DATASET']):
        os.mkdir(os.environ['TENSORPACK_DATASET'])

    for name in ['train', 'val']:
        db_filename = 'ILSVRC-%s.lmdb' % name
        db_loc = os.path.join(lmdb_path, db_filename)
        print(f"Processing {args.imagenet} {name} to {db_loc}...")
        ds0 = BinaryILSVRC12(args.imagenet, name)
        ds1 = MultiProcessRunnerZMQ(ds0, num_proc=1)
        LMDBSerializer.save(ds1, db_loc)
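
The resulting databases store [jpeg_bytes, label] datapoints, so a consumer must decode component 0 itself. A hypothetical reader for the files written above; the path and buffer size are illustrative:

import cv2
from tensorpack.dataflow import LMDBSerializer, LocallyShuffleData, MapDataComponent

ds = LMDBSerializer.load("ILSVRC-train.lmdb", shuffle=False)
ds = LocallyShuffleData(ds, 50000)  # approximate shuffle within a buffer
ds = MapDataComponent(ds, lambda x: cv2.imdecode(x, cv2.IMREAD_COLOR), 0)
ds.reset_state()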