# Example 1
# 0
def split_processing(dataset, api, args, resume,
                     multi_label_data=None, session_file=None,
                     path=None, log=None):
    """Splits a dataset into train and test datasets

    Both parts are built by sampling the source dataset at a rate of
    ``1 - args.test_split``: the train split keeps the sampled rows
    (out_of_bag=False) while the test split keeps the complement
    (out_of_bag=True).  Returns (train_dataset, test_dataset, resume).
    """
    sample_rate = 1 - args.test_split
    # (suffix, name template, rate shown in the dataset name, out_of_bag)
    split_specs = (
        ("train", "%s - train (%s %%)", sample_rate, False),
        ("test", "%s - test (%s %%)", args.test_split, True),
    )
    split_datasets = {}
    for suffix, template, shown_rate, out_of_bag in split_specs:
        alternative_args = r.set_dataset_split_args(
            template % (args.name, int(shown_rate * 100)),
            args.description_, args, sample_rate,
            out_of_bag=out_of_bag,
            multi_label_data=multi_label_data)
        split_datasets[suffix], resume = alternative_dataset_processing(
            dataset, suffix, alternative_args, api, args, resume,
            session_file=session_file, path=path, log=log)
    return split_datasets["train"], split_datasets["test"], resume
# Example 2
# 0
def split_processing(dataset, api, args, resume,
                     multi_label_data=None, session_file=None,
                     path=None, log=None):
    """Splits a dataset into train and test datasets

    The train part samples the source at ``1 - args.test_split``
    (out_of_bag=False); the test part takes the complementary rows
    (out_of_bag=True).  Returns (train_dataset, test_dataset, resume).
    """
    sample_rate = 1 - args.test_split

    def _split(suffix, template, shown_rate, out_of_bag, resume):
        # Builds one split dataset and threads the resume flag through.
        split_args = r.set_dataset_split_args(
            template % (args.name, int(shown_rate * 100)),
            args.description_, args, sample_rate,
            out_of_bag=out_of_bag,
            multi_label_data=multi_label_data)
        return alternative_dataset_processing(
            dataset, suffix, split_args, api, args, resume,
            session_file=session_file, path=path, log=log)

    train_dataset, resume = _split(
        "train", "%s - train (%s %%)", sample_rate, False, resume)
    test_dataset, resume = _split(
        "test", "%s - test (%s %%)", args.test_split, True, resume)
    return train_dataset, test_dataset, resume
# Example 3
# 0
def split_range_processing(dataset, api, args, resume,
                           multi_label_data=None, session_file=None,
                           path=None, log=None):
    """Splits a dataset into train and test datasets using ranges

    The rows are partitioned by index: the first ``1 - args.test_split``
    fraction becomes the train dataset, the rest the test dataset.
    NOTE: ``args.range_`` and ``args.test_split`` are mutated here.
    Returns (train_dataset, test_dataset, resume).
    """
    test_rate = args.test_split
    train_rate = 1 - test_rate
    total_rows = dataset["object"]["rows"]
    boundary = int(total_rows * train_rate)
    # Train split covers rows [1, boundary]; sampling split is disabled
    # so only the range drives the split.
    args.range_ = [1, boundary]
    args.test_split = 0
    train_args = r.set_dataset_split_args(
        "%s - train (%s %%)" % (
            args.name, int(train_rate * 100)),
        args.description_, args,
        multi_label_data=multi_label_data)
    train_dataset, resume = alternative_dataset_processing(
        dataset, "train", train_args, api, args,
        resume, session_file=session_file, path=path, log=log)
    # Test split covers the remaining rows [boundary + 1, total_rows].
    args.range_ = [boundary + 1, total_rows]
    test_args = r.set_dataset_split_args(
        "%s - test (%s %%)" % (
            args.name, int(test_rate * 100)),
        args.description_, args, multi_label_data=multi_label_data)
    test_dataset, resume = alternative_dataset_processing(
        dataset, "test", test_args, api, args,
        resume, session_file=session_file, path=path, log=log)
    return train_dataset, test_dataset, resume
# Example 4
# 0
def split_range_processing(dataset, api, args, resume,
                           multi_label_data=None, session_file=None,
                           path=None, log=None):
    """Splits a dataset into train and test datasets using ranges

    Partitions rows by index: the leading ``1 - args.test_split``
    fraction goes to the train dataset and the trailing rows to the
    test dataset.  NOTE: mutates ``args.range_`` and ``args.test_split``.
    Returns (train_dataset, test_dataset, resume).
    """
    test_rate = args.test_split
    train_rate = 1 - test_rate
    rows = dataset["object"]["rows"]
    cut = int(rows * train_rate)
    # Disable sampling-based splitting; the row ranges below do the work.
    args.test_split = 0
    # (suffix, name template, rate shown in the name, [first, last] rows)
    split_specs = (
        ("train", "%s - train (%s %%)", train_rate, [1, cut]),
        ("test", "%s - test (%s %%)", test_rate, [cut + 1, rows]),
    )
    produced = {}
    for suffix, template, shown_rate, row_range in split_specs:
        args.range_ = row_range
        split_args = r.set_dataset_split_args(
            template % (args.name, int(shown_rate * 100)),
            args.description_, args,
            multi_label_data=multi_label_data)
        produced[suffix], resume = alternative_dataset_processing(
            dataset, suffix, split_args, api, args, resume,
            session_file=session_file, path=path, log=log)
    return produced["train"], produced["test"], resume
# Example 5
# 0
def split_processing(dataset, name, description, api, args, resume,
                     session_file=None, path=None, log=None):
    """Splits a dataset into train and test datasets
    """
    sample_rate = 1 - args.test_split

    def _recover(checkpoint_suffix):
        # When resuming, try to extract the split dataset from log files.
        message = u.dated("Dataset not found. Resuming.\n")
        return c.checkpoint(
            c.is_dataset_created, path, checkpoint_suffix, debug=args.debug,
            message=message, log_file=session_file, console=args.verbosity)

    def _create(suffix, template, shown_rate, out_of_bag):
        # Creates one split dataset and fetches its finished resource.
        split_args = r.set_dataset_split_args(
            template % (name, int(shown_rate * 100)), description, args,
            sample_rate, out_of_bag=out_of_bag)
        created = r.create_dataset(
            dataset, split_args, args, api, path, session_file,
            log, suffix)
        if created:
            created = r.get_dataset(created, api, args.verbosity,
                                    session_file)
        return created

    train_dataset = None
    if resume:
        resume, train_dataset = _recover("_train")
    if train_dataset is None:
        train_dataset = _create(
            "train", "%s - train (%s %%)", sample_rate, False)

    test_dataset = None
    if resume:
        resume, test_dataset = _recover("_test")
    if test_dataset is None:
        test_dataset = _create(
            "test", "%s - test (%s %%)", args.test_split, True)
    return train_dataset, test_dataset, resume
# Example 6
# 0
def split_processing(dataset, name, description, api, args, resume,
                     session_file=None, path=None, log=None):
    """Splits a dataset into train and test datasets
    """
    sample_rate = 1 - args.test_split
    results = {}
    # (suffix, checkpoint suffix, name template, shown rate, out_of_bag)
    split_specs = (
        ("train", "_train", "%s - train (%s %%)", sample_rate, False),
        ("test", "_test", "%s - test (%s %%)", args.test_split, True),
    )
    for suffix, checkpoint_suffix, template, shown_rate, out_of_bag \
            in split_specs:
        split_dataset = None
        if resume:
            # When resuming, try to recover the split from log files.
            message = u.dated("Dataset not found. Resuming.\n")
            resume, split_dataset = c.checkpoint(
                c.is_dataset_created, path, checkpoint_suffix,
                debug=args.debug, message=message,
                log_file=session_file, console=args.verbosity)
        if split_dataset is None:
            split_args = r.set_dataset_split_args(
                template % (name, int(shown_rate * 100)),
                description, args, sample_rate, out_of_bag=out_of_bag)
            split_dataset = r.create_dataset(
                dataset, split_args, args, api, path, session_file,
                log, suffix)
            if split_dataset:
                split_dataset = r.get_dataset(
                    split_dataset, api, args.verbosity, session_file)
        results[suffix] = split_dataset
    return results["train"], results["test"], resume