Example #1
def create_ensembles(datasets, ensemble_ids, ensemble_args, args,
                     number_of_ensembles=1,
                     api=None, path=None, session_file=None, log=None):
    """Create ensembles from input data

    """

    if api is None:
        api = bigml.api.BigML()
    ensembles = ensemble_ids[:]
    existing_ensembles = len(ensembles)
    models = []
    model_ids = []
    ensemble_args_list = []
    if isinstance(ensemble_args, list):
        ensemble_args_list = ensemble_args
    if args.dataset_off and args.evaluate:
        args.test_dataset_ids = datasets[:]
    if not args.multi_label:
        datasets = datasets[existing_ensembles:]
    if number_of_ensembles > 0:
        message = dated("Creating %s.\n" %
                        plural("ensemble", number_of_ensembles))
        log_message(message, log_file=session_file,
                    console=args.verbosity)
        inprogress = []
        for i in range(0, number_of_ensembles):
            wait_for_available_tasks(inprogress, args.max_parallel_ensembles,
                                     api, "ensemble",
                                     wait_step=args.number_of_models)

            if ensemble_args_list:
                ensemble_args = ensemble_args_list[i]

            if args.dataset_off and args.evaluate:
                multi_dataset = args.test_dataset_ids[:]
                del multi_dataset[i + existing_ensembles]
                ensemble = api.create_ensemble(multi_dataset,
                                               ensemble_args,
                                               retries=None)
            else:
                ensemble = api.create_ensemble(datasets, ensemble_args,
                                               retries=None)
            ensemble_id = check_resource_error(ensemble,
                                               "Failed to create ensemble: ")
            log_message("%s\n" % ensemble_id, log_file=log)
            ensemble_ids.append(ensemble_id)
            inprogress.append(ensemble_id)
            ensembles.append(ensemble)
            log_created_resources("ensembles", path, ensemble_id,
                                  mode='a')
        models, model_ids = retrieve_ensembles_models(ensembles, api, path)
        if number_of_ensembles < 2 and args.verbosity:
            message = dated("Ensemble created: %s\n" %
                            get_url(ensemble))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, ensemble)

    return ensembles, ensemble_ids, models, model_ids
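
A minimal driver sketch for the helper above (and for the BigMLer-style helpers that follow). Everything in it is illustrative: the dataset id is a placeholder, credentials are read from the BIGML_USERNAME/BIGML_API_KEY environment by the bindings, and the Namespace carries only the command-line flags the function actually reads:

from argparse import Namespace

# Illustrative flag values; only the attributes create_ensembles reads.
args = Namespace(dataset_off=False, evaluate=False, multi_label=False,
                 verbosity=1, max_parallel_ensembles=1, number_of_models=10,
                 reports=None)

ensembles, ensemble_ids, models, model_ids = create_ensembles(
    ["dataset/000000000000000000000000"],  # placeholder dataset id
    [],                                    # no ensembles from a previous run
    {"number_of_models": 10},              # ensemble creation arguments
    args, path="./output", session_file="session.log")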
Example #2
def create_samples(datasets,
                   sample_ids,
                   sample_args,
                   args,
                   api=None,
                   path=None,
                   session_file=None,
                   log=None):
    """Create remote samples

    """
    if api is None:
        api = bigml.api.BigML()

    samples = sample_ids[:]
    existing_samples = len(samples)
    sample_args_list = []
    datasets = datasets[existing_samples:]
    # if resuming and all samples were created, there will be no datasets left
    if datasets:
        if isinstance(sample_args, list):
            sample_args_list = sample_args

        # Only one sample per command, at present
        number_of_samples = 1
        max_parallel_samples = 1
        message = dated("Creating %s.\n" % plural("sample", number_of_samples))
        log_message(message, log_file=session_file, console=args.verbosity)

        inprogress = []
        for i in range(0, number_of_samples):
            wait_for_available_tasks(inprogress, max_parallel_samples, api,
                                     "sample")
            if sample_args_list:
                sample_args = sample_args_list[i]

            sample = api.create_sample(datasets[i], sample_args, retries=None)
            sample_id = check_resource_error(sample,
                                             "Failed to create sample: ")
            log_message("%s\n" % sample_id, log_file=log)
            sample_ids.append(sample_id)
            inprogress.append(sample_id)
            samples.append(sample)
            log_created_resources("samples", path, sample_id, mode='a')

        if args.verbosity:
            if bigml.api.get_status(sample)['code'] != bigml.api.FINISHED:
                try:
                    sample = check_resource(sample,
                                            api.get_sample,
                                            raise_on_error=True)
                except Exception as exception:
                    sys.exit("Failed to get a finished sample: %s" %
                             str(exception))
                samples[0] = sample
            message = dated("Sample created: %s\n" % get_url(sample))
            log_message(message, log_file=session_file, console=args.verbosity)
            if args.reports:
                report(args.reports, path, sample)

    return samples, sample_ids
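
A sketch of the resume behavior shared by these helpers: because the function slices one input dataset off per id already present in sample_ids, passing the ids recovered from a previous run turns the call into a no-op once everything was created (ids are placeholders):

from argparse import Namespace

args = Namespace(verbosity=1, reports=None)
recovered_ids = ["sample/000000000000000000000000"]    # placeholder id
samples, sample_ids = create_samples(
    ["dataset/000000000000000000000000"],              # placeholder dataset
    recovered_ids, {"name": "my sample"}, args)
# datasets[1:] is empty here, so no new sample is created on resume.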
Example #3
def create_clusters(datasets, cluster_ids, cluster_args,
                    args, api=None, path=None,
                    session_file=None, log=None):
    """Create remote clusters

    """
    if api is None:
        api = bigml.api.BigML()

    clusters = cluster_ids[:]
    existing_clusters = len(clusters)
    cluster_args_list = []
    datasets = datasets[existing_clusters:]
    # if resuming and all clusters were created, there will be no datasets left
    if datasets:
        if isinstance(cluster_args, list):
            cluster_args_list = cluster_args

        # Only one cluster per command, at present
        number_of_clusters = 1
        message = dated("Creating %s.\n" %
                        plural("cluster", number_of_clusters))
        log_message(message, log_file=session_file,
                    console=args.verbosity)

        query_string = FIELDS_QS
        inprogress = []
        for i in range(0, number_of_clusters):
            wait_for_available_tasks(inprogress, args.max_parallel_clusters,
                                     api, "cluster")
            if cluster_args_list:
                cluster_args = cluster_args_list[i]

            cluster = api.create_cluster(datasets, cluster_args, retries=None)
            cluster_id = check_resource_error(cluster,
                                              "Failed to create cluster: ")
            log_message("%s\n" % cluster_id, log_file=log)
            cluster_ids.append(cluster_id)
            inprogress.append(cluster_id)
            clusters.append(cluster)
            log_created_resources("clusters", path, cluster_id, mode='a')

        if args.verbosity:
            if bigml.api.get_status(cluster)['code'] != bigml.api.FINISHED:
                try:
                    cluster = check_resource(cluster, api.get_cluster,
                                             query_string=query_string,
                                             raise_on_error=True)
                except Exception as exception:
                    sys.exit("Failed to get a finished cluster: %s" %
                             str(exception))
                clusters[0] = cluster
            message = dated("Cluster created: %s\n" %
                            get_url(cluster))
            log_message(message, log_file=session_file,
                        console=args.verbosity)
            if args.reports:
                report(args.reports, path, cluster)

    return clusters, cluster_ids
Example #4
def create_fusion(models,
                  fusion,
                  fusion_args,
                  args,
                  api=None,
                  path=None,
                  session_file=None,
                  log=None):
    """Create remote fusion

    """
    if api is None:
        api = bigml.api.BigML()

    fusions = []
    fusion_ids = []
    if fusion is not None:
        fusions = [fusion]
        fusion_ids = [fusion]
    # if resuming and all fusions were created
    if models:

        # Only one fusion per command, at present
        message = dated("Creating fusion.\n")
        log_message(message, log_file=session_file, console=args.verbosity)

        query_string = FIELDS_QS
        inprogress = []
        wait_for_available_tasks(inprogress, args.max_parallel_fusions, api,
                                 "fusion")

        fusion = api.create_fusion(models, fusion_args, retries=None)
        fusion_id = check_resource_error( \
            fusion,
            "Failed to create fusion: ")
        log_message("%s\n" % fusion_id, log_file=log)
        fusion_ids.append(fusion_id)
        inprogress.append(fusion_id)
        fusions.append(fusion)
        log_created_resources("fusions", path, fusion_id, mode='a')

        if args.verbosity:
            if bigml.api.get_status(fusion)['code'] != bigml.api.FINISHED:
                try:
                    fusion = check_resource( \
                        fusion, api.get_fusion,
                        query_string=query_string,
                        raise_on_error=True)
                except Exception as exception:
                    sys.exit("Failed to get a finished fusion: %s" %
                             str(exception))
                fusions[0] = fusion
            message = dated("Fusion created: %s\n" % get_url(fusion))
            log_message(message, log_file=session_file, console=args.verbosity)
            if args.reports:
                report(args.reports, path, fusion)

    return fusions, fusion_ids
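
A hedged call sketch for the fusion helper: BigML fusions combine supervised resources, so the models list may mix resource types (ids are placeholders):

from argparse import Namespace

args = Namespace(max_parallel_fusions=1, verbosity=1, reports=None)
fusions, fusion_ids = create_fusion(
    ["model/000000000000000000000000",                 # placeholder ids
     "logisticregression/000000000000000000000000"],
    None,                        # no fusion recovered from a previous run
    {"name": "my fusion"}, args)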
Example #5
def create_anomalies(datasets,
                     anomaly_ids,
                     anomaly_args,
                     args,
                     api=None,
                     path=None,
                     session_file=None,
                     log=None):
    """Create remote anomalies

    """
    if api is None:
        api = bigml.api.BigML()

    anomalies = anomaly_ids[:]
    existing_anomalies = len(anomalies)
    anomaly_args_list = []
    datasets = datasets[existing_anomalies:]
    # if resuming and all anomalies were created,
    # there will be no datasets left
    if datasets:
        if isinstance(anomaly_args, list):
            anomaly_args_list = anomaly_args

        # Only one anomaly per command, at present
        number_of_anomalies = 1
        message = dated("Creating %s.\n" %
                        plural("anomaly detector", number_of_anomalies))
        log_message(message, log_file=session_file, console=args.verbosity)

        query_string = FIELDS_QS
        inprogress = []
        for i in range(0, number_of_anomalies):
            wait_for_available_tasks(inprogress, args.max_parallel_anomalies,
                                     api, "anomaly")
            if anomaly_args_list:
                anomaly_args = anomaly_args_list[i]

            anomaly = api.create_anomaly(datasets, anomaly_args, retries=None)
            anomaly_id = check_resource_error(anomaly,
                                              "Failed to create anomaly: ")
            log_message("%s\n" % anomaly_id, log_file=log)
            anomaly_ids.append(anomaly_id)
            inprogress.append(anomaly_id)
            anomalies.append(anomaly)
            log_created_resources("anomalies", path, anomaly_id, mode='a')

        if args.verbosity:
            if bigml.api.get_status(anomaly)['code'] != bigml.api.FINISHED:
                try:
                    anomaly = check_resource(anomaly,
                                             api.get_anomaly,
                                             query_string=query_string,
                                             raise_on_error=True)
                except Exception as exception:
                    sys.exit("Failed to get a finished anomaly: %s" %
                             str(exception))
                anomalies[0] = anomaly
            message = dated("Anomaly created: %s\n" % get_url(anomaly))
            log_message(message, log_file=session_file, console=args.verbosity)
            if args.reports:
                report(args.reports, path, anomaly)

    return anomalies, anomaly_ids
Example #6
def create_logistic_regressions(datasets,
                                logistic_regression_ids,
                                logistic_regression_args,
                                args,
                                api=None,
                                path=None,
                                session_file=None,
                                log=None):
    """Create remote logistic regressions

    """
    if api is None:
        api = bigml.api.BigML()

    logistic_regressions = logistic_regression_ids[:]
    existing_logistic_regressions = len(logistic_regressions)
    logistic_regression_args_list = []
    datasets = datasets[existing_logistic_regressions:]
    # if resuming and all logistic regressions were created,
    # there will be no datasets left
    if datasets:
        if isinstance(logistic_regression_args, list):
            logistic_regression_args_list = logistic_regression_args

        # Only one logistic regression per command, at present
        number_of_logistic_regressions = 1
        message = dated(
            "Creating %s.\n" %
            plural("logistic regression", number_of_logistic_regressions))
        log_message(message, log_file=session_file, console=args.verbosity)

        query_string = FIELDS_QS
        inprogress = []
        for i in range(0, number_of_logistic_regressions):
            wait_for_available_tasks(inprogress,
                                     args.max_parallel_logistic_regressions,
                                     api, "logisticregression")
            if logistic_regression_args_list:
                logistic_regression_args = logistic_regression_args_list[i]
            if args.cross_validation_rate > 0:
                new_seed = get_basic_seed(i + existing_logistic_regressions)
                logistic_regression_args.update(seed=new_seed)

            if (args.test_datasets and args.evaluate):
                dataset = datasets[i]
                logistic_regression = api.create_logistic_regression( \
                    dataset, logistic_regression_args, retries=None)
            elif args.dataset_off and args.evaluate:
                multi_dataset = args.test_dataset_ids[:]
                del multi_dataset[i + existing_logistic_regressions]
                logistic_regression = api.create_logistic_regression( \
                    multi_dataset, logistic_regression_args, retries=None)
            else:
                logistic_regression = api.create_logistic_regression( \
                    datasets, logistic_regression_args, retries=None)
            logistic_regression_id = check_resource_error( \
                logistic_regression, "Failed to create logistic regression: ")
            log_message("%s\n" % logistic_regression_id, log_file=log)
            logistic_regression_ids.append(logistic_regression_id)
            inprogress.append(logistic_regression_id)
            logistic_regressions.append(logistic_regression)
            log_created_resources("logistic_regressions",
                                  path,
                                  logistic_regression_id,
                                  mode='a')

        if args.verbosity:
            if bigml.api.get_status(logistic_regression)['code'] != \
                    bigml.api.FINISHED:
                try:
                    logistic_regression = check_resource( \
                        logistic_regression, api.get_logistic_regression,
                        query_string=query_string, raise_on_error=True)
                except Exception as exception:
                    sys.exit("Failed to get a finished logistic regression:"
                             " %s" % str(exception))
                logistic_regressions[0] = logistic_regression
            message = dated("Logistic regression created: %s\n" %
                            get_url(logistic_regression))
            log_message(message, log_file=session_file, console=args.verbosity)
            if args.reports:
                report(args.reports, path, logistic_regression)

    return logistic_regressions, logistic_regression_ids
Example #7
def create_models(datasets, model_ids, model_args,
                  args, api=None, path=None,
                  session_file=None, log=None):
    """Create remote models

    """
    if api is None:
        api = bigml.api.BigML()

    models = model_ids[:]
    existing_models = len(models)
    model_args_list = []
    if args.dataset_off and args.evaluate:
        args.test_dataset_ids = datasets[:]
    if not args.multi_label:
        datasets = datasets[existing_models:]
    # if resuming and all models were created, there will be no datasets left
    if datasets:
        dataset = datasets[0]
        if isinstance(model_args, list):
            model_args_list = model_args
        if args.number_of_models > 0:
            message = dated("Creating %s.\n" %
                            plural("model", args.number_of_models))
            log_message(message, log_file=session_file,
                        console=args.verbosity)

            single_model = args.number_of_models == 1 and existing_models == 0
            # if there's more than one model the first one must contain
            # the entire field structure to be used as reference.
            query_string = (FIELDS_QS if single_model and (args.test_header \
                and not args.export_fields) else ALL_FIELDS_QS)
            inprogress = []
            for i in range(0, args.number_of_models):
                wait_for_available_tasks(inprogress, args.max_parallel_models,
                                         api, "model")
                if model_args_list:
                    model_args = model_args_list[i]
                if args.cross_validation_rate > 0:
                    new_seed = get_basic_seed(i + existing_models)
                    model_args.update(seed=new_seed)
                # one model per dataset (--max-categories or single model)
                if (args.max_categories > 0 or
                        (args.test_datasets and args.evaluate)):
                    dataset = datasets[i]
                    model = api.create_model(dataset, model_args, retries=None)
                elif args.dataset_off and args.evaluate:
                    multi_dataset = args.test_dataset_ids[:]
                    del multi_dataset[i + existing_models]
                    model = api.create_model(multi_dataset, model_args,
                                             retries=None)
                else:
                    model = api.create_model(datasets, model_args,
                                             retries=None)
                model_id = check_resource_error(model,
                                                "Failed to create model: ")
                log_message("%s\n" % model_id, log_file=log)
                model_ids.append(model_id)
                inprogress.append(model_id)
                models.append(model)
                log_created_resources("models", path, model_id, mode='a')

            if args.number_of_models < 2 and args.verbosity:
                if bigml.api.get_status(model)['code'] != bigml.api.FINISHED:
                    try:
                        model = check_resource(model, api.get_model,
                                               query_string=query_string,
                                               raise_on_error=True)
                    except Exception as exception:
                        sys.exit("Failed to get a finished model: %s" %
                                 str(exception))
                    models[0] = model
                message = dated("Model created: %s\n" %
                                get_url(model))
                log_message(message, log_file=session_file,
                            console=args.verbosity)
                if args.reports:
                    report(args.reports, path, model)

    return models, model_ids
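
The dataset_off branch above trains each model on a leave-one-out list: copy every test dataset id, then delete the one the i-th model will be evaluated on (offset by existing_models when resuming). A standalone illustration of that construction, with placeholder ids:

# Leave-one-out lists as built by the dataset_off branch (no resume offset).
test_dataset_ids = ["dataset/aa0", "dataset/aa1", "dataset/aa2"]
for i, held_out in enumerate(test_dataset_ids):
    multi_dataset = test_dataset_ids[:]
    del multi_dataset[i]         # train on every dataset but the i-th
    print("%s is evaluated with a model trained on %s" %
          (held_out, multi_dataset))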
Example #8
def create_time_series(datasets,
                       time_series_ids,
                       time_series_args,
                       args,
                       api=None,
                       path=None,
                       session_file=None,
                       log=None):
    """Create remote time-series

    """
    if api is None:
        api = bigml.api.BigML()

    time_series_set = time_series_ids[:]
    existing_time_series = len(time_series_set)
    time_series_args_list = []
    datasets = datasets[existing_time_series:]
    # if resuming and all time-series were created,
    # there will be no datasets left
    if datasets:
        if isinstance(time_series_args, list):
            time_series_args_list = time_series_args

        # Only one time-series per command, at present
        number_of_time_series = 1
        message = dated("Creating %s time-series.\n" % number_of_time_series)
        log_message(message, log_file=session_file, console=args.verbosity)

        query_string = FIELDS_QS
        inprogress = []
        for i in range(0, number_of_time_series):
            wait_for_available_tasks(inprogress, args.max_parallel_time_series,
                                     api, "timeseries")
            if time_series_args_list:
                time_series_args = time_series_args_list[i]

            if (args.test_datasets and args.evaluate):
                dataset = datasets[i]
                time_series = api.create_time_series( \
                    dataset, time_series_args, retries=None)
            else:
                time_series = api.create_time_series( \
                    datasets, time_series_args, retries=None)
            time_series_id = check_resource_error( \
                time_series, "Failed to create time-series: ")
            log_message("%s\n" % time_series_id, log_file=log)
            time_series_ids.append(time_series_id)
            inprogress.append(time_series_id)
            time_series_set.append(time_series)
            log_created_resources("time_series",
                                  path,
                                  time_series_id,
                                  mode='a')

        if args.verbosity:
            if bigml.api.get_status(time_series)['code'] != \
                    bigml.api.FINISHED:
                try:
                    time_series = check_resource( \
                        time_series, api.get_time_series,
                        query_string=query_string,
                        raise_on_error=True)
                except Exception as exception:
                    sys.exit("Failed to get a finished time-series:"
                             " %s" % str(exception))
                time_series_set[0] = time_series
            message = dated("Time-series created: %s\n" % get_url(time_series))
            log_message(message, log_file=session_file, console=args.verbosity)
            if args.reports:
                report(args.reports, path, time_series)

    return time_series_set, time_series_ids
Example #9
def create_pca(datasets,
               pca,
               pca_args,
               args,
               api=None,
               path=None,
               session_file=None,
               log=None):
    """Create remote pcas

    """
    if api is None:
        api = bigml.api.BigML()

    pcas = []
    pca_ids = []
    if pca is not None:
        pcas = [pca]
        pca_ids = [pca]
    existing_pcas = len(pcas)
    pca_args_list = []
    datasets = datasets[existing_pcas:]
    # if resuming and all pcas were created, there will
    # be no datasets left
    if datasets:
        if isinstance(pca_args, list):
            pca_args_list = pca_args

        # Only one pca per command, at present
        number_of_pcas = 1
        message = dated("Creating %s.\n" % plural("pca", number_of_pcas))
        log_message(message, log_file=session_file, console=args.verbosity)

        query_string = FIELDS_QS
        inprogress = []
        for i in range(0, number_of_pcas):
            wait_for_available_tasks(inprogress, args.max_parallel_pcas, api,
                                     "pca")
            if pca_args_list:
                pca_args = pca_args_list[i]

            pca = api.create_pca(datasets, pca_args, retries=None)
            pca_id = check_resource_error( \
                pca,
                "Failed to create pca: ")
            log_message("%s\n" % pca_id, log_file=log)
            pca_ids.append(pca_id)
            inprogress.append(pca_id)
            pcas.append(pca)
            log_created_resources("pcas", path, pca_id, mode='a')

        if args.verbosity:
            if bigml.api.get_status(pca)['code'] != bigml.api.FINISHED:
                try:
                    pca = check_resource( \
                        pca, api.get_pca,
                        query_string=query_string,
                        raise_on_error=True)
                except Exception as exception:
                    sys.exit("Failed to get a finished pca: %s" %
                             str(exception))
                pcas[0] = pca
            message = dated("PCA created: %s\n" % get_url(pca))
            log_message(message, log_file=session_file, console=args.verbosity)
            if args.reports:
                report(args.reports, path, pca)

    return pcas, pca_ids
Example #10
def create_topic_models(datasets,
                        topic_model_ids,
                        topic_model_args,
                        args,
                        api=None,
                        path=None,
                        session_file=None,
                        log=None):
    """Create remote topic models

    """
    if api is None:
        api = bigml.api.BigML()

    topic_models = topic_model_ids[:]
    existing_topic_models = len(topic_models)
    topic_model_args_list = []
    datasets = datasets[existing_topic_models:]
    # if resuming and all topic models were created, there will
    # be no datasets left
    if datasets:
        if isinstance(topic_model_args, list):
            topic_model_args_list = topic_model_args

        # Only one topic model per command, at present
        number_of_topic_models = 1
        message = dated("Creating %s.\n" %
                        plural("topic model", number_of_topic_models))
        log_message(message, log_file=session_file, console=args.verbosity)

        query_string = FIELDS_QS
        inprogress = []
        for i in range(0, number_of_topic_models):
            wait_for_available_tasks(inprogress,
                                     args.max_parallel_topic_models, api,
                                     "topicmodel")
            if topic_model_args_list:
                topic_model_args = topic_model_args_list[i]

            topic_model = api.create_topic_model(datasets,
                                                 topic_model_args,
                                                 retries=None)
            topic_model_id = check_resource_error( \
                topic_model,
                "Failed to create topic model: ")
            log_message("%s\n" % topic_model_id, log_file=log)
            topic_model_ids.append(topic_model_id)
            inprogress.append(topic_model_id)
            topic_models.append(topic_model)
            log_created_resources("topic_models",
                                  path,
                                  topic_model_id,
                                  mode='a')

        if args.verbosity:
            if bigml.api.get_status(topic_model)['code'] != bigml.api.FINISHED:
                try:
                    topic_model = check_resource( \
                        topic_model, api.get_topic_model,
                        query_string=query_string,
                        raise_on_error=True)
                except Exception as exception:
                    sys.exit("Failed to get a finished topic model: %s" %
                             str(exception))
                topic_models[0] = topic_model
            message = dated("Topic model created: %s\n" % get_url(topic_model))
            log_message(message, log_file=session_file, console=args.verbosity)
            if args.reports:
                report(args.reports, path, topic_model)

    return topic_models, topic_model_ids
Example #11
def create_evaluations(model_or_ensemble_ids, datasets, evaluation_args,
                       args, api=None,
                       path=None, session_file=None, log=None,
                       existing_evaluations=0):
    """Create evaluations for a list of models

       ``model_or_ensemble_ids``: list of model or ensemble ids to create
                                  an evaluation of
       ``datasets``: dataset objects or ids to evaluate with
       ``evaluation_args``: arguments for the ``create_evaluation`` call
       ``args``: input values for bigmler flags
       ``api``: api to remote objects in BigML
       ``path``: directory to store the BigMLer generated files in
       ``session_file``: file to store the messages of that session
       ``log``: user provided log file
       ``existing_evaluations``: evaluations found when attempting resume
    """

    evaluations = []
    dataset = datasets[0]
    evaluation_args_list = []
    if isinstance(evaluation_args, list):
        evaluation_args_list = evaluation_args
    if api is None:
        api = bigml.api.BigML()
    remaining_ids = model_or_ensemble_ids[existing_evaluations:]
    if args.test_dataset_ids or args.dataset_off:
        remaining_datasets = datasets[existing_evaluations:]
    number_of_evaluations = len(remaining_ids)
    message = dated("Creating evaluations.\n")
    log_message(message, log_file=session_file,
                console=args.verbosity)

    inprogress = []
    for i in range(0, number_of_evaluations):
        model = remaining_ids[i]
        if args.test_dataset_ids or args.dataset_off:
            dataset = remaining_datasets[i]
        wait_for_available_tasks(inprogress, args.max_parallel_evaluations,
                                 api, "evaluation")

        if evaluation_args_list:
            evaluation_args = evaluation_args_list[i]
        if args.cross_validation_rate > 0:
            new_seed = get_basic_seed(i + existing_evaluations)
            evaluation_args.update(seed=new_seed)
        evaluation = api.create_evaluation(model, dataset, evaluation_args,
                                           retries=None)
        evaluation_id = check_resource_error(evaluation,
                                             "Failed to create evaluation: ")
        inprogress.append(evaluation_id)
        log_created_resources("evaluations", path, evaluation_id,
                              mode='a')
        evaluations.append(evaluation)
        log_message("%s\n" % evaluation['resource'], log_file=log)

    if (args.number_of_evaluations < 2 and len(evaluations) == 1
            and args.verbosity):
        evaluation = evaluations[0]
        if bigml.api.get_status(evaluation)['code'] != bigml.api.FINISHED:
            try:
                evaluation = check_resource(evaluation, api.get_evaluation,
                                            raise_on_error=True)
            except Exception as exception:
                sys.exit("Failed to get a finished evaluation: %s" %
                         str(exception))
            evaluations[0] = evaluation
        message = dated("Evaluation created: %s\n" %
                        get_url(evaluation))
        log_message(message, log_file=session_file,
                    console=args.verbosity)
        if args.reports:
            report(args.reports, path, evaluation)

    return evaluations
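
A call sketch matching the docstring above; the ids and flag values are illustrative:

from argparse import Namespace

args = Namespace(test_dataset_ids=None, dataset_off=False,
                 max_parallel_evaluations=1, cross_validation_rate=0,
                 number_of_evaluations=1, verbosity=1, reports=None)
evaluations = create_evaluations(
    ["model/000000000000000000000000"],    # placeholder model id
    ["dataset/000000000000000000000000"],  # placeholder test dataset
    {"name": "my evaluation"}, args,
    path="./output", session_file="session.log")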