예제 #1
0
def test_data_stream(test_path, package_path):
    """Check a DataStream built from one DataFrame that holds features and target.

    Parameters
    ----------
    test_path
        Directory that contains ``sea_stream_file.npz`` with the expected
        ``X`` and ``y`` arrays for the first 10 samples.
    package_path
        Root directory under which
        ``src/skmultiflow/data/datasets/sea_stream.csv`` is located.

    Both arguments are presumably pytest fixtures -- confirm against conftest.
    """
    test_file = os.path.join(package_path, 'src/skmultiflow/data/datasets/sea_stream.csv')
    raw_data = pd.read_csv(test_file)
    stream = DataStream(raw_data, name='Test')

    # Only a single data object was passed, so no separate y is expected.
    assert not stream._Y_is_defined

    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == 'Test: 1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    X, y = stream.next_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # last_sample() must return the sample that was just drawn.
    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # After a restart the first 10 samples must match the reference arrays.
    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    assert 'stream' == stream._estimator_type

    expected_info = "DataStream(n_targets=-1, target_idx=1, cat_features=None, name='Test')"
    assert stream.get_info() == expected_info
예제 #2
0
def test_data_stream_X_y(test_path, package_path):
    """Check a DataStream built from separate feature (X) and target (y) frames.

    Parameters
    ----------
    test_path
        Directory that contains ``sea_stream.npz`` with the expected ``X``
        and ``y`` arrays for the first 10 samples.
    package_path
        Root directory under which ``src/skmultiflow/datasets/sea_stream.csv``
        is located.

    Both arguments are presumably pytest fixtures -- confirm against conftest.
    """
    test_file = os.path.join(package_path,
                             'src/skmultiflow/datasets/sea_stream.csv')
    raw_data = pd.read_csv(test_file)
    # Last column is the target, every other column is a feature.
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)

    # y was passed explicitly, so the stream must flag it as defined.
    assert stream._Y_is_defined

    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == '1 target(s), 2 target_values'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    X, y = stream.next_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # last_sample() must return the sample that was just drawn.
    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # After a restart the first 10 samples must match the reference arrays.
    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]
예제 #3
0
def test_data_stream_X_y(test_path):
    """Check a DataStream built from separate X and y frames, then as regression.

    Parameters
    ----------
    test_path
        Directory that contains ``sea_stream_file.csv`` (the input data) and
        ``sea_stream_file.npz`` (expected ``X`` and ``y`` arrays for the first
        10 samples). Presumably a pytest fixture -- confirm against conftest.
    """
    test_file = os.path.join(test_path, 'sea_stream_file.csv')
    raw_data = pd.read_csv(test_file)
    # Last column is the target, every other column is a feature.
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)

    assert stream._Y_is_defined

    assert stream.n_remaining_samples() == 40

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']

    assert stream.n_features == 3

    assert stream.n_cat_features == 0

    assert stream.n_num_features == 3

    assert stream.n_targets == 1

    assert stream.get_data_info() == '1 target(s), 2 classes'

    assert stream.has_more_samples() is True

    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    # np.all replaces np.alltrue, which was removed in NumPy 2.0.
    X, y = stream.next_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # last_sample() must return the sample that was just drawn.
    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    # After a restart the first 10 samples must match the reference arrays.
    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim

    assert stream.n_features == X.shape[1]

    # Ensure that the regression case is also covered
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    # A float target is what this test expects to be reported as regression.
    y = y.astype('float64')
    stream = DataStream(X, y, name='Test')

    assert stream.task_type == 'regression'
    assert stream.get_data_info() == 'Test: 1 target(s)'
def main():
    """Run the classification experiment described by the command-line arguments.

    For every dataset in ``args.datasets`` the data is loaded, its instances
    are repeated ``copies[idx]`` times, wrapped in a ``DataStream`` and passed
    with each requested model to ``evaluar``. Afterwards the collected
    training/evaluation statistics are written to ``results.csv``, the true
    and predicted labels of each model/dataset pair to their own CSV files,
    and the run metadata to ``metadata.json``, all inside an output directory
    named ``<time.strftime(TIME_STR)>_classification``.

    The process exits with status 0 when ``valid_args`` rejects the arguments
    or when ``args.streams`` is set (stream classification is not implemented
    yet; note that this exit happens after the datasets were classified and
    before any result is saved).
    """
    args = parser.parse_args()
    # Named ``logger`` so the local does not shadow the stdlib ``logging`` name.
    logger = set_logger(args.verbose)
    if not valid_args(args):
        sys.exit(0)

    datasets = args.datasets
    models = [i.lower() for i in args.models]
    copies = [int(i) for i in args.copies]

    # Relative paths are resolved against the directory of this script.
    dir_path = os.path.dirname(os.path.realpath(__file__))
    to_absolute = curry(to_absolute_path)(dir_path)

    metadata = {
        "experimento": args.experiment or "",
        "command": " ".join(sys.argv),
        "date": time.strftime("%Y%m%d%H%M%S"),
        "models": models,
        "copies": copies,
        "datasets": []
    }
    logger.debug(metadata)

    # DATASET CLASSIFICATION ######
    all_train_data = []
    true_vs_pred = []
    logger.debug(datasets)
    for idx, dataset in enumerate(datasets):
        logger.info("Classifying dataset %s", dataset)
        logger.debug("Loading dataset: %s", dataset)
        x_stream, y_stream, _, label_names = load_given_dataset(dataset)
        logger.debug("Copies per instance: %s", copies[idx])
        x_stream, y_stream = repeatInstances(
            x_stream.todense(), y_stream.todense(), copies=copies[idx])

        data_stream = DataStream(data=x_stream,
                                 y=y_stream, name=dataset)
        # Total of the per-row label sums divided by the number of rows.
        cardinality = sum(np.sum(y_stream, axis=1)
                          ) / y_stream.shape[0]
        dataset_metadata = {
            "name": dataset,
            "instances": data_stream.n_remaining_samples(),
            "x_shape": x_stream.shape,
            "y_shape": y_stream.shape,
            "cardinality": cardinality,
            "label_names": [i[0] for i in label_names],
            "copies": copies[idx]
        }
        logger.debug(dataset_metadata)

        for model_id in models:
            model = SUPPORTED_MODELS[model_id]
            logger.info(model["name"])
            train_data = {"model": model["name"], "model_id": model_id,
                          "stream": data_stream.name, "copies": copies[idx]}
            train_stats, true_labels, predictions = evaluar(
                data_stream,
                model["model"](data_stream),
                pretrain_size=args.pretrainsize,
                ensemble=model["ensemble"],
                catch_errors=args.catch,
                logging=logger,
                train_logs_max=100000,
                window_size=20
            )
            eval_stats = {}
            # Metrics are only computed when both label collections are non-empty.
            if true_labels and predictions:
                logger.info("Evaluating...")
                eval_stats = evaluation_metrics(
                    true_labels,
                    predictions,
                    train_stats["start_time"],
                    train_stats["end_time"]
                )
                true_vs_pred.append({
                    "model": model_id,
                    "dataset": dataset,
                    "true": true_labels,
                    "pred": predictions
                })
            train_data.update(train_stats)
            train_data.update(eval_stats)
            all_train_data.append(train_data)
            # Rewind the stream so the next model starts from the first sample.
            data_stream.restart()

        metadata["datasets"].append(dataset_metadata)
        # Free memory
        del x_stream, y_stream, data_stream

    # END DATASET CLASSIFICATION ######

    # STREAM ANALYSIS ######

    if args.streams:
        print("Stream classification. Not yet implemented.")
        sys.exit(0)
        # NOTE: everything below in this branch is unreachable until the early
        # exit above is removed. It is kept as scaffolding, with the metadata
        # loop placed outside the error branch so that it can run once enabled.
        stream_names = args.streamsnames or []
        if len(stream_names) != len(args.streams):
            logger.error(
                "La cantidad de streams y la cantidad de nombres" +
                " de streams no coinciden."
            )
            sys.exit(1)
        metadata["syn_streams"] = []
        for idx, i in enumerate(args.streams):
            stream_path = to_absolute(i)
            stream_name = stream_names[idx]

            logger.info("Classifying syn stream: %s", stream_name)

            logger.info("Loading syn stream to memory")
            _, y_syn, _, _ = load_moa_stream(stream_path, args.labels)

            cardinality = sum(
                np.sum(y_syn.toarray(), axis=1)
            ) / y_syn.toarray().shape[0]

            metadata["syn_streams"].append({
                "labels": args.labels,
                "stream_path": stream_path,
                "stream_name": stream_name,
                "y_shape": y_syn.shape,
                "cardinality": cardinality,
            })

    # END STREAM ANALYSIS ######

    default_output_path = "experiments/"
    dest_dir = "{}_classification".format(
        time.strftime(TIME_STR)
    )
    output_rel = os.path.join(
        args.output if args.output else default_output_path,
        dest_dir
    )
    # Resolve the relative output path, then hand it to create_path_if_not_exists.
    output_dir = pipe(
        output_rel,
        to_absolute,
        create_path_if_not_exists
    )

    logger.info("Saving results in a csv...")
    pd.DataFrame.from_dict(all_train_data).to_csv(
        os.path.join(
            output_dir, "results.csv"
        )
    )

    logger.info("Saving true_vs_pred in a csv...")
    for i in true_vs_pred:
        true_file = '{}_{}_true.csv'.format(i["dataset"], i["model"])
        pred_file = '{}_{}_predicted.csv'.format(i["dataset"], i["model"])
        np.savetxt(os.path.join(output_dir, true_file),
                   i["true"], delimiter=',')
        np.savetxt(os.path.join(output_dir, pred_file),
                   i["pred"], delimiter=',')

    logger.info("Saving metadata")
    with open(os.path.join(output_dir, 'metadata.json'), 'w') as f_p:
        json.dump(metadata, f_p, indent=4)

    logger.info("Files saved in %s", output_dir)
def main():
    """Analyze the labels of a dataset and, optionally, of synthetic streams.

    For the dataset named by ``args.dataset`` this computes the number of
    labels, the cardinality (total of the per-row label sums divided by the
    number of rows) and the density (cardinality / labels), then saves the
    label relationship, label skew and label distribution results under the
    output directory. When ``args.streams`` is given, the same analysis is
    repeated for every stream and the mean absolute error between the
    dataset's and the stream's label distributions is recorded. Finally the
    skew, distribution and distribution-error plots are drawn and the
    collected metadata is written to ``metadata.json``.

    Exits with status 0 when no dataset is given, and with status 1 when the
    number of stream names differs from the number of streams.
    """
    logging = set_logger()
    args = parser.parse_args()
    # NOTE: the output directory is created before the dataset argument is
    # validated below.
    output_dir = create_output_dir(
        output_path=args.output if args.output else None)
    metadata = {
        "experimento": args.experiment or "",
        "command": " ".join(sys.argv),
        "date": time.strftime("%Y%m%d%H%M%S"),
    }

    # One entry per plotted series; the dataset goes first, streams follow.
    lk_plot_data = []
    ld_plot_data = []
    ld_mae_plot_data = []

    if not args.dataset:
        print("Dataset not provided. Exiting.")
        sys.exit(0)

    #### DATASET ANALYSIS ######

    logging.info("Analyzing dataset %s", args.dataset)
    logging.info("Loading dataset: %s", args.dataset)
    x_stream, y_stream, _, label_names = load_given_dataset(args.dataset)
    data_stream = DataStream(data=x_stream.todense(),
                             y=y_stream.todense(),
                             name=args.dataset)
    # Number of label columns.
    labels = y_stream.shape[1]
    # Total of the per-row label sums divided by the number of rows.
    cardinality = sum(np.sum(y_stream.toarray(),
                             axis=1)) / y_stream.toarray().shape[0]
    density = cardinality / labels
    metadata["dataset"] = {
        "name": args.dataset,
        "instances": data_stream.n_remaining_samples(),
        "X_shape": x_stream.shape,
        "y_shape": y_stream.shape,
        "labels": labels,
        "cardinality": cardinality,
        "density": density,
        "label_names": [i[0] for i in label_names]
    }

    logging.info("Analyzing label relationship")
    priors, coocurrences, conditional_matrix = generate_labels_relationship(
        y_stream.toarray(),
        cardinalidad=cardinality,
    )
    save_labels_relationship(output_dir, args.dataset, priors, coocurrences,
                             conditional_matrix)
    labels_relationship_graph(plot_props={"data": conditional_matrix},
                              output=os.path.join(
                                  output_dir,
                                  filename_path("relationship_graph",
                                                args.dataset,
                                                output_dir,
                                                ext="png")))
    data_stream.restart()

    logging.info("Analyzing label skew")
    labels_skew_original = generate_labels_skew(y_stream.toarray())
    labels_skew_original.to_csv(
        os.path.join(output_dir, args.dataset + "_label_skew.csv"))
    # Only the first SKEW_TOP_COMBINATIONS values are plotted.
    lk_plot_data.append({
        "x":
        np.arange(1, SKEW_TOP_COMBINATIONS + 1),
        "y":
        labels_skew_original.values[:SKEW_TOP_COMBINATIONS],
        "color":
        "black",
        "join":
        True,
        "label":
        "Original"
    })

    logging.info("Analyzing label distribution")
    lbo_not_scaled, labels_distribution_original = generate_labels_distribution(
        y_stream.toarray())
    lbo_not_scaled.to_csv(
        os.path.join(output_dir, args.dataset + "_label_distribution.csv"))
    ld_plot_data.append({
        "x": labels_distribution_original.index.values,
        "y": labels_distribution_original.values,
        "color": "black",
        "join": True,
        "label": "Original"
    })
    # Mean absolute error - graph
    # The dataset is the reference series, so its error is all zeros.
    ld_mae_plot_data.append({
        "x":
        labels_distribution_original.index.values,
        "y":
        np.zeros(shape=len(labels_distribution_original)),
        "color":
        "black",
        "label":
        "Original",
        "join":
        True
    })

    # Free memory
    del x_stream, y_stream, data_stream

    #### END DATASET ANALYSIS ######

    #### STREAM ANALYSIS ######

    if args.streams:
        stream_names = args.streamsnames or []
        # Every stream needs exactly one name.
        if len(stream_names) != len(args.streams):
            logging.error(
                "La cantidad de streams y la cantidad de nombres de streams no coinciden."
            )
            sys.exit(1)
        metadata["syn_streams"] = []
        for idx, i in enumerate(args.streams):
            # NOTE(review): to_absolute is not defined in this function --
            # presumably a module-level helper; confirm it is in scope.
            stream_path = to_absolute(i)
            stream_name = stream_names[idx]

            logging.info("Analyzing syn stream: %s", stream_name)

            logging.info("Loading syn stream to memory")
            _, y_syn, _, _ = load_moa_stream(stream_path, args.labels)

            # Same label count / cardinality / density as computed for the dataset.
            labels = y_syn.shape[1]
            cardinality = sum(np.sum(y_syn.toarray(),
                                     axis=1)) / y_syn.toarray().shape[0]
            density = cardinality / labels

            logging.info("Analyzing label skew")
            labels_skew_syn = generate_labels_skew(y_syn.toarray())
            labels_skew_syn.to_csv(
                os.path.join(output_dir, stream_name + "_label_skew.csv"))
            lk_plot_data.append({
                "x":
                np.arange(1, SKEW_TOP_COMBINATIONS + 1),
                "y":
                labels_skew_syn.values[:SKEW_TOP_COMBINATIONS],
                "color":
                PLOT_COLORS[idx],
                "join":
                True,
                "label":
                stream_name
            })

            logging.info("Analyzing label distribution")
            lds_not_scaled, labels_distribution_syn = generate_labels_distribution(
                y_syn.toarray())
            # Re-index the stream's distributions onto every integer between
            # the dataset distribution's smallest and largest index, filling
            # missing positions with 0.
            ld_syn = labels_distribution_syn.reindex(
                np.arange(labels_distribution_original.index.min(),
                          labels_distribution_original.index.max() +
                          1)).fillna(0)
            ld_syn_not_scaled = lds_not_scaled.reindex(
                np.arange(labels_distribution_original.index.min(),
                          labels_distribution_original.index.max() +
                          1)).fillna(0)
            ld_plot_data.append({
                "x": ld_syn.index.values,
                "y": ld_syn.values,
                "color": PLOT_COLORS[idx],
                "join": True,
                "label": stream_name
            })
            ld_syn_not_scaled.to_csv(
                os.path.join(output_dir,
                             stream_name + "_label_distribution.csv"))
            # NOTE(review): this and the subtraction below presumably assume the
            # dataset distribution has the same length as the re-indexed stream
            # one (i.e. a gap-free integer index) -- confirm.
            mae = mean_absolute_error(labels_distribution_original.to_numpy(),
                                      ld_syn.to_numpy())
            # plot mae
            ld_mae_plot_data.append({
                "x":
                labels_distribution_original.index.values,
                "y":
                labels_distribution_original.to_numpy() - ld_syn.to_numpy(),
                "label":
                stream_name,
                "color":
                PLOT_COLORS[idx],
                "join":
                True
            })

            logging.info("Analyzing label relationship")
            priors, coocurrences, conditional_matrix = generate_labels_relationship(
                y_syn.toarray(),
                cardinalidad=cardinality,
            )
            save_labels_relationship(output_dir, stream_name, priors,
                                     coocurrences, conditional_matrix)
            labels_relationship_graph(plot_props={"data": conditional_matrix},
                                      output=os.path.join(
                                          output_dir,
                                          filename_path("relationship_graph",
                                                        stream_name,
                                                        output_dir,
                                                        ext="png")))

            metadata["syn_streams"].append({
                "stream_path":
                stream_path,
                "stream_name":
                stream_name,
                "y_shape":
                y_syn.shape,
                "labels":
                labels,
                "cardinality":
                cardinality,
                "density":
                density,
                "labels_distribution_mean_absolute_error":
                mae
            })

    #### END STREAM ANALYSIS ######

    # Plots combine the dataset series with any stream series collected above.
    logging.info("Plotting Label Skew")
    labels_skew_graph(lk_plot_data,
                      title="Label Skew\n{}".format(
                          metadata["dataset"]["name"].title()),
                      output=os.path.join(output_dir, "label_skew.png"))

    logging.info("Plotting Label Distribution")
    labels_distribution_graph(ld_plot_data,
                              title="Label Distribution\n{}".format(
                                  metadata["dataset"]["name"].title()),
                              output=os.path.join(output_dir,
                                                  "label_distribution.png"))
    labels_distribution_mae_graph(
        ld_mae_plot_data,
        title="Label Distribution - Mean Absolute Error\n{}".format(
            metadata["dataset"]["name"].title()),
        output=os.path.join(output_dir, "ld_mae.png"))

    logging.info("Saving metadata")
    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump(metadata, fp, indent=4)
    logging.info("Files saved in %s", output_dir)