def test_data_stream(test_path, package_path):
    """DataStream built from a single DataFrame (targets inferred from the
    last column): checks metadata, sampling order, restart, and repr."""
    test_file = os.path.join(package_path, 'src/skmultiflow/data/datasets/sea_stream.csv')
    raw_data = pd.read_csv(test_file)
    stream = DataStream(raw_data, name='Test')
    # y was not passed separately, so the stream must not mark it as defined.
    assert not stream._Y_is_defined
    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']
    assert stream.n_features == 3
    assert stream.n_cat_features == 0
    assert stream.n_num_features == 3
    assert stream.n_targets == 1
    assert stream.get_data_info() == 'Test: 1 target(s), 2 classes'
    assert stream.has_more_samples() is True
    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    # np.all replaces np.alltrue, which is deprecated and removed in NumPy 2.0.
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim
    assert stream.n_features == X.shape[1]

    assert 'stream' == stream._estimator_type

    expected_info = "DataStream(n_targets=-1, target_idx=1, cat_features=None, name='Test')"
    assert stream.get_info() == expected_info
def test_data_stream_X_y(test_path, package_path):
    """DataStream built from separate X and y DataFrames.

    NOTE(review): a later test in this module reuses this exact function
    name, so pytest only collects the later definition and this test is
    silently skipped — it should be renamed to run again.
    """
    # TODO(review): path differs from the sibling test, which uses
    # 'src/skmultiflow/data/datasets/sea_stream.csv' — confirm which is valid.
    test_file = os.path.join(package_path, 'src/skmultiflow/datasets/sea_stream.csv')
    raw_data = pd.read_csv(test_file)
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)
    # y passed explicitly, so the flag must be set (truthiness check instead
    # of the `== True` anti-idiom).
    assert stream._Y_is_defined
    stream.prepare_for_use()

    assert stream.n_remaining_samples() == 40000

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']
    assert stream.n_features == 3
    assert stream.n_cat_features == 0
    assert stream.n_num_features == 3
    assert stream.n_targets == 1
    assert stream.get_data_info() == '1 target(s), 2 target_values'
    assert stream.has_more_samples() is True
    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    # np.all replaces np.alltrue, which is deprecated and removed in NumPy 2.0.
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim
    assert stream.n_features == X.shape[1]
def test_data_stream_X_y(test_path):
    """DataStream from separate X/y: classification metadata, sampling and
    restart, plus the regression task-type branch (float targets)."""
    test_file = os.path.join(test_path, 'sea_stream_file.csv')
    raw_data = pd.read_csv(test_file)
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    stream = DataStream(X, y)
    # y passed explicitly, so the flag must be set.
    assert stream._Y_is_defined

    assert stream.n_remaining_samples() == 40

    expected_names = ['attrib1', 'attrib2', 'attrib3']
    assert stream.feature_names == expected_names

    expected_targets = [0, 1]
    assert stream.target_values == expected_targets

    assert stream.target_names == ['class']
    assert stream.n_features == 3
    assert stream.n_cat_features == 0
    assert stream.n_num_features == 3
    assert stream.n_targets == 1
    assert stream.get_data_info() == '1 target(s), 2 classes'
    assert stream.has_more_samples() is True
    assert stream.is_restartable() is True

    # Load test data corresponding to first 10 instances
    test_file = os.path.join(test_path, 'sea_stream_file.npz')
    data = np.load(test_file)
    X_expected = data['X']
    y_expected = data['y']

    X, y = stream.next_sample()
    # np.all replaces np.alltrue, which is deprecated and removed in NumPy 2.0.
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    X, y = stream.last_sample()
    assert np.all(X[0] == X_expected[0])
    assert np.all(y[0] == y_expected[0])

    stream.restart()
    X, y = stream.next_sample(10)
    assert np.all(X == X_expected)
    assert np.all(y == y_expected)

    assert stream.n_targets == np.array(y).ndim
    assert stream.n_features == X.shape[1]

    # Ensure that the regression case is also covered
    y = raw_data.iloc[:, -1:]
    X = raw_data.iloc[:, :-1]
    y = y.astype('float64')
    stream = DataStream(X, y, name='Test')
    assert stream.task_type == 'regression'
    assert stream.get_data_info() == 'Test: 1 target(s)'
def main():
    """CLI entry point: run every requested model over every requested
    multi-label dataset, then persist per-run stats, raw predictions and
    run metadata under a timestamped output directory.
    """
    args = parser.parse_args()
    # NOTE(review): this local shadows the stdlib ``logging`` module name;
    # set_logger presumably returns a configured logger — confirm.
    logging = set_logger(args.verbose)
    if not valid_args(args):
        sys.exit(0)
    datasets = args.datasets
    models = [i.lower() for i in args.models]
    # copies[i] pairs positionally with datasets[i].
    copies = [int(i) for i in args.copies]
    dir_path = os.path.dirname(os.path.realpath(__file__))
    # Partially-applied path resolver anchored at this script's directory.
    to_absolute = curry(to_absolute_path)(dir_path)
    # Run-level metadata, dumped to metadata.json at the end.
    # NOTE: "experimento" is an existing output key (Spanish) — kept as-is.
    metadata = {
        "experimento": args.experiment or "",
        "command": " ".join(sys.argv),
        "date": time.strftime("%Y%m%d%H%M%S"),
        "models": models,
        "copies": copies,
        "datasets": []
    }
    logging.debug(metadata)

    # DATASET CLASSIFICATION ######
    all_train_data = []  # one record per (dataset, model): train + eval stats
    true_vs_pred = []    # raw true/predicted labels, written to CSV below
    logging.debug(datasets)
    for idx, dataset in enumerate(datasets):
        logging.info("Classifying dataset %s", dataset)
        logging.debug("Loading dataset: %s", dataset)
        x_stream, y_stream, _, label_names = load_given_dataset(dataset)
        logging.debug("Copies per instance: %s", copies[idx])
        # Densify and duplicate each instance `copies[idx]` times.
        x_stream, y_stream = repeatInstances(
            x_stream.todense(), y_stream.todense(), copies=copies[idx])
        data_stream = DataStream(data=x_stream, y=y_stream, name=dataset)
        # Label cardinality: mean number of active labels per instance.
        cardinality = sum(np.sum(y_stream, axis=1)) / y_stream.shape[0]
        dataset_metadata = {
            "name": dataset,
            "instances": data_stream.n_remaining_samples(),
            "x_shape": x_stream.shape,
            "y_shape": y_stream.shape,
            "cardinality": cardinality,
            "label_names": [i[0] for i in label_names],
            "copies": copies[idx]
        }
        logging.debug(dataset_metadata)
        for model_id in models:
            model = SUPPORTED_MODELS[model_id]
            logging.info(model["name"])
            train_data = {"model": model["name"],
                          "model_id": model_id,
                          "stream": data_stream.name,
                          "copies": copies[idx]}
            # evaluar: prequential-style run; returns (stats dict,
            # true labels, predictions) — presumably lists; verify.
            train_stats, true_labels, predictions = evaluar(
                data_stream,
                model["model"](data_stream),
                pretrain_size=args.pretrainsize,
                ensemble=model["ensemble"],
                catch_errors=args.catch,
                logging=logging,
                train_logs_max=100000,
                window_size=20
            )
            eval_stats = {}
            if true_labels and predictions:
                logging.info("Evaluating...")
                eval_stats = evaluation_metrics(
                    true_labels,
                    predictions,
                    train_stats["start_time"],
                    train_stats["end_time"]
                )
                # NOTE(review): recorded only when the run produced
                # predictions — confirm intended nesting.
                true_vs_pred.append({
                    "model": model_id,
                    "dataset": dataset,
                    "true": true_labels,
                    "pred": predictions
                })
            train_data.update(train_stats)
            train_data.update(eval_stats)
            all_train_data.append(train_data)
            # Rewind the stream so the next model sees the same data.
            data_stream.restart()
        metadata["datasets"].append(dataset_metadata)
        # Free memory held by x_stream, y_stream, data_stream
    # END DATASET CLASSIFICATION ######

    # STREAM ANALYSIS ######
    if args.streams:
        print("Stream classification. Not yet implemented.")
        sys.exit(0)
        # NOTE(review): everything below in this branch is unreachable
        # (dead code after sys.exit) — presumably disabled on purpose.
        stream_names = args.streamsnames or []
        if len(stream_names) != len(args.streams):
            logging.error(
                "La cantidad de streams y la cantidad de nombres" +
                " de streams no coinciden."
            )
            sys.exit(1)
        metadata["syn_streams"] = []
        for idx, i in enumerate(args.streams):
            stream_path = to_absolute(i)
            stream_name = stream_names[idx]
            logging.info("Classifying syn stream: %s", stream_name)
            logging.info("Loading syn stream to memory")
            _, y_syn, _, _ = load_moa_stream(stream_path, args.labels)
            # Label cardinality of the synthetic stream.
            cardinality = sum(
                np.sum(y_syn.toarray(), axis=1)
            ) / y_syn.toarray().shape[0]
            metadata["syn_streams"].append({
                "labels": args.labels,
                "stream_path": stream_path,
                "stream_name": stream_name,
                "y_shape": y_syn.shape,
                "cardinality": cardinality,
            })
    # END STREAM ANALYSIS ######

    default_output_path = "experiments/"
    dest_dir = "{}_classification".format(
        time.strftime(TIME_STR)
    )
    output_rel = os.path.join(
        args.output if args.output else default_output_path,
        dest_dir
    )
    # Resolve to an absolute path and create it if missing.
    output_dir = pipe(
        output_rel,
        to_absolute,
        create_path_if_not_exists
    )
    logging.info("Saving results in a csv...")
    pd.DataFrame.from_dict(all_train_data).to_csv(
        os.path.join(
            output_dir, "results.csv"
        )
    )
    logging.info("Saving true_vs_pred in a csv...")
    # One pair of CSVs (true / predicted labels) per dataset-model run.
    for i in true_vs_pred:
        true_file = '{}_{}_true.csv'.format(i["dataset"], i["model"])
        pred_file = '{}_{}_predicted.csv'.format(i["dataset"], i["model"])
        np.savetxt(os.path.join(output_dir, true_file),
                   i["true"], delimiter=',')
        np.savetxt(os.path.join(output_dir, pred_file),
                   i["pred"], delimiter=',')
    logging.info("Saving metadata")
    with open(os.path.join(output_dir, 'metadata.json'), 'w') as f_p:
        json.dump(metadata, f_p, indent=4)
    logging.info("Files saved in %s", output_dir)
def main():
    """CLI entry point: analyze one multi-label dataset (label skew,
    distribution and relationship) and optionally compare it against
    synthetic MOA streams, saving CSVs, plots and metadata.

    NOTE(review): if this definition shares a module with the other
    ``main`` above, the later one shadows the earlier — confirm they
    live in separate scripts.
    """
    # NOTE(review): local name shadows the stdlib ``logging`` module.
    logging = set_logger()
    args = parser.parse_args()
    output_dir = create_output_dir(
        output_path=args.output if args.output else None)
    # Run-level metadata, dumped to metadata.json at the end.
    # NOTE: "experimento" is an existing output key (Spanish) — kept as-is.
    metadata = {
        "experimento": args.experiment or "",
        "command": " ".join(sys.argv),
        "date": time.strftime("%Y%m%d%H%M%S"),
    }
    lk_plot_data = []      # label-skew plot series
    ld_plot_data = []      # label-distribution plot series
    ld_mae_plot_data = []  # label-distribution MAE plot series
    if not args.dataset:
        print("Dataset not provided. Exiting.")
        sys.exit(0)

    #### DATASET ANALYSIS ######
    logging.info("Analyzing dataset %s", args.dataset)
    logging.info("Loading dataset: %s", args.dataset)
    x_stream, y_stream, _, label_names = load_given_dataset(args.dataset)
    data_stream = DataStream(data=x_stream.todense(),
                             y=y_stream.todense(),
                             name=args.dataset)
    labels = y_stream.shape[1]
    # Cardinality: mean active labels per instance; density normalizes it
    # by the total label count.
    cardinality = sum(np.sum(y_stream.toarray(),
                             axis=1)) / y_stream.toarray().shape[0]
    density = cardinality / labels
    metadata["dataset"] = {
        "name": args.dataset,
        "instances": data_stream.n_remaining_samples(),
        "X_shape": x_stream.shape,
        "y_shape": y_stream.shape,
        "labels": labels,
        "cardinality": cardinality,
        "density": density,
        "label_names": [i[0] for i in label_names]
    }
    logging.info("Analyzing label relationship")
    priors, coocurrences, conditional_matrix = generate_labels_relationship(
        y_stream.toarray(),
        cardinalidad=cardinality,
    )
    save_labels_relationship(output_dir, args.dataset,
                             priors, coocurrences, conditional_matrix)
    labels_relationship_graph(plot_props={"data": conditional_matrix},
                              output=os.path.join(
                                  output_dir,
                                  filename_path("relationship_graph",
                                                args.dataset, output_dir,
                                                ext="png")))
    data_stream.restart()
    logging.info("Analyzing label skew")
    labels_skew_original = generate_labels_skew(y_stream.toarray())
    labels_skew_original.to_csv(
        os.path.join(output_dir, args.dataset + "_label_skew.csv"))
    # Original dataset is plotted in black; synthetic streams get colors.
    lk_plot_data.append({
        "x": np.arange(1, SKEW_TOP_COMBINATIONS + 1),
        "y": labels_skew_original.values[:SKEW_TOP_COMBINATIONS],
        "color": "black",
        "join": True,
        "label": "Original"
    })
    logging.info("Analyzing label distribution")
    # Returns (raw counts, scaled distribution) — presumably pandas Series.
    lbo_not_scaled, labels_distribution_original = generate_labels_distribution(
        y_stream.toarray())
    lbo_not_scaled.to_csv(
        os.path.join(output_dir, args.dataset + "_label_distribution.csv"))
    ld_plot_data.append({
        "x": labels_distribution_original.index.values,
        "y": labels_distribution_original.values,
        "color": "black",
        "join": True,
        "label": "Original"
    })
    # Mean absolute error - graph (original vs itself is all zeros)
    ld_mae_plot_data.append({
        "x": labels_distribution_original.index.values,
        "y": np.zeros(shape=len(labels_distribution_original)),
        "color": "black",
        "label": "Original",
        "join": True
    })
    # Free memory held by x_stream, y_stream, data_stream
    #### END DATASET ANALYSIS ######

    #### STREAM ANALYSIS ######
    if args.streams:
        stream_names = args.streamsnames or []
        if len(stream_names) != len(args.streams):
            logging.error(
                "La cantidad de streams y la cantidad de nombres de streams no coinciden."
            )
            sys.exit(1)
        metadata["syn_streams"] = []
        for idx, i in enumerate(args.streams):
            # NOTE(review): ``to_absolute`` is not defined in this function —
            # presumably a module-level helper; confirm.
            stream_path = to_absolute(i)
            stream_name = stream_names[idx]
            logging.info("Analyzing syn stream: %s", stream_name)
            logging.info("Loading syn stream to memory")
            _, y_syn, _, _ = load_moa_stream(stream_path, args.labels)
            labels = y_syn.shape[1]
            cardinality = sum(np.sum(y_syn.toarray(),
                                     axis=1)) / y_syn.toarray().shape[0]
            density = cardinality / labels
            logging.info("Analyzing label skew")
            labels_skew_syn = generate_labels_skew(y_syn.toarray())
            labels_skew_syn.to_csv(
                os.path.join(output_dir, stream_name + "_label_skew.csv"))
            lk_plot_data.append({
                "x": np.arange(1, SKEW_TOP_COMBINATIONS + 1),
                "y": labels_skew_syn.values[:SKEW_TOP_COMBINATIONS],
                "color": PLOT_COLORS[idx],
                "join": True,
                "label": stream_name
            })
            logging.info("Analyzing label distribution")
            lds_not_scaled, labels_distribution_syn = generate_labels_distribution(
                y_syn.toarray())
            # Align the synthetic distribution to the original's index range
            # so the two series are directly comparable; missing bins -> 0.
            ld_syn = labels_distribution_syn.reindex(
                np.arange(labels_distribution_original.index.min(),
                          labels_distribution_original.index.max() + 1)).fillna(0)
            ld_syn_not_scaled = lds_not_scaled.reindex(
                np.arange(labels_distribution_original.index.min(),
                          labels_distribution_original.index.max() + 1)).fillna(0)
            ld_plot_data.append({
                "x": ld_syn.index.values,
                "y": ld_syn.values,
                "color": PLOT_COLORS[idx],
                "join": True,
                "label": stream_name
            })
            ld_syn_not_scaled.to_csv(
                os.path.join(output_dir,
                             stream_name + "_label_distribution.csv"))
            mae = mean_absolute_error(labels_distribution_original.to_numpy(),
                                      ld_syn.to_numpy())
            # plot mae (signed per-bin difference, original minus synthetic)
            ld_mae_plot_data.append({
                "x": labels_distribution_original.index.values,
                "y": labels_distribution_original.to_numpy() - ld_syn.to_numpy(),
                "label": stream_name,
                "color": PLOT_COLORS[idx],
                "join": True
            })
            logging.info("Analyzing label relationship")
            priors, coocurrences, conditional_matrix = generate_labels_relationship(
                y_syn.toarray(),
                cardinalidad=cardinality,
            )
            save_labels_relationship(output_dir, stream_name,
                                     priors, coocurrences, conditional_matrix)
            labels_relationship_graph(plot_props={"data": conditional_matrix},
                                      output=os.path.join(
                                          output_dir,
                                          filename_path("relationship_graph",
                                                        stream_name, output_dir,
                                                        ext="png")))
            metadata["syn_streams"].append({
                "stream_path": stream_path,
                "stream_name": stream_name,
                "y_shape": y_syn.shape,
                "labels": labels,
                "cardinality": cardinality,
                "density": density,
                "labels_distribution_mean_absolute_error": mae
            })
    #### END STREAM ANALYSIS ######

    logging.info("Plotting Label Skew")
    labels_skew_graph(lk_plot_data,
                      title="Label Skew\n{}".format(
                          metadata["dataset"]["name"].title()),
                      output=os.path.join(output_dir, "label_skew.png"))
    logging.info("Plotting Label Distribution")
    labels_distribution_graph(ld_plot_data,
                              title="Label Distribution\n{}".format(
                                  metadata["dataset"]["name"].title()),
                              output=os.path.join(output_dir,
                                                  "label_distribution.png"))
    labels_distribution_mae_graph(
        ld_mae_plot_data,
        title="Label Distribution - Mean Absolute Error\n{}".format(
            metadata["dataset"]["name"].title()),
        output=os.path.join(output_dir, "ld_mae.png"))
    logging.info("Saving metadata")
    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump(metadata, fp, indent=4)
    logging.info("Files saved in %s", output_dir)