def build_dataset(dataset, build_name, slicer, sample_interval, features_regex,
                  class_label, tdt_split, force, visualize, data_path,
                  target_data_path, s3_profile, s3_url, data_plots_folder,
                  aggregation_functions):
    """Build the training, dev and test sets of a dataset and store them.

    Loads the run ids cached for ``build_name``, applies ``slicer`` to them
    and splits them in training, dev and test. The model configuration and
    the run ids are saved, then every set is prepared, normalized and saved.
    The normalization parameters computed on the training set are cached in
    the model configuration and re-used for the dev and test sets.

    The process exits with status 1 when the dataset is already configured
    (and ``force`` is false), when ``tdt_split`` does not add up to 10, when
    no run is selected, or when dev/test data exists but no training data is
    available to compute the normalization parameters.

    Args:
        dataset: name of the dataset to create.
        build_name: name of the build whose cached runs are used.
        slicer: ``start:stop:step`` string applied to the list of runs;
            empty fields mean "no bound".
        sample_interval: passed to the data loading helpers and used as
            prefix for the plot file names.
        features_regex: passed to the data loading helpers.
        class_label: passed to ``prepare_dataset``.
        tdt_split: three numbers (training, dev, test) that add up to 10.
        force: overwrite an existing dataset configuration.
        visualize: generate plots.
        data_path: location the raw run data is loaded from.
        target_data_path: location the dataset is saved to.
        s3_profile: profile used to build the s3 client.
        s3_url: URL used to build the s3 client.
        data_plots_folder: folder the plots are saved to.
        aggregation_functions: names of the aggregation functions, resolved
            through ``resolve_aggregation_function``.
    """
    # s3 support
    s3 = gather_results.get_s3_client(s3_url=s3_url, s3_profile=s3_profile)

    # Prevent overwrite by mistake
    if gather_results.load_model_config(
            dataset, data_path=target_data_path, s3=s3) and not force:
        print("Dataset %s already configured" % dataset)
        sys.exit(1)

    # Validate tdt-split. The "%" operator only expands tuples, so any other
    # sequence type is converted before it is used in the error message.
    tdt_split = tuple(tdt_split)
    training, dev, test = (x / 10 for x in tdt_split)
    if sum(tdt_split) != 10:
        print("Training (%d) + dev (%d) + test (%d) != 10" % tdt_split)
        sys.exit(1)

    # Load available run ids for the build name (from s3)
    runs = gather_results.load_run_uuids('.raw', name=build_name,
                                         data_path=data_path, s3=s3)

    # Apply the slice
    def slice_fn(x):
        return int(x.strip()) if x.strip() else None

    slice_object = slice(*map(slice_fn, slicer.split(":")))
    runs = np.array(runs[slice_object])
    print("Obtained %d runs for build %s" % (len(runs), build_name))

    # The first run is needed below to work out the data dimensions, so
    # there is nothing that can be built without at least one run.
    if len(runs) == 0:
        print("No runs available for build %s" % build_name)
        sys.exit(1)

    # Split the runs in training, dev and test
    training_idx, dev_idx, test_idx = dataset_split_filters(
        len(runs), training, dev, data_path=target_data_path, s3=s3)

    # Saving dataset metadata
    for split_name, split_idx in (('training', training_idx),
                                  ('dev', dev_idx),
                                  ('test', test_idx)):
        gather_results.save_run_uuids(dataset, runs[split_idx],
                                      name=split_name,
                                      data_path=target_data_path, s3=s3)

    # Calculate normalized and filtered dimensions and labels
    normalized_length, num_dstat_features, labels = data_sizes_and_labels(
        runs[0], features_regex, sample_interval,
        aggregation_functions=aggregation_functions,
        data_path=data_path, s3=s3)
    model_config = {
        'build_name': build_name,
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'class_label': class_label,
        'aggregation_functions': aggregation_functions,
        'training_set': training,
        'dev_set': dev,
        'test_set': test,
        'normalized_length': normalized_length,
        'labels': labels,
        'num_columns': num_dstat_features,
        'num_features': len(labels)
    }

    # Save the config and complete list of run uuids
    gather_results.save_run_uuids(dataset, runs,
                                  data_path=target_data_path, s3=s3)
    gather_results.save_model_config(dataset, model_config,
                                     data_path=target_data_path, s3=s3)
    print("Stored %d run IDs in the model %s config" % (len(runs), dataset))

    # Resolve the aggregation function names to functions
    resolved_agg_fn = [
        resolve_aggregation_function(x) for x in aggregation_functions
    ]

    datasets = {}
    # Both are only set once a non-empty set has been processed; the
    # plotting code below checks them instead of failing on unbound names.
    n_examples = None
    figure_sizes = None
    # Training must come first so we calculate normalization params
    for data_type in ['training', 'dev', 'test']:
        data, _figure_sizes = prepare_dataset(
            dataset, normalized_length, num_dstat_features, data_type,
            features_regex=features_regex,
            sample_interval=sample_interval, class_label=class_label,
            aggregation_functions=resolved_agg_fn,
            visualize=visualize, data_path=data_path,
            target_data_path=target_data_path, s3=s3)
        datasets[data_type] = data
        examples = data['examples']
        if len(examples) == 0:
            continue

        # Perform dataset-wise normalization
        if data_type == 'training':
            n_examples, normalization_params = normalize_dataset(
                examples, labels)

            # We cache normalization parameters from the training data set
            # to normalize the dev and test set, as well as other input data
            model_config['normalization_params'] = normalization_params
            gather_results.save_model_config(dataset, model_config,
                                             data_path=target_data_path,
                                             s3=s3)

            # Save figure sizes as well for training only
            figure_sizes = _figure_sizes
        else:
            if 'normalization_params' not in model_config:
                # An empty training set leaves nothing to normalize against
                print("Cannot normalize the %s set: the training set is "
                      "empty" % data_type)
                sys.exit(1)
            n_examples, _ = normalize_dataset(
                examples, labels, model_config['normalization_params'])

        # Replace examples with normalized ones
        datasets[data_type]['examples'] = n_examples

        # Store the normalized data to disk
        gather_results.save_dataset(dataset, name=data_type,
                                    data_path=target_data_path, s3=s3,
                                    **datasets[data_type])

    # Plot some more figures
    # NOTE(review): n_examples holds the last non-empty set processed above
    # while figure_sizes comes from the training set - confirm this pairing
    # is intended.
    if visualize and not aggregation_functions:
        if n_examples is not None:
            for n, normalized in enumerate(n_examples):
                figure_name = "%s_normalized_%d" % (sample_interval, n)
                fig = pd.Series(normalized).plot().get_figure()
                plt.gca().set_ylim([-1, 1])
                fig.savefig(os.sep.join([data_plots_folder, figure_name]))
                plt.close(fig)

        if figure_sizes is not None:
            df = pd.DataFrame(figure_sizes, columns=['size', 'status'])
            fig = df.plot.scatter(x='size', y='status').get_figure()
            fig.savefig(
                os.sep.join([data_plots_folder, 'sizes_by_result.png']))
            plt.close(fig)

    # Store labels to disk
    gather_results.save_dataset(dataset, name='labels',
                                data_path=target_data_path, s3=s3,
                                labels=labels)
    print("Done creating dataset %s" % model_config)
def prepare_dataset(dataset, normalized_length, num_dstat_features, data_type,
                    features_regex, sample_interval='1s', class_label='status',
                    aggregation_functions=None, visualize=False,
                    data_path=None, target_data_path=None, s3=None):
    """Load the runs of one set of a dataset and turn them into examples.

    Loads the run ids of the ``data_type`` set from the dataset
    configuration. Loads the data for every run. Builds the unrolled
    examples as a numpy ndarray. Builds the classes as a numpy array.
    Does some visualization (if enabled).

    If data is missing for any run, the run ids of the set are saved again
    without the missing runs and the process exits with status 1.

    Args:
        dataset: name of the dataset.
        normalized_length: length every example is brought to.
        num_dstat_features: number of features, used to size the examples.
        data_type: name of the set to load (the caller in this file uses
            'training', 'dev' and 'test').
        features_regex: passed to ``filter_example``.
        sample_interval: passed to the results loader and used as prefix
            for the plot file names.
        class_label: passed to ``get_class``.
        aggregation_functions: passed to ``fixed_lenght_example``. When set
            the per-example plots are replaced by a single scatter plot.
        visualize: generate plots.
        data_path: location the run data is loaded from.
        target_data_path: location of the dataset.
        s3: s3 client passed through to the ``gather_results`` helpers.

    Returns:
        A ``(data, figure_sizes)`` tuple. ``data`` is a dict with the
        'examples', 'example_ids' and 'classes' arrays; ``figure_sizes`` is
        an array of (size, status) pairs, empty unless the per-example
        plots were generated.
    """
    if visualize:
        data_plots_folder = [
            os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
            dataset, 'plots'
        ]
        os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)

    # Load the list of runs and base labels
    runs = gather_results.load_run_uuids(dataset, name=data_type,
                                         data_path=target_data_path, s3=s3)

    # run_uuids are the example_ids
    sizes = []
    # The data for each example.
    examples = examples_ndarray(len(runs), num_dstat_features,
                                normalized_length)

    # The test result for each example
    classes = []
    skips = []
    print("Loading %s data:" % data_type, end='\r', flush=True)
    for count, run in enumerate(runs):
        print("Loading %s data: %d of %d" % (data_type, count + 1, len(runs)),
              end='\r', flush=True)
        result = gather_results.get_subunit_results_for_run(
            run, sample_interval, data_path=data_path, s3=s3)
        # For one run_uuid we must only get one example (result)
        if not result:
            skips.append(run.uuid)
            continue

        # Apply column filtering
        result = filter_example(result, features_regex)

        # Normalize data
        example = fixed_lenght_example(result, normalized_length,
                                       aggregation_functions)

        vector = unroll_example(example, normalized_length)

        # Normalize status
        status = get_class(result, class_label)

        # Examples is an np ndarray
        examples[count] = vector.values
        classes.append(status)

        # Plot from figures
        if visualize and not aggregation_functions:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(count)
            plots = (
                ("downsampled", result['dstat']),  # un-normalized data
                ("fixedsize", example),            # fixed size data
                ("unrolled", pd.Series(vector)),   # unrolled data
            )
            for plot_kind, plot_data in plots:
                fig = plot_data.plot().get_figure()
                fig.savefig(os.sep.join(
                    data_plots_folder + [figure_name % plot_kind]))
                plt.close(fig)

    print("Loading %s data: %d done!" % (data_type, len(runs)))
    # Check that everything went well
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run.uuid for run in runs if run.uuid not in skips]
        # Write back the same list that was loaded above. Without the name
        # and location the cleaned list would land in the default data path
        # and replace the complete list of runs with a subset of one set.
        gather_results.save_run_uuids(dataset, safe_runs, name=data_type,
                                      data_path=target_data_path, s3=s3)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)

    classes = np.array(classes)
    figure_sizes = np.array(sizes)
    example_ids = np.array(runs)

    print("%s set: examples: %s, classes: %s, example IDs: %s" % (
        data_type, str(examples.shape), str(classes.shape),
        str(example_ids.shape)))

    data = {
        'examples': examples,
        'example_ids': example_ids,
        'classes': classes
    }

    if visualize and aggregation_functions and len(examples) > 0:
        if len(aggregation_functions) > 3:
            # NOTE(review): the message says "skipped" but the process
            # exits with an error - confirm which one is intended.
            print('Visualization skipped, cannot represent more than 3D')
            sys.exit(1)
        else:
            fig = plt.figure()
            if len(aggregation_functions) == 3:
                ax = fig.add_subplot(111, projection='3d')
            else:
                ax = fig.add_subplot(111)

            # Build a dict [class] -> [int ID]
            unique_classes = list(set(classes))
            dict_classes = {
                cls: class_id for class_id, cls in enumerate(unique_classes)
            }

            # Setup colours
            cm = plt.get_cmap('jet')
            cNorm = pltcolors.Normalize(vmin=0, vmax=len(unique_classes))
            scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=cm)

            # Scatter the data, one point per example; the coordinates are
            # the values of the example itself
            for example, cls in zip(examples, classes):
                ax.scatter(*example, marker='o',
                           c=scalarMap.to_rgba(dict_classes[cls]))

            # Set axis labels
            ax.set_xlabel(aggregation_functions[0].__name__)
            if len(aggregation_functions) > 1:
                ax.set_ylabel(aggregation_functions[1].__name__)
            if len(aggregation_functions) > 2:
                ax.set_zlabel(aggregation_functions[2].__name__)

            # Save the plot
            fig.savefig(
                os.sep.join(data_plots_folder + [data_type + "_3d_plot"]))
            plt.close(fig)

    return data, figure_sizes
def _parse_slice(spec):
    """Turn a ``start:stop:step`` string into a slice object.

    Empty fields mean "no bound". ``None`` and the empty string select
    everything; slice objects are returned untouched.
    """
    if isinstance(spec, slice):
        return spec
    if not spec:
        return slice(None)
    fields = [int(f) if f.strip() else None for f in str(spec).split(":")]
    return slice(*fields)


def db_batch_predict(db_uri, dataset, slice, gpu, debug):
    """Run predict on all DB items not included in the dataset yet

    Takes a dataset and a build name. It builds the list of runs in the DB
    that fit the specified build name, and that are not yet used for
    training in the specified dataset. It runs prediction on all of them.

    Args:
        db_uri: URI of the DB the runs are read from.
        dataset: name of the dataset; its model configuration provides the
            build name, sample interval, features regex, labels and
            normalization parameters.
        slice: ``start:stop:step`` string (or slice object) selecting which
            of the candidate runs are predicted.
        gpu: passed to the trainer as ``force_gpu``.
        debug: enable TensorFlow debug logging.
    """
    if debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)

    # Get the configuration for the model
    model_config = gather_results.load_model_config(dataset)
    # Get the list of runs from the dataset
    run_uuids = gather_results.load_run_uuids(dataset)
    # Get the list of runs from the DB
    runs = gather_results.get_runs_by_name(
        db_uri=db_uri, build_name=model_config['build_name'])
    # Run a predict loop, include all runs not in the train dataset
    predict_runs = [r for r in runs if r.uuid not in run_uuids]
    # The "slice" parameter shadows the builtin in here, hence the helper
    predict_runs = predict_runs[_parse_slice(slice)]
    if len(predict_runs) == 0:
        print("Empty prediction set, nothing to do.")
        sys.exit(0)

    print("All runs: %d, dataset size: %d, predict size: %d" % (
        len(runs), len(run_uuids), len(predict_runs)))

    # Compile the column filter once, it is the same for every result
    features_regex = model_config['features_regex']
    col_regex = re.compile(features_regex) if features_regex else None

    # Vectors, classes and uuids are collected side by side so that they
    # stay aligned when a result is skipped. A pre-sized ndarray would keep
    # uninitialized rows for every skipped result.
    vectors = []
    classes = []
    predicted_uuids = []
    labels = []
    for run in predict_runs:
        # This will also store new runs in cache. In future we may want to
        # train on those as well, but for now let's try to predict only
        results = gather_results.get_subunit_results_for_run(
            run, model_config['sample_interval'], db_uri=db_uri)
        for result in results:
            # Skip runs with no data
            if result is None:
                continue
            if col_regex is not None:
                df = result['dstat']
                result['dstat'] = df[list(filter(
                    col_regex.search, df.columns))]
            # Normalize examples
            vector, status, labels = trainer.normalize_example(
                result, model_config['normalized_length'],
                model_config['labels'])
            vectors.append(vector.values)
            classes.append(status)
            predicted_uuids.append(run.uuid)

    if not vectors:
        print("Empty prediction set, nothing to do.")
        sys.exit(0)
    examples = np.array(vectors, dtype=np.float64)

    # Normalize dataset
    n_examples, _ = trainer.normalize_dataset(
        examples, labels, params=model_config['normalization_params'])

    # Prepare other arrays
    classes = np.array(classes)

    # Configure TF
    # NOTE(review): config is not passed to anything in this function.
    config = tf.ConfigProto(log_device_placement=True,)
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    # Now do the prediction
    model = svm_trainer.SVMTrainer(n_examples, predicted_uuids, labels,
                                   classes, dataset_name=dataset,
                                   force_gpu=gpu)
    predictions = model.predict()
    errors = [
        (prediction, actual)
        for prediction, actual in zip(predictions, classes)
        if prediction['classes'] != actual
    ]

    print("Prediction of %d inputs completed." % len(classes))
    print("Input set composition: %d PASS, %s FAIL" % (
        sum(1 for x in classes if x == 0),
        sum(1 for x in classes if x == 1)))
    if len(errors) > 0:
        print("There were some prediction errors: %s" % errors)
    else:
        print("All predicted correctly.")
def local_trainer(train, estimator, dataset, sample_interval, features_regex,
                  class_label, visualize, steps, gpu, debug):
    """Prepare the examples of a dataset and optionally train on them.

    Loads the runs of the dataset, turns the data of every run into a fixed
    length, unrolled example, normalizes the whole set and saves the model
    configuration (including the normalization parameters). When ``train``
    is true an SVM trainer is created and trained.

    If data is missing for any run, the list of runs is saved again without
    the missing ones and the process exits with status 1. The process also
    exits with status 1 when the dataset has no runs.

    Args:
        train: run the training after preparing the data.
        estimator: not used by this function.
        dataset: name of the dataset.
        sample_interval: passed to the results loader; when set the
            normalized length is recalculated for it. Also used as prefix
            for the plot file names.
        features_regex: regular expression matched against the column names
            of the data; only matching columns are kept.
        class_label: passed to ``get_class``.
        visualize: generate plots.
        steps: number of training steps.
        gpu: passed to the trainer as ``force_gpu``.
        debug: enable TensorFlow debug logging when training.
    """
    # Normalized length before resampling
    normalized_length = 5500
    if sample_interval:
        # Calculate the desired normalized length after resample
        normalized_length = get_downsampled_example_lenght(
            sample_interval, normalized_length)

    data_plots_folder = [
        os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
        dataset, 'plots'
    ]
    os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)

    def figure_path(kind, number):
        # sample_interval may be None, so it is formatted rather than
        # concatenated
        name = "%s_%s_%d" % (sample_interval, kind, number)
        return os.sep.join(data_plots_folder + [name])

    runs = gather_results.load_run_uuids(dataset)
    if len(runs) == 0:
        print("No runs found for dataset %s" % dataset)
        sys.exit(1)

    # run_uuids are the example_ids
    sizes = []
    # The data for each example. We don't know yet the pre-set shape, so
    # wait until the first result comes in
    examples = None

    # Model configuration. We need to cache sample_interval, features-regex
    # and the normalization parameters for each feature so we can re-use
    # them during prediction.
    model_config = {
        'sample_interval': sample_interval,
        'features_regex': features_regex,
        'normalized_length': normalized_length
    }

    # Compile the column filter once, it is the same for every run
    col_regex = re.compile(features_regex) if features_regex else None

    # The test result for each example
    classes = []
    labels = []
    idx = 0
    skips = []
    for position, run in enumerate(runs, start=1):
        results = gather_results.get_subunit_results_for_run(
            run, sample_interval)
        # For one run_uuid we must only get one example (result)
        result = results[0]
        if not result:
            skips.append(run.uuid)
            continue

        # Filtering by columns
        df = result['dstat']
        if col_regex is not None:
            result['dstat'] = df[list(filter(col_regex.search, df.columns))]
        num_columns = len(result['dstat'].columns)

        # Setup the numpy matrix and sizes
        if examples is None:
            # Adjust normalized_length to the actual re-sample one
            examples = np.ndarray(
                shape=(len(runs), num_columns * normalized_length))
            model_config['num_columns'] = num_columns
            model_config['num_features'] = num_columns * normalized_length

        # Normalize data
        example = fixed_lenght_example(result, normalized_length)
        # Normalize status
        status = get_class(result, class_label)
        vector, new_labels = unroll_example(example, normalized_length,
                                            labels)
        # Only calculate labels for the first example
        if len(labels) == 0:
            labels = new_labels
            model_config['labels'] = labels
        print("Normalized example %d of %d" % (position, len(runs)),
              end='\r', flush=True)
        # Examples is an np ndarray
        examples[idx] = vector.values
        classes.append(status)
        if visualize:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            plots = (
                ("downsampled", result['dstat']),  # un-normalized data
                ("fixedsize", example),            # fixed size data
                ("unrolled", pd.Series(vector)),   # unrolled data
            )
            for plot_kind, plot_data in plots:
                fig = plot_data.plot().get_figure()
                fig.savefig(figure_path(plot_kind, idx))
                plt.close(fig)
        idx += 1

    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run for run in runs if run.uuid not in skips]
        gather_results.save_run_uuids(dataset, safe_runs)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)

    # Perform dataset-wise normalization
    # NOTE(andreaf) When we train the model we ignore any saved normalization
    # parameter, since the sample interval and features may be different.
    n_examples, normalization_params = normalize_dataset(examples, labels)
    # We do cache the result to normalize the prediction set.
    model_config['normalization_params'] = normalization_params
    gather_results.save_model_config(dataset, model_config)
    if visualize:
        for n in range(len(runs)):
            fig = pd.Series(n_examples[n]).plot().get_figure()
            fig.savefig(figure_path("normalized", n))
            plt.close(fig)

        df = pd.DataFrame(np.array(sizes), columns=['size', 'status'])
        fig = df.plot.scatter(x='size', y='status').get_figure()
        fig.savefig(os.sep.join(data_plots_folder + ['sizes_by_result.png']))
        plt.close(fig)

    # Now do the training
    example_ids = [run.uuid for run in runs]
    classes = np.array(classes)
    print("\nTraining data shape: (%d, %d)" % n_examples.shape)
    if train:
        if debug:
            tf.logging.set_verbosity(tf.logging.DEBUG)
        # NOTE(review): config is not passed to anything in this function.
        config = tf.ConfigProto(log_device_placement=True, )
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        model = svm_trainer.SVMTrainer(n_examples, example_ids, labels,
                                       classes, dataset_name=dataset,
                                       force_gpu=gpu)
        model.train(steps=steps)
def train_model(build_name):
    """Start training the configured estimator on the runs of a build.

    The dataset name is the value of the module-level ``estimator`` and all
    data is read from and written to the module-level ``model_dir``. When
    ``runs.json.gz`` does not exist yet for the dataset, the runs are looked
    up by ``build_name`` and saved together with a minimal model
    configuration; otherwise the saved runs are loaded.

    For the 'svm' estimator the examples are built before returning, while
    the dataset normalization and the trainer set-up run in a background
    thread. For any other estimator all the per-run work runs in a
    background thread.

    Args:
        build_name: name of the build whose runs are used for training.

    Returns:
        The ``("training started", 202)`` tuple. When data is missing for
        some runs, the list of runs is saved again without them and the
        request is aborted with a 400 response instead.
    """
    global estimator
    global model_dir
    dataset = estimator
    with session_scope() as session:
        runs_file = os.sep.join([model_dir, 'data', dataset, 'runs.json.gz'])
        if not os.path.isfile(runs_file):
            runs = gather_results.get_runs_by_name(None,
                                                   build_name=build_name,
                                                   session=session)
            model_config = {'build_name': build_name}
            gather_results.save_model_config(dataset, model_config,
                                             data_path=model_dir)
            gather_results.save_run_uuids(dataset, runs,
                                          data_path=model_dir)
        else:
            runs = gather_results.load_run_uuids(dataset,
                                                 data_path=model_dir)
        normalized_length = 5500
        if estimator == 'svm':
            skips = []
            classes = []
            labels = []
            examples = []
            class_label = 'status'
            features_regex = None
            sample_interval = None
            idx = 0
            # Model configuration. We need to cache sample_interval,
            # features-regex and the normalization parameters for each
            # feature so we can re-use them during prediction.
            # build_name is kept as well, since this configuration replaces
            # the one saved above when the runs were first stored.
            model_config = {
                'build_name': build_name,
                'sample_interval': sample_interval,
                'features_regex': features_regex,
                'normalized_length': normalized_length
            }
            for run in runs:
                results = gather_results.get_subunit_results_for_run(
                    run, '1s', session=None, data_path=model_dir,
                    use_cache=True)
                print('Acquired run %s' % run.uuid)
                # For one run_uuid we must only get one example (result)
                result = results[0]
                if not result:
                    skips.append(run.uuid)
                    continue
                # Setup the numpy matrix and sizes
                if len(examples) == 0:
                    # Adjust normalized_length to the actual re-sample one
                    num_columns = len(result['dstat'].columns)
                    examples = np.ndarray(
                        shape=(len(runs), num_columns * normalized_length))
                    model_config['num_columns'] = num_columns
                    model_config['num_features'] = (
                        num_columns * normalized_length)
                # Normalize data
                example = fixed_lenght_example(result, normalized_length)
                # Normalize status
                status = get_class(result, class_label)
                vector, new_labels = unroll_example(
                    example, normalized_length, labels)
                # Only calculate labels for the first example
                if len(labels) == 0:
                    labels = new_labels
                    model_config['labels'] = labels
                # Examples is an np ndarray
                examples[idx] = vector.values
                classes.append(status)
                # Move on to the next row, otherwise every example ends up
                # in row 0 and the other rows are left uninitialized
                idx += 1
            if len(skips) > 0:
                print('Unable to train model because of missing '
                      'runs %s' % skips)
                safe_runs = [
                    run for run in runs if run.uuid not in skips]
                gather_results.save_run_uuids(dataset, safe_runs,
                                              data_path=model_dir)
                message = ('The model has been updated to exclude '
                           'those runs. Please re-run the training'
                           ' step.')
                abort(make_response(message, 400))

            def run_training():
                # Perform dataset-wise normalization
                # NOTE(andreaf) When we train the model we ignore any saved
                # normalization parameter, since the sample interval and
                # features may be different.
                n_examples, normalization_params = normalize_dataset(
                    examples, labels)
                # We do cache the result to normalize the prediction set.
                model_config['normalization_params'] = normalization_params
                gather_results.save_model_config(dataset, model_config,
                                                 data_path=model_dir)
                # Now do the training
                example_ids = [run.uuid for run in runs]
                outclasses = np.array(classes)
                # NOTE(review): the trainer is only constructed here, no
                # method is called on it - confirm that this is enough to
                # train the model.
                svm_trainer.SVMTrainer(n_examples, example_ids, labels,
                                       outclasses, dataset_name=dataset,
                                       model_path=model_dir)

            thread = threading.Thread(target=run_training)
            thread.start()
            return "training started", 202
        else:
            def run_nn_training():
                # NOTE(review): this thread keeps using session after the
                # enclosing with block has been left by the return below -
                # confirm the session is still usable at that point.
                for run in runs:
                    uuid = run.uuid
                    result = gather_results.get_subunit_results_for_run(
                        run, '1s', session=session, use_cache=False,
                        data_path=model_dir)[0]
                    try:
                        features, labels = nn_trainer.normalize_data(result)
                    except TypeError:
                        print('Unable to normalize data in run %s, '
                              'skipping' % uuid)
                        continue
                    nn_trainer.train_model(features, labels,
                                           dataset_name=dataset,
                                           model_path=model_dir)
                print('done')

            thread = threading.Thread(target=run_nn_training)
            thread.start()
            return "training started", 202
def prepare_dataset(dataset, normalized_length, num_dstat_features, data_type,
                    features_regex, sample_interval='1s', class_label='status',
                    visualize=False, data_path=None, target_data_path=None,
                    s3=None):
    """Load the runs of one set of a dataset and turn them into examples.

    Loads the run ids of the ``data_type`` set from the dataset
    configuration. Loads the data for every run. Builds the unrolled
    examples as a numpy ndarray. Builds the classes as a numpy array.
    Does some visualization (if enabled).

    If data is missing for any run, the run ids of the set are saved again
    without the missing runs and the process exits with status 1.

    Args:
        dataset: name of the dataset.
        normalized_length: length every example is brought to.
        num_dstat_features: number of features, used to size the examples.
        data_type: name of the set to load.
        features_regex: passed to ``filter_example``.
        sample_interval: passed to the results loader and used as prefix
            for the plot file names.
        class_label: passed to ``get_class``.
        visualize: generate plots.
        data_path: location the run data is loaded from.
        target_data_path: location of the dataset.
        s3: s3 client passed through to the ``gather_results`` helpers.

    Returns:
        A ``(data, figure_sizes)`` tuple. ``data`` is a dict with the
        'examples', 'example_ids' and 'classes' arrays; ``figure_sizes`` is
        an array of (size, status) pairs, empty unless ``visualize`` is set.
    """
    if visualize:
        data_plots_folder = [
            os.path.dirname(os.path.realpath(__file__)), os.pardir, 'data',
            dataset, 'plots'
        ]
        os.makedirs(os.sep.join(data_plots_folder), exist_ok=True)

    # Load the list of runs and base labels
    runs = gather_results.load_run_uuids(dataset, name=data_type,
                                         data_path=target_data_path, s3=s3)

    # run_uuids are the example_ids
    sizes = []
    # The data for each example.
    examples = examples_ndarray(len(runs), num_dstat_features,
                                normalized_length)

    # The test result for each example
    classes = []
    skips = []
    print("Loading %s data:" % data_type, end='\r', flush=True)
    for count, run in enumerate(runs):
        print("Loading %s data: %d of %d" % (data_type, count + 1, len(runs)),
              end='\r', flush=True)
        result = gather_results.get_subunit_results_for_run(
            run, sample_interval, data_path=data_path, s3=s3)
        # For one run_uuid we must only get one example (result)
        if not result:
            skips.append(run.uuid)
            continue

        # Apply column filtering
        result = filter_example(result, features_regex)

        # Normalize data
        example = fixed_lenght_example(result, normalized_length)

        vector = unroll_example(example, normalized_length)

        # Normalize status
        status = get_class(result, class_label)

        # Examples is an np ndarray
        examples[count] = vector.values
        classes.append(status)

        # Plot from figures
        if visualize:
            # Prepare some more data if we are going to visualize
            sizes.append((result['dstat'].shape[0], status))
            figure_name = sample_interval + "_%s_" + str(count)
            plots = (
                ("downsampled", result['dstat']),  # un-normalized data
                ("fixedsize", example),            # fixed size data
                ("unrolled", pd.Series(vector)),   # unrolled data
            )
            for plot_kind, plot_data in plots:
                fig = plot_data.plot().get_figure()
                fig.savefig(os.sep.join(
                    data_plots_folder + [figure_name % plot_kind]))
                plt.close(fig)

    print("Loading %s data: %d done!" % (data_type, len(runs)))
    # Check that everything went well
    if len(skips) > 0:
        print('Unable to train model because of missing runs %s' % skips)
        safe_runs = [run.uuid for run in runs if run.uuid not in skips]
        # Write back the same list that was loaded above. Without the name
        # and location the cleaned list would land in the default data path
        # and replace the complete list of runs with a subset of one set.
        gather_results.save_run_uuids(dataset, safe_runs, name=data_type,
                                      data_path=target_data_path, s3=s3)
        print('The model has been updated to exclude those runs.')
        print('Please re-run the training step.')
        sys.exit(1)

    classes = np.array(classes)
    figure_sizes = np.array(sizes)
    example_ids = np.array(runs)

    print("%s set: examples: %s, classes: %s, example IDs: %s" % (
        data_type, str(examples.shape), str(classes.shape),
        str(example_ids.shape)))

    data = {
        'examples': examples,
        'example_ids': example_ids,
        'classes': classes
    }

    return data, figure_sizes