"""Demo: supervised evaluation of the PCA anomaly-detection model on HDFS logs."""
import sys
sys.path.append('../')
from loglizer.models import PCA
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # The structured log file
label_file = '../data/HDFS/anomaly_label.csv'  # The anomaly label file

if __name__ == '__main__':
    # Load the structured HDFS log, window events by session, and split
    # uniformly into 50% train / 50% test with ground-truth anomaly labels.
    (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
        struct_log,
        label_file=label_file,
        window='session',
        train_ratio=0.5,
        split_type='uniform',
    )

    # Fit TF-IDF weighting + zero-mean normalization on the training split
    # only, then apply the already-fitted transform to the test split so no
    # test information leaks into the feature statistics.
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(
        x_train, term_weighting='tf-idf', normalization='zero-mean'
    )
    x_test = feature_extractor.transform(x_test)

    # Train the PCA model on the training feature matrix.
    model = PCA()
    model.fit(x_train)

    # evaluate() reports precision/recall/F1; the returned metrics were
    # previously unpacked into variables that were never used, so the dead
    # assignments are dropped.
    print('Train validation:')
    model.evaluate(x_train, y_train)

    print('Test validation:')
    model.evaluate(x_test, y_test)
struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # The structured log file

if __name__ == '__main__':
    ## 1. Load the structured log file and extract feature vectors.
    # Passing save_csv=True also dumps the raw event-sequence file to disk.
    (x_train, _), (_, _) = dataloader.load_HDFS(struct_log,
                                                window='session',
                                                save_csv=True)
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train,
                                              term_weighting='tf-idf',
                                              normalization='zero-mean')

    ## 2. Train an unsupervised model.
    print('Train phase:')
    # PCA is used here; other unsupervised models (LogClustering,
    # InvariantsMiner) can be swapped in the same way.
    model = PCA()
    # Hyper-parameters can be sensitive to the log data; the defaults are
    # kept for this demo.
    model.fit(x_train)
    # Predict on the training data and check correctness manually —
    # details may require going back to the raw logs.
    y_train = model.predict(x_train)

    ## 3. Use the trained model for online anomaly detection.
    print('Test phase:')
    # A genuinely new log file would be loaded here; struct_log is reused
    # purely for demo purposes.
    (x_test, _), (_, _) = dataloader.load_HDFS(struct_log, window='session')
    # Reuse the fitted extractor: transform(), not fit_transform().
    x_test = feature_extractor.transform(x_test)
    # Finally, predict and alert on the anomalous cases.
    y_test = model.predict(x_test)
# NOTE(review): this chunk begins mid-script — `preprocessing`,
# `graphs_and_reports`, `dataloader`, `PCA`, `test_path` and
# `time_delta_sec` are defined earlier in the file / in imports not
# visible here.

feature_extractor = preprocessing.FeatureExtractor()
# df_fit_transform: DataFrame-based fit+transform variant — presumably
# learns TF-IDF weights (idf_vec) as a side effect; confirm against the
# FeatureExtractor source.
x_train = feature_extractor.df_fit_transform(x_train)

# Persist the learned IDF weights into the key dictionary (input and
# output path are the same file, i.e. an in-place update), then render
# a chart of the keys from that dictionary.
graphs_and_reports.add_weights_to_key_dict(
    "../analysis_results/HDFS/key_dict.txt",
    feature_extractor.idf_vec.to_dict(),
    output_path='../analysis_results/HDFS/key_dict.txt')
graphs_and_reports.get_keys_chart(
    "../analysis_results/HDFS/key_dict.txt",
    output_path=r'../analysis_results/HDFS/keys_chart.png')

## Train an unsupervised model
print('Train phase:')
# Initialize PCA
model = PCA()
model.fit(x_train)

print('Prediction phase:')
# NOTE(review): predict() appears to return a DataFrame-like object with a
# 'prediction' column — verify against the PCA model implementation.
prediction = model.predict(x_train.values)
print(prediction)
print(prediction['prediction'].values)

# Load the test data, grouping events into sessions by fixed time windows
# (time_delta_sec); a cached workflow CSV is reused if present.
x_test = dataloader.load_HDFS_data_timestamp_approach(
    test_path,
    time_delta_sec=time_delta_sec,
    timestamp_format='%Y-%m-%d %H:%M:%S',
    cached_workflow_path=r'..\cached\HDFS\test_workflow.csv')
#x_train = dataloader.load_HDFS_data_debug(r'..\cached\HDFS\train_workflow.csv')
# Apply the already-fitted feature extractor to the test data.
x_test = feature_extractor.transform(x_test)