Example no. 1
import sys

sys.path.append('../')
from loglizer.models import PCA
from loglizer import dataloader, preprocessing

struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # The structured log file
label_file = '../data/HDFS/anomaly_label.csv'  # The anomaly label file

if __name__ == '__main__':
    # Load the labeled HDFS log, windowed by session, and split it 50/50
    # into train/test using uniform sampling.
    (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
        struct_log,
        label_file=label_file,
        window='session',
        train_ratio=0.5,
        split_type='uniform',
    )

    # Fit TF-IDF weighting + zero-mean normalization on the training split
    # only, then apply the already-fitted transform to the test split.
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(
        x_train, term_weighting='tf-idf', normalization='zero-mean')
    x_test = feature_extractor.transform(x_test)

    # Train the PCA anomaly detector on the training features.
    model = PCA()
    model.fit(x_train)

    # Report precision/recall/F1 on both splits.
    print('Train validation:')
    precision, recall, f1 = model.evaluate(x_train, y_train)

    print('Test validation:')
    precision, recall, f1 = model.evaluate(x_test, y_test)
Example no. 2
struct_log = '../data/HDFS/HDFS_100k.log_structured.csv'  # The structured log file

if __name__ == '__main__':
    ## 1. Load structured log file and extract feature vectors.
    # save_csv=True also persists the raw event-sequence file for inspection.
    (x_train, _), (_, _) = dataloader.load_HDFS(
        struct_log, window='session', save_csv=True)

    # Fit TF-IDF weighting with zero-mean normalization on the training data.
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(
        x_train, term_weighting='tf-idf', normalization='zero-mean')

    ## 2. Train an unsupervised model.
    # PCA is used here; LogClustering or InvariantsMiner are drop-in
    # alternatives. Hyper-parameters can be data-sensitive; defaults suffice
    # for this demo.
    print('Train phase:')
    model = PCA()
    model.fit(x_train)
    # Predict on the training data itself and manually verify correctness;
    # details may require going back to the raw logs.
    y_train = model.predict(x_train)

    ## 3. Use the trained model for online anomaly detection.
    # A new log file would be loaded here; struct_log is reused for demo only.
    print('Test phase:')
    (x_test, _), (_, _) = dataloader.load_HDFS(struct_log, window='session')
    # Reuse the fitted extractor: transform(), not fit_transform(), at test time.
    x_test = feature_extractor.transform(x_test)
    # Finally, make predictions and alert on anomaly cases.
    y_test = model.predict(x_test)
Example no. 3
    # Fit the DataFrame-based feature extractor on the training workflows.
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.df_fit_transform(x_train)

    # Annotate the key dictionary with the learned IDF weights — the file is
    # read and rewritten in place at the same path — then render a chart of
    # the keys from the updated dictionary.
    graphs_and_reports.add_weights_to_key_dict(
        "../analysis_results/HDFS/key_dict.txt",
        feature_extractor.idf_vec.to_dict(),
        output_path='../analysis_results/HDFS/key_dict.txt')
    graphs_and_reports.get_keys_chart(
        "../analysis_results/HDFS/key_dict.txt",
        output_path=r'../analysis_results/HDFS/keys_chart.png')

    ## Train an unsupervised model (PCA) on the extracted features.
    print('Train phase:')
    model = PCA()
    model.fit(x_train)

    print('Prediction phase:')
    # NOTE(review): .values is passed here (unlike the other examples), and
    # the result is indexed with ['prediction'] below — this predict()
    # presumably returns a DataFrame with a 'prediction' column; confirm
    # against the model implementation.
    prediction = model.predict(x_train.values)

    print(prediction)
    print(prediction['prediction'].values)

    # Load test workflows grouped into time windows of time_delta_sec.
    # test_path and time_delta_sec are defined outside this fragment.
    x_test = dataloader.load_HDFS_data_timestamp_approach(
        test_path,
        time_delta_sec=time_delta_sec,
        timestamp_format='%Y-%m-%d %H:%M:%S',
        cached_workflow_path=r'..\cached\HDFS\test_workflow.csv')
    #x_train = dataloader.load_HDFS_data_debug(r'..\cached\HDFS\train_workflow.csv')
    # Transform the test set with the extractor fitted on the training data.
    x_test = feature_extractor.transform(x_test)