# Example 1
def main(argv):
    """Hunt for anomalous entries in a Bro DNS log.

    Reads a Bro DNS log (one JSON object per line), flags outliers with an
    IsolationForest, clusters the outliers with KMeans, and writes the
    results to 'outliers.json' and 'kmeans-clusters.json'.

    Args:
        argv: command-line arguments (excluding the program name).
              -i/--ifile <path> is required; -c/--cont <float> is the
              IsolationForest contamination (defaults to 0.1).

    Exits with status 2 on bad/missing arguments.
    """
    start_time = datetime.now()

    print('Starting...\n')

    usage = 'bro-dns-ml-hunt.py -i <input Bro DNS file> -c <contamination>'

    ifile = None
    cont = None

    try:
        opts, args = getopt.getopt(argv, "hi:c:", ["ifile=", "cont=", ])
    except getopt.GetoptError:
        print('Check your input parameters')
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            ifile = arg
        elif opt in ("-c", "--cont"):
            cont = float(arg)

    if not ifile:
        print('A Bro log file must be provided as input')
        print(usage)
        sys.exit(2)

    # BUG FIX: the message promised 0.1 but the code set 0.2.  Also use an
    # explicit None check so a deliberate '-c 0' is not silently replaced.
    if cont is None:
        print('Using default contamination value: 0.1')
        cont = 0.1

    # Fixed seed so repeated runs over the same log give the same results.
    rng = np.random.RandomState(42)

    # Load the target data set: one JSON record per line.
    with open(ifile, 'r') as infile:
        original_data = [json.loads(line.strip()) for line in infile]

    # select_fields() (defined elsewhere in this file) returns the reduced
    # feature records plus 'srows', mapping matrix rows back to original rows.
    target_data, srows = select_fields(original_data)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        # Build a dataframe and encode it as a numeric matrix
        # (DataFrameToMatrix handles the categorical columns).
        bro_target_df = pd.DataFrame.from_dict(target_data, orient='columns')
        to_matrix = dataframe_to_matrix.DataFrameToMatrix()
        bro_target_matrix = to_matrix.fit_transform(bro_target_df)

        # Train the Isolation Forest model.
        iforest = IsolationForest(max_samples=100, contamination=cont,
                                  random_state=rng, verbose=False)
        iforest.fit(bro_target_matrix)

        # predict() returns -1 for outliers, 1 for inliers.
        outliers = iforest.predict(bro_target_matrix)

        # Save all outliers, mapped back to the original records via srows.
        with open('outliers.json', 'w') as outfile:
            for i in range(len(outliers)):
                if outliers[i] == -1:
                    outfile.write(json.dumps(original_data[srows[i]]) + '\n')

        # Isolate the outliers for a closer look.
        odd_df = bro_target_df[outliers == -1]

        # ROBUSTNESS FIX: with zero outliers the original code crashed on
        # KMeans(n_clusters=0); emit an empty cluster file instead.
        if len(odd_df) == 0:
            print('No outliers found')
            open('kmeans-clusters.json', 'w').close()
        else:
            # Explore the outliers with the help of KMeans.
            odd_matrix = to_matrix.fit_transform(odd_df)
            # 4 clusters unless we have fewer than 4 observations.
            num_clusters = min(len(odd_df), 4)
            odd_df['cluster'] = KMeans(n_clusters=num_clusters).fit_predict(odd_matrix)

            # Group the dataframe by cluster.
            cluster_groups = odd_df[['cluster']].groupby('cluster')

            # Save all outliers per cluster.
            with open('kmeans-clusters.json', 'w') as outfile:
                for key, group in cluster_groups:
                    outfile.write('#Cluster {:d}: {:d} observations'.format(key, len(group)) + '\n')
                    np_matrix = group.to_records()
                    for item in np_matrix:
                        # item[0] is the row's index in bro_target_df, which
                        # indexes srows back to the original record.
                        outfile.write(json.dumps(original_data[srows[item[0]]]) + '\n')

    print('\nDone!')
    print('Your results have been saved to the files outliers.json and kmeans-clusters.json')
    print('Time elapsed: ' + str(datetime.now() - start_time))
    print('Have a nice day!')
# Example 2 (fragment: interior of a function whose definition is not shown)
        # Load the Bro log into a pandas DataFrame.  'args.bro_log' comes
        # from argument parsing in the enclosing (not visible) function.
        bro_df = log_to_dataframe.LogToDataFrame(args.bro_log)

        # Add query length as a numeric feature.
        bro_df['query_length'] = bro_df['query'].str.len()

        # Min-max normalize the query length into [0, 1].
        # NOTE(review): divides by (max - min) — assumes the column is not
        # constant, otherwise this yields a division by zero; confirm.
        ql = bro_df['query_length']
        bro_df['query_length_norm'] = (ql - ql.min()) / (ql.max()-ql.min())

        # These are the features we want (note some of these are categorical!)
        features = ['AA', 'RA', 'RD', 'TC', 'Z', 'rejected', 'proto', 'query',
                    'qclass_name', 'qtype_name', 'rcode_name', 'query_length_norm']
        feature_df = bro_df[features]

        # Encode the mixed categorical/numeric dataframe as a numeric matrix
        # (DataFrameToMatrix handles categorical data).
        to_matrix = dataframe_to_matrix.DataFrameToMatrix()
        bro_matrix = to_matrix.fit_transform(feature_df)

        # Now we're ready for scikit-learn!
        # Just some simple stuff for this example, KMeans and PCA.
        kmeans = KMeans(n_clusters=5).fit_predict(bro_matrix)
        pca = PCA(n_components=2).fit_transform(bro_matrix)

        # Put the ML results back onto the dataframe; jitter() (defined
        # elsewhere) perturbs the PCA coordinates, presumably for plotting.
        bro_df['x'] = jitter(pca[:, 0]) # PCA X Column
        bro_df['y'] = jitter(pca[:, 1]) # PCA Y Column
        bro_df['cluster'] = kmeans

        # Group the dataframe by cluster, keeping a few display columns.
        show_fields = ['query', 'Z', 'proto', 'qtype_name', 'x', 'y', 'cluster']
        cluster_groups = bro_df[show_fields].groupby('cluster')