예제 #1
0
def parser():
    """Parse the command line and load the requested Zeek log.

    Recognised arguments:
        -j / --json_format: load the log with ``import_json`` instead of
            ``LogToDataFrame``.
        -l / --length: integer threshold handed back to the caller
            (defaults to 0).
        zeek_log_path: location of the log to load.

    Returns:
        A ``(df, length)`` tuple holding whatever the selected loader
        returned and the value of ``--length``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-j', '--json_format',
        action='store_true',
        help='Import zeek log in json string format')
    cli.add_argument(
        '-l', '--length',
        type=int,
        default=0,
        help='Filter queries that have characters greater than this length')
    cli.add_argument(
        'zeek_log_path',
        type=str,
        help='Type in location of zeek log')
    options = cli.parse_args()

    # ASCII is the default format; handle it first and return early.
    if not options.json_format:
        print('**Importing zeek log in ascii format**')
        print(
            '**If this hangs for longer than a 17 sec, high chance you are trying to import a log in json format instead, use -j**'
        )
        ascii_loader = LogToDataFrame()
        frame = ascii_loader.create_dataframe(options.zeek_log_path)
        return frame, options.length

    print('**Importing zeek log in json format**')
    frame = import_json(options.zeek_log_path)
    return frame, options.length
예제 #2
0
def detect(file, amountanom):
    """Apply a very simple Isolation Forest anomaly detector to a Zeek log.

    The log is loaded with ``LogToDataFrame``, the numeric columns are
    cleaned, the selected features are converted to a normalized matrix
    with ``DataFrameToMatrix`` and an ``IsolationForest`` is fitted on it.
    The shape of the frame of rows predicted as outliers is printed.

    Args:
        file: path of the log, passed to ``LogToDataFrame.create_dataframe``.
            The feature list below only matches conn.log files.
        amountanom: the top number of anomalies to print. Accepted for
            interface compatibility; this function does not use it.

    Returns:
        None. The result is only printed.
    """
    zeek_df = LogToDataFrame().create_dataframe(file)

    # Placeholder label for models able to work in a semi-supervised mode.
    # Everything is 'normal' for now and the label is not used for detection.
    zeek_df['label'] = 'normal'

    # Replace the rows without data ('-') and the missing values with 0, then
    # give each column an integer type. This may bias the algorithms but is
    # better than discarding the lines.
    # The result is assigned back to the frame: a chained
    # ``df[col].replace(..., inplace=True)`` operates on a temporary object
    # and is not guaranteed to update ``zeek_df``.
    # NOTE(review): int32 is kept from the original; confirm byte counters
    # cannot exceed its range for the logs being analysed.
    count_columns = (
        'orig_bytes', 'resp_bytes', 'resp_pkts', 'orig_ip_bytes',
        'resp_ip_bytes',
    )
    for column in count_columns:
        zeek_df[column] = (
            zeek_df[column].replace('-', '0').fillna(0).astype('int32'))

    # Columns from the log file that we know are numbers. This is only for
    # conn.log files.
    features = [
        'duration', 'orig_bytes', 'id.resp_p', 'resp_bytes', 'orig_ip_bytes',
        'resp_pkts', 'resp_ip_bytes'
    ]
    to_matrix = DataFrameToMatrix()
    zeek_matrix = to_matrix.fit_transform(zeek_df[features], normalize=True)

    # The same data is used for fitting and for searching anomalies.
    # The former ``behaviour='new'`` argument is not passed any more: it was
    # removed from scikit-learn and raises TypeError on current releases,
    # while predict() flags the same rows without it.
    odd_clf = IsolationForest(contamination=0.25)  # Marking 25% odd
    odd_clf.fit(zeek_matrix)

    # predict() labels outliers with -1.
    odd_df = zeek_df[features][odd_clf.predict(zeek_matrix) == -1]
    print(odd_df.shape)
예제 #3
0
def parser():
    """Parse the command line and load a Zeek ssl.log into a dataframe.

    Exits with status 1 when unknown arguments are supplied or when the
    given path does not contain the substring ``'ssl'``. The ``-t`` flag is
    parsed but not consulted by this function.

    Returns:
        Whatever the selected loader returns: ``import_json`` with ``-j``,
        otherwise ``LogToDataFrame.create_dataframe``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-j', '--json_format',
        action='store_true',
        help='Import zeek log in json string format')
    cli.add_argument(
        'zeek_log_path',
        type=str,
        help='Type in location of zeek log')
    cli.add_argument(
        '-t',
        action='store_true',
        default=False,
        help='Sets the program to tail a live Zeek log')
    options, leftovers = cli.parse_known_args()

    # Refuse anything argparse did not recognise.
    if leftovers:
        print('Unrecognized args: %s' % leftovers)
        sys.exit(1)

    # Sanity check that this is a ssl log.
    if 'ssl' not in options.zeek_log_path:
        print('This example only works with Zeek ssl.log files..')
        sys.exit(1)

    # The path may start with a tilde. It is known to be non-empty here
    # because it contains 'ssl', so no extra truthiness test is needed.
    log_path = os.path.expanduser(options.zeek_log_path)
    options.zeek_log_path = log_path

    # ASCII is the default format; json needs the explicit -j switch.
    if not options.json_format:
        print('**Importing zeek log in ascii format**')
        print(
            '**If this hangs for longer than a 17 sec, high chance you are trying to import a log in json format instead, use -j**'
        )
        ascii_loader = LogToDataFrame()
        return ascii_loader.create_dataframe(log_path)

    print('**Importing zeek log in json format**')
    return import_json(log_path)
def detect(file, amountanom, realtime, dumptocsv):
    """Apply a very simple anomaly detector and print the top anomalies.

    The log is loaded with ``LogToDataFrame``, missing numeric values are
    filled with 0, a ``PCA`` detector is fitted on the numeric conn.log
    columns and the highest scoring predicted anomalies are printed.

    Args:
        file: path of the conn.log to analyse.
        amountanom: the top number of anomalies we want to print.
        realtime: whether to read the conn.log in real time. Not working;
            this function does not use the value.
        dumptocsv: path where the cleaned dataframe is saved as CSV. The
            string ``"None"`` (or ``None`` itself) disables the dump.

    Returns:
        None. The result is only printed.
    """
    # Create a Pandas dataframe from the conn.log
    log_to_df = LogToDataFrame()
    bro_df = log_to_df.create_dataframe(file, ts_index=False)

    # Placeholder label for models able to work in a semi-supervised mode.
    # Everything is 'normal' for now and the label is not used for detection.
    bro_df['label'] = 'normal'

    # Change the timedelta values to seconds; scikit-style estimators do not
    # know how to work with timedeltas.
    bro_df['durationsec'] = bro_df.duration.apply(lambda x: x.total_seconds())

    # Fill the missing values with 0. Even though this may add a bias in the
    # algorithms, it is better than not using the lines.
    numeric_columns = (
        'orig_bytes', 'resp_bytes', 'resp_pkts', 'orig_ip_bytes',
        'resp_ip_bytes', 'durationsec',
    )
    for column in numeric_columns:
        bro_df[column] = bro_df[column].fillna(0)

    # Save dataframe to disk as CSV. Callers pass the string "None" to skip
    # the dump; a real None is treated the same way.
    if dumptocsv is not None and dumptocsv != "None":
        bro_df.to_csv(dumptocsv)

    # Columns from the log file that we know are numbers. This is only for
    # conn.log files.
    feature_columns = [
        'durationsec', 'orig_bytes', 'id.resp_p', 'resp_bytes',
        'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes'
    ]
    X_train = bro_df[feature_columns]

    # Detector selection. PCA is good and fast. Alternatives that were tried
    # here: ABOD (angle-based outlier detection), LOF, CBLOF, LOCI, LSCP,
    # MCD, OCSVM, SOD, SO_GAAL, SOS, XGBOD and KNN (good results but slow,
    # e.g. KNN(n_neighbors=10)).
    clf = PCA()

    # Fit the model, then search for anomalies in the very same data.
    clf.fit(X_train)
    y_test_pred = clf.predict(X_train)  # outlier labels (0 or 1)
    y_test_scores = clf.decision_function(X_train)  # outlier scores

    # Work on an explicit copy so that adding the result columns neither
    # alters the training frame nor writes into a slice of bro_df.
    X_test = X_train.copy()
    X_test['score'] = y_test_scores
    X_test['pred'] = y_test_pred

    # Add the score to the bro_df also, so we can show it at the end.
    bro_df['score'] = X_test['score']

    # Keep only what we predict is an anomaly, highest scores first, and
    # limit the result to the requested amount.
    X_test_predicted = X_test[X_test.pred == 1]
    top_anomalies = X_test_predicted.sort_values(
        by='score', ascending=False).iloc[:amountanom]

    # Find the predicted anomalies in the original dataframe, where the rest
    # of the data is. The index holds labels, so look them up with .loc.
    df_to_print = bro_df.loc[top_anomalies.index]
    print('\nFlows of the top anomalies')

    # Only print some columns, not all, so it is easier to read. Columns
    # absent from this particular log are skipped instead of raising.
    hidden_columns = [
        'conn_state', 'history', 'local_orig', 'local_resp', 'missed_bytes',
        'ts', 'tunnel_parents', 'uid', 'label'
    ]
    df_to_print = df_to_print.drop(hidden_columns, axis=1, errors='ignore')
    print(df_to_print)
예제 #5
0
    parser.add_argument('zeek_log',
                        type=str,
                        help='Specify a zeek log to run ZeekLogReader test on')
    args, commands = parser.parse_known_args()

    # Check for unknown args
    if commands:
        print('Unrecognized args: %s' % commands)
        sys.exit(1)

    # File may have a tilde in it
    if args.zeek_log:
        args.zeek_log = os.path.expanduser(args.zeek_log)

        # Create a Pandas dataframe from a Zeek log
        log_to_df = LogToDataFrame()
        zeek_df = log_to_df.create_dataframe(args.zeek_log)

        # Print out the head of the dataframe
        print(zeek_df.head())

        # Print out the types of the columns
        print(zeek_df.dtypes)

        # Print out size and memory usage
        print('DF Shape: {:s}'.format(str(zeek_df.shape)))
        print('DF Memory:')
        memory_usage = zeek_df.memory_usage(deep=True)
        total = memory_usage.sum()
        for item in memory_usage.items():
            print('\t {:s}: \t{:.2f} MB'.format(item[0], item[1] / 1e6))
def parser():
    """Parse the command line and load one or many Zeek ssl logs.

    With ``-d`` the positional path is listed with ``os.listdir`` and every
    entry is loaded and concatenated; otherwise the path is loaded as a
    single log. ``-j`` selects ``import_json`` over ``LogToDataFrame``.
    Exits with status 1 on unknown arguments or when the path does not
    contain the substring ``'ssl'``. The ``-t`` flag is parsed but not
    consulted by this function.

    Returns:
        The loaded (and, for directories, concatenated) dataframe.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-j', '--json_format',
        action='store_true',
        help='Import zeek log in json string format')
    cli.add_argument(
        '-d', '--directory',
        action='store_true',
        help='Import zeek logs from directory')
    cli.add_argument(
        'zeek_log_path',
        type=str,
        help='Type in location of zeek log')
    cli.add_argument(
        '-t',
        action='store_true',
        default=False,
        help='Sets the program to tail a live Zeek log')
    options, leftovers = cli.parse_known_args()

    # Refuse anything argparse did not recognise.
    if leftovers:
        print('Unrecognized args: %s' % leftovers)
        sys.exit(1)

    # Sanity check that this is a ssl log.
    if 'ssl' not in options.zeek_log_path:
        print('This example only works with Zeek ssl.log files..')
        sys.exit(1)

    # The path may start with a tilde. It is known to be non-empty here
    # because it contains 'ssl'.
    location = os.path.expanduser(options.zeek_log_path)
    options.zeek_log_path = location

    # One branch per (format, single file / directory) combination.
    if options.json_format and options.directory:
        print('**Importing zeek logs from directory in json format**')
        members = [
            os.path.join(location, entry) for entry in os.listdir(location)
        ]
        df = pd.concat([import_json(member) for member in members])
    elif options.json_format:
        print('**Importing zeek log in json format**')
        df = import_json(location)
    elif options.directory:
        print('**Importing zeek logs from directory in ascii format**')
        members = [
            os.path.join(location, entry) for entry in os.listdir(location)
        ]
        ascii_loader = LogToDataFrame()
        df = pd.concat(
            [ascii_loader.create_dataframe(member) for member in members])
    else:
        print('**Importing zeek log in ascii format**')
        print(
            '**Hanging? High chance you are trying to import a log in json format, use -j**'
        )
        ascii_loader = LogToDataFrame()
        df = ascii_loader.create_dataframe(location)
    return df
예제 #7
0
            for j in self.job_queue:
                if not j.running():
                    break
            else:
                j = None
            if j is not None:
                self.finished_jobs.append(j)
                self.job_queue.remove(j)
            time.sleep(self.sleeping_time)
        self.job_queue.append(job)


# Script entry point: walk DATA_IN, load every ``*.log`` file into a dataframe
# and convert the column types that need it. The client, job queue and table
# id are prepared here but not used in this part of the script.
if __name__ == "__main__":
    client = bigquery.Client.from_service_account_json(
        GCP_SERVICE_ACCOUNT_JSON)
    log_to_df = LogToDataFrame()
    jq = JobQueue()
    for root, directories, files in os.walk(DATA_IN):
        for f in files:
            if f.endswith(".log"):
                # Table name derives from the file name, dots replaced.
                table_id = "cyber-258808.cicids_2017." + f.replace(".", "_")
                df = log_to_df.create_dataframe(os.path.join(root, f))
                # Turn the index back into a regular column.
                df.reset_index(inplace=True)
                # Series.items() replaces iteritems(), which was removed in
                # pandas 2.0; items() exists in older releases as well.
                for c, t in df.dtypes.items():
                    # print((c, str(t)))
                    if str(t) == "timedelta64[ns]":
                        # Durations become plain float seconds.
                        df[c] = df[c].dt.total_seconds()
                    elif str(t).endswith("[ns]"):
                        df[c] = df[c].astype(str(t).replace("[ns]", "[ms]"))
                        # accepting to lose precision because of pyarrow that would otherwise not be happy
                        # TODO? Better?
예제 #8
0
파일: zeek_to_pandas.py 프로젝트: maxrp/zat
    parser.add_argument('bro_log',
                        type=str,
                        help='Specify a bro log to run BroLogReader test on')
    args, commands = parser.parse_known_args()

    # Check for unknown args
    if commands:
        print('Unrecognized args: %s' % commands)
        sys.exit(1)

    # File may have a tilde in it
    if args.bro_log:
        args.bro_log = os.path.expanduser(args.bro_log)

        # Create a Pandas dataframe from a Zeek log
        log_to_df = LogToDataFrame()
        bro_df = log_to_df.create_dataframe(args.bro_log)

        # Print out the head of the dataframe
        print(bro_df.head())

        # Print out the types of the columns
        print(bro_df.dtypes)

        # Print out size and memory usage
        print('DF Shape: {:s}'.format(str(bro_df.shape)))
        print('DF Memory:')
        memory_usage = bro_df.memory_usage(deep=True)
        total = memory_usage.sum()
        for item in memory_usage.items():
            print('\t {:s}: \t{:.2f} MB'.format(item[0], item[1] / 1e6))
예제 #9
0
def parser():
    """Parse the command line and load one or many Zeek logs.

    With ``-d`` the positional path is listed with ``os.listdir`` and every
    entry is loaded and concatenated; otherwise the path is loaded as a
    single log. ``-j`` selects ``import_json`` over ``LogToDataFrame``.

    Returns:
        A ``(df, anomaly, clusters)`` tuple: the loaded dataframe, the
        boolean ``-a/--anomaly`` flag and the integer ``-c/--clusters``
        value (4 when omitted).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '-j', '--json_format',
        action='store_true',
        help='Import zeek log in json string format')
    cli.add_argument(
        '-a', '--anomaly',
        action='store_true',
        help='Perform clustering on anomalies')
    cli.add_argument(
        '-d', '--directory',
        action='store_true',
        help='Import zeek logs from directory')
    cli.add_argument(
        '-c', '--clusters',
        type=int,
        default=4,
        help='Number of clusters to divide data, default=4')
    cli.add_argument(
        'zeek_log_path',
        type=str,
        help='Type in location of zeek log')
    options = cli.parse_args()
    location = options.zeek_log_path

    # One branch per (format, single file / directory) combination.
    if options.json_format and options.directory:
        print('**Importing zeek logs from directory in json format**')
        members = [
            os.path.join(location, entry) for entry in os.listdir(location)
        ]
        # All frames are collected first and concatenated once.
        df = pd.concat([import_json(member) for member in members])
    elif options.json_format:
        print('**Importing zeek log in json format**')
        df = import_json(location)
    elif options.directory:
        print('**Importing zeek logs from directory in ascii format**')
        members = [
            os.path.join(location, entry) for entry in os.listdir(location)
        ]
        ascii_loader = LogToDataFrame()
        df = pd.concat(
            [ascii_loader.create_dataframe(member) for member in members])
    else:
        print('**Importing zeek log in ascii format**')
        print(
            '**If this hangs for longer than a 17 sec, high chance you are trying to import a log in json format instead, use -j**'
        )
        ascii_loader = LogToDataFrame()
        df = ascii_loader.create_dataframe(location)
    return df, options.anomaly, options.clusters