Exemplo n.º 1
0
def do_first_job(file_location: str):
    """Run the first experiment that is still marked as not done.

    The experiments mapping (command -> state) is read from
    ``file_location``. The first pending command is marked busy, executed,
    and then marked done; every state change is written back to the file.

    Args:
        file_location: Path of the experiments file.

    Returns:
        True when a job was picked up and completed, False when no pending
        job was found. On a non-zero exit code the error state is written
        to the file and the process exits with status 1.
    """
    experiments = get_experiments(file_location)

    numbered_jobs = enumerate(experiments.items(), start=1)
    for position, (command, status) in numbered_jobs:
        if status != EXPERIMENT_NOT_DONE:
            continue

        # Mark the job as taken before starting the (long) run.
        experiments[command] = EXPERIMENT_BUSY
        upload_experiments(file_location, experiments)

        gpu_flag = ' -g 16' if io.get('use_gpus') else ' -g 0'
        exit_code = do_job(command + gpu_flag)
        if exit_code != 0:
            error('An error occurred while executing command', command,
                  'giving exit code', exit_code)
            try_notify('A command failed')
            upload_experiments(file_location,
                               experiments,
                               is_error=True,
                               error_code=exit_code)
            sys.exit(1)

        experiments[command] = EXPERIMENT_DONE
        logline('Done with job', command)
        try_notify('Done with job ' + str(position))
        upload_experiments(file_location, experiments)
        # Only one job is handled per call.
        return True

    return False
Exemplo n.º 2
0
def get_features():
    """Load the dataset, filter and group its users, then generate features.

    The row count passed to ``gen_features`` is the number of rows left
    after filtering, taken before the rows are grouped.
    """
    raw_frame = get_pd_file()
    logline('Length before filtering is', len(raw_frame))

    filtered = filter_users(raw_frame)
    remaining_rows = len(filtered)
    logline('Length after filtering is', remaining_rows)

    gen_features(group_pd_file(filtered), remaining_rows)
Exemplo n.º 3
0
def get_pd_file() -> pd.DataFrame:
    """Open the configured HDF input file and read the selected rows.

    Reads the dataset named by ``get_dataset_name()`` from the
    ``input_file`` setting, from row 0 up to ``calc_rows_amount()``.

    NOTE(review): ``chunksize`` is passed to ``pd.read_hdf``; the pandas
    documentation describes that option as returning an iterator, which
    would not match the ``pd.DataFrame`` annotation - confirm.
    """
    logline('Opening file')
    key = get_dataset_name()
    source_path = io.get('input_file')
    row_limit = calc_rows_amount()

    return pd.read_hdf(source_path,
                       key,
                       start=0,
                       stop=row_limit,
                       chunksize=1000)
Exemplo n.º 4
0
def upload_experiments(file_location: str,
                       experiments: Dict[str, int],
                       is_error=False,
                       error_code=0):
    """Write the experiments mapping to ``file_location`` as JSON.

    Args:
        file_location: Path of the file to overwrite.
        experiments: Mapping that is serialised. When ``is_error`` is set
            it is mutated in place: an ``"error_code"`` entry is added.
        is_error: Whether an error state is being recorded.
        error_code: Exit code stored under ``"error_code"`` on error.
    """
    with open(file_location, 'w+') as handle:
        if not is_error:
            logline('Updated experiments file')
        else:
            logline('Set state to error state with code', error_code)
            experiments["error_code"] = error_code

        serialized = json.dumps(experiments)
        handle.write(serialized)
Exemplo n.º 5
0
def main():
    """Gather the features, log the total runtime and exit the process.

    Does nothing when ``io.run`` is falsy.
    """
    if not io.run:
        return

    started_at = time.time()
    percentage_text = str(io.get('dataset_percentage')) + "% of rows"
    logline("Gathering features for", percentage_text,
            "using a batch size of", BATCH_SIZE)

    get_features()
    # get_features_iter()

    elapsed = time.time() - started_at
    logline('Total runtime is',
            Timer.stringify_time(Timer.format_time(elapsed)))
    sys.exit()
Exemplo n.º 6
0
def split_dataframe(f: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split a frame into a training part and a test part.

    The rows are bucketed by position into chunks of ``len(f) / 10`` rows.
    Each chunk counts for 10 percentage points: chunks are assigned to the
    training set while the running percentage stays at or below
    ``TRAINING_SET_PERCENTAGE`` and to the test set afterwards.

    Returns:
        A ``(training, test)`` tuple of concatenated chunks.
    """
    logline('Splitting dataframes')

    # Try to get close to the target split
    total_rows = len(f)
    bucket_ids = np.arange(total_rows) // (total_rows / 10)

    training_parts = []
    test_parts = []
    for position, (_, chunk) in enumerate(f.groupby(bucket_ids)):
        running_percentage = (position + 1) * 10
        if running_percentage <= TRAINING_SET_PERCENTAGE:
            training_parts.append(chunk)
        else:
            test_parts.append(chunk)

    # noinspection PyTypeChecker
    return pd.concat(training_parts), pd.concat(test_parts)
Exemplo n.º 7
0
def output_data(users_list: List[Dict[str, Union[str,
                                                 Dict[str,
                                                      List[List[float]]]]]]):
    """Write the generated user data to the configured output.

    When the ``output_file`` setting is ``'stdout'`` the data is written to
    stdout as JSON. Otherwise it is pickled (protocol 4) into that file.

    If pickling fails, the file is rewound and the data is written as
    UTF-8 encoded JSON instead; if that fails as well, the JSON is printed
    to the console. In every failure case the triggering exception is
    re-raised afterwards so the caller still learns that the regular
    output did not succeed.

    Args:
        users_list: The per-user feature records to output.

    Raises:
        Exception: Whatever ``pickle.dump`` (or the JSON fallback) raised.
    """
    if io.get('output_file') == 'stdout':
        logline('Outputting to stdout')
        sys.stdout.write(json.dumps(users_list))
        return

    logline('Outputting data to file', io.get('output_file'))
    # The context manager guarantees the handle is closed on every path.
    with open(io.get('output_file'), 'wb') as output:
        try:
            pickle.dump(users_list, output, protocol=4)
        except Exception:
            try:
                logline("Using JSON instead")
                # Drop any partially written pickle data first.
                output.seek(0)
                output.truncate()
                # The file is opened in binary mode, so encode the text.
                output.write(json.dumps(users_list).encode('utf-8'))
            except Exception:
                error('Outputting to console instead')
                print(json.dumps(users_list))
                raise
            raise
    logline('Done outputting data to file')
def filter_users(f: pd.DataFrame) -> pd.DataFrame:
    """Drop anonymous-logon rows and, optionally, computer-account rows.

    A row is dropped when its ``source_user`` contains both ``ANONYMOUS``
    and ``LOGON``. When the ``users_only`` setting is truthy, rows whose
    ``source_user`` starts with ``C`` and ends with ``$`` are dropped too.

    Args:
        f: Frame with a string ``source_user`` column.

    Returns:
        The rows of ``f`` that pass the filters.
    """
    logline('Generating anonymous users filter')
    source_users = f['source_user']
    is_anonymous = (source_users.str.contains('ANONYMOUS')
                    & source_users.str.contains('LOGON'))
    keep_mask = ~is_anonymous

    if io.get('users_only'):
        debug('Skipping all computer users')
        logline('Generating computer users filter')
        is_computer = (source_users.str.startswith('C')
                       & source_users.str.endswith('$'))
        keep_mask = keep_mask & ~is_computer

    logline('Applying filters')
    return f[keep_mask]
Exemplo n.º 9
0
def _log_feature_progress(rows, row_amount, timer, users_done, max_users):
    """Log the current row and user position together with the timer's ETA."""
    logline('At row ',
            str(rows),
            '/~',
            str(row_amount),
            ' - ETA is: ' + timer.get_eta(),
            spaces_between=False)
    logline('At user ',
            users_done,
            '/~',
            max_users,
            spaces_between=False)


def gen_features(f: pd.DataFrame, row_amount: int):
    """Generate features for every user group and output the results.

    Iterates ``f`` as ``(name, group)`` pairs, either in-process (when the
    ``cpus`` setting is 1) or through a ``multiprocessing.Pool``. Each
    non-None result is collected, and whatever has been collected is always
    handed to ``output_data`` - also after a keyboard interrupt or an
    unexpected error.

    Args:
        f: The grouped user data; must support ``len`` and iteration
            yielding ``(name, group)`` pairs.
        row_amount: Number of rows before grouping, used for logging and
            for sizing the ETA timer.
    """
    users_list = list()

    logline('Calculating amount of groups...')
    users = len(f)
    logline(
        'There are', users, 'users and', row_amount,
        'rows matching your filter type',
        'no computer users or anonymous users'
        if io.get('users_only') else 'no anonymous users')
    rows = 0

    max_users = users
    if not DO_ROWS_PERCENTAGE:
        max_users = int(math.ceil(users * 0.01 * io.get('dataset_percentage')))
    logline('Max amount of users is', max_users)

    timer_rows = int(
        math.ceil(row_amount * 0.01 * io.get('dataset_percentage')))
    logline('Setting timer for', timer_rows, 'rows')
    timer = Timer(timer_rows)

    logline('Creating iterator')
    dataset_iterator = DFIterator(f)

    next_report = REPORT_SIZE

    if SKIP_MAIN:
        debug('SKIPPING MAIN, DO NOT ENABLE IN PRODUCTION')
        logline('Closing file')
        output_data([])
        return

    try:
        # Create groups of approx 1000 users big
        if io.get('cpus') == 1:
            logline('Only using a single CPU')
            logline('Starting feature generation')
            for name, group in f:
                completed_result, group_len = strip_group_length(
                    gen_features_for_user((name, group)))

                timer.add_to_current(group_len)
                rows += group_len

                if completed_result is not None:
                    users_list.append(completed_result)

                    # Same condition as the multi-CPU branch. The previous
                    # `rows > next_report == 0` was a chained comparison
                    # that also required next_report to be 0.
                    if rows > next_report or REPORT_EVERY_USER:
                        next_report = next_report + REPORT_SIZE
                        _log_feature_progress(rows, row_amount, timer,
                                              len(users_list), max_users)

                if len(users_list) >= max_users:
                    break

        else:
            logline('Using', io.get('cpus'), 'cpus')
            batches = int(math.ceil(max_users / PROCESSING_GROUP_SIZE))
            for i in range(batches):
                # presumably bounds how far the iterator advances in this
                # batch - confirm against DFIterator
                dataset_iterator.set_max((i + 1) * PROCESSING_GROUP_SIZE)
                if i == 0:
                    logline('Starting feature generation')

                with multiprocessing.Pool(io.get('cpus')) as p:
                    for completed_result in p.imap_unordered(
                            gen_features_for_user,
                            dataset_iterator,
                            chunksize=100):

                        completed_result, group_len = strip_group_length(
                            completed_result)
                        timer.add_to_current(group_len)
                        rows += group_len

                        if completed_result is not None:
                            users_list.append(completed_result)

                            if rows > next_report or REPORT_EVERY_USER:
                                next_report = next_report + REPORT_SIZE
                                _log_feature_progress(rows, row_amount, timer,
                                                      len(users_list),
                                                      max_users)
    except KeyboardInterrupt:
        logline('User cancelled execution, wrapping up')
        debug('Cancelled early at', len(users_list), 'instead of', users)
        # Guard against an empty dataset so the handler itself cannot fail.
        skipped_percentage = (100 - ((len(users_list) / users) * 100)
                              if users else 0)
        debug('You skipped a total of', users - len(users_list),
              'users, or', skipped_percentage, '%')
    except Exception:
        error('An error occurred during execution', traceback.format_exc())
        debug('Salvaging all remaining users')
    finally:
        debug('Runtime is', timer.report_total_time())

        logline("Did a total of", len(users_list), "users")
        logline('Done gathering data')
        logline('Closing file...')
        output_data(users_list)
Exemplo n.º 10
0
def extract_features(rows):
    """Generate features for every user in ``rows`` and output the results.

    Whatever has been collected is always handed to ``output_data``, also
    after a keyboard interrupt or an unexpected error.

    Args:
        rows: Mapping of user name to that user's group of rows.
    """
    users_list = list()
    users = len(rows)
    rows_amount = 0

    # Total number of rows over all users; used for the log line, the
    # timer and the progress output.
    rows_max = get_dict_inner_length(rows)

    logline(
        'There are', users, 'users and', rows_max,
        'rows matching your filter type',
        'no computer users or anonymous users'
        if io.get('users_only') else 'no anonymous users')

    logline('Setting timer for', rows_max, 'rows')
    timer = Timer(rows_max)

    # These were previously read without ever being defined in this
    # function, which failed on the first completed user.
    next_report = REPORT_SIZE
    # NOTE(review): every supplied user is processed; confirm no extra
    # percentage cap is expected here.
    max_users = users

    try:
        for name, group in rows.items():
            completed_result, group_len = strip_group_length(
                gen_features_for_user((name, group)))

            timer.add_to_current(group_len)
            rows_amount += group_len

            if completed_result is not None:
                users_list.append(completed_result)

                # Plain comparison: the former `> next_report == 0` chain
                # additionally required next_report to be 0.
                if rows_amount > next_report or REPORT_EVERY_USER:
                    next_report = next_report + REPORT_SIZE

                    logline('At row ',
                            str(rows_amount),
                            '/~',
                            str(rows_max),
                            ' - ETA is: ' + timer.get_eta(),
                            spaces_between=False)
                    logline('At user ',
                            len(users_list),
                            '/~',
                            max_users,
                            spaces_between=False)

            if len(users_list) >= max_users:
                break
    except KeyboardInterrupt:
        logline('User cancelled execution, wrapping up')
        debug('Cancelled early at', len(users_list), 'instead of', users)
        # Guard against an empty mapping so the handler itself cannot fail.
        skipped_percentage = (100 - ((len(users_list) / users) * 100)
                              if users else 0)
        debug('You skipped a total of', users - len(users_list), 'users, or',
              skipped_percentage, '%')
    except Exception:
        error('An error occurred during execution', traceback.format_exc())
        debug('Salvaging all remaining users')
    finally:
        debug('Runtime is', timer.report_total_time())

        logline("Did a total of", len(users_list), "users")
        logline('Done gathering data')
        logline('Closing file...')
        output_data(users_list)
Exemplo n.º 11
0
def filter_users(f: pd.DataFrame) -> pd.DataFrame:
    """Drop anonymous-logon rows and, optionally, computer-account rows.

    A row is dropped when its ``source_user`` contains both ``ANONYMOUS``
    and ``LOGON``. When the ``users_only`` setting is truthy, rows whose
    ``source_user`` starts with ``C`` and contains a literal ``$`` are
    dropped too. The number of rows removed by each filter is logged.

    Args:
        f: Frame with a string ``source_user`` column.

    Returns:
        The rows of ``f`` that pass the filters.
    """
    logline('Generating anonymous users filter')
    anonymous_users_filter = ~(f['source_user'].str.contains('ANONYMOUS')
                               & f['source_user'].str.contains('LOGON'))

    if io.get('users_only'):
        debug('Skipping all computer users')
        logline('Generating computer users filter')
        # regex=False: as a regular expression '$' is the end-of-string
        # anchor and matches every value, which reduced this filter to
        # "starts with C".
        computer_users_filter = ~(
            f['source_user'].str.startswith('C')
            & f['source_user'].str.contains('$', regex=False))

        logline('Filtering out',
                int((~computer_users_filter).sum()),
                'computer users')
        full_filter = anonymous_users_filter & computer_users_filter
    else:
        full_filter = anonymous_users_filter
    # Summing the inverted boolean mask counts the rejected rows without a
    # Python-level loop over every element.
    logline('Filtering out',
            int((~anonymous_users_filter).sum()),
            'anonymous users')
    logline('Filtering out a total of',
            int((~full_filter).sum()), 'rows')

    logline('Applying filters')
    return f[full_filter]
Exemplo n.º 12
0
def group_pd_file(f: pd.DataFrame) -> pd.DataFrame:
    """Group the rows of ``f`` via ``group_df``, logging before and after.

    NOTE(review): callers iterate the result as ``(name, group)`` pairs,
    so the returned object is presumably a grouped view rather than a
    plain DataFrame - confirm the annotation.
    """
    logline('Grouping users in file')
    users_grouped = group_df(f)
    logline('Done grouping users')

    return users_grouped
def main():
    """Collect the dataset rows behind each reported anomaly and output them.

    Steps:
      1. Load, filter and group the dataset file.
      2. If a state file is configured, poll it once a minute until its
         value differs from the value seen at start.
      3. Read the anomalies from the input file and, per user that has
         anomalies, gather the matching rows and metadata.
      4. Write the result as JSON to stdout (via the logger) or to the
         output file, then optionally remove the input file.

    Does nothing when ``io.run`` is falsy.
    """
    if not io.run:
        return

    state_file = io.get('state_file')
    input_file = io.get('input_file')
    output_file = io.get('output_file')
    dataset_file = io.get('dataset_file')

    logline('Loading dataset file...')
    f = pd.read_hdf(dataset_file,
                    get_dataset_name(),
                    start=0,
                    stop=calc_rows_amount())
    logline('Filtering users')
    f = filter_users(f)
    logline('Grouping users')
    f = group_df(f)

    if state_file is not None:
        initial_state = get_state(state_file)
        logline('Waiting for state to reach different value, currently at ' +
                str(initial_state) + '...')
        while get_state(state_file) == initial_state:
            time.sleep(60)

        logline('State file has switched to ' + str(get_state(state_file)) +
                ', continuing execution')

    logline('Loading anomalies')
    anomalies = read_anomalies(input_file)

    anomaly_rows_list = dict()

    users = len(f)
    max_users = users
    # NOTE(review): gen_features applies the percentage when
    # DO_ROWS_PERCENTAGE is *false*; confirm which polarity is intended.
    if DO_ROWS_PERCENTAGE:
        max_users = math.ceil(users * 0.01 * io.get('dataset_percentage'))

    timer = Timer(math.ceil(users * 0.01 * io.get('dataset_percentage')))

    for _, group in f:
        user_name = group.iloc[0].get('source_user').split('@')[0]

        anomaly_collection = anomalies.get(user_name)
        if anomaly_collection is not None:
            # Collect the rows and metadata of every anomaly of this user
            anomaly_rows_list[user_name] = [{
                "start":
                anomaly["start"],
                "end":
                anomaly["end"],
                "lines":
                listify_df(group.iloc[anomaly["start"]:anomaly["end"]]),
                "final_features":
                translate_feature_arr(anomaly["final_row_features"]),
                "predicted":
                anomaly["predicted"],
                "actual":
                anomaly["actual"],
                "loss":
                anomaly["loss"]
            } for anomaly in anomaly_collection]

            timer.add_to_current(1)

            # Report only right after the counter advanced. Checked outside
            # this branch, the ETA line was logged for every anomaly-free
            # user while the counter sat at 0 or a multiple of REPORT_SIZE.
            if timer.current % REPORT_SIZE == 0:
                logline('ETA is ' + timer.get_eta())

        if timer.current >= max_users:
            break

    debug('Runtime is', timer.report_total_time())
    logline('Generating concatenated results')
    if output_file == 'stdout':
        logline("Outputting results to stdout\n\n\n")
        logline('Final value is', anomaly_rows_list)
        logline(json.dumps(anomaly_rows_list))
    else:
        logline('Outputting results to', output_file)
        with open(output_file, 'w') as out_file:
            out_file.write(json.dumps(anomaly_rows_list))
            logline('Output results to', output_file)

    if REMOVE_INPUT_FILE:
        os.remove(input_file)
        logline('Removed encoded file')
    else:
        logline('Not Removing encoded file')

    logline('Done, closing files and stuff')
        if timer.current >= max_users:
            break

    debug('Runtime is', timer.report_total_time())
    logline('Generating concatenated results')
    if output_file == 'stdout':
        logline("Outputting results to stdout\n\n\n")
        logline('Final value is', anomaly_rows_list)
        logline(json.dumps(anomaly_rows_list))
    else:
        logline('Outputting results to', output_file)
        with open(output_file, 'w') as out_file:
            out_file.write(json.dumps(anomaly_rows_list))
            logline('Output results to', output_file)

    if REMOVE_INPUT_FILE:
        os.remove(input_file)
        logline('Removed encoded file')
    else:
        logline('Not Removing encoded file')

    logline('Done, closing files and stuff')


# Script entry point: run main() and log how long the whole run took.
if __name__ == '__main__':
    start_time = time.time()
    main()
    # Elapsed wall-clock seconds are formatted by the Timer helpers.
    logline('Total runtime is',
            Timer.stringify_time(Timer.format_time(time.time() - start_time)))
Exemplo n.º 15
0
def main():
    """Run pending experiments one at a time until none are left.

    Exits with status 2 when the ``experiments_file`` setting is missing
    and with status 130 when the run is interrupted by the user.
    """
    experiments_path = io.get('experiments_file')
    if experiments_path is None:
        logline('Experiment file is not specified, please do so')
        sys.exit(2)

    logline('Starting')

    try:
        logline('Starting jobs')
        # do_first_job reports whether it found and ran a pending job.
        job_ran = do_first_job(experiments_path)
        while job_ran:
            logline('Did job')
            job_ran = do_first_job(experiments_path)
        logline('Completed successfully!')
    except KeyboardInterrupt:
        logline('Cancelling run due to user interrupt')
        sys.exit(130)

    logline('Done')
Exemplo n.º 16
0
def ensure_trailing_slash(folder_name: str) -> str:
    """Returns the given string, with a slash appended if it lacks one"""
    suffix = '' if folder_name.endswith('/') else '/'
    return folder_name + suffix


def ensure_folder(folder_name: str):
    """Makes sure a folder exists, creating it (and its parents) if needed

    Raises:
        FileExistsError: If the path exists but is not a directory.
    """
    # exist_ok avoids the race between checking for the folder and
    # creating it when several processes start at the same time.
    os.makedirs(folder_name, exist_ok=True)


def gen_folders():
    """Creates the configured root folder plus its logs/ and plots/ folders"""
    root_folder = ensure_trailing_slash(io.get('folder'))
    # The empty entry stands for the root folder itself, created first.
    for sub_folder in ('', 'logs/', 'plots/'):
        ensure_folder(root_folder + sub_folder)


def main():
    """Generates the folder structure by delegating to gen_folders"""
    gen_folders()


# Script entry point: generate the folders, logging before and after.
if __name__ == "__main__":
    logline('Starting folder generation')
    main()
    logline('Done generating folders')