Example #1
def main():
    args = parse_arguments([
        {'name': 'quasi_identifiers', 'type': str},
        {'name': 'k', 'type': int}
    ])

    dataset = load_dataset(args.dataset_path)
    quasi_identifiers = json.loads(args.quasi_identifiers)
    relevant_arrays = find_selected_arrays(dataset, quasi_identifiers)

    if len(relevant_arrays) == 0:
        notify_validation_result(
            ValidationOutcome.WARNING, "No quasi identifier was found")
    else:
        # compute k-anonymity for each array containing QIs;
        # the final k-anonymity is the lowest of them
        ks = [compute_kanonymity(a['array'], a['field_selection'])
              for a in relevant_arrays]
        k = min(ks)

        if k >= args.k:
            notify_validation_result(
                ValidationOutcome.SUCCESS,
                '%d-anonymous (>= %d)' % (k, args.k))
        else:
            notify_validation_result(
                ValidationOutcome.FAILURE,
                '%d-anonymous (< %d)' % (k, args.k))
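
# For reference, a minimal sketch of what compute_kanonymity could look like.
# This is a hypothetical implementation, not the framework's actual helper:
# records are grouped by their quasi-identifier values, and k is the size of
# the smallest such group (equivalence class).
from collections import Counter

def compute_kanonymity(records, field_selection):
    groups = Counter(
        tuple(record.get(field) for field in field_selection)
        for record in records
    )
    return min(groups.values()) if groups else 0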
Example #2
def main():
    args = parse_arguments([{'name': 'fields', 'type': str}])

    # extract arguments
    dataset = load_dataset(args.dataset_path)
    original_dataset = load_dataset(args.origin_path)
    fields = json.loads(args.fields)

    usernames = []
    iterate_and_apply(original_dataset, fields, usernames.append)

    # detect names by traversing the dataset
    matches = []
    find_names(dataset, usernames, matches)

    # notify the detected user names
    if len(matches) > 0:
        messages = ["Detected user names: "] + \
            ['  - %s' % m for m in matches[:10]]
        if len(matches) > 10:
            messages.append('  - ...')

        notify_validation_result(ValidationOutcome.FAILURE,
                                 '\n'.join(messages))
    else:
        notify_validation_result(ValidationOutcome.SUCCESS,
                                 'No user name detected')
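
# A minimal sketch of what find_names could look like (hypothetical, assuming
# usernames is a list of strings): walk the nested dataset and record every
# string value that contains one of the known user names.
def find_names(node, usernames, matches):
    if isinstance(node, dict):
        for value in node.values():
            find_names(value, usernames, matches)
    elif isinstance(node, list):
        for item in node:
            find_names(item, usernames, matches)
    elif isinstance(node, str):
        for name in usernames:
            if name and name in node:
                matches.append(name)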
Example #3
def main():
    args = parse_arguments()

    dataset = load_dataset(args.dataset_path)

    users = dataset['data']['users']

    # prepare id and name substitutions
    # id substitution = hash(id)
    id_substitutions = {
        user['_id']: {
            'substitution': sha256_hash(user['_id']),
            'regex': user['_id']
        }
        for user in users
    }

    name_substitutions = {
        user['name']: {
            # name substitution = id substitution
            'substitution': id_substitutions[user['_id']]['substitution'],
            'regex': get_regex(user['name'])
        }
        for user in users
    }

    substitutions = [v['substitution'] for v in id_substitutions.values()]

    # replace the user ids in actions
    for action in dataset['data']['actions']:
        if 'user' in action:
            userid = action['user']
            if userid not in id_substitutions:
                id_substitutions[userid] = {
                    'substitution': sha256_hash(userid),
                    'regex': userid
                }

            action['user'] = id_substitutions[userid]['substitution']

    # replace the user ids in appInstanceResources
    for app_instance_res in dataset['data']['appInstanceResources']:
        if 'user' in app_instance_res:
            userid = app_instance_res['user']
            if userid not in id_substitutions:
                id_substitutions[userid] = {
                    'substitution': sha256_hash(userid),
                    'regex': userid
                }

            app_instance_res['user'] = id_substitutions[userid]['substitution']

    # generically search and replace occurrences of usernames and ids
    find_and_replace(dataset, name_substitutions)
    find_and_replace(dataset, id_substitutions)

    # the users fields are now just the hashes
    dataset['data']['users'] = substitutions

    save_dataset(dataset, args.output_path)
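
# A minimal sketch of what find_and_replace could look like (hypothetical,
# assuming each substitution entry's 'regex' is a valid regular expression,
# as prepared above): recursively rewrite every string value in the nested
# dataset, replacing regex matches with their substitution.
import re

def find_and_replace(node, substitutions):
    if isinstance(node, dict):
        items = node.items()
    elif isinstance(node, list):
        items = enumerate(node)
    else:
        return
    for key, value in items:
        if isinstance(value, str):
            for entry in substitutions.values():
                value = re.sub(entry['regex'], entry['substitution'], value)
            node[key] = value
        else:
            find_and_replace(value, substitutions)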
Example #4
def main():
    args = parse_arguments([
        {'name': 'quasi_identifiers', 'type': str},
        {'name': 'sensitive_attributes', 'type': str},
        {'name': 'l', 'type': int}
    ])

    dataset = load_dataset(args.dataset_path)
    quasi_identifiers = json.loads(args.quasi_identifiers)
    sensitive_attributes = json.loads(args.sensitive_attributes)
    relevant_arrays = find_selected_arrays(
        dataset, quasi_identifiers, sensitive_attributes)

    if len(relevant_arrays) == 0:
        notify_validation_result(
            ValidationOutcome.WARNING, "No sensitive attribute was found")
    else:
        # compute l-diversity for each array with SAs;
        # the final l-diversity is the smallest of them
        ls = [compute_ldiversity(array['array'],
                                 array['quasi_identifiers'],
                                 array['sensitive_attributes'])
              for array in relevant_arrays]
        l = min(ls)

        if l >= args.l:
            notify_validation_result(
                ValidationOutcome.SUCCESS,
                '%d-diversified (>= %d)' % (l, args.l))
        else:
            notify_validation_result(
                ValidationOutcome.FAILURE,
                '%d-diversified (< %d)' % (l, args.l))
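
# A minimal sketch of what compute_ldiversity could look like (hypothetical):
# for each equivalence class (records sharing the same quasi-identifier
# values), count the distinct combinations of sensitive values; l is the
# smallest such count.
from collections import defaultdict

def compute_ldiversity(records, quasi_identifiers, sensitive_attributes):
    groups = defaultdict(set)
    for record in records:
        key = tuple(record.get(field) for field in quasi_identifiers)
        groups[key].add(
            tuple(record.get(field) for field in sensitive_attributes))
    return min(len(values) for values in groups.values()) if groups else 0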
Example #5
def main():
    args = parse_arguments()

    dataset = load_dataset(args.dataset_path)

    # write your dataset changes here

    save_dataset(dataset, args.output_path)
Example #6
def main():
    args = parse_arguments([{'name': 'positive_integer', 'type': int}])

    dataset = load_dataset(args.dataset_path)

    if args.positive_integer < 0:
        raise Exception('positive_integer must not be negative')

    save_dataset(dataset, args.output_path)
Example #7
def main():
    args = parse_arguments([{'name': 'fields', 'type': str}])

    dataset = load_dataset(args.dataset_path)
    fields = json.loads(args.fields)

    iterate_and_suppress(dataset, fields)

    save_dataset(dataset, args.output_path)
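
# A minimal sketch of what iterate_and_suppress could look like (hypothetical,
# assuming fields is a flat list of field names): walk the nested dataset and
# delete every occurrence of the listed fields.
def iterate_and_suppress(node, fields):
    if isinstance(node, dict):
        for field in fields:
            node.pop(field, None)
        for value in node.values():
            iterate_and_suppress(value, fields)
    elif isinstance(node, list):
        for item in node:
            iterate_and_suppress(item, fields)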
Example #8
def main():
    args = parse_arguments([{'name': 'password', 'type': str}])

    dataset = load_dataset(args.dataset_path)

    if args.password != 'PASSWORD':
        raise Exception('Invalid password')

    save_dataset(dataset, args.output_path)
Example #9
def main():
    args = parse_arguments([{'name': 'fields', 'type': str}])

    dataset = load_dataset(args.dataset_path)
    fields = json.loads(args.fields)

    relevant_arrays = find_selected_arrays(dataset, fields)
    for array in relevant_arrays:
        shuffle_attributes(array['array'], array['field_selection'])

    save_dataset(dataset, args.output_path)
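
# A minimal sketch of what shuffle_attributes could look like (hypothetical):
# shuffle each selected field independently across the records, breaking the
# link between attribute values and individual rows while preserving the
# overall value distribution.
import random

def shuffle_attributes(records, field_selection):
    for field in field_selection:
        values = [record[field] for record in records if field in record]
        random.shuffle(values)
        iterator = iter(values)
        for record in records:
            if field in record:
                record[field] = next(iterator)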
Example #10
def main():
    args = parse_arguments()

    dataset = load_dataset(args.dataset_path)

    for action in dataset['data']['actions']:
        if 'user' in action:
            action['user'] = sha256_hash(action['user'])

    for appInstanceResource in dataset['data']['appInstanceResources']:
        if 'user' in appInstanceResource:
            appInstanceResource['user'] = sha256_hash(appInstanceResource['user'])

    save_dataset(dataset, args.output_path)
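
# A minimal sketch of what sha256_hash could look like (hypothetical): the
# hex-encoded SHA-256 digest of the value. Note that unsalted hashes of
# guessable identifiers remain linkable across datasets.
import hashlib

def sha256_hash(value):
    return hashlib.sha256(str(value).encode('utf-8')).hexdigest()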
Example #11
def main():
    # Prepares the parameters for the algorithm (dataset_path and output_path
    # come by default). You can then use them with args.parameter_name.
    # Avoid editing the parameters here; use the dedicated utility instead,
    # and the code will change accordingly.
    args = parse_arguments()

    # load the json dataset, available as a python dictionary
    dataset = load_dataset(args.dataset_path)

    # write your validation code here

    # notify the validation outcome
    # outcome should be one of ValidationOutcome.SUCCESS,
    # ValidationOutcome.WARNING or ValidationOutcome.FAILURE
    # write the information to display in info
    outcome = ValidationOutcome.SUCCESS
    info = ""

    notify_validation_result(outcome, info)
Example #12
def main():
    args = parse_arguments()

    dataset = load_dataset(args.dataset_path)

    # hash the 'user' id for every action
    for action in dataset['data']['actions']:
        if 'user' in action:
            action['user'] = sha256_hash(action['user'])

    # hash the 'user' id for every appInstanceResource
    for appInstanceResource in dataset['data']['appInstanceResources']:
        if 'user' in appInstanceResource:
            appInstanceResource['user'] = sha256_hash(
                appInstanceResource['user'])

    # hash the user '_id' for every user and remove every other attribute
    new_users = []
    for user in dataset['data']['users']:
        if '_id' in user:
            new_users.append({'_id': sha256_hash(user['_id'])})
    dataset['data']['users'] = new_users

    save_dataset(dataset, args.output_path)
Example #13
def main():
    args = parse_arguments()

    dataset = load_dataset(args.dataset_path)
    has_user_name = False
    has_AP_settings = False
    has_API_data = False

    # verify in 'actions' if 'data' and 'geolocation' are present
    has_action_data = False
    has_action_geolocation = False
    actions = dataset['data']['actions']
    for action in actions:
        if 'data' in action:
            data = action['data']
            if data is not None and data != '' and data != {}:
                has_action_data = True
        if 'geolocation' in action:
            geolocation = action['geolocation']
            if geolocation is not None and geolocation != '' and geolocation != {}:
                has_action_geolocation = True

    # verify in 'users' if 'name' is present
    users = dataset['data']['users']
    for user in users:
        if 'name' in user:
            name = user['name']
            if name is not None and name != '':
                has_user_name = True

    # verify in 'appInstances' if 'settings' is present
    appInstances = dataset['data']['appInstances']
    for ar in appInstances:
        if 'settings' in ar:
            settings = ar['settings']
            if settings is not None and settings != '' and settings != {}:
                has_AP_settings = True

    # verify in 'appInstanceResources' if 'data' is present
    appInstanceResources = dataset['data']['appInstanceResources']
    for air in appInstanceResources:
        if 'data' in air:
            data = air['data']
            if data is not None and data != '' and data != {}:
                has_API_data = True

    # issue a warning if any of these potentially dangerous attributes
    # are present
    potentially_dangerous = (has_action_data or has_action_geolocation
                             or has_user_name or has_AP_settings
                             or has_API_data)
    if potentially_dangerous:
        messages = ['Potentially dangerous attributes: ']
        if has_action_data:
            messages.append('- actions > data')
        if has_action_geolocation:
            messages.append('- actions > geolocation')
        if has_user_name:
            messages.append('- users > name')
        if has_AP_settings:
            messages.append('- appInstances > settings')
        if has_API_data:
            messages.append('- appInstanceResources > data')

        notify_validation_result(ValidationOutcome.WARNING,
                                 '\n'.join(messages))
    else:
        notify_validation_result(ValidationOutcome.SUCCESS,
                                 'No potentially dangerous attributes')
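
# The repeated "present and non-empty" test above could be factored into a
# small helper; a sketch (not part of the original snippet):
def is_non_empty(record, field):
    value = record.get(field)
    return value is not None and value != '' and value != {}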
Example #14
def main():
    args = parse_arguments([{'name': 'k', 'type': int}])

    dataset = load_dataset(args.dataset_path)

    actions = dataset['data']['actions']

    # get the geolocations for each user
    geolocations_per_user = {}
    for action in actions:
        if 'geolocation' in action:
            user = action['user']
            if user not in geolocations_per_user:
                geolocations_per_user[user] = []

            geoloc = action['geolocation']
            country = geoloc.get('country')
            region = geoloc.get('region')
            city = geoloc.get('city')
            geolocations_per_user[user].append((country, region, city))

    # get most represented geolocation for each user
    geolocation_mapping = {}
    for user, geolocations in geolocations_per_user.items():
        most_common = Counter(geolocations).most_common(1)
        if len(most_common) == 1:
            [((country, region, city), _)] = most_common
            geolocation_mapping[user] = {
                'country': country,
                'region': region,
                'city': city,
            }

    # group users by country then region then city
    grouped = {}
    for user, geo in geolocation_mapping.items():
        country = geo['country']
        region = geo['region']
        city = geo['city']

        if country not in grouped:
            grouped[country] = {'count': 0, 'regions': {}}
        country_group = grouped[country]
        country_group['count'] += 1
        country_regions = country_group['regions']

        if region not in country_regions:
            country_regions[region] = {'count': 0, 'cities': {}}
        region_group = country_regions[region]
        region_group['count'] += 1
        region_cities = region_group['cities']

        if city not in region_cities:
            region_cities[city] = {'count': 0, 'users': []}
        city_group = region_cities[city]
        city_group['count'] += 1
        city_group['users'].append(user)

    # blank out each value that is not represented at least k times;
    # the region and country checks run inside the city loop, but since
    # every user of a region appears under one of its cities, users of an
    # under-represented region or country are all still covered
    for country, country_group in grouped.items():
        country_user_count = country_group.get('count', 0)
        regions = country_group.get('regions', {})
        for region, region_group in regions.items():
            region_user_count = region_group.get('count', 0)
            cities = region_group.get('cities', {})
            for city, city_group in cities.items():
                city_user_count = city_group.get('count', 0)
                users = city_group.get('users', [])
                if city_user_count < args.k:
                    for user in users:
                        geolocation_mapping[user]['city'] = ''
                if region_user_count < args.k:
                    for user in users:
                        geolocation_mapping[user]['region'] = ''
                if country_user_count < args.k:
                    for user in users:
                        geolocation_mapping[user]['country'] = ''

    # update with new values
    for action in actions:
        if 'geolocation' in action:
            user = action['user']
            action['geolocation'] = geolocation_mapping[user]

    save_dataset(dataset, args.output_path)