Exemplo n.º 1
0
def get_set_of_features():
    """Run all five feature-selection algorithms on the module-level
    ``raw_data``/``raw_data_ids`` and return the set of feature indices
    that every algorithm selected.

    Relies on module globals: ``raw_data``, ``raw_data_ids``, ``debug``,
    and the selector modules ``vt``, ``tsf``, ``rf``, ``cs``, ``ig``.
    """
    # Call order matters only for any side effects (e.g. debug output),
    # so it is kept identical to the surrounding script's convention.
    by_variance = vt.get_features(raw_data, raw_data_ids)
    by_tree = tsf.get_features(raw_data, raw_data_ids, debug=debug)
    by_recursion = rf.get_features(raw_data, raw_data_ids, debug=debug)
    by_chi_square = cs.get_features(raw_data, raw_data_ids)
    by_info_gain = ig.get_features(raw_data, raw_data_ids)

    ## Keep only the features all algorithms agreed on.
    return set(by_chi_square).intersection(by_recursion, by_variance,
                                           by_tree, by_info_gain)
Exemplo n.º 2
0
        # (fragment: this append closes a branch whose start is above the
        # visible excerpt)
        selected_features.append(temp)
    # Each enabled algorithm returns (feature_list, debug); `debug` is
    # threaded through every call and each selection is pooled into
    # `selected_features`.  NOTE(review): flag names are inconsistent
    # (`_use_recursive_algo_` has a trailing underscore, the others do
    # not) — presumably defined earlier in the file; confirm.
    if _use_recursive_algo_:
        # Recursive feature elimination for run `i`.
        temp, debug = rf.get_features(raw_data,
                                      raw_data_ids,
                                      debug=debug,
                                      run=i)
        selected_features.append(temp)
    if _use_chi_algo:
        # Chi-square feature scoring for run `i`.
        temp, debug = cs.get_features(raw_data,
                                      raw_data_ids,
                                      debug=debug,
                                      run=i)
        selected_features.append(temp)
    if _use_info_algo:
        # Information-gain feature scoring for run `i`.
        temp, debug = ig.get_features(raw_data,
                                      raw_data_ids,
                                      debug=debug,
                                      run=i)
        selected_features.append(temp)

    ## Take the intersection of the features: start from the first
    ## algorithm's picks and keep only the indices every other algorithm
    ## also selected.
    features = set(selected_features[0])
    for other_selection in selected_features[1:]:
        # BUG FIX: set.intersection() returns a NEW set and the original
        # code discarded it, so no feature was ever filtered out.
        # intersection_update() narrows `features` in place.
        features.intersection_update(other_selection)

    ## Remove the unused features from raw_data
    refined_data = pd.DataFrame(raw_data)
    refined_ids = raw_data_ids
    # Walk column positions from last to first; surviving positions are
    # looked up by name in the ORIGINAL raw_data and dropped by name.
    for col_idx in reversed(range(len(raw_data.columns))):
        if col_idx in features:
            continue
        refined_data = refined_data.drop(columns=raw_data.columns[col_idx],
                                         axis=1)
    # NOTE(review): the lines below appear to belong to a DIFFERENT,
    # data-loading loop (they tag the rows of a freshly read frame `temp`
    # with a user id) — the excerpt seems to fuse two fragments here.
    raw_data_ids.extend([ids] * len(temp.index))
    ids += 1
    # In debug mode, stop after 15 users to keep the run short.
    if debug == 1 and ids == 15:
        break
## Assemble the per-user frames into one float64 matrix.
raw_data = pd.concat(temp_list).astype(np.float64)
raw_data_ids = np.array(raw_data_ids)
print("Total number of raw rows: ", len(raw_data))
print("Total number of users: ", len(files))

## Perform feature selection with all five algorithms.
varience_threshold_features = vt.get_features(raw_data, raw_data_ids)
tree_selection_features = tsf.get_features(raw_data, raw_data_ids, debug=debug)
recusive_features = rf.get_features(raw_data, raw_data_ids, debug=debug)
chi_square_features = cs.get_features(raw_data, raw_data_ids)
information_gain_features = ig.get_features(raw_data, raw_data_ids)

## Keep only the features every algorithm selected.
features = set(chi_square_features).intersection(recusive_features,
                                                 varience_threshold_features,
                                                 tree_selection_features,
                                                 information_gain_features)

## Drop every column whose positional index did not survive the
## intersection (names resolved against the original column order,
## positions walked last-to-first).
for col_idx in reversed(range(len(raw_data.columns))):
    if col_idx in features:
        continue
    raw_data = raw_data.drop(columns=raw_data.columns[col_idx], axis=1)
print("Remaining number post intersection: ", len(raw_data.columns),
      " columns")
Exemplo n.º 4
0
def optimize(templates, templates_ids):
    """Search, per user, for the feature prefix that maximizes d-prime.

    Splits (templates, templates_ids) 70/30 into template and validation
    sets, then for every user in the validation set: removes that user's
    outliers, pools the features picked by four selection algorithms,
    ranks features by how many algorithms picked them, and scores each
    top-k feature prefix by the d-prime of Chebyshev genuine-vs-impostor
    distances.

    Returns:
        (params, templates, templates_ids) where ``params`` is a list of
        ``(user, feature_indices, inliers)`` tuples.
    """

    # log
    print("Finding optimal parameters")

    # Split the sets
    templates, validation, templates_ids, validation_ids = train_test_split(
        templates, templates_ids, test_size=0.30)
    users = np.unique(validation_ids)

    # iterate over each user
    params = []
    for user in users:

        # Declare fold variables
        selected_features = []
        performance = []

        # Remove outliers within user data
        data = validation[validation_ids == user]
        inliers = ods.remove_outliers(data)
        # Binary labels: 1 for this user's rows, 0 for everyone else.
        ids = (validation_ids == user) * 1

        # Get features: pool the picks of all four selection algorithms.
        temp = vt.get_features(validation, ids)
        selected_features.extend(temp)
        temp = tsf.get_features(validation, ids)
        selected_features.extend(temp)
        temp = rf.get_features(validation, ids)
        selected_features.extend(temp)
        temp = ig.get_features(validation, ids)
        selected_features.extend(temp)

        # Sort features by number of times selected
        # NOTE(review): len(validation) counts ROWS; if selections are
        # column indices this should presumably be the column count —
        # confirm against get_features' contract.
        feature_counts = np.zeros(len(validation))
        for selection in selected_features:
            feature_counts[selection] += 1
        sorted_index = np.argsort(-1 * feature_counts)

        # For every feature (prefix length k over the ranked features)
        for k in range(1, len(sorted_index)):
            # Declare feature scope variables
            gen_scores = []
            imp_scores = []

            # If this feature was never selected, break
            # (counts are sorted descending, so all later ones are 0 too)
            if feature_counts[sorted_index[k]] == 0:
                break

            # For each user
            for this_user in range(len(users)):
                # For each user
                for other_user in range(len(users)):
                    # if this user is the current iteration
                    if users[this_user] == user:
                        # Calculate distance
                        # NOTE(review): this_user/other_user enumerate
                        # `users`, yet are used here as ROW indices into
                        # `validation` — verify this is intentional.
                        dist = distance.chebyshev(
                            validation[this_user, sorted_index[0:k]],
                            validation[other_user, sorted_index[0:k]])

                        # other user is also the current iteration
                        if users[other_user] == user:
                            gen_scores.append(dist)
                        else:
                            imp_scores.append(dist)

            # Compute d-prime
            dp = compute_dprime(gen_scores, imp_scores)
            performance.append(dp)

        # Obtain the best subset of features
        # Fall back to a single feature when the first prefix scored best.
        best_k = np.argmax(performance)
        if best_k == 0: best_k = 1
        feats = sorted_index[0:best_k]
        params.append((user, feats, inliers))

    # Log
    print("Optimization finished")

    # Return
    return params, templates, templates_ids