from settings import *
from sklearn.cluster import KMeans
import pickletools  # project-local helper, not the stdlib pickletools (see sketch below)
import csvtools     # project-local CSV helper


def do_cluster():
    label_names = pickletools.load(labels_filename)
    X_scaled = pickletools.load(normalized_data_filename)
    
    cluster_count = 10
    
    print('Clustering...')
    # n_jobs was removed from KMeans in scikit-learn 1.0, so it is dropped here;
    # on older versions, n_jobs=4 parallelised the independent k-means runs.
    kmeans = KMeans(n_clusters=cluster_count, max_iter=1000, tol=0.0001).fit(X_scaled)
    
    create_lists(cluster_count, kmeans, label_names)

    features = save_centroids(kmeans)

    find_interesting_features(cluster_count, features, kmeans)


def save_centroids(kmeans):
    print('Saving the centroids')
    features = pickletools.load(reduced_feature_filename)
    centroids = kmeans.cluster_centers_.tolist()
    centroids.insert(0, features)  # prepend the feature names as a header row
    csvtools.save_csv_lines(centroids_filename, centroids)
    return features
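
# The snippets in this file lean on two project-local helper modules,
# pickletools and csvtools, whose sources are not shown here. Note that this
# pickletools is NOT the standard-library module of the same name (the stdlib
# pickletools has no load/save). A minimal sketch of what the wrappers
# presumably look like, assuming they just pickle whole objects by filename:

# pickletools.py (assumed implementation)
import pickle

def load(filename):
    # Unpickle and return the object stored at filename
    with open(filename, 'rb') as f:
        return pickle.load(f)

def save(filename, obj):
    # Pickle obj to filename, overwriting any existing file
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)

# csvtools.py (assumed implementation)
import csv

def save_csv_lines(filename, rows):
    # Write an iterable of rows to filename as CSV
    with open(filename, 'w', newline='') as f:
        csv.writer(f).writerows(rows)
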
from settings import *
import pickletools

# Load the header info (find_column_index_from_csv and the other CSV helpers
# are assumed to come in via settings, as in the neighbouring snippets)
urlIdx = find_column_index_from_csv(headerCsvFilename, urlColumnName)
http_method_index = find_column_index_from_csv(headerCsvFilename,
                                               http_method_column_name)
orgId_index = find_column_index_from_csv(headerCsvFilename, orgIdColumnName)
appId_index = find_column_index_from_csv(headerCsvFilename, appIdColumnName)

# Get the list of all CSV files
fileList = get_csv_filelist_from_folder(folder_with_csvs)

# Load the list of url parts
url_parts = load_simple_csv_list(url_part_filename)
url_parts.append('')  # It's helpful to keep empty parts

# Load the feature list
features = pickletools.load(feature_list_filename)

# Create a template for feature counts
blank_feature_count_row = [0] * len(features)

# Accumulate feature counts here
feature_counts = {}

# Track if these features are used
feature_used = [False] * len(features)


def create_empty_feature_count():
    # Returns a fresh list of zeroes, one slot per feature
    return blank_feature_count_row.copy()
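
# A hedged sketch of how the pieces above are presumably wired together while
# the CSV files are scanned: one count row per connection label, bumping the
# column of whichever feature was matched. increment_feature is hypothetical;
# the actual accumulation loop is not part of this snippet.
def increment_feature(label, feature_idx):
    # Fetch (or create) the count row for this label and bump one feature
    row = feature_counts.setdefault(label, create_empty_feature_count())
    row[feature_idx] += 1
    feature_used[feature_idx] = True
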
from settings import *
import pickletools
import numpy as np

# Settings
feature_min_connection_count = 1
feature_min_app_count = 1

# Load data
features = pickletools.load(feature_list_filename)
X_dict = pickletools.load(data_filename)

# Remove the labels
print('Getting labels and values')
labels = list(X_dict.keys())
X = np.array(list(X_dict.values()))

useful_features = []

for f in range(len(features)):
    counts_for_this_feature = X[:, f]
    # Indices (not values) of the connections that used this feature
    connection_idxs = [i for i, x in enumerate(counts_for_this_feature) if x > 0]
    connection_count = len(connection_idxs)
    # The leading 36 characters of a label appear to be the app id (UUID-sized)
    appIds = set(labels[connection_idx][0:36]
                 for connection_idx in connection_idxs)
    app_count = len(appIds)
    call_count = sum(counts_for_this_feature)

    # Keep only features seen on enough connections and in enough apps
    if (connection_count >= feature_min_connection_count
            and app_count >= feature_min_app_count):
        useful_features.append(features[f])

    #print('{}/{}/{} - {}'.format(connection_count, app_count, call_count, features[f]))
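
# Presumably the surviving names are then persisted for the downstream steps.
# The target filename here is an assumption (reduced_feature_filename is the
# name the clustering snippet later reads back):
print('{} of {} features kept'.format(len(useful_features), len(features)))
pickletools.save(reduced_feature_filename, useful_features)
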
from settings import *
import pickletools
import numpy as np

# Load the already-reduced data (the labels were split off in an earlier step)
X = pickletools.load(reduced_data_filename)

# Scale it: log(1 + x) tames the heavy-tailed call counts
# (preprocessing.scale would be the zero-mean/unit-variance alternative)
print('Scaling...')
X_scaled = np.log1p(X)  # equivalent to np.log(X + 1), but numerically safer

pickletools.save(normalized_data_filename, X_scaled)
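
# Why log(1 + x): the raw call counts are heavy-tailed, and the log transform
# keeps a single chatty endpoint from dominating the k-means distances.
# A quick illustration with made-up counts:
#
#   >>> np.log1p(np.array([0, 1, 10, 1000]))
#   array([0.        , 0.69314718, 2.39789527, 6.90875478])
#
# A 1000x spread between the smallest and largest non-zero counts collapses to
# roughly 10x, while zero counts stay exactly zero.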
"""
Created on Mon Sep 18 16:01:33 2017

@author: nick.green
"""
from settings import *
import pickletools
import numpy as np
import matplotlib.pyplot as plt


# Load the data file
X_dict = pickletools.load(data_filename)
X = np.array(list(X_dict.values()))


# Plot a histogram of each feature's counts across all connections
for feature_idx in range(X.shape[1]):
    plt.figure()  # fresh figure so successive histograms don't pile up
    # matplotlib 3.x removed the normed= argument; density=True replaces it
    n, bins, patches = plt.hist(X[:, feature_idx], 50, density=True,
                                facecolor='green', alpha=0.75)
    plt.savefig(r'plots\feature_hist_{}.png'.format(feature_idx))
    plt.show()


# Quick sanity check: total call count per feature column
for idx in range(X.shape[1]):
    print('{} = {}'.format(idx, sum(X[:, idx])))