예제 #1
0
    def select_k_best(self, feature, k=5):
        """
        Score every analyzed column against one target column and chart the
        k highest-scoring ones.

        :param feature: str
            Name of feature (column name) to predict
        :param k: int
            Number of features to select
        :return: void
            Shows a bar chart with the k best-scoring features
        """

        print("Selecting k best features of data file", self.dataset_file)
        headers, raw_data = self.parse_csv()

        # Work on a copy so the shared header list is left untouched; the
        # target column must not appear among the inputs.
        candidate_names = self.HEADERS_TO_ANALYZE.copy()
        candidate_names.remove(feature)

        inputs = m_utils.get_subset_of_matrix(candidate_names, headers,
                                              raw_data)
        target = m_utils.get_subset_of_matrix([feature], headers, raw_data)

        selector = feature_selection.SelectKBest(
            feature_selection.f_regression, k=k)
        selector.fit(inputs, target)

        # argsort is ascending: keep the last k positions, then flip them so
        # the highest score comes first.
        scores = selector.scores_
        best_positions = np.array(scores).argsort()[-k:][::-1]

        best_names = []
        best_scores = []
        for position in best_positions:
            best_names.append(candidate_names[position])
            best_scores.append(scores[position])

        create_symlog_bar_chart(
            f"Most {k} correlated features with {feature}",
            best_names,
            best_scores,
            "score",
        )
        plt.show()
예제 #2
0
    def predict_feature(self, feature):
        """
        Fit a linear regression of one column on all other analyzed columns
        and chart the fitted coefficient of each input column.

        :param feature: str
            Name of feature (column name) to predict
        :return: void
            Shows a bar chart with one coefficient per input feature
        """

        print("Predicting ", feature, "with data from file", self.dataset_file)
        headers, raw_data = self.parse_csv()

        # Copy first so the shared header list is not modified; the target
        # column must not appear among the inputs.
        predictor_names = self.HEADERS_TO_ANALYZE.copy()
        predictor_names.remove(feature)

        inputs = m_utils.get_subset_of_matrix(predictor_names, headers,
                                              raw_data)
        target = m_utils.get_subset_of_matrix([feature], headers, raw_data)

        model = linear_model.LinearRegression()
        model.fit(inputs, target)

        # Row 0 of coef_ is read here, i.e. the coefficients of the single
        # output column; map each input name to its coefficient.
        coefficients = {
            name: model.coef_[0][position]
            for position, name in enumerate(predictor_names)
        }

        create_symlog_bar_chart(
            "Linear fit of " + feature,
            list(coefficients),
            coefficients.values(),
            "Coefficient",
        )
        plt.show()
예제 #3
0
    def cluster_3d_plot(self, labels, n_clusters=6):
        """
        Run k-means on all analyzed columns and show the result as a 3D
        scatter plot along three chosen columns.

        :param labels: [] of str (len = 3)
            Features to cluster data. Each item must be in the csv data file
            and in HEADERS_TO_ANALYZE. Each label is one of x, y, z axis
        :param n_clusters: int
            Number of clusters
        :return: void
            Plots 3D chart with clusters based on selected features
        :raises IndexError: if fewer than three labels are given
        :raises ValueError: if a label is not in HEADERS_TO_ANALYZE
        """

        print("Clustering file", self.dataset_file)
        headers, raw_data = self.parse_csv()  # get columns names and raw data
        x_data = m_utils.get_subset_of_matrix(self.HEADERS_TO_ANALYZE, headers,
                                              raw_data)  # input matrix
        kmeans = cluster.KMeans(n_clusters=n_clusters, random_state=0).fit(
            x_data)

        # Resolve the column position of each axis label once; the same
        # positions address both the data matrix and the centroid matrix.
        x_col, y_col, z_col = [
            self.HEADERS_TO_ANALYZE.index(labels[axis]) for axis in range(3)
        ]

        fig = plt.figure(figsize=(4, 3))  # create 3D plot
        ax = fig.add_subplot(111, projection="3d")
        ax.scatter(
            x_data[:, x_col],
            x_data[:, y_col],
            x_data[:, z_col],
            # The np.float alias was removed from NumPy (1.24); the builtin
            # float is the exact equivalent dtype.
            c=kmeans.labels_.astype(float)
        )  # plot 3D data points, colored by cluster

        # One row per cluster; pick the coordinates of the three plotted axes.
        centroids = kmeans.cluster_centers_
        ax.scatter(
            centroids[:, x_col],
            centroids[:, y_col],
            centroids[:, z_col],
            marker='o',
            s=800,
            linewidth=5,
            color='w'
        )  # plot centroids

        ax.set_xlabel(labels[0])  # set labels
        ax.set_ylabel(labels[1])
        ax.set_zlabel(labels[2])

        plt.title(str(n_clusters) + "-clustering data")
        plt.show()
예제 #4
0
    def cluster_analyze(self, n_clusters=6):
        """
        Cluster the rows of the data file with k-means and chart a few
        selected columns next to the cluster assigned to each row.

        :param n_clusters: int
            Number of clusters
        :return: void
            Computes cluster analysis: each row of the input matrix holds the
            values logged for one day, and days are grouped into categories
            (e.g highly-active, active ...) based on those values.
            Prints the cluster of each row and shows a multiple bar chart
            with one group of bars per day.
        """

        print("Clustering file", self.dataset_file)
        headers, raw_data = self.parse_csv()
        inputs = m_utils.get_subset_of_matrix(self.HEADERS_TO_ANALYZE, headers,
                                              raw_data)
        model = cluster.KMeans(n_clusters=n_clusters, random_state=0)
        kmeans = model.fit(inputs)
        print("Clusters", kmeans.labels_)

        # Columns shown in the chart, one bar series each.
        series_names = [
            "SUMMARY:kcal_count",
            "STEPS:distance",
            "SLEEP:deep_sleep_time",
            "ACTIVITIES:distance",
        ]
        series_values = []
        for name in series_names:
            series_values.append(
                [float(row[headers.index(name)]) for row in raw_data]
            )

        # The cluster assignment is plotted as one more series.
        series_names.append("cluster")
        series_values.append(kmeans.labels_)

        # x values: the date of each row
        days = []
        for row in raw_data:
            days.append(str(row[headers.index("date")]))

        create_multiple_bar_chart(
            "Days",
            days,
            series_values,
            series_names,
        )
        plt.show()