import math

import numpy as np
import pandas as pd
import sklearn.cluster
import sklearn.decomposition
import sklearn.ensemble
import sklearn.metrics
import sklearn.mixture
import sklearn.model_selection

import plotting  # local helper module for all plots (sketched further down)


def data_distributions(data, customer_ids, category_names):
    """
        Method to find and plot distributions across the dataset:
            1. Customers annual spendings
            2. Customers spendings
            3. Category transactions
            4. category average spendings
        param:
            1. data - pandas DataFrame of mapped data
            2. customer_ids - numpy array (10000, ) with all customer ids
            3. category_names - numpy array (82, ) with names of categories
    """
    annual_spendings, spendings, category_spendings = [], [], []
    category_numbers = np.zeros((len(category_names), ))

    # Collect per-customer totals and individual transaction amounts
    for customer_id in customer_ids:
        customer_data = data[data['customer_id'] == customer_id]
        annual_spendings.append(pd.to_numeric(customer_data['amount']).sum())
        spendings.extend(pd.to_numeric(customer_data['amount']).tolist())

    # Compute transaction counts and average spendings per category
    for i in range(len(category_names)):
        category_data = data[data['category_name'] == category_names[i]]
        category_numbers[i] = len(category_data.index)
        category_spendings.append(
            pd.to_numeric(category_data['amount']).mean())

    # Plot customers distributions
    plotting.hist_plotting(np.array(annual_spendings),
                           ["Annual spendings", "Frequency"],
                           "Customers annual spendings distribution")
    plotting.hist_plotting(np.array(spendings), ["Spendings", "Frequency"],
                           "Customers spendings distribution")

    # Sort and plot category distributions
    sorted_indexes = np.argsort(category_numbers)

    plotting.bar_plotting(
        [category_numbers[sorted_indexes], category_names[sorted_indexes]],
        ["Category", "Number of transactions"], "Category transactions",
        "data distributions")

    category_spendings = np.array(category_spendings)
    sorted_indexes = np.argsort(category_spendings)

    plotting.bar_plotting(
        [category_spendings[sorted_indexes], category_names[sorted_indexes]],
        ["Category", "Average spendings"], "Category average spendings",
        "data distributions")

    return
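

# The `plotting` module used throughout is not part of this listing. Below is
# a minimal, hypothetical sketch of what plotting.py might contain, with
# signatures inferred from the call sites in these examples; the real module
# presumably adds styling and the pie_plotting/graph_exporting helpers.
import os

import matplotlib.pyplot as plt


def hist_plotting(values, axis_labels, title, folder="plots"):
    """Save a histogram of `values`, labelled with [x, y] axis names."""
    plt.figure()
    plt.hist(values, bins=50)
    plt.xlabel(axis_labels[0])
    plt.ylabel(axis_labels[1])
    plt.title(title)
    os.makedirs(folder, exist_ok=True)
    plt.savefig(os.path.join(folder, title + ".png"), bbox_inches="tight")
    plt.close()


def bar_plotting(data, axis_labels, title, folder="plots", color='b'):
    """Save a bar chart; accepts [values, names] or a pandas Series."""
    if isinstance(data, (list, tuple)):
        values, names = data
    else:  # pandas Series (or single-column DataFrame)
        values, names = data.values.ravel(), data.index
    plt.figure(figsize=(12, 6))
    plt.bar(range(len(values)), values, color=color)
    plt.xticks(range(len(values)), names, rotation=90)
    plt.xlabel(axis_labels[0])
    plt.ylabel(axis_labels[1])
    plt.title(title)
    os.makedirs(folder, exist_ok=True)
    plt.savefig(os.path.join(folder, title + ".png"), bbox_inches="tight")
    plt.close()


def data_plotting(data, title, folder, color=None):
    """Scatter the first two columns of `data`, coloured by cluster labels.

    The real helper most likely embeds the 82-dimensional data into 2-D
    first (e.g. PCA or t-SNE); plotting two columns keeps the sketch short.
    """
    plt.figure()
    plt.scatter(data.values[:, 0], data.values[:, 1], c=color, s=5)
    plt.title(title)
    os.makedirs(folder, exist_ok=True)
    plt.savefig(os.path.join(folder, title + ".png"), bbox_inches="tight")
    plt.close()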

# Example 2

def optics_clustering(data, name, auxiliary_data=None):
    """
        Function for Oredering points to identify the clustering structure
            (OPTICS) algorithm.
        param:
            1. data - pandas DataFrame (10000, 82), where values are mean
                spendings of customers for every category
            2. name - string name of using data
            3. auxiliary_data - pandas DataFrame of data that need to be
                    clustered instead of regular DataFrame
    """
    # Decide which data to use for clustering
    learning_data = data if auxiliary_data is None else auxiliary_data

    # Fit the OPTICS model; min_samples is set to ln(n) as a heuristic
    optics_model = \
        sklearn.cluster.OPTICS(
            min_samples=round(math.log(len(learning_data.index))),
            n_jobs=-1).fit(learning_data)
    # Extract clusters from the OPTICS output with DBSCAN, choosing the
    # distance threshold (eps) via the DMDBSCAN algorithm
    labels = sklearn.cluster.cluster_optics_dbscan(
        reachability=optics_model.reachability_,
        core_distances=optics_model.core_distances_,
        ordering=optics_model.ordering_,
        eps=dmdbscan_algorithm(learning_data,
                               "clustering/" + name + "/OPTICS"))

    # Plot clusters over auxiliary data
    if auxiliary_data is not None:
        plotting.data_plotting(auxiliary_data,
                               "Auxiliary data clustering",
                               "clustering/" + name + "/OPTICS",
                               color=labels)

    # Plot clusters over data
    plotting.data_plotting(data,
                           "OPTICS clustering",
                           "clustering/" + name + "/OPTICS",
                           color=labels)

    data['cluster'] = labels

    # Plot every cluster as a bar chart of mean spendings (label -1 marks
    # noise, so iterate over the non-negative labels only)
    for i in np.unique(labels[labels >= 0]):
        cluster_data = data[data['cluster'] == i].drop(columns='cluster')
        mean_cluster_data = cluster_data.mean(axis=0).sort_values()

        plotting.bar_plotting([
            np.array(mean_cluster_data.values),
            np.array(mean_cluster_data.index)
        ], ["Category", "Mean spendings"], "Cluster " + str(i) + " (" +
                              str(len(cluster_data.index)) + " customers)",
                              "clustering/" + name + "/OPTICS/clusters")

    return
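

# The eps extracted above can be sanity-checked with the standard OPTICS
# reachability plot: points in cluster order, where valleys correspond to
# clusters. A small sketch (the helper name is hypothetical, not part of
# the original pipeline):
def optics_reachability_plot(optics_model):
    """Plot reachability distances in cluster order; valleys are clusters."""
    import matplotlib.pyplot as plt
    reachability = optics_model.reachability_[optics_model.ordering_]
    plt.figure(figsize=(10, 4))
    plt.plot(reachability)
    plt.xlabel("Points (cluster order)")
    plt.ylabel("Reachability distance")
    plt.title("OPTICS reachability plot")
    plt.show()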

# Example 3

def kmeans_clustering(data,
                      name,
                      clustering_name,
                      clusters_number=8,
                      auxiliary_data=None):
    """
        Function for k-means algorithm.
        param:
            1. data - pandas DataFrame (10000, 82), where values are mean
                spendings of customers for every category
            2. name - string name of using data
            3. clustering_name - string more precise name of clustering
            4. clusters_number - int number of clusters (8 as default)
            5. auxiliary_data - pandas DataFrame of data that need to be
                clustered instead of regular DataFrame
    """
    # Decide which data to use for clustering
    learning_data = data if auxiliary_data is None else auxiliary_data

    # Fit the k-means model (n_jobs was removed from sklearn's KMeans in
    # scikit-learn 1.0, so it is not passed here)
    kmeans_model = \
        sklearn.cluster.KMeans(n_clusters=clusters_number).fit(learning_data)

    # Plot clusters over auxiliary data
    if auxiliary_data is not None:
        plotting.data_plotting(auxiliary_data,
                               "Auxiliary data clustering",
                               "clustering/" + name + "/" + clustering_name,
                               color=kmeans_model.labels_)

    # Plot clusters over data
    plotting.data_plotting(data,
                           clustering_name,
                           "clustering/" + name + "/" + clustering_name,
                           color=kmeans_model.labels_)

    data['cluster'] = kmeans_model.labels_

    # Plot every cluster as a bar chart of mean spendings
    for i in range(len(np.unique(kmeans_model.labels_))):
        cluster_data = data[data['cluster'] == i].drop(columns='cluster')
        mean_cluster_data = cluster_data.mean(axis=0).sort_values()

        plotting.bar_plotting([
            np.array(mean_cluster_data.values),
            np.array(mean_cluster_data.index)
        ], ["Category", "Mean spendings"], "Cluster " + str(i) + " (" +
                              str(len(cluster_data.index)) + " customers)",
                              "clustering/" + name + "/" + clustering_name +
                              "/clusters")

    return
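

# clusters_number defaults to 8; a common way to choose it is the elbow
# method on the k-means inertia curve. A minimal sketch (hypothetical
# helper, not part of the original pipeline):
def kmeans_elbow(learning_data, max_clusters=15):
    """Return inertia for k = 1..max_clusters; look for the elbow."""
    inertias = []
    for k in range(1, max_clusters + 1):
        model = sklearn.cluster.KMeans(n_clusters=k, random_state=0)
        model.fit(learning_data)
        inertias.append(model.inertia_)
    return inertias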

# Example 4

def dbscan_clustering(data, name, auxiliary_data=None):
    """
        Function for Density-based spatial clustering of applications
            with noise (DBSCAN) algorithm.
        param:
            1. data - pandas DataFrame (10000, 82), where values are mean
                spendings of customers for every category
            2. name - string name of using data
            3. auxiliary_data - pandas DataFrame of data that need to be
                clustered instead of regular DataFrame
    """
    # Decide which data to use for clustering
    learning_data = data if auxiliary_data is None else auxiliary_data

    # Fit the DBSCAN model, choosing the distance threshold (eps) via the
    # DMDBSCAN algorithm
    dbscan_model = \
        sklearn.cluster.DBSCAN(
            min_samples=round(math.log(len(learning_data.index))),
            eps=dmdbscan_algorithm(learning_data,
                                   "clustering/" + name +
                                   "/DBSCAN with DMDBSCAN"),
            n_jobs=-1).fit(learning_data)

    # Plot clusters over auxiliary data
    if auxiliary_data is not None:
        plotting.data_plotting(auxiliary_data,
                               "Auxiliary data clustering",
                               "clustering/" + name + "/DBSCAN with DMDBSCAN",
                               color=dbscan_model.labels_)

    # Plot clusters over data
    plotting.data_plotting(data,
                           "DBSCAN clustering",
                           "clustering/" + name + "/DBSCAN with DMDBSCAN",
                           color=dbscan_model.labels_)

    data['cluster'] = dbscan_model.labels_

    # Plot every cluster as a bar chart of mean spendings (label -1 marks
    # noise, so iterate over the non-negative labels only)
    for i in np.unique(dbscan_model.labels_[dbscan_model.labels_ >= 0]):
        cluster_data = data[data['cluster'] == i].drop(columns='cluster')
        mean_cluster_data = cluster_data.mean(axis=0).sort_values()

        plotting.bar_plotting([
            np.array(mean_cluster_data.values),
            np.array(mean_cluster_data.index)
        ], ["Category", "Mean spendings"], "Cluster " + str(i) + " (" +
                              str(len(cluster_data.index)) + " customers)",
                              "clustering/" + name +
                              "/DBSCAN with DMDBSCAN/clusters")

    return
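

# dmdbscan_algorithm is called above but not shown in this listing. DMDBSCAN
# chooses eps from the sorted k-distance curve (the distance of every point
# to its k-th nearest neighbour), taking the knee where the curve bends
# sharply. A minimal sketch of that idea, assuming the real version also
# saves the k-distance plot into the folder it is given:
def dmdbscan_eps_sketch(learning_data):
    """Estimate DBSCAN eps as the knee of the sorted k-distance curve."""
    import sklearn.neighbors
    k = round(math.log(len(learning_data.index)))
    neighbors = sklearn.neighbors.NearestNeighbors(n_neighbors=k + 1)
    neighbors.fit(learning_data)
    # Column -1 is the k-th neighbour (column 0 is the point itself)
    distances, _ = neighbors.kneighbors(learning_data)
    k_distances = np.sort(distances[:, -1])
    # The knee is taken as the point lying farthest below the chord that
    # joins the first and last points of the curve
    x = np.arange(len(k_distances))
    chord = k_distances[0] + (k_distances[-1] - k_distances[0]) * x / x[-1]
    return k_distances[np.argmax(chord - k_distances)]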

# Example 5

def gaussian_mixture(data, name, auxiliary_data=None):
    """
        Function clustering using Gaussian mixture model with Bayes classifier.
        param:
            1. data - pandas DataFrame (10000, 82), where values are mean
                spendings of customers for every category
            2. name - string name of using data
            3. auxiliary_data - pandas DataFrame of data that need to be
                clustered instead of regular DataFrame
    """
    # Decide which data to use for clustering
    learning_data = data if auxiliary_data is None else auxiliary_data

    # Fit the Bayesian Gaussian mixture model
    bgm_model = sklearn.mixture.\
        BayesianGaussianMixture(n_components=5, max_iter=1000,
                                weight_concentration_prior=1e+03).\
        fit(learning_data)
    labels = bgm_model.predict(learning_data)

    # Plot clusters over auxiliary data
    if auxiliary_data is not None:
        plotting.data_plotting(auxiliary_data,
                               "Auxiliary data clustering",
                               "clustering/" + name + "/gaussian mixture",
                               color=labels)

    # Plot clusters over data
    plotting.data_plotting(data,
                           "Gaussian mixture clustering",
                           "clustering/" + name + "/gaussian mixture",
                           color=labels)

    data['cluster'] = labels

    # Plot every cluster as a bar chart of mean spendings (iterate over the
    # labels actually predicted, since some components may go unused)
    for i in np.unique(labels):
        cluster_data = data[data['cluster'] == i].drop(columns='cluster')
        mean_cluster_data = cluster_data.mean(axis=0).sort_values()

        plotting.bar_plotting([
            np.array(mean_cluster_data.values),
            np.array(mean_cluster_data.index)
        ], ["Category", "Mean spendings"], "Cluster " + str(i) + " (" +
                              str(len(cluster_data.index)) + " customers)",
                              "clustering/" + name +
                              "/gaussian mixture/clusters")

    return
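

# With weight_concentration_prior=1e+03 the variational prior spreads weight
# over all 5 components, but some can still collapse to near-zero weight. A
# quick post-fit check (hypothetical helper; the threshold is arbitrary):
def effective_components(bgm_model, weight_threshold=1e-2):
    """Count mixture components that received non-negligible weight."""
    return int(np.sum(bgm_model.weights_ > weight_threshold))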

# Example 6

def univariate_analysis_applying(data):
    """
        Method to perform univariate analysis on categories.
        param:
            data - pandas DataFrame of mapped data
    """
    # Categories distribution plotting
    plotting.bar_plotting(data, ["Categories", "Patients number"],
                          "Patients EEG categories distribution (numeric)",
                          "univariate_analysis")

    # Categories distribution plotting (percentages)
    plotting.pie_plotting(data / data.values.sum() * 100,
                          "Patients EEG categories distribution (percentage)",
                          "univariate_analysis")

    return
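

# A usage sketch: the function expects per-category patient counts, e.g. a
# value_counts Series over the mapped data (the `mapped_data` frame and its
# 'category' column are assumptions, not shown in this listing):
#
#     category_counts = mapped_data['category'].value_counts()
#     univariate_analysis_applying(category_counts)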

# Example 7

def topic_clustering(data, lda_results, topics_number):
    """
        Function for clustering data using dominant topics from
            LDA topic modeling results.
        param:
            1. data - pandas DataFrame (10000, 82), where values are mean
                spendings of customers for every category
            2. auxiliary_data - pandas DataFrame of LDA topic modeling results
            3. topics_number - int number of topics
    """
    # Extract dominant topics as cluster labels
    topic_labels = lda_results['dominant_topic'].values

    # Plot clusters over LDA results
    plotting.data_plotting(lda_results.drop(columns=['dominant_topic']),
                           "Auxiliary data clustering",
                           "clustering/initial data/clustering by LDA topics",
                           color=topic_labels)

    # Plot clusters over data
    plotting.data_plotting(data,
                           "Clustering by LDA topics",
                           "clustering/initial data/clustering by LDA topics",
                           color=topic_labels)

    data['dominant_topic'] = topic_labels

    # Plot every cluster as a bar chart of mean spendings
    for i in range(topics_number):
        cluster_data = data[data['dominant_topic'] == i].\
            drop(columns='dominant_topic')
        mean_cluster_data = cluster_data.mean(axis=0).sort_values()

        plotting.bar_plotting([
            np.array(mean_cluster_data.values),
            np.array(mean_cluster_data.index)
        ], ["Category", "Mean spendings"], "Cluster " + str(i) + " (" +
                              str(len(cluster_data.index)) + " customers)",
                              "clustering/initial data/"
                              "clustering by LDA topics/clusters")

    return

# Example 8

def lda_performing(data_for_lda, customer_ids, category_names):
    """
        Function for topic modeling using the Latent Dirichlet Algorithm (LDA).
        param:
            1. data_for_lda - pandas DataFrame (10000, 82), where values are
                the transactions number of customers for every category
            2. customer_ids - numpy array (10000, ) with all customer ids
            3. category_names - numpy array (82, ) with names of categories
        return:
            1. lda_results - pandas DataFrame results of LDA modeleing
            2. Int value of topics number
    """
    # Create LDA with standard parameters
    lda_model = \
        sklearn.decomposition.LatentDirichletAllocation(n_jobs=-1)

    # Do a grid search over parameters of LDA
    search_params = {
        'n_components': [5, 10, 15, 20, 25, 30],
        'learning_decay': [.5, .7, .9],
        'max_iter': [10, 15, 20, 25, 30]
    }
    # Create grid search model
    grid_search_model = \
        sklearn.model_selection.GridSearchCV(lda_model,
                                             param_grid=search_params,
                                             n_jobs=-1)
    grid_search_model.fit(data_for_lda)

    # Get the LDA model with the best score
    lda_model = grid_search_model.best_estimator_
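    # (grid_search_model.best_params_ and best_score_ can be logged here to
    # record which n_components/learning_decay/max_iter combination won)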

    # Fit and transform with the best model (best_estimator_ is already
    # fitted by the grid search, so transform alone would also work)
    lda_results = lda_model.fit_transform(data_for_lda)

    # Create topic names
    topic_names = \
        np.array(["topic " + str(i) for i in
                  range(lda_results.shape[1])])

    # Create and save DataFrame of results
    lda_results = pd.DataFrame(data=lda_results,
                               columns=topic_names,
                               index=customer_ids)
    lda_results.index.name = "customer_id"
    dominant_topics = np.argmax(lda_results.values, axis=1)
    lda_results['dominant_topic'] = dominant_topics
    lda_results.to_csv(path_or_buf="data/output/lda_results.csv")

    # Plot probability of topics for every customer as a bar chart
    for customer_id in customer_ids:
        customer_data = \
            lda_results.drop(columns=['dominant_topic']).loc[[customer_id]].\
            sort_values(by=customer_id, axis=1)

        plotting.bar_plotting([
            np.array(customer_data.values[0]),
            np.array(customer_data.columns)
        ], ["Category", "Probability"],
                              "Customer " + customer_id,
                              "LDA/customers",
                              color='m')

    # Create DataFrame of topics distribution over customers
    topic_distribution = pd.DataFrame()
    topic_distribution['customers_number'] = \
        lda_results['dominant_topic'].value_counts(sort=False).\
        reindex(np.arange(len(topic_names)), fill_value=0)
    topic_distribution.index = \
        lda_results.columns[:len(topic_names)]
    topic_distribution.index.name = "topic"
    topic_distribution = \
        topic_distribution.sort_values(by=['customers_number'], axis=0)

    # Plot the distribution
    plotting.bar_plotting([
        np.array(topic_distribution['customers_number'].values),
        np.array(topic_distribution.index)
    ], ["Topic", "Number of customers"], "Topics distribution over customers",
                          "LDA")

    # Create DataFrame for topics
    topics_data = \
        pd.DataFrame(data=lda_model.components_ / lda_model.components_.
                     sum(axis=1)[:, np.newaxis],
                     columns=data_for_lda.columns,
                     index=topic_names)
    topics_data.index.name = "topic"

    # Plot probability of categories for every topic as a bar chart
    for topic_name in topic_names:
        topic_data = \
            topics_data.loc[[topic_name]].sort_values(by=topic_name, axis=1)

        plotting.bar_plotting(
            [np.array(topic_data.values[0]),
             np.array(topic_data.columns)], ["Category", "Probability"],
            topic_name, "LDA/topics")

    # Plot probability of topics for every category as a bar chart
    for i in range(len(category_names)):
        category_data = topics_data.iloc[:, i].sort_values(axis=0)

        plotting.bar_plotting(
            [np.array(category_data.values),
             np.array(category_data.index)], ["Topic", "Probability"],
            "Category " + category_names[i],
            "LDA/categories",
            color='g')

    return lda_results, len(topic_names)
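

# A sketch of how the LDA input is built and how the two steps chain
# together. The `transactions` frame and its 'customer_id'/'category_name'
# columns are assumptions based on the docstrings above:
def build_lda_input(transactions):
    """Transaction counts per customer and category, shape (10000, 82)."""
    return pd.crosstab(transactions['customer_id'],
                       transactions['category_name'])

# Hypothetical usage:
#
#     data_for_lda = build_lda_input(transactions)
#     lda_results, topics_number = lda_performing(data_for_lda,
#                                                 customer_ids,
#                                                 category_names)
#     topic_clustering(data, lda_results, topics_number)
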
def random_forrest_classification(training_set,
                                  validation_set,
                                  folder,
                                  feature_importance=True):
    """
        Training and validation of random forrest classifier model.
        param:
            1. training_set - list of sets for training
            2. validation_set - list of sets for validation
            3. folder - string name of folder
            4. feature_importance - boolean value to compute most important
                features (True as default)
        return:
            most_important_features - list of 15 most important features
    """
    # Creation and training of random forest model
    model_random_forest = \
        sklearn.ensemble.RandomForestClassifier(n_estimators=2500,
                                                max_features=None,
                                                n_jobs=-1,
                                                random_state=0).\
        fit(training_set[0],
            training_set[1].drop(columns=['categories']).
            astype('int').values.ravel())

    categories = validation_set[1].loc[:, ['y', 'categories']].\
        sort_values(by=['y']).drop(columns=['y'])['categories'].unique()

    # Per-category accuracy from the confusion matrix (the row-normalised
    # diagonal, i.e. the fraction of each category predicted correctly)
    confusion_matrix = sklearn.metrics.confusion_matrix(
        validation_set[1].drop(columns=['categories']).astype('int'),
        model_random_forest.predict(validation_set[0]),
        labels=np.arange(0, len(categories)))

    accuracies = (confusion_matrix.astype('float') /
                  confusion_matrix.sum(axis=1)[:, np.newaxis]).diagonal()

    # Precision, recall and F-score computation for every category
    precisions, recalls, f_scores, _ = \
        sklearn.metrics.precision_recall_fscore_support(
            validation_set[1].drop(columns=['categories']).astype('int'),
            model_random_forest.predict(validation_set[0]),
            labels=np.arange(0, len(categories)))

    # Accuracy distribution over categories plotting
    plotting.bar_plotting(
        pd.Series(accuracies, index=categories).sort_values(ascending=False),
        ["Categories", "Accuracy"], "Classification accuracy",
        "multivariate_analysis/initial/random_forest/" + folder)

    # Recall distribution over categories plotting
    plotting.bar_plotting(
        pd.Series(recalls, index=categories).sort_values(ascending=False),
        ["Categories", "Recall"], "Classification recall",
        "multivariate_analysis/initial/random_forest/" + folder)

    # Precision distribution over categories plotting
    plotting.bar_plotting(
        pd.Series(precisions, index=categories).sort_values(ascending=False),
        ["Categories", "Precision"], "Classification precision",
        "multivariate_analysis/initial/random_forest/" + folder)

    # F-score distribution over categories plotting
    plotting.bar_plotting(
        pd.Series(f_scores, index=categories).sort_values(ascending=False),
        ["Categories", "F-score"], "Classification F-score",
        "multivariate_analysis/initial/random_forest/" + folder)

    # Decision tree plotting
    plotting.graph_exporting(model_random_forest,
                             validation_set[0].columns.tolist(),
                             validation_set[1]['y'].unique().astype('str'),
                             folder)

    most_important_features = None

    # Most important features selection and plotting
    if feature_importance:
        most_important_features = \
            pd.Series(model_random_forest.feature_importances_,
                      index=validation_set[0].columns).\
            sort_values(ascending=False).head(15)

        plotting.bar_plotting(
            most_important_features, ["Brain activity", "Score"],
            "Top 15 most important features",
            "multivariate_analysis/initial/random_forest/" + folder)

    return most_important_features
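

# A sketch of the set layout this classifier expects: element [0] holds the
# feature frame, element [1] a frame with the integer target 'y' plus the
# readable 'categories' label (names taken from the code above; the split
# parameters are assumptions):
def build_sets(features, targets, test_size=0.2):
    """Split into the [features, targets] pairs used above."""
    x_train, x_val, y_train, y_val = \
        sklearn.model_selection.train_test_split(features,
                                                 targets,
                                                 test_size=test_size,
                                                 random_state=0,
                                                 stratify=targets['y'])
    return [x_train, y_train], [x_val, y_val]

# Hypothetical usage:
#
#     training_set, validation_set = build_sets(features, targets)
#     random_forrest_classification(training_set, validation_set, "baseline")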