Example #1
class Kmeans:
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def spatial(self, query, no_clusters, no_init=20):
        """
            find centers based on clusters of latitude/longitude pairs
            query: SQL query that has a WGS84 geometry (the_geom)
        """
        params = {
            "subquery": query,
            "geom_col": "the_geom",
            "id_col": "cartodb_id"
        }

        data = self.data_provider.get_spatial_kmeans(params)

        # Unpack query response
        xs = data[0]['xs']
        ys = data[0]['ys']
        ids = data[0]['ids']

        km = KMeans(n_clusters=no_clusters, n_init=no_init)
        labels = km.fit_predict(zip(xs, ys))
        return zip(ids, labels)
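
To make the data flow concrete, here is a minimal, self-contained sketch of the same clustering step, assuming scikit-learn is available and using a hand-written response in the shape get_spatial_kmeans returns (hypothetical values). Note that on Python 3 the zip(...) call must be wrapped in list(...) before it reaches fit_predict; the snippet above is Python 2 era code.

from sklearn.cluster import KMeans

# Hypothetical provider response: one row with parallel lists of coordinates
# and row ids, mirroring the structure unpacked above.
data = [{'xs': [0.0, 0.1, 5.0, 5.1],
         'ys': [0.0, 0.2, 5.0, 5.2],
         'ids': [10, 11, 12, 13]}]

xs, ys, ids = data[0]['xs'], data[0]['ys'], data[0]['ids']
km = KMeans(n_clusters=2, n_init=20)
labels = km.fit_predict(list(zip(xs, ys)))  # list(...) needed on Python 3
print(list(zip(ids, labels)))               # e.g. [(10, 1), (11, 1), (12, 0), (13, 0)]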
Example #2
class Kmeans:
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def spatial(self, query, no_clusters, no_init=20):
        """
            find centers based on clusters of latitude/longitude pairs
            query: SQL query that has a WGS84 geometry (the_geom)
        """
        params = {"subquery": query,
                  "geom_col": "the_geom",
                  "id_col": "cartodb_id"}

        data = self.data_provider.get_spatial_kmeans(params)

        # Unpack query response
        xs = data[0]['xs']
        ys = data[0]['ys']
        ids = data[0]['ids']

        km = KMeans(n_clusters=no_clusters, n_init=no_init)
        labels = km.fit_predict(zip(xs, ys))
        return zip(ids, labels)
Example #3
class Getis(object):
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def getis_ord(self, subquery, attr, w_type, num_ngbrs, permutations,
                  geom_col, id_col):
        """
        Getis-Ord's G*
        Implementation that builds neighbors with a PostGIS database and uses
          PySAL's Getis-Ord G* hotspot/coldspot module.
        Andy Eschbacher
        """

        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors if kNN is chosen

        params = OrderedDict([("id_col", id_col), ("attr1", attr),
                              ("geom_col", geom_col), ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_getis(w_type, params)
        attr_vals = pu.get_attributes(result)

        # build PySAL weight object
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate Getis-Ord's G* z- and p-values
        getis = ps.esda.getisord.G_Local(attr_vals,
                                         weight,
                                         star=True,
                                         permutations=permutations)

        return list(
            zip(getis.z_sim, getis.p_sim, getis.p_z_sim, weight.id_order))
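
Each returned tuple pairs a G* z-score with its pseudo p-values and the row id. As a hedged illustration with toy values (not real output), a caller might separate hot spots from cold spots like this:

# Toy stand-ins for (getis.z_sim, getis.p_sim, getis.p_z_sim, rowid) tuples.
results = [(2.4, 0.01, 0.008, 7), (-1.9, 0.04, 0.030, 8), (0.3, 0.41, 0.380, 9)]

hot_spots = [rowid for z, p, _, rowid in results if z > 0 and p <= 0.05]
cold_spots = [rowid for z, p, _, rowid in results if z < 0 and p <= 0.05]
print(hot_spots, cold_spots)  # [7] [8]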
Example #4
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider
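
This constructor pattern, injecting a data provider and falling back to AnalysisDataProvider, is what lets the classes above be exercised without a database. A minimal sketch, assuming the Kmeans class from Example #1 is in scope (with its scikit-learn dependency); StubProvider is a hypothetical test double, not part of crankshaft:

class StubProvider:
    """Hypothetical stand-in for AnalysisDataProvider used in tests."""
    def get_spatial_kmeans(self, params):
        return [{'xs': [0.0, 0.1, 5.0, 5.1],
                 'ys': [0.0, 0.1, 5.0, 5.1],
                 'ids': [1, 2, 3, 4]}]

# No database needed: the injected stub answers the query instead.
kmeans = Kmeans(data_provider=StubProvider())
clusters = kmeans.spatial('SELECT * FROM my_table', no_clusters=2)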
Example #5
class Moran:
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def global_stat(self, subquery, attr_name,
                    w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I (global)
        Implementation that builds neighbors with a PostGIS database and
         computes Moran's I core clusters with PySAL.
        Andy Eschbacher
        """
        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr_name),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        attr_vals = pu.get_attributes(result)

        # calculate weights
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate moran global
        moran_global = ps.esda.moran.Moran(attr_vals, weight,
                                           permutations=permutations)

        return zip([moran_global.I], [moran_global.EI])

    def local_stat(self, subquery, attr,
                   w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I implementation for PL/Python
        Andy Eschbacher
        """

        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors

        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        attr_vals = pu.get_attributes(result)
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local(attr_vals, weight,
                                         permutations=permutations)

        # find quadrants for each geometry
        quads = quad_position(lisa.q)

        return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)

    def global_rate_stat(self, subquery, numerator, denominator,
                         w_type, num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I Rate (global)
        Andy Eschbacher
        """
        params = OrderedDict([("id_col", id_col),
                              ("attr1", numerator),
                              ("attr2", denominator)
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        numer = pu.get_attributes(result, 1)
        denom = pu.get_attributes(result, 2)

        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate moran global rate
        lisa_rate = ps.esda.moran.Moran_Rate(numer, denom, weight,
                                             permutations=permutations)

        return zip([lisa_rate.I], [lisa_rate.EI])

    def local_rate_stat(self, subquery, numerator, denominator,
                        w_type, num_ngbrs, permutations, geom_col, id_col):
        """
            Moran's I Local Rate
            Andy Eschbacher
        """
        # geometries with values that are null are ignored
        # resulting in a collection of not as near neighbors

        params = OrderedDict([("id_col", id_col),
                              ("numerator", numerator),
                              ("denominator", denominator),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        numer = pu.get_attributes(result, 1)
        denom = pu.get_attributes(result, 2)

        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local_Rate(numer, denom, weight,
                                              permutations=permutations)

        # find quadrants for each geometry
        quads = quad_position(lisa.q)

        return zip(lisa.Is, quads, lisa.p_sim, weight.id_order, lisa.y)

    def local_bivariate_stat(self, subquery, attr1, attr2,
                             permutations, geom_col, id_col,
                             w_type, num_ngbrs):
        """
            Moran's I (local) Bivariate (untested)
        """

        params = OrderedDict([("id_col", id_col),
                              ("attr1", attr1),
                              ("attr2", attr2),
                              ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        attr1_vals = pu.get_attributes(result, 1)
        attr2_vals = pu.get_attributes(result, 2)

        # create weights
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local_BV(attr1_vals, attr2_vals, weight,
                                            permutations=permutations)

        # find clustering of significance
        lisa_sig = quad_position(lisa.q)

        return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
Example #6
class Segmentation(object):
    """
        Add docstring
    """

    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def create_and_predict_segment_agg(self, target, features, target_features,
                                       target_ids, model_parameters):
        """
        Version of create_and_predict_segment that works on arrays that come
            straight form the SQL calling the function.

            Input:
                @param target: The 1D array of length NSamples containing the
                target variable we want the model to predict
                @param features: The 2D array of size NSamples * NFeatures that
                    form the input to the model
                @param target_ids: A 1D array of target_ids that will be used
                to associate the results of the prediction with the rows which
                    they come from
                @param model_parameters: A dictionary containing parameters for
                the model.
        """
        clean_target, _ = replace_nan_with_mean(target)
        clean_features, _ = replace_nan_with_mean(features)
        target_features, _ = replace_nan_with_mean(target_features)

        model, accuracy = train_model(clean_target, clean_features,
                                      model_parameters, 0.2)
        prediction = model.predict(target_features)
        accuracy_array = [accuracy] * prediction.shape[0]
        return list(zip(target_ids, prediction, accuracy_array))

    def create_and_predict_segment(self, query, variable, feature_columns,
                                   target_query, model_params,
                                   id_col='cartodb_id'):
        """
        generate a segment with machine learning
        Stuart Lynn
                @param query: subquery that data is pulled from for packaging
                @param variable: name of the target variable
                @param feature_columns: list of column names
                @target_query: The query to run to obtain the data to predict
                @param model_params: A dictionary of model parameters, the full
                        specification can be found on the
                        scikit learn page for [GradientBoostingRegressor]
                        (http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)
        """
        params = {"subquery": target_query,
                  "id_col": id_col}

        (target, features, target_mean,
            feature_means) = self.clean_data(query, variable, feature_columns)

        model, accuracy = train_model(target, features, model_params, 0.2)
        result = self.predict_segment(model, feature_columns, target_query,
                                      feature_means)
        accuracy_array = [accuracy] * result.shape[0]

        rowid = self.data_provider.get_segmentation_data(params)
        '''
        rowid = [{'ids': [2.9, 4.9, 4, 5, 6]}]
        '''
        return list(zip(rowid[0]['ids'], result, accuracy_array))

    def predict_segment(self, model, feature_columns, target_query,
                        feature_means):
        """
        Use the provided model to predict the values for the new feature set
            Input:
                @param model: The pretrained model
                @features_col: A list of features to use in the
                    model prediction (list of column names)
                @target_query: The query to run to obtain the data to predict
                    on and the cartodb_ids associated with it.
        """

        batch_size = 1000
        params = {"subquery": target_query,
                  "feature_columns": feature_columns}

        results = []
        cursors = self.data_provider.get_segmentation_predict_data(params)
        '''
         cursors = [{'features': [[m1[0],m2[0],m3[0]],[m1[1],m2[1],m3[1]],
                                  [m1[2],m2[2],m3[2]]]}]
        '''

        while True:
            rows = cursors.fetch(batch_size)
            if not rows:
                break
            batch = np.row_stack([np.array(row['features'])
                                  for row in rows]).astype(float)

            batch = replace_nan_with_mean(batch, feature_means)[0]
            prediction = model.predict(batch)
            results.append(prediction)

        # NOTE: we removed the cartodb_ids calculation in here
        return np.concatenate(results)

    def clean_data(self, query, variable, feature_columns):
        """
            Add docstring
        """
        params = {"subquery": query,
                  "target": variable,
                  "features": feature_columns}

        data = self.data_provider.get_segmentation_model_data(params)

        '''
        data = [{'target': [2.9, 4.9, 4, 5, 6],
        'feature1': [1,2,3,4], 'feature2' : [2,3,4,5]}]
        '''

        # extract target data from data_provider object
        target = np.array(data[0]['target'], dtype=float)

        # put n feature data arrays into an n x m array of arrays
        features = np.column_stack([np.array(data[0][col])
                                    for col in feature_columns]).astype(float)

        features, feature_means = replace_nan_with_mean(features)
        target, target_mean = replace_nan_with_mean(target)
        return target, features, target_mean, feature_means
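
train_model and replace_nan_with_mean are helpers defined elsewhere in the package. As a rough, hypothetical sketch of the imputation step (not the crankshaft implementation): NaNs are replaced by column means, and the means are returned so they can be reused on prediction batches via the second argument.

import numpy as np

def replace_nan_with_mean_sketch(arr, means=None):
    """Replace NaNs with column means; return the cleaned array and the means."""
    arr = np.array(arr, dtype=float)
    if means is None:
        means = np.nanmean(arr, axis=0)  # per-column means (scalar for 1D input)
    nan_mask = np.isnan(arr)
    arr[nan_mask] = np.broadcast_to(means, arr.shape)[nan_mask]
    return arr, means

features, feature_means = replace_nan_with_mean_sketch([[1.0, np.nan],
                                                        [3.0, 4.0]])
print(features)       # [[1. 4.] [3. 4.]]
print(feature_means)  # [2. 4.]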
Example #7
class Markov(object):
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def spatial_trend(self,
                      subquery,
                      time_cols,
                      num_classes=7,
                      w_type='knn',
                      num_ngbrs=5,
                      permutations=0,
                      geom_col='the_geom',
                      id_col='cartodb_id'):
        """
            Predict the trends of a unit based on:
            1. history of its transitions to different classes (e.g., 1st
               quantile -> 2nd quantile)
            2. average class of its neighbors

            Inputs:
            @param subquery string: e.g., SELECT the_geom, cartodb_id,
              interesting_time_column FROM table_name
            @param time_cols list of strings: list of strings of column names
            @param num_classes (optional): number of classes to break
              distribution of values into. Currently uses quantile bins.
            @param w_type string (optional): weight type ('knn' or 'queen')
            @param num_ngbrs int (optional): number of neighbors (if knn type)
            @param permutations int (optional): number of permutations for test
              stats
            @param geom_col string (optional): name of column which contains
              the geometries
            @param id_col string (optional): name of column which has the ids
              of the table

            Outputs:
            @param trend_up float: probability that a geom will move to a
              higher class
            @param trend_down float: probability that a geom will move to a
              lower class
            @param trend float: (trend_up - trend_down) / trend_static
            @param volatility float: a measure of the volatility based on
              probability stddev(prob array)
        """

        if len(time_cols) < 2:
            plpy.error('More than one time column needs to be passed')

        params = {
            "id_col": id_col,
            "time_cols": time_cols,
            "geom_col": geom_col,
            "subquery": subquery,
            "num_ngbrs": num_ngbrs
        }

        result = self.data_provider.get_markov(w_type, params)

        # build weight
        weights = pu.get_weight(result, w_type)
        weights.transform = 'r'

        # prep time data
        t_data = get_time_data(result, time_cols)

        sp_markov_result = ps.Spatial_Markov(t_data,
                                             weights,
                                             k=num_classes,
                                             fixed=False,
                                             permutations=permutations)

        # get lag classes
        lag_classes = ps.Quantiles(ps.lag_spatial(weights, t_data[:, -1]),
                                   k=num_classes).yb

        # look up probability distribution for each unit according to class and
        #  lag class
        prob_dist = get_prob_dist(sp_markov_result.P, lag_classes,
                                  sp_markov_result.classes[:, -1])

        # find the ups and down and overall distribution of each cell
        trend_up, trend_down, trend, volatility = get_prob_stats(
            prob_dist, sp_markov_result.classes[:, -1])

        # output the results
        return zip(trend, trend_up, trend_down, volatility, weights.id_order)
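
get_prob_dist and get_prob_stats are package helpers. A toy numpy illustration of the output statistics described in the docstring (illustrative only, not crankshaft's implementation): for each unit, trend_up sums the probability mass above its current class, trend_down the mass below it, trend follows the (trend_up - trend_down) / trend_static formula, and volatility is the standard deviation of the probability array.

import numpy as np

prob_dist = np.array([[0.1, 0.6, 0.3],   # unit currently in class 1
                      [0.7, 0.2, 0.1]])  # unit currently in class 0
curr_class = np.array([1, 0])

trend_up = np.array([p[c + 1:].sum() for p, c in zip(prob_dist, curr_class)])
trend_down = np.array([p[:c].sum() for p, c in zip(prob_dist, curr_class)])
trend_static = np.array([p[c] for p, c in zip(prob_dist, curr_class)])
trend = (trend_up - trend_down) / trend_static
volatility = prob_dist.std(axis=1)

print(trend_up, trend_down)  # [0.3 0.3] [0.1 0. ]
print(trend, volatility)     # roughly [0.33 0.43] [0.21 0.26]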
Example #8
class Moran(object):
    """Class for calculation of Moran's I statistics (global, local, and local
    rate)

    Parameters:
      data_provider (:obj:`AnalysisDataProvider`): Class for fetching data. See
        the `crankshaft.analysis_data_provider` module for more information.
    """
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def global_stat(self, subquery, attr_name, w_type, num_ngbrs, permutations,
                    geom_col, id_col):
        """
        Moran's I (global)
        Implementation that builds neighbors with a PostGIS database and
         computes Moran's I core clusters with PySAL.

        Args:

          subquery (str): Query to give access to the data needed. This query
            must give access to ``attr_name``, ``geom_col``, and ``id_col``.
          attr_name (str): Column name of data to analyze
          w_type (str): Type of spatial weight. Must be one of `knn`
            or `queen`. See `PySAL documentation
            <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
            for more information.
          num_ngbrs (int): If using `knn` for ``w_type``, this
            specifies the number of neighbors to be used to define the spatial
            neighborhoods.
          permutations (int): Number of permutations for performing
            conditional randomization to find the p-value. Higher numbers
            take longer to return results.
          geom_col (str): Name of the geometry column in the dataset for
            finding the spatial neighborhoods.
          id_col (str): Row index for each value. Usually the database index.

        """
        params = OrderedDict([("id_col", id_col), ("attr1", attr_name),
                              ("geom_col", geom_col), ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        attr_vals = pu.get_attributes(result)

        # calculate weights
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate moran global
        moran_global = ps.esda.moran.Moran(attr_vals,
                                           weight,
                                           permutations=permutations)

        return zip([moran_global.I], [moran_global.EI])

    def local_stat(self, subquery, attr, w_type, num_ngbrs, permutations,
                   geom_col, id_col):
        """
        Moran's I (local)

        Args:

          subquery (str): Query to give access to the data needed. This query
            must give access to ``attr_name``, ``geom_col``, and ``id_col``.
          attr (str): Column name of data to analyze
          w_type (str): Type of spatial weight. Must be one of `knn`
            or `queen`. See `PySAL documentation
            <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
            for more information.
          num_ngbrs (int): If using `knn` for ``w_type``, this
            specifies the number of neighbors to be used to define the spatial
            neighborhoods.
          permutations (int): Number of permutations for performing
            conditional randomization to find the p-value. Higher numbers
            take longer to return results.
          geom_col (str): Name of the geometry column in the dataset for
            finding the spatial neighborhoods.
          id_col (str): Row index for each value. Usually the database index.

        Returns:
          list of tuples: Where each tuple consists of the following values:
            - quadrants classification (one of `HH`, `HL`, `LL`, or `LH`)
            - p-value
            - spatial lag
            - standardized spatial lag (centered on the mean, normalized by the
              standard deviation)
            - original value
            - standardized value
            - Moran's I statistic
            - original row index
        """

        # geometries with attributes that are null are ignored
        # resulting in a collection of not as near neighbors

        params = OrderedDict([("id_col", id_col), ("attr1", attr),
                              ("geom_col", geom_col), ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        attr_vals = pu.get_attributes(result)
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local(attr_vals,
                                         weight,
                                         permutations=permutations)

        # find quadrants for each geometry
        quads = quad_position(lisa.q)

        # calculate spatial lag
        lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
        lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)

        return zip(quads, lisa.p_sim, lag, lag_std, lisa.y, lisa.z, lisa.Is,
                   weight.id_order)

    def global_rate_stat(self, subquery, numerator, denominator, w_type,
                         num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I Rate (global)

        Args:

          subquery (str): Query to give access to the data needed. This query
            must give access to ``attr_name``, ``geom_col``, and ``id_col``.
          numerator (str): Column name of numerator to analyze
          denominator (str): Column name of the denominator
          w_type (str): Type of spatial weight. Must be one of `knn`
            or `queen`. See `PySAL documentation
            <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
            for more information.
          num_ngbrs (int): If using `knn` for ``w_type``, this
            specifies the number of neighbors to be used to define the spatial
            neighborhoods.
          permutations (int): Number of permutations for performing
            conditional randomization to find the p-value. Higher numbers
            take longer to return results.
          geom_col (str): Name of the geometry column in the dataset for
            finding the spatial neighborhoods.
          id_col (str): Row index for each value. Usually the database index.
        """
        params = OrderedDict([("id_col", id_col), ("attr1", numerator),
                              ("attr2", denominator), ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        numer = pu.get_attributes(result, 1)
        denom = pu.get_attributes(result, 2)

        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate moran global rate
        lisa_rate = ps.esda.moran.Moran_Rate(numer,
                                             denom,
                                             weight,
                                             permutations=permutations)

        return zip([lisa_rate.I], [lisa_rate.EI])

    def local_rate_stat(self, subquery, numerator, denominator, w_type,
                        num_ngbrs, permutations, geom_col, id_col):
        """
        Moran's I Local Rate

        Args:

          subquery (str): Query to give access to the data needed. This query
            must give access to ``attr_name``, ``geom_col``, and ``id_col``.
          numerator (str): Column name of numerator to analyze
          denominator (str): Column name of the denominator
          w_type (str): Type of spatial weight. Must be one of `knn`
            or `queen`. See `PySAL documentation
            <http://pysal.readthedocs.io/en/latest/users/tutorials/weights.html>`__
            for more information.
          num_ngbrs (int): If using `knn` for ``w_type``, this
            specifies the number of neighbors to be used to define the spatial
            neighborhoods.
          permutations (int): Number of permutations for performing
            conditional randomization to find the p-value. Higher numbers
            take longer to return results.
          geom_col (str): Name of the geometry column in the dataset for
            finding the spatial neighborhoods.
          id_col (str): Row index for each value. Usually the database index.

        Returns:
          list of tuples: Where each tuple consists of the following values:
            - quadrants classification (one of `HH`, `HL`, `LL`, or `LH`)
            - p-value
            - spatial lag
            - standardized spatial lag (centered on the mean, normalized by the
              standard deviation)
            - original value (roughly numerator divided by denominator)
            - standardized value
            - Moran's I statistic
            - original row index
        """
        # geometries with values that are null are ignored
        # resulting in a collection of not as near neighbors

        params = OrderedDict([("id_col", id_col), ("numerator", numerator),
                              ("denominator", denominator),
                              ("geom_col", geom_col), ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        numer = pu.get_attributes(result, 1)
        denom = pu.get_attributes(result, 2)

        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local_Rate(numer,
                                              denom,
                                              weight,
                                              permutations=permutations)

        # find quadrants for each geometry
        quads = quad_position(lisa.q)

        # spatial lag
        lag = ps.weights.spatial_lag.lag_spatial(weight, lisa.y)
        lag_std = ps.weights.spatial_lag.lag_spatial(weight, lisa.z)

        return zip(quads, lisa.p_sim, lag, lag_std, lisa.y, lisa.z, lisa.Is,
                   weight.id_order)

    def local_bivariate_stat(self, subquery, attr1, attr2, permutations,
                             geom_col, id_col, w_type, num_ngbrs):
        """
            Moran's I (local) Bivariate (untested)
        """

        params = OrderedDict([("id_col", id_col), ("attr1", attr1),
                              ("attr2", attr2), ("geom_col", geom_col),
                              ("subquery", subquery),
                              ("num_ngbrs", num_ngbrs)])

        result = self.data_provider.get_moran(w_type, params)

        # collect attributes
        attr1_vals = pu.get_attributes(result, 1)
        attr2_vals = pu.get_attributes(result, 2)

        # create weights
        weight = pu.get_weight(result, w_type, num_ngbrs)

        # calculate LISA values
        lisa = ps.esda.moran.Moran_Local_BV(attr1_vals,
                                            attr2_vals,
                                            weight,
                                            permutations=permutations)

        # find clustering of significance
        lisa_sig = quad_position(lisa.q)

        return zip(lisa.Is, lisa_sig, lisa.p_sim, weight.id_order)
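
quad_position is a package helper that converts PySAL's numeric quadrant codes into the labels listed in the Returns sections above. A hypothetical sketch of such a helper, not the crankshaft implementation, assuming PySAL's Moran_Local.q encoding of 1=HH, 2=LH, 3=LL, 4=HL:

import numpy as np

def quad_position_sketch(quads):
    """Map PySAL quadrant codes to labels: 1=HH, 2=LH, 3=LL, 4=HL."""
    lookup = {1: 'HH', 2: 'LH', 3: 'LL', 4: 'HL'}
    return np.array([lookup[q] for q in quads])

print(quad_position_sketch([1, 3, 4, 2]))  # ['HH' 'LL' 'HL' 'LH']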
Example #9
class GWR:
    def __init__(self, data_provider=None):
        if data_provider:
            self.data_provider = data_provider
        else:
            self.data_provider = AnalysisDataProvider()

    def gwr(self,
            subquery,
            dep_var,
            ind_vars,
            bw=None,
            fixed=False,
            kernel='bisquare',
            geom_col='the_geom',
            id_col='cartodb_id'):
        """
            subquery: 'select * from demographics'
            dep_var: 'pctbachelor'
            ind_vars: ['intercept', 'pctpov', 'pctrural', 'pctblack']
            bw: value of bandwidth, if None then select optimal
            fixed: False (kNN) or True ('distance')
            kernel: 'bisquare' (default), or 'exponential', 'gaussian'
        """

        params = {
            'geom_col': geom_col,
            'id_col': id_col,
            'subquery': subquery,
            'dep_var': dep_var,
            'ind_vars': ind_vars
        }

        # get data from data provider
        query_result = self.data_provider.get_gwr(params)

        # exit if data to analyze is empty
        if len(query_result) == 0:
            plpy.error('No data passed to analysis or independent variables '
                       'are all null-valued')

        # unique ids and variable names list
        rowid = np.array(query_result[0]['rowid'], dtype=np.int)

        # x, y are centroids of input geometries
        x = np.array(query_result[0]['x'], dtype=np.float)
        y = np.array(query_result[0]['y'], dtype=np.float)
        coords = list(zip(x, y))

        # extract dependent variable
        Y = np.array(query_result[0]['dep_var'], dtype=np.float).reshape(
            (-1, 1))

        n = Y.shape[0]
        k = len(ind_vars)
        X = np.zeros((n, k))

        # extract query result
        for attr in range(0, k):
            attr_name = 'attr' + str(attr + 1)
            X[:, attr] = np.array(query_result[0][attr_name],
                                  dtype=np.float).flatten()

        # add intercept variable name
        ind_vars.insert(0, 'intercept')

        # calculate bandwidth if none is supplied
        if bw is None:
            bw = Sel_BW(coords, Y, X, fixed=fixed, kernel=kernel).search()
        model = PySAL_GWR(coords, Y, X, bw, fixed=fixed, kernel=kernel).fit()

        # containers for outputs
        coeffs = []
        stand_errs = []
        t_vals = []
        filtered_t_vals = []

        # extracted model information
        c_alpha = model.adj_alpha
        filtered_t = model.filter_tvals(c_alpha[1])
        predicted = model.predy.flatten()
        residuals = model.resid_response
        r_squared = model.localR2.flatten()
        bw = np.repeat(float(bw), n)

        # create lists of json objs for model outputs
        for idx in range(n):
            coeffs.append(
                json.dumps({
                    var: model.params[idx, k]
                    for k, var in enumerate(ind_vars)
                }))
            stand_errs.append(
                json.dumps(
                    {var: model.bse[idx, k]
                     for k, var in enumerate(ind_vars)}))
            t_vals.append(
                json.dumps({
                    var: model.tvalues[idx, k]
                    for k, var in enumerate(ind_vars)
                }))
            filtered_t_vals.append(
                json.dumps({
                    var: filtered_t[idx, k]
                    for k, var in enumerate(ind_vars)
                }))

        return list(
            zip(coeffs, stand_errs, t_vals, filtered_t_vals, predicted,
                residuals, r_squared, bw, rowid))

    def gwr_predict(self,
                    subquery,
                    dep_var,
                    ind_vars,
                    bw=None,
                    fixed=False,
                    kernel='bisquare',
                    geom_col='the_geom',
                    id_col='cartodb_id'):
        """
        subquery: 'select * from demographics'
        dep_var: 'pctbachelor'
        ind_vars: ['intercept', 'pctpov', 'pctrural', 'pctblack']
        bw: value of bandwidth, if None then select optimal
        fixed: False (kNN) or True ('distance')
        kernel: 'bisquare' (default), or 'exponential', 'gaussian'
        """

        params = {
            'geom_col': geom_col,
            'id_col': id_col,
            'subquery': subquery,
            'dep_var': dep_var,
            'ind_vars': ind_vars
        }

        # get data from data provider
        query_result = self.data_provider.get_gwr_predict(params)

        # exit if data to analyze is empty
        if len(query_result) == 0:
            plpy.error('No data passed to analysis or independent variables '
                       'are all null-valued')

        # unique ids and variable names list
        rowid = np.array(query_result[0]['rowid'], dtype=np.int)

        x = np.array(query_result[0]['x'], dtype=np.float)
        y = np.array(query_result[0]['y'], dtype=np.float)
        coords = np.array(list(zip(x, y)), dtype=np.float)

        # extract dependent variable
        Y = np.array(query_result[0]['dep_var']).reshape((-1, 1))

        n = Y.shape[0]
        k = len(ind_vars)
        X = np.empty((n, k), dtype=np.float)

        for attr in range(0, k):
            attr_name = 'attr' + str(attr + 1)
            X[:, attr] = np.array(query_result[0][attr_name],
                                  dtype=np.float).flatten()

        # add intercept variable name
        ind_vars.insert(0, 'intercept')

        # split data into "training" and "test" for predictions
        # create index to split based on null y values
        train = np.where(Y != np.array(None))[0]
        test = np.where(Y == np.array(None))[0]

        # report error if there is no data to predict
        if len(test) < 1:
            plpy.error('No rows flagged for prediction: verify that rows '
                       'denoting prediction locations have a dependent '
                       'variable value of `null`')

        # split dependent variable (only need training which is non-Null's)
        Y_train = Y[train].reshape((-1, 1))
        Y_train = Y_train.astype(np.float)

        # split coords
        coords_train = coords[train]
        coords_test = coords[test]

        # split explanatory variables
        X_train = X[train]
        X_test = X[test]

        # calculate bandwidth if none is supplied
        if bw is None:
            bw = Sel_BW(coords_train,
                        Y_train,
                        X_train,
                        fixed=fixed,
                        kernel=kernel).search()

        # estimate model and predict at new locations
        model = PySAL_GWR(coords_train,
                          Y_train,
                          X_train,
                          bw,
                          fixed=fixed,
                          kernel=kernel).predict(coords_test, X_test)

        coeffs = []
        stand_errs = []
        t_vals = []
        r_squared = model.localR2.flatten()
        predicted = model.predy.flatten()

        m = len(model.predy)
        for idx in range(m):
            coeffs.append(
                json.dumps({
                    var: model.params[idx, k]
                    for k, var in enumerate(ind_vars)
                }))
            stand_errs.append(
                json.dumps(
                    {var: model.bse[idx, k]
                     for k, var in enumerate(ind_vars)}))
            t_vals.append(
                json.dumps({
                    var: model.tvalues[idx, k]
                    for k, var in enumerate(ind_vars)
                }))

        return list(
            zip(coeffs, stand_errs, t_vals, r_squared, predicted, rowid[test]))
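
Each element of coeffs, stand_errs, and t_vals above is a JSON string keyed by variable name, so per-location results can be stored in text columns and decoded downstream. A small illustration with hypothetical values:

import json

# Hypothetical single-row coefficient payload in the format built above.
row_coeffs = json.dumps({'intercept': 1.2, 'pctpov': -0.4, 'pctrural': 0.1})
print(json.loads(row_coeffs)['pctpov'])  # -0.4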
Example #10
class Kmeans(object):
    def __init__(self, data_provider=None):
        if data_provider is None:
            self.data_provider = AnalysisDataProvider()
        else:
            self.data_provider = data_provider

    def spatial(self, query, no_clusters, no_init=20):
        """
            find centers based on clusters of latitude/longitude pairs
            query: SQL query that has a WGS84 geometry (the_geom)
        """
        params = {
            "subquery": query,
            "geom_col": "the_geom",
            "id_col": "cartodb_id"
        }

        result = self.data_provider.get_spatial_kmeans(params)

        # Unpack query response
        xs = result[0]['xs']
        ys = result[0]['ys']
        ids = result[0]['ids']

        km = KMeans(n_clusters=no_clusters, n_init=no_init)
        labels = km.fit_predict(zip(xs, ys))
        return zip(ids, labels)

    def nonspatial(self,
                   subquery,
                   colnames,
                   no_clusters=5,
                   standardize=True,
                   id_col='cartodb_id'):
        """
        Arguments:
            subquery (string): A SQL query to retrieve the data required to do
                               the k-means clustering analysis, like so:
                               SELECT * FROM iris_flower_data
            colnames (list): a list of the column names which contain the data
                             of interest, like so: ['sepal_width',
                                                    'petal_width',
                                                    'sepal_length',
                                                    'petal_length']
            no_clusters (int): number of clusters (greater than zero)
            id_col (string): name of the input id_column

        Returns:
            A list of tuples with the following columns:
            cluster labels: a label for the cluster that the row belongs to
            centers: center of the cluster that this row belongs to
            silhouettes: silhouette measure for this value
            rowid: row that these values belong to (corresponds to the value in
                   `id_col`)
        """
        import json
        from sklearn import metrics

        params = {"colnames": colnames, "subquery": subquery, "id_col": id_col}

        data = self.data_provider.get_nonspatial_kmeans(params)

        # fill array with values for k-means clustering
        if standardize:
            cluster_columns = _scale_data(_extract_columns(data))
        else:
            cluster_columns = _extract_columns(data)

        kmeans = KMeans(n_clusters=no_clusters,
                        random_state=0).fit(cluster_columns)

        centers = [
            json.dumps(dict(zip(colnames, c)))
            for c in kmeans.cluster_centers_[kmeans.labels_]
        ]

        silhouettes = metrics.silhouette_samples(cluster_columns,
                                                 kmeans.labels_,
                                                 metric='sqeuclidean')

        return zip(kmeans.labels_, centers, silhouettes,
                   [kmeans.inertia_] * kmeans.labels_.shape[0],
                   data[0]['rowid'])
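
_extract_columns and _scale_data are module helpers (column stacking and standardization, judging by their usage above). A minimal sketch of the non-spatial clustering core under those assumptions, using scikit-learn directly and a hand-built response in the shape get_nonspatial_kmeans might return (hypothetical data and layout):

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale

# Hypothetical provider response: parallel value lists per column plus row ids.
data = [{'sepal_width': [3.5, 3.0, 3.2, 2.9],
         'petal_width': [0.2, 0.2, 1.8, 1.7],
         'rowid': [1, 2, 3, 4]}]
colnames = ['sepal_width', 'petal_width']

cluster_columns = scale(np.column_stack([data[0][c] for c in colnames]))
kmeans = KMeans(n_clusters=2, random_state=0).fit(cluster_columns)
silhouettes = metrics.silhouette_samples(cluster_columns, kmeans.labels_,
                                         metric='sqeuclidean')
print(list(zip(kmeans.labels_, silhouettes, data[0]['rowid'])))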