Пример #1
0
    def create_all_polygons_on_grid(self):
        """
        Create one polygon for every cell of the grid.

        The polygons are collected in a 2-D object array with
        ``grid_size_lat_y`` rows and ``grid_size_lon_x`` columns, which is
        stored in the ``grid_polygon`` attribute. ``last_operation`` is
        updated when the method finishes, whether it succeeds or raises.

        """

        operation = begin_operation('create_all_polygons_on_grid')

        try:
            print('\nCreating all polygons on virtual grid', flush=True)
            grid_polygon = np.array(
                [[None for _ in range(self.grid_size_lon_x)]
                 for _ in range(self.grid_size_lat_y)])
            lat_init = self.lat_min_y
            cell_size = self.cell_size_by_degree
            for i in progress_bar(range(self.grid_size_lat_y)):
                # every row starts again at the minimum longitude
                lon_init = self.lon_min_x
                for j in range(self.grid_size_lon_x):
                    # Build the polygon of the cell from its four corners.
                    # NOTE(review): vertices are passed as (lat, lon); if
                    # Polygon expects (x, y) = (lon, lat) the geometry is
                    # transposed - confirm against the other grid methods.
                    grid_polygon[i][j] = Polygon((
                        (lat_init, lon_init),
                        (lat_init + cell_size, lon_init),
                        (lat_init + cell_size, lon_init + cell_size),
                        (lat_init, lon_init + cell_size),
                    ))
                    lon_init += cell_size
                lat_init += cell_size
            self.grid_polygon = grid_polygon
            print('...geometries saved on Grid grid_polygon property')
        finally:
            # Record the operation on success and on failure alike; any
            # exception keeps propagating with its original traceback.
            self.last_operation = end_operation(operation)
Пример #2
0
def insert_points_in_df(data: DataFrame, aug_df: DataFrame):
    """
    Append the points of the generated trajectories to the original data.

    Each row of ``aug_df`` is expanded into a temporary dataframe that only
    keeps the columns also present in ``data``. List/array values are
    assigned first, then the scalar values. Every row of the temporary
    dataframe is appended to ``data`` through ``append_row``.

    Parameters
    ----------
    data : DataFrame
        The input trajectories data
    aug_df : DataFrame
        The data of unobserved trajectories

    """
    sequence_types = (list, np.ndarray)

    for _, aug_row in progress_bar(aug_df.iterrows(), total=aug_df.shape[0]):
        labels = aug_row.index.tolist()
        contents = aug_row.values.tolist()

        # split the shared columns into sequence-valued and scalar-valued
        sequence_items = []
        scalar_items = []
        for label, content in zip(labels, contents):
            if label not in data:
                continue
            if isinstance(content, sequence_types):
                sequence_items.append((label, content))
            else:
                scalar_items.append((label, content))

        # sequences go in first, scalars afterwards
        expanded = pd.DataFrame()
        for label, content in sequence_items + scalar_items:
            expanded[label] = content

        for _, point in expanded.iterrows():
            append_row(data, row=point)
Пример #3
0
def decode_geohash_to_latlon(data: DataFrame,
                             label_geohash: Optional[Text] = GEOHASH,
                             reset_index: Optional[bool] = True):
    """
    Decode the geohash column back to geographic coordinates.

    The decoded values are written to the LATITUDE_DECODE and
    LONGITUDE_DECODE columns of ``data``; nothing is returned.

    Parameters
    ----------
    data : dataframe
        The input trajectories data
    label_geohash : str, optional
        The name of the feature with hashed trajectories, by default GEOHASH
    reset_index : boolean, optional
        Forwarded to the helper that prepares the output arrays,
        by default True

    Raises
    ------
    ValueError
        If ``label_geohash`` is not a column of ``data``.
    """

    if label_geohash not in data:
        raise ValueError('feature {} not in df'.format(label_geohash))

    # only the first two arrays handed back by the helper are needed here
    lat, lon, _, _ = _reset_and_create_arrays_none(
        data, reset_index=reset_index
    )

    hashes = data[[label_geohash]]
    for position, record in progress_bar(hashes.iterrows(),
                                         total=data.shape[0]):
        decoded = _decode(record[label_geohash])
        lat[position], lon[position] = decoded[0], decoded[1]

    data[LATITUDE_DECODE] = lat
    data[LONGITUDE_DECODE] = lon

    banner = (
        '\n================================================',
        '\n==> lat and lon decode features was created. <==',
        '\n================================================',
    )
    for line in banner:
        print(line)
Пример #4
0
def join_collective_areas(gdf_: DataFrame,
                          gdf_rules_: DataFrame,
                          label_geometry: Optional[Text] = GEOMETRY):
    """
    Flag the trajectory points that fall inside a collective area.

    A boolean VIOLATING column is added to ``gdf_`` (in place). It starts as
    False and is set to True for every row whose geometry intersects at
    least one of the distinct geometries of ``gdf_rules_``.

    Parameters
    ----------
    gdf_ : geopandas.GeoDataFrame
        The input trajectory data
    gdf_rules_ : geopandas.GeoDataFrame
        The input collective areas data
    label_geometry : str, optional
        Name of the geometry column, read from both ``gdf_`` and
        ``gdf_rules_``, by default GEOMETRY

    """

    print('Integration between trajectories and collectives areas')

    polygons = gdf_rules_[label_geometry].unique()
    gdf_[VIOLATING] = False
    for p in progress_bar(polygons):
        intersects = gdf_[label_geometry].intersects(p)
        # `.at` only addresses a single scalar cell; a boolean mask with
        # `.loc` is the supported way to update several rows at once.
        gdf_.loc[intersects, VIOLATING] = True
Пример #5
0
    def create_all_polygons_on_grid(self):
        """
        Build the polygon of every cell of the grid.

        The result is a 2-D object array (rows follow latitude, columns
        follow longitude) saved in the `grid_polygon` attribute.

        """
        operation = begin_operation('create_all_polygons_on_grid')

        logger.debug('\nCreating all polygons on virtual grid')
        n_rows = self.grid_size_lat_y
        n_cols = self.grid_size_lon_x
        step = self.cell_size_by_degree

        cells = np.array([[None] * n_cols for _ in range(n_rows)])

        lat_low = self.lat_min_y
        for row in progress_bar(range(n_rows), desc='Creating polygons'):
            lat_high = lat_low + step
            lon_low = self.lon_min_x
            for col in range(n_cols):
                lon_high = lon_low + step
                # Build the polygon of the cell from its four corners
                corners = (
                    (lon_low, lat_low),
                    (lon_low, lat_high),
                    (lon_high, lat_high),
                    (lon_high, lat_low),
                )
                cells[row][col] = Polygon(corners)
                lon_low = lon_high
            lat_low = lat_high

        self.grid_polygon = cells
        logger.debug('...geometries saved on Grid grid_polygon property')
        self.last_operation = end_operation(operation)
Пример #6
0
def join_with_pois(data: DataFrame,
                   df_pois: DataFrame,
                   label_id: Optional[Text] = TRAJ_ID,
                   label_poi_name: Optional[Text] = NAME_POI,
                   reset_index: Optional[bool] = True):
    """
    Attach to every trajectory point its closest point of interest.

    Three columns are written to ``data`` (in place): ID_POI, DIST_POI and
    NAME_POI, holding the id, the haversine distance and the name of the
    nearest POI of each point.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    df_pois : DataFrame
        The input point of interest data.
    label_id : str, optional
        Column of df_pois holding the Point of Interest id,
        by default TRAJ_ID
    label_poi_name : str, optional
        Column of df_pois holding the Point of Interest name,
        by default NAME_POI
    reset_index : bool, optional
        Flag for reset index of the df_pois and data dataframes before the
        join, by default True

    """

    print('Integration with POIs...')

    values = _reset_and_creates_id_and_lat_lon(data, df_pois, True,
                                               reset_index)
    current_distances, ids_POIs, tag_POIs, lat_user, lon_user = values

    # The POI columns never change inside the loop: extract them once.
    lat_pois = df_pois[LATITUDE].values
    lon_pois = df_pois[LONGITUDE].values
    poi_ids = df_pois[label_id].values
    poi_names = df_pois[label_poi_name].values

    for idx, row in progress_bar(data.iterrows(), total=len(data)):
        # repeat the point coordinates so they can be compared to all POIs
        lat_user.fill(row[LATITUDE])
        lon_user.fill(row[LONGITUDE])

        # distances from the current point to every POI
        distances = np.float64(
            haversine(lat_user, lon_user, lat_pois, lon_pois)
        )

        # position of the closest POI and the matching distance
        index_min = np.argmin(distances)
        current_distances[idx] = distances[index_min]

        # argmin is a position, not an index label, so the POI attributes
        # are read positionally; this also works when the index of df_pois
        # is not 0..n-1.
        ids_POIs[idx] = poi_ids[index_min]
        tag_POIs[idx] = poi_names[index_min]

    data[ID_POI] = ids_POIs
    data[DIST_POI] = current_distances
    data[NAME_POI] = tag_POIs

    print('Integration with POI was finalized')
Пример #7
0
def join_collective_areas(gdf_, gdf_rules_, label_geometry=GEOMETRY):
    """
    Flag the trajectory points that fall inside a collective area.

    A boolean VIOLATING column is added to ``gdf_`` (in place). It starts as
    False and is set to True for every row whose geometry intersects at
    least one of the distinct geometries of ``gdf_rules_``.

    Parameters
    ----------
    gdf_ : geopandas.GeoDataFrame
        The input trajectory data

    gdf_rules_ : geopandas.GeoDataFrame
        The input collective areas data

    label_geometry: String, optional("geometry" by default)
        Name of the geometry column, read from both ``gdf_`` and
        ``gdf_rules_``

    """

    print('Integration between trajectories and collectives areas')

    polygons = gdf_rules_[label_geometry].unique()
    gdf_[VIOLATING] = False
    for p in progress_bar(polygons):
        intersects = gdf_[label_geometry].intersects(p)
        # `.at` only addresses a single scalar cell; a boolean mask with
        # `.loc` is the supported way to update several rows at once.
        gdf_.loc[intersects, VIOLATING] = True
Пример #8
0
def create_bin_geohash_df(df_, precision=15):
    """
    Create trajectory geohash binaries and integrate with df.

    The result of ``_bin_geohash`` for every (lat, lon) pair is stored in
    the BIN_GEOHASH column of ``df_`` (in place); nothing is returned.

    Parameters
    ----------
    df_ : dataframe
        The input trajectories data.
    precision : number, optional, default 15
        Number of characters in resulting geohash.
    """
    # The former `try/except Exception as e: raise e` wrapper was a no-op
    # and has been dropped; errors propagate unchanged.
    _, _, _, bin_geohash = _reset_and_create_arrays_none(df_)

    for idx, row in progress_bar(df_[[LATITUDE, LONGITUDE]].iterrows(),
                                 total=df_.shape[0]):
        bin_geohash[idx] = _bin_geohash(row[LATITUDE], row[LONGITUDE],
                                        precision)

    df_[BIN_GEOHASH] = bin_geohash
    print('\n================================================')
    print('\n=====> bin_geohash features was created. <======')
    print('\n================================================')
Пример #9
0
def decode_geohash_to_latlon(df_, label_geohash=GEOHASH, reset_index=True):
    """
    Decode feature with hash of trajectories back to
    geographic coordinates.

    The decoded values are written to the LATITUDE_DECODE and
    LONGITUDE_DECODE columns of ``df_`` (in place); nothing is returned.

    Parameters
    ----------
    df_ : dataframe
        The input trajectories data.
    label_geohash : str, optional, default 'geohash'
        The name of the feature with hashed trajectories
    reset_index : boolean, optional, default True
        Condition to reset the df index.

    Raises
    ------
    ValueError
        If ``label_geohash`` is not a column of ``df_``.
    """
    # The former `try/except Exception as e: raise e` wrapper was a no-op
    # and has been dropped; errors propagate unchanged.
    if label_geohash not in df_:
        raise ValueError('feature {} not in df'.format(label_geohash))

    lat, lon, _, _ = _reset_and_create_arrays_none(df_,
                                                   reset_index=reset_index)

    for idx, row in progress_bar(df_[[label_geohash]].iterrows(),
                                 total=df_.shape[0]):
        lat_lon = _decode(row[label_geohash])
        lat[idx] = lat_lon[0]
        lon[idx] = lat_lon[1]

    df_[LATITUDE_DECODE] = lat
    df_[LONGITUDE_DECODE] = lon
    print('\n================================================')
    print('\n==> lat and lon decode features was created. <==')
    print('\n================================================')
Пример #10
0
def column_to_array(df_, label_conversion):
    """
    Convert every value of one column with ``object_for_array``.

    The column is replaced in place by the converted values; nothing is
    returned.

    Parameters
    ----------
    df_ : dataframe
        The input trajectory data.

    label_conversion : Object
        Label of df_ referring to the column for conversion.

    Raises
    ------
    KeyError
        If ``label_conversion`` is not a column of ``df_``.
    """
    # The former `try/except Exception as e: raise e` wrapper was a no-op
    # and has been dropped; errors propagate unchanged.
    if label_conversion not in df_:
        raise KeyError('Dataframe must contain a %s column' %
                       label_conversion)

    # One generic object slot per row (dtype=np.ndarray also resolved to
    # the object dtype, `object` just states it explicitly).
    arr = np.full(df_.shape[0], None, dtype=object)
    # NOTE(review): `idx` is the index label used as an array position, so
    # this assumes df_ has a 0..n-1 index - confirm against callers.
    for idx, row in progress_bar(df_.iterrows(), total=df_.shape[0]):
        arr[idx] = object_for_array(row[label_conversion])

    df_[label_conversion] = arr
Пример #11
0
def gap_statistic(move_data,
                  nrefs=3,
                  k_initial=1,
                  max_clusters=15,
                  k_iteration=1):
    """
    Calculates optimal clusters numbers using Gap Statistic from Tibshirani,
    Walther, Hastie.

    For every tested k, KMeans is fitted on ``nrefs`` random reference
    datasets with the same shape as ``move_data`` and on ``move_data``
    itself. The gap is the log of the mean reference inertia minus the log
    of the inertia of the original data.

    Parameters
    ----------
    move_data: ndarry of shape (n_samples, n_features).
        The input trajectory data.
    nrefs: int, optional (3 by default).
        number of sample reference datasets to create
    k_initial: int, optional (1 by default).
        First number of clusters to test.
    max_clusters: int, optional (15  by default).
        Maximum number of clusters to test for (inclusive).
    k_iteration:int, optional (1 by default).
        Increment between two consecutive tested numbers of clusters.

    Returns
    -------
    dict
        The gap value for each tested number of clusters

    Notes
    -----
    https://anaconda.org/milesgranger/gap-statistic/notebook

    """

    # The placeholders used to be '%srs', which printed a stray "rs" after
    # every number.
    message = 'Executing Gap Statistic to:\n...K of %s to %s from k_iteration:%s\n'
    message = message % (k_initial, max_clusters, k_iteration)
    print(message, flush=True)
    gaps = {}
    for k in progress_bar(range(k_initial, max_clusters + 1, k_iteration)):
        # Holder for reference dispersion results
        ref_disps = np.zeros(nrefs)
        # For n references, generate random sample and perform kmeans
        # getting resulting dispersion of each loop
        for ref_index in range(nrefs):
            # Create new random reference set (uniform samples in [0, 1))
            random_reference = np.random.random_sample(size=move_data.shape)
            # Fit to it
            km = KMeans(k)
            km.fit(random_reference)
            ref_disps[ref_index] = km.inertia_
        # Fit cluster to original data and create dispersion
        km = KMeans(k).fit(move_data)
        orig_disp = km.inertia_
        # Calculate gap statistic
        gap = np.log(np.mean(ref_disps)) - np.log(orig_disp)
        # Assign this loop's gap statistic to gaps
        gaps[k] = gap

    return gaps
Пример #12
0
def generate_trajectories_df(
    data: 'PandasMoveDataFrame' | 'DaskMoveDataFrame',
    label_tid: Text = TID,
    min_points_traj: int = 3
) -> DataFrame:
    """
    Generates a dataframe with the sequence of location points of a trajectory.

    Every trajectory with at least ``min_points_traj`` points becomes one
    row of the result, each column holding the list of that trajectory's
    values for the column.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.
    label_tid: String, optional
        Label referring to the ID of the trajectories, by default TID
    min_points_traj: Number, optional
        Minimum points per trajectory, by default 3

    Return
    ------
    DataFrame
        DataFrame of the trajectories

    Raises
    ------
    ValueError
        If ``label_tid`` is not a column of ``data``.

    Example
    -------
    >>> from pymove.utils.data_augmentation import generate_trajectories_df
    >>>
    >>> df
      id             datetime  local         lat          lon          tid
    0  1  2017-09-02 21:59:34    162  -3.8431323  -38.5933142  12017090221
    1  1  2017-09-02 22:00:27     85  -3.8347478  -38.5921890  12017090222
    2  1  2017-09-02 22:01:36    673  -3.8235834  -38.5903890  12017090222
    3  1  2017-09-02 22:03:08    394  -3.8138890  -38.5904445  12017090222
    4  1  2017-09-02 22:03:46    263  -3.9067654  -38.5907723  12017090222
    5  1  2017-09-02 22:07:19    224  -3.8857223  -38.5928892  12017090222
    6  1  2017-09-02 22:07:40    623  -3.8828723  -38.5929789  12017090222
    >>>
    >>> traj_df = generate_trajectories_df(df)
    >>> traj_df.local
    0    [85, 673, 394, 263, 224, 623]
    Name: local, dtype: object

    """
    if label_tid not in data:
        raise ValueError(f'{label_tid} not in DataFrame')

    trajectory_ids = data[label_tid].unique()
    tracked_ids = progress_bar(
        trajectory_ids,
        desc='Gererating Trajectories DataFrame',
        total=len(trajectory_ids),
    )

    rows = []
    for trajectory_id in tracked_ids:
        points = data[data[label_tid] == trajectory_id]
        # trajectories that are too short are left out
        if points.shape[0] < min_points_traj:
            continue
        # transpose so that each column contributes one list of values
        rows.append(points.T.values.tolist())

    return pd.DataFrame(rows, columns=data.columns)
Пример #13
0
def dbscan_clustering(move_data: DataFrame,
                      cluster_by: str,
                      meters: int = 10,
                      min_sample: float = 1680 / 2,
                      earth_radius: float = EARTH_RADIUS,
                      metric: str | Callable = 'euclidean',
                      inplace: bool = False) -> DataFrame | None:
    """
    Performs density based clustering on the move_dataframe according to cluster_by.

    The rows are split by the distinct values of ``cluster_by`` and DBSCAN
    is run on the (lat, lon) pairs of every split. The labels are written
    to the N_CLUSTER column, shifted for every split by the current maximum
    of that column plus one.

    Parameters
    ----------
    move_data : dataframe
        the input trajectory
    cluster_by : str
        the colum to cluster
    meters : int, optional
        distance to use in the clustering, by default 10
    min_sample : float, optional
        the minimum number of samples to consider a cluster, by default 1680/2
    earth_radius : float, optional
        radius passed to ``meters_to_eps`` together with ``meters`` to
        compute the DBSCAN ``eps``, by default EARTH_RADIUS
    metric: string, or callable, optional
        The metric to use when calculating distance between instances in a feature array
        by default 'euclidean'
    inplace : bool, optional
        If True the operation is done on ``move_data`` itself and nothing
        is returned, by default False

    Returns
    -------
    DataFrame
        Clustered dataframe or None
    """
    if not inplace:
        move_data = move_data[:]
    move_data.reset_index(drop=True, inplace=True)

    move_data[N_CLUSTER] = -1

    # Recent scikit-learn releases validate `min_samples` as an integer, so
    # the float default (840.0) is converted when it holds a whole number.
    if isinstance(min_sample, float) and min_sample.is_integer():
        min_sample = int(min_sample)

    # eps does not depend on the group: compute it once
    eps = meters_to_eps(meters, earth_radius)

    for cluster_id in progress_bar(move_data[cluster_by].unique(),
                                   desc='Clustering'):

        in_group = move_data[cluster_by] == cluster_id
        coordinates = move_data.loc[in_group, [LATITUDE, LONGITUDE]].to_numpy()

        dbscan = DBSCAN(eps=eps, min_samples=min_sample, metric=metric)
        labels = dbscan.fit(coordinates).labels_

        # NOTE(review): DBSCAN marks noise with -1; after the shift below
        # the noise of a later group gets the same number as the highest
        # cluster assigned so far - confirm this is intended.
        res = labels + move_data[N_CLUSTER].max() + 1
        # `.at` only addresses a single scalar cell; `.loc` with the mask
        # updates all rows of the group.
        move_data.loc[in_group, N_CLUSTER] = res

    if not inplace:
        return move_data
Пример #14
0
def decode_geohash_to_latlon(data: DataFrame,
                             label_geohash: str = GEOHASH,
                             reset_index: bool = True):
    """
    Decode feature with hash of trajectories back to geographic coordinates.

    The operation is done in place: the decoded values are stored in the
    LATITUDE_DECODE and LONGITUDE_DECODE columns of ``data`` and the
    function itself returns None.

    Parameters
    ----------
    data : dataframe
        The input trajectories data
    label_geohash : str, optional
        The name of the feature with hashed trajectories, by default GEOHASH
    reset_index : boolean, optional
        Condition to reset the df index, by default True

    Raises
    ------
    ValueError
        If ``label_geohash`` is not a column of ``data``.

    Example
    -------
    >>> from pymove.utils.geoutils import decode_geohash_to_latlon
    >>> geoLife_df
              lat          lon           geohash
    0   39.984094   116.319236   wx4eqyvh4xkg0xs
    1   39.984198   116.319322   wx4eqyvhudszsev
    2   39.984224   116.319402   wx4eqyvhyx8d9wc
    3   39.984211   116.319389   wx4eqyvhyjnv5m7
    4   39.984217   116.319422   wx4eqyvhyyr2yy8
    >>> print(type(decode_geohash_to_latlon(geoLife_df)))
    <class 'NoneType'>
    >>> geoLife_df
              lat          lon           geohash  lat_decode   lon_decode
    0   39.984094   116.319236   wx4eqyvh4xkg0xs   39.984094   116.319236
    1   39.984198   116.319322   wx4eqyvhudszsev   39.984198   116.319322
    2   39.984224   116.319402   wx4eqyvhyx8d9wc   39.984224   116.319402
    3   39.984211   116.319389   wx4eqyvhyjnv5m7   39.984211   116.319389
    4   39.984217   116.319422   wx4eqyvhyyr2yy8   39.984217   116.319422
    """
    if label_geohash not in data:
        raise ValueError(f'feature {label_geohash} not in df')

    # only the first two arrays handed back by the helper are needed here
    lat, lon, _, _ = _reset_and_create_arrays_none(
        data, reset_index=reset_index
    )

    hashes = data[[label_geohash]]
    for position, record in progress_bar(hashes.iterrows(),
                                         total=data.shape[0]):
        decoded = _decode(record[label_geohash])
        lat[position], lon[position] = decoded[0], decoded[1]

    data[LATITUDE_DECODE] = lat
    data[LONGITUDE_DECODE] = lon
Пример #15
0
def elbow_method(move_data: DataFrame,
                 k_initial: int = 1,
                 max_clusters: int = 15,
                 k_iteration: int = 1,
                 random_state: int | None = None) -> dict:
    """
    Computes the KMeans inertia for a range of numbers of clusters.

    KMeans is fitted on the LATITUDE and LONGITUDE columns for every k from
    ``k_initial`` to ``max_clusters`` (inclusive) in steps of
    ``k_iteration``. The resulting inertias are the input of the elbow
    method.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data.
    k_initial: int, optional
        First number of clusters to test, by default 1
    max_clusters: int, optional
        Maximum number of clusters to test for, by default 15
    k_iteration: int, optional
        Increment value of the sequence used by the elbow method, by default 1
    random_state: int, RandomState instance
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic, by default None

    Returns
    -------
    dict
        The inertia values for the different numbers of clusters

    Example
    -------
    clustering.elbow_method(move_data=move_df, k_iteration=3)
        {
            1: 55084.15957839036,
            4: 245.68365592382938,
            7: 92.31472644640075,
            10: 62.618599956870355,
            13: 45.59653757292055,
        }

    """
    logger.debug(
        f'Executing Elbow Method for {k_initial} to {max_clusters} '
        f'clusters at {k_iteration} steps\n'
    )

    cluster_counts = range(k_initial, max_clusters + 1, k_iteration)
    return {
        n_clusters: KMeans(
            n_clusters=n_clusters, random_state=random_state
        ).fit(move_data[[LATITUDE, LONGITUDE]]).inertia_
        for n_clusters in progress_bar(cluster_counts, desc='Running KMeans')
    }
Пример #16
0
def elbow_method(move_data,
                 k_initial=1,
                 max_clusters=15,
                 k_iteration=1,
                 random_state=None):
    """
    Computes the KMeans inertia for a range of numbers of clusters, the
    input of the elbow method.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data.
    k_initial: int, optional (1 by default).
        First number of clusters to test.
    max_clusters: int, optional (15  by default).
        Maximum number of clusters to test for (inclusive).
    k_iteration: int, optional (1 by default).
        Increment value of the sequence used by the elbow method.
    random_state: int, RandomState instance, default=None
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic

    Returns
    -------
    dict
        The inertia values for the different numbers of clusters

    Example
    -------
    clustering.elbow_method(move_data=move_df[['lat', 'lon']], k_iteration=3)
        {
            1: 55084.15957839036,
            4: 245.68365592382938,
            7: 92.31472644640075,
            10: 62.618599956870355,
            13: 45.59653757292055,
        }

    """

    # The placeholders used to be '%srs', which printed a stray "rs" after
    # every number.
    message = 'Executing Elbow Method to:\n...K of %s to %s from k_iteration:%s\n'
    message = message % (k_initial, max_clusters, k_iteration)
    print(message, flush=True)
    inertia_dic = {}
    for k in progress_bar(range(k_initial, max_clusters + 1, k_iteration)):
        km = KMeans(n_clusters=k, random_state=random_state)
        inertia_dic[k] = km.fit(move_data).inertia_
    return inertia_dic
Пример #17
0
def generate_trajectories_df(
    data: Union['PandasMoveDataFrame', 'DaskMoveDataFrame']
) -> DataFrame:
    """
    Generates a dataframe with one row per trajectory.

    When ``data`` has no TID column it is generated first (and the index of
    ``data`` is reset). Trajectories with more than four points are kept:
    a column with a single distinct value is stored as that value, any other
    column as the list of its values.

    Parameters
    ----------
    data : DataFrame
        The input trajectory data.

    Return
    ------
    DataFrame
        DataFrame of the trajectories

    """
    if TID not in data:
        data.generate_tid_based_on_id_datetime()
        data.reset_index(drop=True, inplace=True)

    trajectory_ids = data[TID].unique()
    new_df = pd.DataFrame(columns=data.columns)

    for trajectory_id in progress_bar(trajectory_ids,
                                      total=len(trajectory_ids)):
        filter_ = data[data[TID] == trajectory_id]
        filter_.reset_index(drop=True, inplace=True)

        # trajectories with four points or fewer are discarded
        if filter_.shape[0] <= 4:
            continue

        values = [
            filter_.at[0, col]
            if filter_[col].nunique() == 1
            else np.array(
                filter_[col], dtype=type(filter_.at[0, col])
            ).tolist()
            for col in filter_.columns
        ]

        append_row(new_df, row=pd.Series(values, filter_.columns))

    return new_df
Пример #18
0
    def create_all_polygons_to_all_point_on_grid(self, data, unique_index=True):
        """
        Create the polygon of every distinct (id, grid cell) pair of the data.

        Parameters
        ----------
        data : pandas.core.frame.DataFrame
            Represents the dataset with contains lat, long and datetime.
        unique_index: boolean
            Currently ignored: the grid index is always created with
            ``unique_index=False`` because the 'index_grid_lat' and
            'index_grid_lon' columns are read afterwards.

        Returns
        -------
        pandas.core.frame.DataFrame
            The distinct ('id', 'index_grid_lat', 'index_grid_lon') rows of
            the dataset with a new key 'polygon' where polygons were saved.

        """

        operation = begin_operation('create_all_polygons_to_all_point_on_grid')

        try:
            self.create_update_index_grid_feature(data, unique_index=False)
            datapolygons = data.loc[
                :, ['id', 'index_grid_lat', 'index_grid_lon']
            ].drop_duplicates()
            size = datapolygons.shape[0]

            # Read the cell indices from the deduplicated frame so that the
            # i-th polygon belongs to the i-th returned row (reading them
            # from `data` misaligned the two once duplicates were dropped).
            index_grid_lat = np.array(datapolygons['index_grid_lat'])
            index_grid_lon = np.array(datapolygons['index_grid_lon'])

            # a plain list avoids re-allocating an array on every iteration
            polygons = []
            for i in progress_bar(range(size)):
                polygons.append(
                    self.create_one_polygon_to_point_on_grid(
                        index_grid_lat[i], index_grid_lon[i]
                    )
                )
            print('...polygons were created')
            datapolygons['polygon'] = polygons
            return datapolygons
        finally:
            # Record the operation on success and on failure alike; any
            # exception keeps propagating with its original traceback.
            self.last_operation = end_operation(operation)
Пример #19
0
def create_bin_geohash_df(data: DataFrame, precision: float = 15):
    """
    Create trajectory geohash binaries and integrate with df.

    The operation is done in place: the result of ``_bin_geohash`` for every
    (lat, lon) pair is stored in the BIN_GEOHASH column of ``data`` and the
    function itself returns None.

    Parameters
    ----------
    data : dataframe
        The input trajectories data
    precision : float, optional
        Number of characters in resulting geohash, by default 15

    Example
    -------
    >>> from pymove.utils.geoutils import create_bin_geohash_df
    >>> geoLife_df
             lat          lon
    0   39.984094   116.319236
    1   39.984198   116.319322
    2   39.984224   116.319402
    3   39.984211   116.319389
    4   39.984217   116.319422
    >>> print(type(create_bin_geohash_df(geoLife_df)))
    <class 'NoneType'>
    >>> geoLife_df
              lat         lon                                         bin_geohash
    0   39.984094   116.319236  [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...
    1   39.984198   116.319322  [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...
    2   39.984224   116.319402  [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...
    3   39.984211   116.319389  [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...
    4   39.984217   116.319422  [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, ...
    """
    # only the last array handed back by the helper is needed here
    *_, bin_geohash = _reset_and_create_arrays_none(data)

    coordinates = data[[LATITUDE, LONGITUDE]]
    tracked_rows = progress_bar(coordinates.iterrows(), total=data.shape[0])
    for position, point in tracked_rows:
        bin_geohash[position] = _bin_geohash(
            point[LATITUDE], point[LONGITUDE], precision
        )

    data[BIN_GEOHASH] = bin_geohash
Пример #20
0
def elbow_method(move_data, k_initial=1, max_clusters=15, k_iteration=1):
    """
    Computes the KMeans inertia for a range of numbers of clusters, the
    input of the elbow method.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data.
    k_initial: int, optional (1 by default).
        First number of clusters to test.
    max_clusters: int, optional (15  by default).
        Maximum number of clusters to test for (inclusive).
    k_iteration: int, optional (1 by default).
        Increment value of the sequence used by the elbow method.

    Returns
    -------
    inertia_dic : dictionary
        The inertia values for the different numbers of clusters

    Example
    -------
        clustering.elbow_method(move_data=move_df[['lat', 'lon']], k_initial = 2, max_clusters = 17, k_iteration = 2)
            {2: 55084.15957839036,
             4: 245.68365592382938,
             6: 92.31472644640075,
             8: 62.618599956870355,
             10: 45.59653757292055,
             12: 34.32238676029195,
             14: 26.087387367439227,
             16: 20.64369311973992}
    """

    message = "Executing Elbow Method to:\n...K of {} to {} from k_iteration:{}\n".format(
        k_initial, max_clusters, k_iteration)
    print(message, flush=True)
    inertia_dic = {}
    # `+ 1` makes max_clusters itself part of the tested values; the plain
    # range() stopped one step short of the documented maximum.
    for k in progress_bar(range(k_initial, max_clusters + 1, k_iteration)):
        inertia_dic[k] = KMeans(n_clusters=k).fit(move_data).inertia_
    return inertia_dic
Пример #21
0
def create_geohash_df(data: DataFrame, precision: float = 15):
    """
    Create geohash from geographic coordinates and integrate with df.

    The operation is done in place: the result of ``_encode`` for every
    (lat, lon) pair is stored in the GEOHASH column of ``data`` and the
    function itself returns None.

    Parameters
    ----------
    data : dataframe
        The input trajectories data
    precision : float, optional
        Number of characters in resulting geohash, by default 15

    Example
    -------
    >>> from pymove.utils.geoutils import create_geohash_df, _reset_and_create_arrays_none
    >>> geoLife_df
              lat          lon
    0   39.984094   116.319236
    1   39.984198   116.319322
    2   39.984224   116.319402
    3   39.984211   116.319389
    4   39.984217   116.319422
    >>> print(type (create_geohash_df(geoLife_df)))
    <class 'NoneType'>
    >>> geoLife_df
              lat          lon           geohash
    0   39.984094   116.319236   wx4eqyvh4xkg0xs
    1   39.984198   116.319322   wx4eqyvhudszsev
    2   39.984224   116.319402   wx4eqyvhyx8d9wc
    3   39.984211   116.319389   wx4eqyvhyjnv5m7
    4   39.984217   116.319422   wx4eqyvhyyr2yy8
    """
    # only the third array handed back by the helper is needed here
    _, _, geohash, _ = _reset_and_create_arrays_none(data)

    coordinates = data[[LATITUDE, LONGITUDE]]
    tracked_rows = progress_bar(coordinates.iterrows(), total=data.shape[0])
    for position, point in tracked_rows:
        geohash[position] = _encode(
            point[LATITUDE], point[LONGITUDE], precision
        )

    data[GEOHASH] = geohash
Пример #22
0
def flatten_trajectories_dataframe(traj_df: DataFrame) -> DataFrame:
    """
    Flatten a frame of per-trajectory lists into one row per point.

    Every row of ``traj_df`` is turned into its own frame with
    ``pd.DataFrame(row.to_dict())`` and the pieces are stacked in the order
    the rows are iterated. The index of ``traj_df`` is not used to look the
    pieces up, so any index (filtered, non-numeric, with duplicates) works.

    Parameters
    ----------
    traj_df : DataFrame
        The input trajectories data. Each cell is expected to hold a sequence
        (see the example); the cells of one row must be accepted together by
        the ``pd.DataFrame`` constructor.

    Return
    ------
    DataFrame
        Flat trajectories, re-indexed from 0. When ``traj_df`` has no rows an
        empty frame with the same columns is returned.

    Example
    -------
    >>> from pymove.utils.data_augmentation import flatten_trajectories_dataframe
    >>>
    >>> traj_df
                 id                 local
    0     [1, 1, 1]        [85, 673, 394]
    1  [2, 2, 2, 2]  [263, 224, 623, 515]
    >>>
    >>> flatten_trajectories_dataframe(traj_df)
       id  local
    0   1     85
    1   1    673
    2   1    394
    3   2    263
    4   2    224
    5   2    623
    6   2    515

    """
    # Collect the pieces in iteration order instead of keying them by index
    # label: reading them back with range(len(...)) broke on any index that
    # was not exactly 0..n-1 and silently dropped rows with duplicate labels.
    frames = [
        pd.DataFrame(row.to_dict())
        for _, row in progress_bar(traj_df.iterrows(), total=traj_df.shape[0])
    ]

    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame(columns=traj_df.columns)

    return pd.concat(frames, ignore_index=True)
Пример #23
0
def create_bin_geohash_df(data: DataFrame, precision: Optional[float] = 15):
    """
    Compute the binary geohash of every point and store it in the frame.

    The values produced by ``_bin_geohash`` are written to the
    ``BIN_GEOHASH`` column of ``data`` itself; nothing is returned. A banner
    is printed to stdout once the column has been written.

    Parameters
    ----------
    data : dataframe
        The input trajectories data. It is modified in place: it is first
        handed to ``_reset_and_create_arrays_none`` and then receives the
        ``BIN_GEOHASH`` column.
        NOTE(review): judging by its name, ``_reset_and_create_arrays_none``
        may also reset the index of ``data`` - confirm against the helper.
    precision : float, optional
        Number of characters in resulting geohash, by default 15
    """
    # Only the fourth array returned by the helper (the binary buffer) is used.
    _, _, _, binaries = _reset_and_create_arrays_none(data)

    coordinates = data[[LATITUDE, LONGITUDE]]
    n_points = data.shape[0]
    for label, point in progress_bar(coordinates.iterrows(), total=n_points):
        # The row label doubles as the slot in the pre-allocated buffer.
        binaries[label] = _bin_geohash(
            point[LATITUDE], point[LONGITUDE], precision
        )

    data[BIN_GEOHASH] = binaries

    banner = (
        '\n================================================',
        '\n=====> bin_geohash features was created. <======',
        '\n================================================',
    )
    for banner_line in banner:
        print(banner_line)
Пример #24
0
def knn_query(
    traj: DataFrame,
    move_df: DataFrame,
    k: Optional[int] = 5,
    id_: Optional[Text] = TRAJ_ID,
    distance: Optional[Text] = MEDP,
    latitude: Optional[Text] = LATITUDE,
    longitude: Optional[Text] = LONGITUDE,
    datetime: Optional[Text] = DATETIME
) -> DataFrame:
    """
    Return the k trajectories of ``move_df`` closest to ``traj``.

    Every distinct id found in ``move_df[id_]`` is treated as one candidate
    trajectory, except the id equal to the first id of ``traj``
    (``traj[id_].values[0]``), which is skipped. The distance from ``traj``
    to each candidate is computed with the selected measure and the ``k``
    candidates with the smallest distance are kept.

    Parameters
    ----------
    traj: dataframe
        The input of one trajectory.
    move_df: dataframe
        The input trajectory data.
    k: int, optional
        Number of neighboring trajectories, by default 5.
        Values below 1 select no neighbours.
    id_: str, optional
        Label of the trajectories dataframe user id, by default TRAJ_ID
    distance: string, optional
        Distance measure type, by default MEDP
    latitude: string, optional
        Label of the trajectories dataframe referring to the latitude,
        by default LATITUDE
    longitude: string, optional
        Label of the trajectories dataframe referring to the longitude,
        by default LONGITUDE
    datetime: string, optional
        Label of the trajectories dataframe referring to the timestamp,
        by default DATETIME. Only used by the MEDT measure.

    Returns
    -------
    DataFrame
        A copy of ``traj`` followed by the rows of the nearest trajectories,
        ordered from the closest to the farthest. Fewer than ``k``
        trajectories are returned when ``move_df`` holds fewer candidates.

    Raises
    ------
        ValueError: if distance measure is invalid

    """
    if distance == MEDP:
        def dist_measure(this):
            return distances.MEDP(traj, this, latitude, longitude)
    elif distance == MEDT:
        def dist_measure(this):
            return distances.MEDT(traj, this, latitude, longitude, datetime)
    else:
        raise ValueError('Unknown distance measure. Use MEDP or MEDT')

    candidates = []
    for traj_id in progress_bar(
        move_df[id_].unique(), desc='Querying knn by {}'.format(distance)
    ):
        if traj_id != traj[id_].values[0]:
            this = move_df.loc[move_df[id_] == traj_id]
            this_distance = dist_measure(this)
            # Same admission rule as before: a candidate had to beat an
            # initial slot value of infinity, so NaN and infinite distances
            # never qualify.
            if this_distance < np.inf:
                candidates.append((this_distance, traj_id))

    # The previous slot-replacement loop overwrote the neighbour stored in a
    # slot instead of shifting it down, so closer candidates evicted valid
    # neighbours (distances 5, 3, 1 with k=2 kept only 1). Sorting all
    # candidates selects the true k nearest; sorted() is stable, so ties keep
    # the order in which the ids were encountered.
    nearest = sorted(candidates, key=lambda pair: pair[0])[:max(k, 0)]

    print('Gerando DataFrame com as k trajetórias mais próximas')
    # A single concat replaces repeated DataFrame.append calls
    # (DataFrame.append was removed in pandas 2.0).
    frames = [traj.copy()]
    frames.extend(
        move_df.loc[move_df[id_] == neighbour_id]
        for _, neighbour_id in nearest
    )
    return pd.concat(frames)
Пример #25
0
def range_query(
    traj: DataFrame,
    move_df: DataFrame,
    _id: Optional[Text] = TRAJ_ID,
    min_dist: Optional[float] = 1000,
    distance: Optional[Text] = MEDP,
    latitude: Optional[Text] = LATITUDE,
    longitude: Optional[Text] = LONGITUDE,
    datetime: Optional[Text] = DATETIME
) -> DataFrame:
    """
    Return every trajectory of ``move_df`` that lies within a given distance
    of ``traj``.

    Every distinct id found in ``move_df[_id]`` is treated as one trajectory.
    A trajectory is selected when its distance to ``traj``, computed with the
    selected measure, is strictly smaller than ``min_dist``.

    Parameters
    ----------
    traj: dataframe
        The input of one trajectory.
    move_df: dataframe
        The input trajectory data.
    _id: str, optional
        Label of the trajectories dataframe user id, by default TRAJ_ID
    min_dist: float, optional
        Distance threshold; trajectories whose distance is below it are
        returned, by default 1000
    distance: string, optional
        Distance measure type, by default MEDP
    latitude: string, optional
        Label of the trajectories dataframe referring to the latitude,
        by default LATITUDE
    longitude: string, optional
        Label of the trajectories dataframe referring to the longitude,
        by default LONGITUDE
    datetime: string, optional
        Label of the trajectories dataframe referring to the timestamp,
        by default DATETIME. Only used by the MEDT measure.

    Returns
    -------
    DataFrame
        dataframe with near trajectories. It starts from an emptied copy of
        ``traj``, so it is empty (with the columns of ``traj``) when nothing
        is close enough.

    Raises
    ------
        ValueError: if distance measure is invalid

    """
    # Local import: this block did not reference the pandas module before,
    # and pd.concat is needed to replace DataFrame.append.
    import pandas as pd

    if distance == MEDP:
        def dist_measure(this):
            return distances.MEDP(traj, this, latitude, longitude)
    elif distance == MEDT:
        def dist_measure(this):
            return distances.MEDT(traj, this, latitude, longitude, datetime)
    else:
        raise ValueError('Unknown distance measure. Use MEDP or MEDT')

    # Emptied copy of traj: seeds the result with traj's columns.
    template = traj.copy()
    template.drop(template.index, inplace=True)

    selected = [template]
    for traj_id in progress_bar(
        move_df[_id].unique(), desc='Querying range by {}'.format(distance)
    ):
        this = move_df.loc[move_df[_id] == traj_id]
        if dist_measure(this) < min_dist:
            selected.append(this)

    if len(selected) == 1:
        return template

    # One concat instead of DataFrame.append inside the loop: append was
    # removed in pandas 2.0 and copied the whole result on every match.
    return pd.concat(selected)
Пример #26
0
def _filter_by(move_data, label_id, label_new_tid, drop_single_points,
               **kwargs):
    """
    Split the trajectories into segments.

    The segmentation state comes from ``_prepare_segmentation``. For every id
    it yields, a filter is built by ``_filter_and_dist_time_speed`` (when
    ``kwargs['all']`` is truthy) or by ``_filter_or_dist_time_speed``
    (otherwise), and handed to ``_update_curr_tid_count``. The index of
    ``move_data`` is reset afterwards.

    Parameters
    ----------
    move_data : dataframe
        The input trajectory data. It is modified in place and also returned.
    label_id : String
        Indicates the label of the id column in the user's dataframe.
    label_new_tid : String
        The label of the column containing the ids of the formed segments.
        Is the new split id.
    drop_single_points : boolean
        If set to True, ``_drop_single_point`` is applied and the distance,
        time and speed features are regenerated with
        ``move_data.generate_dist_time_speed_features()``.
    **kwargs : arguments
        depends on the type of segmentation; keys are read with plain
        indexing, so the ones used by the chosen branch must be present
        - all : if is a segmentation by all features
        - max_dist : maximum dist between adjacent points (all is truthy)
        - max_time : maximum time between adjacent points (all is truthy)
        - max_speed : maximum speed between adjacent points (all is truthy)
        - feature : feature to use for segmentation (all is falsy)
        - max_between_adj_points : maximum value for feature (all is falsy)

    Returns
    -------
    dataframe
        DataFrame with the additional feature label_new_tid,
        that indicates the trajectory segment to which the point belongs to.

    Note
    ----
    Time, distance and speed features must be updated after split; this is
    only done here when drop_single_points is True.

    """
    curr_tid, ids, count = _prepare_segmentation(
        move_data, label_id, label_new_tid
    )

    for traj_key in progress_bar(ids, desc='Generating %s' % label_new_tid):
        if not kwargs['all']:
            # Segmentation driven by a single feature and its threshold.
            segment_filter = _filter_or_dist_time_speed(
                move_data,
                traj_key,
                kwargs['feature'],
                kwargs['max_between_adj_points'],
            )
        else:
            # Segmentation driven by distance, time and speed together.
            segment_filter = _filter_and_dist_time_speed(
                move_data,
                traj_key,
                kwargs['max_dist'],
                kwargs['max_time'],
                kwargs['max_speed'],
            )

        curr_tid, count = _update_curr_tid_count(
            segment_filter, move_data, traj_key, label_new_tid, curr_tid, count
        )

    if label_id != label_new_tid:
        # The old index is kept as a column.
        move_data.reset_index(inplace=True)
        print('... Reseting index\n')
    else:
        # Both labels are the same: the old index is discarded.
        move_data.reset_index(drop=True, inplace=True)
        print('... label_tid = label_new_id, then reseting and drop index')

    if drop_single_points:
        _drop_single_point(move_data, label_new_tid, label_id)
        move_data.generate_dist_time_speed_features()

    return move_data
Пример #27
0
def query_all_points_by_range(
    traj1: DataFrame,
    move_df: DataFrame,
    minimum_meters: float = 100,
    minimum_time: timedelta = None
) -> DataFrame:
    """
    Queries closest point within a spatial range based on meters and a temporal range.

    Selects only the points between two Move Dataframes
    that have the closest point within a spatial range
    based on meters and a temporal range.

    Every row of ``traj1`` is passed, together with ``move_df``, through
    ``_meters_filter`` and then ``_datetime_filter``; the rows that survive
    both filters are collected.

    Parameters
    ----------
    traj1: dataframe
        The input of a trajectory data.
    move_df: dataframe
        The input of another trajectory data.
    minimum_meters: float, optional
        the spatial distance threshold, based in meters, between the points,
        by default 100
    minimum_time: datetime.timedelta, optional
        the temporal distance threshold between the points,
        by default timedelta(minutes=2)

    Returns
    -------
    DataFrame
        dataframe with all the points of move_df which are in
        a spatial distance and temporal distance equal or smaller
        than the minimum distance parameters. The matches of the last row of
        ``traj1`` come first and those of its first row come last. An empty
        DataFrame is returned when ``traj1`` has no rows.

    Examples
    --------
    >>> from pymove.query.query import query_all_points_by_range
    >>> traj_df
         lat      lon              datetime  id
    0   16.4    -54.9   2014-10-11 18:00:00   1
    1   16.4    -55.9   2014-10-12 00:00:00   1
    2   16.4    -56.9   2014-10-12 06:00:00   1
    >>> move_df
         lat      lon              datetime  id
    0   33.1    -77.0   2012-05-19 00:00:00   2
    1   32.8    -77.1   2012-05-19 06:00:00   3
    2   32.5    -77.3   2012-05-19 12:00:00   4
    >>> query_all_points_by_range(
    >>>    traj_df, move_df, minimum_meters=3190000, minimum_time=timedelta(hours=21010)
    >>> )
         lat      lon              datetime  id    spatial_distance target_id\
        target_lat  target_lon        target_datetime   temporal_distance
    0   32.5    -77.3   2012-05-19 12:00:00   4        3.182834e+06         1\
             16.4       -54.9    2014-10-11 18:00:00   875 days 06:00:00
    """
    # Local import: this block did not reference the pandas module before,
    # and pd.concat is needed to replace DataFrame.append.
    import pandas as pd

    if minimum_time is None:
        minimum_time = timedelta(minutes=2)

    matches = []
    for _, row in progress_bar(
        traj1.iterrows(),
        desc='Querying all points by temporal and spatial distance',
        total=traj1.shape[0]
    ):
        coinc_points = _meters_filter(row, move_df, minimum_meters)
        matches.append(_datetime_filter(row, coinc_points, minimum_time))

    if not matches:
        return DataFrame([])

    # The former loop put each new batch in front of the accumulated result
    # with DataFrame.append (removed in pandas 2.0, and a full copy per row).
    # Concatenating the batches once, in reverse, yields the same row order.
    return pd.concat(matches[::-1])
Пример #28
0
def range_query(
    traj: DataFrame,
    move_df: DataFrame,
    _id: str = TRAJ_ID,
    min_dist: float = 1000,
    distance: str = MEDP,
    latitude: str = LATITUDE,
    longitude: str = LONGITUDE,
    datetime: str = DATETIME
) -> DataFrame:
    """
    Returns all trajectories whose distance to the trajectory is below a threshold.

    Given a distance, a trajectory, and a DataFrame with several trajectories.
    Every distinct id found in ``move_df[_id]`` is treated as one trajectory
    and is selected when its distance to ``traj``, computed with the selected
    measure, is strictly smaller than ``min_dist``.

    Parameters
    ----------
    traj: dataframe
        The input of one trajectory.
    move_df: dataframe
        The input trajectory data.
    _id: str, optional
        Label of the trajectories dataframe user id, by default TRAJ_ID
    min_dist: float, optional
        Distance threshold; trajectories whose distance is below it are
        returned, by default 1000
    distance: string, optional
        Distance measure type, by default MEDP
    latitude: string, optional
        Label of the trajectories dataframe referring to the latitude,
        by default LATITUDE
    longitude: string, optional
        Label of the trajectories dataframe referring to the longitude,
        by default LONGITUDE
    datetime: string, optional
        Label of the trajectories dataframe referring to the timestamp,
        by default DATETIME. Only used by the MEDT measure.

    Returns
    -------
    DataFrame
        dataframe with near trajectories. It starts from an emptied copy of
        ``traj``, so it is empty (with the columns of ``traj``) when nothing
        is close enough.

    Raises
    ------
        ValueError: if distance measure is invalid

    Examples
    --------
    >>> from pymove.query.query import range_query
    >>> traj_df
         lat      lon              datetime  id
    0   16.4    -54.9   2014-10-11 18:00:00   1
    1   16.4    -55.9   2014-10-12 00:00:00   1
    2   16.4    -56.9   2014-10-12 06:00:00   1
    >>> move_df
         lat      lon              datetime  id
    0   33.1    -77.0   2012-05-19 00:00:00   2
    1   32.8    -77.1   2012-05-19 06:00:00   3
    2   32.5    -77.3   2012-05-19 12:00:00   4
    >>> range_query(
    >>>    traj_df, move_df, min_dist=80.5
    >>> )
         lat      lon              datetime  id
    1   32.8    -77.1   2012-05-19 06:00:00   3
    2   32.5    -77.3   2012-05-19 12:00:00   4
    """
    # Local import: this block did not reference the pandas module before,
    # and pd.concat is needed to replace DataFrame.append.
    import pandas as pd

    if distance == MEDP:
        def dist_measure(this):
            return distances.medp(traj, this, latitude, longitude)
    elif distance == MEDT:
        def dist_measure(this):
            return distances.medt(traj, this, latitude, longitude, datetime)
    else:
        raise ValueError('Unknown distance measure. Use MEDP or MEDT')

    # Emptied copy of traj: seeds the result with traj's columns.
    template = traj.copy()
    template.drop(template.index, inplace=True)

    selected = [template]
    for traj_id in progress_bar(
        move_df[_id].unique(), desc=f'Querying range by {distance}'
    ):
        this = move_df.loc[move_df[_id] == traj_id]
        if dist_measure(this) < min_dist:
            selected.append(this)

    if len(selected) == 1:
        return template

    # One concat instead of DataFrame.append inside the loop: append was
    # removed in pandas 2.0 and copied the whole result on every match.
    return pd.concat(selected)
Пример #29
0
def knn_query(
    traj: DataFrame,
    move_df: DataFrame,
    k: int = 5,
    id_: str = TRAJ_ID,
    distance: str = MEDP,
    latitude: str = LATITUDE,
    longitude: str = LONGITUDE,
    datetime: str = DATETIME
) -> DataFrame:
    """
    Returns the k neighboring trajectories closest to the trajectory.

    Given a k, a trajectory and a DataFrame with multiple paths.
    Every distinct id found in ``move_df[id_]`` is treated as one candidate
    trajectory, except the id equal to the first id of ``traj``
    (``traj[id_].values[0]``), which is skipped. The ``k`` candidates with
    the smallest distance to ``traj`` are kept.

    Parameters
    ----------
    traj: dataframe
        The input of one trajectory.
    move_df: dataframe
        The input trajectory data.
    k: int, optional
        neighboring trajectories, by default 5.
        Values below 1 select no neighbours.
    id_: str, optional
        Label of the trajectories dataframe user id, by default TRAJ_ID
    distance: string, optional
        Distance measure type, by default MEDP
    latitude: string, optional
        Label of the trajectories dataframe referring to the latitude,
        by default LATITUDE
    longitude: string, optional
        Label of the trajectories dataframe referring to the longitude,
        by default LONGITUDE
    datetime: string, optional
        Label of the trajectories dataframe referring to the timestamp,
        by default DATETIME. Only used by the MEDT measure.

    Returns
    -------
    DataFrame
        A copy of ``traj`` followed by the rows of the nearest trajectories,
        ordered from the closest to the farthest. Fewer than ``k``
        trajectories are returned when ``move_df`` holds fewer candidates.

    Raises
    ------
        ValueError: if distance measure is invalid

    Examples
    --------
    >>> from pymove.query.query import knn_query
    >>> traj_df
         lat      lon              datetime  id
    0   16.4    -54.9   2014-10-11 18:00:00   1
    1   16.4    -55.9   2014-10-12 00:00:00   1
    2   16.4    -56.9   2014-10-12 06:00:00   1
    >>> move_df
         lat      lon              datetime  id
    0   33.1    -77.0   2012-05-19 00:00:00   2
    1   32.8    -77.1   2012-05-19 06:00:00   3
    2   32.5    -77.3   2012-05-19 12:00:00   4
    >>> knn_query(
    >>>    traj_df, move_df, k=1
    >>> )
         lat      lon              datetime  id
    0   16.4    -54.9   2014-10-11 18:00:00   1
    1   16.4    -55.9   2014-10-12 00:00:00   1
    2   16.4    -56.9   2014-10-12 06:00:00   1
    2   32.5    -77.3   2012-05-19 12:00:00   4
    """
    if distance == MEDP:
        def dist_measure(this):
            return distances.medp(traj, this, latitude, longitude)
    elif distance == MEDT:
        def dist_measure(this):
            return distances.medt(traj, this, latitude, longitude, datetime)
    else:
        raise ValueError('Unknown distance measure. Use MEDP or MEDT')

    candidates = []
    for traj_id in progress_bar(
        move_df[id_].unique(), desc=f'Querying knn by {distance}'
    ):
        if traj_id != traj[id_].values[0]:
            this = move_df.loc[move_df[id_] == traj_id]
            this_distance = dist_measure(this)
            # Same admission rule as before: a candidate had to beat an
            # initial slot value of infinity, so NaN and infinite distances
            # never qualify.
            if this_distance < np.inf:
                candidates.append((this_distance, traj_id))

    # The previous slot-replacement loop overwrote the neighbour stored in a
    # slot instead of shifting it down, so closer candidates evicted valid
    # neighbours (distances 5, 3, 1 with k=2 kept only 1). Sorting all
    # candidates selects the true k nearest; sorted() is stable, so ties keep
    # the order in which the ids were encountered.
    nearest = sorted(candidates, key=lambda pair: pair[0])[:max(k, 0)]

    logger.debug('Generating DataFrame with k nearest trajectories.')
    # A single concat replaces repeated DataFrame.append calls
    # (DataFrame.append was removed in pandas 2.0).
    frames = [traj.copy()]
    frames.extend(
        move_df.loc[move_df[id_] == neighbour_id]
        for _, neighbour_id in nearest
    )
    return pd.concat(frames)
Пример #30
0
def compress_segment_stop_to_point(
    move_data: DataFrame,
    label_segment: str = SEGMENT_STOP,
    label_stop: str = STOP,
    point_mean: str = 'default',
    drop_moves: bool = False,
    label_id: str = TRAJ_ID,
    dist_radius: float = 30,
    time_radius: float = 900,
    inplace: bool = False,
) -> DataFrame:
    """
    Compress the trajectories using the stop points in the dataframe.

    Compress a segment to point setting lat_mean e lon_mean to each segment.
    For every segment that contains a stop and has more than one point, only
    its first and last points receive a representative position; all other
    points of stop segments (and single-point segments) are dropped.

    Parameters
    ----------
    move_data : dataframe
       The input trajectory data
    label_segment : String, optional
        The label of the column containing the ids of the formed segments.
        Is the new splitted id, by default SEGMENT_STOP
    label_stop : String, optional
        Is the name of the column that indicates if a point is a stop,
        by default STOP. The column is used as a boolean mask.
    point_mean : String, optional
        Indicates whether the mean points should be calculated using
        centroids ('centroid') or the point that repeat the most
        ('default'), by default 'default'.
        Any other value leaves the stop points unmarked, so they are dropped.
    drop_moves : Boolean, optional
        If set to true, the moving points will be dropped from the dataframe,
        by default False
    label_id : String, optional
         Used to create the stay points used in the compression.
         If the dataset already has the stop move, this
         parameter should be ignored.
         Indicates the label of the id column in the user dataframe,
         by default TRAJ_ID
    dist_radius : Double, optional
        Used to create the stay points used in the compression, by default 30
        If the dataset already has the stop move, this
        parameter should be ignored.
        The first step in this function is segmenting the trajectory.
        The segments are used to find the stop points.
        The dist_radius defines the distance used in the segmentation.
    time_radius :  Double, optional
        Used to create the stay points used in the compression, by default 900
        If the dataset already has the stop move, this
        parameter should be ignored.
        The time_radius used to determine if a segment is a stop.
        If the user stayed in the segment for a time
        greater than time_radius, than the segment is a stop.
    inplace : boolean, optional
        if set to true the original dataframe will be altered to contain
        the result of the filtering, otherwise a copy will be returned,
        by default False

    Returns
    -------
    DataFrame
        Data with 3 additional features: segment_stop, lat_mean and lon_mean,
        or None when ``inplace`` is True.
        segment_stop indicates the trajectory segment to which the point belongs
        lat_mean and lon_mean:
            if the default option is used, lat_mean and lon_mean are defined
            based on point that repeats most within the segment
            On the other hand, if centroid option is used,
            lat_mean and lon_mean are defined by centroid of
            the all points into segment
            Moving points hold NaN in both columns when they are kept.

    Notes
    -----
    The value -1.0 is used internally as the "drop this row" marker for both
    lat_mean and lon_mean.

    """
    if not inplace:
        move_data = move_data.copy()

    # The stop/segment columns are only generated when BOTH are missing.
    if label_segment not in move_data and label_stop not in move_data:
        create_or_update_move_stop_by_dist_time(move_data,
                                                dist_radius,
                                                time_radius,
                                                label_id,
                                                inplace=True)

    logger.debug('...setting mean to lat and lon...')
    n_points = move_data.shape[0]
    # Every row starts marked with -1.0, i.e. "to be dropped".
    lat_mean = np.full(n_points, -1.0, dtype=np.float64)
    lon_mean = np.full(n_points, -1.0, dtype=np.float64)

    is_stop = move_data[label_stop]
    if drop_moves is False:
        # Moving points are kept: mark them with NaN instead of -1.0.
        # A positional boolean mask is used rather than index labels, so the
        # marker arrays stay aligned for any (unique) index.
        # np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
        is_move = (~is_stop).values
        lat_mean[is_move] = np.nan
        lon_mean[is_move] = np.nan
    else:
        logger.debug('...move segments will be dropped...')

    logger.debug('...get only segments stop...')
    segment_ids = move_data[label_segment]
    segments = move_data[is_stop][label_segment].unique()

    for idx in progress_bar(
            segments, desc=f'Generating {label_segment} and {label_stop}'):
        # Row positions of the segment, computed once per segment.
        positions = np.flatnonzero((segment_ids == idx).values)

        if positions.size > 1:
            # Only the first and last point of each stop segment are kept.
            ends = positions[[0, -1]]
            segment = move_data.iloc[positions]

            if point_mean == 'default':
                # Most frequent (lat, lon) pair of the segment.
                # NOTE(review): the count uses the literal 'id' column rather
                # than label_id - confirm this is intended.
                p = (segment.groupby([LATITUDE, LONGITUDE],
                                     as_index=False).agg({
                                         'id': 'count'
                                     }).sort_values(['id']).tail(1))
                lat_mean[ends] = p.iloc[0, 0]
                lon_mean[ends] = p.iloc[0, 1]

            elif point_mean == 'centroid':
                # Mean position of all points of the segment.
                lat_mean[ends] = segment[LATITUDE].mean()
                lon_mean[ends] = segment[LONGITUDE].mean()
        else:
            logger.debug(f'There are segments with only one point: {idx}')

    move_data[LAT_MEAN] = lat_mean
    move_data[LON_MEAN] = lon_mean

    shape_before = move_data.shape[0]
    # filter points to drop: rows still carrying the -1.0 marker
    filter_drop = ((move_data[LAT_MEAN] == -1.0)
                   & (move_data[LON_MEAN] == -1.0))
    shape_drop = move_data[filter_drop].shape[0]

    if shape_drop > 0:
        logger.debug('...Dropping %s points...', shape_drop)
        move_data.drop(move_data[filter_drop].index, inplace=True)

    logger.debug('...Shape_before: %s\n...Current shape: %s',
                 shape_before, move_data.shape[0])

    if not inplace:
        return move_data
    return None