def test_timezone_conversion(self):
     """Check the first datetime after converting Europe/London to Europe/Berlin.

     The TrajDataFrame is built from ``self.default_data_df``; after the
     conversion its first datetime must equal 2008-10-23 14:53:05.
     """
     column_map = dict(latitude='latitude', datetime='hour', user_id='user')
     traj = skmob.TrajDataFrame(self.default_data_df, **column_map)

     traj.timezone_conversion(from_timezone='Europe/London',
                              to_timezone='Europe/Berlin')

     expected_first = pd.Timestamp('2008-10-23 14:53:05')
     assert traj[DATETIME][0] == expected_first
 def test_tdf_from_list(self):
     """Build a TrajDataFrame from ``self.default_data_list`` and run the default asserts.

     Columns are identified by their integer position within each row.
     """
     column_positions = dict(latitude=1, longitude=2, datetime=3, user_id=0)
     traj = skmob.TrajDataFrame(self.default_data_list, **column_positions)

     self.perform_default_asserts(traj)

     # Smoke check: this call was noted as having raised
     # "TypeError: 'BlockManager' object is not iterable".
     print(traj.head())
def GetDistance(df_input):
    """Return the first straight-line distance computed from ``df_input``.

    The input is first passed through ``RemoveZeroLatLon``. If no rows
    remain, 0 is returned. Otherwise a TrajDataFrame is built from the
    'UserLat', 'UserLong', 'DateTimeStamp' and 'EmployeeId' columns and the
    first value of the 'distance_straight_line' column produced by
    ``distance_straight_line`` is returned.
    """
    cleaned = RemoveZeroLatLon(df_input)

    # Nothing left after cleaning: report zero distance.
    if len(cleaned) == 0:
        return 0

    trajectory = skmob.TrajDataFrame(cleaned,
                                     latitude='UserLat',
                                     longitude='UserLong',
                                     datetime='DateTimeStamp',
                                     user_id='EmployeeId')
    distances = distance_straight_line(trajectory)
    return distances['distance_straight_line'].values[0]
# Example #4
# 0
def process_for_county(curfips):
    """Detect stops for one county and tag each with its nearest POI and residence.

    Steps:
      1. Verify that the county's POI and residence CSV files exist.
      2. Filter the pings in the module-level ``part_df`` by the county's
         bounding box (from ``info_df``), then spatially join them with the
         county shape taken from ``whole_shape_gdf``.
      3. Run scikit-mobility stop detection on the remaining pings.
      4. Attach the closest POI to every stop (nearest by Euclidean distance
         on raw lat/lon; the reported ``POI_dist`` comes from ``haversine``).
      5. Attach the closest residence in the same way (best effort).

    Relies on module-level names: ``info_df``, ``part_df``,
    ``whole_shape_gdf``, ``updated_col_names``, ``mylogger`` and
    ``haversine``.

    Args:
        curfips: County FIPS code. It is concatenated directly into file
            names, so it must be a string.

    Returns:
        A DataFrame with one row per detected stop, carrying the POI columns
        and, when residence matching succeeds, ``Res_dist``, ``Res_lon``,
        ``Res_lat`` and ``urban_rural``. If the residence file has fewer
        than two rows or residence matching raises, the POI-only frame is
        returned. If an input file is missing, no pings fall inside the
        bounding box, or stop detection fails, an empty DataFrame with a
        single ``uid`` column is returned.
    """
    #POI sanity check
    POI_location = '/project/biocomplexity/data/XMode/evaluation/POI_directory/'
    if not os.path.isfile(POI_location + curfips + '_POIS.csv'):
        mylogger.info("no nearby POI will be found in this part for " +
                      str(curfips))
        return pd.DataFrame(columns=['uid'])
    #Res sanity check
    Res_location = '/project/biocomplexity/data/XMode/evaluation/Residence_data/'
    if not os.path.isfile(Res_location + curfips + '_Res.csv'):
        mylogger.info("no nearby Res will be found in this part for " +
                      str(curfips))
        return pd.DataFrame(columns=['uid'])

    func_start = datetime.now()
    fips_df = info_df[info_df.fips == curfips]
    bbox = fips_df.iloc[0]
    cur_min_lat, cur_max_lat = bbox['min_lat'], bbox['max_lat']
    cur_min_lng, cur_max_lng = bbox['min_lng'], bbox['max_lng']
    #initial filtering based on bounding box, works very fast
    county_df = part_df[part_df['latitude'].ge(cur_min_lat)
                        & part_df['latitude'].le(cur_max_lat)
                        & part_df['longitude'].ge(cur_min_lng)
                        & part_df['longitude'].le(cur_max_lng)]

    if len(county_df) == 0:
        mylogger.info("no nearby pings found in this part for " + str(curfips))
        return pd.DataFrame(columns=['uid'])

    # A 5-digit FIPS code is <2-digit state><3-digit county>.
    padded_fips = str(curfips).zfill(5)
    county_prefix = padded_fips[0:2]
    county_code = padded_fips[2:]

    state_shape_gdf = whole_shape_gdf[(
        whole_shape_gdf['STATEFP'] == county_prefix)]
    county_shape_gdf = state_shape_gdf[(
        state_shape_gdf['COUNTYFP'] == county_code)]

    #joining with shape file to get more accurate measurement, works slower, that is why initial filtering is done.
    county_df = gpd.GeoDataFrame(county_df,
                                 geometry=gpd.points_from_xy(
                                     county_df.longitude,
                                     county_df.latitude))
    county_df.crs = county_shape_gdf.crs
    county_gdf = gpd.sjoin(county_df, county_shape_gdf, how='inner')
    county_gdf = county_gdf[updated_col_names].copy(deep=True)
    spatial_join_time = datetime.now()
    mylogger.info("county pings separated for " + str(curfips) +
                  ", time taken: " + str(spatial_join_time - func_start))

    #stop point detection
    county_gdf['location_at'] = pd.to_datetime(county_gdf['location_at'],
                                               unit='s')
    try:
        traj_df = skmob.TrajDataFrame(county_gdf,
                                      latitude='latitude',
                                      longitude='longitude',
                                      user_id='advertiser_id',
                                      datetime='location_at')
        county_stop_df = detection.stops(traj_df,
                                         stop_radius_factor=0.05,
                                         minutes_for_a_stop=5.0,
                                         spatial_radius_km=0.2,
                                         leaving_time=True)
        county_stop_df['county_fips'] = curfips
        stop_detect_time = datetime.now()
        mylogger.info("stop point detection complete for " + str(curfips) +
                      ", time taken: " +
                      str(stop_detect_time - spatial_join_time))
    except Exception as e:
        # Best effort: a failure for one county must not abort the whole run,
        # but KeyboardInterrupt/SystemExit are no longer swallowed and the
        # cause is logged.
        mylogger.info('stop detection failed for ' + str(curfips))
        mylogger.info(e)
        return pd.DataFrame(columns=['uid'])

    #associate each point with its closest POI information
    #load POI data for corresponding county
    POI_df = pd.read_csv(POI_location + curfips + '_POIS.csv')
    POI_df = POI_df[[
        'source_id', 'poi_name', 'st_name', 'st_num', 'designation', 'lat',
        'lon', 'fac_name'
    ]]
    POI_df = POI_df.rename(columns={'lat': 'POI_lat', 'lon': 'POI_lon'})
    #find index of closest POI for each ping
    POI_dist_np = distance.cdist(county_stop_df[['lat', 'lng']],
                                 POI_df[['POI_lat', 'POI_lon']],
                                 metric='euclidean')
    # Position of the first minimum in every row, ignoring NaN distances.
    # Working on the raw distance matrix avoids multi-dimensional Series
    # indexing and also handles IDs that are falsy (e.g. 0).
    nearest_POI_pos = np.nanargmin(POI_dist_np, axis=1)
    county_stop_df['POI_id'] = POI_df['source_id'].to_numpy()[nearest_POI_pos]
    #join with POI data based on previously found index, also calculate actual distance
    county_POI_df = county_stop_df.merge(POI_df,
                                         how='inner',
                                         left_on='POI_id',
                                         right_on='source_id')
    county_POI_df = county_POI_df.drop(columns=['source_id'])
    county_POI_df["POI_dist"] = haversine(county_POI_df["lng"],
                                          county_POI_df["lat"],
                                          county_POI_df["POI_lon"],
                                          county_POI_df["POI_lat"])
    POI_time = datetime.now()
    mylogger.info("pings matched with nearby POIs " + str(curfips) +
                  ", time taken: " + str(POI_time - stop_detect_time))

    try:
        Res_df = pd.read_csv(Res_location + curfips + '_Res.csv')
        #hack: cap the number of residences to bound the distance matrix size
        if Res_df.shape[0] > 500000:
            Res_df = Res_df.sample(500000)
        if Res_df.shape[0] < 2:
            return county_POI_df

        Res_df = Res_df[['blockgroup_id', 'urban_rural', 'lat', 'lon']]
        Res_df = Res_df.rename(columns={'lat': 'Res_lat', 'lon': 'Res_lon'})
        #find the closest residence for each row of the POI-tagged result
        # The lookup is done against county_POI_df itself and the residence
        # rows are picked by position, so the values line up row for row and
        # a blockgroup_id shared by several residences cannot duplicate or
        # misalign rows.
        Res_dist_np = distance.cdist(county_POI_df[['lat', 'lng']],
                                     Res_df[['Res_lat', 'Res_lon']],
                                     metric='euclidean')
        nearest_Res_pos = np.nanargmin(Res_dist_np, axis=1)
        nearest_Res_df = Res_df.iloc[nearest_Res_pos].set_index(
            county_POI_df.index)
        Res_dist = haversine(county_POI_df["lng"],
                             county_POI_df["lat"],
                             nearest_Res_df["Res_lon"],
                             nearest_Res_df["Res_lat"])
        Res_time = datetime.now()
        # NOTE(review): elapsed time is measured from stop_detect_time, so it
        # also includes POI matching - confirm whether POI_time was intended.
        mylogger.info("pings matched with nearby Residences " + str(curfips) +
                      ", time taken: " + str(Res_time - stop_detect_time))
    except Exception as e:
        mylogger.info('something wrong happened while matching with ' +
                      str(curfips) + ' Residences')
        mylogger.info(e)
        return county_POI_df

    # Columns are attached only after everything succeeded, so a failure
    # above never leaves a partially populated result.
    county_POI_df['Res_dist'] = Res_dist
    county_POI_df['Res_lon'] = nearest_Res_df['Res_lon']
    county_POI_df['Res_lat'] = nearest_Res_df['Res_lat']
    county_POI_df['urban_rural'] = nearest_Res_df['urban_rural']
    return county_POI_df
 def test_slicing_a_tdf_returns_a_tdf(self):
     """Filtering by user and then slicing must keep the TrajDataFrame type."""
     column_map = dict(latitude='latitude', datetime='hour', user_id='user')
     traj = skmob.TrajDataFrame(self.default_data_df, **column_map)

     # Boolean-mask selection followed by a positional slice.
     rows_of_user_one = traj[traj[UID] == 1]
     first_row = rows_of_user_one[:1]

     assert type(traj) == type(first_row)
 def test_tdf_from_dict(self):
     """Build a TrajDataFrame from ``self.default_data_dict`` and run the default asserts."""
     column_map = dict(latitude='latitude', datetime='hour', user_id='user')
     traj = skmob.TrajDataFrame(self.default_data_dict, **column_map)
     self.perform_default_asserts(traj)
# Example #7
# 0
import pandas as pd
import skmob

# --- 1. Build a TrajDataFrame from a plain list of rows ---------------------
# Each row holds: user, latitude, longitude, timestamp.
data_list = [
    [1, 39.984094, 116.319236, '2008-10-23 13:53:05'],
    [1, 39.984198, 116.319322, '2008-10-23 13:53:06'],
    [1, 39.984224, 116.319402, '2008-10-23 13:53:11'],
    [1, 39.984211, 116.319389, '2008-10-23 13:53:16'],
]

# With list input, the columns are identified by their position in each row.
tdf = skmob.TrajDataFrame(data_list, latitude=1, longitude=2, datetime=3)

# Show the first rows and the resulting type.
print(tdf.head())
print(type(tdf))

# --- 2. Build a TrajDataFrame from a pandas DataFrame -----------------------
# Wrap the same rows in a regular DataFrame with named columns.
data_df = pd.DataFrame(data_list, columns=['user', 'latitude', 'lng', 'hour'])
print(type(data_df))

# With DataFrame input, the columns are identified by name.
tdf = skmob.TrajDataFrame(data_df,
                          latitude='latitude',
                          datetime='hour',
                          user_id='user')
print(type(tdf))
print(tdf.head())

# --- 3. Load a TrajDataFrame from a file ------------------------------------
# Download the sample first from
# https://raw.githubusercontent.com/scikit-mobility/scikit-mobility/master/examples/geolife_sample.txt.gz
# It contains trajectory data (GeoLife, Beijing, China).
tdf = skmob.TrajDataFrame.from_file('examples/geolife_sample.txt.gz',
                                    latitude='lat',
                                    longitude='lon',
                                    user_id='user',
                                    datetime='datetime')