def start(self):
    """Load the raw training CSV and prepare an empty feature table.

    Reads ``data/raw/ubaar-competition/train.csv`` (path built from
    ``project_root()``) as UTF-8 with the ``ID`` column as index and stores
    it on ``self.data``. ``self.features`` is initialised to an empty
    DataFrame, then control is handed to ``self.get_day_feature`` via
    ``self.next``.
    """
    train_csv = os.path.join(
        project_root(), 'data', 'raw', 'ubaar-competition', 'train.csv')
    self.data = pd.read_csv(train_csv, encoding="utf-8", index_col="ID")
    self.features = pd.DataFrame()
    self.next(self.get_day_feature)
import os import pandas as pd from feature_extraction.date_utils import date_features from feature_extraction.coords_features import coord_features from feature_extraction.other_features import raw_features, categorical_features from feature_extraction.path_utils import project_root import xgboost as xgb import joblib raw_data = pd.read_csv(os.path.join(project_root(), 'data', 'raw', 'ubaar-competition', 'train.csv'), encoding="utf-8", index_col="ID") all_features_cols = pd.read_csv(os.path.join(project_root(), 'data', 'processed', 'ubaar_features.csv'), encoding="utf-8", index_col="ID").columns model = joblib.load( os.path.join(project_root(), 'data', 'processed', 'model.bin')) num_cols = [ 'sourceLatitude', 'sourceLongitude', 'destinationLatitude', 'destinationLongitude', 'distanceKM', 'taxiDurationMin', 'weight', 'price' ] num_cols_dict = {col: float for col in num_cols} def _add_missing_cat_columns(features, all_features_cols): missing_columns = [ c for c in all_features_cols if c not in features.columns
import os

import pandas as pd
import plotly.express as px

from feature_extraction.coords_features import coords_clusters_dbscan, coords_clusters_kmeans
from feature_extraction.path_utils import project_root

if __name__ == '__main__':
    # Script: cluster the coordinates of the training set and render the
    # source points, coloured by cluster label, on an interactive map.
    data = pd.read_csv(os.path.join(project_root(), 'data', 'raw', 'ubaar-competition', 'train.csv'),
                       encoding="utf-8",
                       index_col="ID")

    # Take an explicit copy: a new column is added below, and assigning into
    # a bare column selection of `data` raises pandas' SettingWithCopyWarning.
    coords = data[["sourceLatitude", "sourceLongitude",
                   "destinationLatitude", "destinationLongitude"]].copy()

    # Alternative clustering kept for quick switching between the two:
    # coords['cluster_src'], _ = coords_clusters_kmeans(coords, n_clusters=50)
    # Only the first element of the returned pair (used as the per-row
    # cluster column) is needed here; the second is discarded.
    coords['cluster_src'], _ = coords_clusters_dbscan(coords)

    fig = px.scatter_mapbox(coords,
                            lat="sourceLatitude",
                            lon="sourceLongitude",
                            zoom=3,
                            height=900,
                            color='cluster_src',
                            title="Clusters")
    # NOTE: mapbox_zoom below overrides the zoom=3 passed to scatter_mapbox.
    fig.update_layout(mapbox_style="stamen-terrain",
                      mapbox_zoom=2,
                      mapbox_center_lat=41,
                      margin={"r": 0, "t": 0, "l": 0, "b": 0})

    # Persist the figure as a standalone HTML page, then display it.
    fig.write_html(os.path.join(project_root(), "data", "processed", "clusters.html"))
    fig.show()
def save(self):
    """Write the feature table to disk and advance to the ``end`` step.

    ``self.features`` is written with ``to_csv`` (default options) to
    ``data/processed/ubaar_features.csv`` under the path returned by
    ``project_root()``; afterwards ``self.next(self.end)`` is called.
    """
    features_csv = os.path.join(
        project_root(), 'data', 'processed', 'ubaar_features.csv')
    self.features.to_csv(features_csv)
    self.next(self.end)
import pandas as pd import os import plotly.express as px import reverse_geocoder as rg from feature_extraction.path_utils import project_root if __name__ == '__main__': src_dest = 'source' data = pd.read_csv(os.path.join(project_root(), 'data', 'raw', 'ubaar-competition', 'train.csv'), encoding="utf-8", index_col="ID") coords = data[[f'{src_dest}Latitude', f'{src_dest}Longitude']] localisations = rg.search([tuple(row) for row in coords.values]) data[f'{src_dest}_city'] = [l['name'] for l in localisations] data[f'{src_dest}_province'] = [l['admin1'] for l in localisations] # data['price_per_km'] = data['price'] / data['distanceKM'] city_ave_prices = dict(data.groupby(f'{src_dest}_city')['price'].mean()) data['ave_price'] = data.apply( lambda x: city_ave_prices[x[f'{src_dest}_city']], axis=1) fig = px.scatter_mapbox(data, lat=f"{src_dest}Latitude", lon=f"{src_dest}Longitude",