def add_items(hour, score, station):
    # map the station's coordinates to a district id
    coordinates = pd.DataFrame([station["coordinates"]])
    coordinates.rename(columns={"lng": "lon"}, inplace=True)
    district_id = coords_convert(coordinates).values[0]

    # station metadata, stored as JSON alongside the district entry
    meta = {
        "station_id": station["id"],
        "station_name": station["name"],
        "address": station["address"],
        "station_types": station["types"],
        "station_lon": station["coordinates"]["lng"],
        "station_lat": station["coordinates"]["lat"],
        "station_rating": station["rating"],
        "station_rating_n": station["rating_n"]
    }
    metas.append({
        "district_id": district_id,
        "category": category,
        "meta": json.dumps(meta),
        "description": station["name"],
        "source_id": station["id"]
    })
    scores.append({
        "dt": datetime(date.year, date.month, date.day, hour),
        "score_value": score,
        "reference_value": 0,
        "category": category,
        "district_id": district_id,
        "source_id": station["id"],
    })
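# `metas`, `scores`, `category` and `date` are not defined in this excerpt and are
# assumed to live at module level in the file add_items comes from. A minimal
# sketch of the assumed surrounding context (the loop and stations_for_hour are
# purely illustrative, not part of the original code):
#
#   metas, scores = [], []
#   category = "googleplaces"
#   date = datetime.now().date()
#   for hour in range(24):
#       for station in stations_for_hour(hour):  # hypothetical loader
#           add_items(hour, station["current_popularity"], station)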
def aggregate(date):
    s3_client = boto3.client('s3')
    # date = datetime.date.today() - datetime.timedelta(days=1)
    response = s3_client.get_object(
        Bucket='sdd-s3-basebucket',
        Key='tomtom/{}/{}/{}.json'.format(
            str(date.year).zfill(4),
            str(date.month).zfill(2),
            str(date.day).zfill(2)))
    data_current = pd.DataFrame(json.loads(response["Body"].read()))
    data_current["ags"] = coords_convert(data_current)

    # relative traffic level: live traffic index compared to the historic index
    data_current["score"] = (data_current["TrafficIndexLive"] /
                             data_current["TrafficIndexHistoric"])
    data_current.replace(np.inf, np.nan, inplace=True)

    # average the score per district (ags)
    result = pd.DataFrame(data_current.groupby("ags")["score"].mean())
    result = result.reset_index()

    list_results = []
    for index, row in result.iterrows():
        landkreis = row['ags']
        relative_popularity = row["score"]
        data_index = {
            "landkreis": landkreis,
            'tomtom_score': relative_popularity
        }
        list_results.append(data_index)
    return list_results
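# Example (sketch): as the commented-out line inside aggregate() suggests, this
# aggregator is typically run for the previous day. Assuming the surrounding
# module imports boto3, json, numpy as np, pandas as pd and coords_convert:
#
#   import datetime
#   yesterday = datetime.date.today() - datetime.timedelta(days=1)
#   tomtom_scores = aggregate(yesterday)
#   # -> list of {"landkreis": <ags code>, "tomtom_score": <live/historic ratio>} dicts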
def aggregate(date):
    s3_client = boto3.client('s3')
    data = pd.DataFrame()
    clientFirehose = boto3.client('firehose')
    # note: date.day - 3 only works within a month; earlier days would need a timedelta
    response = s3_client.get_object(
        Bucket='sdd-s3-basebucket',
        Key='hystreet/{}/{}/{}'.format(
            str(date.year).zfill(4),
            str(date.month).zfill(2),
            str(date.day - 3).zfill(2)))
    result = pd.DataFrame(json.loads(response["Body"].read()))
    data = data.append(result)

    def compute_weekday(timestamp):
        date_str = timestamp.split('+')[0]
        date = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%f')
        return date.weekday()

    data['weekday'] = float("NaN")
    for index, row in data.iterrows():
        data.at[index, 'weekday'] = compute_weekday(row['timestamp'])

    # normalise pedestrian counts against the per-station weekday means
    station_means = pd.read_csv('station_means.csv')
    data = pd.merge(data,
                    station_means,
                    left_on=['station_id', 'weekday'],
                    right_on=['station_id_mean', 'weekday_mean'],
                    how='left').drop(['station_id_mean', 'weekday_mean'], axis=1)
    data['relative_pedestrians_count'] = float("NaN")
    for index, row in data.iterrows():
        data.at[index, 'relative_pedestrians_count'] = \
            row['pedestrians_count'] / row['mean_pedestrians_count_weekday']

    stations_with_ags = pd.read_csv('data/stations_with_ags.csv')
    data_with_ags = pd.merge(data,
                             stations_with_ags,
                             left_on='station_id',
                             right_on='stationid',
                             how='left').drop('stationid', axis=1)
    data_with_ags['landkreis'] = coords_convert(data_with_ags)

    # compute mean for each ags (if multiple stations are in the same landkreis)
    grouped = data_with_ags.groupby(['ags', 'date'], sort=False).agg({
        'pedestrians_count': 'mean',
        'min_temperature': 'mean',
        'temperature': 'mean',
        'weather_condition': lambda x: x.iloc[0],
        'relative_pedestrians_count': 'mean',
        'city': lambda x: x.iloc[0],
        'lat': lambda x: x.iloc[0],
        'lon': lambda x: x.iloc[0],
        'landkreis': lambda x: x.iloc[0]
    }).reset_index()

    list_results = []
    for index, row in grouped.iterrows():
        data_dict = {
            # 'name': row['city'],
            'hystreet_score': row['relative_pedestrians_count'],
            'landkreis': row['landkreis']
            # 'lat': row['lat'],
            # 'lon': row['lon'],
            # 'date': row['date']
        }
        list_results.append(data_dict)
    return list_results
def aggregate(date):
    s3_client = boto3.client('s3')
    response = s3_client.get_object(
        Bucket='sdd-s3-basebucket',
        Key='fahrrad/{}/{}/{}/{}.json'.format(
            str(date.year).zfill(4),
            str(date.month).zfill(2),
            str(date.day).zfill(2),
            str(date)))
    # the payload is double-encoded JSON, hence the nested json.loads
    data_current = pd.DataFrame(json.loads(json.loads(response["Body"].read())))
    data_current["ags"] = coords_convert(data_current)
    # exclude these two districts from the aggregation
    data_current = data_current.loc[~data_current["ags"].isin(["05362", "05913"])]
    data_current["bike_count"] = data_current["bike_count"].astype(int)
    data_current["prediction"] = data_current["prediction"].astype(int)
    # score: observed bike count relative to the predicted (normal) count
    data_current["score"] = data_current["bike_count"] / data_current["prediction"]
    result = pd.DataFrame(data_current.groupby("ags")["score"].mean())

    # data_normal = pd.DataFrame()
    # for x in range(1, 6):
    #     date = date - datetime.timedelta(days=364)  # x years ago, same weekday
    #     try:
    #         response = s3_client.get_object(
    #             Bucket='sdd-s3-basebucket',
    #             Key='fahrrad/{}/{}/{}/{}.json'.format(
    #                 str(date.year).zfill(4), str(date.month).zfill(2),
    #                 str(date.day).zfill(2), str(date)))
    #         data_normal.append(pd.DataFrame(json.loads(response["Body"].read())))
    #     except:
    #         pass
    # return data_normal
    # data_normal["ags"] = coords_convert(data_normal)
    # data_normal["bike_count"] = data_normal["bike_count"].astype(int)
    # data_normal = pd.DataFrame(data_normal.groupby("ags")["bike_count"].mean())
    # result = data_current.join(data_normal, rsuffix="normal")

    result = result.reset_index()
    list_results = []
    for index, row in result.iterrows():
        landkreis = row['ags']
        relative_popularity = row["score"]
        data_index = {
            "landkreis": landkreis,
            'bike_score': relative_popularity
        }
        list_results.append(data_index)
    return list_results
def aggregate(date):
    # connect to aws
    client_s3 = boto3.client("s3")
    s3 = boto3.resource('s3')
    content_object = s3.Object(
        "sdd-s3-basebucket",
        "zugdaten/{}/{}/{}/zugdaten.json".format(
            str(date.year).zfill(4),
            str(date.month).zfill(2),
            str(date.day).zfill(2)))
    file_content = content_object.get()['Body'].read().decode('utf-8')
    json_content = json.loads(file_content)
    df = pd.DataFrame(json_content)
    df["name"] = coords_convert(df)

    # aggregate by region
    regions = df["name"].unique()

    # pass several scores in one file
    lineProducts = [
        'nationalExpress', 'regional', 'suburban', 'national', 'bus'
    ]
    result = []
    for r in regions:
        df_filtered_by_region = df[df.name == r]
        # overall score: share of planned stops that were not cancelled
        scores = {
            "zug_score":
            1 - df_filtered_by_region.cancelled_stops.mean() /
            df_filtered_by_region.planned_stops.mean()
        }
        # one additional score per line product
        for product in lineProducts:
            df_filtered_by_region_and_product = df_filtered_by_region[
                df_filtered_by_region.lineProduct == product]
            scores.update({
                product + "_score":
                (1 - df_filtered_by_region_and_product.cancelled_stops.mean() /
                 df_filtered_by_region_and_product.planned_stops.mean())
            })
        # skip regions without any matching rows instead of aborting the whole loop
        if len(df_filtered_by_region["name"].values) < 1:
            continue
        scores.update({"landkreis": df_filtered_by_region["name"].values[0]})
        result.append(scores)
    return result
def aggregate(date):
    try:
        # connect to aws
        client_s3 = boto3.client("s3")
        s3 = boto3.resource('s3')
        content_object = s3.Object(
            "sdd-s3-basebucket",
            "zugdaten/{}/{}/{}/zugdaten.json".format(
                str(date.year).zfill(4),
                str(date.month).zfill(2),
                str(date.day).zfill(2)))
        file_content = content_object.get()['Body'].read().decode('utf-8')
        json_content = json.loads(file_content)
        df = pd.DataFrame(json_content)
        df["district"] = coords_convert(df)

        # aggregate by region
        regions = df["district"].unique()

        # pass several scores in one file
        lineProducts = ['nationalExpress', 'regional']
        result = []
        for r in regions:
            # add entry for district
            scores = {"landkreis": r}
            for product in lineProducts:
                df_filtered = df[(df.district == r)
                                 & (df.lineProduct == product)]
                # add scores to dictionary
                scores.update({
                    "score_public_transportation_" + product:
                    df_filtered.cancelled_stops.sum()
                })
            # add to result
            result.append(scores)
        # return result as list
        return result
    except Exception as e:
        print("Date possibly not available? Error:", str(e))
        return []
def aggregate(date):
    s3_client = boto3.client('s3')
    data = pd.DataFrame()
    # collect the hourly webcam files for the given day
    for x in range(0, 20):
        try:
            response = s3_client.get_object(
                Bucket='sdd-s3-basebucket',
                Key='webcamdaten/{}/{}/{}/{}webcamdaten.json'.format(
                    str(date.year).zfill(4),
                    str(date.month).zfill(2),
                    str(date.day).zfill(2),
                    str(x).zfill(2)))
            result = pd.DataFrame(json.loads(response["Body"].read()))
            result["date"] = date
            result["hour"] = x
            data = data.append(result)
        except:
            # hour not available, skip it
            pass

    # lowercase column names (e.g. 'Lat' -> 'lat')
    data.columns = [col.lower() for col in data.columns]
    data["ags"] = coords_convert(data)

    # mean person count per district, scaled by a constant factor
    result = pd.DataFrame(data.groupby("ags")[["personenzahl"]].mean())
    result = result.reset_index()
    result["personenzahl"] = result[["personenzahl"]] / 2.4

    list_results = []
    for index, row in result.iterrows():
        landkreis = row['ags']
        relative_popularity = row["personenzahl"]
        data_index = {
            "landkreis": landkreis,
            'webcam_score': relative_popularity
        }
        list_results.append(data_index)
    return list_results
def aggregate(date):
    s3_client = boto3.client('s3')
    # date = date.today() - timedelta(days=1)
    data = pd.DataFrame()
    # clientFirehose = boto3.client('firehose')
    # collect the hourly google places files for the given day (09:00-18:00)
    for x in range(9, 19):
        try:
            response = s3_client.get_object(
                Bucket='sdd-s3-basebucket',
                Key='googleplaces/{}/{}/{}/{}'.format(
                    str(date.year).zfill(4),
                    str(date.month).zfill(2),
                    str(date.day).zfill(2),
                    str(x).zfill(2)))
            result = pd.DataFrame(json.loads(response["Body"].read()))
            result["date"] = date
            result["hour"] = x
            data = data.append(result)
        except Exception as e:
            # skip hours without data instead of aborting the whole aggregation
            print("No gmap data for " + str(date) + " " + str(e))
            continue

    def normal_popularity(row):
        # popularity that is normal for this weekday and hour
        return row["populartimes"][row["date"].weekday()]["data"][row["hour"]]

    def to_data(landkreis, date, relative_popularity, airquality_score,
                hystreet_score, cycle_score):
        # ['id', 'name', 'date', 'gmap_score', 'hystreet_score', 'cycle_score']
        return {
            'name': landkreis,
            # todo time from request
            'date': date,
            'gmap_score': relative_popularity
            # "airquality_score": airquality_score
            # 'hystreet_score': hystreet_score
            # 'cycle_score': cycle_score
        }

    import ast

    data["normal_popularity"] = data.apply(normal_popularity,
                                           axis=1,
                                           result_type="reduce")
    # current popularity relative to the normal popularity for that weekday/hour
    data["relative_popularity"] = data["current_popularity"] / data["normal_popularity"]

    # coordinates arrive as stringified dicts; parse them into lat/lon columns
    data["coordinates"] = data["coordinates"].astype(str)
    lat = []
    lon = []
    for index, row in data.iterrows():
        lat.append(ast.literal_eval(row["coordinates"])["lat"])
        lon.append(ast.literal_eval(row["coordinates"])["lng"])
    data["lat"] = lat
    data["lon"] = lon

    data["ags"] = coords_convert(data)
    data2 = data.loc[data["ags"].notna()]
    # weighted average per district: normally busier places count more
    result = data2.groupby("ags").apply(
        lambda x: np.average(x.relative_popularity, weights=x.normal_popularity))
    result = pd.DataFrame(result)
    result = result.reset_index()
    result.columns = ["ags", "relative_popularity"]

    list_results = []
    for index, row in result.iterrows():
        landkreis = row['ags']
        relative_popularity = row['relative_popularity']
        data_index = {
            "landkreis": landkreis,
            # todo time from request
            # 'date': str(date),
            'gmap_score': relative_popularity
            # "airquality_score": airquality_score
            # 'hystreet_score': hystreet_score
            # 'cycle_score': cycle_score
        }
        list_results.append(data_index)
        # clientFirehose.put_record(DeliveryStreamName='sdd-kinese-aggregator',
        #                           Record={'Data': data_index})
    return list_results
def upload_date(date):
    """ Uploads data for given date """
    print("processing", date)
    stations = []
    scores = []
    for hour in range(0, 24):
        try:
            response = s3_client.get_object(
                Bucket='sdd-s3-basebucket',
                Key='googleplaces/{}/{}/{}/{}'.format(
                    str(date.year).zfill(4),
                    str(date.month).zfill(2),
                    str(date.day).zfill(2),
                    str(hour).zfill(2)))
            result = json.loads(response["Body"].read())
        except:
            continue
        for station in result:
            score_value = station["current_popularity"]
            try:
                score_reference = station["populartimes"][
                    date.weekday()]["data"][hour]
            except:
                score_reference = None
            coordinates = pd.DataFrame([station["coordinates"]])
            coordinates.rename(columns={"lng": "lon"}, inplace=True)
            district_id = coords_convert(coordinates).values[0]
            other = {
                "googleplaces_id": station["id"],
                "station_name": station["name"],
                "address": station["address"],
                "station_types": station["types"],
                "station_lon": station["coordinates"]["lng"],
                "station_lat": station["coordinates"]["lat"],
                "station_rating": station["rating"],
                "station_rating_n": station["rating_n"]
            }
            stations.append({
                "district_id": district_id,
                "source_id": source_id,
                "other": json.dumps(other),
                "description": station["name"],
                "source_station_id": station["id"],
            })
            other = {
                # don't lose this information
                "populartimes": station["populartimes"]
            }
            scores.append({
                "dt": datetime(date.year, date.month, date.day, hour),
                "score_value": score_value,
                "reference_value": score_reference,
                "source_id": source_id,
                "district_id": district_id,
                "source_station_id": station["id"],
                "description": station["name"],
                "other": json.dumps(other)
            })

    # upload stations; INSERT IGNORE handles duplicates
    if len(stations) > 0:
        q = """
            INSERT IGNORE INTO stations
            (district_id, source_id, other, description, source_station_id)
            VALUES (%s, %s, %s, %s, %s)
            """
        df_stations = pd.DataFrame(stations).drop_duplicates()
        with aws_engine.connect() as cnx:
            cnx.execute(q, df_stations.values.tolist(), multi=True)

    # upload scores
    if len(scores) > 0:
        # fetch the station primary keys so the scores can reference them
        q = """
            SELECT id AS station_id, district_id, description, source_station_id
            FROM stations
            WHERE source_id = '%s'
            """ % source_id
        scores_stations_foreign_keys = pd.read_sql(q, aws_engine)
        scores_stations_foreign_keys["custom_index"] = \
            scores_stations_foreign_keys.apply(custom_index, axis=1)
        scores_stations_foreign_keys.drop(
            ["district_id", "description", "source_station_id"],
            axis=1,
            inplace=True)

        df_scores = pd.DataFrame(scores)
        df_scores["custom_index"] = df_scores.apply(custom_index, axis=1)
        df_scores = df_scores.merge(scores_stations_foreign_keys,
                                    on="custom_index",
                                    how="left",
                                    suffixes=(False, False))
        df_scores.drop(["description", "source_station_id", "custom_index"],
                       axis=1,
                       inplace=True)
        df_scores['dt'] = df_scores['dt'].astype(str)

        q = """
            INSERT IGNORE INTO scores
            (dt, score_value, reference_value, source_id, district_id, station_id, other)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            """
        df_scores = df_scores.drop_duplicates()
        with aws_engine.connect() as cnx:
            # make sure the columns' order matches the query
            cnx.execute(q,
                        df_scores[[
                            "dt", "score_value", "reference_value", "source_id",
                            "district_id", "station_id", "other"
                        ]].values.tolist(),
                        multi=True)
    print("upload completed")
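# `custom_index` is used above to build a merge key shared by the scores frame and
# the stations table but is not defined in this excerpt. A minimal sketch of one
# possible implementation, assuming it simply concatenates the key columns present
# in both frames (district_id, description, source_station_id); the exact format is
# hypothetical:
#
#   def custom_index(row):
#       return "{}_{}_{}".format(row["district_id"], row["description"],
#                                row["source_station_id"])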
data = pd.merge(data,
                station_means,
                left_on=['station_id', 'weekday'],
                right_on=['station_id_mean', 'weekday_mean'],
                how='left').drop(['station_id_mean', 'weekday_mean'], axis=1)
data['relative_pedestrians_count'] = float("NaN")
for index, row in data.iterrows():
    data.at[index, 'relative_pedestrians_count'] = \
        row['pedestrians_count'] / row['mean_pedestrians_count_weekday']

stations_with_ags = pd.read_csv('data/stations_with_ags.csv')
data_with_ags = pd.merge(data,
                         stations_with_ags,
                         left_on='station_id',
                         right_on='stationid',
                         how='left').drop('stationid', axis=1)
data_with_ags['landkreis'] = coords_convert(data_with_ags)

result_df = result_df.append(data_with_ags, sort=True)
result_df.columns


def compute_weekday(timestamp):
    date_str = timestamp.split('+')[0]
    date = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%f')
    return date.weekday()


result_df["city"].unique()

# keep only rows with actual pedestrian counts
result_df = result_df.loc[(result_df["pedestrians_count"] > 0)]
result_df["weekday"] = result_df["timestamp"].apply(compute_weekday)
temp = pd.DataFrame(
import json
from datetime import date, datetime, timedelta
from pathlib import Path

import boto3
import pandas as pd

from coords_to_kreis import coords_convert

date = date.today() - timedelta(days=4)
date.weekday()

# connect to aws
client_s3 = boto3.client("s3")
s3 = boto3.resource('s3')
content_object = s3.Object(
    "sdd-s3-basebucket",
    "zugdaten/{}/{}/{}/zugdaten.json".format(
        str(date.year).zfill(4),
        str(date.month).zfill(2),
        str(date.day).zfill(2)))
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)
df = pd.DataFrame(json_content)
df["resulting_stops"] = df["planned_stops"] - df["cancelled_stops"]
df["name"] = coords_convert(df)

# aggregate by region
regions = df["name"].unique()
r = regions[0]

# reference data: the same weekday in the week of 2020-02-10
date_normal = datetime(2020, 2, 10) + timedelta(days=date.weekday())
content_object = s3.Object(
    "sdd-s3-basebucket",
    "zugdaten/{}/{}/{}/zugdaten.json".format(
        str(date_normal.year).zfill(4),
        str(date_normal.month).zfill(2),
        str(date_normal.day).zfill(2)))
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)
df_normal = pd.DataFrame(json_content)
df_normal["resulting_stops"] = df_normal["planned_stops"] - df_normal["cancelled_stops"]
df_normal["name"] = coords_convert(df_normal)

df["date"]
df_normal["date"]
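# `coords_convert` (imported from coords_to_kreis above) is used throughout these
# scripts to map the coordinate columns of a DataFrame to German district (Landkreis)
# identifiers. Its implementation is not part of this excerpt; a minimal sketch of
# the general idea, assuming a geopandas spatial join against district polygons.
# The file name `landkreise.geojson` and the `AGS` column are hypothetical:
#
#   import geopandas as gpd
#
#   def coords_convert(df):
#       # build point geometries from the lon/lat columns and join them onto the
#       # district polygons, returning one district id per input row
#       points = gpd.GeoDataFrame(
#           df, geometry=gpd.points_from_xy(df["lon"], df["lat"]), crs="EPSG:4326")
#       districts = gpd.read_file("landkreise.geojson").to_crs("EPSG:4326")
#       joined = gpd.sjoin(points, districts, how="left", predicate="within")
#       return joined["AGS"]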