Example #1
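All snippets below assume the same module-level context; a reconstructed header, for reference (the coords_to_kreis import is taken from Example #10, the rest is inferred from usage):

# Shared imports assumed by the examples below (reconstructed, not shown on the page).
import json
from datetime import datetime, timedelta

import boto3
import numpy as np
import pandas as pd

from coords_to_kreis import coords_convert  # maps coordinate columns to German district (AGS) ids
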
def add_items(hour, score, station):
    """Turn one station record into a metas row and a scores row.

    Appends to `metas` and `scores` from the enclosing scope (see the sketch below).
    """
    # map the station's coordinates onto a district id
    coordinates = pd.DataFrame([station["coordinates"]])
    coordinates.rename(columns={"lng": "lon"}, inplace=True)
    district_id = coords_convert(coordinates).values[0]

    meta = {
        "station_id": station["id"],
        "station_name": station["name"],
        "address": station["address"],
        "station_types": station["types"],
        "station_lon": station["coordinates"]["lng"],
        "station_lat": station["coordinates"]["lat"],
        "station_rating": station["rating"],
        "station_rating_n": station["rating_n"]
    }

    metas.append({
        "district_id": district_id,
        "category": category,
        "meta": json.dumps(meta),
        "description": station["name"],
        "source_id": station["id"]
    })

    scores.append({
        "dt": datetime(date.year, date.month, date.day, hour),
        "score_value": score,
        "reference_value": 0,
        "category": category,
        "district_id": district_id,
        "source_id": station["id"],
    })
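
add_items also relies on names from its enclosing scope. A minimal sketch of that context; the category label and date are placeholder values:

# Hypothetical enclosing scope for add_items (placeholder values).
metas = []    # station metadata rows, filled by add_items
scores = []   # score rows, filled by add_items
category = "googleplaces"        # assumed category label
date = datetime.now().date()     # assumed reporting date

# example call for one station record at 12:00 with score 0.8:
# add_items(12, 0.8, station)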
Example #2
def aggregate(date):
    s3_client = boto3.client('s3')
    #date = datetime.date.today() - datetime.timedelta(days=1)
    response = s3_client.get_object(Bucket='sdd-s3-basebucket',
                                    Key='tomtom/{}/{}/{}.json'.format(
                                        str(date.year).zfill(4),
                                        str(date.month).zfill(2),
                                        str(date.day).zfill(2)))
    data_current = pd.DataFrame(json.loads(response["Body"].read()))
    data_current["ags"] = coords_convert(data_current)
    data_current["score"] = data_current["TrafficIndexLive"] / data_current[
        "TrafficIndexHistoric"]
    data_current.replace(np.inf, np.nan, inplace=True)
    result = pd.DataFrame(data_current.groupby("ags")["score"].mean())

    result = result.reset_index()
    list_results = []

    for index, row in result.iterrows():
        landkreis = row['ags']
        relative_popularity = row["score"]
        data_index = {
            "landkreis": landkreis,
            'tomtom_score': relative_popularity
        }
        list_results.append(data_index)
    return list_results
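
The commented-out line near the top suggests how the function is meant to be called; a hedged invocation example:

# Assumed usage: aggregate yesterday's TomTom scores.
import datetime

tomtom_scores = aggregate(datetime.date.today() - datetime.timedelta(days=1))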
Example #3
def aggregate(date):
    s3_client = boto3.client('s3')
    data = pd.DataFrame()
    clientFirehose = boto3.client('firehose')

    # read the file from three days before the given date
    lagged = date - timedelta(days=3)
    response = s3_client.get_object(Bucket='sdd-s3-basebucket', Key='hystreet/{}/{}/{}'.format(
        str(lagged.year).zfill(4), str(lagged.month).zfill(2), str(lagged.day).zfill(2)))
    result = pd.DataFrame(json.loads(response["Body"].read()))
    data = pd.concat([data, result])


    def compute_weekday(timestamp):
        date_str = timestamp.split('+')[0]
        date = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%f')
        return date.weekday()


    data['weekday'] = data['timestamp'].apply(compute_weekday)

    station_means = pd.read_csv('station_means.csv')

    data = pd.merge(data, station_means, left_on=['station_id', 'weekday'],
                    right_on=['station_id_mean', 'weekday_mean'], how='left').drop(['station_id_mean', 'weekday_mean'], axis=1)

    data['relative_pedestrians_count'] = (data['pedestrians_count']
                                          / data['mean_pedestrians_count_weekday'])


    stations_with_ags = pd.read_csv('data/stations_with_ags.csv')
    data_with_ags = pd.merge(data, stations_with_ags, left_on='station_id',
                             right_on='stationid', how='left').drop('stationid', axis=1)
    data_with_ags['landkreis'] = coords_convert(data_with_ags)


    # compute mean for each ags (if multiple stations are in the same landkreis)
    grouped = data_with_ags.groupby(['ags', 'date'], sort=False).agg({
        'pedestrians_count': 'mean',
        'min_temperature': 'mean',
        'temperature': 'mean',
        'weather_condition': lambda x: x.iloc[0],
        'relative_pedestrians_count': 'mean',
        'city': lambda x: x.iloc[0],
        'lat': lambda x: x.iloc[0],
        'lon': lambda x: x.iloc[0],
        'landkreis': lambda x: x.iloc[0]
    }).reset_index()

    list_results = []

    for index, row in grouped.iterrows():
        data_dict = {
            #'name': row['city'],
            'hystreet_score': row['relative_pedestrians_count'],
            'landkreis': row['landkreis']
            #'lat': row['lat'],
            #'lon': row['lon'],
            #'date': row['date']
        }
        list_results.append(data_dict)

    return list_results
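
The merge keys above imply a specific layout for station_means.csv; a hypothetical compatible file (column names from the code, values invented):

# Hypothetical station_means.csv implied by the merge keys above.
station_means = pd.DataFrame({
    'station_id_mean': [100, 100],
    'weekday_mean': [0, 1],  # Monday = 0 ... Sunday = 6
    'mean_pedestrians_count_weekday': [1200.0, 950.0],
})
station_means.to_csv('station_means.csv', index=False)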
Example #4
def aggregate(date):
    s3_client = boto3.client('s3')
    response = s3_client.get_object(Bucket='sdd-s3-basebucket',
                                    Key='fahrrad/{}/{}/{}/{}.json'.format(
                                        str(date.year).zfill(4),
                                        str(date.month).zfill(2),
                                        str(date.day).zfill(2), str(date)))

    # the stored object is double-encoded JSON, hence the nested loads
    data_current = pd.DataFrame(json.loads(json.loads(response["Body"].read())))
    data_current["ags"] = coords_convert(data_current)
    data_current = data_current.loc[~data_current["ags"].isin(["05362", "05913"])]
    data_current["bike_count"] = data_current["bike_count"].astype(int)
    data_current["prediction"] = data_current["prediction"].astype(int)
    data_current["score"] = data_current["bike_count"] / data_current["prediction"]
    result = pd.DataFrame(data_current.groupby("ags")["score"].mean())

    # data_normal = pd.DataFrame()
    # for x in range(1,6):
    #     date = date - datetime.timedelta(days = 364) #x year ago same weekday
    #     try:
    #         response = s3_client.get_object(Bucket='sdd-s3-basebucket', Key='fahrrad/{}/{}/{}/{}.json'.format(str(date.year).zfill(4), str(date.month).zfill(2), str(date.day).zfill(2), str(date)))
    #         data_normal.append(pd.DataFrame(json.loads(response["Body"].read())))
    #     except:
    #         pass
    # return data_normal
    # data_normal["ags"] = coords_convert(data_normal)
    # data_normal["bike_count"] = data_normal["bike_count"].astype(int)
    # data_normal = pd.DataFrame(data_normal.groupby("ags")["bike_count"].mean())
    #
    # result = data_current.join(data_normal, rsuffix = "normal")
    result = result.reset_index()
    list_results = []

    for index, row in result.iterrows():
        landkreis = row['ags']
        relative_popularity = row["score"]
        data_index = {
            "landkreis": landkreis,
            'bike_score': relative_popularity
        }
        list_results.append(data_index)
    return list_results
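
The nested json.loads above reflects a double-encoded payload; a small demonstration of that shape (invented record):

# The fahrrad object is stored as a JSON string whose content is itself JSON.
payload = json.dumps(json.dumps([{"bike_count": "12", "prediction": "10"}]))
records = json.loads(json.loads(payload))  # -> [{'bike_count': '12', 'prediction': '10'}]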
Example #5
def aggregate(date):
    # connect to aws
    client_s3 = boto3.client("s3")
    s3 = boto3.resource('s3')
    content_object = s3.Object(
        "sdd-s3-basebucket", "zugdaten/{}/{}/{}/zugdaten.json".format(
            str(date.year).zfill(4),
            str(date.month).zfill(2),
            str(date.day).zfill(2)))
    file_content = content_object.get()['Body'].read().decode('utf-8')
    json_content = json.loads(file_content)

    df = pd.DataFrame(json_content)
    df["name"] = coords_convert(df)
    # aggregate by region
    regions = df["name"].unique()
    # pass several scores in one file
    lineProducts = [
        'nationalExpress', 'regional', 'suburban', 'national', 'bus'
    ]
    result = []
    for r in regions:
        df_filtered_by_region = df[df.name == r]
        scores = {
            "zug_score":
            1 - df_filtered_by_region.cancelled_stops.mean() /
            df_filtered_by_region.planned_stops.mean()
        }
        for product in lineProducts:
            df_filtered_by_region_and_product = df_filtered_by_region[
                df_filtered_by_region.lineProduct == product]
            scores.update({
                product + "_score":
                (1 - df_filtered_by_region_and_product.cancelled_stops.mean() /
                 df_filtered_by_region_and_product.planned_stops.mean())
            })
        if len(df_filtered_by_region["name"].values) < 1:
            continue  # skip regions with no rows instead of aborting the loop
        scores.update({"landkreis": df_filtered_by_region["name"].values[0]})
        result.append(scores)
    return result
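
For reference, zug_score is the share of planned stops that were not cancelled; a worked example:

# 200 planned stops, 10 of them cancelled:
planned_stops, cancelled_stops = 200.0, 10.0
zug_score = 1 - cancelled_stops / planned_stops  # -> 0.95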
Example #6
def aggregate(date):
    try:
        # connect to aws
        client_s3 = boto3.client("s3")
        s3 = boto3.resource('s3')
        content_object = s3.Object(
            "sdd-s3-basebucket", "zugdaten/{}/{}/{}/zugdaten.json".format(
                str(date.year).zfill(4),
                str(date.month).zfill(2),
                str(date.day).zfill(2)))
        file_content = content_object.get()['Body'].read().decode('utf-8')
        json_content = json.loads(file_content)

        df = pd.DataFrame(json_content)
        df["district"] = coords_convert(df)
        # aggregate by region
        regions = df["district"].unique()
        # pass several scores in one file
        lineProducts = ['nationalExpress', 'regional']
        result = []
        for r in regions:
            # add entry for district
            scores = {"landkreis": r}
            for product in lineProducts:
                df_filtered = df[(df.district == r)
                                 & (df.lineProduct == product)]
                # add scores to dictionary
                scores.update({
                    "score_public_transportation_" + product:
                    df_filtered.cancelled_stops.sum()
                })
            # add to result
            result.append(scores)
        # return result as list
        return result
    except Exception as e:
        print("Datum evt nicht vorhanden? Fehler:", str(e))
        return []
Example #7
def aggregate(date):
    s3_client = boto3.client('s3')
    data = pd.DataFrame()
    for x in range(0, 20):
        try:
            response = s3_client.get_object(
                Bucket='sdd-s3-basebucket',
                Key='webcamdaten/{}/{}/{}/{}webcamdaten.json'.format(
                    str(date.year).zfill(4),
                    str(date.month).zfill(2),
                    str(date.day).zfill(2),
                    str(x).zfill(2)))
            result = pd.DataFrame(json.loads(response["Body"].read()))
            result["date"] = date
            result["hour"] = x
            data = pd.concat([data, result])
        except Exception:
            # skip hours with no webcam file
            pass
    # normalize column names (e.g. 'Lat' -> 'lat')
    data.columns = [col.lower() for col in data.columns]
    data["ags"] = coords_convert(data)
    #return data
    result = pd.DataFrame(data.groupby("ags")[["personenzahl"]].mean())
    result = result.reset_index()
    list_results = []
    result["personenzahl"] = result["personenzahl"] / 2.4
    for index, row in result.iterrows():
        landkreis = row['ags']
        relative_popularity = row["personenzahl"]
        data_index = {
            "landkreis": landkreis,
            'webcam_score': relative_popularity
        }
        list_results.append(data_index)
    return list_results
Example #8
def aggregate(date):
    s3_client = boto3.client('s3')
    #date = date.today() - timedelta(days = 1)
    #print(date)
    data = pd.DataFrame()
    #clientFirehose = boto3.client('firehose')

    for x in range(9, 19):
        try:
            response = s3_client.get_object(
                Bucket='sdd-s3-basebucket',
                Key='googleplaces/{}/{}/{}/{}'.format(
                    str(date.year).zfill(4),
                    str(date.month).zfill(2),
                    str(date.day).zfill(2),
                    str(x).zfill(2)))
            result = pd.DataFrame(json.loads(response["Body"].read()))
            result["date"] = date
            result["hour"] = x
            data = pd.concat([data, result])
        except Exception as e:
            print("No gmap data for " + str(date) + " " + str(e))
            return []

    def normal_popularity(row):
        return row["populartimes"][row["date"].weekday()]["data"][row["hour"]]

    def to_data(landkreis, date, relative_popularity, airquality_score,
                hystreet_score, cycle_score):

        #['id', 'name', 'date', 'gmap_score', 'hystreet_score', 'cycle_score']
        return {
            'name': landkreis,
            # todo time from request
            'date': date,
            'gmap_score': relative_popularity
            #"airquality_score" : airquality_score
            #'hystreet_score' : hystreet_score
            # 'cycle_score' : cycle_score
        }

    import ast
    data["normal_popularity"] = data.apply(normal_popularity,
                                           axis=1,
                                           result_type="reduce")
    data["relative_popularity"] = data["current_popularity"] / data[
        "normal_popularity"]
    data["coordinates"] = data["coordinates"].astype(str)
    lat = []
    lon = []
    for index, row in data.iterrows():
        lat.append(ast.literal_eval(row["coordinates"])["lat"])
        lon.append(ast.literal_eval(row["coordinates"])["lng"])

    data["lat"] = lat
    data["lon"] = lon
    data["ags"] = coords_convert(data)
    data2 = data.loc[data["ags"].notna()]

    result = data2.groupby("ags").apply(lambda x: np.average(
        x.relative_popularity, weights=x.normal_popularity))
    result = pd.DataFrame(result)
    result = result.reset_index()
    result.columns = ["ags", "relative_popularity"]
    list_results = []
    for index, row in result.iterrows():
        landkreis = row['ags']
        relative_popularity = row['relative_popularity']
        data_index = {
            "landkreis": landkreis,
            # todo time from request
            #'date': str(date),
            'gmap_score': relative_popularity
            #"airquality_score" : airquality_score
            #'hystreet_score' : hystreet_score
            # 'cycle_score' : cycle_score
        }
        list_results.append(data_index)
        #print (data_index)
        # clientFirehose.put_record(DeliveryStreamName='sdd-kinese-aggregator',  Record={'Data':data_index })

        #print(input)
    return list_results
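
normal_popularity indexes into the populartimes field; a sketch of the structure it implies (hypothetical values):

# Implied by row["populartimes"][weekday]["data"][hour]: one entry per weekday,
# each holding 24 hourly baseline values (numbers invented).
populartimes = [{"name": day, "data": [0] * 24}
                for day in ["Monday", "Tuesday", "Wednesday", "Thursday",
                            "Friday", "Saturday", "Sunday"]]
monday_noon_baseline = populartimes[0]["data"][12]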
Example #9
def upload_date(date):
    """ Uploads data for given date """
    print("processing", date)

    stations = []
    scores = []
    for hour in range(0, 24):
        try:
            response = s3_client.get_object(
                Bucket='sdd-s3-basebucket',
                Key='googleplaces/{}/{}/{}/{}'.format(
                    str(date.year).zfill(4),
                    str(date.month).zfill(2),
                    str(date.day).zfill(2),
                    str(hour).zfill(2)))

            result = json.loads(response["Body"].read())
        except Exception:
            # no file for this hour
            continue

        for station in result:

            score_value = station["current_popularity"]
            try:
                score_reference = station["populartimes"][
                    date.weekday()]["data"][hour]
            except (KeyError, IndexError, TypeError):
                score_reference = None

            coordinates = pd.DataFrame([station["coordinates"]])
            coordinates.rename(columns={"lng": "lon"}, inplace=True)
            district_id = coords_convert(coordinates).values[0]

            other = {
                "googleplaces_id": station["id"],
                "station_name": station["name"],
                "address": station["address"],
                "station_types": station["types"],
                "station_lon": station["coordinates"]["lng"],
                "station_lat": station["coordinates"]["lat"],
                "station_rating": station["rating"],
                "station_rating_n": station["rating_n"]
            }

            stations.append({
                "district_id": district_id,
                "source_id": source_id,
                "other": json.dumps(other),
                "description": station["name"],
                "source_station_id": station["id"],
            })

            other = {
                # don't lose this information
                "populartimes": station["populartimes"]
            }

            scores.append({
                "dt": datetime(date.year, date.month, date.day, hour),
                "score_value": score_value,
                "reference_value": score_reference,
                "source_id": source_id,
                "district_id": district_id,
                "source_station_id": station["id"],
                "description": station["name"],
                "other": json.dumps(other)
            })

    # upload stations; INSERT IGNORE takes care of duplicates
    if len(stations) > 0:
        q = """
      INSERT IGNORE INTO stations 
      (
        district_id,
        source_id,
        other,
        description,
        source_station_id
      )
      VALUES (%s, %s, %s, %s, %s )
    """
        df_stations = pd.DataFrame(stations).drop_duplicates()
        with aws_engine.connect() as cnx:
            cnx.execute(q, df_stations.values.tolist(), multi=True)

    # upload scores
    if len(scores) > 0:
        q = """
      SELECT id AS station_id, district_id, description, source_station_id FROM stations 
      WHERE source_id = '%s' 
    """ % source_id

        scores_stations_foreign_keys = pd.read_sql(q, aws_engine)
        scores_stations_foreign_keys["custom_index"] = (
            scores_stations_foreign_keys.apply(custom_index, axis=1))
        scores_stations_foreign_keys.drop(
            ["district_id", "description", "source_station_id"],
            axis=1,
            inplace=True)

        df_scores = pd.DataFrame(scores)
        df_scores["custom_index"] = df_scores.apply(custom_index, axis=1)

        df_scores = df_scores.merge(scores_stations_foreign_keys,
                                    on="custom_index",
                                    how="left",
                                    suffixes=(False, False))
        df_scores.drop(["description", "source_station_id", "custom_index"],
                       axis=1,
                       inplace=True)
        df_scores['dt'] = df_scores['dt'].astype(str)
        q = """
      INSERT IGNORE INTO scores 
      (
        dt,
        score_value,
        reference_value,
        source_id,
        district_id,
        station_id,
        other
      )
      VALUES (%s, %s, %s, %s, %s, %s, %s)
    """
        df_scores = df_scores.drop_duplicates()
        with aws_engine.connect() as cnx:
            # makes sure the columns' order match the query
            cnx.execute(q,
                        df_scores[[
                            "dt", "score_value", "reference_value",
                            "source_id", "district_id", "station_id", "other"
                        ]].values.tolist(),
                        multi=True)

        print("upload completed")
Example #10
    data = pd.merge(data, station_means,
                    left_on=['station_id', 'weekday'],
                    right_on=['station_id_mean', 'weekday_mean'],
                    how='left').drop(['station_id_mean', 'weekday_mean'],
                                     axis=1)

    data['relative_pedestrians_count'] = (data['pedestrians_count']
                                          / data['mean_pedestrians_count_weekday'])

    stations_with_ags = pd.read_csv('data/stations_with_ags.csv')
    data_with_ags = pd.merge(data,
                             stations_with_ags,
                             left_on='station_id',
                             right_on='stationid',
                             how='left').drop('stationid', axis=1)
    data_with_ags['landkreis'] = coords_convert(data_with_ags)
    result_df = pd.concat([result_df, data_with_ags], sort=True)

result_df.columns


def compute_weekday(timestamp):
    date_str = timestamp.split('+')[0]
    date = datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S.%f')
    return date.weekday()


result_df["city"].unique()
result_df = result_df.loc[(result_df["pedestrians_count"] > 0)]
result_df["weekday"] = result_df["timestamp"].apply(compute_weekday)
from pathlib import Path
from coords_to_kreis import coords_convert

date = date.today() - timedelta(days=4)
date.weekday()

# connect to aws
client_s3 = boto3.client("s3")
s3 = boto3.resource('s3')
content_object = s3.Object(
    "sdd-s3-basebucket", "zugdaten/{}/{}/{}/zugdaten.json".format(
        str(date.year).zfill(4),
        str(date.month).zfill(2),
        str(date.day).zfill(2)))
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)

df = pd.DataFrame(json_content)
df["resulting_stops"] = df["planned_stops"] - df["cancelled_stops"]
df["name"] = coords_convert(df)

# aggregate by region
regions = df["name"].unique()
r = regions[0]
# same weekday within the reference week beginning Monday 2020-02-10
date_normal = datetime(2020, 2, 10) + timedelta(days=date.weekday())

content_object = s3.Object(
    "sdd-s3-basebucket", "zugdaten/{}/{}/{}/zugdaten.json".format(
        str(date_normal.year).zfill(4),
        str(date_normal.month).zfill(2),
        str(date_normal.day).zfill(2)))
file_content = content_object.get()['Body'].read().decode('utf-8')
json_content = json.loads(file_content)
df_normal = pd.DataFrame(json_content)
df_normal["resulting_stops"] = df_normal["planned_stops"] - df_normal["cancelled_stops"]
df_normal["name"] = coords_convert(df_normal)

df["date"]
df_normal["date"]