def prepare_stats_data(route_name=None,
                       weather_category=None,
                       update=False,
                       verbose=True):
    """
    Prepare statistics data on Schedule 8 incident locations.

    :param route_name: name of a Route, defaults to ``None``
    :type route_name: str or None
    :param weather_category: weather category to which an incident is attributed,
        defaults to ``None``
    :type weather_category: str or None
    :param update: whether to re-retrieve the source data (in case it has been updated),
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information to the console, defaults to ``True``
    :type verbose: bool

    **Example**::

        route_name = None
        weather_category = None
        update = False
        verbose = True

        prepare_stats_data(route_name, weather_category, update, verbose)
    """

    metex = METExLite()

    # Get data of Schedule 8 incident locations
    incident_locations = metex.view_schedule8_costs_by_location(
        route_name=route_name,
        weather_category=weather_category,
        update=update,
        verbose=verbose)

    # Find the "midpoint" of each incident location
    incident_locations = find_midpoint_of_each_incident_location(
        incident_locations)

    # Split the data by "region"
    for region in incident_locations.Region.unique():
        region_data = incident_locations[incident_locations.Region == region]
        # Sort by weather category, incident frequency, delay minutes and delay cost
        sort_by_cols = [
            'WeatherCategory', 'IncidentCount', 'DelayMinutes', 'DelayCost'
        ]
        region_data = region_data.sort_values(sort_by_cols, ascending=False)
        region_data.index = range(len(region_data))
        export_path = cdd_exploration(
            "NC", "01",
            region.replace(" ", "-").lower() + ".csv")
        save(region_data, export_path, verbose=verbose)

    print("\nCompleted.")


def read_table_by_name(database_name, table_name, schema_name='dbo', col_names=None, chunk_size=None,
                       index_col=None, save_as=None, data_dir=None, **kwargs):
    """
    Get all data from a given table in a specific database.

    :param database_name: name of a database
    :type database_name: str
    :param table_name: name of a queried table in the given database
    :type table_name: str
    :param schema_name: name of the schema, defaults to ``'dbo'``
    :type schema_name: str
    :param col_names: names of columns to retrieve; ``None`` (default) for all columns
    :type col_names: list or None
    :param chunk_size: number of rows to include in each chunk, defaults to ``None``
    :type chunk_size: int or None
    :param index_col: column(s) to set as the index, defaults to ``None``
    :type index_col: str or list or None
    :param save_as: file extension for saving the retrieved data, defaults to ``None``
    :type save_as: str or None
    :param data_dir: directory where the retrieved data is saved, defaults to ``None``
    :type data_dir: str or None
    :param kwargs: optional parameters of
        `pandas.read_sql_table
        <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql_table.html>`_
    :return: data of the queried table
    :rtype: pandas.DataFrame (or an iterator of pandas.DataFrame if ``chunk_size`` is specified)
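
    **Example** (an illustrative sketch; the database and table names below are
    placeholders rather than actual names from this project)::

        database_name = 'NR_METEX'  # hypothetical database name
        table_name = 'Imdm'  # hypothetical table name

        table_data = read_table_by_name(database_name, table_name)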
    """

    # Connect to the queried database
    db_conn = establish_mssql_connection(database_name)
    # Create a pandas.DataFrame of the queried table_name
    table_data = pd.read_sql_table(table_name=table_name, con=db_conn, schema=schema_name,
                                   columns=col_names, index_col=index_col, chunksize=chunk_size,
                                   **kwargs)
    # Disconnect the database
    db_conn.close()

    if save_as:
        path_to_file = os.path.join(
            os.path.realpath(data_dir if data_dir else ''), table_name + save_as)
        save(table_data, path_to_file)

    # Return the data frame of the queried table
    return table_data


def save_table_by_chunk(database_name, table_name, schema_name='dbo', col_names=None, index_col=None,
                        chunk_size=1000000, save_as=".pickle", data_dir=None, **kwargs):
    """
    Retrieve a table from a database and save it to file chunk by chunk.

    :param database_name: name of a database to query
    :type database_name: str
    :param table_name: name of a queried table in the given database
    :type table_name: str
    :param schema_name: name of the schema, defaults to ``'dbo'``
    :type schema_name: str
    :param col_names: names of columns to retrieve; ``None`` (default) for all columns
    :type col_names: list or None
    :param index_col: column(s) to set as the index, defaults to ``None``
    :type index_col: str or list or None
    :param chunk_size: number of rows to include in each chunk, defaults to ``1000000``
    :type chunk_size: int
    :param save_as: file extension of the saved chunks, defaults to ``".pickle"``
    :type save_as: str
    :param data_dir: directory where the chunks are saved; ``None`` (default) uses ``'temp_dat'``
    :type data_dir: str or None
    :param kwargs: optional parameters of
        `pandas.read_sql
        <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html>`_
    :return: ``None``; each chunk of the queried table is saved to a separate file
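
    **Example** (an illustrative sketch; the database and table names below are
    placeholders rather than actual names from this project)::

        database_name = 'NR_METEX'  # hypothetical database name
        table_name = 'StanoxLocation'  # hypothetical table name

        save_table_by_chunk(database_name, table_name, chunk_size=100000)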
    """

    assert isinstance(save_as, str) and save_as in (".pickle", ".csv", ".xlsx", ".txt")
    if col_names is not None:
        assert isinstance(col_names, (list, tuple)) and all(isinstance(x, str) for x in col_names)
    if data_dir:
        assert isinstance(data_dir, str)

    # Connect to the queried database
    assert isinstance(database_name, str)
    db_conn = establish_mssql_connection(database_name)

    # Check if there is column of 'geometry' type
    assert isinstance(table_name, str)
    sql_query_geom_col = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS " \
                         "WHERE TABLE_NAME='{}' AND DATA_TYPE='geometry'".format(table_name)
    geom_col_res = db_conn.execute(sql_query_geom_col).fetchall()
    geom_col_names = list(itertools.chain.from_iterable(geom_col_res)) if geom_col_res else []

    # Get a list of column names, excluding the 'geometry' one if it exists
    table_col_names = [x for x in get_table_column_names(database_name, table_name)
                       if x not in geom_col_names]

    # Specify SQL query - read all the selected non-geom columns
    selected_col_names = [x for x in col_names if x not in geom_col_names] if col_names \
        else table_col_names

    sql_query = 'SELECT {} FROM {}."{}"'.format(
        ', '.join('"' + tbl_col_name + '"' for tbl_col_name in selected_col_names),
        schema_name, table_name)

    dat_dir = os.path.realpath(data_dir if data_dir else 'temp_dat')
    if not geom_col_names:
        # Read the queried table_name into a pandas.DataFrame
        table_data = pd.read_sql(sql=sql_query, con=db_conn, columns=col_names, index_col=index_col,
                                 chunksize=chunk_size, **kwargs)
        for tbl_id, tbl_dat in enumerate(table_data):
            path_to_file = os.path.join(dat_dir, table_name + "_{}".format(tbl_id + 1) + save_as)
            save(tbl_dat, path_to_file, sheet_name="Sheet_{}".format(tbl_id + 1))
    else:
        # Read the queried table_name into a pd.DataFrame
        table_data = pd.read_sql(sql=sql_query, con=db_conn, columns=col_names, index_col=index_col,
                                 chunksize=chunk_size, **kwargs)
        tbl_chunks = [tbl_dat for tbl_dat in table_data]

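        # Query the geometry column(s) as WKT via SQL Server's STAsText() method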
        if len(geom_col_names) == 1:
            geom_sql_query = 'SELECT {} FROM {}."{}"'.format(
                '"{}".STAsText()'.format(geom_col_names[0]), schema_name, table_name)
        else:
            geom_sql_query = 'SELECT {} FROM {}."{}"'.format(
                ', '.join('"' + x + '".STAsText()' for x in geom_col_names),
                schema_name, table_name)
        geom_data = pd.read_sql(geom_sql_query, db_conn, chunksize=chunk_size, **kwargs)
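        # Parse the WKT strings in each chunk into shapely geometry objects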
        geom_chunks = [geom_dat.applymap(shapely.wkt.loads) for geom_dat in geom_data]

        # noinspection PyTypeChecker
        for chunk_id, (tbl_dat, geom_dat) in enumerate(zip(tbl_chunks, geom_chunks)):
            path_to_file = os.path.join(dat_dir, table_name + "_{}".format(chunk_id + 1) + save_as)
            save(tbl_dat.join(geom_dat), path_to_file, sheet_name="Sheet_{}".format(chunk_id + 1))

    # Disconnect the database
    db_conn.close()


def read_table_by_query(database_name, table_name, schema_name='dbo', col_names=None, index_col=None,
                        chunk_size=None, save_as=None, data_dir=None, **kwargs):
    """
    Get data from a table in a specific database by SQL query.

    :param database_name: name of a database
    :type database_name: str
    :param table_name: name of a queried table in the given database
    :type table_name: str
    :param schema_name: name of the schema, defaults to ``'dbo'``
    :type schema_name: str
    :param col_names: names of columns to retrieve; ``None`` (default) for all columns
    :type col_names: list or tuple or None
    :param index_col: column(s) to set as the index, defaults to ``None``
    :type index_col: str or list or None
    :param chunk_size: number of rows to include in each chunk, defaults to ``None``
    :type chunk_size: int or None
    :param save_as: file extension for saving the retrieved data, defaults to ``None``
    :type save_as: str or None
    :param data_dir: directory where the retrieved data is saved, defaults to ``None``
    :type data_dir: str or None
    :param kwargs: optional parameters of
        `pandas.read_sql
        <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_sql.html>`_
    :return: data of the queried table
    :rtype: pandas.DataFrame
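
    **Example** (an illustrative sketch; the database and table names below are
    placeholders rather than actual names from this project)::

        database_name = 'NR_METEX'  # hypothetical database name
        table_name = 'IMDMWeatherCellMap'  # hypothetical table name

        table_data = read_table_by_query(database_name, table_name, chunk_size=50000)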
    """

    if col_names is not None:
        assert isinstance(col_names, (list, tuple)) and all(isinstance(x, str) for x in col_names)
    if save_as:
        assert isinstance(save_as, str) and save_as in (".pickle", ".csv", ".xlsx", ".txt")
    if data_dir:
        assert isinstance(data_dir, str)

    # Connect to the queried database
    assert isinstance(database_name, str)
    db_conn = establish_mssql_connection(database_name)

    # Check if there is column of 'geometry' type
    assert isinstance(table_name, str)
    sql_query_geom_col = "SELECT COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS " \
                         "WHERE TABLE_NAME='{}' AND DATA_TYPE='geometry'".format(table_name)
    geom_col_res = db_conn.execute(sql_query_geom_col).fetchall()
    geom_col_names = list(itertools.chain.from_iterable(geom_col_res)) if geom_col_res else []

    # Get a list of column names, excluding the 'geometry' one if it exists
    table_col_names = [x for x in get_table_column_names(database_name, table_name)
                       if x not in geom_col_names]

    # Specify SQL query - read all non-geom columns
    selected_col_names = [x for x in col_names if x not in geom_col_names] if col_names \
        else table_col_names
    sql_query = 'SELECT {} FROM {}."{}"'.format(
        ', '.join('"' + tbl_col_name + '"' for tbl_col_name in selected_col_names),
        schema_name, table_name)

    # Read the queried table_name into a pandas.DataFrame
    table_data = pd.read_sql(
        sql=sql_query, con=db_conn, columns=col_names, index_col=index_col, chunksize=chunk_size,
        **kwargs)

    if chunk_size:
        table_data = pd.concat([pd.DataFrame(tbl_dat) for tbl_dat in table_data], ignore_index=True)

    # Read geom column(s)
    if geom_col_names:
        if len(geom_col_names) == 1:
            geom_sql_query = 'SELECT {} FROM {}."{}"'.format(
                '"{}".STAsText()'.format(geom_col_names[0]), schema_name, table_name)
        else:
            geom_sql_query = 'SELECT {} FROM {}."{}"'.format(
                ', '.join('"' + x + '".STAsText()' for x in geom_col_names),
                schema_name, table_name)

        # Read geom data chunks into a pandas.DataFrame
        geom_data = pd.read_sql(geom_sql_query, db_conn, chunksize=chunk_size, **kwargs)

        if chunk_size:
            geom_data = pd.concat(
                [pd.DataFrame(geom_dat).applymap(shapely.wkt.loads) for geom_dat in geom_data],
                ignore_index=True)
        else:
            # Parse the WKT strings into shapely geometry objects
            geom_data = geom_data.applymap(shapely.wkt.loads)
        geom_data.columns = geom_col_names

        # Merge the geometry data back into the main table
        table_data = table_data.join(geom_data)

    # Disconnect the database
    db_conn.close()

    if save_as:
        path_to_file = os.path.join(
            os.path.realpath(data_dir if data_dir else ''), table_name + save_as)
        save(table_data, path_to_file)

    return table_data
def prepare_monthly_stats_data(route_name=None,
                               weather_category=None,
                               update=False,
                               verbose=True):
    """
    Prepare monthly statistics data on Schedule 8 incidents.

    :param route_name: name of a Route, defaults to ``None``
    :type route_name: str or None
    :param weather_category: weather category to which an incident is attributed,
        defaults to ``None``
    :type weather_category: str or None
    :param update: whether to re-retrieve the source data (in case it has been updated),
        defaults to ``False``
    :type update: bool
    :param verbose: whether to print relevant information to the console, defaults to ``True``
    :type verbose: bool

    **Example**::

        route_name = None
        weather_category = None
        update = False
        verbose = True

        prepare_monthly_stats_data(route_name, weather_category, update, verbose)
    """

    metex = METExLite()

    # Get data of Schedule 8 incidents by datetime and location
    dat = metex.view_schedule8_costs_by_datetime_location(
        route_name=route_name,
        weather_category=weather_category,
        update=update,
        verbose=verbose)

    print("Cleaning data ... ", end="")
    # dat = metex.view_schedule8_data()
    dat.insert(
        dat.columns.get_loc('EndDateTime') + 1, 'StartYear',
        dat.StartDateTime.dt.year)
    dat.insert(
        dat.columns.get_loc('StartYear') + 1, 'StartMonth',
        dat.StartDateTime.dt.month)

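    # Aggregate the incident records: count incident occurrences and sum up delay minutes and costs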
    stats_calc = {
        'IncidentCount': np.count_nonzero,
        'DelayMinutes': np.sum,
        'DelayCost': np.sum
    }
    stats = dat.groupby(list(dat.columns[3:-3])).aggregate(stats_calc)
    stats.reset_index(inplace=True)

    # Find the "midpoint" of each incident location
    data = find_midpoint_of_each_incident_location(stats)
    print("Done.\n")

    sort_by_cols = [
        'WeatherCategory', 'IncidentCount', 'DelayMinutes', 'DelayCost'
    ]

    print("Processing monthly statistics ... ")
    for m in data.StartMonth.unique():
        m_ = ("0" + str(m)) if m < 10 else m
        print("           \"{}\" ... ".format(m_), end="")
        dat1 = data[data.StartMonth == m]
        if not dat1.empty:
            dat1 = dat1.sort_values(
                sort_by_cols, ascending=False, na_position='last')
            dat1.index = range(len(dat1))
            export_path = cdd_exploration(
                "NC", "02", "GB", "Month", "{}.csv".format(m_))
            save(dat1, export_path, sep=',', verbose=False)
        print("Done.")
    print("Completed.\n")

    print("Processing monthly statistics of GB ... ")
    years, months = data.StartYear.unique(), data.StartMonth.unique()
    for y, m in list(itertools.product(years, months)):
        period = ("{}_0{}" if m < 10 else "{}_{}").format(y, m)
        print("           \"{}\" ... ".format(period), end="")
        dat = data[(data.StartYear == y) & (data.StartMonth == m)]
        if not dat.empty:
            dat = dat.sort_values(
                sort_by_cols, ascending=False, na_position='last')
            dat.index = range(len(dat))
            export_path = cdd_exploration(
                "NC", "02", "GB", "Year_Month", "{}.csv".format(period))
            save(dat, export_path, sep=',', verbose=False)
        print("Done.")
    print("Completed.\n")

    # Split the data by "region"
    print("Processing monthly statistics for each region ... ")
    for region in data.Region.unique():
        region_data = data[data.Region == region]
        years = region_data.StartYear.unique()
        months = region_data.StartMonth.unique()
        print("           \"{}\" ... ".format(region), end="")
        for y, m in list(itertools.product(years, months)):
            dat_ = region_data[(region_data.StartYear == y)
                               & (region_data.StartMonth == m)]
            if not dat_.empty:
                dat_ = dat_.sort_values(
                    sort_by_cols, ascending=False, na_position='last')
                dat_.index = range(len(dat_))
                export_path = cdd_exploration(
                    "NC", "02", "Region",
                    region.replace(" ", "-").lower(),
                    "{}_{:02d}.csv".format(y, m))
                save(dat_, export_path, sep=',', verbose=False)
        print("Done.")
    print("Completed.\n")