Example #1
0
def table_from_h5(table, h5file="stats_can.h5", path=None):
    """Load one StatsCan table from an h5 file into a DataFrame.

    If the table is not already present in the h5 file it is downloaded
    and inserted first, then read back.

    Parameters
    ----------
    table: str
        name of the table to read
    h5file: str, default stats_can.h5
        name of the h5file to retrieve the table from
    path: str or path, default = current working directory
        path to the h5file

    Returns
    -------
    df: pd.DataFrame
        table in dataframe format
    """
    table = "table_" + parse_tables(table)[0]
    h5 = os.path.join(path, h5file) if path else h5file

    def _read():
        # Open read-only; the key is the "table_"-prefixed id.
        with pd.HDFStore(h5, "r") as store:
            return pd.read_hdf(store, key=table)

    try:
        df = _read()
    except (KeyError, OSError):
        # Missing key (table absent) or missing file entirely: fetch it.
        print("Downloading and loading " + table)
        tables_to_h5(tables=table, h5file=h5file, path=path)
        df = _read()
    return df
Example #2
0
def metadata_from_h5(tables, h5file="stats_can.h5", path=None):
    """Load locally stored metadata for one or more tables from an h5 file.

    Parameters
    ----------
    tables: str or list of str
        name(s) of the tables whose metadata should be read
    h5file: str, default stats_can.h5
        name of the h5file to retrieve the metadata from
    path: str or path, default = current working directory
        path to the h5file

    Returns
    -------
    list of dict
        parsed JSON metadata for each table found in the file
    """
    if path:
        h5file = os.path.join(path, h5file)
    # Metadata is stored under "json_"-prefixed keys.
    keys = ["json_" + tbl for tbl in parse_tables(tables)]
    results = []
    try:
        with h5py.File(h5file, "r") as f:
            for key in keys:
                try:
                    results.append(json.loads(f[key][()]))
                except KeyError:
                    # Table not in the file; report and keep going.
                    print("Couldn't find table " + key)
    except OSError:
        # h5 file itself is missing; return whatever we have (empty).
        print(f"{h5file} does not exist")
    return results
Example #3
0
File: scwds.py  Project: jacmarcx/stats_can
def get_full_table_download(table, csv=True):
    """https://www.statcan.gc.ca/eng/developers/wds/user-guide#a12-6
    https://www.statcan.gc.ca/eng/developers/wds/user-guide#a12-7

    Take a table name and return a url to a zipped file of that table

    Parameters
    ----------
    table: str
        table name to download
    csv: boolean, default True
        download in CSV format, if not download SDMX

    Returns
    -------
    str:
        path to the file download
    """
    table = parse_tables(table)[0]
    if csv:
        url = SC_URL + "getFullTableDownloadCSV/" + table + "/en"
    else:
        url = SC_URL + "getFullTableDownloadSDMX/" + table
    # requests has no default timeout, so without one a stalled StatsCan
    # endpoint would hang this call indefinitely. 60s is generous for a
    # response that only returns a download URL, not the file itself.
    result = requests.get(url, timeout=60)
    result = check_status(result)
    return result["object"]
Example #4
0
def delete_tables(tables, path=None, h5file="stats_can.h5", csv=True):
    """Remove previously downloaded tables from disk.

    Parameters
    ----------
    tables: list
        list of tables to delete
    path: str or os path object, default None
        where to look for the tables to delete
    h5file: str default stats_can.h5
        h5file to remove from, set to None to remove zips
    csv: boolean, default True
        if h5file is None this specifies whether to delete zipped csv or SDMX

    Returns
    -------
    to_delete: list
        list of deleted tables
    """
    clean_tables = parse_tables(tables)
    downloaded_jsons = list_downloaded_tables(path=path, h5file=h5file)
    downloaded_ids = [meta["productId"] for meta in downloaded_jsons]
    # Only attempt to delete tables that are actually present locally.
    to_delete = [tbl for tbl in clean_tables if tbl in downloaded_ids]
    if h5file:
        # Each table has a metadata key and a data key in the h5 file.
        keys_to_del = []
        for tbl in to_delete:
            keys_to_del.append("json_" + tbl)
            keys_to_del.append("table_" + tbl)
        target = os.path.join(path, h5file) if path else h5file
        with h5py.File(target, "a") as f:
            for key in keys_to_del:
                del f[key]
    else:
        # No h5 file: tables live on disk as a zip plus a json sidecar.
        names = []
        for tbl in to_delete:
            names.append(tbl + ("-eng.zip" if csv else ".zip"))
            names.append(tbl + ".json")
        if path:
            names = [os.path.join(path, name) for name in names]
        for name in names:
            if os.path.exists(name):
                os.remove(name)
    return to_delete
Example #5
0
def delete_tables(tables, path=None, h5file="stats_can.h5", csv=True):
    """Remove previously downloaded tables from disk.

    Parameters
    ----------
    tables: list
        list of tables to delete
    path: str or path object, default None
        where to look for the tables to delete
    h5file: str default stats_can.h5
        h5file to remove from, set to None to remove zips
    csv: boolean, default True
        if h5file is None this specifies whether to delete zipped csv or SDMX

    Returns
    -------
    to_delete: list
        list of deleted tables
    """
    path = pathlib.Path(path) if path else pathlib.Path()
    clean_tables = parse_tables(tables)
    downloaded_jsons = list_downloaded_tables(path=path, h5file=h5file)
    downloaded_ids = [meta["productId"] for meta in downloaded_jsons]
    # Only attempt to delete tables that are actually present locally.
    to_delete = [tbl for tbl in clean_tables if tbl in downloaded_ids]
    if h5file:
        # Each table contributes a metadata key and a data key.
        keys_to_del = []
        for tbl in to_delete:
            keys_to_del.append("json_" + tbl)
            keys_to_del.append("table_" + tbl)
        h5file = path / h5file
        with h5py.File(h5file, "a") as f:
            for key in keys_to_del:
                del f[key]
    else:
        # No h5 file: tables live on disk as a zip plus a json sidecar.
        names = []
        for tbl in to_delete:
            names.append(tbl + ("-eng.zip" if csv else ".zip"))
            names.append(tbl + ".json")
        for name in names:
            candidate = path / name
            if candidate.is_file():
                candidate.unlink()

    return to_delete
Example #6
0
File: sc.py  Project: zehengl/stats_can
def list_h5_tables(path=None, h5file="stats_can.h5"):
    """List metadata for every StatsCan table stored in an hdf5 file.

    Parameters
    ----------
    path: str or path, default = current working directory
        path to the h5 file
    h5file: str, default stats_can.h5
        name of the h5file to read table data from

    Returns
    -------
    jsons: list
        list of available tables json data
    """
    # Metadata entries are the keys with a "json_" prefix.
    json_keys = [
        key
        for key in h5_included_keys(h5file=h5file, path=path)
        if key.startswith("json_")
    ]
    return metadata_from_h5(parse_tables(json_keys), h5file=h5file, path=path)
Example #7
0
def tables_to_h5(tables, h5file="stats_can.h5", path=None):
    """Store tables and their metadata in an hdf5 file.

    Downloads any table whose json sidecar is missing, loads the zipped
    csv into a DataFrame, writes data and metadata into the h5 file, then
    removes the zip and json from disk.

    Parameters
    ----------
    tables: list of str
        tables to add to the h5file
    h5file: str, default stats_can.h5
        name of the h5file to store the tables in
    path: str or path, default = current working directory
        path to the h5file

    Returns
    -------
    tables: list
        list of tables loaded into the file
    """
    path = pathlib.Path(path) if path else pathlib.Path()
    h5file = path / h5file
    tables = parse_tables(tables)
    # Downloads land next to the h5 file.
    path = h5file.parent

    for table in tables:
        table_key = "table_" + table
        json_key = "json_" + table
        zip_path = path / (table + "-eng.zip")
        json_path = path / (table + ".json")
        if not json_path.is_file():
            download_tables([table], path)
        df = zip_table_to_dataframe(table, path=path)
        with open(json_path) as fp:
            meta = json.load(fp)
        with pd.HDFStore(h5file, "a") as store:
            store.put(key=table_key, value=df, format="table", complevel=1)
        with h5py.File(h5file, "a") as hfile:
            # Datasets can't be overwritten in place; drop stale metadata.
            if json_key in hfile.keys():
                del hfile[json_key]
            hfile.create_dataset(json_key, data=json.dumps(meta))
        # Clean up the raw downloads once they're in the h5 file.
        zip_path.unlink()
        json_path.unlink()
    return tables
Example #8
0
def tables_to_h5(tables, h5file="stats_can.h5", path=None):
    """Store tables and their metadata in an hdf5 file.

    Downloads any table whose json sidecar is missing, loads the zipped
    csv into a DataFrame, writes data and metadata into the h5 file, then
    removes the zip and json from disk.

    Parameters
    ----------
    tables: list of str
        tables to add to the h5file
    h5file: str, default stats_can.h5
        name of the h5file to store the tables in
    path: str or path, default = current working directory
        path to the h5file

    Returns
    -------
    tables: list
        list of tables loaded into the file
    """
    if path:
        h5file = os.path.join(path, h5file)
    tables = parse_tables(tables)
    for table in tables:
        table_key = "table_" + table
        json_key = "json_" + table
        zip_name = table + "-eng.zip"
        json_name = table + ".json"
        if path:
            zip_name = os.path.join(path, zip_name)
            json_name = os.path.join(path, json_name)
        if not os.path.isfile(json_name):
            download_tables([table], path)
        df = zip_table_to_dataframe(table, path=path)
        with open(json_name) as fp:
            meta = json.load(fp)
        with pd.HDFStore(h5file, "a") as store:
            df.to_hdf(store, key=table_key, format="table", complevel=1)
        with h5py.File(h5file, "a") as hfile:
            # Datasets can't be overwritten in place; drop stale metadata.
            if json_key in hfile.keys():
                del hfile[json_key]
            hfile.create_dataset(json_key, data=json.dumps(meta))
        # Clean up the raw downloads once they're in the h5 file.
        os.remove(zip_name)
        os.remove(json_name)
    return tables
Example #9
0
def get_cube_metadata(tables):
    """https://www.statcan.gc.ca/eng/developers/wds/user-guide#a11-1

    Take a list of tables and return a list of dictionaries with their
    metadata

    Parameters
    ----------
    tables : str or list of str
        IDs of tables to get metadata for

    Returns
    -------
    list of dicts
        one for each table with its metadata
    """
    # The API expects a list of {"productId": <id>} objects.
    payload = [{"productId": tbl} for tbl in parse_tables(tables)]
    response = requests.post(SC_URL + "getCubeMetadata", json=payload)
    response = check_status(response)
    return [item["object"] for item in response]
Example #10
0
def zip_table_to_dataframe(table, path=None):
    """Reads a StatsCan table into a pandas DataFrame

    If a zip file of the table does not exist in path, downloads it

    Parameters
    ----------
    table: str
        the table to load to dataframe from zipped csv
    path: str, default: current working directory when module is loaded
        where to download the tables or load them

    Returns:
    df: pandas.DataFrame
        the table as a dataframe
    """
    # Parse tables returns a list, can only do one table at a time here though
    table = parse_tables(table)[0]
    table_zip = table + "-eng.zip"
    if path:
        table_zip = os.path.join(path, table_zip)
    if not os.path.isfile(table_zip):
        download_tables([table], path)
    csv_file = table + ".csv"
    with zipfile.ZipFile(table_zip) as myzip:
        # First pass: only the header row, to learn the column names.
        with myzip.open(csv_file) as myfile:
            col_names = pd.read_csv(myfile, nrows=0).columns
        # reopen the file or it misses the first row
        with myzip.open(csv_file) as myfile:
            # Read VALUE as float and everything else as str to avoid
            # pandas guessing mixed dtypes on large files.
            types_dict = {"VALUE": float}
            types_dict.update(
                {col: str
                 for col in col_names if col not in types_dict})
            df = pd.read_csv(myfile, dtype=types_dict)

    # Columns with low cardinality: storing them as categoricals
    # dramatically shrinks the in-memory footprint.
    possible_cats = [
        "GEO",
        "DGUID",
        "STATUS",
        "SYMBOL",
        "TERMINATED",
        "DECIMALS",
        "UOM",
        "UOM_ID",
        "SCALAR_FACTOR",
        "SCALAR_ID",
        "VECTOR",
        "COORDINATE",
        "Wages",
        "National Occupational Classification for Statistics (NOC-S)",
        "Supplementary unemployment rates",
        "Sex",
        "Age group",
        "Labour force characteristics",
        "Statistics",
        "Data type",
        "Job permanency",
        "Union coverage",
        "Educational attainment",
    ]
    actual_cats = [col for col in possible_cats if col in col_names]
    df[actual_cats] = df[actual_cats].astype("category")
    try:
        df["REF_DATE"] = pd.to_datetime(df["REF_DATE"], format="%Y-%m")
    # pd.to_datetime raises ValueError (not TypeError) when the strings
    # don't match the format, e.g. daily dates like "2019-01-15" in a
    # table with sub-monthly frequency. The original only caught
    # TypeError, so such tables crashed instead of taking the fallback.
    except (TypeError, ValueError):
        df["REF_DATE"] = pd.to_datetime(df["REF_DATE"])
    return df