Example #1
import os
import sys

import quilt


def _quick_download_lowres_misc_datasets():
    """
    Retrieves low resolution and miscellaneous datasets quickly using Quilt
    instead of downloading from the original source.
    """
    with open(os.devnull, "w") as null:
        print("Downloading neural network model input datasets ...", end=" ")

        _stdout = sys.stdout
        _stderr = sys.stderr
        sys.stdout = sys.stderr = null

        for geotiff in [
                "lowres/bedmap2_bed",
                "misc/REMA_100m_dem",
                "misc/REMA_200m_dem_filled",
                "misc/MEaSUREs_IceFlowSpeed_450m",
                "misc/Arthern_accumulation_bedmap2_grid1",
        ]:

            if not os.path.exists(path=f"{geotiff}.tif"):
                # Download packages first
                quilt.install(package=f"weiji14/deepbedmap/{geotiff}",
                              force=True)
                # Export the files to the right pathname
                quilt.export(package=f"weiji14/deepbedmap/{geotiff}",
                             force=True)
                # Add .tif extension to filename
                os.rename(src=geotiff, dst=f"{geotiff}.tif")

        sys.stderr = _stderr
        sys.stdout = _stdout
        print("done!")
Example #2
import tempfile

import fastparquet
import quilt


def df_to_quilt(df, path):
    parts = path.split('/')
    assert len(parts) > 2

    root_pkg = '/'.join(parts[0:2])
    try:
        quilt.install(root_pkg, force=True)
    except Exception:
        pass

    object_encoding = {}
    df = df.copy()
    for col, dtype in df.dtypes.items():  # .iteritems() was removed in pandas 2.0
        if dtype.name in ('Int8', 'Int32'):
            object_encoding[col] = 'int32'
            df[col] = df[col].astype(object)
        else:
            object_encoding[col] = 'infer'

    with tempfile.NamedTemporaryFile(suffix='.parquet') as f:
        print('writing to %s' % f.name)
        fastparquet.write(f.name,
                          df,
                          compression='snappy',
                          object_encoding=object_encoding)
        print('build')
        quilt.build(path, f.name)
        print('push')
        quilt.push(root_pkg, is_public=True)
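A hypothetical call, assuming a Quilt account named alice: the first two path components name the root package that gets installed and pushed, and the remainder is the subpath the DataFrame is built under.

import pandas as pd

df = pd.DataFrame({'id': pd.array([1, None, 3], dtype='Int32'),
                   'name': ['a', 'b', 'c']})
df_to_quilt(df, 'alice/mydata/tables/items')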
Example #3
def _load_from_quilt(package_path):
    user, root_pkg, *sub_paths = package_path.split('/')

    pkg_store, root_node = store.PackageStore.find_package(
        None, user, root_pkg)
    if root_node is None:
        quilt.install(package_path, force=True)
        pkg_store, root_node = store.PackageStore.find_package(
            None, user, root_pkg)

    node = root_node
    while len(sub_paths):
        name = sub_paths.pop(0)
        for child_name, child_node in node.children.items():
            if child_name != name:
                continue
            try:
                node = _from_core_node(pkg_store, child_node)
            except store.StoreException:
                quilt.install(package_path, force=True)
                node = _from_core_node(pkg_store, child_node)
            break
        else:
            raise Exception('Dataset %s not found' % package_path)
    return node
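A hypothetical call, reusing a package path that appears elsewhere on this page; the function returns a quilt node, and data nodes yield their DataFrame when called.

node = _load_from_quilt('jyrjola/lipasto/emissions_by_municipality')
df = node()  # calling a data node materialises its DataFrame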
Example #4
def upload_to_quilt(spark, schemas_dic):
    """
    Function to upload data to quilt and to append it to already existing data
    :param spark: Spark Sessuin
    :return: None
    """

    # remove old data and get new one
    logging.info("Installing quilt gdelt data...")
    quilt.rm("nmduarte/gdelt", force=True)
    quilt.install("nmduarte/gdelt", force=True)
    from quilt.data.nmduarte import gdelt

    # get the old data from quilt
    logging.info("getting data from quilt...")
    events_from_quilt = gdelt.events()
    mentions_from_quilt = gdelt.mentions()
    news_from_quilt = gdelt.news()

    # transform the data into dataframes so it can be appended
    logging.info("Creating dataframes from quilt data...")
    events_from_quilt_df = spark.createDataFrame(events_from_quilt,
                                                 schema=schemas_dic['events2'])
    mentions_from_quilt_df = spark.createDataFrame(
        mentions_from_quilt, schema=schemas_dic['mentions'])
    news_from_quilt_df = spark.createDataFrame(news_from_quilt,
                                               schema=schemas_dic['news'])

    # mentions data - new data
    logging.info("Reading last 15min data from S3...")
    mentions_df = tools.read_from_s3_enriched(spark, "mentions",
                                              schemas_dic['mentions'],
                                              cmd_opts.date)
    events_df = tools.read_from_s3_enriched(spark, "events",
                                            schemas_dic['events2'],
                                            cmd_opts.date)
    news_df = tools.read_from_s3_enriched(spark, "news", schemas_dic['news'],
                                          cmd_opts.date)

    # concatenate already existing data with new data
    logging.info("Appending data to old quilt data...")
    mentions_concat = mentions_from_quilt_df.union(mentions_df)
    events_concat = events_from_quilt_df.union(events_df)
    news_concat = news_from_quilt_df.union(news_df)

    # build the 3 packages
    logging.info("Building quilt packages...")
    quilt.build("nmduarte/gdelt/mentions", mentions_concat.toPandas())
    quilt.build("nmduarte/gdelt/events", events_concat.toPandas())
    quilt.build("nmduarte/gdelt/news", news_concat.toPandas())

    # push the 3 packages
    logging.info("Pushing quilt info...")
    quilt.push("nmduarte/gdelt/mentions", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/events", is_public=True, is_team=False)
    quilt.push("nmduarte/gdelt/news", is_public=True, is_team=False)
Example #5
    def __init__(self, package_name, sub_path, timestamp=None):
        if '/' not in package_name:
            package_name = '/'.join([settings.QUILT_USER, package_name])
        self.package_name = package_name
        try:
            quilt.install(self.package_name, force=True)
        except HTTPResponseException:
            pass

        self.sub_path = sub_path
        self.timestamp = timestamp
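The enclosing class name lies outside this excerpt; assuming it is called QuiltTable (hypothetical), construction might look like this. Without a slash in the name, settings.QUILT_USER supplies the user prefix.

table = QuiltTable('jyrjola/lipasto', sub_path='emissions_by_municipality')
table = QuiltTable('lipasto', sub_path='emissions_by_municipality')  # same, via QUILT_USER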
Example #6
def load_datasets(packages, include_units=False):
    if not isinstance(packages, (list, tuple)):
        packages = [packages]

    datasets = []
    for package_path in packages:
        user, root_pkg, *sub_paths = package_path.split('/')

        pkg_store, root_node = store.PackageStore.find_package(
            None, user, root_pkg)
        if root_node is None:
            # Quilt seems to have a bug that loading a sub path as fragments
            # will corrupt the local db. Load the full data set always for now.
            #quilt.install(package_path, force=True)
            quilt.install(user + "/" + root_pkg)
            pkg_store, root_node = store.PackageStore.find_package(
                None, user, root_pkg)

        node = root_node
        while len(sub_paths):
            name = sub_paths.pop(0)
            for child_name, child_node in node.children.items():
                if child_name != name:
                    continue
                try:
                    node = _from_core_node(pkg_store, child_node)
                except store.StoreException:
                    quilt.install(package_path, force=True)
                    node = _from_core_node(pkg_store, child_node)
                break
            else:
                raise Exception('Dataset %s not found' % package_path)

        try:
            df = node()
        except store.StoreException:
            _materialize(node)
            df = node()

        if include_units:
            for col_name in df.columns:
                unit = node._meta.get('%s_unit' % col_name, None)
                if not unit:
                    continue
                df[col_name] = df[col_name].astype('pint[%s]' % unit)

        datasets.append(df)

    if len(datasets) == 1:
        return datasets[0]

    return datasets
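Hypothetical calls, reusing package paths from elsewhere on this page. With include_units=True, any column that has a '<column>_unit' entry in the node metadata comes back as a pint-backed series, which requires the pint-pandas extension.

df = load_datasets('jyrjola/lipasto/emissions_by_municipality')

# passing a list returns a list of DataFrames in the same order
events, mentions = load_datasets(
    ['nmduarte/gdelt/events', 'nmduarte/gdelt/mentions'])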
Example #7
def FromQuilt(
    package: str = DEFAULT_QUILT_PKG,
    hash: str = None,
    version: str = DEFAULT_QUILT_VERSION,
    tag: str = None,
    force: bool = True,
) -> object:
    """Create a GroupsData object from quilt."""
    quilt.install(
        package=package,
        version=version,
        force=force,
        tag=tag,
        hash=hash,
    )
    # load the package that was just installed, not only the default one
    cc_pkg = quilt.load(package)
    return GroupsData.FromDataFrame(cc_pkg.data.group_definitions())
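Hypothetical calls. Note that quilt's install may treat hash, version and tag as mutually exclusive, so a caller pinning a hash should clear the version default.

groups = FromQuilt()  # module defaults
pinned = FromQuilt(hash='<package hash>', version=None)  # pin one revision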
Example #8
File: quilt.py Project: jalabort/nucleus
import quilt


def get_pkg(user: str,
            package: str,
            hash_key=None,
            force=True):
    r"""Install a Quilt package and load it.

    Parameters
    ----------
    user
        Quilt username that owns the package.
    package
        Name of the package to install.
    hash_key
        Optional package hash pinning an exact revision.
    force
        If True, install without a confirmation prompt.

    Returns
    -------
    The loaded package node (quilt.load returns the package root,
    not a DataFrame, as Example #9 on this page demonstrates).
    """
    pkg_path = f'{user}/{package}'
    quilt.install(pkg_path, hash=hash_key, force=force)
    return quilt.load(pkg_path)
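A hypothetical call, borrowing a package path from elsewhere on this page; data nodes on the returned package are called to obtain their DataFrames.

pkg = get_pkg('jyrjola', 'lipasto')
df = pkg.emissions_by_municipality()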
Example #9
def get_deepbedmap_model_inputs(
    window_bound: rasterio.coords.BoundingBox,
    padding: int = 1000,
    use_whole_rema: bool = False,
) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray):
    """
    Outputs one large tile for each of:
    BEDMAP2, REMA, MEaSUREs Ice Flow Velocity and Antarctic Snow Accumulation
    according to a given window_bound in the form of (xmin, ymin, xmax, ymax).
    """
    data_prep = _load_ipynb_modules("data_prep.ipynb")

    if window_bound == rasterio.coords.BoundingBox(
        left=-1_594_000.0, bottom=-166_500.0, right=-1_575_000.0, top=-95_500.0
    ):
        # Quickly pull from cached quilt storage if using (hardcoded) test region
        quilt.install(package="weiji14/deepbedmap/model/test", force=True)
        pkg = quilt.load(pkginfo="weiji14/deepbedmap/model/test")
        X_tile = pkg.X_tile()
        W1_tile = pkg.W1_tile()
        W2_tile = pkg.W2_tile()
        W3_tile = pkg.W3_tile()
Example #10
File: data.py Project: emorrow3/geosnap
import os
from warnings import warn

import matplotlib.pyplot as plt
import pandas as pd
import quilt

import sys
sys.path.insert(0,
                os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from util import adjust_inflation, convert_gdf

try:
    from quilt.data.spatialucr import census
except ImportError:
    warn("Fetching data. This should only happen once")
    quilt.install("spatialucr/census")
    quilt.install("spatialucr/census_cartographic")
    from quilt.data.spatialucr import census
try:
    from quilt.data.geosnap_data import data_store
except ImportError:
    quilt.build("geosnap_data/data_store")
    from quilt.data.geosnap_data import data_store


class Bunch(dict):
    """A dict with attribute-access."""
    def __getattr__(self, key):
        try:
            return self.__getitem__(key)
        except KeyError:
            raise AttributeError(key)
Example #11
#
# This notebook estimates the greenhouse gases emitted by passenger cars in Helsinki. The main source data for the model is the [LIPASTO](http://lipasto.vtt.fi/en/index.htm) calculation system developed by [VTT Technical Research Centre of Finland Ltd.](http://www.vttresearch.com/)
#
# Click Run -> Run All Cells to run the calculations.

# %%
import math
import re
import pandas as pd
import numpy as np
import scipy
try:
    from quilt.data.jyrjola import lipasto
except ImportError:
    import quilt
    quilt.install('jyrjola/lipasto')
    from quilt.data.jyrjola import lipasto

import plotly
import plotly.graph_objs as go
import cufflinks as cf
import aplans_graphs

plotly.offline.init_notebook_mode(connected=True)
cf.set_config_file(offline=True)

# %% [markdown]
# First we load the municipality-specific data from LIPASTO. We are mostly interested in the total mileage in Helsinki specified by the road type (_highways_ or _urban driving_). The mileage column below is in million kilometres (_Mkm_) and the gases are in metric tonnes (_t_).

# %%
muni = lipasto.emissions_by_municipality().set_index(
Example #12
def install_data():
    quilt.install("gudbrandtandberg/chesspieces", force=True)
Example #13
# %%
INPUT_DATASETS = ['jyrjola/ymparistotilastot']

import math
import re
import pandas as pd
import numpy as np
import importlib

for dataset in INPUT_DATASETS:
    mod_path = dataset.replace('/', '.')
    try:
        mod = importlib.import_module('quilt.data.%s' % mod_path)
    except ImportError:
        import quilt
        quilt.install(dataset)
        # re-import now that the package is installed
        mod = importlib.import_module('quilt.data.%s' % mod_path)

from quilt.data.jyrjola import ymparistotilastot
from utils import dict_merge
import aplans_graphs

import plotly
import plotly.graph_objs as go
import cufflinks as cf

plotly.offline.init_notebook_mode(connected=True)
cf.set_config_file(offline=True)

# %%
df = ymparistotilastot.l34_polttoaine_tavoitteet().copy()
display(df.set_index(['Vuosi']))
Example #14
def install_data():
    # force to avoid y/n prompt; does not re-download
    PKG = 'akarve/BSDS300'
    quilt.install(PKG, force=True)
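Usage follows the same quilt.data import pattern once the package is installed:

install_data()
from quilt.data.akarve import BSDS300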
Example #15
def install_data():
    quilt.install("gudbrandtandberg/chessboard_segmentation")