예제 #1
0
def test_validate_par():
    """Literal ``args`` values are checked against a typed user parameter.

    The test expects a non-numeric string to raise ``ValueError`` on
    instantiation, an int to be accepted, and a numeric string to reach
    the driver as a number.
    """
    up = UserParameter('arg1', type='int')

    def build(value):
        # Entry whose single argument ``arg1`` is the literal *value*.
        return LocalCatalogEntry('', '', driver,
                                 args={'arg1': value},
                                 parameters=[up],
                                 getenv=False)

    rejected = build("oi")
    with pytest.raises(ValueError):
        rejected()

    accepted = build(1)
    accepted()  # OK

    coerced = build("1")
    source = coerced()  # OK
    assert source.kwargs['arg1'] == 1  # a number, not str
예제 #2
0
    def _load(self):
        """
        Load the STAC Catalog.

        Each child of the wrapped STAC object is registered in
        ``self._entries`` as a nested ``StacCatalog`` entry keyed by its id.
        Items are registered (as ``StacItem`` entries) only when the object
        yielded no children at all.
        """
        found_child = False
        for child in self._stac_obj.children():
            found_child = True
            child_entry = LocalCatalogEntry(
                name=child.id,
                description=child.description,
                driver=StacCatalog,
                catalog=self,
                args={'stac_obj': child.filename},
            )
            self._entries[child.id] = child_entry

        if found_child:
            return

        # no sub-catalogs: expose the items of this (last-level) catalog
        for stac_item in self._stac_obj.items():
            item_entry = LocalCatalogEntry(
                name=stac_item.id,
                description='',
                driver=StacItem,
                catalog=self,
                args={'stac_obj': stac_item},
            )
            self._entries[stac_item.id] = item_entry
예제 #3
0
def test_maybe_default_from_env():
    """Parameter defaults of the form ``env(...)`` honour ``getenv``.

    With ``getenv=False`` the default text is passed through unchanged,
    whether or not the variable is set.  With ``getenv=True`` the variable's
    value is substituted, and an empty string is used once it is unset.
    """
    def build(getenv):
        # Fresh parameter and entry for each scenario.
        up = UserParameter('name', default='env(INTAKE_TEST_VAR)')
        return LocalCatalogEntry('',
                                 '',
                                 driver,
                                 args={'arg1': "{{name}}"},
                                 parameters=[up],
                                 getenv=getenv)

    # maybe fill in parameter default from the env, depending on getenv
    e = build(getenv=False)
    s = e()
    assert s.kwargs['arg1'] == 'env(INTAKE_TEST_VAR)'

    os.environ['INTAKE_TEST_VAR'] = 'oi'
    try:
        s = e()
        assert s.kwargs['arg1'] == 'env(INTAKE_TEST_VAR)'

        e = build(getenv=True)
        s = e()
        assert s.kwargs['arg1'] == 'oi'

        del os.environ['INTAKE_TEST_VAR']

        s = e()
        assert s.kwargs['arg1'] == ''
    finally:
        # A failing assertion above must not leave the variable set for
        # whatever runs next in the same process.
        os.environ.pop('INTAKE_TEST_VAR', None)
예제 #4
0
    def _load(self):
        """
        Load the STAC Catalog.

        Children of the wrapped STAC object become nested catalog entries
        (referenced through their self href) and its items become
        ``StacItem`` entries.  Both kinds are stored in ``self._entries``
        keyed by id.
        """
        for child in self._stac_obj.get_children():
            # Collection subclasses Catalog, so it is tested for explicitly
            child_driver = (StacCollection
                            if isinstance(child, pystac.Collection)
                            else StacCatalog)
            child_entry = LocalCatalogEntry(
                name=child.id,
                description=child.description,
                driver=child_driver,  # recursive
                catalog=self,
                args={'stac_obj': child.get_self_href()},
            )
            self._entries[child.id] = child_entry

        for stac_item in self._stac_obj.get_items():
            item_entry = LocalCatalogEntry(
                name=stac_item.id,
                description='',
                driver=StacItem,
                catalog=self,
                args={'stac_obj': stac_item},
            )
            self._entries[stac_item.id] = item_entry
예제 #5
0
    def _load(self):
        """Build one ``spark_dataframe`` entry per table in the Spark catalog.

        The Spark catalog handle is created on first use.  When
        ``self.database`` is ``None`` every database reported by the catalog
        is scanned; otherwise only ``self.database`` is.  Entries are stored
        in ``self._entries`` keyed by table name alone.
        """
        if self.spark_cat is None:
            self.spark_cat = SparkHolder(True, [('catalog', )],
                                         self.context_args).setup()
        self._entries = {}
        if self.database is None:
            dbs = self.spark_cat.listDatabases()
        else:
            dbs = [self.database]

        for db in dbs:
            # NOTE(review): ``self.database`` may be configured as a plain
            # database name rather than an object carrying ``.name``; both
            # forms are accepted here.
            db_name = getattr(db, 'name', db)
            for table in self.spark_cat.listTables(dbName=db_name):
                if db_name:
                    description = ('Spark table %s in database %s'
                                   '' % (table.name, db_name))
                else:
                    description = ('Spark table %s in default database'
                                   '' % table.name)
                args = {'args': [('table', (table.name, ))]}
                e = LocalCatalogEntry(table.name,
                                      description,
                                      'spark_dataframe',
                                      True,
                                      args,
                                      cache=[],
                                      parameters=[],
                                      metadata={},
                                      catalog_dir="",
                                      getenv=False,
                                      getshell=False)
                # pin the driver class on the entry directly
                e._plugin = [SparkDataFrame]
                self._entries[table.name] = e
예제 #6
0
def test_validate_up():
    """Typed user parameters are validated when rendered into a template."""
    def entry_for(param):
        # Entry whose ``arg1`` is the jinja template ``{{name}}``.
        return LocalCatalogEntry('', '', driver,
                                 args={'arg1': "{{name}}"},
                                 parameters=[param],
                                 getenv=False)

    with_default = entry_for(UserParameter('name', default=1, type='int'))
    rendered = with_default()  # OK
    assert rendered.kwargs['arg1'] == '1'
    with pytest.raises(ValueError):
        with_default(name='oi')

    without_default = entry_for(UserParameter('name', type='int'))
    rendered = without_default()  # OK
    # arg1 is a string: real int gets rendered by jinja
    assert rendered.kwargs['arg1'] == '0'  # default default for int
    rendered = without_default(arg1='something')
    assert rendered.kwargs['arg1'] == 'something'
예제 #7
0
 def _load(self):
     """Reflect the database at ``self.uri`` and register its tables.

     Reflection is run with ``views=self.views``.  A ``sql_auto`` entry is
     created for every reflected table that has a primary-key column, using
     the first such column as the index.  Tables without one are skipped.
     """
     import sqlalchemy
     from intake_sql import SQLSourceAutoPartition
     engine = sqlalchemy.create_engine(self.uri)
     meta = sqlalchemy.MetaData(bind=engine)
     meta.reflect(views=self.views)
     self._entries = {}
     for name, table in meta.tables.items():
         # first primary-key column, if the table has any
         key_column = next(
             (col for col in table.columns if col.primary_key), None)
         if key_column is None:
             continue
         description = 'SQL table %s from %s' % (name, self.uri)
         source_args = {
             'uri': self.uri,
             'table': name,
             'index': key_column.name,
             'sql_kwargs': self.kwargs
         }
         entry = LocalCatalogEntry(name,
                                   description,
                                   'sql_auto',
                                   True,
                                   source_args, {}, {}, {},
                                   "",
                                   getenv=False,
                                   getshell=False)
         entry._plugin = [SQLSourceAutoPartition]
         self._entries[name] = entry
예제 #8
0
def test_no_instance():
    """Entries naming an unavailable driver can be compared without error."""
    from intake.catalog.local import LocalCatalogEntry

    first = LocalCatalogEntry('foo', '', 'fake')
    second = LocalCatalogEntry('foo0', '', 'fake')

    # instantiating either entry would fail with driver not found;
    # comparing them must not
    assert first != second
예제 #9
0
def test_explicit_entry_driver():
    """A driver class can be given directly; ``None`` raises ``TypeError``."""
    from intake.source.textfiles import TextFilesSource
    entry = LocalCatalogEntry('test', 'desc', TextFilesSource,
                              args={'urlpath': None})
    info = entry.describe()
    assert info['container'] == 'python'
    source = entry()
    assert isinstance(source, TextFilesSource)

    with pytest.raises(TypeError):
        LocalCatalogEntry('test', 'desc', None)
예제 #10
0
def test_dict_adddel():
    """Entries can be added to and popped from a catalog like a dict."""
    from intake.catalog.base import Catalog
    entry = LocalCatalogEntry(name='trial', description='get this back',
                              driver='csv', args={'urlpath': ""})
    cat = Catalog.from_dict({'trial': entry}, name='mycat')
    assert 'trial' in cat

    # add a second key pointing at the same entry
    cat['trial2'] = entry
    names = list(cat)
    assert names == ['trial', 'trial2']

    # remove the first key again
    cat.pop('trial')
    names = list(cat)
    assert names == ['trial2']

    stored = cat['trial2']
    assert stored.describe() == entry.describe()
예제 #11
0
def test_filter():
    """``Catalog.filter`` keeps only entries accepted by the predicate."""
    from intake.catalog.base import Catalog

    def mentions_pass(e):
        # predicate applied to each entry
        return 'pass' in e._description

    entry1 = LocalCatalogEntry(name='trial', description='get this back',
                               driver='csv', args={'urlpath': ""})
    entry2 = LocalCatalogEntry(name='trial', description='pass this through',
                               driver='csv', args={'urlpath': ""})
    cat = Catalog.from_dict({'trial1': entry1, 'trial2': entry2},
                            name='mycat')

    cat2 = cat.filter(mentions_pass)
    remaining = list(cat2)
    assert remaining == ['trial2']
    assert cat2.trial2 == entry2()
예제 #12
0
 def _load(self):
     """Register one entry for every name returned by ``resource_list()``.

     Each entry uses ``StripeTableSource`` as its driver and receives the
     catalog's API key and version plus the resource name as arguments.
     """
     for resource in resource_list():
         entry = LocalCatalogEntry(
             name=resource,
             description=resource,
             driver=StripeTableSource,
             catalog=self,
             args={
                 'api_key': self.api_key,
                 'api_version': self.api_version,
                 'resource': resource,
             },
         )
         entry._plugin = [StripeTableSource]
         self._entries[resource] = entry
예제 #13
0
def test_unknown():
    """A template variable without a value renders as an empty string."""
    # no parameter declared at all
    undeclared = LocalCatalogEntry('', '', driver,
                                   args={'arg1': "{{name}}"})
    source = undeclared()
    assert source.kwargs['arg1'] == ""

    # parameter has no default
    declared = LocalCatalogEntry('', '', driver,
                                 args={'arg1': "{{name}}"},
                                 parameters=[UserParameter('name')])
    source = declared()
    assert source.kwargs['arg1'] == ""
예제 #14
0
def test_maybe_default_from_env():
    """Parameter defaults of the form ``env(...)`` honour ``getenv``.

    With ``getenv=False`` the default text is passed through unchanged,
    whether or not the variable is set.  With ``getenv=True`` the variable's
    value is substituted, and an empty string is used once it is unset.
    The entry's cached default source is cleared after each environment
    change so the next call re-evaluates it.
    """
    def build(getenv):
        # Fresh parameter and entry for each scenario.
        up = UserParameter('name', default='env(INTAKE_TEST_VAR)')
        return LocalCatalogEntry('',
                                 '',
                                 driver,
                                 args={'arg1': "{{name}}"},
                                 parameters=[up],
                                 getenv=getenv)

    # maybe fill in parameter default from the env, depending on getenv
    e = build(getenv=False)
    s = e()
    assert s.kwargs['arg1'] == 'env(INTAKE_TEST_VAR)'

    os.environ['INTAKE_TEST_VAR'] = 'oi'
    try:
        # Clear the cached source so we can (not) pick up the changed
        # environment variable.
        e.clear_cached_default_source()

        s = e()
        assert s.kwargs['arg1'] == 'env(INTAKE_TEST_VAR)'

        e = build(getenv=True)
        s = e()
        assert s.kwargs['arg1'] == 'oi'

        del os.environ['INTAKE_TEST_VAR']
        # Clear the cached source so we can pick up the changed environment
        # variable.
        e.clear_cached_default_source()

        s = e()
        assert s.kwargs['arg1'] == ''
    finally:
        # A failing assertion above must not leave the variable set for
        # whatever runs next in the same process.
        os.environ.pop('INTAKE_TEST_VAR', None)
예제 #15
0
 def _create_entry(self, row):
     """Register a ``dremio`` catalog entry for one table row.

     ``row`` must expose ``TABLE_SCHEMA``, ``TABLE_NAME`` and ``TABLE_TYPE``.
     The entry is keyed by ``<schema>."<table>"`` and selects every column
     of that table.
     """
     qualified = f'{row.TABLE_SCHEMA}."{row.TABLE_NAME}"'
     summary = f'Dremio {row.TABLE_TYPE} {qualified} from {self._hostname}'
     source_args = {
         'uri': self._uri,
         'sql_expr': f'select * from {qualified}',
     }
     entry = LocalCatalogEntry(
         qualified, summary, 'dremio', True,
         source_args, {}, {}, {}, "",
         getenv=False, getshell=False)
     entry._plugin = [DremioSource]
     self._entries[qualified] = entry
예제 #16
0
def test_cache_default_source():
    """Only a source built with no user-supplied parameters may be cached."""
    # If the user provides parameters, don't allow default caching
    parametrised = LocalCatalogEntry('', '', driver, getenv=False,
                                     parameters=[UserParameter('name',
                                                               default='oi')])
    overridden = parametrised(name="oioi")
    default = parametrised()
    assert overridden is not default
    default = parametrised()
    overridden = parametrised(name="oioi")
    assert default is not overridden

    # Otherwise, we can cache the default source
    plain = LocalCatalogEntry('', '', driver, getenv=False)
    first = plain()
    second = plain()
    assert first is second
예제 #17
0
    def add(self, key, source):
        """Add the persisted source to the store under the given key

        The store file at ``self.path`` is read, updated with the YAML
        description of ``source`` and written back; a matching entry is also
        placed in ``self._entries``.

        key : str
            The unique token of the un-persisted, original source
        source : DataSource instance
            The thing to add to the persisted catalogue, referring to persisted
            data
        """
        from intake.catalog.local import LocalCatalogEntry
        try:
            with self.fs.open(self.path, 'rb') as f:
                data = yaml.safe_load(f)
        except IOError:
            data = None
        # ``safe_load`` yields None for an empty document, and a file may
        # lack (or null out) its ``sources`` mapping; treat each of these,
        # like a missing file, as an empty store.
        if data is None:
            data = {}
        if data.get('sources') is None:
            data['sources'] = {}
        ds = source._yaml()['sources'][source.name]
        data['sources'][key] = ds
        with self.fs.open(self.path, 'wb') as fo:
            fo.write(yaml.dump(data, default_flow_style=False).encode())
        # the remaining constructor arguments come from the YAML description
        self._entries[key] = LocalCatalogEntry(
            name=ds['metadata']['original_name'],
            direct_access=True,
            cache=[],
            parameters=[],
            catalog_dir=None,
            **ds)
예제 #18
0
 def search(self, **query):
     """ Search for entries in the collection catalog

     Every keyword must name a column of ``self.df``, otherwise
     ``ValueError`` is raised.  Columns not mentioned are added to the query
     with value ``None``.  A new entry with a uuid-suffixed name is stored
     in ``self._entries`` and returned.
     """
     columns = self.df.columns.tolist()
     for requested in query:
         if requested not in columns:
             raise ValueError(
                 f'{requested} is not in {self.collection_name}')
     # unconstrained columns are passed on explicitly as None
     for column in columns:
         query.setdefault(column, None)

     name = '_'.join((self.collection_name, str(uuid.uuid4())))
     source_args = {'collection_name': self.collection_name, 'query': query}
     driver = config.get('sources')[self.collection_type]
     description = f'Catalog entry from {self.collection_name} collection'
     entry = LocalCatalogEntry(
         name=name,
         description=description,
         driver=driver,
         direct_access=True,
         args=source_args,
         cache={},
         parameters={},
         metadata=self.metadata.copy(),
         catalog_dir='',
         getenv=False,
         getshell=False,
     )
     self._entries[name] = entry
     return entry
예제 #19
0
 def _load(self):
     """
     Connect to the OmniSci database, list the available tables, and
     construct a catalog entry for each table.

     If no ibis connection is held yet, one is opened when ``ibis.omniscidb``
     can be imported; an ``ImportError`` leaves ``self._ibis_con`` as it was.
     """
     connection = pymapd.connect(**self._init_args)
     self._entries = {}
     if self._ibis_con is None:
         try:
             import ibis.omniscidb
             init = self._init_args
             self._ibis_con = ibis.omniscidb.connect(
                 uri=init['uri'],
                 user=init['user'],
                 password=init['password'],
                 host=init['host'],
                 port=init['port'],
                 protocol=init['protocol'],
                 database=init['dbname'],
             )
         except ImportError:
             pass
     for table_name in connection.get_tables():
         summary = "SQL table %s from %s" % (table_name, str(self))
         # connection settings with falsy values are left out
         source_args = dict(
             (key, value)
             for key, value in self._init_args.items() if value
         )
         source_args['ibis_con'] = self._ibis_con
         source_args["sql_expr"] = table_name
         self._entries[table_name] = LocalCatalogEntry(
             table_name, summary, "omnisci", True, source_args)
예제 #20
0
    def _load(self):
        """
        Query the Civis database for all the schemas which have tables
        and construct catalog entries for them.

        ``pg_catalog`` and ``information_schema`` are excluded by the query.
        Entries use ``CivisSchema`` as driver and are keyed by schema name.
        """
        sql = (
            "SELECT DISTINCT(table_schema) FROM information_schema.tables WHERE "
            "table_schema != 'pg_catalog' AND table_schema != 'information_schema'"
        )
        future = civis.io.query_civis(
            sql,
            database=self._database,
            client=self._client,
        )
        response = future.result()

        # first column of every result row is the schema name
        schema_names = [row[0] for row in response.result_rows]
        self._entries = {}
        for schema_name in schema_names:
            self._entries[schema_name] = LocalCatalogEntry(
                schema_name,
                f"Civis schema {schema_name} from {self._database}",
                CivisSchema,
                True,
                args={
                    "api_key": self._api_key,
                    "database": self._database,
                    "schema": schema_name,
                },
                getenv=False,
                getshell=False,
            )
예제 #21
0
def test_auto_env_expansion():
    """``{{env(...)}}`` in an argument is expanded only when ``getenv`` is on.

    With ``getenv=False`` the template text is passed through verbatim.
    With ``getenv=True`` it is replaced by the variable's value (quoted or
    unquoted name), or by an empty string once the variable is unset.
    """
    def build(template, getenv):
        # Entry whose ``arg1`` is *template* and which declares no parameters.
        return LocalCatalogEntry('',
                                 '',
                                 driver,
                                 args={'arg1': template},
                                 parameters=[],
                                 getenv=getenv)

    os.environ['INTAKE_TEST_VAR'] = 'oi'
    try:
        e = build("{{env(INTAKE_TEST_VAR)}}", getenv=False)
        s = e()

        # when getenv is False, you pass through the text
        assert s.kwargs['arg1'] == '{{env(INTAKE_TEST_VAR)}}'

        e = build("{{env(INTAKE_TEST_VAR)}}", getenv=True)
        s = e()
        assert s.kwargs['arg1'] == 'oi'

        # same, but with quoted environment name
        e = build('{{env("INTAKE_TEST_VAR")}}', getenv=True)
        s = e()
        assert s.kwargs['arg1'] == 'oi'

        del os.environ['INTAKE_TEST_VAR']
        # Clear the cached source so we can pick up the changed environment
        # variable.
        e.clear_cached_default_source()

        s = e()
        assert s.kwargs['arg1'] == ''

        e = build("{{env(INTAKE_TEST_VAR)}}", getenv=False)
        s = e()
        assert s.kwargs['arg1'] == '{{env(INTAKE_TEST_VAR)}}'
    finally:
        # A failing assertion above must not leave the variable set for
        # whatever runs next in the same process.
        os.environ.pop('INTAKE_TEST_VAR', None)
예제 #22
0
    def _load(self):
        """
        Query the Civis database for all the tables in the schema
        and construct catalog entries for them.

        Two queries are issued and awaited together: one for the table names
        and one for the geometry columns.  Each table gets a ``CivisSource``
        entry whose ``geometry`` and ``crs`` arguments are ``None`` when no
        geometry column (or SRID, respectively) was found for it.
        """
        tables_future = civis.io.query_civis(
            "SELECT table_name FROM information_schema.tables "
            f"WHERE table_schema = '{self._dbschema}'",
            database=self._database,
            client=self._client,
        )
        # If the database has a geometry_columns table, we prefer that as we can
        # get the SRID for a column from it. Otherwise, we get the geometry columns
        # from the information schema.
        if self._has_geom:
            geometry_sql = (
                "SELECT f_table_name, f_geometry_column, srid FROM geometry_columns "
                f"WHERE f_table_schema = '{self._dbschema}'"
            )
        else:
            geometry_sql = (
                "SELECT table_name, column_name FROM information_schema.columns "
                f"WHERE table_schema = '{self._dbschema}' and udt_name = 'geometry'"
            )
        geometry_future = civis.io.query_civis(
            geometry_sql,
            database=self._database,
            client=self._client,
        )
        done, _ = concurrent.futures.wait((tables_future, geometry_future))
        assert tables_future in done and geometry_future in done
        tables_response = tables_future.result()
        geometry_response = geometry_future.result()

        table_names = [row[0] for row in tables_response.result_rows]
        self._entries = {}
        for table_name in table_names:
            # geometry rows belonging to this table
            own_rows = [
                row for row in geometry_response.result_rows
                if row[0] == table_name
            ]
            geometry = [row[1] for row in own_rows]
            # the SRID column only exists in the geometry_columns query
            if self._has_geom:
                srid = [row[2] for row in own_rows]
            else:
                srid = []
            self._entries[table_name] = LocalCatalogEntry(
                table_name,
                f"Civis table {table_name} from {self._database}",
                CivisSource,
                True,
                args={
                    "api_key": self._api_key,
                    "civis_kwargs": self._civis_kwargs,
                    "database": self._database,
                    "table": table_name,
                    "schema": self._dbschema,
                    "geometry": geometry if geometry else None,
                    "crs": f"EPSG:{srid[0]}" if srid else None,
                },
                getenv=False,
                getshell=False,
            )
예제 #23
0
def test_from_dict_with_data_source():
    "Check that Catalog.from_dict accepts DataSources not wrapped in Entry."
    from intake.catalog.base import Catalog
    entry = LocalCatalogEntry(name='trial', description='get this back',
                              driver='csv', args=dict(urlpath=""))
    ds = entry()
    # The check is simply that construction from a bare DataSource does not
    # raise; no file is written, so no temporary directory is needed.
    Catalog.from_dict({'trial': ds}, name='mycat')
예제 #24
0
def test_auto_env_expansion():
    """``{{env(...)}}`` in an argument is expanded only when ``getenv`` is on.

    With ``getenv=False`` the template text is passed through verbatim.
    With ``getenv=True`` it is replaced by the variable's value (quoted or
    unquoted name), or by an empty string once the variable is unset.
    """
    def build(template, getenv):
        # Entry whose ``arg1`` is *template* and which declares no parameters.
        return LocalCatalogEntry('',
                                 '',
                                 driver,
                                 args={'arg1': template},
                                 parameters=[],
                                 getenv=getenv)

    os.environ['INTAKE_TEST_VAR'] = 'oi'
    try:
        e = build("{{env(INTAKE_TEST_VAR)}}", getenv=False)
        s = e()

        # when getenv is False, you pass through the text
        assert s.kwargs['arg1'] == '{{env(INTAKE_TEST_VAR)}}'

        e = build("{{env(INTAKE_TEST_VAR)}}", getenv=True)
        s = e()
        assert s.kwargs['arg1'] == 'oi'

        # same, but with quoted environment name
        e = build('{{env("INTAKE_TEST_VAR")}}', getenv=True)
        s = e()
        assert s.kwargs['arg1'] == 'oi'

        del os.environ['INTAKE_TEST_VAR']

        s = e()
        assert s.kwargs['arg1'] == ''

        e = build("{{env(INTAKE_TEST_VAR)}}", getenv=False)
        s = e()
        assert s.kwargs['arg1'] == '{{env(INTAKE_TEST_VAR)}}'
    finally:
        # A failing assertion above must not leave the variable set for
        # whatever runs next in the same process.
        os.environ.pop('INTAKE_TEST_VAR', None)
예제 #25
0
    def _load(self):
        """Read the THREDDS catalog at ``self.url`` and build the entries.

        The catalog's name and metadata are copied onto this object.  Every
        catalog reference becomes a nested ``thredds_cat`` entry keyed by its
        title, and every dataset that advertises an ``OPENDAP`` access URL
        becomes an ``opendap`` entry keyed by its name.  Datasets without
        such a URL are skipped.
        """
        from siphon.catalog import TDSCatalog

        self.cat = TDSCatalog(self.url)
        self.name = self.cat.catalog_name
        self.metadata.update(self.cat.metadata)

        # sub-cats
        self._entries = {
            r.title: LocalCatalogEntry(
                r.title,
                'THREDDS cat',
                'thredds_cat',
                True,
                {'url': r.href},
                [],
                [],
                {},
                None,
                catalog=self,
            )
            for r in self.cat.catalog_refs.values()
        }

        # data entries (only those with opendap links); indexing a dataset
        # that lacks the OPENDAP service would raise KeyError and abort the
        # whole load, so such datasets are filtered out
        self._entries.update({
            ds.name: LocalCatalogEntry(
                ds.name,
                'THREDDS data',
                'opendap',
                True,
                {
                    'urlpath': ds.access_urls['OPENDAP'],
                    'chunks': None
                },
                [],
                [],
                {},
                None,
                catalog=self,
            )
            for ds in self.cat.datasets.values()
            if 'OPENDAP' in ds.access_urls
        })
예제 #26
0
def test_nested_remote(intake_server):
    """An ``intake_remote`` entry in a local catalog exposes server entries."""
    from intake.catalog.local import LocalCatalogEntry
    catalog = open_catalog()
    remote_entry = LocalCatalogEntry(
        'server', 'remote test', 'intake_remote', True,
        {'url': intake_server}, [], [], {}, None)
    catalog._entries = {'server': remote_entry}

    remote_catalog = catalog.server()
    assert 'entry1' in remote_catalog
예제 #27
0
def test_parameter_default():
    """A parameter's default is rendered into the templated argument."""
    param = UserParameter('name', default='oi')
    entry = LocalCatalogEntry('', '', driver,
                              args={'arg1': "{{name}}"},
                              parameters=[param])
    source = entry()
    assert source.kwargs['arg1'] == 'oi'
예제 #28
0
    def _instantiate_source(self):
        """ Driving method of this class.

        Resolves the active storage mode (``self.storage_mode``, falling back
        to ``self.default``), builds a catalog entry whose driver is the URL
        scheme of that mode, and returns the source obtained from it.  The
        returned source's metadata additionally carries ``url_path``,
        ``canonical_name``, ``storage_mode``, ``avro_schema`` and ``dtypes``.
        """
        mode = self.storage[self.storage_mode or self.default]

        # A storage mode is either a bare URL or a dict with "url" and
        # optional "args".
        args = {}
        mode_url = mode
        if isinstance(mode, dict):
            mode_url = mode["url"]
            # Work on a copy: the defaults injected below must not be written
            # back into the stored configuration.  ``or {}`` also covers an
            # explicit ``args: null``.
            args = dict(mode.get("args") or {})

        parse_result, url_path = self.parse_storage_mode_url(mode_url)
        catalog_entry = self.catalog_object[self.name]
        desc = catalog_entry.describe()

        if parse_result.scheme == "parquet":
            # https://github.com/dask/dask/issues/5272: Dask parquet metadata w/ ~2k files very slow
            args.setdefault("gather_statistics", False)
            args.setdefault("engine", "pyarrow")

        entry = LocalCatalogEntry(
            name=desc["name"],
            description=desc["description"],
            driver=parse_result.scheme,
            args={"urlpath": url_path, **args},
            parameters=catalog_entry._user_parameters,
            catalog=self.cat,
        )

        params = {
            "canonical_name": self._canonical_name,
            "storage_mode": self.storage_mode,
            "avro_schema": self._avro_schema,
            "dtypes": self._dtypes,
        }

        # ``params`` are attached to the metadata below rather than being
        # passed to the driver as keyword arguments.
        source = entry.get(metadata=self.metadata, **self.kwargs)

        source.metadata["url_path"] = url_path
        source.metadata = {**source.metadata, **params}

        return source
예제 #29
0
    def _load(self):
        """ load entries into catalog

        One entry per run returned by ``get_runs`` is stored in
        ``self._entries`` keyed by the run's guid.  Along the way
        ``self._guid_lookup`` (run_id -> guid) is filled, and afterwards
        ``self._experiments``, ``self._samples`` and the inverse mapping
        ``self._run_id_lookup`` (guid -> run_id) are rebuilt.
        """

        self._entries = {}

        experiment_names = set()
        sample_names = set()
        for run in get_runs(self.conn):

            run_description = json.loads(run['run_description'])

            # move these functions so they can be loaded elsewhere
            exp_name, sample_name = get_names_from_experiment_id(
                self.conn, run['exp_id'])
            dependent, independent = parameters_from_description(
                run_description)

            run_id = run['run_id']
            guid = run['guid']
            db_path = str(self._db_path)

            self._entries[guid] = LocalCatalogEntry(
                name='run {}'.format(run_id),
                description='run {} at {} with guid {}'.format(
                    run_id, db_path, guid),
                driver=self._source_driver,
                direct_access='forbid',
                args={
                    'db_path': db_path,
                    'guid': guid,
                    'run_id': run_id,
                },
                cache=None,
                parameters=[],
                metadata={
                    "start_time": run['run_timestamp'],
                    "stop_time": run['completed_timestamp'],
                    "dependent_parameters": dependent,
                    "independent_parameters": independent,
                    "experiment_name": exp_name,
                    "sample_name": sample_name,
                    "table_name": run['result_table_name'],
                    'plots': make_default_plots(run_description),
                },
                catalog_dir=db_path,
                getenv=False,
                getshell=False,
                catalog=self,
            )

            self._guid_lookup[run_id] = guid
            experiment_names.add(exp_name)
            sample_names.add(sample_name)

        self._experiments = list(experiment_names)
        self._samples = list(sample_names)
        # invert run_id -> guid into guid -> run_id
        self._run_id_lookup = {
            known_guid: known_run_id
            for known_run_id, known_guid in self._guid_lookup.items()
        }
예제 #30
0
def test_up_override_and_render():
    """A value supplied at call time replaces the parameter's default."""
    param = UserParameter('name', default='env(INTAKE_TEST_VAR)')
    entry = LocalCatalogEntry('', '', driver,
                              args={'arg1': "{{name}}"},
                              parameters=[param],
                              getenv=False)
    source = entry(name='other')
    assert source.kwargs['arg1'] == 'other'