def test_get_sdm_list(self):
    df1 = SSDMManager.get_sdm_list()
    self.assertEqual(len(df1), 0)
    data = [
        {
            'name': 'species_1038',
            'taxon_id': 1038,
            'date_create': datetime.now(),
            'date_update': datetime.now(),
            'properties': {},
        },
        {
            'name': 'species_1180',
            'taxon_id': 1180,
            'date_create': datetime.now(),
            'date_update': datetime.now(),
            'properties': {},
        },
        {
            'name': 'species_1260',
            'taxon_id': 1260,
            'date_create': datetime.now(),
            'date_update': datetime.now(),
            'properties': {},
        },
    ]
    ins = niamoto_db_meta.sdm_registry.insert().values(data)
    with Connector.get_connection() as connection:
        connection.execute(ins)
    df2 = SSDMManager.get_sdm_list()
    self.assertEqual(len(df2), 3)
def create_dimension(self, connection=None):
    """
    Create the dimension in database.
    :param connection: If not None, use an existing connection.
    """
    LOGGER.debug("Creating {}".format(self))
    close_after = False
    if connection is None:
        connection = Connector.get_engine().connect()
        close_after = True
    if self.is_created(connection):
        m = "The dimension {} already exists in database. Creation will " \
            "be skipped."
        LOGGER.warning(m.format(self.name))
        if close_after:
            connection.close()
        return
    with connection.begin():
        self.table.create(connection)
        ins = meta.dimension_registry.insert().values({
            'name': self.name,
            'dimension_type_key': self.get_key(),
            'label_column': self.label_col,
            'date_create': datetime.now(),
            'properties': self.properties,
        })
        connection.execute(ins)
    if close_after:
        connection.close()
    LOGGER.debug("{} successfully created".format(self))
def truncate(self, cascade=False, connection=None):
    """
    Truncate an existing dimension (i.e. drop every row).
    :param connection: If not None, use an existing connection.
    :param cascade: If True, TRUNCATE CASCADE.
    """
    LOGGER.debug("Start truncating {}".format(self))
    close_after = False
    if connection is None:
        connection = Connector.get_engine().connect()
        close_after = True
    if not self.is_created(connection):
        m = "The dimension {} does not exist in database." \
            " Truncate will be aborted."
        LOGGER.warning(m.format(self.name))
        if close_after:
            connection.close()
        return
    with connection.begin():
        sql = "TRUNCATE {}".format("{}.{}".format(
            settings.NIAMOTO_DIMENSIONS_SCHEMA,
            self.name
        ))
        if cascade:
            sql += " CASCADE"
        connection.execute(sql)
    if close_after:
        connection.close()
    LOGGER.debug("{} successfully truncated".format(self))
def load(cls, fact_table_name, publisher_cls=None):
    """
    Load a registered fact table and return it.
    :param fact_table_name: The name of the fact table to load.
    :param publisher_cls: The publisher class to use for populating
        the fact table. Must be a subclass of BaseFactTablePublisher.
    :return: The loaded fact table.
    """
    # TODO: Load properties from registry
    with Connector.get_connection() as connection:
        meta_ = sa.MetaData()
        meta_.reflect(
            bind=connection,
            schema=settings.NIAMOTO_FACT_TABLES_SCHEMA,
        )
        table_key = '{}.{}'.format(
            settings.NIAMOTO_FACT_TABLES_SCHEMA,
            fact_table_name,
        )
        table = meta_.tables[table_key]
        dimensions = []
        dim_col_names = {}
        measures = []
        # Primary-key columns are foreign keys to the dimension tables;
        # every other column is a measure.
        for pk in table.primary_key:
            dim_col_names[pk.name] = True
            dim_name = list(pk.foreign_keys)[0].column.table.name
            dim = DimensionManager.get_dimension(dim_name)
            dimensions.append(dim)
        for col in table.columns:
            if col.name not in dim_col_names:
                measures.append(col.copy())
    return cls(
        fact_table_name,
        dimensions,
        measures,
        publisher_cls=publisher_cls,
    )
def populate(self, dataframe):
    """
    Populate the fact table. Assume that the input dataframe has been
    correctly formatted to fit the fact table columns. All null
    measures are set to 0 before populating.
    :param dataframe: The dataframe to populate from.
    """
    LOGGER.debug("Populating {}".format(self))
    cols = [c.name for c in self.columns]
    s = io.StringIO()
    dataframe[cols].fillna(value=0).to_csv(s, columns=cols, index=False)
    s.seek(0)
    sql_copy = \
        """
        COPY {}.{} ({}) FROM STDIN CSV HEADER DELIMITER ',';
        """.format(
            settings.NIAMOTO_FACT_TABLES_SCHEMA,
            self.name,
            ','.join(cols)
        )
    raw_connection = Connector.get_engine().raw_connection()
    cur = raw_connection.cursor()
    cur.copy_expert(sql_copy, s)
    cur.close()
    raw_connection.commit()
    raw_connection.close()
    LOGGER.debug("{} successfully populated".format(self))
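# A minimal, standalone sketch of the StringIO + copy_expert bulk-load
# pattern used by populate() above, assuming psycopg2 as the driver.
# The DSN, table name and column list below are hypothetical
# placeholders, not part of Niamoto's API.
import io

import psycopg2


def copy_dataframe_sketch(df, dsn, table, cols):
    """Bulk-load df[cols] into table through COPY ... FROM STDIN."""
    buf = io.StringIO()
    # Serialize to CSV in memory; COPY then reads it back as STDIN.
    df[cols].fillna(value=0).to_csv(buf, columns=cols, index=False)
    buf.seek(0)
    sql = "COPY {} ({}) FROM STDIN CSV HEADER DELIMITER ',';".format(
        table, ','.join(cols))
    conn = psycopg2.connect(dsn)
    try:
        with conn.cursor() as cur:
            cur.copy_expert(sql, buf)
        conn.commit()
    finally:
        conn.close()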
def create_fact_table(self, connection=None):
    """
    Create the fact table in database.
    :param connection: If not None, use an existing connection.
    """
    close_after = False
    if connection is None:
        connection = Connector.get_engine().connect()
        close_after = True
    if self.is_created(connection):
        m = "The fact table {} already exists in database. Creation " \
            "will be skipped."
        LOGGER.warning(m.format(self.name))
        if close_after:
            connection.close()
        return
    with connection.begin():
        self.table.create(connection)
        ins = meta.fact_table_registry.insert().values({
            'name': self.name,
            'date_create': datetime.now(),
            'properties': self.properties,
        })
        connection.execute(ins)
    if close_after:
        connection.close()
def test_add_sdm(self):
    # Test non-existing raster
    null_path = os.path.join(NIAMOTO_HOME, "NULL.tif")
    self.assertRaises(
        FileNotFoundError,
        SSDMManager.add_sdm,
        1038,
        null_path,
        tile_dimension=(200, 200),
    )
    # Test wrong taxon id
    self.assertRaises(
        NoRecordFoundError,
        SSDMManager.add_sdm,
        -1,
        TEST_SDM_1038,
    )
    # Test existing raster
    SSDMManager.add_sdm(
        1038,
        TEST_SDM_1038,
    )
    df = SSDMManager.get_raster_list()
    self.assertEqual(len(df), 1)
    self.assertEqual(df['name'].iloc[0], 'species_{}'.format(1038))
    engine = Connector.get_engine()
    inspector = Inspector.from_engine(engine)
    self.assertIn(
        'species_{}'.format(1038),
        inspector.get_table_names(schema=settings.NIAMOTO_SSDM_SCHEMA),
    )
def make_mptt(cls):
    """
    Build the MPTT (modified preorder tree traversal) index in database.
    """
    df = cls.get_raw_taxon_dataframe()
    mptt = cls.construct_mptt(df)
    mptt['taxon_id'] = mptt.index
    upd = meta.taxon.update().where(
        meta.taxon.c.id == bindparam('taxon_id')
    ).values({
        'mptt_tree_id': bindparam('mptt_tree_id'),
        'mptt_depth': bindparam('mptt_depth'),
        'mptt_left': bindparam('mptt_left'),
        'mptt_right': bindparam('mptt_right'),
    })
    with Connector.get_connection() as connection:
        connection.execute(
            upd,
            mptt[[
                'taxon_id',
                'mptt_tree_id',
                'mptt_depth',
                'mptt_left',
                'mptt_right',
            ]].to_dict(orient='records')
        )
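# Background: MPTT stores, for every node, the left/right bounds of a
# depth-first walk of its tree, so that "all descendants of X" becomes
# a single range predicate
# (child.mptt_left BETWEEN X.mptt_left AND X.mptt_right)
# instead of a recursive query. Illustrative numbering for a
# three-level branch (values are examples, not Niamoto output):
#
#     Familia    left=1, right=6
#       Genus      left=2, right=5
#         Species    left=3, right=4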
def _process(self, *args, properties=None, **kwargs):
    """
    Return the plot dataframe.
    :param properties: List of properties to retain. Can be a python
        list or a comma (',') separated string.
    """
    with Connector.get_connection() as connection:
        sel_keys = select([
            func.jsonb_object_keys(
                meta.plot.c.properties
            ).distinct(),
        ])
        if properties is None:
            keys = [i[0] for i in connection.execute(sel_keys).fetchall()]
        else:
            if isinstance(properties, str):
                properties = properties.split(',')
            keys = properties
        props = [meta.plot.c.properties[k].label(k) for k in keys]
        sel = select([
            meta.plot.c.id.label('id'),
            meta.plot.c.name.label('name'),
            func.st_x(meta.plot.c.location).label('x'),
            func.st_y(meta.plot.c.location).label('y'),
        ] + props)
        df = pd.read_sql(sel, connection, index_col='id')
        # Replace None values with nan
        df.fillna(value=pd.np.NAN, inplace=True)
        return df, [], {'index_label': 'id'}
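# For reference, the dynamic property selection above compiles to
# PostgreSQL along these lines (an illustrative rendering with a
# made-up JSONB key, not captured output):
#
#     SELECT DISTINCT jsonb_object_keys(plot.properties) FROM plot;
#
#     SELECT plot.id AS id, plot.name AS name,
#            ST_X(plot.location) AS x, ST_Y(plot.location) AS y,
#            plot.properties -> 'elevation' AS elevation
#     FROM plot;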
def test_get_raster_list(self):
    df1 = RasterManager.get_raster_list()
    self.assertEqual(len(df1), 0)
    data = [
        {
            'name': 'raster_1',
            'date_create': datetime.now(),
            'date_update': datetime.now(),
            'properties': {},
        },
        {
            'name': 'raster_2',
            'date_create': datetime.now(),
            'date_update': datetime.now(),
            'properties': {},
        },
        {
            'name': 'raster_3',
            'date_create': datetime.now(),
            'date_update': datetime.now(),
            'properties': {},
        },
    ]
    ins = niamoto_db_meta.raster_registry.insert().values(data)
    with Connector.get_connection() as connection:
        connection.execute(ins)
    df2 = RasterManager.get_raster_list()
    self.assertEqual(len(df2), 3)
def test_sync_insert(self):
    self.tearDownClass()
    self.setUpClass()
    data_provider_3 = TestDataProvider('test_data_provider_3')
    with Connector.get_connection() as connection:
        pp3 = BasePlotProvider(data_provider_3)
        self.assertEqual(len(pp3.get_niamoto_plot_dataframe(connection)), 0)
        pl = pd.DataFrame.from_records([
            {
                'id': 0,
                'name': 'plot_3_1',
                'location': from_shape(Point(166.5521, -22.0939), srid=4326),
                'properties': '{}',
            },
            {
                'id': 1,
                'name': 'plot_3_2',
                'location': from_shape(Point(166.551, -22.098), srid=4326),
                'properties': '{}',
            },
        ], index='id')
        i, u, d = pp3._sync(pl, connection)
        self.assertEqual(len(i), 2)
        self.assertEqual(len(u), 0)
        self.assertEqual(len(d), 0)
        self.assertEqual(len(pp3.get_niamoto_plot_dataframe(connection)), 2)
def fix_db_sequences():
    fix_db_sequences_ownership()
    with Connector.get_connection() as connection:
        res = connection.execute("""
            SELECT 'SELECT SETVAL(' ||
                   quote_literal(quote_ident(PGT.schemaname) ||
                       '.' || quote_ident(S.relname)) ||
                   ', COALESCE(MAX(' || quote_ident(C.attname) ||
                       '), 1) ) FROM ' ||
                   quote_ident(PGT.schemaname) || '.' ||
                   quote_ident(T.relname) || ';'
            FROM pg_class AS S,
                 pg_depend AS D,
                 pg_class AS T,
                 pg_attribute AS C,
                 pg_tables AS PGT
            WHERE S.relkind = 'S'
                AND S.oid = D.objid
                AND D.refobjid = T.oid
                AND D.refobjid = C.attrelid
                AND D.refobjsubid = C.attnum
                AND T.relname = PGT.tablename
                AND PGT.schemaname = '{}'
            ORDER BY S.relname;
        """.format(settings.NIAMOTO_SCHEMA))
        statements = res.fetchall()
        for s in statements:
            connection.execute(s[0])
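# Each row returned by the query above is itself an executable SQL
# statement; this is why the loop re-executes s[0]. An illustrative
# example of a generated statement (the sequence, table and column
# names are hypothetical):
#
#     SELECT SETVAL('niamoto.plot_id_seq', COALESCE(MAX(id), 1))
#     FROM niamoto.plot;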
def get_current_revision(cls):
    LOGGER.debug("Getting the current database revision using alembic...")
    with Connector.get_connection() as connection:
        context = migration.MigrationContext.configure(connection)
        rev = context.get_current_revision()
        LOGGER.debug("Current revision is '{}'".format(rev))
        return rev
def test_get_delete_dataframe(self):
    data_provider_1 = TestDataProvider('test_data_provider_1')
    with Connector.get_connection() as connection:
        prov1 = BasePlotOccurrenceProvider(data_provider_1)
        df1 = prov1.get_niamoto_plot_occurrence_dataframe(connection)
        # 1. Nothing to delete
        data_1 = pd.DataFrame.from_records([
            {
                'plot_id': 0,
                'occurrence_id': 0,
                'occurrence_identifier': 'PLOT1_000',
            },
            {
                'plot_id': 1,
                'occurrence_id': 0,
                'occurrence_identifier': 'PLOT2_000',
            },
            {
                'plot_id': 1,
                'occurrence_id': 1,
                'occurrence_identifier': 'PLOT2_001',
            },
            {
                'plot_id': 1,
                'occurrence_id': 2,
                'occurrence_identifier': 'PLOT2_002',
            },
            {
                'plot_id': 2,
                'occurrence_id': 5,
                'occurrence_identifier': 'PLOT2_002',
            },
        ], index=['plot_id', 'occurrence_id'])
        reindexed_data_1 = prov1.get_reindexed_provider_dataframe(data_1)
        delete = prov1.get_delete_dataframe(df1, reindexed_data_1)
        self.assertEqual(len(delete), 0)
        # 2. Everything to delete
        data_2 = pd.DataFrame.from_records([])
        reindexed_data_2 = prov1.get_reindexed_provider_dataframe(data_2)
        delete = prov1.get_delete_dataframe(df1, reindexed_data_2)
        self.assertEqual(len(delete), 5)
        # 3. Partial delete
        data_3 = pd.DataFrame.from_records([
            {
                'plot_id': 0,
                'occurrence_id': 0,
                'occurrence_identifier': 'PLOT1_000',
            },
            {
                'plot_id': 1,
                'occurrence_id': 0,
                'occurrence_identifier': 'PLOT2_000',
            },
        ], index=['plot_id', 'occurrence_id'])
        reindexed_data_3 = prov1.get_reindexed_provider_dataframe(data_3)
        delete = prov1.get_delete_dataframe(df1, reindexed_data_3)
        self.assertEqual(len(delete), 3)
        self.assertEqual(list(delete['provider_occurrence_pk']), [1, 2, 5])
def fix_db_sequences_ownership():
    with Connector.get_connection() as connection:
        res = connection.execute("""
            SELECT 'ALTER SEQUENCE ' || quote_ident(MIN(schema_name)) ||
                   '.' || quote_ident(MIN(seq_name)) ||
                   ' OWNED BY ' || quote_ident(MIN(schema_name)) ||
                   '.' || quote_ident(MIN(TABLE_NAME)) ||
                   '.' || quote_ident(MIN(column_name)) || ';'
            FROM (
                SELECT n.nspname AS schema_name,
                       c.relname AS TABLE_NAME,
                       a.attname AS column_name,
                       SUBSTRING(
                           d.adsrc
                           FROM E'^nextval\\(''([^'']*)''(?:::text|::regclass)?\\)'
                       ) AS seq_name
                FROM pg_class c
                JOIN pg_attribute a ON (c.oid = a.attrelid)
                JOIN pg_attrdef d ON (a.attrelid = d.adrelid
                    AND a.attnum = d.adnum)
                JOIN pg_namespace n ON (c.relnamespace = n.oid)
                WHERE has_schema_privilege(n.oid, 'USAGE')
                    AND n.nspname NOT LIKE 'pg!_%%' ESCAPE '!'
                    AND has_table_privilege(c.oid, 'SELECT')
                    AND (NOT a.attisdropped)
                    AND d.adsrc ~ '^nextval'
            ) seq
            GROUP BY seq_name
            HAVING COUNT(*) = 1;
        """)
        statements = res.fetchall()
        for s in statements:
            connection.execute(s[0])
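# As in fix_db_sequences(), every fetched row is a statement to run.
# An illustrative example of what this generates (names are
# hypothetical):
#
#     ALTER SEQUENCE niamoto.plot_id_seq OWNED BY niamoto.plot.id;
#
# Setting the owner first matters: the SETVAL fix-up in
# fix_db_sequences() only finds sequences linked to their column
# through pg_depend.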
def test_get_synonym_key(self):
    self.assertRaises(
        NoRecordFoundError,
        TaxonomyManager.get_synonym_key,
        'Not existing',
    )
    TaxonomyManager.register_synonym_key("test")
    TaxonomyManager.get_synonym_key("test")
    with Connector.get_connection() as connection:
        TaxonomyManager.get_synonym_key("test", bind=connection)
def test_add_single_synonym(self):
    synonym_key = "synonym_key_1"
    TaxonomyManager.register_synonym_key("synonym_key_1")
    data = [
        {
            'id': 0,
            'full_name': 'Family One',
            'rank_name': 'One',
            'rank': niamoto_db_meta.TaxonRankEnum.FAMILIA,
            'parent_id': None,
            'synonyms': {},
            'mptt_left': 0,
            'mptt_right': 0,
            'mptt_tree_id': 0,
            'mptt_depth': 0,
        },
    ]
    ins = niamoto_db_meta.taxon.insert().values(data)
    with Connector.get_connection() as connection:
        connection.execute(ins)
    TaxonomyManager.add_synonym_for_single_taxon(0, synonym_key, 1)
    df1 = TaxonomyManager.get_raw_taxon_dataframe()
    self.assertEqual(df1.loc[0]['synonyms'], {synonym_key: 1})
    TaxonomyManager.add_synonym_for_single_taxon(0, synonym_key, 2)
    df2 = TaxonomyManager.get_raw_taxon_dataframe()
    # Adding a synonym for the same key overwrites the previous value,
    # so only the latest one remains. (The original literal repeated
    # synonym_key twice, which Python collapses to the last entry.)
    self.assertEqual(df2.loc[0]['synonyms'], {synonym_key: 2})
def test_get_current_plot_data(self):
    """
    Test for the get_current_plot_data method:
    test the structure of the returned DataFrame,
    test retrieving an empty DataFrame,
    test retrieving a non-empty DataFrame.
    """
    data_provider_1 = TestDataProvider('test_data_provider_1')
    data_provider_2 = TestDataProvider('test_data_provider_2')
    data_provider_3 = TestDataProvider('test_data_provider_3')
    with Connector.get_connection() as connection:
        pp1 = BasePlotProvider(data_provider_1)
        pp2 = BasePlotProvider(data_provider_2)
        pp3 = BasePlotProvider(data_provider_3)
        # 1. Check the number of records for each provider
        df1 = pp1.get_niamoto_plot_dataframe(connection)
        df2 = pp2.get_niamoto_plot_dataframe(connection)
        df3 = pp3.get_niamoto_plot_dataframe(connection)
        self.assertEqual(len(df1), 4)
        self.assertEqual(len(df2), 2)
        self.assertEqual(len(df3), 0)
        # 2. Check the structure of the DataFrame
        df_cols = list(df1.columns) + [
            df1.index.name,
        ]
        db_cols = niamoto_db_meta.plot.columns
        for db_col in db_cols:
            self.assertIn(db_col.name, df_cols)
def test_sync_insert(self):
    self.tearDownClass()
    self.setUpClass()
    data_provider_3 = TestDataProvider('test_data_provider_3')
    with Connector.get_connection() as connection:
        op3 = BaseOccurrenceProvider(data_provider_3)
        self.assertEqual(
            len(op3.get_niamoto_occurrence_dataframe(connection)), 0)
        occ = pd.DataFrame.from_records([
            {
                'id': 0,
                'taxon_id': None,
                'provider_taxon_id': None,
                'location': from_shape(Point(166.551, -22.039), srid=4326),
                'properties': '{}',
            },
            {
                'id': 1,
                'taxon_id': None,
                'provider_taxon_id': None,
                'location': from_shape(Point(166.551, -22.098), srid=4326),
                'properties': '{}',
            },
        ], index='id')
        i, u, d = op3._sync(occ, connection)
        self.assertEqual(len(i), 2)
        self.assertEqual(len(u), 0)
        self.assertEqual(len(d), 0)
        self.assertEqual(
            len(op3.get_niamoto_occurrence_dataframe(connection)), 2)
def _update_db_id(self):
    with Connector.get_connection() as connection:
        sel = select([
            niamoto_db_meta.data_provider.c.id
        ]).where(niamoto_db_meta.data_provider.c.name == self.name)
        result = connection.execute(sel)
        self._db_id = result.fetchone()['id']
def test_base_fact_table(self):
    dim_1 = TestDimension("dim_1")
    dim_2 = TestDimension("dim_2")
    ft = BaseFactTable(
        "test_fact",
        dimensions=[dim_1, dim_2],
        measure_columns=[
            sa.Column('measure_1', sa.Float),
        ],
        publisher_cls=TestFactTablePublisher,
    )
    dim_1.create_dimension()
    dim_2.create_dimension()
    dim_1.populate_from_publisher()
    dim_2.populate_from_publisher()
    self.assertFalse(ft.is_created())
    ft.create_fact_table()
    self.assertTrue(ft.is_created())
    with Connector.get_connection() as connection:
        inspector = Inspector.from_engine(connection)
        tables = inspector.get_table_names(
            schema=settings.NIAMOTO_FACT_TABLES_SCHEMA)
        self.assertIn("test_fact", tables)
    ft.populate_from_publisher()
    vals = ft.get_values()
    self.assertGreater(len(vals), 0)
    ft.truncate()
    vals_bis = ft.get_values()
    self.assertEqual(len(vals_bis), 0)
def load_data_provider(name, *args, connection=None, **kwargs):
    BaseDataProvider.assert_data_provider_exists(name)
    sel = select([
        data_provider.c.id,
        data_provider.c.name,
        data_provider.c.provider_type_key.label('provider_type'),
    ]).where(data_provider.c.name == name)
    # Look for args that must be set to None
    none_values = [None, 'none', 'None', '0', 'n', 'N']
    nargs = [None if i in none_values else i for i in args]
    close_after = False
    if connection is None:
        close_after = True
        connection = Connector.get_engine().connect()
    r = connection.execute(sel)
    record = r.fetchone()
    name = record.name
    type_key = record.provider_type
    provider = PROVIDER_REGISTRY[type_key]['class'](name, *nargs, **kwargs)
    if close_after:
        connection.close()
    return provider
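# Illustrative usage (the provider name and argument are hypothetical;
# the provider must already be registered in the data_provider table):
#
#     provider = load_data_provider('my_provider', 'some_arg')
#
# Note that positional string arguments matching one of the
# none_values sentinels ('none', 'None', '0', 'n', 'N') are coerced to
# None before being handed to the provider constructor.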
def get_synonyms_for_key(cls, synonym_key):
    """
    :param synonym_key: The synonym key to consider. If synonym key is
        'niamoto', return the niamoto id's (identity synonym).
    :return: A Series with index corresponding to the data provider's
        taxa ids, and values corresponding to their synonym in
        Niamoto's referential.
    """
    with Connector.get_connection() as connection:
        niamoto_id_col = meta.taxon.c.id
        synonym_col = meta.taxon.c.synonyms
        if synonym_key == cls.IDENTITY_SYNONYM_KEY:
            sel = select([
                niamoto_id_col.label("niamoto_taxon_id"),
                niamoto_id_col.label("provider_taxon_id"),
            ])
        else:
            sel = select([
                niamoto_id_col.label("niamoto_taxon_id"),
                synonym_col[synonym_key].label("provider_taxon_id"),
            ]).where(synonym_col[synonym_key].isnot(None))
        synonyms = pd.read_sql(
            sel,
            connection,
            index_col="provider_taxon_id",
        )["niamoto_taxon_id"]
        return synonyms[synonyms.index.notnull()]
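# Illustrative shape of the returned Series (values are made up):
#
#     provider_taxon_id
#     101     1
#     102     7
#     205    42
#     Name: niamoto_taxon_id, dtype: int64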
def test_add_vector(self):
    # Test non-existing vector
    null_path = os.path.join(NIAMOTO_HOME, "NULL.shp")
    self.assertRaises(
        FileNotFoundError,
        VectorManager.add_vector,
        "null_vector",
        null_path,
    )
    VectorManager.add_vector("ncl_adm1", SHP_TEST)
    self.assertRaises(
        RecordAlreadyExistsError,
        VectorManager.add_vector,
        "ncl_adm1",
        SHP_TEST,
    )
    df = VectorManager.get_vector_list()
    self.assertEqual(len(df), 1)
    self.assertEqual(df['name'].iloc[0], 'ncl_adm1')
    engine = Connector.get_engine()
    inspector = Inspector.from_engine(engine)
    self.assertIn(
        'ncl_adm1',
        inspector.get_table_names(schema=settings.NIAMOTO_VECTOR_SCHEMA),
    )
def test_update_raster(self):
    # Add raster
    test_raster = os.path.join(
        NIAMOTO_HOME, "data", "raster", "rainfall_wgs84.tif"
    )
    RasterManager.add_raster(
        "rainfall",
        test_raster,
        tile_dimension=(200, 200),
    )
    # Update raster
    RasterManager.update_raster(
        "rainfall",
        test_raster,
        new_name="rainfall_new",
        tile_dimension=(100, 100),
    )
    df = RasterManager.get_raster_list()
    engine = Connector.get_engine()
    inspector = Inspector.from_engine(engine)
    self.assertIn(
        'rainfall_new',
        inspector.get_table_names(schema=settings.NIAMOTO_RASTER_SCHEMA),
    )
    self.assertNotIn(
        'rainfall',
        inspector.get_table_names(schema=settings.NIAMOTO_RASTER_SCHEMA),
    )
    # Update raster, only properties
    RasterManager.update_raster("rainfall_new", properties={'test': 10})
def assert_raster_does_not_exist(cls, name):
    sel = cls.REGISTRY_TABLE.select().where(
        cls.REGISTRY_TABLE.c.name == name
    )
    with Connector.get_connection() as connection:
        r = connection.execute(sel).rowcount
        if r > 0:
            m = "The raster '{}' already exists in database."
            raise RecordAlreadyExistsError(m.format(name))
def get_synonym_keys():
    """
    :return: A DataFrame containing all the registered synonym keys.
    """
    sel = select([niamoto_db_meta.synonym_key_registry])
    with Connector.get_connection() as connection:
        df = pd.read_sql(sel, connection, index_col='id')
    return df
def assert_vector_does_not_exist(name):
    sel = meta.vector_registry.select().where(
        meta.vector_registry.c.name == name
    )
    with Connector.get_connection() as connection:
        r = connection.execute(sel).rowcount
        if r > 0:
            m = "The vector '{}' already exists in database."
            raise RecordAlreadyExistsError(m.format(name))
def tearDown(self):
    with Connector.get_connection() as connection:
        inspector = Inspector.from_engine(connection)
        tables = inspector.get_table_names(
            schema=settings.NIAMOTO_FACT_TABLES_SCHEMA)
        for tb in tables:
            connection.execute("DROP TABLE {};".format("{}.{}".format(
                settings.NIAMOTO_FACT_TABLES_SCHEMA, tb)))
        connection.execute(meta.fact_table_registry.delete())
    with Connector.get_connection() as connection:
        inspector = Inspector.from_engine(connection)
        tables = inspector.get_table_names(
            schema=settings.NIAMOTO_DIMENSIONS_SCHEMA)
        for tb in tables:
            connection.execute("DROP TABLE {};".format("{}.{}".format(
                settings.NIAMOTO_DIMENSIONS_SCHEMA, tb)))
        connection.execute(meta.dimension_registry.delete())
def create_model(self):
    """
    Create the dimension tables and the fact tables.
    """
    with Connector.get_connection() as connection:
        for k, v in self.dimensions.items():
            v.create_dimension(connection=connection)
        for k, v in self.fact_tables.items():
            v.create_fact_table(connection=connection)