def _process(self, *args, properties=None, drop_null_properties=False,
             **kwargs):
    """
    Return the occurrence dataframe.
    :param properties: List of properties to retain. Can be a python list
        or a comma (',') separated string.
    :param drop_null_properties: If True, drop the rows having a null
        value for at least one of the retained properties.
    """
    with Connector.get_connection() as connection:
        sel_keys = select([
            func.jsonb_object_keys(
                meta.occurrence.c.properties
            ).distinct(),
        ])
        if properties is None:
            keys = [i[0] for i in connection.execute(sel_keys).fetchall()]
        else:
            if isinstance(properties, str):
                properties = properties.split(',')
            keys = properties
        props = [meta.occurrence.c.properties[k].label(k) for k in keys]
        sel = select([
            meta.occurrence.c.id.label('id'),
            meta.occurrence.c.taxon_id.label('taxon_id'),
            cast(meta.taxon.c.rank, String).label('rank'),
            meta.taxon.c.full_name.label('full_name'),
            func.st_x(meta.occurrence.c.location).label('x'),
            func.st_y(meta.occurrence.c.location).label('y'),
        ] + props).select_from(
            meta.occurrence.outerjoin(
                meta.taxon,
                meta.taxon.c.id == meta.occurrence.c.taxon_id
            )
        )
        df = pd.read_sql(sel, connection, index_col='id')
        df['taxon_id'] = df['taxon_id'].apply(pd.to_numeric)
        # Replace None values with nan ('np' is assumed to be numpy,
        # imported at module level; the pd.np alias was removed in
        # pandas 1.0).
        df.fillna(value=np.nan, inplace=True)
        if drop_null_properties:
            for k in keys:
                df = df[df[k].notnull()]
        return df, [], {'index_label': 'id'}
def update_data_provider(cls, current_name, *args, new_name=None,
                         properties=None, synonym_key=None,
                         return_object=True, **kwargs):
    # Avoid a mutable default argument: the default is None and replaced
    # with a fresh dict at call time.
    if properties is None:
        properties = {}
    if new_name is None:
        new_name = current_name
    m = "DataProvider(current_name='{}', new_name='{}', type_name='{}', " \
        "properties='{}', synonym_key='{}'): updating data provider..."
    LOGGER.debug(
        m.format(current_name, new_name, cls.get_type_name(), properties,
                 synonym_key))
    with Connector.get_connection() as connection:
        cls.assert_data_provider_exists(current_name, bind=connection)
        synonym_key_id = None
        if synonym_key is not None:
            synonym_key_id = TaxonomyManager.get_synonym_key(
                synonym_key, bind=connection)['id']
        upd = niamoto_db_meta.data_provider.update().values({
            'name': new_name,
            'properties': properties,
            'synonym_key_id': synonym_key_id,
            'date_update': datetime.now(),
        }).where(niamoto_db_meta.data_provider.c.name == current_name)
        connection.execute(upd)
    m = "DataProvider(current_name='{}', new_name='{}', type_name='{}', " \
        "properties='{}', synonym_key='{}'): data provider successfully " \
        "updated!"
    LOGGER.debug(
        m.format(current_name, new_name, cls.get_type_name(), properties,
                 synonym_key))
    if return_object:
        return cls(new_name, *args, **kwargs)
def test_duplicate_synonym(self):
    synonym_key = "synonym_key_1"
    TaxonomyManager.register_synonym_key(synonym_key)
    data = [
        {
            'id': 0,
            'full_name': 'Family One',
            'rank_name': 'One',
            'rank': niamoto_db_meta.TaxonRankEnum.FAMILIA,
            'parent_id': None,
            'synonyms': {},
            'mptt_left': 0,
            'mptt_right': 0,
            'mptt_tree_id': 0,
            'mptt_depth': 0,
        },
        {
            'id': 1,
            'full_name': 'Family Two',
            'rank_name': 'Two',
            'rank': niamoto_db_meta.TaxonRankEnum.FAMILIA,
            'parent_id': None,
            'synonyms': {},
            'mptt_left': 0,
            'mptt_right': 0,
            'mptt_tree_id': 0,
            'mptt_depth': 0,
        },
    ]
    ins = niamoto_db_meta.taxon.insert().values(data)
    with Connector.get_connection() as connection:
        connection.execute(ins)
    TaxonomyManager.add_synonym_for_single_taxon(0, synonym_key, 1)
    self.assertRaises(
        IntegrityError,
        TaxonomyManager.add_synonym_for_single_taxon,
        1, synonym_key, 1,
    )
def test_get_synonyms_map(self):
    synonym_key = "synonym_key_1"
    TaxonomyManager.register_synonym_key(synonym_key)
    data = [
        {
            'id': 0,
            'full_name': 'Family One',
            'rank_name': 'One',
            'rank': niamoto_db_meta.TaxonRankEnum.FAMILIA,
            'parent_id': None,
            'synonyms': {
                synonym_key: 10,
            },
            'mptt_left': 0,
            'mptt_right': 0,
            'mptt_tree_id': 0,
            'mptt_depth': 0,
        },
        {
            'id': 1,
            'full_name': 'Family Two',
            'rank_name': 'Two',
            'rank': niamoto_db_meta.TaxonRankEnum.FAMILIA,
            'parent_id': None,
            'synonyms': {
                synonym_key: 20,
            },
            'mptt_left': 0,
            'mptt_right': 0,
            'mptt_tree_id': 0,
            'mptt_depth': 0,
        },
    ]
    ins = niamoto_db_meta.taxon.insert().values(data)
    with Connector.get_connection() as connection:
        connection.execute(ins)
    synonyms = TaxonomyManager.get_synonyms_for_key(synonym_key)
    self.assertEqual(synonyms.loc[10], 0)
    self.assertEqual(synonyms.loc[20], 1)
def setUpClass(cls):
    super(TestBasePlotOccurrenceProvider, cls).setUpClass()
    data_provider_1 = TestDataProvider.register_data_provider(
        'test_data_provider_1')
    data_provider_2 = TestDataProvider.register_data_provider(
        'test_data_provider_2')
    TestDataProvider.register_data_provider('test_data_provider_3')
    plot_1 = test_data.get_plot_data_1(data_provider_1)
    plot_2 = test_data.get_plot_data_2(data_provider_2)
    occ_1 = test_data.get_occurrence_data_1(data_provider_1)
    occ_2 = test_data.get_occurrence_data_2(data_provider_2)
    plot_occ_1 = test_data.get_plot_occurrence_data_1(data_provider_1)
    plot_occ_2 = test_data.get_plot_occurrence_data_2(data_provider_2)
    ins_1 = niamoto_db_meta.plot.insert().values(plot_1 + plot_2)
    ins_2 = niamoto_db_meta.occurrence.insert().values(occ_1 + occ_2)
    ins_3 = niamoto_db_meta.plot_occurrence.insert().values(
        plot_occ_1 + plot_occ_2)
    with Connector.get_connection() as connection:
        connection.execute(ins_1)
        connection.execute(ins_2)
        connection.execute(ins_3)
    fix_db_sequences()
def register_synonym_key(cls, synonym_key, bind=None):
    """
    Register a synonym key in database.
    :param synonym_key: The synonym key to register.
    :param bind: If passed, use an existing engine or connection.
    """
    now = datetime.now()
    ins = meta.synonym_key_registry.insert({
        'name': synonym_key,
        'date_create': now,
    })
    if bind is not None:
        cls.assert_synonym_key_does_not_exists(synonym_key, bind=bind)
        bind.execute(ins)
        cls._register_unique_synonym_key_constraint(synonym_key, bind=bind)
        return
    with Connector.get_connection() as connection:
        cls.assert_synonym_key_does_not_exists(synonym_key, bind=connection)
        connection.execute(ins)
        cls._register_unique_synonym_key_constraint(
            synonym_key, bind=connection)
    LOGGER.debug("synonym_key {} registered.".format(synonym_key))
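# Usage sketch for register_synonym_key (illustrative, not part of the
# original module; the key name 'taxref' is a made-up example). Passing an
# existing connection through `bind` lets the registration participate in
# the caller's transaction:
def example_register_synonym_key():
    with Connector.get_connection() as connection:
        with connection.begin():
            TaxonomyManager.register_synonym_key('taxref', bind=connection)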
def register_data_provider(cls, name, *args, properties=None,
                           synonym_key=None, return_object=True, **kwargs):
    # Avoid a mutable default argument for properties.
    if properties is None:
        properties = {}
    m = "DataProvider(name='{}', type_name='{}', properties='{}', " \
        "synonym_key='{}'): registering data provider..."
    LOGGER.debug(
        m.format(name, cls.get_type_name(), properties, synonym_key))
    with Connector.get_connection() as connection:
        cls.assert_data_provider_does_not_exist(name, bind=connection)
        synonym_key_id = None
        if synonym_key is not None:
            synonym_key_id = TaxonomyManager.get_synonym_key(
                synonym_key, bind=connection)['id']
        ins = niamoto_db_meta.data_provider.insert({
            'name': name,
            'provider_type_key': cls.get_type_name(),
            'properties': properties,
            'synonym_key_id': synonym_key_id,
            'date_create': datetime.now(),
        })
        connection.execute(ins)
    m = "DataProvider(name='{}', type_name='{}', properties='{}', " \
        "synonym_key='{}'): data provider successfully registered!"
    LOGGER.debug(
        m.format(name, cls.get_type_name(), properties, synonym_key))
    if return_object:
        return cls(name, *args, **kwargs)
def test_get_dataframe_and_sync(self):
    pt_provider = PlantnoteDataProvider(
        'pl@ntnote_provider',
        self.TEST_DB_PATH,
    )
    with Connector.get_connection() as connection:
        prov = pt_provider.plot_occurrence_provider
        df1 = prov.get_niamoto_plot_occurrence_dataframe(connection)
        self.assertEqual(len(df1), 0)
        # Sync plots and occurrences
        plot_prov = pt_provider.plot_provider
        occ_prov = pt_provider.occurrence_provider
        plot_prov.sync(connection)
        occ_prov.sync(connection)
        # Sync plot-occurrences
        df2 = prov.get_provider_plot_occurrence_dataframe()
        self.assertIn('occurrence_identifier', df2.columns)
        prov.sync(connection)
        df3 = prov.get_niamoto_plot_occurrence_dataframe(connection)
        self.assertEqual(len(df2), len(df3))
def get_dimension(cls, dimension_name):
    """
    Load a registered dimension.
    :param dimension_name: The name of the dimension to load.
    :return: The loaded dimension.
    """
    sel = sa.select([
        meta.dimension_registry.c.dimension_type_key,
        meta.dimension_registry.c.label_column,
        meta.dimension_registry.c.properties,
    ]).where(meta.dimension_registry.c.name == dimension_name)
    with Connector.get_connection() as connection:
        cls.assert_dimension_is_registered(dimension_name, connection)
        result = connection.execute(sel).fetchone()
    dim_type, label_column, properties = result
    column_labels = properties.get('column_labels', {})
    return DIMENSION_TYPE_REGISTRY[dim_type]['class'].load(
        dimension_name,
        label_col=label_column,
        properties=properties,
        column_labels=column_labels,
    )
def extract_raster_values_to_plots(cls, raster_name):
    with Connector.get_connection() as connection:
        with connection.begin():
            RasterManager.assert_raster_exists(
                raster_name, connection=connection)
            m = "Extracting '{}' raster values to plots properties."
            LOGGER.debug(m.format(raster_name))
            sql = \
                """
                WITH {raster_name} AS (
                    SELECT plot.id AS id,
                        ST_Value(raster.rast, plot.location) AS rast_value
                    FROM {plot_table} AS plot
                    LEFT JOIN {raster_table} AS raster
                        ON ST_Intersects(raster.rast, plot.location)
                )
                UPDATE {plot_table}
                SET properties = (
                    {plot_table}.properties ||
                    jsonb_build_object(
                        '{prefix}{raster_name}',
                        {raster_name}.rast_value
                    )
                )
                FROM {raster_name}
                WHERE {raster_name}.id = {plot_table}.id
                """.format(**{
                    'raster_name': raster_name,
                    'raster_table': '{}.{}'.format(
                        settings.NIAMOTO_RASTER_SCHEMA, raster_name
                    ),
                    'plot_table': '{}.{}'.format(
                        settings.NIAMOTO_SCHEMA, meta.plot.name
                    ),
                    'prefix': RASTER_PROPERTY_PREFIX,
                })
            connection.execute(sql)
def tearDownClass(cls):
    engine = Connector.get_engine()
    meta.metadata.drop_all(engine)
    with Connector.get_connection() as connection:
        inspector = Inspector.from_engine(connection)
        # Drop the vector, raster, fact-table, dimension and SDM tables,
        # schema by schema.
        schemas = [
            settings.NIAMOTO_VECTOR_SCHEMA,
            settings.NIAMOTO_RASTER_SCHEMA,
            settings.NIAMOTO_FACT_TABLES_SCHEMA,
            settings.NIAMOTO_DIMENSIONS_SCHEMA,
            settings.NIAMOTO_SSDM_SCHEMA,
        ]
        for schema in schemas:
            for tb in inspector.get_table_names(schema=schema):
                connection.execute(
                    "DROP TABLE IF EXISTS {}.{} CASCADE;".format(schema, tb)
                )
    super(BaseTestNiamotoSchemaCreated, cls).tearDownClass()
def get_geometry_column(cls, vector_name):
    """
    Find the geometry column of a vector, inspecting the geometry_columns
    table. Assume that there is a single geometry column; if several
    exist, return the first one.
    :return: The geometry column name, type and srid (name, type, srid).
    """
    with Connector.get_connection() as connection:
        cls.assert_vector_exists(vector_name, connection)
        sql = \
            """
            SELECT f_geometry_column, type, srid
            FROM public.geometry_columns
            WHERE f_table_schema = '{}'
                AND f_table_name = '{}'
            LIMIT 1;
            """.format(
                settings.NIAMOTO_VECTOR_SCHEMA,
                vector_name
            )
        result = connection.execute(sql)
        return result.fetchone()
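# Usage sketch for get_geometry_column (illustrative only; 'provinces' is a
# made-up vector name). The result unpacks as (column name, geometry type,
# srid):
def example_get_geometry_column():
    geom_name, geom_type, srid = VectorManager.get_geometry_column(
        'provinces')
    return 'EPSG:{}'.format(srid)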
def test_get_insert_dataframe(self):
    data_provider_1 = TestDataProvider('test_data_provider_1')
    with Connector.get_connection() as connection:
        prov1 = BasePlotOccurrenceProvider(data_provider_1)
        df1 = prov1.get_niamoto_plot_occurrence_dataframe(connection)
        # 1. Nothing to insert
        data_1 = pd.DataFrame.from_records([
            {
                'plot_id': 1,
                'occurrence_id': 1,
                'occurrence_identifier': 'PLOT2_001',
            },
        ], index=['plot_id', 'occurrence_id'])
        reindexed_data_1 = prov1.get_reindexed_provider_dataframe(data_1)
        ins = prov1.get_insert_dataframe(df1, reindexed_data_1)
        self.assertEqual(len(ins), 0)
        # 2. Everything to insert
        data_2 = pd.DataFrame.from_records([
            {
                'plot_id': 0,
                'occurrence_id': 1,
                'occurrence_identifier': 'PLOT1_002',
            },
            {
                'plot_id': 0,
                'occurrence_id': 2,
                'occurrence_identifier': 'PLOT1_003',
            },
            {
                'plot_id': 0,
                'occurrence_id': 5,
                'occurrence_identifier': 'PLOT1_003',
            },
        ], index=['plot_id', 'occurrence_id'])
        reindexed_data_2 = prov1.get_reindexed_provider_dataframe(data_2)
        ins = prov1.get_insert_dataframe(df1, reindexed_data_2)
        self.assertEqual(len(ins), 3)
        # 3. Partial insert
        data_3 = pd.DataFrame.from_records([
            {
                'plot_id': 0,
                'occurrence_id': 0,
                'occurrence_identifier': 'PLOT1_001',
            },
            {
                'plot_id': 0,
                'occurrence_id': 2,
                'occurrence_identifier': 'PLOT1_003',
            },
            {
                'plot_id': 0,
                'occurrence_id': 5,
                'occurrence_identifier': 'PLOT1_003',
            },
        ], index=['plot_id', 'occurrence_id'])
        reindexed_data_3 = prov1.get_reindexed_provider_dataframe(data_3)
        ins = prov1.get_insert_dataframe(df1, reindexed_data_3)
        self.assertEqual(len(ins), 2)
        self.assertEqual(list(ins['provider_occurrence_pk']), [2, 5])
def update_raster(cls, name, raster_file_path=None, new_name=None,
                  tile_dimension=None, register=False, properties=None):
    """
    Update an existing raster in database and in the Niamoto raster
    registry. Uses the raster2pgsql command. The raster is cut in tiles,
    using the dimension tile_width x tile_height. All rasters are stored
    in the settings.NIAMOTO_RASTER_SCHEMA schema.
    c.f. https://postgis.net/docs/using_raster_dataman.html#RT_Raster_Loader
    for more details on raster2pgsql.
    :param name: The name of the raster.
    :param raster_file_path: The path to the raster file. If None, the
        raster data won't be updated.
    :param new_name: The new name of the raster (not changed if None).
    :param tile_dimension: The tile dimension (width, height). If None,
        the tile dimension will be chosen automatically by PostGIS.
    :param register: Register the raster as a filesystem (out-db) raster
        (-R option of raster2pgsql).
    :param properties: A dict of arbitrary properties.
    """
    cls.assert_raster_exists(name)
    if new_name is None:
        new_name = name
    else:
        cls.assert_raster_does_not_exist(new_name)
    # Only reimport the raster data when a file path is given, as stated
    # in the docstring.
    if raster_file_path is not None:
        if not os.path.exists(raster_file_path):
            raise FileNotFoundError(
                "The raster {} does not exist".format(raster_file_path))
        if tile_dimension is not None:
            dim = "{}x{}".format(tile_dimension[0], tile_dimension[1])
        else:
            dim = 'auto'
        os.environ["PGPASSWORD"] = settings.NIAMOTO_DATABASE["PASSWORD"]
        # '-d' drops and recreates the existing table; when renaming, the
        # new table is created with '-c' and the old one dropped below.
        d = "-d"
        if new_name != name:
            d = "-c"
        tb = "{}.{}".format(cls.DB_SCHEMA, new_name)
        raster2pgsql_args = [
            "raster2pgsql", d, "-C", "-Y",
            '-t', dim, '-I', '-M',
            raster_file_path,
            tb,
        ]
        if register:
            raster2pgsql_args.append('-R')
        p1 = subprocess.Popen(raster2pgsql_args, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        with open(LOG_FILE, mode='a') as log_file:
            p2 = subprocess.call([
                "psql", "-q",
                "-U", settings.NIAMOTO_DATABASE["USER"],
                "-h", settings.NIAMOTO_DATABASE["HOST"],
                "-p", settings.NIAMOTO_DATABASE["PORT"],
                "-d", settings.NIAMOTO_DATABASE["NAME"],
                "-w",
            ], stdin=p1.stdout, stdout=log_file, stderr=log_file)
            stdout, stderr = p1.communicate()
            if stderr:
                LOGGER.debug(stderr)
        os.environ["PGPASSWORD"] = ""
        if p2 != 0 or p1.returncode != 0:
            raise RuntimeError("raster import failed.")
    upd_values = {
        'name': new_name,
        'date_update': datetime.now(),
    }
    if properties is not None:
        upd_values['properties'] = properties
    upd = cls.REGISTRY_TABLE.update() \
        .values(upd_values) \
        .where(cls.REGISTRY_TABLE.c.name == name)
    with Connector.get_connection() as connection:
        connection.execute(upd)
        if new_name != name:
            if raster_file_path is not None:
                # The data was reimported under the new name: drop the
                # table holding the old one.
                connection.execute("DROP TABLE IF EXISTS {};".format(
                    "{}.{}".format(cls.DB_SCHEMA, name)))
            else:
                # No data reimport: rename the existing table (PostgreSQL
                # requires an unqualified name after RENAME TO).
                connection.execute("ALTER TABLE {}.{} RENAME TO {};".format(
                    cls.DB_SCHEMA, name, new_name))
def test_get_update_dataframe(self):
    data_provider_1 = TestDataProvider('test_data_provider_1')
    with Connector.get_connection() as connection:
        prov1 = BasePlotOccurrenceProvider(data_provider_1)
        df1 = prov1.get_niamoto_plot_occurrence_dataframe(connection)
        # 1. Nothing to update
        data_1 = pd.DataFrame.from_records([
            {
                'plot_id': 0,
                'occurrence_id': 1,
                'occurrence_identifier': 'PLOT1_002',
            },
        ], index=['plot_id', 'occurrence_id'])
        reindexed_data_1 = prov1.get_reindexed_provider_dataframe(data_1)
        upd = prov1.get_update_dataframe(df1, reindexed_data_1)
        self.assertEqual(len(upd), 0)
        # 2. Everything to update
        data_2 = pd.DataFrame.from_records([
            {
                'plot_id': 0,
                'occurrence_id': 0,
                'occurrence_identifier': 'PLOT1_002',
            },
            {
                'plot_id': 1,
                'occurrence_id': 2,
                'occurrence_identifier': 'PLOT1_003',
            },
            {
                'plot_id': 2,
                'occurrence_id': 5,
                'occurrence_identifier': 'PLOT1_003',
            },
        ], index=['plot_id', 'occurrence_id'])
        reindexed_data_2 = prov1.get_reindexed_provider_dataframe(data_2)
        upd = prov1.get_update_dataframe(df1, reindexed_data_2)
        self.assertEqual(len(upd), 3)
        # 3. Partial update
        data_3 = pd.DataFrame.from_records([
            {
                'plot_id': 0,
                'occurrence_id': 0,
                'occurrence_identifier': 'PLOT1_002',
            },
            {
                'plot_id': 1,
                'occurrence_id': 2,
                'occurrence_identifier': 'PLOT1_003',
            },
            {
                'plot_id': 1,
                'occurrence_id': 5,
                'occurrence_identifier': 'PLOT1_003',
            },
        ], index=['plot_id', 'occurrence_id'])
        reindexed_data_3 = prov1.get_reindexed_provider_dataframe(data_3)
        upd = prov1.get_update_dataframe(df1, reindexed_data_3)
        self.assertEqual(len(upd), 2)
        updated_ids = sorted(upd['occurrence_id'])
        self.assertEqual(updated_ids, [0, 2])
def add_raster(cls, name, raster_file_path, tile_dimension=None,
               register=False, properties=None, **kwargs):
    """
    Add a raster in database and register it in the Niamoto raster
    registry. Uses the raster2pgsql command. The raster is cut in tiles,
    using the dimension tile_width x tile_height. All rasters are stored
    in the settings.NIAMOTO_RASTER_SCHEMA schema.
    c.f. https://postgis.net/docs/using_raster_dataman.html#RT_Raster_Loader
    for more details on raster2pgsql.
    :param name: The name of the raster.
    :param raster_file_path: The path to the raster file.
    :param tile_dimension: The tile dimension (width, height). If None,
        the tile dimension will be chosen automatically by PostGIS.
    :param register: Register the raster as a filesystem (out-db) raster
        (-R option of raster2pgsql).
    :param properties: A dict of arbitrary properties.
    """
    # Avoid a mutable default argument for properties.
    if properties is None:
        properties = {}
    if not os.path.exists(raster_file_path):
        raise FileNotFoundError(
            "The raster {} does not exist".format(raster_file_path))
    cls.assert_raster_does_not_exist(name)
    cls.assert_raster_schema_exists()
    if tile_dimension is not None:
        dim = "{}x{}".format(tile_dimension[0], tile_dimension[1])
    else:
        dim = 'auto'
    tb = "{}.{}".format(cls.DB_SCHEMA, name)
    os.environ["PGPASSWORD"] = settings.NIAMOTO_DATABASE["PASSWORD"]
    raster2pgsql_args = [
        "raster2pgsql", "-c", "-Y", "-C",
        '-t', dim, '-I', '-M',
        raster_file_path,
        tb,
    ]
    if register:
        raster2pgsql_args.append('-R')
    p1 = subprocess.Popen(raster2pgsql_args, stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
    with open(LOG_FILE, mode='a') as log_file:
        p2 = subprocess.call([
            "psql", "-q",
            "-U", settings.NIAMOTO_DATABASE["USER"],
            "-h", settings.NIAMOTO_DATABASE["HOST"],
            "-p", settings.NIAMOTO_DATABASE["PORT"],
            "-d", settings.NIAMOTO_DATABASE["NAME"],
            "-w",
        ], stdin=p1.stdout, stdout=log_file, stderr=log_file)
        stdout, stderr = p1.communicate()
        if stderr:
            LOGGER.debug(stderr)
        if stdout:
            LOGGER.debug(stdout)
    os.environ["PGPASSWORD"] = ""
    if p2 != 0 or p1.returncode != 0:
        raise RuntimeError(
            "raster import failed, check the logs for more details.")
    values = {
        'name': name,
        'date_create': datetime.now(),
        'properties': properties,
    }
    values.update(kwargs)
    ins = cls.REGISTRY_TABLE.insert().values(values)
    with Connector.get_connection() as connection:
        connection.execute(ins)
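# Usage sketch for add_raster (illustrative only; the raster name, file
# path and properties are made-up examples). The raster is cut into
# explicit 100x100 tiles instead of letting PostGIS choose, and stored as
# a regular in-db raster:
def example_add_raster():
    RasterManager.add_raster(
        'rainfall',                    # hypothetical raster name
        '/data/rasters/rainfall.tif',  # hypothetical file path
        tile_dimension=(100, 100),
        properties={'unit': 'mm/year'},
    )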
def update_vector(cls, name, vector_file_path=None, new_name=None,
                  properties=None):
    """
    Update an existing vector in database and in the Niamoto vector
    registry. Uses ogr2ogr. All vectors are stored in the
    settings.NIAMOTO_VECTOR_SCHEMA schema.
    :param name: The name of the vector.
    :param vector_file_path: The path to the vector file. If None, the
        vector data won't be updated.
    :param new_name: The new name of the vector (not changed if None).
    :param properties: A dict of arbitrary properties.
    """
    LOGGER.debug("VectorManager.update_vector({}, {}, new_name={})".format(
        name, vector_file_path, new_name))
    cls.assert_vector_exists(name)
    if new_name is None:
        new_name = name
    else:
        cls.assert_vector_does_not_exist(new_name)
    # Only reimport the vector data when a file path is given, as stated
    # in the docstring.
    if vector_file_path is not None:
        if not os.path.exists(vector_file_path):
            raise FileNotFoundError(
                "The vector {} does not exist".format(vector_file_path))
        with Connector.get_connection() as connection:
            connection.execute("DROP TABLE IF EXISTS {};".format(
                "{}.{}".format(settings.NIAMOTO_VECTOR_SCHEMA, name)))
        tb = "{}.{}".format(settings.NIAMOTO_VECTOR_SCHEMA, new_name)
        pg_str = "dbname='{}' host='{}' port='{}' user='{}' password='{}'"
        pg_str = pg_str.format(
            settings.NIAMOTO_DATABASE["NAME"],
            settings.NIAMOTO_DATABASE["HOST"],
            settings.NIAMOTO_DATABASE["PORT"],
            settings.NIAMOTO_DATABASE["USER"],
            settings.NIAMOTO_DATABASE["PASSWORD"],
        )
        p = subprocess.Popen([
            "ogr2ogr",
            "-overwrite",
            "-lco", "SCHEMA={}".format(settings.NIAMOTO_VECTOR_SCHEMA),
            "-lco", "OVERWRITE=YES",
            '-f', 'PostgreSQL',
            'PG:{}'.format(pg_str),
            '-nln', tb,
            '-nlt', 'PROMOTE_TO_MULTI',
            vector_file_path,
        ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if stdout:
            LOGGER.debug(str(stdout))
        if stderr:
            LOGGER.debug(str(stderr))
        if p.returncode != 0:
            raise RuntimeError("vector import failed.")
    upd_values = {'name': new_name, 'date_update': datetime.now()}
    if properties is not None:
        upd_values['properties'] = properties
    upd = meta.vector_registry.update() \
        .values(upd_values) \
        .where(meta.vector_registry.c.name == name)
    with Connector.get_connection() as connection:
        # Update the registry entry (name, update date and properties).
        connection.execute(upd)
        if new_name != name and vector_file_path is None:
            # The data was not reimported under the new name: rename the
            # table so it stays coherent with the registry.
            connection.execute("ALTER TABLE {}.{} RENAME TO {};".format(
                settings.NIAMOTO_VECTOR_SCHEMA, name, new_name))
def tearDown(self):
    del1 = niamoto_db_meta.data_provider.delete()
    with Connector.get_connection() as connection:
        connection.execute(del1)
def sync(self, insert=True, update=True, delete=True,
         sync_occurrence=True, sync_plot=True, sync_plot_occurrence=True):
    """
    Sync Niamoto database with providers data.
    :param insert: If False, skip the insert operation.
    :param update: If False, skip the update operation.
    :param delete: If False, skip the delete operation.
    :param sync_occurrence: If False, skip the occurrence sync.
    :param sync_plot: If False, skip the plot sync.
    :param sync_plot_occurrence: If False, skip the plot-occurrence sync.
    :return: A dict containing the insert / update / delete dataframes for
        each specialized provider:
        {
            'occurrence': {
                'insert': insert_df,
                'update': update_df,
                'delete': delete_df,
            },
            'plot': {...},
            'plot_occurrence': {...},
        }
    """
    t = time.time()
    LOGGER.debug("\r" + "-" * 80)
    LOGGER.info("*** Data sync starting ('{}' - {})...".format(
        self.name, self.get_type_name()))
    with Connector.get_connection() as connection:
        with connection.begin():
            i1, u1, d1 = self.occurrence_provider.sync(
                connection, insert=insert, update=update, delete=delete,
            ) if sync_occurrence else ([], [], [])
            i2, u2, d2 = self.plot_provider.sync(
                connection, insert=insert, update=update, delete=delete,
            ) if sync_plot else ([], [], [])
        with connection.begin():
            i3, u3, d3 = self.plot_occurrence_provider.sync(
                connection, insert=insert, update=update, delete=delete,
            ) if sync_plot_occurrence else ([], [], [])
            upd = niamoto_db_meta.data_provider.update().values({
                'last_sync': datetime.now(),
            }).where(niamoto_db_meta.data_provider.c.name == self.name)
            connection.execute(upd)
    m = "*** Data sync with '{}' done (total time: {:.2f} s)!"
    LOGGER.info(m.format(self.name, time.time() - t))
    LOGGER.debug("\r" + "-" * 80)
    return {
        'occurrence': {'insert': i1, 'update': u1, 'delete': d1},
        'plot': {'insert': i2, 'update': u2, 'delete': d2},
        'plot_occurrence': {'insert': i3, 'update': u3, 'delete': d3},
    }
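# Usage sketch for sync (illustrative only; the provider name and database
# path are made-up examples). Occurrences and plots are synced, the
# plot-occurrence step is skipped, and the per-entity dataframes are
# inspected afterwards:
def example_sync():
    provider = PlantnoteDataProvider('my_provider', '/data/plantnote.sqlite')
    result = provider.sync(sync_plot_occurrence=False)
    inserted_occurrences = result['occurrence']['insert']
    return len(inserted_occurrences)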
def get_vector_geo_dataframe(cls, vector_name, geojson_filter=None,
                             geojson_cut=False):
    """
    Return a registered vector as a GeoDataFrame.
    :param vector_name: The name of the vector.
    :param geojson_filter: Optional; if specified (as str), only the
        features intersecting with the geojson will be returned.
    :param geojson_cut: If True, return the intersection with the geojson
        filter (cut the intersecting features).
    :return: A GeoDataFrame corresponding to the vector.
    """
    geom_col = cls.get_geometry_column(vector_name)
    pk_cols = cls.get_vector_primary_key_columns(vector_name)
    cols = cls.get_vector_sqlalchemy_table(vector_name).columns
    cols_str = [c.name for c in cols]
    where_statement = ''
    if geojson_filter is not None:
        geojson_postgis = \
            """
            ST_Transform(
                ST_SetSRID(
                    ST_MakeValid(ST_GeomFromGeoJSON('{}')),
                    4326
                ),
                {}
            )
            """.format(geojson_filter, geom_col[2])
        where_statement = \
            """
            WHERE ST_Intersects({}, {})
            """.format(geom_col[0], geojson_postgis)
        if geojson_cut:
            cols_str = []
            for col in cols:
                if col.name == geom_col[0]:
                    cols_str.append("ST_Intersection({}, {}) AS {}".format(
                        geom_col[0],
                        geojson_postgis,
                        col.name,
                    ))
                else:
                    cols_str.append(col.name)
    with Connector.get_connection() as connection:
        sql = "SELECT {} FROM {}.{} {};".format(
            ','.join(cols_str),
            settings.NIAMOTO_VECTOR_SCHEMA,
            vector_name,
            where_statement,
        )
        return gpd.read_postgis(
            sql,
            connection,
            index_col=[i[0] for i in pk_cols],
            geom_col=geom_col[0],
            crs='+init=epsg:{}'.format(geom_col[2]),
        )
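# Usage sketch for get_vector_geo_dataframe (illustrative only; the vector
# name and polygon are made-up examples). Only the features intersecting
# the polygon are returned, and geojson_cut=True clips their geometries
# to it:
def example_vector_geojson_filter():
    import json
    bbox = {
        'type': 'Polygon',
        'coordinates': [[
            [166.0, -22.5], [167.0, -22.5], [167.0, -21.5],
            [166.0, -21.5], [166.0, -22.5],
        ]],
    }
    return VectorManager.get_vector_geo_dataframe(
        'provinces',                     # hypothetical registered vector
        geojson_filter=json.dumps(bbox),
        geojson_cut=True,
    )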
def test_get_delete_dataframe(self):
    data_provider_1 = TestDataProvider('test_data_provider_1')
    with Connector.get_connection() as connection:
        pp1 = BasePlotProvider(data_provider_1)
        df1 = pp1.get_niamoto_plot_dataframe(connection)
        # 1. Nothing to delete
        plot_1 = pd.DataFrame.from_records([
            {
                'id': 0,
                'name': 'plot_1_1',
                'location': from_shape(Point(166.5521, -22.0939),
                                       srid=4326),
                'properties': '{}',
            },
            {
                'id': 1,
                'name': 'plot_1_2',
                'location': from_shape(Point(166.551, -22.098), srid=4326),
                'properties': '{}',
            },
            {
                'id': 2,
                'location': from_shape(Point(166.552, -22.097), srid=4326),
                'properties': '{}',
            },
            {
                'id': 5,
                'location': from_shape(Point(166.553, -22.099), srid=4326),
                'properties': '{}',
            },
        ], index='id')
        delete_df = pp1.get_delete_dataframe(df1, plot_1)
        self.assertIn('provider_pk', delete_df.columns)
        self.assertIn('provider_id', delete_df.columns)
        self.assertEqual(
            len(delete_df[pd.isnull(delete_df['provider_pk'])]), 0)
        self.assertEqual(
            len(delete_df[pd.isnull(delete_df['provider_id'])]), 0)
        self.assertEqual(len(delete_df), 0)
        self.assertEqual(
            len(delete_df[pd.isnull(delete_df['location'])]), 0)
        # 2. Everything to delete
        plot_2 = pd.DataFrame.from_records([
            {
                'id': 10,
                'name': 'plot_1_11',
                'location': from_shape(Point(166.5521, -22.0939),
                                       srid=4326),
                'properties': '{}',
            },
        ], index='id')
        delete_df = pp1.get_delete_dataframe(df1, plot_2)
        self.assertEqual(len(delete_df), 4)
        self.assertEqual(
            len(delete_df[pd.isnull(delete_df['location'])]), 0)
        # 3. Partial delete
        plot_3 = pd.DataFrame.from_records([
            {
                'id': 0,
                'name': 'plot_1_1',
                'location': from_shape(Point(166.5521, -22.0939),
                                       srid=4326),
                'properties': '{}',
            },
        ], index='id')
        delete_df = pp1.get_delete_dataframe(df1, plot_3)
        self.assertEqual(len(delete_df), 3)
        self.assertEqual(
            len(delete_df[pd.isnull(delete_df['location'])]), 0)
def test_publish_to_postgis(self):
    CsvDataProvider.register_data_provider('csv_provider')
    csv_provider = CsvDataProvider(
        'csv_provider',
        occurrence_csv_path=TEST_OCCURRENCE_CSV,
    )
    csv_provider.sync()
    with Connector.get_connection() as connection:
        sel = select([
            meta.occurrence.c.id.label('id'),
            meta.occurrence.c.taxon_id.label('taxon_id'),
            cast(meta.taxon.c.rank, String).label('rank'),
            meta.taxon.c.full_name.label('full_name'),
            cast(meta.occurrence.c.location, String).label('location'),
        ]).select_from(
            meta.occurrence.outerjoin(
                meta.taxon,
                meta.taxon.c.id == meta.occurrence.c.taxon_id
            )
        )
        df = gpd.read_postgis(
            sel, connection,
            index_col='id',
            geom_col='location',
            crs='+init=epsg:4326',
        )
        BaseDataPublisher._publish_sql(
            df, 'test_export_postgis', schema='niamoto')
        engine = Connector.get_engine()
        inspector = Inspector.from_engine(engine)
        self.assertIn(
            'test_export_postgis',
            inspector.get_table_names(schema=settings.NIAMOTO_SCHEMA),
        )
        df2 = gpd.read_postgis(
            sel, connection,
            index_col='id',
            geom_col='location',
            crs={'init': 'epsg:4326'},
        )
        BaseDataPublisher._publish_sql(
            df2, 'test_export_postgis',
            schema='niamoto',
            if_exists='truncate',
            truncate_cascade=True,
        )
        # Test geometry types
        polygon = Polygon([(0, 0), (1, 0), (1, 1)])
        linestring = LineString([(0, 0), (0, 1), (1, 1)])
        multipoint = MultiPoint([(1, 2), (3, 4), (5, 6)])
        multilinestring = MultiLineString(
            [[(1, 2), (3, 4), (5, 6)], [(7, 8), (9, 10)]])
        polygon_2 = Polygon(
            [(1, 1), (1, -1), (-1, -1), (-1, 1)],
            [[(.5, .5), (.5, -.5), (-.5, -.5), (-.5, .5)]])
        multipolygon = MultiPolygon([polygon, polygon_2])
        geometry = GeometryCollection([polygon, polygon_2])
        for geom in (polygon, linestring, multilinestring, multipoint,
                     multipolygon, geometry):
            BaseDataPublisher._publish_sql(
                gpd.GeoDataFrame([{'A': 1, 'geom': geom}], geometry='geom'),
                'test_export_postgis',
                schema='niamoto',
                if_exists='replace',
            )
        BaseDataPublisher._publish_sql(
            gpd.GeoDataFrame([{'A': 1, 'geom': geometry}], geometry='geom'),
            'TEST123',
            schema='niamoto',
            if_exists='replace',
        )
        BaseDataPublisher._publish_sql(
            gpd.GeoSeries([polygon]),
            'test_export_postgis',
            schema='niamoto',
            if_exists='replace',
        )
        self.assertRaises(
            ValueError,
            BaseDataPublisher._publish_sql,
            gpd.GeoSeries([polygon]),
            'test_export_postgis',
            schema='niamoto',
            if_exists='thisisnotallowed',
        )
def set_taxonomy(cls, taxon_dataframe):
    """
    Set the taxonomy. If a taxonomy already exists, delete it and set the
    new one.
    :param taxon_dataframe: A dataframe containing the taxonomy to set.
        The dataframe must contain at least the following columns:
            taxon_id  -> The id of the taxon (must be the index of the
                DataFrame).
            parent_id -> The id of the taxon's parent.
            rank      -> The rank of the taxon (must be a value in:
                'REGNUM', 'PHYLUM', 'CLASSIS', 'ORDO', 'FAMILIA',
                'GENUS', 'SPECIES', 'INFRASPECIES').
            full_name -> The full name of the taxon.
            rank_name -> The rank name of the taxon.
        The remaining columns will be stored as synonyms, using the
        column name as the synonym key.
    """
    LOGGER.debug("Starting set_taxonomy...")
    required_columns = {'parent_id', 'rank', 'full_name', 'rank_name'}
    cols = set(taxon_dataframe.columns)
    inter = cols.intersection(required_columns)
    synonym_cols = cols.difference(required_columns)
    if cls.IDENTITY_SYNONYM_KEY in synonym_cols:
        synonym_cols = synonym_cols.difference({cls.IDENTITY_SYNONYM_KEY})
        m = "The '{}' synonym key is a special key reserved by Niamoto, " \
            "this column will be ignored."
        LOGGER.warning(m.format(cls.IDENTITY_SYNONYM_KEY))
    if inter != required_columns:
        m = "The taxon dataframe does not contain the required columns " \
            "{}, csv has: {}".format(required_columns, cols)
        raise MalformedDataSourceError(m)
    if len(synonym_cols) > 0:
        LOGGER.debug(
            "The following synonym keys have been detected: {}".format(
                ','.join(synonym_cols)))
        synonyms = taxon_dataframe[list(synonym_cols)].apply(
            lambda x: x.to_json(), axis=1)
    else:
        LOGGER.debug("No synonym keys were detected.")
        synonyms = '{}'
    taxon_dataframe.drop(synonym_cols, axis=1, inplace=True)
    taxon_dataframe['synonyms'] = synonyms
    mptt = ['mptt_tree_id', 'mptt_depth', 'mptt_left', 'mptt_right']
    for col in mptt:
        taxon_dataframe[col] = 0
    taxon_dataframe = cls.construct_mptt(taxon_dataframe)
    taxon_dataframe['taxon_id'] = taxon_dataframe.index
    taxon_dataframe = taxon_dataframe.astype(object).where(
        pd.notnull(taxon_dataframe), None)
    with Connector.get_connection() as connection:
        current_synonym_keys = set(
            pd.read_sql(select([meta.synonym_key_registry.c.name]),
                        connection)['name'])
        to_add = synonym_cols.difference(current_synonym_keys)
        to_delete = current_synonym_keys.difference(
            synonym_cols).difference({cls.IDENTITY_SYNONYM_KEY})
        to_keep = current_synonym_keys.intersection(synonym_cols)
        if len(to_delete) > 0:
            prov = meta.data_provider
            syno = meta.synonym_key_registry
            providers_to_update = pd.read_sql(
                select([prov.c.name]).select_from(
                    prov.join(syno, syno.c.id == prov.c.synonym_key_id)
                ).where(syno.c.name.in_(to_delete)),
                connection)
            msg = "The following synonym keys will be deleted: {}. " \
                  "The following data providers were depending on those " \
                  "synonym keys: {}. Please consider updating them, or " \
                  "updating the taxonomy."
            LOGGER.warning(
                msg.format(to_delete, set(providers_to_update['name'])))
        with connection.begin():
            connection.execute("SET CONSTRAINTS ALL DEFERRED;")
            # Unregister synonym keys
            cls.unregister_all_synonym_keys(
                bind=connection,
                exclude=to_keep,
            )
            # Delete existing taxonomy
            cls.delete_all_taxa(bind=connection)
            # Register synonym cols
            for synonym_key in to_add:
                cls.register_synonym_key(synonym_key, bind=connection)
            # Insert the data
            LOGGER.debug("Inserting the taxonomy in database...")
            if len(taxon_dataframe) > 0:
                ins = meta.taxon.insert().values(
                    id=bindparam('taxon_id'),
                    full_name=bindparam('full_name'),
                    rank_name=bindparam('rank_name'),
                    rank=bindparam('rank'),
                    parent_id=bindparam('parent_id'),
                    synonyms=cast(bindparam('synonyms'), JSONB),
                    mptt_left=bindparam('mptt_left'),
                    mptt_right=bindparam('mptt_right'),
                    mptt_tree_id=bindparam('mptt_tree_id'),
                    mptt_depth=bindparam('mptt_depth'),
                )
                result = connection.execute(
                    ins,
                    taxon_dataframe.to_dict(orient='records')
                ).rowcount
            else:
                result = 0
        m = "The taxonomy has been successfully set ({} taxa inserted)!"
        LOGGER.debug(m.format(result))
        return result, synonym_cols
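# Usage sketch for set_taxonomy (illustrative only; the taxa and the
# 'gbif' synonym column are made-up examples). The index carries the
# taxon ids; every column beyond the required ones is treated as a
# synonym key:
def example_set_taxonomy():
    import pandas as pd
    taxa = pd.DataFrame.from_records([
        {'taxon_id': 1, 'parent_id': None, 'rank': 'FAMILIA',
         'full_name': 'Family One', 'rank_name': 'One', 'gbif': 100},
        {'taxon_id': 2, 'parent_id': 1, 'rank': 'GENUS',
         'full_name': 'Family One Genus One', 'rank_name': 'One',
         'gbif': 200},
    ]).set_index('taxon_id')
    inserted_count, synonym_keys = TaxonomyManager.set_taxonomy(taxa)
    return inserted_count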
def get_reindexed_provider_dataframe(self, dataframe):
    """
    :param dataframe: The provider's DataFrame, or a subset, with the
        index being a multi-index composed of [plot_id, occurrence_id]
        (the provider's ids).
    :return: The dataframe reindexed:
        provider_plot_pk -> plot_id
        provider_occurrence_pk -> occurrence_id
    """
    LOGGER.debug("reindexing provider's plot-occurrence dataframe...")
    if len(dataframe) == 0:
        return dataframe
    # Set index names
    dataframe.index.set_names(
        ['provider_plot_pk', 'provider_occurrence_pk'],
        inplace=True
    )
    with Connector.get_connection() as connection:
        sel_plot = select([
            plot.c.id.label('plot_id'),
            plot.c.provider_pk.label('provider_plot_pk')
        ]).where(plot.c.provider_id == self.data_provider.db_id)
        sel_occ = select([
            occurrence.c.id.label('occurrence_id'),
            occurrence.c.provider_pk.label('provider_occurrence_pk')
        ]).where(occurrence.c.provider_id == self.data_provider.db_id)
        plot_ids = pd.read_sql(
            sel_plot,
            connection,
            index_col='provider_plot_pk'
        )
        occ_ids = pd.read_sql(
            sel_occ,
            connection,
            index_col='provider_occurrence_pk'
        )
    dataframe.reset_index(inplace=True)
    # Assert plots and occurrences exist in db
    plot_pks = pd.Index(pd.unique(dataframe['provider_plot_pk']))
    plot_not_in_db = plot_pks.difference(plot_ids.index)
    occ_pks = pd.Index(pd.unique(dataframe['provider_occurrence_pk']))
    occ_not_in_db = occ_pks.difference(occ_ids.index)
    if len(plot_not_in_db) != 0:
        raise IncoherentDatabaseStateError(
            "Tried to insert plot_occurrence records with plot records "
            "that do not exist in database."
        )
    if len(occ_not_in_db) != 0:
        raise IncoherentDatabaseStateError(
            "Tried to insert plot_occurrence records with occurrence "
            "records that do not exist in database."
        )
    dataframe = dataframe.merge(
        plot_ids,
        left_on='provider_plot_pk',
        right_index=True,
    )
    dataframe = dataframe.merge(
        occ_ids,
        left_on='provider_occurrence_pk',
        right_index=True,
    )
    dataframe.set_index(['plot_id', 'occurrence_id'], inplace=True)
    dataframe['provider_id'] = self.data_provider.db_id
    dataframe['plot_id'] = dataframe.index.get_level_values('plot_id')
    dataframe['occurrence_id'] = dataframe.index.get_level_values(
        'occurrence_id')
    return dataframe
def test_get_insert_dataframe(self):
    data_provider_1 = TestDataProvider('test_data_provider_1')
    with Connector.get_connection() as connection:
        op1 = BaseOccurrenceProvider(data_provider_1)
        df1 = op1.get_niamoto_occurrence_dataframe(connection)
        # 1. Nothing to insert
        occ_1 = pd.DataFrame.from_records([
            {
                'id': 0,
                'taxon_id': None,
                'provider_taxon_id': None,
                'location': from_shape(Point(166.5521, -22.0939),
                                       srid=4326),
                'properties': '{}',
            },
        ], index='id')
        ins = op1.get_insert_dataframe(df1, occ_1)
        self.assertEqual(len(ins), 0)
        # 2. Everything to insert
        occ_2 = pd.DataFrame.from_records([
            {
                'id': 10,
                'taxon_id': None,
                'provider_taxon_id': None,
                'location': from_shape(Point(166.5521, -22.0939),
                                       srid=4326),
                'properties': '{}',
            },
            {
                'id': 11,
                'taxon_id': None,
                'provider_taxon_id': None,
                'location': from_shape(Point(166.551, -22.098), srid=4326),
                'properties': '{}',
            },
        ], index='id')
        ins = op1.get_insert_dataframe(df1, occ_2)
        self.assertIn('provider_pk', ins.columns)
        self.assertIn('provider_id', ins.columns)
        self.assertEqual(len(ins[pd.isnull(ins['provider_pk'])]), 0)
        self.assertEqual(len(ins[pd.isnull(ins['provider_id'])]), 0)
        self.assertEqual(len(ins), 2)
        # 3. Partial insert
        occ_3 = pd.DataFrame.from_records([
            {
                'id': 0,
                'taxon_id': None,
                'provider_taxon_id': None,
                'location': from_shape(Point(166.5521, -22.0939),
                                       srid=4326),
                'properties': '{}',
            },
            {
                'id': 11,
                'taxon_id': None,
                'provider_taxon_id': None,
                'location': from_shape(Point(166.551, -22.098), srid=4326),
                'properties': '{}',
            },
        ], index='id')
        ins = op1.get_insert_dataframe(df1, occ_3)
        self.assertEqual(len(ins), 1)