def process_database(species=None, commit=False):
    session = get_session()

    # Just get taxa that have range polygons
    if species is None:
        taxa = session.execute(
            "SELECT DISTINCT taxon_id FROM taxon_range").fetchall()
    else:
        taxa = session.execute(
            "SELECT DISTINCT taxon_id FROM taxon_range, taxon WHERE taxon_id = taxon.id AND spno IN (%s)"
            % sql_list_placeholder('species', species),
            sql_list_argument('species', species)).fetchall()

    # Unwrap tuple
    taxa = [taxon_id for (taxon_id, ) in taxa]

    # Delete old results
    if commit and len(taxa) > 0:
        # session.execute("DELETE FROM t2_ultrataxon_sighting WHERE taxon_id IN (:taxa)", { 'taxa': taxa })
        session.execute(
            "DELETE FROM t2_ultrataxon_sighting WHERE taxon_id IN (%s)"
            % sql_list_placeholder('taxa', taxa),
            sql_list_argument('taxa', taxa))
        session.commit()

    # Process in parallel
    tasks = [(taxon_id, commit) for taxon_id in taxa]

    # This is important because we are about to spawn child processes, and this
    # stops them attempting to share the same database connection pool
    session.close()

    for result, error in tqdm(run_parallel(process_taxon, tasks), total=len(tasks)):
        if error:
            print(error)
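
# The sql_list_placeholder/sql_list_argument pair used above expands a Python list
# into named bind parameters rather than interpolating the values into the SQL string.
# A minimal sketch of what these helpers presumably look like (hypothetical
# implementation, shown for illustration only):
def sql_list_placeholder(name, items):
    # ":name0, :name1, ..." for use inside an IN (...) clause
    return ", ".join(":%s%d" % (name, i) for i in range(len(items)))

def sql_list_argument(name, items):
    # The matching bind dict: {"name0": items[0], "name1": items[1], ...}
    return dict(("%s%d" % (name, i), item) for i, item in enumerate(items))

# Example:
#   sql_list_placeholder('species', [8, 9])  ->  ":species0, :species1"
#   sql_list_argument('species', [8, 9])     ->  {'species0': 8, 'species1': 9}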
def process_database(species=None, commit=False):
    session = get_session()

    if species is None:
        taxa = [
            taxon_id for (taxon_id, ) in session.execute(
                "SELECT DISTINCT taxon_id FROM processing_method WHERE data_type = 2"
            ).fetchall()
        ]
    else:
        taxa = [
            taxon_id for (taxon_id, ) in session.execute(
                "SELECT DISTINCT taxon_id FROM processing_method, taxon WHERE taxon.id = taxon_id AND data_type = 2 AND spno IN (%s)"
                % sql_list_placeholder('species', species),
                sql_list_argument('species', species)).fetchall()
        ]

    create_region_lookup_table(session)

    log.info("Step 1/2: Monthly aggregation")
    fn = functools.partial(aggregate_by_month, commit=commit)
    for result, error in tqdm(run_parallel(fn, taxa), total=len(taxa)):
        pass

    log.info("Step 2/2: Yearly aggregation")
    fn = functools.partial(aggregate_by_year, commit=commit)
    for result, error in tqdm(run_parallel(fn, taxa), total=len(taxa)):
        pass

    cleanup_region_lookup_table(session)
def process_database(species=None, commit=False):
    session = get_session()

    if species is None:
        taxa = [
            taxon_id for (taxon_id, ) in session.execute(
                "SELECT DISTINCT taxon_id FROM t1_sighting").fetchall()
        ]
    else:
        taxa = [
            taxon_id for (taxon_id, ) in session.execute(
                "SELECT DISTINCT taxon_id FROM t1_sighting, taxon WHERE taxon.id = taxon_id AND spno IN (%s)"
                % sql_list_placeholder('species', species),
                sql_list_argument('species', species)).fetchall()
        ]

    create_region_lookup_table(session)

    # Process in parallel
    tasks = [(taxon_id, commit) for taxon_id in taxa]

    log.info("Step 1/2: Monthly aggregation")
    for result, error in tqdm(run_parallel(aggregate_monthly, tasks), total=len(tasks)):
        if error:
            print(error)

    log.info("Step 2/2: Yearly aggregation")
    for result, error in tqdm(run_parallel(aggregate_yearly, tasks), total=len(tasks)):
        if error:
            print(error)

    cleanup_region_lookup_table(session)
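
# run_parallel, used throughout these process_database variants, yields
# (result, error) pairs so a tqdm progress bar can keep advancing while per-task
# failures are surfaced to the caller. A minimal sketch of that contract, assuming
# a thread-based pool (hypothetical; the real helper may use worker processes and
# preserve ordering differently):
from concurrent.futures import ThreadPoolExecutor, as_completed

def run_parallel(fn, tasks):
    with ThreadPoolExecutor() as executor:
        # Tuple tasks are unpacked into positional arguments, e.g. (taxon_id, commit)
        futures = [
            executor.submit(fn, *task) if isinstance(task, tuple) else executor.submit(fn, task)
            for task in tasks
        ]
        for future in as_completed(futures):
            try:
                yield future.result(), None
            except Exception as e:
                yield None, e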
def process_database(species=None, commit=False):
    """
    Calculates spatial representativeness using alpha hulls

    Generates alpha hulls from each source x taxon combination

    Intersects alpha hulls with range layers, and then calculates percentage of range covered
    """
    session = get_session()

    if commit:
        if species is None:
            session.execute("DELETE FROM taxon_source_alpha_hull")
        else:
            session.execute(
                """DELETE FROM taxon_source_alpha_hull
                    WHERE taxon_id IN (SELECT id FROM taxon WHERE spno IN (%s))"""
                % sql_list_placeholder('species', species),
                sql_list_argument('species', species))
        session.commit()

    # Load coastal shapefile
    coastal_shape_filename = tsx.config.config.get("processing.alpha_hull", "coastal_shp")
    with fiona.Env(OSR_WKT_FORMAT="WKT2_2018"), fiona.open(coastal_shape_filename, 'r') as coastal_shape:
        # Convert from fiona dictionary to shapely geometry and reproject
        shp_to_working_transformer = pyproj.Transformer.from_proj(
            pyproj.CRS.from_wkt(coastal_shape.crs_wkt),
            working_proj,
            always_xy=True)
        coastal_shape = reproject(shape(coastal_shape[0]['geometry']), shp_to_working_transformer)

    # Simplify coastal boundary - makes things run ~20X faster
    log.info("Simplifying coastal boundary")
    coastal_shape = coastal_shape.buffer(10000).simplify(10000)

    log.info("Generating alpha shapes")

    for data_type in 1, 2:
        log.info("Processing type %s data" % data_type)

        taxa = get_taxa(session, data_type, species)

        tasks = [(taxon_id, coastal_shape, data_type, commit) for taxon_id in taxa]

        # This is important because we are about to spawn child processes, and this
        # stops them attempting to share the same database connection pool
        session.close()  # TODO: not sure if this is needed now

        # Process all the species in parallel
        for result, error in tqdm(run_parallel(process, tasks), total=len(tasks)):
            if error:
                print(error)
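
# In this version, reproject() takes a shapely geometry plus a prebuilt
# pyproj.Transformer. A minimal sketch of that helper under those assumptions
# (hypothetical implementation):
from shapely.ops import transform

def reproject(geom, transformer):
    # pyproj.Transformer.transform has the (x, y) -> (x, y) callable signature
    # that shapely.ops.transform expects
    return transform(transformer.transform, geom)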
def get_taxa(session, data_type, species):
    table = "t1_sighting" if data_type == 1 else "t2_ultrataxon_sighting"

    if species is None:
        taxa = session.execute(
            """SELECT DISTINCT taxon_id FROM {table}""".format(table=table)).fetchall()
    else:
        sql = """SELECT DISTINCT taxon_id
            FROM {table}, taxon
            WHERE taxon.id = taxon_id
            AND spno IN ({species})""".format(
                table=table,
                species=sql_list_placeholder('species', species))
        taxa = session.execute(sql, sql_list_argument('species', species)).fetchall()

    return [taxon_id for (taxon_id,) in taxa]
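
# Example usage (spno values here are hypothetical):
#   get_taxa(session, data_type=1, species=None)        # every taxon with t1 sightings
#   get_taxa(session, data_type=2, species=[529, 530])  # ultrataxon sightings, selected spno only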
def process_database(species=None, monthly=False, filter_output=False,
                     include_all_years_data=False, database_config=None, export_dir=None):
    session = get_session(database_config)

    if species is None:
        taxa = [
            taxon_id for (taxon_id, ) in session.execute(
                "SELECT DISTINCT taxon_id FROM aggregated_by_year").fetchall()
        ]
    else:
        taxa = [
            taxon_id for (taxon_id, ) in session.execute(
                "SELECT DISTINCT taxon_id FROM aggregated_by_year, taxon WHERE taxon.id = taxon_id AND spno IN (%s)"
                % sql_list_placeholder('species', species),
                sql_list_argument('species', species)).fetchall()
        ]

    log.info("Generating numeric IDs")

    # Create stable IDs for each taxon_id / search_type_id / source_id / unit_id / site_id / data_type combination
    # session.execute("""CREATE TEMPORARY TABLE aggregated_id
    #     ( INDEX (taxon_id, search_type_id, source_id, unit_id, site_id, grid_cell_id, data_type) )
    #     SELECT (@cnt := @cnt + 1) AS id, taxon_id, search_type_id, source_id, unit_id, site_id, grid_cell_id, data_type
    #     FROM (SELECT DISTINCT taxon_id, search_type_id, source_id, unit_id, site_id, grid_cell_id, data_type FROM aggregated_by_year) t
    #     CROSS JOIN (SELECT @cnt := 0) AS dummy""")

    log.info("Calculating region centroids")
    session.execute("""CREATE TEMPORARY TABLE region_centroid AS
        -- (PRIMARY KEY (id))
        SELECT id, ST_X(ST_Centroid(geometry)) AS x, ST_Y(ST_Centroid(geometry)) AS y
        FROM region""")

    # Get year range
    min_year = tsx.config.config.getint("processing", "min_year")
    max_analysis_year = tsx.config.config.getint("processing", "max_year")

    # When enabled, this flag means that all years' data will be included for any time series that passed
    # filtering, even beyond the max_year specified in the config file. However, the TimeSeriesSampleYears
    # and other stats still need to reflect only the years up to max_year, so it makes things a tad more
    # complicated.
    if include_all_years_data:
        (max_year, ) = session.execute(
            """SELECT MAX(start_date_y) FROM aggregated_by_year""").fetchone()
    else:
        max_year = max_analysis_year

    # Without this, the GROUP_CONCAT in the export query produces rows that are too long
    if database_config and "sqlite:" not in database_config:
        session.execute("""SET SESSION group_concat_max_len = 50000;""")

    export_dir = export_dir or tsx.config.data_dir('export')

    filename = 'lpi'
    if monthly:
        filename += '-monthly'
    if filter_output:
        filename += '-filtered'
    if include_all_years_data:
        filename += '-all-years'
    filename += '.csv'

    filepath = os.path.join(export_dir, filename)

    log.info("Exporting LPI wide table file: %s" % filepath)

    with open(filepath, 'w', encoding='utf-8') as csvfile:
        fieldnames = [
            'ID', 'Binomial', 'SpNo', 'TaxonID', 'CommonName', 'Class', 'Order', 'Family',
            'FamilyCommonName', 'Genus', 'Species', 'Subspecies', 'FunctionalGroup',
            # 'FunctionalSubGroup',
            'TaxonomicGroup', 'EPBCStatus', 'IUCNStatus', 'StatePlantStatus', 'MaxStatus', 'State',
            'Region', 'RegionCentroidLatitude', 'RegionCentroidLongitude', 'RegionCentroidAccuracy',
            'SiteID', 'SiteName', 'SourceID', 'SourceDesc', 'MonitoringProgram', 'UnitID', 'Unit',
            'SearchTypeID', 'SearchTypeDesc', 'ExperimentalDesignType', 'ResponseVariableType',
            'DataType'
        ]

        if monthly:
            fieldnames += [
                "%s_%02d" % (year, month)
                for year in range(min_year, max_year + 1)
                for month in range(0, 13)
            ]
        else:
            fieldnames += [str(year) for year in range(min_year, max_year + 1)]

        fieldnames += [
            'TimeSeriesLength', 'TimeSeriesSampleYears', 'TimeSeriesCompleteness',
            'TimeSeriesSamplingEvenness', 'AbsencesRecorded', 'StandardisationOfMethodEffort',
            'ObjectiveOfMonitoring', 'SpatialRepresentativeness',
            # 'SeasonalConsistency', # TBD
            'ConsistencyOfMonitoring',
            # 'MonitoringFrequencyAndTiming', # TBD
            'IntensiveManagement', 'IntensiveManagementGrouping', 'DataAgreement',
            'SuppressAggregatedData', 'SurveysCentroidLatitude', 'SurveysCentroidLongitude',
            'SurveysSpatialAccuracy', 'SurveyCount', 'TimeSeriesID', 'NationalPriorityTaxa',
            'Citation'
        ]

        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        where_conditions = []
        having_clause = ''

        if filter_output:
            if include_all_years_data:
                having_clause = "HAVING MAX(include_in_analysis)"
            else:
                where_conditions += ['include_in_analysis']

        if monthly:
            value_series = "GROUP_CONCAT(CONCAT(start_date_y, '_', LPAD(COALESCE(start_date_m, 0), 2, '0'), '=', value))"
            aggregated_table = 'aggregated_by_month'
        else:
            value_series = "GROUP_CONCAT(CONCAT(start_date_y, '=', value))"
            aggregated_table = 'aggregated_by_year'

        if database_config and "sqlite:" in database_config:
            current_date_expression = "DATE('NOW')"
            current_year_expression = "strftime('%Y', 'now')"
        else:
            current_date_expression = "DATE(NOW())"
            current_year_expression = "YEAR(NOW())"

        index = 1

        for taxon_id in tqdm(taxa):
            # (SELECT CAST(id AS UNSIGNED) FROM aggregated_id agg_id
            #     WHERE agg.taxon_id = agg_id.taxon_id
            #     AND agg.search_type_id <=> agg_id.search_type_id
            #     AND agg.source_id = agg_id.source_id
            #     AND agg.unit_id = agg_id.unit_id
            #     AND agg.site_id <=> agg_id.site_id
            #     AND agg.grid_cell_id <=> agg_id.grid_cell_id
            #     AND agg.data_type = agg_id.data_type) AS ID,
            sql = """SELECT
                    time_series_id AS TimeSeriesID,
                    taxon.spno AS SpNo,
                    taxon.id AS TaxonID,
                    taxon.common_name AS CommonName,
                    taxon.`order` AS `Order`,
                    taxon.scientific_name AS scientific_name,
                    taxon.family_scientific_name AS Family,
                    taxon.family_common_name AS FamilyCommonName,
                    (SELECT GROUP_CONCAT(
                            CONCAT(taxon_group.group_name, COALESCE(CONCAT(':', taxon_group.subgroup_name), ''))
                        )
                        FROM taxon_group
                        WHERE taxon_group.taxon_id = taxon.id
                    ) AS FunctionalGroup,
                    taxon.taxonomic_group AS TaxonomicGroup,
                    CASE taxon.taxonomic_group
                        WHEN 'Birds' THEN 'Aves'
                        WHEN 'Mammals' THEN 'Mammalia'
                        ELSE ''
                    END AS Class,
                    taxon.national_priority AS NationalPriorityTaxa,
                    (SELECT description FROM taxon_status WHERE taxon_status.id = taxon.epbc_status_id) AS EPBCStatus,
                    (SELECT description FROM taxon_status WHERE taxon_status.id = taxon.iucn_status_id) AS IUCNStatus,
                    (SELECT description FROM taxon_status WHERE taxon_status.id = taxon.state_status_id) AS StatePlantStatus,
                    (SELECT description FROM taxon_status WHERE taxon_status.id = taxon.max_status_id) AS MaxStatus,
                    search_type.id AS SearchTypeID,
                    search_type.description AS SearchTypeDesc,
                    COALESCE(site_id, grid_cell_id) AS SiteID,
                    COALESCE(
                        t1_site.name,
                        t2_site.name,
                        CONCAT('site_', agg.data_type, '_', site_id),
                        CONCAT('grid_', grid_cell_id)) AS SiteName,
                    (SELECT description FROM intensive_management WHERE t1_site.intensive_management_id = intensive_management.id) AS IntensiveManagement,
                    (SELECT `grouping` FROM intensive_management WHERE t1_site.intensive_management_id = intensive_management.id) AS IntensiveManagementGrouping,
                    source.id AS SourceID,
                    source.description AS SourceDesc,
                    (SELECT description FROM monitoring_program WHERE source.monitoring_program_id = monitoring_program.id) AS MonitoringProgram,
                    unit.id AS UnitID,
                    unit.description AS Unit,
                    region.name AS Region,
                    region.state AS State,
                    MIN(region_centroid.x) AS RegionCentroidLongitude,
                    MIN(region_centroid.y) AS RegionCentroidLatitude,
                    region.positional_accuracy_in_m AS RegionCentroidAccuracy,
                    {value_series} AS value_series,
                    COUNT(*) AS value_count,
                    agg.data_type AS DataType,
                    (SELECT description FROM experimental_design_type WHERE agg.experimental_design_type_id = experimental_design_type.id) AS ExperimentalDesignType,
                    (SELECT description FROM response_variable_type WHERE agg.response_variable_type_id = response_variable_type.id) AS ResponseVariableType,
                    (CASE WHEN taxon.suppress_spatial_representativeness THEN NULL ELSE ROUND(alpha.alpha_hull_area_in_m2 / alpha.core_range_area_in_m2, 4) END) AS SpatialRepresentativeness,
                    data_source.absences_recorded AS AbsencesRecorded,
                    data_source.standardisation_of_method_effort_id AS StandardisationOfMethodEffort,
                    data_source.objective_of_monitoring_id AS ObjectiveOfMonitoring,
                    data_source.consistency_of_monitoring_id AS ConsistencyOfMonitoring,
                    data_source.data_agreement_id AS DataAgreement,
                    data_source.suppress_aggregated_data AS SuppressAggregatedData,
                    MAX(ST_X(agg.centroid_coords)) AS SurveysCentroidLongitude,
                    MAX(ST_Y(agg.centroid_coords)) AS SurveysCentroidLatitude,
                    MAX(agg.positional_accuracy_in_m) AS SurveysSpatialAccuracy,
                    SUM(agg.survey_count) AS SurveyCount,
                    CONCAT(
                        COALESCE(CONCAT(source.authors, ' '), ''),
                        '(', {current_year_expression}, '). ',
                        COALESCE(CONCAT(source.description, '. '), ''),
                        COALESCE(CONCAT(source.provider, '. '), ''),
                        'Aggregated for National Environmental Science Program Threatened Species Recovery Hub Project 3.1. Generated on ',
                        {current_date_expression}
                    ) AS Citation
                FROM
                    {aggregated_table} agg
                    INNER JOIN taxon ON taxon.id = agg.taxon_id
                    LEFT JOIN search_type ON search_type.id = agg.search_type_id
                    INNER JOIN source ON source.id = agg.source_id
                    INNER JOIN unit ON unit.id = agg.unit_id
                    LEFT JOIN region ON region.id = agg.region_id
                    LEFT JOIN region_centroid ON region_centroid.id = agg.region_id
                    LEFT JOIN taxon_source_alpha_hull alpha
                        ON alpha.taxon_id = agg.taxon_id
                        AND alpha.source_id = agg.source_id
                        AND alpha.data_type = agg.data_type
                    LEFT JOIN data_source
                        ON data_source.taxon_id = agg.taxon_id
                        AND data_source.source_id = agg.source_id
                    LEFT JOIN t1_site ON site_id = t1_site.id AND agg.data_type = 1
                    LEFT JOIN t2_site ON site_id = t2_site.id AND agg.data_type = 2
                WHERE agg.taxon_id = :taxon_id
                AND start_date_y >= :min_year
                AND start_date_y <= :max_year
                {where_conditions}
                GROUP BY
                    agg.source_id, agg.search_type_id, agg.site_id, agg.grid_cell_id,
                    agg.experimental_design_type_id, agg.response_variable_type_id,
                    agg.region_id, agg.unit_id, agg.data_type
                {having_clause}
                ORDER BY
                    agg.source_id, agg.search_type_id, agg.site_id, agg.grid_cell_id,
                    agg.experimental_design_type_id, agg.response_variable_type_id,
                    agg.region_id, agg.unit_id, agg.data_type
                """.format(
                    value_series=value_series,
                    aggregated_table=aggregated_table,
                    where_conditions=" ".join("AND %s" % cond for cond in where_conditions),
                    having_clause=having_clause,
                    current_date_expression=current_date_expression,
                    current_year_expression=current_year_expression)

            result = session.execute(sql, {
                'taxon_id': taxon_id,
                'min_year': min_year,
                'max_year': max_year
            })

            keys = result.keys()

            for row in result.fetchall():
                # Get row as a dict
                data = dict(zip(keys, row))

                data["ID"] = index
                index += 1

                # Parse out the yearly values (or monthly)
                year_data = dict(item.split('=') for item in data['value_series'].split(','))

                if len(year_data) != data['value_count']:
                    raise ValueError("Aggregation problem - duplicate years found in time series: %s" % row)

                # Populate years in output
                data.update(year_data)

                # Taxonomic columns
                data['Binomial'] = re.sub(r'[^A-Za-z]+', '_', data['scientific_name']).strip('_')[0:40]
                name_parts = data['scientific_name'].split(' ', 2)
                data['Genus'] = name_parts[0]
                if len(name_parts) > 1:
                    data['Species'] = name_parts[1]
                if len(name_parts) > 2:
                    data['Subspecies'] = name_parts[2]

                # Calculate temporal suitability metrics:
                if not monthly and len(year_data) > 0:
                    years = sorted([int(year) for year in year_data.keys()])

                    # If we include all years' data, we still want to output stats as if we were
                    # only processing up to max_analysis_year
                    if include_all_years_data:
                        years = list(filter(lambda y: y <= max_analysis_year, years))

                    # Due to previous step, years could in fact be empty by this point
                    if len(years) > 0:
                        year_range = max(years) - min(years) + 1

                        data['TimeSeriesLength'] = year_range
                        data['TimeSeriesSampleYears'] = len(years)
                        data['TimeSeriesCompleteness'] = "%0.3f" % (float(len(years)) / year_range)

                        # Get all non-zero gaps between years
                        gaps = [b - a - 1 for a, b in zip(years[:-1], years[1:]) if b - a > 1]
                        data['TimeSeriesSamplingEvenness'] = np.array(gaps).var() if len(gaps) > 0 else 0

                # Remove unwanted keys from dict
                del data['value_series']
                del data['value_count']
                del data['scientific_name']

                if unicode_type_exists:
                    writer.writerow({
                        k: None if v is None else unicode(v).encode("utf-8")
                        for k, v in data.items()
                    })
                else:
                    writer.writerow({
                        k: None if v is None else str(v)
                        for k, v in data.items()
                    })

    log.info("Done")
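
# The export above parses value_series strings of the form "2001=4.5,2002=3.0,..."
# into per-year values, then derives the temporal suitability metrics. A
# self-contained sketch of that logic with made-up numbers (hypothetical example
# data, not from the database):
import numpy as np

def temporal_metrics(value_series):
    year_data = dict(item.split('=') for item in value_series.split(','))
    years = sorted(int(y) for y in year_data)
    year_range = max(years) - min(years) + 1
    # Non-zero gaps between consecutive sampled years
    gaps = [b - a - 1 for a, b in zip(years[:-1], years[1:]) if b - a > 1]
    return {
        'TimeSeriesLength': year_range,
        'TimeSeriesSampleYears': len(years),
        'TimeSeriesCompleteness': "%0.3f" % (float(len(years)) / year_range),
        'TimeSeriesSamplingEvenness': np.array(gaps).var() if gaps else 0,
    }

# temporal_metrics("2001=4.5,2002=3.0,2005=7.0")
# -> length 5, sample years 3, completeness "0.600", evenness 0.0 (single gap of 2)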
def process_database(species=None, commit=False):
    """
    Calculates spatial representativeness using alpha hulls

    Generates alpha hulls from each source x taxon combination

    Intersects alpha hulls with range layers, and then calculates percentage of range covered
    """
    session = get_session()

    if commit:
        if species is None:
            session.execute("DELETE FROM taxon_source_alpha_hull")
        else:
            session.execute(
                """DELETE FROM taxon_source_alpha_hull
                    WHERE taxon_id IN (SELECT id FROM taxon WHERE spno IN (%s))"""
                % sql_list_placeholder('species', species),
                sql_list_argument('species', species))
        session.commit()

    db_proj = pyproj.Proj('+init=EPSG:4326')  # Database always uses WGS84
    working_proj = pyproj.Proj('+init=EPSG:3112')  # GDA94 / Geoscience Australia Lambert - so that we can buffer in metres

    # Load coastal shapefile
    coastal_shape_filename = tsx.config.config.get("processing.alpha_hull", "coastal_shp")
    with fiona.open(coastal_shape_filename, 'r') as coastal_shape:
        # Convert from fiona dictionary to shapely geometry and reproject
        coastal_shape = reproject(shape(coastal_shape[0]['geometry']), pyproj.Proj(coastal_shape.crs), working_proj)

    # Simplify coastal boundary - makes things run ~20X faster
    log.info("Simplifying coastal boundary")
    coastal_shape = coastal_shape.buffer(10000).simplify(10000)

    log.info("Generating alpha shapes")

    for data_type in 1, 2:
        log.info("Processing type %s data" % data_type)

        # Process a single species.
        # This gets run off the main thread.
        def process(taxon_id):
            session = get_session()
            try:
                # Load core range geometry
                core_range_geom = reproject(
                    get_core_range_geometry(session, taxon_id),
                    db_proj, working_proj).buffer(0).intersection(coastal_shape)

                for source_id in get_source_ids(session, data_type, taxon_id):
                    log.info("Processing taxon_id: %s, source_id: %s" % (taxon_id, source_id))

                    # Get raw points from DB
                    raw_points = get_raw_points(session, data_type, taxon_id, source_id)

                    empty = len(raw_points) < 4
                    if empty:
                        log.info("Taxon %s: not enough points to create alpha hull (%s)" % (taxon_id, len(raw_points)))

                    if not empty:
                        # Read points from database
                        points = [reproject(p, db_proj, working_proj) for p in raw_points]

                        # Generate alpha shape
                        alpha_shp = make_alpha_hull(
                            points=points,
                            coastal_shape=None,
                            thinning_distance=tsx.config.config.getfloat('processing.alpha_hull', 'thinning_distance'),
                            alpha=tsx.config.config.getfloat('processing.alpha_hull', 'alpha'),
                            hullbuffer_distance=tsx.config.config.getfloat('processing.alpha_hull', 'hullbuffer_distance'),
                            isolatedbuffer_distance=tsx.config.config.getfloat('processing.alpha_hull', 'isolatedbuffer_distance'))

                        # Clean up geometry
                        alpha_shp = alpha_shp.buffer(0)

                        if core_range_geom.area == 0:
                            log.info("Core range geometry area is zero")
                            empty = True
                        else:
                            # Intersect alpha hull with core range
                            intersected_alpha = to_multipolygon(core_range_geom.intersection(alpha_shp))
                            empty = intersected_alpha.is_empty

                    if empty:
                        session.execute(
                            """INSERT INTO taxon_source_alpha_hull
                                (source_id, taxon_id, data_type, core_range_area_in_m2, alpha_hull_area_in_m2)
                                VALUES (:source_id, :taxon_id, :data_type, 0, 0)""", {
                                    'source_id': source_id,
                                    'taxon_id': taxon_id,
                                    'data_type': data_type
                                })
                    else:
                        session.execute(
                            """INSERT INTO taxon_source_alpha_hull
                                (source_id, taxon_id, data_type, geometry, core_range_area_in_m2, alpha_hull_area_in_m2)
                                VALUES (:source_id, :taxon_id, :data_type, ST_GeomFromWKB(_BINARY :geom_wkb),
                                    :core_range_area, :alpha_hull_area)""", {
                                    'source_id': source_id,
                                    'taxon_id': taxon_id,
                                    'data_type': data_type,
                                    'geom_wkb': shapely.wkb.dumps(reproject(intersected_alpha, working_proj, db_proj)),
                                    'core_range_area': core_range_geom.area,
                                    'alpha_hull_area': intersected_alpha.area
                                })

                if commit:
                    session.commit()
            except:
                log.exception("Exception processing alpha hull")
                raise
            finally:
                session.close()

        taxa = get_taxa(session, data_type, species)

        # This is important because we are about to spawn child processes, and this
        # stops them attempting to share the same database connection pool
        session.close()
        tsx.db.connect.engine.dispose()

        # Process all the species in parallel
        for result, error in tqdm(run_parallel(process, taxa, use_processes=True), total=len(taxa)):
            if error:
                print(error)
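
# This older version passes source and destination pyproj.Proj objects to
# reproject() rather than a prebuilt Transformer. A sketch under that assumption
# (hypothetical implementation; pyproj.transform is the legacy pyproj 1.x API,
# deprecated in pyproj 2+):
from functools import partial
import pyproj
from shapely.ops import transform

def reproject(geom, src_proj, dest_proj):
    return transform(partial(pyproj.transform, src_proj, dest_proj), geom)

# Note also the alpha_shp.buffer(0) call above: a zero-width buffer is a common
# shapely idiom for repairing self-intersecting polygons before intersection tests.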
def process_database(species=None, monthly=False, filter_output=False):
    session = get_session()

    if species is None:
        taxa = [
            taxon_id for (taxon_id, ) in session.execute(
                "SELECT DISTINCT taxon_id FROM aggregated_by_year").fetchall()
        ]
    else:
        taxa = [
            taxon_id for (taxon_id, ) in session.execute(
                "SELECT DISTINCT taxon_id FROM aggregated_by_year, taxon WHERE taxon.id = taxon_id AND spno IN (%s)"
                % sql_list_placeholder('species', species),
                sql_list_argument('species', species)).fetchall()
        ]

    log.info("Generating numeric IDs")

    # Create stable IDs for each taxon_id / search_type_id / source_id / unit_id / site_id / data_type combination
    session.execute("""CREATE TEMPORARY TABLE aggregated_id
        ( INDEX (taxon_id, search_type_id, source_id, unit_id, site_id, grid_cell_id, data_type) )
        SELECT (@cnt := @cnt + 1) AS id, taxon_id, search_type_id, source_id, unit_id, site_id, grid_cell_id, data_type
        FROM (SELECT DISTINCT taxon_id, search_type_id, source_id, unit_id, site_id, grid_cell_id, data_type FROM aggregated_by_year) t
        CROSS JOIN (SELECT @cnt := 0) AS dummy""")

    log.info("Calculating region centroids")
    session.execute("""CREATE TEMPORARY TABLE region_centroid
        (PRIMARY KEY (id))
        SELECT id, ST_X(ST_Centroid(geometry)) AS x, ST_Y(ST_Centroid(geometry)) AS y
        FROM region""")

    # Get year range
    min_year = tsx.config.config.getint("processing", "min_year")
    max_year = tsx.config.config.getint("processing", "max_year")

    # Without this, the GROUP_CONCAT in the export query produces rows that are too long
    session.execute("""SET SESSION group_concat_max_len = 50000;""")

    export_dir = tsx.config.data_dir('export')

    filename = 'lpi'
    if monthly:
        filename += '-monthly'
    if filter_output:
        filename += '-filtered'
    filename += '.csv'

    filepath = os.path.join(export_dir, filename)

    log.info("Exporting LPI wide table file: %s" % filepath)

    with open(filepath, 'w') as csvfile:
        fieldnames = [
            'ID', 'Binomial', 'SpNo', 'TaxonID', 'CommonName', 'Class', 'Order', 'Family',
            'FamilyCommonName', 'Genus', 'Species', 'Subspecies', 'FunctionalGroup',
            'FunctionalSubGroup', 'TaxonomicGroup', 'EPBCStatus', 'IUCNStatus',
            'BirdLifeAustraliaStatus', 'MaxStatus', 'State', 'Region', 'RegionCentroidLatitude',
            'RegionCentroidLongitude', 'RegionCentroidAccuracy', 'SiteID', 'SiteDesc', 'SourceID',
            'SourceDesc', 'UnitID', 'Unit', 'SearchTypeID', 'SearchTypeDesc',
            'ExperimentalDesignType', 'ResponseVariableType', 'DataType'
        ]

        if monthly:
            fieldnames += [
                "%s_%02d" % (year, month)
                for year in range(min_year, max_year + 1)
                for month in range(0, 13)
            ]
        else:
            fieldnames += [str(year) for year in range(min_year, max_year + 1)]

        fieldnames += [
            'TimeSeriesLength', 'TimeSeriesSampleYears', 'TimeSeriesCompleteness',
            'TimeSeriesSamplingEvenness', 'AbsencesRecorded', 'StandardisationOfMethodEffort',
            'ObjectiveOfMonitoring', 'SpatialRepresentativeness',
            # 'SeasonalConsistency', # TBD
            'ConsistencyOfMonitoring',
            # 'MonitoringFrequencyAndTiming', # TBD
            'DataAgreement', 'SuppressAggregatedData', 'SurveysCentroidLatitude',
            'SurveysCentroidLongitude', 'SurveysSpatialAccuracy', 'SurveyCount', 'TimeSeriesID',
            'NationalPriorityTaxa', 'Citation'
        ]

        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        where_conditions = []
        if filter_output:
            where_conditions += ['include_in_analysis']

        if monthly:
            value_series = "GROUP_CONCAT(CONCAT(start_date_y, '_', LPAD(COALESCE(start_date_m, 0), 2, '0'), '=', value) ORDER BY start_date_y)"
            aggregated_table = 'aggregated_by_month'
        else:
            value_series = "GROUP_CONCAT(CONCAT(start_date_y, '=', value) ORDER BY start_date_y)"
            aggregated_table = 'aggregated_by_year'

        for taxon_id in tqdm(taxa):
            sql = """SELECT
                    (SELECT CAST(id AS UNSIGNED)
                        FROM aggregated_id agg_id
                        WHERE agg.taxon_id = agg_id.taxon_id
                        AND agg.search_type_id <=> agg_id.search_type_id
                        AND agg.source_id = agg_id.source_id
                        AND agg.unit_id = agg_id.unit_id
                        AND agg.site_id <=> agg_id.site_id
                        AND agg.grid_cell_id <=> agg_id.grid_cell_id
                        AND agg.data_type = agg_id.data_type) AS ID,
                    time_series_id AS TimeSeriesID,
                    taxon.spno AS SpNo,
                    taxon.id AS TaxonID,
                    taxon.common_name AS CommonName,
                    taxon.order AS `Order`,
                    taxon.scientific_name AS scientific_name,
                    taxon.family_scientific_name AS Family,
                    taxon.family_common_name AS FamilyCommonName,
                    taxon.bird_group AS FunctionalGroup,
                    taxon.bird_sub_group AS FunctionalSubGroup,
                    taxon.taxonomic_group AS TaxonomicGroup,
                    taxon.national_priority AS NationalPriorityTaxa,
                    (SELECT description FROM taxon_status WHERE taxon_status.id = taxon.epbc_status_id) AS EPBCStatus,
                    (SELECT description FROM taxon_status WHERE taxon_status.id = taxon.iucn_status_id) AS IUCNStatus,
                    (SELECT description FROM taxon_status WHERE taxon_status.id = taxon.aust_status_id) AS BirdLifeAustraliaStatus,
                    (SELECT description FROM taxon_status WHERE taxon_status.id = GREATEST(
                        COALESCE(taxon.epbc_status_id, 0),
                        COALESCE(taxon.iucn_status_id, 0),
                        COALESCE(taxon.aust_status_id, 0))) AS MaxStatus,
                    search_type.id AS SearchTypeID,
                    search_type.description AS SearchTypeDesc,
                    COALESCE(site_id, grid_cell_id) AS SiteID,
                    COALESCE(
                        (SELECT name FROM t1_site WHERE site_id = t1_site.id AND agg.data_type = 1),
                        (SELECT name FROM t2_site WHERE site_id = t2_site.id AND agg.data_type = 2),
                        CONCAT('site_', agg.data_type, '_', site_id),
                        CONCAT('grid_', grid_cell_id)) AS SiteDesc,
                    source.id AS SourceID,
                    source.description AS SourceDesc,
                    unit.id AS UnitID,
                    unit.description AS Unit,
                    region.name AS Region,
                    region.state AS State,
                    region_centroid.x AS RegionCentroidLongitude,
                    region_centroid.y AS RegionCentroidLatitude,
                    region.positional_accuracy_in_m AS RegionCentroidAccuracy,
                    {value_series} AS value_series,
                    COUNT(*) AS value_count,
                    agg.data_type AS DataType,
                    (SELECT description FROM experimental_design_type WHERE agg.experimental_design_type_id = experimental_design_type.id) AS ExperimentalDesignType,
                    (SELECT description FROM response_variable_type WHERE agg.response_variable_type_id = response_variable_type.id) AS ResponseVariableType,
                    IF(taxon.suppress_spatial_representativeness, NULL,
                        COALESCE(ROUND(alpha.alpha_hull_area_in_m2 / alpha.core_range_area_in_m2, 4), 0)) AS SpatialRepresentativeness,
                    data_source.absences_recorded AS AbsencesRecorded,
                    data_source.standardisation_of_method_effort_id AS StandardisationOfMethodEffort,
                    data_source.objective_of_monitoring_id AS ObjectiveOfMonitoring,
                    data_source.consistency_of_monitoring_id AS ConsistencyOfMonitoring,
                    data_source.data_agreement_id AS DataAgreement,
                    data_source.suppress_aggregated_data AS SuppressAggregatedData,
                    MAX(ST_X(agg.centroid_coords)) AS SurveysCentroidLongitude,
                    MAX(ST_Y(agg.centroid_coords)) AS SurveysCentroidLatitude,
                    MAX(agg.positional_accuracy_in_m) AS SurveysSpatialAccuracy,
                    SUM(agg.survey_count) AS SurveyCount,
                    CONCAT(
                        COALESCE(CONCAT(source.authors, ' '), ''),
                        '(', YEAR(NOW()), '). ',
                        COALESCE(CONCAT(source.description, '. '), ''),
                        COALESCE(CONCAT(source.provider, '. '), ''),
                        'Aggregated for National Environmental Science Program Threatened Species Recovery Hub Project 3.1. Generated on ',
                        DATE(NOW())
                    ) AS Citation
                FROM
                    {aggregated_table} agg
                    INNER JOIN taxon ON taxon.id = agg.taxon_id
                    LEFT JOIN search_type ON search_type.id = search_type_id
                    INNER JOIN source ON source.id = agg.source_id
                    INNER JOIN unit ON unit.id = unit_id
                    LEFT JOIN region ON region.id = region_id
                    LEFT JOIN region_centroid ON region_centroid.id = region_id
                    LEFT JOIN taxon_source_alpha_hull alpha
                        ON alpha.taxon_id = agg.taxon_id
                        AND alpha.source_id = agg.source_id
                        AND alpha.data_type = agg.data_type
                    LEFT JOIN data_source
                        ON data_source.taxon_id = agg.taxon_id
                        AND data_source.source_id = agg.source_id
                WHERE agg.taxon_id = :taxon_id
                AND start_date_y >= :min_year
                AND start_date_y <= :max_year
                {where_conditions}
                GROUP BY
                    agg.source_id, agg.search_type_id, agg.site_id, agg.grid_cell_id,
                    agg.experimental_design_type_id, agg.response_variable_type_id,
                    agg.region_id, agg.unit_id, agg.data_type
                """.format(
                    value_series=value_series,
                    aggregated_table=aggregated_table,
                    where_conditions=" ".join("AND %s" % cond for cond in where_conditions))

            result = session.execute(sql, {
                'taxon_id': taxon_id,
                'min_year': min_year,
                'max_year': max_year
            })

            keys = result.keys()

            for row in result.fetchall():
                # Get row as a dict
                data = dict(zip(keys, row))

                # Parse out the yearly values (or monthly)
                year_data = dict(item.split('=') for item in data['value_series'].split(','))

                if len(year_data) != data['value_count']:
                    raise ValueError("Aggregation problem - duplicate years found in time series: %s" % row)

                # Populate years in output
                data.update(year_data)

                # Taxonomic columns
                data['Binomial'] = re.sub(r'[^\w]', '_', data['CommonName'])
                data['Class'] = 'Aves'
                name_parts = data['scientific_name'].split(' ')
                data['Genus'] = name_parts[0]
                if len(name_parts) > 1:
                    data['Species'] = name_parts[1]
                if len(name_parts) > 2:
                    data['Subspecies'] = name_parts[2]

                # Calculate temporal suitability metrics:
                if not monthly and len(year_data) > 0:
                    years = sorted([int(year) for year in year_data.keys()])
                    year_range = max(years) - min(years) + 1

                    data['TimeSeriesLength'] = year_range
                    data['TimeSeriesSampleYears'] = len(years)
                    data['TimeSeriesCompleteness'] = "%0.3f" % (float(len(years)) / year_range)

                    # Get all non-zero gaps between years
                    gaps = [b - a - 1 for a, b in zip(years[:-1], years[1:]) if b - a > 1]
                    data['TimeSeriesSamplingEvenness'] = np.array(gaps).var() if len(gaps) > 0 else 0

                # Remove unwanted keys from dict
                del data['value_series']
                del data['value_count']
                del data['scientific_name']

                writer.writerow(data)

    log.info("Done")