Example #1
def process_database(species = None, commit = False):
    session = get_session()

    if species is None:
        taxa = [taxon_id for (taxon_id,) in session.execute("SELECT DISTINCT taxon_id FROM processing_method WHERE data_type = 2").fetchall()]
    else:
        taxa = [taxon_id for (taxon_id,) in session.execute(
                "SELECT DISTINCT taxon_id FROM processing_method, taxon WHERE taxon.id = taxon_id AND data_type = 2 AND spno IN %s" % sql_list_placeholder('species', species),
                sql_list_argument('species', species)
            ).fetchall()]

    create_region_lookup_table(session)

    log.info("Step 1/2: Monthly aggregation")

    fn = functools.partial(aggregate_by_month, commit = commit)

    for result, error in tqdm(run_parallel(fn, taxa), total = len(taxa)):
        pass

    log.info("Step 2/2: Yearly aggregation")

    fn = functools.partial(aggregate_by_year, commit = commit)

    for result, error in tqdm(run_parallel(fn, taxa), total = len(taxa)):
        pass

    cleanup_region_lookup_table(session)
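
Note: the sql_list_placeholder and sql_list_argument helpers appear throughout these examples but are never defined in them. A minimal sketch of how such a pair could work, with names and behaviour inferred purely from the call sites (the real implementations may differ):

# Hypothetical sketch of the list-parameter helpers, inferred from the call
# sites above. sql_list_placeholder builds a ":name_0, :name_1, ..." string
# for use inside an IN (...) clause; sql_list_argument builds the matching
# bind-parameter dict.
def sql_list_placeholder(name, items):
    return ', '.join(':%s_%d' % (name, i) for i in range(len(items)))

def sql_list_argument(name, items):
    return {'%s_%d' % (name, i): item for i, item in enumerate(items)}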
Example #2
def process_database(species=None, commit=False):
    session = get_session()
    if species is None:
        taxa = [
            taxon_id for (taxon_id, ) in session.execute(
                "SELECT DISTINCT taxon_id FROM t1_sighting").fetchall()
        ]
    else:
        taxa = [
            taxon_id for (taxon_id, ) in session.execute(
                "SELECT DISTINCT taxon_id FROM t1_sighting, taxon WHERE taxon.id = taxon_id AND spno IN (%s)"
                % sql_list_placeholder('species', species),
                sql_list_argument('species', species)).fetchall()
        ]

    create_region_lookup_table(session)

    # Process in parallel
    tasks = [(taxon_id, commit) for taxon_id in taxa]

    log.info("Step 1/2: Monthly aggregation")

    for result, error in tqdm(run_parallel(aggregate_monthly, tasks),
                              total=len(tasks)):
        if error:
            print(error)

    log.info("Step 1/2: Yearly aggregation")

    for result, error in tqdm(run_parallel(aggregate_yearly, tasks),
                              total=len(tasks)):
        if error:
            print(error)

    cleanup_region_lookup_table(session)
Example #3
def process_database(species=None, commit=False):
    session = get_session()

    # Just get taxa that have range polygons
    if species is None:
        taxa = session.execute(
            "SELECT DISTINCT taxon_id FROM taxon_range").fetchall()
    else:
        taxa = session.execute(
            "SELECT DISTINCT taxon_id FROM taxon_range, taxon WHERE taxon_id = taxon.id AND spno IN (%s)"
            % sql_list_placeholder('species', species),
            sql_list_argument('species', species)).fetchall()

    # Unwrap tuple
    taxa = [taxon_id for (taxon_id, ) in taxa]

    # Delete old results
    if commit and len(taxa) > 0:
        # session.execute("DELETE FROM t2_ultrataxon_sighting WHERE taxon_id IN (:taxa)", { 'taxa': taxa })
        session.execute(
            "DELETE FROM t2_ultrataxon_sighting WHERE taxon_id IN (%s)" %
            sql_list_placeholder('taxa', taxa),
            sql_list_argument('taxa', taxa))
        session.commit()

    # Process in parallel
    tasks = [(taxon_id, commit) for taxon_id in taxa]

    # This is important because we are about to spawn child processes, and this stops them attempting to share the
    # same database connection pool
    session.close()
    for result, error in tqdm(run_parallel(process_taxon, tasks),
                              total=len(tasks)):
        if error:
            print(error)
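
Note: run_parallel is also undefined here; from the call sites it evidently takes a function and an iterable of tasks and yields (result, error) tuples as tasks complete. A minimal sketch under those assumptions, using concurrent.futures:

# Hypothetical sketch of run_parallel, inferred from its call sites. The real
# helper may differ in detail (e.g. it might unpack tuple tasks into arguments).
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed

def run_parallel(fn, tasks, use_processes=False):
    pool_cls = ProcessPoolExecutor if use_processes else ThreadPoolExecutor
    with pool_cls() as pool:
        futures = [pool.submit(fn, task) for task in tasks]
        for future in as_completed(futures):
            try:
                yield future.result(), None
            except Exception as e:
                yield None, e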
Example #4
def process_database(species=None, commit=False):
    """
    Calculates spatial representativeness using alpha hulls

    Generates alpha hulls from each source x taxon combination

    Intersects alpha hulls with range layers, and then calculates percentage of range covered
    """
    session = get_session()

    if commit:
        if species is None:
            session.execute("DELETE FROM taxon_source_alpha_hull")
        else:
            session.execute(
                """DELETE FROM taxon_source_alpha_hull
                WHERE taxon_id IN (SELECT id FROM taxon WHERE spno IN (%s))"""
                % sql_list_placeholder('species', species),
                sql_list_argument('species', species))
        session.commit()

    # Load coastal shapefile
    coastal_shape_filename = tsx.config.config.get("processing.alpha_hull",
                                                   "coastal_shp")
    with fiona.Env(OSR_WKT_FORMAT="WKT2_2018"), fiona.open(
            coastal_shape_filename, 'r') as coastal_shape:
        # Convert from fiona dictionary to shapely geometry and reproject
        shp_to_working_transformer = pyproj.Transformer.from_proj(
            pyproj.CRS.from_wkt(coastal_shape.crs_wkt),
            working_proj,
            always_xy=True)
        coastal_shape = reproject(shape(coastal_shape[0]['geometry']),
                                  shp_to_working_transformer)
        # Simplify coastal boundary - makes things run ~20X faster
        log.info("Simplifying coastal boundary")
        coastal_shape = coastal_shape.buffer(10000).simplify(10000)

    log.info("Generating alpha shapes")

    for data_type in (1, 2):
        log.info("Processing type %s data" % data_type)

        taxa = get_taxa(session, data_type, species)

        tasks = [(taxon_id, coastal_shape, data_type, commit)
                 for taxon_id in taxa]

        # This is important because we are about to spawn child processes, and this stops them attempting to share the
        # same database connection pool
        session.close()  # TODO: not sure if this is needed now

        # Process all the species in parallel
        for result, error in tqdm(run_parallel(process, tasks),
                                  total=len(tasks)):
            if error:
                print(error)
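
Note: reproject is called here with a pyproj.Transformer, so it is presumably a thin wrapper around shapely.ops.transform. A plausible sketch (an assumption, not the project's actual code):

# Hypothetical sketch of the transformer-based reproject helper used above.
from shapely.ops import transform

def reproject(geom, transformer):
    # Apply the coordinate transformation to every vertex of the geometry
    return transform(transformer.transform, geom)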
Example #5
def process_database(species = None, commit = False):
    """
    Generates alpha hulls from raw sighting data in the database

    Intersects alpha hulls with range layers, and inserts the result back into the database
    """
    session = get_session()

    if species is None:
        species = get_all_spno(session)

    if commit:
        for spno in species:
            session.execute("""
                DELETE FROM taxon_presence_alpha_hull
                WHERE taxon_id IN (SELECT id FROM taxon WHERE spno = :spno)
                """, { 'spno': spno })
            session.execute("""
                DELETE FROM taxon_presence_alpha_hull_subdiv
                WHERE taxon_id IN (SELECT id FROM taxon WHERE spno = :spno)
                """, { 'spno': spno })
        session.commit()

    db_proj = pyproj.Proj('+init=EPSG:4326') # Database always uses WGS84
    working_proj = pyproj.Proj('+init=EPSG:3112') # GDA94 / Geoscience Australia Lambert - so that we can buffer in metres

    # Load coastal shapefile
    coastal_shape_filename = tsx.config.config.get("processing.alpha_hull", "coastal_shp")
    with fiona.open(coastal_shape_filename, 'r') as coastal_shape:
        # Convert from fiona dictionary to shapely geometry and reproject
        coastal_shape = reproject(shape(coastal_shape[0]['geometry']), pyproj.Proj(coastal_shape.crs), working_proj)
        # Simplify coastal boundary - makes things run ~20X faster
        log.info("Simplifying coastal boundary")
        coastal_shape = coastal_shape.buffer(10000).simplify(10000)

    log.info("Generating alpha shapes")

    # Process a single species.
    # This gets run off the main thread.
    def process_spno(spno):
        session = get_session()
        try:
            # Get raw points from DB
            raw_points = get_species_points(session, spno)

            if len(raw_points) < 4:
                # Not enough points to create an alpha hull
                return

            # Read points from database
            points = [reproject(p, db_proj, working_proj) for p in raw_points]

            # Generate alpha shape
            alpha_shp = make_alpha_hull(
                points = points,
                coastal_shape = coastal_shape,
                thinning_distance = tsx.config.config.getfloat('processing.alpha_hull', 'thinning_distance'),
                alpha = tsx.config.config.getfloat('processing.alpha_hull', 'alpha'),
                hullbuffer_distance = tsx.config.config.getfloat('processing.alpha_hull', 'hullbuffer_distance'),
                isolatedbuffer_distance = tsx.config.config.getfloat('processing.alpha_hull', 'isolatedbuffer_distance'))

            # Convert back to DB projection
            alpha_shp = reproject(alpha_shp, working_proj, db_proj)

            # Clean up geometry
            alpha_shp = alpha_shp.buffer(0)

            # Get range polygons to intersect with alpha shape
            for taxon_id, range_id, breeding_range_id, geom_wkb in get_species_range_polygons(session, spno):
                # Intersect and insert into DB
                geom = shapely.wkb.loads(binascii.unhexlify(geom_wkb)).buffer(0)
                geom = to_multipolygon(geom.intersection(alpha_shp)) # slow
                if len(geom) > 0:
                    session.execute("""INSERT INTO taxon_presence_alpha_hull (taxon_id, range_id, breeding_range_id, geometry)
                        VALUES (:taxon_id, :range_id, :breeding_range_id, ST_GeomFromWKB(_BINARY :geom_wkb))""", {
                            'taxon_id': taxon_id,
                            'range_id': range_id,
                            'breeding_range_id': breeding_range_id,
                            'geom_wkb': shapely.wkb.dumps(geom)
                        }
                    )
                # We also subdivide the geometries into small pieces and insert this into the database. This allows for much faster
                # spatial queries in the database.
                for subgeom in subdivide_geometry(geom, max_points = 100):
                    session.execute("""INSERT INTO taxon_presence_alpha_hull_subdiv (taxon_id, range_id, breeding_range_id, geometry)
                        VALUES (:taxon_id, :range_id, :breeding_range_id, ST_GeomFromWKB(_BINARY :geom_wkb))""", {
                            'taxon_id': taxon_id,
                            'range_id': range_id,
                            'breeding_range_id': breeding_range_id,
                            'geom_wkb': shapely.wkb.dumps(subgeom)
                        }
                    )
            if commit:
                session.commit()

        except:
            log.exception("Exception processing alpha hull")
            raise
        finally:
            session.close()

    # This is important because we are about to spawn child processes, and this stops them attempting to share the
    # same database connection pool
    session.close()
    tsx.db.connect.engine.dispose()
    # Process all the species in parallel
    for result, error in tqdm(run_parallel(process_spno, species, use_processes = True), total = len(species)):
        if error:
            print(error)
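
Note: subdivide_geometry is assumed to break a (multi)polygon into pieces of at most max_points vertices each, which is what makes the _subdiv table fast to query spatially. One plausible sketch is a recursive bounding-box quad-split; the real implementation may differ:

# Hypothetical sketch of subdivide_geometry: recursively split the geometry
# along its bounding-box midlines until every piece is small enough.
from shapely.geometry import box

def count_points(geom):
    # Total number of vertices in a (multi)polygon
    if geom.geom_type == 'Polygon':
        return len(geom.exterior.coords) + sum(len(r.coords) for r in geom.interiors)
    return sum(count_points(g) for g in geom.geoms)

def subdivide_geometry(geom, max_points=100):
    if geom.is_empty:
        return
    if count_points(geom) <= max_points:
        yield geom
        return
    minx, miny, maxx, maxy = geom.bounds
    midx, midy = (minx + maxx) / 2, (miny + maxy) / 2
    quadrants = [box(minx, miny, midx, midy), box(midx, miny, maxx, midy),
                 box(minx, midy, midx, maxy), box(midx, midy, maxx, maxy)]
    for quadrant in quadrants:
        piece = geom.intersection(quadrant)
        if not piece.is_empty and piece.geom_type in ('Polygon', 'MultiPolygon'):
            yield from subdivide_geometry(piece, max_points)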
Example #6
def process_sites(session):
    run_sql(
        session, "Populate t2_survey_site",
        """INSERT INTO t2_survey_site (survey_id, site_id)
			SELECT t2_survey.id, t2_site.id
			FROM t2_survey, t2_site
			WHERE site_id = t2_site.id""")
    # Query OK, 14073 rows affected (0.68 sec)
    # Records: 14073  Duplicates: 0  Warnings: 0

    run_sql(
        session, "Populate t2_survey_site (spatial)",
        """INSERT INTO t2_survey_site
			SELECT t2_survey.id, t2_site.id
			FROM t2_site STRAIGHT_JOIN t2_survey USE INDEX (coords)
			WHERE site_id IS NULL
			AND t2_survey.search_type_id = t2_site.search_type_id
			AND ST_Intersects(geometry, coords)""")
    # Query OK, 278678 rows affected (15.77 sec)
    # Records: 278678  Duplicates: 0  Warnings: 0

    run_sql(
        session, "Populate standardised site surveys",
        """INSERT INTO t2_processed_survey (raw_survey_id, site_id, search_type_id, start_date_y, start_date_m, source_id, experimental_design_type_id)
			SELECT t2_survey.id, t2_survey_site.site_id, search_type_id, start_date_y, start_date_m, source_id, 1
			FROM t2_survey, t2_survey_site
			WHERE t2_survey.id = t2_survey_site.survey_id""")
    # Query OK, 292751 rows affected (5.00 sec)
    # Records: 292751  Duplicates: 0  Warnings: 0

    run_sql(
        session, "Populate presences / non-pseudo-absences",
        """INSERT INTO t2_processed_sighting (survey_id, taxon_id, count, unit_id, pseudo_absence)
			SELECT t2_processed_survey.id, t2_ultrataxon_sighting.taxon_id, count, unit_id, 0
			FROM t2_ultrataxon_sighting, t2_sighting, t2_processed_survey
			WHERE t2_ultrataxon_sighting.sighting_id = t2_sighting.id
			AND t2_sighting.survey_id = t2_processed_survey.raw_survey_id
			AND t2_processed_survey.experimental_design_type_id = 1""")
    # Query OK, 5775718 rows affected (2 min 35.55 sec)
    # Records: 5775718  Duplicates: 0  Warnings: 0

    log.info("Identify taxa for each site based on alpha hulls")

    taxa = [
        taxon_id for (taxon_id, ) in session.execute(
            "SELECT DISTINCT taxon_id FROM taxon_presence_alpha_hull_subdiv"
        ).fetchall()
    ]
    session.execute("""DROP TABLE IF EXISTS tmp_taxon_site""")
    # Note: for some reason CREATE TEMPORARY TABLE doesn't work as expected; the table seems to be empty by the
    # time we get to the next step. I'm guessing the transaction gets rolled back after some kind of timeout(?)
    session.execute("""CREATE TABLE tmp_taxon_site (
		site_id INT NOT NULL,
		taxon_id CHAR(6) NOT NULL,
		INDEX (taxon_id, site_id)
	)""")

    # The next step was originally a very slow query, directly populating the tmp_taxon_site table.
    # Instead, I've broken the query down to process one taxon at a time
    # Furthermore, we process taxa in parallel threads to fully utilise the CPU, and then the results of the query are
    # inserted in bulk on the main thread.
    # Time taken: 17 min 4 s

    for result, error in tqdm(run_parallel(get_taxon_sites, taxa),
                              total=len(taxa)):
        if error:
            print(error)
            continue
        # Perform bulk insert on main thread
        if len(result) > 0:
            insert_data = [{
                'site_id': site_id,
                'taxon_id': taxon_id
            } for site_id, taxon_id in result]
            session.execute(
                """INSERT INTO tmp_taxon_site (site_id, taxon_id) VALUES (:site_id, :taxon_id)""",
                insert_data)

    log.info("Insert pseudo absences")

    # This next step originally ran in about 15 minutes on my laptop as a single query, but took forever on the
    # server (I gave up after a couple of hours), so I've split it up by taxon, which is a bit slower overall but
    # at least you can see progress

    # Create temporary table so we aren't inserting and selecting from the same table
    session.execute(
        """CREATE TEMPORARY TABLE tmp_processed_sighting SELECT survey_id, taxon_id, count, unit_id, pseudo_absence FROM t2_processed_sighting WHERE FALSE"""
    )

    # Running this in parallel originally resulted in MySQL deadlock errors, but then we were inserting and selecting from the same table.
    # It might be worth trying a parallel approach again.
    for taxon_id in tqdm(taxa):
        # This query is a bit tricky. We do a left join to find taxa that are not present for a survey, and keep rows where t2_processed_sighting.id IS NULL to generate the pseudo-absences
        session.execute(
            """INSERT INTO tmp_processed_sighting (survey_id, taxon_id, count, unit_id, pseudo_absence)
				SELECT t2_processed_survey.id, tmp_taxon_site.taxon_id, 0, 2, 1
				FROM t2_survey
				INNER JOIN t2_survey_site ON t2_survey.id = t2_survey_site.survey_id
				INNER JOIN t2_processed_survey ON t2_survey.id = t2_processed_survey.raw_survey_id AND t2_processed_survey.experimental_design_type_id = 1
				INNER JOIN tmp_taxon_site ON t2_survey_site.site_id = tmp_taxon_site.site_id AND taxon_id = :taxon_id
				LEFT JOIN t2_processed_sighting ON t2_processed_sighting.survey_id = t2_processed_survey.id AND t2_processed_sighting.taxon_id = tmp_taxon_site.taxon_id
				WHERE t2_processed_sighting.id IS NULL""", {'taxon_id': taxon_id})

    session.execute(
        """INSERT INTO t2_processed_sighting(survey_id, taxon_id, count, unit_id, pseudo_absence) SELECT survey_id, taxon_id, count, unit_id, pseudo_absence FROM tmp_processed_sighting"""
    )

    session.execute("""DROP TABLE tmp_taxon_site""")
Example #7
def process_database(species = None, commit = False):
    """
    Calculates spatial representativeness using alpha hulls

    Generates alpha hulls from each source x taxon combination

    Intersects alpha hulls with range layers, and then calculates percentage of range covered
    """
    session = get_session()

    if commit:
        if species is None:
            session.execute("DELETE FROM taxon_source_alpha_hull")
        else:
            session.execute("""DELETE FROM taxon_source_alpha_hull
                WHERE taxon_id IN (SELECT id FROM taxon WHERE spno IN (%s))""" % sql_list_placeholder('species', species),
                sql_list_argument('species', species))
        session.commit()

    db_proj = pyproj.Proj('+init=EPSG:4326') # Database always uses WGS84
    working_proj = pyproj.Proj('+init=EPSG:3112') # GDA94 / Geoscience Australia Lambert - so that we can buffer in metres

    # Load coastal shapefile
    coastal_shape_filename = tsx.config.config.get("processing.alpha_hull", "coastal_shp")
    with fiona.open(coastal_shape_filename, 'r') as coastal_shape:
        # Convert from fiona dictionary to shapely geometry and reproject
        coastal_shape = reproject(shape(coastal_shape[0]['geometry']), pyproj.Proj(coastal_shape.crs), working_proj)
        # Simplify coastal boundary - makes things run ~20X faster
        log.info("Simplifying coastal boundary")
        coastal_shape = coastal_shape.buffer(10000).simplify(10000)

    log.info("Generating alpha shapes")

    for data_type in (1, 2):
        log.info("Processing type %s data" % data_type)

        # Process a single species.
        # This gets run off the main thread.
        def process(taxon_id):
            session = get_session()

            try:
                # Load core range geometry
                core_range_geom = reproject(get_core_range_geometry(session, taxon_id), db_proj, working_proj).buffer(0).intersection(coastal_shape)

                for source_id in get_source_ids(session, data_type, taxon_id):

                    log.info("Processing taxon_id: %s, source_id: %s" % (taxon_id, source_id))

                    # Get raw points from DB
                    raw_points = get_raw_points(session, data_type, taxon_id, source_id)

                    empty = len(raw_points) < 4

                    if empty:
                        log.info("Taxon %s: not enough points to create alpha hull (%s)" % (taxon_id, len(raw_points)))

                    if not empty:
                        # Read points from database
                        points = [reproject(p, db_proj, working_proj) for p in raw_points]

                        # Generate alpha shape
                        alpha_shp = make_alpha_hull(
                            points = points,
                            coastal_shape = None,
                            thinning_distance = tsx.config.config.getfloat('processing.alpha_hull', 'thinning_distance'),
                            alpha = tsx.config.config.getfloat('processing.alpha_hull', 'alpha'),
                            hullbuffer_distance = tsx.config.config.getfloat('processing.alpha_hull', 'hullbuffer_distance'),
                            isolatedbuffer_distance = tsx.config.config.getfloat('processing.alpha_hull', 'isolatedbuffer_distance'))

                        # Clean up geometry
                        alpha_shp = alpha_shp.buffer(0)

                        if core_range_geom.area == 0:
                            log.info("Core range geometry area is zero")
                            empty = True

                        else:
                            # Intersect alpha hull with core range
                            intersected_alpha = to_multipolygon(core_range_geom.intersection(alpha_shp))

                            empty = intersected_alpha.is_empty

                    if empty:
                        session.execute("""INSERT INTO taxon_source_alpha_hull (source_id, taxon_id, data_type, core_range_area_in_m2, alpha_hull_area_in_m2)
                            VALUES (:source_id, :taxon_id, :data_type, 0, 0)""", {
                                'source_id': source_id,
                                'taxon_id': taxon_id,
                                'data_type': data_type
                            })
                    else:
                        session.execute("""INSERT INTO taxon_source_alpha_hull (source_id, taxon_id, data_type, geometry, core_range_area_in_m2, alpha_hull_area_in_m2)
                            VALUES (:source_id, :taxon_id, :data_type, ST_GeomFromWKB(_BINARY :geom_wkb), :core_range_area, :alpha_hull_area)""", {
                                'source_id': source_id,
                                'taxon_id': taxon_id,
                                'data_type': data_type,
                                'geom_wkb': shapely.wkb.dumps(reproject(intersected_alpha, working_proj, db_proj)),
                                'core_range_area': core_range_geom.area,
                                'alpha_hull_area': intersected_alpha.area
                            })

                    if commit:
                        session.commit()

            except:
                log.exception("Exception processing alpha hull")
                raise
            finally:
                session.close()

        taxa = get_taxa(session, data_type, species)

        # This is important because we are about to spawn child processes, and this stops them attempting to share the
        # same database connection pool
        session.close()
        tsx.db.connect.engine.dispose()
        # Process all the species in parallel
        for result, error in tqdm(run_parallel(process, taxa, use_processes = True), total = len(taxa)):
            if error:
                print(error)
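
Note: the session.close() / engine.dispose() step right before forking matters because child processes inherit the parent's open database sockets; if pooled connections are not discarded first, two processes can end up talking over the same connection. A self-contained illustration of the pattern (SQLite stands in for the MySQL engine used by these examples):

# Minimal illustration of disposing a connection pool before forking workers.
import multiprocessing
from sqlalchemy import create_engine, text

engine = create_engine("sqlite:///example.db")

def worker(n):
    # Each child opens a fresh connection rather than reusing one inherited
    # from the parent's pool
    with engine.connect() as conn:
        return conn.execute(text("SELECT :n"), {"n": n}).scalar()

if __name__ == "__main__":
    engine.dispose()  # close pooled connections so forked children can't share them
    with multiprocessing.Pool(4) as pool:
        print(pool.map(worker, range(4)))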