@classmethod
def disable_qc_index(cls):
    """ Disable the non-clustered index on QC.
        Important - this depends on the index name being kept the same in the
        variable sql_disable_index below """
    # Check DB connection...
    if not mssql.conn:
        mssql.open_db()

    if mssql.conn is None:
        # Should find a way to exit and advise (raising an exception, maybe)
        return "Fail, no connection"
    else:
        try:
            sql_disable_index = "ALTER INDEX IX_eurobis_qc ON dbo.eurobis DISABLE;"
            cursor = mssql.conn.cursor()
            cursor.execute(sql_disable_index)
            mssql.conn.commit()
            this.logger.debug("Non-clustered index on qc disabled")
            mssql.close_db()
            return "Success"
        except Error as e:
            mssql.close_db()
            return f"Failed to disable non-clustered index: {e}"
def get_provider_data(self, das_prov_id):
    """ Obtains the dataset info from the dataproviders table and stores it
        into the instance fields
        :param das_prov_id: Dataset provider id """
    if not mssql.conn:
        mssql.open_db()

    if mssql.conn is None:
        # Should find a way to exit and advise
        pass
    else:
        # Can start querying the DB using the dataset provider ID
        sql_str = f"select * from dataproviders d where id = {das_prov_id};"

        # Fetch the provider line
        cur = mssql.conn.cursor()
        cur.execute(sql_str)

        # Build a dictionary from the column names and the fetched row
        provider_fields = [description[0] for description in cur.description]
        provider = cur.fetchone()
        self.provider_record = dict(zip(provider_fields, provider))
        self.imis_das_id = self.provider_record['IMIS_DasID']
        self.dataprovider_id = self.provider_record['id']  # Of course we have this already
        self.darwin_core_type = self.provider_record['core']
        self.dataset_name = self.provider_record['name']
        # Should see if more is needed
        mssql.close_db()
def get_ev_occ_records(self, das_prov_id):
    """ Retrieves event and occurrence records for the dataset in das_prov_id from
        SQL Server, also building search indexes to easily retrieve dependent records
        :param das_prov_id: Dataset provider id """
    if not mssql.conn:
        mssql.open_db()

    if mssql.conn is None:
        # Should find a way to exit and advise
        pass
    else:
        # Get records
        sql_string = self.query_builder_eve_occur(das_prov_id)
        cursor = mssql.conn.cursor()
        cursor.execute(sql_string)

        columns = [column[0] for column in cursor.description]
        records = []
        for row in cursor:
            records.append(dict(zip(columns, row)))

        # We proceed differently in accordance with the type of CORE record...
        if self.darwin_core_type == self.EVENT:
            for record in records:
                if record['DarwinCoreType'] == self.OCCURRENCE:
                    self.occurrence_recs.append(record)
                    if record['eventID'] is not None:  # To link we need an eventID
                        # Make a key - the occurrence must be linked by the eventID
                        key = f"{record['dataprovider_id']}_{record['eventID']}"
                        if key in self.occ_indices:
                            # All the records with this key shall be in a list at 'key'
                            self.occ_indices[key].append(record)
                        else:
                            self.occ_indices[key] = [record]
                elif record['DarwinCoreType'] == self.EVENT:
                    self.event_recs.append(record)
                    # If the core type is EVENT, then eventID is the key - only used for reverse lookup
                    key = f"{record['dataprovider_id']}_{record['eventID']}"
                    self.event_indices[key] = [record]
        else:
            # Occurrence records (datasets with core type OCCURRENCE do not have events - verified)
            for record in records:
                if record['DarwinCoreType'] == self.OCCURRENCE:
                    self.occurrence_recs.append(record)
                else:
                    # Should really never happen!
                    self.event_recs.append(record)

        mssql.close_db()
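# Illustrative sketch, not part of the original module: one way the occ_indices built
# by get_ev_occ_records could be used to fetch the occurrences that hang off an event
# record. The helper name and the `ds` / `event_record` arguments are assumptions for
# illustration; the key format mirrors the one built above.
def example_occurrences_for_event(ds, event_record):
    """ Return the occurrence records linked to an event record ([] if none). """
    key = f"{event_record['dataprovider_id']}_{event_record['eventID']}"
    return ds.occ_indices.get(key, [])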
def get_emof_records(self, das_prov_id):
    """ Retrieves measurementorfact records for the dataset in das_prov_id from
        SQL Server, also building search indexes to easily retrieve dependent records.
        NOTE: eMoF records do not exist for records that have eventID and occurrenceID NULL
        :param das_prov_id: Dataset provider id """
    if not mssql.conn:
        mssql.open_db()

    if mssql.conn is None:
        # Should find a way to exit and advise
        pass
    else:
        # Get records
        sql_string = self.query_builder_emof(das_prov_id)
        cursor = mssql.conn.cursor()
        cursor.execute(sql_string)

        columns = [column[0] for column in cursor.description]
        records = []
        for row in cursor:
            records.append(dict(zip(columns, row)))

        for record in records:
            self.emof_recs.append(record)
            if self.darwin_core_type == self.EVENT:
                key = f"{record['dataprovider_id']}_{'NULL' if record['eventID'] is None else record['eventID']}_" \
                      f"{'NULL' if record['occurrenceID'] is None else record['occurrenceID']}"
            else:
                # Occurrence records in datasets with core = OCCURRENCE may have other
                # info in eventID and NULL in the eMoF record.
                key = f"{record['dataprovider_id']}_NULL_" \
                      f"{'NULL' if record['occurrenceID'] is None else record['occurrenceID']}"

            if key in self.emof_indices:
                # All the records with this key shall be in a list at 'key'
                self.emof_indices[key].append(record)
            else:
                self.emof_indices[key] = [record]

        mssql.close_db()
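# Illustrative sketch, not part of the original module: assembling the emof_indices key
# to fetch the eMoF records attached to an event or occurrence record. `ds` is an
# assumed EurobisDataset instance; the 'NULL' placeholders mirror get_emof_records.
def example_emof_for_record(ds, record):
    """ Return the eMoF records for a record ([] if none). """
    if ds.darwin_core_type == ds.EVENT:
        event_part = 'NULL' if record['eventID'] is None else record['eventID']
    else:
        # Occurrence-core datasets always use NULL for the event part of the key
        event_part = 'NULL'
    occ_part = 'NULL' if record['occurrenceID'] is None else record['occurrenceID']
    key = f"{record['dataprovider_id']}_{event_part}_{occ_part}"
    return ds.emof_indices.get(key, [])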
@classmethod
def rebuild_qc_index(cls):
    """ Rebuild the non-clustered index on QC.
        Important - this depends on the index name as per disable_qc_index """
    # Check DB connection...
    if not mssql.conn:
        mssql.open_db()

    if mssql.conn is None:
        # Should find a way to exit and advise (raising an exception, maybe)
        return "Fail, no connection"
    else:
        try:
            sql_rebuild_index = "ALTER INDEX IX_eurobis_qc ON dbo.eurobis REBUILD;"
            cursor = mssql.conn.cursor()
            cursor.execute(sql_rebuild_index)
            mssql.conn.commit()
            this.logger.debug("Non-clustered index on qc rebuilt")
            mssql.close_db()
            return "Success"
        except Error as e:
            mssql.close_db()
            return f"Failed to rebuild non-clustered index: {e}"
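# Illustrative sketch, not part of the original module: the intended pairing of
# disable_qc_index / rebuild_qc_index around a bulk QC update, per the NOTE in
# update_record_qc. `run_all_updates` is an assumed callable standing in for the
# batched update loop.
def example_bulk_update_with_index_toggle(run_all_updates):
    EurobisDataset.disable_qc_index()
    try:
        run_all_updates()
    finally:
        # Rebuild even if an update fails, so the index is never left disabled
        EurobisDataset.rebuild_qc_index()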
def process_dataset_list(pool_no, dataset_id_list, from_pool=False, with_logging=False):
    """ Processes a list of DwCA archives, ideally to be called in parallel
        :param pool_no: Pool number to keep track of the pools
        :param dataset_id_list: The list of datasets to be processed
        :param from_pool: If True (called from a multiprocessing pool), disabling and
                          rebuilding of the QC index is left to the caller
        :param with_logging: Logging enabled or not """
    # Log pool start
    start = time.time()
    if with_logging:
        this.logger.info(f"Pool {pool_no} started")

    # Connect to the database; each pool should have its own connection
    conn = None
    mssql.close_db()  # Make sure the DB connection is ours

    if not mssql.conn:
        conn = mssql.open_db()

    if conn is None:
        # Should find a way to exit and advise
        this.logger.error("No connection to DB, nothing can be done!")
        # Report every dataset in this pool as failed, since none could be processed
        return pool_no, len(dataset_id_list)

    # Disable index on QC once
    if not from_pool:
        eurobis_dataset.EurobisDataset.disable_qc_index()

    errors = 0
    for dataset_id in dataset_id_list:
        start_file = time.time()
        if with_logging:
            this.logger.info(f"Pool Number: {pool_no}, processing dataset {dataset_id}")

        try:
            dataset_qc_labeling(dataset_id, False, with_logging, pool_no)
        except Exception:
            errors += 1
            this.logger.error(traceback.format_exc())
            this.logger.warning(f"WARNING: Pool Number: {pool_no}, processing dataset {dataset_id} FAILED")

        if with_logging:
            this.logger.info(f"Processed dataset {dataset_id} in {time.time() - start_file}")

    # Rebuild index on QC once
    if not from_pool:
        eurobis_dataset.EurobisDataset.rebuild_qc_index()

    if with_logging:
        this.logger.info(f"Pool {pool_no} completed in {time.time() - start}")

    return pool_no, errors
@classmethod
def update_record_qc(cls, records, batch_update_count, batch_size, ds_id, record_type):
    """ Updates a batch of records from a dataset; the update queries are built
        record by record and sent to the DB in batches for execution
        :param records: A list of the records being updated
        :param batch_update_count: The dataset's batch number
        :param batch_size: Here only used to report the status of the update
        :param ds_id: Dataset being processed
        :param record_type: For query optimization, occurrence and event records may be
                            treated differently; also used for printing
        NOTE: Special methods are provided to DISABLE the index on QC and REBUILD it
        after the updates. These vastly improve the query run time. """
    # Check DB connection...
    if not mssql.conn:
        mssql.open_db()

    if mssql.conn is None:
        # Should find a way to exit and advise (raising an exception, maybe)
        return "Fail, no connection"
    else:
        record_count = len(records)

        # sql_update = f"BEGIN TRAN; \n"
        sql_update = ""
        for record in records:
            # Compose the update query
            # physloc = bytes.hex(record['physloc'])
            # Note: the fields other than physloc and dataprovider_id were used to optimize
            # the execution plans of the update queries and thus to reduce browsing the
            # records, using the existing indexes on the eurobis table. Observed speed
            # improvements were between 2.5 and 5 times.

            # Temporary fix - some qc values are set to None
            if record['qc'] is None:
                record['qc'] = 0

            sql_update += f"{cls.sql_update_start}{record['qc']}{cls.sql_update_middle} {record['dataprovider_id']}"

            """ Disabled - using auto_id now.
            if record['decimalLatitude'] is not None:
                sql_update = f"{sql_update}{cls.sql_if_lat}{record['decimalLatitude']}"
            else:
                sql_update = f"{sql_update} AND Latitude IS NULL "

            if record['decimalLongitude'] is not None:
                sql_update = f"{sql_update} {cls.sql_if_lon}{record['decimalLongitude']}"
            else:
                sql_update = f"{sql_update} AND Longitude IS NULL "

            if record_type == EurobisDataset.EVENT:
                if record['eventID'] is not None and misc.is_clean_for_sql(record['eventID']):
                    sql_update = f"{sql_update} {cls.sql_if_event_id}'{record['eventID']}'"
            """

            # sql_update = f"{sql_update} {cls.sql_update_end} 0x{physloc} \n"
            sql_update = f"{sql_update} {cls.sql_update_end} {record['auto_id']} \n"

        try:
            # sql_update += f"COMMIT TRAN;\n"
            cursor = mssql.conn.cursor()
            cursor.execute(sql_update)
            mssql.conn.commit()
            rec_type = "EVENT" if record_type == EurobisDataset.EVENT else "OCCURRENCE"
            date_time_obj = datetime.now()
            this.logger.debug(
                f"{date_time_obj}: {rec_type} records update count: "
                f"{batch_update_count * batch_size + record_count} of dataset {ds_id};")
            batch_update_count += 1
            return "Success"
        except Error as e:
            return f"Fail, batch {batch_update_count} not updated, exception {str(e)}"
        finally:
            # Close the DB to make sure that transactions are "separated"
            mssql.close_db()
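# Illustrative sketch, not part of the original module: feeding records to
# update_record_qc in fixed-size batches, consistent with the batch_update_count /
# batch_size progress reporting above. `all_records`, `ds_id` and `record_type`
# are assumed inputs.
def example_update_in_batches(all_records, ds_id, record_type, batch_size=500):
    batch_update_count = 0
    for start in range(0, len(all_records), batch_size):
        batch = all_records[start:start + batch_size]
        result = EurobisDataset.update_record_qc(batch, batch_update_count, batch_size, ds_id, record_type)
        if result != "Success":
            this.logger.warning(result)
        batch_update_count += 1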
def do_db_multi_random_percent(percent):
    """ Example of processing multiple datasets at the same time in order to exploit
        the computing resources (cores); performs a random selection of percent% of
        the available datasets having at most 4000 records """
    start_time = time.time()

    # Now set to a percent of datasets...
    # sql_random_percent_of_datasets = f"SELECT id FROM dataproviders WHERE {percent} >= CAST(CHECKSUM(NEWID(), id) " \
    #                                  "& 0x7fffffff AS float) / CAST (0x7fffffff AS int)"

    # This selects percent% from SMALL datasets (at most 4000 events/occurrences)
    sql_random_percent_of_datasets = f"select a.id, a.displayname, a.rec_count from " \
                                     f"(select d.id, d.displayname, rec_count from dataproviders d " \
                                     f"inner join eurobis e on e.dataprovider_id = d.id " \
                                     f"where rec_count <= 4000 group by d.id, d.displayname, rec_count) a " \
                                     f"where {percent} >= CAST(CHECKSUM(NEWID(), id) & 0x7fffffff AS float) " \
                                     f"/ CAST (0x7fffffff AS int) order by id "

    dataset_ids = []
    dataset_names = []

    # Dedicate to the task the total number of processors minus the reserved ones
    # (3 when SQL Server runs locally, since mssql needs at least 2 cores, otherwise 1),
    # falling back to a single process on small machines.
    reserve_cpus = 1 + (0 if not mssql.server_local else 2)
    if mp.cpu_count() > reserve_cpus:
        n_cpus = mp.cpu_count() - reserve_cpus
    else:
        n_cpus = 1

    pool = mp.Pool(n_cpus)

    # Connect to the database to get the dataset list
    if not mssql.conn:
        mssql.open_db()

    if mssql.conn is None:
        # Should find a way to exit and advise
        this.logger.error("No connection to DB, nothing can be done!")
        exit(1)
    else:
        # Fetch a random set of datasets
        cur = mssql.conn.cursor()
        cur.execute(sql_random_percent_of_datasets)

        for row in cur:
            dataset_ids.append(row[0])
            # Tuples of (id, name, size)
            dataset_names.append((row[0], row[1], row[2]))

        mssql.close_db()

    # Retrieved the list, now split it across the workers
    dataset_id_lists = misc.split_list(dataset_ids, n_cpus)
    dataset_names_lists = misc.split_list(dataset_names, n_cpus)

    result_pool = []
    for i, dataset_id_list in enumerate(dataset_id_lists):
        this.logger.info(f"Pool {i} will process {dataset_names_lists[i]}")
        result_pool.append(pool.apply_async(process_dataset_list, args=(i, dataset_id_list, True, True)))

    for r in result_pool:
        res = r.get()
        if res[1] > 0:
            this.logger.warning(f"Pool {res[0]} failed to process {res[1]} datasets")

    pool.terminate()
    pool.join()
    this.logger.info(f"Started at: {start_time}. All processes have completed after {time.time() - start_time}")
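# Illustrative sketch (assumptions: this module can be run as a script and logging is
# configured elsewhere in the package): process a random 2% sample of the small datasets.
if __name__ == "__main__":
    do_db_multi_random_percent(2)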