def run(self):
    # Running this task doesn't delete anything from CKAN itself - so require the --force flag to run it
    if not self.force:
        raise Exception('Warning: this class does not delete CKAN records. Use --force to run it.')

    # Build a dict of all modules and collections
    # We then retrieve the appropriate collection from the record's module name (AudTable)
    # Exclude the MongoDeleteTask though
    collections = {cls.module: cls(None).get_collection() for cls in MongoTask.__subclasses__()}

    ke_data = KEParser(self.input().open('r'), file_path=self.input().path, schema_file=self.keemu_schema_file)

    for record in self.iterate_data(ke_data):
        module = record.get('AudTable')
        irn = record.get('AudKey')
        try:
            collection = collections[module]
        except KeyError:
            log.debug('Skipping eaudit record for %s' % module)
            # We do not have a collection for this module - skip to the next record
            continue
        else:
            log.info('Deleting record %s(%s)' % (module, irn))
            self.delete(collection, irn)

    self.mark_complete()
def bulk_update(self, ke_data):
    bulk = self.collection.initialize_unordered_bulk_op()
    count = 0

    for record in self.iterate_data(ke_data):
        # Find and replace doc - inserting if it doesn't exist
        bulk.find({'_id': record['_id']}).upsert().replace_one(record)
        count += 1

        # Bulk ops can hit out-of-memory errors (seen at ~400,000+ queued ops)
        # So execute the bulk op in stages, whenever bulk_op_size is reached
        if count % self.bulk_op_size == 0:
            log.info('Executing bulk op')
            bulk.execute()
            bulk = self.collection.initialize_unordered_bulk_op()

    try:
        bulk.execute()
    except InvalidOperation:
        # If there are no remaining records to execute, ignore the error -
        # they have already been executed in the staged loop above
        pass
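# A minimal sketch (not part of the original codebase) of the same staged bulk-upsert
# pattern using bulk_write(), which replaces initialize_unordered_bulk_op() in
# pymongo >= 3.x. The function name and the bulk_op_size default are assumptions.
from pymongo import ReplaceOne


def staged_bulk_upsert(collection, records, bulk_op_size=100000):
    ops = []
    for record in records:
        # Replace the doc, inserting it if it doesn't exist
        ops.append(ReplaceOne({'_id': record['_id']}, record, upsert=True))
        # Flush in stages so we never queue an unbounded number of operations
        if len(ops) >= bulk_op_size:
            collection.bulk_write(ops, ordered=False)
            ops = []
    # Flush any remaining operations
    if ops:
        collection.bulk_write(ops, ordered=False)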
def main():
    update_markers = mongo_get_update_markers()

    # Make sure the updates have all mongo classes
    bulk_tasks = [
        MongoCollectionIndexTask,
        MongoCollectionEventTask,
        MongoCatalogueTask,
        MongoTaxonomyTask,
        # MongoMultimediaTask,
        MongoSiteTask,
        UnpublishTask,
        MongoDeleteTask
    ]

    def _get_task_names(tasks):
        """
        We need to instantiate the tasks and get the family name, not just the class name
        MongoDeleteTask => DeleteTask
        @param tasks: list of task classes
        @return: list of task family names
        """
        return [unicode(task(date=0).task_family) for task in tasks]

    full_export_date = int(config.get('keemu', 'full_export_date'))

    for date, update_marker in update_markers.iteritems():

        # If this is the full export date, MongoDeleteTask is not required
        if full_export_date and date == full_export_date:
            bulk_task_copy = list(bulk_tasks)
            bulk_task_copy.remove(MongoDeleteTask)
            bulk_task_names = _get_task_names(bulk_task_copy)
        else:
            bulk_task_names = _get_task_names(bulk_tasks)

        # Assert that for every date we have all the bulk tasks
        missing_tasks = list(set(bulk_task_names) - set(update_marker))
        assert missing_tasks == [], 'There are missing mongo tasks for date %s: %s' % (date, missing_tasks)

    # Get a list of all export files to process
    export_dates = [d for d in get_export_file_dates() if d not in update_markers.keys()]

    # Run setup_interface_logging to ensure luigi logging is configured
    setup_interface_logging()

    sch = scheduler.CentralPlannerScheduler()
    w = BulkWorker(scheduler=sch)

    for export_date in export_dates:
        log.info('Processing date %s', export_date)
        # We only need to add the mongo delete task, as all other tasks are requirements of it
        # NB: This doesn't delete anything from CKAN - if that's needed, change this to DeleteTask
        w.add(MongoDeleteTask(date=export_date, force=True))

    w.run()
    w.stop()
def run(self):
    mongo_db = mongo_client_db()
    collection = MongoCatalogueTask(date=None).collection_name
    cites_species = get_cites_species()

    # Set cites=true flag
    update_result = mongo_db[collection].update({'DarScientificName': {'$in': cites_species}}, {'$set': {'cites': True}}, multi=True)

    log.info('Updated %s catalogue records as CITES', update_result['nModified'])
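# A hedged sketch of the same CITES flagging in pymongo >= 3.x, where update(multi=True)
# is superseded by update_many(). Field names mirror the task above; the function name is
# an assumption.
def flag_cites_records(mongo_db, collection, cites_species):
    result = mongo_db[collection].update_many(
        {'DarScientificName': {'$in': cites_species}},
        {'$set': {'cites': True}}
    )
    # Return the number of modified catalogue records
    return result.modified_count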
def on_success(self):
    """
    On completion, add indexes
    @return: None
    """
    self.collection = self.get_collection()
    log.info("Adding exportFileDate index")
    self.collection.ensure_index('exportFileDate')
def write(self, df):

    log.info("Saving records to CKAN resource %s", self.resource_id)

    # Convert all empty/null values to None - so they will be NULL values in postgres
    # Ensure any float fields with value 0.0 are actually None
    for col, np_type in self.columns.iteritems():
        if np_type.startswith('float'):
            df[col][df[col] == 0.0] = None
        else:
            # BUGFIX: Multimedia fields are being populated with an empty string rather than NULL
            df[col][df[col].astype(str) == ''] = None

    # Loop through all the dataframe columns, removing internal ones (fields starting with _)
    for col in df:
        if col.startswith('_'):
            df.drop(col, axis=1, inplace=True)

    # Convert all NaN to None
    df = df.where(pd.notnull(df), None)

    # Convert records to dictionaries
    records = df.to_dict(outtype='records')

    datastore_params = {
        'resource_id': self.resource_id,
        'records': records,
        'force': True
        # 'primary_key': '_id'
    }

    # Check that the data doesn't contain invalid chars
    try:
        json.dumps(datastore_params).encode('ascii')
    except UnicodeDecodeError:
        # At least one of the records contains invalid chars
        # Loop through, validating each of the records
        validated_records = []

        for i, record in enumerate(datastore_params['records']):
            try:
                json.dumps(record).encode('ascii')
            except UnicodeDecodeError:
                log.critical('Error encoding record: %s', ' '.join(['%s=%s' % (field, value) for field, value in record.iteritems() if value]))
            else:
                validated_records.append(record)

        datastore_params['records'] = validated_records

    self.remote_ckan.action.datastore_upsert(**datastore_params)
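# A hedged usage sketch of the datastore_upsert call above, driven directly through
# ckanapi. The URL, API key, resource id and record fields are placeholders, not values
# from this codebase.
import ckanapi

remote_ckan = ckanapi.RemoteCKAN('http://10.11.12.13:5000', apikey='CKAN-API-KEY')
remote_ckan.action.datastore_upsert(
    resource_id='aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee',
    records=[{'Catalogue number': '123456', 'Scientific name': 'Panthera leo'}],
    force=True
)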
def ckan_delete(remote_ckan, mongo_record):

    # To avoid circular imports, import the tasks we need to check here
    # Dataset tasks are dependent on the DeleteTask
    from ke2mongo.tasks.indexlot import IndexLotDatasetAPITask
    from ke2mongo.tasks.artefact import ArtefactDatasetAPITask
    from ke2mongo.tasks.specimen import SpecimenDatasetAPITask

    # By default, use SpecimenDatasetAPITask
    task_cls = SpecimenDatasetAPITask

    # Override the default class if the record is an Index Lot or Artefact
    for t in [IndexLotDatasetAPITask, ArtefactDatasetAPITask]:
        if t.record_type == mongo_record['ColRecordType']:
            task_cls = t
            break

    # Get the primary key column
    for col in task_cls.columns:
        if col[1] == task_cls.datastore['primary_key']:
            primary_key_field = col
            break

    # Get the source primary key - this needs to be split on '.' as we have added the collection name
    ke_primary_key = primary_key_field[0].split('.')[1]
    # The name of the primary key field used in CKAN
    ckan_primary_key = primary_key_field[1]

    try:
        primary_key_value = mongo_record[ke_primary_key]
    except KeyError:
        log.error('No value for primary key %s', ke_primary_key)
    else:
        resource_id = get_resource_id(remote_ckan, task_cls.package['name'])
        if resource_id:
            try:
                # And delete the record from the datastore
                log.info('Deleting record from CKAN where %s=%s' % (ckan_primary_key, primary_key_value))
                remote_ckan.action.datastore_delete(id=resource_id, filters={ckan_primary_key: primary_key_value}, force=True)
            except ckanapi.CKANAPIError:
                # We don't care if the record isn't found
                log.error('Record not found')
        else:
            log.error('No resource ID')
def delete(self, collection, irn):
    # If this is an ecatalogue record, try and delete it from CKAN
    if collection.name == 'ecatalogue':
        # Load the record from mongo
        mongo_record = collection.find_one({'_id': int(irn)})
        if mongo_record:
            ckan_delete(self.remote_ckan, mongo_record)
        else:
            log.info('Record %s does not exist. Skipping delete.' % irn)

    # And call the MongoDeleteTask delete() method to remove the record from mongodb
    super(DeleteAPITask, self).delete(collection, irn)
def get_or_create_resource(self):
    """
    Load the resource if it exists;
    otherwise create the dataset package and datastore
    Uses self.package (params to create the package) and self.datastore (params to create the datastore)
    @return: CKAN resource ID
    """
    resource_id = None

    try:
        # If the package exists, retrieve the resource
        ckan_package = self.remote_ckan.action.package_show(id=self.package['name'])

        # Does a resource of the same name already exist for this dataset?
        # If it does, assign it to resource_id
        for resource in ckan_package['resources']:
            if resource['name'] == self.datastore['resource']['name']:
                self.validate_resource(resource)
                resource_id = resource['id']

    except ckanapi.NotFound:
        log.info("Package %s not found - creating", self.package['name'])

        # Create the package
        ckan_package = self.remote_ckan.action.package_create(**self.package)

    # If we don't have the resource ID, create the datastore resource
    if not resource_id:

        log.info("Resource %s not found - creating", self.datastore['resource']['name'])

        self.datastore['fields'] = [{'id': col, 'type': self.numpy_to_ckan_type(np_type)} for col, np_type in self.get_output_columns().iteritems()]
        self.datastore['resource']['package_id'] = ckan_package['id']

        if self.indexed_fields:
            # Create BTREE indexes for all specified indexed fields
            self.datastore['indexes'] = [col['id'] for col in self.datastore['fields'] if col['id'] in self.indexed_fields]
        else:
            # Create BTREE indexes for all citext fields
            self.datastore['indexes'] = [col['id'] for col in self.datastore['fields'] if col['type'] == 'citext']

        # API call to create the datastore
        resource_id = self.remote_ckan.action.datastore_create(**self.datastore)['resource_id']

        # If this has geospatial fields, create geom columns
        if self.geospatial_fields:
            log.info("Creating geometry columns for %s", resource_id)
            self.geospatial_fields['resource_id'] = resource_id
            self.remote_ckan.action.create_geom_columns(**self.geospatial_fields)

        log.info("Created datastore resource %s", resource_id)

    return resource_id
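# A hedged sketch of what the numpy_to_ckan_type() mapping used above might look like.
# The real mapping in this codebase isn't shown here; the citext default matches the
# index heuristic above, but the other CKAN/postgres type names are assumptions.
def numpy_to_ckan_type(np_type):
    if np_type.startswith('int'):
        return 'integer'
    elif np_type.startswith('float'):
        return 'float8'
    elif np_type == 'bool':
        return 'bool'
    # Default to case-insensitive text
    return 'citext'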
def on_success(self):
    """
    On completion, add indexes
    @return: None
    """

    self.collection = self.get_collection()
    log.info("Adding ecatalogue indexes")
    self.collection.ensure_index('ColRecordType')
    # Only include active records - not stubs etc.
    self.collection.ensure_index('SecRecordStatus')
    # Add index on RegRegistrationParentRef - to select records with the same parent
    self.collection.ensure_index('RegRegistrationParentRef')
    # Need to filter on web publishable
    self.collection.ensure_index('AdmPublishWebNoPasswordFlag')
    # Exclude records if they do not have a GUID
    self.collection.ensure_index('AdmGUIDPreferredValue')
    # Add embargo date index
    self.collection.ensure_index('RealEmbargoDate')

    super(MongoCatalogueTask, self).on_success()
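# A minimal sketch of the same index set using create_index(), which supersedes the
# deprecated ensure_index() in pymongo >= 3.x. The field list mirrors the method above;
# the helper name is an assumption.
def add_catalogue_indexes(collection):
    for field in ['ColRecordType', 'SecRecordStatus', 'RegRegistrationParentRef',
                  'AdmPublishWebNoPasswordFlag', 'AdmGUIDPreferredValue', 'RealEmbargoDate']:
        collection.create_index(field)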
def run(self):
    # Do not run if this is a full export date - all non-publishable records
    # will already have been removed
    if int(self.full_export_date) == int(self.date):
        log.info("No records to unpublish for full exports")
        self.mark_complete()
        return

    collection = self.output().get_collection('ecatalogue')

    # We only care about records whose status has changed in the past week (6 days to be sure)
    date_object = datetime.strptime(str(self.date), '%Y%m%d')

    q = dict(AdmPublishWebNoPasswordFlag='N', exportFileDate=self.date, ISODateInserted={'$gte': date_object - timedelta(days=6)})

    cursor = collection.find(q)

    log.info('%s records to unpublish', cursor.count())

    for record in cursor:
        ckan_delete(self.remote_ckan, record)

    # And mark the task as complete
    self.mark_complete()
def run(self):
    count = 0
    host = config.get('mongo', 'host')
    db = config.get('mongo', 'database')

    def _fill_field(field_arr, field_type):
        if field_type.startswith('string'):
            field_arr = field_arr.astype(np.str).filled('')
        elif field_type == 'bool':
            field_arr = field_arr.astype(np.str).filled(None)
        elif field_type.startswith('int'):
            field_arr = field_arr.filled(0)
        elif field_type.startswith('float'):
            field_arr = field_arr.filled(np.NaN)
        else:
            raise Exception('Unknown field type %s' % field_type)

        return field_arr

    with Monary(host) as m:

        log.info("Querying Monary")

        # Get field definitions for the default collection
        query_fields, df_cols, field_types = zip(*self.get_collection_source_columns(self.collection_name))

        catalogue_blocks = m.block_query(db, self.collection_name, self.query, query_fields, field_types, block_size=self.block_size)

        log.info("Processing Monary data")

        for catalogue_block in catalogue_blocks:

            # Bit of a hack: fill fields with a blank value (depending on type)
            # so the masked value doesn't get used. As the masked array is shared
            # between blocks, an empty field would otherwise be populated with
            # values from the previous block
            catalogue_block = [_fill_field(arr, field_types[i]) for i, arr in enumerate(catalogue_block)]

            # Create a pandas data frame with the block of records
            # Columns use the name from the output columns - but must be in the same order
            # as query_fields, which is why we're using tuples for the columns
            df = pd.DataFrame(np.matrix(catalogue_block).transpose(), columns=df_cols)

            # Loop through all the columns and ensure hidden integer fields are cast as int32
            # For example, taxonomy_irn is used to join with the taxonomy df
            for i, df_col in enumerate(df_cols):
                if field_types[i].startswith('int'):
                    df[df_col] = df[df_col].astype(field_types[i])

            df = self.process_dataframe(m, df)

            # Output the dataframe
            self.output().write(df)

            row_count, col_count = df.shape
            count += row_count
            log.info("\t %s records", count)

    # After running, update mongo
    self.mongo_target.touch()
def on_success(self):

    log.info("Import CSV file with:")
    log.info("COPY \"{resource_id}\" (\"{cols}\") FROM '{path}' DELIMITER ',' CSV ENCODING 'UTF8';".format(
        resource_id=self.resource_id,
        cols='","'.join(col for col in self.get_output_columns()),
        path=self.path
    ))
    log.info("And update full text index:")
    log.info("paster update-fulltext -i \"{resource_id}\" -c /vagrant/etc/default/development.ini".format(
        resource_id=self.resource_id
    ))

    return super(DatasetCSVTask, self).complete()
def batch_insert(self, ke_data):

    def _insert(batch):
        try:
            self.collection.insert(batch)
        except DuplicateKeyError:
            # Duplicate key error - KE export does duplicate some records
            # So switch to bulk upsert for this operation
            log.error('Duplicate key error - switching to upsert')

            bulk = self.collection.initialize_unordered_bulk_op()
            for batch_record in batch:
                bulk.find({'_id': batch_record['_id']}).upsert().replace_one(batch_record)

            bulk.execute()

    batch = []

    for record in self.iterate_data(ke_data):

        if self.batch_size:
            batch.append(record)

            # If the batch length equals the batch size, commit and clear the batch
            if len(batch) % self.batch_size == 0:
                log.info('Submitting batch')
                _insert(batch)
                batch = []
        else:
            self.collection.insert(record)

    # Add any records remaining in the batch
    if batch:
        _insert(batch)
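# A hedged sketch of a duplicate-tolerant batch insert in pymongo >= 3.x: insert_many()
# with ordered=False inserts every non-duplicate document and reports the duplicates in a
# BulkWriteError, rather than aborting on the first DuplicateKeyError. The helper name is
# an assumption.
from pymongo.errors import BulkWriteError


def tolerant_batch_insert(collection, batch):
    try:
        collection.insert_many(batch, ordered=False)
        return 0
    except BulkWriteError as e:
        # Write errors with code 11000 are duplicate keys; anything else is re-raised
        duplicates = [err for err in e.details['writeErrors'] if err['code'] == 11000]
        if len(duplicates) != len(e.details['writeErrors']):
            raise
        return len(duplicates)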
def iterate_data(self, ke_data):
    """
    Iterate through the KE EMu export data, yielding processed records
    @return: generator of records
    """
    for record in ke_data:

        status = ke_data.get_status()

        if status:
            log.info(status)

        # Use the IRN as _id
        record['_id'] = record['irn']

        try:
            # Do not process the record if the unprocessed flag is set
            if not self.unprocessed:
                record = self.process_record(record)
        except InvalidRecordException:
            continue
        else:
            yield record
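# A hedged sketch of a process_record() hook as consumed by iterate_data() above. The
# publishable-flag check is an illustrative assumption, not this codebase's actual
# filtering rule; InvalidRecordException is the exception class caught above.
def process_record(self, record):
    # Skip records that are not flagged as web publishable (field name assumed)
    if record.get('AdmPublishWebNoPasswordFlag', 'Y').lower() != 'y':
        raise InvalidRecordException('Record %s is not web publishable' % record['irn'])
    return record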
def run(self):
    if int(self.full_export_date) == int(self.date):
        log.info("No records to delete for full exports")
        self.mark_complete()
        return

    super(DeleteAPITask, self).run()