def run(self):
    count = 0
    host = config.get('mongo', 'host')
    db = config.get('mongo', 'database')

    def _fill_field(field_arr, field_type):
        if field_type.startswith('string'):
            field_arr = field_arr.astype(np.str).filled('')
        elif field_type == 'bool':
            field_arr = field_arr.astype(np.str).filled(None)
        elif field_type.startswith('int'):
            field_arr = field_arr.filled(0)
        elif field_type.startswith('float'):
            field_arr = field_arr.filled(np.NaN)
        else:
            raise Exception('Unknown field type %s' % field_type)
        return field_arr

    with Monary(host) as m:
        log.info("Querying Monary")

        # Get field definitions for the default collection
        query_fields, df_cols, field_types = zip(*self.get_collection_source_columns(self.collection_name))

        catalogue_blocks = m.block_query(db, self.collection_name, self.query, query_fields, field_types, block_size=self.block_size)

        log.info("Processing Monary data")

        for catalogue_block in catalogue_blocks:
            # Bit of a hack: fill fields with a blank value (depending on type)
            # so the masked value doesn't get used. As the masked array is shared between
            # blocks, an empty field would otherwise be populated with values from the previous block
            catalogue_block = [_fill_field(arr, field_types[i]) for i, arr in enumerate(catalogue_block)]

            # Create a pandas data frame with the block of records
            # Columns use the names from the output columns - but must be in the same order as query_fields,
            # which is why we're using tuples for the columns
            df = pd.DataFrame(np.matrix(catalogue_block).transpose(), columns=df_cols)

            # Loop through all the columns and ensure hidden integer fields are cast as int32
            # For example, taxonomy_irn is used to join with the taxonomy df
            for i, df_col in enumerate(df_cols):
                if field_types[i].startswith('int'):
                    df[df_col] = df[df_col].astype(field_types[i])

            df = self.process_dataframe(m, df)

            # Output the dataframe
            self.output().write(df)

            row_count, col_count = df.shape
            count += row_count
            log.info("\t %s records", count)

    # After running, update mongo
    self.mongo_target.touch()
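# The _fill_field() helper above leans on numpy's masked-array API (Monary returns
# masked arrays). A minimal standalone sketch of what .filled() does - illustration
# only, with made-up values, not part of the pipeline:
import numpy as np
import numpy.ma as ma

names = ma.masked_array(['a', 'b', 'c'], mask=[False, True, False])
print(names.filled(''))        # ['a' '' 'c'] - the masked entry becomes an explicit blank

depths = ma.masked_array([1.5, 2.5, 3.5], mask=[False, False, True])
print(depths.filled(np.NaN))   # [1.5 2.5 nan]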
def get_export_file_dates():
    """
    Gets the dates of all outstanding export files
    @return: list of dates
    """
    export_dir = config.get('keemu', 'export_dir')

    try:
        full_export_date = int(config.get('keemu', 'full_export_date'))
    except NoOptionError:
        full_export_date = None

    files = [f for f in os.listdir(export_dir) if os.path.isfile(os.path.join(export_dir, f))]

    # Use a set so we don't have duplicate dates
    dates = set()

    for f in files:
        # Strip the .gz suffix so this works with both compressed and uncompressed files
        f = f.replace('.gz', '')
        try:
            # Extract the date from the file name
            _, _, date = f.split('.')
        except ValueError:
            # File not in the correct format - hidden directory etc.
            pass
        else:
            try:
                date = int(date)
            except ValueError:
                # First dump did not contain a date stamp
                # ecatalogue.export.zip
                continue
            else:
                # If we have a full export date (the date the last full dump was produced)
                # we only want dates after the last full dump - so skip prior dates
                if full_export_date and date < full_export_date:
                    continue
                dates.add(date)

    # Make sure they are in the right order and convert to list
    dates = sorted(list(dates))
    return dates
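# Hedged illustration of the file-name convention get_export_file_dates() relies on:
# exports are expected to be named <module>.export.<YYYYMMDD>, optionally gzipped
# (the very first full dump had no date stamp). The sample names below are invented.
for name in ['ecatalogue.export.20140814.gz', 'etaxonomy.export.20140821', 'ecatalogue.export.zip']:
    stem = name.replace('.gz', '')
    try:
        _, _, date = stem.split('.')
        print('%s => %s' % (name, int(date)))
    except ValueError:
        # Wrong number of parts, or a non-numeric date part such as 'zip'
        print('%s => skipped' % name)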
def path(self):
    """
    File name to output
    @return: str
    """
    file_name = self.__class__.__name__.replace('DatasetCSVTask', '').lower() + '-' + str(self.date)
    return os.path.join(config.get('csv', 'output_dir'), file_name + '.csv')
def main():
    update_markers = mongo_get_update_markers()

    # Make sure the updates have all mongo classes
    bulk_tasks = [
        MongoCollectionIndexTask,
        MongoCollectionEventTask,
        MongoCatalogueTask,
        MongoTaxonomyTask,
        # MongoMultimediaTask,
        MongoSiteTask,
        UnpublishTask,
        MongoDeleteTask
    ]

    def _get_task_names(tasks):
        """
        We need to instantiate the task and get the family name, not just the class name
        MongoDeleteTask => DeleteTask
        @param tasks:
        @return:
        """
        return [unicode(task(date=0).task_family) for task in tasks]

    full_export_date = int(config.get('keemu', 'full_export_date'))

    for date, update_marker in update_markers.iteritems():
        # If this is the full export date, MongoDeleteTask is not required
        if full_export_date and date == full_export_date:
            bulk_task_copy = list(bulk_tasks)
            bulk_task_copy.remove(MongoDeleteTask)
            bulk_task_names = _get_task_names(bulk_task_copy)
        else:
            bulk_task_names = _get_task_names(bulk_tasks)

        # Assert that for every date we have all the bulk tasks
        missing_tasks = list(set(bulk_task_names) - set(update_marker))
        assert missing_tasks == [], 'There are missing mongo tasks for date %s: %s' % (date, missing_tasks)

    # Get a list of all export files to process
    export_dates = [d for d in get_export_file_dates() if d not in update_markers.keys()]

    # Run setup_interface_logging to ensure luigi commands
    setup_interface_logging()

    sch = scheduler.CentralPlannerScheduler()
    w = BulkWorker(scheduler=sch)

    for export_date in export_dates:
        log.info('Processing date %s', export_date)
        # We only need to call the mongo delete task, as all other tasks are a requirement
        # NB: This doesn't delete anything from CKAN - if that's needed change this to DeleteTask
        w.add(MongoDeleteTask(date=export_date, force=True))

    w.run()
    w.stop()
class UnpublishTask(APITask):
    """
    Deprecated - once published, a record cannot be marked "do not publish to internet".
    If a KE EMu record has been marked non web publishable, it needs to be deleted from CKAN.
    NB: This does not remove embargoed records which have already been published.
    You cannot embargo a record after its release.
    """
    database = config.get('mongo', 'database')
    keemu_schema_file = config.get('keemu', 'schema')

    def requires(self):
        # Mongo catalogue task for date must have run
        yield MongoCatalogueTask(self.date)

    @timeit
    def run(self):
        # Do not run if this is a full export date - all non-publishable records
        # will already have been removed
        if int(self.full_export_date) == int(self.date):
            log.info("No records to unpublish for full exports")
            self.mark_complete()
            return

        collection = self.output().get_collection('ecatalogue')

        # We only care about records whose status has changed in the past week (6 days to be sure)
        date_object = datetime.strptime(str(self.date), '%Y%m%d')
        q = dict(AdmPublishWebNoPasswordFlag='N', exportFileDate=self.date, ISODateInserted={'$gte': date_object - timedelta(days=6)})

        cursor = collection.find(q)
        log.info('%s records to unpublish', cursor.count())

        for record in cursor:
            ckan_delete(self.remote_ckan, record)

        # And mark the object as complete
        self.mark_complete()

    def mark_complete(self):
        self.output().touch()

    def output(self):
        return MongoTarget(database=self.database, update_id=self.task_id)
def __init__(self, *args, **kwargs):
    # If a date parameter has been passed in, we'll just use that
    # Otherwise, loop through the files and get all dates
    super(DatasetTask, self).__init__(*args, **kwargs)

    # Get or create the resource object
    self.resource_id = self.get_or_create_resource()

    # Set up a mongo target to be used to mark complete
    self.mongo_target = MongoTarget(database=config.get('mongo', 'database'), update_id=self.update_id())
def mark_complete(self):
    # Move the file to the archive directory (if specified)
    try:
        archive_dir = config.get('keemu', 'archive_dir')
        self.input().move(os.path.join(archive_dir, self.input().file_name))
    except NoOptionError:
        # Allow archive dir to be none
        pass

    # And mark the object as complete
    self.output().touch()
class APITask(luigi.Task):
    """
    Base CKAN API Task
    """
    # Date to process
    date = luigi.IntParameter()
    full_export_date = config.get('keemu', 'full_export_date')

    def __init__(self, *args, **kwargs):
        # If a date parameter has been passed in, we'll just use that
        # Otherwise, loop through the files and get all dates
        super(APITask, self).__init__(*args, **kwargs)
        self.remote_ckan = ckanapi.RemoteCKAN(config.get('ckan', 'site_url'), apikey=config.get('ckan', 'api_key'))
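# Hedged usage sketch for the RemoteCKAN client created in APITask.__init__;
# the site URL and API key below are placeholders, not values from this project.
import ckanapi

remote_ckan = ckanapi.RemoteCKAN('http://ckan.example.org', apikey='xxxx')
package = remote_ckan.action.package_show(id='collection-specimens')
print('%s has %s resources' % (package['name'], len(package.get('resources', []))))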
class ArtefactDatasetTask(DatasetTask):

    # CKAN Dataset params
    package = {
        'name': 'collection-artefacts',
        'notes': u'Cultural and historical artefacts from The Natural History Museum',
        'title': "Artefacts",
        'author': DATASET_AUTHOR,
        'license_id': DATASET_LICENCE,
        'resources': [],
        'dataset_category': DATASET_TYPE,
        'owner_org': config.get('ckan', 'owner_org')
    }

    # And now save to the datastore
    datastore = {
        'resource': {
            'name': 'Artefacts',
            'description': 'Museum artefacts',
            'format': 'csv'
        },
        'primary_key': 'GUID'
    }

    columns = [
        ('ecatalogue.AdmGUIDPreferredValue', 'GUID', 'uuid'),
        ('ecatalogue.ArtName', 'Name', 'string:100'),
        ('ecatalogue.ArtKind', 'Kind', 'string:100'),
        ('ecatalogue.PalArtDescription', 'Description', 'string:100'),
        ('ecatalogue.IdeCurrentScientificName', 'Scientific name', 'string:100'),
        ('ecatalogue.MulMultiMediaRef', 'Multimedia', 'json')
    ]

    record_type = 'Artefact'

    def process_dataframe(self, m, df):
        """
        Process the dataframe, converting image IRNs to URIs
        @param m: monary
        @param df: dataframe
        @return: dataframe
        """
        # And update images to URLs
        df = super(ArtefactDatasetTask, self).process_dataframe(m, df)
        self.ensure_multimedia(df, 'Multimedia')
        return df
def solr_reindex():
    indexes = config.get('solr', 'indexes').split(',')

    # Loop through the indexes, request a full import and wait until it completes before
    # requesting the next index - ensures there's always a stable index available for requests
    for index in indexes:
        solr_index = SolrIndex(index)
        print("Starting full import of index: %s" % index)
        solr_index.full_import()

        # Enter loop to keep checking status every SLEEP_INTERVAL
        while True:
            r = solr_index.status()
            if r['status'] == 'busy':
                print('Total Rows Fetched: %s' % r['statusMessages'].get('Total Rows Fetched'))
                print('Time elapsed: %s' % r['statusMessages'].get('Time Elapsed'))
                time.sleep(SLEEP_INTERVAL)
            else:
                print(r['statusMessages'].get(''))
                print('Time taken: %s' % r['statusMessages'].get('Time taken'))
                break
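# SolrIndex is defined elsewhere in this project. As a rough sketch of the shape
# solr_reindex() assumes, it could wrap Solr's DataImportHandler endpoints roughly
# like this (an assumption for illustration, not the project's actual class; the
# core URL is hypothetical):
import requests

class SolrIndexSketch(object):

    def __init__(self, core_url):
        # e.g. 'http://localhost:8983/solr/specimen-collection'
        self.core_url = core_url

    def full_import(self):
        requests.get('%s/dataimport' % self.core_url, params={'command': 'full-import', 'wt': 'json'})

    def status(self):
        r = requests.get('%s/dataimport' % self.core_url, params={'command': 'status', 'wt': 'json'})
        return r.json()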
class SpecimenDatasetTask(DatasetTask):

    # CKAN Dataset params
    package = {
        'name': 'collection-specimens',
        'notes': u'Specimen records from the Natural History Museum\'s collection',
        'title': "Collection specimens",
        'author': DATASET_AUTHOR,
        'license_id': DATASET_LICENCE,
        'resources': [],
        'dataset_category': DATASET_TYPE,
        'spatial': '{"type":"Polygon","coordinates":[[[-180,82],[180,82],[180,-82],[-180,-82],[-180,82]]]}',
        'owner_org': config.get('ckan', 'owner_org')
    }

    # And now save to the datastore
    datastore = {
        'resource': {
            'id': config.get('dataset_id', 'specimen'),
            'name': 'Specimens',
            'description': 'Specimen records',
            'format': 'dwc'  # Darwin core
        },
        'primary_key': 'occurrenceID'
    }

    geospatial_fields = {
        'latitude_field': 'decimalLatitude',
        'longitude_field': 'decimalLongitude'
    }

    indexed_fields = ['collectionCode', 'catalogNumber', 'created', 'project']

    columns = [
        # List of columns
        # ([KE EMu field], [new field], [field type])

        # Used for logging, joins and the old stable identifier
        ('ecatalogue._id', '_id', 'int32'),
        ('ecatalogue.AdmGUIDPreferredValue', 'occurrenceID', 'uuid'),
        ('ecatalogue.DarCatalogNumber', 'catalogNumber', 'string:100'),
        # Taxonomy
        ('ecatalogue.DarScientificName', 'scientificName', 'string:100'),
        # Rather than using the two darwin core fields DarScientificNameAuthorYear and ScientificNameAuthor
        # it's easier to just use IdeFiledAsAuthors which has them both concatenated
        ('ecatalogue.IdeFiledAsAuthors', 'scientificNameAuthorship', 'string:100'),
        ('ecatalogue.DarTypeStatus', 'typeStatus', 'string:100'),
        # Use nearest named place rather than precise locality
        # https://github.com/NaturalHistoryMuseum/ke2mongo/issues/29
        ('ecatalogue.PalNearestNamedPlaceLocal', 'locality', 'string:100'),
        ('ecatalogue.DarCountry', 'country', 'string:100'),
        ('ecatalogue.DarWaterBody', 'waterBody', 'string:100'),
        ('ecatalogue.EntLocExpeditionNameLocal', 'expedition', 'string:100'),
        ('ecollectionevents.ColParticipantLocal', 'recordedBy', 'string:100'),
        ('ecatalogue.ColDepartment', 'collectionCode', 'string:100'),
        ('ecatalogue.DarKingdom', 'kingdom', 'string:100'),
        ('ecatalogue.DarPhylum', 'phylum', 'string:100'),
        ('ecatalogue.DarClass', 'class', 'string:100'),
        ('ecatalogue.DarOrder', 'order', 'string:100'),
        ('ecatalogue.DarFamily', 'family', 'string:100'),
        ('ecatalogue.DarGenus', 'genus', 'string:100'),
        ('ecatalogue.DarSubgenus', 'subgenus', 'string:100'),
        ('ecatalogue.DarSpecies', 'specificEpithet', 'string:100'),
        ('ecatalogue.DarSubspecies', 'infraspecificEpithet', 'string:100'),
        ('ecatalogue.DarHigherTaxon', 'higherClassification', 'string:100'),
        ('ecatalogue.DarInfraspecificRank', 'taxonRank', 'string:100'),
        # Location
        ('ecatalogue.DarStateProvince', 'stateProvince', 'string:100'),
        ('ecatalogue.DarContinent', 'continent', 'string:100'),
        ('ecatalogue.DarIsland', 'island', 'string:100'),
        ('ecatalogue.DarIslandGroup', 'islandGroup', 'string:100'),
        # Removed: continentOcean is not in current DwC standard, replaced by waterBody and continent
        # ('ecatalogue.DarContinentOcean', 'continentOcean', 'string:100'),
        ('ecatalogue.DarHigherGeography', 'higherGeography', 'string:100'),
        ('ecatalogue.ColHabitatVerbatim', 'habitat', 'string:100'),
        ('ecatalogue.DarLatLongComments', '_latLongComments', 'string:100'),
        ('ecatalogue.DarDecimalLongitude', 'decimalLongitude', 'float64'),
        ('ecatalogue.DarDecimalLatitude', 'decimalLatitude', 'float64'),
        ('ecatalogue.DarGeodeticDatum', 'geodeticDatum', 'string:100'),
        ('ecatalogue.DarGeorefMethod', 'georeferenceProtocol', 'string:100'),
        ('esites.LatLongitude', 'verbatimLongitude', 'string:100'),
        ('esites.LatLatitude', 'verbatimLatitude', 'string:100'),
        # Occurrence
        ('ecatalogue.DarMinimumElevationInMeters', 'minimumElevationInMeters', 'string:100'),
        ('ecatalogue.DarMaximumElevationInMeters', 'maximumElevationInMeters', 'string:100'),
        ('ecatalogue.DarMinimumDepthInMeters', 'minimumDepthInMeters', 'string:100'),
        ('ecatalogue.DarMaximumDepthInMeters', 'maximumDepthInMeters', 'string:100'),
        # DarCollector doesn't have multiple collectors NHMUK:ecatalogue:1751715 - switched to using ecollectionevents.ColParticipantLocal
        # ('ecatalogue.DarCollector', 'Recorded by', 'string:100'),
        ('ecatalogue.DarCollectorNumber', 'recordNumber', 'string:100'),
        ('ecatalogue.DarIndividualCount', 'individualCount', 'string:100'),
        # According to docs, ageClass has been superseded by lifeStage. We have both, but ageClass duplicates,
        # and for the ~200 records where it has extra data, the data isn't good
        # ('ecatalogue.DarAgeClass', 'ageClass', 'string:100'),
        ('ecatalogue.DarLifeStage', 'lifeStage', 'string:100'),
        ('ecatalogue.DarSex', 'sex', 'string:100'),
        ('ecatalogue.DarPreparations', 'preparations', 'string:100'),
        # Identification
        ('ecatalogue.DarIdentifiedBy', 'identifiedBy', 'string:100'),
        # KE EMu has 3 fields for identification date: DarDayIdentified, DarMonthIdentified and DarYearIdentified
        # But EntIdeDateIdentified holds them all - which is what we want for dateIdentified
        ('ecatalogue.EntIdeDateIdentified', 'dateIdentified', 'string:100'),
        ('ecatalogue.DarIdentificationQualifier', 'identificationQualifier', 'string:100'),
        # Removed as mostly duplicates DarCollectorNumber (JW - feedback)
        # ('ecatalogue.DarFieldNumber', 'Field number', 'string:100'),
        ('ecatalogue.DarTimeOfDay', 'eventTime', 'string:100'),
        ('ecatalogue.DarDayCollected', 'day', 'string:100'),
        ('ecatalogue.DarMonthCollected', 'month', 'string:100'),
        ('ecatalogue.DarYearCollected', 'year', 'string:100'),
        # Geo
        ('ecatalogue.DarEarliestEon', 'earliestEonOrLowestEonothem', 'string:100'),
        ('ecatalogue.DarLatestEon', 'latestEonOrHighestEonothem', 'string:100'),
        ('ecatalogue.DarEarliestEra', 'earliestEraOrLowestErathem', 'string:100'),
        ('ecatalogue.DarLatestEra', 'latestEraOrHighestErathem', 'string:100'),
        ('ecatalogue.DarEarliestPeriod', 'earliestPeriodOrLowestSystem', 'string:100'),
        ('ecatalogue.DarLatestPeriod', 'latestPeriodOrHighestSystem', 'string:100'),
        ('ecatalogue.DarEarliestEpoch', 'earliestEpochOrLowestSeries', 'string:100'),
        ('ecatalogue.DarLatestEpoch', 'latestEpochOrHighestSeries', 'string:100'),
        ('ecatalogue.DarEarliestAge', 'earliestAgeOrLowestStage', 'string:100'),
        ('ecatalogue.DarLatestAge', 'latestAgeOrHighestStage', 'string:100'),
        ('ecatalogue.DarLowestBiostrat', 'lowestBiostratigraphicZone', 'string:100'),
        ('ecatalogue.DarHighestBiostrat', 'highestBiostratigraphicZone', 'string:100'),
        ('ecatalogue.DarGroup', 'group', 'string:100'),
        ('ecatalogue.DarFormation', 'formation', 'string:100'),
        ('ecatalogue.DarMember', 'member', 'string:100'),
        ('ecatalogue.DarBed', 'bed', 'string:100'),
        # Resource relationship
        # Only 34 records have DarRelatedCatalogItem populated, so it's better to build
        # this automatically from part / parent records
        # ('ecatalogue.DarRelatedCatalogItem', 'Related resource id', 'string:100'),
        # Multimedia
        ('ecatalogue.MulMultiMediaRef', 'associatedMedia', 'json'),
        # Dynamic properties
        # These fields do not map to DwC, but are still very useful
        ('ecatalogue.ColRecordType', 'recordType', 'string:100'),
        ('ecatalogue.ColSubDepartment', 'subDepartment', 'string:100'),
        ('ecatalogue.PrtType', 'partType', 'string:100'),
        ('ecatalogue.RegCode', 'registrationCode', 'string:100'),
        ('ecatalogue.CatKindOfObject', 'kindOfObject', 'string:100'),
        ('ecatalogue.CatKindOfCollection', 'kindOfCollection', 'string:100'),
        ('ecatalogue.CatPreservative', 'preservative', 'string:100'),
        ('ecatalogue.ColKind', 'collectionKind', 'string:100'),
        ('ecatalogue.EntPriCollectionName', 'collectionName', 'string:100'),
        ('ecatalogue.PalAcqAccLotDonorFullName', 'donorName', 'string:100'),
        ('ecatalogue.DarPreparationType', 'preparationType', 'string:100'),
        ('ecatalogue.DarObservedWeight', 'observedWeight', 'string:100'),
        # Location
        # Data is stored in the sumViceCountry field in ecatalogue data - but actually this
        # should be viceCountry (which it is in esites)
        ('ecatalogue.sumViceCountry', 'viceCounty', 'string:100'),
        ('ecatalogue.DnaExtractionMethod', 'extractionMethod', 'string:100'),
        ('ecatalogue.DnaReSuspendedIn', 'resuspendedIn', 'string:100'),
        ('ecatalogue.DnaTotalVolume', 'totalVolume', 'string:100'),
        # Parasite card
        ('ecatalogue.CardBarcode', 'barcode', 'string:100'),
        # Egg
        ('ecatalogue.EggClutchSize', 'clutchSize', 'string:100'),
        ('ecatalogue.EggSetMark', 'setMark', 'string:100'),
        # Nest
        ('ecatalogue.NesShape', 'nestShape', 'string:100'),
        ('ecatalogue.NesSite', 'nestSite', 'string:100'),
        # Silica gel
        ('ecatalogue.SilPopulationCode', 'populationCode', 'string:100'),
        # Botany
        ('ecatalogue.CollExsiccati', 'exsiccati', 'string:100'),
        ('ecatalogue.ColExsiccatiNumber', 'exsiccatiNumber', 'string:100'),
        # JW asked for this to be renamed from Site Description => Label locality
        ('ecatalogue.ColSiteDescription', 'labelLocality', 'string:100'),
        ('ecatalogue.ColPlantDescription', 'plantDescription', 'string:100'),
        ('ecatalogue.FeaCultivated', 'cultivated', 'string:100'),
        # JW asked for this to be removed
        # ('ecatalogue.FeaPlantForm', 'Plant form', 'string:100'),
        # Paleo
        ('ecatalogue.PalDesDescription', 'catalogueDescription', 'string:100'),
        ('ecatalogue.PalStrChronostratLocal', 'chronostratigraphy', 'string:100'),
        ('ecatalogue.PalStrLithostratLocal', 'lithostratigraphy', 'string:100'),
        # Mineralogy
        ('ecatalogue.MinDateRegistered', 'dateRegistered', 'string:100'),
        ('ecatalogue.MinIdentificationAsRegistered', 'identificationAsRegistered', 'string:100'),
        ('ecatalogue.MinIdentificationDescription', 'identificationDescription', 'string:500'),
        ('ecatalogue.MinPetOccurance', 'occurrence', 'string:100'),
        ('ecatalogue.MinOreCommodity', 'commodity', 'string:200'),
        ('ecatalogue.MinOreDepositType', 'depositType', 'string:100'),
        ('ecatalogue.MinTextureStructure', 'texture', 'string:100'),
        ('ecatalogue.MinIdentificationVariety', 'identificationVariety', 'string:100'),
        ('ecatalogue.MinIdentificationOther', 'identificationOther', 'string:100'),
        ('ecatalogue.MinHostRock', 'hostRock', 'string:100'),
        ('ecatalogue.MinAgeDataAge', 'age', 'string:100'),
        ('ecatalogue.MinAgeDataType', 'ageType', 'string:100'),
        # Mineralogy location
        ('ecatalogue.MinNhmTectonicProvinceLocal', 'tectonicProvince', 'string:100'),
        ('ecatalogue.MinNhmStandardMineLocal', 'mine', 'string:100'),
        ('ecatalogue.MinNhmMiningDistrictLocal', 'miningDistrict', 'string:100'),
        ('ecatalogue.MinNhmComplexLocal', 'mineralComplex', 'string:100'),
        ('ecatalogue.MinNhmRegionLocal', 'geologyRegion', 'string:100'),
        # Meteorite
        ('ecatalogue.MinMetType', 'meteoriteType', 'string:100'),
        ('ecatalogue.MinMetGroup', 'meteoriteGroup', 'string:100'),
        ('ecatalogue.MinMetChondriteAchondrite', 'chondriteAchondrite', 'string:100'),
        ('ecatalogue.MinMetClass', 'meteoriteClass', 'string:100'),
        ('ecatalogue.MinMetPetType', 'petrologyType', 'string:100'),
        ('ecatalogue.MinMetPetSubtype', 'petrologySubtype', 'string:100'),
        ('ecatalogue.MinMetRecoveryFindFall', 'recovery', 'string:100'),
        ('ecatalogue.MinMetRecoveryDate', 'recoveryDate', 'string:100'),
        ('ecatalogue.MinMetRecoveryWeight', 'recoveryWeight', 'string:100'),
        ('ecatalogue.MinMetWeightAsRegistered', 'registeredWeight', 'string:100'),
        ('ecatalogue.MinMetWeightAsRegisteredUnit', 'registeredWeightUnit', 'string:100'),
        # Project
        ('ecatalogue.NhmSecProjectName', 'project', 'string:100'),
        # Project
        ('ecatalogue.EntCatBarcode', 'barcode', 'string:100'),
        # Record level
        ('ecatalogue.AdmDateModified', 'modified', 'string:100'),
        # This isn't actually in DwC - but I'm going to use dcterms:created
        ('ecatalogue.AdmDateInserted', 'created', 'string:100'),
        # Internal
        ('ecatalogue.RegRegistrationParentRef', '_parentRef', 'int32'),
        ('ecatalogue.sumSiteRef', '_siteRef', 'int32'),
        ('ecatalogue.sumCollectionEventRef', '_collectionEventRef', 'int32'),
        ('ecatalogue.CardParasiteRef', '_cardParasiteRef', 'int32'),
        # Used if DarCatalogueNumber is empty
        ('ecatalogue.RegRegistrationNumber', '_regRegistrationNumber', 'string:100'),
        # Used if CatPreservative is empty
        ('ecatalogue.EntCatPreservation', '_entCatPreservation', 'string:100'),
        # Used to build previous determinations for Botany
        ('ecatalogue.IdeCitationTypeStatus', '_determinationTypes', 'string:100'),
        ('ecatalogue.EntIdeScientificNameLocal', '_determinationNames', 'string:250'),
        ('ecatalogue.EntIdeFiledAs', '_determinationFiledAs', 'string:100'),
        # If DarTypeStatus is empty, we'll use sumTypeStatus which has previous determinations
        ('ecatalogue.sumTypeStatus', '_sumTypeStatus', 'string:100'),
        # If DarMinimumDepthInMeters is empty, use CollEventFromMetres - used for abyssline project
        ('ecatalogue.CollEventFromMetres', '_collEventFromMetres', 'string:100'),
        ('ecatalogue.CollEventToMetres', '_collEventToMetres', 'string:100'),
        # Locality if nearest named place is empty
        # The encoding of DarLocality is buggered - see ecatalogue.1804973
        # So better to use the original field with the correct encoding
        ('ecatalogue.sumPreciseLocation', '_preciseLocation', 'string:100'),
        # Locality if precise and nearest named place is empty
        ('ecatalogue.MinNhmVerbatimLocalityLocal', '_minLocalityLocal', 'string:100'),
        # CITES specimens
        ('ecatalogue.cites', '_cites', 'bool'),
        # Parasite cards use a different field for life stage
        ('ecatalogue.CardParasiteStage', '_parasiteStage', 'string:100'),
        # Join keys
        ('ecollectionevents._id', '_ecollectioneventsIrn', 'int32'),
        ('esites._id', '_esitesIrn', 'int32'),
        # Removed: We do not want notes, could contain anything
        # ('ecatalogue.DarNotes', 'DarNotes', 'string:100'),
        # ('ecatalogue.DarLatLongComments', 'latLongComments', 'string:100'),
    ]

    # Used to merge in data from parasite cards, which do not have taxonomic data
    parasite_taxonomy_fields = [
        ('_id', '_irn', 'int32'),
        ('ClaScientificNameBuilt', 'scientificName', 'string:100'),
        ('ClaKingdom', 'kingdom', 'string:60'),
        ('ClaPhylum', 'phylum', 'string:100'),
        ('ClaClass', 'class', 'string:100'),
        ('ClaOrder', 'order', 'string:100'),
        ('ClaFamily', 'family', 'string:100'),
        ('ClaGenus', 'genus', 'string:100'),
        ('ClaSubgenus', 'subgenus', 'string:100'),
        ('ClaSpecies', 'specificEpithet', 'string:100'),
        ('ClaSubspecies', 'infraspecificEpithet', 'string:100'),
        ('ClaRank', 'taxonRank', 'string:10')  # NB: CKAN uses rank internally
    ]

    # Columns not selected from the database
    # In the format (field_name, field_type, default_value)
    literal_columns = [
        ('institutionCode', 'string:100', 'NHMUK'),
        ('basisOfRecord', 'string:100', 'Specimen'),
        ('determinations', 'json', np.NaN),
        # This is set dynamically if this is a part record (with parent Ref)
        ('relatedResourceID', 'string:100', np.NaN),
        ('relationshipOfResource', 'string:100', np.NaN),
        ('centroid', 'bool', False),
        ('otherCatalogNumbers', 'string:100', np.NaN)
    ]

    @property
    def query(self):
        """
        Query object for selecting data from mongoDB
        To test encoding, use query = {'_id': 42866}
        @return: dict
        """
        query = super(SpecimenDatasetTask, self).query

        # Override the default ColRecordType
        query['ColRecordType'] = {
            "$nin": PARENT_TYPES + [ArtefactDatasetTask.record_type, IndexLotDatasetTask.record_type]
        }

        # And exclude all with an embargo date (timestamp) in the future
        query['RealEmbargoDate'] = {"$lt": time.time()}

        return query

    def get_output_columns(self):
        """
        Override default get_output_columns and add in literal columns (not retrieved from mongo)
        @return:
        """
        output_columns = super(SpecimenDatasetTask, self).get_output_columns()

        # Add the literal columns
        for (field_name, field_type, _) in self.literal_columns:
            output_columns[field_name] = field_type

        return output_columns

    def process_dataframe(self, m, df):
        """
        Process the dataframe, updating multimedia irns => URIs
        @param m: monary
        @param df: dataframe
        @return: dataframe
        """
        df = super(SpecimenDatasetTask, self).process_dataframe(m, df)

        # Add literal columns
        for (field_name, _, default_value) in self.literal_columns:
            df[field_name] = default_value

        # Convert collection code to PAL, MIN etc.
        df['collectionCode'] = df['collectionCode'].str.upper().str[0:3]
        # Entom record collection code = BMNH(E)
        df['collectionCode'][df['collectionCode'] == 'ENT'] = "BMNH(E)"

        # Add the old stable identifier - IRN concatenated with catalogue name etc.
        df['otherCatalogNumbers'] = 'NHMUK:ecatalogue:' + df['_id'].astype('str')

        # Ensure multimedia resources are suitable (jpeg rather than tiff etc.)
        self.ensure_multimedia(df, 'associatedMedia')

        # Assign determination name, type and filed as to determinations for determination history
        determination_fields = [
            ('name', '_determinationNames'),
            ('type', '_determinationTypes'),
            ('filedAs', '_determinationFiledAs')
        ]

        def determinations_json(row):
            """
            Convert determination fields to json
            Dictionary comprehension looping through each field, and if it exists adding it to a dict
            @param row:
            @return:
            """
            return json.dumps({field_name: row[determination].split(';') for field_name, determination in determination_fields if row[determination]})

        df['determinations'] = df[df['_determinationNames'] != ''].apply(determinations_json, axis=1)

        # There doesn't seem to be a good way to identify centroids in KE EMu
        # I was using esites.LatDeriveCentroid, but this always defaults to True,
        # and trying to use centroid lat/lon fields also includes pretty much every record.
        # But matching against *entroid being added to georeferencing notes produces much better results
        df['centroid'][df['_latLongComments'].str.contains("entroid")] = True

        # Convert all blank strings to NaN so we can use fillna &
        # combine_first() to replace NaNs with values from the parent df
        df = df.applymap(lambda x: np.nan if isinstance(x, basestring) and x == '' else x)

        df['catalogNumber'].fillna(df['_regRegistrationNumber'], inplace=True)

        # If PalNearestNamedPlaceLocal is missing, use sumPreciseLocation
        # and then try MinNhmVerbatimLocalityLocal
        df['locality'].fillna(df['_preciseLocation'], inplace=True)
        df['locality'].fillna(df['_minLocalityLocal'], inplace=True)

        # Replace missing DarTypeStatus
        df['typeStatus'].fillna(df['_sumTypeStatus'], inplace=True)

        # Replace missing depth fields
        df['minimumDepthInMeters'].fillna(df['_collEventFromMetres'], inplace=True)
        df['maximumDepthInMeters'].fillna(df['_collEventToMetres'], inplace=True)

        # Replace missing CatPreservative
        df['preservative'].fillna(df['_entCatPreservation'], inplace=True)

        # Cultivated should only be set on Botany records - but is actually on everything
        df['cultivated'][df['collectionCode'] != 'BOT'] = np.nan

        # Process part parents
        parent_irns = self._get_unique_irns(df, '_parentRef')

        if parent_irns:
            # We want to get all parts associated to one parent record, so we can provide them as associated records
            # So select all records matching the parent IRN
            q = dict(self.query)

            # Delete _id if it's set - need this for testing
            if '_id' in q:
                del q['_id']

            # Get all records with the same parent, so we can add them as related records
            q['RegRegistrationParentRef'] = {'$in': parent_irns}
            monary_query = m.query(config.get('mongo', 'database'), 'ecatalogue', q, ['RegRegistrationParentRef', 'AdmGUIDPreferredValue'], ['int32', 'string:36'])
            part_df = pd.DataFrame(np.matrix(monary_query).transpose(), columns=['RegRegistrationParentRef', 'AdmGUIDPreferredValue'])
            part_df['RegRegistrationParentRef'] = part_df['RegRegistrationParentRef'].astype('int32')

            # Group by parent ref and concatenate all the GUIDs together
            # So we now have:
            # parent_irn   guid; guid
            parts = part_df.groupby('RegRegistrationParentRef')['AdmGUIDPreferredValue'].apply(lambda x: "%s" % ';'.join(x))

            # And update the main data frame with the grouped parts, merged on _parentRef
            df['relatedResourceID'] = df.apply(lambda row: parts[row['_parentRef']] if row['_parentRef'] in parts else np.NaN, axis=1)
            df['relationshipOfResource'][df['relatedResourceID'].notnull()] = 'Parts'

            parent_df = self.get_dataframe(m, 'ecatalogue', self.get_collection_source_columns('ecatalogue'), parent_irns, '_id')

            # Ensure the parent multimedia images are usable
            self.ensure_multimedia(parent_df, 'associatedMedia')

            # Assign parentRef as the index to allow us to combine with parent_df
            df.index = df['_parentRef']

            # There is an annoying bug that coerces string columns to integers in combine_first
            # Hack: ensure there's always a string value that cannot be coerced in every column,
            # so create a dummy row, which gets deleted after combine_first is called
            dummy_index = len(df) + 1
            parent_df.loc[dummy_index] = ['-' for _ in parent_df]
            df = df.combine_first(parent_df)
            df = df.drop([dummy_index])

        # Ensure our geo fields are floats
        df['decimalLongitude'] = df['decimalLongitude'].astype('float64')
        df['decimalLatitude'] = df['decimalLatitude'].astype('float64')

        # Get all collection columns
        collection_columns = self.get_collection_source_columns()

        # Load extra sites info (if there's an error radius + unit)
        site_irns = self._get_unique_irns(df, '_siteRef')

        sites_df = self.get_dataframe(m, 'esites', collection_columns['esites'], site_irns, '_esitesIrn')

        df = pd.merge(df, sites_df, how='outer', left_on=['_siteRef'], right_on=['_esitesIrn'])

        # For CITES species, we need to hide Lat/Lon and Locality data - and label images
        for i in ['locality', 'labelLocality', 'decimalLongitude', 'decimalLatitude', 'verbatimLongitude', 'verbatimLatitude', 'centroid', 'maxError', 'higherGeography', 'associatedMedia']:
            df[i][df['_cites'] == 'True'] = np.NaN

        # Some records are being assigned a centroid even if they have no lat/lon fields.
        # Ensure it's False if latitude is null
        df['centroid'][df['decimalLatitude'].isnull()] = False

        # Load collection event data
        collection_event_irns = self._get_unique_irns(df, '_collectionEventRef')

        # if collection_event_irns:
        collection_event_df = self.get_dataframe(m, 'ecollectionevents', collection_columns['ecollectionevents'], collection_event_irns, '_ecollectioneventsIrn')
        # print collection_event_df
        df = pd.merge(df, collection_event_df, how='outer', left_on=['_collectionEventRef'], right_on=['_ecollectioneventsIrn'])

        # Add parasite life stage
        # Parasite cards use a different field for life stage
        df['lifeStage'].fillna(df['_parasiteStage'], inplace=True)

        # Add parasite card
        parasite_taxonomy_irns = self._get_unique_irns(df, '_cardParasiteRef')

        if parasite_taxonomy_irns:
            parasite_df = self.get_dataframe(m, 'etaxonomy', self.parasite_taxonomy_fields, parasite_taxonomy_irns, '_irn')
            df.index = df['_cardParasiteRef']
            df = df.combine_first(parasite_df)

        return df
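# Standalone illustration of the part/parent grouping used in process_dataframe()
# above: the GUIDs of all parts sharing a parent IRN are joined with ';' before
# being written to relatedResourceID. The IRNs and GUIDs here are invented.
import pandas as pd

part_df = pd.DataFrame({
    'RegRegistrationParentRef': [100, 100, 200],
    'AdmGUIDPreferredValue': ['guid-a', 'guid-b', 'guid-c'],
})
parts = part_df.groupby('RegRegistrationParentRef')['AdmGUIDPreferredValue'].apply(lambda x: ';'.join(x))
print(parts[100])   # 'guid-a;guid-b'
print(parts[200])   # 'guid-c'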
class IndexLotDatasetTask(DatasetTask):

    record_type = 'Index Lot'

    # CKAN Dataset params
    package = {
        'name': 'collection-indexlots',
        'notes': u'Index Lot records from the Natural History Museum\'s collection',
        'title': "Index Lot collection",
        'author': DATASET_AUTHOR,
        'license_id': DATASET_LICENCE,
        'resources': [],
        'dataset_category': DATASET_TYPE,
        'owner_org': config.get('ckan', 'owner_org')
    }

    # And now save to the datastore
    datastore = {
        'resource': {
            'name': 'Index Lots',
            'description': 'Species level record denoting the presence of a taxon in the Museum collection',
            'format': 'csv'
        },
        'primary_key': 'GUID'
    }

    columns = [
        ('etaxonomy2._id', '_current_name_irn', 'int32'),
        ('etaxonomy2.ClaScientificNameBuilt', 'Currently accepted name', 'string:100'),
        ('etaxonomy._id', '_taxonomy_irn', 'int32'),
        ('etaxonomy.ClaScientificNameBuilt', 'Original name', 'string:100'),
        ('etaxonomy.ClaKingdom', 'Kingdom', 'string:60'),
        ('etaxonomy.ClaPhylum', 'Phylum', 'string:100'),
        ('etaxonomy.ClaClass', 'Class', 'string:100'),
        ('etaxonomy.ClaOrder', 'Order', 'string:100'),
        ('etaxonomy.ClaSuborder', 'Suborder', 'string:100'),
        ('etaxonomy.ClaSuperfamily', 'Superfamily', 'string:100'),
        ('etaxonomy.ClaFamily', 'Family', 'string:100'),
        ('etaxonomy.ClaSubfamily', 'Subfamily', 'string:100'),
        ('etaxonomy.ClaGenus', 'Genus', 'string:100'),
        ('etaxonomy.ClaSubgenus', 'Subgenus', 'string:100'),
        ('etaxonomy.ClaSpecies', 'Species', 'string:100'),
        ('etaxonomy.ClaSubspecies', 'Subspecies', 'string:100'),
        ('etaxonomy.ClaRank', 'Taxonomic rank', 'string:20'),  # NB: CKAN uses rank internally
        ('ecatalogue.AdmGUIDPreferredValue', 'GUID', 'uuid'),
        ('ecatalogue._id', 'IRN', 'int32'),
        ('ecatalogue.EntIndIndexLotNameRef', '_collection_index_irn', 'int32'),
        ('ecatalogue.EntIndMaterial', 'Material', 'bool'),
        ('ecatalogue.EntIndType', 'Type', 'bool'),
        ('ecatalogue.EntIndMedia', 'Media', 'bool'),
        ('ecatalogue.EntIndBritish', 'British', 'bool'),
        ('ecatalogue.EntIndKindOfMaterial', 'Kind of material', 'string:100'),
        ('ecatalogue.EntIndKindOfMedia', 'Kind of media', 'string:100'),
        # Material detail
        ('ecatalogue.EntIndCount', 'Material count', 'string:100'),
        ('ecatalogue.EntIndSex', 'Material sex', 'string:100'),
        ('ecatalogue.EntIndStage', 'Material stage', 'string:100'),
        ('ecatalogue.EntIndTypes', 'Material types', 'string:100'),
        ('ecatalogue.EntIndPrimaryTypeNo', 'Material primary type no', 'string:100'),
        # Separate Botany and Entomology
        ('ecatalogue.ColDepartment', 'Department', 'string:100'),
        # Audit info
        ('ecatalogue.AdmDateModified', 'Modified', 'string:100'),
        ('ecatalogue.AdmDateInserted', 'Created', 'string:100'),
    ]

    def process_dataframe(self, m, df):
        """
        Process the dataframe, adding in the taxonomy fields
        @param m: monary
        @param df: dataframe
        @return: dataframe
        """
        # Try and get taxonomy using the collection index
        # BS: 20140804 - Fix indexlots taxonomy bug
        # When the index lot record's taxonomy is updated (via collection index),
        # the index lot record's EntIndIndexLotTaxonNameLocalRef is not updated with the new taxonomy
        # So we need to use the collection index to retrieve the record taxonomy
        df = super(IndexLotDatasetTask, self).process_dataframe(m, df)

        # Convert booleans to yes / no for all columns in the main collection
        for (_, field, field_type) in self.get_collection_source_columns(self.collection_name):
            if field_type == 'bool':
                df[field][df[field] == 'True'] = 'Yes'
                df[field][df[field] == 'False'] = 'No'
                df[field][df[field] == 'N/A'] = ''

        # BUG FIX BS 140811
        # ColCurrentNameRef is not being updated correctly - see record 899984
        # ColCurrentNameRef = 964105
        # Not a problem, as indexlots are using ColTaxonomicNameRef for summary data etc.,
        # so ColTaxonomicNameRef is the correct field to use.
        collection_index_columns = [
            ('_id', '_collection_index_irn', 'int32'),
            ('ColTaxonomicNameRef', '_taxonomy_irn', 'int32'),
            ('ColCurrentNameRef', '_current_name_irn', 'int32'),
        ]

        collection_index_irns = self._get_unique_irns(df, '_collection_index_irn')
        collection_index_df = self.get_dataframe(m, 'ecollectionindex', collection_index_columns, collection_index_irns, '_collection_index_irn')

        # Get all collection columns
        collection_columns = self.get_collection_source_columns()

        # And get the taxonomy for these collection index records
        taxonomy_irns = self._get_unique_irns(collection_index_df, '_taxonomy_irn')

        # The query to pre-load all taxonomy objects takes ~96 seconds
        # It is much faster to load taxonomy objects on the fly, for the current block
        # collection_index_irns = pd.unique(df._collection_index_irn.values.ravel()).tolist()
        taxonomy_df = self.get_dataframe(m, 'etaxonomy', collection_columns['etaxonomy'], taxonomy_irns, '_taxonomy_irn')

        # Merge the taxonomy into the collection index dataframe - we need to do this so we can merge into
        # the main dataframe keyed by collection index ID
        collection_index_df = pd.merge(collection_index_df, taxonomy_df, how='inner', left_on=['_taxonomy_irn'], right_on=['_taxonomy_irn'])

        # Add current name - same process as the main taxonomy but using _current_name_irn source fields
        current_name_irns = self._get_unique_irns(collection_index_df, '_current_name_irn')
        current_name_df = self.get_dataframe(m, 'etaxonomy', collection_columns['etaxonomy2'], current_name_irns, '_current_name_irn')
        collection_index_df = pd.merge(collection_index_df, current_name_df, how='inner', left_on=['_current_name_irn'], right_on=['_current_name_irn'])

        # Merge results into main dataframe
        df = pd.merge(df, collection_index_df, how='outer', left_on=['_collection_index_irn'], right_on=['_collection_index_irn'])

        return df

    def get_output_columns(self):
        """
        Get a list of output columns, with bool converted to string:3 (so it can be converted to Yes/No)
        @return:
        """
        return OrderedDict((col[1], 'string:3' if col[2] == 'bool' else col[2]) for col in self.columns if self._is_output_field(col[1]))
class MongoTask(luigi.Task):

    date = luigi.IntParameter()
    # Added parameter to allow skipping the processing of records - this is so MW can look at the raw data in mongo, unprocessed
    unprocessed = luigi.BooleanParameter(default=False, significant=False)
    flatten_mode = FlattenModeParameter(default=FLATTEN_ALL, significant=False)

    database = config.get('mongo', 'database')
    keemu_schema_file = config.get('keemu', 'schema')

    batch_size = 1000
    bulk_op_size = 100000

    collection = None
    file_extension = 'export'

    @abc.abstractproperty
    def module(self):
        return None

    @property
    def collection_name(self):
        # By default, the collection name will be the same as the module
        return self.module

    def requires(self):
        return KEFileTask(module=self.module, date=self.date, file_extension=self.file_extension)

    def get_collection(self):
        """
        Get a reference to the mongo collection object
        @return:
        """
        return self.output().get_collection(self.collection_name)

    @timeit
    def run(self):
        ke_data = KEParser(self.input().open('r'), file_path=self.input().path, schema_file=self.keemu_schema_file, flatten_mode=self.flatten_mode)
        self.collection = self.get_collection()

        # If we have any records in the collection, use bulk_update with mongo bulk upsert
        # Otherwise use batch insert (20% faster than using bulk insert())
        if self.collection.find_one():
            self.bulk_update(ke_data)
        else:
            self.batch_insert(ke_data)

        self.mark_complete()

    def mark_complete(self):
        # Move the file to the archive directory (if specified)
        try:
            archive_dir = config.get('keemu', 'archive_dir')
            self.input().move(os.path.join(archive_dir, self.input().file_name))
        except NoOptionError:
            # Allow archive dir to be none
            pass

        # And mark the object as complete
        self.output().touch()

    def bulk_update(self, ke_data):
        bulk = self.collection.initialize_unordered_bulk_op()
        count = 0

        for record in self.iterate_data(ke_data):
            # Find and replace doc - inserting if it doesn't exist
            bulk.find({'_id': record['_id']}).upsert().replace_one(record)
            count += 1

            # Bulk ops can have out of memory errors (I'm getting them for ~400,000+ bulk ops)
            # So execute the bulk op in stages, when bulk_op_size is reached
            if count % self.bulk_op_size == 0:
                log.info('Executing bulk op')
                bulk.execute()
                bulk = self.collection.initialize_unordered_bulk_op()

        try:
            bulk.execute()
        except InvalidOperation:
            # If we do not have any records to execute, ignore the error
            # They have been executed in ln124
            pass

    def batch_insert(self, ke_data):

        def _insert(batch):
            try:
                self.collection.insert(batch)
            except DuplicateKeyError:
                # Duplicate key error - KE export does duplicate some records
                # So switch to bulk upsert for this operation
                log.error('Duplicate key error - switching to upsert')
                bulk = self.collection.initialize_unordered_bulk_op()
                for batch_record in batch:
                    bulk.find({'_id': batch_record['_id']}).upsert().replace_one(batch_record)
                bulk.execute()

        batch = []

        for record in self.iterate_data(ke_data):
            if self.batch_size:
                batch.append(record)

                # If the batch length equals the batch size, commit and clear the batch
                if len(batch) % self.batch_size == 0:
                    log.info('Submitting batch')
                    _insert(batch)
                    batch = []
            else:
                self.collection.insert(record)

        # Add any records remaining in the batch
        if batch:
            _insert(batch)

    def iterate_data(self, ke_data):
        """
        Iterate through the data
        @return:
        """
        for record in ke_data:
            status = ke_data.get_status()
            if status:
                log.info(status)

            # Use the IRN as _id
            record['_id'] = record['irn']

            try:
                # Do not process if unprocessed flag is set
                if not self.unprocessed:
                    record = self.process_record(record)
            except InvalidRecordException:
                continue
            else:
                yield record

    def process_record(self, record):
        # Keep the IRN but cast as string, so we can use it in $concat
        record['irn'] = str(record['irn'])

        # Add the date of the export file
        record['exportFileDate'] = self.date

        return record

    def output(self):
        return MongoTarget(database=self.database, update_id=self.update_id())

    def update_id(self):
        """This update id will be a unique identifier for this insert on this collection."""
        return self.task_id

    def on_success(self):
        """
        On completion, add indexes
        @return: None
        """
        self.collection = self.get_collection()
        log.info("Adding exportFileDate index")
        self.collection.ensure_index('exportFileDate')
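# Minimal sketch of the staged bulk-upsert pattern bulk_update() uses, outside the
# task class (the localhost connection and database name here are placeholders):
from pymongo import MongoClient
from pymongo.errors import InvalidOperation

collection = MongoClient('localhost')['keemu']['ecatalogue']
bulk = collection.initialize_unordered_bulk_op()
for count, record in enumerate([{'_id': 1, 'irn': '1'}, {'_id': 2, 'irn': '2'}], start=1):
    bulk.find({'_id': record['_id']}).upsert().replace_one(record)
    if count % 100000 == 0:
        # Flush periodically so very large bulk ops don't run out of memory
        bulk.execute()
        bulk = collection.initialize_unordered_bulk_op()
try:
    bulk.execute()
except InvalidOperation:
    # Nothing left to flush
    pass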
def output(self):
    export_dir = config.get('keemu', 'export_dir')
    return KEFileTarget(export_dir, self.module, self.date, self.file_extension)
def mongo_client_db(database=config.get('mongo', 'database'), host=config.get('mongo', 'host')):
    return MongoClient(host)[database]
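# Hedged usage example: mongo_client_db() returns a pymongo Database, so
# collections can be accessed by name (the query value is illustrative only).
db = mongo_client_db()
record = db['ecatalogue'].find_one({'exportFileDate': 20140814})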