예제 #1
0
    def __init__(self, database, update_id):
        """
        Record the update id, open the Mongo database and resolve the
        marker collection.

        @param database: handed unchanged to mongo_client_db()
        @param update_id: kept on the instance as self.update_id
        """
        self.update_id = update_id

        # Open the database handle
        self.db = mongo_client_db(database)

        # The marker collection is named after the postgres table
        resolve_collection = self.get_collection
        self.marker_collection = resolve_collection(self.marker_collection_name)
예제 #2
0
    def __init__(self, database, update_id):
        """
        Initialise the instance state.

        Stores update_id, connects to the database through
        mongo_client_db() and looks up the marker collection by name.

        @param database: argument forwarded to mongo_client_db()
        @param update_id: value stored as self.update_id
        """
        self.update_id = update_id
        # Connection to the database
        self.db = mongo_client_db(database)
        # Collection name is taken from the postgres table name
        lookup = self.get_collection
        name = self.marker_collection_name
        self.marker_collection = lookup(name)
예제 #3
0
    def run(self):
        """
        Flag catalogue records whose DarScientificName is a CITES species.

        Every matching document gets cites=True, and the number reported
        under 'nModified' in the update result is logged.
        """
        db = mongo_client_db()
        collection_name = MongoCatalogueTask(date=None).collection_name

        # Match on any of the known CITES species names
        selector = {'DarScientificName': {'$in': get_cites_species()}}
        # Set cites=true flag
        change = {'$set': {'cites': True}}

        write_result = db[collection_name].update(selector, change, multi=True)
        log.info('Updated %s catalogue records as CITES', write_result['nModified'])
예제 #4
0
def main():
    """
    Print, for each listed Dar* field, the number of ecatalogue records
    in which that field exists.

    Output is one line per field in the form '<field>:<TAB><count>'
    followed by a carriage return.
    """
    # Setup MongoDB
    mongo_db = mongo_client_db()

    # Fields to report on, one per line
    fields = [
        'DarLocality',
        'DarVerbatimElevation',
        'DarInfraspecificRank',
        'DarDayIdentified',
        'DarMinimumDepthInMeters',
        'DarMonthIdentified',
        'DarMaximumDepthInMeters',
        'DarIndividualCount',
        'DarMaximumDepth',
        'DarVerbatimCollectingDate',
        'DarTissues',
        'DarScientificNameAuthorYear',
        'DarVerbatimLongitude',
        'DarNotes',
        'DarCollectorNumber',
        'DarGenBankNum',
        'DarIdentificationModifier',
        'DarMinimumDepth',
        'DarLatLongComments',
        'DarIsland',
        'DarPreviousCatalogNumber',
        'DarEndTimeOfDay',
        'DarYearCollected',
        'DarVerbatimDepth',
        'DarCatalogNumber',
        'DarOriginalCoordinateSystem',
        'DarScientificNameAuthor',
        'DarOtherCatalogNumbers',
        'DarSubgenus',
        'DarFieldNumber',
        'DarYearIdentified',
        'DarRelationshipType',
        'DarEndMonthCollected',
        'DarInfraspecificEpithet',
        'DarAgeClass',
        'DarRemarks',
        'DarGeodeticDatum',
        'DarKingdom',
        'DarStart_EndCoordinatePrecision',
        'DarCoordinatePrecision',
        'DarStartTimeOfDay',
        'DarSpecificEpithet',
        'DarDecimalLongitude',
        'DarLatitude',
        'DarCitation',
        'DarLifeStage',
        'DarFamily',
        'DarStartYearCollected',
        'DarEndLatitude',
        'DarBasisOfRecord',
        'DarMaximumElevation',
        'DarStartLatitude',
        'DarCounty',
        'DarRelatedInformation',
        'DarObservedIndividualCount',
        'DarSource',
        'DarRecordURL',
        'DarIslandGroup',
        'DarWaterBody',
        'DarCoordinateUncertaintyInMeter',
        'DarSex',
        'DarStartDayCollected',
        'DarVerbatimLatitude',
        'DarGenus',
        'DarTimeOfDay',
        'DarImageURL',
        'DarDecimalLatitude',
        'DarTypeStatus',
        'DarStateProvince',
        'DarBoundingBox',
        'DarGeorefMethod',
        'DarScientificName',
        'DarCollectionCode',
        'DarLongitude',
        'DarGlobalUniqueIdentifier',
        'DarInstitutionCode',
        'DarRelatedCatalogItem',
        'DarTimeCollected',
        'DarPreparations',
        'DarContinent',
        'DarEndJulianDay',
        'DarGMLFeature',
        'DarCountry',
        'DarJulianDay',
        'DarSubspecies',
        'DarFieldNotes',
        'DarMaximumElevationInMeters',
        'DarContinentOcean',
        'DarIdentificationQualifier',
        'DarTimeZone',
        'DarEndLongitude',
        'DarHorizontalDatum',
        'DarClass',
        'DarRelatedCatalogItems',
        'DarPhylum',
        'DarStartMonthCollected',
        'DarHigherGeography',
        'DarDepthRange',
        'DarDateLastModified',
        'DarCollector',
        'DarObservedWeight',
        'DarMinimumElevationInMeters',
        'DarHigherTaxon',
        'DarStartJulianDay',
        'DarDayCollected',
        'DarTemperature',
        'DarEndDayCollected',
        'DarStartLongitude',
        'DarCatalogNumberNumeric',
        'DarOrder',
        'DarMinimumElevation',
        'DarPreparationType',
        'DarEndYearCollected',
        'DarMonthCollected',
        'DarIdentifiedBy',
        'DarCatalogNumberText',
        'DarSpecies'
    ]

    for field_name in fields:
        # Select the records in which this field is present at all
        present = mongo_db.ecatalogue.find({field_name: {'$exists': 1}})
        line = '{0}:\t{1}\r'.format(field_name, present.count())
        print(line)
예제 #5
0
def get_cites_species():
    """
    Read the CITES species names stored in mongo

    The names are expected to have been downloaded beforehand from
    http://checklist.cites.org/#/en as JSON and imported with:

    mongoimport --db keemu --collection cites --type json --file /vagrant/exports/Index_of_CITES_Species_2014-10-17\ 17-34.json --jsonArray

    Only needs running when mongo is rebuilt - new records are marked as CITES on import

    @return: list of full_name values, each encoded as UTF-8
    """
    # Skip documents whose full_name is null, and fetch only that field
    query = {'full_name': {'$ne': None}}
    projection = {'full_name': 1}

    species_names = []
    for document in mongo_client_db()[CITES_COLLECTION].find(query, projection):
        species_names.append(document['full_name'].encode('utf8'))
    return species_names
예제 #6
0
    def run(self):
        """
        Mark catalogue records as CITES.

        Looks up the catalogue collection name, fetches the CITES species
        names and sets cites=True on every record whose DarScientificName
        is one of them. The 'nModified' figure from the update result is
        written to the log.
        """
        mongo_db = mongo_client_db()
        catalogue_name = MongoCatalogueTask(date=None).collection_name
        species_names = get_cites_species()

        # Set cites=true flag
        outcome = mongo_db[catalogue_name].update(
            {'DarScientificName': {'$in': species_names}},
            {'$set': {'cites': True}},
            multi=True)

        modified = outcome['nModified']
        log.info('Updated %s catalogue records as CITES', modified)
예제 #7
0
    def ensure_multimedia(self, df, multimedia_field):
        """
        Rewrite a column of multimedia IRNs as JSON describing the
        publishable images those IRNs refer to.

        The column is replaced in place on df. Each cell starts as a
        ';'-separated string of integer IRNs and ends as either a JSON
        array string of image records or np.nan when none of its IRNs
        resolve to a deliverable image.

        @param df: data frame holding the multimedia column
        @param multimedia_field: name of the column to rewrite
        @return: None
        """
        mongo_client = mongo_client_db()

        # The multimedia field contains IRNS of all items - not just images
        # So we need to look up the IRNs against the multimedia record to get the mime type
        # And filter out non-image mimetypes we do not support

        def parse_irns(value):
            # Blank fragments (e.g. left by a trailing ';') are dropped
            return [int(part.strip()) for part in value.split(';') if part.strip()]

        # Convert associatedMedia field to a list
        df[multimedia_field] = df[multimedia_field].apply(parse_irns)

        # Get a unique list of IRNS
        unique_multimedia_irns = list(
            set(itertools.chain.from_iterable(df[multimedia_field].values)))

        # Get the multimedia records that are flagged as publishable
        # It's not enough to just check for the derived image heights - some of these are tiffs etc., and undeliverable
        # Embargo dates are not part of the query; they are checked per record below
        cursor = mongo_client['emultimedia'].find(
            {
                '_id': {'$in': unique_multimedia_irns},
                'AdmPublishWebNoPasswordFlag': 'Y',
                'GenDigitalMediaId': {'$ne': 0}
            },
            {
                'GenDigitalMediaId': 1,
                'MulTitle': 1,
                'MulMimeFormat': 1,
                'NhmSecEmbargoDate': 1,
                'NhmSecEmbargoExtensionDate': 1
            })

        # Embargo dates are compared as "%Y-%m-%d" strings; take one snapshot
        # of today so every record is judged against the same date
        today = datetime.datetime.today().strftime("%Y-%m-%d")

        # Create a dictionary of multimedia records, keyed by _id
        multimedia_dict = {}

        for record in cursor:

            if record['GenDigitalMediaId'] == 'Pending':
                continue

            # If the embargo extension date exists and is in the future, then skip
            if 'NhmSecEmbargoExtensionDate' in record:
                extension_date = record['NhmSecEmbargoExtensionDate']
                if extension_date > 0 and extension_date > today:
                    continue

            # For remaining records, if the original embargo date exists and is in the future then skip
            # NOTE(review): unlike the extension date this key is read unguarded,
            # so a record without it raises KeyError - confirm it is always populated
            embargo_date = record['NhmSecEmbargoDate']
            if embargo_date > 0 and embargo_date > today:
                continue

            entry = {
                'identifier': 'http://www.nhm.ac.uk/services/media-store/asset/{mam_id}/contents/preview'.format(
                    mam_id=record['GenDigitalMediaId'],
                ),
                'format': 'image/%s' % record['MulMimeFormat'],
                "type": "StillImage",
                "license": "http://creativecommons.org/licenses/by/4.0/",
                "rightsHolder": "The Trustees of the Natural History Museum, London"
            }

            # Add the title if it exists
            title = record.get('MulTitle', None)
            if title:
                entry['title'] = title

            multimedia_dict[record['_id']] = entry

        def multimedia_to_json(irns):
            """
            Convert multimedia fields to json
            Loop through all the irns in the field, keeping those present in multimedia_dict
            (If one is missing, the image might not be publishable / be in the correct format)
            @param irns: list of integer IRNs
            @return: json string, or np.nan when no IRN is usable
            """
            multimedia_records = [
                multimedia_dict[irn] for irn in irns if irn in multimedia_dict
            ]
            return json.dumps(multimedia_records) if multimedia_records else np.nan

        # And finally update the associatedMedia field with the JSON for the usable IRNs
        df[multimedia_field] = df[multimedia_field].apply(multimedia_to_json)
예제 #8
0
def main():
    """
    Probe one published image per insertion date and report the dates
    on which the image request fails.

    Walks every day from the start date '3/1/2014' up to today, picks at
    most one matching emultimedia record for that day and issues a HEAD
    request for it. A response whose content type is image/png is
    treated as a failure. A per-date summary and the running total are
    printed.
    """
    # Setup MongoDB
    mongo_db = mongo_client_db()

    query = {
        'MulMimeFormat': {'$in': MULTIMEDIA_FORMATS},
        'DocHeight': {'$exists': True},
        'DocWidth': {'$exists': True},
        'AdmPublishWebNoPasswordFlag': 'Y',
        'MulMimeType': 'image'
    }

    url_template = 'http://www.nhm.ac.uk/emu-classes/class.EMuMedia.php?irn={_id}&image=yes&width={width}&height={height}'

    # Failure count per date, in date order; None marks a date with no images
    status = OrderedDict()
    total_failed = 0

    for day in pd.date_range(start='3/1/2014', end=pd.datetime.today()):

        date_str = str(day.date())
        query['AdmDateInserted'] = date_str
        print('Checking date %s' % query['AdmDateInserted'])

        status[date_str] = 0

        # Only a single sample record is fetched per date
        results = mongo_db.emultimedia.find(query).limit(1)

        if not results.count():
            print('No images for %s' % date_str)
            status[date_str] = None
            continue

        for record in results:
            url = url_template.format(
                _id=record['_id'],
                width=get_max_dimension(record['DocWidth']),
                height=get_max_dimension(record['DocHeight'])
            )

            response = requests.head(url)

            # We only request jpeg images - but the error image is returned in png
            # So if image type == png, image request has failed
            if response.headers['content-type'] == 'image/png':
                print('Failed: %s' % date_str)
                # Count failures
                status[date_str] += 1
                # NOTE(review): pymongo's Cursor.count() ignores limit() unless
                # with_limit_and_skip=True, so this presumably adds every
                # matching record for the date - confirm that is intended
                total_failed += results.count()
                print('Total failed: %s' % total_failed)

            # Pause so we don't kill the server
            time.sleep(0.5)

    for checked_date, failures in status.items():
        print('%s: %s' % (checked_date, failures))

    print('----------------')
    print('Total failed: %s' % total_failed)
예제 #9
0
def main():
    """
    Report how many ecatalogue records carry each of the listed Dar* fields.

    For every field name a '$exists' query is run against the ecatalogue
    collection and '<field>:<TAB><count>' plus a carriage return is printed.
    """
    # Setup MongoDB
    mongo_db = mongo_client_db()

    # Field names to check, in reporting order
    fields = [
        'DarLocality', 'DarVerbatimElevation', 'DarInfraspecificRank',
        'DarDayIdentified', 'DarMinimumDepthInMeters', 'DarMonthIdentified',
        'DarMaximumDepthInMeters', 'DarIndividualCount', 'DarMaximumDepth',
        'DarVerbatimCollectingDate', 'DarTissues',
        'DarScientificNameAuthorYear', 'DarVerbatimLongitude', 'DarNotes',
        'DarCollectorNumber', 'DarGenBankNum', 'DarIdentificationModifier',
        'DarMinimumDepth', 'DarLatLongComments', 'DarIsland',
        'DarPreviousCatalogNumber', 'DarEndTimeOfDay', 'DarYearCollected',
        'DarVerbatimDepth', 'DarCatalogNumber', 'DarOriginalCoordinateSystem',
        'DarScientificNameAuthor', 'DarOtherCatalogNumbers', 'DarSubgenus',
        'DarFieldNumber', 'DarYearIdentified', 'DarRelationshipType',
        'DarEndMonthCollected', 'DarInfraspecificEpithet', 'DarAgeClass',
        'DarRemarks', 'DarGeodeticDatum', 'DarKingdom',
        'DarStart_EndCoordinatePrecision', 'DarCoordinatePrecision',
        'DarStartTimeOfDay', 'DarSpecificEpithet', 'DarDecimalLongitude',
        'DarLatitude', 'DarCitation', 'DarLifeStage', 'DarFamily',
        'DarStartYearCollected', 'DarEndLatitude', 'DarBasisOfRecord',
        'DarMaximumElevation', 'DarStartLatitude', 'DarCounty',
        'DarRelatedInformation', 'DarObservedIndividualCount', 'DarSource',
        'DarRecordURL', 'DarIslandGroup', 'DarWaterBody',
        'DarCoordinateUncertaintyInMeter', 'DarSex', 'DarStartDayCollected',
        'DarVerbatimLatitude', 'DarGenus', 'DarTimeOfDay', 'DarImageURL',
        'DarDecimalLatitude', 'DarTypeStatus', 'DarStateProvince',
        'DarBoundingBox', 'DarGeorefMethod', 'DarScientificName',
        'DarCollectionCode', 'DarLongitude', 'DarGlobalUniqueIdentifier',
        'DarInstitutionCode', 'DarRelatedCatalogItem', 'DarTimeCollected',
        'DarPreparations', 'DarContinent', 'DarEndJulianDay', 'DarGMLFeature',
        'DarCountry', 'DarJulianDay', 'DarSubspecies', 'DarFieldNotes',
        'DarMaximumElevationInMeters', 'DarContinentOcean',
        'DarIdentificationQualifier', 'DarTimeZone', 'DarEndLongitude',
        'DarHorizontalDatum', 'DarClass', 'DarRelatedCatalogItems',
        'DarPhylum', 'DarStartMonthCollected', 'DarHigherGeography',
        'DarDepthRange', 'DarDateLastModified', 'DarCollector',
        'DarObservedWeight', 'DarMinimumElevationInMeters', 'DarHigherTaxon',
        'DarStartJulianDay', 'DarDayCollected', 'DarTemperature',
        'DarEndDayCollected', 'DarStartLongitude', 'DarCatalogNumberNumeric',
        'DarOrder', 'DarMinimumElevation', 'DarPreparationType',
        'DarEndYearCollected', 'DarMonthCollected', 'DarIdentifiedBy',
        'DarCatalogNumberText', 'DarSpecies'
    ]

    report_format = '{0}:\t{1}\r'
    for name in fields:
        # Records that have this field set
        having_field = mongo_db.ecatalogue.find({name: {'$exists': 1}})
        print(report_format.format(name, having_field.count()))
예제 #10
0
    def ensure_multimedia(self, df, multimedia_field):
        """
        Convert a column of multimedia IRNs into JSON for the images that
        may be published.

        df is modified in place: each cell of the column goes from a
        ';'-separated string of integer IRNs to a JSON array string of
        image records, or to np.nan when no IRN in the cell maps to a
        deliverable image.

        @param df: data frame containing the multimedia column
        @param multimedia_field: name of the column to convert
        @return: None
        """
        mongo_client = mongo_client_db()

        # The multimedia field contains IRNS of all items - not just images
        # So we need to look up the IRNs against the multimedia record to get the mime type
        # And filter out non-image mimetypes we do not support

        # Convert associatedMedia field to a list, ignoring empty fragments
        df[multimedia_field] = df[multimedia_field].apply(
            lambda x: [int(z.strip()) for z in x.split(';') if z.strip()])

        # Get a unique list of IRNS
        unique_multimedia_irns = list(set(itertools.chain.from_iterable(df[multimedia_field].values)))

        # Fetch the multimedia records that are flagged as publishable
        # It's not enough to just check for the derived image heights - some of these are tiffs etc., and undeliverable
        # Embargo dates are fetched here and evaluated per record further down
        cursor = mongo_client['emultimedia'].find(
            {
                '_id': {'$in': unique_multimedia_irns},
                'AdmPublishWebNoPasswordFlag': 'Y',
                'GenDigitalMediaId': {'$ne': 0}
            },
            {
                'GenDigitalMediaId': 1,
                'MulTitle': 1,
                'MulMimeFormat': 1,
                'NhmSecEmbargoDate': 1,
                'NhmSecEmbargoExtensionDate': 1
            }
        )

        # Computed once so all records are compared against the same
        # "%Y-%m-%d" string instead of re-formatting the date per record
        today = datetime.datetime.today().strftime("%Y-%m-%d")

        # Create a dictionary of multimedia records, keyed by _id
        multimedia_dict = {}

        for record in cursor:

            if record['GenDigitalMediaId'] == 'Pending':
                continue

            # If the embargo extension date exists and is in the future, then skip
            if 'NhmSecEmbargoExtensionDate' in record:
                extension_date = record['NhmSecEmbargoExtensionDate']
                if extension_date > 0 and extension_date > today:
                    continue

            # For remaining records, if the original embargo date exists and is in the future then skip
            # NOTE(review): this key is indexed directly, so a record lacking it
            # raises KeyError - confirm every record carries NhmSecEmbargoDate
            embargo_date = record['NhmSecEmbargoDate']
            if embargo_date > 0 and embargo_date > today:
                continue

            multimedia_dict[record['_id']] = {
                'identifier': 'http://www.nhm.ac.uk/services/media-store/asset/{mam_id}/contents/preview'.format(
                    mam_id=record['GenDigitalMediaId'],
                ),
                'format': 'image/%s' % record['MulMimeFormat'],
                "type": "StillImage",
                "license": "http://creativecommons.org/licenses/by/4.0/",
                "rightsHolder": "The Trustees of the Natural History Museum, London"
            }

            # Add the title if it exists
            title = record.get('MulTitle', None)
            if title:
                multimedia_dict[record['_id']]['title'] = title

        def multimedia_to_json(irns):
            """
            Convert multimedia fields to json
            Loop through all the irns in the field, keeping those whose key exists in multimedia_dict
            (If it does not, the image might not be publishable / be in the correct format)
            @param irns: list of integer IRNs
            @return: json string, or np.nan when nothing is usable
            """

            multimedia_records = [multimedia_dict[irn] for irn in irns if irn in multimedia_dict]
            return json.dumps(multimedia_records) if multimedia_records else np.nan

        # And finally update the associatedMedia field with the JSON for the IRNs found in multimedia_dict
        df[multimedia_field] = df[multimedia_field].apply(multimedia_to_json)