def main():
    """Entry point: load geogratis.ini, connect to the portal CKAN and the
    local database, and parse CLI arguments (including an optional
    --since YYYY-MM-DD filter).

    NOTE(review): this chunk appears truncated -- the paging loop that the
    comments below describe is not visible here; confirm against the full
    file before relying on this summary.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    args = argparser.parse_args()  # argparser is defined at module level
    factory = MetadataDatasetModelGeogratisFactory()
    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')
    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(
        remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')
    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's
    session = connect_to_database()
    last_id = 0
    scan_date = None
    setting = get_setting('last_conversion_run')
    if args.since != '':
        try:
            # Parse the YYYY-MM-DD string into a datetime via struct_time
            scan_date = datetime.fromtimestamp(
                time.mktime(time.strptime(args.since, '%Y-%m-%d')))
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        except Exception, e:
            logging.error(e.message)
            session.close()
            exit()
def main(): ini_config = ConfigParser() ini_config.read('geogratis.ini') remote_url = ini_config.get('ckan', 'ckan.remote_portal') api_key = ini_config.get('ckan', 'ckan.api_key') user_agent = ini_config.get('ckan', 'ckan.user_agent') ckansite = ckanapi.RemoteCKAN(remote_url, apikey=api_key, user_agent=user_agent) session = connect_to_database() last_id = 0 while True: package_stream = session.query(Packages).filter(Packages.id > last_id) package_stream = package_stream.filter(Packages.status.in_(["new", "update"])).\ order_by(Packages.id).all() if len(package_stream) == 0: break else: for r in package_stream: sleep(60) print u'Processing dataset {0}'.format(r.id) try: new_pkg_dict = json.loads(r.ckan_json.decode('utf-8')) except AttributeError as a: print u'AttributeError {0}'.format(unicode(a)) continue is_new = False try: pkg_info = ckansite.action.package_show(id=r.uuid) except ckanapi.NotFound: is_new = True try: if is_new: ckansite.call_action('package_create', new_pkg_dict) else: ckansite.call_action('package_update', new_pkg_dict) r.status = 'posted' r.status_message = '' r.latest_posted = datetime.now() add_record(session, r) continue except ckanapi.NotAuthorized as e: print u'Not Authorized {0}'.format(unicode(e)) continue except ckanapi.CKANAPIError as c: r.status = 'error' r.status_message = u'CKAN API error {0}'.format(unicode(c)) add_record(session, r) print r.status_message continue except ckanapi.errors.ValidationError as v: r.status = 'error' r.status_message = u'Validation error {0}'.format(unicode(v.error_dict)) add_record(session, r) print r.status_message continue break
def main():
    """Entry point: load geogratis.ini, connect to the portal CKAN and the
    local database, and parse CLI arguments (including an optional
    --since YYYY-MM-DD filter).

    NOTE(review): this chunk appears truncated -- the paging loop that the
    comments below describe is not visible here; confirm against the full
    file before relying on this summary.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    args = argparser.parse_args()  # argparser is defined at module level
    factory = MetadataDatasetModelGeogratisFactory()
    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')
    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(remote_ckan_url,
                             user_agent='converter/1.0 http://open.canada.ca/data')
    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's
    session = connect_to_database()
    last_id = 0
    scan_date = None
    setting = get_setting('last_conversion_run')
    if args.since != '':
        try:
            # Parse the YYYY-MM-DD string into a datetime via struct_time
            scan_date = datetime.fromtimestamp(
                time.mktime(time.strptime(args.since, '%Y-%m-%d')))
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        except Exception, e:
            logging.error(e.message)
            session.close()
            exit()
def main(): ini_config = ConfigParser() ini_config.read('geogratis.ini') remote_ckan_url = ini_config.get('ckan', 'ckan.url') factory = MetadataDatasetModelGeogratisFactory() # Create CKAN API connector to the portal ckan_portal = RemoteCKAN( remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data') # Page through the datasets on session = connect_to_database() last_id = 0 try: while True: ckan_records = ckan_portal.action.package_search( q= 'extras_collection:geogratis AND extras_org_title_at_publication:"Natural Resources Canada"', rows=100, start=last_id) if not ckan_records: break else: for r in ckan_records['results']: rp = session.query(GeogratisRecord).filter( GeogratisRecord.uuid == r['name']).all() if not rp: print r['name'] last_id += 100 except Exception, e: print >> stderr, e.message pass
def load_naps(self): ns = Namespaces() gmd = ns.get_namespace('gmd') session = connect_to_database() for napid in self.napids: print '{0}Full NAP Record for {1}{2}'.format( Fore.GREEN, Fore.CYAN, napid) self.csw.getrecordbyid(id=[napid], outputschema=gmd) ec_rec = find_record_by_uuid(session, napid, query_class=ECRecord) if ec_rec is None: ec_rec = ECRecord( uuid=self.csw.records[napid].identifier, title=self.csw.records[napid].identification.title, state='active', nap_record=self.csw.records[napid].xml, csw_scanned=datetime.now().isoformat()) else: ec_rec.title = self.csw.records[napid].identification.title, ec_rec.state = 'active', ec_rec.nap_record = self.csw.records[napid].xml, ec_rec.csw_scanned = datetime.now().isoformat() add_record(session, ec_rec) session.close_all()
def main(): factory = MetadataDatasetModelGeogratisFactory() # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential # record ID's session = connect_to_database() last_id = 0 while True: known_records = find_all_records(session, query_limit=10, limit_id=last_id) if len(known_records) == 0: break else: for geo_rec in known_records: print 'ID: {0} UUID: {1}'.format(geo_rec.id, geo_rec.uuid) try: # In order to avoid multiple updates, only allow for one instance of an update per uuid. # Previous updates are overridden with the latest update pkg_update = find_record_by_uuid(session, geo_rec.uuid, query_class=Packages) if pkg_update is None: pkg_update = Packages() pkg_update.status = 'new' if geo_rec.state == 'active': ckan_record = factory.create_model_ckan(geo_rec.uuid) geogratis_record = factory.create_model_geogratis(geo_rec.uuid) pkg_update.uuid = geo_rec.uuid # Set the dataset for immediate release on the Registry geogratis_record.portal_release_date = time.strftime("%Y-%m-%d") geogratis_record.ready_to_publish = True if not ckan_record is None: if not geogratis_record.equals(ckan_record): diffs = geogratis_record.compare(ckan_record, self_label="Geogratis", other_label="CKAN") pkg_update.differences = "\n".join(item for item in diffs) geo_rec.od_status = 'Needs Update' pkg_update.ckan_json = json.dumps(geogratis_record.as_dict()) pkg_update.status = 'update' else: geo_rec.od_status = 'Current' else: pkg_update.ckan_json = json.dumps(geogratis_record.as_dict()) geo_rec.od_status = 'New Record' else: geo_rec.od_status = 'Ineligible' pkg_update.last_comparison = datetime.now() add_record(session, geo_rec) if geo_rec.od_status == 'New Record' or geo_rec.od_status == "Needs Update": add_record(session, pkg_update) last_id = geo_rec.id except Exception, e: logging.error(e.message)
def create_model(self, uuid):
    """Build a dataset model from the stored Geogratis JSON for *uuid*.

    Loads the English and French JSON blobs from the local database and
    hands the pair to convert_geogratis_json. The database session is
    always closed, even if the lookup or the JSON parsing fails.
    """
    db_session = connect_to_database()
    try:
        record = find_record_by_uuid(db_session, uuid)
        english = json.loads(record.json_record_en)
        french = json.loads(record.json_record_fr)
    finally:
        db_session.close()
    # Even if the French or English record is missing, create an object with
    return self.convert_geogratis_json(english, french)
def create_model_geogratis(self, uuid):
    """Build the Geogratis-side dataset model for *uuid*.

    Loads the stored English and French Geogratis JSON from the local
    database and converts the pair with convert_geogratis_json.
    NOTE(review): if no row exists for *uuid*, or either json_record
    column is None/invalid, the json.loads calls raise -- confirm that
    callers handle this.
    """
    session = connect_to_database()
    try:
        geogratis_rec = find_record_by_uuid(session, uuid)
        geo_rec_en = json.loads(geogratis_rec.json_record_en)
        geo_rec_fr = json.loads(geogratis_rec.json_record_fr)
    finally:
        # Always release the session, even when the lookup/parse fails
        session.close()
    # Even if the French or English record is missing, create an object with
    return self.convert_geogratis_json(geo_rec_en, geo_rec_fr)
def main(since, dumpfile, scan_type): ini_config = ConfigParser() ini_config.read('harvester.ini') session = connect_to_database() last_id = 0 while True: if args.monitor: last_run_setting = get_setting('last_conversion_' + scan_type) if last_run_setting.setting_value: package_stream = session.query(Packages).filter(Packages.id > last_id).\ filter(Packages.updated > last_run_setting.setting_value).\ filter(Packages.source == scan_type).\ order_by(Packages.id).limit(10).all() else: package_stream = session.query(Packages).filter(Packages.id > last_id).\ filter(Packages.source == scan_type).\ order_by(Packages.id).limit(10).all() elif args.since != '': package_stream = session.query(Packages).filter(Packages.id > last_id).\ filter(Packages.updated > args.since).\ filter(Packages.source == scan_type).\ order_by(Packages.id).limit(10).all() else: package_stream = session.query(Packages).filter(Packages.id > last_id).\ filter(Packages.source == scan_type).\ order_by(Packages.id).limit(10).all() if len(package_stream) == 0: break else: if dumpfile != '': with open(dumpfile, 'a') as dfile: for r in package_stream: print u'Processing dataset {0}'.format(r.id) dfile.write(r.ckan_json + '\n') last_id = r.id else: for r in package_stream: print r.ckan_json + '\n' last_id = r.id session.close()
def main(since, scan_type): now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z') if scan_type == 'gr': factory = MetadataDatasetModelGeogratisFactory() setting = get_setting('last_conversion_gr') query_class = GeogratisRecord if setting is None: setting = Settings() setting.setting_name = 'last_conversion_gr' else: factory = MetadataDatasetModelECFactory() setting = get_setting('last_conversion_ec') query_class = ECRecord if setting is None: setting = Settings() setting.setting_name = 'last_conversion_ec' # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential # record ID's session = connect_to_database() last_id = 0 scan_date = None if since != '': try: scan_date = datetime.fromtimestamp( time.mktime(time.strptime(args.since, '%Y-%m-%d'))) except ValueError: logging.error("Incorrect since date format. Use YYYY-MM-DD") session.close() exit() except Exception, e: logging.error(e.message) session.close() exit()
def main():
    """Print the uuid of every locally-deleted Geogratis record that is
    still present on the Open Data portal.

    With --monitor, only records updated since the last conversion run
    are checked. Rows are paged ten at a time in ascending id order.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(
        remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')
    last_id = 0
    last_run_setting = get_setting('last_conversion_run')
    session = connect_to_database()
    while True:
        if args.monitor:
            geogratis_stream = session.query(GeogratisRecord).filter(GeogratisRecord.id > last_id)\
                .filter(GeogratisRecord.state == 'deleted')\
                .filter(GeogratisRecord.updated > last_run_setting.setting_value)\
                .order_by(GeogratisRecord.id).limit(10).all()
        else:
            geogratis_stream = session.query(GeogratisRecord).filter(GeogratisRecord.id > last_id)\
                .filter(GeogratisRecord.state == 'deleted')\
                .order_by(GeogratisRecord.id).limit(10).all()
        if len(geogratis_stream) == 0:
            break
        else:
            for r in geogratis_stream:
                # Determine if the record is already on the OD portal
                try:
                    ckan_portal.action.package_show(id=r.uuid)
                    # If the record does not exist, then a NotFound exception will be thrown
                    print u'{0}'.format(r.uuid)
                except NotFound, e:
                    # Not on the portal: nothing to report for this uuid
                    pass
                last_id = r.id
def main(): ini_config = ConfigParser() ini_config.read('geogratis.ini') remote_ckan_url = ini_config.get('ckan', 'ckan.url') # Create CKAN API connector to the portal ckan_portal = RemoteCKAN(remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data') last_id = 0 last_run_setting = get_setting('last_conversion_run') session = connect_to_database() while True: if args.monitor: geogratis_stream = session.query(GeogratisRecord).filter(GeogratisRecord.id > last_id)\ .filter(GeogratisRecord.state == 'deleted')\ .filter(GeogratisRecord.updated > last_run_setting.setting_value)\ .order_by(GeogratisRecord.id).limit(10).all() else: geogratis_stream = session.query(GeogratisRecord).filter(GeogratisRecord.id > last_id)\ .filter(GeogratisRecord.state == 'deleted')\ .order_by(GeogratisRecord.id).limit(10).all() if len(geogratis_stream) == 0: break else: for r in geogratis_stream: # Determine if the record is already on the OD portal try: ckan_portal.action.package_show(id=r.uuid) # If the record does not exist, then a NotFound exception will be thrown print u'{0}'.format(r.uuid) except NotFound, e: pass last_id = r.id
dest='outfile', help='Write extracted CKAN JSONL to this file') argparser.add_argument('-m', '--maxrecords', action='store', default=0, type=int, dest='maxrecords', help='Maximum number of records to retrieve. 0 means retrieve all') argparser.add_argument('-n', '--newonly', action='store_true', dest='newonly', default=False, help='Only extract new records') args = argparser.parse_args() session = connect_to_database() last_id = 0 jfile = open(args.outfile, mode='w') rec_count = 1 while True: known_records = find_all_records(session, query_class=Packages, query_limit=10, limit_id=last_id) if len(known_records) == 0: break else: for r in known_records: if args.newonly and r.status == 'update': continue if (r.status == 'new' or r.status == 'update') and r.package is not None: print >> jfile, r.package rec_count += 1 if 0 < args.maxrecords < rec_count:
def main(since='', start_index='', monitor=False): geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?alt=json&max-results=100' monitor_setting = get_setting(u'monitor_link') if monitor: if monitor_setting.setting_value is None: geog_url =\ 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min=2001-01-01&alt=json&max-results=100' else: geog_url = monitor_setting.setting_value elif since != '': geog_url =\ 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min={0}&alt=json&max-results=100'.format(since) elif start_index != '': geog_url =\ 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst/?start-index={0}&alt=json&max-results=100'.\ format(start_index) print ('{0}Scanning: {1}{2}'.format(Fore.GREEN, Fore.BLUE, geog_url)) r = requests.get(geog_url) logging.info('HTTP Response Status {0}'.format(r.status_code)) session = None try: session = connect_to_database() # Get the first page of the feed if r.status_code == 200: feed_page = r.json() # Save the monitor link for future use monitor_link = _get_link(feed_page, 'monitor') if monitor_link != '': monitor_setting.setting_value = monitor_link save_setting(monitor_setting) print "{0}Next Monitor Link: {1}{2}".format(Fore.YELLOW, Fore.BLUE, monitor_setting.setting_value) next_link = _get_link(feed_page) print ('{0}{1} Records Found'.format(Fore.BLUE, feed_page['count'])) if 'products' in feed_page: for product in feed_page['products']: try: save_geogratis_record(session, product['id']) except Exception, e: logging.error('{0} failed to load'.format(product['id'])) logging.error(e) # Keep polling until exhausted while next_link != '': geog_url = next_link r = requests.get(geog_url) feed_page = r.json() next_link = _get_link(feed_page) print '{0}Next page link: {1}{2}'.format(Fore.YELLOW, Fore.BLUE, next_link) if 'products' in feed_page: for product in feed_page['products']: # Don't crash on every call - log the error and continue try: save_geogratis_record(session, product['id']) except Exception, 
e: logging.error('{0} failed to load'.format(product['id'])) logging.error(e) save_setting(monitor_setting)
def create_model(self, uuid):
    """Convert a previously harvested NAP (ISO 19115) file into an Open
    Data record.

    Loads the stored NAP XML for *uuid* from the local ECRecord table,
    extracts titles, abstracts, coverage dates, URLs, subjects/topics,
    keywords, bounding box, frequency and resources via XPath, and
    returns a populated MetadataDatasetModel -- or None when any
    mandatory field (titles, keywords, subjects, topics) is missing.
    """
    # Get the previously harvested NAP XML
    session = connect_to_database()
    try:
        ec_rec = find_record_by_uuid(session, uuid, query_class=ECRecord)
        self.root = etree.fromstring(ec_rec.nap_record)
    finally:
        # Always release the session, even when the lookup/parse fails
        session.close()
    ds = MetadataDatasetModel()
    ds.owner_org = 'ec'
    ds.catalog_type = u'Geo Data | G\u00e9o'
    self.valid = True  # flipped to False when a mandatory field is missing
    try:
        # Boilerplate fields for the Open Data record
        ds.author_email = "*****@*****.**"
        ds.language = "eng; CAN | fra; CAN"
        ds.owner_org = "ec"
        ds.department_number = "99"
        ds.catalog_type = u"Geo Data | G\u00e9o"
        ds.license_id = u"ca-ogl-lgo"
        ds.attribution = u"Contains information licensed under the Open Government Licence \u2013 Canada."
        ds.attribution_fra = u"Contient des informations autoris\u00e9es sous la Licence du gouvernement ouvert- Canada"
        ds.ready_to_publish = True
        ds.portal_release_date = ""
        ds.presentation_form = u"Document Digital | Document num\u00e9rique"
        ds.spatial_representation_type = "Vector | Vecteur"
        # Read in NAP fields and populate the OD dataset
        # UUID identifier
        ds.id = self._get_first_text(
            '/gmd:MD_Metadata/gmd:fileIdentifier/gco:CharacterString')
        # Title - English and French (both mandatory)
        ds.title = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gco:CharacterString'
        )
        if len(ds.title) == 0:
            print(ds.id + 'No English Title Given')
            self.valid = False
        ds.title_fra = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:title/gmd:PT_FreeText/gmd:textGroup/gmd:LocalisedCharacterString'
        )
        if len(ds.title_fra) == 0:
            print(ds.id + ' No French Title Given')
            self.valid = False
        # Description - English and French (smart apostrophes normalized)
        ds.notes = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:abstract/gco:CharacterString'
        ).replace(u"\u2019", "'")
        ds.notes_fra = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:abstract/gmd:PT_FreeText/gmd:textGroup/gmd:LocalisedCharacterString'
        ).replace(u"\u2019", "'")
        # Time Period Coverage - Start and End (optional); bare years are
        # expanded to Jan 1 / Dec 31 respectively
        coverage_start_time = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod/gml:beginPosition'
        )
        if not coverage_start_time is None:
            if len(coverage_start_time) == 4:
                coverage_start_time = "%s-01-01" % coverage_start_time
            ds.time_period_coverage_start = coverage_start_time
        coverage_end_time = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimePeriod/gml:endPosition'
        ).strip()
        # The time period coverage end time is not always present - it's not mandatory
        if (coverage_end_time.lower() <> u"ongoing") and (not len(coverage_end_time) == 0):
            if len(coverage_end_time) == 4:
                coverage_end_time = "%s-12-31" % coverage_end_time
            ds.time_period_coverage_end = coverage_end_time
        # Homepage and Endpoint URLs - English and French, scraped out of
        # the free-text supplementalInformation field
        sup_text = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:supplementalInformation/gco:CharacterString'
        )
        urls_en = []
        if len(sup_text) > 0:
            urls_en = self._get_urls_from_string(sup_text)
        sup_text = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:supplementalInformation/gmd:PT_FreeText/gmd:textGroup/gmd:LocalisedCharacterString'
        )
        urls_fr = []
        if len(sup_text) > 0:
            urls_fr = self._get_urls_from_string(sup_text)
        if len(urls_en) > 0:
            ds.url = urls_en[0]
        if len(urls_fr) > 0:
            ds.url_fra = urls_fr[0]
        if len(urls_en) > 1:
            ds.endpoint_url = urls_en[1]
        if len(urls_fr) > 1:
            # NOTE(review): this overwrites ds.url_fra set just above;
            # by symmetry with the English branch it was probably meant
            # to be ds.endpoint_url_fra -- confirm before changing.
            ds.url_fra = urls_fr[1]
        # GoC Subject (mandatory)
        topics_subjects = self._get_gc_subject_category(
            self.root.xpath(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:topicCategory/gmd:MD_TopicCategoryCode',
                namespaces=self.nap_namespaces))
        ds.subject = topics_subjects['subjects']
        if len(ds.subject) == 0:
            self.valid = False
            print(ds.id + ' No GC Subjects')
        # GoC Topic (mandatory)
        ds.topic_category = topics_subjects['topics']
        if len(ds.topic_category) == 0:
            self.valid = False
            print(ds.id + ' No GC Topics')
        # Tags - English and French (mandatory); comma-separated, with
        # semicolons replaced so they can't split keywords downstream
        ds.keywords = []
        keywords_en = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gco:CharacterString'
        )
        keywords_en = keywords_en.replace(';', ' ')
        if len(keywords_en) == 0:
            self.valid = False
            print(ds.id + ' No English Keywords')
        else:
            ds.keywords = keywords_en.split(',')
        ds.keywords_fra = []
        keywords_fr = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:descriptiveKeywords/gmd:MD_Keywords/gmd:keyword/gmd:PT_FreeText/gmd:textGroup/gmd:LocalisedCharacterString'
        )
        # NOTE(review): "/u2019" (forward slash) cannot match the U+2019
        # apostrophe -- this replace is almost certainly a no-op typo for
        # u"\u2019"; confirm before fixing.
        keywords_fr = keywords_fr.replace(u"/u2019", "'").replace(";", " ")
        if len(keywords_fr) == 0:
            self.valid = False
            print(ds.id + ' No French Keywords')
        else:
            ds.keywords_fra = keywords_fr.split(',')
        # Spatial - Convert a bounding box into a GeoJSON polygon
        westLong = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox/gmd:westBoundLongitude/gco:Decimal'
        )
        eastLong = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox/gmd:eastBoundLongitude/gco:Decimal'
        )
        northLat = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox/gmd:northBoundLatitude/gco:Decimal'
        )
        southLat = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:geographicElement/gmd:EX_GeographicBoundingBox/gmd:southBoundLatitude/gco:Decimal'
        )
        # convert these 4 points into a bounding box (closed ring, first
        # point repeated last, counter-clockwise from the NW corner)
        ds.spatial = '{\"type\": \"Polygon\", \"coordinates\": [[[%s, %s], [%s, %s], [%s, %s], [%s, %s], [%s, %s]]]}' % (
            westLong, northLat, eastLong, northLat, eastLong, southLat,
            westLong, southLat, westLong, northLat)
        # Data Published
        ds.date_published = self._get_first_text(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:citation/gmd:CI_Citation/gmd:date/gmd:CI_Date/gmd:date/gco:Date'
        )
        # Browse Graphic File Name -- falls back to the stock thumbnail
        try:
            ds.browse_graphic_url = self._get_first_text(
                '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:graphicOverview/gmd:MD_BrowseGraphic/gmd:fileName/gco:CharacterString'
            )
            if len(ds.browse_graphic_url) == 0:
                ds.browse_graphic_url = '/static/img/canada_default.png'
        except:
            ds.browse_graphic_url = '/static/img/canada_default.png'
        # Frequency (codeListValue attribute, mapped through the helper;
        # empty string selects the helper's default)
        frequency_node = self.root.xpath(
            '/gmd:MD_Metadata/gmd:identificationInfo/gmd:MD_DataIdentification/gmd:resourceMaintenance/gmd:MD_MaintenanceInformation/gmd:maintenanceAndUpdateFrequency/gmd:MD_MaintenanceFrequencyCode/@codeListValue',
            namespaces=self.nap_namespaces)
        if len(frequency_node) > 0:
            ds.maintenance_and_update_frequency = self._get_update_frequency(
                frequency_node[0])
        else:
            ds.maintenance_and_update_frequency = self._get_update_frequency('')
        # Data Series Name, Issue Identification, DOI; These fields are not
        # present in the EC ISO 19115 NAP files.
        ds.data_series_name = ''
        ds.data_series_name_fra = ''
        ds.data_series_issue_identification = ''
        ds.data_series_issue_identification_fra = ''
        ds.digital_object_identifier = ""
        # Load the Resources: one MetadataResourcesModel per onLine node,
        # with language taken from the xlink:role attribute
        resources = self.root.xpath(
            '/gmd:MD_Metadata/gmd:distributionInfo/gmd:MD_Distribution/gmd:transferOptions/gmd:MD_DigitalTransferOptions/gmd:onLine',
            namespaces=self.nap_namespaces)
        od_resources = []
        for resource in resources:
            od_resource = MetadataResourcesModel()
            lang_code = resource.xpath('@xlink:role',
                                       namespaces=self.nap_namespaces)[0]
            if lang_code == "urn:xml:lang:eng-CAN":
                od_resource.language = 'eng; CAN'
            elif lang_code == "urn:xml:lang:fra-CAN":
                od_resource.language = 'fra; CAN'
            else:
                # Unknown role: mark as "no linguistic content"
                od_resource.language = 'zxx; CAN'
            if len(
                    resource.xpath(
                        'gmd:CI_OnlineResource/gmd:name/gco:CharacterString',
                        namespaces=self.nap_namespaces)) > 0:
                od_resource.name = resource.xpath(
                    'gmd:CI_OnlineResource/gmd:name/gco:CharacterString',
                    namespaces=self.nap_namespaces)[0].text
            else:
                # No name given: use a generic label in the right language
                if lang_code == "urn:xml:lang:eng-CAN":
                    od_resource.name = "Dataset"
                else:
                    od_resource.name = u"Donn\u00e9es"
            od_resource.name_fra = od_resource.name
            od_resource.resource_type = "file"
            od_resource.url = resource.xpath(
                'gmd:CI_OnlineResource/gmd:linkage/gmd:URL',
                namespaces=self.nap_namespaces)[0].text
            od_resource.size = ''
            od_resource.format = self._guess_resource_type(od_resource.name)
            # Drop resources whose format could not be guessed
            if not od_resource.format == 'none':
                od_resources.append(od_resource)
        ds.resources = od_resources
    except Exception as e:
        print("Failure: ", e)
        traceback.print_exc()
    if self.valid:
        ds.state = 'active'
        return ds
    else:
        # A mandatory field was missing; caller must handle None
        return None
def main(): factory = MetadataDatasetModelGeogratisFactory() # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential # record ID's session = connect_to_database() last_id = 0 while True: known_records = find_all_records(session, query_limit=10, limit_id=last_id) if len(known_records) == 0: break else: for geo_rec in known_records: print 'ID: {0} UUID: {1}'.format(geo_rec.id, geo_rec.uuid) try: # In order to avoid multiple updates, only allow for one instance of an update per uuid. # Previous updates are overridden with the latest update pkg_update = find_record_by_uuid(session, geo_rec.uuid, query_class=Packages) if pkg_update is None: pkg_update = Packages() pkg_update.status = 'new' if geo_rec.state == 'active': ckan_record = factory.create_model_ckan(geo_rec.uuid) geogratis_record = factory.create_model_geogratis( geo_rec.uuid) pkg_update.uuid = geo_rec.uuid # Set the dataset for immediate release on the Registry geogratis_record.portal_release_date = time.strftime( "%Y-%m-%d") geogratis_record.ready_to_publish = True if not ckan_record is None: if not geogratis_record.equals(ckan_record): diffs = geogratis_record.compare( ckan_record, self_label="Geogratis", other_label="CKAN") pkg_update.differences = "\n".join( item for item in diffs) geo_rec.od_status = 'Needs Update' pkg_update.ckan_json = json.dumps( geogratis_record.as_dict()) pkg_update.status = 'update' else: geo_rec.od_status = 'Current' else: pkg_update.ckan_json = json.dumps( geogratis_record.as_dict()) geo_rec.od_status = 'New Record' else: geo_rec.od_status = 'Ineligible' pkg_update.last_comparison = datetime.now() add_record(session, geo_rec) if geo_rec.od_status == 'New Record' or geo_rec.od_status == "Needs Update": add_record(session, pkg_update) last_id = geo_rec.id except Exception, e: logging.error(e.message)
def main(since, dumpfile): ini_config = ConfigParser() ini_config.read('geogratis.ini') session = connect_to_database() last_id = 0 while True: # @todo clean - up the messy if statement here if args.monitor: last_run_setting = get_setting('last_conversion_run') if last_run_setting.setting_value: if args.new_only: package_stream = session.query(Packages).filter(Packages.id > last_id).\ filter(Packages.updated > last_run_setting.setting_value).\ filter(not Packages.existing).\ order_by(Packages.id).limit(10).all() elif args.update_only: package_stream = session.query(Packages).filter(Packages.id > last_id).\ filter(Packages.updated > last_run_setting.setting_value).\ filter(Packages.existing).\ order_by(Packages.id).limit(10).all() else: package_stream = session.query(Packages).filter(Packages.id > last_id).\ filter(Packages.updated > last_run_setting.setting_value).\ order_by(Packages.id).limit(10).all() else: if args.new_only: package_stream = session.query(Packages).filter(Packages.id > last_id). \ filter(not Packages.existing).\ order_by(Packages.id).limit(10).all() elif args.update_only: package_stream = session.query(Packages).filter(Packages.id > last_id). \ filter(Packages.existing).\ order_by(Packages.id).limit(10).all() else: package_stream = session.query(Packages).filter(Packages.id > last_id).\ order_by(Packages.id).limit(10).all() elif args.since != '': if args.new_only: package_stream = session.query(Packages).filter(Packages.id > last_id).\ filter(Packages.updated > args.since). \ filter(not Packages.existing).\ order_by(Packages.id).limit(10).all() elif args.update_only: package_stream = session.query(Packages).filter(Packages.id > last_id).\ filter(Packages.updated > args.since). \ filter(Packages.existing).\ order_by(Packages.id).limit(10).all() else: package_stream = session.query(Packages).filter(Packages.id > last_id). \ filter(Packages.updated > args.since). 
\ filter(not Packages.existing).\ order_by(Packages.id).limit(10).all() else: if args.new_only: package_stream = session.query(Packages).filter(Packages.id > last_id). \ filter(not Packages.existing).\ order_by(Packages.id).limit(10).all() elif args.update_only: package_stream = session.query(Packages).filter(Packages.id > last_id). \ filter(Packages.existing).\ order_by(Packages.id).limit(10).all() else: package_stream = session.query(Packages).filter(Packages.id > last_id).\ order_by(Packages.id).limit(10).all() if len(package_stream) == 0: break else: if dumpfile != '': with open(dumpfile, 'a') as dfile: for r in package_stream: print u'Processing dataset {0}'.format(r.id) dfile.write(r.ckan_json + '\n') last_id = r.id else: for r in package_stream: print r.ckan_json + '\n' last_id = r.id session.close()
'-m', '--maxrecords', action='store', default=0, type=int, dest='maxrecords', help='Maximum number of records to retrieve. 0 means retrieve all') argparser.add_argument('-n', '--newonly', action='store_true', dest='newonly', default=False, help='Only extract new records') args = argparser.parse_args() session = connect_to_database() last_id = 0 jfile = open(args.outfile, mode='w') rec_count = 1 while True: known_records = find_all_records(session, query_class=Packages, query_limit=10, limit_id=last_id) if len(known_records) == 0: break else: for r in known_records: if args.newonly and r.status == 'update': continue if (r.status == 'new'
def main(): ini_config = ConfigParser() ini_config.read('geogratis.ini') remote_url = ini_config.get('ckan', 'ckan.remote_portal') api_key = ini_config.get('ckan', 'ckan.api_key') user_agent = ini_config.get('ckan', 'ckan.user_agent') ckansite = ckanapi.RemoteCKAN(remote_url, apikey=api_key, user_agent=user_agent) session = connect_to_database() last_id = 0 while True: package_stream = session.query(Packages).filter(Packages.id > last_id) package_stream = package_stream.filter(Packages.status.in_(["new", "update"])).\ order_by(Packages.id).all() if len(package_stream) == 0: break else: for r in package_stream: sleep(60) print u'Processing dataset {0}'.format(r.id) try: new_pkg_dict = json.loads(r.ckan_json.decode('utf-8')) except AttributeError as a: print u'AttributeError {0}'.format(unicode(a)) continue is_new = False try: pkg_info = ckansite.action.package_show(id=r.uuid) except ckanapi.NotFound: is_new = True try: if is_new: ckansite.call_action('package_create', new_pkg_dict) else: ckansite.call_action('package_update', new_pkg_dict) r.status = 'posted' r.status_message = '' r.latest_posted = datetime.now() add_record(session, r) continue except ckanapi.NotAuthorized as e: print u'Not Authorized {0}'.format(unicode(e)) continue except ckanapi.CKANAPIError as c: r.status = 'error' r.status_message = u'CKAN API error {0}'.format(unicode(c)) add_record(session, r) print r.status_message continue except ckanapi.errors.ValidationError as v: r.status = 'error' r.status_message = u'Validation error {0}'.format( unicode(v.error_dict)) add_record(session, r) print r.status_message continue break
def main(since='', start_index='', monitor=False): geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?alt=json&max-results=100' monitor_setting = get_setting('monitor_link') if monitor: if monitor_setting.setting_value is None: geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min=2015-01-01&alt=json&max-results=100' else: geog_url = monitor_setting.setting_value elif since != '': geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min={0}&alt=json&max-results=100'.format( since) elif start_index != '': geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst/?start-index={0}&alt=json&max-results=100'.format( start_index) print('{0}Scanning: {1}{2}'.format(Fore.GREEN, Fore.BLUE, geog_url)) r = requests.get(geog_url) logging.info('HTTP Response Status {0}'.format(r.status_code)) session = None try: session = connect_to_database() # Get the first page of the feed if r.status_code == 200: feed_page = r.json() # Save the monitor link for future use monitor_link = _get_link(feed_page, 'monitor') if monitor_link != '': monitor_setting.setting_value = monitor_link save_setting(monitor_setting) print "{0}Next Monitor Link: {1}{2}".format( Fore.YELLOW, Fore.BLUE, monitor_setting.setting_value) next_link = _get_link(feed_page) print('{0}{1} Records Found'.format(Fore.BLUE, feed_page['count'])) if 'products' in feed_page: for product in feed_page['products']: try: save_geogratis_record(session, product['id']) except Exception, e: logging.error('{0} failed to load'.format( product['id'])) logging.error(e) # Keep polling until exhausted while next_link != '': geog_url = next_link r = requests.get(geog_url) feed_page = r.json() next_link = _get_link(feed_page) print '{0}Next page link: {1}{2}'.format( Fore.YELLOW, Fore.BLUE, next_link) if 'products' in feed_page: for product in feed_page['products']: # Don't crash on every call - log the error and continue try: save_geogratis_record(session, product['id']) except Exception, e: 
logging.error('{0} failed to load'.format( product['id'])) logging.error(e) save_setting(monitor_setting)