def main():
    """Drive a Geogratis-to-CKAN conversion run.

    Reads the portal URL from geogratis.ini, opens a CKAN API connector and
    a database session, and validates the optional --since date before the
    paged conversion loop begins.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    args = argparser.parse_args()
    factory = MetadataDatasetModelGeogratisFactory()
    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')

    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(remote_ckan_url,
                             user_agent='converter/1.0 http://open.canada.ca/data')

    # Potentially doing a VERY large ORM query. If we don't limit the read,
    # then SQLAlchemy will try to pull everything into memory. Therefore the
    # query must be paged. Paging requires keeping track of the sequential
    # record ID's
    session = connect_to_database()
    last_id = 0
    scan_date = None
    setting = get_setting('last_conversion_run')

    if args.since != '':
        try:
            scan_date = datetime.fromtimestamp(
                time.mktime(time.strptime(args.since, '%Y-%m-%d')))
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        except Exception as e:
            # FIX: `except Exception, e` is Python-2-only syntax; `as` works
            # on 2.6+ and 3.x.  str(e) replaces the deprecated e.message.
            logging.error(str(e))
            session.close()
            exit()
def main():
    """Set up a Geogratis-to-CKAN conversion run.

    Loads the CKAN portal URL from geogratis.ini, builds the API connector
    and database session, and validates the --since date argument.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    args = argparser.parse_args()
    factory = MetadataDatasetModelGeogratisFactory()
    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')

    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(remote_ckan_url,
                             user_agent='converter/1.0 http://open.canada.ca/data')

    # Potentially doing a VERY large ORM query. If we don't limit the read,
    # then SQLAlchemy will try to pull everything into memory. Therefore the
    # query must be paged. Paging requires keeping track of the sequential
    # record ID's
    session = connect_to_database()
    last_id = 0
    scan_date = None
    setting = get_setting('last_conversion_run')

    if args.since != '':
        try:
            scan_date = datetime.fromtimestamp(
                time.mktime(time.strptime(args.since, '%Y-%m-%d')))
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        except Exception as e:
            # FIX: modernized from the Python-2-only `except Exception, e`
            # form; str(e) replaces the deprecated e.message attribute.
            logging.error(str(e))
            session.close()
            exit()
def main(since, scan_type):
    """Prepare a conversion run for one harvest source.

    :param since: optional YYYY-MM-DD lower bound for record updates
    :param scan_type: 'gr' selects the Geogratis factory/records, anything
        else selects the EC factory/records
    """
    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')
    if scan_type == 'gr':
        factory = MetadataDatasetModelGeogratisFactory()
        setting = get_setting('last_conversion_gr')
        query_class = GeogratisRecord
        if setting is None:
            setting = Settings()
            setting.setting_name = 'last_conversion_gr'
    else:
        factory = MetadataDatasetModelECFactory()
        setting = get_setting('last_conversion_ec')
        query_class = ECRecord
        if setting is None:
            setting = Settings()
            setting.setting_name = 'last_conversion_ec'

    # Potentially doing a VERY large ORM query. If we don't limit the read,
    # then SQLAlchemy will try to pull everything into memory. Therefore the
    # query must be paged. Paging requires keeping track of the sequential
    # record ID's
    session = connect_to_database()
    last_id = 0
    scan_date = None
    if since != '':
        try:
            # BUG FIX: the original parsed `args.since`, but the date is
            # passed in as the `since` parameter ('args' is not a parameter
            # of this function).
            scan_date = datetime.fromtimestamp(
                time.mktime(time.strptime(since, '%Y-%m-%d')))
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        except Exception as e:
            # FIX: Python-2-only `except Exception, e` syntax; str(e)
            # replaces the deprecated e.message.
            logging.error(str(e))
            session.close()
            exit()
def main(since, dumpfile, scan_type):
    """Dump converted CKAN JSON for all packages from one harvest source.

    Pages through the Packages table ten rows at a time and either appends
    each row's ckan_json to `dumpfile` or prints it to stdout.

    :param since: optional date string; only rows updated after it are dumped
    :param dumpfile: output path, or '' to print to stdout
    :param scan_type: harvest source selector (Packages.source value)
    """
    ini_config = ConfigParser()
    ini_config.read('harvester.ini')
    session = connect_to_database()
    last_id = 0
    try:
        while True:
            # Build the paged query incrementally instead of duplicating it
            # once per flag combination.
            query = session.query(Packages).\
                filter(Packages.id > last_id).\
                filter(Packages.source == scan_type)
            if args.monitor:
                last_run_setting = get_setting('last_conversion_' + scan_type)
                if last_run_setting.setting_value:
                    query = query.filter(
                        Packages.updated > last_run_setting.setting_value)
            elif since != '':
                # BUG FIX: the original compared against args.since although
                # the date is passed in as the `since` parameter.
                query = query.filter(Packages.updated > since)
            package_stream = query.order_by(Packages.id).limit(10).all()
            if len(package_stream) == 0:
                break
            if dumpfile != '':
                with open(dumpfile, 'a') as dfile:
                    for r in package_stream:
                        print(u'Processing dataset {0}'.format(r.id))
                        dfile.write(r.ckan_json + '\n')
                        last_id = r.id
            else:
                for r in package_stream:
                    print(r.ckan_json + '\n')
                    last_id = r.id
    finally:
        # Release the DB connection even if a page fails mid-run.
        session.close()
def main():
    """Report Geogratis records marked 'deleted' that the portal still serves.

    Pages through GeogratisRecord rows in the 'deleted' state and prints the
    UUID of each one for which the portal's package_show call succeeds.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')

    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(remote_ckan_url,
                             user_agent='converter/1.0 http://open.canada.ca/data')
    last_id = 0
    last_run_setting = get_setting('last_conversion_run')
    session = connect_to_database()
    while True:
        query = session.query(GeogratisRecord).\
            filter(GeogratisRecord.id > last_id).\
            filter(GeogratisRecord.state == 'deleted')
        if args.monitor:
            # Monitor mode: only records changed since the last conversion run
            query = query.filter(
                GeogratisRecord.updated > last_run_setting.setting_value)
        geogratis_stream = query.order_by(GeogratisRecord.id).limit(10).all()
        if len(geogratis_stream) == 0:
            break
        for r in geogratis_stream:
            # Determine if the record is already on the OD portal
            try:
                ckan_portal.action.package_show(id=r.uuid)
                # Reaching here means the portal still has the dataset; a
                # missing dataset raises NotFound instead.
                print(u'{0}'.format(r.uuid))
            except NotFound:
                # FIX: was the Python-2-only `except NotFound, e` with an
                # unused binding.  Already gone from the portal - skip.
                pass
            last_id = r.id
    # NOTE(review): the session is never closed here - confirm whether the
    # original continued past this point or leaks the connection.
def main():
    """Print the UUID of every Geogratis record in the 'deleted' state that
    is still present on the Open Data portal (package_show succeeds).
    """
    config = ConfigParser()
    config.read('geogratis.ini')
    portal_url = config.get('ckan', 'ckan.url')

    # Connector for the portal's CKAN action API
    portal = RemoteCKAN(portal_url,
                        user_agent='converter/1.0 http://open.canada.ca/data')
    last_seen_id = 0
    last_run = get_setting('last_conversion_run')
    db = connect_to_database()
    while True:
        # Page the deleted records ten at a time, keyed on the sequential id.
        base = db.query(GeogratisRecord)\
            .filter(GeogratisRecord.id > last_seen_id)\
            .filter(GeogratisRecord.state == 'deleted')
        if args.monitor:
            base = base.filter(GeogratisRecord.updated > last_run.setting_value)
        page = base.order_by(GeogratisRecord.id).limit(10).all()
        if not page:
            break
        for record in page:
            # Ask the portal for the dataset; NotFound means it is gone already.
            try:
                portal.action.package_show(id=record.uuid)
                print(u'{0}'.format(record.uuid))
            except NotFound:
                pass
            last_seen_id = record.id
# NOTE(review): the next two statements appear to be the tail of a function
# defined above this chunk (session/ec_rec are not defined here) - confirm
# the enclosing scope before relying on this reconstruction.
add_record(session, ec_rec)
session.close_all()

# Temporary main
eccsw = CswScanner()
scan8601 = None
scan_date = None
if args.all:
    # Full rescan: no lower date bound.
    scan_date = None
elif args.monitor:
    # Resume from the timestamp persisted by the previous CSW scan; fall
    # back to "now" when no timestamp has been saved yet.
    monitor_date = get_setting('csw_last_scan_date')
    scan_date = datetime.now()
    if monitor_date.setting_value is not None:
        scan_date = dateutil.parser.parse(monitor_date.setting_value)
elif args.since != '':
    scan_date = dateutil.parser.parse(args.since)
    if scan_date is None:
        logging.error('Invalid date: ' + args.since)
        exit()
# Harvest all record ids updated after scan_date, then load the NAP records.
eccsw.get_all_ids(scan_date)
eccsw.load_naps()
# Persist the start-of-run timestamp for the next monitor-mode scan.
monitor_date = get_setting('csw_last_scan_date')
scan_date = datetime.now()
monitor_date.setting_value = scan_date.isoformat()
def main(since, dumpfile):
    """Dump converted Geogratis packages as CKAN JSON.

    Pages through the Packages table ten rows at a time and either appends
    each row's ckan_json to `dumpfile` or prints it to stdout.  The row set
    can be narrowed by the persisted last-run timestamp (--monitor), an
    explicit `since` date, and the --new-only / --update-only switches.

    :param since: optional date string; only rows updated after it are dumped
    :param dumpfile: output path, or '' to print to stdout
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    session = connect_to_database()
    last_id = 0
    try:
        while True:
            # Build the page query incrementally instead of the original
            # nine-branch copy-and-paste ladder.
            query = session.query(Packages).filter(Packages.id > last_id)

            # Date restriction: monitor mode uses the persisted last-run
            # timestamp, otherwise an explicit since date.  BUG FIX: the
            # original compared against args.since instead of the `since`
            # parameter.
            if args.monitor:
                last_run_setting = get_setting('last_conversion_run')
                if last_run_setting.setting_value:
                    query = query.filter(
                        Packages.updated > last_run_setting.setting_value)
            elif since != '':
                query = query.filter(Packages.updated > since)

            # New/updated restriction.  BUG FIX: the original wrote
            # `filter(not Packages.existing)` - Python's `not` does not build
            # a SQL NOT over a SQLAlchemy column; `~` (ColumnOperators
            # inversion) does.  Also, the plain since-branch wrongly copied
            # the `not existing` filter; with neither flag set no existing
            # filter is applied.
            if args.new_only:
                query = query.filter(~Packages.existing)
            elif args.update_only:
                query = query.filter(Packages.existing)

            package_stream = query.order_by(Packages.id).limit(10).all()
            if len(package_stream) == 0:
                break
            if dumpfile != '':
                with open(dumpfile, 'a') as dfile:
                    for r in package_stream:
                        print(u'Processing dataset {0}'.format(r.id))
                        dfile.write(r.ckan_json + '\n')
                        last_id = r.id
            else:
                for r in package_stream:
                    print(r.ckan_json + '\n')
                    last_id = r.id
    finally:
        # Always release the DB connection, even on an unexpected error.
        session.close()
def main(since='', start_index='', monitor=False):
    """Scan the Geogratis feed and save every product record found.

    Exactly one scan mode applies, in priority order: monitor (resume from
    the saved monitor link, or a wide edited-min scan on the first run),
    `since` (edited-min date), `start_index` (absolute feed offset), or a
    plain full scan.  Each product is saved individually; per-record errors
    are logged and the scan continues.
    """
    geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?alt=json&max-results=100'
    monitor_setting = get_setting(u'monitor_link')
    if monitor:
        if monitor_setting.setting_value is None:
            # First monitor run ever: fall back to a date-bounded full scan.
            geog_url =\
                'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min=2001-01-01&alt=json&max-results=100'
        else:
            geog_url = monitor_setting.setting_value
    elif since != '':
        geog_url =\
            'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min={0}&alt=json&max-results=100'.format(since)
    elif start_index != '':
        geog_url =\
            'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst/?start-index={0}&alt=json&max-results=100'.\
            format(start_index)
    print('{0}Scanning: {1}{2}'.format(Fore.GREEN, Fore.BLUE, geog_url))
    r = requests.get(geog_url)
    logging.info('HTTP Response Status {0}'.format(r.status_code))
    session = None
    try:
        session = connect_to_database()
        # Get the first page of the feed
        if r.status_code == 200:
            feed_page = r.json()
            # Save the monitor link for future use
            monitor_link = _get_link(feed_page, 'monitor')
            if monitor_link != '':
                monitor_setting.setting_value = monitor_link
                save_setting(monitor_setting)
                print("{0}Next Monitor Link: {1}{2}".format(
                    Fore.YELLOW, Fore.BLUE, monitor_setting.setting_value))
            next_link = _get_link(feed_page)
            print('{0}{1} Records Found'.format(Fore.BLUE, feed_page['count']))
            if 'products' in feed_page:
                for product in feed_page['products']:
                    # Don't crash on every call - log the error and continue
                    try:
                        save_geogratis_record(session, product['id'])
                    except Exception as e:
                        logging.error('{0} failed to load'.format(product['id']))
                        logging.error(e)
            # Keep polling until exhausted
            while next_link != '':
                geog_url = next_link
                r = requests.get(geog_url)
                feed_page = r.json()
                next_link = _get_link(feed_page)
                print('{0}Next page link: {1}{2}'.format(
                    Fore.YELLOW, Fore.BLUE, next_link))
                if 'products' in feed_page:
                    for product in feed_page['products']:
                        # Don't crash on every call - log the error and continue
                        try:
                            save_geogratis_record(session, product['id'])
                        except Exception as e:
                            logging.error('{0} failed to load'.format(product['id']))
                            logging.error(e)
            save_setting(monitor_setting)
    finally:
        # NOTE(review): the source text ends with a dangling `try:` - the
        # handler/cleanup clause was lost.  Closing the session here is the
        # presumed intent (session is pre-set to None); confirm against the
        # original file.
        if session is not None:
            session.close()
def main(since='', start_index='', monitor=False):
    """Walk the Geogratis Atom/JSON feed and persist every product record.

    One scan mode applies, in priority order: monitor (resume from the saved
    monitor link, or an edited-min=2015-01-01 scan on the first run),
    `since` (edited-min date), `start_index` (feed offset), or a full scan.
    """
    geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?alt=json&max-results=100'
    monitor_setting = get_setting('monitor_link')
    if monitor:
        if monitor_setting.setting_value is None:
            # First monitor run: no saved link yet, use a date-bounded scan.
            geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min=2015-01-01&alt=json&max-results=100'
        else:
            geog_url = monitor_setting.setting_value
    elif since != '':
        geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min={0}&alt=json&max-results=100'.format(
            since)
    elif start_index != '':
        geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst/?start-index={0}&alt=json&max-results=100'.format(
            start_index)
    print('{0}Scanning: {1}{2}'.format(Fore.GREEN, Fore.BLUE, geog_url))
    r = requests.get(geog_url)
    logging.info('HTTP Response Status {0}'.format(r.status_code))
    session = None
    try:
        session = connect_to_database()
        # Get the first page of the feed
        if r.status_code == 200:
            feed_page = r.json()
            # Save the monitor link for future use
            monitor_link = _get_link(feed_page, 'monitor')
            if monitor_link != '':
                monitor_setting.setting_value = monitor_link
                save_setting(monitor_setting)
                print("{0}Next Monitor Link: {1}{2}".format(
                    Fore.YELLOW, Fore.BLUE, monitor_setting.setting_value))
            next_link = _get_link(feed_page)
            print('{0}{1} Records Found'.format(Fore.BLUE, feed_page['count']))
            _save_products(session, feed_page)
            # Keep polling until exhausted
            while next_link != '':
                r = requests.get(next_link)
                feed_page = r.json()
                next_link = _get_link(feed_page)
                print('{0}Next page link: {1}{2}'.format(
                    Fore.YELLOW, Fore.BLUE, next_link))
                _save_products(session, feed_page)
            save_setting(monitor_setting)
    finally:
        # NOTE(review): the source text ends with a dangling `try:` whose
        # handler/cleanup was lost; closing the session here is the presumed
        # intent (session is pre-set to None).  Confirm against the original.
        if session is not None:
            session.close()


def _save_products(session, feed_page):
    """Persist every product on one feed page, logging and continuing on
    per-record failures so a single bad record cannot abort the scan."""
    if 'products' in feed_page:
        for product in feed_page['products']:
            # Don't crash on every call - log the error and continue
            try:
                save_geogratis_record(session, product['id'])
            except Exception as e:
                # FIX: modernized from the Python-2-only `except Exception, e`.
                logging.error('{0} failed to load'.format(product['id']))
                logging.error(e)