예제 #1
0
def main():
    """Prepare a conversion run: portal connection, DB session and scan date.

    Reads the CKAN URL from the ``[ckan]`` section of ``geogratis.ini``,
    parses the command-line arguments, builds the CKAN connector and opens a
    database session. When ``args.since`` is non-empty it must be a
    ``YYYY-MM-DD`` date and is parsed into ``scan_date``; on a parse failure
    the error is logged, the session is closed and the process exits.

    NOTE(review): ``factory``, ``now_str``, ``ckan_portal``, ``last_id`` and
    ``setting`` are not consumed in this part of the function; presumably
    they are used further down - confirm before removing any of them.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')

    args = argparser.parse_args()
    factory = MetadataDatasetModelGeogratisFactory()

    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')

    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(
        remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')

    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's

    session = connect_to_database()
    last_id = 0
    scan_date = None
    setting = get_setting('last_conversion_run')
    if args.since != '':
        try:
            # Parse straight into a datetime instead of round-tripping through
            # a POSIX timestamp, which depends on the local timezone rules and
            # on the platform's supported timestamp range.
            scan_date = datetime.strptime(args.since, '%Y-%m-%d')
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        except Exception as e:
            # 'as' works on Python 2.6+ and 3; logging the exception itself
            # avoids the deprecated (and often empty) e.message attribute.
            logging.error(e)
            session.close()
            exit()
예제 #2
0
def main():
    """Prepare a conversion run: portal connection, DB session and scan date.

    Reads the CKAN URL from the ``[ckan]`` section of ``geogratis.ini``,
    parses the command-line arguments, builds the CKAN connector and opens a
    database session. When ``args.since`` is non-empty it must be a
    ``YYYY-MM-DD`` date and is parsed into ``scan_date``; on a parse failure
    the error is logged, the session is closed and the process exits.

    NOTE(review): ``factory``, ``now_str``, ``ckan_portal``, ``last_id`` and
    ``setting`` are not consumed in this part of the function; presumably
    they are used further down - confirm before removing any of them.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')

    args = argparser.parse_args()
    factory = MetadataDatasetModelGeogratisFactory()

    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')

    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')

    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's

    session = connect_to_database()
    last_id = 0
    scan_date = None
    setting = get_setting('last_conversion_run')
    if args.since != '':
        try:
            # Direct parse; no timestamp round-trip through the local timezone.
            scan_date = datetime.strptime(args.since, '%Y-%m-%d')
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        except Exception as e:
            # Portable except syntax; log the exception rather than e.message.
            logging.error(e)
            session.close()
            exit()
예제 #3
0
def main(since, scan_type):
    """Prepare a conversion run for one record source.

    :param since: ``YYYY-MM-DD`` date string, or ``''`` for no date. When
        non-empty it is parsed into ``scan_date``; a parse failure is logged,
        the session is closed and the process exits.
    :param scan_type: ``'gr'`` selects the Geogratis factory, record class
        and ``last_conversion_gr`` setting; any other value selects the EC
        factory, record class and ``last_conversion_ec`` setting.

    NOTE(review): ``now_str``, ``factory``, ``query_class``, ``setting`` and
    ``last_id`` are not consumed in this part of the function; presumably
    they are used further down - confirm before removing any of them.
    """
    now_str = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.000Z')

    if scan_type == 'gr':
        factory = MetadataDatasetModelGeogratisFactory()
        query_class = GeogratisRecord
        setting_name = 'last_conversion_gr'
    else:
        factory = MetadataDatasetModelECFactory()
        query_class = ECRecord
        setting_name = 'last_conversion_ec'

    # Create the bookkeeping setting on first use.
    setting = get_setting(setting_name)
    if setting is None:
        setting = Settings()
        setting.setting_name = setting_name

    # Potentially doing a VERY large ORM query. If we don't limit the read, then SQLAlchemy will try to pull
    # everything into memory. Therefore the query must be paged. Paging requires keeping track of the sequential
    # record ID's

    session = connect_to_database()
    last_id = 0
    scan_date = None

    if since != '':
        try:
            # Parse the 'since' argument that was just tested above (the
            # original parsed args.since, ignoring the parameter).
            scan_date = datetime.strptime(since, '%Y-%m-%d')
        except ValueError:
            logging.error("Incorrect since date format. Use YYYY-MM-DD")
            session.close()
            exit()
        except Exception as e:
            logging.error(e)
            session.close()
            exit()
예제 #4
0
def main(since, dumpfile, scan_type):
    """Dump the stored CKAN JSON of matching packages, ten rows per query.

    Packages whose ``source`` equals ``scan_type`` are read in ascending
    ``id`` order. An "updated after" cut-off is applied as follows:

    * ``args.monitor`` set: the value of the ``last_conversion_<scan_type>``
      setting, when that setting exists and has a non-empty value;
    * otherwise, ``since`` when it is not ``''``;
    * otherwise no cut-off.

    :param since: lower bound compared against ``Packages.updated``, or ``''``.
    :param dumpfile: file to append one JSON document per line to; when
        ``''`` each document is printed to stdout instead.
    :param scan_type: value matched against ``Packages.source`` and used as
        the suffix of the setting name.
    """
    session = connect_to_database()
    last_id = 0
    try:
        # Resolve the cut-off once so every page is filtered identically.
        updated_after = None
        if args.monitor:
            last_run_setting = get_setting('last_conversion_' + scan_type)
            if last_run_setting is not None and last_run_setting.setting_value:
                updated_after = last_run_setting.setting_value
        elif since != '':
            updated_after = since

        while True:
            query = session.query(Packages).filter(Packages.id > last_id)
            if updated_after is not None:
                query = query.filter(Packages.updated > updated_after)
            package_stream = query.filter(Packages.source == scan_type).\
                order_by(Packages.id).limit(10).all()
            if not package_stream:
                break

            if dumpfile != '':
                with open(dumpfile, 'a') as dfile:
                    for r in package_stream:
                        print(u'Processing dataset {0}'.format(r.id))
                        dfile.write(r.ckan_json + '\n')
            else:
                for r in package_stream:
                    print(r.ckan_json + '\n')

            # Rows are ordered by id, so the last one is the paging cursor.
            last_id = package_stream[-1].id
    finally:
        # Release the session even if a query or a write fails.
        session.close()
예제 #5
0
def main():
    """Print the UUID of each deleted Geogratis record found on the portal.

    Records with ``state == 'deleted'`` are read ten at a time in ascending
    ``id`` order. When ``args.monitor`` is set, only records updated after
    the ``last_conversion_run`` setting value are considered. For each
    record, ``package_show`` is called on the CKAN portal configured in
    ``geogratis.ini``; the UUID is printed when the call succeeds and
    skipped when it raises ``NotFound``.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(
        remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')

    last_id = 0
    last_run_setting = get_setting('last_conversion_run')
    session = connect_to_database()
    try:
        while True:
            query = session.query(GeogratisRecord)\
                .filter(GeogratisRecord.id > last_id)\
                .filter(GeogratisRecord.state == 'deleted')
            if args.monitor:
                # NOTE(review): if setting_value is None this comparison is
                # against NULL - confirm what a first monitor run should do.
                query = query.filter(
                    GeogratisRecord.updated > last_run_setting.setting_value)
            geogratis_stream = query.order_by(GeogratisRecord.id).limit(10).all()

            if not geogratis_stream:
                break

            for r in geogratis_stream:
                # Determine if the record is already on the OD portal
                try:
                    ckan_portal.action.package_show(id=r.uuid)
                except NotFound:
                    # Not on the portal: nothing to report for this record.
                    pass
                else:
                    print(u'{0}'.format(r.uuid))
                last_id = r.id
    finally:
        # Release the session even if the portal or the database call fails.
        session.close()
예제 #6
0
def main():
    """Print the UUID of each deleted Geogratis record found on the portal.

    Records with ``state == 'deleted'`` are read ten at a time in ascending
    ``id`` order. When ``args.monitor`` is set, only records updated after
    the ``last_conversion_run`` setting value are considered. For each
    record, ``package_show`` is called on the CKAN portal configured in
    ``geogratis.ini``; the UUID is printed when the call succeeds and
    skipped when it raises ``NotFound``.
    """
    ini_config = ConfigParser()
    ini_config.read('geogratis.ini')
    remote_ckan_url = ini_config.get('ckan', 'ckan.url')
    # Create CKAN API connector to the portal
    ckan_portal = RemoteCKAN(remote_ckan_url, user_agent='converter/1.0 http://open.canada.ca/data')

    last_id = 0
    last_run_setting = get_setting('last_conversion_run')
    session = connect_to_database()
    try:
        while True:
            # Shared part of the page query: deleted records past the cursor.
            query = session.query(GeogratisRecord)\
                .filter(GeogratisRecord.id > last_id)\
                .filter(GeogratisRecord.state == 'deleted')
            if args.monitor:
                # NOTE(review): if setting_value is None this comparison is
                # against NULL - confirm what a first monitor run should do.
                query = query.filter(
                    GeogratisRecord.updated > last_run_setting.setting_value)
            geogratis_stream = query.order_by(GeogratisRecord.id).limit(10).all()

            if not geogratis_stream:
                break

            for r in geogratis_stream:
                # Determine if the record is already on the OD portal
                try:
                    ckan_portal.action.package_show(id=r.uuid)
                except NotFound:
                    # Not on the portal: nothing to report for this record.
                    pass
                else:
                    print(u'{0}'.format(r.uuid))
                last_id = r.id
    finally:
        # Release the session even if the portal or the database call fails.
        session.close()
예제 #7
0
            add_record(session, ec_rec)

        session.close_all()


# Temporary main

# Script body: choose the scan start date from the command-line flags, run
# the CSW scan, then stamp the 'csw_last_scan_date' setting with the current
# time.
eccsw = CswScanner()
scan8601 = None
scan_date = None

if args.all:
    # Full scan: no lower date bound.
    scan_date = None
elif args.monitor:
    # Resume from the stored scan date; fall back to "now" when none is stored.
    monitor_date = get_setting('csw_last_scan_date')
    scan_date = datetime.now()
    if monitor_date.setting_value is not None:
        scan_date = dateutil.parser.parse(monitor_date.setting_value)
elif args.since != '':
    # dateutil signals an unparsable string by raising, not by returning
    # None, so the failure has to be caught to reach the error message.
    try:
        scan_date = dateutil.parser.parse(args.since)
    except (ValueError, OverflowError):
        logging.error('Invalid date: ' + args.since)
        exit()

eccsw.get_all_ids(scan_date)
eccsw.load_naps()

monitor_date = get_setting('csw_last_scan_date')
scan_date = datetime.now()
monitor_date.setting_value = scan_date.isoformat()
예제 #8
0
def main(since, dumpfile):
    """Dump the stored CKAN JSON of matching packages, ten rows per query.

    Packages are read in ascending ``id`` order. Two independent filters are
    applied:

    * "updated after" cut-off - with ``args.monitor`` set, the value of the
      ``last_conversion_run`` setting (when it exists and is non-empty);
      otherwise ``since`` when it is not ``''``; otherwise none.
    * existing flag - ``args.new_only`` keeps packages whose ``existing``
      column is false, ``args.update_only`` keeps those where it is true,
      and with neither flag no restriction is applied.

    :param since: lower bound compared against ``Packages.updated``, or ``''``.
    :param dumpfile: file to append one JSON document per line to; when
        ``''`` each document is printed to stdout instead.
    """
    session = connect_to_database()
    last_id = 0
    try:
        # Resolve the cut-off once so every page is filtered identically.
        updated_after = None
        if args.monitor:
            last_run_setting = get_setting('last_conversion_run')
            if last_run_setting is not None and last_run_setting.setting_value:
                updated_after = last_run_setting.setting_value
        elif since != '':
            updated_after = since

        while True:
            query = session.query(Packages).filter(Packages.id > last_id)
            if updated_after is not None:
                query = query.filter(Packages.updated > updated_after)
            if args.new_only:
                # '~' builds a SQL NOT; Python's 'not' would be evaluated
                # locally and never reach the database.
                # Assumes Packages.existing is a boolean column - TODO confirm.
                query = query.filter(~Packages.existing)
            elif args.update_only:
                query = query.filter(Packages.existing)
            package_stream = query.order_by(Packages.id).limit(10).all()
            if not package_stream:
                break

            if dumpfile != '':
                with open(dumpfile, 'a') as dfile:
                    for r in package_stream:
                        print(u'Processing dataset {0}'.format(r.id))
                        dfile.write(r.ckan_json + '\n')
            else:
                for r in package_stream:
                    print(r.ckan_json + '\n')

            # Rows are ordered by id, so the last one is the paging cursor.
            last_id = package_stream[-1].id
    finally:
        # Release the session even if a query or a write fails.
        session.close()
예제 #9
0
def main(since, dumpfile):
    """Dump the stored CKAN JSON of matching packages, ten rows per query.

    Packages are read in ascending ``id`` order. Two independent filters are
    applied:

    * "updated after" cut-off - with ``args.monitor`` set, the value of the
      ``last_conversion_run`` setting (when it exists and is non-empty);
      otherwise ``since`` when it is not ``''``; otherwise none.
    * existing flag - ``args.new_only`` keeps packages whose ``existing``
      column is false, ``args.update_only`` keeps those where it is true,
      and with neither flag no restriction is applied.

    :param since: lower bound compared against ``Packages.updated``, or ``''``.
    :param dumpfile: file to append one JSON document per line to; when
        ``''`` each document is printed to stdout instead.
    """
    session = connect_to_database()
    last_id = 0
    try:
        # Work out the date cut-off before paging starts.
        updated_after = None
        if args.monitor:
            last_run_setting = get_setting('last_conversion_run')
            if last_run_setting is not None and last_run_setting.setting_value:
                updated_after = last_run_setting.setting_value
        elif since != '':
            updated_after = since

        while True:
            query = session.query(Packages).filter(Packages.id > last_id)
            if updated_after is not None:
                query = query.filter(Packages.updated > updated_after)
            if args.new_only:
                # Use the SQL-level negation operator: a Python 'not' is
                # evaluated before the filter is built.
                # Assumes Packages.existing is a boolean column - TODO confirm.
                query = query.filter(~Packages.existing)
            elif args.update_only:
                query = query.filter(Packages.existing)
            package_stream = query.order_by(Packages.id).limit(10).all()
            if not package_stream:
                break

            if dumpfile != '':
                with open(dumpfile, 'a') as dfile:
                    for r in package_stream:
                        print(u'Processing dataset {0}'.format(r.id))
                        dfile.write(r.ckan_json + '\n')
            else:
                for r in package_stream:
                    print(r.ckan_json + '\n')

            # Advance the paging cursor to the highest id seen on this page.
            last_id = package_stream[-1].id
    finally:
        # Always hand the session back, including on errors.
        session.close()
예제 #10
0
def main(since='', start_index='', monitor=False):
    """Walk the Geogratis JSON feed and save every listed product.

    The start URL is chosen in this order: the stored ``monitor_link``
    setting (or a fixed ``edited-min=2001-01-01`` URL when none is stored)
    if ``monitor`` is true; an ``edited-min`` URL if ``since`` is non-empty;
    a ``start-index`` URL if ``start_index`` is non-empty; otherwise the
    unfiltered feed URL. Each page's products are passed to
    ``save_geogratis_record``; failures are logged per product and do not
    stop the run. Paging continues while ``_get_link`` returns a non-empty
    link.

    :param since: value substituted into the ``edited-min`` query parameter.
    :param start_index: value substituted into the ``start-index`` parameter.
    :param monitor: when true, resume from the stored monitor link.

    NOTE(review): the ``except``/``finally`` belonging to the ``try`` below
    is not part of the visible code.
    """
    geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?alt=json&max-results=100'
    monitor_setting = get_setting(u'monitor_link')
    if monitor:
        if monitor_setting.setting_value is None:
            # No monitor link stored yet: start from a fixed early date.
            geog_url =\
                'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min=2001-01-01&alt=json&max-results=100'
        else:
            geog_url = monitor_setting.setting_value
    elif since != '':
        geog_url =\
            'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min={0}&alt=json&max-results=100'.format(since)
    elif start_index != '':
        geog_url =\
            'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst/?start-index={0}&alt=json&max-results=100'.\
            format(start_index)
    print ('{0}Scanning: {1}{2}'.format(Fore.GREEN, Fore.BLUE, geog_url))
    r = requests.get(geog_url)
    logging.info('HTTP Response Status {0}'.format(r.status_code))
    session = None
    try:
        session = connect_to_database()
        # Get the first page of the feed
        if r.status_code == 200:
            feed_page = r.json()

            # Save the monitor link for future use
            monitor_link = _get_link(feed_page, 'monitor')
            if monitor_link != '':

                monitor_setting.setting_value = monitor_link
                save_setting(monitor_setting)
                print  "{0}Next Monitor Link: {1}{2}".format(Fore.YELLOW, Fore.BLUE, monitor_setting.setting_value)
            # Called without a link name; presumably returns the "next page"
            # link - verify against _get_link.
            next_link = _get_link(feed_page)

            print ('{0}{1} Records Found'.format(Fore.BLUE, feed_page['count']))

            if 'products' in feed_page:
                for product in feed_page['products']:
                    # Log and continue so one bad product does not abort the page.
                    try:
                        save_geogratis_record(session, product['id'])
                    except Exception, e:
                        logging.error('{0} failed to load'.format(product['id']))
                        logging.error(e)

            # Keep polling until exhausted
            while next_link != '':
                geog_url = next_link
                # NOTE(review): unlike the first request, the status code of
                # follow-up pages is not checked before calling r.json().
                r = requests.get(geog_url)
                feed_page = r.json()
                next_link = _get_link(feed_page)
                print '{0}Next page link: {1}{2}'.format(Fore.YELLOW, Fore.BLUE, next_link)
                if 'products' in feed_page:
                    for product in feed_page['products']:

                        # Don't crash on every call - log the error and continue
                        try:
                            save_geogratis_record(session, product['id'])
                        except Exception, e:
                            logging.error('{0} failed to load'.format(product['id']))
                            logging.error(e)
                # The monitor setting is saved again after every page.
                save_setting(monitor_setting)
예제 #11
0
def main(since='', start_index='', monitor=False):
    geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?alt=json&max-results=100'
    monitor_setting = get_setting('monitor_link')
    if monitor:
        if monitor_setting.setting_value is None:
            geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min=2015-01-01&alt=json&max-results=100'
        else:
            geog_url = monitor_setting.setting_value
    elif since != '':
        geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst?edited-min={0}&alt=json&max-results=100'.format(
            since)
    elif start_index != '':
        geog_url = 'http://geogratis.gc.ca/api/en/nrcan-rncan/ess-sst/?start-index={0}&alt=json&max-results=100'.format(
            start_index)
    print('{0}Scanning: {1}{2}'.format(Fore.GREEN, Fore.BLUE, geog_url))
    r = requests.get(geog_url)
    logging.info('HTTP Response Status {0}'.format(r.status_code))
    session = None
    try:
        session = connect_to_database()
        # Get the first page of the feed
        if r.status_code == 200:
            feed_page = r.json()

            # Save the monitor link for future use
            monitor_link = _get_link(feed_page, 'monitor')
            if monitor_link != '':

                monitor_setting.setting_value = monitor_link
                save_setting(monitor_setting)
                print "{0}Next Monitor Link: {1}{2}".format(
                    Fore.YELLOW, Fore.BLUE, monitor_setting.setting_value)
            next_link = _get_link(feed_page)

            print('{0}{1} Records Found'.format(Fore.BLUE, feed_page['count']))

            if 'products' in feed_page:
                for product in feed_page['products']:
                    try:
                        save_geogratis_record(session, product['id'])
                    except Exception, e:
                        logging.error('{0} failed to load'.format(
                            product['id']))
                        logging.error(e)

            # Keep polling until exhausted
            while next_link != '':
                geog_url = next_link
                r = requests.get(geog_url)
                feed_page = r.json()
                next_link = _get_link(feed_page)
                print '{0}Next page link: {1}{2}'.format(
                    Fore.YELLOW, Fore.BLUE, next_link)
                if 'products' in feed_page:
                    for product in feed_page['products']:

                        # Don't crash on every call - log the error and continue
                        try:
                            save_geogratis_record(session, product['id'])
                        except Exception, e:
                            logging.error('{0} failed to load'.format(
                                product['id']))
                            logging.error(e)
                save_setting(monitor_setting)