Пример #1
0
 def testMainCreatesCollectionProfile(self):
     '''Test that the main function produces a collection profile
     file for DPLA. The path to this file is needed when creating a
     DPLA ingestion document.

     The registry API and the OAI endpoint are served from local
     fixtures through httpretty, and the Couch class is patched so that
     creating the ingestion document returns the fixed id 'test-id'.
     '''
     # Read the fixtures inside context managers so the file handles are
     # closed deterministically instead of being left to the GC.
     with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
         collection_json = fixture.read()
     with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
         oai_xml = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=collection_json)
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=oai_xml)
     # The instance itself is not needed; only the constructor call is
     # kept, as in the original test.
     Collection("https://registry.cdlib.org/api/v1/collection/197/")
     with patch('dplaingestion.couch.Couch') as mock_couch:
         couch_instance = mock_couch.return_value
         couch_instance._create_ingestion_document.return_value = 'test-id'
         # dir_save and fetcher are stored on self, as the original did
         # (presumably for cleanup in tearDown -- not visible here).
         ingest_doc_id, num, self.dir_save, self.fetcher = fetcher.main(
             self.user_email,
             self.url_api_collection,
             log_handler=self.test_log_handler,
             mail_handler=self.test_log_handler,
             dir_profile=self.dir_test_profile,
             profile_path=self.profile_path,
             config_file=self.config_file)
     self.assertEqual(ingest_doc_id, 'test-id')
     self.assertEqual(num, 128)
     self.assertTrue(os.path.exists(self.profile_path))
Пример #2
0
 def testMainCreatesCollectionProfile(self, mock_boto3):
     '''Test that the main function produces a collection profile
     file for DPLA. The path to this file is needed when creating a
     DPLA ingestion document.

     The registry API and the OAI endpoint are served from local
     fixtures through httpretty, and the Couch class is patched so that
     creating the ingestion document returns the fixed id 'test-id'.
     mock_boto3 is not referenced in the body (presumably injected by a
     patch decorator outside this excerpt).
     '''
     # Read the fixtures inside context managers so the file handles are
     # closed deterministically instead of being left to the GC.
     with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
         collection_json = fixture.read()
     with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
         oai_xml = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=collection_json)
     httpretty.register_uri(httpretty.GET,
                            re.compile("http://content.cdlib.org/oai?.*"),
                            body=oai_xml)
     Collection("https://registry.cdlib.org/api/v1/collection/197/")
     with patch('dplaingestion.couch.Couch') as mock_couch:
         couch_instance = mock_couch.return_value
         couch_instance._create_ingestion_document.return_value = 'test-id'
         # dir_save and fetcher are stored on self, as the original did
         # (presumably for cleanup in tearDown -- not visible here).
         ingest_doc_id, num, self.dir_save, self.fetcher = fetcher.main(
             self.user_email,
             self.url_api_collection,
             log_handler=self.test_log_handler,
             mail_handler=self.test_log_handler,
             dir_profile=self.dir_test_profile,
             profile_path=self.profile_path,
             config_file=self.config_file)
     self.assertEqual(ingest_doc_id, 'test-id')
     self.assertEqual(num, 128)
     self.assertTrue(os.path.exists(self.profile_path))
Пример #3
0
 def testMainFn(self):
     '''Run fetcher.main against fixture data and check the sequence of
     log records it emits.

     The registry API and the OAI endpoint are served from local
     fixtures through httpretty, and the Couch class is patched so that
     creating the ingestion document returns the fixed id 'test-id'.
     '''
     # Read the fixtures inside context managers so the file handles are
     # closed deterministically instead of being left to the GC.
     with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
         collection_json = fixture.read()
     with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
         oai_xml = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=collection_json)
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=oai_xml)
     with patch('dplaingestion.couch.Couch') as mock_couch:
         couch_instance = mock_couch.return_value
         couch_instance._create_ingestion_document.return_value = 'test-id'
         ingest_doc_id, num, self.dir_save, self.harvester = fetcher.main(
             self.user_email,
             self.url_api_collection,
             log_handler=self.test_log_handler,
             mail_handler=self.test_log_handler,
             dir_profile=self.dir_test_profile,
             profile_path=self.profile_path,
             config_file=self.config_file)
     self.assertEqual(len(self.test_log_handler.records), 10)
     formatted = self.test_log_handler.formatted_records
     self.assertIn(u'[INFO] HarvestMain: Init harvester next',
                   formatted[0])
     self.assertEqual(formatted[1],
                      u'[INFO] HarvestMain: Create DPLA profile document')
     self.assertIn(u'[INFO] HarvestMain: DPLA profile document',
                   formatted[2])
     self.assertEqual(formatted[3],
                      u'[INFO] HarvestMain: Create ingest doc in couch')
     self.assertEqual(formatted[4],
                      u'[INFO] HarvestMain: Ingest DOC ID: test-id')
     self.assertEqual(formatted[5],
                      u'[INFO] HarvestMain: Start harvesting next')
     # A two-argument assertTrue(expected, record) can never fail: the
     # non-empty expected string is the value under test and the record
     # is only the failure message. Assert on the record itself.
     # NOTE(review): only the fixed prefix is checked; confirm the full
     # message text against HarvestController before tightening this.
     self.assertIn(u'[INFO] HarvestController: Starting harvest for: ',
                   formatted[6])
     self.assertEqual(formatted[7],
                      u'[INFO] HarvestController: 100 records harvested')
     self.assertEqual(formatted[8],
                      u'[INFO] HarvestController: 128 records harvested')
     self.assertEqual(
         formatted[9],
         u'[INFO] HarvestMain: Finished harvest of '
         u'calisphere-santa-clara-university-digital-objects. 128 '
         u'records harvested.'
     )
Пример #4
0
 def testMainFn(self, mock_boto3):
     '''Run fetcher.main against fixture data and check the sequence of
     log records it emits.

     The registry API and the OAI endpoint are served from local
     fixtures through httpretty, and the Couch class is patched so that
     creating the ingestion document returns the fixed id 'test-id'.
     mock_boto3 is not referenced in the body (presumably injected by a
     patch decorator outside this excerpt).
     '''
     # Read the fixtures inside context managers so the file handles are
     # closed deterministically instead of being left to the GC.
     with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
         collection_json = fixture.read()
     with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
         oai_xml = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=collection_json)
     httpretty.register_uri(httpretty.GET,
                            re.compile("http://content.cdlib.org/oai?.*"),
                            body=oai_xml)
     with patch('dplaingestion.couch.Couch') as mock_couch:
         couch_instance = mock_couch.return_value
         couch_instance._create_ingestion_document.return_value = 'test-id'
         ingest_doc_id, num, self.dir_save, self.harvester = fetcher.main(
             self.user_email,
             self.url_api_collection,
             log_handler=self.test_log_handler,
             mail_handler=self.test_log_handler,
             dir_profile=self.dir_test_profile,
             profile_path=self.profile_path,
             config_file=self.config_file)
     self.assertEqual(len(self.test_log_handler.records), 10)
     formatted = self.test_log_handler.formatted_records
     self.assertIn(u'[INFO] HarvestMain: Init harvester next',
                   formatted[0])
     self.assertEqual(formatted[1],
                      u'[INFO] HarvestMain: Create DPLA profile document')
     self.assertIn(u'[INFO] HarvestMain: DPLA profile document',
                   formatted[2])
     self.assertEqual(formatted[3],
                      u'[INFO] HarvestMain: Create ingest doc in couch')
     self.assertEqual(formatted[4],
                      u'[INFO] HarvestMain: Ingest DOC ID: test-id')
     self.assertEqual(formatted[5],
                      u'[INFO] HarvestMain: Start harvesting next')
     # A two-argument assertTrue(expected, record) can never fail: the
     # non-empty expected string is the value under test and the record
     # is only the failure message. Assert on the record itself.
     # NOTE(review): only the fixed prefix is checked; confirm the full
     # message text against HarvestController before tightening this.
     self.assertIn(u'[INFO] HarvestController: Starting harvest for: ',
                   formatted[6])
     self.assertEqual(formatted[7],
                      u'[INFO] HarvestController: 100 records harvested')
     self.assertEqual(formatted[8],
                      u'[INFO] HarvestController: 128 records harvested')
     self.assertEqual(
         formatted[9],
         u'[INFO] HarvestMain: Finished harvest of '
         u'calisphere-santa-clara-university-digital-objects. 128 '
         u'records harvested.')
Пример #5
0
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection.

    Steps, in order: harvest via fetcher.main, enrich, save, remove
    deleted records, check ingestion counts, clean up the dashboard,
    then publish a summary message.

    user_email -- address added to the notification list; any addresses
        in EMAIL_SYS_ADMIN (comma separated) are appended to it.
    url_api_collection -- registry API URL of the collection.
    log_handler -- logbook handler; defaults to a DEBUG StderrHandler.
    mail_handler -- logbook handler; defaults to an ERROR-level
        MailHandler addressed to the notification list.
    config_file -- config path; defaults to $DPLA_CONFIG_FILE or
        'akara.ini'.
    redis_host, redis_port, redis_pswd -- if any is missing,
        config_harvest() is called with config_file.
    kwargs -- forwarded unchanged to fetcher.main.

    dir_profile, profile_path, redis_timeout, rq_queue and
    run_image_harvest are accepted but not referenced in this body.

    Raises Exception when the Collection cannot be built, when running
    on a 'prod' DATA_BRANCH with a collection that is not ready for
    publication, or when any pipeline step reports failure.
    '''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend(EMAIL_SYS_ADMIN.split(','))
    if not mail_handler:
        mail_handler = logbook.MailHandler(EMAIL_RETURN_ADDRESS,
                                           emails,
                                           level='ERROR',
                                           bubble=True)
    # NOTE(review): both handlers are popped only at the end of the
    # success path; an exception below leaves them pushed.
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        # The returned config is not used in this function; the call is
        # kept so any loading/validation it performs still happens.
        config_harvest(config_file=config_file)

    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(
            url_api_collection, str(e))
        logbook.error(msg)
        # Bare raise keeps the original traceback (``raise e`` resets it
        # on Python 2).
        raise
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')

    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    # fetcher.main receives the full notification list, not just
    # user_email.
    ingest_doc_id, num_recs, dir_save, _harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    # Checked after the harvest but before anything is enriched/saved.
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')

    # save_records.main returns the number of saved records; a value
    # that is not >= 0 is treated as failure.
    resp = save_records.main([None, ingest_doc_id])
    if not resp >= 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))

    resp = remove_deleted_records.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))

    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))

    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(
        subject, 'Finished metadata harvest for CID: {}\n'
        'Fetched: {}\nSaved: {}'.format(collection.id, num_recs, num_saved))

    # Pop in reverse push order.
    log_handler.pop_application()
    mail_handler.pop_application()
Пример #6
0
    try:
        collection = Collection(url_api_collection)
    except Exception, e:
        msg = 'Exception in Collection {}, init {}'.format(url_api_collection,
                                                           str(e))
        logbook.error(msg)
        raise e
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')

    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler)
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
Пример #7
0
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection.

    Steps, in order: harvest via fetcher.main, enrich, save, remove
    deleted records, check ingestion counts, clean up the dashboard,
    then publish a summary message.

    user_email -- address added to the notification list; any addresses
        in EMAIL_SYS_ADMIN (comma separated) are appended to it.
    url_api_collection -- registry API URL of the collection.
    log_handler -- logbook handler; defaults to a DEBUG StderrHandler.
    mail_handler -- logbook handler; defaults to an ERROR-level
        MailHandler addressed to the notification list.
    config_file -- config path; defaults to $DPLA_CONFIG_FILE or
        'akara.ini'.
    redis_host, redis_port, redis_pswd -- if any is missing,
        config_harvest() is called with config_file.
    kwargs -- forwarded unchanged to fetcher.main.

    dir_profile, profile_path, redis_timeout, rq_queue and
    run_image_harvest are accepted but not referenced in this body.

    Raises Exception when the Collection cannot be built, when running
    on a 'prod' DATA_BRANCH with a collection that is not ready for
    publication, or when any pipeline step reports failure.
    '''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend(EMAIL_SYS_ADMIN.split(','))
    if not mail_handler:
        mail_handler = logbook.MailHandler(
            EMAIL_RETURN_ADDRESS, emails, level='ERROR', bubble=True)
    # NOTE(review): both handlers are popped only at the end of the
    # success path; an exception below leaves them pushed.
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        # The returned config is not used in this function; the call is
        # kept so any loading/validation it performs still happens.
        config_harvest(config_file=config_file)

    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(url_api_collection,
                                                           str(e))
        logbook.error(msg)
        # Bare raise keeps the original traceback (``raise e`` resets it
        # on Python 2).
        raise
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')

    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    # fetcher.main receives the full notification list, not just
    # user_email.
    ingest_doc_id, num_recs, dir_save, _harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    # Checked after the harvest but before anything is enriched/saved.
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')

    # save_records.main returns the number of saved records; a value
    # that is not >= 0 is treated as failure.
    resp = save_records.main([None, ingest_doc_id])
    if not resp >= 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))

    resp = remove_deleted_records.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))

    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))

    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if resp != 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(subject,
                          'Finished metadata harvest for CID: {}\n'
                          'Fetched: {}\nSaved: {}'.format(
                              collection.id,
                              num_recs,
                              num_saved))

    # Pop in reverse push order.
    log_handler.pop_application()
    mail_handler.pop_application()