def testMainCreatesCollectionProfile(self):
    '''Test that the main function produces a collection profile
    file for DPLA. The path to this file is needed when creating a
    DPLA ingestion document.
    '''
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=open(DIR_FIXTURES + '/collection_api_test.json').read())
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=open(DIR_FIXTURES + '/testOAI-128-records.xml').read())
    c = Collection("https://registry.cdlib.org/api/v1/collection/197/")
    with patch('dplaingestion.couch.Couch') as mock_couch:
        instance = mock_couch.return_value
        instance._create_ingestion_document.return_value = 'test-id'
        ingest_doc_id, num, self.dir_save, self.fetcher = fetcher.main(
            self.user_email,
            self.url_api_collection,
            log_handler=self.test_log_handler,
            mail_handler=self.test_log_handler,
            dir_profile=self.dir_test_profile,
            profile_path=self.profile_path,
            config_file=self.config_file)
    self.assertEqual(ingest_doc_id, 'test-id')
    self.assertEqual(num, 128)
    self.assertTrue(os.path.exists(os.path.join(self.profile_path)))
def testMainCreatesCollectionProfile(self, mock_boto3):
    '''Test that the main function produces a collection profile
    file for DPLA. The path to this file is needed when creating a
    DPLA ingestion document.
    '''
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=open(DIR_FIXTURES + '/collection_api_test.json').read())
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=open(DIR_FIXTURES + '/testOAI-128-records.xml').read())
    Collection("https://registry.cdlib.org/api/v1/collection/197/")
    with patch('dplaingestion.couch.Couch') as mock_couch:
        instance = mock_couch.return_value
        instance._create_ingestion_document.return_value = 'test-id'
        ingest_doc_id, num, self.dir_save, self.fetcher = fetcher.main(
            self.user_email,
            self.url_api_collection,
            log_handler=self.test_log_handler,
            mail_handler=self.test_log_handler,
            dir_profile=self.dir_test_profile,
            profile_path=self.profile_path,
            config_file=self.config_file)
    self.assertEqual(ingest_doc_id, 'test-id')
    self.assertEqual(num, 128)
    self.assertTrue(os.path.exists(os.path.join(self.profile_path)))
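# NOTE: a minimal, hypothetical sketch of how the mock_boto3 argument in the
# variant above could be supplied. The patch target ('boto3.resource'), the
# class name, and the use of @httpretty.activate are assumptions for
# illustration, not confirmed by the code in this file.
import unittest

import httpretty
from mock import patch  # unittest.mock on Python 3


class MainTestCase(unittest.TestCase):

    @httpretty.activate
    @patch('boto3.resource')  # injected as mock_boto3 (assumed patch target)
    def testMainCreatesCollectionProfile(self, mock_boto3):
        pass  # body as in the test above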
def testMainFn(self):
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=open(DIR_FIXTURES + '/collection_api_test.json').read())
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=open(DIR_FIXTURES + '/testOAI-128-records.xml').read())
    with patch('dplaingestion.couch.Couch') as mock_couch:
        instance = mock_couch.return_value
        instance._create_ingestion_document.return_value = 'test-id'
        ingest_doc_id, num, self.dir_save, self.harvester = fetcher.main(
            self.user_email,
            self.url_api_collection,
            log_handler=self.test_log_handler,
            mail_handler=self.test_log_handler,
            dir_profile=self.dir_test_profile,
            profile_path=self.profile_path,
            config_file=self.config_file)
    self.assertEqual(len(self.test_log_handler.records), 10)
    self.assertIn(u'[INFO] HarvestMain: Init harvester next',
                  self.test_log_handler.formatted_records[0])
    self.assertEqual(self.test_log_handler.formatted_records[1],
                     u'[INFO] HarvestMain: Create DPLA profile document')
    self.assertTrue(u'[INFO] HarvestMain: DPLA profile document' in
                    self.test_log_handler.formatted_records[2])
    self.assertEqual(self.test_log_handler.formatted_records[3],
                     u'[INFO] HarvestMain: Create ingest doc in couch')
    self.assertEqual(self.test_log_handler.formatted_records[4],
                     u'[INFO] HarvestMain: Ingest DOC ID: test-id')
    self.assertEqual(self.test_log_handler.formatted_records[5],
                     u'[INFO] HarvestMain: Start harvesting next')
    self.assertIn(
        u"[INFO] HarvestController: Starting harvest for: "
        u"[email protected] Santa Clara University: Digital Objects "
        u"['UCDL'] ['Calisphere']",
        self.test_log_handler.formatted_records[6])
    self.assertEqual(self.test_log_handler.formatted_records[7],
                     u'[INFO] HarvestController: 100 records harvested')
    self.assertEqual(self.test_log_handler.formatted_records[8],
                     u'[INFO] HarvestController: 128 records harvested')
    self.assertEqual(
        self.test_log_handler.formatted_records[9],
        u'[INFO] HarvestMain: Finished harvest of '
        u'calisphere-santa-clara-university-digital-objects. 128 '
        u'records harvested.')
def testMainFn(self, mock_boto3):
    httpretty.register_uri(
        httpretty.GET,
        "https://registry.cdlib.org/api/v1/collection/197/",
        body=open(DIR_FIXTURES + '/collection_api_test.json').read())
    httpretty.register_uri(
        httpretty.GET,
        re.compile("http://content.cdlib.org/oai?.*"),
        body=open(DIR_FIXTURES + '/testOAI-128-records.xml').read())
    with patch('dplaingestion.couch.Couch') as mock_couch:
        instance = mock_couch.return_value
        instance._create_ingestion_document.return_value = 'test-id'
        ingest_doc_id, num, self.dir_save, self.harvester = fetcher.main(
            self.user_email,
            self.url_api_collection,
            log_handler=self.test_log_handler,
            mail_handler=self.test_log_handler,
            dir_profile=self.dir_test_profile,
            profile_path=self.profile_path,
            config_file=self.config_file)
    self.assertEqual(len(self.test_log_handler.records), 10)
    self.assertIn(u'[INFO] HarvestMain: Init harvester next',
                  self.test_log_handler.formatted_records[0])
    self.assertEqual(self.test_log_handler.formatted_records[1],
                     u'[INFO] HarvestMain: Create DPLA profile document')
    self.assertTrue(u'[INFO] HarvestMain: DPLA profile document' in
                    self.test_log_handler.formatted_records[2])
    self.assertEqual(self.test_log_handler.formatted_records[3],
                     u'[INFO] HarvestMain: Create ingest doc in couch')
    self.assertEqual(self.test_log_handler.formatted_records[4],
                     u'[INFO] HarvestMain: Ingest DOC ID: test-id')
    self.assertEqual(self.test_log_handler.formatted_records[5],
                     u'[INFO] HarvestMain: Start harvesting next')
    self.assertIn(
        u"[INFO] HarvestController: Starting harvest for: "
        u"[email protected] Santa Clara University: Digital Objects "
        u"['UCDL'] ['Calisphere']",
        self.test_log_handler.formatted_records[6])
    self.assertEqual(self.test_log_handler.formatted_records[7],
                     u'[INFO] HarvestController: 100 records harvested')
    self.assertEqual(self.test_log_handler.formatted_records[8],
                     u'[INFO] HarvestController: 128 records harvested')
    self.assertEqual(
        self.test_log_handler.formatted_records[9],
        u'[INFO] HarvestMain: Finished harvest of '
        u'calisphere-santa-clara-university-digital-objects. 128 '
        u'records harvested.')
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection'''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend([u for u in EMAIL_SYS_ADMIN.split(',')])
    if not mail_handler:
        mail_handler = logbook.MailHandler(
            EMAIL_RETURN_ADDRESS, emails, level='ERROR', bubble=True)
    mail_handler.push_application()
    if not config_file:
        config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
    if not (redis_host and redis_port and redis_pswd):
        config = config_harvest(config_file=config_file)
    try:
        collection = Collection(url_api_collection)
    except Exception as e:
        msg = 'Exception in Collection {}, init {}'.format(
            url_api_collection, str(e))
        logbook.error(msg)
        raise e
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    logger = logbook.Logger('run_ingest')
    ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
        emails,
        url_api_collection,
        log_handler=log_handler,
        mail_handler=mail_handler,
        **kwargs)
    if 'prod' in os.environ['DATA_BRANCH'].lower():
        if not collection.ready_for_publication:
            raise Exception(''.join(
                ('Collection {} is not ready for publication.',
                 ' Run on stage and QA first, then set',
                 ' ready_for_publication')).format(collection.id))
    logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
    logger.info('HARVESTED {0} RECORDS'.format(num_recs))
    logger.info('IN DIR:{0}'.format(dir_save))
    resp = enrich_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error enriching records {0}".format(resp))
        raise Exception('Failed during enrichment process: {0}'.format(resp))
    logger.info('Enriched records')
    resp = save_records.main([None, ingest_doc_id])
    if not resp >= 0:
        logger.error("Error saving records {0}".format(str(resp)))
        raise Exception("Error saving records {0}".format(str(resp)))
    num_saved = resp
    logger.info("SAVED RECS : {}".format(num_saved))
    resp = remove_deleted_records.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error deleting records {0}".format(resp))
        raise Exception("Error deleting records {0}".format(resp))
    resp = check_ingestion_counts.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error checking counts {0}".format(resp))
        raise Exception("Error checking counts {0}".format(resp))
    resp = dashboard_cleanup.main([None, ingest_doc_id])
    if not resp == 0:
        logger.error("Error cleaning up dashboard {0}".format(resp))
        raise Exception("Error cleaning up dashboard {0}".format(resp))
    subject = format_results_subject(collection.id,
                                     'Harvest to CouchDB {env} ')
    publish_to_harvesting(
        subject,
        'Finished metadata harvest for CID: {}\n'
        'Fetched: {}\nSaved: {}'.format(collection.id, num_recs, num_saved))
    log_handler.pop_application()
    mail_handler.pop_application()
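# NOTE: a minimal invocation sketch for main() above. The module path
# (run_ingest, suggested by the logger name), the email address, and the
# environment values are assumptions for illustration only.
import os

from run_ingest import main  # assumed module path

# main() reads DATA_BRANCH from the environment and falls back to
# DPLA_CONFIG_FILE for its config; 'stage' and 'akara.ini' are placeholders.
os.environ.setdefault('DATA_BRANCH', 'stage')
os.environ.setdefault('DPLA_CONFIG_FILE', 'akara.ini')

main('harvester@example.edu',
     'https://registry.cdlib.org/api/v1/collection/197/')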
try:
    collection = Collection(url_api_collection)
except Exception as e:
    msg = 'Exception in Collection {}, init {}'.format(url_api_collection,
                                                       str(e))
    logbook.error(msg)
    raise e
if not log_handler:
    log_handler = logbook.StderrHandler(level='DEBUG')
log_handler.push_application()
logger = logbook.Logger('run_ingest')
ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
    emails,
    url_api_collection,
    log_handler=log_handler,
    mail_handler=mail_handler)
if 'prod' in os.environ['DATA_BRANCH'].lower():
    if not collection.ready_for_publication:
        raise Exception(''.join(
            ('Collection {} is not ready for publication.',
             ' Run on stage and QA first, then set',
             ' ready_for_publication')).format(collection.id))
logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
logger.info('HARVESTED {0} RECORDS'.format(num_recs))
logger.info('IN DIR:{0}'.format(dir_save))
resp = enrich_records.main([None, ingest_doc_id])
if not resp == 0:
    logger.error("Error enriching records {0}".format(resp))
    raise Exception('Failed during enrichment process: {0}'.format(resp))