Пример #1
0
 def testAddRegistryData(self):
     '''Check that _add_registry_data attaches registry info to an object.

     The collection entry is added to the object itself, while the
     campus and repository data are expected inside that collection
     entry and not at the top level of the object.
     '''
     url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
     registry_json = open(DIR_FIXTURES + '/collection_api_test.json').read()
     httpretty.register_uri(httpretty.GET,
                            url_collection,
                            body=registry_json)
     oai_xml = open(DIR_FIXTURES + '/testOAI-128-records.xml').read()
     httpretty.register_uri(httpretty.GET,
                            re.compile("http://content.cdlib.org/oai?.*"),
                            body=oai_xml)
     collection = Collection(url_collection)
     # drop the config created in setUp and rebuild it for this collection
     self.tearDown_config()
     self.setUp_config(collection)
     controller = fetcher.HarvestController(
         '*****@*****.**',
         collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     record = {'id': 'fakey', 'otherdata': 'test'}
     self.assertNotIn('collection', record)
     controller._add_registry_data(record)
     self.assertIn('collection', record)
     registry_entry = record['collection'][0]
     self.assertEqual(registry_entry['@id'],
                      'https://registry.cdlib.org/api/v1/collection/197/')
     # campus and repository live on the collection entry only
     self.assertNotIn('campus', record)
     self.assertIn('campus', registry_entry)
     self.assertNotIn('repository', record)
     self.assertIn('repository', registry_entry)
     # TODO: still need a case for a collection without a campus
     self.assertEqual(registry_entry['campus'][0]['@id'],
                      'https://registry.cdlib.org/api/v1/campus/12/')
     self.assertEqual(registry_entry['repository'][0]['@id'],
                      'https://registry.cdlib.org/api/v1/repository/37/')
Пример #2
0
 def testLoggingMoreThan1000(self):
     '''Harvest the 2400 record OAI fixture and check the progress log.

     Asserts 13 log records in total, with progress messages reported
     at 100, 1000, 2000 and 2400 records.
     '''
     url_collection = 'https://registry.cdlib.org/api/v1/collection/198/'
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_big_test.json').read())
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=open(DIR_FIXTURES + '/testOAI-2400-records.xml').read())
     collection = Collection(url_collection)
     controller = fetcher.HarvestController(
         '*****@*****.**',
         collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     controller.harvest()
     handler = self.test_log_handler
     self.assertEqual(len(handler.records), 13)
     self.assertEqual(handler.formatted_records[1],
                      '[INFO] HarvestController: 100 records harvested')
     # harvested files are no longer needed; only the log is checked below
     shutil.rmtree(controller.dir_save)
     later_messages = (
         (10, '[INFO] HarvestController: 1000 records harvested'),
         (11, '[INFO] HarvestController: 2000 records harvested'),
         (12, '[INFO] HarvestController: 2400 records harvested'),
     )
     for position, expected in later_messages:
         self.assertEqual(handler.formatted_records[position], expected)
Пример #3
0
 def testCreateProfile(self):
     '''Test the creation of a DPLA style profile file.

     Checks both the JSON string form (dpla_profile) and the dict form
     (dpla_profile_obj) exposed by the Collection.
     '''
     url_collection = 'https://registry.cdlib.org/api/v1/collection/178'
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test_oac.json').read())
     c = Collection(url_collection)

     # string form of the profile
     self.assertTrue(hasattr(c, 'dpla_profile'))
     self.assertIsInstance(c.dpla_profile, str)
     profile = json.loads(c.dpla_profile)
     self.assertEqual(profile['name'], '178')
     self.assertEqual(profile['enrichments_coll'], ['/compare_with_schema'])
     self.assertTrue('enrichments_item' in profile)
     item_enrichments = profile['enrichments_item']
     self.assertIsInstance(item_enrichments, list)
     self.assertEqual(len(item_enrichments), 30)
     self.assertIn('contributor', profile)
     contributors = profile['contributor']
     self.assertIsInstance(contributors, list)
     self.assertEqual(len(contributors), 4)
     expected_campus = {u'@id': u'/api/v1/campus/1/', u'name': u'UCB'}
     self.assertEqual(contributors[1], expected_campus)

     # dict form of the profile
     self.assertTrue(hasattr(c, 'dpla_profile_obj'))
     self.assertIsInstance(c.dpla_profile_obj, dict)
     obj_enrichments = c.dpla_profile_obj['enrichments_item']
     self.assertIsInstance(obj_enrichments, list)
     self.assertEqual(obj_enrichments[0], '/oai-to-dpla')
     self.assertEqual(
         obj_enrichments[1],
         '/shred?prop=sourceResource/contributor%2CsourceResource/creator%2CsourceResource/date'
     )
Пример #4
0
 def testMainCreatesCollectionProfile(self, mock_boto3):
     '''Test that fetcher.main writes a collection profile file for DPLA.

     The path to this file is needed when creating a DPLA ingestion
     document. The Couch class is patched so that the ingestion document
     id returned is the fixed value 'test-id'.
     '''
     url_collection = "https://registry.cdlib.org/api/v1/collection/197/"
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test.json').read())
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=open(DIR_FIXTURES + '/testOAI-128-records.xml').read())
     Collection(url_collection)
     main_kwargs = dict(log_handler=self.test_log_handler,
                        mail_handler=self.test_log_handler,
                        dir_profile=self.dir_test_profile,
                        profile_path=self.profile_path,
                        config_file=self.config_file)
     with patch('dplaingestion.couch.Couch') as mock_couch:
         couch_instance = mock_couch.return_value
         couch_instance._create_ingestion_document.return_value = 'test-id'
         result = fetcher.main(self.user_email, self.url_api_collection,
                               **main_kwargs)
         # dir_save and fetcher are kept on self for later use
         ingest_doc_id, num, self.dir_save, self.fetcher = result
     self.assertEqual(ingest_doc_id, 'test-id')
     self.assertEqual(num, 128)
     self.assertTrue(os.path.exists(os.path.join(self.profile_path)))
Пример #5
0
 def testMainHarvestController__init__Error(self, mock_method):
     '''Test the try-except block in main when HarvestController is not
     created correctly.

     mock_method comes from a patch decorator outside this method;
     presumably it makes the controller constructor raise "Boom!" --
     the log assertions below depend on that message.
     '''
     url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test.json').read())
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=open(DIR_FIXTURES + '/testOAI-128-records.xml').read())
     sys.argv = ['thisexe', '*****@*****.**', url_collection]
     with self.assertRaises(Exception):
         fetcher.main(self.user_email,
                      self.url_api_collection,
                      log_handler=self.test_log_handler,
                      mail_handler=self.test_log_handler,
                      dir_profile=self.dir_test_profile)
     self.assertEqual(len(self.test_log_handler.records), 4)
     error_line = self.test_log_handler.formatted_records[3]
     self.assertTrue(
         "[ERROR] HarvestMain: Exception in harvester init" in error_line)
     self.assertTrue("Boom!" in error_line)
     # remove the profile file left behind for this collection
     c = Collection(url_collection)
     profile_file = os.path.join(self.dir_test_profile, c.id + '.pjs')
     os.remove(os.path.abspath(profile_file))
def update_collection_from_remote(url_remote_couchdb,
                                  url_api_collection,
                                  delete_first=True):
    '''Update a collection from a remote couchdb.

    :param url_remote_couchdb: url of the couchdb to copy documents from
    :param url_api_collection: registry API url for the collection
    :param delete_first: when true, delete the collection from the current
        environment before copying
    :returns: tuple of (number of doc ids, number updated, number created)
    :raises Exception: when DATA_BRANCH contains 'prod' and the collection
        is not ready for publication
    '''
    if delete_first:
        # The collection id is the last path segment. Strip any trailing
        # slash first so that both ".../collection/197/" and
        # ".../collection/197" give "197" (the latter used to give
        # "collection").
        collection_id = url_api_collection.rstrip('/').rsplit('/', 1)[-1]
        delete_collection(collection_id)
    collection = Collection(url_api_collection)
    # guard against updating production for not ready_for_publication
    # collections
    # NOTE(review): with delete_first the delete above has already run when
    # this guard fires -- confirm that ordering is intended.
    if 'prod' in environ.get('DATA_BRANCH', ''):
        if not collection.ready_for_publication:
            raise Exception(
                'In PRODUCTION ENV and collection {} not ready for '
                'publication'.format(collection.id))
    doc_ids = get_collection_doc_ids(collection.id, url_remote_couchdb)
    couchdb_remote = get_couchdb(url_remote_couchdb)
    couchdb_env = get_couchdb()
    created = 0
    updated = 0

    for doc_id in doc_ids:
        msg = update_from_remote(doc_id,
                                 couchdb_remote=couchdb_remote,
                                 couchdb_env=couchdb_env)
        # any message not mentioning 'created' is counted as an update
        if 'created' in msg:
            created += 1
        else:
            updated += 1

    return len(doc_ids), updated, created
Пример #7
0
    def setUp(self):
        '''Build an OAI HarvestController against stubbed registry and OAI
        endpoints, load the object set fixture, and replace
        datetime.datetime with a subclass whose now() is fixed.

        The original datetime class is kept in self.old_dt.
        '''
        super(HarvestControllerTestCase, self).setUp()
        url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
        httpretty.register_uri(
            httpretty.GET,
            url_collection,
            body=open(DIR_FIXTURES + '/collection_api_test.json').read())
        httpretty.register_uri(
            httpretty.GET,
            re.compile("http://content.cdlib.org/oai?.*"),
            body=open(DIR_FIXTURES + '/testOAI-128-records.xml').read())
        self.collection = Collection(url_collection)
        cfg_file, path_profile = self.setUp_config(self.collection)
        self.controller_oai = fetcher.HarvestController(
            '*****@*****.**',
            self.collection,
            profile_path=path_profile,
            config_file=cfg_file)
        fixture_objset = open(DIR_FIXTURES + '/objset_test_doc.json')
        self.objset_test_doc = json.load(fixture_objset)

        # keep the real class so it can be put back later
        self.old_dt = datetime.datetime

        class myNow(datetime.datetime):
            '''datetime whose now() always returns 2017-07-14 12:01.'''

            @classmethod
            def now(cls):
                return cls(2017, 7, 14, 12, 1)

        datetime.datetime = myNow
Пример #8
0
 def testCollectionNoEnrichItems(self):
     '''Accessing dpla_profile_obj raises ValueError for the
     "no enrich item" registry fixture.'''
     url_collection = "https://registry.cdlib.org/api/v1/collection/36/"
     fixture_path = DIR_FIXTURES + '/collection_api_no_enrich_item.json'
     httpretty.register_uri(httpretty.GET,
                            url_collection,
                            body=open(fixture_path).read())
     coll = Collection(url_collection)
     # the attribute lookup itself is what must raise
     self.assertRaises(ValueError, getattr, coll, 'dpla_profile_obj')
Пример #9
0
    def testNuxeoHarvest(self, mock_deepharvest, mock_boto, mock_boto3):
        '''Test the function of the Nuxeo harvest.

        The registry, the Nuxeo document API, the deep harvest object
        list and the media json lookup are all stubbed from fixtures.
        Five objects are expected, and the first saved record is checked
        for the collection data filled in by the controller.
        '''
        url_collection = 'http://registry.cdlib.org/api/v1/collection/19/'
        media_json = open(DIR_FIXTURES + '/nuxeo_media_structmap.json').read()
        mock_bucket = mock_boto.return_value.get_bucket.return_value
        mock_key = mock_bucket.get_key.return_value
        mock_key.get_contents_as_string.return_value = media_json
        httpretty.register_uri(
            httpretty.GET,
            url_collection,
            body=open(DIR_FIXTURES + '/collection_api_test_nuxeo.json').read())
        object_list = json.load(open(DIR_FIXTURES + '/nuxeo_object_list.json'))
        mock_deepharvest.return_value.fetch_objects.return_value = object_list
        httpretty.register_uri(
            httpretty.GET,
            re.compile('https://example.edu/Nuxeo/site/api/v1/id/.*'),
            body=open(DIR_FIXTURES + '/nuxeo_doc.json').read())

        self.collection = Collection(url_collection)
        with patch('ConfigParser.SafeConfigParser',
                   autospec=True) as mock_configparser:
            parser_instance = mock_configparser.return_value
            parser_instance.get.return_value = \
                'dublincore,ucldc_schema,picture'
            self.setUp_config(self.collection)
            self.controller = fetcher.HarvestController(
                '*****@*****.**',
                self.collection,
                config_file=self.config_file,
                profile_path=self.profile_path)
        self.assertTrue(hasattr(self.controller, 'harvest'))
        num = self.controller.harvest()
        self.assertEqual(num, 5)
        self.tearDown_config()

        # verify one record has collection and such filled in
        dir_save = self.controller.dir_save
        first_file = os.path.join(dir_save, os.listdir(dir_save)[0])
        saved_obj = json.load(open(first_file))[0]
        expected_collection = (
            ('@id', u'http://registry.cdlib.org/api/v1/collection/19/'),
            ('name', u'Cochems (Edward W.) Photographs'),
            ('title', u'Cochems (Edward W.) Photographs'),
            ('id', u'19'),
            ('dcmi_type', 'I'),
            ('rights_statement', 'a sample rights statement'),
            ('rights_status', 'PD'),
        )
        for field, expected in expected_collection:
            self.assertEqual(saved_obj['collection'][0][field], expected)
        self.assertEqual(saved_obj['state'], 'project')
        self.assertEqual(
            saved_obj['title'],
            'Adeline Cochems having her portrait taken by her father '
            'Edward W, Cochems in Santa Ana, California: Photograph')
def main(url_remote_couchdb, url_api_collection):
    '''Sync a collection from a remote couchdb into the current
    environment's couchdb, then publish a summary of the document counts.

    :param url_remote_couchdb: url of the couchdb to copy from
    :param url_api_collection: registry API url for the collection
    '''
    collection = Collection(url_api_collection)
    counts = update_collection_from_remote(url_remote_couchdb,
                                           url_api_collection)
    total, updated, created = counts
    summary = ('Synced {} documents to production for CouchDB '
               'collection {}').format(total, collection.id)
    detail = '\nUpdated {} documents, created {} documents.'.format(
        updated, created)
    subject = 'Synced CouchDB Collection {}'.format(collection.id)
    publish_to_harvesting(subject, summary + detail)
Пример #11
0
def get_id_on_queue_and_run(queue):
    '''Take collection ids off the queue and run fix_registry_data over
    each collection's couchdb documents.

    For every id the registry data is built once with a throwaway
    HarvestController and handed to the worker.

    NOTE(review): the loop stops on a falsy id; if `queue` raises from
    get_nowait() when empty, that exception is what ends this function --
    confirm against the caller.
    '''
    cdbworker = CouchDBWorker()
    while True:
        cid = queue.get_nowait()
        if not cid:
            break
        registry_collection = Collection(url_api_base + cid)
        controller = HarvestController('*****@*****.**',
                                       registry_collection)
        couch_collection = controller._add_registry_data({})['collection']
        # the controller is only needed to build the registry data
        del controller
        print("STARTING COLLECTION: {}".format(cid))
        cdbworker.run_by_collection(cid, fix_registry_data,
                                    couch_collection, cdbworker._couchdb)
        print("FINISHED COLLECTION: {}".format(cid))
def update_collection_description(doc):
    '''Fill in a missing collection description on a document.

    When the first collection entry of doc['originalRecord'] has no
    'description' key, the registry collection is looked up (through
    C_CACHE, keyed by the collection '@id') and its description -- or its
    name when the description is empty -- is written to the first
    collection entry of both originalRecord and sourceResource.

    :param doc: document dict, modified in place
    :returns: the same doc
    '''
    cjson = doc['originalRecord']['collection'][0]
    if 'description' in cjson:
        # nothing to do, the description is already there
        return doc
    collection_url = cjson['@id']
    if collection_url in C_CACHE:
        c = C_CACHE[collection_url]
    else:
        c = Collection(url_api=collection_url)
        C_CACHE[collection_url] = c
    description = c['description'] if c['description'] else c['name']
    # Report the value actually stored. c['description'] can be empty or
    # None here (that is why the name fallback exists), and calling
    # .encode on None would raise.
    print('DOC: {} DESCRIP: {}'.format(doc['_id'],
                                       description.encode('utf8')))
    doc['originalRecord']['collection'][0]['description'] = description
    doc['sourceResource']['collection'][0]['description'] = description
    return doc
Пример #13
0
 def testOAICollectionAPI(self):
     '''An OAI collection exposes registry values both by key and as
     attributes, along with its campus list.'''
     url_collection = 'https://registry.cdlib.org/api/v1/collection/197'
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test.json').read())
     c = Collection(url_collection)
     expected_values = (
         ('harvest_type', 'OAI'),
         ('name', 'Calisphere - Santa Clara University: Digital Objects'),
         ('url_oai', 'fixtures/testOAI-128-records.xml'),
     )
     for field, expected in expected_values:
         # item access first, then attribute access, for each field
         self.assertEqual(c[field], expected)
         self.assertEqual(getattr(c, field), expected)
     first_campus = c.campus[0]
     self.assertEqual(first_campus['resource_uri'], '/api/v1/campus/12/')
     self.assertEqual(first_campus['slug'], 'UCDL')
Пример #14
0
 def testOACApiCollection(self):
     '''An OAC collection exposes registry values both by key and as
     attributes, along with campus, type and rights data.'''
     url_collection = 'https://registry.cdlib.org/api/v1/collection/178'
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test_oac.json').read())
     c = Collection(url_collection)
     expected_values = (
         ('harvest_type', 'OAJ'),
         ('name', 'Harry Crosby Collection'),
         ('url_oac', 'fixtures/testOAC.json'),
     )
     for field, expected in expected_values:
         # item access first, then attribute access, for each field
         self.assertEqual(c[field], expected)
         self.assertEqual(getattr(c, field), expected)
     first_campus = c.campus[0]
     self.assertEqual(first_campus['resource_uri'], '/api/v1/campus/6/')
     self.assertEqual(first_campus['slug'], 'UCSD')
     self.assertEqual(c.dcmi_type, 'I')
     self.assertEqual(c.rights_statement, "a sample rights statement")
     self.assertEqual(c.rights_status, "PD")
Пример #15
0
 def setUp(self):
     '''Prepare the profile directory, stubbed registry endpoint,
     sys.argv, collection, config and a logbook mail handler for the
     main() tests.'''
     super(MainTestCase, self).setUp()
     self.dir_save = None
     self.dir_test_profile = '/tmp/profiles/test'
     if not os.path.isdir(self.dir_test_profile):
         os.makedirs(self.dir_test_profile)
     self.user_email = '*****@*****.**'
     self.url_api_collection = (
         "https://registry.cdlib.org/api/v1/collection/197/")
     httpretty.register_uri(
         httpretty.GET,
         self.url_api_collection,
         body=open(DIR_FIXTURES + '/collection_api_test.json').read())
     # the argument list as if the script were run from the command line
     sys.argv = ['thisexe', self.user_email, self.url_api_collection]
     self.collection = Collection(self.url_api_collection)
     self.setUp_config(self.collection)
     self.mail_handler = logbook.TestHandler(bubble=True)
     self.mail_handler.push_thread()
def main(collection_ids,
         rq_queue='dh-q',
         config=None,
         pynuxrc=None,
         replace=False,
         timeout=JOB_TIMEOUT,
         log_handler=None):
    ''' Queue a deep harvest of a nuxeo object on a worker

    For every collection, each top level object and each of its
    components is queued as a separate deep harvest job.

    :param collection_ids: registry collection ids separated by ';'
    :param rq_queue: name of the queue the jobs are put on
    :param config: mapping providing 'redis_host', 'redis_port',
        'redis_password' and 'redis_connect_timeout'. It defaults to None
        but is indexed as soon as an object is queued, so callers need to
        supply it.
    :param pynuxrc: passed through to DeepHarvestNuxeo
    :param replace: passed through to each queued job
    :param timeout: passed through to each queued job
    :param log_handler: logbook handler to use; a DEBUG level stderr
        handler is created when none is given
    '''
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()
    # pop the handler even when queueing fails part way through, so it
    # does not stay on the application stack
    try:
        log = logbook.Logger('QDH')

        def _queue_path(path):
            '''Queue one deep harvest job for the given nuxeo path.'''
            queue_deep_harvest_path(config['redis_host'],
                                    config['redis_port'],
                                    config['redis_password'],
                                    config['redis_connect_timeout'],
                                    rq_queue=rq_queue,
                                    path=path,
                                    replace=replace,
                                    timeout=timeout)

        for cid in collection_ids.split(';'):
            url_api = ''.join(
                ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
            coll = Collection(url_api)

            dh = DeepHarvestNuxeo(coll.harvest_extra_data,
                                  '',
                                  pynuxrc=pynuxrc)

            for obj in dh.fetch_objects():
                log.info('Queueing TOPLEVEL {} :-: {}'.format(
                    obj['uid'], obj['path']))
                # deep harvest top level object
                _queue_path(obj['path'])
                # deep harvest component sub-objects
                for c in dh.fetch_components(obj):
                    log.info('Queueing {} :-: {}'.format(
                        c['uid'], c['path']))
                    _queue_path(c['path'])
    finally:
        log_handler.pop_application()
Пример #17
0
 def testMARCHarvest(self, mock_boto3):
     '''Test the function of the MARC harvest.

     The collection's url_harvest is pointed at the local marc-test
     fixture and ten records are expected from the harvest.
     '''
     url_collection = 'http://registry.cdlib.org/api/v1/collection/'
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test_marc.json').read())
     self.collection = Collection(url_collection)
     # harvest from the fixture file instead of a remote source
     self.collection.url_harvest = 'file:' + DIR_FIXTURES + '/marc-test'
     self.setUp_config(self.collection)
     self.controller = fetcher.HarvestController(
         '*****@*****.**',
         self.collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     self.assertTrue(hasattr(self.controller, 'harvest'))
     harvested = self.controller.harvest()
     self.assertEqual(harvested, 10)
     self.tearDown_config()
Пример #18
0
 def testFailsIfNoRecords(self):
     '''Test that the Controller throws an error if no records come back
     from fetcher
     '''
     url_collection = 'https://registry.cdlib.org/api/v1/collection/101/'
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test.json').read())
     # the OAI endpoint answers with a response holding no records
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=open(DIR_FIXTURES + '/testOAI-no-records.xml').read())
     collection = Collection(url_collection)
     controller = fetcher.HarvestController(
         '*****@*****.**',
         collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     with self.assertRaises(fetcher.NoRecordsFetchedException):
         controller.harvest()
Пример #19
0
 def setUp(self):
     '''Create a HarvestController for the OAC JSON collection fixture,
     with the registry and OAC search endpoints stubbed.'''
     super(HarvestOAC_JSON_ControllerTestCase, self).setUp()
     url_collection = 'https://registry.cdlib.org/api/v1/collection/178/'
     url_oac_search = (
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/tf2v19n928')
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test_oac.json').read())
     httpretty.register_uri(
         httpretty.GET,
         url_oac_search,
         body=open(DIR_FIXTURES + '/testOAC.json').read())
     self.collection = Collection(url_collection)
     self.setUp_config(self.collection)
     self.controller = fetcher.HarvestController(
         '*****@*****.**',
         self.collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
Пример #20
0
 def testOAIHarvest(self):
     '''Test the function of the OAI harvest'''
     url_collection = 'http://registry.cdlib.org/api/v1/collection/'
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test.json').read())
     httpretty.register_uri(
         httpretty.GET,
         'http://content.cdlib.org/oai',
         body=open(DIR_FIXTURES + '/testOAC-url_next-0.xml').read())
     self.collection = Collection(url_collection)
     self.setUp_config(self.collection)
     self.controller = fetcher.HarvestController(
         '*****@*****.**',
         self.collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     # only the presence of the harvest entry point is checked here
     self.assertTrue(hasattr(self.controller, 'harvest'))
     # TODO: fix why logbook.TestHandler not working for previous logging
     # self.assertEqual(len(self.test_log_handler.records), 2)
     self.tearDown_config()
Пример #21
0
 def testSaveToS3(self, mock_boto3):
     '''save_objset_s3 writes the object set as one json line to the
     ucldc-ingest bucket, under the controller's dated s3 path.'''
     url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test.json').read())
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=open(DIR_FIXTURES + '/testOAI-128-records.xml').read())
     collection = Collection(url_collection)
     controller = fetcher.HarvestController(
         '*****@*****.**',
         collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     controller.save_objset_s3({"xxxx": "yyyy"})
     mock_boto3.assert_called_with('s3')
     # walk the mock chain: resource -> Bucket(...) -> put_object(...)
     mock_resource = mock_boto3.return_value
     mock_resource.Bucket.assert_called_with('ucldc-ingest')
     mock_bucket = mock_resource.Bucket.return_value
     mock_bucket.put_object.assert_called_with(
         Body='{"xxxx": "yyyy"}\n',
         Key='data-fetched/197/2017-07-14-1201/page-0.jsonl')
Пример #22
0
def main(user_email,
         url_api_collections,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file='akara.ini',
         rq_queue=None,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection

    Queues an image harvest job for each collection url given.

    :param user_email: address added to the error mail recipients
    :param url_api_collections: registry collection urls separated by ';'
    :param log_handler: logbook handler; a DEBUG level stderr handler is
        created when none is given
    :param mail_handler: logbook handler for error mail; a MailHandler to
        user_email plus EMAIL_SYS_ADMIN is created when none is given
    :param dir_profile: not used in this function
    :param profile_path: not used in this function
    :param config_file: passed to config_harvest for the redis settings
    :param rq_queue: queue name passed to queue_image_harvest
    :param kwargs: passed through to queue_image_harvest
    :raises Exception: whatever Collection() raised, after logging it
    '''
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend(EMAIL_SYS_ADMIN.split(','))
    if not mail_handler:
        mail_handler = logbook.MailHandler(EMAIL_RETURN_ADDRESS,
                                           emails,
                                           level='ERROR',
                                           bubble=True)
    mail_handler.push_application()
    config = config_harvest(config_file=config_file)
    if not log_handler:
        log_handler = logbook.StderrHandler(level='DEBUG')
    log_handler.push_application()

    for url_api_collection in url_api_collections.split(';'):
        try:
            collection = Collection(url_api_collection)
        except Exception as e:
            msg = 'Exception in Collection {}, init {}'.format(
                url_api_collection, str(e))
            logbook.error(msg)
            # bare raise keeps the original traceback
            raise
        queue_image_harvest(config['redis_host'],
                            config['redis_port'],
                            config['redis_password'],
                            config['redis_connect_timeout'],
                            rq_queue=rq_queue,
                            collection_key=collection.id,
                            object_auth=collection.auth,
                            **kwargs)
Пример #23
0
 def setUp(self):
     '''Create a HarvestController for the OAC XML collection fixture,
     with the registry and OAC search endpoints stubbed, and print the
     controller's save directory.'''
     super(HarvestOAC_XML_ControllerTestCase, self).setUp()
     url_collection = 'https://registry.cdlib.org/api/v1/collection/178/'
     url_oac_search = (
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/tf0c600134')
     registry_fixture = DIR_FIXTURES + '/collection_api_test_oac_xml.json'
     httpretty.register_uri(httpretty.GET,
                            url_collection,
                            body=open(registry_fixture).read())
     httpretty.register_uri(
         httpretty.GET,
         url_oac_search,
         body=open(DIR_FIXTURES + '/testOAC-url_next-0.xml').read())
     self.collection = Collection(url_collection)
     self.setUp_config(self.collection)
     self.controller = fetcher.HarvestController(
         '*****@*****.**',
         self.collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     print("DIR SAVE::::: {}".format(self.controller.dir_save))
Пример #24
0
 def setUp(self):
     '''Create a HarvestController for the Solr harvest collection
     fixture, with the registry (GET) and Solr select (POST) endpoints
     stubbed, and print the controller's save directory.'''
     super(HarvestSolr_ControllerTestCase, self).setUp()
     url_collection = 'https://registry.cdlib.org/api/v1/collection/183/'
     registry_fixture = DIR_FIXTURES + '/collection_api_solr_harvest.json'
     httpretty.register_uri(httpretty.GET,
                            url_collection,
                            body=open(registry_fixture).read())
     solr_fixture = (DIR_FIXTURES +
                     '/ucsd-new-feed-missions-bb3038949s-0.xml')
     httpretty.register_uri(httpretty.POST,
                            'http://example.edu/solr/blacklight/select',
                            body=open(solr_fixture).read())
     self.collection = Collection(url_collection)
     self.setUp_config(self.collection)
     self.controller = fetcher.HarvestController(
         '*****@*****.**',
         self.collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     print("DIR SAVE::::: {}".format(self.controller.dir_save))
def add_rights_and_type_to_collection(doc):
    '''Copy rights_status, rights_statement and dcmi_type from the
    registry collection onto a document's collection entries.

    The registry collection is looked up through C_CACHE, keyed by the
    '@id' of the first originalRecord collection entry. When sourceResource
    has no 'collection' key it is pointed at the originalRecord collection
    list instead of being updated field by field.

    :param doc: document dict, modified in place
    :returns: the same doc
    '''
    registry_fields = ('rights_status', 'rights_statement', 'dcmi_type')
    original_entry = doc['originalRecord']['collection'][0]
    collection_url = original_entry['@id']
    # fetch the registry collection once per url
    if collection_url not in C_CACHE:
        C_CACHE[collection_url] = Collection(url_api=collection_url)
    c = C_CACHE[collection_url]
    for field in registry_fields:
        original_entry[field] = c[field]
    source = doc['sourceResource']
    if 'collection' not in source:
        source['collection'] = doc['originalRecord']['collection']
        return doc
    source_entry = source['collection'][0]
    for field in registry_fields:
        source_entry[field] = c[field]
    return doc
Пример #26
0
def main(args):
    '''Queue the stored item enrichments for one or more collections.

    :param args: list of command line arguments. Exactly one of
        --collection_id or --cid_file is required; --rq_queue optionally
        overrides the 'normal-stage' queue.
    '''
    parser = argparse.ArgumentParser(
        description='run the enrichments stored for a collection.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--collection_id',
                       help='Registry id for the collection')
    group.add_argument('--cid_file',
                       help='File with collection ids for running')
    parser.add_argument(
        '--rq_queue',
        help='Override queue for jobs, normal-stage is default')

    args = parser.parse_args(args)
    queue_name = args.rq_queue or 'normal-stage'
    enq = CouchDBJobEnqueue(queue_name)
    timeout = 10000

    if args.collection_id:
        cids = [args.collection_id]
    else:
        # one collection id per line; blank lines (such as a trailing
        # newline at the end of the file) are skipped so they do not turn
        # into empty ids and malformed registry urls
        with open(args.cid_file) as cid_file:
            stripped = (line.strip() for line in cid_file)
            cids = [cid for cid in stripped if cid]
    print("CIDS:{}".format(cids))

    for cid in cids:
        url_api = ''.join(
            ('https://registry.cdlib.org/api/v1/collection/', cid, '/'))
        coll = Collection(url_api)
        print(coll.id)
        enrichments = coll.enrichments_item
        enq.queue_collection(
            cid, timeout,
            harvester.post_processing.enrich_existing_couch_doc.main,
            enrichments)
Пример #27
0
    def testNuxeoCollectionAuth(self):
        '''Test that a Nuxeo harvest collection returns an
        authentication tuple, not None
        '''
        httpretty.register_uri(
            httpretty.GET,
            'https://registry.cdlib.org/api/v1/collection/19',
            body=open(DIR_FIXTURES +
                      '/registry_api_collection_nuxeo.json').read())
        c = Collection('https://registry.cdlib.org/api/v1/collection/19')
        self.assertTrue(c.harvest_type, 'NUX')
        defaultrc = """\
[nuxeo_account]
user = TestUser
password = TestPass

[platform_importer]
base = http://localhost:8080/nuxeo/site/fileImporter
"""

        with patch('__builtin__.open') as fakeopen:
            fakeopen.return_value = StringIO.StringIO(defaultrc)
            self.assertEqual(c.auth[0], 'TestUser')
            self.assertEqual(c.auth[1], 'TestPass')
Пример #28
0
 def testHarvestControllerExists(self):
     '''A controller built for the OAI collection fixture has an
     OAIFetcher, the expected attributes and the dated s3 path.'''
     url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
     httpretty.register_uri(
         httpretty.GET,
         url_collection,
         body=open(DIR_FIXTURES + '/collection_api_test.json').read())
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=open(DIR_FIXTURES + '/testOAI-128-records.xml').read())
     collection = Collection(url_collection)
     controller = fetcher.HarvestController(
         '*****@*****.**',
         collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     self.assertTrue(hasattr(controller, 'fetcher'))
     self.assertIsInstance(controller.fetcher, fetcher.OAIFetcher)
     for attribute in ('campus_valid', 'dc_elements', 'datetime_start'):
         self.assertTrue(hasattr(controller, attribute))
     print(controller.s3path)
     self.assertEqual(controller.s3path,
                      'data-fetched/197/2017-07-14-1201/')
     # remove the directory the controller created for saving
     shutil.rmtree(controller.dir_save)
Пример #29
0
     cdb = get_couchdb(url_couchdb=couchdb_url, dbname='ucldc')
 else:
     cdb = get_couchdb(dbname='ucldc')
 collections = get_indexed_collection_list(SOLR)
 date_to_minute = datetime.datetime.now().strftime('%Y%m%d-%H%M')
 fname = 'indexed_collections-{}.csv'.format(date_to_minute)
 with open(fname, 'wb') as csvfile:
     csvwriter = UnicodeWriter(csvfile)
     csvwriter.writerow(
         ('Collection Name', 'Collection URL', 'Number in index',
          'Number in couchdb', 'Number in OAC', 'Couch missing in solr',
          'OAC missing in couch', 'Repository Name', 'Repository URL',
          'Campus'))
     for c_url, num in collections:
         try:
             c = Collection(c_url)
         except ValueError, e:
             print "NO COLLECTION FOR :{}".format(c_url)
             continue
         couch_count = get_couch_count(cdb, c.id)
         solr_equal_couch = False
         if couch_count == num:
             solr_equal_couch = True
         oac_num = None
         couch_equal_oac = None
         if c.harvest_type == 'OAC':
             fetcher = OAC_XML_Fetcher(c.url_harvest, c.harvest_extra_data)
             oac_num = fetcher.totalDocs
             if couch_count == oac_num:
                 couch_equal_oac = True
             else:
Пример #30
0
def main(user_email,
         url_api_collection,
         log_handler=None,
         mail_handler=None,
         dir_profile='profiles',
         profile_path=None,
         config_file=None,
         redis_host=None,
         redis_port=None,
         redis_pswd=None,
         redis_timeout=600,
         rq_queue=None,
         run_image_harvest=False,
         **kwargs):
    '''Runs a UCLDC ingest process for the given collection.

    The stages run in order: fetch, enrich, save, remove deleted records,
    check ingestion counts, dashboard cleanup, then a summary message is
    published. Any stage that reports failure raises an Exception.

    user_email -- address that receives error mail; addresses from
        EMAIL_SYS_ADMIN (comma separated) are appended when it is set.
    url_api_collection -- registry API URL used to build the Collection
        and passed on to fetcher.main.
    log_handler -- logbook handler to use; defaults to a DEBUG level
        StderrHandler.
    mail_handler -- logbook handler for error mail; defaults to a
        MailHandler at ERROR level sending to the collected addresses.
    config_file -- config path; defaults to $DPLA_CONFIG_FILE, then
        'akara.ini'.
    redis_host, redis_port, redis_pswd -- when any of them is falsy the
        harvest config is loaded from config_file.
    dir_profile, profile_path, redis_timeout, rq_queue, run_image_harvest
        -- accepted for interface compatibility; not used in this function.
    **kwargs -- forwarded unchanged to fetcher.main.

    Raises whatever Collection() raises (after logging it), and Exception
    when the collection is not ready for publication on a 'prod' data
    branch or when a stage returns a failure code.

    Both handlers are always popped off the logbook application stack on
    exit, including when an exception propagates.
    '''
    cleanup_work_dir()  # remove files from /tmp
    emails = [user_email]
    if EMAIL_SYS_ADMIN:
        emails.extend(EMAIL_SYS_ADMIN.split(','))
    if not mail_handler:
        mail_handler = logbook.MailHandler(EMAIL_RETURN_ADDRESS,
                                           emails,
                                           level='ERROR',
                                           bubble=True)
    mail_handler.push_application()
    try:
        if not config_file:
            config_file = os.environ.get('DPLA_CONFIG_FILE', 'akara.ini')
        if not (redis_host and redis_port and redis_pswd):
            # The returned config is not used below; the call is kept so
            # any error it raises for a bad config still surfaces here.
            # NOTE(review): confirm whether this load is still required.
            config_harvest(config_file=config_file)

        try:
            collection = Collection(url_api_collection)
        except Exception as e:
            msg = 'Exception in Collection {}, init {}'.format(
                url_api_collection, str(e))
            logbook.error(msg)
            # bare raise keeps the original traceback intact
            raise
        if not log_handler:
            log_handler = logbook.StderrHandler(level='DEBUG')

        log_handler.push_application()
        try:
            logger = logbook.Logger('run_ingest')
            ingest_doc_id, num_recs, dir_save, harvester = fetcher.main(
                emails,
                url_api_collection,
                log_handler=log_handler,
                mail_handler=mail_handler,
                **kwargs)
            # The publication gate is evaluated after the fetch, as before.
            if 'prod' in os.environ['DATA_BRANCH'].lower():
                if not collection.ready_for_publication:
                    raise Exception(''.join(
                        ('Collection {} is not ready for publication.',
                         ' Run on stage and QA first, then set',
                         ' ready_for_publication')).format(collection.id))
            logger.info("INGEST DOC ID:{0}".format(ingest_doc_id))
            logger.info('HARVESTED {0} RECORDS'.format(num_recs))
            logger.info('IN DIR:{0}'.format(dir_save))
            resp = enrich_records.main([None, ingest_doc_id])
            if resp != 0:
                logger.error("Error enriching records {0}".format(resp))
                raise Exception(
                    'Failed during enrichment process: {0}'.format(resp))
            logger.info('Enriched records')

            # save_records returns the number of saved records on success;
            # `not resp >= 0` is kept so non-numeric results also fail.
            resp = save_records.main([None, ingest_doc_id])
            if not resp >= 0:
                logger.error("Error saving records {0}".format(str(resp)))
                raise Exception("Error saving records {0}".format(str(resp)))
            num_saved = resp
            logger.info("SAVED RECS : {}".format(num_saved))

            resp = remove_deleted_records.main([None, ingest_doc_id])
            if resp != 0:
                logger.error("Error deleting records {0}".format(resp))
                raise Exception("Error deleting records {0}".format(resp))

            resp = check_ingestion_counts.main([None, ingest_doc_id])
            if resp != 0:
                logger.error("Error checking counts {0}".format(resp))
                raise Exception("Error checking counts {0}".format(resp))

            resp = dashboard_cleanup.main([None, ingest_doc_id])
            if resp != 0:
                logger.error("Error cleaning up dashboard {0}".format(resp))
                raise Exception(
                    "Error cleaning up dashboard {0}".format(resp))
            subject = format_results_subject(collection.id,
                                             'Harvest to CouchDB {env} ')
            publish_to_harvesting(
                subject, 'Finished metadata harvest for CID: {}\n'
                'Fetched: {}\nSaved: {}'.format(collection.id, num_recs,
                                                num_saved))
        finally:
            # Pop in reverse push order so the handler stack stays balanced
            # even when a stage raises.
            log_handler.pop_application()
    finally:
        mail_handler.pop_application()