Exemplo n.º 1
0
    def testResourceIteratoreMultiPage(self):
        '''Test iterating a resource listing that spans two pages.

        Both pages of the repository API are mocked. After pulling 38
        items the iterator must point at the second page, have no further
        page to follow, and raise StopIteration on the next call.
        '''
        def read_fixture(name):
            # Read inside a context manager so the file handle is closed.
            with open(DIR_FIXTURES + name) as fixture:
                return fixture.read()

        # Registration order is kept: paginated URL first, then the bare
        # listing URL.
        httpretty.register_uri(
            httpretty.GET,
            'https://registry.cdlib.org/api/v1/repository/?limit=20&offset=20',
            body=read_fixture('/registry_api_repository-page-2.json'))
        httpretty.register_uri(
            httpretty.GET,
            'https://registry.cdlib.org/api/v1/repository/',
            body=read_fixture('/registry_api_repository.json'))

        riter = self.registry.resource_iter('repository')
        self.assertEqual(riter.url,
                         'https://registry.cdlib.org/api/v1/repository/')
        self.assertEqual(riter.path_next,
                         '/api/v1/repository/?limit=20&offset=20')
        # Consume 38 items; only the last one is inspected below.
        for _ in range(38):
            r = riter.next()
        self.assertFalse(isinstance(r, Collection))
        self.assertEqual(r['resource_uri'], '/api/v1/repository/42/')
        self.assertEqual(
            riter.url,
            'https://registry.cdlib.org/api/v1/repository/?limit=20&offset=20')
        self.assertEqual(riter.path_next, None)
        self.assertRaises(StopIteration, riter.next)
Exemplo n.º 2
0
 def testCollectionSlice(self):
     '''Test that results are correct for a known couchdb result.

     The CouchDB view URL is mocked with a canned response;
     queue_collection must then yield three results, and the args,
     kwargs and func_name of the first one are checked.
     '''
     view_parts = COUCHDB_VIEW.split('/')
     url_to_pretty = os.path.join(self.url_couch_base, self.cdb, '_design',
                                  view_parts[0], '_view', view_parts[1])
     # Read the fixture inside a context manager so the handle is closed.
     with open(DIR_FIXTURES +
               '/couchdb_by_provider_name-5112.json') as fixture:
         view_body = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         re.compile(url_to_pretty + ".*$"),
         body=view_body,
         etag="2U5BW2TDDX9EHZJOO0DNE29D1",
         content_type='application/json',
     )
     #transfer_encoding='chunked', #NOTE: doesn't work with httpretty
     results = self._cdbrunner.queue_collection('5112',
                                                6000,
                                                self.function,
                                                'arg1',
                                                'arg2',
                                                kwarg1='1',
                                                kwarg2=2)
     self.assertEqual(len(results), 3)
     first = results[0]
     self.assertEqual(first.args,
                      ('5112--http://ark.cdlib.org/ark:/13030/kt7580382j',
                       'arg1', 'arg2'))
     self.assertEqual(first.kwargs, {'kwarg1': '1', 'kwarg2': 2})
     self.assertEqual(first.func_name,
                      'test.test_couchdb_runner.func_for_test')
Exemplo n.º 3
0
    def testInit(self):
        '''Fetch a mocked XML feed and check batch sizes and metadata.

        The first batch from the fetcher must contain 999 documents and the
        complete run 2320; selected metadata values are then verified.
        '''
        url = 'https://s3.amazonaws.com/pastperfectonline/xmlfiles/museum_231'
        # Context manager closes the fixture file once its body is read.
        with open(DIR_FIXTURES + '/xml-fetch.xml') as fixture:
            httpretty.register_uri(httpretty.GET, url, body=fixture.read())
        h = fetcher.XML_Fetcher(url, None)
        self.assertEqual(h.url_base, url)
        docs = []
        first_batch = h.next()
        self.assertEqual(len(first_batch), 999)
        docs.extend(first_batch)
        # Drain the remaining batches.
        for batch in h:
            docs.extend(batch)
        self.assertEqual(len(docs), 2320)
        test1 = docs[0]
        test2 = docs[2]
        self.assertIn('title', test1['metadata'])
        self.assertEqual(test1['metadata']['title'], [
            'California desperadoes : stories of early California outlaws in their own word'
        ])

        # test that attributes are captured, even from empty elements
        self.assertEqual(test1['metadata']['q'], ['taken'])
        self.assertEqual(test1['metadata']['d'], ['Kodak'])
        self.assertEqual(test2['metadata']['q'], ['scanned'])
        self.assertEqual(test2['metadata']['d'], ['Epson'])
Exemplo n.º 4
0
 def testCreateProfile(self):
     '''Test the creation of a DPLA style profile file.

     A Collection is built from a mocked registry response; both its
     serialized profile (dpla_profile) and its dict form
     (dpla_profile_obj) are inspected.
     '''
     # Context manager closes the fixture file once its body is read.
     with open(DIR_FIXTURES + '/collection_api_test_oac.json') as fixture:
         httpretty.register_uri(
             httpretty.GET,
             'https://registry.cdlib.org/api/v1/collection/178',
             body=fixture.read())
     c = Collection('https://registry.cdlib.org/api/v1/collection/178')

     # Serialized profile.
     self.assertTrue(hasattr(c, 'dpla_profile'))
     self.assertIsInstance(c.dpla_profile, str)
     j = json.loads(c.dpla_profile)
     self.assertEqual(j['name'], '178')
     self.assertEqual(j['enrichments_coll'], ['/compare_with_schema'])
     self.assertIn('enrichments_item', j)
     self.assertIsInstance(j['enrichments_item'], list)
     self.assertEqual(len(j['enrichments_item']), 30)
     self.assertIn('contributor', j)
     self.assertIsInstance(j['contributor'], list)
     self.assertEqual(len(j['contributor']), 4)
     self.assertEqual(j['contributor'][1], {
         u'@id': u'/api/v1/campus/1/',
         u'name': u'UCB'
     })

     # Dict form of the profile.
     self.assertTrue(hasattr(c, 'dpla_profile_obj'))
     self.assertIsInstance(c.dpla_profile_obj, dict)
     self.assertIsInstance(c.dpla_profile_obj['enrichments_item'], list)
     e = c.dpla_profile_obj['enrichments_item']
     self.assertEqual(e[0], '/oai-to-dpla')
     self.assertEqual(
         e[1],
         '/shred?prop=sourceResource/contributor%2CsourceResource/creator%2CsourceResource/date'
     )
Exemplo n.º 5
0
 def testSolrHarvest(self, mock_boto3):
     '''Test the function of the Solr harvest with <date> objects
     in stream.

     Five consecutive responses (fixture pages 0-4) are queued for the
     mocked Solr select endpoint; the harvest must emit exactly two log
     records.
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     # One queued response per fixture page, in page order.
     pages = [
         httpretty.Response(body=read_fixture(
             '/ucsd-new-feed-missions-bb3038949s-{}.xml'.format(page)))
         for page in range(5)
     ]
     httpretty.register_uri(
         httpretty.POST,
         'http://example.edu/solr/blacklight/select',
         responses=pages)
     self.assertTrue(hasattr(self.controller, 'harvest'))
     self.controller.harvest()
     # Parenthesized single-argument form prints the same text as the
     # old print statement.
     print("LOGS:{}".format(self.test_log_handler.formatted_records))
     self.assertEqual(len(self.test_log_handler.records), 2)
     self.assertIn('UC San Diego',
                   self.test_log_handler.formatted_records[0])
     self.assertEqual(self.test_log_handler.formatted_records[1],
                      '[INFO] HarvestController: 13 records harvested')
Exemplo n.º 6
0
 def testInit(self):
     '''Check the state of a freshly constructed Flickr_Fetcher.

     The first page of photos is mocked; the fetcher's attributes and
     URL templates are then verified.
     '''
     url = 'https://example.edu'
     user_id = 'testuser'
     page_size = 10
     url_first = fetcher.Flickr_Fetcher.url_get_photos_template.format(
         api_key='boguskey', user_id=user_id, per_page=page_size, page=1)
     # Context manager closes the fixture file once its body is read.
     with open(DIR_FIXTURES + '/flickr-public-photos-1.xml') as fixture:
         httpretty.register_uri(httpretty.GET,
                                url_first,
                                body=fixture.read())
     h = fetcher.Flickr_Fetcher(url, user_id, page_size=page_size)
     self.assertEqual(h.url_base, url)
     self.assertEqual(h.user_id, user_id)
     self.assertEqual(h.page_size, 10)
     self.assertEqual(h.page_current, 1)
     self.assertEqual(h.doc_current, 0)
     self.assertEqual(h.docs_fetched, 0)
     self.assertEqual(h.url_get_photos_template,
                      'https://api.flickr.com/services/rest/'
                      '?api_key={api_key}&user_id={user_id}&per_page'
                      '={per_page}&method='
                      'flickr.people.getPublicPhotos&page={page}')
     self.assertEqual(h.api_key, 'boguskey')
     self.assertEqual(h.url_current, url_first)
     self.assertEqual(h.docs_total, 10)
     self.assertEqual(h.url_get_photo_info_template,
                      'https://api.flickr.com/services/rest/'
                      '?api_key={api_key}&method='
                      'flickr.photos.getInfo&photo_id={photo_id}')
Exemplo n.º 7
0
 def testLoggingMoreThan1000(self):
     '''Check the progress log messages of a 2400-record harvest.

     Thirteen log records are expected in total; the records at
     indexes 1, 10, 11 and 12 must report 100, 1000, 2000 and 2400
     harvested records respectively.
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/198/",
         body=read_fixture('/collection_api_big_test.json'))
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=read_fixture('/testOAI-2400-records.xml'))
     collection = Collection(
         'https://registry.cdlib.org/api/v1/collection/198/')
     controller = fetcher.HarvestController(
         '*****@*****.**',
         collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     controller.harvest()
     # Remove the harvest output before asserting, so a failing
     # assertion no longer leaves the directory behind.
     shutil.rmtree(controller.dir_save)
     formatted = self.test_log_handler.formatted_records
     self.assertEqual(len(self.test_log_handler.records), 13)
     self.assertEqual(formatted[1],
                      '[INFO] HarvestController: 100 records harvested')
     self.assertEqual(formatted[10],
                      '[INFO] HarvestController: 1000 records harvested')
     self.assertEqual(formatted[11],
                      '[INFO] HarvestController: 2000 records harvested')
     self.assertEqual(formatted[12],
                      '[INFO] HarvestController: 2400 records harvested')
Exemplo n.º 8
0
 def testFetch(self):
     '''Fetch three mocked eMuseum pages and check the parsed documents.

     The three fixture pages are queued as successive responses for the
     same URL; 24 documents are expected overall, and the document at
     index 12 is inspected in detail.
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     httpretty.register_uri(
         httpretty.GET,
         'http://digitalcollections.hoover.org/search/*/objects/xml?filter=approved:true&page=1',
         responses=[
             httpretty.Response(body=read_fixture(
                 '/eMuseum-page-{}.xml'.format(page)))
             for page in (1, 2, 3)
         ])
     url = 'http://digitalcollections.hoover.org'
     h = fetcher.eMuseum_Fetcher(url, None)
     self.assertEqual(h.url_base, url)
     docs = []
     docs.extend(h.next())
     # Drain the remaining batches.
     for batch in h:
         docs.extend(batch)
     self.assertEqual(len(docs), 24)
     test1 = docs[12]
     self.assertIn('title', test1)
     self.assertEqual(
         test1['title']['text'],
         'Money is power.  A war savings certificate in every Canadian home.  Get yours now at post offices or banks.'
     )
     self.assertIn('unknown2', test1)
     self.assertIn('text2', test1['primaryMaker'])
     self.assertNotIn('attrib', test1['unknown1'])
Exemplo n.º 9
0
    def test_get_isShownBy_pdf(self, mock_deepharvest, mock_boto):
        ''' test getting correct isShownBy value for Nuxeo doc
            with no images and PDF at parent level
        '''
        deepharvest_mocker(mock_deepharvest)

        # Context manager closes the fixture file once its body is read.
        with open(DIR_FIXTURES + '/nuxeo_no_children.json') as fixture:
            no_children = fixture.read()
        httpretty.register_uri(
            httpretty.GET,
            'https://example.edu/api/v1/path/@search?query=SELECT+%2A+FROM+'
            'Document+WHERE+ecm%3AparentId+%3D+'
            '%2700d55837-01b6-4211-80d8-b966a15c257e%27+ORDER+BY+'
            'ecm%3Apos&currentPageIndex=0&pageSize=100',
            responses=[
                httpretty.Response(body=no_children, status=200),
            ])

        h = fetcher.NuxeoFetcher('https://example.edu/api/v1',
                                 'path-to-asset/here')

        # Parse the metadata fixture straight from the (auto-closed) file.
        with open(DIR_FIXTURES + '/nuxeo_doc_pdf_parent.json') as fixture:
            nuxeo_metadata = json.load(fixture)
        isShownBy = h._get_isShownBy(nuxeo_metadata)
        self.assertEqual(
            isShownBy, 'https://s3.amazonaws.com/static.ucldc.cdlib.org/'
            'ucldc-nuxeo-thumb-media/00d55837-01b6-4211-80d8-b966a15c257e')
Exemplo n.º 10
0
 def testMainHarvestController__init__Error(self, mock_method):
     '''Test the try-except block in main when HarvestController not created
     correctly.

     fetcher.main must raise; four log records are expected, the last of
     which carries the init error message and the text "Boom!".
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=read_fixture('/collection_api_test.json'))
     httpretty.register_uri(httpretty.GET,
                            re.compile("http://content.cdlib.org/oai?.*"),
                            body=read_fixture('/testOAI-128-records.xml'))
     # NOTE(review): sys.argv is overwritten here and not restored within
     # this method -- confirm it is reset elsewhere.
     sys.argv = [
         'thisexe', '*****@*****.**',
         'https://registry.cdlib.org/api/v1/collection/197/'
     ]
     self.assertRaises(Exception,
                       fetcher.main,
                       self.user_email,
                       self.url_api_collection,
                       log_handler=self.test_log_handler,
                       mail_handler=self.test_log_handler,
                       dir_profile=self.dir_test_profile)
     self.assertEqual(len(self.test_log_handler.records), 4)
     last_record = self.test_log_handler.formatted_records[3]
     self.assertTrue("[ERROR] HarvestMain: Exception in harvester init" in
                     last_record)
     self.assertTrue("Boom!" in last_record)
     # Delete the profile file named after the collection id.
     c = Collection('https://registry.cdlib.org/api/v1/collection/197/')
     os.remove(
         os.path.abspath(os.path.join(self.dir_test_profile,
                                      c.id + '.pjs')))
Exemplo n.º 11
0
 def test_ignore_content_type(self, mock_stash, mock_couch):
     '''stash_image must return the expected report when the harvester
     is created with ignore_content_type=True, although the mocked HEAD
     and GET responses advertise text/plain and text/html.
     '''
     image_url = 'http://getthisimage/image'
     couch_doc = {'_id': 'IGNORE_CONTENT', 'isShownBy': image_url}
     # Same empty response for both verbs; only the content type differs.
     mocked_endpoints = (
         (httpretty.HEAD, 'text/plain; charset=utf-8'),
         (httpretty.GET, 'text/html; charset=utf-8'),
     )
     for verb, mime in mocked_endpoints:
         httpretty.register_uri(
             verb,
             image_url,
             body='',
             content_length='0',
             content_type=mime,
             connection='close',
         )
     harvester = image_harvest.ImageHarvester(
         url_cache={},
         hash_cache={},
         bucket_bases=['region:x'],
         ignore_content_type=True)
     expected = StashReport('test url', 'md5 test value', 's3 url object',
                            'mime_type', 'dimensions')
     self.assertEqual(harvester.stash_image(couch_doc), [expected])
Exemplo n.º 12
0
 def testFetch(self):
     '''Test the httpretty mocked fetching of documents.

     Four responses are queued for the search URL (page 1 twice, then
     pages 2 and 3); seven documents are expected in total.
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     url = 'https://example.edu/action/search/xml?q=ddu%3A20*&' \
           'asf=ddu&asd=&fd=1&_hd=&hd=on&sf=&_rs=&_ef=&ef=on&sd=&ed=&c=ga'
     httpretty.register_uri(
         httpretty.GET,
         url,
         responses=[
             httpretty.Response(
                 read_fixture('/ucsf-page-{}.xml'.format(page)),
                 status=200)
             # Page 1 is intentionally served twice, as before.
             for page in (1, 1, 2, 3)
         ]
     )
     h = fetcher.UCSF_XML_Fetcher(url, None, page_size=3)
     docs = []
     for batch in h:
         docs.extend(batch)
     self.assertEqual(len(docs), 7)
     testy = docs[0]
     self.assertIn('tid', testy)
     self.assertEqual(testy['tid'], "nga13j00")
     self.assertEqual(testy['uri'],
                      'http://legacy.library.ucsf.edu/tid/nga13j00')
     self.assertIn('aup', testy['metadata'])
     self.assertEqual(testy['metadata']['aup'], ['Whent, Peter'])
Exemplo n.º 13
0
 def test_single_fetching(self):
     '''Fetch via YouTube_Fetcher with a single mocked video response.

     Exactly one video must be returned and it must equal the expected
     dictionary.
     '''
     url = 'http://single.edu'
     playlist_id = 'PLwtrWl_IBMJtjP5zMk6dVR-BRjzKqCPOM'
     url_vids = fetcher.YouTube_Fetcher.url_video
     # Context manager closes the fixture file once its body is read.
     with open(DIR_FIXTURES + '/youtube_single_video.json') as fixture:
         video_body = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         url_vids,
         responses=[httpretty.Response(body=video_body, status=200)])
     h = fetcher.YouTube_Fetcher(url, playlist_id)
     vids = []
     for batch in h:
         vids.extend(batch)
     self.assertEqual(len(vids), 1)
     self.assertEqual(vids[0], {
         u'contentDetails': {
             u'definition': u'sd',
             u'projection': u'rectangular',
             u'caption': u'false',
             u'duration': u'PT19M35S',
             u'licensedContent': True,
             u'dimension': u'2d'
         },
         u'kind': u'youtube#video',
         u'etag':
         u'"m2yskBQFythfE4irbTIeOgYYfBU/-3AtVAYcRLEynWZprpf0OGaY8zo"',
         u'id': u'0Yx8zrbsUu8'
     })
Exemplo n.º 14
0
 def testMainCreatesCollectionProfile(self, mock_boto3):
     '''Test that the main function produces a collection profile
     file for DPLA. The path to this file is needed when creating a
     DPLA ingestion document.
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=read_fixture('/collection_api_test.json'))
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=read_fixture('/testOAI-128-records.xml'))
     # The instance is created but its value is not used afterwards.
     Collection("https://registry.cdlib.org/api/v1/collection/197/")
     with patch('dplaingestion.couch.Couch') as mock_couch:
         instance = mock_couch.return_value
         instance._create_ingestion_document.return_value = 'test-id'
         ingest_doc_id, num, self.dir_save, self.fetcher = fetcher.main(
             self.user_email,
             self.url_api_collection,
             log_handler=self.test_log_handler,
             mail_handler=self.test_log_handler,
             dir_profile=self.dir_test_profile,
             profile_path=self.profile_path,
             config_file=self.config_file)
     self.assertEqual(ingest_doc_id, 'test-id')
     self.assertEqual(num, 128)
     # A single-argument os.path.join is a no-op, so test the path itself.
     self.assertTrue(os.path.exists(self.profile_path))
Exemplo n.º 15
0
 def testRunIngestProductionNotReady(self, mock_couch, mock_dash_clean,
                                     mock_check, mock_remove, mock_save,
                                     mock_enrich, mock_couchdb, mock_redis,
                                     mock_boto3):
     '''run_ingest.main must raise when DATA_BRANCH is 'production'.

     With the registry and OAC endpoints mocked, the call is expected to
     raise and to leave nine records in the test log handler.
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     mock_couch.return_value._create_ingestion_document.return_value = \
         'test-id'
     # this next is because the redis client unpickles....
     mock_redis.return_value.hget.return_value = pickle.dumps('RQ-result!')
     mail_handler = MagicMock()
     url_api_collection = 'https://registry.cdlib.org/api/v1/' \
         'collection/178/'
     httpretty.httpretty.enable()
     httpretty.register_uri(
         httpretty.GET,
         url_api_collection,
         body=read_fixture('/collection_api_test_oac.json'))
     httpretty.register_uri(
         httpretty.GET,
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/tf2v19n928',
         body=read_fixture('/testOAC-url_next-1.json'))
     # NOTE(review): the environment variable is not restored within this
     # method -- confirm it is reset elsewhere.
     os.environ['DATA_BRANCH'] = 'production'
     self.assertRaises(
         Exception,
         run_ingest.main,
         '*****@*****.**',
         url_api_collection,
         log_handler=self.test_log_handler,
         mail_handler=mail_handler)
     # Parenthesized single-argument form prints the same text as the
     # old print statement.
     print(self.test_log_handler.records)
     self.assertEqual(len(self.test_log_handler.records), 9)
Exemplo n.º 16
0
 def testMainFnWithException(self, mock_method):
     '''fetcher.main must raise and log the harvesting error.

     Seven log records are expected; the last one must contain the
     harvesting error prefix and the text "Boom!".
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=read_fixture('/collection_api_test.json'))
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=read_fixture('/testOAI-128-records.xml'))
     with patch('dplaingestion.couch.Couch') as mock_couch:
         instance = mock_couch.return_value
         instance._create_ingestion_document.return_value = 'test-id'
         self.assertRaises(
             Exception,
             fetcher.main,
             self.user_email,
             self.url_api_collection,
             log_handler=self.test_log_handler,
             mail_handler=self.test_log_handler,
             profile_path=self.profile_path,
             config_file=self.config_file)
     self.assertEqual(len(self.test_log_handler.records), 7)
     last_record = self.test_log_handler.formatted_records[6]
     self.assertTrue("[ERROR] HarvestMain: Error while harvesting:" in
                     last_record)
     self.assertTrue("Boom!" in last_record)
Exemplo n.º 17
0
    def test_get_isShownBy_pdf(self, mock_deepharvest, mock_boto):
        ''' test getting correct isShownBy value for Nuxeo doc
            with no images and PDF at parent level
        '''
        deepharvest_mocker(mock_deepharvest)

        # Context manager closes the fixture file once its body is read.
        with open(DIR_FIXTURES + '/nuxeo_no_children.json') as fixture:
            children_body = fixture.read()
        httpretty.register_uri(
            httpretty.GET,
            'https://example.edu/api/v1/path/@search?query=SELECT+%2A+FROM+'
            'Document+WHERE+ecm%3AparentId+%3D+'
            '%2700d55837-01b6-4211-80d8-b966a15c257e%27+ORDER+BY+'
            'ecm%3Apos&currentPageIndex=0&pageSize=100',
            responses=[
                httpretty.Response(body=children_body, status=200),
            ])

        h = fetcher.NuxeoFetcher('https://example.edu/api/v1',
                                 'path-to-asset/here')

        # Parse the metadata fixture straight from the (auto-closed) file.
        with open(DIR_FIXTURES + '/nuxeo_doc_pdf_parent.json') as fixture:
            nuxeo_metadata = json.load(fixture)
        isShownBy = h._get_isShownBy(nuxeo_metadata)
        self.assertEqual(
            isShownBy, 'https://s3.amazonaws.com/static.ucldc.cdlib.org/'
            'ucldc-nuxeo-thumb-media/00d55837-01b6-4211-80d8-b966a15c257e')
Exemplo n.º 18
0
 def testRunIngestProductionNotReady(self, mock_couch, mock_dash_clean,
                                     mock_check, mock_remove, mock_save,
                                     mock_enrich, mock_couchdb, mock_redis,
                                     mock_boto3):
     '''run_ingest.main must raise when DATA_BRANCH is 'production'.

     The registry and OAC endpoints are mocked; the call is expected to
     raise and nine records must end up in the test log handler.
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     mock_couch.return_value._create_ingestion_document.return_value = \
         'test-id'
     # this next is because the redis client unpickles....
     mock_redis.return_value.hget.return_value = pickle.dumps('RQ-result!')
     mail_handler = MagicMock()
     url_api_collection = 'https://registry.cdlib.org/api/v1/' \
         'collection/178/'
     httpretty.httpretty.enable()
     httpretty.register_uri(
         httpretty.GET,
         url_api_collection,
         body=read_fixture('/collection_api_test_oac.json'))
     httpretty.register_uri(
         httpretty.GET,
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/tf2v19n928',
         body=read_fixture('/testOAC-url_next-1.json'))
     # NOTE(review): the environment variable is not restored within this
     # method -- confirm it is reset elsewhere.
     os.environ['DATA_BRANCH'] = 'production'
     self.assertRaises(Exception,
                       run_ingest.main,
                       '*****@*****.**',
                       url_api_collection,
                       log_handler=self.test_log_handler,
                       mail_handler=mail_handler)
     # Parenthesized single-argument form prints the same text as the
     # old print statement.
     print(self.test_log_handler.records)
     self.assertEqual(len(self.test_log_handler.records), 9)
Exemplo n.º 19
0
    def setUp(self):
        '''Build the fixtures shared by the HarvestController tests.

        Mocks the registry and OAI endpoints, creates the collection and
        an OAI controller, loads a sample object-set document and swaps
        datetime.datetime for a subclass with a fixed now().
        '''
        super(HarvestControllerTestCase, self).setUp()

        def read_fixture(name):
            # Read inside a context manager so the file handle is closed.
            with open(DIR_FIXTURES + name) as fixture:
                return fixture.read()

        httpretty.register_uri(
            httpretty.GET,
            "https://registry.cdlib.org/api/v1/collection/197/",
            body=read_fixture('/collection_api_test.json'))
        httpretty.register_uri(
            httpretty.GET,
            re.compile("http://content.cdlib.org/oai?.*"),
            body=read_fixture('/testOAI-128-records.xml'))
        self.collection = Collection(
            'https://registry.cdlib.org/api/v1/collection/197/')
        config_file, profile_path = self.setUp_config(self.collection)
        self.controller_oai = fetcher.HarvestController(
            '*****@*****.**',
            self.collection,
            profile_path=profile_path,
            config_file=config_file)
        with open(DIR_FIXTURES + '/objset_test_doc.json') as fixture:
            self.objset_test_doc = json.load(fixture)

        class myNow(datetime.datetime):
            # datetime subclass whose now() always returns 2017-07-14 12:01.
            @classmethod
            def now(cls):
                return cls(2017, 7, 14, 12, 1)

        # Keep the real class in self.old_dt; the restore itself is not
        # part of this method.
        self.old_dt = datetime.datetime
        datetime.datetime = myNow
Exemplo n.º 20
0
 def testMainCreatesCollectionProfile(self, mock_boto3):
     '''Test that the main function produces a collection profile
     file for DPLA. The path to this file is needed when creating a
     DPLA ingestion document.
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=read_fixture('/collection_api_test.json'))
     httpretty.register_uri(httpretty.GET,
                            re.compile("http://content.cdlib.org/oai?.*"),
                            body=read_fixture('/testOAI-128-records.xml'))
     # The instance is created but its value is not used afterwards.
     Collection("https://registry.cdlib.org/api/v1/collection/197/")
     with patch('dplaingestion.couch.Couch') as mock_couch:
         instance = mock_couch.return_value
         instance._create_ingestion_document.return_value = 'test-id'
         ingest_doc_id, num, self.dir_save, self.fetcher = fetcher.main(
             self.user_email,
             self.url_api_collection,
             log_handler=self.test_log_handler,
             mail_handler=self.test_log_handler,
             dir_profile=self.dir_test_profile,
             profile_path=self.profile_path,
             config_file=self.config_file)
     self.assertEqual(ingest_doc_id, 'test-id')
     self.assertEqual(num, 128)
     # A single-argument os.path.join is a no-op, so test the path itself.
     self.assertTrue(os.path.exists(self.profile_path))
Exemplo n.º 21
0
 def testLoggingMoreThan1000(self):
     '''Check the progress log messages of a 2400-record harvest.

     Thirteen log records are expected; those at indexes 1, 10, 11 and
     12 must report 100, 1000, 2000 and 2400 harvested records.
     '''
     def read_fixture(name):
         # Read inside a context manager so the file handle is closed.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/198/",
         body=read_fixture('/collection_api_big_test.json'))
     httpretty.register_uri(httpretty.GET,
                            re.compile("http://content.cdlib.org/oai?.*"),
                            body=read_fixture('/testOAI-2400-records.xml'))
     collection = Collection(
         'https://registry.cdlib.org/api/v1/collection/198/')
     controller = fetcher.HarvestController('*****@*****.**',
                                            collection,
                                            config_file=self.config_file,
                                            profile_path=self.profile_path)
     controller.harvest()
     # Remove the harvest output before asserting, so a failing
     # assertion no longer leaves the directory behind.
     shutil.rmtree(controller.dir_save)
     formatted = self.test_log_handler.formatted_records
     self.assertEqual(len(self.test_log_handler.records), 13)
     self.assertEqual(formatted[1],
                      '[INFO] HarvestController: 100 records harvested')
     self.assertEqual(formatted[10],
                      '[INFO] HarvestController: 1000 records harvested')
     self.assertEqual(formatted[11],
                      '[INFO] HarvestController: 2000 records harvested')
     self.assertEqual(formatted[12],
                      '[INFO] HarvestController: 2400 records harvested')
Exemplo n.º 22
0
 def testInit(self):
     '''Check the state of a freshly constructed YouTube_Fetcher.

     The first playlist-items URL is mocked; the fetcher's attributes
     and URL templates are then verified.
     '''
     url = 'https://example.edu'
     playlist_id = 'testplaylist'
     page_size = 3
     url_first = fetcher.YouTube_Fetcher.url_playlistitems.format(
         api_key='boguskey',
         page_size=page_size,
         playlist_id=playlist_id,
         page_token='')
     # NOTE(review): the mocked body is a Flickr XML fixture although the
     # URL is a YouTube endpoint -- confirm this is intended.
     # Context manager closes the fixture file once its body is read.
     with open(DIR_FIXTURES + '/flickr-public-photos-1.xml') as fixture:
         httpretty.register_uri(httpretty.GET,
                                url_first,
                                body=fixture.read())
     h = fetcher.YouTube_Fetcher(url, playlist_id, page_size=page_size)
     self.assertEqual(h.url_base, url)
     self.assertEqual(h.playlist_id, playlist_id)
     self.assertEqual(h.api_key, 'boguskey')
     self.assertEqual(h.page_size, page_size)
     self.assertEqual(h.playlistitems, {'nextPageToken': ''})
     self.assertEqual(
         h.url_playlistitems,
         'https://www.googleapis.com/youtube/v3/playlistItems'
         '?key={api_key}&maxResults={page_size}&part=contentDetails&'
         'playlistId={playlist_id}&pageToken={page_token}')
     self.assertEqual(
         h.url_video,
         'https://www.googleapis.com/youtube/v3/videos?'
         'key={api_key}&part=snippet&id={video_ids}'
         )
Exemplo n.º 23
0
 def testInit(self):
     '''Check the state of a freshly constructed Flickr_Fetcher.

     The first page of the user's photos is mocked; the fetcher's
     attributes and URL templates are then verified.
     '''
     url = 'https://example.edu'
     user_id = 'test@Nuser'
     page_size = 10
     url_first = fetcher.Flickr_Fetcher.url_get_user_photos_template.format(
         api_key='boguskey', user_id=user_id, per_page=page_size, page=1)
     # Context manager closes the fixture file once its body is read.
     with open(DIR_FIXTURES + '/flickr-public-photos-1.xml') as fixture:
         httpretty.register_uri(httpretty.GET,
                                url_first,
                                body=fixture.read())
     h = fetcher.Flickr_Fetcher(url, user_id, page_size=page_size)
     self.assertEqual(h.url_base, url)
     self.assertEqual(h.user_id, user_id)
     self.assertEqual(h.page_size, 10)
     self.assertEqual(h.page_current, 1)
     self.assertEqual(h.doc_current, 0)
     self.assertEqual(h.docs_fetched, 0)
     self.assertEqual(h.url_get_user_photos_template,
                      'https://api.flickr.com/services/rest/'
                      '?api_key={api_key}&user_id={user_id}&per_page'
                      '={per_page}&method='
                      'flickr.people.getPublicPhotos&page={page}')
     self.assertEqual(h.api_key, 'boguskey')
     self.assertEqual(h.url_current, url_first)
     self.assertEqual(h.docs_total, 10)
     self.assertEqual(h.url_get_photo_info_template,
                      'https://api.flickr.com/services/rest/'
                      '?api_key={api_key}&method='
                      'flickr.photos.getInfo&photo_id={photo_id}')
Exemplo n.º 24
0
    def testInit(self):
        '''Fetch a mocked XML feed and check batch sizes and metadata.

        The first batch must contain 999 documents and the complete run
        2320; selected metadata values are then verified.
        '''
        url = 'https://s3.amazonaws.com/pastperfectonline/xmlfiles/museum_231'
        # Context manager closes the fixture file once its body is read.
        with open(DIR_FIXTURES + '/xml-fetch.xml') as fixture:
            httpretty.register_uri(httpretty.GET,
                                   url,
                                   body=fixture.read())
        h = fetcher.XML_Fetcher(url, None)
        self.assertEqual(h.url_base, url)
        docs = []
        first_batch = h.next()
        self.assertEqual(len(first_batch), 999)
        docs.extend(first_batch)
        # Drain the remaining batches.
        for batch in h:
            docs.extend(batch)
        self.assertEqual(len(docs), 2320)
        test1 = docs[0]
        test2 = docs[2]
        self.assertIn('title', test1['metadata'])
        self.assertEqual(test1['metadata']['title'], [
            'California desperadoes : stories of early California outlaws in their own word'
        ])

        # test that attributes are captured, even from empty elements
        self.assertEqual(test1['metadata']['q'], ['taken'])
        self.assertEqual(test1['metadata']['d'], ['Kodak'])
        self.assertEqual(test2['metadata']['q'], ['scanned'])
        self.assertEqual(test2['metadata']['d'], ['Epson'])
Exemplo n.º 25
0
    def test_get_isShownBy_video(self, mock_deepharvest, mock_boto):
        ''' test getting correct isShownBy value for Nuxeo video object
        '''
        deepharvest_mocker(mock_deepharvest)

        # Context manager closes the fixture file once its body is read.
        with open(DIR_FIXTURES + '/nuxeo_no_children.json') as fixture:
            children_body = fixture.read()
        httpretty.register_uri(
            httpretty.GET,
            'https://example.edu/api/v1/path/@search?query=SELECT+%2A+FROM+'
            'Document+WHERE+ecm%3AparentId+%3D+'
            '%274c80e254-6def-4230-9f28-bc48878568d4%27+'
            'AND+ecm%3AcurrentLifeCycleState+%21%3D+%27deleted%27+ORDER+BY+'
            'ecm%3Apos&currentPageIndex=0&pageSize=100',
            responses=[
                httpretty.Response(body=children_body, status=200),
            ])

        h = fetcher.NuxeoFetcher('https://example.edu/api/v1',
                                 'path-to-asset/here')

        # Parse the metadata fixture straight from the (auto-closed) file.
        with open(DIR_FIXTURES + '/nuxeo_doc_video.json') as fixture:
            nuxeo_metadata = json.load(fixture)
        isShownBy = h._get_isShownBy(nuxeo_metadata)

        self.assertEqual(
            isShownBy, 'https://s3.amazonaws.com/static.ucldc.cdlib.org/'
            'ucldc-nuxeo-thumb-media/4c80e254-6def-4230-9f28-bc48878568d4')
Exemplo n.º 26
0
 def testFetch(self):
     '''Test the httpretty mocked fetching of documents.

     Four responses are queued for the search URL and the fetcher is
     iterated to exhaustion; the test checks the number of documents and
     the contents of the first one.
     '''
     def read_fixture(name):
         # Close every fixture file as soon as it has been read.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     url = 'https://example.edu/action/search/xml?q=ddu%3A20*&' \
           'asf=ddu&asd=&fd=1&_hd=&hd=on&sf=&_rs=&_ef=&ef=on&sd=&ed=&c=ga'
     # Page 1 is deliberately queued twice, matching the original
     # response sequence.
     # NOTE(review): presumably the fetcher requests the first page once
     # during construction - confirm.
     pages = ('/ucsf-page-1.xml', '/ucsf-page-1.xml',
              '/ucsf-page-2.xml', '/ucsf-page-3.xml')
     httpretty.register_uri(
         httpretty.GET,
         url,
         responses=[
             httpretty.Response(read_fixture(page), status=200)
             for page in pages
         ])
     h = fetcher.UCSF_XML_Fetcher(url, None, page_size=3)
     docs = []
     for batch in h:
         docs.extend(batch)
     self.assertEqual(len(docs), 7)
     testy = docs[0]
     self.assertIn('tid', testy)
     self.assertEqual(testy['tid'], "nga13j00")
     self.assertEqual(testy['uri'],
                      'http://legacy.library.ucsf.edu/tid/nga13j00')
     self.assertIn('aup', testy['metadata'])
     self.assertEqual(testy['metadata']['aup'], ['Whent, Peter'])
Exemplo n.º 27
0
 def testAddRegistryData(self):
     '''Unittest the _add_registry_data function.

     Builds a HarvestController for collection 197 against mocked
     registry and OAI endpoints, then checks that _add_registry_data
     adds a 'collection' entry (carrying 'campus' and 'repository') to a
     plain dict without adding those two keys at the top level.
     '''
     # Read fixtures inside ``with`` blocks so the files get closed.
     with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
         collection_body = fixture.read()
     with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
         oai_body = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=collection_body)
     # NOTE(review): the '?' in this pattern is an unescaped regex
     # quantifier (it makes the preceding 'i' optional), not a literal
     # question mark.
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=oai_body)
     collection = Collection(
         'https://registry.cdlib.org/api/v1/collection/197/')
     self.tearDown_config()  # remove ones setup in setUp
     self.setUp_config(collection)
     controller = fetcher.HarvestController(
         '*****@*****.**',
         collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     obj = {'id': 'fakey', 'otherdata': 'test'}
     self.assertNotIn('collection', obj)
     controller._add_registry_data(obj)
     self.assertIn('collection', obj)
     registry_entry = obj['collection'][0]
     self.assertEqual(registry_entry['@id'],
                      'https://registry.cdlib.org/api/v1/collection/197/')
     self.assertNotIn('campus', obj)
     self.assertIn('campus', registry_entry)
     self.assertNotIn('repository', obj)
     self.assertIn('repository', registry_entry)
     # need to test one without campus
     self.assertEqual(registry_entry['campus'][0]['@id'],
                      'https://registry.cdlib.org/api/v1/campus/12/')
     self.assertEqual(registry_entry['repository'][0]['@id'],
                      'https://registry.cdlib.org/api/v1/repository/37/')
Exemplo n.º 28
0
 def test_single_fetching(self):
     '''Iterate a YouTube_Fetcher over a mocked single-video response.

     The video URL template of YouTube_Fetcher is mocked with the
     ``youtube_single_video.json`` fixture; exactly one video with the
     expected content is asserted.
     '''
     url = 'http://single.edu'
     playlist_id = 'PLwtrWl_IBMJtjP5zMk6dVR-BRjzKqCPOM'
     url_vids = fetcher.YouTube_Fetcher.url_video
     # Close the fixture file explicitly instead of leaking the handle.
     with open(DIR_FIXTURES + '/youtube_single_video.json') as fixture:
         single_video_body = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         url_vids,
         responses=[
             httpretty.Response(body=single_video_body, status=200)
         ])
     h = fetcher.YouTube_Fetcher(url, playlist_id)
     vids = []
     for batch in h:
         vids.extend(batch)
     self.assertEqual(len(vids), 1)
     self.assertEqual(
         vids[0], {
             u'contentDetails': {
                 u'definition': u'sd',
                 u'projection': u'rectangular',
                 u'caption': u'false',
                 u'duration': u'PT19M35S',
                 u'licensedContent': True,
                 u'dimension': u'2d'
             },
             u'kind': u'youtube#video',
             u'etag':
             u'"m2yskBQFythfE4irbTIeOgYYfBU/-3AtVAYcRLEynWZprpf0OGaY8zo"',
             u'id': u'0Yx8zrbsUu8'
         })
Exemplo n.º 29
0
    def testCollectionSlice(self):
        '''Test that results are correct for a known couchdb result.

        The couchdb view URL is mocked with a fixture and
        run_by_collection is called for collection '5112'; the test
        checks the number of results and that positional and keyword
        arguments are present in each result.
        '''
        view_parts = COUCHDB_VIEW.split('/')
        url_to_pretty = os.path.join(self.url_couch_base, self.cdb, '_design',
                                     view_parts[0], '_view', view_parts[1])
        # Read the fixture in a ``with`` block so the file is closed.
        with open(DIR_FIXTURES +
                  '/couchdb_by_provider_name-5112.json') as fixture:
            view_body = fixture.read()
        httpretty.register_uri(
            httpretty.GET,
            re.compile(url_to_pretty + ".*$"),
            body=view_body,
            etag="2U5BW2TDDX9EHZJOO0DNE29D1",
            content_type='application/json',
        )

        results = self._cdbworker.run_by_collection('5112',
                                                    self.function,
                                                    'arg1',
                                                    'arg2',
                                                    kwarg1='1',
                                                    kwarg2=2)
        self.assertEqual(len(results), 3)
        self.assertEqual(results[1][0],
                         '5112--http://ark.cdlib.org/ark:/13030/kt7779r8zj')
        self.assertEqual(results[1][1][1], ('arg1', 'arg2'))
        self.assertEqual(results[1][1][2], {'kwarg1': '1', 'kwarg2': 2})
        doc = results[0][1][0]
        self.assertEqual(doc['isShownAt'],
                         'http://www.coronado.ca.us/library/')
Exemplo n.º 30
0
 def setUp(self):
     '''Mock the OAI endpoint and create the OAIFetcher under test.'''
     super(OAIFetcherTestCase, self).setUp()
     # Read the fixture in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES + '/testOAI.xml') as fixture:
         oai_body = fixture.read()
     httpretty.register_uri(httpretty.GET,
                            'http://content.cdlib.org/oai',
                            body=oai_body)
     self.fetcher = fetcher.OAIFetcher('http://content.cdlib.org/oai',
                                       'oac:images')
Exemplo n.º 31
0
 def testMainHarvestController__init__Error(self, mock_method):
     '''Test the try-except block in main when HarvestController not created
     correctly.

     mock_method is supplied from outside this method and is not used in
     the body (NOTE(review): presumably a patched HarvestController
     initialiser that raises "Boom!" - confirm).
     '''
     # Read fixtures inside ``with`` blocks so the files get closed.
     with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
         collection_body = fixture.read()
     with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
         oai_body = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=collection_body)
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=oai_body)
     # sys.argv is process-global: remember the original value and put it
     # back afterwards so the override cannot leak into other tests.
     original_argv = sys.argv
     sys.argv = [
         'thisexe', '*****@*****.**',
         'https://registry.cdlib.org/api/v1/collection/197/'
     ]
     try:
         self.assertRaises(
             Exception,
             fetcher.main,
             self.user_email,
             self.url_api_collection,
             log_handler=self.test_log_handler,
             mail_handler=self.test_log_handler,
             dir_profile=self.dir_test_profile)
         self.assertEqual(len(self.test_log_handler.records), 4)
         self.assertTrue("[ERROR] HarvestMain: Exception in harvester init" in
                         self.test_log_handler.formatted_records[3])
         self.assertTrue(
             "Boom!" in self.test_log_handler.formatted_records[3])
         # Remove the profile file written for this collection.
         c = Collection('https://registry.cdlib.org/api/v1/collection/197/')
         os.remove(
             os.path.abspath(
                 os.path.join(self.dir_test_profile, c.id + '.pjs')))
     finally:
         sys.argv = original_argv
Exemplo n.º 32
0
 def testOverrideMetadataPrefix(self):
     '''test that the metadataPrefix for an OAI feed can be overridden.
     The extra_data for OAI can be either just a set spec or a html query
     string of set= &metadataPrefix=
     '''
     # Fixtures are read inside ``with`` blocks so the files get closed.
     with open(DIR_FIXTURES + '/testOAI.xml') as fixture:
         oai_dc_body = fixture.read()
     httpretty.register_uri(httpretty.GET,
                            'http://content.cdlib.org/oai',
                            body=oai_dc_body)
     set_fetcher = fetcher.OAIFetcher('http://content.cdlib.org/oai',
                                      'set=oac:images')
     self.assertEqual(set_fetcher._set, 'oac:images')
     rec = set_fetcher.next()
     self.assertIsInstance(rec, dict)
     self.assertIn('id', rec)
     self.assertEqual(rec['id'], '13030/hb796nb5mn')
     self.assertIn('datestamp', rec)
     # NOTE(review): assertIn performs a substring check of the record's
     # datestamp within '2005-12-13'; assertEqual may have been intended
     # - confirm against the fixture before tightening.
     self.assertIn(rec['datestamp'], '2005-12-13')
     self.assertEqual(
         httpretty.last_request().querystring, {
             u'verb': [u'ListRecords'],
             u'set': [u'oac:images'],
             u'metadataPrefix': [u'oai_dc']
         })
     # Re-register the same URL with the DIDL fixture for the second
     # fetcher.
     with open(DIR_FIXTURES + '/testOAI-didl.xml') as fixture:
         didl_body = fixture.read()
     httpretty.register_uri(httpretty.GET,
                            'http://content.cdlib.org/oai',
                            body=didl_body)
     didl_fetcher = fetcher.OAIFetcher(
         'http://content.cdlib.org/oai',
         'set=oac:images&metadataPrefix=didl')
     self.assertEqual(didl_fetcher._set, 'oac:images')
     self.assertEqual(didl_fetcher._metadataPrefix, 'didl')
     rec = didl_fetcher.next()
     self.assertIsInstance(rec, dict)
     self.assertIn('id', rec)
     self.assertEqual(rec['id'], 'oai:ucispace-prod.lib.uci.edu:10575/25')
     self.assertEqual(rec['title'], ['Schedule of lectures'])
     self.assertIn('datestamp', rec)
     self.assertEqual(rec['datestamp'], '2015-05-20T11:04:23Z')
     self.assertEqual(
         httpretty.last_request().querystring, {
             u'verb': [u'ListRecords'],
             u'set': [u'oac:images'],
             u'metadataPrefix': [u'didl']
         })
     self.assertEqual(
         rec['Resource']['@ref'],
         'http://ucispace-prod.lib.uci.edu/xmlui/bitstream/' +
         '10575/25/1/!COLLOQU.IA.pdf')
     self.assertEqual(rec['Item']['@id'],
                      'uuid-640925bd-9cdf-46be-babb-b2138c3fce9c')
     self.assertEqual(rec['Component']['@id'],
                      'uuid-897984d8-9392-4a68-912f-ffdf6fd7ce59')
     self.assertIn('Descriptor', rec)
     self.assertEqual(rec['Statement']['@mimeType'],
                      'application/xml; charset=utf-8')
     self.assertEqual(
         rec['DIDLInfo']['{urn:mpeg:mpeg21:2002:02-DIDL-NS}DIDLInfo'][0]
         ['text'], '2015-05-20T20:30:26Z')
     del didl_fetcher
Exemplo n.º 33
0
 def testIterateOverResults(self):
     '''Test the iteration over a mock set of data.

     Five fixture pages are queued for POST requests to the solr select
     URL; a SolrFetcher with rows=3 is iterated to exhaustion and the
     record count and last record title are checked.
     '''
     def read_fixture(name):
         # Close every fixture file as soon as it has been read.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     httpretty.register_uri(
         httpretty.POST,
         'http://example.edu/solr/select',
         responses=[
             httpretty.Response(body=read_fixture(
                 '/ucsd-new-feed-missions-bb3038949s-%d.xml' % page_no))
             for page_no in range(5)
         ])
     h = fetcher.SolrFetcher('http://example.edu/solr',
                             'extra_data',
                             rows=3)
     self.assertEqual(len(h.resp.results), 3)
     n = 0
     # ``r`` is read after the loop: it holds the last record yielded.
     for r in h:
         n += 1
     self.assertEqual(['Mission at Santa Barbara'], r['title_tesim'])
     self.assertEqual(n, 10)
Exemplo n.º 34
0
 def testInit(self):
     '''Basic tdd start: check the attributes of a new YouTube_Fetcher.'''
     url = 'https://example.edu'
     playlist_id = 'testplaylist'
     page_size = 3
     url_first = fetcher.YouTube_Fetcher.url_playlistitems.format(
         api_key='boguskey',
         page_size=page_size,
         playlist_id=playlist_id,
         page_token='')
     # NOTE(review): a Flickr XML fixture is served for the YouTube
     # playlist URL; no assertion below depends on the body - confirm
     # this is intentional.
     # The fixture is read in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES + '/flickr-public-photos-1.xml') as fixture:
         first_page_body = fixture.read()
     httpretty.register_uri(httpretty.GET,
                            url_first,
                            body=first_page_body)
     h = fetcher.YouTube_Fetcher(url, playlist_id, page_size=page_size)
     self.assertEqual(h.url_base, url)
     self.assertEqual(h.playlist_id, playlist_id)
     self.assertEqual(h.api_key, 'boguskey')
     self.assertEqual(h.page_size, page_size)
     self.assertEqual(h.playlistitems, {'nextPageToken': ''})
     self.assertEqual(
         h.url_playlistitems,
         'https://www.googleapis.com/youtube/v3/playlistItems'
         '?key={api_key}&maxResults={page_size}&part=contentDetails&'
         'playlistId={playlist_id}&pageToken={page_token}')
     self.assertEqual(
         h.url_video, 'https://www.googleapis.com/youtube/v3/videos?'
         'key={api_key}&part=snippet&id={video_ids}')
Exemplo n.º 35
0
    def test_get_isShownBy_video(self, mock_deepharvest, mock_boto):
        '''Test getting the correct isShownBy value for a Nuxeo video object.

        Both mock arguments are provided from outside this method; only
        mock_deepharvest is used in the body.
        '''
        def read_fixture(name):
            # Close every fixture file as soon as it has been read.
            with open(DIR_FIXTURES + name) as fixture:
                return fixture.read()

        deepharvest_mocker(mock_deepharvest)

        search_url = (
            'https://example.edu/api/v1/path/@search?query=SELECT+%2A+FROM+'
            'Document+WHERE+ecm%3AparentId+%3D+'
            '%274c80e254-6def-4230-9f28-bc48878568d4%27+'
            'AND+ecm%3AcurrentLifeCycleState+%21%3D+%27deleted%27+ORDER+BY+'
            'ecm%3Apos&currentPageIndex=0&pageSize=100')
        httpretty.register_uri(
            httpretty.GET,
            search_url,
            responses=[
                httpretty.Response(
                    body=read_fixture('/nuxeo_no_children.json'),
                    status=200),
            ])

        h = fetcher.NuxeoFetcher('https://example.edu/api/v1',
                                 'path-to-asset/here')

        nuxeo_metadata = json.loads(read_fixture('/nuxeo_doc_video.json'))
        isShownBy = h._get_isShownBy(nuxeo_metadata)

        self.assertEqual(
            isShownBy, 'https://s3.amazonaws.com/static.ucldc.cdlib.org/'
            'ucldc-nuxeo-thumb-media/4c80e254-6def-4230-9f28-bc48878568d4')
Exemplo n.º 36
0
 def testHarvestControllerExists(self):
     '''Check that a HarvestController is built with the expected
     attributes, fetcher type and s3path for collection 197.
     '''
     # Read fixtures inside ``with`` blocks so the files get closed.
     with open(DIR_FIXTURES + '/collection_api_test.json') as fixture:
         collection_body = fixture.read()
     with open(DIR_FIXTURES + '/testOAI-128-records.xml') as fixture:
         oai_body = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=collection_body)
     httpretty.register_uri(
         httpretty.GET,
         re.compile("http://content.cdlib.org/oai?.*"),
         body=oai_body)
     collection = Collection(
         'https://registry.cdlib.org/api/v1/collection/197/')
     controller = fetcher.HarvestController(
         '*****@*****.**',
         collection,
         config_file=self.config_file,
         profile_path=self.profile_path)
     try:
         self.assertTrue(hasattr(controller, 'fetcher'))
         self.assertIsInstance(controller.fetcher, fetcher.OAIFetcher)
         self.assertTrue(hasattr(controller, 'campus_valid'))
         self.assertTrue(hasattr(controller, 'dc_elements'))
         self.assertTrue(hasattr(controller, 'datetime_start'))
         self.assertEqual(controller.s3path,
                          'data-fetched/197/2017-07-14-1201/')
     finally:
         # Remove the controller's save directory even when an assertion
         # above fails, so a failing run leaves nothing behind.
         shutil.rmtree(controller.dir_save)
Exemplo n.º 37
0
    def setUp(self):
        '''Mock the registry and OAI endpoints, build the OAI controller
        and freeze ``datetime.datetime.now``.

        The controller is created before ``datetime.datetime`` is
        replaced. The original class is kept in ``self.old_dt``
        (NOTE(review): presumably restored in tearDown - confirm).
        '''
        super(HarvestControllerTestCase, self).setUp()

        def read_fixture(name):
            # Close every fixture file as soon as it has been read.
            with open(DIR_FIXTURES + name) as fixture:
                return fixture.read()

        httpretty.register_uri(
            httpretty.GET,
            "https://registry.cdlib.org/api/v1/collection/197/",
            body=read_fixture('/collection_api_test.json'))
        httpretty.register_uri(httpretty.GET,
                               re.compile("http://content.cdlib.org/oai?.*"),
                               body=read_fixture('/testOAI-128-records.xml'))
        self.collection = Collection(
            'https://registry.cdlib.org/api/v1/collection/197/')
        config_file, profile_path = self.setUp_config(self.collection)
        self.controller_oai = fetcher.HarvestController(
            '*****@*****.**',
            self.collection,
            profile_path=profile_path,
            config_file=config_file)
        self.objset_test_doc = json.loads(
            read_fixture('/objset_test_doc.json'))

        class myNow(datetime.datetime):
            # datetime subclass whose now() always returns a fixed moment.
            @classmethod
            def now(cls):
                return cls(2017, 7, 14, 12, 1)

        self.old_dt = datetime.datetime
        datetime.datetime = myNow
Exemplo n.º 38
0
 def testAddRegistryData(self):
     '''Unittest the _add_registry_data function.

     A fresh HarvestController is created for collection 197 and
     _add_registry_data is applied to a small dict; the resulting
     'collection' entry and its nested campus/repository ids are checked.
     '''
     def read_fixture(name):
         # Close every fixture file as soon as it has been read.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     url_collection = 'https://registry.cdlib.org/api/v1/collection/197/'
     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/197/",
         body=read_fixture('/collection_api_test.json'))
     httpretty.register_uri(httpretty.GET,
                            re.compile("http://content.cdlib.org/oai?.*"),
                            body=read_fixture('/testOAI-128-records.xml'))
     collection = Collection(url_collection)
     self.tearDown_config()  # remove ones setup in setUp
     self.setUp_config(collection)
     controller = fetcher.HarvestController('*****@*****.**',
                                            collection,
                                            config_file=self.config_file,
                                            profile_path=self.profile_path)
     obj = {'id': 'fakey', 'otherdata': 'test'}
     self.assertNotIn('collection', obj)
     controller._add_registry_data(obj)
     self.assertIn('collection', obj)
     self.assertEqual(obj['collection'][0]['@id'], url_collection)
     # campus and repository belong on the collection entry, not on the
     # object itself
     self.assertNotIn('campus', obj)
     self.assertIn('campus', obj['collection'][0])
     self.assertNotIn('repository', obj)
     self.assertIn('repository', obj['collection'][0])
     # need to test one without campus
     self.assertEqual(obj['collection'][0]['campus'][0]['@id'],
                      'https://registry.cdlib.org/api/v1/campus/12/')
     self.assertEqual(obj['collection'][0]['repository'][0]['@id'],
                      'https://registry.cdlib.org/api/v1/repository/37/')
Exemplo n.º 39
0
 def testIterateOverResults(self):
     '''Test the iteration over a mock set of data.

     Four JSON fixture pages are queued for GET requests to the solr
     query URL. The test also checks that PySolrFetcher cannot be
     created without arguments and that the query path is built as
     expected.
     '''
     def read_fixture(name):
         # Close every fixture file as soon as it has been read.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     httpretty.register_uri(
         httpretty.GET,
         'http://example.edu/solr/query',
         responses=[
             httpretty.Response(body=read_fixture(
                 '/ucsd-new-feed-missions-bb3038949s-%d.json' % page_no))
             for page_no in range(4)
         ])
     self.assertRaises(TypeError, fetcher.PySolrFetcher)
     h = fetcher.PySolrQueryFetcher('http://example.edu/solr', 'extra_data',
                                    rows=3)
     self.assertEqual(
         h._query_path,
         'query?q=extra_data&sort=id+asc&cursorMark=%2A&wt=json&rows=3')
     n = 0
     # ``r`` is read after the loop: it holds the last record yielded.
     for r in h:
         n += 1
     self.assertEqual(n, 10)
     self.assertEqual(['Mission Santa Ynez'], r['title_tesim'])
Exemplo n.º 40
0
 def setUp(self):
     '''Serve the testOAI.xml fixture for the OAI URL and build the
     OAIFetcher used by the tests.
     '''
     super(OAIFetcherTestCase, self).setUp()
     url_oai = 'http://content.cdlib.org/oai'
     # ``with`` guarantees the fixture file is closed after reading.
     with open(DIR_FIXTURES+'/testOAI.xml') as fixture:
         httpretty.register_uri(
                 httpretty.GET,
                 url_oai,
                 body=fixture.read())
     self.fetcher = fetcher.OAIFetcher(url_oai, 'oac:images')
Exemplo n.º 41
0
 def setUp(self):
     '''Mock the registry API root document and create the Registry.'''
     # JSON listing the campus, collection and repository endpoints,
     # served for the API root URL.
     api_root_body = (
         '{"campus": {"list_endpoint": "/api/v1/campus/", '
         '"schema": "/api/v1/campus/schema/"}, '
         '"collection": {"list_endpoint": "/api/v1/collection/", '
         '"schema": "/api/v1/collection/schema/"}, '
         '"repository": {"list_endpoint": "/api/v1/repository/", '
         '"schema": "/api/v1/repository/schema/"}}')
     httpretty.register_uri(httpretty.GET,
                            'https://registry.cdlib.org/api/v1/',
                            body=api_root_body)
     self.registry = Registry()
Exemplo n.º 42
0
 def testCollectionNoEnrichItems(self):
     '''Accessing dpla_profile_obj must raise ValueError for the
     collection served from the ``collection_api_no_enrich_item.json``
     fixture.
     '''
     # Read the fixture in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES +
               '/collection_api_no_enrich_item.json') as fixture:
         collection_body = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         "https://registry.cdlib.org/api/v1/collection/36/",
         body=collection_body)
     c = Collection("https://registry.cdlib.org/api/v1/collection/36/")
     with self.assertRaises(ValueError):
         c.dpla_profile_obj
Exemplo n.º 43
0
 def testCMISFetch(self):
     '''A CMISAtomFeedFetcher built on the mocked descendants feed
     exposes 42 objects.
     '''
     # Read the fixture in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES + '/cmis-atom-descendants.xml') as fixture:
         feed_body = fixture.read()
     httpretty.register_uri(httpretty.GET,
                            'http://cmis-atom-endpoint/descendants',
                            body=feed_body)
     h = fetcher.CMISAtomFeedFetcher(
         'http://cmis-atom-endpoint/descendants', 'uname, pswd')
     self.assertTrue(hasattr(h, 'objects'))
     self.assertEqual(42, len(h.objects))
 def testEnrichDoc(self):
     '''akara_enrich_doc returns the document served by the mocked
     enrich endpoint.
     '''
     # Both fixtures are read inside ``with`` blocks so the files are
     # closed.
     with open(DIR_FIXTURES+'/akara_response.json') as fixture:
         akara_body = fixture.read()
     httpretty.register_uri(httpretty.POST,
             'http://localhost:8889/enrich',
             body=akara_body,
             )
     with open(DIR_FIXTURES+'/couchdb_doc.json') as fixture:
         indoc = json.load(fixture)
     doc = akara_enrich_doc(indoc, '/select-oac-id,/dpla_mapper?mapper_type=oac_dc')
     self.assertIn('added-key', doc['sourceResource'])
     self.assertEqual(doc['sourceResource']['title'], 'changed title')
Exemplo n.º 45
0
 def testCollectionNoEnrichItems(self):
     '''Check that reading dpla_profile_obj raises ValueError for
     collection 36 as served by the mocked registry.
     '''
     url_collection = "https://registry.cdlib.org/api/v1/collection/36/"
     # ``with`` guarantees the fixture file is closed after reading.
     with open(DIR_FIXTURES + '/collection_api_no_enrich_item.json') as f:
         httpretty.register_uri(httpretty.GET, url_collection,
                                body=f.read())
     c = Collection(url_collection)
     with self.assertRaises(ValueError):
         c.dpla_profile_obj
Exemplo n.º 46
0
 def setUp(self):
     '''Mock the first OAC JSON result page and create the fetcher.

     The mocked URL is registered before the parent setUp is called.
     '''
     # Read the fixture in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES + '/testOAC-url_next-0.json') as fixture:
         first_page_body = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/hb5d5nb7dj',
         body=first_page_body)
     super(OAC_JSON_FetcherTestCase, self).setUp()
     self.fetcher = fetcher.OAC_JSON_Fetcher(
         'http://dsc.cdlib.org/search?rmode=json&facet=type-tab&'
         'style=cui&relation=ark:/13030/hb5d5nb7dj', 'extra_data')
Exemplo n.º 47
0
 def testHarvestIsIter(self):
     '''The fetcher is its own iterator, and next_record() and next()
     can both be called on it.
     '''
     # Read the fixture in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES + '/testOAC-url_next-1.json') as fixture:
         next_page_body = fixture.read()
     # Mock the URL carrying startDoc=26.
     httpretty.register_uri(
         httpretty.GET,
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/hb5d5nb7dj&startDoc=26',
         body=next_page_body)
     self.assertTrue(hasattr(self.fetcher, '__iter__'))
     self.assertEqual(self.fetcher, self.fetcher.__iter__())
     self.fetcher.next_record()
     self.fetcher.next()
Exemplo n.º 48
0
 def testBadOACSearch(self):
     '''Creating an OAC_XML_Fetcher for the mocked bad search URL raises
     ValueError.
     '''
     bad_search_url = (
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/hb5d5nb7dj--xxxx')
     # Read the fixture in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES + '/testOAC-badsearch.xml') as fixture:
         bad_search_body = fixture.read()
     httpretty.register_uri(httpretty.GET,
                            bad_search_url,
                            body=bad_search_body)
     self.assertRaises(ValueError, fetcher.OAC_XML_Fetcher,
                       bad_search_url, 'extra_data')
Exemplo n.º 49
0
 def setUp(self):
     '''Mock the first OAC XML result page and create the fetcher.

     The mocked URL is registered before the parent setUp is called.
     '''
     search_url = (
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/tf0c600134')
     # Read the fixture in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES + '/testOAC-url_next-0.xml') as fixture:
         first_page_body = fixture.read()
     httpretty.register_uri(httpretty.GET,
                            search_url,
                            body=first_page_body)
     super(OAC_XML_FetcherTestCase, self).setUp()
     self.fetcher = fetcher.OAC_XML_Fetcher(search_url, 'extra_data')
Exemplo n.º 50
0
 def testBadOACSearch(self):
     '''OAC_XML_Fetcher construction must raise ValueError when the
     search URL is served from the ``testOAC-badsearch.xml`` fixture.
     '''
     # ``with`` guarantees the fixture file is closed after reading.
     with open(DIR_FIXTURES + '/testOAC-badsearch.xml') as fixture:
         httpretty.register_uri(
             httpretty.GET,
             'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
             'relation=ark:/13030/hb5d5nb7dj--xxxx',
             body=fixture.read())
     self.assertRaises(
         ValueError, fetcher.OAC_XML_Fetcher,
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/hb5d5nb7dj--xxxx', 'extra_data')
Exemplo n.º 51
0
 def setUp(self):
     '''Register the first mocked OAC JSON page, run the parent setUp
     and build the OAC_JSON_Fetcher under test.
     '''
     # ``with`` guarantees the fixture file is closed after reading.
     with open(DIR_FIXTURES + '/testOAC-url_next-0.json') as fixture:
         httpretty.register_uri(
             httpretty.GET,
             'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
             'relation=ark:/13030/hb5d5nb7dj',
             body=fixture.read())
     super(OAC_JSON_FetcherTestCase, self).setUp()
     self.fetcher = fetcher.OAC_JSON_Fetcher(
         'http://dsc.cdlib.org/search?rmode=json&facet=type-tab&'
         'style=cui&relation=ark:/13030/hb5d5nb7dj', 'extra_data')
Exemplo n.º 52
0
 def testHarvestIsIter(self):
     '''Check the iterator protocol of the fetcher: it has __iter__,
     __iter__() returns the fetcher itself, and next_record() and
     next() can be called.
     '''
     # ``with`` guarantees the fixture file is closed after reading.
     with open(DIR_FIXTURES + '/testOAC-url_next-1.json') as fixture:
         httpretty.register_uri(
             httpretty.GET,
             'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
             'relation=ark:/13030/hb5d5nb7dj&startDoc=26',
             body=fixture.read())
     self.assertTrue(hasattr(self.fetcher, '__iter__'))
     self.assertEqual(self.fetcher, self.fetcher.__iter__())
     self.fetcher.next_record()
     self.fetcher.next()
Exemplo n.º 53
0
 def testCMISFetch(self):
     '''Check that CMISAtomFeedFetcher loads 42 objects from the mocked
     descendants feed.
     '''
     url_feed = 'http://cmis-atom-endpoint/descendants'
     # ``with`` guarantees the fixture file is closed after reading.
     with open(DIR_FIXTURES+'/cmis-atom-descendants.xml') as fixture:
         httpretty.register_uri(
             httpretty.GET,
             url_feed,
             body=fixture.read())
     h = fetcher.CMISAtomFeedFetcher(url_feed, 'uname, pswd')
     self.assertTrue(hasattr(h, 'objects'))
     self.assertEqual(42, len(h.objects))
 def testResourceIteratorOnePage(self):
     '''Test when less than one page worth of objects fetched'''
     # Read the fixture in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES+'/registry_api_campus.json') as fixture:
         campus_body = fixture.read()
     httpretty.register_uri(httpretty.GET,
             'https://registry.cdlib.org/api/v1/campus/',
             body=campus_body)
     # Drain the iterator into a list.
     campuses = list(self.registry.resource_iter('campus'))
     self.assertEqual(len(campuses), 10)
     self.assertEqual(campuses[0]['slug'], 'UCB')
Exemplo n.º 55
0
    def testNuxeoHarvest(self, mock_deepharvest, mock_boto, mock_boto3):
        '''Test the function of the Nuxeo harvest.

        Runs ``harvest()`` on a HarvestController for collection 19 with
        the deep-harvest object list, the boto media JSON and the Nuxeo
        document endpoint all mocked. It then checks the count returned
        and the registry data stored on the first saved object.
        mock_boto3 is accepted but not used in this method.
        '''
        def read_fixture(name):
            # Close every fixture file as soon as it has been read.
            with open(DIR_FIXTURES + name) as fixture:
                return fixture.read()

        media_json = read_fixture('/nuxeo_media_structmap.json')
        mock_boto.return_value.get_bucket.return_value.\
            get_key.return_value.\
            get_contents_as_string.return_value = media_json
        httpretty.register_uri(
            httpretty.GET,
            'http://registry.cdlib.org/api/v1/collection/19/',
            body=read_fixture('/collection_api_test_nuxeo.json'))
        mock_deepharvest.return_value.fetch_objects.return_value = json.loads(
            read_fixture('/nuxeo_object_list.json'))
        httpretty.register_uri(
            httpretty.GET,
            re.compile('https://example.edu/Nuxeo/site/api/v1/id/.*'),
            body=read_fixture('/nuxeo_doc.json'))

        self.collection = Collection(
            'http://registry.cdlib.org/api/v1/collection/19/')
        # The config parser is patched only while the controller is
        # being constructed.
        with patch(
                'ConfigParser.SafeConfigParser',
                autospec=True) as mock_configparser:
            config_inst = mock_configparser.return_value
            config_inst.get.return_value = 'dublincore,ucldc_schema,picture'
            self.setUp_config(self.collection)
            self.controller = fetcher.HarvestController(
                '*****@*****.**',
                self.collection,
                config_file=self.config_file,
                profile_path=self.profile_path)
        self.assertTrue(hasattr(self.controller, 'harvest'))
        num = self.controller.harvest()
        self.assertEqual(num, 5)
        self.tearDown_config()
        # verify one record has collection and such filled in
        fname = os.listdir(self.controller.dir_save)[0]
        with open(os.path.join(self.controller.dir_save, fname)) as saved:
            saved_objset = json.load(saved)
        saved_obj = saved_objset[0]
        saved_collection = saved_obj['collection'][0]
        self.assertEqual(saved_collection['@id'],
                         u'http://registry.cdlib.org/api/v1/collection/19/')
        self.assertEqual(saved_collection['name'],
                         u'Cochems (Edward W.) Photographs')
        self.assertEqual(saved_collection['title'],
                         u'Cochems (Edward W.) Photographs')
        self.assertEqual(saved_collection['id'], u'19')
        self.assertEqual(saved_collection['dcmi_type'], 'I')
        self.assertEqual(saved_collection['rights_statement'],
                         'a sample rights statement')
        self.assertEqual(saved_collection['rights_status'], 'PD')
        self.assertEqual(saved_obj['state'], 'project')
        self.assertEqual(
            saved_obj['title'],
            'Adeline Cochems having her portrait taken by her father '
            'Edward W, Cochems in Santa Ana, California: Photograph')
 def testResourceIteratorReturnsCollection(self):
     '''Test that the resource iterator returns a Collection object
     for library collection resources'''
     # Read the fixture in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES+'/registry_api_collection.json') as fixture:
         collection_list_body = fixture.read()
     httpretty.register_uri(httpretty.GET,
             'https://registry.cdlib.org/api/v1/collection/',
             body=collection_list_body)
     riter = self.registry.resource_iter('collection')
     c = riter.next()
     self.assertTrue(isinstance(c, Collection))
     self.assertTrue(hasattr(c, 'auth'))
     self.assertEqual(c.auth, None)
Exemplo n.º 57
0
 def testOAC_JSON_FetcherReturnedData(self):
     '''test that the data returned by the OAC Fetcher is a proper dc
     dictionary
     '''
     # Read the fixture in a ``with`` block so the file is closed.
     with open(DIR_FIXTURES + '/testOAC-url_next-1.json') as fixture:
         next_page_body = fixture.read()
     httpretty.register_uri(
         httpretty.GET,
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/hb5d5nb7dj&startDoc=26',
         body=next_page_body)
     # next() returns a sequence; only its first element is inspected.
     rec = self.fetcher.next()[0]
     self.assertIsInstance(rec, dict)
Exemplo n.º 58
0
 def test_fetching(self):
     '''Iterate a YouTube_Fetcher over two mocked playlist pages.

     The playlist URL answers first with a page that has a next token
     and then with one that has none (per the fixture names). The video
     URL answers twice with the same video fixture. Six videos in total
     are expected.
     '''
     def read_fixture(name):
         # Close every fixture file as soon as it has been read.
         with open(DIR_FIXTURES + name) as fixture:
             return fixture.read()

     url = 'https://example.edu'
     playlist_id = 'testplaylist'
     page_size = 3
     url_first = fetcher.YouTube_Fetcher.url_playlistitems.format(
         api_key='boguskey',
         page_size=page_size,
         playlist_id=playlist_id,
         page_token='')
     url_vids = fetcher.YouTube_Fetcher.url_video
     # Ugly but works
     playlist_pages = ('/youtube_playlist_with_next.json',
                       '/youtube_playlist_no_next.json')
     httpretty.register_uri(
         httpretty.GET,
         url_first,
         responses=[
             httpretty.Response(body=read_fixture(page), status=200)
             for page in playlist_pages
         ])
     # The same video fixture is served for both video requests.
     video_body = read_fixture('/youtube_video.json')
     httpretty.register_uri(
         httpretty.GET,
         url_vids,
         responses=[
             httpretty.Response(body=video_body, status=200),
             httpretty.Response(body=video_body, status=200),
         ])
     h = fetcher.YouTube_Fetcher(url, playlist_id, page_size=page_size)
     vids = []
     for batch in h:
         vids.extend(batch)
     self.assertEqual(len(vids), 6)
     self.assertEqual(vids[0], {
         u'contentDetails': {
             u'definition': u'sd',
             u'projection': u'rectangular',
             u'caption': u'false',
             u'duration': u'PT19M35S',
             u'licensedContent': True,
             u'dimension': u'2d'
         },
         u'kind': u'youtube#video',
         u'etag':
         u'"m2yskBQFythfE4irbTIeOgYYfBU/-3AtVAYcRLEynWZprpf0OGaY8zo"',
         u'id': u'0Yx8zrbsUu8'
     })
Exemplo n.º 59
0
 def testOverrideMetadataPrefix(self):
     '''Test that the metadataPrefix for an OAI feed can be overridden.

     The extra_data for OAI can be either just a set spec or an html
     query string of set= &metadataPrefix=

     First a fetcher is built with only ``set=``, and the request is
     checked to carry ``metadataPrefix=oai_dc``. Then a fetcher is built
     with ``metadataPrefix=didl`` and both the request query string and
     the DIDL-specific fields of the returned record are checked.
     '''

     def read_fixture(name):
         # Read a fixture file and close the handle right away.
         with open(DIR_FIXTURES+name) as fixture:
             return fixture.read()

     httpretty.register_uri(
         httpretty.GET,
         'http://content.cdlib.org/oai',
         body=read_fixture('/testOAI.xml'))
     set_fetcher = fetcher.OAIFetcher('http://content.cdlib.org/oai',
                                      'set=oac:images')
     self.assertEqual(set_fetcher._set, 'oac:images')
     rec = set_fetcher.next()
     self.assertIsInstance(rec, dict)
     self.assertIn('id', rec)
     self.assertEqual(rec['id'], '13030/hb796nb5mn')
     self.assertIn('datestamp', rec)
     # Compare for equality: assertIn(value, '2005-12-13') would only
     # check that the value is a substring of the expected date, which
     # also passes for '' or '2005'.
     self.assertEqual(rec['datestamp'], '2005-12-13')
     self.assertEqual(httpretty.last_request().querystring,
                      {u'verb': [u'ListRecords'], u'set': [u'oac:images'],
                       u'metadataPrefix': [u'oai_dc']})
     # Re-register the same URL so the next fetcher gets the DIDL feed.
     httpretty.register_uri(
         httpretty.GET,
         'http://content.cdlib.org/oai',
         body=read_fixture('/testOAI-didl.xml'))
     didl_fetcher = fetcher.OAIFetcher('http://content.cdlib.org/oai',
                                       'set=oac:images&metadataPrefix=didl')
     self.assertEqual(didl_fetcher._set, 'oac:images')
     self.assertEqual(didl_fetcher._metadataPrefix, 'didl')
     rec = didl_fetcher.next()
     self.assertIsInstance(rec, dict)
     self.assertIn('id', rec)
     self.assertEqual(rec['id'], 'oai:ucispace-prod.lib.uci.edu:10575/25')
     self.assertEqual(rec['title'], ['Schedule of lectures'])
     self.assertIn('datestamp', rec)
     self.assertEqual(rec['datestamp'], '2015-05-20T11:04:23Z')
     self.assertEqual(httpretty.last_request().querystring,
                      {u'verb': [u'ListRecords'], u'set': [u'oac:images'],
                       u'metadataPrefix': [u'didl']})
     self.assertEqual(rec['Resource']['@ref'],
                      'http://ucispace-prod.lib.uci.edu/xmlui/bitstream/' +
                      '10575/25/1/!COLLOQU.IA.pdf')
     self.assertEqual(rec['Item']['@id'],
                      'uuid-640925bd-9cdf-46be-babb-b2138c3fce9c')
     self.assertEqual(rec['Component']['@id'],
                      'uuid-897984d8-9392-4a68-912f-ffdf6fd7ce59')
     self.assertIn('Descriptor', rec)
     self.assertEqual(rec['Statement']['@mimeType'],
                      'application/xml; charset=utf-8')
     self.assertEqual(
         rec['DIDLInfo']
         ['{urn:mpeg:mpeg21:2002:02-DIDL-NS}DIDLInfo'][0]['text'],
         '2015-05-20T20:30:26Z')
     del didl_fetcher
Exemplo n.º 60
0
 def testAmpersandInDoc(self):
     httpretty.register_uri(
         httpretty.GET,
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/hb5d5nb7dj',
         body=open(DIR_FIXTURES + '/testOAC-utf8-content.xml').read())
     h = fetcher.OAC_XML_Fetcher(
         'http://dsc.cdlib.org/search?facet=type-tab&style=cui&raw=1&'
         'relation=ark:/13030/hb5d5nb7dj', 'extra_data')
     self.assertEqual(h.totalDocs, 25)
     self.assertEqual(h.currentDoc, 0)
     h.next()