def test_coins_citation_info(self):
    """coins_citation_info should expose only the metadata actually set
    on the item, keyed by OpenURL/COinS field names."""
    # minimal record: only title and url set
    item = DisplayItem(title='Hippo', url='http://some.url/to/a/hippo/pic')
    info = item.coins_citation_info
    # use assertIn/assertNotIn instead of the deprecated assert_ alias
    # (assert_ was removed from unittest in Python 3.12)
    self.assertIn('rfr_id', info, 'referrer id should be set in COinS info')
    self.assertIn('rft_val_fmt', info, 'format is specified in COinS info')
    self.assertEqual(item.title, info['rft.title'])
    self.assertEqual(item.url, info['rft.identifier'])
    for key in ['rft.date', 'rft.place', 'rft.source', 'rft.format']:
        self.assertNotIn(key, info,
                         'unavailable data should not be set in COinS info')
    # add all fields to simulate a complete record
    item.date = '1887'
    item.format = 'Image'
    item.source = 'Smithsonian'
    item.location = 'USA'
    info = item.coins_citation_info
    self.assertEqual(item.date, info['rft.date'])
    self.assertEqual(item.format, info['rft.format'])
    self.assertEqual(item.source, info['rft.source'])
    self.assertEqual(item.location, info['rft.place'])
def find_items(keywords):
    """Search the DPLA item API for the given keywords.

    example use:
    keyword should be a list of terms
    DPLA.find_items(keywords=['term1', 'term2'])

    Returns a list of DisplayItem objects; records without a preview
    image url are skipped.
    """
    client = Bibs()
    query = 'api_key->%s:q->%s' % (DPLA.API_KEY, ' OR '.join(keywords))
    # query from unicode string to regular string
    query = query.encode("utf8", "ignore")
    logger.debug('dpla query: %s' % query)
    # TODO: restrict to image only, or at least things with preview image
    started = time.time()
    results = client.search(query, 'dplav2', 'items')
    # TODO: error handling...
    logger.info('dpla query completed in %.2f sec' % (time.time() - started))
    items = []
    for doc in results['docs']:
        record = doc['sourceResource']
        # for now, just skip items without an image url
        if not doc.get('object', None):
            continue
        display = DisplayItem(
            title=record.get('title', None),
            format=record.get('type', None),
            # collection or provider here? src_rec['collection']['title']
            # NOTE: collection apparently not set for all items
            source=doc['provider'].get('name', None),
            # according to dpla docs, should be url preview for item
            # docs reference a field for object mimetype, not seeing in results
            thumbnail=doc.get('object', None),
            # url on provider's website with context
            url=doc.get('isShownAt', None))
        if 'date' in record:
            display.date = record['date'].get('displayDate', None)
        spatial = record.get('spatial')
        if spatial:
            # sometimes a list but not always
            place = spatial[0] if isinstance(spatial, list) else spatial
            # country? state? coords?
            display.location = place.get('name', None)
        # Add the aggregator for reference
        display.aggregator = DPLA.name
        items.append(display)
    return items
def find_items(keywords):
    """Search the DPLA item API for the given keywords and return a list
    of DisplayItem objects linking back to the item page on the DPLA site.

    example use:
    keyword should be a list of terms
    DPLA.find_items(keywords=['term1', 'term2'])
    """
    api = Bibs()
    qry = 'api_key->%s:q->%s' % (DPLA.API_KEY, ' OR '.join(keywords))
    #qry from unicode string to regular string
    qry = qry.encode("utf8", "ignore")
    logger.debug('dpla query: %s' % qry)
    # TODO: restrict to image only, or at least things with preview image
    start = time.time()
    results = api.search(qry, 'dplav2', 'items')
    # TODO: error handling...
    logger.info('dpla query completed in %.2f sec' % (time.time() - start))
    items = []
    for doc in results['docs']:
        src_res = doc['sourceResource']
        # for now, just skip items without an image url
        if not doc.get('object', None):
            continue
        # url on DPLA site
        # NOTE(review): `url` is not defined anywhere in this function --
        # presumably a module-level DPLA base-url constant (e.g.
        # 'http://dp.la/'); confirm it exists at module scope, otherwise
        # this raises NameError on the first result.
        item_url = '%sitem/%s' % (url, doc.get('id'))
        i = DisplayItem(
            title=src_res.get('title', None),
            format=src_res.get('type', None),
            source=doc['provider'].get('name', None),
            # collection or provider here? src_rec['collection']['title']
            # NOTE: collection apparently not set for all items
            thumbnail=doc.get('object', None),
            # according to dpla docs, should be url preview for item
            # docs reference a field for object mimetype, not seeing in results
            # url on DPLA site
            url=item_url)
        if 'date' in src_res:
            i.date = src_res['date'].get('displayDate', None)
        if 'spatial' in src_res and src_res['spatial']:
            # sometimes a list but not always
            if isinstance(src_res['spatial'], list):
                space = src_res['spatial'][0]
            else:
                space = src_res['spatial']
            # country? state? coords?
            i.location = space.get('name', None)
        # Add the aggregator for reference
        i.aggregator = DPLA.name
        items.append(i)
    return items
def find_items(keywords=None):
    """Search the Trove API for the given keywords.

    Returns a list of DisplayItem objects; items without a thumbnail url
    are skipped.  API errors are logged and yield an empty list.
    """
    # NOTE: default changed from a shared mutable list ([]) to None --
    # callers see identical behavior, but no mutable default is shared
    # across calls.
    if keywords is None:
        keywords = []
    qry = ' OR '.join(keywords)
    # qry from unicode string to regular string
    qry = qry.encode("utf8", "ignore")
    logger.debug('trove query: %s' % qry)
    qry_url = Trove.API_URL % (quote_plus(qry), Trove.API_KEY)
    items = []
    start = time.time()
    try:
        response = urlopen(qry_url)
    # HTTPError subclasses URLError and both were handled identically,
    # so one handler covers both
    except (HTTPError, URLError) as e:
        logger.error('trove api error: %s' % e)
    else:
        logger.info('trove query completed in %.2f sec'
                    % (time.time() - start))
        results = simplejson.load(response)
        try:
            for doc in results['response']['zone'][0]['records']['work']:
                # skip items without a thumbnail url
                # have to dig around in identifier
                thumbnail = None
                if 'identifier' in doc:
                    for link in doc['identifier']:
                        if link['linktype'] == "thumbnail":
                            thumbnail = link['value']
                if not thumbnail:
                    continue
                i = DisplayItem(
                    title=doc.get('title', None),
                    format='; '.join(doc.get('type', [])),
                    # no way to get contributor name without another API call
                    # so just set source to Trove for now
                    source='Trove',
                    url=doc.get('troveUrl', None),
                    date=doc.get('issued', None),
                    thumbnail=thumbnail
                )
                # Add the aggregator for reference
                i.aggregator = Trove.name
                items.append(i)
        except (KeyError, IndexError, TypeError):
            # Either no results or something was wrong with the JSON
            logger.debug('Trove returned no results')
    return items
def test_coins_citation(self):
    """coins_citation should produce a Z39.88 context string containing
    the item's metadata."""
    # minimal record
    item = DisplayItem(title='Hippo', url='http://some.url/to/a/hippo/pic')
    cit = item.coins_citation
    # just some basic sanity checks
    # assertTrue/assertIn instead of the deprecated assert_ alias
    # (assert_ was removed from unittest in Python 3.12)
    self.assertTrue(cit.startswith('ctx_ver=Z39.88-2004'))
    self.assertIn('rft.title=%s' % item.title, cit)
def find_items(keywords=None):
    """Search the Europeana API for the given keywords.

    Returns a list of DisplayItem objects; records without a preview
    image (edmPreview) are skipped.
    """
    # NOTE: default changed from a shared mutable list ([]) to None --
    # callers see identical behavior, but no mutable default is shared
    # across calls.
    if keywords is None:
        keywords = []
    qry = 'wskey->%s:query->%s' % (
        Europeana.API_KEY,
        ' OR '.join(keywords))
    # qry from unicode string to regular string
    qry = qry.encode("utf8", "ignore")
    logger.debug('europeana query: %s' % qry)
    b = Bibs()
    results = b.search(qry, 'europeanav2', 'search')
    items = []
    # no results! log this error?
    if 'items' not in results:
        return items
    for doc in results['items']:
        # NOTE: result includes a 'completeness' score
        # which we could use for a first-pass filter to weed out junk records
        # for now, just skip items without an image url
        if 'edmPreview' not in doc or not doc['edmPreview']:
            continue
        i = DisplayItem(
            format=doc.get('type', None),
            # NOTE: provider is aggregator (i.e., 'The European Library')
            # dataProvider is original source
            source='; '.join(doc.get('dataProvider', [])),
            # url on provider's website with context
            url=doc.get('guid', None),
            # NOTE: this kwarg is immediately overwritten below whenever
            # edmTimespanLabel is present; kept so date is explicitly None
            # otherwise
            date=doc.get('edmTimespanLabel', None))
        # NOTE: doc['link'] provides json with full record data
        # if we want more item details
        # should NOT be displayed to users (includes api key)
        # preview and title are both lists; for now, in both cases,
        # just grab the first one
        if 'edmTimespanLabel' in doc:
            i.date = doc['edmTimespanLabel'][0]['def']
        if 'title' in doc:
            i.title = doc['title'][0]
        if 'edmPreview' in doc:
            i.thumbnail = doc['edmPreview'][0]
        # Add the aggregator for reference
        i.aggregator = Europeana.name
        # NOTE: spatial/location information doesn't seem to be included
        # in this item result
        items.append(i)
    return items
def test_coins_citation(self):
    """coins_citation should handle minimal records as well as variant
    field content (list titles, integer dates) without raising."""
    # minimal record
    item = DisplayItem(title='Hippo', url='http://some.url/to/a/hippo/pic')
    cit = item.coins_citation
    # just some basic sanity checks
    # assertTrue/assertIn instead of the deprecated assert_ alias
    # (assert_ was removed from unittest in Python 3.12)
    self.assertTrue(cit.startswith('ctx_ver=Z39.88-2004'))
    self.assertIn('rft.title=%s' % item.title, cit)
    # variant content - lists
    item = DisplayItem(title=['Hippo'], url='http://some.url/to/a/hippo/pic')
    # should not throw an exception
    cit = item.coins_citation
    self.assertIn('rft.title=%s' % item.title[0], cit)
    # variant content - integer
    item = DisplayItem(title='Hippo', url='http://some.url/to/a/hippo/pic',
                       date=1936)
    # should not throw an exception
    cit = item.coins_citation
    self.assertIn('rft.date=%s' % item.date, cit)
def find_items(keywords):
    """Search Flickr Commons for the given keywords.

    Returns a list of DisplayItem objects built from the photo search
    results, with thumbnail urls constructed from farm/server/id/secret.
    """
    flickr = flickrapi.FlickrAPI(Flickr.API_KEY)
    start = time.time()
    # NOTE: flickr does support or, but doesn't like too many terms at once
    # (15 terms is apparently too many) -- cap at 10 unique terms
    query = ' OR '.join(set(keywords[:10]))
    logger.debug('flickr query: %s' % query)
    # restrict to first 15 items (only ~10 for other apis currently)
    # extras is a comma-delimited list of extra fields;
    # need owner name for source
    # TODO: future enhancement: access to date, location info, etc
    # extras='owner_name,date_upload,date_taken,geo'
    results = flickr.photos_search(text=query, format='json',
                                   is_commons='true', extras='owner_name',
                                   sort='relevance', per_page=15)
    logger.info('flickr query completed in %.2f sec' % (time.time() - start))
    # flickr wraps the json payload in 'jsonFlickrApi( ... )', which must be
    # stripped before parsing.  Previously done with lstrip/rstrip, but those
    # strip *character sets*, not prefixes/suffixes -- that only worked by
    # accident because the payload begins with '{'.  Slice the exact
    # wrapper off instead.
    _prefix, _suffix = 'jsonFlickrApi(', ')'
    if results.startswith(_prefix) and results.endswith(_suffix):
        results = results[len(_prefix):-len(_suffix)]
    results = simplejson.loads(results)
    items = []
    # no results! log this error?
    # NOTE: could be bad api key; check code/stat in response
    if 'photos' not in results or 'photo' not in results['photos']:
        return items
    for doc in results['photos']['photo']:
        i = DisplayItem(
            format=doc.get('type', None),
            source=doc.get('ownername', None),
            # url on flickr website with context:
            # http://www.flickr.com/photos/{user-id}/{photo-id}
            url='http://www.flickr.com/photos/%(owner)s/%(id)s/' % doc
            # TODO get date data
        )
        # flickr title not a list
        if 'title' in doc:
            i.title = doc['title']
        # build the url back to the image
        # http://farm{farm-id}.staticflickr.com/{server-id}/{id}_{secret}.jpg
        i.thumbnail = ('http://farm%(farm)s.staticflickr.com/'
                       '%(server)s/%(id)s_%(secret)s_m.jpg' % doc)
        # Add the aggregator for reference
        i.aggregator = 'Flickr Commons'
        # NOTE: spatial/location information doesn't seem to be included
        # in this item result
        items.append(i)
    return items
def find_items(keywords=None):
    """Search the Europeana API for the given keywords.

    Returns a list of DisplayItem objects; records without a preview
    image (edmPreview) are skipped.
    """
    # NOTE: default changed from a shared mutable list ([]) to None --
    # callers see identical behavior, but no mutable default is shared
    # across calls.
    if keywords is None:
        keywords = []
    qry = 'wskey->%s:query->%s' % (
        Europeana.API_KEY,
        ' OR '.join(keywords)
    )
    # qry from unicode string to regular string
    qry = qry.encode("utf8", "ignore")
    logger.debug('europeana query: %s' % qry)
    b = Bibs()
    results = b.search(qry, 'europeanav2', 'search')
    items = []
    # no results! log this error?
    if 'items' not in results:
        return items
    for doc in results['items']:
        # NOTE: result includes a 'completeness' score
        # which we could use for a first-pass filter to weed out junk records
        # for now, just skip items without an image url
        if 'edmPreview' not in doc or not doc['edmPreview']:
            continue
        i = DisplayItem(
            format=doc.get('type', None),
            # NOTE: provider is aggregator (i.e., 'The European Library')
            # dataProvider is original source
            source='; '.join(doc.get('dataProvider', [])),
            # url on provider's website with context
            url=doc.get('guid', None),
            # NOTE: this kwarg is immediately overwritten below whenever
            # edmTimespanLabel is present; kept so date is explicitly None
            # otherwise
            date=doc.get('edmTimespanLabel', None)
        )
        # NOTE: doc['link'] provides json with full record data
        # if we want more item details
        # should NOT be displayed to users (includes api key)
        # preview and title are both lists; for now, in both cases,
        # just grab the first one
        if 'edmTimespanLabel' in doc:
            i.date = doc['edmTimespanLabel'][0]['def']
        if 'title' in doc:
            i.title = doc['title'][0]
        if 'edmPreview' in doc:
            i.thumbnail = doc['edmPreview'][0]
        # Add the aggregator for reference
        i.aggregator = Europeana.name
        # NOTE: spatial/location information doesn't seem to be included
        # in this item result
        items.append(i)
    return items