def test_load_session(self):
    from lingcod.common.utils import load_session
    request = TestRequest()
    load_session(request, '0')
    self.assertEquals(request.session, None)
    load_session(request, md5('blah').hexdigest())
    self.assertEquals(request.session.__class__.__name__, 'SessionStore')
def qrcode(req, width):
    import qr
    url = req.GET.get('url')
    if url is None:
        raise Http404
    try:
        data = url.encode('ascii')
    except UnicodeError:
        # only properly urlencoded (ASCII) URLs are supported
        raise Http404
    if width == "480":
        magnify = 8
    else:
        magnify = 4
    buf = StringIO()
    try:
        qr.qrcode(data, buf, format=qr.GIF, magnify=magnify)
    except ValueError:
        # qr module wasn't compiled with the GD library
        raise Http404
    content = buf.getvalue()
    CACHE_TIMEOUT = 86400
    res = HttpResponse(content, content_type='image/gif')
    res['Content-Length'] = str(len(content))
    res['ETag'] = '"%s"' % md5(content).hexdigest()
    res['Last-Modified'] = http_date()
    res['Expires'] = http_date(time.time() + CACHE_TIMEOUT)
    patch_cache_control(res, max_age=CACHE_TIMEOUT)
    return res
def wrapper(self, *args, **kwargs):
    current_class = self.__class__
    blocking_id = \
        getmodule(current_class).__name__ + \
        '.' + \
        current_class.__name__
    method = u""
    for param in register_params:
        method += unicode(param) + u"=" + unicode(kwargs[param])
    digest = md5(method).hexdigest()
    lock_id = "%s-lock-%s" % (blocking_id, digest)
    acquire_lock = lambda: cache.add(lock_id, 'true', expires)
    release_lock = lambda: cache.delete(lock_id)
    if acquire_lock():
        value = cache.get(lock_id)
        try:
            value = view_func(self, *args, **kwargs)
        except Exception, e:
            pass
        finally:
            release_lock()
        return value
def test_load_session(self):
    from madrona.common.utils import load_session
    request = TestRequest()
    load_session(request, '0')
    self.assertEquals(request.session, None)
    load_session(request, md5('blah').hexdigest())
    self.assertEquals(request.session.__class__.__name__, 'SessionStore')
def border(req, style, rgb):
    import gd
    rgb = tuple(map(lambda x: int(x, 16), (rgb[0:2], rgb[2:4], rgb[4:6])))
    try:
        width = int(req.GET.get('w', 228))
    except (ValueError, TypeError):
        width = 228
    try:
        height = int(req.GET.get('h', 1))
    except (ValueError, TypeError):
        height = 1
    if width < 1 or height < 1:
        raise Http404
    if rgb == (0, 0, 0):
        # if the line is black, use white (#FFFFFF) as the background color
        backcolor = (255, 255, 255)
    else:
        backcolor = (0, 0, 0)
    # TODO
    # check display width
    img = gd.image((width, height))
    back = img.colorAllocate(backcolor)
    img.colorTransparent(back)
    color = img.colorAllocate(rgb)
    if style == 'dotted':
        pattern = (color, color, back, back)
    elif style == 'dashed':
        pattern = (color, color, color, back, back, back)
    else:
        # solid
        pattern = (color,)
    img.setStyle(pattern)
    for y in xrange(height):
        img.line((0, y), (width, y), gd.gdStyled)
    fp = StringIO()
    img.writeGif(fp)
    content = fp.getvalue()
    fp.close()
    content_type = 'image/gif'
    CACHE_TIMEOUT = 86400
    res = HttpResponse(content, content_type=content_type)
    res['Content-Type'] = content_type
    res['Content-Length'] = str(len(content))
    res['ETag'] = '"%s"' % md5(content).hexdigest()
    res['Last-Modified'] = http_date()
    res['Expires'] = http_date(time.time() + CACHE_TIMEOUT)
    patch_cache_control(res, max_age=CACHE_TIMEOUT)
    return res
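For context, the two views above take their parameters from the URL path and the query string; a hypothetical URLconf wiring them up, in the Django 1.x style the snippets suggest (module path and patterns are assumptions, not from the source):

from django.conf.urls.defaults import patterns, url

urlpatterns = patterns('',
    # width arrives as a path component, the target url as ?url=...
    url(r'^qrcode/(?P<width>\d+)/$', 'myapp.views.qrcode'),
    # style and rgb arrive as path components, w/h as query parameters
    url(r'^border/(?P<style>\w+)/(?P<rgb>[0-9a-fA-F]{6})/$', 'myapp.views.border'),
)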
def update_flickr_users(results, page=1, per_page=1, all_photos=False):
    limit = page * per_page
    offset = limit - per_page
    flickr_users = FlickrUser.objects.order_by("date_create")[offset:limit]
    user_updates = []
    for flickr_user in flickr_users:
        nsid_digest = md5(flickr_user.nsid).hexdigest()
        lock_id = "%s-lock-%s" % ("update_photos", nsid_digest)
        # cache.add fails if the key already exists
        acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE)
        if acquire_lock():
            try:
                # First, update the flickr_user
                rsp = flickr.people.getInfo(user_id=flickr_user.nsid,
                                            format="json",
                                            nojsoncallback="true")
                json = simplejson.loads(rsp)
                if json and json["stat"] == "ok":
                    api_user = json["person"]
                    flickr_user.username = api_user["username"]["_content"]
                    flickr_user.iconserver = api_user["iconserver"]
                    flickr_user.iconfarm = api_user["iconfarm"]
                    flickr_user.count_photos = api_user["photos"]["count"]["_content"]
                    try:
                        flickr_user.realname = api_user["realname"]["_content"]
                    except KeyError:
                        flickr_user.realname = None
                    try:
                        flickr_user.path_alias = api_user["path_alias"]
                    except KeyError:
                        flickr_user.path_alias = None
                    flickr_user.save()
            except URLError, e:
                logger.error("Problem talking to Flickr when calling people.getInfo from update_flickr_users (URLError), will try again. Reason: %s" % (e.reason))
                return update_photos_for_flickr_user.retry(countdown=5)
            except FlickrError, e:
                logger.error("Problem talking to Flickr when calling people.getInfo from update_flickr_users (FlickrError), re-scheduling task.\n Error: %s" % (e))
                raise update_photos_for_flickr_user.retry(countdown=5)
            user_updates.append(update_photos_for_flickr_user.subtask(
                (None, flickr_user.nsid, None, all_photos)))
def _caller(*args, **kwargs):
    """Caller."""
    ret_value = None
    have_lock = False
    args_list = u','.join([unicode(arg) for arg in args])
    key = u"{0}-lock-{1}".format(
        run_func.__name__, md5(args_list.encode('utf-8')).hexdigest())
    lock = REDIS_CLIENT.lock(key, timeout=timeout)
    try:
        have_lock = lock.acquire(blocking=False)
        if have_lock:
            ret_value = run_func(*args, **kwargs)
    finally:
        if have_lock:
            lock.release()
    return ret_value
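`_caller` above closes over `run_func`, `REDIS_CLIENT`, and `timeout`; a self-contained sketch of a plausible enclosing decorator factory, with those names assumed from the snippet rather than confirmed by the source:

import functools
from hashlib import md5

import redis

REDIS_CLIENT = redis.Redis()  # assumed connection; point at your own Redis

def single_instance(timeout=60 * 5):
    """Hypothetical decorator factory wrapping the _caller pattern above."""
    def decorator(run_func):
        @functools.wraps(run_func)
        def _caller(*args, **kwargs):
            ret_value = None
            have_lock = False
            args_list = u','.join([unicode(arg) for arg in args])
            key = u"{0}-lock-{1}".format(
                run_func.__name__, md5(args_list.encode('utf-8')).hexdigest())
            lock = REDIS_CLIENT.lock(key, timeout=timeout)
            try:
                # non-blocking acquire: skip the call if another worker holds the lock
                have_lock = lock.acquire(blocking=False)
                if have_lock:
                    ret_value = run_func(*args, **kwargs)
            finally:
                if have_lock:
                    lock.release()
            return ret_value
        return _caller
    return decorator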
def _wrapped_view(*args, **kwargs):
    method = unicode(args) + unicode(kwargs)
    digest = md5(method).hexdigest()
    lock_id = "%s-lock-%s" % (blocking_id, digest)
    acquire_lock = lambda: cache.add(lock_id, 'true', LOCK_EXPIRE)
    release_lock = lambda: cache.delete(lock_id)
    if acquire_lock():
        value = cache.get(lock_id)
        try:
            value = view_func(*args, **kwargs)
        finally:
            release_lock()
        return value
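Likewise, `_wrapped_view` closes over `blocking_id`, `view_func`, and `LOCK_EXPIRE`; a minimal sketch of an enclosing decorator built on Django's cache, with the factory name and TTL assumed rather than taken from the source:

from functools import wraps
from hashlib import md5

from django.core.cache import cache

LOCK_EXPIRE = 60 * 5  # assumed lock TTL (seconds)

def blocking(blocking_id):
    """Hypothetical factory that would produce _wrapped_view above."""
    def decorator(view_func):
        @wraps(view_func)
        def _wrapped_view(*args, **kwargs):
            method = unicode(args) + unicode(kwargs)
            # encode defensively before hashing; the snippet above hashes
            # the unicode string directly
            digest = md5(method.encode('utf-8')).hexdigest()
            lock_id = "%s-lock-%s" % (blocking_id, digest)
            # cache.add is atomic: it fails if the key already exists
            if cache.add(lock_id, 'true', LOCK_EXPIRE):
                try:
                    return view_func(*args, **kwargs)
                finally:
                    cache.delete(lock_id)
        return _wrapped_view
    return decorator

# Usage sketch (names hypothetical):
# @blocking('myapp.tasks')
# def rebuild_thumbnails(photo_id):
#     ...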
def image_proxy(req, headers=None, cache=None, timeout=None, proxy_info=None):
    import httplib2
    import gd
    form = ImageProxyForm(req.GET)
    if not form.is_valid():
        return HttpResponseBadRequest()
    url = form.cleaned_data['url']
    width = form.cleaned_data.get('w')
    height = form.cleaned_data.get('h')
    format = form.cleaned_data.get('f')
    timeout = form.cleaned_data.get('t') or timeout
    referrer = form.cleaned_data.get('r')
    headers = headers or {}
    if referrer:
        headers['Referer'] = referrer
    conn = httplib2.Http(cache, timeout, proxy_info)
    result, content = conn.request(url, headers=headers)
    status = int(result['status'])
    if status not in [200, 304]:
        return HttpResponse(status=status)
    try:
        img = Image.open(StringIO(content))
    except IOError:
        raise Http404
    w, h = img.size
    if width and height:
        if w <= width and h <= height:
            need_resize = False
        else:
            need_resize = True
            w_ratio = width / float(w)
            h_ratio = height / float(h)
            # use the smaller ratio
            ratio = min(w_ratio, h_ratio)
            size = (int(w * ratio), int(h * ratio))
    else:
        # either width or height is undefined
        if height is not None and h > height:
            need_resize = True
            ratio = height / float(h)
            size = (int(w * ratio), int(h * ratio))
        elif width is not None and w > width:
            need_resize = True
            ratio = width / float(w)
            size = (int(w * ratio), int(h * ratio))
        else:
            need_resize = False
    format = img.format.lower()
    if format == 'gif':
        content_type = 'image/gif'
        if need_resize:
            newimage = gd.image(size)
            tmp = StringIO()
            img.save(tmp, 'PNG')
            tmp.seek(0)
            gdimage = gd.image(tmp, 'png')
            # resize
            gdimage.copyResizedTo(newimage, (0, 0), (0, 0), size, img.size)
            # get result
            output = StringIO()
            newimage.writeGif(output)
            # override image binary content
            content = output.getvalue()
    else:
        # force output to be JPEG
        content_type = 'image/jpeg'
        if need_resize or format != 'jpeg':
            # resize
            if need_resize:
                img = img.resize(size)
            # change color mode to RGB if it isn't already
            if img.mode != 'RGB':
                img = img.convert("RGB")
            output = StringIO()
            img.save(output, 'JPEG')
            # override image binary content
            content = output.getvalue()
    CACHE_TIMEOUT = 86400
    res = HttpResponse(content, content_type=content_type)
    res['Content-Length'] = str(len(content))
    res['ETag'] = '"%s"' % md5(content).hexdigest()
    res['Last-Modified'] = http_date()
    res['Expires'] = http_date(time.time() + CACHE_TIMEOUT)
    patch_cache_control(res, max_age=CACHE_TIMEOUT)
    return res
def process_flickr_photo(api_photo, nsid):
    logger.info("Processing photo %s for user %s.\n" % (api_photo['id'], nsid))
    try:
        # Query Flickr for this photo's Exif data
        exif_rsp = flickr.photos.getExif(photo_id=api_photo['id'],
                                         format="json",
                                         nojsoncallback="true")
        json = simplejson.loads(exif_rsp)
        # If it exists, process it
        if json and json['stat'] == 'ok':
            exif_camera = ""
            raw_exif_make = ""
            exif_make = ""
            raw_exif_model = ""
            exif_model = ""
            exif_software = ""
            try:
                exif = json['photo']['exif']
                for tag in exif:
                    if tag['label'] == "Make":
                        raw_exif_make = tag['raw']['_content']
                    if tag['label'] == "Model":
                        raw_exif_model = tag['raw']['_content']
                    if tag['label'] == "Software":
                        exif_software = tag['raw']['_content']
                # This is the "name" that Flickr uses, it's usually nice
                # if exif['photo']['camera']:
                #     exif_camera = exif['photo']['camera']
                # Create a clean version of the raw Exif make
                exif_make = clean_make(raw_exif_make)
                # Create a clean version of the raw Exif model, and remove
                # the make if it's duplicated
                exif_model = clean_model(raw_exif_model, exif_make)
                # If there's a model (camera) we'll carry on
                if exif_model:
                    # Process the date taken and date upload into nice time objects.
                    # Date taken is a time string of the local time when the photo
                    # was taken; we don't know the time zone, so we store it as UTC
                    # and always display it as UTC
                    naive = parse_datetime(api_photo['datetaken'])
                    api_date_taken = pytz.timezone("UTC").localize(naive)
                    # Date upload is a unix timestamp, so we can store it as UTC
                    # and convert to whatever tz we want.
                    api_date_upload = datetime.utcfromtimestamp(
                        float(api_photo['dateupload'])).replace(tzinfo=timezone.utc)
                    # Create the camera slug with things that should never change.
                    # I would use exif_camera, but I'm afraid those might change
                    # on Flickr's side
                    camera_slug = slugify(exif_make + " " + exif_model)
                    # Create a name for the camera
                    if exif_make:
                        camera_name = exif_make + " " + exif_model
                    else:
                        camera_name = exif_model
                    # Try to create the camera, or get it if it exists
                    try:
                        camera, created = Camera.objects.get_or_create(
                            slug=camera_slug,
                            defaults={
                                'name': camera_name,
                                'model': exif_model,
                                'exif_model': raw_exif_model,
                                'exif_make': raw_exif_make,
                                'count': 0,
                                'count_photos': 0,
                            }
                        )
                    except IntegrityError:
                        logger.warning("Camera %s already exists, but we're trying to add it again. Rescheduling task." % (camera_name))
                        raise process_flickr_photo.retry(countdown=5)
                    if created:
                        if exif_make:
                            make_slug = slugify(exif_make)
                            try:
                                make, created = Make.objects.get_or_create(
                                    slug=make_slug,
                                    defaults={
                                        'name': exif_make,
                                        'count': 1,
                                    }
                                )
                            except IntegrityError:
                                logger.warning("Make %s already exists, but we're trying to add it again. Rescheduling task." % (exif_make))
                                raise process_flickr_photo.retry(countdown=5)
                            if not created:
                                Make.objects.filter(slug=make_slug).update(count=F('count') + 1)
                            camera.make = make
                            camera.save()
                    # In case we need to create cache keys
                    id_digest = md5(str(camera.id)).hexdigest()
                    # A little bonus here: if the camera doesn't have aws info,
                    # try to get it.
                    if not camera.amazon_item_response:
                        lock_id = "%s-lock-%s" % ("aws_update", id_digest)
                        acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE)
                        if acquire_lock():
                            logger.info("Fetching aws info for %s." % (camera.name))
                            add_aws_item_to_camera.delay(camera.id)
                        else:
                            logger.info("AWS item update for %s already scheduled, skipping." % (camera.name))
                    photo, created = Photo.objects.get_or_create(
                        photo_id=api_photo['id'],
                        defaults={
                            'secret': api_photo['secret'],
                            'server': api_photo['server'],
                            'farm': api_photo['farm'],
                            'license': api_photo['license'],
                            'media': api_photo['media'],
                            'owner_nsid': api_photo['owner'],
                            'owner_name': api_photo['ownername'],
                            'date_taken': api_date_taken,
                            'date_upload': api_date_upload,
                            'camera': camera,
                        }
                    )
                    if created:
                        photo.title = api_photo['title']
                        photo.path_alias = api_photo['pathalias']
                        photo.date_taken = api_date_taken
                        photo.date_upload = api_date_upload
                        photo.comments_count = api_photo['count_comments']
                        photo.faves_count = api_photo['count_faves']
                        if camera.make:
                            photo.camera_make = camera.make
                        if (api_photo['latitude'] or api_photo['longitude']) and api_photo['geo_is_public']:
                            photo.has_geo = 1
                            photo.latitude = api_photo['latitude']
                            photo.longitude = api_photo['longitude']
                            photo.accuracy = api_photo['accuracy']
                            photo.context = api_photo['context']
                        else:
                            photo.has_geo = 0
                        # Ok, save the photo.
                        logger.info("Saving photo %s for camera %s.\n" % (photo.photo_id, camera.name))
                        photo.save()
                        Camera.objects.filter(slug=camera_slug).update(count_photos=F('count_photos') + 1)
                        return photo.photo_id
                    else:
                        logger.info("We've seen this photo before, moving on.")
                        return False
                # The photo doesn't have camera info
                else:
                    return False
            except KeyError:
                logger.error("KeyError! The photo doesn't have Exif data. (%s)" % (api_photo['id']))
                return False
        else:
            logger.info("We probably don't have permission to see the Exif, carry on. %s" % (api_photo['id']))
            return False
            #raise fetch_photos_for_flickr_user.retry(countdown=5)
    except URLError:
        logger.error("Problem talking to Flickr in process_photo (URLError), re-scheduling task.")
        raise fetch_photos_for_flickr_user.retry(countdown=5)
    except FlickrError, e:
        logger.error("Problem talking to Flickr in process_photo (FlickrError), re-scheduling task.\n Error: %s" % (e))
        raise fetch_photos_for_flickr_user.retry(countdown=5)
def transfer_experiment(source):
    """
    Pull public experiments from source into current mytardis.
    """
    # TODO: Cleanup error messages
    # TODO: does not transfer licences as they are not part of the METS format.
    # NOTE: As this is a pull we trust the data from the other tardis

    # Check identity of the feed
    from oaipmh.client import Client
    from oaipmh import error
    from oaipmh.metadata import MetadataRegistry, oai_dc_reader

    from django.core.cache import cache
    from django.utils.hashcompat import md5_constructor as md5

    # The cache key consists of the task name and the MD5 digest
    # of the feed URL.
    cache_key = md5("token").hexdigest()
    lock_id = "%s-lock-%s" % ("consume_experiment", cache_key)
    LOCK_EXPIRE = 60 * 5
    # cache.add fails if the key already exists
    acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE)
    # memcache delete is very slow, but we have to use it to take
    # advantage of using add() for atomic locking
    release_lock = lambda: cache.delete(lock_id)

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    source_url = "%s/apps/oaipmh/?verb=Identify" % source
    client = Client(source_url, registry)
    try:
        identify = client.identify()
    except AttributeError as e:
        msg = "Error reading repos identity: %s:%s" % (source, e)
        logger.error(msg)
        raise ReposReadError(msg)
    except error.ErrorBase as e:
        msg = "OAIPMH error: %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except URLError as e:
        logger.error(e)
        raise
    repos = identify.baseURL()
    import urlparse
    repos_url = urlparse.urlparse(repos)
    dest_name = "%s://%s" % (repos_url.scheme, repos_url.netloc)
    if dest_name != source:
        msg = "Source directory reports incorrect name: %s" % dest_name
        logger.error(msg)
        raise BadAccessError(msg)

    # Get list of public experiments at the source
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(
        source + "/apps/oaipmh/?verb=ListRecords&metadataPrefix=oai_dc",
        registry)
    try:
        exps_metadata = [
            meta for (header, meta, extra)
            in client.listRecords(metadataPrefix='oai_dc')
        ]
    except AttributeError as e:
        msg = "Error reading experiment %s" % e
        logger.error(msg)
        raise OAIPMHError(msg)
    except error.NoRecordsMatchError as e:
        msg = "no public records found on source %s" % e
        logger.warn(msg)
        return

    local_ids = []
    for exp_metadata in exps_metadata:
        exp_id = exp_metadata.getField('identifier')[0]
        user = exp_metadata.getField('creator')[0]
        found_user = _get_or_create_user(source, user)

        # make sure experiment is publicish
        try:
            xmldata = getURL("%s/apps/reposproducer/expstate/%s/"
                             % (source, exp_id))
        except HTTPError as e:
            msg = "cannot get public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        try:
            exp_state = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse public state of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if exp_state not in [Experiment.PUBLIC_ACCESS_FULL,
                             Experiment.PUBLIC_ACCESS_METADATA]:
            msg = 'cannot ingest private experiment %s.' % exp_id
            logger.error(msg)
            raise BadAccessError(msg)

        # Get the usernames of isOwner django_user ACLs for the experiment
        try:
            xmldata = getURL("%s/apps/reposproducer/acls/%s/"
                             % (source, exp_id))
        except HTTPError as e:
            msg = "Cannot get acl list of experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)
        try:
            acls = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse acl list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        owners = []
        for acl in acls:
            if acl['pluginId'] == 'django_user' and acl['isOwner']:
                user = _get_or_create_user(source, acl['entityId'])
                owners.append(user.username)
            else:
                # FIXME: skips all other types of acl for now
                pass

        # Get the METS for the experiment
        metsxml = ""
        try:
            metsxml = getURL("%s/experiment/metsexport/%s/?force_http_urls"
                             % (source, exp_id))
            #metsxml = getURL("%s/experiment/metsexport/%s/"
            #                 % (source, exp_id))
        except HTTPError as e:
            msg = "cannot get METS for experiment %s" % exp_id
            logger.error(msg)
            raise ReposReadError(msg)

        # load schema and parametername for experiment keys
        try:
            key_schema = Schema.objects.get(namespace=settings.KEY_NAMESPACE)
        except Schema.DoesNotExist as e:
            msg = "No ExperimentKeyService Schema found"
            logger.error(msg)
            raise BadAccessError(msg)
        try:
            key_name = ParameterName.objects.get(name=settings.KEY_NAME)
        except ParameterName.DoesNotExist as e:
            msg = "No ExperimentKeyService ParameterName found"
            logger.error(msg)
            raise BadAccessError(msg)
        try:
            xmldata = getURL("%s/apps/reposproducer/key/%s/"
                             % (source, exp_id))
        except HTTPError as e:
            msg = "cannot get key of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not xmldata:
            logger.warn("Unable to retrieve experiment %s key. "
                        "Will try again later" % exp_id)
            return
        try:
            key_value = json.loads(xmldata)
        except ValueError as e:
            msg = "cannot parse key list of experiment %s" % exp_id
            logger.error(msg)
            raise BadAccessError(msg)
        if not key_value:
            logger.warn("Unable to retrieve experiment %s key value. "
                        "Will try again later" % exp_id)
            return
        logger.debug("retrieved key %s from experiment %s"
                     % (key_value, exp_id))

        exps = Experiment.objects.all()
        got_lock = True
        if not acquire_lock():
            logger.warning("another worker has access to consume experiment")
            return
        duplicate_exp = 0
        for exp in exps:
            #logger.warn("exp = %s" % exp.id)
            params = ExperimentParameter.objects.filter(
                name=key_name,
                parameterset__schema=key_schema,
                parameterset__experiment=exp)
            #logger.warn("params.count() = %s" % params.count())
            if params.count() >= 1:
                key = params[0].string_value
                if key == key_value:
                    duplicate_exp = exp.id
                    #logger.warn("found duplicate for %s" % duplicate_exp)
                    break
        if duplicate_exp:
            logger.warn("Found duplicate experiment from %s exp %s to exp %s"
                        % (source, exp_id, duplicate_exp))
            if got_lock:
                release_lock()
            return

        # TODO: Need some way of updating an existing experiment. Problem is
        # that the copy will have a different id from the original, so we need
        # a unique identifier to allow matching.

        # We have now pulled everything we need from the producer and are
        # ready to create the experiment.

        # Make placeholder experiment and ready metadata
        e = Experiment(
            title='Placeholder Title',
            approved=True,
            created_by=found_user,
            public_access=exp_state,
            locked=False  # so experiment can then be altered.
        )
        e.save()
        # store the key
        #eps, was_created = ExperimentParameterSet.objects.\
        #    get_or_create(experiment=e, schema=key_schema)
        #if was_created:
        #    logger.warn("was created")
        #ep, was_created = ExperimentParameter.objects.get_or_create(
        #    parameterset=eps,
        #    name=key_name,
        #    string_value=key_value)
        #if was_created:
        #    logger.warn("was created again")
        #ep.save()
        if got_lock:
            release_lock()

        local_id = e.id
        filename = path.join(e.get_or_create_directory(), 'mets_upload.xml')
        f = open(filename, 'wb+')
        f.write(metsxml)
        f.close()

        # Ingest this experiment META data and isOwner ACLS
        eid = None
        try:
            eid, sync_path = _registerExperimentDocument(filename=filename,
                                                         created_by=found_user,
                                                         expid=local_id,
                                                         owners=owners)
            logger.info('=== processing experiment %s: DONE' % local_id)
        except:
            # FIXME: what errors can mets return?
            msg = '=== processing experiment %s: FAILED!' % local_id
            logger.error(msg)
            raise MetsParseError(msg)
        # FIXME: if the METS parse fails then we should go back and delete
        # the placeholder experiment

        exp = Experiment.objects.get(id=eid)
        # so that tardis does not copy the data
        for datafile in exp.get_datafiles():
            datafile.stay_remote = True
            datafile.save()
        #import nose.tools
        #nose.tools.set_trace()
        # FIXME: reverse lookup of URLs seems quite slow.
        # TODO: put this information into a specific metadata schema
        # attached to the experiment
        exp.description += get_audit_message(source, exp_id)
        exp.save()
        local_ids.append(local_id)
    return local_ids
def fetch_photos_for_flickr_user(results, nsid, page=None):
    from flickr.tasks import update_flickr_user_camera
    from flickr.tasks import flickr_user_fetch_photos_complete

    nsid_digest = md5(nsid).hexdigest()
    lock_id = "%s-lock-%s" % ("fetch_photos", nsid_digest)
    # cache.add fails if the key already exists
    acquire_lock = lambda: cache.add(lock_id, "true", LOCK_EXPIRE)
    if page or acquire_lock():
        flickr_user = FlickrUser.objects.get(nsid=nsid)
        if flickr_user.count_photos == 0:
            return flickr_user_fetch_photos_complete.delay(None, flickr_user.nsid)
        per_page = 100
        if not page:
            page = math.ceil(float(flickr_user.count_photos) / float(per_page))
        logger.info("Fetching page %s for %s" % (page, flickr_user.username))
        try:
            # Fetch a page of photos
            photos_rsp = flickr.people.getPublicPhotos(
                user_id=flickr_user.nsid,
                per_page=per_page,
                page=page,
                extras="date_taken,date_upload,license,owner_name,media,path_alias,count_comments,count_faves,geo",
                format="json",
                nojsoncallback="true",
            )
            json = simplejson.loads(photos_rsp)
            if json and json['stat'] == 'ok':
                pages = json['photos']['pages']
                photo_updates = []
                for photo in json['photos']['photo']:
                    if (not flickr_user.date_last_photo_update
                            or int(photo['dateupload']) >= int(flickr_user.date_last_photo_update)):
                        photo_updates.append(process_flickr_photo.subtask(
                            (photo, flickr_user.nsid),
                            link=update_flickr_user_camera.subtask((flickr_user.nsid, ))))
                        photo_update_date = photo['dateupload']
                if page == 1:
                    logger.info("This is the last page (%s) for %s!" % (pages, flickr_user.username))
                    if photo_updates:
                        return chord(photo_updates)(flickr_user_fetch_photos_complete.subtask((flickr_user.nsid, )))
                    else:
                        return flickr_user_fetch_photos_complete.delay(None, flickr_user.nsid)
                else:
                    logger.info("Firing tasks for page %s of %s for %s" % (page, pages, flickr_user.username))
                    next_page = page - 1
                    pct = 100 - ((float(page) / float(pages)) * 100)
                    logger.info("pct should be: %s/%s * 100 = %s" % (page, pages, pct))
                    logger.info("Push it.")
                    values = {
                        'secret': settings.PUSHY_SECRET,
                        'user_id': flickr_user.nsid,
                        'message': simplejson.dumps({
                            'type': 'fetch_photos.update_progress_bar',
                            'data': {'pct': pct},
                        }),
                    }
                    data = urllib.urlencode(values)
                    req = urllib2.Request(settings.PUSHY_URL_LOCAL, data)
                    try:
                        response = urllib2.urlopen(req)
                    except:
                        logger.error("Problem calling pushy from photos fetch.")
                    if photo_updates:
                        flickr_user.date_last_photo_update = photo_update_date
                        flickr_user.save()
                        return chord(photo_updates)(fetch_photos_for_flickr_user.subtask((flickr_user.nsid, next_page, )))
                    else:
                        return fetch_photos_for_flickr_user.delay(None, flickr_user.nsid, next_page)
            else:
                logger.error("Flickr api query did not respond OK calling getPublicPhotos for %s in fetch_photos, will try again." % (flickr_user.nsid))
                return fetch_photos_for_flickr_user.retry(countdown=5)
        except URLError, e:
            logger.error("Problem talking to Flickr when calling getPublicPhotos for %s in fetch_photos (URLError), will try again. Reason: %s" % (flickr_user.nsid, e.reason))
            return fetch_photos_for_flickr_user.retry(countdown=5)
        except FlickrError, e:
            logger.error("Problem talking to Flickr when calling getPublicPhotos for %s in fetch_photos (FlickrError), re-scheduling task.\n Error: %s" % (flickr_user.nsid, e))
            raise fetch_photos_for_flickr_user.retry(countdown=5)
def get_chksum(title, url, pub_date):
    return md5("%s%s%s" % (title, url, pub_date)).hexdigest()
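A quick usage sketch; the feed values below are illustrative only:

chksum = get_chksum("Hello world", "http://example.com/post/1", "2009-01-01")
print chksum  # 32-character hex digest; identical inputs always hash the same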