def post(self):
    """Extract events (downloads or searches) for a usage-stats period.

    The period comes either directly from the POST request or, when this
    handler was enqueued by a previous task, from the singleton StatsRun
    entity. The remaining parameters (table_name, downloads_extracted,
    searches_extracted) are read from the matching source. The handler then
    extracts and parses the pending event type ("download" first, then
    "search"), updates Period counts, stores one ReportToProcess entity per
    resource, marks the event type as extracted on the Period entity, and
    re-queues either itself (other event type pending) or the
    process-events task (both done).

    Writes a JSON response; on error it logs, sets an HTTP error status
    where appropriate and returns early.
    """
    # Track whether parameters came from a direct request (True) or were
    # recovered from stored entities (False).
    self.params_from_request = None
    params = None

    s = "Version: %s\n" % __version__
    s += "Arguments from POST:"
    for arg in self.request.arguments():
        s += '\n%s:%s' % (arg, self.request.get(arg))
    logging.info(s)

    # Try to get period from the request in case GetEvents was called
    # directly.
    self.period = self.request.get("period", None)

    # If real period not in request, try to get parameters from StatsRun
    # entity in case GetEvents was called from a previous task.
    if self.period is None or len(self.period) == 0:
        run_key = ndb.Key("StatsRun", 5759180434571264)
        run_entity = run_key.get()
        self.period = run_entity.period
        self.params_from_request = False
        s = "Version: %s\n" % __version__
        s += "Period %s determined from StatsRun entity: %s" % (self.period,
                                                                params)
        logging.info(s)
    else:
        self.params_from_request = True
        s = "Version: %s\n" % __version__
        s += "Period %s determined from request: %s" % (self.period,
                                                        self.request)
        logging.info(s)

    # Still no period anywhere: bail out with a 400.
    if self.period is None or len(self.period) == 0:
        self.error(400)
        resp = {
            "status": "error",
            "message": "Period parameter was not provided."
        }
        s = "Version: %s\n" % __version__
        s += "%s" % resp
        logging.error(s)
        self.response.write(json.dumps(resp) + "\n")
        return

    # If Period not already stored, halt.
    period_key = ndb.Key("Period", self.period)
    period_entity = period_key.get()
    if not period_entity:
        self.error(400)
        resp = {
            "status": "error",
            "message": "Provided period does not exist in datastore",
            "data": {"period": self.period}
        }
        logging.error(resp)
        self.response.write(json.dumps(resp) + "\n")
        return

    # Get the remaining parameters based on the parameter source.
    if self.params_from_request is True:
        # 'table_name' parameter, falling back to the default table.
        try:
            self.table_name = self.request.get('table_name')
            if self.table_name is None or len(self.table_name) == 0:
                self.table_name = CDB_TABLE
        except KeyError:
            # Table name not provided, use default.
            self.table_name = CDB_TABLE

        # 'downloads_extracted' parameter.
        try:
            self.downloads_extracted = \
                self.request.get('downloads_extracted').lower() == 'true'
        except Exception:
            s = "Version: %s\n" % __version__
            s += "Aborting. "
            # BUG FIX: the original interpolated the undefined name
            # 'request' here, raising NameError instead of logging.
            s += "Unable to extract 'downloads_extracted' " \
                 "from request: %s" % self.request
            logging.error(s)
            return

        # 'searches_extracted' parameter. Catch Exception (not just
        # KeyError) for consistency with 'downloads_extracted' above: a
        # missing value makes .lower() raise AttributeError, not KeyError.
        try:
            self.searches_extracted = \
                self.request.get('searches_extracted').lower() == 'true'
        except Exception:
            s = "Version: %s\n" % __version__
            s += "Aborting. "
            # BUG FIX: same undefined 'request' name as above.
            s += "Unable to extract 'searches_extracted' " \
                 "from request: %s" % self.request
            logging.error(s)
            return
    else:
        # Get parameters from the Period entity.
        # 'table_name' parameter.
        try:
            self.table_name = period_entity.table_name
            if self.table_name is None or len(self.table_name) == 0:
                self.table_name = CDB_TABLE
        except KeyError:
            # Default value for 'table_name' if not provided.
            self.table_name = CDB_TABLE

        # 'downloads_extracted' parameter.
        try:
            self.downloads_extracted = period_entity.downloads_extracted
        except Exception:
            s = "Version: %s\n" % __version__
            s += "Aborting. "
            s += "Unable to extract 'downloads_extracted' from Period"
            logging.error(s)
            return

        # 'searches_extracted' parameter.
        try:
            self.searches_extracted = period_entity.searches_extracted
        except Exception:
            s = "Version: %s\n" % __version__
            s += "Aborting. "
            s += "Unable to extract 'searches_extracted' from Period"
            logging.error(s)
            return

    s = "Version: %s\n" % __version__
    s += "Using %s as data table" % self.table_name
    logging.info(s)

    # Start with downloads ...
    if self.downloads_extracted == False:
        self.t = "download"
    # ... and continue with searches.
    elif self.searches_extracted == False:
        self.t = "search"
    # If both are True, downloads and searches were both extracted...
    else:
        # ... call 'process_events' and move on.
        taskqueue.add(url=URI_PROCESS_EVENTS, queue_name=QUEUENAME)
        return

    # Get events.
    s = "Version: %s\n" % __version__
    s += "Getting events"
    logging.info(s)
    err = self.get_events()
    if err:
        s = "Version: %s\n" % __version__
        s += "Error from get_events(): %s" % err
        logging.error(s)
        return

    # Parse events.
    s = "Version: %s\n" % __version__
    s += "Parsing events"
    logging.info(s)
    err = self.parse_events()
    if err:
        s = "Version: %s\n" % __version__
        s += "Error from parse_events(): %s" % err
        logging.error(s)
        return

    # Update Period counts.
    s = "Version: %s\n" % __version__
    s += "Updating Period counts"
    logging.info(s)
    err = self.update_period_counts()
    if err:
        s = "Version: %s\n" % __version__
        s += "Error from update_period_counts(): %s" % err
        logging.error(s)
        return

    # Build one ReportToProcess entity per resource.
    r = []
    for resource in self.resources:
        params = {
            "t": self.t,
            "gbifdatasetid": resource,
            "resource": self.resources[resource]
        }
        r.append(ReportToProcess(**params))

    # Store temporary entities.
    s = "Version: %s\n" % __version__
    s += "Putting %d entities" % len(r)
    logging.info(s)
    sr = ndb.put_multi(r)

    # Check that every entity was stored.
    if len(sr) != len(r):
        s = "Version: %s\n" % __version__
        s += "Not all resources were put to process."
        logging.error(s)
        self.error(500)
        resp = {
            "status": "error",
            "message": s,
            "data": {
                "period": self.period,
                "t": self.t,
                "resources": len(r),
                "to_process": len(sr)
            }
        }
        self.response.write(json.dumps(resp) + "\n")
        return

    # Build success response.
    resp = {
        "status": "success",
        "message": "All %s events downloaded and parsed" % self.t,
        "data": {
            "period": self.period,
            "event_type": self.t,
            "event_number": len(self.data),
            "resources_to_process": len(self.resources)
        }
    }
    self.response.write(json.dumps(resp) + "\n")

    # Update Period entity with extraction-state information.
    if self.t == "search":
        period_entity.searches_extracted = True
    else:
        period_entity.downloads_extracted = True
    k = period_entity.put()
    if k != period_key:
        s = "Version: %s\n" % __version__
        s += "Could not update processing properties in period %s" % \
            self.period
        logging.error(s)
        self.error(500)
        resp = {
            "status": "error",
            "message": s,
            "data": {
                "period": self.period,
            }
        }
        self.response.write(json.dumps(resp) + "\n")
        # Consistency fix: the original returned 1 here (a value webapp2
        # ignores); every other exit path in this handler returns None.
        return

    # If both downloads and searches have been extracted, hand over to
    # 'process_events'; otherwise re-queue this handler for the other type.
    period_entity = period_key.get()
    if period_entity.searches_extracted is True and \
            period_entity.downloads_extracted is True:
        s = "Version: %s\n" % __version__
        s += "All searches and downloads extracted"
        logging.info(s)
        taskqueue.add(url=URI_PROCESS_EVENTS, queue_name=QUEUENAME)
    else:
        taskqueue.add(url=URI_GET_EVENTS, queue_name=QUEUENAME)
    return
def post(self):
    """Process all pending ReportToProcess entities in pages.

    Run parameters (period, github_store, github_issue) are read from
    memcache where a previous task stored them (key prefix
    'usagestats_parser_'). Reports are fetched in pages of PAGE_SIZE via a
    datastore cursor; when the request deadline is exceeded, the task
    re-queues itself with the last good cursor so processing resumes where
    it stopped. On completion, processed counts are stored on the Period
    entity and an optional GitHub store/issue task (or a completion mail)
    is triggered.
    """
    # Retrieve parameters from memcache.
    memcache_keys = ["period", "github_store", "github_issue"]
    params = memcache.get_multi(memcache_keys,
                                key_prefix="usagestats_parser_")
    self.period = params['period']
    self.github_store = params['github_store']
    self.github_issue = params['github_issue']

    # BUG FIX: bind 'cursor' before entering the try block so the
    # DeadlineExceededError handler can always reference it, even if the
    # deadline hits before the first assignment inside the loop.
    cursor = None

    # Start the loop, until deadline.
    try:
        # Prepare query for all Reports to process.
        query = ReportToProcess.query()
        query = query.order(ReportToProcess.gbifdatasetid)
        logging.info("ReportToProcess queried")

        # Get cursor from request, if any.
        cursor_str = self.request.get('cursor', None)
        if cursor_str:
            cursor = Cursor(urlsafe=cursor_str)
            logging.info("Cursor built: %s" % cursor)

        # Initialize loop.
        more = True

        # Repeat while there are reports to process.
        while more is True:
            # Get the next (or first) round of elements.
            logging.info("Fetching %d entities" % PAGE_SIZE)
            results, new_cursor, more = query.fetch_page(
                PAGE_SIZE,
                start_cursor=cursor
            )
            logging.info("Got %d results" % len(results))

            # Process and store transactionally.
            self.process_and_store(results)

            # Restart with new cursor (if any).
            if more is True:
                cursor = new_cursor
                logging.info("New cursor: %s" % cursor.urlsafe())
        logging.info("Finished processing reports")

        # Store memcache'd counts on the Period entity.
        counts = memcache.get_multi([
            "processed_searches",
            "processed_downloads"
        ], key_prefix="usagestats_parser_")
        period_entity = ndb.Key("Period", self.period).get()
        period_entity.processed_searches = counts['processed_searches']
        period_entity.processed_downloads = counts['processed_downloads']

        resp = {
            "status": "success",
            "message": "Successfully finished processing all reports",
            "data": {
                "processed_searches": counts['processed_searches'],
                "processed_downloads": counts['processed_downloads']
            }
        }

        # Launch process to store reports on GitHub, if applicable.
        if self.github_store is True:
            resp['message'] += ". Launching GitHub storing process"
            taskqueue.add(url=URI_GITHUB_STORE, queue_name=QUEUENAME)
        # Launch process to create issues on GitHub, if applicable.
        elif self.github_issue is True:
            resp['message'] += ". Launching GitHub issue process"
            taskqueue.add(url=URI_GITHUB_ISSUE, queue_name=QUEUENAME)
        # Otherwise, consider the run finished and notify by mail.
        else:
            resp['message'] += ". No GitHub process launched"
            period_entity.status = "done"
            mail.send_mail(
                sender=EMAIL_SENDER,
                to=EMAIL_RECIPIENT,
                subject="Usage reports for period %s" % self.period,
                body="""
Hey there!

Just a brief note to let you know the extraction of %s stats
has successfully finished, with no GitHub processes launched.

Congrats!
""" % self.period)

        # In any case, store the counts, show message and finish.
        period_entity.put()
        logging.info(resp)
        self.response.write(json.dumps(resp) + "\n")
        return
    # When timeout arrives...
    except DeadlineExceededError:
        # BUG FIX: 'cursor' may still be None when the deadline hit before
        # the first page completed; guard before calling urlsafe(). An
        # empty cursor string makes the relaunched task start from the top.
        safe_cursor = cursor.urlsafe() if cursor is not None else ""
        # Launch new instance with current (failed) cursor.
        taskqueue.add(url=URI_PROCESS_EVENTS,
                      params={"cursor": safe_cursor},
                      queue_name=QUEUENAME)
        logging.info("Caught a DeadlineExceededError. Relaunching")
        resp = {
            "status": "in progress",
            "message": "Caught a DeadlineExceededError."
                       " Relaunching with new cursor",
            "data": {
                "period": self.period,
                "cursor": safe_cursor
            }
        }
        logging.info(resp)
        self.response.write(json.dumps(resp) + "\n")
        return
def initialize_extraction(self, period=None, force=None):
    """Check if Period parameter is valid, if the Period entity already
    exists and create a new Period.

    Reads self.period and self.force (the 'period'/'force' arguments are
    not used by the current implementation — presumably kept for interface
    compatibility; TODO confirm against callers). Returns 0 on success and
    1 on any validation or datastore error, after writing a JSON error
    response.
    """
    self.response.headers['Content-Type'] = "application/json"

    # Check that 'period' is provided.
    if not self.period:
        s = "Version: %s\n" % __version__
        s += "Period not found on POST body. Aborting."
        logging.error(s)
        self.error(400)
        resp = {"status": "error", "message": s}
        self.response.write(json.dumps(resp) + "\n")
        return 1

    # Check that 'period' is valid.
    if len(self.period) != 6:
        s = "Version: %s\n" % __version__
        s += "Malformed period. Should be YYYYMM (e.g., 201603)"
        logging.error(s)
        self.error(400)
        resp = {"status": "error", "message": s}
        self.response.write(json.dumps(resp) + "\n")
        return 1

    # Get existing period.
    period_key = ndb.Key("Period", self.period)
    period_entity = period_key.get()

    # If existing, abort or clear and start from scratch.
    if period_entity:
        if self.force is not True:
            s = "Version: %s\n" % __version__
            s += "Period %s already exists. " % self.period
            s += "Aborting. To override, use 'force=true'."
            logging.error(s)
            # BUG FIX: set the HTTP error status; the original returned
            # the error body with a 200 status on this path, unlike the
            # other validation failures above.
            self.error(400)
            resp = {"status": "error", "message": s}
            self.response.write(json.dumps(resp) + "\n")
            return 1
        else:
            s = "Version: %s\n" % __version__
            s += "Period %s already exists. " % self.period
            s += "Overriding."
            logging.warning(s)

            # Delete Reports referencing the period.
            r = Report.query().filter(Report.reported_period == period_key)
            to_delete = r.fetch(keys_only=True)
            s = "Version: %s\n" % __version__
            s += "Deleting %d Report entities" % len(to_delete)
            logging.info(s)
            deleted = ndb.delete_multi(to_delete)
            s = "Version: %s\n" % __version__
            s += "%d Report entities removed" % len(deleted)
            logging.info(s)

            # Delete Period itself.
            s = "Version: %s\n" % __version__
            s += "Deleting Period %s" % period_key
            logging.info(s)
            period_key.delete()
            s = "Version: %s\n" % __version__
            s += "Period %s deleted" % period_key
            logging.info(s)

    # Create new Period (id=YYYYMM).
    s = "Version: %s\n" % __version__
    s += "Creating new Period %s" % self.period
    logging.info(s)
    y, m = (int(self.period[:4]), int(self.period[-2:]))
    p = Period(id=self.period)
    p.year = y
    p.month = m
    p.status = 'in progress'
    period_key = p.put()

    # Check that creation succeeded.
    if period_key:
        s = "Version: %s\n" % __version__
        # BUG FIX: added the missing separator space; the original logged
        # "...created successfullywith key...".
        s += "New Period %s created successfully " % self.period
        s += "with key %s" % period_key
        logging.info(s)
    else:
        self.error(500)
        s = "Version: %s\n" % __version__
        s += "Could not create new Period %s" % self.period
        logging.error(s)
        resp = {"status": "error", "message": s}
        self.response.write(json.dumps(resp) + "\n")
        return 1

    # Clear temporary entities.
    keys_to_delete = ReportToProcess.query().fetch(keys_only=True)
    s = "Version: %s\n" % __version__
    s += "Deleting %d temporal (internal use only) entities" % len(
        keys_to_delete)
    logging.info(s)
    ndb.delete_multi(keys_to_delete)

    return 0
def post(self):
    """Process pending ReportToProcess entities for a period.

    The period is taken from the request or, failing that, from the
    singleton StatsRun entity; GitHub-related flags come from the Period
    entity. Reports are fetched in pages of PAGE_SIZE via a datastore
    cursor; on DeadlineExceededError the task re-queues itself with the
    last good cursor so processing resumes where it stopped. On completion
    an optional GitHub store/issue task (or a completion mail) is
    triggered and the Period entity is updated.
    """
    s = "Version: %s\n" % __version__
    s += "Arguments from POST:"
    for arg in self.request.arguments():
        s += '\n%s:%s' % (arg, self.request.get(arg))
    logging.info(s)

    # BUG FIX: make sure self.period is always bound before the checks
    # below; if the request lookup raises, the original left it unset and
    # the 'self.period is None' test itself could blow up.
    self.period = None

    # Try to get period from the request in case the handler was called
    # directly.
    try:
        self.period = self.request.get("period").lower()
        s = "Version: %s\n" % __version__
        s += "Period %s determined from request: %s" % (self.period,
                                                        self.request)
        logging.info(s)
    except Exception:
        # Best-effort: fall through to the StatsRun lookup below.
        pass

    # If real period not in request, try to get parameters from StatsRun
    # entity in case the handler was called from a previous task.
    if self.period is None or len(self.period) == 0:
        run_key = ndb.Key("StatsRun", 5759180434571264)
        run_entity = run_key.get()
        self.period = run_entity.period

    # Still no period anywhere: bail out with a 400.
    if self.period is None or len(self.period) == 0:
        self.error(400)
        resp = {
            "status": "error",
            "message": "Period parameter was not provided."
        }
        s = "Version: %s\n" % __version__
        s += "%s" % resp
        logging.error(s)
        self.response.write(json.dumps(resp) + "\n")
        return

    # If Period not already stored, halt.
    period_key = ndb.Key("Period", self.period)
    period_entity = period_key.get()
    if not period_entity:
        self.error(400)
        resp = {
            "status": "error",
            "message": "Provided period does not exist in datastore",
            "data": {"period": self.period}
        }
        logging.error(resp)
        self.response.write(json.dumps(resp) + "\n")
        return

    self.github_store = period_entity.github_store
    self.github_issue = period_entity.github_issue

    # BUG FIX: bind 'cursor' before entering the try block so the
    # DeadlineExceededError handler can always reference it.
    cursor = None

    # Start the loop, until deadline.
    try:
        # Prepare query for all Reports to process.
        query = ReportToProcess.query()
        query = query.order(ReportToProcess.gbifdatasetid)
        s = "Version: %s\n" % __version__
        s += "ReportToProcess queried"
        logging.info(s)

        # Get cursor from request, if any.
        cursor_str = self.request.get('cursor', None)
        if cursor_str:
            cursor = Cursor(urlsafe=cursor_str)
            s = "Version: %s\n" % __version__
            s += "Cursor built: %s" % cursor
            logging.info(s)

        # Initialize loop.
        more = True

        # Repeat while there are reports to process.
        while more is True:
            # Get the next (or first) round of elements.
            logging.info("Fetching %d entities" % PAGE_SIZE)
            results, new_cursor, more = query.fetch_page(
                PAGE_SIZE, start_cursor=cursor)
            s = "Version: %s\n" % __version__
            s += "Got %d results" % len(results)
            logging.info(s)

            # Process and store transactionally.
            self.process_and_store(results)

            # Restart with new cursor (if any).
            if more is True:
                cursor = new_cursor
                s = "Version: %s\n" % __version__
                s += "New cursor: %s" % cursor.urlsafe()
                logging.info(s)

        s = "Version: %s\n" % __version__
        s += "Finished processing reports"
        logging.info(s)

        # Re-fetch the Period entity for up-to-date processed counts.
        period_entity = ndb.Key("Period", self.period).get()
        resp = {
            "status": "success",
            "message": "Successfully finished processing all reports",
            "data": {
                "processed_searches": period_entity.processed_searches,
                "processed_downloads": period_entity.processed_downloads
            }
        }

        # Launch process to store reports on GitHub, if applicable.
        if self.github_store is True:
            resp['message'] += ". Launching GitHub storing process"
            taskqueue.add(url=URI_GITHUB_STORE, queue_name=QUEUENAME)
        # Launch process to create issues on GitHub, if applicable.
        elif self.github_issue is True:
            resp['message'] += ". Launching GitHub issue process"
            taskqueue.add(url=URI_GITHUB_ISSUE, queue_name=QUEUENAME)
        # Otherwise, consider the run finished and notify by mail.
        else:
            resp['message'] += ". No GitHub process launched"
            period_entity.status = "done"
            mail.send_mail(sender=EMAIL_SENDER,
                           to=EMAIL_RECIPIENT,
                           subject="Usage reports for period %s" %
                                   self.period,
                           body="""
Hey there!

Just a brief note to let you know the extraction of %s stats
has successfully finished, with no GitHub processes launched.

Congrats!
""" % self.period)

        # In any case, store the status, show message and finish.
        period_entity.put()
        logging.info(resp)
        self.response.write(json.dumps(resp) + "\n")
        return
    # When timeout arrives...
    except DeadlineExceededError:
        # BUG FIX: 'cursor' may still be None when the deadline hit before
        # the first page completed; guard before calling urlsafe(). An
        # empty cursor string makes the relaunched task start from the top.
        safe_cursor = cursor.urlsafe() if cursor is not None else ""
        # Launch new instance with current (failed) cursor.
        taskqueue.add(url=URI_PROCESS_EVENTS,
                      params={"cursor": safe_cursor},
                      queue_name=QUEUENAME)
        s = "Version: %s\n" % __version__
        s += "Caught a DeadlineExceededError. Relaunching"
        logging.warning(s)
        resp = {
            "status": "in progress",
            "message": "Caught a DeadlineExceededError."
                       " Relaunching with new cursor",
            "data": {
                "period": self.period,
                "cursor": safe_cursor
            }
        }
        logging.info(resp)
        self.response.write(json.dumps(resp) + "\n")
        return
def initialize_extraction(self, period=None, force=None):
    """Check if Period parameter is valid, if the Period entity already
    exists and create a new Period.

    Reads self.period and self.force (the 'period'/'force' arguments are
    not used by the current implementation — presumably kept for interface
    compatibility; TODO confirm against callers). Returns 0 on success and
    1 on any validation or datastore error, after writing a JSON error
    response.
    """
    self.response.headers['Content-Type'] = "application/json"

    # Check that 'period' is provided.
    if not self.period:
        logging.error("Period not found on POST body. Aborting.")
        self.error(400)
        resp = {
            "status": "error",
            "message": "Period not found on POST body. " +
                       "Aborting."
        }
        self.response.write(json.dumps(resp) + "\n")
        return 1

    # Check that 'period' is valid.
    if len(self.period) != 6:
        # Consistency fix: log this failure like every other error path
        # in this method does.
        logging.error("Malformed period. Should be YYYYMM (e.g., 201603)")
        self.error(400)
        resp = {
            "status": "error",
            "message": "Malformed period. Should be YYYYMM (e.g., 201603)"
        }
        self.response.write(json.dumps(resp) + "\n")
        return 1

    # Get existing period.
    period_key = ndb.Key("Period", self.period)
    period_entity = period_key.get()

    # If existing, abort or clear and start from scratch.
    if period_entity:
        if self.force is not True:
            logging.error("Period %s already exists. " % self.period +
                          "Aborting. To override, use 'force=true'.")
            # BUG FIX: set the HTTP error status; the original returned
            # the error body with a 200 status on this path, unlike the
            # other validation failures above.
            self.error(400)
            resp = {
                "status": "error",
                "message": "Period %s already exists. " % self.period +
                           "Aborting. To override, use 'force=true'."
            }
            self.response.write(json.dumps(resp) + "\n")
            return 1
        else:
            logging.warning("Period %s already exists. " % self.period +
                            "Overriding.")

            # Delete Reports referencing the period.
            r = Report.query().filter(Report.reported_period == period_key)
            to_delete = r.fetch(keys_only=True)
            logging.info("Deleting %d Report entities" % len(to_delete))
            deleted = ndb.delete_multi(to_delete)
            logging.info("%d Report entities removed" % len(deleted))

            # Delete Period itself.
            logging.info("Deleting Period %s" % period_key)
            period_key.delete()
            logging.info("Period entity deleted")

    # Create new Period (id=YYYYMM).
    logging.info("Creating new Period %s" % self.period)
    y, m = (int(self.period[:4]), int(self.period[-2:]))
    p = Period(id=self.period)
    p.year = y
    p.month = m
    p.status = 'in progress'
    period_key = p.put()

    # Check that creation succeeded.
    if period_key:
        logging.info("New Period %s created successfully." % self.period)
        logging.info("New period's key = %s" % period_key)
    else:
        self.error(500)
        logging.error("Could not create new Period %s" % self.period)
        resp = {
            "status": "error",
            "message": "Could not create new Period %s" % self.period
        }
        self.response.write(json.dumps(resp) + "\n")
        return 1

    # Clear temporary entities.
    keys_to_delete = ReportToProcess.query().fetch(keys_only=True)
    logging.info("Deleting %d temporal (internal use only) entities" %
                 len(keys_to_delete))
    ndb.delete_multi(keys_to_delete)

    return 0