def convert_datetime_string_to_RFC3339(datetime_string, field_name, timestamp_formats): """ Convert datetime_string to an RFC-3339 datetime string. datetime_string The datetime string that was stored in the file field_name The name of the field to be set. Only used for logging timestamp_formats A list of format strings to try to apply to the datetime_string Returns the corresponding RFC-3339 string if datetime_string can be parsed, using the first matching format. Returns None if the string cannot be parsed using any of the supplied formats. """ fn_name = "convert_datetime_string_to_RFC3339: " try: if not datetime_string: # Nothing to convert return None if datetime_string in constants.ZERO_DATETIME_STRINGS: # There are significant number of instances where 'completed' returned from the server has a zero-date, # value of '0000-01-01T00:00:00.000Z' which is stored by GTB in the CSV or GTBak files as # '0000-01-01 00:00:00' or '0000-01-01'. # Zero dates cannot be parsed or converted to a datetime object, so we can't use strftime() to format, # so we check for zero-date and return the corresponding RFC-3339 string. 
# CAUTION: Attempting to create a task with a zero 'due' value results in strange behaviour; # insert() returns a task object with an 'id', however attemping to get() that 'id' returns # "404 Not found" return constants.ZERO_RFC3339_DATETIME_STRING t = None for timestamp_format in timestamp_formats: try: # Try creating a datetime object from a string using the supplied timestamp_format t = datetime.datetime.strptime(datetime_string, timestamp_format) try: # Successfuly parsed the datetime string, so return the corresponding RFC-3339 format rfc_3339_str = t.strftime("%Y-%m-%dT%H:%M:%S.000Z") return rfc_3339_str except Exception, e: try: logging.info(fn_name + constants.INVALID_FORMAT_LOG_LABEL + "Unable to convert '" + str(field_name) + "' value '" + str(datetime_string) + "' to an RFC-3339 datetime string: " + shared.get_exception_msg(e)) except Exception, e: logging.info(fn_name + constants.INVALID_FORMAT_LOG_LABEL + "Unable to convert '" + str(field_name) + "' value to an RFC-3339 datetime string, and unable to log value: " + shared.get_exception_msg(e)) except Exception, e: pass # Try the next format
def file_has_valid_encoding(file): """ Checks that file doesn't have a BOM Arguments: file A file object that refers to the CSV file to be processed by unicodcsv Returns a tuple; msg 'OK' if the file encoding is supported (either UTF-8 without BOM, or ASCII) Returns a string describing the problem if the header is not valid. """ try: file.seek(0) header_row = file.readline() # From the chardet.universaldetector package, to detect files with a BOM BOMs = ( # EF BB BF UTF-8 with BOM ([0xEF, 0xBB, 0xBF], "UTF-8 with BOM"), # FF FE 00 00 UTF-32, little-endian BOM ([0xFF, 0xFE, 0x00, 0x00], "UTF-32LE"), # 00 00 FE FF UTF-32, big-endian BOM ([0x00, 0x00, 0xFE, 0xFF], "UTF-32BE"), # FE FF 00 00 UCS-4, unusual octet order BOM (3412) ([0xFE, 0xFF, 0x00, 0x00], "X-ISO-10646-UCS-4-3412"), # 00 00 FF FE UCS-4, unusual octet order BOM (2143) ([0x00, 0x00, 0xFF, 0xFE], "X-ISO-10646-UCS-4-2143"), # FF FE UTF-16, little endian BOM ([0xFF, 0xFE], "UTF-16LE"), # FE FF UTF-16, big endian BOM ([0xFE, 0xFF], "UTF-16BE"), ) header_row_len = len(header_row) # If the data starts with BOM, we know it is UTF # Testing character by character because comparing strings results in # "Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal" for chunk, result in BOMs: chunk_len = len(chunk) if header_row_len >= chunk_len: i = 0 match = 0 while i < chunk_len: if ord(header_row[i:i+1]) == chunk[i]: #print "** Match" match += 1 i += 1 if match == chunk_len: # File is using one of the unsupported encoding, so return an error message return result + " file encoding is not supported. File must be ASCII or UTF-8 without BOM" # File is not using one of the unsupported encoding, so return "OK" # File could still be another encoding (e.g., a binary file such as MS Excel or Doc) return 'OK' except Exception, e: return "Error processing first row of file: " + shared.get_exception_msg(e)
def get(self, blob_key): blob_key = str(urllib.unquote(blob_key)) blob_info = blobstore.BlobInfo.get(blob_key) if blob_info: try: blob_info.delete() self.redirect(settings.MAIN_PAGE_URL) return except Exception, e: msg = """Error deleting blobstore %s<br />%s""" % (blob_key, shared.get_exception_msg(e))
def file_has_valid_header_row(file, valid_column_names): """ Checks if the header row is valid. - File doesn't have a BOM - Header row is plain ASCII - Header row contains the minimumm set of valid column names Arguments: file A file object that refers to the CSV file to be processed by unicodcsv valid_column_names A list of valid column names Returns a tuple; msg 'OK' if the header row is valid. Returns a string describing the problem if the header is not valid. display_line Returns True if the probline is with the data in the first row. i.e., the calling method should display the line to the user Returns False if the problem is with the file No point displaying the first row, because the problem is with the file itself e.g., the file uses incorrect encoding (not UTF-8 without BOM and not ASCII) Calls _file_contains_valid_columns() to ensure that the file can be parsed by the unicodecsv package """ try: file.seek(0) header_row = file.readline() if not header_row.strip(): return "First row must contain column headers", False # Check if header row is plain ASCII try: s1 = header_row.encode('ascii') except Exception, e: return "The header row may only contain plain ASCII characters: " + shared.get_exception_msg(e), True # Check if the file can be parsed by unicodecsv, and that it has the minimum required columns # NOTE: _file_contains_valid_columns() returns a tuple return _file_contains_valid_columns(file, valid_column_names)
except Exception, e: logging.exception(fn_name + "Error putting results in DB") logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "Unable to store tasklists in DB: " + shared.get_exception_msg(e) self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() self._log_progress("Exception") self.process_tasks_job.put() except urlfetch_errors.DeadlineExceededError, e: logging.exception(fn_name + "urlfetch_errors.DeadlineExceededError:") logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "Server took too long to respond: " + shared.get_exception_msg(e) self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() self._log_progress("urlfetch_errors.DeadlineExceededError") self.process_tasks_job.put() except apiproxy_errors.DeadlineExceededError, e: logging.exception(fn_name + "apiproxy_errors.DeadlineExceededError:") logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "Server took too long to respond: " + shared.get_exception_msg(e) self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() self._log_progress("apiproxy_errors.DeadlineExceededError") self.process_tasks_job.put() except DeadlineExceededError, e:
def get(self): fn_name = "DisplayStatsHandler.get(): " logging.debug(fn_name + "<Start> (app version %s)" % appversion.version ) logservice.flush() stats_query = model.UsageStats.all() # stats_query.order('start_time') # stats_query.order('user_hash') stats = stats_query.run() try: stats_filename = "stats_" + get_application_id() + "_" + datetime.datetime.now().strftime("%Y-%m-%d") + ".csv" template_values = {'stats' : stats} self.response.headers["Content-Type"] = "text/csv" self.response.headers.add_header( "Content-Disposition", "attachment; filename=%s" % stats_filename) path = os.path.join(os.path.dirname(__file__), constants.PATH_TO_TEMPLATES, "stats.csv") self.response.out.write(template.render(path, template_values)) logging.debug(fn_name + "<End>" ) logservice.flush() except Exception, e: logging.exception(fn_name + "Caught top-level exception") self.response.headers["Content-Type"] = "text/html; charset=utf-8" try: # Clear "Content-Disposition" so user will see error in browser. # If not removed, output goes to file (if error generated after "Content-Disposition" was set), # and user would not see the error message! del self.response.headers["Content-Disposition"] except Exception, e: logging.debug(fn_name + "Unable to delete 'Content-Disposition' from headers: " + shared.get_exception_msg(e))
class DownloadStatsHandler(webapp.RequestHandler): """Display statistics""" def get(self): fn_name = "DisplayStatsHandler.get(): " logging.debug(fn_name + "<Start> (app version %s)" % appversion.version ) logservice.flush() stats_query = model.UsageStats.all() # stats_query.order('start_time') # stats_query.order('user_hash') stats = stats_query.run() try: stats_filename = "stats_" + get_application_id() + "_" + datetime.datetime.now().strftime("%Y-%m-%d") + ".csv" template_values = {'stats' : stats} self.response.headers["Content-Type"] = "text/csv" self.response.headers.add_header( "Content-Disposition", "attachment; filename=%s" % stats_filename) path = os.path.join(os.path.dirname(__file__), constants.PATH_TO_TEMPLATES, "stats.csv") self.response.out.write(template.render(path, template_values)) logging.debug(fn_name + "<End>" ) logservice.flush() except Exception, e: logging.exception(fn_name + "Caught top-level exception") self.response.headers["Content-Type"] = "text/html; charset=utf-8" try: # Clear "Content-Disposition" so user will see error in browser. # If not removed, output goes to file (if error generated after "Content-Disposition" was set), # and user would not see the error message! del self.response.headers["Content-Disposition"] except Exception, e: logging.debug(fn_name + "Unable to delete 'Content-Disposition' from headers: " + shared.get_exception_msg(e)) self.response.clear() self.response.out.write("""Oops! Something went terribly wrong.<br />%s<br />Please report this error to <a href="http://code.google.com/p/tasks-backup/issues/list">code.google.com/p/tasks-backup/issues/list</a>""" % shared.get_exception_msg(e)) logging.debug(fn_name + "<End> due to exception" ) logservice.flush()
class ProcessTasksWorker(webapp.RequestHandler): """ Process tasks according to data in the ProcessTasksJob entity """ credentials = None user_email = None is_test_user = False process_tasks_job = None tasks_svc = None tasklists_svc = None def post(self): fn_name = "ProcessTasksWorker.post(): " logging.debug(fn_name + "<start> (app version %s)" %appversion.version) logservice.flush() client_id, client_secret, user_agent, app_title, project_name, host_msg = shared.get_settings(self.request.host) self.user_email = self.request.get(settings.TASKS_QUEUE_KEY_NAME) self.is_test_user = shared.isTestUser(self.user_email) if self.user_email: # Retrieve the DB record for this user self.process_tasks_job = model.ProcessTasksJob.get_by_key_name(self.user_email) if self.process_tasks_job is None: logging.error(fn_name + "No DB record for " + self.user_email) logservice.flush() logging.debug(fn_name + "<End> No DB record") # TODO: Find some way of notifying the user????? # Could use memcache to relay a message which is displayed in ProgressHandler return else: logging.debug(fn_name + "Retrieved process tasks job for " + str(self.user_email)) logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.INITIALISING self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() self.process_tasks_job.message = "Validating background job ..." 
logging.debug(fn_name + "Initialising - Job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logservice.flush() self.process_tasks_job.put() user = self.process_tasks_job.user if not user: logging.error(fn_name + "No user object in DB record for " + str(self.user_email)) logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "Problem with user details. Please restart." self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() logging.debug(fn_name + "No user - Job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logservice.flush() self.process_tasks_job.put() logging.debug(fn_name + "<End> No user object") return self.credentials = self.process_tasks_job.credentials if not self.credentials: logging.error(fn_name + "No credentials in DB record for " + str(self.user_email)) logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "Problem with user self.credentials. Please restart." 
self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() logging.debug(fn_name + "Job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logservice.flush() self.process_tasks_job.put() logging.debug(fn_name + "<End> No self.credentials") return if self.credentials.invalid: logging.error(fn_name + "Invalid credentials in DB record for " + str(self.user_email)) logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "Invalid self.credentials. Please restart and re-authenticate." self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() logging.debug(fn_name + "Credentials invalid - Job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logservice.flush() self.process_tasks_job.put() logging.debug(fn_name + "<End> Invalid self.credentials") return if self.is_test_user: logging.debug(fn_name + "User is test user %s" % self.user_email) logservice.flush() http = httplib2.Http() http = self.credentials.authorize(http) service = discovery.build("tasks", "v1", http) self.tasklists_svc = service.tasklists() self.tasks_svc = service.tasks() self.export_tasks() # logging.debug(fn_name + "Finished processing. 
Total progress = " + # str(self.process_tasks_job.total_progress) + " for " + str(self.user_email)) else: logging.error(fn_name + "No processing, as there was no user_email key") logservice.flush() logging.debug(fn_name + "<End>, user = "******"export_tasks: " logging.debug(fn_name + "<Start>") logservice.flush() start_time = datetime.datetime.now() include_hidden = self.process_tasks_job.include_hidden include_completed = self.process_tasks_job.include_completed include_deleted = self.process_tasks_job.include_deleted summary_msg = '' # Retrieve all tasks for the user try: logging.debug(fn_name + "include_hidden = " + str(include_hidden) + ", include_completed = " + str(include_completed) + ", include_deleted = " + str(include_deleted)) logservice.flush() # ############################################## # FLOW # ---------------------------------------------- # For each page of taskslists # For each tasklist # For each page of tasks # For each task # Fix date format # Add tasks to tasklist collection # Add tasklist to tasklists collection # Use tasklists collection to return tasks backup to user self.process_tasks_job.status = constants.ExportJobStatus.BUILDING self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() self.process_tasks_job.message = 'Retrieving tasks from server ...' 
logging.debug(fn_name + "Building - Job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logservice.flush() self.process_tasks_job.put() # This list will contain zero or more tasklist dictionaries, which each contain tasks tasklists = [] total_num_tasklists = 0 total_num_tasks = 0 tasks_per_list = [] # --------------------------------------- # Retrieve all the tasklists for the user # --------------------------------------- logging.debug(fn_name + "Retrieve all the tasklists for the user") logservice.flush() next_tasklists_page_token = None more_tasklists_data_to_retrieve = True while more_tasklists_data_to_retrieve: if self.is_test_user: logging.debug(fn_name + "calling tasklists.list().execute() to create tasklists list") logservice.flush() retry_count = constants.NUM_API_RETRIES while retry_count > 0: try: if next_tasklists_page_token: tasklists_data = self.tasklists_svc.list(pageToken=next_tasklists_page_token).execute() else: tasklists_data = self.tasklists_svc.list().execute() # Successfully retrieved data, so break out of retry loop break except Exception, e: retry_count = retry_count - 1 if retry_count > 0: logging.warning(fn_name + "Error retrieving list of tasklists. " + str(retry_count) + " retries remaining") logservice.flush() if retry_count <= 2: logging.debug(fn_name + "Sleeping for " + str(settings.WORKER_API_RETRY_SLEEP_DURATION) + " seconds before retrying") logservice.flush() time.sleep(settings.WORKER_API_RETRY_SLEEP_DURATION) else: logging.exception(fn_name + "Still error retrieving list of tasklists after " + str(constants.NUM_API_RETRIES) + " retries. 
Giving up") logservice.flush() raise e if self.is_test_user and settings.DUMP_DATA: logging.debug(fn_name + "tasklists_data ==>") logging.debug(tasklists_data) logservice.flush() if tasklists_data.has_key(u'items'): tasklists_list = tasklists_data[u'items'] else: # If there are no tasklists, then there will be no 'items' element. This could happen if # the user has deleted all their tasklists. Not sure if this is even possible, but # checking anyway, since it is possible to have a tasklist without 'items' (see issue #9) logging.debug(fn_name + "User has no tasklists.") logservice.flush() tasklists_list = [] # tasklists_list is a list containing the details of the user's tasklists. # We are only interested in the title # if self.is_test_user and settings.DUMP_DATA: # logging.debug(fn_name + "tasklists_list ==>") # logging.debug(tasklists_list) # --------------------------------------- # Process all the tasklists for this user # --------------------------------------- for tasklist_data in tasklists_list: total_num_tasklists = total_num_tasklists + 1 if self.is_test_user and settings.DUMP_DATA: logging.debug(fn_name + "tasklist_data ==>") logging.debug(tasklist_data) logservice.flush() """ Example of a tasklist entry; u'id': u'MDAxNTkzNzU0MzA0NTY0ODMyNjI6MDow', u'kind': u'tasks#taskList', u'selfLink': u'https://www.googleapis.com/tasks/v1/users/@me/lists/MDAxNTkzNzU0MzA0NTY0ODMyNjI6MDow', u'title': u'Default List', u'updated': u'2012-01-28T07:30:18.000Z'}, """ tasklist_title = tasklist_data[u'title'] tasklist_id = tasklist_data[u'id'] if self.is_test_user and settings.DUMP_DATA: logging.debug(fn_name + "Process all the tasks in " + str(tasklist_title)) logservice.flush() # ===================================================== # Process all the tasks in this task list # ===================================================== tasklist_dict, num_tasks = self.get_tasks_in_tasklist(tasklist_title, tasklist_id, include_hidden, include_completed, include_deleted) # Track 
number of tasks per tasklist tasks_per_list.append(num_tasks) total_num_tasks = total_num_tasks + num_tasks self.process_tasks_job.total_progress = total_num_tasks self.process_tasks_job.tasklist_progress = 0 # Because total_progress now includes num_tasks for current tasklist self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() self.process_tasks_job.message = '' logging.debug(fn_name + "Processed tasklist. Updated job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logservice.flush() self.process_tasks_job.put() # if self.is_test_user: # logging.debug(fn_name + "Adding %d tasks to tasklist" % len(tasklist_dict[u'tasks'])) # Add the data for this tasklist (including all the tasks) into the collection of tasklists tasklists.append(tasklist_dict) # Check if there is another page of tasklists to be retrieved if tasklists_data.has_key('nextPageToken'): # There is another page of tasklists to be retrieved for this user, # which we'll retrieve next time around the while loop. # This happens if there is more than 1 page of tasklists. # It seems that each page contains 20 tasklists. 
more_tasklists_data_to_retrieve = True # Go around while loop again next_tasklists_page_token = tasklists_data['nextPageToken'] # if self.is_test_user: # logging.debug(fn_name + "There is (at least) one more page of tasklists to be retrieved") else: # This is the last (or only) page of results (list of tasklists) more_tasklists_data_to_retrieve = False next_tasklists_page_token = None # *** end while more_tasks_data_to_retrieve *** # ------------------------------------------------------ # Store the data, so we can return it to the user # ------------------------------------------------------ """ tasklists is a list of tasklist structures structure of tasklist { "title" : tasklist.title, # Name of this tasklist "tasks" : [ task ] # List of task items in this tasklist } structure of task { "title" : title, # Free text "status" : status, # "completed" | "needsAction" "id" : id, # Used when determining parent-child relationships "parent" : parent, # OPT: ID of the parent of this task (only if this is a sub-task) "notes" : notes, # OPT: Free text "due" : due, # OPT: Date due, e.g. 
2012-01-30T00:00:00.000Z NOTE time = 0 "updated" : updated, # Timestamp, e.g., 2012-01-26T07:47:18.000Z "completed" : completed # Timestamp, e.g., 2012-01-27T10:38:56.000Z } """ # Delete existing backup data records tasklist_data_records = model.TasklistsData.gql("WHERE ANCESTOR IS :1", db.Key.from_path(settings.DB_KEY_TASKS_BACKUP_DATA, self.user_email)) num_records = tasklist_data_records.count() logging.debug(fn_name + "Deleting " + str(num_records) + " old blobs") logservice.flush() for tasklists_data_record in tasklist_data_records: tasklists_data_record.delete() # logging.debug(fn_name + "Pickling tasks data ...") pickled_tasklists = pickle.dumps(tasklists) # logging.debug(fn_name + "Pickled data size = " + str(len(pickled_tasklists))) data_len = len(pickled_tasklists) # Multiply by 1.0 float value so that we can use ceiling to find number of Blobs required num_of_blobs = int(math.ceil(data_len * 1.0 / constants.MAX_BLOB_SIZE)) logging.debug(fn_name + "Calculated " + str(num_of_blobs) + " blobs required to store " + str(data_len) + " bytes") logservice.flush() try: for i in range(num_of_blobs): # Write backup data records tasklist_rec = model.TasklistsData(db.Key.from_path(settings.DB_KEY_TASKS_BACKUP_DATA, self.user_email)) slice_start = int(i*constants.MAX_BLOB_SIZE) slice_end = int((i+1)*constants.MAX_BLOB_SIZE) # logging.debug(fn_name + "Creating part " + str(i+1) + " of " + str(num_of_blobs) + # " using slice " + str(slice_start) + " to " + str(slice_end)) pkl_part = pickled_tasklists[slice_start : slice_end] tasklist_rec.pickled_tasks_data = pkl_part tasklist_rec.idx = i tasklist_rec.put() # logging.debug(fn_name + "Marking backup job complete") end_time = datetime.datetime.now() process_time = end_time - start_time proc_time_str = str(process_time.seconds) + "." 
+ str(process_time.microseconds)[:3] + " seconds" # Mark backup completed summary_msg = "Retrieved %d tasks from %d tasklists" % (total_num_tasks, total_num_tasklists) breakdown_msg = "Tasks per list: " + str(tasks_per_list) self.process_tasks_job.status = constants.ExportJobStatus.EXPORT_COMPLETED self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() # self.process_tasks_job.message = summary_msg + " in " + proc_time_str self.process_tasks_job.message = summary_msg + " at " + \ self.process_tasks_job.job_start_timestamp.strftime("%H:%M UTC, %a %d %b %Y") # logging.debug(fn_name + "COMPLETED: Export complete - Job status: '" + str(self.process_tasks_job.status) + "', progress: " + # str(self.process_tasks_job.total_progress) + ", msg: '" + # str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logging.info(fn_name + "COMPLETED: " + summary_msg + " for " + self.user_email + " in " + proc_time_str) logservice.flush() self.process_tasks_job.put() try: end_time = datetime.datetime.now() process_time = end_time - start_time processing_time = process_time.days * 3600*24 + process_time.seconds + process_time.microseconds / 1000000.0 included_options_str = "Includes: Completed = %s, Deleted = %s, Hidden = %s" % (str(include_completed), str(include_deleted), str(include_hidden)) logging.debug(fn_name + "STATS: Started at " + str(start_time) + "\n " + summary_msg + "\n " + breakdown_msg + "\n " + proc_time_str + "\n " + included_options_str) logservice.flush() usage_stats = model.UsageStats( user_hash = hash(self.user_email), number_of_tasks = self.process_tasks_job.total_progress, number_of_tasklists = total_num_tasklists, tasks_per_tasklist = tasks_per_list, include_completed = include_completed, include_deleted = include_deleted, include_hidden = include_hidden, start_time = start_time, processing_time = processing_time) usage_stats.put() logging.debug(fn_name + "Saved stats") logservice.flush() except 
Exception, e: logging.exception("Error saving stats") logservice.flush() # Don't bother doing anything else, because stats aren't critical except apiproxy_errors.RequestTooLargeError, e: logging.exception(fn_name + "Error putting results in DB - Request too large") logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "Tasklists data is too large - Unable to store tasklists in DB: " + shared.get_exception_msg(e) self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() logging.debug(fn_name + "apiproxy_errors.RequestTooLargeError - Job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logservice.flush() self.process_tasks_job.put() except Exception, e: logging.exception(fn_name + "Error putting results in DB") logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "Unable to store tasklists in DB: " + shared.get_exception_msg(e) self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() logging.debug(fn_name + "Exception - Job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logservice.flush() self.process_tasks_job.put()
def get_tasks_in_tasklist(self, tasklist_title, tasklist_id, include_hidden, include_completed, include_deleted): """ Returns all the tasks in the tasklist arguments: tasklist_title -- Name of the tasklist tasklist_id -- ID used to retrieve tasks from this tasklist MUST match the ID returned in the tasklist data include_hidden -- If true, include hidden tasks in the backup include_completed -- If true, include completed tasks in the backup include_deleted -- If true, include deleted tasks in the backup returns a tuple; two-element dictionary; 'title' is a string, the name of the tasklist 'tasks' is a list. Each element in the list is dictionary representing 1 task number of tasks """ fn_name = "CreateBackupHandler.get_tasks_in_tasklist(): " tasklist_dict = {} # Blank dictionary for this tasklist tasklist_dict[u'title'] = tasklist_title # Store the tasklist name in the dictionary tasklist_dict[u'id'] = tasklist_id # Store the tasklist ID in the dictionary num_tasks = 0 more_tasks_data_to_retrieve = True next_tasks_page_token = None # Keep track of when last updated, to prevent excessive DB access which could exceed quota prev_progress_timestamp = datetime.datetime.now() if self.is_test_user and settings.DUMP_DATA: logging.debug(fn_name + "include_hidden = " + str(include_hidden) + ", include_completed = " + str(include_completed) + ", include_deleted = " + str(include_deleted)) logservice.flush() # --------------------------------------------------------------------------- # Retrieve the tasks in this tasklist, and store as "tasks" in the dictionary # --------------------------------------------------------------------------- while more_tasks_data_to_retrieve: retry_count = constants.NUM_API_RETRIES while retry_count > 0: try: # Retrieve a page of (up to 100) tasks if next_tasks_page_token: # Get the next page of results # This happens if there are more than 100 tasks in the list # See http://code.google.com/apis/tasks/v1/using.html#api_params # "Maximum allowable 
value: maxResults=100" tasks_data = self.tasks_svc.list(tasklist = tasklist_id, pageToken=next_tasks_page_token, showHidden=include_hidden, showCompleted=include_completed, showDeleted=include_deleted).execute() else: # Get the first (or only) page of results for this tasklist tasks_data = self.tasks_svc.list(tasklist = tasklist_id, showHidden=include_hidden, showCompleted=include_completed, showDeleted=include_deleted).execute() # Succeeded, so continue break except Exception, e: retry_count = retry_count - 1 if retry_count > 0: logging.warning(fn_name + "Error retrieving tasks, " + str(retry_count) + " retries remaining") logservice.flush() # Last chances - sleep to give the server some extra time before re-requesting if retry_count <= 2: logging.debug(fn_name + "Sleeping for " + str(settings.WORKER_API_RETRY_SLEEP_DURATION) + " seconds before retrying") logservice.flush() time.sleep(settings.WORKER_API_RETRY_SLEEP_DURATION) else: logging.exception(fn_name + "Still error retrieving tasks for tasklist after " + str(constants.NUM_API_RETRIES) + " retries. Giving up") logservice.flush() raise e if self.is_test_user and settings.DUMP_DATA: logging.debug(fn_name + "tasks_data ==>") logging.debug(tasks_data) if not tasks_data.has_key(u'items'): # When using the Google Tasks webpage at https://mail.google.com/tasks/canvas, there will always # be at least one task in any tasklist, because when deleting the last task, a new blank task is # automatically created. # However, a third-party app (e.g., Calengoo on Android) CAN delete all the tasks in a task list, # which results in a tasklist without an 'items' element. 
logging.debug(fn_name + "No tasks in tasklist") logservice.flush() else: try: tasks = tasks_data[u'items'] # Store all the tasks (List of Dict) except Exception, e: logging.exception(fn_name, "Exception extracting items from tasks_data.") #logging.error(tasks_data) logservice.flush() raise e # if self.is_test_user and settings.DUMP_DATA: # logging.debug(fn_name + "tasks ==>") # logging.debug(tasks) # logservice.flush() # ------------------------------------------------------------------------------------------------ # Fix date/time format for each task, so that the date/time values can be used in Django templates # Convert the yyyy-mm-ddThh:mm:ss.dddZ format to a datetime object, and store that. # There have been occassional format errors in the 'completed' property, # due to 'completed' value such as "-1701567-04-26T07:12:55.000Z" # According to http://docs.python.org/library/datetime.html # "The exact range of years for which strftime() works also varies across platforms. # Regardless of platform, years before 1900 cannot be used." 
# so if any date/timestamp value is invalid, set the property to '1900-01-01 00:00:00' # NOTE: Sometimes a task has a completion date of '0000-01-01T00:00:00.000Z', which also cannot # be converted to datetime, because the earliest allowable datetime year is 0001 # ------------------------------------------------------------------------------------------------ for t in tasks: num_tasks = num_tasks + 1 date_due = t.get(u'due') if date_due: try: new_due_date = datetime.datetime.strptime(date_due, "%Y-%m-%dT00:00:00.000Z").date() except ValueError, e: new_due_date = datetime.date(1900, 1, 1) logging.warning(fn_name + "Invalid 'due' timestamp (" + str(date_due) + "), so using " + str(new_due_date) + ": " + shared.get_exception_msg(e)) logservice.flush() t[u'due'] = new_due_date datetime_updated = t.get(u'updated') if datetime_updated: try: new_datetime_updated = datetime.datetime.strptime(datetime_updated, "%Y-%m-%dT%H:%M:%S.000Z") except ValueError, e: new_datetime_updated = datetime.datetime(1900, 1, 1, 0, 0, 0) logging.warning(fn_name + "Invalid 'updated' timestamp (" + str(datetime_updated) + "), so using " + str(new_datetime_updated) + ": " + shared.get_exception_msg(e)) logservice.flush() t[u'updated'] = new_datetime_updated
self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "Unable to store tasklists in DB: " + shared.get_exception_msg(e) self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() logging.debug(fn_name + "Exception - Job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logservice.flush() self.process_tasks_job.put() except urlfetch_errors.DeadlineExceededError, e: logging.exception(fn_name + "urlfetch_errors.DeadlineExceededError:") logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "urlfetch_errors.DeadlineExceededError: " + shared.get_exception_msg(e) self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() logging.debug(fn_name + "urlfetch_errors.DeadlineExceededError - Job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'") logservice.flush() self.process_tasks_job.put() except apiproxy_errors.DeadlineExceededError, e: logging.exception(fn_name + "apiproxy_errors.DeadlineExceededError:") logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = '' self.process_tasks_job.error_message = "apiproxy_errors.DeadlineExceededError: " + shared.get_exception_msg(e) self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() logging.debug(fn_name + "apiproxy_errors.DeadlineExceededError - Job status: '" + str(self.process_tasks_job.status) + "', progress: " +
def _file_contains_valid_columns(file, valid_column_names): """Returns True if file is a CSV that contains (at least) all the valid_column_names file A file object valid_column_names A list of valid column names Returns a tuple; msg 'OK' if the header row is valid. Returns a string describing the problem if the header is not valid. display_line Returns True if the probline is with the data in the first row. i.e., the calling method should display the line to the user Uses unicodecsv so that unicode CSV files can be parsed. There may be extra columns, but that doesn't matter because we are processing the CSV as a dictionary. """ fn_name = "_file_contains_valid_columns: " try: file.seek(0) dict_reader=unicodecsv.DictReader(file,dialect='excel') # Check if uploaded file appears to be an Outlook file num_outlook_column_names = 0 for col_name in constants.OUTLOOK_HEADER_ROW: if col_name in dict_reader.fieldnames: num_outlook_column_names += 1 # logging.debug(fn_name + "DEBUG: Found '" + col_name + "' in header row") # else: # logging.debug(fn_name + "DEBUG: '" + col_name + "' not found in header row") # logging.debug(fn_name + "DEBUG: Found " + str(num_outlook_column_names) + " of " + # str(len(constants.OUTLOOK_HEADER_ROW)) + " Outlook column names") if num_outlook_column_names == len(constants.OUTLOOK_HEADER_ROW): return (host_settings.APP_TITLE + " cannot directly import an Outlook export file. 
Please refer to the <a href='" + settings.OUTLOOK_INSTRUCTIONS_URL + "'>instructions for exporting tasks from outlook</a>."), False for col_name in valid_column_names: if not col_name in dict_reader.fieldnames: return "Missing '" + col_name + "' column", True # All columns found (there may be extra columns, but that doesn't matter) return 'OK', False except Exception, e: # 2013-02-19: Kludge to check if the exception is caused by a file that doesn't end in CR/LF # At present, processing Mac files returns # "Error: new-line character seen in unquoted field - do you need to open the file in universal-newline mode?" # We check for "new-line" or "newline" and hope that future version of Python will contain the same keywords. display_line = True kwds = ["new-line", "newline", "line-feed", "linefeed"] err_msg_list = str(e).split(' ') if any([kwd in kwds for kwd in err_msg_list]): msg = "Unable to process file. " + constants.INVALID_LINE_TERMINATOR_MSG logging.info(fn_name + constants.INVALID_FORMAT_LOG_LABEL + shared.get_exception_msg(e)) # Don't display the offending line, because it could be the entire file because there was no line terminator! display_line = False else: msg = "Error checking column names - " + shared.get_exception_msg(e) logging.warning(fn_name + constants.INVALID_FORMAT_LOG_LABEL + msg) return msg, display_line
+ ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'" ) logservice.flush() self.process_tasks_job.put() except urlfetch_errors.DeadlineExceededError, e: logging.exception(fn_name + "urlfetch_errors.DeadlineExceededError:") logservice.flush() self.process_tasks_job.status = constants.ExportJobStatus.ERROR self.process_tasks_job.message = "" self.process_tasks_job.error_message = "urlfetch_errors.DeadlineExceededError: " + shared.get_exception_msg( e ) self.process_tasks_job.job_progress_timestamp = datetime.datetime.now() logging.debug( fn_name + "urlfetch_errors.DeadlineExceededError - Job status: '" + str(self.process_tasks_job.status) + "', progress: " + str(self.process_tasks_job.total_progress) + ", msg: '" + str(self.process_tasks_job.message) + "', err msg: '" + str(self.process_tasks_job.error_message) + "'" ) logservice.flush()
def depth_is_valid(task_row_data, row_num, is_test_user, compare_with_previous_row = False, is_first_task_in_tasklist = False, prev_row_depth = 0):
    """ Validate the 'depth' value of a CSV task row.

        params
            task_row_data
                a dictionary representing a row from a CSV file
            row_num
                1-based data row number (used only in error/log messages)
            is_test_user
                True if this is a test user, used to provide additional logging
            compare_with_previous_row
                [OPTIONAL] If True, check that depth of first task in tasklist is zero,
                and compare depth value with value of previous task
            is_first_task_in_tasklist
                [OPTIONAL] True if this task is the first task in a new tasklist
            prev_row_depth
                [OPTIONAL] depth value of the previous data row

        returns
            result      True if the depth value is valid
            depth       The depth value of this task
            err_msg1    Error message, set if result is False
            err_msg2    Error message, set if result is False
    """
    fn_name = "depth_is_valid: "

    # ---------------------
    #  Check depth value
    # ---------------------
    # Depth is optional (worker will use depth = 0 if missing)
    # If depth exists;
    #   it must be numeric OR a blank string (which is interpreted as 0)
    #   it must be zero for first task in each tasklist
    err_msg1 = ""
    err_msg2 = ""
    depth = 0

    # Check if task has a depth value
    if not task_row_data.has_key('depth'):
        # OK: No depth value, so worker will use depth = 0
        return True, 0, "", ""

    # Check for missing depth field.
    # DictReader sets the value to None when a row has fewer fields than the
    # header has columns, so None here indicates a malformed row.
    if task_row_data['depth'] == None:
        err_msg1 = "Missing depth field for data row " + str(row_num)
        err_msg2 = "Either the number of fields does not match the number of header columns, or there may be a newline in a non-quoted field"
        # ERROR: No depth
        return False, 0, err_msg1, err_msg2

    # Check for blank depth value (whitespace-only counts as blank)
    depth_str = unicode(task_row_data['depth']).strip()
    if len(depth_str) == 0:
        if is_test_user:
            logging.debug(fn_name + "TEST: Blank 'depth' property in data row " + str(row_num) + ". Will set depth = 0 and import task as root")
            logservice.flush()
        # OK: Blank depth value, so worker will use depth = 0
        return True, 0, "", ""

    # Interpret the depth value as an integer
    try:
        depth = int(depth_str)
    except Exception, e:
        err_msg1 = ("Invalid 'depth' value [" + unicode(task_row_data['depth']) + "] for data row " + str(row_num))
        err_msg2 = "The 'depth' value can be blank, or any whole number from -1 upwards"
        if is_test_user:
            logging.info(fn_name + "TEST: " + err_msg1 + ": " + shared.get_exception_msg(e))
            logservice.flush()
        return False, 0, err_msg1, err_msg2
def post(self):
    """ Delete a selection of blobstores (selected in the form, and POSTed as 'blob_key' values). """
    fn_name = "BulkDeleteBlobstoreHandler.post(): "
    logging.debug(fn_name + "<Start>")
    logservice.flush()
    try:
        self.response.out.write('<html><body>')
        # One 'blob_key' value per blobstore selected in the form
        blobstores_to_delete = self.request.get_all('blob_key')
        del_count = 0
        for blob_key in blobstores_to_delete:
            blob_info = blobstore.BlobInfo.get(blob_key)
            if blob_info:
                try:
                    blob_info.delete()
                    del_count = del_count + 1
                except Exception, e:
                    # Report the failure to the user, but keep deleting the remaining blobstores
                    logging.exception(fn_name + "Exception deleting blobstore [" + str(del_count) + "] " + str(blob_key))
                    self.response.out.write("""<div>Error deleting blobstore %s</div>%s""" % (blob_key, shared.get_exception_msg(e)))
            else:
                self.response.out.write("""<div>Blobstore %s doesn't exist</div>""" % blob_key)
        self.response.out.write('Deleted ' + str(del_count) + ' blobstores')
        self.response.out.write('<br /><br /><a href="' + settings.ADMIN_MANAGE_BLOBSTORE_URL + '">Back to Blobstore Management</a><br /><br />')
        self.response.out.write("""<br /><br /><a href=""" + settings.MAIN_PAGE_URL + """>Home page</a><br /><br />""")
        self.response.out.write('</body></html>')
        logging.debug(fn_name + "<End>")
        logservice.flush()
def _export_tasks(self):
    """ Retrieve all of the user's tasks from the Google Tasks server and store them in the DB.

        Reads export options (include_hidden / include_completed / include_deleted)
        from self.process_tasks_job, pages through all tasklists and their tasks,
        pickles the resulting list of tasklist dictionaries, and stores the pickle
        across one or more TasklistsData records. Job status/progress is updated
        in self.process_tasks_job throughout so the front end can report progress.
    """
    fn_name = "_export_tasks: "
    logging.debug(fn_name + "<Start>")
    logservice.flush()

    start_time = datetime.datetime.now()

    # Export options, as chosen by the user when the job was created
    include_hidden = self.process_tasks_job.include_hidden
    include_completed = self.process_tasks_job.include_completed
    include_deleted = self.process_tasks_job.include_deleted
    summary_msg = ''

    # Retrieve all tasks for the user
    try:
        logging.debug(fn_name + "include_completed = " + str(include_completed) +
            ", include_hidden = " + str(include_hidden) +
            ", include_deleted = " + str(include_deleted))
        logservice.flush()

        # ##############################################
        #                    FLOW
        # ----------------------------------------------
        # For each page of taskslists
        #   For each tasklist
        #     For each page of tasks
        #       For each task
        #         Fix date format
        #       Add tasks to tasklist collection
        #   Add tasklist to tasklists collection
        # Use tasklists collection to return tasks backup to user
        self.process_tasks_job.status = constants.ExportJobStatus.BUILDING
        self.process_tasks_job.job_progress_timestamp = datetime.datetime.now()
        self.process_tasks_job.message = 'Retrieving tasks from server ...'
        self._log_progress("Building")
        self.process_tasks_job.put()

        # This list will contain zero or more tasklist dictionaries, which each contain tasks
        tasklists = []

        total_num_tasklists = 0
        total_num_tasks = 0
        # Per-tasklist task counts, in retrieval order (used for stats)
        tasks_per_list = []

        # ---------------------------------------
        # Retrieve all the tasklists for the user
        # ---------------------------------------
        logging.debug(fn_name + "Retrieve all the tasklists for the user")
        logservice.flush()

        next_tasklists_page_token = None
        more_tasklists_data_to_retrieve = True
        while more_tasklists_data_to_retrieve:
            if self.is_test_user:
                logging.debug(fn_name + "calling tasklists.list().execute() to create tasklists list")
                logservice.flush()

            # Retry loop for the tasklists.list() API call
            retry_count = settings.NUM_API_TRIES
            while retry_count > 0:
                try:
                    if next_tasklists_page_token:
                        tasklists_data = self.tasklists_svc.list(pageToken=next_tasklists_page_token).execute()
                    else:
                        tasklists_data = self.tasklists_svc.list().execute()
                    # Successfully retrieved data, so break out of retry loop
                    break
                except Exception, e:
                    retry_count = retry_count - 1
                    if retry_count > 0:
                        if isinstance(e, AccessTokenRefreshError):
                            # Log first 'n' AccessTokenRefreshError as Info, because they are reasonably common,
                            # and the system usually continues normally after the 2nd call to
                            # "new_request: Refreshing due to a 401"
                            # Occassionally, the system seems to need a 3rd attempt
                            # (i.e., success after waiting 45 seconds)
                            logging.info(fn_name + "Access Token Refresh Error whilst retrieving list of tasklists (1st time, not yet an error). " + str(retry_count) + " attempts remaining: " + shared.get_exception_msg(e))
                        else:
                            logging.warning(fn_name + "Error retrieving list of tasklists. " + str(retry_count) + " attempts remaining: " + shared.get_exception_msg(e))
                        logservice.flush()

                        if retry_count <= 2:
                            logging.debug(fn_name + "Giving server an extra chance; Sleeping for " + str(settings.WORKER_API_RETRY_SLEEP_DURATION) + " seconds before retrying")
                            logservice.flush()
                            # Update job_progress_timestamp so that job doesn't time out
                            self.process_tasks_job.job_progress_timestamp = datetime.datetime.now()
                            self.process_tasks_job.put()
                            time.sleep(settings.WORKER_API_RETRY_SLEEP_DURATION)
                    else:
                        logging.exception(fn_name + "Still error retrieving list of tasklists after " + str(settings.NUM_API_TRIES) + " attempts. Giving up")
                        logservice.flush()
                        raise e

            if self.is_test_user and settings.DUMP_DATA:
                logging.debug(fn_name + "tasklists_data ==>")
                logging.debug(tasklists_data)
                logservice.flush()

            if tasklists_data.has_key(u'items'):
                tasklists_list = tasklists_data[u'items']
            else:
                # If there are no tasklists, then there will be no 'items' element. This could happen if
                # the user has deleted all their tasklists. Not sure if this is even possible, but
                # checking anyway, since it is possible to have a tasklist without 'items' (see issue #9)
                logging.debug(fn_name + "User has no tasklists.")
                logservice.flush()
                tasklists_list = []

            # tasklists_list is a list containing the details of the user's tasklists.
            # We are only interested in the title

            # if self.is_test_user and settings.DUMP_DATA:
            #     logging.debug(fn_name + "tasklists_list ==>")
            #     logging.debug(tasklists_list)

            # ---------------------------------------
            # Process all the tasklists for this user
            # ---------------------------------------
            for tasklist_data in tasklists_list:
                total_num_tasklists = total_num_tasklists + 1

                if self.is_test_user and settings.DUMP_DATA:
                    logging.debug(fn_name + "tasklist_data ==>")
                    logging.debug(tasklist_data)
                    logservice.flush()

                """ Example of a tasklist entry;
                        u'id': u'MDAxNTkzNzU0MzA0NTY0ODMyNjI6MDow',
                        u'kind': u'tasks#taskList',
                        u'selfLink': u'https://www.googleapis.com/tasks/v1/users/@me/lists/MDAxNTkzNzU0MzA0NTY0ODMyNjI6MDow',
                        u'title': u'Default List',
                        u'updated': u'2012-01-28T07:30:18.000Z'},
                """

                tasklist_title = tasklist_data[u'title']
                tasklist_id = tasklist_data[u'id']

                if self.is_test_user and settings.DUMP_DATA:
                    logging.debug(fn_name + "Process all the tasks in " + str(tasklist_title))
                    logservice.flush()

                # =====================================================
                #       Process all the tasks in this task list
                # =====================================================
                tasklist_dict, num_tasks = self._get_tasks_in_tasklist(tasklist_title, tasklist_id, include_hidden, include_completed, include_deleted)

                # Track number of tasks per tasklist
                tasks_per_list.append(num_tasks)

                total_num_tasks = total_num_tasks + num_tasks
                self.process_tasks_job.total_progress = total_num_tasks
                self.process_tasks_job.tasklist_progress = 0  # Because total_progress now includes num_tasks for current tasklist
                self.process_tasks_job.job_progress_timestamp = datetime.datetime.now()
                self.process_tasks_job.message = ''
                self._log_progress("Processed tasklist")
                self.process_tasks_job.put()

                # if self.is_test_user:
                #     logging.debug(fn_name + "Adding %d tasks to tasklist" % len(tasklist_dict[u'tasks']))

                # Add the data for this tasklist (including all the tasks) into the collection of tasklists
                tasklists.append(tasklist_dict)

            # Check if there is another page of tasklists to be retrieved
            if tasklists_data.has_key('nextPageToken'):
                # There is another page of tasklists to be retrieved for this user,
                # which we'll retrieve next time around the while loop.
                # This happens if there is more than 1 page of tasklists.
                # It seems that each page contains 20 tasklists.
                more_tasklists_data_to_retrieve = True  # Go around while loop again
                next_tasklists_page_token = tasklists_data['nextPageToken']
                # if self.is_test_user:
                #     logging.debug(fn_name + "There is (at least) one more page of tasklists to be retrieved")
            else:
                # This is the last (or only) page of results (list of tasklists)
                more_tasklists_data_to_retrieve = False
                next_tasklists_page_token = None

        # *** end while more_tasks_data_to_retrieve ***

        # ------------------------------------------------------
        #   Store the data, so we can return it to the user
        # ------------------------------------------------------

        """
            tasklists is a list of tasklist structures

            structure of tasklist
            {
                "title" : tasklist.title,      # Name of this tasklist
                "tasks" : [ task ]             # List of task items in this tasklist
            }

            structure of task
            {
                "title" : title,               # Free text
                "status" : status,             # "completed" | "needsAction"
                "id" : id,                     # Used when determining parent-child relationships
                "parent" : parent,             # OPT: ID of the parent of this task (only if this is a sub-task)
                "notes" : notes,               # OPT: Free text
                "due" : due,                   # OPT: Date due, e.g. 2012-01-30T00:00:00.000Z NOTE time = 0
                "updated" : updated,           # Timestamp, e.g., 2012-01-26T07:47:18.000Z
                "completed" : completed        # Timestamp, e.g., 2012-01-27T10:38:56.000Z
            }
        """

        # Delete existing backup data records
        tasklist_data_records = model.TasklistsData.gql("WHERE ANCESTOR IS :1",
                                                        db.Key.from_path(settings.DB_KEY_TASKS_BACKUP_DATA, self.user_email))
        num_records = tasklist_data_records.count()
        logging.debug(fn_name + "Deleting " + str(num_records) + " old blobs")
        logservice.flush()
        for tasklists_data_record in tasklist_data_records:
            tasklists_data_record.delete()

        # logging.debug(fn_name + "Pickling tasks data ...")
        pickled_tasklists = pickle.dumps(tasklists)
        # logging.debug(fn_name + "Pickled data size = " + str(len(pickled_tasklists)))
        data_len = len(pickled_tasklists)
        # Multiply by 1.0 float value so that we can use ceiling to find number of Blobs required
        num_of_blobs = int(math.ceil(data_len * 1.0 / constants.MAX_BLOB_SIZE))
        logging.debug(fn_name + "Calculated " + str(num_of_blobs) + " blobs required to store " + str(data_len) + " bytes")
        logservice.flush()

        try:
            # Split the pickle across num_of_blobs records, each holding up to
            # MAX_BLOB_SIZE bytes, indexed by 'idx' so it can be reassembled in order
            for i in range(num_of_blobs):
                # Write backup data records
                tasklist_rec = model.TasklistsData(db.Key.from_path(settings.DB_KEY_TASKS_BACKUP_DATA, self.user_email))
                slice_start = int(i*constants.MAX_BLOB_SIZE)
                slice_end = int((i+1)*constants.MAX_BLOB_SIZE)
                # logging.debug(fn_name + "Creating part " + str(i+1) + " of " + str(num_of_blobs) +
                #     " using slice " + str(slice_start) + " to " + str(slice_end))
                pkl_part = pickled_tasklists[slice_start : slice_end]
                tasklist_rec.pickled_tasks_data = pkl_part
                tasklist_rec.idx = i
                tasklist_rec.put()

            # logging.debug(fn_name + "Marking backup job complete")
            end_time = datetime.datetime.now()
            process_time = end_time - start_time
            proc_time_str = str(process_time.seconds) + "." + str(process_time.microseconds)[:3] + " seconds"

            # Mark backup completed
            summary_msg = "Retrieved %d tasks from %d tasklists" % (total_num_tasks, total_num_tasklists)
            breakdown_msg = "Tasks per list: " + str(tasks_per_list)
            self.process_tasks_job.status = constants.ExportJobStatus.EXPORT_COMPLETED
            self.process_tasks_job.job_progress_timestamp = datetime.datetime.now()
            # self.process_tasks_job.message = summary_msg + " in " + proc_time_str
            self.process_tasks_job.message = summary_msg + " at " + \
                start_time.strftime("%H:%M UTC, %a %d %b %Y")
            logging.info(fn_name + "COMPLETED: " + summary_msg + " for " + self.user_email + " in " + proc_time_str)
            logservice.flush()
            self.process_tasks_job.put()

            # Best-effort usage statistics; failures here must not fail the backup
            try:
                end_time = datetime.datetime.now()
                process_time = end_time - start_time
                processing_time = process_time.days * 3600*24 + process_time.seconds + process_time.microseconds / 1000000.0
                included_options_str = "Includes: Completed = %s, Deleted = %s, Hidden = %s" % (str(include_completed), str(include_deleted), str(include_hidden))
                logging.debug(fn_name + "STATS: Job started at " + str(self.process_tasks_job.job_start_timestamp) +
                    "\n    Worker started at " + str(start_time) +
                    "\n    " + summary_msg +
                    "\n    " + breakdown_msg +
                    "\n    " + proc_time_str +
                    "\n    " + included_options_str)
                logservice.flush()

                usage_stats = model.UsageStats(
                    user_hash = hash(self.user_email),
                    number_of_tasks = self.process_tasks_job.total_progress,
                    number_of_tasklists = total_num_tasklists,
                    tasks_per_tasklist = tasks_per_list,
                    include_completed = include_completed,
                    include_deleted = include_deleted,
                    include_hidden = include_hidden,
                    start_time = start_time,
                    processing_time = processing_time)
                usage_stats.put()
                logging.debug(fn_name + "Saved stats")
                logservice.flush()
            except Exception, e:
                logging.exception("Error saving stats")
                logservice.flush()
                # Don't bother doing anything else, because stats aren't critical

        except apiproxy_errors.RequestTooLargeError, e:
            logging.exception(fn_name + "Error putting results in DB - Request too large")
            logservice.flush()
            self.process_tasks_job.status = constants.ExportJobStatus.ERROR
            self.process_tasks_job.message = ''
            self.process_tasks_job.error_message = "Tasklists data is too large - Unable to store tasklists in DB: " + shared.get_exception_msg(e)
            self.process_tasks_job.job_progress_timestamp = datetime.datetime.now()
            self._log_progress("apiproxy_errors.RequestTooLargeError")
            self.process_tasks_job.put()
def get_tasks_in_tasklist(self, tasklist_title, tasklist_id, include_hidden, include_completed, include_deleted):
    """ Returns all the tasks in the tasklist

        arguments:
            tasklist_title    -- Name of the tasklist
            tasklist_id       -- ID used to retrieve tasks from this tasklist
                                 MUST match the ID returned in the tasklist data
            include_hidden    -- If true, include hidden tasks in the backup
            include_completed -- If true, include completed tasks in the backup
            include_deleted   -- If true, include deleted tasks in the backup

        returns a tuple;
            two-element dictionary;
                'title' is a string, the name of the tasklist
                'tasks' is a list. Each element in the list is dictionary representing 1 task
            number of tasks
    """
    fn_name = "CreateBackupHandler.get_tasks_in_tasklist(): "

    tasklist_dict = {}  # Blank dictionary for this tasklist
    tasklist_dict[u"title"] = tasklist_title  # Store the tasklist name in the dictionary
    tasklist_dict[u"id"] = tasklist_id  # Store the tasklist ID in the dictionary

    num_tasks = 0

    more_tasks_data_to_retrieve = True
    next_tasks_page_token = None

    # Keep track of when last updated, to prevent excessive DB access which could exceed quota
    prev_progress_timestamp = datetime.datetime.now()

    if self.is_test_user and settings.DUMP_DATA:
        logging.debug(
            fn_name
            + "include_hidden = "
            + str(include_hidden)
            + ", include_completed = "
            + str(include_completed)
            + ", include_deleted = "
            + str(include_deleted)
        )
        logservice.flush()

    # ---------------------------------------------------------------------------
    # Retrieve the tasks in this tasklist, and store as "tasks" in the dictionary
    # ---------------------------------------------------------------------------
    while more_tasks_data_to_retrieve:

        # NOTE(review): this method uses constants.NUM_API_RETRIES, whereas the sibling
        # _get_tasks_in_tasklist uses settings.NUM_API_TRIES — confirm which is current.
        retry_count = constants.NUM_API_RETRIES
        while retry_count > 0:
            try:
                # Retrieve a page of (up to 100) tasks
                if next_tasks_page_token:
                    # Get the next page of results
                    # This happens if there are more than 100 tasks in the list
                    # See http://code.google.com/apis/tasks/v1/using.html#api_params
                    # "Maximum allowable value: maxResults=100"
                    tasks_data = self.tasks_svc.list(
                        tasklist=tasklist_id,
                        pageToken=next_tasks_page_token,
                        showHidden=include_hidden,
                        showCompleted=include_completed,
                        showDeleted=include_deleted,
                    ).execute()
                else:
                    # Get the first (or only) page of results for this tasklist
                    tasks_data = self.tasks_svc.list(
                        tasklist=tasklist_id,
                        showHidden=include_hidden,
                        showCompleted=include_completed,
                        showDeleted=include_deleted,
                    ).execute()
                # Succeeded, so continue
                break
            except Exception, e:
                retry_count = retry_count - 1
                if retry_count > 0:
                    logging.warning(fn_name + "Error retrieving tasks, " + str(retry_count) + " retries remaining")
                    logservice.flush()
                    # Last chances - sleep to give the server some extra time before re-requesting
                    if retry_count <= 2:
                        logging.debug(
                            fn_name
                            + "Sleeping for "
                            + str(settings.WORKER_API_RETRY_SLEEP_DURATION)
                            + " seconds before retrying"
                        )
                        logservice.flush()
                        time.sleep(settings.WORKER_API_RETRY_SLEEP_DURATION)
                else:
                    logging.exception(
                        fn_name
                        + "Still error retrieving tasks for tasklist after "
                        + str(constants.NUM_API_RETRIES)
                        + " retries. Giving up"
                    )
                    logservice.flush()
                    raise e

        if self.is_test_user and settings.DUMP_DATA:
            logging.debug(fn_name + "tasks_data ==>")
            logging.debug(tasks_data)

        if not tasks_data.has_key(u"items"):
            # When using the Google Tasks webpage at https://mail.google.com/tasks/canvas, there will always
            # be at least one task in any tasklist, because when deleting the last task, a new blank task is
            # automatically created.
            # However, a third-party app (e.g., Calengoo on Android) CAN delete all the tasks in a task list,
            # which results in a tasklist without an 'items' element.
            logging.debug(fn_name + "No tasks in tasklist")
            logservice.flush()
        else:
            try:
                tasks = tasks_data[u"items"]  # Store all the tasks (List of Dict)
            except Exception, e:
                # NOTE(review): fn_name is passed as a separate argument here (comma instead
                # of '+'), so logging treats it as the format string — probably unintended.
                logging.exception(fn_name, "Exception extracting items from tasks_data.")
                # logging.error(tasks_data)
                logservice.flush()
                raise e

            # if self.is_test_user and settings.DUMP_DATA:
            #     logging.debug(fn_name + "tasks ==>")
            #     logging.debug(tasks)
            #     logservice.flush()

            # ------------------------------------------------------------------------------------------------
            # Fix date/time format for each task, so that the date/time values can be used in Django templates
            # Convert the yyyy-mm-ddThh:mm:ss.dddZ format to a datetime object, and store that.
            # There have been occassional format errors in the 'completed' property,
            #     due to 'completed' value such as "-1701567-04-26T07:12:55.000Z"
            # According to http://docs.python.org/library/datetime.html
            #       "The exact range of years for which strftime() works also varies across platforms.
            #        Regardless of platform, years before 1900 cannot be used."
            # so if any date/timestamp value is invalid, set the property to '1900-01-01 00:00:00'
            # NOTE: Sometimes a task has a completion date of '0000-01-01T00:00:00.000Z', which also cannot
            #       be converted to datetime, because the earliest allowable datetime year is 0001
            # ------------------------------------------------------------------------------------------------
            for t in tasks:
                num_tasks = num_tasks + 1

                date_due = t.get(u"due")
                if date_due:
                    try:
                        # 'due' carries a date only; the time portion is always zero
                        new_due_date = datetime.datetime.strptime(date_due, "%Y-%m-%dT00:00:00.000Z").date()
                    except ValueError, e:
                        new_due_date = datetime.date(1900, 1, 1)
                        logging.warning(
                            fn_name
                            + "Invalid 'due' timestamp ("
                            + str(date_due)
                            + "), so using "
                            + str(new_due_date)
                            + ": "
                            + shared.get_exception_msg(e)
                        )
                        logservice.flush()
                    t[u"due"] = new_due_date

                datetime_updated = t.get(u"updated")
                if datetime_updated:
                    try:
                        new_datetime_updated = datetime.datetime.strptime(
                            datetime_updated, "%Y-%m-%dT%H:%M:%S.000Z"
                        )
                    except ValueError, e:
                        new_datetime_updated = datetime.datetime(1900, 1, 1, 0, 0, 0)
                        logging.warning(
                            fn_name
                            + "Invalid 'updated' timestamp ("
                            + str(datetime_updated)
                            + "), so using "
                            + str(new_datetime_updated)
                            + ": "
                            + shared.get_exception_msg(e)
                        )
                        logservice.flush()
                    t[u"updated"] = new_datetime_updated
def _get_tasks_in_tasklist(self, tasklist_title, tasklist_id, include_hidden, include_completed, include_deleted):
    """ Returns all the tasks in the tasklist

        arguments:
            tasklist_title    -- Name of the tasklist
            tasklist_id       -- ID used to retrieve tasks from this tasklist
                                 MUST match the ID returned in the tasklist data
            include_hidden    -- If true, include hidden tasks in the backup
            include_completed -- If true, include completed tasks in the backup
            include_deleted   -- If true, include deleted tasks in the backup

        returns a tuple;
            two-element dictionary;
                'title' is a string, the name of the tasklist
                'tasks' is a list. Each element in the list is dictionary representing 1 task
            number of tasks
    """
    fn_name = "CreateBackupHandler._get_tasks_in_tasklist(): "

    tasklist_dict = {}  # Blank dictionary for this tasklist
    tasklist_dict[u'title'] = tasklist_title  # Store the tasklist name in the dictionary
    tasklist_dict[u'id'] = tasklist_id  # Store the tasklist ID in the dictionary

    num_tasks = 0

    more_tasks_data_to_retrieve = True
    next_tasks_page_token = None

    # Keep track of when last updated, to prevent excessive DB access which could exceed quota
    prev_progress_timestamp = datetime.datetime.now()

    if self.is_test_user and settings.DUMP_DATA:
        logging.debug(fn_name + "TEST: include_completed = " + str(include_completed) +
            ", include_hidden = " + str(include_hidden) +
            ", include_deleted = " + str(include_deleted))
        logservice.flush()

    # ---------------------------------------------------------------------------
    # Retrieve the tasks in this tasklist, and store as "tasks" in the dictionary
    # ---------------------------------------------------------------------------
    while more_tasks_data_to_retrieve:

        # Retry loop for the tasks.list() API call
        retry_count = settings.NUM_API_TRIES
        while retry_count > 0:
            try:
                # Retrieve a page of (up to 100) tasks
                if next_tasks_page_token:
                    # Get the next page of results
                    # This happens if there are more than 100 tasks in the list
                    # See http://code.google.com/apis/tasks/v1/using.html#api_params
                    # "Maximum allowable value: maxResults=100"
                    tasks_data = self.tasks_svc.list(tasklist = tasklist_id,
                                                     pageToken=next_tasks_page_token,
                                                     showHidden=include_hidden,
                                                     showCompleted=include_completed,
                                                     showDeleted=include_deleted).execute()
                else:
                    # Get the first (or only) page of results for this tasklist
                    tasks_data = self.tasks_svc.list(tasklist = tasklist_id,
                                                     showHidden=include_hidden,
                                                     showCompleted=include_completed,
                                                     showDeleted=include_deleted).execute()
                # Succeeded, so continue
                break
            except Exception, e:
                retry_count = retry_count - 1
                if retry_count > 0:
                    logging.warning(fn_name + "Error retrieving tasks, " + str(retry_count) +
                        " attempts remaining: " + shared.get_exception_msg(e))
                    logservice.flush()
                    # Last chances - sleep to give the server some extra time before re-requesting
                    if retry_count <= 2:
                        logging.debug(fn_name + "Giving server an extra chance; Sleeping for " +
                            str(settings.WORKER_API_RETRY_SLEEP_DURATION) + " seconds before retrying")
                        logservice.flush()
                        # Update job_progress_timestamp so that job doesn't time out
                        self.process_tasks_job.job_progress_timestamp = datetime.datetime.now()
                        self.process_tasks_job.put()
                        time.sleep(settings.WORKER_API_RETRY_SLEEP_DURATION)
                else:
                    logging.exception(fn_name + "Still error retrieving tasks for tasklist after " +
                        str(settings.NUM_API_TRIES) + " attempts. Giving up")
                    logservice.flush()
                    raise e

        if self.is_test_user and settings.DUMP_DATA:
            logging.debug(fn_name + "tasks_data ==>")
            logging.debug(tasks_data)

        if not tasks_data.has_key(u'items'):
            # When using the Google Tasks webpage at https://mail.google.com/tasks/canvas, there will always
            # be at least one task in any tasklist, because when deleting the last task, a new blank task is
            # automatically created.
            # However, a third-party app (e.g., Calengoo on Android) CAN delete all the tasks in a task list,
            # which results in a tasklist without an 'items' element.
            logging.debug(fn_name + "No tasks in tasklist")
            logservice.flush()
        else:
            try:
                tasks = tasks_data[u'items']  # Store all the tasks (List of Dict)
            except Exception, e:
                logging.exception(fn_name, "Exception extracting items from tasks_data: " + shared.get_exception_msg(e))
                #logging.error(tasks_data)
                logservice.flush()
                raise e

            # if self.is_test_user and settings.DUMP_DATA:
            #     logging.debug(fn_name + "tasks ==>")
            #     logging.debug(tasks)
            #     logservice.flush()

            for t in tasks:
                num_tasks = num_tasks + 1

                # TODO: Investigate if including this will cause memory to be exceeded for very large tasks list
                # Store original RFC-3339 timestamps (used for raw2 export format)
                if t.has_key('due'):
                    t['due_RFC3339'] = t['due']
                if t.has_key('updated'):
                    t['updated_RFC3339'] = t['updated']
                if t.has_key('completed'):
                    t['completed_RFC3339'] = t['completed']

                # Converts the RFC-3339 string returned by the server to a date or datetime object
                # so that other methods (such as Django templates) can display a custom formatted date
                shared.set_timestamp(t, u'due', date_only=True)
                shared.set_timestamp(t, u'updated')
                shared.set_timestamp(t, u'completed')

            if tasklist_dict.has_key(u'tasks'):
                # This is the n'th page of task data for this tasklist, so extend the existing list of tasks
                tasklist_dict[u'tasks'].extend(tasks)
            else:
                # This is the first (or only) list of task for this tasklist
                tasklist_dict[u'tasks'] = tasks
shared.get_exception_msg(e)) except Exception, e: pass # Try the next format # logging.debug(fn_name + "DEBUG: Unable to parse '" + str(datetime_string) + # "' as '" + timestamp_format + "': " + shared.get_exception_msg(e)) # Could not parse datetime_string with any of the supplied format strings try: logging.info(fn_name + constants.INVALID_FORMAT_LOG_LABEL + "Unable to parse '" + str(field_name) + "' value '" + str(datetime_string) + "' as a datetime using any of the supplied formats") except Exception, e: # Just in case logging the datetime string causes an exception logging.info(fn_name + constants.INVALID_FORMAT_LOG_LABEL + "Unable to parse '" + str(field_name) + "' datetime string as a datetime using any of the supplied formats, and unable to log datetime string: " + shared.get_exception_msg(e)) return None except Exception, e: # Major error! try: logging.error(fn_name + "Error attempting to parse '" + str(field_name) + "' value '" + str(datetime_string) + "': " + shared.get_exception_msg(e)) except Exception, e: # Just in case logging the datetime string causes an exception logging.error(fn_name + "Error attempting to parse '" + str(field_name) + "'datetime string, and unable to log datetime string: " + shared.get_exception_msg(e)) return None