def _fetch_with_retries(url, attempts=3):
    """Fetch *url* with ``urlfetch``, retrying when the download fails.

    Returns a ``(result, err)`` pair. ``result`` is the urlfetch response, or
    ``None`` when every attempt raised ``urlfetch.DownloadError``; ``err`` is
    the last ``DownloadError`` in that case and ``None`` otherwise.
    """
    err = None
    for _ in range(attempts):
        try:
            return urlfetch.fetch(url=url), None
        except urlfetch.DownloadError as msg:
            err = msg
    return None, err


def _find_thread_id(references, thread_id_cache):
    """Return the thread id of the first known referenced message, or None.

    Every reference is checked against the in-memory cache before any
    datastore query is issued, so messages seen earlier in the same run are
    resolved without a lookup.
    """
    for reference in references:
        if reference in thread_id_cache:
            return thread_id_cache[reference]
    for reference in references:
        ref = gql_limit1(Message, message_id=reference)
        if ref is not None:
            return ref.thread_id
    return None


def _new_thread(list, result, thread_id):
    """Build a Thread entity seeded from the parsed message *result*."""
    return Thread(thread_id=thread_id,
                  list_url=list.list_url,
                  last_message_time=result['date'],
                  subject=result['subject'],
                  last_message_body=result['body'],
                  participants=[result['sender']])


def update_list(list, start_msg, limit = 10):
    """Fetch and parse up to *limit* archived messages of a mailing list.

    Messages are downloaded from ``<list.list_url>/msg<NNNNN>.html`` for
    consecutive message numbers starting at *start_msg*. Each page is parsed
    with ``email_loader.parser``, assigned to an existing thread (through its
    ``references``) or to a newly created one, and collected in local pools.

    Args:
        list: object exposing a ``list_url`` attribute (the archive base URL).
        start_msg: number of the first message to fetch.
        limit: maximum number of consecutive messages to attempt.

    The loop stops early when a download fails on every retry, when the
    server answers 404 (no such message) or any other non-200 status, or when
    the parsed result lacks one of the required fields.

    NOTE(review): the visible code ends inside the loop; persisting
    ``message_pool`` / ``thread_pool`` and reporting ``new`` / ``err``
    presumably happens in code past this point -- confirm.
    """
    import email_loader
    import urllib  # not used in the visible code; kept in case later code needs it

    required_fields = ['date', 'message_id', 'subject', 'body', 'sender']
    thread_id_cache = {}   # message_id -> thread_id for messages seen this run
    thread_pool = {}       # thread_id -> Thread touched during this run
    message_pool = []      # Message objects built during this run
    new = 0
    err = None

    for i in range(start_msg, start_msg + limit):
        url = "%s/msg%05i.html" % (list.list_url, i)
        logging.info("loading an email from url %s", url)

        # Try three times to fetch the URL
        result, err = _fetch_with_retries(url, attempts=3)
        if result is None:
            break
        if result.status_code == 404:
            # This message does not exist, so return messages collected so far.
            break
        elif result.status_code != 200:
            err = 'Got status code %i while trying to fetch %s' % (
                result.status_code, url)
            break

        logging.info("got content: %s", result.content)
        result = email_loader.parser(StringIO(result.content))
        logging.info("got result: %s", result)
        result['source_url'] = url
        result['list_msg_id'] = i

        if not all((field in result) for field in required_fields):
            logging.error(
                "failed to update list %s msg %i with url %s; "
                "got bad parse result %s", list, i, url, result)
            break

        # The last whitespace-separated token of the date header is dropped
        # before parsing, so the stored datetime is naive.
        result['date'] = datetime.strptime(
            ' '.join(result['date'].split()[:-1]),
            '%a, %d %b %Y %H:%M:%S')

        # Determine thread_id
        if 'thread_id' not in result and 'references' in result:
            found = _find_thread_id(result['references'], thread_id_cache)
            if found is not None:
                result['thread_id'] = found

        # If no thread_id was found, start a new thread with this message
        if 'thread_id' not in result:
            result['thread_id'] = result['message_id']
            thread_pool[result['thread_id']] = _new_thread(
                list, result, result['message_id'])

        # Build the message object
        message = Message(**result)

        # Update the thread
        thread_id = result['thread_id']
        if thread_id not in thread_pool:
            thread = gql_limit1(Thread, thread_id=thread_id)
            if thread is None:
                # A referenced message points at a thread that cannot be
                # loaded; rebuild it from this message instead of failing
                # with AttributeError on None below.
                logging.warning(
                    "thread %s not found for msg %i; recreating it",
                    thread_id, i)
                thread = _new_thread(list, result, thread_id)
            thread_pool[thread_id] = thread
        thread = thread_pool[thread_id]
        thread.last_message_time = message.date
        thread.last_message_body = message.body
        if message.sender not in thread.participants:
            thread.participants.append(message.sender)
        thread_id_cache[result['message_id']] = thread_id

        # Update the message_pool
        message_pool.append(message)
        new += 1