def _unquotevalue(value):
    """Unquote a parameter value, leaving RFC 2231 triples intact.

    Unlike utils.collapse_rfc2231_value() no attempt is made to convert
    the value to unicode: Message.get_param() and Message.get_params()
    are both defined to return the (charset, language, value) tuple in
    the face of RFC 2231 parameters.
    """
    if not isinstance(value, tuple):
        return utils.unquote(value)
    charset, language, quoted = value
    return charset, language, utils.unquote(quoted)
def from_mbox(self, mbfile): """ Upload all the emails in a mbox file into the database using kittystore API. :arg mbfile, a mailbox file from which the emails are extracted and upload to the database. :arg list_name, the fully qualified list name. """ cnt_imported = 0 cnt_read = 0 for message in mailbox.mbox(mbfile): if self.since: date = message["date"] if date: try: date = awarify(parse(date)) except ValueError, e: print "Can't parse date string in message %s: %s" \ % (message["message-id"], date) print e continue if date < self.since: continue cnt_read = cnt_read + 1 self.total_imported += 1 if self.verbose: print "%s (%d)" % (message["Message-Id"], self.total_imported) # Un-wrap the subject line if necessary if message["subject"]: message.replace_header( "subject", TEXTWRAP_RE.sub(" ", message["subject"])) # Try to find the mailing-list subject prefix in the first email if cnt_read == 1: subject_prefix = PREFIX_RE.search(message["subject"]) if subject_prefix: self.mlist.display_name = unicode(subject_prefix.group(1)) if self.force_import: while self.store.is_message_in_list( self.mlist.fqdn_listname, unquote(message["Message-Id"])): oldmsgid = message["Message-Id"] message.replace_header( "Message-Id", "<%s-%s>" % (unquote(message["Message-Id"]), str(randint(0, 100)))) print( "Found duplicate, changing message id from %s to %s" % (oldmsgid, message["Message-Id"])) # Parse message to search for attachments try: attachments = self.extract_attachments(message) except DownloadError, e: print( "Could not download one of the attachments! " "Skipping this message. Error: %s" % e.args[0]) continue
def from_mbox(self, mbfile):
    """ Upload all the emails in a mbox file into the database using
    kittystore API.

    :arg mbfile, a mailbox file from which the emails are extracted and
    upload to the database.
    :arg list_name, the fully qualified list name.
    """
    # Wrap the search index so writes are batched/delayed during bulk import.
    self.store.search_index = make_delayed(self.store.search_index)
    cnt_imported = 0
    cnt_read = 0
    for message in mailbox.mbox(mbfile):
        if self.since:
            date = message["date"]
            if date:
                try:
                    date = awarify(parse(date))
                except ValueError, e:
                    print "Can't parse date string in message %s: %s" \
                            % (message["message-id"], date)
                    print e
                    continue
                # Skip messages older than the requested start date
                if date < self.since:
                    continue
        cnt_read = cnt_read + 1
        self.total_imported += 1
        if self.verbose:
            print "%s (%d)" % (message["Message-Id"], self.total_imported)
        # Un-wrap the subject line if necessary
        if message["subject"]:
            message.replace_header("subject",
                                   TEXTWRAP_RE.sub(" ", message["subject"]))
        # Try to find the mailing-list subject prefix in the first email
        # that has one; only set it once.
        if not self.mlist.subject_prefix and message["subject"]:
            subject_prefix = PREFIX_RE.search(message["subject"])
            if subject_prefix:
                self.mlist.subject_prefix = unicode(subject_prefix.group(1))
        if self.force_import:
            # Rewrite colliding Message-Ids until unique in the list.
            while self.store.is_message_in_list(
                    self.mlist.fqdn_listname,
                    unquote(message["Message-Id"])):
                oldmsgid = message["Message-Id"]
                message.replace_header("Message-Id", "<%s-%s>"
                        % (unquote(message["Message-Id"]),
                           str(randint(0, 100))))
                print("Found duplicate, changing message id from %s to %s"
                      % (oldmsgid, message["Message-Id"]))
        # Parse message to search for attachments
        try:
            attachments = self.extract_attachments(message)
        except DownloadError, e:
            print ("Could not download one of the attachments! "
                   "Skipping this message. Error: %s" % e.args[0])
            continue
def from_mbox(self, mbfile): """ Upload all the emails in a mbox file into the database using kittystore API. :arg mbfile, a mailbox file from which the emails are extracted and upload to the database. :arg list_name, the fully qualified list name. """ cnt_imported = 0 cnt_read = 0 for message in mailbox.mbox(mbfile): cnt_read = cnt_read + 1 self.total_imported += 1 # Un-wrap the subject line if necessary if message["subject"]: message.replace_header("subject", TEXTWRAP_RE.sub(" ", message["subject"])) # Try to find the mailing-list subject prefix in the first email if cnt_read == 1: subject_prefix = PREFIX_RE.search(message["subject"]) if subject_prefix: self.mlist.display_name = unicode(subject_prefix.group(1)) if self.force_import: while self.store.is_message_in_list( self.mlist.fqdn_listname, unquote(message["Message-Id"])): oldmsgid = message["Message-Id"] message.replace_header("Message-Id", "<%s-%s>" % (unquote(message["Message-Id"]), str(randint(0, 100)))) print("Found duplicate, changing message id from %s to %s" % (oldmsgid, message["Message-Id"])) # Parse message to search for attachments try: attachments = self.extract_attachments(message) except DownloadError, e: print ("Could not download one of the attachments! " "Skipping this message. Error: %s" % e.args[0]) continue # Now insert the message try: self.store.add_to_list(self.mlist, message) except ValueError, e: if len(e.args) != 2: raise # Regular ValueError exception print "%s from %s about %s" % (e.args[0], e.args[1].get("From"), e.args[1].get("Subject")) continue
def from_mbox(self, mbfile):
    """ Insert all the emails contained in an mbox file into the database.

    :arg mbfile: a mailbox file
    """
    mbox = mailbox.mbox(mbfile)
    progress_marker = ProgressMarker(self.verbose, self.stdout)
    if not self.since:
        # Only when importing everything do we know the total up front;
        # len(mbox) scans the file, so skip it for filtered imports.
        progress_marker.total = len(mbox)
    for message in mbox:
        if self._is_too_old(message):
            continue
        progress_marker.tick(message["Message-Id"])
        # Un-wrap the subject line if necessary
        if message["subject"]:
            message.replace_header(
                "subject", TEXTWRAP_RE.sub(" ", message["subject"]))
        # Preserve the original mbox "From " envelope line, if any.
        if message.get_from():
            message.set_unixfrom(message.get_from())
        # Now insert the message; each insert is its own transaction so a
        # failure rolls back only that message.
        try:
            with transaction.atomic():
                add_to_list(self.list_address, message)
        except DuplicateMessage as e:
            if self.verbose:
                self.stderr.write("Duplicate email with message-id '%s'"
                                  % e.args[0])
            continue
        except ValueError as e:
            self.stderr.write("Failed adding message %s: %s"
                              % (message.get("Message-ID"), e))
            if len(e.args) != 2:
                raise  # Regular ValueError exception
            # Two-arg ValueError: (reason, offending message) -- report it,
            # tolerating undecodable headers.
            try:
                self.stderr.write("%s from %s about %s"
                                  % (e.args[0], e.args[1].get("From"),
                                     e.args[1].get("Subject")))
            except UnicodeDecodeError:
                pass
            continue
        except DatabaseError:
            try:
                print_exc(file=self.stderr)
            except UnicodeError:
                pass
            self.stderr.write("Message %s failed to import, skipping"
                              % unquote(message["Message-Id"]))
            continue
        email = Email.objects.get(mailinglist__name=self.list_address,
                                  message_id=get_message_id(message))
        # # Commit every time to be able to rollback on error
        # if not transaction.get_autocommit():
        #     transaction.commit()
        # Store the list of impacted threads to be able to compute the
        # thread_order and thread_depth values
        self.impacted_thread_ids.add(email.thread_id)
        progress_marker.count_imported += 1
    # self.store.search_index.flush()  # Now commit to the search index
    progress_marker.finish()
def parse_header_value(header: str) -> Tuple[str, Dict[str, str]]:
    """
    Parse an HTTP header value.

    Parameter values will be unquoted.
    If the key ends with an asterisk (``*``), the asterisk is removed from
    the key name and the value is then decoded according to :rfc:`2231`.

    :param header: the raw header value, e.g. ``text/html; charset="utf-8"``
    :return: a tuple of (main value, params dict)
    """
    assert check_argument_types()
    # partition() returns (head, ';', tail); [::2] keeps just (head, tail).
    main_value, params_str = header.partition(';')[::2]
    params = {}
    for match in header_param_re.finditer(params_str):
        key, value = match.groups()
        value = unquote(value)
        if key.endswith('*'):
            # RFC 2231 extended parameter: value is charset'language'encoded;
            # decode_rfc2231 returns (charset, language, value) -- [::2]
            # drops the language part.
            key = key[:-1]
            encoding, value = decode_rfc2231(value)[::2]
            value = urllib_unquote(value, encoding)
        params[key] = value

    return main_value.rstrip(), params
def sender(self):
    """Return (display name, address) parsed from the "from" header,
    RFC 2047-decoding the display name when it is MIME-encoded."""
    raw = self._get("from")
    name, addr = parseaddr(raw)
    unquoted = unquote(name)
    if unquoted.startswith("=?"):
        # Encoded-word display name, e.g. '=?utf-8?q?...?=' -- decode it.
        name = self._decode(unquoted)
    return name, addr
def __init__(self, attachment, encoding):
    """Wrap an attachment for normalized access.

    *attachment* is either a (filename, content, mimetype) tuple -- where
    filename and mimetype may be missing -- or a MIMEBase object.
    """
    self._attachment = attachment
    self.encoding = encoding  # should we be checking attachment["Content-Encoding"] ???
    self.inline = False
    self.content_id = None
    self.cid = ""
    if not isinstance(attachment, MIMEBase):
        (self.name, self.content, self.mimetype) = attachment
    else:
        self.name = attachment.get_filename()
        self.content = attachment.get_payload(decode=True)
        self.mimetype = attachment.get_content_type()
        if get_content_disposition(attachment) == 'inline':
            self.inline = True
            self.content_id = attachment["Content-ID"]  # probably including the <...>
            if self.content_id is not None:
                self.cid = unquote(self.content_id)  # without the <, >
    # Guess missing mimetype from filename, borrowed from
    # django.core.mail.EmailMessage._create_attachment()
    if self.mimetype is None and self.name is not None:
        self.mimetype, _ = mimetypes.guess_type(self.name)
    if self.mimetype is None:
        self.mimetype = DEFAULT_ATTACHMENT_MIME_TYPE
def get(self):
    """Look up the (URL-unquoted, stripped) 'ref' request argument."""
    args = self.parser.parse_args()
    ref = unquote(args['ref']).strip()
    return integrated_lookup(
        ref,
        return_all=args.get('getall', False),
        return_both=args.get('getboth', False),
    )
def __init__(self, attachment, encoding):
    """Wrap an attachment for normalized access."""
    # Note that an attachment can be either a tuple of (filename, content, mimetype)
    # or a MIMEBase object. (Also, both filename and mimetype may be missing.)
    self._attachment = attachment
    self.encoding = encoding  # should we be checking attachment["Content-Encoding"] ???
    self.inline = False
    self.content_id = None
    self.cid = ""
    if isinstance(attachment, MIMEBase):
        self.name = attachment.get_filename()
        self.content = attachment.get_payload(decode=True)
        self.mimetype = attachment.get_content_type()
        if get_content_disposition(attachment) == 'inline':
            self.inline = True
            self.content_id = attachment[
                "Content-ID"]  # probably including the <...>
            if self.content_id is not None:
                self.cid = unquote(self.content_id)  # without the <, >
    else:
        (self.name, self.content, self.mimetype) = attachment
    # Guess missing mimetype from filename, borrowed from
    # django.core.mail.EmailMessage._create_attachment()
    if self.mimetype is None and self.name is not None:
        self.mimetype, _ = mimetypes.guess_type(self.name)
    if self.mimetype is None:
        self.mimetype = DEFAULT_ATTACHMENT_MIME_TYPE
def from_mbox(self, mbfile): """ Insert all the emails contained in an mbox file into the database. :arg mbfile: a mailbox file """ #self.store.search_index = make_delayed(self.store.search_index) mbox = mailbox.mbox(mbfile) progress_marker = ProgressMarker(self.verbose, self.stdout) if not self.since: progress_marker.total = len(mbox) for message in mbox: if self._is_too_old(message): continue progress_marker.tick(message["Message-Id"]) # Un-wrap the subject line if necessary if message["subject"]: message.replace_header("subject", TEXTWRAP_RE.sub(" ", message["subject"])) # Now insert the message try: with transaction.atomic(): add_to_list(self.list_address, message) except DuplicateMessage as e: if self.verbose: self.stderr.write( "Duplicate email with message-id '%s'" % e.args[0]) continue except ValueError as e: if len(e.args) != 2: raise # Regular ValueError exception try: self.stderr.write("%s from %s about %s" % (e.args[0], e.args[1].get("From"), e.args[1].get("Subject"))) except UnicodeDecodeError: self.stderr.write("%s with message-id %s" % (e.args[0], e.args[1].get("Message-ID"))) continue except DatabaseError: try: print_exc(file=self.stderr) except UnicodeError: pass self.stderr.write("Message %s failed to import, skipping" % unquote(message["Message-Id"])) continue email = Email.objects.get( mailinglist__name=self.list_address, message_id=get_message_id(message)) ## Commit every time to be able to rollback on error #if not transaction.get_autocommit(): # transaction.commit() # Store the list of impacted threads to be able to compute the # thread_order and thread_depth values self.impacted_thread_ids.add(email.thread_id) progress_marker.count_imported += 1 #self.store.search_index.flush() # Now commit to the search index progress_marker.finish()
def attach_inline_image(message, content, filename=None, subtype=None,
                        idstring="img", domain=None):
    """Add inline image to an EmailMessage, and return its content id.

    :param message: the message to attach the image to (anything with
        an ``attach()`` method, e.g. django EmailMessage)
    :param content: the raw image data
    :param filename: optional filename for the Content-Disposition header
    :param subtype: image subtype (e.g. "png"); guessed from the data if None
    :param idstring: prefix for the generated Content-ID
    :param domain: domain for the generated Content-ID; defaults to a
        safe placeholder (see below)
    :return: the content id *without* the angle brackets, ready to be used
        as an ``<img src="cid:...">`` value
    """
    if domain is None:
        # CONSISTENCY FIX (matches the later revision of this helper):
        # avoid defaulting to a hostname that might end in '.com', because
        # some ESPs use the Content-ID as a filename, and Gmail blocks
        # filenames ending in '.com'.
        domain = 'inline'  # valid domain for a msgid; will never be a real TLD
    content_id = make_msgid(idstring, domain)  # Content ID per RFC 2045 section 7 (with <...>)
    image = MIMEImage(content, subtype)
    image.add_header('Content-Disposition', 'inline', filename=filename)
    image.add_header('Content-ID', content_id)
    message.attach(image)
    return unquote(content_id)  # Without <...>, for use as the <img> tag src
def doc_from_bytes(docid, rdkey, b):
    """Build a raindrop document dict (headers + attachments) from the raw
    message bytes *b*."""
    msg = message_from_string(b)
    doc = {}
    mp = doc['multipart'] = msg.is_multipart()
    headers = doc['headers'] = {}
    # Given we have no opportunity to introduce an object which can ignore
    # the case of headers, we lowercase the keys
    for hn in msg.keys():
        vals = msg.get_all(hn)
        if vals:
            # first do any charset etc conversion...
            vals = [_safe_convert_header(v) for v in vals]
            if hn.lower() == 'references':
                # email.utils.unquote will do bad things to references headers
                # (stripping initial and trailing <>'s), so we don't want to
                # use it for the references header -- but other fields seem
                # ok.  We split the references into a list here because why not.
                headers[hn.lower()] = [extract_message_ids(vals[0])]
            else:
                headers[hn.lower()] = [unquote(v) for v in vals]
            # a sanity check and to help debug an obscure bug which seemed to
            # cause the wrong 'source' doc being passed!
            if __debug__ and rdkey[0] == 'email' and hn.lower() == 'message-id':
                from raindrop.proto.imap import get_rdkey_for_email
                assert tuple(rdkey) == get_rdkey_for_email(vals[0]), (rdkey, docid, vals)
    # XXX - technically msg objects are recursive; handling that requires
    # more thought.  For now, assume they are flat.
    # We must return non-text parts in attachments, so just return
    # *everything* in attachments.
    attachments = doc['_attachments'] = {}
    if mp:
        # a multi-part message - flatten it here by walking the list, but
        # only looking at the 'leaf' nodes.
        # attachments have lost their order; this object helps keep the
        # order and is a convenient place to stash other headers coming
        # with this part.
        mi = doc['multipart_info'] = []
        i = 1
        for attach in msg.walk():
            if not attach.is_multipart():
                name = sanitize_attach_name(attach.get_filename())
                if not name:
                    # unnamed part: synthesize a stable name
                    name = "subpart-%d" % i
                    i += 1
                attachments[name] = attach_from_msg((docid, name), attach)
                # Put together info about the attachment.
                ah = {}
                for hn, hv in attach.items():
                    ah[hn.lower()] = _safe_convert_header(hv)
                # content-type is redundant, but may be helpful...
                ct = attachments[name]['content_type']
                info = {'name': name, 'headers': ah, 'content_type': ct}
                mi.append(info)
    else:
        attachments['body'] = attach_from_msg((docid, 'body'), msg)
    return doc
def test_email_from_html(self):
    """Inline data: images in HTML become MIME attachments referenced by
    cid: URLs in the HTML alternative."""
    from nimodipine.management.commands.send_messages import inline_images
    from django.core.mail import EmailMultiAlternatives
    msg = EmailMultiAlternatives(subject="foo")
    html = 'some <b>html</b> and stuff <img src="data:image/png;base64,cafe"> ting'
    msg = inline_images(msg, html)
    attachment = msg.attachments[0]
    # The base64 payload is carried through verbatim
    self.assertEqual(attachment.get_payload(), "cafe")
    # The <img> tag must now reference the attachment's (unquoted) content id
    cid = unquote(attachment.get("content-id"))
    self.assertIn('<img src="cid:{}">'.format(cid), msg.alternatives[0][0])
def _get_date(self, message, header, report_name):
    """Return the *header* date header of *message*, or None when reading
    it raises; the failure is reported to stderr in verbose mode."""
    try:
        return message.get(header)
    except (TypeError, ValueError) as e:
        if self.verbose:
            self.stderr.write(
                "Can't get {} header in message {}{}: {}.".format(
                    header, unquote(message.get("message-id", 'n/a')),
                    report_name, e))
        return None
def attach_inline_image(message, content, filename=None, subtype=None,
                        idstring="img", domain=None):
    """Attach *content* as an inline image to *message* and return its
    content id without the <...> brackets, ready for an <img> src."""
    if domain is None:
        # Avoid defaulting to hostname that might end in '.com', because some ESPs
        # use Content-ID as filename, and Gmail blocks filenames ending in '.com'.
        domain = 'inline'  # valid domain for a msgid; will never be a real TLD
    # Content ID per RFC 2045 section 7 (with <...>)
    cid = make_msgid(idstring, domain)
    part = MIMEImage(content, subtype)
    part.add_header('Content-Disposition', 'inline', filename=filename)
    part.add_header('Content-ID', cid)
    message.attach(part)
    # Without <...>, for use as the <img> tag src
    return unquote(cid)
def _get_cookies(self):
    """Parse the cookie string from the WSGI environ into a dict of
    name/value pairs.  The parsed dict is cached on the instance so the
    environ is only read once."""
    if not hasattr(self, '_cookies'):
        cookies = {}
        cookie_str = self._environ.get('HTTP_COOKIE')
        if cookie_str:
            for c in cookie_str.split(';'):
                # Split on the first '=' only -- values may contain '='.
                pos = c.find('=')
                if pos > 0:
                    cookies[c[:pos].strip()] = eutils.unquote(c[pos + 1:])
        self._cookies = cookies
    return self._cookies
def get_message_message_id(message_id_str: str) -> str:
    """Return the message's Message-ID header as a string, stripped of any
    surrounding angle brackets or quotes; a fresh id is generated when the
    header is empty or missing.

    :param message_id_str: the message 'message id' header as a string
    :return: parsed or generated message id
    """
    return unquote(message_id_str or make_msgid())
def inline_images(message, html):
    """Given HTML with inline data images, convert these to attachments,
    and add the rewritten HTML as an alternative.  Returns *message*."""
    tag_pattern = r'<img.*?src="data:image/png;base64,.*?">'
    data_pattern = r'<img.*?src="data:image/png;base64,(.*?)">'
    for index, tag in enumerate(re.findall(tag_pattern, html)):
        payload = re.findall(data_pattern, tag)[0]
        # Content ID per RFC 2045 section 7 (with <...>)
        content_id = make_msgid("img")
        part = MIMEImage(payload, "png", _encoder=lambda x: x)
        part.add_header("Content-Disposition", "inline",
                        filename="img{}.png".format(index))
        part.add_header("Content-ID", content_id)
        # The no-op encoder left the payload as base64 text, so declare it.
        part.add_header("Content-Transfer-Encoding", "base64")
        message.attach(part)
        html = html.replace(tag,
                            '<img src="cid:{}">'.format(unquote(content_id)))
    message.attach_alternative(html, "text/html")
    return message
def _is_too_old(self, message, report_name):
    """Return True when the message's Date header predates self.since.

    Messages with no date, an unparseable date, or an incomparable date
    are never considered too old (returns False).
    """
    if not self.since:
        return False
    date = message.get("date")
    if not date:
        return False
    try:
        date = parse_date(date)
    except ValueError as e:
        if self.verbose:
            # BUGFIX: `date` is still the raw header here.  On Python 3 a
            # header is a str, which has no .decode(); calling it raised
            # AttributeError while trying to *report* the parse failure.
            raw = date
            if isinstance(raw, bytes):
                raw = raw.decode("ascii", "replace")
            self.stderr.write(
                "Can't parse date string in message {}{}: {}. "
                "The date string is: '{}'".format(
                    unquote(message.get("message-id", 'n/a')),
                    report_name, e, raw))
        return False
    if date.tzinfo is None:
        # Assume UTC for naive dates so the comparison below is legal.
        date = date.replace(tzinfo=utc)
    try:
        return date <= self.since
    except ValueError:
        return False
def post(self, request):
    """Handle the profile-update form: save user + profile, rejecting an
    email address that is already used by a different account."""
    user_form = self.user_update_form_class(data=request.POST,
                                            instance=request.user)
    profile_form = self.profile_form_class(data=request.POST,
                                           files=request.FILES,
                                           instance=request.user.profile)
    if user_form.is_valid() and profile_form.is_valid():
        update = user_form.save(commit=False)
        update.user = request.user
        update.user.username = request.user.username
        # Look for an existing account with the submitted email address.
        user = User.objects.filter(email=unquote(request.user.email))
        if user:
            if user[0].id == request.user.id:
                #if user didn't change email
                update.save()
                profile_form.save()
            else:
                # Email belongs to a different account -- reject.
                return render(request, self.template_name,
                              {'user_form': user_form,
                               'profile_form': profile_form,
                               'error_message':'This email address is already in use. Please supply a different email address.'})
        else:
            update.save()
            profile_form.save()
    return render(request, self.template_name,
                  {'user_form': user_form, 'profile_form': profile_form})
def _unquotevalue(value):
    # Unlike utils.collapse_rfc2231_value(), no attempt is made to convert
    # the value to unicode: an RFC 2231 (charset, language, value) triple is
    # returned as-is with only the value part unquoted.
    if isinstance(value, tuple):
        return (value[0], value[1], utils.unquote(value[2]))
    else:
        return utils.unquote(value)
def test_generate_header(key, val):
    """Round-trip: a generated header must parse back to the same key/params."""
    header = generate_header(key, {key: val})
    k, params = parse_header(header)
    # BUGFIX: the original `assert unquote(k), params == (...)` only checked
    # the truthiness of unquote(k) -- the comparison after the comma was the
    # assert *message*, never evaluated as part of the condition.
    assert (unquote(k), params) == (key, {key: val})
def inline_attachments(self):
    """dict mapping unquoted Content-ID -> attachment (MIMEPart objects),
    covering only inline attachments that carry a Content-ID."""
    result = {}
    for part in self.walk():
        if part.is_inline_attachment() and part['Content-ID']:
            result[unquote(part['Content-ID'])] = part
    return result
def add_to_list(self, mlist, message):
    """Add the message to a specific list of the store.

    :param mlist: The mailing-list object, implementing
        mailman.interfaces.mailinglist.IMailingList.
    :param message: An email.message.Message instance containing at
        least a unique Message-ID header.  The message will be given an
        X-Message-ID-Hash header, overriding any existing such header.
    :returns: The calculated X-Message-ID-Hash header.
    :raises ValueError: if the message is missing a Message-ID header.
        The storage service is also allowed to raise this exception
        if it find, but disallows collisions.
    """
    list_name = unicode(mlist.fqdn_listname)
    # Create the list if it does not exist
    l = self.db.find(List, List.name == list_name).one()
    if l is None:
        l = List(list_name)
        self.db.add(l)
    # Refresh the list metadata on every call
    l.display_name = mlist.display_name
    l.subject_prefix = mlist.subject_prefix
    if not message.has_key("Message-Id"):
        raise ValueError("No 'Message-Id' header in email", message)
    msg_id = unicode(unquote(message['Message-Id']))
    email = Email(list_name, msg_id)
    if self.is_message_in_list(list_name, email.message_id):
        print("Duplicate email from %s: %s" %
              (message['From'], message.get('Subject', '""')))
        return email.message_id_hash
    # the message.as_string() call must be done before scrubbing
    email_full = EmailFull(list_name, msg_id, message.as_string())
    # Find thread id
    new_thread = False
    ref, thread_id = get_ref_and_thread_id(message, list_name, self)
    if thread_id is None:
        new_thread = True
        # make up the thread_id if not found
        thread_id = email.message_id_hash
    email.thread_id = thread_id
    email.in_reply_to = ref
    from_name, from_email = parseaddr(message['From'])
    from_name = header_to_unicode(from_name)
    email.sender_name = from_name.strip()
    email.sender_email = unicode(from_email).strip()
    email.subject = header_to_unicode(message.get('Subject'))
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = datetime.datetime.utcnow()
    # Capture the offset before the tzinfo is stripped below.
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        # Normalize to naive UTC for storage
        msg_date = msg_date.astimezone(tzutc()).replace(tzinfo=None)
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = (
            (utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60
    scrubber = Scrubber(list_name, message)
    # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()
    # store the Mailman user
    email.user_id = self._store_mailman_user(email.sender_email)
    #category = 'Question' # TODO: enum + i18n ?
    #if ('agenda' in message.get('Subject', '').lower() or
    #        'reminder' in message.get('Subject', '').lower()):
    #    # i18n!
    #    category = 'Agenda'
    if new_thread:
        thread = Thread(list_name, thread_id, email.date)
    else:
        thread = self.db.find(
            Thread, And(
                Thread.list_name == list_name,
                Thread.thread_id == thread_id,
            )).one()
    thread.date_active = email.date
    self.db.add(thread)
    self.db.add(email)
    self.db.add(email_full)
    compute_thread_order_and_depth(thread)
    for attachment in attachments:
        self.add_attachment(list_name, msg_id, *attachment)
    self.flush()
    # search indexing
    if self.search_index is not None:
        self.search_index.add(email)
    return email.message_id_hash
def process_headers(self, msg):
    """Convert the headers of *msg* into a message metadata dict.

    Address headers become lists of {"name", "address"} dicts; Received
    and Date headers are parsed into timestamp structures; Message-ID and
    Subject are copied through as single values.

    :param msg: an email.message.Message
    :returns: dict of normalized header data
    """
    headers = {}
    # for now we just take todays date as the received date
    message = {"receivedDate": datetime.datetime.utcnow().isoformat()}
    for hn in msg.keys():
        header_values = msg.get_all(hn)
        if header_values:
            header_name = hn.lower()
            # add this header to the list of available headers
            headers[header_name] = []
            # do any charset etc conversion on the values...
            header_values = [self._safe_convert_header(v)
                             for v in header_values]
            # go through the values converting them into usable lists
            for value in header_values:
                if re.match(r"<.+>,", value):
                    # comma-separated list of <...> tokens
                    for v in value.split(","):
                        headers[header_name].append(unquote(v.strip()))
                # multiple reference processing
                elif header_name == "references" and re.match(r"<[^<>]+>\s+", value):
                    for ref in re.findall(r"<[^<>]+>", value):
                        headers[header_name].append(unquote(ref.strip()))
                else:
                    headers[header_name].append(unquote(value.strip()))
    for header_name in headers:
        header_values = headers[header_name]
        if header_name in ["to", "cc", "bcc", "from", "replyto"]:
            message[header_name] = [{"name": name, "address": address}
                                    for name, address
                                    in getaddresses(header_values)
                                    if address]
        elif header_name == "received":
            # Use the most recent timestamp among all Received headers.
            # BUGFIX: the original computed the maximum in `dv` but then
            # built receivedDate from the loop-leftover value of the last
            # header processed, not the maximum.
            dv = 0
            for v in header_values:
                date = re.match(r".*;\s*(.+)", v, re.DOTALL).group(1)
                parsed = int(mktime_tz(parsedate_tz(date)))
                if parsed > dv:
                    dv = parsed
            message["receivedDate"] = {
                "original": formatdate(dv),
                "utctimestamp": dv,
                "utcisoformat": datetime.datetime.fromtimestamp(
                    dv, tzutc()).isoformat(),
            }
        elif header_name in ["message-id"]:
            # single value header
            message["mid"] = header_values[0]
        elif header_name in ["subject"]:
            # single value header
            message["subject"] = header_values[0]
        elif header_name in ["date"]:
            # single value header
            value = header_values[0]
            utctimestamp = int(mktime_tz(parsedate_tz(value)))
            timestamp = datetime.datetime.fromtimestamp(utctimestamp, tzutc())
            message["date"] = {
                "original": value,
                "utctimestamp": utctimestamp,
                "utcisoformat": timestamp.isoformat(),
            }
    return message
try: self.store.add_to_list(self.mlist, message) except ValueError, e: if len(e.args) != 2: raise # Regular ValueError exception try: print "%s from %s about %s" % (e.args[0], e.args[1].get("From"), e.args[1].get("Subject")) except UnicodeDecodeError: print "%s with message-id %s" % ( e.args[0], e.args[1].get("Message-ID")) continue except DatabaseError: print_exc() print ("Message %s failed to import, skipping" % unquote(message["Message-Id"])) self.store.rollback() continue # And insert the attachments for counter, att in enumerate(attachments): self.store.add_attachment( self.mlist.fqdn_listname, message["Message-Id"].strip(" <>"), counter, att[0], att[1], None, att[2]) self.store.flush() cnt_imported += 1 # Commit every time to be able to rollback on error self.store.commit() self.store.search_index.flush() # Now commit to the search index if self.verbose:
def add_to_list(self, mlist, message):
    """Add the message to a specific list of the store.

    :param mlist: The mailing-list object, implementing
        mailman.interfaces.mailinglist.IMailingList.
    :param message: An email.message.Message instance containing at
        least a unique Message-ID header.  The message will be given an
        X-Message-ID-Hash header, overriding any existing such header.
    :returns: The calculated X-Message-ID-Hash header.
    :raises ValueError: if the message is missing a Message-ID header.
        The storage service is also allowed to raise this exception
        if it find, but disallows collisions.
    """
    list_name = unicode(mlist.fqdn_listname)
    # Create the list if it does not exist
    l = self.db.find(List, List.name == list_name).one()
    if l is None:
        l = List(list_name)
        self.db.add(l)
    l.display_name = mlist.display_name
    if not message.has_key("Message-Id"):
        raise ValueError("No 'Message-Id' header in email", message)
    msg_id = unicode(unquote(message['Message-Id']))
    email = Email(list_name, msg_id)
    if self.is_message_in_list(list_name, email.message_id):
        print ("Duplicate email from %s: %s" %
               (message['From'], message.get('Subject', '""')))
        return email.message_id_hash
    # the message.as_string() call must be done before scrubbing
    email_full = EmailFull(list_name, msg_id, message.as_string())
    # Find thread id
    new_thread = False
    ref, thread_id = get_ref_and_thread_id(message, list_name, self)
    if thread_id is None:
        new_thread = True
        # make up the thread_id if not found
        thread_id = email.message_id_hash
    email.thread_id = thread_id
    email.in_reply_to = ref
    from_name, from_email = parseaddr(message['From'])
    from_name = header_to_unicode(from_name)
    email.sender_name = from_name.strip()
    email.sender_email = unicode(from_email).strip()
    email.subject = header_to_unicode(message.get('Subject'))
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = datetime.datetime.now()
    # BUGFIX: read the UTC offset *before* stripping the tzinfo below.
    # The original read it after .replace(tzinfo=None), so it was always
    # None and email.timezone was always stored as 0.
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        # Normalize to naive UTC for storage
        msg_date = msg_date.astimezone(tzutc()).replace(tzinfo=None)
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = (
            (utcoffset.days * 24 * 60 * 60) + utcoffset.seconds) / 60
    scrubber = Scrubber(list_name, message)
    # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()
    #category = 'Question' # TODO: enum + i18n ?
    #if ('agenda' in message.get('Subject', '').lower() or
    #        'reminder' in message.get('Subject', '').lower()):
    #    # i18n!
    #    category = 'Agenda'
    if new_thread:
        thread = Thread(list_name, thread_id, email.date)
    else:
        thread = self.db.find(Thread, And(
            Thread.list_name == list_name,
            Thread.thread_id == thread_id,
        )).one()
    thread.date_active = email.date
    self.db.add(thread)
    self.db.add(email)
    self.db.add(email_full)
    self.flush()
    for attachment in attachments:
        self.add_attachment(list_name, msg_id, *attachment)
    return email.message_id_hash
def test_cid_in_message(self):
    """The inline image's content id must appear as a cid: URL in the
    HTML alternative of the message."""
    html_alternative = self.message.alternatives[0][0]
    expected = 'cid:%s' % unquote(self.inline_image._content_id)
    self.assertIn(expected, html_alternative)
def _unquote_boundary(self, b):
    """Unquote the middle of a bytes boundary marker, leaving the first
    and last two bytes (the '--' delimiters) untouched."""
    prefix, middle, suffix = b[:2], b[2:-2], b[-2:]
    unquoted = email_utils.unquote(middle.decode('ascii'))
    return prefix + unquoted.encode('ascii') + suffix
# Mapping of CKAN group slug -> display title.
groups = {
    'transporte': 'Transporte',
    'turismo': 'Turismo',
    'vivienda': 'Vivienda',
    'cultura': 'Cultura',
    'deporte': 'Deporte',
    'desarrollo-social': 'Desarrollo Social',
    'economia': 'Economía',
    'educacion': 'Educación',
    'industria': 'Industria',
    'infraestructura': 'Infraestructura',
    'medio-ambiente': 'Medio Ambiente',
    'salud': 'Salud',
    'seguridad': 'Seguridad',
    'trabajo': 'Trabajo'
}
# Register every group with CKAN.
for name, title in groups.iteritems():
    ckan.group_register_post({'name': name, 'title': title})
# associate datasets to groups based on category custom field
for package_id in ckan.package_register_get():
    group_name = unquote(
        ckan.package_entity_get(package_id)['extras'].get('category'))
    if group_name:
        # Category values use '_' where group slugs use '-'.
        group_name_to_get = group_name.replace('_', '-')
        group = ckan.group_entity_get(group_name_to_get)
        if package_id not in group['packages']:
            group['packages'].append(package_id)
            ckan.group_entity_put(group)
def __str__(self):
    """Render as a cid: URL, generating a content id on first use."""
    if not self._content_id:
        self.generate_cid()
    return 'cid:' + unquote(self._content_id)
def add_to_list(self, mlist, message):
    """
    Archive a message into the database for the given mailing-list.

    Creates the List row on first use, honors the list's archive policy,
    de-duplicates on Message-Id, threads the message, stores sender,
    subject, date and scrubbed content, then notifies caches and the
    search index.

    :arg mlist: the mailman list object (properties are copied onto the
        local List row; its archive_policy is honored).
    :arg message: the email message object to store.
    :returns: the message_id_hash of the stored (or already-present)
        email, or None when archiving is disabled for this list.
    :raises ValueError: if the message has no Message-Id header, or the
        sender address is not ascii-decodable.
    """
    list_name = unicode(mlist.fqdn_listname)
    # Create the list if it does not exist
    l = self.db.find(List, List.name == list_name).one()
    if l is None:
        l = List(list_name)
        # Don't wait for the cache to set those properties
        for propname in l.mailman_props:
            setattr(l, propname, getattr(mlist, propname))
        self.db.add(l)
    if mlist.archive_policy == ArchivePolicy.never:
        logger.info("Archiving disabled by list policy for %s" % list_name)
        return None
    if not message.has_key("Message-Id"):
        raise ValueError("No 'Message-Id' header in email", message)
    msg_id = unicode(unquote(message["Message-Id"]))
    # Protect against extremely long Message-Ids (there is no limit in the
    # email spec), it's set to VARCHAR(255) in the database
    if len(msg_id) >= 255:
        msg_id = msg_id[:254]
    email = Email(list_name, msg_id)
    if self.is_message_in_list(list_name, email.message_id):
        logger.info("Duplicate email from %s: %s" %
                    (message["From"], message.get("Subject", '""')))
        return email.message_id_hash
    # if not getattr(settings.KITTYSTORE_FULL_EMAIL):
    #     # If it's a valid value, leave it to the "prototype" archiver
    #     # Note: the message.as_string() call must be done before scrubbing
    #     email_full = EmailFull(list_name, msg_id, message.as_string())
    #     self.db.add(email_full)
    # Find the thread this message belongs to; start a new one when no
    # parent reference is found.
    new_thread = False
    ref, thread_id = get_ref_and_thread_id(message, list_name, self)
    if thread_id is None:
        new_thread = True
        # make up the thread_id if not found
        thread_id = email.message_id_hash
    email.thread_id = thread_id
    email.in_reply_to = ref
    try:
        from_name, from_email = parseaddr(message["From"])
        from_name = header_to_unicode(from_name).strip()
        email.sender_email = unicode(from_email).strip()
    except (UnicodeDecodeError, UnicodeEncodeError):
        raise ValueError("Non-ascii sender address", message)
    sender = self.db.find(Sender,
                          Sender.email == email.sender_email).one()
    if sender is None:
        sender = Sender(email.sender_email, from_name)
        self.db.add(sender)
    else:
        sender.name = from_name  # update the name if needed
    email.subject = header_to_unicode(message.get("Subject"))
    if email.subject is not None:
        # limit subject size to 2000 chars or PostgreSQL may complain
        email.subject = email.subject[:2000]
    msg_date = parsedate(message.get("Date"))
    if msg_date is None:
        # Absent or unparseable date
        msg_date = datetime.datetime.utcnow()
    # Capture the UTC offset BEFORE stripping tzinfo below -- order matters.
    utcoffset = msg_date.utcoffset()
    if msg_date.tzinfo is not None:
        # Normalize to a naive UTC datetime for storage.
        msg_date = msg_date.astimezone(tzutc()).replace(tzinfo=None)
    email.date = msg_date
    if utcoffset is None:
        email.timezone = 0
    else:
        # in minutes
        email.timezone = ((utcoffset.days * 24 * 60 * 60) +
                          utcoffset.seconds) / 60
    scrubber = Scrubber(list_name, message)
    # warning: scrubbing modifies the msg in-place
    email.content, attachments = scrubber.scrub()
    # category = 'Question' # TODO: enum + i18n ?
    # if ('agenda' in message.get('Subject', '').lower() or
    #         'reminder' in message.get('Subject', '').lower()):
    #     # i18n!
    #     category = 'Agenda'
    if new_thread:
        thread = Thread(list_name, thread_id, email.date)
    else:
        thread = self.db.find(Thread,
                              And(Thread.list_name == list_name,
                                  Thread.thread_id == thread_id)).one()
    thread.date_active = email.date
    self.db.add(thread)
    self.db.add(email)
    compute_thread_order_and_depth(thread)
    for attachment in attachments:
        self.add_attachment(list_name, msg_id, *attachment)
    self.flush()
    # invalidate the cache
    events.notify(events.NewMessage(self, mlist, email))
    if new_thread:
        events.notify(events.NewThread(self, mlist, thread))
    # search indexing
    # do it after caching because we need some list properties (like
    # archive_policy)
    if self.search_index is not None:
        self.search_index.add(email)
    return email.message_id_hash
def from_mbox(self, mbfile):
    """
    Insert all the emails contained in an mbox file into the database.

    Each message is re-parsed with the modern email policy, its Date
    header is repaired when missing or unparseable (falling back to
    Resent-Date and then to the Unix "From " envelope line), and it is
    handed to add_to_list() inside a transaction. Import errors are
    reported on stderr and the rest of the archive is still processed.

    :arg mbfile: path to a mailbox file
    """
    mbox = mailbox.mbox(mbfile)
    progress_marker = ProgressMarker(self.verbose, self.stdout)
    if not self.since:
        # With --since we can't know up-front how many messages survive
        # the date filter, so only set a total for full imports.
        progress_marker.total = len(mbox)
    for msg in mbox:
        # FIXME: this converts mailbox.mboxMessage to
        # email.message.EmailMessage
        msg_raw = msg.as_bytes(unixfrom=False)
        unixfrom = msg.get_from()
        message = message_from_bytes(msg_raw, policy=policy.default)
        # Fix missing and weird Date: headers.
        date = (self._get_date(message, "date") or
                self._get_date(message, "resent-date"))
        if unixfrom and not date:
            # Fall back to the date part of the "From " envelope line.
            date = " ".join(unixfrom.split()[1:])
        if date:
            # Make sure this date can be parsed before setting it as the
            # header. If not, a TypeError is raised and we just keep the
            # old Header.
            with suppress(TypeError):
                del message['Date']
                message['Date'] = date
        if self._is_too_old(message):
            continue
        progress_marker.tick(message["Message-Id"])
        # Un-wrap the subject line if necessary
        if message["subject"]:
            message.replace_header(
                "subject", TEXTWRAP_RE.sub(" ", message["subject"]))
        if unixfrom:
            message.set_unixfrom(unixfrom)
        if message['message-id'] is None:
            message['Message-ID'] = make_msgid('generated')
        # Now insert the message
        try:
            with transaction.atomic():
                add_to_list(self.list_address, message)
        except DuplicateMessage as e:
            if self.verbose:
                self.stderr.write("Duplicate email with message-id '%s'"
                                  % e.args[0])
            continue
        except (LookupError, UnicodeError, ValueError) as e:
            self.stderr.write("Failed adding message %s: %s"
                              % (message.get("Message-ID"), e))
            if len(e.args) == 2:
                # Some errors carry (reason, message): show the context.
                try:
                    self.stderr.write(
                        "%s from %s about %s"
                        % (e.args[0], e.args[1].get("From"),
                           e.args[1].get("Subject")))
                except UnicodeDecodeError:
                    pass
            # Don't reraise the exception
            continue
        except DatabaseError:
            try:
                print_exc(file=self.stderr)
            except UnicodeError:
                pass
            self.stderr.write("Message %s failed to import, skipping"
                              % unquote(message["Message-Id"]))
            continue
        except Exception as e:
            # In case of *any* exception, log and continue to import the
            # rest of the archive.
            self.stderr.write(
                "Message {} failed to import, skipping".format(
                    unquote(message["Message-ID"])))
            # Bug fix: OutputWrapper.write() requires a string (it calls
            # msg.endswith); passing the exception object itself raised
            # AttributeError and killed the import loop.
            self.stderr.write(str(e))
            continue
        email = Email.objects.get(mailinglist__name=self.list_address,
                                  message_id=get_message_id(message))
        # # Commit every time to be able to rollback on error
        # if not transaction.get_autocommit():
        #     transaction.commit()
        # Store the list of impacted threads to be able to compute the
        # thread_order and thread_depth values
        self.impacted_thread_ids.add(email.thread_id)
        progress_marker.count_imported += 1
    # self.store.search_index.flush()  # Now commit to the search index
    progress_marker.finish()
    mbox.close()
def try_download_link(self, add_token: bool = False, delete_if_successful: bool = False, use_cookies: bool = False) -> bool:
    """This function should only be used for shortcut/URL files.
    It tests whether a URL refers to a file that is not an HTML web page,
    then downloads it. Otherwise an attempt will be made to download an
    HTML video from the website via youtube-dl.

    Args:
        add_token (bool, optional): Adds the ws-token to the url. Defaults to False.
        delete_if_successful (bool, optional): Deletes the tmp file if download was successful. Defaults to False.
        use_cookies (bool, optional): Adds the cookies to the requests. Defaults to False.

    Returns:
        bool: If it was successful.

    Raises:
        ValueError: if use_cookies is set but no cookie file is available.
        RuntimeError: if the youtube-dl fallback fails with an error.
    """
    url_to_download = self.file.content_fileurl
    logging.debug('T%s - Try to download linked file %s', self.thread_id, url_to_download)

    if add_token:
        url_to_download = self._add_token_to_url(self.file.content_fileurl)

    cookies_path = self.options.get('cookies_path', None)
    if use_cookies:
        if cookies_path is None or not os.path.isfile(cookies_path):
            self.success = False
            raise ValueError(
                'Moodle Cookies are missing. Run `moodle-dl -nt` to set a privatetoken for cookie generation (If necessary additionally `-sso`)'
            )

    if delete_if_successful:
        # if temporary file is not needed delete it as soon as possible
        try:
            os.remove(self.file.saved_to)
        except Exception as e:
            logging.warning(
                'T%s - Could not delete %s before download is started. Error: %s',
                self.thread_id,
                self.file.saved_to,
                e,
            )

    isHTML = False
    new_filename = ""
    total_bytes_estimate = -1
    session = requests.Session()
    if cookies_path is not None:
        session.cookies = MozillaCookieJar(cookies_path)
        if os.path.isfile(cookies_path):
            session.cookies.load(ignore_discard=True, ignore_expires=True)

    try:
        # HEAD request: inspect content type / size / redirects without
        # downloading the body.
        response = session.head(
            url_to_download,
            headers=RequestHelper.stdHeader,
            verify=self.verify_cert,
            allow_redirects=True,
        )
    except (InvalidSchema, InvalidURL, MissingSchema):
        # don't download urls like 'mailto:[email protected]'
        logging.debug('T%s - Attempt is aborted because the URL has no correct format', self.thread_id)
        self.success = True
        return False

    if not response.ok:
        # The URL reports an HTTP error, so we give up trying to download
        # the URL. NOTE(review): success is still set True here --
        # presumably meaning "handled, do not retry"; confirm intent.
        logging.warning(
            'T%s - Stopping the attemp to download %s because of the HTTP ERROR %s',
            self.thread_id,
            self.file.content_fileurl,
            response.status_code,
        )
        self.success = True
        return True

    content_type = response.headers.get('Content-Type', 'text/html').split(';')[0]
    if content_type == 'text/html' or content_type == 'text/plain':
        isHTML = True

    total_bytes_estimate = int(response.headers.get('Content-Length', -1))
    last_modified = response.headers.get('Last-Modified', None)

    if response.url != url_to_download:
        if response.history and len(response.history) > 0:
            logging.debug('T%s - URL was %s time(s) redirected', self.thread_id, len(response.history))
        else:
            logging.debug('T%s - URL has changed after information retrieval', self.thread_id)
        # Continue with the final (post-redirect) URL.
        url_to_download = response.url

    # Derive a filename from the URL path; a Content-Disposition header
    # overrides it below.
    url_parsed = urlparse.urlparse(url_to_download)
    new_filename = posixpath.basename(url_parsed.path)

    if "Content-Disposition" in response.headers.keys():
        found_names = re.findall("filename=(.+)", response.headers["Content-Disposition"])
        if len(found_names) > 0:
            new_filename = unquote(found_names[0])

    if isHTML and not self.is_blocked_for_youtube_dl(url_to_download):
        # HTML page: try to extract a video with youtube-dl instead of
        # saving the page itself.
        filename_tmpl = self.filename + ' | %(title)s (%(id)s).%(ext)s'
        if self.file.content_type == 'description-url':
            filename_tmpl = '%(title)s (%(id)s).%(ext)s'
        outtmpl = str(Path(self.destination) / filename_tmpl)

        ydl_opts = {
            'logger': self.YtLogger(self),
            'progress_hooks': [self.yt_hook],
            'outtmpl': outtmpl,
            'nocheckcertificate': self.skip_cert_verify,
            'retries': 10,
            'fragment_retries': 10,
            'ignoreerrors': True,
            'addmetadata': True,
        }
        # User-supplied options override the defaults above.
        youtube_dl_options = self.options.get('youtube_dl_options', {})
        ydl_opts.update(youtube_dl_options)
        if cookies_path is not None and os.path.isfile(cookies_path):
            ydl_opts.update({'cookiefile': cookies_path})

        ydl = youtube_dl.YoutubeDL(ydl_opts)
        add_additional_extractors(ydl)
        try:
            # download() returns 1 on failure; 0 means something was
            # downloaded to the output template.
            ydl_results = ydl.download([url_to_download])
            if ydl_results == 1:
                pass
            elif self.file.module_name != 'index_mod-page':
                self.file.saved_to = str(Path(self.destination) / self.filename)
                self.file.time_stamp = int(time.time())
                self.success = True
                return True
        except Exception as e:
            logging.error(
                'T%s - Youtube-dl failed! Error: %s',
                self.thread_id,
                e,
            )
            self.youtube_dl_failed_with_error = True

        # if we want we could save ydl.cookiejar (Also the cookiejar of moodle-dl)
        if self.youtube_dl_failed_with_error is True:
            if not delete_if_successful:
                # cleanup the url-link file
                try:
                    os.remove(self.file.saved_to)
                except Exception as e:
                    logging.warning(
                        'T%s - Could not delete %s after youtube-dl failed. Error: %s',
                        self.thread_id,
                        self.file.saved_to,
                        e,
                    )
            self.success = False
            raise RuntimeError(
                'Youtube-dl could not download the URL. For details see youtube-dl error messages in the log file'
            )

    # Fallback: fetch the resource directly.
    logging.debug('T%s - Downloading file directly', self.thread_id)

    # generate file extension for modules names
    new_name, new_extension = os.path.splitext(new_filename)
    if new_extension == '' and isHTML:
        new_extension = '.html'

    if self.file.content_type == 'description-url' and new_name != '':
        self.filename = new_name + new_extension

    old_name, old_extension = os.path.splitext(self.filename)
    if old_extension != new_extension:
        self.filename = self.filename + new_extension

    self.set_path(True)
    if total_bytes_estimate != -1:
        # Lets the progress reporter show a percentage for this thread.
        self.thread_report[self.thread_id]['extra_totalsize'] = total_bytes_estimate

    self.urlretrieve(
        url_to_download,
        self.file.saved_to,
        context=self.ssl_context,
        reporthook=self.add_progress,
        cookies_path=cookies_path,
    )

    # Preserve the server's Last-Modified time on the saved file.
    self.set_utime(last_modified)
    self.file.time_stamp = int(time.time())
    self.success = True
    return True