def clean_text_for_search(text):
    """
    Prepare text for indexing and search.

    The input is coerced to unicode and stripped, a leading e-mail
    quotation marker and any e-mail addresses are removed, HTML is
    converted to plain text on a best-effort basis, and runs of special
    characters, line breaks and whitespace are collapsed to single spaces.

    Returns the cleaned text.
    """
    # Get the normalized unicode text.
    text = force_unicode(text).strip()

    # Remove the e-mail quotation marker (one or more `>`). Without
    # re.MULTILINE the `^` anchor only matches at the very start of the
    # string, so quotation markers on later lines are left in place.
    text = re.sub(r'^\>+', '', text)

    # Remove e-mail addresses.
    text = re.sub(
        r'\b[A-Za-z0-9_\.-]+@[A-Za-z0-9_\.-]+[A-Za-z0-9_][A-Za-z0-9_]\b',
        '',
        text)

    # Try to convert html to text. The conversion stays best-effort (the
    # text is kept as-is on failure), but only ordinary errors are
    # suppressed, so KeyboardInterrupt/SystemExit still propagate.
    try:
        text = html2text(text)
    except Exception:
        pass

    # Clean the text from special characters, such as section
    # divisions ***, etc. but preserve punctuation.
    # re.UNICODE makes `\W`/`\B` follow Unicode word rules; without it,
    # Python 2 treats every non-ASCII letter as a "special character" and
    # this substitution would blank out whole non-English words.
    text = re.sub(r'\B\W{2,}\B', ' ', text, flags=re.UNICODE)

    # Remove all returns and new lines.
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r'\r+', ' ', text)

    # Convert multiple spaces to singles.
    text = re.sub(r'\s{2,}', ' ', text)

    return text
def extract_mail_content(self, message_data, **kwargs):
    """
    Return the text content and the attachments of a message.

    `message_data` is an iterable of response parts; only the tuple items
    are used, and the raw message source is read from index 1 of each
    tuple (presumably the result of an IMAP fetch -- verify against the
    caller). Extra keyword arguments are passed through unchanged to
    `self._process_attachment`.

    Returns a `(msg_plain_text, msg_attachments)` pair:

    * `msg_plain_text` -- UTF-8 encoded text of the first non-empty
      `text/plain` or `text/html` part (HTML is converted with
      `html2text`), or '' when no such part exists.
    * `msg_attachments` -- list of `{'filename': ..., 'filesize': ...}`
      dicts, one per part for which `self._process_attachment` returned
      a name.
    """
    msg_plain_text, msg_attachments = '', []

    for response_part in message_data:
        if not isinstance(response_part, tuple):
            continue

        msg = email.message_from_string(response_part[1])
        for part in msg.walk():
            # Multipart containers carry no content of their own.
            if part.is_multipart():
                continue

            # Any part with a Content-Disposition header is handled as an
            # attachment.
            if part.get_params(None, 'Content-Disposition'):
                # Measure the decoded payload before the part is handed
                # over; a missing payload counts as zero bytes.
                attachment_size = len(part.get_payload(decode=True) or b'')
                attachment_name = self._process_attachment(part, **kwargs)
                if attachment_name:
                    msg_attachments.append({'filename': attachment_name,
                                            'filesize': attachment_size})
                continue

            # Process message text.
            # Update `msg_plain_text` only if it isn't updated yet.
            if msg_plain_text:
                continue
            content_type = part.get_content_type()
            if content_type not in ('text/plain', 'text/html'):
                continue

            payload = part.get_payload(decode=True) or b''
            # Parts frequently omit the charset parameter, in which case
            # get_content_charset() would return None and decoding would
            # raise TypeError; fall back to UTF-8 instead.
            charset = part.get_content_charset('utf-8')
            try:
                text = payload.decode(charset, 'ignore')
            except LookupError:
                # Unknown or misspelled charset label in the message.
                text = payload.decode('utf-8', 'ignore')

            # Decode first, then convert: html2text works on text, and
            # decoding its result a second time would fail.
            if content_type == 'text/html':
                text = html2text(text)

            msg_plain_text = text.encode('utf8', 'replace')

    return msg_plain_text, msg_attachments