def test_reply_from_gmail_2_ptBr(self):
     """Check reply extraction and fragment contents for the second pt-BR Gmail fixture."""
     fixture_path = "test/emails/email_gmail2_ptBr.txt"

     # The visible reply must contain the sender's own sentence.
     with open(fixture_path) as fixture:
         visible_reply = EmailReplyParser.parse_reply(fixture.read())
     self.assertIn(
         "entendi, muito obrigado pela informação, vou verificar aqui se tenho outras opções.",
         visible_reply,
     )

     # Fragment 1 is expected to hold the quote header, fragment 0 the text above it.
     with open(fixture_path) as fixture:
         fragments = EmailReplyParser.read(fixture.read()).fragments
     self.assertIn("Em sex., 18 de dez. de 2020 às 14:12", fragments[1].content)
     self.assertIn("Já viu o link desse anuncio?", fragments[0].content)
 def test_reply_from_gmail_ptBr(self):
     """Check reply extraction and fragment contents for the pt-BR Gmail fixture."""
     fixture_path = "test/emails/email_gmail_ptBr.txt"
     expected_reply = "Esta é uma resposta para mensagens github."

     # parse_reply() must return exactly the reply sentence.
     with open(fixture_path) as fixture:
         self.assertEqual(
             expected_reply, EmailReplyParser.parse_reply(fixture.read())
         )

     # Fragment 1 is expected to hold the quote header, fragment 0 the reply.
     with open(fixture_path) as fixture:
         fragments = EmailReplyParser.read(fixture.read()).fragments
     self.assertIn(
         "Em qua., 18 de mai. de 2016 às 11:10 Someone", fragments[1].content
     )
     self.assertIn(expected_reply, fragments[0].content)
Пример #3
0
    def __init__(self, email_text):
        """Decode a base64 email and extract the reply text from its plain-text body.

        ``email_text`` is stripped of trailing whitespace, URL-unquoted and
        base64-decoded, then parsed with ``message_from_string``.  For a
        multipart message the first ``text/plain`` part is used, looking one
        level down into ``multipart/alternative`` containers.  The chosen body
        is quoted-printable decoded, passed through
        ``self.extra_email_reply_parse`` and finally through
        ``EmailReplyParser`` to obtain ``self.reply_text``.

        If decoding raises ``TypeError``, ``self.decode_error`` is set to True
        and construction stops; ``self.email`` and ``self.reply_text`` are not
        assigned in that case.
        """
        try:
            email_text = base64.standard_b64decode(
                urllib2.unquote(email_text.rstrip()))
        except TypeError:
            # Corrupt or invalid base 64.
            self.decode_error = True
            log.info('Decoding error for CommEmailParser')
            return

        self.email = message_from_string(email_text)

        payload = self.email.get_payload()
        if isinstance(payload, list):
            # Multipart: search with dedicated names so neither the loop
            # variable nor ``payload`` is clobbered while iterating.
            plain_text = None
            for part in payload:
                if part.get_content_type() == 'multipart/alternative':
                    # Nested multipart. Go one level deeper.
                    for sub_part in part.get_payload():
                        if sub_part.get_content_type() == 'text/plain':
                            plain_text = sub_part.get_payload()
                            break
                elif part.get_content_type() == 'text/plain':
                    plain_text = part.get_payload()

                if plain_text is not None:
                    # Found the plain text part.
                    break

            # Without any text/plain part there is nothing to parse; use an
            # empty body instead of handing a list of parts to quopri.
            payload = plain_text if plain_text is not None else ''

        # Decode quoted-printable data and remove non-breaking spaces.
        payload = quopri.decodestring(payload).replace('\xc2\xa0', ' ')
        payload = self.extra_email_reply_parse(payload)
        self.reply_text = EmailReplyParser.read(payload).reply
Пример #4
0
 def set_content_and_type(self):
     """Choose the content and MIME type: HTML when present, else parsed plain text."""
     # Placeholder values; they remain only if the parsing below raises.
     self.content = '[Blank Email]'
     self.content_type = 'text/plain'

     if not self.html_content:
         # Plain text: run it through the reply parser and double every newline.
         parsed_text = EmailReplyParser.read(self.text_content).text
         self.content, self.content_type = (
             parsed_text.replace("\n", "\n\n"), 'text/plain')
         return

     self.content, self.content_type = self.html_content, 'text/html'
Пример #5
0
    def __init__(self, message):
        """Store the message dict and extract the reply from its 'TextBody'.

        Raises ActivityEmailEncodingError when ``message`` is not a dict or
        has no 'TextBody' key.
        """
        has_text_body = isinstance(message, dict) and 'TextBody' in message
        if not has_text_body:
            log.exception("ActivityEmailParser didn't get a valid message.")
            raise ActivityEmailEncodingError(
                'Invalid or malformed json message object.')

        self.email = message
        # Project-specific pre-processing first, then the generic reply parser.
        text_body = self.email['TextBody']
        preprocessed = self._extra_email_reply_parse(text_body)
        self.reply = EmailReplyParser.read(preprocessed).reply
Пример #6
0
    def __init__(self, message):
        """Validate the incoming message dict and keep its parsed reply.

        A message that is not a dict, or that lacks the 'TextBody' key, is
        logged and rejected with ActivityEmailEncodingError.
        """
        if not (isinstance(message, dict) and 'TextBody' in message):
            log.exception("ActivityEmailParser didn't get a valid message.")
            raise ActivityEmailEncodingError(
                'Invalid or malformed json message object.')

        self.email = message
        self.reply = EmailReplyParser.read(
            self._extra_email_reply_parse(self.email['TextBody'])).reply
Пример #7
0
 def set_content_and_type(self):
     """Set ``content``/``content_type`` from the HTML body or the parsed text body."""
     # Defaults first; they survive only if the parser call below raises.
     self.content, self.content_type = "[Blank Email]", "text/plain"

     if self.html_content:
         chosen = (self.html_content, "text/html")
     else:
         # Reply-parsed plain text with every newline doubled.
         doubled = EmailReplyParser.read(self.text_content).text.replace(
             "\n", "\n\n")
         chosen = (doubled, "text/plain")

     self.content, self.content_type = chosen
 def test_parse_out_just_top_for_outlook_with_reply_directly_above_line_ptBr(self):
     """Check the pt-BR Outlook fixture: visible reply plus fragments 0, 1 and 3."""
     fixture_path = "test/emails/email_2_2_ptBr.txt"

     # parse_reply() must return exactly the top reply.
     with open(fixture_path) as fixture:
         self.assertEqual(
             "um novo dia testando !! navegador!",
             EmailReplyParser.parse_reply(fixture.read()),
         )

     # One parse is enough to inspect the individual fragments.
     with open(fixture_path) as fixture:
         fragments = EmailReplyParser.read(fixture.read()).fragments
     self.assertIn("um novo dia testando", fragments[0].content)
     self.assertIn("Outlook", fragments[1].content)
     self.assertIn("De: Store <*****@*****.**>", fragments[3].content)
Пример #9
0
    def __init__(self, email_text):
        """Decode the base64 email text and keep the message and its reply.

        On a decoding TypeError only ``self.decode_error`` is set (to True)
        and nothing else is assigned.
        """
        try:
            unquoted = urllib2.unquote(email_text.rstrip())
            raw_message = base64.standard_b64decode(unquoted)
        except TypeError:
            # Corrupt or invalid base 64.
            self.decode_error = True
            return

        self.email = message_from_string(raw_message)
        body = self.email.get_payload()
        self.reply_text = EmailReplyParser.read(body).reply
Пример #10
0
    def __init__(self, email_text):
        """Turn URL-quoted base64 email text into a message and its reply text.

        If decoding raises TypeError, ``self.decode_error`` becomes True and
        neither ``self.email`` nor ``self.reply_text`` is assigned.
        """
        try:
            decoded = base64.standard_b64decode(
                urllib2.unquote(email_text.rstrip()))
        except TypeError:
            # Corrupt or invalid base 64.
            self.decode_error = True
        else:
            self.email = message_from_string(decoded)
            self.reply_text = EmailReplyParser.read(
                self.email.get_payload()).reply
Пример #11
0
    def __init__(self, message):
        """Keep the message dict and the reply parsed from its 'TextBody'.

        Raises ActivityEmailEncodingError when ``message`` is not a dict or
        its 'TextBody' value is missing or falsy.
        """
        if not isinstance(message, dict) or not message.get('TextBody', None):
            log.exception("ActivityEmailParser didn't get a valid message.")
            raise ActivityEmailEncodingError(
                'Invalid or malformed json message object.')

        self.email = message
        preprocessed = self._extra_email_reply_parse(self.email['TextBody'])
        parsed = EmailReplyParser.read(preprocessed)
        self.reply = parsed.reply
Пример #12
0
def get_mail_corpus(nlon_cleaning=False):
    """Load the mail corpus and group cleaned reply texts by sender address.

    Reads ``data/mailcorpus.json`` and, for every entry whose
    ``type_of_recipient`` is ``'From'``, extracts the reply part of its
    ``message_body`` with ``EmailReplyParser``.  Non-empty results are
    collected per ``email_address``.

    Args:
        nlon_cleaning: When true, markup is stripped with BeautifulSoup and
            every line the NLoN model labels ``'Not'`` is dropped before the
            text is stored.

    Returns:
        A dict mapping each email address to the list of its cleaned texts.
    """
    if nlon_cleaning:
        nlon, nlon_model = training_nlon()

    # Path to mail's corpus
    corpus_file = 'data/mailcorpus.json'
    with open(corpus_file) as data_file:
        corpus = json.load(data_file)

    print('Reading and cleaning emails corpus. Number of emails: ' + str(len(corpus)))
    mails_by_address = {}
    n = 0
    # Text cleaning
    for d in corpus:
        if d['type_of_recipient'] == 'From':
            res = EmailReplyParser.read(d['message_body'].replace('\\n', '\n'))
            text = res.reply
            n += 1

            if nlon_cleaning:
                try:
                    clean_message_body = BS4(text, 'html.parser').text
                except Exception as e:
                    print('Error with BS4 on text:\n\n%s\n\n' % text, str(e))
                    clean_message_body = text.strip()

                # Filter the markup-stripped text line by line.  Splitting the
                # raw ``text`` here would throw away the BeautifulSoup result.
                kept_lines = [
                    line for line in clean_message_body.splitlines()
                    if nlon.NLoNPredict(
                        nlon_model, robjects.StrVector([line]))[0] != 'Not'
                ]
                text = '\n'.join(kept_lines)

            if text != '':
                mails_by_address.setdefault(d['email_address'], []).append(text)
        print(str(n)+'/'+str(len(corpus))+'\n' if n%50==0 else '', end='')

    print('Mails retrieved: '+ str(n))
    print('Email addresses: '+ str(len(mails_by_address)))
    return mails_by_address
Пример #13
0
def extract_alert(msg):
    """Return the sanitized original alert found in an email thread.

    Every text/plain part of ``msg`` is split into fragments; the first
    fragment whose raw content differs from ``extract_reply(msg)`` is passed
    through ``sanitize_email_fragment`` and returned.  An empty string is
    returned when no such fragment exists.

    """
    plain_parts = (
        candidate for candidate in msg.walk()
        if candidate.get_content_type() == 'text/plain'
    )
    for plain_part in plain_parts:
        parsed = EmailReplyParser.read(plain_part.get_payload(decode=True))
        for fragment in parsed.fragments:
            fragment_text = fragment._content
            # The reply itself is not the alert; anything else is.
            if fragment_text != extract_reply(msg):
                return sanitize_email_fragment(fragment_text)

    return ''
Пример #14
0
def extract_alert(msg):
    """Pull the original alert text out of an email thread.

    Walks the MIME parts of ``msg``; for each text/plain part the decoded
    payload is split into fragments.  The first fragment whose raw content
    is not equal to ``extract_reply(msg)`` is sanitized and returned.
    Falls back to an empty string.

    """
    for mime_part in msg.walk():
        if mime_part.get_content_type() != 'text/plain':
            continue

        decoded_payload = mime_part.get_payload(decode=True)
        thread = EmailReplyParser.read(decoded_payload)
        for piece in thread.fragments:
            raw_piece = piece._content
            if raw_piece != extract_reply(msg):
                return sanitize_email_fragment(raw_piece)

    return ''
Пример #15
0
    def __init__(self, email_text):
        """Decode the base64 email and store the reply from its plain-text part.

        On a decoding TypeError only ``self.decode_error`` is set (to True).
        """
        try:
            unquoted = urllib2.unquote(email_text.rstrip())
            email_text = base64.standard_b64decode(unquoted)
        except TypeError:
            # Corrupt or invalid base 64.
            self.decode_error = True
            return

        self.email = message_from_string(email_text)

        # A string for single-part messages, a list of parts for multipart.
        payload = self.email.get_payload()
        if isinstance(payload, list):
            # Multipart: switch to the first text/plain part, if there is one.
            plain_part = next(
                (p for p in payload if p.get_content_type() == 'text/plain'),
                None)
            if plain_part is not None:
                payload = plain_part.get_payload()

        self.reply_text = EmailReplyParser.read(payload).reply
Пример #16
0
def get_mail_corpus():
    """Load the mail corpus and return the cleaned bodies grouped by sender.

    Each entry of ``dataset/raw/mailcorpus.json`` is reply-parsed, run through
    the cleaning helpers and, if anything is left, added to the set stored
    under its ``email_address``.  An entry that raises is printed and skipped.
    """
    # Path to mail corpus
    corpus_file = 'dataset/raw/mailcorpus.json'
    with open(corpus_file) as data_file:
        corpus = json.load(data_file)

    print('Reading and cleaning emails corpus. Number of emails: ' +
          str(len(corpus)))
    bodies_by_address = {}
    processed = 0
    # Text cleaning
    for entry in corpus:
        try:
            raw_body = entry['message_body'].replace('\\n', '\n')
            parsed = EmailReplyParser.read(raw_body)
            cleaned = EmailReplyParser.parse_reply(parsed.text)
            processed += 1

            cleaned = _remove_lines_of_code(cleaned)
            cleaned = _clean_body(cleaned)
            cleaned = _remove_stopwords_nonenglish_punctuation(cleaned)

            if cleaned != '':
                bodies_by_address.setdefault(
                    entry['email_address'], set()).add(cleaned)

            # Progress line whenever the processed count is a multiple of 50.
            if processed % 50 == 0:
                progress = str(processed) + '/' + str(len(corpus)) + '\n'
            else:
                progress = ''
            print(progress, end='')
        except Exception as e:
            print(e)
            continue

    print('Mails retrieved: ' + str(processed))
    print('Email addresses: ' + str(len(bodies_by_address)))
    return bodies_by_address
Пример #17
0
    def __init__(self, email_text):
        """Log and decode the base64 email, then store its plain-text reply.

        On a TypeError from logging or decoding, ``self.decode_error`` is set
        to True and construction stops.
        """
        try:
            log.info('CommEmailParser received email: ' + email_text)
            unquoted = urllib2.unquote(email_text.rstrip())
            email_text = base64.standard_b64decode(unquoted)
        except TypeError:
            # Corrupt or invalid base 64.
            self.decode_error = True
            log.info('Decoding error for CommEmailParser')
            return

        self.email = message_from_string(email_text)

        # A string for single-part messages, a list of parts for multipart.
        payload = self.email.get_payload()
        if isinstance(payload, list):
            # Multipart: use the first text/plain part.
            for candidate in payload:
                if candidate.get_content_type() != 'text/plain':
                    continue
                payload = candidate.get_payload()
                break

        parsed = EmailReplyParser.read(payload)
        self.reply_text = parsed.reply
Пример #18
0
    def __init__(self, email_text):
        """Decode a base64 email and extract the reply text from its plain-text body.

        The incoming text is logged, stripped of trailing whitespace,
        URL-unquoted and base64-decoded, then parsed with
        ``message_from_string``.  For a multipart message the first
        ``text/plain`` part is used, looking one level down into
        ``multipart/alternative`` containers.  The chosen body is
        quoted-printable decoded, passed through
        ``self.extra_email_reply_parse`` and finally through
        ``EmailReplyParser`` to obtain ``self.reply_text``.

        If logging or decoding raises ``TypeError``, ``self.decode_error`` is
        set to True and construction stops; ``self.email`` and
        ``self.reply_text`` are not assigned in that case.
        """
        try:
            log.info('CommEmailParser received email: ' + email_text)
            email_text = base64.standard_b64decode(
                urllib2.unquote(email_text.rstrip()))
        except TypeError:
            # Corrupt or invalid base 64.
            self.decode_error = True
            log.info('Decoding error for CommEmailParser')
            return

        self.email = message_from_string(email_text)

        payload = self.email.get_payload()
        if isinstance(payload, list):
            # Multipart: search with dedicated names so neither the loop
            # variable nor ``payload`` is clobbered while iterating.
            plain_text = None
            for part in payload:
                if part.get_content_type() == 'multipart/alternative':
                    # Nested multipart. Go one level deeper.
                    for sub_part in part.get_payload():
                        if sub_part.get_content_type() == 'text/plain':
                            plain_text = sub_part.get_payload()
                            break
                elif part.get_content_type() == 'text/plain':
                    plain_text = part.get_payload()

                if plain_text is not None:
                    # Found the plain text part.
                    break

            # Without any text/plain part there is nothing to parse; use an
            # empty body instead of handing a list of parts to quopri.
            payload = plain_text if plain_text is not None else ''

        # Decode quoted-printable data and remove non-breaking spaces.
        payload = quopri.decodestring(payload).replace('\xc2\xa0', ' ')
        payload = self.extra_email_reply_parse(payload)
        self.reply_text = EmailReplyParser.read(payload).reply
Пример #19
0
 def text(self):
     """Return the reply portion of ``self.raw_text``."""
     # The parser is imported at call time, not at module import time.
     from email_reply_parser import EmailReplyParser
     return EmailReplyParser.read(self.raw_text).reply
 def get_email(self, name):
     """Parse the fixture ``test/emails/<name>.txt`` and return the result of
     ``EmailReplyParser.read``.
     """
     fixture_path = 'test/emails/%s.txt' % name
     with open(fixture_path) as fixture:
         raw_message = fixture.read()
     parsed = EmailReplyParser.read(raw_message)
     return parsed
Пример #21
0
 def get_email(self, name):
     """Read the named fixture under ``test/emails`` and hand it to
     ``EmailReplyParser.read``.
     """
     with open("test/emails/%s.txt" % name) as handle:
         contents = handle.read()
     # Parsing happens after the file has been closed.
     return EmailReplyParser.read(contents)
Пример #22
0
def object_from_message(message, queue, logger):
    """Parse a raw email and hand the extracted data to create_object_from_email_message.

    The RFC822 text is parsed, subject and sender are decoded, the sender is
    checked against the IgnoreEmail entries for ``queue``, a tracking id of
    the form ``[<queue.slug>-<id>]`` is looked up in the subject, and the MIME
    parts are split into a text body and a list of uploaded files.

    Args:
        message: RFC822 formatted message text.
        queue: Queue object; its ``slug`` is used for the tracking-id match
            and it filters the IgnoreEmail entries.
        logger: Logger used for progress messages.

    Returns:
        ``False`` when the sender matches an ignore entry with
        ``keep_in_mailbox`` set, ``True`` when it matches any other ignore
        entry, otherwise whatever ``create_object_from_email_message``
        returns.
    """
    # 'message' must be an RFC822 formatted message.
    message = email.message_from_string(message)

    subject = message.get('subject', _('Comment from e-mail'))
    subject = decode_mail_headers(decodeUnknown(message.get_charset(), subject))
    # Remove every configured affix from the subject.
    for affix in STRIPPED_SUBJECT_STRINGS:
        subject = subject.replace(affix, "")
    subject = subject.strip()

    sender = message.get('from', _('Unknown Sender'))
    sender = decode_mail_headers(decodeUnknown(message.get_charset(), sender))

    # to address bug #832, we wrap all the text in front of the email address in
    # double quotes by using replace() on the email string. Then,
    # take first item of list, second item of tuple is the actual email address.
    # Note that the replace won't work on just an email with no real name,
    # but the getaddresses() function seems to be able to handle just unclosed quotes
    # correctly. Not ideal, but this seems to work for now.
    sender_email = email.utils.getaddresses(['\"' + sender.replace('<', '\" <')])[0][1]

    # NOTE(review): `cc` is computed here but not referenced again in this function.
    cc = message.get_all('cc', None)
    if cc:
        # first, fixup the encoding if necessary
        cc = [decode_mail_headers(decodeUnknown(message.get_charset(), x)) for x in cc]
        # get_all checks if multiple CC headers, but individual emails may be comma separated too
        tempcc = []
        for hdr in cc:
            tempcc.extend(hdr.split(','))
        # use a set to ensure no duplicates
        cc = set([x.strip() for x in tempcc])

    # Ignore rules bound to this queue or to no queue at all.
    for ignore in IgnoreEmail.objects.filter(Q(queues=queue) | Q(queues__isnull=True)):
        if ignore.test(sender_email):
            if ignore.keep_in_mailbox:
                # By returning 'False' the message will be kept in the mailbox,
                # and the 'True' will cause the message to be deleted.
                return False
            return True

    # Look for a "[<slug>-<digits>]" tracking marker in the subject.
    matchobj = re.match(r".*\[" + queue.slug + r"-(?P<id>\d+)\]", subject)
    if matchobj:
        # This is a reply or forward.
        ticket = matchobj.group('id')
        logger.info("Matched tracking ID %s-%s" % (queue.slug, ticket))
    else:
        logger.info("No tracking ID matched.")
        ticket = None

    body = None
    full_body = None
    counter = 0
    files = []

    for part in message.walk():
        # Multipart containers carry no content of their own; their children
        # are visited by walk().
        if part.get_content_maintype() == 'multipart':
            continue

        name = part.get_param("name")
        if name:
            name = email.utils.collapse_rfc2231_value(name)

        # Unnamed text parts are treated as the message body; everything else
        # is stored as an attachment.
        if part.get_content_maintype() == 'text' and name is None:
            if part.get_content_subtype() == 'plain':
                body = part.get_payload(decode=True)
                # https://github.com/django-helpdesk/django-helpdesk/issues/732
                if part['Content-Transfer-Encoding'] == '8bit' and part.get_content_charset() == 'utf-8':
                    body = body.decode('unicode_escape')
                body = decodeUnknown(part.get_content_charset(), body)
                # have to use django_settings here so overwritting it works in tests
                # the default value is False anyway
                if ticket is None and getattr(django_settings, 'HELPDESK_FULL_FIRST_MESSAGE_FROM_EMAIL', False):
                    # first message in thread, we save full body to avoid losing forwards and things like that
                    body_parts = []
                    for f in EmailReplyParser.read(body).fragments:
                        body_parts.append(f.content)
                    full_body = '\n\n'.join(body_parts)
                    body = EmailReplyParser.parse_reply(body)
                else:
                    # second and other reply, save only first part of the message
                    body = EmailReplyParser.parse_reply(body)
                    full_body = body
                # workaround to get unicode text out rather than escaped text
                try:
                    body = body.encode('ascii').decode('unicode_escape')
                except UnicodeEncodeError:
                    # NOTE(review): the result of this encode() is discarded,
                    # so `body` is left unchanged on this path.
                    body.encode('utf-8')
                logger.debug("Discovered plain text MIME part")
            else:
                # Non-plain text part (logged below as the HTML part).
                try:
                    email_body = encoding.smart_text(part.get_payload(decode=True))
                except UnicodeDecodeError:
                    email_body = encoding.smart_text(part.get_payload(decode=False))

                if not body and not full_body:
                    # no text has been parsed so far - try such deep parsing for some messages
                    altered_body = email_body.replace("</p>", "</p>\n").replace("<br", "\n<br")
                    mail = BeautifulSoup(str(altered_body), "html.parser")
                    full_body = mail.get_text()

                if "<body" not in email_body:
                    email_body = f"<body>{email_body}</body>"

                # Wrap the markup in a minimal UTF-8 HTML document and keep it
                # as an attachment.
                payload = (
                    '<html>'
                    '<head>'
                    '<meta charset="utf-8" />'
                    '</head>'
                    '%s'
                    '</html>'
                ) % email_body
                files.append(
                    SimpleUploadedFile(_("email_html_body.html"), payload.encode("utf-8"), 'text/html')
                )
                logger.debug("Discovered HTML MIME part")
        else:
            if not name:
                # NOTE(review): mimetypes.guess_extension() can return None, in
                # which case the generated name ends with the text "None".
                ext = mimetypes.guess_extension(part.get_content_type())
                name = "part-%i%s" % (counter, ext)
            else:
                name = ("part-%i_" % counter) + name

            # # FIXME: this code gets the paylods, then does something with it and then completely ignores it
            # # writing the part.get_payload(decode=True) instead; and then the payload variable is
            # # replaced by some dict later.
            # # the `payloadToWrite` has been also ignored so was commented
            # payload = part.get_payload()
            # if isinstance(payload, list):
            #     payload = payload.pop().as_string()
            # # payloadToWrite = payload
            # # check version of python to ensure use of only the correct error type
            # non_b64_err = TypeError
            # try:
            #     logger.debug("Try to base64 decode the attachment payload")
            #     # payloadToWrite = base64.decodebytes(payload)
            # except non_b64_err:
            #     logger.debug("Payload was not base64 encoded, using raw bytes")
            #     # payloadToWrite = payload
            files.append(SimpleUploadedFile(name, part.get_payload(decode=True), mimetypes.guess_type(name)[0]))
            logger.debug("Found MIME attachment %s" % name)

        # Incremented for every non-multipart part, body parts included.
        counter += 1

    if not body:
        # No usable plain-text body: fall back to the text of a <body> element
        # found in the serialized message.
        mail = BeautifulSoup(str(message), "html.parser")
        beautiful_body = mail.find('body')
        if beautiful_body:
            try:
                body = beautiful_body.text
                full_body = body
            except AttributeError:
                pass
        if not body:
            body = ""

    if getattr(django_settings, 'HELPDESK_ALWAYS_SAVE_INCOMING_EMAIL_MESSAGE', False):
        # save message as attachment in case of some complex markup renders wrong
        files.append(
            SimpleUploadedFile(
                _("original_message.eml").replace(
                    ".eml",
                    timezone.localtime().strftime("_%d-%m-%Y_%H:%M") + ".eml"
                ),
                str(message).encode("utf-8"),
                'text/plain'
            )
        )

    # Priority 2 when either header value is exactly one of the strings in
    # high_priority_types, otherwise 3.
    smtp_priority = message.get('priority', '')
    smtp_importance = message.get('importance', '')
    high_priority_types = {'high', 'important', '1', 'urgent'}
    priority = 2 if high_priority_types & {smtp_priority, smtp_importance} else 3

    payload = {
        'body': body,
        'full_body': full_body or body,
        'subject': subject,
        'queue': queue,
        'sender_email': sender_email,
        'priority': priority,
        'files': files,
    }

    return create_object_from_email_message(message, ticket, payload, files, logger=logger)
 def get_email(self, name):
     """Parse the fixture ``<name>.txt`` from TEST_EMAILS_DIR and return the
     result of ``EmailReplyParser.read``.
     """
     fixture_name = '%s.txt' % name
     fixture_path = os.path.join(TEST_EMAILS_DIR, fixture_name)
     with open(fixture_path) as fixture:
         raw_message = fixture.read()
     return EmailReplyParser.read(raw_message)
Пример #24
0
 def get_email(self, name):
     """Parse the fixture ``test/emails/<name>.txt`` and return the result of
     ``EmailReplyParser.read``.
     """
     # Use a context manager so the file handle is closed deterministically
     # instead of being left to the garbage collector.
     with open('test/emails/%s.txt' % name) as f:
         text = f.read()
     return EmailReplyParser.read(text)
Пример #25
0
def clean_email_body(raw_body):
    """
    Cleans an email's plain text body by stripping out any signatures.

    The body is split into fragments with EmailReplyParser; fragments flagged
    as signatures are dropped and the remaining fragment contents are joined
    with newlines.
    """
    parsed = EmailReplyParser.read(raw_body)
    # Join with a real newline character. A raw string r'\n' would insert the
    # two characters backslash and "n" between fragments.
    return '\n'.join(f.content for f in parsed.fragments if not f.signature)
Пример #26
0
	def set_content_and_type(self):
		"""Use the HTML body when there is one, otherwise the reply-parsed text body."""
		# Placeholder values; they remain only if the parser call below raises.
		self.content, self.content_type = '[Blank Email]', 'text/plain'

		if self.html_content:
			self.content, self.content_type = self.html_content, 'text/html'
			return

		# Plain text: parse it and double every newline.
		spaced_text = EmailReplyParser.read(self.text_content).text.replace("\n", "\n\n")
		self.content, self.content_type = spaced_text, 'text/plain'
Пример #27
0
 def __init__(self, email_text):
     """Parse the raw email text and keep both the message and its reply part."""
     self.email = message_from_string(email_text)
     body = self.email.get_payload()
     parsed = EmailReplyParser.read(body)
     self.reply_text = parsed.reply
Пример #28
0
def collect_data():
    """Download smart-reply training data from the mailbox.

    Searches "[Gmail]/All Mail" (read-only) for messages whose body mentions
    "@faqbot".  For each hit with at least two reply fragments, where the
    first fragment starts with "@faqbot <key>", the quoted lines of the
    second fragment are collected as the original text.  When the key is a
    known template, the (key, original) pair is appended to the training
    data, which is saved under 'smartreply_data' after every addition.
    """
    c = load_config('templates')
    templates = c['templates']

    training_data = []

    mail = imaplib2.IMAP4_SSL(IMAP_SERVER)
    mail.login(MAIL_USER, MAIL_PASSWORD)

    try:
        mail.select("[Gmail]/All Mail", readonly=True)

        _status, data = mail.search(None, '(BODY "%s")' % ("@faqbot"))

        id_list = data[0].split()
        total = len(id_list)

        for idx, r_id in enumerate(id_list):
            _, data = mail.fetch(r_id, "(RFC822)")

            # Single parenthesized argument: valid as a Python 2 print
            # statement and as a Python 3 print() call.
            print("%i / %i (%i%%)" % (idx, total,
                                      int(float(idx) / total * 100)))

            raw_email = "null"
            for d in data:
                if isinstance(d, tuple) and "RFC822" in d[0]:
                    raw_email = d[1]

            flanker_msg = mime.from_string(raw_email)

            body = "null"

            try:
                for part in flanker_msg.parts:
                    if str(part) == "(text/plain)":
                        body = part.body.encode('ascii', 'ignore')
            except Exception:
                # Best effort: a message whose parts cannot be read keeps
                # whatever body was found so far (possibly none).
                pass

            if body == "null":
                continue

            parsed_body = EmailReplyParser.read(body)
            if len(parsed_body.fragments) < 2:
                continue

            # The first fragment must read "@faqbot <key> ...".  Skip messages
            # with fewer than two tokens instead of failing on the index.
            command = parsed_body.fragments[0].content.split()
            if len(command) < 2 or command[0] != "@faqbot":
                continue

            key = command[1]

            # Keep the quoted lines of the original mail, minus the quote
            # markers, blank lines and lines starting with "On".
            lines = []
            for l in parsed_body.fragments[1].content.split('\n'):
                if l.startswith('> '):
                    tl = l.replace('>', '').strip()
                    if tl != '' and not tl.startswith('On'):
                        lines.append(l.replace('>', ''))

            original = '\n'.join(lines)

            # Now that we have this, let's make sure it's
            # valid and stuff and then save it.
            if key in templates:
                training_data.append((key, original))
                save_config(training_data, 'smartreply_data')
    finally:
        # Close the IMAP session even when processing fails midway.
        mail.logout()