Пример #1
0
    def test_fetch_from_date(self):
        """Check the messages returned when fetching from a given date"""

        origin = 'http://example.com/'
        since = datetime.datetime(2008, 1, 1)

        backend = MBox(origin, self.tmp_path)
        messages = list(backend.fetch(from_date=since))

        # One (Message-ID, uuid, updated_on) tuple per expected message, in order
        expected = [
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', '51535703010a3e63d5272202942c283394cdebca', 1205746505.0),
            ('<019801ca633f$f4376140$dca623c0$@[email protected]>', '302e314c07242bb4750351286862f49e758f3e17', 1257992964.0),
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', 'ad3116ae93c0df50436f7c84bfc94000e990996c', 1421328145.0),
        ]

        self.assertEqual(len(messages), len(expected))

        # Both sequences have the same length here, so zip() pairs every item
        for message, (message_id, uuid, updated_on) in zip(messages, expected):
            self.assertEqual(message['data']['Message-ID'], message_id)
            self.assertEqual(message['origin'], origin)
            self.assertEqual(message['uuid'], uuid)
            self.assertEqual(message['updated_on'], updated_on)
            self.assertEqual(message['category'], 'message')
            self.assertEqual(message['tag'], origin)
Пример #2
0
    def test_fetch(self):
        """Check the messages fetched from a directory of mbox files"""

        url = 'http://example.com/'

        backend = MBox(url, self.tmp_path)
        messages = list(backend.fetch())

        # Expected values per message: (Message-ID, uuid, updated_on)
        expected = [
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', 'bd0185317b013beb21ad3ea04635de3db72496ad', 1095843820.0),
            ('<*****@*****.**>', '51535703010a3e63d5272202942c283394cdebca', 1205746505.0),
            ('<019801ca633f$f4376140$dca623c0$@[email protected]>', '302e314c07242bb4750351286862f49e758f3e17', 1257992964.0),
            ('<*****@*****.**>', 'ddda42422c55d08d56c017a6f128fcd7447484ea', 1043881350.0),
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', 'ad3116ae93c0df50436f7c84bfc94000e990996c', 1421328145.0),
            ('<*****@*****.**>', '4e255acab6442424ecbf05cb0feb1eccb587f7de', 1030123489.0),
        ]

        self.assertEqual(len(messages), len(expected))

        for position, message in enumerate(messages):
            message_id, uuid, updated_on = expected[position]

            self.assertEqual(message['data']['Message-ID'], message_id)
            self.assertEqual(message['origin'], url)
            self.assertEqual(message['uuid'], uuid)
            self.assertEqual(message['updated_on'], updated_on)
            self.assertEqual(message['category'], 'message')
            self.assertEqual(message['tag'], url)
Пример #3
0
    def test_ignore_file_errors(self):
        """Files with IO errors should be ignored"""

        tmp_path_ign = tempfile.mkdtemp(prefix='perceval_')

        # Everything after the directory is created runs under try/finally,
        # so the directory is removed even when an assertion or the fetch fails
        try:
            base_dir = os.path.dirname(os.path.abspath(__file__))

            shutil.copy(
                os.path.join(base_dir, 'data/mbox/mbox_single.mbox'),
                tmp_path_ign)
            shutil.copy(
                os.path.join(base_dir, 'data/mbox/mbox_multipart.mbox'),
                tmp_path_ign)

            # Update file mode to make it unable to access
            # NOTE(review): a privileged user may still be able to read
            # the file despite mode 000 - confirm how the tests are run
            os.chmod(os.path.join(tmp_path_ign, 'mbox_multipart.mbox'), 0o000)

            backend = MBox('http://example.com/', tmp_path_ign)
            messages = [m for m in backend.fetch()]

            # Only one message is read
            self.assertEqual(len(messages), 1)
            self.assertEqual(messages[0]['data']['Message-ID'],
                             '<*****@*****.**>')
            self.assertEqual(messages[0]['data']['Date'],
                             'Wed, 01 Dec 2010 14:26:40 +0100')
        finally:
            shutil.rmtree(tmp_path_ign)
Пример #4
0
    def test_ignore_messages(self):
        """Check that messages without mandatory fields are left out"""

        backend = MBox('http://example.com/', self.tmp_error_path)
        messages = list(backend.fetch())

        # Only two messages are expected to come out of the mbox
        self.assertEqual(len(messages), 2)

        plain_text = ("Hi!\n\nA message in English, with a signature "
                      "with a different encoding.\n\nregards, G?ran"
                      "\n")
        expected = {
            'From': 'goran at domain.com ( Göran Lastname )',
            'Date': 'Wed, 01 Dec 2010 14:26:40 +0100',
            'Subject': '[List-name] Protocol Buffers anyone?',
            'Message-ID': '<*****@*****.**>',
            'unixfrom': 'goran at domain.com  Wed Dec  1 08:26:40 2010',
            'body': {
                'plain': plain_text,
            },
        }

        # Both messages must produce the same data; per the original
        # note, on the second one 'Message-id' is replaced by 'Message-ID'
        for item in messages[:2]:
            self.assertDictEqual(item['data'], expected)
Пример #5
0
 def list_mailers(self, url, directory="files/mbox"):
     """Return the 'From' header of every message fetched from the mbox files.

     :param url: URI passed to the MBox backend
     :param directory: directory passed to the MBox backend as `dirpath`
     :return: list with the 'From' value of each fetched message,
         in the order the backend yields them
     """
     repo = MBox(uri=url, dirpath=directory)
     return [message['data']['From'] for message in repo.fetch()]
Пример #6
0
    def test_search_fields(self):
        """Check that 'item_id' in search_fields matches the metadata id of each item"""

        backend = MBox('http://example.com/', self.tmp_path)
        fetched = list(backend.fetch(from_date=None))

        for item in fetched:
            expected_id = backend.metadata_id(item['data'])
            self.assertEqual(expected_id, item['search_fields']['item_id'])
Пример #7
0
    def test_fetch_exception(self, mock_str_to_datetime):
        """Check that an exception raised while fetching reaches the caller"""

        # Make the mocked function fail every time it is called
        mock_str_to_datetime.side_effect = Exception

        backend = MBox('http://example.com/', self.tmp_path)

        with self.assertRaises(Exception):
            # The generator has to be consumed for the failure to show up
            list(backend.fetch(from_date=None))
Пример #8
0
    def test_parse_mbox(self):
        """Check the message obtained from parsing a mbox file with one message"""

        parsed = list(MBox.parse_mbox(self.files['single']))

        self.assertEqual(len(parsed), 1)

        # Copy the parsed message into a plain dict before comparing
        message = dict(parsed[0].items())

        plain_text = ("Hi!\n\nA message in English, with a signature "
                      "with a different encoding.\n\nregards, G?ran"
                      "\n\n\n")
        expected = {
            'From': 'goran at domain.com ( Göran Lastname )',
            'Date': 'Wed, 01 Dec 2010 14:26:40 +0100',
            'Subject': '[List-name] Protocol Buffers anyone?',
            'Message-ID': '<*****@*****.**>',
            'unixfrom': 'goran at domain.com  Wed Dec  1 08:26:40 2010',
            'body': {
                'plain': plain_text,
            },
        }

        self.assertDictEqual(message, expected)
Пример #9
0
    def test_ignore_file_errors(self):
        """Files with IO errors should be ignored"""

        tmp_path_ign = tempfile.mkdtemp(prefix='perceval_')

        def copy_mbox_side_effect(*args, **kwargs):
            """Copy a mbox archive or raise IO error for 'mbox_multipart.mbox' archive"""

            error_file = os.path.join(tmp_path_ign, 'mbox_multipart.mbox')
            mbox = args[0]

            if mbox.filepath == error_file:
                raise OSError('Mock error')

            # mkstemp() creates the file itself and hands back an open
            # descriptor, avoiding the name race of the deprecated mktemp()
            fd, tmp_path = tempfile.mkstemp(prefix='perceval_')

            # The descriptor is wrapped first so it is always closed,
            # even if entering the mbox container fails
            with os.fdopen(fd, mode='wb') as f_out:
                with mbox.container as f_in:
                    for line in f_in:
                        f_out.write(line)
            return tmp_path

        # Everything after the directory is created runs under try/finally,
        # so the directory is removed even when an assertion or the fetch fails
        try:
            base_dir = os.path.dirname(os.path.abspath(__file__))

            shutil.copy(
                os.path.join(base_dir, 'data/mbox/mbox_single.mbox'),
                tmp_path_ign)
            shutil.copy(
                os.path.join(base_dir, 'data/mbox/mbox_multipart.mbox'),
                tmp_path_ign)

            # Mock 'copy_mbox' method for forcing to raise an OSError
            # with file 'data/mbox/mbox_multipart.mbox' to check if
            # the code ignores this file
            with unittest.mock.patch('perceval.backends.core.mbox.MBox._copy_mbox'
                                     ) as mock_copy_mbox:
                mock_copy_mbox.side_effect = copy_mbox_side_effect

                backend = MBox('http://example.com/', tmp_path_ign)
                messages = [m for m in backend.fetch()]

                # Only one message is read
                self.assertEqual(len(messages), 1)
                self.assertEqual(messages[0]['data']['Message-ID'],
                                 '<*****@*****.**>')
                self.assertEqual(messages[0]['data']['Date'],
                                 'Wed, 01 Dec 2010 14:26:40 +0100')
        finally:
            shutil.rmtree(tmp_path_ign)
Пример #10
0
    def get_content(self, max_items: Optional[int] = None) -> List[str]:
        """
        Collect only the bodies of the messages in the MBOX storage, ignoring all headers
        :param max_items: Maximum number of messages; a falsy value (None or 0) means no limit
        :return: list with the body of each message, the 'html' part when
            present, otherwise the 'plain' part, otherwise an empty string
        """
        archive = MBox(self.mbox_path, self.mbox_path)
        contents = []

        for position, message in enumerate(archive.fetch()):
            if max_items and position >= max_items:
                break

            body = message['data']['body']
            plain_fallback = body.get('plain', '')
            contents.append(body.get('html', plain_fallback))

        return contents
Пример #11
0
 def getAllMessage(self,repo):
     """Print the main content type of each message in 'dm.mbox' (work in progress).

     The mbox path is derived from the location of this file. `repo.fetch()`
     is called at the end and its result is discarded.
     """
     print("\nWORK IN PROGRESS !")
     here = os.path.dirname(os.path.realpath(__file__))
     mbox_dir = here.replace("/python/testPerceval/classes", "/data/mbox")
     parsed_messages = MBox.parse_mbox(f"{mbox_dir}/dm.mbox")
     for parsed in parsed_messages:
         maintype = parsed.get_content_maintype()
         print(maintype)
     repo.fetch()
Пример #12
0
    def test_parse_unixfrom_decoding_error(self):
        """Check the 'unixfrom' value parsed from a mbox with encoding errors on its from header"""

        parsed = list(MBox.parse_mbox(self.files['unixfrom']))

        self.assertEqual(len(parsed), 1)

        expected_unixfrom = "christian at “example.org”  Thu Jan 15 13:22:25 2015"
        self.assertEqual(parsed[0]['unixfrom'], expected_unixfrom)
Пример #13
0
    def test_parse_unknown_encoding_mbox(self):
        """Check the 'From' value parsed from a mbox that contains an unknown encoding"""

        parsed = list(MBox.parse_mbox(self.files['unknown']))

        self.assertEqual(len(parsed), 1)

        expected_from = '"\udcc3\udc94\udcc2\udcac\udcc2\udcb4\udcc3\udc8f" <*****@*****.**>'
        self.assertEqual(parsed[0]['From'], expected_from)
Пример #14
0
    def test_initialization(self):
        """Check the attributes set when the backend is created"""

        url = 'http://example.com/'

        backend = MBox(url, self.tmp_path, tag='test')

        self.assertEqual(backend.uri, url)
        self.assertEqual(backend.dirpath, self.tmp_path)
        self.assertEqual(backend.origin, url)
        self.assertEqual(backend.tag, 'test')

        # Without a tag, or with an empty one, the tag is
        # expected to take the value given in uri
        for extra in ({}, {'tag': ''}):
            backend = MBox(url, self.tmp_path, **extra)
            self.assertEqual(backend.origin, url)
            self.assertEqual(backend.tag, url)
Пример #15
0
    def test_parse_complex_mbox(self):
        """Check the headers and body parsed from a mbox file with two messages"""

        parsed = list(MBox.parse_mbox(self.files['complex']))

        self.assertEqual(len(parsed), 2)

        # First message: number of fields, then header by header
        first = dict(parsed[0].items())
        self.assertEqual(len(first.keys()), 34)

        first_headers = [
            ('Message-ID', '<*****@*****.**>'),
            ('Date', 'Wed, 22 Sep 2004 02:03:40 -0700'),
            ('From', '"Eugenia Loli-Queru" <*****@*****.**>'),
            ('To', '<*****@*****.**>, <*****@*****.**>'),
            ('Cc', None),
            ('Subject', 'Re: Revisiting the Gnome Bindings'),
            ('unixfrom', '[email protected]  Wed Sep 22 05:05:28 2004'),
        ]
        for header, value in first_headers:
            self.assertEqual(first[header], value)

        plain_text = (
            ">I don't think it's fair to blame the Foundation [...]\n"
            ">of packaging since it's really not (just) a case [...]\n"
            ">marketing.\n\n"
            "No matter what is really to blame, it ultimately [...]\n\n"
            "[...]\n\n"
            "Rgds,\n"
            "Eugenia\n"
        )
        self.assertDictEqual(first['body'], {'plain': plain_text})

        # Second message: only the number of fields and the headers are checked
        second = dict(parsed[1].items())
        self.assertEqual(len(second.keys()), 35)

        second_headers = [
            ('Message-ID', '<*****@*****.**>'),
            ('Date', 'Mon, 17 Mar 2008 10:35:05 +0100'),
            ('From', '[email protected] (Danilo  Šegan )'),
            ('To', 'Simos Xenitellis <*****@*****.**>'),
            ('Cc',
             '[email protected], '
             '"Nikolay V. Shmyrev" <*****@*****.**>,\n\t'
             'Brian Nitz <*****@*****.**>, '
             'Bastien Nocera <*****@*****.**>'),
            ('Subject', 'Re: Low memory hacks'),
            ('unixfrom', '[email protected]  Mon Mar 17 09:35:25 2008'),
        ]
        for header, value in second_headers:
            self.assertEqual(second[header], value)
Пример #16
0
    def test_parse_multipart_mbox(self):
        """Check the plain and html bodies parsed from messages with a multipart body"""

        parsed = list(MBox.parse_mbox(self.files['multipart']))

        self.assertEqual(len(parsed), 2)

        # The length was checked above, so unpacking cannot fail here
        multipart, no_encoding = parsed

        # Multipart message
        expected_plain = (
            'technology.esl Committers,\n\n'
            'This automatically generated message marks the successful completion of\n'
            'voting for Chuwei Huang to receive full Committer status on the\n'
            'technology.esl project. The next step is for the PMC to approve this vote,\n'
            'followed by the EMO processing the paperwork and provisioning the account.\n\n\n\n'
            'Vote summary: 4/0/0 with 0 not voting\n\n'
            '  +1  Thomas Guiu\n\n'
            '  +1  Jin Liu\n\n'
            '  +1  Yves YANG\n\n'
            '  +1  Bo Zhou\n\n\n\n'
            'If you have any questions, please do not hesitate to contact your project\n'
            'lead, PMC member, or the EMO <*****@*****.**>\n\n\n\n\n\n'
        )
        plain_body = multipart['body']['plain']
        html_body = multipart['body']['html']
        self.assertEqual(plain_body, expected_plain)
        self.assertEqual(len(html_body), 3103)

        # Multipart message without defined encoding
        expected_plain = (
            'I am fairly new to eclipse. I am evaluating the use of eclipse for a generic\n'
            'UI framework that is not necessarily related to code generation.\n'
            'Eclipse is very flexible and adding functionality seems straightforward. I\n'
            'can still use the project concept for what I need but there are things in\n'
            'the Workbench window that I don\'t want. For example the Open perspective\n'
            'icon, or some of the menus, like the Windows and project menu .\n\n'
            'I understand that by using retargetable actions I can have my view taking\n'
            'over most of the actions, but I could not figure out how to block the core\n'
            'plug-in to put their own actions. In the workbench plug-in (org.eclipse.ui)\n'
            'I could not find where menus are defined and where actionsviews for all\n'
            'generic toolbars are defined.\n\nHow do I do this?\nCan this be done?\n'
            'Is anybody using eclipse as a generic UI framework?\n\nI appreciate any help.\n\n'
            'Thanks,\n\nDaniel Nehren\n\n'
        )
        plain_body = no_encoding['body']['plain']
        html_body = no_encoding['body']['html']
        self.assertEqual(plain_body, expected_plain)
        self.assertEqual(len(html_body), 1557)
Пример #17
0
from perceval.backends.core.mbox import MBox
from elasticsearch import Elasticsearch
import hashlib
from jwzthreading import thread, Message
# Elasticsearch client created without arguments (library defaults)
es = Elasticsearch()

# URI handed to the MBox backend below
src_url = 'https://lists.xenproject.org/archives/html/mbox/'
# Local directory handed to the MBox backend as `dirpath`
dest_dir = '/home/heather/dev/xen/mboxes'

# Perceval MBox backend built from the two values above
repo = MBox(uri=src_url, dirpath=dest_dir)
# NOTE(review): opened for writing when the module is loaded and never
# closed in the visible code - confirm it is closed elsewhere
logfile = open('mboxAnalysis.log', 'w')


def create_index():
    """Create the 'xenmbox' index and put the mapping for 'data.Message-ID'.

    The mapping declares the 'Message-ID' field, nested under 'data',
    with type 'keyword'. Both calls go through the module-level `es` client.
    """
    index_name = 'xenmbox'

    # Built from the innermost field outwards
    message_id_field = {"type": "keyword"}
    data_field = {"properties": {"Message-ID": message_id_field}}
    mapping = {"properties": {"data": data_field}}

    # NOTE(review): ignore=400 presumably keeps an already existing
    # index from being reported as an error - confirm
    es.indices.create(index=index_name, ignore=400)
    es.indices.put_mapping(
        index=index_name,
        doc_type='unknown',
        body=mapping,
        update_all_types=True,
    )
Пример #18
0
 def numMails(self, url, directory="files/mbox"):
     """Count the messages fetched from the mbox files.

     :param url: URI passed to the MBox backend
     :param directory: directory passed to the MBox backend as `dirpath`
     :return: number of messages yielded by the backend
     """
     archive = MBox(uri=url, dirpath=directory)
     return sum(1 for _ in archive.fetch())
Пример #19
0
 def createRepo(self, dir, mailList):
     """Build a MBox backend for a mailing list.

     :param dir: directory passed to the MBox backend as `dirpath`
     :param mailList: list name; '@iova.net' is appended to form the URI
     :return: the MBox instance
     """
     return MBox(uri=mailList + '@iova.net', dirpath=dir)
Пример #20
0
 def getmbox(self, mbox_files):
     """Return what `fetch()` gives for a MBox backend over './mboxes'.

     :param mbox_files: value passed to the MBox backend as `uri`
     """
     return MBox(uri=mbox_files, dirpath='./mboxes').fetch()
Пример #21
0
 def createRepo(self):
     """Build a MBox backend with a fixed URI over the '../mails' directory."""
     return MBox(uri='*****@*****.**', dirpath='../mails')
Пример #22
0
    def test_has_resuming(self):
        """Check that has_resuming reports True"""

        resuming = MBox.has_resuming()
        self.assertEqual(resuming, True)
Пример #23
0
    def test_has_caching(self):
        """Check that has_caching reports False"""

        caching = MBox.has_caching()
        self.assertEqual(caching, False)
Пример #24
0
    def test_parse_iso8859_encoding_mbox(self):
        """Check that no exception is raised when parsing a mbox that contains an iso 8859 encoding"""

        # Consuming the whole iterator is the test: it passes if nothing is raised
        list(MBox.parse_mbox(self.files['iso8859']))
Пример #25
0
# URI used as the label of the mbox archives being analyzed
mbox_uri = 'http://mail-archives.apache.org/mod_mbox/httpd-announce/'
# Directory where Perceval looks for the mbox archives;
# the archives to analyze must be there before running the script
mbox_dir = 'archives'
# ElasticSearch instance (url)
es = elasticsearch.Elasticsearch(['http://localhost:9200/'])

# Create the 'messages' index in ElasticSearch
try:
    es.indices.create('messages')
except elasticsearch.exceptions.RequestError:
    print('Index already exisits, remove it before running this script again.')
    # Raise SystemExit directly: exit() is a helper added by the site module
    # for interactive sessions and is not guaranteed to exist in a script
    raise SystemExit

# Create a mbox object, using mbox_uri as label, mbox_dir as directory to scan
repo = MBox(uri=mbox_uri, dirpath=mbox_dir)

print('Analyzing mbox archives...')
# Fetch all messages as an iterator, uploading each one to ElasticSearch
for message in repo.fetch():
    # Create the object (dictionary) to upload to ElasticSearch
    summary = {
        'from': message['data']['From'],
        'subject': message['data']['Subject'],
        'date': email.utils.parsedate_to_datetime(message['data']['Date'])
    }
    # One dot per message as a progress indicator
    print('.', end='')
    # Upload the object to ElasticSearch
    es.index(index='messages', doc_type='summary', body=summary)