示例#1
0
    def test_ignore_messages(self):
        """Check that messages lacking mandatory fields are skipped"""

        mbox = MBox('http://example.com/', self.tmp_error_path)
        fetched = list(mbox.fetch())

        # Only two of the messages stored in the mbox are valid
        self.assertEqual(len(fetched), 2)

        body_text = ("Hi!\n\nA message in English, with a signature "
                     "with a different encoding.\n\nregards, G?ran"
                     "\n")
        expected = {
            'From': 'goran at domain.com ( Göran Lastname )',
            'Date': 'Wed, 01 Dec 2010 14:26:40 +0100',
            'Subject': '[List-name] Protocol Buffers anyone?',
            'Message-ID': '<*****@*****.**>',
            'unixfrom': 'goran at domain.com  Wed Dec  1 08:26:40 2010',
            'body': {
                'plain': body_text,
            },
        }

        # Both messages must yield the very same data: on the second
        # one, the only change is that 'Message-id' is replaced
        # by 'Message-ID'
        for item in fetched:
            self.assertDictEqual(item['data'], expected)
示例#2
0
    def test_fetch(self):
        """Check that every message of a set of mbox files is parsed"""

        origin = 'http://example.com/'
        mbox = MBox(origin, self.tmp_path)
        fetched = list(mbox.fetch())

        # Each entry is (Message-ID, uuid, updated_on), in fetch order
        expected = [
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', 'bd0185317b013beb21ad3ea04635de3db72496ad', 1095843820.0),
            ('<*****@*****.**>', '51535703010a3e63d5272202942c283394cdebca', 1205746505.0),
            ('<019801ca633f$f4376140$dca623c0$@[email protected]>', '302e314c07242bb4750351286862f49e758f3e17', 1257992964.0),
            ('<*****@*****.**>', 'ddda42422c55d08d56c017a6f128fcd7447484ea', 1043881350.0),
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', 'ad3116ae93c0df50436f7c84bfc94000e990996c', 1421328145.0),
            ('<*****@*****.**>', '4e255acab6442424ecbf05cb0feb1eccb587f7de', 1030123489.0),
        ]

        # Lengths are checked first, so pairing the sequences below
        # cannot silently drop any item
        self.assertEqual(len(fetched), len(expected))

        for item, (message_id, uuid, updated_on) in zip(fetched, expected):
            self.assertEqual(item['data']['Message-ID'], message_id)
            self.assertEqual(item['origin'], origin)
            self.assertEqual(item['uuid'], uuid)
            self.assertEqual(item['updated_on'], updated_on)
            self.assertEqual(item['category'], 'message')
            self.assertEqual(item['tag'], origin)
示例#3
0
    def test_fetch_from_date(self):
        """Check that only messages since a given date are returned"""

        origin = 'http://example.com/'
        since = datetime.datetime(2008, 1, 1)

        mbox = MBox(origin, self.tmp_path)
        fetched = list(mbox.fetch(from_date=since))

        # Each entry is (Message-ID, uuid, updated_on), in fetch order
        expected = [
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', '51535703010a3e63d5272202942c283394cdebca', 1205746505.0),
            ('<019801ca633f$f4376140$dca623c0$@[email protected]>', '302e314c07242bb4750351286862f49e758f3e17', 1257992964.0),
            ('<*****@*****.**>', '86315b479b4debe320b59c881c1e375216cbf333', 1291210000.0),
            ('<*****@*****.**>', 'ad3116ae93c0df50436f7c84bfc94000e990996c', 1421328145.0),
        ]

        # Lengths are checked first, so pairing the sequences below
        # cannot silently drop any item
        self.assertEqual(len(fetched), len(expected))

        for item, (message_id, uuid, updated_on) in zip(fetched, expected):
            self.assertEqual(item['data']['Message-ID'], message_id)
            self.assertEqual(item['origin'], origin)
            self.assertEqual(item['uuid'], uuid)
            self.assertEqual(item['updated_on'], updated_on)
            self.assertEqual(item['category'], 'message')
            self.assertEqual(item['tag'], origin)
示例#4
0
    def test_ignore_file_errors(self):
        """Files with IO errors should be ignored

        Two mbox archives are copied to a temporary directory and one
        of them is made inaccessible; fetching from that directory
        must return only the message stored in the readable archive.
        """
        base_dir = os.path.dirname(os.path.abspath(__file__))
        tmp_path_ign = tempfile.mkdtemp(prefix='perceval_')

        # The try/finally guarantees the temporary directory is removed
        # even when one of the assertions below fails
        try:
            shutil.copy(
                os.path.join(base_dir, 'data/mbox/mbox_single.mbox'),
                tmp_path_ign)
            shutil.copy(
                os.path.join(base_dir, 'data/mbox/mbox_multipart.mbox'),
                tmp_path_ign)

            # Update file mode to make it unable to access
            # NOTE(review): a privileged user can presumably still read
            # a file with mode 000 - confirm the test is not run as root
            os.chmod(os.path.join(tmp_path_ign, 'mbox_multipart.mbox'), 0o000)

            backend = MBox('http://example.com/', tmp_path_ign)
            messages = list(backend.fetch())

            # Only one message is read
            self.assertEqual(len(messages), 1)
            self.assertEqual(messages[0]['data']['Message-ID'],
                             '<*****@*****.**>')
            self.assertEqual(messages[0]['data']['Date'],
                             'Wed, 01 Dec 2010 14:26:40 +0100')
        finally:
            shutil.rmtree(tmp_path_ign)
示例#5
0
 def list_mailers(self, url, directory="files/mbox"):
     """Return the 'From' header of every message found in a mbox directory.

     :param url: value passed to MBox as its 'uri' argument
     :param directory: value passed to MBox as its 'dirpath' argument

     :returns: list with the 'From' field of each fetched message,
         in fetch order (duplicates are kept)
     """
     repo = MBox(uri=url, dirpath=directory)
     return [message['data']['From'] for message in repo.fetch()]
示例#6
0
    def test_search_fields(self):
        """Check that the item id stored in 'search_fields' is properly set"""

        mbox = MBox('http://example.com/', self.tmp_path)

        # Fetch everything before running any assertion
        fetched = list(mbox.fetch(from_date=None))

        for item in fetched:
            expected_id = mbox.metadata_id(item['data'])
            self.assertEqual(expected_id, item['search_fields']['item_id'])
    def test_fetch_exception(self, mock_str_to_datetime):
        """Test whether an exception is thrown when the fetch_items method fails"""

        # Every call to the mocked function raises an exception
        mock_str_to_datetime.side_effect = Exception

        mbox = MBox('http://example.com/', self.tmp_path)

        # The generator has to be consumed for the failure to show up
        with self.assertRaises(Exception):
            _ = list(mbox.fetch(from_date=None))
    def test_ignore_file_errors(self):
        """Files with IO errors should be ignored

        Two mbox archives are copied to a temporary directory and
        'MBox._copy_mbox' is mocked to raise an OSError for one of
        them; fetching must return only the message stored in the
        other archive.
        """
        base_dir = os.path.dirname(os.path.abspath(__file__))
        tmp_path_ign = tempfile.mkdtemp(prefix='perceval_')
        error_file = os.path.join(tmp_path_ign, 'mbox_multipart.mbox')

        def copy_mbox_side_effect(*args, **kwargs):
            """Copy a mbox archive or raise IO error for 'mbox_multipart.mbox' archive"""

            mbox = args[0]

            if mbox.filepath == error_file:
                raise OSError('Mock error')

            # mkstemp creates and opens the file atomically, which avoids
            # the race condition of the deprecated mktemp
            fd, tmp_path = tempfile.mkstemp(prefix='perceval_')

            # The descriptor is wrapped first so it is closed even when
            # entering the mbox container fails
            with os.fdopen(fd, mode='wb') as f_out:
                with mbox.container as f_in:
                    for line in f_in:
                        f_out.write(line)
            return tmp_path

        # The try/finally guarantees the temporary directory is removed
        # even when one of the assertions below fails
        try:
            shutil.copy(
                os.path.join(base_dir, 'data/mbox/mbox_single.mbox'),
                tmp_path_ign)
            shutil.copy(
                os.path.join(base_dir, 'data/mbox/mbox_multipart.mbox'),
                tmp_path_ign)

            # Mock 'copy_mbox' method for forcing to raise an OSError
            # with file 'data/mbox/mbox_multipart.mbox' to check if
            # the code ignores this file
            with unittest.mock.patch('perceval.backends.core.mbox.MBox._copy_mbox'
                                     ) as mock_copy_mbox:
                mock_copy_mbox.side_effect = copy_mbox_side_effect

                backend = MBox('http://example.com/', tmp_path_ign)
                messages = list(backend.fetch())

                # Only one message is read
                self.assertEqual(len(messages), 1)
                self.assertEqual(messages[0]['data']['Message-ID'],
                                 '<*****@*****.**>')
                self.assertEqual(messages[0]['data']['Date'],
                                 'Wed, 01 Dec 2010 14:26:40 +0100')
        finally:
            shutil.rmtree(tmp_path_ign)
示例#9
0
    def get_content(self, max_items: Optional[int] = None) -> List[str]:
        """
        Collect only the bodies of the messages stored in the MBOX archive,
        ignoring all headers. The HTML body is preferred; the plain text
        body is used when there is no HTML one, and an empty string when
        there is neither.
        :param max_items: maximum number of messages; a falsy value
            (None or 0) means no limit
        :return: list with the body of each message of the MBOX archive
        """
        archive = MBox(self.mbox_path, self.mbox_path)
        contents = []

        for position, message in enumerate(archive.fetch()):
            limit_reached = bool(max_items) and position >= max_items
            if limit_reached:
                break

            body = message['data']['body']
            contents.append(body.get('html', body.get('plain', '')))

        return contents
示例#10
0
# Directory with the mbox archives: the archives to be analyzed need to be
# there before running the script
mbox_dir = 'archives'
# ElasticSearch instance (url)
es = elasticsearch.Elasticsearch(['http://localhost:9200/'])

# Create the 'messages' index in ElasticSearch; if the request is rejected
# a message is printed and the script stops
try:
    es.indices.create('messages')
except elasticsearch.exceptions.RequestError:
    print('Index already exisits, remove it before running this script again.')
    exit()

# Create a mbox object, using mbox_uri as label, mbox_dir as directory to scan
# NOTE(review): mbox_uri is not defined in this part of the script -
# presumably it is set earlier; confirm
repo = MBox(uri=mbox_uri, dirpath=mbox_dir)

# Fetch all messages as an iterator, and iterate over it uploading a summary
# of each message to ElasticSearch
print('Analyzing mbox archives...')
for message in repo.fetch():
    # Create the object (dictionary) to upload to ElasticSearch
    summary = {
        'from': message['data']['From'],
        'subject': message['data']['Subject'],
        'date': email.utils.parsedate_to_datetime(message['data']['Date'])
    }
    # Print one dot per message as a progress indicator
    print('.', end='')
    # Upload the object to ElasticSearch
    es.index(index='messages', doc_type='summary', body=summary)

print('\nCreated new index with commits.')
示例#11
0
 def getmbox(self, mbox_files):
     """Return the result of fetching from the './mboxes' directory.

     :param mbox_files: value passed to MBox as its 'uri' argument

     :returns: whatever 'MBox.fetch' returns
     """
     return MBox(uri=mbox_files, dirpath='./mboxes').fetch()
示例#12
0
 def numMails(self, url, directory="files/mbox"):
     """Return how many messages are fetched from a mbox directory.

     :param url: value passed to MBox as its 'uri' argument
     :param directory: value passed to MBox as its 'dirpath' argument

     :returns: number of fetched messages
     """
     archive = MBox(uri=url, dirpath=directory)
     # Count the messages without keeping them in memory
     return sum(1 for _ in archive.fetch())