예제 #1
0
    def execute(cls, options):
        """Run the requested extract/load stages, then clean up the dumps.

        ``options`` is first passed through ``cls.handle_options``; the value
        it returns is what every step below reads.

        * ``options.extract`` truthy -> ``MySQLExtractor(options).extract()``
        * ``options.load`` truthy -> ``MediawikiLoader(options).load()``
        * unless ``options.keep_dumps`` is truthy, ``options.dump_dir`` is
          deleted afterwards, whether or not a stage raised.

        Exceptions raised by a stage propagate to the caller.
        """
        # Kept local so this method does not rely on a module-level
        # ``import os`` being present.
        import os

        options = cls.handle_options(options)

        try:
            if options.extract:
                MySQLExtractor(options).extract()
            if options.load:
                MediawikiLoader(options).load()
        finally:
            # Only remove a directory that actually exists.  Calling rmtree
            # on a missing path raises, and because this runs in ``finally``
            # that error would replace whatever exception the extract/load
            # stage raised, hiding the real failure.
            if not options.keep_dumps and os.path.isdir(options.dump_dir):
                shutil.rmtree(options.dump_dir)
    def setUp(self):
        """Create ``self.options`` and a ``MySQLExtractor`` fed with canned data.

        ``setup_basic_test()`` runs first.  ``self.options`` is a ``mock.Mock``
        whose ``dump_dir`` points at ``<g.tmpdir>/w2m_test``.  The four
        data-access hooks ``_pages``, ``_history``, ``_talk`` and
        ``_attachments`` are then replaced on the ``MySQLExtractor`` class
        itself (not on an instance), and ``self.extractor`` is built from
        the patched class.  Nothing in this method restores the originals.
        """
        setup_basic_test()
        self.options = mock.Mock()
        self.options.dump_dir = os.path.join(g.tmpdir, 'w2m_test')

        def fake_pages(self):
            # Three pages, yielded in page_id order.
            catalogue = ((1, 'Test title'), (2, 'Main_Page'), (3, 'Test'))
            for page_id, title in catalogue:
                yield {'page_id': page_id, 'title': title}

        def fake_history(self, page_id):
            # (text, username) per revision; timestamps are 1, 2 in order.
            # The table is rebuilt on every call so each caller receives
            # fresh revision dicts.
            revisions_by_page = {
                1: (("Test", '******'), ("Test Text", '******')),
                2: (("Main_Page", '******'), ("Main_Page text", '******')),
                3: (("Some test text", ''), ("", '')),
            }
            numbered = enumerate(revisions_by_page[page_id], 1)
            for timestamp, (text, username) in numbered:
                yield {
                    'timestamp': timestamp,
                    'text': text,
                    'username': username,
                }

        def fake_talk(self, page_title):
            body = 'Talk for page %s.' % page_title
            return {'text': body, 'timestamp': 1, 'username': '******'}

        def fake_attachments(self, *args, **kwargs):
            # The unreachable ``yield`` makes this a generator function,
            # so calling it produces an iterator with no items.
            return
            yield

        replacements = (
            ('_pages', fake_pages),
            ('_history', fake_history),
            ('_talk', fake_talk),
            ('_attachments', fake_attachments),
        )
        for attr_name, stub in replacements:
            setattr(MySQLExtractor, attr_name, stub)

        self.extractor = MySQLExtractor(self.options)
예제 #3
0
    def setUp(self):
        """Prepare mocked options and a ``MySQLExtractor`` with stubbed hooks.

        After ``setup_basic_test()``, ``self.options`` becomes a
        ``mock.Mock`` with ``dump_dir`` set to ``<g.tmpdir>/w2m_test``.
        The class attributes ``_pages``, ``_history``, ``_talk`` and
        ``_attachments`` of ``MySQLExtractor`` are overwritten with the
        local stand-ins defined below, and ``self.extractor`` is created
        from the patched class.  The overwritten attributes are not
        restored here.
        """
        setup_basic_test()
        self.options = mock.Mock()
        self.options.dump_dir = os.path.join(g.tmpdir, 'w2m_test')

        def stub_pages(self):
            # Yields the three known pages in ascending page_id order.
            titles = ('Test title', 'Main_Page', 'Test')
            for page_id, title in enumerate(titles, 1):
                yield {'page_id': page_id, 'title': title}

        def stub_history(self, page_id):
            # Each page has two revisions, stored as (text, username);
            # the revision's position (1-based) is its timestamp.
            # Built per call so the yielded dicts are never shared.
            history_table = {
                1: [("Test", '******'), ("Test Text", '******')],
                2: [("Main_Page", '******'), ("Main_Page text", '******')],
                3: [("Some test text", ''), ("", '')],
            }
            entries = history_table[page_id]
            for position, (text, username) in enumerate(entries, 1):
                yield {
                    'timestamp': position,
                    'text': text,
                    'username': username,
                }

        def stub_talk(self, page_title):
            talk_text = 'Talk for page %s.' % page_title
            return {
                'text': talk_text,
                'timestamp': 1,
                'username': '******',
            }

        def stub_attachments(self, *args, **kwargs):
            # Empty generator: the ``yield`` is never reached but marks
            # this function as a generator function.
            return
            yield

        stubs = {
            '_pages': stub_pages,
            '_history': stub_history,
            '_talk': stub_talk,
            '_attachments': stub_attachments,
        }
        for hook_name in ('_pages', '_history', '_talk', '_attachments'):
            setattr(MySQLExtractor, hook_name, stubs[hook_name])

        self.extractor = MySQLExtractor(self.options)
예제 #4
0
class TestMySQLExtractor(object):
    """Checks the JSON files ``MySQLExtractor`` writes, using canned data."""

    def setUp(self):
        """Create ``self.options`` and a ``MySQLExtractor`` fed with canned data.

        ``self.options`` is a ``mock.Mock`` whose ``dump_dir`` is
        ``<g.tmpdir>/w2m_test``.  The hooks ``_pages``, ``_history``,
        ``_talk`` and ``_attachments`` are replaced on the
        ``MySQLExtractor`` class itself; nothing here restores them.
        """
        setup_basic_test()
        self.options = mock.Mock()
        self.options.dump_dir = os.path.join(g.tmpdir, 'w2m_test')

        def fake_pages(self):
            # Three pages, yielded in page_id order.
            catalogue = ((1, 'Test title'), (2, 'Main_Page'), (3, 'Test'))
            for page_id, title in catalogue:
                yield {'page_id': page_id, 'title': title}

        def fake_history(self, page_id):
            # (text, username) per revision; timestamps are 1, 2 in order.
            # Rebuilt on every call so each caller gets fresh dicts.
            revisions_by_page = {
                1: (("Test", '******'), ("Test Text", '******')),
                2: (("Main_Page", '******'), ("Main_Page text", '******')),
                3: (("Some test text", ''), ("", '')),
            }
            numbered = enumerate(revisions_by_page[page_id], 1)
            for timestamp, (text, username) in numbered:
                yield {
                    'timestamp': timestamp,
                    'text': text,
                    'username': username,
                }

        def fake_talk(self, page_title):
            body = 'Talk for page %s.' % page_title
            return {'text': body, 'timestamp': 1, 'username': '******'}

        def fake_attachments(self, *args, **kwargs):
            # The unreachable ``yield`` makes this a generator function,
            # so calling it produces an iterator with no items.
            return
            yield

        replacements = (
            ('_pages', fake_pages),
            ('_history', fake_history),
            ('_talk', fake_talk),
            ('_attachments', fake_attachments),
        )
        for attr_name, stub in replacements:
            setattr(MySQLExtractor, attr_name, stub)

        self.extractor = MySQLExtractor(self.options)

    def _read_dump(self, relative_path):
        """Return the parsed JSON file at ``relative_path`` under the dump dir."""
        full_path = os.path.join(self.options.dump_dir, relative_path)
        with open(full_path, 'r') as f:
            return json.load(f)

    def test_extract_pages(self):
        """Test that pages and edit history extracted properly"""
        self.extractor.extract_pages()

        # (page_id, title, ((text, username) for rev 1, rev 2)), listed in
        # the order the files are checked.  A revision's 1-based position is
        # both its timestamp and its file name.
        expected = (
            (1, 'Test title', (('Test', '******'), ('Test Text', '******'))),
            (2, 'Main_Page',
             (('Main_Page', '******'), ('Main_Page text', '******'))),
            (3, 'Test', (('Some test text', ''), ('', ''))),
        )
        for page_id, title, revisions in expected:
            for timestamp, (text, username) in enumerate(revisions, 1):
                dumped = self._read_dump(
                    'pages/%d/history/%d.json' % (page_id, timestamp))
                wanted = {
                    'timestamp': timestamp,
                    'text': text,
                    'page_id': page_id,
                    'title': title,
                    'username': username,
                }
                assert dumped == wanted

    def test_extract_talk(self):
        """Test that talk pages extracted properly."""
        page_ids = (1, 2, 3)

        # Extract every talk page first, then inspect the written files.
        for page_id in page_ids:
            self.extractor.extract_talk(
                {'page_id': page_id, 'title': 'Test %d' % page_id})

        for page_id in page_ids:
            dumped = self._read_dump('pages/%d/discussion.json' % page_id)
            assert dumped == {
                'text': 'Talk for page Test %d.' % page_id,
                'timestamp': 1,
                'username': '******',
            }
class TestMySQLExtractor(object):
    """Verifies the page-history and talk dumps written by ``MySQLExtractor``."""

    def setUp(self):
        """Prepare mocked options and a ``MySQLExtractor`` with stubbed hooks.

        ``self.options`` becomes a ``mock.Mock`` with ``dump_dir`` set to
        ``<g.tmpdir>/w2m_test``.  The class attributes ``_pages``,
        ``_history``, ``_talk`` and ``_attachments`` of ``MySQLExtractor``
        are overwritten with the stand-ins below and are not restored here.
        """
        setup_basic_test()
        self.options = mock.Mock()
        self.options.dump_dir = os.path.join(g.tmpdir, 'w2m_test')

        def stub_pages(self):
            # Yields the three known pages in ascending page_id order.
            titles = ('Test title', 'Main_Page', 'Test')
            for page_id, title in enumerate(titles, 1):
                yield {'page_id': page_id, 'title': title}

        def stub_history(self, page_id):
            # Each page has two revisions, stored as (text, username);
            # the revision's 1-based position is its timestamp.
            # Built per call so the yielded dicts are never shared.
            history_table = {
                1: [("Test", '******'), ("Test Text", '******')],
                2: [("Main_Page", '******'), ("Main_Page text", '******')],
                3: [("Some test text", ''), ("", '')],
            }
            entries = history_table[page_id]
            for position, (text, username) in enumerate(entries, 1):
                yield {
                    'timestamp': position,
                    'text': text,
                    'username': username,
                }

        def stub_talk(self, page_title):
            talk_text = 'Talk for page %s.' % page_title
            return {
                'text': talk_text,
                'timestamp': 1,
                'username': '******',
            }

        def stub_attachments(self, *args, **kwargs):
            # Empty generator: the ``yield`` is never reached but marks
            # this function as a generator function.
            return
            yield

        stubs = {
            '_pages': stub_pages,
            '_history': stub_history,
            '_talk': stub_talk,
            '_attachments': stub_attachments,
        }
        for hook_name in ('_pages', '_history', '_talk', '_attachments'):
            setattr(MySQLExtractor, hook_name, stubs[hook_name])

        self.extractor = MySQLExtractor(self.options)

    def _load_json(self, *path_parts):
        """Parse the JSON file found at ``path_parts`` joined with '/' under the dump dir."""
        target = os.path.join(self.options.dump_dir, '/'.join(path_parts))
        with open(target, 'r') as f:
            return json.load(f)

    def test_extract_pages(self):
        """Test that pages and edit history extracted properly"""
        self.extractor.extract_pages()

        # One entry per history file, in the order the files are checked.
        cases = [
            # rev 1 and rev 2 of page 1
            (1, 1, 'Test title', 'Test', '******'),
            (1, 2, 'Test title', 'Test Text', '******'),
            # rev 1 and rev 2 of page 2
            (2, 1, 'Main_Page', 'Main_Page', '******'),
            (2, 2, 'Main_Page', 'Main_Page text', '******'),
            # rev 1 and rev 2 of page 3
            (3, 1, 'Test', 'Some test text', ''),
            (3, 2, 'Test', '', ''),
        ]
        for page_id, rev, title, text, username in cases:
            page = self._load_json(
                'pages', str(page_id), 'history', '%d.json' % rev)
            res_page = dict(
                timestamp=rev,
                text=text,
                page_id=page_id,
                title=title,
                username=username,
            )
            assert page == res_page

    def test_extract_talk(self):
        """Test that talk pages extracted properly."""
        numbers = [1, 2, 3]

        # All talk pages are extracted before any file is read back.
        for number in numbers:
            talk_source = {'page_id': number, 'title': 'Test %d' % number}
            self.extractor.extract_talk(talk_source)

        for number in numbers:
            page = self._load_json('pages', str(number), 'discussion.json')
            expected_talk = dict(
                text='Talk for page Test %d.' % number,
                timestamp=1,
                username='******',
            )
            assert page == expected_talk