예제 #1
0
    def test_err_missing_dirs(self):
        """Exercise error handling while the collection tree is torn down.

        Removes the static, archive and index directories of ``foo``, puts a
        plain file where the index directory was, then deletes the whole
        collections root.
        """
        colls_root = os.path.join(self.root_dir, COLLECTIONS)
        foo_dir = os.path.join(colls_root, 'foo')

        # A missing static dir is ignorable, so nothing is asserted for it.
        shutil.rmtree(os.path.join(foo_dir, 'static'))

        # Without the archive dir, adding a warc must raise.
        shutil.rmtree(os.path.join(foo_dir, ARCHIVE_DIR))
        with raises(IOError):
            main(['add', 'foo', 'somewarc'])

        # Swap the index dir for a regular file of the same name.
        index_path = os.path.join(foo_dir, INDEX_DIR)
        shutil.rmtree(index_path)
        with open(index_path, 'w+b') as index_stub:
            index_stub.write(b'foo\n')

        # With the collections root gone, listing must raise.
        shutil.rmtree(colls_root)
        with raises(IOError):
            main(['list'])

        # ... and the /test/ route must respond with 404.
        missing = self.testapp.get('/test/', status=404)
        assert missing.status_int == 404
예제 #2
0
    def test_more_custom_templates(self):
        """Check that a collection-level search template renders metadata.

        A ``search.html`` is written into the ``test`` collection's templates
        dir; the served search page must contain both the template text and
        the metadata set beforehand.
        """
        template_path = os.path.join(self.root_dir, COLLECTIONS, 'test',
                                     'templates', 'search.html')

        # Store the metadata that the template dumps as JSON.
        main(['metadata', 'test', '--set', 'some=value'])

        with open(template_path, 'w+b') as out:
            out.write(b'overriden search page: '
                      b'{{ metadata | tojson }}\n')

        # Reset the jinja env cache so the new template file is loaded.
        self.app.rewriterapp.jinja_env.jinja_env.cache = {}

        page = self.testapp.get('/test/')
        page.charset = 'utf-8'
        assert page.status_int == 200
        assert page.content_type == 'text/html'

        rendered = page.text
        assert 'overriden search page: ' in rendered
        assert '"some":"value"' in rendered
예제 #3
0
    def test_more_custom_templates(self):
        """Check that a collection-level search template renders metadata.

        A ``search.html`` is written into the ``test`` collection's templates
        dir; the served search page must contain both the template text and
        the full JSON dump of the metadata set beforehand.
        """
        template_path = os.path.join(self.root_dir, COLLECTIONS, 'test',
                                     'templates', 'search.html')

        # Store the metadata that the template dumps as JSON.
        main(['metadata', 'test', '--set', 'some=value'])

        with open(template_path, 'w+b') as out:
            out.write(b'overriden search page: '
                      b'{{ metadata | tojson }}\n')

        # Reset the jinja env cache so the new template file is loaded.
        self.app.rewriterapp.jinja_env.jinja_env.cache = {}

        page = self.testapp.get('/test/')
        page.charset = 'utf-8'
        assert page.status_int == 200
        assert page.content_type == 'text/html'

        rendered = page.text
        assert 'overriden search page: ' in rendered
        # The whole JSON object is matched; the page text is shown on failure.
        assert '{"some":"value"}' in rendered, rendered
예제 #4
0
    def test_err_no_such_coll(self):
        """Adding a warc to the non-existent collection ``bar`` must raise."""
        sample = self._get_sample_warc('example.warc.gz')
        add_args = ['add', 'bar', sample]

        with raises(IOError):
            main(add_args)
예제 #5
0
    def test_err_no_such_coll(self):
        """Adding a warc to the non-existent collection ``bar`` must raise."""
        sample = self._get_sample_warc('example.warc.gz')
        add_args = ['add', 'bar', sample]

        with raises(IOError):
            main(add_args)
예제 #6
0
    def test_err_no_such_coll(self):
        """Adding a warc to the non-existent collection ``bar`` must raise."""
        sample = self._get_sample_warc("example.warc.gz")
        add_args = ["add", "bar", sample]

        with raises(IOError):
            main(add_args)
예제 #7
0
    def test_err_missing_dirs(self):
        """Exercise error handling while the collection tree is torn down.

        Removes the static, archive and index directories of ``foo``, puts a
        plain file where the index directory was, then deletes the whole
        collections root.
        """
        colls_root = os.path.join(self.root_dir, COLLECTIONS)
        foo_dir = os.path.join(colls_root, 'foo')

        # A missing static dir is ignorable, so nothing is asserted for it.
        shutil.rmtree(os.path.join(foo_dir, 'static'))

        # Without the archive dir, adding a warc must raise.
        shutil.rmtree(os.path.join(foo_dir, ARCHIVE_DIR))
        with raises(IOError):
            main(['add', 'foo', 'somewarc'])

        # Swap the index dir for a regular file of the same name.
        index_path = os.path.join(foo_dir, INDEX_DIR)
        shutil.rmtree(index_path)
        with open(index_path, 'w+b') as index_stub:
            index_stub.write(b'foo\n')

        # With the collections root gone, listing must raise.
        shutil.rmtree(colls_root)
        with raises(IOError):
            main(['list'])

        # ... and the /test/ route must respond with 404.
        missing = self.testapp.get('/test/', status=404)
        assert missing.status_int == 404
예제 #8
0
    def test_custom_config(self):
        """Check that a hand-written config.yaml overrides the auto settings.

        The config selects a custom search template and a renamed index dir
        (``cdx2``) for the ``test`` collection. The search page must show the
        custom metadata, and replay must still work from the renamed index.
        """
        coll_dir = os.path.join(self.root_dir, "collections", "test")

        with open(os.path.join(coll_dir, "config.yaml"), "w+b") as cfg:
            cfg.write(b"search_html: ./templates/custom_search.html\n")
            cfg.write(b"index_paths: ./cdx2/\n")

        # Metadata that the custom template dumps as JSON.
        main(["metadata", "test", "--set", "some=value"])

        template_path = os.path.join(coll_dir, "templates", "custom_search.html")
        with open(template_path, "w+b") as tpl:
            tpl.write(b"config.yaml overriden search page: ")
            tpl.write(b"{{ wbrequest.user_metadata | tojson }}\n")

        # Move the index dir to the location named in config.yaml.
        old_index = os.path.join(coll_dir, INDEX_DIR)
        new_index = os.path.join(coll_dir, "cdx2")
        os.rename(old_index, new_index)

        self._create_app()
        search_page = self.testapp.get("/test/")
        search_page.charset = "utf-8"
        assert search_page.status_int == 200
        assert search_page.content_type == "text/html"
        assert 'config.yaml overriden search page: {"some": "value"}' in search_page.text

        replay = self.testapp.get("/test/20140103030321/http://example.com?example=1")
        assert replay.status_int == 200
예제 #9
0
    def test_custom_config(self):
        """Test a custom config.yaml which overrides the auto settings.

        The config selects a search template relative to the
        collection-specific dir and a renamed index dir (``cdx2``). Custom
        metadata is added and its presence in the custom search page is
        checked, followed by a replay request against the renamed index.
        """
        config_path = os.path.join(self.root_dir, 'collections', 'test',
                                   'config.yaml')
        # The files are opened in binary mode, so bytes must be written
        # (writing str raises TypeError on Python 3).
        with open(config_path, 'w+b') as fh:
            fh.write(b'search_html: ./templates/custom_search.html\n')
            fh.write(b'index_paths: ./cdx2/\n')

        custom_search = os.path.join(self.root_dir, 'collections', 'test',
                                     'templates', 'custom_search.html')

        # add metadata
        main(['metadata', 'test', '--set', 'some=value'])

        with open(custom_search, 'w+b') as fh:
            fh.write(b'config.yaml overriden search page: ')
            fh.write(b'{{ wbrequest.user_metadata | tojson }}\n')

        # Move the index dir to the location named in config.yaml.
        os.rename(
            os.path.join(self.root_dir, 'collections', 'test', INDEX_DIR),
            os.path.join(self.root_dir, 'collections', 'test', 'cdx2'))

        self._create_app()
        resp = self.testapp.get('/test/')
        # resp.body is bytes; set a charset and compare the decoded text.
        resp.charset = 'utf-8'
        assert resp.status_int == 200
        assert resp.content_type == 'text/html'
        assert ('config.yaml overriden search page: {"some": "value"}'
                in resp.text)

        resp = self.testapp.get(
            '/test/20140103030321/http://example.com?example=1')
        assert resp.status_int == 200
예제 #10
0
    def test_custom_config(self):
        """Test a custom config.yaml which overrides the auto settings.

        The config selects a search template relative to the
        collection-specific dir and a renamed index dir (``cdx2``). Custom
        metadata is added and its presence in the custom search page is
        checked, followed by a replay request against the renamed index.
        """
        config_path = os.path.join(self.root_dir, 'collections', 'test', 'config.yaml')
        # The files are opened in binary mode, so bytes must be written
        # (writing str raises TypeError on Python 3).
        with open(config_path, 'w+b') as fh:
            fh.write(b'search_html: ./templates/custom_search.html\n')
            fh.write(b'index_paths: ./cdx2/\n')

        custom_search = os.path.join(self.root_dir, 'collections', 'test',
                                     'templates', 'custom_search.html')

        # add metadata
        main(['metadata', 'test', '--set', 'some=value'])

        with open(custom_search, 'w+b') as fh:
            fh.write(b'config.yaml overriden search page: ')
            fh.write(b'{{ wbrequest.user_metadata | tojson }}\n')

        # Move the index dir to the location named in config.yaml.
        os.rename(os.path.join(self.root_dir, 'collections', 'test', INDEX_DIR),
                  os.path.join(self.root_dir, 'collections', 'test', 'cdx2'))

        self._create_app()
        resp = self.testapp.get('/test/')
        # resp.body is bytes; set a charset and compare the decoded text.
        resp.charset = 'utf-8'
        assert resp.status_int == 200
        assert resp.content_type == 'text/html'
        assert 'config.yaml overriden search page: {"some": "value"}' in resp.text

        resp = self.testapp.get('/test/20140103030321/http://example.com?example=1')
        assert resp.status_int == 200
예제 #11
0
    def test_another_coll(self):
        """Initialize a second collection ``foo`` and add a sample warc to it."""
        sample = self._get_sample_warc('example.warc.gz')

        # Run the two manager commands in order: create, then add.
        for cli_args in (['init', 'foo'], ['add', 'foo', sample]):
            main(cli_args)
예제 #12
0
    def test_err_invalid_name(self):
        """Initializing a collection with an invalid name raises ValueError."""
        for bad_name in ("../abc%", "45^23"):
            with raises(ValueError):
                main(["init", bad_name])
예제 #13
0
    def test_another_coll(self):
        """Initialize a second collection ``foo`` and add a sample warc to it."""
        sample = self._get_sample_warc('example.warc.gz')

        # Run the two manager commands in order: create, then add.
        for cli_args in (['init', 'foo'], ['add', 'foo', sample]):
            main(cli_args)
예제 #14
0
    def test_err_invalid_name(self):
        """Initializing a collection with an invalid name raises ValueError."""
        for bad_name in ('../abc%', '45^23'):
            with raises(ValueError):
                main(['init', bad_name])
예제 #15
0
    def test_err_invalid_name(self):
        """Initializing a collection with an invalid name raises ValueError."""
        for bad_name in ('../abc%', '45^23'):
            with raises(ValueError):
                main(['init', bad_name])
예제 #16
0
    def test_add_warcs(self):
        """Add a sample warc to the ``test`` collection and replay a capture."""
        sample = self._get_sample_warc('example.warc.gz')
        main(['add', 'test', sample])

        # Re-create the app before requesting the capture.
        self._create_app()
        replay_url = '/test/20140103030321/http://example.com?example=1'
        replay = self.testapp.get(replay_url)
        assert replay.status_int == 200
예제 #17
0
    def test_add_warcs(self):
        """Add a sample warc to the ``test`` collection and replay a capture."""
        sample = self._get_sample_warc("example.warc.gz")
        main(["add", "test", sample])

        # Re-create the app before requesting the capture.
        self._create_app()
        replay_url = "/test/20140103030321/http://example.com?example=1"
        replay = self.testapp.get(replay_url)
        assert replay.status_int == 200
예제 #18
0
    def test_add_warcs(self):
        """Add a sample warc to the ``test`` collection and replay a capture."""
        sample = self._get_sample_warc('example.warc.gz')
        main(['add', 'test', sample])

        # Re-create the app before requesting the capture.
        self._create_app()
        replay_url = '/test/20140103030321/http://example.com?example=1'
        replay = self.testapp.get(replay_url)
        assert replay.status_int == 200
예제 #19
0
    def test_add_title_metadata_index_page(self):
        """Test adding title metadata to a collection.

        Sets a ``title`` on the ``foo`` collection and checks that it is
        shown, in parentheses, on the default index page.
        """
        main(['metadata', 'foo', '--set', 'title=Collection Title'])

        self._create_app()
        resp = self.testapp.get('/')
        assert resp.status_int == 200
        assert resp.content_type == 'text/html'
        # resp.body is bytes; set a charset and compare the decoded text so
        # the str membership test also works on Python 3.
        resp.charset = 'utf-8'
        assert '(Collection Title)' in resp.text
예제 #20
0
    def test_auto_index(self):
        """Test that AutoIndexer indexes warcs copied into a collection.

        Warcs are copied into the archive dir (and a sub-directory) of a new
        ``auto`` collection after the indexer has been started, then the
        auto-index file is checked for both entries. The index file is then
        removed and the cycle repeated to check that it is written again.
        """
        main(['init', 'auto'])
        auto_dir = os.path.join(self.root_dir, COLLECTIONS, 'auto')
        archive_dir = os.path.join(auto_dir, ARCHIVE_DIR)

        archive_sub_dir = os.path.join(archive_dir, 'sub')
        os.makedirs(archive_sub_dir)

        def do_copy():
            # Copies the sample warcs with a one-second pause before and
            # after; `indexer` is bound below, before this function runs.
            try:
                time.sleep(1.0)
                shutil.copy(self._get_sample_warc('example.warc.gz'),
                            archive_dir)
                shutil.copy(self._get_sample_warc('example-extra.warc'),
                            archive_sub_dir)
                time.sleep(1.0)
            finally:
                # NOTE(review): interval = 0 looks like the stop signal for
                # the indexer -- confirm against AutoIndexer.
                indexer.interval = 0

        indexer = AutoIndexer(interval=0.25)
        indexer.start()

        # Run the copy in a greenlet and wait for it to finish.
        ge = gevent.spawn(do_copy)
        ge.join()

        index_file = os.path.join(auto_dir, INDEX_DIR, AUTOINDEX_FILE)
        assert os.path.isfile(index_file)

        with open(index_file, 'r') as fh:
            index = fh.read()

        # Both warcs must be listed; the nested one by its relative path.
        assert '"example.warc.gz' in index, index
        assert '"sub/example-extra.warc' in index, index

        mtime = os.path.getmtime(index_file)

        # Update: restore the interval and start the indexer again
        indexer.interval = 0.25
        indexer.start()

        os.remove(index_file)

        # NOTE: an earlier variant ran do_copy in a daemon threading.Thread
        # and invoked the wayback CLI here; a greenlet is used instead.
        ge = gevent.spawn(do_copy)

        ge.join()

        # assert the file was updated (written again with a newer mtime)
        assert os.path.getmtime(index_file) > mtime
예제 #21
0
    def test_add_title_metadata_index_page(self):
        """Test adding title metadata to a collection.

        Sets a ``title`` on the ``foo`` collection and checks that it is
        shown, in parentheses, on the default index page.
        """
        main(['metadata', 'foo', '--set', 'title=Collection Title'])

        self._create_app()
        resp = self.testapp.get('/')
        assert resp.status_int == 200
        assert resp.content_type == 'text/html'
        # resp.body is bytes; set a charset and compare the decoded text so
        # the str membership test also works on Python 3.
        resp.charset = 'utf-8'
        assert '(Collection Title)' in resp.text
예제 #22
0
    def test_create_first_coll(self):
        """Initialize the ``test`` collection and verify its directory layout."""
        main(['init', 'test'])

        colls_root = os.path.join(self.root_dir, COLLECTIONS)
        coll_dir = os.path.join(colls_root, 'test')

        # The collections root is checked first, then the collection itself.
        for required in (colls_root, coll_dir):
            assert os.path.isdir(required)

        expected_subdirs = [INDEX_DIR, ARCHIVE_DIR, 'static', 'templates']
        self._check_dirs(coll_dir, expected_subdirs)
예제 #23
0
    def test_add_title_metadata_index_page(self):
        """Set a title on ``foo`` and look for it on the default index page."""
        set_title_args = ["metadata", "foo", "--set", "title=Collection Title"]
        main(set_title_args)

        self._create_app()
        index_page = self.testapp.get("/")
        assert index_page.status_int == 200
        assert index_page.content_type == "text/html"

        # Decode the body as UTF-8 before searching the text.
        index_page.charset = "utf-8"
        assert "(Collection Title)" in index_page.text
예제 #24
0
    def test_create_first_coll(self):
        """Initialize the ``test`` collection and verify its directory layout."""
        main(['init', 'test'])

        colls_root = os.path.join(self.root_dir, 'collections')
        coll_dir = os.path.join(colls_root, 'test')

        # The collections root is checked first, then the collection itself.
        for required in (colls_root, coll_dir):
            assert os.path.isdir(required)

        expected_subdirs = [INDEX_DIR, ARCHIVE_DIR, 'static', 'templates']
        self._check_dirs(coll_dir, expected_subdirs)
예제 #25
0
    def test_create_first_coll(self):
        """Initialize the ``test`` collection and verify its directory layout."""
        main(["init", "test"])

        colls_root = os.path.join(self.root_dir, "collections")
        coll_dir = os.path.join(colls_root, "test")

        # The collections root is checked first, then the collection itself.
        for required in (colls_root, coll_dir):
            assert os.path.isdir(required)

        expected_subdirs = [INDEX_DIR, ARCHIVE_DIR, "static", "templates"]
        self._check_dirs(coll_dir, expected_subdirs)
예제 #26
0
    def test_auto_index(self):
        """Test that AutoIndexer indexes warcs copied into a collection.

        Warcs are copied into the archive dir (and a sub-directory) of a new
        ``auto`` collection after the indexer has been started, then the
        auto-index file is checked for both entries. The index file is then
        removed and the cycle repeated to check that it is written again.
        """
        main(['init', 'auto'])
        auto_dir = os.path.join(self.root_dir, COLLECTIONS, 'auto')
        archive_dir = os.path.join(auto_dir, ARCHIVE_DIR)

        archive_sub_dir = os.path.join(archive_dir, 'sub')
        os.makedirs(archive_sub_dir)

        def do_copy():
            # Copies the sample warcs with a one-second pause before and
            # after; `indexer` is bound below, before this function runs.
            try:
                time.sleep(1.0)
                shutil.copy(self._get_sample_warc('example.warc.gz'), archive_dir)
                shutil.copy(self._get_sample_warc('example-extra.warc'), archive_sub_dir)
                time.sleep(1.0)
            finally:
                # NOTE(review): interval = 0 looks like the stop signal for
                # the indexer -- confirm against AutoIndexer.
                indexer.interval = 0

        indexer = AutoIndexer(interval=0.25)
        indexer.start()

        # Run the copy in a greenlet and wait for it to finish.
        ge = gevent.spawn(do_copy)
        ge.join()

        index_file = os.path.join(auto_dir, INDEX_DIR, AUTOINDEX_FILE)
        assert os.path.isfile(index_file)

        with open(index_file, 'r') as fh:
            index = fh.read()

        # Both warcs must be listed; the nested one by its relative path.
        assert '"example.warc.gz' in index, index
        assert '"sub/example-extra.warc' in index, index

        mtime = os.path.getmtime(index_file)

        # Update: restore the interval and start the indexer again
        indexer.interval = 0.25
        indexer.start()

        os.remove(index_file)

        # NOTE: an earlier variant ran do_copy in a daemon threading.Thread
        # and invoked the wayback CLI here; a greenlet is used instead.
        ge = gevent.spawn(do_copy)

        ge.join()

        # assert the file was updated (written again with a newer mtime)
        assert os.path.getmtime(index_file) > mtime
예제 #27
0
    def test_add_more_warcs(self):
        """Add two more warcs to ``test``; a missing warc file must raise."""
        samples = [self._get_sample_warc(name)
                   for name in ('iana.warc.gz', 'example-extra.warc')]
        main(['add', 'test'] + samples)

        # Drop a spurious regular file into the collections dir.
        stray_path = os.path.join(self.root_dir, COLLECTIONS, 'blah')
        with open(stray_path, 'w+b') as stray:
            stray.write(b'foo\n')

        missing_args = ['add', 'test', 'non-existent-file.warc.gz']
        with raises(IOError):
            main(missing_args)
예제 #28
0
    def test_add_more_warcs(self):
        """Add two more warcs to ``test``; a missing warc file must raise."""
        samples = [self._get_sample_warc(name)
                   for name in ('iana.warc.gz', 'example-extra.warc')]
        main(['add', 'test'] + samples)

        # Drop a spurious regular file into the collections dir.
        stray_path = os.path.join(self.root_dir, COLLECTIONS, 'blah')
        with open(stray_path, 'w+b') as stray:
            stray.write(b'foo\n')

        missing_args = ['add', 'test', 'non-existent-file.warc.gz']
        with raises(IOError):
            main(missing_args)
예제 #29
0
    def test_add_custom_nested_warcs(self):
        """Index warcs placed by hand in nested archive sub-directories.

        Two warcs are copied to ``A/`` and ``B/sub/`` under the archive dir of
        a new ``nested`` collection and indexed explicitly. The index must
        reference them by paths relative to the archive root, and both must
        replay.
        """
        main(['init', 'nested'])

        coll_dir = os.path.join(self.root_dir, 'collections', 'nested')
        archive_root = os.path.join(coll_dir, ARCHIVE_DIR)
        dir_a = os.path.join(archive_root, 'A')
        dir_b = os.path.join(archive_root, 'B', 'sub')
        for new_dir in (dir_a, dir_b):
            os.makedirs(new_dir)

        iana_warc = self._get_sample_warc('iana.warc.gz')
        example_warc = self._get_sample_warc('example.warc.gz')
        shutil.copy2(iana_warc, dir_a)
        shutil.copy2(example_warc, dir_b)

        index_args = ['index', 'nested',
                      os.path.join(dir_a, 'iana.warc.gz'),
                      os.path.join(dir_b, 'example.warc.gz')]
        main(index_args)

        cdx_path = os.path.join(coll_dir, INDEX_DIR, INDEX_FILE)
        with open(cdx_path) as cdx_file:
            cdx_text = cdx_file.read()

        # Expected index content, first for example.warc.gz, then iana.warc.gz.
        expected_parts = ('1043', '333', 'B/sub/example.warc.gz',
                          '2258', '334', 'A/iana.warc.gz')
        for part in expected_parts:
            assert part in cdx_text

        self._create_app()
        replay_urls = ('/nested/20140126200624/http://www.iana.org/',
                       '/nested/20140103030321/http://example.com?example=1')
        for url in replay_urls:
            replay = self.testapp.get(url)
            assert replay.status_int == 200
예제 #30
0
    def test_add_modify_home_template(self):
        """Add the shared home template, edit it, and check the served page."""
        main(['template', '--add', 'home_html'])

        template_path = os.path.join(self.root_dir, 'templates', 'index.html')
        assert os.path.isfile(template_path)

        # Rewrite the file in place with the heading text substituted.
        with open(template_path, 'r+b') as tpl:
            patched = tpl.read().replace(b'Pywb Wayback Machine',
                                         b'Custom Test Homepage')
            tpl.seek(0)
            tpl.write(patched)

        home = self.testapp.get('/')
        home.charset = 'utf-8'
        assert home.content_type == 'text/html'
        assert 'Custom Test Homepage' in home.text, home.text
예제 #31
0
    def test_add_modify_home_template(self):
        """Add the shared home template, modify it, and check the home page.

        The template file is edited in place by inserting a marker before the
        closing ``</html>`` tag; the marker must then appear on ``/``.
        """
        # Add shared template
        main(['template', '--add', 'home_html'])

        filename = os.path.join(self.root_dir, 'templates', 'index.html')
        assert os.path.isfile(filename)

        with open(filename, 'r+b') as fh:
            buf = fh.read()
            # The file is read as bytes, so replace() needs bytes arguments
            # (str arguments raise TypeError on Python 3).
            buf = buf.replace(b'</html>', b'Custom Test Homepage</html>')
            fh.seek(0)
            fh.write(buf)

        self._create_app()
        resp = self.testapp.get('/')
        # resp.body is bytes; set a charset and compare the decoded text.
        resp.charset = 'utf-8'
        assert resp.content_type == 'text/html'
        assert 'Custom Test Homepage</html>' in resp.text, resp.text
예제 #32
0
    def test_add_modify_home_template(self):
        """Add the shared home template, edit it, and check the served page."""
        main(['template', '--add', 'home_html'])

        template_path = os.path.join(self.root_dir, 'templates', 'index.html')
        assert os.path.isfile(template_path)

        # Rewrite the file in place with a marker before the closing tag.
        with open(template_path, 'r+b') as tpl:
            patched = tpl.read().replace(b'</html>',
                                         b'Custom Test Homepage</html>')
            tpl.seek(0)
            tpl.write(patched)

        home = self.testapp.get('/')
        home.charset = 'utf-8'
        assert home.content_type == 'text/html'
        assert 'Custom Test Homepage</html>' in home.text, home.text
예제 #33
0
    def test_add_custom_nested_warcs(self):
        """Index warcs placed by hand in nested archive sub-directories.

        Two warcs are copied to ``A/`` and ``B/sub/`` under the archive dir of
        a new ``nested`` collection and indexed explicitly. The index must
        reference them by paths relative to the archive root, and both must
        replay.
        """
        main(['init', 'nested'])

        coll_dir = os.path.join(self.root_dir, 'collections', 'nested')
        archive_root = os.path.join(coll_dir, ARCHIVE_DIR)
        dir_a = os.path.join(archive_root, 'A')
        dir_b = os.path.join(archive_root, 'B', 'sub')
        for new_dir in (dir_a, dir_b):
            os.makedirs(new_dir)

        iana_warc = self._get_sample_warc('iana.warc.gz')
        example_warc = self._get_sample_warc('example.warc.gz')
        shutil.copy2(iana_warc, dir_a)
        shutil.copy2(example_warc, dir_b)

        index_args = ['index', 'nested',
                      os.path.join(dir_a, 'iana.warc.gz'),
                      os.path.join(dir_b, 'example.warc.gz')]
        main(index_args)

        cdx_path = os.path.join(coll_dir, INDEX_DIR, INDEX_FILE)
        with open(cdx_path) as cdx_file:
            cdx_text = cdx_file.read()

        # Expected index content, first for example.warc.gz, then iana.warc.gz.
        expected_parts = ('1043', '333', 'B/sub/example.warc.gz',
                          '2258', '334', 'A/iana.warc.gz')
        for part in expected_parts:
            assert part in cdx_text

        self._create_app()
        replay_urls = ('/nested/20140126200624/http://www.iana.org/',
                       '/nested/20140103030321/http://example.com?example=1')
        for url in replay_urls:
            replay = self.testapp.get(url)
            assert replay.status_int == 200
예제 #34
0
    def test_list_colls(self):
        """Run ``list`` with stdout captured and check the printed lines."""
        captured = StringIO()
        saved_stdout = sys.stdout
        sys.stdout = captured
        try:
            main(['list'])
        finally:
            # Always restore the real stdout, even if main() raises.
            sys.stdout = saved_stdout

        lines = sorted(captured.getvalue().splitlines())
        assert len(lines) == 4
        for expected in ('Collections:', '- foo', '- nested', '- test'):
            assert expected in lines
예제 #35
0
    def test_add_modify_home_template(self):
        """Add the shared home template, edit it, and check the served page."""
        main(["template", "--add", "home_html"])

        template_path = os.path.join(self.root_dir, "templates", "index.html")
        assert os.path.isfile(template_path)

        # Rewrite the file in place with a marker before the closing tag.
        with open(template_path, "r+b") as tpl:
            patched = tpl.read().replace(b"</html>", b"Custom Test Homepage</html>")
            tpl.seek(0)
            tpl.write(patched)

        self._create_app()
        home = self.testapp.get("/")
        home.charset = "utf-8"
        assert home.content_type == "text/html"
        assert "Custom Test Homepage</html>" in home.text, home.text
예제 #36
0
    def test_list_colls(self):
        """Test collection listing, printed to stdout.

        stdout is temporarily replaced by an in-memory text buffer while the
        ``list`` command runs; the captured lines are then checked.
        """
        # A text buffer is needed: printing str into BytesIO fails on
        # Python 3, and the assertions below compare against str lines.
        try:
            from StringIO import StringIO  # Python 2: accepts native str
        except ImportError:
            from io import StringIO

        orig_stdout = sys.stdout
        buff = StringIO()
        sys.stdout = buff

        try:
            main(['list'])
        finally:
            # Always restore the real stdout, even if main() raises.
            sys.stdout = orig_stdout

        output = sorted(buff.getvalue().splitlines())
        assert len(output) == 4
        assert 'Collections:' in output
        assert '- foo' in output
        assert '- nested' in output
        assert '- test' in output
예제 #37
0
    def test_convert_cdx(self):
        """Create .cdx index files, then convert them to .cdxj.

        The conversion prompt is patched twice: a non-'y' answer must leave
        the directory listing unchanged, and 'y' must turn every file into a
        .cdxj file.
        """
        migrate_dir = os.path.join(self.root_dir, '_migrate')
        os.mkdir(migrate_dir)

        # Index with -u; the '' argument presumably resolves to the sample
        # warc directory -- TODO confirm against _get_sample_warc.
        cdxindexer_main(['-u', migrate_dir, self._get_sample_warc('')])

        # Index one file again, additionally passing -9.
        single_warc = self._get_sample_warc('example.warc.gz')
        cdxindexer_main(['-u', '-9', migrate_dir, single_warc])

        before = os.listdir(migrate_dir)
        assert all(name.endswith('.cdx') for name in before)

        prompt_target = 'pywb.manager.manager.get_input'

        # Answer 'blah' at the prompt: the listing must not change.
        with patch(prompt_target, lambda x: 'blah'):
            main(['cdx-convert', migrate_dir])
        assert os.listdir(migrate_dir) == before

        # Answer 'y' at the prompt: every file becomes a .cdxj.
        with patch(prompt_target, lambda x: 'y'):
            main(['cdx-convert', migrate_dir])
        after = os.listdir(migrate_dir)

        assert len(before) == len(after)
        assert all(name.endswith('.cdxj') for name in after)

        # Parse the first line of the converted iana index.
        with open(os.path.join(migrate_dir, 'iana.cdxj'), 'rb') as fh:
            first = CDXObject(fh.readline())
            assert first['urlkey'] == 'org,iana)/'
            assert first['timestamp'] == '20140126200624'
            assert first['url'] == 'http://www.iana.org/'

        # Nothing else to migrate
        main(['cdx-convert', migrate_dir])
예제 #38
0
    def test_add_more_warcs(self):
        """Test adding additional warcs, check replay of added content.

        Also drops a spurious regular file into the collections dir and
        checks that adding a non-existent warc raises IOError.
        """
        warc1 = self._get_sample_warc('iana.warc.gz')
        warc2 = self._get_sample_warc('example-extra.warc')

        main(['add', 'test', warc1, warc2])

        # Spurious file in collections
        with open(os.path.join(self.root_dir, 'collections', 'blah'), 'w+b') as fh:
            # The file is opened in binary mode, so bytes must be written
            # (writing str raises TypeError on Python 3).
            fh.write(b'foo\n')

        with raises(IOError):
            main(['add', 'test', 'non-existent-file.warc.gz'])

        # check new cdx
        self._create_app()
        resp = self.testapp.get('/test/20140126200624/http://www.iana.org/')
        assert resp.status_int == 200
예제 #39
0
    def test_other_metadata_search_page(self):
        """Set extra metadata on ``foo`` and look for it on its search page."""
        set_args = ["metadata", "foo", "--set", "desc=Some Description Text", "other=custom value"]
        main(set_args)

        # A bare name without "=value" must raise.
        with raises(ValueError):
            main(["metadata", "foo", "--set", "name_only"])

        self._create_app()
        page = self.testapp.get("/foo/")
        page.charset = "utf-8"
        assert page.status_int == 200
        assert page.content_type == "text/html"

        # The title, then each metadata key and value, must be on the page.
        expected_parts = ("Collection Title", "desc", "Some Description Text", "other", "custom value")
        for part in expected_parts:
            assert part in page.text
예제 #40
0
    def test_add_more_warcs(self):
        """Add two more warcs to ``test`` and replay a capture from them.

        A spurious file is also placed in the collections dir, and adding a
        non-existent warc must raise.
        """
        samples = [self._get_sample_warc(name) for name in ("iana.warc.gz", "example-extra.warc")]
        main(["add", "test"] + samples)

        # Drop a spurious regular file into the collections dir.
        stray_path = os.path.join(self.root_dir, "collections", "blah")
        with open(stray_path, "w+b") as stray:
            stray.write(b"foo\n")

        missing_args = ["add", "test", "non-existent-file.warc.gz"]
        with raises(IOError):
            main(missing_args)

        # Re-create the app and replay a capture from the added iana warc.
        self._create_app()
        replay = self.testapp.get("/test/20140126200624/http://www.iana.org/")
        assert replay.status_int == 200
예제 #41
0
    def test_add_more_warcs(self):
        """Add two more warcs to ``test`` and replay a capture from them.

        A spurious file is also placed in the collections dir, and adding a
        non-existent warc must raise.
        """
        samples = [self._get_sample_warc(name)
                   for name in ('iana.warc.gz', 'example-extra.warc')]
        main(['add', 'test'] + samples)

        # Drop a spurious regular file into the collections dir.
        stray_path = os.path.join(self.root_dir, 'collections', 'blah')
        with open(stray_path, 'w+b') as stray:
            stray.write(b'foo\n')

        missing_args = ['add', 'test', 'non-existent-file.warc.gz']
        with raises(IOError):
            main(missing_args)

        # Re-create the app and replay a capture from the added iana warc.
        self._create_app()
        replay = self.testapp.get('/test/20140126200624/http://www.iana.org/')
        assert replay.status_int == 200
예제 #42
0
    def test_other_metadata_search_page(self):
        """Set extra metadata on ``foo`` and look for it on its search page."""
        set_args = ['metadata', 'foo', '--set',
                    'desc=Some Description Text', 'other=custom value']
        main(set_args)

        # A bare name without '=value' must raise.
        with raises(ValueError):
            main(['metadata', 'foo', '--set', 'name_only'])

        page = self.testapp.get('/foo/')
        page.charset = 'utf-8'
        assert page.status_int == 200
        assert page.content_type == 'text/html'

        # The title, then each metadata key and value, must be on the page.
        expected_parts = ('Collection Title', 'desc', 'Some Description Text',
                          'other', 'custom value')
        for part in expected_parts:
            assert part in page.text
예제 #43
0
    def test_other_metadata_search_page(self):
        """Set extra metadata on ``foo`` and look for it on its search page."""
        set_args = ['metadata', 'foo', '--set',
                    'desc=Some Description Text', 'other=custom value']
        main(set_args)

        # A bare name without '=value' must raise.
        with raises(ValueError):
            main(['metadata', 'foo', '--set', 'name_only'])

        page = self.testapp.get('/foo/')
        page.charset = 'utf-8'
        assert page.status_int == 200
        assert page.content_type == 'text/html'

        # The title, then each metadata key and value, must be on the page.
        expected_parts = ('Collection Title', 'desc', 'Some Description Text',
                          'other', 'custom value')
        for part in expected_parts:
            assert part in page.text
예제 #44
0
    def test_convert_cdx(self):
        """Create .cdx index files, then convert them to .cdxj.

        The conversion prompt is patched twice: a non-"y" answer must leave
        the directory listing unchanged, and "y" must turn every file into a
        .cdxj file.
        """
        migrate_dir = os.path.join(self.root_dir, "_migrate")
        os.mkdir(migrate_dir)

        # Index with -u; the "" argument presumably resolves to the sample
        # warc directory -- TODO confirm against _get_sample_warc.
        cdxindexer_main(["-u", migrate_dir, self._get_sample_warc("")])

        # Index one file again, additionally passing -9.
        single_warc = self._get_sample_warc("example.warc.gz")
        cdxindexer_main(["-u", "-9", migrate_dir, single_warc])

        before = os.listdir(migrate_dir)
        assert all(name.endswith(".cdx") for name in before)

        prompt_target = "pywb.manager.manager.get_input"

        # Answer "blah" at the prompt: the listing must not change.
        with patch(prompt_target, lambda x: "blah"):
            main(["cdx-convert", migrate_dir])
        assert os.listdir(migrate_dir) == before

        # Answer "y" at the prompt: every file becomes a .cdxj.
        with patch(prompt_target, lambda x: "y"):
            main(["cdx-convert", migrate_dir])
        after = os.listdir(migrate_dir)

        assert len(before) == len(after)
        assert all(name.endswith(".cdxj") for name in after)

        # Parse the first line of the converted iana index.
        with open(os.path.join(migrate_dir, "iana.cdxj"), "rb") as fh:
            first = CDXObject(fh.readline())
            assert first["urlkey"] == "org,iana)/"
            assert first["timestamp"] == "20140126200624"
            assert first["url"] == "http://www.iana.org/"

        # Nothing else to migrate
        main(["cdx-convert", migrate_dir])
예제 #45
0
    def test_add_custom_nested_warcs(self):
        """Index WARCs placed in a hand-made nested layout and replay them.

        Builds warcs/A/ and warcs/B/sub/ inside a new 'nested' collection,
        indexes one sample WARC from each, checks that the index refers to
        them by path relative to the archive dir, then requests both
        captures through the test app.
        """
        main(["init", "nested"])

        coll_root = os.path.join(self.root_dir, "collections", "nested")
        archive_root = os.path.join(coll_root, ARCHIVE_DIR)
        dir_a = os.path.join(archive_root, "A")
        dir_b = os.path.join(archive_root, "B", "sub")

        for new_dir in (dir_a, dir_b):
            os.makedirs(new_dir)

        iana_src = self._get_sample_warc("iana.warc.gz")
        example_src = self._get_sample_warc("example.warc.gz")

        shutil.copy2(iana_src, dir_a)
        shutil.copy2(example_src, dir_b)

        index_args = [
            "index",
            "nested",
            os.path.join(dir_a, "iana.warc.gz"),
            os.path.join(dir_b, "example.warc.gz"),
        ]
        main(index_args)

        index_path = os.path.join(coll_root, INDEX_DIR, INDEX_FILE)
        with open(index_path) as index_fh:
            index_text = index_fh.read()

        # Values expected for the example.warc.gz record, then for iana.warc.gz
        expected_fragments = (
            "1043", "333", "B/sub/example.warc.gz",
            "2258", "334", "A/iana.warc.gz",
        )
        for fragment in expected_fragments:
            assert fragment in index_text

        # Rebuild the app, then replay one capture from each nested WARC
        self._create_app()
        replay_urls = (
            "/nested/20140126200624/http://www.iana.org/",
            "/nested/20140103030321/http://example.com?example=1",
        )
        for replay_url in replay_urls:
            resp = self.testapp.get(replay_url)
            assert resp.status_int == 200
예제 #46
0
    def test_convert_cdx(self):
        """Create non-SURT .cdx indexes, then convert them to .cdxj.

        Runs ``cdx-convert`` three times: once with the prompt answered
        'blah' (the directory listing must be unchanged), once answered
        'y' (every index must end in .cdxj), and once more with nothing
        left to migrate.
        """
        migrate_dir = os.path.join(self.root_dir, '_migrate')
        os.mkdir(migrate_dir)

        # NOTE(review): an empty name presumably resolves to the sample
        # warc directory itself, so every sample gets indexed -- confirm.
        cdxindexer_main(['-u', migrate_dir, self._get_sample_warc('')])

        # try one file with -9
        cdxindexer_main(['-u', '-9', migrate_dir, self._get_sample_warc('example.warc.gz')])

        # os.listdir() returns entries in arbitrary order, so listings are
        # sorted before being compared with each other.
        cdxs = sorted(os.listdir(migrate_dir))
        assert all(x.endswith('.cdx') for x in cdxs)

        # Prompt answered with 'blah': the directory must stay untouched.
        with patch('pywb.manager.manager.get_input', lambda x: 'blah'):
            main(['cdx-convert', migrate_dir])

        assert sorted(os.listdir(migrate_dir)) == cdxs

        # Prompt answered with 'y': the conversion goes ahead.
        with patch('pywb.manager.manager.get_input', lambda x: 'y'):
            main(['cdx-convert', migrate_dir])

        cdxjs = os.listdir(migrate_dir)

        assert len(cdxs) == len(cdxjs)
        assert all(x.endswith('.cdxj') for x in cdxjs)

        # Spot-check the first record of the converted iana index.
        with open(os.path.join(migrate_dir, 'iana.cdxj'), 'rb') as fh:
            cdx = CDXObject(fh.readline())
            assert cdx['urlkey'] == 'org,iana)/'
            assert cdx['timestamp'] == '20140126200624'
            assert cdx['url'] == 'http://www.iana.org/'

        # Nothing else to migrate
        main(['cdx-convert', migrate_dir])
예제 #47
0
    def test_err_missing_dirs(self):
        """Exercise failures caused by missing pieces of the directory layout.

        Successively removes the static dir, the archive dir and the index
        dir of collection 'foo', replaces the index dir with a plain file,
        and finally removes the whole collections root, checking the
        outcome after each step.
        """
        colls = os.path.join(self.root_dir, "collections")
        foo_dir = os.path.join(colls, "foo")

        # Missing static dir is tolerated: the app is still created
        static_dir = os.path.join(foo_dir, "static")
        shutil.rmtree(static_dir)
        self._create_app()

        # Missing archive dir: adding a warc fails
        shutil.rmtree(os.path.join(foo_dir, ARCHIVE_DIR))

        with raises(IOError):
            main(["add", "foo", "somewarc"])

        # Missing index dir: the app cannot be created
        index_dir = os.path.join(foo_dir, INDEX_DIR)
        shutil.rmtree(index_dir)

        with raises(Exception):
            self._create_app()

        # Index path is a regular file instead of a directory
        with open(index_dir, "w+b") as index_fh:
            index_fh.write(b"foo\n")

        with raises(Exception):
            self._create_app()

        # Remove the collections root altogether
        shutil.rmtree(colls)

        # Listing collections now fails
        with raises(IOError):
            main(["list"])

        # The app is created again, but the collection is not found
        self._create_app()
        resp = self.testapp.get("/test/", status=404)
        assert resp.status_int == 404
예제 #48
0
    def test_merge_vs_reindex_equality(self):
        """Check that a full reindex reproduces the existing index exactly.

        The current index of collection 'test' is copied aside, the
        collection is reindexed, and both files are compared by line count
        and then by full content.
        """
        index_dir = os.path.join(self.root_dir, COLLECTIONS, 'test', INDEX_DIR)
        live_index = os.path.join(index_dir, INDEX_FILE)
        saved_index = os.path.join(index_dir, 'index.bak')

        # Keep a copy of the index as it was before the reindex
        shutil.copy(live_index, saved_index)

        main(['reindex', 'test'])

        with open(live_index) as live_fh:
            after_reindex = live_fh.read()

        with open(saved_index) as saved_fh:
            before_reindex = saved_fh.read()

        before_lines = before_reindex.splitlines()
        after_lines = after_reindex.splitlines()

        assert len(before_lines) == len(after_lines)
        assert after_reindex == before_reindex
예제 #49
0
    def test_merge_vs_reindex_equality(self):
        """Check that a full reindex reproduces the existing index exactly.

        The current index of collection 'test' is copied aside, the
        collection is reindexed, and both files are compared by line count
        and then by full content.
        """
        def slurp(path):
            # Read a whole index file as text
            with open(path) as handle:
                return handle.read()

        index_dir = os.path.join(self.root_dir, COLLECTIONS, 'test', INDEX_DIR)
        current = os.path.join(index_dir, INDEX_FILE)
        backup = os.path.join(index_dir, 'index.bak')

        # Snapshot the index before it is rebuilt
        shutil.copy(current, backup)

        main(['reindex', 'test'])

        rebuilt_text = slurp(current)
        snapshot_text = slurp(backup)

        assert len(snapshot_text.splitlines()) == len(rebuilt_text.splitlines())
        assert rebuilt_text == snapshot_text
예제 #50
0
    def test_add_custom_nested_warcs(self):
        """Index WARCs placed in a hand-made nested layout.

        Builds warcs/A/ and warcs/B/sub/ inside a new 'nested' collection,
        indexes one sample WARC from each, and checks that the index refers
        to them by path relative to the archive dir.
        """
        main(['init', 'nested'])

        coll_root = os.path.join(self.root_dir, COLLECTIONS, 'nested')
        archive_root = os.path.join(coll_root, ARCHIVE_DIR)
        dir_a = os.path.join(archive_root, 'A')
        dir_b = os.path.join(archive_root, 'B', 'sub')

        for new_dir in (dir_a, dir_b):
            os.makedirs(new_dir)

        iana_src = self._get_sample_warc('iana.warc.gz')
        example_src = self._get_sample_warc('example.warc.gz')

        shutil.copy2(iana_src, dir_a)
        shutil.copy2(example_src, dir_b)

        iana_copy = os.path.join(dir_a, 'iana.warc.gz')
        example_copy = os.path.join(dir_b, 'example.warc.gz')
        main(['index', 'nested', iana_copy, example_copy])

        index_path = os.path.join(coll_root, INDEX_DIR, INDEX_FILE)
        with open(index_path) as index_fh:
            index_text = index_fh.read()

        # Values expected for the example.warc.gz record, then for iana.warc.gz
        expected_fragments = (
            '1043', '333', 'B/sub/example.warc.gz',
            '2258', '334', 'A/iana.warc.gz',
        )
        for fragment in expected_fragments:
            assert fragment in index_text
예제 #51
0
    def test_auto_index(self):
        """Run ``autoindex`` while a second thread copies WARCs into place.

        A helper thread sleeps, copies two sample WARCs (one into a
        sub-directory), sleeps again and then clears
        ``pywb.manager.autoindex.keep_running``. The test checks that the
        auto-index file is written and mentions both WARCs, then removes it
        and repeats the run for the 'auto' collection only, expecting a
        newer file.
        """
        main(['init', 'auto'])
        coll_dir = os.path.join(self.root_dir, 'collections', 'auto')
        warcs_dir = os.path.join(coll_dir, ARCHIVE_DIR)

        warcs_sub_dir = os.path.join(warcs_dir, 'sub')
        os.makedirs(warcs_sub_dir)

        def copy_then_stop():
            # Runs in the worker thread; always clears the flag on exit
            try:
                time.sleep(1)
                copies = (
                    ('example.warc.gz', warcs_dir),
                    ('example-extra.warc', warcs_sub_dir),
                )
                for sample_name, destination in copies:
                    shutil.copy(self._get_sample_warc(sample_name), destination)
                time.sleep(1)
            finally:
                pywb.manager.autoindex.keep_running = False

        def run_with_copier(argv):
            # Start the copier thread, run the command, wait for the thread
            worker = threading.Thread(target=copy_then_stop)
            worker.daemon = True
            worker.start()

            main(argv)

            worker.join()

        # First run: all collections
        pywb.manager.autoindex.keep_running = True
        run_with_copier(['autoindex'])

        index_path = os.path.join(coll_dir, INDEX_DIR, AUTOINDEX_FILE)
        assert os.path.isfile(index_path)

        with open(index_path) as index_fh:
            index_text = index_fh.read()

        assert '"example.warc.gz' in index_text
        assert '"sub/example-extra.warc' in index_text, index_text

        first_mtime = os.path.getmtime(index_path)

        # Second run: only the 'auto' collection, starting without an index
        pywb.manager.autoindex.keep_running = True

        os.remove(index_path)

        run_with_copier(['autoindex', 'auto'])

        # The index file must have been written again, later than before
        assert os.path.getmtime(index_path) > first_mtime
예제 #52
0
    def test_add_default_coll_templates(self):
        """List templates for 'foo', add the query template, then force-add it.

        After the first add, templates/query.html must exist inside the
        collection; the second add passes -f to overwrite it.
        """
        base_cmd = ['template', 'foo']

        # Listing only has to run without raising
        main(base_cmd + ['--list'])

        # Adding the collection template creates the file
        main(base_cmd + ['--add', 'query_html'])
        query_template = os.path.join(
            self.root_dir, COLLECTIONS, 'foo', 'templates', 'query.html')
        assert os.path.isfile(query_template)

        # Overwrite the existing file with -f
        main(base_cmd + ['--add', 'query_html', '-f'])
예제 #53
0
    def test_err_wrong_warcs(self):
        """Check that ``index`` rejects a missing or unusable WARC argument."""
        outside_warc = self._get_sample_warc('example.warc.gz')
        missing_warc = os.path.join(
            self.root_dir, COLLECTIONS, 'test', ARCHIVE_DIR, 'invalid.warc.gz')

        # No warc given at all: argparse exits
        with raises(SystemExit):
            main(['index', 'test'])

        # First a path outside the collection, then one that does not exist
        for bad_path in (outside_warc, missing_warc):
            with raises(IOError):
                main(['index', 'test', bad_path])
예제 #54
0
    def test_err_template_remove(self):
        """Check the errors raised by ``template --remove``.

        Three cases, in order: an unknown template name, a collection
        template requested without a collection, and a template whose
        file has already been removed.
        """
        failing_calls = (
            # no such template
            (KeyError, ['template', 'foo', '--remove', 'blah_html']),
            # collection needed
            (IOError, ['template', '--remove', 'query_html']),
            # already removed
            (IOError, ['template', 'foo', '--remove', 'query_html']),
        )

        for expected_error, argv in failing_calls:
            with raises(expected_error):
                main(argv)
예제 #55
0
 def test_add_template_input_yes(self):
     """ Add the query_html template to 'foo', answering 'yes' to overwrite

     NOTE(review): the patch that supplies the 'yes' answer is not part
     of this snippet -- confirm against the full test.
     """
     add_args = ['template', 'foo', '--add', 'query_html']
     main(add_args)
예제 #56
0
    def test_add_warcs(self):
        """ Add the sample example.warc.gz to the 'test' collection
        """
        sample_path = self._get_sample_warc('example.warc.gz')
        add_args = ['add', 'test', sample_path]

        main(add_args)
예제 #57
0
 def test_add_template_input_other(self):
     """ Adding query_html to 'foo' must raise IOError when the overwrite
     prompt gets an 'other' answer

     NOTE(review): the patch that supplies the answer is not part of
     this snippet -- confirm against the full test.
     """
     add_args = ['template', 'foo', '--add', 'query_html']
     with raises(IOError):
         main(add_args)
예제 #58
0
 def do_migrate_yes():
     # Run the cdx conversion command on the enclosing scope's migrate_dir
     convert_args = ['convert-cdx', migrate_dir]
     main(convert_args)
예제 #59
0
 def test_remove_not_confirm(self):
     """ Removing query_html from 'foo' must raise IOError when the
     removal is answered with 'no'

     NOTE(review): the patch that supplies the answer is not part of
     this snippet -- confirm against the full test.
     """
     remove_args = ['template', 'foo', '--remove', 'query_html']

     # not confirmed, so nothing is removed
     with raises(IOError):
         main(remove_args)
예제 #60
0
 def test_remove_confirm(self):
     """ Remove the query_html template from 'foo' with the removal confirmed

     NOTE(review): the patch that supplies the confirmation is not part
     of this snippet -- confirm against the full test.
     """
     remove_args = ['template', 'foo', '--remove', 'query_html']
     main(remove_args)