def test_non_html_ignored(self, workdir):
    # Files that are neither .html nor .xhtml are ignored by the cleaner.
    source = str(workdir / "src" / "sample.txt")
    cleaner = HTMLCleaner()
    resultpath, _metadata = cleaner.process(source, {'error': False})
    # The path handed back is exactly the path that was passed in.
    assert resultpath == source
def test_option_fix_head_nums_false(self, samples_dir, workdir):
    # Make sure we respect the `fix_head_nums` option if false: with
    # 'html-cleaner-fix-head-nums' set to 'False' the heading-number span
    # must not show up in the processed document.
    samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
    proc = HTMLCleaner(options={'html-cleaner-fix-head-nums': 'False'})
    resultpath, metadata = proc.process(
        str(workdir / "src" / "sample.html"), {'error': False})
    # Read through a context manager so the handle is closed right away
    # instead of lingering until garbage collection.
    with codecs.open(resultpath, 'r', 'utf-8') as fd:
        contents = fd.read()
    assert u'<span class="u-o-headnum">1</span>Häding1' not in contents
def test_option_invalid(self):
    # Make sure we complain when trash is set for any of these options:
    # each one must raise ArgumentParserError for the junk value 'foo'.
    # NOTE(review): other tests in this file spell the last key
    # 'html-cleaner-fix-sd-fields' -- confirm which spelling the
    # processor actually registers.
    option_names = (
        'html-cleaner-fix-head-nums',
        'html-cleaner-fix-img-links',
        'html-cleaner-fix-sdfields',
    )
    for option_name in option_names:
        with pytest.raises(ArgumentParserError):
            HTMLCleaner(options={option_name: 'foo'})
def test_option_fix_sdfields_true(self, samples_dir, workdir):
    # Make sure we respect the 'html-cleaner-fix-sd-fields' option if
    # true: with it set to '1' no `<sdfield type="PAGE">` tag may remain.
    samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
    proc = HTMLCleaner(options={'html-cleaner-fix-sd-fields': '1'})
    resultpath, metadata = proc.process(
        str(workdir / "src" / "sample.html"), {'error': False})
    # Close the result file deterministically instead of leaking the handle.
    with codecs.open(resultpath, 'r', 'utf-8') as fd:
        contents = fd.read()
    assert u'<sdfield type="PAGE">' not in contents
def test_rename_img_files_src_is_dir(self, workdir):
    # We cope with rename sources that are in fact directories: no file
    # named after the mapping target may appear afterwards.
    base_dir = str(workdir)
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(base_dir, {'src': 'sample.jpg'})
    assert 'sample.jpg' not in os.listdir(base_dir)
def test_non_html_ignored(self, workdir):
    # Anything that is not an .html/.xhtml file is left alone.
    txt_path = str(workdir / "src" / "sample.txt")
    cleaner = HTMLCleaner()
    result = cleaner.process(txt_path, {'error': False})
    resultpath, _metadata = result
    # process() must return the unchanged input path.
    assert resultpath == txt_path
def test_rename_img_files_no_src(self, samples_dir, workdir):
    # We cope with mappings whose source file does not exist: the target
    # name must not be created.
    src_dir = workdir / "src"
    gif_name = "image_sample_html_m20918026.gif"
    samples_dir.join(gif_name).copy(src_dir / gif_name)
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(
        str(src_dir), {'not-existing-filename': 'sample_1.gif'})
    assert 'sample_1.gif' not in os.listdir(str(src_dir))
def test_non_html_ignored(self):
    # Non .html/.xhtml files are ignored
    proc = HTMLCleaner()
    sample_path = os.path.join(self.workdir, 'sample.txt')
    # Write through a context manager: the data is flushed and the handle
    # closed before the processor gets to see the file.
    with open(sample_path, 'w') as fd:
        fd.write('Sample file.')
    self.resultpath, metadata = proc.process(
        sample_path, {'error': False})
    # The returned path is the very path we passed in.
    assert self.resultpath == sample_path
def test_rename_img_files(self):
    # Renaming works: the new name shows up and the old one is gone.
    old_name = 'image_sample_html_m20918026.gif'
    new_name = 'sample_1.gif'
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(self.workdir2, {old_name: new_name})
    remaining = os.listdir(self.workdir2)
    assert new_name in remaining
    assert old_name not in remaining
def test_option_fix_sdfields_true(self):
    # Make sure we respect the 'html-cleaner-fix-sd-fields' option if
    # true: with it set to '1' no `<sdfield type="PAGE">` tag may remain.
    proc = HTMLCleaner(options={'html-cleaner-fix-sd-fields': '1'})
    self.resultpath, metadata = proc.process(
        self.sample_path, {'error': False})
    # Close the result file deterministically instead of leaking the handle.
    with open(self.resultpath, 'rb') as fd:
        contents = fd.read()
    # The file is read in binary mode, so compare against a bytes literal;
    # a text literal would raise TypeError on Python 3 and `b''` is
    # identical to a plain string on Python 2.
    snippet = b'<sdfield type="PAGE">'
    assert snippet not in contents
def test_rename_img_files_no_src(self):
    # We cope with mappings whose source file does not exist: the target
    # name must not be created.
    mapping = {'not-existing-filename': 'sample_1.gif'}
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(self.workdir2, mapping)
    assert 'sample_1.gif' not in os.listdir(self.workdir2)
def test_option_fix_head_nums_false(self, samples_dir, workdir):
    # Make sure we respect the `fix_head_nums` option if false: with
    # 'html-cleaner-fix-head-nums' set to 'False' the heading-number span
    # must not show up in the processed document.
    samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
    proc = HTMLCleaner(options={'html-cleaner-fix-head-nums': 'False'})
    resultpath, metadata = proc.process(
        str(workdir / "src" / "sample.html"), {'error': False})
    # Use a context manager so the file handle does not leak.
    with codecs.open(resultpath, 'r', 'utf-8') as fd:
        contents = fd.read()
    assert u'<span class="u-o-headnum">1</span>Häding1' not in contents
def test_option_fix_sdfields_true(self, samples_dir, workdir):
    # Make sure we respect the 'html-cleaner-fix-sd-fields' option if
    # true: with it set to '1' no `<sdfield type="PAGE">` tag may remain.
    samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
    proc = HTMLCleaner(options={'html-cleaner-fix-sd-fields': '1'})
    resultpath, metadata = proc.process(
        str(workdir / "src" / "sample.html"), {'error': False})
    # Use a context manager so the file handle does not leak.
    with codecs.open(resultpath, 'r', 'utf-8') as fd:
        contents = fd.read()
    assert u'<sdfield type="PAGE">' not in contents
def test_cleaner(self, workdir, samples_dir):
    # Make sure erroneous headings are fixed by default: a cleaner built
    # without options must emit the heading-number spans checked below.
    samples_dir.join("sample3.html").copy(workdir / "src" / "sample.html")
    proc = HTMLCleaner()
    resultpath, metadata = proc.process(
        str(workdir / "src" / "sample.html"), {'error': False})
    # Use a context manager so the file handle does not leak.
    with codecs.open(resultpath, 'r', 'utf-8') as fd:
        contents = fd.read()
    # These membership checks used to be bare expressions whose result was
    # thrown away, so the test could never fail. Assert them for real.
    assert u'<span class="u-o-headnum">1</span>Häding1' in contents
    assert u'<span class="u-o-headnum">1.1</span>Heading1.1' in contents
    assert u'<span class="u-o-headnum">1.2.</span>Heading1.2.' in contents
def test_rename_img_files_src_is_dir(self):
    # We cope with rename sources that are in fact directories: no file
    # named after the mapping target may appear afterwards.
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    dir_path = os.path.join(self.workdir2, 'some_dir')
    os.mkdir(dir_path)
    cleaner.rename_img_files(self.workdir2, {'some_dir': 'sample.jpg'})
    assert 'sample.jpg' not in os.listdir(self.workdir2)
def test_rename_img_files_dst_exists_already(self):
    # We cope with destinations that already exist: mapping a file onto
    # its own name must leave it in place.
    gif_name = 'image_sample_html_m20918026.gif'
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(self.workdir2, {gif_name: gif_name})
    assert gif_name in os.listdir(self.workdir2)
def test_rename_img_files_dst_exists_already(self, samples_dir, workdir):
    # We cope with destinations that already exist: mapping a file onto
    # its own name must leave it in place.
    gif_name = 'image_sample_html_m20918026.gif'
    src_dir = workdir / "src"
    samples_dir.join(gif_name).copy(src_dir / gif_name)
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(str(src_dir), {gif_name: gif_name})
    assert gif_name in os.listdir(str(src_dir))
def test_rename_img_files(self, samples_dir, workdir):
    # We can rename image files: the new name shows up, the old one is gone.
    old_name = 'image_sample_html_m20918026.gif'
    new_name = 'sample_1.gif'
    src_dir = workdir / "src"
    samples_dir.join(old_name).copy(src_dir / old_name)
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(str(src_dir), {old_name: new_name})
    remaining = os.listdir(str(src_dir))
    assert new_name in remaining
    assert old_name not in remaining
def test_rename_img_files_no_src(self, samples_dir, workdir):
    # A mapping that names a missing source file must not produce the
    # destination file.
    image = "image_sample_html_m20918026.gif"
    target_dir = workdir / "src"
    samples_dir.join(image).copy(target_dir / image)
    mapping = {'not-existing-filename': 'sample_1.gif'}
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(str(target_dir), mapping)
    entries = os.listdir(str(target_dir))
    assert 'sample_1.gif' not in entries
def test_option_fix_head_nums_false(self):
    # Make sure we respect the `fix_head_nums` option if false: with
    # 'html-cleaner-fix-head-nums' set to 'False' the numbered heading
    # markup must not show up in the processed document.
    proc = HTMLCleaner(options={'html-cleaner-fix-head-nums': 'False'})
    self.resultpath, metadata = proc.process(
        self.sample_path, {'error': False})
    # Close the result file deterministically instead of leaking the handle.
    with open(self.resultpath, 'rb') as fd:
        contents = fd.read()
    # NOTE(review): a text literal is compared against data read in binary
    # mode; that works on Python 2 but raises TypeError on Python 3 --
    # presumably this variant targets Python 2, confirm before porting.
    snippet1 = (
        '<h1 class="foo"><span class="u-o-headnum">1</span>Häding1</h1>')
    assert snippet1 not in contents
def test_rename_img_files(self, samples_dir, workdir):
    # Image files can be renamed: afterwards only the new name exists.
    source_name = 'image_sample_html_m20918026.gif'
    target_dir = workdir / "src"
    samples_dir.join(source_name).copy(target_dir / source_name)
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(
        str(target_dir), {source_name: 'sample_1.gif'})
    entries = os.listdir(str(target_dir))
    assert 'sample_1.gif' in entries
    assert source_name not in entries
def test_option_fix_img_links_true(self):
    # Make sure we respect the `fix_img_links` option if true: the old
    # image reference must vanish from the markup and the image file must
    # be present under its new name only.
    proc = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    self.resultpath, metadata = proc.process(
        self.img_sample_path, {'error': False})
    # Close the result file deterministically instead of leaking the handle.
    with open(self.resultpath, 'rb') as fd:
        contents = fd.read()
    resultdir = os.path.dirname(self.resultpath)
    # The file is read in binary mode, so compare against a bytes literal;
    # a text literal would raise TypeError on Python 3 and `b''` is
    # identical to a plain string on Python 2.
    snippet = b'<IMG SRC="image_sample_html_m20918026.gif"'
    list_dir = os.listdir(resultdir)
    assert snippet not in contents
    assert 'image_sample_html_m20918026.gif' not in list_dir
    assert 'sample_1.gif' in list_dir
def test_rename_img_files_dst_exists_already(self, samples_dir, workdir):
    # A destination that exists already (here: identical to the source)
    # is tolerated and the file stays where it is.
    image = 'image_sample_html_m20918026.gif'
    target_dir = workdir / "src"
    samples_dir.join(image).copy(target_dir / image)
    mapping = {image: image}
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(str(target_dir), mapping)
    entries = os.listdir(str(target_dir))
    assert image in entries
def test_option_fix_img_links_true(self, samples_dir, workdir):
    # Make sure we respect the `fix_img_links` option if true: the old
    # image reference must vanish from the markup and the image file must
    # be present under its new name only.
    samples_dir.join("image_sample.html").copy(workdir / "src" / "sample.html")
    samples_dir.join("image_sample_html_m20918026.gif").copy(
        workdir / "src" / "image_sample_html_m20918026.gif")
    proc = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    resultpath, metadata = proc.process(
        str(workdir / "src" / "sample.html"), {'error': False})
    # Close the result file deterministically instead of leaking the handle.
    with open(resultpath, 'r') as fd:
        contents = fd.read()
    resultdir = os.path.dirname(resultpath)
    snippet = '<IMG SRC="image_sample_html_m20918026.gif"'
    list_dir = os.listdir(resultdir)
    assert snippet not in contents
    assert 'image_sample_html_m20918026.gif' not in list_dir
    assert 'sample_1.gif' in list_dir
def test_cleaner(self):
    # Make sure erroneous headings are fixed by default: a cleaner built
    # without options must emit the heading-number spans checked below.
    proc = HTMLCleaner()
    self.resultpath, metadata = proc.process(
        self.sample_path, {'error': False})
    # Close the result file deterministically instead of leaking the handle.
    with open(self.resultpath, 'rb') as fd:
        contents = fd.read()
    # NOTE(review): text literals are compared against data read in binary
    # mode; that works on Python 2 but raises TypeError on Python 3 --
    # presumably this variant targets Python 2, confirm before porting.
    snippet1 = '<span class="u-o-headnum">1</span>Häding1'
    snippet2 = '<span class="u-o-headnum">1.1</span>Heading1.1'
    snippet3 = '<span class="u-o-headnum">1.2.</span>Heading1.2.'
    assert snippet1 in contents
    assert snippet2 in contents
    assert snippet3 in contents
def test_option_fix_img_links_true(self, samples_dir, workdir):
    # Make sure we respect the `fix_img_links` option if true: the old
    # image reference must vanish from the markup and the image file must
    # be present under its new name only.
    samples_dir.join("image_sample.html").copy(
        workdir / "src" / "sample.html")
    samples_dir.join("image_sample_html_m20918026.gif").copy(
        workdir / "src" / "image_sample_html_m20918026.gif")
    proc = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    resultpath, metadata = proc.process(
        str(workdir / "src" / "sample.html"), {'error': False})
    # Use a context manager so the file handle does not leak.
    with open(resultpath, 'r') as fd:
        contents = fd.read()
    resultdir = os.path.dirname(resultpath)
    snippet = '<IMG SRC="image_sample_html_m20918026.gif"'
    list_dir = os.listdir(resultdir)
    assert snippet not in contents
    assert 'image_sample_html_m20918026.gif' not in list_dir
    assert 'sample_1.gif' in list_dir
def test_rename_img_files_src_is_dir(self, workdir):
    # Source entries that turn out to be directories are tolerated and
    # do not yield a file with the destination name.
    root = str(workdir)
    mapping = {'src': 'sample.jpg'}
    cleaner = HTMLCleaner(options={'html-cleaner-fix-img-links': '1'})
    cleaner.rename_img_files(root, mapping)
    entries = os.listdir(root)
    assert 'sample.jpg' not in entries