def test_duplicates(httpserver: HTTPServer):
    title = "content"

    httpserver.expect_request("/page2").respond_with_data(
        get_page("<h1>page2</h1>", title), content_type="text/html")
    page2_url = httpserver.url_for("/page2")

    httpserver.expect_request("/page3").respond_with_data(
        get_page("<h1>page3</h1>", "Other title"), content_type="text/html")
    page3_url = httpserver.url_for("/page3")

    httpserver.expect_request("/").respond_with_data(
        get_page(
            f"""
            <h1>page1</h1>
            <a href="{page2_url}">page 2</a>
            <a href="{page3_url}">page 3</a>
            """,
            title,
        ),
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["DuplicateTitle"])
    (res,) = crawler.asyncio_crawl(save=False)

    expected_data = [
        ResultData(title, sorted([httpserver.url_for("/"), httpserver.url_for("/page2")]))
    ]

    assert res.data == expected_data

def test_ignoring_pdf(httpserver: HTTPServer):
    httpserver.expect_request("/page1").respond_with_data(
        "<h1>Page 1</h1>",
        content_type="text/html",
    )
    httpserver.expect_request("/page2.pdf").respond_with_data(
        "<h1>Page 2</h1>",
        content_type="application/pdf",
    )
    httpserver.expect_request("/").respond_with_data(
        f"""
        <a href="{httpserver.url_for("/page1")}">page1</a>
        <a href="{httpserver.url_for("/page2.pdf")}">page2</a>
        """,
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["IgnorePDF"])
    crawler.asyncio_crawl(save=False)

    assert sorted(crawler.all_urls) == sorted([httpserver.url_for("/page1"), httpserver.url_for("/")])

def test_internal_301(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        "", status=301, content_type="text/html")
    page2_url = httpserver.url_for("/page2")

    httpserver.expect_request("/page3").respond_with_data(
        f'<a href="{page2_url}">page2</a>', status=301, content_type="text/html")
    page3_url = httpserver.url_for("/page3")

    httpserver.expect_request("/").respond_with_data(
        f"""
        <a href="{page2_url}">page2</a>
        <a href="{page3_url}">page3</a>
        """,
        content_type="text/html",
    )
    page1_url = httpserver.url_for("/")

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["Internal301"])
    (res,) = crawler.asyncio_crawl(save=False)

    expected_data = [
        ResultData(page2_url, page2_url, sorted([page1_url, page3_url])),
        ResultData(page3_url, page3_url, [page1_url]),
    ]

    assert res.data == expected_data

def main(url, verbose, plugin, verify, disable, delay, engine, workers, **kwargs):
    """This script will crawl the given URL and analyse the output using plugins."""
    if verbose:
        traceback_install()

    if url is None:
        console = Console()
        ctx = click.get_current_context()

        try:
            with open("README.md") as f:
                rawMarkdown = f.read()
            md = Markdown(rawMarkdown)
            console.print(md)
            console.print("\n")
        except FileNotFoundError:
            pass

        console.print(ctx.get_help())
        ctx.exit()

    crawler = Crawler(
        url,
        verbose=verbose,
        plugins=plugin,
        verify=verify,
        disabled=disable,
        delay=delay,
        engine=engine,
        plugin_options=kwargs,
    )

    asyncio.run(crawler.crawl())

def test_external(httpserver: HTTPServer):
    link = "http://example.com/"

    httpserver.expect_request("/page1").respond_with_data(
        f'<a href="{link}">external</a>',
        content_type="text/html",
    )
    httpserver.expect_request("/page2").respond_with_data(
        f'<a href="{link}">external</a>',
        content_type="text/html",
    )
    httpserver.expect_request("/").respond_with_data(
        f"""
        <img src="internal.png" />
        <a href="{httpserver.url_for("/page1")}">page1</a>
        <a href="{httpserver.url_for("/page2")}">page2</a>
        """,
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["ExternalLinksByURL"])
    (res,) = crawler.asyncio_crawl(save=False)

    expected_data = [
        ResultData(link, sorted([httpserver.url_for("/page1"), httpserver.url_for("/page2")]))
    ]

    assert res.data == expected_data

def test_runs_plugin(httpserver: HTTPServer):
    httpserver.expect_request("/").respond_with_data(
        "<h1>page2</h1>", content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["MultipleH1", "MissingH1"])
    results = crawler.asyncio_crawl(save=False)

    assert len(results) == 2

def test_not_external(httpserver: HTTPServer):
    httpserver.expect_request("/").respond_with_data(
        '<a href="/bob">bob</a>',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["ExternalLinks"])
    (res,) = crawler.asyncio_crawl(save=False)

    assert res.data == []

def test_not_external(httpserver: HTTPServer):
    httpserver.expect_request("/").respond_with_data(
        '<img src="internal.png" />',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["ExternalImages"])
    (res,) = crawler.asyncio_crawl(save=False)

    assert res.data == []

def test_can_crawl(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        "<h1>page2</h1>", content_type="text/html")
    page2_url = httpserver.url_for("/page2")

    httpserver.expect_request("/").respond_with_data(
        f'<a href="{page2_url}">page 2</a>', content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"), verbose=False)
    crawler.asyncio_crawl(save=False)

    assert len(crawler.all_urls) == 2
    assert page2_url in crawler.all_urls

def test_external(httpserver: HTTPServer):
    link = "http://example.com/"

    httpserver.expect_request("/").respond_with_data(
        f'<a href="{link}">external</a>',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["ExternalLinks"])
    (res,) = crawler.asyncio_crawl(save=False)

    expected_data = [ResultData(link)]

    assert res.data == expected_data

def test_no_duplicates(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        f"<p>{lorem.paragraph()}</p>", content_type="text/html")
    page2_link = f'<a href="{httpserver.url_for("/page2")}">page 2</a>'

    httpserver.expect_request("/page3").respond_with_data(
        f"<p>{lorem.paragraph()}</p>", content_type="text/html")
    page3_link = f'<a href="{httpserver.url_for("/page3")}">page 3</a>'

    httpserver.expect_request("/").respond_with_data(
        f"{page2_link}{page3_link}", content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["DuplicateContent"])
    (res,) = crawler.asyncio_crawl(save=False)

    expected_data = []

    assert res.data == expected_data

def test_no_duplicates(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        "<h1>page2</h1>", content_type="text/html")
    page2_url = httpserver.url_for("/page2")

    httpserver.expect_request("/").respond_with_data(
        f'<h1>page1</h1><a href="{page2_url}">page 2</a>', content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["DuplicateH1"])
    res = crawler.asyncio_crawl(save=False)[0]

    assert res.data == []

def test_external(httpserver: HTTPServer):
    img = "http://example.com/external.png"

    httpserver.expect_request("/").respond_with_data(
        f'<img src="{img}" />',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["ExternalImages"])
    (res,) = crawler.asyncio_crawl(save=False)

    expected_data = [ResultData(img)]

    assert res.data == expected_data

def list_plugins(ctx, param, value):
    # click option callback: print the available plugins and exit.
    if not value or ctx.resilient_parsing:
        return

    click.secho("\nAvailable plugins:\n", fg="green")

    plugins = Crawler.get_plugin_list()
    for plugin in plugins:
        click.echo(f"\t{plugin}")

    ctx.exit()

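# A minimal sketch (not from the source) of how a callback like list_plugins is
# typically attached to a click command as an eager flag. The "--list-plugins"
# option name and the cli() command are hypothetical.
@click.command()
@click.option(
    "--list-plugins",
    is_flag=True,
    is_eager=True,
    expose_value=False,
    callback=list_plugins,
    help="List the available plugins and exit.",
)
def cli():
    ...
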
class SeoGUI(QtWidgets.QMainWindow, ui.Ui_MainWindow):
    def __init__(self, parent=None):
        super().__init__(parent)
        self.setupUi(self)
        self.connectSignalsSlots()

    @asyncSlot()
    async def doCrawl(self):
        url = self.inputURL.text()
        self.crawler = Crawler(url)
        ic(self.crawler.get_plugin_list())
        await self.crawler.crawl()

    def updateOutput(self, text):
        self.textOutput.document().setPlainText(text)

    def connectSignalsSlots(self):
        self.btnCrawl.clicked.connect(self.doCrawl)

def test_clean_name_changes():
    name = "-bOb:: '#' ::dOd-"
    cleaned_name = Crawler._clean_filename(name)

    assert cleaned_name == "bob-dod"

# Third Party
from bs4 import BeautifulSoup

# First Party
from processors.plugins.MissingMeta import MissingMeta, ResultData
from seotool.crawl import Crawler

c = Crawler("example.com")


def test_missing_meta():
    html = BeautifulSoup(
        """
        <p>content</p>
        """,
        "html.parser",
    )

    plugin = MissingMeta(c)
    plugin.process(html=html, url="/")
    res = plugin.get_results_set()

    assert res.data == [ResultData("/")]


def test_empty_meta():
    html = BeautifulSoup(
        """
        <meta name="description" content="" />
        """,
        "html.parser",
    )

    # Assumed: an empty description is reported the same way as a missing one.
    plugin = MissingMeta(c)
    plugin.process(html=html, url="/")
    res = plugin.get_results_set()

    assert res.data == [ResultData("/")]

def test_clean_name_no_change():
    name = "bob"
    cleaned_name = Crawler._clean_filename(name)

    assert name == cleaned_name

def test_skip_page():
    c = Crawler("example.com")

    with pytest.raises(SkipPage):
        c.skip_page()

with open("README.md") as f: rawMarkdown = f.read() md = Markdown(rawMarkdown) console.print(md) console.print("\n") except FileNotFoundError: pass console.print(ctx.get_help()) ctx.exit() crawler = Crawler( url, verbose=verbose, plugins=plugin, verify=verify, disabled=disable, delay=delay, engine=engine, plugin_options=kwargs, ) asyncio.run(crawler.crawl()) options = Crawler.get_extra_options() for plugin_options in options: for func in plugin_options: func(main) if __name__ == "__main__": main()