def main(): """main driver""" args = commandline_args_setup() # handle verbose option if args.verbose is True: # setup a logging handler for the command line console = logging.StreamHandler() # stream=sys.stdout console.setLevel(logging.INFO) formatter = logging.Formatter('%(funcName)s: %(message)s') console.setFormatter(formatter) # add the handler to the verbose logger verbose = logging.getLogger('verbose') verbose.setLevel(logging.INFO) verbose.addHandler(console) verbose.info('Running verbose.') base_url = args.base_url session = parse.Session() html = session.html_at_url(base_url) # this web page is defined as a URL and some HTML page = parse.Page(base_url, html) logging.getLogger('verbose').info(page.content) links = page.links logging.getLogger('verbose').info(links) # store the file if opted sm_arg = args.sitemap_dest to_save = _prompt_sitemap_save(base_url, sm_arg) if to_save: sm = Sitemap(links) sm.export(sm_arg)
def __init__(self, config_file_name):
    """Load configuration, build the sitemap and template, then refresh.

    NOTE(review): `self.tag` is read here but not assigned in this
    method — presumably set by the enclosing class; verify.
    """
    self.config = Config(config_file_name)
    base_url = self.config.value('website_base_url')
    self.sitemap = Sitemap(base_url)
    template_file = self.config.value('template_path') + Website.WEBSITE_TEMPLATE
    self.template = Template(self.tag['content'], template_file)
    self.refresh()
def get(self, url, inventory=None):
    """Get an inventory from url.

    Will either create a new Inventory object or add to one supplied.
    Returns the populated inventory.
    """
    # Either use the inventory passed in or make a new one.
    if inventory is None:
        inventory = Inventory()
    inventory_fh = URLopener().open(url)
    try:
        Sitemap().inventory_parse_xml(fh=inventory_fh, inventory=inventory)
    finally:
        # Close the URL handle even when parsing raises
        # (the original leaked it on error).
        inventory_fh.close()
    return inventory
def write_zip(self, inventory=None, dumpfile=None):
    """Write a ZIP dump file.

    Serializes the inventory as manifest.xml and then adds every
    resource's uri to the archive, printing a size summary when done.
    """
    compression = ZIP_DEFLATED if self.compress else ZIP_STORED
    # Context manager guarantees the archive is closed even if a
    # resource file is missing and zf.write() raises (the original
    # only closed on the success path).
    with ZipFile(dumpfile, mode="w", compression=compression,
                 allowZip64=True) as zf:
        # Write the inventory manifest first.
        s = Sitemap(pretty_xml=True, allow_multifile=False)
        zf.writestr('manifest.xml', s.resources_as_xml(inventory))
        # Add all files in the inventory.
        for resource in inventory:
            # resource.uri is assumed to be a local file path — TODO confirm.
            zf.write(resource.uri)
    zipsize = os.path.getsize(dumpfile)
    # Parenthesized single-argument print is valid in both Python 2 and 3
    # (the original used a Python-2-only print statement).
    print("Wrote ZIP file dump %s with size %d bytes" % (dumpfile, zipsize))
def test_sitemap():
    """Exercise Sitemap: append, index.html normalization, file export."""
    base_url = 'https://example.com'
    page1, page2, page3 = (
        'example-%s.html' % randint(0, 999999) for _ in range(3)
    )
    sitemap = Sitemap(base_url)

    # A brand-new sitemap contains no pages.
    assert not sitemap.pages

    # After appending one page, the sitemap holds exactly that page.
    sitemap.append(page1)
    assert len(sitemap.pages) == 1
    assert sitemap.pages[0] == "%s/%s" % (base_url, page1)

    # After appending two more pages, the sitemap holds all three in order.
    sitemap.append(page2)
    sitemap.append(page3)
    assert len(sitemap.pages) == 3
    for i, name in enumerate((page1, page2, page3)):
        assert sitemap.pages[i] == "%s/%s" % (base_url, name)

    # index.html should be included as '/', without 'index.html'.
    sitemap.append('index.html')
    assert len(sitemap.pages) == 4
    assert sitemap.pages[3] == "%s/" % base_url

    # Round-trip: the sitemap written to disk contains all four entries.
    output_path = TEST_WEBSITE.config.value('output_path')
    sitemap.write(output_path)
    with open(output_path + 'sitemap.txt', 'r') as my_file:
        sitemap_from_file = my_file.read().splitlines()
    assert len(sitemap.pages) == 4
    for i, name in enumerate((page1, page2, page3)):
        assert sitemap_from_file[i] == "%s/%s" % (base_url, name)
    assert sitemap_from_file[3] == "%s/" % base_url
def test_happy_path(self):
    """A Sitemap built from valid loc/lastmod is a Sitemap instance."""
    built = Sitemap(loc=self.loc, lastmod=self.lastmod)
    assert isinstance(built, Sitemap)
def test_invalid_sitemap_input(self):
    """Constructing a Sitemap with a non-string lastmod raises TypeError."""
    # Callable form of pytest.raises: invoke Sitemap with the bad kwargs.
    pytest.raises(TypeError, Sitemap, loc=self.loc, lastmod=123)
usage = """\ python3 __main__.py https://www.femsense.com """ def exit_wrong_usage(): print(usage) sys.exit(1) def format_result(result: typing.Union[CheckResult, CheckResults]): symbol = "✅" if result.valid else "❌" return f"\t{symbol} {check.msg}" if __name__ == "__main__": logging.basicConfig(level=logging.INFO) if len(sys.argv) < 2: exit_wrong_usage() site = sys.argv[1] crawler = Crawler(site) sitemap = Sitemap(site) results = [*crawler.crawl()] print("Dere!") for n, check in enumerate(results): # if not check.valid: print(f"{n: 4d} {format_result(check)}")
# NOTE(review): `page` is presumably bound by an enclosing loop over the
# generated pages that lies outside this chunk — TODO confirm.
# Rewrite relative href/src links in the page into absolute site URLs.
if (URL_STABILIZATION.search(file_read(page))):
    rePage = URL_STABILIZATION.sub("", file_read(page))
    link_href = re.compile('href=".*?"')
    for href in link_href.findall(rePage):
        # Check for absolute links: skip hrefs already starting with "http".
        if (not href[6:10] == "http"):
            rePage = re.sub(href, 'href="' + PATH_site_name + href[6:], rePage)
    link_src = re.compile('src=".*?"')
    for src in link_src.findall(rePage):
        # Same absolute-link check for src attributes.
        if (not src[5:9] == "http"):
            rePage = re.sub(src, 'src="' + PATH_site_name + src[5:], rePage)
    file_write(page, rePage)
# Create the sitemap from every generated page.
sm = Sitemap(changefreq=SITEMAP_all_update)
for page in l_page_create:
    # page[7:] presumably strips a local output-directory prefix — TODO confirm.
    sm.add(PATH_site_name + page[7:])
file_write(PATH_site_out + "sitemap.xml", sm.get_as_string())
l_page_create.append(PATH_site_out + "sitemap.xml")
# Create robots.txt.
file_write(PATH_site_out + "robots.txt", ROBOT_TXT)
l_page_create.append(PATH_site_out + "robots.txt")
# Emit the CGI report page (Python 2 print statements).
print "Content-Type: text/html\n\n"
print "<html>"
print "<br>Модулей найдено для подключения:<br>"
for page in os.listdir(PATH_module):
    print page + "<br>"
print "<br>Страниц создано:<br>"
for page in l_page_create:
    print page + "<br>"
def main(args):
    """Generate the news section of the static site.

    Copies static news assets, renders each news article plus the news
    index and home pages, and writes sitemap.xml for the whole site.
    `args` is a sequence of CLI flags; "release" enables release mode.
    """
    is_release = "release" in args
    # Ensure the output directory exists, then copy non-markdown assets.
    Path("../out/news").mkdir(parents=True, exist_ok=True)
    copytree("./news", "../out/news", ignore=ignore_patterns("*.md"))
    news_list = find_news()
    env = Environment(
        loader=PackageLoader("generate", "./templates"),
        autoescape=select_autoescape(["html", "xml"]),
    )
    # Pre-rendered metadata dicts shared by article, index, and home pages.
    news_items = [
        {
            "title": Markup(news.render_title_link("/" + news.path.strip("/") + "/")),
            "date": format_date(news.date),
            "path": news.path,
        }
        for news in news_list
    ]
    sitemap = Sitemap(BASE_URL)
    sitemap.add_url("/")
    sitemap.add_url("/news/")
    # One output page per news article, each registered in the sitemap.
    for news in news_list:
        render_template(
            env,
            "news.html",
            f"../out/{news.path.strip('/')}/index.html",
            release=is_release,
            meta_title=news.render_title(),
            meta_description=news.description,
            meta_canonical=f'{BASE_URL.rstrip("/")}/{news.path.strip("/")}/',
            content=Markup(news.html),
            date=format_date(news.date),
            # Up to three other articles for the "related" section.
            other_news=[on for on in news_items if news.path != on["path"]][:3],
        )
        sitemap.add_url(news.path)
    # News index page.
    render_template(
        env,
        "news-index.html",
        "../out/news/index.html",
        release=is_release,
        news=news_items,
        meta_title="Новости",
        meta_description=(
            "Новости либертарианства и Либертарианской Партии России в Екатеринбурге и Свердловской области"
        ),
        meta_canonical=f'{BASE_URL.rstrip("/")}/news/',
    )
    # Home page shows only the three most recent items.
    render_template(
        env,
        "home.html",
        "../out/index.html",
        release=is_release,
        news=news_items[:3],
        meta_description=(
            "Выступаем за свободную экономику, независимое местное самоуправление, "
            "суверенитет личности и против цензуры в интернете. Присоединяйся!"
        ),
    )
    # Sitemap last, after all URLs are registered.
    render_template(env, "sitemap.xml", "../out/sitemap.xml", urls=sitemap.urls)
    # NOTE(review): copytree into an existing "../out" requires
    # dirs_exist_ok=True on Python 3.8+ — confirm this call succeeds.
    copytree("./static", "../out")
def new_sitemap(self):
    """Return a fresh Sitemap that inherits this object's pretty_xml setting."""
    sitemap = Sitemap(pretty_xml=self.pretty_xml)
    return sitemap