示例#1
0
def main():
    """Fetch the base URL, log what was found, and optionally export a sitemap.

    Arguments come from ``commandline_args_setup()``. When ``args.verbose``
    is exactly ``True`` a console handler is attached to the ``'verbose'``
    logger at INFO level; the page content and links are always sent to that
    logger. The sitemap is exported to ``args.sitemap_dest`` only when
    ``_prompt_sitemap_save`` returns a truthy value.
    """
    args = commandline_args_setup()
    verbose_log = logging.getLogger('verbose')

    if args.verbose is True:
        # Route the 'verbose' logger to a console stream handler.
        stream_handler = logging.StreamHandler()
        stream_handler.setLevel(logging.INFO)
        stream_handler.setFormatter(
            logging.Formatter('%(funcName)s: %(message)s'))
        verbose_log.setLevel(logging.INFO)
        verbose_log.addHandler(stream_handler)
        verbose_log.info('Running verbose.')

    base_url = args.base_url
    html = parse.Session().html_at_url(base_url)

    # A page is a URL together with the HTML retrieved from it.
    page = parse.Page(base_url, html)
    verbose_log.info(page.content)

    links = page.links
    verbose_log.info(links)

    # Export the sitemap only when the save prompt says so.
    destination = args.sitemap_dest
    if _prompt_sitemap_save(base_url, destination):
        Sitemap(links).export(destination)
示例#2
0
    def __init__(self, config_file_name):
        """Load the configuration, then build the sitemap and template from it.

        ``config_file_name`` is handed to ``Config``. The sitemap is created
        from the ``website_base_url`` setting, and the template from
        ``self.tag['content']`` plus the ``template_path`` setting joined
        with ``Website.WEBSITE_TEMPLATE``. ``self.refresh()`` runs last.
        """
        self.config = Config(config_file_name)

        base_url = self.config.value('website_base_url')
        self.sitemap = Sitemap(base_url)

        content_tag = self.tag['content']
        template_file = (
            self.config.value('template_path') + Website.WEBSITE_TEMPLATE)
        self.template = Template(content_tag, template_file)

        self.refresh()
示例#3
0
    def get(self, url, inventory=None):
        """Get an inventory from url.

        Will either create a new Inventory object or add to one supplied.

        The handle opened on ``url`` is closed once parsing finishes,
        including when parsing raises.

        Returns the Inventory that was populated.
        """
        # Either use inventory passed in or make a new one
        if inventory is None:
            inventory = Inventory()

        inventory_fh = URLopener().open(url)
        try:
            Sitemap().inventory_parse_xml(fh=inventory_fh, inventory=inventory)
        finally:
            # Release the connection even if the XML cannot be parsed
            inventory_fh.close()
        return inventory
示例#4
0
 def write_zip(self, inventory=None, dumpfile=None):
     """Write a ZIP dump file.

     The archive receives a ``manifest.xml`` entry generated from
     ``inventory`` and then one entry per resource, added by the
     resource's ``uri``. Compression is ZIP_DEFLATED when
     ``self.compress`` is truthy and ZIP_STORED otherwise. A summary line
     with the size of ``dumpfile`` is printed once the archive is closed.
     """
     compression = ZIP_DEFLATED if self.compress else ZIP_STORED
     zf = ZipFile(dumpfile, mode="w", compression=compression, allowZip64=True)
     try:
         # Write inventory first
         s = Sitemap(pretty_xml=True, allow_multifile=False)
         zf.writestr('manifest.xml', s.resources_as_xml(inventory))
         # Add all files in the inventory
         for resource in inventory:
             zf.write(resource.uri)
     finally:
         # Always close so the file handle is released even if a write fails
         zf.close()
     zipsize = os.path.getsize(dumpfile)
     # Parenthesised single argument: prints identically on Python 2 and 3
     print("Wrote ZIP file dump %s with size %d bytes" % (dumpfile, zipsize))
示例#5
0
def test_sitemap():
    """Check Sitemap.append() and Sitemap.write().

    Appends three randomly named pages plus ``index.html``, checks the
    in-memory ``pages`` list after each step, then writes the sitemap to the
    test website's output path and checks the lines read back from
    ``sitemap.txt``.
    """

    base_url = 'https://example.com'

    page1 = 'example-%s.html' % randint(0, 999999)
    page2 = 'example-%s.html' % randint(0, 999999)
    page3 = 'example-%s.html' % randint(0, 999999)

    sitemap = Sitemap(base_url)

    # New sitemap should include no pages
    assert not sitemap.pages

    sitemap.append(page1)

    # after appending one page, sitemap should contain that page and only that page
    assert len(sitemap.pages) == 1
    assert sitemap.pages[0] == "%s/%s" % (base_url, page1)

    sitemap.append(page2)
    sitemap.append(page3)

    # after appending two more pages, sitemap should contain 3 pages
    assert len(sitemap.pages) == 3
    assert sitemap.pages[0] == "%s/%s" % (base_url, page1)
    assert sitemap.pages[1] == "%s/%s" % (base_url, page2)
    assert sitemap.pages[2] == "%s/%s" % (base_url, page3)

    sitemap.append('index.html')

    # index.html should be included as '/', without 'index.html'
    assert len(sitemap.pages) == 4
    assert sitemap.pages[3] == "%s/" % base_url

    sitemap.write(TEST_WEBSITE.config.value('output_path'))

    with open(TEST_WEBSITE.config.value('output_path') + 'sitemap.txt',
              'r') as my_file:
        sitemap_from_file = my_file.read().splitlines()

    # the in-memory list still holds the 4 entries after writing
    assert len(sitemap.pages) == 4

    # sitemap written to file should contain exactly our 4 entries
    assert len(sitemap_from_file) == 4
    assert sitemap_from_file[0] == "%s/%s" % (base_url, page1)
    assert sitemap_from_file[1] == "%s/%s" % (base_url, page2)
    assert sitemap_from_file[2] == "%s/%s" % (base_url, page3)
    assert sitemap_from_file[3] == "%s/" % base_url
示例#6
0
 def test_happy_path(self):
     """Constructing a Sitemap from the fixture loc/lastmod yields a Sitemap."""
     assert isinstance(Sitemap(loc=self.loc, lastmod=self.lastmod), Sitemap)
示例#7
0
 def test_invalid_sitemap_input(self):
     """Passing the integer 123 as ``lastmod`` must raise TypeError."""
     bad_lastmod = 123
     with pytest.raises(TypeError):
         Sitemap(loc=self.loc, lastmod=bad_lastmod)
示例#8
0
# Example invocation shown when the script is called without a site argument.
usage = "python3 __main__.py https://www.femsense.com\n"


def exit_wrong_usage():
    """Print the usage text and terminate the process with exit status 1."""
    print(usage)
    sys.exit(1)


def format_result(result: "typing.Union[CheckResult, CheckResults]") -> str:
    """Format one check result as a tab-indented status line.

    Args:
        result: Object exposing ``valid`` (its truthiness selects the symbol)
            and ``msg`` (the text placed after the symbol).

    Returns:
        A tab, a check mark when ``result.valid`` is truthy or a cross
        otherwise, a space, and ``result.msg``.
    """
    symbol = "✅" if result.valid else "❌"
    # Use the argument itself; do not rely on a global named ``check``.
    return f"\t{symbol} {result.msg}"


if __name__ == "__main__":
    # Crawl the site named by the first command-line argument and print one
    # numbered, formatted line per crawl result.
    logging.basicConfig(level=logging.INFO)
    if not sys.argv[1:]:
        exit_wrong_usage()

    site = sys.argv[1]

    crawler = Crawler(site)
    sitemap = Sitemap(site)

    results = list(crawler.crawl())
    print("Dere!")
    for n, check in enumerate(results):
        print(f"{n: 4d} {format_result(check)}")
示例#9
0
    # Read the page once and rewrite it only when URL_STABILIZATION matches.
    page_text = file_read(page)
    if URL_STABILIZATION.search(page_text):
        rePage = URL_STABILIZATION.sub("", page_text)
        link_href = re.compile('href=".*?"')
        for href in link_href.findall(rePage):
            # Skip absolute links: href[6:10] is the first four characters
            # of the URL that follows the 6-character prefix 'href="'.
            if href[6:10] != "http":
                # Literal replacement: the found attribute text may contain
                # regex metacharacters ('.', '?', '+'), so it must not be
                # used as a pattern.
                rePage = rePage.replace(
                    href, 'href="' + PATH_site_name + href[6:])
        link_src = re.compile('src=".*?"')
        for src in link_src.findall(rePage):
            # Same check for src attributes; the prefix 'src="' is 5 characters.
            if src[5:9] != "http":
                rePage = rePage.replace(
                    src, 'src="' + PATH_site_name + src[5:])
        file_write(page, rePage)
# build the sitemap
sm = Sitemap(changefreq=SITEMAP_all_update)
for page in l_page_create:
    sm.add(PATH_site_name + page[7:])
file_write(PATH_site_out + "sitemap.xml", sm.get_as_string())
l_page_create.append(PATH_site_out + "sitemap.xml")

# build robots.txt
file_write(PATH_site_out + "robots.txt", ROBOT_TXT)
l_page_create.append(PATH_site_out + "robots.txt")

# emit the HTML report: header, modules found, then pages created
print("Content-Type: text/html\n\n")
print("<html>")
print("<br>Модулей найдено для подключения:<br>")
for page in os.listdir(PATH_module):
    print(page + "<br>")
print("<br>Страниц создано:<br>")
for page in l_page_create:
    print(page + "<br>")
示例#10
0
def main(args):
    """Render the news pages, news index, home page and sitemap into ``../out``.

    ``args`` is searched for the string ``"release"``; the resulting flag is
    passed to the page templates as ``release``. Static assets are copied
    last.
    """
    release_build = "release" in args

    # NOTE(review): if copytree is shutil.copytree, copying into a directory
    # that already exists raises FileExistsError unless dirs_exist_ok=True is
    # passed — confirm which copytree this module imports.
    Path("../out/news").mkdir(parents=True, exist_ok=True)
    copytree("./news", "../out/news", ignore=ignore_patterns("*.md"))

    news_list = find_news()

    env = Environment(
        loader=PackageLoader("generate", "./templates"),
        autoescape=select_autoescape(["html", "xml"]),
    )

    # One summary dict per news entry, reused by every listing below.
    news_items = []
    for entry in news_list:
        title_link = entry.render_title_link("/" + entry.path.strip("/") + "/")
        news_items.append(
            {
                "title": Markup(title_link),
                "date": format_date(entry.date),
                "path": entry.path,
            }
        )

    sitemap = Sitemap(BASE_URL)
    for fixed_url in ("/", "/news/"):
        sitemap.add_url(fixed_url)

    canonical_root = BASE_URL.rstrip("/")

    for news in news_list:
        slug = news.path.strip("/")
        render_template(
            env,
            "news.html",
            f"../out/{slug}/index.html",
            release=release_build,
            meta_title=news.render_title(),
            meta_description=news.description,
            meta_canonical=f"{canonical_root}/{slug}/",
            content=Markup(news.html),
            date=format_date(news.date),
            # up to three other entries, excluding the one being rendered
            other_news=[
                other for other in news_items if news.path != other["path"]
            ][:3],
        )

        sitemap.add_url(news.path)

    render_template(
        env,
        "news-index.html",
        "../out/news/index.html",
        release=release_build,
        news=news_items,
        meta_title="Новости",
        meta_description=(
            "Новости либертарианства и Либертарианской Партии России в Екатеринбурге и Свердловской области"
        ),
        meta_canonical=f"{canonical_root}/news/",
    )

    render_template(
        env,
        "home.html",
        "../out/index.html",
        release=release_build,
        news=news_items[:3],
        meta_description=(
            "Выступаем за свободную экономику, независимое местное самоуправление, "
            "суверенитет личности и против цензуры в интернете. Присоединяйся!"
        ),
    )

    render_template(env, "sitemap.xml", "../out/sitemap.xml", urls=sitemap.urls)

    copytree("./static", "../out")
示例#11
0
 def new_sitemap(self):
     """Return a fresh Sitemap that inherits this object's ``pretty_xml`` setting."""
     sitemap = Sitemap(pretty_xml=self.pretty_xml)
     return sitemap