def generate(crawler): """Output the list of not checked pages.""" session = Session() links = session.query(Link).filter(Link.yanked != None).order_by(Link.url) links = links.options(joinedload(Link.linkproblems)) render(__outputfile__, crawler=crawler, title=__title__, links=links) session.close()
def generate(crawler): """Output a sorted list of URLs.""" session = Session() links = session.query(Link).order_by(Link.url) render(__outputfile__, crawler=crawler, title=__title__, links=links) session.close()
def generate(crawler): """Present the list of bad links.""" session = Session() links = session.query(Link).filter(Link.linkproblems.any()) links = links.order_by(Link.url).options(joinedload(Link.linkproblems)) render(__outputfile__, crawler=crawler, title=__title__, links=links) session.close()
def generate(crawler): """Output the list of pages without a title.""" session = Session() links = session.query(Link).filter_by(is_page=True, is_internal=True) links = links.filter((char_length(Link.title) == 0) | (Link.title == None)).order_by(Link.url) render(__outputfile__, crawler=crawler, title=__title__, links=links) session.close()
def generate(crawler): """Generate the list of external links.""" session = Session() links = session.query(Link).filter(Link.is_internal != True).order_by(Link.url) links = links.options(joinedload(Link.linkproblems)) render(__outputfile__, crawler=crawler, title=__title__, links=links) session.close()
def generate(crawler): """Output the list of outdated pages to the specified file descriptor.""" session = Session() oldtime = datetime.datetime.now() - datetime.timedelta(days=config.REPORT_WHATSOLD_URL_AGE) links = session.query(Link).filter_by(is_page=True, is_internal=True) links = links.filter(Link.mtime < oldtime).order_by(Link.mtime) render(__outputfile__, crawler=crawler, title=__title__, links=links, now=datetime.datetime.now()) session.close()
def generate(crawler): """Output the list of recently modified pages.""" session = Session() newtime = datetime.datetime.now() - datetime.timedelta(days=config.REPORT_WHATSNEW_URL_AGE) links = session.query(Link).filter_by(is_page=True, is_internal=True) links = links.filter(Link.mtime > newtime).order_by(Link.mtime.desc()) render(__outputfile__, crawler=crawler, title=__title__, links=links, now=datetime.datetime.now()) session.close()
def generate(crawler): """Output the sitemap.""" session = Session() links = [ session.query(Link).filter_by(url=url).first() for url in crawler.base_urls ] links = explore(links) render(__outputfile__, crawler=crawler, title=__title__, links=links)
def generate(crawler): """Output the list of large pages.""" session = Session() links = session.query(Link).filter_by(is_page=True, is_internal=True) links = [x for x in links if get_size(x) >= config.REPORT_SLOW_URL_SIZE * 1024] links.sort(lambda a, b: cmp(b.total_size, a.total_size)) render(__outputfile__, crawler=crawler, title=__title__, links=links) session.close()
def generate(crawler): """Generate a list of image URLs that were found.""" session = Session() # get non-page links that have an image/* mimetype links = session.query(Link) links = links.filter((Link.is_page != True) | (Link.is_page == None)) links = links.filter(Link.mimetype.startswith('image/')) links = links.order_by(Link.url) render(__outputfile__, crawler=crawler, title=__title__, links=links) session.close()
def generate(crawler): """Output the list of large pages.""" session = Session() links = session.query(Link).filter_by(is_page=True, is_internal=True) links = [ x for x in links if get_size(x) >= config.REPORT_SLOW_URL_SIZE * 1024 ] links.sort(lambda a, b: cmp(b.total_size, a.total_size)) render(__outputfile__, crawler=crawler, title=__title__, links=links) session.close()
def generate(crawler): """Output the list of outdated pages to the specified file descriptor.""" session = Session() oldtime = datetime.datetime.now() - datetime.timedelta( days=config.REPORT_WHATSOLD_URL_AGE) links = session.query(Link).filter_by(is_page=True, is_internal=True) links = links.filter(Link.mtime < oldtime).order_by(Link.mtime) render(__outputfile__, crawler=crawler, title=__title__, links=links, now=datetime.datetime.now()) session.close()
def generate(crawler): """Output the overview of problems per author.""" session = Session() # make a list of problems per author problem_db = collections.defaultdict(list) # get internal links with page problems links = session.query(Link).filter_by(is_internal=True) links = links.filter(Link.pageproblems.any()).order_by(Link.url) for link in links: author = link.author.strip() if link.author else u'Unknown' problem_db[author].append(link) # get a sorted list of authors authors = problem_db.keys() authors.sort() authors = [(x, problem_db[x]) for x in authors] render(__outputfile__, crawler=crawler, title=__title__, authors=authors, mk_id=mk_id) session.close()
def generate(crawler): """Output a list of modules, it's authors and the webcheck version.""" session = Session() render(__outputfile__, crawler=crawler, title=__title__, numlinks=session.query(Link).count()) session.close()