示例#1
0
def main(url, resolve_dupes=True):
    """Resolve every URL sharing a prefix with `url` and store the results.

    The last two characters of `url` are stripped, and every ordered pair
    of `CHARSET` entries (each passed through `chr()`) is appended to the
    remaining prefix in turn.  Each candidate is resolved, reported on
    stdout as ``content_type<TAB>final path element`` unless its status
    is 404, and then saved to the database regardless of status.

    :param url: `string` of the base URL to start with
    :param resolve_dupes: `Boolean` whether or not to re-process
                          existing entries; when false, candidates
                          already present in the database are skipped.
    :rtype: None

    """
    db = init_db_conn()
    skip_known = not resolve_dupes
    prefix = url[:-2]

    for first in CHARSET:
        for second in CHARSET:
            candidate = '%s%s%s' % (prefix, chr(first), chr(second))

            # Only consult the database when asked to skip entries that
            # were already processed.
            if skip_known and get_result(db, candidate):
                continue

            bitly = resolve_url(candidate)

            # 404s are not reported, but they are still saved below.
            if bitly.status != 404:
                report = '%s\t%s\n' % (bitly.content_type, bitly.path[-1])
                sys.stdout.write(report)

            save_result(db, bitly)
示例#2
0
    def do_GET(self):
        """Serve an HTML listing of stored results chosen by request path.

        ``/images`` embeds every result returned for the ``image/%``
        content type.  ``/nonhtml``, ``/nonhtml-all`` and ``/all`` hand
        differently filtered result sets to ``print_links``; any other
        path lists the results with status 200.  The response status is
        always 200.
        """
        # A fresh DB connection is opened for every request; connecting
        # once and reusing it would be more efficient.
        db = init_db_conn()

        # Query arguments for the routes rendered through print_links().
        # Paths missing from this table fall back to status=200.
        link_filters = {
            '/nonhtml': {'status': 200, 'exclude_content': 'text/html'},
            '/nonhtml-all': {'exclude_content': 'text/html'},
            '/all': {},
        }

        # Status line and headers
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.end_headers()

        # Page preamble
        self.wfile.write("<html><head><title>bit.ly grinder</title></head>")
        self.wfile.write("<body><ul>")

        if self.path == '/images':
            # Show all matching images
            matches = get_results_by_content_type(db, content_type='image/%')
            for res in matches:
                target = res.path[-1]
                origin = res.path[0]
                self.wfile.write('<br><img src="%s" />%s - %s<br>\n' %
                                 (target, origin, target))
        else:
            filters = link_filters.get(self.path, {'status': 200})
            self.print_links(get_results(db, **filters))

        # Page epilogue
        self.wfile.write("</ul></body></html>")