import spider

def get_urls(uri, output):
    # MAX_NUMBER_OF_PAGES_TO_CRAWL and MAX_NUMBER_OF_LINKS_TO_FOLLOW are
    # constants defined elsewhere in the calling script.
    print "Spidering %s, getting %d maximum pages and following %d links deep." % (
        uri, MAX_NUMBER_OF_PAGES_TO_CRAWL, MAX_NUMBER_OF_LINKS_TO_FOLLOW)
    urls = spider.weburls(uri,
                          width=MAX_NUMBER_OF_PAGES_TO_CRAWL,
                          depth=MAX_NUMBER_OF_LINKS_TO_FOLLOW)

    # Stash the collected URLs on the spider module so webreport() can use them.
    spider.urls = urls
    print "Generating report..."
    spider.webreport(output)
    print "Report of URLs written to %s" % output
Example #2
import spider
from pprint import pprint

if __name__ == '__main__':
    # Exercise the module-level helpers against local FTP and HTTP servers.
    a = spider.ftpurls('ftp://localhost/')
    print 1; pprint(a)
    a = spider.ftppaths('ftp://localhost')
    print 2; pprint(a)
    a = spider.weburls('http://localhost/')
    print 3; pprint(a)
    a = spider.weburls('http://localhost/', 200, 5, 3)
    print 4; pprint(a)
    spider.ftpmirror('e:\\ftp\\', 14, 'ftp://localhost/')
    a = spider.ftpspider('ftp://localhost/')
    print 5; pprint(a)
    a = spider.webpaths('http://localhost/')
    print 6; pprint(a)
    spider.webreport('e:\\web1.txt', 'http://localhost/')
    spider.webmirror('e:\\web\\', 18, 'http://localhost/')
    a = spider.webspider('http://localhost/')
    print 7; pprint(a)
    spider.urlreport('e:\\web2.txt', 'http://localhost/')
    spider.badurlreport('e:\\web3.txt', 'http://localhost/')
    spider.badhtmreport('e:\\web4.txt', 'http://localhost/')
    spider.redireport('e:\\web5.txt', 'http://localhost/')
    spider.outreport('e:\\web6.txt', 'http://localhost')
    spider.othereport('e:\\web7.txt', 'http://localhost/')
    # Drive a Spider instance directly instead of the module-level helpers.
    a = spider.Spider('ftp://localhost/', 200, 16)
    a.ftppaths()
    print 1; pprint(a.paths)
    a.ftpurls()
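The instance example stops after a.ftpurls(); assuming that call stores its results on the instance the same way a.ftppaths() populates a.paths (an assumption mirroring the pattern above), the result could be inspected like this:

    # Assumes ftpurls() populates a.urls, mirroring how ftppaths() fills a.paths.
    pprint(a.urls)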
Example #3
import spider

urllist = spider.weburls(base="https://arxiv.org/catchup")
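Judging from Example #2, weburls() appears to return a flat list of URL strings, so the result can be inspected or filtered with ordinary list operations; a minimal sketch (the .pdf filter is purely illustrative):

from pprint import pprint

# Inspect the crawl results; weburls() appears to return a list of URL strings.
pprint(urllist)

# Illustrative post-processing: keep only links that point at PDF files.
pdf_links = [u for u in urllist if u.lower().endswith('.pdf')]
pprint(pdf_links)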