Example #1
import os
import sys

# WebScraper and indexer are project-specific objects assumed to be imported/defined elsewhere.
def go_index(page):
    """
    Recursively scrape web pages: for every URL found on a page, go_index()
    calls itself to keep crawling.
    Caution: this is a big memory hog and it WILL fail eventually
    (unbounded recursion, no depth limit).
    """
    try:
        scraper = WebScraper()
        if scraper.scrape(page):
            domain = indexer.get_domain(page)
            indexer.index_domain(domain)
            indexer.index_file(domain, domain, True)
            urls = scraper.get_link_urls()
            if indexer.has_crawable(urls):
                for url in urls:
                    title = url.encode(encoding="utf-8", errors="replace")
                    hash_path = "{0}.link".format(indexer.do_hash(title))
                    path = os.path.join(domain, hash_path)
                    if not os.path.exists(path) and url != scraper.page:
                        indexer.index_file(url, domain, True)
                        go_index(url)
        else:
            print("Can not scrape requested page {0}".format(page))
    except RuntimeError:
        # Figure out a way to respawn in another thread
        print("Runtime Error occurred. Killing Script")
        sys.exit()
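A note on Example #1: the docstring warns that the recursive crawl is a memory hog and will eventually fail (Python's recursion limit, plus one WebScraper per stack frame). Below is a minimal iterative sketch under the same assumptions, reusing the WebScraper/indexer calls exactly as Example #1 does; it is not part of the original project.

import os
from collections import deque

def go_index_iterative(seed):
    """Breadth-first variant of go_index() that avoids recursion entirely."""
    queue = deque([seed])
    seen = {seed}
    while queue:
        page = queue.popleft()
        scraper = WebScraper()             # assumed project class, as in Example #1
        if not scraper.scrape(page):
            print("Cannot scrape requested page {0}".format(page))
            continue
        domain = indexer.get_domain(page)  # assumed project module, as in Example #1
        indexer.index_domain(domain)
        indexer.index_file(domain, domain, True)
        for url in scraper.get_link_urls():
            title = url.encode(encoding="utf-8", errors="replace")
            hash_path = "{0}.link".format(indexer.do_hash(title))
            path = os.path.join(domain, hash_path)
            if url not in seen and not os.path.exists(path):
                seen.add(url)
                indexer.index_file(url, domain, True)
                queue.append(url)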
Example #2
import os
import shutil

import indexer  # project module providing index_file()

def do_index(config, testrun):
    dbpath = testrun.dbpath(config)
    indexlogpath = testrun.indexlogpath(config)

    if not config.preserve or \
       not os.path.exists(dbpath) or \
       not os.path.exists(indexlogpath):

        if os.path.exists(dbpath):
            shutil.rmtree(dbpath)
        if os.path.exists(indexlogpath):
            os.unlink(indexlogpath)

        print "Starting index run (creating %s)" % dbpath
        indexer.index_file(inputfile=testrun.inputfile,
                           dbpath=dbpath,
                           logpath=indexlogpath,
                           flushspeed=testrun.flushspeed,
                           description=testrun.description,
                           maxdocs=testrun.maxdocs,
                           logspeed=testrun.logspeed)
        print "Ending index run"
Example #3

import pickle

import data_load
import indexer
import searcher

# visit_url() and getWeather() are assumed to be defined earlier in this script.
crawler_backlog = {}
crawler_data = []
seed = "http://www.newhaven.edu/"
crawler_backlog[seed] = 0

print("Creating Web Pickle....")
visit_url(seed, "www.newhaven.edu")  # create raw_web.pickle with web contents (list of tuples)
out = open("raw_web.pickle", "wb")
pickle.dump(crawler_data,out)
out.close()
print("Creating Data Pickle....")
data_load.traverse("fortune1") #creates raw_data.pickle with file contents   		 List of Tuples
print("Indexing Web Pickle....")
indexer.index_file("raw_web.pickle","out_data")
print("Indexing Data Pickle....")
indexer.index_file("raw_data.pickle","out_data")


getWeather("West Haven","CT")
searcher.searchFile("out_data")

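The script in Example #3 dumps crawler_data to raw_web.pickle and then hands both pickles to indexer.index_file(). As a quick sanity check before indexing, a pickle can be loaded back and inspected; this sketch only assumes what the comments state, namely that the file holds a list of tuples:

import pickle

# Load the crawl output and confirm its shape before indexing it.
with open("raw_web.pickle", "rb") as handle:
    records = pickle.load(handle)

print("{0} records in raw_web.pickle".format(len(records)))
for record in records[:3]:
    # Each record is expected to be a tuple, e.g. (source, contents).
    print(type(record).__name__, len(record))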
Example #4
import data_load
import searcher
import indexer


#data_load.traverse("fortune1")
wordDictionary = indexer.index_file("raw_data.pickle", "the_shelve")
searcher.searchFile("the_shelve")