def crawl_web(seed):
    """Crawl outward from `seed` and return the populated WebCorpus.

    Every URL not yet visited is fetched with get_page(), its links are
    extracted with get_all_links(), and the page is recorded through
    corpus.add_page(url, content, outlinks).  Once the frontier is empty,
    corpus.finish_crawl() is called before the corpus is returned.

    Visit order is unspecified because the frontier is a set and
    set.pop() removes an arbitrary element.
    """
    tocrawl = set([seed])
    # A set (not a list) keeps the "already crawled?" check O(1) per URL.
    crawled = set()
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url in crawled:
            # Links back to visited pages get re-queued; skip them here.
            continue
        content = get_page(url)
        outlinks = get_all_links(content)
        corpus.add_page(url, content, outlinks)
        tocrawl.update(outlinks)
        crawled.add(url)
    corpus.finish_crawl()
    return corpus
def crawl_web(seed):
    """Crawl outward from `seed` and return a WebCorpus holding index and graph.

    For every URL not yet visited, the page content is passed to
    add_page_to_index() together with wcorpus.index, and the links found
    by get_all_links() are stored as wcorpus.graph[url] (i.e. the graph
    maps each crawled URL to its outgoing links).

    Visit order is unspecified because the frontier is a set and
    set.pop() removes an arbitrary element.
    """
    tocrawl = set([seed])
    # A set (not a list) keeps the "already crawled?" check O(1) per URL.
    crawled = set()
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url in crawled:
            # Links back to visited pages get re-queued; skip them here.
            continue
        content = get_page(url)
        add_page_to_index(wcorpus.index, url, content)
        outlinks = get_all_links(content)
        wcorpus.graph[url] = outlinks
        tocrawl.update(outlinks)
        crawled.add(url)
    return wcorpus
def crawl_web(seed):
    """Crawl outward from `seed` and return the populated WebCorpus.

    For every URL not yet visited, each link found by get_all_links() is
    recorded with wcorpus.add_link(url, outlink) and each
    whitespace-separated token of the page content is recorded with
    wcorpus.add_word_occurrence(url, word).

    Visit order is unspecified because the frontier is a set and
    set.pop() removes an arbitrary element.
    """
    tocrawl = set([seed])
    # A set (not a list) keeps the "already crawled?" check O(1) per URL.
    crawled = set()
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url in crawled:
            # Links back to visited pages get re-queued; skip them here.
            continue
        content = get_page(url)
        outlinks = get_all_links(content)
        for outlink in outlinks:
            wcorpus.add_link(url, outlink)
        for word in content.split():
            wcorpus.add_word_occurrence(url, word)
        tocrawl.update(outlinks)
        crawled.add(url)
    return wcorpus
def test_engine():
    """Smoke-test WebCorpus: empty lookup, link recording and word indexing.

    Raises AssertionError if any check fails.  The checks read the
    corpus's private _graph and _index attributes directly.
    """
    print("Testing...")
    page_url = "http://www.test.info"
    # The two links a sample page at page_url is taken to contain.
    outlinks = ["http://www.example.com", "http://www.go.to"]
    corpus = WebCorpus()
    # A fresh corpus has no entry for any word.
    assert corpus.lookup("anything") is None
    for link in outlinks:
        corpus.add_link(page_url, link)
    # Links must be stored per source page, in insertion order.
    assert corpus._graph[page_url] == outlinks
    corpus.add_word_occurrence(page_url, "sample")
    assert corpus._index["sample"] == [page_url]
    print("Finished tests.")
def crawl_web(seed):
    """Crawl outward from `seed` and return the populated WebCorpus.

    For every URL not yet visited, the page content is handed to
    add_page_to_index(corpus, url, content) and each link found by
    get_all_links() is recorded with corpus.add_link(url, outlink).

    Visit order is unspecified because the frontier is a set and
    set.pop() removes an arbitrary element.
    """
    tocrawl = set([seed])
    # A set (not a list) keeps the "already crawled?" check O(1) per URL.
    crawled = set()
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url in crawled:
            # Links back to visited pages get re-queued; skip them here.
            continue
        content = get_page(url)
        add_page_to_index(corpus, url, content)
        outlinks = get_all_links(content)
        for outlink in outlinks:
            corpus.add_link(url, outlink)
        tocrawl.update(outlinks)
        crawled.add(url)
    return corpus
###
### You should define the WebCorpus class in the file webcorpus.py
###

from webcorpus import WebCorpus

print("Testing webcorpus...")
wc1 = WebCorpus()
# A freshly built corpus must expose both containers as dictionaries;
# index is checked before graph.
for _attr_name in ("index", "graph"):
    assert isinstance(getattr(wc1, _attr_name), dict)
print("Finished tests.")