def crawl_web(seed,max_depth,crawled): # returns index, graph of inlinks" tocrawl = set(seed) next_depth = [] depth = 0 wcorpus = WebCorpus() last_url = None while tocrawl : url = tocrawl.pop() if url not in crawled: try: content = real_get_page(url) if content: words = get_all_words(content) if 'joe' in words or 'Joe' in words or 'joseph' in words or 'Joseph' in words: print url outlinks = get_all_links(content) for outlink in outlinks: wcorpus.add_link(url,outlink) if depth >= max_depth: outlinks = set(outlinks) tocrawl.update(outlinks) tolist = list(tocrawl) write_tocrawl(tolist) write_crawled(crawled) return wcorpus soup = BeautifulSoup(content, "html.parser") try: title = soup.title.string except: title = '' try: desc = str(soup.findAll(attrs={"name":"description"})[0]['content'].encode('utf-8')) except: desc = '' for word in words: wcorpus.add_word_occurrence(url,str(word.encode("UTF-8")),title,desc) tocrawl.update(outlinks) tolist = list(tocrawl) d = {'tocrawl':tolist} with open('tocrawl_json.json', 'w') as outfile: json.dump(d, outfile) crawled.append(url) depth+=1 outlinks = get_all_links(content) crawled.append(url) tocrawl.update(outlinks) except Exception, e: print "ERROR "+str(e)
def test_engine(): print "Testing..." content = """This is a sample <a href="http://www.example.com">webpage</a> with <a href="http://www.go.to">two links</a> that lead nowhere special. """ outlinks = ["http://www.example.com", "http://www.go.to"] corpus = WebCorpus() assert corpus.lookup("anything") == None for link in outlinks: corpus.add_link("http://www.test.info", link) assert corpus._graph["http://www.test.info"] == outlinks corpus.add_word_occurrence("http://www.test.info", "sample") assert corpus._index["sample"] == ["http://www.test.info"] print "Finished tests."
def crawl_web(seed):
    """Crawl outward from a single seed URL.

    Returns a WebCorpus holding the word index and the outlink graph.
    """
    frontier = set([seed])
    visited = []  # pages already fetched, so no page is processed twice
    corpus = WebCorpus()
    while frontier:
        page_url = frontier.pop()
        if page_url in visited:
            continue
        html = get_page(page_url)
        links = get_all_links(html)
        # Record one graph edge per outgoing link.
        for target in links:
            corpus.add_link(page_url, target)
        # Index every whitespace-separated token on the page.
        for token in html.split():
            corpus.add_word_occurrence(page_url, token)
        frontier.update(links)
        visited.append(page_url)
    return corpus
def crawl_web(seed):
    """Crawl outward from a single seed URL.

    Returns:
        A WebCorpus holding the word index and the outlink graph.

    Improvement: `crawled` is purely local and used only for membership
    tests, so it is now a set — O(1) lookup instead of the list's O(n).
    """
    tocrawl = set([seed])
    crawled = set()  # set, not list: O(1) "already visited?" checks
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            # One graph edge per outgoing link on this page.
            for outlink in outlinks:
                wcorpus.add_link(url, outlink)
            # Index every whitespace-separated token.
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.add(url)
    return wcorpus
def crawlWeb(seed):
    """Crawl from a single seed page and return the populated WebCorpus."""
    toCrawl = set([seed])  # frontier, seeded with the starting page
    crawled = []  # record of visited URLs to prevent repeat fetches
    wcorpus = WebCorpus()
    while toCrawl:
        url = toCrawl.pop()
        if url in crawled:
            continue  # already processed this page
        content = get_page(url)  # the page's full HTML text
        outlinks = getAllLinks(content)
        for outlink in outlinks:
            wcorpus.add_link(url, outlink)  # edge for the link graph
        for word in content.split():
            wcorpus.add_word_occurrence(url, word)  # index each token
        toCrawl.update(outlinks)  # grow the frontier with new outlinks
        crawled.append(url)  # mark this page as crawled
    return wcorpus
def test_engine(): print "Testing..." content = """This is a sample <a href="http://www.example.com">webpage</a> with <a href="http://www.go.to">two links</a> that lead nowhere special. """ outlinks = ["http://www.example.com", "http://www.go.to"] corpus = WebCorpus() assert corpus.lookup("anything") == None for link in outlinks: corpus.add_link("http://www.test.info", link) assert corpus._graph["http://www.test.info"] == outlinks corpus.add_word_occurrence("http://www.test.info", "sample") assert corpus._index["sample"] == ["http://www.test.info"] print "Finished tests."
def crawl_web(seed):
    """Crawl from seed; returns the WebCorpus (word index plus link graph)."""
    pending = set([seed])
    seen = []  # URLs already indexed; guards against revisiting
    corpus = WebCorpus()
    while pending:
        current = pending.pop()
        if current in seen:
            continue
        page_text = get_page(current)
        # Delegate word indexing to the shared helper.
        add_page_to_index(corpus, current, page_text)
        links = get_all_links(page_text)
        for link in links:
            corpus.add_link(current, link)
        pending.update(links)
        seen.append(current)
    return corpus
def crawl_web(seed):
    """Worklist crawl starting at seed.

    Returns the WebCorpus, which carries both the word index and the
    outlink graph.
    """
    corpus = WebCorpus()
    crawled = []  # every URL we have already fetched
    tocrawl = set([seed])
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            html = get_page(url)
            # Index the page's words via the shared helper, then record
            # one graph edge per outgoing link.
            add_page_to_index(corpus, url, html)
            neighbors = get_all_links(html)
            for neighbor in neighbors:
                corpus.add_link(url, neighbor)
            tocrawl.update(neighbors)
            crawled.append(url)
    return corpus