예제 #1
0
def crawl_web(seed):
    """Crawl every page reachable from ``seed`` and return the filled corpus.

    Starting with ``seed``, each not-yet-visited URL is fetched with
    ``get_page``, its links are extracted with ``get_all_links``, and the
    URL, its content and its outgoing links are handed to
    ``WebCorpus.add_page``.  Once the frontier is empty,
    ``corpus.finish_crawl()`` is called before the corpus is returned.

    URLs are taken from a set, so the visiting order is arbitrary.

    :param seed: URL the crawl starts from (must be hashable).
    :returns: the ``WebCorpus`` instance populated during the crawl.
    """
    tocrawl = set([seed])
    # A set rather than a list: the membership test below runs once per
    # popped URL, and a list would make every check a linear scan.
    crawled = set()
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url in crawled:
            continue
        content = get_page(url)
        outlinks = get_all_links(content)
        corpus.add_page(url, content, outlinks)
        # Already-visited links may re-enter the frontier; they are
        # discarded by the check at the top of the loop.
        tocrawl.update(outlinks)
        crawled.add(url)
    corpus.finish_crawl()
    return corpus
예제 #2
0
def crawl_web(seed):
    """Crawl every page reachable from ``seed`` and return the filled corpus.

    For each not-yet-visited URL the page is fetched with ``get_page``,
    passed to ``add_page_to_index`` together with ``wcorpus.index``, and
    its outgoing links (from ``get_all_links``) are stored under that URL
    in ``wcorpus.graph``.

    URLs are taken from a set, so the visiting order is arbitrary.

    :param seed: URL the crawl starts from (must be hashable).
    :returns: the ``WebCorpus`` instance whose ``index`` and ``graph``
        were populated during the crawl.
    """
    tocrawl = set([seed])
    # A set rather than a list: the membership test below runs once per
    # popped URL, and a list would make every check a linear scan.
    crawled = set()
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url in crawled:
            continue
        content = get_page(url)
        add_page_to_index(wcorpus.index, url, content)
        outlinks = get_all_links(content)
        wcorpus.graph[url] = outlinks
        # Already-visited links may re-enter the frontier; they are
        # discarded by the check at the top of the loop.
        tocrawl.update(outlinks)
        crawled.add(url)
    return wcorpus
예제 #3
0
def crawl_web(seed):
    """Crawl every page reachable from ``seed`` and return the filled corpus.

    For each not-yet-visited URL the page is fetched with ``get_page``.
    Every link returned by ``get_all_links`` is reported through
    ``wcorpus.add_link(url, outlink)`` and every whitespace-separated
    token of the page content through
    ``wcorpus.add_word_occurrence(url, word)``.

    URLs are taken from a set, so the visiting order is arbitrary.

    :param seed: URL the crawl starts from (must be hashable).
    :returns: the ``WebCorpus`` instance populated during the crawl.
    """
    tocrawl = set([seed])
    # A set rather than a list: the membership test below runs once per
    # popped URL, and a list would make every check a linear scan.
    crawled = set()
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url in crawled:
            continue
        content = get_page(url)
        outlinks = get_all_links(content)
        for outlink in outlinks:
            wcorpus.add_link(url, outlink)
        for word in content.split():
            wcorpus.add_word_occurrence(url, word)
        # Already-visited links may re-enter the frontier; they are
        # discarded by the check at the top of the loop.
        tocrawl.update(outlinks)
        crawled.add(url)
    return wcorpus
예제 #4
0
def test_engine():
    """Smoke-test the link and word bookkeeping of ``WebCorpus``.

    Checks that a fresh corpus returns ``None`` from ``lookup``, that links
    added with ``add_link`` are stored in insertion order under the source
    URL in ``_graph``, and that ``add_word_occurrence`` records the URL
    under the word in ``_index``.  Raises ``AssertionError`` on the first
    failed check.
    """
    # Parenthesised single-argument print emits the same text on
    # Python 2 and Python 3.
    print("Testing...")
    page = "http://www.test.info"
    outlinks = ["http://www.example.com", "http://www.go.to"]

    corpus = WebCorpus()
    assert corpus.lookup("anything") is None
    for link in outlinks:
        corpus.add_link(page, link)
    # NOTE(review): these checks reach into private attributes of
    # WebCorpus; confirm there is no public accessor to use instead.
    assert corpus._graph[page] == outlinks
    corpus.add_word_occurrence(page, "sample")
    assert corpus._index["sample"] == [page]

    print("Finished tests.")
예제 #5
0
파일: crawler.py 프로젝트: lipghee/Udacity
def crawl_web(seed):
    """Crawl every page reachable from ``seed`` and return the filled corpus.

    For each not-yet-visited URL the page is fetched with ``get_page`` and
    passed to ``add_page_to_index`` together with the corpus; every link
    returned by ``get_all_links`` is then reported through
    ``corpus.add_link(url, outlink)``.

    URLs are taken from a set, so the visiting order is arbitrary.

    :param seed: URL the crawl starts from (must be hashable).
    :returns: the ``WebCorpus`` instance populated during the crawl.
    """
    tocrawl = set([seed])
    # A set rather than a list: the membership test below runs once per
    # popped URL, and a list would make every check a linear scan.
    crawled = set()
    corpus = WebCorpus()

    while tocrawl:
        url = tocrawl.pop()
        if url in crawled:
            continue

        content = get_page(url)
        add_page_to_index(corpus, url, content)

        outlinks = get_all_links(content)
        for outlink in outlinks:
            corpus.add_link(url, outlink)

        # Already-visited links may re-enter the frontier; they are
        # discarded by the check at the top of the loop.
        tocrawl.update(outlinks)
        crawled.add(url)

    return corpus
예제 #6
0
###
### You should define the WebCorpus class in the file webcorpus.py
###
from webcorpus import WebCorpus

# Smoke test: a freshly constructed WebCorpus must expose dict-typed
# ``index`` and ``graph`` attributes.  Parenthesised single-argument print
# emits the same text on Python 2 and Python 3.
print("Testing webcorpus...")
wc1 = WebCorpus()
assert isinstance(wc1.index, dict)
assert isinstance(wc1.graph, dict)
print("Finished tests.")