def crawl_web(seed,max_depth,crawled): # returns index, graph of inlinks" tocrawl = set(seed) next_depth = [] depth = 0 wcorpus = WebCorpus() last_url = None while tocrawl : url = tocrawl.pop() if url not in crawled: try: content = real_get_page(url) if content: words = get_all_words(content) if 'joe' in words or 'Joe' in words or 'joseph' in words or 'Joseph' in words: print url outlinks = get_all_links(content) for outlink in outlinks: wcorpus.add_link(url,outlink) if depth >= max_depth: outlinks = set(outlinks) tocrawl.update(outlinks) tolist = list(tocrawl) write_tocrawl(tolist) write_crawled(crawled) return wcorpus soup = BeautifulSoup(content, "html.parser") try: title = soup.title.string except: title = '' try: desc = str(soup.findAll(attrs={"name":"description"})[0]['content'].encode('utf-8')) except: desc = '' for word in words: wcorpus.add_word_occurrence(url,str(word.encode("UTF-8")),title,desc) tocrawl.update(outlinks) tolist = list(tocrawl) d = {'tocrawl':tolist} with open('tocrawl_json.json', 'w') as outfile: json.dump(d, outfile) crawled.append(url) depth+=1 outlinks = get_all_links(content) crawled.append(url) tocrawl.update(outlinks) except Exception, e: print "ERROR "+str(e)
def test_engine(): print "Testing..." content = """This is a sample <a href="http://www.example.com">webpage</a> with <a href="http://www.go.to">two links</a> that lead nowhere special. """ outlinks = ["http://www.example.com", "http://www.go.to"] corpus = WebCorpus() assert corpus.lookup("anything") == None for link in outlinks: corpus.add_link("http://www.test.info", link) assert corpus._graph["http://www.test.info"] == outlinks corpus.add_word_occurrence("http://www.test.info", "sample") assert corpus._index["sample"] == ["http://www.test.info"] print "Finished tests."
def crawl_web(seed):
    """Crawl outward from a single seed URL.

    Returns a WebCorpus holding the word index and the outlink graph.
    """
    frontier = set([seed])
    visited = []  # pages already fetched, so no page is processed twice
    corpus = WebCorpus()
    while frontier:
        page_url = frontier.pop()
        if page_url in visited:
            continue
        html = get_page(page_url)
        links = get_all_links(html)
        # Record one graph edge per outgoing link.
        for target in links:
            corpus.add_link(page_url, target)
        # Index every whitespace-separated token on the page.
        for token in html.split():
            corpus.add_word_occurrence(page_url, token)
        frontier.update(links)
        visited.append(page_url)
    return corpus
def crawl_web(seed):
    """Crawl outward from a single seed URL.

    Returns:
        A WebCorpus holding the word index and the outlink graph.

    Improvement: `crawled` is purely local and used only for membership
    tests, so it is now a set — O(1) lookup instead of the list's O(n).
    """
    tocrawl = set([seed])
    crawled = set()  # set, not list: O(1) "already visited?" checks
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            # One graph edge per outgoing link on this page.
            for outlink in outlinks:
                wcorpus.add_link(url, outlink)
            # Index every whitespace-separated token.
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.add(url)
    return wcorpus
def crawlWeb(seed):
    """Crawl from a single seed page and return the populated WebCorpus."""
    toCrawl = set([seed])  # frontier, seeded with the starting page
    crawled = []  # record of visited URLs to prevent repeat fetches
    wcorpus = WebCorpus()
    while toCrawl:
        url = toCrawl.pop()
        if url in crawled:
            continue  # already processed this page
        content = get_page(url)  # the page's full HTML text
        outlinks = getAllLinks(content)
        for outlink in outlinks:
            wcorpus.add_link(url, outlink)  # edge for the link graph
        for word in content.split():
            wcorpus.add_word_occurrence(url, word)  # index each token
        toCrawl.update(outlinks)  # grow the frontier with new outlinks
        crawled.append(url)  # mark this page as crawled
    return wcorpus
def test_engine(): print "Testing..." content = """This is a sample <a href="http://www.example.com">webpage</a> with <a href="http://www.go.to">two links</a> that lead nowhere special. """ outlinks = ["http://www.example.com", "http://www.go.to"] corpus = WebCorpus() assert corpus.lookup("anything") == None for link in outlinks: corpus.add_link("http://www.test.info", link) assert corpus._graph["http://www.test.info"] == outlinks corpus.add_word_occurrence("http://www.test.info", "sample") assert corpus._index["sample"] == ["http://www.test.info"] print "Finished tests."
def crawl_web(seed):
    """Crawl from seed; returns the WebCorpus (word index plus link graph)."""
    pending = set([seed])
    seen = []  # URLs already indexed; guards against revisiting
    corpus = WebCorpus()
    while pending:
        current = pending.pop()
        if current in seen:
            continue
        page_text = get_page(current)
        # Delegate word indexing to the shared helper.
        add_page_to_index(corpus, current, page_text)
        links = get_all_links(page_text)
        for link in links:
            corpus.add_link(current, link)
        pending.update(links)
        seen.append(current)
    return corpus
def crawl_web(seed):
    """Worklist crawl starting at seed.

    Returns the WebCorpus, which carries both the word index and the
    outlink graph.
    """
    corpus = WebCorpus()
    crawled = []  # every URL we have already fetched
    tocrawl = set([seed])
    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            html = get_page(url)
            # Index the page's words via the shared helper, then record
            # one graph edge per outgoing link.
            add_page_to_index(corpus, url, html)
            neighbors = get_all_links(html)
            for neighbor in neighbors:
                corpus.add_link(url, neighbor)
            tocrawl.update(neighbors)
            crawled.append(url)
    return corpus