Example #1
def test():
	# Fetch one page, extract its visible text, and report relevance.
	#html = urllib2.urlopen('http://www.cs.sfu.ca/~ggbaker/')
	html = urllib2.urlopen('http://www.cs.sfu.ca/~woshun/')
	soup = BeautifulSoup(html)
	# Grab every text node, then keep only the ones a browser would render.
	text = soup.findAll(text=True)
	page = filter(visible, text)
	# Normalise: trim surrounding spaces and lowercase each token.
	page = [token.strip(' ').lower() for token in page]
	print page
	print rel.check(page)
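None of these examples define `visible` or `rel`, so both are assumed to exist at module level. A minimal sketch of what they might look like, assuming Python 2 with BeautifulSoup 4 (the keyword list, threshold, and class name are my assumptions, not from the original source):

import urllib2
from bs4 import BeautifulSoup
from bs4.element import Comment

def visible(element):
	# Keep a text node only if a browser would render it: drop anything
	# inside <style>/<script>/<head>/<title>, and drop HTML comments.
	if element.parent.name in ('style', 'script', 'head', 'title', '[document]'):
		return False
	if isinstance(element, Comment):
		return False
	return True

class RelevanceChecker(object):
	# Hypothetical scorer: a token list counts as relevant once it
	# contains enough keyword hits. Keywords and threshold are placeholders.
	def __init__(self, keywords, threshold=3):
		self.keywords = set(keywords)
		self.threshold = threshold

	def check(self, tokens):
		hits = sum(1 for token in tokens if token in self.keywords)
		return hits >= self.threshold

rel = RelevanceChecker(['research', 'publications', 'faculty'])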
Example #2
def test():
	# Same pipeline as Example #1, against the faculty listing page.
	html = urllib2.urlopen('http://www.cs.sfu.ca/people/faculty.html')
	soup = BeautifulSoup(html)
	text = soup.findAll(text=True)
	page = filter(visible, text)
	page = [token.strip(' ').lower() for token in page]
	print rel.check(page)

	# Build a document object and echo its HTML back.
	a = doc('111','hello','<had')
	print a.getHTML()
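Example #2 also calls a `doc` class that is not shown. A plausible minimal sketch, with field names docID/title/html guessed from the call site `doc('111','hello','<had')`; the real class may differ:

class doc(object):
	def __init__(self, docID, title, html):
		# Hypothetical fields inferred from the three positional
		# arguments at the call site.
		self.docID = docID
		self.title = title
		self.html = html

	def getHTML(self):
		# Return the raw HTML snippet stored at construction time.
		return self.html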
Example #3
def extractInternalLinks(seedURL, parentSoup):
	links=[]

	for link in parentSoup.find_all('a'):
		link = link.get('href')
		try:
			# Resolve relative URLs against the seed page.
			link = urljoin(seedURL, link)
			# Skip links that leave cs.sfu.ca, point at page anchors,
			# or lead into the calendar.
			if not('cs.sfu.ca' in link.lower()) or ('#' in link) or ('calendar' in link.lower()):
				continue
			try:
				html = urllib2.urlopen(link) # pull the HTML
				parsedPage = BeautifulSoup(html)
			except:
				print 'cannot open page ' + link
				continue

			# Extract the visible text and tokenise it.
			text = parsedPage.findAll(text=True)
			page = filter(visible, text)
			page = [token.strip(' ').lower() for token in page]
			page = [token.split(' ') for token in page]
			page = list(itertools.chain.from_iterable(page))
			page = [re.sub(r'[,:;-]', '', token) for token in page]
			relevance = rel.check(page)
			#print "Page with URL {0} is {1}.".format(link, "relevant" if relevance else "not relevant")
			if relevance:
				links.append(link)
		except:
			# href was missing, so urljoin was handed None.
			print 'empty link'
	return links
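extractInternalLinks returns only the relevant outgoing links, which makes it natural to drive a breadth-first crawl with it. A sketch of such a driver (the function name crawl and the maxPages cap are assumptions, not part of the original code):

def crawl(seedURL, maxPages=50):
	# Breadth-first crawl: visit each relevant internal link once,
	# stopping after maxPages distinct URLs have been seen.
	frontier = [seedURL]
	seen = set(frontier)
	while frontier and len(seen) < maxPages:
		url = frontier.pop(0)
		try:
			parentSoup = BeautifulSoup(urllib2.urlopen(url))
		except:
			continue
		for link in extractInternalLinks(url, parentSoup):
			if link not in seen:
				seen.add(link)
				frontier.append(link)
	return seen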
Example #4
def extractInternalLinks(seedURL, parentSoup):
	links=[]

	for link in parentSoup.find_all('a'):
		link = link.get('href')
		print link
		try:
			# Resolve relative URLs against the seed page.
			link = urljoin(seedURL, link)
			# Follow internal links only.
			if not('cs.sfu.ca' in link.lower()):
				continue
			try:
				html = urllib2.urlopen(link) # pull the HTML
				parsedPage = BeautifulSoup(html)
			except:
				print 'cannot open page ' + link
				continue

			# Extract the visible text and score it.
			text = parsedPage.findAll(text=True)
			page = filter(visible, text)
			page = [token.strip(' ').lower() for token in page]

			relevance = rel.check(page)
			print "Page with URL {0} is {1}.".format(link, "relevant" if relevance else "not relevant")
			if relevance:
				links.append(link)
		except:
			# href was missing, so urljoin was handed None.
			print 'empty link'

		#	if link.startswith('/'): #deals with internal links eg: /people/faculty.html
		#		link = 'http://www.cs.sfu.ca'+link.get('href')
		#		links.append(link)
		#	elif 'cs.sfu.ca' in link:
		#		links.append(link)
	return links
Example #5
def relevtest():
	# Relevance check on a single known page.
	url = 'http://www.cs.sfu.ca/~kabanets/pubs.html'
	html = urllib2.urlopen(url)
	soup = BeautifulSoup(html)

	# Extract the visible text and tokenise it (same pipeline as Example #3).
	text = soup.findAll(text=True)
	page = filter(visible, text)

	page = [token.strip(' ').lower() for token in page]
	page = [token.split(' ') for token in page]
	page = list(itertools.chain.from_iterable(page))
	page = [re.sub(r'[,:;-]', '', token) for token in page]
	relevance = rel.check(page)
	print relevance
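The strip/lower, split, flatten, and punctuation-removal steps appear verbatim in Examples #3 and #5, so they could be factored into one helper. A sketch (the name tokenize is mine):

import itertools
import re

def tokenize(textNodes):
	# Lowercase and trim each visible text node, split on spaces,
	# flatten the nested lists, and strip , : ; - characters.
	tokens = [node.strip(' ').lower() for node in textNodes]
	tokens = [t.split(' ') for t in tokens]
	tokens = list(itertools.chain.from_iterable(tokens))
	return [re.sub(r'[,:;-]', '', t) for t in tokens]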