def test():
    """Fetch one hard-coded page and print its tokens and the rel.check result.

    Debug helper: takes no arguments, returns nothing, and performs a live
    HTTP request.
    """
    # Alternative sample page: 'http://www.cs.sfu.ca/~ggbaker/'
    response = urllib2.urlopen('http://www.cs.sfu.ca/~woshun/')
    fragments = BeautifulSoup(response).findAll(text=True)
    # Keep only the text nodes accepted by visible(), trimmed of surrounding
    # spaces and lower-cased.
    page = [
        fragment.strip(' ').lower()
        for fragment in fragments
        if visible(fragment)
    ]
    print(page)
    print(rel.check(page))
def test():
    """Print the rel.check result for the faculty page, then exercise doc.

    Debug helper: takes no arguments, returns nothing, and performs a live
    HTTP request.
    """
    response = urllib2.urlopen('http://www.cs.sfu.ca/people/faculty.html')
    fragments = BeautifulSoup(response).findAll(text=True)
    # Keep only the text nodes accepted by visible(), trimmed of surrounding
    # spaces and lower-cased.
    page = [
        fragment.strip(' ').lower()
        for fragment in fragments
        if visible(fragment)
    ]
    print(rel.check(page))

    # Smoke check of the doc wrapper with throwaway values.
    sample_doc = doc('111', 'hello', '<had')
    print(sample_doc.getHTML())
def extractInternalLinks(seedURL, parentSoup):
    """Collect the relevant in-domain pages linked from an already-parsed page.

    Every item returned by ``parentSoup.find_all('a')`` is resolved against
    ``seedURL``.  A link is followed only if its absolute URL contains
    ``cs.sfu.ca`` and contains neither ``#`` nor ``calendar``.  Each followed
    page is downloaded, split into tokens and handed to ``rel.check``; the
    URLs for which that call returns a truthy value are collected.

    Args:
        seedURL: URL of the page ``parentSoup`` was parsed from; used as the
            base when resolving relative hrefs.
        parentSoup: parsed page exposing ``find_all('a')`` whose items expose
            ``get('href')``.

    Returns:
        List of absolute URLs in the order the anchors were visited.
        Duplicates are not removed.
    """
    links = []
    for anchor in parentSoup.find_all('a'):
        href = anchor.get('href')
        if not href:
            # urljoin() returns the base unchanged for a missing or empty
            # href, which would make the seed page itself get crawled.
            print('empty link')
            continue
        try:
            link = urljoin(seedURL, href)
            lowered = link.lower()
            # NOTE(review): this is a substring test on the whole URL, not a
            # hostname check.
            if 'cs.sfu.ca' not in lowered or '#' in link or 'calendar' in lowered:
                continue
            try:
                response = urllib2.urlopen(link)
                try:
                    parsedPage = BeautifulSoup(response)
                finally:
                    # Release the connection even if parsing fails.
                    response.close()
            except Exception:
                print('cant open page' + link)
                continue
            text = parsedPage.findAll(text=True)
            page = [token.strip(' ').lower() for token in filter(visible, text)]
            # Split every text node on spaces and flatten into one word list.
            page = itertools.chain.from_iterable(
                token.split(' ') for token in page
            )
            # Drop commas, colons, semicolons and hyphens from each word.
            page = [re.sub(',*:*;*-*', '', token) for token in page]
            if rel.check(page):
                links.append(link)
        except Exception:
            # Best effort: one bad link must not abort the crawl.  Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit propagate.
            print('empty link')
    return links
def extractInternalLinks(seedURL, parentSoup):
    """Collect the relevant in-domain pages linked from an already-parsed page.

    Every item returned by ``parentSoup.find_all('a')`` has its raw href
    printed and is then resolved against ``seedURL``.  A link is followed
    only if its absolute URL contains ``cs.sfu.ca``.  Each followed page is
    downloaded, its text nodes are normalized and handed to ``rel.check``,
    the verdict is printed, and the URLs for which the call returns a truthy
    value are collected.

    Args:
        seedURL: URL of the page ``parentSoup`` was parsed from; used as the
            base when resolving relative hrefs.
        parentSoup: parsed page exposing ``find_all('a')`` whose items expose
            ``get('href')``.

    Returns:
        List of absolute URLs in the order the anchors were visited.
        Duplicates are not removed.
    """
    links = []
    for anchor in parentSoup.find_all('a'):
        href = anchor.get('href')
        print(href)
        if not href:
            # urljoin() returns the base unchanged for a missing or empty
            # href, which would make the seed page itself get crawled.
            print('empty link')
            continue
        try:
            # urljoin() already turns site-relative hrefs such as
            # /people/faculty.html into absolute URLs.
            link = urljoin(seedURL, href)
            # NOTE(review): this is a substring test on the whole URL, not a
            # hostname check.
            if 'cs.sfu.ca' not in link.lower():
                continue
            try:
                response = urllib2.urlopen(link)
                try:
                    parsedPage = BeautifulSoup(response)
                finally:
                    # Release the connection even if parsing fails.
                    response.close()
            except Exception:
                print('cant open page' + link)
                continue
            text = parsedPage.findAll(text=True)
            page = [token.strip(' ').lower() for token in filter(visible, text)]
            relevance = rel.check(page)
            print("Page with URL {0} is {1}.".format(
                link, "relevant" if relevance else "not relevant"))
            if relevance:
                links.append(link)
        except Exception:
            # Best effort: one bad link must not abort the crawl.  Unlike a
            # bare except, this lets KeyboardInterrupt/SystemExit propagate.
            print('empty link')
    return links
def relevtest():
    """Print the rel.check result for one hard-coded publications page.

    Debug helper: takes no arguments, returns nothing, and performs a live
    HTTP request.
    """
    url = 'http://www.cs.sfu.ca/~kabanets/pubs.html'
    soup = BeautifulSoup(urllib2.urlopen(url))

    words = []
    for fragment in soup.findAll(text=True):
        if not visible(fragment):
            continue
        # Normalize the text node, then break it into space-separated words.
        for word in fragment.strip(' ').lower().split(' '):
            # Drop commas, colons, semicolons and hyphens from each word.
            words.append(re.sub(',*:*;*-*', '', word))

    print(rel.check(words))