def isMultiPaged(url):
    """Return True if *url* (a format string with one integer slot, e.g.
    '...?page=%d') serves different content for page 1 and page 2.

    Compares the text of the content container
    (/html/body/div[1]/div[1]/div[1]/div[2]) across the two pages.
    Returns False when page 2 cannot be downloaded or when the two
    pages render identical content (single-page article repeated).
    """
    html_page1 = common.download(url % 1)
    if html_page1 is None:
        # First page unavailable: nothing to compare against.
        return False
    tree = etree.HTML(html_page1)
    nodes1 = tree.xpath("/html/body/div[1]/div[1]/div[1]/div[2]/*")
    # Guard against elements with no direct text (x.text is None) —
    # joining raw None values would raise TypeError.
    sig1 = ",".join(x.text or "" for x in nodes1)

    html_page2 = common.download(url % 2)
    if html_page2 is None:
        return False
    tree = etree.HTML(html_page2)
    nodes2 = tree.xpath("/html/body/div[1]/div[1]/div[1]/div[2]/*")
    sig2 = ",".join(x.text or "" for x in nodes2)

    return sig1 != sig2
def isMultiPaged(url):
    """Return True if *url* (a format string with one integer slot)
    serves different <body> content for page 1 and page 2.

    BeautifulSoup variant: strips the first <script> tag from each body
    (it typically differs per page even on single-page articles) before
    comparing the serialized bodies. Returns False when page 2 cannot
    be downloaded or when the two bodies are identical.

    NOTE: this shadows an earlier lxml-based definition of the same name.
    """
    html_page1 = common.download(url % 1)
    if html_page1 is None:
        return False
    soup = BeautifulSoup(html_page1, 'html.parser')
    body1 = soup.find('body')
    # Original called body.script.decompose() unconditionally, which
    # raises AttributeError on pages without a <script> tag.
    if body1 is not None and body1.script is not None:
        body1.script.decompose()

    html_page2 = common.download(url % 2)
    if html_page2 is None:
        return False
    soup = BeautifulSoup(html_page2, "html.parser")
    body2 = soup.find('body')
    if body2 is not None and body2.script is not None:
        body2.script.decompose()

    return str(body1) != str(body2)
def crawled_page(crawled_url):
    """Download *crawled_url* and extract (title, content) text.

    Returns a pair of strings; sentinel values mark missing pieces:
    - ("Title_Is_None", crawled_url)   when no <h1 class="title"> exists
      (or the page could not be downloaded at all),
    - (title, "Content_Is_None")       when no <div class="show-content">
      exists.
    """
    html = common.download(crawled_url)
    if html is None:
        # Treat a failed download like a page with no title, keeping the
        # same sentinel contract callers already handle.
        return "Title_Is_None", crawled_url
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('h1', {'class': 'title'})
    if title is None:
        return "Title_Is_None", crawled_url
    content = soup.find('div', {'class': 'show-content'})
    if content is None:
        return title.text, "Content_Is_None"
    return title.text, content.text
def getNumberOfPages(url):
    """Return the number of pages available at *url* (a format string
    with one integer slot, e.g. '...?page=%d').

    Returns 1 for single-page articles (per isMultiPaged); otherwise
    probes successive page numbers until a download fails.

    Bug fixed: the original did `url = url % count` inside the loop,
    which consumed the format placeholder on the first iteration and
    made every later `url % count` raise TypeError. The formatted URL
    is now a loop-local value.
    """
    if not isMultiPaged(url):
        return 1
    page = 1
    # Probe pages until common.download reports a missing page (None).
    while common.download(url % page) is not None:
        page += 1
    # `page` is the first missing page, so page - 1 pages exist.
    return page - 1
def crawled_page(crawled_url):
    """Download *crawled_url* and extract (title, content) via lxml xpath.

    Returns a pair of strings with the same sentinel contract as the
    BeautifulSoup variant:
    - ("Title_Is_None", crawled_url) when the title node is missing
      (or the download failed),
    - (title, "Content_Is_None")     when the content container is empty.

    NOTE: this shadows the earlier BeautifulSoup-based definition.
    """
    html = common.download(crawled_url)
    if html is None:
        return "Title_Is_None", crawled_url
    tree = etree.HTML(html)
    title = tree.xpath("/html/body/div[1]/div[1]/div[1]/h1")
    if not title:
        return "Title_Is_None", crawled_url
    contents = tree.xpath("/html/body/div[1]/div[1]/div[1]/div[2]/*")
    if not contents:
        # Bug fixed: the original returned `title.text` here, but xpath()
        # returns a list — that raised AttributeError. Index first, as the
        # success path below already does.
        return title[0].text, "Content_Is_None"
    # Collect full subtree text (xpath('string()')) of each child that has
    # direct text; children whose .text is None are skipped, matching the
    # original filtering behavior.
    content = "".join(
        x.xpath('string()') for x in contents if x.text is not None
    )
    return title[0].text, content
def a_links(url_seed, attrs=None):
    """Download *url_seed* and return all <a> tags matching *attrs*.

    attrs: optional dict of attribute filters passed to
    BeautifulSoup.find_all (empty dict = every anchor). The mutable
    default `attrs={}` was replaced with the None sentinel idiom;
    callers passing a dict (or nothing) see identical behavior.
    """
    if attrs is None:
        attrs = {}
    html = common.download(url_seed)
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find_all('a', attrs)
def a_links(url_seed, attrs=None):
    """Download *url_seed* and return the <a> elements matching *attrs*
    (lxml variant; shadows the earlier BeautifulSoup definition).

    attrs: optional dict of required attribute values. Bug fixed: the
    original accepted *attrs* but ignored it, always returning every
    anchor. Anchors are now filtered so each must carry every requested
    attribute with the exact value, matching the bs4 variant's contract.
    The mutable default `attrs={}` was also replaced with None.
    """
    if attrs is None:
        attrs = {}
    html = common.download(url_seed)
    tree = etree.HTML(html)
    anchors = tree.xpath("//a")
    if not attrs:
        # No filter requested: identical to the original behavior.
        return anchors
    return [
        a for a in anchors
        if all(a.get(name) == value for name, value in attrs.items())
    ]