#self.hrefs[self.url] = self.soup.find_all('a') self.hrefs[self.url] = self.soup.find_all(href=re.compile("showforum")) def setLinks(self): templinks = [] for link in self.hrefs[self.url]: templinks.append(link.get('href')) #templinks.append(link) self.links[self.url] = templinks def getLinks(self): return self.links[self.url] if __name__ == "__main__": #print sys.path #global dbs dbs,cursor = connectDB() myhost = URL() myhost.setUrls("http://seahawkshuddle.com/forum/") conn = connection(myhost.getUrls()) conn.createConnection() #conn.gethtmlPage() myparent = ParentWebPage(conn) myparent.setContent() #print "content = ", myparent.getContent() tree= lxml.html.parse('http://seahawkshuddle.com/forum/') #print tree elements = tree.xpath('//title//text()') print elements myparent.sethrefs() myparent.setLinks() #print myparent.getLinks() #print type(myparent.gethrefs())
# NOTE(review): setLinks/getLinks are invoked as methods on a ParentWebPage
# instance in the script below; the class header is outside this fragment, so
# confirm the enclosing class and re-indent when splicing this back in.
def setLinks(self):
    """Collect the ``href`` of every element in ``self.hrefs[self.url]``.

    The resulting list is stored under ``self.links[self.url]``. Whatever
    ``element.get('href')`` returns is stored as-is, in iteration order.
    """
    self.links[self.url] = [anchor.get('href') for anchor in self.hrefs[self.url]]


def getLinks(self):
    """Return the list stored in ``self.links`` for ``self.url``."""
    return self.links[self.url]


if __name__ == "__main__":
    # The same forum address is used for the connection and for the lxml parse.
    forum_url = "http://seahawkshuddle.com/forum/"

    dbs, cursor = connectDB()

    myhost = URL()
    myhost.setUrls(forum_url)

    conn = connection(myhost.getUrls())
    conn.createConnection()

    myparent = ParentWebPage(conn)
    myparent.setContent()

    # lxml fetches and parses the page itself, independently of ``conn``.
    tree = lxml.html.parse(forum_url)
    elements = tree.xpath('//title//text()')
    # Parenthesised single argument: same output under Python 2's print statement.
    print(elements)

    myparent.sethrefs()
    myparent.setLinks()
# NOTE(review): the three functions below are called as methods on a
# `connection` instance in the script at the bottom; the class header is
# outside this fragment, so confirm the enclosing class and re-indent when
# splicing this back in.
def createConnection(self):
    """Fetch ``self.host`` and store the response body in ``self.htmlPage``.

    On any failure the exception is printed and the process terminates with
    exit status 1 (``SystemExit``). The previous version ended its error path
    with a bare ``exit`` expression, which only evaluates the name and never
    calls it, so execution silently continued without ``self.htmlPage`` having
    been assigned by this method.
    """
    import sys

    try:
        request = urllib2.Request(self.host)
        response = urllib2.urlopen(request)
        try:
            self.htmlPage = response.read()
        finally:
            # Release the underlying socket even if read() fails.
            response.close()
    except Exception as e:
        # Parenthesised single argument: parses under Python 2 and 3 alike.
        print(e)
        sys.exit(1)


def gethtmlPage(self):
    """Return the page body stored in ``self.htmlPage``."""
    return self.htmlPage


def getHost(self):
    """Return ``self.host``."""
    return self.host


if __name__ == "__main__":
    myhost = URL()
    myhost.setUrls("http://boards.baltimoreravens.com/")

    conn = connection(myhost.getUrls())
    conn.createConnection()

    # Parse the downloaded HTML with lxml's lenient HTML parser.
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(conn.gethtmlPage()), parser)

    elements = tree.xpath('//title//text()')
    print(elements)

    # href attributes of anchors inside <h4> within <td class="col_c_forum">.
    urls = tree.xpath('.//td[@class="col_c_forum"]//h4//a/@href')
    urlQ = list(urls)

    forum = forumPage()
openConxn = urllib2.Request(self.host) response = urllib2.urlopen(openConxn) self.htmlPage = response.read() except Exception as e: print e exit def gethtmlPage(self): return self.htmlPage def getHost(self): return self.host if __name__ == "__main__": myhost = URL() myhost.setUrls("http://boards.baltimoreravens.com/") conn = connection(myhost.getUrls()) conn.createConnection() parser = etree.HTMLParser() tree = etree.parse(StringIO(conn.gethtmlPage()), parser) elements = tree.xpath('//title//text()') print elements urls = tree.xpath('.//td[@class="col_c_forum"]//h4//a/@href') #print urls p = re.compile('#') urlQ = [] for url in urls: urlQ.append(url) forum = forumPage()