def get_links_from_page(number_of_pages):
    """Collect the "next page" URLs of a fixed IMDb title search.

    Starts at the first result page of the search encoded in the start URL
    (feature titles, years 1990-2012, sorted by number of votes descending)
    and follows the "next" link ``number_of_pages`` times. Every page is
    downloaded uncached, and the growing list of links is printed after each
    page.

    Args:
        number_of_pages: how many result pages to walk through.

    Returns:
        A list of the distinct absolute URLs that were found, in the order
        they were discovered. The list is empty when ``number_of_pages`` is
        0, and shorter than requested when a page does not expose the
        expected "next" link.
    """
    site_root = 'http://www.imdb.com/'
    next_url = 'http://www.imdb.com/search/title?sort=num_votes,desc&start=1&title_type=feature&year=1990,2012'

    pages = []
    for page_index in range(number_of_pages):
        # The URL object is built here rather than before the loop, so asking
        # for zero pages does no work at all.
        dom = web.DOM(web.URL(next_url).download(cached=False))

        # To see which part of the DOM to use, right click in Chrome and use
        # 'Inspect Element'.
        # The first page only has a "next" button; every later page has both
        # "previous" and "next", so there "next" is the second anchor.
        next_position = 0 if page_index == 0 else 1
        try:
            anchor = dom('span.pagination')[1].by_tag('a')[next_position]
        except IndexError:
            # No pagination block / no "next" anchor (presumably the last
            # page or a changed layout): keep what was gathered so far.
            break

        target = anchor.attributes.get('href')
        if not target:
            # An anchor without an href cannot be followed.
            break

        href = site_root + target
        pages.append(href)
        print(pages)
        next_url = href

    # Drop repeated links while keeping discovery order (a plain set() would
    # hand the URLs back in arbitrary order).
    unique_pages = []
    seen = set()
    for href in pages:
        if href not in seen:
            seen.add(href)
            unique_pages.append(href)
    return unique_pages
def test_element(self):
    # Element properties, checked on the <body> element.
    body = web.DOM(self.html).body
    self.assertEqual(body.tag, "body")
    self.assertEqual(body.attributes["id"], "front")
    self.assertEqual(body.attributes["class"], "comments")
    self.assertTrue(body.content.startswith("\n<script"))

    # Element.by_tag(): the three navigation links, in document order.
    nav_links = body.by_tag("a")
    self.assertEqual(len(nav_links), 3)
    for position, label in enumerate(("nav1", "nav2", "nav3")):
        self.assertEqual(nav_links[position].content, label)

    # Element.by_class(): <p class="comment"> and the spans inside it.
    first_comment = body.by_class("comment")[0]
    self.assertEqual(first_comment.tag, "p")
    for position, css_class in enumerate(("date", "author")):
        span = first_comment.by_tag("span")[position]
        self.assertEqual(span.attributes["class"], css_class)
    # The same paragraph must also be reachable through class selectors.
    for selector in (".comment", "p.comment", "*.comment"):
        self.assertEqual(body.by_tag(selector)[0], first_comment)

    # Element.by_id(): <div id="content">, the parent of the first comment.
    content_div = body.by_id("content")
    self.assertEqual(content_div.tag, "div")
    self.assertEqual(content_div, first_comment.parent)
    for selector in ("#content", "div#content", "*#content"):
        self.assertEqual(body.by_tag(selector)[0], content_div)

    # Element.by_attribute(): lookup on <a href="">.
    by_href = body.by_attribute(href="nav1.html")
    self.assertEqual(by_href[0].content, "nav1")

    # Report which APIs were covered.
    for covered in (
        "pattern.web.Node.Element",
        "pattern.web.Node.Element.by_tag()",
        "pattern.web.Node.Element.by_class()",
        "pattern.web.Node.Element.by_id()",
        "pattern.web.Node.Element.by_attribute()",
    ):
        print(covered)
def _movie_record(movie):
    """Build one result tuple from a ``td.title`` element, echoing each field.

    Raises IndexError when an expected element is absent from the cell.
    """
    title = movie.by_tag('a')[0].content
    print(title)
    genres = [g.content for g in movie.by_tag('span.genre')[0].by_tag('a')]
    print(genres)
    # Anchors of the first credit span are read in order as
    # director, first actor, second actor.
    credit_links = movie.by_tag('span.credit')[0].by_tag('a')
    director = credit_links[0].content
    print(director)
    first_actor = credit_links[1].content
    print(first_actor)
    second_actor = credit_links[2].content
    print(second_actor)
    runtime = movie.by_tag('span.runtime')[0].content
    print(runtime)
    rating = movie.by_tag('span.value')[0].content
    print(rating)
    return (title, genres, director, first_actor, second_actor, runtime, rating)


def get_data_from_pages(links):
    """Scrape movie records from every result page in ``links``.

    Each page is downloaded uncached and every ``td.title`` cell is turned
    into a tuple. Progress and every extracted field are printed as they are
    read (the "matrix"-like effect of the original script).

    A page that cannot be downloaded or parsed is reported and skipped. A
    movie with missing or unprintable data is reported and skipped on its
    own, so the remaining movies on that page are still collected.
    Interrupting with Ctrl+C stops the scrape and returns what was gathered
    up to that point.

    Args:
        links: iterable of page URLs (as produced by the link collector).

    Returns:
        A list of ``(title, genres, director, first_actor, second_actor,
        runtime, rating)`` tuples, where ``genres`` is a list.
    """
    data = []
    try:
        for urltext in links:
            url = web.URL(urltext)
            # Two spaces on purpose: this reproduces the output of the
            # original ``print "Getting data from: ", url`` statement.
            print("Getting data from:  %s" % (url,))

            try:
                # Learn more about the DOM at
                # http://code.tutsplus.com/tutorials/javascript-and-the-dom-series-lesson-1--net-3134
                dom = web.DOM(url.download(cached=False))
                movies = dom.by_tag('td.title')
            except Exception as error:
                # Best effort: one unreachable page must not end the scrape.
                print("Skipping page %s: %s" % (urltext, error))
                continue

            for movie in movies:
                try:
                    record = _movie_record(movie)
                except Exception as error:
                    # Missing data affects this movie only, not the rest of
                    # the page.
                    print("Skipping movie with incomplete data: %r" % (error,))
                    continue
                data.append(record)
    except KeyboardInterrupt:
        # Allow Ctrl+C without losing the data collected so far.
        pass
    return data
def test_selector(self):
    # CSS selectors on an element that carries more than one class.
    body = web.DOM(self.html).body

    # TODO uncomment these!
    # matches = body("p.class1")
    # self.assertEqual(len(matches), 1)
    # self.assertTrue("class1" in matches[0].attributes["class"])
    # matches = body("p.class2")
    # self.assertEqual(len(matches), 1)
    # self.assertTrue("class2" in matches[0].attributes["class"])

    # Both classes chained in one selector must hit exactly one paragraph.
    matches = body("p.class1.class2")
    self.assertEqual(len(matches), 1)
    for css_class in ("class1", "class2"):
        self.assertTrue(css_class in matches[0].attributes["class"])
    element = matches[0]

    # This was previously incorrect
    self.assertEqual([], body("p[class='class1 class2']"))

    # Prefix, suffix and substring attribute selectors find the same element.
    for selector in ("p[class^='class1']", "p[class$='class2']", "p[class*='class']"):
        self.assertEqual(element, body(selector)[0])
    # It is the second paragraph matched by :contains('blah').
    self.assertEqual(element, body("p:contains('blah')")[1])
    # A standalone Selector with the exact class string matches it as well.
    exact = web.Selector("p[class='class1 class2']")
    self.assertTrue(exact.match(element))

    print("pattern.web.Selector()")
def test_node_traverse(self):
    # Node.traverse() must reach child nodes recursively: the flag below is
    # only raised when the visitor is handed a <span> element.
    self.b = False

    def flag_span(node):
        is_span_element = node.type == web.ELEMENT and node.tag == "span"
        if is_span_element:
            self.b = True

    dom = web.DOM(self.html)
    dom.traverse(flag_span)
    self.assertEqual(self.b, True)

    print("pattern.web.Node.traverse()")
def test_selector(self):
    # Class selectors: each single class, then both classes chained without
    # a tag name, must match exactly one element carrying those classes.
    body = web.DOM(self.html).body
    cases = (
        ("p.class1", ("class1",)),
        ("p.class2", ("class2",)),
        (".class1.class2", ("class1", "class2")),
    )
    for selector, required_classes in cases:
        found = body(selector)
        self.assertEqual(len(found), 1)
        for css_class in required_classes:
            self.assertTrue(css_class in found[0].attributes["class"])

    print("pattern.web.Node.Element()")
def test_selector(self):
    # DOM CSS selectors with multiple classes.
    body = web.DOM(self.html).body

    def expect_single_match(selector, *required_classes):
        # The selector must hit exactly one element whose class attribute
        # contains every required class name.
        found = body(selector)
        self.assertEqual(len(found), 1)
        for css_class in required_classes:
            self.assertTrue(css_class in found[0].attributes["class"])

    expect_single_match("p.class1", "class1")
    expect_single_match("p.class2", "class2")
    expect_single_match("p.class1.class2", "class1", "class2")

    print("pattern.web.Node.Element()")
def test_selector(self):
    # DOM CSS selectors with multiple classes.
    body = web.DOM(self.html).body

    # Each class on its own selects exactly one paragraph.
    for selector, css_class in (("p.class1", "class1"), ("p.class2", "class2")):
        single = body(selector)
        self.assertEqual(len(single), 1)
        self.assertTrue(css_class in single[0].attributes["class"])

    # Both classes chained select that one paragraph too.
    both = body("p.class1.class2")
    self.assertEqual(len(both), 1)
    for css_class in ("class1", "class2"):
        self.assertTrue(css_class in both[0].attributes["class"])
    element = both[0]

    # Exact, prefix, suffix and substring attribute selectors all put the
    # same element first.
    attribute_selectors = (
        "p[class='class1 class2']",
        "p[class^='class1']",
        "p[class$='class2']",
        "p[class*='class']",
    )
    for selector in attribute_selectors:
        self.assertEqual(element, body(selector)[0])
    # It is the second paragraph matched by :contains('blah').
    self.assertEqual(element, body("p:contains('blah')")[1])
    # A standalone Selector with the exact class string matches it as well.
    exact = web.Selector("p[class='class1 class2']")
    self.assertTrue(exact.match(element))

    print("pattern.web.Selector()")