def test_scrape_all_links_from_all_pages_in_same_domain_for_given_start_url(
        self):
    link_scraper = mock()
    crawler_rules = mock()
    mock_writer = mock()
    when(link_scraper).scrape_links("http://samplepage.com").thenReturn([
        Link(url="/about", label="About", parent_url="http://samplepage.com")
    ])
    when(link_scraper).scrape_links(
        "http://samplepage.com/about").thenReturn([])
    when(crawler_rules).apply_rules([
        Link(url="/about", label="About", parent_url="http://samplepage.com")
    ]).thenReturn([
        Link(url="/about", label="About", parent_url="http://samplepage.com")
    ])
    when(crawler_rules).apply_rules([]).thenReturn([])

    spider = Spider(link_scraper, crawler_rules)
    expected_links = {
        "page_url": "http://samplepage.com",
        "child_links": [{
            "page_url": "http://samplepage.com/about",
            "child_links": []
        }]
    }

    links = spider.scrape("http://samplepage.com", mock_writer)

    self.assertEquals(expected_links, links)
def test_absolute_url_not_same_as_the_given_url(self):
    link = Link("http://www.another.com/microsite", "A link", parent_url="")

    has_same_parent = link.has_same_parent("http://www.apage.com/")

    self.assertFalse(has_same_parent)
def test_parse_multiple_links_on_different_levels_in_html_page(self):
    parsed_links = html_parser.parse(
        links_parser,
        """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
        <html xmlns="http://www.w3.org/1999/xhtml"><head><title></title></head>
        <body>
        <p>Some test<a href="http://somelink.com">SomeLink</a> and also
        <a target="_blank" href="http://anotherlink.com">AnotherLink</a></p>
        <div>
        <p>There is some text here with a link in another div
        <div><a href="http://adeeperlink.com">Deep</a></div>
        </p>
        </div>
        </body>
        </html>""")

    self.assertEquals(3, len(parsed_links))
    self.assertEquals(
        Link(url="http://somelink.com",
             label="SomeLink",
             parent_url=parent_url), parsed_links[0])
    self.assertEquals(
        Link(url="http://anotherlink.com",
             label="AnotherLink",
             parent_url=parent_url), parsed_links[1])
    self.assertEquals(
        Link(url="http://adeeperlink.com",
             label="Deep",
             parent_url=parent_url), parsed_links[2])
def test_links_with_same_url_different_labels_are_unequal(self):
    link = Link("http://thisurl.com", "A link", parent_url="")
    another_link = Link("http://thisurl.com",
                        "Something else",
                        parent_url="")

    self.assertNotEqual(another_link, link)
def test_url_is_not_same_as_given_url(self):
    link = Link("http://www.friendpage.com",
                "A link",
                parent_url="http://www.apage.com")

    is_same = link.is_same_as("http://www.somethingrandom.com")

    self.assertFalse(is_same)
def test_get_url_returns_include_parent_base_url_for_a_relative_url(self):
    link = Link("/something",
                "A link",
                parent_url="http://www.apage.com/someplace/whatever")

    url = link.get_url()

    self.assertEquals("http://www.apage.com/something", url)
def test_url_does_not_have_same_parent_path_as_the_given_url(self):
    link = Link("/microsite/about",
                "A link",
                parent_url="http://www.apage.com")

    has_same_parent = link.has_same_parent(
        "http://www.apage.com/anothermicrosite")

    self.assertFalse(has_same_parent)
def test_url_with_deeper_path_has_parent_path_as_the_given_url(self):
    link = Link("/microsite/blog/anotherplace/about/specific",
                "A link",
                parent_url="http://www.apage.com")

    has_same_parent = link.has_same_parent(
        "http://www.apage.com/microsite")

    self.assertTrue(has_same_parent)
def test_url_having_same_hostname_as_the_given_url_belongs_to_same_domain(
        self):
    link = Link("http://www.friendpage.com",
                "A link",
                parent_url="http://www.apage.com")

    is_same_domain = link.is_same_domain("http://www.friendpage.com")

    self.assertTrue(is_same_domain)
def test_url_is_same_as_given_url(self):
    link = Link("http://www.friendpage.com",
                "A link",
                parent_url="http://www.apage.com")

    is_same = link.is_same_as("http://www.friendpage.com")

    self.assertTrue(is_same)
def test_url_with_diffent_hostname_than_the_given_url_does_not_belong_to_same_domain(
        self):
    link = Link("http://www.friendpage.com",
                "A link",
                parent_url="http://www.apage.com")

    is_same_domain = link.is_same_domain("http://www.someotherpage.com")

    self.assertFalse(is_same_domain)
def test_url_has_parent_path_as_the_given_url(self):
    link = Link("/microsite/about",
                "A link",
                parent_url="http://www.apage.com")

    has_same_parent = link.has_same_parent(
        "http://www.apage.com/microsite")

    self.assertTrue(has_same_parent)
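# The Link tests above (get_url, is_same_as, is_same_domain, has_same_parent,
# and the equality tests) pin down a small URL-helper API. Below is a minimal
# sketch of a Link class inferred only from those assertions, not the
# project's actual implementation; the trailing-slash normalisation in
# is_same_as() and the prefix check in has_same_parent() are assumptions.
from urllib.parse import urljoin, urlparse


class Link(object):
    def __init__(self, url, label, parent_url):
        self.url = url
        self.label = label
        self.parent_url = parent_url

    def get_url(self):
        # Resolve a relative href against the parent page's URL.
        return urljoin(self.parent_url, self.url)

    def is_same_as(self, other_url):
        # Treat "http://google.com" and "http://google.com/" as the same page.
        return self.get_url().rstrip('/') == other_url.rstrip('/')

    def is_same_domain(self, other_url):
        return urlparse(self.get_url()).netloc == urlparse(other_url).netloc

    def has_same_parent(self, base_url):
        # True when the resolved URL lives under the given base URL.
        return self.get_url().startswith(base_url)

    def __eq__(self, other):
        return (self.url, self.label, self.parent_url) == \
               (other.url, other.label, other.parent_url)

    def __hash__(self):
        return hash((self.url, self.label, self.parent_url))

    def __repr__(self):
        return "Link(url=%r, label=%r, parent_url=%r)" % (
            self.url, self.label, self.parent_url)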
def test_return_empty_list_when_none_match(self):
    filter = SameHierarcyUrlFilter("http://www.somepage.com/microsite")
    non_matching_link1 = Link(url='/somethingelse/privacy',
                              label="Privacy",
                              parent_url="http://www.somepage.com")
    non_matching_link2 = Link(url='/blog/hello',
                              label="Blog",
                              parent_url="http://www.somepage.com")
    non_matching_link3 = Link(url='/intl/de/policies/terms/',
                              label="Terms",
                              parent_url="http://www.somepage.com")

    filtered_links = filter.filter_links(
        [non_matching_link1, non_matching_link2, non_matching_link3])

    self.assertEquals(0, len(filtered_links))
def test_links_with_same_urls_labels_parent_urls_are_equal(self):
    link = Link(url="http://thisurl.com",
                label="A link",
                parent_url="http://anotherlink.com")
    another_link = Link("http://thisurl.com",
                        "A link",
                        parent_url="http://anotherlink.com")

    self.assertEquals(another_link, link)
def test_filter_urls_that_match_parent_path_with_given_url(self):
    filter = SameHierarcyUrlFilter("http://www.somepage.com/microsite")
    matching_link1 = Link(url='/microsite/privacy',
                          label="Privacy",
                          parent_url="http://www.somepage.com")
    matching_link2 = Link(url='/microsite/blog/hello',
                          label="Blog",
                          parent_url="http://www.somepage.com")
    non_matching_link = Link(url='/intl/de/policies/terms/',
                             label="Terms",
                             parent_url="http://www.somepage.com")

    filtered_links = filter.filter_links(
        [matching_link1, matching_link2, non_matching_link])

    self.assertEquals(2, len(filtered_links))
    self.assertCountEqual([matching_link1, matching_link2], filtered_links)
def test_return_empty_when_no_urls_have_same_domain(self):
    filter = SameDomainUrlFilter("http://google.com")
    link1InDifferentDomain = Link(url='http://www.facebook.com',
                                  label="Terms",
                                  parent_url="http://google.com")
    link2InDifferentDomain = Link(url='http://www.youtube.com/?gl=DE&tab=w1',
                                  label="Video",
                                  parent_url="http://google.com")
    links = [link1InDifferentDomain, link2InDifferentDomain]

    filtered_links = filter.filter_links(links)

    self.assertEquals(0, len(filtered_links))
def test_returns_urls_that_satifies_all_filters(self):
    mock_filter1 = mock()
    mock_filter2 = mock()
    filters = [mock_filter1, mock_filter2]
    links = [
        Link(url="", label="", parent_url=""),
        Link(url="", label="", parent_url="")
    ]
    when(mock_filter1).filter_links(links).thenReturn(links)
    when(mock_filter2).filter_links(links).thenReturn(links)

    rule = CrawlerRules(filters)
    filtered_links = rule.apply_rules(links)

    self.assertEquals(2, len(filtered_links))
    self.assertCountEqual(links, filtered_links)
def test_return_empty_when_all_urls_are_duplicate(self):
    filter = DuplicateUrlFilter("http://google.com")
    linkRepeated1 = Link(url='http://google.com',
                         label="Terms",
                         parent_url="http://google.com")
    linkRepeated2 = Link(url='/',
                         label="Video",
                         parent_url="http://google.com")
    links = [linkRepeated1, linkRepeated2]

    filtered_links = filter.filter_links(links)

    self.assertEquals(0, len(filtered_links))
def test_filter_url_based_on_given_domain(self):
    filter = SameDomainUrlFilter("http://google.com")
    link1InSameDomain = Link(url='/intl/de/policies/privacy/',
                             label="Privacy",
                             parent_url="http://google.com")
    link2InSameDomain = Link(url='/intl/de/policies/terms/',
                             label="Terms",
                             parent_url="http://google.com")
    linkInDifferentDomain = Link(url='http://www.youtube.com/?gl=DE&tab=w1',
                                 label="Video",
                                 parent_url="http://google.com")
    links = [link1InSameDomain, link2InSameDomain, linkInDifferentDomain]

    filtered_links = filter.filter_links(links)

    self.assertEquals(2, len(filtered_links))
    self.assertCountEqual([link2InSameDomain, link1InSameDomain],
                          filtered_links)
def test_parse_multiple_links_in_html_page(self):
    parsed_links = html_parser.parse(
        links_parser,
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'
        '<html xmlns="http://www.w3.org/1999/xhtml"><head><title></title></head>'
        '<body><p>Some test<a href="http://somelink.com">SomeLink</a> and also '
        '<a href="http://anotherlink.com">AnotherLink</a></p></body></html>')

    self.assertEquals(2, len(parsed_links))
    self.assertEquals(
        Link(url="http://somelink.com",
             label="SomeLink",
             parent_url=parent_url), parsed_links[0])
    self.assertEquals(
        Link(url="http://anotherlink.com",
             label="AnotherLink",
             parent_url=parent_url), parsed_links[1])
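# The two html_parser tests (this one and the multi-level one above) describe
# a parse(tag_parser, html) helper that walks the markup and lets a tag parser
# open and commit link "transactions". Below is a minimal sketch built on the
# standard library's html.parser; the create_transaction / commit protocol it
# assumes matches the LinkTagParser test later in this section, while the
# add_label step is an assumption of mine.
from html.parser import HTMLParser


def parse(tag_parser, html_content):
    links = []

    class _Walker(HTMLParser):
        def handle_starttag(self, tag, attrs):
            tag_parser.create_transaction(tag, attrs)

        def handle_data(self, data):
            tag_parser.add_label(data)

        def handle_endtag(self, tag):
            link = tag_parser.commit(tag)
            if link is not None:
                links.append(link)

    _Walker().feed(html_content)
    return links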
def test_return_empty_when_no_urls_match_the_rules(self):
    mock_filter1 = mock()
    mock_filter2 = mock()
    filters = [mock_filter1, mock_filter2]
    unsatifying_link1 = Link(url="", label="", parent_url="")
    unsatifying_link2 = Link(url="", label="", parent_url="")
    links = [unsatifying_link1, unsatifying_link2]
    when(mock_filter1).filter_links(links).thenReturn([])
    when(mock_filter2).filter_links([]).thenReturn([])

    rule = CrawlerRules(filters)
    filtered_links = rule.apply_rules(links)

    self.assertEquals(0, len(filtered_links))
def test_remove_urls_same_as_given_url(self):
    filter = DuplicateUrlFilter("http://google.com")
    linkRepeated1 = Link(url='http://google.com',
                         label="Terms",
                         parent_url="http://google.com")
    linkRepeated2 = Link(url='/',
                         label="Video",
                         parent_url="http://google.com")
    anotherLink = Link(url='http://www.facebook.com',
                       label="Terms",
                       parent_url="http://google.com")
    links = [linkRepeated1, linkRepeated2, anotherLink]

    filtered_links = filter.filter_links(links)

    self.assertEquals(1, len(filtered_links))
    self.assertCountEqual([anotherLink], filtered_links)
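# The three filter test groups above all exercise the same one-method
# protocol: filter_links(links) -> list. Below is a minimal sketch of what
# each filter could look like, written against the Link helpers asserted
# elsewhere in this suite (is_same_domain, has_same_parent, is_same_as);
# the project's actual classes may be implemented differently.
class SameDomainUrlFilter(object):
    """Keeps only links whose host matches the start URL's host."""

    def __init__(self, base_url):
        self.base_url = base_url

    def filter_links(self, links):
        return [link for link in links if link.is_same_domain(self.base_url)]


class SameHierarcyUrlFilter(object):
    """Keeps only links that live under the given base path."""

    def __init__(self, base_url):
        self.base_url = base_url

    def filter_links(self, links):
        return [link for link in links if link.has_same_parent(self.base_url)]


class DuplicateUrlFilter(object):
    """Drops links that resolve to the page already being crawled."""

    def __init__(self, current_url):
        self.current_url = current_url

    def filter_links(self, links):
        return [link for link in links
                if not link.is_same_as(self.current_url)]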
def test_pass_filtered_urls_from_one_filter_to_the_next_one(self):
    mock_filter1 = mock()
    mock_filter2 = mock()
    filters = [mock_filter1, mock_filter2]
    link = Link(url="", label="", parent_url="")
    link2 = Link(url="", label="", parent_url="")
    link_unsatisfying_filter1 = Link(url="", label="", parent_url="")
    links = [link, link2, link_unsatisfying_filter1]
    when(mock_filter1).filter_links(links).thenReturn([link, link2])
    when(mock_filter2).filter_links([link, link2]).thenReturn([link])

    rule = CrawlerRules(filters)
    filtered_links = rule.apply_rules(links)

    self.assertEquals(1, len(filtered_links))
    self.assertCountEqual([link], filtered_links)
def test_remove_url_that_dont_satisfy_even_one_filter(self):
    mock_filter1 = mock()
    mock_filter2 = mock()
    filters = [mock_filter1, mock_filter2]
    link = Link(url="", label="", parent_url="")
    unsatifying_link = Link(url="", label="", parent_url="")
    links = [link, unsatifying_link]
    when(mock_filter1).filter_links(links).thenReturn(links)
    when(mock_filter2).filter_links(links).thenReturn([link])

    rule = CrawlerRules(filters)
    filtered_links = rule.apply_rules(links)

    self.assertEquals(1, len(filtered_links))
    self.assertCountEqual([link], filtered_links)
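# The CrawlerRules tests above only require that each filter's output is fed
# into the next filter in order. A minimal sketch consistent with those
# assertions (the production class may do more):
class CrawlerRules(object):
    def __init__(self, filters):
        self.filters = filters

    def apply_rules(self, links):
        # Pipe the link list through every filter in turn; any filter may
        # shrink the list before the next one sees it.
        for link_filter in self.filters:
            links = link_filter.filter_links(links)
        return links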
def test_create_link_with_label(self):
    lb = LinkBuilder("")

    lb.create_new("http://aurl.com").with_label("A Link")
    link = lb.build()

    self.assertEquals(
        Link(url="http://aurl.com", label="A Link", parent_url=""), link)
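# A minimal fluent-builder sketch that satisfies the test above: the
# constructor captures the parent URL, create_new() and with_label() return
# the builder so calls can be chained, and build() produces the Link.
# Inferred from this test only; the real builder may carry more state.
class LinkBuilder(object):
    def __init__(self, parent_url):
        self.parent_url = parent_url
        self.url = None
        self.label = None

    def create_new(self, url):
        self.url = url
        self.label = None
        return self

    def with_label(self, label):
        self.label = label
        return self

    def build(self):
        return Link(url=self.url, label=self.label, parent_url=self.parent_url)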
def test_scrape_ignores_links_that_fail_the_rules(self):
    link_scraper = mock()
    crawler_rules = mock()
    mock_writer = mock()
    same_domain_links1 = [
        Link(url="/about", label="About", parent_url="http://samplepage.com")
    ]
    when(link_scraper).scrape_links("http://samplepage.com").thenReturn(
        same_domain_links1)
    when(link_scraper).scrape_links(
        "http://samplepage.com/about").thenReturn([
            Link(url="http://anotherdoamin.com",
                 label="External",
                 parent_url="http://samplepage.com/about")
        ])
    when(link_scraper).scrape_links("http://anotherdoamin.com").thenReturn([
        Link(url="/anotherabout",
             label="External About",
             parent_url="http://anotherdoamin.com")
    ])
    when(link_scraper).scrape_links(
        "http://anotherdoamin.com/anotherabout").thenReturn([])
    when(crawler_rules).apply_rules(same_domain_links1).thenReturn(
        same_domain_links1)
    when(crawler_rules).apply_rules(neq(same_domain_links1)).thenReturn([])

    spider = Spider(link_scraper, crawler_rules)
    expected_links = {
        "page_url": "http://samplepage.com",
        "child_links": [{
            "page_url": "http://samplepage.com/about",
            "child_links": []
        }]
    }

    links = spider.scrape("http://samplepage.com", mock_writer)

    self.assertEquals(expected_links, links)
class LinkItemTestCase(unittest.TestCase):
    def setUp(self):
        self.item = Item('img', r'(?P<img><img [^>]+?>)')
        self.link = Link('movie.douban', 'http://movie.douban.com/',
                         [self.item])

    def test_register_funcs(self):
        def cf(one):
            print(one)
            return one

        self.link.register_funcs([cf])
        self.assertTrue(cf in self.item.clean_funcs)

    def test_fetch(self):
        results = self.link.fetch()
        douban_logo = ('<img style="top: -5px; position: relative;" '
                       'src="http://img3.douban.com/pics/site/icon_site_beta.gif"/>')
        self.assertIn('img', results)
        movie = results['img']
        self.assertIn(douban_logo, [m['img'] for m in movie])
def test_return_created_link_from_existing_transaction_on_commit(self):
    expected_link = Link(url="/about",
                         label="About",
                         parent_url="http://something.com")
    mock_link_builder = mock()
    when(mock_link_builder).create_new("/about").thenReturn(mock_link_builder)
    when(mock_link_builder).build().thenReturn(expected_link)

    tag_parser = LinkTagParser(mock_link_builder)
    tag_parser.create_transaction("a", [('href', '/about')])

    link = tag_parser.commit("a")

    self.assertEquals(expected_link, link)
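# The test above treats LinkTagParser as a small state machine: seeing an <a>
# start tag opens a "transaction" via the builder, and committing the matching
# end tag turns it into a Link. A minimal sketch consistent with that test;
# the add_label() hook for capturing anchor text is an assumption based on the
# html_parser tests earlier in this section.
class LinkTagParser(object):
    TAG = "a"

    def __init__(self, link_builder):
        self.link_builder = link_builder
        self.in_transaction = False

    def create_transaction(self, tag, attrs):
        # Start building a link when an <a href="..."> start tag is seen.
        if tag == self.TAG:
            href = dict(attrs).get("href")
            if href is not None:
                self.link_builder.create_new(href)
                self.in_transaction = True

    def add_label(self, data):
        # Text between <a> and </a> becomes the link's label (assumed).
        if self.in_transaction:
            self.link_builder.with_label(data)

    def commit(self, tag):
        # Close the transaction on </a> and return the finished Link.
        if tag == self.TAG and self.in_transaction:
            self.in_transaction = False
            return self.link_builder.build()
        return None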
def test_parse_all_links_using_links_tag_parser_in_page_for_the_given_url(
        self):
    page_url = "http://asamplepage.com"
    html_content = "An html page with links"
    page_links = [
        Link(url="/about",
             label="A link",
             parent_url="http://asamplepage.com"),
        Link("http://aurl.com", "A link", parent_url="http://asamplepage.com")
    ]
    mock_client = mock()
    mock_parser = mock()
    when(mock_client).get_html_page(page_url).thenReturn(html_content)
    when(mock_parser).parse(ANY(LinkTagParser),
                            eq(html_content)).thenReturn(page_links)

    scraper = LinkScraper(mock_client, mock_parser)
    links = scraper.scrape_links(page_url)

    self.assertEquals(2, len(links))
    self.assertCountEqual(page_links, links)
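# The LinkScraper test above fixes the collaboration: fetch the page through
# the HTTP client, then hand the HTML plus a fresh LinkTagParser to the
# parser. A minimal sketch of that wiring; constructing the LinkTagParser
# around a LinkBuilder seeded with the page URL is an assumption suggested by
# the ANY(LinkTagParser) matcher, not something the test verifies.
class LinkScraper(object):
    def __init__(self, http_client, html_parser):
        self.http_client = http_client
        self.html_parser = html_parser

    def scrape_links(self, page_url):
        html_content = self.http_client.get_html_page(page_url)
        tag_parser = LinkTagParser(LinkBuilder(page_url))
        return self.html_parser.parse(tag_parser, html_content)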
def test_scrape_do_not_scrape_same_url_again(self):
    link_scraper = mock()
    crawler_rules = mock()
    mock_writer = mock()
    same_domain_links1 = [
        Link(url="/about", label="About", parent_url="http://samplepage.com")
    ]
    same_domain_links_repeated = [
        Link(url="http://samplepage.com",
             label="Home",
             parent_url="http://samplepage.com/about")
    ]
    when(link_scraper).scrape_links("http://samplepage.com").thenReturn(
        same_domain_links1)
    when(link_scraper).scrape_links(
        "http://samplepage.com/about").thenReturn(same_domain_links_repeated)
    when(crawler_rules).apply_rules(same_domain_links1).thenReturn(
        same_domain_links1)
    when(crawler_rules).apply_rules(same_domain_links_repeated).thenReturn(
        same_domain_links_repeated)

    spider = Spider(link_scraper, crawler_rules)
    expected_links = {
        "page_url": "http://samplepage.com",
        "child_links": [{
            "page_url": "http://samplepage.com/about",
            "child_links": []
        }]
    }

    links = spider.scrape("http://samplepage.com", mock_writer)

    self.assertEquals(expected_links, links)
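# The three Spider tests in this section pin down the crawl contract: scrape
# the start page, keep only links that survive the crawler rules, recurse into
# each surviving link exactly once, and return the crawl as a nested dict of
# page_url / child_links. A minimal sketch consistent with those assertions;
# the writer.write(...) call is an assumption, since none of these tests
# verify the writer interaction.
class Spider(object):
    def __init__(self, link_scraper, crawler_rules):
        self.link_scraper = link_scraper
        self.crawler_rules = crawler_rules

    def scrape(self, start_url, writer):
        return self._scrape_page(start_url, writer, visited=set())

    def _scrape_page(self, page_url, writer, visited):
        visited.add(page_url)
        scraped_links = self.link_scraper.scrape_links(page_url)
        allowed_links = self.crawler_rules.apply_rules(scraped_links)

        page = {"page_url": page_url, "child_links": []}
        for link in allowed_links:
            child_url = link.get_url()
            if child_url in visited:
                continue  # never crawl the same page twice
            page["child_links"].append(
                self._scrape_page(child_url, writer, visited))

        writer.write(page)  # assumed persistence hook; not asserted by tests
        return page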
# Standard-library / third-party imports used below; Link and MyKafka are
# project-local modules.
import re
import urllib
from itertools import islice
from threading import Thread

import requests
from bs4 import BeautifulSoup as BS


class Parse(Link, MyKafka):
    config = {}

    def __init__(self, config={}):
        if 'url_list' not in config.keys():
            raise Exception('I will need a list of urls to work')
        self.link = Link()
        #self.kafka = MyKafka('NOTHS-crawler-topic')
        self.config['url_list'] = config['url_list']
        self.start()

    def start(self):
        # try:
        for url in self.config['url_list']:
            self.base_url = url
            bsojb = self.__getBSObj(url)
            if re.search(r'personalcreations', url):
                self.personalcreations(bsojb)
            elif re.search(r'tierneyphotography', url):
                self.tierneyphotography(bsojb)
            else:
                print('still need method for', url)
        # except:
        #     print('Unable to open ', url)

    def personalcreations(self, bsOjb):
        category_links = self.link.find_with_class(bsOjb, 'dropDownNavLI ', 'li')
        threads = []
        for category_link in category_links:
            self.link.addLink(category_link, 'category_link')
            # start threads
            t = Thread(target=self.__collect_product_links,
                       args=(category_link[1], 'name', 'div', category_link[0]))
            threads.append(t)
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        product_threads = []
        for item in self.__split(
                {i: self.link.linkList[i] for i in self.link.linkList}, 200):
            t = Thread(target=self.__collect_send_product_detail,
                       args=(item, {
                           'product': {'class': 'productTitle', 'tag': 'span'},
                           'price': {'class': 'mainPrice_MinSalePrice',
                                     'alter_class': 'mainPrice_SalePrice',
                                     'tag': 'span'},
                           'desc': {'class': 'ProductInfoText', 'tag': 'div'}
                       }))
            product_threads.append(t)
        for t in product_threads:
            t.start()

    def __getBSObj(self, url, retries=0):
        if retries >= 3:
            return
        try:
            rg = re.compile("^(http|www)")
            if rg.match(url):
                r = requests.get(url).text.encode('ascii', 'ignore')
            else:
                r = requests.get(self.base_url + url).text.encode('ascii', 'ignore')
            bsObj = BS(r, "html.parser")
            return bsObj
        except:
            print('getBSObj Err - retry:', url)
            retries += 1
            # return the retried result instead of silently dropping it
            return self.__getBSObj(url, retries)

    def __collect_product_links(self, link, class_, tag, product_category):
        print('starting thread ..', link)
        bsObj = self.__getBSObj(link)
        products_links = self.link.find_with_class(bsObj, class_, tag)
        for products_link in products_links:
            self.link.addLink(products_link, 'product_link', product_category)

    def __split(self, data, SIZE=10000):
        it = iter(data)
        for i in range(0, len(data), SIZE):
            yield {k: data[k] for k in islice(it, SIZE)}

    def __collect_send_product_detail(self, links, args):
        print('starting thread ..', links)
        for k, v in links.items():
            if v['group'] == 'product_link':  # compare values, not identity
                bsObj = self.__getBSObj(v['link'])
                dlink = v['link'].replace(',', '')
                dgroup = v['group']
                dcategory = v['category'] if 'category' in v else ''
                dprice = dproduct = ddesc = ''  # avoid NameError when nothing matches
                # get price
                for price in bsObj.find_all(args['price']['tag'],
                                            {'class': args['price']['class']},
                                            'visible'):
                    dprice = price.text.replace(',', '')
                if not dprice:
                    for price in bsObj.find_all(args['price']['tag'],
                                                {'class': args['price']['alter_class']},
                                                'visible'):
                        dprice = price.text.replace(',', '')
                # get title
                for product in bsObj.find_all(args['product']['tag'],
                                              {'class': args['product']['class']},
                                              'visible'):
                    dproduct = product.text.replace(',', '')
                # get description
                for desc in bsObj.find_all(args['desc']['tag'],
                                           {'class': args['desc']['class']},
                                           'visible'):
                    ddesc = desc.text.replace(',', '')
                data = ",".join([dgroup, dlink, dproduct, dprice, ddesc,
                                 dcategory.decode("utf-8")])
                self.kafka.send(data)
            elif v['group'] == 'category_link':  # compare values, not identity
                dlink = v['link'].replace(',', '')
                dgroup = v['group']
                dtitle = v['title']
                data = ",".join([dgroup, dlink, dtitle.decode("utf-8")])
                self.kafka.send(data)

    def tierneyphotography(self, bsOjb):
        photo_links = self.link.find_with_class(bsOjb, 'block-link', False)
        for link in photo_links:
            #print(link[1])
            if re.search(r'cache', link[1]):
                full_link = 'http://www.tierneyphotography.co.uk/' + link[1]
                filename = full_link.split('/')[-1]
                urllib.urlretrieve(full_link, filename)
            pass
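# Hypothetical usage of the Parse crawler above: the constructor insists on a
# 'url_list' entry and dispatches per site from start(). The URLs shown are
# the two sites the class already special-cases; this snippet is illustrative
# only and assumes Link and MyKafka are importable from the project.
if __name__ == '__main__':
    Parse({
        'url_list': [
            'http://www.personalcreations.com/',
            'http://www.tierneyphotography.co.uk/',
        ]
    })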