Example #1
    def test_scrape_all_links_from_all_pages_in_same_domain_for_given_start_url(
            self):
        link_scraper = mock()
        crawler_rules = mock()
        mock_writer = mock()
        when(link_scraper).scrape_links("http://samplepage.com").thenReturn([
            Link(url="/about",
                 label="About",
                 parent_url="http://samplepage.com")
        ])
        when(link_scraper).scrape_links(
            "http://samplepage.com/about").thenReturn([])
        when(crawler_rules).apply_rules([
            Link(url="/about",
                 label="About",
                 parent_url="http://samplepage.com")
        ]).thenReturn([
            Link(url="/about",
                 label="About",
                 parent_url="http://samplepage.com")
        ])
        when(crawler_rules).apply_rules([]).thenReturn([])
        spider = Spider(link_scraper, crawler_rules)
        expected_links = {
            "page_url":
            "http://samplepage.com",
            "child_links": [{
                "page_url": "http://samplepage.com/about",
                "child_links": []
            }]
        }

        links = spider.scrape("http://samplepage.com", mock_writer)

        self.assertEquals(expected_links, links)
Example #2
    def test_absolute_url_not_same_as_the_given_url(self):
        link = Link("http://www.another.com/microsite",
                    "A link",
                    parent_url="")
        has_same_parent = link.has_same_parent("http://www.apage.com/")

        self.assertFalse(has_same_parent)
Example #3
    def test_parse_multiple_links_on_different_levels_in_html_page(self):
        parsed_links = html_parser.parse(
            links_parser,
            """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
      <html xmlns="http://www.w3.org/1999/xhtml"><head><title></title></head>
        <body>
          <p>Some test<a href="http://somelink.com">SomeLink</a>
          and also <a target="_blank" href="http://anotherlink.com">AnotherLink</a></p>
          <div>
            <p>There is some text here with a link in another div
                <div><a href="http://adeeperlink.com">Deep</a></div>
            </p>
          </div>
        </body>
      </html>""")

        self.assertEquals(3, len(parsed_links))
        self.assertEquals(
            Link(url="http://somelink.com",
                 label="SomeLink",
                 parent_url=parent_url), parsed_links[0])
        self.assertEquals(
            Link(url="http://anotherlink.com",
                 label="AnotherLink",
                 parent_url=parent_url), parsed_links[1])
        self.assertEquals(
            Link(url="http://adeeperlink.com",
                 label="Deep",
                 parent_url=parent_url), parsed_links[2])
Example #4
    def test_links_with_same_url_different_labels_are_unequal(self):
        link = Link("http://thisurl.com", "A link", parent_url="")
        another_link = Link("http://thisurl.com",
                            "Something else",
                            parent_url="")

        self.assertNotEqual(another_link, link)
Example #5
    def test_url_is_not_same_as_given_url(self):
        link = Link("http://www.friendpage.com",
                    "A link",
                    parent_url="http://www.apage.com")

        is_same = link.is_same_as("http://www.somethingrandom.com")

        self.assertFalse(is_same)
Example #6
    def test_get_url_returns_include_parent_base_url_for_a_relative_url(self):
        link = Link("/something",
                    "A link",
                    parent_url="http://www.apage.com/someplace/whatever")

        url = link.get_url()

        self.assertEquals("http://www.apage.com/something", url)
Example #7
    def test_url_does_not_have_same_parent_path_as_the_given_url(self):
        link = Link("/microsite/about",
                    "A link",
                    parent_url="http://www.apage.com")
        has_same_parent = link.has_same_parent(
            "http://www.apage.com/anothermicrosite")

        self.assertFalse(has_same_parent)
Example #8
    def test_url_with_deeper_path_has_parent_path_as_the_given_url(self):
        link = Link("/microsite/blog/anotherplace/about/specific",
                    "A link",
                    parent_url="http://www.apage.com")
        has_same_parent = link.has_same_parent(
            "http://www.apage.com/microsite")

        self.assertTrue(has_same_parent)
Example #9
    def __init__(self, config={}):
        if 'url_list' not in config:
            raise Exception('I will need a list of urls to work')

        self.link = Link()
        #self.kafka = MyKafka('NOTHS-crawler-topic')
        self.config['url_list'] = config['url_list']
        self.start()
Example #10
    def test_url_having_same_hostname_as_the_given_url_belongs_to_same_domain(
            self):
        link = Link("http://www.friendpage.com",
                    "A link",
                    parent_url="http://www.apage.com")
        is_same_domain = link.is_same_domain("http://www.friendpage.com")

        self.assertTrue(is_same_domain)
Example #11
    def test_url_is_same_as_given_url(self):
        link = Link("http://www.friendpage.com",
                    "A link",
                    parent_url="http://www.apage.com")

        is_same = link.is_same_as("http://www.friendpage.com")

        self.assertTrue(is_same)
Example #12
    def test_url_with_diffent_hostname_than_the_given_url_does_not_belong_to_same_domain(
            self):
        link = Link("http://www.friendpage.com",
                    "A link",
                    parent_url="http://www.apage.com")
        is_same_domain = link.is_same_domain("http://www.someotherpage.com")

        self.assertFalse(is_same_domain)
Example #13
    def test_url_has_parent_path_as_the_given_url(self):
        link = Link("/microsite/about",
                    "A link",
                    parent_url="http://www.apage.com")
        has_same_parent = link.has_same_parent(
            "http://www.apage.com/microsite")

        self.assertTrue(has_same_parent)
Example #14
    def test_return_empty_list_when_none_match(self):
        filter = SameHierarcyUrlFilter("http://www.somepage.com/microsite")
        non_matching_link1 = Link(url='/somethingelse/privacy', label="Privacy", parent_url="http://www.somepage.com")
        non_matching_link2 = Link(url='/blog/hello', label="Blog", parent_url="http://www.somepage.com")
        non_matching_link3 = Link(url='/intl/de/policies/terms/', label="Terms", parent_url="http://www.somepage.com")

        filtered_links = filter.filter_links([non_matching_link1, non_matching_link2, non_matching_link3])

        self.assertEquals(0, len(filtered_links))
Example #15
    def test_links_with_same_urls_labels_parent_urls_are_equal(self):
        link = Link(url="http://thisurl.com",
                    label="A link",
                    parent_url="http://anotherlink.com")
        another_link = Link("http://thisurl.com",
                            "A link",
                            parent_url="http://anotherlink.com")

        self.assertEquals(another_link, link)
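
Taken together, the Link tests above (Examples #2, #4 through #8, #10 through #13, and #15) pin down a small API: get_url() resolves a relative url against parent_url, is_same_as() and is_same_domain() compare resolved urls, has_same_parent() checks path prefixes, and equality covers url, label, and parent_url. Below is a minimal sketch consistent with those tests, assuming urllib.parse-based resolution; the project's real implementation may differ.

from urllib.parse import urljoin, urlparse


class Link:
    def __init__(self, url, label, parent_url):
        self.url = url
        self.label = label
        self.parent_url = parent_url

    def get_url(self):
        # "/something" resolved against "http://www.apage.com/someplace/whatever"
        # yields "http://www.apage.com/something", as Example #6 expects.
        return urljoin(self.parent_url, self.url)

    def is_same_as(self, other_url):
        return self.get_url().rstrip("/") == other_url.rstrip("/")

    def is_same_domain(self, other_url):
        return urlparse(self.get_url()).hostname == urlparse(other_url).hostname

    def has_same_parent(self, other_url):
        # True when the resolved url sits at or below other_url's path.
        mine, other = urlparse(self.get_url()), urlparse(other_url)
        if mine.hostname != other.hostname:
            return False
        my_parts = [p for p in mine.path.split("/") if p]
        other_parts = [p for p in other.path.split("/") if p]
        return my_parts[:len(other_parts)] == other_parts

    def __eq__(self, other):
        return isinstance(other, Link) and \
            (self.url, self.label, self.parent_url) == \
            (other.url, other.label, other.parent_url)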
Example #16
    def test_filter_urls_that_match_parent_path_with_given_url(self):
        filter = SameHierarcyUrlFilter("http://www.somepage.com/microsite")
        matching_link1 = Link(url='/microsite/privacy', label="Privacy", parent_url="http://www.somepage.com")
        matching_link2 = Link(url='/microsite/blog/hello', label="Blog", parent_url="http://www.somepage.com")
        non_matching_link = Link(url='/intl/de/policies/terms/', label="Terms", parent_url="http://www.somepage.com")

        filtered_links = filter.filter_links([matching_link1, matching_link2, non_matching_link])

        self.assertEquals(2, len(filtered_links))
        self.assertCountEqual([matching_link1, matching_link2], filtered_links)
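
Examples #14 and #16 only require that SameHierarcyUrlFilter keep links whose resolved url lives under the url it was constructed with. A plausible sketch on top of the Link sketch above:

class SameHierarcyUrlFilter:
    def __init__(self, url):
        self.url = url

    def filter_links(self, links):
        # Keep only links at or below self.url's path.
        return [link for link in links if link.has_same_parent(self.url)]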
Example #17
  def test_return_empty_when_no_urls_have_same_domain(self):
    filter = SameDomainUrlFilter("http://google.com")
    link1InDifferentDomain = Link(url='http://www.facebook.com', label="Terms", parent_url="http://google.com")
    link2InDifferentDomain = Link(url='http://www.youtube.com/?gl=DE&tab=w1', label="Video", parent_url="http://google.com")
    links = [
        link1InDifferentDomain,
        link2InDifferentDomain
      ]

    filtered_links = filter.filter_links(links)

    self.assertEquals(0, len(filtered_links))
Example #18
  def test_returns_urls_that_satifies_all_filters(self):
    mock_filter1 = mock()
    mock_filter2 = mock()
    filters = [mock_filter1, mock_filter2]
    links = [Link(url = "", label="", parent_url=""), Link(url = "", label="", parent_url="")]
    when(mock_filter1).filter_links(links).thenReturn(links)
    when(mock_filter2).filter_links(links).thenReturn(links)
    rule = CrawlerRules(filters)

    filtered_links = rule.apply_rules(links)

    self.assertEquals(2, len(filtered_links))
    self.assertCountEqual(links, filtered_links)
Example #19
  def test_return_empty_when_all_urls_are_duplicate(self):
    filter = DuplicateUrlFilter("http://google.com")
    linkRepeated1 = Link(url='http://google.com', label="Terms", parent_url="http://google.com")
    linkRepeated2 = Link(url='/', label="Video", parent_url="http://google.com")

    links = [
      linkRepeated1,
      linkRepeated2,
    ]

    filtered_links = filter.filter_links(links)

    self.assertEquals(0, len(filtered_links))
Example #20
  def test_filter_url_based_on_given_domain(self):
    filter = SameDomainUrlFilter("http://google.com")
    link1InSameDomain = Link(url='/intl/de/policies/privacy/', label="Privacy", parent_url="http://google.com")
    link2InSameDomain = Link(url='/intl/de/policies/terms/', label="Terms", parent_url="http://google.com")
    linkInDifferentDomain = Link(url='http://www.youtube.com/?gl=DE&tab=w1', label="Video", parent_url="http://google.com")
    links = [
        link1InSameDomain,
        link2InSameDomain,
        linkInDifferentDomain
      ]

    filtered_links = filter.filter_links(links)

    self.assertEquals(2, len(filtered_links))
    self.assertCountEqual([link2InSameDomain, link1InSameDomain], filtered_links)
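
Examples #17 and #20 constrain SameDomainUrlFilter the same way: keep links whose resolved hostname matches the seed url's hostname. A sketch, again leaning on the Link sketch above:

class SameDomainUrlFilter:
    def __init__(self, url):
        self.url = url

    def filter_links(self, links):
        return [link for link in links if link.is_same_domain(self.url)]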
Example #21
    def test_parse_multiple_links_in_html_page(self):
        parsed_links = html_parser.parse(
            links_parser,
            '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"><html xmlns="http://www.w3.org/1999/xhtml"><head><title></title></head><body><p>Some test<a href="http://somelink.com">SomeLink</a> and also <a href="http://anotherlink.com">AnotherLink</a></p></body></html>'
        )

        self.assertEquals(2, len(parsed_links))
        self.assertEquals(
            Link(url="http://somelink.com",
                 label="SomeLink",
                 parent_url=parent_url), parsed_links[0])
        self.assertEquals(
            Link(url="http://anotherlink.com",
                 label="AnotherLink",
                 parent_url=parent_url), parsed_links[1])
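
In Examples #3 and #21, links_parser and parent_url are module-level fixtures that the snippets don't show. The html_parser module itself only needs to feed tag events to whatever tag parser it is given and collect committed results. Here is a minimal sketch of such a module over the standard library's HTMLParser; the event-method names on the tag parser (create_transaction, found_data, commit) are assumptions inferred from Example #29's LinkTagParser test.

from html.parser import HTMLParser


class _TagParserAdapter(HTMLParser):
    def __init__(self, tag_parser):
        super().__init__()
        self.tag_parser = tag_parser
        self.parsed = []

    def handle_starttag(self, tag, attrs):
        # attrs arrives as a list of (name, value) tuples.
        self.tag_parser.create_transaction(tag, attrs)

    def handle_data(self, data):
        self.tag_parser.found_data(data)

    def handle_endtag(self, tag):
        result = self.tag_parser.commit(tag)
        if result is not None:
            self.parsed.append(result)


def parse(tag_parser, html_content):
    adapter = _TagParserAdapter(tag_parser)
    adapter.feed(html_content)
    return adapter.parsed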
Example #22
  def test_return_empty_when_no_urls_match_the_rules(self):
    mock_filter1 = mock()
    mock_filter2 = mock()
    filters = [mock_filter1, mock_filter2]
    unsatifying_link1 = Link(url = "", label="", parent_url="")
    unsatifying_link2 = Link(url = "", label="", parent_url="")

    links = [unsatifying_link1, unsatifying_link2]
    when(mock_filter1).filter_links(links).thenReturn([])
    when(mock_filter2).filter_links([]).thenReturn([])
    rule = CrawlerRules(filters)

    filtered_links = rule.apply_rules(links)

    self.assertEquals(0, len(filtered_links))
Example #23
  def test_remove_urls_same_as_given_url(self):
    filter = DuplicateUrlFilter("http://google.com")
    linkRepeated1 = Link(url='http://google.com', label="Terms", parent_url="http://google.com")
    linkRepeated2 = Link(url='/', label="Video", parent_url="http://google.com")
    anotherLink = Link(url='http://www.facebook.com', label="Terms", parent_url="http://google.com")

    links = [
      linkRepeated1,
      linkRepeated2,
      anotherLink
    ]

    filtered_links = filter.filter_links(links)

    self.assertEquals(1, len(filtered_links))
    self.assertCountEqual([anotherLink], filtered_links)
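
Examples #19 and #23 show DuplicateUrlFilter dropping any link that resolves back to the seed url itself. Note that url='/' with parent http://google.com counts as a duplicate, so the comparison must run on resolved urls. A sketch using the Link API:

class DuplicateUrlFilter:
    def __init__(self, url):
        self.url = url

    def filter_links(self, links):
        # is_same_as compares resolved urls, so '/' under http://google.com
        # matches http://google.com, as Example #19 requires.
        return [link for link in links if not link.is_same_as(self.url)]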
Example #24
  def test_pass_filtered_urls_from_one_filter_to_the_next_one(self):
    mock_filter1 = mock()
    mock_filter2 = mock()
    filters = [mock_filter1, mock_filter2]
    link = Link(url = "", label="", parent_url="")
    link2 = Link(url = "", label="", parent_url="")
    link_unsatisfying_filter1 = Link(url = "", label="", parent_url="")
    links = [link, link2, link_unsatisfying_filter1]
    when(mock_filter1).filter_links(links).thenReturn([link, link2])
    when(mock_filter2).filter_links([link, link2]).thenReturn([link])
    rule = CrawlerRules(filters)

    filtered_links = rule.apply_rules(links)

    self.assertEquals(1, len(filtered_links))
    self.assertCountEqual([link], filtered_links)
Example #25
  def test_remove_url_that_dont_satisfy_even_one_filter(self):
    mock_filter1 = mock()
    mock_filter2 = mock()
    filters = [mock_filter1, mock_filter2]
    link = Link(url = "", label="", parent_url="")
    unsatifying_link = Link(url = "", label="", parent_url="")

    links = [link, unsatifying_link]
    when(mock_filter1).filter_links(links).thenReturn(links)
    when(mock_filter2).filter_links(links).thenReturn([link])
    rule = CrawlerRules(filters)

    filtered_links = rule.apply_rules(links)

    self.assertEquals(1, len(filtered_links))
    self.assertCountEqual([link], filtered_links)
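
Examples #18, #22, #24, and #25 all describe the same pipeline behaviour: CrawlerRules threads the link list through each filter in order, so each filter sees only what the previous one let through. A sketch:

class CrawlerRules:
    def __init__(self, filters):
        self.filters = filters

    def apply_rules(self, links):
        # Each filter receives the previous filter's output (Example #24).
        for link_filter in self.filters:
            links = link_filter.filter_links(links)
        return links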
Example #26
    def test_create_link_with_label(self):
        lb = LinkBuilder("")
        link = lb.create_new("http://aurl.com").with_label("A Link")

        link = lb.build()

        self.assertEquals(
            Link(url="http://aurl.com", label="A Link", parent_url=""), link)
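
Example #26 implies a small fluent builder: create_new() starts a link under the builder's parent url, with_label() adds the label, and build() produces the Link. A sketch over the Link sketch above:

class LinkBuilder:
    def __init__(self, parent_url):
        self.parent_url = parent_url
        self.url = None
        self.label = None

    def create_new(self, url):
        self.url = url
        self.label = None
        return self  # fluent: allows .with_label(...) chaining

    def with_label(self, label):
        self.label = label
        return self

    def build(self):
        return Link(url=self.url, label=self.label, parent_url=self.parent_url)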
Example #27
    def test_scrape_ignores_links_that_fail_the_rules(self):
        link_scraper = mock()
        crawler_rules = mock()
        mock_writer = mock()
        same_domain_links1 = [
            Link(url="/about",
                 label="About",
                 parent_url="http://samplepage.com")
        ]
        when(link_scraper).scrape_links("http://samplepage.com").thenReturn(
            same_domain_links1)
        when(link_scraper).scrape_links(
            "http://samplepage.com/about").thenReturn([
                Link(url="http://anotherdoamin.com",
                     label="External",
                     parent_url="http://samplepage.com/about")
            ])
        when(link_scraper).scrape_links("http://anotherdoamin.com").thenReturn(
            [
                Link(url="/anotherabout",
                     label="External About",
                     parent_url="http://anotherdoamin.com")
            ])
        when(link_scraper).scrape_links(
            "http://anotherdoamin.com/anotherabout").thenReturn([])
        when(crawler_rules).apply_rules(same_domain_links1).thenReturn(
            same_domain_links1)
        when(crawler_rules).apply_rules(neq(same_domain_links1)).thenReturn([])
        spider = Spider(link_scraper, crawler_rules)
        expected_links = {
            "page_url":
            "http://samplepage.com",
            "child_links": [{
                "page_url": "http://samplepage.com/about",
                "child_links": []
            }]
        }

        links = spider.scrape("http://samplepage.com", mock_writer)

        self.assertEquals(expected_links, links)
Example #28
class LinkItemTestCase(unittest.TestCase):

    def setUp(self):
        self.item = Item('img', r'(?P<img><img [^>]+?>)')
        self.link = Link('movie.douban', 'http://movie.douban.com/', [self.item])

    def test_register_funcs(self):
        def cf(one):
            print(one)
            return one

        self.link.register_funcs([cf])

        self.assertTrue(cf in self.item.clean_funcs)

    def test_fetch(self):
        results = self.link.fetch()
        douban_logo = '<img style="top: -5px; position: relative;" src="http://img3.douban.com/pics/site/icon_site_beta.gif"/>'
        self.assertIn('img', results)

        movie = results['img']
        self.assertIn(douban_logo, [m['img'] for m in movie])
Example #29
    def test_return_created_link_from_existing_transaction_on_commit(self):
        expected_link = Link(url="/about",
                             label="About",
                             parent_url="http://something.com")
        mock_link_builder = mock()
        when(mock_link_builder).create_new("/about").thenReturn(
            mock_link_builder)
        when(mock_link_builder).build().thenReturn(expected_link)
        tag_parser = LinkTagParser(mock_link_builder)
        tag_parser.create_transaction("a", [('href', '/about')])

        link = tag_parser.commit("a")

        self.assertEquals(expected_link, link)
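
Example #29 exercises LinkTagParser through two calls: create_transaction() opens a pending link when an <a> tag starts, and commit() asks the builder for the finished Link when the tag closes. A sketch; found_data() for the label text is an assumed hook, mirroring the adapter sketch after Example #21.

class LinkTagParser:
    def __init__(self, link_builder):
        self.link_builder = link_builder
        self.open = False

    def create_transaction(self, tag, attrs):
        if tag == "a":
            self.link_builder.create_new(dict(attrs).get("href"))
            self.open = True

    def found_data(self, data):
        # Assumed hook: label text between <a> and </a>.
        if self.open:
            self.link_builder.with_label(data)

    def commit(self, tag):
        if tag == "a" and self.open:
            self.open = False
            return self.link_builder.build()
        return None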
Example #30
    def test_parse_all_links_using_links_tag_parser_in_page_for_the_given_url(
            self):
        page_url = "http://asamplepage.com"
        html_content = "An html page with links"
        page_links = [
            Link(url="/about",
                 label="A link",
                 parent_url="http://asamplepage.com"),
            Link("http://aurl.com",
                 "A link",
                 parent_url="http://asamplepage.com")
        ]
        mock_client = mock()
        mock_parser = mock()
        when(mock_client).get_html_page(page_url).thenReturn(html_content)
        when(mock_parser).parse(ANY(LinkTagParser),
                                eq(html_content)).thenReturn(page_links)
        scraper = LinkScraper(mock_client, mock_parser)

        links = scraper.scrape_links(page_url)

        self.assertEquals(2, len(links))
        self.assertCountEqual(page_links, links)
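
Example #30 fixes LinkScraper's collaboration: fetch the page through the http client, then hand the html plus a fresh LinkTagParser to the html parser. How that tag parser is constructed is not pinned down by the test (it only checks ANY(LinkTagParser)), so building it from a LinkBuilder seeded with the page url is an assumption:

class LinkScraper:
    def __init__(self, http_client, html_parser):
        self.http_client = http_client
        self.html_parser = html_parser

    def scrape_links(self, page_url):
        html_content = self.http_client.get_html_page(page_url)
        tag_parser = LinkTagParser(LinkBuilder(page_url))  # assumed wiring
        return self.html_parser.parse(tag_parser, html_content)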
Example #31
    def test_scrape_do_not_scrape_same_url_again(self):
        link_scraper = mock()
        crawler_rules = mock()
        mock_writer = mock()
        same_domain_links1 = [
            Link(url="/about",
                 label="About",
                 parent_url="http://samplepage.com")
        ]
        same_domain_links_repeated = [
            Link(url="http://samplepage.com",
                 label="Home",
                 parent_url="http://samplepage.com/about")
        ]
        when(link_scraper).scrape_links("http://samplepage.com").thenReturn(
            same_domain_links1)
        when(link_scraper).scrape_links(
            "http://samplepage.com/about").thenReturn(
                same_domain_links_repeated)
        when(crawler_rules).apply_rules(same_domain_links1).thenReturn(
            same_domain_links1)
        when(crawler_rules).apply_rules(same_domain_links_repeated).thenReturn(
            same_domain_links_repeated)
        spider = Spider(link_scraper, crawler_rules)
        expected_links = {
            "page_url":
            "http://samplepage.com",
            "child_links": [{
                "page_url": "http://samplepage.com/about",
                "child_links": []
            }]
        }

        links = spider.scrape("http://samplepage.com", mock_writer)

        self.assertEquals(expected_links, links)
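
The Spider tests (Examples #1, #27, and #31) describe a depth-first crawl: scrape a page, run the links through the rules, recurse into each resolved url not yet visited, and return the nested page_url/child_links structure. The writer's role is not asserted anywhere above, so passing it the finished tree is an assumption:

class Spider:
    def __init__(self, link_scraper, crawler_rules):
        self.link_scraper = link_scraper
        self.crawler_rules = crawler_rules

    def scrape(self, start_url, writer):
        result = self._scrape_page(start_url, visited=set())
        writer.write(result)  # hypothetical writer API; not pinned down by the tests
        return result

    def _scrape_page(self, page_url, visited):
        visited.add(page_url)
        links = self.crawler_rules.apply_rules(
            self.link_scraper.scrape_links(page_url))
        child_links = [
            self._scrape_page(link.get_url(), visited)
            for link in links
            if link.get_url() not in visited
        ]
        return {"page_url": page_url, "child_links": child_links}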
Example #32
# Imports inferred from the code below; Link and MyKafka are project-local
# classes whose import paths aren't shown in the original.
import re
import urllib.request
from itertools import islice
from threading import Thread

import requests
from bs4 import BeautifulSoup as BS


class Parse(Link, MyKafka):

    config = {}

    def __init__(self, config={}):
        if 'url_list' not in config:
            raise Exception('I will need a list of urls to work')

        self.link = Link()
        # Note: self.kafka stays commented out here, yet
        # __collect_send_product_detail calls self.kafka.send();
        # re-enable this line before using that path.
        #self.kafka = MyKafka('NOTHS-crawler-topic')
        self.config['url_list'] = config['url_list']
        self.start()

    def start(self):
        # The original wrapped this loop in a commented-out try/except that
        # printed 'Unable to open <url>'; add real error handling as needed.
        for url in self.config['url_list']:
            self.base_url = url
            bsojb = self.__getBSObj(url)
            if re.search(r'personalcreations', url):
                self.personalcreations(bsojb)
            elif re.search(r'tierneyphotography', url):
                self.tierneyphotography(bsojb)
            else:
                print('still need method for', url)

    def personalcreations(self, bsOjb):

        category_links = self.link.find_with_class(bsOjb, 'dropDownNavLI ', 'li')

        threads = []

        for category_link in category_links:
            self.link.addLink(category_link, 'category_link')
            # start threads
            t = Thread(target=self.__collect_product_links, args=(category_link[1], 'name', 'div', category_link[0]))
            threads.append(t)

        for t in threads:
            t.start()

        for t in threads:
            t.join()

        product_threads = []
        for item in self.__split({i: self.link.linkList[i] for i in self.link.linkList}, 200):
            t = Thread(target=self.__collect_send_product_detail, args=(item,
                                                                        {'product': {'class': 'productTitle',
                                                                                     'tag': 'span'},
                                                                         'price':   {'class': 'mainPrice_MinSalePrice',
                                                                                     'alter_class': 'mainPrice_SalePrice',
                                                                                     'tag': 'span'
                                                                                     },
                                                                         'desc':    {'class': 'ProductInfoText',
                                                                                     'tag': 'div'
                                                                                    }
                                                                         })
                       )
            product_threads.append(t)

        for t in product_threads:
            t.start()

    def __getBSObj(self, url, retries=0):

        if retries >= 3:
            return None

        try:
            rg = re.compile("^(http|www)")
            if rg.match(url):
                r = requests.get(url).text.encode('ascii', 'ignore')
            else:
                r = requests.get(self.base_url + url).text.encode('ascii', 'ignore')
            return BS(r, "html.parser")
        except Exception:
            print('getBSObj Err - retry:', url)
            # The original dropped this return, so retries silently
            # discarded their result.
            return self.__getBSObj(url, retries + 1)


    def __collect_product_links(self, link, class_, tag, product_category):

        print('starting thread ..', link)
        bsObj = self.__getBSObj(link)
        products_links = self.link.find_with_class(bsObj, class_, tag)
        for products_link in products_links:
            self.link.addLink(products_link, 'product_link', product_category)

    def __split(self, data, SIZE=10000):
        it = iter(data)
        for i in range(0, len(data), SIZE):
            yield {k: data[k] for k in islice(it, SIZE)}

    def __collect_send_product_detail(self, links, args):

        print('starting thread ..', links)

        for k, v in links.items():

            if v['group'] == 'product_link':

                bsObj = self.__getBSObj(v['link'])

                dlink = v['link'].replace(',', '')
                dgroup = v['group']
                dcategory = v['category'] if 'category' in v else ''

                # get price; fall back to the alternate class when the first
                # lookup finds nothing (the fields must be initialised,
                # otherwise an empty result raises NameError below)
                dprice = dproduct = ddesc = ''
                for price in bsObj.find_all(args['price']['tag'], {'class': args['price']['class']}, 'visible'):
                    dprice = price.text.replace(',', '')

                if not dprice:
                    for price in bsObj.find_all(args['price']['tag'], {'class': args['price']['alter_class']}, 'visible'):
                        dprice = price.text.replace(',', '')

                # get title
                for product in bsObj.find_all(args['product']['tag'], {'class': args['product']['class']}, 'visible'):
                    dproduct = product.text.replace(',', '')

                # get description
                for desc in bsObj.find_all(args['desc']['tag'], {'class': args['desc']['class']}, 'visible'):
                    ddesc = desc.text.replace(',', '')

                data = ",".join([dgroup, dlink, dproduct, dprice, ddesc, dcategory.decode("utf-8")])
                self.kafka.send(data)

            elif v['group'] == 'category_link':

                dlink = v['link'].replace(',', '')
                dgroup = v['group']
                dtitle = v['title']
                data = ",".join([dgroup, dlink, dtitle])
                self.kafka.send(data)


    def tierneyphotography(self, bsOjb):

        photo_links = self.link.find_with_class(bsOjb, 'block-link', False)

        for link in photo_links:
            if re.search(r'cache', link[1]):
                full_link = 'http://www.tierneyphotography.co.uk/' + link[1]
                filename = full_link.split('/')[-1]
                # Python 3 location of urlretrieve (the original used the
                # Python 2 urllib.urlretrieve).
                urllib.request.urlretrieve(full_link, filename)