示例#1
0
 def set_next_page(self):
     """Load ``self.base_url`` in the browser, retrying until it succeeds.

     Sets ``self.url`` to ``self.base_url`` and asks ``self.driver`` to open
     it. If the load raises, the error is printed, ``self.driver`` is
     replaced with a new driver from ``setup_chrome()`` and the load is
     attempted again. There is no upper bound on the number of retries.
     """
     self.url = self.base_url
     while True:
         try:
             self.driver.get(self.url)
         except Exception as e:
             # Report the failure instead of swallowing it silently, then
             # retry with a freshly created driver.
             print(e)
             self.driver = setup_chrome()
             continue
         # Fixed pause after a successful load (presumably to let the page
         # finish rendering -- confirm).
         time.sleep(3)
         break
示例#2
0
class BaseCrawler:
    """Base class for crawlers that page through a site and store ``Place`` rows.

    Subclasses are expected to set ``brand_name`` (and ``crawler_name``) and
    to implement ``set_next_page`` and ``get_place_data``. ``run`` drives the
    crawl: it repeatedly advances the page, collects the places on it and
    writes them to the database until a page yields no places.
    """

    # Not used by this class itself; presumably provided by subclasses.
    map_util = None
    # Label printed by ``run`` when the crawl finishes.
    crawler_name = None
    # Name used to look up (or create) the ``Brand`` row.
    brand_name = None
    # Cached ``Brand`` instance, filled in lazily by ``get_brand``.
    brand = None
    # NOTE: evaluated once, when the class body is executed, so this driver
    # is created at import time and shared by every crawler that does not
    # assign its own ``self.driver``.
    driver = setup_chrome()
    # URL of the page currently being crawled; printed by ``run``.
    url = None

    def _set_brand(self):
        """Fetch or create the ``Brand`` named ``brand_name`` and cache it.

        Raises:
            NotImplementedError: if ``brand_name`` has not been set.
        """
        if not self.brand_name:
            raise NotImplementedError
        self.brand, _ = Brand.objects.get_or_create(name=self.brand_name)

    def get_brand(self):
        """Return the crawler's ``Brand``, loading it on first use."""
        if not self.brand:
            self._set_brand()
        return self.brand

    def set_next_page(self):
        """Move the crawler to its next page. Must be implemented by subclasses."""
        raise NotImplementedError

    def get_place_data(self) -> [Place]:
        """Return the ``Place`` objects found on the current page.

        Must be implemented by subclasses. ``run`` reads ``name``,
        ``description``, ``address``, ``telephone``, ``latitude`` and
        ``longitude`` from each returned place; an empty result ends the crawl.
        """
        raise NotImplementedError

    def run(self):
        """Crawl page by page, updating existing places and creating new ones.

        For every page, places whose name already exists in the database are
        updated in place (one ``save()`` per row) and the remaining ones are
        inserted with a single ``bulk_create``. The loop stops at the first
        page for which ``get_place_data`` returns nothing.

        Raises:
            ModuleNotFoundError: if no driver is available.
        """
        if not self.driver:
            raise ModuleNotFoundError('selenium driver required')
        while True:
            self.set_next_page()
            print(self.url)
            places = self.get_place_data()
            if not places:
                break

            # Index the crawled places by name so each existing row is matched
            # with one lookup. When several crawled places share a name the
            # last one wins, which is the state the row ends up in anyway.
            crawled_by_name = {place.name: place for place in places}
            existing_places = list(
                Place.objects.filter(name__in=list(crawled_by_name))
            )
            for existing in existing_places:
                crawled = crawled_by_name.get(existing.name)
                if crawled is None:
                    # The database may match names more loosely than Python's
                    # ``==`` (e.g. case-insensitive collations); skip rows
                    # that are not an exact match.
                    continue
                existing.description = crawled.description
                existing.address = crawled.address
                existing.telephone = crawled.telephone
                existing.latitude = crawled.latitude
                existing.longitude = crawled.longitude
                existing.save()

            existing_names = {existing.name for existing in existing_places}
            new_places = [
                place for place in places
                if place.name not in existing_names
            ]
            Place.objects.bulk_create(new_places)
            print('new %s places are created' % str(len(new_places)))
        print('%s finished' % self.crawler_name)
示例#3
0
 def set_next_page(self):
     """Open the page for the current ``page_number`` and advance the counter.

     The URL is ``self.base_url`` followed by ``self.page_number``. Loading
     is retried without limit: on any error the exception is printed and
     ``self.driver`` is replaced by a new driver from ``setup_chrome()``
     before the next attempt. ``self.page_number`` is incremented once the
     page has loaded.
     """
     self.url = self.base_url + str(self.page_number)
     while True:
         try:
             self.driver.get(self.url)
             time.sleep(3)
         except Exception as error:
             print(error)
             # Start over with a fresh browser session.
             self.driver = setup_chrome()
         else:
             break
     self.page_number += 1
示例#4
0
 def set_next_page(self):
     """Show the result page for ``self.page_number`` and advance the counter.

     For page 1 the browser loads ``self.base_url``; if the load raises, the
     error is printed, ``self.driver`` is replaced with a new driver from
     ``setup_chrome()`` and the load is retried (no retry limit). For every
     later page the in-page JavaScript function ``ajaxCompanyArea`` is called
     with the page number instead of navigating. ``self.page_number`` is
     incremented afterwards in both cases.
     """
     self.url = self.base_url
     if self.page_number == 1:
         while True:
             try:
                 # The load itself must be inside the ``try``: it is the call
                 # that can fail and that the driver re-creation is meant to
                 # recover from.
                 self.driver.get(self.url)
             except Exception as e:
                 print(e)
                 self.driver = setup_chrome()
                 continue
             # Fixed pause after a successful load.
             time.sleep(3)
             break
     else:
         self.driver.execute_script('ajaxCompanyArea(%s)' %
                                    self.page_number)
         # Shorter pause after the in-page script call.
         time.sleep(1)
     self.page_number += 1