def set_next_page(self):
    """Point the crawler at ``self.base_url`` and load it in the browser.

    Sets ``self.url`` to ``self.base_url`` and navigates ``self.driver`` to
    it. If navigation (or the pause that follows) raises, the error is
    printed, ``self.driver`` is replaced with a fresh ``setup_chrome()``
    instance and the load is attempted again. There is no retry limit.
    """
    self.url = self.base_url
    while True:
        try:
            self.driver.get(self.url)
            # Fixed pause after navigation -- presumably to give the page
            # time to finish rendering before it is scraped.
            time.sleep(3)
        except Exception as e:
            # Report the failure instead of swallowing it, so repeated
            # driver restarts are visible in the crawler output.
            print(e)
            self.driver = setup_chrome()
        else:
            break
class BaseCrawler:
    """Base class for crawlers that scrape places page by page.

    Subclasses are expected to set ``crawler_name`` and ``brand_name`` and to
    implement :meth:`set_next_page` and :meth:`get_place_data`. :meth:`run`
    then walks the pages until one yields no places, updating ``Place`` rows
    whose name already exists and bulk-creating the rest.
    """

    # Not referenced anywhere in this class; presumably a map/geocoding
    # helper supplied by subclasses -- TODO confirm.
    map_util = None
    # Label used in the "finished" message printed by run().
    crawler_name = None
    # Name handed to Brand.objects.get_or_create() by _set_brand().
    brand_name = None
    # Brand instance cached by _set_brand(); resolved lazily via get_brand().
    brand = None
    # NOTE: created once, when this class body executes, and shared by every
    # subclass/instance that does not rebind ``self.driver``.
    driver = setup_chrome()
    # Address of the page being crawled; run() prints it after every
    # set_next_page() call.
    url = None

    def _set_brand(self):
        """Fetch or create the ``Brand`` named ``brand_name`` and cache it.

        Raises:
            NotImplementedError: if ``brand_name`` has not been set.
        """
        if not self.brand_name:
            raise NotImplementedError
        brand, _created = Brand.objects.get_or_create(name=self.brand_name)
        self.brand = brand

    def get_brand(self):
        """Return the crawler's ``Brand``, resolving it on first use."""
        if not self.brand:
            self._set_brand()
        return self.brand

    def set_next_page(self):
        """Move the browser to the next page to scrape.

        Must be overridden. :meth:`run` calls it once per iteration and
        prints ``self.url`` right afterwards.
        """
        raise NotImplementedError

    def get_place_data(self) -> [Place]:
        """Return the places scraped from the current page.

        Must be overridden. An empty result tells :meth:`run` to stop. The
        returned objects are ``Place`` instances; :meth:`run` reads their
        ``name``, ``description``, ``address``, ``telephone``, ``latitude``
        and ``longitude`` attributes.
        """
        # return Place(name, description, latitude, longitude)
        raise NotImplementedError

    def run(self):
        """Crawl every page and store the scraped places.

        For each page, ``Place`` rows whose name matches a scraped place are
        overwritten with the scraped details and saved; scraped places whose
        name is not stored yet are inserted with a single ``bulk_create``.
        The loop ends on the first page that yields no places.

        Raises:
            ModuleNotFoundError: if no driver is available.
        """
        if not self.driver:
            raise ModuleNotFoundError('selenium driver required')

        while True:
            self.set_next_page()
            print(self.url)

            places = self.get_place_data()
            if not places:
                break

            # Index the scraped places by name so each stored row is matched
            # with one lookup. When several scraped entries share a name the
            # last one wins.
            scraped_by_name = {place.name: place for place in places}
            exist_places = Place.objects.filter(name__in=list(scraped_by_name))

            exist_place_names = set()
            for exist_place in exist_places:
                exist_place_names.add(exist_place.name)
                scraped = scraped_by_name.get(exist_place.name)
                if scraped is None:
                    # The stored name is not an exact match for any scraped
                    # name (possible if the database compares names more
                    # loosely than Python does); leave the row untouched.
                    continue
                exist_place.description = scraped.description
                exist_place.address = scraped.address
                exist_place.telephone = scraped.telephone
                exist_place.latitude = scraped.latitude
                exist_place.longitude = scraped.longitude
                exist_place.save()

            new_places = [
                place for place in places
                if place.name not in exist_place_names
            ]
            Place.objects.bulk_create(new_places)
            print('new %s places are created' % str(len(new_places)))

        print('%s finished' % self.crawler_name)
def set_next_page(self):
    """Load the page for the current ``page_number`` and advance the counter.

    ``self.url`` becomes ``self.base_url`` followed by the page number. The
    browser is pointed at it, and on any exception the error is printed, the
    driver is rebuilt with ``setup_chrome()`` and the load is retried (with
    no retry limit). ``self.page_number`` is incremented once the load has
    succeeded.
    """
    self.url = self.base_url + str(self.page_number)

    while True:
        try:
            self.driver.get(self.url)
            # Fixed pause after navigation -- presumably to let the page
            # finish rendering.
            time.sleep(3)
        except Exception as error:
            print(error)
            self.driver = setup_chrome()
        else:
            break

    self.page_number += 1
def set_next_page(self):
    """Show the next page of results and advance ``page_number``.

    ``self.url`` is always set to ``self.base_url``. On the first page
    (``page_number == 1``) the browser navigates to that URL; if navigation
    raises, the error is printed, the driver is rebuilt with
    ``setup_chrome()`` and the load is retried with no retry limit. On later
    pages the browser stays where it is and ``ajaxCompanyArea(<page_number>)``
    is executed in it instead -- presumably a JavaScript function defined by
    the loaded page that swaps in the requested page of results.
    """
    self.url = self.base_url

    if self.page_number == 1:
        while True:
            try:
                # The navigation must sit inside the ``try``: it is the call
                # that can fail, and the handler below exists to recover
                # from exactly that failure.
                self.driver.get(self.url)
                time.sleep(3)
            except Exception as e:
                print(e)
                self.driver = setup_chrome()
            else:
                break
    else:
        self.driver.execute_script('ajaxCompanyArea(%s)' % self.page_number)
        # Short pause after the script call -- presumably to let the AJAX
        # response render.
        time.sleep(1)

    self.page_number += 1