def complete(self, initiative: InitiativeImport):
    initiative_url = self.config.get_initiative_url(initiative.source_id)
    try:
        detail = PlatformSource.get(initiative_url)
        soup = BeautifulSoup(detail.content, 'html.parser')
        table = soup.find("dl")
        records = table.findAll(["dd", "dt"])
        initiative.description = soup.find("p").text.strip('\t\n\r')
        initiative.group = self.config.group
        initiative.source = initiative_url

        setcount = 0
        for i in range(0, len(records), 2):
            # TODO: Error prevention
            label = records[i].contents[1].strip("\":").lower()
            if label in self.config.field_map:
                setattr(initiative, self.config.field_map[label],
                        records[i + 1].contents[0])
                setcount += 1

        if self.config.group == InitiativeGroup.DEMAND:
            title = soup.find("h2", "result__title")
            initiative.organiser = title.contents[0]
        # TODO: Logging if no values are assigned
    except ScrapeException as e:  # should not catch
        # ('error scraping ' + initiative_url + ':' + e.args[0])
        if initiative is not None:
            initiative.state = "processing_error"
def initiatives(self) -> Generator[InitiativeImport, None, None]:
    response = self.get(self.config.list_endpoint)
    data = json.loads(response.content,
                      object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))

    for item in data:
        initiative = InitiativeImport(
            source_id=item.id,
            source_uri=f"https://wijamsterdam.nl/initiatief/{item.id}",
            # using dateutil and not datetime because: https://stackoverflow.com/a/3908349/167131
            created_at=parser.parse(item.createdAt),
            name=item.title,
            description=f"{item.summary}"
                        f"\n--------\n"
                        f"{item.description}",
            location=item.extraData.area,
            organiser=item.extraData.isOrganiserName,
            group=InitiativeGroup.SUPPLY,
            category=item.extraData.theme,
            url=item.extraData.isOrganiserWebsite,
            extra_fields=response.content.decode("utf-8")
            # Probably better to leave email / phone empty
            # name is already tricky maybe albeit open data.
        )

        if hasattr(item, "position"):
            initiative.latitude = item.position.lat
            initiative.longitude = item.position.lng

        yield initiative
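# Illustrative only: a standalone demonstration of the object_hook trick used
# above. Every decoded JSON object becomes an ad-hoc namedtuple, so fields are
# read with attribute access (item.id) instead of subscripting. The sample
# payload here is made up.
import json
from collections import namedtuple

sample = '[{"id": 1, "title": "Buurthulp"}]'
items = json.loads(sample,
                   object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))
assert items[0].id == 1 and items[0].title == "Buurthulp"
# Caveat: namedtuple raises ValueError for keys that are not valid Python
# identifiers; the dict-based map_initiative variant below avoids that.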
def complete(self, initiative: InitiativeImport):
    initiative_url = self.config.get_initiative_url(initiative.source_id)
    # This already raises ScrapeExceptions
    detail = PlatformSource.get(initiative_url)

    try:
        soup = BeautifulSoup(detail.content, 'html.parser')
        table = soup.find("dl")
        records = table.findAll(["dd", "dt"])
        initiative.description = soup.find("p").text.strip('\t\n\r ')
        initiative.group = self.config.group
        initiative.source = initiative_url

        set_count = self.extract_details_table(initiative, records)

        if self.config.group == InitiativeGroup.DEMAND:
            title = soup.find("h2", "result__title")
            initiative.organiser = title.contents[0]

        if not initiative.location:
            self.try_alternative_place(soup, initiative)
    except Exception as ex:
        msg = f"Error reading contents from {initiative_url}"
        raise ScrapeException(msg) from ex

    if set_count == 0:
        raise ScrapeException("Failed to load field map details table")
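# Hypothetical sketch of the extract_details_table helper called above (its
# definition is not shown here). It assumes the same alternating <dt>/<dd>
# pairing and config.field_map lookup as the inline loop in the earlier
# version of complete(), and would live on the same class.
def extract_details_table(self, initiative: InitiativeImport, records) -> int:
    """Copy <dt>/<dd> pairs onto initiative fields via the configured map.

    Returns the number of fields that were actually assigned, so the caller
    can detect a page whose details table failed to parse.
    """
    set_count = 0
    for i in range(0, len(records), 2):
        label = records[i].contents[1].strip("\":").lower()
        if label in self.config.field_map:
            setattr(initiative, self.config.field_map[label],
                    records[i + 1].contents[0])
            set_count += 1
    return set_count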
def scrape_group(self, config: InitiativeGroupConfig, batch: ImportBatch):
    print('scraping ' + config.group)
    page = requests.get(config.url)  # TODO: Handle http error codes
    result = page.json()

    parsed_markers = []
    for marker in result['markers']:
        if marker['id'] not in parsed_markers:
            # TODO: Error handling and possibly a retry
            parsed_markers.append(marker['id'])
            markerurl = config.get_marker_url(marker['id'])
            print('scraping ' + markerurl)

            initiative = None
            try:
                detail = requests.get(markerurl)  # TODO: Handle http error codes
                soup = BeautifulSoup(detail.content, 'html.parser')
                table = soup.find("dl")
                records = table.findAll(["dd", "dt"])
                description = soup.find("p").text.strip('\t\n\r')
                initiative = InitiativeImport(description=description,
                                              group=config.group,
                                              source=markerurl,
                                              source_id=marker['id'])

                setcount = 0
                for i in range(0, len(records), 2):
                    # TODO: Error prevention
                    label = records[i].contents[1].strip("\":").lower()
                    if label in config.field_map:
                        setattr(initiative, config.field_map[label],
                                records[i + 1].contents[0])
                        setcount += 1

                if config.group == InitiativeGroup.DEMAND:
                    title = soup.find("h2", "result__title")
                    initiative.name = title.contents[0]
                # TODO: Logging if no values are assigned
            except Exception as e:
                print('error scraping ' + markerurl + ':' + e.args[0])
                if initiative is not None:
                    initiative.state = "processing_error"

            if initiative is not None:
                batch.initiatives.append(initiative)

            # debugging
            if not self.should_continue(len(parsed_markers)):
                break

    self._db.session.commit()
def map_initiative(response, item):
    initiative = InitiativeImport(
        source_id=item.id,
        source_uri=f"https://wijamsterdam.nl/initiatief/{item.id}",
        # using dateutil and not datetime because: https://stackoverflow.com/a/3908349/167131
        created_at=parser.parse(item.createdAt),
        name=item.title,
        description=f"{item.summary}"
                    f"\n--------\n"
                    f"{item.description}",
        group=InitiativeGroup.SUPPLY,
        extra_fields=response.content.decode("utf-8")
        # Probably better to leave email / phone empty
        # name is already tricky maybe albeit open data.
    )

    if hasattr(item.extraData, "area"):
        initiative.location = item.extraData.area
    if hasattr(item.extraData, "isOrganiserName"):
        initiative.organiser = item.extraData.isOrganiserName
    if hasattr(item.extraData, "theme"):
        initiative.category = item.extraData.theme
    if hasattr(item.extraData, "isOrganiserWebsite"):
        initiative.url = item.extraData.isOrganiserWebsite
    if hasattr(item, "position"):
        initiative.latitude = item.position.lat
        initiative.longitude = item.position.lng

    return initiative
def map_initiative(item):
    org = json.dumps(item)
    initiative = InitiativeImport(
        source_id=item["id"],
        source_uri=f"https://wijamsterdam.nl/initiatief/{item['id']}",
        # using dateutil and not datetime because: https://stackoverflow.com/a/3908349/167131
        created_at=parser.parse(item["createdAt"]),
        name=item["title"],
        description=f"{item['summary']}"
                    f"\n--------\n"
                    f"{item['description']}",
        group=InitiativeGroup.SUPPLY,
        extra_fields=org
        # Probably better to leave email / phone empty
        # name is already tricky maybe albeit open data.
    )

    extra_data = item["extraData"]
    if "area" in extra_data:
        initiative.location = extra_data["area"]
    if "isOrganiserName" in extra_data:
        initiative.organiser = extra_data["isOrganiserName"]
    if "theme" in extra_data:
        initiative.category = extra_data["theme"]
    if "isOrganiserWebsite" in extra_data:
        initiative.url = extra_data["isOrganiserWebsite"]

    if "position" in item:
        initiative.latitude = item["position"]["lat"]
        initiative.longitude = item["position"]["lng"]

    return initiative
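# Illustrative only: a minimal item that map_initiative accepts. The keys come
# from the function body above; the values are invented for the example.
sample_item = {
    "id": 42,
    "createdAt": "2020-04-01T10:00:00Z",
    "title": "Boodschappen doen",
    "summary": "Korte samenvatting",
    "description": "Langere beschrijving",
    "extraData": {"area": "Centrum", "theme": "Hulp"},
    "position": {"lat": 52.37, "lng": 4.90},
}
initiative = map_initiative(sample_item)
assert initiative.location == "Centrum"
assert initiative.latitude == 52.37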
def scrape(self):
    super().scrape()
    page = requests.get(self.URL)
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find(class_='ideas-list')
    questions = results.find_all(class_='idea-item')

    count = 0
    for card in questions:
        title = card.find('h3').text.strip(' \t\n\r')
        rawlocation = card.find(class_='gebied').text.strip(' \t\n\r')
        description = card.find('p').text.strip(' \t\n\r')
        link = card.find('a')['href']
        self._db.session.add(
            InitiativeImport(
                name=title,
                description=description,
                group="unknown",
                source='https://wijamsterdam.nl' + link,
                # extract the id from the '/initiatief/<id>' path
                source_id=link.replace('/initiatief/', ''),
                location=rawlocation,
            ))
        count += 1
        if not self.should_continue(count):
            break

    self._db.session.commit()
def complete(self, initiative: InitiativeImport):
    post_url = self.config.get_api_post_url(initiative.source_id)
    detail = self.get(post_url)

    try:
        initiative_url_guid = '75aa5e4d-fe98-4a7a-94ec-adab2f7f9b88'
        result = detail.json()
        initiative.created_at = parser.parse(result['created'])
        initiative.scraped_at = datetime.datetime.now()
        initiative.name = result['title']
        initiative.description = result['content']
        if initiative_url_guid in result['values']:
            initiative.url = result['values'][initiative_url_guid][0]
        initiative.extra_fields = self.parse_extra_fields(result)

        category_list = []
        for tag in result['tags']:
            category_list.append(self.category_dict[tag['id']])
        initiative.category = ', '.join(category_list)
    except Exception as ex:
        msg = f"Error in complete function for initiative {initiative.source_id}"
        raise ScrapeException(msg) from ex
def _collect_initiative(self, initiative: InitiativeImport, source):
    if initiative is None:
        raise ValueError("Expecting an initiative instance!")

    try:
        source.complete(initiative)
        initiative.scraped_at = datetime.utcnow()
        initiative.source = self.platform_url
        self.add_initiative(initiative)
        self.get_logger().debug(f"Scraped {initiative.source_uri}")
    except ScrapeException as e:
        self.get_logger()\
            .exception(f"Error while collecting initiative {initiative.source_uri}")
        # There's maybe no point in doing this unless it's saved or at least counted.
        # This actually indicates an error in downstream processing.
        initiative.state = "processing_error"
        # Should probably do this very neatly with a context manager.
        if self._collect_recovery.should_raise(e):
            raise e
def initiatives(self) -> Generator[InitiativeImport, None, None]:
    url = self.config.get_list_url()
    page = PlatformSource.get(url)
    result = page.json()

    for marker in result['markers']:
        initiative = InitiativeImport(
            source_id=marker['id'],
            source_uri=self.config.get_marker_url(marker['id']),
            latitude=marker['lat'],
            longitude=marker['lon'],
        )
        yield initiative
def _collect_initiative(self, initiative: InitiativeImport, source):
    if initiative is None:
        raise ValueError("Expecting an initiative instance!")

    try:
        source.complete(initiative)
        initiative.scraped_at = datetime.utcnow()
        initiative.source = self.platform_url
        self.get_logger().debug(f"Scraped {initiative.source_uri}")
    except ScrapeException as e:
        self.get_logger()\
            .exception(f"Error while collecting initiative {initiative.source_uri}")
        # There's maybe no point in doing this unless it's saved or at least counted.
        # This actually indicates an error in downstream processing.
        initiative.state = InitiativeImportState.IMPORT_ERROR
        ex_info = sys.exc_info()
        initiative.error_reason = "".join(
            traceback.format_exception(*ex_info))
        # Should probably do this very neatly with a context manager.
        if self._collect_recovery.should_raise(e):
            raise e
    finally:
        # Always store the initiative for traceability.
        self.add_initiative(initiative)
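# Illustrative only: one shape the "context manager" TODO above could take.
# ScrapeException and InitiativeImportState come from the codebase; everything
# else here is an assumption, not the project's actual API.
import traceback
from contextlib import contextmanager

@contextmanager
def collect_errors(initiative, logger, recovery):
    """Record a ScrapeException on the initiative instead of letting it
    escape, unless the recovery policy says to re-raise."""
    try:
        yield
    except ScrapeException as e:
        logger.exception(f"Error while collecting initiative {initiative.source_uri}")
        initiative.state = InitiativeImportState.IMPORT_ERROR
        initiative.error_reason = "".join(
            traceback.format_exception(type(e), e, e.__traceback__))
        if recovery.should_raise(e):
            raise

# _collect_initiative would then reduce to roughly:
#     with collect_errors(initiative, self.get_logger(), self._collect_recovery):
#         source.complete(initiative)
#         initiative.scraped_at = datetime.utcnow()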
def initiatives(self) -> Generator[InitiativeImport, None, None]:
    url = self.config.get_list_url()
    page = PlatformSource.get(url)

    try:
        result = page.json()
        for marker in result['markers']:
            initiative = InitiativeImport(
                source_id=marker['id'],
                source_uri=self.config.get_marker_url(marker['id']),
                latitude=marker['lat'],
                longitude=marker['lon'],
            )
            yield initiative
    except Exception as ex:
        msg = f"Error reading contents from {url}"
        raise ScrapeException(msg) from ex
def createInitiativeFromDeedDetails(self, deedDetails):
    logging.info("Creating initiative from deed details")
    deedID = self.getDeedIDFromJSON(deedDetails)
    coordinates = self.getCoordinatesFromDeedDetails(deedDetails)
    initiative = InitiativeImport(
        category=deedDetails["fullType"],
        group="supply",
        description=deedDetails["summary"],
        # name = deedDetails[""],
        source=self.getAPIDeedDetailsURL(deedID),
        # frequency = deedDetails["subtype"],
        location=deedDetails["address"],
        latitude=coordinates["lat"],
        longitude=coordinates["lng"])
    return initiative
def create_initiative_from_deed_details(self, deed_details):
    logging.info("Creating initiative from deed details")
    deed_id = self.get_deed_id_from_json(deed_details)
    coordinates = self.get_coordinates_from_deed_details(deed_details)
    initiative = InitiativeImport(
        category=deed_details["fullType"],
        group="supply",
        description=deed_details["summary"],
        # name = deedDetails[""],
        source=self.get_api_deed_details_url(deed_id),
        # frequency = deedDetails["subtype"],
        location=deed_details["address"],
        latitude=coordinates["lat"],
        longitude=coordinates["lng"])
    return initiative
def initiatives(self) -> Generator[InitiativeImport, None, None]:
    self.category_dict = self.get_category_dict()
    url = self.config.get_api_list_url()
    page = self.get(url)

    try:
        result = page.json()
        for feature in result['features']:
            initiative = InitiativeImport(
                name=feature['properties']['title'],
                description=feature['properties']['description'],
                group=self.config.group,
                source=self.config.url,
                source_id=feature['properties']['id'],
                source_uri=feature['properties']['url'].replace('/api/v3', ""),
                longitude=feature['geometry']['geometries'][0]['coordinates'][0],
                latitude=feature['geometry']['geometries'][0]['coordinates'][1]
            )
            yield initiative
    except Exception as ex:
        msg = f"Error reading contents from {url}"
        raise ScrapeException(msg) from ex
def complete(self, initiative: InitiativeImport):
    initiative_url = self.config.get_initiative_url(initiative.source_id)
    # This already raises ScrapeExceptions
    detail = self.get(initiative_url)

    try:
        soup = BeautifulSoup(detail.content, 'html.parser')
        table = soup.find("dl")
        records = table.findAll(["dd", "dt"])
        initiative.description = soup.find("p").text.strip('\t\n\r ')
        initiative.group = self.config.group
        initiative.source = initiative_url

        set_count = self.extract_details_table(initiative, records)

        if self.config.group == InitiativeGroup.DEMAND:
            title = soup.find("h2", "result__title")
            initiative.name = title.contents[0]

        h5_node_organization = soup.find("h5", text="Aangesloten bij:")
        if h5_node_organization:
            initiative.organiser = h5_node_organization.find_next_sibling().get_text(strip=True)
        else:
            h5_node_person = soup.find("h5", text="Geplaatst door:")
            if h5_node_person:
                initiative.organiser = h5_node_person.find_next_sibling().get_text(strip=True)

        if not initiative.location:
            self.try_alternative_place(soup, initiative)
    except Exception as ex:
        msg = f"Error reading contents from {initiative_url}"
        raise ScrapeException(msg) from ex

    if set_count == 0:
        raise ScrapeException("Failed to load field map details table")