class LobbyistsIndexScraper(BaseScraper):
    """Scrapes the list of lobbyist ids from the knesset lobbyists page html.

    Returns a list of lobbyist ids - doesn't store anything in db.
    """

    LOBBYISTS_INDEX_PAGE_URL = 'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx'

    def __init__(self):
        # Fix: was super(...).__init__(self), which passed the instance as an
        # extra positional argument; the other scrapers in this file call
        # __init__() with no arguments.
        super(LobbyistsIndexScraper, self).__init__()
        self.source = UrlSource(self.LOBBYISTS_INDEX_PAGE_URL)
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        """Store every all-digit lobbyist_id attribute value found in *soup*.

        Non-numeric ids are skipped; logs the number of ids kept.
        """
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter += 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        """Fetch and parse the index page, then store the lobbyist ids.

        Sends a chat notification and re-raises on fetch/parse failure.
        """
        try:
            html = self.source.fetch()
            soup = BeautifulSoup(html)
        except Exception:
            send_chat_notification(
                __file__, 'failed to fetch or parse the lobbyists index page',
                {'url': self.LOBBYISTS_INDEX_PAGE_URL})
            # bare raise preserves the original traceback (raise e resets it)
            raise
        return self._storeLobbyistIdsFromSoup(soup)
class LobbyistScraper(BaseScraper):
    """Given a lobbyist id, fetches that lobbyist's data from the knesset api."""

    def __init__(self):
        super(LobbyistScraper, self).__init__()
        view_url = 'http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)'
        self.source = UrlSource(view_url)
        self.storage = LobbyistScraperDictStorage()

    def _storeLobbyistDataFromSoup(self, soup):
        """Extract the lobbyist fields from the api xml soup and store them."""
        def tag_text(tag):
            # every field is read the same way: find the tag, strip its text
            return soup.find(tag).text.strip()

        lobbyist_id = tag_text('d:lobbyist_id')
        self._getLogger().info('got lobbyist id "%s"', lobbyist_id)
        # stored key -> xml tag it is read from
        # (note: 'd:lobyst_permit_type' is the api's own spelling)
        field_tags = (
            ('first_name', 'd:first_name'),
            ('family_name', 'd:family_name'),
            ('profession', 'd:profession'),
            ('corporation_name', 'd:corporation_name'),
            ('corporation_id', 'd:corporation_id'),
            ('faction_member', 'd:faction_member'),
            ('faction_name', 'd:faction_name'),
            ('permit_type', 'd:lobyst_permit_type'),
        )
        lobbyist = {'id': lobbyist_id}
        for key, tag in field_tags:
            lobbyist[key] = tag_text(tag)
        self.storage.storeDict(lobbyist)
        self._getLogger().debug(lobbyist)

    def _scrape(self, lobbyist_id):
        page = self.source.fetch(lobbyist_id)
        return self._storeLobbyistDataFromSoup(BeautifulSoup(page))
class LobbyistsIndexScraper(BaseScraper):
    """Scrapes the list of lobbyist ids from the knesset lobbyists page html.

    Returns a list of lobbyist ids - doesn't store anything in db.
    """

    LOBBYISTS_INDEX_PAGE_URL = 'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx'

    def __init__(self):
        # Fix: was super(...).__init__(self), which passed the instance as an
        # extra positional argument; sibling scrapers call __init__() bare.
        super(LobbyistsIndexScraper, self).__init__()
        self.source = UrlSource(self.LOBBYISTS_INDEX_PAGE_URL)
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        """Store every all-digit lobbyist_id attribute value found in *soup*."""
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter += 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        """Fetch and parse the index page; notify and re-raise on failure."""
        try:
            html = self.source.fetch()
            soup = BeautifulSoup(html)
        except Exception:
            send_chat_notification(__file__,
                                   'failed to fetch or parse the lobbyists index page',
                                   {'url': self.LOBBYISTS_INDEX_PAGE_URL})
            # bare raise preserves the original traceback (raise e resets it)
            raise
        return self._storeLobbyistIdsFromSoup(soup)
class LobbyistsIndexScraper(BaseScraper):
    """Scrapes the list of lobbyist ids from the knesset lobbyists page html.

    Returns a list of lobbyist ids - doesn't store anything in db.
    """

    def __init__(self):
        # Fix: was super(...).__init__(self); the extra self argument is not
        # how the other scrapers in this file invoke the base initializer.
        super(LobbyistsIndexScraper, self).__init__()
        self.source = UrlSource('http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        """Store every all-digit lobbyist_id attribute value found in *soup*."""
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter += 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        """Fetch the index page, parse it, and store the lobbyist ids."""
        html = self.source.fetch()
        soup = BeautifulSoup(html)
        return self._storeLobbyistIdsFromSoup(soup)
class LobbyistScraper(BaseScraper):
    """Given a lobbyist id, fetches that lobbyist's data from the knesset api."""

    def __init__(self):
        super(LobbyistScraper, self).__init__()
        self.source = UrlSource(
            'http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)'
        )
        self.storage = LobbyistScraperDictStorage()

    def _storeLobbyistDataFromSoup(self, soup):
        """Extract the lobbyist fields from the api xml soup and store them."""
        def tag_text(tag):
            # all fields share the same access pattern
            return soup.find(tag).text.strip()

        lobbyist_id = tag_text('d:lobbyist_id')
        self._getLogger().info('got lobbyist id "%s"', lobbyist_id)
        # stored key -> xml tag ('d:lobyst_permit_type' is the api's spelling)
        record = {'id': lobbyist_id}
        for key, tag in (('first_name', 'd:first_name'),
                         ('family_name', 'd:family_name'),
                         ('profession', 'd:profession'),
                         ('corporation_name', 'd:corporation_name'),
                         ('corporation_id', 'd:corporation_id'),
                         ('faction_member', 'd:faction_member'),
                         ('faction_name', 'd:faction_name'),
                         ('permit_type', 'd:lobyst_permit_type')):
            record[key] = tag_text(tag)
        self.storage.storeDict(record)
        self._getLogger().debug(record)

    def _scrape(self, lobbyist_id):
        page = self.source.fetch(lobbyist_id)
        return self._storeLobbyistDataFromSoup(BeautifulSoup(page))
class LobbyistsIndexScraper(BaseScraper):
    """Scrapes the list of lobbyist ids from the knesset lobbyists page html.

    Returns a list of lobbyist ids - doesn't store anything in db.
    """

    def __init__(self):
        # Fix: was super(...).__init__(self); sibling scrapers call
        # the base initializer with no arguments.
        super(LobbyistsIndexScraper, self).__init__()
        self.source = UrlSource(
            'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        """Store every all-digit lobbyist_id attribute value found in *soup*."""
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter += 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        """Fetch the index page, parse it, and store the lobbyist ids."""
        html = self.source.fetch()
        soup = BeautifulSoup(html)
        return self._storeLobbyistIdsFromSoup(soup)
class LobbyistRepresentScraper(BaseScraper):
    """Given a lobbyist id, returns the LobbyistRepresent entries for that lobbyist."""

    def __init__(self):
        # Fix: was super(...).__init__(self), which passed the instance as an
        # extra positional argument; LobbyistScraper calls __init__() bare.
        super(LobbyistRepresentScraper, self).__init__()
        self.source = UrlSource(
            'http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)/lobbist_type'
        )
        self.storage = LobbyistRepresentListStorage()

    def _storeLobbyistRepresentDataFromSoup(self, soup, lobbyist_id):
        """Store one represent dict per <content> element in the api response."""
        self._getLogger().info('got lobbyist represent (lobbyist id "%s")',
                               lobbyist_id)
        for elt in soup.findAll('content'):
            represent = {
                'id': elt.find('d:lobbyist_represent_id').text.strip(),
                'lobbyist_id': elt.find('d:lobbyist_id').text.strip(),
                'name': elt.find('d:lobbyist_represent_name').text.strip(),
                'domain': elt.find('d:lobbyist_represent_domain').text.strip(),
                'type': elt.find('d:lobbyist_represent_type').text.strip(),
            }
            self._getLogger().debug(represent)
            self.storage.store(represent)

    def _scrape(self, lobbyist_id):
        """Fetch and parse the represent view for *lobbyist_id*, then store it."""
        html = self.source.fetch(lobbyist_id)
        soup = BeautifulSoup(html)
        return self._storeLobbyistRepresentDataFromSoup(soup, lobbyist_id)
class LobbyistRepresentScraper(BaseScraper):
    """Given a lobbyist id, returns the LobbyistRepresent entries for that lobbyist."""

    def __init__(self):
        # Fix: was super(...).__init__(self); the base initializer is called
        # with no arguments elsewhere in this file.
        super(LobbyistRepresentScraper, self).__init__()
        self.source = UrlSource('http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)/lobbist_type')
        self.storage = LobbyistRepresentListStorage()

    def _storeLobbyistRepresentDataFromSoup(self, soup, lobbyist_id):
        """Store one represent dict per <content> element in the api response."""
        self._getLogger().info('got lobbyist represent (lobbyist id "%s")', lobbyist_id)
        for elt in soup.findAll('content'):
            represent = {
                'id': elt.find('d:lobbyist_represent_id').text.strip(),
                'lobbyist_id': elt.find('d:lobbyist_id').text.strip(),
                'name': elt.find('d:lobbyist_represent_name').text.strip(),
                'domain': elt.find('d:lobbyist_represent_domain').text.strip(),
                'type': elt.find('d:lobbyist_represent_type').text.strip(),
            }
            self._getLogger().debug(represent)
            self.storage.store(represent)

    def _scrape(self, lobbyist_id):
        """Fetch and parse the represent view for *lobbyist_id*, then store it."""
        html = self.source.fetch(lobbyist_id)
        soup = BeautifulSoup(html)
        return self._storeLobbyistRepresentDataFromSoup(soup, lobbyist_id)
def __init__(self):
    """Set up the single-lobbyist api view as the source and dict storage."""
    super(LobbyistScraper, self).__init__()
    # adjacent string literals concatenate into the exact original url
    view_url = ('http://online.knesset.gov.il/WsinternetSps/'
                'KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)')
    self.source = UrlSource(view_url)
    self.storage = LobbyistScraperDictStorage()
def __init__(self):
    """Set up the lobbyists index page as the source and list storage."""
    # Fix: was super(...).__init__(self), which passed the instance as an
    # extra positional argument; other scrapers call __init__() bare.
    super(LobbyistsIndexScraper, self).__init__()
    self.source = UrlSource(self.LOBBYISTS_INDEX_PAGE_URL)
    self.storage = ListStorage()
def __init__(self):
    """Set up the lobbyists index page as the source and list storage."""
    # Fix: was super(...).__init__(self), which passed the instance as an
    # extra positional argument; other scrapers call __init__() bare.
    super(LobbyistsIndexScraper, self).__init__()
    self.source = UrlSource('http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
    self.storage = ListStorage()
def __init__(self):
    """Set up the single-lobbyist api view as the source and dict storage."""
    super(LobbyistScraper, self).__init__()
    # adjacent string literals concatenate into the exact original url
    lobbyist_view = ('http://online.knesset.gov.il/WsinternetSps/'
                     'KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)')
    self.source = UrlSource(lobbyist_view)
    self.storage = LobbyistScraperDictStorage()
def __init__(self):
    """Set up the lobbyists index page as the source and list storage."""
    # Fix: was super(...).__init__(self), which passed the instance as an
    # extra positional argument; other scrapers call __init__() bare.
    super(LobbyistsIndexScraper, self).__init__()
    self.source = UrlSource(
        'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
    self.storage = ListStorage()