class LobbyistsIndexScraper(BaseScraper):
    """Scrapes the list of lobbyist ids from the knesset lobbyists page html.

    Returns a list of lobbyist ids - doesn't store anything in db.
    """

    LOBBYISTS_INDEX_PAGE_URL = 'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx'

    def __init__(self):
        # Fix: was super(...).__init__(self), which passed the instance as an
        # extra positional argument; the other scrapers in this file call
        # __init__() with no arguments.
        super(LobbyistsIndexScraper, self).__init__()
        self.source = UrlSource(self.LOBBYISTS_INDEX_PAGE_URL)
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        """Store every all-digit lobbyist_id attribute value found in *soup*.

        Non-numeric ids are skipped; logs the number of ids kept.
        """
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter += 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        """Fetch and parse the index page, then store the lobbyist ids.

        Sends a chat notification and re-raises on fetch/parse failure.
        """
        try:
            html = self.source.fetch()
            soup = BeautifulSoup(html)
        except Exception:
            send_chat_notification(
                __file__, 'failed to fetch or parse the lobbyists index page',
                {'url': self.LOBBYISTS_INDEX_PAGE_URL})
            # bare raise preserves the original traceback (raise e resets it)
            raise
        return self._storeLobbyistIdsFromSoup(soup)
class LobbyistScraper(BaseScraper):
    """Given a lobbyist id, fetches that lobbyist's data from the knesset api."""

    def __init__(self):
        super(LobbyistScraper, self).__init__()
        view_url = 'http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)'
        self.source = UrlSource(view_url)
        self.storage = LobbyistScraperDictStorage()

    def _storeLobbyistDataFromSoup(self, soup):
        """Extract the lobbyist fields from the api xml soup and store them."""
        def tag_text(tag):
            # every field is read the same way: find the tag, strip its text
            return soup.find(tag).text.strip()

        lobbyist_id = tag_text('d:lobbyist_id')
        self._getLogger().info('got lobbyist id "%s"', lobbyist_id)
        # stored key -> xml tag it is read from
        # (note: 'd:lobyst_permit_type' is the api's own spelling)
        field_tags = (
            ('first_name', 'd:first_name'),
            ('family_name', 'd:family_name'),
            ('profession', 'd:profession'),
            ('corporation_name', 'd:corporation_name'),
            ('corporation_id', 'd:corporation_id'),
            ('faction_member', 'd:faction_member'),
            ('faction_name', 'd:faction_name'),
            ('permit_type', 'd:lobyst_permit_type'),
        )
        lobbyist = {'id': lobbyist_id}
        for key, tag in field_tags:
            lobbyist[key] = tag_text(tag)
        self.storage.storeDict(lobbyist)
        self._getLogger().debug(lobbyist)

    def _scrape(self, lobbyist_id):
        page = self.source.fetch(lobbyist_id)
        return self._storeLobbyistDataFromSoup(BeautifulSoup(page))
class LobbyistsIndexScraper(BaseScraper):
    """Scrapes the list of lobbyist ids from the knesset lobbyists page html.

    Returns a list of lobbyist ids - doesn't store anything in db.
    """

    LOBBYISTS_INDEX_PAGE_URL = 'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx'

    def __init__(self):
        # Fix: was super(...).__init__(self), which passed the instance as an
        # extra positional argument; sibling scrapers call __init__() bare.
        super(LobbyistsIndexScraper, self).__init__()
        self.source = UrlSource(self.LOBBYISTS_INDEX_PAGE_URL)
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        """Store every all-digit lobbyist_id attribute value found in *soup*."""
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter += 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        """Fetch and parse the index page; notify and re-raise on failure."""
        try:
            html = self.source.fetch()
            soup = BeautifulSoup(html)
        except Exception:
            send_chat_notification(__file__,
                                   'failed to fetch or parse the lobbyists index page',
                                   {'url': self.LOBBYISTS_INDEX_PAGE_URL})
            # bare raise preserves the original traceback (raise e resets it)
            raise
        return self._storeLobbyistIdsFromSoup(soup)
class LobbyistsIndexScraper(BaseScraper):
    """Scrapes the list of lobbyist ids from the knesset lobbyists page html.

    Returns a list of lobbyist ids - doesn't store anything in db.
    """

    def __init__(self):
        # Fix: was super(...).__init__(self); the extra self argument is not
        # how the other scrapers in this file invoke the base initializer.
        super(LobbyistsIndexScraper, self).__init__()
        self.source = UrlSource('http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        """Store every all-digit lobbyist_id attribute value found in *soup*."""
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter += 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        """Fetch the index page, parse it, and store the lobbyist ids."""
        html = self.source.fetch()
        soup = BeautifulSoup(html)
        return self._storeLobbyistIdsFromSoup(soup)
class LobbyistScraper(BaseScraper):
    """Given a lobbyist id, fetches that lobbyist's data from the knesset api."""

    def __init__(self):
        super(LobbyistScraper, self).__init__()
        self.source = UrlSource(
            'http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)'
        )
        self.storage = LobbyistScraperDictStorage()

    def _storeLobbyistDataFromSoup(self, soup):
        """Extract the lobbyist fields from the api xml soup and store them."""
        def tag_text(tag):
            # all fields share the same access pattern
            return soup.find(tag).text.strip()

        lobbyist_id = tag_text('d:lobbyist_id')
        self._getLogger().info('got lobbyist id "%s"', lobbyist_id)
        # stored key -> xml tag ('d:lobyst_permit_type' is the api's spelling)
        record = {'id': lobbyist_id}
        for key, tag in (('first_name', 'd:first_name'),
                         ('family_name', 'd:family_name'),
                         ('profession', 'd:profession'),
                         ('corporation_name', 'd:corporation_name'),
                         ('corporation_id', 'd:corporation_id'),
                         ('faction_member', 'd:faction_member'),
                         ('faction_name', 'd:faction_name'),
                         ('permit_type', 'd:lobyst_permit_type')):
            record[key] = tag_text(tag)
        self.storage.storeDict(record)
        self._getLogger().debug(record)

    def _scrape(self, lobbyist_id):
        page = self.source.fetch(lobbyist_id)
        return self._storeLobbyistDataFromSoup(BeautifulSoup(page))
class LobbyistsIndexScraper(BaseScraper):
    """Scrapes the list of lobbyist ids from the knesset lobbyists page html.

    Returns a list of lobbyist ids - doesn't store anything in db.
    """

    def __init__(self):
        # Fix: was super(...).__init__(self); sibling scrapers call
        # the base initializer with no arguments.
        super(LobbyistsIndexScraper, self).__init__()
        self.source = UrlSource(
            'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
        self.storage = ListStorage()

    def _storeLobbyistIdsFromSoup(self, soup):
        """Store every all-digit lobbyist_id attribute value found in *soup*."""
        elts = soup.findAll(lobbyist_id=True)
        counter = 0
        for elt in elts:
            lobbyist_id = elt.get('lobbyist_id')
            if lobbyist_id.isdigit():
                self.storage.store(lobbyist_id)
                self._getLogger().debug(lobbyist_id)
                counter += 1
        self._getLogger().info('got %s lobbyists', str(counter))

    def _scrape(self):
        """Fetch the index page, parse it, and store the lobbyist ids."""
        html = self.source.fetch()
        soup = BeautifulSoup(html)
        return self._storeLobbyistIdsFromSoup(soup)
class LobbyistRepresentScraper(BaseScraper):
    """Given a lobbyist id, returns the LobbyistRepresent entries for that lobbyist."""

    def __init__(self):
        # Fix: was super(...).__init__(self), which passed the instance as an
        # extra positional argument; LobbyistScraper calls __init__() bare.
        super(LobbyistRepresentScraper, self).__init__()
        self.source = UrlSource(
            'http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)/lobbist_type'
        )
        self.storage = LobbyistRepresentListStorage()

    def _storeLobbyistRepresentDataFromSoup(self, soup, lobbyist_id):
        """Store one represent dict per <content> element in the api response."""
        self._getLogger().info('got lobbyist represent (lobbyist id "%s")',
                               lobbyist_id)
        for elt in soup.findAll('content'):
            represent = {
                'id': elt.find('d:lobbyist_represent_id').text.strip(),
                'lobbyist_id': elt.find('d:lobbyist_id').text.strip(),
                'name': elt.find('d:lobbyist_represent_name').text.strip(),
                'domain': elt.find('d:lobbyist_represent_domain').text.strip(),
                'type': elt.find('d:lobbyist_represent_type').text.strip(),
            }
            self._getLogger().debug(represent)
            self.storage.store(represent)

    def _scrape(self, lobbyist_id):
        """Fetch and parse the represent view for *lobbyist_id*, then store it."""
        html = self.source.fetch(lobbyist_id)
        soup = BeautifulSoup(html)
        return self._storeLobbyistRepresentDataFromSoup(soup, lobbyist_id)
class LobbyistRepresentScraper(BaseScraper):
    """Given a lobbyist id, returns the LobbyistRepresent entries for that lobbyist."""

    def __init__(self):
        # Fix: was super(...).__init__(self); the base initializer is called
        # with no arguments elsewhere in this file.
        super(LobbyistRepresentScraper, self).__init__()
        self.source = UrlSource('http://online.knesset.gov.il/WsinternetSps/KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)/lobbist_type')
        self.storage = LobbyistRepresentListStorage()

    def _storeLobbyistRepresentDataFromSoup(self, soup, lobbyist_id):
        """Store one represent dict per <content> element in the api response."""
        self._getLogger().info('got lobbyist represent (lobbyist id "%s")', lobbyist_id)
        for elt in soup.findAll('content'):
            represent = {
                'id': elt.find('d:lobbyist_represent_id').text.strip(),
                'lobbyist_id': elt.find('d:lobbyist_id').text.strip(),
                'name': elt.find('d:lobbyist_represent_name').text.strip(),
                'domain': elt.find('d:lobbyist_represent_domain').text.strip(),
                'type': elt.find('d:lobbyist_represent_type').text.strip(),
            }
            self._getLogger().debug(represent)
            self.storage.store(represent)

    def _scrape(self, lobbyist_id):
        """Fetch and parse the represent view for *lobbyist_id*, then store it."""
        html = self.source.fetch(lobbyist_id)
        soup = BeautifulSoup(html)
        return self._storeLobbyistRepresentDataFromSoup(soup, lobbyist_id)
def __init__(self):
    """Set up the single-lobbyist api view as the source and dict storage."""
    super(LobbyistScraper, self).__init__()
    # adjacent string literals concatenate into the exact original url
    view_url = ('http://online.knesset.gov.il/WsinternetSps/'
                'KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)')
    self.source = UrlSource(view_url)
    self.storage = LobbyistScraperDictStorage()
def __init__(self):
    """Set up the lobbyists index page as the source and list storage."""
    # Fix: was super(...).__init__(self), which passed the instance as an
    # extra positional argument; other scrapers call __init__() bare.
    super(LobbyistsIndexScraper, self).__init__()
    self.source = UrlSource(self.LOBBYISTS_INDEX_PAGE_URL)
    self.storage = ListStorage()
def __init__(self):
    """Set up the lobbyists index page as the source and list storage."""
    # Fix: was super(...).__init__(self), which passed the instance as an
    # extra positional argument; other scrapers call __init__() bare.
    super(LobbyistsIndexScraper, self).__init__()
    self.source = UrlSource('http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
    self.storage = ListStorage()
def __init__(self):
    """Set up the single-lobbyist api view as the source and dict storage."""
    super(LobbyistScraper, self).__init__()
    # adjacent string literals concatenate into the exact original url
    lobbyist_view = ('http://online.knesset.gov.il/WsinternetSps/'
                     'KnessetDataService/LobbyistData.svc/View_lobbyist(<<id>>)')
    self.source = UrlSource(lobbyist_view)
    self.storage = LobbyistScraperDictStorage()
def __init__(self):
    """Set up the lobbyists index page as the source and list storage."""
    # Fix: was super(...).__init__(self), which passed the instance as an
    # extra positional argument; other scrapers call __init__() bare.
    super(LobbyistsIndexScraper, self).__init__()
    self.source = UrlSource(
        'http://www.knesset.gov.il/lobbyist/heb/lobbyist.aspx')
    self.storage = ListStorage()