Python getfirst示例，vpapi.getfirst Python示例

示例#1

0

显示文件

文件： georgia_scraper.py 项目： opendatakosovo/parldata-scraper

    def get_id(self, collection, identifier, type=None):
        """Look up a stored document and return its id.

        Organizations queried without a truthy ``type`` are matched on their
        ``name``; every other lookup matches ``identifier`` against the
        entries of the document's ``identifiers`` list.

        Returns the ``id`` of the first match, or the string "Not found" when
        the API returns nothing.
        """
        # NOTE: ``type`` shadows the builtin, but it is part of the signature.
        if collection == "organizations" and not type:
            query = {'name': identifier}
        else:
            query = {'identifiers': {'$elemMatch': {'identifier': identifier}}}

        match = vpapi.getfirst(collection, where=query)
        return match['id'] if match else "Not found"

示例#2

0

显示文件

文件： moldova_scraper.py 项目： opendatakosovo/parldata-scraper

    def scrape_membership(self):
        """Build the chamber membership documents for Moldova's parliament.

        Returns a list holding one document, as produced by
        ``build_memberships_doc``, for every entry of ``self.mps_list()``.
        """
        print("\n\tScraping chambers membership from Moldova's parliament...")
        mps = self.mps_list()
        role_by_label = self.membership_correction()

        # Map the first identifier of every stored person to that person's id.
        person_ids = {}
        for person in vpapi.getall("people"):
            person_ids[person['identifiers'][0]['identifier']] = person['id']

        chamber_query = {
            "identifiers": {
                "$elemMatch": {"identifier": "20", "scheme": "parlament.md"}
            }
        }
        chamber = vpapi.getfirst("organizations", where=chamber_query)
        source_url = ("http://www.parlament.md/StructuraParlamentului/"
                      "Deputies/tabid/87/language/ro-RO/Default.aspx")

        documents = []
        for mp in mps:
            person_id = person_ids[mp['identifier']]
            # The correction table is keyed by the UTF-8 encoded label.
            role = role_by_label[mp['membership'].encode('utf-8')]
            documents.append(self.build_memberships_doc(
                person_id, chamber['id'], mp['membership'], role, source_url))
        print("\n\tScraping completed! \n\tScraped " + str(len(documents)) + " members of chambers \n")
        return documents

示例#3

0

显示文件

文件： belarus_lowerhouse_scraper.py 项目： opendatakosovo/parldata-scraper

    def scrape_chamber(self):
        """Create or update every chamber returned by ``parser.chambers()``.

        Each chamber is turned into an organization document, looked up by its
        first identifier and then POSTed (when absent) or PUT (when present).

        Returns the list of documents that were sent.
        Raises Exception when a response's ``_status`` is not "OK".
        """
        print("\n\tScraping chambers from Belarus Lowerhouse parliament...")
        chambers = parser.chambers()
        source_url = "http://house.gov.by/index.php/,10087,,,,2,,,0.html"
        sent = []
        for identifier in chambers:
            details = chambers[identifier]
            doc = self.build_organization_doc("chamber", details['name'], identifier,
                                              details['start_date'], details['end_date'],
                                              source_url, "", "")
            # Chamber "2" is stored without a dissolution date.
            if identifier == "2":
                del doc['dissolution_date']
            for unused_key in ('contact_details', 'parent_id'):
                del doc[unused_key]

            lookup = {'identifiers': {'$elemMatch': doc['identifiers'][0]}}
            stored = vpapi.getfirst("organizations", where=lookup)
            if stored:
                # PUT rather than PATCH, so properties that no longer exist are removed.
                resp = vpapi.put("organizations", stored['id'], doc, effective_date=self.effective_date())
            else:
                resp = vpapi.post("organizations", doc)
            if resp["_status"] != "OK":
                raise Exception("Invalid status code")
            sent.append(doc)
        print("\n\tScraping completed! \n\tScraped " + str(len(sent)) + " chambers")
        return sent

示例#4

0

显示文件

文件： armenia_scraper.py 项目： opendatakosovo/parldata-scraper

    def scrape_chamber(self):
        """Create or update the chambers listed on Armenia's deputies page.

        Every ``<option>`` of the ``show_session`` select is a candidate; the
        ones whose value contains "100" are skipped. The remaining ones are
        POSTed when no organization with the same first identifier exists and
        PUT otherwise.

        Returns the list of documents that were sent.
        Raises Exception when a response's ``_status`` is not "OK".
        """
        index_url = "http://www.parliament.am/deputies.php?sel=ful&ord=photo&show_session=5&lang=arm&enc=utf8"
        soup = scrape.download_html_file(index_url)
        print("\n\tScraping chambers from Armenia's parliament...\n")
        progress = ProgressBar(widgets=[
            '        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
            ' ', ETA(), " - Processed: ", Counter(), ' items             '])
        options = soup.find("select", {"name": "show_session"}).findAll("option")

        sent = []
        for option in progress(options):
            identifier = option.get('value')
            name = option.get_text()
            chamber_url = "http://www.parliament.am/deputies.php?lang=arm&sel=&ord=&show_session=" + identifier
            if "100" in identifier:
                continue

            term = self.terms[identifier]
            doc = self.build_organization_doc("chamber", name, identifier, term["start_date"],
                                              term["end_date"], chamber_url, "", "")
            for unused_key in ('contact_details', 'parent_id'):
                del doc[unused_key]
            # Chamber "5" is stored without a dissolution date.
            if identifier == "5":
                del doc['dissolution_date']

            lookup = {'identifiers': {'$elemMatch': doc['identifiers'][0]}}
            stored = vpapi.getfirst("organizations", where=lookup)
            if stored:
                resp = vpapi.put("organizations", stored['id'], doc, effective_date=self.effective_date())
            else:
                resp = vpapi.post("organizations", doc)
            if resp["_status"] != "OK":
                raise Exception("Invalid status code")
            sent.append(doc)
        print("\n\tScraping completed! \n\tScraped " + str(len(sent)) + " chambers")
        return sent

示例#5

0

显示文件

文件： armenia_scraper.py 项目： opendatakosovo/parldata-scraper

    def scrape_parliamentary_group_membership(self):
        """Build parliamentary group membership documents for Armenia's parliament.

        For every term in ``self.terms`` the factions page is downloaded. Each
        faction is matched to a stored organization by name and parent chamber,
        and each listed deputy is matched to a stored person by the name
        rebuilt as "first middle last".

        A membership is only produced when both the faction and the person
        were found; unmatched factions and rows are skipped.

        Returns the list of documents built by ``build_memberships_doc``.
        """
        print("\n\tScraping parliamentary groups membership from Armenia's parliament...\n")
        memberships = self.membership_correction()

        # Chamber id keyed by the chamber's first identifier.
        chambers = {}
        for chamber in vpapi.getall("organizations", where={"classification": "chamber"}):
            chambers[chamber['identifiers'][0]["identifier"]] = chamber['id']

        # Person id keyed by the person's full name.
        members = {}
        for member in vpapi.getall("people"):
            members[member['name']] = member['id']

        parties_membership = []

        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), " - Processed: ", Counter(), ' items             ']
        pbar = ProgressBar(widgets=widgets)
        for term in pbar(sorted(self.terms.keys(), reverse=True)):
            url = "http://www.parliament.am/deputies.php?lang=arm&sel=factions&SubscribeEmail=&show_session=" + str(term)
            soup = scrape.download_html_file(url)
            for each_div in soup.findAll('div', {"class": "content"}):
                party_name = each_div.find("center").find("b").get_text()
                party_name_ordered = party_name.replace("  ", " ")
                exist = vpapi.getfirst("organizations", where={'name': party_name_ordered,
                                                               "parent_id": chambers[str(term)]})
                if not exist:
                    # Without this guard ``o_id`` would be unbound (NameError) or
                    # still hold the previous faction's id, filing these members
                    # under the wrong organization.
                    continue
                o_id = exist['id']

                table = each_div.find('table', {"style": "margin-top:10px; margin-bottom:10px;"})
                for each_tr in table.findAll('tr'):
                    # Rows carrying a bgcolor attribute are not member rows.
                    if each_tr.has_attr('bgcolor'):
                        continue

                    td_array = each_tr.findAll('td')
                    # NOTE(review): assumes the link text has at least three
                    # space-separated parts in "last first middle" order.
                    names = td_array[0].find('a').get_text().split(' ')
                    first_name = names[1]
                    last_name = names[0]
                    middle_name = names[2]
                    name_ordered = "%s %s %s" % (first_name, middle_name, last_name)
                    if name_ordered not in members:
                        # Same hazard as above for ``p_id``: skip unknown people
                        # instead of reusing the previous row's person.
                        continue
                    p_id = members[name_ordered]

                    membership = each_tr.find('span', {'class': "news_date"}).get_text()
                    if membership == "":
                        membership = "անդամ".decode('utf-8')
                    else:
                        # Drop the first and last character of the label.
                        membership = membership[1:len(membership)-1]

                    role = memberships[membership.encode('utf-8')]
                    party_membership_json = self.build_memberships_doc(p_id, o_id, membership, role, url)
                    parties_membership.append(party_membership_json)
        print("\n\tScraping completed! \n\tScraped " + str(len(parties_membership)) + " members of parliamentary groups")
        return parties_membership

示例#6

0

显示文件

文件： ukraine_scraper.py 项目： opendatakosovo/parldata-scraper

 def get_index(self, collection, sort, vote_events):
     """Return the position in ``vote_events`` just past an already stored event.

     The reference event is the first document the API returns for
     ``collection`` under ``sort``. Returns 0 when there is no such document.
     Raises StopIteration when no entry of ``vote_events`` carries the
     reference event's identifier.
     """
     reference = vpapi.getfirst(collection, sort=sort)
     if not reference:
         return 0
     matching_positions = (
         position
         for position, event in enumerate(vote_events)
         if event["identifier"] == reference["identifier"]
     )
     return next(matching_positions) + 1

示例#7

0

显示文件

文件： speeches.py 项目： KohoVolit/scraper-psp.cz

def extract_person_idendifier(text):
    """Return the person id referenced by a ``/sqw/detail.sqw?id=N`` link.

    Only the first such link in *text* is considered. Returns ``None`` when
    *text* contains no link, or when the ``people`` collection has no
    document with that id.
    """
    # Raw string avoids the invalid "\?" escape; the dot is escaped so that
    # only the literal path "detail.sqw" matches.
    match = re.search(r'/sqw/detail\.sqw\?id=([0-9]+)', text)
    if match is None:
        return None
    person_id = match.group(1)
    if vpapi.getfirst('people', where={'id': person_id}):
        return person_id
    return None

示例#8

0

显示文件

	def scrape_chamber_changes_and_save(term):
		"""Scrape list of changes of memberships in the parliament chamber
		and save (or update) the respective memberships.
		If an MP referred by the membership does not exist, scrape and save him/her.

		`term` is passed to `parse.change_list` and `get_chamber_id` and is
		compared against the strings '3' and '4' for two special cases below.

		Raises RuntimeError when a change has a kind ('zmena') that is not
		handled here.
		"""
		change_list = parse.change_list(term)
		oid = get_chamber_id(term)

		# Changes are processed in the reverse of the order the parser returns
		# them (presumably oldest first - verify against the source listing).
		for change in reversed(change_list['_items']):
			logging.info('Scraping mandate change of `%s` at %s' % (change['poslanec']['meno'], change['dátum']))

			# if MP is not scraped yet, scrape and save him
			# (only the `id` field of an existing person is requested)
			existing = vpapi.getfirst('people',
				where={'identifiers': {'$elemMatch': {'identifier': change['poslanec']['id'], 'scheme': 'nrsr.sk'}}},
				projection={'id': 1})
			if existing:
				pid = existing['id']
			else:
				p = Person.scrape(change['poslanec']['id'], term)
				pid = p.save()

			# create or update the membership
			m = Membership()
			m.label = 'Poslanec Národnej rady SR'
			m.role = 'member'
			m.person_id = pid
			m.organization_id = oid
			m.sources = [{
				'url': change_list['url'],
				'note': 'Zmeny v poslaneckom zbore na webe NRSR'
			}]

			# Mandate becomes active: save a membership starting at the change date.
			if change['zmena'] in ('Mandát vykonávaný (aktívny poslanec)', 'Mandát náhradníka vykonávaný'):
				m.start_date = sk_to_utc(change['dátum'])
				m.save()
				# close previous membership of Izák, Jaroslav (9. 9. 2008 - 20. 5. 2009 - 12. 6. 2010)
				if term == '4' and change['poslanec']['meno'] == 'Izák, Jaroslav' and change['dátum'] == '20. 5. 2009':
					del m.start_date
					# NOTE(review): end_date gets the raw source date here, while the
					# branch below converts it with sk_to_utc - confirm this is intended.
					m.end_date = change['dátum']
					m.save()
			# Mandate ends: save the membership with the change date as its end date.
			elif change['zmena'] in ('Mandát zaniknutý', 'Mandát sa neuplatňuje', 'Mandát náhradníka zaniknutý'):
				m.end_date = sk_to_utc(change['dátum'])
				# only close an existing membership (counterexample: Érsek, Árpád, 27. 9. 2010 - 10. 3. 2012)
				existing_only = True
				# except an inaccuracy in source data for Šimko, Ivan (15. 10. 2002 - 15. 10. 2002)
				if term == '3' and change['poslanec']['meno'] == 'Šimko, Ivan':
					existing_only = False
				m.save(existing_only)
			# These change kinds are deliberately ignored; nothing is saved for them.
			elif change['zmena'] in ('Mandát nadobudnutý vo voľbách', 'Mandát náhradníka získaný'):
				pass
			else:
				raise RuntimeError("unknown change '%s' of a membership in chamber" % change['zmena'])

		logging.info('Scraped %s mandate changes' % len(change_list['_items']))

示例#9

0

显示文件

	def save(self):
		"""Store this person through the API and return the resulting id.

		The instance's ``__dict__`` is the payload. It is POSTed when no stored
		person matches the first entry of ``self.identifiers`` and PUT over the
		matching person otherwise.

		Raises Exception (carrying the name and the response) when the
		response's ``_status`` is not 'OK'.
		"""
		payload = self.__dict__
		lookup = {'identifiers': {'$elemMatch': self.identifiers[0]}}
		stored = vpapi.getfirst('people', where=lookup)
		if stored:
			# PUT rather than PATCH, so properties that no longer exist are removed.
			resp = vpapi.put('people', stored['id'], payload, effective_date=effective_date)
		else:
			resp = vpapi.post('people', payload)

		if resp['_status'] == 'OK':
			return resp['id']
		raise Exception(self.name, resp)

示例#10

0

显示文件

文件： base.py 项目： epforgpl/parldata-scrapers-poland-hungary-montenegro

    def get_or_create(self, endpoint, item, refresh=False, where_keys=None):
        """Create ``item`` at ``endpoint`` or replace the matching stored document.

        The lookup filter is built from ``where_keys`` when given; otherwise it
        depends on the endpoint:

        * ``memberships`` - person, organization and start date (or the
          absence of a start date), newest start date first;
        * ``motions`` / ``speeches`` - URL of the first source;
        * ``vote-events`` - ``motion_id`` when present, else ``start_date``;
        * ``votes`` - vote event and voter;
        * ``events`` - ``identifier``;
        * anything else - the first entry of ``identifiers``.

        When ``refresh`` is true the document is fetched again after saving
        (with ``votes`` embedded for vote events).

        Returns the API response with an added ``_created`` boolean.
        Raises Exception carrying the response when ``_status`` is not 'OK'.
        """
        sort = []
        embed = []
        if where_keys:
            where = {key: item[key] for key in where_keys}
        elif endpoint == 'memberships':
            where = {
                'person_id': item['person_id'],
                'organization_id': item['organization_id'],
                'start_date': item.get('start_date', {"$exists": False}),
            }
            sort = [('start_date', -1)]
        elif endpoint in ('motions', 'speeches'):
            where = {'sources.url': item['sources'][0]['url']}
        elif endpoint == 'vote-events':
            embed = ['votes']
            if 'motion_id' in item:
                where = {'motion_id': item['motion_id']}
            else:
                where = {'start_date': item['start_date']}
        elif endpoint == 'votes':
            where = {
                'vote_event_id': item['vote_event_id'],
                'voter_id': item['voter_id'],
            }
        elif endpoint == 'events':
            where = {'identifier': item['identifier']}
        else:
            where = {
                'identifiers': {'$elemMatch': item['identifiers'][0]}}

        existing = vpapi.getfirst(endpoint, where=where, sort=sort)
        created = not existing
        if created:
            resp = vpapi.post(endpoint, item)
        else:
            resp = vpapi.put("%s/%s" % (endpoint, existing['id']), item)

        # Check the status before reading ``_links`` so that a failed request
        # raises with the API response rather than a KeyError from logging.
        if resp['_status'] != 'OK':
            raise Exception(resp)

        href = resp['_links']['self']['href']
        if created:
            self.log('Created %s' % href, DEBUG)
        else:
            self.log('Updated %s' % href, DEBUG)

        if refresh:
            resp = vpapi.get(href, sort=sort, embed=embed)
        resp['_created'] = created
        return resp

示例#11

0

显示文件

文件： belarus_lowerhouse_parser.py 项目： opendatakosovo/parldata-scraper

    def parliamentary_group_membership(self):
        """Build membership records for the Belarus lower-house parliamentary group.

        The group page is downloaded, the group is looked up by the page's
        ``<h1>`` text and every table row after the header is matched to a
        stored person by ``sort_name`` ("Last, First").

        Rows whose person cannot be found are skipped. An empty list is
        returned when the group itself is not stored.

        Returns a list of dicts with ``organization_id``, ``person_id``,
        ``url``, ``membership`` and ``role``.
        """
        party_membership_list = []
        roles = self.membership_correction()
        group = self.parliamentary_groups()
        url = group['url']
        soup = scrape.download_html_file(url)
        group_name = soup.find("h1").get_text()
        existing_party = vpapi.getfirst("organizations", where={"name": group_name})
        if not existing_party:
            return party_membership_list

        for each_tr in soup.find("table", {"width": "595"}).findAll('tr')[1:]:
            td_array = each_tr.findAll('td')
            name = td_array[1].get_text().strip()
            if "кіраўнік групы" in name.encode('utf-8'):
                # Strip the group-leader marker and the four bytes left after it.
                name = name.encode('utf-8').replace("кіраўнік групы", "").strip()
                name = name[:len(name) - 4]
                membership = "кіраўнік групы".decode('utf-8')
            else:
                membership = "Член".decode('utf-8')
            names = name.split(" ")
            first_name = names[1]
            last_name = names[0]
            name_ordered = last_name + ", " + first_name
            existing = vpapi.getfirst("people", where={'sort_name': name_ordered})
            if not existing:
                # Without this guard ``p_id`` would be unbound (NameError) or
                # still hold the previous row's person, producing a membership
                # for the wrong MP.
                continue
            p_id = existing['id']

            if existing_party['id'] and p_id:
                party_membership_json = {
                    "organization_id": existing_party['id'],
                    "person_id": p_id,
                    "url": url,
                    "membership": membership,
                    "role": roles[membership.encode('utf-8')]
                }
                party_membership_list.append(party_membership_json)
        return party_membership_list

示例#12

0

显示文件

def get_or_create(resource, item, key=None):
	"""Return ``(id, created)`` for ``item`` in ``resource``.

	The item is looked up by the fields named in ``key`` (all fields of
	the item when ``key`` is None). An existing match yields its id and
	False; otherwise the item is POSTed and the new id is returned with
	True.
	"""
	fields = item.keys() if key is None else key
	query = {}
	for field in fields:
		query[field] = item[field]

	match = vpapi.getfirst(resource, where=query)
	if not match:
		created = vpapi.post(resource, item)
		return created['id'], True
	return match['id'], False

示例#13

0

显示文件

文件： belarus_lowerhouse_parser.py 项目： opendatakosovo/parldata-scraper

 def parliamentary_groups(self):
     """Return basic data about the parliamentary group published on house.gov.by.

     The result is a dict with the page's ``<h1>`` text as ``name``, the page
     ``url``, the ``identifier`` cut out of that URL and the id of the stored
     chamber ("2" / "house.by") as ``parent_id``.
     """
     url = "http://house.gov.by/index.php/,17543,,,,2,,,0.html"
     # The identifier is the text between "/," and ",,,,2" in the URL.
     start = url.index("/,") + 2
     identifier = url[start:url.index(",,,,2")]

     chamber_query = {'identifiers': {'$elemMatch': {"identifier": "2", "scheme": "house.by"}}}
     chamber = vpapi.getfirst("organizations", where=chamber_query)
     heading = scrape.download_html_file(url).find('h1')
     return {
         "name": heading.get_text(),
         "url": url,
         "identifier": identifier,
         "parent_id": chamber['id']
     }

示例#14

0

显示文件

文件： moldova_scraper.py 项目： opendatakosovo/parldata-scraper

    def scrape_committee(self):
        """Build organization documents for Moldova's parliamentary committees.

        Every committee from ``self.committee_list()`` has its page downloaded
        to read the contact e-mail and phone. ``contact_details`` is kept (with
        the phone appended) only when both an e-mail and a phone number were
        found; otherwise it is removed from the document.

        Returns the list of committee documents.
        """
        print("\n\tScraping parliamentary committees from Moldova's parliament...")
        committees = self.committee_list()
        chamber = vpapi.getfirst("organizations", where={
            "identifiers": {
                "$elemMatch": {"identifier": "20", "scheme": "parlament.md"}
            }
        })
        contacts_id = "dnn_ctr486_ViewCommissionPermanent_ctrlViewCommissionType_lblCommissionContacts"
        progress = ProgressBar(widgets=[
            '        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
            ' ', ETA(), " - Processed: ", Counter(), ' items             '])

        documents = []
        for committee in progress(committees):
            soup = scrape.download_html_file(committee['url'])
            contacts = soup.find("span", {"id": contacts_id})
            email_tag = contacts.find('a')
            phone_text = contacts.find('p').get_text()

            # The first six characters of the paragraph are dropped
            # (presumably a label in front of the number - verify on the page).
            phone_number = phone_text[6:].strip() if phone_text.strip() != "" else None
            email = email_tag.get_text() if email_tag else None

            # NOTE(review): the classification is spelled "committe" - confirm
            # whether build_organization_doc expects exactly this value.
            doc = self.build_organization_doc("committe", committee['name'], committee['identifier'],
                                              "", "", committee['url'], email, chamber['id'])
            del doc['founding_date']
            del doc['dissolution_date']

            if email and phone_number:
                doc['contact_details'].append({
                    "label": "Tel.",
                    "type": "tel",
                    "value": phone_number
                })
            else:
                del doc['contact_details']
            documents.append(doc)
        print("\n\tScraping completed! \n\tScraped " + str(len(documents)) + " committees")
        return documents

示例#15

0

显示文件

文件： moldova_scraper.py 项目： opendatakosovo/parldata-scraper

    def scrape_chamber(self):
        """Create or update the chambers listed on Moldova's parliament history page.

        Each non-empty link inside the ``LocalizedContent`` div is a chamber.
        Its Roman numeral is cut out of the link text, translated to the term
        identifier and used to read the dates from ``self.terms``. The
        document is POSTed when no organization has the same first identifier
        and PUT otherwise.

        Returns the list of documents that were sent.
        Raises Exception when a response's ``_status`` is not "OK".
        """
        index_url = "http://www.parlament.md/Parlamentarismul%C3%AEnRepublicaMoldova/" \
                    "Istorie%C8%99ievolu%C8%9Bie/tabid/96/language/ro-RO/Default.aspx"
        roman_to_term = {"XII": "12", "XIII": "13", "XIV": "14", "XV": "15", "XVI": "16", "XVII": "17",
                         "XVIII": "18", "XIX": "19", "XX": "20"}
        soup = scrape.download_html_file(index_url)
        print("\n\tScraping chambers from Moldova's parliament...")
        progress = ProgressBar(widgets=[
            '        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
            ' ', ETA(), " - Processed: ", Counter(), ' items             '])
        links = soup.find('div', {"class": "LocalizedContent"}).findAll('a')

        sent = []
        for link in progress(links):
            name = link.get_text().strip()
            if name == "":
                continue

            chamber_url = "http://www.parlament.md" + link.get('href')
            if "(" in name:
                # Numeral sits between the first "X" and the opening parenthesis.
                roman = name[name.index('X'):name.index('(')].replace('-a', "").strip()
            else:
                # Numeral sits in the three characters before the last three.
                roman = name[-6:len(name)-3].strip()
            identifier = roman_to_term[roman]
            founding_date = self.terms[identifier]['start_date']
            dissolution_date = self.terms[identifier]['end_date']

            doc = self.build_organization_doc("chamber", name, identifier, founding_date,
                                              dissolution_date, chamber_url, "", "")
            for unused_key in ('contact_details', 'parent_id'):
                del doc[unused_key]
            # Chamber "20" is stored without a dissolution date.
            if identifier == "20":
                del doc['dissolution_date']

            lookup = {'identifiers': {'$elemMatch': doc['identifiers'][0]}}
            stored = vpapi.getfirst("organizations", where=lookup)
            if stored:
                # PUT rather than PATCH, so properties that no longer exist are removed.
                resp = vpapi.put("organizations", stored['id'], doc, effective_date=self.effective_date())
            else:
                resp = vpapi.post("organizations", doc)
            if resp["_status"] != "OK":
                raise Exception("Invalid status code")
            sent.append(doc)
        print("\n\tScraping completed! \n\tScraped " + str(len(sent)) + " chambers")
        return sent

示例#16

0

显示文件

文件： belarus_lowerhouse_scraper.py 项目： opendatakosovo/parldata-scraper

 def scrape_membership(self):
     """Build chamber membership documents for the Belarus lower house.

     Every entry of ``parser.mps_list()`` is matched to a stored person by
     identifier and attached to the stored chamber ("2" / "house.by").

     Returns the list of documents built by ``build_memberships_doc``.
     """
     print("\n\tScraping membership's data from Belarus Lowerhouese parliament...")
     mp_list = parser.mps_list()
     source_url = "http://house.gov.by/index.php/,17041,,,,2,,,0.html"

     # Person id keyed by the person's first identifier.
     person_ids = {}
     for person in vpapi.getall("people"):
         person_ids[person['identifiers'][0]['identifier']] = person['id']

     chamber_query = {"identifiers": {"$elemMatch": {"identifier": "2", "scheme": "house.by"}}}
     chamber = vpapi.getfirst("organizations", where=chamber_query)

     documents = []
     for mp in mp_list:
         documents.append(self.build_memberships_doc(
             person_ids[mp['identifier']], chamber['id'], mp['membership'], mp['role'], source_url))
     print("\n\tScraping completed! \n\tScraped " + str(len(documents)) + " members")
     return documents

示例#17

0

显示文件

文件： belarus_lowerhouse_parser.py 项目： opendatakosovo/parldata-scraper

 def committee_list(self):
     """Return basic data for every committee linked from the committees page.

     Each entry is a dict with the link text as ``name``, the link target as
     ``url``, the ``identifier`` cut out of that target and the id of the
     stored chamber ("2" / "house.by") as ``parent_id``.
     """
     index_url = "http://house.gov.by/index.php/,17052,,,,2,,,0.html"
     soup = scrape.download_html_file(index_url)
     chamber_query = {'identifiers': {'$elemMatch': {"identifier": "2", "scheme": "house.by"}}}
     chamber = vpapi.getfirst("organizations", where=chamber_query)

     committees = []
     for block in soup.findAll('div', {"style": "margin-left:0px; padding-bottom: 1px;"}):
         anchor = block.find('a')
         name = anchor.get_text().strip()
         link = anchor.get('href')
         # The identifier is the text between "/," and ",,,,2" in the link.
         start = link.index("/,") + 2
         end = link.index(",,,,2")
         committees.append({
             "name": name,
             "url": link,
             "identifier": link[start:end],
             "parent_id": chamber['id']
         })
     return committees

示例#18

0

显示文件

文件： belarus_upperhouse_scraper.py 项目： opendatakosovo/parldata-scraper

    def scrape_committee_members(self):
        """Build committee membership documents for the Belarus upper house.

        Entries of ``parser.committee_membership()`` are matched to a stored
        person (by identifier) and a stored committee (by name and parent).
        Entries for which either lookup fails are left out.

        Returns the list of documents built by ``build_memberships_doc``.
        """
        print("\n\tScraping committee groups from Belarus Upperhouse parliament...\n")

        # Person id keyed by first identifier; the first person seen wins.
        person_ids = {}
        for person in vpapi.getall("people"):
            if person['identifiers'][0]['identifier'] not in person_ids:
                person_ids[person['identifiers'][0]['identifier']] = person['id']

        candidates = parser.committee_membership()
        progress = ProgressBar(widgets=[
            '        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
            ' ', ETA(), " - Processed: ", Counter(), ' items             '])

        print("\n\tProcessing members of committee groups from Belarus Upperhouse parliament...\n")
        documents = []
        for entry in progress(candidates):
            p_id = person_ids.get(entry['identifier'])
            committee = vpapi.getfirst("organizations", where={"name": entry['committee_name'],
                                                               "parent_id": entry['committee_parent_id']})
            o_id = committee['id'] if committee else None

            if not (p_id and o_id):
                continue
            documents.append(self.build_memberships_doc(p_id, o_id, entry['membership'],
                                                        entry['role'], entry['url']))
        print("\n\tScraping completed! \n\tScraped " + str(len(documents)) + " members")
        return documents

示例#19

0

显示文件

文件： moldova_scraper.py 项目： opendatakosovo/parldata-scraper

 def scrape_parliamentary_groups(self):
     """Build organization documents for Moldova's parliamentary groups.

     Every entry of ``self.parliamentary_group_list()`` becomes a
     "parliamentary group" document founded at the start of term "20" and
     parented to the stored chamber ("20" / "parlament.md"). The
     ``contact_details`` and ``dissolution_date`` fields are removed.

     Returns the list of documents.
     """
     chamber_query = {
         "identifiers": {
             "$elemMatch": {"identifier": "20", "scheme": "parlament.md"}
         }
     }
     chamber = vpapi.getfirst("organizations", where=chamber_query)
     groups = self.parliamentary_group_list()
     print("\n\tScraping parliamentary groups from Moldova's parliament...")
     progress = ProgressBar(widgets=[
         '        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
         ' ', ETA(), " - Processed: ", Counter(), ' items             '])

     documents = []
     for group in progress(groups):
         founding_date = self.terms["20"]["start_date"]
         doc = self.build_organization_doc("parliamentary group", group['name'],
                                           group['identifier'], founding_date, "",
                                           group['url'], "", chamber['id'])
         for unused_key in ('contact_details', 'dissolution_date'):
             del doc[unused_key]
         documents.append(doc)
     return documents

示例#20

0

显示文件

def scrape_new_debates(term):
	"""Scrape and save speeches from debates of the given term, one
	of those newer terms where transcripts of debates are published
	in parts assigned to individual speakers.

	Session and sitting events are created (or reused) as needed and
	speeches are posted to the API one sitting at a time.

	Returns number of scraped speeches.
	"""
	# maps the kind of a debate part as given by the source to a speech type
	debate_part_kinds = {
		'Uvádzajúci uvádza bod': 'speech',
		'Vstup predsedajúceho': 'speech',
		'Vystúpenie spoločného spravodajcu': 'speech',
		'Vystúpenie': 'speech',
		'Vystúpenie v rozprave': 'speech',
		'Vystúpenie s faktickou poznámkou': 'speech',
		'Vystúpenie s procedurálnym návrhom': 'speech',
		'Prednesenie otázky': 'question',
		'Zodpovedanie otázky': 'answer',
		'Doplňujúca otázka / reakcia zadávajúceho': 'question',
		'Prednesenie interpelácie': 'question',
		'Odpoveď na interpeláciu': 'answer',
		'scene': 'scene'
	}

	def insert_speech(kind):
		"""Insert a speech entity for the given debate part kind
		and data from parent scope variables and update end date
		of the corresponding session and sitting. Delete `text`
		variable."""
		nonlocal text, last_speech_enddatetime
		if not text: return
		speech = {
			# square brackets are replaced by parentheses in the stored text
			'text': text.strip().replace('[', '(').replace(']', ')'),
			'date': start_datetime,
			'type': debate_part_kinds.get(kind, 'speech'),
			'position': len(speeches) + 1,
			'event_id': sitting_id,
			'sources' : [{
				'url': dpart_url,
				'note': 'Prepis časti debaty na webe NRSR'
			}]
		}
		if dpart_video:
			speech['video'] = dpart_video
		if kind != 'scene':
			speech['creator_id'] = speaker_id
			speech['attribution_text'] = attribution.strip()
		speeches.append(speech)
		text = ''

		# extend the stored end dates when this debate part ends later
		if end_datetime > session_end_date:
			vpapi.patch('events', session_id, {'end_date': end_datetime})
		if end_datetime > sitting_end_date:
			vpapi.patch('events', sitting_id, {'end_date': end_datetime})
		last_speech_enddatetime = datetime.strptime(end_datetime, '%Y-%m-%dT%H:%M:%S')

	logging.info('Scraping debates of term `%s`' % term)
	chamber_id = get_chamber_id(term)

	# prepare mapping from MP's name to id
	people = vpapi.getall('people', projection={'name': 1})
	mps = {mp['name']: mp['id'] for mp in people}

	# load name corrections
	with open(os.path.join(CONF_DIR, 'name_corrections.json'), encoding='utf8') as f:
		name_corrections = json.load(f)

	# scraping will start since the most recent sitting start date
	last_sitting = vpapi.getfirst('events',
		where={'type': 'sitting', 'organization_id': chamber_id},
		sort='-start_date')
	since_date = last_sitting['start_date'][:10] if last_sitting else None

	# scrape list of debate parts
	debate_parts = parse.new_debates_list(term, since_date)

	speech_count = 0
	session_name = ''
	speeches = []
	for dp in debate_parts['_items']:
		# stop at very recent debate parts (may be incomplete)
		start_datetime = sk_to_utc('%s %s' % (dp['dátum'], dp['trvanie']['od']))
		sd = datetime.strptime(start_datetime, '%Y-%m-%dT%H:%M:%S')
		if datetime.utcnow() - sd < timedelta(days=5):
			break

		# skip already scraped debate parts
		existing = vpapi.getfirst('speeches', where={'sources.url': dp['prepis']['url']})
		if existing: continue

		logging.info('Scraping debate part %s %s-%s (id=%s)' %
			(dp['dátum'], dp['trvanie']['od'], dp['trvanie']['do'], dp['prepis']['id']))
		dpart = parse.debate_of_terms56(dp['prepis']['id'])
		if not dpart['riadky']: continue

		end_datetime = sk_to_utc('%s %s' % (dp['dátum'], dp['trvanie']['do']))
		dpart_kind = dp['druh']
		dpart_url = dp['prepis']['url']
		dpart_video = dp['video']['url'] if 'video' in dp else None

		if not session_name.startswith('%s. ' % dp['schôdza']):
			# create new session event
			session_name = '%s. schôdza' % dp['schôdza']
			session = {
				'name': session_name,
				'identifier': dp['schôdza'],
				'organization_id': chamber_id,
				'type': 'session',
				'start_date': start_datetime,
				'end_date': end_datetime,
			}
			key = ('organization_id', 'type', 'identifier')
			session_id, _ = get_or_create('events', session, key)
			session_end_date = end_datetime

			# find the last moment of the last sitting of this session
			session_last_sitting = vpapi.getfirst('events',
				where={'type': 'sitting', 'parent_id': session_id},
				sort='-start_date')
			if session_last_sitting:
				last_speech_enddatetime = datetime.strptime(session_last_sitting['end_date'], '%Y-%m-%dT%H:%M:%S')
				sitting_identifier = session_last_sitting['identifier']
				sitting_id = session_last_sitting['id']
				sitting_end_date = session_last_sitting['end_date']
			else:
				# no sitting yet; datetime.min makes the gap test below
				# succeed so that the first sitting gets created
				last_speech_enddatetime = datetime.min
				sitting_identifier = '0'

		# a gap of more than 5 hours since the last speech starts a new sitting
		if sd - last_speech_enddatetime > timedelta(hours=5):
			# create new sitting event
			sitting_identifier = str(int(sitting_identifier) + 1)
			sitting_name = '%s. deň rokovania, %s' % (sitting_identifier, dp['dátum'])
			sitting = {
				'name': sitting_name,
				'identifier': sitting_identifier,
				'organization_id': chamber_id,
				'type': 'sitting',
				'start_date': start_datetime,
				'end_date': end_datetime,
				'parent_id': session_id,
			}
			key = ('parent_id', 'type', 'identifier')
			sitting_id, _ = get_or_create('events', sitting, key)
			sitting_end_date = end_datetime

			# save speeches of the previous sitting
			if len(speeches) > 0:
				vpapi.post('speeches', speeches)
				speech_count += len(speeches)
			if dp != debate_parts['_items'][0]:
				logging.info('Scraped %s speeches from previous sitting' % len(speeches))
			speeches = []

		# add the first speaker name that is sometimes missing
		first_speaker = '<strong>%s, %s</strong>' % (dp['osoba']['meno'], dp['osoba']['funkcia'])
		dpart['riadky'].insert(0, first_speaker)

		# extract speeches from the debate part
		text = ''
		within_scene = False
		for par in dpart['riadky']:
			if not par: continue
			par = par.replace('\n', ' ').strip()

			# skip eventual speech number
			if re.match(r'^(\d+)\.$', par): continue

			# convert brackets to parentheses
			par = re.sub(r'\[(.*?)\]', r'(\1)', par)
			# convert all inner nested parentheses to brackets
			# NOTE(review): the second group `(\.*?)` matches only literal
			# dots, so only inner `(...)`-like groups are converted; confirm
			# whether `(.*?)` was intended before changing it.
			n = 1
			while n >= 1:
				(par, n) = re.subn(r'\((.*?)\((\.*?)\)(.*?)\)', r'(\1[\2]\3)', par, flags=re.DOTALL)

			# process eventual multiparagraph scene
			if par.startswith('(') and par.count('(') > par.count(')'):
				# save eventual previous speech
				insert_speech(dpart_kind)

				text = '<p>%s</p>' % lxml.html.fromstring(par[1:]).text_content()
				within_scene = True
				continue
			if within_scene:
				if par.endswith(')') and par.count(')') > par.count('('):
					text += '\n\n<p>%s</p>' % lxml.html.fromstring(par[:-1]).text_content()
					insert_speech('scene')
					within_scene = False
				else:
					text += '\n\n<p>%s</p>' % lxml.html.fromstring(par).text_content()
				continue

			# process eventual new speaker
			# format `Doe, John, foreign minister`
			speech_start_pattern = r'<strong>(\w+), (\w+\.?)( (\w+\.?))?, (.*)</strong>'
			sp = re.match(speech_start_pattern, par, re.DOTALL)
			if sp:
				# save eventual previous speech
				insert_speech(dpart_kind)

				# identify speaker
				name = '%s %s' % (sp.group(2), sp.group(1))
				if (sp.group(4)):
					name = name.replace(' ', ' %s ' % sp.group(4))
				attribution = sp.group(5)
				if name in name_corrections:
					name = name_corrections[name]
				if len(name) == 0: continue
				speaker_id = mps.get(name)

				# create unknown speakers
				if not speaker_id:
					logging.warning('Speaker `%s, %s` not found, creating new Person' % (name, attribution))
					name_parts = re.match(r'(\w+\.?)( (\w+\.?))? (\w+)', name)
					person = {
						'name': name,
						'family_name': name_parts.group(4),
						'given_name': name_parts.group(1)
					}
					person['sort_name'] = '%s, %s' % (person['family_name'], person['given_name'])
					if name_parts.group(3):
						person['additional_name'] = name_parts.group(3)
						person['sort_name'] += ' %s' % person['additional_name']
					resp = vpapi.post('people', person)
					speaker_id = resp['id']
					# remember the new person for later speeches
					mps[name] = speaker_id
				continue

			# remove HTML tags
			par = lxml.html.fromstring(par).text_content()

			# process eventual scene in this paragraph
			scene_pattern = r'(.*?)\(\s*([\d%s][^\(\)]{2,}[\.?!“])\s*\)(.*)$' % scrapeutils.CS_UPPERS
			while True:
				scene = re.match(scene_pattern, par, re.DOTALL)
				if not scene: break
				# text before the scene still belongs to the running speech
				if scene.group(1):
					text += '\n\n<p>%s</p>' % scene.group(1).strip()
				insert_speech(dpart_kind)
				text = '<p>%s</p>' % scene.group(2).strip()
				insert_speech('scene')
				# continue with the rest of the paragraph after the scene
				par = scene.group(3)

			if par:
				text += '\n\n<p>%s</p>' % par

		insert_speech(dpart_kind)

	# save speeches of the last sitting
	if len(speeches) > 0:
		vpapi.post('speeches', speeches)
	logging.info('Scraped %s speeches' % len(speeches))
	speech_count += len(speeches)

	logging.info('Scraped %s speeches in total' % speech_count)
	return speech_count

示例#21

0

显示文件

def get_chamber_id(term):
	"""Look up the chamber organization of the given term and return its id.

	The chamber is searched by classification `chamber` and by an
	identifier equal to `term` in the `nrsr.sk` scheme. Returns None
	when no such organization is found.
	"""
	term_identifier = {'identifier': term, 'scheme': 'nrsr.sk'}
	found = vpapi.getfirst('organizations', where={
		'classification': 'chamber',
		'identifiers': {'$elemMatch': term_identifier}})
	if not found:
		return None
	return found['id']

示例#22

0

显示文件

def scrape_motions(term):
	"""Scrape and store the motions of the given term that are not in
	the API yet, proceeding from the oldest unscraped session.

	For every scraped motion detail page one Motion, one VoteEvent and
	the individual Votes are posted. If any of those inserts fails, the
	already inserted motion and vote event are deleted again and the
	error is re-raised.

	Returns number of scraped motions.
	"""
	motion_url_template = 'http://www.nrsr.sk/web/Default.aspx?sid=schodze/hlasovanie/hlasklub&ID=%s'
	source_note = 'Hlasovanie na webe NRSR'
	# API option -> key of the summary counts on the source page
	count_labels = {
		'yes': '[z] za',
		'no': '[p] proti',
		'abstain': '[?] zdržalo sa',
		'absent': '[0] neprítomní',
		'not voting': '[n] nehlasovalo'
	}
	# source vote code -> API option
	vote_options = {
		'z': 'yes',
		'p': 'no',
		'?': 'abstain',
		'n': 'not voting',
		'0': 'absent'
	}

	logging.info('Scraping motions of term `%s`' % term)

	# prepare mappings from source identifier to id for MPs and parliamentary groups
	chamber_id = get_chamber_id(term)

	mps = {}
	for person in vpapi.getall('people', projection={'identifiers': 1}):
		if 'identifiers' in person:
			mps[person['identifiers'][0]['identifier']] = person['id']

	parl_groups = {}
	for org in vpapi.getall('organizations', where={'classification': 'parliamentary group', 'parent_id': chamber_id}):
		parl_groups[org['name']] = org['id']

	# register differently spelled names of parliamentary groups as aliases
	group_corrections = {
		'2': {
			'Klub HZDS': 'Klub ĽS-HZDS',
			'Klub SMK': 'Klub SMK-MKP',
			'Klub Nezávislí': 'Klub Nezávislý',
		},
		'3': {
			'Klub HZDS': 'Klub ĽS-HZDS',
			'Klub SDKÚ': 'Klub SDKÚ-DS',
			'Klub Smer': 'Klub SMER-SD',
			'Klub Smer-SD': 'Klub SMER-SD',
			'Klub KNP': 'Klub nezávislých poslancov NR SR',
			'Klub Nezávislý': 'Klub nezávislých poslancov NR SR',
		},
	}
	for alias, canonical in group_corrections.get(term, {}).items():
		parl_groups[alias] = parl_groups[canonical]

	# collect sessions until the first one whose last motion is already stored
	pending_sessions = []
	for listed_session in parse.session_list(term)['_items']:
		session_motions = parse.session(listed_session['číslo'], term)
		if len(session_motions['_items']) == 0: continue
		last_id = session_motions['_items'][-1]['id']
		if vpapi.getfirst('motions', where={'sources.url': motion_url_template % last_id}):
			break
		pending_sessions.append((listed_session, session_motions))

	# scrape motions from those sessions in reverse order of collection
	scraped_motions_count = 0
	for session_info, session_motions in reversed(pending_sessions):
		logging.info('Scraping session `%s`' % session_info['názov'])

		# insert the session event unless it already exists
		session = {
			'name': session_info['názov'],
			'identifier': session_info['číslo'],
			'organization_id': chamber_id,
			'type': 'session',
		}
		try:
			day_start = sk_to_utc(session_info['trvanie']) + 'T00:00:00'
		except ValueError:
			# multiday session contains votes; dates are set by debates scraping
			pass
		else:
			session['start_date'] = day_start
			session['end_date'] = day_start
		session_id, _ = get_or_create('events', session, ('organization_id', 'type', 'identifier'))

		total = len(session_motions['_items'])
		for position, m in enumerate(session_motions['_items'], 1):
			# skip motions that are already present; the URL is rebuilt from
			# the result link because m['url']['kluby'] is not always present
			source_id = re.search(r'ID=(\d+)', m['url']['výsledok']).group(1)
			if vpapi.getfirst('motions', where={'sources.url': motion_url_template % source_id}):
				continue

			motion_id = None
			vote_event_id = None
			try:
				# insert motion
				logging.info('Scraping motion %s of %s (voted at %s)' % (position, total, m['dátum']))
				parsed_motion = parse.motion(m['id'])
				has_result = 'výsledok' in parsed_motion
				motion = {
					'organization_id': chamber_id,
					'legislative_session_id': session_id,
					'identifier': parsed_motion['číslo'],
					'text': parsed_motion['názov'],
					'date': sk_to_utc(m['dátum']),
					'sources': [{
						'url': parsed_motion['url'],
						'note': source_note
					}],
				}
				if has_result:
					if parsed_motion['výsledok'] == 'Návrh prešiel':
						motion['result'] = 'pass'
					else:
						motion['result'] = 'fail'
				motion_id = vpapi.post('motions', motion)['id']

				# insert vote event
				vote_event = {
					'motion_id': motion_id,
					'organization_id': chamber_id,
					'legislative_session_id': session_id,
					'identifier': parsed_motion['číslo'],
					'start_date': motion['date'],
					'sources': [{
						'url': parsed_motion['url'],
						'note': source_note
					}],
				}
				if has_result:
					vote_event['result'] = motion['result']
				if 'súčty' in parsed_motion:
					tallies = parsed_motion['súčty']
					counts = [
						{'option': option, 'value': int(tallies[label])}
						for option, label in count_labels.items() if tallies[label] != ''
					]
					# counts are stored only when at least one is filled in
					if counts:
						vote_event['counts'] = counts
				vote_event_id = vpapi.post('vote-events', vote_event)['id']

				# insert votes
				if 'hlasy' in parsed_motion and len(parsed_motion['hlasy']) > 0:
					votes = []
					for ballot in parsed_motion['hlasy']:
						# MPs not applying their mandate are left out
						if ballot['hlas'] != '-':
							group_name = normalize_parlgroup_name(ballot['klub'])
							votes.append({
								'vote_event_id': vote_event_id,
								'option': vote_options[ballot['hlas']],
								'voter_id': mps.get(ballot['id']),
								'group_id': parl_groups.get(group_name),
							})
					if votes:
						vpapi.post('votes', votes)

			# delete incomplete data if insertion of the motion, vote event or votes failed
			except BaseException:
				if motion_id:
					vpapi.delete('motions', motion_id)
				if vote_event_id:
					vpapi.delete('vote-events', vote_event_id)
				raise

			scraped_motions_count += 1

	logging.info('Scraped %s motions of term `%s`' % (scraped_motions_count, term))
	return scraped_motions_count

示例#23

0

显示文件

	def scrape_from_group_and_save(group_type, id, term):
		"""Scrape the memberships of one group and save (or update) them.

		The group itself and any member that is not stored yet are
		scraped and saved first. Afterwards every still open membership
		of the group whose `updated_at` is older than ten minutes is
		closed with the day before `effective_date`.
		"""
		group = parse.group(group_type, id)

		# reuse the stored organization or scrape and save it
		stored_group = vpapi.getfirst('organizations',
			where={
				'classification': group_type,
				'identifiers': {'$elemMatch': {'identifier': id, 'scheme': 'nrsr.sk'}}},
			projection={'id': 1})
		oid = stored_group['id'] if stored_group else Organization.scrape(group_type, id).save()

		# role names as given by the source -> role stored in the membership
		roles = {
			'člen': 'member',
			'členka': 'member',
			'predseda': 'chairman',
			'predsedníčka': 'chairwoman',
			'podpredseda': 'vice-chairman',
			'podpredsedníčka': 'vice-chairwoman',
			'vedúci': 'chairman',
			'vedúca': 'chairwoman',
			'náhradník': 'substitute',
			'náhradníčka': 'substitute',
			'overovateľ': 'verifier',
			'overovateľka': 'verifier',
			'poverený vedením klubu': 'chairman',
			'podpredseda poverený vedením výboru': 'vice-chairman',
			'náhradný člen': 'substitute',
			'náhradná členka': 'substitute',
		}

		for member in group['členovia']:
			logging.info('Scraping membership of `%s`' % member['meno'])

			# reuse the stored person or scrape and save him/her
			stored_person = vpapi.getfirst('people',
				where={'identifiers': {'$elemMatch': {'identifier': member['id'], 'scheme': 'nrsr.sk'}}},
				projection={'id': 1})
			pid = stored_person['id'] if stored_person else Person.scrape(member['id'], term).save()

			membership = Membership()
			membership.person_id = pid
			membership.organization_id = oid
			membership.sources = [{
				'url': group['url'],
				'note': 'Profil na webe NRSR'
			}]
			# one save per period; period specific attributes are reset afterwards
			for period in member['obdobia']:
				role_name = period.get('rola')
				if role_name:
					membership.label = period['rola'].capitalize() + ' v skupine ' + group['názov']
					membership.role = roles[period['rola'].lower()]
				else:
					membership.label = 'V skupine ' + group['názov']
				since = period.get('od')
				if since:
					membership.start_date = sk_to_utc(since)
				until = period.get('do')
				if until:
					membership.end_date = sk_to_utc(until)
				membership.save()
				for period_attr in ('role', 'start_date', 'end_date'):
					if hasattr(membership, period_attr):
						delattr(membership, period_attr)
		logging.info('Scraped %s memberships' % len(group['členovia']))

		# close all open memberships in this group that were not updated
		logging.info('Closing not updated open memberships')
		cutoff = datetime.utcnow() - timedelta(minutes=10)
		no_end_date = [{'end_date': {'$exists': False}}, {'end_date': {'$in': [None, '']}}]
		stale_open = vpapi.getall('memberships', where={
			'organization_id': oid,
			'$or': no_end_date,
			'updated_at': {'$lt': cutoff.isoformat()}
		})
		for stale in stale_open:
			vpapi.patch('memberships', stale['id'], {'end_date': datestring_add(effective_date, -1)})

示例#24

0

显示文件

文件： run.py 项目： opendatakosovo/parldata-scraper

def _new_progress_bar():
    """Return a fresh ProgressBar with the widgets used by all posting loops."""
    widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
               ' ', ETA(), " - Processed: ", Counter(), ' items             ']
    return ProgressBar(widgets=widgets)


def _post_or_update(collection, where_condition, json_doc):
    """Post json_doc to collection, or replace the stored document if it differs.

    The stored document is looked up with where_condition. Nothing is sent
    when it already equals json_doc (ignoring the server-side fields).
    Raises Exception when the API response status is not "OK".
    """
    existing = vpapi.getfirst(collection, where=where_condition)
    if not existing:
        resp = vpapi.post(collection, json_doc)
    else:
        json_obj_id = existing['id']
        # strip server-side fields so the comparison covers scraped data only
        for server_field in ("created_at", "updated_at", "_links", "id"):
            del existing[server_field]
        if json.loads(json.dumps(json_doc)) == existing:
            return
        # update by PUT is preferred over PATCH to correctly remove properties that no longer exist now
        resp = vpapi.put(collection, json_obj_id, json_doc, effective_date=effective_date)
    if resp["_status"] != "OK":
        raise Exception("Invalid status code")


def scrape(countries, people, votes):
    """Scrape the selected parliaments and post the data to the Visegrad+ API.

    countries -- "all" or a comma separated list of parliament names;
                 names without a scraper are ignored
    people    -- "yes" to scrape and post people, organizations and memberships
    votes     -- "yes" to scrape and post voting data (available for
                 ukraine and georgia only)

    Sets the module level `effective_date` to today's date.
    """
    global effective_date
    effective_date = date.today().isoformat()

    # execute MP's bio data.
    georgia = georgia_scraper.GeorgiaScraper()
    armenia = armenia_scraper.ArmeniaScraper()
    ukraine = ukraine_scraper.UkraineScraper()
    belarus_lowerhouse = belarus_lowerhouse_scraper.BelarusLowerhouseScraper()
    belarus_upperhouse = belarus_upperhouse_scraper.BelarusUpperhouseScraper()
    moldova = moldova_scraper.MoldovaScraper()
    references = {"georgia": georgia, "armenia": armenia, "ukraine": ukraine,
                  "belarus-lowerhouse": belarus_lowerhouse, "moldova": moldova,
                  "belarus-upperhouse": belarus_upperhouse}
    if countries == "all":
        countries_array = list(references)
    else:
        # keep only names that have a scraper; the former list.pop(list)
        # call raised TypeError as soon as one invalid name was given
        countries_array = [name.strip() for name in countries.split(',')
                           if name.strip().lower() in references]
    with open(os.path.join(BASE_DIR, 'access.json')) as f:
        creds = json.load(f)
    if len(countries_array) > 0:
        for item in sorted(countries_array):
            country = item.lower()
            scraper = references[country]
            if internet_on(): # scrape and post data from parliaments if there's internet connection
                print("\n\tPosting and updating data from %s parliament" % item)
                print("\tThis may take a few minutes...")
                vpapi.parliament(creds[country]['parliament'])
                vpapi.timezone(creds[country]['timezone'])
                vpapi.authorize(creds[country]['api_user'], creds[country]['password'])
                if people == "yes":
                    members = scraper.scrape_mp_bio_data()
                    chamber = scraper.scrape_chamber()
                    parliamentary_groups = scraper.scrape_parliamentary_groups()
                    committee = scraper.scrape_committee()
                    # the letter prefixes fix the posting order
                    data_collections = {
                        "a-people": members,
                        "b-chamber": chamber,
                        "c-parliamentary_groups": parliamentary_groups,
                        "d-committe": committee
                    }
                    # for these parliaments groups and committees are matched
                    # by name together with their parent organization
                    match_with_parent = country in ("armenia", "belarus-upperhouse", "ukraine")
                    # inserts data for each data collection in Visegrad+ Api
                    for collection in sorted(data_collections):
                        docs = data_collections[collection]
                        print("\n\tPosting and updating data to the Visegrad+ from %s data collection\n\n" %
                              collection[2:])
                        if len(docs) > 0:
                            pbar = _new_progress_bar()
                            for json_doc in pbar(docs):
                                if collection in ("a-people", "b-chamber"):
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                elif match_with_parent:
                                    where_condition = {'name': json_doc['name'], "parent_id": json_doc['parent_id']}
                                else:
                                    where_condition = {'name': json_doc['name']}
                                collection_of_data = "people" if collection == "a-people" else "organizations"
                                _post_or_update(collection_of_data, where_condition, json_doc)

                            print("\n\tFinished Posting and updating data from %s data collection\n" % collection[2:])
                    # only chamber memberships are scraped for georgia
                    memberships = {"chambers": scraper.scrape_membership()}
                    if country != "georgia":
                        memberships["parliamentary_groups"] = scraper.scrape_parliamentary_group_membership()
                        memberships["committees"] = scraper.scrape_committee_members()

                    for data_collection in memberships:
                        if len(memberships[data_collection]) > 0:
                            prog_bar = _new_progress_bar()
                            print("\n\tPosting and updating data from %s membership data collection\n" % data_collection)
                            for json_doc in prog_bar(memberships[data_collection]):
                                _post_or_update("memberships",
                                                {'organization_id': json_doc['organization_id'],
                                                 "person_id": json_doc['person_id']},
                                                json_doc)
                            print("\n\tFinished Posted and updated data from %s membership data collection\n" % data_collection)
                        else:
                            print("\n\tThere is no data from %s membership data collection\n" % data_collection)
                if votes == "yes":
                    if country == "ukraine":
                        events = scraper.scrape_events()
                        try:
                            if len(events) > 0:
                                pbar_events = _new_progress_bar()
                                for json_doc in pbar_events(events):
                                    existing_event = vpapi.getfirst("events", where={'identifier': json_doc['identifier']})
                                    if not existing_event:
                                        resp = vpapi.post("events", json_doc)
                                    else:
                                        # replace the stored document, addressed by its own id
                                        resp = vpapi.put("events", existing_event['id'], json_doc,
                                                         effective_date=effective_date)
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                print("\n\tFinished Posting and updating data from events data collection")
                            else:
                                print("\n\tThere are no new events")
                        except Exception as ex:
                            # best effort: report the problem and go on with the votes
                            print(ex.message)
                        # the returned documents are not used here; the call is
                        # kept in case the scraper relies on it being made
                        scraper.vote_events()
                        voting_results = scraper.scrape_votes()
                        try:
                            if len(voting_results) > 0:
                                resp = vpapi.post("votes", voting_results)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                                print("\n\tFinished Posting and updating data from votes data collection")
                        except Exception as ex:
                            print(ex.message)
                    elif country == "georgia":
                        # the letter prefixes fix the posting order
                        voting_data_collections = {
                            "amotions": scraper.motions(),
                            "bvote-events": scraper.vote_events(),
                        }
                        # a separate name keeps the `votes` argument intact for
                        # the countries processed after this one
                        voting_records = scraper.scrape_votes()
                        for collection in sorted(voting_data_collections):
                            try:
                                if len(voting_data_collections[collection]) > 0:
                                    resp = vpapi.post(collection[1:], voting_data_collections[collection])
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                    print("\n\tFinished Posting and updating data from %s data collection" % collection[1:])
                            except Exception as ex:
                                print(ex.message)

                        print("\n\tPosting voting records from Georgia Parliament\n")
                        try:
                            if len(voting_records) > 0:
                                vpapi.post("votes", voting_records)
                            print("\n\tFinished Posting and updating data from votes data collection")
                        except Exception as ex:
                            print(ex.message)
                    else:
                        print("\n\tThere are no voting records for %s" % item)
                vpapi.deauthorize()
            else:
                print("\n\tInternet connection problems for %s official parliament web page" % item)
    else:
        print("\n\tInvalid country/ies added")

示例#25

0

显示文件

    def export_speeches(self):
        """Post the speeches loaded from the 'speeches' JSON to the API.

        A speech whose first source is a PDF is split into the individual
        speeches parsed from that PDF. The creator of each one is resolved
        to a person id, creating the person when no match is found. Any
        other speech is posted as it is.
        """
        speeches = self.load_json('speeches')
        people = {}
        # Adjacent literals are joined by the parser. The former backslash
        # line continuation inside a raw string left a backslash and a
        # newline in the pattern, so the last alternative never matched.
        prefix_regex = re.compile(
            u'(pred\u015bedavaju\u0107i )|(pred\u015bednik )|'
            u'(generalni sekretar )', re.U)

        # normalized name -> id of the people already stored
        for p in vpapi.getall('people'):
            name = self.normalize_name(p['name'])
            people[name] = p['id']

        for speech in speeches:
            # replace the scraped event id by the id of the stored event
            session_id = speech.get('event_id')
            speech['event_id'] = self.events_ids[session_id]
            url = speech['sources'][0]['url']
            if url.endswith('.pdf'):
                parsed_speeches = self.download_pdf(url)
                for n, s in enumerate(parsed_speeches):
                    text_speech = speech.copy()
                    text_speech['text'] = s['text']
                    text_speech['position'] = n + 1
                    text_speech['type'] = 'speech'

                    # strip the function prefix from the normalized name
                    creator = self.normalize_name(s['creator'])
                    creator = prefix_regex.sub('', creator)

                    if creator in people:
                        text_speech['creator_id'] = people[creator]
                    else:
                        creator_id = None

                        # fall back to a known name contained in the creator
                        for name in people:
                            if name in creator:
                                creator_id = people[name]
                                break

                        if creator_id is None:
                            # case-insensitive lookup in the API; MongoDB
                            # spells the regex flags key `$options`
                            resp = vpapi.getfirst('people',
                                                  where={
                                                      'name': {
                                                          '$regex':
                                                          s['creator'],
                                                          '$options': 'i'
                                                      }
                                                  })
                            if resp is None:
                                self.log(
                                    'Person "%(creator)s" not found. '
                                    'Creating one' % s, WARNING)
                                item = {
                                    'name': s['creator'],
                                    'sources': text_speech['sources']
                                }
                                resp = vpapi.post('people', item)
                            creator_id = resp['id']

                        # remember the resolved id for later speeches
                        people[creator] = creator_id
                        text_speech['creator_id'] = creator_id

                    self.get_or_create('speeches',
                                       text_speech,
                                       where_keys=['event_id', 'position'])
            else:
                self.get_or_create('speeches', speech)

示例#26

0

显示文件

文件： georgia_scraper.py 项目： opendatakosovo/parldata-scraper

    def laws(self):
        """Build motion / vote-event documents for laws not yet stored in the API.

        Downloads the passed-laws listing from votes.parliament.ge, locates the
        entry that corresponds to the most recently stored vote event (as
        returned by ``vpapi.getfirst("vote-events", sort="-start_date")``) and
        converts every listing entry that comes before it into a dict. When no
        vote event is stored, or the stored one is not present in the listing,
        every entry is converted.

        Returns:
            list: one dict per law, in listing order; empty when there is
            nothing to process.

        Raises:
            LookupError: if there are laws to process but the organization with
                identifier "8" (scheme "parliament.ge") is not found in the API.
        """
        laws_url = "http://votes.parliament.ge/en/search/passed_laws?sEcho=1&iColumns=7&sColumns=&iDisplayStart=0" \
                   "&iDisplayLength=3000000&mDataProp_0=0&mDataProp_1=1&mDataProp_2=2&mDataProp_3=3&mDataProp_4=4" \
                   "&mDataProp_5=5&mDataProp_6=6&sSearch=&bRegex=false&sSearch_0=&bRegex_0=false" \
                   "&bSearchable_0=true&sSearch_1=&bRegex_1=false&bSearchable_1=true&sSearch_2=" \
                   "&bRegex_2=false&bSearchable_2=true&sSearch_3=&bRegex_3=false&bSearchable_3=true&sSearch_4=" \
                   "&bRegex_4=false&bSearchable_4=true&sSearch_5=&bRegex_5=false&bSearchable_5=true&sSearch_6=" \
                   "&bRegex_6=false&bSearchable_6=true&iSortCol_0=0&sSortDir_0=desc&iSortingCols=1&bSortable_0=true" \
                   "&bSortable_1=true&bSortable_2=true&bSortable_3=true&bSortable_4=true&bSortable_5=true" \
                   "&bSortable_6=true&parliament=1&start_date=&end_date=&_=1440146282982"

        # Close the HTTP response explicitly; try/finally is used because the
        # response object is not guaranteed to be a context manager.
        response = urlopen(laws_url)
        try:
            json_result = json.loads(response.read())
        finally:
            response.close()
        rows = json_result['aaData']

        laws_array = []
        last_item = vpapi.getfirst("vote-events", sort="-start_date")

        # Number of leading listing rows to process: everything before the row
        # that links to the last stored law, or all rows if there is no match.
        index_counter = len(rows)
        if last_item:
            law_url = "/en/laws/" + last_item['id']
            for position, element in enumerate(rows):
                soup = BeautifulSoup(element[1], "html.parser")
                if soup.find('a').get('href') == law_url:
                    index_counter = position
                    break

        existing = vpapi.getfirst(
            "organizations",
            where={"identifiers": {"$elemMatch": {"identifier": "8", "scheme": "parliament.ge"}}})
        organization_id = existing['id'] if existing else None

        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                   ' ', ETA(), " - Processed: ", Counter(), ' items             ']
        pbar = ProgressBar(widgets=widgets)

        new_rows = rows[:index_counter]
        if not new_rows:
            return laws_array
        if organization_id is None:
            # Previously this surfaced as a NameError on first use in the loop.
            raise LookupError(
                'Organization with identifier "8" (scheme "parliament.ge") '
                'was not found; cannot build law documents')

        url_marker = 'laws/'
        for item in pbar(new_rows):
            soup = BeautifulSoup(item[1], 'html.parser')
            api_name = soup.get_text()
            url = "http://votes.parliament.ge" + soup.find('a').get('href')
            # The law id is whatever follows "laws/" in the link.
            motion_id = url[url.index(url_marker) + len(url_marker):]
            date = self.local_to_utc(item[0] + " 04:00")
            votes_for = item[3]
            votes_against = item[4]
            # NOTE(review): item[5] is not exported; the original had a
            # commented-out "votes_abstain" read of it.
            votes_not_present = item[6]
            laws_array.append({
                "date": date,
                "start_date": date,
                "sources": [{
                    "url": url,
                    "note": "ვებგვერდი"
                }],
                "id": motion_id,
                "identifier": motion_id,
                "motion_id": motion_id,
                "organization_id": organization_id,
                "text": api_name,
                "result": "pass",
                "counts": [
                    {
                        "option": "yes",
                        "value": votes_for
                    },
                    {
                        "option": "no",
                        "value": votes_against
                    },
                    {
                        "option": "absent",
                        "value": votes_not_present
                    }
                ]
            })
        return laws_array

示例#27

0

显示文件

 def get_latest_item(self, endpoint, time_key):
     """Return the first item of `endpoint` when sorted by "-<time_key>".

     The sort expression is `time_key` prefixed with a minus sign and is
     passed to ``vpapi.getfirst`` unchanged (presumably meaning descending
     order, i.e. the most recent item first -- confirm against the vpapi
     client).
     """
     sort_expression = '-%s' % time_key
     latest = vpapi.getfirst(endpoint, sort=sort_expression)
     return latest

示例#28

0

显示文件

文件： parliaments.py 项目： TransparenCEE/parldata-scrapers-poland-hungary-montenegro

    def export_speeches(self):
        """Export scraped speeches to the API, resolving each speaker to a person.

        Speeches are loaded with ``self.load_json('speeches')``. Each speech's
        ``event_id`` is replaced by the matching value from ``self.events_ids``.
        When the first source URL ends in ``.pdf`` the document is parsed with
        ``self.download_pdf`` and one speech is stored per parsed part (with a
        1-based ``position`` and ``type`` "speech"); otherwise the speech is
        stored as loaded.

        The speaker of a parsed part is resolved in this order:
          1. exact match of the normalized, prefix-stripped name against the
             people already in the API;
          2. any known normalized name contained in the speaker string;
          3. a case-insensitive regex lookup of the raw speaker string in the
             API;
          4. creation of a new person.
        """
        speeches = self.load_json('speeches')
        people = {}
        # Role prefixes stripped from a speaker string before matching.
        # Built from adjacent plain unicode literals: the previous raw literal
        # used a backslash-newline continuation, which a raw string keeps
        # verbatim, so the last alternative only matched after a newline.
        prefix_regex = re.compile(
            u'(pred\u015bedavaju\u0107i )|(pred\u015bednik )|'
            u'(generalni sekretar )', re.U)

        for p in vpapi.getall('people'):
            name = self.normalize_name(p['name'])
            people[name] = p['id']

        for speech in speeches:
            session_id = speech.get('event_id')
            speech['event_id'] = self.events_ids[session_id]
            url = speech['sources'][0]['url']
            if not url.endswith('.pdf'):
                self.get_or_create('speeches', speech)
                continue

            parsed_speeches = self.download_pdf(url)
            for n, s in enumerate(parsed_speeches):
                text_speech = speech.copy()
                text_speech['text'] = s['text']
                text_speech['position'] = n + 1
                text_speech['type'] = 'speech'

                creator = self.normalize_name(s['creator'])
                creator = prefix_regex.sub('', creator)

                if creator in people:
                    text_speech['creator_id'] = people[creator]
                else:
                    creator_id = None

                    # Fall back to a known name appearing inside the string.
                    for name in people:
                        if name in creator:
                            creator_id = people[name]
                            break

                    if creator_id is None:
                        # MongoDB's regex flag operator is "$options"; the
                        # former plain "options" key did not request a
                        # case-insensitive match.
                        # NOTE(review): s['creator'] is sent as a regex
                        # pattern unescaped -- confirm names cannot contain
                        # regex metacharacters.
                        resp = vpapi.getfirst(
                            'people', where={
                                'name': {
                                    '$regex': s['creator'],
                                    '$options': 'i'
                                }
                            }
                        )
                        if resp is None:
                            self.log(
                                'Person "%(creator)s" not found. '
                                'Creating one' % s, WARNING)
                            item = {
                                'name': s['creator'],
                                'sources': text_speech['sources']
                            }
                            resp = vpapi.post('people', item)
                        creator_id = resp['id']

                    # Remember the resolution so later parts by the same
                    # speaker take the exact-match path.
                    people[creator] = creator_id
                    text_speech['creator_id'] = creator_id

                self.get_or_create(
                    'speeches',
                    text_speech,
                    where_keys=['event_id', 'position']
                )

示例#29

0

显示文件

文件： ukraine_scraper.py 项目： opendatakosovo/parldata-scraper

    def vote_events(self):
        """Scrape Ukraine's motions and vote events and post the missing ones.

        The scraped list comes from ``parser.vote_events_list()``. Processing
        starts at the smaller of the two indexes returned by ``self.get_index``
        for "vote-events" and "motions". For every item from that index on, a
        motion document and a vote-event document are built; each is posted
        only if the API does not already hold it.

        The two existence checks are independent: an item whose motion is
        already stored still gets its vote event posted when that is missing.
        (Previously an existing motion skipped the vote event altogether, so a
        run interrupted between the two posts could never be completed.)

        Returns:
            tuple: ``(motions, voting_events)`` -- the motion documents and the
            vote-event documents built for the processed items.
        """
        # Single-argument parenthesised print behaves the same as the print
        # statement under Python 2.
        print("\n\n\tScraping Motions and Vote Events data from Ukraine's parliament...")
        scraped = parser.vote_events_list()
        index_vote_events = self.get_index("vote-events", "-start_date", scraped)
        index_motions = self.get_index("motions", "-date", scraped)
        index = min(index_vote_events, index_motions)
        voting_events = []
        motions = []

        if not len(scraped) > 0:
            print("\n\tThere are no new motions or vote events.")
            return motions, voting_events

        print("\n\n\tPosting Motions and Vote events data to the Visegrad+ API from Ukraine's parliament...")
        pending = scraped[index:]
        if not len(pending) > 0:
            print("\n\tThere are no new motion and vote events data.")
            return motions, voting_events

        widgets = [
            "        Progress: ",
            Percentage(),
            " ",
            Bar(marker="#", left="[", right="]"),
            " ",
            ETA(),
            " - Processed: ",
            Counter(),
            " events             ",
        ]
        pbar = ProgressBar(widgets=widgets)
        for motion in pbar(pending):
            json_motion = self.build_json_motion(
                motion["date"][:19],
                motion["sources"][0]["url"],
                motion["id"],
                motion["legislative_session_id"],
                motion["organization_id"],
                motion["text"],
                motion["result"],
            )
            motions.append(json_motion)
            stored_motion = vpapi.getfirst("motions", where={"identifier": json_motion["identifier"]})
            if not stored_motion:
                vpapi.post("motions", json_motion)

            json_vote_event = self.build_vote_event_json(
                motion["date"][:19],
                motion["legislative_session_id"],
                motion["id"],
                motion["organization_id"],
                motion["result"],
                motion["counts"],
            )
            voting_events.append(json_vote_event)
            stored_vote_event = vpapi.getfirst("vote-events", where={"id": json_vote_event["id"]})
            if not stored_vote_event:
                vpapi.post("vote-events", json_vote_event)

        print("\n\tFinished posting motions and vote events data.")
        print("\tScraped %s motions and vote events" % str(len(pending) * 2))
        return motions, voting_events