def update_motion_url(self):
     """Prefix relative source urls of stored motions with the Rada host.

     Every motion returned by the API is inspected; when the url of its
     first source does not contain ``http://w1.c1.rada.gov.ua`` the host is
     prepended and the motion is written back with PUT.

     Raises:
         Exception: if the API answers an update with a status other than "OK".
     """
     base_url = "http://w1.c1.rada.gov.ua"
     # Fields stripped from the fetched document before it is sent back
     # (the same list the other uploaders in this project strip).
     meta_fields = ("created_at", "updated_at", "_links", "id")

     print("\n\tUpdating url of motions")
     for counter, motion in enumerate(vpapi.getall("motions"), 1):
         print(str(counter))
         sources = motion.get("sources")
         if not sources:
             # Nothing to rewrite; previously this crashed on sources[0].
             continue
         url = sources[0]["url"]
         if base_url in url:
             continue
         motion_id = motion["id"]
         sources[0]["url"] = base_url + url
         for field in meta_fields:
             motion.pop(field, None)
         resp = vpapi.put("motions", motion_id, motion, effective_date=self.effective_date())
         if resp["_status"] != "OK":
             raise Exception("Invalid status code")
     print("\n\tFinished updating motions url")
    def scrape_chamber(self):
        """Build, upload and return the chamber documents of the Belarus lower house.

        For every entry of ``parser.chambers()`` an organization document is
        built, posted to the API when no organization with the same first
        identifier exists, and PUT over the existing one otherwise.

        Returns:
            list: the chamber documents that were uploaded.

        Raises:
            Exception: if the API response status is not "OK".
        """
        print("\n\tScraping chambers from Belarus Lowerhouse parliament...")
        all_chambers = parser.chambers()
        uploaded = []
        source_url = "http://house.gov.by/index.php/,10087,,,,2,,,0.html"
        for chamber_id in all_chambers:
            details = all_chambers[chamber_id]
            doc = self.build_organization_doc(
                "chamber", details['name'], chamber_id,
                details['start_date'], details['end_date'],
                source_url, "", "")

            # Chamber "2" is uploaded without a dissolution date.
            unwanted = ['dissolution_date'] if chamber_id == "2" else []
            unwanted += ['contact_details', 'parent_id']
            for key in unwanted:
                del doc[key]

            lookup = {'identifiers': {'$elemMatch': doc['identifiers'][0]}}
            match = vpapi.getfirst("organizations", where=lookup)
            if match:
                # PUT rather than PATCH so that properties which no longer exist are removed
                resp = vpapi.put("organizations", match['id'], doc, effective_date=self.effective_date())
            else:
                resp = vpapi.post("organizations", doc)
            if resp["_status"] != "OK":
                raise Exception("Invalid status code")
            uploaded.append(doc)
        print("\n\tScraping completed! \n\tScraped " + str(len(uploaded)) + " chambers")
        return uploaded
예제 #3
0
	def save(self, update_only=False):
		"""Update a compatible stored membership, or create a new one.

		Stored memberships of the same person and organization are examined
		from the latest `start_date` backwards. A stored membership is
		compatible when `start_date`, `role` and `post` can be merged and
		this membership's `end_date` is not earlier than the stored
		`start_date`. `end_date` itself is not compared, so that guessed end
		dates (used when a member disappears from a group profile) can be
		corrected later.

		If `update_only` is True and nothing compatible is found, nothing is
		created and None is returned.

		Raises an Exception when the API response status is not 'OK'.
		"""
		candidates = vpapi.getall(
			'memberships',
			where={'person_id': self.person_id, 'organization_id': self.organization_id},
			sort='-start_date')
		payload = self.__dict__.copy()

		matched_id = None
		for stored in candidates:
			# The checks short-circuit in this exact order.
			compatible = (
				self._merge_values('start_date', payload, stored)
				and payload.get('end_date', '9999-12-31') >= stored.get('start_date', '0001-01-01')
				and self._merge_values('role', payload, stored)
				and self._merge_values('post', payload, stored)
			)
			if compatible:
				matched_id = stored['id']
				self._merge_values('end_date', payload, stored)
				break
			# Start over from a fresh copy for the next candidate
			# (presumably _merge_values modifies the payload -- confirm).
			payload = self.__dict__.copy()

		if matched_id:
			resp = vpapi.put('memberships', matched_id, payload)
		elif update_only:
			return
		else:
			resp = vpapi.post('memberships', self.__dict__)

		if resp['_status'] != 'OK':
			raise Exception(self.name, resp)
    def scrape_chamber(self):
        """Scrape the convocations of Armenia's parliament and upload them as chambers.

        The options of the ``show_session`` select box are read from the
        deputies page; options whose value contains "100" are skipped. Each
        remaining option becomes an organization document that is posted, or
        PUT over an existing organization with the same first identifier.

        Returns:
            list: the chamber documents that were uploaded.

        Raises:
            Exception: if the API response status is not "OK".
        """
        page_url = "http://www.parliament.am/deputies.php?sel=ful&ord=photo&show_session=5&lang=arm&enc=utf8"
        soup = scrape.download_html_file(page_url)
        uploaded = []
        print("\n\tScraping chambers from Armenia's parliament...\n")
        bar_widgets = [
            '        Progress: ', Percentage(), ' ',
            Bar(marker='#', left='[', right=']'),
            ' ', ETA(), " - Processed: ", Counter(), ' items             ',
        ]
        progress = ProgressBar(widgets=bar_widgets)
        session_options = soup.find("select", {"name": "show_session"}).findAll("option")
        for option in progress(session_options):
            session_id = option.get('value')
            label = option.get_text()
            chamber_url = "http://www.parliament.am/deputies.php?lang=arm&sel=&ord=&show_session=" + session_id
            if "100" in session_id:
                continue

            term = self.terms[session_id]
            started = term["start_date"]
            ended = self.terms[session_id]["end_date"]
            doc = self.build_organization_doc("chamber", label, session_id, started,
                                              ended, chamber_url, "", "")

            for key in ('contact_details', 'parent_id'):
                del doc[key]
            # Session "5" is uploaded without a dissolution date.
            if session_id == "5":
                del doc['dissolution_date']

            lookup = {'identifiers': {'$elemMatch': doc['identifiers'][0]}}
            match = vpapi.getfirst("organizations", where=lookup)
            if match:
                resp = vpapi.put("organizations", match['id'], doc, effective_date=self.effective_date())
            else:
                resp = vpapi.post("organizations", doc)
            if resp["_status"] != "OK":
                raise Exception("Invalid status code")
            uploaded.append(doc)
        print("\n\tScraping completed! \n\tScraped " + str(len(uploaded)) + " chambers")
        return uploaded
예제 #5
0
	def save(self):
		"""Create or update this person in the API and return the resulting id.

		A stored person is looked up by this object's first identifier. If
		none is found the object is posted; otherwise it is PUT over the
		stored record using the module-level `effective_date`.

		Raises an Exception when the API response status is not 'OK'.
		"""
		payload = self.__dict__
		lookup = {'identifiers': {'$elemMatch': self.identifiers[0]}}
		match = vpapi.getfirst('people', where=lookup)
		if match:
			# PUT rather than PATCH so that properties which no longer exist are removed
			resp = vpapi.put('people', match['id'], payload, effective_date=effective_date)
		else:
			resp = vpapi.post('people', payload)

		if resp['_status'] == 'OK':
			return resp['id']
		raise Exception(self.name, resp)
def save(scraped):
    """Create or update an organization in the API and return its id.

    The organization is looked up by the first entry of
    ``scraped["identifiers"]``. It is posted when no match exists and PUT
    over the first match otherwise.

    Args:
        scraped: dict describing the organization; must contain a non-empty
            "identifiers" list.

    Returns:
        The "id" field of the API response.

    Raises:
        Exception: if the API response status is not "OK". The exception
            carries the organization name (when present) and the response.
    """
    r = vpapi.get("organizations", where={"identifiers": {"$elemMatch": scraped["identifiers"][0]}})
    if not r["_items"]:
        r = vpapi.post("organizations", scraped)
    else:
        # update by PUT is preferred over PATCH to correctly remove properties that no longer exist now
        existing = r["_items"][0]
        r = vpapi.put("organizations/%s" % existing["id"], scraped)
    if r["_status"] != "OK":
        # This is a plain function: the former `self.name` / `resp` were
        # undefined here and turned every API failure into a NameError.
        raise Exception(scraped.get("name"), r)
    return r["id"]
    def get_or_create(self, endpoint, item, refresh=False, where_keys=None):
        """Create `item` at `endpoint`, or overwrite the matching stored record.

        The stored record is looked up with a filter that depends on the
        arguments:

        * `where_keys` given: equality on each listed key of `item`;
        * 'memberships': person, organization and start date (or absence of
          a start date), latest start date first;
        * 'motions' / 'speeches': url of the first source;
        * 'vote-events': motion id when present, otherwise start date;
        * 'votes': vote event and voter;
        * 'events': identifier;
        * anything else: first entry of `identifiers`.

        Args:
            endpoint: name of the API resource.
            item: dict with the document to store.
            refresh: when True, the stored document is fetched again after
                the write and returned instead of the write response.
            where_keys: optional iterable of keys of `item` to match on.

        Returns:
            The API response (or the re-fetched document) with an extra
            '_created' flag telling whether a new record was posted.

        Raises:
            Exception: if the write response status is not 'OK'.
        """
        sort = []
        embed = []
        if where_keys:
            where = dict((key, item[key]) for key in where_keys)
        elif endpoint == 'memberships':
            where = {
                'person_id': item['person_id'],
                'organization_id': item['organization_id'],
                'start_date': item.get('start_date', {"$exists": False}),
            }
            sort = [('start_date', -1)]
        elif endpoint in ('motions', 'speeches'):
            where = {'sources.url': item['sources'][0]['url']}
        elif endpoint == 'vote-events':
            embed = ['votes']
            if 'motion_id' in item:
                where = {'motion_id': item['motion_id']}
            else:
                where = {'start_date': item['start_date']}
        elif endpoint == 'votes':
            where = {
                'vote_event_id': item['vote_event_id'],
                'voter_id': item['voter_id'],
            }
        elif endpoint == 'events':
            where = {'identifier': item['identifier']}
        else:
            where = {
                'identifiers': {'$elemMatch': item['identifiers'][0]}}

        existing = vpapi.getfirst(endpoint, where=where, sort=sort)
        created = not existing
        if created:
            resp = vpapi.post(endpoint, item)
            action = 'Created'
        else:
            resp = vpapi.put("%s/%s" % (endpoint, existing['id']), item)
            action = 'Updated'

        # Check the status before touching resp['_links']: a failed write
        # must surface as the exception below, not as a KeyError raised
        # while building the log message.
        if resp['_status'] != 'OK':
            raise Exception(resp)
        self.log('%s %s' % (action, resp['_links']['self']['href']), DEBUG)

        if refresh:
            resp = vpapi.get(
                resp['_links']['self']['href'], sort=sort, embed=embed)
        resp['_created'] = created
        return resp
    def scrape_chamber(self):
        """Scrape the legislatures of Moldova's parliament and upload them as chambers.

        Every non-empty link inside the ``LocalizedContent`` div is treated
        as a legislature. Its Roman numeral is cut out of the link text,
        mapped to an Arabic identifier, and used to look up the term dates
        in ``self.terms``. The resulting document is posted, or PUT over an
        existing organization with the same first identifier.

        Returns:
            list: the chamber documents that were uploaded.

        Raises:
            Exception: if the API response status is not "OK".
        """
        page_url = "http://www.parlament.md/Parlamentarismul%C3%AEnRepublicaMoldova/" \
                   "Istorie%C8%99ievolu%C8%9Bie/tabid/96/language/ro-RO/Default.aspx"
        roman_to_arabic = {"XII": "12", "XIII": "13", "XIV": "14", "XV": "15", "XVI": "16", "XVII": "17",
                           "XVIII": "18", "XIX": "19", "XX": "20"}
        uploaded = []
        soup = scrape.download_html_file(page_url)
        print("\n\tScraping chambers from Moldova's parliament...")
        bar_widgets = [
            '        Progress: ', Percentage(), ' ',
            Bar(marker='#', left='[', right=']'),
            ' ', ETA(), " - Processed: ", Counter(), ' items             ',
        ]
        progress = ProgressBar(widgets=bar_widgets)
        links = soup.find('div', {"class": "LocalizedContent"}).findAll('a')
        for link in progress(links):
            name = link.get_text().strip()
            if name == "":
                continue
            chamber_url = "http://www.parlament.md" + link.get('href')

            if "(" in name:
                # Numeral sits between the first 'X' and the opening parenthesis.
                numeral = name[name.index('X'):name.index('(')].replace('-a', "")
            else:
                # Otherwise it is the three characters ending three before the end.
                numeral = name[-6:len(name)-3]
            chamber_id = roman_to_arabic[numeral.strip()]
            term = self.terms[chamber_id]
            started = term['start_date']
            ended = self.terms[chamber_id]['end_date']

            doc = self.build_organization_doc("chamber", name, chamber_id, started,
                                              ended, chamber_url, "", "")

            for key in ('contact_details', 'parent_id'):
                del doc[key]
            # Legislature "20" is uploaded without a dissolution date.
            if chamber_id == "20":
                del doc['dissolution_date']

            lookup = {'identifiers': {'$elemMatch': doc['identifiers'][0]}}
            match = vpapi.getfirst("organizations", where=lookup)
            if match:
                # PUT rather than PATCH so that properties which no longer exist are removed
                resp = vpapi.put("organizations", match['id'], doc, effective_date=self.effective_date())
            else:
                resp = vpapi.post("organizations", doc)
            if resp["_status"] != "OK":
                raise Exception("Invalid status code")
            uploaded.append(doc)
        print("\n\tScraping completed! \n\tScraped " + str(len(uploaded)) + " chambers")
        return uploaded
예제 #9
0
def scrape(countries, people, votes):
    """Scrape the selected parliaments and upload the results to the Visegrad+ API.

    Args:
        countries: "all", or a comma-separated list of scraper keys
            ("georgia", "armenia", "ukraine", "moldova",
            "belarus-lowerhouse", "belarus-upperhouse"). Unknown names are
            ignored; if none is left a message is printed and nothing runs.
        people: "yes" to scrape and upload people, chambers, parliamentary
            groups, committees and memberships.
        votes: "yes" to scrape and upload voting data (only implemented for
            Ukraine and Georgia).

    Side effects:
        Sets the module-level ``effective_date`` to today's ISO date, reads
        API credentials from ``access.json`` in ``BASE_DIR`` and prints
        progress to stdout.

    Raises:
        Exception: if the API answers a people/organization/membership
            upload with a status other than "OK". Failures while uploading
            voting data are printed and do not abort the run.
    """
    global effective_date
    effective_date = date.today().isoformat()

    # execute MP's bio data.
    georgia = georgia_scraper.GeorgiaScraper()
    armenia = armenia_scraper.ArmeniaScraper()
    ukraine = ukraine_scraper.UkraineScraper()
    belarus_lowerhouse = belarus_lowerhouse_scraper.BelarusLowerhouseScraper()
    belarus_upperhouse = belarus_upperhouse_scraper.BelarusUpperhouseScraper()
    moldova = moldova_scraper.MoldovaScraper()
    references = {"georgia": georgia, "armenia": armenia, "ukraine": ukraine,
                  "belarus-lowerhouse": belarus_lowerhouse, "moldova": moldova,
                  "belarus-upperhouse": belarus_upperhouse}
    if countries == "all":
        countries_array = list(references)
    else:
        # Keep only known countries. The previous code called
        # list.pop() with a list of indexes, which raises TypeError.
        countries_array = [country for country in countries.split(',')
                           if country.lower() in references]
    with open(os.path.join(BASE_DIR, 'access.json')) as f:
        creds = json.load(f)
    if len(countries_array) > 0:
        for item in sorted(countries_array):
            country_key = item.lower()
            scraper = references[country_key]
            if internet_on(): # scrape and post data from parliaments if there's internet connection
                print("\n\tPosting and updating data from %s parliament" % item)
                print("\tThis may take a few minutes...")
                vpapi.parliament(creds[country_key]['parliament'])
                vpapi.timezone(creds[country_key]['timezone'])
                vpapi.authorize(creds[country_key]['api_user'], creds[country_key]['password'])
                if people == "yes":
                    members = scraper.scrape_mp_bio_data()
                    chamber = scraper.scrape_chamber()
                    parliamentary_groups = scraper.scrape_parliamentary_groups()
                    committee = scraper.scrape_committee()
                    # The letter prefixes fix the upload order when the keys are sorted.
                    data_collections = {
                        "a-people": members,
                        "b-chamber": chamber,
                        "c-parliamentary_groups": parliamentary_groups,
                        "d-committe": committee
                    }
                    # inserts data for each data collection in Visegrad+ Api
                    for collection in sorted(data_collections):
                        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                   ' ', ETA(), " - Processed: ", Counter(), ' items             ']
                        pbar = ProgressBar(widgets=widgets)
                        print("\n\tPosting and updating data to the Visegrad+ from %s data collection\n\n" %
                              collection[2:])
                        if len(data_collections[collection]) > 0:
                            for json_doc in pbar(data_collections[collection]):
                                if collection == "a-people":
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                    collection_of_data = "people"
                                elif collection == "c-parliamentary_groups" or collection == "d-committe":
                                    if country_key == "armenia" or country_key == "belarus-upperhouse"\
                                            or country_key == "ukraine":
                                        where_condition = {'name': json_doc['name'], "parent_id": json_doc['parent_id']}
                                    else:
                                        where_condition = {'name': json_doc['name']}
                                    collection_of_data = "organizations"
                                elif collection == "b-chamber":
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                    collection_of_data = "organizations"

                                existing = vpapi.getfirst(collection_of_data, where=where_condition)
                                if not existing:
                                    resp = vpapi.post(collection_of_data, json_doc)
                                else:
                                    json_obj_id = existing['id']
                                    items_to_delete = ["created_at", "updated_at", "_links", "id"]
                                    for item_delete in items_to_delete:
                                        del existing[item_delete]
                                    # Skip the write when the stored document already equals the scraped one.
                                    if json.loads(json.dumps(json_doc)) == existing:
                                        continue
                                    else:
                                        # update by PUT is preferred over PATCH to correctly remove properties that no longer exist now
                                        resp = vpapi.put(collection_of_data, json_obj_id, json_doc, effective_date=effective_date)

                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")

                            print("\n\tFinished Posting and updating data from %s data collection\n" % collection[2:])
                    if country_key != "georgia":
                        memberships = {
                            "chambers": scraper.scrape_membership(),
                            "parliamentary_groups": scraper.scrape_parliamentary_group_membership(),
                            "committees": scraper.scrape_committee_members()
                        }
                    else:
                        memberships = {
                            "chambers": scraper.scrape_membership()
                        }

                    for data_collection in memberships:
                        widgets_stat = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                        ' ', ETA(), " - Processed: ", Counter(), ' items             ']
                        prog_bar = ProgressBar(widgets=widgets_stat)
                        if len(memberships[data_collection]) > 0:
                            print("\n\tPosting and updating data from %s membership data collection\n" % data_collection)
                            for json_doc in prog_bar(memberships[data_collection]):
                                existing = vpapi.getfirst("memberships", where={'organization_id': json_doc['organization_id'],
                                                                                "person_id": json_doc['person_id']})
                                if not existing:
                                    resp = vpapi.post("memberships", json_doc)
                                else:
                                    json_obj_id = existing['id']
                                    items_to_delete = ["created_at", "updated_at", "_links", "id"]
                                    for item_delete in items_to_delete:
                                        del existing[item_delete]
                                    if json.loads(json.dumps(json_doc)) == existing:
                                        continue
                                    else:
                                        resp = vpapi.put("memberships", json_obj_id, json_doc, effective_date=effective_date)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                            print("\n\tFinished Posted and updated data from %s membership data collection\n" % data_collection)
                        else:
                            print("\n\tThere is no data from %s membership data collection\n" % data_collection)
                            continue
                if votes == "yes":
                    if country_key == "ukraine":
                        events = scraper.scrape_events()
                        # Exception (not BaseException) so that Ctrl-C and
                        # SystemExit still stop the run.
                        try:
                            if len(events) > 0:
                                widgets_events = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                           ' ', ETA(), " - Processed: ", Counter(), ' items             ']
                                pbar_events = ProgressBar(widgets=widgets_events)
                                for json_doc in pbar_events(events):
                                    existing_event = vpapi.getfirst("events", where={'identifier': json_doc['identifier']})
                                    if not existing_event:
                                        resp = vpapi.post("events", json_doc)
                                    else:
                                        # Address the stored record by its own id, as the
                                        # other uploads in this function do.
                                        resp = vpapi.put("events", existing_event['id'], json_doc, effective_date=effective_date)
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                print("\n\tFinished Posting and updating data from events data collection")
                            else:
                                print("\n\tThere are no new events")
                        except Exception as ex:
                            print(ex.message)
                        # The return value was never used; the call is kept in case
                        # vote_events() uploads data itself -- TODO confirm.
                        scraper.vote_events()
                        voting_results = scraper.scrape_votes()
                        try:
                            if len(voting_results) > 0:
                                resp = vpapi.post("votes", voting_results)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                                print("\n\tFinished Posting and updating data from votes data collection")
                        except Exception as ex:
                            print(ex.message)
                    elif country_key == "georgia":
                        # The letter prefixes fix the upload order (motions before vote events).
                        voting_data_collections = {
                            "amotions": scraper.motions(),
                            "bvote-events": scraper.vote_events(),
                        }
                        # Separate name: rebinding the `votes` parameter here made
                        # every later country in the same run skip its voting data.
                        georgia_votes = scraper.scrape_votes()
                        for collection in sorted(voting_data_collections):
                            try:
                                if len(voting_data_collections[collection]) > 0:
                                    resp = vpapi.post(collection[1:], voting_data_collections[collection])
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                    print("\n\tFinished Posting and updating data from %s data collection" % collection[1:])
                            except Exception as ex:
                                print(ex.message)

                        print("\n\tPosting voting records from Georgia Parliament\n")
                        try:
                            if len(georgia_votes) > 0:
                                vpapi.post("votes", georgia_votes)
                            print("\n\tFinished Posting and updating data from votes data collection")
                        except Exception as ex:
                            print(ex.message)
                    else:
                        print("\n\tThere are no voting records for %s" % item)
                vpapi.deauthorize()
            else:
                print("\n\tInternet connection problems for %s official parliament web page" % item)
                continue
    else:
        print("\n\tInvalid country/ies added")