Example #1
    def __init__(self, *args, **kwargs):
        super(VisegradSpider, self).__init__(*args, **kwargs)

        # point the vpapi client at this spider's parliament and authorize
        vpapi.parliament(self.get_parliament())
        vpapi.authorize(self.get_user(), self.get_password())

        dispatcher.connect(self.spider_opened, signals.spider_opened)
Example #2
    def __init__(self, log=None):
        vpapi.parliament(self.get_parliament())
        vpapi.authorize(self.get_user(), self.get_password())

        self._chamber = None
        self._ids = {}
        # fall back to Scrapy's log function when no logger is supplied
        if log is None:
            self.log = scrapy.log.msg
        else:
            self.log = log
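
Both constructors follow the same pattern: point the vpapi client at a parliament and authorize before issuing any requests. A minimal standalone sketch of that session lifecycle, assuming the same vpapi client used throughout these examples and placeholder credentials:

import vpapi

# point the client at one parliament's API endpoint and set its timezone
vpapi.parliament('sk/nrsr')
vpapi.timezone('Europe/Bratislava')

# placeholder credentials; real scrapers read them from a private config file
vpapi.authorize('api_user', 'secret')

# ... issue requests here, e.g. vpapi.get('people') or vpapi.post('logs', ...) ...

# drop the authorization once the scraper is done
vpapi.deauthorize()
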
Example #3
def main():
	# read command-line arguments
	ap = argparse.ArgumentParser('Scrapes data from Slovak parliament website http://nrsr.sk')
	ap.add_argument('--people', choices=['initial', 'recent', 'none'], default='recent', help='scrape of people, organizations and memberships')
	ap.add_argument('--votes', choices=['initial', 'recent', 'none'], default='recent', help='scrape of motions and votes')
	ap.add_argument('--debates', choices=['initial', 'recent', 'none'], default='recent', help='scrape of speeches from debates')
	ap.add_argument('--term', help='term to scrape recent data from; current term is used when omitted')
	args = ap.parse_args()

	# set up logging to a local file
	if not os.path.exists(LOGS_DIR):
		os.makedirs(LOGS_DIR)
	logname = datetime.utcnow().strftime('%Y-%m-%d-%H%M%S') + '.log'
	logname = os.path.join(LOGS_DIR, logname)
	logname = os.path.abspath(logname)
	logging.basicConfig(level=logging.DEBUG, format='%(message)s', handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
	logging.getLogger('requests').setLevel(logging.ERROR)

	logging.info('Started')
	try:
		# set up the API access
		vpapi.parliament('sk/nrsr')
		vpapi.timezone('Europe/Bratislava')
		with open(os.path.join(CONF_DIR, 'private.json'), encoding='utf8') as f:
			creds = json.load(f)
		vpapi.authorize(creds['api_user'], creds['password'])

		# indicate that the scraper has started
		db_log = vpapi.post('logs', {'status': 'running', 'file': logname, 'params': args.__dict__})

		# clear cached source files
		if scrapeutils.USE_WEBCACHE:
			logging.info('Clearing cached files')
			scrapeutils.clear_cache()

		# test parser functions
		logging.info('Testing parser functions')
		out = io.StringIO()
		suite = unittest.TestLoader().loadTestsFromModule(sys.modules['test'])
		result = unittest.TextTestRunner(stream=out).run(suite)
		logging.info(out.getvalue())
		if result.errors or result.failures:
			raise RuntimeError('Unit tests of parser functions failed, update canceled.')

		if args.people == 'initial':
			# initial scrape of all history of people and organizations
			logging.info('Initial scrape - deleting people, organizations and memberships')
			vpapi.delete('memberships')
			vpapi.delete('organizations')
			vpapi.delete('people')
			for term in sorted(parse.terms.keys()):
				scrape_people(term)

		elif args.people == 'recent':
			# incremental scrape of people and organizations since the last scrape
			term = args.term or parse.current_term()
			if term not in parse.terms:
			raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py and rerun for the recently finished term once more.' % term)
			scrape_people(term)

		terms_with_old_debates = ('1', '2', '3', '4')
		if args.debates == 'initial':
			# initial scrape of debates from all terms
			logging.info('Initial scrape - deleting speeches and events')
			vpapi.delete('speeches')
			vpapi.delete('events')
			# newer terms are scraped first to get full names of unknown speakers
			for term in sorted(parse.terms.keys()):
				if term in terms_with_old_debates: continue
				scrape_new_debates(term)
			for term in terms_with_old_debates:
				scrape_old_debates(term)

		elif args.debates == 'recent':
			# incremental scrape of debates since the last scrape
			term = args.term or parse.current_term()
			if term not in parse.terms:
			raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py and rerun once more.' % term)
			if term in terms_with_old_debates:
				scrape_old_debates(term)
			else:
				scrape_new_debates(term)

		if args.votes == 'initial':
			# initial scrape of votes from all terms
			logging.info('Initial scrape - deleting votes, vote-events and motions')
			vpapi.delete('votes')
			vpapi.delete('vote-events')
			vpapi.delete('motions')
			for term in sorted(parse.terms.keys()):
				scrape_motions(term)

		elif args.votes == 'recent':
			# incremental scrape of votes since the last scrape
			term = args.term or parse.current_term()
			if term not in parse.terms:
			raise Exception('Unknown term `%s`. Scrape canceled. Add it to the terms list in parse.py and rerun once more.' % term)
			scrape_motions(term)

		status = 'finished'

	except BaseException as e:
		logging.critical(e, exc_info=True)
		if hasattr(e, 'response') and hasattr(e.response, '_content'):
			logging.critical(e.response._content.decode('utf-8'))
		status = 'interrupted' if isinstance(e, KeyboardInterrupt) else 'failed'

		# output to console to trigger an e-mail from cron
		print('Scraping of parliament sk/nrsr failed, see\n\n' + logname + '\n\nfor details.')

	finally:
		logging.info(status.capitalize())
		if 'db_log' in locals():
			vpapi.patch('logs', db_log['id'], {'status': status})
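
For reference, typical invocations of this entry point might look as follows (the script name scrape.py is hypothetical and the term id is illustrative; the flag values come from the argparse choices above):

# incremental update of everything (the defaults)
python scrape.py
# scrape people only, skip votes and debates
python scrape.py --people recent --votes none --debates none
# incremental scrape pinned to a specific term (hypothetical term id)
python scrape.py --votes recent --term 6
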
Example #4
# adding some more motions and vote-events (> 60536)

import scrapeutils
import vpapi
import authentication
import io
import os.path
import logging
from datetime import date, datetime, timedelta
import argparse

LOGS_DIR = '/var/log/scrapers/cz/psp'

vpapi.parliament('cz/psp')
vpapi.authorize(authentication.username, authentication.password)
vpapi.timezone('Europe/Prague')


#motions, vote-events, votes:
def guess_majority(quorum, present):
    # a quorum of 120 corresponds to a constitutional (two-thirds) majority
    if int(quorum) == 120:
        return 'two-thirds representatives majority'
    elif int(quorum) == 101 and int(present) < 200:
        return 'all representatives majority'
    else:
        return 'simple majority'

def result2result(res):
    if res == "A":
        return "pass"
    else:
        # assumed completion: any code other than "A" is treated as not passed
        return "fail"
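
These two helpers map raw values from the voting pages onto the API vocabulary. A few illustrative calls (the quorum and result codes mirror the branches above):

print(guess_majority('120', '180'))  # two-thirds representatives majority
print(guess_majority('101', '150'))  # all representatives majority
print(guess_majority('101', '200'))  # simple majority
print(result2result('A'))            # pass
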
Example #5
def scrape(countries, people, votes):
    global effective_date
    effective_date = date.today().isoformat()

    # instantiate a scraper for each supported parliament
    georgia = georgia_scraper.GeorgiaScraper()
    armenia = armenia_scraper.ArmeniaScraper()
    ukraine = ukraine_scraper.UkraineScraper()
    belarus_lowerhouse = belarus_lowerhouse_scraper.BelarusLowerhouseScraper()
    belarus_upperhouse = belarus_upperhouse_scraper.BelarusUpperhouseScraper()
    moldova = moldova_scraper.MoldovaScraper()
    references = {"georgia": georgia, "armenia": armenia, "ukraine": ukraine,
                  "belarus-lowerhouse": belarus_lowerhouse, "moldova": moldova,
                  "belarus-upperhouse": belarus_upperhouse}
    countries_array = []
    if countries == "all":
        for key in references:
            countries_array.append(key)
    else:
        countries_array = countries.split(',')
        indexes = []
        for country in countries_array:
            if country.lower() not in references:
                indexes.append(countries_array.index(country))
        # pop from the end so earlier indexes remain valid
        for i in sorted(indexes, reverse=True):
            countries_array.pop(i)
    with open(os.path.join(BASE_DIR, 'access.json')) as f:
        creds = json.load(f)
    if len(countries_array) > 0:
        for item in sorted(countries_array):
            if internet_on(): # scrape and post data from parliaments if there's internet connection
                print "\n\tPosting and updating data from %s parliament" % item
                print "\tThis may take a few minutes..."
                vpapi.parliament(creds[item.lower()]['parliament'])
                vpapi.timezone(creds[item.lower()]['timezone'])
                vpapi.authorize(creds[item.lower()]['api_user'], creds[item.lower()]['password'])
                if people == "yes":
                    members = references[item.lower()].scrape_mp_bio_data()
                    chamber = references[item.lower()].scrape_chamber()
                    parliamentary_groups = references[item.lower()].scrape_parliamentary_groups()
                    committee = references[item.lower()].scrape_committee()
                    data_collections = {
                        "a-people": members,
                        "b-chamber": chamber,
                        "c-parliamentary_groups": parliamentary_groups,
                        "d-committe": committee
                    }
                    # inserts data for each data collection in Visegrad+ Api
                    for collection in sorted(set(data_collections)):
                        widgets = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                   ' ', ETA(), " - Processed: ", Counter(), ' items             ']
                        pbar = ProgressBar(widgets=widgets)
                        print "\n\tPosting and updating data to the Visegrad+ from %s data collection\n\n" % \
                              collection[2:]
                        if len(data_collections[collection]) > 0:
                            for json_doc in pbar(data_collections[collection]):
                                if collection == "a-people":
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                    collection_of_data = "people"
                                elif collection == "c-parliamentary_groups" or collection == "d-committe":
                                    if item.lower() == "armenia" or item.lower() == "belarus-upperhouse"\
                                            or item.lower() == "ukraine":
                                        where_condition = {'name': json_doc['name'], "parent_id": json_doc['parent_id']}
                                    else:
                                        where_condition = {'name': json_doc['name']}
                                    collection_of_data = "organizations"
                                elif collection == "b-chamber":
                                    where_condition = {'identifiers': {'$elemMatch': json_doc['identifiers'][0]}}
                                    collection_of_data = "organizations"

                                existing = vpapi.getfirst(collection_of_data, where=where_condition)
                                if not existing:
                                    resp = vpapi.post(collection_of_data, json_doc)
                                else:
                                    json_obj_id = existing['id']
                                    items_to_delete = ["created_at", "updated_at", "_links", "id"]
                                    for item_delete in items_to_delete:
                                        del existing[item_delete]
                                    if json.loads(json.dumps(json_doc)) == existing:
                                        continue
                                    else:
                                        # update by PUT is preferred over PATCH to correctly
                                        # remove properties that no longer exist now
                                        resp = vpapi.put(collection_of_data, json_obj_id, json_doc, effective_date=effective_date)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")

                            print "\n\tFinished Posting and updating data from %s data collection\n" % collection[2:]
                    if item.lower() != "georgia":
                        memberships = {
                            "chambers": references[item.lower()].scrape_membership(),
                            "parliamentary_groups": references[item.lower()].scrape_parliamentary_group_membership(),
                            "committees": references[item.lower()].scrape_committee_members()
                        }
                    elif item.lower() == "georgia":
                        memberships = {
                            "chambers": references[item.lower()].scrape_membership()
                        }

                    for data_collection in memberships:
                        widgets_stat = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                        ' ', ETA(), " - Processed: ", Counter(), ' items             ']
                        prog_bar = ProgressBar(widgets=widgets_stat)
                        if len(memberships[data_collection]) > 0:
                            print "\n\tPosting and updating data from %s membership data collection\n" % data_collection
                            for json_doc in prog_bar(memberships[data_collection]):
                                existing = vpapi.getfirst("memberships", where={'organization_id': json_doc['organization_id'],
                                                                                "person_id": json_doc['person_id']})
                                if not existing:
                                    resp = vpapi.post("memberships", json_doc)
                                else:
                                    json_obj_id = existing['id']
                                    items_to_delete = ["created_at", "updated_at", "_links", "id"]
                                    for item_delete in items_to_delete:
                                        del existing[item_delete]
                                    if json.loads(json.dumps(json_doc)) == existing:
                                        continue
                                    else:
                                        resp = vpapi.put("memberships", json_obj_id, json_doc, effective_date=effective_date)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                            print "\n\tFinished Posted and updated data from %s membership data collection\n" % data_collection
                        else:
                            print "\n\tThere is no data from %s membership data collection\n" % data_collection
                            continue
                if votes == "yes":
                    if item.lower() == "ukraine":
                        events = references[item.lower()].scrape_events()
                        try:
                            if len(events) > 0:
                                widgets_events = ['        Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'),
                                           ' ', ETA(), " - Processed: ", Counter(), ' items             ']
                                pbar_events = ProgressBar(widgets=widgets_events)
                                for json_doc in pbar_events(events):
                                    existing_event = vpapi.getfirst("events", where={'identifier': json_doc['identifier']})
                                    if not existing_event:
                                        resp = vpapi.post("events", json_doc)
                                    else:
                                        resp = vpapi.put("events", json_doc['id'], json_doc, effective_date=effective_date)
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                print "\n\tFinished Posting and updating data from events data collection"
                            else:
                                print "\n\tThere are no new events"
                        except BaseException as ex:
                            print ex.message
                        motions_vote_events = references[item.lower()].vote_events()
                        voting_results = references[item.lower()].scrape_votes()
                        try:
                            if len(voting_results) > 0:
                                resp = vpapi.post("votes", voting_results)
                                if resp["_status"] != "OK":
                                    raise Exception("Invalid status code")
                                print "\n\tFinished Posting and updating data from votes data collection"
                        except BaseException as ex:
                            print ex.message
                    elif item.lower() == "georgia":
                        voting_data_collections = {
                            "amotions": references[item.lower()].motions(),
                            "bvote-events": references[item.lower()].vote_events(),
                        }
                        # a local name avoids shadowing the `votes` argument of scrape()
                        georgia_votes = references[item.lower()].scrape_votes()
                        for collection in sorted(voting_data_collections):
                            try:
                                if len(voting_data_collections[collection]) > 0:
                                    resp = vpapi.post(collection[1:], voting_data_collections[collection])
                                    if resp["_status"] != "OK":
                                        raise Exception("Invalid status code")
                                    print "\n\tFinished Posting and updating data from %s data collection" % collection[1:]
                            except BaseException as ex:
                                print ex.message

                        print "\n\tPosting voting records from Georgia Parliament\n"
                        try:
                            if len(georgia_votes) > 0:
                                vpapi.post("votes", georgia_votes)
                            print "\n\tFinished Posting and updating data from votes data collection"
                        except BaseException as ex:
                            print ex.message
                    else:
                        print "\n\tThere are no voting records for %s" % item
                vpapi.deauthorize()
            else:
                print "\n\tInternet connection problems for %s official parliament web page" % item
                continue
    else:
        print "\n\tInvalid country/ies added"
Example #6
logname = os.path.join(LOGS_DIR, logname)
logname = os.path.abspath(logname)
logging.basicConfig(level=logging.DEBUG,
                    format='%(message)s',
                    handlers=[logging.FileHandler(logname, 'w', 'utf-8')])
logging.getLogger('requests').setLevel(logging.ERROR)

logging.info(datetime.utcnow().strftime('%Y-%m-%d-%H:%M:%S') + '\tStarted 2')
db_log = vpapi.post('logs', {
    'status': 'running',
    'file': logname,
    'params': []
})

vpapi.parliament('cz/senat')
vpapi.authorize(authentication.username, authentication.password)
vpapi.timezone('Europe/Prague')

# build name -> id lookup tables for organizations and people
o2id = {}
organizations = vpapi.getall("organizations")
for org in organizations:
    o2id[org['name']] = org['id']

p2id = {}
persons = vpapi.getall('people')
for p in persons:
    p2id[p['name']] = p['id']


def pp2id(name, date, p2id):
    if name == 'Jiří Dienstbier':
Example #7
import scrapeutils
import vpapi
import io

vpapi.parliament("cz/psp")
vpapi.authorize("admin", "secret")


def save(scraped):
    import json

    r = vpapi.get("organizations", where={"identifiers": {"$elemMatch": scraped["identifiers"][0]}})
    if not r["_items"]:
        r = vpapi.post("organizations", scraped)
    else:
        # update by PUT is preferred over PATCH to correctly remove properties that no longer exist now
        existing = r["_items"][0]
        r = vpapi.put("organizations", existing["id"], scraped)
    if r["_status"] != "OK":
        raise Exception(scraped.get("name"), r)
    return r["id"]


# zfile = scrapeutils.download('http://www.psp.cz/eknih/cdrom/opendata/poslanci.zip',zipped=True)
# organy = scrapeutils.zipfile2rows(zfile,'organy.unl')
## chamber:
# for row in organy:
#  if row[2] == '11':
#    term = row[3][3:]
#    org = {