def votes(
        start=None,
        days=1,
        cache_name=None,
        throttle=None,
        no_commit=False,
        autoanalyze=False,
        ):
    """Scrape voting sessions and their individual votes, day by day.

    :param start: first day to scrape, parsed with ``parse_date``. When
        omitted, scraping resumes on the day after the most recent
        ``voting_session.date`` found in the database.
    :param days: number of consecutive days to scrape (coerced with
        ``int``). Scraping stops early once today's date is reached.
    :param cache_name: forwarded to ``create_session``.
    :param throttle: forwarded to ``create_session``, converted to ``float``
        when truthy.
    :param no_commit: when true, roll the transaction back instead of
        committing it.
    :param autoanalyze: when true and the transaction was committed, queue
        one ``calculate_voting_session_loyalty`` job per scraped session.
    :raises RuntimeError: if ``start`` is omitted and the database holds no
        voting session to resume from.
    """
    from mptracker.scraper.votes import VoteScraper

    if start is None:
        latest_date = models.db.session.execute(
            'select date from voting_session '
            'order by date desc limit 1').scalar()
        if latest_date is None:
            # scalar() gives None when the query returns no rows; fail with
            # a readable message instead of a TypeError on `None + ONE_DAY`.
            raise RuntimeError("No voting sessions in the database; "
                               "please specify a start date")
        start = latest_date + ONE_DAY

    else:
        start = parse_date(start)

    days = int(days)

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    vote_scraper = VoteScraper(http_session)

    voting_session_patcher = TablePatcher(
        models.VotingSession,
        models.db.session,
        key_columns=['cdeppk'],
    )

    vote_patcher = TablePatcher(
        models.Vote,
        models.db.session,
        key_columns=['voting_session_id', 'mandate_id'],
    )

    proposal_ids = {p.cdeppk_cdep: p.id for p in models.Proposal.query}
    mandate_lookup = models.MandateLookup()

    # ids of every voting session seen in this run, used for autoanalyze
    new_voting_session_list = []

    with voting_session_patcher.process() as add_voting_session:
        with vote_patcher.process() as add_vote:
            for delta in range(days):
                the_date = start + ONE_DAY * delta
                if the_date >= date.today():
                    # don't scrape today, maybe voting is not done yet!
                    break

                logger.info("Scraping votes from %s", the_date)
                for voting_session in vote_scraper.scrape_day(the_date):
                    record = model_to_dict(
                        voting_session,
                        ['cdeppk', 'subject', 'subject_html'],
                    )
                    record['date'] = the_date
                    proposal_cdeppk = voting_session.proposal_cdeppk
                    record['proposal_id'] = (proposal_ids.get(proposal_cdeppk)
                                             if proposal_cdeppk else None)
                    # `in` already evaluates to a bool
                    record['final'] = "vot final" in record['subject'].lower()

                    vs = add_voting_session(record).row
                    if vs.id is None:
                        # the id is needed below as the votes' foreign key
                        models.db.session.flush()

                    new_voting_session_list.append(vs.id)

                    for vote in voting_session.votes:
                        record = model_to_dict(vote, ['choice'])
                        record['voting_session_id'] = vs.id
                        mandate = mandate_lookup.find(
                            vote.mandate_name,
                            vote.mandate_year,
                            vote.mandate_number,
                        )
                        record['mandate_id'] = mandate.id
                        add_vote(record)

    if no_commit:
        logger.warning("Rolling back the transaction")
        models.db.session.rollback()

    else:
        models.db.session.commit()

        # Only schedule analysis for committed data: after a rollback the
        # ids collected for newly inserted sessions would point at nothing.
        if autoanalyze:
            from mptracker.votes import calculate_voting_session_loyalty
            logger.info("Scheduling %d jobs", len(new_voting_session_list))
            for voting_session_id in new_voting_session_list:
                calculate_voting_session_loyalty.delay(voting_session_id)
def import_person_xls(xls_path):
    """Import mandates, people, committees and groups from a spreadsheet.

    Each record produced by ``read_person_xls(xls_path)`` is matched to an
    existing mandate through ``MandateLookup.find``; the mandate row is then
    patched, and the person, committee and group data carried by the record
    are collected and written in later passes, one table at a time. The
    session is committed once at the end.

    :param xls_path: passed unchanged to ``read_person_xls``.
    """
    from mptracker.scraper.person_xls import read_person_xls

    db_session = models.db.session
    lookup = models.MandateLookup()

    person_records = []
    # name -> row id; the ids are filled in once the rows have been patched
    committee_ids = {}
    group_ids = {}
    # (mandate_id, name, role) triples, resolved to row ids further down
    pending_committee_links = []
    pending_group_links = []

    mandate_patcher = TablePatcher(
        models.Mandate,
        db_session,
        key_columns=['year', 'cdep_number'],
    )
    with mandate_patcher.process() as add_mandate:
        for record in read_person_xls(xls_path):
            # strip everything that is not a Mandate column before patching
            found = lookup.find(
                record.pop('name'),
                record['year'],
                record['cdep_number'],
            )

            person_record = record.pop('person_data')
            person_record['id'] = found.person_id
            person_records.append(person_record)

            committee_entries = record.pop('committees')
            group_entry = record.pop('mp_group')

            mandate_row = add_mandate(record).row

            for entry in committee_entries:
                committee_ids[entry['name']] = None
                pending_committee_links.append(
                    (mandate_row.id, entry['name'], entry['role']))

            group_ids[group_entry['short_name']] = None
            pending_group_links.append(
                (mandate_row.id,
                 group_entry['short_name'],
                 group_entry['role']))

    person_patcher = TablePatcher(
        models.Person,
        db_session,
        key_columns=['id'],
    )
    with person_patcher.process() as add_person:
        for person_record in person_records:
            add_person(person_record)

    committee_patcher = TablePatcher(
        models.MpCommittee,
        db_session,
        key_columns=['name'],
    )
    with committee_patcher.process() as add_committee:
        for committee_name in tuple(committee_ids):
            committee_row = add_committee({'name': committee_name}).row
            committee_ids[committee_name] = committee_row.id

    committee_link_patcher = TablePatcher(
        models.MpCommitteeMembership,
        db_session,
        key_columns=['mandate_id', 'mp_committee_id'],
    )
    with committee_link_patcher.process() as add_committee_link:
        for mandate_id, committee_name, role in pending_committee_links:
            add_committee_link({
                'mandate_id': mandate_id,
                'mp_committee_id': committee_ids[committee_name],
                'role': role,
            })

    group_patcher = TablePatcher(
        models.MpGroup,
        db_session,
        key_columns=['short_name'],
    )
    with group_patcher.process() as add_group:
        for short_name in tuple(group_ids):
            group_row = add_group({'short_name': short_name}).row
            group_ids[short_name] = group_row.id

    group_link_patcher = TablePatcher(
        models.MpGroupMembership,
        db_session,
        key_columns=['mandate_id', 'mp_group_id'],
    )
    with group_link_patcher.process() as add_group_link:
        for mandate_id, short_name, role in pending_group_links:
            add_group_link({
                'mandate_id': mandate_id,
                'mp_group_id': group_ids[short_name],
                'role': role,
            })

    db_session.commit()
def transcripts(start=None, n_sessions=1, cache_name=None, throttle=None):
    """Scrape transcript sessions and store their chapters and paragraphs.

    :param start: identifier of the first session to fetch (coerced with
        ``int``). When omitted, it is derived from the highest
        ``transcript_chapter.serial`` in the database: the part before the
        first ``/`` plus one.
    :param n_sessions: number of consecutive session ids to try (coerced
        with ``int``). Ids for which the scraper returns ``None`` are
        logged and skipped but still count.
    :param cache_name: forwarded to ``create_session``.
    :param throttle: forwarded to ``create_session``, converted to ``float``
        when truthy.
    :raises RuntimeError: if ``start`` is omitted and the database holds no
        transcript chapter to resume from.
    """
    from mptracker.scraper.transcripts import TranscriptScraper

    if start is None:
        # NOTE(review): the serial is handled as a string here, so the
        # database presumably orders it as text -- confirm that this
        # really yields the numerically highest session.
        max_serial = models.db.session.execute(
            'select serial from transcript_chapter '
            'order by serial desc limit 1').scalar()
        if max_serial is None:
            # scalar() gives None when the query returns no rows; fail with
            # a readable message instead of an AttributeError on `.split`.
            raise RuntimeError("No transcript chapters in the database; "
                               "please specify a start session")
        start = int(max_serial.split('/')[0]) + 1

    first_cdeppk = int(start)
    n_sessions = int(n_sessions)

    transcript_scraper = TranscriptScraper(
        session=create_session(cache_name=cache_name,
                               throttle=throttle and float(throttle)))

    mandate_lookup = models.MandateLookup()

    transcript_patcher = TablePatcher(models.Transcript,
                                      models.db.session,
                                      key_columns=['serial'])

    with transcript_patcher.process() as add:
        for cdeppk in range(first_cdeppk, first_cdeppk + n_sessions):
            logger.info("Fetching session %s", cdeppk)
            session_data = transcript_scraper.fetch_session(cdeppk)
            if session_data is None:
                logger.info("No content")
                continue

            for chapter in session_data.chapters:
                chapter_row = (models.TranscriptChapter.query
                               .filter_by(serial=chapter.serial)
                               .first())
                if chapter_row is None:
                    chapter_row = models.TranscriptChapter(
                        serial=chapter.serial)
                    models.db.session.add(chapter_row)
                    # flush so that chapter_row.id is available below
                    models.db.session.flush()

                chapter_row.date = session_data.date
                chapter_row.headline = chapter.headline

                for paragraph in chapter.paragraphs:
                    # only paragraphs whose speaker is in chamber 2 are kept
                    if paragraph['mandate_chamber'] != 2:
                        continue

                    try:
                        mandate = mandate_lookup.find(
                            paragraph['speaker_name'],
                            paragraph['mandate_year'],
                            paragraph['mandate_number'])
                    except models.LookupError as e:
                        # an unknown speaker skips the paragraph, not the run
                        logger.warning("at %s %s", paragraph['serial'], e)
                        continue

                    add({
                        'chapter_id': chapter_row.id,
                        'text': paragraph['text'],
                        'serial': paragraph['serial'],
                        'mandate_id': mandate.id,
                    })

    models.db.session.commit()
def questions(
        year='2013',
        reimport_existing=False,
        cache_name=None,
        throttle=None,
        autoanalyze=False,
        ):
    """Scrape the questions of one year and record who asked them.

    :param year: year handed to the scraper (coerced with ``int``).
    :param reimport_existing: when false, URLs of questions already in the
        database are given to the scraper through its ``skip`` callback;
        when true, no URL is reported as known.
    :param cache_name: forwarded to ``create_session``.
    :param throttle: forwarded to ``create_session``, converted to ``float``
        when truthy.
    :param autoanalyze: when true, queue OCR and policy-domain jobs for the
        questions whose rows the patcher reported as changed.
    """
    from mptracker.scraper.questions import QuestionScraper
    from mptracker.questions import ocr_question
    from mptracker.policy import calculate_question

    known_urls = (set() if reimport_existing
                  else {row.url for row in models.Question.query})

    def skip_question(url):
        return url in known_urls

    http_session = create_session(
        cache_name=cache_name,
        throttle=throttle and float(throttle),
        counters=True,
    )
    questions_scraper = QuestionScraper(
        session=http_session,
        skip=skip_question,
    )

    mandate_lookup = models.MandateLookup()

    question_patcher = TablePatcher(
        models.Question,
        models.db.session,
        key_columns=['number', 'date'],
    )

    new_ask_rows = 0
    changed = []

    with question_patcher.process() as add_question:
        for scraped in questions_scraper.run(int(year)):
            askers = scraped.pop('person')
            scraped['addressee'] = '; '.join(scraped['addressee'])

            result = add_question(scraped)
            question_row = result.row

            # existing asks not confirmed by the scrape remain in here
            unmatched = {ask.mandate_id: ask for ask in question_row.asked}

            for name, person_year, person_number in askers:
                mandate = mandate_lookup.find(
                    name, person_year, person_number)

                if mandate.id in unmatched:
                    del unmatched[mandate.id]
                    continue

                new_ask = models.Ask(mandate=mandate)
                question_row.asked.append(new_ask)
                new_ask.set_meta('new', True)
                logger.info("Adding ask for %s: %s", question_row, mandate)
                new_ask_rows += 1

            if result.is_changed:
                changed.append(question_row)

            # every stored ask must have been matched by a scraped asker
            assert not unmatched

    models.db.session.commit()

    if new_ask_rows:
        logger.info("Added %d ask records", new_ask_rows)

    counters = http_session.counters
    logger.info(
        "HTTP: %d kb in %s requests, %.2f seconds",
        counters['bytes'] / 1024,
        counters['requests'],
        counters['download_time'].total_seconds(),
    )

    if autoanalyze:
        logger.info("Scheduling jobs for %d questions", len(changed))
        for changed_question in changed:
            if changed_question.pdf_url:
                ocr_question.delay(changed_question.id, autoanalyze=True)

            if changed_question.policy_domain_id is None:
                calculate_question.delay(changed_question.id)
def groups(
        cache_name=None,
        throttle=None,
        no_commit=False,
        ):
    """Scrape parliamentary groups and rebuild the membership intervals.

    The first group returned by ``GroupScraper.fetch`` must be the
    independents group (asserted). Membership intervals are collected per
    mandate, gaps between consecutive intervals are filled with a membership
    in the independents group, and the result is written to ``MpGroup`` and
    ``MpGroupMembership``. Both tables are patched with ``remove=True``.

    :param cache_name: forwarded to ``create_session``.
    :param throttle: forwarded to ``create_session``, converted to ``float``
        when truthy.
    :param no_commit: when true, roll the transaction back instead of
        committing it.
    :raises RuntimeError: if two intervals of the same mandate overlap.
    """
    from mptracker.scraper.groups import GroupScraper, Interval

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    group_scraper = GroupScraper(http_session)

    mandate_lookup = models.MandateLookup()
    mandate_intervals = defaultdict(list)

    # named group_list so the local does not shadow this function's name
    group_list = list(group_scraper.fetch())
    independents = group_list[0]
    assert independents.is_independent

    # Independents go last: their intervals start where the mandate's
    # latest interval collected so far ends.
    for group in group_list[1:] + [independents]:
        for member in group.current_members + group.former_members:
            (year, chamber, number) = member.mp_ident
            assert chamber == 2
            mandate = mandate_lookup.find(member.mp_name, year, number)
            interval_list = mandate_intervals[mandate]

            interval = member.get_interval()
            if interval.start is None:
                interval = interval._replace(start=TERM_2012_START)

            if group.is_independent:
                if interval_list:
                    start = interval_list[-1].end
                    interval = interval._replace(start=start)

            interval_list.append(interval)
            # order by the first tuple field (presumably the start date)
            interval_list.sort(key=lambda i: i[0])

    for mandate, interval_list in mandate_intervals.items():
        # make sure interval_list are continuous
        new_intervals = []
        for interval_one, interval_two in \
                zip(interval_list[:-1], interval_list[1:]):

            assert interval_one.start < interval_one.end

            if interval_one.end < interval_two.start:
                # fill the gap with a membership in the independents group
                interval = Interval(
                    start=interval_one.end,
                    end=interval_two.start,
                    group=independents,
                )
                new_intervals.append(interval)

            elif interval_one.end > interval_two.start:
                raise RuntimeError("Overlapping intervals")

        interval_list.extend(new_intervals)
        interval_list.sort()

        # an upper bound of date.max is treated as "no end date"
        mandate_end = mandate.interval.upper
        if mandate_end == date.max:
            mandate_end = None

        if interval_list[-1].end != mandate_end:
            logger.warning("Mandate %s ends at %s",
                           mandate, interval_list[-1].end)

    group_patcher = TablePatcher(
        models.MpGroup,
        models.db.session,
        key_columns=['short_name'],
    )

    with group_patcher.process(remove=True) as add_group:
        for group in group_list:
            record = group.as_dict(['name', 'short_name'])
            # keep the row on the scraped group; intervals reach it through
            # interval.group.row below
            group.row = add_group(record).row

    # flush so that the group rows have ids before memberships use them
    models.db.session.flush()

    membership_patcher = TablePatcher(
        models.MpGroupMembership,
        models.db.session,
        key_columns=['mandate_id', 'mp_group_id', 'interval'],
    )

    with membership_patcher.process(
            autoflush=1000,
            remove=True,
            ) as add_membership:
        for mandate, interval_list in mandate_intervals.items():
            for interval in interval_list:
                add_membership({
                    'mandate_id': mandate.id,
                    'mp_group_id': interval.group.row.id,
                    # missing bounds are stored as date.min / date.max
                    'interval': DateRange(
                        interval.start or date.min,
                        interval.end or date.max,
                    ),
                })

    if no_commit:
        logger.warning("Rolling back the transaction")
        models.db.session.rollback()

    else:
        models.db.session.commit()
def votes(
        start=None,
        n_days=1,
        cache_name=None,
        throttle=None,
        ):
    """Scrape voting sessions and their individual votes, then commit.

    :param start: first day to scrape, parsed with ``parse_date``. When
        omitted, scraping resumes on the day after the most recent
        ``voting_session.date`` found in the database.
    :param n_days: number of consecutive days to scrape (coerced with
        ``int``). Scraping stops early once today's date is reached.
    :param cache_name: forwarded to ``create_session``.
    :param throttle: forwarded to ``create_session``, converted to ``float``
        when truthy.
    :raises RuntimeError: if ``start`` is omitted and the database holds no
        voting session to resume from.
    """
    from mptracker.scraper.votes import VoteScraper

    if start is None:
        latest_date = models.db.session.execute(
            'select date from voting_session '
            'order by date desc limit 1').scalar()
        if latest_date is None:
            # scalar() gives None when the query returns no rows; fail with
            # a readable message instead of a TypeError on `None + ONE_DAY`.
            raise RuntimeError("No voting sessions in the database; "
                               "please specify a start date")
        start = latest_date + ONE_DAY

    else:
        start = parse_date(start)

    n_days = int(n_days)

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    vote_scraper = VoteScraper(http_session)

    voting_session_patcher = TablePatcher(
        models.VotingSession,
        models.db.session,
        key_columns=['cdeppk'],
    )

    vote_patcher = TablePatcher(
        models.Vote,
        models.db.session,
        key_columns=['voting_session_id', 'mandate_id'],
    )

    proposal_ids = {p.cdeppk_cdep: p.id for p in models.Proposal.query}
    mandate_lookup = models.MandateLookup()

    with voting_session_patcher.process() as add_voting_session:
        with vote_patcher.process() as add_vote:
            for delta in range(n_days):
                the_date = start + ONE_DAY * delta
                if the_date >= date.today():
                    # don't scrape today, maybe voting is not done yet!
                    break

                logger.info("Scraping votes from %s", the_date)
                for voting_session in vote_scraper.scrape_day(the_date):
                    record = model_to_dict(
                        voting_session,
                        ['cdeppk', 'subject', 'subject_html'],
                    )
                    record['date'] = the_date
                    proposal_cdeppk = voting_session.proposal_cdeppk
                    record['proposal_id'] = (proposal_ids.get(proposal_cdeppk)
                                             if proposal_cdeppk else None)

                    vs = add_voting_session(record).row
                    if vs.id is None:
                        # the id is needed below as the votes' foreign key
                        models.db.session.flush()

                    for vote in voting_session.votes:
                        record = model_to_dict(vote, ['choice'])
                        record['voting_session_id'] = vs.id
                        mandate = mandate_lookup.find(
                            vote.mandate_name,
                            vote.mandate_year,
                            vote.mandate_number,
                        )
                        record['mandate_id'] = mandate.id
                        add_vote(record)

    models.db.session.commit()