def proposals(
    cache_name=None,
    throttle=None,
    autoanalyze=False,
):
    """Scrape proposals from the MP pages and sync them into the database.

    For every scraped proposal this:

    * creates or updates the ``Proposal`` row, reusing the id of an existing
      row that has the same ``cdeppk_cdep`` or ``cdeppk_senate``; if the two
      keys point at different rows, the senate-keyed row is deleted;
    * adds and deletes ``Sponsorship`` rows so that they match the scraped
      sponsors;
    * creates or updates the ``ProposalActivityItem`` rows, reusing the ids
      of the stored items when the stored history is a prefix of the scraped
      history (compared on ``date`` and ``location``).

    Both table patchers run with ``remove=True``.
    NOTE(review): presumably that makes them delete rows which were not
    re-added during this run -- confirm against ``TablePatcher``.

    Args:
        cache_name: forwarded to ``create_session``.
        throttle: forwarded to ``create_session`` after conversion to
            ``float``; any falsy value is passed as ``None``.
        autoanalyze: when true, schedule an OCR job for each changed
            proposal that has a ``pdf_url`` and a policy job for each seen
            proposal whose ``policy_domain_id`` is ``None``.
    """
    from mptracker.scraper.proposals import ProposalScraper
    from mptracker.proposals import ocr_proposal
    from mptracker.policy import calculate_proposal

    proposal_scraper = ProposalScraper(create_session(
        cache_name=cache_name,
        throttle=float(throttle) if throttle else None))

    def cdep_id(mandate):
        """Return the ``(year, cdep_number)`` key of a mandate."""
        return (mandate.year, mandate.cdep_number)

    def act_fields(activity):
        """Return the fields used to compare stored and scraped activity."""
        return (activity.date, activity.location)

    # Only mandates from 2012 are considered.
    by_cdep_id = {cdep_id(m): m
                  for m in models.Mandate.query
                  if m.year == 2012}

    # Map each chamber-specific key to the id of the existing proposal row.
    id_cdeppk_cdep = {}
    id_cdeppk_senate = {}
    for proposal in models.Proposal.query:
        if proposal.cdeppk_cdep:
            id_cdeppk_cdep[proposal.cdeppk_cdep] = proposal.id
        if proposal.cdeppk_senate:
            id_cdeppk_senate[proposal.cdeppk_senate] = proposal.id

    chamber_by_slug = {c.slug: c for c in models.Chamber.query}

    scraped_proposals = proposal_scraper.fetch_from_mp_pages(set(by_cdep_id))

    # Stored activity items grouped by the proposal they belong to.
    all_activity = defaultdict(list)
    for item in models.ProposalActivityItem.query:
        all_activity[item.proposal_id].append(item)

    proposal_patcher = TablePatcher(models.Proposal,
                                    models.db.session,
                                    key_columns=['id'])

    activity_patcher = TablePatcher(models.ProposalActivityItem,
                                    models.db.session,
                                    key_columns=['id'])

    # These count proposals whose sponsor list changed, not sponsorships.
    sp_updates = sp_added = sp_removed = 0

    changed = []
    seen = []

    with proposal_patcher.process(autoflush=1000, remove=True) as add_proposal:
        with activity_patcher.process(autoflush=1000, remove=True) \
                as add_activity:
            for prop in scraped_proposals:
                record = model_to_dict(prop, [
                    'cdeppk_cdep', 'cdeppk_senate', 'decision_chamber',
                    'url', 'title', 'date', 'number_bpi', 'number_cdep',
                    'number_senate', 'proposal_type', 'pdf_url'])

                # Replace the chamber slug with the Chamber row itself.
                slug = prop.decision_chamber
                if slug:
                    record['decision_chamber'] = chamber_by_slug[slug]

                idc = id_cdeppk_cdep.get(prop.cdeppk_cdep)
                ids = id_cdeppk_senate.get(prop.cdeppk_senate)
                if idc and ids and idc != ids:
                    logger.warning(
                        "Two different records for the same proposal: "
                        "(%s, %s). Removing the 2nd.", idc, ids)
                    models.db.session.delete(models.Proposal.query.get(ids))
                    ids = None
                record['id'] = idc or ids or models.random_uuid()

                result = add_proposal(record)
                row = result.row
                if result.is_changed:
                    changed.append(row)
                seen.append(row)

                # Bring the stored sponsorships in line with the scraped ones.
                new_people = {by_cdep_id[ci] for ci in prop.sponsorships}
                existing_sponsorships = {sp.mandate: sp
                                         for sp in row.sponsorships}
                to_remove = set(existing_sponsorships) - new_people
                to_add = new_people - set(existing_sponsorships)
                if to_remove:
                    logger.info("Removing sponsors %s: %r", row.id,
                                [cdep_id(m) for m in to_remove])
                    sp_removed += 1
                    for m in to_remove:
                        models.db.session.delete(existing_sponsorships[m])
                if to_add:
                    logger.info("Adding sponsors %s: %r", row.id,
                                [cdep_id(m) for m in to_add])
                    sp_added += 1
                    for m in to_add:
                        row.sponsorships.append(models.Sponsorship(mandate=m))

                if to_remove or to_add:
                    sp_updates += 1

                # Stored ids are reused only when the stored history is a
                # prefix of the scraped history; otherwise every scraped item
                # gets a fresh id.
                db_activity = all_activity[row.id]
                db_activity.sort(key=lambda a: a.order)
                stored_fields = [act_fields(r) for r in db_activity]
                scraped_fields = [act_fields(r)
                                  for r in prop.activity[:len(db_activity)]]
                if stored_fields != scraped_fields:
                    logger.warning("History doesn't match for %s, "
                                   "%d items will be removed",
                                   row.id, len(db_activity))
                    db_activity = []

                for n, ac in enumerate(prop.activity):
                    activity_record = model_to_dict(
                        ac, ['date', 'location', 'html'])
                    activity_record['proposal_id'] = row.id
                    activity_record['order'] = n
                    if n < len(db_activity):
                        item = db_activity[n]
                        activity_record['id'] = item.id
                        # Internal sanity checks on the matched stored item.
                        assert item.date == activity_record['date']
                        assert item.location == activity_record['location']
                        assert item.order == activity_record['order']
                    else:
                        activity_record['id'] = models.random_uuid()
                    add_activity(activity_record)

    models.db.session.commit()

    logger.info("Updated sponsorship for %d proposals (+%d, -%d)",
                sp_updates, sp_added, sp_removed)

    if autoanalyze:
        logger.info("Scheduling analysis jobs for %d proposals", len(changed))
        for proposal in changed:
            if proposal.pdf_url:
                ocr_proposal.delay(proposal.id, autoanalyze=True)

        logger.info("Scheduling policy jobs for %d proposals", len(seen))
        for proposal in seen:
            if proposal.policy_domain_id is None:
                calculate_proposal.delay(proposal.id)
def get_proposals(
    autoanalyze=False,
    no_commit=False,
    limit=None,
):
    """Rebuild proposals from stored scraped pages and sync sponsorships.

    For each proposal in ``dirty_proposal_set`` this loads the stored senate
    and cdep ``ScrapedProposalPage`` rows, feeds their unpickled results to a
    ``SingleProposalScraper``, marks the pages as parsed, writes the
    resulting record through ``proposal_patcher`` and then adds and deletes
    ``Sponsorship`` rows so that they match the scraped sponsors.

    NOTE(review): within this function ``dirty_proposal_set`` is created
    empty and never filled, and ``index`` and ``limit`` are never read, so
    the loop body does not run as written -- confirm whether the code that
    populates them is missing.
    NOTE(review): ``proposal_patcher``, ``chamber_by_slug`` and
    ``by_cdep_id`` are not defined in this function; presumably they are
    module-level names -- confirm.

    Args:
        autoanalyze: when true, schedule an OCR job for each changed
            proposal that has a ``pdf_url`` and a policy job for each seen
            proposal whose ``policy_domain_id`` is ``None``. Nothing is
            scheduled when ``no_commit`` is true.
        no_commit: when true, roll back the session and return instead of
            committing.
        limit: unused in this function.
    """
    import pickle
    from mptracker.scraper.proposals import SingleProposalScraper
    from mptracker.proposals import ocr_proposal
    from mptracker.policy import calculate_proposal

    index = {'pk_cdep': {}, 'pk_senate': {}}
    dirty_proposal_set = set()

    def cdep_id(mandate):
        """Return the ``(year, cdep_number)`` key of a mandate."""
        return (mandate.year, mandate.cdep_number)

    # These count proposals whose sponsor list changed, not sponsorships.
    sp_updates = sp_added = sp_removed = 0

    changed = []
    seen = []

    with proposal_patcher.process(autoflush=1000) as add_proposal:
        for proposal in dirty_proposal_set:
            page_cdep = (
                models.ScrapedProposalPage.query
                .filter_by(chamber=2, pk=proposal.cdeppk_cdep)
                .first()
            )
            page_senate = (
                models.ScrapedProposalPage.query
                .filter_by(chamber=1, pk=proposal.cdeppk_senate)
                .first()
            )

            # NOTE(review): pickle.loads can execute arbitrary code on
            # crafted input; this is only safe if ScrapedProposalPage.result
            # is written exclusively by trusted code -- confirm.
            single_scraper = SingleProposalScraper()
            if page_senate:
                single_scraper.scrape_page(
                    'senate', pickle.loads(page_senate.result))
                page_senate.parsed = True
            if page_cdep:
                single_scraper.scrape_page(
                    'cdep', pickle.loads(page_cdep.result))
                page_cdep.parsed = True

            prop = single_scraper.finalize()
            prop.id = proposal.id or models.random_uuid()
            prop.cdeppk_cdep = proposal.cdeppk_cdep
            prop.cdeppk_senate = proposal.cdeppk_senate

            record = prop.as_dict([
                'id', 'cdeppk_cdep', 'cdeppk_senate', 'decision_chamber',
                'url', 'title', 'date', 'number_bpi', 'number_cdep',
                'number_senate', 'proposal_type', 'pdf_url', 'status',
                'status_text', 'modification_date'])

            # The activity history is stored as a JSON string on the record.
            record['activity'] = flask.json.dumps([
                item.as_dict(['date', 'location', 'html'])
                for item in prop.activity
            ])

            # Replace the chamber slug with the object it maps to.
            slug = prop.decision_chamber
            if slug:
                record['decision_chamber'] = chamber_by_slug[slug]

            result = add_proposal(record)
            row = result.row
            if result.is_changed:
                changed.append(row)
            seen.append(row)

            # Bring the stored sponsorships in line with the scraped ones.
            new_people = {by_cdep_id[ci] for ci in prop.sponsorships}
            existing_sponsorships = {sp.mandate: sp
                                     for sp in row.sponsorships}
            to_remove = set(existing_sponsorships) - new_people
            to_add = new_people - set(existing_sponsorships)
            if to_remove:
                logger.info("Removing sponsors %s: %r", row.id,
                            [cdep_id(m) for m in to_remove])
                sp_removed += 1
                for m in to_remove:
                    models.db.session.delete(existing_sponsorships[m])
            if to_add:
                logger.info("Adding sponsors %s: %r", row.id,
                            [cdep_id(m) for m in to_add])
                sp_added += 1
                for m in to_add:
                    row.sponsorships.append(models.Sponsorship(mandate=m))

            if to_remove or to_add:
                sp_updates += 1

    if no_commit:
        logger.warning("Rolling back the transaction")
        models.db.session.rollback()
        return

    models.db.session.commit()

    logger.info("Updated sponsorship for %d proposals (+%d, -%d)",
                sp_updates, sp_added, sp_removed)

    if autoanalyze:
        logger.info("Scheduling analysis jobs for %d proposals", len(changed))
        for proposal in changed:
            if proposal.pdf_url:
                ocr_proposal.delay(proposal.id, autoanalyze=True)

        logger.info("Scheduling policy jobs for %d proposals", len(seen))
        for proposal in seen:
            if proposal.policy_domain_id is None:
                calculate_proposal.delay(proposal.id)