Пример #1
0
def proposals(
        cache_name=None,
        throttle=None,
        autoanalyze=False,
        ):
    """Scrape proposals from the MP pages and sync them into the database.

    For every scraped proposal the matching ``models.Proposal`` row is
    created or updated, its sponsorships are reconciled against the scraped
    sponsor list, and its activity history is written through a second
    patcher.  Everything is committed in one transaction at the end.

    Args:
        cache_name: forwarded unchanged to ``create_session``.
        throttle: forwarded to ``create_session`` as a float; any falsy
            value is passed on as ``None``.
        autoanalyze: when true, after the commit an ``ocr_proposal`` job is
            queued for each changed proposal that has a ``pdf_url`` and a
            ``calculate_proposal`` job for each processed proposal whose
            ``policy_domain_id`` is ``None``.
    """
    from mptracker.scraper.proposals import ProposalScraper
    from mptracker.proposals import ocr_proposal
    from mptracker.policy import calculate_proposal

    proposal_scraper = ProposalScraper(create_session(
            cache_name=cache_name,
            throttle=float(throttle) if throttle else None))

    def cdep_id(mandate):
        """Return the (year, cdep_number) key that identifies a mandate."""
        return (mandate.year, mandate.cdep_number)

    def act_fields(item):
        """Return the fields used to compare stored and scraped activity."""
        return (item.date, item.location)

    # Only mandates whose year is 2012 are considered as sponsors.
    by_cdep_id = {cdep_id(m): m
                  for m in models.Mandate.query
                  if m.year == 2012}

    # Map each chamber-specific key of an existing proposal to its row id.
    id_cdeppk_cdep = {}
    id_cdeppk_senate = {}
    for proposal in models.Proposal.query:
        if proposal.cdeppk_cdep:
            id_cdeppk_cdep[proposal.cdeppk_cdep] = proposal.id
        if proposal.cdeppk_senate:
            id_cdeppk_senate[proposal.cdeppk_senate] = proposal.id

    chamber_by_slug = {c.slug: c for c in models.Chamber.query}

    proposals = proposal_scraper.fetch_from_mp_pages(set(by_cdep_id))

    # Existing activity items grouped by the proposal they belong to.
    all_activity = defaultdict(list)
    for item in models.ProposalActivityItem.query:
        all_activity[item.proposal_id].append(item)

    proposal_patcher = TablePatcher(models.Proposal,
                                    models.db.session,
                                    key_columns=['id'])

    activity_patcher = TablePatcher(models.ProposalActivityItem,
                                    models.db.session,
                                    key_columns=['id'])

    # These counters are incremented once per affected proposal, not once
    # per sponsor.
    sp_updates = sp_added = sp_removed = 0

    changed = []
    seen = []

    # NOTE(review): remove=True presumably makes each patcher delete rows
    # that were not passed to it during the run -- confirm in TablePatcher.
    with proposal_patcher.process(autoflush=1000, remove=True) as add_proposal:
        with activity_patcher.process(autoflush=1000, remove=True) \
                as add_activity:
            for prop in proposals:
                record = model_to_dict(prop, ['cdeppk_cdep', 'cdeppk_senate',
                    'decision_chamber', 'url', 'title', 'date', 'number_bpi',
                    'number_cdep', 'number_senate', 'proposal_type',
                    'pdf_url'])

                # Replace the scraped chamber slug with the Chamber row.
                slug = prop.decision_chamber
                if slug:
                    record['decision_chamber'] = chamber_by_slug[slug]

                # Reuse the id of an existing row found through either
                # chamber key; if the two keys point at different rows the
                # senate-side one is deleted and the cdep-side id is kept.
                idc = id_cdeppk_cdep.get(prop.cdeppk_cdep)
                ids = id_cdeppk_senate.get(prop.cdeppk_senate)
                if idc and ids and idc != ids:
                    logger.warning(
                        "Two different records for the same proposal: "
                        "(%s, %s). Removing the 2nd.", idc, ids)
                    models.db.session.delete(models.Proposal.query.get(ids))
                    ids = None
                record['id'] = idc or ids or models.random_uuid()

                result = add_proposal(record)
                row = result.row
                if result.is_changed:
                    changed.append(row)
                seen.append(row)

                # Reconcile sponsorships: scraped sponsors vs. stored ones.
                new_people = {by_cdep_id[ci] for ci in prop.sponsorships}
                existing_sponsorships = {sp.mandate: sp
                                         for sp in row.sponsorships}
                existing_people = set(existing_sponsorships)
                to_remove = existing_people - new_people
                to_add = new_people - existing_people
                if to_remove:
                    logger.info("Removing sponsors %s: %r", row.id,
                                [cdep_id(m) for m in to_remove])
                    sp_removed += 1
                    for m in to_remove:
                        sp = existing_sponsorships[m]
                        models.db.session.delete(sp)
                if to_add:
                    logger.info("Adding sponsors %s: %r", row.id,
                                [cdep_id(m) for m in to_add])
                    sp_added += 1
                    for m in to_add:
                        row.sponsorships.append(models.Sponsorship(mandate=m))

                if to_remove or to_add:
                    sp_updates += 1

                # The stored history must be a prefix of the scraped one;
                # otherwise no stored id is reused and every scraped item
                # gets a fresh id.
                db_activity = all_activity[row.id]
                db_activity.sort(key=lambda a: a.order)
                stored_fields = [act_fields(r) for r in db_activity]
                scraped_fields = [act_fields(r)
                                  for r in prop.activity[:len(db_activity)]]
                if stored_fields != scraped_fields:
                    logger.warning("History doesn't match for %s, "
                                   "%d items will be removed",
                                   row.id, len(db_activity))
                    db_activity = []

                for n, ac in enumerate(prop.activity):
                    activity_record = model_to_dict(
                        ac, ['date', 'location', 'html'])
                    activity_record['proposal_id'] = row.id
                    activity_record['order'] = n
                    if n < len(db_activity):
                        # Keep the id of the stored item at this position.
                        item = db_activity[n]
                        activity_record['id'] = item.id
                        assert item.date == activity_record['date']
                        assert item.location == activity_record['location']
                        assert item.order == activity_record['order']
                    else:
                        activity_record['id'] = models.random_uuid()
                    add_activity(activity_record)

    models.db.session.commit()

    logger.info("Updated sponsorship for %d proposals (+%d, -%d)",
                sp_updates, sp_added, sp_removed)

    if autoanalyze:
        logger.info("Scheduling analysis jobs for %d proposals", len(changed))
        for proposal in changed:
            if proposal.pdf_url:
                ocr_proposal.delay(proposal.id, autoanalyze=True)

        logger.info("Scheduling policy jobs for %d proposals", len(seen))
        for proposal in seen:
            if proposal.policy_domain_id is None:
                calculate_proposal.delay(proposal.id)
Пример #2
0
def get_proposals(
        autoanalyze=False,
        no_commit=False,
        limit=None,
        ):
    """Rebuild proposal rows from previously scraped proposal pages.

    For each proposal in ``dirty_proposal_set`` the stored cdep and senate
    pages are looked up in ``models.ScrapedProposalPage``, unpickled and fed
    to a ``SingleProposalScraper``; the resulting record is written through
    ``proposal_patcher`` and the row's sponsorships are reconciled against
    the scraped sponsor list.

    Args:
        autoanalyze: when true, after the commit an ``ocr_proposal`` job is
            queued for each changed proposal that has a ``pdf_url`` and a
            ``calculate_proposal`` job for each processed proposal whose
            ``policy_domain_id`` is ``None``.
        no_commit: when true, the session is rolled back instead of
            committed and the function returns before any job is queued.
        limit: not used by the code visible in this block.

    NOTE(review): ``proposal_patcher``, ``chamber_by_slug`` and
    ``by_cdep_id`` are not defined inside this function, and
    ``dirty_proposal_set`` is never populated here, so as written the main
    loop does no work.  Presumably the setup code lives elsewhere or was
    dropped -- confirm before relying on this function.
    """
    import pickle
    from mptracker.scraper.proposals import SingleProposalScraper
    from mptracker.proposals import ocr_proposal
    from mptracker.policy import calculate_proposal

    # NOTE(review): `index` is never read in the visible code.
    index = {'pk_cdep': {}, 'pk_senate': {}}

    dirty_proposal_set = set()

    def cdep_id(mandate):
        """Return the (year, cdep_number) key that identifies a mandate."""
        return (mandate.year, mandate.cdep_number)

    # These counters are incremented once per affected proposal, not once
    # per sponsor.
    sp_updates = sp_added = sp_removed = 0

    changed = []
    seen = []

    with proposal_patcher.process(autoflush=1000) as add_proposal:
        for proposal in dirty_proposal_set:
            page_cdep = (
                models.ScrapedProposalPage.query
                .filter_by(chamber=2, pk=proposal.cdeppk_cdep)
                .first()
            )
            page_senate = (
                models.ScrapedProposalPage.query
                .filter_by(chamber=1, pk=proposal.cdeppk_senate)
                .first()
            )

            single_scraper = SingleProposalScraper()

            # SECURITY NOTE(review): pickle.loads can execute arbitrary code
            # when fed untrusted bytes.  This assumes `result` was written
            # by the project's own scraper -- confirm that nothing external
            # can write to ScrapedProposalPage.
            if page_senate:
                single_scraper.scrape_page('senate',
                    pickle.loads(page_senate.result))
                page_senate.parsed = True

            if page_cdep:
                single_scraper.scrape_page('cdep',
                    pickle.loads(page_cdep.result))
                page_cdep.parsed = True

            prop = single_scraper.finalize()

            # Carry the identity of the existing proposal over to the
            # freshly parsed one; a new id is generated only if it has none.
            prop.id = proposal.id or models.random_uuid()
            prop.cdeppk_cdep = proposal.cdeppk_cdep
            prop.cdeppk_senate = proposal.cdeppk_senate

            record = prop.as_dict(['id', 'cdeppk_cdep', 'cdeppk_senate',
                'decision_chamber', 'url', 'title', 'date', 'number_bpi',
                'number_cdep', 'number_senate', 'proposal_type',
                'pdf_url', 'status', 'status_text', 'modification_date'])

            # The activity history is stored as a JSON string on the record.
            record['activity'] = flask.json.dumps([
                item.as_dict(['date', 'location', 'html'])
                for item in prop.activity
            ])

            # Replace the scraped chamber slug with the Chamber row.
            slug = prop.decision_chamber
            if slug:
                record['decision_chamber'] = chamber_by_slug[slug]

            result = add_proposal(record)
            row = result.row
            if result.is_changed:
                changed.append(row)
            seen.append(row)

            # Reconcile sponsorships: scraped sponsors vs. stored ones.
            new_people = {by_cdep_id[ci] for ci in prop.sponsorships}
            existing_sponsorships = {sp.mandate: sp
                                     for sp in row.sponsorships}
            existing_people = set(existing_sponsorships)
            to_remove = existing_people - new_people
            to_add = new_people - existing_people
            if to_remove:
                logger.info("Removing sponsors %s: %r", row.id,
                            [cdep_id(m) for m in to_remove])
                sp_removed += 1
                for m in to_remove:
                    sp = existing_sponsorships[m]
                    models.db.session.delete(sp)
            if to_add:
                logger.info("Adding sponsors %s: %r", row.id,
                            [cdep_id(m) for m in to_add])
                sp_added += 1
                for m in to_add:
                    row.sponsorships.append(models.Sponsorship(mandate=m))

            if to_remove or to_add:
                sp_updates += 1

    if no_commit:
        logger.warning("Rolling back the transaction")
        models.db.session.rollback()
        return

    models.db.session.commit()

    logger.info("Updated sponsorship for %d proposals (+%d, -%d)",
                sp_updates, sp_added, sp_removed)

    if autoanalyze:
        logger.info("Scheduling analysis jobs for %d proposals", len(changed))
        for proposal in changed:
            if proposal.pdf_url:
                ocr_proposal.delay(proposal.id, autoanalyze=True)

        logger.info("Scheduling policy jobs for %d proposals", len(seen))
        for proposal in seen:
            if proposal.policy_domain_id is None:
                calculate_proposal.delay(proposal.id)