Пример #1
0
def test_simple_scraping(session):
    """Run GroupScraper against local fixture pages and spot-check results.

    The session's ``url_map`` is pointed at pages under ``PAGES_DIR`` for the
    structure index and for groups 0, 1 and 4; the scraped groups are then
    checked member by member.
    """
    from mptracker.scraper.groups import GroupScraper

    # Map the index URL and each per-group URL onto its fixture page.
    fixture_pages = {STRUCTURE_URL: PAGES_DIR / 'structura-index-modified'}
    for idg in ('0', '1', '4'):
        fixture_pages[STRUCTURE_URL + ('?idg=' + idg)] = (
            PAGES_DIR / ('structura-group' + idg))
    session.url_map.update(fixture_pages)

    all_groups = list(GroupScraper(session).fetch())

    # The first scraped group is expected to be the independents.
    independents = all_groups[0]
    assert independents.is_independent
    assert len(independents.current_members) == 13

    third_independent = independents.current_members[2]
    assert third_independent.mp_name == "Dumitru Ovidiu-Ioan"
    assert third_independent.start_date is None
    assert third_independent.end_date is None

    # Second group: PSD, with both current and former members.
    psd_group = all_groups[1]
    assert psd_group.name == "Grupul parlamentar al Partidului Social Democrat"
    assert psd_group.short_name == "PSD"
    assert not psd_group.is_independent
    assert len(psd_group.current_members) == 165
    assert len(psd_group.former_members) == 2

    psd_vice_leader = psd_group.current_members[3]
    assert psd_vice_leader.mp_name == "Itu Cornel"
    assert psd_vice_leader.start_date is None
    assert psd_vice_leader.end_date is None
    assert psd_vice_leader.title == 'Vicelideri'

    # A current member carrying an explicit start date.
    psd_late_joiner = psd_group.current_members[84]
    assert psd_late_joiner.start_date == date(2013, 9, 18)

    psd_first_former = psd_group.former_members[0]
    assert psd_first_former.mp_name == "Cernea Remus-Florinel"
    assert psd_first_former.start_date is None
    assert psd_first_former.end_date == date(2013, 5, 21)

    # Third group: only individual members are spot-checked.
    third_group = all_groups[2]
    third_group_current = third_group.current_members[2]
    assert third_group_current.mp_name == "Ciuhodaru Tudor"

    third_group_former = third_group.former_members[2]
    assert third_group_former.mp_name == "Chebac Eugen"
    assert third_group_former.start_date is None
    assert third_group_former.end_date == date(2013, 9, 30)

    # A former member with both a start and an end date.
    bounded_former = third_group.former_members[4]
    assert bounded_former.start_date == date(2013, 2, 11)
    assert bounded_former.end_date == date(2013, 5, 7)
def test_simple_scraping(session):
    """Scrape groups from fixture pages and verify selected members.

    URLs for the structure index and for groups 0, 1 and 4 are redirected,
    through ``session.url_map``, to files under ``PAGES_DIR``.
    """
    from mptracker.scraper.groups import GroupScraper

    def group_url(idg):
        # URL of a single group's page.
        return STRUCTURE_URL + '?idg=' + idg

    def fixture(name):
        # Location of a saved page in the fixtures directory.
        return PAGES_DIR / name

    session.url_map.update({
        STRUCTURE_URL: fixture('structura-index-modified'),
        group_url('0'): fixture('structura-group0'),
        group_url('1'): fixture('structura-group1'),
        group_url('4'): fixture('structura-group4'),
    })

    group_scraper = GroupScraper(session)
    scraped = list(group_scraper.fetch())

    # --- group 0: independents -------------------------------------------
    no_party = scraped[0]
    assert no_party.is_independent
    assert len(no_party.current_members) == 13

    member = no_party.current_members[2]
    assert member.mp_name == "Dumitru Ovidiu-Ioan"
    assert member.start_date is None
    assert member.end_date is None

    # --- group 1: PSD ----------------------------------------------------
    social_democrats = scraped[1]
    assert (social_democrats.name
            == "Grupul parlamentar al Partidului Social Democrat")
    assert social_democrats.short_name == "PSD"
    assert not social_democrats.is_independent
    assert len(social_democrats.current_members) == 165
    assert len(social_democrats.former_members) == 2

    member = social_democrats.current_members[3]
    assert member.mp_name == "Itu Cornel"
    assert member.start_date is None
    assert member.end_date is None
    assert member.title == 'Vicelideri'

    member = social_democrats.current_members[84]
    assert member.start_date == date(2013, 9, 18)

    member = social_democrats.former_members[0]
    assert member.mp_name == "Cernea Remus-Florinel"
    assert member.start_date is None
    assert member.end_date == date(2013, 5, 21)

    # --- group 2 ---------------------------------------------------------
    last_group = scraped[2]
    member = last_group.current_members[2]
    assert member.mp_name == "Ciuhodaru Tudor"

    member = last_group.former_members[2]
    assert member.mp_name == "Chebac Eugen"
    assert member.start_date is None
    assert member.end_date == date(2013, 9, 30)

    member = last_group.former_members[4]
    assert member.start_date == date(2013, 2, 11)
    assert member.end_date == date(2013, 5, 7)
Пример #3
0
def groups(
        cache_name=None,
        throttle=None,
        no_commit=False,
        ):
    """Scrape parliamentary groups and sync them, with memberships, to the DB.

    Steps:

    1. Fetch every group with ``GroupScraper`` and collect, per mandate, the
       list of membership intervals (kept sorted by their first field).
    2. Fill every gap between two consecutive intervals of a mandate with an
       interval assigned to the independents group.
    3. Patch the ``MpGroup`` table, then the ``MpGroupMembership`` table,
       removing rows that were not seen in this run.
    4. Commit, or roll back when ``no_commit`` is set.

    :param cache_name: forwarded to ``create_session``.
    :param throttle: forwarded to ``create_session``; converted to ``float``
        when truthy, otherwise passed as given.
    :param no_commit: when true, roll the transaction back instead of
        committing it.
    :raises RuntimeError: when two intervals of the same mandate overlap.
    """
    from mptracker.scraper.groups import GroupScraper, Interval

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    group_scraper = GroupScraper(http_session)

    mandate_lookup = models.MandateLookup()
    mandate_intervals = defaultdict(list)

    groups = list(group_scraper.fetch())
    independents = groups[0]
    assert independents.is_independent
    # The independents are handled last: their interval start is derived
    # from the intervals already collected for the same mandate.
    for group in groups[1:] + [independents]:
        for member in group.current_members + group.former_members:
            (year, chamber, number) = member.mp_ident
            assert chamber == 2
            mandate = mandate_lookup.find(member.mp_name, year, number)
            interval_list = mandate_intervals[mandate]

            interval = member.get_interval()
            if interval.start is None:
                # No explicit start: fall back to the start of the term.
                interval = interval._replace(start=TERM_2012_START)

            if group.is_independent:
                if interval_list:
                    # Start where the latest known membership ended.
                    start = interval_list[-1].end
                    interval = interval._replace(start=start)

            interval_list.append(interval)
            # Re-sort after every insert; the independents branch above
            # relies on interval_list[-1] being the latest interval.
            interval_list.sort(key=lambda i: i[0])

    for mandate, interval_list in mandate_intervals.items():
        # make sure interval_list are continuous
        new_intervals = []
        for interval_one, interval_two in \
            zip(interval_list[:-1], interval_list[1:]):

            assert interval_one.start < interval_one.end
            if interval_one.end < interval_two.start:
                # Gap between two memberships: fill it with independents.
                interval = Interval(
                    start=interval_one.end,
                    end=interval_two.start,
                    group=independents,
                )
                new_intervals.append(interval)
            elif interval_one.end > interval_two.start:
                raise RuntimeError("Overlapping intervals")

        interval_list.extend(new_intervals)
        interval_list.sort()

        # date.max on the mandate is compared as None against the interval end.
        mandate_end = mandate.interval.upper
        if mandate_end == date.max:
            mandate_end = None
        if interval_list[-1].end != mandate_end:
            # Logger.warn is a deprecated alias; use Logger.warning.
            logger.warning("Mandate %s ends at %s",
                           mandate, interval_list[-1].end)

    group_patcher = TablePatcher(
        models.MpGroup,
        models.db.session,
        key_columns=['short_name'],
    )

    with group_patcher.process(remove=True) as add_group:
        for group in groups:
            record = group.as_dict(['name', 'short_name'])
            # Keep the DB row on the scraped group; its id is needed below.
            group.row = add_group(record).row

        models.db.session.flush()

    membership_patcher = TablePatcher(
        models.MpGroupMembership,
        models.db.session,
        key_columns=['mandate_id', 'mp_group_id', 'interval'],
    )

    with membership_patcher.process(
            autoflush=1000,
            remove=True,
        ) as add_membership:

        for mandate, interval_list in mandate_intervals.items():
            for interval in interval_list:
                # Open-ended bounds are stored as date.min / date.max.
                add_membership({
                    'mandate_id': mandate.id,
                    'mp_group_id': interval.group.row.id,
                    'interval': DateRange(
                        interval.start or date.min,
                        interval.end or date.max,
                    ),
                })

    if no_commit:
        logger.warning("Rolling back the transaction")
        models.db.session.rollback()

    else:
        models.db.session.commit()
Пример #4
0
def groups(
        cache_name=None,
        throttle=None,
        no_commit=False,
        year='2012',
        ):
    """Scrape one term's parliamentary groups and sync them to the DB.

    Steps:

    1. Fetch the groups for ``year`` with ``GroupScraper`` and collect, per
       mandate, the list of membership intervals (kept sorted by their first
       field). Missing interval bounds default to the bounds of
       ``TERM_INTERVAL[year]``.
    2. Fill every gap between two consecutive intervals of a mandate with an
       interval assigned to the independents group.
    3. Patch the ``MpGroup`` table (restricted to ``year``), then the
       ``MpGroupMembership`` table.
    4. Delete previously stored memberships that were selected by the
       ``year`` query but not seen again in this run.
    5. Commit, or roll back when ``no_commit`` is set.

    :param cache_name: forwarded to ``create_session``.
    :param throttle: forwarded to ``create_session``; converted to ``float``
        when truthy, otherwise passed as given.
    :param no_commit: when true, roll the transaction back instead of
        committing it.
    :param year: term year; passed through ``int()``, so a numeric string is
        accepted.
    :raises RuntimeError: when two intervals of the same mandate overlap.
    """
    year = int(year)

    from mptracker.scraper.groups import GroupScraper, Interval

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    group_scraper = GroupScraper(http_session)

    mandate_lookup = models.MandateLookup()
    mandate_intervals = defaultdict(list)
    term_interval = TERM_INTERVAL[year]

    groups = list(group_scraper.fetch(year))
    independents = groups[0]
    assert independents.is_independent
    # The independents are handled last: their interval start is derived
    # from the intervals already collected for the same mandate.
    for group in groups[1:] + [independents]:
        for member in group.current_members + group.former_members:
            (myear, chamber, number) = member.mp_ident
            assert chamber == 2
            mandate = mandate_lookup.find(member.mp_name, myear, number)
            interval_list = mandate_intervals[mandate]

            interval = member.get_interval()
            # Missing bounds default to the bounds of the term.
            if interval.start is None:
                interval = interval._replace(start=term_interval.lower)

            if interval.end is None:
                interval = interval._replace(end=term_interval.upper)

            if group.is_independent:
                if interval_list:
                    # Start where the latest known membership ended.
                    start = interval_list[-1].end
                    interval = interval._replace(start=start)

            interval_list.append(interval)
            # Re-sort after every insert; the independents branch above
            # relies on interval_list[-1] being the latest interval.
            interval_list.sort(key=lambda i: i[0])

    for mandate, interval_list in mandate_intervals.items():
        # make sure interval_list are continuous
        new_intervals = []
        for interval_one, interval_two in \
            zip(interval_list[:-1], interval_list[1:]):

            assert interval_one.start < interval_one.end
            if interval_one.end < interval_two.start:
                # Gap between two memberships: fill it with independents.
                interval = Interval(
                    start=interval_one.end,
                    end=interval_two.start,
                    group=independents,
                )
                new_intervals.append(interval)
            elif interval_one.end > interval_two.start:
                # Fail outright; no interactive debugger breakpoint here,
                # since it would hang an unattended run.
                raise RuntimeError("Overlapping intervals")

        interval_list.extend(new_intervals)
        interval_list.sort()

        # date.max on the mandate is compared as None against the interval end.
        mandate_end = mandate.interval.upper
        if mandate_end == date.max:
            mandate_end = None
        if interval_list[-1].end != mandate_end:
            # Logger.warn is a deprecated alias; use Logger.warning.
            logger.warning("Mandate %s ends at %s",
                           mandate, interval_list[-1].end)

    group_patcher = TablePatcher(
        models.MpGroup,
        models.db.session,
        key_columns=['short_name', 'year'],
    )

    with group_patcher.process(
            remove=True,
            filter={'year': year},
        ) as add_group:

        for group in groups:
            record = group.as_dict(['name', 'short_name', 'year'])
            # Keep the DB row on the scraped group; its id is needed below.
            group.row = add_group(record).row

        models.db.session.flush()

    membership_patcher = TablePatcher(
        models.MpGroupMembership,
        models.db.session,
        key_columns=['mandate_id', 'mp_group_id', 'interval'],
    )

    # Ids of the memberships currently stored for this year.
    # NOTE(review): filter_by(year=...) presumably applies to the joined
    # mandate entity -- confirm against the models.
    current_membership_query = (
        models.db.session.query(models.MpGroupMembership.id)
        .join(models.MpGroupMembership.mandate)
        .filter_by(year=year)
    )

    # Start with every stored id and strike off the ones seen again below;
    # whatever remains is stale.
    remove_membership_ids = {row.id for row in current_membership_query}
    with membership_patcher.process(autoflush=1000) as add_membership:
        for mandate, interval_list in mandate_intervals.items():
            for interval in interval_list:
                # Open-ended bounds are stored as date.min / date.max.
                res = add_membership({
                    'mandate_id': mandate.id,
                    'mp_group_id': interval.group.row.id,
                    'interval': DateRange(
                        interval.start or date.min,
                        interval.end or date.max,
                    ),
                })
                if not res.is_new:
                    remove_membership_ids.remove(res.row.id)

    if remove_membership_ids:
        unseen_items = (
            models.MpGroupMembership.query
            .filter(models.MpGroupMembership.id.in_(remove_membership_ids))
        )
        unseen_items.delete(synchronize_session=False)
        logger.info("Deleted %d stale memberships", len(remove_membership_ids))

    if no_commit:
        logger.warning("Rolling back the transaction")
        models.db.session.rollback()

    else:
        models.db.session.commit()