def test_simple_scraping(session):
    """Run GroupScraper against fixture pages and check the parsed groups.

    The ``session`` fixture's ``url_map`` is pointed at stored pages for the
    structure index and three group pages; the test then inspects the first
    three groups yielded by ``GroupScraper.fetch()``.
    """
    from mptracker.scraper.groups import GroupScraper

    # Every URL the scraper is expected to request, mapped to its fixture.
    fixture_pages = {
        STRUCTURE_URL: PAGES_DIR / 'structura-index-modified',
        STRUCTURE_URL + '?idg=0': PAGES_DIR / 'structura-group0',
        STRUCTURE_URL + '?idg=1': PAGES_DIR / 'structura-group1',
        STRUCTURE_URL + '?idg=4': PAGES_DIR / 'structura-group4',
    }
    session.url_map.update(fixture_pages)

    scraped = list(GroupScraper(session).fetch())

    # --- first group: independents -------------------------------------
    independents = scraped[0]
    assert independents.is_independent
    assert len(independents.current_members) == 13

    independent_member = independents.current_members[2]
    assert independent_member.mp_name == "Dumitru Ovidiu-Ioan"
    assert independent_member.start_date is None
    assert independent_member.end_date is None

    # --- second group: PSD ---------------------------------------------
    psd_group = scraped[1]
    assert psd_group.name == "Grupul parlamentar al Partidului Social Democrat"
    assert psd_group.short_name == "PSD"
    assert not psd_group.is_independent
    assert len(psd_group.current_members) == 165
    assert len(psd_group.former_members) == 2

    psd_member = psd_group.current_members[3]
    assert psd_member.mp_name == "Itu Cornel"
    assert psd_member.start_date is None
    assert psd_member.end_date is None
    assert psd_member.title == 'Vicelideri'

    psd_late_joiner = psd_group.current_members[84]
    assert psd_late_joiner.start_date == date(2013, 9, 18)

    psd_leaver = psd_group.former_members[0]
    assert psd_leaver.mp_name == "Cernea Remus-Florinel"
    assert psd_leaver.start_date is None
    assert psd_leaver.end_date == date(2013, 5, 21)

    # --- third group (bound to the name ``ppdd`` in the original) -------
    ppdd_group = scraped[2]

    ppdd_member = ppdd_group.current_members[2]
    assert ppdd_member.mp_name == "Ciuhodaru Tudor"

    ppdd_leaver = ppdd_group.former_members[2]
    assert ppdd_leaver.mp_name == "Chebac Eugen"
    assert ppdd_leaver.start_date is None
    assert ppdd_leaver.end_date == date(2013, 9, 30)

    ppdd_short_stay = ppdd_group.former_members[4]
    assert ppdd_short_stay.start_date == date(2013, 2, 11)
    assert ppdd_short_stay.end_date == date(2013, 5, 7)
def groups(
        cache_name=None,
        throttle=None,
        no_commit=False,
        ):
    """Scrape group memberships and synchronise them into the database.

    Steps:

    1. Fetch all groups with ``GroupScraper`` and collect, per mandate, the
       membership intervals of every current and former member.
    2. For each mandate, insert an "independents" interval into every gap
       between two consecutive intervals so the timeline is continuous.
    3. Patch the ``MpGroup`` and ``MpGroupMembership`` tables, removing rows
       that were not produced by this run.
    4. Commit, or roll back when ``no_commit`` is set.

    Args:
        cache_name: Forwarded unchanged to ``create_session``.
        throttle: If truthy it is converted with ``float()`` before being
            forwarded to ``create_session``; falsy values are passed as-is.
        no_commit: When truthy the transaction is rolled back instead of
            committed.

    Raises:
        RuntimeError: If two consecutive intervals of one mandate overlap.
    """
    from mptracker.scraper.groups import GroupScraper, Interval

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    group_scraper = GroupScraper(http_session)

    mandate_lookup = models.MandateLookup()
    mandate_intervals = defaultdict(list)

    scraped_groups = list(group_scraper.fetch())

    # The scraper is expected to yield the independents first.
    independents = scraped_groups[0]
    assert independents.is_independent

    # Independents go last: their start is derived from the end of the
    # latest interval already recorded for the same mandate.
    for group in scraped_groups[1:] + [independents]:
        for member in group.current_members + group.former_members:
            (year, chamber, number) = member.mp_ident
            assert chamber == 2
            mandate = mandate_lookup.find(member.mp_name, year, number)
            interval_list = mandate_intervals[mandate]

            interval = member.get_interval()
            if interval.start is None:
                # No explicit start date: fall back to the term start.
                interval = interval._replace(start=TERM_2012_START)

            if group.is_independent and interval_list:
                interval = interval._replace(start=interval_list[-1].end)

            interval_list.append(interval)
            # Keep the list ordered on the first tuple field (presumably
            # ``start`` -- confirm against the Interval definition) so that
            # ``interval_list[-1]`` above is always the latest interval.
            interval_list.sort(key=lambda i: i[0])

    for mandate, interval_list in mandate_intervals.items():
        # Make sure the intervals are continuous: fill every gap between
        # neighbours with an "independents" interval; reject overlaps.
        gap_fillers = []
        for interval_one, interval_two in zip(interval_list,
                                              interval_list[1:]):
            assert interval_one.start < interval_one.end
            if interval_one.end < interval_two.start:
                gap_fillers.append(Interval(
                    start=interval_one.end,
                    end=interval_two.start,
                    group=independents,
                ))
            elif interval_one.end > interval_two.start:
                raise RuntimeError("Overlapping intervals")

        interval_list.extend(gap_fillers)
        interval_list.sort()

        # ``date.max`` as the mandate's upper bound is treated as "no end",
        # which is compared against ``None`` on the interval side.
        mandate_end = mandate.interval.upper
        if mandate_end == date.max:
            mandate_end = None
        if interval_list[-1].end != mandate_end:
            logger.warning("Mandate %s ends at %s",
                           mandate, interval_list[-1].end)

    group_patcher = TablePatcher(
        models.MpGroup,
        models.db.session,
        key_columns=['short_name'],
    )

    with group_patcher.process(remove=True) as add_group:
        for group in scraped_groups:
            record = group.as_dict(['name', 'short_name'])
            # Remember the database row; its id is needed for memberships.
            group.row = add_group(record).row

    # Flush so the group rows have ids before memberships reference them.
    models.db.session.flush()

    membership_patcher = TablePatcher(
        models.MpGroupMembership,
        models.db.session,
        key_columns=['mandate_id', 'mp_group_id', 'interval'],
    )

    with membership_patcher.process(
            autoflush=1000,
            remove=True,
            ) as add_membership:
        for mandate, interval_list in mandate_intervals.items():
            for interval in interval_list:
                add_membership({
                    'mandate_id': mandate.id,
                    'mp_group_id': interval.group.row.id,
                    # Open-ended bounds are stored as date.min / date.max.
                    'interval': DateRange(
                        interval.start or date.min,
                        interval.end or date.max,
                    ),
                })

    if no_commit:
        logger.warning("Rolling back the transaction")
        models.db.session.rollback()
    else:
        models.db.session.commit()
def groups(
        cache_name=None,
        throttle=None,
        no_commit=False,
        year='2012',
        ):
    """Scrape group memberships for one term and sync them into the database.

    Steps:

    1. Fetch the groups of ``year`` with ``GroupScraper`` and collect, per
       mandate, the membership intervals of every current and former member.
       Missing interval bounds are filled from ``TERM_INTERVAL[year]``.
    2. For each mandate, insert an "independents" interval into every gap
       between two consecutive intervals so the timeline is continuous.
    3. Patch ``MpGroup`` rows (restricted to ``year``) and
       ``MpGroupMembership`` rows, then delete memberships of this year's
       mandates that the patcher did not match again.
    4. Commit, or roll back when ``no_commit`` is set.

    Args:
        cache_name: Forwarded unchanged to ``create_session``.
        throttle: If truthy it is converted with ``float()`` before being
            forwarded to ``create_session``; falsy values are passed as-is.
        no_commit: When truthy the transaction is rolled back instead of
            committed.
        year: Term year; converted with ``int()``, so both strings and
            integers are accepted.

    Raises:
        RuntimeError: If two consecutive intervals of one mandate overlap.
        KeyError: If ``year`` is not a key of ``TERM_INTERVAL``.
    """
    year = int(year)

    from mptracker.scraper.groups import GroupScraper, Interval

    http_session = create_session(cache_name=cache_name,
                                  throttle=throttle and float(throttle))
    group_scraper = GroupScraper(http_session)

    mandate_lookup = models.MandateLookup()
    mandate_intervals = defaultdict(list)
    term_interval = TERM_INTERVAL[year]

    scraped_groups = list(group_scraper.fetch(year))

    # The scraper is expected to yield the independents first.
    independents = scraped_groups[0]
    assert independents.is_independent

    # Independents go last: their start is derived from the end of the
    # latest interval already recorded for the same mandate.
    for group in scraped_groups[1:] + [independents]:
        for member in group.current_members + group.former_members:
            (myear, chamber, number) = member.mp_ident
            assert chamber == 2
            mandate = mandate_lookup.find(member.mp_name, myear, number)
            interval_list = mandate_intervals[mandate]

            interval = member.get_interval()
            # Missing bounds default to the bounds of the whole term.
            if interval.start is None:
                interval = interval._replace(start=term_interval.lower)
            if interval.end is None:
                interval = interval._replace(end=term_interval.upper)

            if group.is_independent and interval_list:
                interval = interval._replace(start=interval_list[-1].end)

            interval_list.append(interval)
            # Keep the list ordered on the first tuple field (presumably
            # ``start`` -- confirm against the Interval definition) so that
            # ``interval_list[-1]`` above is always the latest interval.
            interval_list.sort(key=lambda i: i[0])

    for mandate, interval_list in mandate_intervals.items():
        # Make sure the intervals are continuous: fill every gap between
        # neighbours with an "independents" interval; reject overlaps.
        gap_fillers = []
        for interval_one, interval_two in zip(interval_list,
                                              interval_list[1:]):
            assert interval_one.start < interval_one.end
            if interval_one.end < interval_two.start:
                gap_fillers.append(Interval(
                    start=interval_one.end,
                    end=interval_two.start,
                    group=independents,
                ))
            elif interval_one.end > interval_two.start:
                # Log the offending data instead of dropping into a
                # debugger, which would hang unattended runs.
                logger.error(
                    "Overlapping intervals for mandate %s: %s and %s",
                    mandate, interval_one, interval_two)
                raise RuntimeError("Overlapping intervals")

        interval_list.extend(gap_fillers)
        interval_list.sort()

        # ``date.max`` as the mandate's upper bound is treated as "no end".
        mandate_end = mandate.interval.upper
        if mandate_end == date.max:
            mandate_end = None
        if interval_list[-1].end != mandate_end:
            logger.warning("Mandate %s ends at %s",
                           mandate, interval_list[-1].end)

    group_patcher = TablePatcher(
        models.MpGroup,
        models.db.session,
        key_columns=['short_name', 'year'],
    )

    # Only groups of this year are patched/removed.
    with group_patcher.process(remove=True,
                               filter={'year': year}) as add_group:
        for group in scraped_groups:
            record = group.as_dict(['name', 'short_name', 'year'])
            # Remember the database row; its id is needed for memberships.
            group.row = add_group(record).row

    # Flush so the group rows have ids before memberships reference them.
    models.db.session.flush()

    membership_patcher = TablePatcher(
        models.MpGroupMembership,
        models.db.session,
        key_columns=['mandate_id', 'mp_group_id', 'interval'],
    )

    # Ids of all memberships currently stored for mandates of this year.
    # Every id the patcher matches again is removed from the set; whatever
    # remains afterwards is stale.
    current_membership_query = (
        models.db.session.query(models.MpGroupMembership.id)
        .join(models.MpGroupMembership.mandate)
        .filter_by(year=year)
    )
    remove_membership_ids = {row.id for row in current_membership_query}

    with membership_patcher.process(autoflush=1000) as add_membership:
        for mandate, interval_list in mandate_intervals.items():
            for interval in interval_list:
                res = add_membership({
                    'mandate_id': mandate.id,
                    'mp_group_id': interval.group.row.id,
                    # Open-ended bounds are stored as date.min / date.max.
                    'interval': DateRange(
                        interval.start or date.min,
                        interval.end or date.max,
                    ),
                })
                if not res.is_new:
                    remove_membership_ids.remove(res.row.id)

    if remove_membership_ids:
        unseen_items = (
            models.MpGroupMembership.query
            .filter(models.MpGroupMembership.id.in_(remove_membership_ids))
        )
        unseen_items.delete(synchronize_session=False)
        logger.info("Deleted %d stale memberships",
                    len(remove_membership_ids))

    if no_commit:
        logger.warning("Rolling back the transaction")
        models.db.session.rollback()
    else:
        models.db.session.commit()