def get_activity(self, page):
    """Collect the dated entries of the legislative-procedure table.

    Looks up the elements containing the text "Derularea procedurii
    legislative", takes the last ``table`` ancestor that ``parents('table')``
    yields for them, skips every row up to and including the header row whose
    first cell reads ``Data``, and folds the remaining rows into ``Activity``
    objects.

    Args:
        page: query object for the parsed page; it must support ``find`` and
            the traversal calls used below (presumably a PyQuery document --
            confirm against the caller).

    Returns:
        list: ``Activity`` objects in table order. Each one carries the
        parsed date, the location value that was current when its date row
        was read, and the sanitized HTML of its rows, one per line.

    Raises:
        IndexError: no ``table`` ancestor was found for the headline.
        ValueError: a non-empty first cell is not a ``dd.mm.YYYY`` date.
        AssertionError: a ``rowspan`` cell starts while the previous one is
            still counting down.
    """
    activity = []
    headline = page.find(':contains("Derularea procedurii legislative")')
    table = list(headline.parents('table').items())[-1]

    seen_data = False        # becomes True once the 'Data' header row passed
    location = None          # text of the most recent rowspan cell
    location_countdown = 0   # rows left under the current rowspan cell
    ac = None                # Activity currently being filled in

    for row in table.children().items():
        if location_countdown > 0:
            location_countdown -= 1

        cols = row.children()
        date_text = cols.eq(0).text()

        if date_text == 'Data':
            seen_data = True
            continue
        if not seen_data:
            # Everything above the header row is ignored.
            continue

        if date_text:
            # A non-empty first cell starts a new entry; flush the old one.
            if ac:
                activity.append(ac)
            ac = Activity(
                date=datetime.strptime(date_text, '%d.%m.%Y').date(),
                location=location,
                html="",
            )

        last_col = pq(cols[-1])
        rowspan = last_col.attr('rowspan')
        if rowspan:
            # The last cell is a location marker spanning several rows; the
            # previous marker must have been fully consumed by now.
            assert location_countdown == 0
            location_countdown = int(rowspan)
            location = last_col.text()
            continue

        # Content row: drop spacer images and replace PDF icons with text
        # before sanitizing the cell markup.
        last_col.find('img[src="/img/spacer.gif"]').remove()
        last_col.find('img[src="/img/icon_pdf_small.gif"]').replaceWith('(pdf)')
        html = last_col.html()
        if html:
            # NOTE(review): ``ac`` is still None if a content row shows up
            # before the first dated row; this line would then fail.
            ac.html += sanitize(html) + '\n'

    if ac:
        activity.append(ac)
    return activity
def get_activity(self, page):
    """Collect the dated entries of the legislative-procedure table.

    Looks up the elements containing the text "Derularea procedurii
    legislative", takes the last ``table`` ancestor that ``parents('table')``
    yields for them, skips every row up to and including the header row whose
    first cell reads ``Data``, and folds the remaining rows into ``Activity``
    objects.

    Args:
        page: query object for the parsed page; it must support ``find`` and
            the traversal calls used below (presumably a PyQuery document --
            confirm against the caller).

    Returns:
        list: ``Activity`` objects in table order. Each one carries the
        parsed date, the location value that was current when its date row
        was read, and the sanitized HTML of its rows, one per line.

    Raises:
        IndexError: no ``table`` ancestor was found for the headline.
        ValueError: a non-empty first cell is not a ``dd.mm.YYYY`` date.
        AssertionError: a ``rowspan`` cell starts while the previous one is
            still counting down.
    """
    activity = []
    headline = page.find(':contains("Derularea procedurii legislative")')
    table = list(headline.parents('table').items())[-1]

    seen_data = False        # becomes True once the 'Data' header row passed
    location = None          # text of the most recent rowspan cell
    location_countdown = 0   # rows left under the current rowspan cell
    ac = None                # Activity currently being filled in

    for row in table.children().items():
        if location_countdown > 0:
            location_countdown -= 1

        cols = row.children()
        date_text = cols.eq(0).text()

        if date_text == 'Data':
            seen_data = True
            continue
        if not seen_data:
            # Everything above the header row is ignored.
            continue

        if date_text:
            # A non-empty first cell starts a new entry; flush the old one.
            if ac:
                activity.append(ac)
            ac = Activity(
                date=datetime.strptime(date_text, '%d.%m.%Y').date(),
                location=location,
                html="",
            )

        last_col = pq(cols[-1])
        rowspan = last_col.attr('rowspan')
        if rowspan:
            # The last cell is a location marker spanning several rows; the
            # previous marker must have been fully consumed by now.
            assert location_countdown == 0
            location_countdown = int(rowspan)
            location = last_col.text()
            continue

        # Content row: drop spacer images and replace PDF icons with text
        # before sanitizing the cell markup.
        last_col.find('img[src="/img/spacer.gif"]').remove()
        last_col.find('img[src="/img/icon_pdf_small.gif"]').replaceWith('(pdf)')
        html = last_col.html()
        if html:
            # NOTE(review): ``ac`` is still None if a content row shows up
            # before the first dated row; this line would then fail.
            ac.html += sanitize(html) + '\n'

    if ac:
        activity.append(ac)
    return activity
def test_sanitize(in_html, out_html):
    """Check that ``sanitize`` maps *in_html* to exactly *out_html*.

    Both arguments are supplied by the caller (presumably a pytest
    parametrization or fixtures defined outside this block -- confirm).
    """
    from mptracker.scraper.common import sanitize

    cleaned = sanitize(in_html)
    assert cleaned == out_html