def write_list_of_stenograms_summary_pages():
    """Render the index pages that list all stenograms by year and month.

    Vote sessions are read from ``vote_sessions`` and grouped by stenogram
    date, then by year, then by month.  For every year one page is written
    per month plus one ``"all"`` page covering the whole year
    (``generated_html/stenograms<year><month>.html``).  The last page written
    (latest year, latest month) is copied to ``generated_html/stenograms.html``.
    Every generated page is registered in the sitemap.

    If the query returns no sessions nothing is written and a warning is
    logged.
    """
    # Local import so this block does not depend on the file's import header.
    import shutil

    logger_html.info("Generating html summary page of all stenograms.")
    all_stenograms_template = templates.get_template("stenograms_template.html")
    sessionscur = db.cursor()
    sessionscur.execute(
        """SELECT description, stenogram_date FROM vote_sessions ORDER BY stenogram_date, session_number"""
    )
    # groupby_list is defined elsewhere in the file; the code below relies on
    # it returning a list of (key, members) pairs.
    date_stenogr = groupby_list(sessionscur, operator.itemgetter(1))
    year_date_stenogr = groupby_list(date_stenogr, lambda d_s: d_s[0].year)
    if not year_date_stenogr:
        # Previously this case crashed with IndexError on ``zip(*[])[0]``.
        logger_html.warning("No vote sessions found, no stenogram summary pages were generated.")
        return

    years = tuple(year for year, _ in year_date_stenogr)
    latest_page = None
    for y, y_date_stenogr in year_date_stenogr:
        # The synthetic "all" entry produces the whole-year page.
        month_date_stenogr = [("all", y_date_stenogr)] + groupby_list(
            y_date_stenogr, lambda d_s: d_s[0].month
        )
        months = tuple(month for month, _ in month_date_stenogr)
        for m, m_date_stenogr in month_date_stenogr:
            page_name = "stenograms%s%s.html" % (y, m)
            with open("generated_html/%s" % page_name, "w") as html_file:
                html_file.write(
                    all_stenograms_template.render(
                        years=years, months=months, current_y=y, stenogram_mgroup=m_date_stenogr
                    )
                )
            sitemap.add(page_name, 0.8)
            latest_page = page_name

    # Copy the most recent one.  shutil is used instead of shelling out to
    # ``cp`` so the copy is portable and a failure raises instead of being
    # silently ignored.
    shutil.copyfile("generated_html/%s" % latest_page, "generated_html/stenograms.html")
    sitemap.add("stenograms.html", 0.8)
def write_MPs_emails_page():
    """Write ``generated_html/mails.html`` listing MP e-mail addresses per party.

    Rows of ``mps`` are read ordered by ``orig_party_name`` so that
    consecutive rows of one party can be grouped; empty e-mail values are
    left out and the remaining ones are joined with ``", "``.  The page is
    then registered in the sitemap.
    """
    logger_html.info("Generating list page for MP emails.")

    # Collect the addresses of every party into one comma separated string.
    mailscur = db.cursor()
    mailscur.execute("""SELECT email, orig_party_name FROM mps ORDER BY orig_party_name""")
    party_of_row = operator.itemgetter(1)
    mails_by_party_dict = {}
    for party, party_rows in itertools.groupby(mailscur, party_of_row):
        addresses = [row[0] for row in party_rows if row[0]]
        mails_by_party_dict[party] = ", ".join(addresses)

    # Render the collected addresses through the template.
    logger_html.info("Generating html page of MP mail addresses.")
    mails_template = templates.get_template("mails_template.html")
    with open("generated_html/mails.html", "w") as html_file:
        page = mails_template.render(mails_by_party_dict=mails_by_party_dict)
        html_file.write(page)
    sitemap.add("mails.html", 0.8)
def write_bills_pages():
    """Write one HTML page per bill and register it in the sitemap.

    For every row of ``bills`` the bill's history (``bill_history``) and
    authors (``bill_authors``) are loaded.  If the bill also appears in
    ``bills_by_government`` the Council of Ministers is appended as an
    author with a link to the government site.  The page is written to
    ``generated_html/bill_<YYYYMMDD>_<signature>.html``.
    """
    logger_html.info("Generating bills html pages.")
    per_bill_template = templates.get_template("bill_D_S_template.html")
    billcur = db.cursor()
    # NOTE(review): ``SELECT *`` ties the 4-way unpacking below to the column
    # order of the ``bills`` table - consider naming the columns explicitly.
    billcur.execute("""SELECT * FROM bills""")
    # The helper cursors are reused for every bill instead of allocating two
    # new ones per iteration.
    chroncur = db.cursor()
    authcur = db.cursor()
    for name, sig, date, original_url in billcur:
        # Replaces a leftover debugging ``print count`` that wrote to stdout.
        logger_html.debug("Generating html page for bill %s.", sig)
        chroncur.execute(
            """SELECT event, event_date FROM bill_history WHERE bill_signature = %s""",
            (sig,),
        )
        authcur.execute(
            """SELECT bill_author FROM bill_authors WHERE bill_signature = %s""",
            (sig,),
        )
        # Each author is paired with the file name of that MP's page.
        authors = [(a, "mp_%s.html" % unicode2urlsafe(a)) for (a,) in authcur]
        authcur.execute(
            """SELECT COUNT(*) FROM bills_by_government WHERE bill_signature = %s""",
            (sig,),
        )
        if authcur.fetchone()[0]:
            authors.append((u"Mинистерски съвет", "http://www.government.bg/"))
        page_name = "bill_%s_%s.html" % (date.strftime("%Y%m%d"), sig)
        with open("generated_html/%s" % page_name, "w") as html_file:
            html_file.write(
                per_bill_template.render(
                    name=name, chronology=chroncur.fetchall(), authors=authors, original_url=original_url
                )
            )
        sitemap.add(page_name, 0.7)
} return party_dict.get(name, name) def unidecode(string): """Transliterate unicode to latin.""" return _unidecode.unidecode(string.replace(u'ѝ',u'и') .replace(u'ъ',u'а').replace(u'Ъ',u'А') .replace(u'ь',u'й').replace(u'Ь',u'Й')) def unicode2urlsafe(string): return unidecode(string).replace(' ', '_').lower() mpscur = db.cursor() mpscur.execute("""SELECT mp_name FROM mps ORDER BY mp_name""") names = [n[0] for n in mpscur] links = ["mp_%s.html"%unicode2urlsafe(n) for n in names] fl_names = [' '.join(n.split()[::2]) for n in names] l_names = [n.split()[-1] for n in names] names_links = sortgroupby_list(zip( names,names,links), lambda _:_[0]) fl_names_links = sortgroupby_list(zip(fl_names,names,links), lambda _:_[0]) l_names_links = sortgroupby_list(zip( l_names,names,links), lambda _:_[0]) permited_separators = '=[.,?!:; ()]' re1 = '(?<{s}){k}(?{s})' re2 = '^{k}(?{s})' re3 = '(?<{s}){k}$' re4 = '^{k}$'
def load_votes_regs_data():
    """Load MP, session, registration and vote data into module-level globals.

    The work is done at most once: ``data_loaded`` is set on the first call
    and every later call returns immediately.

    Globals populated:

    * ``mps`` - rows ``(mp_name, orig_party_name, last with_party,
      original_url)`` ordered by party and name; ``parties`` groups them by
      ``orig_party_name``.
    * ``sessions`` - rows ``(stenogram_date, session_number)``;
      ``session_dates`` groups them by date.
    * ``all_dates`` - every stenogram date, including dates without votes.
    * ``mps_dates_reg`` (MPs x all_dates x 3) and its sum over dates
      ``mps_all_reg``.
    * ``mps_sessions_vote`` (MPs x sessions x 4) and the aggregates
      ``mps_dates_vote``, ``parties_sessions_vote``, ``parties_dates_vote``,
      ``mps_all_vote``, ``all_sessions_vote``.
    * ``mps_sessions_with_against_party`` / ``mps_sessions_with_against_all``
      (MPs x sessions x 2) and their per-date and overall sums.

    ``groupby_list`` and ``index`` are helpers defined elsewhere in the file;
    this function relies on ``groupby_list`` yielding ``(key, members)``
    pairs and on ``index(seq, item)`` returning the position of ``item``.
    """
    global data_loaded
    global mps, parties, sessions, session_dates, all_dates
    global mps_dates_reg, mps_all_reg
    global mps_sessions_vote, mps_dates_vote, mps_all_vote
    global parties_sessions_vote, parties_dates_vote, all_sessions_vote
    global mps_sessions_with_against_party, mps_dates_with_against_party, mps_all_with_against_party
    global mps_sessions_with_against_all, mps_dates_with_against_all, mps_all_with_against_all

    if data_loaded:
        return
    data_loaded = True
    logger_html.info("Fetching most of the db in memory.")

    # All MPs
    mpscur = db.cursor()
    mpscur.execute(
        """SELECT mp_name, orig_party_name, (SELECT LAST(with_party ORDER BY mp_reg.stenogram_date) FROM mp_reg WHERE mp_reg.mp_name = mps.mp_name), original_url FROM mps ORDER BY orig_party_name, mp_name"""
    )
    mps = mpscur.fetchall()
    parties = groupby_list(mps, operator.itemgetter(1))

    # All sessions
    sescur = db.cursor()
    sescur.execute(
        """SELECT stenogram_date, session_number FROM vote_sessions ORDER BY stenogram_date, session_number"""
    )
    sessions = sescur.fetchall()
    session_dates = groupby_list(sessions, operator.itemgetter(0))

    # All dates - includes dates on which no voting was done
    datecur = db.cursor()
    datecur.execute(
        """SELECT stenogram_date FROM stenograms ORDER BY stenogram_date"""
    )
    all_dates = [d[0] for d in datecur]

    def aggregate_sessions_in_dates(array):
        """Sum axis 1 (sessions) of ``array`` into one slot per session date."""
        new_shape = list(array.shape)
        new_shape[1] = len(session_dates)
        new_array = np.zeros(new_shape, dtype=np.int32)
        start = 0
        for date_i, date in enumerate(session_dates):
            # date[1] holds the sessions of that date; they are contiguous
            # along axis 1 because both sequences share the same ordering.
            end = start + len(date[1])
            new_array[:, date_i, :] = np.sum(array[:, start:end, :], 1)
            start = end
        return new_array

    def aggregate_names_in_parties(array):
        """Sum axis 0 (MPs) of ``array`` into one slot per party."""
        new_shape = list(array.shape)
        new_shape[0] = len(parties)
        new_array = np.zeros(new_shape, dtype=np.int32)
        start = 0
        for party_i, party in enumerate(parties):
            # party[1] holds the MPs of that party, contiguous along axis 0.
            end = start + len(party[1])
            new_array[party_i, :, :] = np.sum(array[start:end, :, :], 0)
            start = end
        return new_array

    # All regs.
    # mps_dates_reg[mp, date, k] is 1 when the MP's registration on that date
    # is of kind k (see reg_dict); all zeros when there is no record.
    datacur = db.cursor()
    mps_dates_reg = np.zeros((len(mps), len(all_dates), 3), dtype=np.int32)
    reg_dict = {"present": 0, "absent": 1, "manually_registered": 2}
    for (mp_i, mp) in enumerate(mps):
        datacur.execute(
            """SELECT reg, stenogram_date FROM mp_reg WHERE mp_name = %s ORDER BY stenogram_date""",
            (mp[0],),
        )
        # A stray ``datacur.fetchone()`` used to precede this loop; it
        # consumed (and dropped) the first registration of every MP.
        for r in datacur:
            date_i = index(all_dates, r[1])  # While the index is monotonic it is not always +=1
            mps_dates_reg[mp_i, date_i, reg_dict[r[0]]] = 1
    mps_all_reg = np.sum(mps_dates_reg, 1)

    # All votes
    # mps_sessions_vote[mp, session, k] is 1 when the MP's vote in that
    # session is of kind k (see vote_dict).  If the MP was not even
    # registered for the session the row contains only zeros.
    mps_sessions_vote = np.zeros((len(mps), len(sessions), 4), dtype=np.int32)
    vote_dict = {"yes": 0, "no": 1, "abstain": 2, "absent": 3}
    for (mp_i, mp) in enumerate(mps):
        datacur.execute(
            """SELECT vote, stenogram_date, session_number FROM mp_votes WHERE mp_name = %s ORDER BY stenogram_date, session_number""",
            (mp[0],),
        )
        for v in datacur.fetchall():
            ses_i = index(sessions, v[1:])  # While the index is monotonic it is not always +=1
            mps_sessions_vote[mp_i, ses_i, vote_dict[v[0]]] = 1

    # 3D and 2D arrays with aggregated data.
    mps_dates_vote = aggregate_sessions_in_dates(mps_sessions_vote)
    parties_sessions_vote = aggregate_names_in_parties(mps_sessions_vote)
    parties_dates_vote = aggregate_sessions_in_dates(parties_sessions_vote)
    mps_all_vote = np.sum(mps_dates_vote, 1)
    all_sessions_vote = np.sum(parties_sessions_vote, 0)

    # Vote with/against the MP's own party.  Last axis: 0 = with, 1 = against.
    mps_sessions_with_against_party = np.zeros((len(mps), len(sessions), 2), dtype=np.int32)
    offset = 0
    for p_i, (party, p_mps) in enumerate(parties):
        end = offset + len(p_mps)
        for mp_i, ses_i in itertools.product(range(offset, end), range(len(sessions))):
            # Only explicit yes/no votes are classified.
            if not any(mps_sessions_vote[mp_i, ses_i, :2]):
                continue
            # A yes/no tie inside the party counts as voting with it.
            if parties_sessions_vote[p_i, ses_i, 0] == parties_sessions_vote[p_i, ses_i, 1]:
                mps_sessions_with_against_party[mp_i, ses_i, 0] = 1
                continue
            # "With" means the MP cast the party's majority option (yes or no).
            mps_sessions_with_against_party[mp_i, ses_i, 0] = mps_sessions_vote[
                mp_i, ses_i, np.argmax(parties_sessions_vote[p_i, ses_i, :2])
            ]
            mps_sessions_with_against_party[mp_i, ses_i, 1] = 1 - mps_sessions_with_against_party[mp_i, ses_i, 0]
        offset = end
    mps_dates_with_against_party = aggregate_sessions_in_dates(mps_sessions_with_against_party)
    mps_all_with_against_party = np.sum(mps_dates_with_against_party, 1)

    # Vote with/against the whole assembly, same layout and rules as above
    # but compared against the totals over all parties.
    mps_sessions_with_against_all = np.zeros((len(mps), len(sessions), 2), dtype=np.int32)
    for mp_i, ses_i in itertools.product(range(len(mps)), range(len(sessions))):
        if not any(mps_sessions_vote[mp_i, ses_i, :2]):
            continue
        if all_sessions_vote[ses_i, 0] == all_sessions_vote[ses_i, 1]:
            mps_sessions_with_against_all[mp_i, ses_i, 0] = 1
            continue
        mps_sessions_with_against_all[mp_i, ses_i, 0] = mps_sessions_vote[
            mp_i, ses_i, np.argmax(all_sessions_vote[ses_i, :2])
        ]
        mps_sessions_with_against_all[mp_i, ses_i, 1] = 1 - mps_sessions_with_against_all[mp_i, ses_i, 0]
    mps_dates_with_against_all = aggregate_sessions_in_dates(mps_sessions_with_against_all)
    mps_all_with_against_all = np.sum(mps_dates_with_against_all, 1)
def write_stenogram_pages():
    """Write the HTML pages (and trigger the plots) for every stenogram.

    For each row of ``stenograms`` this produces:

    * ``stenogram<date>.html`` - the main page with the annotated text;
    * ``stenogram<date>registration.html`` - registration per party and MP;
    * ``stenogram<date>vote<N>.html`` - one page per vote session, if any.

    Stenograms flagged with ``problem`` only get a main page rendered with
    ``problem=True`` and no registration or vote data.  Every page written is
    registered in the sitemap.

    NOTE(review): ``subcur`` is used below but never assigned in this
    function - presumably a module-level cursor; verify.
    """
    # Load templates.
    per_stenogram_template = templates.get_template("stenogramN_template.html")
    per_stenogram_reg_template = templates.get_template("stenogramNregistration_template.html")
    per_stenogram_vote_template = templates.get_template("stenogramNvoteI_template.html")
    # Get all stenograms into an iterator.
    stenogramcur = db.cursor()
    stenogramcur.execute(
        """SELECT stenogram_date, text, vote_line_nb, problem, original_url FROM stenograms ORDER BY stenogram_date"""
    )
    len_stenograms = stenogramcur.rowcount
    for st_i, (stenogram_date, text, vote_line_nb, problem, original_url) in enumerate(stenogramcur):
        datestr = stenogram_date.strftime("%Y%m%d")
        logger_html.info("Generating HTML and plots for %s - %d of %d" % (datestr, st_i + 1, len_stenograms))

        # The mutable default acts as the counter's state.  The ``def`` is
        # re-executed on every loop iteration, so a fresh ``[0]`` is created
        # and numbering restarts at 1 for each stenogram.
        def counter(count=[0]):
            count[0] = count[0] + 1
            return count[0]

        # Lines whose index is listed in vote_line_nb are wrapped in a
        # numbered <strong> anchor; all lines are joined with <br />.
        stenogram_text = "<br />\n".join(
            '<strong id="votesInText%d">%s</strong>' % (counter(), l) if i in vote_line_nb else l
            for i, l in enumerate(text)
        )
        stenogram_text, divs = annotate_mps(stenogram_text)
        if problem:
            logger_html.error("The database reports problems with stenogram %s. Skipping." % datestr)
            # Generate the main page for the current stenogram.
            # Only the text is shown; all registration/vote inputs are None.
            filename = "stenogram%s.html" % datestr
            with open("generated_html/%s" % filename, "w") as html_file:
                html_file.write(
                    per_stenogram_template.render(
                        stenogram_date=stenogram_date,
                        problem=True,
                        original_url=original_url,
                        vote_descriptions=None,
                        party_names=None,
                        votes_by_session_type_party=None,
                        reg_presences=None,
                        reg_expected=None,
                        divs=divs,
                        stenogram_text=stenogram_text,
                    )
                )
            sitemap.add(filename, 0.7)
            continue
        ################################
        # Registration data per party. #
        ################################
        # Load all party registration data for the current stenogram.
        subcur.execute(
            """SELECT party_name, present, expected FROM party_reg WHERE stenogram_date = %s ORDER BY party_name""",
            (stenogram_date,),
        )
        # NOTE(review): the 3-way unpacking raises ValueError when the query
        # returns no rows for this date - confirm that cannot happen.
        party_names, reg_presences, reg_expected = zip(*subcur.fetchall())
        party_names = [n for n in party_names]
        reg_presences = np.array(reg_presences)
        reg_expected = np.array(reg_expected)
        # Plot registration data.
        registration_figure(stenogram_date, party_names, reg_presences, reg_expected)
        # Load the registration-by-name data.
        subcur.execute(
            """SELECT mp_name, with_party, reg FROM mp_reg WHERE stenogram_date = %s ORDER BY mp_name""",
            (stenogram_date,),
        )
        reg_by_name = subcur.fetchall()
        # Generate registration summary for the current stenogram.
        filename = "stenogram%sregistration.html" % datestr
        with open("generated_html/%s" % filename, "w") as html_file:
            html_file.write(
                per_stenogram_reg_template.render(
                    stenogram_date=stenogram_date,
                    party_names=party_names,
                    reg_presences=reg_presences,
                    reg_expected=reg_expected,
                    reg_by_name=reg_by_name,
                )
            )
        # The third argument pairs an image file name with its caption.
        sitemap.add(
            filename,
            0.6,
            [
                (
                    "registration%s.png" % stenogram_date.strftime("%Y%m%d"),
                    u"Регистрирани и отсъстващи депутати на %s." % stenogram_date.strftime("%Y-%m-%d"),
                )
            ],
        )
        #########################
        # Voting sessions data. #
        #########################
        # Check whether there were any voting sessions at all.
        sesscur = db.cursor()
        sesscur.execute(
            """SELECT description FROM vote_sessions WHERE stenogram_date = %s ORDER BY session_number""",
            (stenogram_date,),
        )
        len_sessions = sesscur.rowcount
        if len_sessions:
            ###########
            # LOADING #
            ###########
            # Load all party absence and vote data for all sessions of the current stenogram.
            # list format: vote_* is [party1_votes, ...], party*_votes is [session1_vote, ...], session*_vote is int
            votes_yes = []
            votes_no = []
            votes_abstain = []
            votes_total = []
            votes_absences = []
            votes_absences_percent = []
            for party_i, n in enumerate(party_names):
                subcur.execute(
                    """SELECT yes, no, abstain, total FROM party_votes WHERE stenogram_date = %s AND party_name = %s ORDER BY session_number""",
                    (stenogram_date, n),
                )
                # Transpose the rows into one array per column.
                yes, no, abstain, total = map(np.array, zip(*subcur))
                votes_yes.append(yes)
                votes_no.append(no)
                votes_abstain.append(abstain)
                votes_total.append(total)
                # Absences are counted against the expected head count from registration.
                absent_party = reg_expected[party_i] - total
                votes_absences.append(absent_party)
                # NOTE(review): with integer arrays under Python 2 this `/`
                # truncates, and an expected count of 0 divides by zero - confirm.
                votes_absences_percent.append(absent_party * 100 / reg_expected[party_i])
            # Stacked as (type, party, session), then reordered to
            # (session, type, party) so iterating yields one session at a
            # time.  Assumes every party has the same number of sessions.
            votes_by_session_type_party = np.array([votes_yes, votes_no, votes_abstain, votes_absences]).transpose(
                2, 0, 1
            )
            # Load all session descriptions for the current stenogram.
            vote_descriptions = [d[0] for d in sesscur]
            # Load the list-by-mp-name vote for each session.
            votes_by_session_by_name = []
            # NOTE(review): session_i starts at 0 here, while the generated
            # file names use session_i + 1 - presumably session_number is
            # 0-based in the database; verify.
            for session_i in range(len_sessions):
                subcur.execute(
                    """SELECT mp_name, with_party, vote FROM mp_votes WHERE session_number = %s AND stenogram_date = %s ORDER BY mp_name""",
                    (session_i, stenogram_date),
                )
                votes_by_session_by_name.append(subcur.fetchall())
            ##############
            # PRESENTING #
            ##############
            # Plot absences timeseries.
            absences_figure(stenogram_date, party_names, votes_absences, votes_absences_percent)
            # Generate plots and html dedicated to a single session.
            for session_i, (description, votes_by_name, votes_by_type_party) in enumerate(
                zip(vote_descriptions, votes_by_session_by_name, votes_by_session_type_party)
            ):
                # Plot per-session vote data.
                # votes_by_type_party unpacks into the yes/no/abstain/absences rows.
                session_votes_by_party_figure(stenogram_date, session_i, party_names, *votes_by_type_party)
                # Generate per-session html summary.
                filename = "stenogram%svote%d.html" % (datestr, session_i + 1)
                with open("generated_html/%s" % filename, "w") as html_file:
                    html_file.write(
                        per_stenogram_vote_template.render(
                            stenogram_date=stenogram_date,
                            session_i=session_i,
                            description=description,
                            party_names=party_names,
                            votes_by_type_party=votes_by_type_party,
                            votes_by_name=votes_by_name,
                        )
                    )
                sitemap.add(
                    filename,
                    0.6,
                    [
                        (
                            "session%svotes%s.png" % (stenogram_date.strftime("%Y%m%d"), session_i + 1),
                            u"Разпределение на гласовете и отсътвията на депутати по партии на %s за гласуване номер %s."
                            % (stenogram_date.strftime("%Y-%m-%d"), session_i + 1),
                        )
                    ],
                )
            #######################################################
            # Big summary page in case there are voting sessions. #
            #######################################################
            # Generate the main page for the current stenogram.
            filename = "stenogram%s.html" % datestr
            with open("generated_html/%s" % filename, "w") as html_file:
                html_file.write(
                    per_stenogram_template.render(
                        stenogram_date=stenogram_date,
                        problem=False,
                        original_url=original_url,
                        vote_descriptions=vote_descriptions,
                        party_names=party_names,
                        votes_by_session_type_party=votes_by_session_type_party,
                        reg_presences=reg_presences,
                        reg_expected=reg_expected,
                        divs=divs,
                        stenogram_text=stenogram_text,
                    )
                )
            sitemap.add(
                filename,
                0.7,
                [
                    (
                        "absences%s.png" % stenogram_date.strftime("%Y%m%d"),
                        u"Промяна на броя присъстващи/отсъстващи депутати на %s." % stenogram_date.strftime("%Y-%m-%d"),
                    )
                ],
            )
        else:
            ##########################################################
            # Big summary page in case there are no voting sessions. #
            ##########################################################
            # Generate the main page for the current stenogram.
            # Registration data is still passed; the vote inputs are None.
            filename = "stenogram%s.html" % datestr
            with open("generated_html/%s" % filename, "w") as html_file:
                html_file.write(
                    per_stenogram_template.render(
                        stenogram_date=stenogram_date,
                        problem=False,
                        original_url=original_url,
                        vote_descriptions=None,
                        party_names=party_names,
                        votes_by_session_type_party=None,
                        reg_presences=reg_presences,
                        reg_expected=reg_expected,
                        divs=divs,
                        stenogram_text=stenogram_text,
                    )
                )
            sitemap.add(filename, 0.7)
u'наложено вето(вето президент)': 'vetoed', u'внесен(преразглеждане зала (след вето))': 'proposed_after_veto', u'повторно приемане(преразглеждане зала (след вето))': 'accepted_after_veto', # TODO the next few are unclear in their definition (raise a warning) u'оспорени текстове(преразглеждане зала (след вето))': 'challenged_after_veto', u'оспорен по принцип(преразглеждане зала (след вето))':'challenged_after_veto', #u'обсъждане(зала първо четене)': 'proposed_1st', see signature 002-02-50 } ############################################################################## # Gather bills. ############################################################################## logger_html_bills = logging.getLogger('html_parser_bills') origurlcur = db.cursor() origurlcur.execute("""SELECT original_url FROM bills""") urls_already_in_db = set(u[0] for u in origurlcur) logger_html_bills.info('Opening calendar.') base_url = 'http://www.parliament.bg' parser_calendar = bs4.BeautifulSoup(urlopen(base_url + '/bg/bills/').read()) for month in parser_calendar.find('div', id='calendar').find_all('a'): href = month.get('href') y,m = map(int, href.split('/')[-1].split('-')) if y<2009 or (y==2009 and m<7): continue # XXX hardcoded check (only last parliament) logger_html_bills.info('Opening calendar %d %d.'%(y, m)) month_page = bs4.BeautifulSoup(urlopen(base_url + href).read()) for a in month_page.find('div', id='monthview').find_all('a'): original_url = base_url + a.get('href') if original_url in urls_already_in_db: