def write_list_of_stenograms_summary_pages():
    """Generate the html pages that list the stenograms per year and month.

    The vote sessions are read from the database, grouped by stenogram date,
    then by year and, inside each year, by month.  For every year a page
    ``generated_html/stenograms<year>all.html`` is written, followed by one
    ``generated_html/stenograms<year><month>.html`` page per month group.
    The page written last is copied to ``generated_html/stenograms.html``.
    Every page is registered in the sitemap with priority 0.8.

    If the query yields no groups, no page is written and a warning is logged.
    """
    import shutil  # local import: only needed for the final copy

    logger_html.info("Generating html summary page of all stenograms.")
    all_stenograms_template = templates.get_template("stenograms_template.html")

    sessionscur = db.cursor()
    sessionscur.execute(
        """SELECT description, stenogram_date
                           FROM vote_sessions
                           ORDER BY stenogram_date, session_number"""
    )
    date_stenogr = groupby_list(sessionscur, operator.itemgetter(1))

    year_date_stenogr = groupby_list(date_stenogr, lambda d_s: d_s[0].year)
    # Collect the group keys explicitly; indexing the result of zip() only
    # works where zip() returns a list and fails on an empty sequence.
    years = tuple(year for year, _ in year_date_stenogr)

    last_page = None
    for y, y_date_stenogr in year_date_stenogr:
        # The first "month" entry is the pseudo-month "all" holding the whole year.
        month_date_stenogr = [("all", y_date_stenogr)] + groupby_list(y_date_stenogr, lambda d_s: d_s[0].month)
        months = tuple(month for month, _ in month_date_stenogr)
        for m, m_date_stenogr in month_date_stenogr:
            page_name = "stenograms%s%s.html" % (y, m)
            with open("generated_html/" + page_name, "w") as html_file:
                html_file.write(
                    all_stenograms_template.render(
                        years=years, months=months, current_y=y, stenogram_mgroup=m_date_stenogr
                    )
                )
            sitemap.add(page_name, 0.8)
            last_page = page_name

    if last_page is None:
        logger_html.warning("No vote sessions found; stenograms.html was not generated.")
        return

    # Copy the most recent one (the page written last).  shutil instead of a
    # shell `cp`: portable, and a failed copy raises instead of passing silently.
    shutil.copyfile("generated_html/" + last_page, "generated_html/stenograms.html")
    sitemap.add("stenograms.html", 0.8)
def write_MPs_emails_page():
    """Write ``generated_html/mails.html`` listing the MP e-mail addresses by party.

    The addresses are read from the ``mps`` table ordered by
    ``orig_party_name``; empty addresses are left out and the remaining ones
    are joined with ", " into one string per party.  The page is registered in
    the sitemap with priority 0.8.
    """
    logger_html.info("Generating list page for MP emails.")

    # Build the {party: "mail, mail, ..."} mapping handed to the template.
    mailscur = db.cursor()
    mailscur.execute("""SELECT email, orig_party_name FROM mps ORDER BY orig_party_name""")
    mails_by_party_dict = {}
    for party, rows in itertools.groupby(mailscur, operator.itemgetter(1)):
        addresses = [row[0] for row in rows if row[0]]
        mails_by_party_dict[party] = ", ".join(addresses)

    # Render the page and register it in the sitemap.
    logger_html.info("Generating html page of MP mail addresses.")
    mails_template = templates.get_template("mails_template.html")
    with open("generated_html/mails.html", "w") as html_file:
        rendered = mails_template.render(mails_by_party_dict=mails_by_party_dict)
        html_file.write(rendered)
        sitemap.add("mails.html", 0.8)
def write_bills_pages():
    """Generate one html page per bill stored in the ``bills`` table.

    For every bill its history (``bill_history``) and its authors
    (``bill_authors``) are looked up by bill signature.  If the signature is
    also present in ``bills_by_government`` the council of ministers is
    appended to the authors.  The page is written to
    ``generated_html/bill_<YYYYMMDD>_<signature>.html`` and registered in the
    sitemap with priority 0.7.
    """
    logger_html.info("Generating bills html pages.")
    per_bill_template = templates.get_template("bill_D_S_template.html")
    billcur = db.cursor()
    # NOTE(review): `SELECT *` relies on the column order of `bills` being
    # (name, signature, date, original_url) - confirm against the schema.
    billcur.execute("""SELECT * FROM bills""")
    # One helper cursor for all per-bill lookups; each result is consumed
    # before the next query is executed on it.
    subcur = db.cursor()
    for count, (name, sig, date, original_url) in enumerate(billcur):
        # Progress goes through the logger rather than a bare print to stdout.
        logger_html.debug("Generating bill page number %d.", count)

        subcur.execute(
            """SELECT event, event_date FROM bill_history
                            WHERE bill_signature = %s""",
            (sig,),
        )
        chronology = subcur.fetchall()

        subcur.execute(
            """SELECT bill_author FROM bill_authors
                           WHERE bill_signature = %s""",
            (sig,),
        )
        authors = [(a, "mp_%s.html" % unicode2urlsafe(a)) for (a,) in subcur]

        subcur.execute(
            """SELECT COUNT(*) FROM bills_by_government
                           WHERE bill_signature = %s""",
            (sig,),
        )
        if subcur.fetchone()[0]:
            authors.append((u"Mинистерски съвет", "http://www.government.bg/"))

        filename = "bill_%s_%s.html" % (date.strftime("%Y%m%d"), sig)
        with open("generated_html/" + filename, "w") as html_file:
            html_file.write(
                per_bill_template.render(
                    name=name, chronology=chronology, authors=authors, original_url=original_url
                )
            )
        sitemap.add(filename, 0.7)
    }
    return party_dict.get(name, name)


def unidecode(string):
    """Transliterate unicode to latin.

    A few Cyrillic letters are first replaced by other Cyrillic letters; the
    result is then passed to `_unidecode.unidecode`.
    """
    substitutions = (
        (u'ѝ', u'и'),
        (u'ъ', u'а'),
        (u'Ъ', u'А'),
        (u'ь', u'й'),
        (u'Ь', u'Й'),
    )
    prepared = string
    for letter, replacement in substitutions:
        prepared = prepared.replace(letter, replacement)
    return _unidecode.unidecode(prepared)


def unicode2urlsafe(string):
    """Return `string` transliterated with `unidecode`, with spaces replaced by
    underscores and everything lower-cased."""
    transliterated = unidecode(string)
    underscored = transliterated.replace(' ', '_')
    return underscored.lower()


# Load the names of all MPs, in alphabetical order.
mpscur = db.cursor()
mpscur.execute("""SELECT mp_name FROM mps ORDER BY mp_name""")
names = [n[0] for n in mpscur]
# Page file name of every MP, built from the url-safe form of the name.
links = ["mp_%s.html"%unicode2urlsafe(n) for n in names]
# Every second word of each name (1st, 3rd, ...), joined with a space.
# Presumably "first last" for three-word names - TODO confirm.
fl_names = [' '.join(n.split()[::2]) for n in names]
# Only the last word of each name.
l_names = [n.split()[-1] for n in names]

# (search key, full name, link) triples passed to sortgroupby_list with the
# search key (first element) as the key function: the key is the full name,
# the "every second word" form, and the last word respectively.
names_links    = sortgroupby_list(zip(   names,names,links), lambda _:_[0])
fl_names_links = sortgroupby_list(zip(fl_names,names,links), lambda _:_[0])
l_names_links  = sortgroupby_list(zip( l_names,names,links), lambda _:_[0])

# Regular-expression templates with str.format-style placeholders {s} and {k}.
# NOTE(review): the code that fills them in is not visible here; presumably
# {s} receives `permited_separators`, which would turn `(?<{s})` / `(?{s})`
# into the look-behind `(?<=[...])` / look-ahead `(?=[...])`, and {k} receives
# the name to search for - confirm against the caller.
permited_separators =  '=[.,?!:; ()]'
re1 = '(?<{s}){k}(?{s})'
re2 =       '^{k}(?{s})'
re3 = '(?<{s}){k}$'
re4 =       '^{k}$'
def load_votes_regs_data():
    """Load MPs, sessions, registrations and votes from the db into module globals.

    The work is done only once: the function returns immediately when the
    module-level flag ``data_loaded`` is already true, and sets the flag before
    loading starts.

    Globals assigned (all arrays are ``np.int32``):

    * ``mps``, ``parties`` - MP rows ordered by party and name, and the same
      rows grouped by original party name.
    * ``sessions``, ``session_dates`` - (stenogram_date, session_number) rows
      and the same rows grouped by date.
    * ``all_dates`` - all stenogram dates, including dates without voting.
    * ``mps_dates_reg`` (MP x date x reg type), ``mps_all_reg`` (summed over dates).
    * ``mps_sessions_vote`` (MP x session x vote type) and its aggregates
      ``mps_dates_vote``, ``parties_sessions_vote``, ``parties_dates_vote``,
      ``mps_all_vote``, ``all_sessions_vote``.
    * ``mps_sessions_with_against_party`` / ``mps_sessions_with_against_all``
      (MP x session x [with, against]) and their per-date and total aggregates.
    """
    global data_loaded
    global mps, parties
    global sessions, session_dates
    global all_dates
    global mps_dates_reg, mps_all_reg
    global mps_sessions_vote
    global mps_dates_vote, parties_sessions_vote, parties_dates_vote, mps_all_vote, all_sessions_vote
    global mps_sessions_with_against_party, mps_dates_with_against_party, mps_all_with_against_party
    global mps_sessions_with_against_all, mps_dates_with_against_all, mps_all_with_against_all

    if data_loaded:
        return
    data_loaded = True
    logger_html.info("Fetching most of the db in memory.")

    # All MPs
    mpscur = db.cursor()
    mpscur.execute(
        """SELECT mp_name,
                             orig_party_name,
                             (SELECT LAST(with_party ORDER BY mp_reg.stenogram_date) FROM mp_reg WHERE mp_reg.mp_name = mps.mp_name),
                             original_url
                      FROM mps
                      ORDER BY orig_party_name, mp_name"""
    )
    mps = mpscur.fetchall()
    parties = groupby_list(mps, operator.itemgetter(1))

    # All sessions
    sescur = db.cursor()
    sescur.execute(
        """SELECT stenogram_date, session_number
                      FROM vote_sessions
                      ORDER BY stenogram_date, session_number"""
    )
    sessions = sescur.fetchall()
    session_dates = groupby_list(sessions, operator.itemgetter(0))

    # All dates - includes dates on which no voting was done
    datecur = db.cursor()
    datecur.execute(
        """SELECT stenogram_date
                       FROM stenograms
                       ORDER BY stenogram_date"""
    )
    all_dates = [d[0] for d in datecur]

    def aggregate_sessions_in_dates(array):
        """Sum axis 1 (sessions) of `array` over the sessions of each date in `session_dates`."""
        new_shape = list(array.shape)
        new_shape[1] = len(session_dates)
        new_array = np.zeros(new_shape, dtype=np.int32)
        start = 0
        for date_i, date in enumerate(session_dates):
            end = start + len(date[1])
            new_array[:, date_i, :] = np.sum(array[:, start:end, :], 1)
            start = end
        return new_array

    def aggregate_names_in_parties(array):
        """Sum axis 0 (MPs) of `array` over the members of each party in `parties`."""
        new_shape = list(array.shape)
        new_shape[0] = len(parties)
        new_array = np.zeros(new_shape, dtype=np.int32)
        start = 0
        for party_i, party in enumerate(parties):
            end = start + len(party[1])
            new_array[party_i, :, :] = np.sum(array[start:end, :, :], 0)
            start = end
        return new_array

    # All regs.
    # 3D array: MP index x date index (into all_dates) x registration type
    # (columns as in reg_dict); an entry is 1 when the MP has that registration.
    datacur = db.cursor()
    mps_dates_reg = np.zeros((len(mps), len(all_dates), 3), dtype=np.int32)
    reg_dict = {"present": 0, "absent": 1, "manually_registered": 2}
    for mp_i, mp in enumerate(mps):
        datacur.execute(
            """SELECT reg, stenogram_date FROM mp_reg
                           WHERE mp_name = %s
                           ORDER BY stenogram_date""",
            (mp[0],),
        )
        # Iterate over every row; nothing may be fetched from the cursor before
        # this loop, otherwise the MP's earliest registration would be lost.
        for r in datacur:
            date_i = index(all_dates, r[1])  # While the index is monotonic it is not always +=1
            mps_dates_reg[mp_i, date_i, reg_dict[r[0]]] = 1

    mps_all_reg = np.sum(mps_dates_reg, 1)

    # All votes
    #   A 3D array       sessions /
    #   with the         index   /___yes_no_abst_absent
    #   following               |    0   0   0   1
    #   structure:        names |    1   0   0   0
    #                     index |   ...
    # Contains votes. If the MP was not even registered for the session it
    # contains only zeros.
    mps_sessions_vote = np.zeros((len(mps), len(sessions), 4), dtype=np.int32)
    vote_dict = {"yes": 0, "no": 1, "abstain": 2, "absent": 3}
    for mp_i, mp in enumerate(mps):
        datacur.execute(
            """SELECT vote, stenogram_date, session_number FROM mp_votes
                           WHERE mp_name = %s
                           ORDER BY stenogram_date, session_number""",
            (mp[0],),
        )
        for v in datacur.fetchall():
            ses_i = index(sessions, v[1:])  # While the index is monotonic it is not always +=1
            mps_sessions_vote[mp_i, ses_i, vote_dict[v[0]]] = 1

    # 3D and 2D arrays with aggregated vote data.
    mps_dates_vote = aggregate_sessions_in_dates(mps_sessions_vote)
    parties_sessions_vote = aggregate_names_in_parties(mps_sessions_vote)
    parties_dates_vote = aggregate_sessions_in_dates(parties_sessions_vote)
    mps_all_vote = np.sum(mps_dates_vote, 1)
    all_sessions_vote = np.sum(parties_sessions_vote, 0)

    def fill_with_against(target, mp_i, ses_i, group_votes):
        """Record in `target` whether MP `mp_i` voted with or against a group.

        `group_votes` holds the group's vote counts for session `ses_i`, with
        the same column order as `vote_dict`; only "yes" and "no" are compared.
        """
        if not any(mps_sessions_vote[mp_i, ses_i, :2]):
            return  # the MP voted neither "yes" nor "no": both entries stay 0
        if group_votes[0] == group_votes[1]:
            target[mp_i, ses_i, 0] = 1  # tie in the group: counted as voting with it
            return
        # 1 when the MP's vote is the group's majority option, else 0.
        voted_with = mps_sessions_vote[mp_i, ses_i, np.argmax(group_votes[:2])]
        target[mp_i, ses_i, 0] = voted_with
        target[mp_i, ses_i, 1] = 1 - voted_with

    # Vote with/against the MP's own party: MP x session x [with, against].
    # The rows of `mps` are ordered by party, so every party owns a contiguous
    # range of MP indices.
    mps_sessions_with_against_party = np.zeros((len(mps), len(sessions), 2), dtype=np.int32)
    offset = 0
    for p_i, (party, p_mps) in enumerate(parties):
        end = offset + len(p_mps)
        for mp_i, ses_i in itertools.product(range(offset, end), range(len(sessions))):
            fill_with_against(mps_sessions_with_against_party, mp_i, ses_i, parties_sessions_vote[p_i, ses_i])
        offset = end

    # 3D and 2D arrays with aggregated data.
    mps_dates_with_against_party = aggregate_sessions_in_dates(mps_sessions_with_against_party)
    mps_all_with_against_party = np.sum(mps_dates_with_against_party, 1)

    # Vote with/against the whole assembly: MP x session x [with, against].
    mps_sessions_with_against_all = np.zeros((len(mps), len(sessions), 2), dtype=np.int32)
    for mp_i, ses_i in itertools.product(range(len(mps)), range(len(sessions))):
        fill_with_against(mps_sessions_with_against_all, mp_i, ses_i, all_sessions_vote[ses_i])

    # 3D and 2D arrays with aggregated data.
    mps_dates_with_against_all = aggregate_sessions_in_dates(mps_sessions_with_against_all)
    mps_all_with_against_all = np.sum(mps_dates_with_against_all, 1)
def _write_stenogram_main_page(template, datestr, images=None, **render_args):
    """Render the main page ``stenogram<datestr>.html`` and add it to the sitemap (priority 0.7)."""
    filename = "stenogram%s.html" % datestr
    with open("generated_html/%s" % filename, "w") as html_file:
        html_file.write(template.render(**render_args))
    if images is None:
        sitemap.add(filename, 0.7)
    else:
        sitemap.add(filename, 0.7, images)


def write_stenogram_pages():
    """Generate the html pages and plots for every stenogram in the database.

    For each stenogram the following is produced in ``generated_html/``:

    * ``stenogram<date>.html`` - the main page with the annotated text;
    * ``stenogram<date>registration.html`` - the registration summary
      (skipped when the stenogram is flagged as problematic);
    * ``stenogram<date>vote<N>.html`` - one page per voting session, if any.

    The registration, absences and per-session figures are drawn through
    `registration_figure`, `absences_figure` and
    `session_votes_by_party_figure`.  All pages are registered in the sitemap.
    """
    # Load templates.
    per_stenogram_template = templates.get_template("stenogramN_template.html")
    per_stenogram_reg_template = templates.get_template("stenogramNregistration_template.html")
    per_stenogram_vote_template = templates.get_template("stenogramNvoteI_template.html")

    # Get all stenograms into an iterator.
    stenogramcur = db.cursor()
    stenogramcur.execute(
        """SELECT stenogram_date, text, vote_line_nb, problem, original_url
                            FROM stenograms
                            ORDER BY stenogram_date"""
    )
    len_stenograms = stenogramcur.rowcount

    for st_i, (stenogram_date, text, vote_line_nb, problem, original_url) in enumerate(stenogramcur):
        datestr = stenogram_date.strftime("%Y%m%d")
        date_iso = stenogram_date.strftime("%Y-%m-%d")
        logger_html.info("Generating HTML and plots for %s - %d of %d" % (datestr, st_i + 1, len_stenograms))

        # Wrap every vote line in a numbered anchor.  The numbering starts at 1
        # and restarts for each stenogram.
        vote_lines = set(vote_line_nb)
        vote_count = 0
        html_lines = []
        for i, l in enumerate(text):
            if i in vote_lines:
                vote_count += 1
                l = '<strong id="votesInText%d">%s</strong>' % (vote_count, l)
            html_lines.append(l)
        stenogram_text = "<br />\n".join(html_lines)
        stenogram_text, divs = annotate_mps(stenogram_text)

        # Template arguments shared by all variants of the main page.
        common_args = dict(
            stenogram_date=stenogram_date,
            original_url=original_url,
            divs=divs,
            stenogram_text=stenogram_text,
        )

        if problem:
            logger_html.error("The database reports problems with stenogram %s. Skipping." % datestr)
            # Only the main page (text without any statistics) is generated.
            _write_stenogram_main_page(
                per_stenogram_template,
                datestr,
                problem=True,
                vote_descriptions=None,
                party_names=None,
                votes_by_session_type_party=None,
                reg_presences=None,
                reg_expected=None,
                **common_args
            )
            continue
        ################################
        # Registration data per party. #
        ################################
        # NOTE(review): `subcur` is not created in this function; presumably it
        # is a module-level cursor - confirm.
        # Load all party registration data for the current stenogram.
        subcur.execute(
            """SELECT party_name, present, expected
                          FROM party_reg
                          WHERE stenogram_date = %s
                          ORDER BY party_name""",
            (stenogram_date,),
        )
        party_names, reg_presences, reg_expected = zip(*subcur.fetchall())
        party_names = list(party_names)
        reg_presences = np.array(reg_presences)
        reg_expected = np.array(reg_expected)
        # Plot registration data.
        registration_figure(stenogram_date, party_names, reg_presences, reg_expected)
        # Load the registration-by-name data.
        subcur.execute(
            """SELECT mp_name, with_party, reg
                          FROM mp_reg
                          WHERE stenogram_date = %s
                          ORDER BY mp_name""",
            (stenogram_date,),
        )
        reg_by_name = subcur.fetchall()
        # Generate registration summary for the current stenogram.
        filename = "stenogram%sregistration.html" % datestr
        with open("generated_html/%s" % filename, "w") as html_file:
            html_file.write(
                per_stenogram_reg_template.render(
                    stenogram_date=stenogram_date,
                    party_names=party_names,
                    reg_presences=reg_presences,
                    reg_expected=reg_expected,
                    reg_by_name=reg_by_name,
                )
            )
        sitemap.add(
            filename,
            0.6,
            [
                (
                    "registration%s.png" % datestr,
                    u"Регистрирани и отсъстващи депутати на %s." % date_iso,
                )
            ],
        )
        #########################
        # Voting sessions data. #
        #########################
        # Check whether there were any voting sessions at all.
        sesscur = db.cursor()
        sesscur.execute(
            """SELECT description
                           FROM vote_sessions
                           WHERE stenogram_date = %s
                           ORDER BY session_number""",
            (stenogram_date,),
        )
        len_sessions = sesscur.rowcount
        if not len_sessions:
            ##########################################################
            # Big summary page in case there are no voting sessions. #
            ##########################################################
            _write_stenogram_main_page(
                per_stenogram_template,
                datestr,
                problem=False,
                vote_descriptions=None,
                party_names=party_names,
                votes_by_session_type_party=None,
                reg_presences=reg_presences,
                reg_expected=reg_expected,
                **common_args
            )
            continue
        ###########
        # LOADING #
        ###########
        # Load all party absence and vote data for all sessions of the current stenogram.
        # list format: vote_* is [party1_votes, ...], party*_votes is [session1_vote, ...], session*_vote is int
        votes_yes = []
        votes_no = []
        votes_abstain = []
        votes_total = []
        votes_absences = []
        votes_absences_percent = []
        for party_i, n in enumerate(party_names):
            subcur.execute(
                """SELECT yes, no, abstain, total
                                  FROM party_votes
                                  WHERE stenogram_date = %s
                                  AND party_name = %s
                                  ORDER BY session_number""",
                (stenogram_date, n),
            )
            yes, no, abstain, total = map(np.array, zip(*subcur))
            votes_yes.append(yes)
            votes_no.append(no)
            votes_abstain.append(abstain)
            votes_total.append(total)
            absent_party = reg_expected[party_i] - total
            votes_absences.append(absent_party)
            # NOTE(review): on Python 2 without `from __future__ import division`
            # this is an integer division - confirm that is intended.
            votes_absences_percent.append(absent_party * 100 / reg_expected[party_i])
        # Reorder the axes from (vote type, party, session) to (session, vote type, party).
        votes_by_session_type_party = np.array([votes_yes, votes_no, votes_abstain, votes_absences]).transpose(
            2, 0, 1
        )
        # Load all session descriptions for the current stenogram.
        vote_descriptions = [d[0] for d in sesscur]
        # Load the list-by-mp-name vote for each session.
        votes_by_session_by_name = []
        for session_i in range(len_sessions):
            subcur.execute(
                """SELECT mp_name, with_party, vote
                                  FROM mp_votes
                                  WHERE session_number = %s
                                  AND stenogram_date = %s
                                  ORDER BY mp_name""",
                (session_i, stenogram_date),
            )
            votes_by_session_by_name.append(subcur.fetchall())
        ##############
        # PRESENTING #
        ##############
        # Plot absences timeseries.
        absences_figure(stenogram_date, party_names, votes_absences, votes_absences_percent)
        # Generate plots and html dedicated to a single session.
        for session_i, (description, votes_by_name, votes_by_type_party) in enumerate(
            zip(vote_descriptions, votes_by_session_by_name, votes_by_session_type_party)
        ):
            # Plot per-session vote data.
            session_votes_by_party_figure(stenogram_date, session_i, party_names, *votes_by_type_party)
            # Generate per-session html summary.
            filename = "stenogram%svote%d.html" % (datestr, session_i + 1)
            with open("generated_html/%s" % filename, "w") as html_file:
                html_file.write(
                    per_stenogram_vote_template.render(
                        stenogram_date=stenogram_date,
                        session_i=session_i,
                        description=description,
                        party_names=party_names,
                        votes_by_type_party=votes_by_type_party,
                        votes_by_name=votes_by_name,
                    )
                )
            sitemap.add(
                filename,
                0.6,
                [
                    (
                        "session%svotes%s.png" % (datestr, session_i + 1),
                        u"Разпределение на гласовете и отсътвията на депутати по партии на %s за гласуване номер %s."
                        % (date_iso, session_i + 1),
                    )
                ],
            )
        #######################################################
        # Big summary page in case there are voting sessions. #
        #######################################################
        _write_stenogram_main_page(
            per_stenogram_template,
            datestr,
            images=[
                (
                    "absences%s.png" % datestr,
                    u"Промяна на броя присъстващи/отсъстващи депутати на %s." % date_iso,
                )
            ],
            problem=False,
            vote_descriptions=vote_descriptions,
            party_names=party_names,
            votes_by_session_type_party=votes_by_session_type_party,
            reg_presences=reg_presences,
            reg_expected=reg_expected,
            **common_args
        )
# Example #7
# 0
u'наложено вето(вето президент)':                      'vetoed',
u'внесен(преразглеждане зала (след вето))':            'proposed_after_veto',
u'повторно приемане(преразглеждане зала (след вето))': 'accepted_after_veto',
# TODO the next few are unclear in their definition (raise a warning)
u'оспорени текстове(преразглеждане зала (след вето))': 'challenged_after_veto',
u'оспорен по принцип(преразглеждане зала (след вето))':'challenged_after_veto',
#u'обсъждане(зала първо четене)':                       'proposed_1st', see signature 002-02-50
}


##############################################################################
# Gather bills.
##############################################################################
logger_html_bills = logging.getLogger('html_parser_bills')
base_url = 'http://www.parliament.bg'

# Collect the original urls of the bills that are already in the database.
origurlcur = db.cursor()
origurlcur.execute("""SELECT original_url FROM bills""")
urls_already_in_db = {row[0] for row in origurlcur}

# Download and parse the bills calendar page.
logger_html_bills.info('Opening calendar.')
parser_calendar = bs4.BeautifulSoup(
    urlopen(base_url + '/bg/bills/').read()
)
for month in parser_calendar.find('div', id='calendar').find_all('a'):
    href = month.get('href')
    y,m = map(int, href.split('/')[-1].split('-'))
    if y<2009 or (y==2009 and m<7): continue # XXX hardcoded check (only last parliament)
    logger_html_bills.info('Opening calendar %d %d.'%(y, m))
    month_page = bs4.BeautifulSoup(urlopen(base_url + href).read())
    for a in month_page.find('div', id='monthview').find_all('a'):
        original_url = base_url + a.get('href')
        if original_url in urls_already_in_db: