def parse_excel_by_party(filename):
    u"""
    Parse excel files with vote statistics by party.

    One excel file per stenogram.

    Assumptions
    ===========
    - There is a total of six parties (see parties_count above)
    - For each session, there is a line containing either `vote_marker` or
      `registration_marker`, and that gives the kind of the session. There is
      only one registration per stenogram.
    - After this line, there are two lines we don't care about, and the next
      parties_count consecutive lines contain the vote/presence statistics by
      party.

    Returns
    =======
    A pair `(reg_by_party_dict, sessions)`:
    - `reg_by_party_dict` maps the canonical party name to a
      `reg_stats_per_party_tuple(present, expected)`. If the sheet holds more
      than one registration block, the last one wins.
    - `sessions` is a list with one
      `session_tuple(description, time, None, votes_by_party_dict)` per vote
      block, in sheet order. `votes_by_party_dict` maps the canonical party
      name to a `vote_stats_per_party_tuple(yes, no, abstained, total)`.

    Raises
    ======
    ValueError
        If the sheet contains no registration block, if a vote line does not
        contain the time/topic separator, or if the time cannot be parsed.
    """
    book = xlrd.open_workbook(filename, logfile=excel_warnings)
    sheet = book.sheet_by_index(0)
    rows = sheet.nrows

    def read_party_block(marker_row, value_count, make_stats):
        # The marker line is followed by two lines we skip, so the per-party
        # statistics start three rows below the marker. Column 0 holds the
        # party name, columns 1..value_count hold the integer statistics.
        stats_by_party = {}
        first_party_row = marker_row + 3
        for party_row in range(first_party_row, first_party_row + parties_count):
            party = canonical_party_name(
                sheet.cell_value(rowx=party_row, colx=0).strip().upper())
            counts = [int(sheet.cell_value(rowx=party_row, colx=col))
                      for col in range(1, value_count + 1)]
            stats_by_party[party] = make_stats(*counts)
        return stats_by_party

    # Stays None until a registration block is seen, so that a sheet without
    # one is reported explicitly instead of failing on an unbound local.
    reg_by_party_dict = None
    sessions = []
    row = 0
    while row < rows:
        first = sheet.cell_value(rowx=row, colx=0)
        if registration_marker in first:
            # present, expected
            reg_by_party_dict = read_party_block(row, 2, reg_stats_per_party_tuple)
            row += 2 + parties_count
        elif vote_marker in first:
            description = first.split(vote_marker)[-1].strip()
            # Split on the first separator only: the topic text may itself
            # contain the same words.
            time_text, description = description.split(u'по тема', 1)
            # The last six characters are expected to be "HH:MM " (note the
            # trailing space, which is part of the format string).
            vote_time = datetime.datetime.strptime(time_text[-6:], '%H:%M ')
            description = description.strip()
            # yes, no, abstained, total
            votes_by_party_dict = read_party_block(row, 4, vote_stats_per_party_tuple)
            sessions.append(session_tuple(description, vote_time, None, votes_by_party_dict))
            row += 2 + parties_count
        row += 1

    if reg_by_party_dict is None:
        raise ValueError('No registrations detected in the by-party file.')
    return reg_by_party_dict, sessions
def parse_excel_by_name(filename):
    """
    Parse excel files with vote statistics by representative.

    Assumptions
    ===========
    The .xls file starts with two lines we don't care about.
    All remaining lines contain the following fields, from left to right:
    - representative name
    - two fields we skip
    - representative's party
    - undefined number of fields containing stuff about how the
      representative voted.

    Returns
    =======
    Four parallel sequences `(names, parties, registration, votes)` with one
    entry per representative:
    - `names`: upper-cased, whitespace-normalised and spell-corrected names
    - `parties`: canonical party names
    - `registration`: 'absent', 'present' or 'manually_registered', taken
      from the last registration column found
    - `votes`: one tuple per representative holding 'yes', 'no', 'abstain'
      or 'absent' for every vote column; empty if there are no vote columns

    When nobody had to be filtered out the result is a tuple of lists;
    otherwise it is a list of four tuples.

    Raises
    ======
    ValueError
        If no column is recognised as a registration column.
    """
    # XXX Workarounds
    # Correct spelling errors in names of MPs.
    name_corrections = {
        u'МАРИЯНА ПЕТРОВА ИВАНОВА-НИКОЛОВА': u'МАРИАНА ПЕТРОВА ИВАНОВА-НИКОЛОВА',
        u'ВЕНЦЕСЛАВ ВАСИЛЕВ ВЪРБАНОВ': u'ВЕНЦИСЛАВ ВАСИЛЕВ ВЪРБАНОВ',
        u'АЛЕКСАНДЪР СТОЙЧЕВ СТОЙЧЕВ': u'АЛЕКСАНДЪР СТОЙЧЕВ СТОЙКОВ',
    }

    def MP_name_spellcheck(name):
        if name not in name_corrections:
            return name
        logger_workaround.warning("Spelling error: %s" % name)
        return name_corrections[name]

    # Remove unregistered MPs.
    unregistered = frozenset([u'МИХАИЛ ВЛАДИМИРОВ ВЛАДОВ',
                              u'НИКОЛАЙ НАНКОВ НАНКОВ'])

    def filter_names(*args):
        # args[0] holds the names; every other sequence is parallel to it.
        mp_names = args[0]
        keep = [idx for idx, mp in enumerate(mp_names) if mp not in unregistered]
        if len(keep) == len(mp_names):
            return args
        logger_workaround.warning("An MP was filtered out of the by-names list, because they are not registered as an MP.")
        # Select by index instead of zipping the sequences together: the vote
        # list is empty when the sheet has no vote columns, and zipping would
        # then discard every row and skip the filtering altogether.
        return [tuple(seq[idx] for idx in keep) if seq else ()
                for seq in args]
    # XXX End of Workarounds.

    # Translate the registration and vote markers.
    tr_reg = {u'О':'absent', u'П':'present', u'Р':'manually_registered'}
    tr_vote = {'+':'yes', '-':'no', '=':'abstain', '0':'absent'}

    book = xlrd.open_workbook(filename, logfile=excel_warnings)
    sheet = book.sheet_by_index(0)
    cols = sheet.ncols
    # The first two rows are headers we don't care about.
    data_rows = range(2, sheet.nrows)

    names = [MP_name_spellcheck(' '.join(sheet.cell_value(rowx=row, colx=0).upper().split()))
             for row in data_rows]
    parties = [canonical_party_name(sheet.cell_value(rowx=row, colx=3).strip().upper())
               for row in data_rows]

    reg_sessions = []
    vote_sessions = []
    for col in range(4, cols):
        values = [sheet.cell_value(rowx=row, colx=col) for row in data_rows]
        # A column is classified by its content: every cell must be a known
        # registration marker, or every cell a known vote marker.
        if all(v in tr_reg for v in values):
            reg_sessions.append([tr_reg[v] for v in values])
        elif all(v in tr_vote for v in values):
            vote_sessions.append([tr_vote[v] for v in values])
        elif all(v == '' for v in values):
            logger_excel.warning("Empty column found in the by_names excell file. Skipping it.")
        else:
            logger_excel.error("Strange column found in the by_names excell file. Skipping it.")

    # Transpose from one list per vote column to one tuple per representative.
    # list() keeps this a real list on Python 3 as well.
    vote_sessions = list(zip(*vote_sessions))

    if len(reg_sessions) > 1:
        logger_excel.warning("There are more than one registration for this stenogram.")
    elif len(reg_sessions) != 1:
        raise ValueError('No registrations detected in the by-names file.')
    return filter_names(names, parties, reg_sessions[-1], vote_sessions)
try: r = xmltodict.parse(xml_str) name = ( " ".join( [ r["schema"]["Profile"]["Names"]["FirstName"]["@value"], r["schema"]["Profile"]["Names"]["SirName"]["@value"], r["schema"]["Profile"]["Names"]["FamilyName"]["@value"], ] ) .encode("UTF-8") .upper() .strip() ) force = " ".join(r["schema"]["Profile"]["PoliticalForce"]["@value"].split(" ")[:-1]) force = canonical_party_name(force).encode("UTF-8") mail = r["schema"]["Profile"]["E-mail"]["@value"].encode("UTF-8").replace(";", ",").replace(":", ",").strip() except xml.parsers.expat.ExpatError: logger_mps.warning("Parsing the xml file for MP %s failed. Trying csv." % i) try: csv_file = urlopen("http://www.parliament.bg/export.php/bg/csv/MP/%d" % i) data = [l.strip().replace(""", '"').split(";")[:-1] for l in csv_file.readlines()] name = " ".join([d.strip() for d in data[0]]) mail = ", ".join([d.strip() for d in data[9][1:]]) mail = mail.replace(";", ",").replace(":", ",") force = " ".join(data[6][-1].decode("UTF-8").split(" ")[:-1]) force = canonical_party_name(force).encode("UTF-8") except Exception, e: logger_mps.error("The csv file for MP %s is unparsable as well due to %s. Skipping this id." % (i, str(e))) continue url_list.append(original_url)