# Known misspelled MP names mapped to the spelling that should be used
# instead.  Kept at module level so the table is built once, not per call.
_MP_NAME_CORRECTIONS = {
    u'МАРИЯНА ПЕТРОВА ИВАНОВА-НИКОЛОВА': u'МАРИАНА ПЕТРОВА ИВАНОВА-НИКОЛОВА',
    u'ВЕНЦЕСЛАВ ВАСИЛЕВ ВЪРБАНОВ': u'ВЕНЦИСЛАВ ВАСИЛЕВ ВЪРБАНОВ',
    u'АЛЕКСАНДЪР СТОЙЧЕВ СТОЙЧЕВ': u'АЛЕКСАНДЪР СТОЙЧЕВ СТОЙКОВ',
}


def MP_name_spellcheck(name):
    """Return the corrected spelling of an MP name, if one is known.

    Args:
        name: The MP name to check.  It is compared verbatim (exact match)
            against the table of known misspellings.

    Returns:
        The replacement spelling when ``name`` is a known misspelling,
        otherwise ``name`` itself, unchanged.

    Side effects:
        Emits a warning on ``logger_workaround`` whenever a correction is
        applied.
    """
    corrected = _MP_NAME_CORRECTIONS.get(name)
    if corrected is None:
        # Not a known misspelling: hand the name back untouched.
        return name
    # Pass the name as a logger argument so formatting is deferred to the
    # logging framework; the emitted text is the same as before.
    logger_workaround.warning("Spelling error: %s", name)
    return corrected
def filter_names(*args):
    """Drop the entries of people who are not registered as MPs.

    The positional arguments are treated as parallel sequences that are
    zipped into rows; the first sequence supplies the name each row is
    matched on.

    Args:
        *args: Parallel sequences.  Element ``i`` of every sequence forms
            row ``i``; element ``i`` of the first sequence is the name
            checked against the exclusion list.

    Returns:
        ``args`` itself (the tuple of the original sequences) when no row
        was removed.  Otherwise a list with one tuple per input sequence,
        containing only the surviving rows.  Note that if every row is
        removed this list is empty, because there is nothing left to
        transpose.

    Side effects:
        Emits a warning on ``logger_workaround`` when at least one row was
        removed.
    """
    to_filter_out = (u'МИХАИЛ ВЛАДИМИРОВ ВЛАДОВ',
                     u'НИКОЛАЙ НАНКОВ НАНКОВ')
    # Materialise explicitly: on Python 3 zip() and filter() are lazy
    # iterators without len(), so the length comparison below would raise.
    rows = list(zip(*args))
    kept = [row for row in rows if row[0] not in to_filter_out]
    if len(kept) == len(rows):
        # Nothing was excluded; return the caller's sequences untouched.
        return args
    logger_workaround.warning("An MP was filtered out of the by-names list, because they are not registered as an MP.")
    # Transpose the surviving rows back into one tuple per input sequence.
    return list(zip(*kept))
complete_stenogram_page = f.read().decode('utf-8') parser = StenogramsHTMLParser(complete_stenogram_page) date_string = parser.date.strftime('%d%m%y') except Exception as e: logger_to_db.error("Parsing problem with ID %s. %s"%(ID,str(e))) continue try: filename = re.search(r"/pub/StenD/(\d*iv%s.xls)" % date_string, complete_stenogram_page).groups()[0] by_name_web = urlopen("http://www.parliament.bg/pub/StenD/%s" % filename) by_name_temp = open('/tmp/temp.excel', 'wb') by_name_temp.write(by_name_web.read()) by_name_temp.close() if ID == '2766': # XXX Workaround malformated excel file. logger_workaround.warning('Using the workaround for ID 2766.') mp_names, mp_parties, mp_reg_session, mp_vote_sessions = parse_excel_by_name('workarounds/iv050712_ID2766_line32-33_workaround.xls') else: mp_names, mp_parties, mp_reg_session, mp_vote_sessions = parse_excel_by_name('/tmp/temp.excel') except Exception as e: logger_to_db.error("No MP name excel file was found for ID %s due to %s"%(ID,str(e))) problem_by_name = True try: filename = re.search(r"/pub/StenD/(\d*gv%s.xls)" % date_string, complete_stenogram_page).groups()[0] by_party_web = urlopen("http://www.parliament.bg/pub/StenD/%s" % filename) by_party_temp = open('/tmp/temp.excel', 'wb') by_party_temp.write(by_party_web.read()) by_party_temp.close() reg_by_party_dict, sessions = parse_excel_by_party('/tmp/temp.excel')