# --- Scrape MP (Member of Parliament) profiles from parliament.bg ----------
# Downloads the per-MP XML export, extracts name and political force, and
# skips MPs whose profile URL is already recorded in the `mps` DB table.
# NOTE(review): Python 2 code (`unicode`, list-returning `map`) — confirm
# the interpreter version before touching this.
forces_list = []
mails_list = []
url_list = []
# TODO hardcoded value: id of the first mp from the current assembly
indices = map(int, open("data/IDs_MPs").readlines())
cur.execute("""SELECT original_url FROM mps""")
# NOTE(review): zip(*rows)[0] raises IndexError when the table is empty —
# presumably the table is pre-seeded; verify before first run.
urls_already_in_db = set(zip(*cur.fetchall())[0])
for i in range(835, max(indices) + 1):
    original_url = unicode("http://www.parliament.bg/bg/MP/%d" % i)
    # Skip MPs already imported in a previous run (idempotent re-runs).
    if original_url in urls_already_in_db:
        continue
    logger_mps.info("Parsing data for MP id %s" % i)
    # The site exposes a machine-readable XML export per MP id.
    xml_file = unicode("http://www.parliament.bg/export.php/bg/xml/MP/%d" % i)
    xml_str = urlopen(xml_file).read()
    try:
        r = xmltodict.parse(xml_str)
        # Full name = first + patronymic ("SirName") + family, upper-cased
        # UTF-8 bytes.
        name = (
            " ".join(
                [
                    r["schema"]["Profile"]["Names"]["FirstName"]["@value"],
                    r["schema"]["Profile"]["Names"]["SirName"]["@value"],
                    r["schema"]["Profile"]["Names"]["FamilyName"]["@value"],
                ]
            )
            .encode("UTF-8")
            .upper()
            .strip()
        )
        # Drop the last whitespace-separated token of the political-force
        # string — presumably a trailing abbreviation/suffix; confirm against
        # a live profile.
        force = " ".join(r["schema"]["Profile"]["PoliticalForce"]["@value"].split(" ")[:-1])
# --- Scrape plenary stenograms and attendance spreadsheets -----------------
# Reads stenogram IDs for the 41st and 42nd National Assembly from data
# files, fetches each stenogram page, and downloads the matching
# "by name" attendance .xls linked from the page.
logger_to_db = logging.getLogger('to_db')
cur.execute("""SELECT original_url FROM stenograms""")
urls_already_in_db = set(_[0] for _ in cur.fetchall())
# (ID, page URL) pairs; /ns/7/ is the 41st assembly, /ns/50/ the 42nd.
stenogram_IDs = [(i, u'http://www.parliament.bg/bg/plenaryst/ns/7/ID/'+i)
                 for i in map(str.strip, open('data/IDs_plenary_stenograms_41').readlines())]
stenogram_IDs += [(i, u'http://www.parliament.bg/bg/plenaryst/ns/50/ID/'+i)
                  for i in map(str.strip, open('data/IDs_plenary_stenograms_42').readlines())]
# NOTE(review): the [-5:] slice processes only the last 5 stenograms, yet the
# progress message below reports "of len(stenogram_IDs)" — looks like a
# debugging leftover; confirm whether the full list should be iterated.
for i, (ID, original_url) in enumerate(stenogram_IDs[-5:]):
    problem_by_name = False
    problem_by_party = False
    logger_to_db.info("Parsing stenogram %s - %d of %d." % (ID, i+1, len(stenogram_IDs)))
    try:
        f = urlopen(original_url)
        complete_stenogram_page = f.read().decode('utf-8')
        parser = StenogramsHTMLParser(complete_stenogram_page)
        # Date in DDMMYY form, used to locate the attendance file link below.
        date_string = parser.date.strftime('%d%m%y')
    except Exception as e:
        # Best-effort: log and move on to the next stenogram.
        logger_to_db.error("Parsing problem with ID %s. %s"%(ID,str(e)))
        continue
    try:
        # The page embeds a link like /pub/StenD/<digits>iv<DDMMYY>.xls
        # holding the by-name attendance spreadsheet.
        filename = re.search(r"/pub/StenD/(\d*iv%s.xls)" % date_string, complete_stenogram_page).groups()[0]
        by_name_web = urlopen("http://www.parliament.bg/pub/StenD/%s" % filename)
        # Spool the spreadsheet to a fixed temp path for the Excel parser.
        # NOTE(review): fixed /tmp path is race-prone across concurrent runs.
        by_name_temp = open('/tmp/temp.excel', 'wb')
        by_name_temp.write(by_name_web.read())
        by_name_temp.close()
        if ID == '2766':
            # XXX Workaround malformated excel file.
#u'обсъждане(зала първо четене)': 'proposed_1st', see signature 002-02-50 } ############################################################################## # Gather bills. ############################################################################## logger_html_bills = logging.getLogger('html_parser_bills') origurlcur = db.cursor() origurlcur.execute("""SELECT original_url FROM bills""") urls_already_in_db = set(u[0] for u in origurlcur) logger_html_bills.info('Opening calendar.') base_url = 'http://www.parliament.bg' parser_calendar = bs4.BeautifulSoup(urlopen(base_url + '/bg/bills/').read()) for month in parser_calendar.find('div', id='calendar').find_all('a'): href = month.get('href') y,m = map(int, href.split('/')[-1].split('-')) if y<2009 or (y==2009 and m<7): continue # XXX hardcoded check (only last parliament) logger_html_bills.info('Opening calendar %d %d.'%(y, m)) month_page = bs4.BeautifulSoup(urlopen(base_url + href).read()) for a in month_page.find('div', id='monthview').find_all('a'): original_url = base_url + a.get('href') if original_url in urls_already_in_db: continue bill_page = bs4.BeautifulSoup(urlopen(original_url).read()) table = bill_page.find('table', class_='bills') name = table.find_all('tr')[0].find('strong').string.split(u'Законопроект за')[-1].strip() sig = table.find_all('tr')[1].find_all('td')[1].string.strip()