def parse_index(local_name):
    """Parse a downloaded index page and collect the biography links.

    Args:
        local_name: Path of the locally stored index HTML file.

    Returns:
        A list with one entry per anchor whose ``href`` contains
        ``/abgeordnete/biografien/daten`` (each passed through
        ``get_complete_url`` together with ``base_url``), or ``False`` when
        the file cannot be opened.
    """
    try:
        f = open(local_name, 'rt')
    except IOError:
        # Single-argument print(...) prints the same text as the statement form.
        print("ERROR opening " + local_name)
        return False
    # The with-block closes the handle even if reading raises.
    with f:
        text = f.read()

    soup = BeautifulSoup(text)
    marker = "/abgeordnete/biografien/daten"
    urls = []
    for a in soup.find_all("a"):
        href = a.get("href")
        # Anchors without an href yield None. No exact-type test here, so
        # unicode attribute values are accepted as well as plain str.
        if href is not None and marker in href:
            urls.append(get_complete_url(base_url, href))
    return urls
def parse_bio(url):
    """Parse a downloaded Bundestag biography page into an Abgeordneter.

    Args:
        url: Biography URL; it is mapped to a local file name through
            ``get_bio_filename`` and completed against ``base_url`` for the
            record's ``url`` attribute.

    Returns:
        The populated ``Abgeordneter`` instance, or ``False`` when no local
        file name is available, the file cannot be opened, or the biography
        box / name heading is missing from the page.
    """
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; confirm block boundaries against the
    # original file, in particular the placement of `break` and `idx += 1`.
    local_name = get_bio_filename(url)
    if local_name == False:
        return False
    try:
        f = open(local_name, 'rt')
    except IOError:
        print("ERROR opening " + local_name)
        return False
    # The with-block closes the handle even if reading raises.
    with f:
        text = f.read()

    soup = BeautifulSoup(text)

    bio = soup.find("div", "standardBox")
    if not isinstance(bio, Tag):
        print("ERROR: bio-box not found")
        return False

    # name
    h1 = bio.find("h1")
    if not isinstance(h1, Tag):
        print("ERROR: h1 name not found")
        return False

    # create a structure
    A = Abgeordneter()
    A.url = get_complete_url(base_url, url)
    A.gremium = u"Bundestag"
    A.period = u"16"

    # The heading reads "<name>, <party>"; split at the last comma.
    texts = h1.text
    idx = texts.rfind(",")
    if idx < 0:
        A.name = texts.strip()
    else:
        A.name = texts[:idx].strip()
        A.party = texts[idx + 1:].strip()

    # image url
    div = soup.find("div", "bildDivPortrait")
    if isinstance(div, Tag):
        img = div.find("img")
        if isinstance(img, Tag):
            src = img.get("src")
            if src is not None:
                A.img_url = get_complete_url(base_url, src)

    # electoral district (Wahlkreis): an anchor that links to /wahlkreise
    # and whose text contains a colon; the last match wins.
    for a in soup.find_all("a"):
        href = a.get("href")
        if href is not None and "/wahlkreise" in href and ":" in a.text:
            A.wahlkreis = a.text.strip()

    # Non-empty tags after the name: first is the occupation, second the
    # birth line, all following ones are added as statements.
    idx = 0
    for i in h1.next_siblings:
        if not isinstance(i, Tag):
            continue
        tex = i.text.strip()
        if not tex:
            continue
        if idx == 0:
            A.occupation = tex
        elif idx == 1:
            A.birth = tex
        else:
            A.addStatement(tex)
        idx += 1

    # proceed with memberships
    for i in bio.next_siblings:
        # membership box
        if not isinstance(i, Tag) or "Mitglied" not in i.text:
            continue
        for div in i.find_all("div"):
            for h3 in div.find_all("h3"):
                tex = h3.text.strip()
                member_types.add(tex)
                if "Ordentlich" in tex:
                    # full member
                    memb = A.member[0]
                elif "Stellver" in tex:
                    # deputy member
                    memb = A.member[1]
                else:
                    # functions in companies, associations, etc.: collect
                    # every following <p> not classed "kleinAbstand".
                    memb = []
                    for j in h3.next_siblings:
                        if isinstance(j, Tag) and j.name == "p":
                            classes = j.get("class")
                            if classes is None or "kleinAbstand" not in classes:
                                memb.append(j.text.strip())
                    if "vor der Mitgliedschaft" in tex:
                        A.member[3] += A.concat(memb)
                    else:
                        A.member[2] += A.concat(memb)
                    # Remaining headings of this div are skipped.
                    break
                # Full / deputy membership: gather the <li> entries of the
                # lists that follow, up to the next <h3>.
                for j in h3.next_siblings:
                    if not isinstance(j, Tag):
                        continue
                    if j.name == "h3":
                        break
                    if j.name == "ul":
                        for k in j.find_all("li"):
                            memb.append(k.text.strip())
    return A
# Gather the biography URLs from each of the 26 index pages.
# NOTE(review): parse_index is called with two arguments here; confirm this
# matches the parse_index definition this script is meant to run against.
urls = []
for i in range(26):
    urlss = parse_index(get_index_url(i), get_index_filename(i))
    if urlss == False:
        print("parsing failed")
        continue
    urls.extend(urlss)

# Optionally fetch every biography page to its local file.
if do_download == True:
    for url in urls:
        fn = get_bio_filename(url)
        if fn == False:
            # No local file name can be derived: stop the whole run.
            quit()
        download_file(get_complete_url(base_url, url), fn)

# Parse every biography, counting the ones that fail.
errors = 0
people = []
for url in urls:
    print("parsing " + url)
    a = parse_bio(url)
    if a is not False:
        people.append(a)
        continue
    print("in " + get_bio_filename(url))
    errors += 1
def parse_bio(url):
    """Parse a downloaded Landtag Thüringen biography page.

    Args:
        url: Biography URL; it is mapped to a local file name through
            ``get_filename`` and stored unchanged as the record's ``url``.

    Returns:
        The populated ``Abgeordneter`` instance, or ``False`` when the file
        cannot be opened, the content div or the main heading is missing,
        or no "Fraktion:" paragraph was found.
    """
    # NOTE(review): the nesting below was reconstructed from a
    # whitespace-collapsed source; confirm block boundaries against the
    # original file.
    local_name = get_filename(url)
    try:
        f = open(local_name, 'rt')
    except IOError:
        print("ERROR opening " + local_name)
        return False
    # The with-block closes the handle even if reading raises.
    with f:
        text = f.read()

    soup = BeautifulSoup(text)
    cont = soup.find("div", id="content")
    if not isinstance(cont, Tag):
        print("ERROR missing content div")
        return False

    A = Abgeordneter()
    A.url = url
    A.gremium = u"Landtag Thüringen"
    A.period = u"6"

    # main entry
    h2 = cont.find("h2", "alternativFont")
    if not isinstance(h2, Tag):
        print("ERROR missing main h2")
        return False
    A.name = h2.text

    # image url -- guard on the <img> itself (the old check tested `cont`,
    # so a page without an image crashed on None.get).
    img = cont.find("img")
    if isinstance(img, Tag):
        src = img.get("src")
        if src is not None:
            A.img_url = get_complete_url(base_url, src)

    # info below name
    for i in h2.next_siblings:
        if not isinstance(i, Tag):
            continue
        if i.name == "p" and i.text:
            if "Fraktion:" in i.text:
                # Split "Fraktion: <party>, <Wahlkreis>" into its two parts.
                info = i.text[10:]  # remove "Fraktion: "
                idx = info.find(",")
                if idx < 0:
                    A.party = info
                else:
                    A.party = info[:idx]
                    A.wahlkreis = info[idx + 1:].strip()
            elif not A.occupation:
                # TODO: They use <br/>s to split different statements
                A.occupation = i.text
        elif A.party:
            # NOTE(review): this `else` is read as belonging to the
            # paragraph test, i.e. stop at the first non-paragraph tag once
            # the party is known -- confirm against the original layout.
            break

    # verify
    if not A.party:
        print("ERROR missing Fraktion")
        return False

    # general data
    for h2 in cont.find_all("h2"):
        # birth: the <li> texts of the first <ul> after the heading,
        # joined with newlines.
        if "nliche Daten" in h2.text:
            for i in h2.next_siblings:
                if isinstance(i, Tag) and i.name == "ul":
                    for li in i.find_all("li"):
                        if A.birth:
                            A.birth += "\n"
                        A.birth += li.text
                    break
        # memberships: the <li> texts of every <ul> after the heading
        if "Funktionen" in h2.text:
            for i in h2.next_siblings:
                if isinstance(i, Tag) and i.name == "ul":
                    for li in i.find_all("li"):
                        A.member[2].append(li.text)

    # biography: pair each <dd> with the <dt> text that preceded it
    dl = cont.find("dl", "vita")
    if isinstance(dl, Tag):
        first = ""
        for i in dl.children:
            if not isinstance(i, Tag):
                continue
            if i.name == "dt":
                first = i.text
            if i.name == "dd":
                A.statements.append(first + " " + i.text)
                first = ""

    return A