def ep_parse_wikipedia_airports(url): parser = lxml.html.HTMLParser() data, date = fetchdata.getdata(url, country="wikipedia") parser.feed(data) tree = parser.close() res = [] for table in tree.xpath("//table"): for nr, row in enumerate(table.xpath(".//tr")): cols = list([alltext(x) for x in row.xpath(".//td")]) print "#", nr, ": ", cols if nr == 0: if len(cols) == 0 or cols[0].strip() != "Airport": break assert cols[3].strip() == "ICAO" assert cols[4].strip() == "Purpose" assert cols[5].strip().count("El") assert cols[9].strip() == "Coordinates" else: purpose = cols[4].strip() if purpose.count("Unused"): continue if purpose.count("Closed"): continue if purpose.count("Liquidated"): continue if purpose == "Military": continue #Just military icao = cols[3].strip() if icao == "": icao = "ZZZZ" name = cols[0].strip() #print "lats:",row.xpath(".//span[@class='latitude']") lat, = alltexts(row.xpath(".//span[@class='latitude']")) lon, = alltexts(row.xpath(".//span[@class='longitude']")) coords = fixup(lat.strip() + " " + lon.strip()) elevft = float(cols[5].strip()) res.append( dict(pos=mapper.parsecoord(coords), name=name, elev=elevft / 0.3048, icao=icao, date=date, url=url)) return res
continue if not seenreal and re.match( ur".*The\s*line\s*joining.*", text): continue if not seenreal and text.endswith("following points:"): continue if not seenreal and text == "points:": continue if text.endswith("E"): text = text + " - " seenreal = True coords.append(text) last = sub pass assert points == None coordstr = fixup(" ".join(coords)) print "Raw coords:", coordstr points = mapper.parse_coord_str(coordstr) assert ceiling assert floor assert ctrname spaces.append( dict(name=ctrname, points=points, type="CTR", ceiling=ceiling, floor=floor, freqs=freqs)) #not first page: assert points != None