def parse_mountain_area(): p = Parser("/AIP/ENR/ENR%201/ES_ENR_1_1_en.pdf") #alongborder="610213N 0114917E - 632701N 0114917E - 661457N 0141140E - 682200N 0173441E - 683923N 0183004E - 683141N 0194631E - 690945N 0202604E - 683533N 0221411E - 680424N 0233833E - 670159N 0240734E - 663602N 0240455E - " areas = [] for pagenr in xrange(p.get_num_pages()): #print "Processing page %d"%(pagenr,) page = p.parse_page_to_items(pagenr) lines = page.get_lines(page.get_all_items()) allofit = " ".join(lines) allofit = allofit.replace( u"along the Swedish/Norwegian and Swedish/Finnish border to", u"Along the common X/Y state boundary to") allofit = allofit.replace(u"–", "-") coordarea = re.match( ur".*Mountainous\s+area\s+of\s+Sweden.{1,10}lateral\s+limits(.*?)AIRAC.*", allofit) if coordarea: points = [] txt, = coordarea.groups() print "area:<", txt, ">" points = mapper.parse_coord_str(txt, context="sweden") assert (len(points) > 3) print "Point:", len(points) areas.append( dict(name="Mountainous Area", floor="GND", ceiling="UNL", points=points, type="mountainarea", freqs=[])) print len(areas) assert len(areas) == 1 return areas
def parse_areas(areas, atype):
    """Parse newline-separated area text into airspace dicts.

    areas -- raw text; split into per-area groups of lines by splitareas().
             Each group is [name, coordinate-line..., altitude-line(s)].
    atype -- airspace type tag stored in each yielded dict.

    Yields one dict per area with keys
    name/type/floor/ceiling/freqs/points.
    """
    areas = splitareas(areas.split("\n"))
    for area in areas:
        name = area[0].strip()
        assert len(name)
        # Altitudes appear either as two separate trailing lines
        # (ceiling then floor) or combined on the final line; fall back
        # to UNK/UNK when neither form parses.
        if len(area) > 2 and is_alt(area[-2]):
            ceiling = parse_alt(area[-2])
            floor = parse_alt(area[-1])
            areapart = area[1:-2]
        else:
            try:
                floor, ceiling = parse_alts(area[-1])
                areapart = area[1:-1]
            except Exception:
                # Narrowed from a bare "except:" so KeyboardInterrupt /
                # SystemExit are no longer silently swallowed.
                floor, ceiling = "UNK", "UNK"
                areapart = area[1:]
        # Lines starting with "*" are annotations, not coordinates.
        coords = "-".join([r.replace(" ", "").strip()
                           for r in areapart if not r.startswith("*")])
        if coords.count("RADIUS"):
            coords = fix_circle(coords)
        # print "name:",name,"coords:",coords
        yield dict(
            name=unicode(name, "utf8"),
            type=atype,
            floor=floor,
            freqs=[],
            ceiling=ceiling,
            points=mapper.parse_coord_str(coords),
        )
def parse_areas(areas, atype):
    """Parse newline-separated area text into airspace dicts.

    NOTE(review): this is a duplicate definition of parse_areas; the
    later definition shadows the earlier one -- confirm which is wanted.

    areas -- raw text; split into per-area groups of lines by splitareas().
    atype -- airspace type tag stored in each yielded dict.
    Yields one dict per area.
    """
    areas = splitareas(areas.split("\n"))
    for area in areas:
        name = area[0].strip()
        assert len(name)
        # Two trailing altitude lines (ceiling, floor), or a single
        # combined line; UNK/UNK when neither parses.
        if len(area) > 2 and is_alt(area[-2]):
            ceiling = parse_alt(area[-2])
            floor = parse_alt(area[-1])
            areapart = area[1:-2]
        else:
            try:
                floor, ceiling = parse_alts(area[-1])
                areapart = area[1:-1]
            except Exception:
                # Narrowed from a bare "except:"; do not swallow
                # KeyboardInterrupt/SystemExit.
                floor, ceiling = "UNK", "UNK"
                areapart = area[1:]
        # "*"-prefixed lines are annotations, not coordinates.
        coords = "-".join([
            r.replace(" ", "").strip()
            for r in areapart if not r.startswith("*")
        ])
        if coords.count("RADIUS"):
            coords = fix_circle(coords)
        #print "name:",name,"coords:",coords
        yield dict(name=unicode(name, 'utf8'),
                   type=atype,
                   floor=floor,
                   freqs=[],
                   ceiling=ceiling,
                   points=mapper.parse_coord_str(coords))
def parse_mountain_area(): p=Parser("/AIP/ENR/ENR%201/ES_ENR_1_1_en.pdf") #alongborder="610213N 0114917E - 632701N 0114917E - 661457N 0141140E - 682200N 0173441E - 683923N 0183004E - 683141N 0194631E - 690945N 0202604E - 683533N 0221411E - 680424N 0233833E - 670159N 0240734E - 663602N 0240455E - " areas=[] for pagenr in xrange(p.get_num_pages()): #print "Processing page %d"%(pagenr,) page=p.parse_page_to_items(pagenr) lines=page.get_lines(page.get_all_items()) allofit=" ".join(lines) allofit=allofit.replace(u"along the Swedish/Norwegian and Swedish/Finnish border to", u"Along the common X/Y state boundary to" ) allofit=allofit.replace(u"–","-") coordarea=re.match(ur".*Mountainous\s+area\s+of\s+Sweden.{1,10}lateral\s+limits(.*?)AIRAC.*",allofit) if coordarea: points=[] txt,=coordarea.groups() print "area:<",txt,">" points=mapper.parse_coord_str(txt,context="sweden") assert(len(points)>3) print "Point:",len(points) areas.append(dict( name="Mountainous Area", floor="GND", ceiling="UNL", points=points, type="mountainarea", freqs=[])) print len(areas) assert len(areas)==1 return areas
def emit():
    """Build the airspace dict for the glider sector accumulated in the
    enclosing scope (name/floor/ceiling/coords closure variables).

    Raises Exception if the sector is missing its name or either
    vertical limit. Returns a dict of type "segel".
    """
    # PEP 8: compare against None with "is", not "==".
    if name is None:
        raise Exception("Area is missing name")
    if ceiling is None or floor is None:
        raise Exception("Area is missing floor or ceiling")
    cd = " - ".join(coords)
    ret = dict(floor=floor,
               ceiling=ceiling,
               freqs=[],
               type="segel",
               name=name + " glider sector",
               points=mapper.parse_coord_str(cd))
    return ret
def emit():
    """Assemble and return the "segel" (glider sector) airspace dict from
    the closure variables name/floor/ceiling/coords.

    NOTE(review): duplicate of an earlier emit() definition in this file.
    Raises Exception when name or a vertical limit is missing.
    """
    # PEP 8: identity comparison with None.
    if name is None:
        raise Exception("Area is missing name")
    if ceiling is None or floor is None:
        raise Exception("Area is missing floor or ceiling")
    cd = " - ".join(coords)
    ret = dict(
        floor=floor,
        ceiling=ceiling,
        freqs=[],
        type="segel",
        name=name + " glider sector",
        points=mapper.parse_coord_str(cd))
    return ret
def ee_parse_tma():
    """Parse Estonian TMA airspace from eAIP EE-ENR-2.1 and append the
    Tallinn FIR boundary.

    NOTE(review): as visible here the function builds `res` but never
    returns it -- presumably a `return res` exists (or is intended) in
    the original file; verify.
    """
    def fixgote(raw):
        # No Estonia-specific text fixups needed; pass the PDF through.
        return raw
    p = parse.Parser(r"/2012-03-08/pdf/EE-ENR-2.1.pdf", fixgote, country='ee')
    res = []
    # Page 0 is skipped (front matter); TMA tables start on page 1.
    for pagenr in xrange(1, p.get_num_pages()):
        parsed = parse_page(p, pagenr)
        res.extend(parsed)
    # The FIR outline is not in the tables; it is hard-coded here.
    res.append(
        dict(name="TALLIN FIR",
             icao="EETT",
             floor='GND',
             ceiling='-',
             freqs=[],
             type='FIR',
             date=datetime(2011, 03, 25),
             points=mapper.parse_coord_str("""
        592818N 0280236E -
        Along the common Estonian/X state boundary to 573100N 0272000E -
        Along the common Estonian/X state boundary to 575300N 0242200E -
        575228N 0242124E- 575502N 0241540E-575357N 0241234E-
        575357N 0233604E-574658N 0233855E-
        574011N 0233456E-573538N 0232422E-
        573511N 0231051E-574208N 0225957E-
        574650N 0225428E-575627N 0224227E-
        575539N 0223501E-574645N 0220836E-
        574458N 0215458E-574547N 0215034E-
        574712N 0214300E-575124N 0213848E-
        575342N 0213648E-580700N 0212900E-
        582448N 0203834E-590000N 0210000E-
        595300N 0245100E-595430N 0252000E-
        595300N 0255200E-595200N 0255830E-
        593642N 0273812E-592818N 0280236E
        """, context='estonia')))
def ee_parse_tma(): def fixgote(raw): return raw p=parse.Parser(r"/2012-03-08/pdf/EE-ENR-2.1.pdf",fixgote,country='ee') res=[] for pagenr in xrange(1,p.get_num_pages()): parsed=parse_page(p,pagenr)#pagenr) res.extend(parsed) res.append(dict( name="TALLIN FIR", icao="EETT", floor='GND', ceiling='-', freqs=[], type='FIR', date=datetime(2011,03,25), points=mapper.parse_coord_str(""" 592818N 0280236E - Along the common Estonian/X state boundary to 573100N 0272000E - Along the common Estonian/X state boundary to 575300N 0242200E - 575228N 0242124E- 575502N 0241540E-575357N 0241234E- 575357N 0233604E-574658N 0233855E- 574011N 0233456E-573538N 0232422E- 573511N 0231051E-574208N 0225957E- 574650N 0225428E-575627N 0224227E- 575539N 0223501E-574645N 0220836E- 574458N 0215458E-574547N 0215034E- 574712N 0214300E-575124N 0213848E- 575342N 0213648E-580700N 0212900E- 582448N 0203834E-590000N 0210000E- 595300N 0245100E-595430N 0252000E- 595300N 0255200E-595200N 0255830E- 593642N 0273812E-592818N 0280236E """,context='estonia')))
def fi_parse_restrictions():
    """Parse Finnish restricted/TSA airspace from eAIP ENR 5.1 and 5.2.

    Two passes: ENR 5.2 yields "EF TRA/TSA nn" areas (GND-UNL, type TSA);
    ENR 5.1 yields EF P/R/D areas (type R) with vertical limits read from
    a separate column. Returns a list of airspace dicts.
    """
    spaces = []
    p = parse.Parser("/ais/eaip/pdf/enr/EF_ENR_5_2_EN.pdf", lambda x: x, country='fi')
    for pagenr in xrange(p.get_num_pages()):
        page = p.parse_page_to_items(pagenr)
        # Sentinel None marks "rest of page" for the last heading.
        headings = list(page.get_by_regex(ur"EF T[RS]A \d+")) + [None]
        for tra, next in izip(headings, headings[1:]):
            y1 = tra.y2 + 0.1
            if next:
                y2 = next.y1 - 0.1
            else:
                y2 = 100
            o = []
            for line in page.get_lines(page.get_partially_in_rect(
                    0, y1, 100, y2)):
                line = line.strip()
                # Re-join "clock-"/"wise" hyphenation split across lines.
                if line.endswith("clock-"):
                    line = line.rstrip("-")
                line = line.replace("to the point -", "to the point ")
                print "Eval", line
                # Blank line terminates the coordinate block.
                if line == "":
                    break
                o.append(line)
            print "AREA:<", "".join(o), ">"
            kind, number = re.match("EF (T[RS]A) (\d+)", tra.text).groups()
            spaces.append(dict(
                name="EF %s %s" % (kind, number),
                points=mapper.parse_coord_str("".join(o), context="finland"),
                ceiling="UNL",
                floor="GND",
                type="TSA",
                freqs=[]
            ))
    # Second pass: prohibited/restricted/danger areas in ENR 5.1.
    p = parse.Parser("/ais/eaip/pdf/enr/EF_ENR_5_1_EN.pdf", lambda x: x, country='fi')
    for pagenr in xrange(p.get_num_pages()):
        page = p.parse_page_to_items(pagenr)
        # Area headings plus the Finnish column header row, sorted top-down;
        # sentinel None again marks end-of-page.
        raws = list(sorted(page.get_by_regex(
            ur"(?:EF [PRD]\d+[A-Z]{0,2} .*)|(?:.*Tunnus, nimi ja sivurajat.*)"),
            key=lambda x: x.y1)) + [None]
        for cur, next in izip(raws[:-1], raws[1:]):
            if cur.text.count("Tunnus, nimi ja sivurajat"):
                continue  # column header, not a real airspace
            space = dict()
            if next == None:
                y2 = 100
            else:
                y2 = next.y1 - 1.75
            name = cur.text.strip()
            space['name'] = name
            if name.startswith("EF R28"):
                # This airspace is special, and not supported now (it's the
                # no-mans-land zone on the border to Russia!)
                continue
            # Lateral-limit text sits just below/right of the heading.
            areaspecprim = page.get_lines(page.get_partially_in_rect(
                cur.x1 + 0.01, cur.y2 + 0.05, cur.x1 + 50, y2))
            areaspec = []
            for area in areaspecprim:
                # A blank line after content ends the coordinate block.
                if len(areaspec) and area.strip() == "":
                    break
                areaspec.append(area)
            print "Y-interval:", cur.y1, y2, "next:", next
            print "Name:", space['name']
            print "areaspec:", areaspec
            space['points'] = mapper.parse_coord_str("".join(areaspec))
            # Vertical limits are in a column roughly 55-70 units to the right.
            vertitems = page.get_partially_in_rect(
                cur.x1 + 55, cur.y1 + 0.05, cur.x1 + 70, y2 + 1.5)
            vertspec = [x.strip() for x in page.get_lines(vertitems) if x.strip()]
            print repr(vertspec)
            assert len(vertspec) == 2
            ceiling, floor = vertspec
            space['ceiling'] = ceiling
            space['floor'] = floor
            space['type'] = 'R'
            space['freqs'] = []
            spaces.append(space)
    return spaces
def ev_parse_tma():
    """Scrape Latvian TMA/FIR airspace from the eAIP ENR 2.1 HTML page.

    Builds airspace dicts (name, polygon, vertical limits, frequencies)
    from the 5-column tables; the RIGA FIR/UIR row gets icao "EVRR".

    NOTE(review): as captured here the function never returns `out` and
    `got_fir` is set but unused -- presumably a trailing
    `assert got_fir; return out` exists in the original file; verify.
    """
    out = []
    parser = lxml.html.HTMLParser()
    #url="/Latvia_EV-ENR-2.1-en-GB.html"
    cur_airac = get_cur_airac()
    url = "/eAIPfiles/%s-AIRAC/html/eAIP/EV-ENR-2.1-en-GB.html" % (cur_airac, )
    data, date = fetchdata.getdata(url, country='ev')
    parser.feed(data)
    tree = parser.close()
    got_fir = False
    for table in tree.xpath("//table"):
        #print "Table with %d children"%(len(table.getchildren()),)
        rows = list(table.xpath(".//tr"))
        # Locate the heading row among the first five rows (for/else:
        # raise when none of them has the expected five <th> cells).
        for idx in xrange(5):
            headingrow = rows[idx]
            cols = list(headingrow.xpath(".//th"))
            #print len(cols)
            if len(cols) == 5:
                break
        else:
            raise Exception("No heading row")
        assert idx == 0
        #for idx,col in enumerate(cols):
        #    print "Col %d, %s"%(idx,alltext(col)[:10])
        nameh, unith, callsignh, freqh, remarkh = cols
        assert alltext(nameh).lower().count("name")
        assert alltext(unith).lower().count("unit")
        assert re.match(ur"call\s*sign", alltext(callsignh).lower())
        lastcols = None
        for row in rows[1:]:
            cols = list(row.xpath(".//td"))
            if len(cols) == 5:
                name, unit, callsign, freq, remark = cols
                lastcols = cols
            else:
                # Continuation row: only the name cell is new; reuse the
                # unit/callsign/freq/remark cells from the previous full row.
                if lastcols:
                    unit, callsign, freq, remark = lastcols[1:]
                    name = cols[0]
                else:
                    continue
            lines = [x.strip() for x in alltext(name).split("\n") if x.strip()]
            if len(lines) == 0:
                continue
            spacename = lines[0].strip()
            # Skip en-route sectors; only terminal airspace is wanted here.
            if re.match(ur"RIGA\s*UTA|RIGA\s*CTA|RIGA\s*AOR.*", spacename):
                continue
            freqstr = alltext(freq)
            callsignstr = alltext(callsign)
            if freqstr.strip():
                print freqstr
                freqmhzs = re.findall(ur"\d{3}\.\d{3}", freqstr)
                assert len(freqmhzs) <= 2
                # NOTE(review): only one callsign is collected but up to two
                # frequencies are allowed -- callsigns[idx] can raise
                # IndexError when idx == 1; confirm against live data.
                callsigns = [callsignstr.split("\n")[0].strip()]
                freqs = []
                for idx, freqmhz in enumerate(freqmhzs):
                    if freqmhz == '121.500':
                        continue  # emergency frequency is implied everywhere
                    freqs.append((callsigns[idx], float(freqmhz)))
                print "freqs:", freqs
            else:
                freqs = []
            assert len(lines)
            # Index of the "class of airspace" line, searched from the end.
            classidx = next(idx for idx, x in reversed(list(enumerate(lines)))
                            if x.lower().count("class of airspace"))
            if re.match(ur"RIGA\s*FIR.*UIR", spacename, re.UNICODE):
                got_fir = True
                lastspaceidx = classidx - 2
                floor = "GND"
                ceiling = "-"
                type_ = "FIR"
            else:
                # Limits are either "floor/ceiling" on one line or on two
                # separate lines above the class line.
                if lines[classidx - 1].count("/") == 1:
                    floor, ceiling = lines[classidx - 1].split("/")
                    lastspaceidx = classidx - 1
                else:
                    floor = lines[classidx - 1]
                    ceiling = lines[classidx - 2]
                    lastspaceidx = classidx - 2
                ceiling = strangefix(ceiling)
                floor = strangefix(floor)
                # Parse now purely to validate that both limits are readable.
                mapper.parse_elev(ceiling)
                mapper.parse_elev(floor)
                type_ = "TMA"
            tcoords = lines[1:lastspaceidx]
            #verify that we got actual altitudes:
            coords = []
            for coord in tcoords:
                coord = coord.strip().replace("(counter-)", "").replace(
                    "(RIGA DVOR - RIA)", "")
                # Ensure each coordinate pair ends with a separator.
                if coord.endswith(u"E") or coord.endswith("W"):
                    coord = coord + " -"
                coords.append(coord)
            raw = " ".join(coords)
            raw = re.sub(
                s(ur"Area bounded by lines successively joining the following points:"
                  ), "", raw)
            print "Raw:", raw
            coords = mapper.parse_coord_str(raw, context='latvia')
            for cleaned in clean_up_polygon(coords):
                out.append(
                    dict(name=spacename,
                         points=cleaned,
                         type=type_,
                         freqs=freqs,
                         floor=floor,
                         url=url,
                         date=date,
                         ceiling=ceiling))
            if type_ == 'FIR':
                out[-1]['icao'] = "EVRR"
def extract_airfields(filtericao=lambda x:True,purge=True):
    """Extract Swedish airfields from the AIP.

    Walks the AD 1.1 aerodrome directory for names/ICAO/positions, then the
    per-airport AD 2 documents for VAC holding/entry-exit points, runway
    thresholds, COM frequencies and ATS airspace; finally merges in extra
    airfields and flygkartan.csv matches and writes regression files.

    filtericao -- predicate deciding which aerodrome dicts to keep.
    purge      -- currently unused (chart purging is commented out).
    Returns (ads, points) where ads is a list of aerodrome dicts and
    points is a list of named point dicts.
    """
    #print getxml("/AIP/AD/AD 1/ES_AD_1_1_en.pdf")
    ads=[]
    p=Parser("/AIP/AD/AD 1/ES_AD_1_1_en.pdf")
    points=dict()
    startpage=None
    # Find the first page of the aerodrome directory.
    for pagenr in xrange(p.get_num_pages()):
        page=p.parse_page_to_items(pagenr)
        if page.count("Aerodrome directory"):
            startpage=pagenr
            break
    if startpage==None:
        raise Exception("Couldn't find aerodrome directory in file")
    #print "Startpage: %d"%(startpage,)
    #nochartf=open("nochart.txt","w")
    for pagenr in xrange(startpage,p.get_num_pages()):
        row_y=[]
        page=p.parse_page_to_items(pagenr)
        allines=[x for x in (page.get_lines(page.get_partially_in_rect(0,0,15,100))) if x.strip()]
        # A directory row is an upper-case name line followed by a line
        # starting with a 4-letter ICAO code; record each row's y position.
        for item,next in zip(allines,allines[1:]+[""]):
            #print "item:",item
            m=re.match(ur"^\s*[A-ZÅÄÖ]{3,}(?:/.*)?\b.*",item)
            if m:
                #print "Candidate, next is:",next
                if re.match(r"^\s*[A-Z]{4}\b.*",next):
                    #print "Matched:",item
                    row_y.append(item.y1)
        for y1,y2 in zip(row_y,row_y[1:]+[100.0]):
            #print "Extacting from y-range: %f-%f"%(y1,y2)
            items=list(page.get_partially_in_rect(0,y1-0.25,5.0,y2+0.25,ysort=True))
            if len(items)>=2:
                #print "Extract items",items
                ad=dict(name=unicode(items[0].text).strip(),
                        icao=unicode(items[1].text).strip()
                        )
                #print "Icao:",ad['icao']
                assert re.match(r"[A-Z]{4}",ad['icao'])
                if not filtericao(ad): continue
                if len(items)>=3:
                    #print "Coord?:",items[2].text
                    m=re.match(r".*(\d{6}N)\s*(\d{7}E).*",items[2].text)
                    if m:
                        lat,lon=m.groups()
                        ad['pos']=parse_coords(lat,lon)
                        #print "Items3:",items[3:]
                        # NOTE(review): nesting of the elevation block is
                        # reconstructed -- confirm against original source.
                        elev=re.findall(r"(\d{1,5})\s*ft"," ".join(t.text for t in items[3:]))
                        #print "Elev:",elev
                        assert len(elev)==1
                        ad['elev']=int(elev[0])
                ads.append(ad)
    # Airports with no position on the directory page get full AD 2 parsing.
    big_ad=set()
    for ad in ads:
        if not ad.has_key('pos'):
            big_ad.add(ad['icao'])
    # Pass 1 over big airports: VAC charts -> holding / entry-exit points.
    for ad in ads:
        icao=ad['icao']
        if icao in big_ad:
            if icao in ['ESIB','ESNY','ESCM','ESPE']: continue
            # The VAC file name varies: try "6_1" then fall back to "6-1".
            try:
                p=Parser("/AIP/AD/AD 2/%s/ES_AD_2_%s_6_1_en.pdf"%(icao,icao))
            except:
                p=Parser("/AIP/AD/AD 2/%s/ES_AD_2_%s_6-1_en.pdf"%(icao,icao))
            ad['aipvacurl']=p.get_url()
            for pagenr in xrange(p.get_num_pages()):
                page=p.parse_page_to_items(pagenr)
                """
                for altline in exitlines:
                    m=re.match(r"(\w+)\s+(\d+N)\s*(\d+E.*)",altline)
                    if not m: continue
                    name,lat,lon=m.groups()
                    try:
                        coord=parse_coords(lat,lon)
                    except Exception:
                        continue
                    points.append(dict(name=name,pos=coord))
                """
                # kind 0 = holding points, kind 1 = entry/exit points; the
                # loop variable is rebound to the kind's label string.
                for kind in xrange(2):
                    if kind==0:
                        hits=page.get_by_regex(r"H[Oo][Ll][Dd][Ii][Nn][Gg]")
                        kind="holding point"
                    if kind==1:
                        hits=page.get_by_regex(r"[Ee]ntry.*[Ee]xit.*point")
                        kind="entry/exit point"
                    if len(hits)==0: continue
                    for holdingheading in hits:
                        items=sorted(page.get_partially_in_rect(holdingheading.x1+2.0,holdingheading.y2+0.1,holdingheading.x1+0.5,100),
                                     key=lambda x:x.y1)
                        items=[x for x in items if not x.text.startswith(" ")]
                        #print "Holding items:",items
                        for idx,item in enumerate(items):
                            print "Holding item",item
                            y1=item.y1
                            if idx==len(items)-1:
                                y2=100
                            else:
                                y2=items[idx+1].y1
                            items2=[x for x in page.get_partially_in_rect(item.x1+1,y1+0.3,item.x1+40,y2-0.1)
                                    if x.x1>=item.x1-0.25 and x.y1>=y1-0.05 and x.y1<y2-0.05]
                            s=(" ".join(page.get_lines(items2))).strip()
                            print "Holding lines:",repr(page.get_lines(items2))
                            #if s.startswith("ft Left/3"): #Special case for ESOK
                            #    s,=re.match("ft Left/3.*?([A-Z]{4,}.*)",s).groups()
                            #m=re.match("ft Left/\d+.*?([A-Z]{4,}.*)",s)
                            #if m:
                            #    s,=m.groups()
                            if s.startswith("LjUNG"): #Really strange problem with ESCF
                                s=s[0]+"J"+s[2:]
                            if s.lower().startswith("holding"):
                                sl=s.split(" ",1)
                                if len(sl)>1:
                                    s=sl[1]
                            s=s.strip()
                            if kind=="entry/exit point" and s.startswith("HOLDING"):
                                continue #reached HOLDING-part of VAC
                            #Check for other headings
                            #Fixup strange formatting of points in some holding items: (whitespace between coord and 'E')
                            s=re.sub(ur"(\d+)\s*(N)\s*(\d+)\s*(E)",lambda x:"".join(x.groups()),s)
                            m=re.match(r"([A-Z]{2,}).*?(\d+N)\s*(\d+E).*",s)
                            if not m:
                                # Unnamed point: only special-case ESKN
                                # (Skavsta) gets synthetic names.
                                m=re.match(r".*?(\d+N)\s*(\d+E).*",s)
                                if not m: continue
                                assert m
                                lat,lon=m.groups()
                                #skavsta
                                if icao=="ESKN":
                                    if s.startswith(u"Hold north of T"):
                                        name="NORTH"
                                    elif s.startswith(u"Hold south of B"):
                                        name="SOUTH"
                                    else:
                                        assert 0 #add more specials here
                                else:
                                    continue
                            else:
                                name,lat,lon=m.groups()
                            try:
                                coord=parse_coords(lat,lon)
                            except Exception:
                                print "Couldn't parse:",lat,lon
                                continue
                            #print name,lat,lon,mapper.format_lfv(*mapper.from_str(coord))
                            if name.count("REMARK") or len(name)<=2:
                                print "Suspicious name: ",name
                                #sys.exit(1)
                                continue
                            points[icao+' '+name]=dict(name=icao+' '+name,icao=icao,pos=coord,kind=kind)
    #for point in points.items():
    #    print point
    #sys.exit(1)
    def fixhex11(s):
        # Replace control characters (except TAB/LF/CR) with spaces; some
        # AD 2 PDFs contain stray low bytes that break parsing.
        out=[]
        for c in s:
            i=ord(c)
            if i>=0x20:
                out.append(c)
                continue
            if i in [0x9,0xa,0xd]:
                out.append(c)
                continue
            out.append(' ')
        return "".join(out)
    # Pass 2 over big airports: the AD 2 text document -> position,
    # elevation, runway thresholds, frequencies, ATS airspace.
    for ad in ads:
        icao=ad['icao']
        if icao in big_ad:
            #print "Parsing ",icao
            p=Parser("/AIP/AD/AD 2/%s/ES_AD_2_%s_en.pdf"%(icao,icao),loadhook=fixhex11)
            ad['aiptexturl']=p.get_url()
            firstpage=p.parse_page_to_items(0)
            te="\n".join(firstpage.get_all_lines())
            #print te
            coords=re.findall(r"ARP.*(\d{6}N)\s*(\d{7}E)",te)
            if len(coords)>1:
                raise Exception("First page of airport info (%s) does not contain exactly ONE set of coordinates"%(icao,))
            if len(coords)==0:
                print "Couldn't find coords for ",icao
            #print "Coords:",coords
            ad['pos']=parse_coords(*coords[0])
            elev=re.findall(r"Elevation.*?(\d{1,5})\s*ft",te,re.DOTALL)
            if len(elev)>1:
                raise Exception("First page of airport info (%s) does not contain exactly ONE elevation in ft"%(icao,))
            if len(elev)==0:
                print "Couldn't find elev for ",icao
            ad['elev']=int(elev[0])
            freqs=[]
            found=False
            thrs=[]
            #uprint("-------------------------------------")
            for pagenr in xrange(p.get_num_pages()):
                page=p.parse_page_to_items(pagenr)
                #uprint("Looking on page %d"%(pagenr,))
                if 0: #opening hours are no longer stored in a separate document for any airports. No need to detect which any more (since none are).
                    for item in page.get_by_regex(".*OPERATIONAL HOURS.*"):
                        lines=page.get_lines(page.get_partially_in_rect(0,item.y2+0.1,100,100))
                        for line in lines:
                            things=["ATS","Fuelling","Operating"]
                            if not line.count("AIP SUP"): continue
                            for thing in things:
                                if line.count(thing):
                                    ad['aipsup']=True
                # Runway thresholds: latitude on one line, longitude on the
                # next; the runway designator is looked up to the left.
                for item in page.get_by_regex(".*\s*RUNWAY\s*PHYSICAL\s*CHARACTERISTICS\s*.*"):
                    #uprint("Physical char on page")
                    lines=page.get_lines(page.get_partially_in_rect(0,item.y2+0.1,100,100))
                    seen_end_rwy_text=False
                    for line,nextline in izip(lines,lines[1:]+[None]):
                        #uprint("MAtching: <%s>"%(line,))
                        if re.match(ur"AD\s+2.13",line): break
                        if line.count("Slope of"): break
                        if line.lower().count("end rwy:"): seen_end_rwy_text=True
                        if line.lower().count("bgn rwy:"): seen_end_rwy_text=True
                        m=re.match(ur".*(\d{6}\.\d+)[\s\(\)\*]*(N).*",line)
                        if not m:continue
                        m2=re.match(ur".*(\d{6,7}\.\d+)\s*[\s\(\)\*]*(E).*",nextline)
                        if not m2:continue
                        latd,n=m.groups()
                        lond,e=m2.groups()
                        assert n=="N"
                        assert e=="E"
                        lat=latd+n
                        lon=lond+e
                        rwytxts=page.get_lines(page.get_partially_in_rect(0,line.y1+0.05,12,nextline.y2-0.05))
                        uprint("Rwytxts:",rwytxts)
                        rwy=None
                        for rwytxt in rwytxts:
                            #uprint("lat,lon:%s,%s"%(lat,lon))
                            #uprint("rwytext:",rwytxt)
                            m=re.match(ur"\s*(\d{2}[LRCM]?)\b.*",rwytxt)
                            if m:
                                assert rwy==None
                                rwy=m.groups()[0]
                        if rwy==None and seen_end_rwy_text:
                            continue
                        print "Cur airport:",icao
                        already=False
                        assert rwy!=None
                        seen_end_rwy_text=False
                        for thr in thrs:
                            if thr['thr']==rwy:
                                raise Exception("Same runway twice on airfield:"+icao)
                        thrs.append(dict(pos=mapper.parse_coords(lat,lon),thr=rwy))
            assert len(thrs)>=2
            # COM frequencies from the "ATS COMMUNICATION FACILITIES" table.
            for pagenr in xrange(0,p.get_num_pages()):
                page=p.parse_page_to_items(pagenr)
                matches=page.get_by_regex(r".*ATS\s+COMMUNICATION\s+FACILITIES.*")
                #print "Matches of ATS COMMUNICATION FACILITIES on page %d: %s"%(pagenr,matches)
                if len(matches)>0:
                    commitem=matches[0]
                    curname=None
                    callsign=page.get_by_regex_in_rect(ur"Call\s*sign",0,commitem.y1,100,commitem.y2+8)[0]
                    for idx,item in enumerate(page.get_lines(page.get_partially_in_rect(callsign.x1-0.5,commitem.y1,100,100),fudge=0.3,order_fudge=15)):
                        if item.strip()=="":
                            curname=None
                        if re.match(".*RADIO\s+NAVIGATION\s+AND\s+LANDING\s+AIDS.*",item):
                            break
                        #print "Matching:",item
                        m=re.match(r"(.*?)\s*(\d{3}\.\d{1,3})\s*MHz.*",item)
                        #print "MHZ-match:",m
                        if not m: continue
                        #print "MHZ-match:",m.groups()
                        who,sfreq=m.groups()
                        freq=float(sfreq)
                        if abs(freq-121.5)<1e-4:
                            if who.strip():
                                curname=who
                            continue #Ignore emergency frequency, it is understood
                        if not who.strip():
                            # Nameless row: keep the last seen station name.
                            if curname==None: continue
                        else:
                            curname=who
                        freqs.append((curname.strip().rstrip("/"),freq))
            # ATS airspace: lateral limits per subspace plus vertical limits.
            for pagenr in xrange(0,p.get_num_pages()):
                page=p.parse_page_to_items(pagenr)
                matches=page.get_by_regex(r".*ATS\s*AIRSPACE.*")
                #print "Matches of ATS_AIRSPACE on page %d: %s"%(pagenr,matches)
                if len(matches)>0:
                    heading=matches[0]
                    desigitem,=page.get_by_regex("Designation and lateral limits")
                    vertitem,=page.get_by_regex("Vertical limits")
                    airspaceclass,=page.get_by_regex("Airspace classification")
                    lastname=None
                    subspacelines=dict()
                    subspacealts=dict()
                    for idx,item in enumerate(page.get_lines(page.get_partially_in_rect(desigitem.x2+1,desigitem.y1,100,vertitem.y1-1))):
                        if item.count("ATS airspace not established"):
                            assert idx==0
                            break
                        if item.strip()=="": continue
                        # Split "NAME 123456N ..." into name and coordinates.
                        m=re.match(r"(.*?)(\d{6}N\s+.*)",item)
                        if m:
                            name,coords=m.groups()
                            name=name.strip()
                        else:
                            name=item.strip()
                            coords=None
                        if name:
                            lastname=name
                        if coords:
                            subspacelines.setdefault(lastname,[]).append(coords)
                        assert lastname
                    lastname=None
                    #print "Spaces:",subspacelines
                    #print "ICAO",ad['icao']
                    #altlines=page.get_lines(page.get_partially_in_rect(vertitem.x2+1,vertitem.y1,100,airspaceclass.y1-0.2))
                    #print "Altlines:",altlines
                    subspacealts=dict()
                    subspacekeys=subspacelines.keys()
                    allaltlines=" ".join(page.get_lines(page.get_partially_in_rect(vertitem.x1+0.5,vertitem.y1+0.5,100,airspaceclass.y1-0.2)))
                    single_vertlim=False
                    totalts=list(mapper.parse_all_alts(allaltlines))
                    #print "totalts:",totalts
                    # Exactly two altitudes overall => one shared limit pair.
                    if len(totalts)==2:
                        single_vertlim=True
                    for subspacename in subspacekeys:
                        ceil=None
                        floor=None
                        subnames=[subspacename]
                        if subspacename.split(" ")[-1].strip() in ["TIA","TIZ","CTR","CTR/TIZ"]:
                            subnames.append(subspacename.split(" ")[-1].strip())
                        #print "Parsing alts for ",subspacename,subnames
                        # StopIteration is (ab)used as a multi-level break
                        # once a ceiling/floor pair is found.
                        try:
                            for nametry in subnames:
                                if single_vertlim:
                                    #there's only one subspace, parse all of vertical limits field for this single one.
                                    items=[vertitem]
                                else:
                                    items=page.get_by_regex_in_rect(nametry,vertitem.x2+1,vertitem.y1,100,airspaceclass.y1-0.2)
                                for item in items:
                                    alts=[]
                                    for line in page.get_lines(page.get_partially_in_rect(item.x1+0.5,item.y1+0.5,100,airspaceclass.y1-0.2)):
                                        #print "Parsing:",line
                                        line=line.replace(nametry,"").lower().strip()
                                        parsed=list(mapper.parse_all_alts(line))
                                        if len(parsed):
                                            alts.append(mapper.altformat(*parsed[0]))
                                        if len(alts)==2:
                                            break
                                    if alts:
                                        #print "alts:",alts
                                        ceil,floor=alts
                                        raise StopIteration
                        except StopIteration:
                            pass
                        assert ceil and floor
                        subspacealts[subspacename]=dict(ceil=ceil,floor=floor)
                    spaces=[]
                    for spacename in subspacelines.keys():
                        altspacename=spacename
                        #print "Altspacename: %s, subspacesalts: %s"%(altspacename,subspacealts)
                        space=dict(
                            name=spacename,
                            ceil=subspacealts[altspacename]['ceil'],
                            floor=subspacealts[altspacename]['floor'],
                            points=parse_coord_str(" ".join(subspacelines[spacename])),
                            freqs=list(set(freqs))
                            )
                        if True:
                            # Sanity-check the polygon area in mercator units.
                            vs=[]
                            for p in space['points']:
                                x,y=mapper.latlon2merc(mapper.from_str(p),13)
                                vs.append(Vertex(int(x),int(y)))
                            p=Polygon(vvector(vs))
                            if p.calc_area()<=30*30:
                                pass#print space
                                pass#print "Area:",p.calc_area()
                            assert p.calc_area()>30*30
                            #print "Area: %f"%(p.calc_area(),)
                        spaces.append(space)
                        #print space
                    ad['spaces']=spaces
                    found=True
                if found:
                    break
            assert found
            ad['runways']=rwy_constructor.get_rwys(thrs)
    #Now find any ATS-airspace
    chartblobnames=[]
    for ad in ads:
        icao=ad['icao']
        if icao in big_ad:
            parse_landing_chart.help_plc(ad,"/AIP/AD/AD 2/%s/ES_AD_2_%s_2-1_en.pdf"%(icao,icao),
                                         icao,ad['pos'],"se",variant="")
            parse_landing_chart.help_plc(ad,"/AIP/AD/AD 2/%s/ES_AD_2_%s_6-1_en.pdf"%(icao,icao),
                                         icao,ad['pos'],"se",variant="vac")
            parse_landing_chart.help_plc(ad,"/AIP/AD/AD 2/%s/ES_AD_2_%s_2-3_en.pdf"%(icao,icao),
                                         icao,ad['pos'],"se",variant="parking")
            #aip_text_documents.help_parse_doc(ad,"/AIP/AD/AD 2/%s/ES_AD_2_%s_6_1_en.pdf"%(icao,icao),
            #                                  icao,"se",title="General Information",category="general")
            aip_text_documents.help_parse_doc(ad,"/AIP/AD/AD 2/%s/ES_AD_2_%s_en.pdf"%(icao,icao),
                                              icao,"se",title="General Information",category="general")
    #if purge:
    #    parse_landing_chart.purge_old(chartblobnames,country="se")
    #sys.exit(1)
    for extra in extra_airfields.extra_airfields:
        if filtericao(extra):
            ads.append(extra)
    print
    print
    for k,v in sorted(points.items()):
        print k,v,mapper.format_lfv(*mapper.from_str(v['pos']))
    #print "Num points:",len(points)
    origads=list(ads)
    # Merge flygkartan.csv: match against known airfields by distance
    # (<120 mercator units at zoom 13), otherwise create a new ZZZZ entry.
    for flygkartan_id,name,lat,lon,dummy in csv.reader(open("fplan/extract/flygkartan.csv"),delimiter=";"):
        found=None
        lat=float(lat)
        lon=float(lon)
        if type(name)==str:
            name=unicode(name,'utf8')
        mercf=mapper.latlon2merc((lat,lon),13)
        for a in origads:
            merca=mapper.latlon2merc(mapper.from_str(a['pos']),13)
            dist=math.sqrt((merca[0]-mercf[0])**2+(merca[1]-mercf[1])**2)
            if dist<120:
                found=a
                break
        if found:
            found['flygkartan_id']=flygkartan_id
        else:
            d=dict(
                icao='ZZZZ',
                name=name,
                pos=mapper.to_str((lat,lon)),
                elev=int(get_terrain_elev((lat,lon))),
                flygkartan_id=flygkartan_id)
            if filtericao(d):
                ads.append(d)
    minor_ad_charts=extra_airfields.minor_ad_charts
    for ad in ads:
        if ad['name'].count(u"Långtora"):
            # Hard-coded position fix for this particular field.
            ad['pos']=mapper.to_str(mapper.from_aviation_format("5944.83N01708.20E"))
        if ad['name'] in minor_ad_charts:
            charturl=minor_ad_charts[ad['name']]
            arp=ad['pos']
            if 'icao' in ad and ad['icao'].upper()!='ZZZZ':
                icao=ad['icao'].upper()
            else:
                icao=ad['fake_icao']
            parse_landing_chart.help_plc(ad,charturl,icao,arp,country='raw',variant="landing")
            """
            assert icao!=None
            lc=parse_landing_chart.parse_landing_chart(
                charturl,
                icao=icao,
                arppos=arp,country="raw")
            assert lc
            if lc:
                ad['adcharturl']=lc['url']
                ad['adchart']=lc
            """
    #print ads
    for ad in ads:
        print "%s: %s - %s (%s ft) (%s)"%(ad['icao'],ad['name'],ad['pos'],ad['elev'],ad.get('flygkartan_id','inte i flygkartan'))
        for space in ad.get('spaces',[]):
            for freq in space.get('freqs',[]):
                print "   ",freq
        #if 'spaces' in ad:
        #    print "   spaces: %s"%(ad['spaces'],)
        #if 'aiptext' in ad:
        #    print "Aip texts:",ad['aiptext']
        #else:
        #    print "No aiptext"
    print "Points:"
    for point in sorted(points.values(),key=lambda x:x['name']):
        print point
    # Regression files: a checksum summary and a full dump per airfield.
    f=codecs.open("extract_airfields.regress.txt","w",'utf8')
    for ad in ads:
        r=repr(ad)
        d=md5.md5(r).hexdigest()
        f.write("%s - %s - %s\n"%(ad['icao'],ad['name'],d))
    f.close()
    f=codecs.open("extract_airfields.regress-details.txt","w",'utf8')
    for ad in ads:
        r=repr(ad)
        f.write(u"%s - %s - %s\n"%(ad['icao'],ad['name'],r))
    f.close()
    return ads,points.values()
def ev_parse_x(url):
    """Parse Latvian restricted/prohibited/TSA/TRA/ATZ areas from an eAIP
    HTML table at `url`.

    Returns a list of airspace dicts of type "R". Areas lying entirely at
    or above 9500 ft are skipped.
    NOTE(review): `got_fir` is set but never used in the visible code.
    """
    out = []
    parser = lxml.html.HTMLParser()
    data, date = fetchdata.getdata(url, country="ev")
    parser.feed(data)
    tree = parser.close()
    got_fir = False
    for table in tree.xpath("//table"):
        # print "Table with %d children"%(len(table.getchildren()),)
        rows = list(table.xpath(".//tr"))
        # for idx,col in enumerate(cols):
        #     print "Col %d, %s"%(idx,alltext(col)[:10])
        headingcols = rows[0].xpath(".//th")
        if len(headingcols) == 0:
            continue
        name, alt = headingcols[0:2]
        # Skip unrelated wide tables (e.g. QNH tables).
        if alltext(name).count("QNH") and len(headingcols) > 6:
            continue
        print alltext(name)
        assert alltext(name).lower().count("name") or alltext(name).lower().count("lateral")
        print alltext(alt)
        assert alltext(alt).lower().count("limit")
        for row in rows[1:]:
            cols = list(row.xpath(".//td"))
            if len(cols) < 2:
                continue
            name, alt = cols[:2]
            lines = [x.strip() for x in alltext(name).split("\n") if x.strip()]
            if len(lines) == 0:
                continue
            assert len(lines)
            spacename = lines[0].strip()
            # Special-case a cell where description and name were merged.
            if spacename.strip() == "A circle radius 0,5 NM centered on 565705N 0240619E EVR2 RIGA":
                spacename = "EVR2 RIGA"
                lines = [spacename, lines[0][: -len(spacename)].strip()] + lines[1:]
            print spacename
            if spacename.strip() == "SKRIVERI":
                continue
            print "Spacename is:", spacename
            assert (
                spacename[:3] in ["EVR", "EVP", "TSA", "TRA"]
                or spacename.endswith("ATZ")
                or spacename.endswith("ATZ (MILITARY)")
            )
            altcand = []
            for altc in alltext(alt).split("\n"):
                if altc.count("Real-time"):
                    continue  # activation note, not an altitude
                altcand.append(altc.strip())
            print "Altcands:", altcand
            ceiling, floor = [x.strip() for x in " ".join(altcand).split("/")]
            ceiling = strangefix(ceiling)
            floor = strangefix(floor)
            mapper.parse_elev(ceiling)
            ifloor = mapper.parse_elev(floor)
            iceiling = mapper.parse_elev(ceiling)
            # High-altitude-only areas are irrelevant for this consumer.
            if ifloor >= 9500 and iceiling >= 9500:
                continue
            assert ifloor < iceiling
            freqs = []
            raw = " ".join(lines[1:])
            raw = re.sub(s(ur"Area bounded by lines successively joining the following points:"), "", raw)
            print "Raw:", raw
            coords = mapper.parse_coord_str(raw, context="latvia")
            for cleaned in clean_up_polygon(coords):
                out.append(
                    dict(
                        name=spacename,
                        points=cleaned,
                        type="R",
                        freqs=freqs,
                        floor=floor,
                        url=url,
                        date=date,
                        ceiling=ceiling,
                    )
                )
    return out
def parse_all_tma():
    """Parse all Swedish TMAs and ACC sectors from AIP ENR 2.1 and append
    the hard-coded SWEDEN FIR outline.

    NOTE(review): the fixgote() repair hook is defined but NOT passed to
    parse.Parser below -- confirm whether the Göteborg TMA fixup is
    intentionally disabled.
    Returns a list of airspace dicts.
    """
    def fixgote(raw):
        #Fix illogical composition of Göteborg TMA description. 2010 04 02
        did_replace = [0]
        def replacer(args):
            uprint(args.groups())
            y, x, w, h, font = args.groups()
            uprint(w, h)
            # Sanity-check that we matched the expected text element.
            assert int(w) >= 260 and int(w) < 420
            assert int(h) >= 6 and int(h) <= 15
            f = float(w) / 270.0
            x1 = x
            y1 = y
            w1 = 80
            h1 = h
            x2 = 168 * f
            y2 = y
            w2 = 150 * f
            h2 = h
            did_replace[0] += 1
            # Split the single text element into a label element and a
            # coordinate element so downstream parsing lines up.
            repl = """<text top="%s" left="%s" width="%s" height="%s" font="%s">Part of GÖTEBORG TMA</text>
<text top="%s" left="%s" width="%s" height="%s" font="%s">584558N 0122951E   -   584358N 0130950E - </text>""" % (
                y1, x1, w1, h1, font, y2, x2, w2, h2, font)
            uprint("\n======================================\nReplacement:\n", repl)
            return repl
        raw = re.sub(
            r"""<text top="(\d+)" left="(\d+)" width="(\d+)" height="(\d+)" font="(\d+)">\s*Part of GÖTEBORG TMA
584558N 0122951E   -   584358N 0130950E - </text>""",
            replacer, raw)
        assert did_replace[0] == 1
        return raw

    p = parse.Parser("/AIP/ENR/ENR 2/ES_ENR_2_1_en.pdf")
    res = []
    found = False
    last_sector = dict()
    for pagenr in xrange(0, p.get_num_pages()):
        page = p.parse_page_to_items(pagenr)
        #print "Num acc-sec:",len(page.get_by_regex(r".*ACC.sectors.*"))
        #print "Num and acc-sec:",len(page.get_by_regex(r".*and\s+ACC.sectors.*"))
        # A page is a sector page when it mentions "ACC sectors" but not
        # "and ACC sector" (the latter marks mixed TMA pages).
        sect = (len(page.get_by_regex(r".*ACC.sectors.*")) > 0
                and len(page.get_by_regex(r".*and\s+ACC.sector.*")) == 0)
        #print "ACC-sector2:",sect
        # Start parsing at the first TMA/sector page; keep going after that.
        if found or page.get_by_regex(r".*Terminal Control Areas.*") or sect:
            found = True
        else:
            continue
        #if sect:
        parsed = parse_page(p, pagenr, "TMA" if not sect else "sector",
                            last_sector=last_sector)
        res.extend(parsed)
    # FIR outline is not in the tables; appended from a hard-coded string.
    res.append(
        dict(name="SWEDEN FIR",
             icao="ESAA",
             floor='GND',
             ceiling='-',
             freqs=[],
             type='FIR',
             date=datetime(2011, 4, 9),
             points=mapper.parse_coord_str("""
    690336N 0203255E - Along the common X/Y state boundary to
    653148N 0240824E - 644100N 0225500E - 633700N 0213000E -
    632830N 0204000E - 631000N 0201000E - 614000N 0193000E -
    610000N 0191905E - 601803N 0190756E - 601130N 0190512E -
    593346N 0195859E - 591524N 0203239E - 590000N 0210000E -
    573410N 0200900E - 570000N 0195000E - 555100N 0173300E -
    545500N 0155200E - 545500N 0150807E -
    clockwise along an arc centred on 550404N 0144448E and with radius 16.2 NM -
    545500N 0142127E - 545500N 0125100E - 552012N 0123827E -
    Along the common X/Y state boundary to 561253N 0122205E -
    583000N 0103000E - 584540N 0103532E - 585332N 0103820E -
    Along the common X/Y state boundary to 690336N 0203255E
    """, context="sweden")))
    for pa in res:
        pretty(pa)
    return res
def parse_page(parser,pagenr,kind="TMA",last_sector=dict()):
    """Extract airspace rows (TMA, ACC sector or R/D area) from one AIP page.

    parser      -- project PDF parser
    pagenr      -- zero-based page index
    kind        -- "TMA", "sector" or "R"; selects the expected third column
    last_sector -- mutable dict carrying sector name/freq state between pages
                   NOTE(review): mutable default argument is shared across
                   calls; visible callers always pass it explicitly — confirm
                   whether the shared default is relied upon before changing.

    Returns a list of dicts with name/floor/ceiling/points/freqs/type.
    """
    # Column set depends on the table flavour being parsed.
    if kind=="TMA":
        thirdcols=["ATC unit","AFIS unit"]
    elif kind=="sector":
        thirdcols=["FREQ"]
    elif kind=="R":
        thirdcols=["Remarks (nature of hazard,"]
    else:
        raise Exception("Bad kind")
    page=parser.parse_page_to_items(pagenr)
    items=page.items
    #print "Items:",pitems
    #print "Possible Areas:"
    # --- 1. Locate the three column headings near the top of the page. ---
    headings=[]
    for item in items:
        if item.text==None:
            continue
        item.text=item.text.strip()
        if item.text=="":
            continue
        if item.text=="Name":
            continue
        if item.y1<25 and item.text in (["Lateral limits","Vertical limits"]+thirdcols):
            headings.append(item)
    headings.sort(key=lambda x:x.x1)
    #print "found candidates:",zone_candidates
    if len(headings)==0:
        return []
    avg_heading_y=sum(h.y1 for h in headings)/float(len(headings))
    uprint("Found headings:",headings)
    # --- 2. Collect candidate airspace-name items in the leftmost column,
    #        filtering out boilerplate, footers and remark fragments. ---
    zone_candidates=[]
    for item in items:
        if item.text==None or item.text.strip()=="":
            continue
        if item.text.strip().startswith("AMDT"):
            continue
        if item.text.strip().startswith("The LFV Group"):
            continue
        if re.match(ur"\s*LFV\s*AIRAC\s*AMDT\s*\d+/\d+\s*",item.text):
            continue
        if item.text.strip()=="LFV":
            continue
        if item.text.count('Terminal Information Areas'):
            continue
        if item.text.strip().startswith("AIRAC"):
            continue
        if kind=="R" and not is_r_or_danger_area_name(item.text.strip()):
            continue
        if item.y1>avg_heading_y+1 and item.x1<12 and not item.text in ["Name",'None',"LFV"]:
            # Known non-name text fragments that appear in the name column.
            if item.text.count("Established") or item.text.count(u'TROLLHÄTTAN TWR') or item.text.count(u'and/or SÅTENÄS') or item.text.count(u'TWR/TMC') or item.text.strip().endswith("TWR") or item.text.strip().endswith("TWR."):
                continue
            if item.text.count("operational hours") or item.text.count("See AIP DENMARK"):
                continue
            if item.text.count("hours of"):
                continue
            if item.text.count("Upper limit"):
                continue
            if item.text.count("that part") or item.text.count("coincides"):
                continue
            if item.text.count(u'Danger area EK D395 and') or item.text.count(u'D396 are situated within') or item.text.strip()=="TMA":
                continue
            if item.text.count(u'ÖSTGÖTA TMC is closed') or item.text.count(u'and SKAVSTA TWR is') or item.text.strip()=='open.':
                continue
            if item.text.count("SAT 0530"):
                continue
            if item.text.strip()=='OPS':
                continue
            if item.text.strip()==u'ÖSTGÖTA TMC:':
                continue
            if item.text.count(u'is open') or item.text.count('is closed'):
                continue
            if item.text.count('MON-FRI') or item.text.count('2150'):
                continue
            # Keep only candidates that have lateral-limit text next to them.
            lines2=page.get_lines(page.get_partially_in_rect(12,item.y1+0.2,40,item.y2-0.2))
            if len(lines2):
                zone_candidates.append(item)
    uprint("Found cands:",zone_candidates)
    zone_candidates.sort(key=lambda x:x.y1)
    for zone in zone_candidates:
        #uprint("Zone:",zone)
        #assert not zone.text.count("AOR")
        assert not zone.text.count("FIR")
    uprint("Headings:",headings)
    print "Pagenr:",pagenr
    assert len(headings)==3
    # --- 3. For each candidate, slice the page into the three columns and
    #        collect the text lines belonging to that row. ---
    ret=[]
    for i in xrange(len(zone_candidates)):
        d=dict()
        cand=zone_candidates[i]
        if i<len(zone_candidates)-1:
            nextcand=zone_candidates[i+1]
        else:
            nextcand=None
        y1=cand.y1-0.25
        y2=100
        if nextcand:
            y2=nextcand.y1-0.75
        for j in xrange(len(headings)):
            head=headings[j]
            if j<len(headings)-1:
                nexthead=headings[j+1]
            else:
                nexthead=None
            x1=head.x1
            x2=head.x2
            # The last column extends to the right edge of the page.
            if j==len(headings)-1:
                x1=headings[j-1].x2+3
                x2=100
            lines=page.get_lines(page.get_partially_in_rect(x1,y1,x2,y2,xsort=True,ysort=True))
            #print ("Parsed %s y,%d-%d, %s: <%s>\n\n"%(cand.text,y1,y2,head.text,lines)).encode('utf8')
            d[head.text]=lines
        if kind=="R":
            if y2==100:
                y2=y1+3
            d['name']=" ".join(x.strip() for x in filter_head_foot(page.get_lines(page.get_partially_in_rect(0,y1,10,y2,xsort=True,ysort=True))))
        else:
            d['name']=cand.text.strip()
        ret.append(d)
    # --- 4. Merge continuation rows (rows whose lateral limits spill over
    #        several table rows) into the preceding entry. ---
    allow_head=2
    print "Doing fixups--------------------------------------------------"
    tret=[]
    for x in ret:
        #print "Fixing up",x,"allow:",allow_head
        area="".join(x['Lateral limits']).strip()
        if allow_head==2 and area!="" and x['name'].strip()!="":
            allow_head=1
        if allow_head!=1:
            if len(tret):
                tret[-1]['Lateral limits']+=x['Lateral limits']
                tret[-1]['Vertical limits']+=x['Vertical limits']
        else:
            tret.append(x)
        if allow_head==1:
            allow_head=0
        # A lateral-limits string not ending in '-' terminates the area, so
        # the next non-empty row may start a new entry.
        if not area.endswith('-') and area!="":
            allow_head=2
        #print " Fixed up up",x
    ret=tret
    for line in ret:
        print "Fixed:",line['name']," = ",line['Lateral limits'],line['Vertical limits']
    # --- 5. Turn each merged row into an airspace dict. ---
    out=[]
    for d in ret:
        pa=dict()
        curname=d['name']
        if curname.count(u'Förteckning över'):
            continue
        print "D:",d
        arealines=[l for l in d['Lateral limits'] if l.strip()!=""]
        last_coord_idx=None
        #uprint("D:<%s> (area:%s)"%(d,arealines))
        if 'FREQ' in d:
            freqs=[("SWEDEN CONTROL",float(x)) for x in re.findall(r"\d{3}\.\d{3}","\n".join(d['FREQ']))]
            #print "Parsed freqs:",freqs
            if freqs:
                last_sector['freqs']=freqs
        if kind=='sector':
            # Remember the enclosing "ESxx ACC sector N" heading so that
            # subsequent sub-sector rows can be renamed consistently.
            m=re.match(r"ES[A-Z]{2}\s*ACC\s*sector\s*([0-9a-zA-Z]*)",d['name'])
            if m:
                last_sector['major']=d['name']
                last_sector['majorsector'],=m.groups()
            if len(arealines)==0:
                last_sector['name']=d['name']
                continue
        if d['name'].count("Control Area and Upper Control Area"):
            continue
        if d['name'].count("SUECIA CTA"):
            continue
        if d['name'].count("SUECIA UTA"):
            continue
        m=re.match(r"([0-9a-zA-Z]*)(:.*)",d['name'])
        if m and 'majorsector' in last_sector:
            sectorname,sub=m.groups()
            if sectorname==last_sector['majorsector']:
                d['name']=last_sector['major']+sub
                #uprint("Fixed up name: ",d['name'])
        #print "Arealines:",arealines
        assert len(arealines)
        if arealines[0].strip()=="Danger area EK D395 and D396 are":
            arealines=arealines[1:]
        if arealines[0].strip()=="situated within TMA":
            arealines=arealines[1:]
        if arealines==u'Förteckning över CTA / Lists of CTA' or arealines=='Lateral limits':
            continue
        # Coordinates end where an "Established ..."/"Danger area ..."/"LFV"
        # trailer begins.
        for idx in xrange(len(arealines)):
            if arealines[idx].lower().startswith("established"):
                last_coord_idx=idx
                pa['established']=" ".join(l for l in arealines[idx:])
                break
            if arealines[idx].lower().startswith("danger area"):
                last_coord_idx=idx
                break
            if arealines[idx].strip()=="LFV":
                last_coord_idx=idx
                break
        if last_coord_idx==None:
            last_coord_idx=len(arealines)
        #uprint("ARealines:",arealines)
        #uprint("Last coord:",arealines[last_coord_idx-1])
        if len(arealines)>last_coord_idx:
            # One known AIP quirk: a trailing '-' before an "Established"
            # trailer must be stripped.
            if arealines[last_coord_idx-1:last_coord_idx+1]==[u'571324N 0161129E -', u'Established during operational hours of']:
                arealines[last_coord_idx-1]=arealines[last_coord_idx-1].strip("-")
        #uprint("Last fixed:",arealines[last_coord_idx-1])
        assert not arealines[last_coord_idx-1].strip().endswith("-")
        #for idx in xrange(last_coord_idx-1):
        #    print "arealine: <%s>"%(arealines[idx].strip(),)
        #    assert arealines[idx].strip().endswith("-") or arealines[idx].strip().endswith("to")
        vertlim=u" ".join(d['Vertical limits'])
        if vertlim.strip()=="":
            #print "Object with no vertical limits: %s"%(repr(d['name']),)
            continue
        if d['name']=='Control Area':
            continue
        uprint("Vertlim: ",vertlim)
        # Accepts "FL095", "1500 ft ... GND", bare "GND" or "UNL".
        heightst=re.findall(r"(FL\s*\d{3})|(\d+\s*ft\s*(?:\s*/\s*\d+\s*.\s*GND)?(?:\s*GND)?)|(GND)|(UNL)",vertlim)
        uprint("Height candidates:",heightst)
        heights=[]
        for fl,ht,gnd,unl in heightst:
            if fl:
                heights.append(fl)
            if ht:
                heights.append(ht.strip())
            if gnd:
                heights.append(gnd.strip())
            if unl:
                heights.append(unl.strip())
        uprint("heights for ",d['name'],":",repr(heights))
        # Hard-coded fallbacks for the Göteborg TMA rows broken in the AIP.
        if len(heights)==0 and d['name']==u'GÖTEBORG TMA':
            heights=['GND','FL95']
        if len(heights)==1 and d['name']==u'Göteborg TMA':
            heights=['4500','FL95']
        assert len(heights)==2
        ceiling=heights[0].strip()
        floor=heights[1].strip()
        pa['name']=d['name']
        pa['floor']=floor
        pa['ceiling']=ceiling
        # Airspace entirely above 9500 ft is out of scope for this tool.
        if mapper.parse_elev(floor)>=9500:
            continue
        #uprint("Arealines:\n================\n%s\n============\n"%(arealines[:last_coord_idx]))
        #print pa
        areacoords=" ".join(arealines[:last_coord_idx])
        pa['points']=parse_coord_str(areacoords)
        # Sanity check: the polygon must have a non-trivial area at zoom 13.
        vs=[]
        for p in pa['points']:
            #print "from_str:",repr(p)
            x,y=mapper.latlon2merc(mapper.from_str(p),13)
            vs.append(Vertex(int(x),int(y)))
        p=Polygon(vvector(vs))
        if p.calc_area()<=30*30:
            pass#print pa
            #print "Area:",p.calc_area()
        assert p.calc_area()>30*30
        #print "Area: %f"%(p.calc_area(),)
        #print "Point-counts:",len(pa['points'])
        for p in pa['points']:
            assert p.count(",")==1
        pa['type']=kind
        # Pick whichever third column is present on this page.
        for thirdcol in thirdcols:
            if thirdcol in d:
                atc=d[thirdcol]
                break
        else:
            raise Exception("missing thirdcol")
        #print "ATc: <%s>"%(repr(atc),)
        freqs=[(y,float(x)) for x,y in re.findall(r"(\d{3}\.\d{3})\s*MHz\n(.*)","\n".join(atc))]
        if not freqs:
            freqs=last_sector.get('freqs',[])
        #print repr(freqs)
        pa['freqs']=freqs
        #uprint("Cleaning up ",pa['name'])
        # clean_up_polygon may split one raw polygon into several cleaned ones;
        # emit one output dict per cleaned polygon.
        for cleaned in clean_up_polygon(list(pa['points'])):
            d=dict(pa)
            #print "cleaned",cleaned
            for i,tup in enumerate(cleaned):
                assert type(tup)==str
                latlon=mapper.from_str(tup)
                lat,lon=latlon
                assert lat>=-85 and lat<=85
            d['points']=cleaned
            #uprint("cleaned:",pa['name'],len(cleaned),cleaned)
            #print "name:",d['name']
            #print "cleaned points:",d['points']
            #print "from:",areacoords
            #raise Exception()
            out.append(d)
        #if pa['name'].lower().count("esrange"):
        #    print "Exit esrange"
        #    sys.exit(1)
    return out
def fi_parse_tma():
    """Parse Finnish TMA and ATS airspaces from eAIP ENR 2.1 (PDF).

    ATS areas are geometrically cut (polygon subtraction) against every TMA
    so they do not overlap; the FINLAND FIR boundary is appended last.
    Returns a list of airspace dicts.
    """
    p = parse.Parser(r"/ais/eaip/pdf/enr/EF_ENR_2_1_EN.pdf", fixuphref, country='fi')
    res = []
    atsres = []
    # Airspace tables start on page 4 of this document.
    for pagenr in xrange(4, p.get_num_pages()):
        parsed, atsparsed = parse_page(p, pagenr)  #pagenr)
        res.extend(parsed)
        atsres.extend(atsparsed)
        #break
    print "Len ouf out ", len(res)
    atsout = []
    for space in atsres:
        #print "bef cut:",space['points']
        # Start with the raw ATS polygon, then subtract each TMA from it.
        mypolys = [makepoly.poly(space['points'])]
        for tmaitem in res:
            if tmaitem['type'] != 'TMA':
                continue
            outmypolys = []
            assert len(mypolys) >= 1
            for mypoly in list(mypolys):
                tmapoly = makepoly.poly(tmaitem['points'])
                #print mypoly
                #print tmapoly
                # Subtraction may split the polygon into several pieces.
                shape = mypoly.subtract(tmapoly)
                newpolys = shape.get_polys()
                if len(newpolys) > 1:
                    print "Length is:", len(newpolys)
                #print "Cutting"
                outmypolys.extend([shapemerge2d.Polygon(x) for x in list(newpolys)])
                #assert len(newpolys)==1
            if len(outmypolys) > 1:
                print "outmypolys:", outmypolys
            #print "Cut to:",mypoly
            mypolys = outmypolys
        # Convert each resulting polygon back to lat/lon strings and emit a
        # copy of the original space per piece.
        for mypoly in mypolys:
            t = []
            for mx, my in [(v.get_x(), v.get_y()) for v in mypoly.get_vertices()]:
                t.append(mapper.to_str(mapper.merc2latlon((mx, my), 13)))
            #print "Aft cut:",t
            newspace = dict(space)
            newspace['points'] = t
            atsout.append(newspace)
        if len(mypolys) > 1:
            print "Space was split into ", len(mypolys), "parts"
    res.extend(atsout)
    # Hard-coded FIR boundary; "X/Y state boundary" phrases resolved by the
    # 'finland' context in mapper.parse_coord_str.
    res.append(
        dict(name="FINLAND FIR",
             icao="EFIN",
             floor='GND',
             ceiling='-',
             freqs=[],
             type='FIR',
             date=datetime(2011, 4, 9),
             points=mapper.parse_coord_str(
                 """ 601130N 0190512E - 601803N 0190756E - 610000N 0191905E - 614000N 0193000E - 631000N 0201000E - 632830N 0204000E - 633700N 0213000E - 644100N 0225500E - 653148N 0240824E - Along the common X/Y state boundary to 690336N 0203255E - Along the common X/Y state boundary to 690307N 0285545E - Along the common X/Y state boundary to 601201N 0271735E - 600800N 0263300E - 595830N 0260642E - 595300N 0255200E - 595430N 0252000E - 595300N 0245100E - 590000N 0210000E - 591524N 0203239E - 593346N 0195859E - 601130N 0190512E """, context="finland")))
    #for pa in res:
    #    pretty(pa)
    return res
def ee_parse_gen_r2(url):
    """Parse Estonian restricted ('R') areas from an eAIP HTML table at *url*.

    Each qualifying table row yields one airspace dict; rows whose floor is
    at or above 9500 ft are skipped. Returns the list of dicts.
    """
    spaces = []
    parser = lxml.html.HTMLParser()
    data, date = fetchdata.getdata(url, country='ee')
    parser.feed(data)
    tree = parser.close()
    print "Parsed tree"
    for tab in tree.xpath(".//table"):
        print "Found table"
        for idx, cand in enumerate(tab.xpath(".//tr")):
            if len(cand.getchildren()) < 3:
                continue
            space = dict()
            #print list(cand.getchildren())
            # First three cells: identification/limits, vertical limits, remarks.
            what, vert, remark = list(cand.getchildren())[0:3]
            whattxt = alltext(what).replace(u"–", "-").replace(u"\xa0", " ")
            verttxt = alltext(vert)
            # Repeatedly strip parenthesised remarks until none remain.
            while True:
                w = re.sub(ur"\(.*?\)", "", whattxt)
                if w != whattxt:
                    whattxt = w
                    continue
                break
            #print idx,whattxt
            # The first three rows are headers; sanity-check and skip them.
            if idx < 3:
                if idx == 1:
                    assert (whattxt.count("Identification") or whattxt.count("ateral limits"))
                if idx == 2:
                    assert whattxt.strip() == "1"
                continue
            verttxt = verttxt.replace(u"\xa0", u" ")
            vertlines = [x for x in verttxt.split("\n") if x.strip()]
            # Fall back to space-splitting when the cell came through as one line.
            if len(vertlines) == 1:
                vertlines = [x for x in verttxt.split(" ") if x.strip()]
            print "Verlintes:", repr(vertlines)
            #print "wha------------------------ t",whattxt
            space['ceiling'], space['floor'] = vertlines[:2]
            mapper.parse_elev(space['ceiling'])
            ifloor = mapper.parse_elev(space['floor'])
            if ifloor >= 9500:
                continue
            # Re-join the lateral-limit lines into a single parseable string,
            # merging continuation lines and ensuring ' - ' separators.
            lines = whattxt.split("\n")
            out = []
            merged = ""
            for line in lines[1:]:
                line = line.strip().replace(u"–", "-")
                if line == "":
                    continue
                if line.endswith("point"):
                    out.append(line + " ")
                    continue
                if line.endswith("ircle with radius of") or line.endswith(",") or line.endswith("on") or line.endswith("radius"):
                    merged = " ".join([merged, line])
                    print "<---Merged:", merged
                    continue
                if merged:
                    line = " ".join([merged, line])
                merged = ""
                if not line.endswith("-"):
                    line = line + " -"
                out.append(line + "\n")
            space['name'] = lines[0].strip()
            w = "".join(out)
            print "Parsing:", w
            if space['name'].startswith('EER1 '):
                # EER1 uses a canned boundary plus the FIR polygon as context
                # (its description references the FIR border).
                w = ee_parse_tma2.eer1txt
                fir = mapper.parse_coord_str(ee_parse_tma2.firtxt, context='estonia')
                fir_context = [fir]
                space['points'] = mapper.parse_coord_str(w, fir_context=fir_context)
            else:
                space['points'] = mapper.parse_coord_str(w, context='estonia')
            space['type'] = 'R'
            space['date'] = date
            space['freqs'] = []
            space['url'] = fetchdata.getrawurl(url, 'ee')
            spaces.append(space)
    return spaces
def ee_parse_restrictions():
    """Parse Estonian restricted/danger areas from /ee_restricted_and_danger.pdf.

    Builds a list of 'R'-type airspace dicts plus a hard-coded EE TSA 1 entry.
    NOTE(review): as shown here the function builds 'spaces' but never returns
    it — possibly truncated in this chunk; confirm against upstream source.
    """
    spaces = []
    p = parse.Parser("/ee_restricted_and_danger.pdf", lambda x: x, country='ee')
    for pagenr in xrange(p.get_num_pages()):
        page = p.parse_page_to_items(pagenr)
        # Rows start at items matching "EER.../EED..."; trailing None marks
        # the end so the pairwise izip below covers the last row.
        raws = list(sorted(page.get_by_regex(ur"EE[RD]\d+\s+.*"), key=lambda x: x.y1)) + [None]
        if len(raws) > 1:
            # Leftmost x of any elevation-looking item = the vertical-limits column.
            elevs = page.get_by_regex(ur"\d+\s*FT\s*MSL|FL\s*\d+")
            assert elevs
            elevcol = min(elev.x1 for elev in elevs)
            assert elevcol != 100
            for cur, next in izip(raws[:-1], raws[1:]):
                #if cur.text.count("Tunnus, nimi ja sivurajad"): continue #not a real airspace
                space = dict()
                if next == None:
                    y2 = 100
                else:
                    y2 = next.y1 - 1.75
                name = cur.text.strip()
                space['name'] = name
                # Lateral-limit text sits between the name column and the
                # elevation column, within this row's y-range.
                areaspecprim = page.get_lines(page.get_partially_in_rect(cur.x1 + 0.01, cur.y2 + 0.05, elevcol - 2, y2), fudge=.25)
                #print "areaspecprim:\n","\n".join(areaspecprim)
                areaspec = []
                for area in areaspecprim:
                    print "area in ", area
                    area = area.replace(u"–", "-")
                    if len(areaspec) and area.strip() == "":
                        break
                    area = re.sub(ur"\w-$", "", area)
                    areaspec.append(area)
                #print "Y-interval:",cur.y1,y2,"next:",next
                #print "Name:",space['name']
                #print "areaspec:",areaspec
                inp = " ".join(areaspec)
                #print inp
                #raw_input()
                tpoints = mapper.parse_coord_str(inp, context='estonia')
                if name.startswith("EER1"):
                    # EER1 follows the sea border: splice the fixed sea-border
                    # points into the parsed boundary between the two points
                    # closest to its endpoints.
                    tseaborder = "592842N 0280054E - 593814N 0273721E - 593953N 0265728E - 594513N 0264327E"
                    seapoints = mapper.parse_coord_str(tseaborder)
                    cont = None
                    points = []

                    def close(a, b):
                        # True when points a and b are within 1.0 distance unit.
                        bearing, dist = mapper.bearing_and_distance(mapper.from_str(a), mapper.from_str(b))
                        #print (a,b),dist
                        return dist < 1.0
                    for idx, point in enumerate(tpoints):
                        points.append(point)
                        if close(point, seapoints[0]):
                            print "WAS CLOSE", point, seapoints[0]
                            points.extend(seapoints[1:-1])
                            for idx2, point in enumerate(tpoints[idx + 1:]):
                                if close(point, seapoints[-1]):
                                    points.extend(tpoints[idx + 1 + idx2:])
                                    break
                            else:
                                raise Exception("Couldn't find seaborder end")
                            break
                    else:
                        raise Exception("Couldn't find seaborder")
                else:
                    points = tpoints
                space['points'] = points
                # Vertical limits: ceiling line above floor line in the
                # elevation column.
                vertitems = page.get_partially_in_rect(elevcol, cur.y1 + 0.05, elevcol + 8, y2 + 1.5)
                vertspec = []
                for v in page.get_lines(vertitems):
                    if v.strip() == "":
                        continue
                    if v.strip().count("Lennuliiklusteeninduse AS"):
                        continue
                    vertspec.append(v.strip())
                print "vertspec:", vertspec
                assert len(vertspec) == 2
                ceiling, floor = vertspec
                # Skip airspace lying entirely at/above 9500 ft.
                if mapper.parse_elev(floor) >= 9500 and mapper.parse_elev(ceiling) >= 9500:
                    continue
                space['ceiling'] = ceiling
                space['floor'] = floor
                space['type'] = 'R'
                space['freqs'] = []
                spaces.append(space)
    # Hard-coded TSA that is not present in the PDF tables.
    spaces.append(
        dict(name="EE TSA 1",
             ceiling="UNL",
             floor="5000 FT GND",
             points=mapper.parse_coord_str(u""" 594500N 0255000E – 594500N 0261800E – 592100N 0265800E – 591200N 0261200E – 591600N 0255400E – 594500N 0255000E"""),
             type="TSA",
             date=datetime(2011, 03, 25),
             freqs=[]))
#print "freqname Matched:",line fname, = g.groups() fname = fname.strip() break if not fname: raise Exception("Found no frequency name for freq: " + freq) freqs.append((fname, float(freq))) if len(freqs): break (ceiling, ceilingy), (floor, floory) = verts assert ceilingy < floory assert floory - ceilingy < 5.0 uprint("Analyzing area for %s" % (name, )) assert "".join(areaspec).strip() != "" area = mapper.parse_coord_str("".join(areaspec), context='estonia') uprint("Done analyzing %s" % (name, )) #print area if name.count("CTA") and name.count("TMA") == 0: type_ = "CTA" else: type_ = "TMA" if re.match(ur"\s*TALLINN\s*TMA\s*1\s*", name): out.append( dict(name="TALLIN TMA 2", floor='1700 ft MSL', ceiling='3500 ft MSL', freqs=freqs, type='TMA', points=mapper.parse_coord_str("""
def ee_parse_airfield(icao=None):
    """Parse one Estonian airfield PDF ("/ee_<icao>.pdf").

    Extracts name, elevation, ARP position, runway thresholds and TWR/ATIS
    frequencies. The ATS-airspace section is present but disabled ('if 0').
    NOTE(review): no return statement is visible in this chunk — possibly
    truncated; confirm against upstream source.
    """
    spaces = []
    ad = dict()
    ad["icao"] = icao
    sigpoints = []
    p = parse.Parser("/ee_%s.pdf" % (icao,), lambda x: x, country="ee")
    page = p.parse_page_to_items(0)
    print icao
    # Airfield name appears as "<ICAO> - NAME" in a large font.
    nameregex = ur".*%s\s*[-−]\s*([A-ZÅÄÖ\- ]{3,})" % (icao,)
    for item in page.get_by_regex(nameregex):
        print "fontsize:", item.fontsize
        assert item.fontsize >= 10
        ad["name"] = re.match(nameregex, item.text).groups()[0].strip()
        break
    else:
        raise Exception("Found no airfield name!")
    # Elevation row: "Kõrgus merepinnast" (Estonian: height above sea level).
    for item in page.get_by_regex(ur".*Kõrgus merepinnast.*"):
        lines = page.get_lines(page.get_partially_in_rect(0, item.y1 + 0.1, 100, item.y2 - 0.1))
        for line in lines:
            ft, = re.match(".*?([\d\.]+)\s*FT\.*", line).groups()
            assert not "elev" in ad
            print "parsed ft:", ft
            ad["elev"] = float(ft)
    # ARP (aerodrome reference point) coordinates.
    for item in page.get_by_regex(ur"ARP koordinaadid"):
        lines = page.get_lines(page.get_partially_in_rect(item.x1, item.y1, 100, item.y2))
        for line in lines:
            print line
            for crd in mapper.parsecoords(line):
                assert not ("pos" in ad)
                ad["pos"] = crd
        break
    else:
        raise Exception("No coords")
    ad["runways"] = []
    thrs = []
    freqs = []
    for pagenr in xrange(p.get_num_pages()):
        page = p.parse_page_to_items(pagenr)
        print "Parsing page", pagenr
        # --- Runway thresholds from the physical-characteristics table. ---
        for item in page.get_by_regex("\s*RUNWAY\s*PHYSICAL\s*CHARACTERISTICS\s*"):
            print "Phys char"
            coords, = page.get_by_regex_in_rect("RWY end coordinates", 0, item.y2, 100, 100)
            design, = page.get_by_regex_in_rect("Designations", 0, item.y2, 100, 100)
            lines = page.get_lines(page.get_partially_in_rect(0, design.y2, design.x2, 100))
            print "Design", lines
            rwys = []
            for line in lines:
                m = re.match("(\d{2})", line)
                if m:
                    print "rwynum", line
                    rwys.append((m.groups()[0], line.y1))
            # Sentinel so the pairwise loop below covers the last runway.
            rwys.append((None, 100))
            for (rwy, y), (nextrwy, nexty) in izip(rwys, rwys[1:]):
                lines = page.get_lines(page.get_partially_in_rect(coords.x1, y, coords.x2, nexty - 0.5))
                lines = [line for line in lines if line.strip()]
                print "Lines for rwy", lines
                # Expected order: THR lat, THR lon, end lat, end lon, geoid undulation.
                thrlat, thrlon, endlat, endlon, undulation = lines[:5]
                assert undulation.count("GUND")
                thrs.append(dict(pos=mapper.parse_coords(thrlat, thrlon), thr=rwy))
        print thrs
        # --- ATS airspace parsing: intentionally disabled. ---
        if 0:
            for item in page.get_by_regex("ATS AIRSPACE"):
                lines = iter(page.get_lines(page.get_partially_in_rect(0, item.y2 + 0.1, 100, 100)))
                spaces = []
                while True:
                    line = lines.next()
                    # print "Read line:",line
                    if line.count("Vertical limits"):
                        break
                    m = re.match(ur".*?/\s+Designation and lateral limits\s*(.*\b(?:CTR|FIZ)\b.*?)\s*:?\s*$", line)
                    if not m:
                        m = re.match(ur"\s*(.*\b(?:CTR|FIZ)\b.*?)\s*:", line)
                        # print "Second try:",m
                    spacename, = m.groups()
                    # print "Got spacename:",spacename
                    assert spacename.strip() != ""
                    coords = []
                    while True:
                        line = lines.next()
                        # print "Further:",line
                        if line.count("Vertical limits"):
                            break
                        if not re.search(ur"[\d ]+N\s*[\d ]+E", line) and not re.search(ur"circle|cent[red]{1,5}|pitkin|point", line):
                            break
                        coords.append(line)
                    areaspec = "".join(coords)

                    def fixup(m):
                        # Collapse internal spaces inside lat/lon groups.
                        lat, lon = m.groups()
                        return lat.replace(" ", "") + " " + lon.replace(" ", "")
                    areaspec = re.sub(ur"([\d ]+N)\s*([\d ]+E)", fixup, areaspec)
                    # print "Fixed areaspec",areaspec
                    # if icao=="EFKS":
                    #     areaspec=areaspec.replace("6615 28N","661528N")
                    # Error! REstriction areas!
                    spaces.append(dict(name=spacename, type="CTR", points=mapper.parse_coord_str(areaspec)))
                    if line.count("Vertical limits"):
                        # print "Breaking"
                        break
                while not line.count("Vertical limits"):
                    line = lines.next()
                # print "Matching veritcal limits--------------------------------"
                # Split "A/B"-named spaces into two separate entries.
                oldspaces = spaces
                spaces = []
                for space in oldspaces:
                    if space["name"].count("/"):
                        a, b = space["name"].split("/")
                        spaces.append(dict(space, name=a.strip()))
                        spaces.append(dict(space, name=b.strip()))
                    else:
                        spaces.append(space)
                missing = set([space["name"] for space in spaces])
                # Scan forward until every space has floor/ceiling assigned.
                while True:
                    for space in spaces:
                        # print "Matching ",space['name']," to ",line,"missing:",missing
                        for it in xrange(2):
                            cand = space["name"]
                            if it == 1:
                                # Second attempt: match on the bare CTR/FIZ keyword.
                                if cand.count("CTR"):
                                    cand = "CTR"
                                if cand.count("FIZ"):
                                    cand = "FIZ"
                            m = re.match(ur".*%s\s*:([^,:-]*)\s*-\s*([^,:-]*)" % (cand,), line)
                            if m:
                                break
                        if len(spaces) == 1 and not m:
                            m = re.match(ur".*Vertical limits\s*(.*)\s*-\s*(.*)", line)
                        if m:
                            for lim in m.groups():
                                assert lim.count(",") == 0
                            space["floor"], space["ceiling"] = m.groups()
                            missing.remove(space["name"])
                        # print "Missing:"
                        if len(missing) == 0:
                            break
                    if len(missing) == 0:
                        break
                    line = lines.next()
        print "Parse f o n page", pagenr
        # --- TWR / ATIS frequencies from the communications table. ---
        for item2 in page.get_by_regex(ur".*ATS\s*COMMUNICATION\s*FACILITIES.*"):
            lines = page.get_lines(page.get_partially_in_rect(0, item2.y2 + 0.1, 100, 100))
            for line in lines:
                if line.count("RADIO NAVIGATION AND LANDING AIDS"):
                    break
                print "Comm line:", line
                twr = re.match(ur"TWR.*(\d{3}\.\d{3})\b.*", line)
                if twr:
                    freqs.append(("TWR", float(twr.groups()[0])))
                atis = re.match(ur"ATIS.*(\d{3}\.\d{3})", line)
                if atis:
                    freqs.append(("ATIS", float(atis.groups()[0])))
def ee_parse_tma2(): spaces = [] airac_date = get_airac_date() url = "/%s/html/eAIP/EE-ENR-2.1-en-GB.html" % (airac_date, ) parser = lxml.html.HTMLParser() data, date = fetchdata.getdata(url, country='ee') parser.feed(data) tree = parser.close() icaos = [] def nested(tab): if tab == None: return False if tab.getparent() is None: return False #print dir(tab) if tab.tag == 'table': return True return nested(tab.getparent()) for tab in tree.xpath(".//table"): print "table alltext:", alltext(tab) if nested(tab.getparent()): continue firsttr = tab.xpath(".//tr")[0] ntext = alltext(firsttr) print "firsttr", firsttr print "ntext", ntext if re.match(ur".*FIR\s*/\s*CTA.*", ntext): print "Matches Tallin FIR" name = 'TALLIN FIR' points = mapper.parse_coord_str(firtxt, context='estonia') floor, ceiling = "GND", "FL195" space = {} space['name'] = name space['points'] = points space['floor'] = floor space['ceiling'] = ceiling space['freqs'] = [] space['icao'] = 'EETT' space['type'] = 'FIR' space['date'] = date space['url'] = fetchdata.getrawurl(url, 'ee') spaces.append(space) continue else: name = ntext.strip() space = dict(name=name) print "Name", name assert space['name'].count("TMA") \ or space['name'].count("FIR") if space['name'].count("FIR"): type = 'FIR' else: type = "TMA" freqs = [] points = None floor = None ceiling = None for cand in tab.xpath(".//tr"): if len(cand.getchildren()) != 2: continue nom, what = cand.getchildren() whattxt = alltext(what) nomtxt = alltext(nom) print "nomtxt", nomtxt, "space name", space['name'] if nomtxt.count("Lateral limits"): if space['name'].count("TALLINN TMA 2"): points = mapper.parse_coord_str(""" A circle with radius 20 NM centred on 592448N 0244957E """) else: whattxt = whattxt.replace( "then along the territory dividing line between Estonia and Russia to", "- Along the common Estonian/X state boundary to ") print "Fixed up", whattxt points = mapper.parse_coord_str(whattxt, context='estonia') if nomtxt.count("Vertical limits"): 
floor, ceiling = whattxt.split(" to ") if nomtxt.count("Call sign"): callsign = whattxt.split("\n")[0] if nomtxt.count("freq"): freqs.extend(re.findall(ur"\d+\.\d+\s*MHz")) assert points and floor and ceiling space['points'] = points space['type'] = type space['floor'] = floor space['ceiling'] = ceiling space['freqs'] = [] space['type'] = type space['date'] = date space['url'] = fetchdata.getrawurl(url, 'ee') for freq in freqs: space['freqs'].append((callsign, freq)) spaces.append(space)
def parse_page(parser,pagenr):
    """Extract TMA/CTA airspaces from one page of the Estonian ENR PDF.

    Finds major ("<NAME> TMA"/"MIL CTA") and minor ("TMA .../MIL CTA ...")
    headings, then walks coordinate strings downward, attaching each area to
    the nearest heading above it.
    NOTE(review): this chunk ends inside the main while-loop (only the
    TALLINN TMA 1 special case appends to 'out'; no generic append and no
    return are visible) — likely truncated; confirm against upstream source.
    """
    page=parser.parse_page_to_items(pagenr)
    items=page.items
    minx=min([item.x1 for item in items])
    # --- Collect major and minor headings, sorted top-to-bottom. ---
    headings=[]
    majorre=ur"\s*([A-ZÅÄÖ ][A-ZÅÄÖ]{3,})\s+(?:TMA\s*\d*|MIL CTA)\s*(?:-.*)?$"
    minorre=ur"\s*(?:TMA|MIL CTA [SN]?)\s*[A-ZÅÄÖ ]*\s*"
    for item in page.get_by_regex(majorre):
        m,=re.match(majorre,item.text).groups()
        assert m!=None
        assert m.strip()!=""
        headings.append(('major',item.text.strip(),m,item))
    for item in page.get_by_regex(minorre):
        m=re.match(minorre,item.text).group()
        assert m!=None
        assert m.strip()!=""
        #print "Heading %d: %s"%(item.y1,m)
        headings.append(('minor',item.text.strip(),m,item))
    #print headings
    headings.sort(key=lambda x:x[3].y1)

    def findheadingfor(y,meta=None):
        # Nearest major (and optional minor) heading above 'y'; optionally
        # records their y-positions in 'meta'.
        minor=None
        major=None
        for (kind,full,name,item) in reversed(headings):
            if minor==None and kind=="minor" and item.y1<y:
                minor=name.strip()
                if meta!=None:
                    meta['minor_y']=item.y1
            if major==None and kind=="major" and item.y1<y:
                major=name.strip()
                fullname=full
                if meta!=None:
                    meta['major_y']=item.y1
                break
        assert major!=None and major.strip()!=""
        if minor!=None:
            return major+" "+minor
        return fullname
    cury=0
    coordstrs=page.get_by_regex(ur".*\d{6}N \d{7}E.*")
    out=[]
    while True:
        # Find the next coordinate string (left column) below 'cury'.
        found=False
        #print "Looking for coords, y= %d"%(cury,)
        for titem in coordstrs:
            #print "Considering coordstr: ",titem.y1
            if titem.y1<=cury:
                continue
            if titem.x1<40:
                item=titem
                found=True
                break
        if not found:
            break
        cury=item.y1
        headmeta=dict()
        name=findheadingfor(item.y1,headmeta)
        # --- Gather lateral-limit lines until a vertical limit or blank. ---
        areaspec=[]
        #print "Rect: ",0,cury,minx+35,100
        y1=cury
        lines=page.get_lines(page.get_partially_in_rect(0,cury,minx+25,100))
        for idx,line in enumerate(lines):
            if re.search(ur"FL \d+",line) or line.count("FT MSL"):
                vertidx=idx
                break
            #print "Line:",line.encode('utf8')
            if line.strip()=="":
                vertidx=idx
                break
            cury=max(cury,line.y2+0.5)
            line=line.replace(u"–","-")
            if not (line.endswith("-") or line.endswith(" ")):
                line+=" "
            areaspec.append(line)
        # --- Vertical limits: first two FL/FT matches after the area. ---
        verts=[]
        for idx in xrange(vertidx,len(lines)):
            #print "Looking for alt:",lines[idx],"y2:",lines[idx].y2
            m=re.search(ur"(FL\s+\d+)",lines[idx].strip())
            if m:
                verts.append((m.groups()[0],lines[idx].y1))
            m=re.search(ur"(\d+ FT (?:MSL|GND|SFC))",lines[idx].strip())
            if m:
                verts.append((m.groups()[0],lines[idx].y1))
            if len(verts)>=2:
                break
        y2=verts[-1][1]
        # --- Frequencies: first pass within this area's y-span, second pass
        #     widened up to the major heading. ---
        freqs=[]
        for attempt in xrange(2):
            for freqcand in page.get_by_regex(ur".*\d{3}\.\d{1,3}.*"):
                #print "headmeta:",headmeta
                #print "attempt:",attempt
                #print "freqy1:",freqcand.y1
                if freqcand.x1<30:
                    continue
                if attempt==0:
                    if freqcand.y1<y1:
                        continue
                else:
                    if 'major_y' in headmeta:
                        if freqcand.y1<headmeta['major_y']:
                            continue
                    else:
                        if freqcand.y1<y1:
                            continue
                if freqcand.y1>y2:
                    continue
                x,y=freqcand.x1,freqcand.y1
                freq,=re.match(ur".*(\d{3}\.\d{3}).*",freqcand.text).groups()
                # 121.500 is the emergency frequency, not a unit frequency.
                if freq=="121.500":
                    continue
                lines=page.get_lines(page.get_partially_in_rect(x-10,y-1,x-0.5,y+1.5))
                fname=None
                for line in reversed(lines):
                    g=re.match(ur".*\b(\w{3,}\s+(?:Approach|Tower)).*",line)
                    if g:
                        #print "freqname Matched:",line
                        fname,=g.groups()
                        fname=fname.strip()
                        break
                if not fname:
                    raise Exception("Found no frequency name for freq: "+freq)
                freqs.append((fname,float(freq)))
            if len(freqs):
                break
        (ceiling,ceilingy),(floor,floory)=verts
        assert ceilingy<floory
        assert floory-ceilingy<5.0
        uprint("Analyzing area for %s"%(name,))
        assert "".join(areaspec).strip()!=""
        area=mapper.parse_coord_str("".join(areaspec),context='estonia')
        uprint("Done analyzing %s"%(name,))
        #print area
        if name.count("CTA") and name.count("TMA")==0:
            type_="CTA"
        else:
            type_="TMA"
        if re.match(ur"\s*TALLINN\s*TMA\s*1\s*",name):
            # Known fix-up: TMA 1 in the PDF also implies the TMA 2 circle.
            out.append(dict(
                name="TALLIN TMA 2",
                floor='1700 ft MSL',
                ceiling='3500 ft MSL',
                freqs=freqs,
                type='TMA',
                points=mapper.parse_coord_str(""" A circle with radius 20 NM centred on 592448N 0244957E """)))
elev=elev, date=date, runways=rwy_constructor.get_rwys(thrs), pos=pos) if adcharturl: ad['adcharturl']=adcharturl if 'adcharts' in addummy: ad['adcharts']=addummy['adcharts'] aip_text_documents.help_parse_doc(ad,url, icao,"ev",title="General Information",category="general") ads.append(ad) spaces.append(dict( name=ctrname, points=mapper.parse_coord_str(ctrarea), ceiling=ceiling, type=type_, floor=floor, freqs=freqs, date=date, url=url )) spilve=dict( icao="EVRS", name="Spilve", elev=5, date=datetime(2011,04,05), pos=mapper.parsecoord("565931N 240428E") )
def extract_airfields(filtericao=lambda x: True, purge=True): # print getxml("/AIP/AD/AD 1/ES_AD_1_1_en.pdf") ads = [] p = Parser("/AIP/AD/AD 1/ES_AD_1_1_en.pdf") points = dict() startpage = None for pagenr in xrange(p.get_num_pages()): page = p.parse_page_to_items(pagenr) if page.count("Aerodrome directory"): startpage = pagenr break if startpage == None: raise Exception("Couldn't find aerodrome directory in file") # print "Startpage: %d"%(startpage,) # nochartf=open("nochart.txt","w") for pagenr in xrange(startpage, p.get_num_pages()): row_y = [] page = p.parse_page_to_items(pagenr) allines = [x for x in (page.get_lines(page.get_partially_in_rect(0, 0, 15, 100))) if x.strip()] for item, next in zip(allines, allines[1:] + [""]): # print "item:",item m = re.match(ur"^\s*[A-ZÅÄÖ]{3,}(?:/.*)?\b.*", item) if m: # print "Candidate, next is:",next if re.match(r"^\s*[A-Z]{4}\b.*", next): # print "Matched:",item # print "y1:",item.y1 row_y.append(item.y1) for y1, y2 in zip(row_y, row_y[1:] + [100.0]): # print "Extacting from y-range: %f-%f"%(y1,y2) items = list(page.get_partially_in_rect(0, y1 - 0.25, 5.0, y2 + 0.25, ysort=True)) if len(items) >= 2: # print "Extract items",items ad = dict(name=unicode(items[0].text).strip(), icao=unicode(items[1].text).strip()) # print "Icao:",ad['icao'] assert re.match(r"[A-Z]{4}", ad["icao"]) if not filtericao(ad): continue if len(items) >= 3: # print "Coord?:",items[2].text m = re.match(r".*(\d{6}N)\s*(\d{7}E).*", items[2].text) if m: lat, lon = m.groups() ad["pos"] = parse_coords(lat, lon) # print "Items3:",items[3:] elev = re.findall(r"(\d{1,5})\s*ft", " ".join(t.text for t in items[3:])) # print "Elev:",elev assert len(elev) == 1 ad["elev"] = int(elev[0]) ads.append(ad) big_ad = set() for ad in ads: if not ad.has_key("pos"): big_ad.add(ad["icao"]) for ad in ads: icao = ad["icao"] if icao in big_ad: if icao in ["ESIB", "ESNY", "ESCM", "ESPE"]: continue try: p = Parser("/AIP/AD/AD 2/%s/ES_AD_2_%s_6_1_en.pdf" % (icao, icao)) except: p 
= Parser("/AIP/AD/AD 2/%s/ES_AD_2_%s_6-1_en.pdf" % (icao, icao)) ad["aipvacurl"] = p.get_url() for pagenr in xrange(p.get_num_pages()): page = p.parse_page_to_items(pagenr) """ for altline in exitlines: m=re.match(r"(\w+)\s+(\d+N)\s*(\d+E.*)",altline) if not m: continue name,lat,lon=m.groups() try: coord=parse_coords(lat,lon) except Exception: continue points.append(dict(name=name,pos=coord)) """ for kind in xrange(2): if kind == 0: hits = page.get_by_regex(r"H[Oo][Ll][Dd][Ii][Nn][Gg]") kind = "holding point" if kind == 1: hits = page.get_by_regex(r"[Ee]ntry.*[Ee]xit.*point") kind = "entry/exit point" if len(hits) == 0: continue for holdingheading in hits: items = sorted( page.get_partially_in_rect( holdingheading.x1 + 2.0, holdingheading.y2 + 0.1, holdingheading.x1 + 0.5, 100 ), key=lambda x: x.y1, ) items = [x for x in items if not x.text.startswith(" ")] # print "Holding items:",items for idx, item in enumerate(items): print "Holding item", item y1 = item.y1 if idx == len(items) - 1: y2 = 100 else: y2 = items[idx + 1].y1 items2 = [ x for x in page.get_partially_in_rect(item.x1 + 1, y1 + 0.3, item.x1 + 40, y2 - 0.1) if x.x1 >= item.x1 - 0.25 and x.y1 >= y1 - 0.05 and x.y1 < y2 - 0.05 ] s = (" ".join(page.get_lines(items2))).strip() print "Holding lines:", repr(page.get_lines(items2)) # if s.startswith("ft Left/3"): #Special case for ESOK # s,=re.match("ft Left/3.*?([A-Z]{4,}.*)",s).groups() # m=re.match("ft Left/\d+.*?([A-Z]{4,}.*)",s) # if m: # s,=m.groups() if s.startswith("LjUNG"): # Really strange problem with ESCF s = s[0] + "J" + s[2:] if s.lower().startswith("holding"): sl = s.split(" ", 1) if len(sl) > 1: s = sl[1] s = s.strip() if kind == "entry/exit point" and s.startswith("HOLDING"): continue # reached HOLDING-part of VAC # Check for other headings # Fixup strange formatting of points in some holding items: (whitespace between coord and 'E') s = re.sub(ur"(\d+)\s*(N)\s*(\d+)\s*(E)", lambda x: "".join(x.groups()), s) m = 
re.match(r"([A-Z]{2,}).*?(\d+N)\s*(\d+E).*", s) if not m: m = re.match(r".*?(\d+N)\s*(\d+E).*", s) if not m: continue assert m lat, lon = m.groups() # skavsta if icao == "ESKN": if s.startswith(u"Hold north of T"): name = "NORTH" elif s.startswith(u"Hold south of B"): name = "SOUTH" else: assert 0 # add more specials here else: continue else: name, lat, lon = m.groups() try: coord = parse_coords(lat, lon) except Exception: print "Couldn't parse:", lat, lon continue # print name,lat,lon,mapper.format_lfv(*mapper.from_str(coord)) if name.count("REMARK") or len(name) <= 2: print "Suspicious name: ", name # sys.exit(1) continue points[icao + " " + name] = dict(name=icao + " " + name, icao=icao, pos=coord, kind=kind) # for point in points.items(): # print point # sys.exit(1) def fixhex11(s): out = [] for c in s: i = ord(c) if i >= 0x20: out.append(c) continue if i in [0x9, 0xA, 0xD]: out.append(c) continue out.append(" ") return "".join(out) for ad in ads: icao = ad["icao"] if icao in big_ad: # print "Parsing ",icao p = Parser("/AIP/AD/AD 2/%s/ES_AD_2_%s_en.pdf" % (icao, icao), loadhook=fixhex11) ad["aiptexturl"] = p.get_url() firstpage = p.parse_page_to_items(0) te = "\n".join(firstpage.get_all_lines()) # print te coords = re.findall(r"ARP.*(\d{6}N)\s*(\d{7}E)", te) if len(coords) > 1: raise Exception( "First page of airport info (%s) does not contain exactly ONE set of coordinates" % (icao,) ) if len(coords) == 0: print "Couldn't find coords for ", icao # print "Coords:",coords ad["pos"] = parse_coords(*coords[0]) elev = re.findall(r"Elevation.*?(\d{1,5})\s*ft", te, re.DOTALL) if len(elev) > 1: raise Exception( "First page of airport info (%s) does not contain exactly ONE elevation in ft" % (icao,) ) if len(elev) == 0: print "Couldn't find elev for ", icao ad["elev"] = int(elev[0]) freqs = [] found = False thrs = [] # uprint("-------------------------------------") for pagenr in xrange(p.get_num_pages()): page = p.parse_page_to_items(pagenr) # uprint("Looking on page 
%d"%(pagenr,)) if ( 0 ): # opening hours are no longer stored in a separate document for any airports. No need to detect which any more (since none are). for item in page.get_by_regex(".*OPERATIONAL HOURS.*"): lines = page.get_lines(page.get_partially_in_rect(0, item.y2 + 0.1, 100, 100)) for line in lines: things = ["ATS", "Fuelling", "Operating"] if not line.count("AIP SUP"): continue for thing in things: if line.count(thing): ad["aipsup"] = True for item in page.get_by_regex(".*\s*RUNWAY\s*PHYSICAL\s*CHARACTERISTICS\s*.*"): # uprint("Physical char on page") lines = page.get_lines(page.get_partially_in_rect(0, item.y2 + 0.1, 100, 100)) seen_end_rwy_text = False for line, nextline in izip(lines, lines[1:] + [None]): # uprint("MAtching: <%s>"%(line,)) if re.match(ur"AD\s+2.13", line): break if line.count("Slope of"): break if line.lower().count("end rwy:"): seen_end_rwy_text = True if line.lower().count("bgn rwy:"): seen_end_rwy_text = True m = re.match(ur".*(\d{6}\.\d+)[\s\(\)\*]*(N).*", line) if not m: continue m2 = re.match(ur".*(\d{6,7}\.\d+)\s*[\s\(\)\*]*(E).*", nextline) if not m2: continue latd, n = m.groups() lond, e = m2.groups() assert n == "N" assert e == "E" lat = latd + n lon = lond + e rwytxts = page.get_lines(page.get_partially_in_rect(0, line.y1 + 0.05, 12, nextline.y2 - 0.05)) uprint("Rwytxts:", rwytxts) rwy = None for rwytxt in rwytxts: # uprint("lat,lon:%s,%s"%(lat,lon)) # uprint("rwytext:",rwytxt) m = re.match(ur"\s*(\d{2}[LRCM]?)\b.*", rwytxt) if m: assert rwy == None rwy = m.groups()[0] if rwy == None and seen_end_rwy_text: continue print "Cur airport:", icao already = False assert rwy != None seen_end_rwy_text = False for thr in thrs: if thr["thr"] == rwy: raise Exception("Same runway twice on airfield:" + icao) thrs.append(dict(pos=mapper.parse_coords(lat, lon), thr=rwy)) assert len(thrs) >= 2 for pagenr in xrange(0, p.get_num_pages()): page = p.parse_page_to_items(pagenr) matches = page.get_by_regex(r".*ATS\s+COMMUNICATION\s+FACILITIES.*") # 
print "Matches of ATS COMMUNICATION FACILITIES on page %d: %s"%(pagenr,matches) if len(matches) > 0: commitem = matches[0] curname = None callsign = page.get_by_regex_in_rect(ur"Call\s*sign", 0, commitem.y1, 100, commitem.y2 + 8)[0] for idx, item in enumerate( page.get_lines( page.get_partially_in_rect(callsign.x1 - 0.5, commitem.y1, 100, 100), fudge=0.3, order_fudge=15, ) ): if item.strip() == "": curname = None if re.match(".*RADIO\s+NAVIGATION\s+AND\s+LANDING\s+AIDS.*", item): break # print "Matching:",item m = re.match(r"(.*?)\s*(\d{3}\.\d{1,3})\s*MHz.*", item) # print "MHZ-match:",m if not m: continue # print "MHZ-match:",m.groups() who, sfreq = m.groups() freq = float(sfreq) if abs(freq - 121.5) < 1e-4: if who.strip(): curname = who continue # Ignore emergency frequency, it is understood if not who.strip(): if curname == None: continue else: curname = who freqs.append((curname.strip().rstrip("/"), freq)) for pagenr in xrange(0, p.get_num_pages()): page = p.parse_page_to_items(pagenr) matches = page.get_by_regex(r".*ATS\s*AIRSPACE.*") # print "Matches of ATS_AIRSPACE on page %d: %s"%(pagenr,matches) if len(matches) > 0: heading = matches[0] desigitem, = page.get_by_regex("Designation and lateral limits") vertitem, = page.get_by_regex("Vertical limits") airspaceclass, = page.get_by_regex("Airspace classification") lastname = None subspacelines = dict() subspacealts = dict() for idx, item in enumerate( page.get_lines(page.get_partially_in_rect(desigitem.x2 + 1, desigitem.y1, 100, vertitem.y1 - 1)) ): if item.count("ATS airspace not established"): assert idx == 0 break if item.strip() == "": continue m = re.match(r"(.*?)(\d{6}N\s+.*)", item) if m: name, coords = m.groups() name = name.strip() else: name = item.strip() coords = None if name: lastname = name if coords: subspacelines.setdefault(lastname, []).append(coords) assert lastname lastname = None # print "Spaces:",subspacelines # print "ICAO",ad['icao'] # 
altlines=page.get_lines(page.get_partially_in_rect(vertitem.x2+1,vertitem.y1,100,airspaceclass.y1-0.2)) # print "Altlines:",altlines subspacealts = dict() subspacekeys = subspacelines.keys() allaltlines = " ".join( page.get_lines( page.get_partially_in_rect( vertitem.x1 + 0.5, vertitem.y1 + 0.5, 100, airspaceclass.y1 - 0.2 ) ) ) single_vertlim = False totalts = list(mapper.parse_all_alts(allaltlines)) # print "totalts:",totalts if len(totalts) == 2: single_vertlim = True for subspacename in subspacekeys: ceil = None floor = None subnames = [subspacename] if subspacename.split(" ")[-1].strip() in ["TIA", "TIZ", "CTR", "CTR/TIZ"]: subnames.append(subspacename.split(" ")[-1].strip()) # print "Parsing alts for ",subspacename,subnames try: for nametry in subnames: if ( single_vertlim ): # there's only one subspace, parse all of vertical limits field for this single one. items = [vertitem] else: items = page.get_by_regex_in_rect( nametry, vertitem.x2 + 1, vertitem.y1, 100, airspaceclass.y1 - 0.2 ) for item in items: alts = [] for line in page.get_lines( page.get_partially_in_rect( item.x1 + 0.5, item.y1 + 0.5, 100, airspaceclass.y1 - 0.2 ) ): # print "Parsing:",line line = line.replace(nametry, "").lower().strip() parsed = list(mapper.parse_all_alts(line)) if len(parsed): alts.append(mapper.altformat(*parsed[0])) if len(alts) == 2: break if alts: # print "alts:",alts ceil, floor = alts raise StopIteration except StopIteration: pass assert ceil and floor subspacealts[subspacename] = dict(ceil=ceil, floor=floor) spaces = [] for spacename in subspacelines.keys(): altspacename = spacename # print "Altspacename: %s, subspacesalts: %s"%(altspacename,subspacealts) space = dict( name=spacename, ceil=subspacealts[altspacename]["ceil"], floor=subspacealts[altspacename]["floor"], points=parse_coord_str(" ".join(subspacelines[spacename])), freqs=list(set(freqs)), ) if True: vs = [] for p in space["points"]: x, y = mapper.latlon2merc(mapper.from_str(p), 13) vs.append(Vertex(int(x), 
int(y))) p = Polygon(vvector(vs)) if p.calc_area() <= 30 * 30: pass # print space pass # print "Area:",p.calc_area() assert p.calc_area() > 30 * 30 # print "Area: %f"%(p.calc_area(),) spaces.append(space) # print space ad["spaces"] = spaces found = True if found: break assert found ad["runways"] = rwy_constructor.get_rwys(thrs)
if fl: return fl if alt: return alt+"FT MSL" if gnd: return "GND" if unl: return "UNL" ceiling,floor=[fixupalt(h) for h in [h1,h2]] if mapper.parse_elev(floor)>=9500: continue kind,name=re.match("EP (TSA|TRA|TFR) ([\d\w]+)",tra.text).groups() def fix_coords(s): def fixer(m): a,b,c,d, e,f,g,h=m.groups() return "%02d%02d%02d%s %03d%02d%02d%s - "%(int(a),int(b),int(c),d, int(e),int(f),int(g),h) return re.sub(ur"(\d{2,3})°(\d{2})'(\d{2})''([NS])\s*(\d{2,3})°(\d{2})'(\d{2})''([EW])",fixer,s) coordstr2=fix_coords("".join(o)).rstrip().rstrip("-") print "COordstr:",coordstr2 spaces.append(dict( name="EP %s %s"%(kind,name), points=mapper.parse_coord_str(coordstr2,context="poland"), ceiling=ceiling, floor=floor, type="TSA", freqs=[] )) return spaces if __name__=='__main__': for space in ep_parse_tra(): print "space",space
def ee_parse_airfields2(): ads=[] spaces=[] airac_date=get_airac_date() print "airac",airac_date overview_url="/%s/html/eAIP/EE-AD-0.6-en-GB.html"%(airac_date,) parser=lxml.html.HTMLParser() data,date=fetchdata.getdata(overview_url,country='ee') parser.feed(data) tree=parser.close() icaos=[] for cand in tree.xpath(".//h3"): txts=alltexts(cand.xpath(".//a")) aps=re.findall(r"EE[A-Z]{2}"," ".join(txts)) if aps: icao,=aps if alltext(cand).count("HELIPORT"): print "Ignore heliport",icao continue icaos.append(icao) for icao in icaos: ad=dict(icao=icao) url="/%s/html/eAIP/EE-AD-2.%s-en-GB.html"%(airac_date,icao) data,date=fetchdata.getdata(url,country='ee') parser.feed(data) tree=parser.close() thrs=[] for h3 in tree.xpath(".//h3"): txt=alltext(h3) print repr(txt) ptrn=ur"\s*%s\s+[—-]\s+(.*)"%(unicode(icao.upper()),) m=re.match(ptrn,txt,re.UNICODE) if m: assert not 'name' in ad ad['name']=m.groups()[0] for tr in tree.xpath(".//tr"): txt=alltext(tr) m=re.match(r".*coordinates\s*and\s*site.*(\d{6}N\s*\d{7}E).*",txt) #print "Matching,",txt,":",m if m: crds,=m.groups() ad['pos']=mapper.anyparse(crds) space=dict() for table in tree.xpath(".//table"): for tr in table.xpath(".//tr"): trtxt=alltext(tr) if trtxt.count("Designation and lateral limits"): space=dict() coords=tr.getchildren()[2] lines=alltext(coords).split("\n") if lines[0].strip()=='NIL': continue zname,what,spill=re.match(ur"(.*)\s+(CTR|TIZ|FIZ)(.*)",lines[0]).groups() if spill and spill.strip(): rest=[spill]+lines[1:] else: rest=lines[1:] what=what.strip() assert ad['name'].upper().strip().count(zname.upper().strip()) assert what in ['FIZ','TIZ','CTR'] space['type']=what space['points']=mapper.parse_coord_str("\n".join(rest)) space['name']=zname+" "+what space['date']=date space['url']=fetchdata.getrawurl(url,'ee') if trtxt.count("Vertical limits"): vlim=alltext(tr.getchildren()[2]) if vlim.strip()=='NIL': continue space['floor'],space['ceiling']=vlim.split(" to ") #space['freqs']=x #hlc=False for h4 in 
tree.xpath(".//h4"): txt=alltext(h4) if txt.lower().count("charts"): par=h4.getparent() for table in par.xpath(".//table"): for idx,tr in enumerate(table.xpath(".//tr")): name,page=\ tr.getchildren() nametxt=alltext(name) print "nametxt:",nametxt,"link:" for reg,variant in [ (r"Aerodrome.*Chart.*","") , (r"Landing.*Chart.*","landing"), (r".*Parking.*Chart.*","parking"), (r".*Visual.*Approach.*|.*\bVAC\b.*","vac") ]: if re.match(reg,nametxt): for a in page.xpath(".//a"): print "linklabel",a.text print "attrib:",a.attrib href=a.attrib['href'] print "Bef repl",href if href.lower().endswith("pdf"): href=href.replace("../../graphics","/%s/graphics"%(airac_date,)) print "href:",href,airac_date assert href parse_landing_chart.help_plc(ad,href, icao,ad['pos'],"ee",variant=variant) """arp=ad['pos'] lc=parse_landing_chart.parse_landing_chart( href, icao=icao, arppos=arp,country="ee") assert lc if lc: ad['adcharturl']=lc['url'] ad['adchart']=lc hlc=True #chartblobnames.append(lc['blobname']) """ #assert hlc for h4 in tree.xpath(".//h4"): txt=alltext(h4) if txt.count("RUNWAY PHYSICAL"): par=h4.getparent() for table in par.xpath(".//table"): prevnametxt="" for idx,tr in enumerate(table.xpath(".//tr")): if idx==0: fc=alltext(tr.getchildren()[0]) print "FC",fc if not fc.count("Designations"): break #skip table if idx<2:continue if len(tr.getchildren())==1:continue print "c:",tr.getchildren(),alltexts(tr.getchildren()) desig,trubrg,dims,strength,thrcoord,threlev=tr.getchildren() rwy=re.match(r"(\d{2}[LRC]?)",alltext(desig)) altc=alltext(thrcoord) print "Matching",altc print "rwymatch:",alltext(desig) m=re.match(r"\s*(\d+\.?\d*N)[\s\n]*(\d+\.?\d*E).*",altc,re.DOTALL|re.MULTILINE) if m: lat,lon=m.groups() print "Got latlon",lat,lon thrs.append(dict(pos=mapper.parse_coords(lat,lon),thr=rwy.groups()[0])) space['freqs']=[] for h4 in tree.xpath(".//h4"): txt=alltext(h4) if txt.count("ATS COMMUNICATION"): par=h4.getparent() for table in par.xpath(".//table"): for idx,tr in 
enumerate(table.xpath(".//tr")): print "cs",repr(tr.getchildren()),alltexts(tr.getchildren()) print len(tr.getchildren()) if len(tr.getchildren())!=5: if "".join(alltexts(tr.getchildren())).count(u"EMERG"): continue #Sometimes emergency freq is listed, and then it is without callsign service,callsign,frequency,hours,remarks=\ tr.getchildren() callsigntxt=alltext(callsign) if idx<2: if idx==0: assert callsigntxt.strip()=="Call sign" if idx==1: assert callsigntxt.strip()=="2" continue ftext=alltext(frequency) print "matching freq",ftext for freq in re.findall(ur"\b\d{3}\.\d{1,3}",ftext): freqmhz=float(freq) space['freqs'].append((callsigntxt.strip(),freqmhz)) if space and 'points' in space: assert 'freqs' in space assert 'points' in space assert 'floor' in space assert 'ceiling' in space assert 'type' in space spaces.append(space) if thrs: ad['runways']=rwy_constructor.get_rwys(thrs) aip_text_documents.help_parse_doc(ad,url, icao,"ee",title="General Information",category="general") ad['date']=date ad['url']=fetchdata.getrawurl(url,'ee') print "AD:",ad assert 'pos' in ad assert 'name' in ad ads.append(ad)
heights = ['GND', 'FL95'] if len(heights) == 1 and d['name'] == u'Göteborg TMA': heights = ['4500', 'FL95'] assert len(heights) == 2 ceiling = heights[0].strip() floor = heights[1].strip() pa['name'] = d['name'] pa['floor'] = floor pa['ceiling'] = ceiling if mapper.parse_elev(floor) >= 9500: continue #uprint("Arealines:\n================\n%s\n============\n"%(arealines[:last_coord_idx])) #print pa areacoords = " ".join(arealines[:last_coord_idx]) pa['points'] = parse_coord_str(areacoords) vs = [] for p in pa['points']: #print "from_str:",repr(p) x, y = mapper.latlon2merc(mapper.from_str(p), 13) vs.append(Vertex(int(x), int(y))) p = Polygon(vvector(vs)) if p.calc_area() <= 30 * 30: pass #print pa #print "Area:",p.calc_area() assert p.calc_area() > 30 * 30 #print "Area: %f"%(p.calc_area(),) #print "Point-counts:",len(pa['points']) for p in pa['points']:
def ee_parse_gen_r2(url):
    """Parse an Estonian eAIP ENR HTML page of restricted (R) areas.

    Fetches the page at *url*, walks every table row, and returns a list of
    airspace dicts with keys: name, points, floor, ceiling, type ('R'),
    date, freqs (always empty here) and url.
    """
    spaces=[]
    parser=lxml.html.HTMLParser()
    data,date=fetchdata.getdata(url,country='ee')
    parser.feed(data)
    tree=parser.close()
    print "Parsed tree"
    for tab in tree.xpath(".//table"):
        print "Found table"
        for idx,cand in enumerate(tab.xpath(".//tr")):
            # Rows with fewer than three cells cannot be area rows.
            if len(cand.getchildren())<3: continue
            space=dict()
            #print list(cand.getchildren())
            what,vert,remark=list(cand.getchildren())[0:3]
            # Normalize en-dashes and non-breaking spaces before matching.
            whattxt=alltext(what).replace(u"–","-").replace(u"\xa0"," ")
            verttxt=alltext(vert)
            # Repeatedly strip parenthesized remarks until none remain.
            while True:
                w=re.sub(ur"\(.*?\)","",whattxt)
                if w!=whattxt:
                    whattxt=w
                    continue
                break
            #print idx,whattxt
            # The first three rows are table headers; sanity-check then skip.
            if idx<3:
                if idx==1:
                    assert (whattxt.count("Identification") or whattxt.count("ateral limits"))
                if idx==2:
                    assert whattxt.strip()=="1"
                continue
            verttxt=verttxt.replace(u"\xa0",u" ")
            vertlines=[x for x in verttxt.split("\n") if x.strip()]
            if len(vertlines)==1:
                # Limits came out on a single line; fall back to splitting on spaces.
                vertlines=[x for x in verttxt.split(" ") if x.strip()]
            print "Verlintes:",repr(vertlines)
            #print "wha------------------------ t",whattxt
            # First vertical limit is the ceiling, second the floor.
            space['ceiling'],space['floor']=vertlines[:2]
            mapper.parse_elev(space['ceiling'])  # validate only; raises on garbage
            ifloor=mapper.parse_elev(space['floor'])
            # Areas whose floor is at/above 9500 ft are skipped
            # (presumably out of scope for this application -- confirm).
            if ifloor>=9500: continue
            lines=whattxt.split("\n")
            out=[]
            merged=""
            # Re-join the coordinate text: lines that end mid-phrase are buffered
            # and merged with the following line; completed lines get a trailing
            # " -" separator so mapper.parse_coord_str sees one segment per line.
            for line in lines[1:]:
                line=line.strip().replace(u"–","-")
                if line=="":continue
                if line.endswith("point"):
                    out.append(line+" ")
                    continue
                if line.endswith("ircle with radius of") or line.endswith(",") or line.endswith("on") or line.endswith("radius"):
                    # Sentence clearly continues on the next line; buffer it.
                    merged=" ".join([merged,line])
                    print "<---Merged:",merged
                    continue
                if merged:
                    line=" ".join([merged,line])
                    merged=""
                if not line.endswith("-"):
                    line=line+" -"
                out.append(line+"\n")
            # The first line of the cell is the area designation/name.
            space['name']=lines[0].strip()
            w="".join(out)
            print "Parsing:",w
            if space['name'].startswith('EER1 '):
                # EER1 is special-cased: use hand-maintained coordinate text and
                # clip against the FIR boundary from ee_parse_tma2.
                w=ee_parse_tma2.eer1txt
                fir=mapper.parse_coord_str(ee_parse_tma2.firtxt,context='estonia')
                fir_context=[fir]
                space['points']=mapper.parse_coord_str(w,fir_context=fir_context)
            else:
                space['points']=mapper.parse_coord_str(w,context='estonia')
            space['type']='R'
            space['date']=date
            space['freqs']=[]
            space['url']=fetchdata.getrawurl(url,'ee')
            spaces.append(space)
    return spaces
def ev_parse_tma(): out = [] parser = lxml.html.HTMLParser() # url="/Latvia_EV-ENR-2.1-en-GB.html" cur_airac = get_cur_airac() url = "/eAIPfiles/%s-AIRAC/html/eAIP/EV-ENR-2.1-en-GB.html" % (cur_airac,) data, date = fetchdata.getdata(url, country="ev") parser.feed(data) tree = parser.close() got_fir = False for table in tree.xpath("//table"): # print "Table with %d children"%(len(table.getchildren()),) rows = list(table.xpath(".//tr")) for idx in xrange(5): headingrow = rows[idx] cols = list(headingrow.xpath(".//th")) # print len(cols) if len(cols) == 5: break else: raise Exception("No heading row") assert idx == 0 # for idx,col in enumerate(cols): # print "Col %d, %s"%(idx,alltext(col)[:10]) nameh, unith, callsignh, freqh, remarkh = cols assert alltext(nameh).lower().count("name") assert alltext(unith).lower().count("unit") assert re.match(ur"call\s*sign", alltext(callsignh).lower()) lastcols = None for row in rows[1:]: cols = list(row.xpath(".//td")) if len(cols) == 5: name, unit, callsign, freq, remark = cols lastcols = cols else: if lastcols: unit, callsign, freq, remark = lastcols[1:] name = cols[0] else: continue lines = [x.strip() for x in alltext(name).split("\n") if x.strip()] if len(lines) == 0: continue spacename = lines[0].strip() if re.match(ur"RIGA\s*UTA|RIGA\s*CTA|RIGA\s*AOR.*", spacename): continue freqstr = alltext(freq) callsignstr = alltext(callsign) if freqstr.strip(): print freqstr freqmhzs = re.findall(ur"\d{3}\.\d{3}", freqstr) assert len(freqmhzs) <= 2 callsigns = [callsignstr.split("\n")[0].strip()] freqs = [] for idx, freqmhz in enumerate(freqmhzs): if freqmhz == "121.500": continue freqs.append((callsigns[idx], float(freqmhz))) print "freqs:", freqs else: freqs = [] assert len(lines) classidx = next(idx for idx, x in reversed(list(enumerate(lines))) if x.lower().count("class of airspace")) if re.match(ur"RIGA\s*FIR.*UIR", spacename, re.UNICODE): got_fir = True lastspaceidx = classidx - 2 floor = "GND" ceiling = "-" type_ = "FIR" else: if 
lines[classidx - 1].count("/") == 1: floor, ceiling = lines[classidx - 1].split("/") lastspaceidx = classidx - 1 else: floor = lines[classidx - 1] ceiling = lines[classidx - 2] lastspaceidx = classidx - 2 ceiling = strangefix(ceiling) floor = strangefix(floor) mapper.parse_elev(ceiling) mapper.parse_elev(floor) type_ = "TMA" tcoords = lines[1:lastspaceidx] # verify that we got actual altitudes: coords = [] for coord in tcoords: coord = coord.strip().replace("(counter-)", "").replace("(RIGA DVOR - RIA)", "") if coord.endswith(u"E") or coord.endswith("W"): coord = coord + " -" coords.append(coord) raw = " ".join(coords) raw = re.sub(s(ur"Area bounded by lines successively joining the following points:"), "", raw) print "Raw:", raw coords = mapper.parse_coord_str(raw, context="latvia") for cleaned in clean_up_polygon(coords): out.append( dict( name=spacename, points=cleaned, type=type_, freqs=freqs, floor=floor, url=url, date=date, ceiling=ceiling, ) ) if type_ == "FIR": out[-1]["icao"] = "EVRR"
else: raise Exception("No limitstr") cstr = [] spacename = coordstr[0] assert spacename == "CTR" for sub in coordstr[1:]: cstr.append(sub.strip().rstrip(".")) def fixfunc(m): return "".join(m.groups()) raw = re.sub(ur"(\d{2,3})\s*(\d{2})\s*(\d{2})\s*([NSEW])", fixfunc, "".join(cstr)).replace(",", " - ") print "parsing raw:", raw points = mapper.parse_coord_str(raw, context='lithuania') print "Limitstr", limitstr floor, ceiling = re.match(ur"(.*)\s*to\s*(.*)", limitstr).groups() mapper.parse_elev(floor) mapper.parse_elev(ceiling) spacenamestem = spacename.strip() if spacenamestem.endswith("CTR"): spacenamestem = spacenamestem[:-3].strip() if spacenamestem.endswith("FIZ"): spacenamestem = spacenamestem[:-3].strip() #construct names newfreqs = [] for serv, freq in freqs: serv = serv.strip()
def ee_parse_tma2(): spaces=[] airac_date=get_airac_date() url="/%s/html/eAIP/EE-ENR-2.1-en-GB.html"%(airac_date,) parser=lxml.html.HTMLParser() data,date=fetchdata.getdata(url,country='ee') parser.feed(data) tree=parser.close() icaos=[] def nested(tab): if tab==None: return False if tab.getparent() is None: return False #print dir(tab) if tab.tag=='table': return True return nested(tab.getparent()) for tab in tree.xpath(".//table"): print "table alltext:",alltext(tab) if nested(tab.getparent()): continue firsttr=tab.xpath(".//tr")[0] ntext=alltext(firsttr) print "firsttr",firsttr print "ntext",ntext if re.match(ur".*FIR\s*/\s*CTA.*",ntext): print "Matches Tallin FIR" name='TALLIN FIR' points=mapper.parse_coord_str(firtxt,context='estonia') floor,ceiling="GND","FL195" space={} space['name']=name space['points']=points space['floor']=floor space['ceiling']=ceiling space['freqs']=[] space['icao']='EETT' space['type']='FIR' space['date']=date space['url']=fetchdata.getrawurl(url,'ee') spaces.append(space) continue else: name=ntext.strip() space=dict(name=name) print "Name",name assert space['name'].count("TMA") \ or space['name'].count("FIR") if space['name'].count("FIR"): type='FIR' else: type="TMA" freqs=[] points=None floor=None ceiling=None for cand in tab.xpath(".//tr"): if len(cand.getchildren())!=2: continue nom,what=cand.getchildren() whattxt=alltext(what) nomtxt=alltext(nom) print "nomtxt",nomtxt,"space name",space['name'] if nomtxt.count("Lateral limits"): if space['name'].count("TALLINN TMA 2"): points=mapper.parse_coord_str(""" A circle with radius 20 NM centred on 592448N 0244957E """) else: whattxt=whattxt.replace( "then along the territory dividing line between Estonia and Russia to", "- Along the common Estonian/X state boundary to " ) print "Fixed up",whattxt points=mapper.parse_coord_str(whattxt,context='estonia') if nomtxt.count("Vertical limits"): floor,ceiling=whattxt.split(" to ") if nomtxt.count("Call sign"): callsign=whattxt.split("\n")[0] if 
nomtxt.count("freq"): freqs.extend(re.findall(ur"\d+\.\d+\s*MHz")) assert points and floor and ceiling space['points']=points space['type']=type space['floor']=floor space['ceiling']=ceiling space['freqs']=[] space['type']=type space['date']=date space['url']=fetchdata.getrawurl(url,'ee') for freq in freqs: space['freqs'].append((callsign,freq)) spaces.append(space)
lat, lon = m.groups() return lat.replace(" ", "") + " " + lon.replace(" ", "") areaspec = re.sub(ur"([\d ]+N)\s*([\d ]+E)", fixup, areaspec) areaspec = re.sub( ur"\(.*/\s*equal\s*to\s*Malmi\s*CTR\s*lateral\s*limits\)", "", areaspec) #print "Fixed areaspec",areaspec #if icao=="EFKS": # areaspec=areaspec.replace("6615 28N","661528N") #Error! REstriction areas! spaces.append( dict(name=spacename, type="CTR", points=mapper.parse_coord_str(areaspec))) if line.count("Vertical limits"): #print "Breaking" break while not line.count("Vertical limits"): line = lines.next() #print "Matching veritcal limits--------------------------------" oldspaces = spaces spaces = [] for space in oldspaces: if space['name'].count("/"): a, b = space['name'].split("/") spaces.append(dict(space, name=a.strip())) spaces.append(dict(space, name=b.strip())) else: spaces.append(space)
from datetime import datetime import fplan.lib.mapper as mapper import re from fplan.lib.poly_cleaner import clean_up_polygon def ey_parse_tma(): out=[] def emit(name,coordstr,limits,type="TMA",freqs=[],date=datetime(2011,03,25),icao=None): ceiling,floor=limits.split("/") def compact(m): return "".join(m.groups()) coordstr=re.sub(ur"(\d{2,3})\s*(\d{2})\s*(\d{2})",compact,coordstr) coordstr=re.sub(ur"NM from KNA to ","NM from 545740N 0240519E to",coordstr) print coordstr tpoints=mapper.parse_coord_str(coordstr,context='lithuania') f1=mapper.parse_elev(floor) c1=mapper.parse_elev(ceiling) if c1!='-': assert c1>f1 for points in clean_up_polygon(tpoints): out.append( dict( name=name, floor=floor, ceiling=ceiling, freqs=freqs, points=points, type=type ) )
for line in reversed(lines): if re.match(ur"[A-ZÅÄÖ ]{3,}",line): #print "freqname Matched:",line fname=line.strip() break if not fname: raise Exception("Found no frequency name for freq: "+freq) freqs.append((fname,float(freq))) if len(freqs): break (ceiling,ceilingy),(floor,floory)=verts assert ceilingy<floory assert floory-ceilingy<5.0 #uprint("Analyzing area for %s"%(name,)) assert "".join(areaspec).strip()!="" print areaspec area=mapper.parse_coord_str("".join(areaspec)) #uprint("Done analyzing %s"%(name,)) #print area if name.count("CTA") and name.count("TMA")==0: type_="CTA" else: type_="TMA" out.append(dict( floor=floor, ceiling=ceiling, freqs=freqs, type=type_, name=name, points=area))
def ev_parse_x(url):
    """Parse a Latvian eAIP HTML page of restricted/danger-type areas.

    Fetches the page at *url* and returns a list of airspace dicts
    (type 'R') with name, points, floor, ceiling, freqs, url and date.
    """
    out = []
    parser = lxml.html.HTMLParser()
    data, date = fetchdata.getdata(url, country="ev")
    parser.feed(data)
    tree = parser.close()
    got_fir = False  # NOTE(review): set but never read in this function
    for table in tree.xpath("//table"):
        #print "Table with %d children"%(len(table.getchildren()),)
        rows = list(table.xpath(".//tr"))
        #for idx,col in enumerate(cols):
        #    print "Col %d, %s"%(idx,alltext(col)[:10])
        # Only process tables whose first row is a <th> heading row.
        headingcols = rows[0].xpath(".//th")
        if len(headingcols) == 0:
            continue
        name, alt = headingcols[0:2]
        # Skip wide QNH tables -- those are not airspace tables.
        if alltext(name).count("QNH") and len(headingcols) > 6:
            continue
        print alltext(name)
        assert alltext(name).lower().count("name") or alltext(
            name).lower().count("lateral")
        print alltext(alt)
        assert alltext(alt).lower().count("limit")
        for row in rows[1:]:
            cols = list(row.xpath(".//td"))
            if len(cols) < 2:
                continue
            name, alt = cols[:2]
            lines = [x.strip() for x in alltext(name).split("\n") if x.strip()]
            if len(lines) == 0:
                continue
            assert len(lines)
            spacename = lines[0].strip()
            # Special case: one EVR2 row carries the whole circle definition on
            # the name line; peel the designation off and keep the remainder
            # as coordinate text.
            if spacename.strip(
            ) == "A circle radius 0,5 NM centered on 565705N 0240619E EVR2 RIGA":
                spacename = "EVR2 RIGA"
                lines = [spacename, lines[0][:-len(spacename)].strip()
                         ] + lines[1:]
            print spacename
            if spacename.strip() == "SKRIVERI":
                continue
            print "Spacename is:", spacename
            assert spacename[:3] in ["EVR","EVP","TSA","TRA"] or \
                spacename.endswith("ATZ") or \
                spacename.endswith("ATZ (MILITARY)")
            # Collect vertical-limit lines, dropping "Real-time" remark lines.
            altcand = []
            for altc in alltext(alt).split("\n"):
                if altc.count("Real-time"):
                    continue
                altcand.append(altc.strip())
            print "Altcands:", altcand
            # Limits are written "ceiling / floor"; exactly one slash expected.
            ceiling, floor = [x.strip() for x in " ".join(altcand).split("/")]
            ceiling = strangefix(ceiling)
            floor = strangefix(floor)
            mapper.parse_elev(ceiling)  # validate; raises on unparsable text
            ifloor = mapper.parse_elev(floor)
            iceiling = mapper.parse_elev(ceiling)
            # Skip areas lying entirely at/above 9500 ft.
            if ifloor >= 9500 and iceiling >= 9500:
                continue
            assert ifloor < iceiling
            freqs = []
            raw = " ".join(lines[1:])
            # s() presumably builds a whitespace-tolerant regex from the
            # phrase -- TODO confirm against its definition elsewhere in file.
            raw = re.sub(
                s(ur"Area bounded by lines successively joining the following points:"
                  ), "", raw)
            print "Raw:", raw
            coords = mapper.parse_coord_str(raw, context='latvia')
            for cleaned in clean_up_polygon(coords):
                out.append(
                    dict(name=spacename,
                         points=cleaned,
                         type="R",
                         freqs=freqs,
                         floor=floor,
                         url=url,
                         date=date,
                         ceiling=ceiling))
    return out
def fi_parse_tma(): p=parse.Parser(r"/ais/eaip/pdf/enr/EF_ENR_2_1_EN.pdf",fixuphref,country='fi') res=[] atsres=[] for pagenr in xrange(4,p.get_num_pages()): parsed,atsparsed=parse_page(p,pagenr)#pagenr) res.extend(parsed) atsres.extend(atsparsed) #break print "Len ouf out ",len(res) atsout=[] for space in atsres: #print "bef cut:",space['points'] mypolys=[makepoly.poly(space['points'])] for tmaitem in res: if tmaitem['type']!='TMA': continue outmypolys=[] assert len(mypolys)>=1 for mypoly in list(mypolys): tmapoly=makepoly.poly(tmaitem['points']) #print mypoly #print tmapoly shape=mypoly.subtract(tmapoly) newpolys=shape.get_polys() if len(newpolys)>1: print "Length is:", len(newpolys) #print "Cutting" outmypolys.extend([shapemerge2d.Polygon(x) for x in list(newpolys)]) #assert len(newpolys)==1 if len(outmypolys)>1: print "outmypolys:",outmypolys #print "Cut to:",mypoly mypolys=outmypolys for mypoly in mypolys: t=[] for mx,my in [(v.get_x(),v.get_y()) for v in mypoly.get_vertices()]: t.append(mapper.to_str(mapper.merc2latlon((mx,my),13))) #print "Aft cut:",t newspace=dict(space) newspace['points']=t atsout.append(newspace) if len(mypolys)>1: print "Space was split into ",len(mypolys),"parts" res.extend(atsout) res.append(dict( name="FINLAND FIR", icao="EFIN", floor='GND', ceiling='-', freqs=[], type='FIR', date=datetime(2011,4,9), points=mapper.parse_coord_str(""" 601130N 0190512E - 601803N 0190756E - 610000N 0191905E - 614000N 0193000E - 631000N 0201000E - 632830N 0204000E - 633700N 0213000E - 644100N 0225500E - 653148N 0240824E - Along the common X/Y state boundary to 690336N 0203255E - Along the common X/Y state boundary to 690307N 0285545E - Along the common X/Y state boundary to 601201N 0271735E - 600800N 0263300E - 595830N 0260642E - 595300N 0255200E - 595430N 0252000E - 595300N 0245100E - 590000N 0210000E - 591524N 0203239E - 593346N 0195859E - 601130N 0190512E """,context="finland"))) #for pa in res: # pretty(pa) return res
def find_areas(page):
    """Generator: find coordinate-list areas on a parsed PDF page.

    Groups consecutive coordinate-looking text lines (by vertical spacing)
    into candidate areas, parses their coordinates, and looks upward on the
    page for a heading (larger/bolder font) to use as the area name.

    Yields (areaname, coords, dict(y1=..., y2=...)) tuples, where y1/y2 is
    the vertical extent of the area (including its heading line).
    """
    # Any line containing a latitude (4-6 digits + N/S) or a longitude
    # (5-7 digits + E/W) is a candidate "area line"; sort top-to-bottom,
    # left-to-right.
    areastarts=sorted(
        list(page.get_by_regex(r".*?\d{4,6}[NS].*"))+
        list(page.get_by_regex(r".*?\d{5,7}[EW].*")),
        key=lambda x:(x.y1,x.x1))
    #for area in areastarts:
    #    print "Area font:",area.fontsize,area.font,"bolditalic:",area.bold,area.italic
    #    print " - Area:",area.text
    print "Found %d area-lines on page"%(len(areastarts),)
    print areastarts
    if len(areastarts)==0:
        return
    idx=0
    cury=None
    while True:
        firstdiff=None
        process=[]   # the run of lines assigned to the current area
        miny=None
        maxy=None
        # Consume lines until the vertical gap to the next line is "too
        # big": either > 6.0 units, or > 1.35x the first observed gap
        # (i.e. the line spacing suddenly changes).
        while idx<len(areastarts):
            process.append(areastarts[idx])
            cury=areastarts[idx].y1
            if miny==None or maxy==None:
                miny=cury
                maxy=cury
            miny=min(areastarts[idx].y1,miny)
            maxy=max(areastarts[idx].y2,maxy)
            #print "Diff:",diff,"firstdiff:",firstdiff,"delta:",diff-firstdiff if diff!=None and firstdiff!=None else ''
            idx+=1
            if idx<len(areastarts):
                diff=areastarts[idx].y1-cury
                if diff!=0:
                    if firstdiff==None:
                        firstdiff=diff
                    #print "Diff:",diff
                    if diff>6.0:
                        #print "Diff too big"
                        break
                    if firstdiff and diff>1.35*firstdiff:
                        #print "bad spacing",diff,1.5*firstdiff
                        break
        #print "Determined that these belong to one area:",process
        if len(process):
            alltext="\n".join(page.get_lines(process))
            print "<%s>"%(alltext,)
            # Extract runs of lat/lon pairs from the grouped text.
            anyarea=re.findall(r"((?:\d{4,6}[NS]\s*\d{5,7}[EW])+)",alltext,re.DOTALL|re.MULTILINE)
            print "Matching:"
            print anyarea
            if not len(anyarea):
                continue
            # NOTE(review): when 0 < len(anyarea) < 3, `coords` is not
            # assigned here, so the yield below would raise NameError (or
            # reuse a stale value from a previous iteration) -- confirm
            # whether such pages can occur.
            if len(anyarea)>=3:
                coords=parse_coord_str(" - ".join(anyarea),filter_repeats=True)
                print "AREA:"
                print coords
                print "===================================="
                assert len(coords)>=3
            coordfontsize=process[0].fontsize
            areaname=None
            # Scan upward from the first coordinate line, looking for the
            # nearest item in a larger/bolder/more-italic font: that is
            # taken as the area heading.
            for item in reversed(sorted(page.get_partially_in_rect(0,0,100,process[0].y1),key=lambda x:(x.y1,x.x1))):
                if item.text.strip()=="":
                    continue
                #print "fontsize",item.fontsize,item.text,"y1:",item.y1
                if item.fontsize>process[0].fontsize or item.bold>process[0].bold or item.italic>process[0].italic:
                    assert item.y1!=None
                    # Extend the area's extent to include the heading line.
                    miny=min(item.y1,miny)
                    print "Found name: <%s>. Fonts: %d, %d, Fontsize: %s, old fontsize: %s"%(item.text,item.font,process[0].font,item.fontsize,process[0].fontsize)
                    prevx1=item.x1
                    revname=[]
                    # Collect heading fragments right-to-left on the same
                    # row, stopping at a horizontal gap > 3.0 units.
                    # NOTE(review): prevx1 is never updated inside this
                    # loop, so the gap is always measured against the
                    # anchor item -- confirm that is intended.
                    for nameitem in reversed(sorted(page.get_partially_in_rect(0,item.y1+0.01,item.x2,item.y2-0.01),key=lambda x:(x.x1))):
                        if prevx1-nameitem.x2>3.0:
                            break
                        revname.append(nameitem.text.strip())
                    areaname=" ".join(reversed(revname))
                    break
            yield (areaname,coords,dict(y1=miny,y2=maxy))
        if idx>=len(areastarts):
            break
def ee_parse_airfields2():
    """Scrape Estonian aerodromes from the eAIP HTML AD sections.

    Reads the AD-0.6 overview page to enumerate EExx ICAO codes (skipping
    heliports), then for each aerodrome parses its AD-2 page for: name,
    position, CTR/TIZ/FIZ airspace (lateral + vertical limits + frequencies),
    charts, runway thresholds, and attached text documents.

    Builds local lists `ads` and `spaces`.
    NOTE(review): no return statement is visible in this chunk; presumably
    `return ads, spaces` follows -- confirm against the full file.
    """
    ads = []
    spaces = []
    airac_date = get_airac_date()
    print "airac", airac_date
    overview_url = "/%s/html/eAIP/EE-AD-0.6-en-GB.html" % (airac_date, )
    parser = lxml.html.HTMLParser()
    data, date = fetchdata.getdata(overview_url, country='ee')
    parser.feed(data)
    tree = parser.close()
    icaos = []
    # The overview page lists one aerodrome per <h3>; pick out EExx codes.
    for cand in tree.xpath(".//h3"):
        txts = alltexts(cand.xpath(".//a"))
        aps = re.findall(r"EE[A-Z]{2}", " ".join(txts))
        if aps:
            icao, = aps
            if alltext(cand).count("HELIPORT"):
                print "Ignore heliport", icao
                continue
            icaos.append(icao)
    for icao in icaos:
        ad = dict(icao=icao)
        url = "/%s/html/eAIP/EE-AD-2.%s-en-GB.html" % (airac_date, icao)
        data, date = fetchdata.getdata(url, country='ee')
        parser.feed(data)
        tree = parser.close()
        thrs = []
        # Aerodrome name: heading of the form "EExx - NAME" (em-dash or
        # hyphen).
        for h3 in tree.xpath(".//h3"):
            txt = alltext(h3)
            print repr(txt)
            ptrn = ur"\s*%s\s+[—-]\s+(.*)" % (unicode(icao.upper()), )
            m = re.match(ptrn, txt, re.UNICODE)
            if m:
                assert not 'name' in ad
                ad['name'] = m.groups()[0]
        # ARP position: row "ARP coordinates and site ..." with DDMMSSN DDDMMSSE.
        for tr in tree.xpath(".//tr"):
            txt = alltext(tr)
            m = re.match(r".*coordinates\s*and\s*site.*(\d{6}N\s*\d{7}E).*", txt)
            #print "Matching,",txt,":",m
            if m:
                crds, = m.groups()
                ad['pos'] = mapper.anyparse(crds)
        # CTR/TIZ/FIZ airspace: built up across two table rows
        # ("Designation and lateral limits" then "Vertical limits").
        space = dict()
        for table in tree.xpath(".//table"):
            for tr in table.xpath(".//tr"):
                trtxt = alltext(tr)
                if trtxt.count("Designation and lateral limits"):
                    space = dict()
                    coords = tr.getchildren()[2]
                    lines = alltext(coords).split("\n")
                    if lines[0].strip() == 'NIL':
                        continue
                    # First line: "<zone name> CTR|TIZ|FIZ [coords spill]".
                    zname, what, spill = re.match(ur"(.*)\s+(CTR|TIZ|FIZ)(.*)", lines[0]).groups()
                    if spill and spill.strip():
                        rest = [spill] + lines[1:]
                    else:
                        rest = lines[1:]
                    what = what.strip()
                    # Zone name should be contained in the aerodrome name.
                    assert ad['name'].upper().strip().count(
                        zname.upper().strip())
                    assert what in ['FIZ', 'TIZ', 'CTR']
                    space['type'] = what
                    space['points'] = mapper.parse_coord_str("\n".join(rest))
                    space['name'] = zname + " " + what
                    space['date'] = date
                    space['url'] = fetchdata.getrawurl(url, 'ee')
                if trtxt.count("Vertical limits"):
                    vlim = alltext(tr.getchildren()[2])
                    if vlim.strip() == 'NIL':
                        continue
                    space['floor'], space['ceiling'] = vlim.split(" to ")
        #space['freqs']=x
        #hlc=False
        # Charts: map chart-table row names to chart variants and hand the
        # PDF links to the landing-chart helper.
        for h4 in tree.xpath(".//h4"):
            txt = alltext(h4)
            if txt.lower().count("charts"):
                par = h4.getparent()
                for table in par.xpath(".//table"):
                    for idx, tr in enumerate(table.xpath(".//tr")):
                        name,page=\
                            tr.getchildren()
                        nametxt = alltext(name)
                        print "nametxt:", nametxt, "link:"
                        for reg, variant in [
                                (r"Aerodrome.*Chart.*", ""),
                                (r"Landing.*Chart.*", "landing"),
                                (r".*Parking.*Chart.*", "parking"),
                                (r".*Visual.*Approach.*|.*\bVAC\b.*", "vac")
                                ]:
                            if re.match(reg, nametxt):
                                for a in page.xpath(".//a"):
                                    print "linklabel", a.text
                                    print "attrib:", a.attrib
                                    href = a.attrib['href']
                                    print "Bef repl", href
                                    if href.lower().endswith("pdf"):
                                        # Relative graphics path -> AIRAC-dated absolute path.
                                        href = href.replace(
                                            "../../graphics",
                                            "/%s/graphics" % (airac_date, ))
                                        print "href:", href, airac_date
                                        assert href
                                        parse_landing_chart.help_plc(
                                            ad, href, icao, ad['pos'], "ee",
                                            variant=variant)
                                        """arp=ad['pos']
                                        lc=parse_landing_chart.parse_landing_chart(
                                                href,
                                                icao=icao,
                                                arppos=arp,country="ee")
                                        assert lc
                                        if lc:
                                            ad['adcharturl']=lc['url']
                                            ad['adchart']=lc
                                            hlc=True
                                            #chartblobnames.append(lc['blobname'])
                                        """
        #assert hlc
        # Runway thresholds from the "RUNWAY PHYSICAL CHARACTERISTICS" table.
        for h4 in tree.xpath(".//h4"):
            txt = alltext(h4)
            if txt.count("RUNWAY PHYSICAL"):
                par = h4.getparent()
                for table in par.xpath(".//table"):
                    prevnametxt = ""
                    for idx, tr in enumerate(table.xpath(".//tr")):
                        if idx == 0:
                            fc = alltext(tr.getchildren()[0])
                            print "FC", fc
                            if not fc.count("Designations"):
                                break  #skip table
                        if idx < 2:
                            continue  # header rows
                        if len(tr.getchildren()) == 1:
                            continue  # spanning/separator row
                        print "c:", tr.getchildren(), alltexts(
                            tr.getchildren())
                        desig, trubrg, dims, strength, thrcoord, threlev = tr.getchildren(
                        )
                        rwy = re.match(r"(\d{2}[LRC]?)", alltext(desig))
                        altc = alltext(thrcoord)
                        print "Matching", altc
                        print "rwymatch:", alltext(desig)
                        m = re.match(r"\s*(\d+\.?\d*N)[\s\n]*(\d+\.?\d*E).*",
                                     altc, re.DOTALL | re.MULTILINE)
                        if m:
                            lat, lon = m.groups()
                            print "Got latlon", lat, lon
                            thrs.append(
                                dict(pos=mapper.parse_coords(lat, lon),
                                     thr=rwy.groups()[0]))
        # Frequencies from the "ATS COMMUNICATION FACILITIES" table.
        space['freqs'] = []
        for h4 in tree.xpath(".//h4"):
            txt = alltext(h4)
            if txt.count("ATS COMMUNICATION"):
                par = h4.getparent()
                for table in par.xpath(".//table"):
                    for idx, tr in enumerate(table.xpath(".//tr")):
                        print "cs", repr(tr.getchildren()), alltexts(
                            tr.getchildren())
                        print len(tr.getchildren())
                        if len(tr.getchildren()) != 5:
                            if "".join(alltexts(
                                    tr.getchildren())).count(u"EMERG"):
                                continue  #Sometimes emergency freq is listed, and then it is without callsign
                        service,callsign,frequency,hours,remarks=\
                            tr.getchildren()
                        callsigntxt = alltext(callsign)
                        if idx < 2:
                            # Sanity-check the two header rows.
                            if idx == 0:
                                assert callsigntxt.strip() == "Call sign"
                            if idx == 1:
                                assert callsigntxt.strip() == "2"
                            continue
                        ftext = alltext(frequency)
                        print "matching freq", ftext
                        for freq in re.findall(ur"\b\d{3}\.\d{1,3}", ftext):
                            freqmhz = float(freq)
                            space['freqs'].append(
                                (callsigntxt.strip(), freqmhz))
        if space and 'points' in space:
            assert 'freqs' in space
            assert 'points' in space
            assert 'floor' in space
            assert 'ceiling' in space
            assert 'type' in space
            spaces.append(space)
        if thrs:
            ad['runways'] = rwy_constructor.get_rwys(thrs)
        aip_text_documents.help_parse_doc(ad, url, icao, "ee",
                                          title="General Information",
                                          category="general")
        ad['date'] = date
        ad['url'] = fetchdata.getrawurl(url, 'ee')
        print "AD:", ad
        assert 'pos' in ad
        assert 'name' in ad
        ads.append(ad)
def ee_parse_restrictions():
    """Parse Estonian restricted (EER*) and danger (EED*) areas from PDF.

    For each area row the lateral limits are read from the left column and
    the vertical limits from the elevation column.  EER1 gets special
    handling: a hard-coded sea-border polyline is spliced into its polygon.
    A hand-coded "EE TSA 1" area is appended at the end.

    Builds the local list `spaces`.
    NOTE(review): no return statement is visible in this chunk; presumably
    `return spaces` follows -- confirm against the full file.
    """
    spaces=[]
    p=parse.Parser("/ee_restricted_and_danger.pdf",lambda x: x,country='ee')
    for pagenr in xrange(p.get_num_pages()):
        page=p.parse_page_to_items(pagenr)
        # Area rows start with a designator like "EER1 ..." / "EED2 ...".
        # A trailing None acts as sentinel so izip() pairs each row with
        # its successor (used to delimit the row vertically).
        raws=list(sorted(page.get_by_regex(ur"EE[RD]\d+\s+.*"),key=lambda x:x.y1))+[None]
        if len(raws)>1:
            # Left edge of the elevation column = x of the leftmost
            # "1234 FT MSL" / "FL 123" item on the page.
            elevs=page.get_by_regex(ur"\d+\s*FT\s*MSL|FL\s*\d+")
            assert elevs
            elevcol=min(elev.x1 for elev in elevs)
            assert elevcol!=100
            for cur,next in izip(raws[:-1],raws[1:]):
                #if cur.text.count("Tunnus, nimi ja sivurajat"): continue #not a real airspace
                space=dict()
                if next==None:
                    y2=100
                else:
                    y2=next.y1-1.75
                name=cur.text.strip()
                space['name']=name
                # Lateral-limits text: everything between this row and the
                # next, left of the elevation column.
                areaspecprim=page.get_lines(page.get_partially_in_rect(cur.x1+0.01,cur.y2+0.05,elevcol-2,y2),
                    fudge=.25)
                #print "areaspecprim:\n","\n".join(areaspecprim)
                areaspec=[]
                for area in areaspecprim:
                    print "area in ",area
                    area=area.replace(u"–","-")   # en-dash -> hyphen
                    if len(areaspec) and area.strip()=="":
                        break   # blank line ends the coordinate block
                    # Strip a hyphenation artifact (word char + "-") at EOL.
                    area=re.sub(ur"\w-$","",area)
                    areaspec.append(area)
                #print "Y-interval:",cur.y1,y2,"next:",next
                #print "Name:",space['name']
                #print "areaspec:",areaspec
                inp=" ".join(areaspec)
                #print inp
                #raw_input()
                tpoints=mapper.parse_coord_str(inp,context='estonia')
                if name.startswith("EER1"):
                    # EER1 follows the coast: splice the hard-coded sea
                    # border into the parsed polygon between the two points
                    # closest (< 1 km) to its endpoints.
                    tseaborder="592842N 0280054E - 593814N 0273721E - 593953N 0265728E - 594513N 0264327E"
                    seapoints=mapper.parse_coord_str(tseaborder)
                    cont=None
                    points=[]
                    def close(a,b):
                        # True if a and b are less than 1 km apart.
                        bearing,dist=mapper.bearing_and_distance(
                            mapper.from_str(a),mapper.from_str(b))
                        #print (a,b),dist
                        return dist<1.0
                    for idx,point in enumerate(tpoints):
                        points.append(point)
                        if close(point,seapoints[0]):
                            print "WAS CLOSE",point,seapoints[0]
                            points.extend(seapoints[1:-1])
                            for idx2,point in enumerate(tpoints[idx+1:]):
                                if close(point,seapoints[-1]):
                                    points.extend(tpoints[idx+1+idx2:])
                                    break
                            else:
                                raise Exception("Couldn't find seaborder end")
                            break
                    else:
                        raise Exception("Couldn't find seaborder")
                else:
                    points=tpoints
                space['points']=points
                # Vertical limits: two non-empty lines in the elevation
                # column (ceiling first, then floor).
                vertitems=page.get_partially_in_rect(elevcol,cur.y1+0.05,elevcol+8,y2+1.5)
                vertspec=[]
                for v in page.get_lines(vertitems):
                    if v.strip()=="":
                        continue
                    if v.strip().count("Lennuliiklusteeninduse AS"):
                        continue   # page-footer company name, not a limit
                    vertspec.append(v.strip())
                print "vertspec:",vertspec
                assert len(vertspec)==2
                ceiling,floor=vertspec
                # Skip areas lying entirely at/above 9500 ft.
                if mapper.parse_elev(floor)>=9500 and mapper.parse_elev(ceiling)>=9500:
                    continue
                space['ceiling']=ceiling
                space['floor']=floor
                space['type']='R'
                space['freqs']=[]
                spaces.append(space)
    # Hand-coded TSA not present in the PDF.
    spaces.append(dict(
        name="EE TSA 1",
        ceiling="UNL",
        floor="5000 FT GND",
        points=mapper.parse_coord_str(u"""
            594500N 0255000E – 594500N 0261800E –
            592100N 0265800E – 591200N 0261200E –
            591600N 0255400E – 594500N 0255000E"""),
        type="TSA",
        date=datetime(2011,03,25),
        freqs=[]))
def ep_parse_tma(): spaces = [] pages, date = miner.parse('/_Poland_EP_ENR_2_1_en.pdf', country='ep', usecache=True, maxcacheage=86400 * 7) for nr, page in enumerate(pages): #if nr!=1: continue #print "page",nr #print page.items desigs = page.get_by_regex(ur".*DESIGNATION AND LATERAL.*", re.DOTALL) for desig, next in izip(desigs, desigs[1:] + [None]): if nr == 0: #FIR uwagi = page.get_by_regex_in_rect(ur".*UWAGI\s*/\s*REMARKS.*", 0, desig.y2, 100, 100, re.DOTALL)[0] coords = page.get_lines2( page.get_partially_in_rect(0, desig.y2 + 0.5, desig.x2 + 10, uwagi.y1 - 0.5)) raw = "\n".join(coords) #print "Raw:\n",raw d = md5.md5(raw.encode('utf8')).hexdigest() assert d == "f336800a8183f1360415d2afef38e9ae" #print "Md5-digest",d #/further along the state border to the point 54°36’14.03”N 019°24’15.02”E - raw = fixup(u""" 54°27’28.03”N 019°38’24.05”E - 54°36’14.03”N 019°24’15.02”E - 55°50’58.98”N 017°32’52.80”E - 54°54’58.84”N 015°51’52.92”E - 54°55’00.00”N 015°08’07.00”E - /from this point the arc of 30 km radius centred at point 55°04’04”N 014°44’48”E - 54°55’00”N 014°21’27”E - 54°07’38”N 014°15’17”E - 54°07’34”N 014°12’05”E - 53°59’16”N 014°14’32”E - 53°55’40”N 014°13’34”E - <hack_longway_around_border>/further along the state border to the point 542615N 0194751E """) ##print "rw:",raw fir = mapper.parse_coord_str(raw, context='poland') fir_context = [ fir ] #In principle, a FIR could consist of multiple non-overlapping regions. 
In this case, the list here would contain more than one list of points #print fir #sys.exit(1) spaces.append( dict(points=fir, name="WARSZAWA FIR", icao="EPWW", floor="GND", ceiling="-", freqs=[], type="FIR", date=date)) continue areas = page.get_partially_in_rect(50, desig.y1 - 3, 100, desig.y1 - 0.5) #print "partially: <%s>"%(areas,) if len(areas) == 0: #print "Found continuation of area:",area pass else: lines = [] for s in reversed(page.get_lines2(areas)): if s.y1 >= desig.y1: break if re.match("\d+ \w{3} 2[01]\d{2}", s): break if re.match(ur"\s*AIP\s*POLAND\s*", s): #not real area. break if s.count("Responsibility boundary within SECTOR"): lines = [] #not real area name break m = re.match(".*\d+\.?\d*\s*([\w\s()]+)\s*$", s, re.UNICODE) if m: print "matched name", s, "as: <%s>" % (m.groups()) lines = [m.groups()[0]] break lines.append(s.strip()) if len(lines) == 0: pass #print "Continuation of area:",area else: area = " ".join(lines) print "areastr:", area print "Parsing area\n-------------------------------------------------\n\n", area uwagis = page.get_by_regex_in_rect(ur".*UWAGI/REMARKS.*", 0, desig.y2 + 1, 100, 100, re.DOTALL) y2 = 100 if len(uwagis): #print "Uwagi y1:",uwagis[0].y1 y2 = min(uwagis[0].y1 - 0.1, y2) if next: y2 = min(next.y1, y2) #print "next.y1",next.y1 #print "End of desig",y2 #print desig units = page.get_by_regex_in_rect(ur".*UNIT PROVIDING.*", desig.x2, desig.y1, 100, desig.y2, re.DOTALL) if len(units) == 0: continue unit, = units vertlim, = page.get_by_regex_in_rect(ur".*VERTICAL LIMITS.*", desig.x2, desig.y1, 100, desig.y2, re.DOTALL) freq, = page.get_by_regex_in_rect(ur".*FREQUENCY.*", desig.x2, desig.y1, 100, desig.y2, re.DOTALL) #print "Looking in ",desig.y2+0.5,y2 desigs = page.get_partially_in_rect(0, desig.y2 + 0.5, desig.x2 + 1, y2 - 0.8) #print "desigs,",repr(desigs) """ def clump(desigs): out=[] y1=1e30 y2=None for desig in desigs: if y2!=None: delta=desig.y1-y2 if delta> y1=min(desig.y1,y1) y2=max(desig.y2,y2) 
out.append(desig.text) """ #last_curfreq=None #out=[] if re.match(ur".*ATS\s*SERVICES\s*DELEGATION.*", area): break raws = [] found_x1 = None for sub in desigs: #print "\n\n-> y2",y2," cur sub:",sub.y1 if sub.y1 >= y2: break wholerow = page.get_lines2( page.get_partially_in_rect(0, sub.y1 + 0.25, 100, sub.y2 - 0.25)) wholerowstr = " ".join(wholerow) #print "Parse:<%s>"%(wholerowstr,) if re.match(ur".*\d+\.\d+\s+[\w\s*]+CONTROL\s*AREA\s*$", wholerowstr, re.UNICODE): break if re.match(ur".*\d+\s+ATS\s*SERVICES\s*DELEGATION.*", wholerowstr, re.UNICODE): break
#print "freqname Matched:",line fname = line.strip() break if not fname: raise Exception("Found no frequency name for freq: " + freq) freqs.append((fname, float(freq))) if len(freqs): break (ceiling, ceilingy), (floor, floory) = verts assert ceilingy < floory assert floory - ceilingy < 5.0 #uprint("Analyzing area for %s"%(name,)) assert "".join(areaspec).strip() != "" print areaspec area = mapper.parse_coord_str("".join(areaspec)) #uprint("Done analyzing %s"%(name,)) #print area if name.count("CTA") and name.count("TMA") == 0: type_ = "CTA" else: type_ = "TMA" out.append( dict(floor=floor, ceiling=ceiling, freqs=freqs, type=type_, name=name, points=area))
ur".*The\s*line\s*joining.*", text): continue if not seenreal and text.endswith("following points:"): continue if not seenreal and text == "points:": continue if text.endswith("E"): text = text + " - " seenreal = True coords.append(text) last = sub pass assert points == None coordstr = fixup(" ".join(coords)) print "Raw coords:", coordstr points = mapper.parse_coord_str(coordstr) assert ceiling assert floor assert ctrname spaces.append( dict(name=ctrname, points=points, type="CTR", ceiling=ceiling, floor=floor, freqs=freqs)) #not first page: assert points != None return dict( name=name,
def fi_parse_airfield(icao=None):
    """Parse one Finnish aerodrome from its eAIP AD-2 PDF.

    Fills the `ad` dict with icao/name/elev/pos/runways, registers charts
    and text documents, collects CTR/FIZ airspaces into `spaces` (with
    floor/ceiling matched from the "Vertical limits" lines) and TWR/ATIS
    frequencies into `freqs`.

    NOTE(review): no return statement is visible in this chunk; the
    function presumably returns ad/spaces/freqs further down -- confirm
    against the full file.
    """
    spaces=[]
    ad=dict()
    assert icao!=None
    ad['icao']=icao
    sigpoints=[]
    #https://ais.fi/ais/eaip/pdf/aerodromes/EF_AD_2_EFET_EN.pdf
    #https://ais.fi/ais/eaip/aipcharts/efet/EF_AD_2_EFET_VAC.pdf
    #vacp=parse.Parser("/ais/eaip/aipcharts/%s/EF_AD_2_%s_VAC.pdf"%(icao.lower(),icao),lambda x: x,country="fi")
    def remove_italics(x):
        # Strip <i> markup so regexes see plain text.
        return x.replace("<i>","").replace("</i>","")
    p=parse.Parser("/ais/eaip/pdf/aerodromes/EF_AD_2_%s_EN.pdf"%(icao,),remove_italics,country="fi")
    #The following doesn't actually work, since finnish VAC are bitmaps!!! :-(
    if 0:
        # Dead code kept for reference: would extract reporting points
        # from the VAC, but Finnish VAC PDFs are bitmaps with no text.
        vacpage=vacp.parse_page_to_items(0)
        repp=vacpage.get_by_regex("\s*REPORTING\s*POINTS\s*")
        assert len(repp)>0
        for item in repp:
            lines=iter(page.get_lines(page.get_partially_in_rect(item.x1,item.y2+0.1,100,100)))
            for line in lines:
                uprint("Looking for reporting points:%s"%(line,))
                name,lat,lon=re.match(ur"([A-ZÅÄÖ\s ]{3,})\s*([ \d]+N)\s*([ \d]+E).*",line)
                sigpoints.append(dict(
                    name=icao+" "+name.strip(),
                    kind="reporting",
                    pos=mapper.parse_coords(lat.replace(" ",""),lon.replace(" ",""))))
    page=p.parse_page_to_items(0)
    # Aerodrome name from the "EFXX - NAME" title (large font only).
    nameregex=ur"%s\s+-\s+([A-ZÅÄÖ\- ]{3,})"%(icao,)
    for item in page.get_by_regex(nameregex):
        #print "fontsize:",item.fontsize
        assert item.fontsize>=14
        ad['name']=re.match(nameregex,item.text).groups()[0].strip()
        break
    # Field elevation from the "ELEV / REF" row.
    for item in page.get_by_regex(ur".*ELEV\s*/\s*REF.*"):
        lines=page.get_lines(page.get_partially_in_rect(0,item.y1+0.1,100,item.y2-0.1))
        for line in lines:
            print "Line:",line
            ft,=re.match(".*ELEV.*([\d\.]+)\s*FT.*",line).groups()
            assert not 'elev' in ad
            ad['elev']=float(ft)
    # ARP position from the Finnish "Mittapisteen ... sijainti" row.
    for item in page.get_by_regex(ur"Mittapisteen.*sijainti"):
        lines=page.get_lines(page.get_partially_in_rect(item.x1,item.y1,100,item.y2))
        for line in lines:
            for crd in mapper.parsecoords(line):
                assert not ('pos' in ad)
                ad['pos']=crd
    # Register the standard chart set (ADC/VAC/LDG/APDC) for this field.
    parse_landing_chart.help_plc(ad,
        "/ais/eaip/aipcharts/%s/EF_AD_2_%s_ADC.pdf"%(icao.lower(),icao.upper()),
        icao,ad['pos'],country='fi'
        )
    parse_landing_chart.help_plc(ad,
        "/ais/eaip/aipcharts/%s/EF_AD_2_%s_VAC.pdf"%(icao.lower(),icao.upper()),
        icao,ad['pos'],country='fi',variant='VAC'
        )
    parse_landing_chart.help_plc(ad,
        "/ais/eaip/aipcharts/%s/EF_AD_2_%s_LDG.pdf"%(icao.lower(),icao.upper()),
        icao,ad['pos'],country='fi',variant='landing'
        )
    parse_landing_chart.help_plc(ad,
        "/ais/eaip/aipcharts/%s/EF_AD_2_%s_APDC.pdf"%(icao.lower(),icao.upper()),
        icao,ad['pos'],country='fi',variant='parking'
        )
    aip_text_documents.help_parse_doc(ad,"/ais/eaip/pdf/aerodromes/EF_AD_2_%s_EN.pdf"%(icao.upper(),),
        icao,"fi",title="General Information",category="general")
    ad['runways']=[]
    thrs=[]
    freqs=[]
    for pagenr in xrange(p.get_num_pages()):
        page=p.parse_page_to_items(pagenr)
        if page==None: continue
        # --- Runway thresholds ---
        for item in page.get_by_regex("\s*RUNWAY\s*PHYSICAL\s*CHARACTERISTICS\s*"):
            lines=page.get_lines(page.get_partially_in_rect(0,item.y2+0.1,100,100))
            for line in lines:
                if re.match(ur"AD\s+2.13",line): break   # next AIP section
                m=re.match(ur".*?(RWY END)?\s*\*?(\d{6}\.\d+N)\s*(\d{6,7}\.\d+E).*",line)
                if not m:continue
                rwyend,lat,lon=m.groups()
                # Runway designator is in the leftmost column of the row.
                rwytxts=page.get_lines(page.get_partially_in_rect(0,line.y1,12,line.y2))
                print "Rwytxts:",rwytxts
                rwytxt,=rwytxts
                uprint("rwytext:",rwytxt)
                rwy,=re.match(ur"\s*(\d{2}[LRCM]?)\s*[\d.]*\s*",rwytxt).groups()
                have_thr=False
                for thr in thrs:
                    if thr['thr']==rwy: have_thr=True
                # Prefer the threshold coordinate over the "RWY END" one.
                if rwyend!=None and have_thr: continue
                thrs.append(dict(pos=mapper.parse_coords(lat,lon),thr=rwy))
        # --- ATS airspace (CTR/FIZ) ---
        for item in page.get_by_regex("ATS AIRSPACE"):
            lines=iter(page.get_lines(page.get_partially_in_rect(0,item.y2+0.1,100,100)))
            spaces=[]
            line=lines.next()
            while True:
                while line.strip()=="":
                    line=lines.next()
                print "Read line:",line
                if line.count("Vertical limits"):
                    break
                # Zone name: either on the "Designation and lateral limits"
                # header line or on a bare "<name> CTR|FIZ:" line.
                m=re.match(ur".*?/\s+Designation and lateral limits\s*(.*\b(?:CTR|FIZ)\b.*?)\s*:?\s*$",line)
                if not m:
                    m=re.match(ur"\s*(.*\b(?:CTR|FIZ)\b.*?)\s*:",line)
                    #print "Second try:",m
                spacename,=m.groups()
                #print "Got spacename:",spacename
                assert spacename.strip()!=""
                coords=[]
                # Collect coordinate lines until something that is clearly
                # not part of a lateral-limits description.
                while True:
                    line=lines.next()
                    print "Further:",line
                    if line.count("Vertical limits"):
                        print "Breaking"
                        break
                    if not re.search(ur"[\d ]+N\s*[\d ]+E",line) and \
                       not re.search(ur"circle|cent[red]{1,5}|pitkin|point|equal\s*to",line):
                        print "Breaking"
                        break
                    coords.append(line)
                areaspec="".join(coords)
                def fixup(m):
                    # Join space-broken coordinates: "61 1528N" -> "611528N".
                    lat,lon=m.groups()
                    return lat.replace(" ","")+" "+lon.replace(" ","")
                areaspec=re.sub(ur"([\d ]+N)\s*([\d ]+E)",fixup,areaspec)
                areaspec=re.sub(ur"\(.*/\s*equal\s*to\s*Malmi\s*CTR\s*lateral\s*limits\)","",areaspec)
                #print "Fixed areaspec",areaspec
                #if icao=="EFKS":
                #    areaspec=areaspec.replace("6615 28N","661528N")
                #Error! REstriction areas!
                spaces.append(dict(
                    name=spacename,
                    type="CTR",
                    points=mapper.parse_coord_str(areaspec)))
                if line.count("Vertical limits"):
                    #print "Breaking"
                    break
            while not line.count("Vertical limits"):
                line=lines.next()
            #print "Matching veritcal limits--------------------------------"
            # Split combined "A/B" zone names into two spaces.
            oldspaces=spaces
            spaces=[]
            for space in oldspaces:
                if space['name'].count("/"):
                    a,b=space['name'].split("/")
                    spaces.append(dict(space,name=a.strip()))
                    spaces.append(dict(space,name=b.strip()))
                else:
                    spaces.append(space)
            missing=set([space['name'] for space in spaces])
            # Walk forward through the lines, matching "name: floor - ceiling"
            # for every space; try progressively looser name patterns.
            while True:
                for space in spaces:
                    for it in xrange(3):
                        cand=space['name']
                        if it==1:
                            if cand.count("CTR"): cand="CTR"
                            if cand.count("FIZ"): cand="FIZ"
                        if it==2:
                            if cand.count("CTR"): cand=r"CTR\s*/[\sA-Z]+"
                            if cand.count("FIZ UPPER"): cand="FIZ UPPER"
                            if cand.count("FIZ LOWER"): cand="FIZ LOWER"
                        m=re.match(ur".*%s\s*:([^,:-]*)\s*-\s*([^,:-]*)"%(cand,),line)
                        print "Matching ",cand," to ",line,"missing:",missing,m
                        if m: break
                    if len(spaces)==1 and not m:
                        # Single space: accept a bare "Vertical limits X - Y".
                        m=re.match(ur".*Vertical limits\s*(.*)\s*-\s*(.*)",line)
                    if m:
                        print "*****MATCH!!:::",m.groups()
                        for lim in m.groups():
                            assert lim.count(",")==0
                        space['floor'],space['ceiling']=m.groups()
                        missing.remove(space['name'])
                    #print "Missing:"
                    if len(missing)==0: break
                if len(missing)==0: break
                #print "Still missing:",missing
                line=lines.next()
        print "Parse f o n page",pagenr
        # --- TWR / ATIS frequencies ---
        for item2 in page.get_by_regex(ur".*ATS\s*COMMUNICATION\s*FACILITIES.*"):
            lines=page.get_lines(page.get_partially_in_rect(0,item2.y2+0.1,100,100))
            for line in lines:
                if line.count("RADIO NAVIGATION AND LANDING AIDS"): break
                print "Comm line:",line
                twr=re.match(ur"TWR.*(\d{3}\.\d{3})\b.*",line)
                if twr:
                    freqs.append(('TWR',float(twr.groups()[0])))
                atis=re.match(ur"ATIS.*(\d{3}\.\d{3})",line)
                if atis:
                    freqs.append(('ATIS',float(atis.groups()[0])))
continue kind, name = re.match("EP (TSA|TRA|TFR) ([\d\w]+)", tra.text).groups() def fix_coords(s): def fixer(m): a, b, c, d, e, f, g, h = m.groups() return "%02d%02d%02d%s %03d%02d%02d%s - " % ( int(a), int(b), int(c), d, int(e), int(f), int(g), h) return re.sub( ur"(\d{2,3})°(\d{2})'(\d{2})''([NS])\s*(\d{2,3})°(\d{2})'(\d{2})''([EW])", fixer, s) coordstr2 = fix_coords("".join(o)).rstrip().rstrip("-") print "COordstr:", coordstr2 spaces.append( dict(name="EP %s %s" % (kind, name), points=mapper.parse_coord_str(coordstr2, context="poland"), ceiling=ceiling, floor=floor, type="TSA", freqs=[])) return spaces if __name__ == '__main__': for space in ep_parse_tra(): print "space", space
coords.append(line) else: if line.count("SEKTOR"): subname = lines[0].strip() raw = " ".join(coords) def s(x): return x.replace(" ", ur"\s*") #raw=re.sub(s(ur"Linia łącząca następujące punkty : / The line joining the following points :? "), # "",raw) #print "raw area:<%s>"%(repr(raw),) points = mapper.parse_coord_str(raw, context="poland", fir_context=fir_context) if len(curvert) == 0: lastspace = spaces[-1] assert len(curunit) == 0 assert len(curfreq) == 0 lastspace['points'].extend(points) else: curvert_out = [] for cur in curvert: cur = cur.strip() if cur.endswith("C"): cur = cur[:-1].strip() if not cur: continue curvert_out.append(cur)
def parse_all_tma():
    """Parse all Swedish TMAs and ACC sectors from AIP ENR 2.1 PDF.

    Skips preamble pages until the "Terminal Control Areas" heading (or an
    ACC-sector page) is seen, parses each page as either "TMA" or "sector",
    and appends a hand-coded SWEDEN FIR polygon.  Returns the list of
    airspace dicts.
    """
    def fixgote(raw):
        #Fix illogical composition of Göteborg TMA description. 2010 04 02
        # Splits the single pdftoxml <text> element that contains both the
        # "Part of GÖTEBORG TMA" title and the first coordinate pair into
        # two separate <text> elements, so downstream line grouping works.
        # NOTE(review): fixgote is not passed to parse.Parser below in this
        # chunk, so it appears unused here -- confirm against the full file.
        did_replace=[0]
        def replacer(args):
            uprint(args.groups())
            y,x,w,h,font=args.groups()
            uprint(w,h)
            # Sanity-check that the layout still looks like the one this
            # workaround was written for.
            assert int(w)>=260 and int(w)<420
            assert int(h)>=6 and int(h)<=15
            f=float(w)/270.0   # scale factor relative to reference width
            x1=x
            y1=y
            w1=80
            h1=h
            x2=168*f
            y2=y
            w2=150*f
            h2=h
            did_replace[0]+=1
            repl="""<text top="%s" left="%s" width="%s" height="%s" font="%s">Part of GÖTEBORG TMA</text> <text top="%s" left="%s" width="%s" height="%s" font="%s">584558N 0122951E - 584358N 0130950E - </text>"""%(
                y1,x1,w1,h1,font,y2,x2,w2,h2,font)
            uprint("\n======================================\nReplacement:\n",repl)
            return repl
        raw=re.sub(r"""<text top="(\d+)" left="(\d+)" width="(\d+)" height="(\d+)" font="(\d+)">\s*Part of GÖTEBORG TMA 584558N 0122951E - 584358N 0130950E - </text>""",replacer,raw)
        assert did_replace[0]==1
        return raw
    p=parse.Parser("/AIP/ENR/ENR 2/ES_ENR_2_1_en.pdf")
    res=[]
    found=False
    last_sector=dict()   # carries sector state between consecutive pages
    for pagenr in xrange(0,p.get_num_pages()):
        page=p.parse_page_to_items(pagenr)
        #print "Num acc-sec:",len(page.get_by_regex(r".*ACC.sectors.*"))
        #print "Num and acc-sec:",len(page.get_by_regex(r".*and\s+ACC.sectors.*"))
        # A page is a sector page if it mentions "ACC sectors" but not
        # "and ACC sector" (the latter wording appears on TMA pages).
        sect=(len(page.get_by_regex(r".*ACC.sectors.*"))>0 and
              len(page.get_by_regex(r".*and\s+ACC.sector.*"))==0)
        #print "ACC-sector2:",sect
        if found or page.get_by_regex(r".*Terminal Control Areas.*") or sect:
            found=True
        else:
            continue
        #if sect:
        parsed=parse_page(p,pagenr,"TMA" if not sect else "sector",last_sector=last_sector)
        res.extend(parsed)
    # Hand-maintained FIR boundary.  "Along the common X/Y state boundary"
    # phrases are expanded by parse_coord_str into the border polyline.
    res.append(dict(
        name="SWEDEN FIR",
        icao="ESAA",
        floor='GND',
        ceiling='-',
        freqs=[],
        type='FIR',
        date=datetime(2011,4,9),
        points=mapper.parse_coord_str("""
            690336N 0203255E - Along the common X/Y state boundary to 653148N 0240824E -
            644100N 0225500E - 633700N 0213000E -
            632830N 0204000E - 631000N 0201000E -
            614000N 0193000E - 610000N 0191905E -
            601803N 0190756E - 601130N 0190512E -
            593346N 0195859E - 591524N 0203239E -
            590000N 0210000E - 573410N 0200900E -
            570000N 0195000E - 555100N 0173300E -
            545500N 0155200E - 545500N 0150807E -
            clockwise along an arc centred on 550404N 0144448E and with radius 16.2 NM -
            545500N 0142127E - 545500N 0125100E -
            552012N 0123827E - Along the common X/Y state boundary to 561253N 0122205E -
            583000N 0103000E - 584540N 0103532E -
            585332N 0103820E - Along the common X/Y state boundary to 690336N 0203255E
            """,context="sweden")))
    for pa in res:
        pretty(pa)
    return res
def ep_parse_tma(): spaces=[] pages,date=miner.parse('/_Poland_EP_ENR_2_1_en.pdf', country='ep',usecache=True, maxcacheage=86400*7 ) for nr,page in enumerate(pages): #if nr!=1: continue #print "page",nr #print page.items desigs=page.get_by_regex(ur".*DESIGNATION AND LATERAL.*",re.DOTALL) for desig,next in izip(desigs,desigs[1:]+[None]): if nr==0: #FIR uwagi=page.get_by_regex_in_rect(ur".*UWAGI\s*/\s*REMARKS.*", 0,desig.y2,100,100,re.DOTALL)[0] coords=page.get_lines2(page.get_partially_in_rect( 0,desig.y2+0.5,desig.x2+10,uwagi.y1-0.5)) raw="\n".join(coords) #print "Raw:\n",raw d=md5.md5(raw.encode('utf8')).hexdigest() assert d=="f336800a8183f1360415d2afef38e9ae" #print "Md5-digest",d #/further along the state border to the point 54°36’14.03”N 019°24’15.02”E - raw=fixup(u""" 54°27’28.03”N 019°38’24.05”E - 54°36’14.03”N 019°24’15.02”E - 55°50’58.98”N 017°32’52.80”E - 54°54’58.84”N 015°51’52.92”E - 54°55’00.00”N 015°08’07.00”E - /from this point the arc of 30 km radius centred at point 55°04’04”N 014°44’48”E - 54°55’00”N 014°21’27”E - 54°07’38”N 014°15’17”E - 54°07’34”N 014°12’05”E - 53°59’16”N 014°14’32”E - 53°55’40”N 014°13’34”E - <hack_longway_around_border>/further along the state border to the point 542615N 0194751E """) ##print "rw:",raw fir=mapper.parse_coord_str(raw,context='poland') fir_context=[fir]#In principle, a FIR could consist of multiple non-overlapping regions. In this case, the list here would contain more than one list of points #print fir #sys.exit(1) spaces.append( dict( points=fir, name="WARSZAWA FIR", icao="EPWW", floor="GND", ceiling="-", freqs=[], type="FIR", date=date )) continue areas=page.get_partially_in_rect(50,desig.y1-3,100,desig.y1-0.5) #print "partially: <%s>"%(areas,) if len(areas)==0: #print "Found continuation of area:",area pass else: lines=[] for s in reversed(page.get_lines2(areas)): if s.y1>=desig.y1: break if re.match("\d+ \w{3} 2[01]\d{2}",s): break if re.match(ur"\s*AIP\s*POLAND\s*",s): #not real area. 
break if s.count("Responsibility boundary within SECTOR"): lines=[] #not real area name break m=re.match(".*\d+\.?\d*\s*([\w\s()]+)\s*$",s,re.UNICODE) if m: print "matched name",s,"as: <%s>"%(m.groups()) lines=[m.groups()[0]] break lines.append(s.strip()) if len(lines)==0: pass #print "Continuation of area:",area else: area=" ".join(lines) print "areastr:",area print "Parsing area\n-------------------------------------------------\n\n",area uwagis=page.get_by_regex_in_rect(ur".*UWAGI/REMARKS.*", 0,desig.y2+1,100,100,re.DOTALL) y2=100 if len(uwagis): #print "Uwagi y1:",uwagis[0].y1 y2=min(uwagis[0].y1-0.1,y2) if next: y2=min(next.y1,y2) #print "next.y1",next.y1 #print "End of desig",y2 #print desig units=page.get_by_regex_in_rect(ur".*UNIT PROVIDING.*", desig.x2,desig.y1,100,desig.y2,re.DOTALL) if len(units)==0: continue unit,=units vertlim,=page.get_by_regex_in_rect(ur".*VERTICAL LIMITS.*", desig.x2,desig.y1,100,desig.y2,re.DOTALL) freq,=page.get_by_regex_in_rect(ur".*FREQUENCY.*", desig.x2,desig.y1,100,desig.y2,re.DOTALL) #print "Looking in ",desig.y2+0.5,y2 desigs=page.get_partially_in_rect(0,desig.y2+0.5,desig.x2+1,y2-0.8) #print "desigs,",repr(desigs) """ def clump(desigs): out=[] y1=1e30 y2=None for desig in desigs: if y2!=None: delta=desig.y1-y2 if delta> y1=min(desig.y1,y1) y2=max(desig.y2,y2) out.append(desig.text) """ #last_curfreq=None #out=[] if re.match(ur".*ATS\s*SERVICES\s*DELEGATION.*",area): break raws=[] found_x1=None for sub in desigs: #print "\n\n-> y2",y2," cur sub:",sub.y1 if sub.y1>=y2: break wholerow=page.get_lines2(page.get_partially_in_rect(0,sub.y1+0.25,100,sub.y2-0.25)) wholerowstr=" ".join(wholerow) #print "Parse:<%s>"%(wholerowstr,) if re.match(ur".*\d+\.\d+\s+[\w\s*]+CONTROL\s*AREA\s*$",wholerowstr,re.UNICODE): break if re.match(ur".*\d+\s+ATS\s*SERVICES\s*DELEGATION.*",wholerowstr,re.UNICODE): break
break else: raise Exception("No limitstr") cstr=[] spacename=coordstr[0] assert spacename=="CTR" for sub in coordstr[1:]: cstr.append(sub.strip().rstrip(".")) def fixfunc(m): return "".join(m.groups()) raw=re.sub(ur"(\d{2,3})\s*(\d{2})\s*(\d{2})\s*([NSEW])", fixfunc, "".join(cstr)).replace(","," - ") print "parsing raw:",raw points=mapper.parse_coord_str(raw,context='lithuania') print "Limitstr",limitstr floor,ceiling=re.match(ur"(.*)\s*to\s*(.*)",limitstr).groups() mapper.parse_elev(floor) mapper.parse_elev(ceiling) spacenamestem=spacename.strip() if spacenamestem.endswith("CTR"): spacenamestem=spacenamestem[:-3].strip() if spacenamestem.endswith("FIZ"): spacenamestem=spacenamestem[:-3].strip() #construct names newfreqs=[] for serv,freq in freqs: serv=serv.strip()
if line.endswith("E"): line+=" - " coords.append(line) else: if line.count("SEKTOR"): subname=lines[0].strip() raw=" ".join(coords) def s(x): return x.replace(" ",ur"\s*") #raw=re.sub(s(ur"Linia łącząca następujące punkty : / The line joining the following points :? "), # "",raw) #print "raw area:<%s>"%(repr(raw),) points=mapper.parse_coord_str(raw,context="poland",fir_context=fir_context) if len(curvert)==0: lastspace=spaces[-1] assert len(curunit)==0 assert len(curfreq)==0 lastspace['points'].extend(points) else: curvert_out=[] for cur in curvert: cur=cur.strip() if cur.endswith("C"): cur=cur[:-1].strip() if not cur:continue curvert_out.append(cur) print "Raw curvert_out:",repr(curvert_out)
def find_areas(page):
    """Generator: locate coordinate-defined areas on one parsed PDF page.

    Scans *page* for lines containing latitude (ddmmss[N|S]) or longitude
    (dddmmss[E|W]) tokens, clusters vertically-adjacent matches into one
    "area" using line-spacing heuristics, extracts the coordinate string,
    and walks upward to find the area's name (text in a larger / bolder /
    italic font above the coordinate block).

    Yields tuples ``(areaname, coords, dict(y1=..., y2=...))`` where
    ``areaname`` may be None if no name-like line was found, ``coords`` is
    the result of the module-level ``parse_coord_str`` helper, and the dict
    gives the vertical extent of the area (name line included in y1).

    NOTE(review): ``page`` is assumed to be the project's parsed-PDF page
    object exposing get_by_regex / get_lines / get_partially_in_rect and
    items with x1/x2/y1/y2/font/fontsize/bold/italic/text attributes —
    grounded only by the calls below; confirm against the Parser class.
    """
    # All items whose text contains a lat (4-6 digits + N/S) or lon
    # (5-7 digits + E/W) token, in top-to-bottom, left-to-right order.
    areastarts = sorted(list(page.get_by_regex(r".*?\d{4,6}[NS].*")) +
                        list(page.get_by_regex(r".*?\d{5,7}[EW].*")),
                        key=lambda x: (x.y1, x.x1))
    #for area in areastarts:
    #    print "Area font:",area.fontsize,area.font,"bolditalic:",area.bold,area.italic
    #    print " - Area:",area.text
    print "Found %d area-lines on page" % (len(areastarts), )
    print areastarts
    if len(areastarts) == 0:
        return
    idx = 0
    cury = None
    # Outer loop: one iteration per clustered area; inner loop consumes
    # consecutive coordinate lines while their vertical spacing stays regular.
    while True:
        firstdiff = None  # spacing between the first two distinct rows; baseline for "regular" spacing
        process = []      # coordinate-line items belonging to the current area
        miny = None       # vertical extent of the cluster (later widened to include the name line)
        maxy = None
        while idx < len(areastarts):
            process.append(areastarts[idx])
            cury = areastarts[idx].y1
            if miny == None or maxy == None:
                miny = cury
                maxy = cury
            miny = min(areastarts[idx].y1, miny)
            maxy = max(areastarts[idx].y2, maxy)
            #print "Diff:",diff,"firstdiff:",firstdiff,"delta:",diff-firstdiff if diff!=None and firstdiff!=None else ''
            idx += 1
            if idx < len(areastarts):
                diff = areastarts[idx].y1 - cury
                # diff == 0 means same row (e.g. lat and lon tokens on one
                # line); only a real vertical step updates/checks spacing.
                if diff != 0:
                    if firstdiff == None:
                        firstdiff = diff
                    #print "Diff:",diff
                    if diff > 6.0:
                        #print "Diff too big"
                        break
                    # A step noticeably larger than the established line
                    # spacing ends the cluster (next area begins).
                    if firstdiff and diff > 1.35 * firstdiff:
                        #print "bad spacing",diff,1.5*firstdiff
                        break
        #print "Determined that these belong to one area:",process
        if len(process):
            alltext = "\n".join(page.get_lines(process))
            print "<%s>" % (alltext, )
            # Runs of adjacent lat/lon pairs; uses the module-level `re` import.
            anyarea = re.findall(r"((?:\d{4,6}[NS]\s*\d{5,7}[EW])+)", alltext,
                                 re.DOTALL | re.MULTILINE)
            print "Matching:"
            print anyarea
            if not len(anyarea):
                continue
            # Need at least 3 points to form a polygon.
            if len(anyarea) >= 3:
                coords = parse_coord_str(" - ".join(anyarea), filter_repeats=True)
                print "AREA:"
                print coords
                print "===================================="
                assert len(coords) >= 3
                coordfontsize = process[0].fontsize
                areaname = None
                # Walk upward from the first coordinate line; the first
                # non-empty item set in a larger/bolder/more-italic font than
                # the coordinates is taken as the area's name line.
                for item in reversed(
                        sorted(page.get_partially_in_rect(
                            0, 0, 100, process[0].y1),
                               key=lambda x: (x.y1, x.x1))):
                    if item.text.strip() == "":
                        continue
                    #print "fontsize",item.fontsize,item.text,"y1:",item.y1
                    if item.fontsize > process[
                            0].fontsize or item.bold > process[
                                0].bold or item.italic > process[0].italic:
                        assert item.y1 != None
                        # Extend the reported extent so it covers the name line too.
                        miny = min(item.y1, miny)
                        print "Found name: <%s>. Fonts: %d, %d, Fontsize: %s, old fontsize: %s" % (
                            item.text, item.font, process[0].font,
                            item.fontsize, process[0].fontsize)
                        prevx1 = item.x1
                        revname = []
                        # Collect items on the same row right-to-left,
                        # stopping at a horizontal gap > 3.0 units.
                        for nameitem in reversed(
                                sorted(page.get_partially_in_rect(
                                    0, item.y1 + 0.01, item.x2, item.y2 - 0.01),
                                       key=lambda x: (x.x1))):
                            if prevx1 - nameitem.x2 > 3.0:
                                break
                            revname.append(nameitem.text.strip())
                        areaname = " ".join(reversed(revname))
                        break
                yield (areaname, coords, dict(y1=miny, y2=maxy))
        if idx >= len(areastarts):
            break