def parse_stations(self, html):
    """Extract both direction tables from a line overview page.

    Returns a dict mapping direction name to a list of
    (station name, station URL or None) tuples.
    """
    soup = BeautifulSoup(html)
    tables = soup.findAll('table', {'class': 'show_fw'})
    directions = {}
    # exactly two direction tables are expected on the page
    for index in (0, 1):
        rows = tables[index].findAll('tr')
        heading = clean_text(rows[0].text.replace('Fahrtrichtung', ''))
        stations = []
        # rows[2:-1]: skip the two leading header rows and the trailing
        # row -- assumed page layout, confirm against a live page
        for row in rows[2:-1]:
            anchor = row.a
            if anchor:
                stations.append((clean_text(anchor.text),
                                 defaults.base_url + anchor['href']))
            else:
                stations.append((clean_text(row.text), None))
        directions[heading] = stations
    return directions
def parse_lines(self, html):
    """ Parse lines from html """
    soup = BeautifulSoup(html)
    result = {}
    for cell in soup.findAll('td', {'class': 'auswahl'}):
        anchor = cell.a
        if not anchor:
            # cells without a link carry no line -> skip
            continue
        query = anchor['href'].split('?', 1)[-1]
        target = defaults.station_base + query
        # label is the cell text, falling back to the image alt text
        if cell.text:
            result[cell.text] = target
        elif cell.img:
            result[cell.img['alt']] = target
    return result
class sParser:
    """ Parser for search response """

    def __init__(self, html):
        self.soup = BeautifulSoup(html)

    def check_page(self):
        """Classify the page: RESULT, CORRECTION or UNKNOWN."""
        if self.soup.find('form', {'id': 'form_efaresults'}):
            return PageType.RESULT
        if self.soup.find('div', {'class': 'form_error'}):
            return PageType.CORRECTION
        return PageType.UNKNOWN

    state = property(check_page)

    def get_correction(self):
        """Return the correction suggestions offered by the page.

        Possible keys (present only if the matching <select> exists):
        'origin', 'destination', 'place_origin', 'place_destination';
        each maps to a list of suggested names.

        Raises ParserError when no suggestion list is found.
        """
        names_origin = self.soup.find('select', {'id': 'nameList_origin'})
        names_destination = self.soup.find('select', {'id': 'nameList_destination'})
        places_origin = self.soup.find('select', {'id': 'placeList_origin'})
        places_destination = self.soup.find('select', {'id': 'placeList_destination'})

        if not any([names_origin, names_destination,
                    places_origin, places_destination]):
            raise ParserError('Unable to parse html')

        def option_texts(select):
            # visible text of every <option> inside a <select>
            return [option.text for option in select.findAll('option')]

        corrections = {}
        if names_origin:
            corrections['origin'] = option_texts(names_origin)
        if names_destination:
            corrections['destination'] = option_texts(names_destination)
        # BUGFIX: the two place entries previously read the *name* lists
        # (names_origin / names_destination) and place_destination was
        # even guarded by names_destination.
        if places_origin:
            corrections['place_origin'] = option_texts(places_origin)
        if places_destination:
            corrections['place_destination'] = option_texts(places_destination)
        return corrections

    def get_result(self):
        """Wrap the current page in an rParser for routing results."""
        return rParser(str(self.soup))
def parse_departures(self, html): bs = BeautifulSoup(html) dep = [] # Check for error messages msg = bs.findAll('span', {'class': 'rot fett'}) if msg and len(msg) > 0 and unicode(msg[0].text).find(u'technischen St') > 0: print '\n'.join(map(lambda x: x.text.replace(' ', ''), msg)) return [] errtable = bs.find('table', {'class':'errortable'}) if errtable and clean_text(errtable.text): print "Errortable found" print errtable.text return [] if bs.table and bs.table.tr: st_td = bs.table.tr.findAll('td') if st_td: station = clean_text(st_td[-1].text) else: print "Unexpected Error: Stationname not found" print "Debug:", st_td.encode('UTF-8') else: print "Unexpected Error: table or tr not found" print bs return [] # zusatztext crap zt = bs.find('td', {'class':'zusatztext'}) if zt: ma = ZUSATZTEXT_REGEX.search(zt.text) if ma: line = ma.group(1) direction = ma.group(2) if direction == direction.upper(): direction = direction.capitalize() tim = int(ma.group(3)) d = Departure(line=line, direction=direction, lowfloor=True, station=station, time=tim) dep.append(d) else: print zt.text table = bs.find('table', {'class':'imagetable'}) if not table: print "table not found" return [] if errtable: print "Warning: Empty errortable found" return dep trs = table.findAll('tr') for tr in trs[1:]: tds = tr.findAll('td') line = clean_text(tds[0].text) direction = clean_text(tds[1].text) if direction.startswith(line): direction = direction.lstrip(line).strip() if direction == direction.upper(): direction = direction.capitalize() lf_img = tds[-1].img lowfloor = lf_img and lf_img.has_key('alt') d = {'line': line, 'direction': direction, 'lowfloor': lowfloor, 'station': station} # parse time tim = clean_text(tds[2].text) dts = DELTATIME_REGEX.search(tim) abs = ABSTIME_REGEX.search(tim) if tim.find(u'...in K\xfcrze') >= 0: d['time'] = 0 elif abs: d['time'] = calc_datetime(abs.group(1)) elif tim.isdigit(): d['time'] = int(tim) elif dts: # is timedelta d['time'] = int(dts.group(1)) else: print 
"Error parsing time:", tim continue dep.append(Departure(**d)) return dep
def __init__(self, html):
    """Build the soup once; results are parsed lazily on access."""
    self.soup = BeautifulSoup(html)
    # caches for the lazily computed properties
    self._details = None
    self._overview = None
class rParser:
    """ Parser for routing results """

    def __init__(self, html):
        self.soup = BeautifulSoup(html)
        self._overview = None
        self._details = None

    @classmethod
    def get_tdtext(cls, x, cl):
        """Text of the first <td> with CSS class `cl` inside row `x`."""
        return x.find('td', {'class': cl}).text

    @classmethod
    def get_change(cls, x):
        """Number of changes of a connection row (0 when empty)."""
        y = rParser.get_tdtext(x, 'col_change')
        if y:
            return int(y)
        return 0

    @classmethod
    def get_price(cls, x):
        """Price of a connection row as float; 0.0 when unknown ('*')."""
        y = rParser.get_tdtext(x, 'col_price')
        if y == '*':
            return 0.0
        # BUGFIX: the old test `if y.find(',')` was truthy for -1
        # (comma absent -> float('...') crashed) and falsy for a
        # leading comma; test membership instead.
        if ',' in y:
            return float(y.replace(',', '.'))
        return 0.0

    @classmethod
    def get_date(cls, x):
        """Date of a connection row, or None when the cell is empty."""
        y = rParser.get_tdtext(x, 'col_date')
        if y:
            return datetime.strptime(y, '%d.%m.%Y').date()
        return None

    @classmethod
    def get_datetime(cls, x):
        """Departure/arrival of a row as [from_dt, to_dt], [] on failure."""
        y = rParser.get_tdtext(x, 'col_time')
        if not y:
            return []
        if y.find("-") > 0:
            # overview mode: "HH:MM-HH:MM" plus a separate date column
            times = [time(*map(int, part.split(':'))) for part in y.split('-')]
            d = rParser.get_date(x)
            from_dtime = datetime.combine(d, times[0])
            if times[0] > times[1]:
                # dateline crossing: arrival is on the next day
                to_dtime = datetime.combine(d + timedelta(1), times[1])
            else:
                to_dtime = datetime.combine(d, times[1])
            return [from_dtime, to_dtime]
        # detail mode: optional "dd.mm" dates, two "hh:mm" times
        dtregex = {'date': r'\d\d\.\d\d', 'time': r'\d\d:\d\d'}
        regex = r"\s*(?P<date1>{date})?\s*(?P<time1>{time})\s*(?P<date2>{date})?\s*(?P<time2>{time})\s*".format(**dtregex)
        ma = re.match(regex, y)
        if not ma:
            return []
        gr = ma.groupdict()

        def extract_datetime(gr, n):
            # one endpoint; n is 1 (departure) or 2 (arrival)
            if 'date%d' % n in gr and gr['date%d' % n]:
                if gr['time%d' % n] == '24:00':
                    gr['time%d' % n] = '0:00'
                # BUGFIX: this branch built the datetime but never
                # returned it, so the caller silently got None.
                return datetime.strptime(
                    str(datetime.today().year) + gr['date%d' % n] + gr['time%d' % n],
                    '%Y%d.%m.%H:%M')
            d = datetime.today().date()
            # Strange times possible at wienerlinien: 24:00 == next day 0:00
            if gr['time%d' % n] == '24:00':
                gr['time%d' % n] = '0:00'
                d += timedelta(days=1)
            t = datetime.strptime(gr['time%d' % n], '%H:%M').time()
            return datetime.combine(d, t)

        from_dtime = extract_datetime(gr, 1)
        to_dtime = extract_datetime(gr, 2)
        return [from_dtime, to_dtime]

    def __iter__(self):
        # BUGFIX: `details` is a property, not a method -- calling the
        # returned list raised TypeError.
        for detail in self.details:
            yield detail

    def _parse_details(self):
        """Parse every 'tourdetail' table into per-trip step dicts."""
        tours = self.soup.findAll('div', {'class': 'data_table tourdetail'})
        trips = [
            [
                {
                    'timespan': rParser.get_datetime(row),
                    # only NavigableString children carry the text;
                    # station names drop a 2-char bullet prefix
                    'station': [s[2:].strip()
                                for s in row.find('td', {'class': 'col_station'}).contents
                                if type(s) == NavigableString],
                    'info': [s.strip()
                             for s in row.find('td', {'class': 'col_info'}).contents
                             if type(s) == NavigableString],
                }
                for row in tour.find('tbody').findAll('tr')
            ]
            for tour in tours
        ]  # all routes
        return trips

    @property
    def details(self):
        """returns list of trip details
        [
          [ { 'timespan': [datetime, datetime] if time else [],
              'station': [u'start', u'end'] if station else [],
              'info': [u'start station' if station else u'details for walking',
                       u'end station' if station else u'walking duration'] },
            ... # next trip step
          ],
          ... # next trip possibility
        ]
        """
        if not self._details:
            self._details = self._parse_details()
        return self._details

    def _parse_overview(self):
        """Parse the overview table (#tbl_fahrten) into row dicts."""
        table = self.soup.find('table', {'id': 'tbl_fahrten'})
        # check if there is an overview table
        if table and table.findAll('tr'):
            rows = table.findAll('tr')[1:]  # cut off headline
            overview = [{'timespan': rParser.get_datetime(row),
                         'change': rParser.get_change(row),
                         'price': rParser.get_price(row),
                         } for row in rows]
        else:
            raise ParserError('Unable to parse overview')
        return overview

    @property
    def overview(self):
        """list of dicts, one per connection:
        timespan: [datetime, datetime]
        change: int
        price: float
        """
        if not self._overview:
            try:
                self._overview = self._parse_overview()
            except AttributeError:
                # dump the page for post-mortem debugging
                with open(DEBUGLOG, 'w') as f:
                    f.write(str(self.soup))
        return self._overview
def __init__(self, html):
    # Parse the raw HTML once; all queries go through self.soup.
    self.soup = BeautifulSoup(html)