def parse_url(url, today=False, canteentype="Mittagsmensa", this_week="", next_week=True, legend_url=None): canteen = LazyBuilder() canteen.legendKeyFunc = lambda v: v.lower() if not legend_url: legend_url = url[: url.find("essen/") + 6] + "wissenswertes/lebensmittelkennzeichnung" legend_doc = parse(urlopen(legend_url)).find(id="artikel") allergene = buildLegend( text=legend_doc.text.replace("\xa0", " "), regex=r"(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)" ) allergene["EI"] = "Ei" zusatzstoffe = buildLegend( text=legend_doc.text.replace("\xa0", " "), regex=r"(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)" ) for tr in legend_doc.find_all("tr"): tds = tr.find_all("td") if len(tds) != 2: continue title = tds[0].find("strong") if title is None: continue else: title = title.text text = tds[1].text.replace("enthält", "").strip() if title.isdigit(): zusatzstoffe[title] = text else: allergene[title] = text parse_week(url + this_week, canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe) if not today and next_week is True: parse_week(url + "-kommende-woche", canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe) if not today and type(next_week) is str: parse_week(url + next_week, canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe) print(canteen.toXMLFeed()) return canteen.toXMLFeed()
def metadata(self, request):
    meta = LazyBuilder(version=self.parser.version)
    meta.feeds.append(Feed(
        name='today', hour='8-14',
        url='/'.join([request.host, self.parser.name, self.name, 'today.xml']),
        priority=0, source=None, dayOfMonth='*', dayOfWeek='*',
        minute='0', retry=None
    ))
    meta.feeds.append(Feed(
        name='full', hour='8',
        url='/'.join([request.host, self.parser.name, self.name, 'full.xml']),
        priority=0, source=None, dayOfMonth='*', dayOfWeek='*',
        minute='0', retry=None
    ))
    return meta.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    parse_week(url + '.html', canteen)
    if not today:
        parse_week(url + '-w1.html', canteen)
        parse_week(url + '-w2.html', canteen)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    content = urlopen(url).read()
    document = parse(content, 'lxml')
    available_weeks = parse_available_weeks(document)
    # for the case that the start date is not auto set by the page, e.g. on weekends
    noskip = find_start_date(document) is None
    employees_fee, guests_fee = parse_fees(document)
    groups = parse_ingredients(document)
    for idx, week in enumerate(available_weeks):
        if idx > 0 or noskip:
            content = urlopen("{}?selWeek={}".format(url, week)).read()
            document = parse(content, 'lxml')
        parse_meals_for_canteen(document, canteen, employees_fee, guests_fee,
                                groups, today)
        if today:
            break
    return canteen.toXMLFeed()
def feed_all(self, name):
    canteen = LazyBuilder()
    date = self.__now()

    # Get this week
    lastWeekday = -1
    while self.handler(canteen, self.xml2locId[name], date.date()):
        date += datetime.timedelta(days=1)
        if lastWeekday > date.weekday():
            break
        lastWeekday = date.weekday()

    # Skip over weekend
    if date.weekday() > 4:
        date += datetime.timedelta(days=7 - date.weekday())

    # Get next week
    lastWeekday = -1
    while self.handler(canteen, self.xml2locId[name], date.date()):
        date += datetime.timedelta(days=1)
        if lastWeekday > date.weekday():
            break
        lastWeekday = date.weekday()

    return canteen.toXMLFeed()
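# A minimal, self-contained sketch of the week-walk termination used by
# feed_all above: walk forward one day at a time and stop once the weekday
# number wraps around, i.e. the walk crossed into a new week. The stub
# handler below is hypothetical and always reports data, so the wrap check
# alone ends the walk.
import datetime

def walk_week(start, handle_day):
    date = start
    last_weekday = -1
    while handle_day(date):
        date += datetime.timedelta(days=1)
        if last_weekday > date.weekday():
            break  # weekday decreased: a new week has begun
        last_weekday = date.weekday()
    return date

print(walk_week(datetime.date(2024, 1, 3), lambda d: True))  # 2024-01-08, the next Monday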
def parse_url(url, today=False):
    canteen = LazyBuilder()
    parse_week(url + '.html?view=list', canteen)
    if not today:
        parse_week(url + '-w1.html?view=list', canteen)
        parse_week(url + '-w2.html?view=list', canteen)
    return canteen.toXMLFeed()
def parse_url(url, mensa, *weeks, today):
    canteen = LazyBuilder()
    for week in weeks:
        parse_week(url + week, canteen, mensa)
        if today:
            break
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {'f': 'fleischloses Gericht', 'v': 'veganes Gericht'}
    document = parse(urlopen(base + '/speiseplan/zusatzstoffe-de.html').read())
    for td in document.find_all('td', 'beschreibung'):
        legend[td.previous_sibling.previous_sibling.text] = td.text
    document = parse(urlopen(base + '/unsere-preise/').read())
    prices = {}
    for tr in document.find('table', 'essenspreise').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) \
                or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')
    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read())
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.date.resolution
                continue
            else:
                raise e
        else:
            errorCount = 0
        for tr in document.find('table', 'zusatzstoffe').find_all('tr'):
            identifier = tr.find_all('td')[0].text \
                .replace('(', '').replace(')', '')
            legend[identifier] = tr.find_all('td')[1].text.strip()
        canteen.setLegendData(legend)
        mensa_data = document.find('table', 'menu')
        category = None
        for menu_tr in mensa_data.find_all('tr'):
            if menu_tr.find('td', 'headline'):
                continue
            if menu_tr.find('td', 'gericht').text:
                category = menu_tr.find('td', 'gericht').text
            data = menu_tr.find('td', 'beschreibung')
            name = data.find('span').text.strip()
            notes = [span['title'] for span in data.find_all('span', title=True)]
            canteen.addMeal(
                date, category, name, notes,
                prices.get(category.replace('Aktionsessen', 'Bio-/Aktionsgericht'), {})
            )
        date += datetime.date.resolution
        if today:
            break
    return canteen.toXMLFeed()
def parse_url(url, today=False, canteentype='Mittagsmensa', this_week='',
              next_week=True, legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    if not legend_url:
        legend_url = url[:url.find('essen/') + 6] + 'wissenswertes/lebensmittelkennzeichnung'
    legend_doc = parse(urlopen(legend_url), 'lxml').find(id='artikel')
    allergene = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)')
    allergene['EI'] = 'Ei'
    zusatzstoffe = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)')
    suballergene = re.compile(
        r'(?P<name>[0-9A-Z]+)[^a-zA-Z]*enthält (?P<value>\w+( |\t|\w)*)')
    for tr in legend_doc.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) != 2:
            continue
        title = tds[0].find('strong')
        if title is None:
            continue
        else:
            title = title.text
        lines = tds[1].text.split('\n')
        for line in lines[1:]:
            try_allergene = suballergene.match(line)
            if try_allergene:
                allergene[try_allergene.group('name')] = try_allergene.group('value')
        text = lines[0].replace('enthält', '').strip()
        if title.isdigit():
            zusatzstoffe[title] = text
        else:
            allergene[title] = text
    parse_week(url + this_week, canteen, canteentype,
               allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and next_week is True:
        parse_week(url + '-kommende-woche', canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and type(next_week) is str:
        parse_week(url + next_week, canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    try:
        xml_data = urlopen(url).read()
    except Exception:
        return canteen.toXMLFeed()
    root = ET.fromstring(xml_data)
    for day in root:
        date = time.strftime('%d.%m.%Y', time.localtime(int(day.get('timestamp'))))
        for item in day:
            title = item.find('title').text
            description = get_description(title)
            notes = build_notes_string(title)
            plist = [item.find('preis1').text,
                     item.find('preis2').text,
                     item.find('preis3').text]
            food_type = get_food_types(item.find('piktogramme').text)
            canteen.addMeal(date, food_type, description, notes, plist, roles)
    return canteen.toXMLFeed()
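# The feed parsed above marks each <day> element with a unix timestamp
# attribute; converting it to the DD.MM.YYYY form used by the parser is a
# one-liner. The sample value is made up and the result depends on the
# local timezone.
import time

print(time.strftime('%d.%m.%Y', time.localtime(1700000000)))  # e.g. 14.11.2023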
def parse_url(url, today=False):
    canteen = LazyBuilder()
    parse_week(url + (datetime.date.today()
                      + datetime.date.resolution * 7).strftime('/%Y/%W/'),
               canteen)
    if not today:
        parse_week(url + (datetime.date.today()
                          + datetime.date.resolution * 14).strftime('/%Y/%W/'),
                   canteen)
    return canteen.toXMLFeed()
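# A quick look at the URL scheme assumed above: datetime.date.resolution is
# one day, and '%W' is the zero-padded week number with Monday as the first
# day of the week, so the parser requests week-indexed pages. The base URL
# here is made up.
import datetime

base = 'https://example.org/menu'
for offset in (7, 14):
    day = datetime.date.today() + datetime.date.resolution * offset
    print(base + day.strftime('/%Y/%W/'))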
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read())
    for day_div in document.find_all('div', 'day') + \
            document.find_all('article', attrs={'data-day': True}):
        # parse date, warning: calculate year number needed
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date')
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = "{}-{}-{}".format(year, date_test.group('month'),
                                     date_test.group('day'))
        if 'nodata' in day_div.attrs.get('class', []) or 'GESCHLOSSEN' in day_div.text:
            canteen.setDayClosed(date)
            continue
        closed_candidate = False
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            if 'geschlossen' in name:
                closed_candidate = True
                continue
            category = meal_article.find('div')['title']
            notes = [v['title'] for v in meal_article.find_all('div', 'theicon')
                     if v['title']]
            if meal_article.find('div', 'additive'):
                notes += [v[0] for v in extra_regex.findall(
                    meal_article.find('div', 'additive').text)]
            price_div = meal_article.find('div', 'price')
            if price_div is None:
                canteen.addMeal(date, category, name, notes)
                continue
            prices = {}
            for v, r in (('default', 'student'), ('bed', 'employee'),
                         ('guest', 'other')):
                price = price_regex.search(price_div['data-' + v])
                if price:
                    prices[r] = price.group('price')
                elif v == 'default':
                    prices = {}
                    break
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    day = datetime.date.today()
    for _ in range(21):
        parse_day(canteen, '{}&date={}'.format(url, day.strftime('%Y-%m-%d')))
        if today:
            break
        day += datetime.timedelta(days=1)
    return canteen.toXMLFeed()
def feed(self, name):
    canteen = LazyBuilder()
    if name in self.xmlnames:
        parse_url(canteen, name)  # all categories
    else:
        xmlname_entry = [x for x in self.xmlnames if x[0] == name][0]
        parse_url(canteen, *xmlname_entry)  # only certain categories
    return canteen.toXMLFeed()
def parse_url(url, today):
    canteen = LazyBuilder()
    canteen.setAdditionalCharges('student', {})
    if today:
        parse_week(url, canteen)  # base url only contains current day
    else:
        parse_week(url + 'week', canteen)
        parse_week(url + 'nextweek', canteen)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    content = urlopen(url).read()
    document = parse(content, 'lxml')
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {int(v[0]): v[1]
                       for v in reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}
    canteen = LazyBuilder()
    for day_td in document.find_all('td', text=day_regex):
        date = day_regex.search(day_td.string).group('date')
        table = None
        for element in day_td.parents:
            if element.name == 'table':
                table = element
                break
        if not table:
            continue
        for tr in table.tbody.find_all('tr'):
            if 'geschlossen' in tr.text or 'Feiertage' in tr.text:
                match = day_range_regex.search(tr.text)
                if not match:
                    canteen.setDayClosed(date)
                else:
                    fromDate = datetime.datetime.strptime(match.group('from'), '%d.%m.%Y')
                    toDate = datetime.datetime.strptime(match.group('to'), '%d.%m.%Y')
                    while fromDate <= toDate:
                        canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                        fromDate += datetime.date.resolution
                continue
            if len(tr) != 2:
                continue  # no meal
            strings = list(tr.contents[0].strings)
            name = strings[0]
            # prices:
            prices = strings[-1].split('|')
            if '-' in map(lambda v: v.strip(), prices):
                prices = {}
            # notes:
            notes = []
            for img in tr.contents[1].find_all('img'):
                notes.append(img['alt'].replace('Symbol', '').strip())
            for extra in list(set(map(lambda v: int(v), extra_regex.findall(tr.text)))):
                if extra in extraLegend:
                    notes.append(extraLegend[extra])
            canteen.addMeal(date, 'Hauptgerichte', name, notes, prices,
                            roles if prices else None)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    base_data = load_base_data()
    canteen = LazyBuilder()
    with urlopen(url) as response:
        data = json.loads(response.read().decode())
    for day in data['days']:
        date = datetime.datetime.strptime(day['date'], UTC_DATE_STRING).date()
        if today and (datetime.date.today() != date):
            continue
        for counter in day['counters']:
            counter_name = counter['displayName']
            counter_description = counter['description']
            counter_hours = counter.get('openingHours')
            for meal in counter['meals']:
                if 'knownMealId' in meal:
                    # This is meant to allow recognizing recurring meals,
                    # for features like marking meals as favorites.
                    # Up to now, not really used in the mensaar.de API,
                    # nor functional in this API parser.
                    # The meal will still be recognized as every other meal.
                    print('knownMealId: %s' % meal['knownMealId'], file=sys.stderr)
                meal_name = meal['name']
                if 'category' in meal:
                    meal_name = '%s: %s' % (meal['category'], meal_name)
                meal_notes = (
                    # The description is typically the location
                    # (but not required to be by the API specification).
                    build_location(counter_description) +
                    build_hours(counter_hours) +
                    build_notes(base_data, meal['notices'], meal['components']))
                meal_prices = {}
                if 'prices' in meal:
                    prices = meal['prices']
                    for role in prices:
                        if role in ROLES:
                            meal_prices[base_data['roles'][role]] = prices[role]
                if 'pricingNotice' in meal:
                    meal_notes.append(meal['pricingNotice'])
                canteen.addMeal(date, counter_name, meal_name,
                                meal_notes, meal_prices)
    return canteen.toXMLFeed()
def parsePlan(url, internalMensaId, today):
    canteen = LazyBuilder()
    while url is not None:
        dom = BeautifulSoup(urlopen(url).read(), 'lxml')
        date = dom.select('#mensa_date > p')[0].contents[0]
        menuDefinition = dom.find(id=internalMensaId)
        menuDescription = menuDefinition.parent.find('dd')
        tables = menuDescription.select('table')
        legend = {}
        legend = buildLegend(
            legend, str(dom),
            regex=r'<strong>(?P<name>\w+)\s*</strong>\s*-\s*(?P<value>[\w\s)(]+)')
        if tables is not None and len(tables) == 1:
            table = tables[0]
            rows = table.find_all('tr')
            for row in rows:
                menuNameElement = row.select('td[class="mensa_col_55"] > b')
                if menuNameElement is not None and menuNameElement[0].contents is not None:
                    menuName = menuNameElement[0].contents[0]
                    category = 'Gericht'
                    # get notes
                    notes = []
                    notesElement = row.select('td[class="mensa_col_55"] > span')
                    if notesElement is not None and len(notesElement) > 0 \
                            and notesElement[0].text is not None:
                        notes = [legend.get(n, n)
                                 for n in notesElement[0].text.split(' ') if n]
                    # get prices
                    prices = {}
                    for td in row.select('td[class="mensa_col_15"]'):
                        priceElement = td.find('b')
                        groupElement = td.find('span')
                        if priceElement is not None and groupElement is not None \
                                and groupElement.contents and priceElement.contents:
                            group = str(groupElement.contents[0])
                            price = str(priceElement.contents[0])
                            if group == 'Stud.:':
                                prices['student'] = price
                            elif group == 'Bed.:':
                                prices['employee'] = price
                            elif group == 'Gast:':
                                prices['other'] = price
                    canteen.addMeal(date, category, menuName, notes, prices)
        else:
            canteen.setDayClosed(date)
        # check for further pages
        nextPageLink = dom.find(id='next_day_link')
        if nextPageLink is None or today:
            url = None
        else:
            url = 'https://www.studentenwerk-rostock.de/' + nextPageLink['href']
    return canteen.toXMLFeed()
def render_menu(menu):
    """Render the menu for a canteen into an OpenMensa XML feed.

    :param dict menu: the Python representation of the API JSON response
    :return: the XML feed as string
    """
    builder = LazyBuilder()
    if menu:
        for day in _active_days(menu):
            _process_day(builder, day)
    return builder.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read(), 'lxml')
    for day_div in document.find_all('div', attrs={'data-day': True}):
        # parse date, warning: calculate year number needed
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date "{}"'.format(day_div['data-day']))
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = '{}-{}-{}'.format(year, date_test.group('month'),
                                     date_test.group('day'))
        closed_candidate = day_div.find('div', 'holiday') is not None
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            category = meal_article.find('div', 'icon')['title']
            notes = []
            prices = {}
            additives = meal_article.find('div', 'additnr')
            if additives:
                notes += [additive.text for additive in additives.find_all('li')]
            notes += [v['title'] for v in meal_article.find_all('div', 'theicon')
                      if v['title'] and v['title'] not in notes]
            price_div = meal_article.find('div', 'price')
            if price_div:
                for k, v in price_map.items():
                    price = price_div['data-' + k]
                    if price:
                        prices[v] = price
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    day = datetime.date.today()
    emptyCount = 0
    while emptyCount < 7:
        if not parse_day(canteen,
                         '{}&day={}&month={}&year={}&limit=25'.format(
                             url, day.day, day.month, day.year),
                         day.strftime('%Y-%m-%d')):
            emptyCount += 1
        else:
            emptyCount = 0
        if today:
            break
        day += datetime.date.resolution
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    content = urlopen(url).read()
    document = parse(content)
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {int(v[0]): v[1]
                       for v in reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}
    canteen = LazyBuilder()
    for day_td in document.find_all('td', text=day_regex):
        date = day_regex.search(day_td.string).group('date')
        table = None
        for element in day_td.parents:
            if element.name == 'table':
                table = element
                break
        if not table:
            continue
        for tr in table.tbody.find_all('tr'):
            if 'geschlossen' in tr.text or 'Feiertage' in tr.text:
                match = day_range_regex.search(tr.text)
                if not match:
                    canteen.setDayClosed(date)
                else:
                    fromDate = datetime.datetime.strptime(match.group('from'), '%d.%m.%Y')
                    toDate = datetime.datetime.strptime(match.group('to'), '%d.%m.%Y')
                    while fromDate <= toDate:
                        canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                        fromDate += datetime.date.resolution
                continue
            if len(tr) != 3:
                continue  # no meal
            strings = list(tr.contents[0].strings)
            name = strings[0]
            # prices:
            prices = strings[-1].split('|')
            if '-' in map(lambda v: v.strip(), prices):
                prices = {}
            # notes:
            notes = []
            for img in tr.contents[1].find_all('img'):
                notes.append(img['alt'].replace('Symbol', '').strip())
            for extra in list(set(map(lambda v: int(v), extra_regex.findall(tr.text)))):
                if extra in extraLegend:
                    notes.append(extraLegend[extra])
            canteen.addMeal(date, 'Hauptgerichte', name, notes, prices,
                            roles if prices else None)
    return canteen.toXMLFeed()
def parse_url(url, today=False, canteentype='Mittagsmensa', this_week='',
              next_week=True, legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    if not legend_url:
        legend_url = url[:url.find('essen/') + 6] + 'lebensmittelkennzeichnung'
    legend_doc = parse(urlopen(legend_url))
    canteen.setLegendData(
        text=legend_doc.find(id='artikel').text,
        regex=r'(?P<name>(\d+|[A-Z]+))\s+=\s+(?P<value>\w+( |\t|\w)*)'
    )
    parse_week(url + this_week, canteen, canteentype)
    if not today and next_week is True:
        parse_week(url + '-kommende-woche', canteen, canteentype)
    if not today and type(next_week) is str:
        parse_week(url + next_week, canteen, canteentype)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read())
    for day_div in document.find_all('div', 'day') + \
            document.find_all('article', attrs={'data-day': True}):
        # parse date, warning: calculate year number needed
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date')
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = "{}-{}-{}".format(year, date_test.group('month'),
                                     date_test.group('day'))
        if 'nodata' in day_div.attrs.get('class', []) or 'GESCHLOSSEN' in day_div.text:
            canteen.setDayClosed(date)
            continue
        closed_candidate = False
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            if 'geschlossen' in name:
                closed_candidate = True
                continue
            category = meal_article.find('div', 'desc').text
            notes = [v['title'] for v in meal_article.find_all('div', 'theicon')
                     if v['title']]
            if meal_article.find('div', 'additive'):
                notes += [v[0] for v in extra_regex.findall(
                    meal_article.find('div', 'additive').text)]
            price_div = meal_article.find('div', 'price')
            if price_div is None:
                canteen.addMeal(date, category, name, notes)
                continue
            prices = {}
            for v, r in (('default', 'student'), ('bed', 'employee'),
                         ('guest', 'other')):
                price = price_regex.search(price_div['data-' + v])
                if price:
                    prices[r] = price.group('price')
                elif v == 'default':
                    prices = {}
                    break
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
def feed_all(self, name):
    canteen = LazyBuilder()
    date = self.__now()
    # Get this week
    while self.handler(canteen, name, date.date()):
        date += datetime.timedelta(days=1)
    # Skip over weekend
    if date.weekday() > 4:
        date += datetime.timedelta(days=7 - date.weekday())
    # Get next week
    while self.handler(canteen, name, date.date()):
        date += datetime.timedelta(days=1)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    day = datetime.date.today()
    emptyCount = 0
    totalCount = 0
    while emptyCount < 7 and totalCount < 32:
        if not parse_day(canteen,
                         '{}&day={}&month={}&year={}&limit=25'.format(
                             url, day.day, day.month, day.year),
                         day.strftime('%Y-%m-%d')):
            emptyCount += 1
        else:
            emptyCount = 0
        if today:
            break
        totalCount += 1
        day += datetime.date.resolution
    return canteen.toXMLFeed()
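# Several parsers above share this probing pattern: walk forward day by day
# and give up after a run of consecutive days without data, with a hard cap
# on the total number of requests. A self-contained sketch with a stub
# handler; the three-day cutoff in the stub is arbitrary.
import datetime

def probe_days(handle_day, max_empty=7, max_total=32):
    day = datetime.date.today()
    empty = total = 0
    while empty < max_empty and total < max_total:
        empty = 0 if handle_day(day) else empty + 1
        total += 1
        day += datetime.timedelta(days=1)
    return day

start = datetime.date.today()
print(probe_days(lambda d: (d - start).days < 3))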
def parse_url(url, today=False):
    global legend
    canteen = LazyBuilder()
    canteen.setLegendData(legend)
    day = datetime.date.today()
    emptyCount = 0
    totalCount = 0
    while emptyCount < 7 and totalCount < 32:
        if not parse_day(canteen,
                         '{}&tag={}&monat={}&jahr={}'.format(
                             url, day.day, day.month, day.year),
                         day.strftime('%Y-%m-%d')):
            emptyCount += 1
        else:
            emptyCount = 0
        if today:
            break
        totalCount += 1
        day += datetime.date.resolution
    return canteen.toXMLFeed()
def parse_url(url, data_canteen, today=False):
    canteen = LazyBuilder()
    data = urlopen(url).read().decode('utf-8')
    document = parse(data, 'lxml')
    dish = document.find(class_='neo-menu-single-dishes')
    if dish is not None:
        dishes = dish.find_all(name='tr', attrs={"data-canteen": data_canteen})
    else:
        dishes = []
    side = document.find(class_='neo-menu-single-modals')
    if side is not None:
        dishes = dishes + side.find_all(name='tr',
                                        attrs={"data-canteen": data_canteen})
    for dish in dishes:
        parse_dish(dish, canteen)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    canteen.extra_regex = re.compile(r'\((?P<extra>[0-9a-zA-Z]{1,3}'
                                     r'(?:,[0-9a-zA-Z]{1,3})*)\)', re.UNICODE)
    legend_url = 'https://www.stwdo.de/mensa-co/allgemein/zusatzstoffe/'
    legend = parse_legend(legend_url)
    canteen.setLegendData(legend)
    day = datetime.date.today()
    week = getWeekdays(day)
    for wDay in week:
        py = {'tx_pamensa_mensa[date]': wDay}
        payload = urlencode(py).encode('ascii')
        data = rq.urlopen(url, payload).read().decode('utf-8')
        soup = BeautifulSoup(data, 'html.parser')
        parse_day(canteen, soup, wDay)
    return canteen.toXMLFeed()
def render_meta(canteen, menu_feed_url):
    """Render an OpenMensa XML meta feed for a given canteen.

    :param Canteen canteen: the canteen
    :param menu_feed_url: the canteen menu URL
    :return: the XML meta feed as string
    """
    builder = LazyBuilder()
    builder.name = canteen.name
    builder.address = canteen.street
    builder.city = canteen.city
    builder.define(name='full', priority='0', url=menu_feed_url,
                   source=None, dayOfWeek='*', dayOfMonth='*',
                   hour='8-18', minute='0', retry='30 1')
    return builder.toXMLFeed()
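# A usage sketch for render_meta, assuming pyopenmensa is installed and the
# function above is in scope; the canteen record and feed URL are made up.
# Any object exposing name/street/city attributes works here.
from types import SimpleNamespace

example = SimpleNamespace(name='Mensa Example',
                          street='Musterstraße 1',
                          city='Musterstadt')
print(render_meta(example, 'https://example.org/feeds/example/full.xml'))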
def parse_url(url, today=False):
    canteen = LazyBuilder()
    parse_week(url, date.today(), canteen)
    if not today:
        parse_week(url, date.today() + date.resolution * 7, canteen)
    return canteen.toXMLFeed()
def parse_url(url, today=False): content = urlopen(url).read() document = parse(content, "lxml") canteen = LazyBuilder() table = document.find_all('table')[0] def debug_print(food_type, food_description, priceing): if (priceing is None): print(date + ': ' + food_type + ": " + food_description) else: print(date + ': ' + food_type + ": " + food_description + " : ", end='') for e in priceing: print(e, end=' ') print() def is_new_entry(tds): td = tds[0] return td.string is not None and date_regex.search( td.string) is not None def is_end_of_entry(tds): for td in tds: if (td.string is None or td.string.strip() != ''): return False return True def is_action_entry(td): return td.text == 'Aktion' def is_closed(tds): return is_new_entry(tds) and get_pricing(tds, 4, 7) is None def refactor_date(raw_date): now = datetime.datetime.now() day = date_regex.search(raw_date).group('day') month = date_regex.search(raw_date).group('month') year = now.year if month == '01' and now.month == 12: # if list depicts meals from this and the next year year += 1 elif month == '12' and now.month == 1: # if list depicts meals form this and the last year year -= 1 return day + '.' + month + '.' + str(year) def parse_foot_type(td): type = '' if td.string is None: img = td.find_all('img')[0] src = img.get('src') if ('msc' in src): type += 'Fish MSC ' elif ('vegan' in src): type += 'Vegan ' #Sometimes none categorized food is possible, therfore we need to cover this, #otherwhise openmensa.org will faile dueto an empty tag. elif (td.string.strip() == ''): type += 'Tipp ' else: if ('R' in td.string): type += 'Rind ' if ('S' in td.string): type += 'Schwein ' if ('G' in td.string): type += 'Geflügel ' if ('V' in td.string): type += 'Vegetarisch ' if ('F' in td.string): type += 'Fisch ' if ('L' in td.string): type += 'Lamm ' if ('W' in td.string): type += 'Wild ' tl = list(type)[:-1] return ''.join(tl) def get_refs(td): return td.find_all('sup') def get_foot_description(td): refl = get_refs(td) description = td.text for ref in refl: description = description.replace(' ' + ref.text, '', 1) if description[0] == ' ': description = description.replace(' ', '', 1) return description def get_notes(td): refl = get_refs(td) strl = [] for ref in refl: strl.extend(ref.string.split(',')) strl = list(set(strl)) return strl def build_notes_string(td): refs = get_notes(td) food_is = '' food_contains = '' for r in refs: # parse food is footnotes if r == '1': food_is += 'mit Farbstoffen, ' elif r == '4': food_is += 'geschwärzt, ' elif r == '7': food_is += 'mit Antioxidationsmittel, ' elif r == '8': food_is += 'mit Geschmacksverstärker, ' elif r == '9': food_is += 'geschwefelt, ' elif r == '10': food_is += 'geschwärzt, ' elif r == '11': food_is += 'gewachst, ' elif r == '12': food_is += 'mit Phosphat, ' elif r == '5': food_is += 'mit Süßungsmittel, ' # parse allergic footnotes elif r == 'a1': food_contains += 'Gluten, ' elif r == 'a2': food_contains += 'Krebstiere, ' elif r == 'a3': food_contains += 'Eier, ' elif r == 'a4': food_contains += 'Fisch, ' elif r == 'a5': food_contains += 'Erdnüsse, ' elif r == 'a6': food_contains += 'Soja, ' elif r == 'a7': food_contains += 'Milch/Laktose, ' elif r == 'a8': food_contains += 'Schalenfrüchte, ' elif r == 'a9': food_contains += 'Sellerie, ' elif r == 'a10': food_contains += 'Senf, ' elif r == 'a11': food_contains += 'Sesam, ' elif r == 'a12': food_contains += 'Schwefeldioxid/Sulfite, ' elif r == 'a13': food_contains += 'Lupinen, ' elif r == 'a14': food_contains += 'Weichtiere, ' else: food_contains += 
'undefinierte Chemikalien:' + r + ', ' notes = '' if food_is != '': notes += 'Gericht ist ' + food_is if food_contains != '': if food_is == '': notes += 'Gericht enthält ' else: notes += 'und enthält ' notes += food_contains if notes != '': nl = list(notes) del nl[len(nl) - 1] nl[len(nl) - 1] = '.' notes = ''.join(nl) return notes def get_pricing(tds, f, t): priceing = [] #sometimes we dont don't get 7 elements, than this might be a special day if len(tds) < 7: return None for i in range(f, t): raw_price = tds[i].string.strip() if raw_price == '': return None else: priceing.append(price_regex.search(raw_price).group('val')) return priceing # state helper inside_valide_entry = False date = '' for tr in table.find_all('tr'): tds = tr.find_all('td') if (is_new_entry(tds)): try: raw_date = tds[0].string date = refactor_date(raw_date) if (is_closed(tds)): # sometismes a canteen might look closed but actually its spargeltage if "Spargeltage" in tds[3].text: canteen.addMeal(date, "Spargel", "Spargel Tag", ["Spargel und andere Gerichte."], None, None) else: canteen.setDayClosed(date) else: inside_valide_entry = True except Exception as e: traceback.print_exception(*sys.exc_info()) if (is_end_of_entry(tds)): inside_valide_entry = False elif inside_valide_entry: try: notes = [] if is_action_entry(tds[0]): food_type = parse_foot_type(tds[1]) food_description = get_foot_description(tds[2]) notes_string = build_notes_string(tds[2]) if (notes_string != ""): notes.append(notes_string) prices = get_pricing(tds, 3, 6) canteen.addMeal(date, 'Aktion: ' + food_type, food_description, notes, prices, roles if prices else None) else: food_type = parse_foot_type(tds[2]) food_description = get_foot_description(tds[3]) notes_string = build_notes_string(tds[3]) if (notes_string != ""): notes.append(notes_string) prices = get_pricing(tds, 4, 7) canteen.addMeal(date, food_type, food_description, notes, prices, roles if prices else None) except Exception as e: traceback.print_exception(*sys.exc_info()) return canteen.toXMLFeed()
def parse_url(url, today):
    canteen = LazyBuilder()
    if not today:
        parse_week(url, canteen)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {
        '1': 'mit Farbstoff',
        '2': 'mit Konservierungsstoff',
        '3': 'mit Antioxidationsmittel',
        '4': 'mit Geschmacksverstärker',
        '5': 'geschwefelt',
        '6': 'geschwärzt',
        '7': 'gewachst',
        '8': 'mit Phosphat',
        '9': 'mit Süssungsmittel Saccharin',
        '10': 'mit Süssungsmittel Aspartam, enth. Phenylalaninquelle',
        '11': 'mit Süssungsmittel Cyclamat',
        '12': 'mit Süssungsmittel Acesulfam',
        '13': 'chininhaltig',
        '14': 'coffeinhaltig',
        '15': 'gentechnisch verändert',
        '16': 'enthält Sulfite',
        '17': 'enthält Phenylalanin',
        'A': 'Gluten',
        'AA': 'Weizen',
        'AB': 'Roggen',
        'AC': 'Gerste',
        'AD': 'Hafer',
        'AE': 'Dinkel',
        'AF': 'Kamut',
        'B': 'Krebstiere',
        'C': 'Eier',
        'D': 'Fisch',
        'E': 'Erdnüsse',
        'F': 'Soja',
        'G': 'Milch und Milchprodukte',
        'H': 'Schalenfrüchte',
        'HA': 'Mandel',
        'HB': 'Haselnuss',
        'HC': 'Walnuss',
        'HD': 'Cashew',
        'HE': 'Pecannuss',
        'HF': 'Paranuss',
        'HG': 'Pistazie',
        'HH': 'Macadamianuss',
        'HI': 'Queenslandnuss',
        'I': 'Sellerie',
        'J': 'Senf',
        'K': 'Sesamsamen',
        'L': 'Schwefeldioxid und Sulfite',
        'M': 'Lupinen',
        'N': 'Weichtiere',
        'O': 'Nitrat',
        'P': 'Nitritpökelsalz',
        'ZTA': 'Alkohol',
        'ZTB': 'mit ausschließlich biologisch erzeugten Rohstoffen',
        'ZTF': 'Fisch',
        'ZTG': 'Geflügel',
        'ZTL': 'Lamm',
        'ZTMSC': 'zertifizierte nachhaltige Fischerei (MSC-C-53400)',
        'ZTMV': 'Mensa Vital',
        'ZTR': 'Rindfleisch',
        'ZTS': 'Schweinefleisch',
        'ZTV': 'vegetarisch',
        'ZTVG': 'vegan',
        'ZTW': 'Wild'
    }

    # Create regular expressions for categories
    hg = re.compile("^HG[1-9]$")
    b = re.compile("^B[1-9]$")
    n = re.compile("^N[1-9]$")

    # Get two weeks for full.xml and only the current one for today.xml
    # On error 404 continue with the next isoweek
    # Returns an empty feed if all requested isoweeks result in error 404
    # At most locations the data doesn't exist during term break
    weeks = 1 if today else 2
    for w in range(0, weeks):
        kw = (date.today() + timedelta(weeks=w)).isocalendar()[1]
        try:
            f = urlopen('%(location)s/%(isoweek)d.csv' % {
                'location': url,
                'isoweek': kw
            })
        except HTTPError as e:
            if e.code == 404:
                continue
            else:
                raise e

        # Decode data from ISO charset
        f = f.read().decode('iso8859-1')

        # Set roles for prices
        roles = ('student', 'employee', 'other')

        # Read csv data and skip the csv header
        mealreader = reader(f.splitlines(), delimiter=';')
        next(mealreader)

        for row in mealreader:
            mdate = row[0]
            category = row[2]
            mname = row[3]
            mtype = row[4]
            prices = [row[6], row[7], row[8]]

            # determine the category for the current meal
            if category == 'Suppe':
                pass
            elif hg.match(category):
                category = 'Hauptgerichte'
            elif b.match(category):
                category = 'Beilagen'
            elif n.match(category):
                category = 'Nachspeisen'
            else:
                raise RuntimeError('Unknown category: ' + str(category))

            # Extract the notes from brackets in the meal name,
            # remove the brackets and notes, and improve readability
            notes = []
            bpos = mname.find(')')
            while bpos != -1:
                apos = mname.find('(')
                # Extract notes from the current brackets and avoid empty notes
                for i in mname[apos + 1:bpos].split(','):
                    if i:
                        notes.append(i)
                # Check whether the brackets are at the end of the meal name
                if bpos == len(mname) - 1:
                    # Remove the brackets and leave the bracket loop
                    mname = mname[:apos]
                    bpos = -1
                else:
                    # Remove the current brackets, improve readability
                    # and find the next brackets
                    mname = mname[:apos].rstrip() + ' und ' + mname[bpos + 1:].lstrip()
                    bpos = mname.find(')')

            # Remove trailing whitespace in the meal name
            mname = mname.rstrip()

            # Add meal type notes to the notes list and avoid empty notes
            for i in mtype.split(','):
                if i:
                    notes.append('ZT' + i)

            # Translate notes via the legend into human-readable information
            mnotes = []
            for i in notes:
                mnotes.append(legend.get(i, legend.get(i[2:], i)))

            # Try to add the meal
            try:
                canteen.addMeal(mdate, category, mname, mnotes, prices, roles)
            except ValueError as e:
                # empty meal ...
                print('could not add meal {}/{} "{}" due to "{}"'.format(
                    mdate, category, mname, e), file=sys.stderr)

    # return xml data
    return canteen.toXMLFeed()
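# The bracket-stripping loop above is the subtlest part of this CSV parser;
# here it is as a standalone, runnable function (the sample meal name is
# made up):
def split_notes(mname):
    notes = []
    bpos = mname.find(')')
    while bpos != -1:
        apos = mname.find('(')
        notes += [i for i in mname[apos + 1:bpos].split(',') if i]
        if bpos == len(mname) - 1:
            mname = mname[:apos]
            bpos = -1
        else:
            mname = mname[:apos].rstrip() + ' und ' + mname[bpos + 1:].lstrip()
            bpos = mname.find(')')
    return mname.rstrip(), notes

print(split_notes('Schnitzel (A,C) mit Pommes (A)'))
# -> ('Schnitzel und mit Pommes', ['A', 'C', 'A'])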
def parse_url(url, today=False):
    canteen = LazyBuilder()
    # prices are stored on a separate page
    document = parse(urlopen(base + '/mensa-preise/').read(), 'lxml')
    prices = {}
    for tr in document.find('div', 'ce-bodytext').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) \
                or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')
    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read(), 'lxml')
            errorCount = 0
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.timedelta(days=1)
                continue
            else:
                raise e
        # extract legend
        legend = {}
        legends = document.find('div', 'tx-stwm-speiseplan')
        additions = legends.find('div', 'c-schedule__filter-body')
        for table in additions.find_all('div', 'c-schedule__filter-item'):
            for ingredient in table.find('ul').find_all('li'):
                name = ingredient.find('dt').text.strip()
                description = ingredient.find('dd').text.strip()
                legend[name] = description
        for label in legends.find('ul', 'c-schedule__type-list').find_all('li'):
            name = label.find('dt').text.replace('(', '').replace(')', '').strip()
            description = label.find('dd').text.strip()
            legend[name] = description
        # extract meals
        mensa_data = document.find('ul', 'c-schedule__list')
        category = None
        for meal in mensa_data.find_all('li'):
            # update category or use the previous one if not specified
            category_text = meal.find('dt', 'c-schedule__term').text.strip()
            if category_text:
                category = category_text
            data = meal.find('dd').find('p', 'js-schedule-dish-description')
            name = data.contents[0].strip()  # name is the first text node
            if not name:
                continue
            # notes are contained in 3 boxes (type, additional, allergen) and
            # are comma-separated lists enclosed in brackets or parentheses
            notes = []
            for note in meal.find_all('span', 'c-schedule__marker'):
                note_text = note.find('span', 'u-text-sup').text \
                    .replace('(', '').replace(')', '') \
                    .replace('[', '').replace(']', '')
                notes += [n for n in note_text.split(',') if n]
            # some meals contain the GQB label in their name (instead of in notes)
            if '(GQB)' in name:
                name = name.replace('(GQB)', '').strip()
                notes.append('GQB')
            # the price for both meals is specified as Bio-/Aktionsgericht
            price_category = category \
                .replace('Aktionsessen', 'Bio-/Aktionsgericht') \
                .replace('Biogericht', 'Bio-/Aktionsgericht') \
                .strip()
            canteen.addMeal(date, category, name,
                            [legend.get(n, n) for n in notes],
                            prices.get(price_category, {}))
        date += datetime.timedelta(days=1)
        if today:
            break
    return canteen.toXMLFeed()
def main(url='https://www.stw-bremen.de/de/essen-trinken/mensa-nw-1', out='xml'):
    # TODO: replace ids with a findall food-plan-* wildcard
    data = {}  # dict to store parsed data
    today = dt.date.today()
    s = requests.session()
    r = s.get(url)  # GET request to the stw server
    html = r.content  # the raw html code of the returned page
    soup = BeautifulSoup(html, 'html.parser')  # source code parser
    canteen = LazyBuilder()
    days = soup.find_all(id=re.compile("^food-plan-"))
    for html_day in days:  # for each day
        date_id = html_day['id']  # e.g. food-plan-3
        workday_offset = int(date_id.split('-')[-1])
        date = get_date_from_id(workday_offset)
        date_str = dt.datetime.strftime(date, '%Y-%m-%d')
        data[date_str] = {}  # init dict for each id
        # The information for each meal is stored in a separate table with
        # class food-category; loop over all categories (not hardcoded)
        html_meals = html_day.find_all("table", "food-category")
        for meal in html_meals:  # meal is still an html code string
            category_name = meal.find('th', 'category-name').string
            meal_text = ''
            # since there are added line breaks and <sup> tags, use the
            # strings generator instead of the get_text() or .text methods
            meal_parts = meal.find('td', 'field-name-field-description').strings
            for m in meal_parts:  # m is an iterable part of the html contents
                if not m.parent.name == 'sup':
                    meal_text += str(m)
            # remove win/unix linebreaks and separators
            meal_text = meal_text.replace('\r', '')
            meal_text = meal_text.replace('\n', ' ')
            meal_text = meal_text.replace('* * *', '; ')
            meal_price_a = meal.find('td', 'field-name-field-price-students').text
            meal_price_b = meal.find('td', 'field-name-field-price-employees').text
            m = {}
            m['text'] = meal_text
            m['A'] = meal_price_a
            m['B'] = meal_price_b
            data[date_str][category_name] = m
            # Use LazyBuilder:
            canteen.addMeal(date, category_name, meal_text,
                            prices={'student': meal_price_a,
                                    'employee': meal_price_b})
    om = canteen.toXMLFeed()
    # dumps (not dump) returns a string instead of writing to a file
    j = json.dumps(data, ensure_ascii=False)
    if out == 'xml':
        return om
    elif out == 'json':
        return j
def parse_url(url, today=False):
    today = datetime.date.today()
    if today.weekday() == 6:  # Sunday
        today += datetime.timedelta(days=1)  # Tomorrow
    if "%s" in url:
        url = url % today.strftime('%Y_%m_%d')
    try:
        content = requests.get(url).text
    except requests.exceptions.ConnectionError as e:
        logging.warning(str(e))
        content = requests.get(url, verify=False).text
    document = BeautifulSoup(content, "html.parser")
    canteen = LazyBuilder()

    # Prices for employees and guests
    try:
        p = price_employee_regex.search(document.find("main").text).groupdict()
        employee = float(p["employee"].split(",")[0]) + \
            float(p["employee"].split(",")[1]) / 100
        p = price_guest_regex.search(document.find("main").text).groupdict()
        guest = float(p["employee"].split(",")[0]) + \
            float(p["employee"].split(",")[1]) / 100
    except (AttributeError, TypeError, KeyError, ValueError):
        employee_multiplier = 1.25
        guest_multiplier = 1.60
        employee = None
        guest = None

    # Date
    p = datespan_regex.search(document.find(
        "div", {"class": "maincontent"}).find("h2").text).groupdict()
    if len(p["from"].split(".")[2]) == 0:
        p["from"] += p["to"].split(".")[2]
    fromdate = datetime.datetime.strptime(p["from"], "%d.%m.%Y")

    maincontent = document.find("div", {"class": "maincontent"})
    table = maincontent.find("table")
    if not table:
        if maincontent:
            # Die Speisenausgabe DHBW Eppelheim ist vom dd.mm.yyyy – dd.mm.yyyy
            # geschlossen
            p = datespan_regex.search(maincontent.text)
            if p:
                fromdate = datetime.datetime.strptime(p["from"], "%d.%m.%Y")
                todate = datetime.datetime.strptime(p["to"], "%d.%m.%Y")
                while fromdate <= todate:
                    canteen.setDayClosed(fromdate.strftime('%d.%m.%Y'))
                    fromdate += datetime.timedelta(1)
        return canteen.toXMLFeed()

    trs = table.find_all("tr")
    date = None
    for tr in trs:
        tds = tr.find_all("td")
        if len(tds) == 4:
            td0, td1, td2, td3 = tds
            day = td0.text.strip()
            date = fromdate + datetime.timedelta(days=daysGerman.index(day))
            date = date.strftime('%d.%m.%Y')
        else:
            td0 = None
            td1, td2, td3 = tds
        notes = []
        if "feiertag" in td1.text.lower() or "geschlossen" in td1.text.lower():
            canteen.setDayClosed(date)
            continue
        categoryName = td1.text.strip()[:-1]
        mealName = td2.text.strip()
        if not categoryName or not mealName:
            continue
        prices = []
        try:
            price = float(euro_regex.search(td3.text).group(1).replace(",", "."))
            prices.append(price)
            if employee is not None:
                prices.append(employee)
            else:
                prices.append(price * employee_multiplier)
            if guest is not None:
                prices.append(guest)
            else:
                prices.append(price * guest_multiplier)
        except (AttributeError, TypeError, KeyError, ValueError):
            notes.append(td3.text.strip())
        notes = [x for x in notes if x]
        canteen.addMeal(date, categoryName, mealName,
                        notes if notes else None,
                        prices if prices else None,
                        roles if prices else None)
    return canteen.toXMLFeed()
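# The euro_regex used above is defined elsewhere in the module; a plausible
# pattern for prices formatted like '3,20 €', together with the
# comma-to-dot conversion from the parser (sample text made up):
import re

euro_regex = re.compile(r'(\d+,\d{2})\s*€')
print(float(euro_regex.search('Tagesgericht 3,20 €').group(1).replace(',', '.')))  # 3.2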
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {
        '1': 'mit Farbstoff',
        '2': 'mit Konservierungsstoff',
        '3': 'mit Antioxidationsmittel',
        '4': 'mit Geschmacksverstärker',
        '5': 'geschwefelt',
        '6': 'geschwärzt',
        '7': 'gewachst',
        '8': 'mit Phosphat',
        '9': 'mit Süssungsmittel Saccharin',
        '10': 'mit Süssungsmittel Aspartam, enth. Phenylalaninquelle',
        '11': 'mit Süssungsmittel Cyclamat',
        '12': 'mit Süssungsmittel Acesulfam',
        '13': 'chininhaltig',
        '14': 'coffeinhaltig',
        '15': 'gentechnisch verändert',
        '16': 'enthält Sulfite',
        '17': 'enthält Phenylalanin',
        'A': 'Gluten',
        'B': 'Krebstiere',
        'C': 'Eier',
        'D': 'Fisch',
        'E': 'Erdnüsse',
        'F': 'Soja',
        'G': 'Milch und Milchprodukte',
        'H': 'Schalenfrüchte',
        'I': 'Sellerie',
        'J': 'Senf',
        'K': 'Sesamsamen',
        'L': 'Schwefeldioxid und Sulfite',
        'M': 'Lupinen',
        'N': 'Weichtiere',
        'ZTA': 'Aktionsgericht',
        'ZTB': 'mit ausschließlich biologisch erzeugten Rohstoffen',
        'ZTF': 'Fisch',
        'ZTG': 'Geflügel',
        'ZTL': 'Lamm',
        'ZTMSC': 'zertifizierte nachhaltige Fischerei (MSC-C-53400)',
        'ZTMV': 'Mensa Vital',
        'ZTR': 'Rindfleisch',
        'ZTS': 'Schweinefleisch',
        'ZTV': 'vegetarisch',
        'ZTVG': 'vegan',
        'ZTW': 'Wild'
    }
    # canteen.setLegendData(legend)
    hg = re.compile("^HG[1-9]$")
    b = re.compile("^B[1-9]$")
    n = re.compile("^N[1-9]$")
    # for w in 0, 1:
    for w in [0]:
        kw = (date.today() + timedelta(weeks=w)).isocalendar()[1]
        try:
            f = urlopen('%(location)s/%(isoweek)d.csv' %
                        {'location': url, 'isoweek': kw})
        except HTTPError as e:
            if e.code == 404:
                continue
            else:
                raise e
        f = f.read().decode('iso8859-1')
        roles = ('student', 'employee', 'other')
        initline = True
        mealreader = reader(f.splitlines(), delimiter=';')
        for row in mealreader:
            if initline:
                initline = False
            else:
                if row[2] == 'Suppe':
                    category = 'Suppe'
                elif hg.match(row[2]):
                    category = 'Hauptgerichte'
                elif b.match(row[2]):
                    category = 'Beilagen'
                elif n.match(row[2]):
                    category = 'Nachspeisen'
                else:
                    raise RuntimeError('Unknown category: ' + str(row[2]))
                mdate = row[0]
                notes = []
                mname = row[3]
                bpos = mname.find(')')
                while bpos != -1:
                    apos = mname.find('(')
                    for i in mname[apos + 1:bpos].split(','):
                        notes.append(i)
                    if bpos == len(mname) - 1:
                        mname = mname[:apos] + mname[bpos + 1:]
                        bpos = -1
                    else:
                        mname = mname[:apos] + ' und ' + mname[bpos + 1:]
                        bpos = mname.find(')')
                if mname.rfind(' ') == len(mname) - 1:
                    mname = mname[:len(mname) - 1]
                mtype = row[4]
                if mtype != '':
                    for i in mtype.split(','):
                        notes.append('ZT' + i)
                prices = [row[6], row[7], row[8]]
                mnotes = []
                for i in notes:
                    mnotes.append(legend.get(i, legend.get(i[2:], i)))
                try:
                    canteen.addMeal(mdate, category, mname, mnotes, prices, roles)
                except ValueError as e:
                    # empty meal ...
                    print('could not add meal {}/{} "{}" due to "{}"'.format(
                        mdate, category, mname, e), file=sys.stderr)
    return canteen.toXMLFeed()
document = parse(content, 'html.parser')
items = document.find_all('a', {"class": "item"})
for item in items:
    title = item.strong.string
    if not title:
        continue
    numbers = item.small.string
    notes = []
    if numbers:
        for number in numbers.split(','):
            number = int(number.strip())
            if number > len(legend):
                continue
            notes.append(legend[number])
    row = item.parent.parent
    price = row.find_all('td')[-1].string
    prices = {}
    if price:
        subprice = price.split('/')
        if len(subprice) == 3:
            prices = {'student': subprice[0],
                      'employee': subprice[1],
                      'other': subprice[2]}
        else:
            prices = {'other': price}
    canteen.addMeal(datetime.date(date.year, date.month, date.day),
                    "Mittagessen", title, notes=notes, prices=prices)
date = date + datetime.timedelta(1)
print(canteen.toXMLFeed())
def parse_url(url, today=False):
    canteen = LazyBuilder()
    canteen.setAdditionalCharges('student', {})
    parse_week(url, canteen)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    parse_week(url + '&wann=2', canteen)
    if not today:
        parse_week(url + '&wann=3', canteen)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {
        '1': 'mit Farbstoff', '2': 'mit Konservierungsstoff',
        '3': 'mit Antioxidationsmittel', '4': 'mit Geschmacksverstärker',
        '5': 'geschwefelt', '6': 'geschwärzt', '7': 'gewachst',
        '8': 'mit Phosphat', '9': 'mit Süssungsmittel Saccharin',
        '10': 'mit Süssungsmittel Aspartam, enth. Phenylalaninquelle',
        '11': 'mit Süssungsmittel Cyclamat', '12': 'mit Süssungsmittel Acesulfam',
        '13': 'chininhaltig', '14': 'coffeinhaltig',
        '15': 'gentechnisch verändert', '16': 'enthält Sulfite',
        '17': 'enthält Phenylalanin',
        'A': 'Gluten', 'B': 'Krebstiere', 'C': 'Eier', 'D': 'Fisch',
        'E': 'Erdnüsse', 'F': 'Soja', 'G': 'Milch und Milchprodukte',
        'H': 'Schalenfrüchte', 'I': 'Sellerie', 'J': 'Senf',
        'K': 'Sesamsamen', 'L': 'Schwefeldioxid und Sulfite',
        'M': 'Lupinen', 'N': 'Weichtiere',
        'ZTA': 'Aktionsgericht',
        'ZTB': 'mit ausschließlich biologisch erzeugten Rohstoffen',
        'ZTF': 'Fisch', 'ZTG': 'Geflügel', 'ZTL': 'Lamm',
        'ZTMSC': 'zertifizierte nachhaltige Fischerei (MSC-C-53400)',
        'ZTMV': 'Mensa Vital', 'ZTR': 'Rindfleisch', 'ZTS': 'Schweinefleisch',
        'ZTV': 'vegetarisch', 'ZTVG': 'vegan', 'ZTW': 'Wild'
    }

    # Compile regular expressions for the meal categories
    hg = re.compile("^HG[1-9]$")
    b = re.compile("^B[1-9]$")
    n = re.compile("^N[1-9]$")

    # Fetch the CSV data for the current and the next ISO week.
    # On a 404 the next ISO week is tried anyway; if both weeks 404,
    # an empty feed is returned. At most locations the data simply
    # does not exist during the term break.
    for w in 0, 1:
        kw = (date.today() + timedelta(weeks=w)).isocalendar()[1]
        try:
            f = urlopen('%(location)s/%(isoweek)d.csv' %
                        {'location': url, 'isoweek': kw})
        except HTTPError as e:
            if e.code == 404:
                continue
            else:
                raise e

        # Decode the data from the ISO charset
        f = f.read().decode('iso8859-1')

        # Roles for the three price columns
        roles = ('student', 'employee', 'other')

        # Read the CSV data and skip the header row
        mealreader = reader(f.splitlines(), delimiter=';')
        next(mealreader)

        for row in mealreader:
            mdate = row[0]
            category = row[2]
            mname = row[3]
            mtype = row[4]
            prices = [row[6], row[7], row[8]]

            # Determine the category for the current meal
            if category == 'Suppe':
                pass
            elif hg.match(category):
                category = 'Hauptgerichte'
            elif b.match(category):
                category = 'Beilagen'
            elif n.match(category):
                category = 'Nachspeisen'
            else:
                raise RuntimeError('Unknown category: ' + str(category))

            # Extract the notes from brackets in the meal name,
            # remove the brackets and notes, and improve readability
            notes = []
            bpos = mname.find(')')
            while bpos != -1:
                apos = mname.find('(')
                # Extract notes from the current brackets, avoiding empty notes
                for i in mname[apos + 1:bpos].split(','):
                    if i:
                        notes.append(i)
                # Check whether the brackets are at the end of the meal name
                if bpos == len(mname) - 1:
                    # Remove the brackets and leave the loop
                    mname = mname[:apos]
                    bpos = -1
                else:
                    # Remove the current brackets, improve readability
                    # and look for the next pair of brackets
                    mname = mname[:apos].rstrip() + ' und ' + mname[bpos + 1:].lstrip()
                    bpos = mname.find(')')

            # Remove trailing whitespace from the meal name
            mname = mname.rstrip()

            # Add the meal type notes, avoiding empty notes
            for i in mtype.split(','):
                if i:
                    notes.append('ZT' + i)

            # Translate the notes via the legend into human-readable information
            mnotes = []
            for i in notes:
                mnotes.append(legend.get(i, legend.get(i[2:], i)))

            # Try to add the meal
            try:
                canteen.addMeal(mdate, category, mname, mnotes, prices, roles)
            except ValueError as e:
                # empty meal ...
                print('could not add meal {}/{} "{}" due to "{}"'.format(
                    mdate, category, mname, e), file=sys.stderr)

    # Return the XML data
    return canteen.toXMLFeed()
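# A minimal, self-contained sketch of the bracket-note extraction used above,
# pulled out for illustration; extract_notes is a hypothetical name, not part
# of this module. Given 'Schnitzel (1,a3) mit Pommes (a1)' it returns
# ('Schnitzel und mit Pommes', ['1', 'a3', 'a1']).
def extract_notes(mname):
    notes = []
    bpos = mname.find(')')
    while bpos != -1:
        apos = mname.find('(')
        # Collect the comma-separated note keys inside the brackets
        for key in mname[apos + 1:bpos].split(','):
            if key:
                notes.append(key)
        if bpos == len(mname) - 1:
            # Brackets at the very end: cut them off and stop
            mname = mname[:apos]
            bpos = -1
        else:
            # Join the fragments around the removed brackets
            mname = mname[:apos].rstrip() + ' und ' + mname[bpos + 1:].lstrip()
            bpos = mname.find(')')
    return mname.rstrip(), notes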
def parse_url(url, today=False):
    canteen = LazyBuilder()

    # Prices are stored on a separate page
    document = parse(urlopen(base + '/mensa-preise/').read(), 'lxml')
    prices = {}
    for tr in document.find('div', 'ce-bodytext').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')

    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read(), 'lxml')
            errorCount = 0
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.timedelta(days=1)
                continue
            else:
                raise e

        # Extract the legend
        legend = {}
        legends = document.find('div', 'tx-stwm-speiseplan')
        additions = legends.find('div', 'c-schedule__filter-body')
        for table in additions.find_all('div', 'c-schedule__filter-item'):
            for ingredient in table.find('ul').find_all('li'):
                name = ingredient.find('dt').text.strip()
                description = ingredient.find('dd').text.strip()
                legend[name] = description
        for label in legends.find('ul', 'c-schedule__type-list').find_all('li'):
            name = label.find('dt').text.replace('(', '').replace(')', '').strip()
            description = label.find('dd').text.strip()
            legend[name] = description

        # Extract the meals
        mensa_data = document.find('ul', 'c-schedule__list')
        category = None
        for meal in mensa_data.find_all('li'):
            # Update the category, or reuse the previous one if none is specified
            category_text = meal.find('dt', 'c-schedule__term').text.strip()
            if category_text:
                category = category_text

            data = meal.find('dd').find('p', 'js-schedule-dish-description')
            name = data.contents[0].strip()  # the name is the first text node
            if not name:
                continue

            # Notes are contained in three boxes (type, additional, allergen)
            # as comma-separated lists enclosed in brackets or parentheses
            notes = []
            for note in meal.find_all('span', 'c-schedule__marker'):
                note_text = note.find('span', 'u-text-sup').text \
                    .replace('(', '').replace(')', '') \
                    .replace('[', '').replace(']', '')
                notes += [n for n in note_text.split(',') if n]

            # Some meals carry the GQB label in their name instead of in the notes
            if '(GQB)' in name:
                name = name.replace('(GQB)', '').strip()
                notes.append('GQB')

            # The price for both meal types is listed as "Bio-/Aktionsgericht"
            price_category = category \
                .replace('Aktionsessen', 'Bio-/Aktionsgericht') \
                .replace('Biogericht', 'Bio-/Aktionsgericht') \
                .strip()

            canteen.addMeal(date, category, name,
                            [legend.get(n, n) for n in notes],
                            prices.get(price_category, {}))

        date += datetime.timedelta(days=1)
        if today:
            break

    return canteen.toXMLFeed()
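# The error handling above advances day by day and only gives up after seven
# consecutive 404 responses, so weekends and single missing days are skipped.
# A generic, self-contained sketch of that pattern (crawl_days and fetch_day
# are hypothetical names, not part of this module):
import datetime
from urllib.error import HTTPError

def crawl_days(fetch_day, start=None, max_misses=7):
    date = start or datetime.date.today()
    misses = 0
    while misses < max_misses:
        try:
            yield date, fetch_day(date)
            misses = 0  # a successful day resets the error counter
        except HTTPError as e:
            if e.code != 404:
                raise
            misses += 1  # tolerate gaps (weekends, holidays) up to the limit
        date += datetime.timedelta(days=1)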
def parse_url(url, today=False):
    content = urlopen(url).read()
    document = parse(content, "lxml")
    canteen = LazyBuilder()
    table = document.find_all('table')[0]

    def debug_print(food_type, food_description, pricing):
        if pricing is None:
            print(date + ': ' + food_type + ': ' + food_description)
        else:
            print(date + ': ' + food_type + ': ' + food_description + ' : ', end='')
            for e in pricing:
                print(e, end=' ')
            print()

    def is_new_entry(tds):
        td = tds[0]
        return td.string is not None and date_regex.search(td.string) is not None

    def is_end_of_entry(tds):
        for td in tds:
            if td.string is None or td.string.strip() != '':
                return False
        return True

    def is_action_entry(td):
        return td.text == 'Aktion'

    def is_closed(tds):
        return is_new_entry(tds) and get_pricing(tds, 4, 7) is None

    def refactor_date(raw_date):
        now = datetime.datetime.now()
        day = date_regex.search(raw_date).group('day')
        month = date_regex.search(raw_date).group('month')
        year = now.year
        if month == '01' and now.month == 12:
            # the list shows meals from this year and the next one
            year += 1
        elif month == '12' and now.month == 1:
            # the list shows meals from this year and the last one
            year -= 1
        return day + '.' + month + '.' + str(year)

    def parse_food_type(td):
        food_type = ''
        if td.string is None:
            if len(td.find_all('img')) == 0:
                return None
            else:
                img = td.find_all('img')[0]
                src = img.get('src')
                if 'msc' in src:
                    food_type += 'Fish MSC '
                elif 'vegan' in src:
                    food_type += 'Vegan '
        # Sometimes uncategorised food is possible; cover this case as well,
        # otherwise openmensa.org will fail due to an empty tag.
        elif td.string.strip() == '':
            food_type += 'Tipp '
        else:
            if 'R' in td.string:
                food_type += 'Rind '
            if 'S' in td.string:
                food_type += 'Schwein '
            if 'G' in td.string:
                food_type += 'Geflügel '
            if 'V' in td.string:
                food_type += 'Vegetarisch '
            if 'F' in td.string:
                food_type += 'Fisch '
            if 'L' in td.string:
                food_type += 'Lamm '
            if 'W' in td.string:
                food_type += 'Wild '
        # Strip the trailing space
        return food_type[:-1]

    def get_refs(td):
        return td.find_all('sup')

    def get_food_description(td):
        refl = get_refs(td)
        description = td.text
        for ref in refl:
            description = description.replace(' ' + ref.text, '', 1)
        if description[0] == ' ':
            description = description.replace(' ', '', 1)
        return description

    def get_notes(td):
        refl = get_refs(td)
        strl = []
        for ref in refl:
            strl.extend(ref.string.split(','))
        strl = list(set(strl))
        return strl

    def build_notes_string(td):
        refs = get_notes(td)
        food_is = ''
        food_contains = ''
        for r in refs:
            # Footnotes describing what the food is
            if r == '1':
                food_is += 'mit Farbstoffen, '
            elif r == '4':
                food_is += 'geschwärzt, '
            elif r == '7':
                food_is += 'mit Antioxidationsmittel, '
            elif r == '8':
                food_is += 'mit Geschmacksverstärker, '
            elif r == '9':
                food_is += 'geschwefelt, '
            elif r == '10':
                food_is += 'geschwärzt, '
            elif r == '11':
                food_is += 'gewachst, '
            elif r == '12':
                food_is += 'mit Phosphat, '
            elif r == '5':
                food_is += 'mit Süßungsmittel, '
            # Footnotes describing allergens
            elif r == 'a1':
                food_contains += 'Gluten, '
            elif r == 'a2':
                food_contains += 'Krebstiere, '
            elif r == 'a3':
                food_contains += 'Eier, '
            elif r == 'a4':
                food_contains += 'Fisch, '
            elif r == 'a5':
                food_contains += 'Erdnüsse, '
            elif r == 'a6':
                food_contains += 'Soja, '
            elif r == 'a7':
                food_contains += 'Milch/Laktose, '
            elif r == 'a8':
                food_contains += 'Schalenfrüchte, '
            elif r == 'a9':
                food_contains += 'Sellerie, '
            elif r == 'a10':
                food_contains += 'Senf, '
            elif r == 'a11':
                food_contains += 'Sesam, '
            elif r == 'a12':
                food_contains += 'Schwefeldioxid/Sulfite, '
            elif r == 'a13':
                food_contains += 'Lupinen, '
            elif r == 'a14':
                food_contains += 'Weichtiere, '
            else:
                food_contains += 'undefinierte Chemikalien:' + r + ', '

        notes = ''
        if food_is != '':
            notes += 'Gericht ist ' + food_is
        if food_contains != '':
            if food_is == '':
                notes += 'Gericht enthält '
            else:
                notes += 'und enthält '
            notes += food_contains
        if notes != '':
            # Replace the trailing ', ' with a full stop
            notes = notes[:-2] + '.'
        return notes

    def get_pricing(tds, f, t):
        pricing = []
        # Sometimes we don't get seven elements; then it might be a special day
        if len(tds) < 7:
            return None
        for i in range(f, t):
            raw_price = tds[i].string.strip()
            if raw_price == '':
                return None
            else:
                pricing.append(price_regex.search(raw_price).group('val'))
        return pricing

    # State helpers
    inside_valid_entry = False
    date = ''
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if is_new_entry(tds):
            try:
                raw_date = tds[0].string
                date = refactor_date(raw_date)
                if is_closed(tds):
                    # Sometimes a canteen might look closed, but actually
                    # it is "Spargeltage" (asparagus days)
                    if "Spargeltage" in tds[3].text:
                        canteen.addMeal(date, "Spargel", "Spargel Tag",
                                        ["Spargel und andere Gerichte."],
                                        None, None)
                    else:
                        canteen.setDayClosed(date)
                else:
                    inside_valid_entry = True
            except Exception:
                traceback.print_exception(*sys.exc_info())
        if is_end_of_entry(tds):
            inside_valid_entry = False
        elif inside_valid_entry:
            try:
                notes = []
                if is_action_entry(tds[0]):
                    food_type = parse_food_type(tds[1])
                    food_description = get_food_description(tds[2])
                    notes_string = build_notes_string(tds[2])
                    if notes_string != "":
                        notes.append(notes_string)
                    prices = get_pricing(tds, 3, 6)
                    canteen.addMeal(date, 'Aktion: ' + food_type,
                                    food_description, notes, prices,
                                    roles if prices else None)
                else:
                    food_type = parse_food_type(tds[2])
                    food_description = get_food_description(tds[3])
                    notes_string = build_notes_string(tds[3])
                    if notes_string != "":
                        notes.append(notes_string)
                    prices = get_pricing(tds, 4, 7)
                    if food_type is not None:
                        canteen.addMeal(date, food_type, food_description,
                                        notes, prices, roles if prices else None)
            except Exception:
                traceback.print_exception(*sys.exc_info())
    return canteen.toXMLFeed()
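# The if/elif ladder in build_notes_string maps footnote keys to phrases; the
# same logic can be written table-driven. A sketch with an abridged mapping
# (keys and phrases taken from the ladder above; build_notes, FOOD_IS and
# FOOD_CONTAINS are hypothetical names):
FOOD_IS = {'1': 'mit Farbstoffen', '8': 'mit Geschmacksverstärker'}
FOOD_CONTAINS = {'a1': 'Gluten', 'a7': 'Milch/Laktose'}

def build_notes(refs):
    is_parts = [FOOD_IS[r] for r in refs if r in FOOD_IS]
    contains_parts = [FOOD_CONTAINS[r] for r in refs if r in FOOD_CONTAINS]
    parts = []
    if is_parts:
        parts.append('Gericht ist ' + ', '.join(is_parts))
    if contains_parts:
        parts.append(('und enthält ' if is_parts else 'Gericht enthält ')
                     + ', '.join(contains_parts))
    return ' '.join(parts) + '.' if parts else ''

# e.g. build_notes(['1', 'a7']) == 'Gericht ist mit Farbstoffen und enthält Milch/Laktose.'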
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {'f': 'fleischloses Gericht', 'v': 'veganes Gericht'}

    document = parse(urlopen(base + '/speiseplan/zusatzstoffe-de.html').read())
    for td in document.find_all('td', 'beschreibung'):
        legend[td.parent.find('td', 'gericht').text] = td.text

    document = parse(urlopen(base + '/mensa-preise/').read())
    prices = {}
    for tr in document.find('div', 'ce-bodytext').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')

    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read())
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.date.resolution
                continue
            else:
                raise e
        else:
            errorCount = 0

        for tr in document.find('table', 'zusatzstoffe').find_all('tr'):
            identifier = tr.find_all('td')[0].text \
                .replace('(', '').replace(')', '')
            legend[identifier] = tr.find_all('td')[1].text.strip()
        canteen.setLegendData(legend)

        mensa_data = document.find('table', 'menu')
        category = None
        for menu_tr in mensa_data.find_all('tr'):
            if menu_tr.find('td', 'headline'):
                continue
            if menu_tr.find('td', 'gericht').text:
                category = menu_tr.find('td', 'gericht').text
            data = menu_tr.find('td', 'beschreibung')
            name = data.find('span').text.strip()
            if not name:
                continue
            notes = [span['title'] for span in data.find_all('span', title=True)]
            canteen.addMeal(
                date, category, name, notes,
                prices.get(category.replace('Aktionsessen', 'Bio-/Aktionsgericht'), {}))

        date += datetime.date.resolution
        if today:
            break

    return canteen.toXMLFeed()
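# Note on the date stepping above: datetime.date.resolution is
# timedelta(days=1), so 'date += datetime.date.resolution' advances one day
# at a time, exactly like the explicit timedelta in the parser before it.
#
#   >>> import datetime
#   >>> datetime.date.resolution == datetime.timedelta(days=1)
#   True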