def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {'f': 'fleischloses Gericht', 'v': 'veganes Gericht'}
    document = parse(urlopen(base + '/speiseplan/zusatzstoffe-de.html').read())
    for td in document.find_all('td', 'beschreibung'):
        legend[td.previous_sibling.previous_sibling.text] = td.text

    document = parse(urlopen(base + '/unsere-preise/').read())
    prices = {}
    for tr in document.find('table', 'essenspreise').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) \
                or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')

    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read())
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.date.resolution
                continue
            else:
                raise e
        else:
            errorCount = 0
        for tr in document.find('table', 'zusatzstoffe').find_all('tr'):
            identifier = tr.find_all('td')[0].text \
                .replace('(', '').replace(')', '')
            legend[identifier] = tr.find_all('td')[1].text.strip()
        canteen.setLegendData(legend)
        mensa_data = document.find('table', 'menu')
        category = None
        for menu_tr in mensa_data.find_all('tr'):
            if menu_tr.find('td', 'headline'):
                continue
            if menu_tr.find('td', 'gericht').text:
                category = menu_tr.find('td', 'gericht').text
            data = menu_tr.find('td', 'beschreibung')
            name = data.find('span').text.strip()
            notes = [span['title'] for span in data.find_all('span', title=True)]
            canteen.addMeal(
                date, category, name, notes,
                prices.get(category.replace('Aktionsessen',
                                            'Bio-/Aktionsgericht'), {}))
        date += datetime.date.resolution
        if today:
            break
    return canteen.toXMLFeed()
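# --- Hedged context sketch (not part of the original modules) ---
# The parsers in this file are shown without their module preludes. A minimal
# sketch of the imports and module-level names they assume follows; the site
# root in `base` and the regex pattern are illustrative assumptions, only the
# names themselves are referenced by the parsers.
import datetime
import re
import sys
from urllib.error import HTTPError
from urllib.request import urlopen

from bs4 import BeautifulSoup
from pyopenmensa.feed import LazyBuilder

base = 'https://www.example-studentenwerk.de'      # assumption: site root
price_regex = re.compile(r'(?P<price>\d+,\d{2})')  # assumption: "3,20" style
roles = ('student', 'employee', 'other')


def parse(data, parser='html.parser'):
    # thin wrapper so the parsers can call parse(...) like the originals do
    return BeautifulSoup(data, parser)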
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read())
    for day_div in document.find_all('div', 'day') + document.find_all(
            'article', attrs={'data-day': True}):
        # parse the date; the year is missing from the markup and must be inferred
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date')
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = "{}-{}-{}".format(year, date_test.group('month'),
                                     date_test.group('day'))
        if 'nodata' in day_div.attrs.get('class', []) \
                or 'GESCHLOSSEN' in day_div.text:
            canteen.setDayClosed(date)
            continue
        closed_candidate = False
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            if 'geschlossen' in name:
                closed_candidate = True
                continue
            category = meal_article.find('div')['title']
            notes = [v['title'] for v in meal_article.find_all('div', 'theicon')
                     if v['title']]
            if meal_article.find('div', 'additive'):
                notes += [v[0] for v in extra_regex.findall(
                    meal_article.find('div', 'additive').text)]
            price_div = meal_article.find('div', 'price')
            if price_div is None:
                canteen.addMeal(date, category, name, notes)
                continue
            prices = {}
            for v, r in (('default', 'student'), ('bed', 'employee'),
                         ('guest', 'other')):
                price = price_regex.search(price_div['data-' + v])
                if price:
                    prices[r] = price.group('price')
                elif v == 'default':
                    prices = {}
                    break
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    content = urlopen(url).read()
    document = parse(content, 'lxml')
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {int(v[0]): v[1] for v in
                       reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}
    canteen = LazyBuilder()
    for day_td in document.find_all('td', text=day_regex):
        date = day_regex.search(day_td.string).group('date')
        table = None
        for element in day_td.parents:
            if element.name == 'table':
                table = element
                break
        if not table:
            continue
        for tr in table.tbody.find_all('tr'):
            if 'geschlossen' in tr.text or 'Feiertage' in tr.text:
                match = day_range_regex.search(tr.text)
                if not match:
                    canteen.setDayClosed(date)
                else:
                    fromDate = datetime.datetime.strptime(
                        match.group('from'), '%d.%m.%Y')
                    toDate = datetime.datetime.strptime(
                        match.group('to'), '%d.%m.%Y')
                    while fromDate <= toDate:
                        canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                        fromDate += datetime.date.resolution
                continue
            if len(tr) != 2:
                continue  # no meal
            strings = list(tr.contents[0].strings)
            name = strings[0]
            # prices:
            prices = strings[-1].split('|')
            if '-' in map(lambda v: v.strip(), prices):
                prices = {}
            # notes:
            notes = []
            for img in tr.contents[1].find_all('img'):
                notes.append(img['alt'].replace('Symbol', '').strip())
            for extra in list(set(map(lambda v: int(v),
                                      extra_regex.findall(tr.text)))):
                if extra in extraLegend:
                    notes.append(extraLegend[extra])
            canteen.addMeal(date, 'Hauptgerichte', name, notes, prices,
                            roles if prices else None)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    base_data = load_base_data()
    canteen = LazyBuilder()
    with urlopen(url) as response:
        data = json.loads(response.read().decode())
    for day in data['days']:
        date = datetime.datetime.strptime(day['date'], UTC_DATE_STRING).date()
        if today and (datetime.date.today() != date):
            continue
        for counter in day['counters']:
            counter_name = counter['displayName']
            counter_description = counter['description']
            counter_hours = counter.get('openingHours')
            for meal in counter['meals']:
                if 'knownMealId' in meal:
                    # This is meant to allow recognizing recurring meals,
                    # for features like marking meals as favorites.
                    # Up to now, not really used in the mensaar.de API,
                    # nor functional in this API parser.
                    # The meal will still be recognized as every other meal.
                    print('knownMealId: %s' % meal['knownMealId'],
                          file=sys.stderr)
                meal_name = meal['name']
                if 'category' in meal:
                    meal_name = '%s: %s' % (meal['category'], meal_name)
                meal_notes = (
                    # The description is typically the location
                    # (but not required to be by the API specification).
                    build_location(counter_description) +
                    build_hours(counter_hours) +
                    build_notes(base_data, meal['notices'], meal['components']))
                meal_prices = {}
                if 'prices' in meal:
                    prices = meal['prices']
                    for role in prices:
                        if role in ROLES:
                            meal_prices[base_data['roles'][role]] = prices[role]
                if 'pricingNotice' in meal:
                    meal_notes.append(meal['pricingNotice'])
                canteen.addMeal(date, counter_name, meal_name, meal_notes,
                                meal_prices)
    return canteen.toXMLFeed()
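# Hedged illustration for the JSON parser above: a minimal payload carrying
# only the keys the parser reads. The real mensaar.de response is richer;
# the role key ('s') and the openingHours shape here are assumptions, not
# the API specification.
sample = {
    "days": [{
        "date": "2024-03-01T00:00:00.000Z",
        "counters": [{
            "displayName": "Menü 1",
            "description": "Ausgabe Untergeschoss",
            "openingHours": {"start": "11:30", "end": "14:15"},
            "meals": [{
                "name": "Spaghetti Bolognese",
                "category": "Hauptgericht",
                "notices": [],
                "components": [],
                "prices": {"s": "2,75"},
            }],
        }],
    }]
}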
def parsePlan(url, internalMensaId, today):
    canteen = LazyBuilder()
    while url is not None:
        dom = BeautifulSoup(urlopen(url).read(), 'lxml')
        date = dom.select('#mensa_date > p')[0].contents[0]
        menuDefinition = dom.find(id=internalMensaId)
        menuDescription = menuDefinition.parent.find('dd')
        tables = menuDescription.select('table')
        legend = {}
        legend = buildLegend(
            legend, str(dom),
            regex=r'<strong>(?P<name>\w+)\s*</strong>\s*-\s*(?P<value>[\w\s)(]+)')
        if tables is not None and len(tables) == 1:
            table = tables[0]
            rows = table.find_all('tr')
            for row in rows:
                menuNameElement = row.select('td[class="mensa_col_55"] > b')
                if menuNameElement is not None \
                        and menuNameElement[0].contents is not None:
                    menuName = menuNameElement[0].contents[0]
                    category = 'Gericht'
                    # get notes
                    notes = {}
                    notesElement = row.select('td[class="mensa_col_55"] > span')
                    if notesElement is not None and len(notesElement) > 0 \
                            and notesElement[0].text is not None:
                        notes = [legend.get(n, n)
                                 for n in notesElement[0].text.split(' ') if n]
                    # get prices
                    prices = {}
                    for td in row.select('td[class="mensa_col_15"]'):
                        priceElement = td.find('b')
                        groupElement = td.find('span')
                        if priceElement is not None and groupElement is not None \
                                and groupElement.contents \
                                and priceElement.contents:
                            group = str(groupElement.contents[0])
                            price = str(priceElement.contents[0])
                            if group == 'Stud.:':
                                prices['student'] = price
                            elif group == 'Bed.:':
                                prices['employee'] = price
                            elif group == 'Gast:':
                                prices['other'] = price
                    canteen.addMeal(date, category, menuName, notes, prices)
        else:
            canteen.setDayClosed(date)
        # check for further pages
        nextPageLink = dom.find(id='next_day_link')
        if nextPageLink is None or today:
            url = None
        else:
            url = 'https://www.studentenwerk-rostock.de/' + nextPageLink['href']
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read(), 'lxml')
    for day_div in document.find_all('div', attrs={'data-day': True}):
        # parse the date; the year is missing from the markup and must be inferred
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date "{}"'.format(day_div['data-day']))
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = '{}-{}-{}'.format(year, date_test.group('month'),
                                     date_test.group('day'))
        closed_candidate = day_div.find('div', 'holiday') is not None
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            category = meal_article.find('div', 'icon')['title']
            notes = []
            prices = {}
            additives = meal_article.find('div', 'additnr')
            if additives:
                notes += [additive.text for additive in additives.find_all('li')]
            notes += [v['title'] for v in meal_article.find_all('div', 'theicon')
                      if v['title'] and v['title'] not in notes]
            price_div = meal_article.find('div', 'price')
            if price_div:
                for k, v in price_map.items():
                    price = price_div['data-' + k]
                    if price:
                        prices[v] = price
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    content = urlopen(url).read()
    document = parse(content)
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {int(v[0]): v[1] for v in
                       reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}
    canteen = LazyBuilder()
    for day_td in document.find_all('td', text=day_regex):
        date = day_regex.search(day_td.string).group('date')
        table = None
        for element in day_td.parents:
            if element.name == 'table':
                table = element
                break
        if not table:
            continue
        for tr in table.tbody.find_all('tr'):
            if 'geschlossen' in tr.text or 'Feiertage' in tr.text:
                match = day_range_regex.search(tr.text)
                if not match:
                    canteen.setDayClosed(date)
                else:
                    fromDate = datetime.datetime.strptime(
                        match.group('from'), '%d.%m.%Y')
                    toDate = datetime.datetime.strptime(
                        match.group('to'), '%d.%m.%Y')
                    while fromDate <= toDate:
                        canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                        fromDate += datetime.date.resolution
                continue
            if len(tr) != 3:
                continue  # no meal
            strings = list(tr.contents[0].strings)
            name = strings[0]
            # prices:
            prices = strings[-1].split('|')
            if '-' in map(lambda v: v.strip(), prices):
                prices = {}
            # notes:
            notes = []
            for img in tr.contents[1].find_all('img'):
                notes.append(img['alt'].replace('Symbol', '').strip())
            for extra in list(set(map(lambda v: int(v),
                                      extra_regex.findall(tr.text)))):
                if extra in extraLegend:
                    notes.append(extraLegend[extra])
            canteen.addMeal(date, 'Hauptgerichte', name, notes, prices,
                            roles if prices else None)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    try:
        xml_data = urlopen(url).read()
    except Exception:
        return canteen.toXMLFeed()
    root = ET.fromstring(xml_data)
    for day in root:
        date = time.strftime('%d.%m.%Y',
                             time.localtime(int(day.get('timestamp'))))
        for item in day:
            title = item.find('title').text
            description = get_description(title)
            notes = build_notes_string(title)
            plist = [item.find('preis1').text,
                     item.find('preis2').text,
                     item.find('preis3').text]
            food_type = get_food_types(item.find('piktogramme').text)
            canteen.addMeal(date, food_type, description, notes, plist, roles)
    return canteen.toXMLFeed()
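# Quick illustration of the timestamp conversion used above: each <day>
# element carries a Unix timestamp attribute, rendered via the local timezone.
import time
time.strftime('%d.%m.%Y', time.localtime(1709251200))
# -> '01.03.2024' on a machine whose local time is UTC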
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read())
    for day_div in document.find_all('div', 'day') + document.find_all(
            'article', attrs={'data-day': True}):
        # parse the date; the year is missing from the markup and must be inferred
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date')
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = "{}-{}-{}".format(year, date_test.group('month'),
                                     date_test.group('day'))
        if 'nodata' in day_div.attrs.get('class', []) \
                or 'GESCHLOSSEN' in day_div.text:
            canteen.setDayClosed(date)
            continue
        closed_candidate = False
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            if 'geschlossen' in name:
                closed_candidate = True
                continue
            category = meal_article.find('div', 'desc').text
            notes = [v['title'] for v in meal_article.find_all('div', 'theicon')
                     if v['title']]
            if meal_article.find('div', 'additive'):
                notes += [v[0] for v in extra_regex.findall(
                    meal_article.find('div', 'additive').text)]
            price_div = meal_article.find('div', 'price')
            if price_div is None:
                canteen.addMeal(date, category, name, notes)
                continue
            prices = {}
            for v, r in (('default', 'student'), ('bed', 'employee'),
                         ('guest', 'other')):
                price = price_regex.search(price_div['data-' + v])
                if price:
                    prices[r] = price.group('price')
                elif v == 'default':
                    prices = {}
                    break
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
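# Standalone sketch of the year-inference rule shared by the parsers above:
# the markup carries only day and month, so a month lying behind the current
# one is taken to belong to the next year. The day_regex pattern here is an
# assumption; the original modules define their own.
import datetime
import re

day_regex = re.compile(r'(?P<day>\d{2})\.(?P<month>\d{2})')


def infer_iso_date(data_day, now=None):
    now = now or datetime.datetime.now()
    m = day_regex.search(data_day)
    year = now.year
    if now.month > int(m.group('month')):
        year += 1  # e.g. '05.01' scraped in December belongs to next January
    return '{}-{}-{}'.format(year, m.group('month'), m.group('day'))

# infer_iso_date('05.01', datetime.datetime(2023, 12, 20)) -> '2024-01-05'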
def main(url='https://www.stw-bremen.de/de/essen-trinken/mensa-nw-1', out='xml'):
    # TODO: replace ids with a findall food-plan-* wildcard
    data = {}  # dict to store the parsed data
    s = requests.session()
    r = s.get(url)  # GET request to the stw server
    html = r.content  # the raw html of the returned page
    soup = BeautifulSoup(html, 'html.parser')  # source code parser
    canteen = LazyBuilder()
    days = soup.find_all(id=re.compile("^food-plan-"))
    for html_day in days:  # one entry per day
        date_id = html_day['id']  # e.g. food-plan-3
        workday_offset = int(date_id.split('-')[-1])
        date = get_date_from_id(workday_offset)
        date_str = dt.datetime.strftime(date, '%Y-%m-%d')
        data[date_str] = {}  # init dict for each day
        # The information for each meal is stored in a separate table with
        # class food-category; loop over all of them instead of hardcoding.
        html_meals = html_day.find_all("table", "food-category")
        for meal in html_meals:  # meal is still an html fragment
            category_name = meal.find('th', 'category-name').string
            meal_text = ''
            # since there are added line breaks and <sup> tags, use the
            # strings generator instead of the get_text() or .text methods
            meal_parts = meal.find('td', 'field-name-field-description').strings
            for m in meal_parts:  # m is an iterable part of the html contents
                if not m.parent.name == 'sup':
                    meal_text += str(m)
            # remove win/unix line breaks and separators
            meal_text = meal_text.replace('\r', '')
            meal_text = meal_text.replace('\n', ' ')
            meal_text = meal_text.replace('* * *', '; ')
            meal_price_a = meal.find('td', 'field-name-field-price-students').text
            meal_price_b = meal.find('td', 'field-name-field-price-employees').text
            m = {'text': meal_text, 'A': meal_price_a, 'B': meal_price_b}
            data[date_str][category_name] = m
            # Use LazyBuilder:
            canteen.addMeal(date, category_name, meal_text,
                            prices={'student': meal_price_a,
                                    'employee': meal_price_b})
    om = canteen.toXMLFeed()
    j = json.dumps(data, ensure_ascii=False)  # dumps (dump would save to a file)
    if out == 'xml':
        return om
    elif out == 'json':
        return j
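# Hypothetical usage of the Bremen scraper above: the same call renders either
# the OpenMensa XML feed or the intermediate JSON dump.
if __name__ == '__main__':
    print(main(out='xml'))     # OpenMensa feed
    # print(main(out='json'))  # raw {date: {category: {...}}} structure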
def parse_url(url, today=False):
    canteen = LazyBuilder()
    # prices are stored on a separate page
    document = parse(urlopen(base + '/mensa-preise/').read(), 'lxml')
    prices = {}
    for tr in document.find('div', 'ce-bodytext').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) \
                or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')

    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read(), 'lxml')
            errorCount = 0
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.timedelta(days=1)
                continue
            else:
                raise e

        # extract legend
        legend = {}
        legends = document.find('div', 'tx-stwm-speiseplan')
        additions = legends.find('div', 'c-schedule__filter-body')
        for table in additions.find_all('div', 'c-schedule__filter-item'):
            for ingredient in table.find('ul').find_all('li'):
                name = ingredient.find('dt').text.strip()
                description = ingredient.find('dd').text.strip()
                legend[name] = description
        for label in legends.find('ul', 'c-schedule__type-list').find_all('li'):
            name = label.find('dt').text.replace('(', '').replace(')', '').strip()
            description = label.find('dd').text.strip()
            legend[name] = description

        # extract meals
        mensa_data = document.find('ul', 'c-schedule__list')
        category = None
        for meal in mensa_data.find_all('li'):
            # update category or use the previous one if not specified
            category_text = meal.find('dt', 'c-schedule__term').text.strip()
            if category_text:
                category = category_text
            data = meal.find('dd').find('p', 'js-schedule-dish-description')
            name = data.contents[0].strip()  # name is the first text node
            if not name:
                continue
            # notes are contained in 3 boxes (type, additional, allergen) and
            # are comma-separated lists enclosed in brackets or parentheses
            notes = []
            for note in meal.find_all('span', 'c-schedule__marker'):
                note_text = note.find('span', 'u-text-sup').text \
                    .replace('(', '').replace(')', '') \
                    .replace('[', '').replace(']', '')
                notes += [n for n in note_text.split(',') if n]
            # some meals carry the GQB label in their name (instead of in notes)
            if '(GQB)' in name:
                name = name.replace('(GQB)', '').strip()
                notes.append('GQB')
            # the price for both meal types is listed as Bio-/Aktionsgericht
            price_category = category \
                .replace('Aktionsessen', 'Bio-/Aktionsgericht') \
                .replace('Biogericht', 'Bio-/Aktionsgericht') \
                .strip()
            canteen.addMeal(date, category, name,
                            [legend.get(n, n) for n in notes],
                            prices.get(price_category, {}))
        date += datetime.timedelta(days=1)
        if today:
            break
    return canteen.toXMLFeed()
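# Generic sketch of the retry pattern used above (names are illustrative):
# walk forward one day at a time and give up only after seven consecutive
# 404s, i.e. a full week without any plan data.
import datetime
from urllib.error import HTTPError
from urllib.request import urlopen


def iter_daily_documents(url_template, start=None, max_misses=7):
    date = start or datetime.date.today()
    misses = 0
    while misses < max_misses:
        try:
            yield date, urlopen(url_template.format(date)).read()
            misses = 0
        except HTTPError as e:
            if e.code != 404:
                raise
            misses += 1
        date += datetime.timedelta(days=1)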
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {
        '1': 'mit Farbstoff', '2': 'mit Konservierungsstoff',
        '3': 'mit Antioxidationsmittel', '4': 'mit Geschmacksverstärker',
        '5': 'geschwefelt', '6': 'geschwärzt', '7': 'gewachst',
        '8': 'mit Phosphat', '9': 'mit Süssungsmittel Saccharin',
        '10': 'mit Süssungsmittel Aspartam, enth. Phenylalaninquelle',
        '11': 'mit Süssungsmittel Cyclamat',
        '12': 'mit Süssungsmittel Acesulfam',
        '13': 'chininhaltig', '14': 'coffeinhaltig',
        '15': 'gentechnisch verändert', '16': 'enthält Sulfite',
        '17': 'enthält Phenylalanin',
        'A': 'Gluten', 'B': 'Krebstiere', 'C': 'Eier', 'D': 'Fisch',
        'E': 'Erdnüsse', 'F': 'Soja', 'G': 'Milch und Milchprodukte',
        'H': 'Schalenfrüchte', 'I': 'Sellerie', 'J': 'Senf', 'K': 'Sesamsamen',
        'L': 'Schwefeldioxid und Sulfite', 'M': 'Lupinen', 'N': 'Weichtiere',
        'ZTA': 'Aktionsgericht',
        'ZTB': 'mit ausschließlich biologisch erzeugten Rohstoffen',
        'ZTF': 'Fisch', 'ZTG': 'Geflügel', 'ZTL': 'Lamm',
        'ZTMSC': 'zertifizierte nachhaltige Fischerei (MSC-C-53400)',
        'ZTMV': 'Mensa Vital', 'ZTR': 'Rindfleisch', 'ZTS': 'Schweinefleisch',
        'ZTV': 'vegetarisch', 'ZTVG': 'vegan', 'ZTW': 'Wild'
    }

    # Create regular expressions for categories
    hg = re.compile("^HG[1-9]$")
    b = re.compile("^B[1-9]$")
    n = re.compile("^N[1-9]$")

    # Get the current and the next isoweek and try to fetch the data.
    # On error 404 continue with the next isoweek.
    # Returns an empty feed if both isoweeks result in error 404;
    # at most locations the data doesn't exist during term break.
    for w in 0, 1:
        kw = (date.today() + timedelta(weeks=w)).isocalendar()[1]
        try:
            f = urlopen('%(location)s/%(isoweek)d.csv' %
                        {'location': url, 'isoweek': kw})
        except HTTPError as e:
            if e.code == 404:
                continue
            else:
                raise e

        # Decode data from ISO charset
        f = f.read().decode('iso8859-1')

        # Set roles for prices
        roles = ('student', 'employee', 'other')

        # Read csv data and skip the csv header
        mealreader = reader(f.splitlines(), delimiter=';')
        next(mealreader)

        for row in mealreader:
            mdate = row[0]
            category = row[2]
            mname = row[3]
            mtype = row[4]
            prices = [row[6], row[7], row[8]]

            # determine the category of the current meal
            if category == 'Suppe':
                pass
            elif hg.match(category):
                category = 'Hauptgerichte'
            elif b.match(category):
                category = 'Beilagen'
            elif n.match(category):
                category = 'Nachspeisen'
            else:
                raise RuntimeError('Unknown category: ' + str(category))

            # Extract the notes from brackets in the meal name,
            # remove the brackets and notes, and improve readability
            notes = []
            bpos = mname.find(')')
            while bpos != -1:
                apos = mname.find('(')
                # Extract notes from the current brackets; avoid empty notes
                for i in mname[apos + 1:bpos].split(','):
                    if i:
                        notes.append(i)
                # Check whether the brackets end the meal name
                if bpos == len(mname) - 1:
                    # Remove the brackets and break the bracket loop
                    mname = mname[:apos]
                    bpos = -1
                else:
                    # Remove the current brackets, improve readability
                    # and find the next brackets
                    mname = mname[:apos].rstrip() + ' und ' \
                        + mname[bpos + 1:].lstrip()
                    bpos = mname.find(')')

            # Remove trailing whitespace in the meal name
            mname = mname.rstrip()

            # Add meal type notes to the notes list; avoid empty notes
            for i in mtype.split(','):
                if i:
                    notes.append('ZT' + i)

            # Translate notes via the legend into human-readable information
            mnotes = []
            for i in notes:
                mnotes.append(legend.get(i, legend.get(i[2:], i)))

            # Try to add the meal
            try:
                canteen.addMeal(mdate, category, mname, mnotes, prices, roles)
            except ValueError as e:
                print('could not add meal {}/{} "{}" due to "{}"'.format(
                    mdate, category, mname, e), file=sys.stderr)
                # empty meal ...
                pass

    # return xml data
    return canteen.toXMLFeed()
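# Standalone sketch of the bracket handling above: notes in parentheses are
# pulled out of the meal name and the remaining fragments are rejoined.
def split_name_and_notes(mname):
    notes = []
    bpos = mname.find(')')
    while bpos != -1:
        apos = mname.find('(')
        notes += [i for i in mname[apos + 1:bpos].split(',') if i]
        if bpos == len(mname) - 1:
            mname = mname[:apos]
            bpos = -1
        else:
            mname = mname[:apos].rstrip() + ' und ' + mname[bpos + 1:].lstrip()
            bpos = mname.find(')')
    return mname.rstrip(), notes

# split_name_and_notes('Gulasch (R,S) mit Nudeln (1)')
# -> ('Gulasch und mit Nudeln', ['R', 'S', '1'])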
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {
        '1': 'mit Farbstoff', '2': 'mit Konservierungsstoff',
        '3': 'mit Antioxidationsmittel', '4': 'mit Geschmacksverstärker',
        '5': 'geschwefelt', '6': 'geschwärzt', '7': 'gewachst',
        '8': 'mit Phosphat', '9': 'mit Süssungsmittel Saccharin',
        '10': 'mit Süssungsmittel Aspartam, enth. Phenylalaninquelle',
        '11': 'mit Süssungsmittel Cyclamat',
        '12': 'mit Süssungsmittel Acesulfam',
        '13': 'chininhaltig', '14': 'coffeinhaltig',
        '15': 'gentechnisch verändert', '16': 'enthält Sulfite',
        '17': 'enthält Phenylalanin',
        'A': 'Gluten', 'AA': 'Weizen', 'AB': 'Roggen', 'AC': 'Gerste',
        'AD': 'Hafer', 'AE': 'Dinkel', 'AF': 'Kamut',
        'B': 'Krebstiere', 'C': 'Eier', 'D': 'Fisch', 'E': 'Erdnüsse',
        'F': 'Soja', 'G': 'Milch und Milchprodukte',
        'H': 'Schalenfrüchte', 'HA': 'Mandel', 'HB': 'Haselnuss',
        'HC': 'Walnuss', 'HD': 'Cashew', 'HE': 'Pecannuss', 'HF': 'Paranuss',
        'HG': 'Pistazie', 'HH': 'Macadamianuss', 'HI': 'Queenslandnuss',
        'I': 'Sellerie', 'J': 'Senf', 'K': 'Sesamsamen',
        'L': 'Schwefeldioxid und Sulfite', 'M': 'Lupinen', 'N': 'Weichtiere',
        'O': 'Nitrat', 'P': 'Nitritpökelsalz',
        'ZTA': 'Alkohol',
        'ZTB': 'mit ausschließlich biologisch erzeugten Rohstoffen',
        'ZTF': 'Fisch', 'ZTG': 'Geflügel', 'ZTL': 'Lamm',
        'ZTMSC': 'zertifizierte nachhaltige Fischerei (MSC-C-53400)',
        'ZTMV': 'Mensa Vital', 'ZTR': 'Rindfleisch', 'ZTS': 'Schweinefleisch',
        'ZTV': 'vegetarisch', 'ZTVG': 'vegan', 'ZTW': 'Wild'
    }

    # Create regular expressions for categories
    hg = re.compile("^HG[1-9]$")
    b = re.compile("^B[1-9]$")
    n = re.compile("^N[1-9]$")

    # Get two weeks for full.xml and only the current one for today.xml.
    # On error 404 continue with the next isoweek.
    # Returns an empty feed if both isoweeks result in error 404;
    # at most locations the data doesn't exist during term break.
    weeks = 1 if today else 2
    for w in range(0, weeks):
        kw = (date.today() + timedelta(weeks=w)).isocalendar()[1]
        try:
            f = urlopen('%(location)s/%(isoweek)d.csv' %
                        {'location': url, 'isoweek': kw})
        except HTTPError as e:
            if e.code == 404:
                continue
            else:
                raise e

        # Decode data from ISO charset
        f = f.read().decode('iso8859-1')

        # Set roles for prices
        roles = ('student', 'employee', 'other')

        # Read csv data and skip the csv header
        mealreader = reader(f.splitlines(), delimiter=';')
        next(mealreader)

        for row in mealreader:
            mdate = row[0]
            category = row[2]
            mname = row[3]
            mtype = row[4]
            prices = [row[6], row[7], row[8]]

            # determine the category of the current meal
            if category == 'Suppe':
                pass
            elif hg.match(category):
                category = 'Hauptgerichte'
            elif b.match(category):
                category = 'Beilagen'
            elif n.match(category):
                category = 'Nachspeisen'
            else:
                raise RuntimeError('Unknown category: ' + str(category))

            # Extract the notes from brackets in the meal name,
            # remove the brackets and notes, and improve readability
            notes = []
            bpos = mname.find(')')
            while bpos != -1:
                apos = mname.find('(')
                # Extract notes from the current brackets; avoid empty notes
                for i in mname[apos + 1:bpos].split(','):
                    if i:
                        notes.append(i)
                # Check whether the brackets end the meal name
                if bpos == len(mname) - 1:
                    # Remove the brackets and break the bracket loop
                    mname = mname[:apos]
                    bpos = -1
                else:
                    # Remove the current brackets, improve readability
                    # and find the next brackets
                    mname = mname[:apos].rstrip() + ' und ' \
                        + mname[bpos + 1:].lstrip()
                    bpos = mname.find(')')

            # Remove trailing whitespace in the meal name
            mname = mname.rstrip()

            # Add meal type notes to the notes list; avoid empty notes
            for i in mtype.split(','):
                if i:
                    notes.append('ZT' + i)

            # Translate notes via the legend into human-readable information
            mnotes = []
            for i in notes:
                mnotes.append(legend.get(i, legend.get(i[2:], i)))

            # Try to add the meal
            try:
                canteen.addMeal(mdate, category, mname, mnotes, prices, roles)
            except ValueError as e:
                print('could not add meal {}/{} "{}" due to "{}"'.format(
                    mdate, category, mname, e), file=sys.stderr)
                # empty meal ...
                pass

    # return xml data
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {
        '1': 'mit Farbstoff', '2': 'mit Konservierungsstoff',
        '3': 'mit Antioxidationsmittel', '4': 'mit Geschmacksverstärker',
        '5': 'geschwefelt', '6': 'geschwärzt', '7': 'gewachst',
        '8': 'mit Phosphat', '9': 'mit Süssungsmittel Saccharin',
        '10': 'mit Süssungsmittel Aspartam, enth. Phenylalaninquelle',
        '11': 'mit Süssungsmittel Cyclamat',
        '12': 'mit Süssungsmittel Acesulfam',
        '13': 'chininhaltig', '14': 'coffeinhaltig',
        '15': 'gentechnisch verändert', '16': 'enthält Sulfite',
        '17': 'enthält Phenylalanin',
        'A': 'Gluten', 'B': 'Krebstiere', 'C': 'Eier', 'D': 'Fisch',
        'E': 'Erdnüsse', 'F': 'Soja', 'G': 'Milch und Milchprodukte',
        'H': 'Schalenfrüchte', 'I': 'Sellerie', 'J': 'Senf', 'K': 'Sesamsamen',
        'L': 'Schwefeldioxid und Sulfite', 'M': 'Lupinen', 'N': 'Weichtiere',
        'ZTA': 'Aktionsgericht',
        'ZTB': 'mit ausschließlich biologisch erzeugten Rohstoffen',
        'ZTF': 'Fisch', 'ZTG': 'Geflügel', 'ZTL': 'Lamm',
        'ZTMSC': 'zertifizierte nachhaltige Fischerei (MSC-C-53400)',
        'ZTMV': 'Mensa Vital', 'ZTR': 'Rindfleisch', 'ZTS': 'Schweinefleisch',
        'ZTV': 'vegetarisch', 'ZTVG': 'vegan', 'ZTW': 'Wild'
    }
    #canteen.setLegendData(legend)
    hg = re.compile("^HG[1-9]$")
    b = re.compile("^B[1-9]$")
    n = re.compile("^N[1-9]$")
    #for w in 0, 1:
    for w in [0]:
        kw = (date.today() + timedelta(weeks=w)).isocalendar()[1]
        try:
            f = urlopen('%(location)s/%(isoweek)d.csv' %
                        {'location': url, 'isoweek': kw})
        except HTTPError as e:
            if e.code == 404:
                continue
            else:
                raise e
        f = f.read().decode('iso8859-1')
        roles = ('student', 'employee', 'other')
        initline = True
        mealreader = reader(f.splitlines(), delimiter=';')
        for row in mealreader:
            if initline:
                initline = False
            else:
                if row[2] == 'Suppe':
                    category = 'Suppe'
                elif hg.match(row[2]):
                    category = 'Hauptgerichte'
                elif b.match(row[2]):
                    category = 'Beilagen'
                elif n.match(row[2]):
                    category = 'Nachspeisen'
                else:
                    raise RuntimeError('Unknown category: ' + str(row[2]))
                mdate = row[0]
                notes = []
                mname = row[3]
                bpos = mname.find(')')
                while bpos != -1:
                    apos = mname.find('(')
                    for i in mname[apos + 1:bpos].split(','):
                        notes.append(i)
                    if bpos == len(mname) - 1:
                        mname = mname[:apos] + mname[bpos + 1:]
                        bpos = -1
                    else:
                        mname = mname[:apos] + ' und ' + mname[bpos + 1:]
                        bpos = mname.find(')')
                if mname.rfind(' ') == len(mname) - 1:
                    mname = mname[:len(mname) - 1]
                mtype = row[4]
                if mtype != '':
                    for i in mtype.split(','):
                        notes.append('ZT' + i)
                prices = [row[6], row[7], row[8]]
                mnotes = []
                for i in notes:
                    mnotes.append(legend.get(i, legend.get(i[2:], i)))
                try:
                    canteen.addMeal(mdate, category, mname, mnotes,
                                    prices, roles)
                except ValueError as e:
                    print('could not add meal {}/{} "{}" due to "{}"'.format(
                        mdate, category, mname, e), file=sys.stderr)
                    # empty meal ...
                    pass
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    today = datetime.date.today()
    if today.weekday() == 6:  # Sunday
        today += datetime.timedelta(days=1)  # Tomorrow
    url = url % today.strftime('%Y_%m_%d')
    if not url.startswith("http://") and not url.startswith("https://"):
        raise RuntimeError("url is not an allowed URL: '%s'" % url)
    try:
        content = requests.get(url).text
    except requests.exceptions.ConnectionError as e:
        logging.warning(e)
        content = requests.get(url, verify=False).text
    document = BeautifulSoup(content, "html.parser")
    canteen = LazyBuilder()

    # Prices for employees and guests
    try:
        p = price_regex.search(
            document.find("p", {"id": "message"}).text).groupdict()
        employee_multiplier = 1.0 + int(p["employee"]) / 100.0
        guest_multiplier = 1.0 + int(p["guest"]) / 100.0
    except (AttributeError, TypeError, KeyError, ValueError):
        employee_multiplier = 1.25
        guest_multiplier = 1.60

    trs = document.find("table", {"id": "previewTable"}).find_all("tr")
    canteenCategories = []
    firstTr = True
    previous = None  # previous tr row
    for tr in trs:
        closed = False
        mealsFound = False
        if firstTr:
            # First table row contains the names of the different categories
            firstTr = False
            for th in tr.find_all("th")[1:]:
                canteenCategories.append(th.text.strip())
        elif previous is None:
            # Normal table row containing meal information
            previous = tr
        else:
            # Price table row
            date = day_regex.search(
                previous.find("td", {"class": "first"})["data-date"]
            ).group('date')
            if "geschlossen" == previous.find_all("td")[1].text.strip():
                closed = date
            cat = 0
            for td0, td1 in zip(previous.find_all("td")[1:], tr.find_all("td")):
                if "heute kein Angebot" in td0.text or "geschlossen" in td0.text:
                    cat += 1
                    continue
                notes = []
                # Category
                if td0.find("h2"):
                    categoryName = canteenCategories[cat] + " " + \
                        correctCapitalization(td0.find("h2").text.strip())
                else:
                    categoryName = canteenCategories[cat]
                if "Kubusangebote am Themenpark" in td0.text:
                    canteen.addMeal(date, categoryName,
                                    "Kubusangebote am Themenpark", [])
                    cat += 1
                    continue
                # Name
                if td0.find("p"):
                    name = removeextras_regex.sub("", td0.find("p").text)
                else:
                    # No name available, let's just use the category name
                    name = categoryName
                # Prices (guard against a missing numeric price, which would
                # otherwise leave `price` unbound below)
                prices = []
                spans = td1.find_all("span", {"class": "label"})
                if spans:
                    price = None
                    try:
                        price = float(euro_regex.search(
                            spans[0].text).group(1).replace(",", "."))
                    except (AttributeError, TypeError, KeyError, ValueError):
                        notes.append(spans[0].text.strip() + " Preis")
                    if len(spans) == 2:
                        notes.append(spans[1].text.strip() + " Preis")
                    if price is not None:
                        prices = (price, price * employee_multiplier,
                                  price * guest_multiplier)
                # Notes: vegan, vegetarisch, ...
                notes += [icon["title"]
                          for icon in td1.find_all("span", {"class": "icon"})]
                canteen.addMeal(date, categoryName, name, notes, prices,
                                roles if prices else None)
                mealsFound = True
                cat += 1
            previous = None
        if not mealsFound and closed:
            canteen.setDayClosed(closed)
    return canteen.toXMLFeed()
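# Worked example of the fallback multipliers above: with the defaults, a
# student price of 3.20 EUR yields the derived employee and guest prices.
student = 3.20
employee_multiplier, guest_multiplier = 1.25, 1.60
prices = (student, student * employee_multiplier, student * guest_multiplier)
# -> (3.20, 4.00, 5.12)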
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {'f': 'fleischloses Gericht', 'v': 'veganes Gericht'}
    document = parse(urlopen(base + '/speiseplan/zusatzstoffe-de.html').read())
    for td in document.find_all('td', 'beschreibung'):
        legend[td.parent.find('td', 'gericht').text] = td.text

    document = parse(urlopen(base + '/mensa-preise/').read())
    prices = {}
    for tr in document.find('div', 'ce-bodytext').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) \
                or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')

    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read())
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.date.resolution
                continue
            else:
                raise e
        else:
            errorCount = 0
        for tr in document.find('table', 'zusatzstoffe').find_all('tr'):
            identifier = tr.find_all('td')[0].text \
                .replace('(', '').replace(')', '')
            legend[identifier] = tr.find_all('td')[1].text.strip()
        canteen.setLegendData(legend)
        mensa_data = document.find('table', 'menu')
        category = None
        for menu_tr in mensa_data.find_all('tr'):
            if menu_tr.find('td', 'headline'):
                continue
            if menu_tr.find('td', 'gericht').text:
                category = menu_tr.find('td', 'gericht').text
            data = menu_tr.find('td', 'beschreibung')
            name = data.find('span').text.strip()
            if not name:
                continue
            notes = [span['title'] for span in data.find_all('span', title=True)]
            canteen.addMeal(
                date, category, name, notes,
                prices.get(category.replace('Aktionsessen',
                                            'Bio-/Aktionsgericht'), {}))
        date += datetime.date.resolution
        if today:
            break
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    content = urlopen(url).read()
    document = parse(content, "lxml")
    canteen = LazyBuilder()
    table = document.find_all('table')[0]

    def debug_print(food_type, food_description, priceing):
        if priceing is None:
            print(date + ': ' + food_type + ": " + food_description)
        else:
            print(date + ': ' + food_type + ": " + food_description + " : ",
                  end='')
            for e in priceing:
                print(e, end=' ')
            print()

    def is_new_entry(tds):
        td = tds[0]
        return td.string is not None and date_regex.search(td.string) is not None

    def is_end_of_entry(tds):
        for td in tds:
            if td.string is None or td.string.strip() != '':
                return False
        return True

    def is_action_entry(td):
        return td.text == 'Aktion'

    def is_closed(tds):
        return is_new_entry(tds) and get_pricing(tds, 4, 7) is None

    def refactor_date(raw_date):
        now = datetime.datetime.now()
        day = date_regex.search(raw_date).group('day')
        month = date_regex.search(raw_date).group('month')
        year = now.year
        if month == '01' and now.month == 12:
            # the list shows meals from this year and the next
            year += 1
        elif month == '12' and now.month == 1:
            # the list shows meals from this year and the last
            year -= 1
        return day + '.' + month + '.' + str(year)

    def parse_foot_type(td):
        type = ''
        if td.string is None:
            if len(td.find_all('img')) == 0:
                return None
            else:
                img = td.find_all('img')[0]
                src = img.get('src')
                if 'msc' in src:
                    type += 'Fish MSC '
                elif 'vegan' in src:
                    type += 'Vegan '
        # Sometimes uncategorized food occurs; we need to cover this case,
        # otherwise openmensa.org will fail due to an empty tag.
        elif td.string.strip() == '':
            type += 'Tipp '
        else:
            if 'R' in td.string:
                type += 'Rind '
            if 'S' in td.string:
                type += 'Schwein '
            if 'G' in td.string:
                type += 'Geflügel '
            if 'V' in td.string:
                type += 'Vegetarisch '
            if 'F' in td.string:
                type += 'Fisch '
            if 'L' in td.string:
                type += 'Lamm '
            if 'W' in td.string:
                type += 'Wild '
        tl = list(type)[:-1]
        return ''.join(tl)

    def get_refs(td):
        return td.find_all('sup')

    def get_foot_description(td):
        refl = get_refs(td)
        description = td.text
        for ref in refl:
            description = description.replace(' ' + ref.text, '', 1)
        if description[0] == ' ':
            description = description.replace(' ', '', 1)
        return description

    def get_notes(td):
        refl = get_refs(td)
        strl = []
        for ref in refl:
            strl.extend(ref.string.split(','))
        strl = list(set(strl))
        return strl

    def build_notes_string(td):
        refs = get_notes(td)
        food_is = ''
        food_contains = ''
        for r in refs:
            # parse "food is ..." footnotes
            if r == '1':
                food_is += 'mit Farbstoffen, '
            elif r == '4':
                food_is += 'geschwärzt, '
            elif r == '7':
                food_is += 'mit Antioxidationsmittel, '
            elif r == '8':
                food_is += 'mit Geschmacksverstärker, '
            elif r == '9':
                food_is += 'geschwefelt, '
            elif r == '10':
                food_is += 'geschwärzt, '
            elif r == '11':
                food_is += 'gewachst, '
            elif r == '12':
                food_is += 'mit Phosphat, '
            elif r == '5':
                food_is += 'mit Süßungsmittel, '
            # parse allergen footnotes
            elif r == 'a1':
                food_contains += 'Gluten, '
            elif r == 'a2':
                food_contains += 'Krebstiere, '
            elif r == 'a3':
                food_contains += 'Eier, '
            elif r == 'a4':
                food_contains += 'Fisch, '
            elif r == 'a5':
                food_contains += 'Erdnüsse, '
            elif r == 'a6':
                food_contains += 'Soja, '
            elif r == 'a7':
                food_contains += 'Milch/Laktose, '
            elif r == 'a8':
                food_contains += 'Schalenfrüchte, '
            elif r == 'a9':
                food_contains += 'Sellerie, '
            elif r == 'a10':
                food_contains += 'Senf, '
            elif r == 'a11':
                food_contains += 'Sesam, '
            elif r == 'a12':
                food_contains += 'Schwefeldioxid/Sulfite, '
            elif r == 'a13':
                food_contains += 'Lupinen, '
            elif r == 'a14':
                food_contains += 'Weichtiere, '
            else:
                food_contains += 'undefinierte Chemikalien:' + r + ', '
        notes = ''
        if food_is != '':
            notes += 'Gericht ist ' + food_is
        if food_contains != '':
            if food_is == '':
                notes += 'Gericht enthält '
            else:
                notes += 'und enthält '
            notes += food_contains
        if notes != '':
            # replace the trailing ', ' with a period
            nl = list(notes)
            del nl[len(nl) - 1]
            nl[len(nl) - 1] = '.'
            notes = ''.join(nl)
        return notes

    def get_pricing(tds, f, t):
        priceing = []
        # sometimes we don't get 7 elements; then this might be a special day
        if len(tds) < 7:
            return None
        for i in range(f, t):
            raw_price = tds[i].string.strip()
            if raw_price == '':
                return None
            else:
                priceing.append(price_regex.search(raw_price).group('val'))
        return priceing

    # state helper
    inside_valide_entry = False
    date = ''
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if is_new_entry(tds):
            try:
                raw_date = tds[0].string
                date = refactor_date(raw_date)
                if is_closed(tds):
                    # sometimes a canteen might look closed, but actually
                    # it is "Spargeltage"
                    if "Spargeltage" in tds[3].text:
                        canteen.addMeal(date, "Spargel", "Spargel Tag",
                                        ["Spargel und andere Gerichte."],
                                        None, None)
                    else:
                        canteen.setDayClosed(date)
                else:
                    inside_valide_entry = True
            except Exception as e:
                traceback.print_exception(*sys.exc_info())
        if is_end_of_entry(tds):
            inside_valide_entry = False
        elif inside_valide_entry:
            try:
                notes = []
                if is_action_entry(tds[0]):
                    food_type = parse_foot_type(tds[1])
                    food_description = get_foot_description(tds[2])
                    notes_string = build_notes_string(tds[2])
                    if notes_string != "":
                        notes.append(notes_string)
                    prices = get_pricing(tds, 3, 6)
                    canteen.addMeal(date, 'Aktion: ' + food_type,
                                    food_description, notes, prices,
                                    roles if prices else None)
                else:
                    food_type = parse_foot_type(tds[2])
                    food_description = get_foot_description(tds[3])
                    notes_string = build_notes_string(tds[3])
                    if notes_string != "":
                        notes.append(notes_string)
                    prices = get_pricing(tds, 4, 7)
                    if food_type is not None:
                        canteen.addMeal(date, food_type, food_description,
                                        notes, prices,
                                        roles if prices else None)
            except Exception as e:
                traceback.print_exception(*sys.exc_info())
    return canteen.toXMLFeed()
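# Hedged illustration of build_notes_string() above: a cell whose <sup>
# footnotes are '1' and 'a7' yields the sentence below (footnote order may
# vary, since the refs pass through set()):
#   'Gericht ist mit Farbstoffen, und enthält Milch/Laktose.'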
def parse_url(url, today=False):
    today = datetime.date.today()
    if today.weekday() == 6:  # Sunday
        today += datetime.timedelta(days=1)  # Tomorrow
    if "%s" in url:
        url = url % today.strftime('%Y_%m_%d')
    try:
        content = requests.get(url).text
    except requests.exceptions.ConnectionError as e:
        logging.warning(str(e))
        content = requests.get(url, verify=False).text
    document = BeautifulSoup(content, "html.parser")
    canteen = LazyBuilder()

    # Prices for employees and guests
    try:
        p = price_employee_regex.search(document.find("main").text).groupdict()
        employee = float(p["employee"].split(",")[0]) + \
            float(p["employee"].split(",")[1]) / 100
        # note: the guest regex reuses the group name "employee"
        p = price_guest_regex.search(document.find("main").text).groupdict()
        guest = float(p["employee"].split(",")[0]) + \
            float(p["employee"].split(",")[1]) / 100
    except (AttributeError, TypeError, KeyError, ValueError):
        employee_multiplier = 1.25
        guest_multiplier = 1.60
        employee = None
        guest = None

    # Date
    p = datespan_regex.search(document.find(
        "div", {"class": "maincontent"}).find("h2").text).groupdict()
    if len(p["from"].split(".")[2]) == 0:
        p["from"] += p["to"].split(".")[2]
    fromdate = datetime.datetime.strptime(p["from"], "%d.%m.%Y")

    maincontent = document.find("div", {"class": "maincontent"})
    table = maincontent.find("table")
    if not table:
        if maincontent:
            # "Die Speisenausgabe DHBW Eppelheim ist vom dd.mm.yyyy –
            # dd.mm.yyyy geschlossen"
            p = datespan_regex.search(maincontent.text)
            if p:
                fromdate = datetime.datetime.strptime(p["from"], "%d.%m.%Y")
                todate = datetime.datetime.strptime(p["to"], "%d.%m.%Y")
                while fromdate <= todate:
                    canteen.setDayClosed(fromdate.strftime('%d.%m.%Y'))
                    fromdate += datetime.timedelta(1)
        return canteen.toXMLFeed()

    trs = table.find_all("tr")
    date = None
    for tr in trs:
        tds = tr.find_all("td")
        if len(tds) == 4:
            td0, td1, td2, td3 = tds
            day = td0.text.strip()
            date = fromdate + datetime.timedelta(days=daysGerman.index(day))
            date = date.strftime('%d.%m.%Y')
        else:
            td0 = None
            td1, td2, td3 = tds
        notes = []
        if "feiertag" in td1.text.lower() or "geschlossen" in td1.text.lower():
            canteen.setDayClosed(date)
            continue
        categoryName = td1.text.strip()[:-1]
        mealName = td2.text.strip()
        if not categoryName or not mealName:
            continue
        prices = []
        try:
            price = float(euro_regex.search(td3.text).group(1).replace(",", "."))
            prices.append(price)
            if employee is not None:
                prices.append(employee)
            else:
                prices.append(price * employee_multiplier)
            if guest is not None:
                prices.append(guest)
            else:
                prices.append(price * guest_multiplier)
        except (AttributeError, TypeError, KeyError, ValueError):
            notes.append(td3.text.strip())
        notes = [x for x in notes if x]
        canteen.addMeal(date, categoryName, mealName,
                        notes if notes else None,
                        prices if prices else None,
                        roles if prices else None)
    return canteen.toXMLFeed()
document = parse(content, 'html.parser')
items = document.find_all('a', {"class": "item"})
for item in items:
    title = item.strong.string
    if not title:
        continue
    numbers = item.small.string
    notes = []
    if numbers:
        for number in numbers.split(','):
            number = int(number.strip())
            if number > len(legend):
                continue
            notes.append(legend[number])
    row = item.parent.parent
    price = row.find_all('td')[-1].string
    prices = {}
    if price:
        subprice = price.split('/')
        if len(subprice) == 3:
            prices = {'student': subprice[0],
                      'employee': subprice[1],
                      'other': subprice[2]}
        else:
            prices = {'other': price}
    canteen.addMeal(datetime.date(date.year, date.month, date.day),
                    "Mittagessen", title, notes=notes, prices=prices)
date = date + datetime.timedelta(1)
print(canteen.toXMLFeed())