def parse_week(url, canteen):
    """Parse the weekly plan at *url* and feed every meal into *canteen*."""
    document = parse(urlopen(url).read())
    for day_table in document.find_all('table', 'speiseplan'):
        try:
            date = extractDate(day_table.thead.tr.th.text)
        except ValueError:
            # There was no valid date in the table header, which happens eg
            # for special "Aktionswoche" tables.
            # TODO: check if this table contains any meals, which was not the
            # case when it was used for the first time.
            continue
        if day_table.find('td', 'keinangebot'):
            canteen.setDayClosed(date)
            continue
        for row in day_table.tbody.children:
            # Rows without any link are decoration, not meals.
            if not (row.find_all('a') or []):
                continue
            raw = row.td.text
            if ': ' in raw:
                category, name = raw.split(': ', 1)
            else:
                category, name = 'Angebote', raw
            # Keep overly long names readable.
            if len(name) > 200:
                name = name[:200] + ' ...'
            notes = [img['title'] for img in row.contents[1].find_all('img')]
            canteen.addMeal(date, category, name, notes,
                            price_regex.findall(row.contents[2].text), roles)
def parse_week(url, canteen):
    """Read the weekly menu page and register its meals with *canteen*."""
    document = parse(urlopen(url).read())
    for table in document.find_all("table", "speiseplan"):
        try:
            day = extractDate(table.thead.tr.th.text)
        except ValueError:
            # There was no valid date in the table header, which happens eg
            # for special "Aktionswoche" tables.
            # TODO: check if this table contains any meals, which was not the
            # case when it was used for the first time.
            continue
        if table.find("td", "keinangebot"):
            canteen.setDayClosed(day)
            continue
        for entry in table.tbody.children:
            # Skip rows that carry no meal link at all.
            if len(entry.find_all("a") or []) < 1:
                continue
            text = entry.td.text
            if ": " in text:
                category, name = text.split(": ", 1)
            else:
                category = "Angebote"
                name = text
            if len(name) > 200:  # truncate very long meal names
                name = name[:200] + " ..."
            notes = [img["title"] for img in entry.contents[1].find_all("img")]
            prices = price_regex.findall(entry.contents[2].text)
            canteen.addMeal(day, category, name, notes, prices, roles)
def parse_week(url, canteen):
    """Parse the card-based weekly plan at *url* into *canteen*."""
    data = urlopen(url).read().decode('utf-8')
    document = parse(data, 'lxml')
    # The day plans live in a div without any class or id, so locate each
    # week by its "Speiseplan ..." heading and work from its parent.
    for week_heading in document(class_='swdd-ueberschrift',
                                 text=speiseplan_regex):
        week_div = week_heading.parent
        # The meals of a day sit in a card; again nothing selectable, so
        # find every card-header (which carries the date) and use its parent.
        for card_header in week_div.find_all(class_='card-header'):
            day_card = card_header.parent
            try:
                date = extractDate(card_header.text)
            except ValueError:
                # No valid date in the card header, which happens eg for
                # special "Aktionswoche" cards.
                # TODO: check if this card contains any meals, which was not
                # the case when it was used for the first time.
                continue
            # A "kein Angebot" item means the canteen is closed that day.
            if day_card.find(class_='list-group-item',
                             text=kein_angebot_regex):
                canteen.setDayClosed(date)
                continue
            # Each list-group-item inside the card is one meal.
            for item in day_card.find_all(class_='list-group-item'):
                span = item.find(name='span')
                if span is None:
                    continue
                name = span.text
                if ': ' in name:
                    category, name = name.split(': ', 1)
                else:
                    category = 'Angebote'
                notes = [img['alt']
                         for img in item.find_all(class_='swdd-spl-symbol')]
                # A trailing "* ..." part of the name is really a note.
                if '* ' in name:
                    name, extra = name.split('* ', 1)
                    notes.append(extra)
                if item.strong is not None:
                    prices = price_regex.findall(item.strong.text)
                else:
                    prices = []
                canteen.addMeal(date, category, name, notes, prices, roles)
def parse_week(url, canteen, type, allergene=None, zusatzstoffe=None):
    """Parse the weekly plan at *url* and add meals for *type* to *canteen*.

    ``allergene`` and ``zusatzstoffe`` map the site's short codes to
    human-readable note texts.  Their defaults are ``None`` sentinels
    instead of mutable ``{}`` literals (same observable behaviour, avoids
    the shared-mutable-default pitfall).
    """
    allergene = {} if allergene is None else allergene
    zusatzstoffe = {} if zusatzstoffe is None else zusatzstoffe
    document = parse(urlopen(url).read(), 'lxml')
    for day_table in document.find_all('table', 'swbs_speiseplan'):
        caption = day_table.find('th', 'swbs_speiseplan_head').text
        if type not in caption:
            continue
        date = extractDate(caption)
        meals = day_table.find_all('tr')
        pos = 0
        while pos < len(meals):
            meal_tr = meals[pos]
            if not meal_tr.find('td'):  # z.B Headline
                pos += 1
                continue
            tds = meal_tr.find_all('td')
            # Drop footnote markers like " (1)" from the category text.
            category = re.sub(r' \(\d\)', '', tds[0].text.strip())
            name = tds[1].text.strip()
            if tds[1].find('a', href='http://www.stw-on.de/mensavital'):
                notes = ['MensaVital']
            else:
                notes = []
            for img in tds[2].find_all('img'):
                title = img['title']
                if ':' in title:
                    # Split only on the first colon so a value containing
                    # further colons cannot raise ValueError.
                    kind, value = title.split(':', 1)
                    if kind == 'Allergene':
                        for allergen in value.split(','):
                            key = allergen.strip()
                            # Fall back to the key without its trailing
                            # letter suffix (e.g. "27a" -> "27").
                            notes.append(allergene.get(key)
                                         or allergene[key[:-1]])
                    elif kind == 'Zusatzstoffe':
                        for zusatzstoff in value.split(','):
                            notes.append(zusatzstoffe[zusatzstoff.strip()])
                    else:
                        print('Unknown image type "{}"'.format(kind))
                else:
                    notes.append(title.replace('enthält ', ''))
            prices = {
                'student': tds[3].text.strip(),
                'employee': tds[4].text.strip(),
                'other': tds[5].text.strip()
            }
            # A follow-up row whose first cell is empty carries extra note
            # images belonging to the current meal.
            if pos < len(meals) - 1:
                nextTds = meals[pos + 1].find_all('td')
                if nextTds[0].text.strip() == '':
                    pos += 1
                    for img in nextTds[1].find_all('img'):
                        notes.append(img['title'])
            pos += 1
            canteen.addMeal(date, category or 'Sonstiges', name, notes,
                            prices)
def parse_week(url, canteen, type, allergene=None, zusatzstoffe=None):
    """Parse the weekly plan at *url*, adding meals matching *type*.

    ``allergene``/``zusatzstoffe`` translate the site's short codes into
    note texts.  ``None`` sentinels replace the former mutable ``{}``
    defaults (behaviour unchanged; avoids shared mutable defaults).
    """
    if allergene is None:
        allergene = {}
    if zusatzstoffe is None:
        zusatzstoffe = {}
    document = parse(urlopen(url).read(), 'lxml')
    for day_table in document.find_all('table', 'swbs_speiseplan'):
        caption = day_table.find('th', 'swbs_speiseplan_head').text
        if type not in caption:
            continue
        date = extractDate(caption)
        meals = day_table.find_all('tr')
        pos = 0
        while pos < len(meals):
            meal_tr = meals[pos]
            if not meal_tr.find('td'):  # z.B Headline
                pos += 1
                continue
            tds = meal_tr.find_all('td')
            # Remove footnote markers such as " (1)" from the category.
            category = re.sub(r' \(\d\)', '', tds[0].text.strip())
            name = tds[1].text.strip()
            if tds[1].find('a', href='http://www.stw-on.de/mensavital'):
                notes = ['MensaVital']
            else:
                notes = []
            for img in tds[2].find_all('img'):
                title = img['title']
                if ':' in title:
                    # maxsplit=1: values containing another colon must not
                    # blow up the unpacking.
                    kind, value = title.split(':', 1)
                    if kind == 'Allergene':
                        for allergen in value.split(','):
                            key = allergen.strip()
                            # Retry without a trailing letter ("27a" -> "27").
                            notes.append(allergene.get(key)
                                         or allergene[key[:-1]])
                    elif kind == 'Zusatzstoffe':
                        for zusatzstoff in value.split(','):
                            notes.append(zusatzstoffe[zusatzstoff.strip()])
                    else:
                        print('Unknown image type "{}"'.format(kind))
                else:
                    notes.append(title.replace('enthält ', ''))
            prices = {
                'student': tds[3].text.strip(),
                'employee': tds[4].text.strip(),
                'other': tds[5].text.strip()
            }
            # A continuation row (empty first cell) holds additional note
            # images for the meal just parsed.
            if pos < len(meals) - 1:
                nextTds = meals[pos + 1].find_all('td')
                if nextTds[0].text.strip() == '':
                    pos += 1
                    for img in nextTds[1].find_all('img'):
                        notes.append(img['title'])
            pos += 1
            canteen.addMeal(date, category or 'Sonstiges', name, notes,
                            prices)
def parse_week(url, canteen, type, allergene=None, zusatzstoffe=None):
    """Parse the weekly plan at *url* and add meals for *type* to *canteen*.

    ``allergene``/``zusatzstoffe`` map short codes to note texts.  The
    defaults are ``None`` sentinels rather than mutable ``{}`` literals
    (same behaviour for every caller, avoids shared mutable defaults).
    """
    allergene = {} if allergene is None else allergene
    zusatzstoffe = {} if zusatzstoffe is None else zusatzstoffe
    document = parse(urlopen(url).read())
    for day_table in document.find_all("table", "swbs_speiseplan"):
        caption = day_table.find("th", "swbs_speiseplan_head").text
        if type not in caption:
            continue
        date = extractDate(caption)
        meals = day_table.find_all("tr")
        pos = 0
        while pos < len(meals):
            meal_tr = meals[pos]
            if not meal_tr.find("td"):  # z.B Headline
                pos += 1
                continue
            tds = meal_tr.find_all("td")
            # Strip footnote markers like " (1)" from the category text.
            category = re.sub(r" \(\d\)", "", tds[0].text.strip())
            name = tds[1].text.strip()
            if tds[1].find("a", href="http://www.stw-on.de/mensavital"):
                notes = ["MensaVital"]
            else:
                notes = []
            for img in tds[2].find_all("img"):
                title = img["title"]
                if ":" in title:
                    # Split on the first colon only, so values with further
                    # colons cannot raise ValueError during unpacking.
                    kind, value = title.split(":", 1)
                    if kind == "Allergene":
                        for allergen in value.split(","):
                            key = allergen.strip()
                            # Fall back to the key without its trailing
                            # letter suffix (e.g. "27a" -> "27").
                            notes.append(allergene.get(key)
                                         or allergene[key[:-1]])
                    elif kind == "Zusatzstoffe":
                        for zusatzstoff in value.split(","):
                            notes.append(zusatzstoffe[zusatzstoff.strip()])
                    else:
                        print('Unknown image type "{}"'.format(kind))
                else:
                    notes.append(title.replace("enthält ", ""))
            prices = {"student": tds[3].text.strip(),
                      "employee": tds[4].text.strip(),
                      "other": tds[5].text.strip()}
            # A follow-up row with an empty first cell carries extra note
            # images for the same meal.
            if pos < len(meals) - 1:
                nextTds = meals[pos + 1].find_all("td")
                if nextTds[0].text.strip() == "":
                    pos += 1
                    for img in nextTds[1].find_all("img"):
                        notes.append(img["title"])
            pos += 1
            canteen.addMeal(date, category, name, notes, prices)
def parse_week(url, canteen):
    """Parse the weekly menu page at *url* and add its meals to *canteen*.

    Fix: guard ``extractDate`` with try/except ValueError — table headers
    without a valid date (e.g. special "Aktionswoche" tables) previously
    raised and aborted parsing of the whole week.
    """
    document = parse(urlopen(url).read())
    for day_table in document.find_all('table', 'speiseplan'):
        try:
            date = extractDate(day_table.thead.tr.th.text)
        except ValueError:
            # No valid date in the table header; skip this table instead of
            # letting the exception kill the remaining days.
            continue
        if day_table.find('td', 'keinangebot'):
            canteen.setDayClosed(date)
            continue
        for meal_tr in day_table.tbody.children:
            # Rows without any link carry no meal.
            if len(meal_tr.find_all('a') or []) < 1:
                continue
            name = meal_tr.td.text
            if ': ' in name:
                category, name = name.split(': ', 1)
            else:
                category = 'Angebote'
            if len(name) > 200:  # truncate overly long meal names
                name = name[:200] + ' ...'
            notes = []
            for img in meal_tr.contents[1].find_all('img'):
                notes.append(img['title'])
            canteen.addMeal(date, category, name, notes,
                            price_regex.findall(meal_tr.contents[2].text),
                            roles)
def parse_dish(dish, canteen):
    """Extract a single dish element and register it with *canteen*."""
    date = extractDate(dish['data-date'])
    title = dish.find(class_='neo-menu-single-title')
    if title is None:
        return
    notes = {abbr['title'] for abbr in title.find_all(name='abbr')}
    name = re.sub(notes_regex, '', title.text.strip())
    if not name:
        return
    # Fix formatting artefacts left over from stripping the notes:
    name = re.sub(whitspace_regex, ' ', name)            # multiple whitespace
    name = re.sub(comma_regex, ', ', name.strip(', '))   # space after comma
    name = re.sub(bracket_regex, ' (', name)
    category = dish.find(class_='neo-menu-single-type')
    if category is not None:
        category = category.text
    else:
        heading = dish.find_previous(name='h2')
        if heading is not None:
            # A side dish: categorise under its preceding heading.
            category = 'Beilagen: ' + heading.text.capitalize()
        else:
            # Just in case nothing matched.
            category = 'Unbekannt'
    price = dish.find(class_='neo-menu-single-price')
    prices = price_regex.findall(price.text) if price is not None else {}
    canteen.addMeal(date, category, name, notes, prices, roles)
def parse_week(url, canteen, type):
    """Parse the weekly plan at *url*, adding meals whose caption matches *type*."""
    document = parse(urlopen(url).read())
    for day_table in document.find_all('table', 'swbs_speiseplan'):
        caption = day_table.find('th', 'swbs_speiseplan_head').text
        if type not in caption:
            continue
        date = extractDate(caption)
        for row in day_table.find_all('tr'):
            if not row.find('td'):  # z.B Headline
                continue
            cells = row.find_all('td')
            category = cells[0].text.strip()
            name = cells[1].text
            is_mensavital = cells[1].find(
                'a', href='http://www.stw-on.de/mensavital')
            notes = ['MensaVital'] if is_mensavital else []
            prices = {
                'student': cells[2].text,
                'employee': cells[3].text,
                'other': cells[4].text
            }
            canteen.addMeal(date, category, name, notes, prices)
def parse_dish(dish, canteen):
    """Parse one dish node and add the resulting meal to *canteen*."""
    date = extractDate(dish['data-date'])
    title_tag = dish.find(class_='neo-menu-single-title')
    if title_tag is None:
        return
    notes = set(abbr['title'] for abbr in title_tag.find_all(name='abbr'))
    name = re.sub(notes_regex, '', title_tag.text.strip())
    if len(name) == 0:
        return
    # Clean up formatting issues introduced by removing the note markers:
    name = re.sub(whitspace_regex, ' ', name)           # collapse whitespace
    name = re.sub(comma_regex, ', ', name.strip(', '))  # whitespace after comma
    name = re.sub(bracket_regex, ' (', name)
    type_tag = dish.find(class_='neo-menu-single-type')
    if type_tag is not None:
        category = type_tag.text
    elif dish.find_previous(name='h2') is not None:
        # A side dish: prefix with its preceding section heading.
        category = 'Beilagen: ' + dish.find_previous(name='h2').text.capitalize()
    else:
        # Just in case no category can be derived.
        category = 'Unbekannt'
    price_tag = dish.find(class_='neo-menu-single-price')
    if price_tag is not None:
        prices = price_regex.findall(price_tag.text)
    else:
        prices = {}
    canteen.addMeal(date, category, name, notes, prices, roles)
def test_unknown_date_format(self):
    """A string in no supported format must raise ValueError."""
    with pytest.raises(ValueError):
        extractDate('2050.11-24')
def test_d_mm_yyyy(self):
    """Single-digit day, zero-padded month, four-digit year."""
    assert extractDate('7.03.2013') == self.date
def test_dd_mm_yy(self):
    """Zero-padded day and month with a two-digit year."""
    assert extractDate('07.03.13') == self.date
def test_passing_of_date_objects(self):
    """A date object passes through unchanged (same identity)."""
    assert extractDate(self.date) is self.date
def test_yy_m_d(self):
    """Dash-separated two-digit year with unpadded month and day."""
    assert extractDate('13-3-7') == self.date
def test_yy_mm_dd(self):
    """Dash-separated two-digit year with zero-padded month and day."""
    assert extractDate('13-03-07') == self.date
def test_yyyy_m_dd(self):
    """Four-digit year, unpadded month, zero-padded day."""
    assert extractDate('2013-3-07') == self.date
def test_yyyy_mm_d(self):
    """Four-digit year, zero-padded month, unpadded day."""
    assert extractDate('2013-03-7') == self.date
def test_d_m_yy(self):
    """Dot-separated date with all components unpadded."""
    assert extractDate('7.3.13') == self.date
def test_unknown_month(self):
    """An unrecognised month name must raise ValueError."""
    with pytest.raises(ValueError):
        extractDate('07. Hans 2013')
def test_dd_DENAME_yyyy(self):
    """German month names (with/without umlaut and separating space)."""
    for variant in ('07 März 2013', '07 Maerz 2013',
                    '07März 2013', '07Maerz 2013'):
        assert extractDate(variant) == self.date
def test_ddDOT_ENNAME_yy(self):
    """English month name after a dotted day, two-digit year."""
    for variant in ('07. March 13', '07. march 13',
                    '07.March 13', '07.march 13'):
        assert extractDate(variant) == self.date
def test_dd_m_yyyy(self):
    """Zero-padded day, unpadded month, four-digit year."""
    assert extractDate('07.3.2013') == self.date
def test_dd_ENNAME_yy(self):
    """English month name without a dot, two-digit year."""
    for variant in ('07 March 13', '07 march 13',
                    '07March 13', '07march 13'):
        assert extractDate(variant) == self.date
def test_ddDOT_DENAME_yy(self):
    """German month name after a dotted day, two-digit year."""
    for variant in ('07. März 13', '07. Maerz 13',
                    '07.März 13', '07.Maerz 13'):
        assert extractDate(variant) == self.date