def parse_dish(self, dish_str):
    """Turn one raw dish string into a ``Dish`` of type "Tagesgericht".

    Ingredient markers (matched by ``self.ingredients_regex``) and the price
    (matched by ``self.price_regex``) are extracted from ``dish_str`` and
    removed from it; what remains is used as the dish name.

    :param dish_str: raw text of a single dish
    :return: ``Dish`` built from the cleaned name, the parsed price (an empty
        ``Prices()`` if no price matched) and the collected ingredient set
    """
    # ingredients
    dish_ingredients = Ingredients("mediziner-mensa")
    # Substituting matches can expose new ones, so repeat until the regex no
    # longer matches anything.
    while True:
        found = re.findall(self.ingredients_regex, dish_str)
        if not found:
            break
        for groups in found:
            if groups:
                dish_ingredients.parse_ingredients(groups[0])
        dish_str = re.sub(self.ingredients_regex, " ", dish_str)

    # collapse whitespace runs and repair commas left dangling by the removal
    dish_str = re.sub(r"\s+", " ", dish_str).strip()
    dish_str = dish_str.replace(" , ", ", ")

    # price: if several prices match, the last one wins
    dish_price = Prices()
    for groups in re.findall(self.price_regex, dish_str):
        if groups:
            amount = groups[0].replace("€", "").replace(",", ".").strip()
            dish_price = Prices(Price(float(amount)))
    dish_str = re.sub(self.price_regex, "", dish_str)

    return Dish(dish_str, dish_price, dish_ingredients.ingredient_set,
                "Tagesgericht")
def __parse_dishes(menu_html, location):
    """Build the list of ``Dish`` objects contained in one menu HTML tree.

    :param menu_html: parsed HTML element supporting ``xpath`` queries
    :param location: location identifier handed to ``Ingredients``
    :return: list of ``Dish`` objects in document order
    """
    # obtain the names of all dishes in the passed menu
    dish_names = [
        dish.rstrip() for dish in menu_html.xpath(
            "//p[@class='js-schedule-dish-description']/text()")
    ]
    # make duplicates unique by adding (2), (3) etc. to the names
    dish_names = util.make_duplicates_unique(dish_names)
    # obtain the types of the dishes (e.g. 'Tagesgericht 1')
    dish_types = [
        node.text if node.text else ''
        for node in menu_html.xpath("//span[@class='stwm-artname']")
    ]
    # obtain all ingredient markers
    dish_markers_additional = menu_html.xpath(
        "//span[contains(@class, 'c-schedule__marker--additional')]/@data-essen"
    )
    dish_markers_allergen = menu_html.xpath(
        "//span[contains(@class, 'c-schedule__marker--allergen')]/@data-essen"
    )
    dish_markers_type = menu_html.xpath(
        "//span[contains(@class, 'c-schedule__marker--type')]/@data-essen")

    # map each dish name to its (type, additional, allergen, type-marker) tuple
    dishes_dict = {}
    dishes_tup = zip(dish_names, dish_types, dish_markers_additional,
                     dish_markers_allergen, dish_markers_type)
    for dish_name, dish_type, marker_additional, marker_allergen, marker_type in dishes_tup:
        dishes_dict[dish_name] = (dish_type, marker_additional,
                                  marker_allergen, marker_type)

    # create Dish objects with correct prices; if no price is known for the
    # dish type, "N/A" is used instead
    dishes = []
    for name, (dish_type, marker_additional, marker_allergen,
               marker_type) in dishes_dict.items():
        # NOTE: the check must look at the dish type, not at the whole tuple;
        # a non-empty tuple is always truthy, which made this branch
        # unreachable before.
        if not dish_type and dishes:
            # Some dishes are multi-row, i.e. for the same type the dish is
            # written in multiple rows. From the second row on the type is
            # empty. In that case we reuse the price, ingredients and type of
            # the previous dish.
            previous = dishes[-1]
            dishes.append(
                Dish(name, previous.price, previous.ingredients,
                     previous.dish_type))
        else:
            dish_ingredients = Ingredients(location)
            dish_ingredients.parse_ingredients(marker_additional)
            dish_ingredients.parse_ingredients(marker_allergen)
            dish_ingredients.parse_ingredients(marker_type)
            dishes.append(
                Dish(name,
                     StudentenwerkMenuParser.prices.get(dish_type, "N/A"),
                     dish_ingredients.ingredient_set, dish_type))

    return dishes
def get_menus(self, text, year, week_number):
    """Parse the text dump of a weekly IPP bistro menu into per-day menus.

    The text is a whitespace-aligned table with one column per weekday
    (Monday to Friday). Column boundaries are detected from the soup line(s)
    and from "closed" keywords, then every following line is cut into the
    five columns and the dishes are extracted per day.

    :param text: plain text of the weekly menu
    :param year: year of the menu, passed on to ``self.get_date``
    :param week_number: calendar week, passed on to ``self.get_date``
    :return: dict mapping each day's date to its ``Menu``; ``None`` (after a
        warning) if the text is not a weekly menu or the columns could not
        be detected
    """
    menus = {}
    lines = text.splitlines()

    # Remove headline etc.: find the table header line that contains the
    # weekdays. 'montag' and 'freitag' are deliberately not used, since they
    # also appear in the line before the table header that states the range
    # of the week ("Monday ... until Friday").
    header_index = None
    for index, line in enumerate(lines):
        line_shrink = line.replace(" ", "").replace("\n", "").lower()
        if any(day in line_shrink
               for day in ('dienstag', 'mittwoch', 'donnerstag')):
            header_index = index
            break
    if header_index is None:
        # ``lines`` may be empty here, so guard the access to the first line
        warn(
            "NotImplemented: IPP parsing failed. Menu text is not a weekly menu. First line: '{}'"
            .format(lines[0] if lines else ""))
        return None

    lines = lines[header_index:]
    weekdays = lines[0]

    # The column detection is done through the string "Tagessuppe siehe
    # Aushang", which is at the beginning of every column. Due to center
    # alignment the column does not begin at the 'T' character, and broader
    # text in the column might start left of it and would get truncated. The
    # gap between the 'T' and the '€' of the previous column (or the 'e' of
    # "Internationale Küche" for the Monday column), i.e. the real beginning
    # of the current column, is always three, which is subtracted here.
    # Monday is the second column, so the value should never become negative,
    # but it is clamped to 0 anyway.

    # lines[1:] excludes the weekday line, which can also contain
    # "Geschlossen"
    soup_lines_iter = (x for x in lines[1:]
                       if self.split_days_regex.search(x))
    soup_line1 = next(soup_lines_iter, None)
    if soup_line1 is None:
        # Without a soup line there is nothing to derive the columns from;
        # fail like the other detection errors instead of leaking
        # StopIteration.
        warn(
            "IPP PDF parsing of week {} in year {} failed. No soup line found."
            .format(week_number, year))
        return None
    soup_line2 = next(soup_lines_iter, '')

    def column_spans(regex, line, left, right=0):
        # (start, end) of every match, widened by ``left``/``right`` chars
        return [(max(match.start() - left, 0), match.end() + right)
                for match in re.finditer(regex, line)]

    # Sometimes on closed days the keywords are written in the weekday line
    # instead of the soup line
    positions1 = column_spans(self.split_days_regex_closed, weekdays, 3)
    positions2 = column_spans(self.split_days_regex_soup_one_line,
                              soup_line1, 3)
    # In the second line there is just 'Aushang' (two-line "Tagessuppe siehe
    # Aushang") or closed days ("Geschlossen", "Feiertag")
    positions3 = column_spans(self.split_days_regex_soup_two_line,
                              soup_line2, 14, 3)
    # closed days ("Geschlossen", "Feiertag", ...) can be in the first and
    # in the second line
    positions4 = (
        column_spans(self.split_days_regex_closed, soup_line1, 3) +
        column_spans(self.split_days_regex_closed, soup_line2, 3))

    if positions3:  # two-line "Tagessuppe siehe Aushang"
        soup_line_index = lines.index(soup_line2)
    else:
        soup_line_index = lines.index(soup_line1)

    positions = sorted(positions1 + positions2 + positions3 + positions4)
    if len(positions) != 5:
        warn(
            "IPP PDF parsing of week {} in year {} failed. Only {} of 5 columns detected."
            .format(week_number, year, len(positions)))
        return None

    # each column runs from its own start to the start of the next column;
    # the last column (Friday) runs to the end of the line
    day_keys = ("mon", "tue", "wed", "thu", "fri")
    starts = [position[0] for position in positions]
    ends = starts[1:] + [None]
    lines_weekdays = {key: "" for key in day_keys}

    # It must be +3 instead of +2, or else the menus would start with
    # "Preis ab 0,90€" (from the soups) instead of the first menu if there is
    # a day where the bistro is closed.
    for line in lines[soup_line_index + 3:]:
        for key, start, end in zip(day_keys, starts, ends):
            lines_weekdays[key] += " " + line[start:end].replace("\n", " ")

    for key, day_text in lines_weekdays.items():
        # Append `?€` to "Überraschungsmenü" if it does not have a price. The
        # second '€' is a separator for the later split.
        day_text = self.surprise_without_price_regex.sub(
            r"\g<1>?€ € \g<2>", day_text)
        # get rid of two-character umlauts
        # (e.g. SMALL_LETTER_A+COMBINING_DIACRITICAL_MARK_UMLAUT)
        day_text = unicodedata.normalize("NFKC", day_text)
        # remove multi-whitespaces
        day_text = ' '.join(day_text.split())

        # get all dishes including name and price
        dish_names_price = re.findall(self.dish_regex, day_text + ' ')

        # The dish types are the same every day, so they can be assigned by
        # position if exactly 4 dishes are available.
        if len(dish_names_price) == 4:
            dish_types = [
                "Veggie", "Traditionelle Küche", "Internationale Küche",
                "Specials"
            ]
        else:
            dish_types = ["Tagesgericht"] * len(dish_names_price)

        # all dishes get the same ingredients
        ingredients = Ingredients("ipp-bistro")
        ingredients.parse_ingredients("Mi,Gl,Sf,Sl,Ei,Se,4")

        # create list of Dish objects
        dishes = [
            Dish(dish_name.strip(),
                 Prices(Price(price.replace(',', '.').strip())),
                 ingredients.ingredient_set, dish_type)
            for (dish_name, price), dish_type in zip(dish_names_price,
                                                     dish_types)
        ]

        date = self.get_date(year, week_number, self.weekday_positions[key])
        # create new Menu object, drop duplicate dishes and add it to dict
        menu = Menu(date, dishes)
        menu.remove_duplicates()
        menus[date] = menu

    return menus
def get_menus(self, text, year, week_number):
    """Parse the text dump of a weekly FMI bistro menu into per-day menus.

    The text is a table formatted with whitespace whose header line consists
    of the five weekday names "Montag" to "Freitag". Each following line is
    cut into the weekday columns and the dishes are extracted per day.

    :param text: plain text of the weekly menu
    :param year: year of the menu; for years before 2018 a week-wide
        "Aktion" dish is prepended to every day
    :param week_number: calendar week, passed on to ``self.get_date``
    :return: dict mapping each open day's date to its ``Menu``; days marked
        "geschlossen" are skipped. An empty dict (after a warning) if the
        weekday header line is missing.
    """
    menus = {}
    lines = text.splitlines()

    # remove headline etc.: locate the weekday header line
    header_index = None
    for index, line in enumerate(lines):
        if line.replace(" ", "").replace(
                "\n", "").lower() == "montagdienstagmittwochdonnerstagfreitag":
            header_index = index
            break
    if header_index is None:
        # Without the header there are no columns to split; indexing the
        # (empty) remainder would raise IndexError.
        warn(
            "FMI Bistro parsing of week {} in year {} failed. Weekday header not found."
            .format(week_number, year))
        return menus
    lines = lines[header_index:]

    # the weekdays are now all in the first line
    header = lines[0]
    day_names = (("mon", "Montag"), ("tue", "Dienstag"), ("wed", "Mittwoch"),
                 ("thu", "Donnerstag"), ("fri", "Freitag"))
    starts = [header.find(day_name) for _, day_name in day_names]
    # every column ends where the next one starts; Friday runs to line end
    ends = starts[1:] + [None]

    # The text is formatted as table using whitespaces. Hence, we need to get
    # those parts of each line that refer to the respective week day.
    lines_weekdays = {key: "" for key, _ in day_names}
    for line in lines:
        for (key, day_name), start, end in zip(day_names, starts, ends):
            lines_weekdays[key] += " " + line[start:end].replace(
                "\n", " ").replace(day_name, "")

    # currently, up to 5 dishes are on the menu
    num_dishes = 5
    if year < 2018:
        # in older versions of the FMI Bistro menu, the Aktionsgericht was
        # the same for the whole week
        num_dishes = 3
        line_aktion = [s for s in lines if "Aktion" in s]
        if len(line_aktion) == 1:
            # the dish text spans the "Aktion" line and the two lines above
            line_aktion_pos = lines.index(line_aktion[0]) - 2
            aktionsgericht = ' '.join(
                lines[line_aktion_pos:line_aktion_pos + 3])
            aktionsgericht = aktionsgericht \
                .replace("Montag – Freitag", "") \
                .replace("Tagessuppe täglich wechselndes Angebot", "") \
                .replace("ab € 1,00", "") \
                .replace("Aktion", "")
            num_dishes += aktionsgericht.count('€')
            for key in lines_weekdays:
                lines_weekdays[key] = aktionsgericht + ", " + lines_weekdays[key]

    # Process menus for each day
    for key, day_text in lines_weekdays.items():
        # stop parsing day when bistro is closed at that day
        if "geschlossen" in day_text.lower():
            continue

        # extract all allergens
        dish_allergens = []
        for groups in re.findall(self.allergens_regex, day_text):
            if len(groups) > 0:
                dish_allergens.append(
                    re.sub(r"((Allergene:)|\s|\n)*", "", groups[0]))
            else:
                dish_allergens.append("")
        day_text = re.sub(self.allergens_regex, "", day_text)
        # get rid of two-character umlauts
        # (e.g. SMALL_LETTER_A+COMBINING_DIACRITICAL_MARK_UMLAUT)
        day_text = unicodedata.normalize("NFKC", day_text)
        # remove multi-whitespaces
        day_text = ' '.join(day_text.split())
        # remove no allergens indicator
        day_text = day_text.replace("./.", "")

        # get all dishes including name and price
        dish_names = re.findall(self.dish_regex, day_text)
        # get dish prices and convert them to float
        prices = [
            Prices(
                Price(float(
                    price.replace("€", "").replace(",", ".").strip())))
            for price in re.findall(self.price_regex, ' '.join(dish_names))
        ]
        # remove price and commas from dish names
        dish_names = [
            re.sub(self.price_regex, "", dish).replace(",", "").strip()
            for dish in dish_names
        ]

        # create list of Dish objects, filtering out empty dish names
        dishes = []
        for dish_name, price, dish_allergen in zip(dish_names, prices,
                                                   dish_allergens):
            if dish_name:
                ingredients = Ingredients("fmi-bistro")
                ingredients.parse_ingredients(dish_allergen)
                dishes.append(
                    Dish(dish_name, price, ingredients.ingredient_set,
                         "Tagesgericht"))
        # only take the first ``num_dishes``, as the following dishes are
        # corrupt and not necessary
        dishes = dishes[:num_dishes]

        date = self.get_date(year, week_number, self.weekday_positions[key])
        # create new Menu object, drop duplicate dishes and add it to dict
        menu = Menu(date, dishes)
        menu.remove_duplicates()
        menus[date] = menu

    return menus