Exemplo n.º 1
0
    def get_menus(self, text, year, week_number):
        """Build one ``Menu`` per weekday from the plain text of a weekly menu plan.

        The text is reduced to the part between the first line mentioning "Montag" and the last ``***`` delimiter,
        split into day sections at the "<weekday>, <date>" headings, and every section is read as two fixed-width
        columns: the soup on the left (characters 0-35) and the main dishes on the right (characters 40-99).

        :param text: plain text of the weekly menu
        :param year: year handed on to ``self.get_date``
        :param week_number: week number handed on to ``self.get_date``
        :return: dict mapping the value returned by ``self.get_date`` to the ``Menu`` of that day, or ``None`` when
                 the text does not contain exactly seven day sections
        """
        weekday_names = ["Montag", "Dienstag", "Mittwoch", "Donnerstag", "Freitag", "Samstag", "Sonntag"]
        skipped_names = ["", "Feiertag"]

        all_lines = text.replace("Extraessen", "").splitlines()
        # everything in front of the first line that mentions Monday is headline material
        first_row = next((row for row, content in enumerate(all_lines) if "Montag" in content), len(all_lines))
        table_lines = all_lines[first_row:]

        # Zusatzstoffe and Allergene follow the last ***-delimiter and are irrelevant here
        delimiter_rows = [row for row, content in enumerate(table_lines) if "***" in content]
        if delimiter_rows:
            table_lines = table_lines[:delimiter_rows[-1]]

        table_text = "\n".join(table_lines).replace("*", "").strip()
        # the capturing group makes re.split return the weekday names as well; they are dropped right below
        sections = re.split(
            r"(Montag|Dienstag|Mittwoch|Donnerstag|Freitag|Samstag|Sonntag),\s\d{1,2}.\d{1,2}.\d{4}",
            table_text)
        day_sections = [section for section in sections if section != "" and section not in weekday_names]
        if len(day_sections) != 7:
            # the Mediziner Mensa is part of a hospital, so it is expected to serve food on every day
            return None

        menus = {}
        for key, section in zip(("mon", "tue", "wed", "thu", "fri", "sat", "sun"), day_sections):
            rows = unicodedata.normalize("NFKC", section).splitlines(True)
            soup_column = "".join(row[:36].strip() + "\n" for row in rows)
            mains_column = "".join(row[40:100].strip() + "\n" for row in rows)

            dishes = []
            # re-join words hyphenated across lines, then flatten the soup column to a single line
            soup_text = soup_column.replace("-\n", "").strip().replace("\n", " ")
            soup = self.parse_dish(soup_text)
            if soup.name not in skipped_names:
                dishes.append(soup)

            # https://regex101.com/r/MDFu1Z/1
            for fragment in re.split(r"(\n{2,}|(?<!mit)\n(?=[A-Z]))", mains_column):
                dish = self.parse_dish(fragment.strip().replace("\n", " "))
                dish.name = dish.name.strip()
                if dish.name not in skipped_names:
                    dishes.append(dish)

            date = self.get_date(year, week_number, self.weekday_positions[key])
            menu = Menu(date, dishes)
            # remove duplicates
            menu.remove_duplicates()
            menus[date] = menu

        return menus
Exemplo n.º 2
0
    def get_menus(self, text, year, week_number):
        """Parse the plain text of a weekly IPP bistro menu into one ``Menu`` per weekday.

        The table header (the line naming the weekdays) is located first. The column boundaries are then derived
        from the positions of the soup entries and of the closed-day keywords, and every line below the soup block
        is cut into the five weekday columns from which the dishes are extracted.

        :param text: plain text of the menu
        :param year: year of the menu; handed on to ``self.get_date`` and used in warnings
        :param week_number: week of the menu; handed on to ``self.get_date`` and used in warnings
        :return: dict mapping the value returned by ``self.get_date`` to the ``Menu`` of that day, or ``None``
                 (after emitting a warning) when the header, the soup line or the five columns cannot be found
        """
        menus = {}
        lines = text.splitlines()

        # remove headline etc.
        for header_index, line in enumerate(lines):
            # Find the line which is the header of the table and includes the day of week
            line_shrink = line.replace(" ", "").replace("\n", "").lower()
            # Note we do not include 'montag' und 'freitag' since they are also used in the line before the table
            # header to indicate the range of the week “Monday … until Friday _”
            if any(x in line_shrink for x in ('dienstag', 'mittwoch', 'donnerstag')):
                break
        else:
            # also reached for an empty text, so lines[0] must not be accessed unconditionally
            warn(
                "NotImplemented: IPP parsing failed. Menu text is not a weekly menu. First line: '{}'"
                .format(lines[0] if lines else ''))
            return None

        lines = lines[header_index:]
        weekdays = lines[0]

        # The column detection is done through the string "Tagessuppe siehe Aushang" which is at the beginning of
        # every column. However, due to center alignment the column do not begin at the 'T' character and broader
        # text in the column might be left of this character, which then gets truncated. But the gap between the 'T'
        # and the '€' character of the previous column¹ — the real beginning of the current column — is always three,
        # which will be subtracted here. Monday is the second column, so the value should never become negative
        # although it is handled here.
        # ¹or 'e' of "Internationale Küche" if it is the monday column

        # find lines which match the regex, remembering their index so that no value-based lookup is needed later
        # lines[1:] == exclude the weekday line which also can contain `Geschlossen`
        soup_lines_iter = ((index, x) for index, x in enumerate(lines[1:], start=1)
                           if self.split_days_regex.search(x))

        first_soup = next(soup_lines_iter, None)
        if first_soup is None:
            # without a soup line the columns cannot be located; fail like the other parsing errors do
            warn(
                "IPP PDF parsing of week {} in year {} failed. No soup line found for the column detection."
                .format(week_number, year))
            return None
        soup_line1_index, soup_line1 = first_soup
        soup_line2_index, soup_line2 = next(soup_lines_iter, (None, ''))

        # Sometimes on closed days, the keywords are written instead of the week of day instead of the soup line
        positions1 = [(max(a.start() - 3, 0), a.end())
                      for a in re.finditer(self.split_days_regex_closed, weekdays)]

        positions2 = [(max(a.start() - 3, 0), a.end())
                      for a in re.finditer(self.split_days_regex_soup_one_line, soup_line1)]
        # In the second line there is just 'Aushang' (two lines "Tagessuppe siehe Aushang" or
        # closed days ("Geschlossen", "Feiertag")
        positions3 = [(max(a.start() - 14, 0), a.end() + 3)
                      for a in re.finditer(self.split_days_regex_soup_two_line, soup_line2)]
        # closed days ("Geschlossen", "Feiertag", …) can be in first line and second line
        positions4 = [(max(a.start() - 3, 0), a.end())
                      for soup_line in (soup_line1, soup_line2)
                      for a in re.finditer(self.split_days_regex_closed, soup_line)]

        if positions3 and soup_line2_index is not None:  # Two lines "Tagessuppe siehe Aushang"
            soup_line_index = soup_line2_index
        else:
            soup_line_index = soup_line1_index

        positions = sorted(positions1 + positions2 + positions3 + positions4)

        if len(positions) != 5:
            warn(
                "IPP PDF parsing of week {} in year {} failed. Only {} of 5 columns detected."
                .format(week_number, year, len(positions)))
            return None

        # every column reaches from its own start to the start of the next one; the last one is open-ended
        keys = ("mon", "tue", "wed", "thu", "fri")
        starts = [start for start, _ in positions]
        ends = starts[1:] + [None]
        column_parts = {key: [] for key in keys}
        # it must be lines[3:] instead of lines[2:] or else the menus would start with "Preis ab 0,90€" (from the
        # soups) instead of the first menu, if there is a day where the bistro is closed.
        for line in lines[soup_line_index + 3:]:
            for key, start, end in zip(keys, starts, ends):
                column_parts[key].append(" " + line[start:end].replace("\n", " "))

        for key in keys:
            day_text = "".join(column_parts[key])
            # Appends `?€` to „Überraschungsmenü“ if it do not have a price. The second '€' is a separator for the
            # later split
            day_text = self.surprise_without_price_regex.sub(r"\g<1>?€ € \g<2>", day_text)
            # get rid of two-character umlauts (e.g. SMALL_LETTER_A+COMBINING_DIACRITICAL_MARK_UMLAUT)
            day_text = unicodedata.normalize("NFKC", day_text)
            # remove multi-whitespaces
            day_text = ' '.join(day_text.split())
            # get all dish including name and price
            dish_names_price = re.findall(self.dish_regex, day_text + ' ')
            # create dish types
            # since we have the same dish types every day we can use them if there are 4 dishes available
            if len(dish_names_price) == 4:
                dish_types = [
                    "Veggie", "Traditionelle Küche", "Internationale Küche",
                    "Specials"
                ]
            else:
                dish_types = ["Tagesgericht"] * len(dish_names_price)

            # create ingredients
            # all dishes have the same ingredients
            ingredients = Ingredients("ipp-bistro")
            ingredients.parse_ingredients("Mi,Gl,Sf,Sl,Ei,Se,4")
            # create list of Dish objects
            dishes = [
                Dish(dish_name.strip(),
                     Prices(Price(price.replace(',', '.').strip())),
                     ingredients.ingredient_set, dish_type)
                for (dish_name, price), dish_type in zip(dish_names_price, dish_types)
            ]
            date = self.get_date(year, week_number,
                                 self.weekday_positions[key])
            # create new Menu object and add it to dict
            menu = Menu(date, dishes)
            # remove duplicates
            menu.remove_duplicates()
            menus[date] = menu

        return menus
Exemplo n.º 3
0
    def get_menus(self, text, year, week_number):
        """Parse the plain text of a weekly FMI bistro menu into one ``Menu`` per open weekday.

        The text is a table laid out with whitespace. The header line naming the five weekdays gives the column
        positions; every line from the header on is cut into these columns and the dishes, prices and allergens are
        extracted from the text of each column. Days whose column contains "geschlossen" get no entry.

        :param text: plain text of the menu
        :param year: year of the menu; menus before 2018 are treated as the older layout with a weekly
                     Aktionsgericht. Also handed on to ``self.get_date``
        :param week_number: week of the menu; handed on to ``self.get_date``
        :return: dict mapping the value returned by ``self.get_date`` to the ``Menu`` of that day, or ``None``
                 (after emitting a warning) when the weekday header line is not found
        """
        # function-scope import: only the failure path right below needs it
        from warnings import warn

        menus = {}
        lines = text.splitlines()
        # remove headline etc.
        for header_index, line in enumerate(lines):
            if line.replace(" ", "").replace("\n", "").lower() == "montagdienstagmittwochdonnerstagfreitag":
                break
        else:
            # no header (or empty text): there is no line to take the column positions from
            warn(
                "FMI bistro parsing of week {} in year {} failed. Table header with the weekdays not found."
                .format(week_number, year))
            return None

        lines = lines[header_index:]
        # we assume that the weekdays are now all in the first line
        header = lines[0]
        columns = (("mon", "Montag"), ("tue", "Dienstag"), ("wed", "Mittwoch"), ("thu", "Donnerstag"),
                   ("fri", "Freitag"))
        starts = [header.find(name) for _, name in columns]
        # every column ends where the next one starts; the last one is open-ended
        ends = starts[1:] + [None]

        # The text is formatted as table using whitespaces. Hence, we need to get those parts of each line that refer
        #  to the respective week day
        column_parts = {key: [] for key, _ in columns}
        for line in lines:
            for (key, name), start, end in zip(columns, starts, ends):
                column_parts[key].append(" " + line[start:end].replace("\n", " ").replace(name, ""))
        lines_weekdays = {key: "".join(parts) for key, parts in column_parts.items()}

        # currently, up to 5 dishes are on the menu
        num_dishes = 5
        if year < 2018:
            # in older versions of the FMI Bistro menu, the Aktionsgericht was the same for the whole week
            num_dishes = 3
            line_aktion = [s for s in lines if "Aktion" in s]
            if len(line_aktion) == 1:
                line_aktion_pos = lines.index(line_aktion[0]) - 2
                aktionsgericht = ' '.join(
                    lines[line_aktion_pos:line_aktion_pos + 3])
                aktionsgericht = aktionsgericht \
                    .replace("Montag – Freitag", "") \
                    .replace("Tagessuppe täglich wechselndes Angebot", "") \
                    .replace("ab € 1,00", "") \
                    .replace("Aktion", "")
                num_dishes += aktionsgericht.count('€')
                for key in lines_weekdays:
                    lines_weekdays[key] = aktionsgericht + ", " + lines_weekdays[key]

        # Process menus for each day
        for key, day_text in lines_weekdays.items():
            # skip the day when the bistro is closed at that day
            if "geschlossen" in day_text.lower():
                continue

            # extract all allergens
            dish_allergens = []
            for x in re.findall(self.allergens_regex, day_text):
                if len(x) > 0:
                    dish_allergens.append(
                        re.sub(r"((Allergene:)|\s|\n)*", "", x[0]))
                else:
                    dish_allergens.append("")
            day_text = re.sub(self.allergens_regex, "", day_text)
            # get rid of two-character umlauts (e.g. SMALL_LETTER_A+COMBINING_DIACRITICAL_MARK_UMLAUT)
            day_text = unicodedata.normalize("NFKC", day_text)
            # remove multi-whitespaces
            day_text = ' '.join(day_text.split())

            # remove no allergens indicator
            day_text = day_text.replace("./.", "")
            # get all dish including name and price
            dish_names = re.findall(self.dish_regex, day_text)
            # get dish prices and convert them to float
            prices = [
                Prices(Price(float(price.replace("€", "").replace(",", ".").strip())))
                for price in re.findall(self.price_regex, ' '.join(dish_names))
            ]
            # remove price and commas from dish names
            dish_names = [
                re.sub(self.price_regex, "", dish).replace(",", "").strip()
                for dish in dish_names
            ]
            # create list of Dish objects
            dishes = []
            for dish_name, price, dish_allergen in zip(dish_names, prices, dish_allergens):
                # filter empty dishes
                if dish_name:
                    ingredients = Ingredients("fmi-bistro")
                    ingredients.parse_ingredients(dish_allergen)
                    dishes.append(
                        Dish(dish_name, price, ingredients.ingredient_set,
                             "Tagesgericht"))
            # only keep the first num_dishes entries as the following dishes are corrupt and not necessary
            dishes = dishes[:num_dishes]
            date = self.get_date(year, week_number,
                                 self.weekday_positions[key])
            # create new Menu object and add it to dict
            menu = Menu(date, dishes)
            # remove duplicates
            menu.remove_duplicates()
            menus[date] = menu

        return menus
Exemplo n.º 4
0
    def get_menus(self, text, year, week_number):
        """Parse the plain text of a weekly IPP bistro menu into one ``Menu`` per weekday.

        The header line naming the five weekdays is located first. The column boundaries are then derived from the
        positions of the soup entries and of the closed-day keywords, and every line below the soup block is cut
        into the five weekday columns from which dish names and prices are extracted.

        :param text: plain text of the menu
        :param year: year of the menu; handed on to ``self.get_date`` and used in warnings
        :param week_number: week of the menu; handed on to ``self.get_date`` and used in warnings
        :return: dict mapping the value returned by ``self.get_date`` to the ``Menu`` of that day, or ``None``
                 (after emitting a warning) when the header, the soup line or the five columns cannot be found
        """
        menus = {}
        lines = text.splitlines()
        # remove headline etc.
        for header_index, line in enumerate(lines):
            if line.replace(" ", "").replace("\n", "").lower() == "montagdienstagmittwochdonnerstagfreitag":
                break
        else:
            # no header (or empty text): nothing below could be parsed, so fail like the column check does
            warn(
                "IPP PDF parsing of week {} in year {} failed. Table header with the weekdays not found."
                .format(week_number, year))
            return None

        lines = lines[header_index:]

        # The column detection is done through the string "Tagessuppe siehe Aushang" which is at the beginning of
        # every column. However, due to center alignment the column do not begin at the 'T' character and broader
        # text in the column might be left of this character, which then gets truncated. But the gap between the 'T'
        # and the '€' character of the previous column¹ — the real beginning of the current column — is always three,
        # which will be subtracted here. Monday is the second column, so the value should never become negative
        # although it is handled here.
        # ¹or 'e' of "Internationale Küche" if it is the monday column

        # find lines which match the regex, remembering their index so that no value-based lookup is needed later
        soup_lines_iter = ((index, x) for index, x in enumerate(lines)
                           if self.split_days_regex.search(x))

        first_soup = next(soup_lines_iter, None)
        if first_soup is None:
            # without a soup line the columns cannot be located
            warn(
                "IPP PDF parsing of week {} in year {} failed. No soup line found for the column detection."
                .format(week_number, year))
            return None
        soup_line1_index, soup_line1 = first_soup
        soup_line2_index, soup_line2 = next(soup_lines_iter, (None, ''))

        positions1 = [(max(a.start() - 3, 0), a.end())
                      for a in re.finditer(self.split_days_regex_soup_one_line, soup_line1)]
        # In the second line there is just 'Aushang' (two lines "Tagessuppe siehe Aushang" or
        # closed days ("Geschlossen", "Feiertag")
        positions2 = [(max(a.start() - 14, 0), a.end() + 3)
                      for a in re.finditer(self.split_days_regex_soup_two_line, soup_line2)]
        positions3 = [(max(a.start() - 3, 0), a.end())
                      for a in re.finditer(self.split_days_regex_closed, soup_line2)]

        if positions2 and soup_line2_index is not None:  # Two lines "Tagessuppe siehe Aushang"
            soup_line_index = soup_line2_index
        else:
            soup_line_index = soup_line1_index

        positions = sorted(positions1 + positions2 + positions3)

        if len(positions) != 5:
            warn(
                "IPP PDF parsing of week {} in year {} failed. Only {} of 5 columns detected."
                .format(week_number, year, len(positions)))
            return None

        # every column reaches from its own start to the start of the next one; the last one is open-ended
        keys = ("mon", "tue", "wed", "thu", "fri")
        starts = [start for start, _ in positions]
        ends = starts[1:] + [None]
        column_parts = {key: [] for key in keys}
        # it must be lines[3:] instead of lines[2:] or else the menus would start with "Preis ab 0,90€" (from the
        # soups) instead of the first menu, if there is a day where the bistro is closed.
        for line in lines[soup_line_index + 3:]:
            for key, start, end in zip(keys, starts, ends):
                column_parts[key].append(" " + line[start:end].replace("\n", " "))

        for key in keys:
            # get rid of two-character umlauts (e.g. SMALL_LETTER_A+COMBINING_DIACRITICAL_MARK_UMLAUT)
            day_text = unicodedata.normalize("NFKC", "".join(column_parts[key]))
            # remove multi-whitespaces
            day_text = ' '.join(day_text.split())
            # get all dish including name and price
            dish_names = re.findall(self.dish_regex, day_text + " ")
            # get dish prices and convert them to float
            prices = [
                float(price.replace("€", "").replace(",", ".").strip())
                for price in re.findall(self.price_regex, ' '.join(dish_names))
            ]
            # remove price from dish names
            dish_names = [
                re.sub(self.price_regex, "", dish).strip()
                for dish in dish_names
            ]
            # create list of Dish objects
            dishes = [
                Dish(dish_name, price)
                for dish_name, price in zip(dish_names, prices)
            ]
            date = self.get_date(year, week_number,
                                 self.weekday_positions[key])
            # create new Menu object and add it to dict
            menu = Menu(date, dishes)
            # remove duplicates
            menu.remove_duplicates()
            menus[date] = menu

        return menus