Exemplo n.º 1
0
def parse_url(url, today=False):
    """Scrape the legend, price and daily menu pages into a canteen feed.

    Args:
        url: Format string for the daily menu page; it is filled in with a
            ``datetime.date`` through ``url.format(date)``.
        today: When true, stop after the first day whose page could be
            fetched instead of walking on through the following days.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected meals.

    Raises:
        HTTPError: When a daily page fails with a status other than 404.
            Errors from the legend and price pages are not caught either.
    """
    canteen = LazyBuilder()

    # legend: two fixed entries plus everything listed on the additives page
    legend = {'f': 'fleischloses Gericht', 'v': 'veganes Gericht'}
    document = parse(urlopen(base + '/speiseplan/zusatzstoffe-de.html').read())
    for td in document.find_all('td', 'beschreibung'):
        legend[td.previous_sibling.previous_sibling.text] = td.text

    # prices are stored on a separate page, keyed by the row header text
    document = parse(urlopen(base + '/unsere-preise/').read())
    prices = {}
    for tr in document.find('table', 'essenspreise').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        amounts = tr.find_all('td', 'betrag')
        if len(amounts) < 3:
            continue
        classes = meal.attrs.get('class', [])
        if 'titel' in classes or 'zeilentitel' in classes:
            continue  # heading rows carry no prices
        meal = meal.text.strip()
        prices[meal] = {}
        # the first three amount cells map to student, employee and other
        for role, amount in zip(('student', 'employee', 'other'), amounts):
            price_search = price_regex.search(amount.text)
            if price_search:
                prices[meal][role] = price_search.group('price')

    one_day = datetime.timedelta(days=1)
    errorCount = 0
    date = datetime.date.today()
    # give up after seven consecutive days without a page (HTTP 404)
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read())
        except HTTPError as e:
            if e.code != 404:
                raise
            errorCount += 1
            date += one_day
            continue
        errorCount = 0

        # each daily page may add further legend entries
        for tr in document.find('table', 'zusatzstoffe').find_all('tr'):
            cells = tr.find_all('td')
            identifier = cells[0].text.replace('(', '').replace(')', '')
            legend[identifier] = cells[1].text.strip()
        canteen.setLegendData(legend)

        mensa_data = document.find('table', 'menu')
        category = None
        for menu_tr in mensa_data.find_all('tr'):
            if menu_tr.find('td', 'headline'):
                continue
            # rows with an empty category cell reuse the previous category
            category_text = menu_tr.find('td', 'gericht').text
            if category_text:
                category = category_text
            data = menu_tr.find('td', 'beschreibung')
            name = data.find('span').text.strip()
            notes = [span['title'] for span in data.find_all('span', title=True)]
            canteen.addMeal(
                date, category, name, notes,
                prices.get(category.replace('Aktionsessen', 'Bio-/Aktionsgericht'), {})
            )

        date += one_day
        if today:
            break
    return canteen.toXMLFeed()
Exemplo n.º 2
0
def parse_url(url, today=False):
    """Parse a page of per-day menu blocks into a canteen feed.

    Args:
        url: Address of the menu page to download.
        today: Not used by this parser.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected days.
    """
    canteen = LazyBuilder()
    document = parse(urlopen(url).read())
    day_nodes = document.find_all('div', 'day') + document.find_all(
        'article', attrs={'data-day': True})
    for day_div in day_nodes:
        # parse date, warning: calculate year number needed
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date')
            continue
        # read the clock once so year and month come from the same instant
        now = datetime.datetime.now()
        year = now.year
        if now.month > int(date_test.group('month')):
            year += 1  # date from next year
        date = "{}-{}-{}".format(
            year,
            date_test.group('month'),
            date_test.group('day'),
        )

        day_classes = day_div.attrs.get('class', [])
        if 'nodata' in day_classes or 'GESCHLOSSEN' in day_div.text:
            canteen.setDayClosed(date)
            continue

        closed_candidate = False
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            if 'geschlossen' in name:
                # only counts as closed if the day ends up without any meal
                closed_candidate = True
                continue
            category = meal_article.find('div')['title']
            notes = [
                v['title'] for v in meal_article.find_all('div', 'theicon')
                if v['title']
            ]
            additive_div = meal_article.find('div', 'additive')
            if additive_div:
                notes += [v[0] for v in extra_regex.findall(additive_div.text)]

            price_div = meal_article.find('div', 'price')
            if price_div is None:
                canteen.addMeal(date, category, name, notes)
                continue

            prices = {}
            for attr, role in (('default', 'student'), ('bed', 'employee'),
                               ('guest', 'other')):
                # a missing data-* attribute is treated like an unparsable one
                price = price_regex.search(price_div.get('data-' + attr, ''))
                if price:
                    prices[role] = price.group('price')
                elif attr == 'default':
                    # without a default price no price is reported at all
                    prices = {}
                    break
            canteen.addMeal(date, category, name, notes, prices)

        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
Exemplo n.º 3
0
def parse_url(url, today=False):
    """Parse a page of per-day menu tables into a canteen feed.

    Args:
        url: Address of the menu page to download.
        today: Not used by this parser.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected days.
    """
    document = parse(urlopen(url).read(), 'lxml')

    # numeric legend taken from the first "legende" box, if there is one
    extraLegend = {}
    legends = document.find_all('div', {'class': 'legende'})
    if legends:
        for entry in reversed(legend_regex.findall(legends[0].text)):
            extraLegend[int(entry[0])] = entry[1]

    canteen = LazyBuilder()
    for day_td in document.find_all('td', text=day_regex):
        date = day_regex.search(day_td.string).group('date')

        # the meals of a day live in the closest enclosing table
        table = next(
            (parent for parent in day_td.parents if parent.name == 'table'),
            None)
        if not table:
            continue

        for tr in table.tbody.find_all('tr'):
            if 'geschlossen' in tr.text or 'Feiertage' in tr.text:
                period = day_range_regex.search(tr.text)
                if period:
                    # a closure period: mark every day from start to end
                    current = datetime.datetime.strptime(
                        period.group('from'), '%d.%m.%Y')
                    last = datetime.datetime.strptime(
                        period.group('to'), '%d.%m.%Y')
                    while current <= last:
                        canteen.setDayClosed(current.strftime('%Y-%m-%d'))
                        current += datetime.date.resolution
                else:
                    canteen.setDayClosed(date)
                continue

            if len(tr) != 2:
                continue  # no meal

            strings = list(tr.contents[0].strings)
            name = strings[0]

            # prices: the last string of the first cell, separated by "|"
            prices = strings[-1].split('|')
            if any(part.strip() == '-' for part in prices):
                prices = {}

            # notes: icon descriptions first, then matching legend entries
            notes = [
                img['alt'].replace('Symbol', '').strip()
                for img in tr.contents[1].find_all('img')
            ]
            for extra in set(map(int, extra_regex.findall(tr.text))):
                if extra in extraLegend:
                    notes.append(extraLegend[extra])

            canteen.addMeal(date, 'Hauptgerichte', name, notes, prices,
                            roles if prices else None)
    return canteen.toXMLFeed()
Exemplo n.º 4
0
def parse_url(url, today=False):
    """Turn the JSON menu served at ``url`` into a canteen feed.

    Args:
        url: Address returning a JSON document with a ``days`` list.
        today: When true, only the entry dated today is processed.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected meals.
    """
    base_data = load_base_data()

    canteen = LazyBuilder()
    with urlopen(url) as response:
        data = json.loads(response.read().decode())

    for day in data['days']:
        date = datetime.datetime.strptime(day['date'], UTC_DATE_STRING).date()
        if today and date != datetime.date.today():
            continue

        for counter in day['counters']:
            counter_name = counter['displayName']
            counter_description = counter['description']
            counter_hours = counter.get('openingHours')

            for meal in counter['meals']:
                if 'knownMealId' in meal:
                    # The id is meant for recognizing recurring meals (for
                    # example to mark favorites). It is only logged here;
                    # the meal is handled like any other.
                    print('knownMealId: %s' % meal['knownMealId'],
                          file=sys.stderr)

                meal_name = meal['name']
                if 'category' in meal:
                    meal_name = '%s: %s' % (meal['category'], meal_name)

                # The counter description is typically the location
                # (but not required to be by the API specification).
                meal_notes = build_location(counter_description)
                meal_notes = meal_notes + build_hours(counter_hours)
                meal_notes = meal_notes + build_notes(
                    base_data, meal['notices'], meal['components'])

                # keep only prices of known roles, renamed via the base data
                meal_prices = {}
                if 'prices' in meal:
                    prices = meal['prices']
                    meal_prices = {
                        base_data['roles'][role]: prices[role]
                        for role in prices if role in ROLES
                    }

                if 'pricingNotice' in meal:
                    meal_notes.append(meal['pricingNotice'])

                canteen.addMeal(date, counter_name, meal_name, meal_notes,
                                meal_prices)

    return canteen.toXMLFeed()
Exemplo n.º 5
0
def parse_url(url, today=False):
    """Download the JSON menu at ``url`` and build a canteen feed from it.

    Args:
        url: Address returning a JSON document with a ``days`` list.
        today: When true, days other than today are skipped.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected meals.
    """
    base_data = load_base_data()

    canteen = LazyBuilder()
    with urlopen(url) as response:
        payload = response.read().decode()
    data = json.loads(payload)

    for day in data['days']:
        date = datetime.datetime.strptime(day['date'], UTC_DATE_STRING).date()

        if today and (datetime.date.today() != date):
            continue

        for counter in day['counters']:
            counter_name = counter['displayName']
            counter_description = counter['description']
            counter_hours = counter.get('openingHours')

            for meal in counter['meals']:
                # A known meal id would allow recognizing recurring meals
                # (e.g. for favorites); this parser only reports it on
                # stderr and otherwise treats the meal like any other.
                if 'knownMealId' in meal:
                    print('knownMealId: %s' % meal['knownMealId'], file=sys.stderr)

                if 'category' in meal:
                    meal_name = '%s: %s' % (meal['category'], meal['name'])
                else:
                    meal_name = meal['name']

                # The description is typically the location
                # (but not required to be by the API specification).
                location_notes = build_location(counter_description)
                hour_notes = build_hours(counter_hours)
                content_notes = build_notes(
                    base_data, meal['notices'], meal['components'])
                meal_notes = location_notes + hour_notes + content_notes

                meal_prices = {}
                if 'prices' in meal:
                    prices = meal['prices']
                    for role in prices:
                        if role not in ROLES:
                            continue
                        meal_prices[base_data['roles'][role]] = prices[role]

                if 'pricingNotice' in meal:
                    meal_notes.append(meal['pricingNotice'])

                canteen.addMeal(
                    date, counter_name, meal_name, meal_notes, meal_prices)

    return canteen.toXMLFeed()
Exemplo n.º 6
0
def parsePlan(url, internalMensaId, today):
    """Follow the day-by-day menu pages and collect one canteen's meals.

    Args:
        url: Address of the first page to download.
        internalMensaId: ``id`` of the element that marks the canteen's
            section on the page.
        today: When truthy, only the first page is processed.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected days.
    """
    canteen = LazyBuilder()
    # label shown next to a price -> role name used in the feed
    price_roles = {'Stud.:': 'student', 'Bed.:': 'employee', 'Gast:': 'other'}

    while url is not None:
        dom = BeautifulSoup(urlopen(url).read(), 'lxml')
        date = dom.select('#mensa_date > p')[0].contents[0]
        menuDefinition = dom.find(id=internalMensaId)
        menuDescription = menuDefinition.parent.find('dd')
        tables = menuDescription.select('table')
        legend = buildLegend({}, str(dom), regex=r'<strong>(?P<name>\w+)\s*</strong>\s*-\s*(?P<value>[\w\s)(]+)')

        if len(tables) == 1:
            for row in tables[0].find_all('tr'):
                # select() returns an empty list for rows without a name
                # cell, so test for emptiness before indexing
                menuNameElement = row.select('td[class="mensa_col_55"] > b')
                if not menuNameElement or not menuNameElement[0].contents:
                    continue
                menuName = menuNameElement[0].contents[0]
                category = 'Gericht'

                # get notes, expanding abbreviations found in the legend
                notes = []
                notesElement = row.select('td[class="mensa_col_55"] > span')
                if notesElement:
                    notes = [legend.get(n, n) for n in notesElement[0].text.split(' ') if n]

                # get prices
                prices = {}
                for td in row.select('td[class="mensa_col_15"]'):
                    priceElement = td.find('b')
                    groupElement = td.find('span')
                    if priceElement is None or groupElement is None:
                        continue
                    if not groupElement.contents or not priceElement.contents:
                        continue
                    role = price_roles.get(str(groupElement.contents[0]))
                    if role is not None:
                        prices[role] = str(priceElement.contents[0])

                canteen.addMeal(date, category, menuName, notes, prices)
        else:
            # anything other than exactly one table is treated as closed
            canteen.setDayClosed(date)

        # check for further pages
        nextPageLink = dom.find(id='next_day_link')
        if nextPageLink is None or today:
            url = None
        else:
            url = 'https://www.studentenwerk-rostock.de/' + nextPageLink['href']
    return canteen.toXMLFeed()
Exemplo n.º 7
0
def parse_url(url, today=False):
    """Parse a page of ``data-day`` menu blocks into a canteen feed.

    Args:
        url: Address of the menu page to download.
        today: Not used by this parser.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected days.
    """
    canteen = LazyBuilder()
    document = parse(urlopen(url).read(), 'lxml')

    for day_div in document.find_all('div', attrs={'data-day': True}):
        # parse date, warning: calculate year number needed
        raw_day = day_div['data-day']
        date_test = day_regex.search(raw_day)
        if not date_test:
            print('Error: unable to parse date "{}"'.format(raw_day))
            continue
        # read the clock once so year and month come from the same instant
        now = datetime.datetime.now()
        year = now.year
        if now.month > int(date_test.group('month')):
            year += 1  # date from next year
        date = '{}-{}-{}'.format(year, date_test.group('month'),
                                 date_test.group('day'))

        # a holiday marker only closes the day if no meal was found for it
        closed_candidate = day_div.find('div', 'holiday') is not None

        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue

            category = meal_article.find('div', 'icon')['title']

            notes = []
            additives = meal_article.find('div', 'additnr')
            if additives:
                notes += [
                    additive.text for additive in additives.find_all('li')
                ]
            # icon titles already present among the additive notes are skipped
            notes += [
                v['title'] for v in meal_article.find_all('div', 'theicon')
                if v['title'] and v['title'] not in notes
            ]

            prices = {}
            price_div = meal_article.find('div', 'price')
            if price_div:
                for attr_suffix, role in price_map.items():
                    # a missing data-* attribute simply yields no price
                    price = price_div.get('data-' + attr_suffix)
                    if price:
                        prices[role] = price
            canteen.addMeal(date, category, name, notes, prices)

        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)

    return canteen.toXMLFeed()
Exemplo n.º 8
0
def parse_url(url, today=False):
    """Parse a page of per-day menu tables into a canteen feed.

    Args:
        url: Address of the menu page to download.
        today: Not used by this parser.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected days.
    """
    content = urlopen(url).read()
    document = parse(content)

    # numeric legend taken from the first "legende" box, if there is one
    legends = document.find_all('div', {'class': 'legende'})
    if legends:
        extraLegend = {int(v[0]): v[1] for v in reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}

    canteen = LazyBuilder()
    for day_td in document.find_all('td', text=day_regex):
        date = day_regex.search(day_td.string).group('date')

        # the meals of a day live in the closest enclosing table
        table = None
        for element in day_td.parents:
            if element.name == 'table':
                table = element
                break
        if not table:
            continue

        for tr in table.tbody.find_all('tr'):
            if 'geschlossen' in tr.text or 'Feiertage' in tr.text:
                match = day_range_regex.search(tr.text)
                if not match:
                    canteen.setDayClosed(date)
                else:
                    # a closure period: mark every day from start to end
                    fromDate = datetime.datetime.strptime(match.group('from'), '%d.%m.%Y')
                    toDate = datetime.datetime.strptime(match.group('to'), '%d.%m.%Y')
                    while fromDate <= toDate:
                        canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                        fromDate += datetime.timedelta(days=1)
                continue
            if len(tr) != 3:
                continue  # no meal

            strings = list(tr.contents[0].strings)
            name = strings[0]

            # prices: the last string of the first cell, separated by "|";
            # a "-" placeholder means there are no prices
            prices = strings[-1].split('|')
            if any(price.strip() == '-' for price in prices):
                prices = {}

            # notes: icon descriptions first, then matching legend entries
            notes = [img['alt'].replace('Symbol', '').strip()
                     for img in tr.contents[1].find_all('img')]
            for extra in set(map(int, extra_regex.findall(tr.text))):
                if extra in extraLegend:
                    notes.append(extraLegend[extra])

            canteen.addMeal(date, 'Hauptgerichte', name, notes, prices, roles if prices else None)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    """Convert the XML menu served at ``url`` into a canteen feed.

    Args:
        url: Address of the XML document to download.
        today: Not used by this parser.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()``. If the download raises
        any exception, the feed of the still empty builder is returned.
    """
    canteen = LazyBuilder()
    try:
        xml_data = urlopen(url).read()
    except Exception:
        # best effort: a failed download yields a feed without meals
        return canteen.toXMLFeed()

    for day in ET.fromstring(xml_data):
        # each day element carries a timestamp attribute, shown in local time
        day_start = time.localtime(int(day.get('timestamp')))
        date = time.strftime('%d.%m.%Y', day_start)
        for item in day:
            title = item.find('title').text
            description = get_description(title)
            notes = build_notes_string(title)
            plist = [
                item.find(tag).text
                for tag in ('preis1', 'preis2', 'preis3')
            ]
            food_type = get_food_types(item.find('piktogramme').text)
            canteen.addMeal(date, food_type, description, notes, plist, roles)
    return canteen.toXMLFeed()
Exemplo n.º 10
0
def parse_url(url, today=False):
    """Parse a page of per-day menu blocks into a canteen feed.

    Args:
        url: Address of the menu page to download.
        today: Not used by this parser.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected days.
    """
    canteen = LazyBuilder()
    document = parse(urlopen(url).read())
    for day_div in document.find_all('div', 'day') + document.find_all('article', attrs={'data-day': True}):
        # parse date, warning: calculate year number needed
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date')
            continue
        # read the clock once so year and month come from the same instant
        now = datetime.datetime.now()
        year = now.year
        if now.month > int(date_test.group('month')):
            year += 1  # date from next year
        date = "{}-{}-{}".format(year, date_test.group('month'), date_test.group('day'))

        if 'nodata' in day_div.attrs.get('class', []) or 'GESCHLOSSEN' in day_div.text:
            canteen.setDayClosed(date)
            continue

        closed_candidate = False
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            if 'geschlossen' in name:
                # only counts as closed if the day ends up without any meal
                closed_candidate = True
                continue
            category = meal_article.find('div', 'desc').text
            notes = [v['title'] for v in meal_article.find_all('div', 'theicon') if v['title']]
            additive_div = meal_article.find('div', 'additive')
            if additive_div:
                notes += [v[0] for v in extra_regex.findall(additive_div.text)]

            price_div = meal_article.find('div', 'price')
            if price_div is None:
                canteen.addMeal(date, category, name, notes)
                continue

            prices = {}
            for attr, role in (('default', 'student'), ('bed', 'employee'), ('guest', 'other')):
                # a missing data-* attribute is treated like an unparsable one
                price = price_regex.search(price_div.get('data-' + attr, ''))
                if price:
                    prices[role] = price.group('price')
                elif attr == 'default':
                    # without a default price no price is reported at all
                    prices = {}
                    break
            canteen.addMeal(date, category, name, notes, prices)

        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
Exemplo n.º 11
0
def parse_url(url, today=False):
    """Parse a page of ``data-day`` menu blocks into a canteen feed.

    Args:
        url: Address of the menu page to download.
        today: Not used by this parser.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected days.
    """
    canteen = LazyBuilder()
    document = parse(urlopen(url).read(), 'lxml')

    for day_div in document.find_all('div', attrs={'data-day': True}):
        # parse date, warning: calculate year number needed
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date "{}"'.format(day_div['data-day']))
            continue
        # read the clock once so year and month come from the same instant
        now = datetime.datetime.now()
        year = now.year
        if now.month > int(date_test.group('month')):
            year += 1  # date from next year
        date = '{}-{}-{}'.format(year, date_test.group('month'), date_test.group('day'))

        # a holiday marker only closes the day if no meal was found for it
        closed_candidate = day_div.find('div', 'holiday') is not None

        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue

            category = meal_article.find('div', 'icon')['title']
            notes = []
            prices = {}

            additives = meal_article.find('div', 'additnr')
            if additives:
                notes += [additive.text for additive in additives.find_all('li')]
            # icon titles already present among the additive notes are skipped
            notes += [v['title'] for v in meal_article.find_all('div', 'theicon') if v['title'] and v['title'] not in notes]

            price_div = meal_article.find('div', 'price')
            if price_div:
                for k, v in price_map.items():
                    # a missing data-* attribute simply yields no price
                    price = price_div.get('data-' + k)
                    if price:
                        prices[v] = price
            canteen.addMeal(date, category, name, notes, prices)

        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)

    return canteen.toXMLFeed()
Exemplo n.º 12
0
def main(url='https://www.stw-bremen.de/de/essen-trinken/mensa-nw-1',
         out='xml'):
    """Scrape the food plan page at ``url`` and return it as XML or JSON.

    Args:
        url: Address of the canteen page to download.
        out: ``'xml'`` returns the ``LazyBuilder`` feed, ``'json'`` returns
            a JSON string of the parsed data.

    Returns:
        The feed for ``out == 'xml'``, a JSON string for ``out == 'json'``
        and ``None`` for any other value of ``out``.
    """
    data = {}  # date string -> category name -> meal text and prices

    # close the session again as soon as the page has been downloaded
    with requests.session() as s:
        html = s.get(url).content  # the raw html code of the returned page
    soup = BeautifulSoup(html, 'html.parser')  # source code parser

    canteen = LazyBuilder()

    # one element per day, with ids of the form food-plan-<workday offset>
    for html_day in soup.find_all(id=re.compile("^food-plan-")):
        workday_offset = int(html_day['id'].split('-')[-1])
        date = get_date_from_id(workday_offset)
        date_str = dt.datetime.strftime(date, '%Y-%m-%d')
        data[date_str] = {}

        # The information for each meal is stored in a separate table with
        # class food-category; loop over them instead of hardcoding names.
        for meal in html_day.find_all("table", "food-category"):
            category_name = meal.find('th', 'category-name').string

            # The description contains line breaks and <sup> tags, so walk
            # the text nodes and leave out those inside <sup>.
            meal_parts = meal.find('td',
                                   'field-name-field-description').strings
            meal_text = ''.join(
                str(part) for part in meal_parts
                if part.parent.name != 'sup')
            meal_text = meal_text.replace('\r', '')
            meal_text = meal_text.replace('\n', ' ')
            meal_text = meal_text.replace('* * *', '; ')

            meal_price_a = meal.find('td',
                                     'field-name-field-price-students').text
            meal_price_b = meal.find('td',
                                     'field-name-field-price-employees').text

            data[date_str][category_name] = {
                'text': meal_text,
                'A': meal_price_a,
                'B': meal_price_b,
            }

            canteen.addMeal(date,
                            category_name,
                            meal_text,
                            prices={
                                'student': meal_price_a,
                                'employee': meal_price_b
                            })
    om = canteen.toXMLFeed()

    if out == 'xml':
        return om
    if out == 'json':
        return json.dumps(data, ensure_ascii=False)
    return None
Exemplo n.º 13
0
def parse_url(url, today=False):
    """Scrape the price page and the daily menu pages into a canteen feed.

    Args:
        url: Format string for the daily menu page; it is filled in with a
            ``datetime.date`` through ``url.format(date)``.
        today: When true, stop after the first day whose page could be
            fetched instead of walking on through the following days.

    Returns:
        The result of ``LazyBuilder.toXMLFeed()`` for the collected meals.

    Raises:
        HTTPError: When a daily page fails with a status other than 404.
    """
    canteen = LazyBuilder()

    # the price table lives on its own page and is keyed by row header text
    price_page = parse(urlopen(base + '/mensa-preise/').read(), 'lxml')
    prices = {}
    for tr in price_page.find('div', 'ce-bodytext').find_all('tr'):
        header = tr.find('th')
        if not header or not header.text.strip():
            continue
        amounts = tr.find_all('td', 'betrag')
        if len(amounts) < 3:
            continue
        header_classes = header.attrs.get('class', [])
        if 'titel' in header_classes or 'zeilentitel' in header_classes:
            continue
        # the first three amount cells map to student, employee and other
        by_role = {}
        for role, cell in zip(('student', 'employee', 'other'), amounts):
            found = price_regex.search(cell.text)
            if found:
                by_role[role] = found.group('price')
        prices[header.text.strip()] = by_role

    one_day = datetime.timedelta(days=1)
    errorCount = 0
    date = datetime.date.today()
    # stop once seven days in a row had no page (HTTP 404)
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read(), 'lxml')
        except HTTPError as e:
            if e.code != 404:
                raise e
            errorCount += 1
            date += one_day
            continue
        errorCount = 0

        # build the legend of this page: filter items first, then type labels
        legend = {}
        legends = document.find('div', 'tx-stwm-speiseplan')
        additions = legends.find('div', 'c-schedule__filter-body')
        for table in additions.find_all('div', 'c-schedule__filter-item'):
            for ingredient in table.find('ul').find_all('li'):
                key = ingredient.find('dt').text.strip()
                meaning = ingredient.find('dd').text.strip()
                legend[key] = meaning
        type_list = legends.find('ul', 'c-schedule__type-list')
        for label in type_list.find_all('li'):
            key = label.find('dt').text.replace('(', '').replace(')', '')
            key = key.strip()
            meaning = label.find('dd').text.strip()
            legend[key] = meaning

        # walk the meals of the day
        mensa_data = document.find('ul', 'c-schedule__list')
        category = None
        for meal in mensa_data.find_all('li'):
            # an empty category cell means "same category as the entry before"
            category_text = meal.find('dt', 'c-schedule__term').text.strip()
            if category_text:
                category = category_text

            data = meal.find('dd').find('p', 'js-schedule-dish-description')
            name = data.contents[0].strip()  # name is the first text node
            if not name:
                continue

            # each marker holds a comma-separated list wrapped in brackets
            # or parentheses; drop the wrappers and the empty items
            notes = []
            for note in meal.find_all('span', 'c-schedule__marker'):
                note_text = note.find('span', 'u-text-sup').text
                for wrapper in ('(', ')', '[', ']'):
                    note_text = note_text.replace(wrapper, '')
                notes.extend(n for n in note_text.split(',') if n)

            # some meals carry the GQB label in their name instead of the notes
            if '(GQB)' in name:
                name = name.replace('(GQB)', '').strip()
                notes.append('GQB')

            # both special meal kinds share the Bio-/Aktionsgericht price row
            price_category = category.replace(
                'Aktionsessen', 'Bio-/Aktionsgericht')
            price_category = price_category.replace(
                'Biogericht', 'Bio-/Aktionsgericht')
            price_category = price_category.strip()

            described_notes = [legend.get(n, n) for n in notes]
            canteen.addMeal(date, category, name, described_notes,
                            prices.get(price_category, {}))

        date += one_day
        if today:
            break

    return canteen.toXMLFeed()
Exemplo n.º 14
0
def parse_url(url, today=False):
    """Convert the meal table found at *url* into an OpenMensa XML feed.

    The first ``<table>`` of the page is walked row by row.  A row whose
    first cell matches ``date_regex`` starts a new day.  Such a row and
    the rows following it are read as one meal each, until a row whose
    cells all contain only whitespace ends the day.  Errors while reading
    a single row are printed as a traceback and the row is skipped.

    :param url: address of the page that contains the meal table
    :param today: accepted for interface compatibility; not used here
    :return: the value of ``LazyBuilder.toXMLFeed()``
    """
    content = urlopen(url).read()
    document = parse(content, "lxml")
    canteen = LazyBuilder()
    table = document.find_all('table')[0]

    # Footnotes that describe how a dish was treated ("Gericht ist ...").
    food_is_notes = {
        '1': 'mit Farbstoffen',
        '4': 'geschwärzt',
        '5': 'mit Süßungsmittel',
        '7': 'mit Antioxidationsmittel',
        '8': 'mit Geschmacksverstärker',
        '9': 'geschwefelt',
        '10': 'geschwärzt',
        '11': 'gewachst',
        '12': 'mit Phosphat',
    }
    # Footnotes that name an ingredient ("Gericht enthält ...").
    food_contains_notes = {
        'a1': 'Gluten',
        'a2': 'Krebstiere',
        'a3': 'Eier',
        'a4': 'Fisch',
        'a5': 'Erdnüsse',
        'a6': 'Soja',
        'a7': 'Milch/Laktose',
        'a8': 'Schalenfrüchte',
        'a9': 'Sellerie',
        'a10': 'Senf',
        'a11': 'Sesam',
        'a12': 'Schwefeldioxid/Sulfite',
        'a13': 'Lupinen',
        'a14': 'Weichtiere',
    }
    # Letters of the type column and their labels, in output order.
    food_type_labels = (
        ('R', 'Rind'),
        ('S', 'Schwein'),
        ('G', 'Geflügel'),
        ('V', 'Vegetarisch'),
        ('F', 'Fisch'),
        ('L', 'Lamm'),
        ('W', 'Wild'),
    )

    def debug_print(food_type, food_description, priceing):
        """Print one meal (and its prices, if any) for manual debugging."""
        if priceing is None:
            print(date + ': ' + food_type + ": " + food_description)
        else:
            print(date + ': ' + food_type + ": " + food_description + " : ",
                  end='')
            for e in priceing:
                print(e, end=' ')
            print()

    def is_new_entry(tds):
        """Return True if the first cell of the row holds a date."""
        first = tds[0]
        return (first.string is not None
                and date_regex.search(first.string) is not None)

    def is_end_of_entry(tds):
        """Return True if every cell holds only whitespace text."""
        for td in tds:
            if td.string is None or td.string.strip() != '':
                return False
        return True

    def is_action_entry(td):
        """Return True if the cell marks an 'Aktion' row."""
        return td.text == 'Aktion'

    def is_closed(tds):
        """Return True for a date row that carries no prices."""
        return is_new_entry(tds) and get_pricing(tds, 4, 7) is None

    def refactor_date(raw_date):
        """Turn the day/month found in *raw_date* into ``day.month.year``."""
        now = datetime.datetime.now()
        match = date_regex.search(raw_date)
        day = match.group('day')
        month = match.group('month')
        year = now.year
        if month == '01' and now.month == 12:
            # the list depicts meals from this and the next year
            year += 1
        elif month == '12' and now.month == 1:
            # the list depicts meals from this and the last year
            year -= 1
        return day + '.' + month + '.' + str(year)

    def parse_foot_type(td):
        """Return the space separated type labels described by *td*."""
        if td.string is None:
            # The type is given as an icon; only two icons are recognised.
            src = td.find_all('img')[0].get('src')
            if 'msc' in src:
                return 'Fish MSC'
            if 'vegan' in src:
                return 'Vegan'
            return ''
        if td.string.strip() == '':
            # Uncategorised food happens; openmensa.org rejects an empty
            # tag, so a generic label is used instead.
            return 'Tipp'
        return ' '.join(label for letter, label in food_type_labels
                        if letter in td.string)

    def get_refs(td):
        """Return the ``<sup>`` footnote elements of *td*."""
        return td.find_all('sup')

    def get_foot_description(td):
        """Return the text of *td* without its footnote references."""
        description = td.text
        for ref in get_refs(td):
            description = description.replace(' ' + ref.text, '', 1)
        # startswith() is safe on an empty cell, unlike description[0].
        if description.startswith(' '):
            description = description[1:]
        return description

    def get_notes(td):
        """Return the distinct footnote codes of *td* in document order."""
        codes = []
        for ref in get_refs(td):
            codes.extend(code.strip() for code in ref.text.split(','))
        # dict.fromkeys removes duplicates but keeps a stable order, so
        # the generated feed does not change from run to run.
        return list(dict.fromkeys(code for code in codes if code))

    def build_notes_string(td):
        """Return one sentence describing all footnotes of *td*, or ''."""
        food_is = ''
        food_contains = ''
        for code in get_notes(td):
            if code in food_is_notes:
                food_is += food_is_notes[code] + ', '
            elif code in food_contains_notes:
                food_contains += food_contains_notes[code] + ', '
            else:
                food_contains += 'undefinierte Chemikalien:' + code + ', '
        notes = ''
        if food_is:
            notes += 'Gericht ist ' + food_is
        if food_contains:
            notes += 'und enthält ' if food_is else 'Gericht enthält '
            notes += food_contains
        if notes:
            # replace the trailing ', ' by a full stop
            notes = notes[:-2] + '.'
        return notes

    def get_pricing(tds, f, t):
        """Return the prices in cells ``f`` to ``t - 1``, or None.

        None is returned when the row has fewer than 7 cells (possibly a
        special day) or when one of the price cells is blank.
        """
        if len(tds) < 7:
            return None
        priceing = []
        for i in range(f, t):
            raw_price = tds[i].string.strip()
            if raw_price == '':
                return None
            priceing.append(price_regex.search(raw_price).group('val'))
        return priceing

    # state helper
    inside_valide_entry = False
    date = ''

    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Rows without <td> cells (e.g. header rows) carry no data and
            # would otherwise raise IndexError in is_new_entry().
            continue
        if is_new_entry(tds):
            try:
                date = refactor_date(tds[0].string)
                if is_closed(tds):
                    # sometimes a canteen looks closed but it is actually
                    # "Spargeltage"
                    if "Spargeltage" in tds[3].text:
                        canteen.addMeal(date, "Spargel", "Spargel Tag",
                                        ["Spargel und andere Gerichte."], None,
                                        None)
                    else:
                        canteen.setDayClosed(date)
                else:
                    inside_valide_entry = True
            except Exception:
                traceback.print_exc()
        if is_end_of_entry(tds):
            inside_valide_entry = False
        elif inside_valide_entry:
            try:
                notes = []
                if is_action_entry(tds[0]):
                    # "Aktion" rows are shifted one column to the left.
                    food_type = parse_foot_type(tds[1])
                    food_description = get_foot_description(tds[2])
                    notes_string = build_notes_string(tds[2])
                    if notes_string != "":
                        notes.append(notes_string)
                    prices = get_pricing(tds, 3, 6)
                    canteen.addMeal(date, 'Aktion: ' + food_type,
                                    food_description, notes, prices,
                                    roles if prices else None)
                else:
                    food_type = parse_foot_type(tds[2])
                    food_description = get_foot_description(tds[3])
                    notes_string = build_notes_string(tds[3])
                    if notes_string != "":
                        notes.append(notes_string)
                    prices = get_pricing(tds, 4, 7)
                    canteen.addMeal(date, food_type, food_description, notes,
                                    prices, roles if prices else None)
            except Exception:
                traceback.print_exc()

    return canteen.toXMLFeed()
Exemplo n.º 15
0
def parsePlan(url, internalMensaId, today):
    """Read the daily menu pages starting at *url* into an OpenMensa feed.

    Each page is parsed for the element with id *internalMensaId*; the
    ``<dd>`` next to it is expected to hold exactly one table with one
    dish per row.  If it does not, the day is marked as closed.  The
    element with id ``next_day_link`` is followed to the next page until
    there is none.

    :param url: address of the first page to read
    :param internalMensaId: ``id`` attribute of the canteen's entry on the page
    :param today: if true, only the first page is read
    :return: the value of ``LazyBuilder.toXMLFeed()``
    """
    # Label printed in front of a price -> role name used in the feed.
    roleByGroup = {
        'Stud.:': 'student',
        'Bed.:': 'employee',
        'Gast:': 'other',
    }

    canteen = LazyBuilder()
    while url is not None:
        dom = BeautifulSoup(urlopen(url).read(), 'lxml')
        date = dom.select('#mensa_date > p')[0].contents[0]
        menuDefinition = dom.find(id=internalMensaId)
        menuDescription = menuDefinition.parent.find('dd')
        tables = menuDescription.select('table')
        legend = buildLegend(
            {},
            str(dom),
            regex=r'<strong>(?P<name>\w+)\s*</strong>\s*-\s*(?P<value>[\w\s)(]+)'
        )
        if tables is not None and len(tables) == 1:
            for row in tables[0].find_all('tr'):
                # select() returns a (possibly empty) list, never None, so
                # rows without a dish name have to be skipped explicitly.
                menuNameElement = row.select('td[class="mensa_col_55"] > b')
                if not menuNameElement or not menuNameElement[0].contents:
                    continue
                menuName = menuNameElement[0].contents[0]
                category = 'Gericht'

                # get notes
                notes = []
                notesElement = row.select('td[class="mensa_col_55"] > span')
                if notesElement:
                    notes = [
                        legend.get(n, n)
                        for n in notesElement[0].text.split(' ') if n
                    ]

                # get prices
                prices = {}
                for td in row.select('td[class="mensa_col_15"]'):
                    priceElement = td.find('b')
                    groupElement = td.find('span')
                    if priceElement is None or groupElement is None:
                        continue
                    if not groupElement.contents or not priceElement.contents:
                        continue
                    group = str(groupElement.contents[0])
                    if group in roleByGroup:
                        prices[roleByGroup[group]] = str(
                            priceElement.contents[0])

                canteen.addMeal(date, category, menuName, notes, prices)
        else:
            canteen.setDayClosed(date)

        # check for further pages
        nextPageLink = dom.find(id='next_day_link')
        if nextPageLink is None or today:
            url = None
        else:
            url = 'https://www.studentenwerk-rostock.de/' + nextPageLink['href']
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    """Build an OpenMensa XML feed from the weekly CSV plans below *url*.

    For each requested ISO week ``<url>/<isoweek>.csv`` is downloaded and
    every data row is added as one meal.  Weeks that answer with HTTP 404
    are skipped, so the feed is empty if no week is available.

    :param url: base location of the CSV files (without trailing slash)
    :param today: if true only the current week is read, otherwise the
        current and the next one
    :return: the value of ``LazyBuilder.toXMLFeed()``
    :raises HTTPError: for any HTTP error other than 404
    :raises RuntimeError: if a row carries an unknown category
    """
    canteen = LazyBuilder()
    legend = {
        '1':     'mit Farbstoff',
        '2':     'mit Konservierungsstoff',
        '3':     'mit Antioxidationsmittel',
        '4':     'mit Geschmacksverstärker',
        '5':     'geschwefelt',
        '6':     'geschwärzt',
        '7':     'gewachst',
        '8':     'mit Phosphat',
        '9':     'mit Süssungsmittel Saccharin',
        '10':    'mit Süssungsmittel Aspartam, enth. Phenylalaninquelle',
        '11':    'mit Süssungsmittel Cyclamat',
        '12':    'mit Süssungsmittel Acesulfam',
        '13':    'chininhaltig',
        '14':    'coffeinhaltig',
        '15':    'gentechnisch verändert',
        '16':    'enthält Sulfite',
        '17':    'enthält Phenylalanin',
        'A':     'Gluten',
        'B':     'Krebstiere',
        'C':     'Eier',
        'D':     'Fisch',
        'E':     'Erdnüsse',
        'F':     'Soja',
        'G':     'Milch und Milchprodukte',
        'H':     'Schalenfrüchte',
        'I':     'Sellerie',
        'J':     'Senf',
        'K':     'Sesamsamen',
        'L':     'Schwefeldioxid und Sulfite',
        'M':     'Lupinen',
        'N':     'Weichtiere',
        'ZTA':   'Aktionsgericht',
        'ZTB':   'mit ausschließlich biologisch erzeugten Rohstoffen',
        'ZTF':   'Fisch',
        'ZTG':   'Geflügel',
        'ZTL':   'Lamm',
        'ZTMSC': 'zertifizierte nachhaltige Fischerei (MSC-C-53400)',
        'ZTMV':  'Mensa Vital',
        'ZTR':   'Rindfleisch',
        'ZTS':   'Schweinefleisch',
        'ZTV':   'vegetarisch',
        'ZTVG':  'vegan',
        'ZTW':   'Wild'
    }

    # Create regular expressions for categories
    hg = re.compile("^HG[1-9]$")
    b = re.compile("^B[1-9]$")
    n = re.compile("^N[1-9]$")

    # Set roles for prices
    roles = ('student', 'employee', 'other')

    # Get two weeks for the full feed and only the current one for today
    # On error 404 continue with next isoweek
    # Returns an empty feed if all isoweeks result in error 404
    # At most locations the data doesn't exist on term break
    weeks = 1 if today else 2
    for w in range(weeks):
        kw = (date.today() + timedelta(weeks=w)).isocalendar()[1]
        try:
            # The context manager closes the HTTP response after reading.
            with urlopen('%(location)s/%(isoweek)d.csv' %
                         {'location': url, 'isoweek': kw}) as response:
                # Decode data from ISO charset
                f = response.read().decode('iso8859-1')
        except HTTPError as e:
            if e.code == 404:
                continue
            raise

        # Read csv data and skip the csv header (if there is one)
        mealreader = reader(f.splitlines(), delimiter=';')
        next(mealreader, None)
        for row in mealreader:
            if not row:
                # blank line in the csv data
                continue
            mdate = row[0]
            category = row[2]
            mname = row[3]
            mtype = row[4]
            prices = [row[6], row[7], row[8]]

            # determine category for the current meal
            if category == 'Suppe':
                pass
            elif hg.match(category):
                category = 'Hauptgerichte'
            elif b.match(category):
                category = 'Beilagen'
            elif n.match(category):
                category = 'Nachspeisen'
            else:
                raise RuntimeError('Unknown category: ' + str(category))

            # Extract the notes from brackets in the meal name
            # Remove the brackets, notes and improve readability
            notes = []
            bpos = mname.find(')')
            while bpos != -1:
                apos = mname.find('(')
                if apos == -1 or apos > bpos:
                    # Unbalanced brackets: there is nothing to extract and
                    # rewriting the name would never remove the ')', so
                    # the loop would not terminate. Keep the name as is.
                    break
                # Extract notes from current brackets and avoid empty notes
                notes.extend(i for i in mname[apos + 1:bpos].split(',') if i)
                # Check if brackets are at the end of the meal name
                if bpos == len(mname) - 1:
                    # Remove brackets and leave the bracket loop
                    mname = mname[:apos]
                    break
                # Remove current brackets, improve readability
                # and find the next brackets
                mname = (mname[:apos].rstrip() + ' und ' +
                         mname[bpos + 1:].lstrip())
                bpos = mname.find(')')

            # Remove trailing whitespaces in the meal name
            mname = mname.rstrip()

            # Add meal type notes to notes list and avoid empty notes
            notes.extend('ZT' + i for i in mtype.split(',') if i)

            # Translate notes via legend to human readable information;
            # the second lookup retries without the two character prefix
            mnotes = [legend.get(i, legend.get(i[2:], i)) for i in notes]

            # Try to add the meal
            try:
                canteen.addMeal(mdate, category, mname,
                                mnotes, prices, roles)
            except ValueError as e:
                # empty meal ...
                print('could not add meal {}/{} "{}" due to "{}"'.format(mdate, category, mname, e), file=sys.stderr)

    # return xml data
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    """Build an OpenMensa XML feed from the weekly CSV plans below *url*.

    For each requested ISO week ``<url>/<isoweek>.csv`` is downloaded and
    every data row is added as one meal.  Weeks that answer with HTTP 404
    are skipped, so the feed is empty if no week is available.

    :param url: base location of the CSV files (without trailing slash)
    :param today: if true only the current week is read, otherwise the
        current and the next one
    :return: the value of ``LazyBuilder.toXMLFeed()``
    :raises HTTPError: for any HTTP error other than 404
    :raises RuntimeError: if a row carries an unknown category
    """
    canteen = LazyBuilder()
    legend = {
        '1': 'mit Farbstoff',
        '2': 'mit Konservierungsstoff',
        '3': 'mit Antioxidationsmittel',
        '4': 'mit Geschmacksverstärker',
        '5': 'geschwefelt',
        '6': 'geschwärzt',
        '7': 'gewachst',
        '8': 'mit Phosphat',
        '9': 'mit Süssungsmittel Saccharin',
        '10': 'mit Süssungsmittel Aspartam, enth. Phenylalaninquelle',
        '11': 'mit Süssungsmittel Cyclamat',
        '12': 'mit Süssungsmittel Acesulfam',
        '13': 'chininhaltig',
        '14': 'coffeinhaltig',
        '15': 'gentechnisch verändert',
        '16': 'enthält Sulfite',
        '17': 'enthält Phenylalanin',
        'A': 'Gluten',
        'AA': 'Weizen',
        'AB': 'Roggen',
        'AC': 'Gerste',
        'AD': 'Hafer',
        'AE': 'Dinkel',
        'AF': 'Kamut',
        'B': 'Krebstiere',
        'C': 'Eier',
        'D': 'Fisch',
        'E': 'Erdnüsse',
        'F': 'Soja',
        'G': 'Milch und Milchprodukte',
        'H': 'Schalenfrüchte',
        'HA': 'Mandel',
        'HB': 'Haselnuss',
        'HC': 'Walnuss',
        'HD': 'Cashew',
        'HE': 'Pecannuss',
        'HF': 'Paranuss',
        'HG': 'Pistazie',
        'HH': 'Macadamianuss',
        'HI': 'Queenslandnuss',
        'I': 'Sellerie',
        'J': 'Senf',
        'K': 'Sesamsamen',
        'L': 'Schwefeldioxid und Sulfite',
        'M': 'Lupinen',
        'N': 'Weichtiere',
        'O': 'Nitrat',
        'P': 'Nitritpökelsalz',
        'ZTA': 'Alkohol',
        'ZTB': 'mit ausschließlich biologisch erzeugten Rohstoffen',
        'ZTF': 'Fisch',
        'ZTG': 'Geflügel',
        'ZTL': 'Lamm',
        'ZTMSC': 'zertifizierte nachhaltige Fischerei (MSC-C-53400)',
        'ZTMV': 'Mensa Vital',
        'ZTR': 'Rindfleisch',
        'ZTS': 'Schweinefleisch',
        'ZTV': 'vegetarisch',
        'ZTVG': 'vegan',
        'ZTW': 'Wild'
    }

    # Create regular expressions for categories
    hg = re.compile("^HG[1-9]$")
    b = re.compile("^B[1-9]$")
    n = re.compile("^N[1-9]$")

    # Set roles for prices
    roles = ('student', 'employee', 'other')

    # Get two weeks for full.xml and only the current one for today.xml
    # On error 404 continue with next isoweek
    # Returns an empty feed if all isoweeks result in error 404
    # At most locations the data doesn't exist on term break
    weeks = 1 if today else 2
    for w in range(weeks):
        kw = (date.today() + timedelta(weeks=w)).isocalendar()[1]
        try:
            # The context manager closes the HTTP response after reading.
            with urlopen('%(location)s/%(isoweek)d.csv' % {
                    'location': url,
                    'isoweek': kw
            }) as response:
                # Decode data from ISO charset
                f = response.read().decode('iso8859-1')
        except HTTPError as e:
            if e.code == 404:
                continue
            raise

        # Read csv data and skip the csv header (if there is one)
        mealreader = reader(f.splitlines(), delimiter=';')
        next(mealreader, None)
        for row in mealreader:
            if not row:
                # blank line in the csv data
                continue
            mdate = row[0]
            category = row[2]
            mname = row[3]
            mtype = row[4]
            prices = [row[6], row[7], row[8]]

            # determine category for the current meal
            if category == 'Suppe':
                pass
            elif hg.match(category):
                category = 'Hauptgerichte'
            elif b.match(category):
                category = 'Beilagen'
            elif n.match(category):
                category = 'Nachspeisen'
            else:
                raise RuntimeError('Unknown category: ' + str(category))

            # Extract the notes from brackets in the meal name
            # Remove the brackets, notes and improve readability
            notes = []
            bpos = mname.find(')')
            while bpos != -1:
                apos = mname.find('(')
                if apos == -1 or apos > bpos:
                    # Unbalanced brackets: there is nothing to extract and
                    # rewriting the name would never remove the ')', so
                    # the loop would not terminate. Keep the name as is.
                    break
                # Extract notes from current brackets and avoid empty notes
                notes.extend(i for i in mname[apos + 1:bpos].split(',') if i)
                # Check if brackets are at the end of the meal name
                if bpos == len(mname) - 1:
                    # Remove brackets and leave the bracket loop
                    mname = mname[:apos]
                    break
                # Remove current brackets, improve readability
                # and find the next brackets
                mname = (mname[:apos].rstrip() + ' und ' +
                         mname[bpos + 1:].lstrip())
                bpos = mname.find(')')

            # Remove trailing whitespaces in the meal name
            mname = mname.rstrip()

            # Add meal type notes to notes list and avoid empty notes
            notes.extend('ZT' + i for i in mtype.split(',') if i)

            # Translate notes via legend to human readable information;
            # the second lookup retries without the two character prefix
            mnotes = [legend.get(i, legend.get(i[2:], i)) for i in notes]

            # Try to add the meal
            try:
                canteen.addMeal(mdate, category, mname, mnotes, prices, roles)
            except ValueError as e:
                # empty meal ...
                print('could not add meal {}/{} "{}" due to "{}"'.format(
                    mdate, category, mname, e),
                      file=sys.stderr)

    # return xml data
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    """Build an OpenMensa XML feed from the current week's CSV plan.

    ``<url>/<isoweek>.csv`` is downloaded for the current ISO week and
    every data row is added as one meal.  If the file answers with HTTP
    404 an empty feed is returned.

    :param url: base location of the CSV files (without trailing slash)
    :param today: accepted for interface compatibility; not used here
    :return: the value of ``LazyBuilder.toXMLFeed()``
    :raises HTTPError: for any HTTP error other than 404
    :raises RuntimeError: if a row carries an unknown category
    """
    canteen = LazyBuilder()
    legend = {
        '1':     'mit Farbstoff',
        '2':     'mit Konservierungsstoff',
        '3':     'mit Antioxidationsmittel',
        '4':     'mit Geschmacksverstärker',
        '5':     'geschwefelt',
        '6':     'geschwärzt',
        '7':     'gewachst',
        '8':     'mit Phosphat',
        '9':     'mit Süssungsmittel Saccharin',
        '10':    'mit Süssungsmittel Aspartam, enth. Phenylalaninquelle',
        '11':    'mit Süssungsmittel Cyclamat',
        '12':    'mit Süssungsmittel Acesulfam',
        '13':    'chininhaltig',
        '14':    'coffeinhaltig',
        '15':    'gentechnisch verändert',
        '16':    'enthält Sulfite',
        '17':    'enthält Phenylalanin',
        'A':     'Gluten',
        'B':     'Krebstiere',
        'C':     'Eier',
        'D':     'Fisch',
        'E':     'Erdnüsse',
        'F':     'Soja',
        'G':     'Milch und Milchprodukte',
        'H':     'Schalenfrüchte',
        'I':     'Sellerie',
        'J':     'Senf',
        'K':     'Sesamsamen',
        'L':     'Schwefeldioxid und Sulfite',
        'M':     'Lupinen',
        'N':     'Weichtiere',
        'ZTA':   'Aktionsgericht',
        'ZTB':   'mit ausschließlich biologisch erzeugten Rohstoffen',
        'ZTF':   'Fisch',
        'ZTG':   'Geflügel',
        'ZTL':   'Lamm',
        'ZTMSC': 'zertifizierte nachhaltige Fischerei (MSC-C-53400)',
        'ZTMV':  'Mensa Vital',
        'ZTR':   'Rindfleisch',
        'ZTS':   'Schweinefleisch',
        'ZTV':   'vegetarisch',
        'ZTVG':  'vegan',
        'ZTW':   'Wild'
    }

    # Regular expressions for the numbered category codes
    hg = re.compile("^HG[1-9]$")
    b = re.compile("^B[1-9]$")
    n = re.compile("^N[1-9]$")

    roles = ('student', 'employee', 'other')

    # Only the current ISO week (offset 0) is requested.
    for w in [0]:
        kw = (date.today() + timedelta(weeks=w)).isocalendar()[1]
        try:
            # The context manager closes the HTTP response after reading.
            with urlopen('%(location)s/%(isoweek)d.csv' %
                         {'location': url, 'isoweek': kw}) as response:
                f = response.read().decode('iso8859-1')
        except HTTPError as e:
            if e.code == 404:
                continue
            raise

        # Read the csv data and skip the header line (if there is one)
        mealreader = reader(f.splitlines(), delimiter=';')
        next(mealreader, None)
        for row in mealreader:
            if not row:
                # blank line in the csv data
                continue

            if row[2] == 'Suppe':
                category = 'Suppe'
            elif hg.match(row[2]):
                category = 'Hauptgerichte'
            elif b.match(row[2]):
                category = 'Beilagen'
            elif n.match(row[2]):
                category = 'Nachspeisen'
            else:
                raise RuntimeError('Unknown category: ' + str(row[2]))

            mdate = row[0]
            notes = []

            # Move the bracketed note codes out of the meal name
            mname = row[3]
            bpos = mname.find(')')
            while bpos != -1:
                apos = mname.find('(')
                if apos == -1 or apos > bpos:
                    # Unbalanced brackets: there is nothing to extract and
                    # rewriting the name would never remove the ')', so
                    # the loop would not terminate. Keep the name as is.
                    break
                # skip empty notes such as the one in "(A,)"
                notes.extend(i for i in mname[apos + 1:bpos].split(',') if i)
                if bpos == len(mname) - 1:
                    # brackets at the end of the name: just cut them off
                    mname = mname[:apos]
                    break
                mname = mname[:apos] + ' und ' + mname[bpos + 1:]
                bpos = mname.find(')')
            # drop a single trailing space
            if mname.endswith(' '):
                mname = mname[:-1]

            # meal type codes are looked up with a 'ZT' prefix
            mtype = row[4]
            notes.extend('ZT' + i for i in mtype.split(',') if i)

            prices = [row[6], row[7], row[8]]

            # Translate notes via the legend; the second lookup retries
            # without the two character prefix
            mnotes = [legend.get(i, legend.get(i[2:], i)) for i in notes]

            try:
                canteen.addMeal(mdate, category, mname,
                                mnotes, prices, roles)
            except ValueError as e:
                # empty meal ...
                print('could not add meal {}/{} "{}" due to "{}"'.format(mdate, category, mname, e), file=sys.stderr)

    return canteen.toXMLFeed()
Exemplo n.º 19
0
def parse_url(url, today=False):
    """Parse the weekly preview table of a canteen into an OpenMensa feed.

    *url* is a ``%``-format template that receives the reference date as
    ``YYYY_MM_DD``; the reference date is the current day, or the
    following day when the current day is a Sunday.  The table with id
    ``previewTable`` consists of a header row naming the categories,
    followed by pairs of rows: one with the meals and one with the prices.

    :param url: URL template containing one ``%s`` placeholder
    :param today: accepted for interface compatibility; not used here
    :return: the value of ``LazyBuilder.toXMLFeed()``
    :raises RuntimeError: if the formatted URL is not http(s)
    """
    # A separate name keeps the ``today`` argument from being overwritten.
    start_day = datetime.date.today()
    if start_day.weekday() == 6:  # Sunday
        start_day += datetime.timedelta(days=1)  # Tomorrow

    url = url % start_day.strftime('%Y_%m_%d')

    if not url.startswith(("http://", "https://")):
        raise RuntimeError("url is not an allowed URL: '%s'" % url)
    try:
        content = requests.get(url).text
    except requests.exceptions.ConnectionError as e:
        logging.warning(e)
        # NOTE(review): this retry disables TLS certificate verification.
        # It is kept as the existing best-effort fallback, but it accepts
        # any certificate and should be reconsidered.
        content = requests.get(url, verify=False).text

    document = BeautifulSoup(content, "html.parser")
    canteen = LazyBuilder()

    # Prices for employees and guests are surcharges (in percent) on the
    # student price; fall back to fixed factors if they cannot be read.
    try:
        p = price_regex.search(document.find(
            "p", {"id": "message"}).text).groupdict()
        employee_multiplier = 1.0 + int(p["employee"]) / 100.0
        guest_multiplier = 1.0 + int(p["guest"]) / 100.0
    except (AttributeError, TypeError, KeyError, ValueError):
        employee_multiplier = 1.25
        guest_multiplier = 1.60

    trs = document.find("table", {"id": "previewTable"}).find_all("tr")

    canteenCategories = []

    firstTr = True
    previous = None   # previous tr row
    for tr in trs:
        closed = False
        mealsFound = False
        if firstTr:
            # First table row contains the names of the different categories
            firstTr = False

            for th in tr.find_all("th")[1:]:
                canteenCategories.append(th.text.strip())

        elif previous is None:
            # Normal table row containing meal information
            previous = tr

        else:
            # Price table row
            date = day_regex.search(previous.find("td", {"class": "first"})[
                                    "data-date"]).group('date')

            if "geschlossen" == previous.find_all("td")[1].text.strip():
                closed = date

            cat = 0
            for td0, td1 in zip(previous.find_all("td")[
                                1:], tr.find_all("td")):
                if "heute kein Angebot" in td0.text or "geschlossen" in td0.text:
                    cat += 1
                    continue

                notes = []

                # Category
                if td0.find("h2"):
                    categoryName = canteenCategories[cat] + " " + \
                        correctCapitalization(td0.find("h2").text.strip())
                else:
                    categoryName = canteenCategories[cat]

                if "Kubusangebote am Themenpark" in td0.text:
                    canteen.addMeal(date, categoryName,
                                    "Kubusangebote am Themenpark", [])
                    cat += 1
                    continue

                # Name
                if td0.find("p"):
                    name = removeextras_regex.sub("", td0.find("p").text)
                else:
                    name = categoryName  # No name available, let's just use the category name

                # Prices
                prices = []
                spans = td1.find_all("span", {"class": "label"})
                if spans:
                    try:
                        price = float(euro_regex.search(
                            spans[0].text).group(1).replace(",", "."))
                    except (AttributeError, TypeError, KeyError, ValueError):
                        # No numeric price: keep the label as a note and
                        # leave the meal without prices.
                        notes.append(spans[0].text.strip() + " Preis")
                    else:
                        # Built only from a price parsed for this meal, so
                        # a failed parse can neither raise
                        # UnboundLocalError nor reuse the previous meal's
                        # price.
                        prices = (price, price * employee_multiplier,
                                  price * guest_multiplier)
                    if len(spans) == 2:
                        notes.append(spans[1].text.strip() + " Preis")

                # Notes: vegan, vegetarisch, ...
                notes += [icon["title"]
                          for icon in td1.find_all("span", {"class": "icon"})]

                canteen.addMeal(date, categoryName, name,
                                notes, prices, roles if prices else None)

                mealsFound = True
                cat += 1

            previous = None
        if not mealsFound and closed:
            canteen.setDayClosed(closed)

    return canteen.toXMLFeed()
Exemplo n.º 20
0
def parse_url(url, today=False):
    """Scrape the daily menu pages and return them as an OpenMensa XML feed.

    ``url`` is a format string that receives the date of the day to fetch.
    Days are fetched one after another starting today; scraping stops after
    seven consecutive 404 responses, or after the first successfully parsed
    day when ``today`` is true.
    """
    canteen = LazyBuilder()

    # Prices are published on a separate page, one table row per category.
    price_page = parse(urlopen(base + '/mensa-preise/').read(), 'lxml')
    prices = {}
    role_columns = (('student', 0), ('employee', 1), ('other', 2))
    for row in price_page.find('div', 'ce-bodytext').find_all('tr'):
        header = row.find('th')
        if not header or not header.text.strip():
            continue
        amounts = row.find_all('td', 'betrag')
        if len(amounts) < 3:
            continue
        header_classes = header.attrs.get('class', [])
        if 'titel' in header_classes or 'zeilentitel' in header_classes:
            continue
        role_prices = {}
        prices[header.text.strip()] = role_prices
        for role, column in role_columns:
            found = price_regex.search(amounts[column].text)
            if found:
                role_prices[role] = found.group('price')

    one_day = datetime.timedelta(days=1)
    misses = 0
    date = datetime.date.today()
    while misses < 7:
        try:
            document = parse(urlopen(url.format(date)).read(), 'lxml')
        except HTTPError as e:
            if e.code != 404:
                raise e
            # No plan for this day: count the miss and try the next day.
            misses += 1
            date += one_day
            continue
        misses = 0

        # Legend: additive/allergen filters plus the meal-type labels.
        legend = {}
        schedule = document.find('div', 'tx-stwm-speiseplan')
        filter_body = schedule.find('div', 'c-schedule__filter-body')
        for group in filter_body.find_all('div', 'c-schedule__filter-item'):
            for entry in group.find('ul').find_all('li'):
                code = entry.find('dt').text.strip()
                meaning = entry.find('dd').text.strip()
                legend[code] = meaning
        for entry in schedule.find('ul', 'c-schedule__type-list').find_all('li'):
            code = entry.find('dt').text.replace('(', '').replace(')', '').strip()
            meaning = entry.find('dd').text.strip()
            legend[code] = meaning

        # Meals: a list item without its own category inherits the previous one.
        category = None
        for item in document.find('ul', 'c-schedule__list').find_all('li'):
            heading = item.find('dt', 'c-schedule__term').text.strip()
            if heading:
                category = heading

            dish = item.find('dd').find('p', 'js-schedule-dish-description')
            name = dish.contents[0].strip()  # the name is the first text node
            if not name:
                continue

            # Marker boxes hold comma-separated codes wrapped in () or [].
            notes = []
            for marker in item.find_all('span', 'c-schedule__marker'):
                codes = marker.find('span', 'u-text-sup').text
                for bracket in ('(', ')', '[', ']'):
                    codes = codes.replace(bracket, '')
                notes.extend(code for code in codes.split(',') if code)

            # Some meals carry the GQB label in their name instead of the notes.
            if '(GQB)' in name:
                name = name.replace('(GQB)', '').strip()
                notes.append('GQB')

            # Both special categories share the "Bio-/Aktionsgericht" price row.
            price_category = category.replace('Aktionsessen', 'Bio-/Aktionsgericht')
            price_category = price_category.replace('Biogericht', 'Bio-/Aktionsgericht')
            price_category = price_category.strip()

            resolved_notes = [legend.get(code, code) for code in notes]
            canteen.addMeal(date, category, name, resolved_notes,
                            prices.get(price_category, {}))

        date += one_day
        if today:
            break

    return canteen.toXMLFeed()
Exemplo n.º 21
0
def parse_url(url, today=False):
    """Scrape the daily menu pages and return them as an OpenMensa XML feed.

    ``url`` is a format string that receives the date of the day to fetch.
    Days are fetched one after another starting today; scraping stops after
    seven consecutive 404 responses, or after the first successfully parsed
    day when ``today`` is true.
    """
    canteen = LazyBuilder()

    # Seed the legend with the two meal-type markers, then add the additives
    # listed on the dedicated legend page.
    legend = {'f': 'fleischloses Gericht', 'v': 'veganes Gericht'}
    legend_page = parse(urlopen(base + '/speiseplan/zusatzstoffe-de.html').read())
    for description_cell in legend_page.find_all('td', 'beschreibung'):
        legend[description_cell.parent.find('td', 'gericht').text] = description_cell.text

    # Price table: one row per meal category, three amount columns per row.
    price_page = parse(urlopen(base + '/mensa-preise/').read())
    prices = {}
    role_columns = (('student', 0), ('employee', 1), ('other', 2))
    for row in price_page.find('div', 'ce-bodytext').find_all('tr'):
        header = row.find('th')
        if not header or not header.text.strip():
            continue
        amounts = row.find_all('td', 'betrag')
        if len(amounts) < 3:
            continue
        header_classes = header.attrs.get('class', [])
        if 'titel' in header_classes or 'zeilentitel' in header_classes:
            continue
        role_prices = {}
        prices[header.text.strip()] = role_prices
        for role, column in role_columns:
            found = price_regex.search(amounts[column].text)
            if found:
                role_prices[role] = found.group('price')

    misses = 0
    date = datetime.date.today()
    while misses < 7:
        try:
            document = parse(urlopen(url.format(date)).read())
        except HTTPError as e:
            if e.code != 404:
                raise e
            # No plan for this day: count the miss and try the next day.
            misses += 1
            date += datetime.date.resolution
            continue
        misses = 0

        # Each day page lists further additives; merge them into the legend.
        for row in document.find('table', 'zusatzstoffe').find_all('tr'):
            identifier = row.find_all('td')[0].text
            identifier = identifier.replace('(', '').replace(')', '')
            legend[identifier] = row.find_all('td')[1].text.strip()
        canteen.setLegendData(legend)

        # Menu rows: a row with an empty category cell inherits the previous one.
        category = None
        for row in document.find('table', 'menu').find_all('tr'):
            if row.find('td', 'headline'):
                continue
            label = row.find('td', 'gericht').text
            if label:
                category = label
            description = row.find('td', 'beschreibung')
            name = description.find('span').text.strip()
            if not name:
                continue
            notes = []
            for span in description.find_all('span', title=True):
                notes.append(span['title'])
            price_category = category.replace('Aktionsessen', 'Bio-/Aktionsgericht')
            canteen.addMeal(date, category, name, notes,
                            prices.get(price_category, {}))

        date += datetime.date.resolution
        if today:
            break
    return canteen.toXMLFeed()
Exemplo n.º 22
0
def parse_url(url, today=False):
    """Parse the menu table found at ``url`` into an OpenMensa XML feed.

    The first ``<table>`` of the page is read row by row. A row whose first
    cell contains a date opens a new day; a row consisting only of blank cells
    closes it; every row in between describes one meal (regular or "Aktion").
    Rows that fail to parse are reported on stderr via ``traceback`` and
    skipped, so one bad row never aborts the whole feed.

    ``today`` is accepted for interface compatibility and is not used.

    Returns the feed as produced by ``LazyBuilder.toXMLFeed()``.
    """
    content = urlopen(url).read()
    document = parse(content, "lxml")
    canteen = LazyBuilder()
    table = document.find_all('table')[0]

    # Letter codes of the food-type column, in the order they are reported.
    food_type_letters = (
        ('R', 'Rind'),
        ('S', 'Schwein'),
        ('G', 'Geflügel'),
        ('V', 'Vegetarisch'),
        ('F', 'Fisch'),
        ('L', 'Lamm'),
        ('W', 'Wild'),
    )
    # Footnotes describing how the dish was treated ("Gericht ist ...").
    # NOTE(review): '4' and '10' both map to 'geschwärzt' in the original
    # code; one of them may be a copy/paste slip -- confirm against the
    # canteen's published legend.
    additive_notes = {
        '1': 'mit Farbstoffen',
        '4': 'geschwärzt',
        '7': 'mit Antioxidationsmittel',
        '8': 'mit Geschmacksverstärker',
        '9': 'geschwefelt',
        '10': 'geschwärzt',
        '11': 'gewachst',
        '12': 'mit Phosphat',
        '5': 'mit Süßungsmittel',
    }
    # Footnotes describing allergens ("Gericht enthält ...").
    allergen_notes = {
        'a1': 'Gluten',
        'a2': 'Krebstiere',
        'a3': 'Eier',
        'a4': 'Fisch',
        'a5': 'Erdnüsse',
        'a6': 'Soja',
        'a7': 'Milch/Laktose',
        'a8': 'Schalenfrüchte',
        'a9': 'Sellerie',
        'a10': 'Senf',
        'a11': 'Sesam',
        'a12': 'Schwefeldioxid/Sulfite',
        'a13': 'Lupinen',
        'a14': 'Weichtiere',
    }

    def is_new_entry(tds):
        """Return True when the row's first cell holds a date."""
        # Rows without any <td> (e.g. header rows made of <th>) used to raise
        # IndexError here, outside of any try block, killing the whole parse.
        if not tds:
            return False
        first = tds[0].string
        return first is not None and date_regex.search(first) is not None

    def is_end_of_entry(tds):
        """Return True when every cell of the row is a blank string."""
        return all(
            td.string is not None and td.string.strip() == '' for td in tds)

    def is_action_entry(td):
        """Return True when the cell marks an "Aktion" (special offer) row."""
        return td.text == 'Aktion'

    def is_closed(tds):
        """Return True for a date row that carries no prices."""
        return is_new_entry(tds) and get_pricing(tds, 4, 7) is None

    def refactor_date(raw_date):
        """Turn the page's day/month into ``dd.mm.yyyy``, guessing the year."""
        now = datetime.datetime.now()
        match = date_regex.search(raw_date)
        day = match.group('day')
        month = match.group('month')
        year = now.year
        if month == '01' and now.month == 12:
            # The list spans this year and the next one.
            year += 1
        elif month == '12' and now.month == 1:
            # The list spans this year and the previous one.
            year -= 1
        return day + '.' + month + '.' + str(year)

    def parse_food_type(td):
        """Map the food-type cell to a category name.

        Returns None when the cell has no plain string and no image, which
        the caller treats as "not a meal row".
        """
        if td.string is None:
            images = td.find_all('img')
            if not images:
                return None
            src = images[0].get('src')
            if 'msc' in src:
                return 'Fish MSC'
            if 'vegan' in src:
                return 'Vegan'
            return ''
        if td.string.strip() == '':
            # Uncategorised food happens; an empty category tag would be
            # rejected by openmensa.org, so use a generic label instead.
            return 'Tipp'
        return ' '.join(
            label for letter, label in food_type_letters
            if letter in td.string)

    def get_refs(td):
        """Return the footnote (<sup>) elements of a description cell."""
        return td.find_all('sup')

    def get_food_description(td):
        """Return the cell text with its footnote markers removed."""
        description = td.text
        for ref in get_refs(td):
            description = description.replace(' ' + ref.text, '', 1)
        # startswith() is safe on an empty description, unlike indexing [0].
        if description.startswith(' '):
            description = description[1:]
        return description

    def get_notes(td):
        """Return the distinct footnote codes of a cell in page order."""
        codes = []
        for ref in get_refs(td):
            # .text also works when the <sup> has nested markup (.string
            # would be None there); codes are stripped so that "1, a1"
            # resolves both entries.
            codes.extend(code.strip() for code in ref.text.split(','))
        # dict.fromkeys de-duplicates while keeping first-seen order; a set
        # made the order of the generated note differ between runs.
        return [code for code in dict.fromkeys(codes) if code]

    def build_notes_string(td):
        """Build the human-readable footnote sentence for a meal cell."""
        food_is = []
        food_contains = []
        for code in get_notes(td):
            if code in additive_notes:
                food_is.append(additive_notes[code])
            elif code in allergen_notes:
                food_contains.append(allergen_notes[code])
            else:
                food_contains.append('undefinierte Chemikalien:' + code)
        notes = ''
        if food_is:
            notes += 'Gericht ist ' + ', '.join(food_is) + ', '
        if food_contains:
            notes += 'und enthält ' if food_is else 'Gericht enthält '
            notes += ', '.join(food_contains) + ', '
        # Replace the trailing ", " by the closing full stop.
        return notes[:-2] + '.' if notes else ''

    def get_pricing(tds, f, t):
        """Return the prices found in cells ``f`` .. ``t - 1``, or None."""
        # Rows with fewer than 7 cells are probably special days.
        if len(tds) < 7:
            return None
        pricing = []
        for i in range(f, t):
            raw_price = tds[i].string.strip()
            if raw_price == '':
                return None
            pricing.append(price_regex.search(raw_price).group('val'))
        return pricing

    # State carried from row to row.
    inside_valid_entry = False
    date = ''

    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if is_new_entry(tds):
            try:
                date = refactor_date(tds[0].string)
                if is_closed(tds):
                    # A canteen may look closed although it is "Spargeltage".
                    if "Spargeltage" in tds[3].text:
                        canteen.addMeal(date, "Spargel", "Spargel Tag",
                                        ["Spargel und andere Gerichte."],
                                        None, None)
                    else:
                        canteen.setDayClosed(date)
                else:
                    inside_valid_entry = True
            except Exception:
                traceback.print_exc()
        if is_end_of_entry(tds):
            inside_valid_entry = False
        elif inside_valid_entry:
            try:
                # "Aktion" rows are shifted one column to the left.
                action = is_action_entry(tds[0])
                first = 1 if action else 2
                food_type = parse_food_type(tds[first])
                food_description = get_food_description(tds[first + 1])
                notes_string = build_notes_string(tds[first + 1])
                notes = [notes_string] if notes_string != "" else []
                prices = get_pricing(tds, first + 2, first + 5)
                if food_type is None:
                    # Not a meal row. Action rows used to end up here through
                    # a TypeError on string concatenation; skip explicitly.
                    continue
                category = 'Aktion: ' + food_type if action else food_type
                canteen.addMeal(date, category, food_description, notes,
                                prices, roles if prices else None)
            except Exception:
                traceback.print_exc()

    return canteen.toXMLFeed()
Exemplo n.º 23
0
def parse_url(url, today=False):
    """Parse a weekly menu page into an OpenMensa XML feed.

    ``url`` may contain a ``%s`` placeholder; it is filled with the reference
    day (today, or Monday when today is a Sunday) formatted as ``YYYY_MM_DD``.
    The ``today`` argument is accepted for interface compatibility only; the
    original implementation overwrote it immediately, so it never had any
    effect.

    When the page contains no menu table but announces a closed period, every
    day of that period is marked as closed.

    Returns the feed as produced by ``LazyBuilder.toXMLFeed()``.
    """

    def span_start(span):
        """Return the start of a date span, borrowing the year from its end.

        The page may abbreviate the start as ``dd.mm.``; the year is then
        taken from the ``to`` part of the span.
        """
        start = span["from"]
        if len(start.split(".")[2]) == 0:
            start += span["to"].split(".")[2]
        return datetime.datetime.strptime(start, "%d.%m.%Y")

    reference_day = datetime.date.today()
    if reference_day.weekday() == 6:  # Sunday
        reference_day += datetime.timedelta(days=1)  # Tomorrow

    if "%s" in url:
        url = url % reference_day.strftime('%Y_%m_%d')

    try:
        content = requests.get(url).text
    except requests.exceptions.ConnectionError as e:
        logging.warning(str(e))
        # SECURITY NOTE(review): the retry disables TLS certificate
        # verification. Kept as in the original (best-effort scraping of a
        # public page), but it should be revisited.
        content = requests.get(url, verify=False).text

    document = BeautifulSoup(content, "html.parser")
    canteen = LazyBuilder()

    # Prices for employees and guests. If they cannot be read from the page,
    # they are derived from the student price with these factors.
    employee_multiplier = 1.25
    guest_multiplier = 1.60
    try:
        main_text = document.find("main").text
        p = price_employee_regex.search(main_text).groupdict()
        employee = float(p["employee"].split(",")[0]) + \
            float(p["employee"].split(",")[1]) / 100

        # NOTE(review): the guest price is read from a group named
        # "employee" as in the original; confirm that price_guest_regex
        # really names its group that way.
        p = price_guest_regex.search(main_text).groupdict()
        guest = float(p["employee"].split(",")[0]) + \
            float(p["employee"].split(",")[1]) / 100
    except (AttributeError, TypeError, KeyError, ValueError):
        employee = None
        guest = None

    # Look the container up once and check it *before* dereferencing it; the
    # original only tested it for None after it had already been used.
    maincontent = document.find("div", {"class": "maincontent"})
    table = maincontent.find("table") if maincontent is not None else None
    if table is None:
        if maincontent is not None:
            # "Die Speisenausgabe DHBW Eppelheim ist vom dd.mm.yyyy –
            # dd.mm.yyyy geschlossen"
            closed = datespan_regex.search(maincontent.text)
            if closed:
                span = closed.groupdict()
                fromdate = span_start(span)
                todate = datetime.datetime.strptime(span["to"], "%d.%m.%Y")
                while fromdate <= todate:
                    canteen.setDayClosed(fromdate.strftime('%d.%m.%Y'))
                    fromdate += datetime.timedelta(1)

        return canteen.toXMLFeed()

    # Week covered by the table, taken from the heading. This is parsed only
    # after the "no table" case so that a page without that heading can
    # still be handled above.
    fromdate = span_start(
        datespan_regex.search(maincontent.find("h2").text).groupdict())

    date = None
    for tr in table.find_all("tr"):
        tds = tr.find_all("td")

        if len(tds) == 4:
            # First row of a day: weekday, category, meal, price.
            td0, td1, td2, td3 = tds
            day = td0.text.strip()
            date = fromdate + datetime.timedelta(days=daysGerman.index(day))
            date = date.strftime('%d.%m.%Y')
        elif len(tds) == 3:
            # Continuation row of the current day: category, meal, price.
            td1, td2, td3 = tds
        else:
            # Header or spacer row; unpacking it used to raise ValueError.
            continue

        if date is None:
            # No weekday row seen yet, so there is no day to attach this to.
            continue

        notes = []

        if "feiertag" in td1.text.lower() or "geschlossen" in td1.text.lower():
            canteen.setDayClosed(date)
            continue

        categoryName = td1.text.strip()[:-1]  # drop the trailing character
        mealName = td2.text.strip()

        if not categoryName or not mealName:
            continue

        prices = []
        try:
            price = float(euro_regex.search(
                td3.text).group(1).replace(",", "."))
            prices.append(price)
            if employee is not None:
                prices.append(employee)
            else:
                prices.append(price * employee_multiplier)
            if guest is not None:
                prices.append(guest)
            else:
                prices.append(price * guest_multiplier)
        except (AttributeError, TypeError, KeyError, ValueError):
            # No parsable price: keep the raw cell text as a note instead.
            notes.append(td3.text.strip())

        notes = [x for x in notes if x]
        canteen.addMeal(date, categoryName, mealName, notes if notes else None,
                        prices if prices else None, roles if prices else None)

    return canteen.toXMLFeed()
def _split_bracket_notes(mname):
    """Split ``"Name (a,b) more (c)"`` into the bare name and its codes.

    A bracket group at the end of the name is dropped, a group in the middle
    is replaced by ``' und '``; a single trailing blank is removed. Unmatched
    brackets are left in the name untouched.

    Returns a ``(name, codes)`` tuple.
    """
    codes = []
    while True:
        apos = mname.find('(')
        if apos == -1:
            break
        # Search the closing bracket only *after* the opening one. The
        # original paired the first ')' with the first '(' wherever they
        # were, which made the name grow forever on input such as "a) b".
        bpos = mname.find(')', apos + 1)
        if bpos == -1:
            break
        codes.extend(mname[apos + 1:bpos].split(','))
        if bpos == len(mname) - 1:
            mname = mname[:apos]
        else:
            mname = mname[:apos] + ' und ' + mname[bpos + 1:]
    if mname.endswith(' '):
        mname = mname[:-1]
    return mname, codes


def parse_url(url, today=False):
    """Read the current week's CSV menu below ``url`` into an OpenMensa feed.

    The file ``<url>/<ISO week number>.csv`` is fetched; a 404 means there is
    no plan for that week and results in an empty feed. Columns used per row:
    0 = date, 2 = category code, 3 = meal name with bracketed additive codes,
    4 = comma-separated meal-type codes, 6-8 = student/employee/other price.

    ``today`` is accepted for interface compatibility and is not used.

    Raises:
        RuntimeError: if a row carries an unknown category code.

    Returns the feed as produced by ``LazyBuilder.toXMLFeed()``.
    """
    canteen = LazyBuilder()
    legend = {
        '1': 'mit Farbstoff',
        '2': 'mit Konservierungsstoff',
        '3': 'mit Antioxidationsmittel',
        '4': 'mit Geschmacksverstärker',
        '5': 'geschwefelt',
        '6': 'geschwärzt',
        '7': 'gewachst',
        '8': 'mit Phosphat',
        '9': 'mit Süssungsmittel Saccharin',
        '10': 'mit Süssungsmittel Aspartam, enth. Phenylalaninquelle',
        '11': 'mit Süssungsmittel Cyclamat',
        '12': 'mit Süssungsmittel Acesulfam',
        '13': 'chininhaltig',
        '14': 'coffeinhaltig',
        '15': 'gentechnisch verändert',
        '16': 'enthält Sulfite',
        '17': 'enthält Phenylalanin',
        'A': 'Gluten',
        'B': 'Krebstiere',
        'C': 'Eier',
        'D': 'Fisch',
        'E': 'Erdnüsse',
        'F': 'Soja',
        'G': 'Milch und Milchprodukte',
        'H': 'Schalenfrüchte',
        'I': 'Sellerie',
        'J': 'Senf',
        'K': 'Sesamsamen',
        'L': 'Schwefeldioxid und Sulfite',
        'M': 'Lupinen',
        'N': 'Weichtiere',
        'ZTA': 'Aktionsgericht',
        'ZTB': 'mit ausschließlich biologisch erzeugten Rohstoffen',
        'ZTF': 'Fisch',
        'ZTG': 'Geflügel',
        'ZTL': 'Lamm',
        'ZTMSC': 'zertifizierte nachhaltige Fischerei (MSC-C-53400)',
        'ZTMV': 'Mensa Vital',
        'ZTR': 'Rindfleisch',
        'ZTS': 'Schweinefleisch',
        'ZTV': 'vegetarisch',
        'ZTVG': 'vegan',
        'ZTW': 'Wild'
    }
    # The legend is not attached to the feed (canteen.setLegendData is not
    # called); codes are expanded into each meal's notes below instead.

    hg = re.compile("^HG[1-9]$")
    b = re.compile("^B[1-9]$")
    n = re.compile("^N[1-9]$")

    roles = ('student', 'employee', 'other')

    # Only the current week is fetched; use (0, 1) to include the next one.
    for w in [0]:
        kw = (date.today() + timedelta(weeks=w)).isocalendar()[1]
        try:
            response = urlopen('%(location)s/%(isoweek)d.csv' % {
                'location': url,
                'isoweek': kw
            })
        except HTTPError as e:
            if e.code == 404:
                continue
            raise
        # Close the connection as soon as the body has been read.
        with response:
            text = response.read().decode('iso8859-1')

        mealreader = reader(text.splitlines(), delimiter=';')
        next(mealreader, None)  # skip the header line
        for row in mealreader:
            if not row:
                # Blank line in the CSV; indexing it would raise IndexError.
                continue

            if row[2] == 'Suppe':
                category = 'Suppe'
            elif hg.match(row[2]):
                category = 'Hauptgerichte'
            elif b.match(row[2]):
                category = 'Beilagen'
            elif n.match(row[2]):
                category = 'Nachspeisen'
            else:
                raise RuntimeError('Unknown category: ' + str(row[2]))

            mdate = row[0]
            mname, raw_codes = _split_bracket_notes(row[3])

            # Strip the codes so that "(1, 2)" still resolves both entries,
            # and drop empty ones such as those produced by "()".
            notes = [code.strip() for code in raw_codes if code.strip()]

            mtype = row[4]
            if mtype != '':
                notes.extend(
                    'ZT' + code.strip()
                    for code in mtype.split(',') if code.strip())

            prices = [row[6], row[7], row[8]]

            # Unknown codes are retried without their two-character prefix
            # and, failing that, passed through unchanged.
            mnotes = [
                legend.get(code, legend.get(code[2:], code)) for code in notes
            ]

            try:
                canteen.addMeal(mdate, category, mname, mnotes, prices,
                                roles)
            except ValueError as e:
                # Rejected by the builder (e.g. an empty meal): report it on
                # stderr and carry on with the next row.
                print('could not add meal {}/{} "{}" due to "{}"'.format(
                    mdate, category, mname, e),
                      file=sys.stderr)

    return canteen.toXMLFeed()
Exemplo n.º 25
0
	document = parse(content, 'html.parser')

	items = document.find_all('a', {"class": "item"})

	for item in items:
		title = item.strong.string
		if not title:
			continue
		numbers = item.small.string
		notes = []
		if numbers:
			for number in numbers.split(','):
				number = int(number.strip())
				if number > len(legend):
					continue
				notes.append(legend[number])
		row = item.parent.parent
		price = row.find_all('td')[-1].string
		prices = {}
		if price:
			subprice = price.split('/')
			if len(subprice) == 3:
				prices = {'student': subprice[0], 'employee': subprice[1], 'other': subprice[2]}
			else:
				prices = {'other': price}
		canteen.addMeal(datetime.date(date.year, date.month, date.day), "Mittagessen", title, notes=notes, prices=prices)

	date = date + datetime.timedelta(1)

print(canteen.toXMLFeed())