def parse_url(url, today=False, canteentype="Mittagsmensa", this_week="", next_week=True, legend_url=None): canteen = LazyBuilder() canteen.legendKeyFunc = lambda v: v.lower() if not legend_url: legend_url = url[: url.find("essen/") + 6] + "wissenswertes/lebensmittelkennzeichnung" legend_doc = parse(urlopen(legend_url)).find(id="artikel") allergene = buildLegend( text=legend_doc.text.replace("\xa0", " "), regex=r"(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)" ) allergene["EI"] = "Ei" zusatzstoffe = buildLegend( text=legend_doc.text.replace("\xa0", " "), regex=r"(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)" ) for tr in legend_doc.find_all("tr"): tds = tr.find_all("td") if len(tds) != 2: continue title = tds[0].find("strong") if title is None: continue else: title = title.text text = tds[1].text.replace("enthält", "").strip() if title.isdigit(): zusatzstoffe[title] = text else: allergene[title] = text parse_week(url + this_week, canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe) if not today and next_week is True: parse_week(url + "-kommende-woche", canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe) if not today and type(next_week) is str: parse_week(url + next_week, canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe) print(canteen.toXMLFeed()) return canteen.toXMLFeed()
def parse_url(url, today=False, canteentype='Mittagsmensa', this_week='', next_week=True, legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    # the legend page lives next to the menu pages under .../essen/
    if not legend_url:
        legend_url = url[:url.find('essen/') + 6] + 'wissenswertes/lebensmittelkennzeichnung'
    legend_doc = parse(urlopen(legend_url), 'lxml').find(id='artikel')
    # allergens are listed as "XY   enthält ...", additives as numbered entries
    allergene = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)')
    allergene['EI'] = 'Ei'
    zusatzstoffe = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)')
    suballergene = re.compile(
        r'(?P<name>[0-9A-Z]+)[^a-zA-Z]*enthält (?P<value>\w+( |\t|\w)*)')
    # the legend table carries additional entries and sub-allergens
    for tr in legend_doc.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) != 2:
            continue
        title = tds[0].find('strong')
        if title is None:
            continue
        title = title.text
        lines = tds[1].text.split('\n')
        for line in lines[1:]:
            try_allergene = suballergene.match(line)
            if try_allergene:
                allergene[try_allergene.group('name')] = try_allergene.group('value')
        text = lines[0].replace('enthält', '').strip()
        if title.isdigit():
            zusatzstoffe[title] = text
        else:
            allergene[title] = text
    parse_week(url + this_week, canteen, canteentype,
               allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and next_week is True:
        parse_week(url + '-kommende-woche', canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and isinstance(next_week, str):
        parse_week(url + next_week, canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges("student", {"other": 1.5})
    document = parse(urlopen(url).read())
    global legend
    regex = r"\((?P<name>[\dA-Z]+)\)\s*(?P<value>[\w\s]+)"
    legend = buildLegend(legend, document.find(id="additives").text, regex=regex)
    days = (
        "montag", "dienstag", "mittwoch", "donnerstag", "freitag",
        "montagNaechste", "dienstagNaechste", "mittwochNaechste",
        "donnerstagNaechste", "freitagNaechste",
    )
    for day in days:
        data = document.find("div", id=day)
        headline = document.find("a", attrs={"data-anchor": "#" + day})
        parse_day(canteen, headline.text, data)
    return canteen.toXMLFeed()
def parse_url(url, today=False, canteentype='Mittagsmensa', this_week='', next_week=True, legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    # the legend page lives next to the menu pages under .../essen/
    if not legend_url:
        legend_url = url[:url.find('essen/') + 6] + 'wissenswertes/lebensmittelkennzeichnung'
    legend_doc = parse(urlopen(legend_url), 'lxml').find(id='artikel')
    # allergens are listed as "XY   enthält ...", additives as numbered entries
    allergene = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)')
    allergene['EI'] = 'Ei'
    zusatzstoffe = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)')
    suballergene = re.compile(
        r'(?P<name>[0-9A-Z]+)[^a-zA-Z]*enthält (?P<value>\w+( |\t|\w)*)')
    # the legend table carries additional entries and sub-allergens
    for tr in legend_doc.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) != 2:
            continue
        title = tds[0].find('strong')
        if title is None:
            continue
        title = title.text
        lines = tds[1].text.split('\n')
        for line in lines[1:]:
            try_allergene = suballergene.match(line)
            if try_allergene:
                allergene[try_allergene.group('name')] = try_allergene.group('value')
        text = lines[0].replace('enthält', '').strip()
        if title.isdigit():
            zusatzstoffe[title] = text
        else:
            allergene[title] = text
    parse_week(url + this_week, canteen, canteentype,
               allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and next_week is True:
        parse_week(url + '-kommende-woche', canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and isinstance(next_week, str):
        parse_week(url + next_week, canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    return canteen.toXMLFeed()
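# Hedged usage sketch for the LazyBuilder-based parse_url variants above. It assumes
# the surrounding module provides the helpers these functions rely on (urlopen from
# urllib.request, parse as a BeautifulSoup alias, LazyBuilder/buildLegend from
# pyopenmensa.feed, and a parse_week helper); the menu URL below is purely hypothetical.
if __name__ == '__main__':
    xml_feed = parse_url('https://www.example-studentenwerk.de/essen/mensa-beispiel/',
                         canteentype='Mittagsmensa', next_week=True)
    print(xml_feed)  # OpenMensa XML feed produced by LazyBuilder.toXMLFeed()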
def parsePlan(url, internalMensaId, today):
    canteen = LazyBuilder()
    while url is not None:
        dom = BeautifulSoup(urlopen(url).read(), 'lxml')
        date = dom.select('#mensa_date > p')[0].contents[0]
        menuDefinition = dom.find(id=internalMensaId)
        menuDescription = menuDefinition.parent.find('dd')
        tables = menuDescription.select('table')
        legend = {}
        legend = buildLegend(
            legend, str(dom),
            regex=r'<strong>(?P<name>\w+)\s*</strong>\s*-\s*(?P<value>[\w\s)(]+)')
        if tables is not None and len(tables) == 1:
            table = tables[0]
            rows = table.find_all('tr')
            for row in rows:
                menuNameElement = row.select('td[class="mensa_col_55"] > b')
                if menuNameElement and menuNameElement[0].contents:
                    menuName = menuNameElement[0].contents[0]
                    category = 'Gericht'
                    # get notes, resolved against the legend where possible
                    notes = {}
                    notesElement = row.select('td[class="mensa_col_55"] > span')
                    if notesElement and notesElement[0].text:
                        notes = [legend.get(n, n)
                                 for n in notesElement[0].text.split(' ') if n]
                    # get prices per price group
                    prices = {}
                    for td in row.select('td[class="mensa_col_15"]'):
                        priceElement = td.find('b')
                        groupElement = td.find('span')
                        if (priceElement is not None and groupElement is not None
                                and groupElement.contents and priceElement.contents):
                            group = str(groupElement.contents[0])
                            price = str(priceElement.contents[0])
                            if group == 'Stud.:':
                                prices['student'] = price
                            elif group == 'Bed.:':
                                prices['employee'] = price
                            elif group == 'Gast:':
                                prices['other'] = price
                    canteen.addMeal(date, category, menuName, notes, prices)
        else:
            canteen.setDayClosed(date)
        # check for further pages (stop after the first one in "today" mode)
        nextPageLink = dom.find(id='next_day_link')
        if nextPageLink is None or today:
            url = None
        else:
            url = 'https://www.studentenwerk-rostock.de/' + nextPageLink['href']
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges('student', {'other': 1.5})
    document = parse(urlopen(url).read())
    global legend
    regex = r'\((?P<name>[\dA-Z]+)\)\s*(?P<value>[\w\s]+)'
    legend = buildLegend(legend, document.find(id='additives').text, regex=regex)
    days = ('montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag',
            'montagNaechste', 'dienstagNaechste', 'mittwochNaechste',
            'donnerstagNaechste', 'freitagNaechste')
    for day in days:
        data = document.find('div', id=day)
        headline = document.find('a', attrs={'data-anchor': '#' + day})
        parse_day(canteen, headline.text, data)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges('student', {'other': 1.5})
    document = parse(urlopen(url).read())
    global legend
    regex = (r'(?P<name>(\d|[A-Z])+)\)\s*'
             r'(?P<value>\w+((\s+\w+)*[^0-9)]))')
    legend = buildLegend(legend, document.find(id='additives').text, regex=regex)
    days = ('montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag',
            'montagNaechste', 'dienstagNaechste', 'mittwochNaechste',
            'donnerstagNaechste', 'freitagNaechste')
    for day in days:
        data = document.find('div', id=day)
        headline = document.find('a', attrs={'data-anchor': '#' + day})
        parse_day(canteen, headline.text, data)
    return canteen.toXMLFeed()
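# A minimal, self-contained sketch of what the additives regex in the variant above
# extracts: unlike the parenthesized form in the two variants before it, it accepts
# codes written as "1)" or "A)" without a leading parenthesis and keeps the following
# code number out of the value. buildLegend is assumed to come from pyopenmensa.feed;
# the sample legend text is made up for illustration.
from pyopenmensa.feed import buildLegend

sample_text = '1) Farbstoff 2) Konservierungsstoff A) Gluten'
sample_regex = r'(?P<name>(\d|[A-Z])+)\)\s*(?P<value>\w+((\s+\w+)*[^0-9)]))'
# expected (roughly): {'1': 'Farbstoff', '2': 'Konservierungsstoff', 'A': 'Gluten'}
print(buildLegend(text=sample_text, regex=sample_regex))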
def test_note_extraction(self):
    text = '1) Schwein a)Farbstoff'
    legend = {'1': 'Schwein', 'a': 'Farbstoff'}
    assert buildLegend({}, text=text) == legend
def test_dict_passthrought(self):
    d = {}
    assert buildLegend(d) is d
def parse_legend(document):
    regex = r'\((?P<name>[\dA-Z]+)\)\s*(?P<value>[\w\s]+)'
    return buildLegend(text=document.find(id='additives').text, regex=regex)
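# Hedged demo of parse_legend above on a made-up "additives" element, assuming the
# document argument is a BeautifulSoup tree as in the other parsers in this collection.
from bs4 import BeautifulSoup

sample_doc = BeautifulSoup('<div id="additives">(1) Farbstoff (A) Gluten</div>', 'html.parser')
# expected (roughly): {'1': 'Farbstoff', 'A': 'Gluten'}
print(parse_legend(sample_doc))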
def parsePlan(url, internalMensaId, today):
    canteen = LazyBuilder()
    while url is not None:
        dom = BeautifulSoup(urlopen(url).read(), 'lxml')
        date = dom.select('#mensa_date > p')[0].contents[0]
        menuDefinition = dom.find(id=internalMensaId)
        menuDescription = menuDefinition.parent.find('dd')
        tables = menuDescription.select('table')
        legend = {}
        legend = buildLegend(
            legend, str(dom),
            regex=r'<strong>(?P<name>\w+)\s*</strong>\s*-\s*(?P<value>[\w\s)(]+)')
        if tables is not None and len(tables) == 1:
            table = tables[0]
            rows = table.find_all('tr')
            for row in rows:
                menuNameElement = row.select('td[class="mensa_col_55"] > b')
                if menuNameElement and menuNameElement[0].contents:
                    menuName = menuNameElement[0].contents[0]
                    category = 'Gericht'
                    # get notes, resolved against the legend where possible
                    notes = {}
                    notesElement = row.select('td[class="mensa_col_55"] > span')
                    if notesElement and notesElement[0].text:
                        notes = [legend.get(n, n)
                                 for n in notesElement[0].text.split(' ') if n]
                    # get prices per price group
                    prices = {}
                    for td in row.select('td[class="mensa_col_15"]'):
                        priceElement = td.find('b')
                        groupElement = td.find('span')
                        if (priceElement is not None and groupElement is not None
                                and groupElement.contents and priceElement.contents):
                            group = str(groupElement.contents[0])
                            price = str(priceElement.contents[0])
                            if group == 'Stud.:':
                                prices['student'] = price
                            elif group == 'Bed.:':
                                prices['employee'] = price
                            elif group == 'Gast:':
                                prices['other'] = price
                    canteen.addMeal(date, category, menuName, notes, prices)
        else:
            canteen.setDayClosed(date)
        # check for further pages (stop after the first one in "today" mode)
        nextPageLink = dom.find(id='next_day_link')
        if nextPageLink is None or today:
            url = None
        else:
            url = 'https://www.studentenwerk-rostock.de/' + nextPageLink['href']
    return canteen.toXMLFeed()
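# Hedged usage sketch for parsePlan above: internalMensaId selects the canteen's block
# on the Studentenwerk Rostock plan page, and today=True stops after the first day
# instead of following the next_day_link pagination. The start URL and id below are
# illustrative assumptions, not verified values.
rostock_feed = parsePlan('https://www.studentenwerk-rostock.de/de/mensen/speiseplaene.html',
                         internalMensaId='mensa_sued', today=False)
print(rostock_feed)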