def parse_url(url, today=False):
    canteen = LazyBuilder()
    content = urlopen(url).read()
    document = parse(content, 'lxml')
    available_weeks = parse_available_weeks(document)
    # for the case that the start date is not auto set by the page e.g. on weekends
    noskip = find_start_date(document) is None
    employees_fee, guests_fee = parse_fees(document)
    groups = parse_ingredients(document)
    for idx, week in enumerate(available_weeks):
        if idx > 0 or noskip:
            content = urlopen("{}?selWeek={}".format(url, week)).read()
            document = parse(content, 'lxml')
        parse_meals_for_canteen(document, canteen, employees_fee, guests_fee, groups, today)
        if today:
            break
    return canteen.toXMLFeed()
def create_opinions(user_id):
    """
    Retrieve the recipes the user visited but did not comment on, format them
    and return them as forms intended for the left part of the page.
    @param user_id the id of the user
    @return string containing all the opinion forms
    """
    search_rows = db_execute_out("""
        SELECT DISTINCT recipe_id FROM search
        WHERE user_id LIKE {0}
            AND recipe_id NOT NULL
            AND recipe_id NOT IN (
                SELECT DISTINCT recipe_id FROM opinions WHERE author LIKE {0}
            );
        """.format(user_id))
    if search_rows == [] or search_rows is None:
        return parse("""
            <h4>How did you find these recipes ?</h4><p>No recipe to comment</p>
            """, 'lxml').prettify(formatter='html')
    opinion_list = format_recipes([x[0] for x in search_rows])
    # constructing the web page part
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    with open(config.get('html', 'opinion_form_path')) as _fd:
        search_panel = _fd.read()
    soup = parse('<h4>How did you find these recipes ?</h4><div></div>', 'lxml')
    form_group = soup.div
    form_group['class'] = 'container-fluid'
    # creating a form for each recipe
    for recipe in opinion_list:
        form = parse(search_panel, 'lxml')
        # hidden info
        r_id = form.select('input#$recipe_info')[0]
        r_id['id'] = 'recipe_info_{}'.format(str(recipe['id']))
        r_id['value'] = str(recipe['id'])
        u_id = form.select('input#$user_info')[0]
        u_id['id'] = 'user_info_{}'.format(str(recipe['id']))
        u_id['value'] = str(user_id)
        # the form
        head = form.select('form#$id_form')[0]
        head['id'] = '{}_{}_form_head'.format(str(user_id), str(recipe['id']))
        # the button
        button = form.select('button#$id_button')[0]
        button['id'] = '{}_{}_form'.format(str(user_id), str(recipe['id']))
        # the img
        img = form.select('img')[0]
        img['src'] = recipe['img']
        # the fav button
        fav_button = form.select('button#$fav_id')[0]
        fav_button['id'] = 'fav_{}_{}'.format(str(user_id), str(recipe['id']))
        form_group.append(form)
    return soup.prettify(formatter='html')
def parse_url(url, today=False):
    canteen = LazyBuilder()
    legend = {'f': 'fleischloses Gericht', 'v': 'veganes Gericht'}
    document = parse(urlopen(base + '/speiseplan/zusatzstoffe-de.html').read())
    for td in document.find_all('td', 'beschreibung'):
        legend[td.previous_sibling.previous_sibling.text] = td.text
    document = parse(urlopen(base + '/unsere-preise/').read())
    prices = {}
    for tr in document.find('table', 'essenspreise').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')
    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read())
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.date.resolution
                continue
            else:
                raise e
        else:
            errorCount = 0
        for tr in document.find('table', 'zusatzstoffe').find_all('tr'):
            identifier = tr.find_all('td')[0].text \
                .replace('(', '').replace(')', '')
            legend[identifier] = tr.find_all('td')[1].text.strip()
        canteen.setLegendData(legend)
        mensa_data = document.find('table', 'menu')
        category = None
        for menu_tr in mensa_data.find_all('tr'):
            if menu_tr.find('td', 'headline'):
                continue
            if menu_tr.find('td', 'gericht').text:
                category = menu_tr.find('td', 'gericht').text
            data = menu_tr.find('td', 'beschreibung')
            name = data.find('span').text.strip()
            notes = [span['title'] for span in data.find_all('span', title=True)]
            canteen.addMeal(
                date, category, name, notes,
                prices.get(category.replace('Aktionsessen', 'Bio-/Aktionsgericht'), {})
            )
        date += datetime.date.resolution
        if today:
            break
    return canteen.toXMLFeed()
def create_opinions(user_id):
    """
    Retrieve the recipes the user visited but did not comment on, format them
    and return them as forms intended for the left part of the page.
    @param user_id the id of the user
    @return string containing all the opinion forms
    """
    search_rows = db_execute_out("""
        SELECT DISTINCT recipe_id FROM search
        WHERE user_id LIKE {0}
            AND recipe_id NOT NULL
            AND recipe_id NOT IN (
                SELECT DISTINCT recipe_id FROM opinions WHERE author LIKE {0}
            );
        """.format(user_id))
    if search_rows == [] or search_rows is None:
        return parse("""
            <h4>How did you find these recipes ?</h4><p>No recipe to comment</p>
            """, 'lxml').prettify(formatter='html')
    opinion_list = format_recipes([x[0] for x in search_rows])
    # constructing the web page part
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    with open(config.get('html', 'opinion_form_path')) as _fd:
        search_panel = _fd.read()
    soup = parse('<h4>How did you find these recipes ?</h4><div></div>', 'lxml')
    form_group = soup.div
    form_group['class'] = 'container-fluid'
    # creating a form for each recipe
    for recipe in opinion_list:
        form = parse(search_panel, 'lxml')
        # hidden info
        r_id = form.select('input#$recipe_info')[0]
        r_id['id'] = 'recipe_info_{}'.format(str(recipe['id']))
        r_id['value'] = str(recipe['id'])
        u_id = form.select('input#$user_info')[0]
        u_id['id'] = 'user_info_{}'.format(str(recipe['id']))
        u_id['value'] = str(user_id)
        # the form
        head = form.select('form#$id_form')[0]
        head['id'] = '{}_{}_form_head'.format(str(user_id), str(recipe['id']))
        # the button
        button = form.select('button#$id_button')[0]
        button['id'] = '{}_{}_form'.format(str(user_id), str(recipe['id']))
        # the img
        img = form.select('img')[0]
        img['src'] = recipe['img']
        # the fav button
        fav_button = form.select('button#$fav_id')[0]
        fav_button['id'] = 'fav_{}_{}'.format(str(user_id), str(recipe['id']))
        form_group.append(form)
    return soup.prettify(formatter='html')
def parse_week(url, canteen): document = parse(urlopen(url).read()) for day_table in document.find_all("table", "speiseplan"): try: date = extractDate(day_table.thead.tr.th.text) except ValueError: # There was no valid date in the table header, which happens eg # for special "Aktionswoche" tables. # TODO: check if this table contains any meals, which was not the # case when it was used for the first time. continue if day_table.find("td", "keinangebot"): canteen.setDayClosed(date) continue for meal_tr in day_table.tbody.children: if len(meal_tr.find_all("a") or []) < 1: continue name = meal_tr.td.text if ": " in name: category, name = name.split(": ", 1) else: category = "Angebote" if len(name) > 200: name = name[:200] + " ..." notes = [] for img in meal_tr.contents[1].find_all("img"): notes.append(img["title"]) canteen.addMeal(date, category, name, notes, price_regex.findall(meal_tr.contents[2].text), roles)
def instagram():
    url = raw_input('\nURL : ')
    xxx = raw_input('\nDownload? (y/n) ')
    if 'y' in xxx:
        bra = raw_input('Output : ')
        print('{}\n[!] Loading...'.format(R))
        save = r.get(url).text
        soup = parse(save, 'html.parser')
        love = soup.findAll('script', type='text/javascript')
        for heart in love:
            if 'window._sharedData = ' in heart.text:
                pakboi = heart.text.replace('window._sharedData = ', '').replace(';', '')
                pakboi = json.loads(pakboi)
                pakboi = pakboi["entry_data"]['PostPage'][0]["graphql"]['shortcode_media']["video_url"]
                # print('{}[!] Sedang Mendownload...'.format(R))
                time.sleep(7)
                pakgerl = r.get(pakboi)
                pants = open(bra, 'wb')
                pants.write(pakgerl.content)
                pants.close()
                print('{}[!] Download Berhasil'.format(GL))
                time.sleep(3)
                print('{}\n[!] Salin File ke internal'.format(R))
                time.sleep(3)
                print('{}[!] Berhasil\n\n\n{}Periksa Pada Internal!!!'.format(GL, BL))
                time.sleep(2)
                os.system('cp ' + bra + ' /sdcard && rm -rf ' + bra)
def facebook():
    try:
        url = raw_input('\nURL : ')
        xxx = raw_input('\nDownload? (y/n) ')
        if 'y' in xxx:
            bra = raw_input('Output : ')
            print('{}\n[!] Loading...'.format(R))
            save = r.get(url).text
            # print save
            sop = parse(save, "html.parser")
            res = sop.find("script", type="application/ld+json")
            a = json.loads(res.text)
            b = a['contentUrl']
            time.sleep(7)
            c = r.get(b)
            d = open(bra, 'wb')
            d.write(c.content)
            d.close()
            print('{}[!] Download Berhasil'.format(GL))
            time.sleep(3)
            print('{}\n[!] Salin File ke Internal'.format(R))
            time.sleep(3)
            print('{}[!] Berhasil\n\n\n{}PERIKSA PADA INTERNAL!!!'.format(GL, BL))
            time.sleep(2)
            os.system('cp ' + bra + ' /sdcard && rm -rf ' + bra)
    except KeyboardInterrupt:
        exit()
    except:
        print('URL TIDAK VALID')
        time.sleep(2)
        os.system('python2 main.py')
def facebook():
    try:
        url = raw_input('\n[?] URL : ')
        ct = raw_input('[?] Download? (y/n): ')
        if 'y' in ct:
            file = raw_input('[?] File Name : ')
            print('[!] Loading...')
            save = r.get(url).text
            sop = parse(save, "html.parser")
            res = sop.find("script", type="application/ld+json")
            a = json.loads(res.text)
            b = a['contentUrl']
            time.sleep(5)
            c = r.get(b)
            d = open(file, 'wb')
            d.write(c.content)
            d.close()
            print('[!] Download Success')
            time.sleep(1)
            print('[!] Copy File to Internal')
            time.sleep(1)
            print('[!] \x1b[32;1mSuccessfully\x1b[37;1m')
            time.sleep(2)
            os.system('cp ' + file + ' /sdcard && rm -rf ' + file)
    except KeyboardInterrupt:
        exit()
    except:
        print('[!]\x1b[31;1m URL FAILED\x1b[37;1m')
        time.sleep(2)
        os.system('python2 dl.py')
def parse_week(url, date, canteen):
    url += '/{0}/{1:0>2}/'.format(*date.isocalendar())
    document = parse(urlopen(url).read())
    week_data = document.find('table', id='week-menu')
    if week_data is None:
        print('week not found')
        return
    weekDays = extractWeekDates(week_data.thead.find_all('th')[0].text)
    for category_tr in week_data.find_all('tr'):
        category = category_tr.find('th').text
        i = 0
        for day_td in category_tr.find_all('td'):
            for meal_data in day_td.find_all('p', 'dish'):
                if not meal_data.find('strong'):
                    continue
                name = extra_regex.sub('', meal_data.find('strong').text)
                name = strip_regex.sub(' ', name).strip()
                if len(name) > 250:
                    name = name[:245] + '...'
                notes = [span['title'] for span in meal_data.find_all('span', 'tooltip')]
                notes += [img['title'] for img in meal_data.find_all('img')]
                prices = price_regex.findall(meal_data.find('span', 'price').text)
                canteen.addMeal(weekDays[i], category, name, list(set(notes)),
                                prices, ('student', 'employee', 'other'))
            i += 1
def instagram():
    url = input('\n[?] URL : ')
    ct = input('[?] Download? (y/n): ')
    if 'y' in ct:
        bra = input('[?] File Name: ')
        print('[!] Loading...')
        save = r.get(url).text
        soup = parse(save, 'html.parser')
        love = soup.findAll('script', type='text/javascript')
        for heart in love:
            if 'window._sharedData = ' in heart.text:
                jonson = heart.text.replace('window._sharedData = ', '').replace(';', '')
                jonson = json.loads(jonson)
                jonson = jonson["entry_data"]['PostPage'][0]["graphql"]['shortcode_media']["video_url"]
                time.sleep(5)
                alukar = r.get(jonson)
                pants = open(bra, 'wb')
                pants.write(alukar.content)
                pants.close()
                print('[!] \x1b[32;1mDownload Successfully\x1b[37;1m')
                time.sleep(2)
                os.system('cp ' + bra + ' /sdcard && rm -rf ' + bra)
                exit()
def parse_week(url, canteen, mensa):
    document = parse(urlopen(url).read())
    # extra legends information
    canteen.setLegendData(text=document.find(text='Kennzeichnung: ')
                          .parent.next_sibling.get_text().replace('\xa0', ' '))
    # additional charges
    prices = {}
    for p in document.find_all('p'):
        match = employeePrice.search(p.text)
        if match:
            prices['employee'] = match.group('price')
        match = otherPrice.search(p.text)
        if match:
            prices['other'] = match.group('price')
    if len(prices) != 2:
        print('Could not extract additional charges for employee and others')
    canteen.setAdditionalCharges('student', prices)
    # find the section for the requested mensa
    mensa_data = document.find('h1', text=re.compile(mensa)).parent
    while type(mensa_data) != Tag or mensa_data.name != 'div'\
            or 'tx-cagcafeteria-pi1' not in mensa_data.get('class', []):
        mensa_data = mensa_data.next_sibling
    weekDays = extractWeekDates(mensa_data.find('h2').text)
    for day_headline in mensa_data.find_all('h3'):
        date = weekDays[day_headline.text]
        day_table = day_headline.next_sibling.next_sibling
        for tr_menu in day_table.tbody.find_all('tr'):
            category = tr_menu.find_all('td')[0].text.strip()
            name = tr_menu.find_all('td')[1].text.replace('\r\n', ' ').strip()
            canteen.addMeal(date, category, name, [],
                            tr_menu.find_all('td')[2].text)
def parse_week(url, date, canteen):
    url += '/{0}/{1:0>2}/'.format(*date.isocalendar())
    document = parse(urlopen(url).read())
    week_data = document.find('table', id='week-menu')
    if week_data is None:
        print('week not found')
        return
    weekDays = extractWeekDates(week_data.thead.find_all('th')[0].text)
    for category_tr in week_data.find_all('tr'):
        category = category_tr.find('th').text
        i = 0
        for day_td in category_tr.find_all('td'):
            for meal_data in day_td.find_all('p', 'dish'):
                if not meal_data.find('strong'):
                    continue
                name = extra_regex.sub('', meal_data.find('strong').text)
                name = strip_regex.sub(' ', name).strip()
                if len(name) > 250:
                    name = name[:245] + '...'
                notes = [span['title'] for span in meal_data.find_all('span', 'tooltip')]
                notes += [img['title'] for img in meal_data.find_all('img')]
                prices = price_regex.findall(meal_data.find('span', 'price').text)
                canteen.addMeal(weekDays[i], category, name, list(set(notes)),
                                prices, ('student', 'employee', 'other'))
            i += 1
def add_options_to_form(table_name, form, tag_id):
    """
    Add to the form element with id tag_id one option per row of the given
    table, using its first two columns (typically id and name).
    @param table_name the name of the table
    @param form an option in the config file containing the path to an html file
    @param tag_id the tag id in the form (example : select#type)
    """
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    # adding types to the search form
    types = db_execute_out("SELECT * FROM " + table_name + " ORDER BY name;")
    form_path = config.get('html', form)
    _fd = open(form_path)
    soup = parse(_fd.read(), "lxml")
    _fd.close()
    soup.select(tag_id)[0].string = ''
    for row in types:
        opt = soup.new_tag('option')
        opt.string = row[1]
        opt['value'] = row[0]
        soup.select(tag_id)[0].append(opt)
    # writing the html file
    html = soup.prettify(formatter='html')
    with open(form_path, "wb") as _fd:
        _fd.write(html)
def madeinfo(self, link):
    infos = []
    desc = []
    r = requests.get(link)
    s = parse(r.content, 'lxml')
    data1 = s.find("div", {"class": "fl-l score"}).attrs
    data1_rate = s.find("div", {"class": "fl-l score"}).get_text()
    data1_rate = data1_rate.replace(' ', '')
    data1_rate = data1_rate.replace('\n', '')
    data2_rank = s.find("span", {"class": "numbers ranked"}).get_text()
    data3_info = s.find("span", {"itemprop": "description"}).get_text()
    desc.append(data3_info)
    data4_episodes = s.find_all("div", {"class": "spaceit"})[0].get_text()
    data4_episodes = data4_episodes.replace('\n', '')
    data4_status = s.find_all("div", {"class": "spaceit"})[1].get_text()
    data4_status = data4_status.replace('\n', '')
    data4_air = s.find_all("div", {"class": "spaceit"})[2].get_text()
    data4_air = data4_air.replace('\n', '')
    data5_image = s.find("img", {"class": "lazyloaded"}, src=True)
    infos.append(data4_episodes + "\n" + data4_status + "\n" + data4_air)
    return {
        "users": data1["data-user"],
        "rating": data1_rate,
        "rank": data2_rank,
        "inf": desc,
        "add": infos,
        "image": data5_image["src"]
    }
def parse_url(url, today=False, canteentype="Mittagsmensa", this_week="", next_week=True, legend_url=None): canteen = LazyBuilder() canteen.legendKeyFunc = lambda v: v.lower() if not legend_url: legend_url = url[: url.find("essen/") + 6] + "wissenswertes/lebensmittelkennzeichnung" legend_doc = parse(urlopen(legend_url)).find(id="artikel") allergene = buildLegend( text=legend_doc.text.replace("\xa0", " "), regex=r"(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)" ) allergene["EI"] = "Ei" zusatzstoffe = buildLegend( text=legend_doc.text.replace("\xa0", " "), regex=r"(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)" ) for tr in legend_doc.find_all("tr"): tds = tr.find_all("td") if len(tds) != 2: continue title = tds[0].find("strong") if title is None: continue else: title = title.text text = tds[1].text.replace("enthält", "").strip() if title.isdigit(): zusatzstoffe[title] = text else: allergene[title] = text parse_week(url + this_week, canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe) if not today and next_week is True: parse_week(url + "-kommende-woche", canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe) if not today and type(next_week) is str: parse_week(url + next_week, canteen, canteentype, allergene=allergene, zusatzstoffe=zusatzstoffe) print(canteen.toXMLFeed()) return canteen.toXMLFeed()
def add_options_to_form(table_name, form, tag_id):
    """
    Add to the form element with id tag_id one option per row of the given
    table, using its first two columns (typically id and name).
    @param table_name the name of the table
    @param form an option in the config file containing the path to an html file
    @param tag_id the tag id in the form (example : select#type)
    """
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    # adding types to the search form
    types = db_execute_out("SELECT * FROM " + table_name + " ORDER BY name;")
    form_path = config.get('html', form)
    _fd = open(form_path)
    soup = parse(_fd.read(), "lxml")
    _fd.close()
    soup.select(tag_id)[0].string = ''
    for row in types:
        opt = soup.new_tag('option')
        opt.string = row[1]
        opt['value'] = row[0]
        soup.select(tag_id)[0].append(opt)
    # writing the html file
    html = soup.prettify(formatter='html')
    with open(form_path, "wb") as _fd:
        _fd.write(html)
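# Hypothetical usage sketch (not part of the original source): the table name and the
# config option name below are assumptions; the tag id follows the docstring example.
# This would fill the <select id="type"> element of the referenced form file with one
# <option> per row of the "types" table and rewrite the file in place.
add_options_to_form('types', 'search_form_path', 'select#type')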
def parse_url(url, today=False): canteen = OpenMensaCanteen() # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages: canteen.setAdditionalCharges("student", {"other": 1.5}) document = parse(urlopen(url).read()) global legend regex = "\((?P<name>[\dA-Z]+)\)\s*(?P<value>[\w\s]+)" legend = buildLegend(legend, document.find(id="additives").text, regex=regex) days = ( "montag", "dienstag", "mittwoch", "donnerstag", "freitag", "montagNaechste", "dienstagNaechste", "mittwochNaechste", "donnerstagNaechste", "freitagNaechste", ) for day in days: data = document.find("div", id=day) headline = document.find("a", attrs={"data-anchor": "#" + day}) parse_day(canteen, headline.text, data) return canteen.toXMLFeed()
def animeSearch(self, query: str = None):
    '''Getting the Result List'''
    try:
        if query is None:
            print("Missing Anime Name!")
            return
        anime_names = []
        text = query.lower()
        text = text.replace(' ', '+')
        link_anime = "https://myanimelist.net/anime.php?q=" + text + "&type=0&score=0&status=0&p=0&r=0&sm=0&sd=0&sy=0&em=0&ed=0&ey=0&c[]=a&c[]=b&c[]=c&c[]=f&gx=0"
        r = requests.get(link_anime)
        s = parse(r.content, 'lxml')
        data_names = s.find_all("a", {"class": "hoverinfo_trigger fw-b fl-l"})
        refined = "None"
        for x in data_names:
            names = x.text
            anime_names.append(names)
        refined = '\n'.join(anime_names[:7])
        return refined
    except Exception as e:
        print(e)
def parse_week(url, canteen):
    document = parse(urlopen(url).read())
    for day_table in document.find_all('table', 'speiseplan'):
        try:
            date = extractDate(day_table.thead.tr.th.text)
        except ValueError:
            # There was no valid date in the table header, which happens eg
            # for special "Aktionswoche" tables.
            # TODO: check if this table contains any meals, which was not the
            # case when it was used for the first time.
            continue
        if day_table.find('td', 'keinangebot'):
            canteen.setDayClosed(date)
            continue
        for meal_tr in day_table.tbody.children:
            if len(meal_tr.find_all('a') or []) < 1:
                continue
            name = meal_tr.td.text
            if ': ' in name:
                category, name = name.split(': ', 1)
            else:
                category = 'Angebote'
            if len(name) > 200:
                name = name[:200] + ' ...'
            notes = []
            for img in meal_tr.contents[1].find_all('img'):
                notes.append(img['title'])
            canteen.addMeal(date, category, name, notes,
                            price_regex.findall(meal_tr.contents[2].text), roles)
def animeData(self, query: str = None):
    '''Getting the links to the selected result'''
    try:
        if query is None:
            print("Missing Name!")
            return
        anime_links = []
        anime_names = []
        query = query.lower()
        text = query.replace(' ', '+')
        link_anime = "https://myanimelist.net/anime.php?q=" + text + "&type=0&score=0&status=0&p=0&r=0&sm=0&sd=0&sy=0&em=0&ed=0&ey=0&c[]=a&c[]=b&c[]=c&c[]=f&gx=0"
        r = requests.get(link_anime)
        s = parse(r.content, 'lxml')
        data_links = s.find_all("a", {"class": "hoverinfo_trigger fw-b fl-l"})
        for x in data_links:
            names = x.text.lower()
            links = x["href"]
            anime_links.append(links)
            anime_names.append(names)
        anime_names = anime_names[:7]
        anime_links = anime_links[:7]
        link_found = "None"
        if query in anime_names:
            n = anime_names.index("{}".format(query))
            link_found = anime_links[n]
        datas = self.madeinfo(link_found)
        return datas
    except Exception as e:
        print(e)
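# Hypothetical usage sketch (not part of the original source): assumes animeSearch,
# animeData and madeinfo above are methods of one scraper class, here called MALClient.
client = MALClient()
print(client.animeSearch('one piece'))  # up to 7 matching titles, newline separated
info = client.animeData('one piece')    # dict built by madeinfo() for the exact match
print(info['rating'], info['rank'])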
def parse_url(url, today=False, canteentype='Mittagsmensa', this_week='',
              next_week=True, legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    if not legend_url:
        legend_url = url[:url.find('essen/') + 6] + 'wissenswertes/lebensmittelkennzeichnung'
    legend_doc = parse(urlopen(legend_url), 'lxml').find(id='artikel')
    allergene = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)')
    allergene['EI'] = 'Ei'
    zusatzstoffe = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)')
    suballergene = re.compile(
        r'(?P<name>[0-9A-Z]+)[^a-zA-Z]*enthält (?P<value>\w+( |\t|\w)*)')
    for tr in legend_doc.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) != 2:
            continue
        title = tds[0].find('strong')
        if title is None:
            continue
        else:
            title = title.text
        lines = tds[1].text.split('\n')
        for line in lines[1:]:
            try_allergine = suballergene.match(line)
            if try_allergine:
                allergene[try_allergine.group('name')] = try_allergine.group('value')
        text = lines[0].replace('enthält', '').strip()
        if title.isdigit():
            zusatzstoffe[title] = text
        else:
            allergene[title] = text
    parse_week(url + this_week, canteen, canteentype,
               allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and next_week is True:
        parse_week(url + '-kommende-woche', canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and type(next_week) is str:
        parse_week(url + next_week, canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read())
    for day_div in document.find_all('div', 'day') + document.find_all(
            'article', attrs={'data-day': True}):
        # parse date, warning: calculate year number needed
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date')
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = "{}-{}-{}".format(year, date_test.group('month'), date_test.group('day'))
        if 'nodata' in day_div.attrs.get('class', []) or 'GESCHLOSSEN' in day_div.text:
            canteen.setDayClosed(date)
            continue
        closed_candidate = False
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            if 'geschlossen' in name:
                closed_candidate = True
                continue
            category = meal_article.find('div')['title']
            notes = [v['title'] for v in meal_article.find_all('div', 'theicon') if v['title']]
            if meal_article.find('div', 'additive'):
                notes += [v[0] for v in extra_regex.findall(meal_article.find('div', 'additive').text)]
            price_div = meal_article.find('div', 'price')
            if price_div is None:
                canteen.addMeal(date, category, name, notes)
                continue
            prices = {}
            for v, r in (('default', 'student'), ('bed', 'employee'), ('guest', 'other')):
                price = price_regex.search(price_div['data-' + v])
                if price:
                    prices[r] = price.group('price')
                elif v == 'default':
                    prices = {}
                    break
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
def parse_week(url, canteen):
    data = urlopen(url).read().decode('utf-8')
    document = parse(data, 'lxml')
    # The day plans are in a div with no special class or id. Thus
    # we try to find a div with a heading "Speiseplan "
    for week_heading in document(class_='swdd-ueberschrift', text=speiseplan_regex):
        week_div = week_heading.parent
        # The meals for each day are in a card. Again there is no class or id to
        # select the meal cards, so we look for every card with a card-header,
        # which stores the date.
        for card_header in week_div.find_all(class_='card-header'):
            day_card = card_header.parent
            try:
                date = extractDate(card_header.text)
            except ValueError:
                # There was no valid date in the table header, which happens eg
                # for special "Aktionswoche" cards.
                # TODO: check if this card contains any meals, which was not the
                # case when it was used for the first time.
                continue
            # Check if there is a "kein Angebot" item
            if day_card.find(class_='list-group-item', text=kein_angebot_regex):
                canteen.setDayClosed(date)
                continue
            # Iterate over the list-group-items within the card, which are used
            # for individual meals
            for meal in day_card.find_all(class_='list-group-item'):
                name = meal.find(name='span')
                if name is not None:
                    name = name.text
                else:
                    continue
                if ': ' in name:
                    category, name = name.split(': ', 1)
                else:
                    category = 'Angebote'
                notes = [img['alt'] for img in meal.find_all(class_='swdd-spl-symbol')]
                if '* ' in name:
                    name, note = name.split('* ', 1)
                    notes.append(note)
                if meal.strong is not None:
                    prices = price_regex.findall(meal.strong.text)
                else:
                    prices = []
                canteen.addMeal(date, category, name, notes, prices, roles)
def GetData():
    # http://www.cbr.ru/ - official website of the Central Bank of Russia
    cbr = req.urlopen("http://www.cbr.ru/").read().decode("utf-8")
    Data = parse(cbr, 'html.parser')
    CurUSDnEUR = Data.find_all('td', {"class": "weak"})
    CurUSD, CurEUR = CurUSDnEUR[0].get_text(), CurUSDnEUR[1].get_text()
    Date = Data.find_all('a', {"href": re.compile(r"\/currency_base\/daily\.aspx\?date_req=\d{2}\.\d{2}\.\d{4}")})
    CurDate, NextDate = Date[0].get_text(), Date[1].get_text()
    return (CurDate, CurUSD, CurEUR)
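# Hypothetical usage sketch (not part of the original source): GetData() above returns
# the quote date and the USD/EUR exchange rates scraped from cbr.ru, all as strings.
if __name__ == '__main__':
    cur_date, cur_usd, cur_eur = GetData()
    print('Rates for {}: USD {}, EUR {}'.format(cur_date, cur_usd, cur_eur))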
def get_recipe(url, base):
    """
    Retrieve a web page and extract information from it.
    @param url the url of the web page to analyze
    @param base the base url of the web site
    @return a dictionary containing all the information of a recipe, or just
            the urls found on the page and the url of the page :
            {url, name, img, type, ingredients, add_urls} or {url, add_urls}
    """
    web_page = urllib2.urlopen(url)
    html = web_page.read()
    soup = parse(html.decode('utf8', 'replace'), "lxml")
    # urls on marmiton
    _urls = []
    for i in soup.find_all('a'):
        curr_url = i.get('href')
        if curr_url is not None:
            if base + 'recettes/' in curr_url:
                _urls.append(curr_url)
    # ingredients on marmiton
    ingr_list = []
    for i in soup.find_all('div'):
        if i.get('class') is not None:
            if 'm_content_recette_ingredients' in i.get('class'):
                ingr_list = str(i).split('<br/>')
                ingr_list = clean_ingredients(ingr_list)
    # image on marmiton
    _img = ''
    for i in soup.find_all('a'):
        if i.get('class') == ['m_content_recette_illu']:
            _img = i.findChildren()[0].get('src')
    if len(ingr_list) == 0 or _img == '':
        return {'url': url, 'add_urls': _urls}
    # title on marmiton
    title = soup.title.string
    title = re.sub(r'[\r|\n|\t]*', '', title)
    title = re.sub(r'\"', '', title)
    title = unicodedata.normalize('NFD', title).encode('utf8', 'ignore')
    # type
    _type = determine_type(title)
    return {
        'url': url,
        'name': title,
        'img': _img,
        'type': _type,
        'ingredients': ingr_list,
        'add_urls': _urls
    }
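# Hypothetical usage sketch (not part of the original source): the start URL below is
# made up. get_recipe() returns a full recipe dict when ingredients and an image are
# found, otherwise only the page URL and the recipe links to crawl next.
base = 'http://www.marmiton.org/'
result = get_recipe(base + 'recettes/recette_exemple.aspx', base)
if 'ingredients' in result:
    print(result['name'], result['type'], len(result['ingredients']))
else:
    print('{} more links to crawl'.format(len(result['add_urls'])))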
def create_favs(user_id):
    """
    Retrieve the user's favorite recipes, format them and return them.
    @param user_id the id of the user
    @return favorite recipes formatted in html
    """
    fav_rows = db_execute_out("""
        SELECT idRecipe FROM user_has_favorite_recipes
        WHERE idUser LIKE \"{}\";
        """.format(user_id))
    if fav_rows == []:
        return parse("""
            <h4>Favorite List :</h4><p>No favorite</p>
            """, 'lxml').prettify(formatter='html')
    favorite_list = format_recipes([x[0] for x in fav_rows])
    # constructing the web page part
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    _fd = open(config.get('html', 'fav_panel'))
    fav_panel = _fd.read()
    _fd.close()
    soup = parse('<h4>Favorite List :</h4><div></div>', 'lxml')
    panel_group = soup.div
    panel_group['class'] = 'container-fluid'
    # creating a panel for each recipe
    for recipe in favorite_list:
        panel = parse(fav_panel, 'lxml')
        # the well
        well = panel.select('div#$id_fav')[0]
        well['id'] = 'well_unfav_{}_{}'.format(str(user_id), str(recipe['id']))
        unfav = panel.select('button#$unfav_id')[0]
        unfav['id'] = 'unfav_{}_{}'.format(str(user_id), str(recipe['id']))
        # the img
        img = panel.select('img#$fav_img')[0]
        img['id'] = str(recipe['id']) + '_favimg'
        img['src'] = recipe['img']
        # the url
        url = panel.select('a#$fav_url')[0]
        url['id'] = str(recipe['id']) + '_favurl'
        url['href'] = recipe['url']
        panel_group.append(panel)
    return soup.prettify(formatter='html')
def create_favs(user_id):
    """
    Retrieve the user's favorite recipes, format them and return them.
    @param user_id the id of the user
    @return favorite recipes formatted in html
    """
    fav_rows = db_execute_out("""
        SELECT idRecipe FROM user_has_favorite_recipes
        WHERE idUser LIKE \"{}\";
        """.format(user_id))
    if fav_rows == []:
        return parse("""
            <h4>Favorite List :</h4><p>No favorite</p>
            """, 'lxml').prettify(formatter='html')
    favorite_list = format_recipes([x[0] for x in fav_rows])
    # constructing the web page part
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    _fd = open(config.get('html', 'fav_panel'))
    fav_panel = _fd.read()
    _fd.close()
    soup = parse('<h4>Favorite List :</h4><div></div>', 'lxml')
    panel_group = soup.div
    panel_group['class'] = 'container-fluid'
    # creating a panel for each recipe
    for recipe in favorite_list:
        panel = parse(fav_panel, 'lxml')
        # the well
        well = panel.select('div#$id_fav')[0]
        well['id'] = 'well_unfav_{}_{}'.format(str(user_id), str(recipe['id']))
        unfav = panel.select('button#$unfav_id')[0]
        unfav['id'] = 'unfav_{}_{}'.format(str(user_id), str(recipe['id']))
        # the img
        img = panel.select('img#$fav_img')[0]
        img['id'] = str(recipe['id']) + '_favimg'
        img['src'] = recipe['img']
        # the url
        url = panel.select('a#$fav_url')[0]
        url['id'] = str(recipe['id']) + '_favurl'
        url['href'] = recipe['url']
        panel_group.append(panel)
    return soup.prettify(formatter='html')
def parse_week(url, data, canteen):
    document = parse(urlopen(url, data).read())
    # parse extra/notes legend
    legends = {}
    legendsData = document.find('table', 'zusatz_std')
    if legendsData:
        legends = {int(v[0]): v[1] for v in legend_regex.findall(legendsData.text.replace('\xa0', ' '))}
    data = document.find('table', 'wo_std')
    if not data:
        message = document.find('div', 'Meldung_std')
        if message:
            m = day_range_regex.search(message.text)
            if m:
                fromDate = datetime.datetime.strptime(m.group('from') + '.' + m.group('year'), '%d.%m.%Y')
                toDate = datetime.datetime.strptime(m.group('to'), '%d.%m.%Y')
                while fromDate <= toDate:
                    canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                    fromDate += datetime.date.resolution
        return
    # iterate over all rows of the table
    rowIter = iter(document.find('table', 'wo_std').find_all('tr'))
    # extract category names from the th's of the first row
    headRow = next(rowIter)
    for br in headRow.find_all('br'):
        br.replace_with(document.new_string(' - '))
    categories = list(map(lambda v: (v.text.strip() + '#').replace(' -#', '#')[:-1], headRow.find_all('th')))[1:]
    try:
        while True:
            tr = next(rowIter)  # meal row
            # extract date from first column:
            date = day_regex.search(tr.contents[0].text).group('date')
            if tr.contents[0].get('rowspan') is None:
                canteen.setDayClosed(date)
                continue
            extratr = next(rowIter)  # additional meal component row, ToDo
            # build iterators for lists:
            categoriesIterator = iter(categories)
            colIter = iter(tr.find_all('td'))
            extraIter = iter(extratr.find_all('td'))
            # skip first column (date):
            next(colIter)
            next(extraIter)
            try:
                while True:
                    name = next(colIter).text
                    # extract notes from name
                    notes = [legends[int(v)] for v in set(','.join(extra_regex.findall(name)).split(',')) if v and int(v) in legends]
                    # remove notes from name
                    name = extra_regex.sub('', name).replace('\xa0', ' ').replace('  ', ' ').strip()
                    # extract price
                    canteen.addMeal(date, next(categoriesIterator), name, notes, next(colIter).text)
            except StopIteration:
                pass
    except StopIteration:
        pass
def get_content(_file):
    """
    Return the content of the web page inside the body tags
    @param _file an option in the config file containing the path to an html file
    @return the content of the body tags in the html file
    """
    config = SafeConfigParser()
    config.read(CONFIG_FILE)
    with open(config.get('html', _file), 'r') as _fd:
        soup = parse(_fd.read(), "lxml")
    return soup.find('body').prettify(formatter='html')
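# Hypothetical usage sketch (not part of the original source): 'fav_panel' is an option
# of the [html] section of the config file, as used by create_favs() above.
print(get_content('fav_panel'))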
def parse_url(url, today=False):
    content = urlopen(url).read()
    document = parse(content, 'lxml')
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {int(v[0]): v[1] for v in reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}
    canteen = LazyBuilder()
    for day_td in document.find_all('td', text=day_regex):
        date = day_regex.search(day_td.string).group('date')
        table = None
        for element in day_td.parents:
            if element.name == 'table':
                table = element
                break
        if not table:
            continue
        for tr in table.tbody.find_all('tr'):
            if 'geschlossen' in tr.text or 'Feiertage' in tr.text:
                match = day_range_regex.search(tr.text)
                if not match:
                    canteen.setDayClosed(date)
                else:
                    fromDate = datetime.datetime.strptime(match.group('from'), '%d.%m.%Y')
                    toDate = datetime.datetime.strptime(match.group('to'), '%d.%m.%Y')
                    while fromDate <= toDate:
                        canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                        fromDate += datetime.date.resolution
                continue
            if len(tr) != 2:
                continue  # no meal
            strings = list(tr.contents[0].strings)
            name = strings[0]
            # prices:
            prices = strings[-1].split('|')
            if '-' in map(lambda v: v.strip(), prices):
                prices = {}
            # notes:
            notes = []
            for img in tr.contents[1].find_all('img'):
                notes.append(img['alt'].replace('Symbol', '').strip())
            for extra in list(set(map(lambda v: int(v), extra_regex.findall(tr.text)))):
                if extra in extraLegend:
                    notes.append(extraLegend[extra])
            canteen.addMeal(date, 'Hauptgerichte', name, notes, prices,
                            roles if prices else None)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    document = parse(urlopen(url).read(), 'lxml')
    days = ('Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag')
    for day in days:
        data = document.find('div', {'data-day': day})
        date = data.attrs['data-date']
        parse_day(canteen, date, data)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    canteen.setAdditionalCharges('student', {'other': 1.5})
    document = parse(urlopen(url).read())
    for submit in document.find_all('input'):
        if submit['type'] != 'submit':
            continue
        parse_week(url, urlencode({submit['name']: submit['value']}).encode('utf8'), canteen)
        if today:
            break
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    document = parse(urlopen(url).read(), 'lxml')
    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges('student', {'other': 1.5})
    # unwanted automatic notes extraction would be done in `OpenMensaCanteen.addMeal()`
    # if we used `LazyBuilder.setLegendData()`, so we bypass it using a custom attribute
    canteen.legend = parse_legend(document)
    parse_all_days(canteen, document)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    document = parse(urlopen(url).read(), 'lxml')
    days = ('Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag')
    for day in days:
        data = document.find('div', {'data-day': day})
        if data is None:
            continue
        date = data.attrs['data-date']
        parse_day(canteen, date, data)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read(), 'lxml')
    for day_div in document.find_all('div', attrs={'data-day': True}):
        # parse date, warning: calculate year number needed
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date "{}"'.format(day_div['data-day']))
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = '{}-{}-{}'.format(year, date_test.group('month'), date_test.group('day'))
        closed_candidate = day_div.find('div', 'holiday') is not None
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            category = meal_article.find('div', 'icon')['title']
            notes = []
            prices = {}
            additives = meal_article.find('div', 'additnr')
            if additives:
                notes += [additive.text for additive in additives.find_all('li')]
            notes += [v['title'] for v in meal_article.find_all('div', 'theicon')
                      if v['title'] and v['title'] not in notes]
            price_div = meal_article.find('div', 'price')
            if price_div:
                for k, v in price_map.items():
                    price = price_div['data-' + k]
                    if price:
                        prices[v] = price
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
def parse_week(url, canteen, type, allergene={}, zusatzstoffe={}):
    document = parse(urlopen(url).read(), 'lxml')
    for day_table in document.find_all('table', 'swbs_speiseplan'):
        caption = day_table.find('th', 'swbs_speiseplan_head').text
        if type not in caption:
            continue
        date = extractDate(caption)
        meals = day_table.find_all('tr')
        pos = 0
        while pos < len(meals):
            meal_tr = meals[pos]
            if not meal_tr.find('td'):  # e.g. a headline row
                pos += 1
                continue
            tds = meal_tr.find_all('td')
            category = re.sub(r' \(\d\)', '', tds[0].text.strip())
            name = tds[1].text.strip()
            if tds[1].find('a', href='http://www.stw-on.de/mensavital'):
                notes = ['MensaVital']
            else:
                notes = []
            for img in tds[2].find_all('img'):
                title = img['title']
                if ':' in title:
                    kind, value = title.split(':')
                    if kind == 'Allergene':
                        for allergen in value.split(','):
                            notes.append(allergene.get(allergen.strip())
                                         or allergene[allergen.strip()[:-1]])
                    elif kind == 'Zusatzstoffe':
                        for zusatzstoff in value.split(','):
                            notes.append(zusatzstoffe[zusatzstoff.strip()])
                    else:
                        print('Unknown image type "{}"'.format(kind))
                else:
                    notes.append(title.replace('enthält ', ''))
            prices = {
                'student': tds[3].text.strip(),
                'employee': tds[4].text.strip(),
                'other': tds[5].text.strip()
            }
            if pos < len(meals) - 1:
                nextTds = meals[pos + 1].find_all('td')
                if nextTds[0].text.strip() == '':
                    pos += 1
                    for img in nextTds[1].find_all('img'):
                        notes.append(img['title'])
            pos += 1
            canteen.addMeal(date, category or 'Sonstiges', name, notes, prices)
def parse_week(url, canteen, type, allergene={}, zusatzstoffe={}):
    document = parse(urlopen(url).read(), 'lxml')
    for day_table in document.find_all('table', 'swbs_speiseplan'):
        caption = day_table.find('th', 'swbs_speiseplan_head').text
        if type not in caption:
            continue
        date = extractDate(caption)
        meals = day_table.find_all('tr')
        pos = 0
        while pos < len(meals):
            meal_tr = meals[pos]
            if not meal_tr.find('td'):  # e.g. a headline row
                pos += 1
                continue
            tds = meal_tr.find_all('td')
            category = re.sub(r' \(\d\)', '', tds[0].text.strip())
            name = tds[1].text.strip()
            if tds[1].find('a', href='http://www.stw-on.de/mensavital'):
                notes = ['MensaVital']
            else:
                notes = []
            for img in tds[2].find_all('img'):
                title = img['title']
                if ':' in title:
                    kind, value = title.split(':')
                    if kind == 'Allergene':
                        for allergen in value.split(','):
                            notes.append(allergene.get(allergen.strip())
                                         or allergene[allergen.strip()[:-1]])
                    elif kind == 'Zusatzstoffe':
                        for zusatzstoff in value.split(','):
                            notes.append(zusatzstoffe[zusatzstoff.strip()])
                    else:
                        print('Unknown image type "{}"'.format(kind))
                else:
                    notes.append(title.replace('enthält ', ''))
            prices = {
                'student': tds[3].text.strip(),
                'employee': tds[4].text.strip(),
                'other': tds[5].text.strip()
            }
            if pos < len(meals) - 1:
                nextTds = meals[pos + 1].find_all('td')
                if nextTds[0].text.strip() == '':
                    pos += 1
                    for img in nextTds[1].find_all('img'):
                        notes.append(img['title'])
            pos += 1
            canteen.addMeal(date, category or 'Sonstiges', name, notes, prices)
def parse_url(url, today=False):
    content = urlopen(url).read()
    document = parse(content)
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {int(v[0]): v[1] for v in reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}
    canteen = LazyBuilder()
    for day_td in document.find_all('td', text=day_regex):
        date = day_regex.search(day_td.string).group('date')
        table = None
        for element in day_td.parents:
            if element.name == 'table':
                table = element
                break
        if not table:
            continue
        for tr in table.tbody.find_all('tr'):
            if 'geschlossen' in tr.text or 'Feiertage' in tr.text:
                match = day_range_regex.search(tr.text)
                if not match:
                    canteen.setDayClosed(date)
                else:
                    fromDate = datetime.datetime.strptime(match.group('from'), '%d.%m.%Y')
                    toDate = datetime.datetime.strptime(match.group('to'), '%d.%m.%Y')
                    while fromDate <= toDate:
                        canteen.setDayClosed(fromDate.strftime('%Y-%m-%d'))
                        fromDate += datetime.date.resolution
                continue
            if len(tr) != 3:
                continue  # no meal
            strings = list(tr.contents[0].strings)
            name = strings[0]
            # prices:
            prices = strings[-1].split('|')
            print(prices)
            if '-' in map(lambda v: v.strip(), prices):
                prices = {}
            # notes:
            notes = []
            for img in tr.contents[1].find_all('img'):
                notes.append(img['alt'].replace('Symbol', '').strip())
            for extra in list(set(map(lambda v: int(v), extra_regex.findall(tr.text)))):
                if extra in extraLegend:
                    notes.append(extraLegend[extra])
            canteen.addMeal(date, 'Hauptgerichte', name, notes, prices,
                            roles if prices else None)
    return canteen.toXMLFeed()
def parse_url(url, today=False, canteentype='Mittagsmensa', this_week='',
              next_week=True, legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    if not legend_url:
        legend_url = url[:url.find('essen/') + 6] + 'lebensmittelkennzeichnung'
    legend_doc = parse(urlopen(legend_url))
    canteen.setLegendData(
        text=legend_doc.find(id='artikel').text,
        regex=r'(?P<name>(\d+|[A-Z]+))\s+=\s+(?P<value>\w+( |\t|\w)*)'
    )
    parse_week(url + this_week, canteen, canteentype)
    if not today and next_week is True:
        parse_week(url + '-kommende-woche', canteen, canteentype)
    if not today and type(next_week) is str:
        parse_week(url + next_week, canteen, canteentype)
    return canteen.toXMLFeed()
def getPostings():
    url = 'https://news.ycombinator.com/jobs'
    response = requests.get(url)
    page = parse(response.content, 'lxml')
    headlines = page.select('a.storylink')
    timestamps = page.select('span.age')
    company = re.compile(r'^[A-Z].+ \(YC .\d+\)|^[A-Z]\w+ [a-z]')
    titles = [title.text for title in headlines]
    times = [time.text for time in timestamps]
    urls = [title['href'] for title in headlines]
    locations = [GeoText(title).cities for title in titles]
    # companies = [re.findall(company,str(titles))]
    details = zip(titles, times, urls, locations)
    return details
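# Hypothetical usage sketch (not part of the original source): getPostings() above
# returns a zip of (title, age, url, cities) tuples scraped from the HN jobs page.
for title, age, url, cities in getPostings():
    print('{} ({}) {} {}'.format(title, age, url, ', '.join(cities)))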
def parse_week(canteen, url, place_class=None):
    content = urlopen(url).read()
    document = parse(content)
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {int(v[0]): v[1] for v in reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}
    if place_class:
        document = document.find(id=place_class)
    for day_a in document.find_all('a', rel=day_regex):
        day_data = document.find(id=day_a['href'].replace('#', ''))
        if not day_data:
            continue
        date = day_a['rel'][0]
        day_table = day_data.table
        if not day_table:
            continue
        if day_table.tbody:
            day_table = day_table.tbody
        canteen.clearDay(date)  # remove old data about this day
        for category_tr in day_table.children:
            if category_tr.name != 'tr':
                continue
            if len(category_tr) < 2:
                continue  # no meal
            category = category_tr.contents[0].text
            meal_table = category_tr.contents[1].table
            if meal_table.tbody:
                meal_table = meal_table.tbody
            for meal_tr in meal_table.children:
                if meal_tr.name != 'tr':
                    continue
                if len(list(meal_tr.children)) != 3:
                    # print('skipping category, unable to parse meal_table: {} tds'.format(len(list(meal_tr.children))))
                    continue
                name = meal_tr.contents[1].text
                # notes, to do
                canteen.addMeal(date, category, name, [],
                                price_regex.findall(meal_tr.contents[2].text), roles)
def download(self):
    # Validating input
    songID = self.input.text()
    if not songID:
        self.notification = NotificationDialog(switch['empinp'][config["lang"]])
        self.notification.exec()
        return
    elif not songID.isdigit():
        self.notification = NotificationDialog(switch['typerr'][config["lang"]])
        self.notification.exec()
        return
    page = parse(load('https://www.newgrounds.com/audio/listen/'
                      f'{songID}').text, 'html.parser')
    if page.find(id='pageerror') is not None:
        self.notification = NotificationDialog(switch['404'][config["lang"]])
        self.notification.exec()
        return
    self.songTitle = page.find('title').text
    # Getting download link
    link = 'http://audio.ngfiles.com/'
    page = str(page)
    i = page.find('audio.ngfiles.com') + len('audio.ngfiles.com/')
    while not link.endswith('.mp3'):
        if page[i] != '\\':
            link += page[i]
        i += 1
    # Locating file
    self.dist = (QFileDialog.getSaveFileName(
        self, switch['savefile'][config["lang"]],
        link.split('/')[-1], 'MP3 Audio File (*.mp3)')[0])
    if not self.dist:
        return
    # Downloading
    self.file = load(link, stream=True)
    self.progress = ProgressDialog()
    self.progress.label.setText(switch['downloading'][config["lang"]](self.songTitle))
    self.progress.setWindowTitle(switch['downloading'][config["lang"]](self.songTitle))
    self.progress.bar.setValue(0)
    self.progress.exec()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read())
    for day_div in document.find_all('div', 'day') + document.find_all(
            'article', attrs={'data-day': True}):
        # parse date, warning: calculate year number needed
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date')
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = "{}-{}-{}".format(year, date_test.group('month'), date_test.group('day'))
        if 'nodata' in day_div.attrs.get('class', []) or 'GESCHLOSSEN' in day_div.text:
            canteen.setDayClosed(date)
            continue
        closed_candidate = False
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            if 'geschlossen' in name:
                closed_candidate = True
                continue
            category = meal_article.find('div', 'desc').text
            notes = [v['title'] for v in meal_article.find_all('div', 'theicon') if v['title']]
            if meal_article.find('div', 'additive'):
                notes += [v[0] for v in extra_regex.findall(meal_article.find('div', 'additive').text)]
            price_div = meal_article.find('div', 'price')
            if price_div is None:
                canteen.addMeal(date, category, name, notes)
                continue
            prices = {}
            for v, r in (('default', 'student'), ('bed', 'employee'), ('guest', 'other')):
                price = price_regex.search(price_div['data-' + v])
                if price:
                    prices[r] = price.group('price')
                elif v == 'default':
                    prices = {}
                    break
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges('student', {'other': 1.5})
    document = parse(urlopen(url).read())
    global legend
    regex = r'\((?P<name>[\dA-Z]+)\)\s*(?P<value>[\w\s]+)'
    legend = buildLegend(legend, document.find(id='additives').text, regex=regex)
    days = ('montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag',
            'montagNaechste', 'dienstagNaechste', 'mittwochNaechste',
            'donnerstagNaechste', 'freitagNaechste')
    for day in days:
        data = document.find('div', id=day)
        headline = document.find('a', attrs={'data-anchor': '#' + day})
        parse_day(canteen, headline.text, data)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = LazyBuilder()
    document = parse(urlopen(url).read(), 'lxml')
    for day_div in document.find_all('div', attrs={'data-day': True}):
        # parse date, warning: calculate year number needed
        date_test = day_regex.search(day_div['data-day'])
        if not date_test:
            print('Error: unable to parse date "{}"'.format(day_div['data-day']))
            continue
        else:
            year = datetime.datetime.now().year
            if datetime.datetime.now().month > int(date_test.group('month')):
                year += 1  # date from next year
            date = '{}-{}-{}'.format(year, date_test.group('month'), date_test.group('day'))
        closed_candidate = day_div.find('div', 'holiday') is not None
        for meal_article in day_div.find_all('article', 'menu'):
            name = meal_article.find('div', 'title').text
            if not name:
                continue
            category = meal_article.find('div', 'icon')['title']
            notes = []
            prices = {}
            additives = meal_article.find('div', 'additnr')
            if additives:
                notes += [additive.text for additive in additives.find_all('li')]
            notes += [v['title'] for v in meal_article.find_all('div', 'theicon')
                      if v['title'] and v['title'] not in notes]
            price_div = meal_article.find('div', 'price')
            if price_div:
                for k, v in price_map.items():
                    price = price_div['data-' + k]
                    if price:
                        prices[v] = price
            canteen.addMeal(date, category, name, notes, prices)
        if closed_candidate and not canteen.hasMealsFor(date):
            canteen.setDayClosed(date)
    return canteen.toXMLFeed()
def parse_week(url, canteen, type, allergene={}, zusatzstoffe={}): document = parse(urlopen(url).read()) for day_table in document.find_all("table", "swbs_speiseplan"): caption = day_table.find("th", "swbs_speiseplan_head").text if type not in caption: continue date = extractDate(caption) meals = day_table.find_all("tr") pos = 0 while pos < len(meals): meal_tr = meals[pos] if not meal_tr.find("td"): # z.B Headline pos += 1 continue tds = meal_tr.find_all("td") category = re.sub(r" \(\d\)", "", tds[0].text.strip()) name = tds[1].text.strip() if tds[1].find("a", href="http://www.stw-on.de/mensavital"): notes = ["MensaVital"] else: notes = [] for img in tds[2].find_all("img"): title = img["title"] if ":" in title: kind, value = title.split(":") if kind == "Allergene": for allergen in value.split(","): notes.append(allergene.get(allergen.strip()) or allergene[allergen.strip()[:-1]]) elif kind == "Zusatzstoffe": for zusatzstoff in value.split(","): notes.append(zusatzstoffe[zusatzstoff.strip()]) else: print('Unknown image type "{}"'.format(kind)) else: notes.append(title.replace("enthält ", "")) prices = {"student": tds[3].text.strip(), "employee": tds[4].text.strip(), "other": tds[5].text.strip()} if pos < len(meals) - 1: nextTds = meals[pos + 1].find_all("td") if nextTds[0].text.strip() == "": pos += 1 for img in nextTds[1].find_all("img"): notes.append(img["title"]) pos += 1 canteen.addMeal(date, category, name, notes, prices)
def parse_url(url, today=False, canteentype='Mittagsmensa', this_week='',
              next_week=True, legend_url=None):
    canteen = LazyBuilder()
    canteen.legendKeyFunc = lambda v: v.lower()
    if not legend_url:
        legend_url = url[:url.find('essen/') + 6] + 'wissenswertes/lebensmittelkennzeichnung'
    legend_doc = parse(urlopen(legend_url), 'lxml').find(id='artikel')
    allergene = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>[A-Z]+) {3,}enthält (?P<value>\w+( |\t|\w)*)'
    )
    allergene['EI'] = 'Ei'
    zusatzstoffe = buildLegend(
        text=legend_doc.text.replace('\xa0', ' '),
        regex=r'(?P<name>\d+) {3,} (enthält )?(?P<value>\w+( |\t|\w)*)'
    )
    suballergene = re.compile(r'(?P<name>[0-9A-Z]+)[^a-zA-Z]*enthält (?P<value>\w+( |\t|\w)*)')
    for tr in legend_doc.find_all('tr'):
        tds = tr.find_all('td')
        if len(tds) != 2:
            continue
        title = tds[0].find('strong')
        if title is None:
            continue
        else:
            title = title.text
        lines = tds[1].text.split('\n')
        for line in lines[1:]:
            try_allergine = suballergene.match(line)
            if try_allergine:
                allergene[try_allergine.group('name')] = try_allergine.group('value')
        text = lines[0].replace('enthält', '').strip()
        if title.isdigit():
            zusatzstoffe[title] = text
        else:
            allergene[title] = text
    parse_week(url + this_week, canteen, canteentype,
               allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and next_week is True:
        parse_week(url + '-kommende-woche', canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    if not today and type(next_week) is str:
        parse_week(url + next_week, canteen, canteentype,
                   allergene=allergene, zusatzstoffe=zusatzstoffe)
    return canteen.toXMLFeed()
def parse_url(url, today=False):
    canteen = OpenMensaCanteen()
    # todo only for: Tellergericht, vegetarisch, Klassiker, Empfehlung des Tages:
    canteen.setAdditionalCharges('student', {'other': 1.5})
    document = parse(urlopen(url).read())
    global legend
    regex = r'(?P<name>(\d|[A-Z])+)\)\s*' + \
            r'(?P<value>\w+((\s+\w+)*[^0-9)]))'
    legend = buildLegend(legend, document.find(id='additives').text, regex=regex)
    days = ('montag', 'dienstag', 'mittwoch', 'donnerstag', 'freitag',
            'montagNaechste', 'dienstagNaechste', 'mittwochNaechste',
            'donnerstagNaechste', 'freitagNaechste')
    for day in days:
        data = document.find('div', id=day)
        headline = document.find('a', attrs={'data-anchor': '#' + day})
        parse_day(canteen, headline.text, data)
    return canteen.toXMLFeed()
def parse_week(canteen, url, place_class=None):
    content = urlopen(url).read()
    document = parse(content)
    legends = document.find_all('div', {'class': 'legende'})
    if len(legends) > 0:
        extraLegend = {int(v[0]): v[1] for v in reversed(legend_regex.findall(legends[0].text))}
    else:
        extraLegend = {}
    if place_class:
        document = document.find(id=place_class)
    for day_a in document.find_all('a', rel=day_regex):
        day_data = document.find(id=day_a['href'].replace('#', ''))
        if not day_data:
            continue
        date = day_a['rel'][0]
        day_table = day_data.table
        if not day_table:
            continue
        if day_table.tbody:
            day_table = day_table.tbody
        canteen.clearDay(date)  # remove old data about this day
        for category_tr in day_table.children:
            if category_tr.name != 'tr':
                continue
            if len(category_tr) < 2:
                continue  # no meal
            category = category_tr.contents[0].text
            meal_table = category_tr.contents[1].table
            if meal_table.tbody:
                meal_table = meal_table.tbody
            for meal_tr in meal_table.children:
                if meal_tr.name != 'tr':
                    continue
                if len(list(meal_tr.children)) != 3:
                    # print('skipping category, unable to parse meal_table: {} tds'.format(len(list(meal_tr.children))))
                    continue
                name = meal_tr.contents[1].text
                # notes, to do
                canteen.addMeal(date, category, name, [],
                                price_regex.findall(meal_tr.contents[2].text), roles)
def parse_url(url, data_canteen, today=False):
    canteen = LazyBuilder()
    data = urlopen(url).read().decode('utf-8')
    document = parse(data, 'lxml')
    dish = document.find(class_='neo-menu-single-dishes')
    if dish is not None:
        dishes = dish.find_all(name='tr', attrs={"data-canteen": data_canteen})
    else:
        dishes = []
    side = document.find(class_='neo-menu-single-modals')
    if side is not None:
        dishes = dishes + side.find_all(name='tr', attrs={"data-canteen": data_canteen})
    for dish in dishes:
        parse_dish(dish, canteen)
    return canteen.toXMLFeed()
def parse_week(url, canteen):
    document = parse(urlopen(url).read())
    for day_table in document.find_all('table', 'speiseplan'):
        date = extractDate(day_table.thead.tr.th.text)
        if day_table.find('td', 'keinangebot'):
            canteen.setDayClosed(date)
            continue
        for meal_tr in day_table.tbody.children:
            if len(meal_tr.find_all('a') or []) < 1:
                continue
            name = meal_tr.td.text
            if ': ' in name:
                category, name = name.split(': ', 1)
            else:
                category = 'Angebote'
            if len(name) > 200:
                name = name[:200] + ' ...'
            notes = []
            for img in meal_tr.contents[1].find_all('img'):
                notes.append(img['title'])
            canteen.addMeal(date, category, name, notes,
                            price_regex.findall(meal_tr.contents[2].text), roles)
def parse_week(url, canteen, type):
    document = parse(urlopen(url).read())
    for day_table in document.find_all('table', 'swbs_speiseplan'):
        caption = day_table.find('th', 'swbs_speiseplan_head').text
        if type not in caption:
            continue
        date = extractDate(caption)
        for meal_tr in day_table.find_all('tr'):
            if not meal_tr.find('td'):  # e.g. a headline row
                continue
            tds = meal_tr.find_all('td')
            category = tds[0].text.strip()
            name = tds[1].text
            if tds[1].find('a', href='http://www.stw-on.de/mensavital'):
                notes = ['MensaVital']
            else:
                notes = []
            prices = {
                'student': tds[2].text,
                'employee': tds[3].text,
                'other': tds[4].text
            }
            canteen.addMeal(date, category, name, notes, prices)
def parse_week(canteen, url, place_class=None):
    content = urlopen(url).read().decode('utf-8', errors='ignore')
    document = parse(content, features='lxml')
    legend = document.find('div', {'id': 'leg'})
    if legend and legend.find('br'):
        # Update legend
        legend_content = legend.find('br').parent
        current_img = None
        for child in legend_content.children:
            if isinstance(child, str):
                if current_img is not None:
                    # Last child was an icon, this must be its label
                    s = child.strip()
                    if s.startswith('- '):
                        s = s[2:].strip()
                    extraLegend[current_img] = s
                    current_img = None
                else:
                    # Text notes
                    for n, text in legend_number_regex.findall(child):
                        extraLegend[n] = text
                    for tag, text in legend_letters_regex.findall(child):
                        extraLegend[tag] = text
            elif hasattr(child, 'name') and child.name == 'img':
                # Icon
                current_img = icon(child['src'])
    if place_class:
        document = document.find(id=place_class)
    for day_a in document.find_all('a', rel=day_regex):
        day_data = document.find(id=day_a['href'].replace('#', ''))
        if not day_data:
            continue
        date = day_a['rel'][0]
        day_table = day_data.table
        if not day_table:
            continue
        if day_table.tbody:
            day_table = day_table.tbody
        canteen.clearDay(date)  # remove old data about this day
        found_meals = False
        closed_date_match = None
        for category_tr in day_table.children:
            if category_tr.name != 'tr':
                continue
            if len(category_tr) < 2:
                continue  # no meal
            category = category_tr.contents[0].text
            meal_table = category_tr.contents[1].table
            if meal_table.tbody:
                meal_table = meal_table.tbody
            for meal_tr in meal_table.children:
                if meal_tr.name != 'tr':
                    continue
                if len(list(meal_tr.children)) != 3:
                    # print('skipping category, unable to parse meal_table: {} tds'.format(len(list(meal_tr.children))))
                    if len(list(meal_tr.contents)) > 1 and closed_regex.search(meal_tr.contents[1].text):
                        # Remember closed "meal"
                        closed_date_match = closed_regex.search(meal_tr.contents[1].text)
                    continue
                found_meals = True
                td1 = meal_tr.contents[1]
                span = td1.find('span')
                if span:
                    name = span.text  # Name without notes in <sup>
                else:
                    name = td1.text  # Fallback value: whole line
                # Add notes from <sup>[Ab,Cd,Ef]</sup>
                sup = meal_tr.find('sup')
                if sup:
                    keys = sup.text.strip("[] ") if "[" in sup.text else ''
                    keys_list = [key.strip() for key in keys.split(',')]
                    notes = [extraLegend[key] if key in extraLegend else key
                             for key in keys_list if key]
                else:
                    notes = []
                # Find and convert icons to notes
                img = meal_tr.find('img')
                if img:
                    key = icon(img['src'])
                    if key in extraLegend:
                        notes.append(extraLegend[key])
                canteen.addMeal(date, category, name, notes,
                                price_regex.findall(meal_tr.contents[2].text), roles)
        if not found_meals and closed_date_match:
            # If there were no meals and there's a "geschlossen von .. bis .." message,
            # let's assume the whole canteen is closed on the mentioned dates
            match_from = closed_date_match.group("from")
            match_to = closed_date_match.group("to")
            now = datetime.datetime.now()
            year_from = year_to = now.year
            month_from = int(match_from.split(".")[1])
            month_to = int(match_to.split(".")[1])
            if now.month > 9:
                if now.month > month_to:
                    year_to += 1
                if now.month > month_from:
                    year_from += 1
            fromdate = datetime.datetime.strptime('%s%d' % (match_from, year_from), '%d.%m.%Y')
            todate = datetime.datetime.strptime('%s%d' % (match_to, year_to), '%d.%m.%Y')
            if fromdate < now:
                fromdate = now
            while fromdate <= todate:
                canteen.setDayClosed(fromdate.strftime('%d.%m.%Y'))
                fromdate += datetime.timedelta(1)
def parse_url(url, today=False):
    canteen = LazyBuilder()
    # prices are stored on a separate page
    document = parse(urlopen(base + '/mensa-preise/').read(), 'lxml')
    prices = {}
    for tr in document.find('div', 'ce-bodytext').find_all('tr'):
        meal = tr.find('th')
        if not meal or not meal.text.strip():
            continue
        if len(tr.find_all('td', 'betrag')) < 3:
            continue
        if 'titel' in meal.attrs.get('class', []) or 'zeilentitel' in meal.attrs.get('class', []):
            continue
        meal = meal.text.strip()
        prices[meal] = {}
        for role, _id in [('student', 0), ('employee', 1), ('other', 2)]:
            price_html = tr.find_all('td', 'betrag')[_id].text
            price_search = price_regex.search(price_html)
            if price_search:
                prices[meal][role] = price_search.group('price')
    errorCount = 0
    date = datetime.date.today()
    while errorCount < 7:
        try:
            document = parse(urlopen(url.format(date)).read(), 'lxml')
            errorCount = 0
        except HTTPError as e:
            if e.code == 404:
                errorCount += 1
                date += datetime.timedelta(days=1)
                continue
            else:
                raise e
        # extract legend
        legend = {}
        legends = document.find('div', 'tx-stwm-speiseplan')
        additions = legends.find('div', 'c-schedule__filter-body')
        for table in additions.find_all('div', 'c-schedule__filter-item'):
            for ingredient in table.find('ul').find_all('li'):
                name = ingredient.find('dt').text.strip()
                description = ingredient.find('dd').text.strip()
                legend[name] = description
        for label in legends.find('ul', 'c-schedule__type-list').find_all('li'):
            name = label.find('dt').text.replace('(', '').replace(')', '').strip()
            description = label.find('dd').text.strip()
            legend[name] = description
        # extract meals
        mensa_data = document.find('ul', 'c-schedule__list')
        category = None
        for meal in mensa_data.find_all('li'):
            # update category or use previous one if not specified
            category_text = meal.find('dt', 'c-schedule__term').text.strip()
            if category_text:
                category = category_text
            data = meal.find('dd').find('p', 'js-schedule-dish-description')
            name = data.contents[0].strip()  # name is the first text node
            if not name:
                continue
            # notes are contained in 3 boxes (type, additional, allergen) and
            # are comma-separated lists enclosed in brackets or parentheses
            notes = []
            for note in meal.find_all('span', 'c-schedule__marker'):
                note_text = note.find('span', 'u-text-sup').text \
                    .replace('(', '').replace(')', '') \
                    .replace('[', '').replace(']', '')
                notes += [n for n in note_text.split(',') if n]
            # some meals contain the GQB label in their name (instead of in notes)
            if '(GQB)' in name:
                name = name.replace('(GQB)', '').strip()
                notes.append('GQB')
            # the price for both meals is specified as Bio-/Aktionsgericht
            price_category = category \
                .replace('Aktionsessen', 'Bio-/Aktionsgericht') \
                .replace('Biogericht', 'Bio-/Aktionsgericht') \
                .strip()
            canteen.addMeal(date, category, name,
                            [legend.get(n, n) for n in notes],
                            prices.get(price_category, {}))
        date += datetime.timedelta(days=1)
        if today:
            break
    return canteen.toXMLFeed()