def _check_source(self):
    """Fetch the page behind ``self.document['link']`` and parse its entries.

    A fresh ``requests.Session`` is created for the call. When the instance
    has an ``auth_manager`` attribute, the session is authenticated through
    it before the page is fetched.

    Returns:
        Whatever ``self.parse`` returns for the fetched page, or ``None``
        when authentication fails, the fetch yields nothing, the soup cannot
        be built, or ``self.parse`` raises (each case is logged).
    """
    link = self.document['link']

    # The session is used as a context manager so that it is released on
    # every exit path, including the early returns below.
    with requests.Session() as session:
        if hasattr(self, 'auth_manager'):
            # Perform authentication
            self.auth_manager.auth(session)
            if not self.auth_manager.success:
                self.logger.error('Authentication failed..')
                return None

        html = self.fetch(link, session=session)
        if not html:
            self.logger.warning('Fetch "%s" returned nothing', link)
            return None

        bsoup = get_soup(html)
        if not bsoup:
            self.logger.warning('BeautifulSoup returned None')
            return None

        try:
            entries = self.parse(bsoup)
        except Exception as e:
            # Best-effort scraping: a parser failure is logged, not raised.
            self.logger.error('parse: %s', unicode(e))
            return None

        return entries
def __check_site(self):
    """Scrape entries from the page named by 'announcements.link_site'.

    The link is looked up in ``self.document`` through
    ``self._get_document_field``. The page is fetched, turned into a soup,
    parsed with ``self.parse_site`` and the result is passed, together with
    the link, to ``self.fix_site_entries``.

    Returns the value of ``self.fix_site_entries``, or ``None`` when the
    link lookup, the fetch, the soup or either processing step fails (every
    failure is logged).
    """
    site_url = self._get_document_field(self.document, 'announcements.link_site')
    if not site_url:
        self.logger.debug('"link_site" not found in document!')
        return None

    page = self.fetch(site_url)
    if not page:
        self.logger.warning('Fetch "%s" returned nothing' % site_url)
        return None

    soup = get_soup(page)
    if not soup:
        self.logger.warning('BeautifulSoup returned None')
        return None

    # Step 1: extract the raw entries from the page.
    try:
        raw_entries = self.parse_site(soup)
    except Exception as err:
        self.logger.error('parse_site: %s', unicode(err))
        return None

    # Step 2: post-process the entries, giving the fixer the page link.
    try:
        return self.fix_site_entries(raw_entries, site_url)
    except Exception as err:
        self.logger.error('fix_site_entries: %s', unicode(err))
        return None
def parse(self, html):
    """Build the weekly menu from the table cells of the page.

    The text of every ``<td>`` is cleaned with ``self.prettify`` and the
    meals are picked out by fixed cell positions: each row of interest is a
    run of seven consecutive cells, indexed by day.

    Returns a list of seven dictionaries, one per day:

        'name'   -> self.weekdays[i]
        'date'   -> date_to_datetime(self.latest_monday + i days)
        'lunch'  -> 'main'   -> unicode
                    'salad'  -> unicode
                    'desert' -> unicode
        'dinner' -> 'main'   -> unicode
                    'salad'  -> unicode
                    'desert' -> unicode

    Raises IndexError when the page has fewer cells than the fixed
    positions require.
    """
    soup = get_soup(html)
    cells = [self.prettify(td.text) for td in soup.find_all("td")]

    def rows_at(starts):
        # Each row spans seven consecutive cells starting at a fixed offset.
        return [cells[start:start + 7] for start in starts]

    def meal_for(rows, day):
        # Row order: [main, served_with, salad, cheese, desert].
        return {
            "main": rows[0][day] + ". " + rows[1][day],
            "salad": rows[2][day] + ". " + rows[3][day],
            "desert": rows[4][day],
        }

    # Hardcoded positions of the rows inside the flat list of cells.
    lunch_rows = rows_at((9, 25, 33, 41, 49))
    dinner_rows = rows_at((65, 81, 89, 97, 105))

    return [
        {
            "name": self.weekdays[day],
            "date": date_to_datetime(self.latest_monday + timedelta(days=day)),
            "lunch": meal_for(lunch_rows, day),
            "dinner": meal_for(dinner_rows, day),
        }
        for day in xrange(7)
    ]
def _check_source(self):
    """Fetch the source, turn it into a soup and parse its entries.

    Returns whatever ``self.parse`` returns, or ``None`` when the fetch
    yields nothing, the soup cannot be built, or ``self.parse`` raises
    (every failure is logged).
    """
    content = self.fetch()
    if not content:
        self.logger.warning('Fetch returned nothing. Make sure the file exist')
        return None

    soup = get_soup(content)
    if not soup:
        self.logger.warning('BeautifulSoup returned None')
        return None

    try:
        return self.parse(soup)
    except Exception as err:
        # A parser failure is logged rather than propagated to the caller.
        self.logger.error('parse: %s' % unicode(err))
        return None
def parse(bsoup):
    """Extract the announcements listed inside the element with id 'post'.

    Every ``<article class="loop-entry clearfix">`` becomes one dictionary:

        'date'      -> parse_greek_date() of the post-date text
        'has_time'  -> always False
        'title'     -> 'title' attribute of the heading link
        'link'      -> 'href' attribute of the heading link
        'html'      -> the article's <p> elements joined with newlines
        'plaintext' -> stripped text of that html

    Returns the list of dictionaries in page order.
    """
    container = bsoup.find(id='post')
    results = []

    for entry in container.find_all('article', class_='loop-entry clearfix'):
        # Left column: holds the publication date.
        left_col = entry.find('div', class_='loop-entry-left')
        meta = left_col.find('div', class_='post-meta')
        date_node = meta.find('div', class_='post-date')
        posted_on = parse_greek_date(date_node.text)

        # Right column: heading link and body paragraphs.
        right_col = entry.find('div', class_='loop-entry-right')
        title = right_col.h2.a['title']
        href = right_col.h2.a['href']

        # Join all paragraphs into a single html string.
        body_html = '\n'.join([unicode(par) for par in right_col.find_all('p')])

        # Re-parse the joined html to obtain its plain text.
        body_text = get_soup(body_html).text.strip()

        results.append({
            'date': posted_on,
            'has_time': False,
            'title': title,
            'link': href,
            'html': body_html,
            'plaintext': body_text,
        })

    return results