import urllib.request

import requests
from bs4 import BeautifulSoup

# Flags and helpers such as JUST_ONE_LINK, TEST, DEBUG, logger,
# set_headers() and delay() are assumed to be defined elsewhere
# in the project.


def parse_standard_rss(self, url):
    headers = self.set_headers()
    req = urllib.request.Request(url, headers=headers)
    parse_xml_url = urllib.request.urlopen(req)
    xml_page = parse_xml_url.read()
    parse_xml_url.close()

    soup_page = BeautifulSoup(xml_page, "lxml")
    channel = soup_page.find("channel")
    news_list = channel.find_all("item")

    links = []
    for getfeed in news_list:
        titolo = getfeed.title.text
        description = ""
        # Guard against items that carry no <description> tag at all.
        if getfeed.description and getfeed.description.text:
            description = getfeed.description.text
        link_id = generate_link_id(titolo)
        links.append({
            'id': link_id,
            'titolo': titolo,
            'text': description,
            # lxml treats <link> as a self-closing tag, so the URL ends up
            # in the text node that follows it rather than inside the tag.
            'url': getfeed.link.nextSibling.rstrip(),
            'data': parse_date(getfeed.pubdate.text)
        })
        if JUST_ONE_LINK:
            break
    return links
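
# generate_link_id() and parse_date() are referenced throughout but not
# defined in this section. A minimal, hypothetical sketch of what they
# might look like (assuming md5-based ids and RFC 822 pubDate strings):
import hashlib
from email.utils import parsedate_to_datetime


def generate_link_id(title):
    # Hypothetical: derive a stable id from the normalized title text.
    return hashlib.md5(title.strip().lower().encode("utf-8")).hexdigest()


def parse_date(raw):
    # Hypothetical: RSS pubDate values are usually RFC 822 strings,
    # e.g. "Mon, 06 Sep 2021 12:00:00 +0200".
    try:
        return parsedate_to_datetime(raw).strftime("%Y-%m-%d")
    except (TypeError, ValueError):
        return raw.strip()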
def parse_page(self, url): links = [] headers = super().set_headers() results = requests.get(url.strip(), headers=headers) soup = BeautifulSoup(results.text, "html.parser") container = soup.find('ul', class_='collection-page__list') articles = container.find_all('article') if articles: for article in articles: if article.find('h2'): title = article.find('h2').find('a').contents[0].text href = article.find('h2').find('a')['href'] data = article.find('p', class_="article-teaser-vertical__date").text.strip() # print(title) # print (href) # print (data) links.append({ 'id': generate_link_id(title), 'titolo': title, 'text': '', 'url': "https://www.greenbiz.com" + href, 'data': data }) return links
def parse_page(self, url): links = [] headers = super().set_headers() results = requests.get(url.strip(), headers=headers) soup = BeautifulSoup(results.text, "html.parser") container = soup.find('div', id='topics') articles = container.find_all('article', class_='item') if articles: for article in articles: if article.find('h1'): title = article.find('h1').find('a').contents[0] href = article.find('h1').find('a')['href'] data = article.find('div', class_="date").text.strip() links.append({ 'id': generate_link_id(title), 'titolo': title, 'text': '', 'url': href, 'data': data }) return links


def get_items(self):
    super().get_items()
    today = get_today_date()
    headers = super().set_headers()
    results = requests.get(self.url.strip(), headers=headers)
    soup = BeautifulSoup(results.text, "html.parser")

    # The main feature is marked up differently from the card grid below.
    container = soup.find('article', class_='main-feature')
    title = container.find('h2')
    links = [{
        'id': generate_link_id(title.text),
        'titolo': title.text,
        'text': '',
        'url': title.find('a').get('href'),
        'data': today
    }]

    def get_links_from_structure(articles):
        links = []
        rows = articles.find_all('div', class_='row')
        for row in rows:
            a = row.find('a')
            if a:
                h3 = a.find('h3')
                if h3:
                    links.append({
                        'id': generate_link_id(h3.text),
                        'titolo': h3.text,
                        'text': '',
                        'url': a.get('href'),
                        'data': today
                    })
        return links

    container = soup.find('div', class_='cards')
    # Extend rather than reassign, so the main-feature link is kept.
    links += get_links_from_structure(container)
    self.links = links
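
# get_today_date() is another assumed helper; a plausible sketch, given
# that the scrapers above only need a display date string:
from datetime import date


def get_today_date():
    # Hypothetical: ISO-style date used as the 'data' field above.
    return date.today().strftime("%Y-%m-%d")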


def parse_page(self, url):
    links = []
    base_url = self.params['base_url']
    article_selector = self.params['article_selector']
    title_selector = self.params['title_selector']
    link_selector = self.params['link_selector']
    date_selector = self.params.get('date_selector')

    try:
        headers = super().set_headers()
        results = requests.get(url.strip(), headers=headers)
        if TEST:
            print(results.text)
        soup = BeautifulSoup(results.text, "html.parser")
        articles = soup.select(article_selector)
        if TEST:
            print(article_selector)
            print(articles)
        if articles:
            print("Getting links")
            if TEST:
                print("Len: %d" % len(articles))
            for article in articles:
                try:
                    title = article.select(title_selector)[0].text.strip()
                    if TEST:
                        print("---------------------------------")
                        print(title)
                    link = article.select(link_selector)[0]['href']
                    # Normalize relative hrefs against the configured base.
                    if not link.startswith("http"):
                        link = base_url + link
                    if TEST:
                        print("---------------------------------")
                        print(link)
                    date = ''
                    try:
                        if date_selector:
                            date = article.select(date_selector)[0].text.strip()
                    except Exception:
                        print("Exception in parse_page > for loop > get date")
                    if TEST:
                        print(date)
                    links.append({
                        'id': generate_link_id(title),
                        'titolo': title,
                        'text': '',
                        'url': link,
                        'data': date
                    })
                    if TEST:
                        break
                except Exception as e:
                    # TODO write exception to log for analysis
                    print("Exception in parse_page > for loop")
                    print(e)
            # Throttle before the caller moves on to the next page.
            self.delay()
    except Exception as e:
        # TODO write exception to log for analysis
        print("Exception in parse_page")
        print(e)
        links = []
    return links


def parse_page(url, params):
    links = []
    logger.info("Parsing url: " + url)
    base_url = params['base_url']
    article_selector = params['article_selector']
    title_selector = params['title_selector']
    link_selector = params['link_selector']
    date_selector = params.get('date_selector')

    try:
        headers = set_headers()
        results = requests.get(url.strip(), headers=headers)
        soup = BeautifulSoup(results.text, "html.parser")
        articles = soup.select(article_selector)
        if articles:
            if DEBUG:
                logger.debug("Number of links in url: %d" % len(articles))
            for article in articles:
                try:
                    # [-1] picks the last match, skipping decorative
                    # elements that share the same selector.
                    title = article.select(title_selector)[-1].text.strip()
                    if DEBUG:
                        logger.debug("Title: " + title)
                    link = article.select(link_selector)[-1]['href']
                    if not link.startswith("http"):
                        link = base_url + link
                    if DEBUG:
                        logger.debug("Link: " + link)
                    date = ''
                    try:
                        if date_selector:
                            d = article.select(date_selector)[-1].text.strip()
                            date = parse_date(d)
                    except Exception as e0:
                        logger.error(
                            "Exception in parse_page > for loop > get date: %s" % e0)
                    if DEBUG:
                        logger.debug("Date: " + str(date))
                    links.append({
                        'id': generate_link_id(title),
                        'titolo': title,
                        'text': '',
                        'url': link,
                        'data': date
                    })
                    if TEST:
                        break
                except Exception as e:
                    logger.error("Exception in parse_page > for loop: %s" % e)
            delay()
    except Exception as e:
        logger.error("Exception in parse_page: %s" % e)
        links = []
    return links
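
# A hypothetical usage sketch of the selector-driven parse_page(): the
# URL and selector values below are invented for illustration and must
# be adapted to the target site's actual markup.
if __name__ == "__main__":
    params = {
        'base_url': 'https://example.com',
        'article_selector': 'div.article-list article',
        'title_selector': 'h2.title',
        'link_selector': 'h2.title a',
        'date_selector': 'time.published',  # optional key
    }
    for link in parse_page('https://example.com/news', params):
        print(link['data'], link['titolo'], link['url'])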