def get_body(self, with_markup=False):
    """Return the article body.

    :param with_markup: when True, return the raw inner HTML of the
        article body container; otherwise return plain text built from
        unclassed paragraphs and list items, one per line.
    """
    soup = self._get_soup(unmassaged=with_markup)
    parts = []
    for tag in soup.select('article.full-article div.article-body'):
        if with_markup:
            parts.extend(str(subtag) for subtag in tag.contents)
            continue
        for subtag in tag.contents:
            # Unclassed <p> tags hold the prose; classed ones are
            # captions/ads and are skipped.
            if subtag.name == 'p' and not subtag.get('class'):
                parts.append(remove_html_tags(str(subtag)).strip() + '\n')
            elif subtag.name == 'ul':
                for item in subtag.contents:
                    text = remove_html_tags(str(item)).strip()
                    # Skip whitespace-only text nodes between <li> items;
                    # previously these produced stray blank lines.
                    if text:
                        parts.append(text + '\n')
    # join once instead of repeated += (quadratic) concatenation
    return "".join(parts)
def get_articles(self):
    """Build an Article for every entry in the parsed feed."""
    articles = []
    for entry in self._feed.entries:
        published = entry.get('published', None)
        publish_date = parser().parse(published) if published else None

        content = entry.get('content', None)
        body = content[0].get('value', None) if content else None

        description = entry.get('description', None)
        if description:
            # Round-trip through BeautifulSoup to normalise the markup
            # before stripping the tags out of the description.
            description = unicode(BeautifulSoup(description, features='html'))
            description = remove_html_tags(description)

        article = Article(
            url=self._encode(entry.get('link', None)),
            title=self._encode(entry.get('title', None)),
            publish_date=publish_date,
            description=self._encode(description),
            body=self._encode(body),
        )
        articles.append(article)
    return articles
def get_publish_date(self):
    """Parse the publish date from the dateline <time> element.

    Returns a ``datetime.date``, or None when the element is missing or
    its text does not match the expected format.
    """
    for time_tag in self._massaged_soup.select('div.dateline time'):
        date_string = remove_html_tags(str(time_tag)).strip()
        # Drop the trailing token (presumably a timezone abbreviation,
        # e.g. "EST" — strptime cannot parse those portably).
        date_string = " ".join(date_string.split()[:-1])
        try:
            # %I (12-hour clock) is required for %p to take effect; the
            # previous %H silently ignored the AM/PM marker.  The time of
            # day is discarded by .date() either way.
            return datetime.strptime(date_string, "%A, %b. %d %Y, %I:%M %p").date()
        except ValueError:
            return None
def get_publish_date(self):
    """Extract the publish date from the 'span.published-date' element.

    Returns a ``datetime.date``, or None when the element is missing or
    the remaining text does not match "%a %b %d %Y".
    """
    for span in self._massaged_soup.select('span.published-date'):
        raw = remove_html_tags(str(span))
        # Strip the "Published on" prefix before parsing.
        cleaned = re.sub('Published on', '', raw).strip()
        try:
            parsed = datetime.strptime(cleaned, "%a %b %d %Y")
        except ValueError:
            return None
        return parsed.date()
def get_body(self, with_markup=False):
    """Return the article body.

    :param with_markup: when True, return the raw inner HTML of the
        article container; otherwise return the newline-terminated
        plain text of its <p> elements.
    """
    soup = self._get_soup(unmassaged=with_markup)
    containers = soup.select('div.l-main-container div.l-main div.article-entry.text')
    chunks = []
    for container in containers:
        if with_markup:
            chunks.extend(str(child) for child in container.contents)
        else:
            chunks.extend(
                remove_html_tags(str(paragraph)).strip() + '\n'
                for paragraph in container.select('p')
            )
    return "".join(chunks)
def get_body(self, with_markup=False):
    """Return the article body.

    :param with_markup: when True, return the raw inner HTML of the
        grid column holding the article; otherwise return the
        newline-terminated plain text of its <p> elements.
    """
    soup = self._get_soup(unmassaged=with_markup)
    pieces = []
    for column in soup.select('div.x140x460.clearfix div.column-2.gridcol'):
        if with_markup:
            pieces.extend(str(child) for child in column.contents)
        else:
            pieces.extend(
                remove_html_tags(str(paragraph)).strip() + '\n'
                for paragraph in column.select('p')
            )
    return "".join(pieces)
def get_body(self, with_markup=False):
    """Return the article body.

    :param with_markup: when True, return the raw inner HTML of the
        post body; otherwise return plain text built from unclassed
        <p> elements and <h3> headings, one per line.
    """
    soup = self._get_soup(unmassaged=with_markup)
    pieces = []
    for container in soup.select('div#body div.copy.post-body'):
        for child in container.contents:
            if with_markup:
                pieces.append(str(child))
            elif (child.name == 'p' and not child.get('class')) or child.name == 'h3':
                # Unclassed paragraphs and sub-headings only.
                pieces.append(remove_html_tags(str(child)).strip() + '\n')
    return "".join(pieces)
def get_body(self, with_markup=False):
    """Return the article body via the generic _parse pipeline.

    :param with_markup: pass-through flag; when markup is not wanted,
        the cleanup callback below strips the tags and terminates each
        fragment with a newline.
    """
    # PEP 8 (E731): use a named inner function rather than assigning a
    # lambda to a variable.
    def remove_markup_method(s):
        return remove_html_tags(s).strip() + '\n'

    return self._parse(self._parse_body, with_markup, remove_markup_method)
def get_title(self, with_markup=False):
    """Return the article title via the generic _parse pipeline.

    :param with_markup: pass-through flag; when markup is not wanted,
        the cleanup callback below removes anchor tags and then all
        remaining markup.
    """
    # PEP 8 (E731): use a named inner function rather than assigning a
    # lambda to a variable.
    def remove_markup_method(s):
        return remove_html_tags(remove_html_a_tags(s))

    return self._parse(self._parse_title, with_markup, remove_markup_method)