def get_body(self, with_markup=False):
    """Return the article body.

    :param with_markup: when True, return the raw inner HTML of the
        article body container; otherwise return plain text built from
        unclassed paragraphs and list items, one per line.
    """
    soup = self._get_soup(unmassaged=with_markup)
    parts = []
    for tag in soup.select('article.full-article div.article-body'):
        if with_markup:
            parts.extend(str(subtag) for subtag in tag.contents)
            continue
        for subtag in tag.contents:
            # Unclassed <p> tags hold the prose; classed ones are
            # captions/ads and are skipped.
            if subtag.name == 'p' and not subtag.get('class'):
                parts.append(remove_html_tags(str(subtag)).strip() + '\n')
            elif subtag.name == 'ul':
                for item in subtag.contents:
                    text = remove_html_tags(str(item)).strip()
                    # Skip whitespace-only text nodes between <li> items;
                    # previously these produced stray blank lines.
                    if text:
                        parts.append(text + '\n')
    # join once instead of repeated += (quadratic) concatenation
    return "".join(parts)
def get_articles(self):
    """Build an Article for every entry in the parsed feed."""
    articles = []
    for entry in self._feed.entries:
        published = entry.get('published', None)
        publish_date = parser().parse(published) if published else None

        content = entry.get('content', None)
        body = content[0].get('value', None) if content else None

        description = entry.get('description', None)
        if description:
            # Round-trip through BeautifulSoup to normalise the markup
            # before stripping the tags out of the description.
            description = unicode(BeautifulSoup(description, features='html'))
            description = remove_html_tags(description)

        article = Article(
            url=self._encode(entry.get('link', None)),
            title=self._encode(entry.get('title', None)),
            publish_date=publish_date,
            description=self._encode(description),
            body=self._encode(body),
        )
        articles.append(article)
    return articles
def get_publish_date(self):
    """Parse the publish date from the dateline <time> element.

    Returns a ``datetime.date``, or None when the element is missing or
    its text does not match the expected format.
    """
    for time_tag in self._massaged_soup.select('div.dateline time'):
        date_string = remove_html_tags(str(time_tag)).strip()
        # Drop the trailing token (presumably a timezone abbreviation,
        # e.g. "EST" — strptime cannot parse those portably).
        date_string = " ".join(date_string.split()[:-1])
        try:
            # %I (12-hour clock) is required for %p to take effect; the
            # previous %H silently ignored the AM/PM marker.  The time of
            # day is discarded by .date() either way.
            return datetime.strptime(date_string, "%A, %b. %d %Y, %I:%M %p").date()
        except ValueError:
            return None
def get_publish_date(self):
    """Extract the publish date from the 'span.published-date' element.

    Returns a ``datetime.date``, or None when the element is missing or
    the remaining text does not match "%a %b %d %Y".
    """
    for span in self._massaged_soup.select('span.published-date'):
        raw = remove_html_tags(str(span))
        # Strip the "Published on" prefix before parsing.
        cleaned = re.sub('Published on', '', raw).strip()
        try:
            parsed = datetime.strptime(cleaned, "%a %b %d %Y")
        except ValueError:
            return None
        return parsed.date()
def get_body(self, with_markup=False):
    """Return the article body.

    :param with_markup: when True, return the raw inner HTML of the
        article container; otherwise return the newline-terminated
        plain text of its <p> elements.
    """
    soup = self._get_soup(unmassaged=with_markup)
    containers = soup.select('div.l-main-container div.l-main div.article-entry.text')
    chunks = []
    for container in containers:
        if with_markup:
            chunks.extend(str(child) for child in container.contents)
        else:
            chunks.extend(
                remove_html_tags(str(paragraph)).strip() + '\n'
                for paragraph in container.select('p')
            )
    return "".join(chunks)
def get_body(self, with_markup=False):
    """Return the article body.

    :param with_markup: when True, return the raw inner HTML of the
        grid column holding the article; otherwise return the
        newline-terminated plain text of its <p> elements.
    """
    soup = self._get_soup(unmassaged=with_markup)
    pieces = []
    for column in soup.select('div.x140x460.clearfix div.column-2.gridcol'):
        if with_markup:
            pieces.extend(str(child) for child in column.contents)
        else:
            pieces.extend(
                remove_html_tags(str(paragraph)).strip() + '\n'
                for paragraph in column.select('p')
            )
    return "".join(pieces)
def get_body(self, with_markup=False):
    """Return the article body.

    :param with_markup: when True, return the raw inner HTML of the
        post body; otherwise return plain text built from unclassed
        <p> elements and <h3> headings, one per line.
    """
    soup = self._get_soup(unmassaged=with_markup)
    pieces = []
    for container in soup.select('div#body div.copy.post-body'):
        for child in container.contents:
            if with_markup:
                pieces.append(str(child))
            elif (child.name == 'p' and not child.get('class')) or child.name == 'h3':
                # Unclassed paragraphs and sub-headings only.
                pieces.append(remove_html_tags(str(child)).strip() + '\n')
    return "".join(pieces)
def get_body(self, with_markup=False):
    """Return the article body via the generic _parse pipeline.

    :param with_markup: pass-through flag; when markup is not wanted,
        the cleanup callback below strips the tags and terminates each
        fragment with a newline.
    """
    # PEP 8 (E731): use a named inner function rather than assigning a
    # lambda to a variable.
    def remove_markup_method(s):
        return remove_html_tags(s).strip() + '\n'

    return self._parse(self._parse_body, with_markup, remove_markup_method)
def get_title(self, with_markup=False):
    """Return the article title via the generic _parse pipeline.

    :param with_markup: pass-through flag; when markup is not wanted,
        the cleanup callback below removes anchor tags and then all
        remaining markup.
    """
    # PEP 8 (E731): use a named inner function rather than assigning a
    # lambda to a variable.
    def remove_markup_method(s):
        return remove_html_tags(remove_html_a_tags(s))

    return self._parse(self._parse_title, with_markup, remove_markup_method)