Exemplos de clean_html em Python, exemplos de app.loaders.helpers.clean_html em Python

Exemplo n.º 1

0

Exibir arquivo

def _get_content(html):
    """Return the cleaned description and story body of an article.

    Looks up ``<p class="description">`` and ``<div class="story-content">``
    in *html*, passes each one through ``clean_html`` and returns the two
    results concatenated, description first.

    A section that is not found contributes an empty string: ``find``
    yields ``None`` for a missing element, and stringifying that would
    otherwise leak the literal text "None" into the article content.
    """
    description_tag = html.find('p', {'class': 'description'})
    story_tag = html.find('div', {'class': 'story-content'})

    description = ''
    if description_tag is not None:
        description = clean_html(str(description_tag))

    story = ''
    if story_tag is not None:
        story = clean_html(str(story_tag))

    return description + story

Exemplo n.º 2

0

Exibir arquivo

def _get_date(html):
    """Return the article's publication date as a ``YYYY-MM-DD`` string.

    The date text is taken from the ``<span>`` elements found inside the
    ``<p class="author-name">`` elements. When that yields nothing after
    cleaning, the children of ``<div class="publishDate">`` are used
    instead. The cleaned text must match ``%d %B %Y`` (day, full month
    name, four-digit year).

    Raises:
        TypeError: if the span lookup yields no text and the
            ``publishDate`` div is not found either.
        ValueError: if the extracted text does not match ``%d %B %Y``.
    """
    # Finds the author and posted date class.
    author_paragraphs = html.findAll('p', {'class': 'author-name'})

    # Re-parse the stringified result set so that the spans holding the
    # posted date can be collected from all matches in one search.
    span_soup = BeautifulSoup(str(author_paragraphs), 'lxml')
    date_spans = span_soup.find_all('span')

    # Maps and converts to raw string data.
    date_text = clean_html(' '.join(map(str, date_spans)))

    if not date_text:
        publish_div = html.find('div', {'class': 'publishDate'})
        # Iterating the element yields its children, which may themselves
        # be tags rather than strings; stringify each one so that join()
        # does not raise on nested markup.
        date_text = clean_html(' '.join(str(child) for child in publish_div))

    parsed = datetime.datetime.strptime(date_text, "%d %B %Y")
    return parsed.strftime("%Y-%m-%d")

Exemplo n.º 3

0

Exibir arquivo

Arquivo: wired.py Projeto: jamo95/Newsy

def _get_date(html):
    """Build a ``20yy-mm-dd`` date from ``<time class="date-mdy">``.

    The element is stringified and split on "."; the first three pieces are
    taken as month, day and two-digit year, reassembled as
    ``20yy-mm-dd`` and passed through ``clean_html``.

    Raises:
        IndexError: if the stringified element has fewer than three
            "."-separated pieces.
    """
    # Source format is mm.dd.yy.
    # NOTE(review): the whole element is stringified, not just its text, so
    # the pieces presumably still carry markup that clean_html is expected
    # to remove - confirm against clean_html.
    pieces = str(html.find('time', {'class': 'date-mdy'})).split('.')

    # Reorder to year-month-day, expanding the two-digit year.
    reordered = "-".join(("20" + pieces[2], pieces[0], pieces[1]))
    return clean_html(reordered)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: wired.py Projeto: jamo95/Newsy

def _get_content(html):
    """Return the cleaned paragraph text of the page's ``<article>``.

    The ``<article>`` element is stringified and re-parsed with lxml. The
    ``<p>`` elements under the first ``<div>`` of that fragment are joined
    with single spaces and passed through ``clean_html``.
    """
    article_markup = str(html.find('article'))

    first_div = BeautifulSoup(article_markup, 'lxml').find('div')
    paragraphs = first_div.findAll('p')

    return clean_html(' '.join(str(paragraph) for paragraph in paragraphs))

Exemplo n.º 5

0

Exibir arquivo

def _get_content(html):
    """Return the cleaned paragraph text of the article body.

    The ``<div class="article-content">`` element is stringified and
    re-parsed with lxml; every ``<p>`` found in that fragment is joined
    with single spaces and passed through ``clean_html``.
    """
    body_markup = str(html.find('div', {'class': 'article-content'}))
    paragraphs = BeautifulSoup(body_markup, 'lxml').findAll('p')

    joined = ' '.join(str(paragraph) for paragraph in paragraphs)
    return clean_html(joined)

Exemplo n.º 6

0

Exibir arquivo

def _get_content(html):
    """Return the cleaned paragraph text of the article body.

    Uses ``<div class="article-text-update">`` when the lookup result is
    truthy, otherwise falls back to
    ``<div class="article-text text-merri">``. The chosen element is
    stringified and re-parsed with lxml; every ``<p>`` in that fragment is
    joined with single spaces and passed through ``clean_html``.
    """
    # Prefer the primary container; the second lookup only runs when the
    # first one comes back falsy.
    container = (
        html.find('div', {'class': 'article-text-update'})
        or html.find('div', {'class': 'article-text text-merri'})
    )

    paragraphs = BeautifulSoup(str(container), 'lxml').findAll('p')
    return clean_html(' '.join(str(paragraph) for paragraph in paragraphs))

Exemplo n.º 7

0

Exibir arquivo

Arquivo: hackernoon.py Projeto: jamo95/Newsy

def _get_content(html):
    """Return the cleaned text of all ``<div class="section-content">`` blocks.

    Each matching element is stringified, the results are joined with
    single spaces, and the combined string is passed through
    ``clean_html``.
    """
    sections = html.findAll('div', {'class': 'section-content'})
    joined = ' '.join(str(section) for section in sections)
    return clean_html(joined)

Exemplo n.º 8

0

Exibir arquivo

def _get_date(html):
    """Return the cleaned, stringified ``<time class="the-time">`` element."""
    return clean_html(str(html.find('time', {'class': 'the-time'})))

Exemplo n.º 9

0

Exibir arquivo

def _get_title(html):
    """Return the cleaned, stringified ``<h1 class="article-title">`` element."""
    return clean_html(str(html.find('h1', {'class': 'article-title'})))

Exemplo n.º 10

0

Exibir arquivo

def _get_content(html):
    """Return the cleaned contents of the first element matching ``.text``.

    Each entry of that element's ``contents`` is stringified, the pieces
    are joined with single spaces, and the result is passed through
    ``clean_html``.

    Raises:
        IndexError: if the ``.text`` selector matches nothing.
    """
    first_match = html.select('.text')[0]
    joined = ' '.join(str(node) for node in first_match.contents)
    return clean_html(joined)

Exemplo n.º 11

0

Exibir arquivo

Arquivo: wired.py Projeto: jamo95/Newsy

def _get_title(html):
    """Return the cleaned, stringified ``<h1 class="title">`` element."""
    heading_markup = str(html.find('h1', {'class': 'title'}))
    return clean_html(heading_markup)

Exemplo n.º 12

0

Exibir arquivo

def _get_title(html):
    """Return the cleaned, stringified ``<h1 class="story-headline">`` element."""
    headline_markup = str(html.find('h1', {'class': 'story-headline'}))
    return clean_html(headline_markup)