Exemplo n.º 1
0
def _get_content(html):
    articleDescription = html.find('p', {'class': 'description'})
    articleDescription = clean_html(str(articleDescription))

    articleContent = html.find('div', {'class': 'story-content'})
    articleContent = clean_html(str(articleContent))

    return articleDescription + articleContent
Exemplo n.º 2
0
def _get_date(html):
    """Return the article's posted date formatted as ``YYYY-MM-DD``.

    The date is taken from the ``<span>`` elements inside every
    ``<p class="author-name">``.  When those produce no text, the children of
    the first ``<div class="publishDate">`` are used instead.

    Args:
        html: Parsed page exposing BeautifulSoup-style ``find``/``find_all``.

    Returns:
        The date string produced by ``strftime("%Y-%m-%d")``.

    Raises:
        ValueError: If neither location provides a date, or the cleaned text
            does not match the ``"%d %B %Y"`` format (e.g. ``"5 March 2018"``).
    """
    # Walk the author paragraphs directly instead of serialising the result
    # set and parsing that string a second time.
    spans = []
    for paragraph in html.find_all('p', {'class': 'author-name'}):
        spans.extend(paragraph.find_all('span'))
    date = clean_html(' '.join(map(str, spans)))

    if not date:
        fallback = html.find('div', {'class': 'publishDate'})
        if fallback is None:
            # Same exception type strptime() raises for an unusable date.
            raise ValueError('no publication date found in document')
        # The element's children can be tags as well as text nodes, and
        # str.join() only accepts strings, so stringify each child first.
        date = clean_html(' '.join(map(str, fallback.contents)))

    return datetime.datetime.strptime(date, "%d %B %Y").strftime("%Y-%m-%d")
Exemplo n.º 3
0
def _get_date(html):
    date = html.find('time', {'class': 'date-mdy'})
    #format is mm.dd.yy
    dates = str(date).split('.')
    #reformat
    month = dates[0]
    day = dates[1]
    year = "20" + dates[2]
    publish_date = year + "-" + month + "-" + day
    return clean_html(str(publish_date))
Exemplo n.º 4
0
def _get_content(html):
    articleText = html.find('article')
    articleText = str(articleText)

    articleSoup = BeautifulSoup(articleText, 'lxml')
    articleSoup = articleSoup.find('div')
    articleSoup = articleSoup.findAll('p')

    raw_content_str = map(str, articleSoup)

    return clean_html(' '.join(raw_content_str))
Exemplo n.º 5
0
def _get_content(html):
    articleText = html.find('div', {'class': 'article-content'})
    articleText = str(articleText)

    articleSoup = BeautifulSoup(articleText, 'lxml')
    articleSoup = articleSoup.findAll('p')
    #print(str(articleSoup))
    raw_content_str = map(str, articleSoup)

    finalReturn = clean_html(' '.join(raw_content_str))

    return finalReturn
Exemplo n.º 6
0
def _get_content(html):
    articleText = html.find('div', {'class': 'article-text-update'})

    if not articleText:
        articleText = html.find('div', {'class': 'article-text text-merri'})

    articleText = str(articleText)

    articleSoup = BeautifulSoup(articleText, 'lxml')
    articleSoup = articleSoup.findAll('p')

    raw_content_str = map(str, articleSoup)

    return clean_html(' '.join(raw_content_str))
Exemplo n.º 7
0
def _get_content(html):
    """Return the cleaned markup of every ``section-content`` block.

    All ``<div class="section-content">`` elements are serialised with
    ``str()``, joined with single spaces and passed through ``clean_html``.
    """
    sections = html.findAll('div', {'class': 'section-content'})
    joined_markup = ' '.join(str(section) for section in sections)
    return clean_html(joined_markup)
Exemplo n.º 8
0
def _get_date(html):
    """Return ``clean_html`` of the first ``<time class="the-time">`` element.

    NOTE(review): when nothing matches, ``find`` presumably returns ``None``
    and the literal ``'None'`` is what gets cleaned - confirm this is intended.
    """
    return clean_html(str(html.find('time', {'class': 'the-time'})))
Exemplo n.º 9
0
def _get_title(html):
    """Return ``clean_html`` of the first ``<h1 class="article-title">``.

    NOTE(review): when nothing matches, ``find`` presumably returns ``None``
    and the literal ``'None'`` is what gets cleaned - confirm this is intended.
    """
    heading_markup = str(html.find('h1', {'class': 'article-title'}))
    return clean_html(heading_markup)
Exemplo n.º 10
0
def _get_content(html):
    raw_content_str = map(str, html.select('.text')[0].contents)
    return clean_html(' '.join(raw_content_str))
Exemplo n.º 11
0
def _get_title(html):
    """Return ``clean_html`` of the first ``<h1 class="title">`` element.

    NOTE(review): when nothing matches, ``find`` presumably returns ``None``
    and the literal ``'None'`` is what gets cleaned - confirm this is intended.
    """
    heading = html.find('h1', {'class': 'title'})
    heading_markup = str(heading)
    return clean_html(heading_markup)
Exemplo n.º 12
0
def _get_title(html):
    """Return ``clean_html`` of the first ``<h1 class="story-headline">``.

    NOTE(review): when nothing matches, ``find`` presumably returns ``None``
    and the literal ``'None'`` is what gets cleaned - confirm this is intended.
    """
    headline_markup = str(html.find('h1', {'class': 'story-headline'}))
    return clean_html(headline_markup)