Пример #1
0
def jsonify_page(urls, topicId, switch="JSON"):
    """Scrape every URL in ``urls`` into an article record.

    Each page is fetched with ``sb.getSoup`` and its title, author, date,
    image URLs and body text are pulled out with the extraction helpers
    that are in scope. A body of length zero is replaced by the string
    "/empty".

    Args:
        urls: iterable of page URLs to process.
        topicId: value stored under the 'topic' key of every record.
        switch: when equal to "JSON" (the default) the records are
            serialised to a JSON string; any other value returns the
            plain list of dicts.

    Returns:
        A JSON string (sorted keys, 4-space indent) or the list of
        record dicts, depending on ``switch``.
    """
    records = []
    for url in urls:
        # fetch and parse the page
        page = sb.getSoup(url)

        # pull out the individual fields
        headline = titleFormat(sb.listCatch(getTitle(page)))
        byline = sb.listCatch(getAuthor(page))
        published = getDate(page)
        pictures = getImURLS(page)
        # the text is grabbed first, then collapsed into a single string
        text = getBodyAsString(grabPageText(page))
        if not len(text):
            text = "/empty"

        # 'posted' and 'id' always start out as False and 0;
        # publicationId is read from the enclosing scope
        records.append({
            'title': headline,
            'author': byline,
            'date': published,
            'body': text,
            'images': pictures,
            'url': url,
            'publication': publicationId,
            'topic': topicId,
            'posted': False,
            'id': 0,
        })

    if switch == "JSON":
        return json.dumps(records, sort_keys=True, indent=4)
    return records
Пример #2
0
def getTitle(soup):
    """Extract the page title from the ".post-meta h1" element(s).

    The selected elements are passed through ``sb.listCatch``. If that
    yields the "/empty" marker it is returned unchanged; otherwise the
    text is split on single spaces and handed to ``buildTitle``.
    """
    raw = sb.listCatch(soup.select(".post-meta h1"))
    if raw == "/empty":
        # nothing usable was found, so pass the marker straight through
        return raw
    return buildTitle(raw.split(" "))
Пример #3
0
def getDate(soup):
    """Extract the publication date from the ".post-date" element(s).

    Returns the "/empty" marker untouched when ``sb.listCatch`` produces
    it; otherwise returns the result of ``sb.parseDate`` on the caught
    value.
    """
    raw = sb.listCatch(soup.select(".post-date"))
    if raw == "/empty":
        # no date on the page, so skip parsing
        return raw
    return sb.parseDate(raw)
Пример #4
0
def getAuthor(soup):
    """Take the author from the last ".hentry-content p" paragraph.

    Returns "/empty" when the selector matches nothing; otherwise the
    final matched paragraph is wrapped in a one-element list and passed
    to ``sb.listCatch``.
    """
    paragraphs = soup.select(".hentry-content p")
    if len(paragraphs) == 0:
        return "/empty"
    # only the last paragraph is used
    return sb.listCatch([paragraphs[-1]])
Пример #5
0
def getAuthor(soup):
    """Return the ``sb.listCatch`` result for the ".post-author" element(s)."""
    # an empty selection is left for sb.listCatch to deal with
    matches = soup.select(".post-author")
    author = sb.listCatch(matches)
    return author
Пример #6
0
def getDate(soup):
    """Parse the date out of the ".author" element(s).

    The selection goes through ``sb.listCatch`` first and its result is
    always handed to ``sb.parseDate``, whatever it is.
    """
    caught = sb.listCatch(soup.select(".author"))
    parsed = sb.parseDate(caught)
    return parsed
Пример #7
0
def getTitle(soup):
    """Return the ``sb.listCatch`` result for the ".hentry-meta h1" element(s)."""
    return sb.listCatch(soup.select(".hentry-meta h1"))
Пример #8
0
def getBody(soup):
    """Build the body text from every ".hentry-content p" paragraph.

    Each paragraph is passed on its own (as a one-element list) through
    ``sb.listCatch`` and followed by a blank line. Returns an empty
    string when no paragraph matches.
    """
    paragraphs = soup.select(".hentry-content p")
    return "".join(sb.listCatch([p]) + "\n\n" for p in paragraphs)
Пример #9
0
def getDate(soup):
    """Parse the date out of the ".hentry-meta p" element(s).

    The selection is passed through ``sb.listCatch`` and the result is
    then cleaned up by ``sb.parseDate``.
    """
    return sb.parseDate(sb.listCatch(soup.select(".hentry-meta p")))