def jsonify_page(urls, topicId, switch="JSON"):
    """Scrape each URL into a post dict; return JSON text or the raw list.

    Parameters:
        urls: iterable of page URLs to scrape.
        topicId: topic identifier stored in each output record.
        switch: "JSON" (default) returns a JSON-formatted string;
            any other value returns the list of dicts directly.

    Returns:
        A JSON string (sorted keys, indent 4) when switch == "JSON",
        otherwise the list of per-page dicts.

    NOTE(review): reads a module-level ``publicationId`` that is not a
    parameter — confirm it is defined at import time.
    """
    outlist = []
    for url in urls:
        # Download and parse the page.
        soup = sb.getSoup(url)
        # Extract the page metadata.
        title = titleFormat(sb.listCatch(getTitle(soup)))
        author = sb.listCatch(getAuthor(soup))
        date = getDate(soup)
        # Collect image URLs for the page.
        imageUrls = getImURLS(soup)
        # Body arrives as a list of paragraphs; flatten it to one string.
        body = getBodyAsString(grabPageText(soup))
        if not body:
            # Empty-body sentinel used throughout this module.
            body = "/empty"
        outlist.append({
            'title': title,
            'author': author,
            'date': date,
            'body': body,
            'images': imageUrls,
            'url': url,
            'publication': publicationId,
            'topic': topicId,
            'posted': False,
            'id': 0,
        })
    if switch == "JSON":
        return json.dumps(outlist, sort_keys=True, indent=4)
    return outlist
def getTitle(soup):
    """Return the formatted post title, or "/empty" when none is found."""
    raw = sb.listCatch(soup.select(".post-meta h1"))
    # listCatch yields the "/empty" sentinel for a missing element;
    # pass that through untouched.
    if raw == "/empty":
        return raw
    return buildTitle(raw.split(" "))
def getDate(soup):
    """Return the parsed post date, or "/empty" when the field is missing."""
    raw = sb.listCatch(soup.select(".post-date"))
    # Only parse a real value; propagate the "/empty" sentinel as-is.
    return sb.parseDate(raw) if raw != "/empty" else raw
def getAuthor(soup):
    """Return the author taken from the page's last content paragraph.

    Falls back to the "/empty" sentinel when the page has no
    ``.hentry-content p`` paragraphs.

    NOTE(review): this file defines ``getAuthor`` more than once; the
    later definition shadows this one at import time — confirm which
    version is intended for this scraper.
    """
    body = soup.select(".hentry-content p")
    # Guard clause: no paragraphs means no author line to read.
    if not body:
        return "/empty"
    # The author line is taken from the final paragraph.
    return sb.listCatch([body[-1]])
def getAuthor(soup):
    """Return the post author, or "/empty" when the field is absent."""
    matches = soup.select(".post-author")
    # listCatch maps an empty match list to the "/empty" sentinel.
    return sb.listCatch(matches)
def getDate(soup):
    """Return the parsed date taken from the page's .author element."""
    found = soup.select(".author")
    # Catch the (possibly empty) match list, then normalize the date.
    caught = sb.listCatch(found)
    return sb.parseDate(caught)
def getTitle(soup):
    """Return the raw page title from the .hentry-meta h1 element."""
    # listCatch handles the empty-match case with its "/empty" sentinel.
    return sb.listCatch(soup.select(".hentry-meta h1"))
def getBody(soup):
    """Return the article body as one string.

    Each paragraph is followed by a blank-line separator (so the result
    ends with one), matching the original concatenation behavior.
    Returns the empty string when the page has no content paragraphs.
    """
    paragraphs = soup.select(".hentry-content p")
    # str.join avoids the quadratic cost of repeated += concatenation.
    return "".join(sb.listCatch([p]) + "\n\n" for p in paragraphs)
def getDate(soup):
    """Return the sanitized post date from the .hentry-meta paragraph."""
    raw = sb.listCatch(soup.select(".hentry-meta p"))
    # Normalize whatever date string was scraped.
    return sb.parseDate(raw)