def postNass():
    # get the nass's data from source
    nassUrl = 'http://www.nassauweekly.com/'
    nassSoup = sb.getSoup(nassUrl)
    nassAboutUrl = 'http://www.nassauweekly.com/about/'
    nassAboutSoup = sb.getSoup(nassAboutUrl)

    # the logo
    #elements = nassSoup.select(".logo img")
    #el = sb.listCatchItem(elements)
    #logo = el["src"]
    logo = 'https://walkercarpenter.files.wordpress.com/2016/02/nass-circle.png?w=800'

    # about: keep only the first two paragraphs, as the database
    # would not accept long strings
    elements = nassAboutSoup.select(".post-content p")
    about = ""
    s = " \n "
    for p in elements[:2]:
        about = about + s + p.text

    print "PRINTING ABOUT LEN: "
    print len(about)

    nass = Publication(name="The Nassau Weekly", logo=logo, description=about)
    id = nass.ppost()
    print "nass id: " + str(id)
    nass.addId(id)
    return nass
def jsonify_page(urls, topicId, switch="JSON"):
    outlist = list()
    for url in urls:
        soup = sb.getSoup(url)
        title = getTitle(soup)
        author = getAuthor(soup)
        date = getDate(soup)
        imageUrls = getImages(soup)
        body = getBody(soup)
        # now convert to a json dict; publication should correspond to nass,
        # topic should be misc (publicationId is a module-level global)
        bornAgain = {
            'title': title,
            'author': author,
            'date': date,
            'body': body,
            'images': imageUrls,
            'url': url,
            'publication': publicationId,
            'topic': topicId,
            'posted': False,
            'id': 0
        }
        outlist.append(bornAgain)
    if switch == "JSON":
        return json.dumps(outlist, sort_keys=True, indent=4)
    else:
        return outlist
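# Hedged usage sketch (not in the original source): publicationId is the
# module-level global jsonify_page() reads; the id value, article URL, and
# topic id below are all placeholders.
def _exampleNassDump():
    global publicationId
    publicationId = 1  # placeholder: e.g. the id printed by postNass()
    sampleUrls = ['http://www.nassauweekly.com/some-article/']
    print jsonify_page(sampleUrls, topicId=1)  # topic id 1 is illustrative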
def postTigerMag():
    # get the data from source
    aboutURL = 'http://www.tigermag.com/about-us/'
    aboutSoup = sb.getSoup(aboutURL)

    # the logo
    logo = 'https://upload.wikimedia.org/wikipedia/en/1/15/The_Princeton_Tiger_Logo.png'

    # about: concatenate every paragraph of the about page
    elements = aboutSoup.select(".hentry-content p")
    about = ""
    s = " "
    for p in elements:
        about = about + s + p.text

    tigerMag = Publication(name="The Princeton Tiger", logo=logo, description=about)
    #mId = 22
    mId = tigerMag.ppost()
    print "tiger mag id: " + str(mId)
    tigerMag.addId(mId)
    return tigerMag
def testUrl(url):
    # only download the page once
    soup = sb.getSoup(url)

    # print the article title, author, and date
    title = getTitle(soup)
    sys.stdout.write("Title:\t\t")
    sys.stdout.write(title[0].text)
    writeN()
    author = getAuthor(soup)
    sys.stdout.write("Author:\t\t")
    sys.stdout.write(author[0].text)
    date = getDate(soup)
    sys.stdout.write("\t\tDate:\t\t")
    sys.stdout.write(date)
    writeN()

    # get the body text of our soup and print it paragraph by paragraph
    body = grabPageText(soup)
    for p in body:
        sys.stdout.write(p.text)
        writeN()
def jsonify_page(urls, topicId, switch="JSON"):
    outlist = list()
    for url in urls:
        # download the page
        soup = sb.getSoup(url)

        # get the page content
        title = titleFormat(sb.listCatch(getTitle(soup)))
        author = sb.listCatch(getAuthor(soup))
        date = getDate(soup)

        # get the image urls
        imageUrls = getImURLS(soup)

        # body comes in a list of paragraphs
        body = grabPageText(soup)
        body = getBodyAsString(body)
        if len(body) == 0:
            body = "/empty"

        # now convert to a json dict
        bornAgain = {
            'title': title,
            'author': author,
            'date': date,
            'body': body,
            'images': imageUrls,
            'url': url,
            'publication': publicationId,
            'topic': topicId,
            'posted': False,
            'id': 0
        }
        outlist.append(bornAgain)
    if switch == "JSON":
        return json.dumps(outlist, sort_keys=True, indent=4)
    else:
        return outlist
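# getBodyAsString() is defined elsewhere in the repo. A minimal sketch of
# what it plausibly does, assuming grabPageText() returns a list of bs4
# paragraph tags (the name below is ours, not the repo's):
def getBodyAsStringSketch(paragraphs):
    # join each paragraph's text into one newline-separated string
    return " \n ".join(p.text for p in paragraphs)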
def getTopicPageUrls(topicPage):
    # collect the unique article links on a topic listing page
    soup = sb.getSoup(topicPage)
    elements = soup.select(".post a")
    outSet = set()
    for el in elements:
        outSet.add(el["href"])
    return list(outSet)
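# Hedged glue example (not in the original source): feed every article
# link on a topic listing page straight into jsonify_page(); both
# arguments are supplied by the caller.
def _exampleTopicDump(topicPageUrl, topicId):
    urls = getTopicPageUrls(topicPageUrl)
    return jsonify_page(urls, topicId)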
def postPrince():
    # get the daily prince info from source
    princeUrl = 'http://www.dailyprincetonian.com/'
    princeSoup = sb.getSoup(princeUrl)
    princeAboutUrl = 'http://www.dailyprincetonian.com/page/about'
    princeAboutSoup = sb.getSoup(princeAboutUrl)

    # for now we are using the old logo because the new one looks nasty
    prince = Publication(
        name="The Daily Princetonian",
        #logo=sb.listCatchItem(princeSoup.select(".col-md-8 a img"))["src"],
        logo='http://dirgyzwl2hnqq.cloudfront.net/20170330XJxw8OoJDm/dist/img/favicons/apple-touch-icon.png',
        description=sb.listCatchItem(
            princeAboutSoup.select(".col-sm-12 p")).text)
    id = prince.ppost()
    print "prince id: " + str(id)
    prince.addId(id)
    return prince
def getArchiveIssueLinks(archiveUrl="http://www.nassauweekly.com/issue/"):
    soup = sb.getSoup(archiveUrl)
    elements = soup.select("div h2 a")
    issueUrls = list()
    for el in elements:
        issueUrls.append(el["href"])

    # grab the dates as well!
    elements = soup.select(".post-date")
    dates = list()
    for el in elements:
        dates.append(sb.parseDate(el.text).split(" ")[0])
    return [issueUrls, dates]
def getIssueArticleUrls(issueUrl):
    soup = sb.getSoup(issueUrl)
    elements = soup.select(".issue-posts span a")
    urls = list()
    # article links are interleaved with author links,
    # so exclude anything that points at a /byline/ page
    for el in elements:
        url = el["href"]
        found = re.search("/byline/", url)
        if found is None:
            urls.append(url)
    return urls
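# Hedged glue example (not in the original source): walk the Nass issue
# archive and collect every article URL, using only the two functions
# above; the dates list is returned alongside but unused here.
def _exampleCrawlArchive():
    issueUrls, dates = getArchiveIssueLinks()
    allArticles = list()
    for issueUrl in issueUrls:
        allArticles.extend(getIssueArticleUrls(issueUrl))
    return allArticles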
def getArticleURLS(params, topicTag):
    qURL = getPrinceQURL(params[0], params[1], params[2], topicTag)
    soup = sb.getSoup(qURL)
    links = soup.select(".clearfix a")
    urls = list()
    baseURL = "http://www.dailyprincetonian.com"
    # each link is repeated, so we only select even indexes;
    # the hrefs are relative, so prepend the base URL
    for i in range(0, len(links), 2):
        urls.append(baseURL + links[i]['href'])
    return urls
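# Hedged end-to-end sketch for the Prince side (not in the original
# source): params is whatever three-element sequence getPrinceQURL()
# expects, which is defined elsewhere in the repo; the publication id,
# topic tag, and topic id are placeholders.
def _examplePrinceDump(params):
    global publicationId
    publicationId = 1  # placeholder: e.g. the id printed by postPrince()
    urls = getArticleURLS(params, "news")  # "news" is an illustrative tag
    return jsonify_page(urls, 2)  # topic id 2 is illustrative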