def createWarning(self, surrogateId, winner, winningScore, runnerUp, runnerUpScore, pushFlag):
    warning = {}
    warning["date"] = datetime.utcnow().isoformat('T')
    warning["derivedFrom"] = {}
    warning["derivedFrom"]["derivedIds"] = [surrogateId]
    warning["derivedFrom"]["location"] = [self.country.title(), self.state.title(), self.city.title()]
    warning["derivedFrom"]["source"] = "Raw Twitter feed from DataSift and election config Files."
    warning["derivedFrom"]["model"] = __processor__
    warning["model"] = "VoteSentimentEvolutionModel v6"

    if self.electionType in ['president', 'prime minister']:
        electionType = 'President/Prime Minister'
    else:
        electionType = self.electionType
    warning["eventType"] = ["Vote", "Election", electionType.title()]

    # Use 100.0 so the score is not truncated by integer division under Python 2.
    warning["confidence"] = round(winningScore / 100.0, 2)
    # in cases of very close run races:
    # warning["confidence"] = 1.00
    warning["confidenceIsProbability"] = True
    warning["eventDate"] = self.electionDate
    warning["population"] = winner.title()
    warning["location"] = [self.country.title(), self.state.title(), self.city.title()]

    comment = ("Winner: " + winner + " Score: " + str(winningScore) +
               " Runner-Up: " + runnerUp + " Score: " + str(runnerUpScore))
    if self.runOff == 'Y':
        comment = "Elections going to 2nd round with candidates: " + winner + ' and ' + runnerUp
    warning["comments"] = comment

    # Event codes by election type.
    warning['eventCode'] = '02'
    if self.electionType == 'mayor':
        warning['eventCode'] = '0213'
    elif self.electionType in ['president', 'prime minister']:
        warning['eventCode'] = '0211'
    elif self.electionType == 'governor':
        warning['eventCode'] = '0212'

    warning = message.add_embers_ids(warning)
    log.info("warning-->\n%s" % warning)

    # writing warning to file, named after the most specific location available
    if self.state == '-':
        warningFileName = "warning_" + self.country + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    elif self.city == '-':
        warningFileName = "warning_" + self.state + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    else:
        warningFileName = "warning_" + self.city + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    with open("../data/warnings/" + warningFileName, "w") as f:
        f.write(json.dumps(warning))
    log.info("warning written to file")

    # writing files to s3 (optional)
    if pushFlag == '1':
        try:
            with open("../data/warnings/" + warningFileName, 'r') as f:
                pushFileToS3(f, 'incoming/predictions/elections/' + warningFileName)
            log.info("warning pushed to s3")
        except Exception:
            log.exception("warning push to S3 failed!! push manually!!")
    return
def ingest_price(arg, stock, scrape_f):
    # initiate url: query a 30-day window ending on the requested date
    s_date = datetime.strftime(datetime.strptime(arg.d, "%Y-%m-%d") + timedelta(days=-30),
                               STOCK_CON[stock]["tFormat"])
    e_date = datetime.strftime(datetime.strptime(arg.d, "%Y-%m-%d"), STOCK_CON[stock]["tFormat"])
    url = STOCK_CON[stock]["urlStr"] % (s_date, e_date)

    post_time, last_price, previous_last_price = scrape_f(url)
    if post_time == "n/a":
        return None
    nowstr = datetime.utcnow().isoformat()

    # create json message
    msg = {
        "previousCloseValue": previous_last_price,
        "date": post_time,
        "queryTime": nowstr,
        "originalUpdateTime": post_time,
        "name": stock,
        "feed": STOCK_CON[stock]["feed"],
        "currentValue": last_price,
        "type": STOCK_CON[stock]["type"],
    }
    msg = message.add_embers_ids(msg)
    return msg
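# ingest_price() above assumes a module-level STOCK_CON mapping keyed by stock/index name.
# Its real definition is not shown here; the sketch below is hypothetical. The keys it uses
# ("tFormat", "urlStr", "feed", "type") mirror the lookups the function performs, while the
# index name, URL, and values are assumptions for illustration only.
STOCK_CON = {
    "EXAMPLE_INDEX": {
        "tFormat": "%Y%m%d",                                     # date format the quote URL expects
        "urlStr": "http://example.com/quotes?start=%s&end=%s",   # start/end dates substituted in
        "feed": "example stock price feed",
        "type": "stock",
    },
}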
def get_news_by_url(url):
    news = News()
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))

        # title
        title = soup.find("div", "pg-story-head md").find("h2").text
        news.set_title(title)

        # postTime: the dateline holds "author / time"; strip am/pm markers before parsing
        author_posttime = soup.find("p", "dateline").text.replace("\n", "").lower().replace("\t", "").split("/")
        post_time = author_posttime[1].replace("pm", "").replace("am", "").strip()
        t_format = "%d %b %Y, %I:%M"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)

        # author
        author = author_posttime[0]
        news.set_author(author)

        # url
        news.set_url(url)

        # date (ingestion time, UTC)
        date = datetime.utcnow().isoformat()
        news.set_date(date)

        # source
        source = 'elfinancierocr'
        news.set_source(source)

        # content, encoding, id, country, labels
        paragraphs = soup.find("div", "pg-story-body mce").find_all('p')
        content = " ".join([unicode(p.text) for p in paragraphs])
        news.set_content(content)

        # encoding
        encoding = 'utf-8'
        news.set_encoding(encoding)

        news.news = message.add_embers_ids(news.news)
        return news.news
    except Exception:
        log.exception("Exception when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
def get_news_by_url(url):
    news = News()
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))

        # title
        title = soup.find_all("h1")[0].text
        news.set_title(title)

        # postTime, taken from the page's REVISION_DATE meta tag
        post_time = soup.select('meta[name="REVISION_DATE"]')[0]["content"]
        t_format = "%a %b %d %H:%M:%S %Z %Y"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)

        # author
        author = soup.select('meta[name="Author"]')[0]["content"]
        news.set_author(author)

        # url
        news.set_url(url)

        # date (ingestion time, UTC)
        date = datetime.utcnow().isoformat()
        news.set_date(date)

        # source
        source = "lta_reuters"
        news.set_source(source)

        # content, encoding, id, country, labels
        paragraphs = soup.find(id="resizeableText").find_all("p")
        content = " ".join([unicode(p.text) for p in paragraphs])
        news.set_content(content)

        # encoding
        encoding = "utf-8"
        news.set_encoding(encoding)

        news.news = message.add_embers_ids(news.news)
        return news.news
    except Exception:
        log.exception("Exception when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
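# Both get_news_by_url() scrapers above rely on a News helper class defined elsewhere in the
# codebase. The minimal sketch below is an assumption, not the real implementation: each set_*
# call is taken to populate a key of the underlying .news dict that message.add_embers_ids()
# later stamps with an embersId, and the exact key names are guesses.
class News(object):
    def __init__(self):
        self.news = {}

    def set_title(self, title):
        self.news["title"] = title

    def set_posttime(self, post_time):
        self.news["postTime"] = post_time

    def set_author(self, author):
        self.news["author"] = author

    def set_url(self, url):
        self.news["url"] = url

    def set_date(self, date):
        self.news["date"] = date

    def set_source(self, source):
        self.news["source"] = source

    def set_content(self, content):
        self.news["content"] = content

    def set_encoding(self, encoding):
        self.news["encoding"] = encoding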
def createSurrogate(self, winner, winningScore, runnerUp, runnerUpScore, pushFlag):
    surrogate = {}
    surrogate["date"] = datetime.utcnow().isoformat('T')
    surrogate["scores"] = self.scoreCard
    surrogate["model"] = "VoteSentimentEvolutionModel"
    surrogate["derivedFrom"] = {}
    surrogate["derivedFrom"]["derivedIds"] = self.tweetList
    surrogate["derivedFrom"]["location"] = [self.country.title(), self.state.title(), self.city.title()]
    surrogate["derivedFrom"]["source"] = "Raw Twitter feed from DataSift and election config Files."
    surrogate["derivedFrom"]["comments"] = "tweets were filtered by country then state and then by those containing the terms of candidates"
    surrogate["derivedFrom"]["model"] = __processor__
    surrogate["confidence"] = 1.00
    surrogate["confidenceIsProbability"] = True
    surrogate["configuration"] = self.configJson
    surrogate = message.add_embers_ids(surrogate)
    log.info("surrogate--->\n%s" % surrogate)

    # writing surrogate to file, named after the most specific location available
    if self.state == '-':
        surrogateFileName = "surrogate_" + self.country + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    elif self.city == '-':
        surrogateFileName = "surrogate_" + self.state + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    else:
        surrogateFileName = "surrogate_" + self.city + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    with open("../data/surrogates/" + surrogateFileName, "w") as f:
        f.write(json.dumps(surrogate))
    log.info("surrogate written to file")

    # writing files to s3 (optional)
    if pushFlag == '1':
        try:
            with open("../data/surrogates/" + surrogateFileName, 'r') as f:
                pushFileToS3(f, 'surrogates/elections/' + surrogateFileName)
            log.info("surrogate pushed to s3")
        except Exception:
            log.exception("surrogate push to S3 failed!! push manually!!")

    # Emit the corresponding warning, linked to this surrogate by its embersId.
    self.createWarning(surrogate["embersId"], winner, winningScore, runnerUp, runnerUpScore, pushFlag)
    return
def get_news_by_url(url):
    article = {}
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))

        # title
        title = ""
        titleElements = soup.findAll(id="disqus_title")
        for ele in titleElements:
            title = ele.getText().encode('utf-8')
        article["title"] = title

        # get article timestamps; the epoch attribute is in milliseconds
        postTime = ""
        postTimeElements = soup.findAll(attrs={'class': "datestamp"})
        for ele in postTimeElements:
            timeStamp = float(ele["epoch"])
            postTime = datetime.fromtimestamp(timeStamp / 1000)
        postTimeStr = postTime.isoformat()
        article["postTime"] = postTimeStr

        # get date (should be part of the time?)
        postDay = postTime.date()
        article["postDate"] = postDay.strftime("%Y-%m-%d")

        # author
        author = ""
        authorElements = soup.findAll(attrs={'class': "byline"})
        for ele in authorElements:
            author = ele.contents[0].strip().replace("By", "").replace("-", "").replace("and", ",").strip()
        article["author"] = author

        # content - FIXME: extractor undefined, so take the whole body text for now
        content = soup.body.get_text()
        article["content"] = content

        # source info
        source = "Bloomberg News"
        article["source"] = source

        # time stamp
        updateTime = datetime.utcnow().isoformat()
        article["updateTime"] = updateTime
        article["date"] = updateTime  # the message format specified field

        # date? why are dates and times separate?
        updateDate = datetime.strftime(datetime.utcnow(), "%Y-%m-%d")
        article["updateDate"] = updateDate

        article["url"] = url
        article = message.add_embers_ids(article)
    except KeyboardInterrupt:
        raise
    except Exception:
        log.exception("Could not ingest %s" % (url,))
        return {}
    log.debug("Successfully ingested %s" % (url,))
    return article