Example #1
    def createWarning(self, surrogateId, winner, winningScore, runnerUp, runnerUpScore, pushFlag):
        warning = {}
        comment = "Election model v6"
        warning["date"] = datetime.utcnow().isoformat('T')
        warning["derivedFrom"] = {}
        warning["derivedFrom"]["derivedIds"] = [surrogateId]
        warning["derivedFrom"]["location"] = [self.country.title(),
                                              self.state.title(),
                                              self.city.title()]
        warning["derivedFrom"]["source"] = "Raw Twitter feed from DataSift and election config Files."
        warning["derivedFrom"]["model"] = __processor__
        warning["model"] = "VoteSentimentEvolutionModel v6"
        electionType = ''
        if self.electionType in ['president', 'prime minister']:
            electionType = 'President/Prime Minister'
        else:
            electionType = self.electionType
        warning["eventType"] = ["Vote", "Election", electionType.title()]
        warning["confidence"] = round(winningScore / 100, 2)  # in cases of very close run races
        # warning["confidence"] = 1.00
        warning["confidenceIsProbability"] = True
        warning["eventDate"] = self.electionDate
        warning["population"] = winner.title()
        warning["location"] = [self.country.title(), self.state.title(),
                               self.city.title()]
        comment = "Winner: " + winner + " Score: " + str(winningScore) + " Runner-Up: " + runnerUp + " Score: " + str(runnerUpScore)
        if self.runOff == 'Y':
            comment = "Elections going to 2nd round with candidates: " + winner + ' and ' + runnerUp
        warning["comments"] = comment
        warning['eventCode'] = '02'
        if self.electionType == 'mayor':
            warning['eventCode'] = '0213'
        elif self.electionType in ['president', 'prime minister']:
            warning['eventCode'] = '0211'
        elif self.electionType == 'governor':
            warning['eventCode'] = '0212'
        warning = message.add_embers_ids(warning)
        log.info("warning-->\n%s" % warning)

        # writing warning to file
        if self.state == '-':
            warningFileName = "warning_" + self.country + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        elif self.city == '-':
            warningFileName = "warning_" + self.state + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        else:
            warningFileName = "warning_" + self.city + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        with open("../data/warnings/" + warningFileName, "w") as f:
            f.write(json.dumps(warning))
        log.info("warning written to file")

        # writing files to s3
        if pushFlag == '1':
            try:
                with open("../data/warnings/" + warningFileName, 'r') as f:
                    pushFileToS3(f, 'incoming/predictions/elections/' +
                                 warningFileName)
                log.info("warning pushed to s3")
            except Exception:
                log.exception("warning push to S3 failed!! push manually!!")
        return
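
A side note on the eventCode branches above: the mapping is fixed, so the same logic can be table-driven. A minimal standalone sketch (the codes come from the branches in createWarning; the EVENT_CODES table and event_code helper are hypothetical names, not part of the original module):

EVENT_CODES = {
    'mayor': '0213',
    'president': '0211',
    'prime minister': '0211',
    'governor': '0212',
}

def event_code(election_type):
    # '02' is the generic fallback createWarning uses when no branch matches
    return EVENT_CODES.get(election_type, '02')

assert event_code('governor') == '0212'
assert event_code('senator') == '02'
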
Example #2
def ingest_price(arg, stock, scrape_f):
    # build the quote URL for the 30-day window ending on the requested date (arg.d)
    s_date = datetime.strftime(datetime.strptime(arg.d, "%Y-%m-%d") + timedelta(days=-30), STOCK_CON[stock]["tFormat"])
    e_date = datetime.strftime(datetime.strptime(arg.d, "%Y-%m-%d"), STOCK_CON[stock]["tFormat"])
    url = STOCK_CON[stock]["urlStr"] % (s_date, e_date)

    post_time, last_price, previous_last_price = scrape_f(url)
    if post_time == "n/a":
        # the scraper signals a failed fetch or parse with "n/a"
        return None

    nowstr = datetime.utcnow().isoformat()

    # create json message
    msg = {
        "previousCloseValue": previous_last_price,
        "date": post_time,
        "queryTime": nowstr,
        "originalUpdateTime": post_time,
        "name": stock,
        "feed": STOCK_CON[stock]["feed"],
        "currentValue": last_price,
        "type": STOCK_CON[stock]["type"],
    }

    msg = message.add_embers_ids(msg)
    return msg
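
ingest_price reads exactly four fields from each STOCK_CON entry: tFormat, urlStr, feed, and type. The configuration itself is defined elsewhere in the project; a hypothetical entry with made-up values, just to show the shape the function expects:

STOCK_CON = {
    "EXAMPLE_INDEX": {                      # hypothetical key; real keys name actual instruments
        "tFormat": "%m/%d/%Y",              # date format the quote source expects
        "urlStr": "http://example.com/quotes?start=%s&end=%s",  # filled with (s_date, e_date)
        "feed": "example-price-feed",
        "type": "stock index",
    },
}
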
Example #3
def get_news_by_url(url):
    news = News()
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))

        #title
        title = soup.find("div", "pg-story-head md").find("h2").text
        news.set_title(title)

        # postTime: the dateline looks like "author / 12 mar 2013, 10:45 pm" after lowercasing
        author_posttime = soup.find("p", "dateline").text.replace("\n", "").lower().replace("\t", "").split("/")
        # keep the am/pm marker: %I is a 12-hour clock and needs %p to disambiguate it
        post_time = author_posttime[1].strip()

        t_format = "%d %b %Y, %I:%M %p"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)

        #author
        author = author_posttime[0]
        news.set_author(author)

        #url
        news.set_url(url)

        #date
        date = datetime.utcnow().isoformat()
        news.set_date(date)

        #source
        source = 'elfinancierocr'
        news.set_source(source)

        #content, encoding, id, country, labels
        paragraphs = soup.find("div", "pg-story-body mce").find_all('p')
        content = " ".join([unicode(p.text) for p in paragraphs])
        news.set_content(content)

        #encoding
        encoding = 'utf-8'
        news.set_encoding(encoding)

        news.news = message.add_embers_ids(news.news)

        return news.news
    except Exception:
        log.exception("Exception when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
Example #4
def get_news_by_url(url):
    news = News()
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))

        # title
        title = soup.find_all("h1")[0].text
        news.set_title(title)

        # postTime
        post_time = soup.select('meta[name="REVISION_DATE"]')[0]["content"]
        t_format = "%a %b %d %H:%M:%S %Z %Y"
        post_time = datetime.strptime(post_time, t_format).isoformat()
        news.set_posttime(post_time)

        # author
        author = soup.select('meta[name="Author"]')[0]["content"]
        news.set_author(author)

        # url
        news.set_url(url)

        # date
        date = datetime.utcnow().isoformat()
        news.set_date(date)

        # source
        source = "lta_reuters"
        news.set_source(source)

        # content, encoding, id, country, labels
        paragraphs = soup.find(id="resizeableText").find_all("p")
        content = " ".join([unicode(p.text) for p in paragraphs])
        news.set_content(content)

        # encoding
        encoding = "utf-8"
        news.set_encoding(encoding)

        news.news = message.add_embers_ids(news.news)

        return news.news
    except Exception:
        log.exception("Exception when extracting %s %s" % (url, sys.exc_info()[0]))
        return None
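
Examples #3 and #4 share the same scraping skeleton; what changes per site is mostly the timestamp format string. A standalone check of the two formats used above (the sample strings are invented; only the format strings come from the scrapers):

from datetime import datetime

# Example #3's dateline time, e.g. "12 mar 2013, 10:45 pm" (%b and %p match case-insensitively)
print(datetime.strptime("12 mar 2013, 10:45 pm", "%d %b %Y, %I:%M %p").isoformat())
# -> 2013-03-12T22:45:00

# Example #4's REVISION_DATE value, e.g. "Tue Mar 12 10:45:00 UTC 2013"
print(datetime.strptime("Tue Mar 12 10:45:00 UTC 2013", "%a %b %d %H:%M:%S %Z %Y").isoformat())
# -> 2013-03-12T10:45:00
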
Example #5
    def createSurrogate(self, winner, winningScore, runnerUp, runnerUpScore, pushFlag):
        surrogate = {}
        surrogate["date"] = datetime.utcnow().isoformat('T')
        surrogate["scores"] = self.scoreCard
        surrogate["model"] = "VoteSentimentEvolutionModel"
        surrogate["derivedFrom"] = {}
        surrogate["derivedFrom"]["derivedIds"] = self.tweetList
        surrogate["derivedFrom"]["location"] = [self.country.title(),
                                                self.state.title(),
                                                self.city.title()]
        surrogate["derivedFrom"]["source"] = "Raw Twitter feed from DataSift and election config Files."
        surrogate["derivedFrom"]["comments"] = "tweets were filtered by country then state and then by those containing the terms of candidates"
        surrogate["derivedFrom"]["model"] = __processor__
        surrogate["confidence"] = 1.00
        surrogate["confidenceIsProbability"] = True
        surrogate["configuration"] = self.configJson
        surrogate = message.add_embers_ids(surrogate)
        log.info("surrogate--->\n%s" % surrogate)

        # writing surrogate to file
        if self.state == '-':
            surrogateFileName = "surrogate_" + self.country + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        elif self.city == '-':
            surrogateFileName = "surrogate_" + self.state + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        else:
            surrogateFileName = "surrogate_" + self.city + "_" + datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
        with open("../data/surrogates/" + surrogateFileName, "w") as f:
            f.write(json.dumps(surrogate))
        log.info("surrogate written to file")

        # writing files to s3
        if pushFlag == '1':
            try:
                with open("../data/surrogates/" + surrogateFileName, 'r') as f:
                    pushFileToS3(f, 'surrogates/elections/' + surrogateFileName)
                log.info("surrogate pushed to s3")
            except Exception:
                log.exception("surrogate push to S3 failed!! push manually!!")
        self.createWarning(surrogate["embersId"], winner, winningScore, runnerUp, runnerUpScore, pushFlag)
        return
Example #6
def get_news_by_url(url):
    article = {}
    try:
        soup = BeautifulSoup(urllib2.urlopen(url))
        # title (if several disqus_title elements match, the last one wins)
        title = ""
        titleElements = soup.findAll(id="disqus_title")
        for ele in titleElements:
            title = ele.getText().encode('utf-8')
        article["title"] = title
        
        # get article timestamps (the datestamp element stores epoch milliseconds)
        timeStamp = None
        postTimeElements = soup.findAll(attrs={'class': "datestamp"})
        for ele in postTimeElements:
            timeStamp = float(ele["epoch"])
        if timeStamp is None:
            raise ValueError("no datestamp element found")
        postTime = datetime.fromtimestamp(timeStamp / 1000)
        postTimeStr = postTime.isoformat()
        article["postTime"] = postTimeStr
        
        # get date (should be part of the time?)
        postDay = postTime.date()
        article["postDate"] = postDay.strftime("%Y-%m-%d")
        
        # author
        author = ""
        authorElements = soup.findAll(attrs={'class':"byline"})
        for ele in authorElements:
            author = ele.contents[0].strip().replace("By", "").replace("-", "").replace("and", ",").strip()
        article["author"] = author
        
        # content - FIXME - Extractor undefined
        content = soup.body.get_text()
        article["content"] =  content
        
        # source info
        source = "Bloomberg News"
        article["source"] = source
        
        # time stamp
        updateTime = datetime.utcnow().isoformat()
        article["updateTime"] = updateTime
        # the message format specified field
        article["date"] = updateTime
        
        # date? why are dates and times separate?
        updateDate = datetime.strftime(datetime.utcnow(),"%Y-%m-%d")
        article["updateDate"] = updateDate

        article["url"] =  url        
        article = message.add_embers_ids(article)
        
    except KeyboardInterrupt:
        raise

    except Exception:
        log.exception("Could not ingest %s" % (url,))
        return {}

    log.debug("Successfully ingested %s" % (url,))
    return article
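
Every example funnels its payload through message.add_embers_ids before returning it, and Example #5 reads the resulting surrogate["embersId"] field. The real implementation lives in the project's message module and is not shown here; a minimal stand-in, assuming the id is simply a deterministic digest of the payload:

import hashlib
import json

def add_embers_ids(msg):
    # hypothetical stand-in for message.add_embers_ids: attach an
    # "embersId" derived from the payload; the real function may differ
    payload = json.dumps(msg, sort_keys=True)
    msg["embersId"] = hashlib.sha1(payload.encode("utf-8")).hexdigest()
    return msg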