示例#1
0
def matchAllKeywords(text, splitCharacter = " "):
    """Return the current keyword entries that match ``text``.

    ``text`` is lower-cased once. Each entry from
    ``constants.getAllCurrentKeywords()`` is lower-cased, split on
    ``splitCharacter`` and handed to ``matchOneKeywords`` together with the
    lower-cased text; entries for which that call is truthy are returned in
    their original (un-lowered) form, in the order the constants module
    yields them.

    NOTE(review): the original comments state that the order of the words
    inside an entry is ignored; that depends on ``matchOneKeywords``, which
    is defined elsewhere - confirm there.
    """
    loweredText = text.lower()
    return [
        entry
        for entry in constants.getAllCurrentKeywords()
        if matchOneKeywords(loweredText, entry.lower().split(splitCharacter))
    ]
def updateArticleKeywords():
    """Scrape the barchart.com futures news index and count keyword mentions.

    Steps:
      1. Download the news index page and collect every link whose ``href``
         matches "headlines" (each distinct link once, in page order).
      2. Download each linked article and test every ``<p>`` element against
         every entry of ``constants.getAllCurrentKeywords()`` using
         ``findStrings.keywordsInString``.
      3. For every keyword entry, store the number of articles that mention
         it via ``dataManage.writeArticleCount(keyword, timeNow, count)``,
         where ``timeNow`` is taken right after the index page is fetched.

    Returns:
        A list holding each matched keyword entry once per article that
        mentions it (so an entry found in three articles appears three
        times).
    """
    urlRoot = 'http://www.barchart.com/futures/news/if'
    headers = {'User-Agent': 'Mozilla'}

    def fetchPage(url):
        # Read the full body and always release the connection, even when
        # read() raises.
        response = urllib2.urlopen(urllib2.Request(url, None, headers))
        try:
            return response.read()
        finally:
            response.close()

    topPage = fetchPage(urlRoot)
    timeNow = arrow.utcnow()
    parsed = BeautifulSoup(topPage)

    # Keep each headline link once; remembering what was already seen gives
    # a deterministic fetch order instead of arbitrary set ordering.
    allLinks = []
    seenLinks = set()
    for urlTag in parsed.find_all(href=re.compile("headlines")):
        urlLink = str('http://www.barchart.com' + urlTag['href'])
        if urlLink not in seenLinks:
            seenLinks.add(urlLink)
            allLinks.append(urlLink)

    allKeywords = []
    for urlLink in allLinks:
        print(urlLink)
        parsedSub = BeautifulSoup(fetchPage(urlLink))
        # Encode the whole tag (not tag.string) once per paragraph rather
        # than once per paragraph *and* keyword.
        paragraphs = [tag.encode('utf-8') for tag in parsedSub.find_all("p")]

        theseKeywords = []
        for keywords in constants.getAllCurrentKeywords():
            if keywords in theseKeywords:
                # A repeated entry must still count only once per article.
                continue
            # any() stops at the first matching paragraph.
            if any(findStrings.keywordsInString(paragraph, keywords)
                   for paragraph in paragraphs):
                theseKeywords.append(keywords)

        allKeywords.extend(theseKeywords)
        print(theseKeywords)

    # save out the data, count duplicates (= number of articles per keyword)
    for keys, value in Counter(allKeywords).most_common():
        print("{} -> {}".format(keys, value))
        dataManage.writeArticleCount(keys, timeNow, value)
    return allKeywords
示例#3
0
 def initializeMemory(self):
     """Stamp the creation time and start every current keyword at a count of 0.

     Sets ``self.timeCreated`` to ``arrow.utcnow()`` and rebuilds
     ``self.keywordsCounts`` as a dict with one zero-valued entry per item
     of ``constants.getAllCurrentKeywords()``.
     """
     self.timeCreated = arrow.utcnow()
     self.keywordsCounts = {}
     self.keywordsCounts.update(
         (entry, 0) for entry in constants.getAllCurrentKeywords()
     )
示例#4
0
# coding: utf-8
import ahocorasick
import constants
import re

# Experiment: locate keyword mentions in a sample paragraph, first with an
# Aho-Corasick keyword tree and then with regular expressions.
allKeywords = constants.getAllCurrentKeywords()

# Each keyword entry is a '+'-separated group of words; index every
# non-empty word on its own.
tree = ahocorasick.KeywordTree()
for keyword in allKeywords:
    for word in keyword.split('+'):
        if word:
            tree.add(word)
tree.make()

tag = '<p> Crude oil does have some bullish fundamentals with craziness going on in Iraq and the fact that economies around the world are improving especially here in the United States with the unemployment number coming out adding another 288,000 jobs as high gas prices are here to stay in my opinion, but as a trader I have to look for a breakout to enter or exit while right now the trend is neutral.  </p>'

# Print every indexed word that occurs in the sample text.
for (start, end) in tree.findall(tag):
    print(tag[start:end])

# One regex alternative per keyword entry: the '+'-separated words must
# appear in that order with arbitrary text between them. The alternatives
# are OR-ed into a single pattern so that every entry is tried; compiled
# pattern objects have no compile() method, so the patterns have to be
# combined before compiling.
# NOTE(review): keyword text is inserted into the regex unescaped - confirm
# that entries never contain regex metacharacters.
keywordPatterns = ['.*' + keywords.replace('+', '.*') + '.*'
                   for keywords in allKeywords]
reAll = '|'.join(keywordPatterns)
matcher = re.compile(reAll)

# With no keywords the pattern is empty and would match at every position,
# so only scan when there is at least one alternative.
if keywordPatterns:
    for m in matcher.finditer(tag):
        print('%02d-%02d: %s' % (m.start(), m.end(), m.group(0)))
# print r
# for one in r:
import matplotlib
import constants
import urllib2
import re
import time
import constants
import arrow
from datetime import datetime
import BeautifulSoup
import dataManage
import findStrings
from collections import Counter
# searches the articles at barchart and extracts the mentions of commodities
# first go to and scrape http://www.barchart.com/futures/marketoverview
#<td><a href="/headlines/story/526332/how-low-are-cotton-prices-going"><img class="thumbnail" src="https://s3.amazonaws.com/news-media/IF/b141b0d62ad234bfac0c177ca0bd9fc2/mseery.jpg"></a><div class="headline"><h1><a href="/headlines/story/526332/how-low-are-cotton-prices-going">How Low Are Cotton Prices Going?</a></h1></div><div class="byline">Michael #Seery - Seery Futures - Thu Jul 03,  2:50PM CDT</div>Great weather equals lower prices (<a href="/headlines/story/526332/how-low-are-cotton-prices-going">full story</a>)<br class="clr"></td>
# Keyword entries returned by constants.getAllCurrentKeywords(), fetched
# once when this module-level statement runs.
allKeywordsList = constants.getAllCurrentKeywords()


def updateArticleKeywords():
    urlRoot = 'http://www.barchart.com/futures/news/if'
    headers = {'User-Agent': 'Mozilla'}
    requestRoot = urllib2.Request(urlRoot, None, headers)
    topPage = urllib2.urlopen(requestRoot).read()
    parsed = BeautifulSoup(topPage)
    allLinks = []
    for urlTag in parsed.find_all(href=re.compile("headlines")):
        urlLink = str('http://www.barchart.com' + urlTag['href'])
        allLinks.append(urlLink)
    allMatchedKeywords = list()
    #for urlLink in ['http://www.barchart.com/headlines/story/546163/keep-selling-soybeans-in-my-opinion']:
    for urlLink in list(set(allLinks)):  #remove duplicates