def matchAllKeywords(text, splitCharacter = " "):
    """Return the current keyword entries that are found in text.

    text is lower-cased once. Each entry returned by
    constants.getAllCurrentKeywords() is lower-cased, split on
    splitCharacter, and handed to matchOneKeywords together with the
    lower-cased text. Entries for which matchOneKeywords answers truthy are
    returned in their original (not lower-cased) form, in the order
    getAllCurrentKeywords() produced them.

    Per the original author's note, the order of the words inside a keyword
    entry is ignored; that behaviour lives in matchOneKeywords.
    """
    loweredText = text.lower()
    return [
        entry
        for entry in constants.getAllCurrentKeywords()
        if matchOneKeywords(loweredText, entry.lower().split(splitCharacter))
    ]
def _readUrl(url, headers):
    """Download url with the given request headers and return the raw body.

    The response is closed explicitly so the connection is released even if
    read() raises.
    """
    response = urllib2.urlopen(urllib2.Request(url, None, headers))
    try:
        return response.read()
    finally:
        response.close()


def updateArticleKeywords():
    """Count keyword mentions across the articles linked from the news index.

    Steps:
      1. Download the barchart news index and collect every link whose href
         matches "headlines" (duplicates removed).
      2. Download each linked page and test the UTF-8 encoded markup of its
         <p> tags against every entry of constants.getAllCurrentKeywords()
         using findStrings.keywordsInString.
      3. Record each matching keyword at most once per page, then write the
         number of pages per keyword with dataManage.writeArticleCount,
         stamped with the time taken right after the index was downloaded.

    Returns:
        A list holding one entry per (page, matched keyword) pair, i.e. a
        keyword appears as many times as there are pages mentioning it.

    Network or parsing errors are not caught here and propagate to the
    caller.
    """
    urlRoot = 'http://www.barchart.com/futures/news/if'
    headers = {'User-Agent': 'Mozilla'}
    topPage = _readUrl(urlRoot, headers)
    timeNow = arrow.utcnow()
    parsed = BeautifulSoup(topPage)

    # A set removes duplicate links as they are collected.
    allLinks = set()
    for urlTag in parsed.find_all(href=re.compile("headlines")):
        allLinks.add(str('http://www.barchart.com' + urlTag['href']))

    allKeywords = []
    for urlLink in allLinks:
        print(urlLink)
        parsedSub = BeautifulSoup(_readUrl(urlLink, headers))

        # Encode every paragraph once per page instead of once per keyword.
        # The whole tag is encoded on purpose (original note: don't use
        # tag.string).
        paragraphs = [tag.encode('utf-8') for tag in parsedSub.find_all("p")]

        theseKeywords = []
        alreadyMatched = set()
        for keywords in constants.getAllCurrentKeywords():
            if keywords in alreadyMatched:
                continue
            # any() stops at the first paragraph that mentions the keyword,
            # so a keyword is recorded at most once per page.
            if any(findStrings.keywordsInString(paragraph, keywords)
                   for paragraph in paragraphs):
                alreadyMatched.add(keywords)
                theseKeywords.append(keywords)

        allKeywords.extend(theseKeywords)
        print(theseKeywords)

    # Save out the data: one count per keyword = number of pages matched.
    for keys, value in Counter(allKeywords).most_common():
        print("{} -> {}".format(keys, value))
        dataManage.writeArticleCount(keys, timeNow, value)
    return allKeywords
def initializeMemory(self):
    """Reset this object's creation time and per-keyword counters.

    Sets self.timeCreated to the current UTC time from arrow.utcnow() and
    rebuilds self.keywordsCounts as a dict mapping every entry of
    constants.getAllCurrentKeywords() to 0.
    """
    self.timeCreated = arrow.utcnow()
    # Bind the empty dict first, then fill it, so the attribute exists even
    # if fetching the keywords fails.
    self.keywordsCounts = {}
    self.keywordsCounts.update(
        (entry, 0) for entry in constants.getAllCurrentKeywords()
    )
# coding: utf-8
import ahocorasick
import constants
import re

# Scratch script comparing two ways of spotting the current keywords inside
# one sample paragraph:
#   1. an Aho-Corasick keyword tree built from the individual words of every
#      keyword entry (entries are '+'-separated word lists);
#   2. a single regular expression that matches when the words of any one
#      keyword entry occur in order.

allKeywords = constants.getAllCurrentKeywords()

# --- 1. Aho-Corasick: report every occurrence of every single word ---------
tree = ahocorasick.KeywordTree()
for keyword in allKeywords:
    for word in keyword.split('+'):
        if word:  # skip empty pieces produced by leading/trailing/double '+'
            tree.add(word)
tree.make()

tag = '<p> Crude oil does have some bullish fundamentals with craziness going on in Iraq and the fact that economies around the world are improving especially here in the United States with the unemployment number coming out adding another 288,000 jobs as high gas prices are here to stay in my opinion, but as a trader I have to look for a breakout to enter or exit while right now the trend is neutral. </p>'

for (start, end) in tree.findall(tag):
    print(tag[start:end])

# --- 2. Regex: one alternative per keyword entry ---------------------------
# Each entry 'a+b' becomes '.*a.*b.*'. Words are escaped so that regex
# metacharacters inside a keyword are matched literally. The alternatives are
# joined with '|' so the compiled pattern covers ALL keywords; compiled
# pattern objects have no .compile() method, so patterns cannot be added to
# an existing matcher after the fact.
keywordPatterns = [
    '.*' + '.*'.join(re.escape(word) for word in keywords.split('+')) + '.*'
    for keywords in allKeywords
]
reAll = '|'.join(keywordPatterns)

# With no keywords the joined pattern would be empty and match everywhere,
# so matching is skipped in that case.
matcher = re.compile(reAll) if keywordPatterns else None
if matcher is not None:
    for m in matcher.finditer(tag):
        print('%02d-%02d: %s' % (m.start(), m.end(), m.group(0)))
import matplotlib import constants import urllib2 import re import time import constants import arrow from datetime import datetime import BeautifulSoup import dataManage import findStrings from collections import Counter # searches the articles at bargraph and extracts the mentions of commoditie # first go to and scrape http://www.barchart.com/futures/marketoverview #<td><a href="/headlines/story/526332/how-low-are-cotton-prices-going"><img class="thumbnail" src="https://s3.amazonaws.com/news-media/IF/b141b0d62ad234bfac0c177ca0bd9fc2/mseery.jpg"></a><div class="headline"><h1><a href="/headlines/story/526332/how-low-are-cotton-prices-going">How Low Are Cotton Prices Going?</a></h1></div><div class="byline">Michael #Seery - Seery Futures - Thu Jul 03, 2:50PM CDT</div>Great weather equals lower prices (<a href="/headlines/story/526332/how-low-are-cotton-prices-going">full story</a>)<br class="clr"></td> allKeywordsList = constants.getAllCurrentKeywords() def updateArticleKeywords(): urlRoot = 'http://www.barchart.com/futures/news/if' headers = {'User-Agent': 'Mozilla'} requestRoot = urllib2.Request(urlRoot, None, headers) topPage = urllib2.urlopen(requestRoot).read() parsed = BeautifulSoup(topPage) allLinks = [] for urlTag in parsed.find_all(href=re.compile("headlines")): urlLink = str('http://www.barchart.com' + urlTag['href']) allLinks.append(urlLink) allMatchedKeywords = list() #for urlLink in ['http://www.barchart.com/headlines/story/546163/keep-selling-soybeans-in-my-opinion']: for urlLink in list(set(allLinks)): #remove duplicates