Example #1
def ft():
    word_matrix, word_vec, article_titles = make_matrix()
    print 'task begin'
    weight_matrix, feature_matrix = factorize(word_matrix, 30, 300)

    np.savetxt('data/weight_matrix.txt', weight_matrix)
    np.savetxt('data/feature_matrix.txt', feature_matrix)
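Every example on this page calls an nmf.factorize helper that is not shown here. A minimal sketch of what such a function typically looks like, loosely following the multiplicative-update NMF from Programming Collective Intelligence (note the iteration-count keyword differs between snippets: iter, iterations, max_iter):

import numpy as np

def factorize(v, pc=10, iterations=50):
    # v is a non-negative (rows x cols) matrix, pc is the number of features
    rows, cols = np.shape(v)
    w = np.matrix(np.random.random((rows, pc)))    # weights: rows x features
    h = np.matrix(np.random.random((pc, cols)))    # features: features x cols
    for _ in range(iterations):
        wh = w * h
        cost = np.sum(np.power(v - wh, 2))         # squared reconstruction error
        if cost == 0:
            break
        # Lee & Seung multiplicative update rules
        h = np.matrix(np.array(h) * np.array(w.T * v) / np.array(w.T * w * h))
        w = np.matrix(np.array(w) * np.array(v * h.T) / np.array(w * h * h.T))
    return w, h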
Example #2
def stock_volumes(tickers):
    shortest = 300
    prices = {}
    dates = None
    allporder = []
    allols = []

    for t in tickers:
        # Open the URL
        print 't=', t
        rows=urllib2.urlopen('http://ichart.finance.yahoo.com/table.csv?'+\
                             's=%s&d=11&e=26&f=2006&g=d&a=3&b=12&c=2005' %t +\
                             '&ignore=.csv').readlines()
        # Extract the volume field from every line
        prices[t] = [
            float(r.split(',')[5]) for r in rows[1:] if r.strip() != ''
        ]

        if len(prices[t]) < shortest:
            shortest = len(prices[t])

        if not dates:
            dates = [r.split(',')[0] for r in rows[1:] if r.strip() != '']

    l1 = [[prices[tickers[i]][j] for i in range(len(tickers))]
          for j in range(shortest)]

    w, h = nmf.factorize(matrix(l1), pc=5)

    # Loop over all the features
    for i in range(shape(h)[0]):
        print "Feature %d" % i

        # Get the top stocks for this feature
        ol = [(h[i, j], tickers[j]) for j in range(shape(h)[1])]
        ol.sort()
        ol.reverse()

        for j in range(len(tickers)):
            print ol[j]

        allols.append(ol)
        print

        # Show the top dates for this feature
        porder = [(w[d, i], d) for d in range(300)]
        porder.sort()
        porder.reverse()
        f = [(p[0], dates[p[1]]) for p in porder[0:3]]
        print f
        allporder.append(f)
        print

    frog = raw_input('press <ENTER> to continue')
    return allporder
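The ichart.finance.yahoo.com CSV endpoint used above has since been retired, so the download no longer works as written. A hedged sketch of building the same date-by-ticker volume matrix from locally saved CSV files instead (the data/%s.csv path and the Date,Open,High,Low,Close,Volume,Adj Close column order are assumptions):

import csv

def volumes_from_csv(tickers, path_template='data/%s.csv'):
    prices, dates, shortest = {}, None, None
    for t in tickers:
        with open(path_template % t) as f:
            rows = [r for r in csv.reader(f) if r][1:]   # drop the header row
        prices[t] = [float(r[5]) for r in rows]          # volume column
        if shortest is None or len(prices[t]) < shortest:
            shortest = len(prices[t])
        if dates is None:
            dates = [r[0] for r in rows]                 # date column
    # rows are dates, columns are tickers, as in stock_volumes above
    l1 = [[prices[t][j] for t in tickers] for j in range(shortest)]
    return l1, dates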
Example #3
def factorize(htmls, features=5, max_iter=50):
    print "### start make data ###"
    allwords, articlewords, articletitles = getarticlewords(htmls)
    print "### end make data ###"
    print "### start make matrix ###"
    wordmatrix, wordvec = makematrix(allwords, articlewords)
    v = matrix(wordmatrix)
    print "### end ###"
    print "### start nmf calc ###"
    weights, feat = nmf.factorize(v, pc=features, max_iter=max_iter)
    print "### end nmf calc ##"
    return allwords, articlewords, articletitles, wordmatrix, wordvec, weights, feat
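A hypothetical usage of the function above, printing the strongest words of each extracted feature (htmls is whatever page collection the caller already has; feat is assumed to be a features x words matrix, as in the other snippets on this page):

allwords, articlewords, articletitles, wordmatrix, wordvec, weights, feat = factorize(htmls, features=5)
for i in range(feat.shape[0]):
    scored = sorted(((feat[i, j], wordvec[j]) for j in range(feat.shape[1])), reverse=True)
    print([word for _, word in scored[:6]])   # six strongest words of feature i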
Example #4
def read_and_nmf(input_file):
    """
    :param input_file: file to be read
    :return: w (components from nmf)
    """
    (rate, data) = read(input_file)
    bee_data = (data[:, 0] + data[:, 1]) / 2.0
    if np.amin(bee_data) < -1 or np.amax(bee_data) > 1:
        bee_data /= float(max(abs(np.amax(bee_data)), abs(np.amin(bee_data))))
    T = len(bee_data) / fs
    X = transform.stft(bee_data, fs, framesz, hop)
    M = abs(X)
    w, h = nmf.factorize(M, pc=NUM_COMPONENTS, iterations=ITERATIONS)
    return w
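The snippet above relies on module-level settings (fs, framesz, hop, NUM_COMPONENTS, ITERATIONS) and a project-specific transform.stft helper that are not shown. A hedged sketch of producing the same magnitude spectrogram M with scipy instead; the file name and analysis settings below are assumptions:

import numpy as np
from scipy.io.wavfile import read
from scipy.signal import stft

fs, framesz, hop = 44100, 4096, 2048        # assumed sample rate, frame size, hop size
rate, data = read('hive_recording.wav')     # hypothetical input file
mono = data.mean(axis=1) if data.ndim > 1 else data.astype(float)
_, _, X = stft(mono, fs=rate, nperseg=framesz, noverlap=framesz - hop)
M = np.abs(X)                               # magnitude spectrogram fed to nmf.factorize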
Example #6
def testNewsFeatures():
    # allWords, articleWords, articleTitles = getArticleWords()
    # saveNews(allWords, articleWords, articleTitles)
    allWords, articleWords, articleTitles = loadNews()

    wordMatrix, wordVec = makeMatrix(allWords, articleWords)
    print wordVec[0: 10]
    print articleTitles[1]
    print wordMatrix[1][0: 10]

    def wordMatrixFeatures(counts):
        return [wordVec[i] for i in range(len(counts)) if counts[i] > 0]

    print
    print wordMatrixFeatures(wordMatrix[0])

    print
    from pci.ch06 import docClass

    classifier = docClass.NaiveBayes(wordMatrixFeatures)
    classifier.setDB('news.db')

    print articleTitles[0]
    # Train this as an 'government' story
    classifier.train(wordMatrix[0], 'government')

    print articleTitles[1]
    # Train this as an 'market' story
    classifier.train(wordMatrix[1], 'market')

    print articleTitles[2]
    # How is this story classified?
    print classifier.classify(wordMatrix[2])

    print
    from pci.ch03 import clusters

    clust = clusters.hCluster(wordMatrix)
    clusters.drawDendrogram(clust, articleTitles, jpeg='news.jpg')

    print
    import nmf

    weights, features = nmf.factorize(matrix(wordMatrix), 20, 50)
    print weights

    topPatterns, patternNames = showFeatures(weights, features, articleTitles, wordVec)
    showArticles(articleTitles, topPatterns, patternNames)
Example #7
def get_feature():
	v, dates = get_volumn()
	w, h = nmf.factorize(np.matrix(v).transpose(), pc=5)
	
	print w.shape, h.shape
	for i in range(np.shape(h)[0]):
		print "Feature %d" %i

		# get stock according to current feature
		ol = [(h[i,j], stocks[j]) for j in range(np.shape(h)[1])]
		ol.sort()
		ol.reverse()
		for j in range(np.shape(h)[1]):
			print ol[j]
		print

		# get the date
		porder = [(w[d, i], d) for d in range(len(dates))]
		porder.sort()
		porder.reverse()
		print [(p[0], dates[p[1]]) for p in porder[0:5]]

def downloadFinanceData():
	# setup assumed from the parallel snippets; StockTickers is presumably a module-level list
	prices = {}
	shortest = 300
	dates = None
	for ticker in StockTickers:
		url = 'http://ichart.finance.yahoo.com/table.csv?' +\
			  's=%s&d=3&e=1&f=2015&g=d&a=3&b=1&c=2005' % ticker +\
			  '&ignore=.csv'
		rows = urllib2.urlopen(url).readlines()
		prices[ticker] = [float(row.split(',')[5]) for row in rows[1:] if row.strip() != '']
		if len(prices[ticker]) < shortest: shortest = len(prices[ticker])
		if not dates:
			dates = [row.split(',')[0] for row in rows[1:] if row.strip() != '']
	matrix = [[prices[StockTickers[i]][j] for i in range(len(StockTickers))] for j in range(shortest)]
	return np.matrix(matrix), dates, shortest

def showResults(w, h, dates, shortest, topStock=12, topDate=3):
	for i in range(np.shape(h)[0]):
		print 'Feature %d' % i
		stocklist = [(h[i, j], StockTickers[j]) for j in range(len(StockTickers))]
		stocklist = sorted(stocklist, reverse=True)
		for j in range(topStock):
			print stocklist[j]
		print ''
		datelist = [(w[j, i], j) for j in range(shortest)]
		datelist = sorted(datelist, reverse=True)
		print [(date[0], dates[date[1]]) for date in datelist[:topDate]]
		print ''

if __name__ == '__main__':
	m, dates, shortest = downloadFinanceData()
	w, h = nmf.factorize(m)
	showResults(w, h, dates, shortest)
# loop setup assumed from the parallel snippets; tickers is presumably defined earlier
prices = {}
shortest = 300
dates = None
for t in tickers:
    url = 'http://ichart.finance.yahoo.com/table.csv?' + \
            's=%s&d=11&e=31&f=2012&g=d&a=0&b=1&c=2006' % t + \
            '&ignore=.csv'
    rows = urllib2.urlopen(url).readlines()

    # extract the volume field from every line
    prices[t] = [float(r.split(',')[5]) for r in rows[1:] if r.strip() != '']
    if len(prices[t]) < shortest: shortest = len(prices[t])
    if not dates:
        dates = [r.split(',')[0] for r in rows[1:] if r.strip() != '']

l1 = [[prices[tickers[i]][j]
    for i in range(len(tickers))]
    for j in range(shortest)]

w, h = nmf.factorize(matrix(l1), pc=5)

print h
print w

# loop over all the features
for i in range(shape(h)[0]):
    print "Feature %d" % i
    # get the top stocks for this feature
    ol = [(h[i, j], tickers[j]) for j in range(shape(h)[1])]
    ol.sort()
    ol.reverse()
    for j in range(12):
        print ol[j]
    print
Example #10
from numpy import *
import newsfeatures
import nmf
import pickle

# allw, artw, artt = newsfeatures.getarticlewords()
# pickle.dump(allw, open('input/tmp1.txt','wb'))
# pickle.dump(artw, open('input/tmp2.txt','wb'))
# pickle.dump(artt, open('input/tmp3.txt','wb'))
allw = pickle.load(open('input/tmp1.txt', 'rb'))
artw = pickle.load(open('input/tmp2.txt', 'rb'))
artt = pickle.load(open('input/tmp3.txt', 'rb'))


wordmatrix, wordvec = newsfeatures.makematrix(allw, artw)
print(len(wordvec))
v = matrix(wordmatrix)
weights, feat = nmf.factorize(v, pc=20, iter=50)
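Several of the news examples also depend on a makematrix helper. A minimal sketch of the usual implementation from the Programming Collective Intelligence newsfeatures module: keep words that occur more than a few times but in fewer than 60% of the articles, and build one row of counts per article (the 3 and 0.6 thresholds are the book's choices, not requirements):

def makematrix(allwords, articlewords):
    wordvec = []
    # only take words that are common but not too common
    for w, c in allwords.items():
        if c > 3 and c < len(articlewords) * 0.6:
            wordvec.append(w)
    # one row per article, one column per retained word
    l1 = [[(word in f and f[word] or 0) for word in wordvec] for f in articlewords]
    return l1, wordvec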
Example #11
from urllib import request
import numpy as np

# loop setup assumed from the parallel snippets; tickers is presumably defined earlier
prices = {}
shortest = 300
dates = None
for t in tickers:
    # Open the URL
    rows = request.urlopen('http://ichart.finance.yahoo.com/table.csv?' +
                           's=%s&d=11&e=26&f=2006&g=d&a=3&b=12&c=1996' % t +
                           '&ignore=.csv').readlines()

    # Extract the volume field from every line
    prices[t] = [float(r.split(',')[5]) for r in rows[1:] if r.strip() != '']
    if len(prices[t]) < shortest:
        shortest = len(prices[t])

    if not dates:
        dates = [r.split(',')[0] for r in rows[1:] if r.strip() != '']

l1 = [[prices[tickers[i]][j] for i in range(len(tickers))] for j in range(shortest)]

w, h = nmf.factorize(np.matrix(l1), pc=5)

print(h)
print(w)

# Loop over all the features
for i in range(np.shape(h)[0]):
    print("Feature %d" % i)

    # Get the top stocks for this feature
    ol = [(h[i, j], tickers[j]) for j in range(np.shape(h)[1])]
    ol.sort()
    ol.reverse()
    for j in range(12):
        print(ol[j])
    print()
            except IndexError:
                outfile.close()
                return
        outfile.write('\n')


if __name__ == "__main__":
    allw, articlew, artt = getarticlewords()
    wordmatrix, wordvec = makematrix(allw, articlew)

    # print wordvec[0:10]
    # print artt[1]
    # print wordmatrix[1][0:10]

    # hierarchical clustering
    import clusters
    clust = clusters.hcluster(wordmatrix)
    clusters.drawdendrogram(clust, artt, jpeg = 'news.jpg')

    # non-negative matrix factorization
    import nmf
    # m1 = np.matrix([[1, 2, 3], [4, 5, 6]])
    # m2 = np.matrix([[1, 2], [3, 4], [5, 6]])
    # w, h = nmf.factorize(m1 * m2, pc = 3, iter = 100)
    # print w * h

    v = np.matrix(wordmatrix)
    weights, feats = nmf.factorize(v, pc = 20, iter = 50)
    topp, pn = showfeatures(weights, feats, artt, wordvec)
    showarticles(artt, topp, pn)
Example #13
            # create a list of articles for this feature
            flist = []
            for j in range(len(titles)):
                # add the article with its weight
                flist.append((w[j, i], i, titles[j]))
                top_patterns[j].append((w[j, i], i, titles[j]))
                
            # sort reverse
            flist.sort(reverse = True)
            
            # show top 3 articles:
            for f in flist[0:3]:
                spamwriter.writerow(list(f))
                
    return top_patterns, pattern_names


def show_articles(titles, top_patterns, pattern_names, out = 'articles.csv'):
    pass



allw,artw,artt = get_article_words_count(feedlist)
wordmatrix, wordvec = make_matrix(allw, artw)
print wordmatrix, wordvec
v = numpy.matrix(wordmatrix)
weights, feat = nmf.factorize(v, pc = 20, iteration = 50)
print weights, feat

topp, pn = show_features(weights, feat, artt, wordvec)
Example #14
# print artt[0]
# print "--------------"
# classifier.train(wordmatrix[1], 'india')
# print artt[0]
# print "--------------"
# print classifier.classify(wordmatrix[1])
#
# clust = clusters.hcluster(wordmatrix)
# clusters.drawdendrogram(clust, artt, jpeg='new.jpg')

# l1 = [[1, 2, 3,0], [4, 5, 6,0]]
# print l1
# m1 = matrix(l1)
# print m1
# m2 = matrix([[1, 2], [3, 4], [5, 6], [0, 0]])
# print m2
# print m1 * m2
# w, h = nmf.factorize(m2, pc=4, iter=100)
# print w, h
# print w * h
# print m1 * m2
# v = matrix(wordmatrix)
# for i in range(shape(v)[0]):
#         print >> out, v[i]
#         print v[i]
# out.close()
v = matrix(wordmatrix)
weights, feat = nmf.factorize(v, pc=10, iter=50)
topp, pn = newsfeature.showfeatures(weights, feat, artt, wordvec)
newsfeature.showarticles(artt, topp, pn)
Example #15
    return list


csvfile = 'data/use_data.csv'

reader    = readFile(csvfile)
tickers   = reader.pop(0)
tickers.pop(0)
np_data   = matrix(reader)
np_data_T = np_data.T
dates     = np_data_T[0, ]
dates     = dates.tolist()[0]
prices    = np_data_T[1:, ]


w, h = nmf.factorize(prices.T, pc=10)
print w.shape
print h.shape
print len(tickers)
print len(dates)

for i in range(shape(h)[0]):
    print "Feature %d" % i

    # Get the top stocks for this feature
    ol = [(h[i, j], tickers[j]) for j in range(shape(h)[1])]
    ol.sort()
    ol.reverse()
    for j in range(len(tickers)):
        print ol[j]
        print
Example #16
        for j in range(len(titles)):
            flist.append((w[j,i],titles[j]))
            toppatterns[j].append((w[j,i],i,titles[j]))
        flist.sort(reverse=True)
        
        for f in flist[:5]:
            outfile.write("%f %s\n" % (f[0],f[1]))
        outfile.write('\n')
    return toppatterns,patternnames       

def showarticles(titles,toppatterns,patternnames,out='data/articles.txt'):
    outfile=open(out,'w')
    for j in range(len(titles)):
        outfile.write(titles[j]+'\n')
        toppatterns[j].sort(reverse=True)
        for i in range(3):
            outfile.write( "%f %s\n" % (toppatterns[j][i][0], " ".join(patternnames[toppatterns[j][i][1]])) )
    outfile.write('\n')
            


if __name__=='__main__':
    allw,artw,artt= getarticlewords()
    wordmatrix,wordvec=makematrix(allw,artw)
    print wordvec[0:10] 
    print wordmatrix[1][0:10]            
    v=matrix(wordmatrix)
    weights,feat=nmf.factorize(v,pc=5,iter=100)
    topp,pn=showfeatures(weights,feat,artt,wordvec)
    showarticles(artt,topp,pn)
Example #17
    allWords, articleWords, articleTitles = getArticleWords(feedlist)
    wordMatrix, wordVector = makeWordsMatrix(allWords, articleWords)

    # Naive Bayes Classification
    # getFeatures = functools.partial(makeWordsMatrixFeatures, wordVector=wordVector)
    # classifier = docclass.naivebayers(getFeatures)
    # classifier.setDatabase('newsfeed.db')
    # classifier.train(wordMatrix[0], 'thing')
    # classifier.train(wordMatrix[1], 'thing')
    # classifier.train(wordMatrix[2], 'solution')
    # classifier.train(wordMatrix[3], 'solution')
    # classifier.train(wordMatrix[4], 'solution')
    # classifier.train(wordMatrix[5], 'thing')
    # classifier.train(wordMatrix[6], 'solution')
    # classifier.train(wordMatrix[7], 'thing')
    # print classifier.classify(wordMatrix[8])
    # print classifier.classify(wordMatrix[9])

    # Clustering
    # cluster = clusters.hcluster(wordMatrix)
    # clusters.drawDendrogram(cluster, articleTitles, jpeg='newsclusters.jpg')
    # cluster = clusters.hcluster(clusters.rotateMatrix(wordMatrix))
    # clusters.drawDendrogram(cluster, wordVector, jpeg='wordclusters.jpg')

    # NMF
    nmfWordMatrix = np.matrix(wordMatrix)
    weights, feats = nmf.factorize(nmfWordMatrix, k=20, maxIterations=50)
    topPatterns, patternNames = showFeatures(weights, feats, articleTitles,
                                             wordVector)
    showArticles(articleTitles, topPatterns, patternNames)
Example #18
def factorize(htmls):
    allwords, articlewords, articletitles = getarticlewords(htmls)
    wordmatrix, wordvec = makematrix(allwords, articlewords)
    v = matrix(wordmatrix)
    weights, feat = nmf.factorize(v, pc = 20, max_iter=50)
    return allwords, articlewords, articletitles, wordmatrix, wordvec, weights, feat
    # transform the stock prices into an array
    prices[t] = [float(r.split(',')[5]) for r in rows[1:] if r.strip() != '']
    if len(prices[t]) < shortest:
        shortest = len(prices[t])

    if not dates:
        dates = [r.split(',')[0] for r in rows[1:] if r.strip() != '']


# create a matrix of tickers/prices
l1 = [[prices[tickers[i]][j] for i in xrange(len(tickers))]
        for j in xrange(shortest)]


w, h, _ = nmf.factorize(np.matrix(l1), pc=5)
print h
print w

# output the collected and calculated data into stdout
for i in xrange(np.shape(h)[0]):
    print 'Feature %d: ' % i

    ol = [(h[i, j], tickers[j]) for j in xrange(np.shape(h)[1])]
    ol.sort()
    ol.reverse()
    for j in xrange(12):
        print ol[j]
    print

    porder = [(w[d, i], d) for d in xrange(300)]
Example #20
		n = [s[1] for s in slist[0:6]]
		outfile.write(str(n) + '\n')
		patternnames.append(n)

		flist = []
		for j in range(len(titles)):
			flist.append((w[j,i], titles[j]))
			toppatterns[j].append((w[j,i], i, titles[j]))

		flist.sort()
		flist.reverse()

		for f in flist[0:3]:
			outfile.write(str(f)+'\n')
		outfile.write('\n')
	outfile.close()

	return toppatterns, patternnames


print '--- test word vec---'
allw,artw,artt = getarticlewords()
wordmatrix, wordvec = makematrix(allw, artw)
print wordvec[0:10]
print artt[1]
print wordmatrix[1][0:100]

weights, feat = nmf.factorize(wordmatrix, pc = 20, iter=50)
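Unlike the other news examples, this last call passes the raw list of lists straight to nmf.factorize. If the local implementation expects a numpy matrix (as the rest of the snippets here assume, with from numpy import * in scope), wrap it first:

weights, feat = nmf.factorize(matrix(wordmatrix), pc=20, iter=50)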