def ft():
    """Build the word matrix, factorize it, and save both factors to disk.

    The matrix comes from ``make_matrix()``; ``factorize`` is called with the
    positional arguments ``30`` and ``300``.  The two matrices it returns are
    written with ``np.savetxt`` to ``data/weight_matrix.txt`` and
    ``data/feature_matrix.txt``.
    """
    # Only the matrix is needed here; the other two results are ignored.
    matrix_data, _word_vec, _titles = make_matrix()
    print('task begin')
    factors = factorize(matrix_data, 30, 300)
    weights, features = factors
    np.savetxt('data/weight_matrix.txt', weights)
    np.savetxt('data/feature_matrix.txt', features)
def stock_volumes(tickers):
    """Download daily volume data for ``tickers`` and factorize it with NMF.

    For every ticker the Yahoo CSV table is fetched and field 5 of each data
    row (the volume, per the original comment) is collected.  The series are
    truncated to the shortest one (capped at 300 rows), arranged as a matrix
    with one row per day and one column per ticker, and passed to
    ``nmf.factorize`` with ``pc=5``.  For each resulting feature the ranked
    tickers and the three highest-weighted dates are printed, and the user is
    prompted before the next feature is shown.

    :param tickers: sequence of ticker symbols to download
    :return: list with one entry per feature; each entry is a list of up to
        three ``(weight, date)`` tuples, highest weight first
    """
    shortest = 300
    prices = {}
    dates = None
    allporder = []

    for t in tickers:
        print('t= %s' % (t,))
        # Open the URL
        url = ('http://ichart.finance.yahoo.com/table.csv?' +
               's=%s&d=11&e=26&f=2006&g=d&a=3&b=12&c=2005' % t +
               '&ignore=.csv')
        response = urllib2.urlopen(url)
        try:
            rows = response.readlines()
        finally:
            # Release the connection even if reading fails.
            response.close()

        # Skip the header row and any blank lines.
        data_rows = [r for r in rows[1:] if r.strip() != '']

        # Extract the volume field from every line
        prices[t] = [float(r.split(',')[5]) for r in data_rows]
        if len(prices[t]) < shortest:
            shortest = len(prices[t])

        # Dates are taken once, from the first ticker that yields any rows.
        if not dates:
            dates = [r.split(',')[0] for r in data_rows]

    l1 = [[prices[t][j] for t in tickers] for j in range(shortest)]

    w, h = nmf.factorize(matrix(l1), pc=5)

    # Loop over all the features
    for i in range(shape(h)[0]):
        print("Feature %d" % i)

        # Get the top stocks for this feature
        ol = sorted((h[i, j], tickers[j]) for j in range(shape(h)[1]))
        ol.reverse()
        for entry in ol:
            print(entry)
        print('')

        # Show the top dates for this feature.  ``w`` has one row per matrix
        # row, i.e. ``shortest`` rows; the former fixed ``range(300)`` raised
        # IndexError whenever fewer than 300 rows had been downloaded.
        porder = sorted((w[d, i], d) for d in range(shortest))
        porder.reverse()
        f = [(p[0], dates[p[1]]) for p in porder[0:3]]
        print(f)
        allporder.append(f)
        print('')

        # NOTE(review): the source formatting did not show whether this pause
        # sits inside or after the feature loop; it is kept per feature.
        raw_input('press <ENTER> to continue')

    return allporder
def factorize(htmls, features=5, max_iter=50):
    """Turn ``htmls`` into a word matrix and factorize it with NMF.

    Progress banners are printed around each of the three stages.

    :param htmls: input handed unchanged to ``getarticlewords``
    :param features: forwarded to ``nmf.factorize`` as ``pc``
    :param max_iter: forwarded to ``nmf.factorize`` as ``max_iter``
    :return: tuple ``(allwords, articlewords, articletitles, wordmatrix,
        wordvec, weights, feat)``
    """
    print("### start make data ###")
    article_data = getarticlewords(htmls)
    all_words, article_words, article_titles = article_data
    print("### end make data ###")

    print("### start make matrix ###")
    word_matrix, word_vec = makematrix(all_words, article_words)
    as_matrix = matrix(word_matrix)
    print("### end ###")

    print("### start nmf calc ###")
    weights, feat = nmf.factorize(as_matrix, pc=features, max_iter=max_iter)
    print("### end nmf calc ##")

    return (all_words, article_words, article_titles,
            word_matrix, word_vec, weights, feat)
def read_and_nmf(input_file):
    """
    Read ``input_file``, mix it down to one channel, and factorize the
    magnitude of ``transform.stft``'s output with ``nmf.factorize``.

    The STFT uses the module-level ``fs``, ``framesz`` and ``hop``; the
    factorization uses ``NUM_COMPONENTS`` and ``ITERATIONS``.

    :param input_file: file to be read
    :return: w (components from nmf)
    """
    # The rate returned by ``read`` is not used; the STFT relies on ``fs``.
    _rate, data = read(input_file)

    # Work in floating point from the start.  If ``read`` yields integer PCM
    # samples (as scipy.io.wavfile.read does -- TODO confirm which ``read``
    # this is), adding two channels in the integer dtype can wrap around
    # before the division by 2.
    samples = np.array(data, dtype=np.float64)
    if samples.ndim == 1:
        # Single-channel input: there is nothing to average.
        bee_data = samples
    else:
        # Average the first two channels.
        bee_data = (samples[:, 0] + samples[:, 1]) / 2.0

    # Rescale by the peak magnitude only when the signal leaves [-1, 1].
    if np.amin(bee_data) < -1 or np.amax(bee_data) > 1:
        bee_data /= float(max(abs(np.amax(bee_data)), abs(np.amin(bee_data))))

    X = transform.stft(bee_data, fs, framesz, hop)
    M = abs(X)
    w, h = nmf.factorize(M, pc=NUM_COMPONENTS, iterations=ITERATIONS)
    return w
def testNewsFeatures():
    """Run the news demo end to end: classify, cluster, then factorize.

    Loads the article data with ``loadNews()``, builds the word matrix,
    trains a ``docClass.NaiveBayes`` classifier on the first two articles and
    prints its classification of the third, builds a hierarchical clustering
    that is passed to ``clusters.drawDendrogram`` with ``jpeg='news.jpg'``,
    and finally factorizes the word matrix with ``nmf.factorize`` and hands
    the result to ``showFeatures`` and ``showArticles``.
    """
    # Alternative data source, left disabled:
    # allWords, articleWords, articleTitles = getArticleWords()
    # saveNews(allWords, articleWords, articleTitles)
    allWords, articleWords, articleTitles = loadNews()
    wordMatrix, wordVec = makeMatrix(allWords, articleWords)
    print(wordVec[0:10])
    print(articleTitles[1])
    print(wordMatrix[1][0:10])

    def wordMatrixFeatures(counts):
        """Return the entries of ``wordVec`` whose count in ``counts`` is > 0."""
        words = []
        for position in range(len(counts)):
            if counts[position] > 0:
                words.append(wordVec[position])
        return words

    print('')
    print(wordMatrixFeatures(wordMatrix[0]))
    print('')

    from pci.ch06 import docClass
    classifier = docClass.NaiveBayes(wordMatrixFeatures)
    classifier.setDB('news.db')

    # Train the first article as a 'government' story and the second as a
    # 'market' story, printing each title before training on it.
    training = ((0, 'government'), (1, 'market'))
    for index, label in training:
        print(articleTitles[index])
        classifier.train(wordMatrix[index], label)

    # How is the third story classified?
    print(articleTitles[2])
    print(classifier.classify(wordMatrix[2]))
    print('')

    from pci.ch03 import clusters
    clust = clusters.hCluster(wordMatrix)
    clusters.drawDendrogram(clust, articleTitles, jpeg='news.jpg')
    print('')

    import nmf
    weights, features = nmf.factorize(matrix(wordMatrix), 20, 50)
    print(weights)
    topPatterns, patternNames = showFeatures(weights, features,
                                             articleTitles, wordVec)
    showArticles(articleTitles, topPatterns, patternNames)
def get_feature():
    """Factorize the volume data and print the ranking for every feature.

    The data and its dates come from ``get_volumn()``; the transposed matrix
    is factorized with ``nmf.factorize(..., pc=5)``.  For each feature the
    entries of the module-level ``stocks`` are printed from highest to lowest
    weight, followed by the five highest-weighted dates.
    """
    volumes, dates = get_volumn()
    w, h = nmf.factorize(np.matrix(volumes).transpose(), pc=5)
    print('%s %s' % (w.shape, h.shape))

    feature_count = np.shape(h)[0]
    stock_count = np.shape(h)[1]

    for i in range(feature_count):
        print("Feature %d" % i)

        # get stock according to current feature
        ranked_stocks = sorted((h[i, j], stocks[j]) for j in range(stock_count))
        ranked_stocks.reverse()
        for entry in ranked_stocks:
            print(entry)
        print('')

        # get the date
        ranked_days = sorted((w[d, i], d) for d in range(len(dates)))
        ranked_days.reverse()
        print([(weight, dates[day]) for weight, day in ranked_days[0:5]])
dates = None for ticker in StockTickers: url = 'http://ichart.finance.yahoo.com/table.csv?' +\ 's=%s&d=3&e=1&f=2015&g=d&a=3&b=1&c=2005' % ticker +\ '&ignore=.csv' rows = urllib2.urlopen(url).readlines() prices[ticker] = [float(row.split(',')[5]) for row in rows[1:] if row.strip() != ''] if len(prices[ticker]) < shortest: shortest = len(prices[ticker]) if not dates: dates = [row.split(',')[0] for row in rows[1:] if row.strip() != ''] matrix = [[prices[StockTickers[i]][j] for i in range(len(StockTickers))] for j in range(shortest)] return np.matrix(matrix), dates, shortest def showResults(w, h, dates, shortest, topStock=12, topDate=3): for i in range(np.shape(h)[0]): print 'Feature %d' % i stocklist = [(h[i, j], StockTickers[j]) for j in range(len(StockTickers))] stocklist = sorted(stocklist, reverse=True) for j in range(topStock): print stocklist[j] print '' datelist = [(w[j, i], j) for j in range(shortest)] datelist = sorted(datelist, reverse=True) print [(date[0], dates[date[1]]) for date in datelist[:topDate]] print '' if __name__ == '__main__': m, dates, shortest = downloadFinanceData() w, h = nmf.factorize(m) showResults(w, h, dates, shortest)
url = 'http://ichart.finance.yahoo.com/table.csv?' + \ 's=%s&d=11&e=31&f=2012&g=d&a=0&b=1&c=2006' % t + \ '&ignore=.csv' rows = urllib2.urlopen(url).readlines() # extract the volume field from eery line prices[t] = [float(r.split(',')[5]) for r in rows[1:] if r.strip() != ''] if len(prices[t]) < shortest: shortest = len(prices[t]) if not dates: dates = [r.split(',')[0] for r in rows[1:] if r.strip() != ''] l1 = [[prices[tickers[i]][j] for i in range(len(tickers))] for j in range(shortest)] w, h = nmf.factorize(matrix(l1), pc=5) print h print w # loop over all the features for i in range(shape(h)[0]): print "Feature %d" % i # get the top stocks for this feature ol = [(h[i, j], tickers[j]) for j in range(shape(h)[1])] ol.sort() ol.reverse() for j in range(12): print ol[j] print
from numpy import *
import newsfeatures
import nmf
import pickle


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file.

    NOTE: ``pickle.load`` can execute arbitrary code; only load files that
    were produced locally by the disabled dump calls below.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Disabled: regenerate the cached article data.
# allw, artw, artt = newsfeatures.getarticlewords()
# pickle.dump(allw, open('input/tmp1.txt','wb'))
# pickle.dump(artw, open('input/tmp2.txt','wb'))
# pickle.dump(artt, open('input/tmp3.txt','wb'))

# Load the cached article data; each file is closed as soon as it is read
# (the previous inline ``open`` calls never closed their handles).
allw = _load_pickle('input/tmp1.txt')
artw = _load_pickle('input/tmp2.txt')
artt = _load_pickle('input/tmp3.txt')

wordmatrix, wordvec = newsfeatures.makematrix(allw, artw)
print(len(wordvec))

v = matrix(wordmatrix)
weights, feat = nmf.factorize(v, pc=20, iter=50)
# 打开URL rows = request.urlopen('http://ichart.finance.yahoo.com/table.csv?' + 's=%s&d=11&e=26&f=2006&g=d&a=3&b=12&c=1996' + '&ignore=.csv').readlines() # 从每一行中提取成交量 prices[t] = [float[r.split(',')[5]) for r in rows1:] if r.strip() != ''] if len(prices[t]) < shortest: shortest = len(prices[t]) if not dates: dates = [r.split(',')[0] for r in rows[1:] if r.strip() != ''] l1 = [[prices[tickers[i]][j] for i in range(len(tickers))] for j in range(shortest)] w, h = nmf.factorize(np.matrix(l1), pc=5) print(h) print(w) # 遍历所有特征 for i in range(np.shape(h)[0]): print("Feature %d" % i) # 得到最符合当前特征的 ol = [(h[i, j], tickers[j]) for j in range(np.shape(h)[])] ol.sort() ol.reverse() for j in range(l2): print(ol[j]) print()
# NOTE(review): the first four statements are the tail of a function whose
# ``def``, ``try`` and enclosing loops precede this chunk; their nesting is
# reconstructed and must be checked against the missing part.
            except IndexError:
                # Give up on the remaining output: close the file and leave.
                outfile.close()
                return
        outfile.write('\n')


if __name__ == "__main__":
    # Collect the article words and build the article-by-word matrix.
    allw, articlew, artt = getarticlewords()
    wordmatrix, wordvec = makematrix(allw, articlew)
    # print wordvec[0:10]
    # print artt[1]
    # print wordmatrix[1][0:10]

    # hierarchical clustering
    import clusters
    clust = clusters.hcluster(wordmatrix)
    clusters.drawdendrogram(clust, artt, jpeg = 'news.jpg')

    # non-negative matrix factorization
    import nmf
    # Disabled check of the factorization on a small hand-made product:
    # m1 = np.matrix([[1, 2, 3], [4, 5, 6]])
    # m2 = np.matrix([[1, 2], [3, 4], [5, 6]])
    # w, h = nmf.factorize(m1 * m2, pc = 3, iter = 100)
    # print w * h
    v = np.matrix(wordmatrix)
    weights, feats = nmf.factorize(v, pc = 20, iter = 50)
    # Report the factorization through the two show* helpers.
    topp, pn = showfeatures(weights, feats, artt, wordvec)
    showarticles(artt, topp, pn)
# create a list of articles for this feature flist = [] for j in range(len(titles)): # add the artile with its weight flist.append((w[j, i], i, titles[j])) top_patterns[j].append((w[j, i], i, titles[j])) # sort reverse flist.sort(reverse = True) # show top 3 articles: for f in flist[0:3]: spamwriter.writerow(list(f)) return top_patterns, pattern_names def show_articles(titles, pattern_names, pattern_names, out = 'articles.csv'): pass allw,artw,artt = get_article_words_count(feedlist) wordmatrix, wordvec = make_matrix(allw, artw) print wordmatrix, wordvec v = numpy.matrix(wordmatrix) weights, feat = nmf.factorize(v, pc = 20, iteration= 50) print weights, feat topp, pn = show_features(weights, feat, artt, wordvec)
# print artt[0] # print "--------------" # classifier.train(wordmatrix[1], 'india') # print artt[0] # print "--------------" # print classifier.classify(wordmatrix[1]) # # clust = clusters.hcluster(wordmatrix) # clusters.drawdendrogram(clust, artt, jpeg='new.jpg') # l1 = [[1, 2, 3,0], [4, 5, 6,0]] # print l1 # m1 = matrix(l1) # print m1 # m2 = matrix([[1, 2], [3, 4], [5, 6], [0, 0]]) # print m2 # print m1 * m2 # w, h = nmf.factorize(m2, pc=4, iter=100) # print w, h # print w * h # print m1 * m2 # v = matrix(wordmatrix) # for i in range(shape(v)[0]): # print >> out, v[i] # print v[i] # out.close() v=matrix(wordmatrix) weights, feat = nmf.factorize(v, pc=10, iter=50) topp, pn = newsfeature.showfeatures(weights, feat, artt, wordvec) newsfeature.showarticles(artt, topp, pn)
return list csvfile = 'data/use_data.csv' reader = readFile(csvfile) tickers = reader.pop(0) tickers.pop(0) np_data = matrix(reader) np_data_T = np_data.T dates = np_data_T[0, ] dates = dates.tolist()[0] prices = np_data_T[1:, ] w, h = nmf.factorize(prices.T, pc=10) print w.shape print h.shape print len(tickers) print len(dates) for i in range(shape(h)[0]): print "Feature %d" % i # Get the top stocks for this feature ol = [(h[i, j], tickers[j]) for j in range(shape(h)[1])] ol.sort() ol.reverse() for j in range(len(tickers)): print ol[j] print
for j in range(len(titles)): flist.append((w[j,i],titles[j])) toppatterns[j].append((w[j,i],i,titles[j])) flist.sort(reverse=True) for f in flist[:5]: outfile.write("%f %s\n" % (f[0],f[1])) outfile.write('\n') return toppatterns,patternnames def showarticles(titles,toppatterns,patternnames,out='data/articles.txt'): outfile=open(out,'w') for j in range(len(titles)): outfile.write(titles[j]+'\n') toppatterns[j].sort(reverse=True) for i in range(3): outfile.write( "%f %s\n" % (toppatterns[j][i][0], " ".join(patternnames[toppatterns[j][i][1]])) ) outfile.write('\n') if __name__=='__main__': allw,artw,artt= getarticlewords() wordmatrix,wordvec=makematrix(allw,artw) print wordvec[0:10] print wordmatrix[1][0:10] v=matrix(wordmatrix) weights,feat=nmf.factorize(v,pc=5,iter=100) topp,pn=showfeatures(weights,feat,artt,wordvec) showarticles(artt,topp,pn)
# Collect the words of every article in ``feedlist`` and build the word
# matrix together with the vector of words that label its columns.
allWords, articleWords, articleTitles = getArticleWords(feedlist)
wordMatrix, wordVector = makeWordsMatrix(allWords, articleWords)

# Naive Bayes classification (disabled)
# getFeatures = functools.partial(makeWordsMatrixFeatures, wordVector=wordVector)
# classifier = docclass.naivebayers(getFeatures)
# classifier.setDatabase('newsfeed.db')
# classifier.train(wordMatrix[0], 'thing')
# classifier.train(wordMatrix[1], 'thing')
# classifier.train(wordMatrix[2], 'solution')
# classifier.train(wordMatrix[3], 'solution')
# classifier.train(wordMatrix[4], 'solution')
# classifier.train(wordMatrix[5], 'thing')
# classifier.train(wordMatrix[6], 'solution')
# classifier.train(wordMatrix[7], 'thing')
# print classifier.classify(wordMatrix[8])
# print classifier.classify(wordMatrix[9])

# Clustering (disabled): first by article, then by word on the rotated matrix
# cluster = clusters.hcluster(wordMatrix)
# clusters.drawDendrogram(cluster, articleTitles, jpeg='newsclusters.jpg')
# cluster = clusters.hcluster(clusters.rotateMatrix(wordMatrix))
# clusters.drawDendrogram(cluster, wordVector, jpeg='wordclusters.jpg')

# NMF: factorize the word matrix, then report the result through the two
# show* helpers.
nmfWordMatrix = np.matrix(wordMatrix)
weights, feats = nmf.factorize(nmfWordMatrix, k=20, maxIterations=50)
topPatterns, patternNames = showFeatures(weights, feats, articleTitles, wordVector)
showArticles(articleTitles, topPatterns, patternNames)
def factorize(htmls, features=20, max_iter=50):
    """Turn ``htmls`` into a word matrix and factorize it with NMF.

    The component count and iteration limit used to be hard-coded; they are
    now keyword parameters whose defaults reproduce the previous behaviour,
    matching the signature of the other ``factorize`` wrapper in this file.

    :param htmls: input handed unchanged to ``getarticlewords``
    :param features: forwarded to ``nmf.factorize`` as ``pc`` (default 20)
    :param max_iter: forwarded to ``nmf.factorize`` as ``max_iter``
        (default 50)
    :return: tuple ``(allwords, articlewords, articletitles, wordmatrix,
        wordvec, weights, feat)``
    """
    allwords, articlewords, articletitles = getarticlewords(htmls)
    wordmatrix, wordvec = makematrix(allwords, articlewords)
    v = matrix(wordmatrix)
    weights, feat = nmf.factorize(v, pc=features, max_iter=max_iter)
    return (allwords, articlewords, articletitles,
            wordmatrix, wordvec, weights, feat)
# transform the stock prices into a an array prices[t] = [float(r.split(',')[5]) for r in rows[1:] if r.strip() != ''] if len(prices[t]) < shortest: shortest = len(prices[t]) if not dates: dates = [r.split(',')[0] for r in rows[1:] if r.strip() != ''] # create a matrix of tickers/prices l1 = [[prices[tickers[i]][j] for i in xrange(len(tickers))] for j in xrange(shortest)] w, h, _ = nmf.factorize(np.matrix(l1), pc=5) print h print w # output the collected and calculated data into stdout for i in xrange(np.shape(h)[0]): print 'Feature %d: ' % i ol = [(h[i, j], tickers[j]) for j in xrange(np.shape(h)[1])] ol.sort() ol.reverse() for j in xrange(12): print ol[j] print porder = [(w[d, i], d) for d in xrange(300)]
n = [s[1] for s in slist[0:6]] outfile.write(str(n) + '\n') patternnames.append(n) flist = [] for j in range(len(titles)): flist.append((w[j,i], titles[j])) toppatterns[j].append((w[j,i], i, titles[j])) flist.sort() flist.reverse() for f in flist[0:3]: outfile.write(str(f)+'\n') outfile.write('\n') outfile.close() return toppatterns, patternnames print '--- test word vec---' allw,artw,artt = getarticlewords() wordmatrix, wordvec = makematrix(allw, artw) print wordvec[0:10] print artt[1] print wordmatrix[1][0:100] weights, feat = nmf.factorize(wordmatrix, pc = 20, iter=50)