def buildTFIDFDictionary(csvName):
    """Run ``thingThreadHelper`` over every thing loaded from ``csvName``.

    The function first extends the module-level ``descriptions`` list and
    ``words`` set with the description of each loaded thing, then fans the
    things out to a pool of worker processes.

    Args:
        csvName: value forwarded unchanged to ``csv_object.getThings``.

    Returns:
        The list returned by ``Pool.map(thingThreadHelper, things)``: one
        entry per thing, in the same order as ``things``.

    Side effects:
        Mutates the globals ``words`` and ``descriptions`` (they accumulate
        across calls) and prints the worker count and the number of things.
    """
    global words
    global descriptions

    things = csv_object.getThings(csvName)

    for thingy in things:
        description = thingy.description
        descriptions.append(description)
        # NOTE(review): set.union() adds each *element* of ``description``.
        # That is a vocabulary of words only if ``description`` is already a
        # sequence of tokens; for a plain string it would add single
        # characters -- confirm against csv_object.
        words = words.union(description)

    # The globals above are filled in before the pool is created.
    # NOTE(review): presumably thingThreadHelper reads ``words`` /
    # ``descriptions`` in the workers -- verify before reordering.
    workerCount = cpu_count()
    print("%s %s" % (workerCount, len(things)))

    thingPool = Pool(workerCount)
    try:
        results = thingPool.map(thingThreadHelper, things)
    finally:
        # map() blocks until every result is collected (or it raises), so
        # the workers hold nothing we still need; stop and reap them so the
        # processes are not leaked on every call.
        thingPool.terminate()
        thingPool.join()

    return results
from numpy import array
import csv_object
from random import shuffle
from sklearn.svm import SVC

# features: collected, commented, downloads, likes, remixes, views
# response: real (determined by makes >= 10)

filename = "FinalItems-Full.csv"
# filename = "small.csv"

# get our starting data; shuffled in place so the split below is random
things = csv_object.getThings(filename)
shuffle(things)

# report how many things are flagged "real" out of the total
c = 0
for thing in things:
    if thing.real:
        c += 1
print("%s %s" % (c, len(things)))

# Hold out the last tenth of the shuffled things for evaluation.
# Floor division keeps the slice bound an integer on Python 2 and 3 alike.
split = len(things) - len(things) // 10
training = things[:split]
evaluation = things[split:]

# create data and target lists from training set
dataT = []
targetT = []
for x in training:
    dataT.append([
        x.collected,
        x.commented,
        x.downloads,
        x.likes,
        x.remixed,
        x.views,
    ])
    targetT.append(x.real)