def CreatePredictSample(self, src):
    if not self.trained:
        print "train ClassifierMatrix before predict"
        return False
    #split sentence
    #if src is read from a utf-8 file directly,
    #call CreatePredictSample(src.decode("utf-8"))
    wordList = mmseg.seg_txt(src)
    #drop single-character terms
    wordList = [word for word in wordList if len(word.decode('utf8')) > 1]
    cols = []
    vals = []
    #collect this sample's term ids and term frequencies,
    #keeping only terms that were seen during training
    partCols = []
    termFreqs = {}
    for word in wordList:
        if word in GlobalInfo.termToId:
            termId = GlobalInfo.termToId[word]
            partCols.append(termId)
            termFreqs[termId] = termFreqs.get(termId, 0) + 1
    #deduplicate and sort the column ids, then emit the csr row
    partCols = sorted(set(partCols))
    for col in partCols:
        cols.append(col)
        vals.append(termFreqs[col])
    return [cols, vals]
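
# A usage sketch for CreatePredictSample (the caller, file name and
# variable names below are hypothetical): as the comment inside the
# method notes, text read straight from a utf-8 file must be decoded
# before it is passed in.
#
#   raw = open("query.txt").readline()
#   cols, vals = classifier.CreatePredictSample(raw.decode("utf-8"))
#   #cols: sorted ids of terms seen in training; vals: their tf counts
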
def CreatePredictMatrix(self, path=""):
    if not self.trained:
        print "train ClassifierMatrix before predict"
        return False
    #resolve the input path; fall back to the configured test_input
    inputPath = path
    if inputPath == "":
        inputPath = self.curNode.GetChild("test_input")
    f = open(inputPath, "r")
    rows = [0]
    cols = []
    vals = []
    y = []
    for line in f:
        #each line is "<text>\t<label>"
        vec = line.split("\t")
        line = vec[0]
        y.append(int(vec[1]))
        #split sentence and drop single-character terms
        wordList = mmseg.seg_txt(line)
        wordList = [word for word in wordList if len(word.decode('utf8')) > 1]
        #collect this row's term ids and term frequencies,
        #keeping only terms that were seen during training
        partCols = []
        termFreqs = {}
        for word in wordList:
            if word in GlobalInfo.termToId:
                termId = GlobalInfo.termToId[word]
                partCols.append(termId)
                termFreqs[termId] = termFreqs.get(termId, 0) + 1
        #deduplicate and sort, then append the csr row
        partCols = sorted(set(partCols))
        for col in partCols:
            cols.append(col)
            vals.append(termFreqs[col])
        rows.append(rows[-1] + len(partCols))
    f.close()
    return [Matrix(rows, cols, vals), y]
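
# The test_input file consumed above is expected to hold one document
# per line: the raw text, a tab, then the integer class label (the
# contents below are a hypothetical illustration):
#
#   <text of document 0>\t1
#   <text of document 1>\t0
#
# The returned y lists the labels in file order, aligned with the
# matrix rows.
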
def CreateTrainMatrix(self):
    uid = 0
    rows = [0]
    cols = []
    vals = []
    y = []
    target = -1
    uids = []
    #fill the matrix's rows and cols from the mongo collection
    for doc in self.db[self.collection].find():
        if self.field not in doc:
            continue
        line = doc.get(self.field)
        user_id = doc['uid'].encode('utf-8')
        uids.append(user_id)
        #each document gets its own target class
        target += 1
        y.append(target)
        wordList = mmseg.seg_txt(line.encode('utf-8'))
        #drop single-character terms
        wordList = [word for word in wordList if len(word.decode('utf8')) > 1]
        #assign ids to unseen terms and count term frequencies
        partCols = []
        termFreqs = {}
        for word in wordList:
            if word not in GlobalInfo.termToId:
                GlobalInfo.termToId[word] = uid
                GlobalInfo.idToTerm[uid] = word
                uid += 1
            termId = GlobalInfo.termToId[word]
            partCols.append(termId)
            termFreqs[termId] = termFreqs.get(termId, 0) + 1
        #deduplicate and sort this row's column ids
        partCols = sorted(set(partCols))
        #fill cols and vals, and update the document count per term
        for col in partCols:
            cols.append(col)
            vals.append(termFreqs[col])
            GlobalInfo.idToDocCount[col] = GlobalInfo.idToDocCount.get(col, 0) + 1
        #close the csr row
        rows.append(rows[-1] + len(partCols))
        #count documents per class
        GlobalInfo.classToDocCount[target] = GlobalInfo.classToDocCount.get(target, 0) + 1
    #fill GlobalInfo's idToIdf: idf(t) = log(|D| / (docCount(t) + 1))
    for termId in GlobalInfo.idToTerm.keys():
        GlobalInfo.idToIdf[termId] = math.log(float(len(rows) - 1) /
                                              (GlobalInfo.idToDocCount[termId] + 1))
    #NOTE: idf is not multiplied into vals here, because not every
    #algorithm needs tf * idf; uncomment the block below to store tf-idf
    #for r in range(len(rows) - 1):
    #    for c in range(rows[r], rows[r + 1]):
    #        termId = cols[c]
    #        vals[c] = vals[c] * GlobalInfo.idToIdf[termId]
    #write the dictionaries out
    GlobalInfo.Write()
    self.trained = True
    return [Matrix(rows, cols, vals), y, uids]
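
# Matrix(rows, cols, vals) is a CSR-style triple: document i's entries
# sit in cols[rows[i]:rows[i + 1]] and vals[rows[i]:rows[i + 1]], with
# cols holding term ids (ascending within each row) and vals the raw
# term frequencies. A two-document sketch with hypothetical ids:
#
#   rows = [0, 2, 3]   #doc 0 owns entries 0..1, doc 1 owns entry 2
#   cols = [4, 9, 4]   #term ids, sorted within each document
#   vals = [1, 3, 2]   #tf of each (document, term) pair
#
# idf is kept separately in GlobalInfo.idToIdf, so algorithms that need
# tf * idf can apply it themselves (see the commented-out block above).
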