def CreateTrainMatrix(self, path=""):
    """Build the sparse term-frequency training matrix from a labelled file.

    Each non-blank line of the input file must look like
    ``<text>\\t<integer label>``.  The text is decoded as UTF-8, split into
    words by ``self.segmenter`` and every word is mapped to an integer term
    id through ``PyMining.termToId`` (unknown words get a fresh id).

    Side effects:
      * ``PyMining.termToId`` / ``PyMining.idToTerm`` gain the new terms.
      * ``PyMining.idToDocCount`` counts, per term id, the documents that
        contain the term.
      * ``PyMining.classToDocCount`` counts documents per label.
      * ``PyMining.idToIdf[t]`` is set to
        ``log(docCount / (idToDocCount[t] + 1))`` for every id in
        ``PyMining.idToTerm``.
      * ``PyMining.Write()`` is called and ``self.trained`` is set to True.

    Args:
        path: training file path.  When empty, the path is read from the
            ``train_input`` child of ``self.node``.

    Returns:
        ``[Matrix(rows, cols, vals), y]`` where ``rows[i]:rows[i + 1]`` is
        the slice of ``cols`` / ``vals`` belonging to document ``i``,
        ``cols`` holds the sorted term ids of each document, ``vals`` the
        raw term frequencies, and ``y`` the integer labels.

    Raises:
        IndexError / ValueError: a non-blank line has no tab-separated
            label, or the label is not an integer.
    """
    # fall back to the configured location when no explicit path is given
    inputPath = path
    if inputPath == "":
        inputPath = self.node.GetChild("train_input").GetValue()

    # NOTE(review): ids always start at 0 here; if PyMining.termToId is
    # already populated, new ids could collide with existing ones -- confirm.
    uid = 0
    rows = [0]
    cols = []
    vals = []
    y = []

    # "with" guarantees the file is closed even if a line fails to parse
    with open(inputPath, "r") as f:
        for line in f:
            # a blank (typically trailing) line carries no sample
            if not line.strip():
                continue

            vec = line.split("\t")
            text = vec[0]
            target = int(vec[1])
            y.append(target)
            wordList = self.segmenter.Split(text.decode("utf-8"))

            # term id -> frequency inside the current document
            termFres = {}
            for word in wordList:
                if word not in PyMining.termToId:
                    PyMining.termToId[word] = uid
                    PyMining.idToTerm[uid] = word
                    uid += 1
                termId = PyMining.termToId[word]
                termFres[termId] = termFres.get(termId, 0) + 1

            # distinct term ids of this document, in ascending order
            partCols = sorted(termFres)
            for col in partCols:
                cols.append(col)
                vals.append(termFres[col])
                PyMining.idToDocCount[col] = \
                    PyMining.idToDocCount.get(col, 0) + 1

            # next row offset = previous offset + entries of this document
            rows.append(rows[-1] + len(partCols))

            PyMining.classToDocCount[target] = \
                PyMining.classToDocCount.get(target, 0) + 1

    # idf(t) = log(|D| / (|{d : t in d}| + 1))
    docCount = float(len(rows) - 1)
    for termId in PyMining.idToTerm.keys():
        PyMining.idToIdf[termId] = math.log(
            docCount / (PyMining.idToDocCount[termId] + 1))

    # NOTE: vals intentionally stay raw term frequencies; idf is not
    # multiplied in because not every algorithm needs tf * idf.

    # write dicts out
    PyMining.Write()
    self.trained = True

    return [Matrix(rows, cols, vals), y]
def __init__(self, config, nodeName, loadFromFile=False):
    """Set up the matrix builder from a configuration tree.

    Args:
        config: configuration object; must provide ``GetChild``.
        nodeName: name of the child node of ``config`` kept as ``self.node``.
        loadFromFile: initial value of ``self.trained``; also forwarded to
            ``PyMining.Init`` (presumably asks it to load previously
            written dictionaries -- confirm against PyMining).
    """
    # configuration sub-tree holding this object's own settings
    ownNode = config.GetChild(nodeName)
    self.node = ownNode

    # word splitter, configured from the "__segmenter__" node
    wordSplitter = Segmenter(config, "__segmenter__")
    self.segmenter = wordSplitter

    self.trained = loadFromFile

    # initialise the shared PyMining state from the "__global__" node
    PyMining.Init(config, "__global__", loadFromFile)
#encoding=utf8
from matrix import Matrix
from classifier_matrix import ClassifierMatrix
from segmenter import Segmenter
from py_mining import PyMining
from configuration import Configuration
from chisquare_filter import ChiSquareFilter
from naive_bayes import NaiveBayes


def main():
    """Train a naive-Bayes model on data/train.txt and classify one sample.

    Steps: load conf/test.xml, build the training matrix, train the
    chi-square filter and the naive-Bayes model on it, then turn a
    hard-coded sample string into a (cols, vals) pair, filter it and print
    whatever ``NaiveBayes.TestSample`` returns.
    """
    config = Configuration.FromFile("conf/test.xml")
    PyMining.Init(config, "__global__")

    # build the training matrix and its labels
    matCreater = ClassifierMatrix(config, "__matrix__")
    trainx, trainy = matCreater.CreateTrainMatrix("data/train.txt")

    # train the feature filter
    chiFilter = ChiSquareFilter(config, "__filter__")
    chiFilter.TrainFilter(trainx, trainy)

    # train the classifier
    nbModel = NaiveBayes(config, "naive_bayes")
    nbModel.Train(trainx, trainy)

    # classify a single hard-coded sample
    inputStr = "仅售28元!原价698元的康迩福韩国美容美体中心的韩国特色美容套餐1份(紫莱花园店、时代奥城店2店通用):韩国特色面部SPA护理1次+韩国特色面部瘦脸加毛孔净化1次+韩国特色水"
    cols, vals = matCreater.CreatePredictSample(inputStr)
    cols, vals = chiFilter.SampleFilter(cols, vals)
    probTuple = nbModel.TestSample(cols, vals)

    # parenthesised single-argument form prints the same on Python 2 and 3
    print(probTuple)


if __name__ == "__main__":
    main()