def CreateTrainMatrix(self, path=""):
        """Build the sparse training matrix and the label list from a corpus.

        Every non-blank line of the input file must consist of the document
        text, a tab character, and an integer class label. The text is
        decoded as UTF-8 and split into words by ``self.segmenter``.

        While reading, the shared ``PyMining`` tables are filled in:
        ``termToId`` / ``idToTerm`` (ids are handed out from 0 in order of
        first appearance), ``idToDocCount`` (number of documents containing
        a term), ``classToDocCount`` (number of documents per label) and,
        once the whole file is read, ``idToIdf``. The tables are then
        persisted through ``PyMining.Write()`` and ``self.trained`` is set.

        NOTE(review): the tables are assumed to be plain dicts (they are
        accessed with ``in`` / ``get``) -- confirm against PyMining.

        Args:
            path: Path of the training file. When empty, the path is read
                from the ``train_input`` child of ``self.node``.

        Returns:
            A two-element list ``[Matrix(rows, cols, vals), y]``. ``rows``
            holds cumulative per-document entry counts (starting with 0),
            ``cols`` the sorted term ids of each document, ``vals`` the raw
            term frequencies, and ``y`` the integer labels in file order.

        Raises:
            IndexError: A non-blank line contains no tab separator.
            ValueError: A label field cannot be parsed as an integer.
        """
        #get input-path
        inputPath = path
        if inputPath == "":
            inputPath = self.node.GetChild("train_input").GetValue()

        uid = 0
        rows = [0]
        cols = []
        vals = []
        y = []

        #the with-block guarantees the file is closed even if a line
        #fails to parse half-way through the corpus
        with open(inputPath, "r") as f:
            for line in f:
                #a blank line (typically the last one) carries no sample
                if not line.strip():
                    continue

                vec = line.split("\t")
                target = int(vec[1])
                y.append(target)
                wordList = self.segmenter.Split(vec[0].decode("utf-8"))

                #term frequencies of the current document, keyed by term id;
                #unseen words get the next free id
                termFres = {}
                for word in wordList:
                    if word not in PyMining.termToId:
                        PyMining.termToId[word] = uid
                        PyMining.idToTerm[uid] = word
                        uid += 1
                    termId = PyMining.termToId[word]
                    termFres[termId] = termFres.get(termId, 0) + 1

                #the keys of termFres are exactly the distinct term ids of
                #this document; emit them in ascending column order
                for col in sorted(termFres):
                    cols.append(col)
                    vals.append(termFres[col])
                    PyMining.idToDocCount[col] = \
                        PyMining.idToDocCount.get(col, 0) + 1

                #each row offset is the previous one plus this row's size
                rows.append(rows[-1] + len(termFres))

                PyMining.classToDocCount[target] = \
                    PyMining.classToDocCount.get(target, 0) + 1

        #idf(t) = log(|D| / (number of documents containing t + 1))
        docCount = float(len(rows) - 1)
        for termId in PyMining.idToTerm:
            PyMining.idToIdf[termId] = math.log(
                docCount / (PyMining.idToDocCount[termId] + 1))

        #NOTE: vals deliberately keep the raw term frequencies; idf is not
        #multiplied in here because not all algorithms need tf * idf

        #write dicts out
        PyMining.Write()

        self.trained = True

        return [Matrix(rows, cols, vals), y]
 def __init__(self, config, nodeName, loadFromFile=False):
     """Set up the matrix builder from a configuration tree.

     Stores the configuration child named ``nodeName``, creates the
     segmenter from the ``__segmenter__`` entry and initialises the
     shared ``PyMining`` state from the ``__global__`` entry.

     ``loadFromFile`` is forwarded to ``PyMining.Init`` and is also used
     as the initial value of ``self.trained``.
     """
     # The trained flag starts out equal to the load-from-file request.
     self.trained = loadFromFile

     self.node = config.GetChild(nodeName)
     self.segmenter = Segmenter(config, "__segmenter__")

     PyMining.Init(config, "__global__", loadFromFile)
#encoding=utf8

from matrix import Matrix
from classifier_matrix import ClassifierMatrix
from segmenter import Segmenter
from py_mining import PyMining
from configuration import Configuration
from chisquare_filter import ChiSquareFilter
from naive_bayes import NaiveBayes

if __name__ == "__main__":
    # Load the configuration and initialise the shared PyMining state.
    config = Configuration.FromFile("conf/test.xml")
    PyMining.Init(config, "__global__")

    # Build the training matrix and the label list from the corpus.
    matrixBuilder = ClassifierMatrix(config, "__matrix__")
    trainMatrix, trainLabels = matrixBuilder.CreateTrainMatrix("data/train.txt")

    # Train the chi-square filter on the training data.
    featureFilter = ChiSquareFilter(config, "__filter__")
    featureFilter.TrainFilter(trainMatrix, trainLabels)

    # Train the naive Bayes model on the same matrix and labels.
    bayesModel = NaiveBayes(config, "naive_bayes")
    bayesModel.Train(trainMatrix, trainLabels)

    # Run one sample text through the same pipeline and print the result.
    inputStr = "仅售28元!原价698元的康迩福韩国美容美体中心的韩国特色美容套餐1份(紫莱花园店、时代奥城店2店通用):韩国特色面部SPA护理1次+韩国特色面部瘦脸加毛孔净化1次+韩国特色水"
    sampleCols, sampleVals = matrixBuilder.CreatePredictSample(inputStr)
    sampleCols, sampleVals = featureFilter.SampleFilter(sampleCols, sampleVals)
    probTuple = bayesModel.TestSample(sampleCols, sampleVals)
    print(probTuple)