Example #1
    def run(self, train_file, test_file, predict_file, test_result_file, test_statistic, log_file, iter_n):
        """
        Args
        -----------------------------------
        train_file:
            records the training data
        test_file:
            records the test data
        predict_file:
            records the prediction data and prediction results
        test_result_file:
            comparison of the test results against the gold labels
        test_statistic:
            test result statistics and accuracy records
        log_file:
            records the sample counts for each iteration
        """

        self.iter_n = iter_n 

        self.train_file = train_file
        self.test_file = test_file
        self.predict_file = predict_file
        self.test_result_file = test_result_file
        self.test_statistic = test_statistic

        X_flag, Y_flag, predict = self.get_data(self.data_file)

        # Train and test
        self.train_test(X_flag, Y_flag)

        name_predict, X_predict = self.name_feature_split(predict)
        name_flag, X_flag = self.name_feature_split(X_flag)

        # Predict
        Y_predict = self.predict(X_predict)

        # Record statistics about the results
        self.log["predict_all"] = len(Y_predict)
        self.log["predict_pos"] = len(Y_predict) - list(Y_predict).count("others")
        self.log["predict_neg"] = list(Y_predict).count("others")
        self.log["all_pos"] = self.log.get("all_pos",0) + self.log["predict_pos"]
        self.log["all_neg"] = self.log.get("all_neg",0) + self.log["predict_neg"]

        result = dict((name_predict[i], Y_predict[i]) for i in range(len(name_predict)))
        flag = dict((name_flag[i], Y_flag[i]) for i in range(len(name_flag)))
        
        csv = CSVIO(self.data_file)
        s2c = csv.read_one_to_one(0, csv.fields.index("cluster" + str(self.iter_n)))
        self.log["cluster_num"] = len(set(s2c.values()))
        # Read the section label and block label from the big table and write them to the result file
        s2sl = csv.read_one_to_one(0, csv.fields.index("section label"))
        s2bl = csv.read_one_to_one(0, csv.fields.index("block label"))
        self.record_result(self.predict_file, [("class", result), ("section label", s2sl), ("block label", s2bl)])
        result.update(flag)
        self.append_result(self.data_file, result)

        self.record_log(log_file)
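
The run() call above performs one classification pass: train and test on the labelled rows, predict the remaining rows, and append the predictions to the big table. A minimal, hypothetical driver for this step is sketched below; the Classifier name, its import path, its constructor, and the file-name pattern are assumptions for illustration, and only the run() signature comes from the example.

# Hypothetical driver for the classification step of each iteration.
# "Classifier", its module, its constructor and the file names are assumed;
# only the run() signature matches the example above.
from classify import Classifier  # assumed import path

def run_classification(data_file, n_iters):
    for i in range(1, n_iters + 1):
        clf = Classifier(data_file)  # assumed constructor taking the big-table path
        clf.run(train_file="train_%d.csv" % i,
                test_file="test_%d.csv" % i,
                predict_file="predict_%d.csv" % i,
                test_result_file="test_result_%d.csv" % i,
                test_statistic="test_statistic_%d.csv" % i,
                log_file="classify_log.csv",
                iter_n=i)
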
Example #2
    def append_result(self, fn, s2l):
        """
        Write to the big table file
        """
        print "Add to " + fn
        colname = "cluster" + str(self.iter_n)
        csv = CSVIO(fn)
        csv.column(colname, s2l)
        csv.write(fn, ",", True, True, csv.fields.index(colname))
Example #3
    def run(self, result_file, log_file, iter_n):
        """
        Args
        --------------------------------
        result_file:
            records the cluster result in clusterX_fY_result.csv; contains only the sample and its cluster number
        log_file:
            file recording the statistics
        iter_n:
            iteration number
        """
        start = time.time()

        self.result_file = result_file
        self.iter_n = iter_n

        self.names, self.X, total = self.get_data(self.data_file)

        if len(self.names) * 1.0 / total < self.props["stop_ratio"]:  # stop iterating when this iteration has too few samples
            print "Sample number ratio is less than %s, Iteration Stop!" % self.props["stop_ratio"]
            return

        #record other samples
        write_lines(self.props["neg_file"], self.names)

        if self.k < 0:  # if k is -1 (no k value was specified), compute it
            self.k = self.calculate_k(self.X, self.init)

        sample_label, coef = self.cluster(self.X, self.k, self.init)

        csv = CSVIO(self.data_file)
        s2sl = s2bl = s2th = None
        if "section label" in csv.fields:  #write section label in big table
            s2sl = csv.read_one_to_one(0, csv.fields.index("section label"))
        if "block label" in csv.fields:  #write block label in big table
            s2bl = csv.read_one_to_one(0, csv.fields.index("block label"))
        if "table header" in csv.fields:  #write table header in big table
            s2th = csv.read_one_to_one(0, csv.fields.index("table header"))
        # write to cluster result file
        self.record_result(self.result_file, sample_label, s2sl, s2bl, s2th)
        # write to big table file
        self.append_result(self.data_file, sample_label)

        t = time.time() - start
        self.log["sample_number"] = len(self.names)
        self.log["k"] = self.k
        self.log["centroid_method"] = self.init
        self.log["time_consuming"] = t

        self.record_log(log_file)

        print "Time Consuming:", t
Example #4
    def run(self, result_file, log_file, iter_n):
        """
        Args
        --------------------------------
        result_file:
            records the cluster result in clusterX_fY_result.csv; contains only the sample and its cluster number
        log_file:
            file recording the statistics
        iter_n:
            iteration number
        """
        start = time.time()

        self.result_file = result_file
        self.iter_n = iter_n 

        self.names, self.X, total = self.get_data(self.data_file)

        if len(self.names) * 1.0 / total < self.props["stop_ratio"]:  # stop iterating when this iteration has too few samples
            print "Sample number ratio is less than %s, Iteration Stop!" % self.props["stop_ratio"]
            return

        #record other samples
        write_lines(self.props["neg_file"], self.names)

        if self.k < 0:  # if k is -1 (no k value was specified), compute it
            self.k = self.calculate_k(self.X, self.init)

        sample_label, coef = self.cluster(self.X, self.k, self.init)

        csv = CSVIO(self.data_file)
        s2sl = s2bl = s2th = None
        if "section label" in csv.fields: #write section label in big table
            s2sl = csv.read_one_to_one(0, csv.fields.index("section label"))
        if "block label" in csv.fields: #write block label in big table
            s2bl = csv.read_one_to_one(0, csv.fields.index("block label"))
        if "table header" in csv.fields: #write table header in big table
            s2th = csv.read_one_to_one(0, csv.fields.index("table header"))
        # write to cluster result file
        self.record_result(self.result_file, sample_label, s2sl, s2bl, s2th)
        # write to big table file
        self.append_result(self.data_file, sample_label)

        t = time.time()-start
        self.log["sample_number"] = len(self.names)
        self.log["k"] = self.k
        self.log["centroid_method"] = self.init
        self.log["time_consuming"] = t

        self.record_log(log_file)

        print "Time Consuming:",t
Example #5
    def run(self, train_file, test_file, predict_file, test_result_file,
            test_statistic, log_file, iter_n):
        """
        Args
        -----------------------------------
        train_file:
            记录训练数据
        test_file:
            记录测试数据
        predict_file:
            记录预测数据与预测结果
        test_result_file
            测试结果与标柱对比
        log_file
            每次迭代的样本数量记录结果
        """

        self.iter_n = iter_n

        self.train_file = train_file
        self.test_file = test_file
        self.predict_file = predict_file
        self.test_result_file = test_result_file

        X_flag, Y_flag, predict = self.get_data(self.data_file)

        # Train and test
        self.train_test(X_flag, Y_flag)

        name_predict, X_predict = self.name_feature_split(predict)
        name_flag, X_flag = self.name_feature_split(X_flag)

        # Predict
        Y_predict = self.predict(X_predict)

        result = dict(
            (name_predict[i], Y_predict[i]) for i in range(len(name_predict)))
        flag = dict((name_flag[i], Y_flag[i]) for i in range(len(name_flag)))

        csv = CSVIO(self.data_file)
        # Read the section label and block label from the big table and write them to the result file
        s2sl = csv.read_one_to_one(0, csv.fields.index("section label"))
        s2bl = csv.read_one_to_one(0, csv.fields.index("block label"))
        self.record_result(self.predict_file, [("class", result),
                                               ("section label", s2sl),
                                               ("block label", s2bl)])
        result.update(flag)
        self.append_result(self.data_file, result)

        self.record_log(log_file)
Example #6
    def record_log(self, log_file):
        """
        Write the statistics to a file
        """
        csv = CSVIO(log_file)
        if not os.path.isfile(log_file):
            csv.column("type", dict((s, s) for s in self.log.keys()))
        csv.column("Iter" + str(self.iter_n), self.log)
        csv.write(log_file, ",", True, True)
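
record_log() keeps a "type" column holding the statistic names and adds one "IterN" column per iteration. The sketch below reads such a file back with the standard csv module; it assumes CSVIO.write produces an ordinary comma-separated table with a header row, which is not shown in these examples.

import csv

def load_log(log_file):
    # Assumed layout: header row "type,Iter1,Iter2,..." followed by one row
    # per statistic, e.g. "sample_number,1200,300".
    with open(log_file) as fh:
        rows = list(csv.reader(fh))
    header, body = rows[0], rows[1:]
    # Map each statistic name to its per-iteration values, e.g.
    # {"sample_number": {"Iter1": "1200", "Iter2": "300"}}
    return dict((row[0], dict(zip(header[1:], row[1:]))) for row in body)
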
Example #7
    def run(self, train_file, test_file, predict_file, test_result_file, test_statistic, log_file, iter_n):
        """
        Args
        -----------------------------------
        train_file:
            records the training data
        test_file:
            records the test data
        predict_file:
            records the prediction data and prediction results
        test_result_file:
            comparison of the test results against the gold labels
        test_statistic:
            test result statistics and accuracy records
        log_file:
            records the sample counts for each iteration
        """

        self.iter_n = iter_n 

        self.train_file = train_file
        self.test_file = test_file
        self.predict_file = predict_file
        self.test_result_file = test_result_file

        X_flag, Y_flag, predict = self.get_data(self.data_file)

        # Train and test
        self.train_test(X_flag, Y_flag)

        name_predict, X_predict = self.name_feature_split(predict)
        name_flag, X_flag = self.name_feature_split(X_flag)

        # Predict
        Y_predict = self.predict(X_predict)

        result = dict((name_predict[i], Y_predict[i]) for i in range(len(name_predict)))
        flag = dict((name_flag[i], Y_flag[i]) for i in range(len(name_flag)))
        
        csv = CSVIO(self.data_file)
        # Read the section label and block label from the big table and write them to the result file
        s2sl = csv.read_one_to_one(0, csv.fields.index("section label"))
        s2bl = csv.read_one_to_one(0, csv.fields.index("block label"))
        self.record_result(self.predict_file, [("class", result), ("section label", s2sl), ("block label", s2bl)])
        result.update(flag)
        self.append_result(self.data_file, result)

        self.record_log(log_file)
Example #8
    def record_result(self, fn, columns=[]):
        """
        Write to cluster result file
        """
        print "Write to " + fn

        csv = CSVIO(fn, append=False)
        csv.column("sample", dict((s, s) for s in columns[0][1].keys()))
        for colname, col in columns:
            csv.column(colname, col)
        csv.write(fn, ",", True, True)
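
record_result() expects a list of (column name, sample-to-value dict) pairs whose dicts share the same sample keys; the first pair also supplies the "sample" column. A minimal illustration of building that argument is shown below; the sample names and labels are invented.

# Invented data for illustration; in run() these dicts come from the classifier
# output and from CSVIO.read_one_to_one on the big table.
result = {"s1": "title", "s2": "others"}   # sample -> predicted class
s2sl = {"s1": "sec-A", "s2": "sec-B"}      # sample -> section label
s2bl = {"s1": "blk-1", "s2": "blk-2"}      # sample -> block label
columns = [("class", result), ("section label", s2sl), ("block label", s2bl)]
# Inside the class this would be passed as: self.record_result(predict_file, columns)
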
Example #9
    def record_log(self, log_file):
        """
        Write the log to a file
        """
        csv = CSVIO(log_file)
        if not os.path.isfile(log_file):
            csv.column("type", dict((s, s) for s in self.log.keys()))
        csv.column("Iter" + str(self.iter_n), self.log)
        csv.write(log_file, ",", True, True)
Example #10
    def record_result(self, fn, columns=[]):
        """
        Write to cluster result file
        """
        print "Write to " + fn

        csv = CSVIO(fn, append=False)
        csv.column("sample", dict((s, s) for s in columns[0][1].keys()))
        for colname, col in columns:
            csv.column(colname, col)
        csv.write(fn, ",", True, True)
Example #11
    def append_result(self, fn, s2l):
        """
        Write to the big table file
        """
        print "Add to " + fn
        colname = "class" + str(self.iter_n)
        csv = CSVIO(fn)
        csv.column(colname, s2l)
        csv.write(fn, ",", True, True, csv.fields.index(colname))
Example #12
    def record_result(self, fn, s2l, s2sl=None, s2bl=None, s2th=None):
        """
        Write to cluster result file
        """
        print "Write to " + fn

        csv = CSVIO(fn, append=False)
        csv.column("sample", dict((s, s) for s in s2l.keys()))
        if s2sl: csv.column("section label", s2sl)
        if s2bl: csv.column("block label", s2bl)
        if s2th: csv.column("table header", s2th)
        csv.column("cluster", s2l)
        csv.write(fn, ",", True, True, csv.fields.index("cluster"))
Example #13
    def record_result(self, fn, s2l, s2sl=None, s2bl=None, s2th=None):
        """
        Write to cluster result file
        """
        print "Write to " + fn

        csv = CSVIO(fn, append=False)
        csv.column("sample", dict((s, s) for s in s2l.keys()))
        if s2sl: csv.column("section label", s2sl)
        if s2bl: csv.column("block label", s2bl)
        if s2th: csv.column("table header", s2th)
        csv.column("cluster", s2l)
        csv.write(fn, ",", True, True, csv.fields.index("cluster"))
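
The cluster result file written above contains a "sample" column, the optional label columns, and a "cluster" column. Assuming CSVIO.write emits a plain CSV with a header row (the write format is not shown in these examples), cluster sizes can be tallied back with the standard library:

import csv
from collections import Counter

def cluster_sizes(result_file):
    # Count samples per cluster from the file written by record_result().
    # Assumes a plain CSV whose header row includes "sample" and "cluster".
    with open(result_file) as fh:
        reader = csv.DictReader(fh)
        return Counter(row["cluster"] for row in reader)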