def run(self, train_file, test_file, predict_file, test_result_file,
        test_statistic, log_file, iter_n):
    """Train, test and predict for one iteration, then record the results.

    Args:
        train_file: file recording the training data.
        test_file: file recording the test data.
        predict_file: file recording prediction inputs and outputs.
        test_result_file: file comparing test results against labels.
        test_statistic: file for test statistics and accuracy.
        log_file: per-iteration sample-count log file.
        iter_n: iteration number.
    """
    self.iter_n = iter_n
    self.train_file = train_file
    self.test_file = test_file
    self.predict_file = predict_file
    self.test_result_file = test_result_file
    self.test_statistic = test_statistic

    X_flag, Y_flag, predict = self.get_data(self.data_file)
    # Train and evaluate on the labelled samples.
    self.train_test(X_flag, Y_flag)

    name_predict, X_predict = self.name_feature_split(predict)
    name_flag, X_flag = self.name_feature_split(X_flag)

    # Predict labels for the unlabelled samples.
    Y_predict = self.predict(X_predict)

    # Aggregate prediction statistics; "others" marks negative samples.
    predictions = list(Y_predict)
    neg_count = predictions.count("others")
    self.log["predict_all"] = len(predictions)
    self.log["predict_pos"] = len(predictions) - neg_count
    self.log["predict_neg"] = neg_count
    self.log["all_pos"] = self.log.get("all_pos", 0) + self.log["predict_pos"]
    self.log["all_neg"] = self.log.get("all_neg", 0) + self.log["predict_neg"]

    result = dict(zip(name_predict, Y_predict))
    flag = dict(zip(name_flag, Y_flag))

    csv = CSVIO(self.data_file)
    s2c = csv.read_one_to_one(0, csv.fields.index("cluster" + str(self.iter_n)))
    # NOTE(review): "cluter_num" looks like a typo for "cluster_num";
    # kept as-is since downstream log readers may depend on the key.
    self.log["cluter_num"] = len(set(s2c.values()))

    # Copy section/block labels from the big table into the result file.
    s2sl = csv.read_one_to_one(0, csv.fields.index("section label"))
    s2bl = csv.read_one_to_one(0, csv.fields.index("block label"))
    self.record_result(self.predict_file,
                       [("class", result),
                        ("section label", s2sl),
                        ("block label", s2bl)])

    # Merge the labelled samples in and append everything to the big table.
    result.update(flag)
    self.append_result(self.data_file, result)
    self.record_log(log_file)
def append_result(self, fn, s2l): """ Write to big table file """ print "Add to "+fn colname = "cluster"+self.iter_n csv = CSVIO(fn) csv.column(colname, s2l) csv.write(fn, ",", True, True, csv.fields.index(colname))
def run(self, result_file, log_file, iter_n): """ Args -------------------------------- result_file: record cluster result in clusterX_fY_result.csv, only include sample and cluster number log_file: 统计数据记录文件 iter_n: iteration number """ start = time.time() self.result_file = result_file self.iter_n = iter_n self.names, self.X, total = self.get_data(self.data_file) if len(self.names ) * 1.0 / total < props["stop_ratio"]: #当本次迭代样本数量很少时停止迭代 print "Sample number ratio is less than %s, Iteration Stop!" % self.props[ "stop_ratio"] return #record other samples write_lines(self.props["neg_file"], self.names) if self.k < 0: #如果k为-1,即没有指定k值,则计算k出来 self.k = self.calculate_k(self.X, self.init) sample_label, coef = self.cluster(self.X, self.k, self.init) csv = CSVIO(self.data_file) s2sl = s2bl = s2th = None if "section label" in csv.fields: #write section label in big table s2sl = csv.read_one_to_one(0, csv.fields.index("section label")) if "block label" in csv.fields: #write block label in big table s2bl = csv.read_one_to_one(0, csv.fields.index("block label")) if "table header" in csv.fields: #write table header in big table s2th = csv.read_one_to_one(0, csv.fields.index("table header")) # write to cluster result file self.record_result(self.result_file, sample_label, s2sl, s2bl, s2th) # write to big table file self.append_result(self.data_file, sample_label) t = time.time() - start self.log["sample_number"] = len(self.names) self.log["k"] = self.k self.log["centroid_method"] = self.init self.log["time_consuming"] = t self.record_log(log_file) print "Time Consuming:", t
def run(self, result_file, log_file, iter_n): """ Args -------------------------------- result_file: record cluster result in clusterX_fY_result.csv, only include sample and cluster number log_file: 统计数据记录文件 iter_n: iteration number """ start = time.time() self.result_file = result_file self.iter_n = iter_n self.names, self.X, total = self.get_data(self.data_file) if len(self.names)*1.0/total < props["stop_ratio"]: #当本次迭代样本数量很少时停止迭代 print "Sample number ratio is less than %s, Iteration Stop!"%self.props["stop_ratio"] return #record other samples write_lines(self.props["neg_file"], self.names) if self.k < 0: #如果k为-1,即没有指定k值,则计算k出来 self.k = self.calculate_k(self.X, self.init) sample_label, coef = self.cluster(self.X, self.k, self.init) csv = CSVIO(self.data_file) s2sl = s2bl = s2th = None if "section label" in csv.fields: #write section label in big table s2sl = csv.read_one_to_one(0, csv.fields.index("section label")) if "block label" in csv.fields: #write block label in big table s2bl = csv.read_one_to_one(0, csv.fields.index("block label")) if "table header" in csv.fields: #write table header in big table s2th = csv.read_one_to_one(0, csv.fields.index("table header")) # write to cluster result file self.record_result(self.result_file, sample_label, s2sl, s2bl, s2th ) # write to big table file self.append_result(self.data_file, sample_label) t = time.time()-start self.log["sample_number"] = len(self.names) self.log["k"] = self.k self.log["centroid_method"] = self.init self.log["time_consuming"] = t self.record_log(log_file) print "Time Consuming:",t
def run(self, train_file, test_file, predict_file, test_result_file,
        test_statistic, log_file, iter_n):
    """Train, test and predict for one iteration, then record the results.

    Args:
        train_file: file recording the training data.
        test_file: file recording the test data.
        predict_file: file recording prediction inputs and outputs.
        test_result_file: file comparing test results against labels.
        test_statistic: accepted for interface compatibility; not stored here.
        log_file: per-iteration sample-count log file.
        iter_n: iteration number.
    """
    self.iter_n = iter_n
    self.train_file = train_file
    self.test_file = test_file
    self.predict_file = predict_file
    self.test_result_file = test_result_file

    X_flag, Y_flag, predict = self.get_data(self.data_file)
    # Train and evaluate on the labelled samples.
    self.train_test(X_flag, Y_flag)

    name_predict, X_predict = self.name_feature_split(predict)
    name_flag, X_flag = self.name_feature_split(X_flag)

    # Predict labels for the unlabelled samples.
    Y_predict = self.predict(X_predict)

    result = dict(zip(name_predict, Y_predict))
    flag = dict(zip(name_flag, Y_flag))

    # Copy section/block labels from the big table into the result file.
    csv = CSVIO(self.data_file)
    s2sl = csv.read_one_to_one(0, csv.fields.index("section label"))
    s2bl = csv.read_one_to_one(0, csv.fields.index("block label"))
    self.record_result(self.predict_file,
                       [("class", result),
                        ("section label", s2sl),
                        ("block label", s2bl)])

    # Merge labelled samples in and append everything to the big table.
    result.update(flag)
    self.append_result(self.data_file, result)
    self.record_log(log_file)
def record_log(self, log_file):
    """Write the accumulated per-iteration statistics to log_file.

    The first time the file is written a "type" column holding the
    statistic names is created; every iteration then contributes one
    "IterN" value column.
    """
    csv = CSVIO(log_file)
    if not os.path.isfile(log_file):
        # First write: emit the row keys as their own column.
        csv.column("type", dict((key, key) for key in self.log))
    csv.column("Iter" + str(self.iter_n), self.log)
    csv.write(log_file, ",", True, True)
def run(self, train_file, test_file, predict_file, test_result_file,
        test_statistic, log_file, iter_n):
    """Run one classify iteration: train/test, predict, and record results.

    Args:
        train_file: file recording the training data.
        test_file: file recording the test data.
        predict_file: file recording prediction inputs and outputs.
        test_result_file: file comparing test results against labels.
        test_statistic: accepted for interface compatibility; not stored here.
        log_file: per-iteration sample-count log file.
        iter_n: iteration number.
    """
    self.iter_n = iter_n
    self.train_file = train_file
    self.test_file = test_file
    self.predict_file = predict_file
    self.test_result_file = test_result_file

    X_flag, Y_flag, predict = self.get_data(self.data_file)
    # Train and evaluate on the labelled portion of the data.
    self.train_test(X_flag, Y_flag)

    name_predict, X_predict = self.name_feature_split(predict)
    name_flag, X_flag = self.name_feature_split(X_flag)

    # Run the classifier over the unlabelled samples.
    Y_predict = self.predict(X_predict)

    result = dict(zip(name_predict, Y_predict))
    flag = dict(zip(name_flag, Y_flag))

    # Pull section/block labels out of the big table for the result file.
    csv = CSVIO(self.data_file)
    s2sl = csv.read_one_to_one(0, csv.fields.index("section label"))
    s2bl = csv.read_one_to_one(0, csv.fields.index("block label"))
    self.record_result(self.predict_file,
                       [("class", result),
                        ("section label", s2sl),
                        ("block label", s2bl)])

    # Fold the labelled samples back in before updating the big table.
    result.update(flag)
    self.append_result(self.data_file, result)
    self.record_log(log_file)
def record_result(self, fn, columns=[]): """ Write to cluster result file """ print "Write to " + fn csv = CSVIO(fn, append=False) csv.column("sample", dict((s, s) for s in columns[0][1].keys())) for colname, col in columns: csv.column(colname, col) csv.write(fn, ",", True, True)
def record_log(self, log_file):
    """Append this iteration's statistics to the log file.

    Creates the "type" column (statistic names) only when the log file
    does not exist yet, then adds an "IterN" column with the values.
    """
    csv = CSVIO(log_file)
    if not os.path.isfile(log_file):
        # New file: seed it with the statistic names as the key column.
        type_col = dict((name, name) for name in self.log)
        csv.column("type", type_col)
    csv.column("Iter" + str(self.iter_n), self.log)
    csv.write(log_file, ",", True, True)
def record_result(self, fn, columns = [] ): """ Write to cluster result file """ print "Write to "+fn csv = CSVIO(fn,append = False) csv.column("sample", dict((s,s) for s in columns[0][1].keys())) for colname, col in columns: csv.column(colname, col) csv.write(fn, ",", True, True)
def append_result(self, fn, s2l): """ Write to big table file """ print "Add to " + fn colname = "class" + self.iter_n csv = CSVIO(fn) csv.column(colname, s2l) csv.write(fn, ",", True, True, csv.fields.index(colname))
def record_result(self, fn, s2l, s2sl = None, s2bl = None, s2th = None ): """ Write to cluster result file """ print "Write to "+fn csv = CSVIO(fn,append = False) csv.column("sample", dict((s,s) for s in s2l.keys())) if s2sl: csv.column("section label", s2sl) if s2bl: csv.column("block label", s2bl) if s2th: csv.column("table header", s2th) csv.column("cluster", s2l) csv.write(fn, ",", True, True, csv.fields.index("cluster"))
def record_result(self, fn, s2l, s2sl=None, s2bl=None, s2th=None): """ Write to cluster result file """ print "Write to " + fn csv = CSVIO(fn, append=False) csv.column("sample", dict((s, s) for s in s2l.keys())) if s2sl: csv.column("section label", s2sl) if s2bl: csv.column("block label", s2bl) if s2th: csv.column("table header", s2th) csv.column("cluster", s2l) csv.write(fn, ",", True, True, csv.fields.index("cluster"))