def check_no_empty_between_two_significative_data(csv_file):
    """Check that no empty row lies between two meaningful data rows.

    Leading empty rows (before the first data row) and trailing empty
    rows (after the last data row) are both acceptable; only a gap
    *between* two non-empty rows makes the file invalid.

    :param csv_file: path of the csv file to check, str
    :return: True when the file is valid, False otherwise (an info line
             naming the offending row is logged via pinf).
    """
    data_seen = False   # at least one non-empty row has been read
    gap_seen = False    # an empty row appeared after some data row
    row_index = 0       # 1-based row counter for the error message
    with open(csv_file, 'r') as fp:
        for row in csv.reader(fp):
            # BUG FIX: the original incremented the counter once per
            # file (outside the loop), so failures always reported row 1.
            row_index += 1
            if row:
                if gap_seen:
                    # non-empty row after an empty gap -> invalid file
                    pinf.CKeyInfo('in raw %d data is unsatisfactory' % row_index)
                    return False
                data_seen = True
            elif data_seen:
                gap_seen = True
    return True
def csv_item_seperated(self, file_list_full_path, rate, out_put_file_path,
                       ignore_indexes=None):
    """Build a leave-one-out split config for row-partitioned csv data.

    The whole data set is stored across several csv files, each holding a
    slice of the rows.  No file may have empty rows sandwiched between
    data rows (leading/trailing empties are fine).  The resulting split
    description is written to ``out_put_file_path`` as JSON.

    :param file_list_full_path: list of csv file paths
    :param rate: leave-one-out rate, float (k-part = round(1 / rate))
    :param out_put_file_path: path of the JSON info file to write, str
    :param ignore_indexes: per-file lists of row indexes to skip,
                           ``[[...], [...], ...]``; defaults to no ignores
    :return: None (the result is written to ``out_put_file_path``)
    """
    # None sentinel instead of a mutable default argument
    ignore_indexes = [] if ignore_indexes is None else ignore_indexes
    info = dict()
    info['k-part'] = round(1 / rate)
    info['config_file_format'] = self.CsvItemSeperated
    info['ignore_indexes'] = ignore_indexes
    info['source_file_queue'] = file_list_full_path
    info['source_file_data_amount'] = dict()
    pinf.CKeyInfo(
        'all file path: %s\n'
        'sample rate: %f\n'
        'output file path: %s\n'
        'ignore indexes: %s' %
        (file_list_full_path, rate, out_put_file_path, ignore_indexes))
    # every source file must be free of gaps between data rows
    assert False not in [
        Ccsv.check_no_empty_between_two_significative_data(i)
        for i in file_list_full_path
    ]
    for i in file_list_full_path:
        info['source_file_data_amount'][i] = Ccsv.count_csv_file_row(i)
    total_amount = sum(
        [info['source_file_data_amount'][i] for i in file_list_full_path])
    # translate per-file ignore indexes into global (concatenated) row indexes
    ignore_list = list()
    begin = 0
    for file_pos, file_path in enumerate(file_list_full_path):
        ignore_list += [j + begin for j in ignore_indexes[file_pos]]
        begin += info['source_file_data_amount'][file_path]
    gather = CFS.n_devide(list(range(0, total_amount)), info['k-part'],
                          ignore_list)
    for i in range(0, info['k-part']):
        info[i] = gather[i]
    # context manager guarantees the file is closed even on error
    with open(out_put_file_path, 'w') as fs:
        fs.write(json.dumps(info, indent=4))
    # --- tail of test_n_devide; its `def` line is outside this chunk ---
    assert (False not in [len(i) == 3 for i in result]) is True, \
        pinf.CError('error in n_devide , get item is not 2 : %s' %
                    [len(i) == 3 for i in result])
    a = list(range(0, 12))
    result = CS.n_devide(a, 5, [0])
    assert len(result) == 5, pinf.CError(
        'error in n_devide, get len is not 5 : %d' % len(result))
    assert (False not in [len(i) == 3 for i in result]) is True, \
        pinf.CError('error in n_devide , get item is not 2 : %s' %
                    [len(i) == 3 for i in result])
    return True


def test_n_devide_return_target_data():
    """Smoke test for CS.n_devide_return_target_data.

    In practice, if test_n_devide passes, this cannot fail.

    :return: True on success
    """
    t = 'asdfghjklqwertyuiopzxcvbnm'
    a = [t[i] for i in range(0, len(t))]
    result = CS.n_devide_return_target_data(a, 5)
    return True
    pass


if __name__ == '__main__':
    assert test_sample_except() is True
    pinf.CKeyInfo('test sample_except successfully')
    assert test_n_devide() is True
    pinf.CKeyInfo('test n_devide successfully')
    assert test_n_devide_return_target_data() is True
    pinf.CKeyInfo('test n_devide_return_target_data successfully')
def test_calc_IoU():
    """Smoke test: compute and log the IoU of two fixed boxes."""
    ground_truth_box = [100, 100, 100, 100]
    predicted_box = [50, 50, 100, 100]
    iou = Ciou.calc_IoU(predicted_coordinate=predicted_box,
                        true_coordinate=ground_truth_box)
    Cinfo.CKeyInfo('truth: %s, predict: %s, IoU: %f'
                   % (ground_truth_box, predicted_box, iou))
def test_csv_item_seperated():
    """Exercise LeaveOneOut.csv_item_seperated on two files, then one."""
    splitter = LOO.LeaveOneOut()
    result = splitter.csv_item_seperated(
        [
            './data/test_LeaveOneOut-test_csv_item_seperated1.csv',
            './data/test_LeaveOneOut-test_csv_item_seperated2.csv'
        ],
        rate=0.1,
        out_put_file_path=
        './data/test_LeaveOneOut-test_csv_item_seperated.json',
        ignore_indexes=[[0, 1], [38]])
    result = splitter.csv_item_seperated(
        ['./data/test_LeaveOneOut-test_csv_item_seperated1.csv'],
        rate=0.1,
        out_put_file_path=
        './data/test_LeaveOneOut-test_csv_item_seperated-only_one.json',
        ignore_indexes=[[0, 1]])
    return True


if __name__ == '__main__':
    pinf.CKeyInfo('-------------testing csv_features_seperated--------------')
    test_csv_features_seperated()
    pinf.CKeyInfo(
        '-------------testing csv_reader_features_seperated--------------')
    assert test_csv_reader_features_seperated() is True
    pinf.CKeyInfo('successed')
    pinf.CKeyInfo('-------------testing csv_item_seperated--------------')
    test_csv_item_seperated()
    pinf.CKeyInfo('successed')
def csv_features_seperated(self, file_list_full_path, rate,
                           unique_identification, out_put_file_path,
                           type='speed', ignore_indexes=None):
    """Build a leave-one-out split config for feature-partitioned csv data.

    A sample's features are spread over several files (e.g. feature *a*
    in file A, feature *b* in file B), but every file carries the same
    unique key column, so sample K can be matched across files even when
    the files are ordered differently.  All files must hold the same
    number of samples.  The split description is written as JSON.

    :param file_list_full_path: list of csv file paths (one per feature set)
    :param out_put_file_path: output JSON path used later to read data, str
    :param unique_identification: index of the key column used to align
                                  rows across files, int
    :param rate: leave-one-out rate, float
    :param type: 'speed' loads all key columns into memory (fast, memory
                 hungry); 'memory' is a planned low-memory mode, not yet
                 implemented
    :param ignore_indexes: row indexes to skip in the csv files;
                           defaults to no ignores
    :return: None (the result is written to ``out_put_file_path``)
    """
    # None sentinel instead of a mutable default argument
    ignore_indexes = [] if ignore_indexes is None else ignore_indexes
    pinf.CKeyInfo('all file path: %s\n'
                  'sample rate: %f\n'
                  'unique identification: %d\n'
                  'output file path: %s\n'
                  'ignore indexes: %s' %
                  (file_list_full_path, rate, unique_identification,
                   out_put_file_path, ignore_indexes))
    info = dict()
    part_num = round(1 / rate)
    info['k-part'] = part_num
    info['config_file_format'] = self.CsvFeaturesSeperated
    info['ignore_indexes'] = ignore_indexes
    if type == 'speed':
        key_info = dict()
        for file_path in file_list_full_path:
            # read the key column of each file; the context manager closes
            # the handle (the original leaked every opened file)
            with open(file_path, 'r') as fp:
                key_info[file_path] = [
                    row[unique_identification] for row in csv.reader(fp)
                ]
        assert self._check_same_length(key_info) is True, \
            pinf.CError('not all csv file has the same number of data')
        base_keys = key_info[file_list_full_path[0]]
        info['sample_amount'] = len(base_keys)
        # part_num leave-one-out rounds with mutually exclusive hold-out sets
        index = CFS.n_devide(base_keys, part=part_num,
                             except_list=ignore_indexes)
        for i in range(0, part_num):
            info[i] = dict()
            info[i][file_list_full_path[0]] = index[i]
            for other_file in file_list_full_path[1:]:
                # map each selected key back to its row index in the other file
                info[i][other_file] = [
                    key_info[other_file].index(base_keys[k])
                    for k in index[i]
                ]
    elif type == 'memory':
        print('this kind method is not complete!')
        sys.exit()
    else:
        print('illegal type')
        sys.exit()
    # context manager guarantees the file is closed even on error
    with open(out_put_file_path, 'w') as fs:
        fs.write(json.dumps(info, indent=4))