예제 #1
0
def check_no_empty_between_two_significative_data(csv_file):
    """
    Check that a csv file has no empty row between two data rows.

    Leading and trailing empty rows are acceptable; an empty row that is
    later followed by another non-empty (significative) data row is an
    error and is reported through pinf.

    :param csv_file: path of the csv file to check, str
    :return: True when no empty row lies between two data rows, else False
    """
    significative_begin = False  # first non-empty data row seen yet?
    has_none = False             # empty row seen after data began?
    indexes = 0                  # 1-based row counter for the error report
    with open(csv_file, 'r') as fp:
        reader = csv.reader(fp)
        for row in reader:
            # bug fix: the counter used to be incremented once per file
            # (outside the loop), so the report always said row 1
            indexes += 1
            if not significative_begin and row != []:
                significative_begin = True
            if not has_none and row == [] and significative_begin:
                has_none = True
            if has_none and significative_begin and row != []:
                # a data row after an interior empty row -> file is invalid
                pinf.CKeyInfo('in row %d data is unsatisfactory' % indexes)
                return False
        return True
예제 #2
0
 def csv_item_seperated(self,
                        file_list_full_path,
                        rate,
                        out_put_file_path,
                        ignore_indexes=None):
     """
     Build a leave-one-out split description for a data set whose items
     are spread over several csv files, and dump it to a json file.

     Each source file stores one chunk of the global item sequence; rows
     are numbered globally across the files in list order.  No file may
     contain an empty row between two data rows (leading/trailing empty
     rows are fine) — this is asserted up front.

     :param file_list_full_path: list of csv file paths, list[str]
     :param rate: leave-out ratio, float (k-part = round(1 / rate))
     :param out_put_file_path: output json file path, str
     :param ignore_indexes: per-file lists of row indexes to skip,
         [[...], [...], ...] — one list per source file; default: none
     :return: None (writes the split info json to out_put_file_path)
     """
     # None sentinel instead of the old mutable [] default argument
     if ignore_indexes is None:
         ignore_indexes = []
     info = dict()
     info['k-part'] = round(1 / rate)
     info['config_file_format'] = self.CsvItemSeperated
     info['ignore_indexes'] = ignore_indexes
     info['source_file_queue'] = file_list_full_path
     info['source_file_data_amount'] = dict()
     pinf.CKeyInfo(
         'all file path: %s\n'
         'sample rate: %f\n'
         'output file path: %s\n'
         'ignore indexes: %s' %
         (file_list_full_path, rate, out_put_file_path, ignore_indexes))
     # every source file must be free of interior empty rows
     assert False not in [
         Ccsv.check_no_empty_between_two_significative_data(i)
         for i in file_list_full_path
     ]
     for path in file_list_full_path:
         info['source_file_data_amount'][path] = Ccsv.count_csv_file_row(path)
     total_amount = sum(
         [info['source_file_data_amount'][path] for path in file_list_full_path])
     # translate the per-file ignore indexes into global row indexes
     ignore_list = list()
     begin = 0
     for i, path in enumerate(file_list_full_path):
         ignore_list += [j + begin for j in ignore_indexes[i]]
         begin += info['source_file_data_amount'][path]
     gather = CFS.n_devide(list(range(0, total_amount)), info['k-part'],
                           ignore_list)
     for i in range(0, info['k-part']):
         info[i] = gather[i]
     # with-statement closes the output file even if serialisation fails
     with open(out_put_file_path, 'w') as fs:
         fs.write(json.dumps(info, indent=4))
예제 #3
0
    assert (False not in [len(i) == 3 for i in result]) is True, \
        pinf.CError('error in n_devide , get item is not 2 : %s' % [len(i) == 3 for i in result])
    a = list(range(0, 12))
    result = CS.n_devide(a, 5, [0])
    assert len(result) == 5, pinf.CError(
        'error in n_devide, get len is not 5 : %d' % len(result))
    assert (False not in [len(i) == 3 for i in result]) is True, \
        pinf.CError('error in n_devide , get item is not 2 : %s' % [len(i) == 3 for i in result])
    return True


def test_n_devide_return_target_data():
    """
    Smoke test for CS.n_devide_return_target_data on a 26-letter list.
    In practice this cannot fail as long as test_n_devide passes.
    :return: True when the call completes without raising
    """
    t = 'asdfghjklqwertyuiopzxcvbnm'
    # list(t) replaces the old manual [t[i] for i in range(...)] build;
    # the returned partition is intentionally not inspected
    CS.n_devide_return_target_data(list(t), 5)
    return True


if __name__ == '__main__':
    # run each self-test in order and log a success banner after each
    _cases = (
        (test_sample_except, 'sample_except'),
        (test_n_devide, 'n_devide'),
        (test_n_devide_return_target_data, 'n_devide_return_target_data'),
    )
    for _test, _label in _cases:
        assert _test() is True
        pinf.CKeyInfo('test %s successfully' % _label)
예제 #4
0
def test_calc_IoU():
    """Exercise Ciou.calc_IoU on one fixed box pair and log the result."""
    true_box = [100, 100, 100, 100]
    pred_box = [50, 50, 100, 100]
    iou = Ciou.calc_IoU(predicted_coordinate=pred_box, true_coordinate=true_box)
    Cinfo.CKeyInfo('truth: %s, predict: %s, IoU: %f' % (true_box, pred_box, iou))
예제 #5
0
def test_csv_item_seperated():
    """
    Exercise LeaveOneOut.csv_item_seperated with two source files and then
    with a single source file; split descriptions are written as json
    under ./data.  Fixed: removed the unreachable pass after return and
    the unused `indexes` rebinding.
    :return: True when both calls complete without raising
    """
    a = LOO.LeaveOneOut()
    # two source files, each with its own ignored-row list
    a.csv_item_seperated(
        [
            './data/test_LeaveOneOut-test_csv_item_seperated1.csv',
            './data/test_LeaveOneOut-test_csv_item_seperated2.csv'
        ],
        rate=0.1,
        out_put_file_path=
        './data/test_LeaveOneOut-test_csv_item_seperated.json',
        ignore_indexes=[[0, 1], [38]])
    # degenerate case: a single source file
    a.csv_item_seperated(
        ['./data/test_LeaveOneOut-test_csv_item_seperated1.csv'],
        rate=0.1,
        out_put_file_path=
        './data/test_LeaveOneOut-test_csv_item_seperated-only_one.json',
        ignore_indexes=[[0, 1]])
    return True


if __name__ == '__main__':
    # Smoke-run the three csv split helpers in sequence.  Note only
    # test_csv_reader_features_seperated's return value is asserted; the
    # other two are run for their side effects (json files under ./data).
    pinf.CKeyInfo('-------------testing csv_features_seperated--------------')
    test_csv_features_seperated()
    pinf.CKeyInfo(
        '-------------testing csv_reader_features_seperated--------------')
    assert test_csv_reader_features_seperated() is True
    pinf.CKeyInfo('successed')
    pinf.CKeyInfo('-------------testing csv_item_seperated--------------')
    test_csv_item_seperated()
    pinf.CKeyInfo('successed')
예제 #6
0
    def csv_features_seperated(self,
                               file_list_full_path,
                               rate,
                               unique_identification,
                               out_put_file_path,
                               type='speed',
                               ignore_indexes=None):
        """
        Build a leave-one-out split description for samples whose features
        are spread over several csv files, and dump it to a json file.

        Each file stores one feature group (e.g. feature a in file a,
        feature b in file b).  The files may order the samples differently,
        so rows are matched across files through a key column whose value
        is unique per sample and identical in every file.  All files must
        contain the same number of rows.

        :param file_list_full_path: list of csv file paths, list[str]
        :param rate: leave-out ratio, float (k-part = round(1 / rate))
        :param unique_identification: index of the key column, int
        :param out_put_file_path: output json file path, str
        :param type: 'speed' loads every key column into memory (fast but
            memory hungry); 'memory' is reserved and not implemented yet
        :param ignore_indexes: row indexes to ignore in the csv files;
            default: none
        :return: None (writes the split info json to out_put_file_path)
        """
        # None sentinel instead of the old mutable [] default argument
        if ignore_indexes is None:
            ignore_indexes = []
        pinf.CKeyInfo('all file path: %s\n'
                      'sample rate: %f\n'
                      'unique identification: %d\n'
                      'output file path: %s\n'
                      'ignore indexes: %s' %
                      (file_list_full_path, rate, unique_identification,
                       out_put_file_path, ignore_indexes))
        info = dict()
        part_num = round(1 / rate)
        info['k-part'] = part_num
        info['config_file_format'] = self.CsvFeaturesSeperated
        info['ignore_indexes'] = ignore_indexes
        if type == 'speed':
            key_info = dict()
            for path in file_list_full_path:
                # with-statement fixes the file-handle leak of the old
                # csv.reader(open(path, 'r')) form, which never closed
                with open(path, 'r') as fp:
                    reader = csv.reader(fp)
                    # key column of this file, keyed by the file path
                    key_info[path] = [
                        row[unique_identification] for row in reader
                    ]
            assert self._check_same_length(key_info) is True, \
                pinf.CError('not all csv file has the same number of data')
            first_file = file_list_full_path[0]
            info['sample_amount'] = len(key_info[first_file])
            # part_num leave-one-out folds with mutually exclusive
            # validation sets, expressed as row indexes of the first file
            index = CFS.n_devide(key_info[first_file],
                                 part=part_num,
                                 except_list=ignore_indexes)
            for i in range(0, part_num):
                info[i] = dict()
                info[i][first_file] = index[i]
                for other in file_list_full_path[1:]:
                    # map each fold row of the first file onto the row
                    # carrying the same key in the other file
                    info[i][other] = [
                        key_info[other].index(key_info[first_file][k])
                        for k in index[i]
                    ]
        elif type == 'memory':
            print('this kind method is not complete!')
            sys.exit()
        else:
            print('illegal type')
            sys.exit()

        # with-statement closes the output file even if dumping fails
        with open(out_put_file_path, 'w') as fs:
            fs.write(json.dumps(info, indent=4))