def write(self, ob, *, encoding='utf8', if_exists=None, mode=None, **kwargs):
    """ Save content to a file.

    :param ob: the content to write
        When writing a txt file and ob is not a string, it is simply converted with str()
    :param encoding: the encoding to write with
        If the original file exists and has a detectable encoding, that encoding is used;
        otherwise utf8 is the default.
        (Some formats, e.g. pkl, do not use the encoding at all.)
    :param if_exists: what to do when the file already exists
    :param mode: write mode (e.g. '.json'); inferred from the file extension by default,
        but can be forced
    :param kwargs: extra options for json output
        ensure_ascii: json.dump defaults to True, but the default here is False,
            so that Chinese text is stored as readable plain text
        indent: follows json.dump's default of None
            (it used to be 2 here, which makes the file structure clearer and easier to read)
    :return: the written file; mainly useful when writing temporary files
    """
    if self.exist_preprcs(if_exists):
        self.ensure_parent()
        name, suffix = str(self), self.suffix
        if not mode:
            mode = suffix
        mode = mode.lower()
        if mode == '.pkl':
            with open(name, 'wb') as f:
                pickle.dump(ob, f)
        elif mode == '.json':
            with open(name, 'w', encoding=encoding) as f:
                DictTool.ior(kwargs, {'ensure_ascii': False})
                json.dump(ob, f, **kwargs)
        elif mode == '.yaml':
            with open(name, 'w', encoding=encoding) as f:
                yaml.dump(ob, f)
        elif isinstance(ob, bytes):
            with open(name, 'wb') as f:
                f.write(ob)
        else:  # any other type is treated as text
            with open(name, 'w', errors='ignore', encoding=encoding) as f:
                f.write(str(ob))
    return self
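# A hedged usage sketch for write(); the file names below are hypothetical, and this
# assumes the surrounding File class with pickle/json/yaml imported at module level:
#   File('demo.json').write({'中文': 'ok'})       # ensure_ascii=False by default, Chinese stays readable
#   File('demo.json').write({'a': 1}, indent=2)   # json.dump options pass through **kwargs
#   File('demo.pkl').write([1, 2, 3])             # mode inferred from the '.pkl' suffix
#   File('demo.txt').write(123)                   # non-text objects are simply str()-ed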
def write(self, loc):
    f = File(loc, self.root, suffix='.json')
    imfile = self.imfiles[loc]
    lmdict = LabelmeDict.gen_data(imfile)
    for label, ann in self.data[loc].items():
        a = ann.copy()
        DictTool.isub(a, ['img'])
        shape = LabelmeDict.gen_shape(json.dumps(a, ensure_ascii=False),
                                      a['points'], a['shape_type'],
                                      group_id=a['group_id'], flags=a['flags'])
        lmdict['shapes'].append(shape)
    f.write(lmdict, indent=2)
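# A sketch of the annotation layout this write() expects (keys taken from the loop
# above, values hypothetical); the transient 'img' entry is stripped before dumping:
#   self.data[loc] = {
#       'label1': {'points': [[0, 0], [10, 10]], 'shape_type': 'rectangle',
#                  'group_id': None, 'flags': {}, 'img': ...},
#   }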
def update_labelattr(cls, lmdict, *, points=False, inplace=True):
    """
    :param points: whether to refresh the geometric info (points, bbox, etc.) inside labelattr,
        adding a points entry when no geometric info is present at all
    """
    if not inplace:
        lmdict = copy.deepcopy(lmdict)

    for shape in lmdict['shapes']:
        # 1 attribute dict; at minimum, initialize a label attribute
        labelattr = DictTool.json_loads(shape['label'], 'label')
        # 2 fold the other extended attributes into labelattr
        keys = set(shape.keys())
        stdkeys = set('label,points,group_id,shape_type,flags'.split(','))
        for k in (keys - stdkeys):
            labelattr[k] = shape[k]
            del shape[k]  # remove the original extended field from the shape
        # 3 handle geometric info such as points
        if points:
            if 'bbox' in labelattr:
                labelattr['bbox'] = ltrb2xywh(rect_bounds(shape['points']))
            else:
                labelattr['points'] = shape['points']
        # + write the result back into the shape
        shape['label'] = json.dumps(labelattr, ensure_ascii=False)
    return lmdict
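# A minimal before/after sketch for update_labelattr on one shape (values hypothetical):
#   before: {'label': '{"text": "abc"}', 'points': [[0, 0], [10, 10]],
#            'shape_type': 'rectangle', 'group_id': None, 'flags': {}, 'score': 0.9}
#   after (points=True): the non-standard key 'score' is folded into the label json,
#   and since there is no 'bbox' a 'points' copy is added:
#           {'label': '{"text": "abc", "score": 0.9, "points": [[0, 0], [10, 10]]}', ...}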
def details(self):
    """ Return a detailed comparison table.

    Rows list every key, columns list every dict, and each cell holds that dict's value for the key.
    The result can be saved to excel for finer filtering and analysis.

    >>> dc = DictCmper({'d1': {'a': 1, 'b': 2}, 'd2': {'b': 3, 'e': 5}, 'd3': {'d': 4}})
    >>> dc.details()
        d1   d2   d3
    a  1.0  NaN  NaN
    b  2.0  3.0  NaN
    e  NaN  5.0  NaN
    d  NaN  NaN  4.0
    """
    # 1 collect all keys
    # A set would not preserve element order, so merge the dicts instead.
    # (The third-party orderedset package would also work, but this avoids the dependency.)
    keys = DictTool.or_(*self.dicts.values()).keys()
    # 2 fetch every dict's value for each key
    ls = []
    for k in keys:
        ls.append([(d[k] if k in d else np.nan) for d in self.dicts.values()])
    # 3 build the DataFrame
    df = pd.DataFrame.from_records(ls, columns=self.dicts.keys())
    df.index = keys
    return df
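# As the docstring notes, the result is a plain DataFrame, so exporting it to excel
# for filtering is a one-liner (output path hypothetical):
#   dc.details().to_excel('cmp.xlsx')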
def to_labelmes_advance():
    m = 50  # length of preceding context used when matching a run; longer is stricter

    # 1 store runs that carry underline/color, keyed by a special hash
    content = []  # the text seen so far, used as the hash source
    elements = {}
    for p in self.paragraphs:
        for r in p.runs:
            # Whitespace has to be removed or the texts may fail to line up; strip()
            # proved insufficient because paragraph breaks differ, so all whitespace
            # is deleted outright.
            content.append(re.sub(r'\s+', '', r.text))
            if r.underline or is_color(r.font.color.rgb):  # store runs with underline or color
                # print(r.text + ',', r.underline, r.font.color.rgb, ''.join(content))
                # Hashing all characters would be too strict, so only a trailing window is used.
                etag = get_etag(''.join(content)[-m:])
                elements[etag] = r

    # 2 for 'span' shapes in the json annotations, check whether their hash has a match;
    #   if so, set the extended attributes on that shape
    content = ''
    for i, file in enumerate(imfiles):
        page = doc.load_page(i)
        lmdict = LabelmeDict.gen_data(file)
        lmdict['shapes'] = page.get_labelme_shapes('dict', views=views, scale=scale)
        for sp in lmdict['shapes']:
            attrs = DictTool.json_loads(sp['label'], 'label')
            if attrs['category_name'] == 'span':
                content += re.sub(r'\s+', '', attrs['text'])
                etag = get_etag(content[-m:])
                # print(content)
                if etag in elements:
                    # print(content)
                    r = elements[etag]  # the matching original run object
                    attrs = DictTool.json_loads(sp['label'])
                    x = r.underline
                    if x:
                        attrs['underline'] = int(x)
                    x = r.font.color.rgb
                    if is_color(x):
                        attrs['color'] = list(x)
                    sp['label'] = json.dumps(attrs)
        file.with_suffix('.json').write(lmdict, indent=indent)
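# The matching trick in one line: both passes build a running, whitespace-free text
# stream and hash only its last m characters, so a span matches a run whenever their
# recent context agrees. A hedged sketch of the idea, with a plain hash as a stand-in
# for get_etag:
#   key = hash(''.join(stream)[-m:])  # m trades strictness against robustness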
def parse_shape(cls, shape, image=None):
    """ Parse one shape's data into a dict """
    # 1 parse the original label into a dict
    attrs = DictTool.json_loads(shape['label'], 'label')
    attrs.update(DictTool.sub(shape, ['label']))

    # 2 center point
    shape_type = shape['shape_type']
    pts = shape['points']
    if shape_type in ('rectangle', 'polygon', 'line'):
        attrs['center'] = np.array(np.array(pts).mean(axis=0), dtype=int).tolist()
    elif shape_type == 'circle':
        attrs['center'] = pts[0]

    # 3 bounding rectangle, ltrb
    if shape_type in ('rectangle', 'polygon'):
        attrs['ltrb'] = np.array(pts, dtype=int).reshape(-1).tolist()
    elif shape_type == 'circle':
        x, y = pts[0]
        r = ((x - pts[1][0]) ** 2 + (y - pts[1][1]) ** 2) ** 0.5
        attrs['ltrb'] = [round_int(v) for v in [x - r, y - r, x + r, y + r]]

    # 4 image data: img, etag
    if image is not None and attrs.get('ltrb'):  # .get(): not every shape_type sets ltrb
        attrs['img'] = xlcv.get_sub(image, attrs['ltrb'])
        # attrs['etag'] = get_etag(attrs['img'])
        # TODO the goal here is that identical crops compare as similar,
        #  so a dhash would fit better than an etag

    # 5 pixel value at the center point
    p = attrs.get('center')  # .get(): not every shape_type sets center
    if image is not None and p:
        attrs['pixel'] = tuple(image[p[1], p[0]].tolist()[::-1])

    # if 'rect' in attrs:
    #     del attrs['rect']  # legacy-format field, remove

    return attrs
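# A hedged example for the circle branch (values hypothetical, called via the owning
# class, written here as SomeCls): center (50, 50) and boundary point (80, 50) give
# r = 30, hence ltrb = [20, 20, 80, 80]:
#   attrs = SomeCls.parse_shape({'label': '{}', 'shape_type': 'circle',
#                                'points': [[50, 50], [80, 50]]})
#   attrs['center']  # [50, 50]
#   attrs['ltrb']    # [20, 20, 80, 80]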
def add_shape(name, refdict, add_keys, drop_keys=('bbox',)):
    """ Create one annotation box """
    msgdict = {'category_name': name}
    msgdict.update(add_keys)
    DictTool.ior(msgdict, refdict)
    DictTool.isub(msgdict, drop_keys)
    bbox = [round_int(v * scale) for v in refdict['bbox']]
    if 'size' in msgdict:
        x = round_unit(msgdict['size'], 0.5)
        # prefer displaying 11 over 11.0 when there is no fractional part
        msgdict['size'] = round_int(x) if (x * 10) % 10 < 1 else x
    if 'color' in msgdict:
        # map color to an intuitive (r, g, b) triple
        # The color this pdf parser extracts is not guaranteed to equal the original
        # value exactly; it may be off by about one per channel.
        v = msgdict['color']
        msgdict['color'] = (v // 256 // 256, (v // 256) % 256, v % 256)
    if 'origin' in msgdict:
        msgdict['origin'] = [round_int(v) for v in msgdict['origin']]
    sp = LabelmeDict.gen_shape(json.dumps(msgdict), bbox)
    shapes.append(sp)
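# The color mapping above unpacks a packed 24-bit integer into (r, g, b). For example,
# 0xFF8000 = 16744448 gives 16744448 // 256 // 256 = 255, (16744448 // 256) % 256 = 128,
# and 16744448 % 256 = 0, i.e. (255, 128, 0).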
def to_coco_gt_dict(self, categories=None):
    """ Convert labelme data to the coco gt annotation format.

    There are two main scenarios:
    1. Raw data was annotated in labelme and is converted to coco for the first time.
       Ids and other bookkeeping values can be generated from scratch here.
       raw_data --visualize--> labelme --convert--> coco
    2. The data was originally coco, was converted to labelme for manual edits, and now
       has to be converted back; original values should be preserved wherever possible.
       coco --> labelme --manual edits--> labelme' --> coco'
       The coco-to-labelme conversion stores special markers to make this round trip work.
    3. Scenarios 1 and 2 can be chained, so labelme and coco can be converted back and
       forth repeatedly.

    :param categories: category list
        Defaults to a single category {'id': 0, 'name': 'text', 'supercategory': ''}.
        Custom categories are supported; each annotation's category_id should refer to one of them.
    :return: gt_dict
        If you care about file or annotation order, rearrange self.data before calling
        this function. If you need control over image_id / annotation_id, post-process
        the result with CocoData.
    """
    from pyxllib.data.coco import CocoGtData

    if not categories:
        if 'categories' in self.extdata:
            # labelme converted from coco keeps the original categories
            categories = self.extdata['categories']
        else:
            categories = [{'id': 0, 'name': 'text', 'supercategory': ''}]

    # 1 first pass, structure: jsonfile, lmdict --> data(image, shapes)
    img_id, ann_id, data = 0, 0, []
    for jsonfile, lmdict in self.rp2data.items():
        # 1.0 upgrade every label to dict form
        lmdict = LabelmeDict.update_labelattr(lmdict, points=True)
        for sp in lmdict['shapes']:
            sp['label'] = json.loads(sp['label'])

        # 1.1 look for the image entry among the shapes
        image = None
        # 1.1.1 xltype='image' (update_labelattr folded xltype into the label dict)
        for sp in filter(lambda x: x['label'].get('xltype', None) == 'image', lmdict['shapes']):
            image = sp['label'].copy()
            if not image:
                raise ValueError(sp['label'])
            # TODO also drop fields such as coco_eval?
            del image['xltype']
            break
        # 1.1.2 no image-level shape found, so generate one
        if image is None:
            # TODO prepend the relative path to file_name?
            image = CocoGtData.gen_image(-1, lmdict['imagePath'],
                                         lmdict['imageHeight'], lmdict['imageWidth'])
        img_id = max(img_id, image.get('id', -1))

        # 1.2 iterate over the shapes
        shapes = []
        for sp in lmdict['shapes']:
            label = sp['label']
            if 'xltype' not in label:
                # an ordinary annotation box
                d = sp['label'].copy()
                # DictTool.isub_(d, '')
                ann_id = max(ann_id, d.get('id', -1))
                shapes.append(d)
            elif label['xltype'] == 'image':
                # image-level data; already handled above, skip
                pass
            elif label['xltype'] == 'seg':
                # derived segmentation boxes are dropped when converting back to coco
                pass
            else:
                raise ValueError

        data.append([image, shapes])

    # 2 second pass: resolve the ids
    images, annotations = [], []
    for image, shapes in data:
        # 2.1 image
        if image.get('id', -1) == -1:
            img_id += 1
            image['id'] = img_id
        images.append(image)

        # 2.2 annotations
        for sp in shapes:
            sp['image_id'] = image['id']
            if sp.get('id', -1) == -1:
                ann_id += 1
                sp['id'] = ann_id
            # Without a category, a default is set. (Calling code is strongly advised to
            # set category_id itself.)
            if 'category_id' not in sp:
                sp['category_id'] = categories[0]['id']
            DictTool.isub(sp, ['category_name'])
            ann = CocoGtData.gen_annotation(**sp)
            annotations.append(ann)

    # 3 result
    gt_dict = CocoGtData.gen_gt_dict(images, annotations, categories)
    return gt_dict
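# A hedged round-trip sketch combining the two scenarios in the docstring (variable
# names hypothetical; to_labelme_cls is the coco-side converter shown further below):
#   ds = coco_data.to_labelme_cls('images/')  # coco -> labelme, categories kept in extdata
#   ...                                       # manual edits in labelme
#   gt_dict2 = ds.to_coco_gt_dict()           # labelme -> coco, original ids preserved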
def write_json(self, data, encoding='utf8', **kwargs):
    with open(self, 'w', encoding=encoding) as f:
        DictTool.ior(kwargs, {'ensure_ascii': False})
        json.dump(data, f, **kwargs)
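# Usage sketch (path hypothetical); any extra kwargs go straight to json.dump, with
# ensure_ascii defaulting to False so Chinese text stays readable:
#   File('cfg.json').write_json({'名称': 'pyxllib'}, indent=2)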
def to_labelme_cls(self, root, *, bbox=True, seg=False, info=False):
    """
    :param root: image root directory
    :return: the extdata field records any matching anomalies
    """
    root, data = Dir(root), {}
    catid2name = {x['id']: x['name'] for x in self.gt_dict['categories']}

    # 1 preparation: build a filename index
    gs = PathGroups.groupby(root.select_files('**/*'))

    # 2 iterate and generate the labelme data
    not_finds = set()  # images listed in coco but not found under root
    multimatch = dict()  # images in coco that match multiple files under root
    for img, anns in tqdm(self.group_gt(reserve_empty=True), disable=not info):
        # 2.1 file matching
        imfiles = gs.find_files(img['file_name'])
        if not imfiles:  # no matching image; skip this entry
            not_finds.add(img['file_name'])
            continue
        elif len(imfiles) > 1:
            multimatch[img['file_name']] = imfiles
            imfile = imfiles[0]
        else:
            imfile = imfiles[0]

        # 2.2 content conversion
        lmdict = LabelmeDict.gen_data(imfile)
        img = DictTool.or_(img, {'xltype': 'image'})
        lmdict['shapes'].append(LabelmeDict.gen_shape(json.dumps(img, ensure_ascii=False),
                                                      [[-10, 0], [-5, 0]]))
        for ann in anns:
            if bbox:
                ann = DictTool.or_(ann, {'category_name': catid2name[ann['category_id']]})
                label = json.dumps(ann, ensure_ascii=False)
                shape = LabelmeDict.gen_shape(label, xywh2ltrb(ann['bbox']))
                lmdict['shapes'].append(shape)
            if seg:
                # show the segmentation as well (in gray)
                for x in ann['segmentation']:
                    an = {'box_id': ann['id'], 'xltype': 'seg',
                          'shape_color': [191, 191, 191]}
                    label = json.dumps(an, ensure_ascii=False)
                    lmdict['shapes'].append(LabelmeDict.gen_shape(label, x))

        f = imfile.with_suffix('.json')
        data[f.relpath(root)] = lmdict

    return LabelmeDataset(root, data,
                          extdata={'categories': self.gt_dict['categories'],
                                   'not_finds': not_finds,
                                   'multimatch': Groups(multimatch)})
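# A hedged usage sketch (paths and the instance name hypothetical): convert coco
# annotations to a labelme dataset rooted at the image directory, then inspect the
# matching anomalies recorded in extdata:
#   ds = coco_data.to_labelme_cls('images/', bbox=True, seg=False, info=True)
#   ds.extdata['not_finds']   # file_names referenced by coco but missing on disk
#   ds.extdata['multimatch']  # file_names that matched more than one file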