def get_translated_lines_converter(file_id_to_lines, category_to_translated):
    """Build a mapping from original source lines to their translated lines.

    Args:
        file_id_to_lines (dict[int, list[str]]): maps file_id to its lines;
            each line is formatted as "ID","Unknown","Index","Offset","Text".
        category_to_translated (dict[str, list]): maps category to a list of
            [file_id, unknown, index, text] translation records.

    Returns:
        dict[str, str]: maps each original line to its translated line.
    """
    # NOTE(review): this function is defined twice in this file; the later
    # definition shadows this one — deduplicate.
    translated_count_dry = 0  # total records read (original comment claimed
                              # "excluding duplicates" — TODO confirm intent)
    # Accumulated {"en_line": "zh_line"} mapping.
    en_line_to_zh_line = {}
    # file_ids already handled (collected but not used further here).
    translated_file_ids = []

    # Iterate the data read from each xls, in stable category order.
    for category, translated_data in sorted(category_to_translated.items()):
        translated_count_dry += len(translated_data)

        # Resolve candidate file_ids for this category from the
        # module-level registries.
        if category in file_id_of_list:
            possible_file_ids = file_id_of_list[category]
        elif category in file_id_of_array:
            possible_file_ids = file_id_of_array[category]
        elif category in file_id_of_pair:
            possible_file_ids = file_id_of_pair[category]
        else:
            possible_file_ids = []
        translated_file_ids.extend(possible_file_ids)

        # Collect every source line belonging to the candidate files.
        possible_lines = []
        for file_id in possible_file_ids:
            file_id = int(file_id)
            if file_id in file_id_to_lines:
                possible_lines.extend(file_id_to_lines[file_id])

        # Translate this category's lines and merge into the result.
        en_line_to_zh_line = merge_dict(
            en_line_to_zh_line,
            get_en_line_to_zh_line(possible_lines, translated_data))

    log.info('%d(%d) lines translated' % (translated_count_dry, len(en_line_to_zh_line)))
    return en_line_to_zh_line
def get_translated_lines_converter(file_id_to_lines, category_to_translated):
    """Build a mapping from original source lines to their translated lines.

    Args:
        file_id_to_lines (dict[int, list[str]]): maps file_id to its lines;
            each line is formatted as "ID","Unknown","Index","Offset","Text".
        category_to_translated (dict[str, list]): maps category to a list of
            [file_id, unknown, index, text] translation records.

    Returns:
        dict[str, str]: maps each original line to its translated line.
    """
    # NOTE(review): this function is defined twice in this file; this later
    # definition shadows the earlier one — deduplicate.
    translated_count_dry = 0  # total records read (original comment claimed
                              # "excluding duplicates" — TODO confirm intent)
    # Accumulated {"en_line": "zh_line"} mapping.
    en_line_to_zh_line = {}
    # file_ids already handled (collected but not used further here).
    translated_file_ids = []

    # Iterate the data read from each xls, in stable category order.
    for category, translated_data in sorted(category_to_translated.items()):
        translated_count_dry += len(translated_data)

        # Resolve candidate file_ids for this category from the
        # module-level registries.
        if category in file_id_of_list:
            possible_file_ids = file_id_of_list[category]
        elif category in file_id_of_array:
            possible_file_ids = file_id_of_array[category]
        elif category in file_id_of_pair:
            possible_file_ids = file_id_of_pair[category]
        else:
            possible_file_ids = []
        translated_file_ids.extend(possible_file_ids)

        # Collect every source line belonging to the candidate files.
        possible_lines = []
        for file_id in possible_file_ids:
            file_id = int(file_id)
            if file_id in file_id_to_lines:
                possible_lines.extend(file_id_to_lines[file_id])

        # Translate this category's lines and merge into the result.
        en_line_to_zh_line = merge_dict(
            en_line_to_zh_line,
            get_en_line_to_zh_line(possible_lines, translated_data))

    log.info('%d(%d) lines translated' % (translated_count_dry, len(en_line_to_zh_line)))
    return en_line_to_zh_line
def merge_pipeline(self, target):
    """Return a new Pipeline whose config is this pipeline's JSON merged with *target*.

    Lists are concatenated, nested dicts are combined via merge_dict, and
    any other value in *target* overwrites the existing one.
    """
    combined = self.to_json().copy()
    for key, incoming in target.items():
        if key not in combined:
            combined[key] = incoming
        elif isinstance(incoming, list):
            combined[key] = combined[key] + incoming
        elif isinstance(incoming, dict):
            combined[key] = merge_dict(combined[key], incoming)
        else:
            combined[key] = incoming
    return Pipeline(combined)
def chart_ocr(img, word_bbox_role, tool, pad=False):
    """OCR every word bbox in *word_bbox_role* and attach the recognized text.

    Args:
        img: 2-D image array indexed as img[row, col] — assumed grayscale
            with values in [0, 1] (judging by the commented-out save code);
            TODO confirm.
        word_bbox_role (dict): idx -> {'bbox': (minr, minc, maxr, maxc),
            'role': ...}; each entry gains a 'txt' key (mutated in place).
        tool: OCR backend handle forwarded to ocr().
        pad (bool): if True, center the crop in a ones-valued canvas twice
            its size; otherwise crop with a fixed 2-pixel margin.

    Returns:
        dict: word_bbox_role, merged with any extra entries produced by
        ocr_postprocess.
    """
    word_bbox_except_len = len(word_bbox_role)
    word_bbox_unexcept = {}
    for idx, prop in word_bbox_role.items():
        minr, minc, maxr, maxc = prop['bbox']
        if pad:
            h, w = maxr - minr, maxc - minc
            word_img = np.ones((2 * h, 2 * w))
            word_img[h // 2:h // 2 + h, w // 2:w // 2 + w] = img[minr:maxr, minc:maxc]
        else:
            pad_size = 2
            # Clamp the lower bounds at 0: a bbox near the image edge would
            # otherwise produce a negative index, which wraps around and
            # yields an empty/garbage crop.
            word_img = img[max(0, minr - pad_size):maxr + pad_size,
                           max(0, minc - pad_size):maxc + pad_size]
        # im = Image.fromarray((word_img * 255.0).astype('uint8'), mode='L')
        # im.save('./word_bbox_%d.png' % idx)

        # Majority vote over rotation-augmented OCR results.
        txt_cand = {}
        for word_img_aug in ocr_image_preprocess(word_img, rotate_aug=True):
            if word_img_aug is None:
                continue
            txt_tmp = ocr(tool, word_img_aug, 'txt')
            if txt_tmp != '':
                txt_cand[txt_tmp] = txt_cand.get(txt_tmp, 0) + 1

        if not txt_cand:
            word_bbox_role[idx]['txt'] = 'UNKNOWN'
        else:
            # First candidate reaching the maximum vote count (same
            # tie-break as the original list().index() lookup).
            ocr_voted = max(txt_cand, key=txt_cand.get)
            # NOTE(review): prop['bbox'] is passed twice — confirm that is
            # intentional rather than a copy/paste slip.
            word_bbox_unexcept, ocr_voted, add_flag = \
                ocr_postprocess(ocr_voted, prop['bbox'], prop['bbox'],
                                word_bbox_unexcept, prop['role'],
                                word_bbox_except_len)
            if not add_flag:
                word_bbox_role[idx]['txt'] = ocr_voted

    if word_bbox_unexcept:
        word_bbox_role = utils.merge_dict(word_bbox_role, word_bbox_unexcept)
    return word_bbox_role
def get_sum(self):
    """Aggregate the statistics of every entry in self.result_list.

    Returns:
        dict: element-wise sums of the take/record/live counter lists,
        merged 'wb'/'aaa_mode' option maps, concatenated record/live
        option lists, and the combined timelapse options.
    """
    take_count = [0] * 8
    record_count = [0] * 11
    live_count = [0] * 8
    option_count = {
        'wb': {},
        'aaa_mode': {}
    }
    record_option = []
    live_option = []
    timelapse_option = {
        'option': [],
        'interval': {}
    }

    for item in self.result_list:
        # Element-wise sums over the fixed-length counter lists (zip caps
        # at 8/11/8 elements, matching the original indexed loops).
        take_count = [a + b for a, b in zip(take_count, item['take_count'])]
        record_count = [a + b for a, b in zip(record_count, item['record_count'])]
        live_count = [a + b for a, b in zip(live_count, item['live_count'])]
        # Return values are discarded, so this relies on merge_dict
        # mutating its first argument in place — TODO confirm.
        merge_dict(option_count['wb'], item['option_count']['wb'])
        merge_dict(option_count['aaa_mode'], item['option_count']['aaa_mode'])
        record_option.extend(item['record_option'])
        live_option.extend(item['live_option'])
        timelapse_option['option'].extend(item['timelapse_option']['option'])
        merge_dict(timelapse_option['interval'], item['timelapse_option']['interval'])

    return {
        'take_count': take_count,
        'record_count': record_count,
        'live_count': live_count,
        'option_count': option_count,
        'record_option': record_option,
        'live_option': live_option,
        'timelapse_option': timelapse_option,
    }
def bboxes_postprocess(bboxes, fig_type):
    """Add bboxes for legend packers and axis lines; tighten axis/tick bboxes.

    Tick/axis bboxes are redefined using the tickline edges (removing the
    gridline extent), and per-point locations are extracted from line paths.

    Args:
        bboxes (dict): element-id -> 4-element bbox list (merged via
            min/min/max/max, i.e. [x0, y0, x1, y1] — TODO confirm axis
            order); drawing-object entries may instead hold an SVG path
            string for line/area charts.
        fig_type (str): chart kind, e.g. 'Line_chart', 'Area_chart',
            'Pie_chart'.

    Returns:
        dict: bboxes merged with the new packer and per-point path entries.
    """
    def bbox_merge(bbox_cur, bbox_add):
        # Union of two boxes.
        return [min(bbox_cur[0], bbox_add[0]), min(bbox_cur[1], bbox_add[1]),
                max(bbox_cur[2], bbox_add[2]), max(bbox_cur[3], bbox_add[3])]

    bboxes_packer = {}
    bboxes_path = {}
    for element_type, element_bbox in bboxes.items():
        if FID.LEGEND_SYMBOL_ID in element_type:
            # Legend packer bbox = union of the symbol and its text label.
            idx = element_type.split('_')[-1]
            bboxes_packer[FID.LEGEND_PACKER_ID + idx] = bbox_merge(
                element_bbox, bboxes[FID.LEGEND_TEXT_ID + idx])
        if FID.DRAWING_OBJECT_ID in element_type:
            if (fig_type == 'Line_chart' and 'path' in element_type) or \
                    (fig_type == 'Area_chart' and 'line_path' in element_type):
                # Record the location of every point in the line's SVG path.
                # (Was element_bbox.encode("utf-8").split(' '); on Python 3
                # that splits bytes with a str separator -> TypeError.)
                bbox_path = {}
                path_splits = element_bbox.split(' ')
                path_splits.remove('')  # the last element in path
                for path_idx, path_split in enumerate(path_splits):
                    # NOTE(review): after the outer split, path_split cannot
                    # contain a space, so this inner split looks like a
                    # no-op — confirm the intended path token format.
                    path_cur = path_split.split(' ')
                    assert path_cur[0] in ['M', 'L', 'z'], \
                        ' !! \nError: unexpected type: {} in svg path'.format(path_cur[0])
                    point_loc = path_cur[1:]
                    if point_loc:  # remove the 'z' element (just in case)
                        bbox_path[element_type + '_%d' % path_idx] = list(map(float, point_loc))
                bboxes_path[element_type] = bbox_path
        if FID.X_AXIS_ID == element_type:
            # Use top of tickline as axis bbox (offset label included).
            element_bbox[1] = bboxes[FID.X_AXIS_MAJOR_TICKLINE_ID + str(1)][1]
            if FID.X_AXIS_OFFSET_ID in bboxes:
                # In-place update: the original rebound the loop variable,
                # silently discarding the merged result.
                element_bbox[:] = bbox_merge(element_bbox, bboxes[FID.X_AXIS_OFFSET_ID])
        if FID.Y_AXIS_ID == element_type:
            # Use right of tickline as axis bbox (offset label included).
            element_bbox[2] = bboxes[FID.Y_AXIS_MAJOR_TICKLINE_ID + str(1)][2]
            if FID.Y_AXIS_OFFSET_ID in bboxes:
                element_bbox[:] = bbox_merge(element_bbox, bboxes[FID.Y_AXIS_OFFSET_ID])
        if FID.X_AXIS_MAJOR_TICK_ID in element_type:
            # Use top of tickline as tick bbox.
            element_bbox[1] = bboxes[FID.X_AXIS_MAJOR_TICKLINE_ID + element_type.split('_')[-1]][1]
        if FID.X_AXIS_MINOR_TICK_ID in element_type:
            element_bbox[1] = bboxes[FID.X_AXIS_MINOR_TICKLINE_ID + element_type.split('_')[-1]][1]
        if FID.Y_AXIS_MAJOR_TICK_ID in element_type:
            # Use right of tickline as tick bbox.
            element_bbox[2] = bboxes[FID.Y_AXIS_MAJOR_TICKLINE_ID + element_type.split('_')[-1]][2]
        if FID.Y_AXIS_MINOR_TICK_ID in element_type:
            element_bbox[2] = bboxes[FID.Y_AXIS_MINOR_TICKLINE_ID + element_type.split('_')[-1]][2]

    if fig_type != 'Pie_chart':  # was `is not`: identity test on a str literal
        # Add axis-line bboxes. Copy (not alias) the axis bbox first: the
        # original assigned the same list object and then mutated index 3/0,
        # clobbering the axis bbox as well.
        x_line = list(bboxes[FID.X_AXIS_ID])
        x_line[3] = bboxes[FID.X_AXIS_MAJOR_TICKLINE_ID + str(1)][3]
        bboxes[FID.X_AXIS_LINE_ID] = x_line
        y_line = list(bboxes[FID.Y_AXIS_ID])
        y_line[0] = bboxes[FID.Y_AXIS_MAJOR_TICKLINE_ID + str(1)][0]
        bboxes[FID.Y_AXIS_LINE_ID] = y_line
    return merge_dict(merge_dict(bboxes, bboxes_packer), bboxes_path)
def clone(self):
    """Create an independent Step from this step's JSON representation."""
    json_snapshot = self.to_json()
    return Step(merge_dict({}, json_snapshot))
def clone(self):
    """Create an independent Pipeline from this pipeline's JSON representation."""
    json_snapshot = self.to_json()
    return Pipeline(merge_dict({}, json_snapshot))