def test_to_list(self):
    """Check that ``utils.to_list`` returns the same list for several input kinds.

    The same records are passed as a list, a tuple, a generator and a pandas
    DataFrame; each call must compare equal to the original list of dicts.
    """
    expected = [
        {"x0": 0, "top": 0, "x1": 20, "bottom": 20},
        {"x0": 10, "top": 10, "x1": 15, "bottom": 15},
    ]

    # Plain list.
    assert utils.to_list(expected) == expected
    # Tuple of the same records.
    assert utils.to_list(tuple(expected)) == expected
    # One-shot generator over the records.
    assert utils.to_list((o for o in expected)) == expected
    # DataFrame built from the records.
    assert utils.to_list(pd.DataFrame(expected)) == expected
def extract_text(
    chars,
    x_tolerance=utils.DEFAULT_X_TOLERANCE,
    y_tolerance=utils.DEFAULT_Y_TOLERANCE,
):
    """Collate ``chars`` into a single string, one segment per text line.

    Args:
        chars: Sized collection of character objects; it is passed through
            ``utils.to_list`` before use.
        x_tolerance: Forwarded to ``collate_line`` for every line.
        y_tolerance: Tolerance used when clustering characters on "doctop".

    Returns:
        ``None`` when ``chars`` is empty, otherwise the collated lines joined
        with the ``"|&|"`` separator.
    """
    # Explicit length check rather than truthiness: the input is handed to
    # utils.to_list, so it is not necessarily a plain list.
    if len(chars) == 0:
        return None

    char_list = utils.to_list(chars)
    rows = utils.cluster_objects(char_list, "doctop", y_tolerance)
    return "|&|".join(collate_line(row, x_tolerance) for row in rows)
def segment_to_lines(segments, x_tolerance=X_TOLERANCE):
    """Split every segment that contains chars into lines.

    Args:
        segments: Iterable of segments. A segment that is a non-empty list is
            treated as a list of chars; anything else is passed through.
        x_tolerance: Forwarded to ``collate_line_chars`` for each cluster.

    Returns:
        A list with one entry per input segment: either the untouched segment
        (non-list or empty list) or the list of collated lines built from it.
    """
    segment_lines = []
    for segment in segments:
        # Non-list segments and empty lists are kept exactly as they are.
        if not isinstance(segment, list) or len(segment) == 0:
            segment_lines.append(segment)
            continue

        clusters = cluster_chars(to_list(segment))
        collated = []
        for cluster in clusters:
            collated.append(collate_line_chars(cluster, x_tolerance))
        segment_lines.append(collated)
    return segment_lines
def extract_words(
    page,
    x_tolerance=DEFAULT_X_TOLERANCE,
    y_tolerance=DEFAULT_Y_TOLERANCE,
    keep_blank_chars=False,
):
    """Group the characters of ``page`` into words.

    Characters are clustered into lines on "doctop" (within ``y_tolerance``),
    exact duplicates are removed from every line, characters that start less
    than 1 unit after their predecessor are dropped, and each line is then
    split into words at whitespace characters and at horizontal gaps larger
    than ``x_tolerance``.

    Args:
        page: Object exposing a ``chars`` attribute accepted by ``to_list``.
        x_tolerance: Largest gap between one char's "x1" and the next char's
            "x0" that still keeps both in the same word.
        y_tolerance: Tolerance used when clustering characters on "doctop".
        keep_blank_chars: When true, whitespace characters are kept inside
            words instead of acting as word separators.

    Returns:
        A flat list of word dicts with the keys "x0", "x1", "top", "bottom",
        "text" and "chars". The chars inside each word are JSON round-tripped
        copies of the originals with float values converted to ``Decimal``.
    """
    x_tolerance = decimalize(x_tolerance)
    y_tolerance = decimalize(y_tolerance)

    def process_word_chars(chars):
        # A word is its chars' bounding box plus their concatenated text.
        x0, top, x1, bottom = objects_to_bbox(chars)
        return {
            "x0": x0,
            "x1": x1,
            "top": top,
            "bottom": bottom,
            "text": "".join(map(itemgetter("text"), chars)),
            "chars": chars,
        }

    def restore_decimals(obj):
        # JSON decoding yields floats; turn them back into Decimal via str().
        return {
            key: Decimal(str(value)) if isinstance(value, float) else value
            for key, value in obj.items()
        }

    def make_set_clusters(doctop_cluster):
        # Remove exact duplicate chars from each cluster. Chars are dicts and
        # therefore unhashable, so they are compared by their JSON form.
        new_clusters = []
        for cluster in doctop_cluster:
            serialized = [simplejson.dumps(obj) for obj in cluster]
            # dict.fromkeys de-duplicates while keeping first-seen order
            # (guaranteed on Python 3.7+); a set would make the order, and so
            # the survivor among overlapping chars, vary between runs.
            unique = dict.fromkeys(serialized)
            new_clusters.append(
                [restore_decimals(simplejson.loads(item)) for item in unique]
            )
        return new_clusters

    def check_two_chars(char1, char2):
        # True when the two chars start at least 1 unit apart horizontally.
        return abs(char1["x0"] - char2["x0"]) >= 1

    def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE):
        get_text = itemgetter("text")
        ordered = sorted(chars, key=itemgetter("x0"))
        # Drop every char that starts within 1 unit of its immediate
        # predecessor in x0 order (the predecessor in the unfiltered list).
        kept = [
            char
            for index, char in enumerate(ordered)
            if index == 0 or check_two_chars(char, ordered[index - 1])
        ]

        words = []
        current_word = []
        for char in kept:
            if not keep_blank_chars and get_text(char).isspace():
                # Whitespace closes the current word and is itself discarded.
                if current_word:
                    words.append(current_word)
                    current_word = []
                continue
            # A gap wider than the tolerance also starts a new word.
            if current_word and char["x0"] > (current_word[-1]["x1"] + tolerance):
                words.append(current_word)
                current_word = []
            current_word.append(char)

        if current_word:
            words.append(current_word)
        return [process_word_chars(word) for word in words]

    chars = to_list(page.chars)
    doctop_clusters = cluster_objects(chars, "doctop", y_tolerance)
    doctop_clusters = make_set_clusters(doctop_clusters)

    nested = [
        get_line_words(line_chars, tolerance=x_tolerance)
        for line_chars in doctop_clusters
    ]
    return list(itertools.chain.from_iterable(nested))
def draw_circles(self, list_of_circles, **kwargs):
    """Draw every circle in ``list_of_circles``.

    The input is passed through ``utils.to_list`` and each resulting item is
    handed to ``self.draw_circle`` together with ``kwargs``.

    Returns:
        ``self``, so calls can be chained.
    """
    circles = utils.to_list(list_of_circles)
    for circle in circles:
        self.draw_circle(circle, **kwargs)
    return self
def draw_rects(self, list_of_rects, **kwargs):
    """Draw every rectangle in ``list_of_rects``.

    The input is passed through ``utils.to_list`` and each resulting item is
    handed to ``self.draw_rect`` together with ``kwargs``.

    Returns:
        ``self``, so calls can be chained.
    """
    rects = utils.to_list(list_of_rects)
    for rect in rects:
        self.draw_rect(rect, **kwargs)
    return self
def draw_hlines(self, locations, **kwargs):
    """Draw one horizontal line per entry in ``locations``.

    The input is passed through ``utils.to_list`` and each resulting item is
    handed to ``self.draw_hline`` together with ``kwargs``.

    Returns:
        ``self``, so calls can be chained.
    """
    positions = utils.to_list(locations)
    for position in positions:
        self.draw_hline(position, **kwargs)
    return self