Exemplo n.º 1
0
def move_to_avg(objs, orientation):
    """
    Move `objs` vertically/horizontally to their average x/y position.
    """
    if orientation not in ("h", "v"):
        raise ValueError("Orientation must be 'v' or 'h'")
    if len(objs) == 0: return []
    move_axis = "v" if orientation == "h" else "h"
    attr = "top" if orientation == "h" else "x0"
    values = list(map(itemgetter(attr), objs))
    q = pow(10, utils.decimalize(values[0]).as_tuple().exponent)
    avg = utils.decimalize(float(sum(values) / len(values)), q)
    new_objs = [ utils.move_object(obj, move_axis, avg - obj[attr])
        for obj in objs ]
    return new_objs
Exemplo n.º 2
0
 def __init__(self, pdf, page_obj, initial_doctop=0):
     self.pdf = pdf
     self.page_obj = page_obj
     self.mediabox = page_obj.attrs["MediaBox"]
     self.decimalize = lambda x: utils.decimalize(x, self.pdf.precision)
     self.width = self.decimalize(self.mediabox[2] - self.mediabox[0])
     self.height = self.decimalize(self.mediabox[3] - self.mediabox[1])
     self.pageid = page_obj.pageid
     self.initial_doctop = self.decimalize(initial_doctop)
Exemplo n.º 3
0
 def __init__(self, pdf, page_obj, initial_doctop=0):
     self.pdf = pdf
     self.page_obj = page_obj
     self.mediabox = page_obj.attrs["MediaBox"]
     self.decimalize = lambda x: utils.decimalize(x, self.pdf.precision)
     self.width = self.decimalize(self.mediabox[2] - self.mediabox[0])
     self.height = self.decimalize(self.mediabox[3] - self.mediabox[1])
     self.pageid = page_obj.pageid
     self.initial_doctop = self.decimalize(initial_doctop)
Exemplo n.º 4
0
 def test_decimalize(self):
     d = Decimal("1.011")
     assert utils.decimalize(1.011) == d
     assert [utils.decimalize(1.011)] == [d]
     assert utils.decimalize(d) == d
     assert id(utils.decimalize(d)) == id(d)
     assert utils.decimalize(1) == Decimal("1")
     with pytest.raises(ValueError):
         utils.decimalize("1")
def extract_words(page,
                  x_tolerance=DEFAULT_X_TOLERANCE,
                  y_tolerance=DEFAULT_Y_TOLERANCE,
                  keep_blank_chars=False):
    x_tolerance = decimalize(x_tolerance)
    y_tolerance = decimalize(y_tolerance)

    def process_word_chars(chars):
        x0, top, x1, bottom = objects_to_bbox(chars)
        return {
            "x0": x0,
            "x1": x1,
            "top": top,
            "bottom": bottom,
            "text": "".join(map(itemgetter("text"), chars)),
            "chars": chars
        }

    def make_set_clusters(doctop_cluster):
        new_clusters = []
        for c in doctop_cluster:
            new_cluster = [simplejson.dumps(c[i]) for i in range(len(c))]
            new_cluster = list(set(new_cluster))
            cluster_to_dict = []
            for i in range(len(new_cluster)):
                d = simplejson.loads(new_cluster[i])
                for k in d.keys():
                    if type(d[k]) == float:
                        d[k] = Decimal(str(d[k]))
                cluster_to_dict.append(d)
            new_clusters.append(cluster_to_dict)
        return new_clusters

    def check_two_chars(char1, char2):
        if abs(char1['x0'] - char2['x0']) < 1:
            return False
        return True

    def get_line_words(chars, tolerance=DEFAULT_X_TOLERANCE):
        get_text = itemgetter("text")
        chars_sorted = sorted(chars, key=itemgetter("x0"))
        new_chars_sorted = []
        for i in range(len(chars_sorted)):
            if i == 0 or check_two_chars(chars_sorted[i], chars_sorted[i - 1]):
                new_chars_sorted.append(chars_sorted[i])
        chars_sorted = new_chars_sorted
        words = []
        current_word = []

        for char in chars_sorted:
            if not keep_blank_chars and get_text(char).isspace():
                if len(current_word) > 0:
                    words.append(current_word)
                    current_word = []
                else:
                    pass
            elif len(current_word) == 0:
                current_word.append(char)
            else:
                last_char = current_word[-1]
                if char["x0"] > (last_char["x1"] + tolerance):
                    words.append(current_word)
                    current_word = []
                current_word.append(char)

        if len(current_word) > 0:
            words.append(current_word)
        processed_words = list(map(process_word_chars, words))
        return processed_words

    chars = to_list(page.chars)
    doctop_clusters = cluster_objects(chars, "doctop", y_tolerance)
    doctop_clusters = make_set_clusters(doctop_clusters)
    nested = [
        get_line_words(line_chars, tolerance=x_tolerance)
        for line_chars in doctop_clusters
    ]
    # text = ''.join([nested[2][i]['x0'] for i in range(len(nested[2]))])
    # x0 = [nested[2][i]['x0'] for i in range(len(nested[2]))]
    # print(x0)
    # print(nested[2])
    # print(2 / 0)

    words = list(itertools.chain(*nested))
    return words
Exemplo n.º 6
0
 def decimalize(self, x):
     return utils.decimalize(x, self.pdf.precision)