示例#1
0
def extractTextList(self):
    """Return the text strings shown by this page's content stream as a list.

    The ``/Contents`` entry is resolved and, when it is not already a
    ``ContentStream``, wrapped in one.  Each text-showing operation
    (``Tj``, ``'``, ``"``, ``TJ``) contributes its ``TextStringObject``
    operands in stream order; empty strings are dropped.  Only strings coming
    from ``Tj`` are whitespace-stripped (and skipped when nothing remains);
    the other operators contribute their strings unchanged.

    :return: list of text strings in the order they occur in the stream.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    fragments = []
    for operands, operator in stream.operations:
        if operator == b_("Tj"):
            candidate = operands[0]
            if isinstance(candidate, TextStringObject):
                stripped = candidate.strip()
                if len(stripped) > 0:
                    fragments.append(stripped)
        elif operator == b_("T*"):
            # Line moves carry no text of their own.
            continue
        elif operator == b_("'"):
            candidate = operands[0]
            if isinstance(candidate, TextStringObject) and len(candidate) > 0:
                fragments.append(candidate)
        elif operator == b_('"'):
            # The string is the third operand of the '"' operator.
            candidate = operands[2]
            if isinstance(candidate, TextStringObject) and len(candidate) > 0:
                fragments.append(candidate)
        elif operator == b_("TJ"):
            fragments.extend(
                piece
                for piece in operands[0]
                if isinstance(piece, TextStringObject) and len(piece) > 0
            )
    return fragments
示例#2
0
def is_continuation(content, item):
    """Guess whether the text shown at ``item`` continues the preceding text.

    The operation directly before ``item`` must be a ``Tm``.  Its operands are
    compared with those of the nearest earlier ``Tm`` found 2 to 14 operations
    before ``item``.  The text counts as a continuation when operand 4 of both
    ``Tm`` operations (presumably the horizontal translation of the text
    matrix) has the same digits after the decimal point once formatted to
    five decimal places.

    :param content: object whose ``operations`` sequence holds
        ``(operands, operator)`` pairs.
    :param item: index in ``content.operations`` of the text-showing
        operation being examined.
    :return: ``True`` when the heuristic matches, ``False`` otherwise
        (including when no earlier ``Tm`` exists within the search window).
    """
    operations = content.operations

    # Without a predecessor there is nothing to continue; a negative index
    # would otherwise silently wrap around to the end of the stream.
    if item < 1 or operations[item - 1][1] != b_("Tm"):
        return False
    current = operations[item - 1][0]

    # Walk backwards looking for the previous "Tm".  The original loop used
    # negative offsets together with ``item - offset`` and therefore scanned
    # forwards, contradicting its own "search previous" intent.
    previous = None
    for offset in range(2, 15):
        index = item - offset
        if index < 0:
            # Ran off the start of the stream.
            break
        if operations[index][1] == b_("Tm"):
            previous = operations[index][0]
            break
    if previous is None:
        return False

    def fraction_digits(value):
        # Digits after the decimal point, at five-place precision.
        return '{0:.5f}'.format(value).split(".")[1]

    return fraction_digits(current[4]) == fraction_digits(previous[4])
示例#3
0
 def original_extractText(self):
     """Concatenate the text drawn by this page's content stream.

     Text-showing operations are visited in stream order.  ``T*`` and ``'``
     emit a newline, ``"`` emits a newline before its string, and ``TJ``
     emits a newline after its strings.

     :return: a unicode string with the extracted text.
     """
     stream = self["/Contents"].getObject()
     if not isinstance(stream, ContentStream):
         stream = ContentStream(stream, self.pdf)

     # Only TextStringObjects are kept.  ByteStringObjects are strings whose
     # byte->string encoding was unknown, so including them would yield
     # gibberish.
     pieces = []
     for operands, operator in stream.operations:
         if operator == b_("Tj"):
             shown = operands[0]
             if isinstance(shown, TextStringObject):
                 pieces.append(shown)
         elif operator == b_("T*"):
             pieces.append("\n")
         elif operator == b_("'"):
             # The line break is emitted even when the string is skipped.
             pieces.append("\n")
             shown = operands[0]
             if isinstance(shown, TextStringObject):
                 pieces.append(shown)
         elif operator == b_('"'):
             shown = operands[2]
             if isinstance(shown, TextStringObject):
                 pieces.extend(("\n", shown))
         elif operator == b_("TJ"):
             pieces.extend(
                 element
                 for element in operands[0]
                 if isinstance(element, TextStringObject)
             )
             pieces.append("\n")
     return u_("").join(pieces)
示例#4
0
def is_header(content, item):
    """Guess whether the text shown at ``item`` should be treated as a header.

    The decision is based only on the operators that precede ``item``:

    * the previous operator is ``Td``; or
    * the previous two operators are ``Tf`` then ``Tm`` and the one before
      those is ``BT`` or ``scn``.

    Positions before the start of the stream count as "no operator", so the
    check never wraps around to the end of ``content.operations``.

    :param content: object whose ``operations`` sequence holds
        ``(operands, operator)`` pairs.
    :param item: index in ``content.operations`` of the text-showing
        operation being examined.
    :return: ``True`` when one of the patterns above matches, else ``False``.
    """
    operations = content.operations

    def operator_before(distance):
        # A negative index would silently read from the end of the list.
        index = item - distance
        return operations[index][1] if index >= 0 else None

    previous = operator_before(1)
    if previous == b_("Td"):
        return True
    if previous != b_("Tm") or operator_before(2) != b_("Tf"):
        return False
    return operator_before(3) in (b_("BT"), b_("scn"))
示例#5
0
def extractText_with_separator(self, remove_headers=False):
    """Extract page text, separating runs that are not continuations.

    For ``Tj`` and ``TJ`` operations the text is skipped entirely when
    ``is_header`` matches, and is preceded by a newline unless
    ``is_continuation`` matches.  ``T*`` and ``'`` emit a newline, and ``"``
    emits a newline before its string.

    :param remove_headers: accepted but never read by this implementation;
        header-like text is skipped regardless of its value.
        NOTE(review): confirm whether this flag was meant to gate the
        ``is_header`` check.
    :return: a unicode string with the extracted text.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are kept.  ByteStringObjects are strings whose
    # byte->string encoding was unknown, so including them would yield
    # gibberish.
    pieces = []
    for position, (operands, operator) in enumerate(stream.operations):
        if operator == b_("Tj") or operator == b_("TJ"):
            if is_header(stream, position):
                continue
            if not is_continuation(stream, position):
                pieces.append("\n")

            # "Tj" carries one string; "TJ" carries an array of elements.
            if operator == b_("Tj"):
                candidates = [operands[0]]
            else:
                candidates = operands[0]
            pieces.extend(
                candidate
                for candidate in candidates
                if isinstance(candidate, TextStringObject)
            )
        elif operator == b_("T*"):
            pieces.append("\n")
        elif operator == b_("'"):
            pieces.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == b_('"'):
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                pieces.extend(("\n", shown))
    return u_("").join(pieces)
示例#6
0
def extractText_patch(self):
    """
    Extract the text drawn by this page, in content-stream order.

    Text-showing operations are visited as they occur in the stream, so the
    quality of the result depends on how the generating tool ordered them.
    Inside a ``TJ`` array, a ``NumberObject`` below -125 is rendered as a
    single space; every ``TJ`` is followed by a newline.  Do not rely on the
    ordering of the returned text.

    :return: a unicode string object.
    """
    stream = self["/Contents"].getObject()
    if not isinstance(stream, ContentStream):
        stream = ContentStream(stream, self.pdf)

    # Only TextStringObjects are kept.  ByteStringObjects are strings whose
    # byte->string encoding was unknown, so including them would yield
    # gibberish.
    pieces = []
    for operands, operator in stream.operations:
        if operator == b_("Tj"):
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == b_("T*"):
            pieces.append("\n")
        elif operator == b_("'"):
            pieces.append("\n")
            shown = operands[0]
            if isinstance(shown, TextStringObject):
                pieces.append(shown)
        elif operator == b_('"'):
            shown = operands[2]
            if isinstance(shown, TextStringObject):
                pieces.extend(("\n", shown))
        elif operator == b_("TJ"):
            for element in operands[0]:
                if isinstance(element, TextStringObject):
                    pieces.append(element)
                elif isinstance(element, NumberObject) and element < -125:
                    # A large negative adjustment is treated as a word gap.
                    pieces.append(" ")
            pieces.append("\n")
    return u_("").join(pieces)
示例#7
0
    def alt_extractText(self):
        """
        Extract page text with line breaks, marking every other operation.

        Text-showing operations contribute their strings (with newlines for
        ``T*``, ``'``, ``"`` and after ``TJ``); every operation that is not
        one of those appends a ``~`` marker to the output.

        :return: a unicode string object.
        """
        text_suffix = ""
        other_marker = "~"

        stream = self["/Contents"].getObject()
        if not isinstance(stream, ContentStream):
            stream = ContentStream(stream, self.pdf)

        pieces = []
        for operands, operator in stream.operations:
            if operator == b_("Tj"):
                shown = operands[0]
                if isinstance(shown, TextStringObject):
                    pieces.append(shown + text_suffix)
            elif operator == b_("T*"):
                pieces.append("\n")
            elif operator == b_("'"):
                pieces.append("\n")
                shown = operands[0]
                if isinstance(shown, TextStringObject):
                    pieces.append(shown)
            elif operator == b_('"'):
                shown = operands[2]
                if isinstance(shown, TextStringObject):
                    pieces.extend(("\n", shown))
            elif operator == b_("TJ"):
                pieces.extend(
                    element
                    for element in operands[0]
                    if isinstance(element, TextStringObject)
                )
                pieces.append("\n")
            else:
                # Any non-text-showing operation leaves a visible marker.
                pieces.append(other_marker)
        return u_("").join(pieces)
 def customExtractText(self):
     """Extract page text as a single whitespace-normalised line.

     Strings from text-showing operations are concatenated in stream order.
     Inside a ``TJ`` array, a ``FloatObject`` or ``NumberObject`` below -100
     becomes a space, and a ``TD`` or ``Tm`` operation adds a space unless the
     text so far is empty or already ends in a space or newline.  Finally
     ``" - "`` is replaced by ``"-"`` and each run of whitespace (newlines
     included) is collapsed to one space.

     :return: a unicode string with the normalised text.
     """
     stream = self["/Contents"].getObject()
     if not isinstance(stream, ContentStream):
         stream = ContentStream(stream, self.pdf)

     # Only TextStringObjects are kept.  ByteStringObjects are strings whose
     # byte->string encoding was unknown, so including them would yield
     # gibberish.
     text = u_("")
     for operands, operator in stream.operations:
         if operator == b_("Tj"):
             shown = operands[0]
             if isinstance(shown, TextStringObject):
                 text += shown
         elif operator == b_("T*"):
             text += "\n"
         elif operator == b_("'"):
             text += "\n"
             shown = operands[0]
             if isinstance(shown, TextStringObject):
                 text += shown
         elif operator == b_('"'):
             shown = operands[2]
             if isinstance(shown, TextStringObject):
                 text += "\n" + shown
         elif operator == b_("TJ"):
             for element in operands[0]:
                 if isinstance(element, TextStringObject):
                     text += element
                 elif (
                     isinstance(element, FloatObject)
                     or isinstance(element, NumberObject)
                 ) and element < -100:
                     # A large negative adjustment is treated as a word gap.
                     text += " "
         elif operator in (b_("TD"), b_("Tm")):
             # Repositioning separates words unless a separator is already there.
             if text and not text.endswith((" ", "\n")):
                 text += " "

     dehyphenated = text.replace(" - ", "-")
     return re.sub("\\s+", " ", dehyphenated)