def test_get_number(self):
    """Exercise ``PDFLexer.get_number`` on integer and real literals.

    Every literal is written to a temporary file after a random-length
    run of random bytes and is followed by a single white-space byte.
    The lexer is asked to read at the offset just past that prefix, and
    the parsed value plus the reported start/end positions are checked.
    Input that does not start with a number must raise ``PDFLexerError``.
    """
    expected_by_literal = {
        '123': 123,
        '43445': 43445,
        '+17': 17,
        '-98': -98,
        '34.5': 34.5,
        '-3.62': -3.62,
        '+123.6': 123.6,
        '4.': 4.0,
        '-0.002': -0.002,
        '0.0': 0,
    }
    for literal in expected_by_literal:
        expected = expected_by_literal[literal]
        with closing(NamedTemporaryFile()) as tmp:
            # Random junk ahead of the literal; lexing starts right after it.
            prefix_len = random.randint(0, 255)
            prefix = [random.randint(0, 255) for _ in xrange(prefix_len)]
            tmp.write(bytearray(prefix))
            tmp.write(literal)
            tmp.write(random.choice((' ', '\r', '\n')))
            tmp.flush()
            with closing(mmap.mmap(tmp.fileno(), 0,
                                   prot=mmap.PROT_READ)) as mapped:
                lexer = PDFLexer(mapped)
                parsed = lexer.get_number(prefix_len)
                assert parsed.data == expected
                assert parsed.start_pos == prefix_len
                assert parsed.end_pos == prefix_len + len(literal)

    # Negative case: the data at offset 0 is not a numeric literal.
    with closing(NamedTemporaryFile()) as tmp:
        tmp.write('abc5\r')
        tmp.flush()
        with closing(mmap.mmap(tmp.fileno(), 0,
                               prot=mmap.PROT_READ)) as mapped:
            lexer = PDFLexer(mapped)
            with pytest.raises(PDFLexerError):
                lexer.get_number(0)
def test_get_name(self):
    """Check ``PDFLexer.get_name`` against a table of name tokens.

    Each case pairs the raw token written to the file with the value
    the lexer is expected to report; several cases contain ``#xx``
    sequences whose expected value holds the corresponding character.
    The token is followed by the output of ``self._rand_white_space()``
    and ``self._rand_string(8)``, and is lexed from offset 0.
    """
    cases = (
        ('/Name1', 'Name1'),
        ('/ASomewhatLongerName', 'ASomewhatLongerName'),
        ('/A;Name_With-Various***Characters?',
         'A;Name_With-Various***Characters?'),
        ('/1.2', '1.2'),
        ('/$$', '$$'),
        ('/@pattern', '@pattern'),
        ('/.notdef', '.notdef'),
        ('/lime#20Green', 'lime Green'),
        ('/paired#28#29parentheses', 'paired()parentheses'),
        ('/The_Key_of_F#23_Minor', 'The_Key_of_F#_Minor'),
        ('/A#42', 'AB'),
    )
    for raw_token, decoded in cases:
        with closing(NamedTemporaryFile()) as tmp:
            tmp.write(raw_token)
            trailer = self._rand_white_space() + self._rand_string(8)
            tmp.write(trailer)
            tmp.flush()
            with closing(mmap.mmap(tmp.fileno(), 0,
                                   prot=mmap.PROT_READ)) as mapped:
                lexer = PDFLexer(mapped)
                parsed = lexer.get_name(0)
                assert parsed.data == decoded
                assert parsed.start_pos == 0
                assert parsed.end_pos == len(raw_token)
def test_get_name(self):
    """Verify name-object lexing via ``PDFLexer.get_name``.

    For every ``(written, expected)`` pair the written token plus some
    trailing content from ``self._rand_white_space()`` and
    ``self._rand_string(8)`` is placed in a temporary file, memory-mapped
    read-only, and lexed from position 0.  The test asserts the reported
    data, that the object starts at 0, and that it ends at the length of
    the written token.
    """
    name_table = (
        ('/Name1', 'Name1'),
        ('/ASomewhatLongerName', 'ASomewhatLongerName'),
        ('/A;Name_With-Various***Characters?',
         'A;Name_With-Various***Characters?'),
        ('/1.2', '1.2'),
        ('/$$', '$$'),
        ('/@pattern', '@pattern'),
        ('/.notdef', '.notdef'),
        ('/lime#20Green', 'lime Green'),
        ('/paired#28#29parentheses', 'paired()parentheses'),
        ('/The_Key_of_F#23_Minor', 'The_Key_of_F#_Minor'),
        ('/A#42', 'AB'),
    )
    for written, wanted in name_table:
        token_len = len(written)
        with closing(NamedTemporaryFile()) as handle:
            handle.write(written)
            handle.write(self._rand_white_space() + self._rand_string(8))
            handle.flush()
            mapping = mmap.mmap(handle.fileno(), 0, prot=mmap.PROT_READ)
            with closing(mapping) as view:
                result = PDFLexer(view).get_name(0)
                assert result.data == wanted
                assert result.start_pos == 0
                assert result.end_pos == token_len
def test_get_stream(self):
    """Check ``PDFLexer.get_stream`` on an empty and a random stream body.

    Both cases write an empty dictionary, the ``stream`` keyword, a
    randomly chosen end-of-line marker, the body, a second randomly
    chosen end-of-line marker and ``endstream``.  The lexed object must
    report the body as its data, start at 0, end at the total number of
    bytes written, and carry an empty stream dictionary.
    """
    head = '<<>>\nstream'
    tail = 'endstream'

    # Empty stream: the two end-of-line markers are adjacent.
    with closing(NamedTemporaryFile()) as tmp:
        eol1 = random.choice(('\r\n', '\n'))
        eol2 = random.choice(('\r\n', '\r', '\n'))
        written = head + eol1 + eol2 + tail
        tmp.write(written)
        tmp.flush()
        with closing(mmap.mmap(tmp.fileno(), 0,
                               prot=mmap.PROT_READ)) as mapped:
            lexer = PDFLexer(mapped)
            parsed = lexer.get_stream(0)
            assert parsed.data == ''
            assert parsed.start_pos == 0
            assert parsed.end_pos == len(written)
            assert parsed.stream_dict.data == {}

    # Stream with a random body of random length.
    with closing(NamedTemporaryFile()) as tmp:
        eol1 = random.choice(('\r\n', '\n'))
        eol2 = random.choice(('\r\n', '\r', '\n'))
        body = self._rand_string(random.randint(0, 65536))
        written = head + eol1 + body + eol2 + tail
        tmp.write(written)
        tmp.flush()
        with closing(mmap.mmap(tmp.fileno(), 0,
                               prot=mmap.PROT_READ)) as mapped:
            lexer = PDFLexer(mapped)
            parsed = lexer.get_stream(0)
            assert parsed.data == body
            assert parsed.start_pos == 0
            assert parsed.end_pos == len(written)
            assert parsed.stream_dict.data == {}
def test_get_indirect_object(self):
    """Check ``PDFLexer.get_indirect_object`` on several object bodies.

    Each case is ``(text, object number, generation number, value)``.
    The text is written ahead of trailing content produced by
    ``self._rand_white_space()`` and ``self._rand_string(8)`` and lexed
    from offset 0.  The wrapped value is compared with the expected one
    through their JSON dumps with sorted keys.
    """
    cases = (
        ("""1 0 obj 123 endobj""", 1, 0, 123),
        ("""2 0 obj -0.1 endobj""", 2, 0, -0.1),
        ("""3 0 obj (billing) endobj""", 3, 0, 'billing'),
        ("""4 0 obj <62696c6c696e67> endobj""", 4, 0, 'billing'),
        ("""5 0 obj /billing endobj""", 5, 0, 'billing'),
        ("""6 0 obj <</a/billing /b 123 /c true /d [ 1 2 3 ] /e (billing) /f<62696c6c696e67> /g null /h<<>>>> endobj""",
         6, 0,
         {
             'a': 'billing',
             'b': 123,
             'c': True,
             'd': [1, 2, 3],
             'e': 'billing',
             'f': 'billing',
             'g': None,
             'h': {},
         }),
        ("""7 0 obj [/a /b 1 2 3 0 R] endobj""",
         7, 0, ['a', 'b', 1, 2, [3, 0]]),
    )
    for text, want_obj_num, want_gen_num, want_value in cases:
        with closing(NamedTemporaryFile()) as tmp:
            tmp.write(text)
            tmp.write(self._rand_white_space() + self._rand_string(8))
            tmp.flush()
            with closing(mmap.mmap(tmp.fileno(), 0,
                                   prot=mmap.PROT_READ)) as mapped:
                lexer = PDFLexer(mapped)
                parsed = lexer.get_indirect_object(0)
                assert parsed.object_num == want_obj_num
                assert parsed.generation_num == want_gen_num
                assert parsed.start_pos == 0
                assert parsed.end_pos == len(text)
                # Compare via JSON text so nested containers are
                # matched structurally.
                actual_json = json.dumps(parsed.data.data, sort_keys=True)
                wanted_json = json.dumps(want_value, sort_keys=True)
                assert actual_json == wanted_json
def _test_by_json_dump(self, pdf_json_str, real_json, isdict=True):
    """Lex a dictionary or array and compare it with ``real_json``.

    ``pdf_json_str`` is written to a temporary file ahead of trailing
    content produced by ``self._rand_white_space()`` and
    ``self._rand_string(8)``.  The file is memory-mapped read-only and
    lexed from offset 0 with ``get_dictionary`` when ``isdict`` is true,
    otherwise with ``get_array``.  The ``data`` of the result must equal
    ``real_json`` when both are dumped to JSON with sorted keys.
    """
    with closing(NamedTemporaryFile()) as tmp:
        tmp.write(pdf_json_str)
        trailer = self._rand_white_space() + self._rand_string(8)
        tmp.write(trailer)
        tmp.flush()
        with closing(mmap.mmap(tmp.fileno(), 0,
                               prot=mmap.PROT_READ)) as mapped:
            lexer = PDFLexer(mapped)
            if isdict:
                parsed = lexer.get_dictionary(0)
            else:
                parsed = lexer.get_array(0)
            actual_json = json.dumps(parsed.data, sort_keys=True)
            wanted_json = json.dumps(real_json, sort_keys=True)
            assert actual_json == wanted_json
def test_get_hexadecimal_string(self):
    """Check ``PDFLexer.get_hexadecimal_string`` in three situations.

    1. A random string encoded with ``self._to_hex_notation`` between
       ``<`` and ``>``; the expected end position assumes two hex digits
       per input character.
    2. The empty hexadecimal string ``<>``.
    3. A random encoded string with white-space characters randomly
       inserted before hex digits; the expected data is the original
       string and the end position covers the inserted characters.
    """
    # Normal case.
    payload = self._rand_string(random.randint(0, 255))
    encoded = ''.join(self._to_hex_notation(ch) for ch in payload)
    with closing(NamedTemporaryFile()) as tmp:
        tmp.write('<%s>' % encoded)
        tmp.write(self._rand_white_space() + self._rand_string(8))
        tmp.flush()
        with closing(mmap.mmap(tmp.fileno(), 0,
                               prot=mmap.PROT_READ)) as mapped:
            lexer = PDFLexer(mapped)
            parsed = lexer.get_hexadecimal_string(0)
            assert parsed.data == payload
            assert parsed.start_pos == 0
            assert parsed.end_pos == len(payload) * 2 + 2

    # Empty case.
    with closing(NamedTemporaryFile()) as tmp:
        tmp.write('<>')
        tmp.write(self._rand_white_space() + self._rand_string(8))
        tmp.flush()
        with closing(mmap.mmap(tmp.fileno(), 0,
                               prot=mmap.PROT_READ)) as mapped:
            lexer = PDFLexer(mapped)
            parsed = lexer.get_hexadecimal_string(0)
            assert parsed.data == ''
            assert parsed.start_pos == 0
            assert parsed.end_pos == 2

    # Interleaved white-space characters.
    payload = self._rand_string(random.randint(0, 255))
    encoded = ''.join(self._to_hex_notation(ch) for ch in payload)
    chance = 0.1
    pieces = []
    for digit in encoded:
        # Zero or more white-space characters may precede each digit.
        while random.random() <= chance:
            pieces.append(random.choice((' ', '\n', '\r', '\t')))
        pieces.append(digit)
    spaced = ''.join(pieces)
    with closing(NamedTemporaryFile()) as tmp:
        tmp.write('<%s>' % spaced)
        tmp.write(self._rand_white_space() + self._rand_string(8))
        tmp.flush()
        with closing(mmap.mmap(tmp.fileno(), 0,
                               prot=mmap.PROT_READ)) as mapped:
            lexer = PDFLexer(mapped)
            parsed = lexer.get_hexadecimal_string(0)
            assert parsed.data == payload
            assert parsed.start_pos == 0
            assert parsed.end_pos == len(spaced) + 2
def test_get_indirect_reference(self):
    """Check ``PDFLexer.get_indirect_reference`` on a random reference.

    A reference of the form ``<object number> <generation number> R`` is
    built from a random object number in ``[1, 2**31 - 1]`` and a random
    generation number in ``[0, 65535]``, written ahead of trailing
    content from ``self._rand_white_space()`` and
    ``self._rand_string(8)``, and lexed from offset 0.
    """
    with closing(NamedTemporaryFile()) as tmp:
        want_obj_num = random.randint(1, 2**31 - 1)
        want_gen_num = random.randint(0, 65535)
        reference = ' '.join((str(want_obj_num), str(want_gen_num), 'R'))
        tmp.write(reference)
        tmp.write(self._rand_white_space() + self._rand_string(8))
        tmp.flush()
        with closing(mmap.mmap(tmp.fileno(), 0,
                               prot=mmap.PROT_READ)) as mapped:
            lexer = PDFLexer(mapped)
            parsed = lexer.get_indirect_reference(0)
            assert parsed.object_num == want_obj_num
            assert parsed.generation_num == want_gen_num
            assert parsed.start_pos == 0
            assert parsed.end_pos == len(reference)
def test_get_indirect_object(self):
    """Verify indirect-object lexing for scalar and container bodies.

    Every row holds the object text, the expected object and generation
    numbers, and the expected wrapped value.  The text, followed by
    output of ``self._rand_white_space()`` and ``self._rand_string(8)``,
    is memory-mapped read-only and passed to
    ``PDFLexer.get_indirect_object(0)``.  Values are compared as JSON
    text dumped with sorted keys.
    """
    dictionary_value = {
        'a': 'billing',
        'b': 123,
        'c': True,
        'd': [1, 2, 3],
        'e': 'billing',
        'f': 'billing',
        'g': None,
        'h': {},
    }
    rows = (
        ("""1 0 obj 123 endobj""", 1, 0, 123),
        ("""2 0 obj -0.1 endobj""", 2, 0, -0.1),
        ("""3 0 obj (billing) endobj""", 3, 0, 'billing'),
        ("""4 0 obj <62696c6c696e67> endobj""", 4, 0, 'billing'),
        ("""5 0 obj /billing endobj""", 5, 0, 'billing'),
        ("""6 0 obj <</a/billing /b 123 /c true /d [ 1 2 3 ] /e (billing) /f<62696c6c696e67> /g null /h<<>>>> endobj""",
         6, 0, dictionary_value),
        ("""7 0 obj [/a /b 1 2 3 0 R] endobj""",
         7, 0, ['a', 'b', 1, 2, [3, 0]]),
    )
    for source_text, number, generation, value in rows:
        with closing(NamedTemporaryFile()) as handle:
            handle.write(source_text)
            handle.write(self._rand_white_space() + self._rand_string(8))
            handle.flush()
            mapping = mmap.mmap(handle.fileno(), 0, prot=mmap.PROT_READ)
            with closing(mapping) as view:
                result = PDFLexer(view).get_indirect_object(0)
                assert result.object_num == number
                assert result.generation_num == generation
                assert result.start_pos == 0
                assert result.end_pos == len(source_text)
                got = json.dumps(result.data.data, sort_keys=True)
                want = json.dumps(value, sort_keys=True)
                assert got == want
def test_get_stream(self):
    """Verify stream lexing for an empty body and a random body.

    The input is an empty dictionary, a line feed, the ``stream``
    keyword, a random end-of-line marker, the body, another random
    end-of-line marker and the ``endstream`` keyword.  The test asserts
    the returned body, a start position of 0, an end position equal to
    the summed lengths of all written parts, and an empty stream
    dictionary.
    """
    dict_part = '<<>>\n'
    open_keyword = 'stream'
    close_keyword = 'endstream'
    # Length of everything except the end-of-line markers and the body.
    fixed_len = len(dict_part) + len(open_keyword) + len(close_keyword)

    # Empty stream.
    with closing(NamedTemporaryFile()) as handle:
        eol1 = random.choice(('\r\n', '\n'))
        eol2 = random.choice(('\r\n', '\r', '\n'))
        handle.write(dict_part + open_keyword + eol1 + eol2 + close_keyword)
        handle.flush()
        mapping = mmap.mmap(handle.fileno(), 0, prot=mmap.PROT_READ)
        with closing(mapping) as view:
            result = PDFLexer(view).get_stream(0)
            assert result.data == ''
            assert result.start_pos == 0
            assert result.end_pos == fixed_len + len(eol1) + len(eol2)
            assert result.stream_dict.data == {}

    # Random body of random length.
    with closing(NamedTemporaryFile()) as handle:
        eol1 = random.choice(('\r\n', '\n'))
        eol2 = random.choice(('\r\n', '\r', '\n'))
        data = self._rand_string(random.randint(0, 65536))
        handle.write(
            dict_part + open_keyword + eol1 + data + eol2 + close_keyword)
        handle.flush()
        mapping = mmap.mmap(handle.fileno(), 0, prot=mmap.PROT_READ)
        with closing(mapping) as view:
            result = PDFLexer(view).get_stream(0)
            assert result.data == data
            assert result.start_pos == 0
            assert result.end_pos == (
                fixed_len + len(eol1) + len(data) + len(eol2))
            assert result.stream_dict.data == {}
def test_decode(self):
    """Lex an indirect object holding a ``/Type/XRef`` stream and decode it.

    The fixture is written to a temporary file, memory-mapped read-only
    and lexed with ``get_indirect_object(0)``.  The object and generation
    numbers and every entry of the stream dictionary are asserted.  The
    stream object's ``decode()`` is then called; its return value is
    stored but not asserted.
    """
    # NOTE(review): this literal is reproduced exactly as it appears in
    # the flattened source.  Its original line breaks look lost: the
    # leading ``\ `` was presumably a backslash-newline continuation and
    # the spaces around ``obj``/``stream``/``endstream`` were presumably
    # end-of-line markers -- confirm against the original fixture before
    # relying on this test.
    test_data = """\ 7342 0 obj <</DecodeParms<</Columns 5/Predictor 12>>/Filter/FlateDecode/ID[<5FC47DB1E9FC42E4990C818A453E009A><1C2BCADD5BB547BA849DAD158DE8DFD0>]/Index[375 1 1482 1 7112 1 7328 15]/Info 7112 0 R/Length 73/Prev 20448229/Root 7114 0 R/Size 7343/Type/XRef/W[1 4 0]>>stream h\xdebbd\xb4`\xcdgb```\xaecb\xfc\x7fB\xe4\x00\xd3\x7fF\x8b_\xf3A"\xb2\x06 \x92\xa9\x1bD28\x82Ik0i\x02&\xf5A$\xa3%\x82\xcd\xa0\x07&\xcd\x11lF\x1d0\xdb\x08\xcc\xbe\x0f\x10`\x00\xfd\x99\n\xb5 endstream endobj"""
    with closing(NamedTemporaryFile()) as f:
        f.write(test_data)
        f.flush()
        with closing(mmap.mmap(f.fileno(), 0,
                               prot=mmap.PROT_READ)) as stream:
            p = PDFLexer(stream)
            obj = p.get_indirect_object(0)
            assert obj.object_num == 7342
            assert obj.generation_num == 0
            # The object's payload is the stream; its dictionary is
            # checked entry by entry below.
            stream_obj = obj.data
            stream_dict = stream_obj.stream_dict.data
            assert stream_dict['DecodeParms'] == {
                'Columns': 5,
                'Predictor': 12
            }
            assert stream_dict['Filter'] == 'FlateDecode'
            assert stream_dict['Index'] == [
                375, 1, 1482, 1, 7112, 1, 7328, 15
            ]
            # Indirect references are compared against
            # (object number, generation number) tuples.
            assert stream_dict['Info'] == (7112, 0)
            assert stream_dict['Length'] == 73
            assert stream_dict['Prev'] == 20448229
            assert stream_dict['Root'] == (7114, 0)
            assert stream_dict['Size'] == 7343
            assert stream_dict['Type'] == 'XRef'
            assert stream_dict['W'] == [1, 4, 0]
            # Only checks that decode() runs; the result is unused.
            stream_data = stream_obj.decode()
def test_get_literal_string(self):
    """Check ``PDFLexer.get_literal_string`` against a table of strings.

    Each case pairs the text placed between ``(`` and ``)`` with the
    value the lexer is expected to report.  The table covers plain text,
    balanced parentheses, backslash escapes, octal escapes and runs of
    end-of-line characters.  The parenthesised text is followed by
    output of ``self._rand_white_space()`` and ``self._rand_string(8)``
    and lexed from offset 0.
    """
    cases = (
        ('This is a string', 'This is a string'),
        ('Strings may contain newlines\nand such.',
         'Strings may contain newlines\nand such.'),
        ('Strings may contain balanced parentheses ()',
         'Strings may contain balanced parentheses ()'),
        ('Strings may contain special characters (*!&}^% and so on).',
         'Strings may contain special characters (*!&}^% and so on).'),
        ('', ''),
        ('It has zero (0) length.', 'It has zero (0) length.'),
        ('These \\ntwo strings \\nare\\n the same.',
         'These \ntwo strings \nare\n the same.'),
        ('This string has an end-of-line at the end of it.\n',
         'This string has an end-of-line at the end of it.\n'),
        ('So does this one.\\n', 'So does this one.\n'),
        ('The string contains \\245two octal characters\\307.',
         'The string contains \245two octal characters\307.'),
        ('\n\n\n', '\n'),
        ('\r\r\r', '\n'),
        ('\r\n\r\n', '\n'),
        ('\\n\\r\\t\\b\\f\\(\\)\\\\', '\n\r\t\b\f()\\'),
    )
    for inner, decoded in cases:
        with closing(NamedTemporaryFile()) as tmp:
            wrapped = '(' + inner + ')'
            tmp.write(bytearray(wrapped))
            tmp.write(self._rand_white_space() + self._rand_string(8))
            tmp.flush()
            with closing(mmap.mmap(tmp.fileno(), 0,
                                   prot=mmap.PROT_READ)) as mapped:
                lexer = PDFLexer(mapped)
                parsed = lexer.get_literal_string(0)
                assert parsed.data == decoded
                assert parsed.start_pos == 0
                assert parsed.end_pos == len(wrapped)
def test_get_literal_string(self):
    """Verify literal-string lexing via ``PDFLexer.get_literal_string``.

    Every ``(written, expected)`` pair is wrapped in parentheses,
    written as a ``bytearray`` ahead of trailing content from
    ``self._rand_white_space()`` and ``self._rand_string(8)``,
    memory-mapped read-only and lexed from position 0.  The test asserts
    the reported data, a start position of 0 and an end position equal
    to the length of the parenthesised text.
    """
    string_table = (
        ('This is a string', 'This is a string'),
        ('Strings may contain newlines\nand such.',
         'Strings may contain newlines\nand such.'),
        ('Strings may contain balanced parentheses ()',
         'Strings may contain balanced parentheses ()'),
        ('Strings may contain special characters (*!&}^% and so on).',
         'Strings may contain special characters (*!&}^% and so on).'),
        ('', ''),
        ('It has zero (0) length.', 'It has zero (0) length.'),
        ('These \\ntwo strings \\nare\\n the same.',
         'These \ntwo strings \nare\n the same.'),
        ('This string has an end-of-line at the end of it.\n',
         'This string has an end-of-line at the end of it.\n'),
        ('So does this one.\\n', 'So does this one.\n'),
        ('The string contains \\245two octal characters\\307.',
         'The string contains \245two octal characters\307.'),
        ('\n\n\n', '\n'),
        ('\r\r\r', '\n'),
        ('\r\n\r\n', '\n'),
        ('\\n\\r\\t\\b\\f\\(\\)\\\\', '\n\r\t\b\f()\\'),
    )
    for written, wanted in string_table:
        with closing(NamedTemporaryFile()) as handle:
            enclosed = '(%s)' % written
            handle.write(bytearray(enclosed))
            handle.write(self._rand_white_space() + self._rand_string(8))
            handle.flush()
            mapping = mmap.mmap(handle.fileno(), 0, prot=mmap.PROT_READ)
            with closing(mapping) as view:
                result = PDFLexer(view).get_literal_string(0)
                assert result.data == wanted
                assert result.start_pos == 0
                assert result.end_pos == len(enclosed)