def p_dictionary_entry_list(p):
    ''' dictionary_entry_list : dictionary_entry_list NAME object
                              | '''
    # The docstring above is the grammar production (PLY-yacc style), not
    # free-form documentation -- do not reword it.
    #
    # Accumulates dictionary entries into a plain list stored in p[0].

    # Empty alternative: only the result slot exists, so start a fresh list.
    if len(p) == 1:
        p[0] = []
        return

    # Recursive alternative: p[1] is the list built so far, p[2] the NAME
    # value and p[3] the already-built object node.
    name_span = p.lexspan(2)
    entry_span = (name_span[0], p.lexspan(3)[1])

    name_leaf = create_leaf('name', p[2], span=name_span)
    entry_node = create_tree('entry', [name_leaf, p[3]], span=entry_span)

    # Concatenate instead of appending so the list held in p[1] is not mutated.
    p[0] = p[1] + [entry_node]
def p_dictionary(p):
    ''' dictionary : DOUBLE_LESS_THAN_SIGN dictionary_entry_list DOUBLE_GREATER_THAN_SIGN '''
    # The docstring above is the grammar production -- do not reword it.
    #
    # Wraps the collected entries (p[2]) in a 'dictionary' tree. The span
    # runs from the start offset of the opening token to the end offset
    # reported for the closing token.
    span_start = p.lexspan(1)[0]
    span_end = p.lexspan(3)[1]
    p[0] = create_tree('dictionary', p[2], span=(span_start, span_end))
def p_array(p):
    ''' array : LEFT_SQUARE_BRACKET object_list RIGHT_SQUARE_BRACKET '''
    # The docstring above is the grammar production -- do not reword it.
    #
    # Wraps the bracketed object list (p[2]) in an 'array' tree that carries
    # the span of the whole production (symbol 0).
    elements = p[2]
    whole_span = p.lexspan(0)
    p[0] = create_tree('array', elements, span=whole_span)
def p_pdf(p):
    ''' pdf : HEADER pdf_update_list'''
    # The docstring above is the grammar production -- do not reword it.
    #
    # Root rule: builds the 'pdf' tree with the header leaf as first child,
    # followed by every node of the update list (p[2]).
    header_leaf = create_leaf('header', p[1], span=p.lexspan(1))
    children = [header_leaf] + p[2]
    p[0] = create_tree('pdf', children, span=p.lexspan(0), version="OPAF!")
def p_pdf_brute_end(p):
    ''' pdf_brute_end : XREF TRAILER dictionary STARTXREF EOF'''
    # The docstring above is the grammar production -- do not reword it.
    #
    # Produces a two-element list: an 'xref' tree wrapping the trailer
    # dictionary, and a 'startxref' leaf.
    startxref_begin = p.lexspan(4)[0]

    # The xref span is anchored at offset 0 (not at the XREF token) and stops
    # one position before STARTXREF; the raw XREF value travels as the
    # 'xref' keyword argument.
    xref_node = create_tree(
        'xref',
        [p[3]],
        span=(0, startxref_begin - 1),
        xref=p[1],
    )

    # The startxref leaf covers from STARTXREF to the end of the production.
    startxref_leaf = create_leaf(
        'startxref',
        p[4],
        span=(startxref_begin, p.lexspan(0)[1]),
    )

    p[0] = [xref_node, startxref_leaf]
def p_pdf_update(p):
    ''' pdf_update : body xref pdf_end '''
    # The docstring above is the grammar production -- do not reword it.
    #
    # Groups the body nodes (p[1], a list), the xref node (p[2]) and the
    # pdf_end node (p[3]) under a single 'pdf_update' tree.
    children = p[1] + [p[2], p[3]]

    # Start from an inverted sentinel span and grow it with each child's
    # span (presumably span_expand widens the node's span to include the
    # given one -- confirm against its definition).
    update_node = create_tree('pdf_update', children, span=(0xffffffff, -1))
    p[0] = update_node

    # Plain loop: span_expand is called only for its side effect, so no
    # throwaway result list is built.
    for child in children:
        update_node.span_expand(child.span)
def p_xref_common(p):
    ''' xref : XREF TRAILER dictionary '''
    # The docstring above is the grammar production -- do not reword it.
    #
    # Builds an 'xref' tree holding the trailer dictionary (p[3]) followed by
    # a 'data' leaf with the stringified XREF value. Both nodes carry the
    # span of the whole production.
    rule_span = p.lexspan(0)
    xref_text = str(p[1])
    data_leaf = create_leaf('data', xref_text, span=rule_span)
    p[0] = create_tree('xref', [p[3], data_leaf], span=rule_span)
def p_indirect_object_stream(p):
    ''' indirect_object_stream : OBJ dictionary STREAM_DATA ENDOBJ '''
    # The docstring above is the grammar production -- do not reword it.
    #
    # Builds indirect_object -> stream -> [dictionary, data].
    whole_span = p.lexspan(0)

    # NOTE(review): the data leaf's span starts at the dictionary (symbol 2)
    # rather than at STREAM_DATA (symbol 3) -- confirm this is intended.
    data_span = (p.lexspan(2)[0], p.lexspan(4)[1])
    data_leaf = create_leaf('data', p[3], span=data_span)

    stream_node = create_tree('stream', [p[2], data_leaf], span=whole_span)

    # p[1] must be a pair of integers; it is formatted as the id string.
    object_id = "%d %d" % p[1]
    p[0] = create_tree(
        'indirect_object',
        [stream_node],
        span=whole_span,
        id=object_id,
    )
def p_indirect_object(p):
    ''' indirect_object : OBJ object ENDOBJ '''
    # The docstring above is the grammar production -- do not reword it.
    #
    # Wraps the parsed object (p[2]) in an 'indirect_object' tree spanning
    # the whole production. p[1] must be a pair of integers; it is formatted
    # as the node's id string.
    object_id = "%d %d" % p[1]
    p[0] = create_tree(
        'indirect_object',
        [p[2]],
        span=p.lexspan(0),
        id=object_id,
    )
if len(xml_pdf_ends) == 0: logger.info("%%%%EOF tag was not found! Creating a dummy.") dummy_startxref = create_leaf('startxref', -1, span=(len(pdf),len(pdf))) print dummy_startxref.value allobjects.append(dummy_startxref) if len(xml_headers) == 0: logger.info("%%%%PDF-N-M tag was not found! Creating a dummy.") allobjects.append(create_leaf('header', "NOVERSION", span=(0,0))) #Sort it as they appear in the file allobjects = sorted(allobjects,lambda x,y: cmp(x.span[0], y.span[0])) #recreate XML structure 'best' we can... assert allobjects[0].tag == 'header' root_element = create_tree('pdf', [allobjects.pop(0)], span=(0,len(pdf)), version="OPAF!(raw)") update = create_tree('pdf_update', [],span=(0xfffffff,-1)) while len(allobjects)>0: thing = allobjects.pop(0) update.append(thing) update.span_expand(thing.span) if thing.tag == 'startxref': root_element.append(update) update = create_tree('pdf_update',[],span=(0xfffffff,-1)) if len(update)>0: logger.info("Missing ending %%EOF") root_element.append(update) return root_element