Exemplo n.º 1
def p_dictionary_entry_list(p):
    ''' dictionary_entry_list : dictionary_entry_list NAME object
                              |  '''
    # NOTE: the docstring above is the grammar production for this rule and
    # is intentionally left untouched.
    #
    # Builds the list of 'entry' nodes of a dictionary, one per NAME/object
    # pair, and stores it in p[0].
    if len(p) == 1:
        # Empty alternative: there are no right-hand-side symbols, so p[2]
        # and p[3] do not exist. The list simply starts out empty.
        p[0] = []
        return

    # Recursive alternative: p[1] is the list accumulated so far, p[2] is the
    # NAME token used as key and p[3] is the already-built value node.
    key_node = create_leaf('name', p[2], span=p.lexspan(2))
    # The entry covers everything from the start of the key to the end of
    # the value.
    dictionary_span = (p.lexspan(2)[0], p.lexspan(3)[1])
    dictionary_node = create_tree('entry', [key_node, p[3]], span=dictionary_span)
    p[0] = p[1] + [dictionary_node]
Exemplo n.º 2
def p_object_ref(p):
    ''' object : R '''
    # The docstring is the grammar production; keep it verbatim.
    # Wrap the matched R token in an 'R' leaf tagged with the token's span.
    ref_value = p[1]
    ref_span = p.lexspan(1)
    p[0] = create_leaf('R', ref_value, span=ref_span)
Exemplo n.º 3
def p_object_null(p):
    ''' object : NULL '''
    # The docstring is the grammar production; keep it verbatim.
    # A NULL token becomes a 'null' leaf whose value is None; only the
    # token's span is taken from the parser.
    null_span = p.lexspan(1)
    p[0] = create_leaf('null', None, span=null_span)
Exemplo n.º 4
def p_object_false(p):
    ''' object : FALSE '''
    # The docstring is the grammar production; keep it verbatim.
    # A FALSE token becomes a 'bool' leaf holding the Python value False.
    false_span = p.lexspan(1)
    p[0] = create_leaf('bool', False, span=false_span)
Exemplo n.º 5
def p_object_true(p):
    ''' object : TRUE '''
    # The docstring is the grammar production; keep it verbatim.
    # A TRUE token becomes a 'bool' leaf holding the Python value True.
    true_span = p.lexspan(1)
    p[0] = create_leaf('bool', True, span=true_span)
Exemplo n.º 6
def p_object_number(p):
    ''' object : NUMBER '''
    # The docstring is the grammar production; keep it verbatim.
    #
    # Converts the NUMBER token to an int when it has no fractional part and
    # to a float otherwise, then stores it in a 'number' leaf.
    as_float = float(p[1])
    as_int = int(as_float)
    # An explicit if/else is used instead of the "cond and a or b" idiom:
    # that idiom falls through to the float whenever the integer value is 0
    # (falsy), which turned "0" into 0.0.
    if as_int == as_float:
        number = as_int
    else:
        number = as_float
    p[0] = create_leaf('number', number, span=p.lexspan(1))
Exemplo n.º 7
def p_object_hexstring(p):
    ''' object : HEXSTRING '''
    # The docstring is the grammar production; keep it verbatim.
    # A HEXSTRING token is stored as a 'string' leaf; the token value is
    # passed through unchanged.
    hex_value = p[1]
    hex_span = p.lexspan(1)
    p[0] = create_leaf('string', hex_value, span=hex_span)
Exemplo n.º 8
def p_object_name(p):
    ''' object : NAME '''
    # The docstring is the grammar production; keep it verbatim.
    # A NAME token used as an object becomes a 'name' leaf with its span.
    name_value = p[1]
    name_span = p.lexspan(1)
    p[0] = create_leaf('name', name_value, span=name_span)
Exemplo n.º 9
def bruteParser(pdf):
        This will try to parse any object in the file based on obj/endobj and a few other keywords.
        This is an ad-hoc parser which will try to read the file in any possible way.
        It may produce phantom overlapped XML objects. You may check for these issues afterwards.
        Also, it is slow.
        #Search for the PDF header
        headers = list(re.finditer(r'%PDF-1\.[0-7]',pdf))
        xml_headers = []
        for header in headers:
            start = header.start()
            end = header.end()
            version = header.group(0)[-3:]
            xml_headers.append(create_leaf('header', version,span=(start,end)))
        logger.info('Found %d headers'%len(xml_headers))
        #Search the startxref. And xrefs.
        startxrefs = list(re.finditer(r'startxref[\x20\r\n\t\x0c\x00]+[0-9]+[\x20\r\n\t\x0c\x00]+%%EOF',pdf))
        xrefs = list(re.finditer(r'xref',pdf))    
        xml_xrefs = []
        xml_pdf_ends = []
        for xref in xrefs:
            start = xref.start()
            for end in [x.end() for x in startxrefs if x.start()>xref.end()]:
                logger.info("Searching for a xref, trailer and %%%%EOF at [%s:%s]"%(start,end))
                potential_xref = pdf[start:end]
                    xml_xref, xml_pdf_end = parse('pdf_brute_end', potential_xref)
                    #fix lexspan and append
                    #fix lexspan and append
                except Exception, e:
                    print e
                    logger.info("Couldn't parse a xref, trailer and %%%%EOF at [%s:%s] (%s)"%(start,end,e))

        #use the force
        #This algorithm will try to match any obj with any endobj and will keep it
        #if a sane object is found inside. Overlapping is possible here; you may analyze it
        #and cut it off from the xml later, using the lexspan markers.
        delimiter = r"[()<>\[\]/%\x20\r\n\t\x0c\x00]"
        objs = list(re.finditer(r'\d+\x20\d+\x20obj'+delimiter, pdf))
        endobjs = list(re.finditer(delimiter+r'endobj', pdf))
        streams = list(re.finditer(delimiter+'stream'+delimiter, pdf))
        endstreams = list(re.finditer('endstream'+delimiter+'endobj', pdf))
        xml_iobjects = []
        logger.info("Found %d Object starting points"%len(objs))
        logger.info("Found %d Object ending points"%len(endobjs))
        for m in objs:
            start = m.start()
            for end in [x.end() for x in endobjs if x.start()>m.end()]:
                    logger.debug("Parsing potential object at %s~%s"%(start,end))
                    potential_obj = pdf[start:end]
                    # If for some reason there are "endstream" keywords inside the
                    # stream, let's momentarily escape them so it can be parsed
                    # with the strict parser
                    escape_endstreams = [e.start()+start for e in endstreams if e.start()>start and e.end()<end ]
                    for e in escape_endstreams[:-1]:
                        potential_obj = potential_obj[:e] +"X"*9 + potential_obj[e+9:]
                    #Try to strictly parse an indirect object
                    xml_iobject = parse('indirect',potential_obj)

                    #fix lexspan

                    #FIX: fix escape
                    #WRONG offset!!!!!!!!!!!!
                    pl = payload(xml_iobject)
                    #Un-escape the "endstream" keywords
                    for e in escape_endstreams[:-1]:
                        pl = pl[:e] +"endstream" + pl[e+9:]
                    setpayload(xml_iobject, pl)

                    #append to the list

                    #Just parse the first object we can on this try.
                    #Comment out the following line to search for phantoms
                    #(i.e. objects inside objects or overlapped objects)
                except Exception,e:
                    logger.debug("Received exception %s when parsing potential object at [%s:%s]."%(e, start,end))
Exemplo n.º 10
def p_pdf(p):
    ''' pdf : HEADER pdf_update_list'''
    # The docstring is the grammar production; keep it verbatim.
    # The root 'pdf' tree has the header leaf as first child, followed by
    # every element of the update list in p[2].
    header_leaf = create_leaf('header', p[1], span=p.lexspan(1))
    children = [header_leaf] + p[2]
    p[0] = create_tree('pdf', children, span=p.lexspan(0), version="OPAF!")
Exemplo n.º 11
def p_pdf_brute_end(p):
    ''' pdf_brute_end : XREF TRAILER  dictionary STARTXREF EOF'''
    # The docstring is the grammar production; keep it verbatim.
    # Produces a two-element list: the 'xref' tree and the 'startxref' leaf.
    startxref_begin = p.lexspan(4)[0]

    # The xref tree wraps the trailer dictionary; its span runs from offset 0
    # up to the position just before STARTXREF, and the XREF token value is
    # passed as the 'xref' keyword argument.
    xref_tree = create_tree('xref', [p[3]], span=(0, startxref_begin - 1), xref=p[1])

    # The startxref leaf spans from STARTXREF to the end of the whole rule.
    rule_end = p.lexspan(0)[1]
    end_leaf = create_leaf('startxref', p[4], span=(startxref_begin, rule_end))

    p[0] = [xref_tree, end_leaf]
Exemplo n.º 12
def p_pdf_end(p):
    ''' pdf_end : STARTXREF EOF'''
    # The docstring is the grammar production; keep it verbatim.
    # The STARTXREF token value is kept in a 'startxref' leaf that spans the
    # whole rule.
    startxref_value = p[1]
    rule_span = p.lexspan(0)
    p[0] = create_leaf('startxref', startxref_value, span=rule_span)
Exemplo n.º 13
def p_xref_common(p):
    ''' xref : XREF TRAILER dictionary '''
    # The docstring is the grammar production; keep it verbatim.
    # The 'xref' tree holds the trailer dictionary followed by a 'data' leaf
    # containing the XREF token value converted to a string. Both nodes use
    # the span of the whole rule.
    rule_span = p.lexspan(0)
    xref_text = str(p[1])
    data_leaf = create_leaf('data', xref_text, span=rule_span)
    p[0] = create_tree('xref', [p[3], data_leaf], span=rule_span)
Exemplo n.º 14
def p_indirect_object_stream(p):
    ''' indirect_object_stream : OBJ dictionary STREAM_DATA ENDOBJ '''
    # The docstring is the grammar production; keep it verbatim.
    # Result: indirect_object -> stream -> [dictionary, data leaf].

    # The data leaf's span runs from the start of symbol 2 (the dictionary)
    # to the end of symbol 4 (ENDOBJ).
    data_span = (p.lexspan(2)[0], p.lexspan(4)[1])
    data_leaf = create_leaf('data', p[3], span=data_span)

    stream_tree = create_tree('stream', [p[2], data_leaf], span=p.lexspan(0))

    # p[1] is formatted with two %d placeholders to form the object id.
    rule_span = p.lexspan(0)
    object_id = "%d %d" % p[1]
    p[0] = create_tree('indirect_object', [stream_tree], span=rule_span, id=object_id)
Exemplo n.º 15

                    #Just parse the first object we can on this try.
                    #Comment out the following line to search for phantoms
                    #(i.e. objects inside objects or overlapped objects)
                except Exception,e:
                    logger.debug("Received exception %s when parsing potential object at [%s:%s]."%(e, start,end))
        logger.info("Succesfully parsed %d/%d Objects ending points"%(len(xml_iobjects),len(endobjs)*len(objs)))

        #sum all the objects
        allobjects = xml_headers + xml_xrefs + xml_pdf_ends + xml_iobjects

        if len(xml_pdf_ends) == 0:
            logger.info("%%%%EOF tag was not found! Creating a dummy.")
            dummy_startxref = create_leaf('startxref', -1, span=(len(pdf),len(pdf)))
            print dummy_startxref.value

        if len(xml_headers) == 0:
            logger.info("%%%%PDF-N-M tag was not found! Creating a dummy.")
            allobjects.append(create_leaf('header', "NOVERSION", span=(0,0)))

        #Sort it as they appear in the file
        allobjects = sorted(allobjects,lambda x,y: cmp(x.span[0], y.span[0]))

        #recreate XML structure 'best' we can...
        assert allobjects[0].tag == 'header'
        root_element = create_tree('pdf', [allobjects.pop(0)], span=(0,len(pdf)), version="OPAF!(raw)")
        update = create_tree('pdf_update', [],span=(0xfffffff,-1))