def bkpage(pdf_name):
    """Collect the bookmark titles of a PDF, ordered by page number.

    Args:
        pdf_name: Path of the PDF file to read.

    Returns:
        A 3-tuple ``(page_title, titles, title_tokens)``:

        * ``page_title`` -- dict mapping each UTF-8 encoded bookmark title
          to its page as an ``int``. Bookmarks that share a title collapse
          into a single entry (the last one seen wins).
        * ``titles`` -- the encoded titles sorted by ``(page, title)``.
        * ``title_tokens`` -- for each entry of ``titles``, in the same
          order, the title split on whitespace.
    """
    # Keep the file open until every bookmark field has been copied into
    # plain values; the reader may still need the stream while the outline
    # entries are being resolved.
    with open(pdf_name, 'rb') as f:
        pdf = PdfFileReader(f)

        # map page ids to page numbers
        pg_id_num_map = _setup_page_id_to_num(pdf)
        outlines = pdf.getOutlines()
        bookmarks_info = outlines_pg_zoom_info(outlines, pg_id_num_map)

        page_title = {}
        for info in bookmarks_info.values():
            page_title[info['title'].encode('utf-8')] = int(info['page'])

    # Sort by page first, then by title so bookmarks on the same page have a
    # deterministic order. The indexed key avoids tuple-parameter unpacking,
    # which is a syntax error on Python 3.
    ordered = sorted(page_title.items(), key=lambda item: (item[1], item[0]))

    titles = [title for title, _page in ordered]
    title_tokens = [title.split() for title in titles]

    return (page_title, titles, title_tokens)
Пример #2
0
 def get_outlines(pdf_file_path):
     """Return the outlines of the PDF stored at *pdf_file_path*.

     Args:
         pdf_file_path: Path of the PDF file to read.

     Returns:
         Whatever ``PdfFileReader.getOutlines()`` yields for the file.

     Raises:
         ValueError: If *pdf_file_path* is empty or ``None``.
     """
     if not pdf_file_path:
         # A bare ``raise`` has no active exception to re-raise here and
         # would surface as an unrelated error; report the real problem.
         raise ValueError("pdf_file_path must be a non-empty path")
     # PDF files are binary data, so the stream must be opened in 'rb' mode.
     with open(pdf_file_path, 'rb') as f:
         pdf_reader = PdfFileReader(f)
         return pdf_reader.getOutlines()
Пример #3
0
                    intr = open(subheading[i],'w')
                    for k in xrange(len(part)-1):
                        intr.write(part[k])
                    c=0
                    part=[]
            elif i+1==len(subheading):
                intr = open(subheading[i],'w')
                for k in xrange(len(part)-1):
                    intr.write(part[k])


# Driver script: read the outlines of 'y.pdf', collect the outline titles and
# hand them, together with the lines of "pdf_1.txt", to split().

# Please change this path for your machine. A raw string keeps every
# backslash literally; in a normal literal "\U" is a syntax error on Python 3.
path = r"C:\Users\Wu/nordron-sciinfo\Code\Rainy\code\extraction"
article = textract.process('y.pdf', m='pdfminer')  # please change the file name

# NOTE(review): `list` shadows the builtin. The name is kept because other
# code in this file may refer to this module-level variable -- confirm and
# rename if nothing else depends on it.
list = []

# Keep the PDF open until the titles have been copied out of the outline
# entries, then close it deterministically.
with open('y.pdf', 'rb') as f:  # import PDF file
    p = PdfFileReader(f)
    o = p.getOutlines()  # read outlines in pdf

    dimension(o, list)
    pdftotxt(article)

    # build a list of subtitles
    subheading = [entry["/Title"] for entry in list]

#title=title_extractor(path, filename)
with open("pdf_1.txt") as myfile:
    line = myfile.readlines()  # read txt file line by line
split(line, subheading, path)