Example #1
File: pdf_2jpeg.py  Project: edsomjr/TEP
def main(argv):
    if len(argv) >= 4:
        num_vis = int(argv[2])
        if len(argv) == num_vis*2 + 3:
            filename = argv[1]
            ini = []
            fin = []
            it = 3
            for i in range(0, num_vis):
                ini.append(int(argv[it]))
                fin.append(int(argv[it+1]))
                it+=2
        else:
            print('usage:\n    python pdf_2jpeg.py <pdf> <number of visualizations> <first page 1> <last page 1> ... <first page N> <last page N>')
            return
    else:
        print('usage:\n    python pdf_2jpeg.py <pdf> <number of visualizations> <first page 1> <last page 1> ... <first page N> <last page N>')
        return

    directory = os.path.splitext(filename)[0] + '/images'  # strip('.pdf') removes characters, not the suffix
    pages = convert_from_path(filename, 500)

    if not os.path.exists(directory):
        os.makedirs(directory)

    for i in range(0, num_vis):
        current_dir = directory+'/vis-'+str(i)
        if not os.path.exists(current_dir):
            os.makedirs(current_dir)
        for j in range(ini[i], fin[i]):
            pages[j].save(current_dir+'/out'+(str(j).zfill(2))+'.jpg', 'JPEG')

    return
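The excerpt above omits its imports and entry point. A minimal sketch of what the surrounding script presumably contains (the exact names in the original project may differ):

import os
import sys

from pdf2image import convert_from_path  # requires poppler to be installed on the system

if __name__ == '__main__':
    main(sys.argv)  # pass the raw argument list straight through, as main() expects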
Example #2
    def pdf2png(self,pdfPath,out_file):
        """ 
        PDFをpngに変換
        出力ファイルは%d必須
        """
        if "%d" not in out_file :
            raise ValueError("複数ファイル対応のため、'%d'必須です")

        # Convert right away
        images = convert_from_path(pdfPath)

        # Save each page
        for i,img in enumerate(images) :
            outpath = out_file % (i,)
            img.save(outpath, "png")

            # Resize
            # Reload the saved page with PIL
            img = Image.open(outpath)
            # Do the resize
            size = int(A4[0]*1.6), int(A4[1]*1.6) # scale to 1.6x the A4 page size
            img_resized = img.resize(size, Image.LANCZOS)
            img_resized.save(outpath)
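The method above relies on names that are not imported in the excerpt. A sketch of the imports it presumably needs; A4 is assumed here to be reportlab's page-size constant (a (width, height) tuple in points), though the original class may define it elsewhere:

from pdf2image import convert_from_path
from PIL import Image
from reportlab.lib.pagesizes import A4  # assumption: source of the A4 constant used for resizing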
Example #3
File: ocr.py  Project: tarbagan/pravo
import pytesseract
from pdf2image import convert_from_path
import glob

pdfs = glob.glob(r"yourPath\*.pdf")

for pdf_path in pdfs:
    pages = convert_from_path(pdf_path, 500)

    for pageNum,imgBlob in enumerate(pages):
        text = pytesseract.image_to_string(imgBlob,lang='eng')

        with open(f'{pdf_path[:-4]}_page{pageNum}.txt', 'w') as the_file:
            the_file.write(text)
Example #4
def pdf_to_img(pdf_file):
    return pdf2image.convert_from_path(pdf_file)
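A small usage sketch for the one-line helper above, with a hypothetical input file name:

import pdf2image

def pdf_to_img(pdf_file):
    return pdf2image.convert_from_path(pdf_file)

# hypothetical usage: write every page of sample.pdf as a numbered PNG
for i, page in enumerate(pdf_to_img("sample.pdf")):
    page.save(f"page_{i}.png", "PNG")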
Example #5
def convertPDF(input_path, output_path, temp_path):
    '''
    Converts a pdf file into a text file 
    '''

    file_name = input_path.stem
    PDF_file = str(input_path)

    '''
    Converting PDF to images
    '''

    # Store all the pages of the PDF in a variable
    pages = convert_from_path(PDF_file)

    # Counter to store images of each page of PDF to image
    image_counter = 1

    # Iterate through all the pages stored above
    for page in pages:

        # Declaring filename for each page of PDF as JPG
        image_dir = temp_path
        imagefilename = image_dir+file_name+"_page_"+str(image_counter)+".jpg"

        # Save the image of the page in system
        page.save(imagefilename, 'JPEG')

        # Increment the counter to update filename
        image_counter = image_counter + 1

    '''
    Recognizing text from the images using OCR
    '''

    # Variable to get count of total number of pages
    filelimit = image_counter-1

    # Creating a text file to write the output
    output_dir = output_path
    outfile = Path(output_dir, file_name+".txt")

    # Open the file in append mode so that
    # All contents of all images are added to the same file
    f = open(str(outfile), "a")

    # Iterate from 1 to total number of pages
    for i in range(1, filelimit + 1):

        # Set filename to recognize text from
        imagefilename = image_dir+file_name+"_page_"+str(i)+".jpg"

        # Recognize the text as string in image using pytesserct
        text = pytesseract.image_to_string(Image.open(imagefilename))

        # The recognized text is stored in variable text
        # Any string processing may be applied on text
        # Here, basic formatting has been done:
        # In many PDFs, at line ending, if a word can't
        # be written fully, a 'hyphen' is added.
        # The rest of the word is written in the next line
        text = text.replace('-\n', '')

        # Finally, write the processed text to the file.
        f.write(text)

    # Close the file after writing all the text.
    f.close()
Example #6
 def test_conversion_from_path(self):
     start_time = time.time()
     images_from_path = convert_from_path('./tests/test.pdf')
     self.assertTrue(len(images_from_path) == 1)
     print('test_conversion_from_path: {} sec'.format(time.time() -
                                                      start_time))
Example #7
 def test_empty_if_file_not_found(self):
     start_time = time.time()
     with self.assertRaises(Exception):
         convert_from_path('./tests/totally_a_real_file_in_folder.xyz')
     print('test_empty_if_file_not_found: {} sec'.format(time.time() -
                                                         start_time))
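The test above only checks that some Exception is raised. A sketch that catches the more specific error type recent pdf2image versions raise for unreadable files (assuming a pdf2image release that ships the exceptions module):

from pdf2image import convert_from_path
from pdf2image.exceptions import PDFPageCountError

try:
    convert_from_path('./tests/totally_a_real_file_in_folder.xyz')
except PDFPageCountError:
    # poppler could not determine a page count, i.e. the file is missing or not a valid PDF
    pass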
Example #8
        r"C:\Axis AI Challenge @ Akash_Abhishek\INPUT FILES")
    if each.endswith('.jpeg')
]
results_tiff = [
    each for each in os.listdir(
        r"C:\Axis AI Challenge @ Akash_Abhishek\INPUT FILES")
    if each.endswith('.tiff')
]
results_tif = [
    each for each in os.listdir(
        r"C:\Axis AI Challenge @ Akash_Abhishek\INPUT FILES")
    if each.endswith('.tif')
]

pdf = [
    convert_from_path(in_file, 500) for in_file in glob.glob(
        r"C:\Axis AI Challenge @ Akash_Abhishek\INPUT FILES\*.pdf")
]
total_no_pdf = len(pdf)

for i in range(0, len(pdf)):
    pages = pdf[i]
    name = results_pdf[i]
    name_list = os.path.splitext(name)[0]
    file_name.append(name_list)

    for j in range(0, len(pages)):
        page = pages[j]
        page.save(
            r"C:\Axis AI Challenge @ Akash_Abhishek\PROCESSED FILES 1\{}\PDF_{} page_{}.jpg"
            .format(i + 1, i + 1, j + 1))
Example #9
                pdf2convert = filename
            else:
                print('There are more than two pdf files in the directory')
                time.sleep(delay_b4_exit)
                exit()
    return pdf2convert

spare_files = [os.path.basename(__file__)]
pdf2use = get_pdf_to_use(dir2use)
if pdf2use == None:
    print('There are no pdf files in the current directory')
    time.sleep(delay_b4_exit)
    exit()
spare_files.append(pdf2use)

query = input("All the files in {} will be deleted except {}. Type yes to proceed and any other key to exit\n".format(dir2use, spare_files))
if query != 'yes':
    exit()
clean_dir(dir2use, spare_files=spare_files)
print('converting {} to images '.format(pdf2use))

output_file_name = str(pdf2use).replace('.pdf', '')
pdf2image.convert_from_path(os.path.join(dir2use, pdf2use), output_folder=dir2use, output_file=output_file_name, fmt='jpeg')

print('conversion successful')
time.sleep(delay_b4_exit)




Example #10
import tempfile
from os.path import isfile, join
from os import listdir
import cv2
import pytesseract
from pdf2image import convert_from_path

FOLDER = 'samples/'

ONLY_FILES = [f for f in listdir(FOLDER) if isfile(join(FOLDER, f))]

for current_file in ONLY_FILES:
    full_file = FOLDER + current_file
    with tempfile.TemporaryDirectory() as path:
        images_from_path = convert_from_path(full_file,
                                             fmt='jpeg',
                                             output_folder=path)
        for image in images_from_path:
            cv_image = cv2.imread(image.filename)
            gray = cv2.cvtColor(cv_image, cv2.COLOR_BGR2GRAY)
            gray = cv2.threshold(gray, 0, 255,
                                 cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
            text = pytesseract.image_to_string(gray)
            print(text)
Example #11
def main(_argv):
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    if len(physical_devices) > 0:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    yolo = YoloV3(classes=total_number_of_logos) # number of classes/logos, needs to be updated if another logo is added
    yolo.load_weights('./weights/yolov3-custom.tf').expect_partial() # file path to weights
    class_names = [c.strip() for c in open('./data/labels/custom.names').readlines()] # file path to classes list, needs to be updated if another logo is added
    if FLAGS.count:
        count = FLAGS.count
    excel = []
    images = []
    for i in range(count):
        con = convert_from_path('data/pdf/test (' + str(i+1) + ').pdf', output_folder='data/images', fmt="jpg", single_file=True, output_file='test (' + str(i+1) + ')')
        excel.append('data/excel/test (' + str(i+1) + ').xlsx')
        images.append('data/images/test (' + str(i+1) + ').jpg')
    raw_images = []
    for image in images:
        img_raw = tf.image.decode_image(
            open(image, 'rb').read(), channels=3)
        raw_images.append(img_raw)
    i = 0 # index number for main loop
    logos = [] # list of detected logos for each image
    approvals = [] # list of excel data for each image
    for raw_img in raw_images:
        img = tf.expand_dims(raw_img, 0)
        img = transform_images(img, 416) # image size
        t1 = time.time()
        boxes, scores, classes, nums = yolo(img)
        t2 = time.time()
        logging.info('time: {}'.format(t2 - t1))
        img = cv2.cvtColor(raw_img.numpy(), cv2.COLOR_RGB2BGR)
        img = draw_outputs(img, (boxes, scores, classes, nums), class_names)
        cv2.imwrite('./detections/detection (' + str(i+1) + ').jpg', img) # image output
# LABEL EXTRACTION
        temp_names = [] # temporary list for each image's logo detections
        for j in range(nums[0]):
            repeat = True
            temp_pair = [] # temporary list for each logo and its status
            if (j > 0):
                for k in range(len(temp_names)):
                    if (class_names[int(classes[0][j])] == temp_names[k][0]):
                        repeat = False
                        break
            if (repeat): # if not a repeated logo, update main logo list
                temp_pair.append(class_names[int(classes[0][j])]) # append logo 
                temp_pair.append(False) # append status
                temp_names.append(temp_pair) # append pair
        logos.append(temp_names) # append names list to main logo list
# EXCEL EXTRACTION
        wb = load_workbook(excel[i])
        sheet = wb.active
        rows = sheet.max_row
        temp_sheet = [] # temporary list for each image's excel data
        for j in range(rows-1):
            temp_rows = [] # temporary list for each row's excel data
            temp_rows.append(str(sheet.cell(row=j+2, column=4).value).upper().strip())
            temp_rows.append(str(sheet.cell(row=j+2, column=5).value).upper().strip())
            temp_rows.append("00FF0000") # Red by default
            temp_sheet.append(temp_rows)
        approvals.append(temp_sheet) # append sheet list to main approvals list
# EXCEL TRANSLATION
        for j in range(len(approvals[i])):
            if (approvals[i][j][0] in extola):
                temp_trans = extola[approvals[i][j][0]]
            else:
                temp_trans = ["NAL"] # No Associated Logo
            approvals[i][j][0] = temp_trans
# EXCEL COMPARED TO LABEL
# "APPROVAL STATUS"             "On label"   "Not on label"
# "APPROVED"                    "Green"     "Red"
# "NO REQUIREMENTS"             "Red"       "Green"
# "APPROVAL NOT APPLICABLE"     "Red"       "Green"
# "APPROVAL NOT REQUIRED"       "Red"       "Green"
# "CONTACT CISCO PARTNER/IOR"   "Red"       "Green"
# "NOT APPROVED"                "Red"       "Green"
# "PENDING"                     "Red"       "Green"
# "RENEWAL IN PROGESS"          "Red"       "Green"
# "NONE"/"UNKNOWN"              "Red"       "Red"
# 
# "00FF0000" (Red) needs attention
# "0000FF00" (Green) good to go
# "000000FF" (Blue) missing logo
#         
        for j in range(len(approvals[i])):
            flag = True
            k = 0
            temp_count = 0
            while (flag):
                if (k == len(logos[i])): # logo not on label
                    flag = False
                    if (approvals[i][j][1] == "APPROVED"):
                        approvals[i][j][2] = "00FF0000" # Red
                    elif ((approvals[i][j][1] == "APPROVAL NOT APPLICABLE")or(approvals[i][j][1] == "APPROVAL NOT REQUIRED")or(approvals[i][j][1] == "CONTACT CISCO PARTNER/IOR")or
                          (approvals[i][j][1] == "NOT APPROVED")or(approvals[i][j][1] == "PENDING")or(approvals[i][j][1] == "RENEWAL IN PROGESS")or(approvals[i][j][1] == "NO REQUIREMENTS")):
                        approvals[i][j][2] = "0000FF00" # Green
                    elif ((approvals[i][j][1] == "NONE")or(approvals[i][j][1] == "UNKNOWN")):
                        approvals[i][j][2] = "00FF0000" # Red
                        sheet.cell(row=j+2, column=5).value = "Unknown"
                elif (approvals[i][j][0][0] == "NAL"): # no logo to detect
                    flag = False
                    if ((approvals[i][j][1] == "APPROVAL NOT APPLICABLE")or(approvals[i][j][1] == "APPROVAL NOT REQUIRED")or(approvals[i][j][1] == "CONTACT CISCO PARTNER/IOR")or
                        (approvals[i][j][1] == "NOT APPROVED")or(approvals[i][j][1] == "PENDING")or(approvals[i][j][1] == "RENEWAL IN PROGESS")or(approvals[i][j][1] == "APPROVED")or(approvals[i][j][1] == "NO REQUIREMENTS")):
                        approvals[i][j][2] = "0000FF00" # Green
                    elif ((approvals[i][j][1] == "NONE")or(approvals[i][j][1] == "UNKNOWN")):
                        approvals[i][j][2] = "00FF0000" # Red
                        sheet.cell(row=j+2, column=5).value = "Unknown"  
                else: # continue or logo on label
                    for X in range(len(approvals[i][j][0])):
                        if (approvals[i][j][0][X] == logos[i][k][0]): # logo on label
                            logos[i][k][1] = True
                            temp_count+=1
                            if (temp_count == len(approvals[i][j][0])):
                                flag = False
                            if ((approvals[i][j][1] == "APPROVAL NOT APPLICABLE")or(approvals[i][j][1] == "APPROVAL NOT REQUIRED")or(approvals[i][j][1] == "CONTACT CISCO PARTNER/IOR")or
                                (approvals[i][j][1] == "NOT APPROVED")or(approvals[i][j][1] == "PENDING")or(approvals[i][j][1] == "RENEWAL IN PROGESS")):
                                approvals[i][j][2] = "00FF0000" # Red
                            elif ((temp_count == len(approvals[i][j][0]))and(approvals[i][j][1] == "APPROVED")or(approvals[i][j][1] == "NO REQUIREMENTS")):
                                approvals[i][j][2] = "0000FF00" # Green
                            elif ((approvals[i][j][1] == "NONE")or(approvals[i][j][1] == "UNKNOWN")):
                                approvals[i][j][2] = "00FF0000" # Red
                                sheet.cell(row=j+2, column=5).value = "Unknown"
                k+=1
            sheet.cell(row=j+2, column=5).fill = PatternFill(start_color=approvals[i][j][2], end_color=approvals[i][j][2], fill_type='solid')
# LABEL COMPARED TO EXCEL
        new_row=1
        for j in range(len(logos[i])):
            if (logos[i][j][1] == False): # not on excel so add it in a new row
                sheet.cell(row=new_row+rows, column=1).value = str(sheet.cell(row=rows, column=1).value) #1 Product Name
                sheet.cell(row=new_row+rows, column=3).value = str(sheet.cell(row=rows, column=3).value) #3 Desc
                sheet.cell(row=new_row+rows, column=4).value = logos[i][j][0] #4 Country
                sheet.cell(row=new_row+rows, column=5).value = "Unknown" #5 Approval Status
                sheet.cell(row=new_row+rows, column=5).fill = PatternFill(start_color="000000FF", end_color="000000FF", fill_type='solid') #5 Blue
                for k in range(5):
                    sheet.cell(row=new_row+rows, column=k+6).value = str(sheet.cell(row=rows, column=k+6).value) #6-10
                new_row+=1
        wb.save(excel[i])
        i+=1
# DISPLAY
    for j in range(i):
        print("\nL" + str(j+1) + ": ", end="")
        temp_print = []
        for k in range(len(logos[j])):
            temp_print.append(logos[j][k][0])
        print(temp_print, "\nE" + str(j+1) + ": ", end="")
        temp_print = []
        for k in range(len(approvals[j])):
            temp_print.append(approvals[j][k][0])
        print(temp_print)
    print("")
Example #12

def add_timestamp_filename(filename):
    file, file_ext = filename.rsplit(".", 1)  # split on the last dot so names with dots keep the real extension
    now = datetime.now()
    timestamp = str(now.timestamp())
    timestamp = timestamp.replace(".", "")
    filename = file + timestamp + "." + file_ext

    return filename


#
# pdf_file = "p75.pdf"
#
# files = []
# with(Image(filename=pdf_file, resolution=500)) as conn:
#     for index, image in enumerate(conn.sequence):
#         image_name = os.path.splitext(pdf_file)[0] + str(index + 1) + '.png'
#         Image(image).save(filename=image_name)
#         files.append(image_name)

from pdf2image import convert_from_path
images = convert_from_path("p75.pdf",
                           500,
                           poppler_path=r'D:\poppler-0.68.0\bin')
for i, image in enumerate(images):
    fname = 'image' + str(i) + '.png'
    image.save(fname, "PNG")
Example #13
                                   file)) and filename != '.DS_Store':
        i += 1
        print("Processing " + filename + " " + str(i) + " of " + str(noFiles))
        pdf_file = open(sys.argv[1] + '/' + filename, 'rb')
        read_pdf = PyPDF2.PdfFileReader(pdf_file)
        number_of_pages = read_pdf.getNumPages()
        print(number_of_pages)
        for x in range(number_of_pages):
            print("Processing Page" + str(x) + " of " + str(number_of_pages))
            page = read_pdf.getPage(x)
            print(filename)

            basename = os.path.basename(sys.argv[1] + '/' + filename)
            new_name, _ = os.path.splitext(basename)
            d = sys.argv[1] + '/images/'
            # nn = d+new_name + "_pg_" + str(x)
            nn = d + new_name
            print(sys.argv[1] + "/" + filename)
            print(nn)
            images = convert_from_path(sys.argv[1] + "/" + filename)
            finalFilename = d + new_name + ".jpg"

            for image in images:
                image.save(finalFilename, 'JPEG')

            # text = textract.process(sys.argv[1]+"/"+filename, method='tesseract', language='eng')
            # # f= open(d+new_name + "_pg_" + str(x) + ".txt", "wb")
            # f= open(d+new_name + ".txt", "wb")
            # f.write(text)
            # f.close
Example #14
import os
from pdf2image import convert_from_path
rootdir = '/home/cso/Downloads/customer_pdfs'

for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        full_path = os.path.join(subdir, file)
        if full_path.endswith('.pdf'):
            converted_name = os.path.join(
                subdir, '%s_conv.jpg' % (file.replace('.pdf', '')))
            converted_name_2 = os.path.join(
                subdir, '%s_conv_2.jpg' % (file.replace('.pdf', '')))
            images = convert_from_path(full_path,
                                       500,
                                       output_folder=subdir,
                                       fmt='jpg',
                                       first_page=7,
                                       last_page=7)
            images2 = convert_from_path(full_path,
                                        500,
                                        output_folder=subdir,
                                        fmt='jpg',
                                        first_page=9,
                                        last_page=9)
            os.rename(images[0].filename, converted_name)
            os.rename(images2[0].filename, converted_name_2)
Example #15
	lc_year = year_ago.strftime("%y")
	month = year_ago.strftime("%m")
	day = year_ago.strftime("%d")

	aux=['E','F','M','A','Y','J','L','G','S','O','N','D']
	letter_month=aux[int(month)-1]
except:
	sys.exit()


	#LaVanguardia
try:
	url = f'http://hemeroteca-paginas.lavanguardia.com/LVE05/PUB/{year}/{month}/{day}/LVG{year}{month}{day}0011LB.pdf'
	wget.download(url, 'temporal/lavanguardia.pdf')

	pages = convert_from_path('temporal/lavanguardia.pdf', 500)
	time.sleep(3)
	for page in pages:
		page.save('temporal/lavanguardia.jpg', 'JPEG')
	time.sleep(3)

except:
	os.system("cp fail.jpg temporal/lavanguardia.jpg")
#ABC
try:
	url = f'https://static.abc.es/media/hemeroteca/{year}/{month}/{day}/abc-madrid-{year}{month}{day}-1-t6w--620x846.jpg'
	wget.download(url, 'temporal/abc.jpg')
	time.sleep(3)
except:
	os.system("cp fail.jpg temporal/abc.jpg")
Example #16
File: ocr.py  Project: sunilhariharan/ocr
def ocr(PDF_file, outfile, output):
    f = open(outfile, "a")
    pages = convert_from_path(PDF_file, 500)

    for page in pages:

        filename = "page.png"
        page.save(filename, 'JPEG')
        cv_img = cv2.imread(filename, 0)

        blur = cv2.GaussianBlur(cv_img, (5, 5), 0)
        ret3, th3 = cv2.threshold(blur, 0, 255,
                                  cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        cv2.imwrite('denoised.png', th3)
        text = pytesseract.image_to_string(Image.open('denoised.png'))
        text = text.replace('-\n', '')

        f.write(text)

    f.close()

    with open(outfile, "r") as f:
        searchlines = f.readlines()

    searchlines = list(map(lambda s: s.strip(), searchlines))

    searchlines = [line for line in searchlines if line != '']  # drop blank lines (removing while iterating would skip entries)

    applicant_details = searchlines[searchlines.index(
        "2. Applicant Details"):searchlines.index("2. Applicant Details") + 8]

    applicant_name = applicant_details[1:4][0].split(
        'Title')[1] + applicant_details[1:4][1].split(
            'First name')[1] + applicant_details[1:4][2].split('Surname')[1]

    applicant_name = applicant_name.strip()

    applicant_company_name = applicant_details[4].split('Company name')[1]

    applicant_company_name = applicant_company_name.strip()

    applicant_address = applicant_details[5].split(
        'Address line 1 ')[1] + applicant_details[6].split('Address line 2')[
            1] + applicant_details[7].split('Address line 3')[1]

    applicant_address = applicant_address.strip()

    subs = 'Description of proposed materials and finishes'

    res = [i for i in searchlines if subs in i]

    l1 = [i.split(': ') for i in res]

    l2 = []
    for i in l1:
        l2.append(i[1])

    searchlines = [line for line in searchlines if line != '']

    l3 = []
    if l2 == []:
        agent_details = searchlines[searchlines.index("3. Agent Details"):
                                    searchlines.index("3. Agent Details") + 11]
        agent_name = agent_details[1:4][0].split(
            'Title')[1] + agent_details[1:4][1].split(
                'First name')[1] + agent_details[1:4][2].split('Surname')[1]
        agent_name = agent_name.strip()
        agent_company_name = agent_details[7]
        agent_company_name = agent_company_name.strip()
        agent_address = agent_details[8] + ',' + agent_details[
            9] + ',' + agent_details[10].split('Address line 3')[1]
        agent_address = agent_address.strip()
        l3.append(
            searchlines[searchlines.index("Walls - Materials"):searchlines.
                        index("Walls - Materials") + 8])
        l4 = []
        for i in range(len(l3[0])):
            if i % 2 == 1:
                l4.append(l3[0][i])
        f = open(output, "w")
        f.write("1. Applicant details \n" + "Name : " + applicant_name + '\n' +
                "Company : " + applicant_company_name + '\n' + "Address : " +
                applicant_address + '\n' + '\n' + "2. Agent details\n" +
                "Name : " + agent_name + '\n' + "Company : " +
                agent_company_name + '\n' + "Address : " + agent_address +
                '\n' + '\n' + "3.Materials : " + l4[0] + "," + l4[1] + "," +
                l4[2] + "," + l4[3])
        f.close()
    else:
        agent_details = searchlines[searchlines.index("3. Agent Details"):
                                    searchlines.index("3. Agent Details") + 8]
        agent_name = agent_details[1:4][0].split(
            'Title')[1] + agent_details[1:4][1].split(
                'First name')[1] + agent_details[1:4][2].split('Surname')[1]
        agent_name = agent_name.strip()
        agent_company_name = agent_details[4].split('Company name')[1]
        agent_company_name = agent_company_name.strip()
        agent_address = agent_details[5].split(
            'Address line 1')[1] + ',' + agent_details[6].split(
                'Address line 2')[1] + ',' + agent_details[7].split(
                    'Address line 3')[1]
        agent_address = agent_address.strip()
        f = open(output, "w")
        f.write("1. Applicant details \n" + "Name : " + applicant_name + '\n' +
                "Company : " + applicant_company_name + '\n' + "Address : " +
                applicant_address + '\n' + '\n' + "2. Agent details\n" +
                "Name : " + agent_name + '\n' + "Company : " +
                agent_company_name + '\n' + "Address : " + agent_address +
                '\n' + '\n' + "3.Materials : " + l2[0] + "," + l2[1] + "," +
                l2[2] + "," + l2[3])
        f.close()
Example #17
# coding: utf-8

# In[1]:

from pdf2image import convert_from_path
pages = convert_from_path('pdf1.pdf', 500)

# In[2]:

for page in pages:
    page.save('out.jpg', 'JPEG')

# In[3]:

from PIL import Image
import pytesseract
#import tesseract

# In[4]:

text = pytesseract.image_to_string(Image.open('out.jpg'))

# In[5]:

text

# In[ ]:
Example #18
import os
from pdf2image import convert_from_path
import PIL
#4896*6336
pdf_dir = r"C:\Users\randy\Downloads\the_way_to_train"
save_dir = r'C:\Users\randy\Downloads\the_way_to_train\PDF2IMG'

os.chdir(pdf_dir)
for pdf_file in os.listdir(pdf_dir):

    if pdf_file.endswith(".pdf"):

        pages = convert_from_path(pdf_file, dpi=96)
        pdf_file = pdf_file[:-4]

        for page in pages:
            print("%s/%s-page%d.png" % (save_dir,pdf_file,pages.index(page)))
           # page.save("%s/%s-page%d.png" % (save_dir,pdf_file,pages.index(page)), "png")
            if page == [-1]:
                print(page)

'''
if __name__ == "__main__":
    pdf2img(pdf_dir)
'''
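In the loop above the save call is commented out, and pages.index(page) is both linear and unreliable for visually identical pages. A sketch of the same conversion with the save enabled and enumerate used for numbering (same paths as the snippet above):

import os
from pdf2image import convert_from_path

pdf_dir = r"C:\Users\randy\Downloads\the_way_to_train"
save_dir = r'C:\Users\randy\Downloads\the_way_to_train\PDF2IMG'

for pdf_file in os.listdir(pdf_dir):
    if pdf_file.endswith(".pdf"):
        pages = convert_from_path(os.path.join(pdf_dir, pdf_file), dpi=96)
        base = pdf_file[:-4]
        for page_number, page in enumerate(pages):
            # enumerate gives a stable page index without scanning the list
            page.save("%s/%s-page%d.png" % (save_dir, base, page_number), "PNG")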
Example #19
 def test_conversion_from_path_241(self):  # pragma: no cover
     start_time = time.time()
     images_from_path = convert_from_path('./tests/test_241.pdf')
     self.assertTrue(len(images_from_path) == 241)
     print('test_conversion_from_path_241: {} sec'.format(
         (time.time() - start_time) / 241.))
Example #20
 def pdf2img(self, pdf_path):
     return pdf2image.convert_from_path(pdf_path)
Example #21
 def test_empty_if_corrupted_pdf(self):
     start_time = time.time()
     with self.assertRaises(Exception):
         convert_from_path('./tests/test_corrupted.pdf')
     print('test_empty_if_corrupted_pdf: {} sec'.format(time.time() -
                                                        start_time))
Example #22
def process_score_image_request():
    if request.method == "POST":
        file = request.files['file']
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            # create config
            cfg = get_cfg()
            # below path applies to current installation location of Detectron2
            cfgFile = "DLA_mask_rcnn_X_101_32x8d_FPN_3x.yaml"
            cfg.merge_from_file(cfgFile)
            cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model
            cfg.MODEL.WEIGHTS = "model_final_trimmed.pth"
            cfg.MODEL.DEVICE = "cpu"  # we use a CPU Detectron copy
            # boxes = outputs['instances'].pred_boxes.tensor.cpu().numpy()[0]
            classes = ['text', 'title', 'list', 'table', 'figure']
            default_predictor = detectron2.engine.defaults.DefaultPredictor(
                cfg)
            pages = convert_from_path("/uploads/" + filename,
                                      dpi=200,
                                      fmt='jpeg')

            for idx, p in enumerate(pages):
                print(idx)
                # pages
                im = np.array(p)[:, :, ::-1]
                predictions = default_predictor(im)
                instances = predictions["instances"].to('cpu')
                MetadataCatalog.get(cfg.DATASETS.TEST[0]).thing_classes = [
                    'text', 'title', 'list', 'table', 'figure'
                ]
                pred_classes = instances.pred_classes

                labels = [classes[i] for i in pred_classes]
                label_count = [{i: labels.count(i)} for i in labels]
                label_count = [
                    dict(y) for y in set(
                        tuple(x.items()) for x in label_count)
                ]
                label_count = [{
                    k: [v, []]
                } for label in label_count for k, v in label.items()]
                print(label_count)
                page_label_count = {f"page {idx}": label_count}
                print(page_label_count)

                # print(label_count)

                def add_content(content):
                    for i in label_count:
                        for k, v in i.items():
                            if k == label:
                                v[1].append(content)
                    return True

                boxes = instances.pred_boxes
                if isinstance(boxes, detectron2.structures.boxes.Boxes):
                    boxes = boxes.tensor.numpy()
                else:
                    boxes = np.asarray(boxes)

                from PIL import Image
                import math
                table = []
                list_ = []
                text = []
                title = []
                # content = [table]

                for label, bbox in zip(labels, boxes):

                    # getting prediction bboxes from model outputs

                    x2 = math.ceil(bbox[0])
                    x1 = math.ceil(bbox[1])
                    y2 = math.ceil(bbox[2])
                    y1 = math.ceil(bbox[3])
                    crop_img = im[x1:y1, x2:y2]
                    if len(crop_img) <= 8:
                        continue

                    if label == "table":
                        print(label)
                        # add_content(img_(crop_img[ : , : , -1]))
                    elif label == "list":
                        add_content(extract_from_images(crop_img))
                    elif label == "title":
                        add_content(extract_from_images(crop_img))
                    elif label != "figure":
                        add_content(extract_from_images(crop_img))
                        # print(page_label_count)
                #print(page_label_count)
                for k, v in page_label_count.items():
                    # sendToNeo4j("MERGE (d:Document)-[:Page]->(p: Page {page_num: $k})", k=k)
                    for i in v:
                        for l, m in i.items():
                            # print(m)
                            if l == 'figure':
                                sendToNeo4j(
                                    "MERGE (d:Document) MERGE(d)-[:Page]->(p: Page {page_num: $page}) MERGE(p)-[:Figure_count {figure: $m}]->(f:Figure {figure: 'figure'})",
                                    m=m[0],
                                    page=k)
                            if l == 'text':
                                sendToNeo4j(
                                    "MERGE (d:Document) MERGE(d)-[:Page]->(p: Page {page_num: $page}) MERGE(p)-[:Paragraph_count {text: $m}]->(pa:Paragraph {text: $text})",
                                    m=m[0],
                                    page=k,
                                    text=m[1])
                            if l == 'title':
                                sendToNeo4j(
                                    "MERGE (d:Document) MERGE(d)-[:Page]->(p: Page {page_num: $page}) MERGE(p)-[:Title_count {title: $m}]->(t:Title {title: $title})",
                                    m=m[0],
                                    page=k,
                                    title=m[1])
                            if l == 'table':
                                sendToNeo4j(
                                    "MERGE (d:Document) MERGE(d)-[:Page]->(p: Page {page_num: $page}) MERGE(p)-[:Table_count {table: $m}]->(ta:Table {table: $table})",
                                    m=m[0],
                                    page=k,
                                    table=m[1])
                            if l == 'form':
                                sendToNeo4j(
                                    "MERGE (d:Document) MERGE(d)-[:Page]->(p: Page {page_num: $page}) MERGE(p)-[:Form_count {form: $m}]->(fo:Form {form: $form})",
                                    m=m[0],
                                    page=k,
                                    form=m[1])

                                # sendToNeo4j('MERGE(p:Page{page:$page_label_count.keys()[0]', keys=page_label_count.keys()[0])

        return render_template('index.html')
Example #23
 def post(self):
     ############ Get Input Params ################
     data = request.get_json()
     file_url = data['file_url']
     file_type = data['file_type'].lower()
     ### Check If PDF and Image Directory Exists ###
     if not (os.path.exists(DownloadDirectory)):
         os.mkdir(DownloadDirectory)
     if not (os.path.exists(ExtractedImageDirectory)):
         os.mkdir(ExtractedImageDirectory)
     ############# Download File ##################
     file_type = file_type.lower()
     random_integer = str(random.randint(100000, 999999))
     random_letters = ''.join(random.choice(letters) for i in range(5))
     random_combination = random_integer + random_letters
     FileName = "File_" + random_combination
     DownloadFilePath = DownloadDirectory + FileName
     ExtractedImageFileName = ExtractedImageDirectory + FileName
     ########### Check File Type ####################
     if file_type == "png":
         FileName = FileName + ".png"
     elif file_type == "pdf":
         FileName = FileName + ".pdf"
     else:
         return {
             'msg': 'Error',
             'description': 'Unsupported file extension.'
         }
     ########## Download File ###################
     try:
         response = requests.get(str(file_url))
     except:
         return {
             'msg':
             'Error',
             'description':
             'Unable to download file. Please check the file url again.'
         }
     ############# Write downloaded file to local ##################
     try:
         with open(DownloadFilePath, 'wb') as f:
             f.write(response.content)
     except:
         return {
             'msg': 'Error',
             'description': 'Unable to save downloaded file.'
         }
     ############## Save Image from Downloaded File ###################
     ImageList = []
     if file_type == "png":
         ExtractedImageFilePath = ExtractedImageFileName + ".png"
         try:
             im = Image.open(DownloadFilePath)
         except:
             os.remove(DownloadFilePath)
             return {
                 'msg': 'Error',
                 'description': 'Unable to open downloaded file.'
             }
         try:
             im.save(ExtractedImageFilePath)
             ImageList.append(ExtractedImageFilePath)
             os.remove(DownloadFilePath)
         except:
             os.remove(DownloadFilePath)
             return {
                 'msg': 'Error',
                 'description': 'Unable to save image file.'
             }
     elif file_type == "pdf":
         try:
             print("--------Inside-------------------")
             images = convert_from_path(DownloadFilePath, dpi=500)
             print(len(images))
             for pagenum, image in enumerate(images):
                 image = image.convert('LA')
                 ExtractedImageFilePath = ExtractedImageFileName + "_Page" + str(
                     pagenum + 1) + ".png"
                 image.save(ExtractedImageFilePath)
                 ImageList.append(ExtractedImageFilePath)
             os.remove(DownloadFilePath)
         except:
             os.remove(DownloadFilePath)
             return {
                 'msg': 'Error',
                 'description': 'Unable to convert pdf to image.'
             }
     else:
         return {'msg': 'Error', 'description': 'Unsupported File Format.'}
     ##### Check and compress File Size #######
     for image in ImageList:
         while os.stat(image).st_size > 9437184:
             im = Image.open(image)
             im.save(image, optimize=True, quality=80)
     ########### Check For Templates ###########
     FirstImage = ImageList[0]
     try:
         with io.open(FirstImage, 'rb') as gen_image_file:
             content = gen_image_file.read()
     except:
         for image in ImageList:
             os.remove(image)
         return {
             'msg':
             'Error',
             'description':
             'Unable to read extracted image for template detection.'
         }
     try:
         client = vision.ImageAnnotatorClient()
         #image = vision.types.Image(content=content)
         #image = vision.Image(content=content)
         image = vision.Image(content=content)
         response = client.text_detection(image=image)
     except Exception as e:
         print(e)
         for image in ImageList:
             os.remove(image)
         return {
             'msg': 'Error',
             'description': 'Unable to invoke Google vision api.'
         }
     ############ Create Dict for Vision API response ###########
     DictResponse = MessageToDict(response._pb)
     WholeContentDescription = DictResponse['textAnnotations'][0][
         'description'].lower()
     ################# Match Template ############################
     TemplateName = ""
     for templatename, keywords in Templates.items():
         matchfound = True
         for keyword in keywords:
             if keyword not in WholeContentDescription:
                 matchfound = False
         if matchfound:
             TemplateName = templatename
             break
     if TemplateName == "":
         for image in ImageList:
             os.remove(image)
         return {
             'msg': 'Error',
             'description': 'Unable to find a matching Template.'
         }
     ############## Yang Ming Template #############################
     if TemplateName == "YangMingTemplate":
         try:
             response = ExtractDataForYongMingTemplate(DictResponse)
             if response == "missing keywords":
                 for image in ImageList:
                     os.remove(image)
                 return {
                     'msg':
                     'Error',
                     'description':
                     'Google Vision API unable to find all the mandatory keywords for Yang Ming Invoice.'
                 }
             else:
                 for image in ImageList:
                     os.remove(image)
                 return response
         except:
             for image in ImageList:
                 os.remove(image)
             return {
                 'msg':
                 'Error',
                 'description':
                  'Unknown issue occurred. Please connect with system administrator with the input file.'
             }
     ############## Maersk Template #############################
     if TemplateName == "MaerskTemplate":
         try:
             response = ProcessMaerskInvoice(ImageList)
             if response == "invocation error":
                 for image in ImageList:
                     os.remove(image)
                 return {
                     'msg': 'Error',
                     'description': 'Unable to Invoke Google Vision API'
                 }
             elif response == "missing keywords":
                 for image in ImageList:
                     os.remove(image)
                 return {
                     'msg':
                     'Error',
                     'description':
                     'Google Vision API unable to find all the mandatory keywords for Maersk Invoice.'
                 }
             elif response == "unable to extract data from Google Vision API":
                 for image in ImageList:
                     os.remove(image)
                 return {
                     'msg':
                     'Error',
                     'description':
                     'Unable to extract data from Google Vision API.'
                 }
             else:
                 for image in ImageList:
                     os.remove(image)
                 return response
          except Exception as e:
             print(e)
             for image in ImageList:
                 os.remove(image)
             return {
                 'msg':
                 'Error',
                 'description':
                  'Unknown issue occurred. Please connect with system administrator with the input file.'
             }
     ############## Evergreen Template #############################
     if TemplateName == "EvergreenTemplate":
         try:
             response = ProcessEvergreenInvoice(ImageList)
             if response == "invocation error":
                 for image in ImageList:
                     os.remove(image)
                 return {
                     'msg': 'Error',
                     'description': 'Unable to Invoke Google Vision API'
                 }
             elif response == "missing keywords":
                 for image in ImageList:
                     os.remove(image)
                 return {
                     'msg':
                     'Error',
                     'description':
                     'Google Vision API unable to find all the mandatory keywords for Evergreen Invoice.'
                 }
             elif response == "unable to extract data from Google Vision API":
                 for image in ImageList:
                     os.remove(image)
                 return {
                     'msg':
                     'Error',
                     'description':
                     'Unable to extract data from Google Vision API.'
                 }
             else:
                 for image in ImageList:
                     os.remove(image)
                 return response
         except:
             for image in ImageList:
                 os.remove(image)
             return {
                 'msg':
                 'Error',
                 'description':
                  'Unknown issue occurred. Please connect with system administrator with the input file.'
             }
     ################## OOCL Template ##############################
     if TemplateName == "OOCLTemplate":
         try:
             response = ProcessOOCLInvoice(ImageList)
             if response == "invocation error":
                 for image in ImageList:
                     os.remove(image)
                 return {
                     'msg': 'Error',
                     'description': 'Unable to Invoke Google Vision API'
                 }
             elif response == "missing keywords":
                 for image in ImageList:
                     os.remove(image)
                 return {
                     'msg':
                     'Error',
                     'description':
                      'Google Vision API unable to find all the mandatory keywords for OOCL Invoice.'
                 }
             elif response == "unable to extract data from Google Vision API":
                 for image in ImageList:
                     os.remove(image)
                 return {
                     'msg':
                     'Error',
                     'description':
                     'Unable to extract data from Google Vision API.'
                 }
             else:
                 for image in ImageList:
                     os.remove(image)
                 return response
         except Exception as e:
             print(e)
             #raise e
             for image in ImageList:
                 os.remove(image)
             return {
                 'msg':
                 'Error',
                 'description':
                  'Unknown issue occurred. Please connect with system administrator with the input file.'
             }
Example #24
def pipeline(directory):
    # Function that performs a set of operations stopping
    # at the type CHECKPOINT
    global G

    images = os.listdir(directory)

    graphs = []
    for i in images:

        print("[*] Parsing image file {}".format(i))

        # Open image depending on image format
        full_path = os.path.join(directory, i)
        if i.split('.')[1].lower() == "pdf":
            img = convert_from_path(full_path)[0].convert('RGB')
        else:
            img = Image.open(full_path).convert('RGB')

        # Trim border
        img = np.array(img)
        border_image = trim_meta(img)

        # Grab ECU mappings and the line colors in image
        print("\t|---> [*] Extracting Text")
        ecu_map = extract_text(border_image)

        # Build relationships based off lines
        lines_img = line_filter(border_image, color='lines')
        lines_img = cv2.cvtColor(lines_img, cv2.COLOR_GRAY2RGB)

        print("\t|---> [*] Building Relationships")
        graph = build_relationships(lines_img, ecu_map.copy())
        graphs.append(graph.copy())

        G.clear()

    # Concatenate page continuation lines
    print("[*] Composing continued page graphs")
    F = None
    for i in range(0, len(graphs) - 1):
        print(5 * " " + "|---> Page {} <--> Page {}".format(i, i + 1))
        g, h = (F, graphs[i + 1]) if F != None else (graphs[i], graphs[i + 1])

        out_nodes = [n for n in g.nodes if "OUT-" in n]
        next_in_nodes = [n for n in h.nodes if "IN-" in n]
        # next_in_nodes = next_in_nodes[:-1]
        if len(out_nodes) != len(next_in_nodes):
            print("")
            print(
                "[-] Length of OUT[{}] and IN[{}] nodes does not match".format(
                    len(out_nodes), len(next_in_nodes)))
            print("    returning all graphs")
            return graphs

        new_names = [
            "CONNECT-JUNCT-{}-{}".format(i, n)
            for n in range(0, len(next_in_nodes))
        ]
        mapping_out = dict(list(zip(out_nodes, new_names)))
        mapping_in = dict(list(zip(next_in_nodes, new_names)))

        G_ = nx.relabel_nodes(g, mapping_out)
        H_ = nx.relabel_nodes(h, mapping_in)
        F = nx.compose(G_, H_)

    return F
Example #25
import os
import tempfile
from pdf2image import convert_from_path
import cv2

files = filter(os.path.isfile, os.listdir(os.curdir))
for singlefile in files:
    if_pdf = singlefile.split('.')
    if if_pdf[1] == 'pdf':
        try:
            images_from_path = convert_from_path(singlefile,
                                                 last_page=1,
                                                 first_page=0)
            base_filename = if_pdf[0] + '.jpg'
            save_dir = os.getcwd()
            images_from_path[0].save(os.path.join(save_dir, base_filename),
                                     'JPEG')
            img = cv2.imread(base_filename)
            dim = (299, 299)
            resizeImg = cv2.resize(img, dim)
            cv2.imwrite(base_filename, resizeImg)
            #os.remove(singlefile)
        except:
            print(singlefile)

print("Done!")
Example #26
def parseCheckYN(page):

    # set path
    setpath = '/Users/natewagner/Documents/Surveys/batch1/'
    path = setpath + page
    
    # Convert to png and to greyscale / rotate
    images = convert_from_path(path)
    images_bw = images[0].convert('L') 
    images_bw = images_bw.transpose(Image.ROTATE_270)
    
    # set path to pytesseract
    pytesseract.pytesseract.tesseract_cmd = r'/usr/local/Cellar/tesseract/4.1.1/bin/tesseract'
    
    
    # extract account number
    accnt_num_dims = (1000+155, 150-140, 1600-200, 75)
    accnt_num = images_bw.crop(accnt_num_dims)    
    accnt_number = pytesseract.image_to_string(accnt_num)
    
    
    # extract company name
    comp_info_dims = (465+250, 1100, 1250+400, 1145)
    comp_info = images_bw.crop(comp_info_dims) 
    bus_name = pytesseract.image_to_string(comp_info)
    
    
    # question 2
    question2_dims = (150-40, 783+145, 290+10, 923+195)
    question2 = images_bw.crop(question2_dims)    
    Q2p = []
    for pixel in iter(question2.getdata()):
        Q2p.append(pixel)
    
    q2df = pd.DataFrame(Q2p).transpose()
    q2df['question'] = 'question2'
    q2df['accnt_num'] = accnt_number
    q2df['bus_name'] = bus_name
    #q2df.head(20)
    
    
    
    # question 3
    question3_dims = (135-30, 860+80, 275+20, 1000+130)
    question3 = images_bw.crop(question3_dims) 
    Q3p = []
    for pixel in iter(question3.getdata()):
        Q3p.append(pixel)
    
    q3df = pd.DataFrame(Q3p).transpose()
    q3df['question'] = 'question3'
    q3df['accnt_num'] = accnt_number
    q3df['bus_name'] = bus_name
    
    
    
    # question 4
    question4_dims = (115, 1070, 305, 1070+190)
    question4 = images_bw.crop(question4_dims) 
    Q4p = []
    for pixel in iter(question4.getdata()):
        Q4p.append(pixel)
    
    q4df = pd.DataFrame(Q4p).transpose()
    q4df['question'] = 'question4'
    q4df['accnt_num'] = accnt_number
    q4df['bus_name'] = bus_name
    
    
    
    # question 8
    question8_dims = (1060+300, 928-250, 1250+300, 1118-250)
    question8 = images_bw.crop(question8_dims) 
    Q8p = []
    for pixel in iter(question8.getdata()):
        Q8p.append(pixel)
    
    q8df = pd.DataFrame(Q8p).transpose()
    q8df['question'] = 'question8'
    q8df['accnt_num'] = accnt_number
    q8df['bus_name'] = bus_name
    
    
    
    # question 9
    question9_dims = (1060+112, 928-180, 1250+112, 1118-180)
    question9 = images_bw.crop(question9_dims) 
    Q9p = []
    for pixel in iter(question9.getdata()):
        Q9p.append(pixel)
    
    q9df = pd.DataFrame(Q9p).transpose()
    q9df['question'] = 'question9'
    q9df['accnt_num'] = accnt_number
    q9df['bus_name'] = bus_name

    check_YN_data = pd.concat([q2df, q3df, q4df, q8df, q9df])
    check_YN_data['survey'] = str(page)
    
    return(check_YN_data)
Example #27
 def __init__(self, pdf_path):
     self.images = convert_from_path(pdf_path)
Example #28
# Import libraries
from PIL import Image
import pytesseract
import sys
from pdf2image import convert_from_path
import os

# Path of the pdf
PDF_file = "/home/naresh/Tesseract/Languagefiles/Hindi/law_commisionofindia/law_commisionofindia.pdf"
''' 
Part #1 : Converting PDF to images 
'''

# Store all the pages of the PDF in a variable
pages = convert_from_path(PDF_file, 500)

# Counter to store images of each page of PDF to image
image_counter = 1

for page in pages:

    filename = "/home/naresh/Tesseract/PDF_Images/Hindi/" + "law_commisionofindia" + str(
        image_counter) + ".jpg"

    # Save the image of the page in system
    page.save(filename, 'JPEG')

    # Increment the counter to update filename
    image_counter = image_counter + 1
Example #29
    if r > 120 and r < 254 and g > 120 and g < 254 and b > 120 and b < 254:
        return True
    else:
        return False


# Handling of images for removing the watermark
def handle(imgs):
    for i in range(imgs.shape[0]):
        for j in range(imgs.shape[1]):
            if select_pixel(imgs[i][j][0], imgs[i][j][1], imgs[i][j][2]):
                imgs[i][j][0] = imgs[i][j][1] = imgs[i][j][2] = 255
    return imgs


images = convert_from_path(pdfFile)

try:
    os.mkdir(dirname + '\img')
except FileExistsError:
    print('Folder exist')
index = 0
for img in images:
    index += 1
    img = np.array(img)
    print(img.shape)
    img = handle(img)
    io.imsave(dirname + '\img\img' + str(index) + '.jpg', img)
    print(index)

# Merging images to a single PDF
Example #30
from pdf2image import convert_from_path
import sys, os
os.makedirs("pyout", exist_ok=True)
convert_from_path(sys.argv[1], output_folder="pyout", fmt="jpeg")
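The one-liner above lets pdf2image pick a generated prefix for the page files; convert_from_path also accepts an output_file prefix (as used in other examples on this page). A sketch with a fixed prefix:

import os
import sys

from pdf2image import convert_from_path

os.makedirs("pyout", exist_ok=True)
# same conversion, but every generated JPEG is named with the "page" prefix
convert_from_path(sys.argv[1], output_folder="pyout", fmt="jpeg", output_file="page")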
Example #31
def resume_generate_action(request):
    context = {}
    print("Entering generate_action")

    if not os.path.isdir(PDF_DIR):
        os.mkdir(PDF_DIR)

    if not os.path.isdir(JPG_DIR):
        os.mkdir(JPG_DIR)

    # create html file
    html_filename = request.user.username + ".html"
    create_html(request, html_filename)

    # html to pdf
    pdf_filename = PDF_DIR + request.user.username + ".pdf"
    # path_wkthmltopdf = r'C:\Users\dell\Anaconda3\Lib\site-packages\wkhtmltopdf\bin\wkhtmltopdf.exe'
    # config = pdfkit.configuration(wkhtmltopdf=path_wkthmltopdf)
    # pdfkit.from_file(html_filename, pdf_filename, configuration=config)
    pdfkit.from_file(html_filename, pdf_filename)

    # pdf to image
    jpg_filename = JPG_DIR + request.user.username + ".jpg"
    pages = convert_from_path(pdf_filename, 500)
    page = pages[0]
    page.save(jpg_filename, 'JPEG')

    position_list = []
    for position_text in request.POST.getlist('positions[]'):
        position = Position.objects.get(text=position_text)
        position_list.append(position)

    location_list = []
    for location_text in request.POST.getlist('locations[]'):
        location = Location.objects.get(text=location_text)
        location_list.append(location)

    # Check whether bio already exists
    employee = Employee.objects.get(user=request.user)
    bio_list = Employee.objects.get(user=request.user).bio_list
    new_bio = None
    for bio in bio_list.all():
        if bio.education_level != request.POST['education_level']:
            print("education_level")
            continue
        if bio.job_type != request.POST['job_type']:
            print("job_type")
            continue
        if list(bio.positions.all()) != position_list:
            print(bio.positions.all())
            print(position_list)
            print("positions")
            continue
        if list(bio.locations.all()) != location_list:
            print("locations")
            continue
        new_bio = bio
        break

    print(request.POST)
    if new_bio == None:
        print("Generating new bio")
        # If bio does not exist, create a new bio
        new_bio = Bio(user=request.user,
                      education_level=request.POST['education_level'],
                      job_type=request.POST['job_type'])
        new_bio.save()
        for position_text in request.POST.getlist('positions[]'):
            position = Position.objects.get(text=position_text)
            new_bio.positions.add(position)

        for location_text in request.POST.getlist('locations[]'):
            location = Location.objects.get(text=location_text)
            new_bio.locations.add(location)
        new_bio.save()
    else:
        print("bio already exist")
        # If bio already exists, set the previous master resume as slave
        for resume in new_bio.resume_list.all():
            if resume.master == True:
                resume.master = False
                resume.save()
        new_bio.save()

    # Construct new resume
    # save pdf into resume
    new_resume = Resume(user=request.user, master=True)
    f = open(pdf_filename, 'rb')
    pdf_file = File(f)
    new_resume.file.save(pdf_filename, pdf_file)
    f.close()

    # save image into resume
    f = open(jpg_filename, 'rb')
    jpg_file = File(f)
    new_resume.picture.save(jpg_filename, jpg_file)
    f.close()

    # delete local files
    os.remove(html_filename)
    os.remove(pdf_filename)
    os.remove(jpg_filename)

    new_resume.save()

    new_bio.resume_list.add(new_resume)
    new_bio.save()

    if new_bio not in employee.bio_list.all():
        employee.bio_list.add(new_bio)
        employee.save()

    length = len(employee.bio_list.all())

    context = {
        'user': request.user,
        'bio_list': employee.bio_list.all(),
        'len': length
    }

    return render(request, 'eyee_management.html', context)
Example #32
File: eval.py  Project: yasudakn/yolact
def evalvideo(net:Yolact, path:str):
    # If the path is a digit, parse it as a webcam index
    is_webcam = path.isdigit()
    
    if is_webcam:
        vid = cv2.VideoCapture(int(path))
    else:
        vid = cv2.VideoCapture(path)
    
    if not vid.isOpened():
        print('Could not open video "%s"' % path)
        exit(-1)
    
    vid.set(cv2.CAP_PROP_FRAME_HEIGHT, args.video_res[1])
    vid.set(cv2.CAP_PROP_FRAME_WIDTH, args.video_res[0])
    vid.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc('H', '2', '6', '4'))
    W = vid.get(cv2.CAP_PROP_FRAME_WIDTH)
    H = vid.get(cv2.CAP_PROP_FRAME_HEIGHT)
    print('capture video size: w{} h{}'.format(W, H))

    net = CustomDataParallel(net).cuda()
    transform = torch.nn.DataParallel(FastBaseTransform()).cuda()
    frame_times = MovingAverage(100)
    fps = 0
    # Target seconds per frame, taken from the source video's reported FPS
    frame_time_target = 1 / vid.get(cv2.CAP_PROP_FPS)
    running = True
    slide_num = 0
    bg_imgs = []
    mask_alpha = args.mask_alpha
    bgvid = None

    if args.mask_bg_path is not None:
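        # Background source: a .pdf becomes one slide per page, an .mp4 is read
        # frame-by-frame at display time, and anything else is treated as a single still image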
        if Path(args.mask_bg_path).suffix == '.pdf':
            pdfimages = pdf2image.convert_from_path(args.mask_bg_path)
            for pdfimage in pdfimages:
                cvimage = np.asarray(pdfimage)
                cvimage = cv2.cvtColor(cvimage, cv2.COLOR_RGB2BGR)
                bg_img = cv2.resize(cvimage, (int(W), int(H)))
                bg_img_gpu = torch.from_numpy(
                    bg_img / 255.0
                    ).cuda().float()
                bg_imgs.append(bg_img_gpu)
        elif Path(args.mask_bg_path).suffix == '.mp4':
            bg_imgs = []
            bgvid = cv2.VideoCapture(args.mask_bg_path)
            BG_W = bgvid.get(cv2.CAP_PROP_FRAME_WIDTH)
            BG_H = bgvid.get(cv2.CAP_PROP_FRAME_HEIGHT)
            print('background capture video size: w{} h{}'.format(BG_W, BG_H))
        else:
            bg_img = cv2.resize(cv2.imread(args.mask_bg_path), (int(W), int(H)))
            bg_img_gpu = torch.from_numpy(
                bg_img / 255.0
                ).cuda().float()
            bg_imgs.append(bg_img_gpu)
            

    def cleanup_and_exit():
        print()
        pool.terminate()
        vid.release()
        cv2.destroyAllWindows()
        exit()

    def get_next_frame(vid):
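        # vid.read() returns (ret, frame); once the stream ends, frame is None,
        # which the downstream pipeline does not explicitly guard against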
        return [vid.read()[1] for _ in range(args.video_multiframe)]

    def rescale(frame):
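        # Downscale the frame by args.rescale and paste it into the bottom-left corner
        # of a black canvas of the original size, so the output dimensions stay constant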
        if args.rescale < 1.0:
            h, w = frame.shape[:2]
            new_img = np.zeros((h, w, 3), np.uint8)
            rescale_frame = cv2.resize(frame, dsize=(int(w * args.rescale), int(h * args.rescale)))
            new_img[h - int(h * args.rescale):h, 0:int(w * args.rescale)] = rescale_frame
            return new_img
        else:
            return frame

    def transform_frame(frames):
        with torch.no_grad():
            frames = [
                torch.from_numpy( rescale(frame) ).cuda().float()
                for frame in frames]
            return frames, transform(torch.stack(frames, 0))

    def eval_network(inp):
        with torch.no_grad():
            frames, imgs = inp
            return frames, net(imgs)

    def prep_frame(inp):
        nonlocal slide_num, bg_imgs, mask_alpha, bgvid
        with torch.no_grad():
            frame, preds = inp
            return prep_display(preds, frame, None, None, undo_transform=False, class_color=True, 
                                mask_alpha=mask_alpha, bg_imgs=bg_imgs, slide_num=slide_num, bgvid=bgvid)

    frame_buffer = Queue()
    video_fps = 0

    # All this timing code keeps playback running at (roughly) the source frame rate
    def play_video():
        nonlocal frame_buffer, running, video_fps, is_webcam, slide_num

        video_frame_times = MovingAverage(100)
        frame_time_stabilizer = frame_time_target
        last_time = None
        stabilizer_step = 0.0005

        while running:
            frame_time_start = time.time()

            if not frame_buffer.empty():
                next_time = time.time()
                if last_time is not None:
                    video_frame_times.add(next_time - last_time)
                    video_fps = 1 / video_frame_times.get_avg()
                cv2.imshow(path, frame_buffer.get())
                last_time = next_time

            key_press = cv2.waitKey(1) & 0xff
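            # Keyboard controls: Esc quits; 'n'/'b' step forward/back through the background slides in bg_imgs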
            if key_press == 27: # Press Escape to close
                running = False
            elif key_press == ord('n'):
                if slide_num < len(bg_imgs) - 1:
                    slide_num = slide_num + 1
            elif key_press == ord('b'):
                if 0 < slide_num:
                    slide_num = slide_num - 1

            buffer_size = frame_buffer.qsize()
            if buffer_size < args.video_multiframe:
                frame_time_stabilizer += stabilizer_step
            elif buffer_size > args.video_multiframe:
                frame_time_stabilizer -= stabilizer_step
                if frame_time_stabilizer < 0:
                    frame_time_stabilizer = 0

            new_target = frame_time_stabilizer if is_webcam else max(frame_time_stabilizer, frame_time_target)

            next_frame_target = max(2 * new_target - video_frame_times.get_avg(), 0)
            target_time = frame_time_start + next_frame_target - 0.001 # Let's just subtract a millisecond to be safe
            # This gives more accurate timing than if sleeping the whole amount at once
            while time.time() < target_time:
                time.sleep(0.001)


    extract_frame = lambda x, i: (x[0][i] if x[1][i] is None else x[0][i].to(x[1][i]['box'].device), [x[1][i]])
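    # extract_frame (just above) pulls frame i out of a batched (frames, preds) pair, moving
    # the frame onto its prediction's device and wrapping that prediction in a one-element list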

    # Prime the network on the first frame because I do some thread unsafe things otherwise
    print('Initializing model... ', end='')
    eval_network(transform_frame(get_next_frame(vid)))
    print('Done.')

    # For each frame the sequence of functions it needs to go through to be processed (in reversed order)
    sequence = [prep_frame, eval_network, transform_frame]
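    # Size the pool for one worker per pipeline stage, one per in-flight frame,
    # plus two extra threads (play_video and the asynchronous frame loader)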
    pool = ThreadPool(processes=len(sequence) + args.video_multiframe + 2)
    pool.apply_async(play_video)

    active_frames = []

    print()
    while vid.isOpened() and running:
        start_time = time.time()

        # Start loading the next frames from the disk
        next_frames = pool.apply_async(get_next_frame, args=(vid,))
        
        # For each frame in our active processing queue, dispatch a job
        # for that frame using the current function in the sequence
        for frame in active_frames:
            frame['value'] = pool.apply_async(sequence[frame['idx']], args=(frame['value'],))
        
        # For each frame whose job was the last in the sequence (i.e. for all final outputs)
        for frame in active_frames:
            if frame['idx'] == 0:
                frame_buffer.put(frame['value'].get())

        # Remove the finished frames from the processing queue
        active_frames = [x for x in active_frames if x['idx'] > 0]

        # Finish evaluating every frame in the processing queue and advance its position in the sequence
        for frame in list(reversed(active_frames)):
            frame['value'] = frame['value'].get()
            frame['idx'] -= 1

            if frame['idx'] == 0:
                # Split this up into individual threads for prep_frame since it doesn't support batched input
                active_frames += [{'value': extract_frame(frame['value'], i), 'idx': 0} for i in range(1, args.video_multiframe)]
                frame['value'] = extract_frame(frame['value'], 0)

        
        # Finish loading in the next frames and add them to the processing queue
        active_frames.append({'value': next_frames.get(), 'idx': len(sequence)-1})
        
        # Compute FPS
        frame_times.add(time.time() - start_time)
        fps = args.video_multiframe / frame_times.get_avg()

        print('\rProcessing FPS: %.2f | Video Playback FPS: %.2f | Frames in Buffer: %d    ' % (fps, video_fps, frame_buffer.qsize()), end='')
    
    cleanup_and_exit()
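
A hedged usage sketch for evalvideo above, following the repository's usual pattern of constructing a Yolact model and loading trained weights before evaluation; the weights path and the surrounding args/config setup are assumed and illustrative.

net = Yolact()
net.load_weights('weights/yolact_base_54_800000.pth')  # illustrative weights path
net.eval()

# A digit string such as '0' would be opened as a webcam index instead
evalvideo(net, 'talk.mp4')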