## Download two PDF files, extract their text, and report whether the
## extracted texts are identical.
from ReadPdfFromUrl import PdfFromUrl
from pdftotext import PdfToText

## define an instance of PdfFromUrl object
pdffromUrl = PdfFromUrl()
## define an instance of PdfToText object
p2t = PdfToText()

## open first pdf file and save it as file1.pdf
pdffromUrl.open_pdf_from_url(
    "http://www.seleniummaster.com/sitecontent/images/Selenium_Master_Test_Case_Base_Template.pdf",
    "file1.pdf")
## open second pdf file and save it as file2.pdf
pdffromUrl.open_pdf_from_url(
    "http://www.seleniummaster.com/sitecontent/images/Selenium_Master_TestCase_Modified_Template.pdf",
    "file2.pdf")

## get texts of file1.pdf
textOfFile1 = p2t.convert_pdf_to_txt("file1.pdf")
## get texts of file2.pdf
textOfFile2 = p2t.convert_pdf_to_txt("file2.pdf")

## get length of file1.pdf texts
lengthOfTextFile1 = len(textOfFile1)
## get length of file2.pdf texts
lengthOfTextFile2 = len(textOfFile2)

## print text length information
## (a single parenthesized argument prints identically on Python 2 and 3)
print("Length of text of File1 %s" % lengthOfTextFile1)
print("Length of text of File2 %s" % lengthOfTextFile2)

## compare the extracted texts themselves: two different texts can have the
## same length, so equal lengths alone do not prove the texts are the same
if textOfFile1 == textOfFile2:
    print("Two pdf files' texts are the same")
else:
    print("Two pdf files' texts are different")
from ReadPdfFromUrl import PdfFromUrl from pdftotext import PdfToText import difflib import os #from difflib_data import * #define an instance of PdfFrom Url object pdffromUrl=PdfFromUrl() ##define an instance of PdfToText object p2t=PdfToText() ##open first pdf file and save it as file1.pdf pdffromUrl.open_pdf_from_url("http://www.seleniummaster.com/sitecontent/images/Selenium_Master_Test_Case_Base_Template.pdf","file1.pdf") ##open second pdf file and save it as file2.pdf pdffromUrl.open_pdf_from_url("http://www.seleniummaster.com/sitecontent/images/Selenium_Master_TestCase_Modified_Template.pdf","file2.pdf") ##get texts of file1.pdf textOfFile1=p2t.convert_pdf_to_txt("file1.pdf") ##get texts of file2.pdf textOfFile2=p2t.convert_pdf_to_txt("file2.pdf") ##get length of file1.pdf texts lengthOfTextFile1=len(textOfFile1) ##get length of file2.pdf texts lengthOfTextFile2=len(textOfFile2) ##print text length information print "Length of text of File1",lengthOfTextFile1 print "Length of text of File2",lengthOfTextFile2 ##compare text length if(lengthOfTextFile1==lengthOfTextFile2): print "Two pdf files' texts are the same" else: print "Two pdf files' texts are different" d = difflib.HtmlDiff() with open("diff.html","w") as f:
import md5 import pdfx import time (lfile,rfile) = sys.argv[1:] timestr = time.strftime("%Y%m%d-%H%M%S") lpdf = pdfx.PDFx(lfile) lmetadata = lpdf.get_metadata() print lmetadata rpdf = pdfx.PDFx(rfile) rmetadata = rpdf.get_metadata() print rmetadata p2t=PdfToText() textOfFile1=p2t.convert_pdf_to_txt(lfile).decode('utf-8').encode('ascii','ignore') textOfFile2=p2t.convert_pdf_to_txt(rfile).decode('utf-8').encode('ascii','ignore') #print textOfFile1 #print textOfFile2 ##open first pdf file and save it as file1.pdf #pdffromUrl.open_pdf_from_url("http://www.seleniummaster.com/sitecontent/images/Selenium_Master_Test_Case_Base_Template.pdf","file1.pdf") ##open second pdf file and save it as file2.pdf #pdffromUrl.open_pdf_from_url("http://www.seleniummaster.com/sitecontent/images/Selenium_Master_TestCase_Modified_Template.pdf","file2.pdf") ##get texts of file1.pdf ##get length of file1.pdf texts #textOfFile1 = textOfFile1.decode('UTF-8','ignore') #textOfFile1 = textOfFile1.encode('ascii','ignore') #textOfFile2 = textOfFile2.decode('UTF-8','ignore')