def check_blank(orig_link_path):
    """Rasterize a page to a temporary screenshot and return its OCR value.

    The page at ``orig_link_path`` is rendered to ``orig.png`` by running
    ``PHANTOMJS`` with the ``RASTERIZE`` script through ``run_command``; every
    line yielded by the command is echoed. The screenshot is then handed to
    ``blank.ocr`` and deleted again, whether or not OCR succeeded.

    :param orig_link_path: link of the page to rasterize.
    :returns: whatever ``blank.ocr`` returns for the screenshot, or ``None``
        when any step raised (the error is printed, not re-raised).
    """
    # NOTE(review): this function is defined twice in this file with the same
    # body; the later definition replaces the earlier one at import time.
    print("\noriginal link:  %s" % (orig_link_path,))

    orig_img_name = 'orig.png'
    try:
        try:
            for line in run_command(
                    [PHANTOMJS, RASTERIZE, orig_link_path, orig_img_name]):
                print(line)

            start = timeit.default_timer()
            # Raises OSError when the screenshot was not written, so the
            # failure is reported before the OCR tool is invoked.
            os.path.getsize(orig_img_name)
            ocr = blank.ocr(orig_img_name)
        finally:
            # Always drop the temporary screenshot, even if OCR failed, so a
            # stale orig.png never survives into the next call.
            if os.path.exists(orig_img_name):
                os.remove(orig_img_name)

        execution_time = timeit.default_timer() - start
        print("execution time:  %s" % (execution_time,))
        return ocr
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print("Error: %s  Please check if OCR tool is running!" % (e,))
        return None
def compare_screenshots_sift_ext2(orig_link_path, wayback_link_path, ocr_flag, psnr_flag):
    """Screenshot a page and its Wayback copy, compare them and store the result.

    Both links are rendered with ``create_file``: the original page to a
    per-call file under ``qa/orig/`` (kept on disk and recorded as a
    ``http://127.0.0.1:8000/`` URL) and the Wayback page to the temporary
    ``wayback.png``. The two screenshots are compared with
    ``compare_screenshots.compare_ext`` using the ``'sift'`` method; OCR and
    PSNR analysis are run only when their flags are truthy according to
    ``str2bool``. The outcome is passed to ``store_in_file`` and
    ``insert_mongo``.

    :param orig_link_path: link of the original page.
    :param wayback_link_path: link of the archived page; its second-to-last
        ``/``-separated component is recorded as the Wayback timestamp.
    :param ocr_flag: flag (interpreted by ``str2bool``) enabling OCR analysis.
    :param psnr_flag: flag (interpreted by ``str2bool``) enabling PSNR analysis.
    :returns: ``(mc, fc1, ocr, psnr_similarity)`` on success; ``None`` when the
        original screenshot is missing or any step raised (the problem is
        printed, not re-raised).
    """
    # NOTE(review): this function is defined twice in this file with the same
    # body; the later definition replaces the earlier one at import time.
    import datetime

    print("\noriginal link:  %s , wayback link:  %s , ocr flag:  %s , psnr flag:  %s"
          % (orig_link_path, wayback_link_path, ocr_flag, psnr_flag))

    # The timer reading doubles as a per-call id for the kept screenshot.
    current_file_id = timeit.default_timer()
    orig_img_name = 'qa/orig/' + str(current_file_id) + '.png'
    wayback_img_name = 'wayback.png'

    try:
        try:
            # These calls used to be written as
            #     Timer(FILE_CREATION_TIME, create_file(...)).start()
            # which evaluates create_file(...) immediately and hands its
            # *result* to the Timer as the callback. A non-callable result
            # (e.g. None) makes the timer thread fail once the interval
            # elapses. The files were therefore always created synchronously;
            # do that explicitly.
            # NOTE(review): no timeout is enforced here (none was effectively
            # enforced before either); if one is needed it belongs inside
            # create_file.
            create_file(orig_link_path, orig_img_name)
            create_file(wayback_link_path, wayback_img_name)

            if not os.path.exists(orig_img_name):
                print("Warning: original file could not be retrieved from internet!")
                return None

            start = timeit.default_timer()

            # Load both screenshots (flag 0) and record their file sizes.
            img1 = cv2.imread(orig_img_name, 0)
            img1_size = os.path.getsize(orig_img_name)
            img2 = cv2.imread(wayback_img_name, 0)
            img2_size = os.path.getsize(wayback_img_name)

            print("extract features ...")
            # ts1, ts2 and ts_psnr are not parameters of this function;
            # presumably module-level thresholds -- TODO confirm.
            fc1, fc2, mc, msg = compare_screenshots.compare_ext(
                img1, img2, 'sift', ts1, ts2)

            ocr = 0
            if str2bool(ocr_flag):
                print("perform OCR analysis ...")
                ocr = blank.ocr(orig_img_name)

            # Values recorded when PSNR analysis is skipped.
            psnr_similarity = "None"
            psnr_threshold = ts_psnr
            psnr_msg = ""
            if str2bool(psnr_flag):
                print("perform PSNR analysis ...")
                psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(
                    orig_img_name, wayback_img_name)

            execution_time = timeit.default_timer() - start
            print("execution time:  %s" % (execution_time,))

            wayback_timestamp = wayback_link_path.split("/")[-2]
            print("wayback timestamp:  %s" % (wayback_timestamp,))

            current_time = datetime.datetime.utcnow()
            # The kept original screenshot is recorded as a local HTTP URL.
            orig_img_url = 'http://127.0.0.1:8000/' + orig_img_name

            # Same record goes to both sinks.
            record = (current_time, execution_time, orig_link_path,
                      wayback_link_path, wayback_timestamp, fc1, fc2, mc, msg,
                      ocr, img1_size, img2_size, psnr_similarity,
                      psnr_threshold, psnr_msg, orig_img_url)
            store_in_file(*record)
            insert_mongo(*record)
            return mc, fc1, ocr, psnr_similarity
        finally:
            # wayback.png is a shared temporary name: remove it on every path
            # (success, missing original, error) so a later call can never
            # compare against a stale screenshot.
            if os.path.exists(wayback_img_name):
                os.remove(wayback_img_name)
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print("Error: %s  Please check if Wayback machine and MongoDB are running!" % (e,))
        return None
def check_blank(orig_link_path):
    """Rasterize a page to a temporary screenshot and return its OCR value.

    The page at ``orig_link_path`` is rendered to ``orig.png`` by running
    ``PHANTOMJS`` with the ``RASTERIZE`` script through ``run_command``; every
    line yielded by the command is echoed. The screenshot is then handed to
    ``blank.ocr`` and deleted again, whether or not OCR succeeded.

    :param orig_link_path: link of the page to rasterize.
    :returns: whatever ``blank.ocr`` returns for the screenshot, or ``None``
        when any step raised (the error is printed, not re-raised).
    """
    # NOTE(review): this function is defined twice in this file with the same
    # body; the later definition replaces the earlier one at import time.
    print("\noriginal link:  %s" % (orig_link_path,))

    orig_img_name = 'orig.png'
    try:
        try:
            for line in run_command(
                    [PHANTOMJS, RASTERIZE, orig_link_path, orig_img_name]):
                print(line)

            start = timeit.default_timer()
            # Raises OSError when the screenshot was not written, so the
            # failure is reported before the OCR tool is invoked.
            os.path.getsize(orig_img_name)
            ocr = blank.ocr(orig_img_name)
        finally:
            # Always drop the temporary screenshot, even if OCR failed, so a
            # stale orig.png never survives into the next call.
            if os.path.exists(orig_img_name):
                os.remove(orig_img_name)

        execution_time = timeit.default_timer() - start
        print("execution time:  %s" % (execution_time,))
        return ocr
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print("Error: %s  Please check if OCR tool is running!" % (e,))
        return None
def compare_images_by_path_and_link_ext(orig_link_path, wayback_link_name, ocr_flag, ts1=60, ts2=30, psnr_flag=False, psnr_th=None):
    """Compare an existing image file with a fresh Wayback screenshot.

    ``wayback_link_name`` is URL-unquoted and rendered to ``wayback.png`` with
    ``convert_url_to_file``. That screenshot is compared with the image at
    ``orig_link_path`` through ``compare_screenshots.compare_ext`` using the
    ``'sift'`` method and the ``ts1`` / ``ts2`` thresholds. OCR and PSNR
    analysis run only when their flags are truthy according to ``str2bool``.
    The outcome is passed to ``store_in_file`` and ``insert_mongo``.
    ``wayback.png`` is left on disk (its removal was already disabled).

    :param orig_link_path: path of the original image file.
    :param wayback_link_name: URL-quoted Wayback link; after unquoting, its
        second-to-last ``/``-separated component is recorded as the timestamp.
    :param ocr_flag: flag (interpreted by ``str2bool``) enabling OCR analysis.
    :param ts1: threshold forwarded to ``compare_ext`` (printed as "high ts1").
    :param ts2: threshold forwarded to ``compare_ext`` (printed as "low ts2").
    :param psnr_flag: flag (interpreted by ``str2bool``) enabling PSNR analysis.
    :param psnr_th: threshold recorded when PSNR analysis is skipped.
    :returns: the message produced by ``compare_ext``, or ``None`` when any
        step inside the guarded section raised (the error is printed).
    """
    # NOTE(review): this function is defined twice in this file with the same
    # body; the later definition replaces the earlier one at import time.
    import datetime

    print("\noriginal link:  %s , wayback link:  %s , ocr flag:  %s , high ts1:  %s , low ts2:  %s"
          % (orig_link_path, wayback_link_name, ocr_flag, ts1, ts2))

    wayback_link = urllib.unquote_plus(wayback_link_name)
    wayback_img_name = "wayback.png"
    # Rendering happens outside the guarded section, so its errors propagate.
    convert_url_to_file(wayback_link, wayback_img_name)
    print("wayback file:  %s" % (wayback_img_name,))

    try:
        start = timeit.default_timer()

        # Load both images (flag 0) and record their file sizes.
        img1 = cv2.imread(orig_link_path, 0)
        img1_size = os.path.getsize(orig_link_path)
        img2 = cv2.imread(wayback_img_name, 0)
        img2_size = os.path.getsize(wayback_img_name)

        print("extract features ...")
        fc1, fc2, mc, msg = compare_screenshots.compare_ext(
            img1, img2, 'sift', ts1, ts2)

        ocr = 0
        if str2bool(ocr_flag):
            print("perform OCR analysis ...")
            ocr = blank.ocr(orig_link_path)

        psnr_similarity = "DIFFERENT"
        # psnr_threshold must be bound even when PSNR analysis is skipped:
        # it used to be assigned only inside the branch below, so with the
        # default psnr_flag=False the store calls raised NameError and
        # nothing was ever recorded.
        # NOTE(review): psnr_th is not forwarded to compare_psnr (it never
        # was); confirm whether compare_psnr should receive it.
        psnr_threshold = psnr_th
        psnr_msg = ""
        if str2bool(psnr_flag):
            print("perform PSNR analysis ...")
            psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(
                orig_link_path, wayback_img_name)

        execution_time = timeit.default_timer() - start
        print("execution time:  %s" % (execution_time,))

        current_time = datetime.datetime.utcnow()
        wayback_timestamp = wayback_link.split("/")[-2]
        print("wayback timestamp:  %s" % (wayback_timestamp,))

        # Same record goes to both sinks.
        record = (current_time, execution_time, orig_link_path, wayback_link,
                  wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size,
                  img2_size, psnr_similarity, psnr_threshold, psnr_msg,
                  orig_link_path)
        store_in_file(*record)
        insert_mongo(*record)

        print("result message:  %s" % (msg,))
        return msg
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print("Error: %s  Please check if Wayback machine and MongoDB are running!" % (e,))
        return None
def compare_images_by_path_and_link_ext(orig_link_path, wayback_link_name, ocr_flag, ts1=60, ts2=30, psnr_flag=False, psnr_th=None):
    """Compare an existing image file with a fresh Wayback screenshot.

    ``wayback_link_name`` is URL-unquoted and rendered to ``wayback.png`` with
    ``convert_url_to_file``. That screenshot is compared with the image at
    ``orig_link_path`` through ``compare_screenshots.compare_ext`` using the
    ``'sift'`` method and the ``ts1`` / ``ts2`` thresholds. OCR and PSNR
    analysis run only when their flags are truthy according to ``str2bool``.
    The outcome is passed to ``store_in_file`` and ``insert_mongo``.
    ``wayback.png`` is left on disk (its removal was already disabled).

    :param orig_link_path: path of the original image file.
    :param wayback_link_name: URL-quoted Wayback link; after unquoting, its
        second-to-last ``/``-separated component is recorded as the timestamp.
    :param ocr_flag: flag (interpreted by ``str2bool``) enabling OCR analysis.
    :param ts1: threshold forwarded to ``compare_ext`` (printed as "high ts1").
    :param ts2: threshold forwarded to ``compare_ext`` (printed as "low ts2").
    :param psnr_flag: flag (interpreted by ``str2bool``) enabling PSNR analysis.
    :param psnr_th: threshold recorded when PSNR analysis is skipped.
    :returns: the message produced by ``compare_ext``, or ``None`` when any
        step inside the guarded section raised (the error is printed).
    """
    # NOTE(review): this function is defined twice in this file with the same
    # body; the later definition replaces the earlier one at import time.
    import datetime

    print("\noriginal link:  %s , wayback link:  %s , ocr flag:  %s , high ts1:  %s , low ts2:  %s"
          % (orig_link_path, wayback_link_name, ocr_flag, ts1, ts2))

    wayback_link = urllib.unquote_plus(wayback_link_name)
    wayback_img_name = "wayback.png"
    # Rendering happens outside the guarded section, so its errors propagate.
    convert_url_to_file(wayback_link, wayback_img_name)
    print("wayback file:  %s" % (wayback_img_name,))

    try:
        start = timeit.default_timer()

        # Load both images (flag 0) and record their file sizes.
        img1 = cv2.imread(orig_link_path, 0)
        img1_size = os.path.getsize(orig_link_path)
        img2 = cv2.imread(wayback_img_name, 0)
        img2_size = os.path.getsize(wayback_img_name)

        print("extract features ...")
        fc1, fc2, mc, msg = compare_screenshots.compare_ext(
            img1, img2, 'sift', ts1, ts2)

        ocr = 0
        if str2bool(ocr_flag):
            print("perform OCR analysis ...")
            ocr = blank.ocr(orig_link_path)

        psnr_similarity = "DIFFERENT"
        # psnr_threshold must be bound even when PSNR analysis is skipped:
        # it used to be assigned only inside the branch below, so with the
        # default psnr_flag=False the store calls raised NameError and
        # nothing was ever recorded.
        # NOTE(review): psnr_th is not forwarded to compare_psnr (it never
        # was); confirm whether compare_psnr should receive it.
        psnr_threshold = psnr_th
        psnr_msg = ""
        if str2bool(psnr_flag):
            print("perform PSNR analysis ...")
            psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(
                orig_link_path, wayback_img_name)

        execution_time = timeit.default_timer() - start
        print("execution time:  %s" % (execution_time,))

        current_time = datetime.datetime.utcnow()
        wayback_timestamp = wayback_link.split("/")[-2]
        print("wayback timestamp:  %s" % (wayback_timestamp,))

        # Same record goes to both sinks.
        record = (current_time, execution_time, orig_link_path, wayback_link,
                  wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size,
                  img2_size, psnr_similarity, psnr_threshold, psnr_msg,
                  orig_link_path)
        store_in_file(*record)
        insert_mongo(*record)

        print("result message:  %s" % (msg,))
        return msg
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print("Error: %s  Please check if Wayback machine and MongoDB are running!" % (e,))
        return None
def compare_screenshots_sift_ext2(orig_link_path, wayback_link_path, ocr_flag, psnr_flag):
    """Screenshot a page and its Wayback copy, compare them and store the result.

    Both links are rendered with ``create_file``: the original page to a
    per-call file under ``qa/orig/`` (kept on disk and recorded as a
    ``http://127.0.0.1:8000/`` URL) and the Wayback page to the temporary
    ``wayback.png``. The two screenshots are compared with
    ``compare_screenshots.compare_ext`` using the ``'sift'`` method; OCR and
    PSNR analysis are run only when their flags are truthy according to
    ``str2bool``. The outcome is passed to ``store_in_file`` and
    ``insert_mongo``.

    :param orig_link_path: link of the original page.
    :param wayback_link_path: link of the archived page; its second-to-last
        ``/``-separated component is recorded as the Wayback timestamp.
    :param ocr_flag: flag (interpreted by ``str2bool``) enabling OCR analysis.
    :param psnr_flag: flag (interpreted by ``str2bool``) enabling PSNR analysis.
    :returns: ``(mc, fc1, ocr, psnr_similarity)`` on success; ``None`` when the
        original screenshot is missing or any step raised (the problem is
        printed, not re-raised).
    """
    # NOTE(review): this function is defined twice in this file with the same
    # body; the later definition replaces the earlier one at import time.
    import datetime

    print("\noriginal link:  %s , wayback link:  %s , ocr flag:  %s , psnr flag:  %s"
          % (orig_link_path, wayback_link_path, ocr_flag, psnr_flag))

    # The timer reading doubles as a per-call id for the kept screenshot.
    current_file_id = timeit.default_timer()
    orig_img_name = 'qa/orig/' + str(current_file_id) + '.png'
    wayback_img_name = 'wayback.png'

    try:
        try:
            # These calls used to be written as
            #     Timer(FILE_CREATION_TIME, create_file(...)).start()
            # which evaluates create_file(...) immediately and hands its
            # *result* to the Timer as the callback. A non-callable result
            # (e.g. None) makes the timer thread fail once the interval
            # elapses. The files were therefore always created synchronously;
            # do that explicitly.
            # NOTE(review): no timeout is enforced here (none was effectively
            # enforced before either); if one is needed it belongs inside
            # create_file.
            create_file(orig_link_path, orig_img_name)
            create_file(wayback_link_path, wayback_img_name)

            if not os.path.exists(orig_img_name):
                print("Warning: original file could not be retrieved from internet!")
                return None

            start = timeit.default_timer()

            # Load both screenshots (flag 0) and record their file sizes.
            img1 = cv2.imread(orig_img_name, 0)
            img1_size = os.path.getsize(orig_img_name)
            img2 = cv2.imread(wayback_img_name, 0)
            img2_size = os.path.getsize(wayback_img_name)

            print("extract features ...")
            # ts1, ts2 and ts_psnr are not parameters of this function;
            # presumably module-level thresholds -- TODO confirm.
            fc1, fc2, mc, msg = compare_screenshots.compare_ext(
                img1, img2, 'sift', ts1, ts2)

            ocr = 0
            if str2bool(ocr_flag):
                print("perform OCR analysis ...")
                ocr = blank.ocr(orig_img_name)

            # Values recorded when PSNR analysis is skipped.
            psnr_similarity = "None"
            psnr_threshold = ts_psnr
            psnr_msg = ""
            if str2bool(psnr_flag):
                print("perform PSNR analysis ...")
                psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(
                    orig_img_name, wayback_img_name)

            execution_time = timeit.default_timer() - start
            print("execution time:  %s" % (execution_time,))

            wayback_timestamp = wayback_link_path.split("/")[-2]
            print("wayback timestamp:  %s" % (wayback_timestamp,))

            current_time = datetime.datetime.utcnow()
            # The kept original screenshot is recorded as a local HTTP URL.
            orig_img_url = 'http://127.0.0.1:8000/' + orig_img_name

            # Same record goes to both sinks.
            record = (current_time, execution_time, orig_link_path,
                      wayback_link_path, wayback_timestamp, fc1, fc2, mc, msg,
                      ocr, img1_size, img2_size, psnr_similarity,
                      psnr_threshold, psnr_msg, orig_img_url)
            store_in_file(*record)
            insert_mongo(*record)
            return mc, fc1, ocr, psnr_similarity
        finally:
            # wayback.png is a shared temporary name: remove it on every path
            # (success, missing original, error) so a later call can never
            # compare against a stale screenshot.
            if os.path.exists(wayback_img_name):
                os.remove(wayback_img_name)
    except Exception as e:
        # Best-effort boundary: report the problem and signal failure with None.
        print("Error: %s  Please check if Wayback machine and MongoDB are running!" % (e,))
        return None