def compare_screenshots_sift_ext2(orig_link_path, wayback_link_path, ocr_flag, psnr_flag):
    """Screenshot two URLs, compare the screenshots and record the result.

    Both links are rendered to PNG files through ``create_file``.  The two
    images are compared with ``compare_screenshots.compare_ext`` (method
    ``'sift'``) and, on request, analysed with ``blank.ocr`` and
    ``compare_psnr``.  The measurements are then passed to ``store_in_file``
    and ``insert_mongo``.

    Args:
        orig_link_path: URL of the original page.
        wayback_link_path: URL of the archived page.  Its second-to-last
            "/"-separated component is recorded as the Wayback timestamp.
        ocr_flag: converted with ``str2bool``; when true, OCR is run on the
            original screenshot.
        psnr_flag: converted with ``str2bool``; when true, the PSNR
            comparison is run.

    Returns:
        The tuple ``(mc, fc1, ocr, psnr_similarity)`` on success.  ``None``
        when the original screenshot file does not exist after creation or
        when any exception is raised (a message is printed in both cases).

    NOTE(review): ``ts1``, ``ts2`` and ``ts_psnr`` are not parameters of this
    function; they are expected to exist at module level -- confirm.
    NOTE(review): a second definition of this function appears later in the
    file; the definition executed last is the one callers get.
    """
    import datetime

    print("\noriginal link:  %s , wayback link:  %s , ocr flag:  %s , psnr flag:  %s"
          % (orig_link_path, wayback_link_path, ocr_flag, psnr_flag))

    # A timer reading is used as the file id of the original screenshot.
    current_file_id = timeit.default_timer()
    orig_img_name = 'qa/orig/' + str(current_file_id) + '.png'
    wayback_img_name = 'wayback.png'

    try:
        # The previous code wrote Timer(FILE_CREATION_TIME, create_file(...)).
        # That expression runs create_file() right away and hands its *return
        # value* to the Timer as the callback, so the screenshots were always
        # created synchronously and the timer thread later tried to call that
        # return value (a TypeError if it is None -- assumed; confirm against
        # create_file).  Calling create_file() directly keeps the behaviour the
        # rest of this function relies on (the file must exist immediately
        # below) without the broken timer threads.
        create_file(orig_link_path, orig_img_name)
        create_file(wayback_link_path, wayback_img_name)

        if not os.path.exists(orig_img_name):
            print("Warning: original file could not be retrieved from internet!")
            return None

        start = timeit.default_timer()

        # Load both screenshots (flag 0 is passed to cv2.imread) and note
        # their sizes on disk.
        img1 = cv2.imread(orig_img_name, 0)
        img1_size = os.path.getsize(orig_img_name)
        img2 = cv2.imread(wayback_img_name, 0)
        img2_size = os.path.getsize(wayback_img_name)

        print("extract features ...")
        fc1, fc2, mc, msg = compare_screenshots.compare_ext(
            img1, img2, 'sift', ts1, ts2)

        ocr = 0
        if str2bool(ocr_flag):
            print("perform OCR analysis ...")
            ocr = blank.ocr(orig_img_name)

        # Values recorded when the PSNR comparison is not requested.
        psnr_similarity = "None"
        psnr_threshold = ts_psnr
        psnr_msg = ""
        if str2bool(psnr_flag):
            print("perform PSNR analysis ...")
            psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(
                orig_img_name, wayback_img_name)

        # The Wayback screenshot is temporary; the original one is kept.
        os.remove(wayback_img_name)

        execution_time = timeit.default_timer() - start
        print("execution time:  %s" % (execution_time,))

        wayback_timestamp = wayback_link_path.split("/")[-2]
        print("wayback timestamp:  %s" % (wayback_timestamp,))

        current_time = datetime.datetime.utcnow()
        # The stored reference to the original screenshot is an HTTP URL on
        # 127.0.0.1:8000 built from the local file name.
        orig_img_url = 'http://127.0.0.1:8000/' + orig_img_name

        record = (current_time, execution_time, orig_link_path,
                  wayback_link_path, wayback_timestamp, fc1, fc2, mc, msg,
                  ocr, img1_size, img2_size, psnr_similarity, psnr_threshold,
                  psnr_msg, orig_img_url)
        store_in_file(*record)
        insert_mongo(*record)
        return mc, fc1, ocr, psnr_similarity
    except Exception as e:
        # Deliberately broad: any failure is reported and None is returned.
        print("Error: %s  Please check if Wayback machine and MongoDB are running!" % (e,))
def compare_images_by_path_and_link_ext(orig_link_path, wayback_link_name, ocr_flag, ts1=60, ts2=30, psnr_flag=False, psnr_th=None):
    """Compare a stored screenshot with a fresh screenshot of a Wayback URL.

    The URL-quoted Wayback link is unquoted and rendered to ``wayback.png``
    through ``convert_url_to_file``.  That image is compared with the image
    at ``orig_link_path`` via ``compare_screenshots.compare_ext`` (method
    ``'sift'``); OCR and PSNR analysis are run on request.  The measurements
    are passed to ``store_in_file`` and ``insert_mongo``.

    Args:
        orig_link_path: path of the original screenshot file.
        wayback_link_name: URL-quoted Wayback link.  After unquoting, its
            second-to-last "/"-separated component is recorded as the
            Wayback timestamp.
        ocr_flag: converted with ``str2bool``; when true, ``blank.ocr`` is
            run on the original screenshot.
        ts1: first threshold passed to ``compare_ext`` (printed as "high").
        ts2: second threshold passed to ``compare_ext`` (printed as "low").
        psnr_flag: converted with ``str2bool``; when true, ``compare_psnr``
            is run.
        psnr_th: PSNR threshold; recorded as-is when the PSNR comparison is
            not requested.

    Returns:
        The message produced by ``compare_ext`` on success, ``None`` when an
        exception is raised inside the comparison (a message is printed).

    NOTE(review): a second definition of this function appears later in the
    file; the definition executed last is the one callers get.
    """
    import datetime

    print("\noriginal link:  %s , wayback link:  %s , ocr flag:  %s , high ts1:  %s , low ts2:  %s"
          % (orig_link_path, wayback_link_name, ocr_flag, ts1, ts2))

    wayback_link = urllib.unquote_plus(wayback_link_name)

    # Render the Wayback page to an image file.
    wayback_img_name = "wayback.png"
    convert_url_to_file(wayback_link, wayback_img_name)
    print("wayback file:  %s" % (wayback_img_name,))

    try:
        start = timeit.default_timer()

        # Load both images (flag 0 is passed to cv2.imread) and note their
        # sizes on disk.
        img1 = cv2.imread(orig_link_path, 0)
        img1_size = os.path.getsize(orig_link_path)
        img2 = cv2.imread(wayback_img_name, 0)
        img2_size = os.path.getsize(wayback_img_name)

        print("extract features ...")
        fc1, fc2, mc, msg = compare_screenshots.compare_ext(
            img1, img2, 'sift', ts1, ts2)

        ocr = 0
        if str2bool(ocr_flag):
            print("perform OCR analysis ...")
            ocr = blank.ocr(orig_link_path)

        psnr_similarity = "DIFFERENT"
        # Bind the threshold before the optional PSNR step.  It used to be
        # assigned only inside the branch below, so with PSNR disabled (the
        # default) the store calls raised UnboundLocalError, which the broad
        # handler swallowed: nothing was stored and None was returned.
        psnr_threshold = psnr_th
        psnr_msg = ""
        if str2bool(psnr_flag):
            print("perform PSNR analysis ...")
            if psnr_th is None:
                # NOTE(review): the resolved threshold is not passed on to
                # compare_psnr; confirm whether it should be.
                psnr_th = th_psnr
            psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(
                orig_link_path, wayback_img_name)

        # The Wayback screenshot is left on disk; its removal was already
        # disabled in the previous version of this function.

        execution_time = timeit.default_timer() - start
        print("execution time:  %s" % (execution_time,))

        current_time = datetime.datetime.utcnow()
        wayback_timestamp = wayback_link.split("/")[-2]
        print("wayback timestamp:  %s" % (wayback_timestamp,))

        record = (current_time, execution_time, orig_link_path, wayback_link,
                  wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size,
                  img2_size, psnr_similarity, psnr_threshold, psnr_msg,
                  orig_link_path)
        store_in_file(*record)
        insert_mongo(*record)

        print("result message:  %s" % (msg,))
        return msg
    except Exception as e:
        # Deliberately broad: any failure is reported and None is returned.
        print("Error: %s  Please check if Wayback machine and MongoDB are running!" % (e,))
def handleUri(link_name, counter):
    """Render one archived URI, compare it with a stored screenshot, record it.

    ``link_name`` is URL-unquoted and every ".jpg" in it is removed.  The
    result is turned into a link on the local Wayback instance
    (``http://localhost:8080/``), which is rasterized to ``warc.png`` by
    running PhantomJS through ``run_command``.  That image is compared with
    the stored screenshot ``file_path + "/" + name`` using
    ``compare_screenshots.compare_ext`` (method ``'sift'``), OCR is computed
    with ``blank.ocr_ext`` and the measurements are passed to
    ``store_in_file`` and ``insert_mongo``.

    Args:
        link_name: URL-quoted name.  Its first ``TIMESTAMP_SIZE`` characters
            are recorded as the Wayback timestamp and the remainder (without
            ".jpg") as the original URL.
        counter: running number, used only in the progress message.

    Returns:
        None.  Any exception is caught and reported with a printed message.

    NOTE(review): ``file_path``, ``name``, ``ts1`` and ``ts2`` are free
    variables here; they must be supplied by the enclosing or module scope
    -- confirm.
    NOTE(review): a second definition of this function appears later in the
    file; the definition executed last is the one callers get.
    """
    import datetime

    link = urllib.unquote_plus(link_name)
    initial_uri = link.replace('.jpg', '')
    # Every "http" occurrence gets a "/" prefix, which separates the leading
    # timestamp from the URL in the Wayback link.
    warc_link = "http://localhost:8080/" + initial_uri.replace("http", "/http")
    print(" [%s] Wayback link %s" % (str(counter), warc_link))

    warc_img = 'warc.png'
    try:
        # Rasterize the archived page and echo the tool's output.
        for line in run_command([PHANTOMJS, RASTERIZE, warc_link, warc_img]):
            print(line)

        start = timeit.default_timer()
        try:
            # Flag 0 is passed to cv2.imread for both images.
            img2 = cv2.imread(warc_img, 0)
            img2_size = os.path.getsize(warc_img)
            screenshot_path = file_path + "/" + name
            print("original screenshot: " + screenshot_path)
            img1 = cv2.imread(screenshot_path, 0)
            img1_size = os.path.getsize(screenshot_path)

            # ts1: threshold for high similarity between screenshots, e.g. 60
            # ts2: threshold for small differences, e.g. 30
            # highest similarity is 100, minimal is 0
            fc1, fc2, mc, msg = compare_screenshots.compare_ext(
                img1, img2, 'sift', ts1, ts2)
        finally:
            # Remove the temporary image even when the comparison fails, so a
            # stale warc.png cannot be picked up by a later call whose
            # rasterization produced no file.
            if os.path.exists(warc_img):
                os.remove(warc_img)

        ocr = blank.ocr_ext(screenshot_path)

        execution_time = timeit.default_timer() - start
        print("execution time:  %s" % (execution_time,))

        current_time = datetime.datetime.utcnow()
        wayback_timestamp = link[:TIMESTAMP_SIZE]
        print("wayback timestamp:  %s" % (wayback_timestamp,))

        record = (current_time, execution_time, initial_uri[TIMESTAMP_SIZE:],
                  warc_link, wayback_timestamp, fc1, fc2, mc, msg, ocr,
                  img1_size, img2_size)
        store_in_file(*record)
        insert_mongo(*record)
    except Exception as e:
        # Deliberately broad: any failure is reported and processing goes on.
        print("Error: %s  Please check if Wayback machine and MongoDB are running!" % (e,))
def handleUri(link_name, counter):
    """Compare the rasterized archive copy of a URI with its stored screenshot.

    Steps, in order:

    1. URL-unquote ``link_name`` and drop every ".jpg" from it.
    2. Build the link on the local Wayback instance
       (``http://localhost:8080/``) and rasterize it to ``warc.png`` by
       running PhantomJS through ``run_command``.
    3. Compare ``warc.png`` with ``file_path + "/" + name`` using
       ``compare_screenshots.compare_ext`` (method ``'sift'``).
    4. Compute OCR on the stored screenshot with ``blank.ocr_ext``.
    5. Pass all measurements to ``store_in_file`` and ``insert_mongo``.

    Args:
        link_name: URL-quoted name.  Its first ``TIMESTAMP_SIZE`` characters
            are recorded as the Wayback timestamp and the remainder (without
            ".jpg") as the original URL.
        counter: running number, used only in the progress message.

    Returns:
        None.  Any exception is caught and reported with a printed message.

    NOTE(review): ``file_path``, ``name``, ``ts1`` and ``ts2`` are free
    variables here; they must be supplied by the enclosing or module scope
    -- confirm.
    NOTE(review): an earlier definition of this function exists in the file;
    if both are executed in the same scope, this later one replaces it.
    """
    import datetime

    link = urllib.unquote_plus(link_name)
    initial_uri = link.replace('.jpg', '')
    # Every "http" occurrence gets a "/" prefix, which separates the leading
    # timestamp from the URL in the Wayback link.
    warc_link = "http://localhost:8080/" + initial_uri.replace("http", "/http")
    print(" [%s] Wayback link %s" % (str(counter), warc_link))

    warc_img = 'warc.png'
    try:
        # Rasterize the archived page and echo the tool's output.
        for line in run_command([PHANTOMJS, RASTERIZE, warc_link, warc_img]):
            print(line)

        start = timeit.default_timer()
        try:
            # Flag 0 is passed to cv2.imread for both images.
            img2 = cv2.imread(warc_img, 0)
            img2_size = os.path.getsize(warc_img)
            screenshot_path = file_path + "/" + name
            print("original screenshot: " + screenshot_path)
            img1 = cv2.imread(screenshot_path, 0)
            img1_size = os.path.getsize(screenshot_path)

            # ts1: threshold for high similarity between screenshots, e.g. 60
            # ts2: threshold for small differences, e.g. 30
            # highest similarity is 100, minimal is 0
            fc1, fc2, mc, msg = compare_screenshots.compare_ext(
                img1, img2, 'sift', ts1, ts2)
        finally:
            # Remove the temporary image even when the comparison fails, so a
            # stale warc.png cannot be picked up by a later call whose
            # rasterization produced no file.
            if os.path.exists(warc_img):
                os.remove(warc_img)

        ocr = blank.ocr_ext(screenshot_path)

        execution_time = timeit.default_timer() - start
        print("execution time:  %s" % (execution_time,))

        current_time = datetime.datetime.utcnow()
        wayback_timestamp = link[:TIMESTAMP_SIZE]
        print("wayback timestamp:  %s" % (wayback_timestamp,))

        record = (current_time, execution_time, initial_uri[TIMESTAMP_SIZE:],
                  warc_link, wayback_timestamp, fc1, fc2, mc, msg, ocr,
                  img1_size, img2_size)
        store_in_file(*record)
        insert_mongo(*record)
    except Exception as e:
        # Deliberately broad: any failure is reported and processing goes on.
        print("Error: %s  Please check if Wayback machine and MongoDB are running!" % (e,))
def compare_images_by_path_and_link_ext(orig_link_path, wayback_link_name, ocr_flag, ts1=60, ts2=30, psnr_flag=False, psnr_th=None):
    """Compare an image file with a newly rendered screenshot of a Wayback URL.

    ``wayback_link_name`` is URL-unquoted and rendered to ``wayback.png``
    with ``convert_url_to_file``.  The image at ``orig_link_path`` and that
    screenshot are compared through ``compare_screenshots.compare_ext``
    (method ``'sift'``).  OCR (``blank.ocr``) and PSNR (``compare_psnr``)
    are optional.  Results go to ``store_in_file`` and ``insert_mongo``.

    Args:
        orig_link_path: path of the original screenshot file.
        wayback_link_name: URL-quoted Wayback link.  After unquoting, its
            second-to-last "/"-separated component is recorded as the
            Wayback timestamp.
        ocr_flag: converted with ``str2bool``; enables the OCR step.
        ts1: first threshold passed to ``compare_ext`` (printed as "high").
        ts2: second threshold passed to ``compare_ext`` (printed as "low").
        psnr_flag: converted with ``str2bool``; enables the PSNR step.
        psnr_th: PSNR threshold; recorded as-is when the PSNR step is
            skipped.

    Returns:
        The message produced by ``compare_ext`` on success, ``None`` when an
        exception is raised inside the comparison (a message is printed).

    NOTE(review): an earlier definition of this function exists in the file;
    if both are executed in the same scope, this later one replaces it.
    """
    import datetime

    print("\noriginal link:  %s , wayback link:  %s , ocr flag:  %s , high ts1:  %s , low ts2:  %s"
          % (orig_link_path, wayback_link_name, ocr_flag, ts1, ts2))

    wayback_link = urllib.unquote_plus(wayback_link_name)

    # Render the Wayback page to an image file.
    wayback_img_name = "wayback.png"
    convert_url_to_file(wayback_link, wayback_img_name)
    print("wayback file:  %s" % (wayback_img_name,))

    try:
        start = timeit.default_timer()

        # Load both images (flag 0 is passed to cv2.imread) and note their
        # sizes on disk.
        img1 = cv2.imread(orig_link_path, 0)
        img1_size = os.path.getsize(orig_link_path)
        img2 = cv2.imread(wayback_img_name, 0)
        img2_size = os.path.getsize(wayback_img_name)

        print("extract features ...")
        fc1, fc2, mc, msg = compare_screenshots.compare_ext(
            img1, img2, 'sift', ts1, ts2)

        ocr = 0
        if str2bool(ocr_flag):
            print("perform OCR analysis ...")
            ocr = blank.ocr(orig_link_path)

        psnr_similarity = "DIFFERENT"
        # Bind the threshold before the optional PSNR step.  It used to be
        # assigned only inside the branch below, so with PSNR disabled (the
        # default) the store calls raised UnboundLocalError, which the broad
        # handler swallowed: nothing was stored and None was returned.
        psnr_threshold = psnr_th
        psnr_msg = ""
        if str2bool(psnr_flag):
            print("perform PSNR analysis ...")
            if psnr_th is None:
                # NOTE(review): the resolved threshold is not passed on to
                # compare_psnr; confirm whether it should be.
                psnr_th = th_psnr
            psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(
                orig_link_path, wayback_img_name)

        # The Wayback screenshot is left on disk; its removal was already
        # disabled in the previous version of this function.

        execution_time = timeit.default_timer() - start
        print("execution time:  %s" % (execution_time,))

        current_time = datetime.datetime.utcnow()
        wayback_timestamp = wayback_link.split("/")[-2]
        print("wayback timestamp:  %s" % (wayback_timestamp,))

        record = (current_time, execution_time, orig_link_path, wayback_link,
                  wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size,
                  img2_size, psnr_similarity, psnr_threshold, psnr_msg,
                  orig_link_path)
        store_in_file(*record)
        insert_mongo(*record)

        print("result message:  %s" % (msg,))
        return msg
    except Exception as e:
        # Deliberately broad: any failure is reported and None is returned.
        print("Error: %s  Please check if Wayback machine and MongoDB are running!" % (e,))
def compare_screenshots_sift_ext2(orig_link_path, wayback_link_path, ocr_flag, psnr_flag):
    """Render an original and an archived URL to images and compare them.

    ``create_file`` produces one PNG per link.  If the PNG of the original
    link exists, both images are compared with
    ``compare_screenshots.compare_ext`` (method ``'sift'``), optionally
    followed by ``blank.ocr`` and ``compare_psnr``.  The measurements are
    passed to ``store_in_file`` and ``insert_mongo``.

    Args:
        orig_link_path: URL of the original page.
        wayback_link_path: URL of the archived page.  Its second-to-last
            "/"-separated component is recorded as the Wayback timestamp.
        ocr_flag: converted with ``str2bool``; enables the OCR step.
        psnr_flag: converted with ``str2bool``; enables the PSNR step.

    Returns:
        The tuple ``(mc, fc1, ocr, psnr_similarity)`` on success.  ``None``
        when the original screenshot file does not exist after creation or
        when any exception is raised (a message is printed in both cases).

    NOTE(review): ``ts1``, ``ts2`` and ``ts_psnr`` are not parameters of this
    function; they are expected to exist at module level -- confirm.
    NOTE(review): an earlier definition of this function exists in the file;
    if both are executed in the same scope, this later one replaces it.
    """
    import datetime

    print("\noriginal link:  %s , wayback link:  %s , ocr flag:  %s , psnr flag:  %s"
          % (orig_link_path, wayback_link_path, ocr_flag, psnr_flag))

    # A timer reading is used as the file id of the original screenshot.
    current_file_id = timeit.default_timer()
    orig_img_name = 'qa/orig/' + str(current_file_id) + '.png'
    wayback_img_name = 'wayback.png'

    try:
        # The previous code wrote Timer(FILE_CREATION_TIME, create_file(...)).
        # That expression runs create_file() right away and hands its *return
        # value* to the Timer as the callback, so the screenshots were always
        # created synchronously and the timer thread later tried to call that
        # return value (a TypeError if it is None -- assumed; confirm against
        # create_file).  Calling create_file() directly keeps the behaviour the
        # rest of this function relies on (the file must exist immediately
        # below) without the broken timer threads.
        create_file(orig_link_path, orig_img_name)
        create_file(wayback_link_path, wayback_img_name)

        if not os.path.exists(orig_img_name):
            print("Warning: original file could not be retrieved from internet!")
            return None

        start = timeit.default_timer()

        # Load both screenshots (flag 0 is passed to cv2.imread) and note
        # their sizes on disk.
        img1 = cv2.imread(orig_img_name, 0)
        img1_size = os.path.getsize(orig_img_name)
        img2 = cv2.imread(wayback_img_name, 0)
        img2_size = os.path.getsize(wayback_img_name)

        print("extract features ...")
        fc1, fc2, mc, msg = compare_screenshots.compare_ext(
            img1, img2, 'sift', ts1, ts2)

        ocr = 0
        if str2bool(ocr_flag):
            print("perform OCR analysis ...")
            ocr = blank.ocr(orig_img_name)

        # Values recorded when the PSNR comparison is not requested.
        psnr_similarity = "None"
        psnr_threshold = ts_psnr
        psnr_msg = ""
        if str2bool(psnr_flag):
            print("perform PSNR analysis ...")
            psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(
                orig_img_name, wayback_img_name)

        # The Wayback screenshot is temporary; the original one is kept.
        os.remove(wayback_img_name)

        execution_time = timeit.default_timer() - start
        print("execution time:  %s" % (execution_time,))

        wayback_timestamp = wayback_link_path.split("/")[-2]
        print("wayback timestamp:  %s" % (wayback_timestamp,))

        current_time = datetime.datetime.utcnow()
        # The stored reference to the original screenshot is an HTTP URL on
        # 127.0.0.1:8000 built from the local file name.
        orig_img_url = 'http://127.0.0.1:8000/' + orig_img_name

        record = (current_time, execution_time, orig_link_path,
                  wayback_link_path, wayback_timestamp, fc1, fc2, mc, msg,
                  ocr, img1_size, img2_size, psnr_similarity, psnr_threshold,
                  psnr_msg, orig_img_url)
        store_in_file(*record)
        insert_mongo(*record)
        return mc, fc1, ocr, psnr_similarity
    except Exception as e:
        # Deliberately broad: any failure is reported and None is returned.
        print("Error: %s  Please check if Wayback machine and MongoDB are running!" % (e,))