Exemplo n.º 1
0
def compare_screenshots_sift_ext2(orig_link_path, wayback_link_path, ocr_flag, psnr_flag): 
    # create screenshot from Wayback using timestamp
    print "\noriginal link: ", orig_link_path, ", wayback link: ", wayback_link_path, ", ocr flag: ", ocr_flag, ", psnr flag: ", psnr_flag 
    import datetime
    current_file_id =  timeit.default_timer()
    # create image from original link
    orig_img_name = 'qa/orig/' + str(current_file_id) + '.png'
    # create image from Wayback link
    wayback_img_name = 'wayback.png'
    try:
        # Set the timer for five seconds for file creation due to programm hanging 
        # if file could not be created from the link by internet
        t = Timer(FILE_CREATION_TIME, create_file(orig_link_path, orig_img_name))
        t.start() 
        tw = Timer(FILE_CREATION_TIME, create_file(wayback_link_path, wayback_img_name))
        tw.start() 
        if os.path.exists(orig_img_name):
            start = timeit.default_timer()
            # compare two images
            img1 = cv2.imread(orig_img_name, 0)
            img1_size = os.path.getsize(orig_img_name)
            img2 = cv2.imread(wayback_img_name, 0)
            img2_size = os.path.getsize(wayback_img_name)
            print "extract features ..."
            fc1, fc2, mc, msg = compare_screenshots.compare_ext(img1, img2, 'sift', ts1, ts2)
            ocr = 0
            if str2bool(ocr_flag):
                # calculate OCR value from original image
                print "perform OCR analysis ..."
                ocr = blank.ocr(orig_img_name)
            psnr_similarity = "None"
            psnr_threshold = ts_psnr
            psnr_msg = ""
            if str2bool(psnr_flag):
               # compare images using imagemagick tool and PSNR metric 
               print "perform PSNR analysis ..."
               psnr_similarity = "DIFFERENT"
               psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(orig_img_name, wayback_img_name)
            # remove temporary images
            os.remove(wayback_img_name)
            execution_time = timeit.default_timer() - start
            print "execution time: ", execution_time
            # store in CSV and MongoDB resulting timstamp in ms, execution time in sec, 
            # original URL, wayback machine URL, wayback timestamp, SIFT features total count, 
            # matched features count, resulting message, OCR count, file sizes, PSNR value
            # and original file
            wayback_url_list = wayback_link_path.split("/")
            wayback_timestamp = wayback_url_list[-2]
            print "wayback timestamp: ", wayback_timestamp
            import datetime
            current_time =  datetime.datetime.utcnow()
            orig_img_name = 'http://127.0.0.1:8000/' + orig_img_name
            #print "original link: ", orig_link
            store_in_file(current_time, execution_time, orig_link_path, wayback_link_path, wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size, img2_size, psnr_similarity, psnr_threshold, psnr_msg, orig_img_name)
            insert_mongo(current_time, execution_time, orig_link_path, wayback_link_path, wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size, img2_size, psnr_similarity, psnr_threshold, psnr_msg, orig_img_name)
            return mc, fc1, ocr, psnr_similarity
        else:
           print "Warning: original file could not be retrieved from internet!"
    except Exception, e:
        print "Error:", e, " Please check if Wayback machine and MongoDB are running!" 
Exemplo n.º 2
0
def compare_images_by_path_and_link_ext(orig_link_path, wayback_link_name, ocr_flag, ts1=60, ts2=30, psnr_flag=False, psnr_th=None): 
    # create screenshot from Wayback using timestamp
    print "\noriginal link: ", orig_link_path, ", wayback link: ", wayback_link_name, ", ocr flag: ", ocr_flag, ", high ts1: ", ts1, ", low ts2: ", ts2 
    
    # create screenshot from Wayback using timestamp
    wayback_link = urllib.unquote_plus(wayback_link_name)
    
    # create image from Wayback link
    wayback_img_name = "wayback.png"
    convert_url_to_file(wayback_link, wayback_img_name)
    print "wayback file: ", wayback_img_name
    try:
        timeit.default_timer 
        start = timeit.default_timer()
         # compare two images
        img1 = cv2.imread(orig_link_path, 0)
        img1_size = os.path.getsize(orig_link_path)
        img2 = cv2.imread(wayback_img_name, 0)
        img2_size = os.path.getsize(wayback_img_name)
        print "extract features ..."
        fc1, fc2, mc, msg = compare_screenshots.compare_ext(img1, img2, 'sift', ts1, ts2)
        ocr = 0
        if str2bool(ocr_flag):
            # calculate OCR value from original image
            print "perform OCR analysis ..."
            ocr = blank.ocr(orig_link_path)
        psnr_similarity = "DIFFERENT"
        psnr_msg = ""
        if str2bool(psnr_flag):
            # compare images using imagemagick tool and PSNR metric 
            print "perform PSNR analysis ..."
            if psnr_th is None:
                psnr_th = th_psnr
            psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(orig_link_path, wayback_img_name)
        # remove temporary images
   #     os.remove(wayback_img_name)
        execution_time = timeit.default_timer() - start
        print "execution time: ", execution_time
        # store in CSV and MongoDB resulting timstamp in ms, execution time in sec, 
        # original URL, wayback machine URL, wayback timestamp, SIFT features total count, 
        # matched features count, resulting message, OCR count, file sizes
        import datetime
        current_time =  datetime.datetime.utcnow()
        wayback_url_list = wayback_link.split("/")
        wayback_timestamp = wayback_url_list[-2]
        print "wayback timestamp: ", wayback_timestamp
        #print "original link: ", orig_link
        store_in_file(current_time, execution_time, orig_link_path, wayback_link, wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size, img2_size, psnr_similarity, psnr_threshold, psnr_msg, orig_link_path)
        insert_mongo(current_time, execution_time, orig_link_path, wayback_link, wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size, img2_size, psnr_similarity, psnr_threshold, psnr_msg, orig_link_path)
        print "result message: ", msg
        return msg
    except Exception, e:
        print "Error:", e, " Please check if Wayback machine and MongoDB are running!" 
def handleUri(link_name, counter):
    # create screenshot from Wayback using timestamp
    link = urllib.unquote_plus(link_name)
    #print "link: ", link
    initial_uri = link.replace('.jpg', '')
    warc_link = "http://localhost:8080/" + initial_uri.replace("http", "/http")
    #print "warc link: ", warc_link
    print " [%s] Wayback link %s" % (str(counter), warc_link)
    # create image from Wayback link
    warc_img = 'warc.png'
    try:
        for line in run_command([PHANTOMJS, RASTERIZE, warc_link, warc_img]):
            print(line)
        #timeit.default_timer # from last call
        start = timeit.default_timer()
        # compare two images
        img2 = cv2.imread(warc_img, 0)
        img2_size = os.path.getsize(warc_img)
        print "original screenshot: " + file_path + "/" + name
        img1 = cv2.imread(file_path + "/" + name, 0)
        img1_size = os.path.getsize(file_path + "/" + name)
        # threshold for high similarity between screenshots e.g. 60
        # threshold for small differencies e.g. 30
        # highest similarity is 100, minimal is 0
        fc1, fc2, mc, msg = compare_screenshots.compare_ext(
            img1, img2, 'sift', ts1, ts2)
        # remove temporary image
        os.remove(warc_img)
        # calculate OCR value from original image
        ocr = blank.ocr_ext(file_path + "/" + name)
        execution_time = timeit.default_timer() - start
        print "execution time: ", execution_time
        # store in CSV and MongoDB resulting timstamp in ms, URL, wayback timestamp,
        # execution time in ms, file size, SIFT features total count,
        # matched features count, OCR count
        import datetime
        current_time = datetime.datetime.utcnow()
        wayback_timestamp = link[:TIMESTAMP_SIZE]
        print "wayback timestamp: ", wayback_timestamp
        #print "original link: ", orig_link
        store_in_file(current_time, execution_time,
                      initial_uri[TIMESTAMP_SIZE:], warc_link,
                      wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size,
                      img2_size)
        insert_mongo(current_time, execution_time,
                     initial_uri[TIMESTAMP_SIZE:], warc_link,
                     wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size,
                     img2_size)
    except Exception, e:
        print "Error:", e, " Please check if Wayback machine and MongoDB are running!"
Exemplo n.º 4
0
def handleUri(link_name, counter): 
    # create screenshot from Wayback using timestamp
    link = urllib.unquote_plus(link_name)
    #print "link: ", link 
    initial_uri = link.replace('.jpg', '')
    warc_link = "http://localhost:8080/" + initial_uri.replace("http", "/http")
    #print "warc link: ", warc_link
    print " [%s] Wayback link %s" % (str(counter), warc_link)
    # create image from Wayback link
    warc_img = 'warc.png'
    try:
        for line in run_command([PHANTOMJS, RASTERIZE, warc_link, warc_img]):
            print (line)
        #timeit.default_timer # from last call
        start = timeit.default_timer()
         # compare two images
        img2 = cv2.imread(warc_img, 0)
        img2_size = os.path.getsize(warc_img)
        print "original screenshot: " + file_path + "/" + name
        img1 = cv2.imread(file_path + "/" + name, 0)
        img1_size = os.path.getsize(file_path + "/" + name)
        # threshold for high similarity between screenshots e.g. 60
        # threshold for small differencies e.g. 30
        # highest similarity is 100, minimal is 0
        fc1, fc2, mc, msg = compare_screenshots.compare_ext(img1, img2, 'sift', ts1, ts2)
        # remove temporary image
        os.remove(warc_img)
        # calculate OCR value from original image
        ocr = blank.ocr_ext(file_path + "/" + name)
        execution_time = timeit.default_timer() - start
        print "execution time: ", execution_time
        # store in CSV and MongoDB resulting timstamp in ms, URL, wayback timestamp, 
        # execution time in ms, file size, SIFT features total count, 
        # matched features count, OCR count
        import datetime
        current_time =  datetime.datetime.utcnow()
        wayback_timestamp = link[:TIMESTAMP_SIZE]
        print "wayback timestamp: ", wayback_timestamp
        #print "original link: ", orig_link
        store_in_file(current_time, execution_time, initial_uri[TIMESTAMP_SIZE:], warc_link, wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size, img2_size)
        insert_mongo(current_time, execution_time, initial_uri[TIMESTAMP_SIZE:], warc_link, wayback_timestamp, fc1, fc2, mc, msg, ocr, img1_size, img2_size)
    except Exception, e:
        print "Error:", e, " Please check if Wayback machine and MongoDB are running!" 
def compare_images_by_path_and_link_ext(orig_link_path,
                                        wayback_link_name,
                                        ocr_flag,
                                        ts1=60,
                                        ts2=30,
                                        psnr_flag=False,
                                        psnr_th=None):
    # create screenshot from Wayback using timestamp
    print "\noriginal link: ", orig_link_path, ", wayback link: ", wayback_link_name, ", ocr flag: ", ocr_flag, ", high ts1: ", ts1, ", low ts2: ", ts2

    # create screenshot from Wayback using timestamp
    wayback_link = urllib.unquote_plus(wayback_link_name)

    # create image from Wayback link
    wayback_img_name = "wayback.png"
    convert_url_to_file(wayback_link, wayback_img_name)
    print "wayback file: ", wayback_img_name
    try:
        timeit.default_timer
        start = timeit.default_timer()
        # compare two images
        img1 = cv2.imread(orig_link_path, 0)
        img1_size = os.path.getsize(orig_link_path)
        img2 = cv2.imread(wayback_img_name, 0)
        img2_size = os.path.getsize(wayback_img_name)
        print "extract features ..."
        fc1, fc2, mc, msg = compare_screenshots.compare_ext(
            img1, img2, 'sift', ts1, ts2)
        ocr = 0
        if str2bool(ocr_flag):
            # calculate OCR value from original image
            print "perform OCR analysis ..."
            ocr = blank.ocr(orig_link_path)
        psnr_similarity = "DIFFERENT"
        psnr_msg = ""
        if str2bool(psnr_flag):
            # compare images using imagemagick tool and PSNR metric
            print "perform PSNR analysis ..."
            if psnr_th is None:
                psnr_th = th_psnr
            psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(
                orig_link_path, wayback_img_name)
        # remove temporary images
#     os.remove(wayback_img_name)
        execution_time = timeit.default_timer() - start
        print "execution time: ", execution_time
        # store in CSV and MongoDB resulting timstamp in ms, execution time in sec,
        # original URL, wayback machine URL, wayback timestamp, SIFT features total count,
        # matched features count, resulting message, OCR count, file sizes
        import datetime
        current_time = datetime.datetime.utcnow()
        wayback_url_list = wayback_link.split("/")
        wayback_timestamp = wayback_url_list[-2]
        print "wayback timestamp: ", wayback_timestamp
        #print "original link: ", orig_link
        store_in_file(current_time, execution_time, orig_link_path,
                      wayback_link, wayback_timestamp, fc1, fc2, mc, msg, ocr,
                      img1_size, img2_size, psnr_similarity, psnr_threshold,
                      psnr_msg, orig_link_path)
        insert_mongo(current_time, execution_time, orig_link_path,
                     wayback_link, wayback_timestamp, fc1, fc2, mc, msg, ocr,
                     img1_size, img2_size, psnr_similarity, psnr_threshold,
                     psnr_msg, orig_link_path)
        print "result message: ", msg
        return msg
    except Exception, e:
        print "Error:", e, " Please check if Wayback machine and MongoDB are running!"
def compare_screenshots_sift_ext2(orig_link_path, wayback_link_path, ocr_flag,
                                  psnr_flag):
    # create screenshot from Wayback using timestamp
    print "\noriginal link: ", orig_link_path, ", wayback link: ", wayback_link_path, ", ocr flag: ", ocr_flag, ", psnr flag: ", psnr_flag
    import datetime
    current_file_id = timeit.default_timer()
    # create image from original link
    orig_img_name = 'qa/orig/' + str(current_file_id) + '.png'
    # create image from Wayback link
    wayback_img_name = 'wayback.png'
    try:
        # Set the timer for five seconds for file creation due to programm hanging
        # if file could not be created from the link by internet
        t = Timer(FILE_CREATION_TIME, create_file(orig_link_path,
                                                  orig_img_name))
        t.start()
        tw = Timer(FILE_CREATION_TIME,
                   create_file(wayback_link_path, wayback_img_name))
        tw.start()
        if os.path.exists(orig_img_name):
            start = timeit.default_timer()
            # compare two images
            img1 = cv2.imread(orig_img_name, 0)
            img1_size = os.path.getsize(orig_img_name)
            img2 = cv2.imread(wayback_img_name, 0)
            img2_size = os.path.getsize(wayback_img_name)
            print "extract features ..."
            fc1, fc2, mc, msg = compare_screenshots.compare_ext(
                img1, img2, 'sift', ts1, ts2)
            ocr = 0
            if str2bool(ocr_flag):
                # calculate OCR value from original image
                print "perform OCR analysis ..."
                ocr = blank.ocr(orig_img_name)
            psnr_similarity = "None"
            psnr_threshold = ts_psnr
            psnr_msg = ""
            if str2bool(psnr_flag):
                # compare images using imagemagick tool and PSNR metric
                print "perform PSNR analysis ..."
                psnr_similarity = "DIFFERENT"
                psnr_similarity, psnr_threshold, psnr_msg = compare_psnr(
                    orig_img_name, wayback_img_name)
            # remove temporary images
            os.remove(wayback_img_name)
            execution_time = timeit.default_timer() - start
            print "execution time: ", execution_time
            # store in CSV and MongoDB resulting timstamp in ms, execution time in sec,
            # original URL, wayback machine URL, wayback timestamp, SIFT features total count,
            # matched features count, resulting message, OCR count, file sizes, PSNR value
            # and original file
            wayback_url_list = wayback_link_path.split("/")
            wayback_timestamp = wayback_url_list[-2]
            print "wayback timestamp: ", wayback_timestamp
            import datetime
            current_time = datetime.datetime.utcnow()
            orig_img_name = 'http://127.0.0.1:8000/' + orig_img_name
            #print "original link: ", orig_link
            store_in_file(current_time, execution_time, orig_link_path,
                          wayback_link_path, wayback_timestamp, fc1, fc2, mc,
                          msg, ocr, img1_size, img2_size, psnr_similarity,
                          psnr_threshold, psnr_msg, orig_img_name)
            insert_mongo(current_time, execution_time, orig_link_path,
                         wayback_link_path, wayback_timestamp, fc1, fc2, mc,
                         msg, ocr, img1_size, img2_size, psnr_similarity,
                         psnr_threshold, psnr_msg, orig_img_name)
            return mc, fc1, ocr, psnr_similarity
        else:
            print "Warning: original file could not be retrieved from internet!"
    except Exception, e:
        print "Error:", e, " Please check if Wayback machine and MongoDB are running!"