def generatePairedTableOnlyOcr(imageTable):
    """Pair up the entries of ``imageTable`` using only their OCR text.

    Every entry is compared with every other entry of the same table through
    ``commonMethods.percentageEditDistance``. Keys of sufficiently close
    entries are added to the entry's ``"strictMatch"`` set. Entries that
    matched nothing are collected and handed to
    ``commonMethods.addSingulars``.

    Args:
        imageTable: dict mapping an image key to a record dict. Each record
            must provide ``"ocr_text"`` and the ``"strictMatch"`` /
            ``"matchNamesRemove"`` collections (``.add`` is called on them,
            so they are mutated in place).

    Returns:
        The value returned by ``commonMethods.addSingulars(imageTable,
        singular)``.
    """
    singular = []

    for elemOneKey, elemOneValue in imageTable.items():
        one = elemOneValue["ocr_text"]
        foundOneMatch = False

        # The function only receives a single table, so the candidates are
        # the other entries of that same table. (The previous code iterated
        # an undefined name, ``imageTable2``, and raised NameError.)
        for elemTwoKey, elemTwoValue in imageTable.items():
            if elemOneKey == elemTwoKey:
                continue

            two = elemTwoValue["ocr_text"]
            ocrDiff = commonMethods.percentageEditDistance(one, two)

            if ocrDiff <= 0.3:
                elemOneValue["strictMatch"].add(elemTwoKey)
                foundOneMatch = True
            elif ocrDiff < 0.26:
                # NOTE(review): unreachable as written -- every value below
                # 0.26 already satisfies ``<= 0.3`` above. One of the two
                # thresholds is presumably a typo (0.03?); confirm the
                # intended value before relying on "matchNamesRemove" being
                # populated here.
                elemOneValue["matchNamesRemove"].add(elemTwoKey)
                foundOneMatch = True

        if not foundOneMatch:
            singular.append(elemOneKey)

    # The entries without any match are taken care of by the helper.
    return commonMethods.addSingulars(imageTable, singular)
def findImagePairs(imageTable):
    """Classify every candidate in each entry's ``"matchNamesRemove"`` set.

    For each key, every candidate listed in ``"matchNamesRemove"`` is
    compared with the entry by OCR edit distance and by hash difference
    (both computed by ``commonMethods``). Depending on the module-level
    ``THRESH_*`` constants the candidate is:

    * added to the entry's ``"strictMatch"`` set,
    * queued for removal from the candidate set, or
    * queued (with both distances) for ``experimentGraph``.

    Args:
        imageTable: dict mapping an image key to a record dict providing
            ``"ocr_text"``, ``"hash_value"``, ``"imageDominant"``,
            ``"matchNamesRemove"`` and ``"strictMatch"``. Records are
            mutated in place.

    Returns:
        The same ``imageTable`` object.
    """
    # ``.items()`` works on both Python 2 and 3; ``.iteritems()`` does not
    # exist on Python 3 dicts.
    for key, value in imageTable.items():
        addList = []  # (name, ocr_difference, hash_difference)
        removeList = []

        for matchKey in value["matchNamesRemove"]:
            candidate = imageTable[matchKey]
            diffOCRdistance = commonMethods.percentageEditDistance(
                value["ocr_text"], candidate["ocr_text"])
            diffHashDistance = commonMethods.percentHashDifference(
                value["hash_value"], candidate["hash_value"])

            # Strict match: both the image structure and the OCR text are
            # within their strict thresholds.
            if (diffHashDistance <= THRESH_HASH_DISTANCE_STRICT
                    and diffOCRdistance <= THRESH_OCR_DISTANCE_STRICT):
                value["strictMatch"].add(matchKey)

            # Either picture is image dominant: use the image-dominant
            # thresholds. A candidate that is neither strict nor removable
            # here is deliberately left untouched.
            elif value["imageDominant"] or candidate["imageDominant"]:
                if diffHashDistance < THRESH_HASH_DISTANCE_STRICT_IMAGE_DOMINANT:
                    value["strictMatch"].add(matchKey)
                elif (diffHashDistance > THRESH_HASH_DISTANCE_REMOVE_IMAGE_DOMINANT
                        or diffOCRdistance > THRESH_OCR_DISTANCE_REMOVE_IMAGE_DOMINANT):
                    removeList.append(matchKey)

            # The picture is similar neither in image structure nor writing.
            elif (diffHashDistance > THRESH_HASH_DISTANCE_REMOVE
                    and diffOCRdistance > THRESH_OCR_DISTANCE_REMOVE):
                removeList.append(matchKey)

            # Neither a strict match nor removable: check it against the
            # other elements that also do not need to be removed. (Could be
            # done with removed elements as well; feasibility not checked.)
            else:
                addList.append((matchKey, diffOCRdistance, diffHashDistance))

        # Populate the matchNamesAdd.
        experimentGraph(imageTable, key, addList)
        # Prune the matchNamesRemove.
        commonMethods.removeItemsFromSet(imageTable, key, removeList)

    return imageTable
def generatePairedTableOnlyOcr2(imageTable, imageTable2):
    """Report OCR-only matches between two image tables as TSV text.

    Every entry of ``imageTable`` is compared with every entry of
    ``imageTable2`` that has a different key, using
    ``commonMethods.percentageEditDistance`` on their ``"ocr_text"``.

    Output format, one line per reported pair::

        <key from imageTable>\\t<key from imageTable2>\\t<confidence>\\n

    where confidence is ``0`` for an (almost) positive match (distance
    below 0.26) and ``1`` for a probable match (distance up to 0.3).
    Pairs above 0.3 are not reported.

    Args:
        imageTable: dict mapping image key -> record with ``"ocr_text"``.
        imageTable2: dict of the same shape to compare against.

    Returns:
        The concatenated result lines as one string ('' when no pair
        matched).
    """
    lines = []

    for elemOneKey, elemOneValue in imageTable.items():
        one = elemOneValue["ocr_text"]
        for elemTwoKey, elemTwoValue in imageTable2.items():
            two = elemTwoValue["ocr_text"]
            if elemOneKey == elemTwoKey:
                continue

            ocrDiff = commonMethods.percentageEditDistance(one, two)

            # The tighter threshold must be tested first; testing
            # ``<= 0.3`` first made the ``< 0.26`` branch unreachable, so
            # confidence 0 was never emitted.
            if ocrDiff < 0.26:
                # 0 means (almost) positive match
                lines.append("{}\t{}\t0\n".format(elemOneKey, elemTwoKey))
            elif ocrDiff <= 0.3:
                # 1 means probable match
                lines.append("{}\t{}\t1\n".format(elemOneKey, elemTwoKey))

    return "".join(lines)
def find_image_matches(class1, table1, class2, table2, htmlout=False):
    '''Match the images in class1 against the images from class2.

    Returns either an HTML page for checking the matches visually
    (``htmlout=True``) or a tab-separated text report (the default).

    In the text report every match is one line:

        image1<TAB>image2<TAB>match_score

    possible match_score values are:

        0 for high confidence match (both phash and text-ocr are below
          their thresholds)
        1 medium-high confidence (text match, but no phash match)
        2 medium confidence match (phash below its threshold and text
          below the less strict THRESHOLD_TEXT_WITH_PHASH)

    anything else we leave out.

    Args:
        class1: label of the first class; only forwarded to ``tohtml``.
        table1: dict mapping filename -> image object exposing ``.text``
            and ``.image_hash``.
        class2: label of the second class; only forwarded to ``tohtml``.
        table2: dict of the same shape as ``table1``.
        htmlout: when true, return the HTML page instead of the report.

    Returns:
        A string: the HTML page or the tab-separated report.
    '''
    print("comparing {} with {}".format(len(table1), len(table2)))

    # Pairs already handled, stored in both orders. Tuples are used instead
    # of "f1-f2" strings so that filenames containing '-' cannot collide.
    done = set()
    result_lines = []
    html_rows = []
    both = 0
    neither = 0
    phash = 0
    text = 0

    for filename1, image1 in table1.items():
        for filename2, image2 in table2.items():
            if (filename1, filename2) in done:
                continue
            done.add((filename1, filename2))
            # also add the opposite of the key, so f1-f2 and f2-f1
            done.add((filename2, filename1))

            diff_text = percentageEditDistance(image1.text, image2.text)
            diff_phash = percentHashDifference(image1.image_hash,
                                               image2.image_hash)

            if diff_text < THRESHOLD_TEXT and diff_phash < THRESHOLD_PHASH:
                # BEST: both phash and ocr text indicate a match
                both += 1
                label, score = 'both!', 0
            elif diff_text < THRESHOLD_TEXT:
                # second best: text matches
                text += 1
                label, score = 'text only', 1
            elif (diff_phash < THRESHOLD_PHASH
                    and diff_text < THRESHOLD_TEXT_WITH_PHASH):
                # third best: phash matches, text is below a less strict
                # threshold
                phash += 1
                label, score = 'phash only', 2
            else:
                neither += 1
                continue

            html_rows.append(tohtml(label, diff_text, diff_phash,
                                    class1, filename1, class2, filename2))
            result_lines.append(
                "{}\t{}\t{}\n".format(filename1, filename2, score))

    print("both: {} phash: {} text: {} neither: {}".format(
        both, phash, text, neither))

    if htmlout:
        template = (
            " <html> <head><title></title></head> <body> <table> <tr>"
            " <th>message</th> <th>diff text</th> <th>diff phash</th>"
            " <th>file1</th> <th>file2</th> </tr> {} </table> </body> </html> "
        )
        return template.format("".join(html_rows))
    return "".join(result_lines)