import os
import random
from glob import glob

import cv2
import nltk
from progressbar import Bar, ETA, FileTransferSpeed, Percentage, ProgressBar

# binarize, textSkewCorrection, wordSegmentation, charSegmentation, write_tokenz,
# createDataSet and the character lists (nonConnChOneSide, nonConnChTwoSide,
# englishName) are assumed to be defined elsewhere in this project.


def check_on_length(wrong_file, right_file, ending=2000):
    fWrong = open(wrong_file, "w")
    fRight = open(right_file, "w")
    candidates = open("textOnly.txt").readlines()
    total = 0
    wrong = 0
    widgets = [
        'Test: ', Percentage(), ' ',
        Bar(marker='0', left='[', right=']'), ' ',
        ETA(), ' ', FileTransferSpeed()
    ]  # see docs for other options
    pbar = ProgressBar(widgets=widgets, maxval=ending)
    pbar.start()
    iii = 0
    for file in candidates:
        file = file.rstrip()
        print(file)
        pbar.update(iii)
        iii += 1
        file = file.replace(".txt", "")
        img = cv2.imread("scanned\\" + file + ".png")
        ftext = open("text\\" + file + ".txt", "r", encoding="utf-8")

        ## image segmentation
        text = ftext.read()
        wordsArray = nltk.word_tokenize(text)
        thre = binarize(img)
        rotated = textSkewCorrection(thre)
        wordList = wordSegmentation(rotated)
        print(file, len(wordsArray), len(wordList))
        total += 1
        if len(wordList) != len(wordsArray):
            wrong += 1
            stt = "{0:<20} {1:<20} {2:<20}".format(file, str(len(wordList)), str(len(wordsArray)))
            fWrong.write(stt + "\n")
            print("************wrong Image************")
            write_tokenz(wordsArray)
            # showWordCuts(img, wordList)
            break
        else:
            fRight.write(file + ".txt" + "\n")
            print("************correct Image************")
        ftext.close()
        if total == ending:
            break
    fWrong.close()
    fRight.close()
    pbar.finish()
def segmentationFromPath(imagePath, ticktock):
    # return segmentationFromPath_ammar(imagePath)
    img = cv2.imread(imagePath)
    ticktock.tick("start time")
    thre = binarize(img)
    rotated = textSkewCorrection(thre)
    wordsFromImage = charSegmentation(rotated)
    return wordsFromImage, rotated
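# Hypothetical usage sketch (not part of the original module): segmentationFromPath
# expects a timer-like object with a tick() method. The project's own timer class is
# not shown here, so this minimal stand-in and the example path are assumptions for
# illustration only.
class _StubTimer:
    def tick(self, label):
        print(label)


def _demo_segmentation():
    # "scanned\\example.png" is a placeholder path, not a file from the dataset.
    words, rotated = segmentationFromPath("scanned\\example.png", _StubTimer())
    print(len(words), rotated.shape)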
def main():
    for i in range(0, 1):
        file = random.choice(os.listdir("scanned\\"))
        img = cv2.imread("capr2.png")  # "scanned\\" + file
        thre = binarize(img)
        rotated = textSkewCorrection(thre)
        wordList = charSegmentation(rotated)
def extract(self):
    folders = glob(f'{self.datasetPath}\\*')
    for folder in folders:
        print(f"currently in dir: {folder}")
        for test_case in glob(folder + '/*'):
            for f in glob(test_case + '/*.png'):
                outputFolder = self.outPath + f[7:-4] + ".txt"
                # characterImage = preprocessImage(f)
                img = cv2.imread(f)
                img = cv2.resize(img, (28, 28))
                img = binarize(img)
                self.writeFeatureVector(outputFolder, img.flatten())
def main():
    for i in range(0, 10):
        file = random.choice(os.listdir("scanned\\"))
        img = cv2.imread("scanned\\" + file)
        thre = binarize(img)
        rotated = textSkewCorrection(thre)
        wordList = wordSegmentation(rotated)
        for word in wordList:
            c1, c2 = word['rows']
            r1_offset, r1_offset2 = word['columns']
            if len(word['subwords']) == 0:
                cv2.imshow("sub word ", rotated[c1:c2, r1_offset:r1_offset2])
                cv2.waitKey(0)
            else:
                for subword in word['subwords']:
                    r1, r2 = subword
                    cv2.imshow("sub word ", rotated[c1:c2, r1_offset + r1:r1_offset + r2])
                    cv2.waitKey(0)
check_on_length("wrong.txt", "right.txt", ending=5000) f = open("right.txt", "r") fw = open("output.txt", "w", encoding="utf-8") files = f.read().split() for file in files: img = cv2.imread("scanned\\" + file.replace(".txt", "") + ".png") ftext = open("text\\" + file, "r", encoding="utf-8") ## image segmentation # text preprocessing text = ftext.read().replace(":", "").replace(".", "").replace("-", "") # tokenizations wordsArray = nltk.word_tokenize(text) for word in wordsArray: fw.writelines([word, " ", str(word.encode("utf8-")), "\n"]) thre = binarize(img) rotated = textSkewCorrection(thre) wordList = charSegmentation(rotated) createDataSet(rotated, wordList, wordsArray, nonConnChOneSide, nonConnChTwoSide, englishName, file) f.close() fw.close()
def imageToFeatureVector(imagePath):
    img = cv2.imread(imagePath)
    img = cv2.resize(img, (28, 28))
    img = binarize(img)
    return img.flatten()
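# Hypothetical usage sketch (assumption, not in the original code): stack the
# 28x28 = 784-element vectors returned by imageToFeatureVector into a single
# matrix. The "dataset\\chars" folder name and the function name are placeholders.
def buildFeatureMatrix(folder="dataset\\chars"):
    import numpy as np
    vectors = [imageToFeatureVector(p) for p in glob(folder + "\\*.png")]
    return np.array(vectors)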