Пример #1
0
def check_on_length(wrong_file, right_file, ending=2000):
    """Compare token counts of text files with segmented word counts of scans.

    File names are read, one per line, from ``textOnly.txt``.  For each name
    the matching image in the ``scanned`` directory and the matching text in
    the ``text`` directory are loaded.  The text is tokenized with
    ``nltk.word_tokenize``; the image goes through ``binarize``,
    ``textSkewCorrection`` and ``wordSegmentation``.  The two resulting
    lengths are then compared.

    Args:
        wrong_file: Path of the report written for a mismatching file
            (name, segmented-word count, token count in 20-wide columns).
        right_file: Path of the list that receives ``<name>.txt`` for every
            file whose counts match.
        ending: Number of processed files after which the scan stops; also
            used as the progress bar's ``maxval``.

    Returns:
        None.  Results are written to ``wrong_file`` and ``right_file``.
    """
    widgets = [
        'Test: ',
        Percentage(), ' ',
        Bar(marker='0', left='[', right=']'), ' ',
        ETA(), ' ',
        FileTransferSpeed()
    ]  # see docs for other options

    # Context managers guarantee every handle is closed, including on the
    # early ``break`` below and when any processing step raises.
    with open(wrong_file, "w") as fWrong, open(right_file, "w") as fRight:
        with open("textOnly.txt") as listing:
            candidates = listing.readlines()

        pbar = ProgressBar(widgets=widgets, maxval=ending)
        pbar.start()
        total = 0
        for index, entry in enumerate(candidates):
            name = entry.rstrip()
            print(name)
            pbar.update(index)
            name = name.replace(".txt", "")

            img = cv2.imread("scanned\\" + name + ".png")
            with open("text\\" + name + ".txt", "r",
                      encoding="utf-8") as ftext:
                text = ftext.read()
            wordsArray = nltk.word_tokenize(text)

            # image segmentation
            thre = binarize(img)
            rotated = textSkewCorrection(thre)
            wordList = wordSegmentation(rotated)

            print(name, len(wordsArray), len(wordList))
            total += 1

            if len(wordList) != len(wordsArray):
                stt = "{0:<20}  {1:<20}  {2:<20}".format(
                    name, str(len(wordList)), str(len(wordsArray)))
                fWrong.write(stt + "\n")
                print("************wrong Image************")
                write_tokenz(wordsArray)
                # NOTE(review): the scan deliberately stops at the first
                # mismatch (kept from the original) -- confirm this is still
                # wanted and not a debugging leftover.
                break

            fRight.write(name + ".txt" + "\n")
            print("************corrent Image************")

            if total == ending:
                break

    pbar.finish()
Пример #2
0
def segmentationFromPath(imagePath, ticktock):
    """Load an image and run it through the character segmentation pipeline.

    The image read from ``imagePath`` is passed through ``binarize`` and
    ``textSkewCorrection`` before ``charSegmentation`` is applied.
    ``ticktock.tick("start time")`` is called right after the image is read.

    Returns:
        A tuple ``(result of charSegmentation, skew-corrected image)``.
    """
    image = cv2.imread(imagePath)
    ticktock.tick("start time")
    deskewed = textSkewCorrection(binarize(image))
    return charSegmentation(deskewed), deskewed
Пример #3
0
def main():
    """Run the character segmentation pipeline once on ``capr2.png``.

    A random entry of the ``scanned`` directory is still picked on each pass,
    but it is not used: the image that is actually loaded is the fixed file
    ``capr2.png``.
    """
    for _ in range(1):
        # Picked but unused; kept so the directory listing and the random
        # draw still happen exactly as before.
        _picked = random.choice(os.listdir("scanned\\"))

        binary = binarize(cv2.imread("capr2.png"))
        deskewed = textSkewCorrection(binary)
        charSegmentation(deskewed)
Пример #4
0
 def extract(self):
     """Write a feature vector file for every PNG found under the dataset.

     Walks ``self.datasetPath`` two directory levels deep, and for each
     ``*.png`` reads the image, resizes it to 28x28, passes it through
     ``binarize`` and hands the flattened result to
     ``self.writeFeatureVector``.
     """
     for folder in glob(f'{self.datasetPath}\\*'):
         print(f"currently in dire: {folder}")
         for case_dir in glob(folder + '/*'):
             for png_path in glob(case_dir + '/*.png'):
                 # The output name drops the first 7 and the last 4
                 # characters of the image path and appends ".txt".
                 # NOTE(review): the fixed offset 7 presumably strips a
                 # dataset path prefix -- confirm against datasetPath.
                 target = self.outPath + png_path[7:-4] + ".txt"
                 small = cv2.resize(cv2.imread(png_path), (28, 28))
                 binary = binarize(small)
                 self.writeFeatureVector(target, binary.flatten())
Пример #5
0
def main():
    """Display the word and sub-word crops of ten random scanned images.

    Each pass picks a random file from the ``scanned`` directory, runs it
    through ``binarize``, ``textSkewCorrection`` and ``wordSegmentation``,
    and shows every resulting region in an OpenCV window, waiting for a key
    press after each one.
    """
    for _ in range(10):
        picked = random.choice(os.listdir("scanned\\"))
        binary = binarize(cv2.imread("scanned\\" + picked))
        deskewed = textSkewCorrection(binary)

        for word in wordSegmentation(deskewed):
            top, bottom = word['rows']
            left, right = word['columns']
            pieces = word['subwords']

            # A word without sub-words is shown as a single crop.
            if len(pieces) == 0:
                cv2.imshow("sub word ", deskewed[top:bottom, left:right])
                cv2.waitKey(0)
                continue

            # Sub-word bounds are offsets relative to the word's left edge.
            for piece in pieces:
                start, stop = piece
                crop = deskewed[top:bottom, left + start:left + stop]
                cv2.imshow("sub word ", crop)
                cv2.waitKey(0)
Пример #6
0
# Driver script: first split the corpus into matching / mismatching files,
# then build the data set from the files listed in right.txt.
check_on_length("wrong.txt", "right.txt", ending=5000)

with open("right.txt", "r") as f:
    files = f.read().split()

# Deletes ':', '.' and '-' in a single pass (same effect as chained replace).
_PUNCTUATION_TO_DROP = str.maketrans("", "", ":.-")

with open("output.txt", "w", encoding="utf-8") as fw:
    for file in files:
        img = cv2.imread("scanned\\" + file.replace(".txt", "") + ".png")

        # text preprocessing; the handle is closed as soon as the text is
        # read instead of being leaked on every iteration.
        with open("text\\" + file, "r", encoding="utf-8") as ftext:
            text = ftext.read().translate(_PUNCTUATION_TO_DROP)

        # tokenization; each token is logged next to its UTF-8 byte form
        wordsArray = nltk.word_tokenize(text)
        for word in wordsArray:
            # "utf-8" replaces the mistyped codec name "utf8-".
            fw.writelines([word, "  ", str(word.encode("utf-8")), "\n"])

        # image segmentation
        thre = binarize(img)
        rotated = textSkewCorrection(thre)
        wordList = charSegmentation(rotated)

        createDataSet(rotated, wordList, wordsArray, nonConnChOneSide,
                      nonConnChTwoSide, englishName, file)
Пример #7
0
def imageToFeatureVector(imagePath):
    """Return the flattened, binarized 28x28 version of the image at imagePath.

    The image is read with ``cv2.imread``, resized to ``(28, 28)``, passed
    through ``binarize`` and the result's ``flatten()`` is returned.
    """
    resized = cv2.resize(cv2.imread(imagePath), (28, 28))
    return binarize(resized).flatten()