Example #1
def generateProcessedFiles(splittedName):
    # Get string from Image
    fullName = splittedName[0] + '.' + splittedName[1]
    imgText = pt.image_to_string(Image.open(fullName), lang='ron')

    with open('../TextIntermediar/' + splittedName[0] + "text.txt", 'w') as f:
        f.write(imgText)

    # Get bounding boxes
    pt.run_tesseract(fullName,
                     splittedName[0] + 'output',
                     lang='ron',
                     boxes=True,
                     config="hocr")

    # Keep only lines that start with an alphanumeric character or a parenthesis
    with open(splittedName[0] + 'output.box', 'r+') as f:
        buf = ''
        for line in f:
            if line[0].isalnum() or line[0] == '(' or line[0] == ')':
                buf += line
        f.seek(0)
        f.write(buf)
        f.truncate()  # drop any leftover content from the original file

    # To read the coordinates
    boxes = []
    with open(splittedName[0] + 'output.box', 'r') as f:
        reader = csv.reader(f, delimiter=' ')
        for row in reader:
            if (len(row) == 6):
                boxes.append(row)
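
For context, generateProcessedFiles expects the image file name already split on the extension dot. A minimal, hypothetical invocation (assuming the Romanian 'ron' traineddata is installed, that '../TextIntermediar/' exists, and the imports the function body relies on) might look like:

import csv
from PIL import Image
from pytesseract import pytesseract as pt

generateProcessedFiles('scan01.png'.split('.'))   # hypothetical file; splittedName becomes ['scan01', 'png']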
Example #2
def image_coordinates(image_path):
	"This function accepts path to an image, it will use pytesseract and Beautiful Soup to extract out the coordinates of the identified characters in the image "
	
	output_path = image_path[:-4]				
	text_path  =image_path.replace('.jpg', '_char_coord.txt')
	f = open(text_path, 'w')
	
	pytesseract.run_tesseract(image_path, output_path, lang = None, boxes = False, config = "hocr")		#a hocr file with image's name will be created 
	output_path = output_path + ".hocr"
	image_soup = BeautifulSoup(open(output_path))				#opening the .hocr file to extract the bounding box coordinates
	str_image_soup = str(image_soup)							#converting the soup object into string to extract out data
	word_count = str_image_soup.count("word_")					#number of elements with id word_some_number, i.e. the number of recognized words
	
	for i in range(1, word_count + 1):
		
		str_temp = str(image_soup.find_all(id="word_1_"+ str(i)))
		start_point = int(str_temp.find("<em>"))				#index of '<' in <em> tag
		start_point += 4										#start index of the word the <em> tag contains
		end_point = int(str_temp.find("</em>"))					#index of '<' in </em> tag
		word = str_temp[start_point:end_point]				
		
		#if(not (word == '')):												#if word is not empty 
		start_point = int(str_temp.find("bbox"))				
		start_point += 5
		end_point = int(str_temp.find(";"))
		bounding_box = str_temp[start_point:end_point]			
		f.write(word + " " + bounding_box + "\n")
Example #3
def ocr(img, mrz_mode=True, extra_cmdline_params=''):
    """Runs Tesseract on a given image. Writes an intermediate tempfile and then runs the tesseract command on the image.

    This is a simplified modification of image_to_string from PyTesseract, which is adapted to SKImage rather than PIL.

    In principle we could have reimplemented it just as well - there are some apparent bugs in PyTesseract, but it works so far :)

    :param mrz_mode: when this is True (default), Tesseract is configured to recognize MRZs rather than arbitrary text.
                     When False, no specific configuration parameters are passed (and you are free to provide your own via `extra_cmdline_params`).
    :param extra_cmdline_params: extra parameters passed to tesseract. When mrz_mode=True, these are appended to whatever is the
                    "best known" configuration at the moment.
                    "--oem 0" is a parameter you might want to pass. It selects Tesseract's "legacy" OCR engine, which often seems
                    to work better than the new LSTM-based one.
    """
    if img is None or img.shape[-1] == 0:  # Issue #34
        return ''
    input_file_name = '%s.bmp' % _tempnam()
    output_file_name_base = '%s' % _tempnam()
    output_file_name = "%s.txt" % output_file_name_base
    try:
        # Prevent annoying warning about lossy conversion to uint8
        if str(img.dtype).startswith(
                'float') and np.nanmin(img) >= 0 and np.nanmax(img) <= 1:
            img = img.astype(np.float64) * (np.power(2.0, 8) - 1) + 0.499999999
            img = img.astype(np.uint8)
        imwrite(input_file_name, img)

        if mrz_mode:
            # NB: Tesseract 4.0 does not seem to support tessedit_char_whitelist
            config = (
                "--psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789><"
                " -c load_system_dawg=F -c load_freq_dawg=F {}"
            ).format(extra_cmdline_params)
        else:
            config = "{}".format(extra_cmdline_params)

        pytesseract.run_tesseract(input_file_name,
                                  output_file_name_base,
                                  'txt',
                                  lang=None,
                                  config=config)

        if sys.version_info.major == 3:
            f = open(output_file_name, encoding='utf-8')
        else:
            f = open(output_file_name)

        try:
            return f.read().strip()
        finally:
            f.close()
    finally:
        pytesseract.cleanup(input_file_name)
        pytesseract.cleanup(output_file_name)
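
A minimal usage sketch for the function above (assuming scikit-image is installed and the helper names it uses, such as _tempnam and imwrite, are in scope; the file name is hypothetical):

from skimage import io

img = io.imread('mrz_crop.png')   # hypothetical cropped MRZ image
print(ocr(img))                   # mrz_mode=True by default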
Example #4
def perform_ocr():

    pt.run_tesseract('/app/temp/img.jpg',
                     'file_ocr',
                     extension=".jpg",
                     lang='lao')

    txt = ""
    with codecs.open("/app/file_ocr.txt", encoding="utf-8") as f:
        for line in f:
            txt += line  # each line already ends with '\n', so don't append another

    return txt
Example #5
def ocr(img, config=''):
    """Runs Tesseract on a given image. Writes an intermediate tempfile and then runs the tesseract command on the image.

    This is a simplified modification of image_to_string from PyTesseract, which is adapted to SKImage rather than PIL.

    In principle we could have reimplemented it just as well - there are some apparent bugs in PyTesseract (e.g. it
    may lose the NamedTemporaryFile due to its auto-delete behaviour).

    :param config: extra configuration parameters passed on to tesseract.
    """
    input_file_name = '%s.bmp' % pytesseract.tempnam()
    output_file_name_base = '%s' % pytesseract.tempnam()
    output_file_name = "%s.txt" % output_file_name_base
    try:
        imsave(input_file_name, img)
        status, error_string = pytesseract.run_tesseract(input_file_name,
                                                         output_file_name_base,
                                                         lang=None,
                                                         boxes=False,
                                                         config=config)
        if status:
            errors = pytesseract.get_errors(error_string)
            raise pytesseract.TesseractError(status, errors)
        if 'vie' in config:
            f = codecs.open(output_file_name, encoding='utf-8')
        else:
            f = open(output_file_name)
        try:
            return f.read().strip()
        finally:
            f.close()
    finally:
        pytesseract.cleanup(input_file_name)
        pytesseract.cleanup(output_file_name)
Example #6
def convertimagetoalto(imagepaths, outputfilename, basename):
    # Run Tesseract on each page image, writing one ALTO XML file per page
    for index, imagepath in enumerate(imagepaths):
        pytesseract.run_tesseract(
            imagepath,
            output_filename_base=outputfilename + '_' + str(index),
            lang='eng+hin',
            extension='xml',
            config='alto --oem 1')
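
The ALTO file written for each page could then be parsed the same way Example #19 below does; a minimal, hypothetical sketch:

import xmltodict

with open('pages_0.xml', 'r') as f:   # assuming the function above was called with outputfilename='pages'
    data = xmltodict.parse(f.read())
blocks = data['alto']['Layout']['Page']['PrintSpace']['TextBlock']   # same path Example #19 uses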
Example #7
def ocr(img, mrz_mode=True, extra_cmdline_params=''):
    """Runs Tesseract on a given image. Writes an intermediate tempfile and then runs the tesseract command on the image.

    This is a simplified modification of image_to_string from PyTesseract, which is adapted to SKImage rather than PIL.

    In principle we could have reimplemented it just as well - there are some apparent bugs in PyTesseract, but it works so far :)

    :param mrz_mode: when this is True (default), Tesseract is configured to recognize MRZs rather than arbitrary text.
    :param extra_cmdline_params: extra command-line parameters passed on to tesseract.
    """
    input_file_name = '%s.bmp' % _tempnam()
    output_file_name_base = '%s' % _tempnam()
    output_file_name = "%s.txt" % output_file_name_base
    try:
        imsave(input_file_name, img)

        if mrz_mode:
            # NB: Tesseract 4.0 does not seem to support tessedit_char_whitelist
            config = "--psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789>< -c load_system_dawg=F -c load_freq_dawg=F {}".format(
                extra_cmdline_params)
        else:
            config = ""

        pytesseract.run_tesseract(input_file_name,
                                  output_file_name_base,
                                  'txt',
                                  lang=None,
                                  config=config)

        if sys.version_info.major == 3:
            f = open(output_file_name, encoding='utf-8')
        else:
            f = open(output_file_name)

        try:
            return f.read().strip()
        finally:
            f.close()
    finally:
        pytesseract.cleanup(input_file_name)
        pytesseract.cleanup(output_file_name)
Example #8
def ocr(img, mrz_mode=True):
    """Runs Tesseract on a given image. Writes an intermediate tempfile and then runs the tesseract command on the image.

    This is a simplified modification of image_to_string from PyTesseract, which is adapted to SKImage rather than PIL.

    In principle we could have reimplemented it just as well - there are some apparent bugs in PyTesseract (e.g. it
    may lose the NamedTemporaryFile due to its auto-delete behaviour).

    :param mrz_mode: when this is True (default) the tesseract is configured to recognize MRZs rather than arbitrary texts.
    """
    input_file_name = '%s.bmp' % pytesseract.tempnam()
    output_file_name_base = '%s' % pytesseract.tempnam()
    output_file_name = "%s.txt" % output_file_name_base
    try:
        imsave(input_file_name, img)

        if mrz_mode:
            config = "-psm 6 -c tessedit_char_whitelist=ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789>< -c load_system_dawg=F -c load_freq_dawg=F"
        else:
            config = None

        status, error_string = pytesseract.run_tesseract(input_file_name,
                                                         output_file_name_base,
                                                         lang=None,
                                                         boxes=False,
                                                         config=config)
        if status:
            errors = pytesseract.get_errors(error_string)
            raise pytesseract.TesseractError(status, errors)
        f = open(output_file_name)
        try:
            return f.read().strip()
        finally:
            f.close()
    finally:
        pytesseract.cleanup(input_file_name)
        pytesseract.cleanup(output_file_name)
Example #9
def save_and_ocr(myScreenshot, part):
    global after_details
    global i
    myScreenshot = change_colors(myScreenshot)
    if part == "0":
        myScreenshot = advance_contour_filtering(myScreenshot, "")
    cv2.imwrite('output/filename' + part + '_' + str(i) + '.png', myScreenshot)
    if part == "0":
        datails_string = pytesseract.image_to_string(
            Image.open('output/filename' + part + '_' + str(i) + '.png'),
            config='-c load_system_dawg=false load_freq_dawg=false',
            lang="fra+eng").lower()
        if any(x in datails_string for x in ["detail", "detal", "detai"]):
            after_details = True
            myScreenshot = advance_contour_filtering(myScreenshot, "detail")
            cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                        myScreenshot)
        if any(x in datails_string for x in ["page"]):
            if after_details == False:
                myScreenshot = advance_contour_filtering(myScreenshot, "group")
            else:
                myScreenshot = advance_contour_filtering(myScreenshot, "page")
            cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                        myScreenshot)

        if after_details == False and any(x in datails_string
                                          for x in ["group"]):
            if datails_string.split('\n', 1)[0].endswith(':'):
                myScreenshot = advance_contour_filtering(myScreenshot, "group")
                cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                            myScreenshot)
        if datails_string == "":
            myScreenshot = advance_contour_filtering(myScreenshot, "empty")
            cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                        myScreenshot)
            datails_string = pytesseract.image_to_string(
                Image.open('output/filename' + part + '_' + str(i) + '.png'),
                config='-c load_system_dawg=false load_freq_dawg=false',
                lang="fra+eng").lower()
            if datails_string == "" and i > 1:
                while not pyautogui.locateOnScreen('break_line.png',
                                                   region=(230, 197, 21, 100)):
                    pyautogui.press('up')
                top3 = pyautogui.locateOnScreen('break_line.png',
                                                region=(230, 197, 21, 100))[1]
                myScreenshot = pyautogui.screenshot(region=(5, top3, 145, 50))
                myScreenshot = cv2.cvtColor(np.array(myScreenshot),
                                            cv2.COLOR_RGB2BGR)
                save_and_ocr(myScreenshot, part)
                while not pyautogui.locateOnScreen('break_line.png',
                                                   region=(230, 900, 21, 100)):
                    pyautogui.press('down')
                return

        if after_details and check_if_last_is_letter(datails_string) == False:
            myScreenshot = advance_contour_filtering(myScreenshot, "footer1")
            cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                        myScreenshot)
            datails_string = pytesseract.image_to_string(
                Image.open('output/filename' + part + '_' + str(i) + '.png'),
                config='-c load_system_dawg=false load_freq_dawg=false',
                lang="fra+eng").lower()
            if check_if_last_is_letter(datails_string) == False:
                myScreenshot = advance_contour_filtering(
                    myScreenshot, "footer2")
                cv2.imwrite('output/filename' + part + '_' + str(i) + '.png',
                            myScreenshot)
                datails_string = pytesseract.image_to_string(
                    Image.open('output/filename' + part + '_' + str(i) +
                               '.png'),
                    config='-c load_system_dawg=false load_freq_dawg=false',
                    lang="fra+eng").lower()
                if check_if_last_is_letter(datails_string) == False:
                    myScreenshot = advance_contour_filtering(
                        myScreenshot, "footer3")
                    cv2.imwrite(
                        'output/filename' + part + '_' + str(i) + '.png',
                        myScreenshot)

    if part == "2":
        pytesseract.run_tesseract(
            'output/filename' + part + '_' + str(i) + '.png',
            'output/filename' + part + '_' + str(i),
            lang="fra+eng",
            extension='hocr',
            config='-c load_system_dawg=false load_freq_dawg=false')
        files_were_deleted = False
        hocr2html.main('output/filename' + part + '_' + str(i))
        file_name = 'output/filename' + part + '_' + str(i) + ".html"
        txt_file = open(file_name, 'r', encoding="utf8")
        txt_datas = txt_file.readlines()
        with open(file_name, 'w', encoding="utf8") as f:
            for j, txt_data in enumerate(txt_datas):
                f.write("%s\n" % txt_data.replace('_O', '_0'))
    else:
        pytesseract.run_tesseract(
            'output/filename' + part + '_' + str(i) + '.png',
            'output/filename' + part + '_' + str(i),
            lang="fra+eng",
            extension='',
            config='-c load_system_dawg=false load_freq_dawg=false')
        files_were_deleted = clean_file('output/filename' + part + '_' +
                                        str(i) + '.txt')
    if not files_were_deleted:
        png_index = i
    else:
        png_index = i + 1

    os.remove('output/filename' + part + '_' + str(png_index) + '.png')
Example #10
                cv.putText(text_img,
                           str(idx), (x, y),
                           color=1,
                           fontFace=cv.FONT_HERSHEY_PLAIN,
                           fontScale=1)
                padding_y, padding_x = (y - 5 if y > 5 else y), (x - 5 if x > 5 else x)
                save_img(
                    normalize_img(img[padding_y:y + h, padding_x:x + w], 0,
                                  255),
                    "{}/{}/tess/{}__{}.png".format(output_path, seg,
                                                   img_out_name, idx))
                run_tesseract(
                    input_filename="{}/{}/tess/{}__{}.png".format(
                        output_path, seg, img_out_name, idx),
                    output_filename_base="{}/{}/tess/text/{}__{}".format(
                        output_path, seg, img_out_name, idx),
                    extension="png",
                    lang=args.lang,
                    config=tesseract_config,
                    nice=0)
            text_cont += 1
            cv.rectangle(text_img, (x, y), (x + w, y + h),
                         color=1,
                         thickness=1)
    save_img(img7c, "{}/{}/{}_step_7.pbm".format(output_path, seg,
                                                 img_out_name))
    print("Found {} text parts".format(text_cont))

    black_pixels = np.where(text_img == BLACK)
    white_pixels = np.where(text_img == WHITE)
    text_img[black_pixels] = WHITE
Example #11
	def YaxisData(self):
		try:
			pt.run_tesseract(yAxisFile, '/tmp/y_temp', None, True, '-psm 3 nobatch digits')
			filename = '/tmp/y_temp.box'
			yData = open(filename , "r")
			d = yData.read()
			yData.close()
			data_lines = d.split('\n')
			del d
			data = []
			for line in data_lines:
				l = []
				x = line.split(" ")
				if(len(x) == 6):
					if(x[0].isdigit() or x[0] == '.' or x[0] == '-'):
						l.append(0)
					else:
						l.append(1)

					l.append(x[0])
					l.append(int(x[1]))
					l.append(int(x[2]))
					l.append(int(x[3]))
					l.append(int(x[4]))
					data.append(l)

			# find the maximum character height (top y minus bottom y); it serves as the grouping threshold
			max = 0
			for line in data:
				if(abs(line[5] - line[3]) > max):
					max = abs(line[5] - line[3])

			l = []
			new_data = []
			for i in range(len(data) - 1):
				l.append(data[i])
				if(abs(data[i][3]-data[i+1][5]) > max):
					new_data.append(list(l))
					l[:] = []

			l.append(data[-1])
			new_data.append(list(l))

			for data in list(new_data):
				for d in data:
					if(d[0] == 1):
						new_data.remove(data)
						break

			new_new_data = []
			for data in new_data:
				l = []
				s = ""
				for d in data:
					s = s + d[1]
				l.append(float(s))
				avg = (data[-1][5] + data[-1][3])/2
				l.append(avg)
				new_new_data.append(l)

			return new_new_data
		except:
			return []
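
For reference, each line of a Tesseract .box file has the form "char left bottom right top page", with coordinates measured from the bottom-left corner of the image; that is why the grouping logic above compares the bottom and top y-coordinates (indices 3 and 5 of each parsed row) to measure character height. A small, self-contained illustration with a made-up line:

line = "7 112 530 126 548 0"              # hypothetical .box line: char x1 y1 x2 y2 page
char, x1, y1, x2, y2, page = line.split(" ")
print(char, int(y2) - int(y1))            # character height, the quantity used as the grouping threshold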
Example #12
	def XaxisData(self):
		try:
			pt.run_tesseract(xAxisFile, '/tmp/x_temp', None, True, '-psm 7 nobatch digits')
			filename = '/tmp/x_temp.box'
			xData = open(filename , "r")
			d = xData.read()
			xData.close()
			data_lines = d.split('\n')
			del d
			data = []
			for line in data_lines:
				l = []
				x = line.split(" ")
				if(len(x) == 6):
					if(x[0].isdigit() or x[0] == '.' or x[0] == '-'):
						l.append(0)
					else:
						l.append(1)

					l.append(x[0])
					l.append(int(x[1]))
					l.append(int(x[2]))
					l.append(int(x[3]))
					l.append(int(x[4]))
					data.append(l)

			# a leading 0 flag means the character is numeric (a digit, '.' or '-')
			# now group the characters into numbers:
			# go through all the character widths and find the maximum width (the grouping threshold)
			max = 0
			for line in data:
				if(abs(line[4] - line[2]) > max):
					max = abs(line[4] - line[2])

			l = []
			new_data = []
			for i in range(len(data) - 1):
				l.append(data[i])
				if(abs(data[i][4]-data[i+1][2]) > max):
					new_data.append(list(l))
					l[:] = []

			l.append(data[-1])
			new_data.append(list(l))

			for data in list(new_data):
				for d in data:
					if(d[0] == 1):
						new_data.remove(data)
						break

			new_new_data = []
			for data in new_data:
				l = []
				s = ""
				for d in data:
					s = s + d[1]
				l.append(float(s))
				if(len(data)%2 == 0):
					avg = (data[(len(data)-1)/2][2] + data[(len(data)-1)/2][4])/2
					l.append(avg)
				else:
					avg = (data[(len(data))/2 -1][2] + data[(len(data)-1)/2][4])/2
					l.append(avg)
				new_new_data.append(l)

			return new_new_data
		except:
			return []
Example #13
import os
import subprocess

from pytesseract import pytesseract

test=int(subprocess.Popen("ls /tmp/test/ | grep .jpg | wc -l", shell=True, stdout=subprocess.PIPE).stdout.read())

for i in range(0,test):
	vjpg = "/tmp/test/some_%d.jpg"%i
	vosc = "cp output.hocr /tmp/test/some_%d.hocr"%i
	pytesseract.run_tesseract(vjpg, 'output', lang=None, boxes=False, config="hocr")
	os.system(vosc)
Example #14
import sys
from pytesseract import pytesseract
#pytesseract.run_tesseract(sys.argv[1], 'output', lang=None, boxes=False, config="hocr")
print(sys.argv[1])
pytesseract.run_tesseract(sys.argv[1], 'output', 'box', lang='jpn', config="hocr")
Example #15
def get_coordinates(element_arr):
    """
    Get the coordinates of the bounding box

    Args:
        element_arr(list): List of bs4.element.Tag objects

    Returns:
        coordinates(list): 2D array containing coordinates in the form [x0, y0, x1, y1]
    """
    title_atrs = [element["title"].split(";") for element in element_arr]
    coordinates = [atr_value[0].split(" ")[1:] for atr_value in title_atrs]
    coordinates = [[int(x) for x in coordinate_arr] for coordinate_arr in coordinates]
    return coordinates

filename = 'test.png'

pt.run_tesseract(filename, 'output', lang="yid", extension="box", config="hocr")

hocr = open("output.hocr", "r", encoding="utf-8").read()

#extract coordinate information from hocr
soup = BeautifulSoup(hocr, "html.parser")
words = soup.find_all('span',class_='ocrx_word')
word_coordinates = get_coordinates(words)
lines = soup.find_all('span',class_='ocr_line')
line_coordinates = get_coordinates(lines)
paragraphs = soup.find_all('p',class_='ocr_par')
paragraph_coordinates = get_coordinates(paragraphs)


# Draw the bounding box
img = cv2.imread(filename)
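
The snippet ends before anything is drawn; a minimal continuation using the extracted coordinates might look like this (hOCR bounding boxes are in top-left image coordinates, so no vertical flip is needed; the output file name is hypothetical):

for x0, y0, x1, y1 in word_coordinates:
    cv2.rectangle(img, (x0, y0), (x1, y1), (255, 0, 0), 2)
cv2.imwrite('result_boxes.png', img)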
Example #16
    def XaxisData(self):
        try:
            pt.run_tesseract(xAxisFile, '/tmp/x_temp', None, True,
                             '-psm 7 nobatch digits')
            filename = '/tmp/x_temp.box'
            xData = open(filename, "r")
            d = xData.read()
            xData.close()
            data_lines = d.split('\n')
            del d
            data = []
            for line in data_lines:
                l = []
                x = line.split(" ")
                if (len(x) == 6):
                    if (x[0].isdigit() or x[0] == '.' or x[0] == '-'):
                        l.append(0)
                    else:
                        l.append(1)

                    l.append(x[0])
                    l.append(int(x[1]))
                    l.append(int(x[2]))
                    l.append(int(x[3]))
                    l.append(int(x[4]))
                    data.append(l)

            # a leading 0 flag means the character is numeric (a digit, '.' or '-')
            # now group the characters into numbers:
            # go through all the character widths and find the maximum width (the grouping threshold)
            max = 0
            for line in data:
                if (abs(line[4] - line[2]) > max):
                    max = abs(line[4] - line[2])

            l = []
            new_data = []
            for i in range(len(data) - 1):
                l.append(data[i])
                if (abs(data[i][4] - data[i + 1][2]) > max):
                    new_data.append(list(l))
                    l[:] = []

            l.append(data[-1])
            new_data.append(list(l))

            for data in list(new_data):
                for d in data:
                    if (d[0] == 1):
                        new_data.remove(data)
                        break

            new_new_data = []
            for data in new_data:
                l = []
                s = ""
                for d in data:
                    s = s + d[1]
                l.append(float(s))
                # integer-divide when computing list indices
                if (len(data) % 2 == 0):
                    avg = (data[(len(data) - 1) // 2][2] +
                           data[(len(data) - 1) // 2][4]) / 2
                    l.append(avg)
                else:
                    avg = (data[len(data) // 2 - 1][2] +
                           data[(len(data) - 1) // 2][4]) / 2
                    l.append(avg)
                new_new_data.append(l)

            return new_new_data
        except:
            return []
Example #17
def upload():
    #below checks should be done in js, but just in case
    if "upload" not in request.files:    
        return json.dumps({"status": "Upload not found"})
    f = request.files["upload"]    
    if not allowed_file(f.filename):
        return json.dumps({"status": "Illegal filename"})
    filename = secure_filename(f.filename)
    f.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
    mask = analyze(filename)
    print "starting tesseract"
    hocr = 'pytesseractTemp'
    text = pytesseract.run_tesseract(UPLOAD_FOLDER + '/' + filename, UPLOAD_FOLDER + '/' + hocr, lang='eng', boxes=True, config="hocr")
    boxes = []
    with open(UPLOAD_FOLDER + '/' + hocr + '.box', 'rb') as f:
        reader = csv.reader(f, delimiter = ' ')
        for row in reader:
            if(len(row)==6):
                boxes.append(row)
    boxes = pytesseract.image_to_string(Image.open(UPLOAD_FOLDER + '/' + filename), lang="eng", boxes = True).split('\n')
    text = pytesseract.image_to_string(Image.open(UPLOAD_FOLDER + '/' + filename), lang="eng")

    for b in boxes:
        #marking box        
        be = b.split()
        if len(be) == 6:
            print "marking box"
            print b
            cx = int(be[1]) + int(be[3]) / 2
            cy = int(be[2]) + int(be[4]) / 2
            cv2.circle(mask, (cx, cy), 7, (0, 0, 0), -1)

    cv2.imshow('found circles', mask)
    cv2.waitKey(0)
    
    
    ret = {"status": "success"}
    ret.update(processText(text, boxes))
    #mark it out
    for b in ret["boxes"]:
        #marking box
        
        print "marking box"
        print b
        cx = int(b[1]) + int(b[3]) / 2
        cy = int(b[2]) + int(b[4]) / 2
        cv2.circle(mask, (cx, cy), 7, (0, 0, 0), -1)

    cv2.imshow('found circles', mask)
    cv2.waitKey(0)
    
    #now detect the circle (img should be blurred out)
    #edges = cv2.Canny(mask.copy(), 0, 255)
    """
    blurred = cv2.blur(mask, (5, 5))
    plt.subplot(121),plt.imshow(mask),plt.title('Original')
    plt.xticks([]), plt.yticks([])
    plt.subplot(122),plt.imshow(blurred),plt.title('Blurred')
    plt.xticks([]), plt.yticks([])
    plt.show()
    
    edges = cv2.medianBlur(mask, 5)
    cimg = cv2.cvtColor(edges,cv2.COLOR_GRAY2BGR)

    circles = cv2.HoughCircles(edges,cv2.HOUGH_GRADIENT,1,30,
                            param1=50,param2=30,minRadius=10,maxRadius=200)
    circles = np.uint16(np.around(circles))
    for i in circles[0,:]:
        # draw the outer circle
        cv2.circle(cimg,(i[0],i[1]),i[2],(0,255,0),2)
        # draw the center of the circle
        cv2.circle(cimg,(i[0],i[1]),2,(0,0,255),3)
    cv2.imshow('detected circles',cimg)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    """
    """
    contours = cv2.findContours(mask.copy(), cv2.RETR_EXTERNAL,
                            cv2.CHAIN_APPROX_SIMPLE)
    contours=contours[1]
    for cnt in contours:
        M = cv2.moments(cnt)
        cX = int(M["m10"] / M["m00"])
	cY = int(M["m01"] / M["m00"])
 
	# draw the contour and center of the shape on the image
	cv2.drawContours(mask, [cnt], -1, (0, 255, 0), 2)
	cv2.circle(mask, (cX, cY), 7, (255, 255, 255), -1)
	cv2.putText(mask, "center", (cX - 20, cY - 20),
		cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)
 
    plt.imshow(mask, cmap = 'gray', interpolation = 'bicubic')
    plt.xticks([]), plt.yticks([])  # to hide tick values on X and Y axis
    plt.show()
"""
    #cv2.imshow("Image", mask)
    #cv2.waitKey(0)
    
    return json.dumps(ret)
Example #18
import csv
import cv2

from pytesseract import pytesseract as pt

img_path = './images/PCE28-.jpg'

pt.run_tesseract(img_path, 'output', lang=None, boxes=True, config='hocr')
boxes = []
chars = []

with open('output.box', encoding="utf8") as f:
    reader = csv.reader(f, delimiter=' ')
    for row in reader:
        if len(row) == 6:
            boxes.append(row)

img = cv2.imread(img_path)
h, w = img.shape[:2]
for b in boxes:
    cv2.rectangle(img, (int(b[1]), h-int(b[2])), (int(b[3]), h - int(b[4])), (255, 0, 0), 2)
    cv2.putText(img, b[0], (int(b[1]), h - int(b[2])), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)
cv2.imshow("result", img)
cv2.imwrite("result.jpg", img)
cv2.waitKey(0)
Example #19
from PIL import Image
from pytesseract import pytesseract
import argparse
import xmltodict
import json
import cv2
import os
import requests
from puttext import puttext
from nltk.tokenize import sent_tokenize
import math

filename = '../upload/table1.png'
o_filename = '../upload/table2.png'
conf_data = pytesseract.run_tesseract(
    filename, output_filename_base='test', lang='eng+hin',
    extension='xml', config='alto --oem 1')
f_hin = open("test.xml", "r")
# print(xmltodict.parse(f_hin.read()))
data = xmltodict.parse(f_hin.read())
blocks = data['alto']['Layout']['Page']['PrintSpace']['TextBlock']
for block in blocks:
    textline = block['TextLine']
    text = ''
    height = 0
    x = block['@HPOS']
    y = block['@VPOS']
    word_count = 0
    no_lines = 0
    line_height = 0
    previous_position = 0
    previous_position_x = 0
Example #20
    def post(self):
        """
        Retrieve corresponding value from the test report.
        ---
        tags:
            - OCR
        parameters:
            - in: body
              name: image
              type: string
              required: true
              example: R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
            - in: body
              name: search_terms
              type: array
              items:
               type: string
              required: true
              example: ["fasting","blood","sugar"]
        responses:
            200:
                description: A corresponding value has been extracted successfully
                properties:
                    extracted_value:
                        type: string
                        description: Retrieved value from the report
        """
        req = request.get_json(force=True)
        if not req.get("image") or not req.get("search_terms"):
            return "Invalid image or search terms"
        session_filename = utils.get_current_time()
        reader_module_path = os.path.dirname(os.path.realpath(__file__))
        origin_img_base64 = req["image"]
        origin_img_fp = os.path.join(reader_module_path,
                                     'img-origin/' + session_filename + '.jpg')

        origin_img = Image.open(BytesIO(base64.b64decode(origin_img_base64)))
        origin_img.save(origin_img_fp)
        hocr_filepath = os.path.join(reader_module_path,
                                     'hocr-files/' + session_filename)

        pytesseract.run_tesseract(
            origin_img_fp,
            hocr_filepath,
            extension="box",
            lang=None,
            config="hocr --psm 7 tessedit_char_whitelist=0123456789")

        search_terms = tuple(req["search_terms"])
        # print(search_terms)
        try:
            hocr_result = hocr_search.parse_hocr(search_terms,
                                                 hocr_filepath + '.hocr')
            img_width, img_height = origin_img.size
            cropped_image = origin_img.crop(
                utils.calc_result_box(hocr_result, img_width))
            cropped_img_fp = os.path.join(reader_module_path, 'cropped-imgs/')
            cropped_image.save(cropped_img_fp + session_filename + ".jpg",
                               "jpeg")
            response = google_vision.get_value(cropped_img_fp +
                                               session_filename + ".jpg")
            res_detail = {"extracted_value": response}
            return (res_detail)
        except Exception as e:
            error_detail = {"error": e}
            abort(500, message="Search terms did not match the document")
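
A minimal client-side sketch of calling this endpoint, following the Swagger block above (the URL, port and route are hypothetical):

import base64
import requests

with open('report.jpg', 'rb') as f:   # hypothetical test-report image
    payload = {
        "image": base64.b64encode(f.read()).decode('ascii'),
        "search_terms": ["fasting", "blood", "sugar"],
    }
resp = requests.post('http://localhost:5000/ocr', json=payload)   # hypothetical route
print(resp.json())   # expected to contain "extracted_value"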
Example #21
    def YaxisData(self):
        try:
            pt.run_tesseract(yAxisFile, '/tmp/y_temp', None, True,
                             '-psm 3 nobatch digits')
            filename = '/tmp/y_temp.box'
            yData = open(filename, "r")
            d = yData.read()
            yData.close()
            data_lines = d.split('\n')
            del d
            data = []
            for line in data_lines:
                l = []
                x = line.split(" ")
                if (len(x) == 6):
                    if (x[0].isdigit() or x[0] == '.' or x[0] == '-'):
                        l.append(0)
                    else:
                        l.append(1)

                    l.append(x[0])
                    l.append(int(x[1]))
                    l.append(int(x[2]))
                    l.append(int(x[3]))
                    l.append(int(x[4]))
                    data.append(l)

            # find the maximum character height (top y minus bottom y); it serves as the grouping threshold
            max = 0
            for line in data:
                if (abs(line[5] - line[3]) > max):
                    max = abs(line[5] - line[3])

            l = []
            new_data = []
            for i in range(len(data) - 1):
                l.append(data[i])
                if (abs(data[i][3] - data[i + 1][5]) > max):
                    new_data.append(list(l))
                    l[:] = []

            l.append(data[-1])
            new_data.append(list(l))

            for data in list(new_data):
                for d in data:
                    if (d[0] == 1):
                        new_data.remove(data)
                        break

            new_new_data = []
            for data in new_data:
                l = []
                s = ""
                for d in data:
                    s = s + d[1]
                l.append(float(s))
                avg = (data[-1][5] + data[-1][3]) / 2
                l.append(avg)
                new_new_data.append(l)

            return new_new_data
        except:
            return []
Example #22
def job():
    sql = db.session.query(info_table).filter(
        info_table.status == 'Pending').first()
    sql.status = 'Working'
    fileid = sql.id
    savename = sql.savenamedb
    keyworddb = sql.keyworddb
    outfilename = sql.outputfiledb
    db.session.commit()
    with open(outfilename + '.txt', 'rb') as fp:
        recognized_text = pickle.load(fp)
    filepath = 'remaining/' + savename
    content_text = ''
    # Enter the keyword you want to search for
    keyword = keyworddb
    key = keyword.split(' ', 1)[0]
    result = [k for k in recognized_text if k.startswith(keyword)]
    for l in result:
        str1 = ''.join(l)
        page = recognized_text.index(str1)
        print("page is:", page + 1)
        pagefound = page + 1
        break

    #Page Number loop for keyword
    print('Page Number loop for keyword')
    ls = []
    for x in range(1, len(recognized_text) + 1):
        mat = "Page " + str(x) + " of " + key
        if res.find(mat) != -1:
            ls.append(x)
        else:
            if ls == []:
                #print("page of "+keyword+" is: "+str(pagefound))
                ls.append(pagefound)
                break
            else:
                for i in range(0, len(ls)):
                    ls[i] = int(ls[i])
                print("Number of pages for " + keyword + " is " + str(max(ls)))
                break

    ls2 = []
    if len(ls) > 1:
        for z in range(max(ls)):
            fin = z + pagefound
            z1 = str(fin)
            ls2.append(z1)
        for p1 in ls2:
            print("page of " + keyword + " is: " + p1)
    else:
        ls2 = ls
        for p1 in ls2:
            print("page of " + keyword + " is: " + str(p1))
    # Save pdf to jpg page-wise
    print('Save pdf to jpg page-wise')
    fname = os.path.splitext(os.path.basename(filepath))[0]
    pages = convert_from_path(
        filepath, 843)  # Resolution can be changed according to your use
    i = 1
    for page in pages:
        savepath = 'Images/' + fname + '_' + str(i) + '.jpeg'
        page.save(savepath, 'JPEG')
        i = i + 1

    #Save in contentdb
    print('Save in contentdb')
    for i in ls2:
        content_text = content_text + "|||" + recognized_text[int(i) - 1]
    sql = info_table.query.filter_by(id=fileid).first()
    sql.contentdb = content_text

    #Convert selected page from jpg to hOCR pdf
    print('Convert selected page from jpg to hOCR pdf')
    for i in ls2:
        currentpagepath = 'Images/' + fname + '_' + str(i) + '.jpeg'
        pdf_name = 'PDFs/' + fname + '_' + str(i)
        pytesseract.run_tesseract(currentpagepath,
                                  pdf_name,
                                  lang=None,
                                  config="hocr",
                                  extension='pdf')

    # Merge all pdfs into one
    print('Merge all pdfs into one')
    pdf_list = []
    for i in ls2:
        pdf_name = 'PDFs/' + fname + '_' + str(i) + '.pdf'
        pdf_list.append(pdf_name)

    pdf_output_name = key + '.pdf'
    merger = PdfFileMerger()
    for pdf in pdf_list:
        merger.append(open(pdf, 'rb'))
    with open(pdf_output_name, 'wb') as fout:
        merger.write(fout)

    #Parse the hOCR for Tables and save it to xlsx
    print('Parse the hOCR for Tables and save it to xlsx')
    excel_output_name = key + '.xlsx'
    c.xlsx(pdf_output_name, excel_output_name)

    sql.status = 'Completed'
    print('Status : completed And updating Jsondb')
    json_string = {}
    json_string['json_data'] = []
    json_string['json_data'].append({
        'taskid': sql.id,
        'filename': sql.filenamedb,
        'doctypeid': sql.doctype,
        'accuracy': sql.accuracy,
        'keywordsearched': sql.keyworddb,
        'content': sql.contentdb
    })
    sql.jsondb = json_string
    db.session.commit()
Example #23
import sys
import csv
import cv2
from pytesseract import pytesseract as pt

# extension ?
#pt.run_tesseract(sys.argv[1], 'output', lang=None, boxes=True, config="hocr")
pt.run_tesseract(sys.argv[1], 'output', lang=None, config="hocr")

# To read the coordinates
boxes = []
with open('output.box', 'rb') as f:
    reader = csv.reader(f, delimiter=' ')
    for row in reader:
        if (len(row) == 6):
            boxes.append(row)

# Draw the bounding box
img = cv2.imread('bw.png')
h, w, _ = img.shape
for b in boxes:
    img = cv2.rectangle(img, (int(b[1]), h - int(b[2])),
                        (int(b[3]), h - int(b[4])), (255, 0, 0), 2)

cv2.imshow('output', img)
cv2.waitKey(0)