Пример #1
0
def segment_character2(img_gray):
    """ Display the stages of an RLSA-based segmentation of an image

    The input is copied, binarised with an inverted Otsu threshold, passed
    through ``remove_noise`` and opened with a 2x1 kernel. The opened image
    is inverted, smoothed with ``rlsa.rlsa`` (both directions, value 10) and
    inverted back; every external contour taller than 3 pixels is then
    outlined on the copy of the input.

    Three windows are shown with ``cv2.imshow`` ('erosion', 'res', 'final').
    This function does not call ``cv2.waitKey`` itself.

    params:
        img_gray::ndarray:~ grayscale image

    Returns None
    """
    canvas = img_gray.copy()
    threshold_mode = cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    _, binary = cv2.threshold(canvas, 0, 255, threshold_mode)

    binary = remove_noise(binary, 3)

    # Morphological opening with a two-row, one-column kernel.
    column_kernel = np.ones((2, 1))
    opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, column_kernel)
    cv2.imshow('erosion', opened)

    # RLSA runs on the inverted image; its output is inverted back afterwards.
    smoothed = rlsa.rlsa(cv2.subtract(255, opened), True, True, 10)
    res = cv2.subtract(255, smoothed)
    cv2.imshow('res', res)

    contours, _ = cv2.findContours(res, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_NONE)

    for x, y, w, h in map(cv2.boundingRect, contours):
        if h <= 3:
            continue
        cv2.rectangle(canvas, (x, y), (x + w, y + h), (0, 0, 0), 1)

    cv2.imshow('final', canvas)
def get_rlsa_output(image):
    """
    Run RLSA on a binary image and return the bitwise-inverted result.

    ``rlsa.rlsa`` is called with the arguments ``(image, 1, 0, 50)``; the
    array it returns is inverted with ``cv2.bitwise_not`` before being
    handed back to the caller.
    """
    smoothed = rlsa.rlsa(image, 1, 0, 50)
    return cv2.bitwise_not(smoothed)
Пример #3
0
def extract_title(img):
    """Extract the title text of a document image using RLSA and OCR.

    Pipeline: binarise the image, find the external contours of the dark
    pixels, keep the contours more than twice as tall as the average one,
    merge them horizontally with RLSA, and treat every merged region wider
    than 60% of the image as a title. The title regions are copied onto a
    blank canvas which is passed to ``pytesseract.image_to_string``.

    Args:
        img: path of the image file, passed to ``cv2.imread``.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the
        canvas that holds only the title regions.

    Side effects:
        Writes ``binary.png``, ``contours.png``, ``filter.png``,
        ``rlsah.png`` and ``title.png`` to the current working directory.
    """
    image = cv2.imread(img)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 100, 200,
                              cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    cv2.imwrite('binary.png', binary)

    # `binary` holds 0 and 200 (the maxval above), so `~binary` would hold
    # 255 and 55 -- both non-zero, and findContours treats every non-zero
    # pixel as foreground. Build an explicit 0/255 mask of the dark pixels.
    ink = np.where(binary == 0, 255, 0).astype("uint8")
    # [-2] selects the contour list for both the 2-value and 3-value
    # return conventions of cv2.findContours.
    contours = cv2.findContours(ink, cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]
    boxes = [cv2.boundingRect(contour) for contour in contours]

    # Draw the debug rectangles on a copy so they do not end up inside the
    # title crop that is sent to OCR.
    annotated = image.copy()
    for x, y, w, h in boxes:
        cv2.rectangle(annotated, (x, y), (x + w, y + h), (0, 200, 0), 1)
    cv2.imwrite('contours.png', annotated)

    # Height heuristic: keep only contours taller than twice the average.
    mask = np.ones(image.shape[:2], dtype="uint8") * 200
    heights = [h for _, _, _, h in boxes]
    avgheight = sum(heights) / len(heights) if heights else 0
    for contour, (_, _, _, h) in zip(contours, boxes):
        if h > 2 * avgheight:
            cv2.drawContours(mask, [contour], -1, 0, -1)
    cv2.imwrite('filter.png', mask)

    rows, cols = mask.shape
    value = max(math.ceil(rows / 100), math.ceil(cols / 100)) + 20  # heuristic
    mask = rlsa.rlsa(mask, True, False, value)  # horizontal RLSA
    cv2.imwrite('rlsah.png', mask)

    # Same non-zero pitfall as above: the mask holds 0 and 200.
    merged = np.where(mask == 0, 255, 0).astype("uint8")
    contours = cv2.findContours(merged, cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]

    mask2 = np.ones(image.shape, dtype="uint8") * 200  # blank 3 layer image
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 0.60 * image.shape[1]:  # width heuristic
            # copy the title region onto the blank canvas ...
            mask2[y:y + h, x:x + w] = image[y:y + h, x:x + w]
            # ... and blank it out on the working image
            image[y:y + h, x:x + w] = 200
    cv2.imwrite('title.png', mask2)

    # No HighGUI window is opened by this function, so there is nothing to
    # wait for or destroy before running OCR.
    return pytesseract.image_to_string(Image.fromarray(mask2))
Пример #4
0
def get_roi(image):
    """ Return the bounding rectangles and crops of the regions of interest in an image

        The image is binarised with Otsu's threshold, smoothed with RLSA
        horizontally (value 16) and vertically (value 8), the two results are
        combined with a bitwise AND, inverted, dilated twice with a 3x3 kernel
        and split into connected components. Each component (label 0, the
        background, is skipped) yields one region.

        Argument image: ndarray - the image (it is passed straight to
                        cv2.threshold with THRESH_OTSU, so presumably a
                        single-channel 8-bit image - confirm against callers)
        Returns roi_coordinates: list<list> - list of [x0, x1, y0, y1] lists,
                                 the inclusive pixel bounds of each component
                roi: list<ndarray> - list of crops of `image`, one per component
    """

    _, bin_image = cv2.threshold(image, 0, 255,
                                 cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # NOTE(review): both passes receive the same array. If rlsa.rlsa modifies
    # its argument in place, the two results alias each other and the AND
    # below is a no-op -- confirm against the rlsa implementation in use.
    horizontal_smoothed_image = rlsa.rlsa(bin_image, True, False, 16)
    vertical_smoothed_image = rlsa.rlsa(bin_image, False, True, 8)
    smoothed_image = horizontal_smoothed_image & vertical_smoothed_image

    _, inv_smoothed_image = cv2.threshold(
        smoothed_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    dilation_kernel = np.ones((3, 3), dtype=np.uint8)
    dilated_image = cv2.dilate(inv_smoothed_image,
                               dilation_kernel,
                               iterations=2)

    label_count, labels = cv2.connectedComponents(dilated_image)

    roi = []
    roi_coordinates = []
    for label in range(1, label_count):
        rows, cols = np.where(labels == label)
        x0, x1 = cols.min(), cols.max()
        y0, y1 = rows.min(), rows.max()
        roi_coordinates.append([x0, x1, y0, y1])
        # x1 / y1 are inclusive, so the slice must extend one past them;
        # otherwise the last row and column of the region are dropped.
        roi.append(image[y0:y1 + 1, x0:x1 + 1])

    return roi_coordinates, roi
Пример #5
0
    def _extract_boxes(self, layout):
        '''
        Desc: split the layout image into blocks and wrap each one as a
              Paragraph or an Image, in the order given by the topological
              sort of the bounding-box graph

        Args:
            - layout (LayoutUtils): its get_src() provides the source image

        Returns:
            - a list of Paragraph or Image (Block); regions whose classified
              type is neither 'Text' nor 'Image' are left out
        '''

        source = layout.get_src()

        # preprocess + rlsa (both directions, value 10), then invert colors
        smoothed = rlsa.rlsa(self._preprocess(source), True, True, 10)
        inverted = 255 - smoothed

        # dilation, configured through config.BOX_DILATE_*
        dilate_kernel = np.ones(config.BOX_DILATE_KERNEL, np.uint8)
        dilated = cv2.dilate(inverted,
                             dilate_kernel,
                             iterations=config.BOX_DILATE_ITER)

        # bounding boxes and their ordering
        bboxes = self._calculate_bbox(dilated)
        reading_order = self._build_graph(bboxes).topological_sort()

        blocks = []
        for box_id in reading_order:
            x1, y1, x2, y2 = bboxes[box_id].get_coords()
            crop = source[y1:y2, x1:x2]

            # the classifier decides which kind of block the crop becomes
            kind, _ = self._classifier.classify(crop)
            if kind == 'Text':
                block_cls = Paragraph
            elif kind == 'Image':
                block_cls = Image
            else:
                continue

            blocks.append(block_cls(box_id, crop))

        return blocks
Пример #6
0
    def run_RSLA(image_filename, scale_percent=25, rsla_thresh_h=10, rsla_thresh_v=10, contour_area=5): #todo revisit these defaults
        '''
        Find content regions in an image with RLSA and return their bounding
        boxes in the coordinate system of the original (unscaled) image.

        The image is downscaled, converted to grayscale, binarised (Otsu),
        smoothed with RLSA horizontally and vertically, the two results are
        OR-ed and inverted, and the external contours of the outcome whose
        area exceeds `contour_area` are reported.

        :param image_filename: path to image
        :param scale_percent:  percent to scale image before rsla, should divide 100
        :param rsla_thresh_h: threshold for horizontal rsla
        :param rsla_thresh_v: threshold for vertical rsla
        :param contour_area: minimum acceptible contour region area
                             (measured on the downscaled image)
        :return: list of bounding boxes as (x0, y0, x1, y1) tuples, scaled
                 back by 100 / scale_percent (so the values are floats)
        '''
        image = cv2.imread(image_filename)

        width = int(image.shape[1] * scale_percent / 100)
        height = int(image.shape[0] * scale_percent / 100)
        unscale = 100 / scale_percent

        # resize image
        resized = cv2.resize(image, (width, height), interpolation=cv2.INTER_AREA)
        gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
        _, image_binary = cv2.threshold(
            gray, 150, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU
        )

        # NOTE(review): both passes receive the same array. If rlsa.rlsa
        # modifies its argument in place, the two results alias each other
        # and the OR below is a no-op -- confirm against the rlsa
        # implementation in use.
        image_rlsa_horizontal = rlsa.rlsa(image_binary, True, False, rsla_thresh_h)
        image_rlsa_vertical = rlsa.rlsa(image_binary, False, True, rsla_thresh_v)
        combo = cv2.bitwise_not(
            np.bitwise_or(image_rlsa_horizontal, image_rlsa_vertical))

        # cv2.findContours returns (image, contours, hierarchy) in OpenCV 3.x
        # and (contours, hierarchy) in other releases; the contour list is
        # the second-to-last element in both cases.
        contours = cv2.findContours(combo, mode=cv2.RETR_EXTERNAL,
                                    method=cv2.CHAIN_APPROX_SIMPLE)[-2]

        bounding = []
        for contour in contours:
            if cv2.contourArea(contour) > contour_area:
                x, y, w, h = cv2.boundingRect(contour)
                bounding.append((x * unscale, y * unscale,
                                 (x + w) * unscale, (y + h) * unscale))

        return bounding
Пример #7
0
    def connect_horizontal(img_bin, rlsa_val=47):
        """Connect dots horizontal

        The image is inverted, smoothed with horizontal-only RLSA and
        inverted back, so the result has the same polarity as the input.

        params:
            img_bin: binary image; it is not modified, since
                     ``cv2.subtract`` produces a new array
            rlsa_val: run-length value handed to ``rlsa.rlsa``

        Returns the smoothed image.
        """
        inverted = cv2.subtract(255, img_bin)
        smoothed = rlsa.rlsa(inverted, True, False, rlsa_val)

        # Invert the RLSA *result*; inverting `inverted` again would merely
        # reproduce the input whenever rlsa returns a new array.
        return cv2.subtract(255, smoothed)
Пример #8
0
def segment_words(img_gray, rlsa_val=7, bin_result=False):
    """ Split an image into word regions using RLSA

    The input is binarised (inverted Otsu), passed through ``remove_noise``,
    inverted, smoothed with RLSA in both directions and inverted back. The
    bounding box of every external contour taller than 3 pixels becomes one
    entry of the result.

    params:
        img_gray::ndarray:~ grayscale image
        rlsa_val::integer:~ value for run length smoothing algorithm
        bin_result::boolean:~ crop from the noise-filtered binary image
                              instead of from the copy of the input

    Returns a list of tuple -> ((x,y,w,h), image_array)
    """

    source = img_gray.copy()

    threshold_mode = cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU
    _, binary = cv2.threshold(source, 0, 255, threshold_mode)
    binary = remove_noise(binary, 30)

    # Kept aside for the bin_result crops, before the inversion below.
    binary_snapshot = binary.copy()

    smoothed = rlsa.rlsa(cv2.subtract(255, binary), True, True, rlsa_val)
    word_mask = cv2.subtract(255, smoothed)

    contours, _ = cv2.findContours(word_mask, cv2.RETR_EXTERNAL,
                                   cv2.CHAIN_APPROX_NONE)

    words = []
    for x, y, w, h in map(cv2.boundingRect, contours):
        if h <= 3:
            continue
        crop_source = binary_snapshot if bin_result else source
        words.append(((x, y, w, h), crop_source[y:y + h, x:x + w]))

    return words
Пример #9
0
def handleFileUpload():
    """OCR the uploaded image, classify its text and render the result page.

    Steps:
      1. Pick the last path returned by ``glob.glob("images/*.png")``.
      2. Locate the title with the RLSA height/width heuristics and OCR it;
         OCR the full image for the body text.
      3. Delete every file inside ``images/``.
      4. Stem title and text, hash them with ``HashingVectorizer`` and run
         the model unpickled from ``models/wb_transform.pkl``.
      5. Compute a TextBlob sentiment label and a category prediction from
         the module-level ``loaded_model``.
      6. Render ``upload.html`` with the combined message.

    Relies on the module-level names ``stopwords`` and ``loaded_model``.

    Returns:
        Whatever ``render_template("upload.html", prediction_text=...)``
        returns.

    Raises:
        FileNotFoundError: if ``images/`` contains no ``.png`` file.
    """
    pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

    filenames = glob.glob("images/*.png")
    if not filenames:
        raise FileNotFoundError("No PNG image found in 'images/'")
    # Only the last match is processed; the others are not decoded at all.
    img = filenames[-1]

    image = cv2.imread(img)  # reading the image
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # convert2grayscale
    _, binary = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY
                              | cv2.THRESH_OTSU)  # convert2binary

    # [-2] selects the contour list for both the 2-value and 3-value return
    # conventions of cv2.findContours.
    contours = cv2.findContours(~binary, cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]
    boxes = [cv2.boundingRect(contour) for contour in contours]
    # No rectangles are drawn on `image`: it is cropped for OCR below and
    # outlines around every glyph would end up in the title crop.

    # Height heuristic: keep only contours taller than twice the average.
    mask = np.ones(image.shape[:2], dtype="uint8") * 255
    heights = [h for _, _, _, h in boxes]
    avgheight = sum(heights) / len(heights) if heights else 0
    for contour, (_, _, _, h) in zip(contours, boxes):
        if h > 2 * avgheight:
            cv2.drawContours(mask, [contour], -1, 0, -1)

    rows, cols = mask.shape
    value = max(math.ceil(rows / 100), math.ceil(cols / 100)) + 20  # heuristic
    mask = rlsa.rlsa(mask, True, False, value)  # rlsa application

    contours = cv2.findContours(~mask, cv2.RETR_EXTERNAL,
                                cv2.CHAIN_APPROX_SIMPLE)[-2]
    mask2 = np.ones(image.shape, dtype="uint8") * 255  # blank 3 layer image
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 0.60 * image.shape[1]:  # width heuristic applied
            # copy the title region onto the blank image ...
            mask2[y:y + h, x:x + w] = image[y:y + h, x:x + w]
            # ... and blank it out on the working image
            image[y:y + h, x:x + w] = 255

    title = pytesseract.image_to_string(Image.fromarray(mask2))
    content = pytesseract.image_to_string(cv2.imread(img))

    # Remove every file in the upload directory once OCR is done.
    path_to_dir = 'images/'
    for file in os.listdir(path_to_dir):
        os.remove(os.path.join(path_to_dir, file))

    content = content.replace("\n", " ")

    d = {'title': [title], 'text': [content], 'author': ["Beekash Mohanty"]}
    df_test = pd.DataFrame(data=d)

    non_alphanums = re.compile(u'[^A-Za-z0-9]+')

    def normalize_text(text):
        words = non_alphanums.sub(' ', text).lower().strip().split(" ")
        return u" ".join(
            word for word in words if len(word) > 1 and word not in stopwords)

    print("Loading models")
    pickle_model = "models/wb_transform.pkl"
    # SECURITY: unpickling executes arbitrary code; the model file must come
    # from a trusted source.
    with gzip.open(pickle_model, 'rb') as model_file:
        clf1 = pkl.load(model_file)

    stemmer = SnowballStemmer("english")

    def stem_words(text):
        return ' '.join(stemmer.stem(word) for word in text.split(' '))

    def preprocess(df):
        # Assign back instead of fillna(inplace=True) on a column selection,
        # which does not reliably update the parent frame.
        df['author'] = df['author'].fillna('No author')
        df['title'] = df['title'].fillna('No title')
        df['text'] = df['text'].fillna('No text')

        #TODO check at notebook the values for the author (author_cat.csv) and the equal query set the cateory id right
        df['author_cat'] = 1
        df['stemmed_title'] = df['title'].map(stem_words)
        df['stemmed_text'] = df['text'].map(stem_words)

        # drop the title, author and text columns
        df.drop(['title', 'author', 'text'], axis=1, inplace=True)

        return df

    df = preprocess(df_test)
    # NOTE(review): normalize_text is passed positionally, i.e. as the first
    # constructor parameter rather than as preprocessor/tokenizer, and
    # `non_negative` may not be accepted by every scikit-learn release --
    # confirm both against the version the model was trained with.
    vectorizer = HashingVectorizer(normalize_text,
                                   decode_error='ignore',
                                   n_features=2**23,
                                   non_negative=False,
                                   ngram_range=(1, 2),
                                   norm='l2')

    X_title = vectorizer.transform(df['stemmed_title'])
    X_text = vectorizer.transform(df['stemmed_text'])

    X_author = df['author_cat'].values
    X_author = X_author.reshape(-1, 1)

    sparse_merge = hstack((X_title, X_text, X_author)).tocsr()

    # Remove features with document frequency <= 100
    # NOTE(review): np.clip is called with a_min=1 greater than a_max=0;
    # verify that the resulting mask keeps the columns the model expects.
    mask100 = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 1, 0),
                       dtype=bool)
    X = sparse_merge[:, mask100]
    print(X.shape)
    print('Loading model to predict...')

    y1 = clf1.predict(X)

    # One (text, polarity, subjectivity) row per document. The column names
    # are kept as they were: 'sentiment' holds the polarity and 'polarity'
    # holds the subjectivity.
    bloblist_desc = []
    for row in df_test['stemmed_text'].astype(str):
        blob = TextBlob(row)
        bloblist_desc.append(
            (row, blob.sentiment.polarity, blob.sentiment.subjectivity))
    df_usa_polarity_desc = pd.DataFrame(
        bloblist_desc, columns=['sentence', 'sentiment', 'polarity'])

    tweet_counts = loaded_model.method.transform(df_test['stemmed_text'])
    predictions = loaded_model.classifier.predict(tweet_counts)

    def sentiment_label(record):
        if record['sentiment'] > 0:
            return "Positive"
        if record['sentiment'] == 0:
            return "Neutral"
        return "Negative"

    df_usa_polarity_desc["Sentiment_Type"] = df_usa_polarity_desc.apply(
        func=sentiment_label, axis=1)

    cal = np.round(y1, 5) * 100
    if cal > 98:
        m = "This News is Fake"
    elif cal > 90:
        # Covers (90, 98] -- a score of exactly 98 is no longer reported
        # as "Real".
        m = "This News is more likely a Fake"
    else:
        m = "This News is Real"

    return render_template(
        "upload.html",
        prediction_text="Fake Rate={}".format(np.round(y1, 4) * 100) + "%" +
        "->" + m + " " + "   Sentiments=" +
        df_usa_polarity_desc["Sentiment_Type"].values + " " + "Category=" +
        predictions)
Пример #10
0
def process_image(path_to_image, empty_output, output_dir):
    output_path = os.path.dirname(path_to_image)
    last_folder_name = os.path.basename(output_path)
    image_name = os.path.basename(path_to_image)
    image_sans_ext = os.path.splitext(image_name)[0]

    # check if file exists here and exist if not
    try:
        f = open(path_to_image)
        f.close()
    except FileNotFoundError:
        logging.critical('Given image does not exist')
        sys.exit(0)

    logging.info(f"Processing {image_name}")

    founds = glob.glob(f'{output_dir}/{image_sans_ext}-*.xml')
    if len(founds) > 0:
        logging.info(f"FILE EXISTS: {founds}")
        return

    # standardize size of the images maintaining aspect ratio
    if empty_output:
        files = glob.glob('{}/*'.format(output_dir))
        for f in files:
            os.remove(f)

    image = cv2.imread(path_to_image)  #reading the image

    image_height = image.shape[0]
    image_width = image.shape[1]
    if image_width != 2048:
        image = imutils.resize(image, width=2048)

    gray = cv2.cvtColor(image,
                        cv2.COLOR_BGR2GRAY)  # converting to grayscale image
    # applying thresholding technique on the grayscale image
    # all pixels value above 0 will be set to 255 but because we are using THRESH_OTSU
    # we have avoid have to set threshold (i.e. 0 = just a placeholder) since otsu's method does it automatically
    (thresh, im_bw) = cv2.threshold(
        gray, 0, 255,
        cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)  # converting to binary image
    # invert image data using unary tilde operator
    # im_bw = ~im_bw

    # Noise removal step - Perform opening on the thresholded image (erosion followed by dilation)
    kernel = np.ones((2, 2), np.uint8)  # kernel noise size (2,2)
    im_bw = cv2.morphologyEx(
        im_bw, cv2.MORPH_OPEN,
        kernel)  # cleans up random lines that appear on the page
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir, f'{image_sans_ext}-im-negative.png'),
            im_bw)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(os.path.join(output_dir, f'{image_sans_ext}-im-bw.png'),
                    ~im_bw)

    # extract and draw any lines from the image
    lines_mask = draw_lines(image, gray)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir, f'{image_sans_ext}-lines-mask.png'),
            lines_mask)  # debug remove

    # extract complete shapes likes boxes of ads and banners
    found_polygons_mask = extract_polygons(im_bw, lines_mask)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-found-polygons-mask.png'),
            found_polygons_mask)  # debug remove

    # nullifying the mask of unwanted polygons over binary (toss images)
    # this should not only have texts, without images
    text_im_bw = cv2.bitwise_and(im_bw, im_bw, mask=found_polygons_mask)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-text-im-bw-negative.png'),
            ~text_im_bw)

    # initialize blank image for extracted titles
    titles_mask = np.ones(image.shape[:2], dtype="uint8") * 255
    contents_mask = np.ones(image.shape[:2], dtype="uint8") * 255

    (contours, _) = cv2.findContours(text_im_bw, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    heights = [cv2.boundingRect(contour)[3] for contour in contours]
    avgheight = sum(heights) / len(heights)

    title_widths = []
    content_widths = []
    if logging.getLogger().level == logging.DEBUG:
        debug_contents_mask = np.ones(
            image.shape,
            dtype="uint8") * 255  # blank 3 layer image for debug colour
    # finding the larger text
    for c in contours:
        [x, y, w, h] = cv2.boundingRect(c)
        cv2.rectangle(contents_mask, (x, y), (x + w, y + h), (255, 0, 0), 1)
        if h > 2 * avgheight:
            cv2.drawContours(titles_mask, [c], -1, 0, -1)
            title_widths.append(w)
        elif h * w > 20:  # remove specks on dots
            # get the biggest chunks of texts... articles!
            cv2.drawContours(contents_mask, [c], -1, 0, -1)
            content_widths.append(w)
        if logging.getLogger().level == logging.DEBUG:
            cv2.drawContours(debug_contents_mask, [c], -1, 0, -1)
            cv2.rectangle(debug_contents_mask, (x, y), (x + w, y + h),
                          (0, 255, 0), 1)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-debug_drawn_contours.png'),
            debug_contents_mask)

    # helps further detach titles if necessary. This step can be removed
    # titles_mask = cv2.erode(titles_mask, kernel, iterations = 1)
    m_height, m_width = titles_mask.shape  # get image dimensions, height and width

    # make 2D Image mask of proto-original image for cutting contents
    image_mask = np.ones(image.shape,
                         dtype="uint8") * 255  # blank 3 layer image
    image_mask[0:m_height, 0:m_width] = image[0:m_height, 0:m_width]

    # run length smoothing algorithms for vertical and lateral conjoining of pixels
    value = math.ceil(sum(title_widths) / len(title_widths)) * 2
    logging.info(f'RLSA Title Value {value}')
    rlsa_titles_mask = rlsa.rlsa(titles_mask, True, False,
                                 value)  #rlsa application
    rlsa_titles_mask_for_final = rlsa_titles_mask
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir, f'{image_sans_ext}-rlsa-titles-mask.png'),
            rlsa_titles_mask)  # debug remove

    value = math.ceil(sum(content_widths) / len(content_widths)) * 3
    logging.info(f'RLSA Content Value {value}')
    rlsa_contents_mask = rlsa.rlsa(contents_mask, False, True,
                                   value)  #rlsa application
    rlsa_contents_mask_for_avg_width = rlsa_contents_mask
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-rlsa-contents-mask.png'),
            rlsa_contents_mask)  # debug remove

    # get avg properties of columns
    contents_sum_list, contents_x_list, for_avgs_contours_mask = column_summaries(
        image, rlsa_contents_mask_for_avg_width)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-for-avgs-contours-mask.png'),
            for_avgs_contours_mask)  # debug remove
    trimmed_mean = int(stats.trim_mean(contents_sum_list, 0.1))  # trimmed mean
    leftmost_x = min(contents_x_list)

    threshold = 2500  # remove tiny contours that dirtify the image
    ### titles work
    (contours, _) = cv2.findContours(~rlsa_titles_mask, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    # apply some heuristic to differentiate other stranger things masquerading as titles
    nt_contours = [
        contour for contour in contours if cv2.boundingRect(contour)[2] *
        cv2.boundingRect(contour)[3] > threshold
    ]

    total_columns = int(image.shape[1] / trimmed_mean)
    contours = sorted(
        nt_contours,
        key=lambda contour: determine_precedence(
            contour, total_columns, trimmed_mean, leftmost_x, m_height))
    clear_titles_mask = redraw_titles(image, contours)

    # draw_columns(leftmost_x, trimmed_mean, total_columns, clear_titles_mask)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-clear-titles-mask.png'),
            clear_titles_mask)  # debug remove

    ### contents work
    (contours, _) = cv2.findContours(~rlsa_contents_mask, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    # apply some heuristic to different other stranger things masquerading as titles
    nt_contours = [
        contour for contour in contours if cv2.boundingRect(contour)[2] *
        cv2.boundingRect(contour)[3] > threshold
    ]

    contents_contours = sorted(
        nt_contours,
        key=lambda contour: determine_precedence(
            contour, total_columns, trimmed_mean, leftmost_x, m_height))
    clear_contents_mask = redraw_contents(image_mask, contents_contours)
    # draw_columns(leftmost_x, trimmed_mean, total_columns, clear_contents_mask)
    if logging.getLogger().level == logging.DEBUG:
        cv2.imwrite(
            os.path.join(output_dir,
                         f'{image_sans_ext}-sorted-clear-contents-mask.png'),
            clear_contents_mask)

    # start printing individual letters based on titles! The final act
    (contours, _) = cv2.findContours(~rlsa_titles_mask_for_final,
                                     cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)

    # apply some heuristic to different other stranger things masquerading as titles
    nt_contours = [
        contour for contour in contours if cv2.boundingRect(contour)[2] *
        cv2.boundingRect(contour)[3] > threshold
    ]

    contours = sorted(
        nt_contours,
        key=lambda contour: determine_precedence(
            contour, total_columns, trimmed_mean, leftmost_x, m_height))

    article_complete = False
    title_came_up = True
    title_count = len(contours)
    ct_widths = []
    article_mask = np.ones(
        image.shape, dtype="uint8") * 255  # blank layer image for one article
    letter_root = ET.Element("letter")

    desc = ET.SubElement(letter_root, "description")
    ET.SubElement(desc, "MeasurementUnit").text = "pixel"
    ocv_proc = ET.SubElement(desc,
                             "OPenCVProcessing",
                             pageImage=image_sans_ext)
    ET.SubElement(ocv_proc, "ProcessingDateTime").text = str(datetime.today())
    ET.SubElement(ocv_proc, "Script").text = 'Lettersiterate'

    layout = ET.SubElement(letter_root, "Layout")
    page = ET.SubElement(layout, "Page")
    print_space = ET.SubElement(page,
                                "PrintSpace",
                                height=str(image_height),
                                width=str(image_width),
                                xpos=str(0),
                                ypos=str(0))
    # ET.Element(print_space, attrib={'height':image_height, 'width':image_width, 'xpos':0, 'ypos':0})

    # for idx, contour in enumerate(contours):
    for idx, (_curr, _next) in enumerate(zip(contours[::], contours[1::])):
        # https://www.quora.com/How-do-I-iterate-through-a-list-in-python-while-comparing-the-values-at-adjacent-indices/answer/Jignasha-Patel-14
        if article_complete:
            article_mask = np.ones(
                image.shape, dtype="uint8"
            ) * 255  # blank layer image for another separate letter

            # xml file
            letter_root = ET.Element("letter")

            desc = ET.SubElement(letter_root, "description")
            ET.SubElement(desc, "MeasurementUnit").text = "pixel"
            ocv_proc = ET.SubElement(desc, "OPenCVProcessing")
            ET.SubElement(ocv_proc,
                          "ProcessingDateTime").text = str(datetime.today())
            ET.SubElement(ocv_proc, "Script").text = 'Lettersiterate'

            layout = ET.SubElement(letter_root, "Layout")
            page = ET.SubElement(layout, "Page")
            print_space = ET.SubElement(page,
                                        "PrintSpace",
                                        height=str(image_height),
                                        width=str(image_width),
                                        xpos=str(0),
                                        ypos=str(0))

        [cx, cy, cw, ch] = cv2.boundingRect(_curr)
        [nx, ny, nw, nh] = cv2.boundingRect(_next)

        ct_height = cy + ch  # title height in this column
        ct_widths.append(cx + cw)
        ct_width = max(
            ct_widths
        )  # adjust to get longest title width if multiple line title :)

        # dont proceed any further if the next title is right below it on same column
        # continue to next title
        # current and next have to be within the same column
        # detect last article in the columns
        if (idx + 2) == title_count:
            title_came_up = False
        elif cy < ny and ny - (nh * 3) < cy and nx < ct_width:
            # 1) current title is above next
            # 2) next title is directly above current
            # 3) next title is withing the length of the current title. Cannot be in another column
            # and considered directly below current. Phew!, it happened
            title_came_up = True
        else:
            title_came_up = False

        if not title_came_up:
            title_encounters = 0
            # loop through contents within these boundaries and insert them to the canvas
            for content_idx, content_contour in enumerate(contents_contours):
                [x, y, w, h] = cv2.boundingRect(content_contour)
                content_width = x + w
                # length -50 is to be safe sometimes the content cut maybe infringe onto the next title
                # get any content that starts within the title (take out -50) and within the end of the title width
                # and give (+50), it is still below the title
                logging.debug(
                    f"{x} >= {cx-50} and {x} <= {ct_width} and {y+50} > {ct_height}"
                )
                if x >= cx - 50 and x <= ct_width and y + 50 > ct_height:
                    # now that we have limited the content to be within the width and below the title of interest
                    # make sure it does not transgress into other titles. The bane of my existence begins, sigh!
                    for tidx, tcontour in enumerate(contours):
                        [tx, ty, tw, th] = cv2.boundingRect(tcontour)
                        # validating titles that are directly below
                        # 1) it has to be greater than the current title
                        # 2) it starts within the width of the current title
                        # 3) it starts within the width of the current content
                        # 4) it does not start left of the content even if we take out 50 pixels to the left (-50)
                        if tidx > idx and tx < ct_width and tx < content_width and tx > x - 50 and title_encounters < 1:
                            # print(f"TITLE BELOW---> ###{content_idx} ##{tidx} > #{idx} and {tx} < {content_width} and {cx} >= {x-50}")
                            article_mask = cutouts(article_mask,
                                                   clear_contents_mask,
                                                   content_contour)
                            ET.SubElement(print_space,
                                          "BodyText",
                                          height=str(h),
                                          width=str(w),
                                          xpos=str(x),
                                          ypos=str(y),
                                          contourId=str(idx),
                                          bodyTextContourId=str(content_idx))
                            # cv2.putText(article_mask, "###{content_idx},{x},{y}.{w},{h}", cv2.boundingRect(content_contour)[:2], cv2.FONT_HERSHEY_PLAIN, 1.50, [255, 0, 0], 2)
                            title_encounters += 1
                            # hitting a title in this case means we don't need to go any further for current content
                            break

                        # validating titles that are on a different column
                        # 1)it has to be greater than the current title
                        # 2)it starts within the width of the current title
                        # 3)it starts below this content but within the contents limits (meaning it is multicolumn extension)
                        if tidx > idx and tx < ct_width and (
                                ty > y
                                and tx > x - 50) and title_encounters < 1:
                            article_mask = cutouts(article_mask,
                                                   clear_contents_mask,
                                                   content_contour)
                            ET.SubElement(print_space,
                                          "BodyText",
                                          height=str(h),
                                          width=str(w),
                                          xpos=str(x),
                                          ypos=str(y),
                                          contourId=str(idx),
                                          bodyTextContourId=str(content_idx))

                    # validating titles that are at the end of the column
                    # 1) there is no title directly below it
                    if all(x < cv2.boundingRect(tcontour)[0]
                           for tidx, tcontour in enumerate(contours)
                           if tidx > idx and cv2.boundingRect(tcontour)[0] >
                           content_width) and title_encounters < 1:
                        article_mask = cutouts(article_mask,
                                               clear_contents_mask,
                                               content_contour)
                        ET.SubElement(print_space,
                                      "BodyText",
                                      height=str(h),
                                      width=str(w),
                                      xpos=str(x),
                                      ypos=str(y),
                                      contourId=str(idx),
                                      bodyTextContourId=str(content_idx))

        if title_came_up:
            ct_widths.append(cx + cw)
            article_title_p = clear_titles_mask[cy:cy + ch, cx:cx + cw]
            article_mask[
                cy:cy + ch, cx:cx +
                cw] = article_title_p  # copied title contour onto the blank image
            ET.SubElement(print_space,
                          "Title",
                          height=str(ch),
                          width=str(cw),
                          xpos=str(cx),
                          ypos=str(cy),
                          contourId=str(idx))
            article_complete = False
        else:
            ct_widths = []  # reset widths
            article_title_p = clear_titles_mask[cy:cy + ch, cx:cx + cw]
            article_mask[
                cy:cy + ch, cx:cx +
                cw] = article_title_p  # copied title contour onto the blank image
            ET.SubElement(print_space,
                          "Title",
                          height=str(ch),
                          width=str(cw),
                          xpos=str(cx),
                          ypos=str(cy),
                          contourId=str(idx))

            if (idx + 2) == title_count:  # we are at the end
                article_title_p = clear_titles_mask[ny:ny + nh, nx:nx + nw]
                article_mask[
                    ny:ny + nh, nx:nx +
                    nw] = article_title_p  # copied title contour onto the blank image

            file_name = f"article-{str(idx).zfill(2)}"
            if logging.getLogger().level == logging.DEBUG:
                cv2.imwrite(
                    os.path.join(output_dir,
                                 f"{image_sans_ext}-{file_name}.png"),
                    article_mask)
            article_complete = True

            content = pytesseract.image_to_string(
                Image.fromarray(article_mask))
            with open(
                    os.path.join(output_dir,
                                 f'{image_sans_ext}-{file_name}.txt'),
                    'a') as the_file:
                the_file.write(content)
            ET.SubElement(page,
                          "TextBlock",
                          articleNo=str(file_name),
                          contourId=str(idx)).text = content

            tree = ET.ElementTree(letter_root)
            xml_output_file = os.path.join(
                output_dir, f'{image_sans_ext}-{file_name}.xml')
            # this method may cause 'OSError: [Errno 24] Too many open files' and does not prettyprint
            # tree.write(xml_output_file, encoding='utf8')
            # OR
            xmlstr = ET.tostring(letter_root).decode()
            xmlstr = minidom.parseString(xmlstr).toprettyxml(indent="\t",
                                                             newl="\n")
            with open(xml_output_file, 'w+') as outfile:
                outfile.write(xmlstr)
Пример #11
0
# file_path = filedialog.askdirectory()
# print(file_path)
# count_human_true = 0
# count_tree_true = 0
# Train the classifier once up front; `tree` is not used again in the
# visible part of this script.
tree = train_classifier()

## for file in os.listdir(file_path):
## file_name = os.path.join(file_path, file)
## counter += 1
## print("cleaning image " + str(counter) + " to path "+ file_path + '_output_' + file)
# open_file = open("samples.txt", "r+")
## beginning
file_name = '9_main-qimg-de9e1056b4cd97bbf39f1c8b4ba68f6a.jpg'
img = cv2.cvtColor(cv2.imread(file_name), cv2.COLOR_BGR2GRAY)
# Fixed global threshold: pixels brighter than 180 become 255, the rest 0.
ret, thresh = cv2.threshold(img, 180, 255, cv2.THRESH_BINARY)
# Run-length smoothing: horizontal pass (value 10), then a vertical pass
# (value 15) over the horizontally smoothed result.
img_rlsa_horizontal = rlsa.rlsa(thresh, True, False, 10)
img_rlsa_vertical = rlsa.rlsa(img_rlsa_horizontal, False, True, 15)
# cv2.imwrite("1_rlsa-smoothed.jpg", img_rlsa_vertical)

# `np.int` was removed in NumPy 1.24 (AttributeError); use an explicit
# uint8 structuring element instead.
opening = cv2.morphologyEx(img_rlsa_vertical,
                           cv2.MORPH_OPEN,
                           np.ones((3, 3), np.uint8),
                           iterations=2)

# cv2.imwrite('2_opened_image.jpg', opening)

# Dilate the opened image, then subtract its erosion so that only the
# outline of the dilated regions remains in `sure_bg`.
sure_bg = cv2.dilate(opening, None, iterations=5)
sure_bg = sure_bg - cv2.erode(sure_bg, None)
dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
dist_transform = ((dist_transform - dist_transform.min()) /
                  (dist_transform.max() - dist_transform.min()) * 255).astype(
Пример #12
0
# Bounding-box height of every contour; the average is the yardstick used
# to tell unusually tall (heading-like) contours from the rest.
heights = [cv2.boundingRect(contour)[3] for contour in contours]
# Guard against an empty contour list (would otherwise divide by zero).
avgheight = sum(heights) / len(heights) if heights else 0

# finding the larger text: paint every contour taller than twice the
# average height in black onto `mask`
for c in contours:
    [x, y, w, h] = cv2.boundingRect(c)
    if h > 2 * avgheight:
        cv2.drawContours(mask, [c], -1, 0, -1)

cv2.imshow('mask', mask)

# NOTE: ndarray.shape is (rows, cols), so `x` holds the row count and `y`
# the column count here.
x, y = mask.shape  # image dimensions

value = max(math.ceil(x / 100), math.ceil(y / 100)) + 20
mask = rlsa.rlsa(mask, True, False, value)  # horizontal rlsa application

cv2.imshow('mask1', mask)

# findContours returns (image, contours, hierarchy) on OpenCV 3.x but
# (contours, hierarchy) on 2.x/4.x; indexing [-2] picks the contour list
# on every version instead of failing to unpack on OpenCV 4.
contours = cv2.findContours(~mask, cv2.RETR_EXTERNAL,
                            cv2.CHAIN_APPROX_SIMPLE)[-2]

mask2 = np.ones(image.shape, dtype="uint8") * 255  # blank 3 layer image
for contour in contours:
    [x, y, w, h] = cv2.boundingRect(contour)
    if w > 0.60 * image.shape[1]:  # width heuristic applied
        title = image[y:y + h, x:x + w]
        mask2[y:y + h,
              x:x + w] = title  # copied title contour onto the blank image
        image[y:y + h,
              x:x + w] = 255  # nullified the title contour on original image
Пример #13
0
def process_image(path_to_image, empty_output, out_dir_name):
    """OCR the text found in the top-right strip of a page scan.

    Steps, in order:

    1. Resize the image to a width of 2048, binarize it with an inverted
       Otsu threshold and apply a 2x2 morphological opening.
    2. Mask the binary image with the result of ``draw_lines`` and
       ``extract_polygons``.
    3. Draw every contour that is no taller than twice the average
       contour height, has a bounding-box area above 20 and starts at
       ``x > 1000, y < 100`` onto a white mask.
    4. Smooth that mask horizontally with RLSA, keep blocks whose
       bounding-box area exceeds 1500 and that also start at
       ``x > 1000, y < 100``, and copy them from the image with
       ``cutouts``.
    5. Deskew with ``correct_skew``, dilate, run Tesseract (``--psm 3``)
       and append ``[image stem, first OCR line]`` as a tab-separated row
       to ``./dates/<out_dir_name>.csv``.

    Intermediate images are written to ``./dates`` only when the root
    logger level is DEBUG.

    Args:
        path_to_image: Path of the image file to process.
        empty_output: When truthy, every file in ``./dates`` is removed
            before processing.
        out_dir_name: Base name (without extension) of the CSV file the
            result row is appended to.

    Returns:
        None. Returns early, without writing a row, when an XML file
        matching ``<image stem>-*.xml`` already exists in ``./dates`` or
        when the image cannot be decoded.
    """
    image_name = os.path.basename(path_to_image)
    img_sans_ext = os.path.splitext(image_name)[0]

    # check that the file exists and exit if it does not
    # NOTE(review): the process exits with status 0 even though this is a
    # failure; kept as-is because callers may rely on it.
    try:
        with open(path_to_image):
            pass
    except FileNotFoundError:
        log.critical('Given image does not exist')
        sys.exit(0)

    log.info(f"Processing {image_name}")

    # evaluate the debug switch once instead of before every image dump
    debug = log.getLogger().level == log.DEBUG

    # create out dir
    final_dir = os.path.join(os.getcwd(), r'dates')
    os.makedirs(final_dir, exist_ok=True)

    # skip images for which an XML result is already present
    founds = glob.glob(f'{final_dir}/{img_sans_ext}-*.xml')
    if founds:
        log.info(f"FILE EXISTS: {founds}")
        return

    # optionally wipe the output directory before processing
    if empty_output:
        for stale_file in glob.glob('{}/*'.format(final_dir)):
            os.remove(stale_file)

    image = cv2.imread(path_to_image)  # reading the image
    if image is None:
        # cv2.imread returns None (it does not raise) for unreadable or
        # non-image files; without this check the next line would fail
        # with an opaque AttributeError.
        log.error(f"Could not decode {image_name} as an image")
        return

    # standardize size of the images maintaining aspect ratio
    if image.shape[1] != 2048:
        image = imutils.resize(image, width=2048)

    gray = cv2.cvtColor(image,
                        cv2.COLOR_BGR2GRAY)  # converting to grayscale image
    # With THRESH_OTSU the threshold is chosen automatically, so the 0
    # passed below is only a placeholder. THRESH_BINARY_INV makes the
    # result a negative (dark pixels become 255).
    _, im_bw = cv2.threshold(
        gray, 0, 255,
        cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)  # converting to binary image

    # Noise removal step - Perform opening on the thresholded image
    # (erosion followed by dilation)
    kernel = np.ones((2, 2), np.uint8)  # kernel noise size (2,2)
    # cleans up random lines that appear on the page
    im_bw = cv2.morphologyEx(im_bw, cv2.MORPH_OPEN, kernel)
    if debug:
        cv2.imwrite(os.path.join(final_dir,
                                 f'{img_sans_ext}-im-negative.png'), im_bw)
        cv2.imwrite(os.path.join(final_dir,
                                 f'{img_sans_ext}-im-bw.png'), ~im_bw)

    # extract and draw any lines from the image
    lines_mask = draw_lines(image, gray)
    if debug:
        cv2.imwrite(os.path.join(final_dir,
                                 f'{img_sans_ext}-lines-mask.png'), lines_mask)

    # extract complete shapes likes boxes of ads and banners
    found_polygons_mask = extract_polygons(im_bw, lines_mask)
    if debug:
        cv2.imwrite(
            os.path.join(final_dir, f'{img_sans_ext}-found-polygons-mask.png'),
            found_polygons_mask)

    # nullifying the mask of unwanted polygons over binary (toss images)
    # the result should only have text, without images
    text_im_bw = cv2.bitwise_and(im_bw, im_bw, mask=found_polygons_mask)
    if debug:
        cv2.imwrite(
            os.path.join(final_dir, f'{img_sans_ext}-text-im-bw-negative.png'),
            ~text_im_bw)

    # initialize blank image for extracted contents
    contents_mask = np.ones(image.shape[:2], dtype="uint8") * 255

    (contours, _) = cv2.findContours(text_im_bw,
                                     cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    heights = [cv2.boundingRect(contour)[3] for contour in contours]
    # A page without any contour would otherwise divide by zero; with no
    # contours the loop below does not run, so the value is irrelevant.
    avgheight = sum(heights) / len(heights) if heights else 0

    content_widths = []
    if debug:
        # blank 3 layer image for debug colour
        debug_mask = np.ones(image.shape, dtype="uint8") * 255
    for c in contours:
        [x, y, w, h] = cv2.boundingRect(c)
        cv2.rectangle(contents_mask, (x, y), (x+w, y+h), (255, 0, 0), 1)
        # Skip anything taller than twice the average height (titles) and
        # specks/dots; keep only contours starting at x > 1000, y < 100.
        if h <= 2*avgheight and h*w > 20 and x > 1000 and y < 100:
            cv2.drawContours(contents_mask, [c], -1, 0, -1)
            content_widths.append(w)

        if debug:
            cv2.drawContours(debug_mask, [c], -1, 0, -1)
            cv2.rectangle(debug_mask, (x, y), (x+w, y+h), (0, 255, 0), 1)
    if debug:
        cv2.imwrite(os.path.join(
            final_dir, f'{img_sans_ext}-debug_drawn_contours.png'),
            debug_mask)

    # get image dimensions, height and width
    m_height, m_width = contents_mask.shape

    # 3 layer copy of the (resized) image used as the source for cutouts
    image_mask = np.ones(image.shape, dtype="uint8") * 255
    image_mask[0: m_height, 0: m_width] = image[0: m_height, 0: m_width]

    # RLSA run length: five times the (rounded-up) mean width of the kept
    # contours, or 140 when nothing was kept.
    if content_widths:
        value = math.ceil(sum(content_widths)/len(content_widths))*5
    else:
        value = 140
    log.info(f'RLSA Content Value {value}')
    # rlsa application (horizontal only)
    rlsa_contents_mask = rlsa.rlsa(contents_mask, True, False, value)
    if debug:
        cv2.imwrite(os.path.join(
            final_dir, f'{img_sans_ext}-rlsa-contents-mask.png'),
            rlsa_contents_mask)  # debug remove

    threshold = 1500  # remove tiny contours that dirtify the image

    # contents work
    (contours, _) = cv2.findContours(~rlsa_contents_mask,
                                     cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)
    # keep only blocks whose bounding-box area exceeds the threshold
    contents_contours = []
    for contour in contours:
        _, _, block_w, block_h = cv2.boundingRect(contour)
        if block_w * block_h > threshold:
            contents_contours.append(contour)

    # blank layer image for one article
    article_mask = np.ones(image.shape, dtype="uint8") * 255

    # loop through and insert it to the canvas
    for content_contour in contents_contours:
        [x, y, w, h] = cv2.boundingRect(content_contour)

        if x > 1000 and y < 100:
            log.debug(f"{x} >= {x-50} and {x} {y+50}")
            article_mask = cutouts(article_mask, image_mask, content_contour)

    angle, rotated_article_mask = correct_skew(article_mask)
    log.info(f'Rotation Angle: {angle}')

    # Dilating the output improved overall readability by tesseract,
    # especially in cases where the resulting output was empty
    # https://stackoverflow.com/a/54582118/754432
    # NOTE(review): the second positional argument of cv2.dilate is the
    # kernel; a (5, 5) tuple is passed here rather than a 5x5 array --
    # confirm this is the intended structuring element.
    cv2.dilate(rotated_article_mask, (5, 5), rotated_article_mask)

    if debug:
        cv2.imwrite(os.path.join(final_dir,
                                 f"{img_sans_ext}.png"), rotated_article_mask)

    # 3 Fully automatic page segmentation, but no OSD. (default for tesserocr)
    # 7 means treat the image as a single text line.
    # https://medium.com/better-programming/beginners-guide-to-tesseract-ocr-using-python-10ecbb426c3d
    content = pytesseract.image_to_string(
        Image.fromarray(rotated_article_mask),
        config='--psm 3')

    # newline='' is what the csv module requires of its file objects;
    # without it each row gets an extra '\r' on platforms that translate
    # line endings.
    with open(os.path.join(final_dir, f'{out_dir_name}.csv'), 'a+',
              newline='') as f_out:
        writer = csv.writer(f_out, delimiter='\t')
        # only the first OCR line is recorded
        writer.writerow([img_sans_ext, content.partition('\n')[0]])
Пример #14
0
def main(page,args=None):
    """Shift detected symbol regions towards their right-hand neighbour.

    For every column image of ``page`` the rows classified as
    ``'personnel'`` are projected onto the x axis, smoothed with RLSA and
    split into symbol intervals by ``SymbolDetection``. Walking the
    intervals right to left (the last one is never moved), every interval
    whose median x position lies beyond 35% of the row width is

    1. covered with a randomly sampled patch of the background image via
       ``cv2.seamlessClone``, and
    2. pasted again ``t`` pixels further right, where ``t`` is the gap
       between this interval and the next one.

    The edited column image is written to
    ``<args.outputdir>/<page>/<page>_<key>.png``.

    Args:
        page: Page identifier; used as JSON file stem and image directory
            name.
        args: Namespace providing ``clsdir``, ``colrectdir``,
            ``rowrectdir``, ``imgdir`` and ``outputdir``. The default of
            ``None`` is not usable: the attributes are read
            unconditionally.

    Returns:
        None.
    """
    print("processing "+page)
    #read in data
    cls_file=os.path.join(args.clsdir,page+'.json')
    col_rect_file=os.path.join(args.colrectdir,page+'.json')
    row_rect_file=os.path.join(args.rowrectdir,page+'.json')

    # NOTE(review): hard-coded path; cv2.imread returns None when the file
    # is missing, which only surfaces later at bg_img.shape.
    bg_img = cv2.imread('/home/ubuntu/results/personnel-records/1956/seg/background.png')

    with open(col_rect_file) as file:
        col_rects = json.load(file)

    with open(row_rect_file) as file:
        row_rects = json.load(file)

    with open(cls_file) as file:
        cls = json.load(file)
    # flat list of row labels; consumed column by column below
    cls=cls['name']

    RLSA_thr=30#50
    output_page_dir=os.path.join(args.outputdir,page)

    for key, row_rects_col in row_rects.items():
        col_img=cv2.imread(os.path.join(args.imgdir,page,page+'_'+key+'.png'))

        col_img_b=Binarization(col_img)

        _ , M_col = Rect.CropRect(col_img_b, col_rects[int(key)])
        for i, row_rect in enumerate(row_rects_col):
            if cls[i]!='personnel':
                continue

            #detect symbols
            row_img_b , _ =Rect.CropRect(col_img_b, Rect.RectOnDstImg(row_rect,M_col))
            # column-wise count of foreground pixels, median filtered
            count=np.sum(row_img_b/255,axis=0)
            count=signal.medfilt(count, 5)
            _, count=cv2.threshold(count, 3, 255, cv2.THRESH_BINARY_INV)
            count=rlsa.rlsa(count.T, True, False, RLSA_thr)

            symbol_intervals=SymbolDetection(255-count[-1],RLSA_thr)

            #decide if we need to move symbols closer
            if not symbol_intervals:
                continue

            # right-to-left over every interval except the last one
            for ii in range(len(symbol_intervals)-2,-1,-1):
                if not np.median(symbol_intervals[ii])>0.35*row_img_b.shape[1]:
                    continue

                #copy the region of FName (src)
                row_img, M_col2row = Rect.CropRect(col_img, Rect.RectOnDstImg(ExpandRect(row_rect),M_col))
                src_img=row_img[:,symbol_intervals[ii][0]:symbol_intervals[ii][1]].copy()

                # t is the distance between current and next symbol
                t = symbol_intervals[ii+1][0] - symbol_intervals[ii][1]
                M_row2col = np.linalg.inv(M_col2row)

                # manually setup mask, for better performance we should automatically find a mask (binarization,DP,etc)
                roi_pts = np.array([[0, 0],
                                    [src_img.shape[1], 0],
                                    [src_img.shape[1], src_img.shape[0]],
                                    [0, src_img.shape[0]]], dtype="float32")

                # mask w.r.t M_row2col
                roi_pts = Rect.PtsOnDstImg(roi_pts, M_row2col)
                roi_pts = roi_pts - np.min(roi_pts,axis=0)
                height,width = np.max(roi_pts, axis=0)[::-1]
                mask = np.zeros([min(height,src_img.shape[0]),min(width,src_img.shape[1])])
                roi_mask=cv2.fillConvexPoly(mask, roi_pts, 255)

                # fill the region of FName with random sampled background
                center = [[np.median(symbol_intervals[ii]), row_img.shape[0] / 2]]
                center = tuple(Rect.PtsOnDstImg(center,M_row2col,False)[-1])
                # random top-left corner of the background patch
                # (row offset drawn first, then column offset)
                bg_row = np.random.randint(bg_img.shape[0]-roi_mask.shape[0],size=1)[0]
                bg_col = np.random.randint(bg_img.shape[1]-roi_mask.shape[1],size=1)[0]
                try:
                    col_img = cv2.seamlessClone(bg_img[bg_row:bg_row+roi_mask.shape[0],bg_col:bg_col+roi_mask.shape[1]], col_img, roi_mask.astype(np.uint8), center, cv2.NORMAL_CLONE)

                    #paste the src region to target region
                    center = [[np.median(symbol_intervals[ii]) + t, row_img.shape[0] / 2]]
                    center = tuple(Rect.PtsOnDstImg(center, M_row2col, False)[-1])
                    col_img = cv2.seamlessClone(src_img, col_img, roi_mask.astype(np.uint8), center, cv2.NORMAL_CLONE)
                    symbol_intervals[ii]=[symbol_intervals[ii][0]+t,symbol_intervals[ii][1]+t]
                except Exception:
                    # Deliberately best-effort: cloning fails if part of the src img is out of the dst image.
                    # Computing on the original image can avoid this problem, but this is much faster and there is no big difference.
                    # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
                    print("ignore first/last row for "+page+'_'+key )
        # drop the labels that belonged to this column
        cls=cls[len(row_rects_col):]

        if not os.path.isdir(output_page_dir):
            # makedirs also creates a missing args.outputdir
            os.makedirs(output_page_dir, exist_ok=True)
            print('creating directory ' + output_page_dir)
        cv2.imwrite(os.path.join(output_page_dir,page+'_'+key+'.png'),col_img)
Пример #15
0
# Split the contours by height: anything taller than twice the average
# goes onto the titles mask, everything else onto the contents mask.
for c in contours:
    [x, y, w, h] = cv2.boundingRect(c)
    cv2.drawContours(mask_titles if h > 2 * avgheight else mask_contents,
                     [c], -1, 0, -1)

cv2.imwrite('mask_titles.png', mask_titles)
cv2.imwrite('mask_contents.png', mask_contents)

# ndarray.shape is (rows, cols); the RLSA run length is derived from the
# larger of the two dimensions.
x, y = mask_titles.shape
value = math.ceil(max(x, y) / 100) + 20

# titles: horizontal smoothing, run twice to obtain two result variables
rlsa_titles_mask = rlsa.rlsa(mask_titles, True, False, value)
rlsa_titles_mask_for_final = rlsa.rlsa(mask_titles, True, False, value)
cv2.imwrite('rlsa_title_mask.png', rlsa_titles_mask)

# contents: vertical smoothing with the same run length, again run twice
rlsa_contents_mask = rlsa.rlsa(mask_contents, False, True, value)
rlsa_contents_mask_for_avg_width = rlsa.rlsa(mask_contents, False, True,
                                             value)
cv2.imwrite('rlsa_contents_mask.png', rlsa_contents_mask)
cv2.imwrite('rlsa_contents_mask_for_avg_width.png',
            rlsa_contents_mask_for_avg_width)

# CALC AVG WIDTHS?!
(for_avgs_contours, _) = cv2.findContours(~rlsa_contents_mask_for_avg_width,
Пример #16
0
# Sort the contours by height: heading-like ones (taller than twice the
# average) are painted onto `mask`, all others onto `mask_content`.
for idx, contour in enumerate(contours):
    [x, y, w, h] = cv2.boundingRect(contour)
    cv2.drawContours(mask if h > 2 * avgheight else mask_content,
                     [contour], -1, 0, -1)

cv2.imshow('contour', image)  # on original image
cv2.imwrite('contours.png', image)

# Merge the non-heading content into larger blocks by smoothing in both
# directions, then re-detect contours on the inverted result.
image_rlsa = rlsa.rlsa(mask_content, True, True, 10)
contours, _ = cv2.findContours(~image_rlsa, cv2.RETR_EXTERNAL,
                               cv2.CHAIN_APPROX_SIMPLE)

# One bounding box (x, y, w, h) per block, computed once and reused below.
block_rects = [cv2.boundingRect(block) for block in contours]

heights = [rect[3] for rect in block_rects]
avgheight = sum(heights) / len(heights)  # average block height
print(avgheight, 3 * avgheight)

# Widths of the blocks that are taller than twice the average height.
widths = [rect[2] for rect in block_rects if rect[3] > 2 * avgheight]
avgwidth = sum(widths) / len(widths)  # average width of those blocks

# Ascending list of the distinct widths.
widths = sorted(set(widths))
trimmed_widths = stats.trim_mean(top_chunk(widths, 3),
Пример #17
0
def title(image_received):
    """Locate the wide, tall-lettered title region of a page and OCR it.

    Steps:

    1. Select near-black pixels (H, S and V all within 0..20) and find
       their external contours.
    2. Paint contours taller than twice the average contour height onto a
       white mask.
    3. Smooth that mask horizontally with RLSA so neighbouring glyphs
       merge into blocks.
    4. Copy every block wider than 60% of the image width onto a white
       canvas and run Tesseract on the canvas.

    Args:
        image_received: 3-channel BGR image array. It is modified in
            place: every region copied to the canvas is overwritten with
            255 in this array.

    Returns:
        The string returned by ``pytesseract.image_to_string`` for the
        canvas holding the extracted title regions.
    """
    # Same array as the argument, not a copy; the blanking below is
    # therefore visible to the caller.
    image = image_received

    # Step 1: contours of the dark pixels, i.e. the letters.
    # blank single-channel image with the same height/width as the input
    mask = np.ones(image.shape[:2], dtype="uint8") * 255
    imghsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    dark_pixels = cv2.inRange(imghsv, (0, 0, 0), (20, 20, 20))

    (contours, _) = cv2.findContours(dark_pixels, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_NONE)

    # Step 2: keep only contours that are much taller than average.
    heights = [cv2.boundingRect(contour)[3] for contour in contours]
    # An image without dark pixels yields no contours; avoid dividing by
    # zero (the loop below is then a no-op).
    average_height = sum(heights) / len(heights) if heights else 0

    for contour in contours:
        [x, y, w, h] = cv2.boundingRect(contour)
        if h > 2 * average_height:
            cv2.drawContours(mask, [contour], -1, 0, -1)

    # Step 3: horizontal RLSA; the run length grows with the larger image
    # dimension (shape is rows, cols).
    rows, cols = mask.shape
    value = max(math.ceil(rows / 100), math.ceil(cols / 100)) + 50
    mask = rlsa.rlsa(mask, True, False, value)

    # Step 4: transfer the wide blocks onto a blank canvas.
    (contours, _) = cv2.findContours(~mask, cv2.RETR_EXTERNAL,
                                     cv2.CHAIN_APPROX_SIMPLE)

    # blank image with the same shape as the input
    mask2 = np.ones(image.shape, dtype="uint8") * 255

    for contour in contours:
        [x, y, w, h] = cv2.boundingRect(contour)
        if w > 0.60 * image.shape[1]:
            # copy the title region onto the blank image ...
            mask2[y:y + h, x:x + w] = image[y:y + h, x:x + w]
            # ... and nullify it on the original image
            image[y:y + h, x:x + w] = 255

    extracted_title = pytesseract.image_to_string(mask2)
    return (extracted_title)