示例#1
0
    def Segmentation(self):
        """Run the extraction stages selected by ``configList.tit_choice``.

        A falsy choice runs table, image and text extraction and then passes
        the text result through ``ImgTabOut``; 1 runs text extraction only,
        2 image extraction only and 3 table extraction only. Any other value
        runs no extraction stage at all.
        """
        log = Logger(__name__)
        Logger.get_log(log).info('Segmentation Start')

        settings = self.configList
        text_level = settings.text_level
        table_level = settings.table_level
        choice = settings.tit_choice

        if not choice:
            # Full pipeline: the text result is post-processed with the image
            # and table results, so those two are built first.
            self.Table = TableExtraction(table_level, self.PagesLayout)
            self.Image = ImageExtraction(self.PagesImage, self.PagesLayout)
            self.Text = TextExtraction(text_level, self.PagesLayout)
            self.Text.Text = ImgTabOut(self.PgHeight, self.Text.Text,
                                       self.Image.Image, self.Table.Table)
        elif choice == 1:
            self.Text = TextExtraction(text_level, self.PagesLayout)
        elif choice == 2:
            self.Image = ImageExtraction(self.PagesImage, self.PagesLayout)
        elif choice == 3:
            self.Table = TableExtraction(table_level, self.PagesLayout)

        Logger.get_log(log).info('Segmentation Finished')
        log.logger.handlers.clear()
def TitleExtraction(PageLayout):
    """Pick the title text box from the top part of a page layout.

    Only horizontal text boxes whose ``y0`` lies above 80% of the page height
    are considered. For each one, the height of its first horizontal text
    line is compared with the current best; a box replaces the current
    choice when that height exceeds 1.3 times the best height so far.

    Returns:
        ``(title, titleIndex)`` -- the chosen box and its position in
        ``PageLayout._objs``. When nothing qualifies, ``title`` is ``[]``,
        ``titleIndex`` is ``0`` and a critical message is logged.
    """
    title = []
    best_height = -1
    best_position = 0
    top_band = 0.8 * PageLayout.height

    for position, box in enumerate(PageLayout._objs):
        if not (box.y0 > top_band and isinstance(box, LTTextBoxHorizontal)):
            continue

        # Only the first horizontal line of the box takes part in the
        # comparison; the remaining lines are never inspected.
        lead_line = next(
            (ln for ln in box if isinstance(ln, LTTextLineHorizontal)), None)
        if lead_line is None:
            continue

        if lead_line.height > 1.3 * best_height:
            best_height = lead_line.height
            best_position = position
            title = box

    if title == []:
        log = Logger(__name__)
        Logger.get_log(log).critical('No Title Found')
        log.logger.handlers.clear()

    return title, best_position
def JsonWrite(JsonFile, fileName, fileFolder):
    """Dump ``JsonFile`` as JSON next to the other outputs.

    The target path is ``fileFolder`` followed by ``fileName`` with its last
    four characters replaced by ``.json``. ``fileFolder`` is concatenated
    as-is, so it must already end with a path separator.
    """
    stem = fileName[:-4]
    target = fileFolder + stem + '.json'

    with open(target, 'w') as handle:
        json.dump(JsonFile, handle)

    log = Logger(__name__)
    Logger.get_log(log).info('JsonFile Saved')
    log.logger.handlers.clear()
    def Segmentation(self):
        """Run text extraction at the configured text level.

        Stores the ``TextExtraction`` result in ``self.Text`` and logs the
        start and the end of the run.
        """
        log = Logger(__name__)
        Logger.get_log(log).info('Segmentation Start')

        level = self.configList.text_level
        self.Text = TextExtraction(level, self.PagesLayout)

        Logger.get_log(log).info('Segmentation Finished')
        log.logger.handlers.clear()
示例#5
0
    def configCheck(self):
        """Validate the configuration, repairing bad values interactively.

        Checks, in order:

        * ``self.folder`` ends with ``'/'``; otherwise the user may let the
          slash be appended or the program exits. An empty folder is treated
          as missing the slash instead of raising ``IndexError``.
        * ``self.filename`` is ``'all'`` or ends with ``'.pdf'``; otherwise
          the user may let the suffix be appended or the program exits.
        * ``self.tit_choice`` is one of 0/1/2/3, and ``self.text_level`` and
          ``self.table_level`` are 1 or 2; otherwise the user is prompted
          until a valid digit is typed.

        Values obtained from a prompt are stored as ``int`` so that they
        compare equal to the integers these checks test against, and each
        answer is written to the attribute it was asked for.

        Raises:
            SystemExit: when the user declines a folder or filename repair.
        """

        def agreed(question):
            # 'y' or 'yes' in any letter case counts as consent.
            return input(question).lower() in ('y', 'yes')

        def ask_digit(notice, question, allowed):
            # Repeat the notice and the question until one of the allowed
            # digit strings is typed, then hand it back as an int.
            while True:
                print(notice)
                answer = input(question)
                if answer in allowed:
                    return int(answer)

        if not self.folder.endswith('/'):
            Logger.get_log(
                self.logging).critical('Configuration - Folder Format Error')
            print("Configuration - Folder may loss '/' to the end of the path")
            if agreed(
                    "Do you want system add '/' to the end of path ? (Y/N)\n"):
                self.folder += '/'
            else:
                sys.exit()

        if not self.filename == 'all' and not self.filename.endswith('.pdf'):
            Logger.get_log(self.logging).critical(
                'Configuration - FileName Not End With .pdf ')
            print('Configuration - FileName Not End With \'.pdf\'')
            if agreed(
                    "Do you want system add '.pdf' to the end of filename ? (Y/N)\n"
            ):
                self.filename += '.pdf'
            else:
                sys.exit()

        if self.tit_choice not in (0, 1, 2, 3):
            Logger.get_log(self.logging).critical(
                'Configuration - tit_choice Format Error ')
            self.tit_choice = ask_digit(
                'Configuration - tit_choice Format Error',
                "Please press 0/1/2/3 to specify a tit_choice \n",
                ('0', '1', '2', '3'))

        if self.text_level not in (1, 2):
            Logger.get_log(self.logging).critical(
                'Configuration - text_level Format Error ')
            self.text_level = ask_digit(
                'Configuration - text_level Format Error ',
                "Please press 1/2 to specify a text_level \n",
                ('1', '2'))

        if self.table_level not in (1, 2):
            Logger.get_log(self.logging).critical(
                'Configuration - table_level Format Error ')
            # The answer belongs to table_level; it used to overwrite
            # text_level and leave table_level invalid.
            self.table_level = ask_digit(
                'Configuration - table_level Format Error ',
                "Please press 1/2 to specify a table_level \n",
                ('1', '2'))
示例#6
0
def pdf2layout(fileName):
    """Parse a PDF into its page layouts.

    Calls ``with_pdf(fileName, '', _parse_pages, '/tmp')`` and returns its
    result. Any exception raised while parsing is logged as critical and
    ``None`` is returned instead of propagating it.

    The logger handlers are cleared on both the success and the failure
    path, matching the other functions that create a ``Logger`` per call.
    """
    log = Logger(__name__)
    try:
        pages_layout = with_pdf(fileName, '', _parse_pages, '/tmp')
        Logger.get_log(log).info('pdf2xml Completed')
        return pages_layout
    except Exception:
        # Deliberate best effort: callers receive None for an unreadable PDF.
        Logger.get_log(log).critical('pdf2xml failed\n')
        return None
    finally:
        log.logger.handlers.clear()
示例#7
0
    def Segmentation(self):
        """Extract the image of every page and collect it in ``self.Image``.

        Page ``i`` of ``self.PagesLayout`` is paired with entry ``i`` of
        ``self.PagesImage`` and both are handed to ``ImgExtraction``.
        """
        for page_no, layout in enumerate(self.PagesLayout):
            rendered = self.PagesImage[page_no]
            self.Image.append(ImgExtraction(rendered, layout))

        log = Logger(__name__)
        Logger.get_log(log).info('Image Segmentation Finished')
        log.logger.handlers.clear()
示例#8
0
def JsonWrite(PagesText, fileName, fileFolder):
    """Write each page's lines to its own ``.txt`` file.

    Files are named ``<fileName>_<page number>.txt`` (numbering starts at 1)
    inside ``<fileFolder><fileName>/``, which is created when missing. Every
    line is written followed by a newline. Despite the function name and the
    log message, the output is plain text, not JSON.
    """
    name = str(fileName)
    target_dir = fileFolder + name + '/'
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    for page_number, page_lines in enumerate(PagesText, start=1):
        page_path = target_dir + '{}_{}.txt'.format(name, page_number)
        with open(page_path, 'w') as handle:
            handle.writelines(text + '\n' for text in page_lines)

    log = Logger(__name__)
    Logger.get_log(log).info('JsonFile Saved')
    log.logger.handlers.clear()
示例#9
0
def ImageWrite(ImageList, fileName, fileFolder):
    """Save every image as ``<fileName>_<n>.jpg`` in ``<fileFolder><fileName>/``.

    The folder is created when it does not exist yet; numbering starts at 1
    and follows the order of ``ImageList``.
    """
    name = str(fileName)
    folder_path = fileFolder + name
    folder_prefix = folder_path + '/'

    if not os.path.exists(folder_path):
        os.mkdir(folder_prefix)

    for number, picture in enumerate(ImageList, start=1):
        cv2.imwrite(folder_prefix + name + '_' + str(number) + '.jpg', picture)

    log = Logger(__name__)
    Logger.get_log(log).info('Image Saved')
    log.logger.handlers.clear()
    def Segmentation(self):
        """Collect level-1 text for every page into ``self.Text``.

        Only ``TextLevel == 1`` does any work here; level 2 is accepted but
        produces nothing.
        """
        for layout in self.PagesLayout:
            if self.TextLevel == 1:
                self.Text.append(Leve1Extraction(layout))
            # TextLevel 2 is intentionally a no-op in this variant.

        log = Logger(__name__)
        Logger.get_log(log).info('Text Segmentation Finished')
        log.logger.handlers.clear()
示例#11
0
def pdf2image(fileName):
    """Render every page of a PDF as a BGR image array.

    ``convert_from_path`` renders the pages into a temporary folder; each
    page is then turned into an array and converted from RGB to BGR channel
    order with ``cv2.cvtColor``.

    Returns:
        list: one converted image per page, in page order.
    """
    with tempfile.TemporaryDirectory() as path:
        rendered_pages = convert_from_path(fileName, output_folder=path)

        # Convert while the temporary folder still exists.
        # NOTE(review): with ``output_folder`` set, pdf2image appears to
        # return images that are loaded lazily from files in that folder, so
        # reading them after the folder is removed is fragile -- confirm
        # against the pdf2image / Pillow docs.
        PagesImage = [
            cv2.cvtColor(np.asarray(page), cv2.COLOR_RGB2BGR)
            for page in rendered_pages
        ]

    log = Logger(__name__)
    Logger.get_log(log).info('pdf2image Completed')
    log.logger.handlers.clear()

    return PagesImage
    def Segmentation(self):
        """Detect tables on every page and, at table level 2, their cells.

        For each page the detected tables are wrapped in ``Region`` and
        appended to ``self.Table`` as one list per page. When ``TableLevel``
        is 2, ``extraction`` is also run per table and the resulting column
        headers, row headers and body cells are appended per page to
        ``self.Column_Header``, ``self.Row_Header`` and ``self.Body``.
        """

        def wrap_leading(entry):
            # Put a Region built from the leading element in front, then
            # remove the first element equal to the original one, which now
            # sits at index 1. Kept as insert + remove on purpose.
            entry.insert(0, Region(entry[0]))
            entry.remove(entry[1])

        for page_layout in self.PagesLayout:
            detected = detect_table(page_layout)

            page_regions = []
            page_col_headers = []
            page_row_headers = []
            page_bodies = []

            for table_item in detected:
                page_regions.append(Region(table_item))
                if self.TableLevel == 2:
                    col_header, row_header, body_cells = extraction(
                        page_layout, table_item)

                    # Header cells may carry child cells at index 6; column
                    # headers are handled before row headers.
                    for header in (col_header, row_header):
                        for cell in header:
                            wrap_leading(cell)
                            if not cell[6] == []:
                                for child in cell[6]:
                                    wrap_leading(child)

                    for cell in body_cells:
                        wrap_leading(cell)

                    page_col_headers.append(col_header)
                    page_row_headers.append(row_header)
                    page_bodies.append(body_cells)

            self.Table.append(page_regions)
            if self.TableLevel == 2:
                self.Column_Header.append(page_col_headers)
                self.Row_Header.append(page_row_headers)
                self.Body.append(page_bodies)

        log = Logger(__name__)
        Logger.get_log(log).info('Table Segmentation Finished')
        log.logger.handlers.clear()
    def Segmentation(self):
        """Extract text for every page at the configured ``TextLevel``.

        Level 1 appends the ``Leve1Extraction`` result of each page to
        ``self.Text``. Level 2 additionally extracts page marks, notes,
        figure notes and table notes per page, plus title and author on the
        first page only, and finally post-processes the notes and removes
        them from the text via ``FigTabNoteOut``.
        """
        figure_notes = []
        table_notes = []

        for page_no, layout in enumerate(self.PagesLayout):
            if self.TextLevel == 1:
                self.Text.append(Leve1Extraction(layout))

            elif self.TextLevel == 2:
                page_mark, pg_index = PageExtraction(layout)
                note, nt_index = NoteExtraction(layout)

                figure_notes.append(FigureNoteExtraction(layout))
                table_notes.append(TableNoteExtraction(layout))

                if page_no == 0:
                    # Title and author are only looked for on the first page.
                    title, tt_index = TitleExtraction(layout)
                    author, au_index = AuthorExtraction(layout, tt_index)
                else:
                    title, tt_index = [], -1
                    author, au_index = [], []
                self.Title.append(title)
                self.Author.append(author)

                body_text = Level2Extraction(layout, pg_index, nt_index,
                                             tt_index, au_index)

                self.Page.append(page_mark)
                self.Note.append(note)
                self.Text.append(body_text)

        if self.TextLevel == 2:
            self.TableNote = NotePostProcess(table_notes, "T")
            self.FigureNote = NotePostProcess(figure_notes, "F")
            self.Text = FigTabNoteOut(self.Text, self.TableNote,
                                      self.FigureNote)

        log = Logger(__name__)
        Logger.get_log(log).info('Text Segmentation Finished')
        log.logger.handlers.clear()
示例#14
0
    def __init__(self, pages_layout):
        """Run table, text and figure analysis over ``pages_layout``.

        The three results are stored in ``self.table``, ``self.text`` and
        ``self.figure``, built in that order.
        """
        log = Logger(__name__)
        self.pages_layout = pages_layout

        log.logger.info('Analysis Start')

        layouts = self.pages_layout
        self.table = TableAnalysis(layouts)
        self.text = TextAnalysis(layouts)
        self.figure = FigureAnalysis(layouts)

        log.logger.info('Analysis Finished')
示例#15
0
    def __init__(self, pages_layout):
        """Detect the tables of every page.

        ``self.pages_table`` holds one ``detect_table`` result per entry of
        ``pages_layout``, in the same order.
        """
        self.pages_layout = pages_layout
        self.pages_table = [
            detect_table(layout) for layout in self.pages_layout
        ]

        log = Logger(__name__)
        log.logger.info('Table Analysis Finished')
示例#16
0
    def __init__(self, pages_layout):
        """Detect the text of every page.

        ``self.pages_text`` holds one ``detect_text`` result per entry of
        ``pages_layout``, in the same order.
        """
        self.pages_layout = pages_layout
        self.pages_text = [
            detect_text(layout) for layout in self.pages_layout
        ]

        log = Logger(__name__)
        log.logger.info('Text Analysis Finished')
示例#17
0
    def Segmentation(self):
        """Extract text structure for every page at the configured level.

        Level 1 appends the ``Leve1Extraction`` result to ``self.Text``.
        Level 2 runs the Zhang, Jie, Tiao and Title extractors on the page,
        in that order, and appends each result to the attribute of the same
        name.
        """
        for layout in self.PagesLayout:
            if self.TextLevel == 1:
                self.Text.append(Leve1Extraction(layout))

            elif self.TextLevel == 2:
                self.Zhang.append(ZhangExtraction(layout))
                self.Jie.append(JieExtraction(layout))
                self.Tiao.append(TiaoExtraction(layout))
                self.Title.append(TitleExtraction(layout))

        log = Logger(__name__)
        Logger.get_log(log).info('Text Segmentation Finished')
        log.logger.handlers.clear()
def ImageWrite(ImageList, fileName, fileFolder):
    """Save the array entries of ``ImageList`` as numbered JPEG files.

    Nothing is created or logged unless at least one entry is a
    ``numpy.ndarray``. Otherwise files are written to
    ``<fileFolder><stem>/`` as ``<stem>_p<n>.jpg``, where ``stem`` is
    ``fileName`` without its last four characters and ``n`` is the 1-based
    position in ``ImageList``; entries that are not arrays are skipped but
    still consume a number.
    """
    if not any(isinstance(entry, np.ndarray) for entry in ImageList):
        return

    stem = fileName[:-4]
    folder_path = fileFolder + stem
    folder_prefix = folder_path + '/'

    if not os.path.exists(folder_path):
        os.mkdir(folder_prefix)

    for number, entry in enumerate(ImageList, start=1):
        if isinstance(entry, np.ndarray):
            cv2.imwrite(folder_prefix + stem + '_p' + str(number) + '.jpg',
                        entry)

    log = Logger(__name__)
    Logger.get_log(log).info('Image Saved')
    log.logger.handlers.clear()
示例#19
0
def image_write(folder, filename, page_images, flag):
    """Write each page image to ``<folder>/<filename>/<filename>_<n>.jpg``.

    The target directory is created when missing, numbering starts at 1,
    and ``flag`` is only used as the prefix of the final log message.
    """
    target_dir = os.path.join(folder, filename)
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    for number, picture in enumerate(page_images, start=1):
        picture_name = '{}_{}.jpg'.format(filename, number)
        cv2.imwrite(os.path.join(target_dir, picture_name), picture)

    log = Logger(__name__)
    log.logger.info(flag + ' Saved')
示例#20
0
def AuthorExtraction(PageLayout, TitleIndex):
    """Collect the author boxes that sit between the title and the abstract.

    The objects after ``TitleIndex`` are searched for the first horizontal
    text box containing a line with ``abstract`` (spaces removed, case
    ignored); if none exists the same search is repeated for
    ``introduction``. Each miss is logged as critical.

    With a marker found, every horizontal text box between the title and the
    marker is kept when its ``y0`` is above both the marker line's ``y1`` and
    60% of the page height, and ``x0 + x1`` lies strictly between a quarter
    and seven quarters of the page width. Without any marker, the object
    right after the title is returned as the only author.

    Returns:
        ``(author, auIndex)`` -- the selected boxes and their positions in
        ``PageLayout._objs``.
    """
    objects = PageLayout._objs
    first_candidate = TitleIndex + 1

    def find_marker(keyword):
        # Position and top edge of the first line mentioning ``keyword``.
        for position in range(first_candidate, len(objects)):
            candidate = objects[position]
            if not isinstance(candidate, LTTextBoxHorizontal):
                continue
            for text_line in candidate:
                squeezed = text_line.get_text().replace(' ', '').lower()
                if keyword in squeezed:
                    return position, text_line.y1
        return None

    author = []
    auIndex = []

    marker = find_marker('abstract')
    if marker is None:
        log = Logger(__name__)
        Logger.get_log(log).critical('No Abstract Found')
        log.logger.handlers.clear()

        marker = find_marker('introduction')

    if marker is None:
        log = Logger(__name__)
        Logger.get_log(log).critical('No Introduction Found')
        log.logger.handlers.clear()
        author.append(objects[first_candidate])
        auIndex.append(first_candidate)
        return author, auIndex

    marker_position, marker_top = marker
    page_width = PageLayout.width
    page_height = PageLayout.height
    lowest_allowed = max(marker_top, 0.6 * page_height)

    for position in range(first_candidate, marker_position):
        candidate = objects[position]
        if candidate.y0 > lowest_allowed and isinstance(
                candidate, LTTextBoxHorizontal):
            edge_sum = candidate.x0 + candidate.x1
            if page_width / 4 < edge_sum < 7 * page_width / 4:
                author.append(candidate)
                auIndex.append(position)

    return author, auIndex
示例#21
0
def json_write(folder, filename, content):
    """Write per-page JSON files plus one combined file.

    Inside ``<folder>/<filename>/`` (created when missing) every entry of
    ``content`` is dumped to ``<filename>_<n>.json`` with 1-based numbering,
    and all entries together are dumped to ``<filename>.json`` under the key
    ``'Pages'``. Files are UTF-8 with non-ASCII characters kept as-is and a
    four-space indent.
    """
    target_dir = os.path.join(folder, filename)
    if not os.path.exists(target_dir):
        os.mkdir(target_dir)

    collected = []
    for number, page in enumerate(content, start=1):
        page_file = os.path.join(target_dir,
                                 '{}_{}.json'.format(filename, number))
        with open(page_file, 'w', encoding='utf-8') as handle:
            json.dump(page, handle, ensure_ascii=False, indent=4)
        collected.append(page)

    combined = {'Pages': collected}
    combined_file = os.path.join(target_dir, filename + '.json')
    with open(combined_file, 'w', encoding='utf-8') as handle:
        json.dump(combined, handle, ensure_ascii=False, indent=4)

    log = Logger(__name__)
    log.logger.info('JsonFile Saved')
示例#22
0
    def __init__(self):
        """Load ``conf.cfg`` and prepare the folders it names.

        Reads ``pdf_folder``, ``json_output``, ``ori_output`` and
        ``anno_output`` from the ``configuration`` section into attributes of
        the same name, creates each folder that does not exist yet, and
        stores the sorted listing of ``pdf_folder`` in ``self.file_list``.
        """
        self.logging = Logger(__name__)
        self.logging.logger.info('Start processing ConfigFile')

        folder_options = ('pdf_folder', 'json_output', 'ori_output',
                          'anno_output')

        parser = ConfigParser()
        parser.read('conf.cfg')
        # All four options are read before any folder is created.
        for option in folder_options:
            setattr(self, option, parser.get('configuration', option))

        for option in folder_options:
            directory = getattr(self, option)
            if not os.path.exists(directory):
                os.makedirs(directory)

        self.file_list = sorted(os.listdir(self.pdf_folder))

        self.logging.logger.info('ConfigFile Processed\n')
示例#23
0
 def __init__(self):
     """Create the logger, then load the configuration via ``self.config()``.

     A message is logged before and after the configuration is processed.
     """
     created = Logger(__name__)
     self.logging = created
     Logger.get_log(created).info('Start processing ConfigFile')

     self.config()

     # Re-read the attribute in case config() replaced the logger.
     Logger.get_log(self.logging).info('ConfigFile Processed\n')
示例#24
0
def FNTypeCheck(FigNoteList):
    """Return the most frequent figure-note type across all pages.

    For every note, the text of its second element (last character dropped,
    lower-cased, spaces removed) is classified with ``FNTypeCalculate``.
    Types containing ``'E'`` are discarded (presumably an error marker --
    confirm against ``FNTypeCalculate``) and the remaining types are counted.

    Args:
        FigNoteList: one sequence of notes per page; ``note[1]`` must offer
            ``get_text()``.

    Returns:
        The type with the highest count. When several types share that
        count, a critical message listing them is logged and the
        lexicographically greatest of them is returned, never anything
        smaller than ``'000'``. ``None`` is returned, with a critical log,
        when no usable type remains.
    """
    # Local import: the file's import block is not visible from here.
    from collections import Counter

    usable_types = []
    for page_notes in FigNoteList:
        for fig_note in page_notes:
            note_text = fig_note[1].get_text()[:-1].lower().replace(" ", "")
            note_type = FNTypeCalculate(note_text)
            if 'E' not in note_type:
                usable_types.append(note_type)

    if not usable_types:
        log = Logger(__name__)
        Logger.get_log(log).critical('No ImageNote')
        log.logger.handlers.clear()
        return None

    # Counter keeps first-occurrence order, so tied types are listed in the
    # order they first appeared.
    tally = Counter(usable_types)
    highest = max(tally.values())
    leaders = [[name, count] for name, count in tally.items()
               if count == highest]

    if len(leaders) == 1:
        return leaders[0][0]

    # Tie: take the greatest type string, with '000' as the floor.
    MaxType = max('000', *(name for name, _ in leaders))

    log = Logger(__name__)
    Logger.get_log(log).critical(
        'Same Type of ImageNote: {}'.format(leaders))
    log.logger.handlers.clear()

    return MaxType