Пример #1
0
def add(self, obj):
    """Append ``obj`` to the line, padding wide gaps and skipping a blank lead.

    When ``obj`` is an ``LTChar`` and ``self.word_margin`` is truthy, an
    ``LTAnno(' ')`` is inserted first if the character starts further right
    than ``self._x1`` by more than ``word_margin`` times the larger of the
    character's width and height.  An ``LTChar`` whose text is empty or
    whitespace-only is dropped while the line holds no objects yet.
    Otherwise ``self._x1`` is moved to ``obj.x1`` and the object is handed
    to ``LTTextLine.add``.
    """
    is_char = isinstance(obj, LTChar)

    if is_char and self.word_margin:
        gap_limit = self.word_margin * max(obj.width, obj.height)
        if obj.x0 - gap_limit > self._x1:
            # The gap to the previous object is wide enough to be a space.
            LTContainer.add(self, LTAnno(' '))

    # Don't let the first element be an empty string.
    if is_char and not obj.get_text().strip() and len(self._objs) == 0:
        return

    self._x1 = obj.x1
    LTTextLine.add(self, obj)
Пример #2
0
def _clean_textline(item: LTTextLine) -> Optional[LTTextLine]:
    """Attach cleaned text and font information to a text line.

    The line's text is passed through ``keep_allowed_chars`` and stripped.
    If nothing remains, the line is rejected.

    :param item: the text line to clean; it is modified in place
    :return: ``item`` with ``clean_text``, ``font_name`` and ``font_size``
        set, or ``None`` when the cleaned text is empty
    """
    stripped = keep_allowed_chars(item.get_text()).strip()

    # Skip empty and invalid lines
    if not stripped:
        return None

    # TODO: add subscript detection and use latex underscore
    # or superscript
    item.clean_text = stripped
    name, size = _font_of_mention(item)
    item.font_name = name
    item.font_size = size
    return item
Пример #3
0
def add(self, obj):
    """Add ``obj`` to this text line.

    Only ``LTChar`` objects get special treatment:

    * if ``self.word_margin`` is truthy and the character's ``x0`` lies more
      than ``word_margin * max(width, height)`` to the right of ``self._x1``,
      an ``LTAnno(' ')`` is added to the container first;
    * if the character's text is blank after stripping and the line has no
      objects yet, the character is discarded.

    Every object that is kept updates ``self._x1`` to its ``x1`` and is then
    passed on to ``LTTextLine.add``.
    """
    if isinstance(obj, LTChar):
        if self.word_margin:
            margin = self.word_margin * max(obj.width, obj.height)
            starts_after_gap = self._x1 < obj.x0 - margin
            if starts_after_gap:
                LTContainer.add(self, LTAnno(' '))

        # Don't let first element be empty strings
        is_blank = not obj.get_text().strip()
        if is_blank and not len(self._objs):
            return

    self._x1 = obj.x1
    LTTextLine.add(self, obj)
Пример #4
0
 def get_word_boundaries(
     self, mention: LTTextLine
 ) -> List[Tuple[str, float, float, float, float]]:
     """Split a line of text into words and compute each word's bounding box.

     The line's text is split on whitespace.  The ``LTChar`` objects of the
     line are then consumed in order, and the bounding boxes of the
     characters that make up each word are merged.

     :param mention: a line of text
     :return: one ``[word, y0, x0, y1, x1]`` entry per word, where the
         coordinates are the min/max over the word's characters taken from
         each character's ``bbox`` (``x0, y0, x1, y1``)
     """
     mention_text = mention.get_text()
     mention_chars: List[Tuple[str, float, float, float, float]] = []
     for obj in mention:
         if isinstance(obj, LTChar):
             x0, y0, x1, y1 = obj.bbox
             mention_chars.append((obj.get_text(), y0, x0, y1, x1))
     words = []
     mention_words: List[str] = mention_text.split()  # split on whitespace
     char_idx = 0
     for word in mention_words:
         curr_word = [word, float("Inf"), float("Inf"), float("-Inf"), float("-Inf")]
         len_idx = 0
         while len_idx < len(word):
             char, y0, x0, y1, x1 = mention_chars[char_idx]
             # Spaces and non-breaking spaces never appear inside a word
             # produced by str.split(), so skip those characters instead of
             # counting them towards the current word.
             if char in (" ", "\xa0"):
                 char_idx += 1
                 continue
             # An LTChar may carry more than one character of text, so
             # compare against a slice of the same length rather than a
             # single character.
             if word[len_idx:len_idx + len(char)] != char:
                 self.log.warning(
                     "Out of order ({}, {})".format(word, char)
                 )
             curr_word[1] = min(curr_word[1], y0)
             curr_word[2] = min(curr_word[2], x0)
             curr_word[3] = max(curr_word[3], y1)
             curr_word[4] = max(curr_word[4], x1)
             len_idx += len(char)
             char_idx += 1
         words.append(curr_word)
     return words
Пример #5
0
    def get_word_boundaries(
            self, mention: LTTextLine
    ) -> List[Tuple[str, float, float, float, float]]:
        """Break a text line into words with merged character boxes.

        :param mention: a line of text
        :return: a list with one ``[word, y0, x0, y1, x1]`` entry per
            whitespace-separated word; the coordinates are the minimum
            ``y0``/``x0`` and maximum ``y1``/``x1`` over the word's
            ``LTChar`` objects
        """
        line_text = mention.get_text()

        # Text and re-ordered bbox (y0, x0, y1, x1) of every LTChar in order.
        glyphs: List[Tuple[str, int, int, int, int]] = []
        for item in mention:
            if not isinstance(item, LTChar):
                continue
            left, bottom, right, top = item.bbox
            glyphs.append([item.get_text(), bottom, left, top, right])

        boundaries = []
        cursor = 0  # index of the next unconsumed entry in ``glyphs``
        for token in line_text.split():
            low_y = low_x = float("Inf")
            high_y = high_x = float("-Inf")
            consumed = 0  # number of characters of ``token`` matched so far
            while consumed < len(token):
                glyph_text, g_y0, g_x0, g_y1, g_x1 = glyphs[cursor]
                cursor += 1
                # Spaces (regular or non-breaking) are not part of any word.
                if glyph_text in (" ", "\xa0"):
                    continue
                expected = token[consumed:consumed + len(glyph_text)]
                if expected != glyph_text:
                    logger.warning("Out of order ({}, {})".format(
                        token, glyph_text))
                low_y = min(low_y, g_y0)
                low_x = min(low_x, g_x0)
                high_y = max(high_y, g_y1)
                high_x = max(high_x, g_x1)
                consumed += len(glyph_text)
            boundaries.append([token, low_y, low_x, high_y, high_x])
        return boundaries
Пример #6
0
def findStrFromBox2(anno,box,filename,pheight,verbose=True):
    '''Collect the text under every annotation that overlaps a layout box

    Text is extracted by running pdftotext on the annotation rectangle and
    reading its standard output.

    anno: iterable of dicts with a 'rect' entry (indexed 0..3 as
          x0, y0, x1, y1) and a 'page' entry.
    box: layout object providing is_hoverlap(), is_voverlap() and _objs.
    filename: path of the PDF file handed to pdftotext.
    pheight: page height, used to flip the y coordinate for pdftotext.
    verbose: not used in this function.

    Return (texts, num): the joined text and the number of annotations
    that overlap <box>.

    Update time: 2018-07-30 09:48:38.
    '''

    def _scaled(value):
        # pdftotext requires int coordinates: the default dpi of pdftotext
        # (72) is scaled to 720, so coordinates are multiplied by 10.
        return int(round(10.*value))

    merged = u''
    hits = 0

    #----------Create a dummy LTTextLine obj----------
    probe = LTTextLine([1,2,3,4])

    #----------------Loop through annos----------------
    for idx, mark in enumerate(anno):

        rect = mark['rect']
        probe.set_bbox(rect)   #Needs this step

        if not (box.is_hoverlap(probe) and box.is_voverlap(probe)):
            continue

        hits += 1
        chunks = []
        lines = sortY(box._objs)

        #----------------Loop through lines----------------
        for line in lines:
            if type(line) not in (LTTextLine, LTTextLineHorizontal):
                continue
            if not (line.is_hoverlap(probe) and line.is_voverlap(probe)):
                continue

            # NOTE: pdftotext coordinate has origin at top-left.
            # Coordinates from Mendeley has origin at bottom-left.
            # '-' as the output file sends the text to stdout, which
            # avoids a round trip through a temporary file.
            page = mark['page']
            command = [
                'pdftotext', '-f', page, '-l', page, '-r', 720,
                '-x', _scaled(rect[0]),
                '-y', _scaled(pheight - rect[3]),
                '-W', _scaled(rect[2] - rect[0]),
                '-H', _scaled(rect[3] - rect[1]),
                os.path.abspath(filename), '-',
            ]
            proc = Popen(list(map(str, command)), stdout=PIPE, stderr=PIPE)
            chunks.append(tools.deu(proc.communicate()[0]))

            # break to avoid double sampling. Lines may overlap, and a
            # highlight may be fetched twice if not break.
            break

        #----------------Concatenate texts----------------
        snippet = u''.join(chunks).strip(' ').strip('\n').replace('\n', ' ')

        #---------------Join with next line---------------
        if len(merged) > 1 and merged[-1] == '-':
            # Drop the trailing hyphen and glue the next piece directly on.
            merged = merged[:-1]
            joiner = u''
        else:
            joiner = u' '

        #---------------Jump---------------
        linegap, chargap = measureGap(lines)
        snippet = snippet.strip()
        is_first = idx == 0 or len(merged) == 0
        # ``line`` is whatever the line loop above last bound it to.
        if not is_first and checkJump(lastbox, rect, line, linegap, chargap):
            snippet = u' ...... ' + snippet
        merged += joiner + snippet
        lastbox = rect

    merged = merged.strip()

    #------------------Do some fixes------------------
    if merged:
        merged = wordfix.fixWord(merged)

    return merged, hits
Пример #7
0
def findStrFromBox(anno,box,verbose=True):
    '''Collect the text under every annotation that overlaps a layout box

    Text is extracted from the pdfminer layout objects held by <box>.

    anno: iterable of dicts with a 'rect' entry giving the annotation's
          bounding box.
    box: layout object providing is_hoverlap(), is_voverlap() and _objs.
    verbose: not used in this function.

    Return (texts, num): the joined text and the number of annotations
    that overlap <box>.
    '''

    merged = u''
    hits = 0

    #----------------Loop through annos----------------
    for idx, mark in enumerate(anno):

        #----------Create a dummy LTTextLine obj----------
        rect = mark['rect']
        probe = LTTextLine(rect)
        probe.set_bbox(rect)   #Needs this step

        if not (box.is_hoverlap(probe) and box.is_voverlap(probe)):
            continue

        hits += 1
        pieces = []
        lines = sortY(box._objs)

        #----------------Loop through lines----------------
        for line in lines:
            if type(line) not in (LTTextLine, LTTextLineHorizontal):
                continue
            if not (line.is_hoverlap(probe) and line.is_voverlap(probe)):
                continue

            #----------------Loop through chars----------------
            for glyph in line._objs:
                kind = type(glyph)
                # LTAnno objects are always kept; LTChar objects only when
                # they overlap the annotation rectangle.
                if kind == LTAnno or (
                        kind == LTChar
                        and glyph.is_hoverlap(probe)
                        and glyph.is_voverlap(probe)):
                    pieces.append(glyph.get_text())

        #----------------Concatenate texts----------------
        snippet = u''.join(pieces).strip(' ').strip('\n').replace('\n', ' ')

        #---------------Join with next line---------------
        if len(merged) > 1 and merged[-1] == '-':
            # Drop the trailing hyphen and glue the next piece directly on.
            merged = merged[:-1]
            joiner = u''
        else:
            joiner = u' '

        #---------------Jump---------------
        linegap, chargap = measureGap(lines)
        snippet = snippet.strip()
        is_first = idx == 0 or len(merged) == 0
        # ``line`` is whatever the line loop above last bound it to.
        if not is_first and checkJump(lastbox, rect, line, linegap, chargap):
            snippet = u' ...... ' + snippet
        merged += joiner + snippet
        lastbox = rect

    merged = merged.strip()

    #------------------Do some fixes------------------
    if merged:
        merged = wordfix.fixWord(merged)

    return merged, hits
Пример #8
0
def findStrFromBox(anno, box, verbose=True):
    '''Extract the text covered by annotations inside a page layout box

    Uses the pdfminer layout objects stored in <box> as the text source.

    anno: iterable of dicts; each needs a 'rect' entry with the bounding
          box of the annotation.
    box: layout object exposing is_hoverlap(), is_voverlap() and _objs.
    verbose: not used in this function.

    Return (texts, num), where <texts> is the combined text and <num> the
    count of annotations overlapping <box>.
    '''

    def _overlaps(obj, target):
        # True when <obj> overlaps <target> horizontally and vertically.
        return obj.is_hoverlap(target) and obj.is_voverlap(target)

    out = u''
    n_found = 0

    #----------------Loop through annos----------------
    for pos, entry in enumerate(anno):

        #----------Create a dummy LTTextLine obj----------
        area = entry['rect']
        target = LTTextLine(area)
        target.set_bbox(area)  #Needs this step

        if not _overlaps(box, target):
            continue

        n_found += 1
        fragments = []
        lines = sortY(box._objs)

        #----------------Loop through lines----------------
        for row in lines:
            if type(row) != LTTextLine and type(row) != LTTextLineHorizontal:
                continue
            if not _overlaps(row, target):
                continue

            #----------------Loop through chars----------------
            for item in row._objs:
                if type(item) == LTAnno:
                    keep = True
                elif type(item) == LTChar:
                    # Characters count only when they touch the annotation.
                    keep = _overlaps(item, target)
                else:
                    keep = False
                if keep:
                    fragments.append(item.get_text())

        #----------------Concatenate texts----------------
        piece = u''.join(fragments).strip(' ')
        piece = piece.strip('\n').replace('\n', ' ')

        #---------------Join with next line---------------
        ends_with_hyphen = len(out) > 1 and out[-1] == '-'
        if ends_with_hyphen:
            out = out[:-1]
        joiner = u'' if ends_with_hyphen else u' '

        #---------------Jump---------------
        linegap, chargap = measureGap(lines)
        piece = piece.strip()
        if pos != 0 and len(out) != 0:
            # ``row`` keeps the value last bound by the line loop above.
            if checkJump(lastbox, area, row, linegap, chargap):
                piece = u' ...... ' + piece
        out += joiner + piece
        lastbox = area

    out = out.strip()

    #------------------Do some fixes------------------
    if len(out) > 0:
        out = wordfix.fixWord(out)

    return out, n_found
Пример #9
0
def findStrFromBox2(anno, box, filename, pheight, verbose=True):
    '''Locate and extract strings from a page layout obj

    Extract text using pdftotext, which is run once per overlapping
    annotation and writes its output to 'tmp.txt' in the working directory.

    anno: iterable of dicts with a 'rect' entry (indexed 0..3 as
          x0, y0, x1, y1) and a 'page' entry.
    box: layout object providing is_hoverlap(), is_voverlap() and _objs.
    filename: path of the PDF file handed to pdftotext.
    pheight: page height, used to flip the y coordinate for pdftotext.
    verbose: if True, report a failed pdftotext call on stdout.

    Return (texts, num): the joined text and the number of annotations
    that overlap <box>.
    '''

    texts = u''
    num = 0
    # pdftotext requires int coordinates, scale default dpi of
    # pdftotext (72) to 720, and multiply coordinates by 10.
    coord2str = lambda x: int(round(10. * x))

    #----------------Loop through annos----------------
    for ii, hii in enumerate(anno):

        #----------Create a dummy LTTextLine obj----------
        hiibox = hii['rect']
        dummy = LTTextLine(hiibox)
        dummy.set_bbox(hiibox)  #Needs this step

        if box.is_hoverlap(dummy) and box.is_voverlap(dummy):
            textii = []
            num += 1

            lines = sortY(box._objs)

            #----------------Loop through lines----------------
            for lineii in lines:
                if type(lineii)!=LTTextLine and\
                        type(lineii)!=LTTextLineHorizontal:
                    continue
                if lineii.is_hoverlap(dummy) and\
                        lineii.is_voverlap(dummy):

                    #------Call pdftotext and save to a temp file------
                    # NOTE: pdftotext coordinate has origin at top-left.
                    # Coordinates from Mendeley has origin at bottom-left.
                    args=['pdftotext','-f',hii['page'],'-l',hii['page'],'-r',720,\
                            '-x',coord2str(hiibox[0]),'-y',coord2str(pheight-hiibox[3]),\
                            '-W',coord2str(hiibox[2]-hiibox[0]),'-H',coord2str(hiibox[3]-hiibox[1]),\
                            os.path.abspath(filename),'tmp.txt']
                    args = [str(arg) for arg in args]

                    # Block until pdftotext exits. Polling for
                    # `poll() != 0` never ends when the tool fails, because
                    # the exit status then stays non-zero forever.
                    status = Popen(args).wait()
                    if status == 0:
                        tii = tools.readFile('tmp.txt', False)
                        textii.append(tii)
                    elif verbose:
                        # Do not read tmp.txt here: it may be left over
                        # from an earlier, unrelated call.
                        print('pdftotext exited with status %s for %s'
                              % (status, filename))

                    # break to avoid double sampling. Lines from lineii may
                    # overlap, and may fetch a highlight twice if not break.
                    break

            #----------------Concatenate texts----------------
            textii = u''.join(textii).strip(' ')

            textii = textii.strip('\n')
            textii = textii.replace('\n', ' ')

            #---------------Join with next line---------------
            if len(texts) > 1 and texts[-1] == '-':
                # Drop the trailing hyphen and glue the next piece on.
                texts = texts[:-1]
                joiner = u''
            else:
                joiner = u' '

            #---------------Jump---------------
            linegap, chargap = measureGap(lines)
            textii = textii.strip()
            if ii != 0 and len(texts) != 0:
                # lineii keeps the value last bound by the line loop above.
                if checkJump(lastbox, hiibox, lineii, linegap, chargap):
                    textii = u' ...... ' + textii
            texts += joiner + textii

            lastbox = hiibox

    texts = texts.strip()

    #------------------Do some fixes------------------
    if len(texts) > 0:
        texts = wordfix.fixWord(texts)

    return texts, num
Пример #10
0
def findStrFromBox(anno,box,verbose=True):
    '''Collect the text under every annotation that overlaps a layout box

    The annotations are first ordered with sortAnnoY(), then the text of
    the pdfminer layout objects in <box> that overlap each annotation is
    gathered.

    anno: annotations accepted by sortAnnoY(); each resulting item needs a
          'rect' entry with the annotation's bounding box.
    box: layout object providing is_hoverlap(), is_voverlap() and _objs.
    verbose: not used in this function.

    Return (texts, num): the joined text and the number of annotations
    that overlap <box>.
    '''

    merged = u''
    hits = 0

    #-----------Sort annotations vertically-----------
    for mark in sortAnnoY(anno):

        #----------Create a dummy LTTextLine obj----------
        rect = mark['rect']
        probe = LTTextLine(rect)
        probe.set_bbox(rect)   #Needs this step

        if not (box.is_hoverlap(probe) and box.is_voverlap(probe)):
            continue

        hits += 1
        pieces = []

        #----------------Loop through lines----------------
        for line in sortY(box._objs):
            if type(line) not in (LTTextLine, LTTextLineHorizontal):
                continue
            if not (line.is_hoverlap(probe) and line.is_voverlap(probe)):
                continue

            #----------------Loop through chars----------------
            for glyph in line._objs:
                kind = type(glyph)
                # LTAnno objects are always kept; LTChar objects only when
                # they overlap the annotation rectangle.
                if kind == LTAnno or (
                        kind == LTChar
                        and glyph.is_hoverlap(probe)
                        and glyph.is_voverlap(probe)):
                    pieces.append(glyph.get_text())

        #----------------Concatenate texts----------------
        snippet = u''.join(pieces).strip(' ').strip('\n').replace('\n', ' ')

        #---------------Join with next line---------------
        if len(merged) > 1 and merged[-1] == '-':
            # Drop the trailing hyphen and glue the next piece directly on.
            merged = merged[:-1]
            joiner = u''
        else:
            joiner = u' '

        #---------------Jump---------------
        # A piece that still ends with a space gets a ' ......' marker.
        if snippet.endswith(' '):
            snippet = snippet.strip() + u' ......'
        merged += joiner + snippet

    return merged.strip(), hits