def add(self, obj):
    """Append ``obj`` to this line, never starting it with a blank character.

    When ``obj`` is an ``LTChar`` and ``self.word_margin`` is truthy, a
    single-space ``LTAnno`` is inserted first if the character begins more
    than ``word_margin * max(obj.width, obj.height)`` to the right of
    ``self._x1`` (the ``x1`` of the last object that was added).

    A whitespace-only ``LTChar`` offered while the line holds no objects is
    dropped without touching ``self._x1``.
    """
    is_char = isinstance(obj, LTChar)

    if is_char and self.word_margin:
        margin = self.word_margin * max(obj.width, obj.height)
        if self._x1 < obj.x0 - margin:
            LTContainer.add(self, LTAnno(' '))

    # Don't let first element be empty strings
    if is_char and not obj.get_text().strip() and len(self._objs) == 0:
        return

    self._x1 = obj.x1
    LTTextLine.add(self, obj)
def _clean_textline(item: LTTextLine) -> Optional[LTTextLine]:
    """Annotate a text line with its cleaned text and font, or reject it.

    The line's text is passed through ``keep_allowed_chars`` and stripped.

    :param item: the text line to clean; it is modified in place
    :return: ``item`` with ``clean_text``, ``font_name`` and ``font_size``
        set, or ``None`` when nothing is left after cleaning
    """
    stripped = keep_allowed_chars(item.get_text()).strip()

    # Skip empty and invalid lines
    if not stripped:
        return None

    # TODO: add subscript detection and use latex underscore
    # or superscript
    item.clean_text = stripped
    font_name, font_size = _font_of_mention(item)
    item.font_name = font_name
    item.font_size = font_size
    return item
def add(self, obj):
    """Add ``obj`` to the line, skipping a blank leading character.

    For ``LTChar`` objects two extra rules apply before the object is stored:

    * if ``self.word_margin`` is truthy and the character starts further
      than ``word_margin * max(obj.width, obj.height)`` past ``self._x1``,
      a ``' '`` ``LTAnno`` is added in front of it;
    * if the character's text is empty after stripping and the line has no
      objects yet, the character is discarded.
    """
    if isinstance(obj, LTChar):
        if self.word_margin:
            gap = self.word_margin * max(obj.width, obj.height)
            if self._x1 < obj.x0 - gap:
                LTContainer.add(self, LTAnno(' '))

        # Don't let first element be empty strings
        is_blank = not obj.get_text().strip()
        if is_blank and not len(self._objs):
            return

    self._x1 = obj.x1
    LTTextLine.add(self, obj)
def get_word_boundaries(
    self, mention: LTTextLine
) -> List[Tuple[str, float, float, float, float]]:
    """Split a line of text into words and compute a box for each word.

    The text of ``mention`` is split on whitespace. Each word is then matched,
    in order, against the ``LTChar`` objects of the line, and the extreme
    coordinates of the matched characters are accumulated.

    :param mention: a line of text
    :return: one entry per word, laid out as
        ``[word, min y0, min x0, max y1, max x1]`` (each entry is a list)
    """
    mention_text = mention.get_text()

    # One entry per LTChar: [text, y0, x0, y1, x1] (y comes before x).
    mention_chars: List[list] = []
    for obj in mention:
        if isinstance(obj, LTChar):
            x0, y0, x1, y1 = obj.bbox
            mention_chars.append([obj.get_text(), y0, x0, y1, x1])

    words = []
    mention_words: List[str] = mention_text.split()  # split on any whitespace
    char_idx = 0
    for word in mention_words:
        curr_word = [word, float("Inf"), float("Inf"), float("-Inf"), float("-Inf")]
        len_idx = 0
        while len_idx < len(word):
            char, c_y0, c_x0, c_y1, c_x1 = mention_chars[char_idx]
            char_idx += 1

            # str.split() separates on every whitespace character (NBSP, tab,
            # ...), so any whitespace glyph is a separator and can never be
            # part of a word; skip it instead of folding it into the box.
            if char.isspace():
                continue

            # A single glyph may carry several characters, so compare a slice
            # of the same length rather than one character.
            if word[len_idx:len_idx + len(char)] != char:
                self.log.warning("Out of order ({}, {})".format(word, char))

            curr_word[1] = min(curr_word[1], c_y0)
            curr_word[2] = min(curr_word[2], c_x0)
            curr_word[3] = max(curr_word[3], c_y1)
            curr_word[4] = max(curr_word[4], c_x1)
            len_idx += len(char)
        words.append(curr_word)
    return words
def get_word_boundaries(
    self, mention: LTTextLine
) -> List[Tuple[str, float, float, float, float]]:
    """Split a line of text into words.

    The text of ``mention`` is split on whitespace. Each word is then matched,
    in order, against the ``LTChar`` objects of the line, and the extreme
    coordinates of the matched characters are accumulated.

    :param mention: a line of text
    :return: a list of words, each laid out as
        ``[word, min y0, min x0, max y1, max x1]`` (each entry is a list)
    """
    mention_text = mention.get_text()

    # One entry per LTChar: [text, y0, x0, y1, x1] (y comes before x).
    mention_chars: List[list] = []
    for obj in mention:
        if isinstance(obj, LTChar):
            x0, y0, x1, y1 = obj.bbox
            mention_chars.append([obj.get_text(), y0, x0, y1, x1])

    words = []
    mention_words: List[str] = mention_text.split()  # split on any whitespace
    char_idx = 0
    for word in mention_words:
        curr_word = [
            word, float("Inf"), float("Inf"), float("-Inf"), float("-Inf")
        ]
        len_idx = 0
        while len_idx < len(word):
            char, c_y0, c_x0, c_y1, c_x1 = mention_chars[char_idx]
            char_idx += 1

            # str.split() separates on every whitespace character, not only
            # " " and "\xa0"; skip all of them so that e.g. a tab or thin
            # space glyph is never folded into a word's box.
            if char.isspace():
                continue

            # A glyph may carry several characters; compare a same-length
            # slice of the word.
            if word[len_idx:len_idx + len(char)] != char:
                logger.warning("Out of order ({}, {})".format(word, char))

            curr_word[1] = min(curr_word[1], c_y0)
            curr_word[2] = min(curr_word[2], c_x0)
            curr_word[3] = max(curr_word[3], c_y1)
            curr_word[4] = max(curr_word[4], c_x1)
            len_idx += len(char)
        words.append(curr_word)
    return words
def findStrFromBox2(anno,box,filename,pheight,verbose=True):
    '''Locate and extract strings from a page layout obj

    Extract text using pdftotext

    For every annotation whose rectangle overlaps <box> both horizontally
    and vertically, pdftotext is run once on that rectangle (as soon as the
    first overlapping text line is found) and its stdout is appended to the
    running text.

    <anno>: sequence of dicts, read through their 'rect' and 'page' keys.
    <box>: layout obj providing is_hoverlap(), is_voverlap() and _objs.
    <filename>: path of the pdf handed to pdftotext.
    <pheight>: page height, used to flip the y coordinate.
    <verbose>: not used in this function.

    Return <texts>, <num>: the accumulated text and the number of
                           annotations that overlapped <box>.
    '''

    def _scaled(value):
        # pdftotext requires int coordinates, scale default dpi of
        # pdftotext (72) to 720, and multiply coordinates by 10.
        return int(round(10.*value))

    merged = u''
    nhits = 0

    # One dummy LTTextLine obj, re-boxed for every annotation
    probe = LTTextLine([1,2,3,4])

    for idx, note in enumerate(anno):

        rect = note['rect']
        probe.set_bbox(rect)   # Needs this step

        if not (box.is_hoverlap(probe) and box.is_voverlap(probe)):
            continue

        nhits += 1
        pieces = []
        lines = sortY(box._objs)

        for line in lines:
            if type(line) not in (LTTextLine, LTTextLineHorizontal):
                continue
            if not (line.is_hoverlap(probe) and line.is_voverlap(probe)):
                continue

            # NOTE: pdftotext coordinate has origin at top-left.
            # Coordinates from Mendeley has origin at bottom-left.
            # NOTE: use '-' as the output for pdftotext to direct the
            # output to stdout (an earlier form of this call wrote to
            # 'tmp.txt' instead).
            page = note['page']
            raw_args = [
                'pdftotext',
                '-f', page,
                '-l', page,
                '-r', 720,
                '-x', _scaled(rect[0]),
                '-y', _scaled(pheight - rect[3]),
                '-W', _scaled(rect[2] - rect[0]),
                '-H', _scaled(rect[3] - rect[1]),
                os.path.abspath(filename),
                '-',
            ]
            cmd = [str(token) for token in raw_args]

            proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
            captured = proc.communicate()[0]
            pieces.append(tools.deu(captured))

            # break to avoid double sampling. Lines may overlap, and
            # may fetch a highlight twice if not break.
            break

        # Concatenate what was captured for this annotation
        snippet = u''.join(pieces).strip(' ').strip('\n').replace('\n', ' ')

        # A trailing hyphen on the text so far is dropped and the new
        # piece is glued on without a space.
        hyphenated = len(merged) > 1 and merged[-1] == '-'
        if hyphenated:
            merged = merged[:-1]
        joiner = u'' if hyphenated else u' '

        linegap, chargap = measureGap(lines)
        snippet = snippet.strip()

        # Mark a jump (only checked once some text has been collected)
        if idx != 0 and len(merged) != 0 and \
                checkJump(prev_rect, rect, line, linegap, chargap):
            snippet = u' ...... ' + snippet

        merged += joiner + snippet
        prev_rect = rect

    merged = merged.strip()

    # Do some fixes
    if merged:
        merged = wordfix.fixWord(merged)

    return merged, nhits
def findStrFromBox(anno,box,verbose=True):
    '''Locate and extract strings from a page layout obj

    Extract text using pdfminer

    For every annotation whose rectangle overlaps <box> both horizontally
    and vertically, the characters of the overlapping text lines are
    gathered and appended to the running text.

    <anno>: sequence of dicts, read through their 'rect' key.
    <box>: layout obj providing is_hoverlap(), is_voverlap() and _objs.
    <verbose>: not used in this function.

    Return <texts>, <num>: the accumulated text and the number of
                           annotations that overlapped <box>.
    '''

    merged = u''
    nhits = 0

    for idx, note in enumerate(anno):

        # Wrap the rectangle in a dummy LTTextLine obj so the overlap
        # tests can be run against it.
        rect = note['rect']
        probe = LTTextLine(rect)
        probe.set_bbox(rect)   # Needs this step

        if not (box.is_hoverlap(probe) and box.is_voverlap(probe)):
            continue

        nhits += 1
        pieces = []
        lines = sortY(box._objs)

        for line in lines:
            if type(line) not in (LTTextLine, LTTextLineHorizontal):
                continue
            if not (line.is_hoverlap(probe) and line.is_voverlap(probe)):
                continue

            for glyph in line._objs:
                if type(glyph) == LTAnno:
                    # LTAnno objs are taken unconditionally
                    wanted = True
                elif type(glyph) == LTChar:
                    wanted = glyph.is_hoverlap(probe) and \
                        glyph.is_voverlap(probe)
                else:
                    wanted = False

                if wanted:
                    pieces.append(glyph.get_text())

        # Concatenate what was gathered for this annotation
        snippet = u''.join(pieces).strip(' ').strip('\n').replace('\n', ' ')

        # A trailing hyphen on the text so far is dropped and the new
        # piece is glued on without a space.
        hyphenated = len(merged) > 1 and merged[-1] == '-'
        if hyphenated:
            merged = merged[:-1]
        joiner = u'' if hyphenated else u' '

        linegap, chargap = measureGap(lines)
        snippet = snippet.strip()

        # Mark a jump (only checked once some text has been collected).
        # <line> is whatever the line loop above visited last.
        if idx != 0 and len(merged) != 0 and \
                checkJump(prev_rect, rect, line, linegap, chargap):
            snippet = u' ...... ' + snippet

        merged += joiner + snippet
        prev_rect = rect

    merged = merged.strip()

    # Do some fixes
    if merged:
        merged = wordfix.fixWord(merged)

    return merged, nhits
def findStrFromBox(anno, box, verbose=True):
    '''Locate and extract strings from a page layout obj

    Extract text using pdfminer

    Each annotation rectangle that overlaps <box> horizontally and
    vertically contributes the text of the characters found under it.

    <anno>: sequence of dicts, read through their 'rect' key.
    <box>: layout obj providing is_hoverlap(), is_voverlap() and _objs.
    <verbose>: not used in this function.

    Return <texts>, <num>: the accumulated text and the number of
                           annotations that overlapped <box>.
    '''

    result = u''
    count = 0

    for index, highlight in enumerate(anno):

        # Dummy LTTextLine obj carrying the annotation rectangle
        area = highlight['rect']
        marker = LTTextLine(area)
        marker.set_bbox(area)  # Needs this step

        if not (box.is_hoverlap(marker) and box.is_voverlap(marker)):
            continue

        count += 1
        fragments = []
        lines = sortY(box._objs)

        for row in lines:
            if type(row) not in (LTTextLine, LTTextLineHorizontal):
                continue
            if not (row.is_hoverlap(marker) and row.is_voverlap(marker)):
                continue

            for item in row._objs:
                if type(item) == LTAnno:
                    # LTAnno objs are taken unconditionally
                    fragments.append(item.get_text())
                    continue
                if type(item) != LTChar:
                    continue
                if item.is_hoverlap(marker) and item.is_voverlap(marker):
                    fragments.append(item.get_text())

        # Concatenate the fragments of this annotation
        piece = u''.join(fragments).strip(' ')
        piece = piece.strip('\n').replace('\n', ' ')

        # Text so far ending in a hyphen: drop the hyphen and join
        # the next piece directly.
        if len(result) > 1 and result[-1] == '-':
            result = result[:-1]
            glue = u''
        else:
            glue = u' '

        linegap, chargap = measureGap(lines)
        piece = piece.strip()

        # Jump marker; <row> is the last line visited by the loop above.
        has_previous = index != 0 and len(result) != 0
        if has_previous and checkJump(last_area, area, row, linegap,
                                      chargap):
            piece = u' ...... ' + piece

        result += glue + piece
        last_area = area

    result = result.strip()

    # Do some fixes
    if result:
        result = wordfix.fixWord(result)

    return result, count
def findStrFromBox2(anno, box, filename, pheight, verbose=True):
    '''Locate and extract strings from a page layout obj

    Extract text using pdftotext

    For every annotation whose rectangle overlaps <box> both horizontally
    and vertically, pdftotext is run once on that rectangle (as soon as the
    first overlapping text line is found), writing to 'tmp.txt' in the
    current working directory, and the content of that file is appended to
    the running text.

    <anno>: sequence of dicts, read through their 'rect' and 'page' keys.
    <box>: layout obj providing is_hoverlap(), is_voverlap() and _objs.
    <filename>: path of the pdf handed to pdftotext.
    <pheight>: page height, used to flip the y coordinate.
    <verbose>: not used in this function.

    Return <texts>, <num>: the accumulated text and the number of
                           annotations that overlapped <box>.
    '''

    def coord2str(x):
        # pdftotext requires int coordinates, scale default dpi of
        # pdftotext (72) to 720, and multiply coordinates by 10.
        return int(round(10. * x))

    texts = u''
    num = 0

    #----------------Loop through annos----------------
    for ii, hii in enumerate(anno):

        #----------Create a dummy LTTextLine obj----------
        hiibox = hii['rect']
        dummy = LTTextLine(hiibox)
        dummy.set_bbox(hiibox)  #Needs this step

        if not (box.is_hoverlap(dummy) and box.is_voverlap(dummy)):
            continue

        textii = []
        num += 1
        lines = sortY(box._objs)

        #----------------Loop through lines----------------
        for lineii in lines:
            if type(lineii) != LTTextLine and \
                    type(lineii) != LTTextLineHorizontal:
                continue
            if not (lineii.is_hoverlap(dummy) and lineii.is_voverlap(dummy)):
                continue

            #------Call pdftotext and save to a temp file------
            # NOTE: pdftotext coordinate has origin at top-left.
            # Coordinates from Mendeley has origin at bottom-left.
            args = ['pdftotext', '-f', hii['page'], '-l', hii['page'],
                    '-r', 720,
                    '-x', coord2str(hiibox[0]),
                    '-y', coord2str(pheight - hiibox[3]),
                    '-W', coord2str(hiibox[2] - hiibox[0]),
                    '-H', coord2str(hiibox[3] - hiibox[1]),
                    os.path.abspath(filename), 'tmp.txt']
            args = [str(arg) for arg in args]

            pp = Popen(args)
            # wait() returns once the process has exited, whatever its
            # status. Polling until poll() == 0 never finished when
            # pdftotext exited with a non-zero status.
            if pp.wait() == 0:
                textii.append(tools.readFile('tmp.txt', False))
            # On failure nothing is read: 'tmp.txt' could still hold the
            # output of an earlier call.

            # break to avoid double sampling. Lines from lineii may
            # overlap, and may fetch a highlight twice if not break.
            break

        #----------------Concatenate texts----------------
        textii = u''.join(textii).strip(' ')
        textii = textii.strip('\n')
        textii = textii.replace('\n', ' ')

        #---------------Join with next line---------------
        # A trailing hyphen on the text so far is dropped and the new
        # piece is glued on without a space.
        if len(texts) > 1 and texts[-1] == '-':
            texts = texts[:-1]
            joiner = u''
        else:
            joiner = u' '

        #---------------Jump---------------
        linegap, chargap = measureGap(lines)
        textii = textii.strip()

        # The jump check only runs once some text has been collected.
        if ii != 0 and len(texts) != 0 and \
                checkJump(lastbox, hiibox, lineii, linegap, chargap):
            textii = u' ...... ' + textii

        texts += joiner + textii
        lastbox = hiibox

    texts = texts.strip()

    #------------------Do some fixes------------------
    if len(texts) > 0:
        texts = wordfix.fixWord(texts)

    return texts, num
def findStrFromBox(anno,box,verbose=True):
    '''Locate and extract strings from a page layout obj

    The annotations are first ordered with sortAnnoY(). Each one whose
    rectangle overlaps <box> horizontally and vertically contributes the
    text of the characters found under it.

    <anno>: annotations, given to sortAnnoY(); each resulting item is read
            through its 'rect' key.
    <box>: layout obj providing is_hoverlap(), is_voverlap() and _objs.
    <verbose>: not used in this function.

    Return <texts>, <num>: the accumulated text and the number of
                           annotations that overlapped <box>.
    '''

    merged = u''
    nhits = 0

    # Sort annotations vertically, then walk them
    for note in sortAnnoY(anno):

        # Dummy LTTextLine obj carrying the annotation rectangle
        rect = note['rect']
        probe = LTTextLine(rect)
        probe.set_bbox(rect)   # Needs this step

        if not (box.is_hoverlap(probe) and box.is_voverlap(probe)):
            continue

        nhits += 1
        pieces = []

        for line in sortY(box._objs):
            if type(line) not in (LTTextLine, LTTextLineHorizontal):
                continue
            if not (line.is_hoverlap(probe) and line.is_voverlap(probe)):
                continue

            for glyph in line._objs:
                if type(glyph) == LTAnno:
                    # LTAnno objs are taken unconditionally
                    wanted = True
                elif type(glyph) == LTChar:
                    wanted = glyph.is_hoverlap(probe) and \
                        glyph.is_voverlap(probe)
                else:
                    wanted = False

                if wanted:
                    pieces.append(glyph.get_text())

        # Concatenate what was gathered for this annotation
        snippet = u''.join(pieces).strip(' ').strip('\n').replace('\n', ' ')

        # A trailing hyphen on the text so far is dropped and the new
        # piece is glued on without a space.
        hyphenated = len(merged) > 1 and merged[-1] == '-'
        if hyphenated:
            merged = merged[:-1]
        joiner = u'' if hyphenated else u' '

        # Jump: a piece still ending in a space gets a ' ......' marker
        if snippet.endswith(' '):
            snippet = snippet.strip() + u' ......'

        merged += joiner + snippet

    merged = merged.strip()

    return merged, nhits