def __getPlaintext(self):
    """Extract the text of the PDF ``self.filename`` and normalize it.

    The file is looked up below ``self.wd``.  Conversion starts at the
    page returned by ``__guessDocBegining`` and runs to ``"max"``
    (presumably the last page -- confirm against ``PdfLib.pdf2txt``).
    The raw text is then passed through the ``Filter`` chain below.

    Returns:
        dict: ``self.langKey`` -> result of ``__guessLang`` on the
        normalized text, ``self.plaintextKey`` -> the normalized text,
        ``self.filenameKey`` -> ``self.filename``.

    Raises:
        Exception: if the normalized text is 6000 characters or shorter.
    """
    # Pull the raw text out of the PDF, skipping the guessed front matter.
    pdfPath = self.wd + os.sep + self.filename
    document = PdfLib(pdfPath)
    firstPage = self.__guessDocBegining(self.filename)
    rawText = document.pdf2txt(firstPage, "max")

    # Normalization pipeline; the steps are applied in exactly this order.
    normalized = (
        Filter(asString=rawText)
        .substitutions()
        .oneCharPerLine()
        .normalizeCaracters()
        .lower()
        .uselessCharacters()
        .multipleDots()
        .listEnum()
        .digits()
        .shortTokens()
        .multipleSpaces()
        .getResult()
    )

    # Empirical cut-off: anything up to this length is mostly waste.
    minLength = 6000
    if len(normalized) <= minLength:
        raise Exception(u"Document is too short.")

    language = self.__guessLang(normalized)
    return {
        self.langKey: language,
        self.plaintextKey: normalized,
        self.filenameKey: self.filename,
    }
def __guessDocBegining(self, filename):
    """Guess the page on which the actual text of a PDF begins.

    Pages 1 through 4 are converted to text one at a time.  The first
    page that has more than 1300 characters, or that contains
    "abstract" or "introduction" (case-insensitive), is taken as the
    beginning of the text.  Those values are based on experience, not
    science.

    Args:
        filename: name of the PDF file, looked up below ``self.wd``.

    Returns:
        The number of the first matching page; 5 if none of the
        inspected pages matches; ``None`` (after logging an info
        message) if the file does not exist.
    """
    pdfPath = self.wd + os.sep + filename
    if not os.path.exists(pdfPath):
        self.logger.info(u"{} does not exist.".format(filename))
        return None

    pageLimit = 5    # fallback answer; pages 1 .. pageLimit - 1 are probed
    minChars = 1300  # a page longer than this counts as body text
    markers = ("abstract", "introduction")

    for page in range(1, pageLimit):
        # A fresh PdfLib object is created for every probed page.
        pageText = PdfLib(pdfPath).pdf2txt(page)
        charCount = len(pageText)
        lowered = pageText.lower()
        if charCount > minChars or any(word in lowered for word in markers):
            return page
    return pageLimit