def scrape(self):
    """Yield every event parsed from the calendar PDF.

    Downloads ``CAL_PDF``, converts it to text with ``pdftotext`` unless a
    ``.txt`` file with the same stem already exists, feeds that text file to
    ``self.parse_file`` and yields each object produced by
    ``self.handle_buffer`` after registering ``CAL_PDF`` as its source.
    """
    path = self.download_file(CAL_PDF)
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    target = re.sub(r"\.pdf$", ".txt", path)
    if not os.path.exists(target):
        # NOTE(review): presumably pdftotext writes its output next to the
        # input with a .txt suffix, i.e. at `target` -- confirm.
        pdftotext(path)

    # Keep the file open for as long as entries are consumed, and close it
    # once the generator is exhausted or closed.
    with open(target, 'r') as text_file:
        entries = self.parse_file(text_file)
        # The first two entries are ignorable. A default is supplied so a
        # short file ends the generator quietly instead of leaking
        # StopIteration (a RuntimeError under PEP 479).
        next(entries, None)
        next(entries, None)
        for entry in entries:
            for e in self.handle_buffer(entry):
                e.add_source(CAL_PDF)
                yield e
def getraw(pdf):
    """Fetch *pdf* and return its text as produced by ``pdftotext``.

    The fetched document is spooled to a temporary file, converted with
    ``pdftotext -nopgbrk -layout`` (output to stdout) and the conversion
    result is returned.  The temporary file is removed even when fetching
    or converting fails.
    """
    (fd, fname) = mkstemp()
    try:
        # NOTE(review): the file is opened in text mode although PDF data is
        # binary; kept as-is because fetch_raw()'s return type is not
        # visible here -- confirm whether 'wb' is appropriate.
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk', '-layout', fname, '-')
    finally:
        # mkstemp() leaves deletion to the caller.
        os.unlink(fname)
def getraw(pdf):
    """Return the ``pdftotext`` rendering of the document fetched from *pdf*.

    The fetched content is written to a temporary file which is then
    converted with ``pdftotext -nopgbrk -layout`` writing to stdout.  The
    temporary file is always deleted, including on failure.
    """
    (fd, fname) = mkstemp()
    try:
        # NOTE(review): text mode is used for what is presumably binary PDF
        # data; left unchanged because fetch_raw() is defined elsewhere --
        # confirm.
        with os.fdopen(fd, "w") as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext("-nopgbrk", "-layout", fname, "-")
    finally:
        # The temp file must not outlive this call, whatever happens above.
        os.unlink(fname)
def get_events(self):
    """Yield every event parsed from the calendar PDF of the current session.

    Raises:
        Exception: if ``self.session`` is not the session returned by
            ``self.get_current_session()``.

    Otherwise downloads ``CAL_PDF``, converts it to text with ``pdftotext``
    unless the matching ``.txt`` file already exists, and yields each object
    produced by ``self.handle_buffer`` after registering ``CAL_PDF`` as its
    source.
    """
    if self.session != self.get_current_session():
        raise Exception("Can't do that, dude")

    path = self.download_file(CAL_PDF)
    # Raw string: "\." in a plain literal is an invalid escape sequence.
    target = re.sub(r"\.pdf$", ".txt", path)
    if not os.path.exists(target):
        # NOTE(review): presumably pdftotext writes its output next to the
        # input with a .txt suffix, i.e. at `target` -- confirm.
        pdftotext(path)

    # The file stays open while entries are consumed and is closed when the
    # generator finishes or is closed.
    with open(target, 'r') as text_file:
        entries = self.parse_file(text_file)
        # The first two entries are ignorable. The default keeps a short
        # file from leaking StopIteration out of this generator (which is a
        # RuntimeError under PEP 479).
        next(entries, None)
        next(entries, None)
        for entry in entries:
            for e in self.handle_buffer(entry):
                e.add_source(CAL_PDF)
                yield e
def get_paper_content(root, pdf_url):
    """Download a PDF and return its text content.

    Args:
        root: base URL that *pdf_url* is resolved against.
        pdf_url: absolute or relative URL of the PDF.

    Returns:
        The text written by ``pdftotext`` for the downloaded PDF, decoded
        as UTF-8.

    Both temporary files (the downloaded ``.pdf`` and the generated
    ``.txt``) are removed before returning, also when conversion or
    reading fails.
    """
    pdf_url = urlparse.urljoin(root, pdf_url)
    req = requests.get(pdf_url)

    pdf_fd, pdf_file_name = tempfile.mkstemp(suffix=".pdf")
    # pdftotext is invoked with the input path only; the text is then read
    # from the same path with the trailing "pdf" replaced by "txt".
    text_file_name = re.sub("pdf$", "txt", pdf_file_name)
    try:
        with os.fdopen(pdf_fd, 'wb') as pdf_file:
            pdf_file.write(req.content)
        sh.pdftotext(pdf_file_name)
        with codecs.open(text_file_name, encoding='utf-8') as text_file:
            return text_file.read()
    finally:
        os.remove(pdf_file_name)
        # The text file only exists if pdftotext got far enough to create it.
        if os.path.exists(text_file_name):
            os.remove(text_file_name)
def getraw(pdf):
    """Fetch *pdf* and return the text of a fixed crop area of its pages.

    The fetched document is spooled to a temporary file and converted with
    ``pdftotext -nopgbrk -layout`` restricted to a hard-coded crop area
    (``-x``/``-y`` origin, ``-W``/``-H`` size), writing to stdout.  The
    temporary file is removed even when fetching or converting fails.
    """
    # Crop area handed to pdftotext: origin (x, y), height h, width w.
    x, y, h, w = 70, 63, 631, 473
    (fd, fname) = mkstemp()
    try:
        # NOTE(review): text mode is used for what is presumably binary PDF
        # data; left unchanged because fetch_raw() is defined elsewhere --
        # confirm.
        with os.fdopen(fd, "w") as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext(
            "-nopgbrk", "-layout",
            "-x", x, "-y", y, "-H", h, "-W", w,
            fname, "-",
        )
    finally:
        # mkstemp() leaves deletion to the caller.
        os.unlink(fname)
def getraw(pdf):
    """Return the ``pdftotext`` output for a fixed crop area of *pdf*.

    The fetched content is written to a temporary file, which is converted
    with ``pdftotext -nopgbrk -layout`` limited to a hard-coded crop area
    and sent to stdout.  The temporary file is always deleted, including
    when an error occurs.
    """
    # Crop area handed to pdftotext: origin (x, y), height h, width w.
    x, y, h, w = 70, 63, 631, 473
    (fd, fname) = mkstemp()
    try:
        # NOTE(review): the handle is opened in text mode although the
        # payload is presumably binary; kept because fetch_raw() is not
        # visible here -- confirm.
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext(
            '-nopgbrk', '-layout',
            '-x', x, '-y', y, '-H', h, '-W', w,
            fname, '-',
        )
    finally:
        # The temp file must not outlive this call, whatever happens above.
        os.unlink(fname)
def process_pdf(content, language, noOCR=False, noPDFText=False, despeckle=False):
    """Extract text from a PDF using pdftotext and/or OCR.

    Args:
        content: PDF data; it is fed to the external commands on stdin.
        language: language passed to ``tesseract`` via ``-l``.
        noOCR: when true, skip the OCR pass.
        noPDFText: when true, skip the ``pdftotext`` pass.
        despeckle: when true, ``convert`` is run with
            ``convert_options_despeckle`` instead of ``convert_options``.

    Returns:
        With OCR disabled, the ``pdftotext`` output as-is.  With OCR
        enabled, the stripped ``pdftotext`` output, three newlines, then
        the stripped OCR text of all pages.  ``None`` (after logging an
        error) when the result contains nothing but whitespace.
    """
    if noPDFText:
        logging.debug("pdftotext disabled")
        pdfText = ""
    else:
        from sh import pdftotext
        logging.debug("Extracting pdf contents using pdftotext")
        pdfText = unicode(pdftotext('-', '-', _in=content, _in_bufsize=10000))
        logging.debug("Extracted %d chars from the text", len(pdfText))

    if noOCR:
        logging.debug("OCR disabled, returning only pdf text")
    else:
        from sh import identify, tesseract, convert
        logging.debug("Starting OCR Operation")
        logging.debug("Extracing page numbers")
        # identify prints one "%p " token per page; turn them into ints.
        identified = str(identify("-format", "%p ", "pdf:-",
                                  _in=content, _in_bufsize=10000))
        pageNos = [int(token) for token in identified.strip().split(' ')]
        logging.debug("Found pages: %s", pageNos)

        pages = []
        for pageNo in pageNos:
            logging.debug("Processing page %d", pageNo)
            tmpFolder = tempfile.mkdtemp(prefix='imap-dms-ocr-tmp')
            try:
                co = convert_options if not despeckle else convert_options_despeckle
                logging.debug("Converting page to image in tmpfolder %s with options %s",
                              tmpFolder, co)
                convert(co, "pdf:-[%d]" % (pageNo), tmpFolder + "/out.png",
                        _in=content, _in_bufsize=10000)
                logging.debug("Running tesseract with language %s on file %s",
                              language, tmpFolder + "/out.png")
                # tesseract is given the base name "out"; its text is read
                # back from out.txt below.
                tesseract(tmpFolder + "/out.png", tmpFolder + "/out", "-l", language)
                with open(tmpFolder + "/out.txt", "r") as f:
                    pageContent = unicode(f.read(), "utf-8")
                logging.debug("Found %d chars for this page", len(pageContent))
                pages.append(pageContent + u"\n")
            finally:
                # Remove the scratch directory even if convert/tesseract
                # failed for this page.
                shutil.rmtree(tmpFolder)

        allPages = u"".join(pages)
        pdfText = pdfText.strip() + "\n\n\n" + allPages.strip()

    if not pdfText.strip():
        logging.error("No text could be recognized")
        return None
    return pdfText
def getpdf(pdf):
    """Fetch *pdf* and return its ``pdftotext -layout`` output as lines.

    Returns:
        The converted text split on ``'\\n'``, or an empty list when
        fetching or writing the document fails (best effort).

    The temporary file used for the conversion is closed and removed on
    every path, including the early return on a failed fetch.
    """
    (fd, fname) = mkstemp()
    try:
        with os.fdopen(fd, 'wb') as tmp:
            try:
                tmp.write(fetch_raw(pdf, binary=True))
            except Exception:
                # Deliberate best effort: an unfetchable document yields no
                # lines.  Narrowed from a bare except so KeyboardInterrupt
                # and SystemExit still propagate.
                return []
        text = pdftotext('-layout', fname, '-')
    finally:
        # mkstemp() leaves deletion to the caller.
        os.unlink(fname)
    return text.split('\n')
def getraw(pdf):
    """Convert the document fetched from *pdf* to text with ``pdftotext``.

    The fetched content goes through a temporary file that is converted
    with ``pdftotext -nopgbrk -layout`` (output on stdout).  The temporary
    file is cleaned up on success and on failure alike.
    """
    (fd, fname) = mkstemp()
    try:
        # NOTE(review): written in text mode although the data is
        # presumably a binary PDF; unchanged because fetch_raw() is defined
        # elsewhere -- confirm.
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk', '-layout', fname, '-')
    finally:
        # Always drop the temp file; mkstemp() never deletes it itself.
        os.unlink(fname)
def getraw(pdf):
    """Fetch *pdf* in binary mode and return its ``pdftotext`` output.

    Logs the URL being fetched, spools the document to a temporary file,
    converts it with ``pdftotext -nopgbrk -layout`` (output on stdout) and
    returns the result.  The temporary file is removed even when fetching
    or converting fails.
    """
    log(5, "fetching url: %s" % pdf)
    (fd, fname) = mkstemp()
    try:
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(fetch_raw(pdf, binary=True))
        return pdftotext('-nopgbrk', '-layout', fname, '-')
    finally:
        # mkstemp() leaves deletion to the caller.
        os.unlink(fname)
def get_region(pdf, page, x1, y1, x2, y2):
    """Return the text inside one rectangular region of a single PDF page.

    ``pdftotext`` is restricted to *page* (as both first and last page) and
    to the crop area starting at ``(x1, y1)`` whose size is the absolute
    difference between the two corner points.  Page breaks are suppressed
    and the text is written to stdout.
    """
    # HACK: this is extremely ugly.  It should be reimplemented with a
    # poppler-like library; that, however, only supports fetching
    # "selected" text, which follows a different logic than the simple one
    # used in pdftotext.
    height = abs(y2 - y1)
    width = abs(x2 - x1)
    page_range = ['-f', page, '-l', page]
    crop_area = ['-x', x1, '-y', y1, '-H', height, '-W', width]
    arguments = ['-nopgbrk'] + page_range + crop_area + [pdf, '-']
    return pdftotext(*arguments)
def getraw(pdf):
    """Fetch *pdf* in binary mode and return its unpaginated text.

    The document is spooled to a temporary file, converted with
    ``pdftotext -layout`` (output on stdout) and the result is passed
    through ``unpaginate`` together with *pdf*.  The temporary file is
    removed even when fetching or converting fails.
    """
    (fd, fname) = mkstemp()
    try:
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(fetch_raw(pdf, binary=True))
        # '-nopgbrk' and the '-x'/'-y'/'-H'/'-W' crop options are
        # intentionally not passed here; presumably unpaginate() relies on
        # the page breaks being present -- confirm.
        text = pdftotext('-layout', fname, '-')
    finally:
        # mkstemp() leaves deletion to the caller.
        os.unlink(fname)
    # remove pagebreaks and footers
    return unpaginate(text, pdf)
def getraw(pdf):
    """Convert a fixed crop area of the document fetched from *pdf* to text.

    The fetched content goes through a temporary file that is converted
    with ``pdftotext -nopgbrk -layout`` limited to a hard-coded crop area,
    with the text sent to stdout.  The temporary file is cleaned up on
    success and on failure alike.
    """
    # Crop area handed to pdftotext: origin (x, y), height h, width w.
    x, y, h, w = 70, 63, 631, 473
    (fd, fname) = mkstemp()
    try:
        # NOTE(review): written in text mode although the data is
        # presumably a binary PDF; unchanged because fetch_raw() is defined
        # elsewhere -- confirm.
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext(
            '-nopgbrk', '-layout',
            '-x', x, '-y', y, '-H', h, '-W', w,
            fname, '-',
        )
    finally:
        # Always drop the temp file; mkstemp() never deletes it itself.
        os.unlink(fname)
def extract(self, document):
    """Return the ``pdftotext`` output for *document*, UTF-8 encoded.

    ``pdftotext`` is asked to write to stdout (``"-"``); characters that
    cannot be encoded are dropped (``'ignore'``).
    """
    converted = pdftotext(document, "-")
    encoded = converted.encode('utf-8', 'ignore')
    return encoded