Пример #1
0
    def scrape(self):
        """Yield the events parsed from the text version of ``CAL_PDF``.

        The PDF is fetched with ``self.download_file``.  When no ``.txt``
        file sits next to the download yet, ``pdftotext`` is run on it
        (presumably producing that sibling file -- confirm against the
        ``pdftotext`` helper in scope).  ``self.parse_file`` splits the text
        into entries, the first two entries are discarded, and every object
        that ``self.handle_buffer`` produces for the remaining entries is
        tagged with ``CAL_PDF`` as its source before being yielded.
        """
        path = self.download_file(CAL_PDF)
        # Raw string: "\." in a plain literal is an invalid escape sequence.
        target = re.sub(r"\.pdf$", ".txt", path)
        if not os.path.exists(target):
            pdftotext(path)

        # The file stays open only while entries are being consumed and is
        # closed once the generator finishes or is closed by the caller.
        with open(target, 'r') as text_file:
            entries = self.parse_file(text_file)
            # Two ignorable leading entries.  The default value keeps a
            # short file from leaking StopIteration out of this generator,
            # which would surface as RuntimeError under PEP 479.
            next(entries, None)
            next(entries, None)

            for entry in entries:
                for e in self.handle_buffer(entry):
                    e.add_source(CAL_PDF)
                    yield e
Пример #2
0
def getraw(pdf):
    """Fetch ``pdf`` and return its text as extracted by ``pdftotext``.

    The document is read through ``fetch_raw(pdf).read()`` and spooled to a
    temporary file, which is then passed to ``pdftotext`` together with
    ``-nopgbrk -layout`` and ``-`` as the output target.

    The temporary file is removed even when the download, the write or the
    conversion raises, so failed runs no longer leave files behind.

    Returns whatever the ``pdftotext`` callable returns (presumably the
    extracted text; the callable is defined outside this block).
    """
    (fd, fname) = mkstemp()
    try:
        # The context manager flushes and closes the file before pdftotext
        # reads it, and also closes it when the download or write fails.
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk', '-layout', fname, '-')
    finally:
        os.unlink(fname)
Пример #3
0
def getraw(pdf):
    """Download ``pdf`` and convert it to text with ``pdftotext``.

    ``fetch_raw(pdf).read()`` supplies the document, which is written to a
    temporary file.  ``pdftotext`` is then called on that file with
    ``-nopgbrk``, ``-layout`` and ``-`` as the output argument, and its
    result is returned unchanged.

    Cleanup is guaranteed: the temporary file is closed and deleted whether
    or not any step raises.
    """
    (fd, fname) = mkstemp()
    try:
        # Closing via ``with`` guarantees the data is flushed to disk before
        # pdftotext opens the file, and releases the descriptor on errors.
        with os.fdopen(fd, "w") as spool:
            spool.write(fetch_raw(pdf).read())
        return pdftotext("-nopgbrk", "-layout", fname, "-")
    finally:
        os.unlink(fname)
Пример #4
0
    def get_events(self):
        """Yield the events parsed from the text version of ``CAL_PDF``.

        Only the current session is supported: when ``self.session`` differs
        from ``self.get_current_session()`` an ``Exception`` is raised.
        Because this is a generator, that happens on the first iteration
        rather than at call time.

        The PDF is fetched with ``self.download_file``.  When no ``.txt``
        file sits next to the download yet, ``pdftotext`` is run on it
        (presumably producing that sibling file -- confirm against the
        ``pdftotext`` helper in scope).  ``self.parse_file`` splits the text
        into entries, the first two entries are discarded, and every object
        that ``self.handle_buffer`` produces for the remaining entries is
        tagged with ``CAL_PDF`` as its source before being yielded.
        """
        if self.session != self.get_current_session():
            raise Exception("Can't do that, dude")

        path = self.download_file(CAL_PDF)
        # Raw string: "\." in a plain literal is an invalid escape sequence.
        target = re.sub(r"\.pdf$", ".txt", path)
        if not os.path.exists(target):
            pdftotext(path)

        # The file stays open only while entries are being consumed and is
        # closed once the generator finishes or is closed by the caller.
        with open(target, 'r') as text_file:
            entries = self.parse_file(text_file)
            # Two ignorable leading entries.  The default value keeps a
            # short file from leaking StopIteration out of this generator,
            # which would surface as RuntimeError under PEP 479.
            next(entries, None)
            next(entries, None)

            for entry in entries:
                for e in self.handle_buffer(entry):
                    e.add_source(CAL_PDF)
                    yield e
Пример #5
0
def get_paper_content(root, pdf_url):
    """Download a PDF and return its text content as decoded UTF-8.

    ``pdf_url`` is resolved against ``root`` and fetched with
    ``requests.get``.  The response body is written to a temporary ``.pdf``
    file, ``sh.pdftotext`` is run on it, and the sibling file whose name
    ends in ``txt`` instead of ``pdf`` is read back and returned.

    Both temporary files are removed on every exit path, including when the
    conversion or the read fails.
    """
    pdf_url = urlparse.urljoin(root, pdf_url)
    req = requests.get(pdf_url)

    pdf_fd, pdf_file_name = tempfile.mkstemp(suffix=".pdf")
    # Name of the text file that is read back after the conversion.
    text_file_name = re.sub(r"pdf$", "txt", pdf_file_name)
    try:
        # Close the PDF before converting so all bytes are on disk.
        with os.fdopen(pdf_fd, 'wb') as pdf_file:
            pdf_file.write(req.content)
        sh.pdftotext(pdf_file_name)

        with codecs.open(text_file_name, encoding='utf-8') as text_file:
            return text_file.read()
    finally:
        # The text file does not exist if the conversion failed, so only
        # remove what is actually there.
        for leftover in (pdf_file_name, text_file_name):
            if os.path.exists(leftover):
                os.remove(leftover)
Пример #6
0
def getraw(pdf):
    """Fetch ``pdf`` and return the text of a fixed crop area of its pages.

    The document is read through ``fetch_raw(pdf).read()`` and spooled to a
    temporary file.  ``pdftotext`` is then invoked with ``-nopgbrk``,
    ``-layout`` and the hard-coded ``-x 70 -y 63 -H 631 -W 473`` values
    (pdftotext's crop-area options), writing to ``-``.  Its result is
    returned unchanged.

    The temporary file is closed and deleted on every exit path, including
    when the download or the conversion raises.
    """
    (fd, fname) = mkstemp()
    try:
        # Closed before conversion so pdftotext sees the complete file.
        with os.fdopen(fd, "w") as tmp:
            tmp.write(fetch_raw(pdf).read())
        x, y, h, w = 70, 63, 631, 473
        return pdftotext("-nopgbrk", "-layout", "-x", x, "-y", y, "-H", h, "-W", w, fname, "-")
    finally:
        os.unlink(fname)
Пример #7
0
def get_paper_content(root, pdf_url):
    """Fetch the PDF at ``pdf_url`` (relative to ``root``) and return its text.

    The response body from ``requests.get`` is stored in a temporary
    ``.pdf`` file and converted with ``sh.pdftotext``.  The text is then
    read, decoded as UTF-8, from the file with the same name but a ``txt``
    ending.

    Neither temporary file is left behind: both are deleted whether the
    function returns normally or raises part-way through.
    """
    pdf_url = urlparse.urljoin(root, pdf_url)
    req = requests.get(pdf_url)

    pdf_fd, pdf_file_name = tempfile.mkstemp(suffix=".pdf")
    # Path of the text file that is read back after the conversion.
    text_file_name = re.sub(r"pdf$", "txt", pdf_file_name)
    try:
        # ``with`` flushes and closes the PDF before it is converted and
        # releases the descriptor if the write fails.
        with os.fdopen(pdf_fd, 'wb') as pdf_file:
            pdf_file.write(req.content)
        sh.pdftotext(pdf_file_name)

        with codecs.open(text_file_name, encoding='utf-8') as text_file:
            text = text_file.read()
    finally:
        # A failed conversion leaves no text file, so check before removing.
        if os.path.exists(pdf_file_name):
            os.remove(pdf_file_name)
        if os.path.exists(text_file_name):
            os.remove(text_file_name)

    return text
Пример #8
0
def getraw(pdf):
    """Download ``pdf`` and extract the text inside a fixed crop rectangle.

    ``fetch_raw(pdf).read()`` supplies the document, which is written to a
    temporary file.  ``pdftotext`` is called on it with ``-nopgbrk``,
    ``-layout`` and the hard-coded crop options ``-x 70 -y 63 -H 631
    -W 473``, with ``-`` as the output argument.  The callable's result is
    returned as is.

    The temporary file is removed even when the download, the write or the
    conversion raises.
    """
    (fd, fname) = mkstemp()
    try:
        # Closing through ``with`` flushes the data before pdftotext runs
        # and releases the descriptor on errors.
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(fetch_raw(pdf).read())
        x, y, h, w = 70, 63, 631, 473
        return pdftotext('-nopgbrk', '-layout', '-x', x, '-y', y, '-H', h, '-W', w,
                         fname, '-')
    finally:
        os.unlink(fname)
Пример #9
0
def process_pdf(content, language, noOCR=False, noPDFText=False, despeckle=False):
  """Extract text from a PDF using pdftotext and/or page-by-page OCR.

  Args:
    content: the PDF data; it is fed to each external command through
      ``sh``'s ``_in`` keyword.
    language: value passed to ``tesseract`` after ``-l``.
    noOCR: when true, the OCR stage is skipped.
    noPDFText: when true, the pdftotext stage is skipped.
    despeckle: selects ``convert_options_despeckle`` instead of
      ``convert_options`` for the page-to-image conversion (both are
      defined outside this block).

  Returns:
    With OCR disabled, the pdftotext output as is.  Otherwise the stripped
    pdftotext output, three newlines, and the stripped OCR text of all
    pages.  ``None`` is returned (and an error logged) when the result
    contains nothing but whitespace.
  """
  if noPDFText:
    logging.debug("pdftotext disabled")
    pdfText = ""
  else:
    from sh import pdftotext
    logging.debug("Extracting pdf contents using pdftotext")
    pdfText = unicode(pdftotext('-', '-', _in=content, _in_bufsize=10000))
    logging.debug("Extracted %d chars from the text", len(pdfText))

  if noOCR:
    logging.debug("OCR disabled, returning only pdf text")
  else:
    from sh import identify, tesseract, convert

    logging.debug("Starting OCR Operation")
    logging.debug("Extracing page numbers")

    # identify prints one page number per page, separated by spaces.
    rawPageNos = str(identify("-format", "%p ", "pdf:-",
                              _in=content, _in_bufsize=10000))
    # A real list (not a lazy map object) so the log line shows the numbers.
    pageNos = [int(p) for p in rawPageNos.strip().split(' ')]
    logging.debug("Found pages: %s", pageNos)

    co = convert_options if not despeckle else convert_options_despeckle
    pages = []
    for pageNo in pageNos:
      logging.debug("Processing page %d", pageNo)

      tmpFolder = tempfile.mkdtemp(prefix='imap-dms-ocr-tmp')
      try:
        logging.debug("Converting page to image in tmpfolder %s with options %s", tmpFolder, co)
        convert(co, "pdf:-[%d]" % (pageNo), tmpFolder+"/out.png",
             _in=content, _in_bufsize=10000)

        logging.debug("Running tesseract with language %s on file %s",
            language, tmpFolder+"/out.png")
        # tesseract is given the base name "out"; the result is read from
        # "out.txt" below.
        tesseract(tmpFolder+"/out.png", tmpFolder+"/out", "-l",language)

        with open(tmpFolder+"/out.txt", "r") as f:
          pageContent = unicode(f.read(), "utf-8")

        logging.debug("Found %d chars for this page", len(pageContent))
        pages.append(pageContent+u"\n")
      finally:
        # Remove the scratch directory even if a command above failed.
        shutil.rmtree(tmpFolder)

    allPages = u"".join(pages)
    pdfText=pdfText.strip()+"\n\n\n"+allPages.strip()

  if not pdfText.strip():
    logging.error("No text could be recognized")
    return None
  return pdfText
Пример #10
0
def getpdf(pdf):
    """Fetch ``pdf`` and return its layout-preserving text as a list of lines.

    The bytes from ``fetch_raw(pdf, binary=True)`` are written to a
    temporary file, ``pdftotext -layout`` is run on it with ``-`` as the
    output argument, and the result is split on newlines.

    Returns an empty list when fetching or writing the document fails
    (best effort, as before).  The temporary file is closed and removed on
    every exit path, including that failure path.
    """
    (fd, fname) = mkstemp()
    try:
        with os.fdopen(fd, 'wb') as tmp:
            try:
                tmp.write(fetch_raw(pdf, binary=True))
            except Exception:
                # Deliberate best effort: a failed download yields no lines.
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                return []
        text = pdftotext('-layout', fname, '-')
    finally:
        os.unlink(fname)
    return text.split('\n')
Пример #11
0
def getraw(pdf):
    """Fetch ``pdf`` and return the output of ``pdftotext`` for it.

    The data from ``fetch_raw(pdf).read()`` is written to a temporary file,
    which is passed to ``pdftotext`` along with ``-nopgbrk``, ``-layout``
    and ``-`` as the output argument.  The callable's result is returned
    unchanged.

    The temporary file is closed and deleted on every exit path, so an
    exception in any step no longer leaves it behind.
    """
    (fd, fname) = mkstemp()
    try:
        # Leaving the ``with`` block flushes the file before it is converted.
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(fetch_raw(pdf).read())
        return pdftotext('-nopgbrk',
                         '-layout',
                         fname,
                         '-')
    finally:
        os.unlink(fname)
Пример #12
0
def getraw(pdf):
    """Fetch ``pdf`` as bytes and return the output of ``pdftotext`` for it.

    The fetch is announced through ``log`` at level 5.  The bytes from
    ``fetch_raw(pdf, binary=True)`` are written to a temporary file, which
    is passed to ``pdftotext`` with ``-nopgbrk``, ``-layout`` and ``-`` as
    the output argument.  The callable's result is returned unchanged.

    The temporary file is closed and deleted on every exit path, including
    when the download or the conversion raises.
    """
    log(5, "fetching url: %s" % pdf)
    (fd, fname) = mkstemp()
    try:
        # Closed before conversion so pdftotext reads the complete document.
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(fetch_raw(pdf, binary=True))
        return pdftotext('-nopgbrk',
                         '-layout',
                         fname,
                         '-')
    finally:
        os.unlink(fname)
Пример #13
0
def get_region(pdf, page, x1, y1, x2, y2):
    """Return the ``pdftotext`` output for one rectangle on one page.

    ``page`` is used as both first (``-f``) and last (``-l``) page.  The
    crop origin is ``(x1, y1)`` and the crop size is the absolute
    difference between the two corner coordinates.  Output goes to ``-``
    and page breaks are suppressed with ``-nopgbrk``.
    """
    # HACK: extremely ugly. Should be reimplemented with a poppler-like
    # library, which itself only supports getting "selected" text and uses
    # different logic from the simple one in pdftotext.
    height = abs(y2 - y1)
    width = abs(x2 - x1)
    page_args = ['-f', page, '-l', page]
    crop_args = ['-x', x1, '-y', y1, '-H', height, '-W', width]
    return pdftotext('-nopgbrk', *(page_args + crop_args + [pdf, '-']))
Пример #14
0
def getraw(pdf):
    """Fetch ``pdf`` as bytes and return its unpaginated text.

    The bytes from ``fetch_raw(pdf, binary=True)`` are written to a
    temporary file and converted with ``pdftotext -layout`` (output
    argument ``-``).  ``-nopgbrk`` and crop options are deliberately not
    passed; page breaks and footers are instead removed by
    ``unpaginate(text, pdf)``, whose result is returned.

    The temporary file is closed and deleted on every exit path, including
    when the download or the conversion raises.
    """
    (fd, fname) = mkstemp()
    try:
        # Closed before conversion so pdftotext reads the complete document.
        with os.fdopen(fd, 'wb') as tmp:
            tmp.write(fetch_raw(pdf, binary=True))
        text = pdftotext('-layout',
                         fname,
                         '-')
    finally:
        os.unlink(fname)
    # remove pagebreaks and footers
    return unpaginate(text, pdf)
Пример #15
0
def getraw(pdf):
    """Fetch ``pdf`` and return the text of a fixed crop area of its pages.

    The data from ``fetch_raw(pdf).read()`` is written to a temporary file.
    ``pdftotext`` is then run on it with ``-nopgbrk``, ``-layout`` and the
    hard-coded crop options ``-x 70 -y 63 -H 631 -W 473``, with ``-`` as
    the output argument.  The callable's result is returned unchanged.

    The temporary file is closed and deleted on every exit path, so an
    exception in any step no longer leaves it behind.
    """
    (fd, fname) = mkstemp()
    try:
        # Leaving the ``with`` block flushes the file before it is converted.
        with os.fdopen(fd, 'w') as tmp:
            tmp.write(fetch_raw(pdf).read())
        x, y, h, w = 70, 63, 631, 473
        return pdftotext('-nopgbrk',
                         '-layout',
                         '-x', x,
                         '-y', y,
                         '-H', h,
                         '-W', w,
                         fname,
                         '-')
    finally:
        os.unlink(fname)
Пример #16
0
 def extract(self, document):
     """Run ``pdftotext`` on ``document`` and return the result UTF-8 encoded.

     ``-`` is passed as the output argument.  Characters that cannot be
     encoded are dropped (``'ignore'`` error handler).
     """
     extracted = pdftotext(document, "-")
     return extracted.encode('utf-8', 'ignore')