Python IngestorException примеры использования

Язык программирования: Python

Пространство имен/Пакет: aleph.ingest.ingestor

Класс/Тип: IngestorException

Примеров на hotexamples.com: 11

Python IngestorException - 11 примеров найдено. Это лучшие примеры Python кода для aleph.ingest.ingestor.IngestorException, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

IngestorException (11)

Основные методы

IngestorException (11)

Пример #1

Показать файл

Файл: tesseract.py Проект: wilbrodn/aleph

def extract_image_data(data, languages=None):
    """Extract text from a binary string of image data via Tesseract.

    A previously cached OCR result for the same data/languages pair is
    returned directly when ``Cache.get_ocr`` has one; otherwise the image is
    run through Tesseract and the result is stored with ``Cache.set_ocr``.

    Returns the extracted text, or ``None`` when the image cannot be opened
    (too large, or an unknown format).

    Raises ``IngestorException`` if ``TESSDATA_PREFIX`` is not configured.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")
    languages = get_languages_iso3(languages)
    text = Cache.get_ocr(data, languages)
    if text is not None:
        return text
    try:
        img = Image.open(StringIO(data))
    except DecompressionBombWarning as dce:
        # A bare "%" is an incomplete format specifier: the logging module
        # fails to render the record and the message is lost. Use %r.
        log.debug("Image too large: %r", dce)
        return None
    except IOError as ioe:
        log.info("Unknown image format: %r", ioe)
        return None
    # TODO: play with contrast and sharpening the images.
    extractor = Tesseract(tessdata_prefix, lang=languages)
    extractor.set_image(img)
    extractor.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    # get_text() may return a falsy value; normalise to an empty string
    # before decoding.
    text = extractor.get_text() or ''
    text = text.decode(encoding="UTF-8")
    # extractor.clear()
    log.debug('OCR done: %s, %s characters extracted', languages, len(text))
    Cache.set_ocr(data, languages, text)
    return text

Пример #2

Показать файл

Файл: poppler.py Проект: wethepeopleonline/aleph

def extract_pdf(path, languages=None):
    """
    Pull the content out of a PDF file, page by page.

    The file is first converted to XML with the configured `pdftohtml`
    binary inside a temporary directory; every `<page>` element of the
    result is then handed to `extract_page`. The temporary directory is
    always removed before returning.

    Returns a dict of the form ``{'pages': [...]}``.
    Raises ``IngestorException`` if the converter produced no output file.
    """
    temp_dir = make_tempdir()
    try:
        out_file = os.path.join(temp_dir, 'pdf.xml')
        log.info("Converting PDF to XML: %r...", path)
        command = [
            get_config('PDFTOHTML_BIN'),
            '-xml', '-hidden', '-q', '-nodrm',
            path, out_file,
        ]
        subprocess.call(command)

        # The converter's exit status is not inspected; a missing output
        # file is the failure signal.
        if not os.path.exists(out_file):
            raise IngestorException("Could not convert PDF to XML: %s" % path)

        with open(out_file, 'r') as fh:
            raw = string_value(fh.read())
            # Drop the encoding declaration before handing the text to lxml.
            raw = raw.replace('encoding="UTF-8"', '')
            xml_parser = etree.XMLParser(recover=True, remove_comments=True)
            doc = etree.fromstring(raw, parser=xml_parser)
            log.debug("Parsed XML: %r", path)

        pages = [
            extract_page(path, temp_dir, page, languages)
            for page in doc.findall('./page')
        ]
        return {'pages': pages}
    finally:
        remove_tempdir(temp_dir)

Пример #3

Показать файл

Файл: html.py Проект: backgroundcheck/aleph

    def ingest(self, meta, local_path):
        """Clean an HTML document, convert it to PDF and extract that PDF.

        Fills in ``meta.title`` and ``meta.summary`` from the document's
        ``<title>`` and ``<meta name="description">`` when they are not
        already set, runs ``self.cleaner`` over the parsed tree, writes the
        cleaned markup to a temporary ``.htm`` file and passes it to
        ``self.generate_pdf_version``.

        Raises ``IngestorException`` if no PDF file was produced.
        """
        fh, out_path = mkstemp(suffix='.htm')
        os.close(fh)
        # Everything after the temp file exists runs inside the try so the
        # file is removed even if reading or parsing the source fails.
        try:
            with open(local_path, 'rb') as fh:
                doc = html.fromstring(fh.read())
                if not meta.has('title'):
                    title = doc.findtext('.//title')
                    if title is not None:
                        meta.title = title.strip()

                if not meta.has('summary'):
                    summary = doc.find('.//meta[@name="description"]')
                    if summary is not None and summary.get('content'):
                        meta.summary = summary.get('content')

                self.cleaner(doc)

            with open(out_path, 'w') as fh:
                fh.write(etree.tostring(doc))

            pdf_path = self.generate_pdf_version(out_path)
            if pdf_path is None or not os.path.isfile(pdf_path):
                # Interpolate meta into the message; passing it as a second
                # constructor argument leaves the "%r" placeholder unfilled.
                raise IngestorException(
                    "Could not convert document: %r" % (meta,))
            self.extract_pdf_alternative(meta, pdf_path)
        finally:
            if os.path.isfile(out_path):
                os.unlink(out_path)

Пример #4

Показать файл

def ingest_file(source_id, meta, file_path, move=False):
    """Archive a local file and queue it for ingestion.

    Sets ``meta.source_path`` to ``file_path`` when it is not already set,
    stores the file through ``get_archive().archive_file`` and schedules the
    ``ingest`` task with the resulting metadata.

    Any exception (including the ``IngestorException`` raised for a missing
    file) is passed to ``Ingestor.handle_exception`` instead of propagating.
    """
    try:
        if not os.path.isfile(file_path):
            # Interpolate the path; passing it as a second constructor
            # argument leaves the "%r" placeholder unfilled in the message.
            raise IngestorException("No such file: %r" % (file_path,))
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(source_id, meta.data)
    except Exception as ex:
        Ingestor.handle_exception(meta, source_id, ex)

Пример #5

Показать файл

def ingest_file(collection_id, meta, file_path, move=False):
    """Archive a local file and queue it for ingestion into a collection.

    Sets ``meta.source_path`` to ``file_path`` when it is not already set,
    stores the file through ``get_archive().archive_file`` and schedules the
    ``ingest`` task with the metadata's attribute dict.

    Any exception (including the ``IngestorException`` raised for a missing
    file) is passed to ``Ingestor.handle_exception`` instead of propagating.
    The database session is removed in every case.
    """
    try:
        if not os.path.isfile(file_path):
            # Interpolate the path; passing it as a second constructor
            # argument leaves the "%r" placeholder unfilled in the message.
            raise IngestorException("No such file: %r" % (file_path,))
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.delay(collection_id, meta.to_attr_dict())
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()

Пример #6

Показать файл

 def handle_html(self, meta, html_path):
     """Convert an HTML file to PDF via wkhtmltopdf and extract the result.

     The PDF is written to a temporary file that is removed again once
     ``self.extract_pdf_alternative`` has run (or on any failure).

     Raises ``IngestorException`` if no output file exists after the
     converter has run.
     """
     out_path = make_tempfile(name=meta.file_name, suffix='pdf')
     try:
         wkhtmltopdf = get_config('WKHTMLTOPDF_BIN')
         args = [
             wkhtmltopdf, '--disable-javascript', '--no-outline',
             '--no-images', '--quiet', html_path, out_path
         ]
         # The exit status is not inspected; a missing output file is the
         # failure signal.
         subprocess.call(args)
         if not os.path.isfile(out_path):
             # Interpolate meta into the message; passing it as a second
             # constructor argument leaves the "%r" placeholder unfilled.
             raise IngestorException(
                 "Could not convert document: %r" % (meta,))
         self.extract_pdf_alternative(meta, out_path)
     finally:
         remove_tempfile(out_path)

Пример #7

Показать файл

Файл: tesseract.py Проект: tomjie/aleph

def extract_image_data(data, languages=None):
    """Run OCR over a binary string of image data and return the text.

    The result is looked up in, and written back to, the OCR cache keyed by
    the data and the ISO3 language codes.

    Raises ``IngestorException`` if ``TESSDATA_PREFIX`` is not configured.
    """
    tessdata_prefix = get_config('TESSDATA_PREFIX')
    if tessdata_prefix is None:
        raise IngestorException("TESSDATA_PREFIX is not set, OCR won't work.")

    languages = get_languages_iso3(languages)

    # Serve a cached result when one exists for this data/language pair.
    cached = Cache.get_ocr(data, languages)
    if cached is not None:
        return cached

    image = Image.open(StringIO(data))
    # TODO: play with contrast and sharpening the images.
    ocr = Tesseract(tessdata_prefix, lang=languages)
    ocr.set_page_seg_mode(PageSegMode.PSM_AUTO_OSD)
    recognized = ocr.ocr_image(image)
    log.debug('OCR done: %s, %s characters extracted',
              languages, len(recognized))
    Cache.set_ocr(data, languages, recognized)
    return recognized

Пример #8

Показать файл

Файл: image.py Проект: backgroundcheck/aleph

 def ingest(self, meta, local_path):
     """Convert an image to an A4 PDF, then store and extract that PDF.

     Uses the configured ``CONVERT_BIN`` binary to write a temporary PDF,
     which is always removed afterwards. ``meta.title`` is set to the file
     name.

     Raises ``IngestorException`` if the converter left no usable output.
     """
     # Create the temp file before entering the try block: if mkstemp itself
     # fails, the finally clause must not reference an unbound pdf_path.
     fh, pdf_path = mkstemp(suffix='.pdf')
     os.close(fh)
     try:
         meta.title = meta.file_name
         convert = get_config('CONVERT_BIN')
         args = [
             convert, local_path, '-density', '300', '-define',
             'pdf:fit-page=A4', pdf_path
         ]
         subprocess.call(args)
         # mkstemp already created an empty file at pdf_path, so existence
         # alone cannot reveal a failed conversion; an empty file means the
         # converter wrote nothing.
         if not os.path.isfile(pdf_path) or os.path.getsize(pdf_path) == 0:
             raise IngestorException("Could not convert image: %r" % (meta,))
         self.store_pdf(meta, pdf_path)
         self.extract_pdf(meta, pdf_path)
     finally:
         if os.path.isfile(pdf_path):
             os.unlink(pdf_path)

Пример #9

Показать файл

Файл: document.py Проект: wilbrodn/aleph

 def generate_pdf_alternative(self, meta, local_path):
     """Convert LibreOffice-supported documents to PDF.

     Runs the configured ``SOFFICE_BIN`` headlessly with a throw-away user
     installation directory and returns the path of the first file found in
     the output directory. The returned file lives in a temporary directory
     that this method does not remove on success.

     Raises ``IngestorException`` if the converter produced no output file.
     """
     work_dir = six.text_type(mkdtemp())
     instance_dir = six.text_type(mkdtemp())
     converted = False
     try:
         soffice = get_config('SOFFICE_BIN')
         instance_path = u'"-env:UserInstallation=file://%s"' % instance_dir
         args = [soffice, '--convert-to', 'pdf', '--nofirststartwizard',
                 instance_path, '--norestore', '--nologo', '--nodefault',
                 '--nolockcheck', '--invisible', '--outdir', work_dir,
                 '--headless', string_value(local_path)]
         # log.debug('Converting document: %r', ' '.join(args))
         subprocess.call(args, timeout=CONVERT_TIMEOUT)
         # Return the first (any) file the converter wrote to work_dir.
         for out_file in os.listdir(work_dir):
             converted = True
             return os.path.join(work_dir, out_file)
         raise IngestorException("Could not convert document: %r" % (meta,))
     finally:
         if not converted:
             # No path is handed back on failure (timeout, no output, any
             # other error), so remove the output directory here instead of
             # leaking it.
             shutil.rmtree(work_dir, ignore_errors=True)
         shutil.rmtree(instance_dir)

Пример #10

Показать файл

def ingest_file(collection_id,
                meta,
                file_path,
                move=False,
                queue=WORKER_QUEUE,
                routing_key=WORKER_ROUTING_KEY):
    """Archive a local file and queue it for ingestion into a collection.

    Sets ``meta.source_path`` to ``file_path`` when it is not already set,
    stores the file through ``get_archive().archive_file`` and schedules the
    ``ingest`` task on the given queue and routing key.

    Any exception (including the ``IngestorException`` raised for a missing
    file) is passed to ``Ingestor.handle_exception`` instead of propagating.
    The database session is removed in every case.
    """
    # the queue and routing key arguments are a workaround to
    # expedite user uploads over long-running batch imports.
    try:
        if not os.path.isfile(file_path):
            # Interpolate the path; passing it as a second constructor
            # argument leaves the "%r" placeholder unfilled in the message.
            raise IngestorException("No such file: %r" % (file_path,))
        if not meta.has('source_path'):
            meta.source_path = file_path
        meta = get_archive().archive_file(file_path, meta, move=move)
        ingest.apply_async([collection_id, meta.to_attr_dict()],
                           queue=queue,
                           routing_key=routing_key)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()

Пример #11

Показать файл

Файл: __init__.py Проект: adamchainz/aleph

def ingest_url(collection_id, metadata, url):
    """Download a URL into the archive and dispatch it for ingestion.

    The response body is streamed to a temporary file, which is then moved
    into the archive. ``meta.source_url`` is set to the final response URL
    when it is not already set, and ``meta.headers`` to the response headers.

    Any exception (including the ``IngestorException`` raised for an HTTP
    status of 400 or above) is passed to ``Ingestor.handle_exception``
    instead of propagating. The database session is removed in every case.
    """
    meta = Metadata(data=metadata)
    try:
        fh, tmp_path = mkstemp()
        os.close(fh)
        log.info("Ingesting URL: %r", url)
        res = requests.get(url, stream=True, timeout=120)
        if res.status_code >= 400:
            msg = "HTTP Error %r: %r" % (url, res.status_code)
            raise IngestorException(msg)
        # iter_content yields raw bytes, so the file must be opened in
        # binary mode; text mode can corrupt or reject the payload.
        with open(tmp_path, 'wb') as fh:
            for chunk in res.iter_content(chunk_size=1024):
                # Skip empty chunks.
                if chunk:
                    fh.write(chunk)
        if not meta.has('source_url'):
            meta.source_url = res.url
        meta.headers = res.headers
        # NOTE(review): tmp_path is not removed if a step before or during
        # archiving fails - confirm whether cleanup is wanted here.
        meta = get_archive().archive_file(tmp_path, meta, move=True)
        Ingestor.dispatch(collection_id, meta)
    except Exception as ex:
        Ingestor.handle_exception(meta, collection_id, ex)
    finally:
        db.session.remove()