Пример #1
0
class ServiceDocumentConverter(DocumentConverter):
    """Provides helpers for UNO document conversion via HTTP."""
    # Conversion endpoint, read once from UNOSERVICE_URL when the class is
    # defined.
    SERVICE_URL = env.get('UNOSERVICE_URL')

    @classmethod
    def is_available(cls):
        """Return True when ``SERVICE_URL`` is set (i.e. is not None)."""
        return cls.SERVICE_URL is not None

    def _document_to_pdf(self, file_path, result, work_path):
        """Converts an office document to PDF.

        The file at ``file_path`` is POSTed to ``SERVICE_URL`` as a multipart
        upload and the streamed response body is written to
        ``<work_path>/<basename of file_path>.pdf``.

        :param file_path: path of the source document to upload.
        :param result: object providing ``file_name`` and ``mime_type``;
            ``'data'`` and ``DEFAULT`` are used when those are empty.
        :param work_path: directory in which the PDF is written.
        :returns: the path of the written PDF file.
        :raises ProcessingException: if the service answers with HTTP 400
            (carrying the response text), or if every attempt yielded by
            ``service_retries()`` failed.
        """
        log.info('Converting [%s] to PDF...', result.file_name)
        out_path = os.path.basename(file_path)
        out_path = join_path(work_path, '%s.pdf' % out_path)
        file_name = result.file_name or 'data'
        mime_type = result.mime_type or DEFAULT
        for attempt in service_retries():
            try:
                # The upload and download handles get distinct names and
                # their own context managers, so the source file is always
                # closed (previously the output handle shadowed it and the
                # source handle leaked).
                with open(file_path, 'rb') as source_fh:
                    files = {'file': (file_name, source_fh, mime_type)}
                    res = requests.post(self.SERVICE_URL,
                                        files=files,
                                        timeout=(5, 305),
                                        stream=True)
                res.raise_for_status()
                with open(out_path, 'wb') as target_fh:
                    for chunk in res.iter_content(chunk_size=None):
                        target_fh.write(chunk)
                return out_path
            except RequestException as exc:
                # A 400 response is raised straight away as a processing
                # failure instead of being retried.
                if isinstance(exc, HTTPError):
                    if exc.response.status_code == 400:
                        raise ProcessingException(exc.response.text)
                log.error("Conversion failed: %s", exc)
                backoff(failures=attempt)
        raise ProcessingException("Document could not be converted to PDF.")
Пример #2
0
 def find_command(self, name):
     """Look up the executable for ``name``, allowing an env override.

     The override key is ``<name>_BIN`` with dashes replaced by
     underscores and the whole key upper-cased; the result of
     ``find_executable(name)`` is passed to ``env.get`` as the default.
     """
     env_key = ('%s_BIN' % name).replace('-', '_').upper()
     fallback = find_executable(name)
     return env.get(env_key, fallback)
Пример #3
0

# Surface error messages to the user.
DEBUG = env.to_bool("ALEPH_DEBUG", False)
# Propose HTTP caching to user agents; the default is the inverse of DEBUG.
CACHE = env.to_bool("ALEPH_CACHE", not DEBUG)
# Puts the system into read-only mode and displays a warning.
MAINTENANCE = env.to_bool("ALEPH_MAINTENANCE", False)
# Unit test context.
TESTING = False


###############################################################################
# General instance information

APP_TITLE = env.get("ALEPH_APP_TITLE", lazy_gettext("Aleph"))
APP_DESCRIPTION = env.get("ALEPH_APP_DESCRIPTION", "")
APP_NAME = env.get("ALEPH_APP_NAME", "aleph")
APP_UI_URL = env.get("ALEPH_UI_URL", "http://localhost:8080/")
# Logo and favicon share the same default image.
APP_LOGO = env.get("ALEPH_LOGO", "/static/logo.png")
APP_FAVICON = env.get("ALEPH_FAVICON", "/static/logo.png")

# System-wide banner shown in the user interface.
APP_BANNER = env.get("ALEPH_APP_BANNER")

# URL scheme, "http" unless ALEPH_URL_SCHEME says otherwise (force HTTPS here).
URL_SCHEME = env.get("ALEPH_URL_SCHEME", "http")

# Sample queries shown on the home page; ALEPH_SAMPLE_SEARCHES replaces the
# built-in pair.
SAMPLE_SEARCHES = env.to_list(
    "ALEPH_SAMPLE_SEARCHES",
    [lazy_gettext("TeliaSonera"), lazy_gettext("Vladimir Putin")],
)
Пример #4
0
from servicelayer import settings as sls
from flask_babel import lazy_gettext

# Surface error messages to the user.
DEBUG = env.to_bool("ALEPH_DEBUG", False)
# Propose HTTP caching to user agents; the default is the inverse of DEBUG.
CACHE = env.to_bool("ALEPH_CACHE", not DEBUG)
# Puts the system into read-only mode and displays a warning.
MAINTENANCE = env.to_bool("ALEPH_MAINTENANCE", False)
# Unit test context.
TESTING = False

###############################################################################
# General instance information

APP_TITLE = env.get("ALEPH_APP_TITLE", lazy_gettext("Aleph"))
APP_DESCRIPTION = env.get("ALEPH_APP_DESCRIPTION", "")
APP_NAME = env.get("ALEPH_APP_NAME", "aleph")
APP_UI_URL = env.get("ALEPH_UI_URL", "http://localhost:8080/")
# Logo and favicon share the same default image.
APP_LOGO = env.get("ALEPH_LOGO", "/static/logo.png")
APP_FAVICON = env.get("ALEPH_FAVICON", "/static/logo.png")

# System-wide banner shown in the user interface.
APP_BANNER = env.get("ALEPH_APP_BANNER")

# URL scheme, "http" unless ALEPH_URL_SCHEME says otherwise (force HTTPS here).
URL_SCHEME = env.get("ALEPH_URL_SCHEME", "http")

# Sample queries shown on the home page; ALEPH_SAMPLE_SEARCHES replaces the
# built-in pair.
SAMPLE_SEARCHES = env.to_list(
    "ALEPH_SAMPLE_SEARCHES",
    [lazy_gettext("TeliaSonera"), lazy_gettext("Vladimir Putin")],
)
Пример #5
0
from servicelayer import env
from flask_babel import lazy_gettext

# Surface error messages to the user.
DEBUG = env.to_bool("ALEPH_DEBUG", False)
# Propose HTTP caching to user agents; the default is the inverse of DEBUG.
CACHE = env.to_bool("ALEPH_CACHE", not DEBUG)
# Puts the system into read-only mode and displays a warning.
MAINTENANCE = env.to_bool("ALEPH_MAINTENANCE", False)
# Unit test context.
TESTING = False

###############################################################################
# General instance information

APP_TITLE = env.get("ALEPH_APP_TITLE", lazy_gettext("Aleph"))
APP_DESCRIPTION = env.get("ALEPH_APP_DESCRIPTION", "")
APP_NAME = env.get("ALEPH_APP_NAME", "aleph")
APP_UI_URL = env.get("ALEPH_UI_URL", "http://localhost:8080/")
APP_LOGO = env.get("ALEPH_LOGO", "/static/logo.png")
APP_FAVICON = env.get("ALEPH_FAVICON", "/static/favicon.png")

# System-wide banner shown in the user interface.
APP_BANNER = env.get("ALEPH_APP_BANNER")

# Sample queries shown on the home page; ALEPH_SAMPLE_SEARCHES replaces the
# built-in pair.
SAMPLE_SEARCHES = env.to_list(
    "ALEPH_SAMPLE_SEARCHES",
    [lazy_gettext("TeliaSonera"), lazy_gettext("Vladimir Putin")],
)

# Force HTTPS here (off unless ALEPH_FORCE_HTTPS is set):
FORCE_HTTPS = env.to_bool("ALEPH_FORCE_HTTPS", False)
Пример #6
0
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Document conversion service. INGESTORS_CONVERT_DOCUMENT_URL is consulted
# last, so it wins over UNOSERVICE_URL, which in turn wins over the built-in
# default address.
CONVERT_URL = env.get(
    "INGESTORS_CONVERT_DOCUMENT_URL",
    env.get("UNOSERVICE_URL", "http://convert-document:3000/convert"),
)
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200)  # 2 hrs
CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 256)

# Enable (expensive!) Google Cloud API; off by default.
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

# Geonames data file
GEONAMES_PATH = env.get("INGESTORS_GEONAMES_PATH", "/ingestors/data/geonames.txt")

# FastText lid model file
LID_MODEL_PATH = env.get("INGESTORS_LID_MODEL_PATH", "/ingestors/data/lid.176.ftz")

# Entity extraction switch; enabled unless INGESTORS_ANALYZE_ENTITIES turns
# it off.
ANALYZE_ENTITIES = env.to_bool("INGESTORS_ANALYZE_ENTITIES", True)

# NER model configuration: the available models, the disabled set and the
# default model name.
NER_MODELS = set(env.to_list("INGESTORS_NER_MODELS", ["eng"]))
NER_DISABLE = set(env.to_list("INGESTORS_NER_DISABLE", ["ara"]))
NER_DEFAULT_MODEL = "xx"
Пример #7
0
import multiprocessing
from servicelayer import env

# Thread count: INGEST_THREADS, defaulting to the CPU count capped at 8.
NUM_THREADS = env.to_int(
    "INGEST_THREADS",
    min(8, multiprocessing.cpu_count()),
)
# Read from INGEST_RETRIES, defaulting to 3.
MAX_RETRIES = env.to_int("INGEST_RETRIES", 3)

# Service URL read from UNOSERVICE_URL; no default is supplied.
UNOSERVICE_URL = env.get("UNOSERVICE_URL")
Пример #8
0
import os
import pkg_resources
import multiprocessing
from servicelayer import env
from servicelayer import settings as sls

###############################################################################
# Core configuration
VERSION = pkg_resources.get_distribution("memorious").version
APP_NAME = env.get("MEMORIOUS_APP_NAME", "memorious")

# Enable debug logging etc.
DEBUG = env.to_bool("MEMORIOUS_DEBUG", default=False)
TESTING = False

# Base operating path: MEMORIOUS_BASE_PATH, falling back to a "data" directory
# under the current working directory.
BASE_PATH = env.get("MEMORIOUS_BASE_PATH", os.path.join(os.getcwd(), "data"))

# Only fill in the servicelayer archive path when it is not already defined.
if not sls.ARCHIVE_PATH:
    sls.ARCHIVE_PATH = os.path.join(BASE_PATH, "archive")

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get("MEMORIOUS_CONFIG_PATH")

# Try and run scrapers in a way that only acquires new data (on by default).
INCREMENTAL = env.to_bool("MEMORIOUS_INCREMENTAL", default=True)

# How many days until an incremental crawl expires
EXPIRE = env.to_int("MEMORIOUS_EXPIRE", 1)
Пример #9
0
# Show error messages to the user.
DEBUG = env.to_bool("ALEPH_DEBUG", False)
# Profile requests
PROFILE = env.to_bool("ALEPH_PROFILE", False)
# Propose HTTP caching to the user agents; the default is the inverse of DEBUG.
CACHE = env.to_bool("ALEPH_CACHE", not DEBUG)
# Puts the system into read-only mode and displays a warning.
MAINTENANCE = env.to_bool("ALEPH_MAINTENANCE", False)
# Unit test context.
TESTING = False

###############################################################################
# General instance information

APP_TITLE = env.get("ALEPH_APP_TITLE", lazy_gettext("Aleph"))
APP_NAME = env.get("ALEPH_APP_NAME", "aleph")
APP_UI_URL = env.get("ALEPH_UI_URL", "http://localhost:8080/")
APP_LOGO = env.get("ALEPH_LOGO", "/static/logo.png")
# The Arabic logo variant falls back to the regular logo.
APP_LOGO_AR = env.get("ALEPH_LOGO_AR", APP_LOGO)
APP_FAVICON = env.get("ALEPH_FAVICON", "/static/favicon.png")

# Show a system-wide banner in the user interface.
APP_BANNER = env.get("ALEPH_APP_BANNER")

# Force HTTPS here:
# The default follows the UI URL: a URL starting with "https" turns the flag
# on. str.startswith already returns a bool, so no conditional expression is
# needed around it.
FORCE_HTTPS = APP_UI_URL.lower().startswith("https")
FORCE_HTTPS = env.to_bool("ALEPH_FORCE_HTTPS", FORCE_HTTPS)
# The preferred scheme is derived from FORCE_HTTPS unless ALEPH_URL_SCHEME
# overrides it explicitly.
PREFERRED_URL_SCHEME = "https" if FORCE_HTTPS else "http"
PREFERRED_URL_SCHEME = env.get("ALEPH_URL_SCHEME", PREFERRED_URL_SCHEME)
# Apply HTTPS rules to the UI URL:
Пример #10
0
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Document conversion service. INGESTORS_CONVERT_DOCUMENT_URL is consulted
# last, so it wins over UNOSERVICE_URL, which in turn wins over the built-in
# default address.
CONVERT_URL = env.get(
    "INGESTORS_CONVERT_DOCUMENT_URL",
    env.get("UNOSERVICE_URL", "http://convert-document:3000/convert"),
)
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200)  # 2 hrs

# Enable (expensive!) Google Cloud API; off by default.
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

# Geonames data file
GEONAMES_PATH = env.get("INGESTORS_GEONAMES_PATH", "/ingestors/data/geonames.txt")

# FastText lid model file
LID_MODEL_PATH = env.get("INGESTORS_LID_MODEL_PATH", "/ingestors/data/lid.176.ftz")

# Entity extraction switch; enabled unless INGESTORS_ANALYZE_ENTITIES turns
# it off.
ANALYZE_ENTITIES = env.to_bool("INGESTORS_ANALYZE_ENTITIES", True)

# List available NER models
NER_MODELS = set(env.to_list("INGESTORS_NER_MODELS", ["eng"]))
NER_DEFAULT_MODEL = "xx"

# Use the environment variable set in aleph.env; the ftmstore setting keeps
# its existing value as the default.
sts.DATABASE_URI = env.get("ALEPH_DATABASE_URI", sts.DATABASE_URI)
Пример #11
0
# Surface error messages to the user.
DEBUG = env.to_bool("ALEPH_DEBUG", False)
# Request profiling switch.
PROFILE = env.to_bool("ALEPH_PROFILE", False)
# Propose HTTP caching to user agents; the default is the inverse of DEBUG.
CACHE = env.to_bool("ALEPH_CACHE", not DEBUG)
# Puts the system into read-only mode and displays a warning.
MAINTENANCE = env.to_bool("ALEPH_MAINTENANCE", False)
# Unit test context.
TESTING = False

###############################################################################
# General instance information

APP_TITLE = env.get("ALEPH_APP_TITLE", lazy_gettext("Aleph"))
APP_DESCRIPTION = env.get("ALEPH_APP_DESCRIPTION", "")
APP_NAME = env.get("ALEPH_APP_NAME", "aleph")
APP_UI_URL = env.get("ALEPH_UI_URL", "http://localhost:8080/")
APP_LOGO = env.get("ALEPH_LOGO", "/static/logo.png")
APP_FAVICON = env.get("ALEPH_FAVICON", "/static/favicon.png")

# System-wide banner shown in the user interface.
APP_BANNER = env.get("ALEPH_APP_BANNER")

# Sample queries shown on the home page; ALEPH_SAMPLE_SEARCHES replaces the
# built-in pair.
SAMPLE_SEARCHES = env.to_list(
    "ALEPH_SAMPLE_SEARCHES",
    [lazy_gettext("TeliaSonera"), lazy_gettext("Vladimir Putin")],
)

# Force HTTPS here (off unless ALEPH_FORCE_HTTPS is set):
FORCE_HTTPS = env.to_bool("ALEPH_FORCE_HTTPS", False)
Пример #12
0
 def find_command(self, name):
     """Resolve the command ``name``, preferring an environment setting.

     The setting consulted is ``<name>_BIN``, with ``-`` mapped to ``_``
     and then upper-cased; ``find_executable(name)`` supplies the default
     handed to ``env.get``.
     """
     setting = "%s_BIN" % name
     setting = setting.replace("-", "_")
     located = find_executable(name)
     return env.get(setting.upper(), located)
Пример #13
0
import os
import pkg_resources
from servicelayer import env
from servicelayer import settings as sls

###############################################################################
# Core configuration
VERSION = pkg_resources.get_distribution("memorious").version
APP_NAME = env.get("MEMORIOUS_APP_NAME", "memorious")

# Enable debug logging etc.
DEBUG = env.to_bool("MEMORIOUS_DEBUG", default=False)
TESTING = False

# Base operating path: MEMORIOUS_BASE_PATH, falling back to a "data" directory
# under the current working directory.
BASE_PATH = env.get("MEMORIOUS_BASE_PATH", os.path.join(os.getcwd(), "data"))

# Only fill in the servicelayer archive path when it is not already defined.
if not sls.ARCHIVE_PATH:
    sls.ARCHIVE_PATH = os.path.join(BASE_PATH, "archive")

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get("MEMORIOUS_CONFIG_PATH")

# Try and run scrapers in a way that only acquires new data (on by default).
INCREMENTAL = env.to_bool("MEMORIOUS_INCREMENTAL", default=True)

# Keep the crawler running even when an error is encountered (off by default).
CONTINUE_ON_ERROR = env.to_bool("MEMORIOUS_CONTINUE_ON_ERROR", default=False)

# How many days until an incremental crawl expires
Пример #14
0
import multiprocessing
from servicelayer import env

# Redis cache
# URL format: redis://localhost:6379/0
REDIS_URL = env.get("REDIS_URL")
# Base interval from which the longer intervals below are derived.
# NOTE(review): if this is meant to be one day in seconds, that would be
# 86400 rather than 84700 -- confirm before changing, since REDIS_LONG and the
# REDIS_EXPIRE default are both multiples of it.
REDIS_SHORT = 84700
REDIS_LONG = REDIS_SHORT * 200
# Overridable through REDIS_EXPIRE; defaults to seven times REDIS_SHORT.
REDIS_EXPIRE = env.to_int("REDIS_EXPIRE", REDIS_SHORT * 7)
REDIS_PREFIX = "sla"

# Persistent database tags
TAGS_DATABASE_URI = env.get("TAGS_DATABASE_URI", "sqlite://")

# Worker
WORKER_RETRY = env.to_int("WORKER_RETRY", 3)
# Defaults to the number of CPUs reported by multiprocessing.
WORKER_THREADS = env.to_int("WORKER_THREADS", multiprocessing.cpu_count())
WORKER_REPORTING = env.to_bool("WORKER_REPORTING", True)

# Amazon client credentials
AWS_KEY_ID = env.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = env.get("AWS_SECRET_ACCESS_KEY")
AWS_REGION = env.get("AWS_REGION", "eu-west-1")
# S3 compatible Minio host if using Minio for storage
ARCHIVE_ENDPOINT_URL = env.get("ARCHIVE_ENDPOINT_URL")

# Storage type (either 's3', 'gs', or 'file', i.e. local file system):
ARCHIVE_TYPE = env.get("ARCHIVE_TYPE", "file")
ARCHIVE_BUCKET = env.get("ARCHIVE_BUCKET")
ARCHIVE_PATH = env.get("ARCHIVE_PATH")
# The publication bucket falls back to the archive bucket when it is not set
# separately.
PUBLICATION_BUCKET = env.get("PUBLICATION_BUCKET", ARCHIVE_BUCKET)
Пример #15
0
import multiprocessing
from servicelayer import env

# Redis cache
REDIS_URL = env.get("REDIS_URL")
# Base interval from which the longer intervals below are derived.
# NOTE(review): if this is meant to be one day in seconds, that would be
# 86400 rather than 84700 -- confirm before changing.
REDIS_SHORT = 84700
REDIS_LONG = 200 * REDIS_SHORT
# Overridable through REDIS_EXPIRE; defaults to seven times REDIS_SHORT.
REDIS_EXPIRE = env.to_int("REDIS_EXPIRE", 7 * REDIS_SHORT)
REDIS_PREFIX = "sla"

# Worker
WORKER_RETRY = env.to_int("WORKER_RETRY", 3)
# Thread count: WORKER_THREADS, defaulting to the CPU count capped at 8.
WORKER_THREADS = env.to_int(
    "WORKER_THREADS",
    min(8, multiprocessing.cpu_count()),
)

# Amazon client credentials
AWS_KEY_ID = env.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = env.get("AWS_SECRET_ACCESS_KEY")
AWS_REGION = env.get("AWS_REGION", "eu-west-1")

# Storage type (either 's3', 'gs', or 'file', i.e. local file system):
ARCHIVE_TYPE = env.get("ARCHIVE_TYPE", "file")
ARCHIVE_BUCKET = env.get("ARCHIVE_BUCKET")
ARCHIVE_PATH = env.get("ARCHIVE_PATH")