Example #1
import multiprocessing
from servicelayer import env

# Worker thread count: CPU count capped at 8, overridable via INGEST_THREADS
NUM_THREADS = min(8, multiprocessing.cpu_count())
NUM_THREADS = env.to_int('INGEST_THREADS', NUM_THREADS)
MAX_RETRIES = env.to_int('INGEST_RETRIES', 3)

# Document conversion service endpoint
UNOSERVICE_URL = env.get('UNOSERVICE_URL')
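Every snippet on this page uses the same pattern: compute a sensible default in code, then let an environment variable override it. A minimal sketch of what a helper like env.to_int appears to do, based on the usage above (an illustration, not servicelayer's actual implementation):

import os

def to_int(name, default=0):
    # Keep the default when the variable is unset or not a valid integer.
    try:
        return int(os.environ.get(name, default))
    except (TypeError, ValueError):
        return default

NUM_THREADS = to_int('INGEST_THREADS', 8)  # the environment wins over the computed default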
Example #2
File: settings.py Project: mudsill/aleph
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Document conversion service
CONVERT_URL = env.get("UNOSERVICE_URL", "http://convert-document:3000/convert")
CONVERT_URL = env.get("INGESTORS_CONVERT_DOCUMENT_URL", CONVERT_URL)
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200)  # 2 hrs
CONVERT_RETRIES = env.to_int("INGESTORS_CONVERT_RETRIES", 256)

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

# Geonames data file
GEONAMES_PATH = env.get("INGESTORS_GEONAMES_PATH",
                        "/ingestors/data/geonames.txt")

# fastText language identification (LID) model file
LID_MODEL_PATH = env.get("INGESTORS_LID_MODEL_PATH",
                         "/ingestors/data/lid.176.ftz")

# Toggle entity extraction (enabled by default)
ANALYZE_ENTITIES = env.to_bool("INGESTORS_ANALYZE_ENTITIES", True)

# List of available NER models
NER_MODELS = set(env.to_list("INGESTORS_NER_MODELS", ["eng"]))
NER_DISABLE = ["ara"]
NER_DISABLE = set(env.to_list("INGESTORS_NER_DISABLE", NER_DISABLE))
NER_DEFAULT_MODEL = "xx"
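Taken together, the NER settings suggest a simple selection rule: use a language-specific model when it is available and not disabled, otherwise fall back to the default. A hypothetical helper (pick_ner_model is not part of the source):

def pick_ner_model(language):
    # Prefer a dedicated model; fall back to the multilingual default.
    if language in NER_MODELS and language not in NER_DISABLE:
        return language
    return NER_DEFAULT_MODEL

pick_ner_model("eng")  # -> "eng"
pick_ner_model("ara")  # -> "xx" (disabled by default)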
Example #3
import multiprocessing
import os

from servicelayer import env
from servicelayer import settings as sls

# Base operating path
BASE_PATH = os.path.join(os.getcwd(), 'data')
BASE_PATH = env.get('MEMORIOUS_BASE_PATH', BASE_PATH)

# Default the servicelayer archive path when it is not already set
sls.ARCHIVE_PATH = sls.ARCHIVE_PATH or os.path.join(BASE_PATH, 'archive')

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get('MEMORIOUS_CONFIG_PATH')

# Try to run scrapers so that they only acquire new data
INCREMENTAL = env.to_bool('MEMORIOUS_INCREMENTAL', default=True)

# How many days until an incremental crawl expires
EXPIRE = env.to_int('MEMORIOUS_EXPIRE', 1)

# How many db inserts per minute
DB_RATE_LIMIT = env.to_int('MEMORIOUS_DB_RATE_LIMIT', 6000)

# How many http requests to a host per minute
HTTP_RATE_LIMIT = env.to_int('MEMORIOUS_HTTP_RATE_LIMIT', 120)

# How many seconds to wait before trying to run scheduled crawlers
SCHEDULER_INTERVAL = env.to_int('MEMORIOUS_SCHEDULER_INTERVAL', 60)

# Max scheduled tasks at the same time (never fewer than 20)
MAX_SCHEDULED = max(env.to_int('MEMORIOUS_MAX_SCHEDULED', multiprocessing.cpu_count()), 20)  # noqa
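HTTP_RATE_LIMIT is expressed as requests per host per minute, which maps directly to a minimum delay between requests. A rough throttle sketch (hypothetical; memorious's real rate limiting is more involved):

import time

def throttled_get(session, url, rate_limit=HTTP_RATE_LIMIT):
    # Never exceed rate_limit requests per minute to a single host.
    time.sleep(60.0 / rate_limit)
    return session.get(url)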
Example #4
File: settings.py Project: djoffrey/aleph
import os

from servicelayer import env

DEFAULT_LANGUAGE = env.get('ALEPH_DEFAULT_LANGUAGE', 'en')

# User interface
UI_LANGUAGES = ['ru', 'es', 'de', 'en', 'ar']
UI_LANGUAGES = env.to_list('ALEPH_UI_LANGUAGES', UI_LANGUAGES)
UI_LANGUAGES = [l.lower().strip() for l in UI_LANGUAGES]

# Result highlighting
RESULT_HIGHLIGHT = env.to_bool('ALEPH_RESULT_HIGHLIGHT', True)

# Minimum update date for sitemap.xml
SITEMAP_FLOOR = '2019-06-22'

# Maximum number of entities to return per property when expanding entities
MAX_EXPAND_ENTITIES = env.to_int('ALEPH_MAX_EXPAND_ENTITIES', 200)

# API rate limiting (req/min for anonymous users)
API_RATE_LIMIT = env.to_int('ALEPH_API_RATE_LIMIT', 30)
API_RATE_WINDOW = 15  # minutes
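API_RATE_LIMIT is a per-minute figure while enforcement spans a 15-minute window, so on one plausible reading the per-window budget for anonymous users is the product of the two (API_RATE_BUDGET below is an illustrative name, not from the source):

API_RATE_BUDGET = API_RATE_LIMIT * API_RATE_WINDOW  # 30 * 15 = 450 requests per window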

# Mini-CMS
# Pages directory
PAGES_PATH = os.path.join(APP_DIR, 'pages')
PAGES_PATH = env.get('ALEPH_PAGES_PATH', PAGES_PATH)

##############################################################################
# E-mail settings

MAIL_FROM = env.get('ALEPH_MAIL_FROM', '*****@*****.**')
MAIL_SERVER = env.get('ALEPH_MAIL_HOST', 'localhost')
Example #5
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Document conversion service
CONVERT_URL = env.get('UNOSERVICE_URL', 'http://convert-document:3000/convert')
CONVERT_URL = env.get('INGESTORS_CONVERT_DOCUMENT_URL', CONVERT_URL)
CONVERT_TIMEOUT = env.to_int('INGESTORS_CONVERT_TIMEOUT', 7200)  # 2 hrs

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool('INGESTORS_OCR_VISION_API', False)

# Geonames data file
GEONAMES_PATH = env.get('INGESTORS_GEONAMES_PATH',
                        '/ingestors/data/geonames.txt')

# fastText language identification (LID) model file
LID_MODEL_PATH = env.get('INGESTORS_LID_MODEL_PATH',
                         '/ingestors/data/lid.176.ftz')

# Toggle entity extraction (enabled by default)
ANALYZE_ENTITIES = env.to_bool('INGESTORS_ANALYZE_ENTITIES', True)

# List of available NER models
NER_MODELS = set(env.to_list('INGESTORS_NER_MODELS', ['eng']))
NER_DEFAULT_MODEL = 'xx'

# Reuse the ALEPH_DATABASE_URI variable set in aleph.env
sts.DATABASE_URI = env.get('ALEPH_DATABASE_URI', sts.DATABASE_URI)
Example #6
from datetime import timedelta

from servicelayer import env

# Handler is one of: keycloak, google, cognito, azure (or a plugin)
OAUTH_MIGRATE_SUB = env.to_bool("ALEPH_OAUTH_MIGRATE_SUB", True)
OAUTH_HANDLER = env.get("ALEPH_OAUTH_HANDLER", "oidc")
OAUTH_KEY = env.get("ALEPH_OAUTH_KEY")
OAUTH_SECRET = env.get("ALEPH_OAUTH_SECRET")
OAUTH_SCOPE = env.get("ALEPH_OAUTH_SCOPE", "openid email profile")
OAUTH_METADATA_URL = env.get("ALEPH_OAUTH_METADATA_URL")
OAUTH_TOKEN_METHOD = env.get("ALEPH_OAUTH_TOKEN_METHOD", "POST")
OAUTH_ADMIN_GROUP = env.get("ALEPH_OAUTH_ADMIN_GROUP", "superuser")

# No authentication. Everyone is admin.
SINGLE_USER = env.to_bool("ALEPH_SINGLE_USER")

# Default session duration.
SESSION_EXPIRE = 800_000 if SINGLE_USER else 60_000
SESSION_EXPIRE = env.to_int("ALEPH_SESSION_EXPIRE", SESSION_EXPIRE)

# Disable password-based authentication when SSO is configured
# (OAUTH is defined earlier in the full settings module):
PASSWORD_LOGIN = env.to_bool("ALEPH_PASSWORD_LOGIN", not OAUTH)

# Roles that haven't logged in for this many days (default: ~6 months)
# stop receiving notifications.
ROLE_INACTIVE = env.to_int("ALEPH_ROLE_INACTIVE", 6 * 30)
ROLE_INACTIVE = timedelta(days=ROLE_INACTIVE)
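Because ROLE_INACTIVE ends up as a timedelta, the inactivity test reduces to a plain date comparison. A hypothetical check (is_inactive is not a function from the source):

from datetime import datetime

def is_inactive(last_login):
    # True when the last login is older than the inactivity window.
    return last_login < datetime.utcnow() - ROLE_INACTIVE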

###############################################################################
# Content processing options

DEFAULT_LANGUAGE = env.get("ALEPH_DEFAULT_LANGUAGE", "en")

# User interface
UI_LANGUAGES = ["ru", "es", "de", "en", "ar"]
Example #7
import os

from servicelayer import env

# User interface
UI_LANGUAGES = ["ru", "es", "de", "en", "ar"]
UI_LANGUAGES = env.to_list("ALEPH_UI_LANGUAGES", UI_LANGUAGES)
UI_LANGUAGES = [l.lower().strip() for l in UI_LANGUAGES]

# Document processing pipeline
INGEST_PIPELINE = env.to_list("ALEPH_INGEST_PIPELINE", ["analyze"])

# Result highlighting
RESULT_HIGHLIGHT = env.to_bool("ALEPH_RESULT_HIGHLIGHT", True)

# Minimum update date for sitemap.xml
SITEMAP_FLOOR = "2019-06-22"

# Maximum number of entities to return per property when expanding entities
MAX_EXPAND_ENTITIES = env.to_int("ALEPH_MAX_EXPAND_ENTITIES", 200)

# API rate limiting (req/min for anonymous users)
API_RATE_LIMIT = env.to_int("ALEPH_API_RATE_LIMIT", 30)
API_RATE_WINDOW = 15  # minutes

# Mini-CMS
# Pages directory
PAGES_PATH = os.path.join(APP_DIR, "pages")
PAGES_PATH = env.get("ALEPH_PAGES_PATH", PAGES_PATH)

##############################################################################
# E-mail settings

MAIL_FROM = env.get("ALEPH_MAIL_FROM", "*****@*****.**")
MAIL_SERVER = env.get("ALEPH_MAIL_HOST", "localhost")
Example #8
import multiprocessing
import os

from servicelayer import env
from servicelayer import settings as sls

# Base operating path
BASE_PATH = os.path.join(os.getcwd(), 'data')
BASE_PATH = env.get('MEMORIOUS_BASE_PATH', BASE_PATH)

# Default the servicelayer archive path when it is not already set
sls.ARCHIVE_PATH = sls.ARCHIVE_PATH or os.path.join(BASE_PATH, 'archive')

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get('MEMORIOUS_CONFIG_PATH')

# Try to run scrapers so that they only acquire new data
INCREMENTAL = env.to_bool('MEMORIOUS_INCREMENTAL', default=True)

# How many days until an incremental crawl expires
EXPIRE = env.to_int('MEMORIOUS_EXPIRE', 60)

# How many seconds to wait before trying to run scheduled crawlers
SCHEDULER_INTERVAL = env.to_int('MEMORIOUS_SCHEDULER_INTERVAL', 60)

# How many threads to use for execution
THREADS = env.to_int('MEMORIOUS_THREADS', min(8, multiprocessing.cpu_count()))

# Max scheduled tasks at the same time
MAX_SCHEDULED = env.to_int('MEMORIOUS_MAX_SCHEDULED', THREADS)

# HTTP request configuration
HTTP_CACHE = env.to_bool('MEMORIOUS_HTTP_CACHE', default=True)

# HTTP user agent default
USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.1)'
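USER_AGENT (and HTTP_CACHE) feed into the crawler's HTTP session. A minimal sketch using the requests library (illustrative; memorious wires its session up elsewhere):

import requests

def make_session():
    # Every outgoing request carries the configured user agent.
    session = requests.Session()
    session.headers['User-Agent'] = USER_AGENT
    return session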
Example #9
import os

from servicelayer import env
from servicelayer import settings as sls

# Base operating path (its default is set earlier in the full module)
BASE_PATH = env.get("MEMORIOUS_BASE_PATH", BASE_PATH)

# Default the servicelayer archive path when it is not already set
sls.ARCHIVE_PATH = sls.ARCHIVE_PATH or os.path.join(BASE_PATH, "archive")

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get("MEMORIOUS_CONFIG_PATH")

# Try to run scrapers so that they only acquire new data
INCREMENTAL = env.to_bool("MEMORIOUS_INCREMENTAL", default=True)

# Continue running the crawler even when we encounter an error
CONTINUE_ON_ERROR = env.to_bool("MEMORIOUS_CONTINUE_ON_ERROR", default=False)

# How many days until an incremental crawl expires
EXPIRE = env.to_int("MEMORIOUS_EXPIRE", 1)

# How many db inserts per minute
DB_RATE_LIMIT = env.to_int("MEMORIOUS_DB_RATE_LIMIT", 6000)

# How many http requests to a host per minute
HTTP_RATE_LIMIT = env.to_int("MEMORIOUS_HTTP_RATE_LIMIT", 120)

# Max number of tasks in a stage's task queue
MAX_QUEUE_LENGTH = env.to_int("MEMORIOUS_MAX_QUEUE_LENGTH", 50000)

# HTTP request configuration
HTTP_CACHE = env.to_bool("MEMORIOUS_HTTP_CACHE", default=True)

# HTTP request timeout, in seconds
HTTP_TIMEOUT = float(env.to_int("MEMORIOUS_HTTP_TIMEOUT", 30))
Example #10
File: settings.py Project: wayne9qiu/aleph
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# Document conversion service
CONVERT_URL = env.get("UNOSERVICE_URL", "http://convert-document:3000/convert")
CONVERT_URL = env.get("INGESTORS_CONVERT_DOCUMENT_URL", CONVERT_URL)
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200)  # 2 hrs

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

# Geonames data file
GEONAMES_PATH = env.get("INGESTORS_GEONAMES_PATH",
                        "/ingestors/data/geonames.txt")

# fastText language identification (LID) model file
LID_MODEL_PATH = env.get("INGESTORS_LID_MODEL_PATH",
                         "/ingestors/data/lid.176.ftz")

# Toggle entity extraction (enabled by default)
ANALYZE_ENTITIES = env.to_bool("INGESTORS_ANALYZE_ENTITIES", True)

# List of available NER models
NER_MODELS = set(env.to_list("INGESTORS_NER_MODELS", ["eng"]))
NER_DEFAULT_MODEL = "xx"

# Reuse the ALEPH_DATABASE_URI variable set in aleph.env
sts.DATABASE_URI = env.get("ALEPH_DATABASE_URI", sts.DATABASE_URI)
Example #11
import multiprocessing
from servicelayer import env

# Redis cache
# URL format: redis://localhost:6379/0
REDIS_URL = env.get("REDIS_URL")
REDIS_SHORT = 84700  # roughly one day, in seconds
REDIS_LONG = REDIS_SHORT * 200
REDIS_EXPIRE = env.to_int("REDIS_EXPIRE", REDIS_SHORT * 7)
REDIS_PREFIX = "sla"

# Persistent database tags
TAGS_DATABASE_URI = env.get("TAGS_DATABASE_URI", "sqlite://")

# Worker
WORKER_RETRY = env.to_int("WORKER_RETRY", 3)
WORKER_THREADS = env.to_int("WORKER_THREADS", multiprocessing.cpu_count())
WORKER_REPORTING = env.to_bool("WORKER_REPORTING", True)

# Amazon client credentials
AWS_KEY_ID = env.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = env.get("AWS_SECRET_ACCESS_KEY")
AWS_REGION = env.get("AWS_REGION", "eu-west-1")
# S3-compatible MinIO endpoint, if MinIO is used for storage
ARCHIVE_ENDPOINT_URL = env.get("ARCHIVE_ENDPOINT_URL")

# Storage type (either 's3', 'gs', or 'file', i.e. local file system):
ARCHIVE_TYPE = env.get("ARCHIVE_TYPE", "file")
ARCHIVE_BUCKET = env.get("ARCHIVE_BUCKET")
ARCHIVE_PATH = env.get("ARCHIVE_PATH")
PUBLICATION_BUCKET = env.get("PUBLICATION_BUCKET", ARCHIVE_BUCKET)
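ARCHIVE_TYPE decides which of the remaining archive settings apply. A hypothetical summary helper (archive_config is not part of servicelayer):

def archive_config():
    # Bucket-based backends need a bucket and, for MinIO, an endpoint URL.
    if ARCHIVE_TYPE in ('s3', 'gs'):
        return {'type': ARCHIVE_TYPE, 'bucket': ARCHIVE_BUCKET,
                'endpoint': ARCHIVE_ENDPOINT_URL}
    # 'file' keeps everything on the local file system.
    return {'type': 'file', 'path': ARCHIVE_PATH}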
Example #12
from servicelayer import env
from servicelayer import settings as sls
from ftmstore import settings as sts

TESTING = False

# When set to True, a debugpy server will be enabled in cli.py process()
DEBUGPY_PROCESS = env.to_bool("INGESTORS_DEBUGPY_PROCESS", False)
# The address that the debugpy server should bind to
DEBUGPY_ADDRESS = env.get("INGESTORS_DEBUGPY_ADDRESS", "0.0.0.0")
# The port that the debugpy server should listen for a connection on
DEBUGPY_PORT = env.to_int("INGESTORS_DEBUGPY_PORT", 5678)
# When set to True, after setting up the debug server the application will block
# and wait for a client connection before continuing with processing
DEBUGPY_WAIT_FOR_CLIENT = env.to_bool("INGESTORS_DEBUGPY_WAIT_FOR_CLIENT", False)

# Document conversion service
CONVERT_URL = env.get("UNOSERVICE_URL", "http://convert-document:3000/convert")
CONVERT_URL = env.get("INGESTORS_CONVERT_DOCUMENT_URL", CONVERT_URL)
CONVERT_TIMEOUT = env.to_int("INGESTORS_CONVERT_TIMEOUT", 7200)  # 2 hrs

# Enable (expensive!) Google Cloud API
OCR_VISION_API = env.to_bool("INGESTORS_OCR_VISION_API", False)

# Enable Google Cloud Translation API
TRANSLATION_API = env.to_bool("INGESTORS_TRANSLATION_API", False)

# Whitelist of language IDs for languages that should be translated.
# An empty whitelist is a wildcard: all languages may be translated.
TRANSLATION_LANGUAGE_WHITE_LIST = env.to_list("INGESTORS_TRANSLATION_LANGUAGE_WHITE_LIST", None)
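The wildcard rule above reduces to a one-line membership check. A hypothetical helper (should_translate is not in the source):

def should_translate(language):
    # An empty (or unset) whitelist allows every language.
    whitelist = TRANSLATION_LANGUAGE_WHITE_LIST or []
    return not whitelist or language in whitelist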
Example #13
import multiprocessing
from servicelayer import env

# Redis cache
REDIS_URL = env.get('REDIS_URL')
REDIS_SHORT = 84700  # roughly one day, in seconds
REDIS_LONG = REDIS_SHORT * 200
REDIS_EXPIRE = env.to_int('REDIS_EXPIRE', REDIS_SHORT * 7)
REDIS_PREFIX = 'sla'

# Worker
WORKER_RETRY = env.to_int('WORKER_RETRY', 3)
WORKER_THREADS = min(8, multiprocessing.cpu_count())
WORKER_THREADS = env.to_int('WORKER_THREADS', WORKER_THREADS)

# Amazon client credentials
AWS_KEY_ID = env.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_KEY = env.get('AWS_SECRET_ACCESS_KEY')
AWS_REGION = env.get('AWS_REGION', 'eu-west-1')

# Storage type (either 's3', 'gs', or 'file', i.e. local file system):
ARCHIVE_TYPE = env.get('ARCHIVE_TYPE', 'file')
ARCHIVE_BUCKET = env.get('ARCHIVE_BUCKET')
ARCHIVE_PATH = env.get('ARCHIVE_PATH')
Example #14
import os

from servicelayer import env

# Result highlighting
RESULT_HIGHLIGHT = env.to_bool('ALEPH_RESULT_HIGHLIGHT', True)

# Minimum update date for sitemap.xml
SITEMAP_FLOOR = '2018-12-09'

##############################################################################
# E-mail settings

MAIL_FROM = env.get('ALEPH_MAIL_FROM', '*****@*****.**')
MAIL_SERVER = env.get('ALEPH_MAIL_HOST', 'localhost')
MAIL_USERNAME = env.get('ALEPH_MAIL_USERNAME')
MAIL_PASSWORD = env.get('ALEPH_MAIL_PASSWORD')
MAIL_USE_SSL = env.to_bool('ALEPH_MAIL_SSL', True)
MAIL_PORT = env.to_int('ALEPH_MAIL_PORT', 465)

###############################################################################
# Database, search index and queue processing.

DATABASE_URI = env.get('ALEPH_DATABASE_URI')
SQLALCHEMY_TRACK_MODIFICATIONS = False
ALEMBIC_DIR = os.path.join(os.path.dirname(__file__), 'migrate')
ALEMBIC_DIR = os.path.abspath(ALEMBIC_DIR)

ELASTICSEARCH_URL = env.get('ALEPH_ELASTICSEARCH_URI', 'http://localhost:9200')
ELASTICSEARCH_TIMEOUT = env.to_int('ELASTICSEARCH_TIMEOUT', 30)

INDEX_PREFIX = env.get('ALEPH_INDEX_PREFIX', APP_NAME)
INDEX_WRITE = env.get('ALEPH_INDEX_WRITE', 'v1')
INDEX_READ = env.to_list('ALEPH_INDEX_READ', [INDEX_WRITE])
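The write/read split lets a deployment write into a new index version while still querying older ones during reindexing. A hypothetical illustration of how these settings could combine into index names (index_name is not aleph's actual helper):

def index_name(version):
    # e.g. 'aleph-entity-v1' when INDEX_PREFIX is 'aleph'
    return '-'.join((INDEX_PREFIX, 'entity', version))

WRITE_INDEX = index_name(INDEX_WRITE)
READ_INDEXES = [index_name(v) for v in INDEX_READ]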
Example #15
import os

from servicelayer import env

# Result highlighting
RESULT_HIGHLIGHT = env.to_bool('ALEPH_RESULT_HIGHLIGHT', True)

# Minimum update date for sitemap.xml
SITEMAP_FLOOR = '2019-06-22'

##############################################################################
# E-mail settings

MAIL_FROM = env.get('ALEPH_MAIL_FROM', '*****@*****.**')
MAIL_SERVER = env.get('ALEPH_MAIL_HOST', 'localhost')
MAIL_USERNAME = env.get('ALEPH_MAIL_USERNAME')
MAIL_PASSWORD = env.get('ALEPH_MAIL_PASSWORD')
MAIL_USE_SSL = env.to_bool('ALEPH_MAIL_SSL', True)
MAIL_PORT = env.to_int('ALEPH_MAIL_PORT', 465)

###############################################################################
# Database, search index and queue processing.

QUEUE_RETRY = env.to_int('ALEPH_QUEUE_RETRY', 3)

DATABASE_URI = env.get('ALEPH_DATABASE_URI')
SQLALCHEMY_TRACK_MODIFICATIONS = False
ALEMBIC_DIR = os.path.join(os.path.dirname(__file__), 'migrate')
ALEMBIC_DIR = os.path.abspath(ALEMBIC_DIR)

ELASTICSEARCH_URL = env.get('ALEPH_ELASTICSEARCH_URI', 'http://localhost:9200')
ELASTICSEARCH_TIMEOUT = env.to_int('ELASTICSEARCH_TIMEOUT', 30)

INDEX_PREFIX = env.get('ALEPH_INDEX_PREFIX', APP_NAME)
Example #16
File: settings.py Project: pudo/aleph
import os

from servicelayer import env

# Result highlighting
RESULT_HIGHLIGHT = env.to_bool('ALEPH_RESULT_HIGHLIGHT', True)

# Minimum update date for sitemap.xml
SITEMAP_FLOOR = '2018-12-09'


##############################################################################
# E-mail settings

MAIL_FROM = env.get('ALEPH_MAIL_FROM', '*****@*****.**')
MAIL_SERVER = env.get('ALEPH_MAIL_HOST', 'localhost')
MAIL_USERNAME = env.get('ALEPH_MAIL_USERNAME')
MAIL_PASSWORD = env.get('ALEPH_MAIL_PASSWORD')
MAIL_USE_SSL = env.to_bool('ALEPH_MAIL_SSL', True)
MAIL_PORT = env.to_int('ALEPH_MAIL_PORT', 465)


###############################################################################
# Database, search index and queue processing.

DATABASE_URI = env.get('ALEPH_DATABASE_URI')
SQLALCHEMY_TRACK_MODIFICATIONS = False
ALEMBIC_DIR = os.path.join(os.path.dirname(__file__), 'migrate')
ALEMBIC_DIR = os.path.abspath(ALEMBIC_DIR)

ELASTICSEARCH_URL = env.get('ALEPH_ELASTICSEARCH_URI', 'http://localhost:9200')
ELASTICSEARCH_TIMEOUT = env.to_int('ELASTICSEARCH_TIMEOUT', 30)

INDEX_PREFIX = env.get('ALEPH_INDEX_PREFIX', APP_NAME)
INDEX_WRITE = env.get('ALEPH_INDEX_WRITE', 'v1')
Example #17
import multiprocessing
import os

from servicelayer import env
from servicelayer import settings as sls

# Base operating path
BASE_PATH = os.path.join(os.getcwd(), 'data')
BASE_PATH = env.get('MEMORIOUS_BASE_PATH', BASE_PATH)

# Default the servicelayer archive path when it is not already set
sls.ARCHIVE_PATH = sls.ARCHIVE_PATH or os.path.join(BASE_PATH, 'archive')

# Directory which contains crawler pipeline YAML specs
CONFIG_PATH = env.get('MEMORIOUS_CONFIG_PATH')

# Try to run scrapers so that they only acquire new data
INCREMENTAL = env.to_bool('MEMORIOUS_INCREMENTAL', default=True)

# How many days until an incremental crawl expires
EXPIRE = env.to_int('MEMORIOUS_EXPIRE', 60)

# How many db inserts per minute
DB_RATE_LIMIT = env.to_int('MEMORIOUS_DB_RATE_LIMIT', 6000)

# How many http requests to a host per minute
HTTP_RATE_LIMIT = env.to_int('MEMORIOUS_HTTP_RATE_LIMIT', 120)  # noqa

# How many seconds to wait before trying to run scheduled crawlers
SCHEDULER_INTERVAL = env.to_int('MEMORIOUS_SCHEDULER_INTERVAL', 60)

# How many threads to use for execution
THREADS = env.to_int('MEMORIOUS_THREADS', min(8, multiprocessing.cpu_count()))

# Max scheduled tasks at the same time (never fewer than 20)
MAX_SCHEDULED = max(env.to_int('MEMORIOUS_MAX_SCHEDULED', THREADS), 20)