def start_up(search_config_id: str,
             search_config_url: str = None,
             search_config_title: str = None,
             detail_config_id: str = None,
             metadata_config_url: str = None,
             detail_config_url: str = None,
             detail_config_title: str = None):
    conf = get_config()
    search_config_index_name = conf.get_string("search-config.index-name")
    search_config = conf.get_config("presets.file-search.search-config")
    detail_config_index_name = conf.get_string("detail-config.index-name")
    detail_config = conf.get_config("presets.file-search.detail-config")
    # the detail config defaults to the same document id as the search config
    if detail_config_id is None:
        detail_config_id = search_config_id
    # create the client outside the try block so that a failed start()
    # cannot leave es_client unbound when the finally clause runs
    es_client = start()
    try:
        auth_header = get_admin_auth_header()
        if not es_client.exists(search_config_index_name, search_config_id,
                                headers=auth_header):
            create_search_config(es_client, search_config_index_name,
                                 search_config_id, search_config,
                                 search_config_url, search_config_title,
                                 headers=auth_header)
        if not es_client.exists(detail_config_index_name, detail_config_id,
                                headers=auth_header):
            create_detail_config(es_client, detail_config_index_name,
                                 detail_config_id, detail_config,
                                 metadata_config_url, detail_config_url,
                                 detail_config_title, headers=auth_header)
    finally:
        es_client.close()
def start():
    # build an OpenDistro client from the "elasticsearch" section of the config
    conf = get_config().get_config("elasticsearch")
    return OpenDistro(**conf)
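# Usage sketch: elsewhere in this code base the client returned by start() is
# used with the elasticsearch-py call style (exists/search/close), so the usual
# connect/use/close pattern applies; es_client.info() below is only an
# illustrative smoke test, not part of the original module.
es_client = start()
try:
    print(es_client.info())
finally:
    es_client.close()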
import logging

from config.utils import get_admin_auth_header, get_config
from start_client import start

from .loaddata import (create_sample_index, load_detail_config,
                       load_sample_config, load_sample_data)

index_name = "sample-data"
conf = get_config()


def start_up():
    es_client = start()
    try:
        auth_header = get_admin_auth_header()
        # only bootstrap the sample index and its configs on first start
        if not es_client.indices.exists(index_name, headers=auth_header):
            create_sample_index(es_client, index_name, headers=auth_header)
            load_sample_data(es_client, index_name, num_docs=30,
                             headers=auth_header)
            load_sample_config(
                es_client,
                index_name=conf.get_string("search-config.index-name"),
                headers=auth_header)
            load_detail_config(
                es_client,
                index_name=conf.get_string("detail-config.index-name"),
                headers=auth_header)
    finally:
        es_client.close()
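# A minimal sketch of what an index-creation helper such as create_sample_index
# could look like; this is an assumption for illustration, not the project's
# actual implementation, and the mapping fields are made up.
def create_sample_index_sketch(es_client, index_name, headers=None):
    body = {
        "mappings": {
            "properties": {
                "title": {"type": "text"},
                "created": {"type": "date"}
            }
        }
    }
    # indices.create mirrors the indices.exists call used in start_up above
    es_client.indices.create(index=index_name, body=body, headers=headers)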
def run_job(args, config):
    app_name_template = config.get('global').get('description')
    project_name = args["--project"]
    country = args["--country"]

    # read spark/hadoop information
    save_hdfs = (args["--save-hdfs"].lower() == "true")
    remove_from_local_hdfs = (args["--remove-from-local-hdfs"].lower() == "true")

    # file system saving info
    dependent_schema_dir = args["--dependent-schema-dir"]
    base_directory = args["--base-directory"]
    project_schema = "dw_{}_{}".format(project_name, country)
    save_dir = "{}/{}".format(base_directory, project_schema)

    target_database_config = {
        "type": args["--target-type"],
        "hostname": args["--target-hostname"],
        "port": args["--target-port"],
        "username": args["--target-username"],
        "password": args["--target-password"],
        "database": args["--target-database"],
        "schema": args["--target-schema"]
    }

    # s3 bucket information for writing
    use_s3_dist_cp = (args["--use-s3-dist-cp"].lower() == "true")
    aws_access_key = args["--aws-access-key"]
    aws_secret_key = args["--aws-secret-key"]
    s3_bucket = args["--s3-bucket-name"]

    # file related args
    input_file_type = args["--input-file-type"]
    output_file_type = args["--output-file-type"]
    separator = args["--csv-separator"]
    header = (args["--csv-header"].lower() == "true")
    infer_schema = (args["--infer-schema"].lower() == "true")

    # validate that the s3 interface is consistent with the given options
    validate_s3_interface(use_s3_dist_cp, s3_bucket)

    metadata_config = config.get('metadata')
    project_config_file = '../config/dimensions-facts/{}/{}.yml'.format(
        project_name, country)
    project_config = get_config(project_config_file).get('default')

    app_name = app_name_template.format(project_name, country, output_file_type)
    spark = create_spark_session(aws_access_key, aws_secret_key, app_name)

    dimensions_config = project_config.get("dimensions")
    facts_config = project_config.get("facts")
    dependent_schemas = project_config.get("dependency")

    # load all tables (stages) from all dependent schemas to calculate
    # dimensions and facts
    for schema in dependent_schemas:
        logging.info("loading tables from {} schema".format(schema))
        schema_name = dependent_schemas.get(schema)
        stage_tables_info = get_table_info(metadata_config, schema_name)
        # for each stage table found, register a temporary view for spark
        for stage_table in stage_tables_info:
            table_name = stage_table[0]
            load_dependent_table(spark, dependent_schema_dir, schema_name,
                                 table_name, input_file_type)

    # load user defined functions (udf) to use as spark sql functions
    load_spark_sql_udf(spark)

    # process each dimension
    for dimension in dimensions_config:
        # unpersist dataframe from cache after write
        unpersist_after_write = True
        # get sql file
        dim_file = dimensions_config.get(dimension).get("file")
        # get partition column list
        dim_partition = dimensions_config.get(dimension).get("partition")
        if dim_partition is not None:
            dim_partition = dim_partition.replace(" ", "").split(',')
        _process_dim_fact(spark=spark,
                          project_name=project_name,
                          project_schema=project_schema,
                          country=country,
                          dim_fact_name=dimension,
                          dim_fact_file=dim_file,
                          dim_fact_partition=dim_partition,
                          output_file_type=output_file_type,
                          separator=separator,
                          header=header,
                          use_s3_dist_cp=use_s3_dist_cp,
                          s3_bucket=s3_bucket,
                          save_hdfs=save_hdfs,
                          remove_from_local_hdfs=remove_from_local_hdfs,
                          save_dir=save_dir,
                          database_config=target_database_config,
                          unpersist_after_write=unpersist_after_write,
                          infer_schema=infer_schema)
        logging.info("Dimension process {} finished".format(dimension))

    # process each fact
    for fact in facts_config:
        # unpersist dataframe from cache after write
        unpersist_after_write = True
        # get sql file
        fact_file = facts_config.get(fact).get("file")
        # get partition column list
        fact_partition = facts_config.get(fact).get("partition")
        if fact_partition is not None:
            fact_partition = fact_partition.replace(" ", "").split(',')
        _process_dim_fact(spark=spark,
                          project_name=project_name,
                          project_schema=project_schema,
                          country=country,
                          dim_fact_name=fact,
                          dim_fact_file=fact_file,
                          dim_fact_partition=fact_partition,
                          output_file_type=output_file_type,
                          separator=separator,
                          header=header,
                          use_s3_dist_cp=use_s3_dist_cp,
                          s3_bucket=s3_bucket,
                          save_hdfs=save_hdfs,
                          remove_from_local_hdfs=remove_from_local_hdfs,
                          save_dir=save_dir,
                          database_config=target_database_config,
                          unpersist_after_write=unpersist_after_write,
                          infer_schema=infer_schema)
        logging.info("Fact process {} finished".format(fact))

    # spark is no longer needed
    stop_spark_context(spark)
if __name__ == '__main__':
    args = docopt(__doc__, version='1')
    # configure log
    set_log(args['--log-level'])
    env = os.getenv('env_type', 'default')
    config = get_config('../config/load-dim-fact.yml').get(env)
    run_job(args, config)
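# For reference, run_job expects each ../config/dimensions-facts/<project>/<country>.yml
# file to parse into a structure shaped like the dict below. The shape is
# inferred from how dependency/dimensions/facts are read above; all names are
# illustrative assumptions.
_example_project_config = {
    "default": {
        # schema alias -> dependent schema name, used to load stage tables
        "dependency": {"stage": "stage_myproject_br"},
        "dimensions": {
            # each dimension needs a sql file; "partition" is an optional
            # comma-separated list of partition columns
            "dim_customer": {"file": "dim_customer.sql",
                             "partition": "country, created_at"}
        },
        "facts": {
            "fact_orders": {"file": "fact_orders.sql"}
        }
    }
}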
def preview_router(index_name: str,
                   cache_path: str,
                   path_property: str,
                   tags: List[str] = ["preview"]):
    router = APIRouter()
    query_builder = ElasticsearchAPIQueryBuilder()
    conf = get_config()
    manager = PreviewManager(cache_path, create_folder=True)

    @query_builder.filter()
    def filter_config(id: str = Path(None, description="Id of the document to preview.")):
        # restrict the search to the single document being previewed
        return {
            "ids": {
                "values": [id]
            }
        }

    @router.get("/preview/{id}", tags=tags)
    async def preview(
            page: Optional[int] = Query(0, ge=0, description="The page of the document to generate the preview."),
            width: Optional[int] = Query(300, ge=1, le=1024, description="The width of the generated preview."),
            height: Optional[int] = Query(200, ge=1, le=1024, description="The height of the generated preview."),
            query_body: Dict = Depends(query_builder.build(source=[path_property])),
            es_client: Elasticsearch = Depends(get_client),
            auth_header: Dict = Depends(get_auth_header)) -> FileResponse:
        resp = es_client.search(
            body=query_body,
            headers=auth_header,
            index=index_name
        )
        if resp["hits"]["total"]["value"] > 0:
            document_path = resp["hits"]["hits"][0]["_source"][path_property]
            path_to_preview_image = manager.get_jpeg_preview(document_path,
                                                             page=page,
                                                             width=width,
                                                             height=height)
            return FileResponse(path_to_preview_image)
        else:
            raise HTTPException(status_code=404, detail="Document not found")

    @router.get("/preview/info/{id}", tags=tags, response_model=PreviewInfoModel)
    async def preview_info(
            query_body: Dict = Depends(query_builder.build(source=[path_property])),
            es_client: Elasticsearch = Depends(get_client),
            auth_header: Dict = Depends(get_auth_header)) -> PreviewInfoModel:
        resp = es_client.search(
            body=query_body,
            headers=auth_header,
            index=index_name
        )
        if resp["hits"]["total"]["value"] > 0:
            document_path = resp["hits"]["hits"][0]["_source"][path_property]
            if os.path.isfile(document_path):
                supported = manager.has_jpeg_preview(document_path)
                pages = manager.get_page_nb(document_path)
                return PreviewInfoModel(supported=supported, pages=pages)
            return PreviewInfoModel(supported=False, pages=0)
        raise HTTPException(status_code=404, detail="Document not found")

    return router
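# Usage sketch: preview_router returns a plain APIRouter, so it can be mounted
# on a FastAPI app as below; the index name, cache path and path property
# values are illustrative assumptions.
from fastapi import FastAPI

app = FastAPI()
app.include_router(preview_router(index_name="sample-data",
                                  cache_path="/tmp/preview-cache",
                                  path_property="path"))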
import base64

from fastapi import Form
from fastapi.security import HTTPBearer
from opendistro import OpenDistro
from passlib.context import CryptContext
from pydantic import BaseModel
from starlette.responses import JSONResponse

from .basic import auth_header as basic_auth_header

security = HTTPBearer(bearerFormat="JWT")


def load_secret_key(jwt_config):
    # the key may come from a secret file or directly from the config entry;
    # load_secret is defined elsewhere in the project
    secret_key = load_secret(jwt_config, "secret-file", "secret-key")
    return base64.b64decode(secret_key).decode("utf-8")


app_config = get_config()
jwt_config = app_config.get_config("jwt")
secret_key = load_secret_key(jwt_config)
algorithm = jwt_config.get_string("algorithm")
access_token_expires_minutes = jwt_config.get_int(
    "access-token-expire-minutes")


class Token(BaseModel):
    access_token: str
    token_type: str


class UserPasswordForm:
    def __init__(self,
                 username: str = Form(...),
                 password: str = Form(...)):
        self.username = username
        self.password = password
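# Sketch of how these settings are typically combined to mint a token. This
# assumes python-jose (jose.jwt), a common choice in FastAPI code bases but an
# assumption here; create_access_token_sketch is not part of the original module.
from datetime import datetime, timedelta

from jose import jwt


def create_access_token_sketch(username: str) -> Token:
    expire = datetime.utcnow() + timedelta(minutes=access_token_expires_minutes)
    claims = {"sub": username, "exp": expire}
    return Token(access_token=jwt.encode(claims, secret_key, algorithm=algorithm),
                 token_type="bearer")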
                          file_type=file_type,
                          separator=separator,
                          header=header,
                          infer_schema=infer_schema)

        if use_s3_dist_cp:
            # write the dataframe into s3 through the s3-dist-cp binary
            send_s3_using_s3_dist_cp(save_dir=save_dir,
                                     project_name=project_name,
                                     table_name=table_name,
                                     s3_bucket=s3_bucket,
                                     file_type=file_type)

        # delete temporary hdfs files
        delete_hdfs_file(remove_from_local_hdfs, source_schema, table_name)
        logging.info("Extract process for table {} finished".format(table_name))

    # close spark context
    stop_spark_context(spark)


if __name__ == '__main__':
    args = docopt(__doc__, version='1')
    # configure log
    set_log(args['--log-level'])
    config_file_path = args['--config-file']
    env = os.getenv('env_type', 'default')
    config = get_config(config_file_path).get(env)
    run_job(args, config)
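# Note: docopt(__doc__, version='1') only works if this module's docstring
# defines the CLI grammar. A minimal illustrative pattern (an assumption; the
# real usage string must list every --option read in run_job) would be:
#
#     Usage:
#         extract.py --config-file=<path> --log-level=<level> [options]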
def init_config(app: web.Application) -> None:
    # store the parsed application config on the aiohttp app for handlers to use
    app['config'] = get_config()
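# Usage sketch: init_config is an aiohttp-style setup helper; a minimal app
# factory (an illustrative assumption, not part of the original module) would
# call it before the app starts serving.
from aiohttp import web


def create_app() -> web.Application:
    app = web.Application()
    init_config(app)
    return app


if __name__ == '__main__':
    web.run_app(create_app())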