def _conf_db(app, db=None):
    from widukind_common.utils import get_mongo_db
    if not db:
        app.widukind_db = get_mongo_db(
            app.config.get("MONGODB_URL").strip('"'), connect=False)
    else:
        app.widukind_db = db
def export_file_csv_dataset_unit(doc=None, provider=None, dataset_code=None,
                                 slug=None):
    """Create a CSV file from one Dataset and record it in MongoDB GridFS"""
    db = get_mongo_db()

    if not doc:
        if slug:
            doc = db[constants.COL_DATASETS].find_one({"slug": slug})
        else:
            if not provider:
                raise ValueError("provider is required")
            if not dataset_code:
                raise ValueError("dataset_code is required")
            query = {}
            query['provider_name'] = provider
            query['dataset_code'] = dataset_code
            doc = db[constants.COL_DATASETS].find_one(query, {'revisions': 0})

    if not doc:
        raise Exception("Dataset not found for provider[%s] - dataset[%s] - slug[%s]"
                        % (provider, dataset_code, slug))

    values = export_dataset(db, doc)

    return record_csv_file(db,
                           values,
                           provider_name=doc['provider_name'],
                           dataset_code=doc["dataset_code"],
                           slug=doc["slug"],
                           prefix="dataset")
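# Usage sketch (not from the original source): export one dataset to CSV in
# GridFS, looked up by slug or by (provider, dataset_code). The slug value is
# illustrative.
def _example_export_file_csv_dataset_unit():
    return export_file_csv_dataset_unit(slug="insee-ipch-2015-fr-coicop")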
def _conf_db(app, db=None):
    import gridfs
    from widukind_common.utils import get_mongo_db
    from widukind_web.utils import create_or_update_indexes

    if not db:
        app.widukind_db = get_mongo_db(app.config.get("MONGODB_URL").strip('"'),
                                       connect=False)
    else:
        app.widukind_db = db

    app.widukind_fs = gridfs.GridFS(app.widukind_db)
    create_or_update_indexes(app.widukind_db)
def setUp(self):
    BaseTestCase.setUp(self)
    db = get_mongo_db()
    self.db = db.client["widukind_test"]
    self.assertEqual(self.db.name, "widukind_test")
    utils.clean_mongodb(self.db)
    create_or_update_indexes(self.db, force_mode=True)
def consolidate_all_dataset(provider_name=None, db=None, max_bulk=20):

    db = db or utils.get_mongo_db()

    query = {"provider_name": provider_name}
    projection = {"_id": True, "dataset_code": True}
    cursor = db[constants.COL_DATASETS].find(query, projection)
    dataset_codes = [doc["dataset_code"] for doc in cursor]

    bulk_requests = db[constants.COL_DATASETS].initialize_unordered_bulk_op()
    bulk_size = 0
    results = []

    for dataset_code in dataset_codes:
        query, query_modify = consolidate_dataset(provider_name, dataset_code,
                                                  db=db, execute=False)
        if not query:
            logger.warning("bypass dataset [%s]" % dataset_code)
            continue

        bulk_size += 1
        bulk_requests.find(query).update_one(query_modify)

        if bulk_size > max_bulk:
            result = _run_bulk(db, bulk_requests)
            if result:
                results.append(result)
            bulk_requests = db[constants.COL_DATASETS].initialize_unordered_bulk_op()
            bulk_size = 0

    if bulk_size > 0:
        result = _run_bulk(db, bulk_requests)
        if result:
            results.append(result)

    results_details = {
        "matched_count": 0,
        "modified_count": 0,
    }

    for r in results:
        results_details["matched_count"] += r["nMatched"]
        results_details["modified_count"] += r["nModified"]

    return results_details
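# Usage sketch (not from the original source): consolidate every dataset of a
# provider and report the aggregated bulk counters; "INSEE" is an illustrative
# provider name.
def _example_consolidate_all_dataset():
    details = consolidate_all_dataset(provider_name="INSEE", max_bulk=50)
    print("matched: %(matched_count)s - modified: %(modified_count)s" % details)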
def __init__(self, provider_name=None, db=None, is_indexes=True, version=0,
             max_errors=5, use_existing_file=False, not_remove_files=False,
             async_mode=False, async_framework="gevent", **kwargs):
    """
    :param str provider_name: Provider Name
    :param pymongo.database.Database db: MongoDB Database instance
    :param bool is_indexes: Bypass create_or_update_indexes() if False
    :raises ValueError: if provider_name is None
    """
    if not provider_name:
        raise ValueError("provider_name is required")

    self.provider_name = provider_name
    self.db = db or get_mongo_db()
    self.version = version
    self.max_errors = max_errors
    self.use_existing_file = use_existing_file
    self.not_remove_files = not_remove_files
    self.async_mode = async_mode
    self.async_framework = async_framework

    if self.async_mode:
        logger.info("ASYNC MODE ENABLED")
    else:
        logger.info("ASYNC MODE DISABLED")

    self.provider = None
    self.errors = 0

    self.categories_filter = []  # [category_code]
    self.datasets_filter = []    # [dataset_code]
    self.selected_datasets = {}

    self.store_path = os.path.abspath(os.path.join(tempfile.gettempdir(),
                                                   self.provider_name))
    self.for_delete = []

    if IS_SCHEMAS_VALIDATION_DISABLE:
        logger.warning("schemas validation is disabled")
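# Usage sketch (hypothetical): the constructor above belongs to a fetcher base
# class; a concrete provider subclass (name assumed here) would be built like
# this.
def _example_build_fetcher():
    return MyProviderFetcher(provider_name="BIS", async_mode=False)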
def setUp(self):
    BaseTestCase.setUp(self)

    from widukind_common.utils import get_mongo_db, create_or_update_indexes
    from widukind_common import tests_tools as utils

    db = get_mongo_db()
    self.db = db.client["widukind_test"]
    self.assertEqual(self.db.name, "widukind_test")
    utils.clean_mongodb(self.db)
    create_or_update_indexes(self.db, force_mode=True)
    self._collections_is_empty()
def export_file_csv_dataset(provider=None, dataset_code=None, slug=None):
    """Create CSV files from one or more Datasets and record them in MongoDB GridFS"""
    db = get_mongo_db()

    projection = {'concepts': False, "codelists": False}
    query = {}
    if slug:
        query["slug"] = slug
    else:
        query['provider_name'] = provider
        query['dataset_code'] = dataset_code

    datasets = db[constants.COL_DATASETS].find(query, projection)

    return [export_file_csv_dataset_unit(doc=doc) for doc in datasets]
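# Usage sketch (not from the original source): export all datasets matching a
# (provider, dataset_code) pair, or a single one by slug; values are
# illustrative.
def _example_export_file_csv_dataset():
    files = export_file_csv_dataset(provider="ECB", dataset_code="EXR")
    one_file = export_file_csv_dataset(slug="ecb-exr")
    return files, one_file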
def clean_mongodb(collection_list=None, db=None):
    """Drop and recreate all collections used by dlstats"""
    db = db or get_mongo_db()
    collection_list = collection_list or constants.COL_ALL

    for col in collection_list:
        try:
            db.drop_collection(col)
        except Exception:
            pass

    drop_gridfs(db)

    for col in collection_list:
        try:
            db.create_collection(col)
        except Exception:
            pass
def __init__(self, provider_name=None, db=None, is_indexes=True):
    """
    :param str provider_name: Provider Name
    :param pymongo.database.Database db: MongoDB Database instance
    :param bool is_indexes: Bypass create_or_update_indexes() if False
    :raises ValueError: if provider_name is None
    """
    if not provider_name:
        raise ValueError("provider_name is required")

    self.provider_name = provider_name
    self.db = db or get_mongo_db()
    self.provider = self.load_provider_from_db()

    if is_indexes:
        create_or_update_indexes(self.db)
def __init__(self, label='', colname=None, id_attr='_id', label_attr='',
             query=None, validators=None, sort=None, allow_blank=False,
             blank_text='---', **kwargs):
    super().__init__(label, validators, **kwargs)
    self.id_attr = id_attr
    self.label_attr = label_attr
    self.allow_blank = allow_blank
    self.blank_text = blank_text
    self.colname = colname
    self.query = query or {}  # avoid a mutable {} default argument
    self.sort = sort
    self.db = get_mongo_db()
    self.col = self.db[self.colname]
    self.queryset = self.col.find(self.query)
    if self.sort and isinstance(self.sort, tuple) and len(self.sort) == 2:
        self.queryset = self.queryset.sort(*self.sort)
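# Usage sketch (assumption): the __init__ above looks like a WTForms-style
# select field backed by a MongoDB collection. "QuerySetSelectField" is an
# assumed name for the enclosing class; collection and attribute names are
# illustrative.
from wtforms import Form

class _ExampleProviderForm(Form):
    provider = QuerySetSelectField(label="Provider",
                                   colname="providers",
                                   label_attr="name",
                                   sort=("name", 1))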
def export_file_csv_series_unit(doc=None, provider=None, dataset_code=None,
                                key=None, slug=None):
    """Create a CSV file from one series and record it in MongoDB GridFS"""
    db = get_mongo_db()

    if not doc:
        if slug:
            doc = db[constants.COL_SERIES].find_one({"slug": slug})
        else:
            if not provider:
                raise ValueError("provider is required")
            if not dataset_code:
                raise ValueError("dataset_code is required")
            if not key:
                raise ValueError("key is required")
            query = {}
            query['provider_name'] = provider
            query['dataset_code'] = dataset_code
            query['key'] = key
            doc = db[constants.COL_SERIES].find_one(query)

    if not doc:
        msg = "Series not found for provider[%s] - dataset[%s] - key[%s] - slug[%s]"
        raise Exception(msg % (provider, dataset_code, key, slug))

    return record_csv_file(db,
                           export_series(doc),
                           provider_name=doc['provider_name'],
                           dataset_code=doc["dataset_code"],
                           key=doc["key"],
                           slug=doc["slug"],
                           prefix="series")
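# Usage sketch (not from the original source): export one series to CSV in
# GridFS by (provider, dataset_code, key); the values are illustrative.
def _example_export_file_csv_series_unit():
    return export_file_csv_series_unit(provider="ECB",
                                       dataset_code="EXR",
                                       key="M.USD.EUR.SP00.A")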
def remove_all(cls, provider_name, db=None):
    db = db or get_mongo_db()
    query = {"provider_name": provider_name}
    logger.info("remove all categories for [%s]" % provider_name)
    return db[constants.COL_CATEGORIES].remove(query)
def _conf_db(app):
    from widukind_common.utils import get_mongo_db
    app.widukind_db = get_mongo_db(app.config.get("MONGODB_URL"))
def categories(cls, provider_name, db=None, **query):
    db = db or get_mongo_db()
    if "provider_name" not in query:
        query["provider_name"] = provider_name
    cursor = db[constants.COL_CATEGORIES].find(query)
    return {doc["category_code"]: doc for doc in cursor}
def count(cls, provider_name, db=None):
    db = db or get_mongo_db()
    query = {"provider_name": provider_name}
    return db[constants.COL_CATEGORIES].count(query)
def consolidate_dataset(provider_name=None, dataset_code=None, db=None,
                        execute=True):

    db = db or utils.get_mongo_db()

    logger.info("START consolidate provider[%s] - dataset[%s]"
                % (provider_name, dataset_code))

    query = {"provider_name": provider_name, "dataset_code": dataset_code}

    projection = {"_id": False, "dimensions": True, "attributes": True,
                  "values.attributes": True}
    cursor = db[constants.COL_SERIES].find(query, projection)

    projection = {"_id": True, "concepts": True, "codelists": True,
                  "dimension_keys": True, "attribute_keys": True}
    dataset = db[constants.COL_DATASETS].find_one(query, projection)

    codelists = {}

    for series in cursor:
        for k, v in series.get("dimensions").items():
            if k not in codelists:
                codelists[k] = []
            if v not in codelists[k]:
                codelists[k].append(v)

        if series.get("attributes"):
            for k, v in series.get("attributes").items():
                if k not in codelists:
                    codelists[k] = []
                if v not in codelists[k]:
                    codelists[k].append(v)

        for v in series.get("values"):
            if v.get("attributes"):
                for k1, v1 in v.get("attributes").items():
                    if k1 not in dataset["codelists"]:
                        continue
                    if k1 not in codelists:
                        codelists[k1] = []
                    if v1 not in codelists[k1]:
                        codelists[k1].append(v1)

    if logger.isEnabledFor(logging.DEBUG):
        for k, v in dataset["codelists"].items():
            logger.debug("BEFORE - codelist[%s]: %s" % (k, len(v)))
        logger.debug("BEFORE - concepts[%s]" % list(dataset["concepts"].keys()))
        logger.debug("BEFORE - dimension_keys[%s]" % dataset["dimension_keys"])
        logger.debug("BEFORE - attribute_keys[%s]" % dataset["attribute_keys"])

    new_codelists = {}
    new_concepts = {}
    new_dimension_keys = []
    new_attribute_keys = []

    for k, values in dataset["codelists"].items():
        # entry present in the codelists collected from series
        if k in codelists:
            new_values = {}
            for v1 in codelists[k]:
                # codelist value present in the codelists from the dataset
                if v1 in values:
                    new_values[v1] = values[v1]
            new_codelists[k] = new_values
            new_concepts[k] = dataset["concepts"].get(k)

            if k in dataset["dimension_keys"]:
                # unordered dimension_keys
                new_dimension_keys.append(k)
            elif k in dataset["attribute_keys"]:
                # unordered attribute_keys
                new_attribute_keys.append(k)

    # restore the original ordering of dimension_keys
    dimension_keys = [k for k in dataset["dimension_keys"]
                      if k in new_dimension_keys]

    # restore the original ordering of attribute_keys
    attribute_keys = [k for k in dataset.get("attribute_keys")
                      if k in new_attribute_keys]

    if logger.isEnabledFor(logging.DEBUG):
        for k, v in new_codelists.items():
            logger.debug("AFTER - codelist[%s]: %s" % (k, len(v)))
        logger.debug("AFTER - concepts[%s]" % list(new_concepts.keys()))
        logger.debug("AFTER - dimension_keys[%s]" % dimension_keys)
        logger.debug("AFTER - attribute_keys[%s]" % attribute_keys)

    # verify change in codelists
    #is_modify = hash_dict(new_codelists) == hash_dict(dataset["codelists"])
    is_modify = new_codelists != dataset["codelists"]

    # verify change in concepts
    #if not is_modify and hash_dict(new_concepts) != hash_dict(dataset["concepts"]):
    if is_modify is False and new_concepts != dataset["concepts"]:
        is_modify = True

    if is_modify is False:
        if execute:
            return None
        else:
            return None, None

    query = {"_id": dataset["_id"]}
    query_modify = {"$set": {
        "codelists": new_codelists,
        "concepts": new_concepts,
        "dimension_keys": dimension_keys,
        "attribute_keys": attribute_keys
    }}

    if execute:
        return db[constants.COL_DATASETS].update_one(query,
                                                     query_modify).modified_count
    else:
        return query, query_modify
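# Usage sketch (not from the original source): call consolidate_dataset() in
# dry-run mode (execute=False) to inspect the pending update before applying
# it; provider and dataset codes are illustrative.
def _example_consolidate_dataset_dry_run():
    query, query_modify = consolidate_dataset(provider_name="INSEE",
                                              dataset_code="IPCH-2015-FR-COICOP",
                                              execute=False)
    if query is None:
        return None  # nothing to consolidate
    db = utils.get_mongo_db()
    return db[constants.COL_DATASETS].update_one(query, query_modify)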
def search_category_for_dataset(cls, provider_name, dataset_code, db=None):
    db = db or get_mongo_db()
    query = {"provider_name": provider_name,
             "datasets.0": {"$exists": True},
             "datasets.dataset_code": dataset_code}
    return db[constants.COL_CATEGORIES].find_one(query)
def consolidate_dataset(provider_name=None, dataset_code=None, db=None,
                        execute=True):

    db = db or utils.get_mongo_db()

    logger.info("START consolidate provider[%s] - dataset[%s]"
                % (provider_name, dataset_code))

    query = {"provider_name": provider_name, "dataset_code": dataset_code}

    projection = {
        "_id": False,
        "dimensions": True,
        "attributes": True,
        "values.attributes": True
    }
    cursor = db[constants.COL_SERIES].find(query, projection)

    projection = {
        "_id": True,
        "concepts": True,
        "codelists": True,
        "dimension_keys": True,
        "attribute_keys": True
    }
    dataset = db[constants.COL_DATASETS].find_one(query, projection)

    if dataset is None:
        if execute:
            return None
        else:
            return None, None

    old_codelists = dataset.get("codelists") or {}
    old_concepts = dataset.get("concepts") or {}
    old_dimension_keys = dataset.get("dimension_keys") or []
    old_attribute_keys = dataset.get("attribute_keys") or []

    codelists = {}

    for series in cursor:
        for k, v in series.get("dimensions").items():
            if k not in codelists:
                codelists[k] = []
            if v not in codelists[k]:
                codelists[k].append(v)

        if series.get("attributes"):
            for k, v in series.get("attributes").items():
                if k not in codelists:
                    codelists[k] = []
                if v not in codelists[k]:
                    codelists[k].append(v)

        for v in series.get("values"):
            if v.get("attributes"):
                for k1, v1 in v.get("attributes").items():
                    if k1 not in old_codelists:
                        continue
                    if k1 not in codelists:
                        codelists[k1] = []
                    if v1 not in codelists[k1]:
                        codelists[k1].append(v1)

    if logger.isEnabledFor(logging.DEBUG):
        for k, v in old_codelists.items():
            logger.debug("BEFORE - codelist[%s]: %s" % (k, len(v)))
        logger.debug("BEFORE - concepts[%s]" % list(old_concepts.keys()))
        logger.debug("BEFORE - dimension_keys[%s]" % old_dimension_keys)
        logger.debug("BEFORE - attribute_keys[%s]" % old_attribute_keys)

    new_codelists = {}
    new_concepts = {}
    new_dimension_keys = []
    new_attribute_keys = []

    for k, values in old_codelists.items():
        # entry present in the codelists collected from series
        if k in codelists:
            new_values = {}
            for v1 in codelists[k]:
                # codelist value present in the codelists from the dataset
                if v1 in values:
                    new_values[v1] = values[v1]
            new_codelists[k] = new_values
            new_concepts[k] = old_concepts.get(k)

            if k in old_dimension_keys:
                # unordered dimension_keys
                new_dimension_keys.append(k)
            elif k in old_attribute_keys:
                # unordered attribute_keys
                new_attribute_keys.append(k)

    # restore the original ordering of dimension_keys
    dimension_keys = [k for k in old_dimension_keys if k in new_dimension_keys]

    # restore the original ordering of attribute_keys
    attribute_keys = [k for k in old_attribute_keys if k in new_attribute_keys]

    if logger.isEnabledFor(logging.DEBUG):
        for k, v in new_codelists.items():
            logger.debug("AFTER - codelist[%s]: %s" % (k, len(v)))
        logger.debug("AFTER - concepts[%s]" % list(new_concepts.keys()))
        logger.debug("AFTER - dimension_keys[%s]" % dimension_keys)
        logger.debug("AFTER - attribute_keys[%s]" % attribute_keys)

    # verify change in codelists
    #is_modify = hash_dict(new_codelists) == hash_dict(old_codelists)
    is_modify = new_codelists != old_codelists

    # verify change in concepts
    #if not is_modify and hash_dict(new_concepts) != hash_dict(old_concepts):
    if is_modify is False and new_concepts != old_concepts:
        is_modify = True

    if is_modify is False:
        if execute:
            return None
        else:
            return None, None

    query = {"_id": dataset["_id"]}
    query_modify = {
        "$set": {
            "codelists": new_codelists or None,
            "concepts": new_concepts or None,
            "dimension_keys": dimension_keys or None,
            "attribute_keys": attribute_keys or None,
        }
    }

    if execute:
        return db[constants.COL_DATASETS].update_one(query,
                                                     query_modify).modified_count
    else:
        return query, query_modify
def root_categories(cls, provider_name, db=None):
    db = db or get_mongo_db()
    query = {"provider_name": provider_name, "parent": None}
    cursor = db[constants.COL_CATEGORIES].find(query)
    return cursor.sort([("position", 1), ("category_code", 1)])
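# Usage sketch (assumption): the classmethods above live on a categories
# helper class, assumed here to be named "Categories"; "INSEE" is an
# illustrative provider name.
def _example_root_categories():
    for doc in Categories.root_categories("INSEE"):
        print(doc["category_code"], doc.get("position"))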