def delete_dataset(database, collection):
    retrieve_util = MongoRetrieveUtil(database, collection)

    db_info = retrieve_util.get_mongo_db(database)
    if not db_info.success:
        return err_resp(db_info.err_msg)

    db = db_info.result_obj
    db[collection].drop()
def run_test():
    print('step 1')
    mr = MongoRetrieveUtil('test-it')
    if mr.has_error():
        # print(mr.get_error_message())
        return

    db_info = mr.get_mongo_db('ok-db')
    print('db_info.success', db_info.success)

    print('step 2')
    client_info = mr.get_mongo_client()
    if client_info.success:
        client = client_info.result_obj
        for x in client.list_databases():
            print(x)

    print('step 3')
    return

    print('get_mongo_url:', mr.get_mongo_url())

    db_info = mr.get_mongo_db('hello')
    print('success?', db_info.success)

    print(mr.list_databases())
def get_data(database, collection, method, query, distinct=None, host=None):
    """Return data from a Mongo query"""
    if method == 'distinct' and not distinct:
        return err_resp("the distinct method requires a 'keys' argument")

    retrieve_util = MongoRetrieveUtil(database, collection, host)

    success, data = retrieve_util.run_query(query, method, distinct)

    return ok_resp(data) if success else err_resp(data)
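# Illustrative sketch of querying through MongoRetrieveUtil directly, mirroring
# how get_data() above unpacks run_query() into a (success, payload) pair.
# The database name, collection name, and pipeline below are hypothetical.
def _example_run_query():
    """Sketch only: count records per year in a hypothetical collection"""
    pipeline = [
        {'$match': {'year': {'$gte': 2015}}},
        {'$group': {'_id': '$year', 'count': {'$sum': 1}}},
    ]
    util = MongoRetrieveUtil('eventdata', 'acled_africa')
    if util.has_error():
        return util.get_error_message()

    success, data = util.run_query(pipeline, 'aggregate')
    # on success, data is an iterable of documents (e.g. a cursor)
    return list(data) if success else data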
def api_mongo_healthcheck(request):
    """Mongo healthcheck"""
    mongo_check = MongoRetrieveUtil.run_tworavens_healthcheck()

    if mongo_check.success:
        return JsonResponse(get_json_success(
            'Mongo is running',
            data=mongo_check.result_obj))

    return JsonResponse(get_json_error(mongo_check.err_msg))
def upload_query_result(event_obj):
    """upload query result to dataverse"""
    collection_name = event_obj.as_dict()['collection_name']
    query_obj = event_obj.as_dict()['query']
    query_id = event_obj.as_dict()['id']
    filename = '%s_%s.txt' % (str(query_id), str(collection_name))

    obj = MongoRetrieveUtil(settings.EVENTDATA_DB_NAME, collection_name)

    # run_query returns (success, payload); on failure the payload is the error message
    success, mongo_obj = obj.run_query(query_obj, 'aggregate')
    if not success:
        return err_resp(mongo_obj)

    json_dump = json.dumps(mongo_obj)
    temp_file_obj = TemporaryFileMaker(filename, json_dump)
    succ, res_obj = temp_file_obj.return_status()
    print("query result upload : ", res_obj)

    if succ:
        return ok_resp(res_obj)
    return err_resp(res_obj)
def util_results_importance_efd(data_pointer, metadata):
    LIMIT_UNIQUE_LEVELS = 20

    # make sure the base dataset is loaded
    EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                metadata['collectionName'],
                                data_path=metadata['collectionPath'])

    results_collection_name = metadata['collectionName'] + '_produce_' + \
        mongofy_collection_name(metadata['produceId'])

    util = MongoRetrieveUtil(
        settings.TWORAVENS_MONGO_DB_NAME,
        settings.MONGO_COLLECTION_PREFIX + metadata['collectionName'])
    if util.has_error():
        return {KEY_SUCCESS: False, KEY_DATA: util.get_error_message()}

    levels = {}

    # populate levels (for example, numeric column tagged as categorical)
    for variable in metadata['categoricals']:
        # levels are passed, but levels have lost type information
        # (json object keys are coerced to string)
        # if not levels[key]:
        response = util.run_query([
            *metadata['query'],
            {"$group": {"_id": f"${variable}", "count": {'$sum': 1}}},
            {'$sort': {'count': -1, '_id': 1}},
            {"$limit": LIMIT_UNIQUE_LEVELS}
        ], 'aggregate')

        if not response[0]:
            return {KEY_SUCCESS: False, KEY_DATA: response[1]}

        levels[variable] = [doc['_id'] for doc in response[1]]

        # limit the number of unique levels
        if len(levels[variable]) > LIMIT_UNIQUE_LEVELS:
            levels[variable] = levels[variable][:LIMIT_UNIQUE_LEVELS]

    # fitted versions of variables have same levels as their originals
    levels.update({'fitted ' + key: levels[key] for key in levels})
    # renamed variables have the same levels as their originals
    levels.update({'actual ' + key: levels[key] for key in levels})

    # print('metadata levels', levels)

    def is_categorical(variable, levels):
        return variable in levels

    def branch_target(variable, levels):
        if is_categorical(variable, levels):
            return {
                f'{variable}-{level}': {
                    "$avg": {"$cond": [{"$eq": [f"${variable}", level]}, 1, 0]}
                }
                for level in levels[variable]
            }

        # compute mean of fitted and actual
        return {
            f'fitted {variable}': {"$avg": f'$fitted {variable}'},
            f'actual {variable}': {"$avg": f'$actual {variable}'},
            'error': {
                '$sum': {
                    "$pow": [{
                        '$subtract': [f'$fitted {variable}',
                                      f'$actual {variable}']
                    }, 2]
                }
            }
        }

    def aggregate_targets(variables, levels):
        return {
            k: v
            for d in [branch_target(target, levels) for target in variables]
            for k, v in d.items()
        }

    def branch_target_levels(variable, levels):
        if is_categorical(variable, levels):
            return {
                f'{variable}-{level}': {
                    "$avg": {"$cond": [{"$eq": [f"${variable}", level]}, 1, 0]}
                }
                for level in levels[variable]
            }
        return {variable: {"$avg": f'${variable}'}}

    def aggregate_target_levels(variables, levels):
        return {
            k: v
            for d in [
                *[branch_target_levels('fitted ' + target, levels)
                  for target in variables],
                *[branch_target_levels('actual ' + target, levels)
                  for target in variables]
            ]
            for k, v in d.items()
        }

    target_aggregator = aggregate_target_levels(metadata['targets'], levels)

    query = [
        *metadata['query'],
        {
            "$lookup": {
                "from": settings.MONGO_COLLECTION_PREFIX + results_collection_name,
                "localField": "d3mIndex",
                "foreignField": "d3mIndex",
                "as": "results_collection"
            }
        },
        {"$unwind": "$results_collection"},
        {
            "$project": {
                **{'fitted ' + name: f"$results_collection\\.{name}"
                   for name in metadata['targets']},
                **{'actual ' + name: f"${name}"
                   for name in metadata['targets']},
                **{f"predictor {predictor}": f"${predictor}"
                   for predictor in metadata['predictors']},
                **{"_id": 0}
            }
        },
        {
            "$facet": {
                predictor: [
                    {
                        "$group": {
                            **{"_id": f'$predictor {predictor}',
                               'count': {"$sum": 1}},
                            **target_aggregator
                        }
                    },
                    {"$sort": {"count": -1, '_id': 1}},
                    {"$limit": 20},
                    {
                        "$project": {
                            **{"predictor": "$_id"},
                            **{k: 1 for k in target_aggregator.keys()},
                            **{"_id": 0}
                        }
                    }
                ] if is_categorical(predictor, levels) else [
                    {
                        "$bucketAuto": {
                            "groupBy": f'$predictor {predictor}',
                            "buckets": 100,
                            "output": target_aggregator
                        }
                    },
                    {
                        "$project": {
                            **{"predictor": {"$avg": ["$_id\\.min", "$_id\\.max"]}},
                            **{k: 1 for k in target_aggregator.keys()},
                            **{"_id": 0}
                        }
                    }
                ]
                for predictor in metadata['predictors']
            }
        },
    ]

    try:
        status = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                             results_collection_name,
                                             data_pointer)
        if not status.success:
            return {KEY_SUCCESS: False, KEY_DATA: status.err_msg}

        response = list(util.run_query(query, method='aggregate'))

        if not response[0]:
            return {KEY_SUCCESS: response[0], KEY_DATA: response[1]}

        # exhaust cursor before dropping dataset
        data = next(response[1])
    finally:
        pass
        # EventJobUtil.delete_dataset(
        #     settings.TWORAVENS_MONGO_DB_NAME,
        #     results_collection_name)

    def kernel_linear(size):
        return list(range(1, size // 2 + 2)) + list(range(size // 2, 0, -1))

    def kernel_uniform(size):
        return [1] * size

    print(data)

    def smooth(kernel, data, predictor):
        if len(kernel) % 2 != 1:
            raise ValueError('Kernel must be odd-length')

        # normalize kernel
        kernel = [i / sum(kernel) for i in kernel]

        # clip indices for data access on kernel offsets at edges
        def clip(x):
            return max(0, min(len(data) - 1, x))

        offset = len(kernel) // 2
        smoothed = []
        for i in range(len(data)):
            smoothed.append({
                **{
                    level: sum(weight * (data[clip(i + j_level - offset)][level] or 0)
                               for j_level, weight in enumerate(kernel))
                    for level in data[i].keys() if level != "predictor"
                },
                **{"predictor": data[i]["predictor"]}
            })
        return smoothed

    # pyperclip.copy(json.dumps({"query": query, "data": data}, indent=4))

    for predictor in metadata['predictors']:
        if not is_categorical(predictor, levels):
            data[predictor] = smooth(kernel_linear(size=7),
                                     data[predictor], predictor)

    return {KEY_SUCCESS: True, KEY_DATA: data}
def util_results_confusion_matrix(data_pointer, metadata):
    """Get the content from the file and format a JSON snippet
    that includes statistical summaries.
    """
    response = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                           metadata['collectionName'],
                                           metadata['collectionPath'])
    if not response.success:
        return {KEY_SUCCESS: False, KEY_DATA: response.err_msg}

    results_collection_name = metadata['collectionName'] + '_produce_' + \
        mongofy_collection_name(metadata['produceId'])

    util = MongoRetrieveUtil(
        settings.TWORAVENS_MONGO_DB_NAME,
        settings.MONGO_COLLECTION_PREFIX + metadata['collectionName'])
    if util.has_error():
        return {KEY_SUCCESS: False, KEY_DATA: util.get_error_message()}

    query = [
        *metadata['query'],
        # minor optimization, drop unneeded columns before performing lookup
        {
            "$project": {
                **{name: 1 for name in metadata['targets']},
                **{'d3mIndex': 1}
            }
        },
        {
            "$lookup": {
                "from": settings.MONGO_COLLECTION_PREFIX + results_collection_name,
                "localField": "d3mIndex",
                "foreignField": "d3mIndex",
                "as": "results_collection"
            }
        },
        {"$unwind": "$results_collection"},
        {
            "$project": {
                **{'Predicted_' + name: f"$results_collection\\.{name}"
                   for name in metadata['targets']},
                **{'Actual_' + name: f"${name}"
                   for name in metadata['targets']},
                **{"_id": 0}
            }
        },
        {
            '$facet': {
                target: [
                    {
                        "$group": {
                            '_id': {
                                'Actual': f'$Actual_{target}',
                                'Predicted': f'$Predicted_{target}'
                            },
                            'count': {'$sum': 1}
                        }
                    },
                    {
                        "$project": {
                            'Actual': '$_id\\.Actual',
                            'Predicted': '$_id\\.Predicted',
                            'count': 1,
                            '_id': 0
                        }
                    },
                    {"$sort": {'Actual': 1}}
                ]
                for target in metadata['targets']
            }
        }
    ]

    try:
        status = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                             results_collection_name,
                                             data_pointer,
                                             indexes=['d3mIndex'])
        if not status.success:
            return {KEY_SUCCESS: False, KEY_DATA: status.err_msg}

        response = list(util.run_query(query, method='aggregate'))
    finally:
        EventJobUtil.delete_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                    results_collection_name)

    if not response[0]:
        return {KEY_SUCCESS: response[0], KEY_DATA: response[1]}

    data = next(response[1])

    return {
        KEY_SUCCESS: response[0],
        KEY_DATA: {
            target: {
                'data': data[target],
                'classes': list(set(map(lambda x: x['Actual'], data[target])))
            }
            for target in data.keys()
        }
    }
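# Shape of KEY_DATA returned above (illustrative target and class names only;
# the $facet stage yields one confusion-matrix cell per (Actual, Predicted) pair):
#   {'species': {'data': [{'count': 41, 'Actual': 'cat', 'Predicted': 'cat'},
#                         {'count': 9,  'Actual': 'cat', 'Predicted': 'dog'},
#                         ...],
#                'classes': ['cat', 'dog']}}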
def util_results_real_clustered(data_pointer, metadata):
    GRID_SIZE = 100

    response = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                           metadata['collectionName'],
                                           metadata['collectionPath'])
    if not response.success:
        return {KEY_SUCCESS: False, KEY_DATA: response.err_msg}

    results_collection_name = metadata['collectionName'] + '_produce_' + \
        mongofy_collection_name(metadata['produceId'])

    mongo_util_base = MongoRetrieveUtil(
        settings.TWORAVENS_MONGO_DB_NAME,
        settings.MONGO_COLLECTION_PREFIX + metadata['collectionName'])
    if mongo_util_base.has_error():
        return {
            KEY_SUCCESS: False,
            KEY_DATA: mongo_util_base.get_error_message()
        }

    mongo_util_fitted = MongoRetrieveUtil(
        settings.TWORAVENS_MONGO_DB_NAME,
        settings.MONGO_COLLECTION_PREFIX + metadata['collectionName'])
    if mongo_util_fitted.has_error():
        return {
            KEY_SUCCESS: False,
            KEY_DATA: mongo_util_fitted.get_error_message()
        }

    def normalize(variable, minimum, maximum, scale=1):
        return {
            "$divide": [{"$subtract": [variable, minimum]},
                        (maximum - minimum) / scale]
        }

    try:
        status = EventJobUtil.import_dataset(settings.TWORAVENS_MONGO_DB_NAME,
                                             results_collection_name,
                                             data_pointer,
                                             indexes=['d3mIndex'])
        if not status.success:
            return {KEY_SUCCESS: False, KEY_DATA: status.err_msg}

        # COMPUTE ACTUAL BOUNDS
        bounds = {}
        response = list(
            mongo_util_base.run_query([
                *metadata['query'],
                {
                    "$match": {
                        target: {"$not": {"$type": 2}}
                        for target in metadata['targets']
                    }
                },
                {
                    "$group": {
                        "_id": 0,
                        **{f'min_{target}': {"$min": f"${target}"}
                           for target in metadata['targets']},
                        **{f'max_{target}': {"$max": f"${target}"}
                           for target in metadata['targets']}
                    }
                }
            ], method='aggregate'))

        if not response[0]:
            return {KEY_SUCCESS: response[0], KEY_DATA: response[1]}

        record = next(response[1])
        bounds['actual'] = {
            target: [record[f'min_{target}'], record[f'max_{target}']]
            for target in metadata['targets']
        }

        # COMPUTE FITTED BOUNDS
        response = list(
            mongo_util_fitted.run_query([
                {
                    "$match": {
                        target: {"$not": {"$type": 2}}
                        for target in metadata['targets']
                    }
                },
                {
                    "$group": {
                        "_id": 0,
                        **{f'min_{target}': {"$min": f"${target}"}
                           for target in metadata['targets']},
                        **{f'max_{target}': {"$max": f"${target}"}
                           for target in metadata['targets']}
                    }
                }
            ], method='aggregate'))

        if not response[0]:
            return {KEY_SUCCESS: response[0], KEY_DATA: response[1]}

        record = next(response[1])
        bounds['fitted'] = {
            target: [record[f'min_{target}'], record[f'max_{target}']]
            for target in metadata['targets']
        }

        # GRID CLUSTERING
        query = [
            *metadata['query'],
            {
                "$project": {
                    **{name: 1 for name in metadata['targets']},
                    **{'d3mIndex': 1}
                }
            },
            # ignore records with strings in the target variable
            {
                "$match": {
                    target: {"$not": {"$type": 2}}
                    for target in metadata['targets']
                }
            },
            {
                "$lookup": {
                    "from": settings.MONGO_COLLECTION_PREFIX + results_collection_name,
                    "localField": "d3mIndex",
                    "foreignField": "d3mIndex",
                    "as": "results_collection"
                }
            },
            {"$unwind": "$results_collection"},
            {
                "$project": {
                    **{'fitted_' + name: f"$results_collection\\.{name}"
                       for name in metadata['targets']},
                    **{'actual_' + name: f"${name}"
                       for name in metadata['targets']},
                    **{"_id": 0}
                }
            },
            {
                "$facet": {
                    target: [
                        {
                            "$group": {
                                "_id": {
                                    'x': {
                                        '$toInt': normalize(
                                            f'$fitted_{target}',
                                            *bounds['fitted'][target],
                                            GRID_SIZE)
                                    },
                                    'y': {
                                        '$toInt': normalize(
                                            f'$actual_{target}',
                                            *bounds['actual'][target],
                                            GRID_SIZE)
                                    }
                                },
                                'Fitted Values': {"$avg": f'$fitted_{target}'},
                                'Actual Values': {"$avg": f'$actual_{target}'},
                                'count': {'$sum': 1}
                            }
                        },
                        {'$project': {'_id': 0}}
                    ]
                    for target in metadata['targets']
                }
            }
        ]

        response = list(mongo_util_base.run_query(query, method='aggregate'))
    finally:
        pass
        # EventJobUtil.delete_dataset(
        #     settings.TWORAVENS_MONGO_DB_NAME,
        #     results_collection_name)

    return {
        KEY_SUCCESS: response[0],
        KEY_DATA: next(response[1]) if response[0] else response[1]
    }
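# Worked example of the normalize() helper used above: with actual bounds
# [0, 10] and GRID_SIZE = 100, normalize('$actual_y', 0, 10, 100) builds
#   {"$divide": [{"$subtract": ['$actual_y', 0]}, 0.1]}
# so a raw value of 7.3 lands in grid cell y = int(73); the $facet stage then
# groups fitted/actual pairs into at most GRID_SIZE x GRID_SIZE cells.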
def import_dataset(database, collection, data_path, reload=False, header=True,
                   columns=None, indexes=None, delimiter=None):
    """Key method to load a Datafile into Mongo as a new collection"""
    print('--> import_dataset --')
    retrieve_util = MongoRetrieveUtil(database, collection)

    db_info = retrieve_util.get_mongo_db(database)
    if not db_info.success:
        return err_resp(db_info.err_msg)

    db = db_info.result_obj
    collection_name = settings.MONGO_COLLECTION_PREFIX + collection

    # dataset already loaded in mongo
    if collection_name in db.list_collection_names():
        if reload:
            db[collection_name].drop()
            MongoDataset.objects.select_for_update().filter(
                name=collection_name).delete()
        else:
            print('--> import_dataset: data in database, no data in django,'
                  ' not reloading')
            # make sure database entry exists
            dataset_records = MongoDataset.objects.select_for_update().filter(
                name=collection_name)
            if dataset_records:
                dataset_record = dataset_records[0]
                dataset_record.loading = False
                dataset_record.save()
            else:
                MongoDataset.objects.create(name=collection_name, loading=False)
            return ok_resp({'collection': collection_name})
    else:
        # if data is not loaded, make sure record is not in database
        try:
            MongoDataset.objects.select_for_update().filter(
                name=collection_name).delete()
        except MongoDataset.DoesNotExist:
            pass
        # print('data not loaded, and no data in django')

    # create lockable record
    if not MongoDataset.objects.select_for_update().filter(
            name=collection_name):
        MongoDataset.objects.create(name=collection_name, loading=True)

    # lock on record
    dataset_record = MongoDataset.objects.select_for_update().get(
        name=collection_name)

    if not dataset_record.loading:
        return ok_resp({'collection': collection_name})

    # print(collection_name + ' does not yet exist. Importing.\n\n\n\n')

    if not data_path:
        return err_resp('The file_uri cannot be None or an empty string.')

    if not os.path.exists(data_path):
        return err_resp(collection + ' not found')

    # Convert the file uri to a path
    fpath, err_msg = format_file_uri_to_path(data_path)
    if err_msg:
        return err_resp(err_msg)

    # for mongoimport commands
    import_commands = []

    # -------------------------------------
    # ignore first line of input files
    # -------------------------------------
    if header:
        import_commands.append(f'tail -n +2')

    # -------------------------------------
    # standardize column metadata to dict
    # -------------------------------------
    if not columns:
        columns = DuplicateColumnRemover(data_path).updated_columns
    if isinstance(columns, list):
        columns = {col: None for col in columns}

    # -------------------------------------
    # standardize dict's tworavens types to mongo,
    # try to be flexible with alternative words
    # -------------------------------------
    def mongofy_type(value):
        return {
            bool: 'boolean', 'boolean': 'boolean',
            str: 'string', 'string': 'string',
            int: 'int32', 'int32': 'int32', 'int': 'int32',
            float: 'double', 'double': 'double', 'float': 'double',
            datetime.datetime: 'date', 'date': 'date'
        }.get(value, 'auto')

    columns = {col: mongofy_type(columns[col]) for col in columns}

    # -------------------------------------
    # Prepare field names and set delimiter
    # for Mongo import/insert
    # -------------------------------------
    def sanitize(column):
        return encode_variable(column).replace('"', '\\"')

    field_names = ','.join(f"{sanitize(col)}.{columns.get(col, 'auto')}()"
                           for col in columns)
    print('field_names', field_names)

    delimiter_type = 'csv'
    # note: splitext() keeps the leading dot, so compare against '.tsv'
    if os.path.splitext(data_path)[1] == '.tsv':
        delimiter_type = 'tsv'

    if delimiter in [None, ',']:
        pass
    elif delimiter == '\t':
        delimiter_type = 'tsv'
    else:
        import_commands.append(f'tr "{delimiter}" "\t" <')
        delimiter_type = 'tsv'

    delimiter = {'csv': ',', 'tsv': '\t'}[delimiter_type]

    # ------------------------------------------
    # TEMP skip this for k8s...
    # ---
    # Prepare and run the mongoimport command
    # ------------------------------------------
    # try:
    if False:  # try:
        import_commands.append(
            f'mongoimport'
            f' --db {database}'
            f' --collection {settings.MONGO_COLLECTION_PREFIX + collection}'
            f' --type {delimiter_type}'
            f' --ignoreBlanks'
            f' --columnsHaveTypes'
            f' --parseGrace autoCast'
            f' --drop'
            f' --numInsertionWorkers=4'
            f' --fields "{field_names}"')

        # the first command takes the data path,
        # which is piped through the other commands
        import_commands[0] = import_commands[0] + ' ' + data_path

        print('--> import_dataset: mongoimport command:')
        print('-->' + ' | '.join(import_commands))

        # pipe each command to the next
        print('--> start subprocess')
        process = subprocess.Popen(shlex.split(import_commands[0]),
                                   stdout=subprocess.PIPE)
        for command in import_commands[1:]:
            print('--> command (bracketed): [%s]' % command)
            process = subprocess.Popen(shlex.split(command),
                                       stdin=process.stdout,
                                       stdout=subprocess.PIPE)
        process.communicate()

        for column in columns.keys():
            db[collection_name].update({column: {'$exists': False}},
                                       {'$set': {column: None}},
                                       multi=True)
    else:  # except Exception as err:
        # slower, secondary import if first fails
        # print('--> mongo err: [%s]' % err)
        # print(traceback.format_exc())
        print('--> import_dataset: mongoimport failed.'
              ' Running row-by-row insertion instead.')

        db[collection_name].drop()

        with open(data_path, 'r') as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=delimiter)

            # discard header
            next(csv_reader)

            # use duplicate column name removal headers instead
            columns = [encode_variable(value) for value in columns]

            for observation in csv_reader:
                db[collection_name].insert_one({
                    col: infer_type(val)
                    for col, val in zip(columns, observation)
                })

    if indexes:
        for index in indexes:
            # print('creating index ', index, ' on ', collection_name)
            db[collection_name].create_index(index)

    dataset_record.loading = False
    dataset_record.save()

    return ok_resp({'collection': collection_name})
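# Illustrative call to import_dataset() above. The database name, collection
# name, and CSV path are hypothetical; the indexes argument mirrors the keyword
# defined in the signature, and the .success/.err_msg/.result_obj attributes
# follow the response objects used throughout this module.
def _example_import_dataset():
    resp = import_dataset('tworavens',
                          'my_learning_data',           # hypothetical collection
                          '/tmp/my_learning_data.csv',  # hypothetical path
                          reload=True,
                          indexes=['d3mIndex'])
    if not resp.success:
        print('import failed:', resp.err_msg)
        return None
    return resp.result_obj['collection']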
def check_mongo():
    """test"""
    # ['cline_phoenix_nyt', 'icews', 'cline_phoenix_swb', 'acled_asia',
    #  'cline_speed', 'acled_africa', 'acled_middle_east', 'cline_phoenix_fbis']
    mr = MongoRetrieveUtil('cline_phoenix_fbis', '*')
    if mr.has_error():
        print(mr.get_error_message())