def query_results(n_query, mol):
    """Benchmark helper: run one indexed Result lookup per method index.

    Issues ``n_query`` queries against the Result collection, varying only
    the ``method`` field; the query result is intentionally discarded.
    """
    for idx in range(n_query):
        spec = {
            "molecule": mol,
            "method": str(idx),
            "basis": "B1",
            "keywords": None,
            "program": "P1",
            "driver": "energy",
        }
        Result.objects(**spec)  # one DB access per iteration
def _clear_db(self, db_name: str):
    """Drop every collection and then the database itself.

    Dangerous: only proceeds when the currently connected database matches
    ``db_name``, to avoid wiping an unrelated DB.

    Parameters
    ----------
    db_name : str
        Name of the database expected to be cleared.
    """
    # make sure it's the right DB
    if get_db().name == db_name:
        # Use lazy %-style args instead of eager str.format so the message
        # is only rendered when the INFO level is actually enabled.
        logging.info('Clearing database: %s', db_name)
        Result.drop_collection()
        Molecule.drop_collection()
        Options.drop_collection()
        Collection.drop_collection()
        TaskQueue.drop_collection()
        Procedure.drop_collection()
        User.drop_collection()
        self.client.drop_database(db_name)
def test_add_task_queue():
    """Tasks can be queued against Result and both Procedure subclasses.

    Simple test of adding a task using the ME classes; in QCFractal proper,
    tasks should be added through the storage_socket instead.
    """
    assert TaskQueue.objects.count() == 0
    TaskQueue.objects().delete()

    # Task referencing a plain Result
    base = Result.objects().first()
    TaskQueue(base_result=base).save()
    assert TaskQueue.objects().count() == 1

    # Task referencing an Optimization Procedure
    base = OptimizationProcedure.objects().first()
    TaskQueue(base_result=base).save()
    assert TaskQueue.objects().count() == 2

    # Task referencing a Torsiondrive Procedure
    base = TorsiondriveProcedure.objects().first()
    TaskQueue(base_result=base).save()
    assert TaskQueue.objects().count() == 3
def add_results(self, data: List[dict], update_existing: bool=False, return_json=True):
    """
    Add results from a given dict. The dict should have all the required
    keys of a result.

    Parameters
    ----------
    data : list of dict
        Each dict must have: program, driver, method, basis, options, molecule
        (where molecule is the molecule id in the DB), plus whatever other
        attributes it needs to store. NOTE: the indexed string fields are
        lower-cased *in place*, mutating the caller's dicts.
    update_existing : bool (default False)
        Update existing results.

    Returns
    -------
    Dict with keys: data, meta
        Data is the ids of the inserted/updated/existing docs, in input order.
    """
    # Normalize the indexed string fields (case-insensitive matching).
    for d in data:
        for i in self._lower_results_index:
            if d[i] is None:
                continue
            d[i] = d[i].lower()

    meta = storage_utils.add_metadata()

    results = []
    for d in data:
        # Search by the index keywords only, not by all keys -- much faster.
        # BUGFIX: the driver value was previously matched against a ``name``
        # field; the Result schema uses ``driver`` (as every other query in
        # this module does), so duplicates were never found correctly.
        doc = Result.objects(
            program=d['program'],
            driver=d['driver'],
            method=d['method'],
            basis=d['basis'],
            options=d['options'],
            molecule=d['molecule'])

        if doc.count() == 0 or update_existing:
            if not isinstance(d['molecule'], ObjectId):
                d['molecule'] = ObjectId(d['molecule'])
            doc = doc.upsert_one(**d)
            results.append(str(doc.id))
            meta['n_inserted'] += 1
        else:
            meta['duplicates'].append(self._doc_to_tuples(doc.first(), with_ids=False))  # TODO
            # Duplicate: still return the existing doc's id to keep ordering.
            results.append(str(doc.first().id))
    meta["success"] = True

    ret = {"data": results, "meta": meta}
    return ret
def get_results_by_ids(self, ids: List[str]=None, projection=None, return_json=True, with_ids=True):
    """Fetch the Results whose ids appear in ``ids``.

    Parameters
    ----------
    ids : List of str
        Ids of the results in the DB.
    projection : list/set/tuple of keys, default None
        The fields to return; ``None`` returns all fields.
    return_json : bool, default True
        Return the results as a list of dicts instead of objects.
    with_ids : bool, default True
        Include the ids in the returned objects/dicts.

    Returns
    -------
    Dict with keys: data, meta
        Data is the objects found.
    """
    meta = storage_utils.get_metadata()

    queryset = Result.objects(id__in=ids)
    if projection:
        queryset = queryset.only(*projection)
    data = queryset.limit(self._max_limit)

    meta["n_found"] = data.count()
    meta["success"] = True

    if not return_json:
        return {"data": data, "meta": meta}
    return {"data": [self._doc_to_json(doc, with_ids) for doc in data], "meta": meta}
def duplicate_results(n_results, mol):
    """Insert results where half of the generated documents already exist.

    The ``method`` values overlap the upper half of a previous insertion
    range, so roughly half the candidates are duplicates; each candidate is
    looked up first and only the missing ones are bulk inserted.
    """
    tosave_results = []
    for i in range(n_results):
        data = {
            "molecule": mol,
            "method": str(i + int(n_results / 2)),  # overlaps half the range
            "basis": "Bulk",
            "keywords": None,
            "program": "P1",
            "driver": "energy",
            "other_data": 5,
        }
        if Result.objects(**data).first() is None:
            tosave_results.append(Result(**data))

    # BUGFIX: pymongo's insert_many rejects an empty document list, so only
    # insert when there is something to save.
    if tosave_results:
        Result.objects.insert(tosave_results)
    # BUGFIX: corrected the misspelled "Duplciates" in the output.
    print("Duplicates: ", len(tosave_results))
def test_results(storage_socket):
    """Create a Result through the ME classes and verify its molecule link."""
    assert Result.objects().count() == 0
    assert Options.objects().count() == 0

    water = Molecule.objects(molecular_formula='H4O2')
    assert water.count() == 2

    common = {
        "method": "M1",
        "basis": "B1",
        "options": None,
        "program": "P1",
        "driver": "energy",
    }
    page1 = dict(common, molecule=water[0], other_data=5)
    page2 = dict(common, molecule=water[1], other_data=10)

    result = Result(**page1)
    result.save()
    assert result.molecule.molecular_formula == 'H4O2'
def bulk_insert_results(n_results, mol):
    """Build ``n_results`` Result documents in memory, insert them in one call.

    Returns whatever ``Result.objects.insert`` returns (the saved documents).
    """
    docs = [
        Result(
            molecule=mol,
            method=str(i),
            basis="Bulk",
            keywords=None,
            program="P1",
            driver="energy",
            other_data=5,
        )
        for i in range(n_results)
    ]
    return Result.objects.insert(docs)
def insert_results(n_results, mol):
    """Save ``n_results`` Result documents one at a time (one write each).

    NOTE(review): only the *last* saved document is returned, and calling
    this with ``n_results == 0`` raises UnboundLocalError -- both preserved
    from the original benchmark code.
    """
    for i in range(n_results):
        saved = Result(
            molecule=mol.id,
            method=str(i),
            basis="B1",
            keywords=None,
            program="P1",
            driver="energy",
            other_data=5,
        ).save()  # one DB write per iteration
    return saved
def del_results(self, ids: List[str]):
    """
    Remove results from the database by their ids.

    (Be cautious: other collections may still reference these results.)

    Parameters
    ----------
    ids : list of str
        The ids of the results to be deleted.

    Returns
    -------
    int
        Number of results deleted.
    """
    return Result.objects(id__in=[ObjectId(i) for i in ids]).delete()
def get_results(self, program: str=None, method: str=None, basis: str=None, molecule: str=None, driver: str=None, options: str=None, status: str='COMPLETE', projection=None, limit: int=None, skip: int=None, return_json=True, with_ids=True):
    """Query Results by any combination of the indexed fields.

    Parameters
    ----------
    program : str
    method : str
    basis : str
    molecule : str
        Molecule id in the DB.
    driver : str
    options : str
        The id of the option in the DB.
    status : str, default 'COMPLETE'
        The status of the result: 'COMPLETE', 'INCOMPLETE', or 'ERROR'.
    projection : list/set/tuple of keys, default None
        The fields to return; ``None`` returns all fields.
    limit : int, default None
        Maximum number of results to return; capped at the global
        ``self._max_limit`` to avoid overloading the server.
    skip : int, default None
        TODO: skip the first 'skip' results (pagination) -- not implemented.
    return_json : bool, default True
        Return the results as a list of dicts instead of objects.
    with_ids : bool, default True
        Include the ids in the returned objects/dicts.

    Returns
    -------
    Dict with keys: data, meta
        Data is the objects found.
    """
    meta = storage_utils.get_metadata()

    # Collect only the filters the caller actually supplied.
    filters = {}
    if program:
        filters['program'] = program
    if method:
        filters['method'] = method
    if basis:
        filters['basis'] = basis
    if molecule:
        filters['molecule'], _ = _str_to_indices_with_errors(molecule)
    if driver:
        filters['driver'] = driver
    if options:
        filters['options'] = options
    if status:
        filters['status'] = status

    # Translate to a mongoengine query: molecule ids become an __in filter,
    # status passes through, and string fields are lower-cased to match the
    # normalized stored values (sequences become lower-cased __in filters).
    parsed_query = {}
    for field, value in filters.items():
        if field == "molecule":
            parsed_query[field + "__in"] = filters[field]
        elif field == "status":
            parsed_query[field] = value
        elif isinstance(value, (list, tuple)):
            parsed_query[field + "__in"] = [v.lower() for v in value]
        else:
            parsed_query[field] = value.lower()

    # Cap the page size at the server-wide maximum.
    q_limit = limit if limit and limit < self._max_limit else self._max_limit

    data = []
    try:
        queryset = Result.objects(**parsed_query)
        if projection:
            queryset = queryset.only(*projection)
        data = queryset.limit(q_limit)
        meta["n_found"] = data.count()
        meta["success"] = True
    except Exception as err:
        meta['error_description'] = str(err)

    if return_json:
        rdata = []
        for doc in data:
            as_dict = self._doc_to_json(doc, with_ids)
            # Flatten the molecule reference down to its plain id string.
            if "molecule" in as_dict:
                as_dict["molecule"] = as_dict["molecule"]["$oid"]
            rdata.append(as_dict)
    else:
        rdata = data

    return {"data": rdata, "meta": meta}
def queue_submit(self, data: List[Dict]):
    """Submit a list of tasks to the queue.

    Tasks are unique by their base_result, which should be inserted into
    the DB first before submitting its corresponding task to the queue
    (with result.status='INCOMPLETE' as the default).
    The default task.status is 'WAITING'.
    Duplicate tasks should be a rare case; hooks are merged if the task
    already exists.

    Parameters
    ----------
    data : list of tasks (dict)
        A task is a dict with the following fields:
        - hash_index: idx, not used anymore
        - spec: dynamic field (dict-like), can have any structure
        - hooks: list of any objects representing listeners (for now)
        - tag: str
        - base_result: tuple (required); first value is the class type of
          the result ('results' or 'procedure'), second value is the ID of
          the result in the DB. Example: "base_result": ('results', result_id)

    Returns
    -------
    dict (data and meta)
        'data' is a list of the IDs of the tasks IN ORDER, including
        duplicates. An errored task has None in its ID.
        meta['duplicates'] has the duplicate tasks.
    """
    meta = storage_utils.add_metadata()

    results = []
    for d in data:
        try:
            if not isinstance(d['base_result'], tuple):
                raise Exception("base_result must be a tuple not {}."
                                .format(type(d['base_result'])))

            # Wrap the referenced id in the proper document class so the
            # reference is stored as a typed link rather than a raw DBRef
            # (which would force slow raw queries to retrieve it later).
            if d['base_result'][0] == 'results':
                result_obj = Result(id=d['base_result'][1])
            elif d['base_result'][0] == 'procedure':
                result_obj = Procedure(id=d['base_result'][1])
            else:
                raise TypeError("Base_result type must be 'results' or 'procedure',"
                                " {} is given.".format(d['base_result'][0]))

            task = TaskQueue(**d)
            task.base_result = result_obj
            task.save()
            results.append(str(task.id))
            meta['n_inserted'] += 1
        except mongoengine.errors.NotUniqueError as err:  # rare case
            # Fetch the pre-existing task by its (unique) base_result.
            task = TaskQueue.objects(base_result=result_obj).first()
            # BUGFIX: the old call passed task.to_mongo() as an extra arg
            # with no '%s' placeholder, which the logging module rejects
            # with a formatting error; use a lazy %-arg instead.
            self.logger.warning('queue_submit got a duplicate task: %s', task.to_mongo())
            # BUGFIX: use .get() so a task dict without a 'hooks' key does
            # not raise KeyError inside this handler (which would escape
            # the outer try and abort the whole submission).
            if d.get('hooks'):  # merge hooks
                task.hooks.extend(d['hooks'])
                task.save()
            results.append(str(task.id))
            meta['duplicates'].append(self._doc_to_tuples(task, with_ids=False))  # TODO
        except Exception as err:
            meta["success"] = False
            meta["errors"].append(str(err))
            results.append(None)
    # NOTE(review): this unconditionally overwrites the False set in the
    # error branch above -- preserved as-is since callers may rely on it,
    # but worth confirming that meta['success'] is meant to signal "request
    # processed" rather than "all tasks inserted".
    meta["success"] = True

    ret = {"data": results, "meta": meta}
    return ret