def check_existing_data(self, case_obj, existing_case, institute_obj, update, keep_actions):
    """Check that a case to be loaded/reuploaded conforms to the stored case.

    Args:
        case_obj(dict): case dictionary to be loaded/reuploaded
        existing_case(dict): a case with same _id or same display_name and
            institute_id as case_obj, or None if there is no such case
        institute_obj(dict): institute dictionary
        update(bool): If existing case should be updated
        keep_actions(bool): If old evaluated variants should be kept when case is updated

    Returns:
        previous_evaluated_variants(list): variants evaluated in the previous case
            when keep_actions is True. None if the case is not already present
            in the database or keep_actions is False.

    Raises:
        IntegrityError: if the _id differs, if update is False, if the display
            names differ or if the individuals of the two cases differ.
    """
    if existing_case is None:
        return None

    if existing_case["_id"] != case_obj["_id"]:
        # This happens whenever institute and case display name coincide
        raise IntegrityError(
            f"A case with different _id ({existing_case['_id']} vs {case_obj['_id']}) and same display name ({case_obj['display_name']}) already exists for this institute."
        )

    # existing_case is known to be a non-empty document at this point
    if not update:
        raise IntegrityError("Case %s already exists in database" % case_obj["_id"])

    # Enforce same display name for updated case as existing case
    if case_obj["display_name"] != existing_case["display_name"]:
        raise IntegrityError("Updated case name doesn't match existing case name.")

    def _individual_keys(case):
        """Return the (ID, name, affected status) triplets of a case's individuals."""
        # A missing or None "individuals" entry is treated as "no individuals"
        # instead of crashing with a TypeError.
        return {
            (ind["individual_id"], ind["display_name"], ind["phenotype"])
            for ind in case.get("individuals") or []
        }

    # Check that individuals from updated case match individuals from existing
    # case in ID, name and affected status
    existing_case_inds = _individual_keys(existing_case)
    case_inds = _individual_keys(case_obj)
    if existing_case_inds != case_inds:
        raise IntegrityError(
            f"Updated case individuals ({case_inds}) don't match individuals from existing case ({existing_case_inds}). Please either delete old case or modify updated case individuals."
        )

    if keep_actions:
        # collect all variants with user actions for this case
        return list(self.evaluated_variants(case_obj["_id"], institute_obj["_id"]))
    return None
def load_case(adapter, case_obj, update=False):
    """Persist a case, either as a new document or as an update.

    A case that is already stored is only overwritten when 'update' is True.

    Args:
        adapter (MongoAdapter): connection to the database
        case_obj (dict): case object to persist to the database
        update(bool): If existing case should be updated

    Returns:
        case_obj(dict): The case object that was passed in

    Raises:
        IntegrityError: if the case exists and update is False
    """
    logger.info('Loading case {} into database'.format(case_obj['display_name']))

    already_stored = adapter.case(case_obj['_id'])

    # New case: plain insert
    if not already_stored:
        adapter.add_case(case_obj)
        return case_obj

    # Known case: refuse unless the caller explicitly asked for an update
    if not update:
        raise IntegrityError(
            "Case {0} already exists in database".format(case_obj['case_id']))

    adapter.update_case(case_obj)
    return case_obj
def load_delivery_report(adapter: MongoAdapter, report_path: str, case_id: str, update: bool = False):
    """Attach a delivery report to a stored case.

    An already registered report is only replaced when 'update' is True.

    Args:
        adapter (MongoAdapter): Connection to the database
        report_path (string): Path to delivery report
        case_id (string): Case identifier
        update (bool): If an existing report should be replaced

    Returns:
        updated_case(dict): whatever adapter.replace_case returns

    Raises:
        DataNotFoundError: if no case with case_id is found
        IntegrityError: if a report exists and update is False
    """
    case_obj = adapter.case(case_id=case_id)
    if case_obj is None:
        raise DataNotFoundError("no case found")

    report_missing = case_obj.get("delivery_report") is None
    if not (update or report_missing):
        raise IntegrityError("Existing report found, use update = True to " "overwrite")

    _update_report_path(case_obj, report_path, "delivery_report")

    LOG.info("Saving report for case {} in database".format(case_obj["_id"]))
    return adapter.replace_case(case_obj)
def load_report(adapter, case_id, report_path, update=False):
    """Set the delivery report path on a stored case.

    Args:
        adapter(scout.adapter.MongoAdapter)
        case_id(str)
        report_path(str)
        update(bool): allow overwriting an existing delivery report

    Returns:
        updated_case(dict): the case document after the update

    Raises:
        IntegrityError: if the case does not exist
        ValidationError: if a report is already set and update is False
    """
    case_obj = adapter.case(case_id)
    if not case_obj:
        raise IntegrityError("Case {0} does not exist".format(case_id))

    has_report = case_obj.get('delivery_report')
    if has_report and not update:
        raise ValidationError(
            "Delivery report already exists for case {}".format(case_id))

    LOG.info("Set delivery report to %s", report_path)

    case_filter = {'_id': case_id}
    set_report = {'$set': {'delivery_report': report_path}}
    return adapter.case_collection.find_one_and_update(
        case_filter,
        set_report,
        return_document=pymongo.ReturnDocument.AFTER)
def build_panel(panel_info, adapter):
    """Assemble a gene panel dictionary from parsed panel information.

    Args:
        panel_info(dict): A dictionary with panel information
        adapter (scout.adapter.MongoAdapter)

    Returns:
        panel_obj(dict): with the keys, in order,
            panel_name(str), institute(str), version(float), date,
            display_name(str, defaults to panel_name) and
            genes(list of objects returned by build_gene)

    Raises:
        KeyError: if name, institute, version or date is missing
        IntegrityError: if the institute is not found by the adapter
    """
    panel_name = panel_info.get('panel_name')
    if not panel_name:
        raise KeyError("Panel has to have a name")

    logger.info("Building panel with name: {0}".format(panel_name))

    try:
        institute_id = panel_info['institute']
    except KeyError:
        raise KeyError("Panel has to have a institute")

    if adapter.institute(institute_id) is None:
        raise IntegrityError("Institute %s could not be found" % institute_id)

    # Evaluated in the same order as the fields are required: version, then date
    version = float(panel_info['version'])

    try:
        date = panel_info['date']
    except KeyError:
        raise KeyError("Panel has to have a date")

    display_name = panel_info.get('display_name', panel_info['panel_name'])
    genes = [
        build_gene(gene_info, adapter)
        for gene_info in panel_info.get('genes', [])
    ]

    return {
        'panel_name': panel_name,
        'institute': institute_id,
        'version': version,
        'date': date,
        'display_name': display_name,
        'genes': genes,
    }
def _add_case(self, case_obj):
    """Insert a new case document.

    Args:
        case_obj(Case)

    Returns:
        The result of case_collection.insert_one

    Raises:
        IntegrityError: if a case with the same _id is already stored
    """
    case_id = case_obj["_id"]
    already_stored = self.case(case_id)
    if not already_stored:
        return self.case_collection.insert_one(case_obj)
    raise IntegrityError("Case %s already exists in database" % case_id)
def add_case(self, case_obj):
    """Insert a new case document, identified by its 'case_id'.

    Args:
        case_obj(Case)

    Returns:
        The result of case_collection.insert_one

    Raises:
        IntegrityError: if a case with the same case_id is already stored
    """
    case_id = case_obj['case_id']
    logger.info("Adding case %s to database" % case_id)

    already_stored = self.case(case_id)
    if not already_stored:
        return self.case_collection.insert_one(case_obj)
    raise IntegrityError("Case %s already exists in database" % case_id)
def load_exon_bulk(self, exon_objs):
    """Load a bulk of exon objects to the database

    Arguments:
        exon_objs(iterable(scout.models.hgnc_exon))

    Returns:
        result: the result of exon_collection.insert_many

    Raises:
        IntegrityError: if the insert fails with DuplicateKeyError or BulkWriteError
    """
    try:
        # Insert the exons that were passed in; the previous body referenced
        # an undefined name here and failed on every call.
        result = self.exon_collection.insert_many(exon_objs)
    except (DuplicateKeyError, BulkWriteError) as err:
        raise IntegrityError(err) from err
    return result
def load_transcript_bulk(self, transcript_objs):
    """Insert many transcript objects in one call.

    Arguments:
        transcript_objs(iterable(scout.models.hgnc_transcript))

    Returns:
        The result of transcript_collection.insert_many

    Raises:
        IntegrityError: wrapping a DuplicateKeyError or BulkWriteError
    """
    LOG.info("Loading transcript bulk")
    try:
        return self.transcript_collection.insert_many(transcript_objs)
    except (DuplicateKeyError, BulkWriteError) as err:
        raise IntegrityError(err)
def update_panel(
    adapter,
    panel_name,
    panel_version,
    new_version=None,
    new_date=None,
    new_maintainer=None,
):
    """Update a gene panel and the copies of it embedded in cases.

    The panel document itself is updated through the adapter. When version
    or date changed, the embedded panel entries of all cases that contain a
    panel with this name are updated as well.

    Args:
        adapter(scout.adapter.MongoAdapter)
        panel_name(str): Unique name for a gene panel
        panel_version(float)
        new_version(float)
        new_date(datetime.datetime)
        new_maintainer(list(user_id))

    Returns:
        updated_panel(scout.models.GenePanel): The updated gene panel object

    Raises:
        IntegrityError: if the panel/version is not found
    """
    stored_panel = adapter.gene_panel(panel_name, panel_version)
    if not stored_panel:
        raise IntegrityError("Panel %s version %s does not exist" % (panel_name, panel_version))

    updated_panel = adapter.update_panel(stored_panel, new_version, new_date, new_maintainer)
    # Not used below; the lookup is retained so a result lacking "_id"
    # fails exactly as before.
    panel_id = updated_panel["_id"]

    # Fields of the embedded panels that have to follow the panel document;
    # the maintainer is not stored on the embedded versions.
    embedded_changes = {}
    if new_version:
        embedded_changes["panels.$.version"] = updated_panel["version"]
    if new_date:
        embedded_changes["panels.$.updated_at"] = updated_panel["date"]

    if not embedded_changes:
        return updated_panel

    update = {"$set": embedded_changes}
    LOG.info("Updating affected cases with {0}".format(update))
    affected_cases = {"panels": {"$elemMatch": {"panel_name": panel_name}}}
    adapter.case_collection.update_many(affected_cases, update)

    return updated_panel
def load_variant(self, variant_obj):
    """Load a variant object

    Args:
        variant_obj(dict)

    Returns:
        result: the result of variant_collection.insert_one (the inserted id
            is available on it; the result object itself is returned)

    Raises:
        IntegrityError: if a variant with the same _id already exists
    """
    try:
        result = self.variant_collection.insert_one(variant_obj)
    except DuplicateKeyError as err:
        # Exceptions do not interpolate logging-style arguments, so the
        # message is formatted explicitly.
        raise IntegrityError(
            "Variant %s already exists in database" % variant_obj["_id"]
        ) from err
    return result
def load_disease_term(self, disease_obj):
    """Load a disease term into the database

    Args:
        disease_obj(dict)

    Raises:
        IntegrityError: if a disease term with the same _id already exists
    """
    log.debug("Loading disease term %s into database", disease_obj['_id'])
    try:
        self.disease_term_collection.insert_one(disease_obj)
    except DuplicateKeyError as err:
        # The message previously mixed a %s placeholder with str.format,
        # which left the placeholder unfilled.
        raise IntegrityError(
            "Disease term %s already exists in database" % disease_obj['_id']
        ) from err
    log.debug("Disease term saved")
def load_hpo_term(self, hpo_obj):
    """Add a hpo object

    Arguments:
        hpo_obj(dict)

    Raises:
        IntegrityError: if a hpo term with the same _id already exists
    """
    log.debug("Loading hpo term %s into database", hpo_obj['_id'])
    try:
        self.hpo_term_collection.insert_one(hpo_obj)
    except DuplicateKeyError as err:
        # The message previously mixed a %s placeholder with str.format,
        # which left the placeholder unfilled.
        raise IntegrityError(
            "Hpo term %s already exists in database" % hpo_obj['_id']
        ) from err
    log.debug("Hpo term saved")
def add_gene_panel(self, panel_obj):
    """Insert a gene panel unless the same name/version is already stored.

    Args:
        panel_obj(dict)

    Raises:
        IntegrityError: if a panel with this name and version exists
    """
    panel_name, panel_version = panel_obj['panel_name'], panel_obj['version']

    duplicate = self.gene_panel(panel_name, panel_version)
    if duplicate:
        message = ("Panel {0} with version {1} already"
                   " exist in database".format(panel_name, panel_version))
        raise IntegrityError(message)

    LOG.info("loading panel {0}, version {1} to database".format(
        panel_name, panel_version))
    self.panel_collection.insert_one(panel_obj)
    LOG.debug("Panel saved")
def load_hpo_bulk(self, hpo_bulk):
    """Insert many hpo terms in one call.

    Arguments:
        hpo_bulk(list(scout.models.HpoTerm))

    Returns:
        result: the result of hpo_term_collection.insert_many

    Raises:
        IntegrityError: wrapping a DuplicateKeyError or BulkWriteError
    """
    LOG.debug("Loading hpo bulk")
    try:
        return self.hpo_term_collection.insert_many(hpo_bulk)
    except (DuplicateKeyError, BulkWriteError) as err:
        raise IntegrityError(err)
def load_managed_variant(self, managed_variant_obj):
    """Load a managed variant object

    Args:
        managed_variant_obj(ManagedVariant)

    Returns:
        inserted_id: the id reported by managed_variant_collection.insert_one

    Raises:
        IntegrityError: if the insert fails with DuplicateKeyError
    """
    try:
        result = self.managed_variant_collection.insert_one(managed_variant_obj)
    except DuplicateKeyError as err:
        # Exceptions do not interpolate logging-style arguments, so the
        # message is formatted explicitly.
        raise IntegrityError(
            "Variant %s already exists in database" % managed_variant_obj["display_id"]
        ) from err
    return result.inserted_id
def load_hgnc_bulk(self, gene_objs):
    """Insert many hgnc gene objects in one call.

    Args:
        gene_objs(iterable(scout.models.hgnc_gene)): must support len(),
            since the number of genes is logged

    Returns:
        result (pymongo.results.InsertManyResult)

    Raises:
        IntegrityError: wrapping a DuplicateKeyError or BulkWriteError
    """
    nr_genes = len(gene_objs)
    LOG.info("Loading gene bulk with length %s", nr_genes)
    try:
        return self.hgnc_collection.insert_many(gene_objs)
    except (DuplicateKeyError, BulkWriteError) as err:
        raise IntegrityError(err)
def update_panel(adapter, panel_name, panel_version, new_version=None, new_date=None):
    """Update a gene panel in the database

    We need to update the actual gene panel and then all cases that refers to
    the panel. Cases are only touched when version or date actually changed.

    Args:
        adapter(scout.adapter.MongoAdapter)
        panel_name(str): Unique name for a gene panel
        panel_version(float)
        new_version(float)
        new_date(datetime.datetime)

    Returns:
        updated_panel(scout.models.GenePanel): The updated gene panel object

    Raises:
        IntegrityError: if the panel/version is not found
    """
    panel_obj = adapter.gene_panel(panel_name, panel_version)
    if not panel_obj:
        raise IntegrityError("Panel %s version %s does not exist" % (panel_name, panel_version))

    updated_panel = adapter.update_panel(panel_obj, new_version, new_date)

    # We need to alter the embedded panels in all affected cases
    embedded_fields = {}
    if new_version:
        embedded_fields['panels.$.version'] = updated_panel['version']
    if new_date:
        embedded_fields['panels.$.updated_at'] = updated_panel['date']

    # An update with an empty '$set' is at best a wasted round trip and is
    # rejected by some MongoDB server versions, so skip it entirely.
    if embedded_fields:
        update = {'$set': embedded_fields}
        LOG.info('Updating affected cases with {0}'.format(update))
        query = {'panels': {'$elemMatch': {'panel_name': panel_name}}}
        adapter.case_collection.update_many(query, update)

    return updated_panel
def add_gene_panel(self, panel_obj):
    """Insert a gene panel unless the same name/version is already stored.

    Args:
        panel_obj(dict)

    Returns:
        inserted_id: the id reported by panel_collection.insert_one

    Raises:
        IntegrityError: if a panel with this name and version exists
    """
    panel_name, panel_version = panel_obj["panel_name"], panel_obj["version"]

    duplicate = self.gene_panel(panel_name, panel_version)
    if duplicate:
        message = ("Panel {0} with version {1} already"
                   " exist in database".format(panel_name, panel_version))
        raise IntegrityError(message)

    # The display name is only used for logging and falls back to the panel name
    display_name = panel_obj.get("display_name", panel_name)
    nr_genes = len(panel_obj.get("genes", []))
    LOG.info("loading panel %s, version %s to database", display_name, panel_version)
    LOG.info("Nr genes in panel: %s", nr_genes)

    insert_result = self.panel_collection.insert_one(panel_obj)
    LOG.debug("Panel saved")
    return insert_result.inserted_id
def add_user(self, user_obj):
    """Insert a user document, using the email as _id when none is given.

    Args:
        user_obj(scout.models.User): A dictionary with user information.
            It is modified in place when '_id' is missing.

    Returns:
        user_info(dict): the same dictionary that was passed in and inserted

    Raises:
        IntegrityError: if the insert fails with DuplicateKeyError
    """
    email = user_obj['email']
    LOG.info("Adding user %s to the database", email)

    user_obj.setdefault('_id', email)

    try:
        self.user_collection.insert_one(user_obj)
    except DuplicateKeyError as err:
        raise IntegrityError("User {} already exists in database".format(user_obj['email']))
    else:
        LOG.debug("User inserted")

    return user_obj
def add_user(self, user_info):
    """Insert a user document, stamping it with a creation time.

    The email is used as _id when none is given.

    Args:
        user_info(dict): A dictionary with user information. It is modified
            in place ('_id' when missing, and 'created_at').

    Returns:
        user_info(dict): the same dictionary that was passed in and inserted

    Raises:
        IntegrityError: if the insert fails with DuplicateKeyError
    """
    email = user_info['email']
    log.info("Adding user %s to the database", email)

    user_info.setdefault('_id', email)
    user_info['created_at'] = datetime.datetime.now()

    try:
        self.user_collection.insert_one(user_info)
    except DuplicateKeyError as err:
        raise IntegrityError("User {} already exists in database".format(user_info['email']))
    else:
        log.debug("User inserted")

    return user_info
def add_institute(self, institute_obj):
    """Insert an institute unless its internal id is already stored.

    Args:
        institute_obj(Institute)

    Raises:
        IntegrityError: if an institute with the same internal_id exists
    """
    internal_id = institute_obj["internal_id"]
    display_name = institute_obj["display_name"]

    already_stored = self.institute(institute_id=internal_id)
    if already_stored:
        message = "Institute {0} already exists in database".format(display_name)
        raise IntegrityError(message)

    LOG.info("Adding institute with internal_id: {0} and "
             "display_name: {1}".format(internal_id, display_name))

    # TODO: check that the insert was acknowledged; the result is not inspected
    self.institute_collection.insert_one(institute_obj)
    LOG.info("Institute saved")
def add_gene_panel(self, panel_obj, replace=False):
    """Insert a gene panel, or replace the stored one with the same version.

    Args:
        panel_obj(dict)
        replace(bool), if True, replace panel data in database

    Returns:
        The _id of the inserted or replaced panel document

    Raises:
        IntegrityError: if the panel/version exists and replace is False
    """
    panel_name, panel_version = panel_obj["panel_name"], panel_obj["version"]
    display_name = panel_obj.get("display_name", panel_name)
    nr_genes = len(panel_obj.get("genes", []))

    LOG.info("loading panel %s, version %s to database", display_name, panel_version)
    LOG.info("Nr genes in panel: %s", nr_genes)

    old_panel = self.gene_panel(panel_name, panel_version)

    # No panel with this version yet: create a new panel document
    if not old_panel:
        insert_result = self.panel_collection.insert_one(panel_obj)
        LOG.debug("Panel saved")
        return insert_result.inserted_id

    if replace is False:
        message = ("Panel {0} with version {1} already"
                   " exist in database".format(panel_name, panel_version))
        raise IntegrityError(message)

    # Same version of this panel exists, but should be replaced by new panel document
    LOG.warning(
        f"Panel {panel_name} v.{panel_version} already exists. Replacing it with new data"
    )
    new_panel = self.panel_collection.find_one_and_replace(
        old_panel,
        panel_obj,
        return_document=pymongo.ReturnDocument.AFTER)
    LOG.debug("Panel replaced")
    return new_panel["_id"]
def build_case(case_data, adapter):
    """Build a case object that is to be inserted to the database

    Args:
        case_data (dict): A dictionary with the relevant case information.
            'case_id' and 'owner' are required.
        adapter (scout.adapter.MongoAdapter): used to look up the owner
            institute and gene panels, and passed on to build_phenotype

    Returns:
        case_obj (dict): A case object. Keys that are always set:

            _id = str,                  # taken from case_data['case_id']
            display_name = str,         # defaults to the case id
            owner = str,                # institute id
            collaborators = list,       # institute ids, always includes the owner
            individuals = list,         # sorted by descending 'phenotype'
            created_at = datetime,
            updated_at = datetime,
            synopsis = str,             # defaults to ''
            status = str,               # always 'inactive'
            is_research = bool,         # always False
            research_requested = bool,  # always False
            rerun_requested = bool,     # always False
            panels = list,              # list of dictionaries with panel information
            dynamic_gene_list = list,   # always empty
            genome_build = str,         # defaults to '37'
            genome_version,             # as given, may be None
            madeline_info, chromograph_image_files, chromograph_prefixes,
            vcf_files = dict,           # defaults to {}
            delivery_report,
            has_svvariants = bool,
            has_strvariants = bool,
            is_migrated = bool,         # always False
            track = str,                # defaults to 'rare'

            Keys that are only set when the input provides a value:
            assignees (list holding the single 'assignee'), suspects,
            causatives, analysis_date, rank_model_version (str),
            sv_rank_model_version (str), rank_score_threshold (float),
            phenotype_terms, phenotype_groups, multiqc

    Raises:
        ConfigError: if case_data has no 'owner'
        IntegrityError: if the owner institute or a gene panel is not found
    """
    log.info("build case with id: {0}".format(case_data['case_id']))
    case_obj = {
        '_id': case_data['case_id'],
        'display_name': case_data.get('display_name', case_data['case_id']),
    }

    # Check if institute exists in database
    try:
        institute_id = case_data['owner']
    except KeyError as err:
        raise ConfigError("Case has to have a institute")
    institute_obj = adapter.institute(institute_id)
    if not institute_obj:
        raise IntegrityError("Institute %s not found in database" % institute_id)
    case_obj['owner'] = case_data['owner']

    # Owner always has to be part of collaborators. Going through a set removes
    # duplicates, so the order of the resulting list is not guaranteed.
    collaborators = set(case_data.get('collaborators', []))
    collaborators.add(case_data['owner'])
    case_obj['collaborators'] = list(collaborators)

    # A single 'assignee' in the input is stored as a one-element 'assignees' list
    if case_data.get('assignee'):
        case_obj['assignees'] = [case_data['assignee']]

    # Individuals
    ind_objs = []
    try:
        for individual in case_data.get('individuals', []):
            ind_objs.append(build_individual(individual))
    except Exception as error:
        ## TODO add some action here
        # NOTE(review): the handler only re-raises, so it currently has no effect
        raise error
    # sort the samples to put the affected individual first
    # (descending 'phenotype' value; presumably affected has the highest code - confirm)
    sorted_inds = sorted(ind_objs, key=lambda ind: -ind['phenotype'])
    case_obj['individuals'] = sorted_inds

    # Creation and update time start out identical
    now = datetime.now()
    case_obj['created_at'] = now
    case_obj['updated_at'] = now

    if case_data.get('suspects'):
        case_obj['suspects'] = case_data['suspects']
    if case_data.get('causatives'):
        case_obj['causatives'] = case_data['causatives']

    case_obj['synopsis'] = case_data.get('synopsis', '')

    # A freshly built case always starts with these fixed values
    case_obj['status'] = 'inactive'
    case_obj['is_research'] = False
    case_obj['research_requested'] = False
    case_obj['rerun_requested'] = False

    analysis_date = case_data.get('analysis_date')
    if analysis_date:
        case_obj['analysis_date'] = analysis_date

    # We store some metadata and references about gene panels in 'panels'
    case_panels = case_data.get('gene_panels', [])
    default_panels = case_data.get('default_panels', [])
    panels = []

    for panel_name in case_panels:
        # Looked up by name only, no version is passed to the adapter
        panel_obj = adapter.gene_panel(panel_name)
        if not panel_obj:
            raise IntegrityError("Panel %s does not exist in database" % panel_name)
        panel = {
            'panel_id': panel_obj['_id'],
            'panel_name': panel_obj['panel_name'],
            'display_name': panel_obj['display_name'],
            'version': panel_obj['version'],
            'updated_at': panel_obj['date'],
            'nr_genes': len(panel_obj['genes'])
        }
        if panel_name in default_panels:
            panel['is_default'] = True
        else:
            panel['is_default'] = False
        panels.append(panel)

    case_obj['panels'] = panels

    case_obj['dynamic_gene_list'] = []

    # Meta data
    genome_build = case_data.get('genome_build', '37')
    if not genome_build in ['37', '38']:
        pass
        ##TODO raise exception if invalid genome build was used
        # Until then an unexpected genome build is stored as given

    case_obj['genome_build'] = genome_build
    case_obj['genome_version'] = case_data.get('genome_version')

    if case_data.get('rank_model_version'):
        case_obj['rank_model_version'] = str(case_data['rank_model_version'])

    if case_data.get('sv_rank_model_version'):
        case_obj['sv_rank_model_version'] = str(
            case_data['sv_rank_model_version'])

    # Stored as float; a threshold of 0 is falsy and therefore not stored
    if case_data.get('rank_score_threshold'):
        case_obj['rank_score_threshold'] = float(
            case_data['rank_score_threshold'])

    # phenotype information; terms for which build_phenotype returns a falsy
    # value are skipped
    phenotypes = []
    for phenotype in case_data.get('phenotype_terms', []):
        phenotype_obj = build_phenotype(phenotype, adapter)
        if phenotype_obj:
            phenotypes.append(phenotype_obj)
    if phenotypes:
        case_obj['phenotype_terms'] = phenotypes

    # phenotype groups, built the same way as the phenotype terms
    phenotype_groups = []
    for phenotype in case_data.get('phenotype_groups', []):
        phenotype_obj = build_phenotype(phenotype, adapter)
        if phenotype_obj:
            phenotype_groups.append(phenotype_obj)
    if phenotype_groups:
        case_obj['phenotype_groups'] = phenotype_groups

    # Files
    case_obj['madeline_info'] = case_data.get('madeline_info')
    case_obj['chromograph_image_files'] = case_data.get(
        'chromograph_image_files')
    case_obj['chromograph_prefixes'] = case_data.get('chromograph_prefixes')

    # multiqc is only stored when the key is present, even if its value is None
    if 'multiqc' in case_data:
        case_obj['multiqc'] = case_data.get('multiqc')
    case_obj['vcf_files'] = case_data.get('vcf_files', {})
    case_obj['delivery_report'] = case_data.get('delivery_report')

    # Flags derived from which VCF files were provided
    case_obj['has_svvariants'] = False
    if (case_obj['vcf_files'].get('vcf_sv') or
            case_obj['vcf_files'].get('vcf_sv_research')):
        case_obj['has_svvariants'] = True

    case_obj['has_strvariants'] = False
    if (case_obj['vcf_files'].get('vcf_str')):
        case_obj['has_strvariants'] = True

    case_obj['is_migrated'] = False

    # What experiment is used, alternatives are rare (rare disease) or cancer
    case_obj['track'] = case_data.get('track', 'rare')

    return case_obj
def update_institute(
    self,
    internal_id,
    sanger_recipient=None,
    sanger_recipients=None,
    loqusdb_id=None,
    coverage_cutoff=None,
    frequency_cutoff=None,
    display_name=None,
    remove_sanger=None,
    phenotype_groups=None,
    group_abbreviations=None,
    add_groups=None,
    sharing_institutes=None,
    cohorts=None,
):
    """Update the information for an institute

    Only the settings that are passed in are changed. The document is written
    (and 'updated_at' refreshed) only if at least one change was requested.

    Args:
        internal_id(str): The internal institute id
        sanger_recipient(str): Email adress to add for sanger order
        sanger_recipients(list): A list of sanger recipients email addresses
        loqusdb_id(str): identify loqusdb setting to use
        coverage_cutoff(int): Update coverage cutoff
        frequency_cutoff(float): New frequency cutoff
        display_name(str): New display name
        remove_sanger(str): Email adress for sanger user to be removed
        phenotype_groups(iterable(str)): New phenotype groups
        group_abbreviations(iterable(str)): abbreviations, matched to
            phenotype_groups by position
        add_groups(bool): If groups should be added. If False replace groups
        sharing_institutes(list(str)): Other institutes to share cases with
        cohorts(list(str)): patient cohorts

    Returns:
        updated_institute(dict): the institute after the update, or the
            unchanged institute when nothing was requested.
            NOTE: when a phenotype group term is unknown, an error message
            (str) is returned instead and nothing is written.

    Raises:
        IntegrityError: if the institute or the sanger recipient user is not found
    """
    add_groups = add_groups or False
    institute_obj = self.institute(internal_id)
    if not institute_obj:
        raise IntegrityError("Institute {} does not exist in database".format(internal_id))

    updates = {"$set": {}}
    updated_institute = institute_obj

    if sanger_recipient:
        user_obj = self.user(sanger_recipient)
        if not user_obj:
            raise IntegrityError("user {} does not exist in database".format(sanger_recipient))

        LOG.info(
            "Updating sanger recipients for institute: {0} with {1}".format(
                internal_id, sanger_recipient
            )
        )
        updates["$push"] = {"sanger_recipients": sanger_recipient}

    if sanger_recipients is not None:
        updates["$set"]["sanger_recipients"] = sanger_recipients  # can be empty list

    if remove_sanger:
        LOG.info(
            "Removing sanger recipient {0} from institute: {1}".format(remove_sanger, internal_id)
        )
        updates["$pull"] = {"sanger_recipients": remove_sanger}

    if coverage_cutoff:
        LOG.info(
            "Updating coverage cutoff for institute: {0} to {1}".format(
                internal_id, coverage_cutoff
            )
        )
        updates["$set"]["coverage_cutoff"] = coverage_cutoff

    if frequency_cutoff:
        LOG.info(
            "Updating frequency cutoff for institute: {0} to {1}".format(
                internal_id, frequency_cutoff
            )
        )
        updates["$set"]["frequency_cutoff"] = frequency_cutoff

    if display_name:
        LOG.info(
            "Updating display name for institute: {0} to {1}".format(internal_id, display_name)
        )
        updates["$set"]["display_name"] = display_name

    if phenotype_groups is not None:
        if group_abbreviations:
            # Materialize so the abbreviations can be indexed by position
            group_abbreviations = list(group_abbreviations)

        existing_groups = {}
        if add_groups:
            # Work on a copy: the fallback is the shared PHENOTYPE_GROUPS
            # constant, and adding groups must never modify it (or the
            # fetched institute document) in place.
            existing_groups = dict(institute_obj.get("phenotype_groups", PHENOTYPE_GROUPS))

        for i, hpo_term in enumerate(phenotype_groups):
            hpo_obj = self.hpo_term(hpo_term)
            if not hpo_obj:
                # Reported as a message rather than an exception; callers
                # rely on this return value.
                return "Term {} does not exist in database".format(hpo_term)
            description = hpo_obj["description"]
            abbreviation = None
            if group_abbreviations:
                abbreviation = group_abbreviations[i]
            existing_groups[hpo_term] = {"name": description, "abbr": abbreviation}
        updates["$set"]["phenotype_groups"] = existing_groups

    if sharing_institutes is not None:
        updates["$set"]["collaborators"] = sharing_institutes

    if cohorts is not None:
        updates["$set"]["cohorts"] = cohorts

    if loqusdb_id is not None:
        LOG.info("Updating loqusdb id for institute: %s to %s", internal_id, loqusdb_id)
        updates["$set"]["loqusdb_id"] = loqusdb_id

    # Only write when something was actually requested
    if updates["$set"].keys() or updates.get("$push") or updates.get("$pull"):
        updates["$set"]["updated_at"] = datetime.now()
        updated_institute = self.institute_collection.find_one_and_update(
            {"_id": internal_id},
            updates,
            return_document=pymongo.ReturnDocument.AFTER,
        )
        LOG.info("Institute updated")

    return updated_institute
def load_case(self, config_data, update=False, keep_actions=True):
    """Load a case into the database

    Check if the owner and the institute exists. If update is True, old case
    variants will be removed for every variant category that has a VCF file
    in the new case.

    Args:
        config_data(dict): A dictionary with all the necessary information
        update(bool): If existing case should be updated
        keep_actions(bool): Attempt transfer of existing case user actions to new vars

    Returns:
        case_obj(dict)

    Raises:
        IntegrityError: if the owner institute does not exist. Further errors
            may be raised by build_case and check_existing_data.
    """
    # Check that the owner exists in the database
    institute_obj = self.institute(config_data["owner"])
    if not institute_obj:
        raise IntegrityError("Institute '%s' does not exist in database" % config_data["owner"])

    # Build the case object
    case_obj = build_case(config_data, self)

    # Check if case exists with old case id (old ids had the form <owner>-<display_name>)
    old_caseid = "-".join([case_obj["owner"], case_obj["display_name"]])
    old_case = self.case(old_caseid)

    # This is to keep sanger order and validation status
    old_sanger_variants = self.case_sanger_variants(case_obj["_id"])

    # The genome build is passed to load_variants as a string, default "37"
    genome_build = str(config_data.get("genome_build", 37))

    if old_case:
        LOG.info(
            "Update case id for existing case: %s -> %s",
            old_caseid,
            case_obj["_id"],
        )
        self.update_caseid(old_case, case_obj["_id"])
        # A case found under its old id is always treated as an update
        update = True

    # Retrieve info to be propagated to eventual updated case
    # previously evaluated variants (acmg, manual rank, cancer tier, dismissed, mosaic, commented)
    # The existing case is looked up by _id first, then by institute and display name
    existing_case = self.case(case_id=case_obj["_id"]) or self.case(
        institute_id=institute_obj["_id"], display_name=case_obj["display_name"]
    )
    old_evaluated_variants = self.check_existing_data(
        case_obj, existing_case, institute_obj, update, keep_actions
    )

    # NOTE(review): this looks like a repeat of the evaluated-variants query
    # that check_existing_data is called for just above, differing only in
    # using case_obj["owner"] instead of institute_obj["_id"] - confirm and
    # consider removing one of the two.
    if existing_case and keep_actions:
        # collect all variants with user actions for this case
        old_evaluated_variants = list(
            self.evaluated_variants(case_obj["_id"], case_obj["owner"])
        )

    # VCF file keys to load, with the variant type and category they map to
    files = [
        {"file_name": "vcf_snv", "variant_type": "clinical", "category": "snv"},
        {"file_name": "vcf_sv", "variant_type": "clinical", "category": "sv"},
        {
            "file_name": "vcf_cancer",
            "variant_type": "clinical",
            "category": "cancer",
        },
        {
            "file_name": "vcf_cancer_sv",
            "variant_type": "clinical",
            "category": "cancer_sv",
        },
        {"file_name": "vcf_str", "variant_type": "clinical", "category": "str"},
    ]

    try:
        for vcf_file in files:
            # Check if file exists
            if not case_obj["vcf_files"].get(vcf_file["file_name"]):
                LOG.debug("didn't find {}, skipping".format(vcf_file["file_name"]))
                continue

            variant_type = vcf_file["variant_type"]
            category = vcf_file["category"]
            if update:
                # Remove the old variants of this category before loading new ones
                self.delete_variants(
                    case_id=case_obj["_id"],
                    variant_type=variant_type,
                    category=category,
                )

            # get custom images from config file
            custom_images = (
                case_obj["custom_images"][category]
                if case_obj.get("custom_images") and category in case_obj.get("custom_images")
                else None
            )

            # add variants
            self.load_variants(
                case_obj=case_obj,
                variant_type=variant_type,
                category=category,
                build=genome_build,
                rank_threshold=case_obj.get("rank_score_threshold", 5),
                custom_images=custom_images,
            )

    except (IntegrityError, ValueError, ConfigError, KeyError) as error:
        # A failure while loading variants is only logged: the remaining files
        # are skipped, but the case itself is still saved below.
        LOG.warning(error)

    if existing_case:
        # Reset rerun request and status on the case that replaces the old one
        case_obj["rerun_requested"] = False
        if case_obj["status"] in ["active", "archived"]:
            case_obj["status"] = "inactive"

        self.update_case(case_obj)

        # update Sanger status for the new inserted variants
        self.update_case_sanger_variants(institute_obj, case_obj, old_sanger_variants)

        # Transfer the user actions collected before the old variants were deleted
        if keep_actions and old_evaluated_variants:
            self.update_variant_actions(institute_obj, case_obj, old_evaluated_variants)

    else:
        LOG.info("Loading case %s into database", case_obj["display_name"])
        self._add_case(case_obj)

    return case_obj
def load_case(self, config_data, update=False):
    """Load a case into the database

    Check if the owner and the institute exists. When the case is updated,
    old variants are removed for every variant category that has a VCF file
    in the new case.

    Args:
        config_data(dict): A dictionary with all the necessary information
        update(bool): If existing case should be updated

    Returns:
        case_obj(dict)

    Raises:
        IntegrityError: if the owner institute does not exist, or if the case
            already exists and update is False
    """
    # Check that the owner exists in the database
    institute_obj = self.institute(config_data["owner"])
    if not institute_obj:
        raise IntegrityError("Institute '%s' does not exist in database" % config_data["owner"])

    # Parse the case information
    parsed_case = parse_case(config=config_data)
    # Build the case object
    case_obj = build_case(parsed_case, self)

    # Check if case exists with old case id (old ids had the form <owner>-<display_name>)
    old_caseid = "-".join([case_obj["owner"], case_obj["display_name"]])
    old_case = self.case(old_caseid)

    # This is to keep sanger order and validation status
    old_sanger_variants = self.case_sanger_variants(case_obj["_id"])

    if old_case:
        LOG.info(
            "Update case id for existing case: %s -> %s",
            old_caseid,
            case_obj["_id"],
        )
        self.update_caseid(old_case, case_obj["_id"])
        # A case found under its old id is always treated as an update
        update = True

    # Check if case exists in database
    existing_case = self.case(case_obj["_id"])
    if existing_case and not update:
        raise IntegrityError("Case %s already exists in database" % case_obj["_id"])

    # VCF file keys to load, with the variant type and category they map to
    files = [
        {
            "file_name": "vcf_snv",
            "variant_type": "clinical",
            "category": "snv"
        },
        {
            "file_name": "vcf_sv",
            "variant_type": "clinical",
            "category": "sv"
        },
        {
            "file_name": "vcf_cancer",
            "variant_type": "clinical",
            "category": "cancer",
        },
        {
            "file_name": "vcf_cancer_sv",
            "variant_type": "clinical",
            "category": "cancer_sv",
        },
        {
            "file_name": "vcf_str",
            "variant_type": "clinical",
            "category": "str"
        },
    ]

    try:
        for vcf_file in files:
            # Check if file exists
            if not case_obj["vcf_files"].get(vcf_file["file_name"]):
                LOG.debug("didn't find {}, skipping".format(
                    vcf_file["file_name"]))
                continue

            variant_type = vcf_file["variant_type"]
            category = vcf_file["category"]
            if update:
                # Remove the old variants of this category before loading new ones
                self.delete_variants(
                    case_id=case_obj["_id"],
                    variant_type=variant_type,
                    category=category,
                )
            self.load_variants(
                case_obj=case_obj,
                variant_type=variant_type,
                category=category,
                rank_threshold=case_obj.get("rank_score_threshold", 5),
            )

    except (IntegrityError, ValueError, ConfigError, KeyError) as error:
        # A failure while loading variants is only logged: the remaining files
        # are skipped, but the case itself is still saved below.
        LOG.warning(error)

    if existing_case and update:
        # Reset rerun request and status on the case that replaces the old one
        case_obj["rerun_requested"] = False
        if case_obj["status"] in ["active", "archived"]:
            case_obj["status"] = "inactive"

        self.update_case(case_obj)

        # update Sanger status for the new inserted variants
        self.update_case_sanger_variants(institute_obj, case_obj, old_sanger_variants)

    else:
        LOG.info("Loading case %s into database", case_obj["display_name"])
        self._add_case(case_obj)

    return case_obj
def build_case(case_data, adapter):
    """Build a case object that is to be inserted to the database

    Args:
        case_data (dict): A dictionary with the relevant case information.
            "case_id" and "owner" are required.
        adapter (scout.adapter.MongoAdapter)

    Returns:
        case_obj (dict): A case object

        dict(
            _id = str, # required=True, unique, taken from case_data["case_id"]
            display_name = str, # If not display name use case_id
            owner = str, # required

            # These are the names of all the collaborators that are allowed to view the
            # case, including the owner
            collaborators = list, # List of institute_ids
            assignees = list, # [_id of a user], only set when an assignee is given
            individuals = list, # list of dictionaries with individuals
            created_at = datetime,
            updated_at = datetime,
            suspects = list, # List of variants referred by their _id
            causatives = list, # List of variants referred by their _id

            synopsis = str, # The synopsis is a text blob
            status = str, # default='inactive', choices=STATUS
            is_research = bool, # default=False
            research_requested = bool, # default=False
            rerun_requested = bool, # default=False
            cohorts = list, # list of strings

            analysis_date = datetime,
            analyses = list, # list of dict

            # default_panels specifies which panels that should be shown when
            # the case is opened
            panels = list, # list of dictionaries with panel information

            dynamic_gene_list = list, # List of genes

            genome_build = str, # This should be 37 or 38
            rank_model_version = str,
            rank_score_threshold = float, # only set when given in case_data

            phenotype_terms = list, # List of dictionaries with phenotype information
            phenotype_groups = list, # List of dictionaries with phenotype information

            madeline_info = str, # madeline info is a full xml file
            multiqc = str, # path to dir with multiqc information
            cnv_report = str, # path to file with cnv report
            coverage_qc_report = str, # path to file with coverage and qc report
            gene_fusion_report = str, # path to the gene fusions report
            gene_fusion_report_research = str, # path to the research gene fusions report

            vcf_files = dict, # A dictionary with vcf files

            diagnosis_phenotypes = list, # List of references to diseases
            diagnosis_genes = list, # List of references to genes

            has_svvariants = bool, # default=False

            is_migrated = bool # default=False
        )

    Raises:
        KeyError: if case_data has no "case_id"
        ConfigError: if case_data has no "owner"
        IntegrityError: if the owner institute is not found by the adapter
    """
    LOG.info("build case with id: %s", case_data["case_id"])
    case_obj = {
        "_id": case_data["case_id"],
        "display_name": case_data.get("display_name", case_data["case_id"]),
    }

    # Check if institute exists in database
    try:
        institute_id = case_data["owner"]
    except KeyError as err:
        raise ConfigError("Case has to have a institute") from err
    institute_obj = adapter.institute(institute_id)
    if not institute_obj:
        raise IntegrityError("Institute %s not found in database" % institute_id)
    case_obj["owner"] = institute_id

    # Owner always has to be part of collaborators
    collaborators = set(case_data.get("collaborators") or [])
    collaborators.add(institute_id)
    case_obj["collaborators"] = list(collaborators)

    if case_data.get("assignee"):
        case_obj["assignees"] = [case_data["assignee"]]

    case_obj["smn_tsv"] = case_data.get("smn_tsv")

    # Individuals; any error raised by build_individual propagates to the caller
    ind_objs = [
        build_individual(individual)
        for individual in case_data.get("individuals") or []
    ]
    # sort the samples to put the affected individual first
    # (descending on the numeric "phenotype" value)
    case_obj["individuals"] = sorted(ind_objs, key=lambda ind: -ind["phenotype"])

    now = datetime.now()
    case_obj["created_at"] = now
    case_obj["updated_at"] = now

    if case_data.get("suspects"):
        case_obj["suspects"] = case_data["suspects"]
    if case_data.get("causatives"):
        case_obj["causatives"] = case_data["causatives"]

    case_obj["synopsis"] = case_data.get("synopsis", "")

    case_obj["status"] = "inactive"
    case_obj["is_research"] = False
    case_obj["research_requested"] = False
    case_obj["rerun_requested"] = False

    case_obj["lims_id"] = case_data.get("lims_id", "")

    analysis_date = case_data.get("analysis_date")
    if analysis_date:
        case_obj["analysis_date"] = analysis_date

    # We store some metadata and references about gene panels in 'panels'
    default_panels = case_data.get("default_panels") or []
    panels = []
    for panel_name in case_data.get("gene_panels") or []:
        panel_obj = adapter.gene_panel(panel_name)
        if not panel_obj:
            # Unknown panels are skipped rather than failing the whole case
            LOG.warning(
                "Panel %s does not exist in database and will not be saved in case document.",
                panel_name,
            )
            continue
        panels.append(
            {
                "panel_id": panel_obj["_id"],
                "panel_name": panel_obj["panel_name"],
                "display_name": panel_obj["display_name"],
                "version": panel_obj["version"],
                "updated_at": panel_obj["date"],
                "nr_genes": len(panel_obj["genes"]),
                "is_default": panel_name in default_panels,
            }
        )
    case_obj["panels"] = panels

    case_obj["dynamic_gene_list"] = []

    # Meta data
    # TODO raise exception if invalid genome build (not "37"/"38") was used
    case_obj["genome_build"] = case_data.get("genome_build", "37")

    if case_data.get("rank_model_version"):
        case_obj["rank_model_version"] = str(case_data["rank_model_version"])

    if case_data.get("sv_rank_model_version"):
        case_obj["sv_rank_model_version"] = str(case_data["sv_rank_model_version"])

    # Compare against None instead of truthiness so that an explicitly
    # configured threshold of 0 is stored and not silently dropped
    rank_score_threshold = case_data.get("rank_score_threshold")
    if rank_score_threshold is not None:
        case_obj["rank_score_threshold"] = float(rank_score_threshold)

    # Cohort information
    cohorts = case_data.get("cohorts")
    if cohorts:
        case_obj["cohorts"] = cohorts
        # Check if all case cohorts are registered under the institute
        institute_cohorts = set(institute_obj.get("cohorts", []))
        all_cohorts = institute_cohorts.union(cohorts)
        if len(all_cohorts) > len(institute_cohorts):
            # if not, update institute with new cohorts
            LOG.warning("Updating institute object with new cohort terms")
            adapter.institute_collection.find_one_and_update(
                {"_id": institute_obj["_id"]},
                {"$set": {"cohorts": list(all_cohorts)}},
            )

    # phenotype information
    if case_data.get("phenotype_terms"):
        phenotypes = []
        for phenotype in case_data["phenotype_terms"]:
            phenotype_obj = adapter.hpo_term(phenotype)
            if phenotype_obj is None:
                LOG.warning(
                    "Could not find term with ID '%s' in HPO collection, skipping phenotype term.",
                    phenotype,
                )
                continue
            phenotypes.append(
                {
                    "phenotype_id": phenotype,
                    "feature": phenotype_obj.get("description"),
                }
            )
        if phenotypes:
            case_obj["phenotype_terms"] = phenotypes

    # phenotype groups
    if case_data.get("phenotype_groups"):
        phenotype_groups = []
        for phenotype in case_data["phenotype_groups"]:
            phenotype_obj = build_phenotype(phenotype, adapter)
            if phenotype_obj:
                phenotype_groups.append(phenotype_obj)
        if phenotype_groups:
            case_obj["phenotype_groups"] = phenotype_groups

    # Files
    case_obj["madeline_info"] = case_data.get("madeline_info")
    case_obj["custom_images"] = case_data.get("custom_images")
    for custom_report in CUSTOM_CASE_REPORTS:
        if custom_report in case_data:
            case_obj[custom_report] = case_data[custom_report]

    vcf_files = case_data.get("vcf_files") or {}
    case_obj["vcf_files"] = vcf_files
    case_obj["delivery_report"] = case_data.get("delivery_report")

    case_obj["has_svvariants"] = bool(
        vcf_files.get("vcf_sv") or vcf_files.get("vcf_sv_research")
    )
    case_obj["has_strvariants"] = bool(vcf_files.get("vcf_str"))

    case_obj["is_migrated"] = False

    # What experiment is used, alternatives are rare (rare disease) or cancer
    case_obj["track"] = case_data.get("track", "rare")
    case_obj["group"] = case_data.get("group", [])
    return case_obj
def load_variants(adapter, variant_file, case_obj, variant_type='clinical',
                  category='snv', rank_threshold=5, chrom=None, start=None,
                  end=None):
    """Load the variants of a VCF file into the database

    A variant is loaded when its rank score is above ``rank_threshold``.
    When ``chrom`` is given the rank score is ignored and every variant
    accepted by ``check_coordinates`` for the region is loaded instead.

    Args:
        adapter(MongoAdapter)
        variant_file(str): Path to variant file
        case_obj(dict): The case; 'owner' and 'display_name' are read here
        variant_type(str)
        category(str): 'snv' or 'sv'. Accepted for interface compatibility,
            not used in this function body.
        rank_threshold(int)
        chrom(str)
        start(int)
        end(int)

    Raises:
        IntegrityError: if the owner institute does not exist in database
        Exception: any error raised while parsing/building variants is
            re-raised; when no region was given the variants of the case
            are deleted first.
    """
    institute_obj = adapter.institute(institute_id=case_obj['owner'])

    if not institute_obj:
        raise IntegrityError("Institute {0} does not exist in"
                             " database.".format(case_obj['owner']))

    gene_to_panels = adapter.gene_to_panels()

    hgncid_to_gene = adapter.hgncid_to_gene()

    vcf_obj = VCF(variant_file)

    rank_results_header = parse_rank_results_header(vcf_obj)

    vep_header = parse_vep_header(vcf_obj)

    # This is a dictionary to tell where ind are in vcf
    individual_positions = {ind: i for i, ind in enumerate(vcf_obj.samples)}

    # Region restriction, only active when a chromosome was given
    coordinates = None
    if chrom:
        coordinates = {
            'chrom': chrom,
            'start': start,
            'end': end
        }

    logger.info("Start inserting variants into database")
    start_insertion = datetime.now()
    start_five_thousand = datetime.now()
    # Number of variants seen so far; stays 0 for an empty file
    nr_variants = 0
    nr_inserted = 0

    try:
        for nr_variants, variant in enumerate(vcf_obj, 1):
            rank_score = parse_rank_score(
                variant.INFO.get('RankScore'),
                case_obj['display_name']
            )
            # With a region every variant is parsed, otherwise only those
            # above the rank score threshold
            if coordinates or (rank_score > rank_threshold):
                parsed_variant = parse_variant(
                    variant=variant,
                    case=case_obj,
                    variant_type=variant_type,
                    rank_results_header=rank_results_header,
                    vep_header=vep_header,
                    individual_positions=individual_positions
                )
                # If there are coordinates the variant has to be inside them
                if not coordinates or check_coordinates(parsed_variant, coordinates):
                    variant_obj = build_variant(
                        variant=parsed_variant,
                        institute_id=institute_obj['_id'],
                        gene_to_panels=gene_to_panels,
                        hgncid_to_gene=hgncid_to_gene,
                    )
                    try:
                        load_variant(adapter, variant_obj)
                    except IntegrityError as error:
                        # Best effort: a variant rejected by the database
                        # (presumably a duplicate - confirm) is skipped
                        logger.debug("Variant not inserted: %s", error)
                    else:
                        nr_inserted += 1
                        # Report once per 1000 actual insertions
                        if nr_inserted % 1000 == 0:
                            logger.info("%s variants inserted", nr_inserted)

            if nr_variants % 5000 == 0:
                logger.info("%s variants parsed", nr_variants)
                logger.info("Time to parse variants: %s ",
                            datetime.now() - start_five_thousand)
                start_five_thousand = datetime.now()

    except Exception:
        if not coordinates:
            # A full load failed half way, remove what was inserted
            logger.warning("Deleting inserted variants")
            delete_variants(adapter, case_obj, variant_type)
        # Bare raise keeps the original traceback intact
        raise

    logger.info("All variants inserted.")
    logger.info("Number of variants in file: {0}".format(nr_variants))
    logger.info("Number of variants inserted: {0}".format(nr_inserted))
    logger.info("Time to insert variants:{0}".format(datetime.now() - start_insertion))
def build_case(case_data, adapter):
    """Build a case object that is to be inserted to the database

    Args:
        case_data (dict): A dictionary with the relevant case information.
            "case_id" and "owner" are required.
        adapter (scout.adapter.MongoAdapter)

    Returns:
        case_obj (dict): A case object

        dict(
            _id = str, # required=True, unique, taken from case_data["case_id"]
            display_name = str, # If not display name use case_id
            owner = str, # required

            # These are the names of all the collaborators that are allowed to view the
            # case, including the owner
            collaborators = list, # List of institute_ids
            assignees = list, # [_id of a user], only set when an assignee is given
            individuals = list, # list of dictionaries with individuals
            created_at = datetime,
            updated_at = datetime,
            suspects = list, # List of variants referred by their _id
            causatives = list, # List of variants referred by their _id

            synopsis = str, # The synopsis is a text blob
            status = str, # default='inactive', choices=STATUS
            is_research = bool, # default=False
            research_requested = bool, # default=False
            rerun_requested = bool, # default=False

            analysis_date = datetime,
            analyses = list, # list of dict

            # default_panels specifies which panels that should be shown when
            # the case is opened
            panels = list, # list of dictionaries with panel information

            dynamic_gene_list = list, # List of genes

            genome_build = str, # This should be 37 or 38
            genome_version = float, # What version of the build

            rank_model_version = str,
            rank_score_threshold = float, # only set when given in case_data

            phenotype_terms = list, # List of dictionaries with phenotype information
            phenotype_groups = list, # List of dictionaries with phenotype information

            madeline_info = str, # madeline info is a full xml file
            multiqc = str, # path to dir with multiqc information

            vcf_files = dict, # A dictionary with vcf files

            diagnosis_phenotypes = list, # List of references to diseases
            diagnosis_genes = list, # List of references to genes

            has_svvariants = bool, # default=False

            is_migrated = bool # default=False
        )

    Raises:
        KeyError: if case_data has no "case_id"
        ConfigError: if case_data has no "owner"
        IntegrityError: if the owner institute or one of the gene panels
            is not found by the adapter
    """
    log.info("build case with id: %s", case_data["case_id"])
    case_obj = {
        "_id": case_data["case_id"],
        "display_name": case_data.get("display_name", case_data["case_id"]),
    }

    # Check if institute exists in database
    try:
        institute_id = case_data["owner"]
    except KeyError as err:
        raise ConfigError("Case has to have a institute") from err
    institute_obj = adapter.institute(institute_id)
    if not institute_obj:
        raise IntegrityError("Institute %s not found in database" % institute_id)
    case_obj["owner"] = institute_id

    # Owner always has to be part of collaborators
    collaborators = set(case_data.get("collaborators") or [])
    collaborators.add(institute_id)
    case_obj["collaborators"] = list(collaborators)

    if case_data.get("assignee"):
        case_obj["assignees"] = [case_data["assignee"]]

    case_obj["smn_tsv"] = case_data.get("smn_tsv")

    # Individuals; any error raised by build_individual propagates to the caller
    ind_objs = [
        build_individual(individual)
        for individual in case_data.get("individuals") or []
    ]
    # sort the samples to put the affected individual first
    # (descending on the numeric "phenotype" value)
    case_obj["individuals"] = sorted(ind_objs, key=lambda ind: -ind["phenotype"])

    now = datetime.now()
    case_obj["created_at"] = now
    case_obj["updated_at"] = now

    if case_data.get("suspects"):
        case_obj["suspects"] = case_data["suspects"]
    if case_data.get("causatives"):
        case_obj["causatives"] = case_data["causatives"]

    case_obj["synopsis"] = case_data.get("synopsis", "")

    case_obj["status"] = "inactive"
    case_obj["is_research"] = False
    case_obj["research_requested"] = False
    case_obj["rerun_requested"] = False

    analysis_date = case_data.get("analysis_date")
    if analysis_date:
        case_obj["analysis_date"] = analysis_date

    # We store some metadata and references about gene panels in 'panels'
    default_panels = case_data.get("default_panels") or []
    panels = []
    for panel_name in case_data.get("gene_panels") or []:
        panel_obj = adapter.gene_panel(panel_name)
        if not panel_obj:
            # A case may only refer to panels that exist in the database
            raise IntegrityError("Panel %s does not exist in database" % panel_name)
        panels.append(
            {
                "panel_id": panel_obj["_id"],
                "panel_name": panel_obj["panel_name"],
                "display_name": panel_obj["display_name"],
                "version": panel_obj["version"],
                "updated_at": panel_obj["date"],
                "nr_genes": len(panel_obj["genes"]),
                "is_default": panel_name in default_panels,
            }
        )
    case_obj["panels"] = panels

    case_obj["dynamic_gene_list"] = []

    # Meta data
    # TODO raise exception if invalid genome build (not "37"/"38") was used
    case_obj["genome_build"] = case_data.get("genome_build", "37")
    case_obj["genome_version"] = case_data.get("genome_version")

    if case_data.get("rank_model_version"):
        case_obj["rank_model_version"] = str(case_data["rank_model_version"])

    if case_data.get("sv_rank_model_version"):
        case_obj["sv_rank_model_version"] = str(case_data["sv_rank_model_version"])

    # Compare against None instead of truthiness so that an explicitly
    # configured threshold of 0 is stored and not silently dropped
    rank_score_threshold = case_data.get("rank_score_threshold")
    if rank_score_threshold is not None:
        case_obj["rank_score_threshold"] = float(rank_score_threshold)

    # phenotype information; terms that build_phenotype cannot resolve
    # (falsy result) are left out
    phenotypes = []
    for phenotype in case_data.get("phenotype_terms") or []:
        phenotype_obj = build_phenotype(phenotype, adapter)
        if phenotype_obj:
            phenotypes.append(phenotype_obj)
    if phenotypes:
        case_obj["phenotype_terms"] = phenotypes

    # phenotype groups
    phenotype_groups = []
    for phenotype in case_data.get("phenotype_groups") or []:
        phenotype_obj = build_phenotype(phenotype, adapter)
        if phenotype_obj:
            phenotype_groups.append(phenotype_obj)
    if phenotype_groups:
        case_obj["phenotype_groups"] = phenotype_groups

    # Files
    case_obj["madeline_info"] = case_data.get("madeline_info")
    case_obj["chromograph_image_files"] = case_data.get("chromograph_image_files")
    case_obj["chromograph_prefixes"] = case_data.get("chromograph_prefixes")

    if "multiqc" in case_data:
        case_obj["multiqc"] = case_data["multiqc"]

    vcf_files = case_data.get("vcf_files") or {}
    case_obj["vcf_files"] = vcf_files
    case_obj["delivery_report"] = case_data.get("delivery_report")

    case_obj["has_svvariants"] = bool(
        vcf_files.get("vcf_sv") or vcf_files.get("vcf_sv_research")
    )
    case_obj["has_strvariants"] = bool(vcf_files.get("vcf_str"))

    case_obj["is_migrated"] = False

    # What experiment is used, alternatives are rare (rare disease) or cancer
    case_obj["track"] = case_data.get("track", "rare")

    return case_obj