def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None, quality_filter=None, indivs_to_consider=None, user=None):
    """Yield a family's variants that pass ``variant_filter``, ordered by xpos.

    Args:
        project_id: project the family belongs to.
        family_id: family whose variant collection is queried.
        genotype_filter: forwarded to ``self._make_db_query``.
        variant_filter: forwarded to ``self._make_db_query`` and re-applied to
            every returned document through ``passes_variant_filter``.
        quality_filter, indivs_to_consider, user: accepted but not used here.

    Yields:
        Variant objects carrying ``project_id`` / ``family_id`` extras and the
        annotations added by ``self.add_annotations_to_variants``.

    Raises:
        Exception: while iterating, once the query has returned more than
            ``settings.VARIANT_QUERY_RESULTS_LIMIT`` documents.

    If no collection exists for the family, an error is logged and nothing
    is yielded.
    """
    db_query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    # Compare with None explicitly: PyMongo 4 collection objects raise
    # NotImplementedError when truth-tested (assumes the helper returns
    # either None or a PyMongo collection).
    if collection is None:
        logger.error("Error: mongodb collection not found for project %s family %s ", project_id, family_id)
        return

    limit = settings.VARIANT_QUERY_RESULTS_LIMIT
    # Every top-level condition becomes its own $and clause. MongoDB rejects
    # an empty $and array, so an empty query is sent as a plain {}.
    mongo_query = {'$and': [{k: v} for k, v in db_query.items()]} if db_query else {}
    # Fetch a few documents beyond the limit so an overflow can be detected.
    cursor = collection.find(mongo_query).sort('xpos').limit(limit + 5)
    for i, variant_dict in enumerate(cursor):
        if i >= limit:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % limit)

        variant = Variant.fromJSON(variant_dict)
        variant.set_extra('project_id', project_id)
        variant.set_extra('family_id', family_id)
        self.add_annotations_to_variants([variant], project_id)

        # The database query is only a pre-filter; the full filter is re-checked here.
        if passes_variant_filter(variant, variant_filter)[0]:
            yield variant
def get_project_variants_in_gene(self, project_id, gene_id, variant_filter=None):
    """Return the project's variants in ``gene_id`` that pass ``variant_filter``.

    Args:
        project_id: project whose variant collection is queried.
        gene_id: gene added to (a copy of) the filter.
        variant_filter: optional filter; it is deep-copied, so the caller's
            object is never modified. A fresh ``VariantFilter`` is used when None.

    Returns:
        A list of annotated Variant objects sorted by ``unique_tuple()``.
    """
    if variant_filter is None:
        modified_variant_filter = VariantFilter()
    else:
        modified_variant_filter = copy.deepcopy(variant_filter)
    modified_variant_filter.add_gene(gene_id)

    db_query = self._make_db_query(None, modified_variant_filter)
    sys.stderr.write("Project Gene Search: " + str(project_id) + " all variants query: " + str(db_query))
    collection = self._get_project_collection(project_id)

    # we have to collect list in memory here because mongo can't sort on xpos,
    # as result size can get too big.
    # need to find a better way to do this.
    cursor = collection.find(db_query).hint([('db_gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)])
    variants = []
    for variant_dict in cursor:
        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        # Only element [0] of the result is the pass/fail flag (that is how
        # the get_variants methods consume it); testing the whole return
        # value would accept every variant.
        if passes_variant_filter(variant, modified_variant_filter)[0]:
            variants.append(variant)

    variants.sort(key=lambda v: v.unique_tuple())
    return variants
def get_variants_in_gene(self, project_id, family_id, gene_id, genotype_filter=None, variant_filter=None):
    """Yield a family's variants in ``gene_id`` that pass the filters.

    Args:
        project_id: project the family belongs to.
        family_id: family whose variant collection is queried.
        gene_id: gene added to (a copy of) the variant filter.
        genotype_filter: forwarded to ``self._make_db_query``.
        variant_filter: optional filter; it is deep-copied, so the caller's
            object is never modified. A fresh ``VariantFilter`` is used when None.

    Yields:
        Annotated Variant objects in ``unique_tuple()`` order. Nothing is
        yielded when the family has no collection.
    """
    if variant_filter is None:
        modified_variant_filter = VariantFilter()
    else:
        modified_variant_filter = copy.deepcopy(variant_filter)
    modified_variant_filter.add_gene(gene_id)

    db_query = self._make_db_query(genotype_filter, modified_variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    # Compare with None explicitly: PyMongo 4 collection objects raise
    # NotImplementedError when truth-tested (assumes the helper returns
    # either None or a PyMongo collection).
    if collection is None:
        return

    # we have to collect list in memory here because mongo can't sort on xpos,
    # as result size can get too big.
    # need to find a better way to do this.
    cursor = collection.find(db_query).hint([('db_gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)])
    variants = []
    for variant_dict in cursor:
        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        # Only element [0] of the result is the pass/fail flag (that is how
        # the get_variants methods consume it); testing the whole return
        # value would accept every variant.
        if passes_variant_filter(variant, modified_variant_filter)[0]:
            variants.append(variant)

    variants.sort(key=lambda v: v.unique_tuple())
    for v in variants:
        yield v
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
    """Yield a family's variants that pass ``variant_filter``, ordered by xpos.

    Args:
        project_id: project the family belongs to.
        family_id: family whose variant collection is queried.
        genotype_filter: forwarded to ``self._make_db_query``.
        variant_filter: forwarded to ``self._make_db_query`` and re-applied to
            every returned document through ``passes_variant_filter``.

    Yields:
        Variant objects annotated by ``self.add_annotations_to_variant``.

    Raises:
        Exception: while iterating, once the query has returned more than
            ``MONGO_QUERY_RESULTS_LIMIT`` documents.

    If no collection exists for the family, a message is printed and nothing
    is yielded.
    """
    db_query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    # Compare with None explicitly: PyMongo 4 collection objects raise
    # NotImplementedError when truth-tested (assumes the helper returns
    # either None or a PyMongo collection).
    if collection is None:
        print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
        return

    # Fetch a few documents beyond the limit so an overflow can be detected.
    cursor = collection.find(db_query).sort('xpos').limit(MONGO_QUERY_RESULTS_LIMIT + 5)
    for i, variant_dict in enumerate(cursor):
        if i >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % MONGO_QUERY_RESULTS_LIMIT)

        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        # The database query is only a pre-filter; the full filter is re-checked here.
        if passes_variant_filter(variant, variant_filter)[0]:
            yield variant
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
    """Stream the family's variants that satisfy ``variant_filter``, sorted by xpos.

    The query is built by the module-level ``_make_db_query`` from both
    filters and run against the family's collection. Each returned document
    is turned into a Variant and re-checked with ``passes_variant_filter``
    before being yielded.
    """
    query = _make_db_query(genotype_filter, variant_filter)
    family_collection = self._get_family_collection(project_id, family_id)
    cursor = family_collection.find(query).sort('xpos')
    for raw_variant in cursor:
        candidate = Variant.fromJSON(raw_variant)
        verdict = passes_variant_filter(candidate, variant_filter)
        if not verdict[0]:
            continue
        yield candidate
def get_variants(self, project_id, variant_filter=None):
    """Yield the project's variants that pass ``variant_filter``, ordered by xpos.

    Args:
        project_id: project whose variant collection is queried.
        variant_filter: optional mapping of ``VariantFilter`` keyword
            arguments (it is unpacked with ``**``). The raw value is also
            what ``self._make_db_query`` receives.

    Yields:
        Each matching Variant exactly once. With no filter every document
        returned by the query is yielded.
    """
    variant_filter_t = VariantFilter(**(variant_filter if variant_filter else {}))
    db_query = self._make_db_query(None, variant_filter)
    collection = self._get_project_collection(project_id)
    for variant_dict in collection.find(db_query).sort('xpos'):
        variant = Variant.fromJSON(variant_dict)
        # A single combined test: the previous two separate `if` statements
        # could yield the same variant twice when no filter was supplied.
        if variant_filter is None or passes_variant_filter(variant, variant_filter_t)[0]:
            yield variant
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
    """Yield a family's variants that pass ``variant_filter``, ordered by xpos.

    Args:
        project_id: project the family belongs to.
        family_id: family whose variant collection is queried.
        genotype_filter: forwarded to ``self._make_db_query``.
        variant_filter: forwarded to ``self._make_db_query`` and re-applied to
            every returned document through ``passes_variant_filter``.

    Yields:
        Variant objects annotated by ``self.add_annotations_to_variant``.

    If no collection exists for the family, a message is printed and nothing
    is yielded. No result-size limit is applied.
    """
    db_query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    # Compare with None explicitly: PyMongo 4 collection objects raise
    # NotImplementedError when truth-tested (assumes the helper returns
    # either None or a PyMongo collection).
    if collection is None:
        print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
        return

    for variant_dict in collection.find(db_query).sort('xpos'):
        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        # The database query is only a pre-filter; the full filter is re-checked here.
        if passes_variant_filter(variant, variant_filter)[0]:
            yield variant
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
    """Yield a family's variants that pass ``variant_filter``, ordered by xpos.

    Args:
        project_id: project the family belongs to.
        family_id: family whose variant collection is queried.
        genotype_filter: forwarded to ``self._make_db_query``.
        variant_filter: forwarded to ``self._make_db_query`` and re-applied to
            every returned document through ``passes_variant_filter``.

    Yields:
        Variant objects annotated by ``self.add_annotations_to_variant``.

    Raises:
        Exception: while iterating, once the query has returned more than
            ``MONGO_QUERY_RESULTS_LIMIT`` documents.

    If no collection exists for the family, a message is printed and nothing
    is yielded.
    """
    db_query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    # Compare with None explicitly: PyMongo 4 collection objects raise
    # NotImplementedError when truth-tested (assumes the helper returns
    # either None or a PyMongo collection).
    if collection is None:
        print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
        return

    # Ask for slightly more than the limit; receiving more than the limit
    # is what signals that the search is too broad.
    results = collection.find(db_query).sort('xpos').limit(MONGO_QUERY_RESULTS_LIMIT + 5)
    for i, variant_dict in enumerate(results):
        if i >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % MONGO_QUERY_RESULTS_LIMIT)

        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        if passes_variant_filter(variant, variant_filter)[0]:
            yield variant
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None, quality_filter=None, indivs_to_consider=None, user=None):
    """Yield a family's variants that pass ``variant_filter``, ordered by xpos.

    Args:
        project_id: project the family belongs to.
        family_id: family whose variant collection is queried.
        genotype_filter: forwarded to ``self._make_db_query``.
        variant_filter: forwarded to ``self._make_db_query`` and re-applied to
            every returned document through ``passes_variant_filter``.
        quality_filter, indivs_to_consider, user: accepted but not used here.

    Yields:
        Variant objects carrying ``project_id`` / ``family_id`` extras and the
        annotations added by ``self.add_annotations_to_variants``.

    Raises:
        Exception: while iterating, once the query has returned more than
            ``settings.VARIANT_QUERY_RESULTS_LIMIT`` documents.

    If no collection exists for the family, an error is logged and nothing
    is yielded.
    """
    db_query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    # Compare with None explicitly: PyMongo 4 collection objects raise
    # NotImplementedError when truth-tested (assumes the helper returns
    # either None or a PyMongo collection).
    if collection is None:
        logger.error("Error: mongodb collection not found for project %s family %s ", project_id, family_id)
        return

    max_results = settings.VARIANT_QUERY_RESULTS_LIMIT
    # MongoDB rejects an empty $and array, so only wrap the conditions in
    # $and when there is at least one of them.
    if db_query:
        mongo_query = {'$and': [{k: v} for k, v in db_query.items()]}
    else:
        mongo_query = {}

    # Fetch a few documents beyond the limit so an overflow can be detected.
    cursor = collection.find(mongo_query).sort('xpos').limit(max_results + 5)
    for i, variant_dict in enumerate(cursor):
        if i >= max_results:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % max_results)

        variant = Variant.fromJSON(variant_dict)
        variant.set_extra('project_id', project_id)
        variant.set_extra('family_id', family_id)
        self.add_annotations_to_variants([variant], project_id)
        if passes_variant_filter(variant, variant_filter)[0]:
            yield variant
def get_variants_in_gene(self, project_id, gene_id, variant_filter=None):
    """Return the project's variants in ``gene_id`` that pass ``variant_filter``.

    Args:
        project_id: project whose variant collection is queried.
        gene_id: gene added to (a copy of) the filter.
        variant_filter: optional filter; it is deep-copied, so the caller's
            object is never modified. A fresh ``VariantFilter`` is used when None.

    Returns:
        A list of Variant objects sorted by ``unique_tuple()``.
    """
    if variant_filter is None:
        modified_variant_filter = VariantFilter()
    else:
        modified_variant_filter = copy.deepcopy(variant_filter)
    modified_variant_filter.add_gene(gene_id)

    db_query = self._make_db_query(None, modified_variant_filter)
    collection = self._get_project_collection(project_id)

    cursor = collection.find(db_query).hint([('gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)])
    variants = []
    for variant_dict in cursor:
        variant = Variant.fromJSON(variant_dict)
        # Only element [0] of the result is the pass/fail flag (that is how
        # the get_variants methods consume it); testing the whole return
        # value would accept every variant.
        if passes_variant_filter(variant, modified_variant_filter)[0]:
            variants.append(variant)

    variants.sort(key=lambda v: v.unique_tuple())
    return variants
def get_project_variants_in_gene(self, project_id, gene_id, variant_filter=None):
    """Return the project's variants in ``gene_id`` that pass ``variant_filter``.

    Args:
        project_id: project whose variant collection is queried.
        gene_id: gene added to (a copy of) the filter.
        variant_filter: optional filter; it is deep-copied, so the caller's
            object is never modified. A fresh ``VariantFilter`` is used when None.

    Returns:
        A list of annotated Variant objects sorted by ``unique_tuple()``.
    """
    if variant_filter is None:
        modified_variant_filter = VariantFilter()
    else:
        modified_variant_filter = copy.deepcopy(variant_filter)
    modified_variant_filter.add_gene(gene_id)

    db_query = self._make_db_query(None, modified_variant_filter)
    logger.info("Project Gene Search: %s all variants query: %s", project_id, db_query)
    collection = self._get_project_collection(project_id)

    # we have to collect list in memory here because mongo can't sort on xpos,
    # as result size can get too big.
    # need to find a better way to do this.
    cursor = collection.find(db_query).hint([('db_gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)])
    variants = [Variant.fromJSON(variant_dict) for variant_dict in cursor]
    # Annotate the whole batch in one call before filtering.
    self.add_annotations_to_variants(variants, project_id)

    # Only element [0] of the result is the pass/fail flag (that is how the
    # get_variants methods consume it); testing the whole return value would
    # keep every variant.
    variants = [v for v in variants if passes_variant_filter(v, modified_variant_filter)[0]]
    variants.sort(key=lambda v: v.unique_tuple())
    return variants
def get_variants_in_gene(self, project_id, family_id, gene_id, genotype_filter=None, variant_filter=None):
    """Yield a family's variants in ``gene_id`` that pass the filters.

    Args:
        project_id: project the family belongs to.
        family_id: family whose variant collection is queried.
        gene_id: gene added to (a copy of) the variant filter.
        genotype_filter: forwarded to the module-level ``_make_db_query``.
        variant_filter: optional filter; it is deep-copied, so the caller's
            object is never modified. A fresh ``VariantFilter`` is used when None.

    Yields:
        Variant objects in ``unique_tuple()`` order.
    """
    if variant_filter is None:
        modified_variant_filter = VariantFilter()
    else:
        modified_variant_filter = copy.deepcopy(variant_filter)
    modified_variant_filter.add_gene(gene_id)

    db_query = _make_db_query(genotype_filter, modified_variant_filter)
    collection = self._get_family_collection(project_id, family_id)

    # we have to collect list in memory here because mongo can't sort on xpos,
    # as result size can get too big.
    # need to find a better way to do this.
    cursor = collection.find(db_query).hint([('gene_ids', pymongo.ASCENDING), ('xpos', pymongo.ASCENDING)])
    variants = []
    for variant_dict in cursor:
        variant = Variant.fromJSON(variant_dict)
        # Only element [0] of the result is the pass/fail flag (that is how
        # the get_variants methods consume it); testing the whole return
        # value would accept every variant.
        if passes_variant_filter(variant, modified_variant_filter)[0]:
            variants.append(variant)

    variants.sort(key=lambda v: v.unique_tuple())
    for v in variants:
        yield v
def get_variants(self, project_id, family_id, genotype_filter=None, variant_filter=None):
    """Yield a family's variants that pass ``variant_filter``, ordered by xpos.

    Args:
        project_id: project the family belongs to.
        family_id: family whose variant collection is queried.
        genotype_filter: forwarded to ``self._make_db_query``.
        variant_filter: forwarded to ``self._make_db_query`` and re-applied to
            every returned document through ``passes_variant_filter``.

    Yields:
        Variant objects annotated by ``self.add_annotations_to_variant``.

    Raises:
        Exception: while iterating, once the query has returned more than
            ``MONGO_QUERY_RESULTS_LIMIT`` documents.

    If no collection exists for the family, a message is printed and nothing
    is yielded. Once the generator has been fully consumed, the number of
    documents returned by the query and the number that passed the filter
    are written to stderr.
    """
    db_query = self._make_db_query(genotype_filter, variant_filter)
    collection = self._get_family_collection(project_id, family_id)
    # Compare with None explicitly: PyMongo 4 collection objects raise
    # NotImplementedError when truth-tested (assumes the helper returns
    # either None or a PyMongo collection).
    if collection is None:
        print("Error: mongodb collection not found for project %s family %s " % (project_id, family_id))
        return

    counters = OrderedDict([('returned_by_query', 0), ('passes_variant_filter', 0)])

    # Every top-level condition becomes its own $and clause. MongoDB rejects
    # an empty $and array, so an empty query is sent as a plain {}.
    mongo_query = {'$and': [{k: v} for k, v in db_query.items()]} if db_query else {}
    # Fetch a few documents beyond the limit so an overflow can be detected.
    cursor = collection.find(mongo_query).sort('xpos').limit(MONGO_QUERY_RESULTS_LIMIT + 5)
    for i, variant_dict in enumerate(cursor):
        if i >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception("ERROR: this search exceeded the %s variant result size limit. Please set additional filters and try again." % MONGO_QUERY_RESULTS_LIMIT)

        variant = Variant.fromJSON(variant_dict)
        self.add_annotations_to_variant(variant, project_id)
        counters["returned_by_query"] += 1
        if passes_variant_filter(variant, variant_filter)[0]:
            counters["passes_variant_filter"] += 1
            yield variant

    # Only reached when the caller iterates the generator to the end.
    for k, v in counters.items():
        sys.stderr.write(" %s: %s\n" % (k, v))
def get_de_novo_variants(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Yield variants that are consistent with de novo inheritance in family.

    Args:
        datastore: object providing ``_make_db_query``,
            ``_get_family_collection`` and ``add_annotations_to_variant``.
        reference: accepted but not used here.
        family: supplies ``project_id``, ``family_id``, ``individuals`` and
            ``get_individuals()``.
        variant_filter: forwarded to the database query and re-applied to
            every returned variant.
        quality_filter: optional mapping of genotype quality thresholds.
            None means no user-supplied thresholds.

    Yields:
        Variant objects, ordered by xpos, whose genotypes pass the quality
        checks for every individual named in the de novo genotype filter.

    Raises:
        ValueError: if the family has no variant collection.
        Exception: if the query returns more than 5000 documents.

    This is a generator, so both errors surface while iterating, not when
    the function is called.
    """
    de_novo_filter = inheritance.get_de_novo_filter(family)
    db_query = datastore._make_db_query(de_novo_filter, variant_filter)

    collection = datastore._get_family_collection(family.project_id, family.family_id)
    # Compare with None explicitly: PyMongo 4 collection objects raise
    # NotImplementedError when truth-tested (assumes the helper returns
    # either None or a PyMongo collection).
    if collection is None:
        raise ValueError("Error: mongodb collection not found for project %s family %s " % (family.project_id, family.family_id))

    MONGO_QUERY_RESULTS_LIMIT = 5000
    # Fetch a few documents beyond the limit so an overflow can be detected.
    variant_iter = collection.find(db_query).sort('xpos').limit(MONGO_QUERY_RESULTS_LIMIT + 5)

    # get ids of parents in this family
    valid_ids = set(family.individuals)
    paternal_ids = set(i.paternal_id for i in family.get_individuals() if i.paternal_id in valid_ids)
    maternal_ids = set(i.maternal_id for i in family.get_individuals() if i.maternal_id in valid_ids)
    parental_ids = paternal_ids | maternal_ids

    # loop over all variants returned
    for i, variant_dict in enumerate(variant_iter):
        # `>=` so that exactly LIMIT results are allowed, matching the
        # datastore's get_variants methods.
        if i >= MONGO_QUERY_RESULTS_LIMIT:
            raise Exception("MONGO_QUERY_RESULTS_LIMIT of %s exceeded for query: %s" % (MONGO_QUERY_RESULTS_LIMIT, db_query))

        variant = Variant.fromJSON(variant_dict)
        datastore.add_annotations_to_variant(variant, family.project_id)
        if not passes_variant_filter(variant, variant_filter)[0]:
            continue

        # handle genotype filters
        if len(parental_ids) != 2:
            # ordinary filters for non-trios
            for indiv_id in de_novo_filter.keys():
                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter):
                    break
            else:
                yield variant
        else:
            # for trios use Mark's recommended filters for de-novo variants:
            # Hard-coded thresholds:
            #   1) Child must have > 10% of combined Parental Read Depth
            #   2) MinimumChildGQscore >= 20
            #   3) MaximumParentAlleleBalance <= 5%
            # Adjustable filters:
            #   Variants should PASS
            #   Child AB should be >= 20

            # compute parental read depth for filter 1
            total_parental_read_depth = 0
            for indiv_id in parental_ids:
                genotype = variant.get_genotype(indiv_id)
                if genotype.extras and 'dp' in genotype.extras and genotype.extras['dp'] != '.':
                    total_parental_read_depth += int(genotype.extras['dp'])
                else:
                    total_parental_read_depth = None  # both parents must have DP to use the parental_read_depth filters
                    break

            for indiv_id in de_novo_filter.keys():
                # Copy before modifying. quality_filter defaults to None, in
                # which case only the hard-coded thresholds apply.
                quality_filter_temp = quality_filter.copy() if quality_filter is not None else {}
                if indiv_id in parental_ids:
                    # handle one of the parents
                    quality_filter_temp['max_ab'] = 5
                else:
                    # handle child
                    quality_filter_temp['min_gq'] = 20
                    if total_parental_read_depth is not None:
                        quality_filter_temp['min_dp'] = total_parental_read_depth * 0.1

                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter_temp):
                    break
            else:
                # Reached only when no individual failed its filter.
                yield variant
def get_de_novo_variants(datastore, reference, family, variant_filter=None, quality_filter=None):
    """
    Yield variants that are consistent with de novo inheritance in family.

    Args:
        datastore: object providing ``_make_db_query``,
            ``_get_family_collection`` and ``add_annotations_to_variant``.
        reference: accepted but not used here.
        family: supplies ``project_id``, ``family_id``, ``individuals`` and
            ``get_individuals()``.
        variant_filter: forwarded to the database query and re-applied to
            every returned variant.
        quality_filter: optional mapping of genotype quality thresholds.
            None means no user-supplied thresholds.

    Yields:
        Variant objects, ordered by xpos, whose genotypes pass the quality
        checks for every individual named in the de novo genotype filter.

    Raises:
        ValueError: if the family has no variant collection. This is a
            generator, so the error surfaces on first iteration.
    """
    de_novo_filter = inheritance.get_de_novo_filter(family)
    db_query = datastore._make_db_query(de_novo_filter, variant_filter)

    collection = datastore._get_family_collection(family.project_id, family.family_id)
    # Compare with None explicitly: PyMongo 4 collection objects raise
    # NotImplementedError when truth-tested (assumes the helper returns
    # either None or a PyMongo collection).
    if collection is None:
        raise ValueError("Error: mongodb collection not found for project %s family %s " % (family.project_id, family.family_id))

    variant_iter = collection.find(db_query).sort('xpos')

    # get ids of parents in this family
    valid_ids = set(family.individuals)
    paternal_ids = set(i.paternal_id for i in family.get_individuals() if i.paternal_id in valid_ids)
    maternal_ids = set(i.maternal_id for i in family.get_individuals() if i.maternal_id in valid_ids)
    parental_ids = paternal_ids | maternal_ids

    # loop over all variants returned
    for variant_dict in variant_iter:
        variant = Variant.fromJSON(variant_dict)
        datastore.add_annotations_to_variant(variant, family.project_id)
        if not passes_variant_filter(variant, variant_filter)[0]:
            continue

        # handle genotype filters
        if len(parental_ids) != 2:
            # ordinary filters for non-trios
            for indiv_id in de_novo_filter.keys():
                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter):
                    break
            else:
                yield variant
        else:
            # for trios use Mark's recommended filters for de-novo variants:
            # Hard-coded thresholds:
            #   1) Child must have > 10% of combined Parental Read Depth
            #   2) MinimumChildGQscore >= 20
            #   3) MaximumParentAlleleBalance <= 5%
            # Adjustable filters:
            #   Variants should PASS
            #   Child AB should be >= 20

            # compute parental read depth for filter 1
            total_parental_read_depth = 0
            for indiv_id in parental_ids:
                genotype = variant.get_genotype(indiv_id)
                # A '.' placeholder cannot be converted by int(), so it is
                # treated the same as a missing DP value.
                if genotype.extras and 'dp' in genotype.extras and genotype.extras['dp'] != '.':
                    total_parental_read_depth += int(genotype.extras['dp'])
                else:
                    total_parental_read_depth = None  # both parents must have DP to use the parental_read_depth filters
                    break

            for indiv_id in de_novo_filter.keys():
                # Copy before modifying. quality_filter defaults to None, in
                # which case only the hard-coded thresholds apply.
                quality_filter_temp = quality_filter.copy() if quality_filter is not None else {}
                if indiv_id in parental_ids:
                    # handle one of the parents
                    quality_filter_temp['max_ab'] = 5
                else:
                    # handle child
                    quality_filter_temp['min_gq'] = 20
                    if total_parental_read_depth is not None:
                        quality_filter_temp['min_dp'] = total_parental_read_depth * 0.1

                genotype = variant.get_genotype(indiv_id)
                if not passes_genotype_filter(genotype, quality_filter_temp):
                    break
            else:
                # Reached only when no individual failed its filter.
                yield variant