def _validate_nucleotides(self, sm_dict):
    """
    Validate the summary nucleotides against the genetic variants: drop ambiguous snps, drop snps with
    non-allowed alleles, then work out whether each remaining snp needs flipping and drop those where flipping
    fails.

    :param sm_dict: The summary dictionary to hold information so that we can filter it. Must already contain
        the summary lines (``self.sm_lines``), the variants (``self.sm_variants``) and the ``self.beta`` /
        ``self.log_odds`` arrays.
    :type sm_dict: dict

    :return: Nothing, filter the arrays if required and flip betas / log odds if the allele is flipped
    """

    # Ambiguous
    # Construct the summary Nucleotide (effect allele, alternative allele) of every summary line
    effect_alleles = mc.line_array(self.sm_effect_allele, sm_dict[self.sm_lines])
    alt_alleles = mc.line_array(self.sm_alt_allele, sm_dict[self.sm_lines])
    sm_dict[self.nucleotide] = np.array([Nucleotide(e, a) for e, a in zip(effect_alleles, alt_alleles)])

    # Keep a snp only when neither the summary nor the variant Nucleotide is ambiguous
    filter_ambiguous = [
        sm_nuc.to_tuple() not in self.ambiguous_snps and (var_nuc.a1, var_nuc.a2) not in self.ambiguous_snps
        for sm_nuc, var_nuc in zip(sm_dict[self.nucleotide], sm_dict[self.sm_variants])
    ]
    self._sum_error_dict["Ambiguous_SNP"] = len(filter_ambiguous) - np.sum(filter_ambiguous)
    mc.filter_array(sm_dict, filter_ambiguous, "Ambiguous Snps")

    # Sanity Check
    # Keep a snp only when all four alleles (summary and variant) are in the allowed set
    allowed = self.allowed_alleles
    allowed_filter = [
        all(allele in allowed for allele in (sm_nuc.a1, sm_nuc.a2, var_nuc.a1, var_nuc.a2))
        for sm_nuc, var_nuc in zip(sm_dict[self.nucleotide], sm_dict[self.sm_variants])
    ]
    self._sum_error_dict["Non_Allowed_Allele"] = len(allowed_filter) - np.sum(allowed_filter)
    mc.filter_array(sm_dict, allowed_filter, "Snp Sanity Check")

    # Determine Flipping
    # Construct a flip status of 1, 0, -1 for No flipping, failed flipping, and flipped successfully which we can
    # multiply our betas by
    flip = np.array([
        self._flip_nucleotide(var_nuc, sm_nuc)
        for var_nuc, sm_nuc in zip(sm_dict[self.sm_variants], sm_dict[self.nucleotide])
    ])
    sm_dict["Flip"] = flip

    # Element-wise comparisons always yield a boolean mask, even when no snps are left; building the mask from an
    # empty list would produce a float64 array, which numpy refuses to use as an index.
    filter_flipped = flip != 0
    self._sum_error_dict["Non_Matching"] = int(np.sum(flip == 0))
    self._sum_error_dict["Flipped"] = int(np.sum(flip == -1))
    mc.filter_array(sm_dict, filter_flipped, "Snp Flipping")

    # Now we have filtered away any errors, multiply the dict's beta and log_odds elements by 1 or -1 based on no
    # flipping or requiring flipping. "Flip" is re-read from sm_dict because it is expected to have been filtered
    # alongside the other arrays by mc.filter_array.
    sm_dict[self.beta] = sm_dict[self.beta] * sm_dict["Flip"]
    sm_dict[self.log_odds] = sm_dict[self.log_odds] * sm_dict["Flip"]
def _valid_snps_lines_and_variants(self):
    """
    We will load our variants from our validation and core samples and use those to check if the snp found in the
    summary line is within our validation and core sample sets of snps. If this is the case, then we will add the
    line to sm_line as well as a Variant object of the current valid snp to sm_variants

    :return: A dict holding the filtered summary lines (under ``self.sm_lines``) and their variants (under
        ``self.sm_variants``), and the number of validation snps that were loaded
    :rtype: (dict, int)
    """

    # Load the validation snps from the genetic file, as well as the genetic indexer to get the variants from.
    validation_snps, indexer, duplicates = self.load_variants()
    self._sum_error_dict["Total Duplicates"] += duplicates
    print(f"Loaded {len(validation_snps)} snps to check against the summary stats")

    # Extract the lines from the summary, and create an array of snps found in the summary statistics
    sm_line = self._line_by_line_summary(validation_snps)
    sm_snps = mc.line_array(self.sm_snp_id, sm_line)

    # Filter out snps that were not found in our validation snp set. The dtype is forced to bool so that an empty
    # summary still yields a valid boolean mask (an empty list would otherwise become a float64 array, which
    # numpy refuses to index with).
    # NOTE(review): if validation_snps is a list rather than a set/dict this membership test is linear per snp
    # - confirm against load_variants.
    variants_filter = np.array([snp in validation_snps for snp in sm_snps], dtype=bool)
    self._sum_error_dict["Invalid_Snps"] = len(variants_filter) - np.sum(variants_filter)
    filtered_snps = sm_snps[variants_filter]

    # Create an array of Variants from the gen indexer based on valid snps found in the summary stats
    if self.gen_type == ".bed":
        # Bed files also have morgan position which we don't currently use so filter out with True
        sm_variants = indexer.info_from_sid(filtered_snps, True)
    else:
        sm_variants = indexer.info_from_sid(filtered_snps)

    # Return arrays as a dict for further cleaning and filtering, and the validation snp count
    print(f"Found valid lines {len(sm_line)} and Variants {len(sm_variants)}")
    sm_dict = {
        self.sm_lines: np.array(sm_line[variants_filter]),
        self.sm_variants: np.array(sm_variants),
    }
    return sm_dict, len(validation_snps)
def _validation_equality(self, line_index, variant_key, summary_dict, line_type=None):
    """
    Cross check one column of the summary statistics against the same attribute of the genetic variants.

    Not every summary file carries chromosome or base pair columns, in which case the value can be taken from the
    genetic variant instead. When the column does exist it should agree with the genetic file, so any snp where
    the two values differ is filtered out. The method is generic and can be used for any attribute that should be
    equal in both sources.

    :param line_index: The index for the summary line to construct an array from
    :type line_index: int

    :param variant_key: Key to access Variant via getitem (lower-cased before use) and to label the error dict
        entry
    :type variant_key: str

    :param summary_dict: The summary dictionary to hold information so that we can filter it
    :type summary_dict: dict

    :param line_type: The type of the summary line to be returned as, defaults to None which will return a string
    :type line_type: None | type

    :return: Nothing, construct and use the filter on summary_dict then stop
    """

    # The same attribute, once as read from the summary lines and once as held by the variants
    from_summary = mc.line_array(line_index, summary_dict[self.sm_lines], line_type)
    from_variants = mc.variant_array(variant_key.lower(), summary_dict[self.sm_variants])

    # True where both sources agree; False entries are the snps that will be removed
    matches = from_summary == from_variants

    mismatch_total = len(matches) - np.sum(matches)
    self._sum_error_dict[f"Miss Matching {variant_key}"] = mismatch_total
    mc.filter_array(summary_dict, matches, variant_key)
def _validate_info(self, sm_line): """Construct infos if they exist in the summary stats else return an array of length of summary dict""" if self.sm_info is not None: infos = mc.line_array(self.sm_info, sm_line, float) else: infos = np.empty(len(sm_line)) infos.fill(-1) return infos
def _validation_finite(self, summary_dict, line_index, summary_key):
    """
    Numeric columns need to be screened for values being finite and not equal to zero. Unlike
    _validation_equality this method also adds the column to summary_dict, as it creates new information that is
    not within the genetic variants rather than just screening pre-existing information

    :param summary_dict: The summary dictionary to hold information so that we can filter it
    :type summary_dict: dict

    :param line_index: The index for the summary line to construct an array from
    :type line_index: int

    :param summary_key: A string key that is used for accessing this attribute
    :type summary_key: str

    :return: Nothing, construct the filter and then filter all attributes within the summary dict
    """

    # Construct an array for this numeric summary_key and add it to summary dict under the name of summary_key
    column = mc.line_array(line_index, summary_dict[self.sm_lines], float)
    summary_dict[summary_key] = column

    # Filter out anything that is not finite or is equal to zero. The vectorised form always yields a boolean
    # mask, including for an empty column; a mask built from an empty list would be float64, which numpy refuses
    # to index with.
    numeric = np.asarray(column, dtype=float)
    obj_filter = np.isfinite(numeric) & (numeric != 0)

    self._sum_error_dict[f"Non Finite {summary_key}"] = len(obj_filter) - np.sum(obj_filter)
    mc.filter_array(summary_dict, obj_filter, summary_key)