Пример #1
0
    def _validate_nucleotides(self, sm_dict):
        """
        Validate the summary nucleotides against the genetic variants. Snps that are ambiguous or that contain a
        non-allowed allele are removed, then the remaining snps are flipped where possible and removed where
        flipping fails.

        The number of snps removed (or flipped) at each stage is recorded in self._sum_error_dict as a plain int.

        :param sm_dict: The summary dictionary to hold information so that we can filter it
        :type sm_dict: dict

        :return: Nothing, filter the arrays if required and flip betas if the allele is flipped
        """

        # Ambiguous
        # Construct the summary nucleotide from the effect and alternative allele columns of each summary line
        effected_allele = mc.line_array(self.sm_effect_allele, sm_dict[self.sm_lines])
        alt_allele = mc.line_array(self.sm_alt_allele, sm_dict[self.sm_lines])
        sm_dict[self.nucleotide] = np.array([Nucleotide(e, a) for e, a in zip(effected_allele, alt_allele)])

        # Keep a snp only if neither the summary nor the variant Nucleotide is ambiguous
        filter_ambiguous = [(sm_nuc.to_tuple() not in self.ambiguous_snps) and
                            ((var_nuc.a1, var_nuc.a2) not in self.ambiguous_snps)
                            for sm_nuc, var_nuc in zip(sm_dict[self.nucleotide], sm_dict[self.sm_variants])]
        self._sum_error_dict["Ambiguous_SNP"] = filter_ambiguous.count(False)
        mc.filter_array(sm_dict, filter_ambiguous, "Ambiguous Snps")

        # Sanity Check
        # Keep a snp only if all four alleles (summary and variant) are in the allowed set (Only a t c and g)
        allowed_filter = [all(allele in self.allowed_alleles
                              for allele in (sm_nuc.a1, sm_nuc.a2, var_nuc.a1, var_nuc.a2))
                          for sm_nuc, var_nuc in zip(sm_dict[self.nucleotide], sm_dict[self.sm_variants])]
        self._sum_error_dict["Non_Allowed_Allele"] = allowed_filter.count(False)
        mc.filter_array(sm_dict, allowed_filter, "Snp Sanity Check")

        # Determine Flipping
        # Construct a flip status of 1, 0, -1 for No flipping, failed flipping, and flipped successfully which we can
        # multiply our betas by
        flip = np.array([self._flip_nucleotide(var_nuc, sm_nuc) for var_nuc, sm_nuc in
                         zip(sm_dict[self.sm_variants], sm_dict[self.nucleotide])])
        sm_dict["Flip"] = flip

        # Vectorised comparisons give boolean arrays even when there are no snps left, whereas a comprehension over
        # an empty array would produce a float64 array that numpy does not accept as an index
        self._sum_error_dict["Non_Matching"] = int(np.sum(flip == 0))
        self._sum_error_dict["Flipped"] = int(np.sum(flip == -1))
        mc.filter_array(sm_dict, flip != 0, "Snp Flipping")

        # Now we have filtered away any errors, multiply the dicts beta and log_odds elements by 1 or -1 based on no
        # flipping or requiring flipping. "Flip" is re-read from sm_dict rather than using the local array, as the
        # entries of sm_dict are presumably replaced by mc.filter_array above - TODO confirm
        sm_dict[self.beta] = sm_dict[self.beta] * sm_dict["Flip"]
        sm_dict[self.log_odds] = sm_dict[self.log_odds] * sm_dict["Flip"]
Пример #2
0
    def _valid_snps_lines_and_variants(self):
        """
        Collect the summary lines whose snp is present in the validation snps loaded via self.load_variants, along
        with the variant information the genetic indexer returns for those snps.

        The duplicate count returned by self.load_variants is added to the "Total Duplicates" entry of
        self._sum_error_dict, and the number of summary snps not found in the validation snps is stored under
        "Invalid_Snps".

        :return: A dict holding the filtered summary lines under self.sm_lines and the variants under
            self.sm_variants, followed by the number of validation snps that were loaded
        :rtype: (dict, int)
        """
        # The snps to validate against, the genetic indexer to pull variants from, and the duplicate count
        validation_snps, indexer, duplicates = self.load_variants()
        self._sum_error_dict["Total Duplicates"] += duplicates
        print(f"Loaded {len(validation_snps)} snps to check against the summary stats")

        # Summary lines, and the snp id column of those lines as an array
        sm_line = self._line_by_line_summary(validation_snps)
        sm_snps = mc.line_array(self.sm_snp_id, sm_line)

        # True for each summary snp that is within the validation snps, those that are not are counted and dropped
        variants_filter = np.array([snp in validation_snps for snp in sm_snps])
        invalid_total = len(variants_filter) - np.sum(variants_filter)
        self._sum_error_dict[f"Invalid_Snps"] = invalid_total
        filtered_snps = sm_snps[variants_filter]

        # Bed files also have morgan position which we don't currently use, so for them an extra True is passed to
        # filter it out
        if self.gen_type == ".bed":
            sid_args = (filtered_snps, True)
        else:
            sid_args = (filtered_snps,)
        sm_variants = indexer.info_from_sid(*sid_args)

        print(f"Found valid lines {len(sm_line)} and Variants {len(sm_variants)}")

        # Arrays are returned as a dict for further cleaning and filtering, alongside the validation snp count
        sm_dict = {
            self.sm_lines: np.array(sm_line[variants_filter]),
            self.sm_variants: np.array(sm_variants),
        }
        return sm_dict, len(validation_snps)
Пример #3
0
    def _validation_equality(self, line_index, variant_key, summary_dict, line_type=None):
        """
        Cross check a column of the summary lines against the same attribute of the genetic variants, removing any
        snp where the two disagree.

        Not all summary statistics may have chromosome or bp indexes, in which case the information can be taken
        from the genetic variant. When the summary does hold the information this generalised check makes sure it
        agrees with the genetic file, and it can be used for other equality checks as well.

        :param line_index: The index of the summary line column to construct an array from
        :type line_index: int

        :param variant_key: Key to access Variant via getitem (lower-cased first) and to name the error dict entry
        :type variant_key: str

        :param summary_dict: The summary dictionary to hold information so that we can filter it
        :type summary_dict: dict

        :param line_type: The type the summary column is returned as, defaults to None which will return a string
        :type line_type: None | type

        :return: Nothing, construct and use the filter on summary_dict then stop
        """
        # One array from the summary lines and one from the variants, which are compared element by element
        from_summary = mc.line_array(line_index, summary_dict[self.sm_lines], line_type)
        from_variants = mc.variant_array(variant_key.lower(), summary_dict[self.sm_variants])
        matches = from_summary == from_variants

        # Record how many snps failed to match, then keep only the snps that did
        mismatch_total = len(matches) - np.sum(matches)
        self._sum_error_dict[f"Miss Matching {variant_key}"] = mismatch_total
        mc.filter_array(summary_dict, matches, variant_key)
Пример #4
0
 def _validate_info(self, sm_line):
     """Construct infos if they exist in the summary stats else return an array of length of summary dict"""
     if self.sm_info is not None:
         infos = mc.line_array(self.sm_info, sm_line, float)
     else:
         infos = np.empty(len(sm_line))
         infos.fill(-1)
     return infos
Пример #5
0
    def _validation_finite(self, summary_dict, line_index, summary_key):
        """
        Numeric columns need to be screened for values being finite and not equal to zero. Unlike
        _validation_equality this method also adds the column to summary_dict, as it is new information that is not
        within the genetic variants rather than a screen of pre-existing information.

        The number of values removed is stored in self._sum_error_dict under "Non Finite <summary_key>".

        :param summary_dict: The summary dictionary to hold information so that we can filter it
        :type summary_dict: dict

        :param line_index: The index of the summary line column to construct an array from
        :type line_index: int

        :param summary_key: A string key that is used for accessing this attribute
        :type summary_key: str

        :return: Nothing, construct the filter and then filter all attributes within the summary dict
        """
        # Construct an array for this numeric summary_key and add it to summary dict under the name of summary_key
        values = mc.line_array(line_index, summary_dict[self.sm_lines], float)
        summary_dict[summary_key] = values

        # Keep anything that is finite and not equal to zero (nan and +/- inf both fail np.isfinite). The mask is
        # computed on the whole array at once, so it is a boolean array even when there are no values, whereas a
        # comprehension over an empty array would produce a float64 array that numpy does not accept as an index
        numeric = np.asarray(values, dtype=float)
        obj_filter = np.isfinite(numeric) & (numeric != 0)
        self._sum_error_dict[f"Non Finite {summary_key}"] = len(obj_filter) - np.sum(obj_filter)
        mc.filter_array(summary_dict, obj_filter, summary_key)