Пример #1
0
    def _validation_equality(self, line_index, variant_key, summary_dict, line_type=None):
        """
        Cross check one summary column against the matching attribute of the genetic variants.

        Summary statistics do not always carry chromosome or bp information, in which case it is taken from the
        genetic variant. When the summary file does carry it, the two sources are compared element by element and the
        comparison result is handed to mc.filter_array as the filter for summary_dict. The number of disagreements is
        recorded in self._sum_error_dict under the key "Miss Matching <variant_key>". The method is generic and can be
        used for any attribute that should be equal in both sources.

        :param line_index: The index of the column within each summary line to construct the summary array from
        :type line_index: int

        :param variant_key: Key recorded in the error dict and passed to filter_array; its lower case form is the
            attribute name requested from the variants
        :type variant_key: str

        :param summary_dict: The summary dictionary holding the summary lines and variants, filtered in place
        :type summary_dict: dict

        :param line_type: Type passed through to mc.line_array for the summary values, defaults to None
        :type line_type: None | type

        :return: Nothing, the filter is constructed and applied to summary_dict
        """
        # Values as written in the summary file, then the same attribute as stored on the variants
        from_summary = mc.line_array(line_index, summary_dict[self.sm_lines], line_type)
        attribute = variant_key.lower()
        from_variants = mc.variant_array(attribute, summary_dict[self.sm_variants])

        # True where both sources agree; every False entry is counted as a mismatch
        matches = from_summary == from_variants
        mismatch_count = len(matches) - np.sum(matches)
        self._sum_error_dict[f"Miss Matching {variant_key}"] = mismatch_count

        mc.filter_array(summary_dict, matches, variant_key)
Пример #2
0
    def pgs_filter_snps(self, write=True):
        """
        Filter the cleaned summary snps of the target chromosome in chunks.

        Large numbers of snps and individuals can lead to significant memory issues. The snp names, frequencies and
        base pair positions are therefore split into the same chunks and each chunk is passed to filter_snp_chunk on
        its own, thus allowing the filtering to run with less memory.

        :param write: Passed through to write_summary_files, defaults to True
        :type write: bool

        :return: The result of write_summary_files for the filtered summary dict
        """
        # The reference panel is constructed before the timer is started
        gen_file = self.construct_reference_panel()
        t0 = time.time()

        # Load the cleaned summary statistics and split names, positions and frequencies into matching chunks
        cleaned_name = f"Cleaned_{self.target_chromosome}.csv"
        sm_dict = self.sm_dict_from_csv(self.summary_directory, cleaned_name)
        snp_list, chunks = self.chunked_snp_names(sm_dict, chunk_return=True)
        all_positions = mc.variant_array(self.bp_position.lower(), sm_dict[self.sm_variants])
        bp_positions = np.array_split(all_positions, chunks)
        freqs = np.array_split(sm_dict[self.freq], chunks)

        # Filter chunk by chunk; the chunk index is 1-based and passed on together with the total chunk count
        accepted_snps = []
        for index, (snps, f, bp) in enumerate(zip(snp_list, freqs, bp_positions), start=1):
            chunk_result = self.filter_snp_chunk(gen_file, snps, f, bp, index, len(snp_list))
            accepted_snps.append(chunk_result)

        # Apply the combined per-chunk results as one filter over the whole summary dict
        mc.filter_array(sm_dict, flatten(accepted_snps), "Filter")
        print(f"Found {len(sm_dict[self.sm_variants])} Snps that passed filtering")

        # Log what was filtered, then write out if requested and return the result
        mc.error_dict_to_terminal(self._filter_error_dict, "PGS_Filter_Snps", t0)
        return self.write_summary_files(sm_dict, write, self.target_chromosome, "Filtered", self.filter_directory)
Пример #3
0
    def pgs_clean_summary_stats(self, write=True):
        """
        This will take the summary statistics and access the validatable snps, found by cross referencing the genetic
        validation constructed from the reference file, and clean them of possible errors. It then returns an ordered on
        base pair position dictionary of information required for constructing poly-genetic scores and by default writes
        this information to a csv.

        :param write: Passed through to write_summary_files, defaults to True
        :type write: bool

        :return: The result of write_summary_files for the cleaned and ordered summary dict

        :raises AssertionError: If more snps remain after cleaning than were originally found to be validatable
        """
        t0 = time.time()

        # Clean the summary lines to only include validatable snps from our genetic samples in target_chromosome
        sm_dict, validation_snps_count = self._valid_snps_lines_and_variants()

        # Clean the summary lines of valid snps for potential errors
        # NOTE(review): the original comment says None may come back if every sample is wiped, but a None result is
        # not handled below and would fail on the subscript - confirm against _validate_summary_lines
        sm_dict = self._validate_summary_lines(sm_dict)

        # Construct the order from the base pair position
        order = np.argsort(mc.variant_array(self.bp_position.lower(), sm_dict[self.sm_variants]))

        # Check we don't have an overflow issue. Raised explicitly rather than via an assert statement so the check
        # is still performed when Python runs with optimisation (-O), which strips assert statements
        if len(order) > validation_snps_count:
            raise AssertionError(ec.snp_overflow(len(order), validation_snps_count))
        mc.filter_array(sm_dict, order, "Order")

        # Log to terminal what has been filtered / removed, then write out if requested, and return the sm dict
        mc.error_dict_to_terminal(self._sum_error_dict, "Summary_Stats", t0)
        print(f"Found {len(sm_dict[self.sm_variants])}")
        return self.write_summary_files(sm_dict, write, self.target_chromosome, "Cleaned", self.summary_directory)
Пример #4
0
    def _validation_finite(self, summary_dict, line_index, summary_key):
        """
        Numeric columns need to be screened for values being finite and not equal to zero. Unlike _validation_equality
        this method also adds the column to summary_dict, as it creates new information that is not within the genetic
        variants rather than just screening pre-existing information. The number of rejected values is recorded in
        self._sum_error_dict under the key "Non Finite <summary_key>".

        :param summary_dict: The summary dictionary to hold information so that we can filter it
        :type summary_dict: dict

        :param line_index: The index of the column within each summary line to construct the array from
        :type line_index: int

        :param summary_key: A string key under which the numeric column is stored in summary_dict
        :type summary_key: str

        :return: Nothing, construct the filter and then filter all attributes within the summary dict
        """
        # Construct an array for this numeric summary_key and add it to summary dict under the name of summary_key
        summary_dict[summary_key] = mc.line_array(line_index, summary_dict[self.sm_lines], float)

        # Keep anything that is finite and not equal to zero. The mask is computed on the whole array at once so it
        # is always boolean, even when there are no values (np.array([]) built from an empty list would be float64)
        values = np.asarray(summary_dict[summary_key], dtype=float)
        obj_filter = np.isfinite(values) & (values != 0)

        # Stored as a plain int, the same as the counts recorded by the nucleotide validation
        self._sum_error_dict[f"Non Finite {summary_key}"] = len(obj_filter) - int(np.count_nonzero(obj_filter))
        mc.filter_array(summary_dict, obj_filter, summary_key)
Пример #5
0
    def _validate_nucleotides(self, sm_dict):
        """
        Validate the summary nucleotides against the variant nucleotides in three consecutive passes.

        First any snp whose summary or variant alleles are listed in self.ambiguous_snps is filtered out, then any snp
        with an allele outside of self.allowed_alleles, and finally any snp for which flipping failed. After the last
        pass the beta and log odds entries of sm_dict are multiplied by the remaining flip statuses. The number of snps
        affected by each pass is recorded in self._sum_error_dict.

        :param sm_dict: The summary dictionary to hold information so that we can filter it
        :type sm_dict: dict

        :return: Nothing, sm_dict is filtered and its beta / log odds entries are updated
        """

        # Ambiguous
        # Pair the effect and alternative allele of every summary line into a Nucleotide
        effect_column = mc.line_array(self.sm_effect_allele, sm_dict[self.sm_lines])
        alt_column = mc.line_array(self.sm_alt_allele, sm_dict[self.sm_lines])
        sm_dict[self.nucleotide] = np.array([Nucleotide(effect, alt) for effect, alt in zip(effect_column, alt_column)])

        # Keep a snp only if neither the summary nor the variant nucleotide is ambiguous
        keep_unambiguous = [
            sm_nuc.to_tuple() not in self.ambiguous_snps and (var_nuc.a1, var_nuc.a2) not in self.ambiguous_snps
            for sm_nuc, var_nuc in zip(sm_dict[self.nucleotide], sm_dict[self.sm_variants])
        ]
        self._sum_error_dict["Ambiguous_SNP"] = len(keep_unambiguous) - np.sum(keep_unambiguous)
        mc.filter_array(sm_dict, keep_unambiguous, "Ambiguous Snps")

        # Sanity Check
        # Keep a snp only if all four alleles, summary and variant, are within the allowed alleles
        keep_allowed = [
            sm_nuc.a1 in self.allowed_alleles
            and sm_nuc.a2 in self.allowed_alleles
            and var_nuc.a1 in self.allowed_alleles
            and var_nuc.a2 in self.allowed_alleles
            for sm_nuc, var_nuc in zip(sm_dict[self.nucleotide], sm_dict[self.sm_variants])
        ]
        self._sum_error_dict["Non_Allowed_Allele"] = len(keep_allowed) - np.sum(keep_allowed)
        mc.filter_array(sm_dict, keep_allowed, "Snp Sanity Check")

        # Determine Flipping
        # The flip status is 1, 0 or -1 for no flipping, failed flipping and flipped successfully, so that the betas
        # can later be multiplied by it
        flip_status = np.array([
            self._flip_nucleotide(var_nuc, sm_nuc)
            for var_nuc, sm_nuc in zip(sm_dict[self.sm_variants], sm_dict[self.nucleotide])
        ])
        sm_dict["Flip"] = flip_status

        # A status of zero means flipping failed, those snps are removed
        keep_flippable = np.array([not (status == 0) for status in flip_status])
        self._sum_error_dict["Non_Matching"] = int(np.sum([status == 0 for status in flip_status]))
        self._sum_error_dict["Flipped"] = int(np.sum([status == -1 for status in flip_status]))
        mc.filter_array(sm_dict, keep_flippable, "Snp Flipping")

        # With the failures filtered away, re-read the flip statuses from sm_dict after the filter was applied and
        # multiply beta and log odds by 1 or -1 based on no flipping or requiring flipping
        sm_dict[self.beta] = sm_dict[self.beta] * sm_dict["Flip"]
        sm_dict[self.log_odds] = sm_dict[self.log_odds] * sm_dict["Flip"]