Example #1
    def genetic_phenotypes(self, gen_file):
        """
        Load the full genetic data for this chromosome and isolate any sample information that can be extracted from
        it. In this case, .bed load types can access more than .bgen, as sex can be extracted from the .fam file.
        """

        ph_dict = {}
        # For plink files, load the fam file then extract the fid, iid and sex information
        if self.gen_type == ".bed":
            ph_dict[self.fam] = np.array(
                PlinkObject(
                    self.select_file_on_chromosome()).get_family_identifiers())
            ph_dict[self.fid] = mc.variant_array(self.fid.lower(),
                                                 ph_dict[self.fam])
            ph_dict[self.iid] = mc.variant_array(self.iid.lower(),
                                                 ph_dict[self.fam])
            ph_dict.pop(self.fam, None)

        # Bgen doesn't have a fam equivalent, so just load the fid and iid
        # todo update to allow for sex and missing if we have loaded .sample
        elif self.gen_type == ".bgen":
            if self._snp_tools:
                ids = gen_file.iid
            else:
                ids = gen_file.iid_array()

            ph_dict[self.fid] = np.array([fid for fid, iid in ids])
            ph_dict[self.iid] = np.array([iid for fid, iid in ids])

        else:
            raise Exception("Unknown load type set")

        return ph_dict
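To make the shape of the returned dictionary concrete, here is a minimal standalone sketch of the .bgen branch above. The (fid, iid) pairs and the "FID"/"IID" keys are invented for illustration; in the class they come from gen_file and the self.fid / self.iid attributes.

import numpy as np

# Standalone sketch of the .bgen branch: build the two id arrays held in ph_dict.
# The ids below are made up purely for illustration.
ids = [("FAM1", "IND1"), ("FAM1", "IND2"), ("FAM2", "IND3")]
ph_dict = {
    "FID": np.array([fid for fid, iid in ids]),
    "IID": np.array([iid for fid, iid in ids]),
}
print(ph_dict["FID"])  # ['FAM1' 'FAM1' 'FAM2']
print(ph_dict["IID"])  # ['IND1' 'IND2' 'IND3']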
Example #2
    def _validation_equality(self, line_index, variant_key, summary_dict, line_type=None):
        """
        Not all summary statistics have chromosome or bp indexes, in which case the information can be taken from the
        genetic variant instead. However, if the information does exist, we cross check that it is equal in the
        summary and genetic files, and filter out the snp if it is not. This is the generalised method, which can also
        be used for other equality checks.

        :param line_index: The index of the summary line element to construct an array from
        :type line_index: int

        :param line_type: The type the summary line values will be returned as; defaults to None, which returns strings
        :type line_type: None | type

        :param variant_key: Key used to access the Variant via getitem and to label the error dict entry
        :type variant_key: str

        :param summary_dict: The summary dictionary to hold information so that we can filter it
        :type summary_dict: dict

        :return: Nothing; the filter is constructed and applied to summary_dict in place
        """
        # Construct arrays of the summary and genetic values for this key
        summary_array = mc.line_array(line_index, summary_dict[self.sm_lines], line_type)
        variant_array = mc.variant_array(variant_key.lower(), summary_dict[self.sm_variants])

        # Filter is True where the variant and summary values match, else False, which will remove this snp
        obj_filter = summary_array == variant_array
        self._sum_error_dict[f"Miss Matching {variant_key}"] = len(obj_filter) - np.sum(obj_filter)
        mc.filter_array(summary_dict, obj_filter, variant_key)
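The core of the check is just an element-wise comparison of two numpy arrays. A minimal, self-contained illustration of that filter idea, using invented values rather than the library's mc helpers:

import numpy as np

# Element-wise equality produces a boolean mask; False rows are the mismatching snps.
summary_array = np.array([1, 1, 2, 3])
variant_array = np.array([1, 1, 2, 4])
obj_filter = summary_array == variant_array

mismatches = len(obj_filter) - np.sum(obj_filter)  # 1 mismatching snp
kept = summary_array[obj_filter]                   # values that pass the check
print(mismatches, kept)                            # 1 [1 1 2]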
Example #3
    def pgs_filter_snps(self, write=True):
        """
        Large numbers of snps and individuals can lead to significant memory issues. This will filter the snps in
        chunks, thus allowing it to run with less memory.
        """

        # Construct the reference panel
        gen_file = self.construct_reference_panel()
        t0 = time.time()

        # Chunk the snps, freqs, and bp positions so we can load raw dosage data in a memory conscious way
        sm_dict = self.sm_dict_from_csv(
            self.summary_directory, f"Cleaned_{self.target_chromosome}.csv")
        snp_list, chunks = self.chunked_snp_names(sm_dict, chunk_return=True)
        bp_positions = np.array_split(
            mc.variant_array(self.bp_position.lower(),
                             sm_dict[self.sm_variants]), chunks)
        freqs = np.array_split(sm_dict[self.freq], chunks)

        # Filter each chunk to clean any snps that may be probabilistic
        accepted_snps = [
            self.filter_snp_chunk(gen_file, snps, f, bp, index, len(snp_list))
            for index, (
                snps, f,
                bp) in enumerate(zip(snp_list, freqs, bp_positions), start=1)
        ]

        mc.filter_array(sm_dict, flatten(accepted_snps), "Filter")
        print(
            f"Found {len(sm_dict[self.sm_variants])} Snps that passed filtering"
        )

        # Return the filter summary dict
        mc.error_dict_to_terminal(self._filter_error_dict, "PGS_Filter_Snps",
                                  t0)
        return self.write_summary_files(sm_dict, write, self.target_chromosome,
                                        "Filtered", self.filter_directory)
Example #4
    def pgs_clean_summary_stats(self, write=True):
        """
        This will take the summary statistics and access the validatable snps, found by cross referencing the genetic
        validation constructed from the reference file, and clean them of possible errors. It then returns a dictionary
        of the information required for constructing polygenic scores, ordered by base pair position, and by default
        writes this information to a csv.
        """
        t0 = time.time()

        # Clean the summary lines to only include validatable snps from our genetic samples in target_chromosome
        sm_dict, validation_snps_count = self._valid_snps_lines_and_variants()

        # Clean the summary lines of valid snps for potential errors, if we ever wipe all our samples return None
        sm_dict = self._validate_summary_lines(sm_dict)

        # Construct the order from the base pair position, check we don't have an overflow issue
        order = np.argsort(mc.variant_array(self.bp_position.lower(), sm_dict[self.sm_variants]))
        assert len(order) <= validation_snps_count, ec.snp_overflow(len(order), validation_snps_count)
        mc.filter_array(sm_dict, order, "Order")

        # Log to terminal what has been filtered / removed, then write out if requested, and return the sm dict
        mc.error_dict_to_terminal(self._sum_error_dict, "Summary_Stats", t0)
        print(f"Found {len(sm_dict[self.sm_variants])}")
        return self.write_summary_files(sm_dict, write, self.target_chromosome, "Cleaned", self.summary_directory)
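The ordering step is a plain argsort on base pair position; the resulting index array is then handed to the same filter_array machinery to reorder every array in the dict consistently. A minimal illustration of the argsort part, with invented positions:

import numpy as np

# argsort gives the index order that sorts bp positions; applying it to any
# parallel array reorders that array to match.
bp_position = np.array([15432, 1200, 98001, 55210])
order = np.argsort(bp_position)

print(order)               # [1 0 3 2]
print(bp_position[order])  # [ 1200 15432 55210 98001]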
Example #5
    def snp_names(self, sm_dict):
        """Variant names differ in pysnptools bgen, so account for this and just return rs_ids"""
        return variant_array(self.snp_id.lower(), sm_dict[self.sm_variants])
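For readers without the library at hand, here is a hypothetical, simplified stand-in for the variant_array helper used throughout these examples: it pulls one named attribute off each variant object into a numpy array. The Variant fields and values below are invented for illustration.

import numpy as np
from collections import namedtuple

# Hypothetical stand-in for variant_array: extract one attribute per variant.
Variant = namedtuple("Variant", ["snp_id", "bp_position"])
variants = [Variant("rs1", 1200), Variant("rs2", 15432)]

def variant_array_sketch(attribute, variants):
    return np.array([getattr(v, attribute) for v in variants])

print(variant_array_sketch("snp_id", variants))  # ['rs1' 'rs2']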