Пример #1
0
    def _write_csv(self):
        """ Write SNPs to a delimited text file preceded by a summary comment.

        If no filename was supplied, one is built from the cleaned source and the
        assembly of the SNPs; its extension is ``.csv`` when the ``sep`` keyword
        argument is a comma and ``.txt`` otherwise. Unless the caller supplied a
        non-boolean ``header`` or passed ``header=False``, the columns are labeled
        ``chromosome``, ``position`` and ``genotype``.

        Returns
        -------
        str
            path to file in output directory if SNPs were saved, else empty str
        """
        snps = self._snps
        options = self._kwargs

        out_name = self._filename
        if not out_name:
            suffix = ".csv" if options.get("sep") == "," else ".txt"
            out_name = "{}_{}{}".format(clean_str(snps.source), snps.assembly, suffix)

        summary_template = (
            "# Source(s): {}\n"
            "# Build: {}\n"
            "# Build Detected: {}\n"
            "# Phased: {}\n"
            "# SNPs: {}\n"
            "# Chromosomes: {}\n"
        )
        summary = summary_template.format(
            snps.source,
            snps.build,
            snps.build_detected,
            snps.phased,
            snps.count,
            snps.chromosomes_summary,
        )

        # An absent header and ``header=True`` both select the default labels;
        # the choice is written back to the stored keyword arguments.
        if options.get("header", True) is True:
            options["header"] = ["chromosome", "position", "genotype"]

        return save_df_as_csv(
            snps._snps,
            snps._output_dir,
            out_name,
            comment=summary,
            atomic=self._atomic,
            **options,
        )
Пример #2
0
    def _write_vcf(self):
        """ Write SNPs to a VCF file.

        Genotypes whose first or second character is "I" or "D" (insertions and
        deletions) are skipped, and only chromosomes 1-22, X, Y and MT are
        written. The VCF rows, contig header lines and discrepant positions are
        produced per chromosome by ``self._create_vcf_representation``.

        References
        ----------
        1. The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019,
           https://samtools.github.io/hts-specs/VCFv4.2.pdf

        Returns
        -------
        str
            path to file in output directory if SNPs were saved, else empty str
        discrepant_vcf_position : pd.DataFrame
            SNPs with discrepant positions discovered while saving VCF
        """
        filename = self._filename
        if not filename:
            filename = f"{clean_str(self._snps.source)}_{self._snps.assembly}.vcf"

        # `datetime.utcnow()` is deprecated; an aware UTC "now" formats to the
        # same date string.
        file_date = datetime.datetime.now(datetime.timezone.utc).strftime("%Y%m%d")
        comment = (
            "##fileformat=VCFv4.2\n"
            f"##fileDate={file_date}\n"
            f'##source="{self._snps.source}; snps v{snps.__version__}; https://pypi.org/project/snps/"\n'
        )

        # chromosomes for which a per-chromosome task is created
        reference_sequence_chroms = tuple(str(n) for n in range(1, 23)) + (
            "X",
            "Y",
            "MT",
        )

        df = self._snps.snps

        # skip insertions and deletions
        genotype = df["genotype"]
        is_indel = genotype.notnull() & (
            (genotype.str[0] == "I")
            | (genotype.str[0] == "D")
            | (genotype.str[1] == "I")
            | (genotype.str[1] == "D")
        )
        df = df.drop(df.loc[is_indel].index)

        # One task per chromosome; chromosomes without reference sequence data
        # (e.g., unassigned PAR) get no task and are therefore left out.
        tasks = [
            {
                "resources": self._snps._resources,
                "assembly": self._snps.assembly,
                "chrom": chrom,
                "snps": pd.DataFrame(df.loc[df["chrom"] == chrom]),
            }
            for chrom in df["chrom"].unique()
            if chrom in reference_sequence_chroms
        ]

        # Create the VCF representation for SNPs; the lists are seeded with an
        # empty frame so that concatenation still works when there are no tasks.
        contigs = []
        vcf = [pd.DataFrame()]
        discrepant_vcf_position = [pd.DataFrame()]
        for result in map(self._create_vcf_representation, tasks):
            contigs.append(result["contig"])
            vcf.append(result["vcf"])
            discrepant_vcf_position.append(result["discrepant_vcf_position"])

        vcf = pd.concat(vcf)
        discrepant_vcf_position = pd.concat(discrepant_vcf_position)

        comment += "".join(contigs)
        comment += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
        comment += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"

        return (
            save_df_as_csv(
                vcf,
                self._snps._output_dir,
                filename,
                comment=comment,
                prepend_info=False,
                header=False,
                index=False,
                na_rep=".",
                sep="\t",
            ),
            discrepant_vcf_position,
        )
Пример #3
0
    def _find_shared_dna_output_helper(
        self,
        individuals,
        one_chrom_shared_dna,
        two_chrom_shared_dna,
        one_chrom_shared_genes,
        two_chrom_shared_genes,
    ):
        """ Plot shared DNA and save every non-empty result table as a CSV file.

        Parameters
        ----------
        individuals : iterable
            objects providing ``get_var_name()`` (used in file names) and
            ``name`` (used in the plot title)
        one_chrom_shared_dna, two_chrom_shared_dna : sized table
            shared DNA tables; plotted, and saved with two-decimal floats
        one_chrom_shared_genes, two_chrom_shared_genes : sized table
            shared genes tables; saved when non-empty
        """
        cytobands = self._resources.get_cytoBand_hg19()

        var_names = []
        display_names = []
        for person in individuals:
            var_names.append(person.get_var_name())
            display_names.append(person.name)

        file_tag = "_".join(var_names)
        plot_title = " / ".join(display_names)

        # the plot is only drawn when the output directory could be created
        if create_dir(self._output_dir):
            plot_path = os.path.join(
                self._output_dir, "shared_dna_{}.png".format(file_tag)
            )
            plot_chromosomes(
                one_chrom_shared_dna,
                two_chrom_shared_dna,
                cytobands,
                plot_path,
                "{} shared DNA".format(plot_title),
                37,  # presumably the build number, matching the GRCh37 file names
            )

        # (table, file name template, extra keyword arguments for the writer)
        outputs = (
            (
                one_chrom_shared_dna,
                "shared_dna_one_chrom_{}_GRCh37.csv",
                {"float_format": "%.2f"},
            ),
            (
                two_chrom_shared_dna,
                "shared_dna_two_chroms_{}_GRCh37.csv",
                {"float_format": "%.2f"},
            ),
            (one_chrom_shared_genes, "shared_genes_one_chrom_{}_GRCh37.csv", {}),
            (two_chrom_shared_genes, "shared_genes_two_chroms_{}_GRCh37.csv", {}),
        )

        for table, name_template, extra in outputs:
            if len(table) > 0:
                save_df_as_csv(
                    table,
                    self._output_dir,
                    name_template.format(file_tag),
                    comment=self._get_csv_header(),
                    prepend_info=False,
                    **extra,
                )
Пример #4
0
    def find_discordant_snps(
        self, individual1, individual2, individual3=None, save_output=False
    ):
        """ Find discordant SNPs between two or three individuals.

        A SNP is discordant between the reference and a comparison individual
        when both genotypes have one allele and differ, or both have two alleles
        and share none. With three individuals, a SNP is also discordant when
        both comparison individuals carry the same homozygous genotype and the
        reference genotype differs from it.

        Parameters
        ----------
        individual1 : Individual
            reference individual (child if `individual2` and `individual3` are parents)
        individual2 : Individual
            comparison individual
        individual3 : Individual, optional
            other parent if `individual1` is child and `individual2` is a parent
        save_output : bool
            specifies whether to save output to a CSV file in the output directory

        Returns
        -------
        pandas.DataFrame
            discordant SNPs and associated genetic data

        References
        ----------
        1. David Pike, "Search for Discordant SNPs in Parent-Child
           Raw Data Files," David Pike's Utilities,
           http://www.math.mun.ca/~dapike/FF23utils/pair-discord.php
        2. David Pike, "Search for Discordant SNPs when given data
           for child and both parents," David Pike's Utilities,
           http://www.math.mun.ca/~dapike/FF23utils/trio-discord.php
        """

        def discordant(reference, comparison):
            """ Boolean mask of SNPs discordant between two genotype columns. """
            one_allele = (
                (reference.str.len() == 1)
                & (comparison.str.len() == 1)
                & (reference != comparison)
            )
            two_alleles = (
                (reference.str.len() == 2)
                & (comparison.str.len() == 2)
                & (reference.str[0] != comparison.str[0])
                & (reference.str[0] != comparison.str[1])
                & (reference.str[1] != comparison.str[0])
                & (reference.str[1] != comparison.str[1])
            )
            # Null comparison genotypes never match either length test, so the
            # explicit not-null check only makes the intent visible.
            return (comparison.notnull() & one_allele) | two_alleles

        trio = individual3 is not None

        # only hand actual individuals to the remapping step
        individuals = [individual1, individual2]
        if trio:
            individuals.append(individual3)
        self._remap_snps_to_GRCh37(individuals)

        var_names = [individual1.get_var_name(), individual2.get_var_name()]
        genotype1 = "genotype_" + var_names[0]
        genotype2 = "genotype_" + var_names[1]

        df = individual1.snps

        # remove nulls for reference individual
        df = df.loc[df["genotype"].notnull()]

        # add SNPs shared with `individual2`
        df = df.join(individual2.snps["genotype"], rsuffix="2")
        renames = {"genotype": genotype1, "genotype2": genotype2}

        if trio:
            # add SNPs shared with `individual3`
            var_names.append(individual3.get_var_name())
            genotype3 = "genotype_" + var_names[2]
            df = df.join(individual3.snps["genotype"], rsuffix="3")
            renames["genotype3"] = genotype3

        df = df.rename(columns=renames)

        # find discordant SNPs between reference and comparison individual(s)
        mask = discordant(df[genotype1], df[genotype2])
        if trio:
            same_homozygous_parents = (
                df[genotype2].notnull()
                & df[genotype3].notnull()
                & (df[genotype2].str.len() == 2)
                & (df[genotype2].str[0] == df[genotype2].str[1])
                & (df[genotype2] == df[genotype3])
            )
            mask = (
                mask
                | discordant(df[genotype1], df[genotype3])
                | (same_homozygous_parents & (df[genotype1] != df[genotype2]))
            )

        df = df.loc[mask]

        if save_output:
            save_df_as_csv(
                df,
                self._output_dir,
                "discordant_snps_{}_GRCh37.csv".format("_".join(var_names)),
                comment=self._get_csv_header(),
                prepend_info=False,
            )

        return df
Пример #5
0
def main():
    """ Load every file of the openSNP datadump and summarize the results.

    Files are loaded via ``load_file`` through a ``Parallelizer``. Results
    without a ``"msg"`` key become rows of a summary table that is saved as
    ``parse-opensnp-files.csv`` in ``OUTPUT_DIR``. When ``EXTRACT_FILES`` is
    set, files whose result carried a message (and files whose build was not
    detected) are copied into one directory per message for debugging.
    """
    logger.info("start")

    # get filenames from openSNP data dump, ignoring readme / phenotype files
    filenames = [
        filename
        for filename in r.get_opensnp_datadump_filenames()
        if "readme" not in filename and "phenotype" not in filename
    ]

    # Draw a sample from the observations; by default the sample is every
    # file (lower SAMPLE_SIZE, e.g. to 10, for a quick run).
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # get results from `load_file` where `count` was non-zero
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=["file", "source", "build", "build_detected", "chromosomes", "count"],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # Log parsing statistics; each ratio is skipped when its denominator is
    # zero instead of raising ZeroDivisionError.
    file_count = len(filenames)
    parsed_count = len(df)
    logger.info(f"{file_count} files in the openSNP datadump")
    if file_count:
        logger.info(
            f"{(parsed_count / file_count):.2%} of openSNP datadump files parsed"
        )
    if parsed_count:
        logger.info(
            f"build detected in {len(df.loc[df.build_detected]) / parsed_count:.2%} of files parsed"
        )

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        files_by_msg = {}
        for result in results:
            if "msg" in result:
                files_by_msg.setdefault(result["msg"], []).append(result["file"])

        # add messages / file filters as necessary...
        if parsed_count:
            files_by_msg["build not detected"] = list(
                df.loc[~df.build_detected].file.values
            )

        # extract files that have messages for debugging
        for msg, files in files_by_msg.items():
            if len(files) == 0:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}")
            create_dir(path)
            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")
Пример #6
0
            "build_detected",
            "x_snps",
            "heterozygous_x_snps",
            "y_snps",
            "y_snps_not_null",
            "count",
        ],
    )

    # derive the columns we want to analyze
    df["heterozygous_x_snps_ratio"] = df.heterozygous_x_snps / df.x_snps
    df["y_snps_not_null_ratio"] = df.y_snps_not_null / df.y_snps

    df.drop(df.loc[df["heterozygous_x_snps_ratio"].isna()].index, inplace=True)
    df.drop(df.loc[df["y_snps_not_null_ratio"].isna()].index, inplace=True)

    plt = create_analysis_plot(
        df[["heterozygous_x_snps_ratio", "y_snps_not_null_ratio"]])

    # save output
    with atomic_write(
            f"{os.path.join(OUTPUT_DIR, 'xy-chrom-snp-ratios.png')}",
            mode="wb",
            overwrite=True,
    ) as f:
        plt.savefig(f)

    save_df_as_csv(df, OUTPUT_DIR, "xy-chrom-snp-ratios.csv")

    logger.info("stop")
Пример #7
0
def main():
    """ Predict ancestry for the files of the openSNP datadump.

    A reference model is built from the 1000 Genomes AISNP genotypes: they are
    encoded, reduced in dimensionality, and used to fit two k-nearest-neighbor
    classifiers (one on the "super population" labels, one on the "population"
    labels). Each datadump file is then handled by ``process_file`` through a
    ``Parallelizer``; non-empty results are saved as ``opensnp_ancestry.csv``
    in ``OUTPUT_DIR``.
    """
    logging.info("start analysis")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    # Draw a sample from the observations; by default the sample is every
    # file (lower SAMPLE_SIZE, e.g. to 10, for a quick run).
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # get the 1000 genomes samples
    panel_path = f"{DATA_DIR}/integrated_call_samples_v3.20130502.ALL.panel"
    dfsamples = get_1kg_samples(panel_path)
    logging.info("retreived the 1kg samples")

    # pick the AISNP reference VCF that matches the configured set
    if aisnp_SET == "kidd et al. 55 aisnps":
        aisnps_vcf = f"{DATA_DIR}/kidd.55aisnp.1kg.vcf"
    else:
        aisnps_vcf = f"{DATA_DIR}/Seldin.128aisnp.1kg.vcf"
    aisnps_1kg = vcf2df(aisnps_vcf, dfsamples)
    logging.info("made the AIsnp DataFrame")

    # Encode 1kg data
    X_encoded, encoder = encode_genotypes(aisnps_1kg)
    logging.info("encoded the genotypes")

    # perform dimensionality reduction on the 1kg set
    X_reduced, reducer = dimensionality_reduction(
        X_encoded, algorithm=DIMENSIONALITY_REDUCTION_ALGORITHM
    )
    logging.info("Reduced the dimensionality of the genotypes")

    # both classifiers share the same configuration
    knn_options = {"n_neighbors": 9, "weights": "distance", "n_jobs": 1}
    knn_super_pop = KNeighborsClassifier(**knn_options)
    knn_pop = KNeighborsClassifier(**knn_options)

    # fit the knn before adding the user sample
    logging.info("Fitting the superpopulation model")
    knn_super_pop.fit(X_reduced, dfsamples["super population"])
    logging.info("Done!")
    logging.info("Fitting the population model")
    knn_pop.fit(X_reduced, dfsamples["population"])
    logging.info("Done!")

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)

    # every task carries the same reference objects and differs only in its file
    shared = {
        "aisnps_1kg": aisnps_1kg,
        "X_encoded": X_encoded,
        "encoder": encoder,
        "reducer": reducer,
        "knn_super_pop": knn_super_pop,
        "knn_pop": knn_pop,
    }
    tasks = [{"file": filenames[i], **shared} for i in samples]

    # run tasks; results is a list of dicts
    results = p(process_file, tasks)

    # get rows for dataframe summarizing results (empty results are dropped)
    rows = list(filter(None, results))

    summary_columns = [
        "file",
        "source",
        "build",
        "build_detected",
        "chromosomes_summary",
        "snp_count",
    ]
    # presumably the super population codes predicted by `knn_super_pop`
    super_population_columns = ["AFR", "AMR", "EAS", "EUR", "SAS"]
    # presumably the population codes predicted by `knn_pop`
    population_columns = [
        "ACB", "ASW", "BEB", "CDX", "CEU", "CHB", "CHS", "CLM", "ESN",
        "FIN", "GBR", "GIH", "GWD", "IBS", "ITU", "JPT", "KHV", "LWK",
        "MSL", "MXL", "PEL", "PJL", "PUR", "STU", "TSI", "YRI",
    ]
    component_columns = ["component1", "component2", "component3"]

    df = pd.DataFrame(
        rows,
        columns=(
            summary_columns
            + super_population_columns
            + population_columns
            + component_columns
        ),
    )

    save_df_as_csv(df, OUTPUT_DIR, "opensnp_ancestry.csv")

    logging.info("analysis done!")