示例#1
0
    def _write_csv(self):
        """ Save the SNPs as a delimited text file.

        When no filename was supplied, one is built from the cleaned source
        name and the assembly; the extension is ``.csv`` if the ``sep`` keyword
        argument is a comma and ``.txt`` otherwise. A block of ``#``-prefixed
        summary lines (source, build, phasing, SNP count, chromosomes) is
        passed along as the file comment.

        If the ``header`` keyword argument is absent or ``True``, it is
        replaced in ``self._kwargs`` by the default column names; any other
        value (e.g. ``False`` or a custom list) is forwarded untouched.

        Returns
        -------
        str
            path to file in output directory if SNPs were saved, else empty str
        """
        options = self._kwargs
        snps_obj = self._snps

        out_name = self._filename
        if not out_name:
            # comma-separated output gets a .csv extension, everything else .txt
            suffix = ".csv" if options.get("sep") == "," else ".txt"
            out_name = "{}_{}{}".format(
                clean_str(snps_obj.source), snps_obj.assembly, suffix
            )

        template = ("# Source(s): {}\n"
                    "# Build: {}\n"
                    "# Build Detected: {}\n"
                    "# Phased: {}\n"
                    "# SNPs: {}\n"
                    "# Chromosomes: {}\n")
        comment = template.format(
            snps_obj.source,
            snps_obj.build,
            snps_obj.build_detected,
            snps_obj.phased,
            snps_obj.count,
            snps_obj.chromosomes_summary,
        )

        # a missing header and an explicit ``header=True`` both mean
        # "write the default column names"
        if options.get("header", True) is True:
            options["header"] = ["chromosome", "position", "genotype"]

        return save_df_as_csv(
            snps_obj._snps,
            snps_obj._output_dir,
            out_name,
            comment=comment,
            atomic=self._atomic,
            **options
        )
示例#2
0
    def _write_vcf(self):
        """ Write SNPs to a VCF file.

        SNPs whose genotype has "I" or "D" as its first or second character
        (insertions / deletions) are skipped, as are SNPs on chromosomes other
        than 1-22, X, Y, and MT. The remaining SNPs are converted one
        chromosome at a time by ``self._create_vcf_representation`` and the
        per-chromosome results are concatenated into a single table.

        References
        ----------
        1. The Variant Call Format (VCF) Version 4.2 Specification, 8 Mar 2019,
           https://samtools.github.io/hts-specs/VCFv4.2.pdf

        Returns
        -------
        str
            path to file in output directory if SNPs were saved, else empty str
        discrepant_vcf_position : pd.DataFrame
            SNPs with discrepant positions discovered while saving VCF
        """
        filename = self._filename
        if not filename:
            filename = "{}_{}{}".format(clean_str(self._snps.source),
                                        self._snps.assembly, ".vcf")

        # `datetime.utcnow()` is deprecated (Python 3.12+); a timezone-aware
        # UTC "now" produces the same YYYYMMDD stamp
        file_date = datetime.datetime.now(
            datetime.timezone.utc).strftime("%Y%m%d")

        comment = (
            "##fileformat=VCFv4.2\n"
            "##fileDate={}\n"
            '##source="{}; snps v{}; https://pypi.org/project/snps/"\n'.format(
                file_date,
                self._snps.source,
                snps.__version__,
            ))

        # chromosomes for which a VCF representation is created
        reference_sequence_chroms = tuple(
            str(n) for n in range(1, 23)) + ("X", "Y", "MT")

        df = self._snps.snps

        # skip insertions and deletions
        genotype = df["genotype"]
        indel_codes = ["I", "D"]
        is_indel = genotype.notnull() & (
            genotype.str[0].isin(indel_codes)
            | genotype.str[1].isin(indel_codes))
        df = df.drop(df.loc[is_indel].index)

        # one task per chromosome; chromosomes without reference sequence data
        # (e.g., unassigned PAR) simply get no task, so there is no need to
        # also drop them from `df`, which is not used past this point
        tasks = [
            {
                "resources": self._snps._resources,
                "assembly": self._snps.assembly,
                "chrom": chrom,
                "snps": pd.DataFrame(df.loc[(df["chrom"] == chrom)]),
            }
            for chrom in df["chrom"].unique()
            if chrom in reference_sequence_chroms
        ]

        # create the VCF representation for SNPs; the lists are seeded with an
        # empty frame so `pd.concat` still has an object when there are no tasks
        contigs = []
        vcf = [pd.DataFrame()]
        discrepant_vcf_position = [pd.DataFrame()]
        for result in map(self._create_vcf_representation, tasks):
            contigs.append(result["contig"])
            vcf.append(result["vcf"])
            discrepant_vcf_position.append(result["discrepant_vcf_position"])

        vcf = pd.concat(vcf)
        discrepant_vcf_position = pd.concat(discrepant_vcf_position)

        comment += "".join(contigs)
        comment += '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\n'
        comment += "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"

        return (
            save_df_as_csv(
                vcf,
                self._snps._output_dir,
                filename,
                comment=comment,
                prepend_info=False,
                header=False,
                index=False,
                na_rep=".",
                sep="\t",
            ),
            discrepant_vcf_position,
        )
示例#3
0
 def get_var_name(self):
     """Return ``self.name`` after passing it through ``clean_str``."""
     var_name = clean_str(self.name)
     return var_name
示例#4
0
def main():
    """Parse the files in the openSNP data dump and report the outcome.

    The data dump filenames (excluding any containing "readme" or
    "phenotype") are visited in a seeded random order and handed to
    ``load_file`` through a ``Parallelizer``. Results without a ``"msg"`` key
    are saved to ``parse-opensnp-files.csv`` in ``OUTPUT_DIR`` and summary
    statistics are logged.

    When ``EXTRACT_FILES`` is set, every file whose result carried a message
    (plus files where the build was not detected) is copied out of the data
    dump into a per-message directory under ``OUTPUT_DIR``.
    """
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    filenames = [
        filename for filename in filenames
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # get results from `load_file` where `count` was non-zero
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=[
            "file", "source", "build", "build_detected", "chromosomes", "count"
        ],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics; a ratio is only reported when its denominator is
    # non-zero, so an empty data dump or a run where nothing parsed does not
    # raise ZeroDivisionError
    file_count = len(filenames)
    parsed_count = len(df)
    logger.info("{} files in the openSNP datadump".format(file_count))
    if file_count:
        logger.info("{:.2%} of openSNP datadump files parsed".format(
            parsed_count / file_count))
    if parsed_count:
        logger.info("build detected in {:.2%} of files parsed".format(
            len(df.loc[df.build_detected]) / parsed_count))

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                d.setdefault(result["msg"], []).append(result["file"])

        # add messages / file filters as necessary...
        # (the boolean mask is only built when there are parsed rows)
        if parsed_count:
            build_not_detected = list(
                df.loc[~df.build_detected].file.values)
        else:
            build_not_detected = []
        d["build not detected"] = build_not_detected

        # extract files that have messages for debugging
        for msg, files in d.items():
            if not files:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(OUTPUT_DIR,
                                "{:04}_{}".format(len(files), clean_str(msg)))
            create_dir(path)
            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename),
                                  mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")