예제 #1
0
 def f():
     """Save the generic CSV input with a comma separator and re-parse it.

     Checks that the path returned by ``save_snps`` is, relative to the
     current directory, ``output/generic_GRCh37.csv`` (using the OS path
     separator), then runs the shared parsing checks on the saved file.
     """
     # NOTE(review): `self` is not a parameter here; it has to be provided
     # by an enclosing scope for this function to run -- confirm.
     loaded = SNPs("tests/input/generic.csv")
     check_equal = self.assertEqual
     saved_relpath = os.path.relpath(loaded.save_snps(sep=","))
     check_equal(saved_relpath, f"output{os.sep}generic_GRCh37.csv")
     self.run_parsing_tests("output/generic_GRCh37.csv", "generic")
예제 #2
0
    def test_save_snps_vcf_phased(self):
        """Round-trip phased VCF data through ``save_snps(vcf=True)``.

        A gzipped copy of the test FASTA file is written and registered as
        the GRCh37 reference sequence for chromosome "1" before saving.
        The saved VCF is then read back and must still be flagged as phased
        and match ``self.generic_snps_vcf()``.
        """
        # read phased data
        s = SNPs("tests/input/testvcf_phased.vcf")

        # setup resource to use test FASTA reference sequence
        r = Resources()
        r._reference_sequences["GRCh37"] = {}
        with open("tests/input/generic.fa", "rb") as f_in:
            with atomic_write("tests/input/generic.fa.gz",
                              mode="wb",
                              overwrite=True) as f_out:
                with gzip.open(f_out, "wb") as f_gzip:
                    shutil.copyfileobj(f_in, f_gzip)

        seq = ReferenceSequence(ID="1", path="tests/input/generic.fa.gz")

        r._reference_sequences["GRCh37"]["1"] = seq

        # save phased data to VCF
        # os.path.relpath returns OS-specific separators, so the expected
        # value is built with os.path.join rather than a hard-coded "/".
        expected_path = os.path.join("output", "vcf_GRCh37.vcf")
        assert os.path.relpath(s.save_snps(vcf=True)) == expected_path
        # read saved VCF
        s = SNPs(expected_path)
        assert s.phased
        pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
예제 #3
0
 def f():
     """Save the generic CSV input into a temporary directory and re-parse it.

     The SNPs object is created with ``output_dir`` set to a fresh temporary
     directory; ``save_snps`` must return the ``generic_GRCh37.csv`` path
     inside that directory, and the saved file must pass the shared parsing
     checks.
     """
     # NOTE(review): `self` is not a parameter here; it has to be provided
     # by an enclosing scope for this function to run -- confirm.
     with tempfile.TemporaryDirectory() as scratch_dir:
         loaded = SNPs("tests/input/generic.csv", output_dir=scratch_dir)
         expected_path = os.path.join(scratch_dir, "generic_GRCh37.csv")
         check_equal = self.assertEqual
         check_equal(loaded.save_snps(sep=","), expected_path)
         self.run_parsing_tests(expected_path, "generic")
예제 #4
0
 def test_save_snps_csv_phased(self):
     """Round-trip phased VCF data through the default CSV output.

     The saved CSV is read back and must still be flagged as phased and
     match ``self.generic_snps_vcf()``.
     """
     # read phased data
     s = SNPs("tests/input/testvcf_phased.vcf")
     # save phased data to CSV
     # os.path.relpath returns OS-specific separators, so the expected value
     # is built with os.path.join rather than a hard-coded "/".
     expected_path = os.path.join("output", "vcf_GRCh37.csv")
     assert os.path.relpath(s.save_snps()) == expected_path
     # read saved CSV
     s = SNPs(expected_path)
     assert s.phased
     pd.testing.assert_frame_equal(s.snps, self.generic_snps_vcf())
예제 #5
0
"""
Validate input VCF files & remap them to GRCh37.
depends on:
> python 3
> argparse==1.4.0
> snps==0.4.0
> io
"""

import argparse
from snps import SNPs
import io

# Command-line interface: both paths are mandatory. Without them the script
# would call SNPs(None) and build the literal output name "None.vcf".
parser = argparse.ArgumentParser(description='Remap VCF files to GRCh37')
parser.add_argument('-i',
                    '--input_file',
                    required=True,
                    help='Input VCF file')
parser.add_argument('-o',
                    '--output_file',
                    required=True,
                    help='Output VCF file basename')
args = vars(parser.parse_args())
input_file = args['input_file']
output_file = args['output_file']
# The output option is a basename; the ".vcf" extension is appended here.
output_file_name = f"{output_file}.vcf"

# read & validate input file
snps = SNPs(input_file)

# remap SNPs only when a build was detected and it is not already GRCh37;
# when no build is detected the data is saved as-is.
if snps.build_detected and snps.build != 37:
    snps.remap_snps(37)

# save to file
saved_snps = snps.save_snps(output_file_name, sep="\t", header=False, vcf=True)
예제 #6
0
)
# Both input files are mandatory: the build comparison below is meaningless
# when either path is missing.
parser.add_argument(
    '-t',
    '--input_target',
    required=True,
    help=
    'Input BIM file (a combination of all BIM files, transformed into a 23andme-like format)'
)
parser.add_argument(
    '-b',
    '--input_base',
    required=True,
    help='Input base file, transformed into a 23andme-like format')

args = vars(parser.parse_args())

# Args to variable
input_target = args['input_target']
input_base = args['input_base']

###############################################################################
# Detect builds and update the base's build if it does not match the target's #
###############################################################################

# output_dir='.' is passed so that both objects use the current directory
# as their output directory.
target = SNPs(input_target, output_dir='.')
base = SNPs(input_base, output_dir='.')

# Only when the builds differ is the base remapped to the target's build and
# written out as a tab-separated file with a header.
if base.build != target.build:
    base.remap_snps(target.build)
    updated_base = base.save_snps("new_base_coordinates.txt",
                                  sep="\t",
                                  header=True)
예제 #7
0
class TestSnps(BaseSNPsTestCase):
    """Tests for the ``SNPs`` object: properties, summaries, saving, dedup."""

    def setUp(self):
        """Build the SNPs fixtures shared by the tests.

        Fixtures are created from file paths, from ``None``, from raw bytes,
        and from zip / gzip archives of ``chromosomes.csv`` read into memory.
        The temporary archives are removed once loaded.
        """
        self.snps_GRCh38 = SNPs("tests/input/GRCh38.csv")
        self.snps = SNPs("tests/input/chromosomes.csv")
        self.snps_only_detect_source = SNPs("tests/input/chromosomes.csv",
                                            only_detect_source=True)
        self.snps_none = SNPs(None)

        # Raw (uncompressed) bytes buffer.
        with open("tests/input/chromosomes.csv", "r") as f:
            self.snps_buffer = SNPs(f.read().encode("utf-8"))

        # Zip archive, loaded from its bytes.
        with atomic_write("tests/input/chromosomes.csv.zip",
                          mode="wb",
                          overwrite=True) as f:
            with zipfile.ZipFile(f, "w") as f_zip:
                f_zip.write("tests/input/chromosomes.csv",
                            arcname="chromosomes.csv")

        with open("tests/input/chromosomes.csv.zip", "rb") as f:
            data = f.read()
            self.snps_buffer_zip = SNPs(data)
        os.remove("tests/input/chromosomes.csv.zip")

        # Gzip archive, loaded from its bytes.
        with open("tests/input/chromosomes.csv", "rb") as f_in:
            with atomic_write("tests/input/chromosomes.csv.gz",
                              mode="wb",
                              overwrite=True) as f_out:
                with gzip.open(f_out, "wb") as f_gzip:
                    shutil.copyfileobj(f_in, f_gzip)

        with open("tests/input/chromosomes.csv.gz", "rb") as f:
            data = f.read()
            self.snps_buffer_gz = SNPs(data)
        os.remove("tests/input/chromosomes.csv.gz")

    def snps_discrepant_pos(self):
        """Return a one-row SNP frame for rs3094315 at position 1."""
        return self.create_snp_df(rsid=["rs3094315"],
                                  chrom=["1"],
                                  pos=[1],
                                  genotype=["AA"])

    def test_assembly(self):
        assert self.snps_GRCh38.assembly == "GRCh38"

    def test_assembly_no_snps(self):
        assert self.snps_none.assembly == ""

    def test_snp_buffer_zip(self):
        assert self.snps_buffer_zip.snp_count == 6

    def test_snp_buffer_gz(self):
        assert self.snps_buffer_gz.snp_count == 6

    def test_snp_buffer(self):
        assert self.snps_buffer.snp_count == 6

    def test_snp_count(self):
        assert self.snps.snp_count == 6

    def test_snp_count_no_snps(self):
        assert self.snps_none.snp_count == 0

    def test_chromosomes(self):
        assert self.snps.chromosomes == ["1", "2", "3", "5", "PAR", "MT"]

    def test_chromosomes_no_snps(self):
        assert self.snps_none.chromosomes == []

    def test_chromosomes_summary(self):
        assert self.snps.chromosomes_summary == "1-3, 5, PAR, MT"

    def test_chromosomes_summary_no_snps(self):
        assert self.snps_none.chromosomes_summary == ""

    def test_build_no_snps(self):
        assert not self.snps_none.build

    def test_build_detected_no_snps(self):
        assert not self.snps_none.build_detected

    def test_build_detected_PAR_snps(self):
        """Check build detection for PAR SNPs.

        The body only runs when DOWNLOADS_ENABLED is unset, empty, or
        "true"; otherwise the test passes without asserting anything.
        """
        if (not os.getenv("DOWNLOADS_ENABLED")
                or os.getenv("DOWNLOADS_ENABLED") == "true"):
            snps = SNPs("tests/input/GRCh37_PAR.csv")
            assert snps.build == 37
            assert snps.build_detected

    def test_sex_no_snps(self):
        assert self.snps_none.sex == ""

    def test_sex_Male_Y_chrom(self):
        """Simulated Y-chromosome SNPs, saved and reloaded, report "Male"."""
        s = self.simulate_snps(chrom="Y",
                               pos_start=1,
                               pos_max=59373566,
                               pos_step=10000)
        file = s.save_snps()
        snps = SNPs(file)
        assert snps.sex == "Male"

    def test_get_summary(self):
        assert self.snps_GRCh38.get_summary() == {
            "source": "generic",
            "assembly": "GRCh38",
            "build": 38,
            "build_detected": True,
            "snp_count": 4,
            "chromosomes": "1, 3",
            "sex": "",
        }

    def test_get_summary_no_snps(self):
        assert not self.snps_none.get_summary()

    def test_is_valid_True(self):
        assert self.snps_GRCh38.is_valid()

    def test_is_valid_False(self):
        assert not self.snps_none.is_valid()

    def test__read_raw_data(self):
        assert self.snps_none.snps.empty
        assert self.snps_none.source == ""

    def test__lookup_build_with_snp_pos_None(self):
        """Build detection yields a falsy result for a discrepant position."""
        snps = SNPs()
        snps._snps = self.snps_discrepant_pos()
        assert not snps.detect_build()

    def test_get_assembly_None(self):
        snps = SNPs()
        # Compare by value: `is ""` tests object identity, which is not
        # guaranteed for equal strings and raises a SyntaxWarning on
        # Python 3.8+.
        assert snps.get_assembly() == ""

    def test_save_snps_source(self):
        """Saving with default arguments and reloading preserves the SNPs."""
        # os.path.relpath returns OS-specific separators, so the expected
        # value is built with os.path.join rather than a hard-coded "/".
        expected_path = os.path.join("output", "generic_GRCh38.csv")
        assert os.path.relpath(self.snps_GRCh38.save_snps()) == expected_path
        snps = SNPs(expected_path)
        pd.testing.assert_frame_equal(snps.snps, self.snps_GRCh38.snps)

    def test_save_snps_buffer(self):
        out = io.StringIO()
        self.snps.save_snps(out)
        # NOTE(review): read() starts at the buffer's current position, so
        # this presumably relies on save_snps rewinding `out` -- confirm.
        assert out.read().startswith("# Generated by snps")

    def test_snps_only_detect_source(self):
        assert self.snps_only_detect_source.source == "generic"

    def test_duplicate_rsids(self):
        """The first rs1 row is kept; later ones land in ``duplicate_snps``."""
        snps = SNPs("tests/input/duplicate_rsids.csv")
        result = self.create_snp_df(rsid=["rs1"],
                                    chrom=["1"],
                                    pos=[101],
                                    genotype=["AA"])
        duplicate_snps = self.create_snp_df(rsid=["rs1", "rs1"],
                                            chrom=["1", "1"],
                                            pos=[102, 103],
                                            genotype=["CC", "GG"])
        pd.testing.assert_frame_equal(snps.snps, result)
        pd.testing.assert_frame_equal(snps.duplicate_snps, duplicate_snps)

    def test_deduplicate_false(self):
        """With ``deduplicate=False`` all three rs1 rows stay in ``snps``."""
        snps = SNPs("tests/input/duplicate_rsids.csv", deduplicate=False)

        result = self.create_snp_df(
            rsid=["rs1", "rs1", "rs1"],
            chrom=["1", "1", "1"],
            pos=[101, 102, 103],
            genotype=["AA", "CC", "GG"],
        )
        pd.testing.assert_frame_equal(snps.snps, result)
예제 #8
0
 def test_save_snps_no_snps(self):
     """Saving an SNPs object created without input returns a falsy value."""
     empty = SNPs()
     saved = empty.save_snps()
     assert not saved
예제 #9
0
 def test_save_snps_specify_file(self):
     """Saving under an explicit filename and reloading preserves the SNPs."""
     s = SNPs("tests/input/GRCh37.csv")
     # os.path.relpath returns OS-specific separators, so the expected value
     # is built with os.path.join rather than a hard-coded "/".
     expected_path = os.path.join("output", "snps.csv")
     assert os.path.relpath(s.save_snps("snps.csv")) == expected_path
     s_saved = SNPs(expected_path)
     pd.testing.assert_frame_equal(s_saved.snps, self.snps_GRCh37())
예제 #10
0
 def test_save_snps(self):
     """Saving with the default filename and reloading preserves the SNPs."""
     snps = SNPs("tests/input/GRCh37.csv")
     # os.path.relpath returns OS-specific separators, so the expected value
     # is built with os.path.join rather than a hard-coded "/".
     expected_path = os.path.join("output", "generic_GRCh37.csv")
     assert os.path.relpath(snps.save_snps()) == expected_path
     s_saved = SNPs(expected_path)
     pd.testing.assert_frame_equal(s_saved.snps, self.snps_GRCh37())