Exemplo n.º 1
0
 def testNbSeq(self):
     """Check that FastaIO.nbSeq() reports 4 sequences for every fixture file."""
     fixture_paths = (
         self.tmp_mono_line,
         self.tmp_multi_line,
         self.tmp_multi_line_gz,
     )
     for fasta_path in fixture_paths:
         self.assertEqual(4, FastaIO.nbSeq(fasta_path))
Exemplo n.º 2
0
def writeTargetReads(out_R1_path, out_R2_path, reads_pairs):
    """
    Append each pair of records to the R1 and R2 output files.

    :param out_R1_path: Path to the file receiving the first record of each pair (opened with FastaIO in mode "a").
    :type out_R1_path: str
    :param out_R2_path: Path to the file receiving the second record of each pair (opened with FastaIO in mode "a").
    :type out_R2_path: str
    :param reads_pairs: Iterable of 2-items (R1 record, R2 record).
    """
    with FastaIO(out_R1_path, "a") as writer_R1, FastaIO(out_R2_path, "a") as writer_R2:
        for pair in reads_pairs:
            first_read, second_read = pair
            writer_R1.write(first_read)
            writer_R2.write(second_read)
Exemplo n.º 3
0
def addStartTag(in_path, out_path):
    """
    Copy the records from in_path to out_path with "^" prepended to each sequence string.

    :param in_path: Path to the input sequences file (read with FastaIO).
    :type in_path: str
    :param out_path: Path to the output sequences file (written with FastaIO in mode "w").
    :type out_path: str
    """
    # The context managers close both handles even if reading or writing
    # raises, which the previous explicit close() calls did not guarantee.
    with FastaIO(in_path) as FH_in, FastaIO(out_path, "w") as FH_out:
        for record in FH_in:
            record.string = "^" + record.string
            FH_out.write(record)
Exemplo n.º 4
0
 def testNbSeqAndNt(self):
     """Check that FastaIO.nbSeqAndNt() reports 4 sequences and 104 nucleotids for every fixture file."""
     fixture_paths = (
         self.tmp_mono_line,
         self.tmp_multi_line,
         self.tmp_multi_line_gz,
     )
     for fasta_path in fixture_paths:
         counts = FastaIO.nbSeqAndNt(fasta_path)
         nb_seq, nb_nt = counts
         self.assertEqual(nb_seq, 4)
         self.assertEqual(nb_nt, 104)
Exemplo n.º 5
0
 def testIsValid(self):
     """Check FastaIO.isValid() on the fixture files and on hand-written valid/invalid contents."""
     # Fixture files are all valid
     for fixture_path in (self.tmp_mono_line, self.tmp_multi_line, self.tmp_multi_line_gz):
         self.assertTrue(FastaIO.isValid(fixture_path))

     # Contents that must be accepted
     accepted_contents = [
         # Long file
         ">seq1\nATGC\n>seq2\nATGC\n>seq3\nATGC\n>seq4\nATGC\n>seq5\nATGC\n>seq6\nATGC\n>seq7\nATGC\n>seq8\nATGC\n>seq9\nATGC\n>seq10\nATGC\n>seq11\nATGC\n>seq12\nATGC",
         # Empty file
         "",
         # Empty sequence
         ">seq1\n",
     ]
     for text in accepted_contents:
         with open(self.tmp_out, "w") as writer:
             writer.write(text)
         self.assertTrue(FastaIO.isValid(self.tmp_out))

     # Contents that must be rejected
     rejected_contents = [
         # Two consecutive headers
         ">seq1\nATGC\n>seq2\n>seq3\nATGC",
         # No header at the first line
         "seq1\nATGC\n>seq2\nATGC",
         # Fastq
         "@seq1\nATGC\n+\n####",
     ]
     for text in rejected_contents:
         with open(self.tmp_out, "w") as writer:
             writer.write(text)
         self.assertTrue(not FastaIO.isValid(self.tmp_out))
Exemplo n.º 6
0
def getSeqByChr(genome_path):
    """
    Return by chromosome name the sequence of this chromosome.

    :param genome_path: Path to the genome file (format: fasta).
    :type genome_path: str
    :return: By chromosome name the sequence of this chromosome in uppercase.
    :rtype: dict
    """
    # The context manager closes the handle even if parsing raises, which the
    # previous explicit close() call did not guarantee.
    with FastaIO(genome_path) as FH_seq:
        return {record.id: record.string.upper() for record in FH_seq}
Exemplo n.º 7
0
def getChrSeq(in_ref, chrom_id):
    """
    Return the sequence string of the record whose ID is chrom_id.

    The whole file is read; if several records share the ID, the last one wins.

    :param in_ref: Path to the sequences file (read with FastaIO).
    :type in_ref: str
    :param chrom_id: ID of the selected record.
    :type chrom_id: str
    :return: The string of the selected record, or None if no record has this ID.
    """
    with FastaIO(in_ref) as reader:
        matching = [curr.string for curr in reader if curr.id == chrom_id]
    if not matching:
        return None
    return matching[-1]
Exemplo n.º 8
0
 def testIter(self):
     """Check that iterating FastaIO yields the 4 expected records for each fixture, alone and in lockstep."""
     # Each fixture file read on its own
     for fasta_path in (self.tmp_mono_line, self.tmp_multi_line, self.tmp_multi_line_gz):
         with FastaIO(fasta_path) as reader:
             for rec_idx, curr_rec in enumerate(reader):
                 self.assertTrue(cmpSequences(curr_rec, self.expected_rec[rec_idx]))
             self.assertEqual(rec_idx + 1, 4)
     # Mono-line and multi-line files read in parallel
     with FastaIO(self.tmp_mono_line) as reader_mono, FastaIO(self.tmp_multi_line) as reader_multi:
         nb_seen = 0
         triplets = zip(self.expected_rec, reader_mono, reader_multi)
         for expected, from_mono, from_multi in triplets:
             self.assertTrue(cmpSequences(from_mono, expected))
             self.assertTrue(cmpSequences(from_multi, expected))
             nb_seen += 1
         self.assertEqual(nb_seen, 4)
Exemplo n.º 9
0
def getSeqRecord(in_seq, selected_id):
    """
    @summary: Scans the whole sequences file and returns the record with the selected ID (the last one if the ID is repeated).
    @param in_seq: [str] Path to the sequences file (format: fasta).
    @param selected_id: [str] The ID of the selected sequence.
    @return: [Sequence] The selected sequence object or None if the ID is not found.
    """
    found = None
    with FastaIO(in_seq) as reader:
        for curr_record in reader:
            if curr_record.id != selected_id:
                continue
            found = curr_record
    return found
Exemplo n.º 10
0
def getChromSeq(chrom_name, in_fasta):
    """
    Return the sequence corresponding to the chromosome.

    :param chrom_name: The name of the selected chromosome.
    :type chrom_name: str
    :param in_fasta: The path to the sequences file (format: fasta).
    :type in_fasta: str
    :return: The sequence corresponding to the chromosome.
    :rtype: str
    :raises Exception: If no record of the file has the ID chrom_name.
    """
    seq = None
    with FastaIO(in_fasta) as FH_ref:
        # The whole file is scanned: if the ID is repeated the last record wins.
        for record in FH_ref:
            if record.id == chrom_name:
                seq = record.string
    if seq is None:
        raise Exception(
            'The chromosome "{}" cannot be retrieved from "{}".'.format(
                chrom_name, in_fasta))
    return seq
Exemplo n.º 11
0
def getBEDRecords(ref_path, amplicons):
    """
    Locate each amplicon on the reference sequences and return the corresponding BED records.

    Every record of the reference is searched, in uppercase, for each amplicon
    on both strands: with the primers as provided (strand "+") and with the
    primers swapped and reverse-complemented (strand "-"). A warning is emitted
    for each amplicon that is never located.

    :param ref_path: Path to the reference sequences file (read with FastaIO).
    :type ref_path: str
    :param amplicons: The amplicons to locate. Each one provides the keys "name", "f_primer" and "r_primer". The key "found" is set in place on each of them.
    :type amplicons: list
    :return: The BED records of the located amplicons. The last two arguments given to BEDRecord are start + upstream primer length and end - downstream primer length (presumably the thick start/end - confirm against BEDRecord).
    :rtype: list
    """
    for curr_ampl in amplicons:
        curr_ampl["found"] = False
    located = []
    with FastaIO(ref_path) as reader:
        for chrom in reader:
            chrom_name = chrom.id
            chrom_seq = chrom.string.upper()
            for curr_ampl in amplicons:
                for strand in ("+", "-"):
                    if strand == "+":
                        # Primers as provided
                        upstream = curr_ampl["f_primer"].upper()
                        downstream = curr_ampl["r_primer"].upper()
                    else:
                        # Primers swapped and reverse-complemented
                        upstream = revcom(curr_ampl["r_primer"].upper())
                        downstream = revcom(curr_ampl["f_primer"].upper())
                    start, end = findPosOnSequence(
                        chrom_name, chrom_seq, upstream, downstream)
                    if start is None:
                        continue
                    curr_ampl["found"] = True
                    located.append(
                        BEDRecord(
                            chrom_name,
                            start,
                            end,
                            curr_ampl["name"],
                            0,
                            strand,
                            start + len(upstream),
                            end - len(downstream)))
    for curr_ampl in amplicons:
        if curr_ampl["found"]:
            continue
        warnings.warn(
            'The amplicons {} with primers fwd:{}, rvs:{} cannot be found in {}.'
            .format(curr_ampl["name"], curr_ampl["f_primer"],
                    curr_ampl["r_primer"], ref_path))
    return located
Exemplo n.º 12
0
    def setUp(self):
        """
        Create the fixtures for the filterVCFPrimers.py tests.

        Builds uniquely-named paths in the system temporary folder, prepares the
        command line in self.cmd, writes the reference FASTA and writes the
        input VCF from self.variants. Each variant carries a "ZOI" INFO value
        ("yes"/"no"), presumably the expected outcome checked by the tests -
        confirm against the test methods. The regions path is only declared
        here: the BED file itself is not written by this method.
        """
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_regions = os.path.join(tmp_folder, unique_id + ".bed")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "filterVCFPrimers.py",
            "--input-variants", self.tmp_variants,
            "--input-regions", self.tmp_regions,
            "--input-sequences", self.tmp_sequences,
            "--output-variants", self.tmp_output
        ]

        # Create fasta
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            FH_seq.write(Sequence("artificial_chr1", "NNNAAAATTTGGGGGGGGGGTTTAAANNN"))
            #                                         123456789| | | | | | | | | |
            #                                                  10| 14| 18| 22| 26|
            #                                                    12  16  20  24  28
            FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
            #                                         123456789|
            #                                                  10

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {"ZOI": HeaderInfoAttr("ZOI", "If the variant can be in interest area.", type="String", number="1")}
            FH_var.writeHeader()
            self.variants = [
                VCFRecord("artificial_chr1", 6, "alt_0", "A", ["AA"], None, None, {"ZOI": "no"}),
                VCFRecord("artificial_chr1", 8, "alt_1", "TT", ["T"], None, None, {"ZOI": "no"}),
                VCFRecord("artificial_chr1", 8, "alt_2", "T", ["TT"], None, None, {"ZOI": "yes"}),
                VCFRecord("artificial_chr1", 9, "alt_3", "TTGG", ["TT"], None, None, {"ZOI": "yes"}),
                VCFRecord("artificial_chr1", 14, "alt_4", "G", ["GG"], None, None, {"ZOI": "yes"}),
                VCFRecord("artificial_chr1", 18, "alt_5", "GGG", ["G"], None, None, {"ZOI": "yes"}),  # ZOI downstream limit deletion
                VCFRecord("artificial_chr1", 22, "alt_6", "T", ["TT"], None, None, {"ZOI": "yes"}),

                VCFRecord("artificial_chr1", 9, "alt_7", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution before end of upstream primer
                VCFRecord("artificial_chr1", 10, "alt_8", "TG", ["TC"], None, None, {"ZOI": "yes"}),  # Substitution in upstream limit of ZOI
                VCFRecord("artificial_chr1", 15, "alt_9", "GG", ["GC"], None, None, {"ZOI": "yes"}),  # Substitution in downstream limit of ZOI
                VCFRecord("artificial_chr1", 20, "alt_10", "GT", ["GC"], None, None, {"ZOI": "no"}),  # Substitution after start of downstream primer
                VCFRecord("artificial_chr1", 21, "alt_11", "TT", ["TC"], None, None, {"ZOI": "no"}),  # Substitution in downstream primer

                VCFRecord("artificial_chr2", 1, "alt_12", "C", ["CTT"], None, None, {"ZOI": "no"}),  # Insertion before end of upstream primer
                VCFRecord("artificial_chr2", 2, "alt_13", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI
                VCFRecord("artificial_chr2", 3, "alt_14", "AT", ["CCGC"], None, None, {"ZOI": "yes"}),  # Insertion in upstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 9, "alt_15", "G", ["GCC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 9, "alt_16", "G", ["NNN"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 10, "alt_17", "-", ["CC"], None, None, {"ZOI": "yes"}),  # Insertion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 10, "alt_18", "A", ["ATT"], None, None, {"ZOI": "no"}),  # Insertion after start of downstream primer

                VCFRecord("artificial_chr2", 1, "alt_19", "CG", ["C"], None, None, {"ZOI": "no"}),  # Deletion before end of upstream primer
                VCFRecord("artificial_chr2", 2, "alt_20", "GA", ["G"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI
                VCFRecord("artificial_chr2", 3, "alt_21", "AT", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in upstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 6, "alt_22", "NNCG", ["N"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 8, "alt_23", "CG", ["C"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI
                VCFRecord("artificial_chr2", 8, "alt_24", "CG", ["T"], None, None, {"ZOI": "yes"}),  # Deletion in downstream limit of ZOI and without standardization
                VCFRecord("artificial_chr2", 9, "alt_25", "GA", ["G"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
                VCFRecord("artificial_chr2", 10, "alt_26", "A", ["-"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
                VCFRecord("artificial_chr2", 10, "alt_27", "AT", ["A"], None, None, {"ZOI": "no"}),  # Deletion after start of downstream primer
            ]
            # Write the variants in the order of declaration
            for idx, curr_var in enumerate(self.variants):
                FH_var.write(curr_var)
Exemplo n.º 13
0
        help=
        'Path to the definition of the amplicons (format: Illumina manifest).')
    group_output = parser.add_argument_group('Outputs')  # Outputs
    group_output.add_argument(
        '-f',
        '--fwd-barcodes',
        default="fwd_barcodes.fasta",
        help=
        '******************************** (format: fasta). [Default: %(default)s]'
    )
    group_output.add_argument(
        '-r',
        '--rvs-barcodes',
        default="rvs_barcodes.fasta",
        help=
        '******************************** (format: fasta). [Default: %(default)s]'
    )
    args = parser.parse_args()

    # Process
    amplicons = getAmplicons(args.input_manifest)
    FH_fwd = FastaIO(args.fwd_barcodes, "w")
    FH_rvs = FastaIO(args.rvs_barcodes, "w")
    for ampl in amplicons:
        record_fwd = Sequence(ampl.name, ampl.up_primer)
        FH_fwd.write(record_fwd)
        record_rvs = Sequence(ampl.name, revcom(ampl.down_primer))
        FH_rvs.write(record_rvs)
    FH_fwd.close()
    FH_rvs.close()
Exemplo n.º 14
0
    def setUp(self):
        """
        Create the fixtures for the filterVCFTargets.py tests.

        Builds uniquely-named paths in the system temporary folder, prepares the
        command line in self.cmd (mode "remove"), then writes the reference
        FASTA, its faidx, the BED of the three targets and the input VCF from
        self.variants. Each variant carries a "target" INFO value (declared in
        the header as the ID of the overlapped target, None otherwise),
        presumably the expected outcome checked by the tests - confirm against
        the test methods.
        """
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
        self.tmp_regions = os.path.join(tmp_folder, unique_id + ".bed")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "filterVCFTargets.py", "--mode", "remove", "--input-variants",
            self.tmp_variants, "--input-targets", self.tmp_regions,
            "--input-reference", self.tmp_sequences, "--output-variants",
            self.tmp_output
        ]

        # Create fasta
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            # Repeats:                                       ****....            ...***
            # Region:                                 |----|        |------------|         |------|
            FH_seq.write(
                Sequence("artificial_chr1",
                         "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
            #                                         123456789| | | | | | | | | | | | | | | | | |
            #                                                  10| 14| 18| 22| 26| 30| 34| 38| 42|
            #                                                    12  16  20  24  28  32  36  40  44
            FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
            #                                         123456789|
            #                                                  10

        # Create faidx
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write("""artificial_chr1	45	17	45	46
artificial_chr2	11	80	11	12""")

        # Create targets
        with BEDIO(self.tmp_regions, "w", write_nb_col=4) as FH_bed:
            FH_bed.write(BEDRecord("artificial_chr1", 1, 6, "target_1"))
            FH_bed.write(BEDRecord("artificial_chr1", 15, 28, "target_2"))
            FH_bed.write(BEDRecord("artificial_chr1", 38, 45, "target_3"))

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "target":
                HeaderInfoAttr("target",
                               "The ID of the overlapped target.",
                               type="String",
                               number="1")
            }
            FH_var.writeHeader()
            self.variants = [
                # Substit single nt
                VCFRecord("artificial_chr1", 14, "alt_00", "G", ["T"], None,
                          None, {"target": None
                                 }),  # Before target ; first nt before target
                VCFRecord(
                    "artificial_chr1", 15, "alt_01", "G", ["T"], None, None,
                    {"target": "target_2"}),  # On target ; first nt of target
                VCFRecord("artificial_chr1", 21, "alt_02", "C", ["G"], None,
                          None, {"target": "target_2"}),  # On target
                VCFRecord("artificial_chr1", 28, "alt_03", "A", ["G"], None,
                          None, {"target": "target_2"}),  # On target ; last nt
                VCFRecord(
                    "artificial_chr1", 29, "alt_04", "G", ["C"], None, None,
                    {"target": None}),  # After target ; first nt after target
                # Substit multi nt
                VCFRecord("artificial_chr1", 7, "alt_05", "CATGTATG",
                          ["GTACCCGC"], None, None,
                          {"target": None
                           }),  # Before target ; first nt before target
                VCFRecord("artificial_chr1", 11, "alt_06", "TATGTATG",
                          ["GTACCCGC"], None, None,
                          {"target": "target_2"}),  # Overlap target start
                VCFRecord("artificial_chr1", 13, "alt_07",
                          "TGTATGTGCTCACAAAGTA", ["CCCGCCCCTACATTGCAGT"], None,
                          None, {"target": "target_2"}),  # Include target
                VCFRecord("artificial_chr1", 15, "alt_08", "TATGTGCTCACAAA",
                          ["CGCCCCTACATTGC"], None, None,
                          {"target": "target_2"}),  # Exact target
                VCFRecord("artificial_chr1", 21, "alt_09", "CTCACAA",
                          ["GTACCCG"], None, None,
                          {"target": "target_2"}),  # Included by target
                VCFRecord("artificial_chr1", 24, "alt_10", "ACAAAGTA",
                          ["GTACCCG"], None, None,
                          {"target": "target_2"}),  # Overlap target end
                VCFRecord(
                    "artificial_chr1", 29, "alt_11", "GTAGTAGAT",
                    ["GTACCCGA"], None, None,
                    {"target": None}),  # After target ; first nt after target
                # Ins single nt
                VCFRecord("artificial_chr1", 14, "alt_12", "G", ["GA"], None,
                          None, {"target": None
                                 }),  # Before target ; first nt before target
                VCFRecord("artificial_chr1", 15, "alt_12.2", "-", ["A"], None,
                          None, {"target": None
                                 }),  # Before target ; first nt before target
                VCFRecord(
                    "artificial_chr1", 15, "alt_13", "A", ["TG"], None, None,
                    {"target": "target_2"}),  # On target ; first nt of target
                VCFRecord("artificial_chr1", 21, "alt_14", "C", ["CG"], None,
                          None, {"target": "target_2"}),  # On target
                VCFRecord("artificial_chr1", 27, "alt_15", "A", ["AT"], None,
                          None, {"target": "target_2"}),  # On target ; last nt
                VCFRecord("artificial_chr1", 28, "alt_15.2", "-", ["T"], None,
                          None, {"target": "target_2"}),  # On target ; last nt
                VCFRecord(
                    "artificial_chr1", 28, "alt_16", "A", ["AT"], None, None,
                    {"target": None}),  # After target ; first nt after target
                # Movable ins single nt
                VCFRecord(
                    "artificial_chr1", 14, "alt_17", "G", ["GT"], None, None,
                    {"target": "target_2"}),  # Movable to first nt of target
                VCFRecord(
                    "artificial_chr1", 28, "alt_18", "A", ["AA"], None, None,
                    {"target": "target_2"}),  # Movable to last nt of target
                # Del single nt
                VCFRecord("artificial_chr1", 14, "alt_19", "G", [""], None,
                          None, {"target": None
                                 }),  # Before target ; first nt before target
                VCFRecord(
                    "artificial_chr1", 15, "alt_20", "T", [""], None, None,
                    {"target": "target_2"}),  # On target ; first nt of target
                VCFRecord("artificial_chr1", 21, "alt_21", "C", [""], None,
                          None, {"target": "target_2"}),  # On target
                VCFRecord("artificial_chr1", 28, "alt_22", "A", [""], None,
                          None, {"target": "target_2"}),  # On target ; last nt
                VCFRecord(
                    "artificial_chr1", 29, "alt_23", "G", [""], None, None,
                    {"target": None}),  # After target ; first nt after target
                # Del multi nt
                VCFRecord("artificial_chr1", 11, "alt_24", "TATG", ["T"], None,
                          None, {"target": None
                                 }),  # Before target ; first nt before target
                VCFRecord(
                    "artificial_chr1", 13, "alt_25", "TGTA", ["T"], None, None,
                    {"target": "target_2"}),  # On target ; first nt of target
                VCFRecord("artificial_chr1", 20, "alt_26", "GCTC", ["G"], None,
                          None, {"target": "target_2"}),  # On target
                VCFRecord("artificial_chr1", 27, "alt_27", "AAGT", ["A"], None,
                          None, {"target": "target_2"}),  # On target ; last nt
                VCFRecord(
                    "artificial_chr1", 28, "alt_28", "AGT", ["A"], None, None,
                    {"target": None}),  # After target ; first nt after target
                # Movable del multi nt
                VCFRecord("artificial_chr1", 7, "alt_29", "CATGT", ["C"], None,
                          None,
                          {"target": "target_2"
                           }),  # On repeat and movable to first nt of target
                VCFRecord(
                    "artificial_chr1", 12, "alt_30", "ATG", ["A"], None, None,
                    {"target": "target_2"}),  # Movable to first nt of target
                VCFRecord(
                    "artificial_chr1", 28, "alt_31", "AGTA", ["A"], None, None,
                    {"target": "target_2"}),  # Movable to last nt of target
                VCFRecord("artificial_chr1", 30, "alt_32", "TAGT", ["T"], None,
                          None,
                          {"target": "target_2"
                           }),  # On repeat and movable to last nt of target
            ]
            # Write the variants in the order of declaration
            for idx, curr_var in enumerate(self.variants):
                FH_var.write(curr_var)
    def setUp(self):
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "filterVCFHomopolym.py", "--mode", "remove", "--homopolym-length",
            "4", "--input-variants", self.tmp_variants, "--input-reference",
            self.tmp_sequences, "--output-variants", self.tmp_output
        ]

        # Create fasta
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            #                                                    12  16  20  24  28  32  36  40  44  48  52  56  60  64  68  72  76  80  84  88  92  96  100
            #                                          2 4 6 8 10| 14| 18| 22| 26| 30| 34| 38| 42| 46| 50| 54| 58| 62| 66| 70| 74| 78| 82| 86| 90| 94| 98| 102
            #                                          | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
            FH_seq.write(
                Sequence(
                    "artificial_chr1",
                    "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
                ))
            FH_seq.write(
                Sequence(
                    "artificial_chr2",
                    "CGAATATGATCCAGCAATAAAAAGCTCCTACAGGCAAAAGTAGGCAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAA"
                ))
            FH_seq.write(
                Sequence(
                    "artificial_chr3",
                    "CGAATATGATCCAGCAATGAAAATTCCTACAGGTAAAACGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
                ))
            FH_seq.write(
                Sequence(
                    "artificial_chr4",
                    "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCAAAAGGATATTCTCGACAAAACAGCAGAAAGTCAAG"
                ))
            FH_seq.write(
                Sequence(
                    "artificial_chr5",
                    "CGAATATGATCCAGTAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGAGAAACCTGTCTCTTGGATATTCTCGACACAGCAGGTCAAG"
                ))
            FH_seq.write(
                Sequence(
                    "artificial_chr6",
                    "CGAATATGATCCAGCAATAAAAAGTTCCTACAGGAAAAAAGTAGAAAGCACAACCTGTCTCTTGGAAAATCTCGACACAGCAGGTAAAACAATGCAGTAAAT"
                ))
        """
        Variant	before_start	before_end	before_seq	after_start	after_end	after_seq
        alt_00	10	13	TCCA	15	18	CAAT
        alt_01	20	23	AAAA	25	28	TTCC
        alt_02	30	33	ACAG	35	38	AAAA
        alt_03	40	43	AGTA	45	48	AAAG
        alt_04	10	13	TCCA	16	19	AATA
        alt_05	20	23	AAAA	26	29	TCCT
        alt_06	30	33	ACAG	36	39	AAAA
        alt_07	40	43	GTAG	46	49	AAAG
        alt_08	11	14	CCAG	15	18	CAAT
        alt_09	20	23	AAAA	24	27	TTCC
        alt_10	31	34	AGGT	35	38	AAAA
        alt_11	40	43	GTAG	44	47	AAAG
        alt_12	11	14	CCAG	15	18	CAAT
        alt_13	20	23	AAAA	24	27	GTTC
        alt_14	31	34	CAGG	35	38	AAAA
        alt_15	41	44	GTAG	45	48	AAAG
        alt_16	50	53	GAAA	57	60	GTCA
        alt_17	60	63	AAAA	67	70	TATT
        alt_18	70	73	TCTC	77	80	AAAA
        alt_19	80	83	ACAG	87	90	AAAG
        alt_20	11	14	CCAG	16	19	AATA
        alt_21	20	23	AAAA	25	28	TTCC
        alt_22	31	34	CAGG	36	39	AAAA
        alt_23	40	43	AGTA	45	48	AAAG
        alt_24	11	14	CCAG	17	20	ATAA
        alt_25	19	22	AAAA	26	29	TCCT
        alt_26	29	32	TACA	35	38	AAAA
        alt_27	38	41	AAAG	45	48	AAAG
        alt_28	50	53	ACAA	61	64	CTTG
        alt_29	66	69	AAAA	76	79	CACA
        alt_30	76	79	CACA	86	89	AAAA
        alt_31	88	91	AACA	99	102	AAAT
        """

        # Create faidx
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write("""artificial_chr1	89	17	89	90
artificial_chr2	89	124	89	90
artificial_chr3	88	231	88	89
artificial_chr4	95	337	95	96
artificial_chr5	89	450	89	90
artificial_chr6	102	557	102	103""")

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "is_filtered":
                HeaderInfoAttr(
                    "is_filtered",
                    "1 if the variant is adjacent to an homopolymer.",
                    type="Integer",
                    number="1")
            }
            FH_var.writeHeader()
            self.variants = [
                # Substit single nt
                VCFRecord("artificial_chr1", 14, "alt_00", "G", ["T"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr1", 24, "alt_01", "G", ["T"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr1", 34, "alt_02", "G", ["T"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr1", 44, "alt_03", "G", ["T"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymers
                # Substit multi nt
                VCFRecord("artificial_chr2", 14, "alt_04", "GC", ["TA"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr2", 24, "alt_05", "GC", ["TA"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr2", 34, "alt_06", "GC", ["TA"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr2", 44, "alt_07", "GC", ["TA"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymers
                # Ins single nt
                VCFRecord("artificial_chr3", 14, "alt_08", "G", ["GT"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr3", 23, "alt_09", "A", ["AT"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr3", 34, "alt_10", "T", ["TA"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr3", 43, "alt_11", "G", ["GT"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymers
                # Ins multi nt
                VCFRecord("artificial_chr4", 14, "alt_12", "G", ["GTA"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr4", 23, "alt_13", "A", ["ATA"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr4", 34, "alt_14", "G", ["GTA"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr4", 44, "alt_15", "G", ["GTC"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymer
                VCFRecord("artificial_chr4", 54, "alt_16", "CCT", ["ATCCAGA"],
                          None, None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr4", 64, "alt_17", "GGA", ["CTCCAGT"], None,
                    None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr4", 74, "alt_18", "GAC", ["ATCCAGT"], None,
                    None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr4", 84, "alt_19", "CAG", ["ATCCAGT"], None,
                    None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymer
                # Del single nt
                VCFRecord("artificial_chr5", 14, "alt_20", "GT", ["G"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr5", 23, "alt_21", "AG", ["A"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr5", 34, "alt_22", "GA", ["G"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr5", 43, "alt_23", "AG", ["A"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymers
                # Del multi nt
                VCFRecord("artificial_chr6", 14, "alt_24", "GCA", ["G"], None,
                          None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr6", 23, "alt_25", "AGT", ["C"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr6", 32, "alt_26", "AGG", ["A"], None, None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr6", 42, "alt_27", "TAG", ["C"], None, None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymer
                VCFRecord("artificial_chr6", 54, "alt_28", "CCTGTCT", ["GAA"],
                          None, None,
                          {"is_filtered": 0}),  # Without adjacent homopolymers
                VCFRecord(
                    "artificial_chr6", 70, "alt_29", "TCTCGA", ["CCC"], None,
                    None,
                    {"is_filtered": 1}),  # Adjacent homopolymers upstream
                VCFRecord(
                    "artificial_chr6", 80, "alt_30", "GCAGGT", ["CCC"], None,
                    None,
                    {"is_filtered": 1}),  # Adjacent homopolymers downstream
                VCFRecord(
                    "artificial_chr6", 92, "alt_31", "ATGCAGT", ["CCC"], None,
                    None,
                    {"is_filtered": 0}),  # Adjacent too short homopolymer
            ]
            for idx, curr_var in enumerate(self.variants):
                FH_var.write(curr_var)
Exemplo n.º 16
0
    # Parse the command line; the parser itself is built above this fragment.
    args = parser.parse_args()

    # Logger initialisation
    # DEBUG level, each record prefixed with timestamp, file name, pid and level.
    logging.basicConfig(
        level=logging.DEBUG,
        format=
        '%(asctime)s -- [%(filename)s][pid:%(process)d][%(levelname)s] -- %(message)s'
    )
    log = logging.getLogger(os.path.basename(__file__))
    # Trace the full command line and the seed in the log.
    log.info(" ".join(sys.argv))
    log.info("Random seed used: {}".format(args.random_seed))

    # Get number of duplications by reads
    log.info("Get duplication count for each read")
    # Seed the global random generator before calling getNbOccur
    # (presumably so its draw is reproducible -- confirm in getNbOccur).
    random.seed(args.random_seed)
    nb_reads = FastaIO.nbSeq(args.input_R1)
    # NOTE(review): below 10000 reads an error is logged but nothing visible
    # here raises or exits, so processing continues -- confirm this is intended.
    if nb_reads < 10000:
        log.error(
            "The number of reads in {} is unsufficient to simulate duplication (found: {} ; expected: {})."
            .format(args.input_R1, nb_reads, 10000))
    nb_occurences = getNbOccur(args.duplication_profile, nb_reads)

    # Write reads
    log.info("Write reads")
    # Both outputs are opened in write mode; both inputs use FastaIO's default mode.
    with FastaIO(args.output_R1, "w") as FH_out_R1:
        with FastaIO(args.output_R2, "w") as FH_out_R2:
            with FastaIO(args.input_R1) as FH_in_R1:
                with FastaIO(args.input_R2) as FH_in_R2:
                    # Walk the duplication counts and the R1/R2 records in
                    # lockstep; zip() stops at the shortest of the three.
                    for curr_nb_occur, R1, R2 in zip(nb_occurences, FH_in_R1,
                                                     FH_in_R2):
                        # Tag text carrying the duplication count of this pair.
                        description = "dupCount={}".format(curr_nb_occur)
Exemplo n.º 17
0
    def setUp(self):
        """
        Build the fixture used to test standardizeVCF.py.

        Creates, in the system temporary folder and under a unique prefix:
        a reference FASTA with two artificial chromosomes, its .fai index and
        an input VCF. Also stores the command line to execute in ``self.cmd``
        and the written records in ``self.variants``.

        In each VCF record the INFO field ``expected`` holds the standardized
        form of every alt allele and ``expectedANN`` the standardized form of
        ``ANN``. In this fixture, annotations with an empty Alt_allele_idx
        never appear in ``expectedANN``.
        """
        tmp_folder = tempfile.gettempdir()
        unique_id = str(uuid.uuid1())

        # Temporary files
        self.tmp_sequences = os.path.join(tmp_folder, unique_id + ".fasta")
        self.tmp_faidx = os.path.join(tmp_folder, unique_id + ".fasta.fai")
        self.tmp_variants = os.path.join(tmp_folder, unique_id + ".vcf")
        self.tmp_output = os.path.join(tmp_folder, unique_id + "_out.vcf")

        # Exec command
        self.cmd = [
            "standardizeVCF.py",
            "--trace-unstandard",
            "--input-reference", self.tmp_sequences,
            "--input-variants", self.tmp_variants,
            "--output-variants", self.tmp_output
        ]

        # Create fasta
        with FastaIO(self.tmp_sequences, "w") as FH_seq:
            # Repeats:                                       ****....            ...***
            # Region:                                 |----|        |------------|         |------|
            FH_seq.write(Sequence("artificial_chr1", "CTCAGTCATGTATGTATGTGCTCACAAAGTAGTAGATCATGGCAC"))
            #                                         123456789| | | | | | | | | | | | | | | | | |
            #                                                  10| 14| 18| 22| 26| 30| 34| 38| 42|
            #                                                    12  16  20  24  28  32  36  40  44
            FH_seq.write(Sequence("artificial_chr2", "CGATNNNCGAT"))
            #                                         123456789|
            #                                                  10

        # Create faidx
        # Columns: name, length, offset, line bases, line width. The delimiters
        # are written as explicit "\t" so that they cannot be silently turned
        # into spaces by an editor. The offsets (17 and 80) assume that each
        # sequence is written on a single line after its ">name" header
        # -- TODO confirm against FastaIO.write.
        with open(self.tmp_faidx, "w") as FH_fai:
            FH_fai.write(
                "artificial_chr1\t45\t17\t45\t46\n"
                "artificial_chr2\t11\t80\t11\t12"
            )

        # Create VCF
        with VCFIO(self.tmp_variants, "w") as FH_var:
            FH_var.info = {
                "expected": HeaderInfoAttr("expected", "Standardized version of {chrom}:{pos}={ref}/{alt}.", type="String", number="."),
                "ANN": HeaderInfoAttr("ANN", "Annotation of variants Format: Allele|Annotation_id|Alt_allele_idx", type="String", number="."),
                "expectedANN": HeaderInfoAttr("expectedANN", "Standardized version of annotations Format: Allele|Annotation_id|Alt_allele_idx", type="String", number=".")
            }
            FH_var.writeHeader()
            self.variants = [
                # Substit single nt
                VCFRecord("artificial_chr1", 14, "sub_01", "G", ["T"], None, None, {
                    "expected": ["artificial_chr1:14=G/T"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 19, "sub_02", "T", ["A", "C"], None, None, {
                    "expected": ["artificial_chr1:19=T/A", "artificial_chr1:19=T/C"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["A|ann_1|0", "A|ann_2|0"]
                }),
                # Substit multi nt
                VCFRecord("artificial_chr1", 7, "sub_03", "CATGTATG", ["GTACCCGC"], None, None, {
                    "expected": ["artificial_chr1:7=CATGTATG/GTACCCGC"],
                    "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTGT|ann_3|"],
                    "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 11, "sub_04", "TATGTATG", ["GTACCCGC", "GTACCCAA"], None, None, {
                    "expected": ["artificial_chr1:11=TATGTATG/GTACCCGC", "artificial_chr1:11=TATGTATG/GTACCCAA"],
                    "ANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"],
                    "expectedANN": ["GTACCCGC|ann_1|0", "GTACCCGC|ann_2|0", "GTACCCAA|ann_3|1"]
                }),
                # Insertion single nt
                VCFRecord("artificial_chr1", 14, "ins_01", "G", ["GA"], None, None, {
                    "expected": ["artificial_chr1:14=G/GA"],
                    "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GT|ann_3|"],
                    "expectedANN": ["GA|ann_1|0", "GA|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_02", "-", ["A"], None, None, {
                    "expected": ["artificial_chr1:19=T/TA"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["TA|ann_1|0", "TA|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 14, "ins_03", "G", ["GA", "GC"], None, None, {
                    "expected": ["artificial_chr1:14=G/GA", "artificial_chr1:14=G/GC"],
                    "ANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1", "GT|ann_4|"],
                    "expectedANN": ["GA|ann_1|0", "GA|ann_2|0", "GC|ann_3|1"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_04", "-", ["A", "C"], None, None, {
                    "expected": ["artificial_chr1:19=T/TA", "artificial_chr1:19=T/TC"],
                    "ANN": ["A|ann_1|0", "A|ann_2|0", "C|ann_3|1", "T|ann_4|"],
                    "expectedANN": ["TA|ann_1|0", "TA|ann_2|0", "TC|ann_3|1"]
                }),
                # Insertion multi nt
                VCFRecord("artificial_chr1", 14, "ins_05", "G", ["GATGC"], None, None, {
                    "expected": ["artificial_chr1:14=G/GATGC"],
                    "ANN": ["GATGC|ann_1|0", "GATGC|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["GATGC|ann_1|0", "GATGC|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 20, "ins_06", "-", ["AAATC"], None, None, {
                    "expected": ["artificial_chr1:19=T/TAAATC"],
                    "ANN": ["AAATC|ann_1|0", "AAATC|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["TAAATC|ann_1|0", "TAAATC|ann_2|0"]
                }),
                # Movable insertion multi nt
                VCFRecord("artificial_chr1", 14, "ins_07", "G", ["GTG"], None, None, {
                    "expected": ["artificial_chr1:12=A/ATG"],
                    "ANN": ["GTG|ann_1|0", "GTG|ann_2|0", "GAAAC|ann_3|"],
                    "expectedANN": ["ATG|ann_1|0", "ATG|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 27, "ins_08", "A", ["AAAA"], None, None, {
                    "expected": ["artificial_chr1:25=C/CAAA"],
                    "ANN": ["AAAA|ann_1|0", "AAAA|ann_2|0", "CAAA|ann_3|"],
                    "expectedANN": ["CAAA|ann_1|0", "CAAA|ann_2|0"]
                }),
                # Deletion single nt
                VCFRecord("artificial_chr1", 14, "del_01", "G", [""], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 14, "del_02", "G", ["-"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["-|ann_1|0", "-|ann_2|0", "T|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 13, "del_03", "TG", ["T"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0"]
                }),
                VCFRecord("artificial_chr1", 13, "del_04", "TG", ["T", "-"], None, None, {
                    "expected": ["artificial_chr1:13=TG/T", "artificial_chr1:12=ATG/A"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "-|ann_3|1"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1"]
                }),
                # Movable deletion multi nt
                VCFRecord("artificial_chr1", 11, "del_05", "TATG", ["T", "TA", "-"], None, None, {
                    "expected": ["artificial_chr1:11=TATG/T", "artificial_chr1:12=ATG/A", "artificial_chr1:7=CATGT/C"],
                    "ANN": ["T|ann_1|0", "T|ann_2|0", "TA|ann_3|1", "-|ann_4|2"],
                    "expectedANN": ["T|ann_1|0", "T|ann_2|0", "A|ann_3|1", "C|ann_4|2"]
                }),
            ]
            # Write the records in declaration order.
            for curr_var in self.variants:
                FH_var.write(curr_var)
Exemplo n.º 18
0
 def testWrite(self):
     """
     Write every record of ``self.expected_rec`` to ``self.tmp_out`` with
     FastaIO, then check that the produced file is reported valid and that
     filecmp.cmp finds it equal to ``self.tmp_mono_line``.
     """
     with FastaIO(self.tmp_out, "w") as writer:
         for record in self.expected_rec:
             writer.write(record)
     # Validity is asserted before the comparison with the reference file.
     is_valid = FastaIO.isValid(self.tmp_out)
     self.assertTrue(is_valid)
     same_as_reference = filecmp.cmp(self.tmp_out, self.tmp_mono_line)
     self.assertTrue(same_as_reference)