Exemplo n.º 1
0
def uncorr_and_corr_with_rg(tmp_path):
    """Write a one-read uncorrected/corrected FASTQ pair and return the paths.

    Both records share the name ``foo/1_RG:Z:bar`` and the same quality
    string; they differ only in the middle base (``ATG`` vs ``ACG``).

    :param tmp_path: directory-like object supporting ``/`` in which the two
        files are created.

    :returns: ``(uncorrected_path, corrected_path)`` as strings.
    """
    uncorrected_path = tmp_path / 'uncorr_withrg.fq'
    corrected_path = tmp_path / 'corr_withrg.fq'

    read_name = 'foo/1_RG:Z:bar'
    phred = '((#'  # 7, 7, 2

    with open(uncorrected_path, 'w') as raw_out, \
            open(corrected_path, 'w') as fixed_out:
        raw_record = pysam.FastxRecord(
            name=read_name, sequence='ATG', quality=phred)
        fixed_record = pysam.FastxRecord(
            name=read_name, sequence='ACG', quality=phred)
        raw_out.write(str(raw_record))
        fixed_out.write(str(fixed_record))

    return str(uncorrected_path), str(corrected_path)
Exemplo n.º 2
0
    def test_createProbesForGeneVariants_emptyVariants_returnEmptyProbes(self):
        """An empty variant list must produce an empty probe string."""
        query = Query(TEST_QUERY_VCF, TEST_QUERY_REF)
        empty_gene = pysam.FastxRecord()
        no_variants = []

        actual = query._create_probes_for_gene_variants(empty_gene, no_variants)

        assert actual == ""
Exemplo n.º 3
0
def compress_seq(read):
    """Run-length encode the homopolymers of a basecall.

    Creates an RLE sequence record in which the quality string stores the
    run length of each compacted base rather than a base quality.

    :param read: `pysam FastxRecord` object.

    :returns: `pysam FastxRecord` carrying the compacted sequence, the name
        of `read`, and its comment (empty string when the comment is None).
    """
    logger = medaka.common.get_named_logger('Compress_basecalls')

    # Characters `!` (33) through `~` (126); a run length is used directly
    # as an index into this alphabet.
    alphabet = ''.join(map(chr, range(33, 127)))
    last_code = len(alphabet) - 1

    encoder = RLEConverter(read.sequence)

    # Run lengths that would index past the alphabet are clamped to its last
    # symbol, after warning once for the read.
    overflow = np.where(encoder.homop_length >= len(alphabet))[0]
    if len(overflow) > 0:
        logger.warning("Some homopolymers in {} are longer than the longest "
                       "supported length\n".format(read.name))
        encoder.homop_length[overflow] = last_code

    encoded_runs = ''.join(alphabet[n] for n in encoder.homop_length)
    comment = '' if read.comment is None else read.comment

    return pysam.FastxRecord(
        name=read.name,
        comment=comment,
        sequence=encoder.compact_basecall,
        quality=encoded_runs)
Exemplo n.º 4
0
 def test_fastx_record_can_be_created_from_scratch(self):
     """A blank record only stringifies once name and sequence are set."""
     record = pysam.FastxRecord()

     # Nothing set yet: converting to str must fail.
     with self.assertRaises(ValueError):
         str(record)

     # A name alone is still not enough.
     record.set_name("name")
     with self.assertRaises(ValueError):
         str(record)

     record.set_sequence("sequence")
     self.assertEqual(str(record), ">name\nsequence")
Exemplo n.º 5
0
def test_find_corrected_sites(simple_fastq_reads):
    """Check that a single edited base is reported as the only corrected site.

    For every read in the fixture, a copy is made whose base at index 5 is
    replaced by ``'C'``; ``recalibrate.find_corrected_sites`` is expected to
    return a boolean mask that is True only at index 5.

    :param simple_fastq_reads: iterable of FASTQ records exposing ``name``,
        ``sequence`` and ``quality``.
    """
    for r in simple_fastq_reads:
        r2 = pysam.FastxRecord(
            name=r.name, sequence=r.sequence, quality=r.quality)
        # NOTE(review): presumably the fixture reads do not already carry 'C'
        # at index 5, otherwise the copy would be unchanged -- confirm.
        edited_seq = list(r2.sequence)
        edited_seq[5] = 'C'
        r2.sequence = ''.join(edited_seq)
        # Use the builtin ``bool``: the ``np.bool`` alias was deprecated in
        # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
        correct = np.zeros(len(edited_seq), dtype=bool)
        correct[5] = True
        assert np.array_equal(recalibrate.find_corrected_sites(r, r2), correct)
Exemplo n.º 6
0
    def test_compress_read(self):
        """Check each field of the RLE-compressed form of a known record."""
        read = pysam.FastxRecord(
            name='test',
            comment='runid=b81',
            sequence='ACCGTTTAC')
        compressed_read = medaka.rle.compress_seq(read)

        # Name and comment are expected to be carried over unchanged.
        self.assertEqual(compressed_read.name, read.name)
        self.assertEqual(compressed_read.comment, read.comment)
        # Expected compacted sequence and the quality string that
        # accompanies it.
        self.assertEqual(compressed_read.sequence, 'ACGTAC')
        self.assertEqual(compressed_read.quality, '"#"$""')
Exemplo n.º 7
0
def test_recalibrate_fastq():
    """Recalibrate a three-base read and compare against fixed qualities."""
    read = pysam.FastxRecord(
        name='foo',
        sequence='ATG',
        quality='((#',  # 7, 7, 2
    )

    # Covariate tables passed to the recalibrator.
    meanq = np.array([10])
    globaldeltaq = np.array([1])
    qscoredeltaq = np.array([[2] * 8])

    # Only row 7 of the position and dinucleotide tables is non-zero.
    positiondeltaq = np.zeros((1, 8, 6))
    positiondeltaq[0, 7, :] = 3
    dinucdeltaq = np.zeros([1, 8, 16])
    dinucdeltaq[0, 7, :] = 5

    recalibrated = compare_reads.recalibrate_fastq(
        read, meanq,
        globaldeltaq, qscoredeltaq, positiondeltaq, dinucdeltaq,
        np.array([0]), compare_reads.Dinucleotide.dinuc_to_int)
    expected = np.array([21, 21, 2])

    assert np.array_equal(recalibrated, expected)
Exemplo n.º 8
0
    def _create_probes_for_gene_variants(
            self, gene: pysam.FastxRecord,
            variants: pysam.tabix_iterator) -> str:
        """Build probe records for the valid variants of a single gene.

        For each variant that passes ``is_invalid_vcf_entry``, the probe
        interval is taken from ``calculate_probe_boundaries_for_entry``, the
        matching slice of ``gene.sequence`` is cut out, and the reference
        allele inside it is replaced by the variant sequence. When several
        variants share an interval, a stored probe is only replaced if its
        genotype confidence is not strictly greater than the new variant's.

        Note: An assumption is made with this function that the variants you
        pass in are from the gene passed with them.

        :param gene: record whose ``sequence`` the probes are cut from.
        :param variants: iterable of VCF entries for ``gene``.

        :returns: the string form of every kept probe, each followed by a
            newline; an empty string when there are no valid variants.
        """
        valid_variants = [
            entry for entry in variants if not is_invalid_vcf_entry(entry)
        ]

        intervals_to_probes = {}

        for variant in valid_variants:
            interval = self.calculate_probe_boundaries_for_entry(variant)
            confidence = get_genotype_confidence(variant)

            # The confidence of an already stored probe is recovered from the
            # trailing "GT_CONF=<value>" part of its name.
            existing = intervals_to_probes.get(interval)
            if existing is not None and float(
                    existing.name.split("=")[-1]) > confidence:
                continue

            consensus = gene.sequence[slice(*interval)]
            # Offset of the variant inside the probe slice.
            start_idx_of_variant_on_consensus = variant.start - interval[0]
            end_idx_of_variant_on_consensus = (
                start_idx_of_variant_on_consensus + variant.rlen)
            mutated_consensus = (
                consensus[:start_idx_of_variant_on_consensus]
                + get_variant_sequence(variant)
                + consensus[end_idx_of_variant_on_consensus:])

            probe = pysam.FastxRecord()
            probe.set_name(
                f"{variant.chrom}_POS={variant.pos}_interval={interval}_GT_CONF={confidence}"
                .replace(" ", ""))
            probe.set_sequence(mutated_consensus)
            intervals_to_probes[interval] = probe

        return "".join(
            str(probe) + "\n" for probe in intervals_to_probes.values())
Exemplo n.º 9
0
def test_mapProbesToPanel_oneRecordFirstBaseIsVariantSite():
    """One probe maps to T16509G with a correct call and zero mismatches."""
    probe_name = "GC00004785_pos168_entry0_CONF123.45"
    record = pysam.FastxRecord()
    record.set_name(probe_name)
    record.set_sequence(
        "GACCTACACCGACGCCAAAGGCGAAAAACGCCCAATGTACCAAATCACCAAAAAC")
    panel = Path(TEST_PANEL)

    actual = map_probes_to_panel(str(record), panel)

    assert actual == dict(
        snps_called_correctly=[True],
        mismatches=[0],
        ids=[probe_name],
        ref_ids=["T16509G"],
        total_pandora_calls=1,
        pandora_calls_crossing_ref_site=1,
        reference_sites_called=1,
    )
Exemplo n.º 10
0
def test_mapProbesToPanel_oneRecordDoesntMap():
    """A poly-T probe is counted as a call but yields no panel hits."""
    probe_name = "GC00004785_pos168_entry0_CONF123.45"
    record = pysam.FastxRecord()
    record.set_name(probe_name)
    record.set_sequence("T" * 60)
    panel = Path(TEST_PANEL)

    actual = map_probes_to_panel(str(record), panel)

    assert actual == dict(
        snps_called_correctly=[],
        mismatches=[],
        ids=[],
        ref_ids=[],
        total_pandora_calls=1,
        pandora_calls_crossing_ref_site=0,
        reference_sites_called=0,
    )
Exemplo n.º 11
0
def test_mapProbesToPanel_oneRecordSnpNotCalledNoOtherMismatches():
    """One probe maps to T16509G with an incorrect call and one mismatch."""
    probe_name = "GC00004785_pos168_entry0_CONF123.45"
    record = pysam.FastxRecord()
    record.set_name(probe_name)
    record.set_sequence(
        "TTAACGCCCTCAATTTTGAGGACGTAACCTACACCGACGCCAAAGGCGAA")
    panel = Path(TEST_PANEL)

    actual = map_probes_to_panel(str(record), panel)

    assert actual == dict(
        snps_called_correctly=[False],
        mismatches=[1],
        ids=[probe_name],
        ref_ids=["T16509G"],
        total_pandora_calls=1,
        pandora_calls_crossing_ref_site=1,
        reference_sites_called=1,
    )
Exemplo n.º 12
0
def test_mapProbesToPanel_oneRecordSnpCalledTwoMismatches():
    """One probe maps to C15154T with a correct call and two mismatches."""
    probe_name = "GC00004785_pos168_entry0_CONF123.45"
    record = pysam.FastxRecord()
    record.set_name(probe_name)
    record.set_sequence(
        "ACGTCGTGAGCAGGATATAAAAGCATTACGCCCACAAATCTATGCTCCCA")
    panel = Path(TEST_PANEL)

    actual = map_probes_to_panel(str(record), panel)

    assert actual == dict(
        snps_called_correctly=[True],
        mismatches=[2],
        ids=[probe_name],
        ref_ids=["C15154T"],
        total_pandora_calls=1,
        pandora_calls_crossing_ref_site=1,
        reference_sites_called=1,
    )
Exemplo n.º 13
0
def test_mapProbesToPanel_oneRecordMapsToPanelButToRightOfVariant():
    """The probe is counted as a call but crosses no reference site."""
    probe_name = "GC00004785_pos168_entry0_CONF123.45"
    probe_sequence = (
        "ACCTACACCGACGCCAAAGGCGAAAAACGCCCAATGTACCAAATCACCAAAAACGGCTTCGTCTTCCTGGTGATGGGATTCACT"
    )
    record = pysam.FastxRecord()
    record.set_name(probe_name)
    record.set_sequence(probe_sequence)
    panel = Path(TEST_PANEL)

    actual = map_probes_to_panel(str(record), panel)

    assert actual == dict(
        snps_called_correctly=[],
        mismatches=[],
        ids=[],
        ref_ids=[],
        total_pandora_calls=1,
        pandora_calls_crossing_ref_site=0,
        reference_sites_called=0,
    )
Exemplo n.º 14
0
        np.array([[0,0,0,0,0,0,0,2] + [0] * 35]), #q
        correct_pos_errs, #pos
        correct_pos_total, #pos
        correct_dinuc_errs, #dinuc
        correct_dinuc_total] #diunc

    for a,b in zip(correct_vectors, recalibrate.fastq_to_covariate_arrays(
        uncorr_and_corr_fastq_files)):
            assert np.array_equal(a,b)
    for a,b in zip(correct_vectors, recalibrate.fastq_to_covariate_arrays(
        uncorr_and_corr_with_rg, infer_rg = True)):
            assert np.array_equal(a,b)

# Expected output record, referenced by the test below.
correct_read = pysam.FastxRecord(
    name="foo", sequence="ATG", quality="''#")  # 6, 6, 2

# Same sequence and quality, with the read group embedded in the name.
correct_read_with_rg = pysam.FastxRecord(
    name="foo/1_RG:Z:bar", sequence="ATG", quality="''#")

def test_recalibrate_fastq(uncorr_and_corr_fastq_files, uncorr_and_corr_with_rg, capfd):
    """Check the record that ``recalibrate.recalibrate_fastq`` prints to stdout.

    The first call is run on ``uncorr_and_corr_fastq_files`` and its captured
    stdout must equal ``str(correct_read)`` followed by a newline. The second
    call repeats the run on ``uncorr_and_corr_with_rg`` with ``infer_rg = True``.

    :param uncorr_and_corr_fastq_files: input passed to the first call.
    :param uncorr_and_corr_with_rg: input passed to the ``infer_rg`` call.
    :param capfd: capture object providing ``readouterr()``.
    """
    recalibrate.recalibrate_fastq(uncorr_and_corr_fastq_files)
    captured = capfd.readouterr()
    assert captured.out == str(correct_read) + '\n'

    # Repeat the run with infer_rg = True.
    # NOTE(review): the output captured below is never asserted in the visible
    # code; presumably a comparison against correct_read_with_rg follows --
    # confirm the test was not truncated.
    recalibrate.recalibrate_fastq(uncorr_and_corr_with_rg, infer_rg = True)
    captured = capfd.readouterr()
Exemplo n.º 15
0
# Output paths for the barcode (R2) and second-input (R3) FASTQ files.
r2_out_file = os.path.join(outdir, prefix + "_R2.fastq")
r3_out_file = os.path.join(outdir, prefix + "_R3.fastq")

start_time = time.time()
print(
    "Start to pre-process fastq file",
    time.strftime("%a %b %d %H:%M:%S %Y", time.localtime()),
)

with pysam.FastxFile(r1_in_file) as r1_in, \
        pysam.FastxFile(r2_in_file) as r2_in, \
        open(r1_out_file, "w") as r1_out, \
        open(r2_out_file, "w") as r2_out, \
        open(r3_out_file, "w") as r3_out:
    for entry in r1_in:
        # The two inputs are read in lockstep: one record from the second
        # input for every record of the first.
        r3_entry = next(r2_in)

        # The barcode is the last "-"-separated token of the second
        # ":"-separated field of the read name.
        barcode = entry.name.split(":")[1].split("-")[-1]

        # R1: the input record, unchanged.
        r1_out.write(str(entry) + "\n")

        # R2: the barcode as sequence, paired with the leading qualities of
        # the R1 record (as many as there are barcode bases).
        barcode_entry = pysam.FastxRecord(
            name=entry.name,
            comment=entry.comment,
            sequence=barcode,
            quality=entry.quality[0:len(barcode)],
        )
        r2_out.write(str(barcode_entry) + "\n")

        # R3: the record from the second input, unchanged.
        r3_out.write(str(r3_entry) + "\n")

end_time = time.time()
print("End", end_time - start_time)