Exemplo n.º 1
0
def model_region_singletons(data_container, vcf_path, fasta_path, kmer_size,
                            region):
    """Tally reference k-mers and singleton-SNV transitions for one region.

    The region sequence (padded by half a k-mer on each side) is fetched from
    the FASTA and its k-mers are counted with ``ek.kmer_search``. Every VCF
    record in the region accepted by ``ek.is_singleton_snv`` then adds one to
    the ALT-base slot of the k-mer surrounding it. Both tallies are merged
    into the object held by ``data_container``.

    Args:
        data_container: Holder exposing ``get()`` / ``set()``; the held object
            must provide ``add_kmer_counts`` and ``add_transition``.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        kmer_size: K-mer length (numeric).
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.

    Returns:
        None. If the region cannot be fetched (``KeyError`` / ``FetchError``)
        a message is written to stderr and nothing is merged.
    """
    began = time.time()
    genome = Fasta(fasta_path)
    calls = VCF(vcf_path)
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        # Only dash-stranded regions are complemented (not reversed).
        on_minus = region.strand is not None and ek.is_dash(region.strand)
        record = genome.get_seq(region.chrom, region.start - kmer_mid_idx,
                                region.stop + kmer_mid_idx)
        if on_minus:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return

    # Single-process search: regions are expected to be short.
    region_ref_counts = ek.kmer_search(sequence, kmer_size)
    r_string = '%s:%s-%s' % (region.chrom, region.start, region.stop)

    # k-mer -> per-ALT-base counters, ordered A, C, G, T.
    transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in calls(r_string):
        if not ek.is_singleton_snv(variant):
            continue
        new_var = Variant(variant=variant, fields=['vep'])
        # Slice bounds chosen so that index ``kmer_mid_idx`` of the window is
        # the variant's own base (POS is 1-based, the slice is 0-based).
        window_start = new_var.POS - start_idx_offset
        window_stop = new_var.POS + kmer_mid_idx
        adj_seq = genome[str(new_var.CHROM)][window_start:window_stop].seq
        fasta_ref = adj_seq[kmer_mid_idx]
        if str(fasta_ref).upper() != str(variant.REF).upper():
            # A mismatch is reported but the variant is still counted.
            print(
                'WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                (fasta_ref, variant.REF),
                file=sys.stderr,
                flush=True)
        if ek.complete_sequence(adj_seq):
            transitions[adj_seq.upper()][nuc_idx[new_var.ALT[0]]] += 1

    # Read-modify-write of the shared accumulator.
    accumulator = data_container.get()
    accumulator.add_kmer_counts(region_ref_counts)
    accumulator.add_transition(transitions)
    data_container.set(accumulator)

    print('Finished region %s in %s' % (str(region), str(time.time() - began)),
          flush=True)
    return
Exemplo n.º 2
0
    def test_get_seq_rc(self):
        """get_seq returns the forward sequence, or its reverse complement with rc=True."""
        genome = Fasta('data/chr17.hg19.part.fa')

        # Forward strand: echo the values, then check them.
        forward = genome.get_seq("chr17", 11, 20, rc=False)
        forward_expected = "CCCTGTTCCT"
        for line in ("normal", forward.seq, forward_expected):
            print(line)
        assert forward.seq == forward_expected

        # Reverse complement: checked first, echoed only on success.
        reverse = genome.get_seq("chr17", 11, 20, rc=True)
        reverse_expected = "AGGAACAGGG"
        assert reverse.seq == reverse_expected
        for line in ("rc", reverse.seq, reverse_expected):
            print(line)
Exemplo n.º 3
0
 def test_get_seq_rc(self):
     """get_seq returns the forward sequence, or its reverse complement with rc=True."""
     genome = Fasta('data/chr17.hg19.part.fa')

     # Forward strand: echo the values, then check them.
     forward = genome.get_seq("chr17", 11, 20, rc=False)
     forward_expected = "CCCTGTTCCT"
     for line in ("normal", forward.seq, forward_expected):
         print(line)
     assert forward.seq == forward_expected

     # Reverse complement: checked first, echoed only on success.
     reverse = genome.get_seq("chr17", 11, 20, rc=True)
     reverse_expected = "AGGAACAGGG"
     assert reverse.seq == reverse_expected
     for line in ("rc", reverse.seq, reverse_expected):
         print(line)
Exemplo n.º 4
0
class FastaStringExtractor(BaseExtractor):
    """Extract sequences from a FASTA file as plain strings.

    NOTE: The extractor is not thread-safe.
    If you wish to use it with multiprocessing,
    create a new extractor object in each process.

    # Arguments
      fasta_file (str): path to the FASTA file
      use_strand (bool): if True, the extracted sequence
        is reverse complemented in case interval.strand == "-"
      force_upper (bool): force upper-case output
    """

    def __init__(self, fasta_file, use_strand=False, force_upper=False):
        # Imported lazily so pyfaidx is only needed once an extractor is built.
        from pyfaidx import Fasta

        self.fasta_file = fasta_file
        self._use_strand = use_strand
        self.fasta = Fasta(self.fasta_file)
        self.force_upper = force_upper

    def extract(self, interval: Interval, use_strand=None, **kwargs) -> str:
        """
        Return the FASTA sequence of ``interval`` as a string.

        Args:
            interval: the interval to query; its ``start`` is shifted by +1
                before being handed to pyfaidx.
            use_strand (bool, optional): if True, the extracted sequence
                is reverse complemented in case interval.strand == "-".
                Overrides `self.use_strand`
            **kwargs: accepted and ignored.

        Returns:
            sequence of the requested interval, upper-cased when
            ``force_upper`` is set.

        """
        # NOTE(review): ``self.use_strand`` is read here while ``__init__``
        # only assigns ``self._use_strand``; presumably BaseExtractor exposes
        # it as a property -- confirm.
        strand_aware = self.use_strand if use_strand is None else use_strand
        reverse = strand_aware and interval.strand == "-"

        # pyfaidx wants a 1-based interval
        record = self.fasta.get_seq(
            interval.chrom,
            interval.start + 1,
            interval.stop,
            rc=reverse
        )
        sequence = str(record.seq)

        return sequence.upper() if self.force_upper else sequence

    def close(self):
        """Close the underlying pyfaidx handle and return its result."""
        return self.fasta.close()
Exemplo n.º 5
0
def process_chrom_bin(region, kmer_size, vcf_path, fasta_path, AF=False):
    """Compute per-k-mer frequencies of singleton SNVs within one bin.

    Args:
        region: Object with ``chrom``, ``start`` and ``stop``.
        kmer_size: K-mer length (numeric).
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        AF: When truthy, each singleton contributes its ``AF`` INFO value
            (float counters) instead of 1 (unsigned integer counters).

    Returns:
        ``None`` when the region cannot be fetched from the FASTA; otherwise
        ``(region, freq)`` where ``freq`` is the ``'freq'`` column as a dict
        (including ``'GC_content'`` and ``'N_count'`` entries), or ``None``
        when no transitions or no reference k-mers were found.
    """
    began = time.time()
    genome = Fasta(fasta_path)
    calls = VCF(vcf_path)
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        sequence = genome.get_seq(region.chrom, region.start, region.stop).seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region), file=sys.stderr)
        return

    # Single-process search: regions are expected to be short.
    region_ref_counts, gc_content, n_count = ek.kmer_search(
        sequence, kmer_size, count_gc=True, count_n=True)
    r_string = '%s:%s-%s' % (region.chrom, region.start, region.stop)

    # Float counters accumulate allele frequencies; unsigned longs count events.
    typecode = 'd' if AF else 'L'
    transitions = defaultdict(lambda: array.array(typecode, [0, 0, 0, 0]))
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in calls(r_string):
        if not ek.is_singleton_snv(variant):
            continue
        new_var = Variant(variant=variant)
        # Slice bounds chosen so that index ``kmer_mid_idx`` of the window is
        # the variant's own base (POS is 1-based, the slice is 0-based).
        window_start = new_var.POS - start_idx_offset
        window_stop = new_var.POS + kmer_mid_idx
        adj_seq = genome[str(new_var.CHROM)][window_start:window_stop].seq
        fasta_ref = adj_seq[kmer_mid_idx]
        if str(fasta_ref).upper() != str(variant.REF).upper():
            # A mismatch is reported but the variant is still counted.
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' % (fasta_ref, variant.REF),
                  file=sys.stderr, flush=True)
        if ek.complete_sequence(adj_seq):
            slot = nuc_idx[new_var.ALT[0]]
            if AF:
                transitions[adj_seq.upper()][slot] += variant.INFO.get('AF')
            else:
                transitions[adj_seq.upper()][slot] += 1

    def report_done():
        print('Finished region %s in %s' % (str(region), str(time.time() - began)), flush=True)

    if len(transitions.keys()) == 0 or len(region_ref_counts.keys()) == 0:
        report_done()
        return region, None

    bin_trans = pd.DataFrame.from_dict(transitions, orient='index')
    bin_trans = bin_trans.sort_index()
    bin_kcounts = pd.DataFrame.from_dict(region_ref_counts, orient='index')
    bin_kcounts = bin_kcounts.sort_index()
    bin_trans['counts'] = bin_kcounts[0]
    # NOTE(review): ``apply`` runs column-wise by default; the helper's name
    # suggests it expects rows (axis=1) -- confirm against row_multinomial.
    bin_trans['freq'] = bin_trans.apply(row_multinomial)
    bin_trans.loc['GC_content', 'freq'] = gc_content
    bin_trans.loc['N_count', 'freq'] = n_count
    report_done()
    return region, bin_trans['freq'].to_dict()
Exemplo n.º 6
0
def check_clinvar(vcf_path,
                  fasta_path,
                  kmer_size,
                  left_context=0,
                  right_context=0,
                  counts_path=None):
    """Score the sequence context of every autosomal variant in a VCF.

    Args:
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        kmer_size: K-mer length handed to ``KmerWindow``.
        left_context: Bases to include upstream of ``POS``.
        right_context: Bases to include downstream of ``POS``.
        counts_path: Forwarded to ``KmerWindow``.

    Returns:
        A list of ``(CHROM, POS, CLNSIG, expected, sequence)`` tuples, one per
        variant whose chromosome is in ``get_autosome_names_grch38()``.
    """
    # VCF positions are 1-based and closed, which is what get_seq is given;
    # BED-style (0-based, half-open) coordinates are not used here.
    autosomes = get_autosome_names_grch38()
    calls = VCF(vcf_path)
    genome = Fasta(fasta_path, read_ahead=10_000_000)
    scorer = KmerWindow(kmer_size, counts_path=counts_path)

    def describe(variant):
        context = genome.get_seq(variant.CHROM,
                                 variant.POS - left_context,
                                 variant.POS + right_context).seq.upper()
        return (variant.CHROM, variant.POS, variant.INFO.get("CLNSIG"),
                scorer.calculate_expected(context), context)

    return [describe(variant) for variant in calls if variant.CHROM in autosomes]
Exemplo n.º 7
0
def match_seq(rec: pd.Series, sequences: pyfaidx.Fasta) -> pyfaidx.Sequence:
    """Build a named pyfaidx.Sequence for one annotation feature.

    Parameters
    ----------
    rec : :class:`~pandas.Series`
        One feature row (gene, exon, ...). The following entries are read:
        seqname, gene_name, feature, strand, start, end, seq_hash.
    sequences : :class:`~pyfaidx.Fasta`
        Indexed FASTA the feature coordinates refer to.

    Returns
    -------
    :class:`~pyfaidx.Sequence` named
    ``<gene_name>_<feature>_<strand>_<start>_<end>_<seq_hash>`` holding the
    bases of the feature (reverse complemented when ``strand`` is ``"-"``).
    If a ``ValueError`` is raised while building it, a diagnostic line is
    printed and ``None`` is returned instead.
    """

    try:
        on_reverse: bool = bool(rec["strand"] == "-")

        label = (f"{rec['gene_name']}_"
                 f"{rec['feature']}_"
                 f"{rec['strand']}_"
                 f"{rec['start']}_"
                 f"{rec['end']}_"
                 f"{rec['seq_hash']}")
        bases = sequences.get_seq(name=rec["seqname"],
                                  start=rec["start"],
                                  end=rec["end"],
                                  rc=on_reverse).seq
        return pyfaidx.Sequence(name=label, seq=bases)
    except ValueError:
        print(f"problem with {rec['gene_name']} {rec['start']} "
              f"{rec['end']} {rec['seqname']} {rec['strand']}")
        return None
Exemplo n.º 8
0
def model_region_nonsingletons(data_container, vcf_path, fasta_path, kmer_size,
                               region, AC_cutoff):
    """Tally reference k-mers plus AC/AN-weighted SNV transitions for a region.

    Args:
        data_container: Holder exposing ``get()`` / ``set()``; the held object
            must provide ``add_kmer_counts``, ``add_transition`` and
            ``add_transition2``.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        kmer_size: K-mer length; converted with ``int``. An invalid value
            prints a message to stderr and terminates via ``exit(1)``.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.
        AC_cutoff: Optional cutoff forwarded to ``ek.is_quality_snv``. A value
            for which ``int()`` raises ``ValueError`` is replaced by ``None``
            after a stderr message.

    Returns:
        None. If the region cannot be fetched (``KeyError`` / ``FetchError``)
        a message is written to stderr and nothing is merged.
    """
    if AC_cutoff is not None:
        try:
            AC_cutoff = int(AC_cutoff)
        except ValueError:
            AC_cutoff = None
            print(
                'AC cutoff must be a positive integer. Ignoring user value and using SNVs with any AC.',
                file=sys.stderr,
                flush=True)

    try:
        kmer_size = int(kmer_size)
        if kmer_size < 1:
            raise ValueError
    except ValueError:
        print('kmer_size must be a positive integer. Please check arguments.',
              file=sys.stderr,
              flush=True)
        exit(1)

    began = time.time()
    genome = Fasta(fasta_path)
    calls = VCF(vcf_path)
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        # Only dash-stranded regions are complemented (not reversed).
        on_minus = region.strand is not None and ek.is_dash(region.strand)
        record = genome.get_seq(region.chrom, region.start - kmer_mid_idx,
                                region.stop + kmer_mid_idx)
        if on_minus:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return

    # Single-process search: regions are expected to be short.
    region_ref_counts = ek.kmer_search(sequence, kmer_size)
    r_string = '%s:%s-%s' % (region.chrom, region.start, region.stop)

    def new_counters():
        # One unsigned counter per ALT base, ordered A, C, G, T.
        return array.array('L', [0, 0, 0, 0])

    ac_transitions = defaultdict(new_counters)
    an_transitions = defaultdict(new_counters)
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in calls(r_string):
        if not ek.is_quality_snv(variant, AC_cutoff=AC_cutoff):
            continue
        new_var = Variant(variant=variant)
        # Slice bounds chosen so that index ``kmer_mid_idx`` of the window is
        # the variant's own base (POS is 1-based, the slice is 0-based).
        window_start = new_var.POS - start_idx_offset
        window_stop = new_var.POS + kmer_mid_idx
        adj_seq = genome[str(new_var.CHROM)][window_start:window_stop].seq
        fasta_ref = adj_seq[kmer_mid_idx]
        if str(fasta_ref).upper() != str(variant.REF).upper():
            # A mismatch is reported but the variant is still counted.
            print(
                'WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                (fasta_ref, variant.REF),
                file=sys.stderr,
                flush=True)
        if ek.complete_sequence(adj_seq):
            ac_transitions[adj_seq.upper()][nuc_idx[
                new_var.ALT[0]]] += new_var.AC
            an_transitions[adj_seq.upper()][nuc_idx[
                new_var.ALT[0]]] += new_var.AN

    # Read-modify-write of the shared accumulator.
    accumulator = data_container.get()
    accumulator.add_kmer_counts(region_ref_counts)
    accumulator.add_transition(ac_transitions)
    accumulator.add_transition2(an_transitions)
    data_container.set(accumulator)

    print('Finished region %s in %s' % (str(region), str(time.time() - began)),
          flush=True)
    return
Exemplo n.º 9
0
class FastaReader:
    """Class for reading and querying fasta file."""

    def __init__(self, fasta_location):
        """
        Parameters
        ---------
        fasta_location : string
                         Path to fasta file

        """
        self.fasta_location = fasta_location
        try:
            self.fasta = Fasta(fasta_location, as_raw=True, sequence_always_upper=True)
        except Exception as e:
            raise Exception(
                "Error reading fasta file {} : {}".format(
                    os.path.abspath(self.fasta_location), e
                )
            )

    def query(self, intervals):
        """Query regions for sequence.

        Parameters
        ----------
        intervals: list of Interval
                   The intervals for fasta is one-based and full-closed

        Returns
        -------
        sequences: list(str)
                   An array containing scores for each Interval
                   This function is agnostic of the strand information,
                   the position in the scores is corresponding to the interval

        """
        sequences = []
        chrom_lengths = self.chromosomes
        for i in intervals:
            if i.chrom not in list(chrom_lengths.keys()):
                warnings.warn(
                    "Chromosome {} does not appear in the fasta".format(i.chrom),
                    UserWarning,
                )
            else:
                chrom_length = chrom_lengths[i.chrom]
                if i.start > chrom_length:
                    raise Exception(
                        "Chromsome start point exceeds chromosome length: {}>{}".format(
                            i.start, chrom_length
                        )
                    )
                elif i.end > chrom_length:
                    raise Exception(
                        "Chromsome end point exceeds chromosome length: {}>{}".format(
                            i.end, chrom_length
                        )
                    )
                seq = self.fasta.get_seq(i.chrom, i.start, i.end)
                sequences.append(seq)
        return sequences

    def complement(self, seq):
        """Complement a FASTA sequence.

        Parameters
        ----------
        seq: str
            String fasta sequence


        Returns
        -------
        complement_seq: str
                        complemenet of input fasta
        """
        complement_letters = {"A": "T", "C": "G", "T": "A", "G": "C"}
        seq = seq.upper()
        comp = []
        for nuc in seq:
            if nuc in complement_letters:
                comp.append(complement_letters[nuc])
            else:
                comp.append(nuc)
        return "".join(comp)

    def reverse_complement(self, seq):
        """Reverse-complment a FASTA sequence.

        Parameters
        ----------
        seq: str
            String fasta sequence


        Returns
        -------
        complement_seq: str
                        complemenet of input fasta
        """
        seq = seq.upper()
        return self.complement(seq)[::-1]

    @property
    def chromosomes(self):
        """Return list of chromsome and their sizes
        as in the fasta file.

        Returns
        -------
        chroms : dict
                 Dictionary with {"chr": "Length"} format


        .. currentmodule:: .FastaReader
        .. autosummary::
            .FastaReader
        """
        chroms = OrderedDict()
        for chrom in list(self.fasta.keys()):
            chroms[chrom] = len(self.fasta[chrom])
        return chroms
Exemplo n.º 10
0
def query_bed_region(region, vcf_path, fasta, kmer_size, counts_path,
                     count_frequency):
    """Summarise observed versus expected variation for one BED region.

    @param region:          Object with ``chrom``, ``start``, ``stop``, ``strand`` and ``printstr()``
    @param vcf_path:        Path passed to ``VCF``
    @param fasta:           Path passed to ``Fasta``
    @param kmer_size:       K-mer length handed to ``KmerWindow``
    @param counts_path:     Forwarded to ``KmerWindow``; critical for count_frequency to work (needs table of expected AF)
    @param count_frequency: When truthy report AC, AN, AF and the expectation;
                            otherwise report non-observed count, observed count,
                            the expectation and observed / expected
    @return:                Tab-separated line: region followed by the four fields.
                            All four fields are 0 when a ``KeyError`` or
                            ``FetchError`` occurs while computing them.
    """
    # TODO: Add binning somehow (either keep equal size or equal number of bins
    calls = VCF(vcf_path)
    fasta = Fasta(fasta)
    window = KmerWindow(kmer_size, counts_path=counts_path)
    # The first kmer is centered on the first nucleotide of the region, so the
    # fetched span is widened by half a kmer on both sides.
    shift = kmer_size // 2
    try:
        # Only dash-stranded regions are complemented (not reversed).
        on_minus = region.strand is not None and is_dash(region.strand)
        record = fasta.get_seq(region.chrom, region.start - shift,
                               region.stop + shift)
        if on_minus:
            record = record.complement
        sequence = record.seq.upper()
        # The expectation is computed on the strand-adjusted sequence.
        exp = window.calculate_expected(sequence)
        if count_frequency:
            AF, AC, AN = count_regional_AF(calls(str(region)))
            fields = (AC, AN, AF, exp)
        else:
            # Variant counting itself ignores strand.
            all_vars, observed_variants = count_regional_variants(
                calls(str(region)))
            unobserved = all_vars - observed_variants
            ratio = 0 if exp == 0 else observed_variants / exp
            fields = (unobserved, observed_variants, exp, ratio)
    except (KeyError, FetchError):
        fields = (0, 0, 0, 0)

    rendered = [str(value) for value in fields]
    print('{0:<30} {1:>10} {2:>20} {3:>20} {4:>20}'.format(
        region.printstr(delim=' '), *rendered),
          flush=True)
    return "%s\t%s\t%s\t%s\t%s\n" % tuple([region.printstr()] + rendered)
Exemplo n.º 11
0
def process_bed_region(region, kmer_size, vcf_path, fasta_path, AF=False, delim=','):
    """Print one delimited row of per-k-mer singleton frequencies for a region.

    The row is ``region.str_name()`` followed by one value per k-mer returned
    by ``ek.generate_kmers(kmer_size)`` and then the ``GC_content`` and
    ``N_count`` values. K-mers without a frequency are written as ``0``. When
    the region has no transitions or no reference k-mers, every column is
    ``0``; that row now has exactly the same number of columns as a populated
    row (it previously used ``kmer_size ** 4 + 2`` columns).

    Args:
        region: Object with ``chrom``, ``start``, ``stop``, ``strand``,
            ``vcf_str()`` and ``str_name()``.
        kmer_size: K-mer length (numeric).
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        AF: When truthy, each singleton contributes its ``AF`` INFO value
            (float counters) instead of 1 (unsigned integer counters).
        delim: Column separator of the printed row.

    Returns:
        None. If the region cannot be fetched (``KeyError`` / ``FetchError``)
        a message is written to stderr and no row is printed.
    """
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)
    try:
        # Only dash-stranded regions are complemented (not reversed).
        on_minus = region.strand is not None and ek.is_dash(region.strand)
        record = fasta.get_seq(region.chrom, region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if on_minus:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region), file=sys.stderr)
        return

    # Single-process search: regions are expected to be short.
    region_ref_counts, gc_content, n_count = ek.kmer_search(sequence, kmer_size, count_gc=True,
                                                            count_n=True)

    # Float counters accumulate allele frequencies; unsigned longs count events.
    typecode = 'd' if AF else 'L'
    transitions = defaultdict(lambda: array.array(typecode, [0, 0, 0, 0]))
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in vcf(region.vcf_str()):
        if not ek.is_singleton_snv(variant):
            continue
        new_var = Variant(variant=variant)
        # Slice bounds chosen so that index ``kmer_mid_idx`` of the window is
        # the variant's own base (POS is 1-based, the slice is 0-based).
        adj_seq = fasta[str(new_var.CHROM)][(new_var.POS - start_idx_offset):(new_var.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            # A mismatch is reported but the variant is still counted.
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' % (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr, flush=True)
        if ek.complete_sequence(adj_seq):
            if AF:
                transitions[adj_seq.upper()][nuc_idx[new_var.ALT[0]]] += variant.INFO.get('AF')
            else:
                transitions[adj_seq.upper()][nuc_idx[new_var.ALT[0]]] += 1

    # One shared column layout keeps populated and all-zero rows the same width.
    kkeys = list(ek.generate_kmers(kmer_size)) + ['GC_content', 'N_count']

    if len(transitions.keys()) > 0 and len(region_ref_counts.keys()) > 0:
        bin_trans = pd.DataFrame.from_dict(transitions, orient='index')
        bin_trans.sort_index(inplace=True)
        bin_trans['tot'] = bin_trans.sum(axis=1)
        bin_kcounts = pd.DataFrame.from_dict(region_ref_counts, orient='index')
        bin_kcounts.sort_index(inplace=True)
        bin_kcounts.columns = ['counts']
        # Outer join so k-mers seen only in the reference still get a row.
        kmer_freq = pd.concat([bin_trans.loc[:, 'tot'], bin_kcounts], join='outer', axis=1, sort=True)
        kmer_freq.fillna(0, inplace=True)
        kmer_freq['freq'] = kmer_freq.tot / kmer_freq.counts
        kmer_freq.loc['GC_content', 'freq'] = gc_content
        kmer_freq.loc['N_count', 'freq'] = n_count

        values = []
        for k in kkeys:
            try:
                values.append(str(kmer_freq.loc[k, 'freq']))
            except KeyError:
                # K-mer absent from both the region and its variants.
                values.append('0')
    else:
        values = ['0'] * len(kkeys)

    print(region.str_name() + delim + delim.join(values), flush=True)
Exemplo n.º 12
0
def model_region(datacontainer, vcf_path, fasta_path, kmer_size, region,
                 AC_cutoff):
    """Tally reference k-mers and four kinds of SNV transitions for a region.

    For every VCF record accepted by ``is_quality_snv`` whose surrounding
    k-mer passes ``complete_sequence``, the ``AC``, ``AN`` and ``AF`` INFO
    values are added to the ALT-base slot of that k-mer; records with
    ``AC == 1`` additionally add one to the singleton tally.

    Args:
        datacontainer: Holder exposing ``get()`` / ``set()``; the held object
            must provide ``add_kmer_counts`` and ``add_transition(counts, key)``.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        kmer_size: K-mer length; converted with ``int``. An invalid value
            prints a message to stderr and terminates via ``exit(1)``.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.
        AC_cutoff: Optional cutoff forwarded to ``is_quality_snv``. A value
            for which ``int()`` raises ``ValueError`` is replaced by ``None``
            after a stderr message.

    Returns:
        None. If the region cannot be fetched (``KeyError`` / ``FetchError``)
        a message is written to stderr and nothing is merged.
    """
    if AC_cutoff is not None:
        try:
            AC_cutoff = int(AC_cutoff)
        except ValueError:
            AC_cutoff = None
            print(
                'AC cutoff must be a positive integer. Ignoring user value and using SNVs with any AC.',
                file=sys.stderr,
                flush=True)

    try:
        kmer_size = int(kmer_size)
        if kmer_size < 1:
            raise ValueError
    except ValueError:
        print('kmer_size must be a positive integer. Please check arguments.',
              file=sys.stderr,
              flush=True)
        exit(1)

    began = time.time()
    genome = Fasta(fasta_path)
    calls = VCF(vcf_path)
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)

    try:
        # Only dash-stranded regions are complemented (not reversed).
        on_minus = region.strand is not None and is_dash(region.strand)
        record = genome.get_seq(region.chrom, region.start - kmer_mid_idx,
                                region.stop + kmer_mid_idx)
        if on_minus:
            record = record.complement
        sequence = record.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return

    # Single-process search: regions are expected to be short.
    region_ref_counts = kmer_search(sequence, kmer_size)
    r_string = '%s:%s-%s' % (region.chrom, region.start, region.stop)

    def new_counters():
        # One unsigned counter per ALT base, ordered A, C, G, T.
        return array.array('L', [0, 0, 0, 0])

    # Insertion order is the order in which the tallies are merged below.
    data = {
        'singleton': defaultdict(new_counters),
        'AC': defaultdict(new_counters),
        'AN': defaultdict(new_counters),
        'AF': defaultdict(lambda: array.array('d', [0, 0, 0, 0])),
    }
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

    for variant in calls(r_string):
        if not is_quality_snv(variant, AC_cutoff=AC_cutoff):
            continue
        # Slice bounds chosen so that index ``kmer_mid_idx`` of the window is
        # the variant's own base (POS is 1-based, the slice is 0-based).
        window_start = variant.POS - start_idx_offset
        window_stop = variant.POS + kmer_mid_idx
        adj_seq = genome[str(variant.CHROM)][window_start:window_stop].seq
        fasta_ref = adj_seq[kmer_mid_idx]
        if str(fasta_ref).upper() != str(variant.REF).upper():
            # A mismatch is reported but the variant is still counted.
            print(
                'WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                (fasta_ref, variant.REF),
                file=sys.stderr,
                flush=True)
        if complete_sequence(adj_seq):
            for info_key in ('AC', 'AN', 'AF'):
                data[info_key][adj_seq.upper()][nuc_idx[
                    variant.ALT[0]]] += variant.INFO.get(info_key)
            if variant.INFO.get('AC') == 1:
                data['singleton'][adj_seq.upper()][nuc_idx[
                    variant.ALT[0]]] += 1

    # Read-modify-write of the shared accumulator.
    accumulator = datacontainer.get()
    accumulator.add_kmer_counts(region_ref_counts)
    for label, counts in data.items():
        accumulator.add_transition(counts, label)

    datacontainer.set(accumulator)
    print('Finished region %s in %s' % (str(region), str(time.time() - began)),
          flush=True)
    return
Exemplo n.º 13
0
import os
from pybedtools import BedTool
from pyfaidx import Fasta
import ggplot as gg
import pandas as pd

# Load the ATAC-seq peak calls and the chr1 reference sequence.
ATAC_peaks_regions = BedTool('../islet_ATAC_peaks/GSE76268_RAW/GSM1978246_ACFQ363beta2_.bed.gz')

genome = Fasta(os.path.expanduser('~/Desktop/genomes/hg19/chr1.fa.gz.bgz'))

# Show the sequence under one example peak. BED starts are 0-based while
# pyfaidx's get_seq takes 1-based inclusive coordinates, hence the +1.
# A single-argument print(...) call is valid on both Python 2 and Python 3.
example_peak = ATAC_peaks_regions[100]
print(genome.get_seq('chr1',
                     int(example_peak[1]) + 1,
                     int(example_peak[2])))

# Read (chrom, start, end) for every peak in a single pass over the BED file.
peak_rows = [(peak[0], int(peak[1]), int(peak[2])) for peak in ATAC_peaks_regions]

ATAC_peaks_regions_pd = pd.DataFrame(
    [(start, end) for _, start, end in peak_rows],
    columns=["start", "end"],
    index=[chrom for chrom, _, _ in peak_rows],
)

# Peak length is end - start (the reverse subtraction gave negative lengths).
ATAC_peaks_regions_pd["length"] = ATAC_peaks_regions_pd["end"] - ATAC_peaks_regions_pd["start"]
print(ATAC_peaks_regions_pd.head())

# Scatter plot of peak length against peak start position.
p = gg.ggplot(data = ATAC_peaks_regions_pd, aesthetics=gg.aes(x="start", y="length")) + gg.geom_point()
Exemplo n.º 14
0
def query_bed_region(region, vcf_path, fasta, kmer_size, singleton_path, af_path, an_path, ac_path, model_dir):
    """
    Compare observed variant counts in one region with the counts expected
    from its reference sequence, print an aligned report line and return a
    tab-separated one.

    If the region cannot be fetched or counted (KeyError or FetchError), all
    nine numeric fields are reported as 0.

    @param region: region object; must expose chrom, start, stop, strand and
        str_name(), and str(region) is used as the VCF query string
    @param vcf_path: path passed to VCF()
    @param fasta: path passed to Fasta()
    @param kmer_size: k-mer length; the fetched sequence is padded by
        kmer_size // 2 bases on each side
    @param singleton_path: forwarded unchanged to QueryWindow
    @param af_path: forwarded unchanged to QueryWindow
    @param an_path: forwarded unchanged to QueryWindow
    @param ac_path: forwarded unchanged to QueryWindow
    @param model_dir: forwarded unchanged to QueryWindow
    @return: newline-terminated, tab-separated string: region.str_name()
        followed by NumSNVs, Singletons, AC, AN, AF, ExpectedSingletons,
        ExpectedAC, ExpectedAN, ExpectedAF
    """
    # Function-scope import so the block does not depend on file-level imports.
    from contextlib import closing

    # TODO: Add binning somehow (either keep equal size or equal number of bins

    # Both readers hold open file handles; close them deterministically
    # instead of leaking one pair per queried region.
    with closing(VCF(vcf_path)) as vcf, closing(Fasta(fasta)) as genome:
        window = QueryWindow(kmer_size, singleton_path=singleton_path, af_path=af_path, an_path=an_path,
                             ac_path=ac_path, model_dir=model_dir)
        # The first kmer actually begins centered around first nucleotide in sequence so
        # start position is shifted upstream by half the kmer length
        # end position is shifted downstream by the same
        shift = kmer_size // 2
        try:
            on_minus_strand = region.strand is not None and is_dash(region.strand)
            record = genome.get_seq(region.chrom, region.start - shift, region.stop + shift)
            if on_minus_strand:
                # Minus-strand regions use the complemented sequence.
                record = record.complement
            sequence = record.seq.upper()
            exp = window.calculate_expected(sequence)  # this does account for strandedness
            AF, AC, AN, singletons, count = count_regional_alleles(vcf(str(region)))
            fields = (
                count,                 # 'NumSNVs'
                singletons,            # 'Singletons'
                AC,                    # 'AC'
                AN,                    # 'AN'
                AF,                    # 'AF'
                exp.get('singleton'),  # 'ExpectedSingletons'
                exp.get('AC'),         # 'ExpectedAC'
                exp.get('AN'),         # 'ExpectedAN'
                exp.get('AF'),         # 'ExpectedAF'
            )
        except (KeyError, FetchError):
            # Region missing from the FASTA/VCF: report zeros for every field.
            fields = (0,) * 9

    name = region.str_name()
    regname = name.split('\t')
    # The aligned report needs the first five tab-separated name columns.
    columns = [regname[i] for i in range(5)] + list(fields)
    print(
        '{: <8} {: <12} {: <12} {: <20} {: <8} {: <10} {: <12} {: <10} {: <10} {: <24} {: <22} {: <20} {: <20} {: <20}'.format(
            *[str(c) for c in columns]),
        flush=True)
    return '\t'.join([name] + [str(f) for f in fields]) + '\n'
Exemplo n.º 15
0
class FastaHandler(object):
    """Convenience wrapper around a FASTA file opened with ``Fasta``.

    The file is opened once in ``__init__`` with ``sequence_always_upper=True``
    and kept in ``self.fasta_handler``. Methods that need the handler raise
    ``ValueError("Fasta Handler not initialized")`` when it is ``None``,
    except ``get_sequence_by_id``, which returns ``None`` in that case.
    """

    def __init__(self, fasta_file):
        """Open *fasta_file* and keep the handle for later lookups."""
        self.fasta_file = fasta_file
        print("Loading fasta file...please wait...")
        self.fasta_handler = Fasta(fasta_file, sequence_always_upper=True)

    def _get_full_sequence(self, identifier):
        """Return the whole sequence of record *identifier*, or None on failure.

        Best-effort lookup: any exception is reported on stdout and turned
        into a ``None`` return value.
        """
        try:
            fasta_record = self.fasta_handler[identifier]
            # Fetch positions 1..len(record), i.e. the complete record.
            seq_record = self.fasta_handler.get_seq(
                identifier, 1, len(fasta_record))
            return seq_record.seq
        except Exception as e:
            print('Failed to get seq id: ' + str(identifier) + " " +
                  str(e))
            return None

    def get_fasta_seq_by_location(self, chrom, start, end):
        """Return the sequence string of ``handler[chrom][start:end]``.

        Raises:
            ValueError: if the FASTA handler is not initialized.
        """
        if self.fasta_handler is None:
            raise ValueError("Fasta Handler not initialized")
        return self.fasta_handler[chrom][start:end].seq

    def get_fasta_seq_by_id(self, identifier, start=None, end=None):
        """Look up record *identifier*, optionally restricted to start..end.

        With both *start* and *end* given, the object returned by
        ``get_seq`` is handed back as-is (the record, not its ``.seq``
        string); exceptions from that call propagate. Otherwise the complete
        sequence string is returned, or ``None`` if the lookup fails.

        NOTE(review): the ranged branch returns a record while the other
        branch returns a string; kept as-is for backward compatibility.

        Raises:
            ValueError: if the FASTA handler is not initialized.
        """
        if self.fasta_handler is None:
            # Report the real cause; the message is built without string
            # concatenation so a non-str identifier cannot trigger TypeError.
            raise ValueError("Fasta Handler not initialized")
        if start is not None and end is not None:
            return self.fasta_handler.get_seq(identifier, start, end)
        return self._get_full_sequence(identifier)

    def get_sequence_by_id(self, identifier):
        """Return the complete sequence string of record *identifier*.

        Returns ``None`` when the handler is not initialized or when the
        lookup fails.
        """
        if self.fasta_handler is None:
            return None
        return self._get_full_sequence(identifier)

    def get_seq_record_by_id_location(self,
                                      identifier,
                                      start=None,
                                      end=None,
                                      strand=None):
        """Return the sequence string of *identifier*, optionally start..end.

        With both *start* and *end* given, the ``.seq`` of the ``get_seq``
        result is returned and exceptions propagate. Otherwise the complete
        sequence string is returned, or ``None`` if the lookup fails.
        *strand* is accepted but currently unused.

        Raises:
            ValueError: if the FASTA handler is not initialized.
        """
        if self.fasta_handler is None:
            raise ValueError("Fasta Handler not initialized")
        if start is not None and end is not None:
            seq_record = self.fasta_handler.get_seq(identifier, start, end)
            return None if seq_record is None else seq_record.seq
        return self._get_full_sequence(identifier)