def get_composition(group, kmer_len, species):
    """
    Count the k-mer composition of every segment in one chromosome group.

    For each row, the reference sequence of the segment is fetched together
    with ``kmer_len // 2`` flanking bases on each side, and its k-mers are
    added to a running Counter.

    Args:
        group (tuple): ``(chrom, data)`` pair. ``data`` must provide
            ``iterrows()``; each row is read through its ``'s'`` and
            ``'distance'`` fields (used as start and length of the segment).
        kmer_len (int): k-mer length. Only 3, 5 and 7 are handled.
        species (str): genome identifier forwarded to ``refseq``.

    Returns:
        Counter: k-mer counts accumulated over all rows. Rows whose sequence
        cannot be retrieved or processed are skipped. An unsupported
        ``kmer_len`` yields an empty Counter.
    """
    chrom, data = group
    total_triplets = Counter()

    # Only these sizes were ever supported; other values never produced counts.
    if kmer_len not in (3, 5, 7):
        return total_triplets

    # Number of extra bases needed on each side so that the first and last
    # positions of the segment have a full k-mer around them.
    flank = kmer_len // 2

    for _, row in data.iterrows():
        try:
            seq = refseq(species, chrom,
                         int(row['s']) - flank,
                         int(row['distance']) + 2 * flank)
            if seq:
                # Build the per-row Counter first so that a failure while
                # generating k-mers never leaves partial counts behind.
                row_counts = Counter(kmers_generator(seq, kmer_len))
                total_triplets.update(row_counts)
        except Exception:
            # Best effort: a row that cannot be processed is skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

    return total_triplets
示例#2
0
def obtain_context(df, cont, cent):
    """
    Fetch the 'saccer3' reference sequence around a peak coordinate.

    Args:
        df: mapping-like row providing the ``'Chromosome'`` and
            ``'Peak genomic coordinate'`` fields.
        cont: number of bases to retrieve (forwarded to ``refseq`` as size).
        cent: offset subtracted from the peak coordinate to obtain the start.

    Returns:
        The sequence returned by ``refseq``, or ``'-'`` when the lookup fails.
    """
    try:
        return refseq('saccer3', df['Chromosome'],
                      df['Peak genomic coordinate'] - cent, cont)
    except Exception:
        # Best effort: any lookup failure becomes the '-' placeholder, without
        # trapping KeyboardInterrupt/SystemExit as a bare ``except`` would.
        return '-'
示例#3
0
def get_sequence_window(nucid, species, window, kmers=3):
    """
    Get the sequence for a particular window (including flanking positions).

    Args:
        nucid (str): identifier of the form ``<chrom>_<start>_<end>``. The
            chromosome part may itself contain underscores.
        species (str): genome identifier forwarded to ``refseq``.
        window (int): size of the window without flanks.
        kmers (int): k-mer size the sequence will be used for (3, 5 or 7). It
            determines how many flanking bases are added.

    Returns:
        str: sequence of length ``window + 2 * (kmers // 2)``, right-padded
        with ``'N'`` when the reference returns fewer bases.

    Raises:
        ValueError: if ``kmers`` is not 3, 5 or 7.
    """
    if kmers not in (3, 5, 7):
        raise ValueError(
            'Unsupported k-mer size {!r}; expected 3, 5 or 7'.format(kmers))

    # Split from the right so contig names containing '_' are kept intact.
    chrom, start, _end = nucid.rsplit('_', 2)

    # this is already one position before the real start
    # (true division followed by int() is kept on purpose: it truncates the
    # same way the original computation did for even window sizes)
    start_real = int(int(start) - (window - 1) / 2)

    flank = kmers // 2
    t_win = window + 2 * flank
    seq = refseq(species, chrom, start_real - (flank - 1), t_win)

    # this is to avoid problems if we reach the end of the chromosome,
    # unlikely in hg19 but can happen in yeast
    return seq.ljust(t_win, 'N')
示例#4
0
def get_mutation(row, genome, size):
    """
    Return the reference k-mer centred on a mutation.

    Args:
        row: mapping-like record providing ``'REF'``, ``'CHR'`` and ``'POS'``.
        genome (str): genome identifier forwarded to ``refseq``.
        size (int): 5 returns the pentamer; any other value returns the
            central trinucleotide.

    Returns:
        str or None: the k-mer, or ``None`` when the reference lookup fails,
        fewer than five bases are returned, the central base disagrees with
        ``REF`` (unless ``REF`` is ``'-'``), or the k-mer contains anything
        other than A, C, G, T.
    """
    ref = row['REF']
    chrom = row['CHR']
    pos = int(row['POS'])
    try:
        kmer = refseq(genome, chrom, pos - 2, 5)
    except (ValueError, RuntimeError):  # out of chrom size, unknown chr
        return None

    if len(kmer) != 5:
        return None
    # Equality, not identity: ``is`` on a string literal depends on interning.
    if ref != '-' and kmer[2] != ref:
        return None
    if any(n not in 'ACGT' for n in kmer):
        return None

    # 3-mer by default
    return kmer if size == 5 else kmer[1:4]
def analyze_experiment(vcf_file, genome='leish',
                       genotype_column='ERR174230.bam'):
    """
    Classify the SNVs of a VCF file and plot their counts per variant class.

    The variants are read with ``vcf_reader``, annotated with their reference
    trinucleotide, restricted to SNVs, classified with ``create_snv_class``
    and plotted twice with ``do_plot``: once for the rows labelled ``'RES'``
    (plot name ``'experiment'``) and once for the rows labelled ``'WT'``.

    Args:
        vcf_file: input forwarded to ``vcf_reader``.
        genome (str): genome identifier forwarded to ``refseq``.
        genotype_column (str): column whose value decides the ``'RES'`` /
            ``'WT'`` label: rows containing ``'0/0'`` are labelled ``'RES'``,
            all others ``'WT'``.
    """
    df = vcf_reader(vcf_file)
    df['POS-1'] = df['POS'] - 1

    # remove non canonical chromosomes
    df['CAN'] = df['CHROM'].apply(lambda x: 'RMV' if 'CACT' in x else 'PASS')
    # .copy() so the column assignments below act on an independent frame
    # instead of on a slice of the unfiltered one.
    df = df[df['CAN'] == 'PASS'].copy()

    df['TRIPLET'] = df.apply(
        lambda x: refseq(genome, x['CHROM'], x['POS-1'], 3, release=None),
        axis=1)

    # number of characters in alt and ref
    df['len_alt'] = df['ALT'].str.len()
    df['len_ref'] = df['REF'].str.len()

    # first classification between SNV and others
    is_snv = ((df['len_alt'] == 1) & (df['len_ref'] == 1)
              & (df['ALT'] != '-') & (df['REF'] != '-'))
    df['TYPE'] = is_snv.map({True: 'SNV', False: 'INDEL'})

    df = df[is_snv].copy()
    df['VARIANT_CLASS'] = df.apply(create_snv_class, axis=1)

    # get whether the mutation has happened in the WT or treated
    df['FILTER_WT'] = df[genotype_column].apply(
        lambda x: 'RES' if '0/0' in x else 'WT')

    # first the variants in the treated, then the ones in the WT
    for label, plot_name in (('RES', 'experiment'), ('WT', 'WT')):
        selected = df.loc[df['FILTER_WT'] == label, 'VARIANT_CLASS']
        do_plot(selected.value_counts().to_dict(), plot_name)
def proportion_ww(genome, file):
    """
    Count, per offset, how often a W-W (A/T followed by A/T) dinucleotide occurs.

    Every data line of the gzipped, tab-separated input yields a 148-base
    window starting 73 bases before the coordinate in its third column. For
    each offset ``i`` of the window, the counter is increased when both
    ``seq[i]`` and ``seq[i + 1]`` are A or T.

    Args:
        genome (str): genome identifier forwarded to ``refseq``.
        file (str): path to a gzipped, tab-separated file with one header line;
            the first column is used as chromosome and the third as coordinate.

    Returns:
        tuple: ``(data, count_seq)`` where ``data`` is a ``defaultdict(int)``
        mapping offset -> number of W-W dinucleotides starting there, and
        ``count_seq`` is the number of windows retrieved successfully.
    """
    wanted = 'AT'

    count_seq = 0
    data = collections.defaultdict(int)

    with gzip.open(file, 'rt') as infile:
        next(infile, None)  # skip the header; tolerate an empty file
        for line in infile:
            line_spl = line.rstrip().split('\t')

            try:
                seq = refseq(genome, line_spl[0], int(line_spl[2]) - 73, 148)
            except Exception:
                # Best effort: malformed lines and failed lookups are skipped.
                continue

            count_seq += 1

            # Pair each base with its successor. The last base has none, so it
            # is never a dinucleotide start (previously this case was handled
            # by catching the resulting IndexError).
            for i, (first, second) in enumerate(zip(seq, seq[1:])):
                if first in wanted and second in wanted:
                    data[i] += 1

    return data, count_seq
示例#7
0
# Single-base indels are reported on the pyrimidine strand: purines are mapped
# to their complementary pyrimidine, pyrimidines are left untouched.
_PURINE_TO_PYRIMIDINE = {'A': 'T', 'G': 'C'}

_COMPLEMENT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

# The 78 strand-collapsed doublet base substitution classes.
_DBS_CLASSES = frozenset({
    'AC_CA', 'AC_CG', 'AC_CT', 'AC_GA', 'AC_GG', 'AC_GT', 'AC_TA', 'AC_TG',
    'AC_TT', 'AT_CA', 'AT_CC', 'AT_CG', 'AT_GA', 'AT_GC', 'AT_TA', 'CC_AA',
    'CC_AG', 'CC_AT', 'CC_GA', 'CC_GG', 'CC_GT', 'CC_TA', 'CC_TG', 'CC_TT',
    'CG_AT', 'CG_GC', 'CG_GT', 'CG_TA', 'CG_TC', 'CG_TT', 'CT_AA', 'CT_AC',
    'CT_AG', 'CT_GA', 'CT_GC', 'CT_GG', 'CT_TA', 'CT_TC', 'CT_TG', 'GC_AA',
    'GC_AG', 'GC_AT', 'GC_CA', 'GC_CG', 'GC_TA', 'TA_AT', 'TA_CG', 'TA_CT',
    'TA_GC', 'TA_GG', 'TA_GT', 'TC_AA', 'TC_AG', 'TC_AT', 'TC_CA', 'TC_CG',
    'TC_CT', 'TC_GA', 'TC_GG', 'TC_GT', 'TG_AA', 'TG_AC', 'TG_AT', 'TG_CA',
    'TG_CC', 'TG_CT', 'TG_GA', 'TG_GC', 'TG_GT', 'TT_AA', 'TT_AC', 'TT_AG',
    'TT_CA', 'TT_CC', 'TT_CG', 'TT_GA', 'TT_GC', 'TT_GG',
})


def _leading_matches(units, target):
    """Return how many consecutive items at the start of *units* equal *target*."""
    count = 0
    for unit in units:
        if unit != target:
            break
        count += 1
    return count


def _chunks(seq, size):
    """Split *seq* into consecutive pieces of *size* characters (last may be shorter)."""
    return [seq[i:i + size] for i in range(0, len(seq), size)]


def _capped(value, cap):
    """Return ``'<cap>+'`` when *value* reaches *cap*, otherwise *value* itself."""
    return '{}+'.format(cap) if value >= cap else value


def _indel_anchor(row, allele_key):
    """Return ``(pos, endpos, affected_seq)`` for the allele that carries the indel."""
    allele = row[allele_key]
    if row['REF'][0] == row['ALT'][0]:
        # REF and ALT share the leading base: the event is everything after it.
        return int(row['POS']), len(row['REF']), allele[1:]
    # No shared leading base: the whole allele is the event, shift one base left.
    return int(row['POS']) - 1, len(row['REF']) + 1, allele


def _homopolymer_flank(genome, chrom, pos, right_start, base):
    """Count copies of *base* directly adjacent to the event (up to 5 per side)."""
    left_sequence = refseq(genome, chrom, pos - 4, 5)
    right_sequence = refseq(genome, chrom, right_start, 5)
    # The left flank is read from the event outwards, hence the reversal.
    return (_leading_matches(right_sequence, base) +
            _leading_matches(left_sequence[::-1], base))


def _tandem_flank(genome, chrom, pos, endpos, unit):
    """Count whole copies of *unit* directly adjacent to the event on both sides."""
    size = len(unit)
    span = size * 5
    downstream = refseq(genome, chrom, pos + endpos, span)
    # we want 3' of the reverse
    upstream = refseq(genome, chrom, pos - span, span + 1)[::-1]
    return (_leading_matches(_chunks(downstream, size), unit) +
            _leading_matches(_chunks(upstream, size), unit[::-1]))


def _microhomology_length(genome, chrom, pos, endpos, affected_seq):
    """Return the longest partial match of the deleted sequence with either flank (0 if none)."""
    size = len(affected_seq)
    right_seq = refseq(genome, chrom, pos + endpos, size)
    left_seq = refseq(genome, chrom, pos - size, size + 1)
    reversed_affected = affected_seq[::-1]
    reversed_left = left_seq[::-1]

    # we go down the size of the indel; the right side is checked first
    for length in range(size - 1, 0, -1):
        if affected_seq[:length] == right_seq[:length]:
            return length
        if reversed_affected[:length] == reversed_left[:length]:
            return length
    return 0


def IndelsClassifier(row, genome='hg19'):
    """
    Classify indel and DBS events into the PCAWG categories to feed into the extraction.

    Indels are classified based on the position in the genome where they are
    believed to have occurred.

    a) In the case of deletions:
    1. if the first letter in REF is the same as in ALT, the excised fragment is REF[1:].
    2. if the first letter in REF differs from the one in ALT, the entire REF has been
       excised. POS is then part of the deleted sequence.

    b) In the case of insertions:
    1. if the first letter in REF is the same as in ALT, the inserted fragment is ALT[1:].
    2. if the first letter in REF differs from the one in ALT, the inserted fragment is
       the entire ALT.

    Args:
        row: mapping-like record with ``'CLASS'`` (``'DEL'``, ``'INS'`` or ``'DBS'``),
            ``'REF'`` and ``'ALT'``; indels additionally need ``'CHROM'`` and ``'POS'``.
        genome (str): genome identifier forwarded to ``refseq``.

    Returns:
        str: the class label, e.g. ``'DEL_C_1_3'``, ``'DEL_repeats_2_4'``,
        ``'DEL_MH_3_1'``, ``'INS_T_1_5+'``, ``'INS_repeats_5+_0'``, ``'AC_CA'``,
        or ``'NOTGOOD'`` for any other ``CLASS``.

    Raises:
        IndexError: if the affected sequence of an indel is empty.
    """
    variant_class = row['CLASS']

    # ===================
    # DBS BIG GROUP
    # ===================
    if variant_class == 'DBS':
        # When merging the reverse complementary doublet base substitution classes
        # into one class, 12 of them have no strandness (e.g. CG > AT), resulting
        # in 78 classes of doublet base substitutions.
        class_in = '{}_{}'.format(row['REF'], row['ALT'])
        if class_in not in _DBS_CLASSES:
            # AC>CA	GT>TG : report the reverse complement instead
            class_in = '{}{}_{}{}'.format(_COMPLEMENT[row['REF'][1]],
                                          _COMPLEMENT[row['REF'][0]],
                                          _COMPLEMENT[row['ALT'][1]],
                                          _COMPLEMENT[row['ALT'][0]])
        return class_in

    if variant_class not in ('DEL', 'INS'):
        return 'NOTGOOD'

    # ==============================
    # DELETION / INSERTION BIG GROUPS
    # ==============================
    # Example --> chr1 	99072 	CT 	C
    # This means you have removed the T and you keep the C
    is_deletion = variant_class == 'DEL'
    pos, endpos, affected_seq = _indel_anchor(
        row, 'REF' if is_deletion else 'ALT')
    affected_site = affected_seq[0]  # IndexError when nothing is affected
    size = len(affected_seq)
    chrom = row['CHROM']

    # A deletion counts the deleted copy itself, so its repeat counts start at
    # one and saturate at '6+'; insertions start at zero and saturate at '5+'.
    base_count, cap = (1, 6) if is_deletion else (0, 5)

    # FIRST CLASS: one base indel
    if size == 1:
        # For a one base deletion pos + endpos is always pos + 2.
        count = base_count + _homopolymer_flank(genome, chrom, pos,
                                                pos + endpos, affected_site)
        return '{}_{}_1_{}'.format(
            variant_class,
            _PURINE_TO_PYRIMIDINE.get(affected_site, affected_site),
            _capped(count, cap))

    # SECOND CLASS: more than one base
    size_label = _capped(size, 5)
    count = base_count + _tandem_flank(genome, chrom, pos, endpos,
                                       affected_seq)

    # if no full repeat is found next to a deletion, give opportunity to microhomology
    if is_deletion and count == 1:
        homology = _microhomology_length(genome, chrom, pos, endpos,
                                         affected_seq)
        if homology > 0:
            return 'DEL_MH_{}_{}'.format(size_label, _capped(homology, 5))

    return '{}_repeats_{}_{}'.format(variant_class, size_label,
                                     _capped(count, cap))
def coronavirus(pos, size=1):
    """
    Return ``size`` bases of the "sarscov2" reference starting at ``pos``.

    The lookup is delegated to ``refseq`` using the contig named "0".
    """
    genome, contig = "sarscov2", "0"
    return refseq(genome, contig, pos, size)
示例#9
0
    def simulate_and_analysis(self, item):
        """
        Simulate mutations and analyze simulations

        Args:
            item (tuple): element of analysis data containing element (str), probs_tree (dict of IntervalTree,
                indexed by group), n_sim (int) and seed (value passed to ``np.random.seed``)
        Returns:
            element (str): element of analysis
            sim_scores_chunk (list): simulated element's results
            sim_cluster_chunk (list): simulated cluster's results
        """
        element, probs_tree, n_sim, seed = item
        sim_scores_chunk = []
        sim_cluster_chunk = []
        df_simulated_mutations = []
        half_window = (self.simulation_window - 1) // 2
        nucleot = {'A', 'C', 'G', 'T'}
        np.random.seed(seed)

        # Simulate mutations
        for mutation in self.mutations_d[element]:
            region_start = mutation.region[0]
            region_last = mutation.region[1] - 1  # regions end +1 in tree

            # Get coordinates of randomization window
            expected_hotspot_begin = mutation.position - half_window
            expected_hotspot_end = mutation.position + half_window

            if self.simulation_mode == 'region_restricted':
                # Region restricted mode samples simulated mutations in a window of length l that fits in the
                # genomic element.
                #
                # If the genomic region is longer or equal than l, the expected start and end of the window are
                # checked. If one of them falls outside the genomic element, the window of length l is displaced
                # to fit in the element. If both fall outside, the window is trimmed to the genomic region.
                #
                # If the genomic region is smaller than l, the simulation window becomes the genomic region.
                if (mutation.region[1] - region_start) >= self.simulation_window:
                    # Check if hotspot outside region
                    check_5 = expected_hotspot_begin < region_start
                    check_3 = expected_hotspot_end > region_last

                    if check_5 and check_3:
                        hotspot_begin = region_start
                        hotspot_end = region_last
                    elif check_5:
                        hotspot_begin = region_start
                        hotspot_end = region_start + self.simulation_window - 1
                    elif check_3:
                        hotspot_end = region_last
                        hotspot_begin = hotspot_end - self.simulation_window + 1
                    else:
                        hotspot_begin = expected_hotspot_begin
                        hotspot_end = expected_hotspot_end
                else:
                    hotspot_begin = region_start
                    hotspot_end = region_last
            else:
                # Simulations are `mutation centered`, they are centered in the mutated position and can fall
                # outside the genomic region.
                hotspot_begin = expected_hotspot_begin
                hotspot_end = expected_hotspot_end

            # Map to index
            # 3* accounts for alternates in the array of probabilities
            # half_window added in probabilities array
            array_origin = region_start - half_window
            start_index = 3 * (hotspot_begin - array_origin)
            end_index = 3 * (hotspot_end - array_origin + 1)  # +1, range and slice

            for interval in probs_tree[mutation.group][region_start]:  # unique iteration
                simulations = np.random.choice(
                    range(start_index, end_index),
                    size=n_sim,
                    p=self.normalize(element,
                                     interval.data[start_index:end_index]))
                # Add info per simulated mutation
                list_simulations_per_mutation = []
                for index in simulations:
                    position = array_origin + index // 3
                    ref_nucleotide = bgr.refseq(self.genome,
                                                self.chromosomes_d[element],
                                                position, 1)
                    # Each position owns three consecutive slots, one per sorted alternate, so the slot
                    # within the position is the remainder. Integer arithmetic replaces the previous
                    # comparison of rounded floats, which depended on floating point representation.
                    alternate_index = index % 3
                    alternate = sorted(
                        nucleot.difference({ref_nucleotide}))[alternate_index]
                    # Simulated mutation
                    list_simulations_per_mutation.append(
                        Mutation(position, mutation.region, alternate,
                                 mutation.sample, mutation.group))
                df_simulated_mutations.append(list_simulations_per_mutation)

        # Start analysis
        logger.debug('Start analyzing simulations')
        # zip(*...) regroups the data so that each iteration holds the n-th simulation of every mutation
        for simulated_mutations in zip(*df_simulated_mutations):
            cutoff_clusters, element_score = self.analysis(
                element, list(simulated_mutations))
            sim_scores_chunk.append(element_score)
            for interval in cutoff_clusters:
                sim_cluster_chunk.extend(
                    values['score'] for values in interval.data.values())

        return element, sim_scores_chunk, sim_cluster_chunk
示例#10
0
    def mut_probabilities(self, element):
        """
        Generate mutational probabilities per position of an element using the sequence context observed mutational
        probabilities calculated from the input cohort/s.

        Args:
            element (str): element to calculate pre-smoothing

        Returns:
            probs_tree (dict of IntervalTree): per group, an IntervalTree of genomic regions whose data is the list
                of probabilities. Length == 3*(genomic + simulation window)
            skip (bool): if True skip further analysis

        Raises:
            Exception: if the signatures file has no entry for one of the element's groups

        """
        nucleot = {'A', 'C', 'G', 'T'}
        probs_tree = defaultdict(IntervalTree)
        delta = 1 if self.kmer == 3 else 2
        # NOTE(review): simulate_and_analysis uses (simulation_window - 1) // 2; both agree only for odd
        # window sizes -- confirm the window is always odd.
        half_window = self.simulation_window // 2
        skip = False

        # Check signatures dictionaries per group
        signatures_d = {}
        signatures_available = os.path.isfile(self.path_pickle)
        all_signatures = None
        for group in self.groups_d[element]:
            if not signatures_available:
                skip = True
                break
            if all_signatures is None:
                # Load once and close the file. NOTE: pickle must only be used on trusted files.
                with open(self.path_pickle, "rb") as fd:
                    all_signatures = pickle.load(fd)
            try:
                signatures_d[group] = all_signatures[group]
            except KeyError:
                raise Exception(
                    'Signatures for group {} are missing in signatures dictionary\n'
                    'Please check signatures file {}'.format(
                        group, self.path_pickle))

        if not skip:
            # Iterate through genomic regions to get their sequences
            for interval in self.regions_d[element]:
                probabilities = defaultdict(list)
                expected_length = interval[1] - interval[0] + half_window * 2
                start = interval[0] - half_window - delta
                size = expected_length + delta * 2

                # Reset per region so a failed lookup never reuses the previous region's sequence
                sequence = ''
                try:
                    sequence = bgr.refseq(self.genome,
                                          self.chromosomes_d[element], start,
                                          size)
                except ValueError as e:
                    logger.error(
                        '%s (element=%s, start=%s, size=%s, region=%s-%s)',
                        e, element, start, size, interval[0], interval[1])

                if not sequence:
                    skip = True
                    break

                # Search kmer probabilities
                for n in range(delta, len(sequence) - delta):  # start to end
                    ref_kmer = sequence[n - delta:n + delta + 1]
                    prob = defaultdict(list)
                    if 'N' not in ref_kmer:
                        # calculate mutational prob to any other kmer
                        # sort alternates to keep track
                        for alt in sorted(
                                nucleot.difference(
                                    {ref_kmer[self.kmer // 2]})):
                            for group, signature in signatures_d.items():
                                prob[group].append(
                                    signature.get(
                                        '{}>{}'.format(ref_kmer, alt), 0))
                    else:
                        logger.warning(
                            'Mutational probabilities for position {0} could not be calculated. '
                            'Reverting {0}>ALT probabilities to 0'.format(
                                n))
                        for group in signatures_d:
                            prob[group].extend([0, 0, 0])
                    # Extend position info
                    for group in signatures_d:
                        probabilities[group].extend(prob[group])

                # Check and add
                for group in signatures_d:
                    group_probabilities = probabilities[group]
                    total = sum(group_probabilities)
                    length_ok = len(group_probabilities) == 3 * expected_length
                    if total != 0 and length_ok:
                        probs_tree[group].addi(interval[0], interval[1],
                                               group_probabilities)
                    elif total == 0:
                        logger.critical(
                            'Context based mutational probabilities in {} '
                            'region {}-{} equal to 0\n'.format(
                                element, interval[0], interval[1]))
                        skip = True
                        break
                    else:
                        logger.warning(
                            '{} probabilities list length is different than expected'
                            .format(element))
                        skip = True
                        break
                if skip:
                    break

        if skip:
            logger.critical(
                'Context based mutational probabilities could not be calculated for {0}\n'
                '{0} analysis is skipped'.format(element))

        return probs_tree, skip
def get_full_composition(chrom, kmer_len, species):
    """
    Count the k-mers of one chromosome.

    The sequence is requested from ``refseq`` starting at position 1 with a
    size of -1 (presumably "up to the end of the chromosome"; confirm against
    the ``refseq`` documentation), and every k-mer produced by
    ``kmers_generator`` is tallied in a Counter.
    """
    chromosome_sequence = refseq(species, chrom, 1, -1)
    kmer_stream = kmers_generator(chromosome_sequence, kmer_len)
    return Counter(kmer_stream)