def get_composition(group, kmer_len, species):
    """Count the k-mer composition of each segment in a group and add it to the Counter."""
    chrom, data = group
    total_triplets = Counter()
    for i, row in data.iterrows():
        try:
            seq = ''
            if kmer_len == 5:
                seq = refseq(species, chrom, int(row['s']) - 2, int(row['distance']) + 4)
            elif kmer_len == 7:
                seq = refseq(species, chrom, int(row['s']) - 3, int(row['distance']) + 6)
            elif kmer_len == 3:
                seq = refseq(species, chrom, int(row['s']) - 1, int(row['distance']) + 2)
            if len(seq) > 0:
                total_triplets = total_triplets + Counter(kmers_generator(seq, kmer_len))
        except (ValueError, RuntimeError):  # out of chromosome size, unknown chromosome
            continue
    return total_triplets
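# Hypothetical driver for get_composition, shown only to illustrate the expected shape of
# `group`: a (chromosome, sub-DataFrame) tuple as produced by DataFrame.groupby. The column
# name 'chr' and the 'saccer3' build are assumptions for this sketch, not part of the original code.
from collections import Counter


def composition_per_chromosome(df, kmer_len=3, species='saccer3'):
    """Sum the k-mer composition of all segments, one chromosome at a time."""
    total = Counter()
    for group in df.groupby('chr'):  # each iteration yields a (chromosome, data) tuple
        total += get_composition(group, kmer_len, species)
    return total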
def obtain_context(df, cont, cent):
    """Return `cont` bases of sequence context around a peak, or '-' if it cannot be retrieved."""
    try:
        seq = refseq('saccer3', df['Chromosome'], df['Peak genomic coordinate'] - cent, cont)
    except (ValueError, RuntimeError):  # out of chromosome size, unknown chromosome
        seq = '-'
    return seq
def get_sequence_window(nucid, species, window, kmers=3):
    """Get the sequence for a particular window (including flanking positions)."""
    chrom, start, end = nucid.split('_')
    # this is already one position before the real start
    start_real = int(start) - (window - 1) // 2
    t_win = 0
    if kmers == 3:
        t_win = window + 2
        seq = refseq(species, chrom, int(start_real), t_win)
    elif kmers == 5:
        t_win = window + 4
        seq = refseq(species, chrom, int(start_real) - 1, t_win)
    elif kmers == 7:
        t_win = window + 6
        seq = refseq(species, chrom, int(start_real) - 2, t_win)
    # pad with 'N' if we reach the end of the chromosome; unlikely in hg19 but it can happen in yeast
    if len(seq) != t_win:
        diff = t_win - len(seq)
        seq = seq + ('N' * diff)
    return seq
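# Usage sketch for get_sequence_window. The identifier format 'chromosome_start_end' follows the
# nucid.split('_') above; the coordinates and the 'saccer3' build below are invented for illustration.
def example_sequence_window():
    # 147-bp window plus one flanking base per side, as required for 3-mer counting
    seq = get_sequence_window('chrIV_1000_1146', 'saccer3', window=147, kmers=3)
    assert len(seq) == 147 + 2  # windows running past the chromosome end are padded with 'N'
    return seq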
def get_mutation(row, genome, size):
    """Return the 3-mer or 5-mer context of a mutation, or None if it cannot be resolved."""
    ref = row['REF']
    chrom = row['CHR']
    pos = int(row['POS'])
    try:
        kmer = refseq(genome, chrom, pos - 2, 5)
    except (ValueError, RuntimeError):  # out of chromosome size, unknown chromosome
        return None
    if len(kmer) == 5 and (ref == '-' or kmer[2] == ref) and all(n in 'ACGT' for n in kmer):
        if size == 5:
            return kmer
        else:  # 3-mer by default
            return kmer[1:4]
    else:
        return None
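# Usage sketch for get_mutation with a minimal mapping standing in for a DataFrame row.
# The chromosome, position and 'hg19' build are illustrative only, and the hg19 data must be
# available to bgreference for the refseq call to succeed.
def example_get_mutation():
    row = {'REF': '-', 'CHR': '1', 'POS': 1000000}
    # Returns the 3-mer centered on the position, or None if the context is not plain ACGT
    return get_mutation(row, genome='hg19', size=3)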
def analyze_experiment(vcf_file):
    df = vcf_reader(vcf_file)
    df['POS-1'] = df['POS'] - 1

    # remove non-canonical chromosomes
    df['CAN'] = df['CHROM'].apply(lambda x: 'RMV' if 'CACT' in x else 'PASS')
    df = df[df['CAN'] == 'PASS']
    df['TRIPLET'] = df.apply(
        lambda x: refseq('leish', x['CHROM'], x['POS-1'], 3, release=None), axis=1)

    # select whether we have SNVs or others
    df['len_alt'] = df['ALT'].str.len()  # number of characters in ALT
    df['len_ref'] = df['REF'].str.len()  # number of characters in REF

    # first classification between SNV and others
    df['TYPE'] = df.apply(
        lambda x: 'SNV' if (x['len_alt'] == 1) and (x['len_ref'] == 1)
        and (x['ALT'] != '-') and (x['REF'] != '-') else 'INDEL', axis=1)
    df = df[df['TYPE'] == 'SNV']
    df['VARIANT_CLASS'] = df.apply(create_snv_class, axis=1)

    # get whether the mutation has happened in the WT or the treated sample
    df['FILTER_WT'] = df['ERR174230.bam'].apply(lambda x: 'RES' if '0/0' in x else 'WT')

    # select only variants in the treated sample
    dic_variants = df[df['FILTER_WT'] == 'RES']['VARIANT_CLASS'].value_counts().to_dict()
    do_plot(dic_variants, 'experiment')

    # select only variants in the WT
    dic_variants = df[df['FILTER_WT'] == 'WT']['VARIANT_CLASS'].value_counts().to_dict()
    do_plot(dic_variants, 'WT')
def proportion_ww(genome, file):
    """Count, per window position, WW (A/T followed by A/T) dinucleotides across 148-bp windows
    centered on the coordinates read from `file`."""
    wanted = 'AT'
    count_seq = 0
    data = collections.defaultdict(int)
    with gzip.open(file, 'rt') as infile:
        next(infile)  # skip the header line
        for line in infile:
            line_spl = line.rstrip().split('\t')
            try:
                seq = refseq(genome, line_spl[0], int(line_spl[2]) - 73, 148)
                count_seq += 1
                for i in range(len(seq) - 1):  # -1 so seq[i + 1] stays in range
                    if (seq[i] in wanted) and (seq[i + 1] in wanted):
                        data[i] += 1
            except (ValueError, RuntimeError):  # out of chromosome size, unknown chromosome
                continue
    return data, count_seq
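# Follow-up sketch: turn the raw WW counts returned by proportion_ww into per-position
# proportions. The input file layout (gzipped, tab-separated, chromosome in the first column and
# position in the third, one header line) is inferred from the parsing above, not documented anywhere.
def ww_proportions(genome, file):
    data, count_seq = proportion_ww(genome, file)
    if count_seq == 0:
        return {}
    # fraction of windows with a WW dinucleotide starting at each position
    return {pos: count / count_seq for pos, count in sorted(data.items())}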
def IndelsClassifier(row, genome='hg19'):
    """
    Classify indel and DBS events into the PCAWG categories to feed into the extraction.
    Indels are classified based on the position in the genome where we believe they have occurred.

    a) In the case of deletions:
       1. If the first letter in REF is the same as in ALT, we conclude that the fragment excised is REF[1:].
       2. If the first letter in REF differs from the one in ALT, we conclude the entire REF has been excised.
          This means that when checking sequences, POS should not be included as in 1), since it is also deleted.

    b) In the case of insertions:
       1. If the first letter in REF is the same as in ALT, we conclude that the fragment inserted is ALT[1:].
       2. If the first letter in REF differs from the one in ALT, we conclude the insertion is the entire REF.
    """
    dipyr = ('C', 'T')
    comp = {'A': 'T', 'G': 'C'}
    complementary = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    dbs_list = {
        'AC_CA', 'AC_CG', 'AC_CT', 'AC_GA', 'AC_GG', 'AC_GT', 'AC_TA', 'AC_TG', 'AC_TT',
        'AT_CA', 'AT_CC', 'AT_CG', 'AT_GA', 'AT_GC', 'AT_TA',
        'CC_AA', 'CC_AG', 'CC_AT', 'CC_GA', 'CC_GG', 'CC_GT', 'CC_TA', 'CC_TG', 'CC_TT',
        'CG_AT', 'CG_GC', 'CG_GT', 'CG_TA', 'CG_TC', 'CG_TT',
        'CT_AA', 'CT_AC', 'CT_AG', 'CT_GA', 'CT_GC', 'CT_GG', 'CT_TA', 'CT_TC', 'CT_TG',
        'GC_AA', 'GC_AG', 'GC_AT', 'GC_CA', 'GC_CG', 'GC_TA',
        'TA_AT', 'TA_CG', 'TA_CT', 'TA_GC', 'TA_GG', 'TA_GT',
        'TC_AA', 'TC_AG', 'TC_AT', 'TC_CA', 'TC_CG', 'TC_CT', 'TC_GA', 'TC_GG', 'TC_GT',
        'TG_AA', 'TG_AC', 'TG_AT', 'TG_CA', 'TG_CC', 'TG_CT', 'TG_GA', 'TG_GC', 'TG_GT',
        'TT_AA', 'TT_AC', 'TT_AG', 'TT_CA', 'TT_CC', 'TT_CG', 'TT_GA', 'TT_GC', 'TT_GG'
    }

    # ===================
    # DELETION BIG GROUP
    # ===================
    # Example --> chr1 99072 CT C
    # This means the T has been removed and the C is kept
    if row['CLASS'] == 'DEL':
        first_nuc_REF = row['REF'][0]
        first_nuc_ALT = row['ALT'][0]
        if first_nuc_REF == first_nuc_ALT:
            # the first nucleotide is constant
            flag_full = True
            endpos = len(row['REF'])
            size_del = len(row['REF'][1:])
            affected_site = row['REF'][1]
            affected_seq = row['REF'][1:]
            pos = int(row['POS'])
        else:
            size_del = len(row['REF'])
            affected_site = row['REF'][0]
            pos = int(row['POS']) - 1
            affected_seq = row['REF']
            endpos = len(row['REF']) + 1

        # FIRST CLASS: one-base deletion
        if size_del == 1:
            # flanking sequences to look for repeats of the deleted base
            left_sequence = refseq(genome, row['CHROM'], pos - 4, 5)
            right_sequence = refseq(genome, row['CHROM'], pos + 2, 5)
            # now we check for the repeats
            # the count includes the deleted base itself, so it is never 0
            count_eq = 1
            for i in right_sequence:
                if i == affected_site:
                    count_eq += 1
                else:
                    break
            for i in left_sequence[::-1]:
                if i == affected_site:
                    count_eq += 1
                else:
                    break
            # more than 5 copies in total --> 6+ class
            if count_eq > 5:
                class_in = 'DEL_{}_1_6+'.format(comp.get(affected_site, affected_site))
            # else, we specify the pyrimidine and how many repeats we have
            else:
                class_in = 'DEL_{}_1_{}'.format(
                    comp.get(affected_site, affected_site), count_eq)

        # SECOND CLASS: more-than-one-base deletion
        elif size_del > 1:
            # len_ref - 1 because the chunk excised also contains the nucleotide before
            seq1 = refseq(genome, row['CHROM'], pos + endpos, size_del * 5)
            # we want the 3' of the reverse
            seq2 = refseq(genome, row['CHROM'], pos - size_del * 5, size_del * 5 + 1)[::-1]
            count_eq = 1
            # split the sequence into bins of the same size as the deleted region
            # Right location
            splitted = [seq1[i:i + size_del] for i in range(0, len(seq1), size_del)]
            for i in splitted:
                if i == affected_seq:
                    count_eq += 1
                else:
                    break
            # Left location
            splitted = [seq2[i:i + size_del] for i in range(0, len(seq2), size_del)]
            for i in splitted:
                if i == affected_seq[::-1]:
                    count_eq += 1
                else:
                    break
            # more than 5 copies in total --> 6+ class
            if count_eq > 5:
                if size_del >= 5:
                    class_in = 'DEL_repeats_5+_6+'
                else:
                    class_in = 'DEL_repeats_{}_6+'.format(size_del)
            else:
                # if some repeats are found but fewer than 6, they belong to the next class
                if count_eq > 1:
                    if size_del >= 5:
                        class_in = 'DEL_repeats_5+_{}'.format(count_eq)
                    else:
                        class_in = 'DEL_repeats_{}_{}'.format(size_del, count_eq)
                # if no full repeat is found, then give microhomology a chance
                else:
                    # get the sequence on the right
                    right_seq = refseq(genome, row['CHROM'], pos + endpos, size_del)
                    # get the sequence on the left
                    left_seq = refseq(genome, row['CHROM'], pos - size_del, size_del + 1)
                    good = 0
                    # we go down the size of the indel
                    for i in np.arange(size_del - 1, 0, -1):
                        # check the right side
                        tocheck = affected_seq[:i]
                        tocheck_right = right_seq[:i]
                        if tocheck == tocheck_right:
                            good = i
                            break
                        # check the left side
                        tocheck = affected_seq[::-1][:i]
                        tocheck_left = left_seq[::-1][:i]
                        if tocheck == tocheck_left:
                            good = i
                            break
                    # if microhomology has been detected
                    if good > 0:
                        if good >= 5:
                            good = '5+'
                        if size_del >= 5:
                            class_in = 'DEL_MH_5+_{}'.format(good)
                        else:
                            class_in = 'DEL_MH_{}_{}'.format(size_del, good)
                    # else this is a deletion with no repetitions
                    else:
                        if size_del >= 5:
                            class_in = 'DEL_repeats_5+_1'  # count of one, following the PCAWG definition
                        else:
                            class_in = 'DEL_repeats_{}_1'.format(size_del)

    # ===================
    # INSERTIONS BIG GROUP
    # ===================
    elif row['CLASS'] == 'INS':
        first_nuc_REF = row['REF'][0]
        first_nuc_ALT = row['ALT'][0]
        if first_nuc_REF == first_nuc_ALT:
            flag_full = True
            endpos = len(row['REF'])
            size_del = len(row['ALT'][1:])
            affected_site = row['ALT'][1]
            affected_seq = row['ALT'][1:]
            pos = int(row['POS'])
        else:
            flag_full = False
            size_del = len(row['ALT'])
            affected_site = row['ALT'][0]
            pos = int(row['POS']) - 1
            affected_seq = row['ALT']
            endpos = len(row['REF']) + 1

        # FIRST CLASS: one-base insertion
        if size_del == 1:
            # pos + endpos because the insertion is mapped just at the beginning of POS
            right_sequence = refseq(genome, row['CHROM'], pos + endpos, 5)
            # we want the 3' of the reversed sequence; this should include the first nucleotide
            left_sequence = refseq(genome, row['CHROM'], pos - 4, 5)
            # the inserted base may have no copies in the reference, so the count starts at 0
            count_eq = 0
            for i in right_sequence:
                if i == affected_site:
                    count_eq += 1
                else:
                    break
            for i in left_sequence[::-1]:
                if i == affected_site:
                    count_eq += 1
                else:
                    break
            if count_eq >= 5:
                class_in = 'INS_{}_1_5+'.format(comp.get(affected_site, affected_site))
            # else, we specify the pyrimidine and how many repeats we have
            else:
                class_in = 'INS_{}_1_{}'.format(
                    comp.get(affected_site, affected_site), count_eq)

        # SECOND CLASS: more-than-one-base insertion
        elif size_del > 1:
            seq1 = refseq(genome, row['CHROM'], pos + endpos, size_del * 5)
            seq2 = refseq(genome, row['CHROM'], pos - size_del * 5, size_del * 5 + 1)[::-1]
            count_eq = 0
            # split the sequence into bins of the same size as the inserted region
            splitted = [seq1[i:i + size_del] for i in range(0, len(seq1), size_del)]
            for i in splitted:
                if i == affected_seq:
                    count_eq += 1
                else:
                    break
            splitted = [seq2[i:i + size_del] for i in range(0, len(seq2), size_del)]
            for i in splitted:
                if i == affected_seq[::-1]:
                    count_eq += 1
                else:
                    break
            # if the count is equal to or greater than 5, we have this class
            if count_eq >= 5:
                if size_del >= 5:
                    class_in = 'INS_repeats_5+_5+'
                else:
                    class_in = 'INS_repeats_{}_5+'.format(size_del)
            else:
                if size_del >= 5:
                    class_in = 'INS_repeats_5+_{}'.format(count_eq)
                else:
                    class_in = 'INS_repeats_{}_{}'.format(size_del, count_eq)

    # ===================
    # DBS BIG GROUP
    # ===================
    elif row['CLASS'] == 'DBS':
        # When the reverse complementary doublet base substitution classes are merged into one class,
        # 12 of the mutation classes have no strandness (e.g. CG>AT), resulting in 78 DBS classes
        class_in = '{}_{}'.format(row['REF'], row['ALT'])
        if class_in not in dbs_list:
            # e.g. AC>CA and GT>TG collapse into the same class
            class_in = '{}{}_{}{}'.format(complementary[row['REF'][1]],
                                          complementary[row['REF'][0]],
                                          complementary[row['ALT'][1]],
                                          complementary[row['ALT'][0]])
    else:
        class_in = 'NOTGOOD'

    return class_in
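# Usage sketch for IndelsClassifier on a pandas DataFrame of variants. The column names
# CLASS/CHROM/POS/REF/ALT match the row accesses above; any concrete variant you feed in would
# require the corresponding genome (hg19 by default) to be installed for bgreference.
def classify_indels(df, genome='hg19'):
    """Return the PCAWG indel/DBS class for every row of a CLASS/CHROM/POS/REF/ALT table."""
    return df.apply(lambda row: IndelsClassifier(row, genome=genome), axis=1)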
def coronavirus(pos, size=1):
    """Return `size` bases of the SARS-CoV-2 reference sequence starting at position `pos`."""
    return refseq("sarscov2", "0", pos, size)
def simulate_and_analysis(self, item):
    """
    Simulate mutations and analyze the simulations.

    Args:
        item (tuple): element of analysis data containing element (str), probs_tree (IntervalTree),
            n_sim (int) and seed (int)

    Returns:
        element (str): element of analysis
        sim_scores_chunk (list): simulated element's results
        sim_cluster_chunk (list): simulated clusters' results
    """
    element, probs_tree, n_sim, seed = item
    sim_scores_chunk = []
    sim_cluster_chunk = []
    df_simulated_mutations = []
    half_window = (self.simulation_window - 1) // 2
    nucleot = {'A', 'C', 'G', 'T'}
    np.random.seed(seed)

    # Simulate mutations
    for mutation in self.mutations_d[element]:
        # Get coordinates of the randomization window
        expected_hotspot_begin = mutation.position - half_window
        expected_hotspot_end = mutation.position + half_window

        if self.simulation_mode == 'region_restricted':
            """
            Region restricted mode samples simulated mutations in a window of length l that fits
            in the genomic element. First, it checks that the genomic region where the mutation is
            going to be simulated is longer than or equal to l. If this is true, it calculates the
            expected start and end positions of the simulation window. If one of them falls outside
            the genomic element, the window of length l is displaced to fit in the genomic element.
            If both the expected start and end positions fall outside the genomic region, the
            simulation window is trimmed and simulations are performed inside the genomic region.
            If the genomic region is smaller than l, the simulation window becomes the genomic
            region: the window is trimmed and simulations are performed between the start and the
            end of the genomic region.
            """
            if (mutation.region[1] - mutation.region[0]) >= self.simulation_window:
                # Check if the hotspot falls outside the region
                check_5 = expected_hotspot_begin < mutation.region[0]
                check_3 = expected_hotspot_end > (mutation.region[1] - 1)
                if check_5 and check_3:
                    hotspot_begin = mutation.region[0]
                    hotspot_end = mutation.region[1] - 1  # regions end +1 in tree
                elif check_5:
                    hotspot_begin = mutation.region[0]
                    hotspot_end = mutation.region[0] + self.simulation_window - 1  # window //2 per side
                elif check_3:
                    hotspot_end = mutation.region[1] - 1  # regions end +1 in tree
                    hotspot_begin = hotspot_end - self.simulation_window + 1  # window //2 per side
                else:
                    hotspot_begin = expected_hotspot_begin
                    hotspot_end = expected_hotspot_end
            else:
                hotspot_begin = mutation.region[0]
                hotspot_end = mutation.region[1] - 1  # regions end +1 in tree
        else:
            """
            Simulations are `mutation centered`: they are centered on the mutated position and can
            fall outside the genomic region.
            """
            hotspot_begin = expected_hotspot_begin
            hotspot_end = expected_hotspot_end

        # Map to index
        # 3 * accounts for the alternates in the array of probabilities
        # half_window is added in the probabilities array
        start_index = 3 * (hotspot_begin - (mutation.region[0] - half_window))
        end_index = 3 * (hotspot_end - (mutation.region[0] - half_window) + 1)  # +1, range and slice

        for interval in probs_tree[mutation.group][mutation.region[0]]:  # unique iteration
            simulations = np.random.choice(
                range(start_index, end_index),
                size=n_sim,
                p=self.normalize(element, interval.data[start_index:end_index]))

            # Add info per simulated mutation
            list_simulations_per_mutation = []
            for count, index in enumerate(simulations):
                position = mutation.region[0] - half_window + index // 3
                ref_nucleotide = bgr.refseq(self.genome, self.chromosomes_d[element], position, 1)
                # Calculate the sorted alternates and obtain the simulated alternate from the index
                if round(index / 3, 1) == (0.7 + index // 3):
                    alternate_index = 2
                else:
                    alternate_index = 1 if round(index / 3, 1) == (0.3 + index // 3) else 0
                alternate = sorted(list(nucleot.difference({ref_nucleotide})))[alternate_index]
                # Simulated mutation
                list_simulations_per_mutation.append(
                    Mutation(position, mutation.region, alternate, mutation.sample, mutation.group))
            df_simulated_mutations.append(list_simulations_per_mutation)

    # Start analysis
    logger.debug('Start analyzing simulations')
    for simulated_mutations in zip(*df_simulated_mutations):
        cutoff_clusters, element_score = self.analysis(element, list(simulated_mutations))
        sim_scores_chunk.append(element_score)
        for interval in cutoff_clusters:
            clusters = interval.data.copy()
            for cluster, values in clusters.items():
                sim_cluster_chunk.append(values['score'])

    return element, sim_scores_chunk, sim_cluster_chunk
def mut_probabilities(self, element):
    """
    Generate mutational probabilities per position of an element, using the sequence-context
    mutational probabilities observed in the input cohort/s.

    Args:
        element (str): element to calculate pre-smoothing

    Returns:
        probs_tree (IntervalTree): IntervalTree of genomic regions. Length == 3*(genomic + simulation window)
        skip (bool): if True, skip further analysis
    """
    nucleot = {'A', 'C', 'G', 'T'}
    probs_tree = defaultdict(IntervalTree)
    delta = 1 if self.kmer == 3 else 2
    half_window = self.simulation_window // 2
    skip = False

    # Check signatures dictionaries per group
    signatures_d = defaultdict()
    for group in self.groups_d[element]:
        if os.path.isfile(self.path_pickle):
            signature = pickle.load(open(self.path_pickle, "rb"))
            try:
                signatures_d[group] = signature[group]
            except KeyError:
                raise Exception(
                    'Signatures for group {} are missing in signatures dictionary\n'
                    'Please check signatures file {}'.format(group, self.path_pickle))
        else:
            skip = True

    if not skip:
        # Iterate through genomic regions to get their sequences
        sequence = ''
        for interval in self.regions_d[element]:
            probabilities = defaultdict(list)
            expected_length = interval[1] - interval[0] + half_window * 2
            start = interval[0] - half_window - delta
            size = interval[1] - interval[0] + half_window * 2 + delta * 2
            try:
                sequence = bgr.refseq(self.genome, self.chromosomes_d[element], start, size)
            except ValueError as e:
                logger.error(e, element, start, size, interval[0], interval[1])
            if sequence:
                # Search kmer probabilities
                for n in range(delta, len(sequence) - delta):  # start to end
                    ref_kmer = sequence[n - delta:n + delta + 1]
                    prob = defaultdict(list)
                    if ref_kmer.count('N') == 0:
                        # calculate the mutational probability to any other kmer
                        # sort alternates to keep track
                        for alt in sorted(list(nucleot.difference({ref_kmer[self.kmer // 2]}))):
                            for group, signature in signatures_d.items():
                                prob[group].append(signature.get('{}>{}'.format(ref_kmer, alt), 0))
                    else:
                        logger.warning(
                            'Mutational probabilities for position {0} could not be calculated. '
                            'Reverting {0}>ALT probabilities to 0'.format(n))
                        for group, signature in signatures_d.items():
                            prob[group].extend([0, 0, 0])
                    # Extend position info
                    for group in signatures_d.keys():
                        probabilities[group].extend(prob[group])
                # Check and add
                for group in signatures_d.keys():
                    if sum(probabilities[group]) != 0 and len(probabilities[group]) == 3 * expected_length:
                        probs_tree[group].addi(interval[0], interval[1], probabilities[group])
                    elif sum(probabilities[group]) == 0:
                        logger.critical(
                            'Context based mutational probabilities in {} '
                            'region {}-{} equal to 0\n'.format(element, interval[0], interval[1]))
                        skip = True
                        break
                    elif len(probabilities[group]) != 3 * expected_length:
                        logger.warning(
                            '{} probabilities list length is different than expected'.format(element))
                        skip = True
                        break
                if skip:
                    break
            else:
                skip = True
                break

    if skip:
        logger.critical(
            'Context based mutational probabilities could not be calculated for {0}\n'
            '{0} analysis is skipped'.format(element))

    return probs_tree, skip
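# Illustrative sketch of the signatures pickle layout that mut_probabilities expects: a dict keyed
# by group, where each value maps '<ref_kmer>><alt>' strings to probabilities (missing keys default
# to 0 via signature.get). The group name and the numbers below are invented for illustration.
import pickle


def write_toy_signatures(path):
    signatures = {
        'cohort_A': {
            'ACA>G': 0.012,
            'ACA>T': 0.004,
            'TCT>A': 0.020,
        },
    }
    with open(path, 'wb') as fd:
        pickle.dump(signatures, fd)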
def get_full_composition(chrom, kmer_len, species):
    """Count the k-mer composition of the whole chromosome sequence and add it to the Counter."""
    seq = refseq(species, chrom, 1, -1)
    return Counter(kmers_generator(seq, kmer_len))
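# Usage sketch: aggregate get_full_composition over a list of chromosomes to obtain a genome-wide
# k-mer background. The chromosome list and the 'saccer3' build are assumptions of this example.
from collections import Counter


def genome_composition(chromosomes, kmer_len=3, species='saccer3'):
    """Sum the k-mer composition of every chromosome in `chromosomes`."""
    total = Counter()
    for chrom in chromosomes:
        total += get_full_composition(chrom, kmer_len, species)
    return total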