def duplicate_trim_set_with_2nd_set(dict_target_allele_SEQ, dict_fixed_allele_SEQ, ext_flag=True, ext_thrd=0.70, ori_flag=False):
    """Drop target sequences that duplicate the fixed set; optionally rename extensions.

    If a target_SEQ is a subsequence of any fixed_SEQ (forward or reverse
    complement), the target_SEQ is dropped.  If ext_flag is True and a
    fixed_SEQ is a subsequence of a target_SEQ, the target_SEQ is kept and
    renamed to fixed_name + '/extend-<serial>'; this only applies when
    len(fixed_SEQ) >= len(target_SEQ) * ext_thrd.

    :param dict_target_allele_SEQ: {name: sequence} to be filtered
    :param dict_fixed_allele_SEQ:  {name: sequence} used as the reference set
    :param ext_flag: allow keeping targets that extend a fixed sequence
    :param ext_thrd: minimum fixed/target length ratio for the extend rule
    :param ori_flag: if True, surviving targets keep their original names
    :return: {name: sequence} of the surviving targets
    """
    dict_trimmed_allele_SEQ = {}
    for t_name, t_SEQ in dict_target_allele_SEQ.items():
        # hoist the case-folding / reverse complement out of the inner loop
        t_upper = t_SEQ.upper()
        t_rc = get_reverse_complement(t_upper)
        assign_name = t_name
        for f_name, f_SEQ in dict_fixed_allele_SEQ.items():
            f_upper = f_SEQ.upper()
            if t_upper in f_upper or t_upper in get_reverse_complement(f_upper):
                # t_SEQ is a subsequence of a fixed sequence: drop it
                assign_name = False
                break
            elif f_upper in t_upper or f_upper in t_rc:
                # a fixed sequence is a subsequence of t_SEQ
                if ext_flag and len(f_SEQ) >= len(t_SEQ) * ext_thrd:
                    assign_name = assign_new_name(f_name, '/extend-', dict_trimmed_allele_SEQ)
                else:
                    assign_name = False
                # no break: a later fixed sequence may still drop this target
        if assign_name:
            if ori_flag:
                dict_trimmed_allele_SEQ[t_name] = t_SEQ
            elif 'extend' in assign_name:
                dict_trimmed_allele_SEQ[assign_name] = t_SEQ
            else:
                assign_name = assign_new_name(assign_name, '/novel-', dict_trimmed_allele_SEQ)
                dict_trimmed_allele_SEQ[assign_name] = t_SEQ
    return dict_trimmed_allele_SEQ
def correct_allele(dict_occupied_place, dict_SEQ, dict_corrected_alleles, dict_flanking_alleles, dict_contig, len_extend):
    """Collect corrected and flanking sequences for alleles placed on contigs.

    dict_corrected_alleles {}
    - keys: allele_name
    - values: corrected_SEQ_set {corrected_SEQ_1, corrected_SEQ_2}
    dict_flanking_alleles {}
    - keys: allele_name
    - values: corrected_SEQ_set {flanking_SEQ_1, flanking_SEQ_2}

    :param dict_occupied_place: {contig_name: [(start, end, mismatch, name, flag), ...]}
    :param dict_SEQ: unused here; kept for interface compatibility with callers
    :param dict_corrected_alleles: accumulator, mutated in place and returned
    :param dict_flanking_alleles: accumulator, mutated in place and returned
    :param dict_contig: {contig_name: contig_sequence}
    :param len_extend: number of flanking bases taken on each side
    :return: (dict_corrected_alleles, dict_flanking_alleles)
    """
    for contig_name, list_contig in dict_occupied_place.items():
        contig_SEQ = dict_contig[contig_name]
        contig_len = len(contig_SEQ)
        for pairs in list_contig:
            pos_start = pairs[0]
            pos_end = pairs[1]
            allele_name = pairs[3]
            try:
                allele_name = allele_name.split('|')[1]
            except IndexError:
                # no '|' delimiter: fall back to the first whitespace token
                allele_name = allele_name.split()[0]
            flag = pairs[4]
            # 1-based coordinates, clamped to the contig boundaries
            flanking_SEQ = contig_SEQ[max(0, pos_start - len_extend - 1):min(contig_len, pos_end + len_extend - 1)].lower()
            if pairs[2] != 0:
                # mismatched alleles but remain in contig
                corrected_SEQ = contig_SEQ[pos_start - 1:pos_end - 1].lower()
                if flag % 32 >= 16:
                    # SAM flag bit 0x10: alignment is on the reverse strand
                    corrected_SEQ = get_reverse_complement(corrected_SEQ)
                    flanking_SEQ = get_reverse_complement(flanking_SEQ)
                # setdefault instead of .get() truthiness: an existing (even
                # empty) set is extended rather than silently replaced
                dict_corrected_alleles.setdefault(allele_name, set()).add(corrected_SEQ)
                dict_flanking_alleles.setdefault(allele_name + "/novel", set()).add(flanking_SEQ)
            else:
                if flag % 32 >= 16:
                    flanking_SEQ = get_reverse_complement(flanking_SEQ)
                dict_flanking_alleles.setdefault(allele_name, set()).add(flanking_SEQ)
    return dict_corrected_alleles, dict_flanking_alleles
def get_joint_entropy_profile_per_sequence(seq, w, alias, out=None):
    """Sliding-window joint-entropy profile of a single sequence.

    :param seq: the genomic sequence (a string)
    :param w: the window size
    :param alias: column name and output-file prefix
    :param out: optional output directory; if not None the profile is saved as CSV
    :return: a one-column DataFrame of joint entropies
    """
    entropies = []
    genome = seq
    for j in range(len(genome) - w):
        sub_genome = genome[j:j + w]
        try:
            rc_sub_genome = get_reverse_complement(sub_genome)
            entropy = joint_entropy(sub_genome, rc_sub_genome, 5)
        except Exception:
            # truncate the profile at the first window that cannot be processed
            break
        entropies.append(entropy)
    df = pd.DataFrame({'{}'.format(alias): entropies})
    if out is not None:
        df.to_csv(os.path.join(out, '{}_profile.csv'.format(alias)), index=False)
    return df
def get_joint_entropy_profile(fasta, w, out=None):
    """Sliding-window joint-entropy profile of all sequences in a family.

    :param fasta: a fasta file containing viral sequences
    :param w: the window size
    :param out: optional; if not None the profile is saved as a CSV
    :return: a DataFrame with one column per sequence of profile entropies

    Fix: `out` defaults to None, but the original called
    os.path.join(out, ...) unconditionally, raising TypeError; the write is
    now guarded.
    """
    all_entropies = {}
    alias = os.path.basename(fasta).split('.')[0]
    for i, rec in enumerate(SeqIO.parse(fasta, "fasta")):
        entropies = []
        # get identifier and genomic sequence
        genome = str(rec.seq)
        for j in range(len(genome) - w):
            sub_genome = genome[j:j + w]
            rc_sub_genome = str(get_reverse_complement(sub_genome))
            entropies.append(joint_entropy(sub_genome, rc_sub_genome, 5))
        print('Done with seq {}'.format(i))
        all_entropies['seq_{}'.format(i)] = entropies
    # pad columns of unequal length with NaN via pd.Series
    df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in all_entropies.items()]))
    if out is not None:
        df.to_csv(os.path.join(out, '{}_Joint_profile.csv'.format(alias)), index=False)
    return df
def simulate_genome_by_composition(p, n, size, mode):
    """
    simulate genomes of changing nucleotide compositions
    :param p: the proportions of each character
    :param n: length of simulated sequence
    :param size: number of sequences to simulate
    :param mode: the mode of simulation: 1= no structure, 2= structure
    :return: sequences and corresponding names
    """
    alphabet = ['a', 'c', 'g', 't']
    sequences = []
    names = []
    if mode == 1:
        # unstructured: draw every base independently from composition p
        for idx in tqdm(range(size)):
            sequences.append(''.join(np.random.choice(alphabet, p=p, size=n)))
            names.append('mode_{}_seq_{}'.format(mode, idx))
    else:
        # structured: a perfect stem loop (arm + 6-nt 'a' loop + reverse complement)
        for idx in tqdm(range(size)):
            arm = ''.join(np.random.choice(alphabet, p=p, size=n // 2))
            sequences.append(arm + 'aaaaaa' + str(get_reverse_complement(arm)))
            names.append('mode_{}_seq_{}'.format(mode, idx))
    return sequences, names
def get_SEQ_from_sam_list(list_fields, dict_SEQ):
    """Collect one sequence per read name from split SAM alignment lines.

    Mutates dict_SEQ in place, keeping the first non-'*' SEQ seen for each
    read name; reverse-strand records (flag bit 0x10) are stored as their
    reverse complement so every stored sequence is in original orientation.

    :param list_fields: iterable of SAM lines already split into fields
    :param dict_SEQ: {read_name: sequence}, updated in place
    """
    for fields in list_fields:
        if fields[9] == '*':
            # no sequence stored in this record
            continue
        name = fields[0]
        if name in dict_SEQ:
            # fix: membership test instead of .get() truthiness, so an
            # existing falsy entry is not silently overwritten
            continue
        flag = int(fields[1])
        if flag % 32 >= 16:
            # flag bit 0x10 set: SEQ is stored reverse-complemented
            dict_SEQ[name] = get_reverse_complement(fields[9])
        else:
            # SEQ is the original one
            dict_SEQ[name] = fields[9]
def simulate_genome_by_drops(size, w, genome_size=5000):
    """
    simulate genomes of changing nucleotide compositions
    :param size: number of sequences to simulate
    :param w: the drop size
    :param genome_size: length of the random background metagenome
    :return: sequences and corresponding names

    Fix: drop_2 was built with `letter1 * w//2`, which parses as
    (letter1 * w) // 2 and raises TypeError (str // int); it is now
    parenthesized as letter1 * (w // 2).
    """
    uniform = [0.25, 0.25, 0.25, 0.25]
    metagenome = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=uniform, size=genome_size))
    sequences = []
    names = []
    # simulate `size` genomes
    for i in tqdm(range(size)):
        # drop 1 - homogeneous sequence
        drop_1 = np.random.choice(['a', 'c', 'g', 't']) * w
        # drop 2 - repetitive and structure
        letter1 = np.random.choice(['a', 'c', 'g', 't'])
        letter2 = np.random.choice([x for x in ['a', 'c', 'g', 't'] if x != letter1])  # we want a different nuc.
        drop_2 = letter1 * (w // 2) + letter2 * (w // 2)
        # drop 3 - pure structure: stem arm + 10-nt loop + reverse complement
        stem_arm = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=uniform, size=w // 2 - 5))
        loop = np.random.choice(['a', 'c', 'g', 't']) * 10  # loop of 10 nuc.
        drop_3 = stem_arm + loop + str(get_reverse_complement(stem_arm))
        # drop 4 - bias in nucleotide composition
        nucs = ['a', 'c', 'g', 't']
        np.random.shuffle(nucs)
        drop_4 = ''.join(np.random.choice(nucs, p=[0.6, 0.2, 0.1, 0.1], size=w))
        # insert the drops into the metagenome at fixed offsets
        simulated_genome = (metagenome[:1000] + drop_1 + metagenome[1000:2000] + drop_2 +
                            metagenome[2000:3000] + drop_3 + metagenome[3000:4000] + drop_4 +
                            metagenome[4000:])
        sequences.append(simulated_genome)
        names.append('seq_{}'.format(i))
    return sequences, names
dict_o_allele_SEQ = parse_fasta(fn_original_alleles)
dict_c_allele_SEQ = parse_fasta(fn_corrected_alleles)
# drop corrected alleles that duplicate the original set (or rename extensions)
dict_ref_trimmed_allele_SEQ = duplicate_trim_set_with_2nd_set(dict_c_allele_SEQ, dict_o_allele_SEQ)
# build one-character-shrunk variants so that only maximal sequences survive
# the self-deduplication pass below
dict_shrink = {}
for name, SEQ in dict_ref_trimmed_allele_SEQ.items():
    dict_shrink[name + '_prefix'] = SEQ[:-1]
    dict_shrink[name + '_suffix'] = SEQ[1:]
dict_self_trimmed_allele_SEQ = duplicate_trim_set_with_2nd_set(dict_ref_trimmed_allele_SEQ, dict_shrink, ext_flag=True, ext_thrd=0, ori_flag=True)
# remove exact duplicates, including reverse complements; sorted() snapshots
# the items, so popping from the dict inside the loop is safe
set_SEQ = set()
for name, SEQ in sorted(dict_self_trimmed_allele_SEQ.items()):
    if SEQ.upper() in set_SEQ:
        dict_self_trimmed_allele_SEQ.pop(name)
    else:
        set_SEQ.add(SEQ.upper())
        set_SEQ.add(get_reverse_complement(SEQ.upper()))
# write extended alleles and the remaining filtered alleles to separate files;
# context managers replace the original manual open/close pairs
with open(fo_filtered_alleles, 'w') as f_of, open(fo_extended_alleles, 'w') as f_oe:
    for allele_name in sorted(dict_self_trimmed_allele_SEQ.keys()):
        target = f_oe if 'extend' in allele_name else f_of
        target.write(">" + allele_name + '\n')
        target.write(dict_self_trimmed_allele_SEQ[allele_name] + '\n')
def get_kmers_distribution(fasta, k, out=None):
    """Collect the k-mer count distribution for each sequence in a family.

    :param fasta: fasta file
    :param k: the kmer length (5 = sliding window, 3 = reading frame, 1 = second codon position)
    :param out: optional output directory for the histogram png
    :return: a list with one {kmer: count} dict per sequence

    Fixes: plt.hist's `normed` keyword (removed in matplotlib 3.x) is
    replaced by `density`; the long-commented-out reverse-complement
    counting (kmers_2, always empty) was deleted, which leaves the merged
    counts identical.
    """
    alias = os.path.basename(fasta).split('.')[0]
    all_values = []
    for rec in SeqIO.parse(fasta, "fasta"):
        # get identifier and genomic sequence
        genome = rec.seq
        kmers_1 = {}
        if k == 5:
            # sliding window of k
            for i in range(len(genome) - k):
                kmer = genome[i:i + k]
                kmers_1[kmer] = kmers_1.get(kmer, 0) + 1
        elif k == 3:
            # reading frame: step by whole codons
            for i in range(0, len(genome) - 3, 3):
                kmer = genome[i:i + 3]
                kmers_1[kmer] = kmers_1.get(kmer, 0) + 1
        else:
            assert (k == 1)
            # single nucleotides at the second codon position only
            codon_trimmed = string_by_codon_position(genome, 2)
            for i in range(len(codon_trimmed)):
                kmer = codon_trimmed[i]
                kmers_1[kmer] = kmers_1.get(kmer, 0) + 1
        all_kmers = dict(kmers_1)
        values = [int(x) for x in all_kmers.values()]
        all_values.append(all_kmers)
        if out != None:
            # one histogram layer per sequence, saved once after the loop
            plt.hist(values, alpha=0.8, density=True)
    if out != None:
        plt.title('Distribution of kmers {}'.format(alias), fontsize=18)
        plt.xlabel('# kmers appearence', fontsize=18)
        plt.ylabel('Count', fontsize=18)
        sns.despine(offset=10)
        plt.savefig(os.path.join(out, '{}_kmers_distribution_hist_normed.png'.format(alias)),
                    format='png', dpi=400, bbox_inches='tight')
        plt.gcf().clear()
    return all_values
def parse_edit_distance(fn_sam, fn_output_file, fn_output_flanking_region, fn_output_flanking_size, dict_contig, cluster_id, thrsd=0, flanking_size=100):
    """Report SAM alignments with edit distance <= thrsd and dump their contigs.

    For every accepted alignment: append a summary line to fn_output_file,
    the whole contig to fn_output_flanking_region, and a window of
    +/- flanking_size around the alignment to fn_output_flanking_size.

    :param fn_sam: SAM file of alleles aligned to contigs
    :param fn_output_file: report file (opened in append mode)
    :param fn_output_flanking_region: fasta of full contig sequences (append)
    :param fn_output_flanking_size: fasta of flanking windows (append)
    :param dict_contig: {contig_name: contig_sequence}
    :param cluster_id: cluster label written into names (string-concatenated,
        so presumably a str — TODO confirm at call sites)
    :param thrsd: maximum NM edit distance accepted
    :param flanking_size: bases kept on each side of the alignment
    """
    f_report = open(fn_output_file, 'a')
    f_flank = open(fn_output_flanking_region, 'a')
    f_flank_size = open(fn_output_flanking_size, 'a')
    with open(fn_sam, 'r') as f_o:
        for line in f_o:
            if line[0] != '@':
                # real alignment information
                fields = line.split()
                #print(fields[11])
                # NM tag assumed at fields[11] ("NM:i:<dist>") — TODO confirm aligner output
                eDist = int(fields[11].split(':')[2])
                cigar = fields[5]
                # skip soft/hard-clipped alignments entirely
                if 'S' in cigar:
                    continue
                contig_name = fields[2]
                if contig_name != '*' and eDist <= thrsd:
                    print_word = fields[0] + ' ' + contig_name.split('_')[2] + ' ' + contig_name + ' ' + str(cluster_id) + '\n'
                    #print_word = fields[0] + '\t' + fields[2] + '\t' + fields[11]
                    f_report.write(print_word)
                    if dict_contig.get(contig_name):
                        contig_SEQ = dict_contig[contig_name]
                        allele_name = fields[0]
                        allele_print = allele_name + '_cluster_' + cluster_id
                        f_flank.write('>' + allele_print + '\n')
                        if (int(fields[1]) % 32) >= 16:
                            # SAM flag bit 0x10: reverse strand — emit the
                            # reverse-complemented contig and mirrored coordinates
                            f_flank.write(
                                get_reverse_complement(contig_SEQ) + '\n')
                            print(
                                str(
                                    len(contig_SEQ) - int(fields[3]) -
                                    len(fields[9]) + 1) + '-' +
                                str(len(contig_SEQ) - int(fields[3]) + 1) +
                                ',' + allele_print)
                        else:
                            f_flank.write(contig_SEQ + '\n')
                            print(
                                str(int(fields[3]) - 1) + '-' +
                                str(int(fields[3]) - 1 + len(fields[9])) +
                                ',' + allele_print)
                        # flanking window in forward contig coordinates,
                        # clamped to the contig boundaries
                        start_pos = int(fields[3]) - 1 - flanking_size
                        end_pos = int(fields[3]) - 1 + len(
                            fields[9]) + flanking_size
                        #print(str(start_pos) + '-' + str(end_pos))
                        if start_pos < 0:
                            start_pos = 0
                        if end_pos > len(contig_SEQ):
                            end_pos = len(contig_SEQ)
                        f_flank_size.write('>' + allele_print + '\n')
                        f_flank_size.write(contig_SEQ[start_pos:end_pos] + '\n')
                    else:
                        eprint("Warning! Contig name does not exist! " + contig_name)
    f_report.close()
    f_flank.close()
    f_flank_size.close()
def _simulated_cluster_df(n, seq_len, p, cluster_name, add_stem):
    """Simulate n sequences for one cluster and return a DataFrame with
    normalized entropy / joint-entropy columns (each column scaled by its
    own max, matching the original per-cluster normalization)."""
    sequences = []
    for _ in tqdm(range(n)):
        seq = ''.join(np.random.choice(['a', 'c', 'g', 't'], p=p, size=seq_len))
        if add_stem:
            # append the reverse complement -> a perfect stem loop
            seq = seq + str(get_reverse_complement(seq))
        sequences.append(seq)
    df = pd.DataFrame({'sequence': sequences, 'cluster': [cluster_name] * n})
    df['entropy'] = df['sequence'].apply(lambda x: entropy_by_kmer(x, 5))
    df['joint_entropy'] = df['sequence'].apply(lambda x: joint_entropy(x, str(get_reverse_complement(x)), 5))
    # normalize both entropy and joint entropy to 0-1
    df['entropy'] = df['entropy'] / df['entropy'].max()
    df['joint_entropy'] = df['joint_entropy'] / df['joint_entropy'].max()
    return df


def simulate_dataset(n, size):
    """
    simulate sequences from different classes (up to 4: repetitive,
    repetitive with stem loops, random, only stem loop)
    :param n: number of sequences to simulate
    :param size: the size of each sequence
    :return: a data frame containing a sequence, entropy and joint entropy,
        together with a type indicating the class

    The four copy-pasted simulation sections were folded into one
    parameterized helper; per-cluster behavior is unchanged.
    """
    df_rep = _simulated_cluster_df(n, size, [0.6, 0.2, 0.1, 0.1], 'Repetitive', add_stem=False)
    df_rep_st = _simulated_cluster_df(n, size // 2, [0.6, 0.2, 0.1, 0.1], 'Repetitive + Stem loop', add_stem=True)
    df_st = _simulated_cluster_df(n, size // 2, [0.25, 0.25, 0.25, 0.25], 'Stem loop', add_stem=True)
    # NOTE(review): the original 'Random' branch also appended the reverse
    # complement (making a perfect stem loop of length 2*size) — looks like a
    # copy-paste slip, but the behavior is preserved here; confirm intent.
    df_rand = _simulated_cluster_df(n, size, [0.25, 0.25, 0.25, 0.25], 'Random', add_stem=True)
    # combine all inputs to one df, and return it
    result = pd.concat([df_rep, df_rep_st, df_st, df_rand])
    return result
def process_seqs_for_grep(list_seqs):
    """Return the set of the given sequences together with their reverse complements."""
    reverse_complements = [get_reverse_complement(seq) for seq in list_seqs]
    return set(list_seqs) | set(reverse_complements)
# NOTE(review): incomplete fragment — the enclosing loop/definition (which
# binds contig_name, start_pos, end_pos, allele_name, annotation_info, and
# owns the trailing dangling `else:`) begins outside this chunk, so the
# control-flow context cannot be reconstructed here; kept verbatim.
if dict_contig_H1.get(contig_name): contig_SEQ = dict_contig_H1[contig_name] elif dict_contig_H2.get(contig_name): contig_SEQ = dict_contig_H2[contig_name] else: eprint("Fatal Error! contig name " + contig_name + " not found!") else: contig_SEQ = dict_contig_H1[contig_name] left_flank = max(0, start_pos - len_extend - 1) right_flank = min(len(contig_SEQ), end_pos + len_extend - 1) flanking_SEQ = contig_SEQ[left_flank:right_flank] if len(annotation_info) > 4: # with mismatch allele_name = allele_name + "/novel" if dict_flank_SEQ.get(allele_name): if flanking_SEQ in dict_flank_SEQ[ allele_name] or get_reverse_complement( flanking_SEQ) in dict_flank_SEQ[allele_name]: pass else: dict_flank_SEQ[allele_name].add(flanking_SEQ) else: dict_flank_SEQ[allele_name] = {flanking_SEQ} f_of = open(fo_asm_flanking, 'w') for allele_name, set_allele_SEQ in sorted(dict_flank_SEQ.items()): for idx, allele_SEQ in enumerate(sorted(set_allele_SEQ)): f_of.write('>' + allele_name + '-' + str(idx) + '\n') f_of.write(allele_SEQ.lower() + '\n') f_of.close()
# NOTE(review): incomplete fragment — it starts inside an enclosing loop
# (SEQ, allele_name, person_name, list_novel_SEQ are bound outside this
# chunk) and is truncated mid-expression at the final sorted(...) call, so
# it cannot be reconstructed safely; kept verbatim.
#Find the same name in novel allele reference database for ref_SEQ in list_novel_SEQ: if ref_SEQ in SEQ: novel_allele_name = dict_novel_serial[ref_SEQ] novel_allele_name += '/f' else: novel_allele_name = allele_name[:allele_name.rfind('-')] novel_allele_name += '/f' else: print("WARNING! Incorrect naming in file", person_name) SEQ = SEQ.lower() if dict_database.get(novel_allele_name): dict_SEQ = dict_database[novel_allele_name] if dict_SEQ.get(SEQ): dict_SEQ[SEQ].append(person_name) elif dict_SEQ.get(get_reverse_complement(SEQ)): dict_SEQ[get_reverse_complement(SEQ)].append(person_name) else: # add the SEQ into dict_SEQ dict_SEQ[SEQ] = [person_name] else: dict_database[novel_allele_name] = {SEQ: [person_name]} f_of = open(fo_merged_fasta, 'w') f_or = open(fo_merged_report, 'w') f_or.write( 'allele_name\tnumber_of_found_in_database\tsamples_possessing_the_allele\n' ) for allele_name, dict_SEQ in sorted(dict_database.items()): for idx, (SEQ, list_person) in enumerate( sorted(dict_SEQ.items(), key=lambda pair: len(pair[1]),
def mark_edit_region(fn_sam, fn_output_file, contig_file):
    """Build per-base edit and coverage histograms for the first contig in a SAM file.

    :param fn_sam: SAM file of reads aligned to SPAdes contigs
    :param fn_output_file: unused in this function's body
    :param contig_file: fasta file holding the contig sequences
    :return: (edit_histogram, cov_histogram, list_read_info, dict_reads, contig_SEQ)
    """
    edit_histogram = None
    cov_histogram = None
    #list_read_info: [ (start_pos, end_pos, read_name, even_odd_flag, mis_region) ]
    list_read_info = []
    contig_len = 0
    contig_name = ""
    # dict_reads{}
    # - key: (read_name, pair_number)
    # - values: read_SEQ
    dict_reads = {}
    # even_odd_flag alternates 1/2 to distinguish the two mates of a pair
    even_odd_flag = 1
    with open(fn_sam, 'r') as f_s:
        for line in f_s:
            if line[0] == '@':
                # header, information of the contig
                if line.find('LN:') != -1:
                    # sometimes SPAdes would produce more than 1 contig, but the
                    # short ones are not very useful, so we discard the short
                    # contigs and reads aligned to them (only the first @SQ is kept)
                    if contig_len == 0:
                        contig_len = int(
                            line[line.find('LN:') + 3:-1]) + 1  # the number system start with 1
                        contig_name = line.split(':')[1][:-3]
                        edit_histogram = np.zeros(contig_len)
                        cov_histogram = np.zeros(contig_len)
            else:
                # real alignment information
                fields = line.split()
                # if the read aligns to shorter contigs, pass
                if contig_name != fields[2]:
                    # NOTE(review): read_name/read_SEQ are only assigned further
                    # below, so this branch reuses the values from the previous
                    # loop iteration (NameError on the first line) — looks like
                    # a latent bug; confirm against real inputs.
                    dict_reads[(read_name, even_odd_flag)] = read_SEQ
                    list_read_info.append(
                        (0, 0, read_name, even_odd_flag, [], "", read_SEQ))
                    if even_odd_flag == 1:
                        even_odd_flag = 2
                    else:
                        even_odd_flag = 1
                    continue
                read_name = fields[0]
                read_SEQ = fields[9]
                cigar = fields[5]
                sam_flag = int(fields[1])
                # if the alignment is a supplementary alignment, pass
                # read BWA manual "Supplementary Alignment" for more information
                # NOTE(review): the SAM supplementary bit is 2048 (0x800);
                # `> 1024` also drops duplicate-marked reads — confirm intent.
                if sam_flag > 1024:
                    continue
                # if cigar == '*', means alignment is bad, pass
                if cigar == '*':
                    dict_reads[(read_name, even_odd_flag)] = read_SEQ
                    #list_read_info.append((start_pos, end_pos, read_name, even_odd_flag, mis_region))
                    list_read_info.append(
                        (0, 0, read_name, even_odd_flag, [], "", read_SEQ))
                    if even_odd_flag == 1:
                        even_odd_flag = 2
                    else:
                        even_odd_flag = 1
                    continue
                # NM edit distance and MD mismatch string, assumed at fixed
                # field positions 11/12 — TODO confirm aligner output layout
                edit_dist = int(fields[11].split(':')[2])
                MD_tag = fields[12].split(':')[2]
                start_pos = int(fields[3])
                number, operate = parse_CIGAR(cigar)
                mis_region_MD = parse_MD(MD_tag)
                #if operate[0] == 'S':
                #    mis_region_MD = [ele + number[0] + start_pos - 1 for ele in mis_region_MD]
                #else:
                # shift MD mismatch offsets into contig coordinates
                mis_region_MD = [ele + start_pos - 1 for ele in mis_region_MD]
                mis_region_I = []  # insertion boundary region
                diff_len = 0  # len contribution of D and I
                if 'I' in operate or 'D' in operate:
                    idx_I = start_pos - 1  # index in reference
                    for idx, op in enumerate(operate):
                        if op == 'I':
                            # insertions do not consume reference; mark both
                            # flanking reference positions as edited
                            diff_len -= number[idx]
                            mis_region_I.append(idx_I)
                            mis_region_I.append(idx_I + 1)
                        else:
                            if op == 'S':
                                diff_len -= number[idx]
                            else:
                                # M/D/etc. consume reference bases
                                idx_I += number[idx]
                                if op == 'D':
                                    diff_len += number[idx]
                #print(fields[0])
                #print(mis_region_MD)
                #print(mis_region_I)
                #print(mis_region)
                mis_region = mis_region_MD + mis_region_I
                mis_region.sort()
                edit_histogram[mis_region] += 1
                end_pos = start_pos + len(fields[9]) + diff_len
                cov_histogram[start_pos:end_pos] += 1
                # record the reads information
                if int(sam_flag / 16) % 2 == 1:
                    # reverse-strand alignment: store the read in original orientation
                    dict_reads[(read_name, even_odd_flag)] = get_reverse_complement(
                        read_SEQ.upper())
                else:
                    dict_reads[(read_name, even_odd_flag)] = read_SEQ
                list_read_info.append(
                    (start_pos, end_pos, read_name, even_odd_flag, mis_region,
                     cigar, read_SEQ))
                if even_odd_flag == 1:
                    even_odd_flag = 2
                else:
                    even_odd_flag = 1
    # pull the kept contig's sequence out of the assembly fasta
    contig_SEQ = ""
    with open(contig_file, 'r') as f_c:
        contig_flag = False
        for line in f_c:
            if line[0] == '>':
                tmp_name = line[1:].strip()
                if tmp_name == contig_name:
                    contig_flag = True
                else:
                    contig_flag = False
            elif contig_flag:
                contig_SEQ += line.strip()
    return edit_histogram, cov_histogram, list_read_info, dict_reads, contig_SEQ
def coverage_analysis(
    dict_read_allele_clusters,
    fn_annotation,
    required_min_depth=0,
    required_single_coverage=50,
    required_single_identity=1,
):
    """Call alleles whose full length is covered by supporting reads.

    For each cluster, every allele's per-base coverage is accumulated from
    reads that pass hamming_traverse (forward or reverse complement); an
    allele is a high-confidence call when its minimum coverage exceeds
    required_min_depth.

    :param dict_read_allele_clusters: {cluster_id: (dict_allele, dict_read)}
    :param fn_annotation: file with one annotated allele name per line (dev comparison)
    :param required_min_depth: minimum per-base depth for a call
    :param required_single_coverage: minimum read length / match span
    :param required_single_identity: identity threshold passed to hamming_traverse
    :return: {allele: set(read_names)} of supporting reads
    """
    dict_hc_calls = {}
    dict_sup_reads = {}
    # tmp: for dev
    list_annotated = []
    #f_tmp = open('./NA12878_annotated_all.txt', 'r')
    # NOTE(review): f_tmp is never closed — relies on GC/interpreter exit
    f_tmp = open(fn_annotation, 'r')
    for line in f_tmp:
        list_annotated.append(line.rstrip())
    # tmp
    list_answer = []
    # for each cluster
    #for cluster_id in dict_read_allele_clusters.keys():
    # NOTE(review): dev-only subsampling — visits clusters 55, 105, 155, ...
    # instead of all of them; the commented-out loop above is the full run
    for cluster_id in range(55, len(dict_read_allele_clusters.keys()), 50):
        print("Cluster: " + str(cluster_id))
        eprint("============= Cluster: " + str(cluster_id) + " ==============")
        cluster = dict_read_allele_clusters[str(cluster_id)]
        dict_allele = cluster[0]
        dict_read = cluster[1]
        # for each allele in a cluster
        for allele in dict_allele.keys():
            print(allele)
            seq_allele = dict_allele[allele]
            seq_coverage = np.zeros(len(seq_allele))
            dict_sup_reads[allele] = set()
            # tmp
            if allele in list_annotated:
                list_answer.append(allele)
            # need simplify
            for read in dict_read:
                seq_read = dict_read[read]
                # ignore the reads that are too short
                if len(seq_read) < required_single_coverage:
                    continue
                traverse_result = hamming_traverse(seq_allele, seq_read,
                                                   required_single_coverage,
                                                   required_single_identity)
                if traverse_result[0]:
                    seq_coverage[traverse_result[1]:traverse_result[2]] += 1
                    dict_sup_reads[allele].add(read)
                else:
                    # retry with the reverse complement of the read
                    r_seq_read = get_reverse_complement(seq_read)
                    traverse_result = hamming_traverse(
                        seq_allele, r_seq_read, required_single_coverage,
                        required_single_identity)
                    if traverse_result[0]:
                        seq_coverage[
                            traverse_result[1]:traverse_result[2]] += 1
                        dict_sup_reads[allele].add(read)
            if min(seq_coverage) > required_min_depth:
                if dict_hc_calls.get(allele):
                    print("Warning! evaluate two times")
                    dict_hc_calls[allele] += 1
                else:
                    dict_hc_calls[allele] = min(seq_coverage)
                print("OOO: " + str(min(seq_coverage)) + ' ' +
                      str(sum(seq_coverage) / len(seq_coverage)) + ' ' +
                      str(max(seq_coverage)))
            else:
                print("XXX: " + str(min(seq_coverage)) + ' ' +
                      str(sum(seq_coverage) / len(seq_coverage)) + ' ' +
                      str(max(seq_coverage)))
                print(seq_coverage)
    print(dict_hc_calls)
    print(list_answer)
    # tmp
    print('Num. high-confidence calls')
    print(len(set(dict_hc_calls)))
    print('Num. answer')
    print(len(set(list_answer)))
    print('Num. intersection')
    print(len(set(list_answer).intersection(set(dict_hc_calls))))
    #print ("Support reads of alleles:")
    #print (dict_sup_reads)
    return dict_sup_reads