def FRR_sam_likelihood(A, sam_path, arg_dict, weights):
    """Sum per-read log-likelihoods over the FRR reads in a SAM file.

    A        -- allele repeat count, or a pair (A[0], A[1]) in diploid mode
    sam_path -- path to a tab-delimited SAM file of FRR reads
    arg_dict -- experiment profile dict; must contain 'locus', may contain 'diploid'
    weights  -- dict with mixture weights 'allele_1' / 'allele_2' (diploid mode)

    Returns the accumulated log-likelihood (float).
    """
    locus = arg_dict['locus']
    # Locus coordinates are parsed for context/validation; the values themselves
    # are not needed in this function.
    chrom, locus_start, locus_end = extract_locus_info(locus)
    log_likelihood = 0
    with open(sam_path, 'r') as irr_handle:
        for record in csv.reader(irr_handle, dialect='excel-tab'):
            if record[0][0] == '@':
                # Skip SAM header lines.
                continue
            # 'om' custom column: per-read distance feature used by the model.
            sample_dfl = extract_col_sam(record, 'om')
            if 'diploid' in arg_dict and arg_dict['diploid'] == 'True':
                # Diploid: weighted mixture of the two allele likelihoods.
                samp_likelihood = weights['allele_1'] * FRR_allele_likelihood(arg_dict, A[0], sample_dfl) + \
                                  weights['allele_2'] * FRR_allele_likelihood(arg_dict, A[1], sample_dfl)
            else:
                samp_likelihood = FRR_allele_likelihood(arg_dict, A, sample_dfl)
            if samp_likelihood > 0:
                samp_log_likelihood = np.log(samp_likelihood)
            else:
                # Floor for zero/non-positive likelihoods so log() never sees them.
                samp_log_likelihood = -50
            log_likelihood = log_likelihood + samp_log_likelihood
    # (Dead read counter and commented-out debug prints removed.)
    return log_likelihood
def span_sam_likelihood(A, sam_path, arg_dict, weights):
    """Sum per-read log-likelihoods over spanning read pairs in a SAM file.

    A        -- allele repeat count, or a pair (A[0], A[1]) in diploid mode
    sam_path -- path to a tab-delimited SAM file of spanning read pairs
    arg_dict -- experiment profile dict; must contain 'locus' and
                'read_ins_mean', may contain 'diploid'
    weights  -- dict with mixture weights 'allele_1' / 'allele_2' (diploid mode)

    Returns the accumulated log-likelihood (float).
    """
    locus = arg_dict['locus']
    # Retained for profile validation; it previously fed only a (removed)
    # commented-out debug statistic.
    mean_ins_size = arg_dict['read_ins_mean']
    chrom, locus_start, locus_end = extract_locus_info(locus)
    log_likelihood = 0
    with open(sam_path, 'r') as irr_handle:
        for record in csv.reader(irr_handle, dialect='excel-tab'):
            # Skip header lines and records with TLEN == 0 (no usable insert size).
            if record[0][0] == '@' or int(record[8]) == 0:
                continue
            # 'is' custom column: observed insert size for this read pair.
            sample_ins = extract_col_sam(record, 'is')
            if 'diploid' in arg_dict and arg_dict['diploid'] == 'True':
                # Diploid: weighted mixture of the two allele likelihoods.
                samp_likelihood = weights['allele_1'] * span_allele_likelihood(arg_dict, A[0], sample_ins) + \
                                  weights['allele_2'] * span_allele_likelihood(arg_dict, A[1], sample_ins)
            else:
                samp_likelihood = span_allele_likelihood(arg_dict, A, sample_ins)
            if samp_likelihood > 0:
                samp_log_likelihood = np.log(samp_likelihood)
            elif samp_likelihood <= 0:
                # Floor for zero/non-positive likelihoods so log() never sees them.
                samp_log_likelihood = -50
            log_likelihood = log_likelihood + samp_log_likelihood
    # (Dead 'yo'/'nn' debug accumulators and commented-out prints removed.)
    return log_likelihood
def encl_sam_likelihood(A, sam_path, arg_dict, weights):
    """Sum per-read log-likelihoods over enclosing reads in a SAM file.

    A        -- allele repeat count, or a pair (A[0], A[1]) in diploid mode
    sam_path -- path to a tab-delimited SAM file of enclosing reads
    arg_dict -- experiment profile dict; must contain 'locus' and
                'read_ins_mean', may contain 'diploid'
    weights  -- dict with mixture weights 'allele_1' / 'allele_2' (diploid mode)

    Returns the accumulated log-likelihood (float).
    """
    locus = arg_dict['locus']
    # Retained for profile validation; the value is otherwise unused here.
    mean_ins_size = arg_dict['read_ins_mean']
    chrom, locus_start, locus_end = extract_locus_info(locus)
    log_likelihood = 0
    with open(sam_path, 'r') as irr_handle:
        for record in csv.reader(irr_handle, dialect='excel-tab'):
            if record[0][0] == '@':
                # Skip SAM header lines.
                continue
            # 'nc' custom column: repeat copy number observed in this read.
            nCopy = extract_col_sam(record, 'nc')
            if 'diploid' in arg_dict and arg_dict['diploid'] == 'True':
                # Diploid: weighted mixture of the two allele likelihoods.
                samp_likelihood = weights['allele_1'] * encl_allele_likelihood(arg_dict, A[0], nCopy) + \
                                  weights['allele_2'] * encl_allele_likelihood(arg_dict, A[1], nCopy)
            else:
                samp_likelihood = encl_allele_likelihood(arg_dict, A, nCopy)
            if samp_likelihood > 0:
                samp_log_likelihood = np.log(samp_likelihood)
            elif samp_likelihood <= 0:
                # Floor for zero/non-positive likelihoods so log() never sees them.
                samp_log_likelihood = -50
            log_likelihood = log_likelihood + samp_log_likelihood
    # (Commented-out debug prints, including a stray '# print yo / nn' copied
    # from span_sam_likelihood, removed.)
    return log_likelihood
def IRR_sam_likelihood(A, B, sam_path, arg_dict):
    """Sum per-read log-likelihoods over IRR reads for genotype (A, B).

    A, B     -- the two allele repeat counts of the candidate genotype
    sam_path -- path to a tab-delimited SAM file of IRR reads
    arg_dict -- experiment profile dict; must contain 'locus'

    Returns the accumulated log-likelihood (float).
    """
    locus = arg_dict['locus']
    chrom, locus_start, locus_end = extract_locus_info(locus)
    # BUG FIX: the accumulator was initialized to 1 (a multiplicative-likelihood
    # seed); a sum of log-likelihoods must start at 0, as in the sibling
    # *_sam_likelihood functions.
    log_likelihood = 0
    with open(sam_path, 'r') as irr_handle:
        for record in csv.reader(irr_handle, dialect='excel-tab'):
            if record[0][0] == '@':
                # Skip SAM header lines.
                continue
            # Distance from the read's mapped position (SAM POS, col 4) to the
            # locus start.
            sample_dfl = locus_start - int(record[3])
            samp_likelihood = IRR_genotype_likelihood(arg_dict, A, B, sample_dfl)
            if samp_likelihood > 0:
                samp_log_likelihood = np.log(samp_likelihood)
            else:
                # Floor for zero/non-positive likelihoods so log() never sees them.
                samp_log_likelihood = -250
            log_likelihood = log_likelihood + samp_log_likelihood
    return log_likelihood
def span_sam_likelihood(A, B, sam_path, arg_dict): locus = arg_dict['locus'] chrom, locus_start, locus_end = extract_locus_info(locus) log_likelihood = 0 with open(sam_path, 'r') as irr_handle: for record in csv.reader(irr_handle, dialect='excel-tab'): if record[0][0] != '@' and int(record[8]) != 0: sample_ins = int(record[8]) samp_likelihood = span_genotype_likelihood( arg_dict, A, B, sample_ins) # print record[0], '\t', int(record[8]), '\t', np.abs(int(record[8]) - 500) / 3 + 10, '\t', samp_likelihood if samp_likelihood > 0: samp_log_likelihood = np.log(samp_likelihood) elif np.abs(samp_likelihood ) < 10**-20: # accounting for comutational errors samp_log_likelihood = np.log(np.abs(samp_likelihood)) elif samp_likelihood == 0: samp_log_likelihood = -250 else: print 'Error! Negative likelihood:', samp_likelihood log_likelihood = log_likelihood + samp_log_likelihood return log_likelihood
parser.add_argument('--in-pref', type=str, required=True) parser.add_argument('--exp-dir', type=str, required=True) args = parser.parse_args() out_pref = args.out_pref in_pref = args.in_pref exp_dir = args.exp_dir arg_dict = load_profile(exp_dir) read_len = arg_dict['read_len'] locus = arg_dict['locus'] motif = arg_dict['motif'] chrom, locus_start, locus_end = extract_locus_info(locus) pre, post = extract_pre_post_flank(exp_dir, read_len) score_dict = { 'match': 3, \ 'mismatch': -1, \ 'gap': -3} verbose = False margin = 2 in_sam = in_pref + '.sam' out_sam = out_pref + '.sam' out_sam_handle = open(out_sam, 'w') print 'Filtering ' + in_pref + '.sam' with open(in_sam, 'r') as in_sam_handle: for record in in_sam_handle: if record[0] == '@':
from realignment import expansion_aware_realign, classify_realigned_read from load_info import load_profile, extract_locus_info from extract_genome import extract_pre_post_flank read_class = 'srp' nCopy = 70 filt_path = '/storage/nmmsv/expansion-experiments/ATXN3_32_cov60_dist500_hap_viz/aligned_read/nc_'+str(nCopy)+'_'+read_class+'.sam' # filt_path = '/storage/nmmsv/python_playground/test_filter_IRR/nc_'+str(nCopy)+'.sam' filt_path_true = '/storage/nmmsv/expansion-experiments/ATXN3_32_cov60_dist500_hap_viz/aligned_read/true_filter/nc_'+str(nCopy)+'_'+read_class+'.sam' sam_path = '/storage/nmmsv/expansion-experiments/ATXN3_32_cov60_dist500_hap_viz/aligned_read/nc_'+str(nCopy)+'.sam' exp_dir = '/storage/nmmsv/expansion-experiments/ATXN3_32_cov60_dist500_hap_viz/' arg_dict = load_profile(exp_dir) locus = arg_dict['locus'] read_len = arg_dict['read_len'] motif = arg_dict['motif'] chrom, locus_start_ref, locus_end_ref = extract_locus_info(locus) pre, post = extract_pre_post_flank(exp_dir, read_len) score_dict = { 'match': 3, \ 'mismatch': -1, \ 'gap': -3} verbose = False margin = 2 print locus_start_ref, locus_end_ref true_reads = [] kk = 0 with open (filt_path, 'r') as handle: for record in handle: if record[0] != '@': kk = kk + 1 QNAME = record.split()[0]