def __init__(self, full_data, variants, read_name, forward_mapped):
    """Marginalize over all posterior probabilities to give a per-position read probability

    :param full_data: full signalAlign output parsed into a table with columns
        ['contig', 'reference_index', 'reference_kmer', 'read_file', 'strand', 'event_index',
         'event_mean', 'event_noise', 'event_duration', 'aligned_kmer', 'scaled_mean_current',
         'scaled_noise', 'posterior_probability', 'descaled_event_mean', 'ont_model_mean', 'path_kmer']
    :param variants: bases to track probabilities
    :param read_name: name of the read
    :param forward_mapped: True if the read mapped to the forward strand
    """
    self.read_name = read_name
    self.full_data = full_data
    # keep only rows whose reference kmer covers a variant-masked ("X") position
    self.variant_data = self.full_data[["X" in kmer for kmer in self.full_data["reference_kmer"]]]
    self.variants = sorted(variants)
    self.forward_mapped = forward_mapped
    self.columns = merge_lists([['read_name', 'contig', 'position', 'strand', 'forward_mapped'],
                                list(self.variants)])
    self.contig = NanoporeRead.bytes_to_string(self.full_data["contig"][0])
    self.position_probs = pd.DataFrame()
    self.has_data = False
    self.per_read_calls = pd.DataFrame()
    self.per_read_columns = merge_lists([['read_name', 'contig', 'strand', "forward_mapped", "n_sites"],
                                         list(self.variants)])
def get_data(self):
    """Calculate the normalized probability of variant for each nucleotide and across the read"""
    # final location of per position data and per read data
    data = []
    per_read_data = []
    for read_strand in (b"t", b"c"):
        read_strand_specific_data = self.variant_data[self.variant_data["strand"] == read_strand]
        read_strand = read_strand.decode("utf-8")
        if len(read_strand_specific_data) == 0:
            continue
        for forward_mapped in set(self.variant_data["forward_mapped"]):
            mapping_strand = "-"
            if forward_mapped == b"forward":
                mapping_strand = "+"
            strand_specific_data = read_strand_specific_data[
                read_strand_specific_data["forward_mapped"] == forward_mapped]
            if len(strand_specific_data) == 0:
                continue
            # get positions on strand
            positions = set(strand_specific_data["reference_position"])
            n_positions = len(positions)
            strand_read_nuc_data = [0] * len(self.variants)
            # marginalize probabilities for each position
            for pos in positions:
                pos_data = strand_specific_data[strand_specific_data["reference_position"] == pos]
                total_prob = 0
                position_nuc_dict = {x: 0.0 for x in self.variants}
                # get total probability for each nucleotide
                for nuc in set(pos_data["base"]):
                    nuc_data = pos_data[pos_data["base"] == nuc]
                    nuc_prob = sum(nuc_data["posterior_probability"])
                    total_prob += nuc_prob
                    position_nuc_dict[NanoporeRead.bytes_to_string(nuc)] = nuc_prob
                # normalize probabilities over each position
                nuc_data = [0] * len(self.variants)
                for nuc in position_nuc_dict.keys():
                    index = self.variants.index(nuc)
                    nuc_data[index] = position_nuc_dict[nuc] / total_prob
                    strand_read_nuc_data[index] += nuc_data[index]
                data.append(merge_lists([[self.read_name, self.contig, pos, read_strand, mapping_strand],
                                         nuc_data]))
            if n_positions > 0:
                per_read_data.append(merge_lists([[self.read_name, self.contig, read_strand, mapping_strand,
                                                   n_positions],
                                                  [prob / n_positions for prob in strand_read_nuc_data]]))
    self.position_probs = pd.DataFrame(data, columns=self.columns)
    self.per_read_calls = pd.DataFrame(per_read_data, columns=self.per_read_columns)
    self.has_data = True
    return self.position_probs
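
# Hedged illustration (not part of the original module; toy data and helper name are
# assumptions): get_data() sums the posterior probability assigned to each candidate base
# at a reference position and divides by the position's total mass, so per-position variant
# probabilities sum to 1. This sketch mirrors the column names used above.
def _example_normalize_position(variants=("A", "C", "G", "T")):
    """Minimal sketch of the per-position marginalization used in get_data()."""
    import pandas as pd
    pos_data = pd.DataFrame({"base": ["A", "A", "G"],
                             "posterior_probability": [0.6, 0.2, 0.2]})
    position_nuc_dict = {nuc: 0.0 for nuc in variants}
    for nuc in set(pos_data["base"]):
        position_nuc_dict[nuc] = pos_data[pos_data["base"] == nuc]["posterior_probability"].sum()
    total_prob = sum(position_nuc_dict.values())
    # normalized probabilities sum to 1 across the tracked variants: {'A': 0.8, 'G': 0.2, ...}
    return {nuc: prob / total_prob for nuc, prob in position_nuc_dict.items()}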
def multiprocess_filter_reads(in_dir, alignment_file, readdb, trim=False, quality_threshold=7,
                              worker_count=1, debug=False):
    """Multiprocess for filtering reads but don't move the files

    :param in_dir: input directory with subdirectories assumed to have fast5s in them
    :param alignment_file: bam file
    :param readdb: readdb or sequence summary file
    :param trim: option to trim for x number of bases
    :param quality_threshold: quality threshold
    :param worker_count: number of workers to use
    :param debug: boolean option which will only use one process in order to fail if an error arises
    :return: list of best files
    """
    assert alignment_file.endswith("bam"), "Alignment file must be in BAM format: {}".format(alignment_file)
    # grab aligned segment
    if debug:
        best_files = []
        for sub_in_dir in get_all_sub_directories(in_dir):
            best_files.extend(filter_reads(alignment_file, readdb, [sub_in_dir],
                                           quality_threshold=quality_threshold, trim=trim))
    else:
        filter_reads_args = {"readdb": readdb, "alignment_file": alignment_file,
                             "quality_threshold": quality_threshold, "trim": trim}
        total, failure, messages, output = multithread.run_service2(
            filter_read_service2, get_all_sub_directories(in_dir),
            filter_reads_args, ["in_dir"], worker_count)
        best_files = merge_lists(output)
    return best_files
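
# Hedged usage sketch (paths are placeholders, not from the original repo): filter every
# fast5 subdirectory against a sorted BAM, keeping reads at or above the quality threshold.
# best_files = multiprocess_filter_reads(in_dir="/path/to/fast5_runs",
#                                        alignment_file="/path/to/reads.sorted.bam",
#                                        readdb="/path/to/sequencing_summary.txt",
#                                        quality_threshold=7, worker_count=4)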
def match_ref_position_with_raw_start_band(aggregate_reference_position, per_event_data):
    """Match up the reference position from aggregated probability table and the per event data"""
    final_data = []
    for position in aggregate_reference_position["position"]:
        # get the start and length of the total span of events aligned to this position
        pos_data = per_event_data[per_event_data["reference_position"] == position]
        min_raw_start = min(pos_data["raw_start"])
        max_raw_start = max(pos_data["raw_start"])
        last_event_length = pos_data[pos_data["raw_start"] == max_raw_start]["raw_length"].iloc[0]
        total_length = max_raw_start - min_raw_start + last_event_length
        final_data.append(
            merge_lists([aggregate_reference_position[aggregate_reference_position["position"]
                                                      == position].values.tolist()[0],
                         [min_raw_start, total_length]]))
    final_data = pd.DataFrame(final_data,
                              columns=merge_lists([aggregate_reference_position.columns,
                                                   ["raw_start", "raw_length"]]))
    return final_data
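
# Hedged sketch (toy values, hypothetical helper name): shows how the raw-signal span for
# one reference position is derived above, i.e. from the earliest event start to the end of
# the latest event aligned to that position.
def _example_raw_span_for_position():
    import pandas as pd
    pos_data = pd.DataFrame({"raw_start": [100, 150, 130],
                             "raw_length": [20, 25, 10]})
    min_raw_start = pos_data["raw_start"].min()
    max_raw_start = pos_data["raw_start"].max()
    last_event_length = pos_data.loc[pos_data["raw_start"] == max_raw_start, "raw_length"].iloc[0]
    total_length = max_raw_start - min_raw_start + last_event_length  # 150 - 100 + 25 = 75
    return min_raw_start, total_length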
def __init__(self, variant_data, variants, read_name):
    """Marginalize over all posterior probabilities to give a per-position read probability

    :param variant_data: variant data
    :param variants: bases to track probabilities
    :param read_name: name of the read
    """
    self.read_name = read_name
    self.variant_data = variant_data
    self.variants = sorted(variants)
    self.columns = merge_lists([['read_name', 'contig', 'position', 'strand', 'forward_mapped'],
                                list(self.variants)])
    self.contig = NanoporeRead.bytes_to_string(self.variant_data["contig"][0])
    self.position_probs = pd.DataFrame()
    self.has_data = False
    self.per_read_calls = pd.DataFrame()
    self.per_read_columns = merge_lists([['read_name', 'contig', 'strand', "forward_mapped", "n_sites"],
                                         list(self.variants)])
def _normalize_all_data(self, all_data):
    """Helper function to normalize all probability data"""
    for strand in set(all_data["strand"]):
        strand_data = all_data[all_data["strand"] == strand]
        for contig in set(strand_data["contig"]):
            contig_data = strand_data[strand_data["contig"] == contig]
            for mapped_strand in set(contig_data["forward_mapped"]):
                strand_mapped_data = contig_data[contig_data["forward_mapped"] == mapped_strand]
                for position in set(strand_mapped_data["position"]):
                    position_data = strand_mapped_data[strand_mapped_data["position"] == position]
                    sum_total = sum(sum(position_data.loc[:, base]) for base in self.variants)
                    normalized_probs = [np.round(sum(position_data.loc[:, base]) / sum_total, 6)
                                        for base in self.variants]
                    yield merge_lists([[contig, position, strand, mapped_strand], normalized_probs])
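
# Hedged illustration (toy data, hypothetical helper name): _normalize_all_data sums each
# variant column over all reads covering a (contig, strand, mapped_strand, position) group
# and renormalizes so the variant probabilities at that position sum to 1.
def _example_aggregate_position(variants=("A", "C")):
    import numpy as np
    import pandas as pd
    position_data = pd.DataFrame({"A": [0.9, 0.7], "C": [0.1, 0.3]})  # two reads, one position
    sum_total = sum(position_data[base].sum() for base in variants)
    return [np.round(position_data[base].sum() / sum_total, 6) for base in variants]  # [0.8, 0.2]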
def __init__(self, variant_tsv_dir, variants="ATGC", verbose=False):
    """Marginalize over all posterior probabilities to give a per-position read probability

    :param variant_tsv_dir: directory of variantCaller output from signalAlign
    :param variants: bases to track probabilities
    :param verbose: option to print update statements
    """
    self.variant_tsv_dir = variant_tsv_dir
    self.variants = sorted(variants)
    self.columns = merge_lists([['contig', 'position', 'strand', 'forward_mapped'], list(self.variants)])
    self.variant_tsvs = list_dir(self.variant_tsv_dir, ext=".vc.tsv")
    self.aggregate_position_probs = pd.DataFrame()
    self.per_position_data = pd.DataFrame()
    self.verbose = verbose
    self.per_read_data = pd.DataFrame()
    self.has_data = self._aggregate_all_variantcalls()
def __init__(self, sa_full_tsv_dir, variants="ATGC", verbose=False, processes=2):
    """Marginalize over all posterior probabilities to give a per-position read probability

    :param sa_full_tsv_dir: directory of full output from signalAlign
    :param variants: bases to track probabilities
    :param verbose: option to print update statements
    :param processes: number of worker processes to use
    """
    self.sa_full_tsv_dir = sa_full_tsv_dir
    self.variants = sorted(variants)
    self.columns = merge_lists([['contig', 'position', 'strand', 'forward_mapped'], list(self.variants)])
    self.forward_tsvs = list_dir(self.sa_full_tsv_dir, ext=".forward.tsv")
    self.backward_tsvs = list_dir(self.sa_full_tsv_dir, ext=".backward.tsv")
    self.verbose = verbose
    self.worker_count = processes
    self.aggregate_position_probs = pd.DataFrame()
    self.per_position_data = pd.DataFrame()
    self.per_read_data = pd.DataFrame()
    self.has_data = self._multiprocess_aggregate_all_variantcalls()
def get_covered_kmers(positions_data1, read_name1, ref_sequence1, ref_name1, strand1, ref_start1, ref_end1):
    """Collect, for one read, all alphabet-only kmers of length kmer_length that overlap covered positions"""
    this_positions_data = positions_data1.loc[(positions_data1["chr"] == ref_name1)
                                              & (positions_data1["strand"] == strand1)
                                              & (positions_data1["start"] >= ref_start1)
                                              & (positions_data1["start"] <= ref_end1)]
    if this_positions_data.empty:
        return None
    kmer_lists = np.vectorize(get_kmer)(ref_sequence1, this_positions_data['start'], ref_start1, strand1,
                                        this_positions_data["replace"])
    kmer_subset_lists1 = merge_lists([[kmer[i:i + kmer_length] for i in range(kmer_length)
                                       if len(kmer[i:i + kmer_length]) == kmer_length
                                       and set(kmer[i:i + kmer_length]) <= set(alphabet)]
                                      for kmer in kmer_lists])
    return read_name1, kmer_subset_lists1
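
# Hedged sketch (self-contained, toy values; 'M' stands in for a hypothetical modified-base
# character): get_covered_kmers slides every kmer_length window across the
# (2 * kmer_length - 1)-mer centred on a position and keeps only full-length windows whose
# characters are all in the alphabet.
def _example_kmer_windows(kmer="ACGTAMGTACG", kmer_length=6, alphabet="ACGTM"):
    """Return every full-length window over `kmer` that contains only alphabet characters."""
    return [kmer[i:i + kmer_length] for i in range(kmer_length)
            if len(kmer[i:i + kmer_length]) == kmer_length
            and set(kmer[i:i + kmer_length]) <= set(alphabet)]
    # -> ['ACGTAM', 'CGTAMG', 'GTAMGT', 'TAMGTA', 'AMGTAC', 'MGTACG']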
def __init__(self, samples, out_file_path, template=True, complement=False, verbose=True):
    """Control how each kmer/event assignment is processed given a set of samples and the
    parameters associated with each sample

    :param samples: list of SignalAlignSample objects
    :param out_file_path: path to write output to
    :param template: generate kmers for template read strand: default: True
    :param complement: generate kmers for complement read strand: default: False
    :param verbose: option to print update statements
    """
    self.strands = []
    if template:
        self.strands.append('t')
    if complement:
        self.strands.append('c')
    assert self.strands != [], 'template or complement need to be set to True. ' \
                               'complement: {}, template: {}'.format(complement, template)
    for sample in samples:
        assert isinstance(sample, SignalAlignSample)
    self.canonical = "ATGC"
    self.samples = samples
    self.out_file_path = out_file_path
    self.template = template
    self.complement = complement
    self.verbose = verbose
    self.master_assignment_table = \
        make_master_assignment_table(sorted(merge_lists([sample.analysis_files for sample in self.samples])))
    self.k = len(self.master_assignment_table.iloc[0]['kmer'])
    self.n_assignments = len(self.master_assignment_table)
def train_normal_hmm(self, transitions=True, emissions=False):
    """Train model transitions"""
    i = 0
    # start iterating
    while i < self.args.transitions_args.iterations:
        # align all the samples
        self.run_signal_align(get_expectations=True, trim=self.args.transitions_args.training_bases)
        all_sample_files = merge_lists([sample.analysis_files for sample in self.samples])
        assert len(all_sample_files) > 0, \
            "Something failed in multithread signal alignment. We got no sample files"
        # load then normalize the expectations
        template_expectations_files = [x for x in all_sample_files
                                       if x.endswith(".template.expectations.tsv")]
        if len(template_expectations_files) > 0:
            self.template_model.add_and_normalize_expectations(files=template_expectations_files,
                                                               hmm_file=self.template_hmm_model_path,
                                                               update_transitions=transitions,
                                                               update_emissions=emissions)
        if self.two_d:
            complement_expectations_files = [x for x in all_sample_files
                                             if x.endswith(".complement.expectations.tsv")]
            if len(complement_expectations_files) > 0:
                self.complement_model.add_and_normalize_expectations(files=complement_expectations_files,
                                                                     hmm_file=self.complement_model_path,
                                                                     update_transitions=transitions,
                                                                     update_emissions=emissions)
        # log the running likelihood
        if len(self.template_model.running_likelihoods) > 0 and \
                self.two_d and len(self.complement_model.running_likelihoods) > 0:
            print("[trainModels_transitions] {i}| {t_likelihood}\t{c_likelihood}"
                  .format(t_likelihood=self.template_model.running_likelihoods[-1],
                          c_likelihood=self.complement_model.running_likelihoods[-1],
                          i=i))
            if self.args.transitions_args.test and len(self.template_model.running_likelihoods) >= 2 and \
                    self.two_d and len(self.complement_model.running_likelihoods) >= 2:
                assert (self.template_model.running_likelihoods[-2] <
                        self.template_model.running_likelihoods[-1]) and \
                       (self.complement_model.running_likelihoods[-2] <
                        self.complement_model.running_likelihoods[-1]), "Testing: Likelihood error, went up"
        elif len(self.template_model.running_likelihoods) > 0:
            print("[trainModels_transitions] {i}| {t_likelihood}".format(
                t_likelihood=self.template_model.running_likelihoods[-1], i=i))
            if self.args.transitions_args.test and len(self.template_model.running_likelihoods) >= 2:
                assert (self.template_model.running_likelihoods[-2] <
                        self.template_model.running_likelihoods[-1]), "Testing: Likelihood error, went up"
        i += 1
    print("[trainModels_transitions] - finished training transitions routine")
    return self.template_hmm_model_path, self.complement_hmm_model_path
BED_PATHS = "/home/ubuntu/bisulfite_methylation_analysis/bisulfite_data"
REP1_CPG = os.path.join(BED_PATHS, "chr1_ENCFF279HCL.bed")
REP1_CHG = os.path.join(BED_PATHS, "chr1_ENCFF721BJM.bed")
REP1_CHH = os.path.join(BED_PATHS, "chr1_ENCFF448RTC.bed")
REP2_CPG = os.path.join(BED_PATHS, "chr1_ENCFF835NTC.bed")
REP2_CHG = os.path.join(BED_PATHS, "chr1_ENCFF349NNL.bed")
REP2_CHH = os.path.join(BED_PATHS, "chr1_ENCFF038HXQ.bed")

min_coverage = 10
delta = 6
nb_cpu = 10
percents = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
percents = [10, 20, 30, 40, 50, 60, 70, 80, 90]
all_percents = merge_lists([[x - 1, x, x + 1] for x in percents])

assert os.path.isdir(OUTPUT_DIR), "{} is not a directory".format(OUTPUT_DIR)
assert os.path.exists(REFERENCE), "{} does not exist".format(REFERENCE)
assert os.path.exists(REP1_CPG), "{} does not exist".format(REP1_CPG)
assert os.path.exists(REP1_CHG), "{} does not exist".format(REP1_CHG)
assert os.path.exists(REP1_CHH), "{} does not exist".format(REP1_CHH)
assert os.path.exists(REP2_CPG), "{} does not exist".format(REP2_CPG)
assert os.path.exists(REP2_CHG), "{} does not exist".format(REP2_CHG)
assert os.path.exists(REP2_CHH), "{} does not exist".format(REP2_CHH)

# reference handler and reverse complement handler
rh = ReferenceHandler(REFERENCE)
rc = ReverseComplement()
chromosome_strings = rh.fasta.references[:25]
chromosome_data = {
def test_merge_lists(self):
    with captured_output() as (_, _):
        a = [[1, 2, 3], [4, 5, 6]]
        self.assertEqual(merge_lists(a), [1, 2, 3, 4, 5, 6])
def main():
    args = parse_args()
    assert os.path.isdir(args.output_dir), "{} is not a directory".format(args.output_dir)
    assert os.path.exists(args.bam), "{} does not exist".format(args.bam)
    assert os.path.exists(args.positions_file), "{} does not exist".format(args.positions_file)

    output_dir = args.output_dir
    bam = args.bam
    positions_file = args.positions_file
    reference = args.reference
    alphabet = args.alphabet
    kmer_length = args.kmer_length
    n_processes = args.threads
    # output_dir = "/home/ubuntu/mount/download/FAB39088"
    # bam = "/home/ubuntu/mount/download/FAB39088/fastq/canonical_cpg_FAB39088.2308.sorted.bam"
    # output_dir = "/home/ubuntu/mount/download/FAF01169"
    # bam = "/home/ubuntu/mount/download/FAF01169/Bham/fastq/canonical_cpg_FAF01169.2308.sorted.bam"
    #
    # positions_file = "/home/ubuntu/bisulfite_methylation_analysis/positions/canonical_added_cxx.positions"
    # reference = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # alphabet = "ACGT"
    # kmer_length = 6

    fasta_handle = None
    if reference is not None:
        assert os.path.exists(reference), "{} does not exist".format(reference)
        fasta_handle = ReferenceHandler(reference)
    rc = ReverseComplement()
    positions_data = pd.read_csv(positions_file,
                                 names=["chr", "start", "strand", "find", "replace"], sep="\t")
    km = KmerMap(alphabet, kmer_length)
    counter = 0

    def get_kmer(sequence, pos, start_pos, strand, replace):
        try:
            base = sequence[(pos - (kmer_length - 1)) - start_pos:(pos + kmer_length) - start_pos]
            base = base[:(kmer_length - 1)] + replace + base[kmer_length:]
            if strand == "-":
                return rc.complement(base)
            return base
        except Exception as e:
            print(e, sequence, pos, start_pos)

    # def get_ref_base(chromosome, start_pos, strand):
    #     try:
    #         base = fasta_handle.get_sequence(chromosome_name=chromosome, start=start_pos, stop=start_pos + 1)
    #         if strand == "-":
    #             return rc.complement(base)
    #         return base
    #     except Exception as e:
    #         print(e, fasta_handle, chromosome, start_pos, strand)
    #
    # def get_base(sequence, pos, start_pos, reversed):
    #     try:
    #         base = sequence[pos - start_pos]
    #         if reversed:
    #             return rc.complement(base)
    #         return base
    #     except Exception as e:
    #         print(e, sequence, pos, start_pos)

    def get_covered_kmers(positions_data1, read_name1, ref_sequence1, ref_name1, strand1, ref_start1,
                          ref_end1):
        this_positions_data = positions_data1.loc[(positions_data1["chr"] == ref_name1)
                                                  & (positions_data1["strand"] == strand1)
                                                  & (positions_data1["start"] >= ref_start1)
                                                  & (positions_data1["start"] <= ref_end1)]
        if this_positions_data.empty:
            return None
        kmer_lists = np.vectorize(get_kmer)(ref_sequence1, this_positions_data['start'], ref_start1,
                                            strand1, this_positions_data["replace"])
        kmer_subset_lists1 = merge_lists([[kmer[i:i + kmer_length] for i in range(kmer_length)
                                           if len(kmer[i:i + kmer_length]) == kmer_length
                                           and set(kmer[i:i + kmer_length]) <= set(alphabet)]
                                          for kmer in kmer_lists])
        return read_name1, kmer_subset_lists1

    def meta_get_covered_kmers(positions, all_args1):
        data_to_return = []
        for args1 in all_args1:
            data = get_covered_kmers(positions, *args1)
            if data is not None:
                data_to_return.append(data)
        return data_to_return

    all_args = []
    with closing(pysam.AlignmentFile(bam, 'rb' if bam.endswith("bam") else 'r')) as aln:
        for aligned_segment in aln.fetch(until_eof=True):
            try:
                if not aligned_segment.has_tag('MD'):
                    if fasta_handle is None:
                        raise Exception("Need to specify --reference if MD flag is not set")
                    else:
                        ref_sequence = fasta_handle.get_sequence(
                            chromosome_name=aligned_segment.reference_name,
                            start=aligned_segment.reference_start,
                            stop=aligned_segment.reference_end)
                else:
                    ref_sequence = aligned_segment.get_reference_sequence().upper()
                read_name = aligned_segment.qname.split("_")[0]
                ref_name = aligned_segment.reference_name
                ref_start = aligned_segment.reference_start
                ref_end = aligned_segment.reference_end
                reversed_read = aligned_segment.is_reverse
                if reversed_read:
                    strand = "-"
                else:
                    strand = "+"
                all_args.append([read_name, ref_sequence, ref_name, strand, ref_start, ref_end])
                counter += 1
            except Exception as e:
                print(e, file=sys.stderr)

    print("starting on {} reads".format(len(all_args)))
    list_of_args = [all_args[x::n_processes] for x in range(n_processes)]
    # extra_args = {"positions": positions_data}
    # data = get_covered_kmers(positions_data, *list_of_args[0][0])
    # print(data)
    service = BasicService2(meta_get_covered_kmers, positions_data,
                            service_name="multiprocess_meta_get_covered_kmers")
    total, failure, messages, output = run_service(service.run, list_of_args, {}, ["all_args1"],
                                                   n_processes)
    # print(pd.concat(output, ignore_index=True))
    km = KmerMap(alphabet, kmer_length)
    all_data = merge_lists(output)
    print("number of reads: ", len(all_data))
    for read_name, kmer_subset_lists in all_data:
        # print(read_name, kmer_subset_lists)
        r = Read(read_name)
        for kmer in kmer_subset_lists:
            r.add_kmer(kmer)
        km.add_read(r)

    kmer_counts_file_path = os.path.join(output_dir, "all_reads_kmer_counts.txt")
    with open(kmer_counts_file_path, "w") as fh:
        print("\n".join(["\t".join([kmer, str(count)])
                         for kmer, count in km.kmer_counts.items()]), file=fh)

    keep_kmer_map = KmerMap(alphabet, kmer_length)
    print("number of zero covered kmers: ", len(km.get_zero_kmers()))
    curr_threshold = 1
    iteration = 0
    increase_threshold = True
    while increase_threshold:
        curr_threshold += 1
        find_kmers = keep_kmer_map.get_threshold_uncovered_kmers(threshold=curr_threshold)
        while len(find_kmers) > 0:
            print(iteration, len(find_kmers))
            next_kmer = km.get_non_zero_min_kmer_in_kmers(find_kmers)
            if next_kmer is None:
                print("No more reads to cover found kmers: threshold {}".format(curr_threshold))
                increase_threshold = True
                if curr_threshold >= 10:
                    increase_threshold = False
                break
            next_read_index, next_read = km.get_read(next_kmer)
            if next_read is None:
                print("Whoops, something is wrong")
                break
            keep_kmer_map.add_read(next_read)
            km.remove_read(next_read_index)
            find_kmers = keep_kmer_map.get_threshold_uncovered_kmers(threshold=curr_threshold)
            iteration += 1
        print("Exited first while")
        if len(find_kmers) == 0:
            print("Found reads covering all kmers at threshold {}".format(curr_threshold))
        file_path = os.path.join(output_dir,
                                 "{}_reads_covering_kmers_with_threshold_{}.txt".format(
                                     "all" if increase_threshold else "some", curr_threshold))
        with open(file_path, "w") as fh:
            print("\n".join([read.read_id for read in keep_kmer_map.reads]), file=fh)
        kmer_counts_file_path = os.path.join(output_dir,
                                             "{}_kmer_counts_with_threshold_{}.txt".format(
                                                 "all" if increase_threshold else "some", curr_threshold))
        with open(kmer_counts_file_path, "w") as fh:
            print("\n".join(["\t".join([kmer, str(count)])
                             for kmer, count in keep_kmer_map.kmer_counts.items()]), file=fh)
def get_data(self):
    """Calculate the normalized probability of variant for each nucleotide and across the read"""
    # final location of per position data and per read data
    data = []
    per_read_data = []
    if self.forward_mapped:
        mapping_strands = ["+", "-"]
    else:
        mapping_strands = ["-", "+"]
    if len(self.variant_data) > 0:
        kmer_len_1 = len(self.variant_data["reference_kmer"].iloc[0]) - 1
        mapping_index = 0
        for read_strand in ("t", "c"):
            read_strand_specific_data = self.variant_data[self.variant_data["strand"] == read_strand]
            # read_strand = read_strand.decode("utf-8")
            if len(read_strand_specific_data) == 0:
                continue
            # get positions on strand
            positions = sorted(set(read_strand_specific_data["reference_index"]))
            if mapping_strands[mapping_index] == "-":
                positions = positions[::-1]
            strand_read_nuc_data = [0] * len(self.variants)
            # marginalize probabilities for each position
            n_positions = 0
            for pos in positions:
                pos_data = read_strand_specific_data[read_strand_specific_data["reference_index"] == pos]
                if pos_data["aligned_kmer"].iloc[0][kmer_len_1] != "X":
                    continue
                n_positions += 1
                total_prob = 0
                position_nuc_dict = {x: 0.0 for x in self.variants}
                # get total probability for each nucleotide
                for nuc in self.variants:
                    # kmer_len_1 = pos_data["reference_kmer"].iloc[0].find("X")
                    # print(pos_data["reference_kmer"].iloc[0])
                    nuc_data = pos_data[[nuc == kmer[kmer_len_1] for kmer in pos_data["path_kmer"]]]
                    nuc_prob = sum(nuc_data["posterior_probability"])
                    total_prob += nuc_prob
                    position_nuc_dict[NanoporeRead.bytes_to_string(nuc)] = nuc_prob
                # normalize probabilities over each position
                assert total_prob > 0, "Check 'variants' parameter. There seem to be no kmers with those " \
                                       "variant characters"
                nuc_data = [0] * len(self.variants)
                for index, nuc in enumerate(self.variants):
                    nuc_data[index] = position_nuc_dict[nuc] / total_prob
                    strand_read_nuc_data[index] += nuc_data[index]
                data.append(merge_lists([[self.read_name, self.contig, pos, read_strand,
                                          mapping_strands[mapping_index]], nuc_data]))
            if n_positions > 0:
                per_read_data.append(merge_lists([[self.read_name, self.contig, read_strand,
                                                   mapping_strands[mapping_index], n_positions],
                                                  [prob / n_positions for prob in strand_read_nuc_data]]))
            mapping_index += 1
        self.position_probs = pd.DataFrame(data, columns=self.columns)
        self.per_read_calls = pd.DataFrame(per_read_data, columns=self.per_read_columns)
        self.has_data = True
    else:
        self.has_data = False
    return self.position_probs
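
# Hedged sketch (toy frame, hypothetical helper name; 'E' stands in for an arbitrary
# variant character): at an X-masked reference position, the probability of each candidate
# base is the summed posterior probability of event alignments whose path kmer carries that
# base at the kmer_len_1 offset, renormalized over the tracked variants as in get_data().
def _example_marginalize_path_kmers(variants=("C", "E")):
    import pandas as pd
    kmer_len_1 = 2  # last index of a toy 3-mer, mirroring len(reference_kmer) - 1 above
    pos_data = pd.DataFrame({"path_kmer": ["AAC", "AAE", "AAC"],
                             "posterior_probability": [0.5, 0.3, 0.1]})
    probs = {nuc: pos_data[[nuc == kmer[kmer_len_1] for kmer in pos_data["path_kmer"]]]
             ["posterior_probability"].sum() for nuc in variants}
    total = sum(probs.values())
    return {nuc: p / total for nuc, p in probs.items()}  # {'C': 0.6/0.9, 'E': 0.3/0.9}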