def getBackwardSequence(self, contig, raw_sequence):
    """Edit 'raw_sequence' given an ambiguity positions file.

    Assumes raw_sequence is the forward direction (5'-3'); it is complemented
    before minus-strand substitutions are applied.

    :param contig: which contig the sequence belongs to (aka header)
    :param raw_sequence: raw nucleotide sequence
    :return: edited nucleotide sequence
    """
    complemented = ReverseComplement().complement(raw_sequence)
    return self._get_substituted_sequence(contig, complemented, "-")
def test_iupac_complement(self):
    """Complementing any expansion of an IUPAC code must land in the expansion of its complement."""
    with captured_output() as (_, _):
        rc_handle = ReverseComplement()
        for iupac_char in IUPAC_BASES:
            expanded_bases = iupac_base_to_bases(iupac_char)
            allowed = iupac_base_to_bases(iupac_complement(iupac_char))
            for base in expanded_bases:
                self.assertTrue(rc_handle.complement(base) in allowed)
def test_reverse_complement(self):
    """Default handle reverse-complements DNA; custom find/replace alphabets also work."""
    with captured_output() as (_, _):
        result = self.base.reverse_complement("ATGC")
        self.assertEqual(result, "GCAT")
        # non-string input is rejected with AttributeError
        with self.assertRaises(AttributeError):
            self.base.reverse_complement(1)
        custom = ReverseComplement(find="a1", replace="c2")
        for query in ("A1", "a1"):
            self.assertEqual(custom.reverse_complement(query), "2C")
def match_events_with_eventalign(events=None, event_detections=None, minus=False, rna=False):
    """Match event index with event detection data to label segments of signal for each kmer

    # RNA is sequenced 3'-5'
    # reversed for fasta/q sequence
    # if mapped to reverse strand
    # reverse reverse complement = complement
    # DNA is sequenced 5'-3'
    # if mapped to reverse strand
    # reverse complement

    :param events: events table reference_index', 'event_index', 'aligned_kmer', 'posterior_probability
    :param event_detections: event detection event table
    :param minus: boolean option to for minus strand mapping
    :param rna: boolean for RNA read
    :return: structured array with raw_start, raw_length, reference_index,
             posterior_probability and kmer fields, one row per event
    """
    assert events is not None, "Must pass signal alignment events"
    assert event_detections is not None, "Must pass event_detections events"

    check_numpy_table(events, req_fields=('position', 'event_index', 'reference_kmer'))
    check_numpy_table(event_detections, req_fields=('start', 'length'))

    label = np.zeros(len(events),
                     dtype=[('raw_start', int), ('raw_length', int), ('reference_index', int),
                            ('posterior_probability', float), ('kmer', 'S6')])
    # raw signal coordinates come from the event-detection rows the events point at
    label['raw_start'] = [event_detections[x]["start"] for x in events["event_index"]]
    label['raw_length'] = [event_detections[x]["length"] for x in events["event_index"]]
    label['reference_index'] = events["position"]

    def convert_to_str(string):
        """Helper function to catch bytes as strings"""
        # isinstance is the idiomatic type check (was `type(string) is str`)
        if isinstance(string, str):
            return string
        return bytes.decode(string)

    flip = ReverseComplement()
    if minus:
        # mapped to reverse strand: RNA only needs the complement
        # (reverse + reverse complement cancel), DNA needs the full reverse complement
        if rna:
            kmers = [flip.complement(convert_to_str(x)) for x in events["reference_kmer"]]
        else:
            kmers = [flip.reverse_complement(convert_to_str(x)) for x in events["reference_kmer"]]
    else:
        # forward strand: RNA kmers are reversed relative to the reference, DNA used as-is
        if rna:
            kmers = [flip.reverse(convert_to_str(x)) for x in events["reference_kmer"]]
        else:
            kmers = events["reference_kmer"]
    label['kmer'] = kmers
    # all labels from eventalign matching are given probability 1
    label['posterior_probability'] = np.ones(len(events))
    return label
def test_reverse_complement(self):
    """The free reverse_complement function must agree with the ReverseComplement class."""
    handle = ReverseComplement(find="ACGTMKRYBVDHNacgtmkrybvdhn",
                               replace="TGCAKMYRVBHDNtgcakmyrvbhdn")
    for _ in range(10):
        length = np.random.randint(0, 1000)
        dna = get_random_string(length, chars=list(set("ACGTMKRYBVDHN")))
        # (reverse, complement) flag pairs and the class-based expectation for each
        cases = [
            ((True, True), handle.reverse_complement(dna)),
            ((False, True), handle.complement(dna)),
            ((True, False), handle.reverse(dna)),
            ((False, False), dna),
        ]
        for (do_reverse, do_complement), expected in cases:
            self.assertEqual(reverse_complement(dna, reverse=do_reverse, complement=do_complement),
                             expected)
def test_rna_reads(self):
    """Run SignalAlignment on two RNA reads and check the embedded kmer labels.

    The first read maps to the forward strand: both path_kmer and reference_kmer
    equal the reversed reference slice (RNA is sequenced 3'-5').  The second read
    maps to the reverse strand: path_kmer is additionally reverse-complemented
    while reference_kmer stays the reversed reference slice.
    """
    with tempfile.TemporaryDirectory() as tempdir:
        template_model = os.path.join(self.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
        args = create_signalAlignment_args(alignment_file=self.rna_bam,
                                           bwa_reference=self.rna_reference,
                                           forward_reference=self.rna_reference,
                                           in_templateHmm=template_model,
                                           path_to_bin=self.path_to_bin,
                                           destination=tempdir,
                                           embed=True,
                                           delete_tmp=False)
        # forward-strand RNA read
        in_rna_file = os.path.join(
            self.test_dir_rna,
            "DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5"
        )
        final_args = merge_dicts([args, dict(in_fast5=in_rna_file)])
        handle = SignalAlignment(**final_args)
        handle.run()
        fh = pysam.FastaFile(self.rna_reference)
        f5fh = Fast5(in_rna_file)
        sa_events = f5fh.get_signalalign_events()
        for i, event in enumerate(sa_events):
            # RNA: the expected 5-mer is the reversed slice of the reference
            kmer = fh.fetch(reference="rna_fake", start=event["reference_index"],
                            end=event["reference_index"] + 5)[::-1]
            self.assertEqual(event["path_kmer"].decode(), kmer)
            self.assertEqual(event["reference_kmer"].decode(), kmer)
        # reverse-strand RNA read
        in_rna_file = os.path.join(
            self.test_dir_rna,
            "DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5"
        )
        final_args = merge_dicts([args, dict(in_fast5=in_rna_file)])
        handle = SignalAlignment(**final_args)
        handle.run()
        rev_c = ReverseComplement()
        f5fh = Fast5(in_rna_file)
        sa_events = f5fh.get_signalalign_events()
        for i, event in enumerate(sa_events):
            kmer = fh.fetch(reference="rna_fake", start=event["reference_index"],
                            end=event["reference_index"] + 5)[::-1]
            # reverse strand: path kmer is the reverse complement of the reversed slice
            rev_kmer = rev_c.reverse_complement(kmer)
            self.assertEqual(event["path_kmer"].decode(), rev_kmer)
            self.assertEqual(event["reference_kmer"].decode(), kmer)
def resegment_reads(fast5_path, params, speedy=False, overwrite=False):
    """Re-segment and create anchor alignment from previously base-called fast5 file

    :param fast5_path: path to fast5 file
    :param params: event detection parameters
    :param speedy: boolean option for speedyStatSplit or minknow
    :param overwrite: overwrite a previous event re-segmented event table
    :return: the open Fast5 handle after events and fastq have been written

    NOTE(review): events/fastq are stored under `name`, which still contains an
    unformatted "{}" placeholder ("ReSegmentBasecall_00{}") — confirm this is the
    intended key (compare the analysis_path parameter of the newer overload).
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(fast5_path)
    name = "ReSegmentBasecall_00{}"
    # create Fast5 object
    f5fh = Fast5(fast5_path, read='r+')
    # gather previous event detection
    old_event_table = f5fh.get_basecall_data()
    # assert check_event_table_time(old_event_table), "Old event is not consistent"
    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']
    # pick event detection algorithm
    signal = f5fh.get_read(raw=True, scale=True)
    if speedy:
        event_table = create_speedy_event_table(signal, sampling_freq, start_time, **params)
        params = merge_dicts([params, {"event_detection": "speedy_stat_split"}])
    else:
        event_table = create_minknow_event_table(signal, sampling_freq, start_time, **params)
        params = merge_dicts([params, {"event_detection": "minknow_event_detect"}])
    # metadata recorded alongside the new event table
    keys = ["nanotensor version", "time_stamp"]
    values = ["0.2.0", TimeStamp().posix_date()]
    attributes = merge_dicts([params, dict(zip(keys, values)), f5fh.raw_attributes])
    if f5fh.is_read_rna():
        # RNA basecall tables are converted before anchoring — see index_to_time
        old_event_table = index_to_time(old_event_table, sampling_freq=sampling_freq,
                                        start_time=start_time)
    # set event table
    new_event_table = create_anchor_kmers(new_events=event_table, old_events=old_event_table)
    f5fh.set_new_event_table(name, new_event_table, attributes, overwrite=overwrite)
    # gather new sequence
    sequence = sequence_from_events(new_event_table)
    if f5fh.is_read_rna():
        # RNA: emit the 3'-5' orientation with U instead of T
        sequence = ReverseComplement().reverse(sequence)
        sequence = sequence.replace("T", "U")
    # placeholder qualities: '!' == phred 0
    quality_scores = '!'*len(sequence)
    fastq = create_fastq_line(read_id+" :", sequence, quality_scores)
    # set fastq
    f5fh.set_fastq(name, fastq)
    return f5fh
def setUpClass(cls):
    """Resolve shared fixture paths once and build a default ReverseComplement handle."""
    super(ReverseComplementTest, cls).setUpClass()
    cls.HOME = '/'.join(os.path.abspath(__file__).split("/")[:-1])
    test_files = os.path.join(cls.HOME, "test_files")
    cls.fasta = os.path.join(test_files, "test.fa")
    cls.fastq = os.path.join(test_files, "test.fastq")
    cls.reference = os.path.join(test_files, "ecoli_k12_mg1655.fa")
    cls.base = ReverseComplement()
def test_instantiation(self):
    """ReverseComplement defaults to ATGC/TACG and rejects invalid find/replace alphabets."""
    with captured_output() as (_, _):
        self.assertEqual(ReverseComplement().find, "ATGC")
        self.assertEqual(ReverseComplement().replace, "TACG")
        with self.assertRaises(AssertionError):
            # NOTE(review): only the first call can execute — assertRaises exits the
            # block as soon as the expected AssertionError fires, so the two calls
            # below it are never reached. Consider one assertRaises per call.
            ReverseComplement(find="asdfe")
            ReverseComplement(replace="asdfe")
            ReverseComplement(find="asdfe", replace="poiuyq")
        with self.assertRaises(AssertionError):
            # presumably rejected because find/replace are not a consistent pairing — confirm
            ReverseComplement(find="aa", replace="at")
def get_kmer_counts_from_reference_given_bed(
        reference, bed_file, k=5, param_filter=FilterBed.return_true, check_base=None):
    """Generate kmer counts covering positions in a bed file

    :param reference: path to reference fasta readable by ReferenceHandler
    :param bed_file: path to methyl bed file parsed by parse_methyl_bed
    :param k: kmer length
    :param param_filter: predicate(chromosome, start, stop, strand, coverage, percentage)
        deciding whether a bed record contributes counts
    :param check_base: if set, assert that the reference base at each kept position
        matches this base (complemented for minus-strand records)
    :return: Counter mapping kmer -> count over all kept records
    """
    ref_handler = ReferenceHandler(reference)
    # hoisted out of the loop: a single handle suffices (was rebuilt per bed record)
    rev_comp_handle = ReverseComplement()
    kmers = Counter()
    counter = 0
    for chromosome, start, stop, _, _, strand, _, _, _, coverage, percentage in parse_methyl_bed(
            bed_file):
        if param_filter(chromosome, start, stop, strand, coverage, percentage):
            # widen the window so every kmer overlapping [start, stop) is included,
            # clamped to the chromosome bounds
            block_start = max(0, start - (k - 1))
            block_end = min(ref_handler.get_chr_sequence_length(chromosome), stop + (k - 1))
            seq = ref_handler.get_sequence(chromosome, block_start, block_end)
            # Check if base in bed file matches the reference sequence
            if check_base is not None:
                base = ref_handler.get_sequence(chromosome, start, stop)
                this_base = rev_comp_handle.complement(check_base) if strand == "-" else check_base
                assert this_base == base, \
                    "Check base is not the same as the one from the reference. " \
                    "{} != {}. {}".format(this_base, base,
                                          [chromosome, start, stop, strand, coverage, percentage])
            kmers += count_all_sequence_kmers(seq, k=k, rev_comp_only=(strand == "-"))
        # Print some updates because this takes a long time
        counter += 1
        if counter % 10000 == 0:
            print(".", end="")
            sys.stdout.flush()
        if counter % 1000000 == 0:
            print(counter)
    return kmers
def test_convert_write_fastq(self):
    """convert_write_fastq must reverse-complement records, map U->T, reverse qualities,
    and reject an output path without a fastq extension."""
    with captured_output() as (_, _):
        with tempfile.TemporaryDirectory() as tempdir:
            path = os.path.join(tempdir, "test.fastq")
            # RNA-style alphabet (AUGC) converted to DNA (ATGC) while reverse-complementing
            ReverseComplement(find="AUGC", replace="ATGC").convert_write_fastq(
                self.fastq, path, complement=True, reverse=True)
            bad_path = os.path.join(tempdir, "test.txt")
            for new_record, record in zip(SeqIO.parse(path, "fastq"),
                                          SeqIO.parse(self.fastq, "fastq")):
                self.assertTrue(new_record.id.endswith("reverse_complement"))
                # no U may survive the AUGC->ATGC mapping
                self.assertTrue(str(new_record.seq).find("U") == -1)
                self.assertEqual(str(new_record.seq),
                                 "AACCTAACGACACCACTATCCCTACACCCTATCCAACTACTATTACTCTATTCTACTTATCACCCTACTACTACCTCATCCT"
                                 "CCTCCCTAAAATTTCGAGTAAGTAAAATCAATTTCGTGTCAAAATTCATTAAGGGCATCCTAATAGAGGTTGGTCGGCGATT"
                                 "TTAATAAGTGTATGTTTCGGACGTTCATAAGTTTAAAGTGTTTGTGTTAACGTTTTCGTCTTTGATTTTGGAAGTATCAGTC"
                                 "ACTCTAATTTTGTTACGAAGTAGTAAGAATTTCATGGACAATTATTTACGACGATTTATGATTCACGATTTTTTTTTTTCGA"
                                 "TCACGACCGTCGACCGACGACCACCGACCGTCGACCACCGAACCTACCATTATTTTTCCTTTTGAAAATATTACTGTTCGTG"
                                 "AGAATATAAGTAAAAAATAGAGTATTGACCTATTGTGTCCCGTCCTAC")
                # quality string must be reversed alongside the sequence
                self.assertEqual(new_record.letter_annotations["phred_quality"],
                                 record.letter_annotations["phred_quality"][::-1])
            os.remove(path)
            with self.assertRaises(AssertionError):
                # NOTE(review): indentation is ambiguous in this copy; if both calls
                # sit in this block, the second never executes because the first
                # raises — confirm against version control.
                self.base.convert_write_fastq(self.fastq, bad_path, complement=True, reverse=True)
                self.base.convert_write_fastq(self.fastq, path, complement=False, reverse=False)
def main():
    """Select a minimal set of reads whose substituted kmers cover the alphabet.

    Reads a BAM plus a positions/substitution file, computes per read the set of
    substituted kmers it covers (in parallel via BasicService2), then greedily
    picks reads until every kmer is covered `threshold` times, writing read lists
    and kmer counts into the output directory.
    """
    args = parse_args()
    assert os.path.isdir(args.output_dir), "{} is not a directory".format(args.output_dir)
    assert os.path.exists(args.bam), "{} does not exist".format(args.bam)
    assert os.path.exists(args.positions_file), "{} does not exist".format(args.positions_file)
    output_dir = args.output_dir
    bam = args.bam
    positions_file = args.positions_file
    reference = args.reference
    alphabet = args.alphabet
    kmer_length = args.kmer_length
    n_processes = args.threads
    # output_dir = "/home/ubuntu/mount/download/FAB39088"
    # bam = "/home/ubuntu/mount/download/FAB39088/fastq/canonical_cpg_FAB39088.2308.sorted.bam"
    # output_dir = "/home/ubuntu/mount/download/FAF01169"
    # bam = "/home/ubuntu/mount/download/FAF01169/Bham/fastq/canonical_cpg_FAF01169.2308.sorted.bam"
    #
    # positions_file = "/home/ubuntu/bisulfite_methylation_analysis/positions/canonical_added_cxx.positions"
    # reference = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # alphabet = "ACGT"
    # kmer_length = 6
    fasta_handle = None
    if reference is not None:
        assert os.path.exists(reference), "{} does not exist".format(reference)
        fasta_handle = ReferenceHandler(reference)
    rc = ReverseComplement()
    positions_data = pd.read_csv(positions_file,
                                 names=["chr", "start", "strand", "find", "replace"], sep="\t")
    km = KmerMap(alphabet, kmer_length)
    counter = 0

    def get_kmer(sequence, pos, start_pos, strand, replace):
        # Cut the window of bases around `pos` from the read's reference slice,
        # substitute the centre base with `replace`, complement on the minus strand.
        try:
            base = sequence[(pos - (kmer_length - 1)) - start_pos:(pos + kmer_length) - start_pos]
            base = base[:(kmer_length - 1)] + replace + base[kmer_length:]
            if strand == "-":
                return rc.complement(base)
            return base
        except Exception as e:
            # NOTE(review): failures are printed and None is returned implicitly
            print(e, sequence, pos, start_pos)

    # def get_ref_base(chromosome, start_pos, strand):
    #     try:
    #         base = fasta_handle.get_sequence(chromosome_name=chromosome, start=start_pos, stop=start_pos + 1)
    #         if strand == "-":
    #             return rc.complement(base)
    #         return base
    #     except Exception as e:
    #         print(e, fasta_handle, chromosome, start_pos, strand)
    #
    # def get_base(sequence, pos, start_pos, reversed):
    #     try:
    #         base = sequence[pos - start_pos]
    #         if reversed:
    #             return rc.complement(base)
    #         return base
    #     except Exception as e:
    #         print(e, sequence, pos, start_pos)

    def get_covered_kmers(positions_data1, read_name1, ref_sequence1, ref_name1, strand1,
                          ref_start1, ref_end1):
        # Positions that fall inside this read's mapped interval on the same strand
        this_positions_data = positions_data1.loc[
            (positions_data1["chr"] == ref_name1)
            & (positions_data1["strand"] == strand1)
            & (positions_data1["start"] >= ref_start1)
            & (positions_data1["start"] <= ref_end1)]
        if this_positions_data.empty:
            return None
        kmer_lists = np.vectorize(get_kmer)(ref_sequence1, this_positions_data['start'],
                                            ref_start1, strand1,
                                            this_positions_data["replace"])
        # All length-k windows of each substituted region, restricted to the alphabet
        kmer_subset_lists1 = merge_lists([[
            kmer[i:i + kmer_length] for i in range(kmer_length)
            if len(kmer[i:i + kmer_length]) == kmer_length
            and set(kmer[i:i + kmer_length]) <= set(alphabet)
        ] for kmer in kmer_lists])
        return read_name1, kmer_subset_lists1

    def meta_get_covered_kmers(positions, all_args1):
        # Worker entry point: run get_covered_kmers over one batch of reads
        data_to_return = []
        for args1 in all_args1:
            data = get_covered_kmers(positions, *args1)
            if data is not None:
                data_to_return.append(data)
        return data_to_return

    all_args = []
    with closing(pysam.AlignmentFile(bam, 'rb' if bam.endswith("bam") else 'r')) as aln:
        for aligned_segment in aln.fetch(until_eof=True):
            try:
                if not aligned_segment.has_tag('MD'):
                    # without an MD tag the reference slice must come from the fasta
                    if fasta_handle is None:
                        raise Exception("Need to specify --reference if MD flag is not set")
                    else:
                        ref_sequence = fasta_handle.get_sequence(
                            chromosome_name=aligned_segment.reference_name,
                            start=aligned_segment.reference_start,
                            stop=aligned_segment.reference_end)
                else:
                    ref_sequence = aligned_segment.get_reference_sequence().upper()
                read_name = aligned_segment.qname.split("_")[0]
                ref_name = aligned_segment.reference_name
                ref_start = aligned_segment.reference_start
                ref_end = aligned_segment.reference_end
                reversed_read = aligned_segment.is_reverse
                if reversed_read:
                    strand = "-"
                else:
                    strand = "+"
                all_args.append([read_name, ref_sequence, ref_name, strand, ref_start, ref_end])
                counter += 1
            except Exception as e:
                print(e, file=sys.stderr)
    print("starting on {} reads".format(len(all_args)))
    # round-robin split of reads over the workers
    list_of_args = [all_args[x::n_processes] for x in range(n_processes)]
    # extra_args = {"positions": positions_data}
    # data = get_covered_kmers(positions_data, *list_of_args[0][0])
    # print(data)
    service = BasicService2(meta_get_covered_kmers, positions_data,
                            service_name="multiprocess_meta_get_covered_kmers")
    total, failure, messages, output = run_service(service.run, list_of_args, {},
                                                   ["all_args1"], n_processes)
    # print(pd.concat(output, ignore_index=True))
    km = KmerMap(alphabet, kmer_length)
    all_data = merge_lists(output)
    print("number of reads: ", len(all_data))
    for read_name, kmer_subset_lists in all_data:
        # print(read_name, kmer_subset_lists)
        r = Read(read_name)
        for kmer in kmer_subset_lists:
            r.add_kmer(kmer)
        km.add_read(r)
    kmer_counts_file_path = os.path.join(output_dir, "all_reads_kmer_counts.txt")
    with open(kmer_counts_file_path, "w") as fh:
        print("\n".join(["\t".join([kmer, str(count)])
                         for kmer, count in km.kmer_counts.items()]), file=fh)
    keep_kmer_map = KmerMap(alphabet, kmer_length)
    print("number of zero covered kmers: ", len(km.get_zero_kmers()))
    curr_threshold = 1
    iteration = 0
    increase_threshold = True
    # Greedy cover: repeatedly take the read containing the rarest uncovered kmer
    # until every kmer reaches the current threshold, then raise the threshold
    # (capped at 10).
    while increase_threshold:
        curr_threshold += 1
        find_kmers = keep_kmer_map.get_threshold_uncovered_kmers(threshold=curr_threshold)
        while len(find_kmers) > 0:
            print(iteration, len(find_kmers))
            next_kmer = km.get_non_zero_min_kmer_in_kmers(find_kmers)
            if next_kmer is None:
                print("No more reads to cover found kmers: threshold {}".format(curr_threshold))
                increase_threshold = True
                if curr_threshold >= 10:
                    increase_threshold = False
                break
            next_read_index, next_read = km.get_read(next_kmer)
            if next_read is None:
                print("Whoops, something is wrong")
                break
            keep_kmer_map.add_read(next_read)
            km.remove_read(next_read_index)
            find_kmers = keep_kmer_map.get_threshold_uncovered_kmers(threshold=curr_threshold)
            iteration += 1
        print("Exited first while")
        if len(find_kmers) == 0:
            print("Found reads covering all kmers at threshold {}".format(curr_threshold))
        # NOTE(review): the indentation of the writes below is ambiguous in this copy;
        # they are kept at the outer-loop level so that "some"-prefixed files can be
        # produced when coverage is incomplete — confirm against version control.
        file_path = os.path.join(
            output_dir,
            "{}_reads_covering_kmers_with_threshold_{}.txt".format(
                "all" if increase_threshold else "some", curr_threshold))
        with open(file_path, "w") as fh:
            print("\n".join([read.read_id for read in keep_kmer_map.reads]), file=fh)
        kmer_counts_file_path = os.path.join(
            output_dir,
            "{}_kmer_counts_with_threshold_{}.txt".format(
                "all" if increase_threshold else "some", curr_threshold))
        with open(kmer_counts_file_path, "w") as fh:
            print("\n".join(["\t".join([kmer, str(count)])
                             for kmer, count in keep_kmer_map.kmer_counts.items()]), file=fh)
from py3helpers.seq_tools import ReferenceHandler, ReverseComplement
from scipy.stats import norm, invgauss, entropy
# needs pyranges "conda install -c bioconda pyranges"
import pyranges as pr

# Script configuration: output location and the transcriptome reference to load.
OUTPUT_DIR = "/home/ubuntu/mount/download/RNA_rel2/reference"
REFERENCE = "/home/ubuntu/mount/download/RNA_rel2/reference/gencode.v27.transcripts.fa"
p_lambda = 50
delta = 6
assert os.path.isdir(OUTPUT_DIR), "{} is not a directory".format(OUTPUT_DIR)
assert os.path.exists(REFERENCE), "{} does not exist".format(REFERENCE)
# reference handler and reverse complement handler
rh = ReferenceHandler(REFERENCE)
rc = ReverseComplement()
transcript_strings = rh.fasta.references
# Preload every transcript's full sequence keyed by transcript name.
transcript_data = {
    transcript: rh.get_sequence(transcript, 0, rh.get_chr_sequence_length(transcript))
    for transcript in rh.fasta.references
}


def get_base(transcript, pos):
    """Return the base at position `pos` of `transcript`.

    :param transcript: transcript name (key of transcript_data)
    :param pos: zero-based position in the transcript sequence
    :raises KeyError: unknown transcript
    :raises IndexError: position outside the sequence
    """
    try:
        return transcript_data[transcript][pos]
    except (KeyError, IndexError):
        # BUG FIX: the original bare `except:` printed the undefined name
        # `chromosome` (NameError) and then fell through to `return base`
        # with `base` unbound. Log the real lookup key and re-raise.
        print(transcript, pos)
        raise
def main():
    """Tally kmer coverage of signalalign variant-call output against a reference.

    For each configured run directory, reads every variant_calls CSV, derives the
    (optionally substituted) kmer window around each called position, and reports
    per-run and total kmer coverage, writing TSV count files to OUTPUT_DIR.
    """
    OUTPUT_DIR = "/home/ubuntu/ecoli_methylation_analysis/kmer_analysis"
    positions_data = False
    keys = ["contig", "reference_index", "strand"]
    # RNA canonical
    # REFERENCE = "/home/ubuntu/mount/download/RNA_rel2/reference/gencode.v27.transcripts.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/OICR_runs/all_runs/", "/home/ubuntu/mount/UBC_runs/all_runs/"]
    # # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/OICR_runs/test/", "/home/ubuntu/mount/OICR_runs/test/"]
    # VARIANT_NAMES = ["/variant_calls/na12878_OICR_RNA_canonical.csv", "/variant_calls/na12878_UBC_RNA_canonical.csv"]
    # ALPHABET = "ATGC"
    # KMER_LENGTH = 5
    # NAMES = ["OICR", "UBC"]
    #
    # # DNA canonical
    # REFERENCE = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/FAB39088_runs/canonical_calling/all_runs/",
    #                      "/home/ubuntu/mount/FAF01169_runs/canonical_calling/all_runs/"]
    # VARIANT_NAMES = ["/variant_calls/variant_calls.csv", "/variant_calls/variant_calls.csv"]
    # ALPHABET = "ATGC"
    # KMER_LENGTH = 6
    # NAMES = ["FAB39088_canonical", "FAF01169_canonical"]
    # DNA mod
    # REFERENCE = "/home/ubuntu/bisulfite_methylation_analysis/ref/GRCh38_full_analysis_set_plus_decoy_hla.fa"
    # VARIANT_HOME_DIRS = ["/home/ubuntu/mount/FAB39088_runs/cpg_calling/all_runs/",
    #                      "/home/ubuntu/mount/FAF01169_runs/cpg_calling/all_runs/"]
    # NAMES = ["FAB39088_methyl", "FAF01169_methyl"]
    # VARIANT_NAMES = ["/variant_calls/variant_calls.csv", "/variant_calls/variant_calls.csv"]
    # ALPHABET = "ATGCM"
    # KMER_LENGTH = 6
    # POSITIONS_FILE = "/home/ubuntu/bisulfite_methylation_analysis/positions/all_mC.positions"
    # positions_data = pd.read_csv(POSITIONS_FILE, names=["contig", "reference_index", "strand", "find", "replace"],
    #                              sep="\t")
    # ECOLI MOD
    REFERENCE = "/home/ubuntu/ecoli_methylation_analysis/reference/ecoli.fa"
    VARIANT_HOME_DIRS = [
        "/home/ubuntu/ecoli_methylation_analysis/signalalign_output/"
    ]
    NAMES = ["variant_calls"]
    VARIANT_NAMES = ["ecoli_dna_baseline_ATCGMQ_sa.model.csv"]
    ALPHABET = "ATGCM"
    KMER_LENGTH = 6
    # POSITIONS_FILE = "/home/ubuntu/ecoli_methylation_analysis/kmer_analysis/all.positions"
    # positions_data = pd.read_csv(POSITIONS_FILE, names=["contig", "reference_index", "strand", "find", "replace"],
    #                              sep="\t")
    if positions_data is not False:
        i2 = positions_data.set_index(keys).index
    assert os.path.exists(REFERENCE), "{} does not exist".format(REFERENCE)
    assert os.path.isdir(OUTPUT_DIR), "{} is not a directory".format(OUTPUT_DIR)
    rh = ReferenceHandler(REFERENCE)
    rc = ReverseComplement()
    kmers = {k: 0 for k in all_string_permutations(ALPHABET, KMER_LENGTH)}
    paths = []
    # collect one list of variant-call CSVs per home dir
    for home_dir, variant_name in zip(VARIANT_HOME_DIRS, VARIANT_NAMES):
        assert os.path.isdir(home_dir), "{} is not a directory".format(home_dir)
        home_dir_paths = os.listdir(home_dir)
        tmp_paths = [
            os.path.join(home_dir, x, variant_name) for x in home_dir_paths
            if os.path.exists(os.path.join(home_dir, x, variant_name))
        ]
        assert len(tmp_paths) > 0, \
            "Check inputs, there are no paths which exist: {}".format(home_dir)
        paths.append(tmp_paths)

    def get_kmer(chromosome, pos, strand):
        # Reference window around `pos`, reverse-complemented on the minus strand,
        # with the centre base optionally substituted from the positions file.
        # NOTE(review): reads `read_pos_data`, a closure variable assigned later
        # inside the per-file loop; and if rh.get_sequence raises, `seq` is unbound
        # at the final `return seq` (UnboundLocalError) — confirm intended.
        try:
            seq = rh.get_sequence(chromosome, (pos - KMER_LENGTH) + 1, pos + KMER_LENGTH)
            if strand == "-":
                seq = rc.reverse_complement(seq)
            if positions_data is not False:
                replace = read_pos_data[(read_pos_data["contig"] == chromosome)
                                        & (read_pos_data["reference_index"] == pos)
                                        & (read_pos_data["strand"] == strand)]
                if not replace.empty:
                    seq = seq[:KMER_LENGTH - 1] + replace.iloc[0]["replace"] + seq[KMER_LENGTH:]
        except Exception as e:
            print(e, chromosome, pos, strand)
        return seq

    # progress-bar characters
    void = '-'
    fill = '#'
    n_spaces = 100
    n_files = 0
    for variant_set, name in zip(paths, NAMES):
        n_paths = len(variant_set)
        count = n_spaces / n_paths
        increaseCount = 0
        print("Starting on {}".format(name))
        local_kmers = {
            k: 0 for k in all_string_permutations(ALPHABET, KMER_LENGTH)
        }
        for variant_path in variant_set:
            print('[' + (fill * int(increaseCount)) + (void * int(n_spaces - increaseCount))
                  + '] ' + str(int(increaseCount)) + '%', end='\r')
            increaseCount += count
            variant_data = pd.read_csv(variant_path)
            if positions_data is not False:
                # restrict position substitutions to rows present in this file
                i1 = variant_data.set_index(keys).index
                read_pos_data = positions_data[i2.isin(i1)]
            # read_id 028a34d4-2a7a-44e7-ab23-305915996ec8
            # contig RDN18-1
            # reference_index 973
            # strand +
            # variants Aa
            # prob1 0.986967
            # prob2 0.013033
            # prob3 NaN
            variant_data['next_base'] = np.vectorize(get_kmer)(variant_data['contig'],
                                                               variant_data['reference_index'],
                                                               variant_data['strand'])
            large_kmers = set(variant_data['next_base'])
            # slide a KMER_LENGTH window over each (2k-1)-wide region
            for l_kmer in large_kmers:
                for i in range(KMER_LENGTH):
                    k = l_kmer[i:KMER_LENGTH + i]
                    if len(k) == KMER_LENGTH:
                        kmers[k] += 1
                        local_kmers[k] += 1
        print('[' + (fill * int(increaseCount)) + (void * int(n_spaces - increaseCount))
              + '] ' + str(int(increaseCount)) + '%', end='\n')
        total_zeros = 0
        for x, y in local_kmers.items():
            if y == 0:
                total_zeros += 1
        n_files += n_paths
        print("{} Kmers Covered: {}/{}".format(name, len(local_kmers) - total_zeros,
                                               len(local_kmers)))
        print("{} Average coverage: {:.4}".format(
            name, np.sum(list(local_kmers.values())) / (len(local_kmers) - total_zeros)))
        with open(os.path.join(OUTPUT_DIR, name + ".tsv"), 'w') as fh:
            print("\n".join(["\t".join([x, str(y / n_paths)])
                             for x, y in local_kmers.items()]), file=fh)
    total_zeros = 0
    for x, y in kmers.items():
        if y == 0:
            total_zeros += 1
    print("TOTAL Kmers Covered: {}/{}".format(len(kmers) - total_zeros, len(kmers)))
    print("TOTAL Average coverage: {}".format(np.average(list(kmers.values())) / (n_files / 2)))
    with open(os.path.join(OUTPUT_DIR, "total_" + "_".join(NAMES) + ".tsv"), 'w') as fh:
        print("\n".join(["\t".join([x, str(y / n_files)]) for x, y in kmers.items()]), file=fh)
def resegment_reads(fast5_path, params=None, speedy=False, overwrite=True,
                    analysis_path="ReSegmentBasecall_000"):
    """Re-segment and create anchor alignment from previously base-called fast5 file

    :param fast5_path: path to fast5 file
    :param params: event detection parameters; algorithm defaults are used when None
    :param speedy: boolean option for speedyStatSplit or minknow
    :param overwrite: overwrite a previous event re-segmented event table
    :param analysis_path: name of key where events table will be placed (Analyses/'name'/Events)
    :return: the open Fast5 handle, or None if the file has no basecall data
    """
    assert os.path.isfile(fast5_path), "File does not exist: {}".format(
        fast5_path)

    # create Fast5 object and sanity check
    f5fh = Fast5(fast5_path, read='r+')
    if not f5fh.has_basecall_data():
        f5fh.close()
        return None

    # gather previous event detection
    old_event_table = f5fh.get_basecall_data()
    read_id = bytes.decode(f5fh.raw_attributes['read_id'])
    sampling_freq = f5fh.sample_rate
    start_time = f5fh.raw_attributes['start_time']

    # get params
    if params is None:
        params = get_default_event_detection_params(
            EVENT_DETECT_SPEEDY if speedy else EVENT_DETECT_MINKNOW)

    # pick event detection algorithm
    signal = f5fh.get_read(raw=True, scale=True)
    if speedy:
        event_table = create_speedy_event_table(signal, sampling_freq, start_time, **params)
        params = merge_dicts([params, {"event_detection": "speedy_stat_split"}])
    else:
        event_table = create_minknow_event_table(signal, sampling_freq, start_time, **params)
        params = merge_dicts([params, {"event_detection": "minknow_event_detect"}])

    # metadata stored alongside the new event table
    keys = ["nanotensor version", "time_stamp"]
    values = ["0.2.0", TimeStamp().posix_date()]
    attributes = merge_dicts([params, dict(zip(keys, values)), f5fh.raw_attributes])

    # do resegmentation
    if f5fh.is_read_rna():
        # RNA basecall tables are converted before anchoring — see index_to_time
        old_event_table = index_to_time(old_event_table, sampling_freq=sampling_freq,
                                        start_time=start_time)
    new_event_table = create_anchor_kmers(new_events=event_table, old_events=old_event_table)

    # get destination in fast5 #todo find latest location? ie: save_event_table_and_fastq(..)
    destination = f5fh._join_path(f5fh.__base_analysis__, analysis_path)
    f5fh.set_event_table(destination, new_event_table, attributes, overwrite=overwrite)

    # gather new sequence
    sequence = sequence_from_events(new_event_table)
    if f5fh.is_read_rna():
        # RNA: emit the 3'-5' orientation with U instead of T
        sequence = ReverseComplement().reverse(sequence)
        sequence = sequence.replace("T", "U")
    # placeholder qualities: '!' == phred 0
    quality_scores = '!' * len(sequence)
    fastq = create_fastq_line(read_id + " :", sequence, quality_scores)

    # set fastq
    f5fh.set_fastq(destination, fastq, overwrite=overwrite)
    return f5fh
def setUpClass(cls):
    """Copy DNA/RNA test reads to a temp dir, run SignalAlignment on each once,
    and expose CreateLabels handles for all test methods."""
    super(CreateLabelsTest, cls).setUpClass()
    cls.HOME = '/'.join(os.path.abspath(__file__).split("/")[:-4])
    cls.fasta = os.path.join(cls.HOME, "tests/test_sequences/E.coli_K12.fasta")
    # forward- and reverse-mapped DNA reads
    dna_file = os.path.join(cls.HOME,
                            "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch112_read108_strand.fast5")
    rev_dna_file = os.path.join(cls.HOME,
                                "tests/minion_test_reads/1D/LomanLabz_PC_20161025_FNFAB42699_MN17633_sequencing_run_20161025_E_coli_native_450bps_82361_ch6_read347_strand.fast5")
    # forward- and reverse-mapped RNA reads (no embedded events)
    rev_rna_file = os.path.join(cls.HOME,
                                "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_61_ch_151_strand.fast5")
    forward_rna_file = os.path.join(cls.HOME,
                                    "tests/minion_test_reads/RNA_no_events/DEAMERNANOPORE_20170922_FAH26525_MN16450_sequencing_run_MA_821_R94_NA12878_mRNA_09_22_17_67136_read_36_ch_218_strand.fast5")
    rna_reference = os.path.join(cls.HOME, "tests/test_sequences/fake_rna_ref.fa")
    ecoli_dna_reference = os.path.join(cls.HOME, "tests/test_sequences/E.coli_K12.fasta")
    cls.dna_reference_handle = pysam.FastaFile(ecoli_dna_reference)
    cls.rna_reference_handle = pysam.FastaFile(rna_reference)
    cls.tmp_directory = tempfile.mkdtemp()
    # get file locations
    cls.tmp_dna_file = os.path.join(str(cls.tmp_directory), 'test_dna.fast5')
    cls.tmp_dna_file2 = os.path.join(str(cls.tmp_directory), 'test_dna2.fast5')
    cls.tmp_rna_file1 = os.path.join(str(cls.tmp_directory), 'test_rna.fast5')
    cls.tmp_rna_file2 = os.path.join(str(cls.tmp_directory), 'test_rna2.fast5')
    # run signalAlign on one file
    cls.rna_model_file = os.path.join(cls.HOME, "models/testModelR9p4_5mer_acgt_RNA.model")
    cls.dna_model_file_94 = os.path.join(cls.HOME,
                                         "models/testModelR9p4_5mer_acegt_template.model")
    cls.rna_sam = os.path.join(cls.HOME, "tests/minion_test_reads/RNA_edge_cases/rna_reads.bam")
    cls.dna_sam = os.path.join(cls.HOME, "tests/minion_test_reads/oneD.bam")
    cls.bin_path = os.path.join(cls.HOME, "bin")
    # kmer index
    cls.kmer_index = 2
    # copy file to tmp directory
    shutil.copy(dna_file, cls.tmp_dna_file)
    shutil.copy(rev_dna_file, cls.tmp_dna_file2)
    shutil.copy(forward_rna_file, cls.tmp_rna_file1)
    shutil.copy(rev_rna_file, cls.tmp_rna_file2)
    # RNA alignment runs
    args = create_signalAlignment_args(destination=cls.tmp_directory,
                                       in_templateHmm=cls.rna_model_file,
                                       alignment_file=cls.rna_sam,
                                       forward_reference=rna_reference,
                                       embed=True,
                                       path_to_bin=cls.bin_path,
                                       diagonal_expansion=5,
                                       delete_tmp=False)
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_rna_file1}]))
    sa_h.run()
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_rna_file2}]))
    sa_h.run()
    # DNA alignment runs
    args = create_signalAlignment_args(destination=cls.tmp_directory,
                                       in_templateHmm=cls.dna_model_file_94,
                                       alignment_file=cls.dna_sam,
                                       forward_reference=ecoli_dna_reference,
                                       embed=True,
                                       path_to_bin=cls.bin_path,
                                       diagonal_expansion=10,
                                       traceBackDiagonals=100,
                                       constraint_trim=3)
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_dna_file}]))
    sa_h.run()
    sa_h = SignalAlignment(**merge_dicts([args, {'in_fast5': cls.tmp_dna_file2}]))
    sa_h.run()
    cls.dna_handle = CreateLabels(cls.tmp_dna_file, kmer_index=cls.kmer_index)
    cls.dna_handle2 = CreateLabels(cls.tmp_dna_file2, kmer_index=cls.kmer_index)
    cls.rna1_handle = CreateLabels(cls.tmp_rna_file1, kmer_index=cls.kmer_index)
    cls.rna2_handle = CreateLabels(cls.tmp_rna_file2, kmer_index=cls.kmer_index)
    cls.rev_comp = ReverseComplement()
    # pre-embedded file used as-is (no alignment run needed)
    cls.tmp_dna_file3 = os.path.join(cls.HOME,
                                     "tests/minion_test_reads/embedded_files/miten_PC_20160820_FNFAD20259_MN17223_sequencing_run_AMS_158_R9_WGA_Ecoli_08_20_16_43623_ch100_read2324_strand.fast5")
    cls.dna3_handle = CreateLabels(cls.tmp_dna_file3, kmer_index=cls.kmer_index)
def create_labels_from_guide_alignment(events, sam_string, rna=False, reference_path=None, kmer_index=2,
                                       one_ref_indexing=False):
    """Create labeled signal from a guide alignment with only matches being reported

    :param events: structured event table with fields
                   'raw_start', 'model_state', 'p_model_state', 'raw_length', 'move'
    :param sam_string: sam alignment string
    :param rna: if read is rna, reverse again
    :param reference_path: if sam_string has MDZ field the reference sequence can be inferred,
                           otherwise it is needed
    :param kmer_index: index of the kmer to select for reference to event mapping
    :param one_ref_indexing: boolean zero or 1 based indexing for reference
    :return: list of structured numpy label arrays, one per contiguous run of
             matched reference positions
    """
    # test if the required fields are in structured numpy array
    check_numpy_table(events, req_fields=('raw_start', 'model_state', 'p_model_state',
                                          'raw_length', 'move'))
    assert type(one_ref_indexing) is bool, "one_ref_indexing must be a boolean"

    psam_h = initialize_pysam_wrapper(sam_string, reference_path=reference_path)
    # create an indexed map of the events and their corresponding bases
    bases, base_raw_starts, base_raw_lengths, probs = index_bases_from_events(events, kmer_index=kmer_index)

    # check if string mapped to reverse strand
    if psam_h.alignment_segment.is_reverse:
        probs = probs[::-1]
        base_raw_starts = base_raw_starts[::-1]
        # BUGFIX: the raw lengths must be flipped together with the starts and
        # probabilities they pair with — all three are indexed by the same
        # query_index below, so leaving lengths un-reversed paired each start
        # with the wrong event's length.
        base_raw_lengths = base_raw_lengths[::-1]
        # The original branched on `rna` here, but both branches performed the
        # identical reversal: RNA reads are sequenced 3'-5' (already "reversed"
        # relative to fasta order) and reverse-mapped DNA bases also only need
        # the positional flip at this point, so one unconditional reverse suffices.
        bases = ReverseComplement().reverse(''.join(bases))

    def _pack_label(starts, lengths, ref_indices, kmers, post_probs):
        """Pack the parallel per-position lists into one structured label array."""
        label = np.zeros(len(starts), dtype=[('raw_start', int), ('raw_length', int),
                                             ('reference_index', int),
                                             ('posterior_probability', float),
                                             ('kmer', 'S5')])
        label['raw_start'] = starts
        label['raw_length'] = lengths
        label['reference_index'] = ref_indices
        label['kmer'] = kmers
        label['posterior_probability'] = post_probs
        return label

    # all 'matches' and 'mismatches'
    matches_map = psam_h.seq_alignment.matches_map
    # zero indexed reference start (adding the bool shifts to 1-based when requested)
    ref_start = psam_h.alignment_segment.reference_start + one_ref_indexing

    # accumulators for the current contiguous block of reference positions
    raw_start = []
    raw_length = []
    reference_index = []
    kmer = []
    posterior_probability = []
    cigar_labels = []

    prev = matches_map[0].reference_index
    for i, alignment in enumerate(matches_map):
        if i == 0 or alignment.reference_index == prev + 1:
            # still contiguous: extend the current block
            raw_start.append(base_raw_starts[alignment.query_index])
            raw_length.append(base_raw_lengths[alignment.query_index])
            reference_index.append(alignment.reference_index + ref_start)
            kmer.append(alignment.reference_base)
            posterior_probability.append(probs[alignment.query_index])
        else:
            # gap in reference positions: close out the block and start a new one
            cigar_labels.append(_pack_label(raw_start, raw_length, reference_index,
                                            kmer, posterior_probability))
            raw_start = [base_raw_starts[alignment.query_index]]
            raw_length = [base_raw_lengths[alignment.query_index]]
            reference_index = [alignment.reference_index + ref_start]
            kmer = [alignment.reference_base]
            posterior_probability = [probs[alignment.query_index]]
        # keep track of reference positions
        prev = alignment.reference_index

    # catch the last label
    cigar_labels.append(_pack_label(raw_start, raw_length, reference_index,
                                    kmer, posterior_probability))
    return cigar_labels
def main():
    """Count which methylated ('M') and canonical ('C') kmers are covered by a
    positions file against a reference, and print coverage summaries.

    Reads a tab-separated positions file (chr, start, strand, find, replace),
    extracts the surrounding kmer-sized window for every modified and canonical
    position, and reports how many of the possible single-M and M-free C kmers
    were observed.
    """
    args = parse_args()
    assert os.path.isdir(args.output_dir), "{} is not a directory".format(args.output_dir)
    assert os.path.exists(args.reference), "{} does not exist".format(args.reference)
    assert os.path.exists(args.positions_file), "{} does not exist".format(args.positions_file)

    positions_data = pd.read_csv(args.positions_file,
                                 names=["chr", "start", "strand", "find", "replace"],
                                 sep="\t")
    positions_data["kmer"] = np.nan

    # reference handler and reverse complement handler
    rh = ReferenceHandler(args.reference)
    rc = ReverseComplement()
    # load every chromosome fully into memory for fast slicing
    chromosome_data = {
        chromosome: rh.get_sequence(chromosome, 0, rh.get_chr_sequence_length(chromosome))
        for chromosome in rh.fasta.references
    }
    alphabet = "ACGMT"
    kmer_length = 6

    def get_kmer(chromosome, pos, strand, replace=None):
        """Return the (2*kmer_length - 1) window centered on pos, optionally
        substituting the center base with `replace`. Returns "" on lookup failure."""
        # BUGFIX: `seq` was previously only bound inside the try block, so a
        # failed lookup printed its diagnostic and then crashed with
        # UnboundLocalError on `return seq`; initialize it so the error path
        # returns an empty string instead.
        seq = ""
        try:
            seq = chromosome_data[chromosome][(pos - kmer_length) + 1:pos + kmer_length]
            if strand == "-":
                seq = rc.reverse_complement(seq)
            if replace is not None:
                # center position of the window is index kmer_length - 1
                seq = seq[:kmer_length - 1] + replace + seq[kmer_length:]
        except Exception as e:
            # best-effort: report the bad position and continue
            print(e, chromosome, pos, strand)
        return seq

    # --- methylated positions ---
    mod_pos_data = positions_data.loc[positions_data['replace'] == "M"].copy()
    mod_pos_data.loc[:, "kmer"] = np.vectorize(get_kmer)(mod_pos_data['chr'],
                                                         mod_pos_data['start'],
                                                         mod_pos_data['strand'],
                                                         "M")
    kmers = {k: 0 for k in all_string_permutations(alphabet, kmer_length)}
    large_kmers = set(mod_pos_data['kmer'])
    # slide a kmer_length window across each extracted sequence and tally hits
    for l_kmer in large_kmers:
        for i in range(kmer_length):
            k = l_kmer[i:kmer_length + i]
            if len(k) == kmer_length:
                kmers[k] += 1
    m_kmers = [x for x, y in kmers.items() if x.count("M") == 1]
    found_m_only_kmers = {x: y for x, y in kmers.items() if y > 0 and x.count("M") == 1}
    print(f"Number of M kmers: {len(m_kmers)}")
    print(f"Number of found M kmers: {len(found_m_only_kmers)}")

    # --- canonical (C) positions ---
    c_pos_data = positions_data.loc[positions_data['replace'] == "C"].copy()
    c_pos_data.loc[:, 'kmer'] = np.vectorize(get_kmer)(c_pos_data['chr'],
                                                       c_pos_data['start'],
                                                       c_pos_data['strand'],
                                                       "C")
    # drop windows containing ambiguity codes outside the working alphabet
    filter_c_pos_data = c_pos_data[~c_pos_data["kmer"].str.contains(
        '|'.join(["N", "W", "Y"]), regex=True)]
    kmers = {k: 0 for k in all_string_permutations(alphabet, kmer_length)}
    large_kmers = set(filter_c_pos_data['kmer'])
    for l_kmer in large_kmers:
        for i in range(kmer_length):
            k = l_kmer[i:kmer_length + i]
            if len(k) == kmer_length:
                kmers[k] += 1
    no_m_kmers = [x for x, y in kmers.items() if x.count("M") == 0 and x.count("C") > 0]
    found_no_m_kmers = {x: y for x, y in kmers.items()
                        if y > 0 and x.count("M") == 0 and x.count("C") > 0}
    print(f"Number of Canonical kmers: {len(no_m_kmers)}")
    print(f"Number of found Canonical kmers: {len(found_no_m_kmers)}")