def calculate_kmer_distribution(taxa, bin_width):
    """Return a histogram of distances between consecutive channel-1 labels.

    Results are cached next to the input file, using the input name with a
    trailing ``.cmap`` stripped:

    * ``<base>.<bin_width>.lengths`` -- the binned histogram, one
      ``bin_max<TAB>count`` pair per line.  If it can be read it is returned
      directly.
    * ``<base>.raw_lengths`` -- the un-binned distances, tab separated.  If it
      can be read the .cmap file is not parsed again.

    Args:
        taxa: path of the .cmap file to analyse.
        bin_width: width of each histogram bin; must be positive.

    Returns:
        An ``OrderedDict`` mapping each bin's (exclusive) upper edge to the
        number of distances falling in ``[upper - bin_width, upper)``, in
        ascending edge order.

    Raises:
        ValueError: if ``bin_width`` is not positive.
    """
    from bisect import bisect_right

    if bin_width <= 0:
        # A non-positive width would never let the bin edges reach max_length.
        raise ValueError("bin_width must be positive, got %r" % (bin_width,))

    # Only the trailing extension is swapped; a regex such as '.cmap' would
    # also match e.g. "xcmap" anywhere in the path.
    if taxa.endswith('.cmap'):
        base_name = taxa[:-len('.cmap')]
    else:
        base_name = taxa
    lengths_file = base_name + '.' + str(bin_width) + '.lengths'
    raw_lengths_file = base_name + '.raw_lengths'

    # 1. Fast path: a previously written histogram.
    try:
        with open(lengths_file) as i_file:
            lengths = OrderedDict()
            for line in i_file:
                line_data = line.split("\t")
                lengths[float(line_data[0])] = int(line_data[1])
            return lengths
    except (IOError, OSError, ValueError, IndexError):
        # Missing or malformed cache: fall through and rebuild it.
        pass

    # 2. Previously written raw distances (the last line of the file wins).
    raw_lengths = None
    try:
        with open(raw_lengths_file) as i_file:
            raw_lengths = []
            for line in i_file:
                raw_lengths = [float(x) for x in line.strip().split("\t")]
    except (IOError, OSError, ValueError):
        # Discard anything partially read so it cannot mix with parsed data.
        raw_lengths = None

    # 3. Slow path: derive the distances from the .cmap file itself.
    if raw_lengths is None:
        raw_lengths = []
        current_contig_id = None
        previous_position = 0.0
        for label in CmapFile(taxa).parse():
            if label.channel != "1":
                continue
            if label.contig_id != current_contig_id:
                # Distances never span two contigs.
                current_contig_id = label.contig_id
                previous_position = 0.0
            raw_lengths.append(label.position - previous_position)
            # Advance so the next distance is measured from this label.
            previous_position = label.position
        with open(raw_lengths_file, 'w') as o_file:
            o_file.write("".join(str(raw_length) + "\t"
                                 for raw_length in raw_lengths))

    max_length = max(raw_lengths) if raw_lengths else -1

    # Create edges until one lies strictly above the longest distance, so a
    # distance equal to a multiple of bin_width still has a bin to land in.
    lengths = OrderedDict()
    bin_max = 0
    while bin_max <= max_length:
        bin_max += bin_width
        lengths[bin_max] = 0

    edges = list(lengths)
    for raw_length in raw_lengths:
        # Index of the first edge strictly greater than raw_length.
        index = bisect_right(edges, raw_length)
        if index < len(edges):
            lengths[edges[index]] += 1

    with open(lengths_file, 'w') as o_file:
        for bin_max in lengths:
            o_file.write(str(bin_max) + "\t" + str(lengths[bin_max]) + "\n")
    return lengths
def __init__(self, xmap_file_name):
    """Open an .xmap alignment file together with its two .cmap files.

    The directory part of ``xmap_file_name`` becomes ``self.workspace``
    ("." when the name has no directory part).  The anchor and query maps
    are looked up in that directory under the .xmap file's name with
    ".xmap" replaced by "_r.cmap" and "_q.cmap" respectively.

    Args:
        xmap_file_name: path of the .xmap alignment file.
    """
    import os

    # os.path.split keeps "/" as the directory of a root-level file, where a
    # manual split on "/" would yield an empty workspace.
    workspace, file_name = os.path.split(xmap_file_name)
    self.workspace = workspace or "."
    # NOTE(review): CD is presumably a change-directory context manager, so
    # the file objects below are created with names relative to the workspace.
    with CD(self.workspace):
        self.xmap = XmapFile(file_name)
        self.anchor_cmap = CmapFile(file_name.replace(".xmap", "_r.cmap"))
        self.query_cmap = CmapFile(file_name.replace(".xmap", "_q.cmap"))
    # Matches one "(<digits>,<digits>)" pair of an alignment string; a raw
    # string avoids invalid-escape warnings for "\(" and "\d".
    self.ALIGNED_LABELS = re.compile(r"\(([\d]+),([\d]+)\)")
class tCmapFile(tFile_base):
    """Unit tests for CmapFile, run against an empty input file."""

    def setUp(self):
        # Create (or truncate) the input file and build the object under test
        # while the handle is still open.
        input_path = self.input_file
        with open(input_path, "w"):
            self.obj = CmapFile(input_path)

    def test_getExtension(self):
        extension = CmapFile.getExtension()
        self.assertEqual("cmap", extension)

    def test_parse(self):
        # parse() is expected to compare equal to an iterator over the file.
        expected = CmapFile_iter(self.input_file)
        actual = self.obj.parse()
        self.assertEqual(expected, actual)

    def test_write(self):
        # One label is written as a single tab-separated, newline-terminated
        # row with the columns in this order.
        column_order = [
            "contig_id", "contig_len", "contig_site_count", "label_id",
            "channel", "position", "stdev", "coverage", "occurrences",
            "snr_mean", "snr_stdev", "snr_count",
        ]
        label = Mock(
            contig_id=1,
            contig_len=1.0,
            contig_site_count=1,
            label_id=1,
            channel="1",
            position=1.0,
            stdev=1.0,
            coverage=1.0,
            occurrences=1,
            snr_mean=1.0,
            snr_stdev=1.0,
            snr_count=1.0,
        )
        row = [str(getattr(label, column)) for column in column_order]
        expected = "\t".join(row) + "\n"

        o_file = StringIO()
        self.obj.write(label, o_file)
        self.assertEqual(expected, o_file.getvalue())
def createQualityObject(self):
    """Build ``self.quality`` from the step's output and save it.

    Contig statistics (count, total/average/min/max length, N50 and average
    label occurrences) are gathered from every ``*.cmap`` file in the step
    directory.  The number of aligned molecules is taken from the last line
    of the step's output file that starts with "C": the text after the final
    "=" of its final comma-separated field.

    Raises:
        Exception: if the step is not complete, if the .cmap files contain no
            contigs, or if the output file has no line starting with "C".
    """
    if not self.isComplete():
        raise Exception("The step is not complete yet")

    count = 0
    total_length = 0.0
    lengths = []
    label_occurrences = 0
    label_count = 0
    step_dir = self.getStepDir()
    # This glob relies on there not being a merged .cmap in the same directory (i.e. Summarize has not been run)
    for cmap_name in glob(step_dir + "/*.cmap"):
        # Contig ids are only de-duplicated within one file.
        contigs = set()
        for label in CmapFile(cmap_name).parse():
            if label.contig_id not in contigs:
                contigs.add(label.contig_id)
                count += 1
                total_length += label.contig_len
                lengths.append(label.contig_len)
            label_occurrences += label.occurrences
            label_count += 1

    if not lengths:
        # Fail with a clear message instead of an IndexError or
        # ZeroDivisionError further down.
        raise Exception("No contigs were found in the .cmap files of " + step_dir)

    sorted_lengths = sorted(lengths, reverse=True)
    maxlen = sorted_lengths[0]
    minlen = sorted_lengths[-1]

    # N50: the length of the contig at which the running total of the
    # longest-first lengths first reaches half of the overall length.
    n50 = 0
    length_included_in_n50 = 0
    target_length_included = total_length / 2.0
    for length in sorted_lengths:
        length_included_in_n50 += length
        if length_included_in_n50 >= target_length_included:
            n50 = length
            break

    nummaps = None
    with open(self.getOutputFile()) as contig_file:
        for line in contig_file:
            if not line.startswith("C"):
                continue
            # Strip so the stored value does not carry the line's newline.
            nummaps = line.split(",")[-1].split("=")[-1].strip()
    if nummaps is None:
        raise Exception("No line starting with 'C' was found in " + self.getOutputFile())

    self.quality = Quality(
        length=total_length,
        count=count,
        average_length=total_length / count,
        n50=n50,
        min=minlen,
        max=maxlen,
        average_occurrences=float(label_occurrences) / label_count,
        total_mols_aligned=nummaps,
        avg_mols_aligned=float(nummaps) / count,
    )
    self.saveQualityObjectToFile()
def test_getExtension(self):
    # CmapFile.getExtension() must return the bare extension, without a dot.
    extension = CmapFile.getExtension()
    self.assertEqual("cmap", extension)
def setUp(self):
    # Create (or truncate) the input file, then build the object under test
    # while the handle is still open; the handle is closed on leaving the
    # with-block.
    input_path = self.input_file
    with open(input_path, "w"):
        self.obj = CmapFile(input_path)
def setUp(self):
    # Create (or truncate) the alignment fixture and its writer.
    xmap_path = self.xmap_name
    self.xmap = open(xmap_path, 'w')
    self.xmap_writer = XmapFile(xmap_path)

    # Create (or truncate) both contig-map fixtures; the handles stay open so
    # that the tests can write to them.
    query_path = self.query_cmap_name
    self.query_cmap = open(query_path, 'w')
    self.anchor_cmap = open(self.anchor_cmap_name, 'w')

    # A single writer, built from the query map's name, serves both maps.
    self.cmap_writer = CmapFile(query_path)
class tFindBreakpoints(tScripts):
    """End-to-end tests that run ../scripts/find_breakpoints.py as a subprocess.

    Every test starts with three empty fixture files in the current directory
    (one .xmap and two .cmap files), fills them as needed, runs the script and
    compares its combined stdout/stderr and return code with the expectation.
    """

    maxDiff=None
    xmap_name='file.xmap'
    xmap_writer=None
    xmap=None
    # NOTE(review): the query fixture carries the "_r" suffix and the anchor
    # fixture the "_q" suffix -- confirm this matches the naming the script
    # expects.
    query_cmap_name='file_r.cmap'
    query_cmap=None
    anchor_cmap_name='file_q.cmap'
    anchor_cmap=None
    cmap_writer=None

    def setUp(self):
        self.xmap=open(self.xmap_name, 'w')
        self.xmap_writer=XmapFile(self.xmap_name)
        self.query_cmap=open(self.query_cmap_name, 'w')
        self.anchor_cmap=open(self.anchor_cmap_name, 'w')
        self.cmap_writer=CmapFile(self.query_cmap_name)

    def tearDown(self):
        # Closing an already closed handle is harmless, so tests are free to
        # close the fixtures themselves before running the script.
        self.xmap.close()
        self.query_cmap.close()
        self.anchor_cmap.close()
        os.remove(self.xmap_name)
        os.remove(self.query_cmap_name)
        os.remove(self.anchor_cmap_name)

    ### Helpers ###
    def _writeLabels(self, stop, *cmap_files):
        # Write labels 1 .. stop-1 of contig 1 to each given handle, then
        # close the handles so the script sees the flushed content.
        for label_position in range(1, stop):
            label=Mock(contig_id=1, contig_len=1000, contig_site_count=1000, label_id=label_position, channel="1", position=label_position, stdev=0, coverage=1, occurrences=1)
            for cmap_file in cmap_files:
                self.cmap_writer.write(label, cmap_file)
        for cmap_file in cmap_files:
            cmap_file.close()

    def _writeAlignment(self, query_end, anchor_end):
        # Write one '+' alignment of query 1 to anchor 1 and close the .xmap.
        alignment=Mock(alignment_id=1, query_id=1, anchor_id=1, query_start=2, query_end=query_end, anchor_start=1, anchor_end=anchor_end, orientation='+', confidence=250,hit_enum="*", query_len=1000, anchor_len=1000, label_channel="1", alignment="*")
        self.xmap_writer.write(alignment, self.xmap)
        self.xmap.close()

    def _runFindBreakpoints(self, *arguments):
        # Run the script and return [combined stdout/stderr, return code].
        command=["../scripts/find_breakpoints.py"] + list(arguments)
        try:
            output=subprocess.check_output(command, stderr=subprocess.STDOUT)
            return [output, 0]
        except subprocess.CalledProcessError as error:
            return [error.output, error.returncode]

    ### Bad input ###
    def test_badInput_none(self):
        expected_stdout="usage: find_breakpoints.py [-h] xmap_file\nfind_breakpoints.py: error: too few arguments\n"
        expecteds=[expected_stdout, 2]
        actuals=self._runFindBreakpoints()
        self.assertEqual(expecteds, actuals)

    def test_badInput_notAFile(self):
        expected_stdout="The .xmap alignment file you specified, file.xmap.bed.ext, could not be found\n"
        expecteds=[expected_stdout, 1]
        actuals=self._runFindBreakpoints(self.xmap_name+".bed.ext")
        self.assertEqual(expecteds, actuals)

    @unittest.skip('not_implemented')
    def test_badInput_notXmap(self):
        pass

    def test_badInput_contigInXmapNotInAnchorCmap(self):
        # Only the query map gets labels; the anchor map stays empty.
        self._writeLabels(1000, self.query_cmap)
        self._writeAlignment(query_end=998, anchor_end=1000)
        expected_stdout="The .xmap alignment ("+self.xmap_name+") file contains a contig id (1) that was not found in one of the .cmap contig files ("+self.anchor_cmap_name+")\n"
        expecteds=[expected_stdout, 1]
        actuals=self._runFindBreakpoints(self.xmap_name)
        self.assertEqual(expecteds, actuals)

    def test_badInput_contigInXmapNotInQueryCmap(self):
        # Only the anchor map gets labels; the query map stays empty.
        self._writeLabels(1000, self.anchor_cmap)
        self._writeAlignment(query_end=998, anchor_end=1000)
        expected_stdout="The .xmap alignment ("+self.xmap_name+") file contains a contig id (1) that was not found in one of the .cmap contig files ("+self.query_cmap_name+")\n"
        expecteds=[expected_stdout, 1]
        actuals=self._runFindBreakpoints(self.xmap_name)
        self.assertEqual(expecteds, actuals)

    def test_badInput_cantFindCmap(self):
        os.remove(self.query_cmap_name)
        os.remove(self.anchor_cmap_name)
        expected_stdout="Some of the .cmap contig files associated with your .xmap alignment file could not be found. More specifically, Unable to find query, anchor maps.\n"
        expecteds=[expected_stdout, 1]
        try:
            actuals=self._runFindBreakpoints(self.xmap_name)
        finally:
            # tearDown removes both files, so they must exist again whatever
            # happened while running the script.
            with open(self.query_cmap_name, 'w'):
                pass
            with open(self.anchor_cmap_name, 'w'):
                pass
        self.assertEqual(expecteds, actuals)

    ### One alignment only ###
    def test_oneAlignment_no5PercentOverhang(self):
        self._writeLabels(1000, self.query_cmap, self.anchor_cmap)
        self._writeAlignment(query_end=998, anchor_end=1000)
        expecteds=["", 0]
        actuals=self._runFindBreakpoints(self.xmap_name)
        self.assertEqual(expecteds, actuals)

    def test_oneAlignment_noLabeledOverhang(self):
        self._writeLabels(500, self.query_cmap, self.anchor_cmap)
        self._writeAlignment(query_end=500, anchor_end=500)
        expecteds=["", 0]
        actuals=self._runFindBreakpoints(self.xmap_name)
        self.assertEqual(expecteds, actuals)

    def test_oneAlignment_oneSingleLabelOverhang(self):
        self._writeLabels(502, self.query_cmap, self.anchor_cmap)
        self._writeAlignment(query_end=500, anchor_end=500)
        expecteds=["Chr01\t500\t501\t1\t250.0\t+\n", 0]
        actuals=self._runFindBreakpoints(self.xmap_name)
        self.assertEqual(expecteds, actuals)

    def test_oneAlignment_oneLabeledOverhang(self):
        self._writeLabels(1000, self.query_cmap, self.anchor_cmap)
        self._writeAlignment(query_end=500, anchor_end=500)
        expecteds=["Chr01\t500\t502\t1\t250.0\t+\n", 0]
        actuals=self._runFindBreakpoints(self.xmap_name)
        self.assertEqual(expecteds, actuals)

    ### Multiple alignments ###
    @unittest.skip('not implmemented')
    def test_multipleAligns_noTwoLabelOverhang(self):
        pass
class AssessReferenceAlignment(object):
    """Classify the labels of an .xmap alignment against its anchor map.

    The alignment strings of the .xmap file are scanned for
    "(anchor_label,query_label)" pairs.  From those pairs the class derives
    the anchor positions of aligned labels (true positives), of anchor labels
    skipped between two aligned pairs (false negatives) and of query labels
    skipped between two aligned pairs (false positives).
    """

    def __init__(self, xmap_file_name):
        """Open an .xmap alignment file together with its two .cmap files.

        The directory part of ``xmap_file_name`` becomes ``self.workspace``
        ("." when the name has no directory part).  The anchor and query maps
        are looked up in that directory under the .xmap file's name with
        ".xmap" replaced by "_r.cmap" and "_q.cmap" respectively.
        """
        import os

        # os.path.split keeps "/" as the directory of a root-level file, where
        # a manual split on "/" would yield an empty workspace.
        workspace, file_name = os.path.split(xmap_file_name)
        self.workspace = workspace or "."
        # NOTE(review): CD is presumably a change-directory context manager,
        # so the file objects are created with workspace-relative names.
        with CD(self.workspace):
            self.xmap = XmapFile(file_name)
            self.anchor_cmap = CmapFile(file_name.replace(".xmap", "_r.cmap"))
            self.query_cmap = CmapFile(file_name.replace(".xmap", "_q.cmap"))
        # Raw string: "\(" and "\d" are invalid escapes in a normal literal.
        self.ALIGNED_LABELS = re.compile(r"\(([\d]+),([\d]+)\)")

    def _collectAnchorPositions(self, labels_by_contig, locations_by_contig):
        """Append the position of every listed anchor label to its contig's list."""
        for label in self.anchor_cmap.parse():
            wanted = labels_by_contig.get(label.contig_id)
            if wanted is None:
                continue
            if label.label_id in wanted:
                locations_by_contig[label.contig_id].append(label.position)

    def extractTruePositives(self):
        """Return {anchor id: [positions]} of anchor labels that were aligned."""
        self.true_positive_labels = {}
        self.true_positive_locations = {}
        for alignment in self.xmap.parse():
            anchor = alignment.anchor_id
            aligned = self.true_positive_labels.setdefault(anchor, set())
            self.true_positive_locations.setdefault(anchor, [])
            for label_pair in self.ALIGNED_LABELS.finditer(alignment.alignment):
                aligned.add(int(label_pair.group(1)))
        self._collectAnchorPositions(self.true_positive_labels,
                                     self.true_positive_locations)
        return self.true_positive_locations

    def extractFalseNegatives(self):
        """Return {anchor id: [positions]} of anchor labels skipped by alignments.

        False negative labels are present in the anchor, not in the query:
        every anchor label id lying strictly between two consecutive aligned
        pairs of one alignment.
        """
        self.false_negative_labels = {}
        self.false_negative_locations = {}
        for alignment in self.xmap.parse():
            anchor = alignment.anchor_id
            skipped = self.false_negative_labels.setdefault(anchor, set())
            self.false_negative_locations.setdefault(anchor, [])
            previous_label = None
            for label_pair in self.ALIGNED_LABELS.finditer(alignment.alignment):
                anchor_label = int(label_pair.group(1))
                if previous_label is not None:
                    skipped.update(range(previous_label + 1, anchor_label))
                previous_label = anchor_label
        self._collectAnchorPositions(self.false_negative_labels,
                                     self.false_negative_locations)
        return self.false_negative_locations

    def extractFalsePositives(self):
        """Return {anchor id: [positions]} estimated for unaligned query labels.

        A query label lying strictly between two consecutive aligned pairs is
        recorded together with the preceding pair.  Its distance to the last
        query label that was not recorded this way is then added to the
        position of that pair's anchor label.
        """
        # Pass 1: query labels skipped between consecutive aligned pairs.
        self.false_positive_labels = {}
        for alignment in self.xmap.parse():
            anchor = alignment.anchor_id
            skipped = self.false_positive_labels.setdefault(alignment.query_id, {})
            previous_label_pair = None
            for label_pair in self.ALIGNED_LABELS.finditer(alignment.alignment):
                if previous_label_pair is not None:
                    previous_anchor_label = int(previous_label_pair.group(1))
                    previous_query_label = int(previous_label_pair.group(2))
                    query_label = int(label_pair.group(2))
                    if alignment.orientation == "+":
                        start, stop = previous_query_label + 1, query_label
                    else:
                        # Query label ids run downwards in this case.
                        start, stop = query_label + 1, previous_query_label
                    for label_id in range(start, stop):
                        skipped[label_id] = {
                            "anchor_id": anchor,
                            "anchor_last_true_positive": previous_anchor_label,
                            "query_last_true_positive": previous_query_label,
                        }
                previous_label_pair = label_pair

        # Pass 2: offset of each skipped query label from the last label that
        # was not skipped, grouped by anchor contig and anchor label.
        false_positive_offsets = {}
        last_true_positive = None
        for label in self.query_cmap.parse():
            skipped = self.false_positive_labels.get(label.contig_id)
            if skipped is None or label.label_id not in skipped:
                last_true_positive = label
                continue
            false_positive = skipped[label.label_id]
            by_label = false_positive_offsets.setdefault(false_positive["anchor_id"], {})
            offsets = by_label.setdefault(false_positive["anchor_last_true_positive"], [])
            offsets.append(label.position - last_true_positive.position)

        # Pass 3: project the offsets onto the anchor coordinates.
        self.false_positive_locations = {}
        for label in self.anchor_cmap.parse():
            by_label = false_positive_offsets.get(label.contig_id)
            if by_label is None or label.label_id not in by_label:
                continue
            locations = self.false_positive_locations.setdefault(label.contig_id, [])
            for offset in by_label[label.label_id]:
                locations.append(label.position + offset)
        return self.false_positive_locations

    def extractPartialMatches(self, output_name='partial_matches.xmap'):
        """Collect alignments covering less than 90% of their query.

        Each such alignment is written to ``output_name`` and its
        ``(anchor_start, anchor_end)`` tuple is recorded under its anchor id.

        Returns:
            {anchor id: [(anchor_start, anchor_end), ...]}
        """
        self.partial_match_locations = {}
        with open(output_name, 'w') as o_file:
            for align in self.xmap.parse():
                proportion = abs(align.query_start - align.query_end) / float(align.query_len)
                if proportion >= 0.9:
                    continue
                anchor = align.anchor_id
                spans = self.partial_match_locations.setdefault(anchor, [])
                # list.append takes one argument, so store the span as a tuple.
                spans.append((align.anchor_start, align.anchor_end))
                self.xmap.write(align, o_file)
        return self.partial_match_locations

    def extractSequenceContexts(self, loci):
        """Placeholder; currently does nothing and returns None."""
        pass

    def processSeqeuenceContexts(self, fasta_file, motif):
        """Print, per FASTA record, whether it has a gap and a motif variant.

        A header line is printed first.  Each record then yields a line of
        two tab-separated flags: "1" if the sequence contains a run of seven
        "N" (or seven "n") characters, and "1" if it contains any sequence
        that differs from ``motif`` by one A/T/C/G substitution.
        """
        # Every single-base substitution of the motif.
        snvs = set()
        for index, motif_base in enumerate(motif):
            for base in ['A', 'T', 'C', 'G']:
                if base == motif_base:
                    continue
                snvs.add(motif[:index] + base + motif[index + 1:])

        print("HasGap HasSNV")
        for record in SeqIO.parse(fasta_file, 'fasta'):
            has_gap = "NNNNNNN" in record.seq or "nnnnnnn" in record.seq
            has_snv = any(snv in record.seq for snv in snvs)
            output = ("1" if has_gap else "0") + "\t" + ("1" if has_snv else "0")
            print(output)

    def findNearestNeighbors(self, loci, neighbor_locis):
        """Return, per chromosome, each locus' distance to its nearest neighbor.

        Args:
            loci: {chromosome: [position, ...]}
            neighbor_locis: sequence of dictionaries shaped like ``loci``.

        Returns:
            {chromosome: [distance, ...]} with one entry per locus that has at
            least one neighbor on the same chromosome; loci without any
            neighbor are left out.
        """
        neighbors = {}
        for chrom in loci:
            distances = neighbors.setdefault(chrom, [])
            for locus in loci[chrom]:
                nearest_dist = None
                for neighbor_loci in neighbor_locis:
                    if chrom not in neighbor_loci:
                        continue
                    for neighbor_locus in neighbor_loci[chrom]:
                        dist = abs(locus - neighbor_locus)
                        if nearest_dist is None or dist < nearest_dist:
                            nearest_dist = dist
                if nearest_dist is not None:
                    distances.append(nearest_dist)
        return neighbors

    def findLabelsWithNearNeighbors(self, loci, neighbor_locis, threshold=301):
        """Count loci whose nearest neighbor is closer than ``threshold``."""
        nearest_neighbors = self.findNearestNeighbors(loci, neighbor_locis)
        offending_count = 0
        for chrom in nearest_neighbors:
            for distance in nearest_neighbors[chrom]:
                if distance < threshold:
                    offending_count += 1
        return offending_count