def calculate_non_overlapping_range_with(self, occupied):
    """Return the ranges of this block that do not collide with ``occupied``.

    Every value from ``self.block_occurrences()`` is added as the range
    ``(occurrence, occurrence + self.minimum_block_length)``. When none of
    those ranges intersects ``occupied`` they are returned unchanged.
    Otherwise each contiguous run is cut back so that it stops where the
    first overlapping run starting at or after it begins. A run with no
    such overlapping run is left out of the result.

    :param occupied: ranges that are already taken (anything accepted by
        ``RangeSet.intersection``).
    :return: a ``RangeSet`` with the usable ranges, or ``None`` when
        nothing is left after removing the overlap.
    :raises PartialOverlapException: when the first remaining run spans
        more than ``self.minimum_block_length`` positions.
    """
    # Convert block occurrences into ranges.
    # NOTE(review): add_range presumably treats the stop as exclusive -
    # confirm against the RangeSet implementation.
    potential_block_range = RangeSet()
    for occurrence in self.block_occurrences():
        potential_block_range.add_range(
            occurrence, occurrence + self.minimum_block_length)

    # Check the intersection with the already occupied ranges.
    block_intersection = potential_block_range.intersection(occupied)
    if not block_intersection:
        # No overlap: the complete block range is usable.
        return potential_block_range

    # There is overlap with the occupied range. The start of every
    # overlapping run is collected once: neither range set is modified
    # inside the loop below, so there is no need to recompute it per run.
    overlap_starts = [run[0] for run in block_intersection.contiguous()]

    real_block_range = RangeSet()
    for potential_run in potential_block_range.contiguous():
        lower = potential_run[0]
        # Find-first: the earliest overlap start that is not before this run.
        upper = next(
            (start for start in overlap_starts if start >= lower), None)
        # lower == upper means the overlap begins on the run's first
        # position, so nothing of the run remains.
        if upper is not None and lower != upper:
            real_block_range.add_range(lower, upper)

    if not real_block_range:
        # There is complete overlap, so return None.
        return None

    # Sanity check: the first slice must not be larger than the potential
    # block length.
    first_range = next(real_block_range.contiguous())
    if first_range[-1] - first_range[0] + 1 > self.minimum_block_length:
        raise PartialOverlapException()
    return real_block_range
def calculate_non_overlapping_range_with(self, occupied):
    """Compute which ranges of this block are still free given ``occupied``.

    The block occurrences are expanded into ranges of
    ``self.minimum_block_length`` and intersected with ``occupied``. Without
    any intersection the expanded ranges are returned as they are. With an
    intersection, every contiguous run is shortened to end at the start of
    the first overlapping run that begins at or after the run's own start;
    runs for which no such overlapping run exists are dropped.

    :param occupied: the ranges that have already been claimed.
    :return: ``RangeSet`` of free ranges, or ``None`` if the overlap is
        complete.
    :raises PartialOverlapException: if the first free run is longer than
        ``self.minimum_block_length``.
    """
    # convert block occurrences into ranges
    potential_block_range = RangeSet()
    for occurrence in self.block_occurrences():
        potential_block_range.add_range(
            occurrence, occurrence + self.minimum_block_length)

    # check the intersection with the already occupied ranges
    block_intersection = potential_block_range.intersection(occupied)
    if not block_intersection:
        # no overlap, return complete block_range
        return potential_block_range

    # There is overlap with the occupied range; trim every run.
    real_block_range = RangeSet()
    for lower_range in potential_block_range.contiguous():
        lower = lower_range[0]
        # Stop at the first match instead of building the full list of
        # candidates just to read its first element.
        upper_range = next(
            (x for x in block_intersection.contiguous() if x[0] >= lower),
            None)
        if upper_range is None:
            continue
        upper = upper_range[0]
        if lower != upper:
            real_block_range.add_range(lower, upper)

    if not real_block_range:
        # There is complete overlap, so return None
        return None

    # Check that the first slice is not larger than the potential block
    # length. The built-in next() is used because the iterator method
    # .next() only exists on Python 2 iterators.
    first_range = next(real_block_range.contiguous())
    if first_range[-1] - first_range[0] + 1 > self.minimum_block_length:
        raise PartialOverlapException()
    return real_block_range
def _get_non_overlapping_repeating_blocks(self):
    """Pick repeating blocks so that no two selected blocks overlap.

    All LCP intervals reported by ``self.token_index`` are candidate blocks.
    They are drained from a priority queue (the ordering comes from the
    interval objects themselves; per the original author: number of
    witnesses first, block length second). A candidate is

    * taken as is when it does not touch anything selected so far,
    * skipped when it is completely covered by earlier selections,
    * otherwise trimmed to its still-free parts and taken only if at least
      two of those parts begin on one of the candidate's start positions.

    :return: list of ``Block`` objects, in order of selection.
    """
    potential_blocks = self.token_index.split_lcp_array_into_intervals()
    candidates = PriorityQueue()
    for interval in potential_blocks:
        candidates.put(interval)

    occupied = RangeSet()
    real_blocks = []
    while not candidates.empty():
        item = candidates.get()
        candidate_range = item._as_range()
        overlap = candidate_range.intersection(occupied)

        if not overlap:
            # Nothing in the way: select the whole candidate.
            occupied.union_update(candidate_range)
            real_blocks.append(Block(candidate_range))
            continue

        if overlap == candidate_range:
            # Completely covered by earlier selections.
            continue

        # Partial overlap. Left-hand side: only keep free pieces that
        # still begin on one of the candidate's own start positions.
        free_part = candidate_range.difference(overlap)
        start_pos = item.block_occurrences()
        anchored = RangeSet()
        anchored_count = 0
        for piece in free_part.contiguous():
            if piece[0] in start_pos:
                anchored.add_range(piece[0], piece[-1] + 1)
                anchored_count += 1
        if anchored_count < 2:
            continue

        # Right-hand side: every occurrence is cut to the shortest
        # remaining piece, never longer than the candidate itself.
        minimum_length = min(
            [item.length] + [len(piece) for piece in anchored.contiguous()])

        result = RangeSet()
        for piece in anchored.contiguous():
            result.add_range(piece[0], piece[0] + minimum_length)

        occupied.union_update(result)
        real_blocks.append(Block(result))

    return real_blocks
def _get_non_overlapping_repeating_blocks(self):
    """Return the definitive, mutually non-overlapping repeating blocks.

    The LCP intervals computed by ``self.token_index`` are all potential
    blocks, but some of them overlap. They are put on a priority queue and
    handled one at a time against the set of positions already occupied by
    previously accepted blocks:

    * no overlap: the potential block is accepted unchanged;
    * complete overlap: the potential block is discarded;
    * partial overlap: the free remainder is reduced to the pieces that
      start on a block occurrence, all pieces are cut to one common length,
      and the outcome is accepted if at least two pieces were found.

    :return: a list with one ``Block`` per accepted range set.
    """
    potential_blocks = self.token_index.split_lcp_array_into_intervals()

    # Queue ordering is whatever the interval objects define (according to
    # the original comment: 1) number of witnesses, 2) block length).
    queue = PriorityQueue()
    for interval in potential_blocks:
        queue.put(interval)

    occupied = RangeSet()
    real_blocks = []
    while not queue.empty():
        item = queue.get()
        selected = item._as_range()
        taken = selected.intersection(occupied)

        if taken:
            if taken == selected:
                # Every position is already in use.
                continue

            remainder = selected.difference(taken)

            # Left partial overlap: a piece only counts when it begins on
            # one of the start positions of this block.
            start_pos = item.block_occurrences()
            anchored = [
                piece for piece in remainder.contiguous()
                if piece[0] in start_pos
            ]
            if len(anchored) < 2:
                continue

            resulting_difference = RangeSet()
            for piece in anchored:
                resulting_difference.add_range(piece[0], piece[-1] + 1)

            # Right partial overlap: determine the minimum allowed length.
            minimum_length = item.length
            for piece in resulting_difference.contiguous():
                minimum_length = min(minimum_length, len(piece))

            selected = RangeSet()
            for piece in resulting_difference.contiguous():
                selected.add_range(piece[0], piece[0] + minimum_length)

        # Record the untouched or trimmed range as a real block.
        occupied.union_update(selected)
        real_blocks.append(Block(selected))

    return real_blocks
def test_contiguous(self):
    """contiguous() must yield one item per unbroken run of the set."""
    # An empty set has no runs at all.
    empty = RangeSet()
    self.assertEqual([], list(map(str, empty.contiguous())))

    # Single values and multi-value runs are each reported separately.
    mixed = RangeSet("1,3-9,14-21,30-39,42")
    expected_runs = ['1', '3-9', '14-21', '30-39', '42']
    actual_runs = list(map(str, mixed.contiguous()))
    self.assertEqual(expected_runs, actual_runs)
def test_contiguous(self):
    """Check that contiguous() splits a RangeSet into its unbroken runs.

    Covers the empty set (no runs expected) as well as a set that mixes
    single values with multi-value runs.
    """
    # Edge case: iterating the runs of an empty set must yield nothing.
    r0 = RangeSet()
    self.assertEqual([], [str(ns) for ns in r0.contiguous()])

    r1 = RangeSet("1,3-9,14-21,30-39,42")
    self.assertEqual(['1', '3-9', '14-21', '30-39', '42'],
                     [str(ns) for ns in r1.contiguous()])