def test_window_to_json(self):
    """
    Check that Window.to_json() produces a JSON document carrying the
    window id, its capacity and one serialized entry per observation.

    Returns
    -------
    None.
    """
    # (position, read_depth) of the observations placed in the window
    samples = [(100, 200), (300, 500)]
    expected_observations = [
        '{"position": 100, "read_depth": 200, "base": ["A", "B"]}',
        '{"position": 300, "read_depth": 500, "base": ["A", "B"]}',
    ]

    window = Window(idx=10, capacity=5)
    for position, read_depth in samples:
        window.add(observation=Observation(position=position,
                                           read_depth=read_depth,
                                           base=['A', 'B']))

    payload = json.loads(window.to_json())

    self.assertEqual(payload["id"], window.get_id())
    self.assertEqual(payload["capacity"], window.capacity())
    self.assertEqual(len(payload["observations"]), 2)

    # each observation is stored as its own JSON-formatted string
    for index, expected in enumerate(expected_observations):
        self.assertEqual(payload["observations"][index], expected)
def create_windows(self, nwindows, window_capacity):
    """
    Build a list of nwindows Window instances for use in the tests.

    Window i gets idx=i and capacity=window_capacity, and receives
    window.capacity() observations. Positions run consecutively
    across all the windows starting at 0, the read depth of an
    observation is its index inside its own window and the base is
    always ['A'].

    :param nwindows: number of windows to create
    :param window_capacity: capacity passed to every Window
    :return: the list of created windows
    """
    built = []

    # position is global, i.e. it keeps growing from window to window
    next_position = 0

    for window_id in range(nwindows):
        current = Window(idx=window_id, capacity=window_capacity)

        for depth in range(current.capacity()):
            current.add(observation=Observation(position=next_position,
                                                read_depth=depth,
                                                base=['A']))
            next_position += 1

        built.append(current)

    return built
def load(filename):
    """
    Rebuild a Region from a text file in which every line has the
    form "<label>:<value>".

    The lines are consumed strictly in this order: region idx, start,
    end and window size; then the number of WGA windows followed by
    the windows themselves; then the number of NO_WGA windows followed
    by the windows themselves. A window is stored as its id, capacity
    and number of observations, and every observation as position
    (int), read depth (float) and base (the value text split into a
    list of single characters).

    :param filename: path of the file to read
    :return: the Region with both the WGA and NO_WGA windows set
    :raises IndexError: if a line does not contain a ":"
    :raises ValueError: if a numeric field cannot be converted
    """
    with open(filename, 'r') as f:

        def next_value():
            # only the text between the first and the second ":" is kept
            return f.readline().split(":")[1].rstrip("\n")

        def read_window_group():
            # a group is a count line followed by that many windows
            group = []
            for _ in range(int(next_value())):
                wid = int(next_value())
                cap = int(next_value())
                size = int(next_value())
                window = Window(idx=wid, capacity=cap)

                for _ in range(size):
                    pos = int(next_value())
                    rd = float(next_value())
                    base = list(next_value())
                    window.add(observation=Observation(position=pos,
                                                       read_depth=rd,
                                                       base=base))
                group.append(window)
            return group

        idx = int(next_value())
        start = int(next_value())
        end = int(next_value())
        w_size = int(next_value())
        region = Region(idx=idx, start=start, end=end, window_size=w_size)

        # the WGA windows are stored first, the NO_WGA windows second
        region.set_windows(wtype=WindowType.WGA,
                           windows=read_window_group())
        region.set_windows(wtype=WindowType.NO_WGA,
                           windows=read_window_group())

    return region
def extract_windows(chromosome, ref_filename, bam_filename, **args):
    """
    Walk over [start_idx, end_idx) in steps of windowsize and create
    one Window per step from the data returned by window_sam_file.

    Note that the end passed for every step is start + windowsize,
    it is not clamped to end_idx.

    :param chromosome: chromosome name, forwarded to window_sam_file
    :param ref_filename: the reference file, opened with pysam.FastaFile
    :param bam_filename: the alignment file, opened with
        pysam.AlignmentFile in "rb" mode
    :param args: must contain "windowsize", "start_idx" and "end_idx".
        The whole mapping is also forwarded to window_sam_file
    :return: the list of created Window instances
    """
    capacity = args["windowsize"]
    position = args["start_idx"]
    stop = args["end_idx"]

    collected = []

    with pysam.FastaFile(ref_filename) as fastafile:
        print("{0} Reference file: {1}".format(INFO, fastafile.filename))

        with pysam.AlignmentFile(bam_filename, "rb") as sam_file:
            print("{0} Sam file: {1} ".format(INFO, sam_file.filename))

            while position < stop:
                sam_data = window_sam_file(chromosome=chromosome,
                                           sam_file=sam_file,
                                           fastafile=fastafile,
                                           start=position,
                                           end=position + capacity,
                                           **args)

                # windows are numbered in creation order, so the next
                # id is the number of windows collected so far
                collected.append(Window(idx=len(collected),
                                        capacity=capacity,
                                        samdata=sam_data))
                position += capacity

    return collected
def load(filename):
    """
    Rebuild a Region from a text file in which every line has the
    form "<label>:<value>".

    The lines are consumed strictly in this order: region idx, start,
    end and window size; then the number of WGA windows followed by
    the windows themselves; then the number of NO_WGA windows followed
    by the windows themselves. A window is stored as its id, capacity
    and number of properties, followed by one "name:value" line per
    property. The properties are collected in the samdata dictionary
    of the window; the text 'True'/'False' becomes a bool, text that
    float() accepts becomes a float and everything else stays a str.

    :param filename: path of the file to read
    :return: the Region with both the WGA and NO_WGA windows set
    :raises IndexError: if a line does not contain a ":"
    :raises ValueError: if an integer field cannot be converted
    """

    def parse_property(text):
        # convert the stored text back to bool/float when possible
        if text == 'False':
            return False
        if text == 'True':
            return True
        try:
            return float(text)
        except ValueError:
            # not a number, keep the raw text. Only ValueError is
            # caught so that e.g. KeyboardInterrupt is not swallowed
            return text

    print("{0} Loading region from file: {1}".format(INFO, filename))

    with open(filename, 'r') as f:

        def next_value():
            # only the text between the first and the second ":" is kept
            return f.readline().split(":")[1].rstrip("\n")

        def read_window_group():
            # a group is a count line followed by that many windows
            group = []
            for _ in range(int(next_value())):
                wid = int(next_value())
                cap = int(next_value())
                n_props = int(next_value())

                samdata = {}
                for _ in range(n_props):
                    line = f.readline().split(":")
                    samdata[line[0]] = parse_property(line[1].rstrip("\n"))

                group.append(Window(idx=wid, capacity=cap, samdata=samdata))
            return group

        idx = int(next_value())
        start = int(next_value())
        end = int(next_value())
        w_size = int(next_value())
        region = Region(idx=idx, start=start, end=end, window_size=w_size)

        # the WGA windows are stored first, the NO_WGA windows second
        region.set_windows(wtype=WindowType.WGA,
                           windows=read_window_group())
        region.set_windows(wtype=WindowType.NO_WGA,
                           windows=read_window_group())

    return region
def test_windows_to_json(self):
    """
    Round trip test: a list of windows dumped with windows_to_json
    and read back with windows_from_json must give windows having
    the same id, capacity and first observation.

    Returns
    -------
    None.
    """
    # (position, read_depth) of the observations added to every window
    samples = [(100, 200), (300, 500)]

    windows = []
    for window_id in (10, 11):
        window = Window(idx=window_id, capacity=5)
        for position, read_depth in samples:
            window.add(observation=Observation(position=position,
                                               read_depth=read_depth,
                                               base=['A', 'B']))
        windows.append(window)

    data = json.loads(windows_to_json(windows=windows))
    new_windows = windows_from_json(jsonmap=data)

    for index in (0, 1):
        self.assertEqual(new_windows[index].get_id(),
                         windows[index].get_id())
        self.assertEqual(new_windows[index].capacity(),
                         windows[index].capacity())

        # only the first observation of each window is compared
        for attribute in ("position", "read_depth", "base"):
            self.assertEqual(getattr(new_windows[index][0], attribute),
                             getattr(windows[index][0], attribute))
def create_windows(bamlist, indel_dict, fastdata, windowcapacity,
                   start, end, **kwargs):
    """
    Arrange the given bamlist into windows of size windowcapacity.

    Note that a window will not necessarily hold windowcapacity items;
    windowcapacity simply indicates the maximum number of observations
    that should be added in a window. Whenever two successive items of
    bamlist are not at consecutive positions, the missing positions are
    requested from _get_missing_gap_info and added as observations
    before the current one.

    :param bamlist: sequence of items where item[0] is the position,
        item[1] the read depth and item[2] the bases. The bases are
        upper-cased before they are stored
    :param indel_dict: insertions/deletions dictionary (not used here)
    :param fastdata: The reference sequence
    :param windowcapacity: maximum number of observations per window
    :param start: not used here
    :param end: not used here
    :param kwargs: optional settings. If "fill_missing_window_data" is
        truthy, a last window that is not full is padded with dummy
        observations having read depth
        kwargs["fill_missing_window_data_factor"]
    :return: a list of Window instances
    :raises Error: if bamlist or fastdata is empty, or if the number of
        items returned for a gap is not equal to the size of the gap
    """

    if not bamlist:
        raise Error("No test sequence is provided")

    if not fastdata:
        raise Error("No reference sequence is provided")

    print("{0} Estimated number"
          " of windows: {1} ".format(INFO, len(bamlist) // windowcapacity))

    # the returned list of windows
    windows = []
    window = Window(idx=0, capacity=windowcapacity)
    previous_observation = None

    for item in bamlist:

        # str.upper() returns a new string, so the result has to be
        # kept; simply calling it on every base changes nothing
        raw_bases = item[2]
        if isinstance(raw_bases, str):
            bases = raw_bases.upper()
        else:
            bases = [base.upper() for base in raw_bases]

        observation = Observation(position=int(item[0]),
                                  read_depth=item[1],
                                  base=bases)

        if previous_observation is not None:

            previous_position = int(previous_observation.position)
            current_position = int(observation.position)

            # is this sequential? If not there is a gap and we cannot
            # simply add the observation, as it may have to go to a
            # later window depending on the size of the gap
            if current_position != previous_position + 1:

                logging.info("For observation {0}"
                             " there is a gap. Next "
                             "observation is at {1}".format(
                                 previous_observation.position,
                                 observation.position))

                # minus one at the end because the previous position
                # has already been added
                gap_size = current_position - previous_position - 1

                # fill in the missing info from the reference file
                window_gaps = \
                    _get_missing_gap_info(start=previous_position,
                                          end=current_position,
                                          fastdata=fastdata)

                if len(window_gaps) != gap_size:
                    raise Error("Invalid window_gaps. "
                                "Size {0} not equal to {1}".format(
                                    len(window_gaps), gap_size))

                # the gap may hold more items than the room left in
                # the current window; add_window_observation returns
                # the window to be used for the next observation
                for win_gap_item in window_gaps:
                    dummy_observation = \
                        Observation(position=win_gap_item[0],
                                    read_depth=win_gap_item[1],
                                    base=win_gap_item[2])

                    window = add_window_observation(
                        window=window,
                        windows=windows,
                        observation=dummy_observation,
                        windowcapacity=windowcapacity)

        # first observation, sequential observation or observation
        # after a gap: in all cases it is added and becomes the
        # reference for the next iteration
        window = add_window_observation(window=window,
                                        windows=windows,
                                        observation=observation,
                                        windowcapacity=windowcapacity)
        previous_observation = observation

    # catch also the last window. The last window may not be using
    # all its capacity as this depends on the partitioning. Optionally
    # we fill in the missing data if that was requested
    # NOTE(review): capacity and idx are read as attributes here; if
    # Window exposes capacity as a method this comparison is always
    # true -- confirm against the Window class
    if len(window) != window.capacity:

        print("{0} Window {1} size {2} is"
              " not equal capacity {3} ".format(WARNING,
                                                window.idx,
                                                len(window),
                                                window.capacity))

        # fill in missing data if this is requested
        if kwargs.get("fill_missing_window_data", False):

            miss_factor = kwargs["fill_missing_window_data_factor"]

            # the previous message had three placeholders but only two
            # arguments, which raised IndexError
            print("{0} Filling missing window data"
                  " with read depth {1} ".format(WARNING, miss_factor))

            while window.has_capacity():
                window.add(observation=Observation(position=DUMMY_ID,
                                                   read_depth=miss_factor,
                                                   base=[DUMMY_BASE]))

    # make sure that the last window is in the list exactly once
    if not any(w.idx == window.idx for w in windows):
        windows.append(window)

    return windows