示例#1
0
    def test_window_to_json(self):
        """
        Check that a window is dumped into a JSON formatted string.

        Two observations are added to a window, the window is serialized
        with ``to_json`` and the parsed result is compared against the
        window id, the window capacity and the expected serialized form
        of every observation.

        Returns
        -------
        None.

        """
        window = Window(idx=10, capacity=5)

        # (position, read_depth) of the observations placed in the window
        for position, read_depth in ((100, 200), (300, 500)):
            window.add(observation=Observation(
                position=position, read_depth=read_depth, base=['A', 'B']))

        parsed = json.loads(window.to_json())

        self.assertEqual(parsed["id"], window.get_id())
        self.assertEqual(parsed["capacity"], window.capacity())
        self.assertEqual(len(parsed["observations"]), 2)

        # each entry of "observations" is expected to be a JSON string
        expected_observations = (
            '{"position": 100, "read_depth": 200, "base": ["A", "B"]}',
            '{"position": 300, "read_depth": 500, "base": ["A", "B"]}',
        )
        for index, expected in enumerate(expected_observations):
            self.assertEqual(parsed["observations"][index], expected)
示例#2
0
    def create_windows(self, nwindows, window_capacity):
        """
        Build ``nwindows`` windows and populate each with observations.

        Every window receives ``window.capacity()`` observations. The
        observation positions run consecutively across all windows
        (starting at 0), the read depth of an observation is its index
        inside its own window and the base is always ``['A']``.

        Parameters
        ----------
        nwindows : number of windows to create
        window_capacity : capacity handed to every created window

        Returns
        -------
        list with the created windows, in creation order
        """
        created = []
        next_position = 0

        for window_id in range(nwindows):
            current = Window(idx=window_id, capacity=window_capacity)

            for depth in range(current.capacity()):
                current.add(observation=Observation(position=next_position,
                                                    read_depth=depth,
                                                    base=['A']))
                next_position += 1

            created.append(current)

        return created
    def load(filename):
        """
        Read a region together with its windows from a text file.

        Every line of the file is expected to look like ``label:value``;
        only the text between the first and the second colon is used.
        The lines are consumed in this order:

        - region id, start, end and window size (integers)
        - number of WGA windows, followed by the windows
        - number of NO_WGA windows, followed by the windows

        A window is stored as its id, its capacity and its number of
        observations (integers) followed by three lines per observation:
        position (int), read depth (float) and base. The characters of
        the base value become the elements of the base list.

        :param filename: path of the file to read
        :return: the Region populated with both window lists
        """

        def next_value(stream):
            # second colon-separated field of the next line, newline removed
            return stream.readline().split(":")[1].rstrip("\n")

        def next_observation(stream):
            position = int(next_value(stream))
            read_depth = float(next_value(stream))
            base = list(next_value(stream))
            return Observation(position=position,
                               read_depth=read_depth,
                               base=base)

        def next_window(stream):
            window_id = int(next_value(stream))
            capacity = int(next_value(stream))
            n_observations = int(next_value(stream))

            loaded = Window(idx=window_id, capacity=capacity)
            for _ in range(n_observations):
                loaded.add(observation=next_observation(stream))
            return loaded

        def next_window_list(stream):
            # the list is preceded by the number of windows it holds
            n_windows = int(next_value(stream))
            return [next_window(stream) for _ in range(n_windows)]

        with open(filename, 'r') as f:
            idx = int(next_value(f))
            start = int(next_value(f))
            end = int(next_value(f))
            w_size = int(next_value(f))

            region = Region(idx=idx, start=start, end=end, window_size=w_size)

            wga_windows = next_window_list(f)
            region.set_windows(wtype=WindowType.WGA, windows=wga_windows)

            no_wga_windows = next_window_list(f)
            region.set_windows(wtype=WindowType.NO_WGA, windows=no_wga_windows)
            return region
def extract_windows(chromosome, ref_filename, bam_filename, **args):
    """
    Extract consecutive windows for the given chromosome by coupling
    the reference file with the alignment file.

    The interval [start_idx, end_idx) is walked in steps of
    ``windowsize``. For every step ``window_sam_file`` is called with
    ``start`` and ``end = start + windowsize`` and its result is stored
    as the ``samdata`` of a new Window. Note that the end of the last
    window is not clipped to ``end_idx``.

    :param chromosome: chromosome name (str)
    :param ref_filename: path of the reference file, opened with
                         pysam.FastaFile
    :param bam_filename: path of the alignment file, opened with
                         pysam.AlignmentFile in "rb" mode
    :param args: keyword options; "windowsize", "start_idx" and
                 "end_idx" are required. The complete mapping is also
                 forwarded to window_sam_file
    :return: list of Window instances, with ids counting up from 0
    :raises KeyError: if one of the required options is missing
    :raises ValueError: if "windowsize" is not positive
    """

    windowcapacity = args["windowsize"]
    start_idx = args["start_idx"]
    end_idx = args["end_idx"]

    # a non-positive step would never reach end_idx and the loop
    # below would append windows forever
    if windowcapacity <= 0:
        raise ValueError(
            "windowsize must be positive, got {0}".format(windowcapacity))

    # the windows list
    windows = []

    with pysam.FastaFile(ref_filename) as fastafile:
        print("{0} Reference file: {1}".format(INFO, fastafile.filename))

        with pysam.AlignmentFile(bam_filename, "rb") as sam_file:
            print("{0} Sam file: {1} ".format(INFO, sam_file.filename))

            wcounter = 0
            while start_idx < end_idx:
                sam_output = window_sam_file(chromosome=chromosome,
                                             sam_file=sam_file,
                                             fastafile=fastafile,
                                             start=start_idx,
                                             end=start_idx + windowcapacity,
                                             **args)
                windows.append(
                    Window(idx=wcounter,
                           capacity=windowcapacity,
                           samdata=sam_output))

                start_idx += windowcapacity
                wcounter += 1

    return windows
    def load(filename):
        """
        Read a region together with its windows from a text file.

        Every line of the file is expected to look like ``label:value``;
        only the text between the first and the second colon is used
        as the value. The lines are consumed in this order:

        - region id, start, end and window size (integers)
        - number of WGA windows, followed by the windows
        - number of NO_WGA windows, followed by the windows

        A window is stored as its id, its capacity and its number of
        properties (integers) followed by one ``name:value`` line per
        property. The properties are collected in the ``samdata``
        dictionary of the window. A property value of ``True`` or
        ``False`` becomes a bool, a value that parses as a number
        becomes a float and anything else is kept as a string.

        :param filename: path of the file to read
        :return: the Region populated with both window lists
        """

        def next_value(stream):
            # second colon-separated field of the next line, newline removed
            return stream.readline().split(":")[1].rstrip("\n")

        def convert(text):
            if text == 'False':
                return False
            if text == 'True':
                return True
            try:
                return float(text)
            except ValueError:
                # not numeric: keep the raw text. Only ValueError is
                # caught so that e.g. KeyboardInterrupt is not swallowed
                return text

        def next_window(stream):
            wid = int(next_value(stream))
            cap = int(next_value(stream))
            n_props = int(next_value(stream))

            samdata = {}
            for _ in range(n_props):
                fields = stream.readline().split(":")
                samdata[fields[0]] = convert(fields[1].rstrip("\n"))

            return Window(idx=wid, capacity=cap, samdata=samdata)

        def next_window_list(stream):
            # the list is preceded by the number of windows it holds
            n_windows = int(next_value(stream))
            return [next_window(stream) for _ in range(n_windows)]

        print("{0} Loading region from file: {1}".format(INFO, filename))
        with open(filename, 'r') as f:

            idx = int(next_value(f))
            start = int(next_value(f))
            end = int(next_value(f))
            w_size = int(next_value(f))

            region = Region(idx=idx, start=start,
                            end=end, window_size=w_size)

            wga_windows = next_window_list(f)
            region.set_windows(wtype=WindowType.WGA, windows=wga_windows)

            no_wga_windows = next_window_list(f)
            region.set_windows(wtype=WindowType.NO_WGA, windows=no_wga_windows)
            return region
示例#6
0
    def test_windows_to_json(self):
        """
        Check that a list of windows survives a JSON round trip.

        Two windows holding two observations each are serialized with
        ``windows_to_json``, parsed and rebuilt with
        ``windows_from_json``. For both windows the id, the capacity and
        the fields of the first observation must match the originals.
        """
        windows = []
        for window_id in (10, 11):
            window = Window(idx=window_id, capacity=5)

            # (position, read_depth) of the observations in each window
            for position, read_depth in ((100, 200), (300, 500)):
                window.add(observation=Observation(
                    position=position, read_depth=read_depth,
                    base=['A', 'B']))

            windows.append(window)

        data = json.loads(windows_to_json(windows=windows))
        new_windows = windows_from_json(jsonmap=data)

        for index in (0, 1):
            self.assertEqual(new_windows[index].get_id(),
                             windows[index].get_id())
            self.assertEqual(new_windows[index].capacity(),
                             windows[index].capacity())
            self.assertEqual(new_windows[index][0].position,
                             windows[index][0].position)
            self.assertEqual(new_windows[index][0].read_depth,
                             windows[index][0].read_depth)
            self.assertEqual(new_windows[index][0].base,
                             windows[index][0].base)
示例#7
0
def create_windows(bamlist, indel_dict, fastdata, windowcapacity, start, end,
                   **kwargs):
    """
    Arrange the given bamlist into windows of size windowcapacity.
    Note that a window does not necessarily hold windowcapacity
    items; windowcapacity simply indicates the maximum number of
    observations that should be added in a window.

    Observations are expected at consecutive positions. When two
    successive items are not consecutive, the missing positions are
    requested from _get_missing_gap_info and added as observations
    before the current item is added.

    :param bamlist: sequence of items where item[0] is the position,
                    item[1] the read depth and item[2] the bases
    :param indel_dict: insertions/deletions dictionary
                       (not used by this function)
    :param fastdata: The reference sequence
    :param windowcapacity: capacity given to every window
    :param start: not used by this function
    :param end: not used by this function
    :param kwargs: optional "fill_missing_window_data" (bool); when it
                   is true "fill_missing_window_data_factor" is required
                   and is used as the read depth of the dummy
                   observations that pad the last window
    :return: a list of Window instances
    :raises Error: if bamlist or fastdata is empty, or if the number of
                   recovered gap items differs from the gap size
    """

    if not bamlist:
        raise Error("No test sequence is provided")

    if not fastdata:
        raise Error("No reference sequence is provided")

    print("{0} Estimated number"
          " of windows: {1} ".format(INFO,
                                     len(bamlist) // windowcapacity))

    # the returned list of windows
    windows = []

    window = Window(idx=0, capacity=windowcapacity)
    previous_observation = None

    for item in bamlist:

        # normalize the bases to upper case; str.upper returns a new
        # string so the result has to be kept
        raw_bases = item[2]
        if isinstance(raw_bases, str):
            bases = raw_bases.upper()
        else:
            bases = [base.upper() for base in raw_bases]

        observation = Observation(position=int(item[0]),
                                  read_depth=item[1],
                                  base=bases)

        if previous_observation is not None:

            previous_position = int(previous_observation.position)
            current_position = int(observation.position)

            # is this sequential? if not, fill the gap first
            if current_position != previous_position + 1:

                logging.info("For observation {0}"
                             " there is a gap. Next "
                             "observation is at {1}".format(
                                 previous_observation.position,
                                 observation.position))

                # minus one at the end because the previous
                # position has already been added
                gap_size = current_position - previous_position - 1

                # fill in the missing info from the reference file.
                # Positions are adjusted in _get_missing_gap_info
                # to start + 1, end + 1 to access the reference section
                window_gaps = _get_missing_gap_info(start=previous_position,
                                                    end=current_position,
                                                    fastdata=fastdata)

                if len(window_gaps) != gap_size:
                    raise Error("Invalid window_gaps. "
                                "Size {0} not equal to {1}".format(
                                    len(window_gaps), gap_size))

                # the gap may hold more items than the current window
                # can take; add_window_observation hands back the
                # window to continue with
                for win_gap_item in window_gaps:
                    dummy_observation = Observation(position=win_gap_item[0],
                                                    read_depth=win_gap_item[1],
                                                    base=win_gap_item[2])

                    window = add_window_observation(
                        window=window,
                        windows=windows,
                        observation=dummy_observation,
                        windowcapacity=windowcapacity)

        # the first observation, a sequential one and the one that
        # follows a gap are all added in the same way
        window = add_window_observation(window=window,
                                        windows=windows,
                                        observation=observation,
                                        windowcapacity=windowcapacity)
        previous_observation = observation

    # catch also the last window. The last
    # window may not be using all its capacity
    # as this depends on the partitioning. Optionally
    # we fill in the missing data if that was requested
    # NOTE(review): capacity is read as an attribute here; if Window
    # exposes it as a method this comparison is always unequal - confirm
    if len(window) != window.capacity:

        print("{0} Window {1} size {2} is"
              " not equal capacity {3} ".format(WARNING, window.idx,
                                                len(window), window.capacity))

        # fill in missing data if this is requested
        if kwargs.get("fill_missing_window_data", False):

            miss_factor = kwargs["fill_missing_window_data_factor"]

            # the former message referenced a third format argument
            # that was never supplied and raised IndexError
            print("{0} Filling missing window data"
                  " with factor {1} ".format(WARNING, miss_factor))

            while window.has_capacity():
                window.add(observation=Observation(position=DUMMY_ID,
                                                   read_depth=miss_factor,
                                                   base=[DUMMY_BASE]))

        windows.append(window)

    # sanity check that the last window is in
    if not any(w.idx == window.idx for w in windows):
        windows.append(window)

    return windows