예제 #1
0
def find_subread_entry(cmph5_alignment, subread_dict):
    """Find the entry in subread_dict that corresponds to cmph5_alignment.

    The alignment is matched to a ZMW by movie name and hole number. The
    subread bounds recorded for that ZMW are then scanned, and the first
    bounds that intersect the alignment's (rStart, rEnd) range are reported.

    Args:
        cmph5_alignment: a CmpH5IO.CmpH5Alignment
        subread_dict: dictionary from affixes.subread_dictionary that will be
                      searched for a match with the cmph5_alignment. It is
                      keyed by (movie_name, hole_number); each value is a
                      dictionary keyed by subread bounds.

    Returns:
        key, overlapping_bounds: the (movie_name, hole_number) key and the
                                 first subread bounds found to intersect the
                                 alignment.

        (None, None) is returned if the alignment's ZMW is not in
        subread_dict. (key, None) is returned if the ZMW is present but none
        of its subread bounds intersect the alignment.
    """

    key = (cmph5_alignment.movieInfo.Name, cmph5_alignment.HoleNumber)

    if key not in subread_dict:
        # If this alignment isn't in the input fofn, just skip it
        return None, None

    # Figure out which subread this alignment is from by checking for
    # overlap with the bounds from the region table
    alignment_range = (cmph5_alignment.rStart, cmph5_alignment.rEnd)

    # Iterate the dictionary directly rather than via iterkeys(), which only
    # exists on Python 2 dictionaries.
    for subread_bounds in subread_dict[key]:
        if BasH5IO.intersectRanges(alignment_range,
                                   subread_bounds) is not None:
            return key, subread_bounds

    return key, None
예제 #2
0
def get_chemistry_info(sam_header, input_filenames, fail_on_missing=False):
    """Get chemistry triple information for movies referenced in a SAM
    header.

    Args:
        sam_header: a pysam.Samfile.header, which is a multi-level dictionary.
                    Movie names are read from RG tags in this header.
        input_filenames: a list of bas, bax, or fofn filenames.
        fail_on_missing: if True, raise an exception if the chemistry
                         information for a movie in the header cannot be
                         found. If False, just log a warning.
    Returns:
        a list of strings that can be written as DS tags to RG entries in the
        header of a new SAM or BAM file. For example,
        ['BINDINGKIT:xxxx;SEQUENCINGKIT:yyyy;SOFTWAREVERSION:2.0']

        An empty list is returned if the header has no 'RG' section.

    Raises:
        ChemistryLoadingException if chemistry information cannot be found
        for a movie in the header and fail_on_missing is True.
    """

    # First get the full list of ba[sx] files, reading through any fofn or xml
    # inputs
    bas_filenames = []
    for filename in input_filenames:
        bas_filenames.extend(FofnIO.enumeratePulseFiles(filename))

    # Then get the chemistry triple for each movie in the list of bas files
    triple_dict = {}
    for bas_filename in bas_filenames:
        bas_file = BasH5IO.BasH5Reader(bas_filename)
        try:
            triple_dict[bas_file.movieName] = bas_file.chemistryBarcodeTriple
        finally:
            # Only two attributes are needed from each file, so release the
            # handle straight away instead of keeping every input open.
            bas_file.close()

    # Finally, find the movie names that appear in the header and collect the
    # chemistry triple for each read group that references a known movie
    if 'RG' not in sam_header:
        return []

    rgds_entries = {}
    for rg_entry in sam_header['RG']:
        rg_id = rg_entry['ID']
        rg_movie_name = rg_entry[MOVIENAME_TAG]

        try:
            rgds_entries[rg_id] = triple_dict[rg_movie_name]
        except KeyError:
            err_msg = (
                "Cannot find chemistry information for movie {m}.".format(
                    m=rg_movie_name))
            if fail_on_missing:
                raise ChemistryLoadingException(err_msg)
            log.warning(err_msg)

    return format_rgds_entries(rgds_entries)
    def setUpClass(cls):
        """Open the test bax.h5 file and build its subread dictionary.

        The reader and the dictionary are stored on the class as
        bash5_reader and subread_dict.
        """
        base_test_case.BaseTestCase.setUpClass()

        data_dir = base_test_case.ROOT_DATA_DIR
        bax_path = os.path.join(
            data_dir, "m130522_092457_42208_cTEST1_s1_p0.1.bax.h5")

        reader = BasH5IO.BasH5Reader(bax_path)
        cls.bash5_reader = reader
        cls.subread_dict = affixes.subread_dictionary(reader)
    def setUpClass(cls):
        """Open the test bax.h5 and cmp.h5 files and compute affix bounds.

        Stores the two readers, the subread dictionary built from the bax.h5
        reader, and the result of affixes.affix_boundaries on the class.
        """
        base_test_case.BaseTestCase.setUpClass()

        data_dir = base_test_case.ROOT_DATA_DIR
        bax_path = os.path.join(
            data_dir, "m130522_092457_42208_cTEST1_s1_p0.1.bax.h5")
        cmp_path = os.path.join(data_dir, "test_alignment.cmp.h5")

        cls.bash5_reader = BasH5IO.BasH5Reader(bax_path)
        cls.cmph5_reader = CmpH5IO.CmpH5Reader(cmp_path)

        cls.subread_dict = affixes.subread_dictionary(cls.bash5_reader)
        # The trailing 1 is presumably the minimum affix size -- TODO confirm
        # against affixes.affix_boundaries.
        cls.affix_bounds = affixes.affix_boundaries(
            cls.subread_dict, cls.cmph5_reader, 1)
    def setUpClass(cls):
        """Build the affix region table fixture shared by the tests.

        Opens the test bax.h5 and cmp.h5 files, computes the subread
        dictionary and affix bounds, reads the original 'PulseData/Regions'
        table from the bax.h5 file, and derives the affix region table from
        it. Every intermediate result is stored on the class.
        """
        base_test_case.BaseTestCase.setUpClass()

        data_dir = base_test_case.ROOT_DATA_DIR
        bax_path = os.path.join(
            data_dir, "m130522_092457_42208_cTEST1_s1_p0.1.bax.h5")
        cmp_path = os.path.join(data_dir, "test_alignment.cmp.h5")

        bax_reader = BasH5IO.BasH5Reader(bax_path)
        cmp_reader = CmpH5IO.CmpH5Reader(cmp_path)
        cls.bash5_reader = bax_reader
        cls.cmph5_reader = cmp_reader

        cls.subread_dict = affixes.subread_dictionary(bax_reader)
        # The trailing 1 is presumably the minimum affix size -- TODO confirm
        # against affixes.affix_boundaries.
        cls.affix_bounds = affixes.affix_boundaries(
            cls.subread_dict, cmp_reader, 1)

        regions_dataset = bax_reader.file['PulseData/Regions']
        cls.original_region_table = regions_dataset.value
        cls.region_table = affixes.affix_region_table(
            cls.original_region_table,
            bax_reader.movieName,
            cls.affix_bounds)
    def test_no_overlap_alignments(self):
        """Affixes cannot overlap with any aligned part of any subread.

        Every affix range recorded for a ZMW is checked against every aligned
        (rStart, rEnd) range from the same ZMW; no pair may intersect.
        """

        # Group the aligned read ranges by the ZMW they came from.
        alignment_dict = {}
        for alignment in self.cmph5_reader:
            key = (alignment.movieInfo.Name, alignment.HoleNumber)
            alignment_dict.setdefault(key, []).append(
                (alignment.rStart, alignment.rEnd))

        for key in self.affix_bounds:
            # A ZMW without alignments has nothing to overlap with. Defaulting
            # to an empty list keeps the check from hitting an unbound name
            # on the first key or reusing the ranges of the previous key.
            aligned_ranges = alignment_dict.get(key, [])

            for affix in self.affix_bounds[key]:
                for aligned_range in aligned_ranges:
                    self.assertIsNone(
                        BasH5IO.intersectRanges(affix, aligned_range))
    def test_no_overlap_alignments(self):
        """Affixes cannot overlap with any aligned part of any subread.

        For each ZMW that has affix bounds, every affix range is compared
        with every aligned (rStart, rEnd) range of that same ZMW, and the
        two must never intersect.
        """

        # Collect the aligned read ranges of each ZMW.
        ranges_by_zmw = {}
        for alignment in self.cmph5_reader:
            zmw = (alignment.movieInfo.Name, alignment.HoleNumber)
            ranges_by_zmw.setdefault(zmw, []).append(
                (alignment.rStart, alignment.rEnd))

        for zmw in self.affix_bounds:
            if zmw not in ranges_by_zmw:
                # No alignments for this ZMW, so there is nothing to compare.
                # Skipping avoids using an unbound or stale list of ranges
                # left over from a previous iteration.
                continue

            zmw_ranges = ranges_by_zmw[zmw]
            for affix in self.affix_bounds[zmw]:
                for aligned_range in zmw_ranges:
                    self.assertIsNone(BasH5IO.intersectRanges(
                        affix, aligned_range))
예제 #8
0
def create_affix_region_tables(input_fofn_filename, cmph5_filename,
                               output_path, min_affix_size):
    """Create the pbbridgemapper rgn.h5 and fofn files.

    One rgn.h5 file is written per bax.h5 file listed in the input fofn, into
    the 'pbbridgemapper_regions' directory under output_path. A fofn listing
    those rgn.h5 files, one per line with no trailing newline, is then
    written directly under output_path.

    Args:
        input_fofn_filename: fofn of bax.h5 filenames
        cmph5_filename: aligned_reads.cmp.h5 for the bax.h5 files
        output_path: path where the fofn and rgn.h5 files will be written
        min_affix_size: smallest affix that will be included in the
                        region table

    Returns:
        output_fofn_filename: file name of the FOFN of pbbridgemapper rgn.h5
                              files

    Raises:
        OSError: if the regions directory cannot be created for a reason
                 other than it already existing.
    """

    bash5_filenames = list(FofnIO.readFofn(input_fofn_filename))
    logging.info("Read filenames from input fofn file: %s", bash5_filenames)

    regions_dir = os.path.join(output_path, 'pbbridgemapper_regions')

    cmph5_reader = CmpH5IO.CmpH5Reader(cmph5_filename)
    logging.info("Opened %s", cmph5_filename)
    output_rgn_filenames = []

    # The cmp.h5 reader is shared by every bax.h5 file, so it is closed once
    # at the end, whether or not processing succeeded.
    try:
        try:
            os.makedirs(regions_dir)
        except OSError as e:
            # An already-existing directory is fine; anything else is fatal.
            if e.errno != errno.EEXIST:
                raise OSError("Could not create regions directory {d}.".format(
                    d=regions_dir))

        for bash5_filename in bash5_filenames:
            output_rgn_filenames.append(
                _write_affix_region_table(bash5_filename, cmph5_reader,
                                          regions_dir, min_affix_size))
    finally:
        cmph5_reader.close()

    # Now the rgn.h5 files have been created, we just need to make the fofn
    output_fofn_filename = os.path.join(
        output_path,
        re.sub("fofn$", "pbbridgemapper_regions.fofn",
               os.path.basename(input_fofn_filename)))

    with open(output_fofn_filename, 'w') as output_fofn_file:
        # Newline-separated, deliberately without a trailing newline.
        output_fofn_file.write('\n'.join(output_rgn_filenames))
    logging.info("Wrote rgn file names to %s", output_fofn_filename)

    return output_fofn_filename


def _write_affix_region_table(bash5_filename, cmph5_reader, regions_dir,
                              min_affix_size):
    """Write the rgn.h5 file for one bax.h5 file and return its file name."""

    logging.debug("Getting affix boundaries from %s", bash5_filename)

    output_rgn_filename = os.path.join(
        regions_dir,
        re.sub(r"ba[sx]\.h5$", "rgn.h5", os.path.basename(bash5_filename)))

    bash5_reader = BasH5IO.BasH5Reader(bash5_filename)
    try:
        subread_dict = pbbridgemapper.affixes.subread_dictionary(bash5_reader)
        logging.debug("Created subread dictionary from %d ZMWs",
                      len(subread_dict))

        affix_bounds = pbbridgemapper.affixes.affix_boundaries(
            subread_dict, cmph5_reader, min_affix_size)
        logging.debug("Found %d unmapped affixes",
                      sum(len(bounds) for bounds in affix_bounds.values()))

        # NOTE(review): if `file` is an h5py File, Dataset.value is gone in
        # h5py 3.0 and `[()]` is the replacement -- confirm before upgrading.
        original_region_table = (
            bash5_reader.file.get('/PulseData/Regions').value)
        affix_region_table = pbbridgemapper.affixes.affix_region_table(
            original_region_table, bash5_reader.movieName, affix_bounds)

        pbbridgemapper.affixes.write_region_table(affix_region_table,
                                                  output_rgn_filename,
                                                  bash5_reader)
    finally:
        # Release the bax.h5 handle even if building the table failed.
        bash5_reader.close()

    logging.info("Wrote pbbridgemapper region table to %s",
                 output_rgn_filename)
    return output_rgn_filename
예제 #9
0
def create_pbbridgemapper_output(input_fofn_filename, affix_cmph5_filename,
                                 primary_cmph5_filename, split_reads_filename,
                                 unique_only):
    """Create the split_reads file that SMRTview wants.

    Builds a dictionary of every subread in the input fofn, attaches the
    highest-mapQV primary alignment to each subread, attaches the
    highest-mapQV prefix and suffix affix alignments to subreads that have a
    primary alignment, and writes the result.

    Args:
        input_fofn_filename: fofn of bax.h5 filenames
        affix_cmph5_filename: cmp.h5 file of affix alignments. An empty
                              cmp.h5 (CmpH5IO.EmptyCmpH5Error) is treated as
                              containing no alignments.
        primary_cmph5_filename: cmp.h5 file of primary alignments
        split_reads_filename: name of the output file to write
        unique_only: if true,
                     smrtview_output.remove_nonunique_alignments is applied
                     to the dictionary before it is written
    """

    # First build a dictionary of all the subreads in the input fofn
    bash5_filenames = list(FofnIO.readFofn(input_fofn_filename))
    all_subread_dict = {}
    for bash5_filename in bash5_filenames:
        bash5_reader = BasH5IO.BasH5Reader(bash5_filename)
        try:
            all_subread_dict.update(
                pbbridgemapper.affixes.subread_dictionary(bash5_reader))
        finally:
            # The reader is only needed to build the dictionary.
            bash5_reader.close()

    # Now iterate through the primary alignments, recording best scoring
    # primary alignments for each subread
    primary_cmph5_reader = CmpH5IO.CmpH5Reader(primary_cmph5_filename)
    try:
        _record_primary_alignments(primary_cmph5_reader, all_subread_dict)
    finally:
        primary_cmph5_reader.close()

    # Now iterate through the affix alignments
    try:
        affix_cmph5_reader = CmpH5IO.CmpH5Reader(affix_cmph5_filename)
    except CmpH5IO.EmptyCmpH5Error:
        affix_cmph5_reader = None

    if affix_cmph5_reader is not None:
        try:
            _record_affix_alignments(affix_cmph5_reader, all_subread_dict)
        finally:
            affix_cmph5_reader.close()

    if unique_only:
        pbbridgemapper.smrtview_output.remove_nonunique_alignments(
            all_subread_dict)
    pbbridgemapper.smrtview_output.write_split_reads_file(
        all_subread_dict, split_reads_filename)


def _record_primary_alignments(primary_alignments, all_subread_dict):
    """Store the highest-mapQV primary alignment of each subread in place."""

    for alignment in primary_alignments:
        key, overlapping_bounds = pbbridgemapper.affixes.find_subread_entry(
            alignment, all_subread_dict)
        # Skip alignments whose ZMW is not in the input fofn, and alignments
        # that do not overlap any known subread of their ZMW.
        if key is None or overlapping_bounds is None:
            continue

        subread_entry = all_subread_dict[key][overlapping_bounds]
        if subread_entry is not None:
            existing_alignment = subread_entry['primary']
        else:
            existing_alignment = None

        # Only replace an already recorded primary alignment with a strictly
        # better scoring one.
        if (existing_alignment is None
                or alignment.mapQV > existing_alignment['map_qv']):
            all_subread_dict[key][overlapping_bounds] = {
                'primary': (
                    pbbridgemapper.smrtview_output.alignment_to_output_dict(
                        alignment, overlapping_bounds)),
            }


def _record_affix_alignments(affix_alignments, all_subread_dict):
    """Store the highest-mapQV prefix and suffix alignments in place."""

    for alignment in affix_alignments:
        key, overlapping_bounds = pbbridgemapper.affixes.find_subread_entry(
            alignment, all_subread_dict)
        if key is None or overlapping_bounds is None:
            continue

        # Affixes are only reported for subreads with a primary alignment.
        subread_entry = all_subread_dict[key][overlapping_bounds]
        if subread_entry is None:
            continue

        alignment_dict = (
            pbbridgemapper.smrtview_output.alignment_to_output_dict(
                alignment, overlapping_bounds))
        primary_dict = subread_entry['primary']

        # Figure out if this is a prefix or suffix alignment. An alignment
        # that is neither entirely before nor entirely after the primary
        # alignment is ignored.
        if alignment_dict['subread_end'] <= primary_dict['subread_start']:
            affix_type = 'prefix'
        elif alignment_dict['subread_start'] >= primary_dict['subread_end']:
            affix_type = 'suffix'
        else:
            continue

        existing_alignment = subread_entry.get(affix_type)
        if (existing_alignment is None
                or alignment.mapQV > existing_alignment['map_qv']):
            subread_entry[affix_type] = alignment_dict