예제 #1
0
파일: pdb.py 프로젝트: porteusconf/forgi
def output_multiple_chains(chains, filename, file_type="pdb"):
    '''
    Dump multiple chains to an output file. Remove the hydrogen atoms.

    :param chains: An iterable of Bio.PDB.Chain to dump.
    :param filename: The place to dump it.
    :param file_type: "pdb" to write with bpdb.PDBIO; any other value
                      writes with bpdb.MMCIFIO.
    :raises ValueError: If file_type is "pdb" and the id of a chain is not
                        exactly one character long.
    '''
    class HSelect(bpdb.Select):
        def accept_atom(self, atom):
            # Reject every atom whose name contains an 'H'.
            return 'H' not in atom.name

    m = bpdb.Model.Model(0)
    s = bpdb.Structure.Structure('stru')
    for chain in chains:
        log.debug("Adding chain %s with %s residues", chain.id, len(chain))
        # Validate before touching the model, and actually fill in the
        # chain name (the placeholder used to be emitted verbatim).
        if file_type == "pdb" and len(chain.id) != 1:
            raise ValueError("Cannot save chain with name %s (not a single character) "
                             "in PDB format. Use cif format instead!" % (chain.id,))
        m.add(chain)
    s.add(m)
    if file_type == "pdb":
        io = bpdb.PDBIO()
    else:
        io = bpdb.MMCIFIO()
    io.set_structure(s)
    try:
        io.save(filename, HSelect())
    except Exception as e:
        # Attach the chain/residue listing to the exception before re-raising.
        with log_to_exception(log, e):
            log.error("Could not output PDB with chains and residues:")
            for chain in s[0]:
                log.error("%s: %s", chain.id, [r.id for r in chain])
        raise
예제 #2
0
def parse_base_pair_id(base_pair_id):
    """
    Split an interaction identifier from an MC-Annotate output file into the
    chain/base information of its two nucleotides.

    @param base_pair_id: The identifier string for the interacting nucleotides (i.e. 'A33-B45')
    @return: 4-tuple (chain1, res1, chain2, res2); each (chain, res) pair is
             the result of parse_chain_base for one half of the identifier.
    @raise ValueError: If the identifier is not exactly two residue
                       identifiers joined by a single '-'.
    """
    # One residue identifier: the chain (a digit in single quotes or a letter),
    # a possibly negative number and, optionally, '.' plus an insertion code.
    one_residue = r"(?:'\d'|[A-Za-z])-?\d+(?:\.[A-Za-z])?"
    matches = re.findall(one_residue, base_pair_id)

    if len(matches) != 2:
        error = ValueError("Invalid interaction in the MC-Annotate file: %s" %
                           base_pair_id)
        with log_to_exception(log, error):
            log.error("Regex matched the following parts: %s", matches)
        raise error

    first, second = matches
    # Anything in the input that the regex skipped makes the identifier invalid.
    if first + "-" + second != base_pair_id:
        raise ValueError("Invalid interaction in the MC-Annotate file: %s" %
                         base_pair_id)

    log.debug("Parts are '{}'".format(matches))
    from_chain, from_base = parse_chain_base(first.strip())
    to_chain, to_base = parse_chain_base(second.strip())
    return (from_chain, from_base, to_chain, to_base)
예제 #3
0
def _validate_pdb_to_stem(target_stem, chains, cg, elem_name):
    """
    Check that the stem built from the pdb chains matches the target stem.

    :param target_stem: A StemModel to which the pdb chain should be aligned
    :param chains: A dict {chain_id: Chain}
    :param cg: The original coarse-grained representation of the pdb chains
    :param elem_name: The elem_name in cg.
    :returns: True, if no assertion failed.
    :raises AssertionError: If the start or the end points of the two stems
                            are 0.1 or more apart.
    """
    try:
        pdb_stem = _define_to_stem_model(cg, chains, elem_name)
    except Exception as e:
        # Attach the residue ids of all chains to the exception for debugging.
        with log_to_exception(log, e):
            for chain in chains.values():
                log.error([r.id for r in chain.get_residues()])
        raise
    d_start = ftuv.magnitude(pdb_stem.mids[0] - target_stem.mids[0])
    d_end = ftuv.magnitude(pdb_stem.mids[1] - target_stem.mids[1])
    assert d_start < 0.1, "Distance between stem starts {} and {} is too big: {}".format(
        pdb_stem.mids[0], target_stem.mids[0], d_start)
    # The end distance has to be checked here (not the start distance again).
    assert d_end < 0.1, "Distance between stem ends {} and {} is too big: {}".format(
        pdb_stem.mids[1], target_stem.mids[1], d_end)
    tw1_polar_pdb = ftuv.spherical_cartesian_to_polar(pdb_stem.twists[0])
    tw1_polar_target = ftuv.spherical_cartesian_to_polar(target_stem.twists[0])
    d_twist_u = abs(tw1_polar_pdb[1] - tw1_polar_target[1])
    d_twist_v = abs(tw1_polar_pdb[2] - tw1_polar_target[2])
    # Twist deviations are only reported, they never fail the validation.
    if d_twist_u > 0.01:
        log.warning("Deviation of twist angle u too big for %s: %s", elem_name,
                    d_twist_u)
    if d_twist_v > 0.01:
        log.warning("Deviation of twist angle v too big for %s: %s", elem_name,
                    d_twist_v)
    return True
예제 #4
0
 def _getitem(self, key, include_missing=False, show_modifications=False):
     """
     Look up a single nucleotide (or a slice) of the sequence.

     :param key: An int (converted with to_0_based before it is used as
                 index), a fgr.RESID or a slice (delegated to self._getslice).
     :param include_missing: If false, a RESID that is only found in
                             self._missing_nts raises an IndexError.
     :param show_modifications: If true, return the entry of
                                self._modifications for the residue whenever
                                there is one.
     :returns: The entry of self._seq, self._missing_nts or
               self._modifications for the key, or the result of
               self._getslice for a slice.
     :raises IndexError: If a RESID is neither in self._seqids nor in
                         self._missing_nts, or only in self._missing_nts
                         while include_missing is false.
     :raises TypeError: If key is neither an int, a fgr.RESID nor a slice.
     """
     log.debug("_getitem called for %s, include_missing=%s, show_modifications=%s",
               key, include_missing, show_modifications)
     if isinstance(key, int):
         key = to_0_based(key)
         if show_modifications and self._seqids[key] in self._modifications:
             return self._modifications[self._seqids[key]]
         else:
             return self._seq[key]
     elif isinstance(key, fgr.RESID):
         try:
             i = self._seqids.index(key)
         except ValueError:
             # Not a resolved residue: it may still be a known missing one.
             if key in self._missing_nts:
                 nt = self._missing_nts[key]
                 if not include_missing:
                     # The literals need explicit spaces between them,
                     # otherwise the sentences run into each other.
                     raise IndexError("No structure available for nucleotide '{}'. "
                                      "For look-up including missing residues, use "
                                      "`.with_missing[key]`".format(key))
                 if show_modifications and key in self._modifications:
                     return self._modifications[key]
                 return nt
             error = IndexError(
                 "Nucleotide {} is not part of this RNA".format(key))
             with log_to_exception(log, error):
                 log.error("self._missing_nts = %s", self._missing_nts)
             raise error
         else:
             if show_modifications and key in self._modifications:
                 return self._modifications[key]
             return self._seq[i]
     elif isinstance(key, slice):
         return self._getslice(key, include_missing, show_modifications)
     else:
         raise TypeError("Wrong index type: {}".format(type(key).__name__))
예제 #5
0
파일: _dssr.py 프로젝트: pkerpedjiev/forgi
    def cg_stem(self, dssr_stem):
        """
        Find the stem of the CoarseGrainRNA that belongs to a stem index of the DSSR annotation.

        :param dssr_stem: INT the index of the stem in the DSSR annotation.
        :returns: The name of the most frequently hit cg element whose name
                  starts with "s".
        :raises DSSRLookupError: If the DSSR data has no "stems" entry or no
                                 stem with the requested index.
        :raises WrongChain: If the cg has chains and a paired residue is on a
                            chain that is not among them.
        :raises RuntimeError: If the pairs map to no element at all or to no
                              stem element.
        """
        log.debug("Mapping DSSR stem %s to forgi", dssr_stem)
        if "stems" not in self._dssr:
            raise DSSRLookupError("The DSSR object does not contain any stem!")
        stem_obj = next((candidate for candidate in self._dssr["stems"]
                         if candidate["index"] == dssr_stem), None)
        if stem_obj is None:
            raise DSSRLookupError("No stem with index {}".format(dssr_stem))
        log.debug("Found stem %s&%s", stem_obj["strand1"], stem_obj["strand2"])

        # Count how often each cg element is hit by a nucleotide of the stem,
        # to notice if the dssr stem maps to more than one cg element.
        element_hits = Counter()
        for pair in stem_obj["pairs"]:
            res1 = dssr_to_pdb_resid(pair["nt1"])
            res2 = dssr_to_pdb_resid(pair["nt2"])
            log.debug("Contains pair %s-%s", res1, res2)
            known_chains = self._cg.chains
            if known_chains and not (res1.chain in known_chains
                                     and res2.chain in known_chains):
                error = WrongChain()
                with log_to_exception(log, error):
                    log.error("Wrong chain: res1={}, res2={}, cg.chains={}".format(
                        res1, res2, self._cg.chains))
                raise error
            positions = [self._cg.seq.to_integer(res1),
                         self._cg.seq.to_integer(res2)]
            element_hits.update(self._cg.nucleotides_to_elements(positions))
        if not element_hits:
            raise RuntimeError(
                "No stem matching dssr_stem {}.".format(dssr_stem))

        ranked = element_hits.most_common()
        if len(ranked) > 1:
            # Build a diagnostic that underlines the nucleotides of every
            # interior loop among the hit elements.
            extra_info = ""
            for elem in element_hits.keys():
                if elem[0] != "i":
                    continue
                extra_info += "\n{} is {}:".format(
                    elem, self._cg.get_define_seq_str(elem))
                extra_info += "\n\t" + self._cg.seq + "\n\t" + \
                    self._cg.to_dotbracket_string() + "\n\t"
                resnums = list(self._cg.define_residue_num_iterator(elem))
                extra_info += "".join(
                    "^" if pos in resnums else " "
                    for pos in range(1, len(self._cg.seq) + 1))

            warnings.warn("dssr_stem {} maps to more than one cg element: {} {}".format(
                dssr_stem, list(element_hits.keys()), extra_info))

        # Return the most common element that is a stem.
        for elem_name, _hits in ranked:
            if elem_name[0] == "s":
                return elem_name
        raise RuntimeError("No stem matching dssr_stem {}, only single stranded region: {}.".format(
            dssr_stem, list(element_hits.keys())))
예제 #6
0
def read_stats_file(filename):
    """
    Open a stats file and return the result of parse_stats_file for it.

    :param filename: The path of the stats file.
    :returns: Whatever parse_stats_file returns for the opened file.

    Any exception raised during parsing is re-raised after the filename has
    been attached to it as a log message.
    """
    log.info("Reading stats-file %s", filename)
    with open(filename) as stats_handle:
        try:
            parsed = parse_stats_file(stats_handle)
        except Exception as error:
            with log_to_exception(log, error):
                log.error("Failed to parse file %s", filename)
            raise
    return parsed
예제 #7
0
def parse_stats_file(file_handle):
    """
    Parse the lines of a stats file into stat objects, grouped by type.

    Everything after a '#' is ignored, and so are empty lines.

    :param file_handle: An iterable of lines.
    :returns: A dict with the keys "stem", "angle", "loop", "3prime" and
              "5prime". Each value is a defaultdict(list); stem and loop
              stats are stored under their bp_length, angle stats under
              (dim1, dim2, ang_type).
    :raises ValueError: If a line starts with an unknown keyword.
    """
    stats = {
        stat_type: defaultdict(list)
        for stat_type in ("stem", "angle", "loop", "3prime", "5prime")
    }
    for raw_line in file_handle:
        # Strip surrounding whitespace first, then cut off a trailing comment.
        line = raw_line.strip().split('#')[0]
        if not line:
            continue

        if line.startswith("stem"):
            stem_stat = ftmstats.StemStat(line)
            stats["stem"][stem_stat.bp_length].append(stem_stat)
            continue

        if line.startswith(("angle", "open", "pseudo")):
            angle_stat = ftmstats.AngleStat()
            try:
                angle_stat.parse_line(line)
            except Exception as error:
                with log_to_exception(log, error):
                    log.error(
                        "Could not parse file due to error parsing line '{}'".
                        format(line))
                raise
            if len(angle_stat.define) > 0 and angle_stat.define[0] == 1:
                # An angle at the beginning of a structure. This should not
                # happen unless the stats stem from faulty bulge graphs.
                log.error(
                    "Ignoring angle stat {} because it is at the beginning of a structure."
                    " Does the stat come from a faulty BulgeGraph?".format(
                        angle_stat.pdb_name))
                continue
            angle_stat.ang_type = patch_angtype(angle_stat.ang_type)
            log.debug(
                "Reading angle_stat with dimensions %s and %s, and type %s. With define %s",
                angle_stat.dim1, angle_stat.dim2, angle_stat.ang_type,
                angle_stat.define)
            angle_key = (angle_stat.dim1, angle_stat.dim2, angle_stat.ang_type)
            # The stat is deliberately not stored a second time under the
            # negated ang_type: adding the reverse produced a lot of
            # structures that do not fulfill the constraint energy.
            # Note that CoarseGrainRNA.get_stats extracts two angle stats
            # per angle.
            stats["angle"][angle_key].append(angle_stat)
            continue

        key = line.split()[0]
        if key not in ("3prime", "5prime", "loop"):
            raise ValueError(
                "Illegal line in stats file: '{}'".format(line))
        loop_stat = ftmstats.LoopStat(line)
        stats[key][loop_stat.bp_length].append(loop_stat)
    return stats
예제 #8
0
def _safe_resid_from_chain_res(chain, residue):
    """
    Build a resid from a chain and a residue via fgr.resid_from_str.

    :param chain: The chain; used as the part before the ':'.
    :param residue: The residue as string; used as the part after the ':'.
    :returns: The result of fgr.resid_from_str, or None (after a warning),
              if the conversion fails and residue is not purely numeric.
    :raises ValueError: If the conversion fails although residue consists
                        only of digits.
    """
    try:
        return fgr.resid_from_str(str("{}:{}".format(chain, residue)))
    except ValueError as error:
        if not residue.isdigit():
            # A non-numeric residue is tolerated with a warning.
            warnings.warn("Illegal residue number: '{}'.".format(residue))
            return None
        with log_to_exception(log, error):
            log.error("Chain is '{}', res is '{}'".format(chain, residue))
        raise
예제 #9
0
def _safe_resid_from_chain_res(chain, residue):
    """
    Convert "chain:residue" to a resid with fgr.resid_from_str.

    A ValueError during the conversion is only propagated (with chain and
    residue attached as log message), if residue consists solely of digits.
    Otherwise a warning is issued and None is returned.

    :param chain: The chain part of the resid string.
    :param residue: The residue part of the resid string (a string).
    """
    try:
        resid = fgr.resid_from_str(str("{}:{}".format(chain, residue)))
    except ValueError as conversion_error:
        numeric = residue.isdigit()
        if numeric:
            with log_to_exception(log, conversion_error):
                log.error("Chain is '{}', res is '{}'".format(chain, residue))
            raise
        warnings.warn("Illegal residue number: '{}'.".format(residue))
        return None
    else:
        return resid
예제 #10
0
def get_dotplot(lines):
    """
    Convert the lines of an MC-Annotate output to a bpseq string and seq_ids.

    Only Ww/Ww interactions between A-U, U-A, C-G or G-C, Ws/Ww interactions
    with U-G and Ww/Ws interactions with G-U are used as base-pairs. If one
    of the two residues is already paired, the interaction is ignored with a
    warning.

    :param lines: The lines, as understood by iterate_over_residue_list and
                  iterate_over_interactions.
    :returns: A tuple (bpseq_str, seq_ids). bpseq_str has one line
              "number residue_type partner" per residue (partner is 0 for
              unpaired residues); seq_ids is the result of
              _seqids_from_residue_map.
    """
    residues = []  # One parse_chain_base result per residue
    residue_types = []
    for entry in iterate_over_residue_list(lines):
        fields = entry.split(' ')
        residues.append(parse_chain_base(fields[0]))
        residue_types.append(fields[2])

    # Maps a residue to the index of its partner in residues (-1: unpaired).
    bps = defaultdict(lambda: -1)
    paired = set()
    for line in iterate_over_interactions(lines):
        parts = line.split(' ')
        watson_crick = 'Ww/Ww' in line and (
            'A-U' in line or 'U-A' in line or 'C-G' in line or 'G-C' in line)
        wobble = (('Ws/Ww' in line and 'U-G' in line)
                  or ('Ww/Ws' in line and 'G-U' in line))
        if not (watson_crick or wobble):
            continue

        chain1, base1, chain2, base2 = parse_base_pair_id(parts[0])
        res1 = (chain1, base1)
        res2 = (chain2, base2)
        if res1 in paired or res2 in paired:
            # Building the description is skipped, if it would not be logged.
            if log.isEnabledFor(logging.WARNING):
                taken = res1 if res1 in bps else res2
                existing = "{} - {}".format(taken, residues[bps[taken]])
                log.warning(
                    "Base-triple encountered: Ignoring basepair %s - %s, because basepair %s exists",
                    res1, res2, existing)
            continue

        paired.update((res1, res2))
        try:
            bps[res1] = residues.index(res2)
            bps[res2] = residues.index(res1)
        except ValueError as error:
            with log_to_exception(log, error):
                log.error("bps = %s, residues = %s, res1 = %s, res2 = %s",
                          bps, residues, res1, res2)
            raise

    # bps holds 0-based indices, the bpseq format counts from 1.
    bpseq_str = "".join(
        "%d %s %s\n" % (index + 1, residue_type, bps[residues[index]] + 1)
        for index, residue_type in enumerate(residue_types))
    seq_ids = _seqids_from_residue_map(residues)
    return bpseq_str, seq_ids
예제 #11
0
def raise_error_contextmngr2(value):
    """
    Log once at every level inside a log_to_exception context, then raise.

    :param value: The value that is interpolated into every message logged
                  inside the context.
    :raises ValueError: Always; it is raised from inside the with-context.
    """
    logger = logging.getLogger("main.inside_ctxt2")
    error = ValueError("Another ValueError")
    logger.info("Before with-context. This is logged directly")
    with logging_exceptions.log_to_exception(logger, error):
        leveled_messages = (
            (logger.debug, "This is DEBUG ... %s"),
            (logger.info, "This is an INFO ... %s"),
            (logger.warning, "This is a WARNING ... %s"),
            (logger.error, "This is an ERROR ... %s"),
            (logger.critical, "This is CRITICAL ... %s"),
        )
        for emit, template in leveled_messages:
            emit(template, value)
        logger.info("Raising inside with context")
        raise error
예제 #12
0
 def _get_fragment(self, stat, sm):
     """
     Get the cg, the element name and the fragment chains for a stat.

     A fragment stored as cif file below self.LIBRARY_DIRECTORY is used, if
     it can be loaded. Otherwise the fragment is extracted from the source
     structure and (if self.LIBRARY_DIRECTORY is not None) stored for later.

     :param stat: The stat; its pdb_name and define are used.
     :param sm: Passed on to self._get_source_cg_and_chain.
     :returns: A tuple (cg, elem, fragment).
     :raises ValueError: If stat.define differs from the define of the
                         element in the cg.
     """
     key = stat.pdb_name + "__def_" + "-".join(map(str, stat.define))
     new_fragment = False
     try:
         fragment, _, _ = ftup.get_all_chains(op.join(
             self.LIBRARY_DIRECTORY, key[2:4], key + ".cif"),
                                              no_annotation=True)
     except Exception:
         # Any failure to load the stored fragment (presumably including
         # LIBRARY_DIRECTORY being None -- TODO confirm) triggers extraction
         # from the source files.
         cg, chains = self._get_source_cg_and_chain(stat, sm)
         new_fragment = True
     else:
         fragment = {c.id: c for c in fragment}
         log.debug("Used stored fragment for %s", key)
         pdb_basename = stat.pdb_name.split(":")[0]
         cg_filename = op.expanduser(
             op.join(self.cg_library_path, pdb_basename + ".cg"))
         cg = self.get_cg(cg_filename)  #The cg with the template
     try:
         elem = cg.get_node_from_residue_num(stat.define[0])
     except Exception:
         log.error("stat %s with define %s", stat, stat.define)
         raise
     if stat.define != cg.defines[elem]:
         err = ValueError(
             "The CG files where the stats where extracted and "
             "the cg file used for reconstruction are not consistent!")
         with log_to_exception(log, err):
             log.error("%s != %s for element %s (%s)", stat.define,
                       cg.defines[elem], elem, stat.pdb_name)
         raise err
     if new_fragment:
         fragment = ftup.extract_subchains_from_seq_ids(
             chains,
             cg.define_residue_num_iterator(elem,
                                            seq_ids=True,
                                            adjacent=(elem[0] != "s")))
         if self.LIBRARY_DIRECTORY is not None:
             log.debug("Storing newly-created fragment for %s", key)
             # distutils was removed from the standard library in
             # Python 3.12; os.makedirs creates the directory including
             # its parents and accepts that it already exists.
             import os
             os.makedirs(op.join(self.LIBRARY_DIRECTORY, key[2:4]),
                         exist_ok=True)
             ftup.output_multiple_chains(
                 fragment.values(),
                 op.join(self.LIBRARY_DIRECTORY, key[2:4], key + ".cif"),
                 "cif")
     return cg, elem, fragment
예제 #13
0
def _enumerate_background_geometries(all_cgs, cutoff_dist, aminor_geometries):
    """
    Collect the geometries of all loop-stem pairs that are not in aminor_geometries.

    Stems themselves, incomplete elements, interacting elements and stems
    that are adjacent to the loop are not considered.

    :param all_cgs: A dictionary {PDBID: [ cg1, cg2, ...]}
    :param cutoff_dist: Only pairs with a distance of at most cutoff_dist
                        are used.
    :param aminor_geometries: Geometries contained in this collection are
                              logged instead of being returned.
    :returns: A set of AMGeometry objects.
    """
    non_ame_geometries = set()
    for curr_cgs in all_cgs.values():
        for cg in curr_cgs:
            try:
                for loop in cg.defines:
                    if (loop[0] == "s" or loop in cg.incomplete_elements
                            or loop in cg.interacting_elements):
                        continue
                    for stem in cg.stem_iterator():
                        if (loop in cg.edges[stem]
                                or stem in cg.incomplete_elements
                                or stem in cg.interacting_elements):
                            continue
                        dist, angle1, angle2 = ftca.get_relative_orientation(
                            cg, loop, stem)
                        # Only interior loops get a computed flexibility.
                        flexibility = (ftca.get_loop_flexibility(cg, loop)
                                       if loop[0] == "i" else 1)
                        # A nan in any of the three values makes the sum nan.
                        if np.isnan(dist + angle1 + angle2) or dist > cutoff_dist:
                            continue
                        loop_sequence = "&".join(cg.get_define_seq_str(loop))
                        geometry = AMGeometry(cg.name, loop, stem, dist,
                                              angle1, angle2, loop_sequence,
                                              1000, "no_interaction",
                                              flexibility)
                        if geometry in aminor_geometries:
                            log.info(
                                "Geometry %s is in aminor_geometries", geometry)
                        else:
                            non_ame_geometries.add(geometry)
            except BaseException as error:
                with log_to_exception(log, error):
                    log.error(
                        "An Error occurred during processing of cg: %s", cg.name)
                raise
    log.error("%s non_ame geometries found", len(non_ame_geometries))
    return non_ame_geometries
예제 #14
0
def _enumerate_background_geometries(all_cgs, cutoff_dist, aminor_geometries):
    """
    Enumerate the loop-stem geometries that are no known A-minor geometries.

    For every cg, each element that is no stem is combined with each stem
    that is not connected to it. Elements listed as incomplete or
    interacting are skipped. A geometry is only kept, if its distance is at
    most cutoff_dist and it is not contained in aminor_geometries.

    :param all_cgs: A dictionary {PDBID: [ cg1, cg2, ...]}
    :param cutoff_dist: The maximal distance between loop and stem.
    :param aminor_geometries: The geometries to leave out of the result.
    :returns: The set of the remaining AMGeometry objects.
    """
    background = set()
    for cg_list in all_cgs.values():
        for cg in cg_list:
            try:
                for loop in cg.defines:
                    if loop[0] == "s":
                        continue
                    if (loop in cg.incomplete_elements
                            or loop in cg.interacting_elements):
                        continue
                    is_interior = loop[0] == "i"
                    for stem in cg.stem_iterator():
                        if loop in cg.edges[stem]:
                            continue
                        if (stem in cg.incomplete_elements
                                or stem in cg.interacting_elements):
                            continue
                        orientation = ftca.get_relative_orientation(
                            cg, loop, stem)
                        dist, angle1, angle2 = orientation
                        if is_interior:
                            flexibility = ftca.get_loop_flexibility(cg, loop)
                        else:
                            flexibility = 1
                        usable = not np.isnan(dist + angle1 + angle2)
                        if usable and dist <= cutoff_dist:
                            geometry = AMGeometry(
                                cg.name, loop, stem, dist, angle1, angle2,
                                "&".join(cg.get_define_seq_str(loop)), 1000,
                                "no_interaction", flexibility)
                            if geometry not in aminor_geometries:
                                background.add(geometry)
                            else:
                                log.info("Geometry %s is in aminor_geometries",
                                         geometry)
            except BaseException as error:
                # The name of the offending cg is attached to whatever is raised.
                with log_to_exception(log, error):
                    log.error("An Error occurred during processing of cg: %s",
                              cg.name)
                raise
    log.error("%s non_ame geometries found", len(background))
    return background
예제 #15
0
def split_at_cofold_cutpoints(bg, cutpoints):
    """
    Multiple sequences should not be connected along the backbone.

    We have constructed the bulge graph, as if they were connected along the backbone, so
    now we have to split it.

    :param bg: The bulge graph; it is modified in place.
    :param cutpoints: An iterable of residue numbers. The graph is split
                      between each cutpoint and the following residue.
    :raises GraphConstructionError: If a split is requested inside an "f" or
                                    "t" element, or if the graph is not
                                    connected after all splits.
    """

    for splitpoint in cutpoints:
        element_left = bg.get_node_from_residue_num(splitpoint)
        element_right = bg.get_node_from_residue_num(splitpoint + 1)
        if element_left[0] in "ft" or element_right[0] in "ft":
            # The left check has to look at the right element (mirroring the
            # check below); comparing element_left with itself can never
            # be true.
            if element_left[0] == "t" and element_right[0] != "t":
                continue  # Splitpoint already implemented
            elif element_right[0] == "f" and element_left[0] != "f":
                continue  # Splitpoint already implemented
            else:
                # No cofold structure. First sequence is disconnected from rest
                e = GraphConstructionError(
                    "Cannot create BulgeGraph. Found two sequences not "
                    "connected by any base-pair.")
                with log_to_exception(log, e):
                    log.error("Trying to split between %s and %s",
                              element_left, element_right)
                raise e
        elif element_left[0] == "i" or element_right[0] == "i":
            _split_interior_loop(bg, splitpoint, element_left, element_right)
        elif element_left != element_right:
            _split_between_elements(bg, splitpoint, element_left,
                                    element_right)
        elif element_left[0] == "s":
            _split_inside_stem(bg, splitpoint, element_left)
        else:
            _split_inside_loop(bg, splitpoint, element_left)
        # Reset the mapping after every split that changed the graph.
        bg._node_to_resnum = {}
    if not _is_connected(bg):
        raise GraphConstructionError(
            "Cannot create BulgeGraph. Found two sequences not connected by any "
            "base-pair.")
예제 #16
0
    def _get_source_cg_and_chain(self, stat, sm):
        """
        Load the fragment defined in the stat from the fragment library as pdb and cg.

        :param stat: The forgi.threedee.model.stats.StemStat or ftms.AngleStat or ftms.LoopStat object.
        :param sm: The SpatialModel to reconstruct. Used, if it contains stats not sampled but loaded directly.
        :returns: A tuple (cg, chains).

        If neither the pdb nor the cif file, or if the cg file cannot be
        opened, the exception raised by open is propagated.
        """
        stat_name = stat.pdb_name
        if stat_name == sm.bg.name and sm.bg.chains:
            # The stat comes from the structure that is being reconstructed.
            return sm.bg, sm.bg.chains

        pdb_basename = stat_name.split(":")[0]
        pdb_filename = op.expanduser(
            op.join(self.pdb_library_path,
                    "_".join(pdb_basename.split("_")[:-1]) + ".pdb"))
        cg_filename = op.expanduser(
            op.join(self.cg_library_path, pdb_basename + ".cg"))
        #Make sure the files exist.
        try:
            try:
                with open(pdb_filename):
                    pass
            except IOError:
                # Fall back to the cif file. The suffix is cut off by
                # slicing, because str.rstrip(".pdb") removes ALL trailing
                # '.', 'p', 'd' and 'b' characters and thereby corrupts
                # names like "1abd.pdb".
                pdb_filename = pdb_filename[:-len(".pdb")] + ".cif"
                with open(pdb_filename):
                    pass
            with open(cg_filename):
                pass
        except Exception as e:
            with log_to_exception(log, e):
                log.error("Failed to open files for stat %s", stat.pdb_name)
            raise
        log.debug("Opening cg-file %s to extract stat %s", cg_filename,
                  stat.pdb_name)
        cg = self.get_cg(cg_filename)  #The cg with the template
        chains = self.get_pdb(pdb_filename, store=self.store)

        return cg, chains
예제 #17
0
def split_at_cofold_cutpoints(bg, cutpoints):
    """
    Multiple sequences should not be connected along the backbone.

    We have constructed the bulge graph, as if they were connected along the backbone, so
    now we have to split it.

    :param bg: The bulge graph; it is modified in place.
    :param cutpoints: An iterable of residue numbers. The graph is split
                      between each cutpoint and the following residue.
    :raises GraphConstructionError: If a split is requested inside an "f" or
                                    "t" element, or if the graph is not
                                    connected after all splits.
    """

    for splitpoint in cutpoints:
        element_left = bg.get_node_from_residue_num(splitpoint)
        element_right = bg.get_node_from_residue_num(splitpoint + 1)
        if element_left[0] in "ft" or element_right[0] in "ft":
            # Compare the left element against the right one (mirroring the
            # check below); the former self-comparison of element_left was
            # always false.
            if element_left[0] == "t" and element_right[0] != "t":
                continue  # Splitpoint already implemented
            elif element_right[0] == "f" and element_left[0] != "f":
                continue  # Splitpoint already implemented
            else:
                # No cofold structure. First sequence is disconnected from rest
                e = GraphConstructionError("Cannot create BulgeGraph. Found two sequences not "
                                           "connected by any base-pair.")
                with log_to_exception(log, e):
                    log.error("Trying to split between %s and %s",
                              element_left, element_right)
                raise e
        elif element_left[0] == "i" or element_right[0] == "i":
            _split_interior_loop(bg, splitpoint, element_left, element_right)
        elif element_left != element_right:
            _split_between_elements(
                        bg, splitpoint, element_left, element_right)
        elif element_left[0] == "s":
            _split_inside_stem(bg, splitpoint, element_left)
        else:
            _split_inside_loop(bg, splitpoint, element_left)
        # Reset the mapping after every split that changed the graph.
        bg._node_to_resnum = {}
    if not _is_connected(bg):
        raise GraphConstructionError("Cannot create BulgeGraph. Found two sequences not connected by any "
                                     "base-pair.")
예제 #18
0
def output_multiple_chains(chains, filename):
    '''
    Write several chains into one PDB file, leaving out the hydrogen atoms.

    :param chains: An iterable of Bio.PDB.Chain to dump.
    :param filename: The place to dump it.

    If saving fails, the ids of all residues and chains are attached to the
    exception as log messages before it is re-raised.
    '''
    class HSelect(bpdb.Select):
        def accept_atom(self, atom):
            # Every atom with an 'H' in its name is filtered out.
            return 'H' not in atom.name

    structure = bpdb.Structure.Structure(' ')
    model = bpdb.Model.Model(' ')
    for chain in chains:
        log.debug("Adding chain %s with %s residues", chain.id, len(chain))
        model.add(chain)
    structure.add(model)

    writer = bpdb.PDBIO()
    writer.set_structure(structure)
    try:
        writer.save(filename, HSelect())
    except Exception as error:
        with log_to_exception(log, error):
            residue_ids = [
                residue.get_id()
                for residue in bpdb.Selection.unfold_entities(model, 'R')
            ]
            log.error("Could not output PDB with residues:")
            log.error(residue_ids)
            chain_ids = [
                chain.get_id()
                for chain in bpdb.Selection.unfold_entities(model, 'C')
            ]
            log.error(" in chains:")
            log.error(chain_ids)
        raise
예제 #19
0
         key = {"name": cg.name, "filename": filenames[i]}
         if args.per_ml:
             new_data = describe_ml_segments(cg)
             for i in range(len(new_data["segment"])):
                 for k, v in key.items():
                     data[k].append(v)
                 for k, v in new_data.items():
                     data[k].append(v[i])
         else:
             new_data = describe_rna(cg, file_num, dist_pairs, angle_pairs)
             for k, v in key.items():
                 data[k].append(v)
             for k, v in new_data.items():
                 data[k].append(v)
     except Exception as e:
         with log_to_exception(log, e):
             log.error(
                 "Error occurred during describing %d%s cg %s", file_num, {
                     1: "st",
                     2: "nd",
                     3: "rd"
                 }.get(file_num % 10 * (file_num % 100 not in [11, 12, 13]),
                       "th"), cg.name)
         raise
 if args.keys:
     allowed_keys = args.keys.split(",") + ["name"]
     for key in list(data.keys()):
         if key not in allowed_keys:
             del data[key]
 df = pd.DataFrame(data)
 df.set_index("name", append=True, inplace=True)
예제 #20
0
def load_rna(filename,
             rna_type="any",
             allow_many=True,
             pdb_chain=None,
             pdb_remove_pk=True,
             pdb_dotbracket="",
             dissolve_length_one_stems=True,
             pdb_annotation_tool=None,
             pdb_allow_www_query=False):
    """
    Load RNA structure(s) from a file or from a bare dotbracket string.

    The type of the file is detected with ``sniff_filetype`` and the file is
    then handed to the matching loader (forgi, pdb/cif, bpseq, fasta-like).

    :param filename: Path of the input file. If it consists solely of the
                     characters ``.()[]{}&`` and rna_type is "any", it is
                     treated as a dotbracket string and no file is opened.
    :param rna_type: One of "any", "only_cg", "3d" and "pdb"

                     *  "any": Accept every supported input format.
                     *  "only_cg": Only accept forgi cg-files, which must
                                   contain all 3D coordinates.
                     *  "3d":  Reject bpseq and fasta-like files; forgi files
                               must contain all 3D coordinates.
                     *  "pdb": Only accept pdb/cif files.

    :param allow_many: If True, return a list. If False, return a single
                       object or raise a WrongFileFormat if more than one
                       RNA is present.
    :param pdb_chain: Load only the given chain(s).
                      Only used for pdb/cif files.
    :param pdb_remove_pk: Passed on as `remove_pseudoknots` for pdb/cif files.
                          Disabled if a pdb_dotbracket is given.
    :param pdb_dotbracket: Secondary structure to use for a pdb/cif file.
                           Requires pdb_chain to be given.
    :param dissolve_length_one_stems: Passed on to the respective loader.
                                      Not used for forgi files.
    :param pdb_annotation_tool: Passed on as `annotation_tool` for pdb/cif
                                files. Ignored for other file-types.
    :param pdb_allow_www_query: Passed on as `query_PDBeChem` for pdb/cif
                                files. Ignored for other file-types.

    :returns: A list of RNAs (allow_many=True) or a single RNA.
              None, if the sniffed filetype is not handled by any loader.
    :raises WrongFileFormat: If the file does not satisfy rna_type, or if
                             allow_many is False and several RNAs were found.
    :raises ValueError: If pdb_dotbracket is given without pdb_chain.
    """
    # A "filename" made up only of bracket characters is probably a
    # dotbracket string that was passed on the commandline.
    if all(char in ".()[]{}&" for char in filename):
        if rna_type == "any":
            log.info("Assuming RNA %s is a dotbracketstring and not a file.",
                     filename)
            graph = fgb.BulgeGraph.from_dotbracket(
                filename, dissolve_length_one_stems=dissolve_length_one_stems)
            return [graph] if allow_many else graph
        # Without a sequence, no other rna_type can be satisfied.
        warnings.warn(
            "Cannot treat '{}' as dotbracket string, since we need a sequence. "
            "Trying to treat it as a filename instead...".format(filename))

    with open(filename) as rnafile:
        filetype = sniff_filetype(rnafile)

    # Enforce the restrictions on the file type first.
    if rna_type == "pdb" and filetype not in ("pdb", "cif"):
        raise WrongFileFormat(
            "Only PDB files (*.pdb/.cif) are accepted, but file {} has type {}."
            .format(filename, filetype))
    if rna_type == "only_cg" and filetype != "forgi":
        raise WrongFileFormat(
            "Only forgi cg files are accepted, but file {} has type {}.".
            format(filename, filetype))

    if filetype == "forgi":
        cg = ftmc.CoarseGrainRNA.from_bg_file(filename)
        coords_required = rna_type in ("3d", "only_cg")
        if coords_required and not cg.coords.is_filled:  # pylint: disable=E1101
            raise WrongFileFormat(
                "File {} does not contain all 3D coordinates!".format(
                    filename))
        return [cg] if allow_many else cg

    if filetype in ("pdb", "cif"):
        # Keyword arguments shared by both ways of loading a pdb/cif file.
        shared_kwargs = dict(
            dissolve_length_one_stems=dissolve_length_one_stems,
            filetype=filetype,
            annotation_tool=pdb_annotation_tool,
            query_PDBeChem=pdb_allow_www_query)
        if pdb_chain:
            cgs = ftmc.CoarseGrainRNA.from_pdb(
                filename,
                load_chains=pdb_chain,
                # An explicitly given structure is used as it is.
                remove_pseudoknots=pdb_remove_pk and not pdb_dotbracket,
                secondary_structure=pdb_dotbracket,
                **shared_kwargs)
        elif pdb_dotbracket:
            raise ValueError(
                "pdb_dotbracket requires a chain to be given to avoid ambiguity."
            )
        else:
            cgs = ftmc.CoarseGrainRNA.from_pdb(
                filename,
                remove_pseudoknots=pdb_remove_pk,
                **shared_kwargs)
        if allow_many:
            return cgs
        if len(cgs) > 1:
            raise WrongFileFormat(
                "More than one connected RNA component in pdb file {}: {}".
                format(filename, [cg.name for cg in cgs]))
        return cgs[0]

    if filetype == "bpseq":
        if rna_type == "3d":
            raise WrongFileFormat(
                "bpseq file {} is not supported. We need 3D coordinates!".
                format(filename))
        with open(filename, 'r') as bpseq_file:
            text = bpseq_file.read()
            try:
                int(text[0])
            except ValueError:
                # The file does not start with a digit: drop everything
                # up to the line that starts with "1 ".
                first_record = text.find("\n1 ")
                text = text[first_record + 1:]
        graph = ftmc.CoarseGrainRNA.from_bpseq_str(
            text, dissolve_length_one_stems=dissolve_length_one_stems)
        return [graph] if allow_many else graph

    if filetype in ("fasta", "other"):
        if rna_type == "3d":
            raise WrongFileFormat(
                "Fasta(like) file {} is not supported. We need 3D coordinates!"
                .format(filename))
        try:
            graphs = ftmc.CoarseGrainRNA.from_fasta(
                filename, dissolve_length_one_stems=dissolve_length_one_stems)
        except Exception as e:
            # Attach additional log messages to the exception and re-raise.
            with log_to_exception(log, e):
                log.critical("Could not parse file %r.", filename)
                if filetype == "other":
                    log.critical(
                        "We assumed file %r to be some fasta-variant or dotbracket file, but an error occurred during parsing.",
                        filename)
            raise
        if allow_many:
            return graphs
        if len(graphs) > 1:
            raise WrongFileFormat(
                "More than one RNA found in fasta/ dotbracket file {}.".
                format(filename))
        return graphs[0]
예제 #21
0
    def _parse(self, filepath):
        """
        Parse a tab-separated file with '#'-prefixed meta lines.

        The first line that is neither empty nor a comment is the header
        line. All following such lines are data rows: column 0 is converted
        to int, column 1 to float, and every column is additionally handled
        according to the matching entry of `self._collectors`.
        The entries of the last column are finally split into their move
        information (see the last loop).

        :param filepath: The path of the file to parse.
        :returns: A dictionary mapping column names to lists of values.
        """
        # NOTE(review): meta is filled, but it is neither stored nor returned.
        meta = {}
        with open(filepath) as infile:
            headers = None
            columns = None
            for line_no, line in enumerate(infile):
                try:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith("# Random Seed:"):
                        meta["seed"] = int(line.split()[-1])
                    elif line.startswith("# Command"):
                        meta["command"] = line.split('`')[1]
                    elif line.startswith("# Version"):
                        version_fields = line.split()
                        meta["ernwin_version"] = version_fields[3].rstrip(",")
                        meta["forgi_version"] = version_fields[5]
                    elif line.startswith("#"):
                        # Any other comment line
                        continue
                    elif headers is None:
                        # The header line: Set up one empty list per column
                        headers = line.split("\t")
                        self._init_collector_lookup(headers)
                        columns = [[] for _ in headers]
                    else:
                        for col, raw in enumerate(line.split('\t')):
                            if col == 0:  # Step
                                columns[col].append(int(raw))
                            elif col == 1:  # Sampling_Energy
                                columns[col].append(float(raw))

                            collector = self._collectors[col]
                            if collector == "Sampling Move":
                                columns[col].append(raw)
                            elif collector is not None:
                                columns[col].append(
                                    collector.parse_value(raw))

                except Exception as e:
                    # Attach the offending line to the exception, re-raise.
                    with log_to_exception(log, e):
                        log.error(
                            "Exception occurred during parsing of line %d '%s'",
                            line_no, line)
                    raise
            data_dic = {}
            for header, column in zip(headers, columns):
                if not column:
                    continue
                if isinstance(column[0], tuple):
                    # Tuple values: The first element of the first value
                    # becomes part of the key, the second elements are stored.
                    key = "{}_{}".format(header, column[0][0])
                    data_dic[key] = [value[1] for value in column]
                else:
                    data_dic[header] = column
            for key in ("move_type", "accepted", "delta_E", "stats_moved"):
                data_dic[key] = []

            # Entries of the last column are split at the last ';' and the
            # part before it is split at the first ':'.
            for entry in columns[-1]:
                move, _, accepted = entry.rpartition(";")
                typ, _, details = move.partition(":")
                data_dic["accepted"].append(accepted)
                if typ == "RE":
                    data_dic["delta_E"].append(float("nan"))
                    data_dic["move_type"].append("RE")
                    data_dic["stats_moved"].append(float("nan"))
                else:
                    self.update_data_move(data_dic, typ, details)
        return data_dic
예제 #22
0
def mend_breakpoints(chains, gap):
    """
    Mend breaks in the backbone of the chains using ModeRNA.

    :param chains: A dict {chain_id: chain}.
    :param gap: A list of pairs of res_ids. The backbone between the two
                residues of each pair is fixed. Both res_ids need a `chain`
                attribute; pairs spanning two chains are skipped with a
                warning.
    :returns: A dict {chain_id: chain}. If ModeRNA is not installed, a
              warning is issued and `chains` is returned unchanged.
    """
    try:
        import moderna
    except ImportError:
        warnings.warn(
            "Cannot mend gaps in sequence, because ModeRNA is not installed!")
        return chains
    mod_models = {}
    # NOTE(review): tmpdir is no longer used inside this block;
    # the context manager is kept to leave the side effects unchanged.
    with fus.make_temp_directory() as tmpdir:
        log.info("Writing chains %s", chains.values())

        for g in gap:
            if g[0].chain != g[1].chain:
                log.warning(
                    "Not mending gap between multiple chains: %s and %s", g[0],
                    g[1])
                continue
            # Load every chain only once into ModeRNA
            if g[0].chain not in mod_models:
                try:
                    mod_models[g[0].chain] = moderna.load_model(
                        chains[g[0].chain], data_type="chain")
                except Exception as e:
                    with log_to_exception(log, e):
                        log.error("g is %s, g[0] is %s, g[0].chain is %s", g,
                                  g[0], g[0].chain)
                        log.error("chains is %s", chains)
                    raise
            moderna.fix_backbone(mod_models[g[0].chain],
                                 resid_to_moderna(g[0]),
                                 resid_to_moderna(g[1]))
        # Collect the mended models and the untouched chains
        mended_chains = {}
        for chain_id in chains.keys():
            if chain_id in mod_models:
                mended_chains[chain_id] = mod_models[
                    chain_id]  #Mod models are chain subclasses anyway
                # The placeholder was missing here, which made the logging
                # module report a formatting error instead of the message.
                log.info("Mended: %s", mended_chains)
                mended_chains[chain_id].id = chain_id
            else:
                mended_chains[chain_id] = chains[chain_id]
    log.info("mended_chains: %s", mended_chains)
    # Moderna may replace modified residues with "UNK" for unknown or otherwise change the code.
    # We have to replace them back.
    for chain_id in chains:
        for res in mended_chains[chain_id]:
            changed = False
            for o_res in chains[chain_id]:
                # Match on number and insertion code, ignore the hetero-flag
                if o_res.id[1:] == res.id[1:]:
                    log.debug("Changing Moderna residue %s to %s", res, o_res)
                    assert not changed  #Only one residue per number+icode
                    res.id = o_res.id
                    res.resname = o_res.resname
                    log.debug("Moderna residue now %s", res)
                    changed = True
    # Convert back from ModeRNA to Biopython
    out_chains = {}
    for k, v in mended_chains.items():
        s = v.get_structure()[0]
        # Purely diagnostic output, thus not logged as an error.
        log.debug("%s, %s %s", k, s, s.child_dict)
        assert len(s.child_list) == 1
        out_chains[k] = s.child_list[0]
        out_chains[k].id = k
    return out_chains
예제 #23
0
def insert_element(cg_to, cg_from, elem_to, elem_from, chains_to, chains_from,
                   angle_type):
    '''
    Take an element (elem_from) from one dict of chains (chains_from, cg_from) and
    insert it on the new chain while aligning on the adjoining elements.

    The neighboring elements need to be present in chain_to in order
    for the next element to be aligned to their starting and ending
    positions.

    The type (first letter of the element name) of elem_to and elem_from
    needs to be identical. Stems are not allowed.

    This method aligns the flanking base pairs on both ends
    (except for 3' and 5' elements) of the fragment with the respective
    base-pairs in the stem-scaffold. This means that there will be
    equally big breaks in the chain on both sides of the fragment.

    The residues of the fragment are detached from their chain, renumbered
    and added to chains_to, which is modified in place. Missing chains are
    added to chains_to.

    :param cg_to: The coarse-grain representation of the target chain
    :param cg_from: The coarse-grain representation of the source chain
    :param elem_to: The element to replace
    :param elem_from: The source element
    :param chains_to: A dict chainid:chain. The chains to graft onto
    :param chains_from: A dict chainid:chain. The chains to excise from
    :param angle_type: Only used for log messages.

    :returns: a list of pairs [resid, resid] describing the gaps to mend
    :raises NotImplementedError: For fragments of inconsistent length, which
                                 this function cannot handle.
    '''
    log.info("Inserting element %s", elem_to)
    assert elem_from[0] == elem_to[0], "{}[0]!={}[0]".format(
        elem_from, elem_to)
    # The define of the loop with adjacent nucleotides (if present) in both cgs
    define_a_to = cg_to.define_a(elem_to)
    define_a_from = cg_from.define_a(elem_from)
    assert len(define_a_to) == len(define_a_from)
    # For every position in define_a_from: Is it part of the element itself?
    nt_in_define_from = [
        x in cg_from.defines[elem_from] for x in define_a_from
    ]

    # The defines translated to seq_ids.
    closing_bps_to = []
    closing_bps_from = []

    log.debug("Angle type is %s", angle_type)
    for nt in define_a_to:
        closing_bps_to.append(cg_to.seq.to_resid(nt))
    for nt in define_a_from:
        closing_bps_from.append(cg_from.seq.to_resid(nt))
    # Seq_ids of all nucleotides in the loop that will be inserted
    seq_ids_a_from = []
    for i in range(0, len(define_a_from), 2):
        for nt in range(define_a_from[i], define_a_from[i + 1] + 1):
            seq_ids_a_from.append(cg_from.seq.to_resid(nt))
    log.debug("seqids_a from %s", seq_ids_a_from)
    #The loop fragment to insert in a dict {chain_id:chain}
    try:
        chains_from = ftup.extract_subchains_from_seq_ids(
            chains_from, seq_ids_a_from)
    except Exception as e:
        with log_to_exception(log, e):
            log.error(
                "Could not extract fragment %s from pdb: "
                " At least one of the seq_ids %s not found."
                " Chains are %s", elem_from, seq_ids_a_from,
                chains_from.keys())
        raise

    # A list of tuples (seq_id_from, seq_id_to) for the nucleotides
    # that will be used for alignment.
    log.debug("Closing_bps _from are %s", closing_bps_from)
    alignment_positions = []
    assert elem_from[0] != "s", "No stems allowed in insert_element"
    if elem_from[0] == "f":
        alignment_positions.append((closing_bps_from[1], closing_bps_to[1]))
    elif elem_from[0] == "t":
        alignment_positions.append((closing_bps_from[0], closing_bps_to[0]))
    else:
        for i in range(len(closing_bps_from)):
            alignment_positions.append(
                (closing_bps_from[i], closing_bps_to[i]))

    log.debug("Calling align_on_nucleotides for %s", elem_to)
    align_on_nucleotides(chains_from, chains_to, alignment_positions)

    #The defines and seq_ids WITHOUT adjacent elements
    define_to = cg_to.defines[elem_to]
    define_from = cg_from.defines[elem_from]
    # Set to True, if a required ModeRNA fragment could not be created.
    no_moderna = False
    if len(define_from) != len(define_to):
        log.warning(
            "Inconsistent defines: {} and {} for {}. Using ModeRNA fragment instead."
            .format(define_from, define_to, elem_to))
        target_seqs = cg_to.get_define_seq_str(elem_to)  # One or two strands
        for i, target_seq in enumerate(target_seqs):
            if closing_bps_from[2 * i].chain != closing_bps_from[2 * i +
                                                                 1].chain:
                raise NotImplementedError("TODO")
            try:
                mod_chain = use_moderna_fragment(
                    chains_from[closing_bps_from[2 * i].chain], target_seq,
                    closing_bps_from[2 * i], closing_bps_from[2 * i + 1])
            except Exception:
                # Best effort. A bare except would additionally swallow
                # KeyboardInterrupt and SystemExit.
                log.debug("Could not use ModeRNA fragment for %s",
                          elem_to, exc_info=True)
                no_moderna = True
            else:
                chains_from[seq_ids_a_from[0].chain] = mod_chain
    elif cg_to.element_length(elem_to) != cg_from.element_length(elem_from):
        log.warning("%s not consistent with %s: Missing residues", define_from,
                    define_to)
        log.warning("%s has different len than %s for angle type %s",
                    define_from, define_to, angle_type)
        if define_to[1] - define_to[0] > define_from[1] - define_from[0]:
            # Apply an indel on the left side
            if closing_bps_from[0].chain != closing_bps_from[1].chain:
                raise NotImplementedError("TODO")
            target_seq = cg_to.get_define_seq_str(elem_to)[0]  # Forward strand
            try:
                mod_chain = use_moderna_fragment(
                    chains_from[closing_bps_from[0].chain], target_seq,
                    closing_bps_from[0], closing_bps_from[1])
            except Exception:
                # Best effort. A bare except would additionally swallow
                # KeyboardInterrupt and SystemExit.
                log.debug("Could not use ModeRNA fragment for %s",
                          elem_to, exc_info=True)
                no_moderna = True
            else:
                chains_from[seq_ids_a_from[0].chain] = mod_chain
        else:
            raise NotImplementedError("TODO")
    # The seq_ids of the target element: one list per strand.
    seq_ids_to = []
    for i in range(0, len(define_to), 2):
        seq_ids_to.append([])
        for nt in range(define_to[i], define_to[i + 1] + 1):
            seq_ids_to[-1].append(cg_to.seq.to_resid(nt))
    seq_ids_from = []
    # Now append first strand to seq_ids_from
    assert closing_bps_from[0].chain == closing_bps_from[1].chain
    log.debug("nt_in_define=%s", nt_in_define_from)
    if closing_bps_from[0].resid < closing_bps_from[1].resid:
        s = list(
            iter_resids_between(chains_from[closing_bps_from[0].chain],
                                closing_bps_from[0].resid,
                                closing_bps_from[1].resid,
                                nt_in_define_from[0], nt_in_define_from[1]))
    else:
        s = list(
            iter_resids_between(chains_from[closing_bps_from[0].chain],
                                closing_bps_from[1].resid,
                                closing_bps_from[0].resid,
                                nt_in_define_from[1], nt_in_define_from[0]))
        # Reverse the strand itself (like for the second strand below),
        # not its first element, which is a single resid.
        s.reverse()
    if s:
        seq_ids_from.append(s)

    # The second strand, if present
    if len(closing_bps_from) > 2:
        assert closing_bps_from[2].chain == closing_bps_from[3].chain
        if closing_bps_from[2].resid < closing_bps_from[3].resid:
            s = (list(
                iter_resids_between(chains_from[closing_bps_from[2].chain],
                                    closing_bps_from[2].resid,
                                    closing_bps_from[3].resid,
                                    nt_in_define_from[2],
                                    nt_in_define_from[3])))
        else:
            s = list(
                iter_resids_between(chains_from[closing_bps_from[2].chain],
                                    closing_bps_from[3].resid,
                                    closing_bps_from[2].resid,
                                    nt_in_define_from[3],
                                    nt_in_define_from[2]))
            s.reverse()
        if s:
            seq_ids_from.append(s)

    log.info("Fragment %s", seq_ids_from)
    log.info("Target %s", seq_ids_to)
    if not no_moderna:
        assert len(seq_ids_from[0]) == len(
            seq_ids_to[0]), "Unequal length for {}: {} {}".format(
                elem_to, seq_ids_from, seq_ids_to)
        if len(seq_ids_to) > 1:
            assert len(seq_ids_from[1]) == len(
                seq_ids_to[1]), "Unequal length for {}: {} {}".format(
                    elem_to, seq_ids_from, seq_ids_to)
    log.debug("Copying %s to %s for %s", seq_ids_from, seq_ids_to, elem_to)
    # Now copy the residues from the fragment chain to the scaffold chain.
    # lastres[a] is the last target resid of strand a that received a
    # residue, if the fragment was shorter than the target.
    lastres = [None, None]
    for a in range(len(seq_ids_to)):
        for i in range(len(seq_ids_to[a])):
            try:
                resid_from = seq_ids_from[a][i]
            except IndexError:
                lastres[a] = seq_ids_to[a][i - 1]
                break
            resid_to = seq_ids_to[a][i]
            residue = chains_from[resid_from.chain][resid_from.resid]
            #Change the resid to the target
            residue.parent = None
            residue.id = resid_to.resid
            if resid_to.chain not in chains_to:
                log.info("Adding chain with id %r for residue %r",
                         resid_to.chain, resid_to)
                chains_to[resid_to.chain] = bpdb.Chain.Chain(resid_to.chain)
            #Now, add the residue to the target chain
            chains_to[resid_to.chain].add(residue)
    # Now we need to mend gaps created by imperfect alignment.
    gaps_to_mend = []
    if elem_from[0] != "f":
        log.debug("To mend: %s %s ", cg_to.seq.to_resid(define_a_to[0]),
                  cg_to.seq.to_resid(define_a_to[0] + 1))
        gaps_to_mend.append([
            cg_to.seq.to_resid(define_a_to[0]),
            cg_to.seq.to_resid(define_a_to[0] + 1)
        ])
        # d is only calculated for the debug message
        d = gap_length(chains_to, cg_to.seq.to_resid(define_a_to[0]),
                       cg_to.seq.to_resid(define_a_to[0] + 1))
        log.debug("Elem {}: dist {} - {} is {}".format(elem_to, define_a_to[0],
                                                       define_a_to[0] + 1, d))
    if elem_from[0] != "t":
        if lastres[0] is not None:
            r = lastres[0]
        else:
            r = cg_to.seq.to_resid(define_a_to[1] - 1)
        gaps_to_mend.append([r, cg_to.seq.to_resid(define_a_to[1])])
        d = gap_length(chains_to, r, cg_to.seq.to_resid(define_a_to[1]))
        log.debug("Elem {}: dist {} - {} is {}".format(elem_to, define_a_to[1],
                                                       define_a_to[1] - 1, d))

    # Interior loops have two more gaps on their second strand
    if elem_from[0] == "i":
        gaps_to_mend.append([
            cg_to.seq.to_resid(define_a_to[2]),
            cg_to.seq.to_resid(define_a_to[2] + 1)
        ])
        if lastres[1] is not None:
            r = lastres[1]
        else:
            r = cg_to.seq.to_resid(define_a_to[3] - 1)
        gaps_to_mend.append([r, cg_to.seq.to_resid(define_a_to[3])])
    log.debug("To mend %s", gaps_to_mend)
    return gaps_to_mend
예제 #24
0
        logging_exceptions.log_exception(e, logging.WARNING)
    try:
        raise_error_contextmngr(555)
    except Exception as e:
        logging_exceptions.log_exception(e)

    log = logging.getLogger("another.logger.name")
    log.info(
        "The following should log from the main module level (doesn't work with root logger)"
    )
    helper_function(log)

    fltr = Filter1("CTMNGR2")
    log = logging.getLogger("main.inside_ctxt2")
    log.addFilter(fltr)

    log = logging.getLogger()
    log.handlers[0].addFilter(Filter1("RootHandler"))
    try:
        raise_error_contextmngr2(12345)
    except Exception as e:
        logging_exceptions.log_exception(e, with_stacktrace=False)

    log.info("Almost there")
    try:
        raise_error_contextmngr2(-1)
    except Exception as e:
        with logging_exceptions.log_to_exception(log, e):
            log.critical("using a BARE RAISE works as intended.")
        raise
예제 #25
0
def load_rna(filename, rna_type="any", allow_many=True, pdb_chain=None,
             pbd_remove_pk=True, pdb_dotbracket="",
             dissolve_length_one_stems=True):
    """
    Load RNA structure(s) from a file or from a bare dotbracket string.

    The type of the file is detected with ``sniff_filetype``.

    :param filename: Path of the input file. If it consists solely of the
                     characters ``.()[]{}&`` and rna_type is "any", it is
                     treated as a dotbracket string and no file is opened.
    :param rna_type: One of "any", "cg", "only_cg", "3d" and "pdb"

                     *  "any": Accept every supported input format.
                     *  "cg":  Convert RNAs loaded from bpseq or fasta-like
                            files with `ftmc.from_bulge_graph`.
                     *  "only_cg": Only accept forgi cg-files, which must
                            contain all 3D coordinates.
                     *  "3d":  Reject bpseq and fasta-like files; forgi files
                            must contain all 3D coordinates.
                     *  "pdb": Only accept pdb files
    :param allow_many: If True, return a list. If False, return a single RNA
                       or raise an error if more than one RNA is present.
    :param pdb_chain: Extract the given chain from the file.
                      Only used for pdb files.
    :param pbd_remove_pk: (The spelling follows the signature.)
                          Passed on as `remove_pseudoknots` for pdb files.
                          Disabled if a chain and a pdb_dotbracket are given.
    :param pdb_dotbracket: Secondary structure to use for a pdb file.
                           Requires pdb_chain to be given.
    :param dissolve_length_one_stems: Not used for forgi bg/cg files.

    :returns: A list of RNAs (allow_many=True) or a single RNA.
              None, if the sniffed filetype is not handled here.
    :raises WrongFileFormat: If the file does not satisfy rna_type, is a
                             mmcif file, or if allow_many is False and several
                             RNAs were found.
    :raises ValueError: If pdb_dotbracket is given without pdb_chain.
    """
    # A "filename" made up only of bracket characters is probably a
    # dotbracket string that was passed on the commandline.
    if all(char in ".()[]{}&" for char in filename):
        if rna_type == "any":
            log.info("Assuming RNA %s is a dotbracketstring and not a file.", filename)
            graph = fgb.from_fasta_text(filename, dissolve_length_one_stems=dissolve_length_one_stems)
            return [graph] if allow_many else graph
        # Without a sequence, no other rna_type can be satisfied.
        warnings.warn("Cannot treat '{}' as dotbracket string, since we need a sequence. "
                      "Trying to treat it as a filename instead...".format(filename))

    with open(filename) as rnafile:
        filetype = sniff_filetype(rnafile)

    # Enforce the restrictions on the file type first.
    if rna_type == "pdb" and filetype != "pdb":
        raise WrongFileFormat("Only PDB files are accepted, but file {} has type {}.".format(filename, filetype))
    if rna_type == "only_cg" and filetype != "forgi":
        raise WrongFileFormat("Only forgi cg files are accepted, but file {} has type {}.".format(filename, filetype))

    if filetype == "forgi":
        cg = ftmc.CoarseGrainRNA(filename)
        coords_required = rna_type in ("3d", "only_cg")
        if coords_required and not cg.coords.is_filled:
            raise WrongFileFormat("File {} does not contain all 3D coordinates!".format(filename))
        return [cg] if allow_many else cg

    if filetype == "pdb":
        if pdb_chain:
            single_cg = ftmc.load_cg_from_pdb(
                filename, chain_id=pdb_chain,
                # An explicitly given structure is used as it is.
                remove_pseudoknots=pbd_remove_pk and not pdb_dotbracket,
                secondary_structure=pdb_dotbracket,
                dissolve_length_one_stems=dissolve_length_one_stems)
            cgs = [single_cg]
            if dissolve_length_one_stems:
                for cg in cgs:
                    cg.dissolve_length_one_stems()
        elif pdb_dotbracket:
            raise ValueError("pdb_dotbracket requires a chain ti be given to avioid ambiguity.")
        else:
            cgs = ftmc.connected_cgs_from_pdb(filename, remove_pseudoknots=pbd_remove_pk,
                                              dissolve_length_one_stems=dissolve_length_one_stems)
        if allow_many:
            return cgs
        if len(cgs) > 1:
            raise WrongFileFormat("More than one connected RNA component in pdb file {}.".format(filename))
        return cgs[0]

    if filetype == "mmcif":
        raise WrongFileFormat("MMCIF files are not yet supported.")

    if filetype == "bpseq":
        if rna_type == "3d":
            raise WrongFileFormat("bpseq file {} is not supported. We need 3D coordinates!".format(filename))
        graph = fgb.BulgeGraph()
        with open(filename, 'r') as bpseq_file:
            text = bpseq_file.read()
            try:
                int(text[0])
            except ValueError:
                # The file does not start with a digit: drop everything
                # up to the line that starts with "1 ".
                first_record = text.find("\n1 ")
                text = text[first_record + 1:]
        graph.from_bpseq_str(text, dissolve_length_one_stems=dissolve_length_one_stems)
        if rna_type == "cg":
            graph = ftmc.from_bulge_graph(graph)
        return [graph] if allow_many else graph

    if filetype in ("fasta", "other"):
        if rna_type == "3d":
            raise WrongFileFormat("Fasta(like) file {} is not supported. We need 3D coordinates!".format(filename))
        try:
            graphs = fgb.from_fasta(filename, dissolve_length_one_stems=dissolve_length_one_stems)
        except Exception as e:
            # Attach additional log messages to the exception and re-raise.
            with log_to_exception(log, e):
                log.critical("Could not parse file %r.", filename)
                if filetype == "other":
                    log.critical("We assumed file %r to be some fasta-variant or dotbracket file, but an error occurred during parsing.", filename)
            raise
        # A single graph is wrapped, so we always deal with a list below.
        if isinstance(graphs, fgb.BulgeGraph):
            graphs = [graphs]
        if dissolve_length_one_stems:
            for graph in graphs:
                graph.dissolve_length_one_stems()
        if rna_type == "cg":
            graphs = [ftmc.from_bulge_graph(graph) for graph in graphs]
        if allow_many:
            return graphs
        if len(graphs) > 1:
            raise WrongFileFormat("More than one RNA found in fasta/ dotbracket file {}.".format(filename))
        return graphs[0]
예제 #26
0
def load_rna(filename, rna_type="any", allow_many=True, pdb_chain=None,
             pdb_remove_pk=True, pdb_dotbracket="",
             dissolve_length_one_stems=True,
             pdb_annotation_tool=None, pdb_allow_www_query=False):
    """
    Load one or more RNAs from a file (or from a dotbracket string).

    The type of the file is determined with `sniff_filetype`, and the
    matching parser is called.

    :param filename: The path of the file to load. If it consists only of the
                     characters ``.()[]{}&`` and rna_type is "any", it is
                     interpreted as a dotbracket string and not as a path.
    :param rna_type: One of "any", "only_cg", "3d" and "pdb"

                     *  "any": Accept every supported input format. The type
                               of the returned object(s) depends on the
                               input format.
                     *  "only_cg": Only accept forgi cg-files in which all
                               coordinates are filled.
                     *  "3d":  Reject bpseq and fasta-like files as well as
                               forgi files in which not all coordinates
                               are filled.
                     *  "pdb": Only accept pdb/ cif files.

    :param allow_many: If True, return a list. If False, return a single
                       RNA or raise a WrongFileFormat, if more than one
                       RNA is present.
    :param pdb_chain: Passed as `load_chains` to the pdb parser.
                      Only used for pdb/ cif files.
    :param pdb_remove_pk: Passed as `remove_pseudoknots` to the pdb parser.
                      Pseudoknot removal is switched off,
                      if a pdb_chain and a pdb_dotbracket are given.
    :param pdb_dotbracket: Passed as `secondary_structure` to the pdb parser.
                      Only allowed for pdb/ cif files, if pdb_chain is given.
    :param dissolve_length_one_stems: Passed on to the parser.
                      Not used if the input is a forgi file.
    :param pdb_annotation_tool: Passed as `annotation_tool` to the pdb parser
                      (documented upstream as the choice between DSSR,
                      MC-Annotate or the forgi heuristic, None for auto-detect).
                      Not used for other file-types.
    :param pdb_allow_www_query: Passed as `query_PDBeChem` to the pdb parser.
                      Not used for other file-types.

    :returns: A list of RNAs (allow_many is True) or a single RNA.
              None, if the sniffed filetype is not handled by any branch.
    :raises WrongFileFormat: If the file's type is not compatible with
                      rna_type, or if allow_many is False and more than
                      one RNA was found.
    :raises ValueError: If pdb_dotbracket is given without pdb_chain for a
                      pdb/ cif file.
    """
    # A "filename" made up only of structure characters is most likely a
    # dotbracket string that was passed on the commandline.
    dotbracket_alphabet = ".()[]{}&"
    if all(char in dotbracket_alphabet for char in filename):
        if rna_type == "any":
            log.info(
                "Assuming RNA %s is a dotbracketstring and not a file.", filename)
            graph = fgb.BulgeGraph.from_dotbracket(
                filename, dissolve_length_one_stems=dissolve_length_one_stems)
            return [graph] if allow_many else graph
        # Every other rna_type cannot be served by a dotbracket string,
        # so we fall through and try to open it as a file.
        warnings.warn("Cannot treat '{}' as dotbracket string, since we need a sequence. "
                      "Trying to treat it as a filename instead...".format(filename))

    with open(filename) as handle:
        filetype = sniff_filetype(handle)

    is_pdb_like = filetype in ("pdb", "cif")

    # Reject files that contradict the requested rna_type before parsing.
    if rna_type == "pdb" and not is_pdb_like:
        raise WrongFileFormat(
            "Only PDB files (*.pdb/.cif) are accepted, but file {} has type {}.".format(filename, filetype))
    if rna_type == "only_cg" and filetype != "forgi":
        raise WrongFileFormat(
            "Only forgi cg files are accepted, but file {} has type {}.".format(filename, filetype))

    if filetype == "forgi":
        cg = ftmc.CoarseGrainRNA.from_bg_file(filename)
        needs_coords = rna_type in ("3d", "only_cg")
        if needs_coords and not cg.coords.is_filled:  # pylint: disable=E1101
            raise WrongFileFormat(
                "File {} does not contain all 3D coordinates!".format(filename))
        return [cg] if allow_many else cg

    if is_pdb_like:
        # Options shared by both ways of calling the pdb parser.
        pdb_options = {
            "dissolve_length_one_stems": dissolve_length_one_stems,
            "filetype": filetype,
            "annotation_tool": pdb_annotation_tool,
            "query_PDBeChem": pdb_allow_www_query,
        }
        if pdb_chain:
            pdb_options["load_chains"] = pdb_chain
            # A user-supplied secondary structure is used as it is.
            pdb_options["remove_pseudoknots"] = pdb_remove_pk and not pdb_dotbracket
            pdb_options["secondary_structure"] = pdb_dotbracket
        elif pdb_dotbracket:
            raise ValueError(
                "pdb_dotbracket requires a chain to be given to avoid ambiguity.")
        else:
            pdb_options["remove_pseudoknots"] = pdb_remove_pk
        cgs = ftmc.CoarseGrainRNA.from_pdb(filename, **pdb_options)
        if allow_many:
            return cgs
        if len(cgs) > 1:
            raise WrongFileFormat("More than one connected RNA component in pdb file {}: {}".format(
                filename, [cg.name for cg in cgs]))
        return cgs[0]

    if filetype == "bpseq":
        if rna_type == "3d":
            raise WrongFileFormat(
                "bpseq file {} is not supported. We need 3D coordinates!".format(filename))
        with open(filename, 'r') as bpseq_file:
            text = bpseq_file.read()
        try:
            int(text[0])
        except ValueError:
            # The file does not start with a number: Skip everything before
            # the line that starts with "1 ".
            header_end = text.find("\n1 ")
            text = text[header_end + 1:]
        bg = ftmc.CoarseGrainRNA.from_bpseq_str(
            text, dissolve_length_one_stems=dissolve_length_one_stems)
        return [bg] if allow_many else bg

    if filetype in ("fasta", "other"):
        if rna_type == "3d":
            raise WrongFileFormat(
                "Fasta(like) file {} is not supported. We need 3D coordinates!".format(filename))
        try:
            bgs = ftmc.CoarseGrainRNA.from_fasta(
                filename, dissolve_length_one_stems=dissolve_length_one_stems)
        except Exception as e:
            # Attach the log messages to the exception, then let it propagate.
            with log_to_exception(log, e):
                log.critical("Could not parse file %r.", filename)
                if filetype == "other":
                    log.critical(
                        "We assumed file %r to be some fasta-variant or dotbracket file, but an error occurred during parsing.", filename)
            raise
        if allow_many:
            return bgs
        if len(bgs) > 1:
            raise WrongFileFormat(
                "More than one RNA found in fasta/ dotbracket file {}.".format(filename))
        return bgs[0]

    # No branch matched the sniffed filetype.
    return None
예제 #27
0
         key = {"name": cg.name, "filename": filenames[i]}
         if args.per_ml:
             new_data = describe_ml_segments(cg)
             for i in range(len(new_data["segment"])):
                 for k, v in key.items():
                     data[k].append(v)
                 for k, v in new_data.items():
                     data[k].append(v[i])
         else:
             new_data = describe_rna(cg, file_num, dist_pairs, angle_pairs)
             for k, v in key.items():
                 data[k].append(v)
             for k, v in new_data.items():
                 data[k].append(v)
     except Exception as e:
         with log_to_exception(log, e):
             log.error("Error occurred during describing %d%s cg %s", file_num, {1: "st", 2: "nd", 3: "rd"}.get(
                 file_num % 10 * (file_num % 100 not in [11, 12, 13]), "th"), cg.name)
         raise
 if args.keys:
     allowed_keys = args.keys.split(",") + ["name"]
     for key in list(data.keys()):
         if key not in allowed_keys:
             del data[key]
 df = pd.DataFrame(data)
 df.set_index("name", append=True, inplace=True)
 if args.csv:
     if not args.mode and os.path.isfile(args.csv):
         raise RuntimeError("File {} exists already.".format(args.csv))
     if not args.mode or args.mode == 'o':
         df.to_csv(args.csv)
예제 #28
0
파일: _dssr.py 프로젝트: porteusconf/forgi
    def cg_stem(self, dssr_stem):
        """
        Map a stem of the DSSR annotation to a stem of the CoarseGrainRNA.

        All basepairs of the DSSR stem are translated to coarse grained
        elements and the elements are counted. If more than one element is
        hit, a warning is issued. Of the elements ordered by count, the first
        one whose name starts with "s" is returned.

        :param dssr_stem: INT the index of the stem in the DSSR annotation.
        :returns: The name of the matching stem in the CoarseGrainRNA.
        :raises DSSRLookupError: If the DSSR object has no "stems" entry or
                                 no stem with the given index.
        :raises WrongChain: If the cg has chains and a residue of the stem
                            is from a chain that is not among them.
        :raises RuntimeError: If the DSSR stem maps to no element at all or
                              to no element whose name starts with "s".
        """
        log.debug("Mapping DSSR stem %s to forgi", dssr_stem)
        if "stems" not in self._dssr:
            raise DSSRLookupError("The DSSR object does not contain any stem!")
        stem_obj = next((candidate for candidate in self._dssr["stems"]
                         if candidate["index"] == dssr_stem), None)
        if stem_obj is None:
            raise DSSRLookupError("No stem with index {}".format(dssr_stem))
        log.debug("Found stem %s&%s", stem_obj["strand1"], stem_obj["strand2"])

        # Count how often each cg element is hit, to see if the dssr_stem
        # maps to more than 1 cg element.
        element_counts = Counter()
        for pair in stem_obj["pairs"]:
            res1 = dssr_to_pdb_resid(pair["nt1"])
            res2 = dssr_to_pdb_resid(pair["nt2"])
            log.debug("Contains pair %s-%s", res1, res2)
            chains = self._cg.chains
            if chains and not (res1.chain in chains and res2.chain in chains):
                e = WrongChain()
                with log_to_exception(log, e):
                    log.error(
                        "Wrong chain: res1={}, res2={}, cg.chains={}".format(
                            res1, res2, self._cg.chains))
                raise e
            indices = [self._cg.seq.to_integer(res) for res in (res1, res2)]
            for node in self._cg.nucleotides_to_elements(indices):
                element_counts[node] += 1

        if not element_counts:
            raise RuntimeError(
                "No stem matching dssr_stem {}.".format(dssr_stem))

        ranked = element_counts.most_common()
        if len(ranked) > 1:
            # Ambiguous mapping: For every hit element whose name starts
            # with "i", show the sequence, the dotbracket string and a line
            # with "^" at the element's residue numbers.
            extra_info = ""
            for elem in element_counts.keys():
                if elem[0] != "i":
                    continue
                extra_info += "\n{} is {}:".format(
                    elem, self._cg.get_define_seq_str(elem))
                extra_info += "\n\t" + self._cg.seq + "\n\t" + \
                    self._cg.to_dotbracket_string() + "\n\t"
                resnums = list(self._cg.define_residue_num_iterator(elem))
                extra_info += "".join(
                    "^" if pos in resnums else " "
                    for pos in range(1, len(self._cg.seq) + 1))

            warnings.warn(
                "dssr_stem {} maps to more than one cg element: {} {}".format(
                    dssr_stem, list(element_counts.keys()), extra_info))

        # Return the most frequently hit element that is a stem.
        for element, _count in ranked:
            if element[0] == "s":
                return element
        raise RuntimeError(
            "No stem matching dssr_stem {}, only single stranded region: {}.".
            format(dssr_stem, list(element_counts.keys())))