Example #1
class builder(Node):
    """Builds and adds a :term:`SKETCH` from given :term:`FORM` string to the :class:`.Case` using ideal SSE elements.

    If corrections are available and specified, these will be applied onto the :term:`SKETCH`.

    .. caution::
        In order to apply secondary structure or per layer corrections, the :mod:`.corrector` plugin
        needs to be set in the :class:`.Pipeline`.

    :param connectivity: Expected secondary structure connectivity. *Important*: at the moment only a single
                         connectivity is supported (default: True).
    :param motif: Expected Motif to be added to the :term:`SKETCH` (default: False).
    :param pick_aa: Desired amino acid type to use for the :term:`SKETCH` sequence. If not specified, amino acid
                    types are assigned pseudorandomly based on secondary structure propensity scores.
    :param write2disc: Dump the :term:`SKETCH` to disk (default: True).

    :raises:
        :NodeDataError: On **check**. If the fields required for execution are not present.
        :NodeDataError: On **execution**. If the :class:`.Case` contains anything other than one defined connectivity.
    """
    REQUIRED_FIELDS = ('topology.architecture', 'topology.connectivity')
    RETURNED_FIELDS = ()
    VERSION = 'v1.0'

    def __init__(self,
                 tag: int,
                 connectivity: Optional[bool] = True,
                 motif: Optional[bool] = False,
                 pick_aa: Optional[str] = None,
                 write2disc: Optional[bool] = True):
        super(builder, self).__init__(tag)

        self.connectivity = connectivity
        self.motif = motif
        self.pick_aa = pick_aa
        self.write2disc = write2disc

    def single_check(self, dummy: Dict) -> Dict:
        kase = Case(dummy)

        # Check what it needs
        for itag in self.REQUIRED_FIELDS:
            if kase[itag] is None:
                raise NodeDataError(f'Field "{itag}" is required')

        # Include what keywords it adds (in this instance, nothing)
        return kase.data

    def single_execute(self, data: Dict) -> Dict:
        case = Case(data)

        # Apply connectivity?
        if self.connectivity:
            if case.connectivity_count == 0:
                raise NodeDataError(
                    'At least one connectivity must be provided.')
            if case.connectivity_count > 1:
                raise NodeDataError(
                    'Only single connectivity cases can be built.')
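
A minimal instantiation sketch for the node above; how the node is registered in a :class:`.Pipeline` is assumed and not shown, and the parameter values simply mirror the defaults documented in the docstring.

# Hypothetical usage sketch; the surrounding Pipeline wiring is an assumption.
node = builder(tag=1,              # position of this node in the Pipeline
               connectivity=True,  # expect and build a single connectivity
               motif=False,        # no pre-picked motif to insert
               pick_aa=None,       # pick amino acids by secondary structure propensity
               write2disc=True)    # dump the resulting SKETCH
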
Example #2
def pds_database( log: Logger,
                  filter: Optional[Union[str, Path]] = None,
                  ) -> Tuple[Path, List]:
    """Provide the list of target PDS as a file and a list.

    .. note::
        Depends on the ``master.pds`` configuration option.

    :param log: Job logger.
    :param filter: File containing the target subset; otherwise the whole PDS database is considered.
    """
    # @TODO: PDS-FILTER
    pds_file = Path(TBcore.get_option('master', 'pds'))
    if pds_file.is_file():
        pds_list = [line.strip() for line in open(pds_file).readlines() if len(line.strip()) > 0]
    elif pds_file.is_dir():
        pds_list = [str(x.resolve()) for x in pds_file.glob('*/*.pds')]
    else:
        raise NodeDataError('The provided MASTER database directory/list file cannot be found.')

    # Even if a PDS file already exists, we always create a temporary file so that we can
    # manage different versions of the PDS database in different Nodes.
    f = NamedTemporaryFile(mode='w', delete=False)
    log.info(f'Temporary file for PDS database: {f.name}')
    for x in pds_list:
        f.write(x + '\n')
    f.close()
    pds_file = Path(f.name)
    return pds_file, pds_list
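
The temporary-file handling above (``delete=False`` plus an explicit ``close()``) is a standard-library pattern; here is a self-contained sketch of the same idea with a hypothetical helper name, independent of the PDS-specific code.

# Standalone sketch of the temp-file pattern used above (hypothetical helper name).
from pathlib import Path
from tempfile import NamedTemporaryFile

def write_list_to_tempfile(entries):
    """Write one entry per line to a persistent temporary file and return its Path."""
    f = NamedTemporaryFile(mode='w', delete=False)  # delete=False keeps the file after close()
    for entry in entries:
        f.write(f'{entry}\n')
    f.close()
    return Path(f.name)

# e.g. write_list_to_tempfile(['a.pds', 'b.pds']) -> Path('/tmp/tmp...') containing two lines
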
Example #3
    def single_check(self, dummy: Dict) -> Dict:
        kase = Case(dummy)
        # Check what it needs
        for itag in self.REQUIRED_FIELDS:
            if kase[itag] is None:
                raise NodeDataError(f'Field "{itag}" is required')

        # Include what keywords it adds
        kase.data.setdefault('metadata', {}).setdefault('motif_picker', [])
        return kase.data
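
The chained ``setdefault`` calls guarantee that the nested key exists without clobbering data that is already there; a minimal standalone illustration with plain dictionaries:

# Plain-dict illustration of the chained setdefault pattern (no Case object involved).
data = {'metadata': {'motif_picker': [{'id': 'motif_1'}]}}
data.setdefault('metadata', {}).setdefault('motif_picker', [])
assert data['metadata']['motif_picker'] == [{'id': 'motif_1'}]  # existing entries survive

empty = {}
empty.setdefault('metadata', {}).setdefault('motif_picker', [])
assert empty == {'metadata': {'motif_picker': []}}  # missing keys are created on demand
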
Example #4
    def single_check(self, dummy: Dict) -> Dict:
        kase = Case(dummy)

        # Check what it needs
        for itag in self.REQUIRED_FIELDS:
            if kase[itag] is None:
                raise NodeDataError(f'Field "{itag}" is required')

        # Include what keywords it adds (in this instance, nothing)
        return kase.data
Example #5
    def get_fragfiles(self) -> pd.DataFrame:
        """Obtain the fragment files.
        """
        fragpath = Path(core.get_option('loop_master', 'fragments'))
        self.log.debug(f'Listing available fragment files at: {fragpath.name}')
        if not fragpath.is_dir():
            raise NodeDataError(f'{fragpath.name} is not a folder.')
        return pd.DataFrame(
            [(x.name[:4], x.name[5:6], x, y)
             for x, y in zip(sorted(fragpath.glob('*/*3mers.gz')),
                             sorted(fragpath.glob('*/*9mers.gz')))],
            columns=['pdb', 'chain', '3mers', '9mers'])
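
The table construction pairs each 3mers file with its 9mers counterpart purely by sorted order; a small sketch of that pairing with literal, hypothetical file names in place of the ``glob`` results:

# Sketch of the pairing logic with literal Paths instead of glob() output (file names are made up).
from pathlib import Path
import pandas as pd

threemers = sorted([Path('ab/1abc_A.3mers.gz'), Path('cd/2cde_B.3mers.gz')])
ninemers = sorted([Path('ab/1abc_A.9mers.gz'), Path('cd/2cde_B.9mers.gz')])
df = pd.DataFrame([(x.name[:4], x.name[5:6], x, y) for x, y in zip(threemers, ninemers)],
                  columns=['pdb', 'chain', '3mers', '9mers'])
# df['pdb'] -> ['1abc', '2cde']; df['chain'] -> ['A', 'B']
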
Example #6
    def single_check(self, dummy: Dict) -> Dict:
        kase = Case(dummy)

        # Check what it needs
        for itag in self.REQUIRED_FIELDS:
            if kase[itag] is None:
                raise NodeDataError(f'Field "{itag}" is required')

        # Include what keywords it adds
        kase.data.setdefault('metadata', {}).setdefault('loop_fragments', [])
        kase.data.setdefault('metadata', {}).setdefault('loop_lengths', [])
        return kase.data
Example #7
class fragment_maker(Node):
    """Creates or mixes fragments that are needed in multiple Rosetta protocols. Mutliple ways of creating fragments
    are possible through different protocols.

    .. note::
        Currently, solely the ``loop_fragment`` protocol is implemented.

    .. caution::
        In order to create fragments with the ``loop_fragment`` protocol, the :mod:`.loop_fragments` plugin
        needs to be set in the :class:`.Pipeline`.

    :param protocol: Fragment creation protocol to be used.
    :param script: Rosetta script to pick fragments.

    :raises:
        :NodeDataError: On **check**. If the fields required for execution are not present.
        :NodeDataError: On **execution**. If the :class:`.Case` contains anything other than one defined connectivity.
        :NodeMissingError: On **execution**. If required variable inputs are not present.
    """
    REQUIRED_FIELDS = ('metadata.loop_fragments', 'metadata.loop_lengths')
    RETURNED_FIELDS = ('metadata.fragments', )
    VERSION = 'v1.0'

    def __init__(self,
                 tag: int,
                 protocol: str,
                 script: Optional[Union[Path, str]] = None):
        super(fragment_maker, self).__init__(tag)

        self.protocol = protocol
        self.script = script

    def single_check(self, dummy: Dict) -> Dict:
        kase = Case(dummy)

        # Check what it needs
        for itag in self.REQUIRED_FIELDS:
            if kase[itag] is None:
                raise NodeDataError(f'Field "{itag}" is required')

        # Include what keywords it adds
        kase.data.setdefault('metadata', {}).setdefault('fragments', {})
        return kase.data

    def single_execute(self, data: Dict) -> Dict:
        case = Case(data)
        data = {'protocol': self.protocol, 'files': []}

        # Fragments can only be made for a full, reoriented Case.
        if case.connectivity_count > 1:
            raise NodeDataError(
                'FunFolDes can only be applied to one connectivity.')
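
A minimal instantiation sketch matching the constructor above; how the node is added to a :class:`.Pipeline` is assumed, and only the ``loop_fragment`` protocol named in the docstring is used.

# Hypothetical usage sketch; Pipeline registration is an assumption.
node = fragment_maker(tag=3, protocol='loop_fragment')  # the only protocol currently implemented
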
Example #8
def createPDS( infile: Union[Path, str], outfile: Optional[str] = None ) -> List[str]:
    """Make the createPDS command call.

    .. note::
        Depends on the ``master.create`` configuration option.

    :param infile: PDB file to convert.
    :param outfile: Name of the expected PDS output.
    """
    _, createPDS = get_master_exes()
    infile = Path(infile)
    if not infile.is_file():
        raise NodeDataError(f'Unable to find structure file {infile}')
    outfile = outfile if outfile is not None else infile.with_suffix('.pds')
    return shlex.split(f'{createPDS} --type query --pdb {str(infile)} --pds {str(outfile)} --dCut 100')
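
Because ``shlex.split`` returns an argument list, the result can be passed straight to ``subprocess.run``; a hedged sketch of how the returned command might be executed (the file names are hypothetical and the MASTER executables must be configured via ``master.create``):

# Sketch of executing the returned command list (hypothetical paths).
from subprocess import DEVNULL, run

cmd = createPDS('query.pdb', 'query.pds')  # -> ['/path/to/createPDS', '--type', 'query', ...]
run(cmd, stdout=DEVNULL, check=True)       # raises CalledProcessError on a non-zero exit code
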
Example #9
    def get_abegos(self) -> pd.DataFrame:
        """Load ABEGO data.
        """
        abegos = Path(core.get_option('loop_master', 'abego'))
        if not abegos.is_file():
            raise NodeDataError(f'ABEGO file {abegos.name} cannot be found.')

        self.log.debug(f'Loading ABEGO data from: {abegos.name}\n')
        doopen = gzip.open if abegos.suffix == '.gz' else open
        abegodata = []
        with doopen(abegos, 'rt') as fd:
            for line1, line2 in itertools.zip_longest(*[fd] * 2):
                line2 = line2 if line2 is not None and len(line2.strip()) != 0 else 'NON\n'
                line1 = line1.strip().lstrip('>').split('_')
                abegodata.append(f'{line1[0]},{line1[1]},{line2}')
        abegodata = pd.read_csv(StringIO(''.join(abegodata)),
                                names=['pdb', 'chain', 'abego'],
                                header=None)
        abegodata = abegodata[abegodata['abego'] != 'NON']
        return abegodata
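
``itertools.zip_longest(*[fd] * 2)`` walks the file two lines at a time (header line, ABEGO line), padding with ``None`` if the last pair is incomplete; a standalone sketch of that pairing on an in-memory buffer:

# Standalone sketch of reading a FASTA-like buffer two lines at a time.
import itertools
from io import StringIO

fd = StringIO('>1abc_A\nBBAAE\n>2cde_B\n\n')
for line1, line2 in itertools.zip_longest(*[fd] * 2):
    line2 = line2 if line2 is not None and len(line2.strip()) != 0 else 'NON\n'
    pdb, chain = line1.strip().lstrip('>').split('_')
    print(pdb, chain, line2.strip())  # -> '1abc A BBAAE' then '2cde B NON'
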
Example #10
    def single_execute(self, data: Dict) -> Dict:
        kase = Case(data)
        result = {'id': f'motif_{self.identifier}'}

        # Create a working folder
        folders = kase.undirected_path.joinpath(result['id'])
        folders.mkdir(parents=True, exist_ok=True)
        result['data_dir'] = str(folders)

        # Load Structure and create eigens
        try:
            motifs = self.Motif(*reverse_motif(
                self.log, self.source, self.selection, self.attach,
                self.hotspot, self.identifier, self.binder))
        except StructuralError as se:
            raise NodeDataError(str(se))
        result['motifs'] = motifs
        # Attach data and return
        kase.data.setdefault('metadata', {}).setdefault('motif_picker',
                                                        []).append(result)
        return kase.data
Example #11
class statistics( Node ):
    """Various statistics on the sequence and structure level are computed depending on available scripts.

    .. note::
        Depends on the ``statistic.molprobity``, ``statistic.tmalign``, ``statistic.trrosetta_repo``,
        ``statistic.trrosetta_wts``, and ``statistic.trrosetta_env`` configuration options.

    .. caution::
        In order to execute this :class:`.Node`, we highly recommend installing `trRosetta` with all its dependencies.
        The external conda environment can be specified in the ``statistic.trrosetta_env`` configuration option.

    .. admonition:: To Developers

        Due to its use in multiple :class:`.Node` classes, functions dealing with this :class:`.Node` are mostly
        located in the respective module file, and external scripts are located in this :class:`.Node` directory.

    :param loop_range: Expected loop length is calculated from the Euclidean distance between two secondary
        structures. This attribute adds a window of ``loop_range`` residues below and above the calculated
        length.
    :param source: Plugin the designs come from, e.g. :class:`funfoldes`.
    :param stage: The type of design, e.g. folded or designed.
    :param analysis: Geometric or quality assessment.
    :param metric: Type of geometric or quality assessment.

    :raises:
        :NodeDataError: On **check**. If the fields required for execution are not present.
    """
    REQUIRED_FIELDS = ()
    RETURNED_FIELDS = ()
    VERSION = 'v1.0'

    def __init__( self, tag: int,
                        source: str,
                        stage: str,
                        analysis: str,
                        metric: Optional[str] = None,
                        **kwargs ) -> None:
        super(statistics, self).__init__(tag)

        self.source = source
        self.stage = stage
        self.analysis = analysis
        self.metric = metric


    def single_check( self, dummy: Dict ) -> Dict:
        case = Case(dummy)

        # Check what it needs
        for itag in self.REQUIRED_FIELDS:
            if case[itag] is None:
                raise NodeDataError(f'Field "{itag}" is required')

        # Include what keywords it adds
        if self.analysis == 'geometry':
            case['metadata'].setdefault('statistic', {}).setdefault('geometry', '')
        if self.analysis == 'quality':
            case['metadata'].setdefault('statistic', {}).setdefault('quality', '')
        return case.data
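
A minimal instantiation sketch matching the constructor signature above; the configuration options named in the note must be set elsewhere, and Pipeline registration is assumed.

# Hypothetical usage sketch; configuration and Pipeline wiring are assumptions.
node = statistics(tag=5,
                  source='funfoldes',  # plugin the designs come from
                  stage='design',      # folded or designed structures
                  analysis='quality',  # geometric or quality assessment
                  metric=None)
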
Example #12

    def funfoldes2pdb( self, case: Case, wfolder: Path ) -> List:
        """
        """
        if self.stage == 'folding':
            silent_files = case['metadata.funfoldes.silent_files.folding']
        elif self.stage == 'design':
            silent_files = case['metadata.funfoldes.silent_files.design']
        else:
            silent_files = None
        if silent_files is None:
            raise NodeMissingError('There is no output data from the funfoldes plugin.')

        extract_pdb = Path(str(Path(TBcore.get_option('rosetta', 'scripts')).resolve()
                            ).replace('rosetta_scripts.', 'extract_pdbs.'))
        if not extract_pdb.is_file() or not os.access(str(extract_pdb), os.X_OK):
            raise NodeDataError(f'Cannot find executable {extract_pdb}')

        if not TBcore.get_option('slurm', 'use'):
            cmd = [extract_pdb, '-in:file:silent']
            cmd.extend(silent_files)
            cmd.extend(['-out:prefix', str(wfolder) + '/'])
        else:
            indir = str(wfolder.joinpath('${SLURM_ARRAY_TASK_ID}'))
            cmd = ['srun', extract_pdb, '-in:file:silent']
            if self.stage == 'folding':
                cmd.append(os.path.commonprefix([str(x) for x in silent_files]) + '${SLURM_ARRAY_TASK_ID}_funfol.silent')
            elif self.stage == 'design':
                cmd.append(os.path.commonprefix([str(x) for x in silent_files]) + '${SLURM_ARRAY_TASK_ID}_des.silent')
            cmd.extend(['-out:prefix', str(indir) + '/'])
        return [['mkdir', '-p', indir] if TBcore.get_option('slurm', 'use') else '', cmd]
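
``os.path.commonprefix`` is a plain string-prefix operation, which is what allows the ``${SLURM_ARRAY_TASK_ID}`` placeholder to be appended to it above; a minimal illustration with hypothetical silent-file paths:

# Minimal illustration of the prefix trick used for the SLURM array command (hypothetical paths).
import os

silent_files = ['out/run_1_funfol.silent', 'out/run_2_funfol.silent']
prefix = os.path.commonprefix(silent_files)  # -> 'out/run_'
template = prefix + '${SLURM_ARRAY_TASK_ID}_funfol.silent'
# each SLURM array task then resolves to its own silent file, e.g. 'out/run_1_funfol.silent'
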
Example #13
    def single_execute(self, data: Dict) -> Dict:
        kase = Case(data)
        # Loop MASTER is only applied to a Case with a single connectivity
        if kase.connectivity_count != 1:
            err = f'{self.nodeID} can only be applied to one connectivity. '
            err += f'Current case contains a total of {kase.connectivity_count}.'
            raise NodeDataError(err)
        # And has to be reoriented
        if not kase.is_reoriented:
            self.log.debug(
                'Topology was provided without oriented SSE -> orienting.')
            kase = kase.apply_topologies()[0]

        # Generate the folder tree for a single connectivity.
        folders = kase.connectivities_paths[0].joinpath('loopgroup_master')
        folders.mkdir(parents=True, exist_ok=True)

        # Global step distance
        loop_step = kase.cast_absolute()['configuration.defaults.distance.loop_step']

        # Output keys
        kase.data.setdefault('metadata', {}).setdefault('loop_fragments', [])
        kase.data.setdefault('metadata', {}).setdefault('loop_lengths', [])

        # Find steps: Each pair of secondary structure.
        #it = kase.connectivities_str[0].split('.')
        #steps = [it[i:i + 2] for i in range(0, len(it) - 1)]
        lengths = kase.connectivity_len[0]
        start = 1

        for i, (group, infos) in enumerate(self.steps.items()):
            self.log.info(f'Search at: {group}')

            # 1. Make folders and files
            wfolder = folders.joinpath(f'loopgroup{i + 1:02d}')
            wfolder.mkdir(parents=True, exist_ok=True)
            outfile = wfolder.joinpath(f'loopgroup_master.iter{i + 1:02d}.pdb')
            outfilePDS = wfolder.joinpath(
                f'loopgroup_master.iter{i + 1:02d}.pds')
            masfile = outfile.with_suffix('.master')

            gr = self.steps[f'group{i + 1:02d}'][-1].split(';')
            gr = [int(g) for g in gr if g != 'x']
            for g in gr:
                checkpoint = wfolder.joinpath(f'loop{g:02d}/checkpoint.json')
                # 2. Check if checkpoint exists, retrieve and skip
                reload = TButil.checkpoint_in(self.log, checkpoint)
                if reload is not None:
                    self.log.debug(
                        f'Reloading loopgroup{i + 1:02d} with loop{g:02d}')
                    kase.data['metadata']['loop_fragments'].append(reload)
                    kase.data['metadata']['loop_lengths'].append(
                        int(reload['edges']['loop']))
                    start += (int(reload['edges']['sse1']) +
                              int(reload['edges']['loop']))
                    continue

            # 3. Check hairpin
            # Get SSEs and identifiers
            sses = [kase.get_sse_by_id(sse) for sse in infos[0]]
            #sse1_name, sse2_name = sse1['id'], sse2['id']
            #is_hairpin = self.check_hairpin(sse1_name, sse2_name)

            # 4. Generate structures
            sses = TBstructure.build_pdb_object(self.log,
                                                sses,
                                                5,
                                                concat=False,
                                                outfile=outfile)

            if not masfile.is_file():
                # 5. calculate expected loop length by loop_step
                #Mdis, mdis = TBstructure.get_loop_length(self.log, sse1, sse2, loop_step, self.loop_range)

                # 6. Run MASTER
                #outfilePDS = outfile if outfile is not None else Path(outfile).with_suffix('.pds')
                self.log.debug(f'FILE {outfilePDS}')
                # -> make PDS query
                cmd = TBMaster.createPDS(outfile, outfilePDS)
                self.log.debug(f'EXECUTE: {" ".join(cmd)}')
                run(cmd, stdout=DEVNULL)
                # -> run MASTER
                cmd = TBMaster.master_groupedgap(outfilePDS, self.pdsdb,
                                                 masfile, infos[1],
                                                 self.rmsd_cut)
                self.log.debug(f'EXECUTE: {" ".join(cmd)}')
                result = run(cmd, stdout=DEVNULL)

                # TODO: implement motif compatibility
                # if result.returncode: # no loop between that connection, e.g. a motif ranging over multiple sse with keeping the loops
                #     # 4. Generate structures
                #     self.log.debug('generate combined structure')
                #     sse = pd.concat([sse1, sse2], sort=False)
                #
                #     # 6. Run MASTER
                #     self.log.debug(Path(outfile))
                #     #outfilePDS = outfile if outfile is not None else Path(outfile).with_suffix('.pds')
                #     self.log.debug(f'FILE {outfilePDS}')
                #     # -> make PDS query
                #     cmd = TBMaster.createPDS(outfile, outfilePDS)
                #     self.log.debug(f'EXECUTE: {" ".join(cmd)}')
                #     run(cmd, stdout=DEVNULL)
                #     # -> run MASTER
                #     cmd = TBMaster.master_nogap(outfilePDS, self.pdsdb, masfile, self.rmsd_cut)
                #     self.log.debug(f'EXECUTE: {" ".join(cmd)}')
                #     run(cmd, stdout=DEVNULL)
                #
                #     # 6. Minimize master data (pick top_loopsx3 lines to read and minimize the files)
                #     match_count = self.minimize_master_file(masfile)
                #     self.log.debug(f'match count here {match_count}')
                #
                #     # 7. Retrieve MASTER data
                #     dfloop = self.process_master_data_no_gap(masfile, sse1_name, sse2_name)
                #     sse1l, loopl, sse2l = lengths[i], int(dfloop['loop_length'].values[0]), lengths[i + 1]
                #     total_len = sse1l + loopl + sse2l
                #     end_edge = total_len + start - 1
                #     edges = {'ini': int(start), 'end': int(end_edge), 'sse1': int(sse1l), 'loop': int(loopl), 'sse2': int(sse2l)}
                #     self.log.debug(f'INI: {start}; END: {end_edge}; SSE1: {sse1l}; LOOP: {loopl}; SSE2: {sse2l}')
                #     self.log.debug(dfloop.to_string())
                #
                #     # 8. Bring and Combine fragments from the different sources.
                #     loop_data = self.make_fragment_files(dfloop, edges, masfile)
                #     loop_data['match_count'] += match_count

                #else:

                # 6. Minimize master data (pick top_loopsx3 lines to read and minimize the files)
                match_count = self.minimize_master_file(masfile)
                # 7. Retrieve MASTER data
                df_container = self.process_master_data(
                    masfile, infos[0], infos[1], infos[2])

                for indx in list(df_container.order.drop_duplicates()):
                    dfloop = df_container[df_container.order == indx]
                    sse1l, loopl, sse2l = lengths[i], int(
                        dfloop['loop_length'].values[0]), lengths[i + 1]
                    total_len = sse1l + loopl + sse2l
                    end_edge = total_len + start - 1
                    edges = {
                        'ini': int(start),
                        'end': int(end_edge),
                        'sse1': int(sse1l),
                        'loop': int(loopl),
                        'sse2': int(sse2l)
                    }
                    self.log.debug(
                        f'INI: {start}; END: {end_edge}; SSE1: {sse1l}; LOOP: {loopl}; SSE2: {sse2l}'
                    )
                    self.log.debug(dfloop.to_string())

                    # 8. Bring and Combine fragments from the different sources.
                    loop_data, nfolder = self.make_fragment_files(dfloop,
                                                                  edges,
                                                                  masfile,
                                                                  no_loop=True)
                    loop_data['match_count'] += match_count

                    # 9. Save data in the Case
                    kase.data['metadata']['loop_fragments'].append(loop_data)
                    kase.data['metadata']['loop_lengths'].append(int(loopl))
                    start += (sse1l + loopl)

                    # 10. Checkpoint save
                    checkpoint = nfolder.joinpath('checkpoint.json')
                    TButil.checkpoint_out(self.log, checkpoint, loop_data)
        return kase.data
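
The checkpoint reload in step 2 and the save in step 10 bracket each loop group so that finished work is not recomputed on a rerun; below is a hedged sketch of equivalent JSON checkpoint helpers (the real ``TButil.checkpoint_in``/``checkpoint_out`` also take a logger and their exact behaviour may differ).

# Hedged sketch of JSON checkpoint helpers analogous to TButil.checkpoint_in/checkpoint_out;
# the real TButil functions also take a logger and may differ in detail.
import json
from pathlib import Path
from typing import Dict, Optional

def checkpoint_out(checkpoint: Path, data: Dict) -> None:
    """Persist intermediate results so a rerun can skip this step."""
    checkpoint.parent.mkdir(parents=True, exist_ok=True)
    checkpoint.write_text(json.dumps(data))

def checkpoint_in(checkpoint: Path) -> Optional[Dict]:
    """Return previously stored results, or None if the step still has to run."""
    if not checkpoint.is_file():
        return None
    return json.loads(checkpoint.read_text())
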
Example #14
            ofile = case.main_path.joinpath('architecture').joinpath(
                'undirected_sketch.pdb')

        # for i, j, sse in case:
        #     if 'atoms' in sse['metadata'] and not TBcore.get_option('system', 'overwrite'):
        #         self.log.debug(f'{case.name}.{sse["id"]} already has atoms defined\n')
        #         continue
        #
        #     self.log.debug(f'Building coordinates for {case.name}.{sse["id"]}\n')
        #     case.data['topology']['architecture'][i][j] = self.make_structure(sse, self.pick_aa)

        # Insert motif?
        if self.motif:
            if not case.data['metadata']['motif_picker']:
                raise NodeDataError(
                    'Motif must be provided and pre-picked through the motif_picker.'
                )
            else:
                # Include what keywords it adds
                case.data.setdefault('metadata', {}).setdefault('binder', {})

                for n, motif_data in enumerate(
                        case.data['metadata']['motif_picker']):
                    motif, binder, hotspots, attach, selection, identifier = motif_data[
                        'motifs']
                    motif = pd.DataFrame(motif,
                                         columns=[
                                             'auth_comp_id', 'auth_atom_id',
                                             'auth_seq_id', 'auth_asym_id',
                                             'sse_id', 'internal_num',
                                             'Cartn_x', 'Cartn_y', 'Cartn_z'