Exemplo n.º 1
0
def get_dataset(structure, label, length=None):
    '''Returns a dataset of continuous segments of protein sequence with the
    specified DSSP secondary structure code (E, H, C) of a minimum length.

    Parameters
    ----------
    structure : structure data
       input handed to secondaryStructureExtractor.get_python_rdd
    label : char
       DSSP secondary structure label (E, H, C)
    length : int, optional
       minimum length of secondary structure segment; when None (the
       default) StructureToSecondaryStructureElements is constructed
       without a length argument

    Returns
    -------
    dataset
       dataset of continuous segments of protein sequence with the
       columns "sequence" and "label"
    '''

    extracted = secondaryStructureExtractor.get_python_rdd(structure)

    # Only the mapper depends on whether a minimum length was given; the
    # rest of the pipeline is shared. "is None" is used because None is a
    # singleton and "==" can be overridden by the compared object.
    if length is None:
        mapper = StructureToSecondaryStructureElements(label)
    else:
        mapper = StructureToSecondaryStructureElements(label, length)

    rows = extracted.flatMap(mapper)

    colNames = ["sequence", "label"]
    return pythonRDDToDataset.get_dataset(rows, colNames)
def get_dataset(structureRDD, length):
    '''Builds a dataset of fixed-length sequence segments, each annotated
    with the DSSP Q8 and Q3 code of the segment's center residue.

    Parameters
    ----------
    structureRDD : structure
       input handed to secondaryStructureExtractor.get_python_rdd
    length : int
       segment length, must be an odd number

    Returns
    -------
    dataset
       dataset of segments with the columns "structureChainId",
       "sequence", "labelQ8" and "labelQ3"

    Raises
    ------
    Exception
        if length is an even number

    '''

    # Guard clause: an even length is rejected before any extraction
    # work is started.
    is_even = length % 2 == 0
    if is_even:
        raise Exception("Segment length must be an odd number %i" % length)

    colNames = ["structureChainId", "sequence", "labelQ8", "labelQ3"]

    extracted = secondaryStructureExtractor.get_python_rdd(structureRDD)
    segments = extracted.flatMap(StructureToSecondaryStructureSegments(length))

    return pythonRDDToDataset.get_dataset(segments, colNames)
Exemplo n.º 3
0
def get_dataset(structure):
    '''Returns a dataset built from the rows that _get_phi_psi produces for
    each input element.

    Parameters
    ----------
    structure : mmtfStructure
       collection providing flatMap; each element is passed to
       _get_phi_psi (documented upstream as a single protein chain)

    Returns
    -------
    dataset
       dataset with the columns "pdbId", "chain", "resi", "resn", "phi"
       and "psi"
    '''
    # flatMap rather than map: the rows returned by _get_phi_psi for one
    # input element are flattened into individual dataset rows
    # (presumably one row per residue -- confirm against the helper).
    rows = structure.flatMap(_get_phi_psi)

    # convert to dataset
    colNames = ["pdbId", "chain", "resi", "resn", "phi", "psi"]
    return pythonRDDToDataset.get_dataset(rows, colNames)
Exemplo n.º 4
0
def _read_text_line(stream):
    '''Reads one line from a binary stream and returns it as text without
    the trailing line terminator.'''
    return stream.readline().decode("utf-8", errors="replace").rstrip("\r\n")


def get_dataset():
    '''Gets JPred 4/JNet (v.2.3.1) secondary structure dataset.

    Downloads the retr231.tar.gz archive and reads the first two lines of
    every ".dssp" and ".fasta" entry: a header line holding the SCOP id and
    a data line holding the secondary structure or the sequence.

    Returns
    -------
    dataset
       secondaryStructure dataset with the columns "scopID", "sequence",
       "secondaryStructure" and "trained"; "trained" is the string "true"
       for entries under "training/" and "false" for entries under
       "blind/"

    Raises
    ------
    KeyError
        if a ".fasta" entry has no matching ".dssp" entry, or is located
        under neither "training/" nor "blind/"
    '''

    URL = "http://www.compbio.dundee.ac.uk/jpred/downloads/retr231.tar.gz"
    secondaryStructures, sequences, trained = {}, {}, {}
    scopIds = set()

    # Context managers close both the HTTP response and the archive, even
    # when parsing fails part-way through.
    with urllib.request.urlopen(URL) as instream, \
            tarfile.open(fileobj=instream, mode="r:gz") as tf:

        for entry in tf:
            if entry.isdir():
                continue
            br = tf.extractfile(entry)
            if br is None:
                # extractfile returns None for members that carry no data
                continue

            if ".dssp" in entry.name:
                # Drop the first character of the header line (presumably
                # the '>' marker of a FASTA-style header -- confirm).
                scopID = _read_text_line(br)[1:]
                secondaryStructure = _read_text_line(br).replace('-', 'C')
                secondaryStructures[scopID] = secondaryStructure

            if ".fasta" in entry.name:
                scopID = _read_text_line(br)[1:]
                sequence = _read_text_line(br)
                scopIds.add(scopID)
                sequences[scopID] = sequence

                if "training/" in entry.name:
                    trained[scopID] = "true"
                elif "blind/" in entry.name:
                    trained[scopID] = "false"

    res = [
        Row(scopId, sequences[scopId], secondaryStructures[scopId],
            trained[scopId])
        for scopId in scopIds
    ]

    sc = SparkContext.getOrCreate()
    data = sc.parallelize(res)
    colNames = ["scopID", "sequence", "secondaryStructure", "trained"]

    return pythonRDDToDataset.get_dataset(data, colNames)
Exemplo n.º 5
0
    def get_dataset(self, structures):
        '''Builds a dataset of the interactions found for this instance's
        group name within this instance's cutoff distance.

        Attributes:
            structures (pythonRdd): a set of PDB structures
        Returns:
            dataset with interacting residue and atom information
        '''
        # column layout of the resulting dataset: the two interaction
        # partners (residue, atom, element, index) followed by the distance
        colNames = [
            "structureId",
            "residue1", "atom1", "element1", "index1",
            "residue2", "atom2", "element2", "index2",
            "distance",
        ]

        # each structure is expanded into zero or more interaction rows
        interactions = structures.flatMap(
            StructureToAllInteractions(self.groupName, self.distance))

        return pythonRDDToDataset.get_dataset(interactions, colNames)
def get_dataset(structures):
    '''Returns a dataset of polymer sequences contained in PDB entries,
    using the full sequence used in the experiment
    (i.e., the "SEQRES" record in PDB files)

    Attributes
    ----------
        structures (pythonRDD): a set of PDB structures

    Returns
    -------
        dataset with the columns "structureChainId" and "sequence"
    '''

    colNames = ["structureChainId", "sequence"]

    # StructureToPolymerSequences expands each structure into items whose
    # first two fields are wrapped into a Row, in that order.
    chain_sequences = structures.flatMap(StructureToPolymerSequences())
    rows = chain_sequences.map(lambda pair: Row(pair[0], pair[1]))

    return pythonRDDToDataset.get_dataset(rows, colNames)
Exemplo n.º 7
0
def get_dataset(structure, parameters=None, classifier=None, options=None):
    '''Returns a dataset with one row per input element, as computed by
    _get_free_sasa.

    Parameters
    ----------
    structure : mmtfStructure
       collection providing map; each element is passed to _get_free_sasa
       (documented upstream as a single protein chain)
    parameters : optional
       forwarded unchanged to _get_free_sasa
    classifier : optional
       forwarded unchanged to _get_free_sasa
    options : optional
       forwarded unchanged to _get_free_sasa

    Returns
    -------
    dataset
       dataset with the columns "structureChainId" and "totalArea"
    '''

    def _area_row(item):
        # one-to-one mapping: a single result per input element
        return _get_free_sasa(item, parameters, classifier, options)

    colNames = ["structureChainId", "totalArea"]
    area_rows = structure.map(_area_row)

    # convert to dataset
    return pythonRDDToDataset.get_dataset(area_rows, colNames)
def get_dataset(structure):
    '''Returns a dataset with one row per input element, as computed by
    _get_sec_struct_fractions.

    Attributes
    ----------
        structure (mmtfStructure): collection providing map; each element
            is passed to _get_sec_struct_fractions (documented upstream as
            a single protein chain)

    Returns
    -------
        dataset with the columns "structureChainId", "sequence", "alpha",
        "beta", "coil", "dsspQ8Code" and "dsspQ3Code"
    '''

    colNames = [
        "structureChainId",
        "sequence",
        "alpha",
        "beta",
        "coil",
        "dsspQ8Code",
        "dsspQ3Code",
    ]

    # one-to-one mapping: a single result row per input element
    fraction_rows = structure.map(_get_sec_struct_fractions)

    # convert to dataset
    return pythonRDDToDataset.get_dataset(fraction_rows, colNames)