Example #1
import sparsematrix  # The project's local sparse matrix module, used by the examples below.

def main(DIMACSFile):
    """Take a file in DIMACS format and turn it into a sparse matrix format.

    @param DIMACSFile: The location of a file in DIMACS format.
    @type DIMACSFile: string
    @return: The sparsematrix representation of the DIMACS format graph.
    @rtype: sparsematrix

    """

    toNodes = []
    fromNodes = []
    numNodes = 0

    readIn = open(DIMACSFile, 'r')

    for line in readIn:
        if line[0] == 'c':
            # Comment line.
            continue
        elif line[0] == 'p':
            # Problem line: 'p <format> <numNodes> <numEdges>'.
            chunks = line.split()
            numNodes = int(chunks[2])
        elif line[0] == 'e':
            # Edge line: 'e <fromNode> <toNode>', with 1-based node numbering.
            chunks = line.split()
            fromNodes.append(int(chunks[1]) - 1)
            toNodes.append(int(chunks[2]) - 1)

    readIn.close()

    adjacent = sparsematrix.sparse_matrix(numNodes)
    adjacent.addlist(fromNodes, toNodes)
    adjacent.addlist(toNodes, fromNodes)

    return adjacent
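
# --- Usage sketch (not part of the original example) ---
# A minimal, hypothetical illustration of how the DIMACS parser above could be driven.
# The file name 'toy_graph.dimacs' and its contents are made up for this sketch.
def _demo_dimacs_parse():
    exampleLines = ['c A toy graph with 3 nodes and 2 undirected edges.',
                    'p edge 3 2',
                    'e 1 2',
                    'e 2 3']
    writeOut = open('toy_graph.dimacs', 'w')
    writeOut.write('\n'.join(exampleLines) + '\n')
    writeOut.close()
    return main('toy_graph.dimacs')  # The symmetric sparse adjacency structure.
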
def main(similarities, cutoffPercent = 20, maxEValue = 1, minAlignLength = 20):
    """Create a sparse matrix from the processed PSI-BLAST output.
    
    @param similarities: The location of the processed PSI-BLAST output.
    @type similarities: string
    @param cutoffPercent: A pair of proteins with a percentage similarity > this value is deemed too similar.
    @type cutoffPercent:  integer
    @param maxEValue: The maximum permissible value for the E value of an alignment.
    @type maxEValue: float
    @param minAlignLength: The number of amino acids aligned in the query and the hit sequence
                           must be >= this value for the percentage similarity to be deemed significant.
    @type minAlignLength: integer
    @return: The sparsematrix representation of the too-similar protein graph, along with the sorted list of the proteins in it.
    @rtype: sparsematrix, list

    """
    
    proteinNames = []  # Store the names of all the proteins found to be too similar to another protein
    similarProteins = []  # Store the pairs that are too similar
    
    readSimilarities = open(similarities, 'r')
    
    for line in readSimilarities:
        
        chunks = line.split()
        if len(chunks) == 12:
            query = chunks[0]
            hit = chunks[4]
            percentage = chunks[9]
        elif len(chunks) == 13:
            query = chunks[0]
            hit = chunks[5]
            percentage = chunks[10]
        else:
            # Skip lines that do not match either expected format.
            continue
        
        # Ignore similarities where the query and the hit are the same protein, or where
        # the percentage similarity is <= cutoffPercent.
        invalid = (query == hit or
                   float(percentage) <= cutoffPercent
                   )
        # If the similarity is valid, record the proteins as being too similar.
        if not invalid:
            proteinNames.append(query)
            proteinNames.append(hit)
            similarProteins.append(tuple(sorted((query, hit))))
    
    readSimilarities.close()
    
    proteinNames = list(set(proteinNames))
    proteinNames.sort()
    similarProteins = list(set(similarProteins))
    indexDict = dict((proteinNames[x], x) for x in range(len(proteinNames)))
    
    # Create the sparse matrix
    adjacent = sparsematrix.sparse_matrix(len(proteinNames))
    xValues = [indexDict[x] for (x,y) in similarProteins]
    yValues = [indexDict[y] for (x,y) in similarProteins]
    adjacent.addlist(xValues, yValues)
    adjacent.addlist(yValues, xValues)
    
    return adjacent, proteinNames
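
# --- Illustration (not part of the original example) ---
# The name-to-index mapping and the pair of addlist calls above build an undirected
# similarity graph. A small sketch of the same pattern using only built-in types;
# the protein pairs in the commented call are hypothetical.
def _build_undirected_adjacency(similarPairs):
    names = sorted(set(name for pair in similarPairs for name in pair))
    indexOf = dict((name, index) for index, name in enumerate(names))
    adjacency = [set() for _ in names]  # One neighbour set per protein.
    for (a, b) in similarPairs:
        adjacency[indexOf[a]].add(indexOf[b])  # Add the edge in both directions so
        adjacency[indexOf[b]].add(indexOf[a])  # that the graph is undirected.
    return names, adjacency

# names, adjacency = _build_undirected_adjacency([('P1', 'P2'), ('P2', 'P3')])
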
def main(i, locForResults, dirPISCES, duplicates, resolutionData, fastaSequences, alignmentFile, minLength=20, maxLength=10000):
    """Compares the non-redundant dataset produced by Leaf under given parameters, with the one generated by PISCES.

    @param i: The gzipped list of non-redundant proteins generated by PISCES for the given parameters.
    @type i : string (file location)
    @param locForResults: The location where the results of the comparison should be written out.
    @type locForResults : string (file location)
    @param dirPISCES: The location of the directory that contains all of the culled lists generated by PISCES for all of the parameters.
    @type dirPISCES : string (directory location)
    @param duplicates: The location of the file containing the information about representative PDB chains (as determined by PISCES).
    @type duplicates : string (file location)
    @param resolutionData: The location of the file containing information about resolution and R Value for the chains.
    @type resolutionData : string (file location)
    @param fastaSequences: The file containing all the chains in the PDB in FASTA format.
    @type fastaSequences : string (file location)
    @param alignmentFile: The file containing the alignments for all the representative chains in the PDB (as calculated by PISCES).
    @type alignmentFile : string (file location)
    @param minLength: The minimum length that a protein chain can have (in terms of number of amino acids).
    @type minLength : integer
    @param maxLength: The maximum length that a protein chain can have (in terms of number of amino acids).
    @type maxLength : integer

    """

    resultsFile = open(locForResults, 'a')

    # The name of the PISCES file gives information such as the number of proteins in the non-redundant dataset, resolution and R Value.
    chunks = i.split('_')
    numberPISCESProteinsKept = int(chunks[-1][6:-3])  # The number of proteins kept by PISCES.
    seqIdenThreshold = int(chunks[1][2:])  # The percentage cutoff used.
    resolutionLimit = float(chunks[2][3:])  # The maximum resolution permitted.
    RFactorLimit = float(chunks[3][1:])  # The maximum R Value permitted.
    # Determine whether non-XRay structures were allowed, and whether structures containing only alpha carbons were allowed.
    if len(chunks) == 7:
        inclNotXray = True
        inclCAOnly = False
    elif len(chunks) == 8:
        inclNotXray = True
        inclCAOnly = True
    else:
        inclNotXray = False
        inclCAOnly = False

    # Extract resolution, alignment, representative and length information for all of the PDB chains.
    resolutionInfo = process_resolution(resolutionData)
    alignments = process_alignments(alignmentFile, seqIdenThreshold)
    redundant = process_redundant(duplicates)
    lengths = process_fasta(fastaSequences)

    print 'Now working on file: ', i

    validProteins = []  # The proteins that are valid for the parameter constraints (resolution, R Value etc.).

    # Go through resolution and select the identifiers of the proteins that meet the structural requirements.
    for p in lengths.keys():
        proteinResolution = resolutionInfo[p]['Resolution']
        proteinRFactor = resolutionInfo[p]['RFactor']
        proteinFreeRFactor = resolutionInfo[p]['FreeRFactor']
        if not inclNotXray:
            # If non-Xray structures are not being included:
            if resolutionInfo[p]['Experiment'] != 'XRAY' or proteinResolution == 'NA':
                # If the protein structure was not determined by Xray or the resolution is not known, then don't include it
                continue
        else:
            if resolutionInfo[p]['Experiment'] != 'XRAY':
                # Non-Xray structures are being included, so keep the protein but mark its
                # resolution and R factor values as unknown so that they do not trigger the
                # resolution and R factor filters below.
                proteinResolution = 'NA'
                proteinRFactor = 'NA'
                proteinFreeRFactor = 'NA'
        if not inclCAOnly and resolutionInfo[p]['CAOnly'] == 'yes':
            # If the structure is only CA, and that type of structure is not being considered, then don't include it.
            continue
        
        if proteinResolution != 'NA' and float(proteinResolution) > resolutionLimit:
            continue
        if lengths[p] < minLength or lengths[p] > maxLength:
            continue
        if proteinRFactor == 'NA':
            if not inclNotXray:
                continue
        elif float(proteinRFactor) > RFactorLimit:
            continue
        
        validProteins.append(p)

    # Free up memory used by the resolution and length dicts.
    del resolutionInfo
    del lengths
    
    # Go through the list of proteins that meet the structural requirements, and replace each
    # one that has a representative chain (as determined by PISCES) with that representative.
    nonRedundantProteins = set()
    for p in validProteins:
        if p in redundant:
            nonRedundantProteins.add(redundant[p])
        else:
            nonRedundantProteins.add(p)

    # Free up memory used by the redundant dict.
    del redundant
    
    # Go through the alignments and record all alignments for the proteins that remain.
    proteinNames = []
    similarProteins = []
    singletonProteins = []
    for p in nonRedundantProteins:
        singleton = True
        if p in alignments:
            # If there are alignments involving p then record the ones that involve other proteins in nonRedundantProteins.
            for h in alignments[p]:
                if h in nonRedundantProteins and alignments[p][h] > seqIdenThreshold:
                    singleton = False
                    # If the two proteins are too similar then record this.
                    proteinNames.append(p)
                    proteinNames.append(h)
                    similarProteins.append(tuple(sorted([p, h])))
        if singleton:
            singletonProteins.append(p)

    # Free up memory used by the alignments dict.
    del alignments
    
    proteinNames = list(set(proteinNames))
    proteinNames.sort()
    similarProteins = list(set(similarProteins))
    indexDict = dict((proteinNames[x], x) for x in range(len(proteinNames)))

    # Create the sparse matrix.
    adjacent = sparsematrix.sparse_matrix(len(proteinNames))
    xValues = [indexDict[x] for (x,y) in similarProteins]
    yValues = [indexDict[y] for (x,y) in similarProteins]
    adjacent.addlist(xValues, yValues)
    adjacent.addlist(yValues, xValues)

    # Perform the culling by Leaf.
    removedLeaf, proteinsToKeep, removeNode, nodesToKeep, timeTaken = CMLeaf.main(adjacent, proteinNames)

    # Calculate some simple statistics of the similarity graph (number of nodes, maximum
    # degree and mean degree) for reporting alongside the culling results.
    numOfNodes = len(proteinNames)
    degrees = {}
    for (x, y) in similarProteins:
        degrees[x] = degrees.get(x, 0) + 1
        degrees[y] = degrees.get(y, 0) + 1
    maxDegree = max(degrees.values()) if degrees else 0
    meanDegree = (2.0 * len(similarProteins)) / numOfNodes if numOfNodes else 0.0

    # Calculate and record the improvement of Leaf over PISCES.
    numberLeafProteinsKept = len(nonRedundantProteins) - len(removedLeaf)
    percentageImprovementLeaf = (float(numberLeafProteinsKept - numberPISCESProteinsKept) / numberPISCESProteinsKept) * 100
    resultsFile.write(i + '\t' + str(numberPISCESProteinsKept) + '\t' + str(numberLeafProteinsKept) + '\t' + str(percentageImprovementLeaf) + '\t' +
                      str(numOfNodes) + '\t' + str(maxDegree) + '\t' + str(meanDegree) + '\n')
    resultsFile.close()
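
# --- Interface sketch (not part of the original examples) ---
# All three examples above depend on the project's own sparsematrix module, which is not
# shown here. Only the two calls the examples make are assumed: a constructor that takes
# the number of nodes, and an addlist(xValues, yValues) method that records an edge for
# each paired entry of the two index lists. A minimal stand-in with that assumed
# interface, useful for experimenting with the snippets:
class _SparseMatrixStub(object):
    """An adjacency-set stand-in mimicking the assumed sparse_matrix interface."""

    def __init__(self, numNodes):
        self.adjacency = [set() for _ in range(numNodes)]

    def addlist(self, xValues, yValues):
        # Record an edge x -> y for each paired entry of the two index lists.
        for x, y in zip(xValues, yValues):
            self.adjacency[x].add(y)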