Пример #1
0
    def getFractionOfPdbCovered(self):
        """
        Return the fraction of sequences in PDB that are matched by at least
        one substring in the subset of substrings that is being evaluated.

        A database is specified with no trig point finders and a single
        landmark finder, named 'AC ' followed by self.structureType. Every
        read in self.pdbFile is scanned by a backend configured with that
        database's parameters, and a read counts as a hit if the scan
        produced at least one landmark.

        Returns the number of hits divided by the number of reads. If
        self.pdbFile contains no reads, 0.0 is returned (rather than raising
        ZeroDivisionError).
        """
        db = DatabaseSpecifier().getDatabaseFromKeywords(
            trigPoints=[],
            landmarks=['AC ' + self.structureType],
            acAlphaHelixFilename=self.acAlphaHelixFilename,
            acAlphaHelix310Filename=self.acAlphaHelix310Filename,
            acAlphaHelixCombinedFilename=self.acAlphaHelixCombinedFilename,
            acAlphaHelixPiFilename=self.acAlphaHelixPiFilename,
            acExtendedStrandFilename=self.acExtendedStrandFilename)

        backend = Backend()
        backend.configure(db.dbParams)

        hit = 0
        total = 0

        # The alphabet is not checked (checkAlphabet=0) when reading.
        for read in SSFastaReads(self.pdbFile,
                                 readClass=SSAAReadWithX,
                                 checkAlphabet=0):
            total += 1
            scannedRead = backend.scan(read)
            if len(scannedRead.landmarks) > 0:
                hit += 1

        if total == 0:
            # No reads at all: nothing was covered, and there is nothing to
            # divide by.
            return 0.0

        return hit / total
Пример #2
0
    def testTwoFiles(self):
        """
        Passing a list of two FASTA file names must result in the reads
        from both files being returned, in file order.
        """
        # Each permitted call to open, in order: the file name that must be
        # requested and the lines the fake file will then provide.
        expectedOpens = iter([
            ('file1.fasta', ['>id1\n', 'ACTG\n', '>id1\n', 'hhhh\n']),
            ('file2.fasta', ['>id2\n', 'CAGT\n', '>id2\n', 'eeee\n']),
        ])

        def fakeOpen(filename, **kwargs):
            nextOpen = next(expectedOpens, None)
            if nextOpen is None:
                self.fail('We are only supposed to be called twice!')
            expectedFilename, lines = nextOpen
            self.assertEqual(expectedFilename, filename)
            return File(lines)

        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = fakeOpen
            reads = list(SSFastaReads(['file1.fasta', 'file2.fasta']))

        self.assertEqual(
            [SSAARead('id1', 'ACTG', 'hhhh'), SSAARead('id2', 'CAGT', 'eeee')],
            reads)
Пример #3
0
 def testNoQuality(self):
     """
     Reads parsed from a PDB FASTA file must have no quality information
     (their quality attribute must be None).
     """
     data = '\n'.join(('>seq1', 'REDD', '>str1', 'HH--'))
     opener = mock_open(read_data=data)
     with patch.object(builtins, 'open', opener):
         firstRead = list(SSFastaReads(data))[0]
         self.assertIsNone(firstRead.quality)
Пример #4
0
 def testOneRead(self):
     """
     A PDB FASTA file holding a single sequence/structure pair must be
     parsed into exactly one read with that sequence and structure.
     """
     data = '\n'.join(('>seq1', 'REDD', '>str1', 'HH--'))
     expected = [SSAARead('seq1', 'REDD', 'HH--')]
     opener = mock_open(read_data=data)
     with patch.object(builtins, 'open', opener):
         self.assertEqual(expected, list(SSFastaReads(data)))
Пример #5
0
 def testEmpty(self):
     """
     Iterating over an empty PDB FASTA file must produce no reads.
     """
     opener = mock_open(read_data='')
     with patch.object(builtins, 'open', opener):
         self.assertEqual([], list(SSFastaReads('')))
Пример #6
0
 def testTypeDefaultsToSSAARead(self):
     """
     When no read class is given, the reads produced from a PDB FASTA file
     must be SSAARead instances.
     """
     data = '\n'.join(('>seq1', 'REDD', '>str1', 'HH--'))
     opener = mock_open(read_data=data)
     with patch.object(builtins, 'open', opener):
         firstRead = list(SSFastaReads(data))[0]
         self.assertIsInstance(firstRead, SSAARead)
Пример #7
0
 def testDontConvertLowerToUpperCaseIfNotSpecified(self):
     """
     Unless upper-casing is asked for, the case of a read's sequence and
     structure must be left exactly as it appears in the file.
     """
     data = '\n'.join(('>seq1', 'rrFF', '>str1', 'HHee'))
     expected = [SSAARead('seq1', 'rrFF', 'HHee')]
     opener = mock_open(read_data=data)
     with patch.object(builtins, 'open', opener):
         self.assertEqual(expected, list(SSFastaReads(data)))
Пример #8
0
 def testConvertLowerToUpperCaseIfSpecified(self):
     """
     When upperCase=True is passed, both the sequence and the structure of
     a read must be converted to upper case.
     """
     data = '\n'.join(('>seq1', 'rrrff', '>str1', 'hheeh'))
     expected = [SSAARead('seq1', 'RRRFF', 'HHEEH')]
     opener = mock_open(read_data=data)
     with patch.object(builtins, 'open', opener):
         self.assertEqual(expected, list(SSFastaReads(data, upperCase=True)))
Пример #9
0
 def testDisableAlphabetChecking(self):
     """
     Passing checkAlphabet=0 to SSFastaReads must turn alphabet checking
     off, so that a read containing a '-' character is still returned.
     """
     data = '\n'.join(('>seq1', 'rr-rr', '>str1', 'hh-hh'))
     with patch.object(builtins, 'open', mockOpen(read_data=data)):
         reads = list(SSFastaReads(data, checkAlphabet=0))
         self.assertEqual(1, len(reads))
Пример #10
0
 def testOddNumberOfRecords(self):
     """
     Trying to parse a PDB FASTA file with an odd number of records must
     raise a ValueError.
     """
     # Three records: a complete sequence/structure pair followed by a
     # sequence record that has no accompanying structure record.
     data = '\n'.join(['>seq1', 'REDD', '>str1', 'HH--', '>seq2', 'REAA'])
     mockOpener = mockOpen(read_data=data)
     with patch.object(builtins, 'open', mockOpener):
         # A raw string keeps the regex backslash escapes from being treated
         # as (invalid) string escapes. The '.' in the file name is escaped
         # so that it only matches a literal dot.
         error = r"^Structure file 'x\.fasta' has an odd number of records\.$"
         # SSFastaReads is only consumed when list() is called on it, so
         # the error is expected from the list call.
         six.assertRaisesRegex(self, ValueError, error, list,
                               SSFastaReads('x.fasta'))
Пример #11
0
 def testUnequalSequenceAndStructureLengths(self):
     """
     Trying to parse a PDB FASTA file that has a sequence whose structure
     is of a different length must raise a ValueError.
     """
     # The second pair is the bad one: sequence 'REAA' has length 4 but its
     # structure 'HH' has length 2.
     data = '\n'.join(
         ['>seq1', 'REDD', '>str1', 'HH--', '>seq2', 'REAA', '>str2', 'HH'])
     mockOpener = mockOpen(read_data=data)
     with patch.object(builtins, 'open', mockOpener):
         # Raw strings keep the regex backslash escapes from being treated
         # as (invalid) string escapes. The pattern itself is unchanged.
         error = (r"Sequence 'seq2' length \(4\) is not equal to structure "
                  r"'str2' length \(2\) in input file 'x\.fasta'\.$")
         six.assertRaisesRegex(self, ValueError, error, list,
                               SSFastaReads('x.fasta'))
Пример #12
0
    def testReadClass(self):
        """
        When a readClass other than SSAARead is passed, the reads produced
        from a PDB FASTA file must be instances of that class.
        """
        class CustomRead:
            # Accepts (and ignores) the same arguments as the original
            # stand-in read class.
            def __init__(self, id, sequence, structure):
                pass

        data = '\n'.join(('>seq1', 'RRRR', '>str1', 'HHHH'))
        opener = mock_open(read_data=data)
        with patch.object(builtins, 'open', opener):
            firstRead = list(SSFastaReads(data, readClass=CustomRead))[0]
            self.assertIsInstance(firstRead, CustomRead)
Пример #13
0
 def testTwoReads(self):
     """
     A PDB FASTA file holding two sequence/structure pairs must be parsed
     into two reads, returned in the order they appear in the file.
     """
     data = '\n'.join(('>seq1', 'REDD', '>str1', 'HH--',
                       '>seq2', 'REAA', '>str2', 'HHEE'))
     expected = [SSAARead('seq1', 'REDD', 'HH--'),
                 SSAARead('seq2', 'REAA', 'HHEE')]
     with patch.object(builtins, 'open', mockOpen(read_data=data)):
         reads = list(SSFastaReads(data))
         self.assertEqual(2, len(reads))
         self.assertEqual(expected, reads)
Пример #14
0
 def testOnlyCheckSomeAlphabets(self):
     """
     It must be possible to limit alphabet checking to a given number of
     reads. With checkAlphabet=1, a non-alphabetic character in the second
     read must not prevent that read from being returned.
     """
     data = '\n'.join((
         '>seq1', 'rrrr', '>str1', 'hhhh', '>seq2', 'r-rr', '>str2', 'h-hh',
     ))
     with patch.object(builtins, 'open', mockOpen(read_data=data)):
         reads = list(SSFastaReads(data, checkAlphabet=1))
         self.assertEqual(2, len(reads))
         secondRead = reads[1]
         self.assertEqual('r-rr', secondRead.sequence)
Пример #15
0
 def testAlphabetIsCheckedAndRaisesValueErrorOnFirstRead(self):
     """
     The default behavior of a SSFastaReads instance is to check to ensure
     its sequences have the correct alphabet and to raise ValueError if not.
     A non-alphabetic character in the first read must be detected.
     """
     data = '\n'.join(['>seq1', 'at-at', '>str1', 'HH-HH'])
     # Raw strings keep the regex backslash escapes from being treated as
     # (invalid) string escapes. The pattern itself is unchanged.
     error = (r"^Read alphabet \('-AT'\) is not a subset of expected "
              r"alphabet \('ACDEFGHIKLMNPQRSTVWY'\) for read class "
              r"SSAARead\.$")
     mockOpener = mockOpen(read_data=data)
     with patch.object(builtins, 'open', mockOpener):
         six.assertRaisesRegex(self, ValueError, error, list,
                               SSFastaReads(data))
                    help='The type of structure that should be extracted.')

args = parser.parse_args()

finder = findLandmark(args.featureType)()

margin = args.margin
dropStructure = args.dropStructure

if margin < 0:
    raise ValueError('Margin must be non-negative.')

# The ss.txt file available at PDB has sequences that contain (at least)
# 'X' and 'U'. So, for now, read it without checking sequence alphabet.

for read in SSFastaReads(sys.stdin, checkAlphabet=0):
    for feature in finder.findWithMargin(read, margin):

        # Offsets (0-based, end exclusive) of the feature widened by the
        # margin on both sides.
        start = feature.offset - margin
        end = feature.offset + feature.length + margin

        # The new id is the read id without its ':sequence' suffix,
        # followed by the (1-based) offsets at which the feature was found.
        baseId = read.id.replace(':sequence', '')
        readId = baseId + ':%d-%d' % (start + 1, end)

        if not dropStructure:
            # Slice the read itself and relabel the slice.
            featureWithMargin = read[start:end]
            featureWithMargin.id = readId
        else:
            # Keep only the amino acid sequence.
            featureWithMargin = AARead(readId, read.sequence[start:end])

        print(featureWithMargin.toString(format_='fasta'), end='')
Пример #17
0
from os.path import dirname, join

from .bitScores import BIT_SCORES
from .zScores import Z_SCORES

from light.performance import data

from dark.fasta_ss import SSFastaReads
from dark.reads import SSAAReadWithX

# Name of this data set. It is also the name of the sub-directory (of the
# directory containing the light.performance.data module) that is read
# from below.
DATASET = 'pdb_2hla_a'

# Directory holding this data set's FASTA files.
_DIR = join(dirname(data.__file__), DATASET)

# The query reads, read in full from queries.fasta when this module is
# imported.
QUERIES = list(
    SSFastaReads(join(_DIR, 'queries.fasta'), readClass=SSAAReadWithX))

# The subject reads, read in full from subjects.fasta when this module is
# imported.
SUBJECTS = list(
    SSFastaReads(join(_DIR, 'subjects.fasta'), readClass=SSAAReadWithX))

# BIT_SCORES and Z_SCORES are not otherwise used in this module; they are
# presumably imported so that they can be accessed via it (TODO confirm).
_ = (BIT_SCORES, Z_SCORES)  # Keep pyflakes quiet.
Пример #18
0
from os.path import dirname, join

from .bitScores import BIT_SCORES
from .zScores import Z_SCORES

from light.performance import data

from dark.fasta_ss import SSFastaReads
from dark.reads import SSAAReadWithX

# Name of this data set. It is also the name of the sub-directory (of the
# directory containing the light.performance.data module) that is read
# from below.
DATASET = 'ha'

# Directory holding this data set's FASTA files.
_DIR = join(dirname(data.__file__), DATASET)

# The query reads, read in full from queries.fasta when this module is
# imported.
QUERIES = list(SSFastaReads(join(_DIR, 'queries.fasta'),
                            readClass=SSAAReadWithX))

# The subject reads, read in full from subjects.fasta when this module is
# imported.
SUBJECTS = list(SSFastaReads(join(_DIR, 'subjects.fasta'),
                             readClass=SSAAReadWithX))

# BIT_SCORES and Z_SCORES are not otherwise used in this module; they are
# presumably imported so that they can be accessed via it (TODO confirm).
_ = (BIT_SCORES, Z_SCORES)  # Keep pyflakes quiet.
Пример #19
0
                        raise ValueError(
                            'Sequence id %r found in multiple categories (%s) '
                            'in %r' % (sequenceId, ', '.join(
                                sorted(sequenceIdToCategories[sequenceId])),
                                       args.categories))
            sequenceIdToCategories[sequenceId].add(category)

# Read the PDB sequence information and add each sequence to its category.
#
# Sequence ids must be in the form e.g., pdb_2hla_a as produced by
# clean-pdb-ss-fasta.py (in this directory).

sequencesByCategory = defaultdict(Reads)

for sequence in SSFastaReads(sys.stdin,
                             readClass=SSAAReadWithX,
                             checkAlphabet=0):
    # Validate the id explicitly. An assert would be stripped when Python
    # is run with -O, and unpacking the split result directly would give an
    # unhelpful error for ids that do not have exactly three '_'-separated
    # fields.
    idFields = sequence.id.split('_')
    if (len(idFields) != 3 or idFields[0] != 'pdb' or
            len(idFields[2]) != 1):
        raise ValueError(
            'Unrecognized PDB id %r found on stdin.' % sequence.id)
    pdb, sequenceId, chain = idFields

    # Categories are looked up by the bare sequence id, or by id plus chain
    # if chains are being kept.
    if args.keepChain:
        sequenceId += '_' + chain

    if sequenceId in sequenceIdToCategories:
        # A sequence is added to every category it belongs to.
        for category in sequenceIdToCategories[sequenceId]:
            sequencesByCategory[category].add(sequence)
    elif not args.ignoreUncategorizedSequences:
        print('Sequence %r on stdin is not in any category.' % sequence.id,
              file=sys.stderr)
Пример #20
0
        help=('A file of (1-based) sequence numbers to retain. Numbers must '
              'be one per line.'))

    args = parser.parse_args()

    if args.readClass == 'fastq':
        # TODO: FastqReads should take a checkAlphabet argument, in the way
        # that FastaReads does.
        reads = FastqReads(sys.stdin)
    elif args.readClass == 'fasta':
        reads = FastaReads(sys.stdin, checkAlphabet=False)
    else:
        # args.readClass must be fasta-ss due to the 'choices' argument
        # passed to parser.add_argument value above.
        assert args.readClass == 'fasta-ss'
        reads = SSFastaReads(sys.stdin, checkAlphabet=False)

    saveAs = args.saveAs or args.readClass

    # Check for incompatible read/write formats. We can't write FASTQ
    # unless we have FASTQ on input (else we won't have quality
    # information), and we can't write PDB FASTA with secondary structure
    # information unless we have that on input.
    if saveAs == 'fastq' and args.readClass != 'fastq':
        raise ValueError(
            'You have specified --saveAs fastq without using --readClass '
            'fastq to indicate that the input is FASTQ. Please be explicit.')
    elif saveAs == 'fasta-ss' and args.readClass != 'fasta-ss':
        raise ValueError(
            'You have specified --saveAs fasta-ss without using --readClass '
            'fasta-ss to indicate that the input is PDB FASTA. Please be '
Пример #21
0
# Initially empty sets. Presumably these collect the PDB ids rejected for
# each reason by the loop below (TODO confirm).
tooShort = set()
poorResolution = set()
nmr = set()

# Local names for the command-line settings.
minLength = args.minLength
maxResolution = args.maxResolution
discardNMR = args.discardNMR

# The NMR resolution is assigned to PDB structures that were obtained via
# NMR (as opposed to crystallization). For now we keep all such
# structures. This is briefly mentioned at
# http://www.rcsb.org/pdb/static.do?p=general_information/about_pdb/\
# summaries.html
#
# NOTE(review): a discardNMR setting is read above, so the "we keep all
# such structures" statement may be out of date - confirm.
NMR_RESOLUTION = -1.0

for sequence in SSFastaReads(sys.stdin, checkAlphabet=0):
    pdbId = sequence.id.split(':', maxsplit=1)[0]

    # Check if this sequence has been deleted from PDB.
    if pdbDeletions:
        if pdbId in pdbDeletions:
            deleted.add(pdbId)
            continue

    # Check if the resolution on this sequence is good (i.e., numerically
    # low) enough.
    if pdbResolutions:
        try:
            resolution = pdbResolutions[pdbId]
        except KeyError:
            print('PDB id %r has unknown resolution!' % pdbId, file=sys.stderr)
Пример #22
0
    type=bool,
    help=('If True the evaluateMatchNoPrefix function will be used to '
          'evaluate the helix. If False use the evaluateMatch function.'))

# The choices are given as a tuple (not a set) so that they are always
# listed in the same order in the usage, help and error messages that
# argparse produces.
parser.add_argument(
    '--structureType',
    default='H',
    choices=('H', 'G', 'I', 'E', 'K'),
    help=('The type of structure that should be evaluated against. '
          'H: Alpha helix, G: Alpha helix 3 10, I: Alpha helix pi, E: '
          'Extended strand, K: Combined alpha helix.'))

args = parser.parse_args()

# All (sequence, structure) pairs from the PDB file, held in memory. The
# alphabet is not checked when reading.
pdbReads = [
    (pdbRead.sequence, pdbRead.structure)
    for pdbRead in SSFastaReads(args.pdbFile, checkAlphabet=0)
]

# The helices to evaluate are read from standard input.
helices = FastaReads(sys.stdin, readClass=AAReadWithX, checkAlphabet=0)

# Choose the function used to evaluate each helix.
evaluationFunction = (
    evaluateMatchNoPrefix if args.evaluateNoPrefix else evaluateMatch)

for i, helix in enumerate(helices):
    truePositive = falsePositive = 0
    helixSequence = helix.sequence
    if ('X' in helixSequence or 'Z' in helixSequence or 'B' in helixSequence):
        continue
    else:
        uniqueRegex = re.compile(helixSequence)
        for sequence, structure in pdbReads:
        if args.printParams:
            print('DATABASE PARAMETERS FOR STRUCTURE %r' % structureName)
            print(dbParams.print_(margin='  '))
            print('FIND PARAMETERS FOR STRUCTURE %r' % structureName)
            print(findParams.print_(margin='  '))

        # Set up the database.
        database = Database(dbParams)
        backend = Backend()
        backend.configure(database.dbParams)

        # Read the sequence out of the PDB ss.txt file.
        chains = Reads()
        sequenceFile = join(dirname(light.__file__),
                            '..', 'data', 'pdb-20160303-ss.txt')
        for record in SSFastaReads(sequenceFile, checkAlphabet=0):
            if structureName in record.id:
                chainName = record.id.split('_')[2].lower()
                chains.add(SSAAReadWithX(chainName, record.sequence,
                                         record.structure))

        assert len(chains) > 0, ('%r does not contain any sequences with id %r'
                                 % (sequenceFile, structureName))
        if first:
            firstStructureName = structureName + '1'
            structureName += '1'

        # Load the structure into PyMOL.
        cmd.load(structureFile, structureName)

        # Set the display.