def getFractionOfPdbCovered(self):
    """
    Return the fraction of sequences in PDB that are matched by at least
    one substring in the subset of substrings that is being evaluated.

    Every read in C{self.pdbFile} is scanned by a backend configured from
    a database whose only landmark finder is C{'AC ' + self.structureType}.
    A read counts as matched if its scan produces at least one landmark.

    @return: The number of matched reads divided by the total number of
        reads (a value between 0.0 and 1.0). If C{self.pdbFile} contains
        no reads, 0.0 is returned instead of dividing by zero.
    """
    db = DatabaseSpecifier().getDatabaseFromKeywords(
        trigPoints=[],
        landmarks=['AC ' + self.structureType],
        acAlphaHelixFilename=self.acAlphaHelixFilename,
        acAlphaHelix310Filename=self.acAlphaHelix310Filename,
        acAlphaHelixCombinedFilename=self.acAlphaHelixCombinedFilename,
        acAlphaHelixPiFilename=self.acAlphaHelixPiFilename,
        acExtendedStrandFilename=self.acExtendedStrandFilename)

    backend = Backend()
    backend.configure(db.dbParams)

    hit = 0
    total = 0

    # Alphabet checking is switched off (checkAlphabet=0) when reading the
    # PDB file.
    for read in SSFastaReads(self.pdbFile, readClass=SSAAReadWithX,
                             checkAlphabet=0):
        total += 1
        scannedRead = backend.scan(read)
        if len(scannedRead.landmarks) > 0:
            hit += 1

    # An empty input file has no sequences to cover; avoid raising
    # ZeroDivisionError in that case.
    if total == 0:
        return 0.0

    return hit / total
def testTwoFiles(self):
    """
    Passing a list of two FASTA file names must yield the reads from
    both files, in the order the files were given.
    """
    class SideEffect(object):
        # Expected (file name, file lines) for each successive open call.
        _EXPECTED = (
            ('file1.fasta', ['>id1\n', 'ACTG\n', '>id1\n', 'hhhh\n']),
            ('file2.fasta', ['>id2\n', 'CAGT\n', '>id2\n', 'eeee\n']),
        )

        def __init__(self, test):
            self.test = test
            self.count = 0

        def sideEffect(self, filename, **kwargs):
            # A third (or later) call is an error.
            if self.count >= len(self._EXPECTED):
                self.test.fail('We are only supposed to be called twice!')
            expectedName, lines = self._EXPECTED[self.count]
            self.test.assertEqual(expectedName, filename)
            self.count += 1
            return File(lines)

    opener = SideEffect(self)
    with patch.object(builtins, 'open') as mockMethod:
        mockMethod.side_effect = opener.sideEffect
        reads = SSFastaReads(['file1.fasta', 'file2.fasta'])
        expected = [
            SSAARead('id1', 'ACTG', 'hhhh'),
            SSAARead('id2', 'CAGT', 'eeee'),
        ]
        self.assertEqual(expected, list(reads))
def testNoQuality(self):
    """
    Reads produced from a PDB FASTA file must have a quality of None.
    """
    fasta = '>seq1\nREDD\n>str1\nHH--'
    opener = mock_open(read_data=fasta)
    with patch.object(builtins, 'open', opener):
        firstRead = list(SSFastaReads(fasta))[0]
        self.assertIs(None, firstRead.quality)
def testOneRead(self):
    """
    A PDB FASTA file holding a single sequence/structure pair must
    produce exactly that one read.
    """
    fasta = '>seq1\nREDD\n>str1\nHH--'
    opener = mock_open(read_data=fasta)
    expected = [SSAARead('seq1', 'REDD', 'HH--')]
    with patch.object(builtins, 'open', opener):
        self.assertEqual(expected, list(SSFastaReads(fasta)))
def testEmpty(self):
    """
    Iterating over an empty PDB FASTA file must produce no reads.
    """
    emptyContent = ''
    opener = mock_open(read_data=emptyContent)
    with patch.object(builtins, 'open', opener):
        # The reads must be consumed while 'open' is still patched.
        self.assertEqual([], list(SSFastaReads(emptyContent)))
def testTypeDefaultsToSSAARead(self):
    """
    A PDB FASTA file whose type is not specified must result in reads that
    are instances of SSAARead.
    """
    data = '\n'.join(['>seq1', 'REDD', '>str1', 'HH--'])
    with patch.object(builtins, 'open', mock_open(read_data=data)):
        reads = list(SSFastaReads(data))
        # assertIsInstance reports the offending object and expected
        # class on failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(reads[0], SSAARead)
def testDontConvertLowerToUpperCaseIfNotSpecified(self):
    """
    When upper-casing is not requested, the case of a read's sequence
    and structure must be left exactly as found in the file.
    """
    fasta = '>seq1\nrrFF\n>str1\nHHee'
    opener = mock_open(read_data=fasta)
    expected = [SSAARead('seq1', 'rrFF', 'HHee')]
    with patch.object(builtins, 'open', opener):
        self.assertEqual(expected, list(SSFastaReads(fasta)))
def testConvertLowerToUpperCaseIfSpecified(self):
    """
    Passing upperCase=True must upper-case both the sequence and the
    structure of each read.
    """
    fasta = '>seq1\nrrrff\n>str1\nhheeh'
    opener = mock_open(read_data=fasta)
    expected = [SSAARead('seq1', 'RRRFF', 'HHEEH')]
    with patch.object(builtins, 'open', opener):
        converted = list(SSFastaReads(fasta, upperCase=True))
        self.assertEqual(expected, converted)
def testDisableAlphabetChecking(self):
    """
    Passing checkAlphabet=0 must switch off alphabet checking, so a read
    containing '-' characters is still returned.
    """
    fasta = '>seq1\nrr-rr\n>str1\nhh-hh'
    opener = mockOpen(read_data=fasta)
    with patch.object(builtins, 'open', opener):
        unchecked = list(SSFastaReads(fasta, checkAlphabet=0))
        self.assertEqual(1, len(unchecked))
def testOddNumberOfRecords(self):
    """
    Trying to parse a PDB FASTA file with an odd number of records must
    raise a ValueError.
    """
    data = '\n'.join(['>seq1', 'REDD', '>str1', 'HH--', '>seq2', 'REAA'])
    mockOpener = mockOpen(read_data=data)
    with patch.object(builtins, 'open', mockOpener):
        # A raw string is used so that '\.' is a regex escape and not an
        # invalid string escape sequence. The dot in the file name is
        # escaped too, so that it only matches a literal '.'.
        error = (r"^Structure file 'x\.fasta' has an odd number of "
                 r"records\.$")
        six.assertRaisesRegex(self, ValueError, error, list,
                              SSFastaReads('x.fasta'))
def testUnequalSequenceAndStructureLengths(self):
    """
    Trying to parse a PDB FASTA file that has a sequence whose structure
    is of a different length must raise a ValueError.
    """
    data = '\n'.join(
        ['>seq1', 'REDD', '>str1', 'HH--', '>seq2', 'REAA', '>str2', 'HH'])
    mockOpener = mockOpen(read_data=data)
    with patch.object(builtins, 'open', mockOpener):
        # Raw strings keep '\(', '\)' and '\.' as regex escapes instead of
        # invalid string escape sequences. The pattern is unchanged.
        error = (r"Sequence 'seq2' length \(4\) is not equal to structure "
                 r"'str2' length \(2\) in input file 'x\.fasta'\.$")
        six.assertRaisesRegex(self, ValueError, error, list,
                              SSFastaReads('x.fasta'))
def testReadClass(self):
    """
    A PDB FASTA file whose read class is something other than SSAARead
    must result in reads that are instances of that class.
    """
    class ReadClass:
        # Minimal stand-in read class: accepts the three constructor
        # arguments and ignores them.
        def __init__(self, id, sequence, structure):
            pass

    data = '\n'.join(['>seq1', 'RRRR', '>str1', 'HHHH'])
    with patch.object(builtins, 'open', mock_open(read_data=data)):
        reads = list(SSFastaReads(data, readClass=ReadClass))
        # assertIsInstance reports the offending object and expected
        # class on failure, unlike assertTrue(isinstance(...)).
        self.assertIsInstance(reads[0], ReadClass)
def testTwoReads(self):
    """
    A PDB FASTA file containing two sequence/structure pairs must yield
    two reads, in the order they appear in the file.
    """
    fasta = ('>seq1\nREDD\n>str1\nHH--\n'
             '>seq2\nREAA\n>str2\nHHEE')
    opener = mockOpen(read_data=fasta)
    expected = [
        SSAARead('seq1', 'REDD', 'HH--'),
        SSAARead('seq2', 'REAA', 'HHEE'),
    ]
    with patch.object(builtins, 'open', opener):
        result = list(SSFastaReads(fasta))
        self.assertEqual(2, len(result))
        self.assertEqual(expected, result)
def testOnlyCheckSomeAlphabets(self):
    """
    With checkAlphabet=1 only the first read has its alphabet checked,
    so a non-alphabetic character in the second read must not prevent
    that read from being returned.
    """
    fasta = ('>seq1\nrrrr\n>str1\nhhhh\n'
             '>seq2\nr-rr\n>str2\nh-hh')
    opener = mockOpen(read_data=fasta)
    with patch.object(builtins, 'open', opener):
        result = list(SSFastaReads(fasta, checkAlphabet=1))
        self.assertEqual(2, len(result))
        secondRead = result[1]
        self.assertEqual('r-rr', secondRead.sequence)
def testAlphabetIsCheckedAndRaisesValueErrorOnFirstRead(self):
    """
    The default behavior of a SSFastaReads instance is to check to ensure
    its sequences have the correct alphabet and to raise ValueError if not.
    A non-alphabetic character in the first read must be detected.
    """
    data = '\n'.join(['>seq1', 'at-at', '>str1', 'HH-HH'])
    # Raw strings keep '\(', '\)' and '\.' as regex escapes instead of
    # invalid string escape sequences. The pattern is unchanged.
    error = (r"^Read alphabet \('-AT'\) is not a subset of expected "
             r"alphabet \('ACDEFGHIKLMNPQRSTVWY'\) for read class "
             r"SSAARead\.$")
    mockOpener = mockOpen(read_data=data)
    with patch.object(builtins, 'open', mockOpener):
        six.assertRaisesRegex(self, ValueError, error, list,
                              SSFastaReads(data))
help='The type of structure that should be extracted.') args = parser.parse_args() finder = findLandmark(args.featureType)() dropStructure = args.dropStructure margin = args.margin if margin < 0: raise ValueError('Margin must be non-negative.') # The ss.txt file available at PDB has sequences that contain (at least) # 'X' and 'U'. So, for now, read it without checking sequence alphabet. for read in SSFastaReads(sys.stdin, checkAlphabet=0): for feature in finder.findWithMargin(read, margin): # Drop the ':sequence' suffix from read ids and add information # about the (1-based) offsets at which this feature was found. start = feature.offset - margin end = feature.offset + feature.length + margin readId = read.id.replace(':sequence', '') + ':%d-%d' % (start + 1, end) if dropStructure: featureWithMargin = AARead(readId, read.sequence[start:end]) else: featureWithMargin = read[start:end] featureWithMargin.id = readId print(featureWithMargin.toString(format_='fasta'), end='')
from os.path import dirname, join

from .bitScores import BIT_SCORES
from .zScores import Z_SCORES

from light.performance import data

from dark.fasta_ss import SSFastaReads
from dark.reads import SSAAReadWithX

# The data set name, which is also the name of its directory alongside the
# light.performance.data module.
DATASET = 'pdb_2hla_a'

_DIR = join(dirname(data.__file__), DATASET)

# Load the query and subject reads (in that order) from the data set
# directory. Both files are read completely at import time.
QUERIES, SUBJECTS = (
    list(SSFastaReads(join(_DIR, basename), readClass=SSAAReadWithX))
    for basename in ('queries.fasta', 'subjects.fasta'))

_ = (BIT_SCORES, Z_SCORES)  # Keep pyflakes quiet.
from os.path import dirname, join

from .bitScores import BIT_SCORES
from .zScores import Z_SCORES

from light.performance import data

from dark.fasta_ss import SSFastaReads
from dark.reads import SSAAReadWithX

# The data set name, which is also the name of its directory alongside the
# light.performance.data module.
DATASET = 'ha'

_DIR = join(dirname(data.__file__), DATASET)

# Load the query and subject reads (in that order) from the data set
# directory. Both files are read completely at import time.
QUERIES, SUBJECTS = (
    list(SSFastaReads(join(_DIR, basename), readClass=SSAAReadWithX))
    for basename in ('queries.fasta', 'subjects.fasta'))

_ = (BIT_SCORES, Z_SCORES)  # Keep pyflakes quiet.
raise ValueError( 'Sequence id %r found in multiple categories (%s) ' 'in %r' % (sequenceId, ', '.join( sorted(sequenceIdToCategories[sequenceId])), args.categories)) sequenceIdToCategories[sequenceId].add(category) # Read the PDB sequence information and add each sequence to its category. # # Sequence ids must be in the form e.g., pdb_2hla_a as produced by # clean-pdb-ss-fasta.py (in this directory). sequencesByCategory = defaultdict(Reads) for sequence in SSFastaReads(sys.stdin, readClass=SSAAReadWithX, checkAlphabet=0): pdb, sequenceId, chain = sequence.id.split('_') assert pdb == 'pdb' and len(chain) == 1, ( 'Unrecognized PDB id %r found on stdin.' % sequence.id) if args.keepChain: sequenceId += '_' + chain if sequenceId in sequenceIdToCategories: for category in sequenceIdToCategories[sequenceId]: sequencesByCategory[category].add(sequence) else: if not args.ignoreUncategorizedSequences: print('Sequence %r on stdin is not in any category.' % sequence.id, file=sys.stderr)
help=('A file of (1-based) sequence numbers to retain. Numbers must ' 'be one per line.')) args = parser.parse_args() if args.readClass == 'fastq': # TODO: FastqReads should take a checkAlphabet argument, in the way # that FastaReads does. reads = FastqReads(sys.stdin) elif args.readClass == 'fasta': reads = FastaReads(sys.stdin, checkAlphabet=False) else: # args.readClass must be fasta-ss due to the 'choices' argument # passed to parser.add_argument value above. assert args.readClass == 'fasta-ss' reads = SSFastaReads(sys.stdin, checkAlphabet=False) saveAs = args.saveAs or args.readClass # Check for incompatible read/write formats. We can't write FASTQ # unless we have FASTQ on input (else we won't have quality # information), and we can't write PDB FASTA with secondary structure # information unless we have that on input. if saveAs == 'fastq' and args.readClass != 'fastq': raise ValueError( 'You have specified --saveAs fastq without using --readClass ' 'fastq to indicate that the input is FASTQ. Please be explicit.') elif saveAs == 'fasta-ss' and args.readClass != 'fasta-ss': raise ValueError( 'You have specified --saveAs fasta-ss without using --readClass ' 'fasta-ss to indicate that the input is PDB FASTA. Please be '
tooShort = set() poorResolution = set() nmr = set() minLength = args.minLength maxResolution = args.maxResolution discardNMR = args.discardNMR # The NMR resolution is assigned to PDB structures that were obtained via # NMR (as opposed to crystallization). For now we keep all such # structures. This is briefly mentioned at # http://www.rcsb.org/pdb/static.do?p=general_information/about_pdb/\ # summaries.html NMR_RESOLUTION = -1.0 for sequence in SSFastaReads(sys.stdin, checkAlphabet=0): pdbId = sequence.id.split(':', maxsplit=1)[0] # Check if this sequence has been deleted from PDB. if pdbDeletions: if pdbId in pdbDeletions: deleted.add(pdbId) continue # Check if the resolution on this sequence is good (i.e., numerically # low) enough. if pdbResolutions: try: resolution = pdbResolutions[pdbId] except KeyError: print('PDB id %r has unknown resolution!' % pdbId, file=sys.stderr)
type=bool, help=('If True the evaluateMatchNoPrefix function will be used to ' 'evaluate the helix. If False use the evaluateMatch function.')) parser.add_argument( '--structureType', default='H', choices={'H', 'G', 'I', 'E', 'K'}, help=('The type of structure that should be evaluated against. ' 'H: Alpha helix, G: Alpha helix 3 10, I: Alpha helix pi, E: ' 'Extended strand, K: Combined alpha helix.')) args = parser.parse_args() pdbReads = [(read.sequence, read.structure) for read in SSFastaReads(args.pdbFile, checkAlphabet=0)] helices = FastaReads(sys.stdin, readClass=AAReadWithX, checkAlphabet=0) if args.evaluateNoPrefix: evaluationFunction = evaluateMatchNoPrefix else: evaluationFunction = evaluateMatch for i, helix in enumerate(helices): truePositive = falsePositive = 0 helixSequence = helix.sequence if ('X' in helixSequence or 'Z' in helixSequence or 'B' in helixSequence): continue else: uniqueRegex = re.compile(helixSequence) for sequence, structure in pdbReads:
if args.printParams: print('DATABASE PARAMETERS FOR STRUCTURE %r' % structureName) print(dbParams.print_(margin=' ')) print('FIND PARAMETERS FOR STRUCTURE %r' % structureName) print(findParams.print_(margin=' ')) # Set up the database. database = Database(dbParams) backend = Backend() backend.configure(database.dbParams) # Read the sequence out of the PDB ss.txt file. chains = Reads() sequenceFile = join(dirname(light.__file__), '..', 'data', 'pdb-20160303-ss.txt') for record in SSFastaReads(sequenceFile, checkAlphabet=0): if structureName in record.id: chainName = record.id.split('_')[2].lower() chains.add(SSAAReadWithX(chainName, record.sequence, record.structure)) assert len(chains) > 0, ('%r does not contain any sequences with id %r' % (sequenceFile, structureName)) if first: firstStructureName = structureName + '1' structureName += '1' # Load the structure into PyMOL. cmd.load(structureFile, structureName) # Set the display.