예제 #1
0
    def __init__(self, inStr):
        """Build a sequence object by parsing the text of one sequence entry.

        ``inStr`` is matched against ``sequencePattern``. On a match the
        header fields, the features and the residues are stored on the
        instance, the residues are split into a nucleotide part and a
        protein part, and ``successfullyParsed`` is set to True.

        Raises:
            SeqlException: if ``inStr`` does not match ``sequencePattern``.
        """
        # Defaults; they are in place before any parsing is attempted.
        self.successfullyParsed = False
        self.features = []
        self.residues_nuc = '-'
        self.residues_prt = '-'
        self.translations = []

        self.actualSeqIdNo = 0
        self.actualMolType = '-'
        self.actualLength = 0
        self.mixedMode = False
        self.isSkipCode = False

        parsed = sequencePattern.match(inStr)
        if not parsed:
            raise SeqlException('Parser failed for input:\n%s' % inStr)

        # Header fields, stripped through safeStrip.
        self.seqIdNo = safeStrip(parsed.group('seqIdNo'))
        self.length = safeStrip(parsed.group('length'))
        self.molType = safeStrip(parsed.group('molType'))
        self.organism = safeStrip(parsed.group('organism'))

        featureText = safeStrip(parsed.group('featureTable'))

        self.seqNo400 = safeStrip(parsed.group('seqNo400'))

        # One Feature per featurePattern match in the feature table.
        if featureText:
            self.features.extend(
                Feature(hit) for hit in re.finditer(featurePattern, featureText))

        # Lines matching nucPattern are nucleotide lines; every other line is
        # treated as protein.
        # TODO: add more robust code (e.g. check protein lines with prtPattern)
        nucLines = []
        prtLines = []
        for row in parsed.group('residues').splitlines():
            bucket = nucLines if nucPattern.match(row) else prtLines
            bucket.append(row)

        # Join the lines, then drop whitespace, commas and digits.
        self.residues_nuc = re.sub(r'[\s,\d]', '', ''.join(nucLines))
        self.residues_prt = re.sub(r'[\s,\d]', '', ''.join(prtLines))

        # Mixed mode: both residue kinds present. Skip code: neither present.
        self.mixedMode = bool(self.residues_nuc) and bool(self.residues_prt)
        self.isSkipCode = not self.residues_nuc and not self.residues_prt

        if self.mixedMode:
            # Walk the protein residues left to right, handing each CDS
            # feature the next slice.
            # NOTE(review): presumably getRangeFromLocation returns an
            # inclusive (start, end) pair, hence the +1 -- confirm.
            offset = 0
            for feat in self.features:
                if feat.key == 'CDS':
                    span = su.getRangeFromLocation(feat.location)
                    stop = offset + (span[1] - span[0]) + 1
                    piece = self.residues_prt[offset:stop]
                    offset = stop
                    self.translations.append(piece)
                    feat.translation = piece

        self.__setActualMolType__()
        self.__setActualLength__()
        self.successfullyParsed = True  # TODO: add a unit test for the False case
예제 #2
0
    def __init__(self, aStr):
        """Build a sequence object by parsing the text of one sequence entry.

        ``aStr`` is matched against ``SEQUENCE_PATTERN``. On a match the raw
        and stripped header fields, the features and the residues are stored
        on the instance, the residues are split into a nucleotide part and a
        protein part, and ``successfullyParsed`` is set to True.

        If there is no match, a diagnostic line is printed and the instance
        keeps only the defaults set below: ``successfullyParsed`` stays False
        and the parsed attributes (``seqIdNo``, ``length``, ``molType``, ...)
        are never assigned, so callers should check ``successfullyParsed``
        before reading them.

        :param aStr: text of a single sequence entry.
        """
        # Defaults; they are in place before any parsing is attempted.
        self.successfullyParsed = False
        self.features = []
        self.residues_nuc = '-'
        self.residues_prt = '-'
        self.translations = []

        self.actualSeqIdNo = 0
        self.actualMolType = '-'
        self.actualLength = 0
        self.mixedMode = False
        self.isSkipCode = False

        sm = SEQUENCE_PATTERN.match(aStr)

        if not sm:
            # Single-argument print: emits the same text as the former
            # Python 2 statement ``print '...input:', aStr`` and is also valid
            # syntax on Python 3.
            print('Sequence: No match for sequence pattern for input: %s'
                  % (aStr,))
            return

        # Header fields: the *_raw attributes keep the matched text as is,
        # the plain ones go through safeStrip.
        self.seqIdNo_raw = sm.group('seqIdNo_raw')
        self.seqIdNo = safeStrip(sm.group('seqIdNo'))
        self.length_raw = sm.group('length_raw')
        self.length = safeStrip(sm.group('length'))
        self.molType_raw = sm.group('molType_raw')
        self.molType = safeStrip(sm.group('molType'))
        self.organism_raw = sm.group('organism_raw')
        self.organism = safeStrip(sm.group('organism'))

        # One Feature per FEATURE_PATTERN match in the features text.
        featuresString = sm.group('features_raw')
        if featuresString:
            for fm in FEATURE_PATTERN.finditer(featuresString):
                self.features.append(Feature(fm))

        self.residues_raw = sm.group('residues_raw')
        self.seqNo400 = safeStrip(sm.group('seqNo400'))

        # Lines matching nucPattern are nucleotide lines; every other line is
        # treated as protein.
        # TODO: add more robust code (e.g. check protein lines with prtPattern)
        nucList = []
        prtList = []
        for line in sm.group('residues').splitlines():
            if nucPattern.match(line):
                nucList.append(line)
            else:
                prtList.append(line)

        # Join the lines, then drop whitespace, commas and digits.
        self.residues_nuc = re.sub(r'[\s,\d]', '', ''.join(nucList))
        self.residues_prt = re.sub(r'[\s,\d]', '', ''.join(prtList))

        # Mixed mode: both residue kinds are present.
        if self.residues_nuc and self.residues_prt:
            self.mixedMode = True

        # Skip code: no residues of either kind.  TODO: test it
        if self.residues_nuc == '' and self.residues_prt == '':
            self.isSkipCode = True

        if self.mixedMode:
            # Walk the protein residues left to right, handing each CDS
            # feature the next slice.
            # NOTE(review): presumably getRangeFromLocation returns an
            # inclusive (start, end) pair, hence the +1 -- confirm.
            currentStart = 0
            for f in self.features:
                if f.key == 'CDS':
                    t = su.getRangeFromLocation(f.location)
                    currentEnd = currentStart + (t[1] - t[0]) + 1
                    currentTranslation = self.residues_prt[
                        currentStart:currentEnd]
                    currentStart = currentEnd
                    self.translations.append(currentTranslation)
                    f.translation = currentTranslation

        self.__setActualMolType__()
        self.__setActualLength__()
        self.successfullyParsed = True
예제 #3
0
    def __init__(self, aStr):
        """Parse the text of one sequence entry into this instance.

        The input is matched against ``SEQUENCE_PATTERN``. When it matches,
        the raw and stripped header fields, the features and the residues
        (split into nucleotide and protein parts) are stored and
        ``successfullyParsed`` becomes True.

        When it does not match, a diagnostic line is printed and only the
        defaults assigned at the top of this method exist on the instance:
        ``successfullyParsed`` remains False and attributes such as
        ``seqIdNo`` or ``length`` are not set. Check ``successfullyParsed``
        before using them.

        :param aStr: text of a single sequence entry.
        """
        # Defaults, assigned before parsing starts.
        self.successfullyParsed = False
        self.features = []
        self.residues_nuc = '-'
        self.residues_prt = '-'
        self.translations = []

        self.actualSeqIdNo = 0
        self.actualMolType = '-'
        self.actualLength = 0
        self.mixedMode = False
        self.isSkipCode = False

        sm = SEQUENCE_PATTERN.match(aStr)

        if not sm:
            # A parenthesised single-argument print produces the same output
            # as the old Python 2 statement ``print '...input:', aStr`` while
            # remaining valid syntax on Python 3.
            print('Sequence: No match for sequence pattern for input: %s'
                  % (aStr,))
            return

        # Header fields: *_raw holds the matched text unchanged, the plain
        # attribute holds the safeStrip result.
        self.seqIdNo_raw = sm.group('seqIdNo_raw')
        self.seqIdNo = safeStrip(sm.group('seqIdNo'))
        self.length_raw = sm.group('length_raw')
        self.length = safeStrip(sm.group('length'))
        self.molType_raw = sm.group('molType_raw')
        self.molType = safeStrip(sm.group('molType'))
        self.organism_raw = sm.group('organism_raw')
        self.organism = safeStrip(sm.group('organism'))

        # Build one Feature for each FEATURE_PATTERN match.
        featuresString = sm.group('features_raw')
        if featuresString:
            for fm in FEATURE_PATTERN.finditer(featuresString):
                self.features.append(Feature(fm))

        self.residues_raw = sm.group('residues_raw')
        self.seqNo400 = safeStrip(sm.group('seqNo400'))

        # A line that matches nucPattern counts as nucleotide; anything else
        # is collected as protein.
        # TODO: add more robust code (e.g. check protein lines with prtPattern)
        nucList = []
        prtList = []
        for line in sm.group('residues').splitlines():
            if nucPattern.match(line):
                nucList.append(line)
            else:
                prtList.append(line)

        # Concatenate, then remove whitespace, commas and digits.
        self.residues_nuc = re.sub(r'[\s,\d]', '', ''.join(nucList))
        self.residues_prt = re.sub(r'[\s,\d]', '', ''.join(prtList))

        # Both residue kinds present -> mixed mode.
        if self.residues_nuc and self.residues_prt:
            self.mixedMode = True

        # Neither residue kind present -> skip code.  TODO: test it
        if self.residues_nuc == '' and self.residues_prt == '':
            self.isSkipCode = True

        if self.mixedMode:
            # Consume the protein residues front to back, giving every CDS
            # feature the next slice.
            # NOTE(review): presumably getRangeFromLocation returns an
            # inclusive (start, end) pair, hence the +1 -- confirm.
            currentStart = 0
            for f in self.features:
                if f.key == 'CDS':
                    t = su.getRangeFromLocation(f.location)
                    currentEnd = currentStart + (t[1] - t[0]) + 1
                    currentTranslation = self.residues_prt[currentStart:currentEnd]
                    currentStart = currentEnd
                    self.translations.append(currentTranslation)
                    f.translation = currentTranslation

        self.__setActualMolType__()
        self.__setActualLength__()
        self.successfullyParsed = True