示例#1
0
文件: parser.py 项目: mgymrek/PyVCF
    def _parse_samples(self, samples, samp_fmt, site):
        '''Parse a sample entry according to the format specified in the FORMAT
        column.

        NOTE: this method has a cython equivalent and care must be taken
        to keep the two methods equivalent
        '''

        # check whether we already know how to parse this format
        if samp_fmt not in self._format_cache:
            self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)
        samp_fmt = self._format_cache[samp_fmt]

        if cparse:
            return cparse.parse_samples(
                self.samples, samples, samp_fmt, samp_fmt._types, samp_fmt._nums, site)

        samp_data = []
        _map = self._map

        nfields = len(samp_fmt._fields)

        for name, sample in itertools.izip(self.samples, samples):

            # parse the data for this sample
            sampdat = [None] * nfields

            for i, vals in enumerate(sample.split(':')):

                # short circuit the most common
                if samp_fmt._fields[i] == 'GT':
                    sampdat[i] = vals
                    continue
                elif vals == ".":
                    sampdat[i] = None
                    continue

                entry_num = samp_fmt._nums[i]
                entry_type = samp_fmt._types[i]

                # we don't need to split single entries
                if entry_num == 1 or ',' not in vals:

                    if entry_type == 'Integer':
                        try:
                            sampdat[i] = int(vals)
                        except ValueError:
                            sampdat[i] = float(vals)
                    elif entry_type == 'Float':
                        sampdat[i] = float(vals)
                    else:
                        sampdat[i] = vals

                    if entry_num != 1:
                        sampdat[i] = (sampdat[i])

                    continue

                vals = vals.split(',')

                if entry_type == 'Integer':
                    try:
                        sampdat[i] = _map(int, vals)
                    except ValueError:
                        sampdat[i] = _map(float, vals)
                elif entry_type == 'Float' or entry_type == 'Numeric':
                    sampdat[i] = _map(float, vals)
                else:
                    sampdat[i] = vals

            # create a call object
            call = _Call(site, name, samp_fmt(*sampdat))
            samp_data.append(call)

        return samp_data
示例#2
0
    def _parse_samples(self, samples, samp_fmt, site):
        '''Parse a sample entry according to the format specified in the FORMAT
        column.

        NOTE: this method has a cython equivalent and care must be taken
        to keep the two methods equivalent
        '''

        # check whether we already know how to parse this format
        if samp_fmt not in self._format_cache:
            self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)
        samp_fmt = self._format_cache[samp_fmt]

        if cparse:
            return cparse.parse_samples(self.samples, samples, samp_fmt,
                                        samp_fmt._types, samp_fmt._nums, site)

        samp_data = []
        _map = self._map

        nfields = len(samp_fmt._fields)

        for name, sample in zip(self.samples, samples):

            # parse the data for this sample
            sampdat = [None] * nfields

            for i, vals in enumerate(sample.split(':')):

                # short circuit the most common
                if samp_fmt._fields[i] == 'GT':
                    sampdat[i] = vals
                    continue
                # genotype filters are a special case
                elif samp_fmt._fields[i] == 'FT':
                    sampdat[i] = self._parse_filter(vals)
                    continue
                elif not vals or vals == ".":
                    sampdat[i] = None
                    continue

                entry_num = samp_fmt._nums[i]
                entry_type = samp_fmt._types[i]

                # we don't need to split single entries
                if entry_num == 1:
                    if entry_type == 'Integer':
                        try:
                            sampdat[i] = int(vals)
                        except ValueError:
                            sampdat[i] = float(vals)
                    elif entry_type == 'Float' or entry_type == 'Numeric':
                        sampdat[i] = float(vals)
                    else:
                        sampdat[i] = vals
                    continue

                vals = vals.split(',')
                if entry_type == 'Integer':
                    try:
                        sampdat[i] = _map(int, vals)
                    except ValueError:
                        sampdat[i] = _map(float, vals)
                elif entry_type == 'Float' or entry_type == 'Numeric':
                    sampdat[i] = _map(float, vals)
                else:
                    sampdat[i] = vals

            # create a call object
            call = _Call(site, name, samp_fmt(*sampdat))
            samp_data.append(call)

        return samp_data
示例#3
0
    def _parse_samples(self, samples, samp_fmt, site, EntryDbID):
        '''Parse a sample entry according to the format specified in the FORMAT
        column.

        NOTE: this method has a cython equivalent and care must be taken
        to keep the two methods equivalent
        '''

        # check whether we already know how to parse this format
        # TODO at some point add DB
        # TODO 1 remove print when ready
        print samp_fmt
        individGeno = samp_fmt.split(":")
        IndividualFunctions = []
        CustomGeno = []
        #Supported
        #TODO individual
        #JULIA: AD DP, GLE, GL, EC GP, GT, FT, PL, GQ, HQ, PS, PQ        
        
        for genotype in individGeno:
            if ( genotype == "AD" ):
                IndividualFunctions.append(self.db.createAD)
            elif (genotype == "DP" ):
                 IndividualFunctions.append(self.db.createDP)
            elif (genotype == "EC" ):
                IndividualFunctions.append(self.db.createEC)
            elif (genotype == "FT" ):
                IndividualFunctions.append(self.db.createFT)
            elif (genotype == "GL" ):
                IndividualFunctions.append(self.db.createGL)
            elif (genotype == "GLE" ):      
                 IndividualFunctions.append(self.db.createGLE)
            elif (genotype == "GP" ):
                IndividualFunctions.append(self.db.createGP)
            elif (genotype == "GQ" ):
                IndividualFunctions.append(self.db.createGQ)
            elif (genotype == "GT" ):
                IndividualFunctions.append(self.db.createGT)
            elif (genotype == "HQ" ):
                IndividualFunctions.append(self.db.createHQ)
            elif (genotype == "PL" ):      
                 IndividualFunctions.append(self.db.createPL)
            elif (genotype == "PQ" ):
                IndividualFunctions.append(self.db.createPQ)
            elif (genotype == "PS" ):
                IndividualFunctions.append(self.db.createPS)
            else:
                CustomGeno.append( genotype )
                IndividualFunctions.append(self.db.createIndividualDefault)
           
        
        if samp_fmt not in self._format_cache:
            self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)

        samp_fmt = self._format_cache[samp_fmt]

        if cparse:
            return cparse.parse_samples(
                self.samples, samples, samp_fmt, samp_fmt._types, samp_fmt._nums, site)

        samp_data = []
        _map = self._map

        nfields = len(samp_fmt._fields)
        
        indNumber = 0;
        indId = 0

        for name, sample in itertools.izip(self.samples, samples):
            
            customCount = 0
            
            indId = self.db.createIndividualEntry( EntryDbID, indNumber ); 
            if indId == -1:
                print "Failed to create individual entry"
                
            indNumber += 1
            # parse the data for this sample
            sampdat = [None] * nfields

            for i, vals in enumerate(sample.split(':')):
                #TODO individ here
                # short circuit the most common
                #MINE
                if ( IndividualFunctions[i] == self.db.createIndividualDefault ):
                    IndividualFunctions[i]( CustomGeno[customCount], indId, vals )
                    customCount += 1
                else:
                    IndividualFunctions[i]( indId, vals ) 
                
                
                if vals == '.' or vals == './.':
                    sampdat[i] = None
                    continue

                entry_num = samp_fmt._nums[i]
                entry_type = samp_fmt._types[i]

                # we don't need to split single entries
                if entry_num == 1 or ',' not in vals:

                    #TODO: add DB upload and subroutines
                    if entry_type == 'Integer':
                        sampdat[i] = int(vals)
                    elif entry_type == 'Float':
                        sampdat[i] = float(vals)
                    else:
                        sampdat[i] = vals

                    if entry_num != 1:
                        sampdat[i] = (sampdat[i])

                    continue

                vals = vals.split(',')

                if entry_type == 'Integer':
                    sampdat[i] = _map(int, vals)
                elif entry_type == 'Float' or entry_type == 'Numeric':
                    sampdat[i] = _map(float, vals)
                else:
                    sampdat[i] = vals

            # create a call object
            call = _Call(site, name, samp_fmt(*sampdat))
            samp_data.append(call)

        return samp_data
示例#4
0
文件: parser.py 项目: cmclean/PyVCF
    def _parse_samples(self, samples, samp_fmt, site):
        '''Parse a sample entry according to the format specified in the FORMAT
        column.

        NOTE: this method has a cython equivalent and care must be taken
        to keep the two methods equivalent
        '''

        # check whether we already know how to parse this format
        if samp_fmt in self._format_cache:
            samp_fmt, samp_fmt_types, samp_fmt_nums = \
                    self._format_cache[samp_fmt]
        else:
            sf, samp_fmt_types, samp_fmt_nums = self._parse_sample_format(samp_fmt)
            self._format_cache[samp_fmt] = (sf, samp_fmt_types, samp_fmt_nums)
            samp_fmt = sf

        if cparse:
            return cparse.parse_samples(
                self.samples, samples, samp_fmt, samp_fmt_types, samp_fmt_nums, site)

        samp_data = []
        _map = self._map

        for name, sample in itertools.izip(self.samples, samples):

            # parse the data for this sample
            sampdict = dict([(x, None) for x in samp_fmt])

            for fmt, entry_type, entry_num, vals in itertools.izip(
                    samp_fmt, samp_fmt_types, samp_fmt_nums, sample.split(':')):

                # short circuit the most common
                if vals == '.' or vals == './.':
                    sampdict[fmt] = None
                    continue

                # we don't need to split single entries
                if entry_num == 1 or ',' not in vals:

                    if entry_type == 'Integer':
                        sampdict[fmt] = int(vals)
                    elif entry_type == 'Float':
                        sampdict[fmt] = float(vals)
                    else:
                        sampdict[fmt] = vals

                    if entry_num != 1:
                        sampdict[fmt] = (sampdict[fmt])

                    continue

                vals = vals.split(',')

                if entry_type == 'Integer':
                    sampdict[fmt] = _map(int, vals)
                elif entry_type == 'Float' or entry_type == 'Numeric':
                    sampdict[fmt] = _map(float, vals)
                else:
                    sampdict[fmt] = vals

            # create a call object
            call = _Call(site, name, sampdict)
            samp_data.append(call)

        return samp_data
示例#5
0
    def _parse_samples(self, samples, samp_fmt, EntryDbID):
        '''Parse a sample entry according to the format specified in the FORMAT
        column.

        NOTE: this method has a cython equivalent and care must be taken
        to keep the two methods equivalent
        '''
        individGeno = samp_fmt.split(":")
        IndividualFunctions = []
        CustomGeno = []
		
        for genotype in individGeno:
            if ( genotype == "AD" ):
                IndividualFunctions.append(self.db.createAD)
            elif (genotype == "DP" ):
                 IndividualFunctions.append(self.db.createDP)
            elif (genotype == "EC" ):
                IndividualFunctions.append(self.db.createEC)
            elif (genotype == "FT" ):
                IndividualFunctions.append(self.db.createFT)
            elif (genotype == "GL" ):
                IndividualFunctions.append(self.db.createGL)
            elif (genotype == "GLE" ):      
                 IndividualFunctions.append(self.db.createGLE)
            elif (genotype == "GP" ):
                IndividualFunctions.append(self.db.createGP)
            elif (genotype == "GQ" ):
                IndividualFunctions.append(self.db.createGQ)
            elif (genotype == "GT" ):
                IndividualFunctions.append(self.db.createGT)
            elif (genotype == "HQ" ):
                IndividualFunctions.append(self.db.createHQ)
            elif (genotype == "PL" ):      
                 IndividualFunctions.append(self.db.createPL)
            elif (genotype == "PQ" ):
                IndividualFunctions.append(self.db.createPQ)
            elif (genotype == "PS" ):
                IndividualFunctions.append(self.db.createPS)
            else:
                CustomGeno.append( genotype )
                IndividualFunctions.append(self.db.createIndividualDefault)
           
        
        if samp_fmt not in self._format_cache:
            self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)

        samp_fmt = self._format_cache[samp_fmt]

        if cparse:
            return cparse.parse_samples(
                self.samples, samples, samp_fmt, samp_fmt._types, samp_fmt._nums, site)

        samp_data = []
        _map = self._map

        nfields = len(samp_fmt._fields)
        
        indNumber = 0;
        indId = 0

        for name, sample in itertools.izip(self.samples, samples):
            
            customCount = 0
            
            indId = self.db.createIndividualEntry( EntryDbID, indNumber ); 
            if indId == -1:
                print "Failed to create individual entry"
                
            indNumber += 1
            # parse the data for this sample
            sampdat = [None] * nfields

            for i, vals in enumerate(sample.split(':')):
                if ( IndividualFunctions[i] == self.db.createIndividualDefault ):
                    IndividualFunctions[i]( CustomGeno[customCount], indId, vals )
                    customCount += 1
                else:
                    IndividualFunctions[i]( indId, vals ) 
                
                
                if vals == '.' or vals == './.':
                    sampdat[i] = None
                    continue

                entry_num = samp_fmt._nums[i]
                entry_type = samp_fmt._types[i]

                if entry_num == 1 or ',' not in vals:

                    if entry_type == 'Integer':
                        sampdat[i] = int(vals)
                    elif entry_type == 'Float':
                        sampdat[i] = float(vals)
                    else:
                        sampdat[i] = vals

                    if entry_num != 1:
                        sampdat[i] = (sampdat[i])

                    continue

                vals = vals.split(',')

                if entry_type == 'Integer':
                    sampdat[i] = _map(int, vals)
                elif entry_type == 'Float' or entry_type == 'Numeric':
                    sampdat[i] = _map(float, vals)
                else:
                    sampdat[i] = vals
示例#6
0
    def _parse_samples(self, samples, samp_fmt, site):
        '''Parse a sample entry according to the format specified in the FORMAT
        column.

        NOTE: this method has a cython equivalent and care must be taken
        to keep the two methods equivalent
        '''

        # check whether we already know how to parse this format
        if samp_fmt not in self._format_cache:
            self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)

        samp_fmt = self._format_cache[samp_fmt]

        if cparse:
            return cparse.parse_samples(self.samples, samples, samp_fmt,
                                        samp_fmt._types, samp_fmt._nums, site)

        samp_data = []
        _map = self._map

        nfields = len(samp_fmt._fields)

        for name, sample in itertools.izip(self.samples, samples):

            # parse the data for this sample
            sampdat = [None] * nfields

            for i, vals in enumerate(sample.split(':')):

                # short circuit the most common
                if vals == '.' or vals == './.':
                    sampdat[i] = None
                    continue

                if i >= len(samp_fmt._nums):
                    # print 'Error: i = ' + str(i) + ', samp_fmt._nums = ' + str(samp_fmt._nums) + \
                    #       ', nfields = ' + str(nfields) + ', samp_fmt._fields = ' + str(samp_fmt._fields) + \
                    #       ', sample = ' + str(sample)
                    sys.stderr.write('i >= len(samp_fmt._nums)' + ' - i=' +
                                     str(i) + ', len(samp_fmt._nums)=' +
                                     str(len(samp_fmt._nums)) + ', name=' +
                                     name + ', vals=' + str(vals) + '\n')
                    break
                entry_num = samp_fmt._nums[i]
                entry_type = samp_fmt._types[i]

                # we don't need to split single entries
                if entry_num == 1 or ',' not in vals:

                    if entry_type == 'Integer':
                        try:
                            sampdat[i] = int(vals)
                        except ValueError:
                            try:
                                sampdat[i] = float(vals)
                            except ValueError:
                                sampdat[i] = None
                    elif entry_type == 'Float':
                        try:
                            sampdat[i] = float(vals)
                        except ValueError:
                            sampdat[i] = None
                    else:
                        sampdat[i] = vals

                    if entry_num != 1:
                        sampdat[i] = (sampdat[i])

                    continue

                vals = vals.split(',')

                if entry_type == 'Integer':
                    try:
                        sampdat[i] = _map(int, vals)
                    except ValueError:
                        try:
                            sampdat[i] = _map(float, vals)
                        except ValueError:
                            sampdat[i] = [None for v in vals]
                elif entry_type == 'Float' or entry_type == 'Numeric':
                    try:
                        sampdat[i] = _map(float, vals)
                    except ValueError:
                        sampdat[i] = [None for v in vals]
                else:
                    sampdat[i] = vals

            # create a call object
            call = _Call(site, name, samp_fmt(*sampdat))
            samp_data.append(call)

        return samp_data