def next(self):
    '''Return the next merged record across the underlying VCF readers.

    Every empty slot of ``self.next_records`` is refilled by calling
    ``next()`` on the reader at the same index of ``self.vcf_list``;
    readers that raise ``StopIteration`` are skipped.  Among the buffered
    records the one that sorts first is selected: records are compared
    directly when ``self.ascii_sort`` is set, otherwise by
    ``(chrnumber, POS)``.  A buffered record sharing the selected record's
    ``(CHROM, POS)`` is offered to ``merge``; when ``merge`` returns a true
    value that record counts as consumed as well.  Consumed slots are
    emptied so they are refilled on the following call.

    Each name in ``self.samples`` for which the selected record has no
    call receives a placeholder ``DP:FREQ`` call (``None`` depth and one
    ``None`` frequency per ALT allele plus one).  Finally
    ``_sort_samples`` is invoked with ``self.samples``.

    Raises ``StopIteration`` when nothing is buffered and no reader
    yields another record.
    '''
    buffered = self.next_records
    slot_count = len(buffered)
    chosen = None
    consumed = [False] * slot_count

    for slot in range(slot_count):
        candidate = buffered[slot]
        if candidate is None:
            try:
                candidate = self.vcf_list[slot].next()
            except StopIteration:
                # this reader has nothing left; leave its slot empty
                continue
            buffered[slot] = candidate

        # Decide whether the candidate sorts strictly before the record
        # chosen so far.
        if chosen is None:
            takes_over = True
        elif self.ascii_sort:
            takes_over = chosen > candidate
        else:
            takes_over = ((chosen.chrnumber, chosen.POS) >
                          (candidate.chrnumber, candidate.POS))

        if takes_over:
            # Whatever was marked for the previous choice stays buffered.
            # (When there was no previous choice nothing is marked yet, so
            # the reset is a no-op.)
            consumed = [False] * slot_count
            chosen = candidate
            consumed[slot] = True
        elif (chosen.CHROM, chosen.POS) == (candidate.CHROM, candidate.POS):
            if chosen.merge(candidate):
                consumed[slot] = True

    if all(record is None for record in buffered):
        raise StopIteration

    for slot, was_consumed in enumerate(consumed):
        if was_consumed:
            buffered[slot] = None

    for sample in self.samples:
        if chosen.get_sample(sample) is not None:
            continue
        # sample absent from the merged record: add an all-None call
        samp_fmt = self._parse_sample_format('DP:FREQ')
        empty_freqs = [None] + [None for _ in chosen.ALT]
        chosen.add_call(_Call(chosen, sample, samp_fmt(None, empty_freqs)))

    chosen._sort_samples(sort=self.samples)
    return chosen
def _parse_samples(self, samples, samp_fmt, site):
    '''Turn the raw per-sample columns of one record into call objects.

    ``samp_fmt`` is the FORMAT string.  Its parsed form is taken from
    ``self._format_cache`` and is created with ``_parse_sample_format``
    the first time a given string is seen.  When ``cparse`` is truthy the
    whole job is handed to ``cparse.parse_samples``.  Otherwise every
    sample string is split on ``:`` and each sub-field is converted by
    position:

    * the slot named ``GT`` keeps its raw string,
    * a lone ``.`` becomes ``None``,
    * ``Integer`` goes through ``int`` and falls back to ``float`` on
      ``ValueError``,
    * ``Float`` (and ``Numeric`` for comma-separated lists) goes through
      ``float``,
    * any other type stays a string, or a list of strings when the value
      contained commas.

    Returns a list holding one ``_Call`` per (name, sample) pair, pairing
    ``self.samples`` with ``samples`` in order.

    NOTE: this method has a cython equivalent and care must be
    taken to keep the two methods equivalent
    '''
    # reuse the parsed FORMAT description when this string was seen before
    cache = self._format_cache
    if samp_fmt not in cache:
        cache[samp_fmt] = self._parse_sample_format(samp_fmt)
    samp_fmt = cache[samp_fmt]

    if cparse:
        return cparse.parse_samples(
            self.samples, samples, samp_fmt,
            samp_fmt._types, samp_fmt._nums, site)

    _map = self._map
    names = samp_fmt._fields
    counts = samp_fmt._nums
    kinds = samp_fmt._types
    width = len(names)

    calls = []
    for name, sample in itertools.izip(self.samples, samples):
        parsed = [None] * width
        for pos, raw in enumerate(sample.split(':')):
            if names[pos] == 'GT':
                # most common field: stored untouched
                parsed[pos] = raw
            elif raw == ".":
                parsed[pos] = None
            elif counts[pos] == 1 or ',' not in raw:
                # Single entry, nothing to split.  A multi-valued field
                # that arrives without a comma is stored as a bare scalar
                # too; it is not wrapped in a sequence.
                kind = kinds[pos]
                if kind == 'Integer':
                    try:
                        parsed[pos] = int(raw)
                    except ValueError:
                        parsed[pos] = float(raw)
                elif kind == 'Float':
                    parsed[pos] = float(raw)
                else:
                    parsed[pos] = raw
            else:
                kind = kinds[pos]
                pieces = raw.split(',')
                if kind == 'Integer':
                    try:
                        parsed[pos] = _map(int, pieces)
                    except ValueError:
                        parsed[pos] = _map(float, pieces)
                elif kind in ('Float', 'Numeric'):
                    parsed[pos] = _map(float, pieces)
                else:
                    parsed[pos] = pieces

        # wrap the converted values in a call object for this sample
        calls.append(_Call(site, name, samp_fmt(*parsed)))

    return calls
def _parse_samples(self, samples, samp_fmt, site):
    '''Convert the sample columns of a record into a list of calls.

    The FORMAT string ``samp_fmt`` is resolved to its parsed description
    through ``self._format_cache`` (filled on demand by
    ``_parse_sample_format``).  If ``cparse`` is truthy the parsing is
    delegated to ``cparse.parse_samples``.  In the pure-Python path each
    sample string is split on ``:`` and its sub-fields are converted by
    position:

    * ``.`` and ``./.`` become ``None`` regardless of the field,
    * ``Integer`` values use ``int`` with a ``float`` fallback on
      ``ValueError``,
    * ``Float`` values (plus ``Numeric`` in comma-separated lists) use
      ``float``,
    * every other type is left as a string, or as a list of strings when
      the value contained commas.

    Returns one ``_Call`` per (name, sample) pair, pairing
    ``self.samples`` with ``samples`` in order.

    NOTE: this method has a cython equivalent and care must be
    taken to keep the two methods equivalent
    '''
    # look the FORMAT string up first; parse it only when it is new
    known_formats = self._format_cache
    if samp_fmt not in known_formats:
        known_formats[samp_fmt] = self._parse_sample_format(samp_fmt)
    samp_fmt = known_formats[samp_fmt]

    if cparse:
        return cparse.parse_samples(
            self.samples, samples, samp_fmt,
            samp_fmt._types, samp_fmt._nums, site)

    _map = self._map
    field_nums = samp_fmt._nums
    field_types = samp_fmt._types
    nfields = len(samp_fmt._fields)

    result = []
    for name, sample in itertools.izip(self.samples, samples):
        values = [None] * nfields
        for idx, text in enumerate(sample.split(':')):
            if text in ('.', './.'):
                # missing value: the slot keeps its None
                values[idx] = None
                continue

            entry_type = field_types[idx]
            if field_nums[idx] != 1 and ',' in text:
                # several comma-separated entries
                items = text.split(',')
                if entry_type == 'Integer':
                    try:
                        values[idx] = _map(int, items)
                    except ValueError:
                        values[idx] = _map(float, items)
                elif entry_type == 'Float' or entry_type == 'Numeric':
                    values[idx] = _map(float, items)
                else:
                    values[idx] = items
                continue

            # Single entry.  Note that a multi-valued field written
            # without a comma also lands here and is stored as a bare
            # scalar rather than a one-element sequence.
            if entry_type == 'Integer':
                try:
                    values[idx] = int(text)
                except ValueError:
                    values[idx] = float(text)
            elif entry_type == 'Float':
                values[idx] = float(text)
            else:
                values[idx] = text

        # build the call object for this sample
        result.append(_Call(site, name, samp_fmt(*values)))

    return result
def _parse_samples(self, samples, samp_fmt, site, EntryDbID):
    '''Parse the sample columns of a record and store each sub-field
    through ``self.db``.

    ``samp_fmt`` is the raw FORMAT string.  For every key in it a writer
    is selected: the keys AD, DP, EC, FT, GL, GLE, GP, GQ, GT, HQ, PL, PQ
    and PS use ``self.db.create<KEY>(ind_id, value)``; every other key
    uses ``self.db.createIndividualDefault(key, ind_id, value)``.

    For each (name, sample) pair, pairing ``self.samples`` with
    ``samples`` in order, an individual entry is created with
    ``self.db.createIndividualEntry(EntryDbID, n)`` where ``n`` counts
    samples from 0.  Each raw sub-field is handed to its writer and then
    converted for the returned call:

    * ``.`` and ``./.`` become ``None``,
    * ``Integer`` values use ``int`` and fall back to ``float`` on
      ``ValueError`` (matching the other ``_parse_samples``
      implementations),
    * ``Float`` values (plus ``Numeric`` in comma-separated lists) use
      ``float``,
    * any other type stays a string, or a list of strings when the value
      contained commas.

    Returns a list with one ``_Call`` per sample.

    NOTE: this method has a cython equivalent and care must be
    taken to keep the two methods equivalent
    '''
    # TODO at some point add DB
    # TODO 1 remove print when ready
    print(samp_fmt)

    # Supported
    # TODO individual
    # JULIA: AD DP, GLE, GL, EC GP, GT, FT, PL, GQ, HQ, PS, PQ
    dedicated_keys = ('AD', 'DP', 'EC', 'FT', 'GL', 'GLE', 'GP', 'GQ',
                      'GT', 'HQ', 'PL', 'PQ', 'PS')
    db = self.db

    # One (writer, custom_key) pair per FORMAT key.  custom_key is None
    # for keys with a dedicated writer; otherwise it is the key name that
    # createIndividualDefault expects as its first argument.
    writers = []
    for key in samp_fmt.split(':'):
        if key in dedicated_keys:
            writers.append((getattr(db, 'create' + key), None))
        else:
            writers.append((db.createIndividualDefault, key))

    # check whether we already know how to parse this format
    if samp_fmt not in self._format_cache:
        self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)
    samp_fmt = self._format_cache[samp_fmt]

    if cparse:
        # NOTE(review): this early return happens before any of the
        # self.db calls below, so nothing is written to the database on
        # this path -- confirm that is intended.
        return cparse.parse_samples(
            self.samples, samples, samp_fmt,
            samp_fmt._types, samp_fmt._nums, site)

    samp_data = []
    _map = self._map
    nfields = len(samp_fmt._fields)

    paired = itertools.izip(self.samples, samples)
    for ind_number, (name, sample) in enumerate(paired):
        ind_id = db.createIndividualEntry(EntryDbID, ind_number)
        if ind_id == -1:
            # Best effort, as before: report the failure and keep going.
            print("Failed to create individual entry")

        # parse the data for this sample
        sampdat = [None] * nfields
        for i, vals in enumerate(sample.split(':')):
            # TODO individ here
            # hand the raw text to the database before any conversion
            write, custom_key = writers[i]
            if custom_key is None:
                write(ind_id, vals)
            else:
                write(custom_key, ind_id, vals)

            if vals == '.' or vals == './.':
                # missing value: the slot keeps its None
                continue

            entry_num = samp_fmt._nums[i]
            entry_type = samp_fmt._types[i]

            # we don't need to split single entries
            if entry_num == 1 or ',' not in vals:
                # TODO: add DB upload and subroutines
                if entry_type == 'Integer':
                    try:
                        sampdat[i] = int(vals)
                    except ValueError:
                        # e.g. "1e3" or "0.5" in an Integer field
                        sampdat[i] = float(vals)
                elif entry_type == 'Float':
                    sampdat[i] = float(vals)
                else:
                    sampdat[i] = vals
                # A multi-valued field written without a comma is stored
                # as a bare scalar; it is not wrapped in a sequence.
                continue

            vals = vals.split(',')
            if entry_type == 'Integer':
                try:
                    sampdat[i] = _map(int, vals)
                except ValueError:
                    sampdat[i] = _map(float, vals)
            elif entry_type == 'Float' or entry_type == 'Numeric':
                sampdat[i] = _map(float, vals)
            else:
                sampdat[i] = vals

        # create a call object
        samp_data.append(_Call(site, name, samp_fmt(*sampdat)))

    return samp_data