def process_line(self, record):
    """Resolve the genotype key of this stream's sample for one record.

    The same record is revisited once for each sample until the cleaning
    and loading can be organised otherwise.

    Returns ``None`` when any of the AD, DP, GQ, GT or PL values of the
    sample's call is ``None``, or when a ``Result`` already exists for
    this variant/sample pair.

    NOTE(review): the block as visible ends right after ``keyed_geno`` is
    resolved, without using it or ``cleaned`` -- the remainder of the
    method appears to be missing here; confirm against the full file.
    """
    # Ensures the cache is updated and available
    self.variants.ensure_cache(record)

    # Calculate the MD5 of the variant itself (not the record)
    md5 = calculate_md5(record)

    # Ensure the variant exists
    variant_id = self.variants.get(md5)
    assert variant_id is not None

    cleaned = super(ResultStream, self).process_line(record)

    # Remove variant specific parts (the first four columns)
    cleaned = cleaned[4:]

    call = record.genotype(self.vcf_sample)

    # Empty values cause KeyErrors (per the original author), so skip
    # calls that are missing any of the expected fields.
    if None in (call['AD'], call['DP'], call['GQ'], call['GT'], call['PL']):
        return None

    # Already seen this variant for this sample, otherwise we would get a
    # duplicate key value violation in sample_result.
    if Result.objects.filter(variant=variant_id,
                             sample=self.sample_id).exists():
        return None

    # The possibility for multiple alleles in these wide vcfs is almost
    # infinite, so we need to triage the really weird ones into having a
    # reference allele "0/#" or being off the map entirely "#/#".
    gt = call['GT']
    try:
        keyed_geno = self.genotypes[gt]
    except KeyError:
        if gt.startswith('0'):
            keyed_geno = self.genotypes['0/#']
        else:
            keyed_geno = self.genotypes['#/#']
def process_line(self, record):
    """Resolve the genotype key of this stream's sample for one record.

    The same record is revisited once for each sample until the cleaning
    and loading can be organised otherwise.

    Returns ``None`` when any of the AD, DP, GQ, GT or PL values of the
    sample's call is ``None``, or when a ``Result`` already exists for
    this variant/sample pair.

    NOTE(review): the block as visible ends right after ``keyed_geno`` is
    resolved, without using it or ``cleaned`` -- the remainder of the
    method appears to be missing here; confirm against the full file.
    """
    # Ensures the cache is updated and available
    self.variants.ensure_cache(record)

    # Calculate the MD5 of the variant itself (not the record)
    md5 = calculate_md5(record)

    # Ensure the variant exists
    variant_id = self.variants.get(md5)
    assert variant_id is not None

    cleaned = super(ResultStream, self).process_line(record)

    # Remove variant specific parts (the first four columns)
    cleaned = cleaned[4:]

    call = record.genotype(self.vcf_sample)

    # Empty values cause KeyErrors (per the original author), so skip
    # calls that are missing any of the expected fields.
    if None in (call['AD'], call['DP'], call['GQ'], call['GT'], call['PL']):
        return None

    # Already seen this variant for this sample, otherwise we would get a
    # duplicate key value violation in sample_result.
    if Result.objects.filter(variant=variant_id,
                             sample=self.sample_id).exists():
        return None

    # The possibility for multiple alleles in these wide vcfs is almost
    # infinite, so we need to triage the really weird ones into having a
    # reference allele "0/#" or being off the map entirely "#/#".
    gt = call['GT']
    try:
        keyed_geno = self.genotypes[gt]
    except KeyError:
        if gt.startswith('0'):
            keyed_geno = self.genotypes['0/#']
        else:
            keyed_geno = self.genotypes['#/#']
def get_variant(self, record):
    """Get or create the ``Variant`` matching a VCF record.

    The variant is identified by the MD5 of its chromosome, position,
    reference allele and '/'-joined alternate alleles. If that MD5 is in
    the local cache, an unsaved ``Variant`` carrying only the cached
    primary key is returned. Otherwise a new ``Variant`` is saved, the
    cache and ``self.file_variants`` counter are updated, and any SNPEff
    effects in ``record.INFO['EFF']`` are loaded for it.

    Returns the ``Variant`` instance.
    """
    chrom, pos, ref = record.CHROM, record.POS, record.REF
    alt = '/'.join([str(x) for x in record.ALT])

    # Calculate MD5 and attempt to fetch the primary key from
    # the local cache, otherwise use it when inserting.
    md5 = calculate_md5(chrom, pos, ref, alt)

    # Ensure the cache is valid for the chromosome
    self.check_cache(chrom)
    variant_id = self.variant_cache.get(md5, None)

    # Already known: just make a faux instance
    if variant_id:
        return Variant(pk=variant_id)

    # Create since it does not exist
    variant = Variant(pos=pos, ref=ref, alt=alt, md5=md5)

    # Update foreign key references
    variant.chr = self.get_chromosome(chrom)
    variant.type = self.get_variant_type(record.var_type.upper())

    # Periods are useless, store them as NULL. The previous
    # `cond and None or x` form always produced `x` because None is
    # falsy, so '.' was never actually replaced.
    variant.rsid = None if record.ID == '.' else record.ID

    variant.save()
    self.file_variants += 1

    # Update cache
    self.variant_cache[md5] = variant.pk

    # Process SNPEff data since this is the first time this variant
    # has been seen.
    if 'EFF' in record.INFO:
        effs = record.INFO['EFF'].split(',')
        self.load_effects(effs, variant)

    return variant
def get_variant(self, record):
    """Get or create the ``Variant`` matching a VCF record.

    The variant is identified by the MD5 of its chromosome, position,
    reference allele and '/'-joined alternate alleles. If that MD5 is in
    the local cache, an unsaved ``Variant`` carrying only the cached
    primary key is returned. Otherwise a new ``Variant`` is saved, the
    cache and ``self.file_variants`` counter are updated, and any SNPEff
    effects in ``record.INFO['EFF']`` are loaded for it.

    Returns the ``Variant`` instance.
    """
    chrom, pos, ref = record.CHROM, record.POS, record.REF
    alt = '/'.join([str(x) for x in record.ALT])

    # Calculate MD5 and attempt to fetch the primary key from
    # the local cache, otherwise use it when inserting.
    md5 = calculate_md5(chrom, pos, ref, alt)

    # Ensure the cache is valid for the chromosome
    self.check_cache(chrom)
    variant_id = self.variant_cache.get(md5, None)

    # Already known: just make a faux instance
    if variant_id:
        return Variant(pk=variant_id)

    # Create since it does not exist
    variant = Variant(pos=pos, ref=ref, alt=alt, md5=md5)

    # Update foreign key references
    variant.chr = self.get_chromosome(chrom)
    variant.type = self.get_variant_type(record.var_type.upper())

    # Periods are useless, store them as NULL. The previous
    # `cond and None or x` form always produced `x` because None is
    # falsy, so '.' was never actually replaced.
    variant.rsid = None if record.ID == '.' else record.ID

    variant.save()
    self.file_variants += 1

    # Update cache
    self.variant_cache[md5] = variant.pk

    # Process SNPEff data since this is the first time this variant
    # has been seen.
    if 'EFF' in record.INFO:
        effs = record.INFO['EFF'].split(',')
        self.load_effects(effs, variant)

    return variant
def readline(self, size=-1):
    """Return the next processed line, or '' once the reader is exhausted.

    `size` is ignored because a complete line must always be processed.
    Records that fail validation are skipped, as are (when the cache is
    in use) records whose variant MD5 is already in `self.variants`.
    """
    exhausted = object()

    while True:
        record = next(self.reader, exhausted)
        if record is exhausted:
            return ''

        # Only valid records are worth processing
        if not checks.record_is_valid(record):
            continue

        if self.use_cache:
            # Keep the cache updated and available for this record
            self.variants.ensure_cache(record)

        # MD5 of the variant itself (not the record)
        md5 = calculate_md5(record)

        # Already loaded variants are skipped when caching is enabled
        if self.use_cache and md5 in self.variants:
            continue

        columns = self.process_line(record)
        columns.append(md5)
        return self.outdel.join(columns) + '\n'
def process_line(self, record):
    """Return the parent's cleaned columns with a trailing MD5 column.

    The MD5 is computed from the first four cleaned columns
    (presumably chrom, pos, ref and alt -- confirm against the parent).
    """
    columns = super(EVSProcessor, self).process_line(record)
    variant_key = columns[:4]
    columns.append(calculate_md5(*variant_key))
    return columns
def process_line(self, record):
    """Build one processed row per SNPEff effect found on `record`.

    Returns None (after logging 'Missing variant') when the record's
    variant is unknown, and None when effects already exist for it.
    Otherwise returns the list of rows, logging an error if it is empty.
    """
    def locus():
        # Logging context identifying the record
        return {
            'chr': record.CHROM,
            'pos': record.POS,
            'ref': record.REF,
            'alt': record.ALT,
        }

    # MD5 from the extracted values, then make sure the cache is
    # updated and available
    md5 = calculate_md5(record)
    self.variants.ensure_cache(record)

    # The variant must exist
    variant_id = self.variants.get(md5)
    if not variant_id:
        log.error('Missing variant', extra=locus())
        return

    # Effects are only loaded once per variant
    if self._effects_exists(md5, variant_id):
        return

    rows = []

    # Is this returning a list now??
    # Multiple separate SNPEff records
    for eff in record.INFO.get('EFF', []):
        effect, values = self._snpeff_dict(eff)

        transcript = values.get('Transcript')
        gene_pk = self.get_gene(values.get('Gene_Name'))

        codon_change = values.get('Codon_Change')
        amino_acid_change = values.get('Amino_acid_change')
        effect_ref = self.effects.get(effect)
        functional_class = self.functional_classes.get(
            values.get('Functional_Class'))
        transcript_ref = self.get_transcript(gene_pk, transcript)

        # Extension fields from snpEff CBMi fork
        segment = values.get('Segment')
        hgvs_c = values.get('HGVS_DNA_nomenclature')
        hgvs_p = values.get('HGVS_protein_nomenclature')

        # Trim off transcript prefix, clean up format
        if transcript:
            if segment and segment.startswith(transcript):
                trimmed = segment[len(transcript):].strip('._')
                segment = trimmed.replace('_', '.')
            if hgvs_c and hgvs_c.startswith(transcript):
                hgvs_c = hgvs_c[len(transcript):].lstrip(':')

        row = [
            variant_id,
            codon_change,
            amino_acid_change,
            effect_ref,
            functional_class,
            gene_pk,
            transcript_ref,
            segment,
            hgvs_c,
            hgvs_p,
        ]
        rows.append([self.process_column('', value) for value in row])

    if not rows:
        log.error('No effects process for variant', extra=locus())

    return rows
def process_line(self, record):
    """Build the sample-result row of this stream's sample for `record`.

    The same record is revisited once for each sample. Returns None when
    a `Result` already exists for the variant/sample pair; otherwise
    returns the parent's cleaned columns minus the first four, followed
    by the processed variant id, sample id, DP, QUAL, genotype key, GQ,
    first two AD values, comma-joined PL and two copies of `self.now`.
    """
    # Keep the cache updated and available
    self.variants.ensure_cache(record)

    # MD5 of the variant itself (not the record); the variant must exist
    variant_md5 = calculate_md5(record)
    variant_id = self.variants.get(variant_md5)
    assert variant_id is not None

    # Drop the variant specific parts
    row = super(ResultStream, self).process_line(record)[4:]

    # can these be indexed?
    call = record.genotype(self.vcf_sample)

    # A result that already exists for this variant and sample would
    # cause a duplicate key value violation in sample_result.
    already_loaded = Result.objects.filter(
        variant=variant_id, sample=self.sample_id).exists()
    if already_loaded:
        return None

    # Wide vcfs can carry nearly any allele combination, so unknown
    # genotypes are triaged into "0/#" (has a reference allele) or
    # "#/#" (off the map entirely).
    keyed_geno = None
    gt = getattr(call, 'GT', None)
    if gt:
        try:
            keyed_geno = self.genotypes[gt]
        except KeyError:
            fallback = '0/#' if gt.startswith('0') else '#/#'
            keyed_geno = self.genotypes[fallback]

    dp = getattr(call, 'DP', None)
    gq = getattr(call, 'GQ', None)

    # Only the first two allele depths are kept, and only if both exist
    ad0 = ad1 = None
    ad = getattr(call, 'AD', None)
    if ad and len(ad) > 1:
        ad0, ad1 = ad[0], ad[1]

    pl = getattr(call, 'PL', None)
    if pl:
        pl = ','.join(str(x) for x in pl)

    # Append remaining columns
    trailing = (variant_id, self.sample_id, dp, record.QUAL, keyed_geno,
                gq, ad0, ad1, pl, self.now, self.now)
    row.extend([self.process_column('', value) for value in trailing])

    return row
def process_line(self, record):
    """Build one processed row per SNPEff effect found on `record`.

    Returns None (after logging 'Missing variant') when the record's
    variant is unknown, and None when effects already exist for it.
    Otherwise returns the list of effect rows. A record without an
    'EFF' entry in its INFO yields an empty list and an error log entry
    instead of raising KeyError.
    """
    # Calculate MD5 using extracted values
    md5 = calculate_md5(record)

    # Ensures the cache is updated and available
    self.variants.ensure_cache(record)

    # Ensure the variant exists
    variant_id = self.variants.get(md5)
    if not variant_id:
        log.error('Missing variant', extra={
            'chr': record.CHROM,
            'pos': record.POS,
            'ref': record.REF,
            'alt': record.ALT,
        })
        return

    # Skip processing effects since they are only loaded once
    if self._effects_exists(md5, variant_id):
        return

    effects = []

    # NOTE(review): the original author was unsure whether EFF is a list
    # here; it is iterated as one. A missing EFF is treated as "no
    # effects" so the logging branch below handles it.
    effects_line = record.INFO.get('EFF', [])

    # Multiple separate SNPEff records
    for eff in effects_line:
        effect, values = self._snpeff_dict(eff)

        transcript = values.get('Transcript')
        gene_pk = self.get_gene(values.get('Gene_Name'))

        row = [
            variant_id,
            values.get('Codon_Change'),
            values.get('Amino_acid_change'),
            self.effects.get(effect),
            self.functional_classes.get(values.get('Functional_Class')),
            gene_pk,
            self.get_transcript(gene_pk, transcript),
        ]

        # Extension fields from snpEff CBMi fork
        segment = values.get('Segment')
        hgvs_c = values.get('HGVS_DNA_nomenclature')
        hgvs_p = values.get('HGVS_protein_nomenclature')

        # Trim off transcript prefix, clean up format
        if transcript:
            if segment and segment.startswith(transcript):
                segment = segment[len(transcript):].strip('._').replace(
                    '_', '.')
            if hgvs_c and hgvs_c.startswith(transcript):
                hgvs_c = hgvs_c[len(transcript):].lstrip(':')

        row.extend([
            segment,
            hgvs_c,
            hgvs_p,
        ])

        effects.append([self.process_column('', x) for x in row])

    if not effects:
        log.error('No effects process for variant', extra={
            'chr': record.CHROM,
            'pos': record.POS,
            'ref': record.REF,
            'alt': record.ALT,
        })

    return effects
def process_line(self, record):
    """Return the parent's cleaned columns with a trailing MD5 column.

    The MD5 is computed from the first four cleaned columns
    (presumably chrom, pos, ref and alt -- confirm against the parent).
    """
    columns = super(ThousandGProcessor, self).process_line(record)
    variant_key = columns[:4]
    columns.append(calculate_md5(*variant_key))
    return columns