Пример #1
0
    def process_line(self, record):
        """Resolve the variant and genotype key for this stream's sample.

        Returns ``None`` when any of the AD, DP, GQ, GT or PL fields of the
        sample's call is ``None``, or when a ``Result`` already exists for
        this variant/sample pair.

        NOTE(review): the visible body ends right after ``keyed_geno`` is
        resolved and never returns ``cleaned``; this looks like a truncated
        excerpt -- confirm against the complete method.
        """
        # We are revisiting the same line once for each sample until the
        # cleaning and loading are reworked.

        # Ensures the cache is updated and available
        self.variants.ensure_cache(record)

        # Calculate the MD5 of the variant itself (not the record)
        md5 = calculate_md5(record)

        # Ensure the variant exists
        variant_id = self.variants.get(md5)
        assert variant_id is not None

        cleaned = super(ResultStream, self).process_line(record)
        # Remove variant specific parts
        cleaned = cleaned[4:]

        # can these be indexed?
        call = record.genotype(self.vcf_sample)

        # Empty values cause KeyErrors, so skip calls with any missing field
        if None in (call['AD'], call['DP'], call['GQ'], call['GT'],
                    call['PL']):
            return None

        # Already seen this variant for this sample, otherwise we would get
        # a duplicate key value violation in sample_result
        if Result.objects.filter(variant=variant_id,
                                 sample=self.sample_id).exists():
            return None

        # The possibility for multiple alleles in these wide vcfs is almost
        # infinite so we need to triage the really weird ones into having a
        # reference allele "0/#" or being off the map entirely "#/#".
        try:
            keyed_geno = self.genotypes[call['GT']]
        except KeyError:
            # `except KeyError, e` was Python-2-only syntax and the bound
            # exception was never used.
            if call['GT'].startswith('0'):
                keyed_geno = self.genotypes['0/#']
            else:
                keyed_geno = self.genotypes['#/#']
Пример #2
0
    def process_line(self, record):
        """Resolve the variant and genotype key for this stream's sample.

        Returns ``None`` when any of the AD, DP, GQ, GT or PL fields of the
        sample's call is ``None``, or when a ``Result`` already exists for
        this variant/sample pair.

        NOTE(review): the visible body ends right after ``keyed_geno`` is
        resolved and never returns ``cleaned``; this looks like a truncated
        excerpt -- confirm against the complete method.
        """
        # We are revisiting the same line once for each sample until the
        # cleaning and loading are reworked.

        # Ensures the cache is updated and available
        self.variants.ensure_cache(record)

        # Calculate the MD5 of the variant itself (not the record)
        md5 = calculate_md5(record)

        # Ensure the variant exists
        variant_id = self.variants.get(md5)
        assert variant_id is not None

        cleaned = super(ResultStream, self).process_line(record)
        # Remove variant specific parts
        cleaned = cleaned[4:]

        # can these be indexed?
        call = record.genotype(self.vcf_sample)

        # Empty values cause KeyErrors, so skip calls with any missing field
        if None in (call['AD'], call['DP'], call['GQ'], call['GT'],
                    call['PL']):
            return None

        # Already seen this variant for this sample, otherwise we would get
        # a duplicate key value violation in sample_result
        if Result.objects.filter(variant=variant_id,
                                 sample=self.sample_id).exists():
            return None

        # The possibility for multiple alleles in these wide vcfs is almost
        # infinite so we need to triage the really weird ones into having a
        # reference allele "0/#" or being off the map entirely "#/#".
        try:
            keyed_geno = self.genotypes[call['GT']]
        except KeyError:
            # The Python-2-only `except KeyError, e` form is replaced; the
            # bound exception was unused.
            if call['GT'].startswith('0'):
                keyed_geno = self.genotypes['0/#']
            else:
                keyed_geno = self.genotypes['#/#']
Пример #3
0
    def get_variant(self, record):
        """Get or create the ``Variant`` described by `record`.

        The variant is keyed by the MD5 of (chrom, pos, ref, alt), where alt
        is the record's ALT values joined with '/'. If the MD5 is in
        ``self.variant_cache`` a bare ``Variant(pk=...)`` is returned without
        saving; otherwise a new variant is saved, counted in
        ``self.file_variants``, cached, and its SNPEff effects (if any) are
        loaded.
        """
        chrom, pos, ref = record.CHROM, record.POS, record.REF
        alt = '/'.join([str(x) for x in record.ALT])

        # Calculate MD5 and attempt to fetch the primary key from
        # the local cache, otherwise use it when inserting.
        md5 = calculate_md5(chrom, pos, ref, alt)

        # Ensure the cache is valid for the chromosome
        self.check_cache(chrom)

        variant_id = self.variant_cache.get(md5, None)

        # Just make a faux instance
        if variant_id:
            variant = Variant(pk=variant_id)
        # Create if it does not exist
        else:
            variant = Variant(pos=pos, ref=ref, alt=alt, md5=md5)

            # Update foreign key references
            variant.chr = self.get_chromosome(chrom)
            variant.type = self.get_variant_type(record.var_type.upper())

            # Periods are useless, store None instead. The previous
            # `cond and None or value` form always fell through to `value`
            # because None is falsy, so '.' was never replaced.
            variant.rsid = None if record.ID == '.' else record.ID
            variant.save()
            self.file_variants += 1

            # Update cache
            self.variant_cache[md5] = variant.pk

            # Process SNPEff data if this is the first time this variant
            # has been seen.
            if 'EFF' in record.INFO:
                effs = record.INFO['EFF'].split(',')
                self.load_effects(effs, variant)
        return variant
Пример #4
0
    def get_variant(self, record):
        """Get or create the ``Variant`` described by `record`.

        The variant is keyed by the MD5 of (chrom, pos, ref, alt), where alt
        is the record's ALT values joined with '/'. If the MD5 is in
        ``self.variant_cache`` a bare ``Variant(pk=...)`` is returned without
        saving; otherwise a new variant is saved, counted in
        ``self.file_variants``, cached, and its SNPEff effects (if any) are
        loaded.
        """
        chrom, pos, ref = record.CHROM, record.POS, record.REF
        alt = '/'.join([str(x) for x in record.ALT])

        # Calculate MD5 and attempt to fetch the primary key from
        # the local cache, otherwise use it when inserting.
        md5 = calculate_md5(chrom, pos, ref, alt)

        # Ensure the cache is valid for the chromosome
        self.check_cache(chrom)

        variant_id = self.variant_cache.get(md5, None)

        # Just make a faux instance
        if variant_id:
            variant = Variant(pk=variant_id)
        # Create if it does not exist
        else:
            variant = Variant(pos=pos, ref=ref, alt=alt, md5=md5)

            # Update foreign key references
            variant.chr = self.get_chromosome(chrom)
            variant.type = self.get_variant_type(record.var_type.upper())

            # Periods are useless, store None instead. The old
            # `cond and None or value` idiom could never produce None
            # (None is falsy, so the `or` branch always won).
            variant.rsid = None if record.ID == '.' else record.ID
            variant.save()
            self.file_variants += 1

            # Update cache
            self.variant_cache[md5] = variant.pk

            # Process SNPEff data if this is the first time this variant
            # has been seen.
            if 'EFF' in record.INFO:
                effs = record.INFO['EFF'].split(',')
                self.load_effects(effs, variant)
        return variant
Пример #5
0
    def readline(self, size=-1):
        """Return the next cleaned, delimited line, or '' when exhausted.

        `size` is ignored since a complete line must be processed. Records
        that fail validation, or whose MD5 is already in ``self.variants``
        while the cache is in use, are skipped.
        """
        while True:
            try:
                record = next(self.reader)
            except StopIteration:
                # Reader is exhausted: signal end-of-stream
                return ''

            # Skip anything that is not a valid record
            if not checks.record_is_valid(record):
                continue

            if self.use_cache:
                # Ensures the cache is updated and available
                self.variants.ensure_cache(record)

            # MD5 of the variant itself (not the record)
            md5 = calculate_md5(record)

            # Skip variants that have already been loaded
            if self.use_cache and md5 in self.variants:
                continue

            fields = self.process_line(record)
            fields.append(md5)
            return self.outdel.join(fields) + '\n'
Пример #6
0
    def readline(self, size=-1):
        """Return the next cleaned, delimited line, or '' when exhausted.

        `size` is ignored since a complete line must be processed. Records
        that fail validation, or whose MD5 is already in ``self.variants``
        while the cache is in use, are skipped.
        """
        while True:
            try:
                record = next(self.reader)
            except StopIteration:
                # Nothing left to read
                return ''

            # Only valid records are considered
            if not checks.record_is_valid(record):
                continue

            if self.use_cache:
                # Ensures the cache is updated and available
                self.variants.ensure_cache(record)

            # MD5 of the variant itself (not the record)
            md5 = calculate_md5(record)

            # Variant already loaded: move on to the next record
            if self.use_cache and md5 in self.variants:
                continue

            columns = self.process_line(record)
            columns.append(md5)
            return self.outdel.join(columns) + '\n'
Пример #7
0
 def process_line(self, record):
     """Return the parent's cleaned columns with an MD5 column appended."""
     columns = super(EVSProcessor, self).process_line(record)
     # The MD5 is computed from the first four cleaned columns
     # (presumably chrom, pos, ref, alt -- confirm against the parent).
     key_fields = columns[:4]
     columns.append(calculate_md5(*key_fields))
     return columns
Пример #8
0
    def process_line(self, record):
        """Build one processed row per SNPEff effect annotated on `record`.

        Returns ``None`` when the variant is not found via ``self.variants``
        (an error is logged) or when ``_effects_exists`` reports that its
        effects are already present. Otherwise returns the list of processed
        rows; an empty list is also logged as an error.
        """
        # Calculate MD5 using extracted values
        md5 = calculate_md5(record)

        # Ensures the cache is updated and available
        self.variants.ensure_cache(record)

        # The variant must already exist
        variant_id = self.variants.get(md5)
        if not variant_id:
            log.error('Missing variant', extra={
                'chr': record.CHROM,
                'pos': record.POS,
                'ref': record.REF,
                'alt': record.ALT,
            })
            return

        # Effects are only loaded once per variant, so skip repeats
        if self._effects_exists(md5, variant_id):
            return

        rows = []

        # Each entry is a separate SNPEff record
        for raw_effect in record.INFO.get('EFF', []):
            effect, values = self._snpeff_dict(raw_effect)

            transcript = values.get('Transcript')
            gene_pk = self.get_gene(values.get('Gene_Name'))

            codon_change = values.get('Codon_Change')
            amino_acid_change = values.get('Amino_acid_change')
            effect_ref = self.effects.get(effect)
            functional_class = self.functional_classes.get(
                values.get('Functional_Class'))
            transcript_ref = self.get_transcript(gene_pk, transcript)

            # Extension fields from snpEff CBMi fork
            segment = values.get('Segment')
            hgvs_c = values.get('HGVS_DNA_nomenclature')
            hgvs_p = values.get('HGVS_protein_nomenclature')

            # Trim off transcript prefix, clean up format
            if transcript:
                prefix_len = len(transcript)
                if segment and segment.startswith(transcript):
                    trimmed = segment[prefix_len:].strip('._')
                    segment = trimmed.replace('_', '.')
                if hgvs_c and hgvs_c.startswith(transcript):
                    hgvs_c = hgvs_c[prefix_len:].lstrip(':')

            row = [
                variant_id,
                codon_change,
                amino_acid_change,
                effect_ref,
                functional_class,
                gene_pk,
                transcript_ref,
                segment,
                hgvs_c,
                hgvs_p,
            ]

            rows.append([self.process_column('', value) for value in row])

        if not rows:
            log.error('No effects process for variant', extra={
                'chr': record.CHROM,
                'pos': record.POS,
                'ref': record.REF,
                'alt': record.ALT,
            })

        return rows
Пример #9
0
    def process_line(self, record):
        """Build the cleaned result row for this stream's sample.

        Returns ``None`` if a ``Result`` already exists for the variant and
        sample. Otherwise returns the parent class's cleaned columns, minus
        the first four, extended with the processed result columns.
        """
        # The same line is revisited once per sample until the cleaning and
        # loading are reworked.

        # Make sure the variant cache is updated and available
        self.variants.ensure_cache(record)

        # MD5 of the variant itself (not the record)
        md5 = calculate_md5(record)

        # The variant must already exist
        variant_id = self.variants.get(md5)
        assert variant_id is not None

        # Drop the variant specific leading columns
        cleaned = super(ResultStream, self).process_line(record)[4:]

        # can these be indexed?
        call = record.genotype(self.vcf_sample)

        # A repeat of this variant for this sample would cause a duplicate
        # key value violation in sample_result.
        already_loaded = Result.objects.filter(
            variant=variant_id, sample=self.sample_id).exists()
        if already_loaded:
            return None

        # Wide vcfs can carry almost any allele combination, so unknown
        # genotypes are triaged into "0/#" (has a reference allele) or
        # "#/#" (off the map entirely).
        keyed_geno = None
        gt = getattr(call, 'GT', None)
        if gt:
            try:
                keyed_geno = self.genotypes[gt]
            except KeyError:
                fallback = '0/#' if gt.startswith('0') else '#/#'
                keyed_geno = self.genotypes[fallback]

        dp = getattr(call, 'DP', None)
        gq = getattr(call, 'GQ', None)

        # Only the first two allele depths are kept
        ad = getattr(call, 'AD', None)
        ad0, ad1 = (ad[0], ad[1]) if ad and len(ad) > 1 else (None, None)

        pl = getattr(call, 'PL', None)
        if pl:
            pl = ','.join(str(value) for value in pl)

        # Remaining columns, in output order
        remaining = [variant_id, self.sample_id, dp, record.QUAL, keyed_geno,
                     gq, ad0, ad1, pl, self.now, self.now]
        processed = [self.process_column('', value) for value in remaining]

        cleaned.extend(processed)
        return cleaned
Пример #10
0
    def process_line(self, record):
        """Build one processed row per SNPEff effect annotated on `record`.

        Returns ``None`` when the variant is not found via ``self.variants``
        (an error is logged) or when ``_effects_exists`` reports that its
        effects are already present. Otherwise returns the list of processed
        rows; an empty list (including when the record has no 'EFF' entry)
        is also logged as an error.
        """
        # Calculate MD5 using extracted values
        md5 = calculate_md5(record)

        # Ensures the cache is updated and available
        self.variants.ensure_cache(record)

        # Ensure the variant exists
        variant_id = self.variants.get(md5)

        if not variant_id:
            log.error('Missing variant',
                      extra={
                          'chr': record.CHROM,
                          'pos': record.POS,
                          'ref': record.REF,
                          'alt': record.ALT,
                      })
            return

        # Skip processing effects since they are only loaded once
        if self._effects_exists(md5, variant_id):
            return

        effects = []

        # Is this returning a list now??
        # Use .get() so a record without an EFF annotation reaches the
        # "no effects" logging below instead of raising KeyError.
        effects_line = record.INFO.get('EFF', [])

        # Multiple separate SNPEff records
        for eff in effects_line:
            effect, values = self._snpeff_dict(eff)

            transcript = values.get('Transcript')
            gene_pk = self.get_gene(values.get('Gene_Name'))

            row = [
                variant_id,
                values.get('Codon_Change'),
                values.get('Amino_acid_change'),
                self.effects.get(effect),
                self.functional_classes.get(values.get('Functional_Class')),
                gene_pk,
                self.get_transcript(gene_pk, transcript),
            ]

            # Extension fields from snpEff CBMi fork
            segment = values.get('Segment')
            hgvs_c = values.get('HGVS_DNA_nomenclature')
            hgvs_p = values.get('HGVS_protein_nomenclature')

            # Trim off transcript prefix, clean up format
            if transcript:
                if segment and segment.startswith(transcript):
                    segment = segment[len(transcript):].strip('._').replace(
                        '_', '.')
                if hgvs_c and hgvs_c.startswith(transcript):
                    hgvs_c = hgvs_c[len(transcript):].lstrip(':')

            row.extend([
                segment,
                hgvs_c,
                hgvs_p,
            ])

            effects.append([self.process_column('', x) for x in row])

        if not effects:
            log.error('No effects process for variant',
                      extra={
                          'chr': record.CHROM,
                          'pos': record.POS,
                          'ref': record.REF,
                          'alt': record.ALT,
                      })

        return effects
Пример #11
0
 def process_line(self, record):
     """Return the parent's cleaned columns with an MD5 column appended."""
     columns = super(ThousandGProcessor, self).process_line(record)
     # The MD5 is computed from the first four cleaned columns
     # (presumably chrom, pos, ref, alt -- confirm against the parent).
     key_fields = columns[:4]
     columns.append(calculate_md5(*key_fields))
     return columns