Exemplo n.º 1
0
def create_population_frequency_store_from_settings(db, settings_module):
    """
    Set up a PopulationFrequencyStore on ``db`` and load the reference populations.

    Args:
        db: pymongo Database handed to the store (expected to be empty)
        settings_module: object exposing a REFERENCE_POPULATIONS attribute,
            which is passed to the store's load_populations()

    Nothing is returned; the store object is discarded after loading.
    """
    frequency_store = PopulationFrequencyStore(db)
    # Indices are requested before any population data is loaded.
    frequency_store.ensure_indices()
    frequency_store.load_populations(settings_module.REFERENCE_POPULATIONS)
Exemplo n.º 2
0
 def __init__(self, settings_module, custom_annotator=None):
     """
     Connect to MongoDB and build the helpers used for annotation.

     The mongo host comes from the MONGO_SERVICE_HOSTNAME environment
     variable, defaulting to 'localhost'.

     Args:
         settings_module: object providing db_name, reference_populations,
             vep_perl_path, vep_cache_dir and vep_batch_size
         custom_annotator: optional extra annotator, stored as-is
     """
     mongo_host = os.environ.get('MONGO_SERVICE_HOSTNAME', 'localhost')
     mongo_client = pymongo.MongoClient(host=mongo_host)
     self._db = mongo_client[settings_module.db_name]

     populations = settings_module.reference_populations
     self._population_frequency_store = PopulationFrequencyStore(
         db_conn=self._db,
         reference_populations=populations,
     )

     # settings_module.human_ancestor_fa is currently not passed through.
     self._vep_annotator = HackedVEPAnnotator(
         vep_perl_path=settings_module.vep_perl_path,
         vep_cache_dir=settings_module.vep_cache_dir,
         vep_batch_size=settings_module.vep_batch_size,
         human_ancestor_fa=None,
     )

     self._custom_annotator = custom_annotator
     self.reference_populations = populations
     self.reference_population_slugs = [entry['slug'] for entry in populations]
Exemplo n.º 3
0
 def __init__(self, settings_module, custom_annotator=None):
     """
     Connect to MongoDB (default host/port) and build the annotation helpers.

     Args:
         settings_module: object providing db_name, reference_populations,
             vep_perl_path, vep_cache_dir and vep_batch_size
         custom_annotator: optional extra annotator, stored as-is
     """
     # pymongo.Connection was deprecated in pymongo 2.4 and removed in 3.0.
     # Use MongoClient when the installed pymongo provides it and fall back
     # to Connection only on very old installs.
     client_factory = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
     self._db = client_factory()[settings_module.db_name]
     self._population_frequency_store = PopulationFrequencyStore(
         db_conn=self._db,
         reference_populations=settings_module.reference_populations,
     )
     self._vep_annotator = HackedVEPAnnotator(
         vep_perl_path=settings_module.vep_perl_path,
         vep_cache_dir=settings_module.vep_cache_dir,
         vep_batch_size=settings_module.vep_batch_size,
         human_ancestor_fa=None,
         #human_ancestor_fa=settings_module.human_ancestor_fa,
     )
     self._custom_annotator = custom_annotator
     self.reference_populations = settings_module.reference_populations
     self.reference_population_slugs = [
         pop['slug'] for pop in settings_module.reference_populations
     ]
Exemplo n.º 4
0
 def __init__(self, custom_annotator=None):
     """
     Build the frequency store and VEP helper from settings.ANNOTATOR_SETTINGS.

     Args:
         custom_annotator: optional extra annotator, stored as-is
     """
     annotator_settings = settings.ANNOTATOR_SETTINGS
     populations = annotator_settings.reference_populations

     # The store receives a callable for obtaining the database rather than
     # a database object.
     self._population_frequency_store = PopulationFrequencyStore(
         get_db=self.get_annotator_datastore,
         reference_populations=populations,
     )

     # No human_ancestor_fa setting is passed through; None is used instead.
     self._vep_annotator = HackedVEPAnnotator(
         vep_perl_path=annotator_settings.vep_perl_path,
         vep_cache_dir=annotator_settings.vep_cache_dir,
         vep_batch_size=annotator_settings.vep_batch_size,
         human_ancestor_fa=None,
     )

     self._custom_annotator = custom_annotator
     self.reference_populations = populations
     self.reference_population_slugs = [entry['slug'] for entry in populations]
Exemplo n.º 5
0
 def __init__(self, settings_module, custom_annotator=None):
     """
     Connect to MongoDB (default host/port) and build the annotation helpers.

     Args:
         settings_module: object providing db_name, reference_populations,
             vep_perl_path, vep_cache_dir and vep_batch_size
         custom_annotator: optional extra annotator, stored as-is
     """
     # pymongo.Connection was deprecated in pymongo 2.4 and removed in 3.0.
     # Use MongoClient when available; keep Connection as a fallback so very
     # old pymongo installs continue to work.
     client_factory = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
     self._db = client_factory()[settings_module.db_name]
     self._population_frequency_store = PopulationFrequencyStore(
         db_conn=self._db,
         reference_populations=settings_module.reference_populations,
     )
     self._vep_annotator = HackedVEPAnnotator(
         vep_perl_path=settings_module.vep_perl_path,
         vep_cache_dir=settings_module.vep_cache_dir,
         vep_batch_size=settings_module.vep_batch_size,
         human_ancestor_fa=None,
         #human_ancestor_fa=settings_module.human_ancestor_fa,
     )
     self._custom_annotator = custom_annotator
     self.reference_populations = settings_module.reference_populations
     self.reference_population_slugs = [pop['slug'] for pop in settings_module.reference_populations]
Exemplo n.º 6
0
class VariantAnnotator():
    def __init__(self, settings_module, custom_annotator=None):
        """
        Connect to MongoDB and build the helpers used for annotation.

        Args:
            settings_module: object providing db_host, db_port, db_name,
                reference_populations, vep_perl_path, vep_cache_dir and
                vep_batch_size
            custom_annotator: optional extra annotator, stored as-is
        """
        mongo_client = pymongo.MongoClient(
            host=settings_module.db_host,
            port=settings_module.db_port)
        self._db = mongo_client[settings_module.db_name]

        populations = settings_module.reference_populations
        self._population_frequency_store = PopulationFrequencyStore(
            db_conn=self._db,
            reference_populations=populations,
        )

        # settings_module.human_ancestor_fa is currently not passed through.
        self._vep_annotator = HackedVEPAnnotator(
            vep_perl_path=settings_module.vep_perl_path,
            vep_cache_dir=settings_module.vep_cache_dir,
            vep_batch_size=settings_module.vep_batch_size,
            human_ancestor_fa=None,
        )

        self._custom_annotator = custom_annotator
        self.reference_populations = populations
        self.reference_population_slugs = [
            entry['slug'] for entry in populations
        ]

    def _ensure_indices(self):
        self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    def _clear(self):
        self._db.drop_collection('variants')
        self._db.drop_collection('vcf_files')
        self._ensure_indices()

    def get_annotator_datastore(self):
        """Returns the mongo database object for the xbrowse_annotator database. This database contains the
        'variants' and 'pop_variants' collections."""
        return self._db

    def get_population_frequency_store(self):
        """Returns the PopulationFrequencyStore used to store the system-wide reference populations available on all
        projects.
        """
        return self._population_frequency_store

    def load(self):
        self._clear()
        self._ensure_indices()
        self._population_frequency_store.load()

    def get_variant(self, xpos, ref, alt):
        """Create a Variant for (xpos, ref, alt), run annotate_variant on it and return it."""
        annotated_variant = Variant(xpos, ref, alt)
        self.annotate_variant(annotated_variant)
        return annotated_variant

    def get_annotation(self, xpos, ref, alt, populations=None):
        doc = self._db.variants.find_one({
            'xpos': xpos,
            'ref': ref,
            'alt': alt
        })
        if doc is None:
            raise ValueError("Could not find annotations for variant: " +
                             str((xpos, ref, alt)))
        annotation = doc['annotation']
        if populations is None:
            populations = self.reference_population_slugs
        if populations is not None:
            freqs = {}
            for p in populations:
                freqs[p] = annotation['freqs'].get(p, 0.0)
            annotation['freqs'] = freqs
        return annotation

    def add_variants_to_annotator(self, variant_t_list, force_all=False):
        """
        Make sure that all the variants in variant_t_list are in annotator.
        For the ones that are not, go through the whole load cycle.

        Args:
            variant_t_list: list of (xpos, ref, alt, ...) tuples; only the
                first three items of each tuple are used here
            force_all: if True, re-annotate every variant instead of only
                the ones missing from the 'variants' collection
        """
        if force_all:
            variants_to_add = variant_t_list
        else:
            variants_to_add = self._get_missing_annotations(variant_t_list)

        # print(...) with a single argument behaves the same on Python 2 and
        # 3 and matches the call style already used elsewhere in this class.
        custom_annotations = None
        if self._custom_annotator:
            print("Getting custom annotations...")
            custom_annotations = self._custom_annotator.get_annotations_for_variants(
                variants_to_add)
            print("...done")

        vep_results = self._vep_annotator.get_vep_annotations_for_variants(
            variants_to_add)
        for variant_t, vep_annotation in vep_results:
            xpos, ref, alt = variant_t[0], variant_t[1], variant_t[2]
            annotation = {
                'vep_annotation': vep_annotation,
                'freqs': self._population_frequency_store.get_frequencies(
                    xpos, ref, alt),
            }
            add_convenience_annotations(annotation)
            if self._custom_annotator:
                # Custom fields are merged last, so they win on key clashes.
                annotation.update(custom_annotations[variant_t])
            self._db.variants.update(
                {'xpos': xpos, 'ref': ref, 'alt': alt},
                {'$set': {'annotation': annotation}},
                upsert=True)

    def add_vcf_file_to_annotator(self, vcf_file_path, force_all=False):
        """
        Add the variants in vcf_file_path to annotator.
        Convenience wrapper around add_variants_to_annotator.

        Variants are forwarded in batches of 100000. Once done, the file is
        recorded in the 'vcf_files' collection; unless force_all is set, a
        file that is already recorded is skipped.
        """
        batch_size = 100000
        if not force_all and self._db.vcf_files.find_one(
                {'vcf_file_path': vcf_file_path}):
            print("VCF already annotated")
            return

        print("Scanning VCF file first...")
        variant_t_list = []
        for variant_t in vcf_stuff.iterate_tuples(
                compressed_file(vcf_file_path)):
            variant_t_list.append(variant_t)
            if len(variant_t_list) == batch_size:
                print("Adding another {} variants, through {}".format(
                    batch_size, variant_t_list[-1][0]))
                self.add_variants_to_annotator(variant_t_list, force_all)
                variant_t_list = []
        # Flush the final, partially filled batch.
        self.add_variants_to_annotator(variant_t_list, force_all)

        # Upsert instead of insert: with force_all=True the file may already
        # be recorded, and a plain insert would leave duplicate records.
        self._db.vcf_files.update(
            {'vcf_file_path': vcf_file_path},
            {
                'vcf_file_path': vcf_file_path,
                'date_added': datetime.datetime.utcnow()
            },
            upsert=True)

    def get_vcf_file_from_annotator(self, vcf_file_path):

        return self._db.vcf_files.find_one({'vcf_file_path': vcf_file_path})

    def add_preannotated_vcf_file(self,
                                  vcf_file_path,
                                  force=False,
                                  start_from_chrom=None,
                                  end_with_chrom=None):
        """
        Load a VCF that already carries VEP annotations (CSQ INFO field)
        into the 'variants' collection.

        Args:
            vcf_file_path: path to the VCF; a tabix-readable file is needed
                when a chromosome range is given
            force: reload even if the file is already recorded in 'vcf_files'
            start_from_chrom: first chromosome to load (optional 'chr' prefix)
            end_with_chrom: last chromosome to load, inclusive

        Raises:
            ValueError: if the CSQ field is missing, has no "Format:"
                description, or lacks any of the expected sub-fields; also
                if a chromosome bound is not one of 1-22, X, Y.
        """
        import pprint  # local import, hoisted out of the per-allele loop

        if not force and self._db.vcf_files.find_one(
                {'vcf_file_path': vcf_file_path}):
            print("VCF %(vcf_file_path)s already loaded into db.variants cache"
                  % {'vcf_file_path': vcf_file_path})
            return

        r = vcf.VCFReader(filename=vcf_file_path)
        if "CSQ" not in r.infos:
            raise ValueError(
                "ERROR: CSQ field not found in %s. Was this VCF annotated with VEP?"
                % vcf_file_path)

        expected_csq_fields = set(
            "Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|SYMBOL|SYMBOL_SOURCE|HGNC_ID|BIOTYPE|CANONICAL|TSL|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|SIFT|PolyPhen|EXON|INTRON|DOMAINS|HGVSc|HGVSp|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF_info|LoF_flags|LoF_filter|LoF|Polyphen2_HVAR_pred|CADD_phred|MutationTaster_pred|MetaSVM_pred|SIFT_pred|FATHMM_pred"
            .split("|"))
        csq_description = str(r.infos["CSQ"].desc)
        if "Format:" not in csq_description:
            # Previously this surfaced as a bare IndexError from split()[1].
            raise ValueError(
                "ERROR: CSQ header in %s has no 'Format:' section: %s" %
                (vcf_file_path, csq_description))
        actual_csq_fields_string = csq_description.split("Format:")[1].strip()
        actual_csq_fields = set(actual_csq_fields_string.split("|"))
        missing_csq_fields = expected_csq_fields - actual_csq_fields
        if missing_csq_fields:
            raise ValueError(
                "ERROR: VEP did not add all expected CSQ fields to the VCF. The VCF's CSQ = %s and is missing these fields: %s"
                % (actual_csq_fields_string, missing_csq_fields))

        # Handles opened below; both are closed in the finally block.
        tabix_file = None
        plain_file = None

        if start_from_chrom or end_with_chrom:
            if start_from_chrom:
                print("Start chrom: chr%s" % start_from_chrom)
            if end_with_chrom:
                print("End chrom: chr%s" % end_with_chrom)

            chrom_list = list(map(str, range(1, 23))) + ['X', 'Y']
            chrom_list_start_index = 0
            if start_from_chrom:
                chrom_list_start_index = chrom_list.index(
                    start_from_chrom.replace("chr", "").upper())

            chrom_list_end_index = len(chrom_list)
            if end_with_chrom:
                chrom_list_end_index = chrom_list.index(
                    end_with_chrom.replace("chr", "").upper())

            tabix_file = pysam.TabixFile(vcf_file_path)
            # Start with the header lines, then append each chromosome's
            # records in order; the end index is inclusive, hence the + 1.
            vcf_iter = tabix_file.header
            for chrom in chrom_list[
                    chrom_list_start_index:chrom_list_end_index + 1]:
                print("Will load chrom: " + chrom)
                try:
                    vcf_iter = itertools.chain(vcf_iter,
                                               tabix_file.fetch(chrom))
                except ValueError as e:
                    print("WARNING: add_preannotated_vcf_file: " + str(e))

            vcf_file_obj = vcf_iter
        else:
            print("Loading pre-annotated VCF file: %s into db.variants cache" %
                  vcf_file_path)
            if vcf_file_path.endswith('.gz'):
                plain_file = gzip.open(vcf_file_path)
            else:
                plain_file = open(vcf_file_path)
            vcf_file_obj = plain_file

        allele_count = 0
        try:
            for variant, vep_annotation in vep_annotations.parse_vep_annotations_from_vcf(
                    vcf_file_obj):
                variant_t = variant.unique_tuple()
                allele_count += 1
                annotation = {
                    'vep_annotation': vep_annotation,
                    'freqs': self._population_frequency_store.get_frequencies(
                        variant_t[0], variant_t[1], variant_t[2]),
                }

                add_convenience_annotations(annotation)

                # NOTE(review): the result is unused; the call is kept in
                # case it validates xpos -- confirm and drop if it does not.
                genomeloc.get_chr_pos(variant_t[0])

                worst_annotation = vep_annotation[
                    annotation["worst_vep_annotation_index"]]
                annotation.update(get_predictors(worst_annotation))

                # Progress marker every 10000 alleles.
                if allele_count % 10000 == 0:
                    pprint.pprint(variant_t)

                self._db.variants.update(
                    {
                        'xpos': variant_t[0],
                        'ref': variant_t[1],
                        'alt': variant_t[2]
                    }, {'$set': {
                        'annotation': annotation
                    }},
                    upsert=True)
        finally:
            for handle in (tabix_file, plain_file):
                if handle is not None:
                    handle.close()

        print("Finished parsing %s alleles from %s" %
              (allele_count, vcf_file_path))
        self._db.vcf_files.update(
            {'vcf_file_path': vcf_file_path},
            {
                'vcf_file_path': vcf_file_path,
                'date_added': datetime.datetime.utcnow()
            },
            upsert=True)

    def _get_missing_annotations(self, variant_t_list):
        ret = []
        for variant_t in variant_t_list:
            if not self._db.variants.find_one({
                    'xpos': variant_t[0],
                    'ref': variant_t[1],
                    'alt': variant_t[2]
            }):
                ret.append(variant_t)
        return ret

    def annotate_variant(self, variant, populations=None):
        if not hasattr(variant, 'annotation') or not variant.annotation:
            try:
                annotation = self.get_annotation(variant.xpos, variant.ref,
                                                 variant.alt, populations)
            except ValueError, e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                variant.annotation = None
                return

            variant.annotation = annotation
        else:
Exemplo n.º 7
0
class VariantAnnotator():
    """
    Caches per-variant annotations (VEP output plus population frequencies,
    optionally custom annotations) in the mongo 'variants' collection, and
    records processed VCF files in 'vcf_files'.
    """

    def __init__(self, settings_module, custom_annotator=None):
        """
        Args:
            settings_module: object providing db_name, reference_populations,
                vep_perl_path, vep_cache_dir and vep_batch_size
            custom_annotator: optional extra annotator, stored as-is
        """
        # pymongo.Connection was deprecated in pymongo 2.4 and removed in
        # 3.0; use MongoClient when available, Connection only as fallback.
        client_factory = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
        self._db = client_factory()[settings_module.db_name]
        self._population_frequency_store = PopulationFrequencyStore(
            db_conn=self._db,
            reference_populations=settings_module.reference_populations,
        )
        self._vep_annotator = HackedVEPAnnotator(
            vep_perl_path=settings_module.vep_perl_path,
            vep_cache_dir=settings_module.vep_cache_dir,
            vep_batch_size=settings_module.vep_batch_size,
            human_ancestor_fa=None,
            #human_ancestor_fa=settings_module.human_ancestor_fa,
        )
        self._custom_annotator = custom_annotator
        self.reference_populations = settings_module.reference_populations
        self.reference_population_slugs = [pop['slug'] for pop in settings_module.reference_populations]

    def _ensure_indices(self):
        """Ensure the compound (xpos, ref, alt) index on the variants collection."""
        self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    def _clear(self):
        """Drop the 'variants' and 'vcf_files' collections and re-create the index."""
        self._db.drop_collection('variants')
        self._db.drop_collection('vcf_files')
        self._ensure_indices()

    def load(self):
        """Reset the annotator collections and load the population frequency store."""
        self._clear()
        self._ensure_indices()
        self._population_frequency_store.load()

    def get_variant(self, xpos, ref, alt):
        """Create a Variant for (xpos, ref, alt), annotate it and return it."""
        variant = Variant(xpos, ref, alt)
        self.annotate_variant(variant)
        return variant

    def get_annotation(self, xpos, ref, alt, populations=None):
        """
        Return the stored annotation dict for a variant, with 'freqs'
        restricted to `populations` (default: all reference population
        slugs); populations without a stored frequency get 0.0.

        Raises:
            ValueError: if the variant has no document in 'variants'.
        """
        doc = self._db.variants.find_one({'xpos': xpos, 'ref': ref, 'alt': alt})
        if doc is None:
            # Without this check a missing variant failed with an opaque
            # TypeError when subscripting None.
            raise ValueError("Could not find annotations for variant: " +
                             str((xpos, ref, alt)))
        annotation = doc['annotation']
        if populations is None:
            populations = self.reference_population_slugs
        if populations is not None:
            freqs = {}
            for p in populations:
                freqs[p] = annotation['freqs'].get(p, 0.0)
            annotation['freqs'] = freqs
        return annotation

    def add_variants_to_annotator(self, variant_t_list, force_all=False):
        """
        Make sure that all the variants in variant_t_list are in annotator
        For the ones that are not, go through the whole load cycle
        """
        if force_all:
            variants_to_add = variant_t_list
        else:
            variants_to_add = self._get_missing_annotations(variant_t_list)
        custom_annotations = None
        if self._custom_annotator:
            # print(...) with one argument behaves the same on Python 2 and 3.
            print("Getting custom annotations...")
            custom_annotations = self._custom_annotator.get_annotations_for_variants(variants_to_add)
            print("...done")
        for variant_t, vep_annotation in self._vep_annotator.get_vep_annotations_for_variants(variants_to_add):
            annotation = {
                'vep_annotation': vep_annotation,
                'freqs': self._population_frequency_store.get_frequencies(variant_t[0], variant_t[1], variant_t[2]),
            }
            add_convenience_annotations(annotation)
            if self._custom_annotator:
                annotation.update(custom_annotations[variant_t])
            self._db.variants.update({
                'xpos': variant_t[0],
                'ref': variant_t[1],
                'alt': variant_t[2]
            }, {'$set': {'annotation': annotation},
            }, upsert=True)

    def add_vcf_file_to_annotator(self, vcf_file_path, force_all=False):
        """
        Add the variants in vcf_file_path to annotator
        Convenience wrapper around add_variants_to_annotator
        """
        batch_size = 100000
        if not force_all and self._db.vcf_files.find_one({'vcf_file_path': vcf_file_path}):
            print("VCF already annotated")
            return
        print("Scanning VCF file first...")
        variant_t_list = []
        for variant_t in vcf_stuff.iterate_tuples(compressed_file(vcf_file_path)):
            variant_t_list.append(variant_t)
            if len(variant_t_list) == batch_size:
                print("Adding another {} variants, through {}".format(batch_size, variant_t_list[-1][0]))
                self.add_variants_to_annotator(variant_t_list, force_all)
                variant_t_list = []
        # Flush the final, partially filled batch.
        self.add_variants_to_annotator(variant_t_list, force_all)
        # Upsert rather than insert so that a forced re-run does not leave
        # duplicate records for the same file.
        self._db.vcf_files.update(
            {'vcf_file_path': vcf_file_path},
            {'vcf_file_path': vcf_file_path, 'date_added': datetime.datetime.utcnow()},
            upsert=True)

    def _get_missing_annotations(self, variant_t_list):
        """Return the tuples from variant_t_list that have no document in 'variants'."""
        ret = []
        for variant_t in variant_t_list:
            if not self._db.variants.find_one(
                {'xpos': variant_t[0],
                 'ref': variant_t[1],
                 'alt': variant_t[2]}):
                ret.append(variant_t)
        return ret

    def annotate_variant(self, variant, populations=None):
        """
        Attach the stored annotation to `variant` (sets .annotation,
        .gene_ids and .coding_gene_ids). Raises ValueError if the variant
        has no stored annotation.
        """
        annotation = self.get_annotation(variant.xpos, variant.ref, variant.alt, populations)
        variant.annotation = annotation

        # todo: gotta remove one
        # ...or actually maybe both
        variant.gene_ids = list(annotation['gene_ids'])
        variant.coding_gene_ids = list(annotation['coding_gene_ids'])
Exemplo n.º 8
0
class VariantAnnotator():
    """
    Caches per-variant annotations (VEP output plus population frequencies,
    optionally custom annotations) in the mongo 'variants' collection, and
    records processed VCF files in 'vcf_files'.
    """

    def __init__(self, settings_module, custom_annotator=None):
        """
        Args:
            settings_module: object providing db_name, reference_populations,
                vep_perl_path, vep_cache_dir and vep_batch_size
            custom_annotator: optional extra annotator, stored as-is
        """
        # pymongo.Connection was deprecated in pymongo 2.4 and removed in
        # 3.0; use MongoClient when available, Connection only as fallback.
        client_factory = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
        self._db = client_factory()[settings_module.db_name]
        self._population_frequency_store = PopulationFrequencyStore(
            db_conn=self._db,
            reference_populations=settings_module.reference_populations,
        )
        self._vep_annotator = HackedVEPAnnotator(
            vep_perl_path=settings_module.vep_perl_path,
            vep_cache_dir=settings_module.vep_cache_dir,
            vep_batch_size=settings_module.vep_batch_size,
            human_ancestor_fa=None,
            #human_ancestor_fa=settings_module.human_ancestor_fa,
        )
        self._custom_annotator = custom_annotator
        self.reference_populations = settings_module.reference_populations
        self.reference_population_slugs = [
            pop['slug'] for pop in settings_module.reference_populations
        ]

    def _ensure_indices(self):
        """Ensure the compound (xpos, ref, alt) index on the variants collection."""
        self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    def _clear(self):
        """Drop the 'variants' and 'vcf_files' collections and re-create the index."""
        self._db.drop_collection('variants')
        self._db.drop_collection('vcf_files')
        self._ensure_indices()

    def load(self):
        """Reset the annotator collections and load the population frequency store."""
        self._clear()
        self._ensure_indices()
        self._population_frequency_store.load()

    def get_variant(self, xpos, ref, alt):
        """Create a Variant for (xpos, ref, alt), annotate it and return it."""
        variant = Variant(xpos, ref, alt)
        self.annotate_variant(variant)
        return variant

    def get_annotation(self, xpos, ref, alt, populations=None):
        """
        Return the stored annotation dict for a variant, with 'freqs'
        restricted to `populations` (default: all reference population
        slugs); populations without a stored frequency get 0.0.

        Raises:
            ValueError: if the variant has no document in 'variants'.
        """
        doc = self._db.variants.find_one({
            'xpos': xpos,
            'ref': ref,
            'alt': alt
        })
        if doc is None:
            raise ValueError("Could not find annotations for variant: " +
                             str((xpos, ref, alt)))
        annotation = doc['annotation']
        if populations is None:
            populations = self.reference_population_slugs
        if populations is not None:
            freqs = {}
            for p in populations:
                freqs[p] = annotation['freqs'].get(p, 0.0)
            annotation['freqs'] = freqs
        return annotation

    def add_variants_to_annotator(self, variant_t_list, force_all=False):
        """
        Make sure that all the variants in variant_t_list are in annotator
        For the ones that are not, go through the whole load cycle
        """
        if force_all:
            variants_to_add = variant_t_list
        else:
            variants_to_add = self._get_missing_annotations(variant_t_list)
        custom_annotations = None
        if self._custom_annotator:
            # print(...) with one argument behaves the same on Python 2 and 3.
            print("Getting custom annotations...")
            custom_annotations = self._custom_annotator.get_annotations_for_variants(
                variants_to_add)
            print("...done")
        for variant_t, vep_annotation in self._vep_annotator.get_vep_annotations_for_variants(
                variants_to_add):
            annotation = {
                'vep_annotation':
                vep_annotation,
                'freqs':
                self._population_frequency_store.get_frequencies(
                    variant_t[0], variant_t[1], variant_t[2]),
            }
            add_convenience_annotations(annotation)
            if self._custom_annotator:
                annotation.update(custom_annotations[variant_t])
            self._db.variants.update(
                {
                    'xpos': variant_t[0],
                    'ref': variant_t[1],
                    'alt': variant_t[2]
                }, {
                    '$set': {
                        'annotation': annotation
                    },
                },
                upsert=True)

    def add_vcf_file_to_annotator(self, vcf_file_path, force_all=False):
        """
        Add the variants in vcf_file_path to annotator
        Convenience wrapper around add_variants_to_annotator
        """
        batch_size = 100000
        if not force_all and self._db.vcf_files.find_one(
                {'vcf_file_path': vcf_file_path}):
            print("VCF already annotated")
            return
        print("Scanning VCF file first...")
        variant_t_list = []
        for variant_t in vcf_stuff.iterate_tuples(
                compressed_file(vcf_file_path)):
            variant_t_list.append(variant_t)
            if len(variant_t_list) == batch_size:
                print("Adding another {} variants, through {}".format(
                    batch_size, variant_t_list[-1][0]))
                self.add_variants_to_annotator(variant_t_list, force_all)
                variant_t_list = []
        # Flush the final, partially filled batch.
        self.add_variants_to_annotator(variant_t_list, force_all)
        # Upsert (as add_preannotated_vcf_file does) so that a forced re-run
        # does not leave duplicate records for the same file.
        self._db.vcf_files.update(
            {'vcf_file_path': vcf_file_path},
            {
                'vcf_file_path': vcf_file_path,
                'date_added': datetime.datetime.utcnow()
            },
            upsert=True)

    def add_preannotated_vcf_file(self, vcf_file_path, force=False):
        """
        Load a VCF that already carries VEP annotations into the 'variants'
        collection, parsing them with
        vep_annotations.parse_vep_annotations_from_vcf.

        Unless `force` is set, a file already recorded in 'vcf_files' is
        skipped.
        """
        if not force and self._db.vcf_files.find_one(
                {'vcf_file_path': vcf_file_path}):
            print("VCF %(vcf_file_path)s already loaded into db.variants cache"
                  % {'vcf_file_path': vcf_file_path})
            return

        print("Loading pre-annotated VCF file: %s into db.variants cache" %
              vcf_file_path)
        # The with-block guarantees the file is closed even if parsing or a
        # database write fails part-way through.
        with open(vcf_file_path) as vcf_file_obj:
            for variant, vep_annotation in vep_annotations.parse_vep_annotations_from_vcf(
                    vcf_file_obj):
                variant_t = variant.unique_tuple()

                annotation = {
                    'vep_annotation':
                    vep_annotation,
                    'freqs':
                    self._population_frequency_store.get_frequencies(
                        variant_t[0], variant_t[1], variant_t[2]),
                }

                add_convenience_annotations(annotation)

                if self._custom_annotator:
                    custom_annotations = self._custom_annotator.get_annotations_for_variants(
                        [variant_t])
                    annotation.update(custom_annotations[variant_t])

                self._db.variants.update(
                    {
                        'xpos': variant_t[0],
                        'ref': variant_t[1],
                        'alt': variant_t[2]
                    }, {'$set': {
                        'annotation': annotation
                    }},
                    upsert=True)

        self._db.vcf_files.update(
            {'vcf_file_path': vcf_file_path},
            {
                'vcf_file_path': vcf_file_path,
                'date_added': datetime.datetime.utcnow()
            },
            upsert=True)

    def _get_missing_annotations(self, variant_t_list):
        """Return the tuples from variant_t_list that have no document in 'variants'."""
        ret = []
        for variant_t in variant_t_list:
            if not self._db.variants.find_one({
                    'xpos': variant_t[0],
                    'ref': variant_t[1],
                    'alt': variant_t[2]
            }):
                ret.append(variant_t)
        return ret

    def annotate_variant(self, variant, populations=None):
        """
        Attach the stored annotation to `variant` (sets .annotation,
        .gene_ids and .coding_gene_ids). If no annotation is stored, a
        warning goes to stderr and variant.annotation is set to None.
        """
        try:
            annotation = self.get_annotation(variant.xpos, variant.ref,
                                             variant.alt, populations)
        except ValueError as e:
            sys.stderr.write("WARNING: " + str(e) + "\n")
            variant.annotation = None
            return

        variant.annotation = annotation

        # todo: gotta remove one
        # ...or actually maybe both
        variant.gene_ids = list(annotation['gene_ids'])
        variant.coding_gene_ids = list(annotation['coding_gene_ids'])
Exemplo n.º 9
0
class VariantAnnotator(object):
    """Variant annotation cache backed by a MongoDB database.

    Annotations live in the ``variants`` collection, one document per
    ``(xpos, ref, alt)`` tuple, under the ``annotation`` key. The
    ``vcf_files`` collection records which VCF paths have been loaded.
    """

    def __init__(self, settings_module, custom_annotator=None):
        """Connect to MongoDB and build the helper annotators.

        Args:
            settings_module: object providing ``db_name``,
                ``reference_populations``, ``vep_perl_path``,
                ``vep_cache_dir`` and ``vep_batch_size``.
            custom_annotator: optional object exposing
                ``get_annotations_for_variants(variant_t_list)``; its results
                are merged into every annotation that gets stored.
        """
        # NOTE(review): pymongo.Connection no longer exists in pymongo >= 3.0
        # (MongoClient replaces it). Left unchanged here because MongoClient
        # defaults to acknowledged writes - confirm before switching.
        self._db = pymongo.Connection()[settings_module.db_name]
        self._population_frequency_store = PopulationFrequencyStore(
            db_conn=self._db,
            reference_populations=settings_module.reference_populations,
        )
        self._vep_annotator = HackedVEPAnnotator(
            vep_perl_path=settings_module.vep_perl_path,
            vep_cache_dir=settings_module.vep_cache_dir,
            vep_batch_size=settings_module.vep_batch_size,
            human_ancestor_fa=None,
            #human_ancestor_fa=settings_module.human_ancestor_fa,
        )
        self._custom_annotator = custom_annotator
        self.reference_populations = settings_module.reference_populations
        self.reference_population_slugs = [pop['slug'] for pop in settings_module.reference_populations]

    def _ensure_indices(self):
        """Ensure the compound (xpos, ref, alt) index on ``variants`` exists."""
        self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

    def _clear(self):
        """Drop the ``variants`` and ``vcf_files`` collections, then re-create the index."""
        self._db.drop_collection('variants')
        self._db.drop_collection('vcf_files')
        self._ensure_indices()

    def get_annotator_datastore(self):
        """Returns the mongo database object for the xbrowse_annotator database. This database contains the
        'variants' and 'pop_variants' collections."""
        return self._db

    def get_population_frequency_store(self):
        """Returns the PopulationFrequencyStore used to store the system-wide reference populations available on all
        projects.
        """
        return self._population_frequency_store

    def load(self):
        """Wipe the cache collections and reload the population frequency store."""
        self._clear()
        # _clear() already re-creates the index; this second call is kept as-is.
        self._ensure_indices()
        self._population_frequency_store.load()

    def get_variant(self, xpos, ref, alt):
        """Build a ``Variant`` for (xpos, ref, alt) and annotate it from the cache."""
        variant = Variant(xpos, ref, alt)
        self.annotate_variant(variant)
        return variant

    def get_annotation(self, xpos, ref, alt, populations=None):
        """Return the stored annotation dict for one variant.

        Args:
            xpos, ref, alt: lookup key of the variant.
            populations: iterable of population slugs to report frequencies
                for; defaults to ``self.reference_population_slugs``.

        Returns:
            The ``annotation`` sub-document. Its ``freqs`` entry is rebuilt to
            hold exactly the requested populations, with 0.0 for any
            population that has no stored frequency.

        Raises:
            ValueError: if no document exists for the variant.
        """
        doc = self._db.variants.find_one({'xpos': xpos, 'ref': ref, 'alt': alt})
        if doc is None:
            raise ValueError("Could not find annotations for variant: " + str((xpos, ref, alt)))
        annotation = doc['annotation']
        if populations is None:
            populations = self.reference_population_slugs
        # Still None only if reference_population_slugs itself is None; in that
        # case the stored freqs are returned unfiltered.
        if populations is not None:
            stored_freqs = annotation['freqs']
            annotation['freqs'] = {slug: stored_freqs.get(slug, 0.0) for slug in populations}
        return annotation

    def add_variants_to_annotator(self, variant_t_list, force_all=False):
        """
        Make sure that all the variants in variant_t_list are in annotator
        For the ones that are not, go through the whole load cycle

        Args:
            variant_t_list: list of variant tuples (xpos, ref, alt, ...).
            force_all: if True, re-annotate every variant in the list instead
                of only those missing from ``db.variants``.
        """
        if force_all:
            variants_to_add = variant_t_list
        else:
            variants_to_add = self._get_missing_annotations(variant_t_list)
        custom_annotations = None
        if self._custom_annotator:
            print("Getting custom annotations...")
            custom_annotations = self._custom_annotator.get_annotations_for_variants(variants_to_add)
            print("...done")
        for variant_t, vep_annotation in self._vep_annotator.get_vep_annotations_for_variants(variants_to_add):
            annotation = {
                'vep_annotation': vep_annotation,
                'freqs': self._population_frequency_store.get_frequencies(variant_t[0], variant_t[1], variant_t[2]),
            }
            add_convenience_annotations(annotation)
            if self._custom_annotator:
                annotation.update(custom_annotations[variant_t])
            self._db.variants.update({
                'xpos': variant_t[0],
                'ref': variant_t[1],
                'alt': variant_t[2]
            }, {'$set': {'annotation': annotation},
            }, upsert=True)

    def add_vcf_file_to_annotator(self, vcf_file_path, force_all=False):
        """
        Add the variants in vcf_file_path to annotator
        Convenience wrapper around add_variants_to_annotator

        Variants are submitted in batches of 100000. Nothing is done when the
        path is already recorded in ``db.vcf_files`` unless ``force_all`` is
        True.
        """
        if not force_all and self._db.vcf_files.find_one({'vcf_file_path': vcf_file_path}):
            print("VCF already annotated")
            return
        print("Scanning VCF file first...")
        variant_t_list = []
        for variant_t in vcf_stuff.iterate_tuples(compressed_file(vcf_file_path)):
            variant_t_list.append(variant_t)
            if len(variant_t_list) == 100000:
                print("Adding another 100000 variants, through {}".format(variant_t_list[-1][0]))
                self.add_variants_to_annotator(variant_t_list, force_all)
                variant_t_list = []
        self.add_variants_to_annotator(variant_t_list, force_all)
        # Upsert rather than insert so a forced reload refreshes the existing
        # record instead of adding a duplicate (same as add_preannotated_vcf_file).
        self._db.vcf_files.update(
            {'vcf_file_path': vcf_file_path},
            {'vcf_file_path': vcf_file_path, 'date_added': datetime.datetime.utcnow()},
            upsert=True)

    def add_preannotated_vcf_file(self, vcf_file_path, force=False):
        """
        Load a VCF that already carries VEP "CSQ" annotations into db.variants.

        Unlike add_vcf_file_to_annotator, VEP is not run: annotations are
        parsed from the file itself.

        Args:
            vcf_file_path: path of the VEP-annotated VCF.
            force: reload even if the path is already recorded in
                ``db.vcf_files``.

        Raises:
            ValueError: if the VCF has no CSQ INFO field, or the CSQ format
                lacks any of the expected sub-fields.
        """
        if not force and self._db.vcf_files.find_one({'vcf_file_path': vcf_file_path}):
            print("VCF %(vcf_file_path)s already loaded into db.variants cache" % {'vcf_file_path': vcf_file_path})
            return

        r = vcf.VCFReader(filename=vcf_file_path)
        if "CSQ" not in r.infos:
            raise ValueError("ERROR: CSQ field not found in %s. Was this VCF annotated with VEP?" % vcf_file_path)

        expected_csq_fields = set("Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|SYMBOL|SYMBOL_SOURCE|HGNC_ID|BIOTYPE|CANONICAL|TSL|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|SIFT|PolyPhen|EXON|INTRON|DOMAINS|HGVSc|HGVSp|GMAF|AFR_MAF|AMR_MAF|ASN_MAF|EUR_MAF|AA_MAF|EA_MAF|CLIN_SIG|SOMATIC|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF_info|LoF_flags|LoF_filter|LoF|Polyphen2_HVAR_pred|CADD_phred|MutationTaster_pred|MetaSVM_pred|SIFT_pred|FATHMM_pred".split("|"))
        # The CSQ header description lists its sub-fields after "Format:".
        actual_csq_fields_string = str(r.infos["CSQ"].desc).split("Format:")[1].strip()
        actual_csq_fields = set(actual_csq_fields_string.split("|"))
        missing_csq_fields = expected_csq_fields - actual_csq_fields
        if missing_csq_fields:
            raise ValueError("ERROR: VEP did not add all expected CSQ fields to the VCF. The VCF's CSQ = %s and is missing these fields: %s" % (actual_csq_fields_string, missing_csq_fields))

        print("Loading pre-annotated VCF file: %s into db.variants cache" % vcf_file_path)
        # Context manager guarantees the VCF handle is closed, even on error.
        with open(vcf_file_path) as vcf_file:
            for variant, vep_annotation in vep_annotations.parse_vep_annotations_from_vcf(vcf_file):
                variant_t = variant.unique_tuple()

                annotation = {
                    'vep_annotation': vep_annotation,
                    'freqs': self._population_frequency_store.get_frequencies(variant_t[0], variant_t[1], variant_t[2]),
                }

                add_convenience_annotations(annotation)

                if self._custom_annotator:
                    custom_annotations = self._custom_annotator.get_annotations_for_variants([variant_t])
                    annotation.update(custom_annotations[variant_t])

                self._db.variants.update(
                    {
                        'xpos': variant_t[0],
                        'ref': variant_t[1],
                        'alt': variant_t[2]
                    }, {
                        '$set': {'annotation': annotation}
                    }, upsert=True)

        self._db.vcf_files.update({'vcf_file_path': vcf_file_path},
            {'vcf_file_path': vcf_file_path, 'date_added': datetime.datetime.utcnow()}, upsert=True)

    def _get_missing_annotations(self, variant_t_list):
        """Return, in input order, the variant tuples with no document in ``db.variants``."""
        return [
            variant_t for variant_t in variant_t_list
            if not self._db.variants.find_one(
                {'xpos': variant_t[0],
                 'ref': variant_t[1],
                 'alt': variant_t[2]})
        ]

    def annotate_variant(self, variant, populations=None):
        """Attach the cached annotation to ``variant`` in place.

        Sets ``variant.annotation``, ``variant.gene_ids`` and
        ``variant.coding_gene_ids``. If no annotation is stored, a warning is
        written to stderr, ``variant.annotation`` is set to None and the gene
        id attributes are left untouched.
        """
        try:
            annotation = self.get_annotation(variant.xpos, variant.ref, variant.alt, populations)
        except ValueError as e:  # "as" form is valid on Python 2.6+ and 3.x
            sys.stderr.write("WARNING: " + str(e) + "\n")
            variant.annotation = None
            return

        variant.annotation = annotation

        # todo: gotta remove one
        # ...or actually maybe both
        variant.gene_ids = list(annotation['gene_ids'])
        variant.coding_gene_ids = list(annotation['coding_gene_ids'])