Пример #1
0
    def handle(self, *args, **options):
        """Spot-check that the variants in each project's VCF files can be annotated.

        Positional args are project ids; when none are given, every Project is
        checked, in reverse query order. The "number_of_variants_to_check"
        option (default 20000) sets how many variants are examined per project.

        A variant counts as "not found" when the annotator's get_annotation()
        raises ValueError for it. Once the limit is reached, an error report
        with up to 30 example variants is printed if more than 10 variants
        were not found. A project with fewer variants than the limit produces
        no report.
        """
        number_of_variants_to_check = int(options.get("number_of_variants_to_check") or 20000)

        if not args:
            args = [p.project_id for p in Project.objects.all()]
            args.reverse()

        for project_id in args:
            try:
                project = Project.objects.get(project_id=project_id)
            except Exception:
                # best-effort: any lookup failure skips the project, but
                # KeyboardInterrupt / SystemExit are no longer swallowed
                print("ERROR: Project not found. Skipping..")
                continue

            all_counter = 0
            not_found_counter = 0
            not_found_variants = []
            limit_reached = False
            for vcf_file in project.get_all_vcf_files():
                path = vcf_file.file_path
                # fall back to the gzipped copy when the plain .vcf is missing
                if not os.path.isfile(path) and path.endswith(".vcf"):
                    path = path + ".gz"
                open_vcf = gzip.open if path.endswith(".gz") else open
                with open_vcf(path) as f:
                    for variant in vcf_stuff.iterate_vcf(f):
                        all_counter += 1
                        try:
                            get_mall(project_id).annotator.get_annotation(variant.xpos, variant.ref, variant.alt)
                        except ValueError:
                            not_found_counter += 1
                            if len(not_found_variants) < 30:
                                chrom, pos = genomeloc.get_chr_pos(variant.xpos)
                                not_found_variants.append(
                                    "%s-%s-%s-%s" % (chrom.replace("chr", ""), pos, variant.ref, variant.alt))
                        if all_counter >= number_of_variants_to_check:
                            limit_reached = True
                            break
                if limit_reached:
                    # stop reading further VCF files: the check is done for
                    # this project and must be reported exactly once
                    break

            if limit_reached and not_found_counter > 10:
                percent_missing = 100.0 * not_found_counter / all_counter
                print("---- ERROR: (%0.2f%%)  %s / %s variants not found. Project %s should be reloaded. Examples: " % (
                    percent_missing, not_found_counter, all_counter, project_id))

                for v in not_found_variants:
                    print("http://exac.broadinstitute.org/variant/" + v)
Пример #2
0
def get_vep_annotations_from_vcf(vcf_file_obj):
    """
    Yield (variant, vep_annotation) pairs for every variant of a VEP annotated VCF.

    The annotation is parsed from the variant's CSQ value, using the CSQ field
    names extracted from the description of the CSQ header entry.
    """
    header_info = {}
    csq_field_names = None
    variants = vcf_stuff.iterate_vcf(vcf_file_obj, meta_fields=['CSQ'], header_info=header_info)
    for variant in variants:
        if csq_field_names is None:
            # header_info is only read once a variant has been produced;
            # presumably iterate_vcf fills it in while reading the header
            csq_description = header_info['CSQ'].desc
            csq_field_names = get_csq_fields_from_vcf_desc(csq_description)
        csq_value = variant.extras['CSQ']
        yield variant, get_vep_annotation_from_csq_info(csq_value, csq_field_names)
Пример #3
0
    def add_variants_to_project_from_vcf(self, vcf_file, project_id, indiv_id_list=None, start_from_chrom=None, end_with_chrom=None):
        """
        Load the variants of a VCF into the project's variant collection.

        vcf_file: passed to vcf_stuff.iterate_vcf() (genotypes are read).
        project_id: selects the project collection and its custom populations.
        indiv_id_list: passed through to vcf_stuff.iterate_vcf().
        start_from_chrom / end_with_chrom: optional chromosome names (with or
            without a "chr" prefix) restricting which of 1-22, X, Y are loaded.

        NOTE(review): end_with_chrom itself is NOT loaded, because its index is
        used as an exclusive slice bound - confirm this is what callers expect.

        Variants whose alt allele is "*" are skipped, as are new variants the
        annotator rejects with ValueError (a warning goes to stderr). For a
        variant already stored, only its non-reference genotypes are updated.
        """

        chrom_list = list(map(str, range(1,23))) + ['X','Y']
        chrom_list_start_index = 0
        if start_from_chrom:
            chrom_list_start_index = chrom_list.index(start_from_chrom.replace("chr", "").upper())

        chrom_list_end_index = len(chrom_list)
        if end_with_chrom:
            chrom_list_end_index = chrom_list.index(end_with_chrom.replace("chr", "").upper())
        chromosomes_to_include = set(chrom_list[chrom_list_start_index : chrom_list_end_index])

        project_collection = self._get_project_collection(project_id)
        # default to no custom populations so a project missing from the map
        # does not fail with "list + None"
        reference_populations = self._annotator.reference_population_slugs + self._custom_populations_map.get(project_id, [])
        for counter, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list)):
            if (start_from_chrom or end_with_chrom) and variant.chr.replace("chr", "") not in chromosomes_to_include:
                continue

            if variant.alt == "*":
                # GATK 3.4 "*" alt allele
                continue

            if counter % 2000 == 0:
                # 100.0 forces float division so the percentage is not truncated
                percent_done = 100.0 * variant.pos / CHROMOSOME_SIZES[variant.chr.replace("chr", "")]
                print(datetime.now().strftime("%m/%d/%Y %H:%M:%S") + "-- inserting variant %d  %s:%s-%s-%s (%0.1f%% done with %s) " % (counter, variant.chr, variant.pos, variant.ref, variant.alt, percent_done, variant.chr))

            variant_dict = project_collection.find_one({'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt})
            if not variant_dict:
                variant_dict = variant.toJSON()
                try:
                    annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
                except ValueError as e:
                    sys.stderr.write("WARNING: " + str(e) + "\n")
                    continue
                _add_index_fields_to_variant(variant_dict, annotation)
            else:
                # variant already stored: merge in the non-reference genotypes
                for indiv_id, genotype in variant.get_genotypes():
                    if genotype.num_alt != 0:
                        variant_dict['genotypes'][indiv_id] = genotype._asdict()
            project_collection.save(variant_dict)
Пример #4
0
    def add_variants_to_project_from_vcf(self, vcf_file, project_id, indiv_id_list=None, start_from_chrom=None, end_with_chrom=None):
        """
        Load the variants of a VCF into the project's variant collection.

        vcf_file: passed to vcf_stuff.iterate_vcf() (genotypes are read).
        project_id: selects the project collection and its custom populations.
        indiv_id_list: passed through to vcf_stuff.iterate_vcf().
        start_from_chrom / end_with_chrom: optional chromosome names (with or
            without a "chr" prefix) restricting which of 1-22, X, Y are loaded.

        NOTE(review): end_with_chrom itself is NOT loaded, because its index is
        used as an exclusive slice bound - confirm this is what callers expect.

        Variants whose alt allele is "*" are skipped, as are new variants the
        annotator rejects with ValueError (a warning goes to stderr). For a
        variant already stored, only its non-reference genotypes are updated.
        """

        chrom_list = list(map(str, range(1,23))) + ['X','Y']
        chrom_list_start_index = 0
        if start_from_chrom:
            chrom_list_start_index = chrom_list.index(start_from_chrom.replace("chr", "").upper())

        chrom_list_end_index = len(chrom_list)
        if end_with_chrom:
            chrom_list_end_index = chrom_list.index(end_with_chrom.replace("chr", "").upper())
        chromosomes_to_include = set(chrom_list[chrom_list_start_index : chrom_list_end_index])

        project_collection = self._get_project_collection(project_id)
        # default to no custom populations so a project missing from the map
        # does not fail with "list + None"
        reference_populations = self._annotator.reference_population_slugs + self._custom_populations_map.get(project_id, [])
        for counter, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list)):
            if (start_from_chrom or end_with_chrom) and variant.chr.replace("chr", "") not in chromosomes_to_include:
                continue

            if variant.alt == "*":
                # GATK 3.4 "*" alt allele
                continue

            if counter % 2000 == 0:
                # 100.0 forces float division so the percentage is not truncated
                percent_done = 100.0 * variant.pos / CHROMOSOME_SIZES[variant.chr.replace("chr", "")]
                print(datetime.now().strftime("%m/%d/%Y %H:%M:%S") + "-- inserting variant %d  %s:%s-%s-%s (%0.1f%% done with %s) " % (counter, variant.chr, variant.pos, variant.ref, variant.alt, percent_done, variant.chr))

            variant_dict = project_collection.find_one({'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt})
            if not variant_dict:
                variant_dict = variant.toJSON()
                try:
                    annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
                except ValueError as e:
                    sys.stderr.write("WARNING: " + str(e) + "\n")
                    continue
                _add_index_fields_to_variant(variant_dict, annotation)
            else:
                # variant already stored: merge in the non-reference genotypes
                for indiv_id, genotype in variant.get_genotypes():
                    if genotype.num_alt != 0:
                        variant_dict['genotypes'][indiv_id] = genotype._asdict()
            project_collection.save(variant_dict)
Пример #5
0
def get_variants_from_esp_file(file_like_object):
    """
    file_like_object is an ESP-style VCF file
    This could be called 23 times, one for each file, or just once after running vcf-merge
    Return a stream of variants with the following structure:
    {
        'xpos': long int
        'ref': str,
        'alt': str,
        'esp_ea': float - aaf in european americans
        'esp_aa': float - aaf in african americans
    }
    A frequency is reported as 0.0 when its population has no allele counts
    at all for the site (instead of failing with ZeroDivisionError).
    Note that ref and alt are *not* currently reduced by xbrowse
    """

    # (key in the yielded dict, INFO field holding that population's allele counts)
    populations = (('esp_ea', 'EA_AC'), ('esp_aa', 'AA_AC'))
    ac_meta_fields = [info_key for _, info_key in populations]
    for variant in vcf_stuff.iterate_vcf(file_like_object,
                                         meta_fields=ac_meta_fields,
                                         vcf_row_info=True):

        v = {
            'xpos': variant.xpos,
            'ref': variant.ref,
            'alt': variant.alt,
        }

        # note that allele counts in the VCF are ordered alt1,alt2,ref, so the
        # position of this alt allele indexes straight into the counts list
        alt_allele_pos = variant.extras['vcf_row_info']['alt_allele_pos']

        for freq_key, info_key in populations:
            counts = [int(c) for c in variant.extras[info_key].split(',')]
            total = sum(counts)
            v[freq_key] = float(counts[alt_allele_pos]) / total if total else 0.0

        yield v
Пример #6
0
 def add_variants_to_project_from_vcf(self, vcf_file, project_id, indiv_id_list=None):
     """
     Load the variants of a VCF into the project's variant collection.

     A variant not yet stored (matched on xpos / ref / alt) is annotated and
     saved together with its 'vep_consequence' and 'freqs' annotation values.
     For a variant already stored, only its non-reference genotypes are
     merged into the stored document.
     The running row index is printed every 1000 variants.
     """
     project_collection = self._get_project_collection(project_id)
     reference_populations = self._get_project_reference_populations(project_id)
     for i, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list)):
         if i % 1000 == 0:
             # function-call form prints the same text on Python 2 and 3
             print(i)
         variant_dict = project_collection.find_one({'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt})
         if not variant_dict:
             # annotate_variant is called before variant.annotation is read below
             self._annotator.annotate_variant(variant, reference_populations)
             variant_dict = variant.toJSON()
             variant_dict['vep_consequence'] = variant.annotation['vep_consequence']
             variant_dict['freqs'] = variant.annotation['freqs']
         else:
             for indiv_id, genotype in variant.get_genotypes():
                 if genotype.num_alt != 0:
                     variant_dict['genotypes'][indiv_id] = genotype._asdict()
         project_collection.save(variant_dict)
Пример #7
0
    def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
        """
        Load one VCF file into the per-family variant collections.

        family_info_list: dicts read for 'family_id', 'coll_name' and 'individuals'.
        vcf_file_path: path of the VCF, opened through compressed_file().
        reference_populations: passed through to the annotator's get_annotation().
        vcf_id_map: passed through to vcf_stuff.iterate_vcf().

        Indexes are dropped on every family collection before loading. Each
        variant is annotated once, then inserted into the collection of every
        family for which is_variant_relevant_for_individuals() is true.
        """
        family_collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
        for collection in family_collections.values():
            collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]

        vcf_file = compressed_file(vcf_file_path)
        try:
            size = os.path.getsize(vcf_file_path)
            progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
                progress.update(vcf_file.tell_progress())
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
                for family in family_info_list:
                    # TODO: can we move this inside the if relevant clause below?
                    family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                    family_variant_dict = family_variant.toJSON()
                    _add_index_fields_to_variant(family_variant_dict, annotation)
                    if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                        family_collections[family['family_id']].insert(family_variant_dict)
        finally:
            # release the file handle even when annotation or insertion fails
            vcf_file.close()
Пример #8
0
 def add_variants_to_project_from_vcf(self, vcf_file, project_id, indiv_id_list=None):
     """
     Load the variants of a VCF into the project's variant collection.

     A variant not yet stored (matched on xpos / ref / alt) is annotated and
     saved; if the annotator raises ValueError for it, a warning is written
     to stderr and the variant is skipped. For a variant already stored, only
     its non-reference genotypes are merged into the stored document.
     """
     project_collection = self._get_project_collection(project_id)
     # default to no custom populations so a project missing from the map
     # does not fail with "list + None"
     reference_populations = self._annotator.reference_population_slugs + self._custom_populations_map.get(project_id, [])
     for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list):
         variant_dict = project_collection.find_one({'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt})
         if not variant_dict:
             variant_dict = variant.toJSON()
             try:
                 annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
             except ValueError as e:
                 sys.stderr.write("WARNING: " + str(e) + "\n")
                 continue
             _add_index_fields_to_variant(variant_dict, annotation)
         else:
             for indiv_id, genotype in variant.get_genotypes():
                 if genotype.num_alt != 0:
                     variant_dict['genotypes'][indiv_id] = genotype._asdict()
         project_collection.save(variant_dict)
Пример #9
0
def get_variants_from_esp_file(file_like_object):
    """
    file_like_object is an ESP-style VCF file
    This could be called 23 times, one for each file, or just once after running vcf-merge
    Return a stream of variants with the following structure:
    {
        'xpos': long int
        'ref': str,
        'alt': str,
        'esp_ea': float - aaf in european americans
        'esp_aa': float - aaf in african americans
    }
    A frequency is reported as 0.0 when its population has no allele counts
    at all for the site (instead of failing with ZeroDivisionError).
    Note that ref and alt are *not* currently reduced by xbrowse
    """

    # (key in the yielded dict, INFO field holding that population's allele counts)
    populations = (('esp_ea', 'EA_AC'), ('esp_aa', 'AA_AC'))
    ac_meta_fields = [info_key for _, info_key in populations]
    for variant in vcf_stuff.iterate_vcf(file_like_object, meta_fields=ac_meta_fields, vcf_row_info=True):

        v = {
            'xpos': variant.xpos,
            'ref': variant.ref,
            'alt': variant.alt,
        }

        # note that allele counts in the VCF are ordered alt1,alt2,ref, so the
        # position of this alt allele indexes straight into the counts list
        alt_allele_pos = variant.extras['vcf_row_info']['alt_allele_pos']

        for freq_key, info_key in populations:
            counts = [int(c) for c in variant.extras[info_key].split(',')]
            total = sum(counts)
            v[freq_key] = float(counts[alt_allele_pos]) / total if total else 0.0

        yield v
Пример #10
0
    def handle(self, *args, **options):
        """Spot-check that the variants in each project's VCF files can be annotated.

        Positional args are project ids; when none are given, every Project is
        checked, in reverse query order. The "number_of_variants_to_check"
        option (default 20000) sets how many variants are examined per project.

        A variant counts as "not found" when the annotator's get_annotation()
        raises ValueError for it. Once the limit is reached, an error report
        with up to 30 example variants is printed if more than 10 variants
        were not found. A project with fewer variants than the limit produces
        no report.
        """
        number_of_variants_to_check = int(
            options.get("number_of_variants_to_check") or 20000)

        if not args:
            args = [p.project_id for p in Project.objects.all()]
            args.reverse()

        for project_id in args:
            try:
                project = Project.objects.get(project_id=project_id)
            except Exception:
                # best-effort: any lookup failure skips the project, but
                # KeyboardInterrupt / SystemExit are no longer swallowed
                print("ERROR: Project not found. Skipping..")
                continue

            all_counter = 0
            not_found_counter = 0
            not_found_variants = []
            limit_reached = False
            for vcf_file in project.get_all_vcf_files():
                path = vcf_file.file_path
                # fall back to the gzipped copy when the plain .vcf is missing
                if not os.path.isfile(path) and path.endswith(".vcf"):
                    path = path + ".gz"
                open_vcf = gzip.open if path.endswith(".gz") else open
                with open_vcf(path) as f:
                    for variant in vcf_stuff.iterate_vcf(f):
                        all_counter += 1
                        try:
                            get_mall(project).annotator.get_annotation(
                                variant.xpos, variant.ref, variant.alt)
                        except ValueError:
                            not_found_counter += 1
                            if len(not_found_variants) < 30:
                                chrom, pos = genomeloc.get_chr_pos(
                                    variant.xpos)
                                not_found_variants.append(
                                    "%s-%s-%s-%s" % (
                                        chrom.replace("chr", ""), pos,
                                        variant.ref, variant.alt))
                        if all_counter >= number_of_variants_to_check:
                            limit_reached = True
                            break
                if limit_reached:
                    # stop reading further VCF files: the check is done for
                    # this project and must be reported exactly once
                    break

            if limit_reached and not_found_counter > 10:
                percent_missing = 100.0 * not_found_counter / all_counter
                print(
                    "---- ERROR: (%0.2f%%)  %s / %s variants not found. Project %s should be reloaded. Examples: "
                    % (percent_missing, not_found_counter, all_counter,
                       project_id))

                for v in not_found_variants:
                    print(
                        "http://exac.broadinstitute.org/variant/"
                        + v)
Пример #11
0
    def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
        """
        Buffer the variants of one VCF file for loading into per-family collections.

        family_info_list: dicts read for 'family_id', 'coll_name' and 'individuals'.
        vcf_file_path: path of the VCF. When it contains "_chr" or ".chr" the file
            is treated as a single-chromosome VCF and read through tabix, starting
            from the smallest last-loaded position across all families.
        reference_populations: passed through to the annotator's get_annotation().
        vcf_id_map: passed through to vcf_stuff.iterate_vcf().

        Variants with a "*" alt allele, and variants the annotator rejects with
        ValueError, are skipped. Any error while preparing a family's copy of a
        variant is written to stderr and that family/variant pair is skipped.

        NOTE(review): in the code visible here the buffered variants are never
        passed to insert_all_variants_in_buffer() and the progress bar is never
        updated - the tail of this method may be missing; confirm against the
        full source.
        """
        collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]
        number_of_families = len(family_info_list)
        sys.stderr.write("Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n" % locals())

        for family in family_info_list:
            print("Indexing family: " + str(family))
            collection = collections[family['family_id']]
            collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # check whether some of the variants for this chromosome has been loaded already
        # if yes, start from the last loaded variant, and not from the beginning
        if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
            # if the VCF files are split by chromosome (eg. for WGS projects), check within the chromosome
            # peek at the first variant only to learn which chromosome this file holds
            vcf_file = compressed_file(vcf_file_path)
            variant = next(vcf_stuff.iterate_vcf(vcf_file, genotypes=False, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map))
            print(vcf_file_path + "  - chromsome: " + str(variant.chr))
            vcf_file.close()

            # NOTE(review): only chromosome indexes 1-23 are tabulated; a VCF whose
            # xpos maps to a higher index would raise KeyError below - confirm.
            position_per_chrom = {}
            for chrom in range(1,24):
                position_per_chrom[chrom] = defaultdict(int)
                for family in family_info_list:
                    # highest xpos already stored for this family within this chromosome's xpos range
                    variants = list(collections[family['family_id']].find({'$and': [{'xpos': { '$gte': chrom*1e9 }}, {'xpos': { '$lt': (chrom+1)*1e9}}] }).sort([('xpos',-1)]).limit(1))
                    if len(variants) > 0:
                        position_per_chrom[chrom][family['family_id']] = variants[0]['xpos'] - chrom*1e9
                    else:
                        position_per_chrom[chrom][family['family_id']] = 0

            for chrom in range(1,24):
                position_per_chrom[chrom] = min(position_per_chrom[chrom].values()) # get the smallest last-loaded variant position for this chromosome across all families

            chr_idx = int(variant.xpos/1e9)
            start_from_pos = int(position_per_chrom[chr_idx])

            print("Start from: %s - %s (%0.1f%% done)" % (chr_idx, start_from_pos, 100.*start_from_pos/CHROMOSOME_SIZES[variant.chr.replace("chr", "")]))
            tabix_file = pysam.TabixFile(vcf_file_path)
            # header lines first, then only the rows from start_from_pos onwards
            vcf_iter = itertools.chain(tabix_file.header, tabix_file.fetch(variant.chr.replace("chr", ""), start_from_pos, int(2.5e8)))
        else:
            vcf_iter = vcf_file = compressed_file(vcf_file_path)
            # TODO handle case where it's one vcf file, not split by chromosome

        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        def insert_all_variants_in_buffer(buff, collections_dict):
            """Insert every buffered variant, picking the next family at random, and empty buff."""
            for family_id in buff:
                if len(buff[family_id]) == 0:  # defensive programming
                    raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

            while len(buff) > 0:
                # choose a random family for which to insert a variant from among families that still have variants to insert
                # (list() because random.choice needs an indexable sequence, not a dict view)
                family_id = random.choice(list(buff.keys()))

                # pop a variant off the list for this family, and insert it
                family_variant_dict_to_insert = buff[family_id].pop()
                c = collections_dict[family_id]
                c.insert(family_variant_dict_to_insert)

                if len(buff[family_id]) == 0:
                    del buff[family_id]  # if no more variants for this family, delete it

        vcf_rows_counter = 0
        variants_buffered_counter = 0
        family_id_to_variant_list = defaultdict(list)  # will accumulate variants to be inserted all at once
        for variant in vcf_stuff.iterate_vcf(vcf_iter, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
            if variant.alt == "*":
                # GATK 3.4 "*" alt allele
                continue

            try:
                annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
            except ValueError as e:
                sys.stderr.write("WARNING: " + str(e) + "\n")
                continue

            vcf_rows_counter += 1
            for family in family_info_list:
                # TODO: can we move this inside the if relevant clause below?
                try:
                    family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                    family_variant_dict = family_variant.toJSON()
                    _add_index_fields_to_variant(family_variant_dict, annotation)
                    if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                        collection = collections[family['family_id']]
                        # buffer only variants not already stored for this family
                        if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                            family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                            variants_buffered_counter += 1
                except Exception as e:
                    sys.stderr.write("ERROR: on variant %s, family: %s - %s\n" % (variant.toJSON(), family, e))
    def load(self):
        """
        Rebuild the 'variants' collection from the dbSNP VCF and the dbNSFP files.

        The collection is dropped and re-indexed on (xpos, ref, alt). Each dbSNP
        variant is upserted with its 'rsid'. Each dbNSFP row is then upserted
        with 'polyphen', 'sift', 'fathmm' and 'muttaster' predictions; a
        prediction code that is not in the corresponding map is stored as None.
        A progress counter is printed every 100000 records.
        """
        self._db.drop_collection('variants')
        self._db.variants.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        # load dbsnp info
        with open(self._settings.dbsnp_vcf_file) as dbsnp_file:
            for i, variant in enumerate(vcf_stuff.iterate_vcf(dbsnp_file)):
                if not i % 100000:
                    print(i)
                self._db.variants.update(
                    {'xpos': variant.xpos, 'ref': variant.ref, 'alt': variant.alt},
                    {'$set': {'rsid': variant.vcf_id}},
                    upsert=True
                )

        # load dbnsfp info
        polyphen_map = {
            'D': 'probably_damaging',
            'P': 'possibly_damaging',
            'B': 'benign',
        }

        sift_map = {
            'D': 'damaging',
            'T': 'tolerated',
        }

        fathmm_map = {
            'D': 'damaging',
            'T': 'tolerated',
        }

        muttaster_map = {
            'A': 'disease_causing',
            'D': 'disease_causing',
            'N': 'polymorphism',
            'P': 'polymorphism',
        }

        for chrom in CHROMOSOMES:
            print("Reading dbNSFP data for {}".format(chrom))
            with open(self._settings.dbnsfp_dir + 'dbNSFP2.1_variant.' + chrom) as single_chrom_file:
                for i, line in enumerate(single_chrom_file):
                    if i == 0:
                        # skip the first line (presumably the column header)
                        continue
                    if not i % 100000:
                        print(i)
                    fields = line.strip('\n').split('\t')
                    # named row_chrom so the outer loop variable is not rebound
                    row_chrom, pos, ref, alt = fields[:4]
                    xpos = genomeloc.get_single_location('chr' + row_chrom, int(pos))
                    if not xpos:
                        continue

                    self._db.variants.update(
                        {'xpos': xpos, 'ref': ref, 'alt': alt},
                        {'$set': {
                            'polyphen': polyphen_map.get(fields[25]),
                            'sift': sift_map.get(fields[23]),
                            'fathmm': fathmm_map.get(fields[39]),
                            'muttaster': muttaster_map.get(fields[33]),
                        }},
                        upsert=True
                    )
Пример #13
0
    def load_population_to_annotator(self, population):
        """
        Take a population and a data source; extract and load it into annotator

        population is a dict; population['file_type'] selects the data source:
          'vcf'               - VCF with genotypes at population['file_path'];
                                the frequency is get_aaf(variant)
          'sites_vcf'         - VCF at population['file_path']; the frequency is
                                read from the INFO key population['vcf_info_key']
                                (0 when the key is absent)
          'esp_vcf_dir'       - directory population['dir_path'] of ESP VCFs; the
                                frequency is the population['counts_key'] value
          'counts_file'       - tab separated chrom / pos / ref / alt / count / total;
                                rows with a zero total are skipped
          'xbrowse_freq_file' - tab separated xpos / ref / alt / freq
        Paths ending in '.gz' are read through gzip (except in 'esp_vcf_dir').
        Any other file_type loads nothing.

        Every frequency is stored under population['slug']. A progress counter
        is printed every 10000 records.
        """

        def open_maybe_gzipped(path):
            # '.gz' paths are read through gzip, everything else as a plain file
            if path.endswith('.gz'):
                return gzip.open(path)
            return open(path)

        file_type = population['file_type']

        if file_type == 'vcf':
            with open_maybe_gzipped(population['file_path']) as vcf_file:
                for i, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False)):
                    if i % 10000 == 0:
                        print(i)
                    freq = get_aaf(variant)
                    self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)

        elif file_type == 'sites_vcf':
            with open_maybe_gzipped(population['file_path']) as vcf_file:
                meta_key = population['vcf_info_key']
                for i, variant in enumerate(vcf_stuff.iterate_vcf(vcf_file, meta_fields=[meta_key,])):
                    if i % 10000 == 0:
                        print(i)
                    freq = float(variant.extras.get(meta_key, 0))
                    self._add_population_frequency(
                        variant.xpos,
                        variant.ref,
                        variant.alt,
                        population['slug'],
                        freq
                    )

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif file_type == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                print("Adding %s" % filename)
                file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
                with open(file_path) as f:
                    for i, variant in enumerate(get_variants_from_esp_file(f)):
                        if i % 10000 == 0:
                            print(i)
                        self._add_population_frequency(
                            variant['xpos'],
                            variant['ref'],
                            variant['alt'],
                            population['slug'],
                            variant[population['counts_key']]
                        )

        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif file_type == 'counts_file':
            with open_maybe_gzipped(population['file_path']) as counts_file:
                for i, line in enumerate(counts_file):
                    if i % 10000 == 0:
                        print(i)
                    fields = line.strip('\n').split('\t')
                    chrom = 'chr' + fields[0]
                    pos = int(fields[1])
                    xpos = genomeloc.get_single_location(chrom, pos)
                    ref = fields[2]
                    alt = fields[3]
                    # a zero total would make the frequency undefined
                    if int(fields[5]) == 0:
                        continue
                    freq = float(fields[4]) / float(fields[5])
                    self._add_population_frequency(
                        xpos,
                        ref,
                        alt,
                        population['slug'],
                        freq
                    )

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif file_type == 'xbrowse_freq_file':
            with open_maybe_gzipped(population['file_path']) as counts_file:
                for i, line in enumerate(counts_file):
                    if i % 10000 == 0:
                        print(i)
                    fields = line.strip('\n').split('\t')
                    xpos = int(fields[0])
                    ref = fields[1]
                    alt = fields[2]
                    freq = float(fields[3])
                    self._add_population_frequency(
                        xpos,
                        ref,
                        alt,
                        population['slug'],
                        freq
                    )
Пример #14
0
    def _add_vcf_file_for_family_set(self, family_info_list, vcf_file_path, reference_populations=None, vcf_id_map=None):
        """
        Load the variants of one VCF file into the per-family variant collections.

        Each VCF row that the annotator can annotate is copied once per family
        (restricted to that family's individuals).  A copy is buffered for
        insertion when it is relevant for the family and the family's
        collection does not already hold a variant with the same
        xpos / ref / alt.  The buffer is written out whenever it holds more
        than 10000 family-variants and once more after the last VCF row, so
        the final partial batch is not lost.

        family_info_list: list of dicts with the keys 'family_id', 'coll_name'
            (name of the collection in self._db) and 'individuals'
        vcf_file_path: path of the VCF file; opened with compressed_file()
        reference_populations: forwarded to the annotator as `populations`
        vcf_id_map: forwarded to vcf_stuff.iterate_vcf
        """
        max_buffered_variants = 10000

        collections = {f['family_id']: self._db[f['coll_name']] for f in family_info_list}
        #for collection in collections.values():
        #    collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]

        number_of_families = len(family_info_list)
        sys.stderr.write("Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n" % locals())

        #for family in family_info_list:
        #    print("Indexing family: " + str(family))
        #    collection = collections[family['family_id']]
        #    collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        vcf_file = compressed_file(vcf_file_path)
        size = os.path.getsize(vcf_file_path)
        progress = get_progressbar(size, 'Loading VCF: {}'.format(vcf_file_path))

        def insert_all_variants_in_buffer(buff, collections_dict):
            # Drains `buff` (family_id -> list of variant dicts) into the
            # matching collections, leaving it empty.
            for family_id in buff:
                if len(buff[family_id]) == 0:  # defensive programming
                    raise ValueError("%s has zero variants to insert. Should not be in buff." % family_id)

            while len(buff) > 0:
                # choose a random family for which to insert a variant from among families that still have variants to insert
                # (list(buff) instead of buff.keys() so this also works where keys() is a view)
                family_id = random.choice(list(buff))

                # pop a variant off the list for this family, and insert it
                family_variant_dict_to_insert = buff[family_id].pop()
                c = collections_dict[family_id]
                c.insert(family_variant_dict_to_insert)

                if len(buff[family_id]) == 0:
                    del buff[family_id]  # if no more variants for this family, delete it

        def flush_buffer(buff, buffered_count, rows_count):
            # Logs the batch size and writes every buffered family-variant.
            print(date.strftime(datetime.now(), "%m/%d/%Y %H:%M:%S") + "-- inserting %d family-variants from %d vcf rows into %s families" % (buffered_count, rows_count, len(buff)))
            insert_all_variants_in_buffer(buff, collections)
            assert len(buff) == 0

        vcf_rows_counter = 0
        variants_buffered_counter = 0
        family_id_to_variant_list = defaultdict(list)  # will accumulate variants to be inserted all at once
        try:
            for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, indiv_id_list=indiv_id_list, vcf_id_map=vcf_id_map):
                progress.update(vcf_file.tell_progress())
                try:
                    annotation = self._annotator.get_annotation(variant.xpos, variant.ref, variant.alt, populations=reference_populations)
                except ValueError as e:
                    # variant unknown to the annotator: warn and skip the row
                    sys.stderr.write("WARNING: " + str(e) + "\n")
                    continue

                vcf_rows_counter += 1
                for family in family_info_list:
                    # TODO: can we move this inside the if relevant clause below?
                    family_variant = variant.make_copy(restrict_to_genotypes=family['individuals'])
                    family_variant_dict = family_variant.toJSON()
                    _add_index_fields_to_variant(family_variant_dict, annotation)
                    if xbrowse_utils.is_variant_relevant_for_individuals(family_variant, family['individuals']):
                        collection = collections[family['family_id']]
                        # skip variants that are already stored for this family
                        if not collection.find_one({'xpos': family_variant.xpos, 'ref': family_variant.ref, 'alt': family_variant.alt}):
                            family_id_to_variant_list[family['family_id']].append(family_variant_dict)
                            variants_buffered_counter += 1

                if variants_buffered_counter > max_buffered_variants:
                    flush_buffer(family_id_to_variant_list, variants_buffered_counter, vcf_rows_counter)
                    vcf_rows_counter = 0
                    variants_buffered_counter = 0

            # write whatever is still buffered after the last VCF row
            if variants_buffered_counter > 0:
                flush_buffer(family_id_to_variant_list, variants_buffered_counter, vcf_rows_counter)
        finally:
            vcf_file.close()
Пример #15
0
    def _add_vcf_file_for_family_set(self,
                                     family_info_list,
                                     vcf_file_path,
                                     reference_populations=None,
                                     vcf_id_map=None,
                                     start_from_chrom=None,
                                     end_with_chrom=None):
        """
        Load the variants of one VCF file into the per-family variant collections.

        The VCF is read in one of three ways:

          - path contains "_chr" or ".chr" (VCF split by chromosome): loading
            resumes from the smallest last-loaded position across all families
            for that chromosome, read through the tabix index
          - start_from_chrom and/or end_with_chrom given: only that inclusive
            range of chromosomes (order 1..22, X, Y) is read through tabix
          - otherwise the whole file is read sequentially via compressed_file()

        Rows whose alt allele is "*" and rows the annotator cannot annotate are
        skipped.  Each remaining row is copied once per family and buffered when
        it is relevant for the family and not already stored; the buffer is
        inserted whenever it holds more than 10000 family-variants and once
        more after the last row.

        family_info_list: list of dicts with the keys 'family_id', 'coll_name'
            (name of the collection in self._db) and 'individuals'
        vcf_file_path: path of the VCF file
        reference_populations: forwarded to the annotator as `populations`
        vcf_id_map: forwarded to vcf_stuff.iterate_vcf
        start_from_chrom / end_with_chrom: optional chromosome names, with or
            without a "chr" prefix
        """
        max_buffered_variants = 10000

        collections = {
            f['family_id']: self._db[f['coll_name']]
            for f in family_info_list
        }
        #for collection in collections.values():
        #    collection.drop_indexes()
        indiv_id_list = [i for f in family_info_list for i in f['individuals']]
        number_of_families = len(family_info_list)
        sys.stderr.write(
            "Loading variants for %(number_of_families)d families %(family_info_list)s from %(vcf_file_path)s\n"
            % locals())

        for family in family_info_list:
            print("Indexing family: " + str(family))
            collection = collections[family['family_id']]
            collection.ensure_index([('xpos', 1), ('ref', 1), ('alt', 1)])

        open_handles = []  # everything opened for the main loop; closed when loading ends

        # check whether some of the variants for this chromosome has been loaded already
        # if yes, start from the last loaded variant, and not from the beginning
        if "_chr" in vcf_file_path or ".chr" in vcf_file_path:
            # if the VCF files are split by chromosome (eg. for WGS projects), check within the chromosome
            vcf_file = compressed_file(vcf_file_path)
            try:
                # peek at the first variant only to learn which chromosome this file holds
                variant = next(
                    vcf_stuff.iterate_vcf(vcf_file,
                                          genotypes=False,
                                          indiv_id_list=indiv_id_list,
                                          vcf_id_map=vcf_id_map))
            finally:
                vcf_file.close()
            print(vcf_file_path + "  - chromsome: " + str(variant.chr))

            # Only this file's chromosome matters, so query just its xpos range
            # (xpos = chromosome index * 1e9 + position) rather than a fixed
            # list of chromosomes.
            chr_idx = int(variant.xpos / 1e9)
            chrom_first_xpos = chr_idx * 1e9
            chrom_end_xpos = (chr_idx + 1) * 1e9
            last_loaded_positions = []
            for family in family_info_list:
                variants = list(collections[family['family_id']].find({
                    '$and': [{
                        'xpos': {
                            '$gte': chrom_first_xpos
                        }
                    }, {
                        'xpos': {
                            '$lt': chrom_end_xpos
                        }
                    }]
                }).sort([('xpos', -1)]).limit(1))
                if len(variants) > 0:
                    last_loaded_positions.append(variants[0]['xpos'] - chrom_first_xpos)
                else:
                    last_loaded_positions.append(0)

            # get the smallest last-loaded variant position for this chromosome across all families
            if last_loaded_positions:
                start_from_pos = int(min(last_loaded_positions))
            else:
                start_from_pos = 0

            print("Start from: %s - %s (%0.1f%% done)" %
                  (chr_idx, start_from_pos, 100. * start_from_pos /
                   CHROMOSOME_SIZES[variant.chr.replace("chr", "")]))
            tabix_file = pysam.TabixFile(vcf_file_path)
            open_handles.append(tabix_file)
            vcf_iter = itertools.chain(
                tabix_file.header,
                tabix_file.fetch(variant.chr.replace("chr", ""),
                                 start_from_pos, int(2.5e8)))
        elif start_from_chrom or end_with_chrom:
            if start_from_chrom:
                print("Start chrom: chr%s" % start_from_chrom)
            if end_with_chrom:
                print("End chrom: chr%s" % end_with_chrom)

            chrom_list = list(map(str, range(1, 23))) + ['X', 'Y']
            chrom_list_start_index = 0
            if start_from_chrom:
                chrom_list_start_index = chrom_list.index(
                    start_from_chrom.replace("chr", "").upper())

            chrom_list_end_index = len(chrom_list)
            if end_with_chrom:
                chrom_list_end_index = chrom_list.index(
                    end_with_chrom.replace("chr", "").upper())

            tabix_file = pysam.TabixFile(vcf_file_path)
            open_handles.append(tabix_file)
            vcf_iter = tabix_file.header
            # the end index is inclusive, hence the + 1
            for chrom in chrom_list[
                    chrom_list_start_index:chrom_list_end_index + 1]:
                print("Will load chrom: " + chrom)
                try:
                    vcf_iter = itertools.chain(vcf_iter,
                                               tabix_file.fetch(chrom))
                except ValueError as e:
                    # fetch() rejected this chromosome; warn and load the others
                    print("WARNING: " + str(e))

        else:
            vcf_iter = vcf_file = compressed_file(vcf_file_path)
            open_handles.append(vcf_file)
            # TODO handle case where it's one vcf file, not split by chromosome

        def insert_all_variants_in_buffer(buff, collections_dict):
            # Drains `buff` (family_id -> list of variant dicts) into the
            # matching collections, leaving it empty.
            for family_id in buff:
                if len(buff[family_id]) == 0:  # defensive programming
                    raise ValueError(
                        "%s has zero variants to insert. Should not be in buff."
                        % family_id)

            while len(buff) > 0:
                # choose a random family for which to insert a variant from among families that still have variants to insert
                # (list(buff) instead of buff.keys() so this also works where keys() is a view)
                family_id = random.choice(list(buff))

                # pop a variant off the list for this family, and insert it
                family_variant_dict_to_insert = buff[family_id].pop()
                c = collections_dict[family_id]
                c.insert(family_variant_dict_to_insert)

                if len(buff[family_id]) == 0:
                    del buff[
                        family_id]  # if no more variants for this family, delete it

        def flush_buffer(buff, buffered_count, rows_count):
            # Logs the batch size and writes every buffered family-variant.
            sys.stderr.write(
                "Inserting %d family-variants from %d vcf rows into %s families\n"
                % (buffered_count, rows_count, len(buff)))
            insert_all_variants_in_buffer(buff, collections)

        vcf_rows_counter = 0
        variants_buffered_counter = 0
        family_id_to_variant_list = defaultdict(
            list)  # will accumulate variants to be inserted all at once
        try:
            for variant in vcf_stuff.iterate_vcf(vcf_iter,
                                                 genotypes=True,
                                                 indiv_id_list=indiv_id_list,
                                                 vcf_id_map=vcf_id_map):
                if variant.alt == "*":
                    #print("Skipping GATK 3.4 * alt allele: " + str(variant.unique_tuple()))
                    continue

                try:
                    annotation = self._annotator.get_annotation(
                        variant.xpos,
                        variant.ref,
                        variant.alt,
                        populations=reference_populations)
                except ValueError as e:
                    # variant unknown to the annotator: warn and skip the row
                    sys.stderr.write("WARNING: " + str(e) + "\n")
                    continue

                vcf_rows_counter += 1
                for family in family_info_list:
                    # TODO: can we move this inside the if relevant clause below?
                    try:
                        family_variant = variant.make_copy(
                            restrict_to_genotypes=family['individuals'])
                        family_variant_dict = family_variant.toJSON()
                        _add_index_fields_to_variant(family_variant_dict,
                                                     annotation)
                        if xbrowse_utils.is_variant_relevant_for_individuals(
                                family_variant, family['individuals']):
                            collection = collections[family['family_id']]
                            # skip variants that are already stored for this family
                            if not collection.find_one({
                                    'xpos': family_variant.xpos,
                                    'ref': family_variant.ref,
                                    'alt': family_variant.alt
                            }):
                                family_id_to_variant_list[family[
                                    'family_id']].append(family_variant_dict)
                                variants_buffered_counter += 1
                    except Exception as e:
                        # best effort: one bad family/variant pair must not abort the whole load
                        sys.stderr.write(
                            "ERROR: on variant %s, family: %s - %s\n" %
                            (variant.toJSON(), family, e))

                if variants_buffered_counter > max_buffered_variants:
                    flush_buffer(family_id_to_variant_list,
                                 variants_buffered_counter, vcf_rows_counter)
                    vcf_rows_counter = 0
                    variants_buffered_counter = 0

            # write whatever is still buffered after the last VCF row
            if variants_buffered_counter > 0:
                flush_buffer(family_id_to_variant_list,
                             variants_buffered_counter, vcf_rows_counter)
        finally:
            for handle in open_handles:
                handle.close()
Пример #16
0
    def load_population(self, population):
        """
        Load the allele frequencies of one reference population into the annotator.

        `population` is a dict; population['file_type'] selects the parser:

          - 'vcf': VCF at 'file_path'; the frequency is get_aaf(variant)
          - 'sites_vcf': VCF at 'file_path'; the frequency is read from the INFO
            key named by 'vcf_info_key' (default 'AF').  When that key contains
            "popmax" and population['name'] contains "1000 Genomes", the
            frequency is the maximum over EAS_AF, EUR_AF, AFR_AF, AMR_AF and
            SAS_AF instead.
          - 'esp_vcf_dir': every file in 'dir_path', parsed by
            get_variants_from_esp_file; the stored value is
            variant[population['counts_key']]
          - 'counts_file': tab-separated chrom / pos / ref / alt / numerator /
            denominator; rows with a zero denominator are skipped
          - 'xbrowse_freq_file': tab-separated xpos / ref / alt / freq
          - 'tsv_file': tab-separated chrom / pos / ref / alt / freq, preceded
            by one header line
          - 'sites_vcf_with_counts': VCF at 'file_path'; the frequency is AC / AN
            taken from the INFO keys 'ac_info_key' and 'an_info_key'

        Every frequency is stored through self._add_population_frequency under
        population['slug'].  Paths ending in '.gz' ('.bgz' too for
        'sites_vcf_with_counts') are opened with gzip.  Any other file_type
        loads nothing.  Input files are closed even when loading fails.
        """

        def open_with_progress(path, label, gzip_suffixes=('.gz', )):
            # Returns (data_file, progress_file, progress): the file to read
            # from, the object whose tell() reports the position in the file
            # on disk (the underlying file of a gzip stream), and a progress
            # bar sized to the on-disk file.
            if path.endswith(gzip_suffixes):
                data_file = gzip.open(path)
                progress_file = data_file.fileobj
            else:
                data_file = open(path)
                progress_file = data_file
            try:
                progress = get_progressbar(os.path.getsize(path), label)
            except Exception:
                data_file.close()  # don't leak the handle if set-up fails
                raise
            return data_file, progress_file, progress

        def info_frequency(extras, info_key, allele_idx):
            # Frequency of the allele at allele_idx from a comma-separated
            # INFO value; 0.0 when the key is absent.
            raw_value = extras.get(info_key)
            if raw_value is None:
                return 0.0
            return float(raw_value.split(',')[allele_idx])

        if population['file_type'] == 'vcf':
            vcf_file, progress_file, progress = open_with_progress(
                population['file_path'],
                'Loading vcf: {}'.format(population['slug']))
            try:
                for variant in vcf_stuff.iterate_vcf(vcf_file,
                                                     genotypes=True,
                                                     genotype_meta=False):
                    progress.update(progress_file.tell())
                    freq = get_aaf(variant)
                    self._add_population_frequency(variant.xpos, variant.ref,
                                                   variant.alt,
                                                   population['slug'], freq)
            finally:
                vcf_file.close()

        elif population['file_type'] == 'sites_vcf':
            meta_key = population.get('vcf_info_key', 'AF')
            # Decided once, up front: the per-variant loop must not re-derive
            # this from a variable it also uses as a loop index.
            is_1kg_popmax = "popmax" in meta_key.lower() and (
                "1000 Genomes" in population["name"])
            if is_1kg_popmax:
                # the per-population allele frequency INFO keys
                # (EAS / EUR / AFR / AMR / SAS)
                meta_fields = [
                    "EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"
                ]
            else:
                meta_fields = [
                    meta_key,
                ]

            vcf_file, progress_file, progress = open_with_progress(
                population['file_path'],
                'Loading sites vcf: {}'.format(population['slug']))
            try:
                for variant in vcf_stuff.iterate_vcf(vcf_file,
                                                     meta_fields=meta_fields):
                    progress.update(progress_file.tell())
                    allele_idx = variant.extras['alt_allele_pos']
                    if is_1kg_popmax:
                        # popmax = highest frequency among the populations
                        freq = 0
                        for population_key in meta_fields:
                            freq = max(
                                freq,
                                info_frequency(variant.extras, population_key,
                                               allele_idx))
                    else:
                        freq = info_frequency(variant.extras, meta_key,
                                              allele_idx)

                    self._add_population_frequency(variant.xpos, variant.ref,
                                                   variant.alt,
                                                   population['slug'], freq)
            finally:
                vcf_file.close()

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif population['file_type'] == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                file_path = os.path.abspath(
                    os.path.join(population['dir_path'], filename))
                f = open(file_path)
                try:
                    file_size = os.path.getsize(file_path)
                    progress = get_progressbar(
                        file_size, 'Loading ESP file: {}'.format(filename))
                    for variant in get_variants_from_esp_file(f):
                        progress.update(f.tell())
                        self._add_population_frequency(
                            variant['xpos'], variant['ref'], variant['alt'],
                            population['slug'],
                            variant[population['counts_key']])
                finally:
                    f.close()
        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif population['file_type'] == 'counts_file':
            counts_file, progress_file, progress = open_with_progress(
                population['file_path'],
                'Loading population: {}'.format(population['slug']))
            try:
                for line in counts_file:
                    progress.update(progress_file.tell())
                    fields = line.strip('\n').split('\t')
                    chrom = 'chr' + fields[0]
                    pos = int(fields[1])
                    xpos = genomeloc.get_single_location(chrom, pos)
                    ref = fields[2]
                    alt = fields[3]
                    if int(fields[5]) == 0:
                        continue  # zero denominator: no frequency to store
                    freq = float(fields[4]) / float(fields[5])
                    self._add_population_frequency(xpos, ref, alt,
                                                   population['slug'], freq)
            finally:
                counts_file.close()

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif population['file_type'] == 'xbrowse_freq_file':
            counts_file, progress_file, progress = open_with_progress(
                population['file_path'],
                'Loading population: {}'.format(population['slug']))
            try:
                for line in counts_file:
                    progress.update(progress_file.tell())
                    fields = line.strip('\n').split('\t')
                    xpos = int(fields[0])
                    ref = fields[1]
                    alt = fields[2]
                    freq = float(fields[3])
                    self._add_population_frequency(xpos, ref, alt,
                                                   population['slug'], freq)
            finally:
                counts_file.close()

        elif population['file_type'] == 'tsv_file':
            freq_file, progress_file, progress = open_with_progress(
                population['file_path'],
                'Loading population: {}'.format(population['slug']))
            try:
                header = next(freq_file)
                print("Header: " + header)
                for line in freq_file:
                    progress.update(progress_file.tell())
                    fields = line.strip('\n').split('\t')
                    chrom = fields[0]
                    pos = int(fields[1])
                    ref = fields[2]
                    alt = fields[3]
                    freq = float(fields[4])

                    xpos = genomeloc.get_single_location(chrom, pos)
                    self._add_population_frequency(xpos, ref, alt,
                                                   population['slug'], freq)
            finally:
                freq_file.close()

        elif population['file_type'] == 'sites_vcf_with_counts':
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']
            vcf_file, progress_file, progress = open_with_progress(
                population['file_path'],
                'Loading sites vcf: {}'.format(population['slug']),
                gzip_suffixes=('.gz', '.bgz'))
            try:
                for variant in vcf_stuff.iterate_vcf(
                        vcf_file, meta_fields=[ac_info_key, an_info_key]):
                    progress.update(progress_file.tell())

                    alt_allele_pos = variant.extras['alt_allele_pos']
                    # Deliberately broad excepts below: a row whose counts
                    # cannot be parsed is reported and skipped, never fatal.
                    try:
                        ac = int(
                            variant.extras.get(ac_info_key).split(',')
                            [alt_allele_pos].replace("NA", "0"))
                    except Exception as e:
                        print("Couldn't parse AC value %s from %s: %s" %
                              (alt_allele_pos, ac_info_key, variant.extras) +
                              " " + str(e))
                        continue

                    try:
                        if "popmax" in ac_info_key.lower():
                            AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                        else:
                            AN_index = 0

                        an = int(
                            variant.extras.get(an_info_key).split(',')
                            [AN_index].replace("NA", "0"))
                    except Exception as e:
                        print("Couldn't parse AN value %s from %s: %s" %
                              (alt_allele_pos, an_info_key, variant.extras) +
                              " " + str(e))
                        continue

                    if an == 0:
                        freq = 0.0
                    else:
                        freq = float(ac) / an
                    self._add_population_frequency(variant.xpos, variant.ref,
                                                   variant.alt,
                                                   population['slug'], freq)
            finally:
                vcf_file.close()
    def load_population(self, population):
        """
        Load the allele frequencies of one reference population into the annotator.

        `population` is a dict; population['file_type'] selects the parser:

          - 'vcf': VCF at 'file_path'; the frequency is get_aaf(variant)
          - 'sites_vcf': VCF at 'file_path'; the frequency is read from the INFO
            key named by 'vcf_info_key' (default 'AF').  When that key contains
            "popmax" and population['name'] contains "1000 Genomes", the
            frequency is the maximum over EAS_AF, EUR_AF, AFR_AF, AMR_AF and
            SAS_AF instead.
          - 'esp_vcf_dir': every file in 'dir_path', parsed by
            get_variants_from_esp_file; the stored value is
            variant[population['counts_key']]
          - 'counts_file': tab-separated chrom / pos / ref / alt / numerator /
            denominator; rows with a zero denominator are skipped
          - 'xbrowse_freq_file': tab-separated xpos / ref / alt / freq
          - 'tsv_file': tab-separated chrom / pos / ref / alt / freq, preceded
            by one header line
          - 'sites_vcf_with_counts': VCF at 'file_path'; the frequency is AC / AN
            taken from the INFO keys 'ac_info_key' and 'an_info_key'

        Every frequency is stored through self._add_population_frequency under
        population['slug'].  Paths ending in '.gz' ('.bgz' too for
        'sites_vcf_with_counts') are opened with gzip.  Any other file_type
        loads nothing.  Input files are closed even when loading fails.
        """

        def open_with_progress(path, label, gzip_suffixes=('.gz',)):
            # Returns (data_file, progress_file, progress): the file to read
            # from, the object whose tell() reports the position in the file
            # on disk (the underlying file of a gzip stream), and a progress
            # bar sized to the on-disk file.
            if path.endswith(gzip_suffixes):
                data_file = gzip.open(path)
                progress_file = data_file.fileobj
            else:
                data_file = open(path)
                progress_file = data_file
            try:
                progress = get_progressbar(os.path.getsize(path), label)
            except Exception:
                data_file.close()  # don't leak the handle if set-up fails
                raise
            return data_file, progress_file, progress

        def info_frequency(extras, info_key, allele_idx):
            # Frequency of the allele at allele_idx from a comma-separated
            # INFO value; 0.0 when the key is absent.
            raw_value = extras.get(info_key)
            if raw_value is None:
                return 0.0
            return float(raw_value.split(',')[allele_idx])

        if population['file_type'] == 'vcf':
            vcf_file, progress_file, progress = open_with_progress(
                population['file_path'], 'Loading vcf: {}'.format(population['slug']))
            try:
                for variant in vcf_stuff.iterate_vcf(vcf_file, genotypes=True, genotype_meta=False):
                    progress.update(progress_file.tell())
                    freq = get_aaf(variant)
                    self._add_population_frequency(variant.xpos, variant.ref, variant.alt, population['slug'], freq)
            finally:
                vcf_file.close()

        elif population['file_type'] == 'sites_vcf':
            meta_key = population.get('vcf_info_key', 'AF')
            # Decided once, up front: the per-variant loop must not re-derive
            # this from a variable it also uses as a loop index.
            is_1kg_popmax = "popmax" in meta_key.lower() and ("1000 Genomes" in population["name"])
            if is_1kg_popmax:
                # the per-population allele frequency INFO keys (EAS / EUR / AFR / AMR / SAS)
                meta_fields = ["EAS_AF", "EUR_AF", "AFR_AF", "AMR_AF", "SAS_AF"]
            else:
                meta_fields = [meta_key,]

            vcf_file, progress_file, progress = open_with_progress(
                population['file_path'], 'Loading sites vcf: {}'.format(population['slug']))
            try:
                for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=meta_fields):
                    progress.update(progress_file.tell())
                    allele_idx = variant.extras['alt_allele_pos']
                    if is_1kg_popmax:
                        # popmax = highest frequency among the populations
                        freq = 0
                        for population_key in meta_fields:
                            freq = max(freq, info_frequency(variant.extras, population_key, allele_idx))
                    else:
                        freq = info_frequency(variant.extras, meta_key, allele_idx)

                    self._add_population_frequency(
                        variant.xpos,
                        variant.ref,
                        variant.alt,
                        population['slug'],
                        freq
                    )
            finally:
                vcf_file.close()

        #
        # Directory of per-chromosome VCFs that ESP publishes
        #
        elif population['file_type'] == 'esp_vcf_dir':
            for filename in os.listdir(population['dir_path']):
                file_path = os.path.abspath(os.path.join(population['dir_path'], filename))
                f = open(file_path)
                try:
                    file_size = os.path.getsize(file_path)
                    progress = get_progressbar(file_size, 'Loading ESP file: {}'.format(filename))
                    for variant in get_variants_from_esp_file(f):
                        progress.update(f.tell())
                        self._add_population_frequency(
                            variant['xpos'],
                            variant['ref'],
                            variant['alt'],
                            population['slug'],
                            variant[population['counts_key']]
                        )
                finally:
                    f.close()
        #
        # text file of allele counts, as Monkol has been using for the joint calling data
        #
        elif population['file_type'] == 'counts_file':
            counts_file, progress_file, progress = open_with_progress(
                population['file_path'], 'Loading population: {}'.format(population['slug']))
            try:
                for line in counts_file:
                    progress.update(progress_file.tell())
                    fields = line.strip('\n').split('\t')
                    chrom = 'chr' + fields[0]
                    pos = int(fields[1])
                    xpos = genomeloc.get_single_location(chrom, pos)
                    ref = fields[2]
                    alt = fields[3]
                    if int(fields[5]) == 0:
                        continue  # zero denominator: no frequency to store
                    freq = float(fields[4]) / float(fields[5])
                    self._add_population_frequency(
                        xpos,
                        ref,
                        alt,
                        population['slug'],
                        freq
                    )
            finally:
                counts_file.close()

        # this is now the canonical allele frequency file -
        # tab separated file with xpos / ref / alt / freq
        elif population['file_type'] == 'xbrowse_freq_file':
            counts_file, progress_file, progress = open_with_progress(
                population['file_path'], 'Loading population: {}'.format(population['slug']))
            try:
                for line in counts_file:
                    progress.update(progress_file.tell())
                    fields = line.strip('\n').split('\t')
                    xpos = int(fields[0])
                    ref = fields[1]
                    alt = fields[2]
                    freq = float(fields[3])
                    self._add_population_frequency(
                        xpos,
                        ref,
                        alt,
                        population['slug'],
                        freq
                    )
            finally:
                counts_file.close()

        elif population['file_type'] == 'tsv_file':
            freq_file, progress_file, progress = open_with_progress(
                population['file_path'], 'Loading population: {}'.format(population['slug']))
            try:
                header = next(freq_file)
                print("Header: " + header)
                for line in freq_file:
                    progress.update(progress_file.tell())
                    fields = line.strip('\n').split('\t')
                    chrom = fields[0]
                    pos = int(fields[1])
                    ref = fields[2]
                    alt = fields[3]
                    freq = float(fields[4])

                    xpos = genomeloc.get_single_location(chrom, pos)
                    self._add_population_frequency(
                        xpos,
                        ref,
                        alt,
                        population['slug'],
                        freq
                    )
            finally:
                freq_file.close()

        elif population['file_type'] == 'sites_vcf_with_counts':
            ac_info_key = population['ac_info_key']
            an_info_key = population['an_info_key']
            vcf_file, progress_file, progress = open_with_progress(
                population['file_path'],
                'Loading sites vcf: {}'.format(population['slug']),
                gzip_suffixes=('.gz', '.bgz'))
            try:
                for variant in vcf_stuff.iterate_vcf(vcf_file, meta_fields=[ac_info_key, an_info_key]):
                    progress.update(progress_file.tell())

                    alt_allele_pos = variant.extras['alt_allele_pos']
                    # Deliberately broad excepts below: a row whose counts
                    # cannot be parsed is reported and skipped, never fatal.
                    try:
                        ac = int(variant.extras.get(ac_info_key).split(',')[alt_allele_pos].replace("NA", "0"))
                    except Exception as e:
                        print("Couldn't parse AC value %s from %s: %s" % (alt_allele_pos, ac_info_key, variant.extras) + " " + str(e))
                        continue

                    try:
                        if "popmax" in ac_info_key.lower():
                            AN_index = alt_allele_pos  # each allele may have a different AN value from a different population
                        else:
                            AN_index = 0

                        an = int(variant.extras.get(an_info_key).split(',')[AN_index].replace("NA", "0"))
                    except Exception as e:
                        print("Couldn't parse AN value %s from %s: %s" % (alt_allele_pos, an_info_key, variant.extras) + " " + str(e))
                        continue

                    if an == 0:
                        freq = 0.0
                    else:
                        freq = float(ac)/an
                    self._add_population_frequency(
                        variant.xpos,
                        variant.ref,
                        variant.alt,
                        population['slug'],
                        freq
                    )
            finally:
                vcf_file.close()