# load_data.py

'''
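1. Read SNPData.csv, DrugInfo.csv and the tabix-indexed snps.sorted.txt.gz,
   process them, and write snps.py for later use.
2. Calculate genotype frequencies using ga4gh (for each population listed in
   okg.ped and for the whole data set) and write them to freq.py.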
'''
from pysam import TabixFile, asTuple
import ga4gh
import csv

# Banner written as the first thing in every generated module (snps.py and
# freq.py); inside the generated file it reads as a module docstring.
WARNING = (
    "'''\n"
    "This file is auto-generated by program. Don't touch me.\n"
    "'''\n"
)


def _load_snps(path):
    '''
    Read the SNP table at `path` and return a dict mapping SNP id -> row.

    Rows too short to have a 'Chromosome' field (csv.DictReader fills the
    missing columns with None) are treated as disease headings: their 'SNP'
    column holds the heading text.  Every following data row gets a
    'disease' key set to the most recent heading minus its last character
    (presumably a trailing ':' -- confirm against SNPData.csv).

    Raises TypeError if a data row appears before any heading row.
    '''
    snps = {}
    with open(path) as src:
        disease = None
        for row in csv.DictReader(src):
            if row['Chromosome'] is None:
                # heading row: remember it for the data rows that follow
                disease = row['SNP']
                continue
            row['disease'] = disease[0:-1]
            snps[row['SNP']] = row
    return snps


def _load_drug_info(path):
    '''Read the CSV at `path` and return a dict mapping SNP id -> row.'''
    with open(path) as src:
        return {row['SNP']: row for row in csv.DictReader(src)}


def _load_populations(path):
    '''
    Read the tab-separated file at `path` and return a dict mapping
    sample id ('Individual ID' column) -> population id ('Population').
    '''
    with open(path) as pop_src:
        return {indiv['Individual ID']: indiv['Population']
                for indiv in csv.DictReader(pop_src, delimiter='\t')}


def _find_coordinates(path, snps, drug_info):
    '''
    Scan the tabix file at `path` and return
    {snp id: {'chromosome': ..., 'pos': int}} for every SNP that is a key of
    `snps` or of `drug_info`.

    Each record is expected to have four fields; the first one is ignored.
    '''
    coordinates = {}
    table = TabixFile(path, parser=asTuple())
    try:
        for row in table.fetch():
            _, snp, chrom, pos = row
            if snp in snps or snp in drug_info:
                coordinates[snp] = {
                    'chromosome': chrom,
                    'pos': int(pos)
                }
    finally:
        # release the file handle even if a record is malformed
        table.close()
    return coordinates


def _write_module(path, constants):
    '''
    Write a generated Python module to `path`: the WARNING banner followed
    by one `NAME = value` line per (name, value) pair in `constants`, in the
    order given.
    '''
    with open(path, 'w') as dump:
        dump.write(WARNING)
        for name, value in constants:
            dump.write('%s = %s\n' % (name, value))


def _population_filter(populations, pop):
    '''
    Return a predicate that is true for calls whose 'callSetName' maps to
    population `pop` in `populations`.

    A factory is used instead of a lambda inside the comprehension so that
    every predicate is bound to its own `pop`, whether or not the consumer
    evaluates it immediately.
    '''
    def belongs(call):
        return populations.get(call.get('callSetName')) == pop
    return belongs


def _compute_frequencies(snps, populations):
    '''
    Query ga4gh for the variants of every SNP in `snps` and return a dict of
    frequencies: one entry per distinct population in `populations`, plus
    '1kg' for the unfiltered result.
    '''
    genotypes = {snp: snp_data['Code'] for snp, snp_data in snps.items()}
    variants = list(ga4gh.search_variants(
        genotypes, dataset=ga4gh.OKG, repo_id='google'))
    # determine allele frequencies for different population
    freqs = {
        pop: ga4gh.get_frequencies(
            variants,
            genotypes,
            population=_population_filter(populations, pop))
        for pop in set(populations.values())}
    # allele frequencies for 1000 Genomes' whole population
    freqs['1kg'] = ga4gh.get_frequencies(variants, genotypes)
    return freqs


def main():
    '''Generate snps.py and freq.py from the data files in the current directory.'''
    # load data from files
    snps = _load_snps('SNPData.csv')
    drug_info = _load_drug_info('DrugInfo.csv')
    populations = _load_populations('okg.ped')

    print('Determining genomic coordinates for sequences.')
    snp_table = _find_coordinates('snps.sorted.txt.gz', snps, drug_info)
    _write_module('snps.py', [
        ('COORDINATES', snp_table),
        ('DATA', snps),
        ('DRUG_INFO', drug_info),
    ])
    print('Data written to snps.py')

    print('Determining allele frequencies (using data from 1000 Genomes)')
    freqs = _compute_frequencies(snps, populations)
    _write_module('freq.py', [('FREQUENCIES', freqs)])
    print('Data written to freq.py.')


if __name__ == '__main__':
    main()