# load_data.py
'''
1. Read SNPData.csv and DrugInfo.csv, look up the genomic coordinates of
   their SNPs, and write the results to snps.py for later use.
2. Calculate genotype frequencies using ga4gh and write them to freq.py.
'''
from pysam import TabixFile, asTuple
import ga4gh
import csv
# Banner written at the top of every module this script generates; the text
# is itself a triple-quoted string, so it becomes that module's docstring.
WARNING = (
    "'''\n"
    "This file is auto-generated by program. Don't touch me.\n"
    "'''\n"
)
def parse_snp_rows(rows):
    '''
    Collect SNP records from `rows`, tagging each with its disease heading.

    `rows` is an iterable of mappings such as csv.DictReader yields. A row
    whose 'Chromosome' value is None is treated as a disease heading: its
    'SNP' value, minus the final character, becomes the 'disease' of every
    following record until the next heading.

    Returns a dict mapping each record's 'SNP' value to a plain-dict copy of
    that record with the extra 'disease' key added.

    Raises ValueError if a record appears before any heading.
    '''
    snps = {}
    disease = None
    for row in rows:
        # csv.DictReader fills the columns missing from a short row with
        # None, so this matches rows that carry only the first field.
        if row['Chromosome'] is None:
            disease = row['SNP']
            continue
        if disease is None:
            raise ValueError(
                'SNP row %r appears before any disease heading' % row['SNP'])
        # Copy into a plain dict: the records are later written out with
        # repr(), and only a plain dict is guaranteed to print as a literal.
        record = dict(row)
        # Drops the heading's last character (presumably a trailing ':' --
        # confirm against SNPData.csv).
        record['disease'] = disease[0:-1]
        snps[record['SNP']] = record
    return snps


def _find_coordinates(tabix_path, wanted):
    '''
    Return {snp: {'chromosome': ..., 'pos': int}} for every SNP in `wanted`.

    Each row fetched from the tabix file at `tabix_path` must unpack into
    exactly four fields; the first is ignored and the rest are read as SNP
    id, chromosome and position.
    '''
    table = {}
    tabix = TabixFile(tabix_path, parser=asTuple())
    try:
        for _, snp, chrom, pos in tabix.fetch():
            if snp in wanted:
                table[snp] = {
                    'chromosome': chrom,
                    'pos': int(pos)
                }
    finally:
        # Release the file handle even if a row is malformed.
        tabix.close()
    return table


def _write_module(path, assignments):
    '''
    Write a generated Python module to `path`.

    The file starts with the WARNING banner, followed by one `NAME = value`
    line per (name, value) pair in `assignments`, in the given order.
    '''
    with open(path, 'w') as dump:
        dump.write(WARNING)
        for name, value in assignments:
            dump.write('%s = %r\n' % (name, value))


def _compute_frequencies(snps, populations):
    '''
    Return allele frequencies keyed by population, plus a '1kg' entry.

    `snps` maps SNP id to its record (the 'Code' field is used as the
    genotype); `populations` maps sample id to population id.
    '''
    genotypes = {snp: snp_data['Code'] for snp, snp_data in snps.items()}
    variants = list(ga4gh.search_variants(
        genotypes, dataset=ga4gh.OKG, repo_id='google'))

    # determine allele frequencies for different population
    freqs = {}
    for pop in set(populations.values()):
        # `pop=pop` pins the current population to this predicate, so it
        # stays correct even if it is called after the loop has moved on.
        freqs[pop] = ga4gh.get_frequencies(
            variants,
            genotypes,
            population=lambda call, pop=pop:
                populations.get(call.get('callSetName')) == pop)

    # allele frequencies for 1000 Genomes' whole population
    freqs['1kg'] = ga4gh.get_frequencies(variants, genotypes)
    return freqs


def main():
    '''
    Generate snps.py and freq.py from the data files in the working
    directory (SNPData.csv, DrugInfo.csv, okg.ped, snps.sorted.txt.gz).
    '''
    # load data from files
    with open('SNPData.csv') as src:
        snps = parse_snp_rows(csv.DictReader(src))
    with open('DrugInfo.csv') as src:
        drug_info = {row['SNP']: dict(row) for row in csv.DictReader(src)}
    with open('okg.ped') as pop_src:
        # mapping: sample id -> population id
        populations = {indiv['Individual ID']: indiv['Population']
                       for indiv in csv.DictReader(pop_src, delimiter='\t')}

    print('Determining genomic coordinates for sequences.')
    snp_table = _find_coordinates(
        'snps.sorted.txt.gz', set(snps) | set(drug_info))
    _write_module('snps.py', [
        ('COORDINATES', snp_table),
        ('DATA', snps),
        ('DRUG_INFO', drug_info),
    ])
    print('Data written to snps.py')

    print('Determining allele frequencies (using data from 1000 Genomes)')
    freqs = _compute_frequencies(snps, populations)
    _write_module('freq.py', [('FREQUENCIES', freqs)])
    print('Data written to freq.py.')


if __name__ == '__main__':
    main()