-
Notifications
You must be signed in to change notification settings - Fork 1
/
repolarization_lookupKey.py
140 lines (122 loc) · 7.45 KB
/
repolarization_lookupKey.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
###########################
# Author: Patrick Monnahan
# Purpose: Repolarize reference/alternative alleles in A. arenosa mapped to North American A. lyrata. We used genotype calls from mapping European A. lyrata, A. croatica, and A. halleri to the same North American A. lyrata reference genome, and identified sites at which these samples showed an average alternative-allele frequency > 0.5. Since all of these samples are more closely related genetically to A. arenosa, we repolarized the reference/alternative alleles at these sites.
############################
import argparse
import gzip
# Position of each sample among the genotype columns (0-based, counted from
# the first sample column) -> species slot used in the per-site tallies.
# Slots: 0 = croatica, 1 = lyrata, 2 = halleri.
_SPECIES_OF_SAMPLE = {0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 2}


def _build_parser():
    """Return the command-line parser; options are those of the original script."""
    parser = argparse.ArgumentParser(description='This program takes a merged vcf with highest-coverage individuals from specific pops as input, and outputs a lookup key from the merged vcf')
    parser.add_argument('-v', type=str, metavar='vcf_path', required=True, help='path to vcf')
    parser.add_argument('-gz', type=str, metavar='gzipped?', required=False, default='false', help='are vcfs gzipped (true) or not (false)')
    parser.add_argument('-o', type=str, metavar='Output_Prefix', required=True, help='Vcfs retain original name but the concatenated lookup key will be a text file specified by output')
    parser.add_argument('-mi', type=int, metavar='minimum amount individuals', required=True, help='minimum amount of individuals required to support alternative alleles')
    parser.add_argument('-mp', type=float, metavar='min. proportion alt alleles', required=True, help='minimum proportion of alternative alleles to allow')
    parser.add_argument('-ly', type=str, metavar='lyrata_only?', required=False, default='false', help='do you want to include lyrata only (true) or not (false)?')
    return parser


def _alt_allele_score(sample_field):
    """Return the ALT-allele count credited to one sample, or None if unusable.

    A sample is usable only when its colon-separated field has exactly five
    sub-fields and every allele of the genotype (first sub-field) is called.
    Both "/" and "|" are accepted as allele separators.  The score is 2 when
    the allele indices sum to 2, 1 when they sum to 1, and 0 otherwise.
    """
    fields = sample_field.split(":")
    if len(fields) != 5:
        return None
    alleles = fields[0].replace("|", "/").split("/")
    if not all(allele.isdigit() for allele in alleles):
        return None  # missing ("./.") or partially missing ("1/.") genotype
    # NOTE(review): scoring by the *sum* of allele indices is kept from the
    # original; at multi-allelic sites "0/2" scores 2 and "1/2" scores 0.
    dosage = sum(int(allele) for allele in alleles)
    return dosage if dosage in (1, 2) else 0


def _tally_species(sample_fields):
    """Return (called, alt): per-species allele totals for one site.

    Each usable sample adds 2 to its species' ``called`` total and its
    ALT-allele score to the species' ``alt`` total.  Samples beyond the six
    mapped columns are ignored.
    """
    called = [0, 0, 0]
    alt = [0, 0, 0]
    for column, sample_field in enumerate(sample_fields):
        slot = _SPECIES_OF_SAMPLE.get(column)
        if slot is None:
            continue
        score = _alt_allele_score(sample_field)
        if score is None:
            continue
        called[slot] += 2
        alt[slot] += score
    return called, alt


def _repolarization_category(called, alt):
    """Return the index into the type counts if the site is repolarized, else None.

    0: all three species genotyped and at least two have ALT frequency > 0.5.
    1: croatica + lyrata only, 2: croatica + halleri only,
    3: lyrata + halleri only.  For the two-species cases the mean of three
    frequencies must exceed 0.5, where the second species of the pair is
    counted twice.  The original author's note gives the reason: halleri >
    lyrata > croatica in genetic similarity to arenosa.
    """
    freqs = [float(a) / float(n) if n >= 2 else None for a, n in zip(alt, called)]
    cro, lyr, hal = freqs
    if None not in freqs:
        return 0 if sum(1 for f in freqs if f > 0.5) >= 2 else None
    if cro is not None and lyr is not None:
        category, weighted = 1, [cro, lyr, lyr]
    elif cro is not None and hal is not None:
        category, weighted = 2, [cro, hal, hal]
    elif lyr is not None and hal is not None:
        category, weighted = 3, [lyr, hal, hal]
    else:
        return None  # fewer than two species genotyped at this site
    return category if sum(weighted) / 3.0 > 0.5 else None


def main(argv=None):
    """Write the repolarization lookup key for a merged VCF.

    Reads the VCF given by ``-v`` and writes "<scaffold>\\t<position>" for
    every repolarized site to ``<-o>repolarized.lookupKey.minInd_<-mi>.txt``.
    The four per-category tallies are printed to stdout and also written to
    ``<-o>counts.txt``.

    Args:
        argv: argument list to parse; ``None`` means the process arguments.

    Returns:
        The list of four per-category counts of written sites.
    """
    args = _build_parser().parse_args(argv)
    lookup_path = args.o + "repolarized.lookupKey.minInd_" + str(args.mi) + ".txt"

    lyrata_only = args.ly == 'true'
    if lyrata_only:
        # NOTE(review): the original forced -mi 2 / -mp 1.0 here and tallied
        # lyrata genotypes, but no rule ever consumed them, so this mode has
        # always produced an empty key.  Behaviour kept; now reported.
        import sys
        sys.stderr.write("WARNING: lyrata-only mode has no selection rule; the lookup key will be empty\n")

    # Gzipped input is streamed as text instead of being decompressed on disk.
    compressed = args.gz == 'true' and args.v[-3:] == '.gz'
    opener = gzip.open if compressed else open

    type_counts = [0, 0, 0, 0]
    with opener(args.v, 'rt') as vcf, open(lookup_path, 'w') as lookup_table_file:
        for line in vcf:
            cols = line.rstrip('\r\n').split('\t')
            # Skip blank lines, "##" meta lines and the "#CHROM" header.
            if len(cols) < 2 or line.startswith('#'):
                continue
            scaff = cols[0]
            position = int(cols[1])
            if lyrata_only:
                continue
            called, alt = _tally_species(cols[9:])
            category = _repolarization_category(called, alt)
            if category is not None:
                lookup_table_file.write(scaff + "\t" + str(position) + "\n")
                type_counts[category] += 1

    print(type_counts)
    # counts.txt was previously created but left empty; record the tallies.
    with open(args.o + "counts.txt", 'w') as count_file:
        count_file.write(str(type_counts) + "\n")
    return type_counts


if __name__ == "__main__":
    main()