def resampling_f(fastadict, seq, n, k):
    """Resample one sequence with replacement and score every resample.

    The bases of ``fastadict[seq]`` (with every ``N``/``n`` removed) are
    sampled with replacement to build ``n`` new sequences of ``k`` bases
    each; ``heterozygosity()`` is evaluated on each of them.

    Side effect: ``fastadict[seq]`` is overwritten with the N-stripped
    sequence, exactly as the previous implementation did.

    Parameters:
        fastadict: mapping of sequence name -> sequence string.
        seq: key of the sequence to resample.
        n: number of resampled sequences to build.
        k: length (in bases) of every resampled sequence.

    Returns:
        A list of ``n`` values, the element at index 2 of the
        ``heterozygosity()`` result for each resampled sequence.

    Raises:
        KeyError: if ``seq`` is not in ``fastadict``.
        ValueError: raised by numpy when no bases are left to sample from
            while ``n * k`` is greater than zero.
    """
    stripped = fastadict[seq].replace("N", "").replace("n", "")
    fastadict[seq] = stripped

    # Draw all n * k bases in a single call (same call as before, so the
    # random stream is consumed identically), then cut the flat draw into n
    # rows of k bases.  Joining each row builds the string explicitly and
    # avoids reinterpreting the raw buffer with ``.view('S<k>')``, which is
    # only correct when every element occupies exactly one byte.
    draws = np.random.choice(tuple(stripped), replace=True, size=(n * k, ))
    resampled = ("".join(row) for row in draws.reshape(n, k))

    # Index 2 of the heterozygosity result is the Het of the new sequence.
    return [list(heterozygosity(new_seq))[2] for new_seq in resampled]
return intergenicDict intergenDict = intergenicCoord(intergenic) if winSize == 0 and step == 0: # Do not use sliding window approach, just calculate heterozygosity in the given region new_fastdict = {} for scaffold in intergenDict.keys(): if scaffold in fastaDict.keys(): for coordinate in intergenDict.get(scaffold): intron = fastaDict.get( scaffold)[int(coordinate[0]):int(coordinate[1])] print(scaffold + '\t' + str(coordinate[0]) + '\t' + str(coordinate[1]) + '\t' + str(heterozygosity(intron)[0]) + '\t' + str(heterozygosity(intron)[1]) + '\t' + str(heterozygosity(intron)[2])) # Get a sequence in fasta that lies within the intergenic sequence -- V2 else: # Print header print("Scaffold" + "\t" + "Orientation" + "\t" + "Window" + "\t" + "Window_Start" + "\t" + "Window_End" + "\t" + "Distance_from_Gene" + "\t" + "Intergenic_start" + "\t" + "Intergenic_end" + "\t" + "Num_SNPs" + "\t" + "Num_bases" + "\t" + "Het") new_fastdict = {} for scaffold in intergenDict.keys(): counter = 0 if scaffold in fastaDict.keys(): # print scaffold
#!/usr/bin/python from __future__ import division import sys from fasta import readfasta from het import heterozygosity fasta = sys.argv[1] # Read fasta into a dictionary with open(fasta, 'r') as f: fasta_dict = readfasta(f) # Calculate heterozygosity for every scaffold for key in fasta_dict.keys(): het = heterozygosity(fasta_dict[key]) print(key + "\t" + str(het[0]) + "\t" + str(het[1]) + "\t" + str(het[2]))
#**********************************************
# Group the intron intervals by the name in the first column.  Every value
# is the list of remaining tab-separated columns of one input line.
intron_dict = {}
for line in intron_coord:
    fields = line.strip('\n').split('\t')
    intron_dict.setdefault(fields[0], []).append(fields[1:])

#**************************************************************************
# Extract the intronic sequences from fasta and print their heterozygosity
#**************************************************************************
for scaf, coordinates in intron_dict.items():
    # Only the part of the name before the first '_' is looked up in the
    # fasta dictionary.
    scaffold = scaf.split('_', 1)[0]
    if scaffold not in fastaseq:
        continue
    scaffold_seq = fastaseq[scaffold]
    for coordinate in coordinates:
        # The two coordinates are used directly as a Python slice, i.e.
        # start is a 0-based index and end is exclusive.
        # NOTE(review): the previous comment described shifting a 1-based
        # start down to 0 (ABCD with 1 and 3 -> [0:3]), but the code has
        # never subtracted 1 -- confirm the convention of the input file.
        intron_seq = scaffold_seq[int(coordinate[0]):int(coordinate[1])]
        # Compute the result once instead of once per printed column.
        het = heterozygosity(intron_seq)
        print('\t'.join([scaf, str(coordinate[0]), str(coordinate[1]),
                         str(het[0]), str(het[1]), str(het[2])]))

intron_coord.close()
fasta.close()
for coordinate in intergenDict.get(scaffold): counter = 0 new_scaffold = fastaDict.get( scaffold)[int(coordinate[0]):int(coordinate[1]) + 1] length = len(new_scaffold) if length % 2 == 1: length = length + 1 midpoint = int(length / 2) sequence_left = new_scaffold[0:midpoint] sequence_right = new_scaffold[:(midpoint - 1):-1] chunks_left = slidingWindow(sequence_left, winSize, step) for i in chunks_left: counter += 1 end = str(int(coordinate[0]) + int(counter * winSize)) start = str(int(end) - winSize) distance = str(abs(int(coordinate[0]) - int(end))) print scaffold, "left", counter, start, end, distance, coordinate[ 0], coordinate[1], heterozygosity(i) counter = 0 chunks_right = slidingWindow(sequence_right, winSize, step) for i in chunks_right: counter += 1 end = str(int(coordinate[1]) - int(counter * winSize)) start = str(int(end) + winSize) distance = str(int(coordinate[1]) - int(end)) print scaffold, "right", counter, start, end, distance, coordinate[ 0], coordinate[1], heterozygosity(i) fastaseq.close() intergenic.close()
with open(window, 'r') as windowfile: for line in windowfile: line = line.strip('\n').split('\t') key, value = line[0], line[1:] if key in window_dict.keys(): window_dict[key].append(value) else: window_dict[key] = [value] # print window_dict fasta_win = {} for scaffold in window_dict.keys(): for element in window_dict.get(scaffold): if scaffold in fasta_win.keys(): fasta_win[scaffold].append(fasta_dict[scaffold][int(element[0]) - 1:int(element[1])]) else: fasta_win[scaffold] = [ fasta_dict[scaffold][int(element[0]) - 1:int(element[1])] ] # print fasta_win for scaffold in fasta_win.keys(): for sequence in fasta_win.get(scaffold): index = fasta_win[scaffold].index(sequence) print scaffold, window_dict[scaffold][index][0], window_dict[scaffold][ index][1], window_dict[scaffold][index][2], heterozygosity( sequence)[0], heterozygosity(sequence)[1], heterozygosity( sequence)[2]