for i in xrange(len(data)-(k-1)): d[s[i:i+k]] += 1 return d def permutation_test(s, n=100): """Assess odds ratio on permuted string and return a list with scores.""" res = [] for _ in range(n): random.shuffle(s) f = count_kmers("".join(s), 2) res.append(log2(N * f[a + b] / float(f1[a] * f1[b]))) return res from main import get_seq data = "".join(get_seq()) N = float(len(data)) f2 = count_kmers(data, 2) f1 = count_kmers(data, 1) a = "G"; b = "C" odds_ratio = log2(N * f2[a + b] / float(f1[a] * f1[b])) print odds_ratio list_data = list(data) res = permutation_test(list_data) p = sum(1 for r in res if r>odds_ratio) / float(len(res)) print "OR=%5.3f, p=%7.5f" % (odds_ratio, p)
offset = 0 skipped = 0 for i in xrange(min(len(seq1), len(seq2))): if i + offset == len(seq2)-1: break if(seq1[i] != seq2[i+offset]): if(seq1[i] == seq2[i+offset+1] and seq1[i+1] == seq2[i+offset+2]): offset += 1 skipped += 1 continue diff+=1 print "skipped ", skipped return diff+skipped if __name__ == '__main__': seqs = [get_seq("NC_001807.1.fasta"), get_seq("NC_001807.2.fasta"), get_seq("NC_001807.3.fasta"), get_seq("NC_001807.4.fasta"), get_seq("NC_012920.1.fasta")] lens = [] for s in seqs[:-1]: lens.append(simple_seq_diff(seqs[4], s)) print "" import pylab pylab.bar(range(len(lens)), lens) pylab.xlabel("NC_001807.1 NC_001807.2 NC_001807.3 NC_001807.4") pylab.ylabel("Num differences + num skips") pylab.show()