def generate(X, seqType, args):
    '''
    Count gapped element pairs in every sequence and save the feature matrix.

    For each gap size g in 1..args.gGap (inclusive) and each ordered pair
    (a, b) drawn from utils.sequenceElements(seqType), the feature is the
    number of windows of length g + 2 whose first item is ``a`` and whose
    last item is ``b``.  Features are laid out gap by gap, and within a gap
    in itertools.product order.

    :param X: iterable of sequences; each one is cut to its first
        args.terminusLength items before counting.
    :param seqType: sequence type label; 'DNA'/'RNA' and 'PROT' select the
        feature count handed to save.datasetSave, anything else gives 0.
    :param args: object providing ``gGap`` and ``terminusLength``.
    :return: None; the matrix is handed to save.datasetSave with tag 'fg11'.
    '''
    from collections import Counter

    elements = utils.sequenceElements(seqType)
    pairs = list(itertools.product(elements, repeat=2))

    T = []
    for x in X:
        x = x[:args.terminusLength]
        t = []
        for gap in range(1, args.gGap + 1):
            # Tally (first, last) of every window in one pass instead of
            # rescanning the window list once per pair; this also stays
            # correct if utils.kmers hands back a one-shot iterator.
            ends = Counter((v[0], v[-1]) for v in utils.kmers(x, gap + 2))
            t.extend(ends[pair] for pair in pairs)
        T.append(np.array(t))
    T = np.array(T)

    if seqType == 'DNA' or seqType == 'RNA':
        totalFeature = 4 * args.gGap * 4
    elif seqType == 'PROT':
        totalFeature = 20 * args.gGap * 20
    else:
        totalFeature = 0

    save.datasetSave(T, totalFeature, 'fg11')
def log_ZS_analytic(params):
    """Compute log_Z analytically by summing over every site from kmers(L).

    The single argument is a ``(matrix, mu, Ne)`` tuple, exactly as before;
    it is unpacked in the body because tuple parameters in a ``def`` are a
    syntax error on Python 3.

    Each site contributes ``(1 / (1 + exp(ep - mu))) ** (Ne - 1)``, where
    ``ep`` is ``score_seq(matrix, site)`` and ``L = len(matrix)``.

    Returns the natural log of the accumulated sum.
    """
    matrix, mu, Ne = params
    nu = Ne - 1  # exponent applied to every per-site term
    L = len(matrix)
    acc = 0
    for kmer in kmers(L):
        ep = score_seq(matrix, "".join(kmer))
        acc += (1 / (1 + exp(ep - mu))) ** nu
    return log(acc)
def log_ZS_analytic(params):
    """Compute log_Z analytically.

    ``params`` is the ``(matrix, mu, Ne)`` tuple callers already pass as the
    sole argument.  It is unpacked here rather than in the signature so the
    definition also parses on Python 3, where tuple parameters were removed.

    The result is ``log(sum(f(site) ** (Ne - 1)))`` over every site yielded
    by ``kmers(len(matrix))``, with
    ``f(site) = 1 / (1 + exp(score_seq(matrix, site) - mu))``.
    """
    matrix, mu, Ne = params
    nu = Ne - 1
    L = len(matrix)

    def occupancy(site):
        # Per-site term before it is raised to the power nu.
        return 1 / (1 + exp(score_seq(matrix, site) - mu))

    acc = 0
    for kmer in kmers(L):
        acc += occupancy("".join(kmer)) ** nu
    return log(acc)
def log_Zb_chem_pot_ref_dep(L, sigma, G, mu, upto=4):
    """Return log(G / 4**L * Z0), with Z0 summed over small sets of sites.

    Every site from ``kmers(L)`` gets the score
    ``sigma * (L - site.count("A"))``.  For each k in ``0 .. upto - 1`` the
    term ``Z_k`` adds ``exp(-sum(comb) - mu * k)`` over every k-element
    combination of those scores, and ``Z0`` is the sum of the ``Z_k``.

    :param L: site length passed to ``kmers``.
    :param sigma: score contributed by each position that is not "A".
    :param G: prefactor numerator (presumably a genome length; confirm with
        callers).
    :param mu: weight multiplied by the combination size k in the exponent.
    :param upto: exclusive upper bound on the combination size.
    """
    sites = kmers(L)
    scores = [sigma * (L - site.count("A")) for site in sites]
    Zs = [
        sum(
            exp(-sum(comb) - mu * k)
            for comb in itertools.combinations(scores, k)
        )
        for k in trange(upto)
    ]
    Z0 = sum(Zs)
    # float(G) keeps the prefactor from truncating to an integer (possibly 0,
    # which would make log fail) when G is an int and classic Python 2
    # division is in effect; under true division the value is unchanged.
    return log(float(G) / (4 ** L) * Z0)
def generate(X, seqType, args):
    '''
    One-hot encode the gapped end pair of every window and save the result.

    For each sequence and each gap size g in 1..args.gGap, every window of
    length g + 2 becomes one row holding a single 1 in the column of
    (first item + last item of the window).  Each per-gap block is padded
    with all-zero rows up to args.terminusLength - 2 rows, and the blocks
    of one sequence are joined side by side (axis=1).

    Changes from the previous version:
      * padding is computed from the rows actually produced, so a sequence
        shorter than the window is no longer over-padded;
      * a window whose end pair is not built from the known elements gives
        an all-zero row instead of a row with an extra column;
      * the zero row is sized from the element pairs, so it exists for any
        seqType and always matches the one-hot row width.

    :param X: iterable of sequences; each is cut to args.terminusLength.
    :param seqType: 'DNA', 'RNA' or 'PROT'; other values give totalFeature 0.
    :param args: object providing ``gGap`` and ``terminusLength``.
    :return: None; the array is handed to save.datasetSave with tag 'pg11'.
    '''
    elements = utils.sequenceElements(seqType)
    m = list(itertools.product(elements, repeat=2))

    # Column index of every pair, in first-seen order (same layout the
    # per-window dict used to produce).
    column = {}
    for pair in m:
        column.setdefault(''.join(pair), len(column))
    width = len(column)

    # Row count of a full-length sequence at the smallest window (2 + 1).
    rows_wanted = args.terminusLength - (2 + 1) + 1

    T = []
    for x in X:
        x = x[:args.terminusLength]
        merged = []
        for gap in range(1, args.gGap + 1):
            t = []
            for kmer in utils.kmers(x, 2 + gap):
                row = [0] * width
                idx = column.get(kmer[0] + kmer[-1])
                if idx is not None:
                    row[idx] = 1
                t.append(row)
            # A negative shortfall yields an empty range, i.e. no padding.
            t.extend([0] * width for _ in range(rows_wanted - len(t)))
            merged.append(np.array(t))
        T.append(np.concatenate(merged, axis=1))
    T = np.array(T)

    if seqType == 'DNA' or seqType == 'RNA':
        totalFeature = 4 * args.gGap * 4
    elif seqType == 'PROT':
        totalFeature = 20 * args.gGap * 20
    else:
        totalFeature = 0

    save.datasetSave(T, totalFeature, 'pg11')
def compute_Z(model):
    """Return the sum of exp(score(model, site)) over all sites from kmers(L).

    The site length L is derived from the number of entries in ``model`` as
    ``int(1 + sqrt(1 + 8 * len(model)) / 2)``.
    """
    n_entries = len(model)
    site_length = int(1 + sqrt(1 + 8 * n_entries) / 2)
    total = 0
    for letters in kmers(site_length):
        total += exp(score(model, "".join(letters)))
    return total
ep = score_seq(matrix, "".join(kmer)) acc += (1 / (1 + exp(ep - mu)))**(Ne - 1) return log(acc) def log_ZM_analytic((matrix, mu, Ne), N): log_ZS = log_ZS_analytic((matrix, mu, Ne)) return N * log_ZS def log_Z_analytic((matrix, mu, Ne), N): """compute log_Z analytically""" acc = 0 nu = Ne - 1 L = len(matrix) for kmer in kmers(L): ep = score_seq(matrix, "".join(kmer)) acc += (1 / (1 + exp(ep - mu)))**(Ne - 1) return N * log(acc) def log_ZS_naive((matrix, mu, Ne), trials=1000): acc = 0 nu = Ne - 1 L = len(matrix) for i in xrange(trials): ep = score_seq(matrix, random_site(L)) acc += (1 / (1 + exp(ep - mu)))**(Ne - 1) mean_Zs = acc / trials return L * log(4) + log(mean_Zs)
gen_kmers <k> sample <sample name> or gen_kmers <k> genomes <opt:full> """ k = int(sys.argv[1]) version = sys.argv[2] if version == 'sample': sample_name = sys.argv[3] filename = 'samples/%s.txt' % sample_name sample_kmers = kmer_store() with open(filename) as f: for read in f: read_kmers = kmers(read.strip(), k) for kmer in read_kmers: sample_kmers.update(kmer) output_filename = 'pickles/%s_kmers_%d.pickle' % (os.path.basename( os.path.normpath(filename)).replace('.txt', ''), k) with open(output_filename, 'w') as f: cPickle.dump(sample_kmers.kmers, f) elif version == 'genomes': full = True if (len(sys.argv) == 4 and sys.argv[3] == 'full') else False kmer_spectra = defaultdict(lambda: [0] * 20) for index, genome_filename in enumerate( progress( filter(lambda x: x.endswith('.fna'), os.listdir('genomes')))): kmer_spectrum = {} if full else kmer_store()
def generate(X, seqType, args):
    '''
    One-hot encode every k-mer position of each sequence and save the result.

    Each window of length args.kTuple becomes one row holding a single 1 in
    the column of that k-mer.  Every sequence's block is padded with
    all-zero rows up to args.terminusLength - args.kTuple + 1 rows.

    Changes from the previous version:
      * padding is computed from the rows actually produced, so a sequence
        shorter than kTuple is no longer over-padded;
      * a k-mer that is not built from the known elements gives an all-zero
        row instead of a row with an extra column;
      * the zero row is sized from the k-mer table, so it exists for any
        seqType and always matches the one-hot row width.

    :param X: iterable of sequences; each is cut to args.terminusLength.
    :param seqType: 'DNA', 'RNA' or 'PROT'; other values give totalFeature 0.
    :param args: object providing ``kTuple`` and ``terminusLength``.
    :return: None; the array is handed to save.datasetSave with tag 'pkmer'.
    '''
    elements = utils.sequenceElements(seqType)
    m = list(itertools.product(elements, repeat=args.kTuple))

    # Column index of every k-mer, in first-seen order (same layout the
    # per-window dict used to produce).
    column = {}
    for combo in m:
        column.setdefault(''.join(combo), len(column))
    width = len(column)

    terminusLength = args.terminusLength
    rows_wanted = terminusLength - args.kTuple + 1

    T = []
    for x in X:
        x = x[:terminusLength]
        t = []
        for kmer in utils.kmers(x, args.kTuple):
            row = [0] * width
            idx = column.get(kmer)
            if idx is not None:
                row[idx] = 1
            t.append(row)
        # A negative shortfall yields an empty range, i.e. no padding.
        t.extend([0] * width for _ in range(rows_wanted - len(t)))
        T.append(np.array(t))
    T = np.array(T)

    if seqType == 'DNA' or seqType == 'RNA':
        totalFeature = 4 ** args.kTuple
    elif seqType == 'PROT':
        totalFeature = 20 ** args.kTuple
    else:
        totalFeature = 0

    save.datasetSave(T, totalFeature, 'pkmer')
gen_kmers <k> sample <sample name> or gen_kmers <k> genomes <opt:full> """ k = int(sys.argv[1]) version = sys.argv[2] if version == 'sample': sample_name = sys.argv[3] filename = 'samples/%s.txt' % sample_name sample_kmers = kmer_store() with open(filename) as f: for read in f: read_kmers = kmers(read.strip(), k) for kmer in read_kmers: sample_kmers.update(kmer) output_filename = 'pickles/%s_kmers_%d.pickle' % (os.path.basename(os.path.normpath(filename)).replace('.txt',''), k) with open(output_filename, 'w') as f: cPickle.dump(sample_kmers.kmers, f) elif version =='genomes': full = True if (len(sys.argv) == 4 and sys.argv[3] == 'full') else False kmer_spectra = defaultdict(lambda:[0]*20) for index, genome_filename in enumerate(progress(filter(lambda x: x.endswith('.fna'), os.listdir('genomes')))): kmer_spectrum = {} if full else kmer_store() for kmer in kmers(nucleotides_fna('genomes/'+genome_filename), k): if full: kmer_spectrum[kmer] = kmer_spectrum[kmer]+1 if kmer in kmer_spectrum else 1
L = len(matrix) for kmer in kmers(L): ep = score_seq(matrix, "".join(kmer)) acc += (1/(1+exp(ep-mu)))**(Ne-1) return log(acc) def log_ZM_analytic((matrix, mu, Ne), N): log_ZS = log_ZS_analytic((matrix, mu, Ne)) return N * log_ZS def log_Z_analytic((matrix, mu, Ne), N): """compute log_Z analytically""" acc = 0 nu = Ne - 1 L = len(matrix) for kmer in kmers(L): ep = score_seq(matrix, "".join(kmer)) acc += (1/(1+exp(ep-mu)))**(Ne-1) return N * log(acc) def log_ZS_naive((matrix, mu, Ne), trials=1000): acc = 0 nu = Ne - 1 L = len(matrix) for i in xrange(trials): ep = score_seq(matrix, random_site(L)) acc += (1/(1+exp(ep-mu)))**(Ne-1) mean_Zs = acc / trials return L * log(4) + log(mean_Zs) def log_ZM_naive((matrix, mu, Ne), N, trials=1000):
def compute_Z(model):
    """Add up exp(score(model, site)) for every site produced by kmers(L).

    L is ``int(1 + sqrt(1 + 8 * k) / 2)`` where ``k = len(model)``.
    """
    width = int(1 + sqrt(1 + 8 * len(model)) / 2)
    # Both stages are lazy, so sites are scored one at a time in kmers order.
    sites = ("".join(kmer) for kmer in kmers(width))
    weights = (exp(score(model, site)) for site in sites)
    return sum(weights)