def load_data(xfile, maxlen=5000, hard_stop=1e60, n_haps=208):
    """Read replicate genotype matrices and positions from a gzipped text file.

    The file is scanned for lines whose first whitespace-separated token
    starts with ``//``.  Relative to each such marker line:

    * the line two below supplies the positions (its first token is
      skipped, the remaining tokens are parsed as floats), and
    * the ``n_haps`` lines starting three below each supply one matrix
      row: the first token is a string of digits that is split into single
      characters and stored as ``int8``.

    (This layout presumably corresponds to ``ms`` simulator output --
    confirm against whatever produces the file.)

    Each matrix is row-sorted with ``sort_min_diff`` and then transposed,
    so axis 0 of a stored array runs along the digit string.  A matrix
    with more than ``maxlen`` entries on that axis is dropped together
    with its positions, which keeps the two returned lists aligned.

    Progress is printed every tenth kept matrix, followed by a summary
    line ``<dropped-for-length> <replicates-read>`` and a line with the
    final list lengths.

    Args:
        xfile: path of the gzip-compressed input file.
        maxlen: largest accepted size of axis 0 of a transposed matrix.
        hard_stop: stop reading once this many matrices have been kept.
        n_haps: number of matrix rows per replicate (defaults to the
            previously hard-coded 208).

    Returns:
        A ``(matrices, positions)`` tuple of equal-length lists holding
        ``int8`` arrays and ``float32`` position vectors in file order.
    """
    rows = []
    # Close the gzip handle deterministically instead of leaking it.
    with gz(xfile) as handle:
        for raw in handle:
            # GzipFile yields bytes on Python 3; decode so the token
            # comparison and per-character digit parsing work on text.
            if not isinstance(raw, str):
                raw = raw.decode()
            rows.append(raw.strip().split())

    marker_rows = [
        row_no for row_no, tokens in enumerate(rows)
        if tokens and tokens[0].startswith('//')
    ]

    matrices = []
    positions = []
    site_counts = []
    for rep_no, start in enumerate(marker_rows):
        pos = np.array([float(tok) for tok in rows[start + 2][1:]],
                       dtype="float32")
        hap_rows = rows[start + 3:start + 3 + n_haps]
        haps = np.array([
            np.array([int(ch) for ch in tokens[0]], dtype='int8')
            for tokens in hap_rows
        ])
        mat = sort_min_diff(haps).T

        # Record the dimension that is actually tested against maxlen.
        # The previous code stored len(mat[0]) (the row count after the
        # transpose), which made the "too long" summary meaningless.
        site_counts.append(mat.shape[0])

        if mat.shape[0] <= maxlen:
            matrices.append(mat)
            positions.append(pos)
            if len(matrices) % 10 == 0:
                print('%s %s %s' % (rep_no, len(matrices), len(positions)))
        if len(matrices) >= hard_stop:
            break

    n_too_long = sum(1 for count in site_counts if count > maxlen)
    print('%s %s' % (n_too_long, len(site_counts)))
    print('********* %s %s' % (len(matrices), len(positions)))
    return matrices, positions
def _gunzip(self, fileobjin, fileobjout): """Returns NamedTemporaryFile with unzipped content of fileobj""" source = gz(fileobj=fileobjin, mode='rb') target = fileobjout try: while 1: data=source.read(65536) if data and len(data): target.write(data) else: target.flush() break except Exception: target.close() raise else: return target
def _gunzip(self, fileobjin, fileobjout): """ Returns NamedTemporaryFile with unzipped content of fileobj. @type fileobjin: File @param fileobjin: file containing the archive @type fileobjout: File @param fileobjout: file where to put the unziped file """ source = gz(fileobj=fileobjin, mode='rb') target = fileobjout try: while 1: data=source.read(65536) if data and len(data): target.write(data) else: target.flush() break except Exception: target.close() raise else: return target
from common import * from gzip import GzipFile as gz from sklearn.neighbors import NearestNeighbors def sort_min_diff(amat): '''this function takes in a SNP matrix with indv on rows and returns the same matrix with indvs sorted by genetic similarity. this problem is NP-hard, so here we use a nearest neighbors approx. it's not perfect, but it's fast and generally performs ok. assumes your input matrix is a numpy array''' mb = NearestNeighbors(len(amat), metric='manhattan').fit(amat) v = mb.kneighbors(amat) smallest = np.argmin(v[0].sum(axis=1)) return amat[v[1][smallest]] a = (i.strip().split('asdfasdfd') for i in gz('gap.LD.sims.txt.gz')) idx = 0 xvals, yvals = {}, {} seg_sites = {} pos = {} n = [] ctr = 0 true_idx = 0 for i in a: if 'segsites' in i[0]: seg_sites[true_idx] = int(i[0].split()[-1]) if 'positions' in i[0]: pos[true_idx] = [float(jj) for jj in i[0].split()[1:]] if './ms' in i[0]:
'''this function takes in a SNP matrix with indv on rows and returns the same matrix with indvs sorted by genetic similarity. this problem is NP-hard, so here we use a nearest neighbors approx. it's not perfect, but it's fast and generally performs ok. assumes your input matrix is a numpy array''' mb = NearestNeighbors(len(amat), metric='manhattan').fit(amat) v = mb.kneighbors(amat) smallest = np.argmin(v[0].sum(axis=1)) return amat[v[1][smallest]]
# NOTE(review): the line above is the tail of a function whose 'def' line
# is not visible here; it is left untouched.


def convert_01_to_neg1_1(amat):
    '''Convert a standard binary 0/1 ms SNP matrix to a -1/1 SNP matrix.

    Because weights and biases are drawn from a distribution with mean 0,
    choosing -1/1 (which is also centred on 0) tends to help in training.
    The expression maps 0 to -1 and 1 to 1.

    Assumes the input matrix is a numpy array.
    '''
    return (amat * -2 + 1) * -1


# Lazily stream the gzipped simulation output.  The split() separator is
# presumably chosen because it never occurs, so every stripped line ends
# up wrapped in a one-element list and i[0] below is the whole line.
a = (i.strip().split('asdfasdfd') for i in gz('all.auto.tet.LD.sims.txt.gz'))
k = range(48)
idx = 0
xvals, yvals = {}, {}
seg_sites = {}
pos = {}
n = []
ctr = 0
true_idx = 0
for i in a:
    # 'segsites' line: keep its last whitespace-separated token as an int.
    if 'segsites' in i[0]:
        seg_sites[true_idx] = int(i[0].split()[-1])
    # 'positions' line: every token after the first is parsed as a float.
    if 'positions' in i[0]:
        pos[true_idx] = [float(jj) for jj in i[0].split()[1:]]
    # NOTE(review): the body of the following branch is not visible here.
    if './ms' in i[0]:
def convert_01_to_neg1_1(amat):
    '''Map a binary 0/1 ms SNP matrix onto -1/1.

    Weights and biases are drawn from a zero-mean distribution, so feeding
    -1/1 (also centred on zero) rather than 0/1 tends to help training.
    0 becomes -1 and 1 stays 1.  Assumes amat is a numpy array.
    '''
    flipped = amat * -2 + 1
    return flipped * -1


def rsquare(x, y):
    '''Return the squared off-diagonal entry of np.corrcoef(x, y).

    For 1-D x and y this is the squared Pearson correlation (r-squared).
    '''
    r = np.corrcoef(x, y)[0][1]
    return r**2


def rmse(x, y):
    '''Return the root-mean-square difference between x and y.'''
    squared_error = (x - y)**2
    return np.sqrt(np.mean(squared_error))


# Lazily stream the gzipped simulation output.  The split() separator is
# presumably chosen because it never occurs, so every stripped line ends
# up wrapped in a one-element list and i[0] below is the whole line.
a = (i.strip().split('asdfasdfd') for i in gz('autotet.test.data.LD.sims.txt.gz'))
k = range(48)
idx = 0
xvals, yvals = {}, {}
seg_sites = {}
pos = {}
n = []
ctr = 0
true_idx = 0
maxL = 0
for i in a:
    if 'segsites' in i[0]:
        # Last whitespace-separated token of a 'segsites' line, as an int.
        seg_sites[true_idx] = int(i[0].split()[-1])
        # Track the largest count seen so far.
        maxL = max(maxL, seg_sites[true_idx])