def test__mapping_repr(display_max_rows, n_vars, n_attr) -> None:
    long_name = "long_name"
    a = defchararray.add(long_name, np.arange(0, n_vars).astype(str))
    b = defchararray.add("attr_", np.arange(0, n_attr).astype(str))
    c = defchararray.add("coord", np.arange(0, n_vars).astype(str))
    attrs = {k: 2 for k in b}
    coords = {_c: np.array([0, 1]) for _c in c}
    data_vars = dict()
    for (v, _c) in zip(a, coords.items()):
        data_vars[v] = xr.DataArray(
            name=v,
            data=np.array([3, 4]),
            dims=[_c[0]],
            coords=dict([_c]),
        )
    ds = xr.Dataset(data_vars)
    ds.attrs = attrs

    with xr.set_options(display_max_rows=display_max_rows):
        # Parse the data_vars print and show only data_vars rows:
        summary = formatting.dataset_repr(ds).split("\n")
        summary = [v for v in summary if long_name in v]
        # The length should be less than or equal to display_max_rows:
        len_summary = len(summary)
        data_vars_print_size = min(display_max_rows, len_summary)
        assert len_summary == data_vars_print_size

        summary = formatting.data_vars_repr(ds.data_vars).split("\n")
        summary = [v for v in summary if long_name in v]
        # The length should be equal to the number of data variables:
        len_summary = len(summary)
        assert len_summary == n_vars

        summary = formatting.coords_repr(ds.coords).split("\n")
        summary = [v for v in summary if "coord" in v]
        # The length should be equal to the number of coordinates:
        len_summary = len(summary)
        assert len_summary == n_vars

    with xr.set_options(
        display_max_rows=display_max_rows,
        display_expand_coords=False,
        display_expand_data_vars=False,
        display_expand_attrs=False,
    ):
        actual = formatting.dataset_repr(ds)
        col_width = formatting._calculate_col_width(ds.variables)
        dims_start = formatting.pretty_print("Dimensions:", col_width)
        dims_values = formatting.dim_summary_limited(
            ds, col_width=col_width + 1, max_rows=display_max_rows
        )
        expected = f"""\
<xarray.Dataset>
{dims_start}({dims_values})
Coordinates: ({n_vars})
Data variables: ({n_vars})
Attributes: ({n_attr})"""
        expected = dedent(expected)
        assert actual == expected
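# Illustrative sketch (not part of the test above): what defchararray.add produces for the
# name arrays used in this test, shown here for n_vars=3. It is a broadcast, element-wise
# string concatenation.
import numpy as np
from numpy.core import defchararray

n_vars = 3
print(defchararray.add("long_name", np.arange(0, n_vars).astype(str)))
# ['long_name0' 'long_name1' 'long_name2']
print(defchararray.add("coord", np.arange(0, n_vars).astype(str)))
# ['coord0' 'coord1' 'coord2']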
def render(player, comp):
    c_coor = np.chararray((3, 3))
    c_coor[:] = "X"
    p_coor = np.chararray((3, 3))
    p_coor[:] = "O"
    print(char.add(char.multiply(c_coor, comp.astype(int)),
                   char.multiply(p_coor, player.astype(int))))
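# Hedged sketch of the same board-rendering idea, using plain string arrays instead of the
# deprecated np.chararray; the 0/1 move masks below are invented sample data.
import numpy as np

comp = np.array([[1, 0, 0], [0, 0, 0], [0, 0, 1]])
player = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 0]])
board = np.char.add(np.char.multiply(np.full((3, 3), "X"), comp),
                    np.char.multiply(np.full((3, 3), "O"), player))
print(board)
# [['X' 'O' '']
#  ['O' '' '']
#  ['' '' 'X']]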
def TAD_bins(arr):
    """
    Returns TADs as objects from their coordinates.
    """
    if arr.shape[0]:
        _vector_str = np.vectorize(str)
        return npchar.add(_vector_str(arr[:, 0]),
                          npchar.add(",", _vector_str(arr[:, 1])))
    return arr
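# Illustration of the "start,end" strings the function above returns, for a made-up
# coordinate array:
import numpy as np
from numpy import char as npchar

arr = np.array([[100, 250], [300, 480]])
_vector_str = np.vectorize(str)
print(npchar.add(_vector_str(arr[:, 0]), npchar.add(",", _vector_str(arr[:, 1]))))
# ['100,250' '300,480']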
def set_given_hopping(self, n, size, dic, mask, upper_part):
    '''
    Private method.
    Fill self.hop.

    :param n: Integer. Hopping type.
    :param size: Integer. Number of hoppings.
    :param dic: Dictionary. Hopping dictionary.
    :param mask: np.ndarray. Mask.
    :param upper_part: Boolean. If True, self.hop['i'] < self.hop['j'].
    '''
    hop = np.empty(size, dtype=[('n', 'u2'), ('i', 'u4'), ('j', 'u4'),
                                ('ang', 'f8'), ('tag', 'S2'), ('t', 'c16')])
    hop['n'] = dic['n']
    hop['t'] = dic['t']
    if upper_part:
        hop['i'] = self.store_hop[n]['i'][mask]
        hop['j'] = self.store_hop[n]['j'][mask]
        hop['ang'] = self.store_hop[n]['ang'][mask]
        hop['tag'] = self.store_hop[n]['tag'][mask]
    else:
        hop['i'] = self.store_hop[n]['j'][mask]
        hop['j'] = self.store_hop[n]['i'][mask]
        hop['ang'] = self.store_hop[n]['ang'][mask] - 180
        hop['tag'] = npc.add(self.lat.coor['tag'][hop['i']],
                             self.lat.coor['tag'][hop['j']])
    return hop
def set_hopping_manual(self, dict_hop, upper_part=True):
    '''
    Set hoppings manually.

    :param dict_hop: Dictionary of hoppings.
        key: hopping indices, val: hopping values.
    :param upper_part: Boolean.

        * True, fill the Hamiltonian upper part.
        * False, fill the Hamiltonian lower part.
    '''
    hop = np.zeros(len(dict_hop), dtype=[('n', 'u2'), ('i', 'u4'), ('j', 'u4'),
                                         ('ang', 'f8'), ('tag', 'S2'), ('t', 'c16')])
    i = [h[0] for h in dict_hop.keys()]
    j = [h[1] for h in dict_hop.keys()]
    t = [val for val in dict_hop.values()]
    hop['i'], hop['j'] = i, j
    hop['t'] = t
    hop['tag'] = npc.add(self.lat.coor['tag'][i], self.lat.coor['tag'][j])
    ang = 180 / PI * np.arctan2(
        self.lat.coor['y'][j] - self.lat.coor['y'][i],
        self.lat.coor['x'][j] - self.lat.coor['x'][i])
    if upper_part:
        ang[ang < 0] += 180
    else:
        ang[ang >= 0] -= 180
    hop['ang'] = ang
    self.hop = np.concatenate([self.hop, hop])
def paral(path, d):
    probs = pd.read_csv("%s/%s.csv" % (direct, d), header=None).as_matrix()[:, 1].astype(float)
    # probs = np.loadtxt("%s/%s.csv" % (direct, d))[:, 1].astype(float)
    repeat = np.loadtxt("rep_trips/%s.txt" % d)[:, 1].astype(int)
    new_p = probs
    new_p[np.bitwise_and(repeat == 1, 1 > 0.5)] = 1.0
    indexes = np.arange(1, 201).astype(str)
    d_id = (np.ones(200) * int(d)).astype(int).astype(str)
    und = np.ones(200).astype(str)
    und[:] = "_"
    first_column = ncd.add(d_id, und)
    first_column = ncd.add(first_column, indexes)
    second_column = np.array(["%.8f" % p for p in new_p])
    outp = np.vstack((first_column, second_column)).T
    np.savetxt("%s/%s.csv" % (path, d), outp, fmt="%s", delimiter=",")
    print(d)
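# Illustration (with a made-up driver id and three trips) of how the "<driver>_<trip>"
# ID column above is assembled from numeric arrays via numpy's defchararray.add.
import numpy as np
from numpy.core import defchararray as ncd

d = "12"
indexes = np.arange(1, 4).astype(str)
d_id = (np.ones(3) * int(d)).astype(int).astype(str)
und = np.full(3, "_")
first_column = ncd.add(ncd.add(d_id, und), indexes)
print(first_column)  # ['12_1' '12_2' '12_3']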
def dbscan_sub_clus(df, i):
    '''
    Does another level of clustering on the basis of DBZ values.

    PARAMETERS:
        eps2, min_pts2 are the parameters of the DBZ level of clustering
        i is the cluster label
        df is a dataframe of only those points belonging to the cluster i
    '''
    db = DBSCAN(min_samples=min_pts2, eps=eps2)
    db.fit(df[['dbz']])
    # new label is formed as '<dbz label>_<cluster label i>'
    lab = add(db.labels_.astype(str), '_' + str(i))
    return lab  # returns string labels
def predict_driver_probs(d, X_out, y_out):
    Xt, yt = utils.load_driver_pca(d)
    X_out, y_out = utils.load_outliers_pca(d, _samples - 200)
    yt[:] = 1
    # rs = utils.clean_noise(Xt, _n_clean)
    # Xt = np.vstack((rs["X_clean"], rs["X_noise"]))
    # yt[_n_clean:] = 0
    X = np.vstack((Xt, X_out))
    y = np.hstack((yt, y_out))
    k_fold = cross_validation.KFold(len(X), n_folds=5)
    X, y = shuffle(X, y, random_state=13)
    d_p = np.zeros((200))
    for j, (train, test) in enumerate(k_fold):
        # probas_ = gbrt.fit(X[train], y[train]).predict_proba(X[test])
        gbrt.fit(X[train], y[train])
        regr.fit(X[train], y[train])
        my_p = gbrt.predict_proba(Xt)[:, 1] + regr.predict_proba(Xt)[:, 1]
        d_p += my_p
    d_p /= float(len(k_fold) * 2)
    d_p = (d_p - d_p.min()) / (d_p.max() - d_p.min())
    d_p[d_p > 0.9] = 1.0
    d_p[d_p < 0.1] = 0.0
    indexes = np.arange(1, 201).astype(str)
    d_id = (np.ones(200) * int(d)).astype(int).astype(str)
    und = np.ones(200).astype(str)
    und[:] = "_"
    first_column = ncd.add(d_id, und)
    first_column = ncd.add(first_column, indexes)
    second_column = np.array(["%.8f" % p for p in d_p])
    outp = np.vstack((first_column, second_column)).T
    np.savetxt("subm_%s/%s.csv" % (_hash_id, d), outp, fmt="%s", delimiter=",")
    print(d)
def trace_export(traces, time, x, y):
    index = np.arange(traces.shape[0])
    to_add = [
        ' ',
        x.astype(int).astype('str'),
        ' ',
        y.astype(int).astype('str')
    ]
    header = index.astype('str')
    for entry in to_add:
        header = np_str.add(header, entry)
    header = np.insert(header, 0, 'time (s)')
    export = np.vstack((header, np.vstack((time, traces)).T))
    return export
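# Small sketch of the header construction above: repeated np.char.add appends the rounded
# x/y coordinates to each trace index (the coordinates below are invented sample values).
import numpy as np

index = np.arange(3)
x = np.array([10.2, 11.7, 12.1])
y = np.array([5.5, 6.1, 7.9])
header = index.astype('str')
for entry in [' ', x.astype(int).astype('str'), ' ', y.astype(int).astype('str')]:
    header = np.char.add(header, entry)
print(header)  # ['0 10 5' '1 11 6' '2 12 7']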
def createtable(M):
    s = np.shape(M)
    res = '<table border="2">'
    for i in range(s[0]):
        res = add(res, ' <tr>')
        for j in range(s[1]):
            res = add(res, add(' <td>', add(str(M[i, j]), '</td>')))
        res = add(res, '</tr>')
    res = add(res, '</table>')
    return res
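# Usage sketch for the function above, assuming `add` is numpy's char/defchararray.add as
# in the surrounding snippets (on plain Python strings it simply concatenates them):
import numpy as np

M = np.array([[1, 2], [3, 4]])
print(createtable(M))
# <table border="2"> <tr> <td>1</td> <td>2</td></tr> <tr> <td>3</td> <td>4</td></tr></table>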
def fill_store_hop(self, n):
    '''
    Private method.

    Store in *store_hop* indices (with :math:`i < j`),
    positive angles, and tags of a given type of hopping.
    '''
    ind = np.argwhere(np.isclose(self.dist_uni[n], self.vec_hop['dis'], atol=ATOL))
    ind_up = ind[ind[:, 1] > ind[:, 0]]
    hop = np.zeros(len(ind_up), dtype=[('n', 'u2'), ('i', 'u4'), ('j', 'u4'),
                                       ('ang', 'f8'), ('tag', 'S2')])
    hop['i'] = ind_up[:, 0]
    hop['j'] = ind_up[:, 1]
    hop['ang'] = self.vec_hop['ang'][ind_up[:, 0], ind_up[:, 1]]
    hop['tag'] = npc.add(self.lat.coor['tag'][ind_up[:, 0]],
                         self.lat.coor['tag'][ind_up[:, 1]])
    self.store_hop[n] = hop
def set_hopping_def(self, hopping_def):
    '''
    Set specific hoppings.

    :param hopping_def: Dictionary of hoppings.
        key: hopping indices, val: hopping values.

    Example usage::

        sys.set_hopping_def({(0, 1): 1., (1, 2): -1j})
    '''
    error_handling.empty_hop(self.hop)
    error_handling.set_hopping_def(self.hop, hopping_def, self.lat.sites)
    for key, val in hopping_def.items():
        cond = (self.hop['i'] == key[0]) & (self.hop['j'] == key[1])
        self.hop['t'][cond] = val
        self.hop['ang'] = self.vec_hop['ang'][key[0], key[1]]
        self.hop['tag'] = npc.add(self.lat.coor['tag'][key[0]],
                                  self.lat.coor['tag'][key[1]])
# mmdbins = np.percentile(mmd, np.arange(0, 100.1, 25))
for j in range(1, len(mmdbins)):
    tmpind = np.in1d(mmdind, j)
    if np.sum(tmpind) == 0:
        plot([], [], c=colors[j - 1])
        continue
    x1 = updf.values[tmpind, :].transpose()
    y1 = (tmp[tmpind, :] - tmp[tmpind, 249][:, np.newaxis]).transpose()
    x0 = np.nanmean(x1, axis=1)
    y0 = np.nanmean(y1, axis=1)
    # plot(x0, y0, c=colormapping.to_rgba(tmpmmd[j - 1]))
    plot(x0, y0, c=colors[j - 1])
ax = plt.gca()
ax.legend(npch.add(npch.add(mmdbins[:-1].astype(str), ' - '),
                   mmdbins[1:].astype(str)), loc='best')
plt.ylabel('Altitude (km)')
plt.xlabel('Updraft (m/s)')
plt.title('Flight ' + str(szi) + ' updraft MMD cases ' + str(len(np.squeeze(mmd))))
plt.plot(plt.xlim(), np.array([0, 0]), 'k--')
plt.plot(np.array([0, 0]), plt.ylim(), 'k--')
plt.show()
# except:
#     pass

#%%
# grouped w wind profile by IWC cats
import pandas as pd
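# Minimal sketch (made-up bin edges) of how the legend labels above are built: np.char.add
# concatenates the lower and upper bin edges into "low - high" strings.
import numpy as np

mmdbins = np.array([0.5, 1.0, 1.5, 2.0])
labels = np.char.add(np.char.add(mmdbins[:-1].astype(str), ' - '),
                     mmdbins[1:].astype(str))
print(labels)  # ['0.5 - 1.0' '1.0 - 1.5' '1.5 - 2.0']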
def set(self, *args, merge=False):
    """Main entry point to assign values on the plate.

    Parameters
    ----------
    well : dict or str
        - if dict, well must contain the well identifier as key and the value
          to assign as value, eg: {"A2": "value", "A[3-6]": 42}
        - if str, well is only a well identifier, eg: "G5"
    value : list or str or int or float
        - if list, value should be paired with a multiple-well identifier,
          eg: "B-D[2-5]", ["value1", "value2", "value3"]
    merge : bool (False by default)
        Values on the well are not overridden but added

    Returns
    -------
    BioPlate : BioPlate
        returns the plate instance

    Examples
    --------
    see :ref:`Set-values-on-plate`
    """
    well, value = self._args_analyse(*args)
    if not isinstance(well, str) and isinstance(well, Iterable):
        generator = well.items() if isinstance(well, dict) else well
        for key, val in generator:
            if merge:
                self.set(key, val, merge=True)
            else:
                self.set(key, val)
        return self
    well = BioPlateMatrix(str(well))
    if isinstance(value, list):
        plate_shape = self[well.row, well.column].shape
        len_plate_shape = len(plate_shape)
        if len_plate_shape > 1:
            if well.pos == "R":
                resh_val = np.reshape(value, (plate_shape[0], 1))
            else:
                resh_val = value
            if merge:
                self[well.row, well.column] = ncd.add(self[well.row, well.column], resh_val)
                return self
            self[well.row, well.column] = resh_val
            return self
        else:
            if merge:
                self[well.row, well.column][:len(value)] = ncd.add(
                    self[well.row, well.column][:len(value)], value)
                return self
            self[well.row, well.column][:len(value)] = value
            return self
    if merge:
        self[well.row, well.column] = ncd.add(self[well.row, well.column], value)
        return self
    self[well.row, well.column] = value
    return self
train_labels = all_labels[3000:, :]

# Plot and save distribution of test data
classes, counts = np.unique(test_labels[:, 1], return_counts=True)
plt.figure()
plt.bar(classes, counts)
plt.title('Distribution of retinopathy severity grades in test data')
plt.xlabel('Grade')
plt.ylabel('Count')
plt.savefig('../results/class_distribution_test.png')
class_dist = np.asarray((classes, counts), dtype=int).T
np.savetxt(fname='../results/class_distribution_test.csv', X=class_dist, delimiter=',')

# Plot and save distribution of train data
classes, counts = np.unique(train_labels[:, 1], return_counts=True)
plt.figure()
plt.bar(classes, counts)
plt.title('Distribution of retinopathy severity grades in train data')
plt.xlabel('Grade')
plt.ylabel('Count')
plt.savefig('../results/class_distribution_train.png')
class_dist = np.asarray((classes, counts), dtype=int).T
np.savetxt(fname='../results/class_distribution_train.csv', X=class_dist, delimiter=',')

# Save filenames separately
test_filenames = add(test_labels[:, 0],
                     np.full(shape=test_labels[:, 0].shape, fill_value='.jpeg'))
np.savetxt(fname='../data/test_filenames.txt', X=test_filenames, delimiter='', fmt='%s')
train_filenames = add(train_labels[:, 0],
                      np.full(shape=train_labels[:, 0].shape, fill_value='.jpeg'))
np.savetxt(fname='../data/train_filenames.txt', X=train_filenames, delimiter='', fmt='%s')
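# Hedged sketch of the suffixing idiom used above: np.char.add pairs each basename with a
# '.jpeg' suffix broadcast via np.full (the sample names below are made up).
import numpy as np

names = np.array(['10_left', '10_right', '13_left'])
filenames = np.char.add(names, np.full(shape=names.shape, fill_value='.jpeg'))
print(filenames)  # ['10_left.jpeg' '10_right.jpeg' '13_left.jpeg']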
def fit(self):
    # Results
    _cols_x = ['x%d' % i for i in range(self.n_parameters)]
    self.hist_ = pd.DataFrame(index=range((self.iters + 1) * self.n_chromosomes),
                              columns=['iter', ] + _cols_x + ['cost', 'orig', ])
    self.hist_[['orig', ]] = '-1'

    # Initial random population
    self.pop_ = self._random(self.n_chromosomes)
    self.cost_ = self._fitness_function()
    filter_iter = range(0, self.n_chromosomes)
    self.hist_.loc[filter_iter, 'iter'] = 0
    self.hist_.loc[filter_iter, 'cost'] = self.cost_
    self.hist_.loc[filter_iter, _cols_x] = self.pop_

    for i in range(self.iters):
        if self.verbose > 0:
            print('Iteration ' + str(i) + ' of ' + str(self.iters))
        orig = np.empty(self.n_chromosomes, dtype='S10')
        cost_sort = np.argsort(self.cost_)

        # Elitism
        new_pop = np.empty_like(self.pop_)
        new_pop[0:self.n_elite] = self.pop_[cost_sort[0:self.n_elite]]
        orig[0:self.n_elite] = (cost_sort[0:self.n_elite] + i * self.n_chromosomes).astype(str)

        # Cumulative probability of selection as parent
        zcost = (self.cost_ - np.average(self.cost_)) / np.std(self.cost_)
        pzcost = 1 - norm.cdf(zcost)
        pcost = np.cumsum(pzcost / sum(pzcost))

        # Select parents & match
        numparents = self.n_chromosomes - self.n_elite
        # TODO: Add random state
        rand_parents = np.random.rand(numparents, 2)
        parents = np.zeros(rand_parents.shape, dtype=int)
        for parent1 in range(numparents):
            for parent2 in range(2):
                parents[parent1, parent2] = np.searchsorted(pcost, rand_parents[parent1, parent2])
            if self.type_ == 'binary':
                # Binary: random single-point matching
                rand_match = int(np.random.rand() * self.n_parameters)
                child = self.pop_[parents[parent1, 0]]
                child[rand_match:] = self.pop_[parents[parent1, 1], rand_match:]
            else:
                # Continuous
                rand_match = np.random.rand(self.n_parameters)
                child = self.pop_[parents[parent1, 0]] * rand_match
                child += (1 - rand_match) * self.pop_[parents[parent1, 1]]
            new_pop[self.n_elite + parent1] = child
        orig[self.n_elite:] = [','.join(row.astype(str))
                               for row in (parents + i * self.n_chromosomes)]

        # Mutate
        m_rand = np.random.rand(self.n_chromosomes, self.n_parameters)
        m_rand[0:self.n_elite] = 1.0
        mutations = m_rand <= self.per_mutations
        num_mutations = np.count_nonzero(mutations)
        if self.type_ == 'binary':
            new_pop[mutations] = (new_pop[mutations] == 0).astype(int)
        else:
            new_pop[mutations] = self._random(num_mutations)[:, 0]
        rows_mutations = np.any(mutations, axis=1)
        orig[rows_mutations] = add(orig[rows_mutations],
                                   np.array(['_M'] * np.count_nonzero(rows_mutations), dtype='S10'))

        # Replace replicates with random chromosomes
        temp_unique = np.ascontiguousarray(new_pop).view(
            np.dtype((np.void, new_pop.dtype.itemsize * new_pop.shape[1])))
        _, temp_unique_idx = np.unique(temp_unique, return_index=True)
        n_replace = self.n_chromosomes - temp_unique_idx.shape[0]
        if n_replace > 0:
            temp_unique_replace = np.ones(self.n_chromosomes, dtype=bool)
            temp_unique_replace[:] = True
            temp_unique_replace[temp_unique_idx] = False
            new_pop[temp_unique_replace] = self._random(n_replace)
            orig[temp_unique_replace] = '-1'

        self.pop_ = new_pop
        self.cost_ = self._fitness_function()
        filter_iter = range((i + 1) * self.n_chromosomes, (i + 2) * self.n_chromosomes)
        self.hist_.loc[filter_iter, 'iter'] = i + 1
        self.hist_.loc[filter_iter, 'cost'] = self.cost_
        self.hist_.loc[filter_iter, _cols_x] = self.pop_
        self.hist_.loc[filter_iter, 'orig'] = orig

    best = np.argmin(self.cost_)
    self.x = self.pop_[best]
    self.x_cost = self.cost_[best]
DIR = os.path.dirname(os.path.realpath(__file__))
ID_DIR = os.path.join(DIR, 'shapenetcore_ids')
DATASET_DIR = os.path.join(DIR, 'ShapeNetCore.v2/{}'.format(class_id))

if not os.path.exists(DATASET_DIR):
    print("please download the ShapeNetCore v.2 dataset, and place it into the same directory as this file")
    sys.exit(0)

if not os.path.exists(ID_DIR):
    os.mkdir(ID_DIR, 0o777)

obj_ids = np.array(next(os.walk(DATASET_DIR))[1])
obj_ids = add(class_id + '/', obj_ids)
np.random.shuffle(obj_ids)

a = int(float(ratio1) * 0.01 * len(obj_ids))
b = int(float(ratio1 + ratio2) * 0.01 * len(obj_ids))
train, validate, test = obj_ids[:a], obj_ids[a:b], obj_ids[b:]

print('Total: %d' % len(obj_ids))
print('Train: %d' % len(train))
print('Validate: %d' % len(validate))
print('Test: %d' % len(test))

np.savetxt(os.path.join(ID_DIR, '{}_trainids.txt'.format(class_id)), train, fmt='%s')
def main(): parser = argparse.ArgumentParser(description="run motif clustering") parser.add_argument('input_dir', help="The location of tomtom results, bzip'd") parser.add_argument('features', help='path to RSAT features file') parser.add_argument('--filter_motifs', default=True, action='store_true', help="Pre-filter motifs to include; see comments") parser.add_argument('--plot_motifs', default=False, action='store_true', help="Plot motif clusters in separate PDFs") parser.add_argument('--option', type=int, default=1, help="Filtering option, 1 or 2; see comments") parser.add_argument('--mcl_I', type=float, default=2.4, help='mcl I parameter value') args = parser.parse_args() # if option == 1, then we SUM all the log-pvals for multiple occurrences # of the same motif pair, then filter the sums to only those that # are < -10 (same as done for original Halo ensemble) # if option == 2, then we take the LOWEST log-pval over multiple occurrences # of the same motif pair, with no additional filtering. # Note, option (2) was used for Eco and (1) for Halo in MSB EGRIN2 paper. option = args.option print('OPTION:', option) # pre-filter motifs, not implemented yet. (1) remove motifs that are in coding # regions (from fimo table); # (2) filter by motif E-value (3) filter by bicluster residual? pre_filter = False if args.filter_motifs: pre_filter = args.filter_motifs coding_fracs = total_frac = None if pre_filter: """ # necessary only because egrin2-tools has hyphen and can't have hyphens in # python module paths... try: os.symlink('egrin2-tools/src/postproc/coding_fracs.py', 'coding_fracs.py') except: None""" total_frac = cf.get_total_coding_rgn(args.features) cf_files = np.sort(np.array(glob.glob(os.path.join('*/coding_fracs.tsv.bz2')))) coding_fracs = [] for f in cf_files: print(f) cff = pd.read_table(bz2.BZ2File(f), sep='\t') cm_run = os.path.dirname(f) # .split('-')[2] cff['cm_run'] = cm_run if cff.shape[0] > 1: coding_fracs.append(cff) # [f] = cff coding_fracs = pd.concat(coding_fracs, keys=None, ignore_index=True) # this has a hack - for some reason cluster_id in coding_fracs is %04d, # trim first zero to make it %03d ... splitted = npstr.split(coding_fracs.motif.values.astype(str), '_') # see https://stackoverflow.com/a/28286749 clust_id = np.char.mod('_%03d_', np.array([int(i[0]) for i in splitted])) mot_id = np.char.mod('%02d', np.array([int(i[1]) for i in splitted])) mot_id = npstr.add(clust_id, mot_id) coding_fracs['motif_id'] = npstr.add(coding_fracs.cm_run.values.astype(str), mot_id) input_dir = 'tomtom_out' input_dir = args.input_dir # folder with the tomtom files bzip'd files = np.sort(np.array(glob.glob(input_dir + "/*tomtom.tsv.bz2"))) dfs = {} # can pd.concat work on shelved dataframes? YES. Note protocol=2 is faster and smaller. dfs = shelve.open('tomtom_shelf.db', protocol=2, writeback=False) # if using a shelf, once this is done once, you don't have to do it again. 
if len(dfs) != len(files): for f in files: gene = os.path.basename(f).split('.')[0] print(f, gene) if gene in dfs.keys(): continue try: df = pd.read_table(bz2.BZ2File(f), sep='\t') print(df.shape) if df.shape[0] <= 0: continue df = df.ix[df['p-value'] <= 0.01] # 0.1] print(df.shape) df = df.ix[df['#Query ID'] != df['Target ID']] print(df.shape) df = df.ix[df.Overlap >= 6] # same setting as Halo run df = df.drop(['Query consensus', 'Target consensus'], axis=1) if pre_filter: # add the coding fracs to the df: tmp = pd.merge(df, coding_fracs, how='left', left_on='#Query ID', right_on='motif_id') tmp = pd.merge(tmp, coding_fracs, how='left', left_on='Target ID', right_on='motif_id') tmp = tmp.drop(['motif_x', 'cm_run_x', 'motif_id_x', 'motif_y', 'cm_run_y', 'motif_id_y'], axis=1) # drop the motifs with coding fracs greater than # (expected value) + (obs. stddev) / 2 cutoff = total_frac # + coding_fracs.coding_frac.mad() / 2 tmp = tmp.ix[np.logical_or(tmp.coding_frac_x.values < cutoff, tmp.coding_frac_y.values < cutoff)] print(tmp.shape) df = tmp; del tmp dfs[gene] = df except: continue if not os.path.isfile('motifs_tomtom.tsv.bz2'): if type(dfs) == dict: dfs2 = pd.concat(dfs, axis=0) else: dfs2 = pd.concat(dfs.values(), axis=0) print(dfs2.shape) # incase we fail on steps below... dfs2.to_csv(bz2.BZ2File('motifs_tomtom.tsv.bz2', 'w'), sep='\t', index=False, header=True) else: dfs2 = pd.read_table(bz2.BZ2File('motifs_tomtom.tsv.bz2', 'r')) if option == 2: # sort so lower p-values come first (these are kept by drop_duplicates) dfs2.sort('p-value', inplace=True) # no, we sum up the duplicate weights below dfs2.drop_duplicates(['#Query ID', 'Target ID'], inplace=True) print(dfs2.shape) gr = pd.DataFrame({'query': dfs2['#Query ID'].values, 'target': dfs2['Target ID'].values, 'weight': np.round_(-np.log10(dfs2['p-value'].values + 1e-99), 4)}) # igraph cannot read bzipped files (streams) gr.to_csv('motifs_graph.tsv', sep=' ', index=False, header=False) del gr gr2 = ig.Graph.Read_Ncol('motifs_graph.tsv', names=True, weights=True, directed=False) system('bzip2 -fv9 motifs_graph.tsv &') print(gr2.ecount(), gr2.vcount()) # cool! see http://igraph.org/python/doc/igraph.GraphBase-class.html#simplify # add up the weights for duplicated edges into a single edge weight if option == 1: gr2a = gr2.simplify(multiple=True, loops=False, combine_edges=sum) print(gr2a.ecount(), gr2a.vcount()) # see http://igraph.org/python/doc/tutorial/tutorial.html#selecting-vertices-and-edges # used 10 for Halo; use less for fewer runs. returns an EdgeList gr2b = gr2a.es.select(weight_gt=10) gr2b = gr2b.subgraph() # convert to a graph print(gr2b.ecount(), gr2b.vcount()) elif option == 2: gr2a = gr2.simplify(multiple=True, loops=False, combine_edges=max) print(gr2a.ecount(), gr2a.vcount()) gr2b = gr2a del gr2 # no weights used - same as Halo analysis which was best! 
gr2b.write_ncol("mot_metaclustering.txt", weights=None) # now run mcl, latest version from http://www.micans.org/mcl/src/mcl-latest.tar.gz param_I = args.mcl_I cmd = 'mcl mot_metaclustering.txt --abc -I %.1f -v all -te 3 -S 200000' % (param_I) system(cmd) param_I_str = str(param_I).replace('.','') fo = open('out.mot_metaclustering.txt.I%s'%(param_I_str), 'r') lines = fo.readlines() fo.close() lines = [np.array(line.split()) for line in lines] # file contains actual motif ids rather than numbers clusters = lines del lines clust_lens = np.array([len(i) for i in clusters]) print('Clusters with >= 10 motifs:', np.sum(clust_lens >= 10)) print('Total number of motifs:', gr2a.vcount()) print('Number of motifs in >= 10-size clusters:', np.sum(np.array([len(clusters[i]) for i in np.where(clust_lens >= 10)[0]]))) print('Fraction of motifs in >= 10-size clusters:', float(np.sum(np.array([len(clusters[i]) for i in np.where(clust_lens >= 10)[0]]))) / float(gr2a.vcount())) del gr2a # Get info on alignments for each motif cluster dfs2.set_index('#Query ID', drop=False, inplace=True) clust_dfs = {} for i in xrange(len(clusters)): clust = clusters[i] print(i, len(clust)) if i in clust_dfs.keys() or len(clust) < 10 or i > 500: continue df = dfs2.ix[clust] df = df.iloc[np.in1d(df['Target ID'].values, clust)] df = df.sort(['p-value']) df = df.ix[~df.duplicated(['#Query ID', 'Target ID'])] # remove dupes df = df.reset_index(drop=True) df['motif_clust'] = i print(df.shape) clust_dfs[i] = df del dfs2 clust_dfs = pd.concat(clust_dfs, axis=0) print(clust_dfs.shape) # get coding fracs per motif cluster via: # clust_dfs.groupby('motif_clust').mean().coding_frac_x clust_dfs.to_csv(bz2.BZ2File('motif_clusts_%s.tsv.bz2'%(param_I_str), 'w'), sep='\t', index=False, header=True)
if FOLDERNAME == "/home/ole/windows/all_data/emb217/deployments/moorings/TC_Flach/RBR/data" or FOLDERNAME == "/home/ole/windows/all_data/emb217/deployments/moorings/TC_Tief/RBR/data":
    skip_header = 28

temporary_data = np.genfromtxt(datafile_path, skip_header=skip_header,
                               usecols=(0, 1, 2), encoding="iso8859_15", dtype="str")

locale.setlocale(locale.LC_TIME, "C")
print(locale.getlocale())

# convert the strings in the data to datetime objects
days = temporary_data[:, 0]
hours = temporary_data[:, 1]
string_time = np.asarray(add(days, add("-", hours)))
full_utc = np.asarray([
    dt.datetime.strptime(string_time[i], "%d-%b-%Y-%X.%f")
    for i in np.arange(string_time.size)
])

# Error in the time log of one sensor. Wrong by about one hour
# if datafile_path[25:] == "/emb217/deployments/moorings/TC_Flach/RBR/data/EMB217_TC-Chain-flach_016172_eng.txt":
#     full_utc = full_utc - dt.timedelta(hours=1)

full_temperature = temporary_data[:, 2].astype("float")
# temperature can't reasonably be negative
full_temperature[full_temperature < 0] = np.nan

# search for measurement properties in the file
for i in np.arange(np.shape(sensor_positions)[0]):
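# Hedged sketch of the timestamp joining above: date and time columns are concatenated with
# "-" via np.char.add and parsed with strptime. The sample strings are invented, and an
# explicit "%H:%M:%S" is used here in place of the locale-dependent "%X".
import datetime as dt
import numpy as np

days = np.array(["18-Jul-2019", "18-Jul-2019"])
hours = np.array(["12:30:05.5000", "12:30:06.5000"])
string_time = np.char.add(days, np.char.add("-", hours))
utc = [dt.datetime.strptime(s, "%d-%b-%Y-%H:%M:%S.%f") for s in string_time]
print(utc[0])  # 2019-07-18 12:30:05.500000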
def run(): parser = argparse.ArgumentParser(description="Examples: \n" +\ "calc_spectra data/vega.pkl data/vega/ -i 0.000 1.5707963267948966 150; " +\ "calc_spectra data/vega.pkl data/vega/ -i 0.088418; " +\ "calc_spectra data/altair.pkl data/altair/ -i 0.8840; " +\ "calc_spectra data/achernar.pkl data/achernar/ -i 1.0577") parser.add_argument("pkl_sfile", help="the pickled star file") parser.add_argument("output", help="the output directory") parser.add_argument( '-i', type=float, nargs='+', help='either a single inclination in radians ' + 'or a equally spaced values specified by minimum, maximum and number', required=True) parser.add_argument("-m", help="longitudinal integration method: 0=cubic(default), 1=trapezoidal", type=int, \ default=0) args = parser.parse_args() ## inputs pkl_sfile = args.pkl_sfile # pickled star file output = args.output # output location # integration method if args.m == 0: m = 'cubic' elif args.m == 1: m = 'trapezoid' else: sys.exit( "Longitudinal integration method should be either 0 (cubic) or 1 (trapezoidal)." ) # inclinations i = args.i li = len(i) if li not in [1, 3]: sys.exit("Please specify either a single inclination in radians (one number) " +\ "or a range specified by minimum, maximum and step (three numbers).") elif li == 1: inclinations = np.array(i) # decimal precision of inclination for printout prec = 6 elif li == 3: mi, ma, num = i inclinations = np.linspace(mi, ma, num=int(num)) # decimal precision of inclination for printout prec = np.int(np.ceil(-np.log10((ma - mi) / num))) leni = len(inclinations) # unpickle the star with open(pkl_sfile, 'rb') as f: st = pickle.load(f) # get the wavelengths at which we see light from this star wl = st.wavelengths ## write the spectra of the star in text format # create the directory if it doesn't exist if not os.path.exists(output): os.mkdir(output) # filenames if not output.endswith('/'): output += '/' filename = os.path.splitext(os.path.basename(pkl_sfile))[0] inc_str = np.array([("%." + str(prec) + "f") % x for x in np.round(inclinations, decimals=prec)]) ofiles = ch.add(output + filename, inc_str) ofiles = ch.replace(ofiles, '.', '_') ofiles = ch.add(ofiles, '.txt') for i, ofile in np.ndenumerate(ofiles): # message if i[0] % 10 == 0: print( str(i[0]) + " out of " + str(leni) + " inclinations calculated.") sys.stdout.flush() # current inclination inc = inclinations[i] # calculate the spectrum or the magnitudes light = st.integrate(inc, method=m) # create this file if it doesn't exist, open it for writing f = open(ofile, 'w+') # write the header f.write('# luminosity: ' + str(st.luminosity) + '\n') f.write('# omega: ' + str(st.surface.omega) + '\n') f.write('# inclination(rad): ' + str(inclinations[i]) + '\n') f.write('# mass: ' + str(st.mass) + '\n') f.write('# Req: ' + str(st.Req) + '\n') f.write('# distance: ' + format(st.distance, '.2e') + ' cm\n') f.write('# A_V: ' + format(*(st.a_v), '.2f') + '\n') f.write('# number of upper half z values: ' + str(st.map.nz) + '\n') # write the spectrum to the file f.write('\n') if st.bands is None: # spectrum mode f.write('# wavelength(nm)\tflux(ergs/s/Hz/ster)\n') for j, w in np.ndenumerate(wl): f.write(str(w)) f.write('\t %.5E' % light[j]) f.write('\n') else: # photometry mode f.write('# filter\twavelength(nm)\tmagnitude\n') for j, w in enumerate(wl): f.write(st.bands[j]) f.write('\t %.6g' % w) f.write('\t %.8f' % light[j]) f.write('\n') f.close()
        res = add(res, '</tr>')
    res = add(res, '</table>')
    return res


FORREPLACE = createtable(rho)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for z, height in enumerate(abs(rho)):
    ax.bar(np.arange(4), height, zs=z, zdir='y', color='b', alpha=0.8)
plt.savefig('rhobar3')

FORREPLACE = add(FORREPLACE, '<img src="../rhobar3.png" width="500" height="500"><br>')

f = add(add(add(add('<br>Tangle ', str(mean[0])), ' +/- '), str(errs[0])), '\n')
f = add(f, add(add(add(add('<br>Linear Entropy ', str(mean[1])), ' +/- '), str(errs[1])), '\n'))
f = add(f, add(add(add(add('<br>Entropy ', str(mean[2])), ' +/- '), str(errs[2])), '\n'))
f = add(f, add(add('<br>Intensity ', str(intensity)), '\n'))
f = add(f, add(add('<br>fval ', str(fval)), '\n'))
def get_generators(n_total, batch_size, image_shape=None, type='array', zeros_left=5000): ''' Construct generators for training and validation data Zero grade images are downsampled :param n_total: number of total images to use (training plus validation) :param batch_size: batch size used in training :param image_shape: image size used in training :param zeros_left: how many images of grade zero should be left in the pool use a negative value to keep all the zeros :return: train_gen: generator of training data test_gen: generator of validation data ''' # Set the number of training samples n_train = int(np.ceil(n_total * 0.8)) n_test = int(np.floor(n_total * 0.2)) # Read filenames from a text file listing all the images full_filenames = np.genfromtxt('../data/train_filenames.txt', dtype=str) # Read the labels file full_labels = np.genfromtxt('../data/trainLabels.csv', skip_header=1, dtype=str, delimiter=',') # Keep only labels of data that can be used in training full_samples = replace(full_filenames, ".jpeg", "") full_mask = np.isin(full_labels[:, 0], full_samples) trainable_labels = np.copy(full_labels[full_mask, :]) # Downsample the zero grade, keeping only the first 5000 # Randomize order np.random.seed(1234) np.random.shuffle(trainable_labels) # Arrange by a stable sort (mergesort) trainable_labels = np.copy( trainable_labels[trainable_labels[:, 1].argsort(kind='mergesort')]) # Remove extra zeros if zeros_left > 0: _, counts = np.unique(trainable_labels[:, 1], return_counts=True) n_zeros = counts[0] downsampled_labels = np.copy(trainable_labels[(n_zeros - zeros_left):, :]) else: downsampled_labels = np.copy(trainable_labels) # Randomize and choose training data np.random.shuffle(downsampled_labels) train_labels = downsampled_labels[:n_train, :] #test_labels = downsampled_labels[n_train:(n_train + n_test)] # Exclude training samples from the original data and choose test data among them np.random.shuffle(trainable_labels) exclusion = np.isin(trainable_labels[:, 0], train_labels[:, 0], invert=True) valid_labels = np.copy(trainable_labels[exclusion, :]) test_labels = np.copy(valid_labels[:n_test, :]) # Print the counts of each class in test and train data _, train_counts = np.unique(train_labels[:, 1], return_counts=True) print("\nTrain distribution:") print(train_counts / np.sum(train_counts)) _, test_counts = np.unique(test_labels[:, 1], return_counts=True) print("\nTest distribution:") print(test_counts / np.sum(test_counts)) print("\n") if type == 'array': # Add .npy file ending train_filenames = add(train_labels[:, 0], np.full(shape=n_train, fill_value='.npy')) test_filenames = add(test_labels[:, 0], np.full(shape=n_test, fill_value='.npy')) # Add path of the data folder to the files train_filepaths = add( np.full(shape=train_filenames.shape, fill_value='../data/arrays/'), train_filenames) test_filepaths = add( np.full(shape=test_filenames.shape, fill_value='../data/arrays/'), test_filenames) # Create an instance of the image generator train_gen = ArrayGenerator(train_filepaths, train_labels[:, 1], batch_size) test_gen = ArrayGenerator(test_filepaths, test_labels[:, 1], batch_size) elif type == 'image': if image_shape is None: raise ValueError # Add .jpeg file ending train_filenames = add(train_labels[:, 0], np.full(shape=n_train, fill_value='.jpeg')) test_filenames = add(test_labels[:, 0], np.full(shape=n_test, fill_value='.jpeg')) # Add path of the data folder to the files train_filepaths = add( np.full(shape=train_filenames.shape, fill_value='../data/train/'), train_filenames) 
test_filepaths = add( np.full(shape=test_filenames.shape, fill_value='../data/train/'), test_filenames) # Create an instance of the image generator train_gen = ImageGenerator(train_filepaths, train_labels[:, 1], batch_size, image_shape) test_gen = ImageGenerator(test_filepaths, test_labels[:, 1], batch_size, image_shape) return train_gen, test_gen
locUK608 = astropy.coordinates.EarthLocation.from_geodetic(
    lat=51.143833512, lon=-1.433500703, height=176.028)  # UK608 LBA
locIE613 = astropy.coordinates.EarthLocation.from_geocentric(
    3801633.528060000, -529021.899396000, 5076997.185, unit='m')  # IE613 LBA

if args.observatory.startswith('UK608'):
    tstart.location = locUK608
elif args.observatory.startswith('IE613'):
    tstart.location = locIE613
if args.observatory.startswith('UK608'):
    tend.location = locUK608
elif args.observatory.startswith('IE613'):
    tend.location = locIE613

filename = args.filename
sources, durations = sourcelist(filename)
numberofsources = len(sources)

print('Designing observations starting at ', tstart)
print(numberofsources, ' Sources')
# print (sources, durations)

psrnames = add('PSR ', sources)
# print (psrnames, durations)

lst = getlst(tstart, psrnames[0])

# Find first source
if args.strictorder:
    index = 0
else:
    # the last argument is in hours, to be subtracted from the LST to find the first source
    index = findfirstsource(psrnames, lst, 3)

rotated_psrnames = np.roll(psrnames, -index)
rotated_durations = np.roll(durations, -index)
# print (rotated_psrnames)

currenttime = tstart
deadtime = astropy.time.TimeDelta(60, format='sec')
stepwait = astropy.time.TimeDelta(600, format='sec')
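# Small illustration of the 'PSR ' prefixing used above (the source names are just examples):
import numpy as np
from numpy.core.defchararray import add

sources = np.array(['B0329+54', 'B0950+08'])
print(add('PSR ', sources))  # ['PSR B0329+54' 'PSR B0950+08']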