def mp_progress(func, iterable, processes=10, scale=10):
    """Map `func` over `iterable` with a process pool and a progress bar.

    The iterable is split into chunks of ``processes * scale`` items via
    gensim's ``chunkize``; each chunk is mapped through the pool, so the
    progress bar advances once per chunk.

    Returns a flat list of results in input order.
    """
    gensim = utils.import_module("gensim")
    mp = utils.import_module("multiprocessing.pool")
    chunks = list(gensim.utils.chunkize(iterable, processes * scale))
    ret = []
    # Use the pool as a context manager so workers are terminated/joined
    # even on error (the original never closed the pool, leaking processes).
    with mp.Pool(processes) as pool:
        for chunk in utils.tqdm(chunks):
            ret.extend(pool.map(func, chunk))
    return ret
def train_model(agent, dataObj, ep_count=20, batch_size=32, evaluate_every=5):
    """Run `ep_count` training episodes of the trading agent over dataObj.trainDF.

    Every `evaluate_every`-th episode triggers an evaluation pass first.
    Each timestep: act, score the action (open PnL + day PnL), store the
    transition, and replay-train every `batch_size` steps once the memory
    is large enough.  `endEpisode` is always called — on normal completion,
    on interrupt, or on error — so episode bookkeeping is flushed.
    """
    start_ep = agent.episode + 0
    end_ep = start_ep + ep_count
    for i in range(ep_count):
        # periodic evaluation before the (i+1)-th training episode
        if (i + 1) % evaluate_every == 0:
            evaluate_model(agent, dataObj, batch_size=batch_size)
        start_time = getChiTimeNow()
        epsilon_start = agent.epsilon  # exploration rate at episode start (logged by endEpisode)
        total_profit = 0
        data_length = len(dataObj.trainDF) - 1
        avg_loss_array = []
        agent.reset()
        state = agent.getState(dataObj, 0)
        try:
            for t in tqdm(range(data_length), total=data_length, leave=True,
                          desc='Episode {}/{}'.format(agent.episode, end_ep)):
                reward = 0
                # select an action
                action = agent.act(state)
                # reward = open-position PnL + PnL of today's action
                open_pnl = calcOpenPnl(agent, dataObj, t + 1)
                day_pnl = evaluateAction(action, agent, dataObj, t + 1)
                reward = open_pnl + day_pnl
                total_profit += reward
                next_state = agent.getState(dataObj, t + 1)
                done = (t == data_length - 1)
                agent.remember(state, action, reward, next_state, done)
                if len(agent.memory) > batch_size:
                    # train every batch_size
                    if t % batch_size == 0:
                        loss = agent.train_experience_replay(batch_size)
                        avg_loss_array.append(loss)
                state = next_state
            endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                       'train', start_time)
        except (KeyboardInterrupt, SystemExit):
            print('KeyboardInterrupt or SystemExit. Ending current episode.')
            endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                       'train', start_time)
            raise
        except:
            # NOTE(review): bare except catches everything, but it re-raises,
            # so behavior is "flush episode state, then propagate".
            print('Unknown error...Ending current episode.')
            endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                       'train', start_time)
            raise
def metric(self, name):
    """Evaluate the model on the named split ('vali'/'test'/'train').

    Aggregates hit-rate and NDCG per phase, both over all items ("full")
    and over items whose degree is at most the phase's median ("half",
    i.e. rarer items).  Returns (result, formatted_string, mean_loss),
    where `result` is [full_hitrate, full_ndcg, half_hitrate, half_ndcg]
    summed over phases that produced any "half" samples.
    """
    if name == 'vali':
        data = self.data.vali_batch
    elif name == 'test':
        data = self.data.test_batch
    elif name == 'train':
        data = self.data.train_batch
    else:
        raise Exception(f'unknown name: {name}')
    cnt = 0
    max_phase = 11  # phases are indexed 0..10
    full_hitrate_per_phase = [[] for _ in range(max_phase)]
    full_ndcg_per_phase = [[] for _ in range(max_phase)]
    half_hitrate_per_phase = [[] for _ in range(max_phase)]
    half_ndcg_per_phase = [[] for _ in range(max_phase)]
    pbar = tqdm(desc='predicting...', leave=False)
    loss_list = []
    for mv in self.model.metric(data):
        for i in range(len(mv.ndcg)):
            pbar.update(1)
            # if mv.future_seq[0][0] == 0: continue
            phase = mv.phase[i]
            item_deg = self.data.dp.item_deg_per_phase[phase][
                mv.true_item[i]]
            mid_deg = self.data.dp.mid_deg_per_phase[phase]
            # "half" metrics only count items at or below the median degree
            if item_deg <= mid_deg:
                half_hitrate_per_phase[phase].append(mv.hit_rate[i])
                half_ndcg_per_phase[phase].append(mv.ndcg[i])
            full_hitrate_per_phase[phase].append(mv.hit_rate[i])
            full_ndcg_per_phase[phase].append(mv.ndcg[i])
        loss_list.append(mv.loss)
        cnt += 1
        # smoke-test mode: stop after a handful of batches
        if args.run_test and cnt > 10:
            break
    pbar.close()
    result = np.zeros(4, dtype=float)
    for p in range(max_phase):
        # skip phases with no below-median samples (also skips empty phases)
        if half_hitrate_per_phase[p]:
            m1 = np.mean(full_hitrate_per_phase[p])
            m2 = np.mean(full_ndcg_per_phase[p])
            m3 = np.mean(half_hitrate_per_phase[p])
            m4 = np.mean(half_ndcg_per_phase[p])
            m = np.array([m1, m2, m3, m4])
            if args.show_detail:
                print(
                    f'phase: {p}, vali: {format_metric(m)}, full num: {len(full_hitrate_per_phase[p])}, half num: {len(half_hitrate_per_phase[p])}'
                )
            result += m
    loss_mean = np.mean(loss_list)
    return result, format_metric(result), loss_mean
def lddmm_matching(I, J, m=None, lddmm_steps=1000, lddmm_integration_steps=10,
                   reg_weight=1e-1, learning_rate_pose=2e-2,
                   fluid_params=None, progress_bar=True):
    """Matching image I to J via LDDMM.

    Gradient descent on the momentum field `m`: integrate the geodesic
    (expmap), warp I, and minimize MSE(I∘h, J) + reg_weight * kinetic energy.

    Args:
        I, J: image batches; J is moved to I's device.
        m: initial momentum (zeros shaped like I when omitted); may live on
           a coarser grid than I, in which case the map is regridded.
        lddmm_steps: number of descent iterations.
        lddmm_integration_steps: geodesic integration steps.
        reg_weight: weight of the (sharp(m)*m).mean() regularizer.
        learning_rate_pose: momentum step size.
        fluid_params: fluid-metric parameters; defaults to [1.0, .1, .01].
        progress_bar: wrap the loop in tqdm when True.

    Returns:
        (m.detach(), losses, matchterms, regterms) — the last three are
        per-iteration floats.
    """
    # Avoid the shared mutable default argument.
    if fluid_params is None:
        fluid_params = [1.0, .1, .01]
    if m is None:
        defsh = [I.shape[0], 3] + list(I.shape[2:])
        m = torch.zeros(defsh, dtype=I.dtype).to(I.device)
    # True when the momentum grid differs from the image grid.
    do_regridding = m.shape[2:] != I.shape[2:]
    J = J.to(I.device)
    matchterms = []
    regterms = []
    losses = []
    metric = lm.FluidMetric(fluid_params)
    m.requires_grad_()
    pb = range(lddmm_steps)
    if progress_bar:
        pb = tqdm(pb)
    for mit in pb:
        if m.grad is not None:
            m.grad.detach_()
            m.grad.zero_()
        m.requires_grad_()
        h = lm.expmap(metric, m, num_steps=lddmm_integration_steps)
        # BUG FIX: `do_regridding` is a bool, so the original test
        # `if do_regridding is not None:` was always true and regridded
        # even when m already lives on the image grid.
        if do_regridding:
            h = lm.regrid(h, shape=I.shape[2:], displacement=True)
        Idef = lm.interp(I, h)
        regterm = (metric.sharp(m) * m).mean()
        matchterm = mse_loss(Idef, J)
        matchterms.append(matchterm.detach().item())
        regterms.append(regterm.detach().item())
        loss = matchterm + reg_weight * regterm
        loss.backward()
        loss.detach_()
        with torch.no_grad():
            losses.append(loss.detach())
            p = metric.flat(m.grad).detach()
            if torch.isnan(losses[-1]).item():
                print(f"loss is NaN at iter {mit}")
                break
            m.add_(-learning_rate_pose, p)
    return m.detach(), [l.item() for l in losses], matchterms, regterms
def nearest_neighbors(args, samples, dataset):
    """Return, for each sample, its `args.nearest_neighbors` most similar
    dataset strings by bag-of-words cosine similarity.

    Returns a list (one entry per sample) of lists of dataset strings,
    ordered from most to least similar.
    """
    dataset_bow = [create_bag_of_words(dataset[i]["string"])
                   for i in range(len(dataset))]
    samples_bow = [create_bag_of_words(sample) for sample in samples]
    knn = []
    progress = utils.tqdm(total=len(samples), desc="finding knn")
    for sample_bow in samples_bow:
        progress.update(1)
        sims = [(bow_cosine_similarity(sample_bow, dataset_bow[i]),
                 dataset[i]["string"]) for i in range(len(dataset))]
        sims.sort(key=lambda x: x[0], reverse=True)
        knn.append([x[1] for x in sims[:args.nearest_neighbors]])
    # Close the bar so it does not linger (sibling helpers close theirs).
    progress.close()
    return knn
def generate_from(self, zs):
    """Decode the latent codes `zs` batch-by-batch.

    Returns (generations, probabilities), each transposed so that the
    outer index runs over decoding positions rather than batch entries.
    """
    self.model.train(False)
    collected = []
    progress = utils.tqdm(total=len(zs), desc="generating")
    for start in range(0, len(zs), self.batch_size):
        batch = zs[start:start + self.batch_size]
        progress.update(len(batch))
        batch = batch.to(self.device)
        gen_batch, prob_batch = self._generate_step(batch)
        collected.extend(zip(gen_batch, prob_batch))
    progress.close()
    gens, probs = zip(*collected)
    return list(zip(*gens)), list(zip(*probs))
def dump_features_all_item(self, name):
    """Score each user's union candidate set on the named split and pickle it.

    The candidate set per user is the union of items proposed by every
    previously dumped model (file names from `run_for_fuse.all_res`).
    Saves [users, items, logits] to `{for_fuse_dir}/union_{msg}_{name}`
    and returns that path.
    """
    if name == 'vali':
        data = self.data.vali_batch
    elif name == 'test':
        data = self.data.test_batch
    elif name == 'train':
        data = self.data.train_batch
    else:
        raise Exception(f'unknown name: {name}')
    users = []
    items = []
    logits = []
    from run_for_fuse import all_res
    fn_list = all_res.keys()
    user2items = None
    # Build the per-user union of candidate items over all fused dumps.
    for fn in fn_list:
        _users, _items_list, _ = utils.load_pkl(
            f'{utils.for_fuse_dir}/{fn}_{name}')
        if user2items is None:
            user2items = {}
            for _u, _items in zip(_users, _items_list):
                user2items[_u] = set(_items)
        else:
            # every dump must cover exactly the same user set
            assert set(user2items.keys()) == set(_users)
            for _u, _items in zip(_users, _items_list):
                user2items[_u] |= set(_items)
    pbar = tqdm(desc=f'dump {name}, predicting...', leave=False)
    for pv in self.model.predict(data):
        pbar.update(1)
        users.extend(pv.user.tolist())
        for i in range(len(pv.user)):
            user = pv.user[i]
            # sorted for a deterministic item order in the dump
            _items_i = sorted(user2items[user])
            items.append(_items_i)
            logits.append(pv.all_scores[i, _items_i].tolist())
    pbar.close()
    feat = [users, items, logits]
    fn = f'{utils.for_fuse_dir}/union_{args.msg}_{name}'
    print(f'{utils.get_time_str()} dump file {fn}')
    utils.save_pkl(feat, fn)
    print(f'{utils.get_time_str()} dump file {fn} over')
    return fn
def load():
    """Load a time/grid slice of the HDF5 dataframe selected by `conf`.

    Time range: either (conf.t, conf.wnd) — a timestamp plus window of
    rows — or an explicit (conf.tstart, conf.tend).  Columns: either the
    whole grid, or an outer-merge over the configured X/Y sub-grid.
    When conf.diff is set, returns first differences (shifted to align
    with the earlier row) with NaNs replaced by 0.
    """
    print('loading h5 store')
    store = pd.HDFStore(conf.wd + conf.data + '.h5', mode='r')
    full = store['df']
    print('sampling')
    # either use t (timestep) and wnd (window) or tstart and tend
    if conf.t is not None and conf.wnd is not None:
        t = np.searchsorted(full.index, pd.Timestamp(conf.t))
        tstart = full.index[t - 1]
        tend = full.index[t + conf.wnd - 1]
    else:
        tstart = pd.Timestamp(conf.tstart)
        tend = pd.Timestamp(conf.tend)
    print('Loading range', tstart, '-', tend)
    # assumes 0-indexed grid coordinates; full-grid request avoids the merge loop
    if conf.gridXmin == 0 and conf.gridXmax == conf.gridXdim - 1 and conf.gridYmin == 0 and conf.gridYmax == conf.gridYdim - 1:
        used_data_slice = full.loc[slice(tstart, tend), :]
    else:
        used_data_slice = pd.DataFrame()
        # tqdm for progress bar
        # only if gridXmin and gridXmax are defined. Otherwise full dataset is used
        # columns are laid out row-major: column index = x * gridYdim + y
        for i in tqdm(range(conf.gridXmin, conf.gridXmax + 1)):
            x = full.loc[slice(tstart, tend),
                         slice(i * conf.gridYdim + conf.gridYmin,
                               i * conf.gridYdim + conf.gridYmax)]
            used_data_slice = used_data_slice.merge(x, how='outer',
                                                    left_index=True,
                                                    right_index=True,
                                                    copy=False)
    print('sample size: {} = {}%'.format(
        used_data_slice.shape[0] * used_data_slice.shape[1],
        (used_data_slice.shape[0] * used_data_slice.shape[1] /
         (full.shape[0] * full.shape[1])) * 100))
    store.close()
    if conf.diff:
        # Calculates the difference of a DataFrame element compared with another
        # element in the DataFrame (default: same column of the previous row).
        used_data_slice = used_data_slice.diff().shift(-1)
        # hack: drop those snp tickers that screw up the scale
        # p.drop(p.columns[np.where((p > 200).any())[0]], axis=1, inplace=True)
        used_data_slice.fillna(0, inplace=True)
    return used_data_slice
def eval(tokenizer: Tokenizer, model: GPT2LMHeadModel, dataset: MyDataset,
         args: TrainingArguments):
    """Return the mean LM loss of `model` over `dataset`.

    The model is switched to eval mode for the pass and back to train
    mode before returning.
    """
    model.eval()
    iterator = build_data_iterator(tokenizer, dataset, args.eval_batch_size,
                                   args.block_size)
    total = 0
    for ids, attention_mask in tqdm(iterator, desc='eval'):
        ids = ids.to(args.device)
        with torch.no_grad():
            outputs = model(ids,
                            attention_mask=attention_mask.to(args.device),
                            labels=ids)
        total += outputs[0].item()
    model.train()
    return total / len(iterator)
def batch_average(dataloader, **kwargs):
    """Compute the average using streaming batches from a dataloader along a
    given dimension (kwargs are forwarded to ``.mean``)."""
    running = None
    seen = 0
    for (i, img) in tqdm(dataloader, 'image avg'):
        n = img.shape[0]
        batch_mean = img.to('cuda').mean(**kwargs)
        if running is None:
            running = batch_mean
        else:
            # running average of similar-sized numbers: weight by counts
            total = seen + n
            running = running * (seen / total) + batch_mean * (n / total)
        seen += n
    return running
def expo(args):
    """Sweep sigma over {1,2,3,4} * args.sigma, train (or load a cached)
    model for each, and save pcolormesh score plots per score key.

    Models are cached by a hash of their args under
    `{args.work_dir}/checkpoints`; plots land in `args.output_dir`.
    """
    def filename_fn(args):
        # plot-name stem, e.g. "N(radius, sigma)"
        rs = 'N({}, {})'.format(args.radius, args.sigma)
        return rs

    def fpath(fname):
        _fpath = os.path.join(args.output_dir, fname)
        return _fpath

    length = 5 * args.radius
    linspace, data = SyntheticDataset.grid_data(args.num_points, length=length)
    # loader = dataset[args.dataset](args)
    # trainData = loader.train
    # for batch_idx, samples in enumerate(trainData):
    #     data,labels = samples[DatasetType.InD]
    plt.xlim(-1 * length, length)
    plt.ylim(-1 * length, length)
    for scale in tqdm([1, 2, 3, 4]):
        sigma = scale * args.sigma
        # work on a copy so the caller's args are untouched
        scale_args = deepcopy(args)
        scale_args.sigma = sigma
        fname = filename_fn(scale_args)
        checkpoint_dir = os.path.join(args.work_dir, 'checkpoints')
        saver = Saver(checkpoint_dir)  # makes directory if already not present
        payload = saver.load(hash_args(
            scale_args))  # hash_args(scale_args) generates the hex string

        def run_and_save(scale_args):
            # train from scratch, then cache the model under its args hash
            export = main(scale_args)
            payload = export['model']
            saver.save(hash_args(scale_args), payload)
            return payload

        # cache hit (truthy payload) wins; otherwise train and save
        export = payload or run_and_save(scale_args)
        with torch.no_grad():
            scores = inference(export, data)
        np_x = data.cpu().numpy()
        for key in scores:
            score = scores[key].cpu().numpy()
            plot_pcolormesh(np_x, linspace, score)
            score_fname = '{}_{}'.format(fname, key)
            plt.title(score_fname)
            flush_plot(plt, fpath(score_fname) + '.png')
def train_loop(self):
    """Main training loop with early stopping on the weighted validation score.

    Each epoch: run `args.nb_vali_step` fit steps, evaluate on 'vali', and
    checkpoint whenever the weighted metric improves.  Stops after
    `args.early_stopping` consecutive non-improving epochs (not counted
    before `args.min_train_epochs`).  Restores the best checkpoint at the end.
    """
    brk = 0  # consecutive epochs without improvement
    vali_best_w = -1
    for ep in range(args.epochs):
        pbar = tqdm(total=args.nb_vali_step, desc='training', leave=False)
        try:
            train_v = []
            t0 = time.time()
            for _ in range(args.nb_vali_step):
                # dict of per-step training stats
                v = self.model.fit()
                train_v.append(v)
                pbar.update(1)
        finally:
            pbar.close()
        train_time = time.time() - t0
        train_msg = dict_mean(train_v)
        vali_v, vali_str, vali_loss = self.metric('vali')
        # collapse the metric vector into one weighted scalar
        vali_w = np.sum(
            [v * w for v, w in zip(vali_v, self.metric_weights)])
        if vali_w > vali_best_w:
            vali_best_w = vali_w
            self.best_vali = vali_v
            self.model.save(0)
            brk = 0
        else:
            brk += 1
        red = (brk == 0)  # highlight the log line on improvement
        msg = f'#{ep + 1}/{args.epochs} {train_msg}, brk: {brk}, vali: {vali_str}, {vali_loss:.4f}'
        if args.show_test and args.nb_test > 0:
            _, test_str, test_loss = self.metric('test')
            msg = f'{msg}, test: {test_str}'
        vali_time = time.time() - t0 - train_time
        msg = f'{msg}, time: {train_time:.0f}s,{vali_time:.0f}s'
        args.log.log(msg, red=red)
        # never early-stop during the warm-up epochs
        if ep < args.min_train_epochs:
            brk = 0
        if brk >= args.early_stopping:
            break
    # with zero epochs there is no checkpoint yet — save one to restore
    if args.epochs == 0:
        self.model.save(0)
    self.model.restore(0)
def build_graph_item_item(self):
    """Build directed item→item co-occurrence graphs from consecutive pairs
    in each user's interaction sequence, then sample a fixed-size neighbor
    list per item (forward and backward) into `self.neighbors`.
    """
    from tqdm import tqdm
    forward = [defaultdict(int) for _ in range(args.nb_items)]
    backward = [defaultdict(int) for _ in range(args.nb_items)]
    nb_edges = 0
    for u, item_list in tqdm(enumerate(self.user2item_seq),
                             desc='build edges'):
        # consecutive pairs (prev, cur); ids < 3 are skipped
        # (presumably reserved/special ids — mirrors build_graph_user_item)
        for prev, cur in zip(item_list, item_list[1:]):
            if prev >= 3 and cur >= 3:
                forward[prev][cur] += 1
                backward[cur][prev] += 1
                # count an edge once, when it first reaches the threshold
                if forward[prev][cur] == args.gnn_min_edge_cnt:
                    nb_edges += 1
    args.update(nb_edges=nb_edges)
    maxn = args.gnn_adj_length
    neighbors = [[], []]
    for item in tqdm(range(args.nb_items), desc='sample neighbors'):
        neighbors[0].append(self.sample_neighbors(forward[item], maxn))
        neighbors[1].append(self.sample_neighbors(backward[item], maxn))
    self.neighbors = neighbors
def search(self, queries):
    """Return, for each query, its `self.num_neighbors` most similar stored
    sentences by bag-of-words dot product."""
    results = []
    with torch.no_grad():
        total = len(queries)
        for start in utils.tqdm(range(0, total, self.batch_size),
                                desc="searching nearest neighbors"):
            batch = queries[start:start + self.batch_size]
            bows = torch.stack([self.tensorize_bow(q) for q in batch])
            bows = bows.to(self.device)
            # scores[b, j] = <stored sentence j, query b>
            scores = torch.matmul(self.tensors, bows.t()).t()
            top = torch.sort(scores, 1, True)[1][:, :self.num_neighbors]
            for idx in top.cpu().tolist():
                results.append([self.sents[j] for j in idx])
    return results
def affine_matching(I, J, A=None, T=None, affine_steps=100, reg_weightA=1e2,
                    reg_weightT=1e1, learning_rate_A=1e-4,
                    learning_rate_T=1e-2, progress_bar=True):
    """Matching image I to J via affine transform.

    Gradient descent on (A, T); the applied transform is (A + eye, T), so
    A parameterizes the deviation from identity.  L2 penalties pull A and
    T toward zero (i.e. toward the identity transform).

    Returns (A.detach(), T.detach(), losses) with per-iteration loss floats.
    """
    if A is None:
        A = torch.zeros((I.shape[0], 3, 3), dtype=I.dtype).to(I.device)
    if T is None:
        T = torch.zeros((I.shape[0], 3), dtype=I.dtype).to(I.device)
    J = J.to(I.device)
    losses = []
    I.requires_grad_(False)
    J.requires_grad_(False)
    steps = range(affine_steps)
    eye = torch.eye(3).view(1, 3, 3).type(I.dtype).to(I.device)
    if progress_bar:
        steps = tqdm(steps)
    for mit in steps:
        A.requires_grad_(True)
        T.requires_grad_(True)
        if A.grad is not None and T.grad is not None:
            A.grad.detach_()
            A.grad.zero_()
            T.grad.detach_()
            T.grad.zero_()
        Idef = lm.affine_interp(I, A + eye, T)
        # BUG FIX: the original used mse_loss(A, A) / mse_loss(T, T), which
        # is identically zero with zero gradient, so the reg_weight terms
        # had no effect.  Penalize magnitude (distance from identity) instead.
        regtermA = mse_loss(A, torch.zeros_like(A))
        regtermT = mse_loss(T, torch.zeros_like(T))
        loss = mse_loss(
            Idef,
            J) + .5 * reg_weightA * regtermA + .5 * reg_weightT * regtermT
        loss.backward()
        loss.detach_()
        with torch.no_grad():
            losses.append(loss)
            A.add_(-learning_rate_A, A.grad)
            T.add_(-learning_rate_T, T.grad)
    return A.detach(), T.detach(), [l.item() for l in losses]
def evaluate_model(agent, dataObj, debug=False, batch_size=32):
    """Run one greedy (no-exploration) episode over dataObj.trainDF.

    Mirrors train_model's step loop but acts with is_eval=True and calls
    evaluate_experience_replay instead of the training variant.
    `endEpisode` is always invoked — on normal completion, interrupt, or
    error — so episode bookkeeping is flushed.
    """
    print('Evaluating Model')
    start_time = getChiTimeNow()
    epsilon_start = np.nan  # no exploration rate applies during evaluation
    total_profit = 0
    data_length = len(dataObj.trainDF) - 1
    avg_loss_array = []
    agent.reset()
    state = agent.getState(dataObj, 0)
    try:
        for t in tqdm(range(data_length)):
            reward = 0
            # select an action greedily
            action = agent.act(state, is_eval=True)
            # reward = open-position PnL + PnL of today's action
            open_pnl = calcOpenPnl(agent, dataObj, t + 1)
            day_pnl = evaluateAction(action, agent, dataObj, t + 1)
            reward = open_pnl + day_pnl
            total_profit += reward
            next_state = agent.getState(dataObj, t + 1)
            done = (t == data_length - 1)
            # agent.memory.append((state, action, reward, next_state, done))
            # don't know why this line was here instead of the below
            agent.remember(state, action, reward, next_state, done)
            if len(agent.memory) > batch_size:
                # train every batch_size
                if t % batch_size == 0:
                    loss = agent.evaluate_experience_replay(batch_size)
                    avg_loss_array.append(loss)
            state = next_state
        endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                   'eval', start_time)
    except (KeyboardInterrupt, SystemExit):
        print('KeyboardInterrupt or SystemExit. Ending current episode.')
        endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                   'eval', start_time)
        raise
    except:
        # NOTE(review): bare except, but it re-raises — flush then propagate.
        print('Unknown error...Ending current episode.')
        endEpisode(agent, dataObj, batch_size, epsilon_start, avg_loss_array, \
                   'eval', start_time)
        raise
def yc(self, frac=1):
    """Yield per-session video lists parsed from the yoochoose data file.

    Each file line is "<sid> <vid:cls:ts>,<vid:cls:ts>,..."; each yielded
    value is a list of [vid, cls, ts] with cls/ts as ints (cls: 0, 1, 2, ...).
    """
    pbar = tqdm(desc='read data', total=self.N)
    # `with` guarantees the handle is closed even if the consumer abandons
    # the generator mid-iteration (the original leaked it on early exit).
    with open(f'{data_home}/yc_1_{frac}/data.txt', 'r') as f:
        for line in f:
            pbar.update(1)
            line = line[:-1]  # strip trailing newline
            sid, vid_list_str = line.split()
            vid_list = []
            for entry in vid_list_str.split(','):
                vid, cls, ts = entry.split(':')
                cls = int(cls)  # cls: 0, 1, 2, ...
                ts = int(ts)
                vid_list.append([vid, cls, ts])
            yield vid_list
    pbar.close()
def train(self, dataloader):
    """Train the model for `self.epochs` epochs over `dataloader`.

    Loss = cross-entropy + (optionally annealed) KLD term.  When
    `self.kld_annealing` is set, the KLD scale starts at 0 and grows by
    that amount per epoch, capped at 1.  A snapshot is written every
    `self.save_period` epochs.
    """
    self.global_step = 0
    self.model.train(True)
    optimizer = self.optimizer_cls(list(self.trainable_params()))
    self.progress = utils.tqdm(total=len(dataloader.dataset),
                               disable=not self.show_progress)
    if self.kld_annealing is not None:
        kld_scale = 0.0
    else:
        kld_scale = 1.0
    for eidx in range(1, self.epochs + 1):
        self.local_step = 0
        stats_cum = collections.defaultdict(float)
        for batch in dataloader:
            optimizer.zero_grad()
            batch_size, x, lens, targets = self.prepare_batch(batch)
            self.global_step += batch_size
            self.local_step += batch_size
            self.progress.update(batch_size)
            ret = self.model(x, lens)
            # "pass" carries the logits; "loss" the model's KLD (may be absent)
            logits, loss_kld = ret.get("pass"), ret.get("loss")
            loss = self.calculate_celoss(logits, targets)
            if loss_kld is not None:
                loss += kld_scale * loss_kld.mean()
            loss.backward()
            optimizer.step()
            stats = {"loss": loss.item()}
            if loss_kld is not None:
                stats["loss-kld"] = kld_scale * loss_kld.mean().item()
                stats["kld-anneal"] = kld_scale
            # sample-weighted running sums for the epoch summary
            for k, v in stats.items():
                stats_cum[f"{k}-cum"] += v * batch_size
            desc = self.report_stats(stats)
            self.progress.set_description(desc)
            self.report_samples(batch.get("string"), logits.max(2)[1], lens)
        stats_cum = {k: v / self.local_step for k, v in stats_cum.items()}
        desc = self.report_stats(stats_cum)
        logging.info(f"[{eidx}] {desc}")
        if self.kld_annealing is not None:
            kld_scale += self.kld_annealing
            kld_scale = min(1.0, kld_scale)
        if eidx % self.save_period == 0:
            self.snapshot(eidx)
def automatic_image_disambiguation(features, queries, select_clusters, gamma=1.0, k=200, n_clusters=None, max_clusters=10, show_progress=False): """ Automatic Image Disambiguation (our method) based on clustering of directions and directed boni. features - n-by-d matrix containing d-dimensional features of n samples. queries - Dictionary mapping query IDs to dictionaries with keys 'relevant' and 'img_id'. 'img_id' gives the ID of the query image and 'relevant' points to a list of IDs of images relevant for this query. select_clusters - Callback function taking a query dictionary with keys 'relevant' and 'img_id' and a list of lists of images for each cluster as arguments and returning a list of indices of selected clusters. gamma - Controls the effect of the cluster selection. For gamma < 1.0, the direction of samples must match the selected direction more exactly for those samples being adjusted, while for very large gamma, even samples in the orthogonal direction will be assigned a highly adjusted distance. k - The number of baseline retrieval results to be used for the initial clustering step. n_clusters - The number of clusters (image senses) to be shown to the user for selection of the relevant clusters. If set to None, the number of clusters will be determined heuristically. max_clusters - Maximum number of clusters. Has only an effect if n_clusters is None. show_progress - If True, a progress bar will be shown (requires tqdm). Returns: re-ranked retrieval results as dictionary mapping query IDs to tuples consisting of an ordered list of retrieved image IDs and a corresponding list of adjusted distances to the query. 
""" # Baseline retrieval retrievals = baseline_retrieval(features, queries, select_clusters) ret_it = tqdm( retrievals.items(), desc='AID', total=len(retrievals), leave=False) if show_progress else retrievals.items() with Pool(initializer=_init_pool, initargs=(features, queries, select_clusters, gamma, k, n_clusters, max_clusters)) as p: return dict(p.imap_unordered(_aid_worker, ret_it, 10))
def encode(self, dataloader):
    """Encode every batch of `dataloader` into posterior parameters.

    Returns (means, stds): two CPU tensors built by concatenating the
    per-batch encoder outputs along dim 0.
    """
    with torch.no_grad():
        self.model.train(False)
        self.step = 0
        progress = utils.tqdm(
            total=len(dataloader.dataset),
            desc=f"encoding distribution",
        )
        mean_chunks, std_chunks = [], []
        for batch in dataloader:
            batch_size, (w, l, i, lens) = self.prepare_batch(batch)
            self.step += batch_size
            progress.update(batch_size)
            mu, sigma = self.model.encode(w, l, i, lens)
            mean_chunks.append(mu.cpu())
            std_chunks.append(sigma.cpu())
        progress.close()
        return torch.cat(mean_chunks, 0), torch.cat(std_chunks, 0)
def generate(self, num_samples):
    """Sample `num_samples` latent codes and decode them into sentences."""
    self.model.train(False)
    z = self.sample_z(num_samples)
    sentences = []
    progress = utils.tqdm(total=num_samples, desc="generating")
    for start in range(0, num_samples, self.batch_size):
        z_batch = z[start:start + self.batch_size]
        progress.update(z_batch.size(0))
        # seed the decoder with a BOS token per batch row
        bos = z.new(z_batch.size(0), 1).fill_(self.bos_idx).long()
        decoded, lens = self.model.decode(z_batch, bos,
                                          eos_idx=self.eos_idx,
                                          max_len=self.max_len)
        decoded = decoded.cpu().tolist()
        lens = lens.cpu().tolist()
        for tokens, length in zip(decoded, lens):
            sentences.append(self.to_sent(tokens[:length]))
    return sentences
def hard_cluster_selection(features, queries, select_clusters, k=200,
                           n_clusters=None, max_clusters=10,
                           show_progress=False):
    """ Hard Cluster Selection as used by CLUE, but on the clusters determined by AID (our method). """
    # Baseline retrieval
    retrievals = baseline_retrieval(features, queries, select_clusters)
    if show_progress:
        work_items = tqdm(retrievals.items(), desc='Hard-Select',
                          total=len(retrievals), leave=False)
    else:
        work_items = retrievals.items()
    # gamma is fixed at 1.0 for hard selection
    init_args = (features, queries, select_clusters, 1.0, k, n_clusters,
                 max_clusters)
    with Pool(initializer=_init_pool, initargs=init_args) as p:
        return dict(p.imap_unordered(_hs_worker, work_items, 10))
def build_graph_user_item(self):
    """Build the bipartite user↔item interaction graph and store fixed-size
    sampled neighbor lists in `self.neighbors` as
    [user→item neighbors, item→user neighbors].
    """
    from tqdm import tqdm
    u2i = [defaultdict(int) for _ in range(args.nb_users)]
    i2u = [defaultdict(int) for _ in range(args.nb_items)]
    for user, item_list in tqdm(enumerate(self.user2item_seq),
                                desc='build edges'):
        for item in item_list:
            # ids < 3 are skipped (presumably reserved/special ids)
            if item >= 3:
                u2i[user][item] += 1
                i2u[item][user] += 1
    maxn = args.gnn_adj_length
    user_neighbors = [
        self.sample_neighbors(u2i[u], maxn) for u in range(args.nb_users)
    ]
    item_neighbors = [
        self.sample_neighbors(i2u[it], maxn) for it in range(args.nb_items)
    ]
    self.neighbors = [user_neighbors, item_neighbors]
def train(self, dataset, device): self.model.train() # dataset_iter = iter(dataset) # np.random.shuffle(dataset) N = sum(1 for _ in deepcopy(dataset)) # print("len of datasets: ", N) loss_total = 0 i = 0 self.optimizer.zero_grad() adjs, atoms, proteins, labels = [], [], [], [] # TODO: 进度条 for _ in tqdm(range(N), ascii=True): data = next(dataset) i = i+1 atom, adj, protein, label = data # TODO: 将Tensor转移到显卡上 if torch.cuda.is_available(): atom, adj, protein, label = atom.cuda(), adj.cuda(), protein.cuda(), label.cuda() adjs.append(adj) atoms.append(atom) proteins.append(protein) labels.append(label) if i % 8 == 0 or i == N: data_pack = pack(atoms, adjs, proteins, labels, device) loss = self.model(data_pack) # loss = loss / self.batch loss.backward() # torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=10) adjs, atoms, proteins, labels = [], [], [], [] else: continue if i % self.batch == 0 or i == N: self.optimizer.step() self.optimizer.zero_grad() loss_total += loss.item() return loss_total
def dump_features(self, name):
    """Predict on the named split and pickle [users, top_items, top_scores].

    Returns the path of the written pickle file.
    """
    if name == 'vali':
        batches = self.data.vali_batch
    elif name == 'test':
        batches = self.data.test_batch
    elif name == 'train':
        batches = self.data.train_batch
    else:
        raise Exception(f'unknown name: {name}')
    users, items, logits = [], [], []
    pbar = tqdm(desc=f'dump {name}, predicting...', leave=False)
    for pv in self.model.predict(batches):
        pbar.update(1)
        users.extend(pv.user.tolist())
        items.extend(pv.top_items.tolist())
        logits.extend(pv.top_scores.tolist())
        # smoke-test mode: stop after a handful of batches
        if args.run_test and pbar.n > 10:
            break
    pbar.close()
    fn = f'{utils.for_fuse_dir}/{args.msg}_{name}'
    print(f'{utils.get_time_str()} dump file {fn}')
    utils.save_pkl([users, items, logits], fn)
    print(f'{utils.get_time_str()} dump file {fn} over')
    return fn
def build_adj(self, G, M):
    """Build a fixed-width adjacency table and matching edge-weight table.

    M: number of adj per node.  Returns [adj, w], each shaped [N, M];
    node 0 is the all-zero padding row.  Over-full neighbor lists are
    down-sampled without replacement; short ones are zero-padded.
    """
    N = args.nb_nodes
    adj = [None] * N
    w = [None] * N
    adj[0] = [0] * M  # node 0 reserved as padding
    w[0] = [0] * M
    rdm = np.random.RandomState(555)  # fixed seed → deterministic sampling
    pbar = tqdm(total=N - 1, desc='building adj')
    for node in range(1, N):
        pbar.update(1)
        nbrs = G.get_adj(node)
        if len(nbrs) > M:
            nbrs = rdm.choice(nbrs, size=M, replace=False).tolist()
        padded = nbrs[:] + [0] * (M - len(nbrs))
        adj[node] = padded
        # missing edges (including padding zeros) get weight 0
        w[node] = [G.edge_cnt.get((node, x), 0) for x in padded]
    pbar.close()
    return [adj, w]
    # (fragment: this first assignment is the tail of an `if` branch whose
    # condition lies above this chunk — presumably "random change fraction?")
    change_points['c'] = np.random.rand(len(change_points))
else:
    change_points['c'] = np.repeat(conf.infChangeFrac, len(change_points))
# segment boundaries: [0, change-point indices..., Nsample]
ranges = np.concatenate(([0], change_points.i))
ranges = np.concatenate((ranges, [conf.Nsample]))
print(ranges)
df = pd.DataFrame()
model = None
assign_model = None
assigns = None
from utils import tqdm
#create data according to a certain model until next change point
for i in tqdm(range(1, len(ranges))):
    # NOTE(review): for i == 1 the second argument is
    # ranges[0] - ranges[-1] (negative) — verify this is intended.
    model, assign_model, assigns = generate_model(
        change_points.corr_coef[i - 2] if i > 1 else INITAL_CORR_COEF,
        ranges[i - 1] - ranges[i - 2], model, assign_model, assigns,
        change_points.t[i - 2] if i > 1 else '',
        change_points.c[i - 2] if i > 1 else 1)
    dat = generate_data(model, assigns, ranges[i] - ranges[i - 1])
    df = df.append(pd.DataFrame(dat), ignore_index=True)
# 5-minute timestamps starting at 2000-01-01
start = pd.to_datetime('2000-01-01 00:00')
df.index = pd.DatetimeIndex(
    [start + pd.to_timedelta(5 * i, unit='m') for i in df.index], name='i')
change_points['time'] = df.index[change_points['i']]
def on_run_started(self, dataloader):
    """Hook: delegate to the base class, then attach a prediction
    progress bar sized to the dataset."""
    ret = super(PredictorWithProgress, self).on_run_started(dataloader)
    self.progress = utils.tqdm(
        total=len(dataloader.dataset),
        desc="predicting",
        disable=not self.show_progress,
    )
    return ret
# (fragment: `parser` and the earlier arguments are defined above this chunk)
parser.add_argument('--save_every', '-se', type=int, default=5)
parser.add_argument('--device', '-d', type=str, default=None)
args = parser.parse_args()
# default to GPU when available
if not args.device:
    args.device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SiameseNet(mode='train', device=args.device)
datagen = DataLoader(Pairloader(split='train'), shuffle=True)
bce_loss = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# BCE training loop over image pairs; postfix shows the running-mean loss
for epoch in range(args.epochs):
    epoch_loss = 0.0
    with tqdm(datagen) as t:
        for i, batch in enumerate(t):
            t.set_description('EPOCH: %i' % (epoch + 1))
            # batch = ((img1, img2), label)
            data1, data2, label = batch[0][0].to(device=args.device), \
                batch[0][1].to(device=args.device), \
                batch[1].to(device=args.device)
            optimizer.zero_grad()
            output = model(data1, data2)
            loss = bce_loss(output, label)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            t.set_postfix(loss=epoch_loss / (i + 1))
def loadAndProcessData(comb_row, config_dir, day_chg_incs, minute_incs,
                       minute_dir=tdm_dir + 'Minute_Files/'):
    """ 1. Loads the minute files from the external hard drive
    2. Creates, saves, and returns all_minutesDF, sec_guideDF, and dailyDF

    comb_row: a row describing the security combination — Sec1 plus any
    number of SecN columns, TrainStartDate and ValEndDate.
    Writes daily_summary.csv, all_minutes.csv, and sec_guide.csv under
    `config_dir + 'Data/'`.
    """
    # all SecN columns other than Sec1 that actually hold a ticker string
    other_secs = comb_row[[
        col for col in comb_row.index if (col[:3] == 'Sec' and col != 'Sec1')
    ]].values
    other_secs = [i for i in other_secs if type(i) == str]
    print('No pre-loaded data found. Loading data for ' + comb_row.Sec1 +
          ' and ' + ','.join(other_secs))
    sec1_minuteDF = readCSV(minute_dir + comb_row.Sec1 + '.csv')
    sec1_minuteDF = sec1_minuteDF.loc[
        (sec1_minuteDF.Date >= comb_row.TrainStartDate)
        & (sec1_minuteDF.Date <= comb_row.ValEndDate)].reset_index(drop=True)
    print('Loading minuteDF for ' + comb_row.Sec2)
    other_secs_minuteDF = readCSV(minute_dir + comb_row.Sec2 + '.csv')[[
        'Product', 'Date', 'Minute', 'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A',
        'C_B', 'C_A', 'Count', 'B_TickImb', 'A_TickImb', 'M_TickImb'
    ]]
    # remaining secondary securities are appended into one long frame
    for sec in other_secs[1:]:
        print('Loading minuteDF for ' + sec)
        other_secs_minuteDF = other_secs_minuteDF.append(
            readCSV(minute_dir + sec + '.csv')[[
                'Product', 'Date', 'Minute', 'O_B', 'O_A', 'H_B', 'H_A',
                'L_B', 'L_A', 'C_B', 'C_A', 'Count', 'B_TickImb', 'A_TickImb',
                'M_TickImb'
            ]], ignore_index=True)
    other_secs_minuteDF = other_secs_minuteDF.loc[
        (other_secs_minuteDF.Date >= comb_row.TrainStartDate)
        & (other_secs_minuteDF.Date <= comb_row.ValEndDate)].reset_index(
            drop=True)
    print('other_secs_minuteDF has ' + str(len(other_secs_minuteDF)) + ' rows.')
    print('sec1_minuteDF has ' + str(len(sec1_minuteDF)) + ' rows.')
    print("readCSVs complete. Subsetting dates...")
    # [x] subset for dates: keep only dates present in every security
    dates_in_common = set(sec1_minuteDF.Date.unique())
    for sec in other_secs:
        print(sec)
        other_sec_dates = set(other_secs_minuteDF.loc[
            other_secs_minuteDF.Product == sec].Date.unique())
        print('removing', [
            str(d)
            for d in sorted(list(dates_in_common.difference(other_sec_dates)))
        ])
        dates_in_common = dates_in_common.intersection(other_sec_dates)
    print(len(dates_in_common), 'dates_in_common')
    sec1_dates_to_remove = set(
        sec1_minuteDF.Date.unique()).difference(dates_in_common)
    print(str(len(dates_in_common)) + ' dates_in_common')
    if len(sec1_dates_to_remove) > 0:
        print('- removing ' + str(len(sec1_dates_to_remove)) +
              ' dates from ' + comb_row.Sec1)
        sec1_minuteDF = sec1_minuteDF.loc[sec1_minuteDF.Date.isin(
            dates_in_common)].reset_index(drop=True)
    for sec in other_secs:
        sec_dates_to_remove = set(
            other_secs_minuteDF.loc[other_secs_minuteDF.Product == sec].Date.
            unique()).difference(dates_in_common)
        if len(sec_dates_to_remove) > 0:
            print('- removing ' + str(len(sec_dates_to_remove)) +
                  ' dates from ' + sec)
            other_secs_minuteDF = other_secs_minuteDF.loc[
                other_secs_minuteDF.Date.isin(dates_in_common)].reset_index(
                    drop=True)
    print('other_secs_minuteDF has ' + str(len(other_secs_minuteDF)) + ' rows.')
    print('sec1_minuteDF has ' + str(len(sec1_minuteDF)) + ' rows.')
    print("Date subset complete. Determining each day's Open/Closes...")
    # [x] determine each day's open and close: the latest first-quote and the
    # earliest last-quote across all securities define the usable session
    dailyDF = pd.DataFrame(columns=['Date', 'Open', 'Close'])
    dailyDF['Date'] = sec1_minuteDF.Date.unique()
    for i in tqdm(range(len(dailyDF))):
        date = dailyDF.loc[i].Date
        lastOpen = sec1_minuteDF.loc[sec1_minuteDF.Date == date].Minute.min()
        firstClose = sec1_minuteDF.loc[sec1_minuteDF.Date == date].Minute.max()
        other_sec_date_subDF = other_secs_minuteDF.loc[
            other_secs_minuteDF.Date == date]
        for sec in other_secs:
            lastOpen = max(
                lastOpen,
                other_sec_date_subDF.loc[other_sec_date_subDF.Product ==
                                         sec].Minute.min())
            firstClose = min(
                firstClose, other_sec_date_subDF.loc[
                    other_sec_date_subDF.Product == sec].Minute.max())
        dailyDF.loc[i, 'Open'] = lastOpen
        dailyDF.loc[i, 'Close'] = firstClose
    dailyDF.Open = dailyDF.Open.dt.strftime(date_format='%H:%M')
    dailyDF.Close = dailyDF.Close.dt.strftime(date_format='%H:%M')
    dailyDF.to_csv(config_dir + 'Data/daily_summary.csv', index=False)
    print(
        "Each day's Open/Closes determination complete. Creating all_minutesDF..."
    )
    # [x] create all_minutesDF
    all_minutesDF = pd.DataFrame(columns=['Date', 'Minute'])
    # enumerate minutes for every session (inclusive, 1-minute frequency)
    for i in range(len(dailyDF)):
        open_dt = pd.to_datetime(
            dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' +
            dailyDF.loc[i].Open, format='%Y-%m-%d %H:%M')
        close_dt = pd.to_datetime(
            dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' +
            dailyDF.loc[i].Close, format='%Y-%m-%d %H:%M')
        minute_range = pd.date_range(start=open_dt, end=close_dt, freq='T')
        day_minutesDF = pd.DataFrame({
            'Date': minute_range.date,
            'Minute': minute_range.values
        })
        all_minutesDF = all_minutesDF.append(day_minutesDF, ignore_index=True)
    #populate minute data: columns are <stem><sec_num>, sec_num 1 = Sec1
    col_stems = [
        'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A', 'C_B', 'C_A', 'Count',
        'B_TickImb', 'A_TickImb', 'M_TickImb'
    ]
    first_minute_populate_stems = [
        'O_B', 'O_A', 'H_B', 'H_A', 'L_B', 'L_A', 'C_B', 'C_A'
    ]
    for sec_num in range(1, len(other_secs) + 2):
        sec_cols = [col_stem + str(sec_num) for col_stem in col_stems]
        for sec_col in sec_cols:
            all_minutesDF[sec_col] = np.nan
    all_minutesDF[[c + '1' for c in col_stems
                   ]] = pd.merge(all_minutesDF[['Minute']],
                                 sec1_minuteDF[['Minute'] + col_stems],
                                 on='Minute', how='left')[col_stems]
    print('Merging into all_minutesDF...')
    for sec_num in range(2, len(other_secs) + 2):
        other_sec = other_secs[sec_num - 2]
        all_minutesDF[[c + str(sec_num) for c in col_stems]] = pd.merge(
            all_minutesDF[['Minute']],
            other_secs_minuteDF[['Minute'] + col_stems].loc[
                other_secs_minuteDF.Product == other_sec],
            on='Minute', how='left')[col_stems]
    print('Getting the first datapoint of each day...')
    #get first datapoint of each day: back-fill the session-open minute with
    #the last quote at or before the open, unless it is >20 minutes stale
    for date in tqdm(dates_in_common):
        # NOTE(review): likely bug — `i` here is the stale index left over
        # from the previous merge loop, and the next line clobbers the
        # `date` loop variable, so every iteration processes the same day.
        # Expect this was meant to look up the dailyDF row matching `date`.
        date = dailyDF.loc[i].Date
        if date in dates_in_common:
            open_dt = pd.to_datetime(
                dailyDF.loc[i].Date.strftime(format='%Y-%m-%d') + ' ' +
                dailyDF.loc[i].Open, format='%Y-%m-%d %H:%M')
            sec1_last_row = sec1_minuteDF.loc[(sec1_minuteDF.Date == date) & (
                sec1_minuteDF.Minute <= open_dt)].iloc[-1]
            if sec1_last_row.Minute < open_dt:
                if (open_dt - sec1_last_row.Minute).seconds / 60 > 20:
                    raise ValueError(
                        'Too much time has elapsed. ' + comb_row.Sec1 +
                        ' open quote is stale at ' +
                        open_dt.strftime(format='%Y-%m-%d %H:%M') + ' by ' +
                        str((open_dt - sec1_last_row.Minute).seconds / 60) +
                        ' minutes.')
                else:
                    # zero the activity columns, carry forward the OHLC quotes
                    all_minutesDF.loc[all_minutesDF.Minute == open_dt,
                                      [c + '1' for c in col_stems]] = 0
                    all_minutesDF.loc[
                        all_minutesDF.Minute == open_dt,
                        [c + '1' for c in first_minute_populate_stems
                         ]] = sec1_last_row[first_minute_populate_stems]
            other_secs_subDF = other_secs_minuteDF.loc[
                (other_secs_minuteDF.Date == date)
                & (other_secs_minuteDF.Minute <= open_dt)]
            for sec_num in range(2, len(other_secs) + 2):
                other_sec = other_secs[sec_num - 2]
                other_sec_last_row = other_secs_subDF.loc[
                    other_secs_subDF.Product == other_sec].iloc[-1]
                if other_sec_last_row.Minute < open_dt:
                    if (open_dt - other_sec_last_row.Minute).seconds / 60 > 20:
                        # NOTE(review): `date.strftime(open_dt=...)` is an
                        # invalid kwarg and would raise TypeError if this
                        # branch is ever hit — presumably meant
                        # open_dt.strftime(format='%Y-%m-%d %H:%M').
                        raise ValueError(
                            "Too much time has elapsed. " + other_sec +
                            " open quote is stale at " +
                            date.strftime(open_dt='%Y-%m-%d %H:%M') + ' by ' +
                            str((open_dt -
                                 other_sec_last_row.Minute).seconds / 60) +
                            ' minutes.')
                    else:
                        all_minutesDF.loc[
                            all_minutesDF.Minute == open_dt,
                            [c + str(sec_num) for c in col_stems]] = 0
                        all_minutesDF.loc[all_minutesDF.Minute == open_dt, [
                            c + str(sec_num)
                            for c in first_minute_populate_stems
                        ]] = other_sec_last_row[first_minute_populate_stems]
    print('Saving all_minutesDF...')
    all_minutesDF.to_csv(config_dir + 'Data/all_minutes.csv', index=False)
    print('Save complete.')
    sec_guideDF = pd.DataFrame({'Sec': [comb_row.Sec1] + list(other_secs)})
    sec_guideDF.to_csv(config_dir + 'Data/sec_guide.csv', index=False)
    return all_minutesDF, dailyDF, sec_guideDF