def selfplay(
        load_file: "The path to the network model files",
        output_dir: "Where to write the games"="data/selfplay",
        holdout_dir: "Where to write the games"="data/holdout",
        output_sgf: "Where to write the sgfs"="sgf/",
        readouts: 'How many simulations to run per move'=100,
        verbose: '>=2 will print debug info, >=3 will print boards' = 1,
        resign_threshold: 'absolute value of threshold to resign at' = 0.95,
        holdout_pct: 'how many games to hold out for evaluation' = 0.05):
    _ensure_dir_exists(output_sgf)
    _ensure_dir_exists(output_dir)

    with timer("Loading weights from %s ... " % load_file):
        network = dual_net.DualNetwork(load_file)
        network.name = os.path.basename(load_file)

    with timer("Playing game"):
        player = selfplay_mcts.play(
            network, readouts, resign_threshold, verbose)

    output_name = '{}-{}'.format(int(time.time()), socket.gethostname())
    game_data = player.extract_data()
    with gfile.GFile(os.path.join(output_sgf, '{}.sgf'.format(output_name)), 'w') as f:
        f.write(player.to_sgf())

    tf_examples = preprocessing.make_dataset_from_selfplay(game_data)

    # Hold out 5% of games for evaluation.
    if random.random() < holdout_pct:
        fname = os.path.join(holdout_dir, "{}.tfrecord.zz".format(output_name))
    else:
        fname = os.path.join(output_dir, "{}.tfrecord.zz".format(output_name))

    preprocessing.write_tf_examples(fname, tf_examples)
def evaluate(
        black_model: 'The path to the model to play black',
        white_model: 'The path to the model to play white',
        output_dir: 'Where to write the evaluation results'='data/evaluate/sgf',
        readouts: 'How many readouts to make per move.'=400,
        games: 'the number of games to play'=16,
        verbose: 'How verbose the players should be (see selfplay)' = 1):
    black_model = os.path.abspath(black_model)
    white_model = os.path.abspath(white_model)

    with timer("Loading weights"):
        black_net = dual_net.DualNetwork(black_model)
        white_net = dual_net.DualNetwork(white_model)

    with timer("%d games" % games):
        players = evaluation.play_match(
            black_net, white_net, games, readouts, verbose)

    for idx, p in enumerate(players):
        fname = "{:s}-vs-{:s}-{:d}".format(black_net.name, white_net.name, idx)
        with open(os.path.join(output_dir, fname + '.sgf'), 'w') as f:
            f.write(sgf_wrapper.make_sgf(p[0].position.recent,
                                         p[0].make_result_string(p[0].position),
                                         black_name=os.path.basename(black_model),
                                         white_name=os.path.basename(white_model)))
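# The snippets in this collection rely on a `timer` helper imported from each
# project's utils module, and its implementation differs from project to
# project. A minimal context-manager sketch consistent with calls such as
# `with timer("Loading weights"):` (and with the optional logger argument seen
# in `with timer(self.name, logger):` further below) might look like the
# following. This is an illustrative assumption, not any project's actual code.
import time
from contextlib import contextmanager


@contextmanager
def timer(message='', logger=None):
    """Time the enclosed block and report '<message> done in N.NN s'."""
    start = time.time()
    yield
    report = '{} done in {:.2f} s'.format(message, time.time() - start)
    if logger is not None:
        logger.debug(report)
    else:
        print(report)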
def main():
    with timer('init'):
        ftt = FeatureToolsTrainV1(
            # input_path="/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/new_input",
            # output_path="/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/new_input",
            input_path="/home/yuyang/02-ds-case/01-Home_Credit/new_input",
            output_path="/home/yuyang/02-ds-case/01-Home_Credit/output",
            debug=False
        )
    with timer('es set'):
        ftt.es_set()
    with timer('dfs run'):
        ftt.dfs_run()
def validate(
        working_dir: 'tf.estimator working directory',
        *tf_record_dirs: 'Directories where holdout data are',
        checkpoint_name: 'Which checkpoint to evaluate (None=latest)'=None,
        validate_name: 'Name for validation set (i.e. selfplay or human)'=None):
    tf_records = []
    with timer("Building lists of holdout files"):
        for record_dir in tf_record_dirs:
            tf_records.extend(gfile.Glob(os.path.join(record_dir, '*.zz')))

    with timer("Validating from {} to {}".format(os.path.basename(tf_records[0]),
                                                 os.path.basename(tf_records[-1]))):
        dual_net.validate(working_dir, tf_records,
                          checkpoint_name=checkpoint_name, name=validate_name)
def nn_1(debug=True):
    df = pd.read_pickle('../output/basic_application_noonehot_f117.pkl.gz')

    with timer('feature construct'):
        X, y, embed_cols, len_embed_cols = embedding_select(df)
        training = y.notnull()
        testing = y.isnull()
        train_id = df[training]['SK_ID_CURR']
        sub_id = df[testing]['SK_ID_CURR']
        print('\nid length {} {}'.format(len(list(train_id)), len(list(sub_id))))
        print('\nid length {} {}'.format(list(train_id)[:5], list(sub_id)[:5]))

    with timer('nn embedding train time'):
        metrics, oof_preds, sub_preds = nn_embedding(X, y, embed_cols, len_embed_cols, debug=debug)

    print('Saving results...')
    print(oof_preds.shape, sub_preds.shape)
    print(oof_preds.head(), sub_preds.head())

    sub = pd.DataFrame()
    train = pd.DataFrame()
    if debug:
        sub['SK_ID_CURR'] = [i for i in range(500)]
        train['SK_ID_CURR'] = [i for i in range(1000)]
    else:
        sub['SK_ID_CURR'] = sub_id
        train['SK_ID_CURR'] = train_id

    print(sub_preds.shape, type(sub_preds))
    sub['TARGET'] = sub_preds
    train['nn_train_pred'] = oof_preds

    if debug:
        sub[['SK_ID_CURR', 'TARGET']].to_csv('../03_Stack/input/sub_nn_ub_debug.csv', index=False)
        train[['SK_ID_CURR', 'nn_train_pred']].to_csv('../03_Stack/input/xpred_nn_ub_debug.csv', index=False)
    else:
        sub[['SK_ID_CURR', 'TARGET']].to_csv('../03_Stack/input/sub_nn_ub_embedding.csv', index=False)
        train[['SK_ID_CURR', 'nn_train_pred']].to_csv('../03_Stack/input/xpred_nn_ub_embedding.csv', index=False)

    print(sub.head(), sub.shape)
    print(train.head(), train.shape)
def __main__():
    global parser
    parser = args_options()
    args = parser.parse_args()
    with timer():
        exit(*main(args))
def get_followers(tw, name):
    # logger.info('get_followers called')
    tw = tw.authorize()
    with timer(logger.info):
        ids = [user for user in tw.cursor(tw.get_followers_ids,
                                          screen_name=name, count=COUNT)]
    return set(ids)
def load_player(model_path):
    print("Loading weights from %s ... " % model_path)
    with timer("Loading weights from %s ... " % model_path):
        network = dual_net.DualNetwork(model_path)
        network.name = os.path.basename(model_path)
    player = MCTSPlayer(network, verbosity=2)
    return player
def evaluate(
        black_model: 'The path to the model to play black',
        white_model: 'The path to the model to play white',
        output_dir: 'Where to write the evaluation results'='sgf/evaluate',
        readouts: 'How many readouts to make per move.'=400,
        games: 'the number of games to play'=16,
        verbose: 'How verbose the players should be (see selfplay)' = 1):
    _ensure_dir_exists(output_dir)

    with timer("Loading weights"):
        black_net = dual_net.DualNetwork(black_model)
        white_net = dual_net.DualNetwork(white_model)

    with timer("%d games" % games):
        evaluation.play_match(
            black_net, white_net, games, readouts, output_dir, verbose)
def data_from_id(tw, ids):
    # logger.info('data_from_id called')
    tw = tw.authorize()
    result = None
    with timer(logger.info):
        result = tw.lookup_user(user_id=ids)
    return result
def main():
    with timer('init'):
        ftt = FeatureToolsTrainV1(
            input_path="/home/ubuntu/01-Home_credit/new_input",
            output_path="/home/ubuntu/01-Home_credit/output",
            # input_path='../new_input',
            # output_path='../output',
            debug=False
        )
    with timer('es set'):
        ftt.es_set()
    with timer('dfs run'):
        ftt.dfs_run()
def main():
    input_path = '/Users/yuyang/02CS/12-ML/01-ML-case/08-new-Home_Credit/input/'
    output_path = 'bureau_balance.csv'

    pb = PrepareBureauBalance(
        input_path=input_path
    )
    with timer('data_prepare'):
        pb.data_prepare()
    with timer('data_transform'):
        pb.data_transform()
    with timer('data_generate'):
        pb.data_generate()
    with timer('data_return'):
        df = pb.data_return()
    with timer('data_save'):
        df.to_csv(output_path, index=False)
def main():
    input_path = '/Users/yuyang/02CS/12-ML/01-ML-case/08-new-Home_Credit/input/'
    output_path = 'previous_application.csv'

    pb = PreparePreviousApplication(
        input_path=input_path
    )
    with timer('data_prepare'):
        pb.data_prepare()
    with timer('data_transform'):
        pb.data_transform()
    with timer('data_generate'):
        pb.data_generate()
    with timer('data_return'):
        df = pb.data_return()
    with timer('data_save'):
        df.to_csv(output_path, index=False)
def __main__():
    global parser
    parser = args_options()
    args = parser.parse_args()
    if args.subs == 'search' and (hasattr(args, 'json') or hasattr(args, 'geojson')):
        print(main(args))
    else:
        with timer():
            exit(*main(args))
def train(chunk_dir, save_file, load_file=None, generation_num=0,
          logdir=None, num_steps=None, verbosity=1):
    tf_records = sorted(gfile.Glob(os.path.join(chunk_dir, '*.tfrecord.zz')))
    tf_records = tf_records[-1 * (WINDOW_SIZE // EXAMPLES_PER_RECORD):]

    print("Training from:", tf_records[0], "to", tf_records[-1])

    n = dual_net.DualNetworkTrainer(save_file)
    with timer("Training"):
        n.train(tf_records, init_from=load_file,
                logdir=logdir, num_steps=num_steps, verbosity=verbosity)
def main():
    gc.enable()
    input_path = "/home/ubuntu/01-Home_credit/new_input"
    output_path = "/home/ubuntu/01-Home_credit/output"
    # input_path = "/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/new_input"
    # output_path = "/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/output"

    with timer('data_load'):
        app_train, pre_app, credit, installment, poscash = load_data(input_path, debug=False)

    with timer('construct set'):
        entity_sets = es_set(app_train, pre_app, credit, installment, poscash)
        del app_train, pre_app, credit, installment, poscash
        gc.collect()

    with timer('dfs run'):
        dfs_run(entity_sets, output_path)
def __main__():
    global parser
    parser = args_options()
    args = parser.parse_args()
    if args.subs == 'search':
        if args.json:
            print(main(args))
            sys.exit(0)
    else:
        with timer():
            exit(*main(args))
def loop():
    """Run gather and train as subprocesses."""
    gather_errors = 0
    while True:
        print("==================================")
        with timer("Gather"):
            gather = subprocess.call("python rl_loop.py gather", shell=True)
        if gather != 0:
            print("Error in gather, retrying")
            gather_errors += 1
            if gather_errors == 3:
                print("Gathering died too many times!")
                sys.exit(1)
            continue
        gather_errors = 0

        with timer("Train"):
            subprocess.call("python rl_loop.py train", shell=True)
        with timer("validate"):
            subprocess.call("python rl_loop.py validate", shell=True)
def train(
        working_dir: 'tf.estimator working directory.',
        chunk_dir: 'Directory where gathered training chunks are.',
        model_save_path: 'Where to export the completed generation.',
        generation_num: 'Which generation you are training.'=0):
    tf_records = sorted(gfile.Glob(os.path.join(chunk_dir, '*.tfrecord.zz')))
    tf_records = tf_records[-1 * (WINDOW_SIZE // EXAMPLES_PER_RECORD):]

    print("Training from:", tf_records[0], "to", tf_records[-1])
    with timer("Training"):
        dual_net.train(working_dir, tf_records, generation_num)
        dual_net.export_model(working_dir, model_save_path)
def evaluate(
        black_model: 'The path to the model to play black',
        white_model: 'The path to the model to play white',
        output_dir: 'Where to write the evaluation results'='sgf/evaluate',
        readouts: 'How many readouts to make per move.'=200,
        games: 'the number of games to play'=20,
        verbose: 'How verbose the players should be (see selfplay)' = 1):
    qmeas.start_time('evaluate')
    _ensure_dir_exists(output_dir)

    with timer("Loading weights"):
        black_net = dual_net.DualNetwork(black_model)
        white_net = dual_net.DualNetwork(white_model)

    winners = []
    with timer("%d games" % games):
        winners = evaluation.play_match(
            black_net, white_net, games, readouts, output_dir, verbose)
    qmeas.stop_time('evaluate')

    white_count = 0
    for win in winners:
        if 'W' in win or 'w' in win:
            white_count += 1
    return white_count * 1.0 / games
def _superdot(self, lhs, rhs, profiler=None):
    try:
        if lhs is None:
            return None
        if rhs is None:
            return None

        if isinstance(lhs, np.ndarray) and lhs.size == 1:
            lhs = lhs.ravel()[0]
        if isinstance(rhs, np.ndarray) and rhs.size == 1:
            rhs = rhs.ravel()[0]

        if isinstance(lhs, numbers.Number) or isinstance(rhs, numbers.Number):
            return lhs * rhs

        if isinstance(rhs, LinearOperator):
            return LinearOperator((lhs.shape[0], rhs.shape[1]),
                                  lambda x: lhs.dot(rhs.dot(x)))

        if isinstance(lhs, LinearOperator):
            if sp.issparse(rhs):
                return LinearOperator((lhs.shape[0], rhs.shape[1]),
                                      lambda x: lhs.dot(rhs.dot(x)))
            else:
                # TODO: ?????????????
                # return lhs.matmat(rhs)
                return lhs.dot(rhs)

        # TODO: Figure out how/whether to do this.
        tm_maybe_sparse = timer()
        lhs, rhs = utils.convert_inputs_to_sparse_if_necessary(lhs, rhs)
        if tm_maybe_sparse() > 0.1:
            pif('convert_inputs_to_sparse_if_necessary in {}sec'.format(tm_maybe_sparse()))

        if not sp.issparse(lhs) and sp.issparse(rhs):
            return rhs.T.dot(lhs.T).T
        return lhs.dot(rhs)
    except Exception as e:
        import sys, traceback
        traceback.print_exc(file=sys.stdout)
        if DEBUG:
            import pdb
            pdb.post_mortem()
        else:
            raise
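# _superdot (and dr_wrt further below) uses `timer` differently from the
# context-manager style above: `tm = timer()` starts a clock, calling the
# returned object as `tm()` yields elapsed seconds, and dr_wrt also calls
# `pause()`/`resume()`. A sketch of such a stopwatch, written as an assumption
# about the interface rather than the project's actual utils.timer:
import time


class timer(object):
    """Stopwatch started on construction; call the instance for elapsed seconds."""

    def __init__(self):
        self._start = time.time()
        self._paused_at = None
        self._paused_total = 0.0

    def pause(self):
        self._paused_at = time.time()

    def resume(self):
        self._paused_total += time.time() - self._paused_at
        self._paused_at = None

    def __call__(self):
        return time.time() - self._start - self._paused_total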
def gather(
        input_directory: 'where to look for games'='data/selfplay/',
        output_directory: 'where to put collected games'='data/training_chunks/',
        examples_per_record: 'how many tf.examples to gather in each chunk'=EXAMPLES_PER_RECORD):
    qmeas.start_time('gather')
    _ensure_dir_exists(output_directory)
    models = [model_dir.strip('/')
              for model_dir in sorted(gfile.ListDirectory(input_directory))[-50:]]

    with timer("Finding existing tfrecords..."):
        model_gamedata = {
            model: gfile.Glob(
                os.path.join(input_directory, model, '*.tfrecord.zz'))
            for model in models
        }
    print("Found %d models" % len(models))
    for model_name, record_files in sorted(model_gamedata.items()):
        print(" %s: %s files" % (model_name, len(record_files)))

    meta_file = os.path.join(output_directory, 'meta.txt')
    try:
        with gfile.GFile(meta_file, 'r') as f:
            already_processed = set(f.read().split())
    except tf.errors.NotFoundError:
        already_processed = set()

    num_already_processed = len(already_processed)

    for model_name, record_files in sorted(model_gamedata.items()):
        if set(record_files) <= already_processed:
            continue
        print("Gathering files for %s:" % model_name)
        for i, example_batch in enumerate(
                tqdm(preprocessing.shuffle_tf_examples(examples_per_record, record_files))):
            output_record = os.path.join(
                output_directory, '{}-{}.tfrecord.zz'.format(model_name, str(i)))
            preprocessing.write_tf_examples(
                output_record, example_batch, serialize=False)
        already_processed.update(record_files)

    print("Processed %s new files" % (len(already_processed) - num_already_processed))
    with gfile.GFile(meta_file, 'w') as f:
        f.write('\n'.join(sorted(already_processed)))
    qmeas.stop_time('gather')
def selfplay_cache_model(
        network: "The path to the network model files",
        output_dir: "Where to write the games"="data/selfplay",
        holdout_dir: "Where to write the games"="data/holdout",
        output_sgf: "Where to write the sgfs"="sgf/",
        readouts: 'How many simulations to run per move'=100,
        verbose: '>=2 will print debug info, >=3 will print boards' = 1,
        resign_threshold: 'absolute value of threshold to resign at' = 0.95,
        holdout_pct: 'how many games to hold out for validation' = 0.05):
    qmeas.start_time('selfplay')
    clean_sgf = os.path.join(output_sgf, 'clean')
    full_sgf = os.path.join(output_sgf, 'full')
    _ensure_dir_exists(clean_sgf)
    _ensure_dir_exists(full_sgf)
    _ensure_dir_exists(output_dir)
    _ensure_dir_exists(holdout_dir)

    with timer("Playing game"):
        player = selfplay_mcts.play(
            network, readouts, resign_threshold, verbose)

    output_name = '{}-{}'.format(int(time.time() * 1000 * 1000), socket.gethostname())
    game_data = player.extract_data()
    with gfile.GFile(os.path.join(clean_sgf, '{}.sgf'.format(output_name)), 'w') as f:
        f.write(player.to_sgf(use_comments=False))
    with gfile.GFile(os.path.join(full_sgf, '{}.sgf'.format(output_name)), 'w') as f:
        f.write(player.to_sgf())

    tf_examples = preprocessing.make_dataset_from_selfplay(game_data)

    # Hold out 5% of games for evaluation.
    if random.random() < holdout_pct:
        fname = os.path.join(holdout_dir, "{}.tfrecord.zz".format(output_name))
    else:
        fname = os.path.join(output_dir, "{}.tfrecord.zz".format(output_name))

    preprocessing.write_tf_examples(fname, tf_examples)
    qmeas.stop_time('selfplay')
def run(self, train, test, logger):
    with timer(self.name, logger):
        self.create_features(train, test)
        logger.debug('[{}] train:{} test:{}'.format(
            self.name, self.train.shape, self.test.shape))
    return self
            entityset=self.__es,
            target_entity="application_train",
            agg_primitives=[Sum, Std, Max, Min, Median, Count, Skew,
                            PercentTrue, Trend, AvgTimeBetween],
            where_primitives=[Std, Max, Min, Median, Count],
            verbose=True,
            chunk_size=150,  # larger chunk_size trades memory for speed: more RAM used, shorter run time
        )
        self.__train_feature.to_csv(os.path.join(self.__output_path, "train_agg_df.csv"), index=True)


def main():
    with timer('init'):
        ftt = FeatureToolsTrainV1(
            # input_path="/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/new_input",
            # output_path="/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/new_input",
            input_path="/home/yuyang/02-ds-case/01-Home_Credit/new_input",
            output_path="/home/yuyang/02-ds-case/01-Home_Credit/output",
            debug=False
        )
    with timer('es set'):
        ftt.es_set()
    with timer('dfs run'):
        ftt.dfs_run()


if __name__ == "__main__":
    with timer('total time'):
        main()
pass os.mkdir(generated_rootfs) os.mkdir(generated_parent_rootfs) fs_dist = define_fs_structure(dist_define) depth = fs_dist["depth"] width = fs_dist["width"] layers_desc = fs_dist["layers"] dist = Distributor(generated_rootfs, depth, width) dist.generate_tree() for ld in layers_desc: with utils.timer("Generating test layer"): for d in ld.values(): for f in d: try: size = f["size"] except KeyError: size = None put_files(dist, f["type"], f["count"], size) parent_dist = Distributor(generated_parent_rootfs, depth, width) parent_dist.generate_tree() for ld in layers_desc: with utils.timer("Generating test parent layer"): for d in ld.values(): for f in d:
if __name__ == '__main__': args = argparser() # get input arguments if args.silent: import warnings warnings.filterwarnings("ignore") print( "==================== Training model {0} on dataset {1} ====================" .format(args.model, args.dataset)) # Load data dataset = np.load('data/regression_datasets/' + args.dataset + '.npz') X, y = dataset['data'], dataset['target'] log_score, rmse_score = [], [] # Train multiple models T = timer() for i in range(args.repeats): print("==================== Model {0}/{1} ====================".format( i + 1, args.repeats)) # Make train/test split Xtrain, Xtest, ytrain, ytest = train_test_split( X, y, test_size=args.test_size, random_state=(i + 1) * args.seed) # Normalize data scaler = preprocessing.StandardScaler() scaler.fit(Xtrain) Xtrain = scaler.transform(Xtrain) Xtest = scaler.transform(Xtest) # Fit and score model T.begin()
# 出力ディレクトリ・パスを準備 outdir_name = '2_paint_out' output_path = utils.make_outdir(image_dir, outdir_name) output_koma_path = utils.make_outdir(output_path, '0_koma') if len(os.listdir(output_koma_path)) >= 3: shutil.rmtree(output_path) output_path = utils.make_outdir(image_dir, outdir_name) output_koma_path = utils.make_outdir(output_path, '0_koma') output_shaved_path = utils.make_outdir(output_koma_path, '0_padding_shave') # paint_out処理: 1st img_path_list = utils.get_path_list(image_dir, args.ext) print('pages:', len(img_path_list) - (args.start + args.end)) with utils.timer('paint_out処理: 1st 切り抜き位置が求められた画像を切り抜き'): odd_cp_list = [] # 奇数idxページのカットポイントを格納 even_cp_list = [] # 偶数idxページのカットポイントを格納 not_cut_img_path_dict = {} exec_paint_out_cut(img_path_list, kind='1st') # 平均切り出し座標を算出 even_page_cp = find_average_point(even_cp_list) odd_page_cp = find_average_point(odd_cp_list) print('lens', len(img_path_list) - len(not_cut_img_path_dict)) # 平均切り出し座標から画像を切り出すループ if not_cut_img_path_dict: with utils.timer('平均切り出し座標から画像を切り出しています'): for idx, img_path in not_cut_img_path_dict.items():
def encode_episode_data(self): # pass """ Encodes data from data["train"] to use in the episode calculations """ # torch.set_grad_enabled(False) dataset = data["train_deleted"] img_embs, cap_embs = timer(self.encode_data, (dataset,)) if opt.cuda: img_embs = img_embs.cuda() cap_embs = cap_embs.cuda() image_caption_distances = timer(pairwise_distances, (img_embs, cap_embs)) topk = torch.topk(image_caption_distances, opt.topk, 1, largest=False) (image_caption_distances_topk, image_caption_distances_topk_idx) = (topk[0], topk[1]) data["image_caption_distances_topk"] = image_caption_distances_topk data["image_caption_distances_topk_idx"] = image_caption_distances_topk_idx del topk del image_caption_distances intra_cap_distance = timer(pairwise_distances, (cap_embs, cap_embs)) select_indices_row = [] select_indices_col = [] for row in data["image_caption_distances_topk_idx"].cpu().numpy(): permutations = list(zip(*itertools.permutations(row, 2))) permutations_list = [list(p) for p in permutations] select_indices_row.extend(permutations_list[0]) select_indices_col.extend(permutations_list[1]) all_dist = intra_cap_distance[select_indices_row, select_indices_col] all_dist = all_dist.view(len(data["train_deleted"][0]), opt.topk, opt.topk -1) all_dist = all_dist.mean(dim=2) # all_img = torch.Tensor(data["train_deleted"][0]) # print(all_img.size()) # print(data["image_caption_distances_topk"].size()) # print(all_dist.size()) # data["all_states"] = torch.cat((all_img, data["image_caption_distances_topk"].cpu(), all_dist.cpu()), 1) # print(data["all_states"].size()) print(data["image_caption_distances_topk"].size()) # data["all_states"] = torch.cat((img_embs, all_dist, data["image_caption_distances_topk"]), dim=1).cpu() data["all_states"] = torch.cat((torch.Tensor(data["train_deleted"][0]), all_dist.cpu(), data["image_caption_distances_topk"].cpu()), dim=1).cpu() print(data["all_states"].size()) # data["images_embed_all"] = img_embs.data.cpu() # data["captions_embed_all"] = cap_embs.data.cpu() # all_dist = all_dist.cpu() # data["all_states"] = all_dist.cpu() # print(data["all_states"].size()) # Testing for fixed index to see if it works # test_idx = 1337 # top_cap_idx = data["image_caption_distances_topk_idx"][test_idx] # top_cap = cap_embs.index_select(0, top_cap_idx) # # top_cap_intra_dist = pairwise_distances(top_cap, top_cap) # # print(top_cap_intra_dist) # top_cap_intra_dist = top_cap_intra_dist[top_cap_intra_dist > 0.0001].view(opt.topk, -1) # top_cap_mean_intra_dist = top_cap_intra_dist.mean(dim=1) # print(top_cap_mean_intra_dist) # print(data["all_states"][test_idx]) del intra_cap_distance del img_embs del cap_embs torch.set_grad_enabled(True)
def train_ppo(env_class, steps, track_eps=25, log_interval=1, solved_at=90.0, continual_solved_at=90.0, care_about=None, num_processes=8, gamma=0.99, MaxT=400, num_steps=128, clip_param=0.3, linear_schedule=True, policy=None, ob_rms=None, eval_envs=None, eval_eps=-1, hidden=-1, entropy_coef=0, linear_schedule_mode=0, lr=3e-4, training_seed=0, verbosity=1, training_method=learn.PPO, log_extras={}, policy_class=learn.PolicyPPO, discrete=False): assert (verbosity in [1, 2]) is_continual = training_method.__name__ in ["PPO_EWC", "PPO_DM"] if is_continual: assert (care_about != None) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") num_env_steps = int(steps) if eval_envs != None: assert (eval_eps > 0) def env_fn(i): env = env_class(discrete=discrete) # env.debug['show_reasons'] = True env = utils.env.wrap_env( env, action_normalize=not discrete, time_limit=MaxT, deterministic=True, seed=i, ) return lambda: env envs = utils.env.vectorize_env( [env_fn(i) for i in range(num_processes)], state_normalize=True, device=device, train=True, ) if ob_rms != None: envs.ob_rms = ob_rms obs_space, action_space = envs.observation_space, envs.action_space init_obs = envs.reset() torch.manual_seed(training_seed) print("training_method = %s" % training_method.__name__) agent = training_method(obs_space, action_space, init_obs, clip_param=clip_param, num_steps=num_steps, lr=lr, num_processes=num_processes, gamma=gamma, policy=policy, hidden=hidden, linear_schedule=linear_schedule, entropy_coef=entropy_coef, linear_schedule_mode=linear_schedule_mode, policy_class=policy_class) num_updates = agent.compute_updates_needed(num_env_steps, num_processes) episode_rewards = collections.deque(maxlen=track_eps) s = collections.deque(maxlen=track_eps) log_dict = { 'r': episode_rewards, 'eps_done': 0, 'satisfactions': s, **log_extras } start = utils.timer() ret_steps = -1 for j in range(num_updates): agent.pre_step(j, num_updates) agent.step(envs, log=log_dict) vloss, piloss, ent = agent.train() if (j + 1) % log_interval == 0 and len(log_dict['r']) > 1: total_num_steps = (j + 1) * num_processes * num_steps elapsed = "Elapsed %s" % utils.timer_done(start) MeanR = np.mean(log_dict['r']) MedR = np.median(log_dict['r']) MinR = np.min(log_dict['r']) MaxR = np.max(log_dict['r']) if verbosity == 1: reward_stats = "MeanR:%.2f" % (MeanR) extra_stats = [reward_stats] elif verbosity == 2: reward_stats1 = "MeanR,MedR:%.2f,%.2f" % (MeanR, MedR) reward_stats2 = "MinR,MaxR:%.2f,%.2f" % (MinR, MaxR) reg_loss = None if type(ent) == list: ent, reg_loss = ent loss_stats = "Ent:%f, VLoss:%f, PiLoss:%f" % (ent, vloss, piloss) if reg_loss is not None: loss_stats += ", Reg:%f" % (reg_loss) extra_stats = [ reward_stats1, reward_stats2, loss_stats, ] reasons = "Reasons: %s" % (set(list(s))) stats = [ "Steps:%g" % total_num_steps, "Eps:%d" % log_dict['eps_done'], elapsed, *extra_stats, ] print(" ".join(stats)) print(reasons) if eval_envs != None: eval_rews = [] for eval_env in eval_envs: eval_rews += [ utils.env.evaluate_ppo(agent.actor_critic, None, eval_env, device, num_episodes=eval_eps, wrap=False, silent=True) ] eval_rews[-1] = round(eval_rews[-1], 2) if is_continual: eval_MeanR = np.mean( np.clip(eval_rews[:care_about], -100., 100.)) if not is_continual and care_about != None: eval_relevant_R = np.clip(eval_rews[care_about - 1], -100., 100.) 
print(eval_rews) # print("") sys.stdout.flush() if MeanR >= solved_at: if eval_envs != None: if is_continual: if eval_MeanR < continual_solved_at: continue if not is_continual and care_about != None: if eval_relevant_R < solved_at: continue print("Model solved! Continue") ret_steps = total_num_steps break if ret_steps == -1: print("Not solved.") ob_rms = utils.env.get_ob_rms(envs) assert (ob_rms != None) envs.close() return agent.actor_critic, ob_rms, ret_steps
request = ToxicImageDetection_pb2.ImageURL() request.urls.extend(test_urls) try: t1 = time.time() response = self.stub.OpenNSFW(request) print(response) t2 = time.time() results[self._no][i] = t2 - t1 except Exception as e: error[self._no] += 1 print(e) continue threads = [TextRequest(i) for i in range(num_threads)] start = time.time() with utils.timer('%s REQUEST' % num_threads): for thread in threads: thread.start() for thread in threads: thread.join() print('Error analysis:') print(np.sum([1 for i in range(len(error)) if(error[i] > 0)]), num_threads) print('Time analysis:') print('mean time cost ') print(results.mean(axis= 1)) print('maximum time cost ') print(results.max(axis= 1)) print('minimum time cost') print(results.min(axis= 1)) #print('success %s/%s ' % (len(np.where(results > 0.5)[0]), num_threads))
        df = inst_.groupby('SK_ID_PREV').head(period).groupby('SK_ID_PREV')[
            ['paid_late', 'paid_early']].agg(['mean', np.count_nonzero])
        df.columns = [f'first_{period}_{f[0]}_{f[1]}' for f in df.columns]
        dfs.append(df)

        df = inst_.groupby('SK_ID_PREV').tail(period).groupby('SK_ID_PREV')[
            ['paid_late', 'paid_early']].agg(['mean', np.count_nonzero])
        df.columns = [f'last_{period}_{f[0]}_{f[1]}' for f in df.columns]
        dfs.append(df)

        df = pd.concat(dfs, axis=1)  # type: pd.DataFrame
        df = df.merge(inst_[['SK_ID_PREV', 'SK_ID_CURR']].drop_duplicates(),
                      left_index=True, right_on='SK_ID_PREV', how='left')
        self.df = df.groupby('SK_ID_CURR').mean()


if __name__ == '__main__':
    args = get_arguments('main')
    with timer('load dataset'):
        train = pd.read_feather(TRAIN)
        test = pd.read_feather(TEST)
        prev = pd.read_feather(PREV)
        inst = pd.read_feather(INST)
        cv_id = pd.read_feather(INPUT / 'cv_id.ftr')
        cv = PredefinedSplit(cv_id)

    # with timer('preprocessing'):

    with timer('create dataset'):
        generate_features(globals(), args.force)
def test(self, epoch=10): self.ckp.write_log('=> Evaluation...') timer_test = utils.timer() upscale = self.args.upscale avg_psnr = {} avg_ssim = {} for scale in upscale: avg_psnr[scale] = 0.0 avg_ssim[scale] = 0.0 for iteration, (input, hr) in enumerate(self.loader_test, 1): has_target = type(hr) == list # if test on demo if has_target: input, hr = self.prepare([input, hr]) else: input = self.prepare([input])[0] sr = self.model(input) save_list = [*sr, input] if has_target: save_list.extend(hr) psnr = {} ssim = {} for i, scale in enumerate(upscale): psnr[scale] = utils.calc_psnr(hr[i], sr[i], int(scale)) ssim[scale] = utils.calc_ssim(hr[i], sr[i]) avg_psnr[scale] += psnr[scale] avg_ssim[scale] += ssim[scale] if self.args.save: if has_target: for i, scale in enumerate(upscale): self.ckp.write_log( '=> Image{} PSNR_x{}: {:.4f}'.format( iteration, scale, psnr[scale])) self.ckp.write_log( '=> Image{} SSIM_x{}: {:.4f}'.format( iteration, scale, ssim[scale])) self.ckp.save_result(iteration, save_list) if has_target: for scale, value in avg_psnr.items(): self.ckp.write_log("=> PSNR_x{}: {:.4f}".format( scale, value / len(self.loader_test))) self.ckp.write_log("=> SSIM_x{}: {:.4f}".format( scale, avg_ssim[scale] / len(self.loader_test))) self.ckp.write_log("=> Total time: {:.1f}s".format(timer_test.toc())) if not self.args.test: self.ckp.save_model(self.model, 'latest') cur_psnr = avg_psnr[upscale[-1]] if self.best_psnr < cur_psnr: self.best_psnr = cur_psnr self.best_epoch = epoch self.ckp.save_model(self.model, '{}_best'.format(self.best_epoch))
# Create model objects
encoder = Encoder(qa_vocab_size, embedding_dim, units, BATCH_SIZE,
                  max_ques_length, embedding_matrix)
decoder = Decoder(qa_vocab_size, embedding_dim, units, BATCH_SIZE,
                  max_ques_length, embedding_matrix)

# Checkpoints (Object-based saving)
checkpoint_dir = os.path.join(os.getcwd(), 'checkpoints')
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(encoder=encoder, decoder=decoder)

# Restoring the latest checkpoint in checkpoint_dir
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

# Get predictions
start_time = timer()
predicted_ans = predict(encoder, decoder, question_encoded, max_ques_length,
                        max_ans_length, word2id, id2word, units, beam_search=False)
print("---- Without Beam Search ----")
print("Original Question:", question)
print("Predicted Answer:", predicted_ans)
timer(start_time)

start_time = timer()
def train_test_and_save_model(): ## load data with utils.timer('Load data'): data_1 = utils.load_cs_deleted_data(cs_delete_file) print('target ratio: ') print(data_1['label'].value_counts()) data_2 = utils.load_58_data(pos_58_file) print(data_2['label'].value_counts()) data_3 = utils.load_58_data(neg_58_file) print(data_3['label'].value_counts()) data = pd.concat([data_1, data_2, data_3], axis= 0, ignore_index= True) DebugDir = '%s/debug' % config.DataBaseDir if(os.path.exists(DebugDir) == False): os.makedirs(DebugDir) #writer = pd.ExcelWriter('%s/raw.xlsx' % DebugDir) #data.to_excel(writer, index= False) #writer.close() del data_3, data_2, data_1 gc.collect() X_raw_words = data['text'].apply(utils.cut) uni_words = list(set([w for rec in X_raw_words for w in rec])) word_dict = dict(zip(uni_words, range(len(uni_words)))) X_words = [] for rec in X_raw_words: new_rec = [] for w in rec: new_rec.append(word_dict[w]) X_words.append(new_rec) # X_words = np.array(X_words) y = np.array(data['label']) if N_GRAM is not None: X_words = np.array([augment_with_ngrams(x, VOCAB_SIZE, N_BUCKETS, n= N_GRAM) for x in X_words]) print(X_words.shape) print(y.shape) print(X_words[:5]) print(y[:5]) final_train_pred = np.zeros(len(X_words)) for s in range(config.train_times): s_start = time.time() train_pred = np.zeros(len(X_words)) classifier = FastTextClassifier( vocab_size=VOCAB_SIZE + N_BUCKETS, embedding_size=EMBEDDING_SIZE, n_labels=2, ) skf = StratifiedKFold(config.kfold, random_state=2018 * s, shuffle=False) for fold, (train_index, valid_index) in enumerate(skf.split(X_words, y)): X_train, X_valid = X_words[train_index], X_words[valid_index] y_train, y_valid = y[train_index], y[valid_index] with tf.Session() as sess: sess.run(tf.local_variables_initializer()) tl.layers.initialize_global_variables(sess) for epoch in range(N_EPOCH): start_time = time.time() print('Epoch %d/%d' % (epoch + 1, N_EPOCH)) for X_batch, y_batch in tl.iterate.minibatches(X_train, y_train, batch_size=BATCH_SIZE, shuffle=True): sess.run( classifier.train_op, feed_dict={ classifier.inputs: tl.prepro.pad_sequences(X_batch), classifier.labels: y_batch, } ) valid_pred_proba = sess.run( classifier.prediction_probs, feed_dict={ classifier.inputs: tl.prepro.pad_sequences(X_valid) } )[:,1] valid_pred_label = utils.proba2label(valid_pred_proba) valid_auc = roc_auc_score(y_valid, valid_pred_proba) valid_precision = precision_score(y_valid, valid_pred_label) valid_recall = recall_score(y_valid, valid_pred_label) if(epoch == N_EPOCH - 1): train_pred[valid_index] = valid_pred_proba # valid_precision = sess.run( # classifier.precision, feed_dict={ # classifier.inputs: tl.prepro.pad_sequences(X_valid), # classifier.labels: y_valid, # } # ) # valid_recall = sess.run( # classifier.recall, feed_dict={ # classifier.inputs: tl.prepro.pad_sequences(X_valid), # classifier.labels: y_valid, # } # ) print('valid: auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (valid_auc, valid_precision, valid_recall, int(time.time() - start_time))) classifier.save(sess, MODEL_FILE_PATH) print('fold %s done!!!' % fold) auc = roc_auc_score(y, train_pred) precision = precision_score(y, utils.proba2label(train_pred)) recall = recall_score(y, utils.proba2label(train_pred)) print('auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (auc, precision, recall, int(time.time() - s_start)))
NAME = Path(__file__).stem
print(NAME)

feats = [
    'main_numeric', 'main_amount_pairwise', 'main_category', 'main_ext_pairwise',
    'bureau', 'prev', 'pos', 'credit', 'pos_latest', 'credit_latest',
    'bureau_active_count', 'bureau_enddate', 'bureau_amount_pairwise', 'bureau_prolonged',
    'main_ext_null', 'prev_basic', 'prev_category_count', 'prev_category_tfidf',
    'prev_product_combination', 'main_document', 'main_enquiry', 'main_day_pairwise',
    'main_amount_per_person', 'main_ext_round', 'inst_basic_direct', 'inst_basic_via_prev',
    'inst_latest', 'inst_ewm', 'inst_basic_direct', 'inst_basic_via_prev'
]

with timer('load datasets'):
    X_train, y_train, X_test, _ = load_dataset(feats)
    cv = StratifiedKFold(5, shuffle=True, random_state=71)
    print('train:', X_train.shape)
    print('test :', X_test.shape)

lgb_params = {
    'n_estimators': 4000,
    'learning_rate': 0.05,
    'num_leaves': 34,
    'colsample_bytree': 0.95,
    'subsample': 0.85,
    'reg_alpha': 0.05,
    'reg_lambda': 0.075,
    'min_split_gain': 0.02,
    'min_child_weight': 40,
            agg_primitives=[Sum, Std, Max, Min, Median, Count,
                            PercentTrue, Trend, AvgTimeBetween],
            where_primitives=[Std, Max, Min, Median, Count],
            verbose=True,
            chunk_size=120,  # larger chunk_size trades memory for speed: more RAM used, shorter run time
        )
        self.__train_feature.to_csv(os.path.join(self.__output_path, "train_pre_agg_df.csv"), index=True)


def main():
    with timer('init'):
        ftt = FeatureToolsTrainV1(
            input_path="/home/ubuntu/01-Home_credit/new_input",
            output_path="/home/ubuntu/01-Home_credit/output",
            # input_path='../new_input',
            # output_path='../output',
            debug=False
        )
    with timer('es set'):
        ftt.es_set()
    with timer('dfs run'):
        ftt.dfs_run()


if __name__ == "__main__":
    with timer('sum time'):
        main()
def load_data(data_path):
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')
    u_file = os.path.join(data_path, 'trained/warm/U.csv.bin')
    v_file = os.path.join(data_path, 'trained/warm/V.csv.bin')
    user_content_file = os.path.join(data_path, 'user_features_0based.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test_warm.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_warm_item_ids.csv')
    test_cold_user_file = os.path.join(split_folder, 'test_cold_user.csv')
    test_cold_user_iid_file = os.path.join(split_folder, 'test_cold_user_item_ids.csv')
    test_cold_item_file = os.path.join(split_folder, 'test_cold_item.csv')
    test_cold_item_iid_file = os.path.join(split_folder, 'test_cold_item_item_ids.csv')

    dat = {}
    # load preference data
    timer.tic()
    u_pref = np.fromfile(u_file, dtype=np.float32).reshape(n_users, 200)
    v_pref = np.fromfile(v_file, dtype=np.float32).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref
    timer.toc('loaded U:%s,V:%s' % (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content, _ = datasets.load_svmlight_file(user_content_file, zero_based=True, dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' % (str(user_content.shape))).tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file, zero_based=True, dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' % (str(item_content.shape))).tic()

    # load split
    timer.tic()
    train = pd.read_csv(train_file, delimiter=",", header=-1, dtype=np.int32).values.ravel().view(
        dtype=[('uid', np.int32), ('iid', np.int32), ('inter', np.int32), ('date', np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file, test_warm_iid_file,
                                           name='eval_warm', cold=False, train_data=train)
    dat['eval_cold_user'] = data.load_eval_data(test_cold_user_file, test_cold_user_iid_file,
                                                name='eval_cold_user', cold=True, train_data=train)
    dat['eval_cold_item'] = data.load_eval_data(test_cold_item_file, test_cold_item_iid_file,
                                                name='eval_cold_item', cold=True, train_data=train)
    return dat
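# load_data above assumes yet another timer flavour: an object constructed with
# a name, where tic() restarts the clock and toc(message) reports the elapsed
# time and returns the timer so calls can be chained (`.toc(...).tic()`).
# A minimal sketch under those assumptions (illustrative, not the project's
# actual utils.timer):
import time


class timer(object):
    """Named tic/toc stopwatch with chainable calls."""

    def __init__(self, name='default'):
        self.name = name
        self._start = time.time()

    def tic(self):
        self._start = time.time()
        return self

    def toc(self, message=''):
        elapsed = time.time() - self._start
        print('[{}] {} [{:.2f} s]'.format(self.name, message, elapsed))
        return self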
# np.random.seed(4)
eta = 7.0
a = 1.1
location = [0., 0., 1]
theta = np.random.normal(0., 1., 4)
orientation = Quaternion([1., 0., 0., 0.])
r_vectors = 5 * a * np.random.rand(200, 3) + np.array([0., 0., 0.])
L = np.array([0., 0., 0.])

# Generate random forces
force = np.random.randn(len(r_vectors), 3)

# ================================================================
# NO WALL TESTS
# ================================================================
timer('zz_no_wall_loops_full_matrix')
mobility_no_wall_loops = mob.rotne_prager_tensor_loops(r_vectors, eta, a)
u_no_wall_loops_full = np.dot(mobility_no_wall_loops, force.flatten())
timer('zz_no_wall_loops_full_matrix')

timer('zz_no_wall_full_matrix')
mobility_no_wall = mob.rotne_prager_tensor(r_vectors, eta, a)
u_no_wall_full = np.dot(mobility_no_wall, force.flatten())
timer('zz_no_wall_full_matrix')

u_no_wall_numba = mob.no_wall_mobility_trans_times_force_numba(r_vectors, force, eta, a)
timer('zz_no_wall_numba')
u_no_wall_numba = mob.no_wall_mobility_trans_times_force_numba(r_vectors, force, eta, a)
timer('zz_no_wall_numba')

if found_pycuda:
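# The benchmark above appears to toggle a name-keyed timer: the first call
# with a label starts that clock, the second call with the same label stops it
# and reports the elapsed time. A sketch of that inferred behaviour using a
# module-level dict (an assumption for illustration, not the package's own
# timer utility):
import time

_timers = {}


def timer(name):
    """Start the named timer on the first call; stop and report on the second."""
    if name not in _timers:
        _timers[name] = time.time()
    else:
        elapsed = time.time() - _timers.pop(name)
        print('{} : {:.6f} s'.format(name, elapsed))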
image_label_list = [ config.level_label_dict[config.level_zn_en[ image_file_batch[i].split('/')[-2]]] for i in range(len(image_file_batch)) ] return np.array(image_data_list), np.array(image_label_list) if __name__ == '__main__': '''''' strategy = 'att_resnet' print('\n') # step 1: get image files with utils.timer('scan image files'): #image_dir = '{}/raw/色情图片已标记'.format(config.DataBaseDir) image_dir = '{}/raw/updated_1109'.format(config.DataBaseDir) jpg_image_files = glob.glob('{}/*/*.jpg'.format(image_dir)) png_image_files = glob.glob('{}/*/*.png'.format(image_dir)) image_files = jpg_image_files + png_image_files print('total image files {}'.format(len(image_files))) print('\n') # step 2: train/valid split with utils.timer('split'): shuffle(image_files) if ((config.debug == True) & (config.sampling_ratio < 1.0)): image_files = image_files[:int(config.sampling_ratio * len(image_files))] print('sampled {:.1f} percentage of dataset'.format(
def dr_wrt(self, wrt, reverse_mode=False, profiler=None): tm_dr_wrt = timer() self.called_dr_wrt = True self._call_on_changed() drs = [] if wrt in self._cache['drs']: if DEBUG: if wrt not in self._cache_info: self._cache_info[wrt] = 0 self._cache_info[wrt] += 1 self._status = 'cached' return self._cache['drs'][wrt] direct_dr = self._compute_dr_wrt_sliced(wrt) if direct_dr is not None: drs.append(direct_dr) if DEBUG: self._status = 'pending' propnames = set(_props_for(self.__class__)) for k in set(self.dterms).intersection( propnames.union(set(self.__dict__.keys()))): p = getattr(self, k) if hasattr(p, 'dterms') and p is not wrt: indirect_dr = None if reverse_mode: lhs = self._compute_dr_wrt_sliced(p) if isinstance(lhs, LinearOperator): tm_dr_wrt.pause() dr2 = p.dr_wrt(wrt) tm_dr_wrt.resume() indirect_dr = lhs.matmat(dr2) if dr2 != None else None else: indirect_dr = p.lmult_wrt(lhs, wrt) else: # forward mode tm_dr_wrt.pause() dr2 = p.dr_wrt(wrt, profiler=profiler) tm_dr_wrt.resume() if dr2 is not None: indirect_dr = self.compute_rop(p, rhs=dr2) if indirect_dr is not None: drs.append(indirect_dr) if len(drs) == 0: result = None elif len(drs) == 1: result = drs[0] else: # TODO: ???????? # result = np.sum(x for x in drs) if not np.any([isinstance(a, LinearOperator) for a in drs]): result = reduce(lambda x, y: x + y, drs) else: result = LinearOperator( drs[0].shape, lambda x: reduce(lambda a, b: a.dot(x) + b.dot(x), drs)) # TODO: figure out how/whether to do this. if result is not None and not sp.issparse(result): tm_nonzero = timer() nonzero = np.count_nonzero(result) if tm_nonzero() > 0.1: pif('count_nonzero in {}sec'.format(tm_nonzero())) if nonzero == 0 or hasattr( result, 'size') and result.size / float(nonzero) >= 10.0: tm_convert_to_sparse = timer() result = sp.csc_matrix(result) import gc gc.collect() pif('converting result to sparse in {}sec'.format( tm_convert_to_sparse())) if (result is not None) and (not sp.issparse(result)) and ( not isinstance(result, LinearOperator)): result = np.atleast_2d(result) # When the number of parents is one, it indicates that # caching this is probably not useful because not # more than one parent will likely ask for this same # thing again in the same iteration of an optimization. # # When the number of parents is zero, this is the top # level object and should be cached; when it's > 1 # cache the combinations of the children. # # If we *always* filled in the cache, it would require # more memory but would occasionally save a little cpu, # on average. if len(self._parents.keys()) != 1: self._cache['drs'][wrt] = result if DEBUG: self._status = 'done' if getattr(self, '_make_dense', False) and sp.issparse(result): result = result.todense() if getattr(self, '_make_sparse', False) and not sp.issparse(result): result = sp.csc_matrix(result) if tm_dr_wrt() > 0.1: pif('dx of {} wrt {} in {}sec, sparse: {}'.format( self.short_name, wrt.short_name, tm_dr_wrt(), sp.issparse(result))) return result
# Define the optimizer
optimizer = torch.optim.Adam(generator.parameters(), lr=params.lr,
                             betas=(params.beta1, params.beta2))

# Define the scheduler
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=params.step_size,
                                            gamma=params.gamma)

# Load model data
if args.restore_from is not None:
    params.checkpoint = utils.load_checkpoint(restore_from, generator, optimizer, scheduler)
    logging.info('Model data loaded')

# Set the timer
timer = utils.timer()

# Train the model and save
if params.numIter != 0:
    logging.info('Start training')
    train(generator, optimizer, scheduler, eng, params)

# Generate images and save
logging.info('Start generating devices')
evaluate(generator, eng, numImgs=500, params=params)

timer.out()
writer.close()
param = {
    # 'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.2,
    'max_depth': 8,
    'silent': 1,
    'nthread': 4,
    'colsample_bytree': .4,
    'subsample': .9,
}

if __name__ == '__main__':
    ''''''
    ## load word2vec lookup table
    with utils.timer('Load word vector'):
        word2vec = tl.files.load_npy_to_any(
            name='%s/model/word2vec_post_text_3d.npy' % config.DataBaseDir)
    ## load data
    with utils.timer('Load data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3], axis=0, ignore_index=True)
    DebugDir = '%s/debug' % config.DataBaseDir
    if (os.path.exists(DebugDir) == False):
def compute_terms(file_full, file_mean, file_out, file_global='means.nc'): """ Return dataset with various zonal terms. Preserve a dimensionless """ # Load datasets # TODO: Rename 'plev' to 'lev' # NOTE: GFDL doesn't use CF conventions right now. # See: https://github.com/xgcm/xgcm/issues/91 timer() if os.path.exists(file_out): os.remove(file_out) data_full = nc4.Dataset(file_full, mode='r') data_mean = nc4.Dataset(file_mean, mode='r') data_global = nc4.Dataset(file_global, mode='r') data_out = nc4.Dataset(file_out, mode='w') copy_attrs(data_full, data_out, ignore=('NCO', 'filename', 'history')) for coord in ('time', 'plev', 'lat', 'lon', 'plev_bnds'): copy_variable(data_full, data_out, coord, singleton=coord == 'lon') # Coordinates and constants lat = data_full['lat'][:] rlat = np.pi * lat / 180.0 zmass = vertical_mass(data_full) p0 = 100000.0 R = 287.0 a = 6371.0e3 kappa = 0.286 cp = 1005.7 # Read full resolution # WARNING: Data might be stored in 32-bit but always compute in 64-bit # WARNING: Data is loaded from disk every time you use [:] indexing t = data_full['t'][:].astype('d') u = data_full['u'][:].astype('d') v = data_full['v'][:].astype('d') w = data_full['omega'][:].astype('d') z = data_full['z'][:].astype('d') q = data_full['tdt'][:].astype('d') udt = data_full['udt'][:].astype('d') vdt = data_full['vdt'][:].astype('d') # Read zonal means # NOTE: We follow CDO convention of preserving reduced longitude and latitude # dimensions with a dummy value (CDO uses zero, we use NaN). p = data_mean['plev'][:] * 100.0 exner = (p[:, None, None] / p0) ** kappa t_bar = data_mean['t'][:] u_bar = data_mean['u'][:] v_bar = data_mean['v'][:] w_bar = data_mean['omega'][:] z_bar = data_mean['z'][:] q_bar = data_mean['tdt'][:] udt_bar = data_mean['udt'][:] vdt_bar = data_mean['vdt'][:] pt_bar = t_bar / exner # Read globally mppnccombined zonal means t_globe = data_global['t'][:] q_globe = data_global['tdt'][:] timer(' * Time for reading') # Zonal anomalies t_star = t - t_bar # need both anomaly and average u_star = u - u_bar v_star = v - v_bar w_star = w - w_bar z_star = z - z_bar q_star = q - q_bar udt_star = udt - udt_bar vdt_star = vdt - vdt_bar # Global anomalies clat = np.cos(np.pi * data_global['lat'][:][:, None] / 180.0) t_globe = np.sum(t_globe * clat, axis=2, keepdims=True) / np.sum(clat) q_globe = np.sum(q_globe * clat, axis=2, keepdims=True) / np.sum(clat) pt_globe = t_globe / exner t_bar_anom = t_bar - t_globe w_bar_anom = w_bar # true due to mass conservation q_bar_anom = q_bar - q_globe pt_bar_anom = pt_bar - pt_globe # Barotropic and baroclinic terms u_tropic = weighted_mean(u, zmass, axis=1)[:, 0, :, :] # no height dimension v_tropic = weighted_mean(v, zmass, axis=1)[:, 0, :, :] u_clinic = u - u_tropic v_clinic = v - v_tropic u_tropic_bar = weighted_mean(u_tropic, zmass) v_tropic_bar = weighted_mean(v_tropic, zmass) u_clinic_bar = weighted_mean(u_clinic, zmass) v_clinic_bar = weighted_mean(v_clinic, zmass) u_tropic_star = u_tropic - weighted_mean(u_tropic, zmass) v_tropic_star = u_tropic - weighted_mean(v_tropic, zmass) u_clinic_star = u_clinic - weighted_mean(u_clinic, zmass) v_clinic_star = u_clinic - weighted_mean(v_clinic, zmass) # Stability factor -(theta / T) * (R / cp * p) * (dthetabar / dp)^-1 # New way recognizing that t / theta == (p / p0)^kappa which means stability = # = -R / (cp * p * (dtheta / dp) * (t / theta)) # = -kappa / ((dtheta / dp) * p * (p / p0)^kappa) denom = climo.deriv_uneven(p, pt_globe, axis=1, keepedges=True) denom = denom * exner * p[:, None, None] ** 
kappa denom[denom == 0] = np.nan stab = -kappa / denom timer(' * Time for setup') # Eddy variances make_variable( data_out, 'tvar', weighted_mean(t_star ** 2, zmass), long_name='zonal temperature variance', units='K^2', ) make_variable( data_out, 'uvar', weighted_mean(u_star ** 2, zmass), long_name='zonal zonal wind variance', units='m^2 / s^2', ) make_variable( data_out, 'vvar', weighted_mean(v_star ** 2, zmass), long_name='zonal meridional wind variance', units='m^2 / s^2', ) make_variable( data_out, 'zvar', weighted_mean(z_star ** 2, zmass), long_name='geopotential height variance', units='m^2', ) timer(' * Time for variance terms') # Eddy fluxes make_variable( data_out, 'ehf', weighted_mean(t_star * v_star, zmass), long_name='eddy heat flux', units='K m / s', ) make_variable( data_out, 'emf', weighted_mean(u_star * v_star, zmass), long_name='eddy momentum flux', units='m^2 / s^2', ) make_variable( data_out, 'egf', weighted_mean(z_star * v_star, zmass), long_name='eddy geopotential flux', units='m^2 / s', ) timer(' * Time for flux terms') # APE terms make_variable( data_out, 'pe', cp * stab * weighted_mean(t_star ** 2, zmass) / 2.0, long_name='eddy APE', units='J / kg', ) make_variable( data_out, 'pm', cp * stab * t_bar_anom ** 2 / 2.0, long_name='mean APE', units='J / kg', ) timer(' * Time for APE terms') # KE terms for prefix, suffix, u_bar_i, v_bar_i, u_star_i, v_star_i in ( ('', '', u_bar, v_bar, u_star, v_star), ('baroclinic ', '_clinic', u_clinic_bar, v_clinic_bar, u_clinic_star, v_clinic_star), # noqa: E501 ('barotropic ', '_tropic', u_tropic_bar, v_tropic_bar, u_tropic_star, v_tropic_star), # noqa: E501 ): make_variable( data_out, 'ke' + suffix, weighted_mean(u_star_i ** 2 + v_star_i ** 2, zmass) / 2.0, # noqa: E501 long_name=prefix + 'eddy KE', units='J / kg', ) make_variable( data_out, 'km' + suffix, (u_bar_i ** 2 + v_bar_i ** 2) / 2.0, long_name=prefix + 'mean KE', units='J / kg', ) del u_tropic, v_tropic, u_tropic_star, v_tropic_star del u_clinic, v_clinic, u_clinic_star, v_clinic_star timer(' * Time for KE terms') # Generation terms # WARNING: These need a 'cp', unlike in definitions, because we have a heating # rate K/s rather than a forcing term J/s * kg. make_variable( data_out, 'gpe', cp * stab * weighted_mean(q_star * t_star, zmass), long_name='generation of eddy APE', units='W / kg', ) make_variable( data_out, 'gpm', cp * stab * q_bar_anom * t_bar_anom, long_name='generation of mean APE', units='W / kg', ) timer(' * Time for APE generation terms') # Dissipation terms # NOTE: the wind tendency is always negative; want energy going away to be positive make_variable( data_out, 'dke', -1.0 * weighted_mean(u_star * udt_star + v_star * vdt_star, zmass), # noqa: E501 long_name='dissipation of eddy KE', units='W / kg', ) make_variable( data_out, 'dkm', -1.0 * (u_bar * udt_bar + v_bar * vdt_bar), long_name='dissipation of mean KE', units='W / kg', ) timer(' * Time for dissipation terms') # Conversion from eddy APE to eddy KE, mean APE to mean KE # NOTE: This is also eddy adiabatic heating heat budget term! Do not store vertical # eddy heat flux separately because it can be easily backed out from this term. 
make_variable( data_out, 'cpeke', -1.0 * R * weighted_mean(w_star * t_star, zmass) / p[:, None, None], # noqa: E501 long_name='eddy APE conversion to eddy KE', units='W / kg', ) make_variable( data_out, 'cpmkm', -1.0 * R * w_bar_anom * t_bar_anom / p[:, None, None], long_name='mean APE conversion to mean KE', units='W / kg', ) timer(' * Time for APE/KE conversion terms') # Conversion from eddy KE to mean KE # NOTE: See Kim and Kim 2013 (CliDyn) clat = np.cos(rlat[:, None]) tlat = np.tan(rlat[:, None]) ckekm = ( weighted_mean(u_star * v_star, zmass) * clat * climo.deriv_uneven(rlat * a, u_bar / clat, axis=2, keepedges=True) # noqa: E501 + weighted_mean(v_star ** 2, zmass) * climo.deriv_uneven(rlat * a, v_bar, axis=2, keepedges=True) # noqa: E501 + weighted_mean(u_star * w_star, zmass) * climo.deriv_uneven(p, u_bar, axis=1, keepedges=True) # noqa: E501 + weighted_mean(v_star * w_star, zmass) * climo.deriv_uneven(p, v_bar, axis=1, keepedges=True) # noqa: E501 - v_bar * weighted_mean(u_star ** 2, zmass) * tlat / a ) make_variable( data_out, 'ckekm', ckekm, long_name='eddy KE conversion to mean KE', units='W / kg', ) timer(' * Time for eddy KE conversion to mean KE') # Conversion from mean APE to eddy APE # NOTE: Use Oort definition here, way better than Kim formula dt_bar_dy = climo.deriv_uneven(rlat * a, t_bar, axis=2, keepedges=True) dpt_bar_dp = climo.deriv_uneven(p, pt_bar_anom, axis=1, keepedges=True) cpmpe = -1.0 * cp * stab * ( dt_bar_dy * weighted_mean(t_star * v_star, zmass) + exner * dpt_bar_dp * weighted_mean(t_star * w_star, zmass) ) make_variable( data_out, 'cpmpe', cpmpe, long_name='mean APE conversion to eddy APE', units='W / kg' ) timer(' * Time for mean APE conversion to eddy APE') return data_out
import time
import logging

from flask import Flask, render_template, request
from sentence_transformers import SentenceTransformer

from utils import timer
from dataset import Dataset
from sentence_similarity import SentenceSimilarity

app = Flask(__name__)

logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)

dataset = timer(Dataset, 'data/quora/quora_example.txt')
sentence_sim = timer(SentenceSimilarity, dataset=dataset)


@app.route('/')
def home():
    return render_template('search.html')
#end def


@app.route('/search', methods=["GET", "POST"])
def search_request():
    query = request.form["input"]
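# In the Flask snippet above, `timer` wraps a single call: it times
# `Dataset('data/quora/quora_example.txt')` and returns the constructed object
# (encode_episode_data elsewhere in this collection passes the arguments as a
# tuple instead). A sketch of the positional/keyword variant assumed by this
# snippet, not the actual utils implementation:
import time
import logging

logger = logging.getLogger(__name__)


def timer(fn, *args, **kwargs):
    """Call fn(*args, **kwargs), log the elapsed time, and return the result."""
    start = time.time()
    result = fn(*args, **kwargs)
    logger.info('%s finished in %.2f s',
                getattr(fn, '__name__', repr(fn)), time.time() - start)
    return result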
import seaborn as sns
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import generate_submit, load_dataset, send_line_notification
from category_encoders import TargetEncoder
from config import *
from utils import timer

sns.set_style('darkgrid')

NAME = Path(__file__).stem
print(NAME)

with timer('load datasets'):
    feats = ['main_numeric', 'main_days_to_years', 'main_days_pairwise', 'main_target_enc',
             'main_ext_source_pairwise', 'bureau', 'prev', 'pos', 'credit', 'inst',
             'pos_latest', 'credit_latest']
    X_train, y_train, X_test, cv = load_dataset(feats)

with timer('generate money pairwise features'):
    money_cols = X_train.filter(regex='AMT_(?!REQ)(?!.*_min)').columns
    print(money_cols)
    l = len(list(itertools.combinations(money_cols, 2)))
    for i, j in tqdm(itertools.combinations(money_cols, 2), total=l):
        X_train[f'{i}_minus_{j}'] = X_train[i] - X_train[j]
        X_test[f'{i}_minus_{j}'] = X_test[i] - X_test[j]

print('train:', X_train.shape)
print('test :', X_test.shape)
# print('feats: ', X_train.columns.tolist())
def build_useful_data():
    """
    # TODO: also try building feature files via PCA or LDA dimensionality reduction
    Build the usable initial feature data.
    By default the raw competition data is stored in the `datas` folder of the current directory.
    :return: usable data (a pd.DataFrame instance)
    """
    # Read the protein data
    with timer("Loading and merging data"):
        protein_train = pd.read_csv('datas/df_protein_train.csv')
        protein_test = pd.read_csv('datas/df_protein_test.csv')
        protein_all = pd.concat([protein_train, protein_test])

        # Add the protein sequence length as a feature
        protein_all['seq_len'] = protein_all['Sequence'].apply(len)

        # Read the molecule data
        mol_train = pd.read_csv('datas/df_molecule.csv')
        aff_train = pd.read_csv('datas/df_affinity_train.csv')
        aff_test = pd.read_csv('datas/df_affinity_test_toBePredicted.csv')

        # Initialise the Ki values to be predicted to -11
        aff_test['Ki'] = -11
        aff_all = pd.concat([aff_train, aff_test])

        data = aff_all.merge(mol_train, on="Molecule_ID", how='left')
        data = data.merge(protein_all, on='Protein_ID', how='left')

        # Get the protein IDs
        PID = list(protein_all["Protein_ID"])

    with timer("Processing wordcount1"):
        # wordcount features for word_length = 1
        _, word_counts1 = tfidf_and_wordcounts(protein_all, PID, word_length=1, stride=1)

    # wordcount features for word_length = 2
    with timer("Processing wordcount2"):
        _, word_counts2 = tfidf_and_wordcounts(protein_all, PID, word_length=2, stride=1)

        word_counts1_2 = word_counts1.merge(word_counts2, on="Protein_ID", how="left")
        # Save the feature file for later training
        word_counts1_2.to_csv("datas/1and2_1_421_protein_std.csv", index=False)
        del word_counts1_2, word_counts1, word_counts2

    with timer("Processing wordcount3"):
        _, word_count3 = tfidf_and_wordcounts(protein_all, PID, word_length=3, stride=1)

        word_count3_features = list(word_count3.columns)  # 8000-dimensional data, needs dimensionality reduction
        word_count3_features.remove("Protein_ID")

        # Reduce dimensionality via the standard deviation: drop features whose std is below 0.3
        new_word_count3 = reduce_dims_with_std(word_count3, word_count3_features, std_threshold=0.3)
        # Save the feature file for later training
        new_word_count3.to_csv("datas/3_1_protein_std_0.3.csv", index=False)
        del new_word_count3

        for i in range(len(word_count3_features) // 1000):
            # Split off 1000 features at a time and save them to a feature file for later training
            file = word_count3[["Protein_ID"] + word_count3_features[i * 1000:(i + 1) * 1000]]
            file_name = "3_1_1000_protein_" + str(i)
            file.to_csv("datas/" + file_name + ".csv", index=False)
        del word_count3, word_count3_features

    with timer("Processing wordcount4"):
        gc.collect()
        _, word_count4 = tfidf_and_wordcounts(protein_all, PID, word_length=4, stride=1)
        word_count4_features = list(word_count4.columns)  # 140000+ dimensional data, needs dimensionality reduction
        word_count4_features.remove("Protein_ID")

        new_word_count4 = reduce_dims_with_pca(word_count4, word_count4_features, n_conponents=1000)
        new_word_count4.to_csv("datas/wordcount4_pca.csv", index=False)

        # Reduce dimensionality via the standard deviation: drop features whose std is below 0.15
        new_word_count4 = reduce_dims_with_std(word_count4, word_count4_features, std_threshold=0.15)
        new_word_count4.to_csv("datas/4_1_protein_std_0.15.csv", index=False)

        # Reduce dimensionality via the standard deviation: drop features whose std is below 0.12
        new_word_count4 = reduce_dims_with_std(word_count4, word_count4_features, std_threshold=0.12)
        word_count4_features = list(new_word_count4.columns)
        word_count4_features.remove("Protein_ID")

        for i in range(len(word_count4_features) // 1000):
            # Split off 1000 features at a time and save them to a feature file for later training
            file = new_word_count4[["Protein_ID"] + word_count4_features[i * 1000:(i + 1) * 1000]]
            file_name = "4_1_1000_protein_" + str(i)
            file.to_csv("datas/" + file_name + ".csv", index=False)
        del new_word_count4, word_count4

    # The following features are protein word-embedding features shared in the community
    # by user "小武哥" (thanks!); our final submission did not use them.
    "=====================================word-embedding features==========================================="
    # feat2 = protein_embedding(protein_all, word_length=2)
    # data = data.merge(feat2, on="Protein_ID", how="left")
    # del feat2
    # feat3 = protein_embedding(protein_all, word_length=3)
    # data = data.merge(feat3, on="Protein_ID", how="left")
    # del feat3
    # feat4 = protein_embedding(protein_all, word_length=4)
    # data = data.merge(feat4, on="Protein_ID", how="left")
    # del feat4
    "================================================================================"

    with timer("Expanding molecule fingerprints"):
        mol_fingerprints = list(mol_train["Fingerprint"].apply(
            lambda x: list(np.array(x.split(',')).astype(int))))
        mol_fingerprints = pd.DataFrame(
            mol_fingerprints, columns=["Fingerprint_" + str(i) for i in range(167)])
        mol_fingerprints["Molecule_ID"] = mol_train["Molecule_ID"]

    del PID
    "=================================================================================================="
    with timer("Merging molecule fingerprints and descriptors"):
        data = data.merge(mol_fingerprints, on="Molecule_ID", how='left')
        mol_ECFP4 = pd.read_csv("datas/df_mol_ECFP4s_1024.csv")
        data = data.merge(mol_ECFP4, on="Molecule_ID")
        del mol_fingerprints, mol_ECFP4

    del data["Sequence"], protein_train, protein_test, mol_train
    data.reset_index(drop=True, inplace=True)
    data.to_csv("datas/original_data.csv", index=False)
    del data
    print("Useful data has been built")
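
# Note: build_useful_data() relies on helpers such as reduce_dims_with_std
# that are defined elsewhere in that project. The sketch below is a
# hypothetical implementation of the std-threshold reduction described in the
# comments (drop feature columns whose standard deviation falls below a
# threshold); the real helper's signature and behaviour may differ.
import pandas as pd

def reduce_dims_with_std(df, feature_cols, std_threshold=0.3):
    # Keep only the feature columns whose standard deviation meets the threshold.
    stds = df[feature_cols].std()
    kept = stds[stds >= std_threshold].index.tolist()
    return df[["Protein_ID"] + kept]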
    'reg_lambda': 0.075,
    'min_split_gain': 0.02,
    'min_child_weight': 40,
    'random_state': 71,
    # 'boosting_type': 'dart',
    'silent': -1,
    'verbose': -1,
    'n_jobs': -1,
}

fit_params = {
    'eval_metric': 'auc',
    'early_stopping_rounds': 150,
    'verbose': 50
}

with timer('load datasets'):
    X_train, y_train, X_test, _ = load_dataset(feats)
    cv = StratifiedKFold(5, shuffle=True, random_state=71)
    print('train:', X_train.shape)
    print('test :', X_test.shape)

with timer('drop low importance feats'):
    ref = pd.read_csv(
        '/home/ubuntu/kaggle-home-credit/output/180615_014745_v32_credit_drawing/feats.csv',
        index_col=0, header=None)
    drop_cols = ref[1][ref[1] < 1].index
    drop_cols = drop_cols[drop_cols.isin(X_train.columns)]
    X_train.drop(drop_cols, axis=1, inplace=True)
    X_test.drop(drop_cols, axis=1, inplace=True)
    print('train:', X_train.shape)
def _multi_query(
        sparql, timeout, graph_pattern, source_target_pairs,
        batch_size,
        _vars, _values, _ret_val_mapping,
        _res_init, _chunk_q, _chunk_res,
        _res_update=lambda r, u, **___: r.update(u),
        **kwds):
    if batch_size is None:
        batch_size = config.BATCH_SIZE
    _query_stats.multi_query_count[batch_size] += 1
    total_time = 0
    res = _res_init(source_target_pairs, **kwds)
    for val_chunk in chunker(_values, batch_size):
        _query_stats.multi_query_chunks[batch_size] += 1
        q = _chunk_q(graph_pattern, _vars, val_chunk, **kwds)
        chunk_stps = [stp for v in val_chunk for stp in _ret_val_mapping[v]]
        _start_time = timer()
        t = None
        chunk_res = None
        for retry in range(2, -1, -1):  # 3 attempts: 2, 1, 0
            if retry < 2:
                _query_stats.multi_query_retries[batch_size] += 1
            try:
                t, q_res = _query(sparql, timeout, q, **kwds)
                chunk_res = _chunk_res(
                    q_res, _vars, _ret_val_mapping, **kwds)
            except EndPointNotFound as e:
                # happens if the endpoint reports a 404...
                # as virtuoso in rare cases seems to report a 404 let's
                # retry after some time but then cancel
                if retry:
                    logger.info(
                        'SPARQL endpoint reports a 404, will retry in %ds',
                        config.ERROR_WAIT
                    )
                    sleep(config.ERROR_WAIT)
                    continue
                else:
                    logger.exception(
                        'SPARQL endpoint unreachable even after back-off '
                        'and retry\n'
                        'could not perform query:\n%s for %s\nException:',
                        q, val_chunk,
                    )
                    six.reraise(MultiQueryException, e, sys.exc_info()[2])
            except (SPARQLWrapperException, SAXParseException, URLError) as e:
                if (isinstance(e, SPARQLWrapperException) and
                        re.search(
                            r'The estimated execution time [0-9]+ \(sec\) '
                            r'exceeds the limit of [0-9]+ \(sec\)\.',
                            repr(e))):
                    t, chunk_res = timeout, {}
                elif len(val_chunk) > 1:
                    logger.debug('error in batch: {}'.format(val_chunk))
                    logger.debug('retrying with half size batch: {}...'.format(
                        len(val_chunk) // 2
                    ))
                    _query_stats.multi_query_splits[batch_size] += 1
                    t, chunk_res = _multi_query(
                        sparql, timeout, graph_pattern, chunk_stps,
                        len(val_chunk) // 2,
                        _vars, val_chunk, _ret_val_mapping,
                        _res_init, _chunk_q, _chunk_res, _res_update,
                        **kwds)
                elif isinstance(e, URLError):
                    # we're down at single query level and still encounter an
                    # error. It is very likely that the endpoint is dead...
                    if retry:
                        logger.warning(
                            'URLError, seems we cannot reach SPARQL endpoint, '
                            'retry in %ds. Tried to perform query:\n'
                            '%s for %s\nException:',
                            config.ERROR_WAIT, q, val_chunk,
                            exc_info=1,  # appends exception to message
                        )
                        sleep(config.ERROR_WAIT)
                        continue
                    else:
                        logger.exception(
                            'URLError, seems we cannot reach SPARQL endpoint, '
                            'giving up after 3 retries. Tried to perform query:'
                            '\n%s for %s\nException:',
                            q, val_chunk,
                        )
                        six.reraise(MultiQueryException, e, sys.exc_info()[2])
                else:
                    logger.warning(
                        'could not perform query, replacing with 0 result:\n'
                        '%s for %s\nException:',
                        q, val_chunk,
                        exc_info=1,  # appends exception to message
                    )
                    t, chunk_res = timer() - _start_time, {}
            except Exception as e:
                if retry:
                    logger.warning(
                        'unhandled exception, retry in %ds:\n'
                        'Query:\n%s\nChunk:%r\nException:',
                        config.ERROR_WAIT, q, val_chunk,
                        exc_info=1,  # appends exception to message
                    )
                    sleep(config.ERROR_WAIT)
                    continue
                else:
                    logger.exception(
                        'unhandled exception, giving up after 3 retries:\n'
                        'Query:\n%s\nChunk:%r\nException:',
                        q, val_chunk,
                    )
                    six.reraise(MultiQueryException, e, sys.exc_info()[2])
            break
        _res_update(res, chunk_res, **kwds)
        total_time += t
        if query_time_soft_exceeded(total_time, timeout):
            logger.debug('early terminating batch query as timeout/2 exceeded')
            break
    return total_time, res
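
# Note: _multi_query() batches its VALUES bindings with a chunker() helper
# from the same project. A minimal sketch of such a helper (split an iterable
# into consecutive chunks of at most `size` items) is shown below as an
# assumed implementation, not the project's actual code.
from itertools import islice

def chunker(iterable, size):
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))   # take up to `size` items
        if not chunk:
            return                       # iterable exhausted
        yield chunk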
def cv_train_BoXHED2(train_data):
    # Define the output dictionary
    train_info_dict = {}

    # Preprocess the training data. THIS ONLY NEEDS TO BE DONE ONCE.
    boxhed_ = boxhed()      # Create an instance of BoXHED
    prep_timer = timer()    # Initialize timer

    # boxhed.preprocess():
    #     Input:
    #         @ num_quantiles: the number of candidate split points to try for time and for each covariate.
    #              The locations of the split points are based on the quantiles of the training data.
    #         @ is_cat: a list of the column indexes that contain categorical data. The categorical data must be
    #              one-hot encoded. For example, is_cat = [4,5,6] if a categorical variable with 3 factors is
    #              transformed into binary-valued columns 4,5,6
    #         @ weighted: if set to True, the locations of the candidate split points will be based on weighted
    #              quantiles (see Section 3.3 of the BoXHED 2.0 paper)
    #         @ nthreads: number of CPU threads to use for preprocessing the data
    #     Return:
    #         @ ID: subject ID for each row in the processed data frames X, w, and delta
    #         @ X: each row represents an epoch of the transformed data, and contains the values of the covariates
    #              as well as its start time
    #         @ w: length of each epoch
    #         @ delta: equals one if an event occurred at the end of the epoch; zero otherwise
    ID, X, w, delta = boxhed_.preprocess(
        data=train_data,
        # is_cat=[],
        num_quantiles=256,
        weighted=False,
        nthread=nthread_prep)

    train_info_dict["prep_time"] = prep_timer.get_dur()  # calling the get_dur() function.

    # Perform K-fold cross-validation to select hyperparameters {tree depth, number of trees, learning rate}
    # if do_CV = True. Otherwise, users should manually specify hyperparameter values.
    # Note that a tree of depth k has 2^k leaf nodes.
    do_CV = False
    param_manual = {'max_depth': 1, 'n_estimators': 200, 'eta': 0.1}

    # Specify the candidate values for the hyperparameters to cross-validate on
    # (more trees and/or deeper trees may be needed for other datasets).
    param_grid = {
        'max_depth': [1, 2, 3, 4, 5],
        'n_estimators': [50, 100, 150, 200, 250, 300],
        'eta': [0.1]
    }

    # Next, specify:
    #     @ gpu_list: the list of GPU IDs to use for training. Set gpu_list = [-1] to use CPUs.
    #     @ batch_size: the maximum number of BoXHED2.0 instances trained at any point in time. Example: Performing
    #                   10-fold cross-validation using the param_grid above requires training 5*6*10 = 300
    #                   instances in total.
    #                   * When gpu_list = [-1], batch_size specifies the number of CPU threads to be used,
    #                     with each one training one instance at a time.
    #                   * When using GPUs, each GPU trains at most batch_size/len(gpu_list) instances at a time.
    #                     Hence if 2 GPUs are used and batch_size = 20, each GPU will train at most 10 instances
    #                     at a time.
    gpu_list = [-1]
    batch_size = 20
    num_folds = 5

    if do_CV:
        cv_timer = timer()
        # Call the cv function to perform K-fold cross validation on the training set.
        # This outputs the cross validation results for the different hyperparameter combinations.
        #     Return:
        #         @ cv_rslts: mean and st.dev of the log-likelihood value for each hyperparameter combination
        #         @ best_params: the hyperparameter combination where the mean log-likelihood value is maximized.
        #              WE STRONGLY RECOMMEND AGAINST USING THIS COMBINATION. Instead, use the
        #              one-standard-error rule to select the simplest model that is within st.dev/sqrt(k)
        #              of the maximum log-likelihood value. See §7.10 in 'Elements of Statistical Learning'
        #              by Hastie et al. (2009).
        cv_rslts, best_params = cv(param_grid, X, w, delta, ID, num_folds, gpu_list, batch_size)
        train_info_dict["CV_time"] = cv_timer.get_dur()
    else:
        best_params = param_manual

    best_params['gpu_id'] = gpu_list[0]   # Use the first GPU in the list for training
    best_params['nthread'] = nthread_train

    train_info_dict.update(best_params)
    boxhed_.set_params(**best_params)

    # Fit BoXHED to the training data
    fit_timer = timer()
    boxhed_.fit(X, delta, w)
    train_info_dict["fit_time"] = fit_timer.get_dur()

    return boxhed_, train_info_dict
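
# Note: in cv_train_BoXHED2() above, timer() is used as a stopwatch object
# with a get_dur() method rather than as a context manager. The class below
# is an assumed minimal sketch consistent with those calls; the project's
# actual timer class may differ.
import time

class timer:
    def __init__(self):
        self.start = time.time()                    # start the stopwatch on construction

    def get_dur(self):
        return round(time.time() - self.start, 3)   # elapsed seconds since construction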
#========================================================================
#========================================================================
# Result Box
is_oof = 1
result_list = []
score_list = []
oof_pred = np.zeros(len(tx_train))
test_pred = np.zeros(len(x_test))
#========================================================================

#========================================================================
# Train & Prediction Start
for fold_no, (trn_idx, val_idx) in enumerate(kfold):
    with utils.timer(f'Fold{fold_no} Train'):
        #========================================================================
        # Make Dataset
        X_train, y_train = tx_train[trn_idx, :], y[trn_idx]
        X_val, y_val = tx_train[val_idx, :], y[val_idx]
        print(X_train.shape, X_val.shape)
        print(f"Target Min --- Train: {y_train.min()} Valid: {y_val.min()}")
        print(
            f"Target Min Count --- Train: {np.sum(y_train==y_train.min())} Valid: {np.sum(y_val==y_val.min())}"
        )

        model = build_model(max_length=max_length,
                            nb_words=nb_words,
                            embedding_size=embedding_size)
        model.fit(x=X_train,
        dataset ([BasicDataset])
        recmodel ([PairWiseModel])

    Returns:
        [tensor]: Vector of negitems, shape (batch_size, )
                  corresponding to batch_users
    """
    dns_k = world.DNS_K
    with torch.no_grad():
        scores = userAndMatrix(batch_users, batch_neg, recmodel)
        _, top1 = scores.max(dim=1)
        idx = torch.arange(len(batch_users)).to(world.DEVICE)
        negitems = batch_neg[idx, top1]
    return negitems


if __name__ == "__main__":
    method = UniformSample_DNS
    from register import dataset
    from utils import timer
    for i in range(1):
        with timer():
            # S = method(dataset, 1)
            S = UniformSample_original(dataset)
            print(len(S[S >= dataset.m_items]))
        S = torch.from_numpy(S).long()
        print(len(S[S >= dataset.m_items]))
    print(timer.get())
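
# Note: in the snippet above, `utils.timer` is a context manager whose results
# can be read back afterwards via the class-level timer.get(). The sketch
# below is an assumed, simplified version of that pattern; the project's
# actual class (e.g. named timing tapes) is richer.
import time

class timer:
    TAPE = []                                        # class-level record of elapsed times

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        timer.TAPE.append(time.time() - self.start)  # store this block's duration

    @staticmethod
    def get():
        return timer.TAPE[-1] if timer.TAPE else -1  # most recent measurement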
    for wi in xopt:
        print('%.8f' % wi, end=' ')
    print('')
    sys.stdout.flush()


def select_params_with_de(ks):
    for k in ks:
        MODEL_PARAM['k'] = k
        args = (trn_xs, trn_ys, tst_xs, tst_ys, MODEL_NAME, MODEL_PARAM, MODEL_REPEAT)
        # print('@@@@@@@@@@@@', MODEL_PARAM['k'])
        ret = differential_evolution(objective_de, bounds=bounds, args=args, maxiter=3)
        # print(xopt, fopt)
        # print('result:')
        print(k, ret.success, ret.fun, end=' ')
        for xi in ret.x:
            print(xi, end=' ')
        print('')
        # print(ret.message)
        sys.stdout.flush()


if __name__ == '__main__':
    l = int(sys.argv[1])
    u = int(sys.argv[2])
    timer(select_params_with_pso, ks=range(l, u))
    # timer(select_params_with_de, ks=range(l, u))
    # select_params_with_pso(range(5, 16))
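
# Note: here timer is called as a plain function wrapper,
# timer(select_params_with_pso, ks=range(l, u)), timing a callable rather
# than a block. The sketch below is an assumed implementation matching that
# call, not the script's actual helper.
import time

def timer(func, *args, **kwargs):
    start = time.time()
    result = func(*args, **kwargs)        # run the wrapped callable
    print('%s took %.2f s' % (func.__name__, time.time() - start))
    return result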
# config
RANDOM_STATE = 99
SHUFFLE = True
TEST_SIZE = 0.50

# get args
args = parse_args()
datapath = args.datapath
model = args.model
pretrained = args.pretrained
cv = args.cv

t0 = time.time()

# 1. import module
module = __import__(model)

# 2. load and preprocess data
with timer("Load and Preprocess"):
    df_train, _, X_train, _ = load_and_preprocess(datapath, module)

# 3. fit and eval
with timer('Fitting and Validating'):
    if cv == 2:
        X_t, X_v, y_t, y_v = train_test_split(X_train,
                                              df_train.target,
                                              test_size=TEST_SIZE,
                                              random_state=RANDOM_STATE,
                                              shuffle=SHUFFLE,
                                              stratify=df_train.target)
        best_thres, df_score = fit_and_eval(X_t, y_t, X_v, y_v, module, pretrained)  # noqa
        filepath = os.path.join(datapath, 'eval_{}.csv'.format(model))
        df_score.to_csv(filepath)
        print('Save CV score file to {}'.format(filepath))
if __name__ == '__main__':
    print('# Start')

    N = 100
    a = 1.1
    b = 7
    eps = 3.92
    L = np.array([0.0, 0.0, 0.0])
    r_vectors = np.random.randn(N, 3)

    if found_pycuda:
        force_pycuda = forces_pycuda.calc_blob_blob_forces_pycuda(r_vectors,
                                                                  blob_radius=a,
                                                                  debye_length=b,
                                                                  repulsion_strength=eps,
                                                                  periodic_length=L)
        timer('pycuda')
        force_pycuda = forces_pycuda.calc_blob_blob_forces_pycuda(r_vectors,
                                                                  blob_radius=a,
                                                                  debye_length=b,
                                                                  repulsion_strength=eps,
                                                                  periodic_length=L)
        timer('pycuda')

    force_numba = forces_numba.calc_blob_blob_forces_numba(r_vectors,
                                                           blob_radius=a,
                                                           debye_length=b,
                                                           repulsion_strength=eps,
                                                           periodic_length=L)
    timer('numba')
    force_numba = forces_numba.calc_blob_blob_forces_numba(r_vectors,
                                                           blob_radius=a,
                                                           debye_length=b,
                                                           repulsion_strength=eps,
                                                           periodic_length=L)
    timer('numba')

    timer('python')
    force_python = mbf.calc_blob_blob_forces_python(r_vectors,
                                                    blob_radius=a,
                                                    debye_length=b,
                                                    repulsion_strength=eps,
                                                    periodic_length=L)
    timer('python')

    if found_boost:
        timer('boost')
        force_boost = mbf.calc_blob_blob_forces_boost(r_vectors,
                                                      blob_radius=a,
                                                      debye_length=b,
                                                      repulsion_strength=eps,
                                                      periodic_length=L)
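
# Note: in this benchmark timer('name') is called twice around each
# computation and acts as a start/stop toggle keyed by label (the first
# pycuda/numba call warms up the JIT before timing). The sketch below is an
# assumed implementation of that toggle pattern; the project's own timer
# utility likely also stores results for later reporting.
import time

_timers = {}

def timer(name):
    if name not in _timers:
        _timers[name] = time.time()                  # first call: start the clock
    else:
        elapsed = time.time() - _timers.pop(name)    # second call: stop and report
        print('%s : %.6f s' % (name, elapsed))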
from keras import backend as K
from keras import callbacks
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.engine import InputSpec
from keras.engine.topology import Layer

base = utils.read_df_pkl(path='../input/base_Av*')

if is_make:
    #========================================================================
    # Dataset Load
    with utils.timer('Download Train and Test Data.\n'):
        train, test = MS_utils.get_dataset(base=base,
                                           feat_path='../features/4_winner/*.gz',
                                           is_cat_encode=False)

nlp_cols = [
    'Engine'
    ,'OSVersion'
    ,'AppVersion'
    ,'AvSigVersion'
    ,'SkuEdition'
    ,'SmartScreen'
    ,'Census_OSArchitecture'
    ,'AVProductStatesIdentifier'
    ,'AVProductsInstalled'
    ,'CountryIdentifier'
    ,'CityIdentifier'
sns.set_style('darkgrid')

feats = [
    'main_numeric', 'main_days_to_years', 'main_days_pairwise', 'main_money_pairwise',
    'main_category', 'main_ext_source_pairwise',
    'bureau', 'prev', 'pos', 'credit', 'inst',
    'prev_latest', 'pos_latest', 'credit_latest', 'inst_latest',
    'bureau_active_and_type_product', 'bureau_active_count', 'bureau_enddate',
    'bureau_amount_pairwise', 'bureau_prolonged', 'main_ext_null'
]
rank_average = False

NAME = Path(__file__).stem
print(NAME)

with timer('load datasets'):
    X_train, y_train, X_test, _ = load_dataset(feats)
    cv = StratifiedKFold(5, shuffle=True, random_state=71)
    print('train:', X_train.shape)
    print('test :', X_test.shape)
    # print('feats: ', X_train.columns.tolist())

lgb_params = {
    'n_estimators': 4000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_split_gain': 0.01,
feats = [
    'main_numeric', 'main_days_to_years', 'main_days_pairwise', 'main_money_pairwise',
    'main_category', 'main_ext_source_pairwise',
    'bureau', 'prev', 'pos', 'credit', 'inst',
    'pos_latest', 'credit_latest', 'inst_latest',
    'bureau_active_count', 'bureau_enddate', 'bureau_amount_pairwise', 'bureau_prolonged',
    'main_ext_null', 'prev_basic', 'prev_category_count', 'prev_category_tfidf',
    'main_document', 'main_enquiry'
]
rank_average = False
use_cache = True

NAME = Path(__file__).stem
print(NAME)

with timer('load datasets'):
    X_train, y_train, X_test, _ = load_dataset(feats)
    cv = StratifiedKFold(5, shuffle=True, random_state=71)
    print('train:', X_train.shape)
    print('test :', X_test.shape)
    # print('feats: ', X_train.columns.tolist())


def get_denoising_autoencoders(X_train, hiddens=None, drop_ratio=.15):
    hiddens = hiddens if hiddens else [500]
    x_in = Input((X_train.shape[1], ), name='input')
    h = Dropout(drop_ratio)(x_in)
    for i, dim in enumerate(hiddens):
        h = Dense(dim, activation='relu', name=f'hidden_{i}')(h)
    x_out = Dense(X_train.shape[1], activation='linear', name='out')(h)
    model = Model(x_in, x_out)
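
# Note: get_denoising_autoencoders() builds a denoising autoencoder by
# corrupting the inputs with dropout and reconstructing them. The usage sketch
# below is hypothetical: it assumes the truncated function above goes on to
# `return model`, and X_demo is a stand-in for the real training matrix.
import numpy as np
from keras.layers import Input, Dense, Dropout
from keras.models import Model

X_demo = np.random.rand(1000, 50).astype('float32')   # stand-in for X_train
dae = get_denoising_autoencoders(X_demo, hiddens=[500], drop_ratio=0.15)
dae.compile(optimizer='adam', loss='mse')              # reconstruction loss on the inputs
dae.fit(X_demo, X_demo, batch_size=256, epochs=10, validation_split=0.1)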
            tar.extractall(path=dst)
            tar.close()
        except tarfile.ReadError:
            check_create_folder(dst)
            subprocess.check_call(["tar", "-xf", src, "-C", dst])

    def _get_full_filename(self, band):
        base_file = "%s_B%s.*" % (self.scene, band)
        try:
            return glob.glob(join(self.scene_path, base_file))[0].split("/")[-1]
        except IndexError:
            raise FileDoesNotExist("%s does not exist" % "%s_B%s.*" % (self.scene, band))

    def _check_if_zipped(self, path):
        """ Checks if the filename shows a tar/zip file """
        filename = get_file(path).split(".")
        if filename[-1] in ["bz", "bz2"]:
            return True
        return False


if __name__ == "__main__":
    with timer():
        p = Process(sys.argv[1])
        print(p.run(sys.argv[2] == "t"))
def main():
    data_path = args.data_dir
    checkpoint_path = args.checkpoint_path
    tb_log_path = args.tb_log_path
    model_select = args.model_select

    rank_out = args.rank
    user_batch_size = 1000
    n_scores_user = 2500
    data_batch_size = 100
    dropout = args.dropout
    recall_at = range(50, 550, 50)
    eval_batch_size = 1000
    max_data_per_step = 2500000
    eval_every = args.eval_every
    num_epoch = 10

    _lr = args.lr
    _decay_lr_every = 50
    _lr_decay = 0.1

    experiment = '%s_%s' % (
        datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'),
        '-'.join(str(x / 100) for x in model_select) if model_select else 'simple')
    _tf_ckpt_file = None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint'

    print('running: ' + experiment)

    dat = load_data(data_path)
    u_pref_scaled = dat['u_pref_scaled']
    v_pref_scaled = dat['v_pref_scaled']
    eval_warm = dat['eval_warm']
    eval_cold_user = dat['eval_cold_user']
    eval_cold_item = dat['eval_cold_item']
    user_content = dat['user_content']
    item_content = dat['item_content']
    u_pref = dat['u_pref']
    v_pref = dat['v_pref']
    user_indices = dat['user_indices']

    timer = utils.timer(name='main').tic()

    # append pref factors for faster dropout
    v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])])
    v_pref_last = v_pref_scaled.shape[0]
    u_pref_expanded = np.vstack([u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])])
    u_pref_last = u_pref_scaled.shape[0]
    timer.toc('initialized numpy data for tf')

    # prep eval
    eval_batch_size = eval_batch_size
    timer.tic()
    eval_warm.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
    timer.toc('initialized eval_warm for tf').tic()
    eval_cold_user.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
    timer.toc('initialized eval_cold_user for tf').tic()
    eval_cold_item.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content, eval_batch_size)
    timer.toc('initialized eval_cold_item for tf').tic()

    dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1],
                               user_content_rank=user_content.shape[1],
                               item_content_rank=item_content.shape[1],
                               model_select=model_select,
                               rank_out=rank_out)

    config = tf.ConfigProto(allow_soft_placement=True)

    with tf.device(args.model_device):
        dropout_net.build_model()

    with tf.device(args.inf_device):
        dropout_net.build_predictor(recall_at, n_scores_user)

    with tf.Session(config=config) as sess:
        tf_saver = None if _tf_ckpt_file is None else tf.train.Saver()
        train_writer = None if tb_log_path is None else tf.summary.FileWriter(
            tb_log_path + experiment, sess.graph)
        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        timer.toc('initialized tf')

        row_index = np.copy(user_indices)
        n_step = 0
        best_cold_user = 0
        best_cold_item = 0
        best_warm = 0
        n_batch_trained = 0
        best_step = 0
        for epoch in range(num_epoch):
            np.random.shuffle(row_index)
            for b in utils.batch(row_index, user_batch_size):
                n_step += 1
                # prep targets
                target_users = np.repeat(b, n_scores_user)
                target_users_rand = np.repeat(np.arange(len(b)), n_scores_user)
                target_items_rand = [
                    np.random.choice(v_pref.shape[0], n_scores_user) for _ in b
                ]
                target_items_rand = np.array(target_items_rand).flatten()
                target_ui_rand = np.transpose(
                    np.vstack([target_users_rand, target_items_rand]))
                [target_scores, target_items, random_scores] = sess.run(
                    [
                        dropout_net.tf_topk_vals, dropout_net.tf_topk_inds,
                        dropout_net.preds_random
                    ],
                    feed_dict={
                        dropout_net.U_pref_tf: u_pref[b, :],
                        dropout_net.V_pref_tf: v_pref,
                        dropout_net.rand_target_ui: target_ui_rand
                    })
                # merge topN and randomN items per user
                target_scores = np.append(target_scores, random_scores)
                target_items = np.append(target_items, target_items_rand)
                target_users = np.append(target_users, target_users)

                tf.local_variables_initializer().run()
                n_targets = len(target_scores)
                perm = np.random.permutation(n_targets)
                n_targets = min(n_targets, max_data_per_step)
                data_batch = [(n, min(n + data_batch_size, n_targets))
                              for n in range(0, n_targets, data_batch_size)]
                f_batch = 0
                for (start, stop) in data_batch:
                    batch_perm = perm[start:stop]
                    batch_users = target_users[batch_perm]
                    batch_items = target_items[batch_perm]
                    if dropout != 0:
                        n_to_drop = int(np.floor(dropout * len(batch_perm)))
                        perm_user = np.random.permutation(len(batch_perm))[:n_to_drop]
                        perm_item = np.random.permutation(len(batch_perm))[:n_to_drop]
                        batch_v_pref = np.copy(batch_items)
                        batch_u_pref = np.copy(batch_users)
                        batch_v_pref[perm_user] = v_pref_last
                        batch_u_pref[perm_item] = u_pref_last
                    else:
                        batch_v_pref = batch_items
                        batch_u_pref = batch_users

                    _, _, loss_out = sess.run(
                        [
                            dropout_net.preds, dropout_net.updates,
                            dropout_net.loss
                        ],
                        feed_dict={
                            dropout_net.Uin: u_pref_expanded[batch_u_pref, :],
                            dropout_net.Vin: v_pref_expanded[batch_v_pref, :],
                            dropout_net.Ucontent: user_content[batch_users, :].todense(),
                            dropout_net.Vcontent: item_content[batch_items, :].todense(),
                            #
                            dropout_net.target: target_scores[batch_perm],
                            dropout_net.lr_placeholder: _lr,
                            dropout_net.phase: 1
                        })
                    f_batch += loss_out
                    if np.isnan(f_batch):
                        raise Exception('f is nan')

                n_batch_trained += len(data_batch)
                if n_step % _decay_lr_every == 0:
                    _lr = _lr_decay * _lr
                    print('decayed lr:' + str(_lr))
                if n_step % eval_every == 0:
                    recall_warm = utils.batch_eval_recall(
                        sess, dropout_net.eval_preds_warm,
                        eval_feed_dict=dropout_net.get_eval_dict,
                        recall_k=recall_at, eval_data=eval_warm)
                    recall_cold_user = utils.batch_eval_recall(
                        sess, dropout_net.eval_preds_cold,
                        eval_feed_dict=dropout_net.get_eval_dict,
                        recall_k=recall_at, eval_data=eval_cold_user)
                    recall_cold_item = utils.batch_eval_recall(
                        sess, dropout_net.eval_preds_cold,
                        eval_feed_dict=dropout_net.get_eval_dict,
                        recall_k=recall_at, eval_data=eval_cold_item)

                    # checkpoint
                    if np.sum(recall_warm + recall_cold_user + recall_cold_item) > np.sum(
                            best_warm + best_cold_user + best_cold_item):
                        best_cold_user = recall_cold_user
                        best_cold_item = recall_cold_item
                        best_warm = recall_warm
                        best_step = n_step
                        if tf_saver is not None:
                            tf_saver.save(sess, _tf_ckpt_file)

                    timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' % (
                        n_step, len(data_batch), n_batch_trained, f_batch, best_step
                    )).tic()
                    print('\t\t\t' + ' '.join([('@' + str(i)).ljust(6) for i in recall_at]))
                    print('warm start\t%s\ncold user\t%s\ncold item\t%s' % (
                        ' '.join(['%.4f' % i for i in recall_warm]),
                        ' '.join(['%.4f' % i for i in recall_cold_user]),
                        ' '.join(['%.4f' % i for i in recall_cold_item])))
                    summaries = []
                    for i, k in enumerate(recall_at):
                        if k % 100 == 0:
                            summaries.extend([
                                tf.Summary.Value(tag="recall@" + str(k) + " warm",
                                                 simple_value=recall_warm[i]),
                                tf.Summary.Value(tag="recall@" + str(k) + " cold_user",
                                                 simple_value=recall_cold_user[i]),
                                tf.Summary.Value(tag="recall@" + str(k) + " cold_item",
                                                 simple_value=recall_cold_item[i])
                            ])
                    recall_summary = tf.Summary(value=summaries)
                    if train_writer is not None:
                        train_writer.add_summary(recall_summary, n_step)