Example #1
File: main.py  Project: tcxdgit/minigo
def selfplay(
        load_file: "The path to the network model files",
        output_dir: "Where to write the games"="data/selfplay",
        holdout_dir: "Where to write the games"="data/holdout",
        output_sgf: "Where to write the sgfs"="sgf/",
        readouts: 'How many simulations to run per move'=100,
        verbose: '>=2 will print debug info, >=3 will print boards' = 1,
        resign_threshold: 'absolute value of threshold to resign at' = 0.95,
        holdout_pct: 'how many games to hold out for evaluation' = 0.05):
    _ensure_dir_exists(output_sgf)
    _ensure_dir_exists(output_dir)

    with timer("Loading weights from %s ... " % load_file):
        network = dual_net.DualNetwork(load_file)
        network.name = os.path.basename(load_file)

    with timer("Playing game"):
        player = selfplay_mcts.play(
            network, readouts, resign_threshold, verbose)

    output_name = '{}-{}'.format(int(time.time()), socket.gethostname())
    game_data = player.extract_data()
    with gfile.GFile(os.path.join(output_sgf, '{}.sgf'.format(output_name)), 'w') as f:
        f.write(player.to_sgf())

    tf_examples = preprocessing.make_dataset_from_selfplay(game_data)

    # Hold out 5% of games for evaluation.
    if random.random() < holdout_pct:
        fname = os.path.join(holdout_dir, "{}.tfrecord.zz".format(output_name))
    else:
        fname = os.path.join(output_dir, "{}.tfrecord.zz".format(output_name))

    preprocessing.write_tf_examples(fname, tf_examples)
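The `timer` helper used throughout these snippets is not shown; a minimal sketch consistent with the `with timer("..."):` calls above, assuming it only prints the label and the elapsed wall-clock time, could be:

import time
from contextlib import contextmanager

@contextmanager
def timer(message):
    # Hypothetical helper: print the label and the elapsed wall-clock time
    # when the block exits (the real project may log differently).
    start = time.time()
    try:
        yield
    finally:
        print("%s: %.3f seconds" % (message, time.time() - start))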
Example #2
File: main.py  Project: tcxdgit/minigo
def evaluate(
        black_model: 'The path to the model to play black',
        white_model: 'The path to the model to play white',
        output_dir: 'Where to write the evaluation results'='data/evaluate/sgf',
        readouts: 'How many readouts to make per move.'=400,
        games: 'the number of games to play'=16,
        verbose: 'How verbose the players should be (see selfplay)' = 1):

    black_model = os.path.abspath(black_model)
    white_model = os.path.abspath(white_model)

    with timer("Loading weights"):
        black_net = dual_net.DualNetwork(black_model)
        white_net = dual_net.DualNetwork(white_model)

    with timer("%d games" % games):
        players = evaluation.play_match(
            black_net, white_net, games, readouts, verbose)

    for idx, p in enumerate(players):
        fname = "{:s}-vs-{:s}-{:d}".format(black_net.name, white_net.name, idx)
        with open(os.path.join(output_dir, fname + '.sgf'), 'w') as f:
            f.write(sgf_wrapper.make_sgf(p[0].position.recent,
                                         p[0].make_result_string(
                                             p[0].position),
                                         black_name=os.path.basename(
                                             black_model),
                                         white_name=os.path.basename(white_model)))
Example #3
def main():
    with timer('init'):
        ftt = FeatureToolsTrainV1(
            # input_path="/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/new_input",
            # output_path="/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/new_input",
            input_path="/home/yuyang/02-ds-case/01-Home_Credit/new_input",
            output_path="/home/yuyang/02-ds-case/01-Home_Credit/output",
            debug=False
        )
    with timer('es set'):
        ftt.es_set()

    with timer('dfs run'):
        ftt.dfs_run()
Example #4
File: main.py  Project: nhu2000/minigo
def validate(
    working_dir: 'tf.estimator working directory',
    *tf_record_dirs: 'Directories where holdout data are',
    checkpoint_name: 'Which checkpoint to evaluate (None=latest)'=None,
    validate_name: 'Name for validation set (i.e. selfplay or human)'=None):
    tf_records = []
    with timer("Building lists of holdout files"):
        for record_dir in tf_record_dirs:
            tf_records.extend(gfile.Glob(os.path.join(record_dir, '*.zz')))

    with timer("Validating from {} to {}".format(os.path.basename(tf_records[0]),
                                                 os.path.basename(tf_records[-1]))):
        dual_net.validate(working_dir, tf_records, checkpoint_name=checkpoint_name,
            name=validate_name)
Example #5
def nn_1(debug=True):


    df = pd.read_pickle('../output/basic_application_noonehot_f117.pkl.gz')

    with timer('feature construct'):
        X, y, embed_cols, len_embed_cols = embedding_select(df)

    training = y.notnull()
    testing = y.isnull()

    train_id = df[training]['SK_ID_CURR']
    sub_id = df[testing]['SK_ID_CURR']

    print('\nid length {} {}'.format(len(list(train_id)), len(list(sub_id))))
    print('\nid length {} {}'.format(list(train_id)[:5], list(sub_id)[:5]))

    with timer('nn embedding train time'):
        metrics, oof_preds, sub_preds = nn_embedding(X, y, embed_cols, len_embed_cols, debug=debug)




    print('Saving results...')
    print(oof_preds.shape, sub_preds.shape)
    print(oof_preds.head(), sub_preds.head())
    sub = pd.DataFrame()
    train = pd.DataFrame()

    if debug:
        sub['SK_ID_CURR'] = [i for i in range(500)]
        train['SK_ID_CURR'] = [i for i in range(1000)]
    else:
        sub['SK_ID_CURR'] = sub_id
        train['SK_ID_CURR'] = train_id

    print(sub_preds.shape, type(sub_preds))
    sub['TARGET'] = sub_preds
    train['nn_train_pred'] = oof_preds

    if debug:
        sub[['SK_ID_CURR', 'TARGET']].to_csv('../03_Stack/input/sub_nn_ub_debug.csv', index=False)
        train[['SK_ID_CURR', 'nn_train_pred']].to_csv('../03_Stack/input/xpred_nn_ub_debug.csv', index=False)
    else:
        sub[['SK_ID_CURR', 'TARGET']].to_csv('../03_Stack/input/sub_nn_ub_embedding.csv', index=False)
        train[['SK_ID_CURR', 'nn_train_pred']].to_csv('../03_Stack/input/xpred_nn_ub_embedding.csv', index=False)

    print(sub.head(), sub.shape)
    print(train.head(), train.shape)
Example #6
def __main__():

    global parser
    parser = args_options()
    args = parser.parse_args()
    with timer():
        exit(*main(args))
Example #7
def get_followers(tw, name):
    # logger.info('get_followers called')
    tw = tw.authorize()
    with timer(logger.info):
        ids = [user for user in tw.cursor(tw.get_followers_ids,
            screen_name=name, count=COUNT)]
    return set(ids)
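Examples #6, #7, and #10 pass either nothing or a logging callable such as logger.info to timer. A hedged sketch covering that usage, assuming the callable only receives the formatted elapsed-time message, might be:

import time
from contextlib import contextmanager

@contextmanager
def timer(log=print):
    # Hypothetical variant: report the elapsed time through a caller-supplied
    # callable (print by default, logger.info in the snippets above).
    start = time.time()
    try:
        yield
    finally:
        log("elapsed: %.3f seconds" % (time.time() - start))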
Example #8
def load_player(model_path):
  print("Loading weights from %s ... " % model_path)
  with timer("Loading weights from %s ... " % model_path):
      network = dual_net.DualNetwork(model_path)
      network.name = os.path.basename(model_path)
  player = MCTSPlayer(network, verbosity=2)
  return player
Example #9
File: main.py  Project: nhu2000/minigo
def evaluate(
        black_model: 'The path to the model to play black',
        white_model: 'The path to the model to play white',
        output_dir: 'Where to write the evaluation results'='sgf/evaluate',
        readouts: 'How many readouts to make per move.'=400,
        games: 'the number of games to play'=16,
        verbose: 'How verbose the players should be (see selfplay)' = 1):
    _ensure_dir_exists(output_dir)

    with timer("Loading weights"):
        black_net = dual_net.DualNetwork(black_model)
        white_net = dual_net.DualNetwork(white_model)

    with timer("%d games" % games):
        evaluation.play_match(
            black_net, white_net, games, readouts, output_dir, verbose)
Example #10
def data_from_id(tw, ids):
    # logger.info('data_from_id called')
    tw = tw.authorize()
    result = None
    with timer(logger.info):
        result = tw.lookup_user(user_id=ids)
    return result
Example #11
def main():
    with timer('init'):
        ftt = FeatureToolsTrainV1(
            input_path="/home/ubuntu/01-Home_credit/new_input",
            output_path="/home/ubuntu/01-Home_credit/output",

            # input_path='../new_input',
            # output_path='../output',

            debug=False
        )
    with timer('es set'):
        ftt.es_set()

    with timer('dfs run'):
        ftt.dfs_run()
Example #12
def main():
    input_path = '/Users/yuyang/02CS/12-ML/01-ML-case/08-new-Home_Credit/input/'
    output_path = 'bureau_balance.csv'

    pb = PrepareBureauBalance(
        input_path=input_path
    )
    with timer('data_prepare'):
        pb.data_prepare()
    with timer('data_transform'):
        pb.data_transform()
    with timer('data_generate'):
        pb.data_generate()
    with timer('data_return'):
        df = pb.data_return()
    with timer('data_save'):
        df.to_csv(output_path, index=False)
Example #13
def main():
    input_path = '/Users/yuyang/02CS/12-ML/01-ML-case/08-new-Home_Credit/input/'
    output_path = 'previous_application.csv'

    pb = PreparePreviousApplication(
        input_path=input_path
    )
    with timer('data_prepare'):
        pb.data_prepare()
    with timer('data_transform'):
        pb.data_transform()
    with timer('data_generate'):
        pb.data_generate()
    with timer('data_return'):
        df = pb.data_return()
    with timer('data_save'):
        df.to_csv(output_path, index=False)
Example #14
def __main__():

    global parser
    parser = args_options()
    args = parser.parse_args()
    if args.subs == 'search' and (hasattr(args, 'json') or hasattr(args, 'geojson')):
        print(main(args))
    else:
        with timer():
            exit(*main(args))
Example #15
File: main.py  Project: tcxdgit/minigo
def train(chunk_dir, save_file, load_file=None, generation_num=0,
          logdir=None, num_steps=None, verbosity=1):
    tf_records = sorted(gfile.Glob(os.path.join(chunk_dir, '*.tfrecord.zz')))
    tf_records = tf_records[-1 * (WINDOW_SIZE // EXAMPLES_PER_RECORD):]

    print("Training from:", tf_records[0], "to", tf_records[-1])

    n = dual_net.DualNetworkTrainer(save_file)
    with timer("Training"):
        n.train(tf_records, init_from=load_file,
                logdir=logdir, num_steps=num_steps, verbosity=verbosity)
Example #16
def main():

    gc.enable()

    input_path = "/home/ubuntu/01-Home_credit/new_input"
    output_path = "/home/ubuntu/01-Home_credit/output"

    #input_path = "/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/new_input"
    #output_path = "/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/output"

    with timer('data_load'):
        app_train, pre_app, credit, installment, poscash = load_data(input_path, debug=False)
    with timer('construct set'):
        entity_sets = es_set(app_train, pre_app, credit, installment, poscash)

    del app_train, pre_app, credit, installment, poscash
    gc.collect()

    with timer('dfs run'):
        dfs_run(entity_sets, output_path)
Example #17
def __main__():

    global parser
    parser = args_options()
    args = parser.parse_args()
    if args.subs == 'search':
        if args.json:
            print(main(args))
            sys.exit(0)
    else:
        with timer():
            exit(*main(args))
Example #18
def loop():
    """Run gather and train as subprocesses."""
    gather_errors = 0
    while True:
        print("==================================")
        with timer("Gather"):
            gather = subprocess.call("python rl_loop.py gather", shell=True)
            if gather != 0:
                print("Error in gather, retrying")
                gather_errors += 1
                if gather_errors == 3:
                    print("Gathering died too many times!")
                    sys.exit(1)
                continue
        gather_errors = 0

        with timer("Train"):
            subprocess.call("python rl_loop.py train", shell=True)

        with timer("validate"):
            subprocess.call("python rl_loop.py validate", shell=True)
Example #19
File: main.py  Project: nhu2000/minigo
def train(
    working_dir: 'tf.estimator working directory.',
    chunk_dir: 'Directory where gathered training chunks are.',
    model_save_path: 'Where to export the completed generation.',
    generation_num: 'Which generation you are training.'=0):
    tf_records = sorted(gfile.Glob(os.path.join(chunk_dir, '*.tfrecord.zz')))
    tf_records = tf_records[-1 * (WINDOW_SIZE // EXAMPLES_PER_RECORD):]

    print("Training from:", tf_records[0], "to", tf_records[-1])

    with timer("Training"):
        dual_net.train(working_dir, tf_records, generation_num)
        dual_net.export_model(working_dir, model_save_path)
Example #20
def evaluate(
        black_model: 'The path to the model to play black',
        white_model: 'The path to the model to play white',
        output_dir: 'Where to write the evaluation results'='sgf/evaluate',
        readouts: 'How many readouts to make per move.'=200,
        games: 'the number of games to play'=20,
        verbose: 'How verbose the players should be (see selfplay)' = 1):
    qmeas.start_time('evaluate')
    _ensure_dir_exists(output_dir)

    with timer("Loading weights"):
        black_net = dual_net.DualNetwork(black_model)
        white_net = dual_net.DualNetwork(white_model)

    winners = []
    with timer("%d games" % games):
        winners = evaluation.play_match(
            black_net, white_net, games, readouts, output_dir, verbose)
    qmeas.stop_time('evaluate')
    white_count = 0
    for win in winners:
      if 'W' in win or 'w' in win:
        white_count += 1
    return white_count * 1.0 / games
Example #21
File: ch.py  Project: algrs/chumpy
    def _superdot(self, lhs, rhs, profiler=None):

        try:
            if lhs is None:
                return None
            if rhs is None:
                return None
            
            if isinstance(lhs, np.ndarray) and lhs.size==1:
                lhs = lhs.ravel()[0]
                
            if isinstance(rhs, np.ndarray) and rhs.size==1:
                rhs = rhs.ravel()[0]
    
            if isinstance(lhs, numbers.Number) or isinstance(rhs, numbers.Number):
                return lhs * rhs

            if isinstance(rhs, LinearOperator):
                return LinearOperator((lhs.shape[0], rhs.shape[1]), lambda x : lhs.dot(rhs.dot(x)))

            if isinstance(lhs, LinearOperator):                
                if sp.issparse(rhs):
                    return LinearOperator((lhs.shape[0], rhs.shape[1]), lambda x : lhs.dot(rhs.dot(x)))
                else:
                    # TODO: ?????????????
                    # return lhs.matmat(rhs)
                    return lhs.dot(rhs)
            
            # TODO: Figure out how/whether to do this.
            tm_maybe_sparse = timer()
            lhs, rhs = utils.convert_inputs_to_sparse_if_necessary(lhs, rhs)
            if tm_maybe_sparse() > 0.1:
                pif('convert_inputs_to_sparse_if_necessary in {}sec'.format(tm_maybe_sparse()))

            if not sp.issparse(lhs) and sp.issparse(rhs):
                return rhs.T.dot(lhs.T).T
            return lhs.dot(rhs)
        except Exception as e:
            import sys, traceback
            traceback.print_exc(file=sys.stdout)
            if DEBUG:
                import pdb; pdb.post_mortem()
            else:
                raise
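In this chumpy snippet, timer() returns a callable that yields the seconds elapsed since it was created (tm_maybe_sparse()). A minimal closure-based sketch sufficient for this usage is below; Example #41 additionally calls pause()/resume(), for which a fuller class sketch follows that example.

import time

def timer():
    # Hypothetical sketch: start the clock now and return a function that
    # reports the elapsed seconds each time it is called.
    start = time.time()
    return lambda: time.time() - start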
Example #22
def gather(
        input_directory: 'where to look for games'='data/selfplay/',
        output_directory: 'where to put collected games'='data/training_chunks/',
        examples_per_record: 'how many tf.examples to gather in each chunk'=EXAMPLES_PER_RECORD):
    qmeas.start_time('gather')
    _ensure_dir_exists(output_directory)
    models = [model_dir.strip('/')
              for model_dir in sorted(gfile.ListDirectory(input_directory))[-50:]]
    with timer("Finding existing tfrecords..."):
        model_gamedata = {
            model: gfile.Glob(
                os.path.join(input_directory, model, '*.tfrecord.zz'))
            for model in models
        }
    print("Found %d models" % len(models))
    for model_name, record_files in sorted(model_gamedata.items()):
        print("    %s: %s files" % (model_name, len(record_files)))

    meta_file = os.path.join(output_directory, 'meta.txt')
    try:
        with gfile.GFile(meta_file, 'r') as f:
            already_processed = set(f.read().split())
    except tf.errors.NotFoundError:
        already_processed = set()

    num_already_processed = len(already_processed)

    for model_name, record_files in sorted(model_gamedata.items()):
        if set(record_files) <= already_processed:
            continue
        print("Gathering files for %s:" % model_name)
        for i, example_batch in enumerate(
                tqdm(preprocessing.shuffle_tf_examples(examples_per_record, record_files))):
            output_record = os.path.join(output_directory,
                                         '{}-{}.tfrecord.zz'.format(model_name, str(i)))
            preprocessing.write_tf_examples(
                output_record, example_batch, serialize=False)
        already_processed.update(record_files)

    print("Processed %s new files" %
          (len(already_processed) - num_already_processed))
    with gfile.GFile(meta_file, 'w') as f:
        f.write('\n'.join(sorted(already_processed)))
    qmeas.stop_time('gather')
Example #23
def selfplay_cache_model(
        network: "The path to the network model files",
        output_dir: "Where to write the games"="data/selfplay",
        holdout_dir: "Where to write the games"="data/holdout",
        output_sgf: "Where to write the sgfs"="sgf/",
        readouts: 'How many simulations to run per move'=100,
        verbose: '>=2 will print debug info, >=3 will print boards' = 1,
        resign_threshold: 'absolute value of threshold to resign at' = 0.95,
        holdout_pct: 'how many games to hold out for validation' = 0.05):
    qmeas.start_time('selfplay')
    clean_sgf = os.path.join(output_sgf, 'clean')
    full_sgf = os.path.join(output_sgf, 'full')
    _ensure_dir_exists(clean_sgf)
    _ensure_dir_exists(full_sgf)
    _ensure_dir_exists(output_dir)
    _ensure_dir_exists(holdout_dir)

    with timer("Playing game"):
        player = selfplay_mcts.play(
            network, readouts, resign_threshold, verbose)

    output_name = '{}-{}'.format(int(time.time() * 1000 * 1000), socket.gethostname())
    game_data = player.extract_data()
    with gfile.GFile(os.path.join(clean_sgf, '{}.sgf'.format(output_name)), 'w') as f:
        f.write(player.to_sgf(use_comments=False))
    with gfile.GFile(os.path.join(full_sgf, '{}.sgf'.format(output_name)), 'w') as f:
        f.write(player.to_sgf())

    tf_examples = preprocessing.make_dataset_from_selfplay(game_data)

    # Hold out 5% of games for evaluation.
    if random.random() < holdout_pct:
        fname = os.path.join(holdout_dir, "{}.tfrecord.zz".format(output_name))
    else:
        fname = os.path.join(output_dir, "{}.tfrecord.zz".format(output_name))

    preprocessing.write_tf_examples(fname, tf_examples)
    qmeas.stop_time('selfplay')
Example #24
    def run(self, train, test, logger):
        with timer(self.name, logger):
            self.create_features(train, test)
            logger.debug('[{}] train:{} test:{}'.format(
                self.name, self.train.shape, self.test.shape))
        return self
Example #25
            entityset=self.__es,
            target_entity="application_train",
            agg_primitives=[Sum, Std, Max, Min, Median, Count, Skew, PercentTrue, Trend, AvgTimeBetween],
            where_primitives=[Std, Max, Min, Median, Count],
            verbose=True,
            chunk_size=150,  # increase chunk_size to trade memory for time: higher RAM usage, shorter runtime
        )

        self.__train_feature.to_csv(os.path.join(self.__output_path, "train_agg_df.csv"), index=True)


def main():
    with timer('init'):
        ftt = FeatureToolsTrainV1(
            # input_path="/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/new_input",
            # output_path="/Users/yuyang/02CS/12-ML/01-ML-case/14-Home_Credit/new_input",
            input_path="/home/yuyang/02-ds-case/01-Home_Credit/new_input",
            output_path="/home/yuyang/02-ds-case/01-Home_Credit/output",
            debug=False
        )
    with timer('es set'):
        ftt.es_set()

    with timer('dfs run'):
        ftt.dfs_run()


if __name__ == "__main__":
    with timer('total time'):
        main()
Example #26
        pass

    os.mkdir(generated_rootfs)
    os.mkdir(generated_parent_rootfs)

    fs_dist = define_fs_structure(dist_define)

    depth = fs_dist["depth"]
    width = fs_dist["width"]
    layers_desc = fs_dist["layers"]

    dist = Distributor(generated_rootfs, depth, width)
    dist.generate_tree()

    for ld in layers_desc:
        with utils.timer("Generating test layer"):
            for d in ld.values():
                for f in d:
                    try:
                        size = f["size"]
                    except KeyError:
                        size = None
                    put_files(dist, f["type"], f["count"], size)

    parent_dist = Distributor(generated_parent_rootfs, depth, width)
    parent_dist.generate_tree()

    for ld in layers_desc:
        with utils.timer("Generating test parent layer"):
            for d in ld.values():
                for f in d:
Example #27
if __name__ == '__main__':
    args = argparser()  # get input arguments
    if args.silent:
        import warnings
        warnings.filterwarnings("ignore")
    print(
        "==================== Training model {0} on dataset {1} ===================="
        .format(args.model, args.dataset))

    # Load data
    dataset = np.load('data/regression_datasets/' + args.dataset + '.npz')
    X, y = dataset['data'], dataset['target']

    log_score, rmse_score = [], []
    # Train multiple models
    T = timer()
    for i in range(args.repeats):
        print("==================== Model {0}/{1} ====================".format(
            i + 1, args.repeats))
        # Make train/test split
        Xtrain, Xtest, ytrain, ytest = train_test_split(
            X, y, test_size=args.test_size, random_state=(i + 1) * args.seed)

        # Normalize data
        scaler = preprocessing.StandardScaler()
        scaler.fit(Xtrain)
        Xtrain = scaler.transform(Xtrain)
        Xtest = scaler.transform(Xtest)

        # Fit and score model
        T.begin()
Example #28
# Prepare the output directory path
outdir_name = '2_paint_out'
output_path = utils.make_outdir(image_dir, outdir_name)

output_koma_path = utils.make_outdir(output_path, '0_koma')
if len(os.listdir(output_koma_path)) >= 3:
    shutil.rmtree(output_path)
    output_path = utils.make_outdir(image_dir, outdir_name)
    output_koma_path = utils.make_outdir(output_path, '0_koma')
output_shaved_path = utils.make_outdir(output_koma_path, '0_padding_shave')

# paint_out processing: 1st pass
img_path_list = utils.get_path_list(image_dir, args.ext)
print('pages:', len(img_path_list) - (args.start + args.end))
with utils.timer('paint_out processing: 1st pass, crop images whose crop positions were found'):
    odd_cp_list = []  # holds cut points for odd-index pages
    even_cp_list = []  # holds cut points for even-index pages
    not_cut_img_path_dict = {}
    exec_paint_out_cut(img_path_list, kind='1st')

    # compute the average crop coordinates
    even_page_cp = find_average_point(even_cp_list)
    odd_page_cp = find_average_point(odd_cp_list)

print('lens', len(img_path_list) - len(not_cut_img_path_dict))

# loop that crops images using the average crop coordinates
if not_cut_img_path_dict:
    with utils.timer('cropping images using the average crop coordinates'):
        for idx, img_path in not_cut_img_path_dict.items():
Example #29
    def encode_episode_data(self):
        # pass
        """ Encodes data from data["train"] to use in the episode calculations """
        #
        torch.set_grad_enabled(False)
        dataset = data["train_deleted"]
        img_embs, cap_embs = timer(self.encode_data, (dataset,))
        if opt.cuda:
            img_embs = img_embs.cuda()
            cap_embs = cap_embs.cuda()
        image_caption_distances = timer(pairwise_distances, (img_embs, cap_embs))
        topk = torch.topk(image_caption_distances, opt.topk, 1, largest=False)
        (image_caption_distances_topk, image_caption_distances_topk_idx) = (topk[0], topk[1])
        data["image_caption_distances_topk"] = image_caption_distances_topk
        data["image_caption_distances_topk_idx"] = image_caption_distances_topk_idx
        del topk
        del image_caption_distances
        intra_cap_distance = timer(pairwise_distances, (cap_embs, cap_embs))
        select_indices_row = []
        select_indices_col = []

        for row in data["image_caption_distances_topk_idx"].cpu().numpy():
            permutations = list(zip(*itertools.permutations(row, 2)))
            permutations_list = [list(p) for p in permutations]
            select_indices_row.extend(permutations_list[0])
            select_indices_col.extend(permutations_list[1])

        all_dist = intra_cap_distance[select_indices_row, select_indices_col]
        all_dist = all_dist.view(len(data["train_deleted"][0]), opt.topk, opt.topk -1)
        all_dist = all_dist.mean(dim=2)
        # all_img = torch.Tensor(data["train_deleted"][0])
        # print(all_img.size())
        # print(data["image_caption_distances_topk"].size())
        # print(all_dist.size())
        # data["all_states"] = torch.cat((all_img, data["image_caption_distances_topk"].cpu(), all_dist.cpu()), 1)
        # print(data["all_states"].size())
        print(data["image_caption_distances_topk"].size())
        # data["all_states"] = torch.cat((img_embs, all_dist, data["image_caption_distances_topk"]), dim=1).cpu()
        data["all_states"] = torch.cat((torch.Tensor(data["train_deleted"][0]), all_dist.cpu(), data["image_caption_distances_topk"].cpu()), dim=1).cpu()
        print(data["all_states"].size())
        # data["images_embed_all"] = img_embs.data.cpu()
        # data["captions_embed_all"] = cap_embs.data.cpu()
        # all_dist = all_dist.cpu()
        # data["all_states"] = all_dist.cpu()
        # print(data["all_states"].size())

        # Testing for fixed index to see if it works
        # test_idx = 1337
        # top_cap_idx = data["image_caption_distances_topk_idx"][test_idx]
        # top_cap = cap_embs.index_select(0, top_cap_idx)
        #
        # top_cap_intra_dist = pairwise_distances(top_cap, top_cap)
        # # print(top_cap_intra_dist)
        # top_cap_intra_dist = top_cap_intra_dist[top_cap_intra_dist > 0.0001].view(opt.topk, -1)
        # top_cap_mean_intra_dist = top_cap_intra_dist.mean(dim=1)
        # print(top_cap_mean_intra_dist)
        # print(data["all_states"][test_idx])

        del intra_cap_distance
        del img_embs
        del cap_embs
        torch.set_grad_enabled(True)
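Here timer(func, args) times a single call, e.g. timer(self.encode_data, (dataset,)), and hands back the function's return value. A hedged sketch of that call-timing form, assuming it simply prints the duration, could be:

import time

def timer(func, args=()):
    # Hypothetical sketch: run func(*args), report how long it took,
    # and return its result unchanged.
    start = time.time()
    result = func(*args)
    print("%s took %.3f seconds" % (getattr(func, "__name__", "call"), time.time() - start))
    return result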
Example #30
def train_ppo(env_class,
              steps,
              track_eps=25,
              log_interval=1,
              solved_at=90.0,
              continual_solved_at=90.0,
              care_about=None,
              num_processes=8,
              gamma=0.99,
              MaxT=400,
              num_steps=128,
              clip_param=0.3,
              linear_schedule=True,
              policy=None,
              ob_rms=None,
              eval_envs=None,
              eval_eps=-1,
              hidden=-1,
              entropy_coef=0,
              linear_schedule_mode=0,
              lr=3e-4,
              training_seed=0,
              verbosity=1,
              training_method=learn.PPO,
              log_extras={},
              policy_class=learn.PolicyPPO,
              discrete=False):

    assert (verbosity in [1, 2])
    is_continual = training_method.__name__ in ["PPO_EWC", "PPO_DM"]
    if is_continual: assert (care_about != None)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_env_steps = int(steps)
    if eval_envs != None: assert (eval_eps > 0)

    def env_fn(i):
        env = env_class(discrete=discrete)
        # env.debug['show_reasons'] = True
        env = utils.env.wrap_env(
            env,
            action_normalize=not discrete,
            time_limit=MaxT,
            deterministic=True,
            seed=i,
        )
        return lambda: env

    envs = utils.env.vectorize_env(
        [env_fn(i) for i in range(num_processes)],
        state_normalize=True,
        device=device,
        train=True,
    )
    if ob_rms != None: envs.ob_rms = ob_rms

    obs_space, action_space = envs.observation_space, envs.action_space
    init_obs = envs.reset()

    torch.manual_seed(training_seed)
    print("training_method = %s" % training_method.__name__)
    agent = training_method(obs_space,
                            action_space,
                            init_obs,
                            clip_param=clip_param,
                            num_steps=num_steps,
                            lr=lr,
                            num_processes=num_processes,
                            gamma=gamma,
                            policy=policy,
                            hidden=hidden,
                            linear_schedule=linear_schedule,
                            entropy_coef=entropy_coef,
                            linear_schedule_mode=linear_schedule_mode,
                            policy_class=policy_class)

    num_updates = agent.compute_updates_needed(num_env_steps, num_processes)
    episode_rewards = collections.deque(maxlen=track_eps)
    s = collections.deque(maxlen=track_eps)
    log_dict = {
        'r': episode_rewards,
        'eps_done': 0,
        'satisfactions': s,
        **log_extras
    }
    start = utils.timer()
    ret_steps = -1

    for j in range(num_updates):

        agent.pre_step(j, num_updates)
        agent.step(envs, log=log_dict)
        vloss, piloss, ent = agent.train()

        if (j + 1) % log_interval == 0 and len(log_dict['r']) > 1:

            total_num_steps = (j + 1) * num_processes * num_steps
            elapsed = "Elapsed %s" % utils.timer_done(start)

            MeanR = np.mean(log_dict['r'])
            MedR = np.median(log_dict['r'])
            MinR = np.min(log_dict['r'])
            MaxR = np.max(log_dict['r'])
            if verbosity == 1:
                reward_stats = "MeanR:%.2f" % (MeanR)
                extra_stats = [reward_stats]
            elif verbosity == 2:
                reward_stats1 = "MeanR,MedR:%.2f,%.2f" % (MeanR, MedR)
                reward_stats2 = "MinR,MaxR:%.2f,%.2f" % (MinR, MaxR)
                reg_loss = None
                if type(ent) == list:
                    ent, reg_loss = ent
                loss_stats = "Ent:%f, VLoss:%f, PiLoss:%f" % (ent, vloss,
                                                              piloss)
                if reg_loss is not None: loss_stats += ", Reg:%f" % (reg_loss)
                extra_stats = [
                    reward_stats1,
                    reward_stats2,
                    loss_stats,
                ]
            reasons = "Reasons: %s" % (set(list(s)))
            stats = [
                "Steps:%g" % total_num_steps,
                "Eps:%d" % log_dict['eps_done'],
                elapsed,
                *extra_stats,
            ]
            print(" ".join(stats))
            print(reasons)
            if eval_envs != None:
                eval_rews = []
                for eval_env in eval_envs:
                    eval_rews += [
                        utils.env.evaluate_ppo(agent.actor_critic,
                                               None,
                                               eval_env,
                                               device,
                                               num_episodes=eval_eps,
                                               wrap=False,
                                               silent=True)
                    ]
                    eval_rews[-1] = round(eval_rews[-1], 2)
                if is_continual:
                    eval_MeanR = np.mean(
                        np.clip(eval_rews[:care_about], -100., 100.))
                if not is_continual and care_about != None:
                    eval_relevant_R = np.clip(eval_rews[care_about - 1], -100.,
                                              100.)
                print(eval_rews)
                # print("")
            sys.stdout.flush()

            if MeanR >= solved_at:
                if eval_envs != None:
                    if is_continual:
                        if eval_MeanR < continual_solved_at:
                            continue
                    if not is_continual and care_about != None:
                        if eval_relevant_R < solved_at:
                            continue

                print("Model solved! Continue")
                ret_steps = total_num_steps
                break

    if ret_steps == -1: print("Not solved.")
    ob_rms = utils.env.get_ob_rms(envs)
    assert (ob_rms != None)
    envs.close()
    return agent.actor_critic, ob_rms, ret_steps
Example #31
            request = ToxicImageDetection_pb2.ImageURL()
            request.urls.extend(test_urls)
            try:
                t1 = time.time()
                response = self.stub.OpenNSFW(request)
                print(response)
                t2 = time.time()
                results[self._no][i] = t2 - t1
            except Exception as e:
                error[self._no] += 1
                print(e)
                continue

threads = [TextRequest(i) for i in range(num_threads)]
start = time.time()
with utils.timer('%s REQUEST' % num_threads):
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print('Error analysis:')
    print(np.sum([1 for i in range(len(error)) if(error[i] > 0)]), num_threads)
    print('Time analysis:')
    print('mean time cost ')
    print(results.mean(axis= 1))
    print('maximum time cost ')
    print(results.max(axis= 1))
    print('minimum time cost')
    print(results.min(axis= 1))

    #print('success %s/%s ' % (len(np.where(results > 0.5)[0]), num_threads))
Example #32
            df = inst_.groupby('SK_ID_PREV').head(period).groupby('SK_ID_PREV')[['paid_late', 'paid_early']].agg([
                'mean', np.count_nonzero])
            df.columns = [f'first_{period}_{f[0]}_{f[1]}' for f in df.columns]
            dfs.append(df)
            
            df = inst_.groupby('SK_ID_PREV').tail(period).groupby('SK_ID_PREV')[['paid_late', 'paid_early']].agg([
                'mean', np.count_nonzero])
            df.columns = [f'last_{period}_{f[0]}_{f[1]}' for f in df.columns]
            dfs.append(df)
        
        df = pd.concat(dfs, axis=1)  # type: pd.DataFrame
        df = df.merge(inst_[['SK_ID_PREV', 'SK_ID_CURR']].drop_duplicates(), left_index=True, right_on='SK_ID_PREV',
                      how='left')
        self.df = df.groupby('SK_ID_CURR').mean()


if __name__ == '__main__':
    args = get_arguments('main')
    with timer('load dataset'):
        train = pd.read_feather(TRAIN)
        test = pd.read_feather(TEST)
        prev = pd.read_feather(PREV)
        inst = pd.read_feather(INST)
        cv_id = pd.read_feather(INPUT / 'cv_id.ftr')
        cv = PredefinedSplit(cv_id)
    
    # with timer('preprocessing'):
    
    with timer('create dataset'):
        generate_features(globals(), args.force)
Example #33
File: trainer.py  Project: colorjam/SR
    def test(self, epoch=10):
        self.ckp.write_log('=> Evaluation...')
        timer_test = utils.timer()
        upscale = self.args.upscale
        avg_psnr = {}
        avg_ssim = {}

        for scale in upscale:
            avg_psnr[scale] = 0.0
            avg_ssim[scale] = 0.0

        for iteration, (input, hr) in enumerate(self.loader_test, 1):

            has_target = type(hr) == list  # if test on demo

            if has_target:
                input, hr = self.prepare([input, hr])
            else:
                input = self.prepare([input])[0]

            sr = self.model(input)

            save_list = [*sr, input]

            if has_target:
                save_list.extend(hr)

                psnr = {}
                ssim = {}
                for i, scale in enumerate(upscale):
                    psnr[scale] = utils.calc_psnr(hr[i], sr[i], int(scale))
                    ssim[scale] = utils.calc_ssim(hr[i], sr[i])
                    avg_psnr[scale] += psnr[scale]
                    avg_ssim[scale] += ssim[scale]

            if self.args.save:
                if has_target:
                    for i, scale in enumerate(upscale):
                        self.ckp.write_log(
                            '=> Image{} PSNR_x{}: {:.4f}'.format(
                                iteration, scale, psnr[scale]))
                        self.ckp.write_log(
                            '=> Image{} SSIM_x{}: {:.4f}'.format(
                                iteration, scale, ssim[scale]))
                self.ckp.save_result(iteration, save_list)

        if has_target:
            for scale, value in avg_psnr.items():
                self.ckp.write_log("=> PSNR_x{}: {:.4f}".format(
                    scale, value / len(self.loader_test)))
                self.ckp.write_log("=> SSIM_x{}: {:.4f}".format(
                    scale, avg_ssim[scale] / len(self.loader_test)))

        self.ckp.write_log("=> Total time: {:.1f}s".format(timer_test.toc()))

        if not self.args.test:
            self.ckp.save_model(self.model, 'latest')
            cur_psnr = avg_psnr[upscale[-1]]
            if self.best_psnr < cur_psnr:
                self.best_psnr = cur_psnr
                self.best_epoch = epoch
                self.ckp.save_model(self.model,
                                    '{}_best'.format(self.best_epoch))
Example #34
    # Create model objects
    encoder = Encoder(qa_vocab_size, embedding_dim, units, BATCH_SIZE,
                      max_ques_length, embedding_matrix)
    decoder = Decoder(qa_vocab_size, embedding_dim, units, BATCH_SIZE,
                      max_ques_length, embedding_matrix)

    # Checkpoints (Object-based saving)
    checkpoint_dir = os.path.join(os.getcwd(), 'checkpoints')
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    checkpoint = tf.train.Checkpoint(encoder=encoder, decoder=decoder)

    # Restoring the latest checkpoint in checkpoint_dir
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

    # Get predictions
    start_time = timer()
    predicted_ans = predict(encoder,
                            decoder,
                            question_encoded,
                            max_ques_length,
                            max_ans_length,
                            word2id,
                            id2word,
                            units,
                            beam_search=False)
    print("---- Without Beam Search ----")
    print("Original Question:", question)
    print("Predicted Answer:", predicted_ans)
    timer(start_time)

    start_time = timer()
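This snippet uses timer() to grab a start time and timer(start_time) to report the elapsed time. A hedged two-mode sketch matching that usage, assuming it just prints the duration, might be:

from datetime import datetime

def timer(start_time=None):
    # Hypothetical sketch: with no argument, return the current time;
    # with a start time, print how long has passed since it.
    if start_time is None:
        return datetime.now()
    elapsed = (datetime.now() - start_time).total_seconds()
    print("Time taken: %.1f seconds" % elapsed)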
Example #35
def train_test_and_save_model():
    ## load data
    with utils.timer('Load data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3], axis= 0, ignore_index= True)
        DebugDir = '%s/debug' % config.DataBaseDir
        if(os.path.exists(DebugDir) == False):
            os.makedirs(DebugDir)
        #writer = pd.ExcelWriter('%s/raw.xlsx' % DebugDir)
        #data.to_excel(writer, index= False)
        #writer.close()
        del data_3, data_2, data_1
        gc.collect()

    X_raw_words = data['text'].apply(utils.cut)
    uni_words = list(set([w for rec in X_raw_words for w in rec]))
    word_dict = dict(zip(uni_words, range(len(uni_words))))
    X_words = []
    for rec in X_raw_words:
        new_rec = []
        for w in rec:
            new_rec.append(word_dict[w])
        X_words.append(new_rec)
    # X_words = np.array(X_words)
    y = np.array(data['label'])
    if N_GRAM is not None:
        X_words = np.array([augment_with_ngrams(x, VOCAB_SIZE, N_BUCKETS, n= N_GRAM) for x in X_words])

    print(X_words.shape)
    print(y.shape)
    print(X_words[:5])
    print(y[:5])

    final_train_pred = np.zeros(len(X_words))
    for s in range(config.train_times):
        s_start = time.time()
        train_pred = np.zeros(len(X_words))

        classifier = FastTextClassifier(
            vocab_size=VOCAB_SIZE + N_BUCKETS,
            embedding_size=EMBEDDING_SIZE,
            n_labels=2,
        )

        skf = StratifiedKFold(config.kfold, random_state=2018 * s, shuffle=False)

        for fold, (train_index, valid_index) in enumerate(skf.split(X_words, y)):
            X_train, X_valid = X_words[train_index], X_words[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]

            with tf.Session() as sess:
                sess.run(tf.local_variables_initializer())
                tl.layers.initialize_global_variables(sess)

                for epoch in range(N_EPOCH):
                    start_time = time.time()
                    print('Epoch %d/%d' % (epoch + 1, N_EPOCH))
                    for X_batch, y_batch in tl.iterate.minibatches(X_train, y_train, batch_size=BATCH_SIZE, shuffle=True):
                        sess.run(
                            classifier.train_op, feed_dict={
                                classifier.inputs: tl.prepro.pad_sequences(X_batch),
                                classifier.labels: y_batch,
                            }
                        )

                    valid_pred_proba = sess.run(
                        classifier.prediction_probs, feed_dict={
                            classifier.inputs: tl.prepro.pad_sequences(X_valid)
                        }
                    )[:,1]
                    valid_pred_label = utils.proba2label(valid_pred_proba)
                    valid_auc = roc_auc_score(y_valid, valid_pred_proba)
                    valid_precision = precision_score(y_valid, valid_pred_label)
                    valid_recall = recall_score(y_valid, valid_pred_label)
                    if(epoch == N_EPOCH - 1):
                        train_pred[valid_index] = valid_pred_proba

                    # valid_precision = sess.run(
                    #     classifier.precision, feed_dict={
                    #         classifier.inputs: tl.prepro.pad_sequences(X_valid),
                    #         classifier.labels: y_valid,
                    #     }
                    # )
                    # valid_recall = sess.run(
                    #     classifier.recall, feed_dict={
                    #         classifier.inputs: tl.prepro.pad_sequences(X_valid),
                    #         classifier.labels: y_valid,
                    #     }
                    # )
                    print('valid: auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (valid_auc, valid_precision, valid_recall, int(time.time() - start_time)))
                classifier.save(sess, MODEL_FILE_PATH)
            print('fold %s done!!!' % fold)
        auc = roc_auc_score(y, train_pred)
        precision = precision_score(y, utils.proba2label(train_pred))
        recall = recall_score(y, utils.proba2label(train_pred))
        print('auc %.6f, precision %.6f, recall %.6f, took %s[s]' % (auc, precision, recall, int(time.time() - s_start)))
Example #36
NAME = Path(__file__).stem
print(NAME)

feats = [
    'main_numeric', 'main_amount_pairwise', 'main_category',
    'main_ext_pairwise', 'bureau', 'prev', 'pos', 'credit',
    'pos_latest', 'credit_latest',
    'bureau_active_count', 'bureau_enddate', 'bureau_amount_pairwise', 'bureau_prolonged',
    'main_ext_null',
    'prev_basic', 'prev_category_count', 'prev_category_tfidf', 'prev_product_combination',
    'main_document', 'main_enquiry', 'main_day_pairwise', 'main_amount_per_person', 'main_ext_round',
    'inst_basic_direct', 'inst_basic_via_prev', 'inst_latest', 'inst_ewm', 'inst_basic_direct', 'inst_basic_via_prev'
]

with timer('load datasets'):
    X_train, y_train, X_test, _ = load_dataset(feats)
    cv = StratifiedKFold(5, shuffle=True, random_state=71)
    print('train:', X_train.shape)
    print('test :', X_test.shape)

lgb_params = {
    'n_estimators': 4000,
    'learning_rate': 0.05,
    'num_leaves': 34,
    'colsample_bytree': 0.95,
    'subsample': 0.85,
    'reg_alpha': 0.05,
    'reg_lambda': 0.075,
    'min_split_gain': 0.02,
    'min_child_weight': 40,
Example #37
            agg_primitives=[Sum, Std, Max, Min, Median, Count, PercentTrue, Trend, AvgTimeBetween],
            where_primitives=[Std, Max, Min, Median, Count],
            verbose=True,
            chunk_size=120,  # increase chunk_size to trade memory for time: higher RAM usage, shorter runtime
        )

        self.__train_feature.to_csv(os.path.join(self.__output_path, "train_pre_agg_df.csv"), index=True)


def main():
    with timer('init'):
        ftt = FeatureToolsTrainV1(
            input_path="/home/ubuntu/01-Home_credit/new_input",
            output_path="/home/ubuntu/01-Home_credit/output",

            # input_path='../new_input',
            # output_path='../output',

            debug=False
        )
    with timer('es set'):
        ftt.es_set()

    with timer('dfs run'):
        ftt.dfs_run()


if __name__ == "__main__":
    with timer('sum time'):
        main()
Example #38
def load_data(data_path):
    timer = utils.timer(name='main').tic()
    split_folder = os.path.join(data_path, 'warm')

    u_file = os.path.join(data_path, 'trained/warm/U.csv.bin')
    v_file = os.path.join(data_path, 'trained/warm/V.csv.bin')
    user_content_file = os.path.join(data_path, 'user_features_0based.txt')
    item_content_file = os.path.join(data_path, 'item_features_0based.txt')
    train_file = os.path.join(split_folder, 'train.csv')
    test_warm_file = os.path.join(split_folder, 'test_warm.csv')
    test_warm_iid_file = os.path.join(split_folder, 'test_warm_item_ids.csv')
    test_cold_user_file = os.path.join(split_folder, 'test_cold_user.csv')
    test_cold_user_iid_file = os.path.join(split_folder,
                                           'test_cold_user_item_ids.csv')
    test_cold_item_file = os.path.join(split_folder, 'test_cold_item.csv')
    test_cold_item_iid_file = os.path.join(split_folder,
                                           'test_cold_item_item_ids.csv')

    dat = {}
    # load preference data
    timer.tic()
    u_pref = np.fromfile(u_file, dtype=np.float32).reshape(n_users, 200)
    v_pref = np.fromfile(v_file, dtype=np.float32).reshape(n_items, 200)
    dat['u_pref'] = u_pref
    dat['v_pref'] = v_pref

    timer.toc('loaded U:%s,V:%s' %
              (str(u_pref.shape), str(v_pref.shape))).tic()

    # pre-process
    _, dat['u_pref_scaled'] = utils.prep_standardize(u_pref)
    _, dat['v_pref_scaled'] = utils.prep_standardize(v_pref)
    timer.toc('standardized U,V').tic()

    # load content data
    timer.tic()
    user_content, _ = datasets.load_svmlight_file(user_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    dat['user_content'] = user_content.tolil(copy=False)
    timer.toc('loaded user feature sparse matrix: %s' %
              (str(user_content.shape))).tic()
    item_content, _ = datasets.load_svmlight_file(item_content_file,
                                                  zero_based=True,
                                                  dtype=np.float32)
    dat['item_content'] = item_content.tolil(copy=False)
    timer.toc('loaded item feature sparse matrix: %s' %
              (str(item_content.shape))).tic()

    # load split
    timer.tic()
    train = pd.read_csv(
        train_file, delimiter=",", header=-1,
        dtype=np.int32).values.ravel().view(
            dtype=[('uid', np.int32), ('iid',
                                       np.int32), ('inter',
                                                   np.int32), ('date',
                                                               np.int32)])
    dat['user_indices'] = np.unique(train['uid'])
    timer.toc('read train triplets %s' % train.shape).tic()

    dat['eval_warm'] = data.load_eval_data(test_warm_file,
                                           test_warm_iid_file,
                                           name='eval_warm',
                                           cold=False,
                                           train_data=train)
    dat['eval_cold_user'] = data.load_eval_data(test_cold_user_file,
                                                test_cold_user_iid_file,
                                                name='eval_cold_user',
                                                cold=True,
                                                train_data=train)
    dat['eval_cold_item'] = data.load_eval_data(test_cold_item_file,
                                                test_cold_item_iid_file,
                                                name='eval_cold_item',
                                                cold=True,
                                                train_data=train)
    return dat
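load_data above relies on a tic/toc-style utils.timer whose tic() marks a start point and toc(message) reports the elapsed time, both returning self so calls can be chained. A hedged sketch of such a class, under those assumptions, is:

import time

class timer(object):
    def __init__(self, name='timer'):
        self.name = name
        self._start = time.time()

    def tic(self):
        # Mark a new start point and allow chaining.
        self._start = time.time()
        return self

    def toc(self, message=''):
        # Report the time since the last tic() and allow chaining.
        print('[%s] %s (%.2f s)' % (self.name, message, time.time() - self._start))
        return self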
Example #39
    # np.random.seed(4)
    eta = 7.0
    a = 1.1
    location = [0., 0., 1]
    theta = np.random.normal(0., 1., 4)
    orientation = Quaternion([1., 0., 0., 0.])
    r_vectors = 5 * a * np.random.rand(200, 3) + np.array([0., 0., 0.])  
    L = np.array([0., 0., 0.])

    # Generate random forces
    force = np.random.randn(len(r_vectors), 3) 

    # ================================================================
    # NO WALL TESTS
    # ================================================================
    timer('zz_no_wall_loops_full_matrix')
    mobility_no_wall_loops = mob.rotne_prager_tensor_loops(r_vectors, eta, a)
    u_no_wall_loops_full = np.dot(mobility_no_wall_loops, force.flatten())
    timer('zz_no_wall_loops_full_matrix')

    timer('zz_no_wall_full_matrix')
    mobility_no_wall = mob.rotne_prager_tensor(r_vectors, eta, a)
    u_no_wall_full = np.dot(mobility_no_wall, force.flatten())
    timer('zz_no_wall_full_matrix')

    u_no_wall_numba = mob.no_wall_mobility_trans_times_force_numba(r_vectors, force, eta, a)
    timer('zz_no_wall_numba')
    u_no_wall_numba = mob.no_wall_mobility_trans_times_force_numba(r_vectors, force, eta, a)
    timer('zz_no_wall_numba')

    if found_pycuda:
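The benchmarking code above calls timer('label') twice with the same label: the first call starts that named clock and the second stops it and reports the elapsed time. A hedged sketch of such a toggle-style timer, assuming it only prints, could be:

import time

_running = {}

def timer(label):
    # Hypothetical sketch: toggle a named stopwatch on the first call,
    # print its elapsed time and clear it on the second call.
    if label in _running:
        print('%s: %.4f s' % (label, time.time() - _running.pop(label)))
    else:
        _running[label] = time.time()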
Example #40
        image_label_list = [
            config.level_label_dict[config.level_zn_en[
                image_file_batch[i].split('/')[-2]]]
            for i in range(len(image_file_batch))
        ]

        return np.array(image_data_list), np.array(image_label_list)


if __name__ == '__main__':
    ''''''
    strategy = 'att_resnet'

    print('\n')
    # step 1: get image files
    with utils.timer('scan image files'):
        #image_dir = '{}/raw/色情图片已标记'.format(config.DataBaseDir)
        image_dir = '{}/raw/updated_1109'.format(config.DataBaseDir)
        jpg_image_files = glob.glob('{}/*/*.jpg'.format(image_dir))
        png_image_files = glob.glob('{}/*/*.png'.format(image_dir))
        image_files = jpg_image_files + png_image_files
        print('total image files {}'.format(len(image_files)))

    print('\n')
    # step 2: train/valid split
    with utils.timer('split'):
        shuffle(image_files)
        if ((config.debug == True) & (config.sampling_ratio < 1.0)):
            image_files = image_files[:int(config.sampling_ratio *
                                           len(image_files))]
            print('sampled {:.1f} percentage of dataset'.format(
Example #41
    def dr_wrt(self, wrt, reverse_mode=False, profiler=None):
        tm_dr_wrt = timer()
        self.called_dr_wrt = True
        self._call_on_changed()

        drs = []

        if wrt in self._cache['drs']:
            if DEBUG:
                if wrt not in self._cache_info:
                    self._cache_info[wrt] = 0
                self._cache_info[wrt] += 1
                self._status = 'cached'
            return self._cache['drs'][wrt]

        direct_dr = self._compute_dr_wrt_sliced(wrt)

        if direct_dr is not None:
            drs.append(direct_dr)

        if DEBUG:
            self._status = 'pending'

        propnames = set(_props_for(self.__class__))
        for k in set(self.dterms).intersection(
                propnames.union(set(self.__dict__.keys()))):

            p = getattr(self, k)

            if hasattr(p, 'dterms') and p is not wrt:

                indirect_dr = None

                if reverse_mode:
                    lhs = self._compute_dr_wrt_sliced(p)
                    if isinstance(lhs, LinearOperator):
                        tm_dr_wrt.pause()
                        dr2 = p.dr_wrt(wrt)
                        tm_dr_wrt.resume()
                        indirect_dr = lhs.matmat(dr2) if dr2 != None else None
                    else:
                        indirect_dr = p.lmult_wrt(lhs, wrt)
                else:  # forward mode
                    tm_dr_wrt.pause()
                    dr2 = p.dr_wrt(wrt, profiler=profiler)
                    tm_dr_wrt.resume()
                    if dr2 is not None:
                        indirect_dr = self.compute_rop(p, rhs=dr2)

                if indirect_dr is not None:
                    drs.append(indirect_dr)

        if len(drs) == 0:
            result = None
        elif len(drs) == 1:
            result = drs[0]
        else:
            # TODO: ????????
            # result = np.sum(x for x in drs)
            if not np.any([isinstance(a, LinearOperator) for a in drs]):
                result = reduce(lambda x, y: x + y, drs)
            else:
                result = LinearOperator(
                    drs[0].shape,
                    lambda x: reduce(lambda a, b: a.dot(x) + b.dot(x), drs))

        # TODO: figure out how/whether to do this.
        if result is not None and not sp.issparse(result):
            tm_nonzero = timer()
            nonzero = np.count_nonzero(result)
            if tm_nonzero() > 0.1:
                pif('count_nonzero in {}sec'.format(tm_nonzero()))
            if nonzero == 0 or hasattr(
                    result, 'size') and result.size / float(nonzero) >= 10.0:
                tm_convert_to_sparse = timer()
                result = sp.csc_matrix(result)
                import gc
                gc.collect()
                pif('converting result to sparse in {}sec'.format(
                    tm_convert_to_sparse()))

        if (result is not None) and (not sp.issparse(result)) and (
                not isinstance(result, LinearOperator)):
            result = np.atleast_2d(result)

        # When the number of parents is one, it indicates that
        # caching this is probably not useful because not
        # more than one parent will likely ask for this same
        # thing again in the same iteration of an optimization.
        #
        # When the number of parents is zero, this is the top
        # level object and should be cached; when it's > 1
        # cache the combinations of the children.
        #
        # If we *always* filled in the cache, it would require
        # more memory but would occasionally save a little cpu,
        # on average.
        if len(self._parents.keys()) != 1:
            self._cache['drs'][wrt] = result

        if DEBUG:
            self._status = 'done'

        if getattr(self, '_make_dense', False) and sp.issparse(result):
            result = result.todense()
        if getattr(self, '_make_sparse', False) and not sp.issparse(result):
            result = sp.csc_matrix(result)

        if tm_dr_wrt() > 0.1:
            pif('dx of {} wrt {} in {}sec, sparse: {}'.format(
                self.short_name, wrt.short_name, tm_dr_wrt(),
                sp.issparse(result)))

        return result
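dr_wrt also pauses and resumes its timer around recursive calls, so the chumpy timer is richer than the closure sketched after Example #21. A hedged class sketch supporting pause()/resume() and returning accumulated seconds when called, under the same assumptions, is:

import time

class timer(object):
    def __init__(self):
        self._elapsed = 0.0
        self._started = time.time()

    def pause(self):
        # Stop accumulating time until resume() is called.
        if self._started is not None:
            self._elapsed += time.time() - self._started
            self._started = None

    def resume(self):
        # Restart the clock after a pause().
        if self._started is None:
            self._started = time.time()

    def __call__(self):
        # Total accumulated seconds, excluding paused intervals.
        total = self._elapsed
        if self._started is not None:
            total += time.time() - self._started
        return total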
Example #42
    

    # Define the optimizer
    optimizer = torch.optim.Adam(generator.parameters(), lr=params.lr, betas=(params.beta1, params.beta2))
    
    # Define the scheduler
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=params.step_size, gamma = params.gamma)


    # Load model data
    if args.restore_from is not None :
        params.checkpoint = utils.load_checkpoint(restore_from, generator, optimizer, scheduler)
        logging.info('Model data loaded')

    #set the timer
    timer=utils.timer()

    # Train the model and save 
    if params.numIter != 0 :
        logging.info('Start training')   
        train(generator, optimizer, scheduler, eng, params)

    # Generate images and save 
    logging.info('Start generating devices')
    evaluate(generator, eng, numImgs=500, params=params)
    
    timer.out()
    writer.close()


Example #43
param = {
    #'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.2,
    'max_depth': 8,
    'silent': 1,
    'nthread': 4,
    'colsample_bytree': .4,
    'subsample': .9,
}

if __name__ == '__main__':
    ''''''
    ## load word2vec lookup table
    with utils.timer('Load word vector'):
        word2vec = tl.files.load_npy_to_any(
            name='%s/model/word2vec_post_text_3d.npy' % config.DataBaseDir)

    ## load data
    with utils.timer('Load data'):
        data_1 = utils.load_cs_deleted_data(cs_delete_file)
        print('target ratio: ')
        print(data_1['label'].value_counts())
        data_2 = utils.load_58_data(pos_58_file)
        print(data_2['label'].value_counts())
        data_3 = utils.load_58_data(neg_58_file)
        print(data_3['label'].value_counts())
        data = pd.concat([data_1, data_2, data_3], axis=0, ignore_index=True)
        DebugDir = '%s/debug' % config.DataBaseDir
        if not os.path.exists(DebugDir):
示例#44
0
def compute_terms(file_full, file_mean, file_out, file_global='means.nc'):
    """
    Return dataset with various zonal terms. Preserve a dimensionless
    """
    # Load datasets
    # TODO: Rename 'plev' to 'lev'
    # NOTE: GFDL doesn't use CF conventions right now.
    # See: https://github.com/xgcm/xgcm/issues/91
    timer()
    if os.path.exists(file_out):
        os.remove(file_out)
    data_full = nc4.Dataset(file_full, mode='r')
    data_mean = nc4.Dataset(file_mean, mode='r')
    data_global = nc4.Dataset(file_global, mode='r')
    data_out = nc4.Dataset(file_out, mode='w')
    copy_attrs(data_full, data_out, ignore=('NCO', 'filename', 'history'))
    for coord in ('time', 'plev', 'lat', 'lon', 'plev_bnds'):
        copy_variable(data_full, data_out, coord, singleton=coord == 'lon')

    # Coordinates and constants
    lat = data_full['lat'][:]
    rlat = np.pi * lat / 180.0
    zmass = vertical_mass(data_full)
    p0 = 100000.0
    R = 287.0
    a = 6371.0e3
    kappa = 0.286
    cp = 1005.7

    # Read full resolution
    # WARNING: Data might be stored in 32-bit but always compute in 64-bit
    # WARNING: Data is loaded from disk every time you use [:] indexing
    t = data_full['t'][:].astype('d')
    u = data_full['u'][:].astype('d')
    v = data_full['v'][:].astype('d')
    w = data_full['omega'][:].astype('d')
    z = data_full['z'][:].astype('d')
    q = data_full['tdt'][:].astype('d')
    udt = data_full['udt'][:].astype('d')
    vdt = data_full['vdt'][:].astype('d')

    # Read zonal means
    # NOTE: We follow CDO convention of preserving reduced longitude and latitude
    # dimensions with a dummy value (CDO uses zero, we use NaN).
    p = data_mean['plev'][:] * 100.0
    exner = (p[:, None, None] / p0) ** kappa
    t_bar = data_mean['t'][:]
    u_bar = data_mean['u'][:]
    v_bar = data_mean['v'][:]
    w_bar = data_mean['omega'][:]
    z_bar = data_mean['z'][:]
    q_bar = data_mean['tdt'][:]
    udt_bar = data_mean['udt'][:]
    vdt_bar = data_mean['vdt'][:]
    pt_bar = t_bar / exner

    # Read globally mppnccombined zonal means
    t_globe = data_global['t'][:]
    q_globe = data_global['tdt'][:]
    timer('  * Time for reading')

    # Zonal anomalies
    t_star = t - t_bar  # need both anomaly and average
    u_star = u - u_bar
    v_star = v - v_bar
    w_star = w - w_bar
    z_star = z - z_bar
    q_star = q - q_bar
    udt_star = udt - udt_bar
    vdt_star = vdt - vdt_bar

    # Global anomalies
    clat = np.cos(np.pi * data_global['lat'][:][:, None] / 180.0)
    t_globe = np.sum(t_globe * clat, axis=2, keepdims=True) / np.sum(clat)
    q_globe = np.sum(q_globe * clat, axis=2, keepdims=True) / np.sum(clat)
    pt_globe = t_globe / exner
    t_bar_anom = t_bar - t_globe
    w_bar_anom = w_bar  # true due to mass conservation
    q_bar_anom = q_bar - q_globe
    pt_bar_anom = pt_bar - pt_globe

    # Barotropic and baroclinic terms
    u_tropic = weighted_mean(u, zmass, axis=1)[:, 0, :, :]  # no height dimension
    v_tropic = weighted_mean(v, zmass, axis=1)[:, 0, :, :]
    u_clinic = u - u_tropic
    v_clinic = v - v_tropic
    u_tropic_bar = weighted_mean(u_tropic, zmass)
    v_tropic_bar = weighted_mean(v_tropic, zmass)
    u_clinic_bar = weighted_mean(u_clinic, zmass)
    v_clinic_bar = weighted_mean(v_clinic, zmass)
    u_tropic_star = u_tropic - weighted_mean(u_tropic, zmass)
    v_tropic_star = v_tropic - weighted_mean(v_tropic, zmass)
    u_clinic_star = u_clinic - weighted_mean(u_clinic, zmass)
    v_clinic_star = v_clinic - weighted_mean(v_clinic, zmass)

    # Stability factor -(theta / T) * (R / cp * p) * (dthetabar / dp)^-1
    # New way recognizing that t / theta == (p / p0)^kappa which means stability =
    # = -R / (cp * p * (dtheta / dp) * (t / theta))
    # = -kappa / ((dtheta / dp) * p * (p / p0)^kappa)
    denom = climo.deriv_uneven(p, pt_globe, axis=1, keepedges=True)
    denom = denom * exner * p[:, None, None]
    denom[denom == 0] = np.nan
    stab = -kappa / denom
    timer('  * Time for setup')

    # Eddy variances
    make_variable(
        data_out, 'tvar', weighted_mean(t_star ** 2, zmass),
        long_name='zonal temperature variance',
        units='K^2',
    )
    make_variable(
        data_out, 'uvar', weighted_mean(u_star ** 2, zmass),
        long_name='zonal zonal wind variance',
        units='m^2 / s^2',
    )
    make_variable(
        data_out, 'vvar', weighted_mean(v_star ** 2, zmass),
        long_name='zonal meridional wind variance',
        units='m^2 / s^2',
    )
    make_variable(
        data_out, 'zvar', weighted_mean(z_star ** 2, zmass),
        long_name='geopotential height variance',
        units='m^2',
    )
    timer('  * Time for variance terms')

    # Eddy fluxes
    make_variable(
        data_out, 'ehf', weighted_mean(t_star * v_star, zmass),
        long_name='eddy heat flux',
        units='K m / s',
    )
    make_variable(
        data_out, 'emf', weighted_mean(u_star * v_star, zmass),
        long_name='eddy momentum flux',
        units='m^2 / s^2',
    )
    make_variable(
        data_out, 'egf', weighted_mean(z_star * v_star, zmass),
        long_name='eddy geopotential flux',
        units='m^2 / s',
    )
    timer('  * Time for flux terms')

    # APE terms
    make_variable(
        data_out, 'pe', cp * stab * weighted_mean(t_star ** 2, zmass) / 2.0,
        long_name='eddy APE',
        units='J / kg',
    )
    make_variable(
        data_out, 'pm', cp * stab * t_bar_anom ** 2 / 2.0,
        long_name='mean APE',
        units='J / kg',
    )
    timer('  * Time for APE terms')

    # KE terms
    for prefix, suffix, u_bar_i, v_bar_i, u_star_i, v_star_i in (
        ('', '', u_bar, v_bar, u_star, v_star),
        ('baroclinic ', '_clinic', u_clinic_bar, v_clinic_bar, u_clinic_star, v_clinic_star),  # noqa: E501
        ('barotropic ', '_tropic', u_tropic_bar, v_tropic_bar, u_tropic_star, v_tropic_star),  # noqa: E501
    ):
        make_variable(
            data_out, 'ke' + suffix, weighted_mean(u_star_i ** 2 + v_star_i ** 2, zmass) / 2.0,  # noqa: E501
            long_name=prefix + 'eddy KE',
            units='J / kg',
        )
        make_variable(
            data_out, 'km' + suffix, (u_bar_i ** 2 + v_bar_i ** 2) / 2.0,
            long_name=prefix + 'mean KE',
            units='J / kg',
        )
    del u_tropic, v_tropic, u_tropic_star, v_tropic_star
    del u_clinic, v_clinic, u_clinic_star, v_clinic_star
    timer('  * Time for KE terms')

    # Generation terms
    # WARNING: These need a 'cp', unlike in definitions, because we have a heating
    # rate K/s rather than a forcing term J/s * kg.
    make_variable(
        data_out, 'gpe', cp * stab * weighted_mean(q_star * t_star, zmass),
        long_name='generation of eddy APE',
        units='W / kg',
    )
    make_variable(
        data_out, 'gpm', cp * stab * q_bar_anom * t_bar_anom,
        long_name='generation of mean APE',
        units='W / kg',
    )
    timer('  * Time for APE generation terms')

    # Dissipation terms
    # NOTE: the wind tendency is always negative; want energy going away to be positive
    make_variable(
        data_out, 'dke', -1.0 * weighted_mean(u_star * udt_star + v_star * vdt_star, zmass),  # noqa: E501
        long_name='dissipation of eddy KE',
        units='W / kg',
    )
    make_variable(
        data_out, 'dkm', -1.0 * (u_bar * udt_bar + v_bar * vdt_bar),
        long_name='dissipation of mean KE',
        units='W / kg',
    )
    timer('  * Time for dissipation terms')

    # Conversion from eddy APE to eddy KE, mean APE to mean KE
    # NOTE: This is also eddy adiabatic heating heat budget term! Do not store vertical
    # eddy heat flux separately because it can be easily backed out from this term.
    make_variable(
        data_out, 'cpeke', -1.0 * R * weighted_mean(w_star * t_star, zmass) / p[:, None, None],  # noqa: E501
        long_name='eddy APE conversion to eddy KE',
        units='W / kg',
    )
    make_variable(
        data_out, 'cpmkm', -1.0 * R * w_bar_anom * t_bar_anom / p[:, None, None],
        long_name='mean APE conversion to mean KE',
        units='W / kg',
    )
    timer('  * Time for APE/KE conversion terms')

    # Conversion from eddy KE to mean KE
    # NOTE: See Kim and Kim 2013 (CliDyn)
    clat = np.cos(rlat[:, None])
    tlat = np.tan(rlat[:, None])
    ckekm = (
        weighted_mean(u_star * v_star, zmass) * clat * climo.deriv_uneven(rlat * a, u_bar / clat, axis=2, keepedges=True)  # noqa: E501
        + weighted_mean(v_star ** 2, zmass) * climo.deriv_uneven(rlat * a, v_bar, axis=2, keepedges=True)  # noqa: E501
        + weighted_mean(u_star * w_star, zmass) * climo.deriv_uneven(p, u_bar, axis=1, keepedges=True)  # noqa: E501
        + weighted_mean(v_star * w_star, zmass) * climo.deriv_uneven(p, v_bar, axis=1, keepedges=True)  # noqa: E501
        - v_bar * weighted_mean(u_star ** 2, zmass) * tlat / a
    )
    make_variable(
        data_out, 'ckekm', ckekm,
        long_name='eddy KE conversion to mean KE',
        units='W / kg',
    )
    timer('  * Time for eddy KE conversion to mean KE')

    # Conversion from mean APE to eddy APE
    # NOTE: Use Oort definition here, way better than Kim formula
    dt_bar_dy = climo.deriv_uneven(rlat * a, t_bar, axis=2, keepedges=True)
    dpt_bar_dp = climo.deriv_uneven(p, pt_bar_anom, axis=1, keepedges=True)
    cpmpe = -1.0 * cp * stab * (
        dt_bar_dy * weighted_mean(t_star * v_star, zmass)
        + exner * dpt_bar_dp * weighted_mean(t_star * w_star, zmass)
    )
    make_variable(
        data_out, 'cpmpe', cpmpe,
        long_name='mean APE conversion to eddy APE',
        units='W / kg'
    )
    timer('  * Time for mean APE conversion to eddy APE')

    return data_out
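compute_terms above repeatedly calls a module-level timer('...') to report how
long each stage took since the previous call. That helper is not shown here;
the following is only a plausible sketch of a split-style timer with the same
calling pattern (the state handling and output format are assumptions).

import time

_last_tic = [None]

def timer(message=None):
    now = time.time()
    if message is not None and _last_tic[0] is not None:
        print('{}: {:.2f}s'.format(message, now - _last_tic[0]))
    _last_tic[0] = now

timer()                        # start the clock
time.sleep(0.1)                # stand-in for real work
timer('  * Time for reading')  # prints the time since the previous call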
示例#45
0
import time
import logging

from flask import Flask, render_template, request
from sentence_transformers import SentenceTransformer

from utils import timer
from dataset import Dataset
from sentence_similarity import SentenceSimilarity

app = Flask(__name__)
logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

dataset = timer(Dataset, 'data/quora/quora_example.txt')

sentence_sim = timer(SentenceSimilarity, dataset=dataset)


@app.route('/')
def home():
    return render_template('search.html')


#end def


@app.route('/search', methods=["GET", "POST"])
def search_request():
    query = request.form["input"]
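This example calls timer(Dataset, ...) and timer(SentenceSimilarity, ...), so
here the timer wraps a callable, times the call, and returns its result. The
real utils.timer is not shown; the following is only a plausible sketch of
such a wrapper, not the project's actual implementation.

import time
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def timer(func, *args, **kwargs):
    start = time.time()
    result = func(*args, **kwargs)
    logger.info('%s took %.2fs',
                getattr(func, '__name__', repr(func)), time.time() - start)
    return result

ordered = timer(sorted, [3, 1, 2])  # logs the elapsed time and returns [1, 2, 3]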
示例#46
0
import seaborn as sns
from sklearn.metrics import roc_auc_score
from tqdm import tqdm

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import generate_submit, load_dataset, send_line_notification
from category_encoders import TargetEncoder
from config import *
from utils import timer

sns.set_style('darkgrid')

NAME = Path(__file__).stem
print(NAME)

with timer('load datasets'):
    feats = ['main_numeric', 'main_days_to_years', 'main_days_pairwise', 'main_target_enc',
             'main_ext_source_pairwise', 'bureau', 'prev', 'pos', 'credit', 'inst', 'pos_latest', 'credit_latest']
    X_train, y_train, X_test, cv = load_dataset(feats)

with timer('generate money pairwise features'):
    money_cols = X_train.filter(regex='AMT_(?!REQ)(?!.*_min)').columns
    print(money_cols)
    l = len(list(itertools.combinations(money_cols, 2)))
    for i, j in tqdm(itertools.combinations(money_cols, 2), total=l):
        X_train[f'{i}_minus_{j}'] = X_train[i] - X_train[j]
        X_test[f'{i}_minus_{j}'] = X_test[i] - X_test[j]

print('train:', X_train.shape)
print('test :', X_test.shape)
# print('feats: ', X_train.columns.tolist())
示例#47
0
def build_useful_data():
    """
    # TODO: Build the feature files using PCA or LDA dimensionality reduction instead.
    Build the usable initial feature data. By default the raw competition data is stored
    in the 'datas' folder under the current directory.
    :return: usable data (a pd.DataFrame instance)
    """

    # Read the protein data
    with timer("Loading and merging data"):
        protein_train = pd.read_csv('datas/df_protein_train.csv')

        protein_test = pd.read_csv('datas/df_protein_test.csv')

        protein_all = pd.concat([protein_train, protein_test])

        # Add the protein sequence length as a feature
        protein_all['seq_len'] = protein_all['Sequence'].apply(len)

        # Read the molecule data
        mol_train = pd.read_csv('datas/df_molecule.csv')

        aff_train = pd.read_csv('datas/df_affinity_train.csv')

        aff_test = pd.read_csv('datas/df_affinity_test_toBePredicted.csv')

        # Initialize the Ki values to be predicted to -11
        aff_test['Ki'] = -11

        aff_all = pd.concat([aff_train, aff_test])

        data = aff_all.merge(mol_train, on="Molecule_ID", how='left')
        data = data.merge(protein_all, on='Protein_ID', how='left')

        # Get the protein IDs
        PID = list(protein_all["Protein_ID"])
    with timer("Processing wordcount1"):
        # Word-count features for word_length = 1
        _, word_counts1 = tfidf_and_wordcounts(protein_all,
                                               PID,
                                               word_length=1,
                                               stride=1)

    # Word-count features for word_length = 2
    with timer("Processing wordcount2"):
        _, word_counts2 = tfidf_and_wordcounts(protein_all,
                                               PID,
                                               word_length=2,
                                               stride=1)

        word_counts1_2 = word_counts1.merge(word_counts2,
                                            on="Protein_ID",
                                            how="left")
        # Save the feature file for later training
        word_counts1_2.to_csv("datas/1and2_1_421_protein_std.csv", index=False)

        del word_counts1_2, word_counts1, word_counts2

    with timer("Processing wordcount3"):
        _, word_count3 = tfidf_and_wordcounts(protein_all,
                                              PID,
                                              word_length=3,
                                              stride=1)

        word_count3_features = list(word_count3.columns)  # 8000-dimensional data; needs dimensionality reduction
        word_count3_features.remove("Protein_ID")

        # Reduce dimensionality via standard deviation: drop features whose std is below the 0.3 threshold
        new_word_count3 = reduce_dims_with_std(word_count3,
                                               word_count3_features,
                                               std_threshold=0.3)
        # Save the feature file for later training
        new_word_count3.to_csv("datas/3_1_protein_std_0.3.csv", index=False)
        del new_word_count3

        for i in range(len(word_count3_features) // 1000):
            # Split off 1000 features at a time and save them to a feature file for later training
            file = word_count3[["Protein_ID"] +
                               word_count3_features[i * 1000:(i + 1) * 1000]]
            file_name = "3_1_1000_protein_" + str(i)
            file.to_csv("datas/" + file_name + ".csv", index=False)

        del word_count3, word_count3_features

    with timer("Processing wordcount4"):
        gc.collect()
        _, word_count4 = tfidf_and_wordcounts(protein_all,
                                              PID,
                                              word_length=4,
                                              stride=1)

        word_count4_features = list(word_count4.columns)  # 140000+ dimensional data; needs dimensionality reduction
        word_count4_features.remove("Protein_ID")

        new_word_count4 = reduce_dims_with_pca(word_count4,
                                               word_count4_features,
                                               n_conponents=1000)
        new_word_count4.to_csv("datas/wordcount4_pca.csv", index=False)

        # Reduce dimensionality via standard deviation: drop features whose std is below the 0.15 threshold
        new_word_count4 = reduce_dims_with_std(word_count4,
                                               word_count4_features,
                                               std_threshold=0.15)
        new_word_count4.to_csv("datas/4_1_protein_std_0.15.csv", index=False)

        # Reduce dimensionality via standard deviation: drop features whose std is below the 0.12 threshold
        new_word_count4 = reduce_dims_with_std(word_count4,
                                               word_count4_features,
                                               std_threshold=0.12)

        word_count4_features = list(new_word_count4.columns)
        word_count4_features.remove("Protein_ID")

        for i in range(len(word_count4_features) // 1000):
            # Split off 1000 features at a time and save them to a feature file for later training
            file = new_word_count4[["Protein_ID"] +
                                   word_count4_features[i * 1000:(i + 1) *
                                                        1000]]
            file_name = "4_1_1000_protein_" + str(i)
            file.to_csv("datas/" + file_name + ".csv", index=False)

        del new_word_count4, word_count4

    # The features below are protein word-embedding features from the community (thanks to "小武哥").
    # Our final submission did not use them.
    "===================================== Word-embedding features ==========================================="
    # feat2 = protein_embedding(protein_all, word_length = 2)
    # data = data.merge(feat2, on="Protein_ID", how="left")
    # del feat2
    # feat3 = protein_embedding(protein_all, word_length = 3)
    # data = data.merge(feat3, on="Protein_ID", how="left")
    # del feat3
    # feat4 = protein_embedding(protein_all, word_length = 4)
    # data = data.merge(feat4, on="Protein_ID", how="left")
    # del feat4
    "================================================================================"

    with timer("分子指纹展开"):
        mol_fingerprints = list(mol_train["Fingerprint"].apply(
            lambda x: list(np.array(x.split(',')).astype(int))))
        mol_fingerprints = pd.DataFrame(
            mol_fingerprints,
            columns=["Fingerprint_" + str(i) for i in range(167)])
        mol_fingerprints["Molecule_ID"] = mol_train["Molecule_ID"]

    del PID
    "=================================================================================================="

    with timer("加入分子指纹和描述符"):
        data = data.merge(mol_fingerprints, on="Molecule_ID", how='left')
        mol_ECFP4 = pd.read_csv("datas/df_mol_ECFP4s_1024.csv")
        data = data.merge(mol_ECFP4, on="Molecule_ID")
        del mol_fingerprints, mol_ECFP4
        del data["Sequence"], protein_train, protein_test, mol_train

        data.reset_index(drop=True, inplace=True)
        data.to_csv("datas/original_data.csv", index=False)

        del data
        print("Useful data have builded")
示例#48
0
    'reg_lambda': 0.075,
    'min_split_gain': 0.02,
    'min_child_weight': 40,
    'random_state': 71,
    # 'boosting_type': 'dart',
    'silent': -1,
    'verbose': -1,
    'n_jobs': -1,
}
fit_params = {
    'eval_metric': 'auc',
    'early_stopping_rounds': 150,
    'verbose': 50
}

with timer('load datasets'):
    X_train, y_train, X_test, _ = load_dataset(feats)
    cv = StratifiedKFold(5, shuffle=True, random_state=71)
    print('train:', X_train.shape)
    print('test :', X_test.shape)

with timer('drop low importance feats'):
    ref = pd.read_csv(
        '/home/ubuntu/kaggle-home-credit/output/180615_014745_v32_credit_drawing/feats.csv',
        index_col=0,
        header=None)
    drop_cols = ref[1][ref[1] < 1].index
    drop_cols = drop_cols[drop_cols.isin(X_train.columns)]
    X_train.drop(drop_cols, axis=1, inplace=True)
    X_test.drop(drop_cols, axis=1, inplace=True)
    print('train:', X_train.shape)
示例#49
0
def _multi_query(
        sparql, timeout, graph_pattern, source_target_pairs,
        batch_size,
        _vars, _values, _ret_val_mapping,
        _res_init, _chunk_q, _chunk_res,
        _res_update=lambda r, u, **___: r.update(u),
        **kwds):
    if batch_size is None:
        batch_size = config.BATCH_SIZE
    _query_stats.multi_query_count[batch_size] += 1
    total_time = 0
    res = _res_init(source_target_pairs, **kwds)
    for val_chunk in chunker(_values, batch_size):
        _query_stats.multi_query_chunks[batch_size] += 1
        q = _chunk_q(graph_pattern, _vars, val_chunk, **kwds)
        chunk_stps = [stp for v in val_chunk for stp in _ret_val_mapping[v]]
        _start_time = timer()
        t = None
        chunk_res = None
        for retry in range(2, -1, -1):  # 3 attempts: 2, 1, 0
            if retry < 2:
                _query_stats.multi_query_retries[batch_size] += 1
            try:
                t, q_res = _query(sparql, timeout, q, **kwds)
                chunk_res = _chunk_res(
                    q_res, _vars, _ret_val_mapping, **kwds)
            except EndPointNotFound as e:
                # happens if the endpoint reports a 404...
                # virtuoso seems to report a 404 in rare cases, so retry
                # after some back-off, but give up once the retries are used
                if retry:
                    logger.info(
                        'SPARQL endpoint reports a 404, will retry in %ds',
                        config.ERROR_WAIT
                    )
                    sleep(config.ERROR_WAIT)
                    continue
                else:
                    logger.exception(
                        'SPARQL endpoint unreachable even after back-off '
                        'and retry\n'
                        'could not perform query:\n%s for %s\nException:',
                        q, val_chunk,
                    )
                    six.reraise(MultiQueryException, e, sys.exc_info()[2])
            except (SPARQLWrapperException, SAXParseException, URLError) as e:
                if (isinstance(e, SPARQLWrapperException) and
                        re.search(
                            r'The estimated execution time [0-9]+ \(sec\) '
                            r'exceeds the limit of [0-9]+ \(sec\)\.',
                            repr(e))):
                    t, chunk_res = timeout, {}
                elif len(val_chunk) > 1:
                    logger.debug('error in batch: {}'.format(val_chunk))
                    logger.debug('retrying with half size batch: {}...'.format(
                        len(val_chunk) // 2
                    ))
                    _query_stats.multi_query_splits[batch_size] += 1
                    t, chunk_res = _multi_query(
                        sparql, timeout, graph_pattern, chunk_stps,
                        len(val_chunk) // 2,
                        _vars, val_chunk, _ret_val_mapping,
                        _res_init, _chunk_q, _chunk_res,
                        _res_update,
                        **kwds)
                elif isinstance(e, URLError):
                    # we're down at single query level and still encounter an
                    # error. It is very likely that the endpoint is dead...
                    if retry:
                        logger.warning(
                            'URLError, seems we cannot reach SPARQL endpoint, '
                            'retry in %ds. Tried to perform query:\n'
                            '%s for %s\nException:',
                            config.ERROR_WAIT, q, val_chunk,
                            exc_info=1,  # appends exception to message
                        )
                        sleep(config.ERROR_WAIT)
                        continue
                    else:
                        logger.exception(
                            'URLError, seems we cannot reach SPARQL endpoint, '
                            'giving up after 3 retries. Tried to perform query:'
                            '\n%s for %s\nException:',
                            q, val_chunk,
                        )
                        six.reraise(MultiQueryException, e, sys.exc_info()[2])
                else:
                    logger.warning(
                        'could not perform query, replacing with 0 result:\n'
                        '%s for %s\nException:',
                        q, val_chunk,
                        exc_info=1,  # appends exception to message
                    )
                    t, chunk_res = timer() - _start_time, {}
            except Exception as e:
                if retry:
                    logger.warning(
                        'unhandled exception, retry in %ds:\n'
                        'Query:\n%s\nChunk:%r\nException:',
                        config.ERROR_WAIT, q, val_chunk,
                        exc_info=1,  # appends exception to message
                    )
                    sleep(config.ERROR_WAIT)
                    continue
                else:
                    logger.exception(
                        'unhandled exception, giving up after 3 retries:\n'
                        'Query:\n%s\nChunk:%r\nException:',
                        q, val_chunk,
                    )
                    six.reraise(MultiQueryException, e, sys.exc_info()[2])
            break
        _res_update(res, chunk_res, **kwds)
        total_time += t
        if query_time_soft_exceeded(total_time, timeout):
            logger.debug('early terminating batch query as timeout/2 exceeded')
            break
    return total_time, res
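_multi_query above walks _values in slices of batch_size via chunker(...) and
halves the batch size when the endpoint keeps failing. The chunker helper is
not shown; here is a minimal sketch consistent with that call signature (the
implementation is an assumption).

def chunker(iterable, size):
    items = list(iterable)
    for i in range(0, len(items), size):
        yield items[i:i + size]

print(list(chunker(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]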
示例#50
0
def cv_train_BoXHED2(train_data):
    # Define the output dictionary
    train_info_dict = {}

    # Preprocess the training data. THIS ONLY NEEDS TO BE DONE ONCE.
    boxhed_ = boxhed()  # Create an instance of BoXHED
    prep_timer = timer()  # Initialize timer
    # boxhed.preprocess():
    # Input:
    #      @ num_quantiles: the number of candidate split points to try for time and for each covariate.
    #                       The locations of the split points are based on the quantiles of the training data.
    #      @ is_cat:        a list of the column indexes that contain categorical data. The categorical data must be one-hot encoded.
    #                       For example, is_cat = [4,5,6] if a categorical variable with 3 factors is transformed into binary-valued columns 4,5,6
    #      @ weighted:      if set to True, the locations of the candidate split points will be based on weighted quantiles
    #                       (see Section 3.3 of the BoXHED 2.0 paper)
    #      @ nthreads:      number of CPU threads to use for preprocessing the data
    # Return:
    #      @ ID:            subject ID for each row in the processed data frames X, w, and delta
    #      @ X:             each row represents an epoch of the transformed data, and contains the values of the covariates as well as
    #                       its start time
    #      @ w:             length of each epoch
    #      @ delta:         equals one if an event occurred at the end of the epoch; zero otherwise
    ID, X, w, delta = boxhed_.preprocess(
        data=train_data,
        #is_cat       = [],
        num_quantiles=256,
        weighted=False,
        nthread=nthread_prep)
    train_info_dict["prep_time"] = prep_timer.get_dur(
    )  # calling the get_dur() function.

    # Perform K-fold cross-validation to select hyperparameters {tree depth, number of trees, learning rate} if do_CV = True.
    # Otherwise, users should manually specify hyperparameter values. Note that a tree of depth k has 2^k leaf nodes.
    do_CV = False
    param_manual = {'max_depth': 1, 'n_estimators': 200, 'eta': 0.1}

    # Specify the candidate values for the hyperparameters to cross-validate on (more trees and/or deeper trees may be needed for other datasets).
    param_grid = {
        'max_depth': [1, 2, 3, 4, 5],
        'n_estimators': [50, 100, 150, 200, 250, 300],
        'eta': [0.1]
    }

    # Next, specify:
    #      @ gpu_list:    the list of GPU IDs to use for training. Set gpu_list = [-1] to use CPUs.
    #      @ batch_size:  the maximum number of BoXHED2.0 instances trained at any point in time. Example: Performing
    #                     10-fold cross-validation using the param_grid above requires training 5*6*10 = 300
    #                     instances in total.
    #                           * When gpu_list = [-1], batch_size specifies the number of CPU threads to be used,
    #                             with each one training one instance at a time.
    #                           * When using GPUs, each GPU trains at most batch_size/len(gpu_list) instances at a time. Hence
    #                             if 2 GPUs are used and batch_size = 20, each GPU will train at most 10 instances at a time.
    gpu_list = [-1]
    batch_size = 20
    num_folds = 5
    if do_CV:
        cv_timer = timer()
        # Call the cv function to perform K-fold cross validation on the training set.
        # This outputs the cross validation results for the different hyperparameter combinations.
        # Return:
        #      @ cv_rslts:    mean and st.dev of the log-likelihood value for each hyperparameter combination
        #      @ best_params: The hyper-parameter combination where the mean log-likelihood value is maximized.
        #                     WE STRONGLY RECOMMEND AGAINST USING THIS COMBINATION. Instead, use the
        #                     one-standard-error rule to select the simplest model that is within st.dev/sqrt(k)
        #                     of the maximum log-likelihood value. See §7.10 in 'Elements of Statistical Learning'
        #                     by Hastie et al. (2009).
        cv_rslts, best_params = cv(param_grid, X, w, delta, ID, num_folds,
                                   gpu_list, batch_size)

        train_info_dict["CV_time"] = cv_timer.get_dur()
    else:
        best_params = param_manual
    best_params['gpu_id'] = gpu_list[0]  # Use the first GPU in the list for training
    best_params['nthread'] = nthread_train

    train_info_dict.update(best_params)
    boxhed_.set_params(**best_params)

    # Fit BoXHED to the training data
    fit_timer = timer()
    boxhed_.fit(X, delta, w)
    train_info_dict["fit_time"] = fit_timer.get_dur()

    return boxhed_, train_info_dict
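The comment above recommends the one-standard-error rule rather than taking
best_params directly. A small illustration of that rule on made-up
cross-validation results; the tuple layout and the tie-breaking order
(fewest trees, then shallowest) are my own choices for the sketch.

import math

cv_results = [  # (max_depth, n_estimators, mean_loglik, std_loglik)
    (1, 50,  -2.10, 0.05),
    (2, 150, -2.02, 0.06),
    (3, 300, -2.00, 0.07),
]
num_folds = 5
best = max(cv_results, key=lambda r: r[2])
threshold = best[2] - best[3] / math.sqrt(num_folds)     # within std / sqrt(k) of the best
simple_enough = [r for r in cv_results if r[2] >= threshold]
chosen = min(simple_enough, key=lambda r: (r[1], r[0]))  # simplest acceptable model
print(chosen)                                            # (2, 150, -2.02, 0.06)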
示例#51
0
#========================================================================

#========================================================================
# Result Box
is_oof = 1
result_list = []
score_list = []
oof_pred = np.zeros(len(tx_train))
test_pred = np.zeros(len(x_test))
#========================================================================

#========================================================================
# Train & Prediction Start
for fold_no, (trn_idx, val_idx) in enumerate(kfold):

    with utils.timer(f'Fold{fold_no} Train'):
        #========================================================================
        # Make Dataset
        X_train, y_train = tx_train[trn_idx, :], y[trn_idx]
        X_val, y_val = tx_train[val_idx, :], y[val_idx]

        print(X_train.shape, X_val.shape)
        print(f"Target Min --- Train: {y_train.min()} Valid: {y_val.min()}")
        print(
            f"Target Min Count --- Train: {np.sum(y_train==y_train.min())} Valid: {np.sum(y_val==y_val.min())}"
        )

        model = build_model(max_length=max_length,
                            nb_words=nb_words,
                            embedding_size=embedding_size)
        model.fit(x=X_train,
示例#52
0
        dataset ([BasicDataset])
        recmodel ([PairWiseModel])

    Returns:
        [tensor]: Vector of negitems, shape (batch_size, ) 
                  corresponding to batch_users
    """
    dns_k = world.DNS_K
    with torch.no_grad():

        scores = userAndMatrix(batch_users, batch_neg, recmodel)

        _, top1 = scores.max(dim=1)
        idx = torch.arange(len(batch_users)).to(world.DEVICE)
        negitems = batch_neg[idx, top1]
    return negitems


if __name__ == "__main__":
    method = UniformSample_DNS
    from register import dataset
    from utils import timer
    for i in range(1):
        with timer():
            # S = method(dataset, 1)
            S = UniformSample_original(dataset)
            print(len(S[S >= dataset.m_items]))
            S = torch.from_numpy(S).long()
            print(len(S[S >= dataset.m_items]))
        print(timer.get())
示例#53
0
        for wi in xopt:
            print '%.8f' % wi,
        print ''
        sys.stdout.flush()


def select_params_with_de(ks):
    for k in ks:
        MODEL_PARAM['k'] = k
        args = (trn_xs, trn_ys, tst_xs, tst_ys, MODEL_NAME, MODEL_PARAM, MODEL_REPEAT)
        # print '@@@@@@@@@@@@', MODEL_PARAM['k']
        ret = differential_evolution(objective_de, bounds=bounds, args=args, maxiter=3)
        # print xopt, fopt
        # print 'result:'
        print k, ret.success, ret.fun,
        for xi in ret.x:
            print xi,
        print ''
        # print ret.message
        sys.stdout.flush()


if __name__ == '__main__':

    l = int(sys.argv[1])
    u = int(sys.argv[2])

    timer(select_params_with_pso, ks=range(l, u))
    # timer(select_params_with_de, ks=range(l, u))
    # select_params_with_pso(range(5, 16))
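select_params_with_de wraps scipy's differential_evolution around a
model-based objective that is not shown here. A self-contained sketch of the
same call pattern, with a toy quadratic standing in for objective_de:

from scipy.optimize import differential_evolution

def objective(x, shift):
    return (x[0] - shift) ** 2 + (x[1] + shift) ** 2

bounds = [(-5.0, 5.0), (-5.0, 5.0)]
ret = differential_evolution(objective, bounds=bounds, args=(1.5,), maxiter=50)
print(ret.success, ret.fun, ret.x)  # converges near (1.5, -1.5)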
示例#54
0
    # config
    RANDOM_STATE = 99
    SHUFFLE = True
    TEST_SIZE = 0.50
    # get args
    args = parse_args()
    datapath = args.datapath
    model = args.model
    pretrained = args.pretrained
    cv = args.cv

    t0 = time.time()
    # 1. import module
    module = __import__(model)
    # 2. load and preprocess data
    with timer("Load and Preprocess"):
        df_train, _, X_train, _ = load_and_preprocess(datapath, module)
    # 3. fit and eval
    with timer('Fitting and Validating'):
        if cv == 2:
            X_t, X_v, y_t, y_v = train_test_split(X_train,
                                                  df_train.target,
                                                  test_size=TEST_SIZE,
                                                  random_state=RANDOM_STATE,
                                                  shuffle=SHUFFLE,
                                                  stratify=df_train.target)
            best_thres, df_score = fit_and_eval(X_t, y_t, X_v, y_v, module,
                                                pretrained)  # noqa
            filepath = os.path.join(datapath, 'eval_{}.csv'.format(model))
            df_score.to_csv(filepath)
            print('Save CV score file to {}'.format(filepath))


示例#55
0
if __name__ == '__main__':
  print('# Start')

  N = 100
  a = 1.1
  b = 7
  eps = 3.92
  L = np.array([0.0, 0.0, 0.0])
  r_vectors = np.random.randn(N, 3)

  if found_pycuda:
    force_pycuda = forces_pycuda.calc_blob_blob_forces_pycuda(r_vectors, blob_radius=a, debye_length=b, repulsion_strength=eps, periodic_length=L)
    timer('pycuda')
    force_pycuda = forces_pycuda.calc_blob_blob_forces_pycuda(r_vectors, blob_radius=a, debye_length=b, repulsion_strength=eps, periodic_length=L)
    timer('pycuda')
    
  force_numba = forces_numba.calc_blob_blob_forces_numba(r_vectors, blob_radius=a, debye_length=b, repulsion_strength=eps, periodic_length=L)
  timer('numba')
  force_numba = forces_numba.calc_blob_blob_forces_numba(r_vectors, blob_radius=a, debye_length=b, repulsion_strength=eps, periodic_length=L)
  timer('numba')

  timer('python')
  force_python = mbf.calc_blob_blob_forces_python(r_vectors, blob_radius=a, debye_length=b, repulsion_strength=eps, periodic_length=L)
  timer('python')

  if found_boost:
    timer('boost')
    force_boost = mbf.calc_blob_blob_forces_boost(r_vectors, blob_radius=a, debye_length=b, repulsion_strength=eps, periodic_length=L)
示例#56
0
from keras import backend as K
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import callbacks
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

from keras import initializers
from keras.engine import InputSpec, Layer
from keras import backend as K


base = utils.read_df_pkl(path='../input/base_Av*')
if is_make:
    #========================================================================
    # Dataset Load
    with utils.timer('Download Train and Test Data.\n'):
        train, test = MS_utils.get_dataset(base=base, feat_path='../features/4_winner/*.gz', is_cat_encode=False)


        nlp_cols = [
            'Engine'
            ,'OSVersion'
            ,'AppVersion'
            ,'AvSigVersion'
            ,'SkuEdition'
            ,'SmartScreen'
            ,'Census_OSArchitecture'
            ,'AVProductStatesIdentifier'
            ,'AVProductsInstalled'
            ,'CountryIdentifier'
            ,'CityIdentifier'
示例#57
0
sns.set_style('darkgrid')

feats = [
    'main_numeric', 'main_days_to_years', 'main_days_pairwise',
    'main_money_pairwise', 'main_category', 'main_ext_source_pairwise',
    'bureau', 'prev', 'pos', 'credit', 'inst', 'prev_latest', 'pos_latest',
    'credit_latest', 'inst_latest', 'bureau_active_and_type_product',
    'bureau_active_count', 'bureau_enddate', 'bureau_amount_pairwise',
    'bureau_prolonged', 'main_ext_null'
]
rank_average = False

NAME = Path(__file__).stem
print(NAME)

with timer('load datasets'):
    X_train, y_train, X_test, _ = load_dataset(feats)
    cv = StratifiedKFold(5, shuffle=True, random_state=71)
    print('train:', X_train.shape)
    print('test :', X_test.shape)
    # print('feats: ', X_train.columns.tolist())

lgb_params = {
    'n_estimators': 4000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'min_split_gain': 0.01,
示例#58
0
feats = [
    'main_numeric', 'main_days_to_years', 'main_days_pairwise',
    'main_money_pairwise', 'main_category', 'main_ext_source_pairwise',
    'bureau', 'prev', 'pos', 'credit', 'inst', 'pos_latest', 'credit_latest',
    'inst_latest', 'bureau_active_count', 'bureau_enddate',
    'bureau_amount_pairwise', 'bureau_prolonged', 'main_ext_null',
    'prev_basic', 'prev_category_count', 'prev_category_tfidf',
    'main_document', 'main_enquiry'
]
rank_average = False
use_cache = True

NAME = Path(__file__).stem
print(NAME)

with timer('load datasets'):
    X_train, y_train, X_test, _ = load_dataset(feats)
    cv = StratifiedKFold(5, shuffle=True, random_state=71)
    print('train:', X_train.shape)
    print('test :', X_test.shape)
    # print('feats: ', X_train.columns.tolist())


def get_denoising_autoencoders(X_train, hiddens=None, drop_ratio=.15):
    hiddens = hiddens if hiddens else [500]
    x_in = Input((X_train.shape[1], ), name='input')
    h = Dropout(drop_ratio)(x_in)
    for i, dim in enumerate(hiddens):
        h = Dense(dim, activation='relu', name=f'hidden_{i}')(h)
    x_out = Dense(X_train.shape[1], activation='linear', name='out')(h)
    model = Model(x_in, x_out)
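The builder above stops at constructing the Model. Below is a hedged sketch of
how such a denoising autoencoder is typically compiled and fit: the network
reconstructs the clean input from its Dropout-corrupted version, so the input
is also the target. The random data and training settings are placeholders.

import numpy as np
from keras.layers import Input, Dense, Dropout
from keras.models import Model

X = np.random.rand(256, 50).astype('float32')  # stand-in feature matrix

x_in = Input((X.shape[1],))
h = Dropout(0.15)(x_in)
h = Dense(500, activation='relu')(h)
x_out = Dense(X.shape[1], activation='linear')(h)
dae = Model(x_in, x_out)
dae.compile(optimizer='adam', loss='mse')
dae.fit(X, X, epochs=5, batch_size=64)  # target == input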
示例#59
0
                tar.extractall(path=dst)
                tar.close()
        except tarfile.ReadError:
            check_create_folder(dst)
            subprocess.check_call(["tar", "-xf", src, "-C", dst])

    def _get_full_filename(self, band):

        base_file = "%s_B%s.*" % (self.scene, band)
        try:
            return glob.glob(join(self.scene_path, base_file))[0].split("/")[-1]
        except IndexError:
            raise FileDoesNotExist("%s does not exist" % "%s_B%s.*" % (self.scene, band))

    def _check_if_zipped(self, path):
        """ Checks if the filename shows a tar/zip file """
        filename = get_file(path).split(".")

        if filename[-1] in ["bz", "bz2"]:
            return True

        return False


if __name__ == "__main__":

    with timer():
        p = Process(sys.argv[1])

        print p.run(sys.argv[2] == "t")
示例#60
0
def main():
    data_path = args.data_dir
    checkpoint_path = args.checkpoint_path
    tb_log_path = args.tb_log_path
    model_select = args.model_select

    rank_out = args.rank
    user_batch_size = 1000
    n_scores_user = 2500
    data_batch_size = 100
    dropout = args.dropout
    recall_at = range(50, 550, 50)
    eval_batch_size = 1000
    max_data_per_step = 2500000
    eval_every = args.eval_every
    num_epoch = 10

    _lr = args.lr
    _decay_lr_every = 50
    _lr_decay = 0.1

    experiment = '%s_%s' % (
        datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S'), '-'.join(
            str(x / 100) for x in model_select) if model_select else 'simple')
    _tf_ckpt_file = None if checkpoint_path is None else checkpoint_path + experiment + '/tf_checkpoint'

    print('running: ' + experiment)

    dat = load_data(data_path)
    u_pref_scaled = dat['u_pref_scaled']
    v_pref_scaled = dat['v_pref_scaled']
    eval_warm = dat['eval_warm']
    eval_cold_user = dat['eval_cold_user']
    eval_cold_item = dat['eval_cold_item']
    user_content = dat['user_content']
    item_content = dat['item_content']
    u_pref = dat['u_pref']
    v_pref = dat['v_pref']
    user_indices = dat['user_indices']

    timer = utils.timer(name='main').tic()

    # append pref factors for faster dropout
    v_pref_expanded = np.vstack(
        [v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])])
    v_pref_last = v_pref_scaled.shape[0]
    u_pref_expanded = np.vstack(
        [u_pref_scaled, np.zeros_like(u_pref_scaled[0, :])])
    u_pref_last = u_pref_scaled.shape[0]
    timer.toc('initialized numpy data for tf')

    # prep eval
    eval_batch_size = eval_batch_size
    timer.tic()
    eval_warm.init_tf(u_pref_scaled, v_pref_scaled, user_content, item_content,
                      eval_batch_size)
    timer.toc('initialized eval_warm for tf').tic()
    eval_cold_user.init_tf(u_pref_scaled, v_pref_scaled, user_content,
                           item_content, eval_batch_size)
    timer.toc('initialized eval_cold_user for tf').tic()
    eval_cold_item.init_tf(u_pref_scaled, v_pref_scaled, user_content,
                           item_content, eval_batch_size)
    timer.toc('initialized eval_cold_item for tf').tic()

    dropout_net = model.DeepCF(latent_rank_in=u_pref.shape[1],
                               user_content_rank=user_content.shape[1],
                               item_content_rank=item_content.shape[1],
                               model_select=model_select,
                               rank_out=rank_out)

    config = tf.ConfigProto(allow_soft_placement=True)

    with tf.device(args.model_device):
        dropout_net.build_model()

    with tf.device(args.inf_device):
        dropout_net.build_predictor(recall_at, n_scores_user)

    with tf.Session(config=config) as sess:
        tf_saver = None if _tf_ckpt_file is None else tf.train.Saver()
        train_writer = None if tb_log_path is None else tf.summary.FileWriter(
            tb_log_path + experiment, sess.graph)
        tf.global_variables_initializer().run()
        tf.local_variables_initializer().run()
        timer.toc('initialized tf')

        row_index = np.copy(user_indices)
        n_step = 0
        best_cold_user = 0
        best_cold_item = 0
        best_warm = 0
        n_batch_trained = 0
        best_step = 0
        for epoch in range(num_epoch):
            np.random.shuffle(row_index)
            for b in utils.batch(row_index, user_batch_size):
                n_step += 1
                # prep targets
                target_users = np.repeat(b, n_scores_user)
                target_users_rand = np.repeat(np.arange(len(b)), n_scores_user)
                target_items_rand = [
                    np.random.choice(v_pref.shape[0], n_scores_user) for _ in b
                ]
                target_items_rand = np.array(target_items_rand).flatten()
                target_ui_rand = np.transpose(
                    np.vstack([target_users_rand, target_items_rand]))
                [target_scores, target_items, random_scores] = sess.run(
                    [
                        dropout_net.tf_topk_vals, dropout_net.tf_topk_inds,
                        dropout_net.preds_random
                    ],
                    feed_dict={
                        dropout_net.U_pref_tf: u_pref[b, :],
                        dropout_net.V_pref_tf: v_pref,
                        dropout_net.rand_target_ui: target_ui_rand
                    })
                # merge topN and randomN items per user
                target_scores = np.append(target_scores, random_scores)
                target_items = np.append(target_items, target_items_rand)
                target_users = np.append(target_users, target_users)

                tf.local_variables_initializer().run()
                n_targets = len(target_scores)
                perm = np.random.permutation(n_targets)
                n_targets = min(n_targets, max_data_per_step)
                data_batch = [(n, min(n + data_batch_size, n_targets))
                              for n in xrange(0, n_targets, data_batch_size)]
                f_batch = 0
                for (start, stop) in data_batch:
                    batch_perm = perm[start:stop]
                    batch_users = target_users[batch_perm]
                    batch_items = target_items[batch_perm]
                    if dropout != 0:
                        n_to_drop = int(np.floor(dropout * len(batch_perm)))
                        perm_user = np.random.permutation(
                            len(batch_perm))[:n_to_drop]
                        perm_item = np.random.permutation(
                            len(batch_perm))[:n_to_drop]
                        batch_v_pref = np.copy(batch_items)
                        batch_u_pref = np.copy(batch_users)
                        batch_v_pref[perm_user] = v_pref_last
                        batch_u_pref[perm_item] = u_pref_last
                    else:
                        batch_v_pref = batch_items
                        batch_u_pref = batch_users

                    _, _, loss_out = sess.run(
                        [
                            dropout_net.preds, dropout_net.updates,
                            dropout_net.loss
                        ],
                        feed_dict={
                            dropout_net.Uin:
                            u_pref_expanded[batch_u_pref, :],
                            dropout_net.Vin:
                            v_pref_expanded[batch_v_pref, :],
                            dropout_net.Ucontent:
                            user_content[batch_users, :].todense(),
                            dropout_net.Vcontent:
                            item_content[batch_items, :].todense(),
                            #
                            dropout_net.target:
                            target_scores[batch_perm],
                            dropout_net.lr_placeholder:
                            _lr,
                            dropout_net.phase:
                            1
                        })
                    f_batch += loss_out
                    if np.isnan(f_batch):
                        raise Exception('f is nan')

                n_batch_trained += len(data_batch)
                if n_step % _decay_lr_every == 0:
                    _lr = _lr_decay * _lr
                    print('decayed lr:' + str(_lr))
                if n_step % eval_every == 0:
                    recall_warm = utils.batch_eval_recall(
                        sess,
                        dropout_net.eval_preds_warm,
                        eval_feed_dict=dropout_net.get_eval_dict,
                        recall_k=recall_at,
                        eval_data=eval_warm)
                    recall_cold_user = utils.batch_eval_recall(
                        sess,
                        dropout_net.eval_preds_cold,
                        eval_feed_dict=dropout_net.get_eval_dict,
                        recall_k=recall_at,
                        eval_data=eval_cold_user)
                    recall_cold_item = utils.batch_eval_recall(
                        sess,
                        dropout_net.eval_preds_cold,
                        eval_feed_dict=dropout_net.get_eval_dict,
                        recall_k=recall_at,
                        eval_data=eval_cold_item)

                    # checkpoint
                    if np.sum(recall_warm + recall_cold_user +
                              recall_cold_item) > np.sum(best_warm +
                                                         best_cold_user +
                                                         best_cold_item):
                        best_cold_user = recall_cold_user
                        best_cold_item = recall_cold_item
                        best_warm = recall_warm
                        best_step = n_step
                        if tf_saver is not None:
                            tf_saver.save(sess, _tf_ckpt_file)

                    timer.toc('%d [%d]b [%d]tot f=%.2f best[%d]' %
                              (n_step, len(data_batch), n_batch_trained,
                               f_batch, best_step)).tic()
                    print('\t\t\t' + ' '.join([('@' + str(i)).ljust(6)
                                               for i in recall_at]))
                    print('warm start\t%s\ncold user\t%s\ncold item\t%s' %
                          (' '.join(['%.4f' % i for i in recall_warm]),
                           ' '.join(['%.4f' % i for i in recall_cold_user]),
                           ' '.join(['%.4f' % i for i in recall_cold_item])))
                    summaries = []
                    for i, k in enumerate(recall_at):
                        if k % 100 == 0:
                            summaries.extend([
                                tf.Summary.Value(tag="recall@" + str(k) +
                                                 " warm",
                                                 simple_value=recall_warm[i]),
                                tf.Summary.Value(
                                    tag="recall@" + str(k) + " cold_user",
                                    simple_value=recall_cold_user[i]),
                                tf.Summary.Value(
                                    tag="recall@" + str(k) + " cold_item",
                                    simple_value=recall_cold_item[i])
                            ])
                    recall_summary = tf.Summary(value=summaries)
                    if train_writer is not None:
                        train_writer.add_summary(recall_summary, n_step)
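The training loop above implements input dropout by appending a single
all-zero row to each preference matrix and pointing a random subset of the
batch indices at that row. A small numpy sketch of that indexing trick in
isolation (the shapes and values below are made up):

import numpy as np

v_pref_scaled = np.arange(12, dtype=float).reshape(4, 3)  # 4 items, rank 3
v_pref_expanded = np.vstack([v_pref_scaled, np.zeros_like(v_pref_scaled[0, :])])
v_pref_last = v_pref_scaled.shape[0]                      # index of the zero row

batch_items = np.array([0, 2, 3, 1, 2])
dropout = 0.4
n_to_drop = int(np.floor(dropout * len(batch_items)))
drop_idx = np.random.permutation(len(batch_items))[:n_to_drop]
batch_v_pref = np.copy(batch_items)
batch_v_pref[drop_idx] = v_pref_last                      # these rows become zeros
print(v_pref_expanded[batch_v_pref, :])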