Example #1
def main(args: argparse.Namespace):
    label, *_ = data.read_csv(args.label)
    pred, *_ = data.read_csv(args.prediction)

    # Build the error criterion selected by --method
    critic = criterion.get(args.method)()
    result = critic(label[:, 2], pred[:, 2])

    print(f'{args.method}: {result}')
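
criterion.get comes from the project's own criterion module; purely as a hedged illustration, a hypothetical criterion with the same call shape could look like this (the class below is an assumption, not the project's code):

import numpy as np

class RMSE:
    """Hypothetical stand-in: root-mean-square error between two 1-D arrays."""

    def __call__(self, label, pred):
        label = np.asarray(label, dtype=np.float64)
        pred = np.asarray(pred, dtype=np.float64)
        return float(np.sqrt(np.mean((label - pred) ** 2)))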
Example #2
def main(args: argparse.Namespace):
    result = Path(args.result)
    result.mkdir(exist_ok=True, parents=True)

    # Read whole csv file
    content, header = data.read_csv(args.dataset)

    # Split query
    for query in args.split:
        try:
            # Each query must include {name, begin condition, end condition}
            name, begin, end = query.split(',')
            begin, end = map(int, (begin, end))

            cropped = content[(begin <= content[:, -1])
                              & (content[:, -1] <= end)]
            data.to_csv(str(result.joinpath(f'{name}.csv')),
                        cropped,
                        header=header)

        except ValueError as e:
            print(f'Error: {e}')
            print(
                'format mismatch, each query must be [name,begin_condition,end_condition]'
            )
            print(f'But got `{query}`')
            continue
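
The crop keeps every row whose last column falls inside [begin, end]; the same boolean-mask idea on a toy NumPy array (values invented for illustration):

import numpy as np

content = np.array([[1, 10], [2, 25], [3, 40]])  # toy rows, last column is the condition
begin, end = 20, 45
cropped = content[(begin <= content[:, -1]) & (content[:, -1] <= end)]
print(cropped)  # keeps the rows whose condition values are 25 and 40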
Example #3
def main(args: argparse.Namespace):
    # Reproducibility (important):
    # an experiment that cannot be reproduced cannot support any conclusions,
    # so fix the random seed before anything else.
    seed(args.seed)

    # Load dataset
    # Two loading paths are supported:
    # - load the whole csv and split it into train/test by condition (slow)
    # - load separate train and test csv files (faster)
    #   (use scripts/split.py to split by condition beforehand)
    if args.dataset:
        dataset = data.Dataset(args.dataset)
        train, test = dataset.split_train_test(args.mode)
        test_header = dataset.rating_headers

    else:
        train, train_header = data.read_csv(args.train)
        test, test_header = data.read_csv(args.test)

    # Build the error criterion selected by --criterion
    critic = criterion.get(args.criterion)()

    # Fit model, using train data
    model = Recommender(factors=args.factor,
                        epochs=args.epoch,
                        mean=args.mean,
                        derivation=args.dev,
                        lr=args.lr,
                        reg=args.reg)
    model.fit(train[:, :2], train[:, 2])

    # Predict by test data and calculate error
    predictions = model.predict(test[:, :2])
    error = critic(predictions, test[:, 2])
    print(f'{args.criterion}: {error}')

    # Save predictions
    test[:, 2] = predictions
    data.to_csv(args.result, test, header=test_header)
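Example #4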
def main(args: argparse.Namespace):
    global train, test, test_header

    # Reproducibility (important):
    # an experiment that cannot be reproduced cannot support any conclusions,
    # so fix the random seed before anything else.
    seed(args.seed)

    # Load dataset
    # Two loading paths are supported:
    # - load the whole csv and split it into train/test by condition (slow)
    # - load separate train and test csv files (faster)
    #   (use scripts/split.py to split by condition beforehand)
    if args.dataset:
        dataset = data.Dataset(args.dataset)
        train, test = dataset.split_train_test(args.mode)
        test_header = dataset.rating_headers

    else:
        train, train_header = data.read_csv(args.train)
        test, test_header = data.read_csv(args.test)

    # Enumerate every parameter combination in the search space
    params = list(product(*param_space.values()))

    if args.size:
        indexes = np.random.choice(len(params), args.size, replace=False)
        params = [params[i] for i in indexes]

    print(f'Search space: {len(params)}')
    print(f'Param: {param_space}')

    with Pool(args.cpu or cpu_count()) as pool:
        results = pool.map(wrapper, params)

    best, *_ = sorted(results, key=lambda x: x[1])
    print(f'Best RMSE: {best[1]}')
    print(f'param: {best[0]}')
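
param_space and wrapper are defined elsewhere in this script; a minimal sketch of how they plausibly fit together, with every name and value below assumed rather than taken from the original:

import numpy as np

# Hypothetical search space: each key maps to the candidate values to try.
param_space = {
    'factor': [16, 32, 64],
    'lr': [0.001, 0.005],
    'reg': [0.01, 0.02],
}

def wrapper(param):
    # Train and score one combination so it can run inside pool.map.
    factor, lr, reg = param
    model = Recommender(factors=factor, epochs=20, mean=0.0, derivation=0.1,
                        lr=lr, reg=reg)
    model.fit(train[:, :2], train[:, 2])
    preds = model.predict(test[:, :2])
    rmse = float(np.sqrt(np.mean((preds - test[:, 2]) ** 2)))
    return param, rmse  # sorted on the second element to pick the best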
Example #5
                    help='year of data to play, default=2018')
parser.add_argument('--commission',
                    type=float,
                    default=DEFAULT_COMMISSION,
                    help='commission size, default=0.00025')
parser.add_argument('--cuda',
                    default=False,
                    action='store_true',
                    help='enable cuda')
args = parser.parse_args()

device = 'cuda' if args.cuda else 'cpu'

try:
    from lib import data
    play_data = data.read_csv(file_name='data/000001_%d.csv' % args.year)
except ModuleNotFoundError:
    play_data = (pd.read_csv('data/prices_%d.csv' % args.year, index_col=0),
                 pd.read_csv('data/factors_%d.csv' % args.year, index_col=0))

env = environ.StockEnv(play_data,
                       bars_count=BARS_COUNT,
                       commission=args.commission,
                       reset_on_sell=False,
                       random_ofs_on_reset=False)
net = models.DQNConv1d(env.observation_space.shape, env.action_space.n)
datestr = datetime.strftime(date(2019, 2, 2), '%Y-%m-%d')
save_path = os.path.join('saves', datestr)
state_dict = torch.load(os.path.join(save_path, 'best_mean_val.pth'),
                        map_location=lambda storage, loc: storage)
net.load_state_dict(state_dict)
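
The snippet stops after loading the weights; a minimal greedy play loop, assuming StockEnv exposes the classic 4-tuple Gym reset/step interface (this is a sketch, not part of the original script):

import numpy as np
import torch

net.to(device)
net.eval()

obs = env.reset()
total_reward = 0.0
with torch.no_grad():
    while True:
        obs_v = torch.tensor(np.array([obs], dtype=np.float32), device=device)
        action = int(net(obs_v).argmax(dim=1).item())  # greedy action from Q-values
        obs, reward, done, _ = env.step(action)        # 4-tuple Gym API assumed
        total_reward += reward
        if done:
            break
print('total reward: %.4f' % total_reward)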
Example #6
from lib import data, model, common, ML, visualize
import numpy as np
import graphviz
from sklearn.ensemble import RandomForestClassifier

PATH = "/home/chris/projects/Kaggle/heart_200719/data/heart.csv"
TYPE_LIST = [
    "num", "mc", "mc", "num", "num", "mc", "mc", "num", "mc", "num", "mc",
    "mc", "mc"
]
MAX_DATA_COL = 13

# read the df
df = data.read_csv(PATH)

# get the feature name list
feature_list = []
for i, key in enumerate(df.keys()):
    feature_list.append(key)
    if i == (MAX_DATA_COL - 1):
        break

# shuffle data
seed_arr = data.shuffle(data.df2array(df))
# split the data into training set and testing set
train_set, test_set = data.split_data(seed_arr, percentage=0.8)

randomForest = ML.RandomForest(tolerance=0.05,
                               min_element=10,
                               max_depth=100,
                               num_col_blocked_each_step=2,
Example #7
def test_read_csv(self):
    prices = data.read_csv("WIKI-CSCO.csv")
    self.assertIsInstance(prices, data.Prices)
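
The test only checks the return type; a minimal sketch of a compatible loader, under the assumption that data.Prices is a namedtuple of aligned OHLCV arrays and that the CSV has Open/High/Low/Close/Volume columns (both are assumptions, not confirmed by the source):

import collections

import numpy as np
import pandas as pd

# Assumed container: five aligned arrays, one per OHLCV column.
Prices = collections.namedtuple('Prices', ['open', 'high', 'low', 'close', 'volume'])

def read_csv(file_name):
    df = pd.read_csv(file_name)
    return Prices(open=df['Open'].to_numpy(dtype=np.float32),
                  high=df['High'].to_numpy(dtype=np.float32),
                  low=df['Low'].to_numpy(dtype=np.float32),
                  close=df['Close'].to_numpy(dtype=np.float32),
                  volume=df['Volume'].to_numpy(dtype=np.float32))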
Example #8
                        help='enable cuda')
    parser.add_argument('--colab',
                        default=False,
                        action='store_true',
                        help='enable colab hosted runtime')
    parser.add_argument('--double',
                        default=False,
                        action='store_true',
                        help='enable double DQN')
    args = parser.parse_args()

    device = torch.device('cuda' if args.cuda else 'cpu')

    try:
        from lib import data
        train_data = data.read_csv(file_name='data/000001_2017.csv')
        val_data = data.read_csv(file_name='data/000001_2018.csv')
    except ModuleNotFoundError:
        train_data = (pd.read_csv('data/prices_2017.csv', index_col=0),
                      pd.read_csv('data/factors_2017.csv', index_col=0))
        val_data = (pd.read_csv('data/prices_2018.csv', index_col=0),
                    pd.read_csv('data/factors_2018.csv', index_col=0))

    env = environ.StockEnv(train_data,
                           bars_count=BARS_COUNT,
                           reset_on_sell=True)
    env = gym.wrappers.TimeLimit(env, max_episode_steps=1000)
    env_test = environ.StockEnv(val_data,
                                bars_count=BARS_COUNT,
                                reset_on_sell=True)
    env_test = gym.wrappers.TimeLimit(env_test, max_episode_steps=1000)
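
The --double flag toggles double DQN; a minimal sketch of the target computation that flag normally implies (function and argument names are assumptions, not taken from the training loop):

import torch

def double_dqn_targets(net, tgt_net, next_states, rewards, dones, gamma):
    # Double DQN: the online net picks the greedy action, the target net evaluates it.
    with torch.no_grad():
        next_actions = net(next_states).argmax(dim=1)
        next_q = tgt_net(next_states).gather(1, next_actions.unsqueeze(1)).squeeze(1)
        next_q[dones] = 0.0  # terminal states contribute no bootstrap value
    return rewards + gamma * next_q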
Example #9
MAIN_PATH = "../docs/3"
NET_SAVE_PATH = MAIN_PATH + '/checkpoint'
RUNS_SAVE_PATH = MAIN_PATH + "/runs/" + dt_string
NET_FILE = "checkpoint-3100000.data"
LOAD_NET = False
TRAIN_ON_GPU = True
BATCH_SIZE = 512
lr = 0.001
CHECKPOINT_STEP = 100000
PRINT_EVERY = 50000
SCALAR_VISUALIZE_EVERY = 1000
EMBEDDING_VISUALIZE_EVERY = 100000
MOVING_AVERAGE_STEP = 1000

# read file
col_names, raw_data = data.read_csv(path=DATA_PATH)

# define batch generator
domain_col = raw_data[7]  # x = company name
codomain_col = raw_data[3]  # y = issue name
batch_generator = data.Batch_Generator()
generator_prepare = batch_generator.prepare_generator(
    domain_col=domain_col, codomain_col=codomain_col)

# define model
fc_model = models.FC_Embed(len(generator_prepare.domain_int2vocab),
                           len(generator_prepare.codomain_int2vocab),
                           embedding_size=3,
                           train_on_gpu=TRAIN_ON_GPU)
if LOAD_NET:
    print("Loading net params...")
Example #10
EPSILON_START = 1.0
EPSILON_STOP = 0.1
EPSILON_STEPS = 1000000

CHECKPOINT_EVERY_STEP = 50000
VALIDATION_EVERY_STEP = 10000 # 10000

load_net = True
load_fileName = "checkpoint-950000.data"
saves_path = "C:\\Users\\user\\python_jupyter\\book_Hands_On_Reinforcement_Learning_Pytorch\\cmk_chapter8\\2_LSTM\\checkpoint"

if __name__ == "__main__":

    device = torch.device("cuda")

    stock_data = data.read_csv(
        "C:\\Users\\user\\python_jupyter\\book_Hands_On_Reinforcement_Learning_Pytorch\\cmk_chapter8\\2_LSTM\\data\\0005.HK.csv")

    # create the training and val set
    train_set, val_set = data.split_data(stock_data, percentage=0.8)
    train_set = {"train": train_set}
    val_set = {"eval": val_set}

    env = environ.StocksEnv(train_set, bars_count=BARS_COUNT, reset_on_close=True, state_1d=False, volumes=True)
    env = wrappers.TimeLimit(env, max_episode_steps=1000)
    env_val = environ.StocksEnv(val_set, bars_count=BARS_COUNT, reset_on_close=True, state_1d=False, volumes=True)
    # env_val = wrappers.TimeLimit(env_val, max_episode_steps=1000)

    # create neural network
    net = models.SimpleLSTM(input_size=5, n_hidden=512, n_layers=2, drop_prob=0.5,
                            actions_n=3, train_on_gpu=True,
                            batch_first=True).to(device)
    # load the network
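    # The snippet cuts off here; a minimal sketch of the load step below, reusing
    # the names defined above and the torch.load pattern from Example #5 (assumes
    # `os` is imported; this is a guess, not the original code).
    if load_net:
        state_dict = torch.load(os.path.join(saves_path, load_fileName),
                                map_location=device)
        net.load_state_dict(state_dict)
Example #11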
def test_read_csv(self):
    prices = data.read_csv("data/YNDX_160101_161231.csv")
    self.assertIsInstance(prices, data.Prices)
Example #12
def worker(net, device, train_queue, proc_idx, save_path):
    try:
        from lib import data
        train_data = data.read_csv(file_name='data/000001_2018.csv')
    except ModuleNotFoundError:
        train_data = (pd.read_csv('data/prices_2018.csv', index_col=0),
                      pd.read_csv('data/factors_2018.csv', index_col=0))
    env = environ.StockEnv(train_data,
                           bars_count=BARS_COUNT,
                           reset_on_sell=True)
    env = gym.wrappers.TimeLimit(env, max_episode_steps=1000)
    agt = agent.ProbabilityAgent(lambda x: net(x)[0],
                                 apply_softmax=True,
                                 device=device)
    exp_source = experience.ExperienceSource(env,
                                             agt,
                                             GAMMA,
                                             steps_count=REWARD_STEPS)

    batch = []
    frame_idx = 0
    total_reward = []
    total_steps = []
    reward_buf = []
    steps_buf = []
    frame_prev = 0
    ts = time.time()
    best_mean_reward = None

    stats = collections.defaultdict(list)

    file_name = os.path.splitext(os.path.basename(__file__))[0]
    file_name = file_name.split('_')[-1]
    proc_name = 'worker_' + '%d' % proc_idx
    writer = SummaryWriter(os.path.join('runs', file_name, proc_name))

    logging.basicConfig(level=logging.INFO,
                        format='%(levelname)s:%(message)s',
                        handlers=[
                            logging.FileHandler(
                                os.path.join(save_path, 'console.log')),
                            logging.StreamHandler()
                        ])

    for exp in exp_source:
        frame_idx += 1
        batch.append(exp)
        if len(batch) < GRAD_BATCH:
            continue

        net.zero_grad()
        loss_val_v, loss_policy_v, loss_entropy_v = helper.a2c_loss(
            batch, net, GAMMA**REWARD_STEPS, ENTROPY_BETA, device)
        batch.clear()
        loss_v = loss_entropy_v + loss_val_v + loss_policy_v
        loss_v.backward()
        nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
        grads = [
            param.grad.data.cpu().numpy() if param.grad is not None else None
            for param in net.parameters()
        ]
        train_queue.put(grads)

        stats['loss_value'].append(loss_val_v)
        stats['loss_policy'].append(loss_policy_v)
        stats['loss_entropy'].append(loss_entropy_v)
        stats['loss_total'].append(loss_v)
        for stat in stats:
            if len(stats[stat]) >= STATS_GROUPS:
                writer.add_scalar(stat,
                                  torch.mean(torch.stack(stats[stat])).item(),
                                  frame_idx)
                stats[stat].clear()

        ep_reward, ep_steps = exp_source.pop_episode_result()
        if ep_reward:
            print(
                'Worker_%d: %d done, Episode reward: %.4f, Episode step: %d' %
                (proc_idx, frame_idx, ep_reward, ep_steps))
            reward_buf.append(ep_reward)
            steps_buf.append(ep_steps)
            if len(reward_buf) == REWARD_GROUPS:
                reward = np.mean(reward_buf)
                steps = np.mean(steps_buf)
                reward_buf.clear()
                steps_buf.clear()
                total_reward.append(reward)
                total_steps.append(steps)
                speed = (frame_idx - frame_prev) / (time.time() - ts)
                frame_prev = frame_idx
                ts = time.time()
                mean_reward = np.mean(total_reward[-100:])
                mean_step = np.mean(total_steps[-100:])
                logging.info(
                    '%d done, mean reward %.3f, mean step %d, speed %d f/s' %
                    (frame_idx, mean_reward, mean_step, speed))
                writer.add_scalar('speed', speed, frame_idx)
                writer.add_scalar('reward', reward, frame_idx)
                writer.add_scalar('reward_100', mean_reward, frame_idx)
                writer.add_scalar('steps', steps, frame_idx)
                writer.add_scalar('steps_100', mean_step, frame_idx)
                if best_mean_reward is None or best_mean_reward < mean_reward:
                    torch.save(
                        net.state_dict(),
                        os.path.join(save_path,
                                     'best_mean_reward-%.3f.pth' % mean_reward))
                    if best_mean_reward is not None:
                        logging.info(
                            'Worker_%d: Best mean value updated %.3f -> %.3f' %
                            (proc_idx, best_mean_reward, mean_reward))
                    best_mean_reward = mean_reward
    writer.close()
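
The worker only pushes gradients onto train_queue; a minimal sketch of the parent-side loop that would consume them and update the shared network (function and variable names are assumptions, not from the original script):

import torch

def apply_worker_grads(net, optimizer, train_queue, updates):
    # Each queue item is the list of per-parameter gradients a worker computed
    # from one GRAD_BATCH of experience.
    for _ in range(updates):
        grads = train_queue.get()
        optimizer.zero_grad()
        for param, grad in zip(net.parameters(), grads):
            if grad is not None:
                param.grad = torch.tensor(grad, device=param.device)
        optimizer.step()  # the shared net is updated in place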
Example #13
from lib import data, model, common, ML, visualize
import numpy as np
import graphviz
from sklearn import tree

READ_SAME = False
MAX_DATA_COL = 13
PATH = "/home/chris/projects/Kaggle/heart_200719/data/heart.csv"
TYPE_LIST = [
    "num", "mc", "mc", "num", "num", "mc", "mc", "num", "mc", "num", "mc",
    "mc", "mc"
]
decisionTree = ML.DecisionTree(tolerance=0.00, min_element=1, max_depth=100)

if READ_SAME:
    df = data.read_csv(
        "/home/chris/projects/Kaggle/heart_200719/data/heart_debug.csv")
    seed_arr = data.df2array(df)

    # get the feature name list
    feature_list = []
    for i, key in enumerate(df.keys()):
        feature_list.append(key)
else:
    # read the df
    df = data.read_csv(PATH)

    # get the feature name list
    feature_list = []
    for i, key in enumerate(df.keys()):
        feature_list.append(key)