def main(tickers_path):
    """Compute the volatility of every ticker file sequentially and report it.

    Every path yielded by ``get_next_file(tickers_path)`` is handed to a
    ``TickerVolatility`` instance; its ``run()`` result is unpacked as a
    ``(ticker, volatility)`` pair and stored in a dict that is finally passed
    to ``print_report``.
    """
    # Lazy generator: each file is still processed one at a time, in order.
    results = (
        TickerVolatility(file_path=path).run()
        for path in get_next_file(tickers_path)
    )
    volatility_by_ticker = {}
    for symbol, value in results:
        volatility_by_ticker[symbol] = value
    print_report(volatility_by_ticker)
def main(tickers_path):
    """Compute ticker volatilities with one worker per file, then report them.

    A ``TickerVolatility`` worker is created for every path yielded by
    ``get_next_file(tickers_path)``; all workers receive the same result dict
    and the same ``Lock``. Every worker is started, then every worker is
    joined, and the shared dict is passed to ``print_report``.
    """
    # presumably TickerVolatility subclasses threading.Thread -- confirm
    shared_results = {}
    guard = Lock()
    workers = [
        TickerVolatility(file_path=path, tickers=shared_results, lock=guard)
        for path in get_next_file(tickers_path)
    ]

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    print_report(shared_results)
def main(tickers_path):
    """Compute ticker volatilities with one worker per file, then report them.

    A ``TickerVolatility`` worker is created for every path yielded by
    ``get_next_file(tickers_path)``; each worker is given the same bounded
    queue (``maxsize=2``) and this function consumes ``(ticker, volatility)``
    pairs from it until the queue stays empty and no worker is alive.
    """
    # presumably TickerVolatility subclasses multiprocessing.Process -- confirm
    tickers = {}
    collector = Queue(maxsize=2)
    processes = [
        TickerVolatility(file_path=fname, tickers_queue=collector)
        for fname in get_next_file(tickers_path)
    ]

    for process in processes:
        process.start()

    while True:
        try:
            ticker, volatility = collector.get(timeout=1)
        except Empty:
            if any(process.is_alive() for process in processes):
                continue
            # A worker may have queued its result and exited between the
            # timeout above and the liveness check; drain whatever is left so
            # that result is not silently dropped.
            while True:
                try:
                    ticker, volatility = collector.get_nowait()
                except Empty:
                    break
                tickers[ticker] = volatility
            break
        else:
            tickers[ticker] = volatility

    for process in processes:
        process.join()

    print_report(tickers)
def train_loop(loader, model, epochs=3, start_epoch=0, params=None,
               device=None, loss_func=torch.nn.CrossEntropyLoss, n_tops=None):
    """Run the train / evaluate / checkpoint cycle for a classifier.

    Args:
        loader: Mapping with ``'train'`` and ``'val'`` entries that are passed
            to ``train_epoch`` and ``get_accuracy``.
        model: Model whose ``parameters()`` are optimised with Adam.
        epochs: Number of epochs to run.
        start_epoch: Number of the first epoch (epochs run from
            ``start_epoch`` to ``start_epoch + epochs - 1``).
        params: 9-item sequence unpacked as ``(L_RATE, DECAY_RATE,
            DECAY_EPOCHS, WEIGHT_DECAY, SAVE_MODEL, SAVE_MODEL_N,
            SAVE_MODEL_DIR, MODEL, N_LAYERS)``. Required despite the
            ``None`` default.
        device: Forwarded to ``train_epoch`` and ``get_accuracy``.
        loss_func: Forwarded unchanged to ``train_epoch`` and
            ``get_accuracy``.
        n_tops: Forwarded to ``get_accuracy`` and ``print_report``;
            defaults to ``[1, 5]``.
    """
    # A list default would be shared between calls; build a fresh one instead.
    if n_tops is None:
        n_tops = [1, 5]

    (L_RATE, DECAY_RATE, DECAY_EPOCHS, WEIGHT_DECAY, SAVE_MODEL, SAVE_MODEL_N,
     SAVE_MODEL_DIR, MODEL, N_LAYERS) = params

    optimizer = optim.Adam(model.parameters(), lr=L_RATE,
                           weight_decay=WEIGHT_DECAY)

    if SAVE_MODEL:
        if MODEL == 'Darknet':
            path = '{}{}'.format(MODEL, N_LAYERS)
        else:
            path = MODEL
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs('{}/{}'.format(SAVE_MODEL_DIR, path), exist_ok=True)

    losses = {'train': [], 'validate': []}
    accuracies = {'train': [], 'validate': []}

    for epoch in range(start_epoch, epochs + start_epoch):
        t = time()

        # Step decay of the learning rate every DECAY_EPOCHS epochs.
        # NOTE(review): building a new Adam instance also starts from fresh
        # optimizer state -- confirm this is intended.
        if (epoch + 1) % DECAY_EPOCHS == 0:
            L_RATE *= (1 - DECAY_RATE)
            optimizer = optim.Adam(model.parameters(), lr=L_RATE,
                                   weight_decay=WEIGHT_DECAY)

        # print epoch number
        print_report(part='start', epoch=epoch)

        # train loop
        train_epoch(loader['train'], model, optimizer, device, loss_func)

        # print metrics
        # NOTE(review): `dtype` is not defined in this function; it must be
        # provided at module level -- confirm.
        val_acc, val_loss = get_accuracy(loader['val'], model, device, dtype,
                                         loss_func, n_tops)
        train_acc, train_loss = get_accuracy(loader['train'], model, device,
                                             dtype, loss_func, n_tops)
        metrics = train_loss, val_loss, train_acc, val_acc, n_tops
        print_report(part='accuracy', metrics=metrics)

        # collect metrics
        losses['train'].append(train_loss)
        losses['validate'].append(val_loss)
        accuracies['train'].append(train_acc)
        accuracies['validate'].append(val_acc)

        # save models
        # NOTE(review): `cfg` is not defined in this function; it must be
        # provided at module level -- confirm.
        if SAVE_MODEL:
            save_checkpoint(model=model, cfg=cfg, epoch=epoch,
                            loss=round(val_loss, 3))

        # print time
        print_report(part='end', t=int(time() - t))
X_val, y_val = load_validation_data()

# build prediction pipeline: bag-of-words counts -> tf-idf -> linear SGD model
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])
text_clf.fit(X_train, y_train)

# Report on the training split first, then on the validation split; after the
# loop `predictions` holds the validation predictions used further below.
for banner, features, labels in (
        ('=== training error ===', X_train, y_train),
        ('=== validation error ===', X_val, y_val),
):
    print(banner)
    predictions = text_clf.predict(features)
    print_report(predictions, labels)

print('=== classification report ===')
print(metrics.classification_report(y_val, predictions))

print('=== confusion matrix ===')
print(metrics.confusion_matrix(y_val, predictions))
def train_loop(cfg_path, gpu_n='0'):
    """Train the detection model described by a YAML config file.

    Args:
        cfg_path: Path to the YAML configuration; it is read with
            ``yaml.safe_load`` and must contain every key accessed below.
        gpu_n: Index of the CUDA device to use when ``config['GPU']`` is
            truthy and CUDA is available; otherwise the CPU is used.

    Raises:
        NotImplementedError: If ``config['MODEL']`` is anything other than
            ``'Darknet'`` (the VGG backbone is not implemented yet).
    """
    # get configs
    with open(cfg_path, 'r') as stream:
        config = yaml.safe_load(stream)

    # is_available has to be *called*: the bare function object is always
    # truthy, which selected 'cuda' even on machines without CUDA.
    use_cuda = config['GPU'] and torch.cuda.is_available()
    device = torch.device('cuda:{}'.format(gpu_n) if use_cuda else 'cpu')
    dtype = torch.float32  # TODO: find out how it affects speed and accuracy

    MODEL = config['MODEL']
    LOAD_MODEL = config['LOAD_MODEL']
    LOAD_MODEL_FILE = config['LOAD_MODEL_FILE']
    SAVE_MODEL = config['SAVE_MODEL']
    SAVE_MODEL_N = config['SAVE_MODEL_N']
    SAVE_MODEL_DIR = config['SAVE_MODEL_DIR']
    DATASET_DIR = config['DATASET_DIR']
    L_RATE = config['LEARNING_RATE']
    DECAY_RATE = config['DECAY_RATE']
    DECAY_EPOCHS = config['DECAY_EPOCHS']
    WEIGHT_DECAY = config['WEIGHT_DECAY']
    EPOCHS = config['EPOCHS']
    BATCH_SIZE = config['BATCH_SIZE']
    NUM_WORKERS = config['NUM_WORKERS']
    PIN_MEMORY = config['PIN_MEMORY']
    CSV_TRAIN = config['CSV_TRAIN']
    CSV_VAL = config['CSV_VAL']

    # set up model
    if MODEL == 'Darknet':
        # The local is `device`; the previous `DEVICE` was not defined here.
        model = YoloV1(grid_size=7, num_boxes=2, num_classes=20).to(device)
    else:
        # TODO: add VGG backbone here. Without this guard `model` stayed
        # unbound and the function failed later with a NameError.
        raise NotImplementedError(
            'Unsupported MODEL: {!r}'.format(MODEL))

    # Checkpoint loading is still a TODO, so training always starts at 0;
    # binding it here keeps the LOAD_MODEL branch from hitting a NameError.
    start_epoch = 0
    if LOAD_MODEL:
        # TODO: load backbone
        # cfg_cp, start_epoch = load_checkpoint(LOAD_MODEL_FILE, model)
        val = input(
            'Do you want to use config from checkpoint? \nAnswer "yes" or "no": '
        )
        # if val == 'yes':
        #     L_RATE = cfg_cp['LEARNING_RATE']
        #     DECAY_RATE = cfg_cp['DECAY_RATE']
        #     DECAY_EPOCHS = cfg_cp['DECAY_EPOCHS']
        #     WEIGHT_DECAY = cfg_cp['WEIGHT_DECAY']
        #     BALANCED = cfg_cp['BALANCED_DATASET']
        #     BATCH_SIZE = cfg_cp['BATCH_SIZE']
        #     NUM_WORKERS = cfg_cp['NUM_WORKERS']
        #     PIN_MEMORY = cfg_cp['PIN_MEMORY']
        #     MIN_IMAGES = cfg_cp['MIN_IMAGES']
        #     LOSS = cfg_cp['LOSS']
    else:
        model = init_weights(model)

    optimizer = optim.Adam(model.parameters(), lr=L_RATE,
                           weight_decay=WEIGHT_DECAY)
    loss_fn = YoloLoss()

    loader_params = (BATCH_SIZE, NUM_WORKERS, PIN_MEMORY, DATASET_DIR,
                     CSV_TRAIN, CSV_VAL)
    loader = get_dataloader(loader_params)

    # create folder to save models
    if SAVE_MODEL:
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs('{}/{}'.format(SAVE_MODEL_DIR, MODEL), exist_ok=True)

    losses = {'train': [], 'validate': []}
    accuracies = {'train': [], 'validate': []}

    for epoch in range(start_epoch, EPOCHS + start_epoch):
        t = time()

        # Step decay of the learning rate every DECAY_EPOCHS epochs.
        if (epoch + 1) % DECAY_EPOCHS == 0:
            L_RATE *= (1 - DECAY_RATE)
            optimizer = optim.Adam(model.parameters(), lr=L_RATE,
                                   weight_decay=WEIGHT_DECAY)

        # print epoch number
        print_report(part='start', epoch=epoch)

        # train loop
        train_epoch(loader['train'], model, optimizer, device, loss_fn)

        # print metrics
        pred_bb, target_bb = get_bboxes(loader['train'], model,
                                        iou_threshold=0.5, threshold=0.4)
        train_map = mean_average_precision(pred_bb, target_bb,
                                           iou_threshold=0.5,
                                           box_format='midpoint')
        v_pred_bb, v_target_bb = get_bboxes(loader['val'], model,
                                            iou_threshold=0.5, threshold=0.4)
        val_map = mean_average_precision(v_pred_bb, v_target_bb,
                                         iou_threshold=0.5,
                                         box_format='midpoint')
        # Losses are not computed yet; -1 is passed in their place.
        metrics = -1, -1, train_map, val_map
        print_report(part='accuracy', metrics=metrics)

        # collect metrics
        # losses['train'].append(train_loss)
        # losses['validate'].append(val_loss)
        # accuracies['train'].append(train_acc)
        # accuracies['validate'].append(val_acc)

        # save models
        # if SAVE_MODEL:
        #     save_checkpoint(model=model, cfg=cfg,
        #                     epoch=epoch, loss=round(val_loss, 3))

        # print time
        print_report(part='end', t=int(time() - t))
def train_loop(cfg_path, gpu_n='0', stat_path='stat'):
    """Train the detection model from a YAML config and log per-epoch stats.

    Args:
        cfg_path: Path to the YAML configuration; it is read with
            ``yaml.safe_load`` and must contain every key accessed below.
        gpu_n: Index of the CUDA device to use when ``config['GPU']`` is
            truthy and CUDA is available; otherwise the CPU is used.
        stat_path: Directory passed to ``create_stat``; one row per epoch is
            appended to ``<stat_path>/stat.csv``.

    Raises:
        NotImplementedError: If no checkpoint is loaded and
            ``config['MODEL']`` is not ``'Darknet'``.
    """
    # get configs
    with open(cfg_path, 'r') as stream:
        config = yaml.safe_load(stream)

    print()
    print(config)
    print()

    # is_available has to be *called*: the bare function object is always
    # truthy, which selected 'cuda' even on machines without CUDA.
    use_cuda = config['GPU'] and torch.cuda.is_available()
    device = torch.device('cuda:{}'.format(gpu_n) if use_cuda else 'cpu')
    dtype = torch.float32  # TODO: find out how it affects speed and accuracy

    MODEL = config['MODEL']
    LOAD_MODEL = config['LOAD_MODEL']
    LOAD_MODEL_FILE = config['LOAD_MODEL_FILE']
    SAVE_MODEL = config['SAVE_MODEL']
    SAVE_MODEL_N = config['SAVE_MODEL_N']
    SAVE_MODEL_DIR = config['SAVE_MODEL_DIR']
    DATASET_DIR = config['DATASET_DIR']
    EPOCHS = config['EPOCHS']
    BATCH_SIZE = config['BATCH_SIZE']
    NUM_WORKERS = config['NUM_WORKERS']
    PIN_MEMORY = config['PIN_MEMORY']
    CSV_TRAIN = config['CSV_TRAIN']
    CSV_VAL = config['CSV_VAL']
    OPTIMIZER = config['OPTIMIZER']

    # create stat file
    nid = create_stat(stat_path, config)

    # set up model
    S, B, C = 7, 2, 20  # TODO: add it to config
    if LOAD_MODEL:
        # TODO: load backbone
        # The third value is bound to start_epoch: the epoch loop below reads
        # start_epoch, which was otherwise unbound on this branch.
        # NOTE(review): 'VGG16' here vs. 'VGG' in the branch below -- confirm
        # which spelling the config actually uses.
        model, cfg_save, start_epoch = load_checkpoint(
            LOAD_MODEL_FILE,
            device=device,
            S=S,
            B=B,
            C=C,
            cfg=config if MODEL == 'VGG16' else None)
        # TODO: init weight
    else:
        if MODEL == 'Darknet':
            model = YoloV1(grid_size=S, num_boxes=B, num_classes=C).to(device)
        else:
            # TODO: add VGG backbone here. Without this guard `model` stayed
            # unbound and init_weights() failed with a NameError.
            raise NotImplementedError(
                'Unsupported MODEL: {!r}'.format(MODEL))
        model = init_weights(model)
        start_epoch = 0

    loss_fn = YoloLoss()

    loader_params = (BATCH_SIZE, NUM_WORKERS, PIN_MEMORY, DATASET_DIR,
                     CSV_TRAIN, CSV_VAL)
    # NOTE(review): `my_transforms` is not defined in this function; it must
    # be provided at module level -- confirm.
    loader = get_dataloader(loader_params, my_transforms)

    # create folder to save models
    if SAVE_MODEL:
        # exist_ok avoids the check-then-create race of exists() + makedirs()
        os.makedirs('{}/{}'.format(SAVE_MODEL_DIR, MODEL), exist_ok=True)

    losses = {'train': [], 'validate': []}
    accuracies = {'train': [], 'validate': []}

    optimizer = None
    opt_lr = None
    for epoch in range(start_epoch, EPOCHS + start_epoch):
        t = time()

        # get_optimizer receives the previous optimizer and learning rate and
        # returns the ones to use for this epoch.
        optimizer, opt_name, opt_lr = get_optimizer(optimizer, model,
                                                    OPTIMIZER, epoch, opt_lr)

        print_report(part='start', epoch=epoch)

        # train loop
        train_epoch(loader['train'], model, optimizer, device, loss_fn,
                    (opt_name, opt_lr))

        # print metrics (both splits are evaluated with the same grid settings)
        train_loss, train_maps = get_metrics_NEW(loader=loader['train'],
                                                 model=model,
                                                 iou_threshold=0.5,
                                                 threshold=0.4,
                                                 device=device,
                                                 loss_func=loss_fn,
                                                 S=S, B=B, C=C)
        val_loss, val_maps = get_metrics_NEW(loader=loader['val'],
                                             model=model,
                                             iou_threshold=0.5,
                                             threshold=0.4,
                                             device=device,
                                             loss_func=loss_fn,
                                             S=S, B=B, C=C)
        metrics = train_loss, val_loss, train_maps, val_maps
        print_report(part='accuracy', metrics=metrics)

        # collect metrics
        losses['train'].append(train_loss)
        losses['validate'].append(val_loss)
        accuracies['train'].append(np.mean(train_maps))
        accuracies['validate'].append(np.mean(val_maps))

        # write stats to CSV
        r = [
            nid,
            datetime.now(), epoch, train_loss, val_loss, train_maps, val_maps
        ]
        # newline='' is what the csv module requires of files it writes to;
        # without it extra blank rows appear on platforms that use \r\n.
        with open('{}/stat.csv'.format(stat_path), 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(r)

        # save models
        # if SAVE_MODEL:
        #     save_checkpoint(model=model, cfg=cfg, epoch=epoch,
        #                     loss=round(val_loss, 3))

        # print time
        print_report(part='end', t=int(time() - t))
def _build_input_frame(raw_input, columns):
    """Turn a comma-separated feature string into a one-row DataFrame."""
    data = pd.DataFrame(data=[raw_input.split(',')], columns=columns)
    # Everything arrives as text; these three columns are cast to integers.
    for column in ('pagesCount', 'wordCount', 'fileSize'):
        data[column] = data[column].astype('int64')
    return data


def _load_model(model_path):
    """Unpickle a previously saved model, closing the file afterwards."""
    # NOTE: unpickling can execute arbitrary code -- only load model files
    # that come from a trusted source.
    with open(model_path, 'rb') as model_file:
        return pickle.load(model_file)


def main(args):
    """Train, explain or predict with a random-forest classifier.

    Args:
        args: Namespace providing ``dataset_path``, ``mode`` (``'train'``,
            ``'explain'`` or ``'predict'``), ``save_model_path`` and, for
            the explain/predict modes, ``input`` (comma-separated feature
            values in the column order of the training data).
    """
    # load data
    X, y = utils.load_dataset(args.dataset_path)

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1)

    # Training mode
    if args.mode == 'train':
        print(f'Training data shape {X_train.shape}, {y_train.shape}')
        print(f'Test data shape {X_test.shape}, {y_test.shape}')
        print('Training class distrubution')
        class_counter = Counter(y_train)
        for k, v in class_counter.items():
            print(f' Class={k}, Count={v}')

        # preprocess data
        # missing data, date engineer (date column), class imbalance, text data
        # The training split is always the first argument of the prepare_*
        # helpers, for both the train and the test data.
        X_train_processed = utils.prepare_inputs(X_train, X_train)
        X_test_processed = utils.prepare_inputs(X_train, X_test)
        y_train_processed = utils.prepare_targets(y_train, y_train)
        y_test_processed = utils.prepare_targets(y_train, y_test)

        # try different classifiers, spot-checking which algorithm perform well
        print("Starting spot check")
        # define models
        models, names = utils.get_models()
        # evaluate each model
        for candidate, name in zip(models, names):
            scores = utils.evaluate_model(X_train_processed,
                                          y_train_processed, candidate)
            # summarize performance
            print('>%s %.3f (%.3f)' % (name, np.mean(scores), np.std(scores)))
        print("End spot check")

        # get the best model
        print("Start model training")
        model = RandomForestClassifier(n_estimators=100)
        # fit the model
        model.fit(X_train_processed, y_train_processed)

        # save model (the context manager closes the file even on error)
        print("Saving model")
        with open(args.save_model_path, 'wb') as model_file:
            pickle.dump(model, model_file)

        # evaluate the model
        y_train_preds = model.predict(X_train_processed)
        y_test_preds = model.predict(X_test_processed)

        # precision, recall, f-score for training for each category
        print("Evaluating training performance")
        utils.print_report(y_train_processed, y_train_preds, class_counter)

        # precision, recall, f-score for testing for each category
        print("Evaluating testing performance")
        utils.print_report(y_test_processed, y_test_preds, class_counter)

        # confusion matrix
        # NOTE(review): sklearn.metrics.plot_confusion_matrix is absent from
        # recent scikit-learn releases -- confirm the pinned version.
        plot_confusion_matrix(model, X_test_processed, y_test_processed)
        plt.show()

        # feature_importances
        utils.plot_feature_importance(X_train_processed.columns, model)

    # Explain mode, how the classifier comes to the decision
    elif args.mode == 'explain':
        data = _build_input_frame(args.input, X_train.columns)

        # process test data
        data_processed = utils.prepare_inputs(X_train, data)

        # load model
        model = _load_model(args.save_model_path)

        # Extract and plot single tree (index 5 is an arbitrary member)
        estimator = model.estimators_[5]
        plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
        tree.plot_tree(estimator,
                       feature_names=data_processed.columns,
                       filled=True)
        plt.show()

        # walk the decision path of the sample through every tree
        sample_id = 0
        for index, member in enumerate(model.estimators_):
            print("Tree %d\n" % index)
            fitted = member.tree_
            utils.explore_tree(member,
                               fitted.node_count,
                               fitted.children_left,
                               fitted.children_right,
                               fitted.feature,
                               fitted.threshold,
                               data_processed.columns,
                               data_processed,
                               sample_id=sample_id)
            prediction = [int(p) for p in member.predict(data_processed)]
            print(
                f'Prediction for sample {sample_id}: {utils.decode_targets(y_train, prediction)[sample_id]}'
            )
            print('\n' * 2)

    # Predict mode
    elif args.mode == 'predict':
        data = _build_input_frame(args.input, X_train.columns)

        # process test data
        data_processed = utils.prepare_inputs(X_train, data)

        # load model
        model = _load_model(args.save_model_path)

        # predict data
        prediction = model.predict(data_processed)
        print(f'prediction : {utils.decode_targets(y_train, prediction)[0]}')
def _timed_fit(label, estimator, features, targets, **fit_kwargs):
    """Fit *estimator* and print how long the call to ``fit`` took."""
    start_time = timeit.default_timer()
    estimator.fit(features, targets, **fit_kwargs)
    print("{classifer} took {time} to fit on X and y".format(
        classifer=label, time=(timeit.default_timer() - start_time)))


def classifer_creation(df_final, target_column):
    """Tune, build and fit five classifiers on ``df_final``.

    Args:
        df_final: DataFrame holding the features plus the target column.
        target_column: Name of the column to predict; every other column is
            used as a feature.

    Returns:
        Tuple ``(svc, dt, bt, knn, ANN, X_train, y_train, X_test, y_test)``:
        the five fitted models followed by the 75/25 train/test split.
    """
    X = df_final.loc[:, df_final.columns != target_column]
    Y = df_final.loc[:, df_final.columns == target_column]
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.25,
                                                        random_state=42)
    # Y is a one-column frame; the fitting code below uses a flat array.
    y_train_ravel = y_train.values.ravel()
    print(len(X_train.columns))

    # Hyper-parameter search for every model family.
    svc_best_param, svc_report = svc_param_selection(X_train, y_train_ravel, 5)
    print('SVM finished')
    dt_best_param, dt_report = descision_tree_param_selection(
        X_train, y_train_ravel, 5)
    print('Descision tree finished')
    bt_best_param, bt_report = boosted_tree_param_selection(
        X_train, y_train_ravel, dt_best_param, 5)
    print('Boosted tree finished')
    knn_best_param, knn_report = k_nearest_neighbors_param_selection(
        X_train, y_train_ravel, 5)
    print('KNN finished')
    ANN_best_param, ANN_report = ANN_param_selection(X_train, y_train_ravel, 5)
    print('ANN finished')
    print_report(svc_report, dt_report, bt_report, knn_report, ANN_report)

    params = {
        'SVC': svc_best_param,
        'DecisionTree': dt_best_param,
        'BoostedTrees': bt_best_param,
        'K-NearestNeighbors': knn_best_param,
        'NeuralNetworks': ANN_best_param
    }
    for name, best in params.items():
        print('{classifer}: {params}'.format(classifer=name, params=best))

    # Build every model from its best parameters.
    svc = svm.SVC(C=svc_best_param['C'],
                  gamma=svc_best_param['gamma'],
                  kernel=svc_best_param['kernel'])
    dt = DecisionTreeClassifier(
        max_depth=dt_best_param['max_depth'],
        min_samples_split=dt_best_param['min_samples_split'],
        min_samples_leaf=dt_best_param['min_samples_leaf'],
        max_features=dt_best_param['max_features'])

    # check for pruning
    max_prune, prune = get_prune(dt, X_train, y_train, X_test, y_test)
    print(
        'Pruning with highest score: {max_prune}, Pruning with acceptable loss in accuracy: {prune}'
        .format(max_prune=max_prune, prune=prune))

    bt = GradientBoostingClassifier(
        max_depth=bt_best_param['max_depth'],
        min_samples_split=bt_best_param['min_samples_split'],
        min_samples_leaf=bt_best_param['min_samples_leaf'],
        max_features=bt_best_param['max_features'],
        learning_rate=bt_best_param['learning_rate'],
        n_estimators=bt_best_param['n_estimators'])
    knn = KNeighborsClassifier(n_neighbors=knn_best_param['n_neighbors'],
                               p=knn_best_param['p'])
    ANN = create_model(len(X_train.columns))

    learning_curves(svc, dt, bt, knn, ANN, ANN_best_param, X, Y.values.ravel(),
                    target_column)

    # Fit each model on the training split and report the wall-clock time.
    _timed_fit('svc', svc, X_train, y_train_ravel)
    _timed_fit('dt', dt, X_train, y_train_ravel)
    _timed_fit('bt', bt, X_train, y_train_ravel)
    _timed_fit('knn', knn, X_train, y_train_ravel)
    _timed_fit('ann', ANN, X_train, y_train_ravel,
               epochs=ANN_best_param['epochs'],
               batch_size=ANN_best_param['batch_size'],
               verbose=0)

    # PRUNE THE MODEL
    # NOTE(review): post_pruning() sits inside the try block, so when the
    # first render fails the tree is returned unpruned -- confirm intended.
    try:
        dot_data = tree.export_graphviz(dt,
                                        out_file=None,
                                        feature_names=X_test.columns)
        graph = graphviz.Source(dot_data)
        graph.render("pre-pruning-" + target_column)
        post_pruning(dt.tree_, 0, max_prune)
        dot_data = tree.export_graphviz(dt,
                                        out_file=None,
                                        feature_names=X_test.columns)
        graph = graphviz.Source(dot_data)
        graph.render("post-pruning-" + target_column)
    except Exception:
        # Still best-effort, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare `except:` did.
        print(
            'if you wish to render the pruning graphs please install graphviz')

    return svc, dt, bt, knn, ANN, X_train, y_train, X_test, y_test
resources = gscatalog.get_resources(workspace=ws) for res in resources: fullname = ws.name + ":" + res.name if args.item == res.name or args.item == fullname: resource_found = res break if resource_found is not None: break # Still not found ? trying on the layergroups # TODO: Cannot update layergroups properties # if resource_found is None: # lgroups = gscatalog.get_layergroups() # for lg in lgroups: # if lg.name == args.item: # resource_found = lg # break # resource not found in the whole GeoServer if resource_found is None: logger.error("Ressource \"%s\" not found." % args.item) sys.exit() # Actually process the provided resources else: logger.debug("Resource \"%s\" found, processing ..." % resource_found.name) try: layer = gscatalog.get_layer(resource_found.workspace.name + ":" + resource_found.name) gn_to_gs_fix(layer, resource_found, args.dry_run, creds, args.disable_ssl_verification) except Inconsistency as e: errors.append(e) print_report(logger, errors)
# Training loop: run `num_episodes` episodes, choosing actions with an
# epsilon computed from the total number of frames seen so far.
frames_total = 0

for episode in range(num_episodes):
    state = env.reset()
    score = 0

    while True:
        frames_total += 1
        epsilon = calculate_epsilon(frames_total)

        action = qnet_agent.select_action(state, epsilon)
        new_state, reward, done, info = env.step(action)
        score += reward

        # store the transition, then run one optimisation step
        memory.push(state, action, new_state, reward, done)
        qnet_agent.optimize()

        # env.render()

        state = new_state

        if done:
            # NOTE(review): this is overwritten at the end of every episode,
            # so it always ends up as the index of the last episode.
            solved_after = episode
            rewards_total.append(score)
            plot_results(rewards_total)

            # Average over the episodes actually in the window; dividing by a
            # fixed 100 under-reported the mean while fewer than 100 episodes
            # had been recorded.
            last_100 = rewards_total[-100:]
            mean_reward_100 = sum(last_100) / len(last_100)

            if episode % report_interval == 0 and episode > 0:
                print_report(episode, report_interval, rewards_total,
                             mean_reward_100, epsilon, frames_total)
            break

print("Average reward: %.2f" % (sum(rewards_total) / num_episodes))
last_100 = rewards_total[-100:]
print("Average reward (last 100 episodes): ", (sum(last_100) / len(last_100)))
print("Solved after %i episodes" % solved_after)
env.close()