    logger.info(f'Converting: {src_collection.__name__} ({db.get_collection_size(src_collection)})'
                f' -> {dst_collection.__name__} ({db.get_collection_size(dst_collection)})')
    docs = src_collection.objects()
    total_count = docs.count()
    for current_count, src_doc in enumerate(docs):
        log_progress(current_count, total_count)
        try:
            mapped_doc = map_document(src_doc)
        except (DocumentConversionError, DocumentConstructionError) as e:
            logger.warning(f'Skipping: {src_doc} because of: {e}')
            continue
        mapped_doc.create_or_update()

    with db.connect():
        logger.info(f'Total {dst_collection.__name__} count: ({db.get_collection_size(dst_collection)})')
        logger.info(f'Documents.Conclusion count: ({db.get_collection_size(Documents.Conclusion)})')
        logger.info(f'Documents.License count: ({db.get_collection_size(Documents.License)})')


if __name__ == '__main__':
    logger = root_logger('convert_data', logging.INFO)
    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        convert_data(Documents.FileRaw, Documents.File)
        logger.info('Success')
    except Exception as e:
        logger.info(e, exc_info=True)
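# log_progress above is project code that is not shown in this excerpt. A minimal,
# hypothetical stand-in with the same call signature could look like the sketch below
# (logging every 1,000 documents is an arbitrary choice, not taken from the project):
def _log_progress_sketch(current_count: int, total_count: int, every: int = 1_000) -> None:
    if current_count % every == 0:
        logger.info(f'Progress: {current_count}/{total_count} documents converted')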
            'function': multi_label,
            'x': x_multi_label,
            'y': y_multi_label
        },
        "DP": {
            'function': single_label,
            'x': x_dual_problem,
            'y': y_dual_problem
        },
    }
    train_problems_on_data(pipeline, data, train_in_parallel)


if __name__ == '__main__':
    logger = root_logger('train_sk_pipeline', logging.INFO)
    load_dotenv(find_dotenv())
    scenario_dir = Path(
        get_train_dir() / f'sklearn_{datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")}')
    scenario_dir.mkdir(parents=True, exist_ok=True)
    add_file_handler_to_logger(logger, scenario_dir, 'train_sk_pipeline')
    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        n_cores = cpu_count()  # number of processors used for loading data from MongoDB
        max_samples = 10_000  # max number of samples per license, used for the single- and multi-label problems
        min_samples = 1_000  # min number of samples per license; decides whether the license is included in training (internally limited to 10)
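# For illustration only, not the project's actual train_problems_on_data: an assumption of
# how a problem table like `data` above (problem name -> training function plus its x/y
# inputs) could be consumed sequentially. The parallel path via train_in_parallel is omitted.
def _train_problems_sketch(pipeline, data: dict) -> None:
    for problem_name, problem in data.items():
        # call each problem's training function with the shared pipeline and its own x/y data
        logger.info(f'Training problem: {problem_name}')
        problem['function'](pipeline, problem['x'], problem['y'])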
            'x': x_special_cases[basename],
            'y': y_special_cases[basename]
        }
        train_problems_on_data(pipeline, data, train_in_parallel)


def generalize_special_cases(license: str):
    for l in SPECIAL_CASES:
        if license.startswith(l + "-"):
            return l
    return license


if __name__ == '__main__':
    logger = root_logger('train_sk_pipeline_special_cases', logging.INFO)
    load_dotenv(find_dotenv())
    scenario_dir = Path(
        get_train_dir() / f'sklearn_{datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")}')
    scenario_dir.mkdir(parents=True, exist_ok=True)
    add_file_handler_to_logger(logger, scenario_dir, 'train_sk_pipeline_special_cases')
    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        n_cores = cpu_count()  # number of processors used for loading data from MongoDB
        max_samples = 10_000  # max number of samples per license, used for the single- and multi-label problems
        min_samples = 1_000  # min number of samples per license; decides whether the license is included in training (internally limited to 10)
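# Usage illustration for generalize_special_cases. The concrete SPECIAL_CASES entries are an
# assumption made for this example, not the project's actual list: versioned identifiers
# collapse to their base license, anything else passes through unchanged.
#
#     SPECIAL_CASES = ["CC-BY"]                 # hypothetical content
#     generalize_special_cases("CC-BY-4.0")     # -> "CC-BY"
#     generalize_special_cases("MIT")           # -> "MIT" (no special case matches)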
        logger.info('Exiting: Nothing to update!')
        return 0

    logger.info('[%s] Starting process %d', 'analyze_batch', process_id)
    with db.connect():
        ca = CollectionAnalysis(_collection, **query)
        ca.analyze_batch(skip_n, limit_n)
    logger.info('[%s] Completed process %d', 'analyze_batch', process_id)
    return ca, None


if __name__ == '__main__':
    logger = root_logger('analyze_data', logging.INFO)
    load_dotenv(find_dotenv())
    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        n_cores = cpu_count()  # number of processors to use; can be adjusted here
        analysis = analyze_in_parallel(db,
                                       n_cores=n_cores,
                                       batch_size=10_000,
                                       collection=Documents.Conclusion)
        logger.info(analysis.print_counts())
        logger.info(analysis.print_percentages())
        analysis.save_statistics(db, output_path=None)
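# Hypothetical illustration, not taken from the original script: one way analyze_in_parallel
# could derive the skip/limit window that each worker passes to analyze_batch, so that worker
# process_id skips the batches of earlier workers and reads at most one batch.
def _batch_bounds_sketch(process_id: int, batch_size: int):
    skip_n = process_id * batch_size  # documents already covered by earlier workers
    limit_n = batch_size              # size of this worker's slice
    return skip_n, limit_n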
        for i, doc in enumerate(docs):
            if i == max_samples:
                break
            doc.modify(add_to_set__tags=tag)  # $addToSet keeps the tag list free of duplicates
            i += 1  # keeps the count in the final log correct when the iterator is exhausted
    except StopIteration:
        pass
    finally:
        logger.info(f'{license} Finished tagging {i} samples with {tag}')
        docs._cursor.close()
# end functions for tagging the benchmark set


if __name__ == '__main__':
    logger = root_logger('update_data', logging.INFO)
    load_dotenv(find_dotenv())
    preprocessor = PreprocessorSpacy()
    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        n_cores = cpu_count()  # number of processors to use; can be adjusted here
        license_mapping = load_license_mapping_file(get_train_dir() / LICENSE_MAPPING_FILENAME)
        update(update_document,
               n_cores=n_cores,
               batch_size=10_000,
               collection=Documents.Conclusion)
    # construct a plot of the training history and save it
    N = np.arange(0, EPOCHS)
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(N, H.history["loss"], label="train_loss")
    plt.plot(N, H.history["val_loss"], label="val_loss")
    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend(loc="lower left")
    plt.savefig(filename)


args = arg_parse()
root_logger('./logs/log.txt')
logging.info(str(args))

# load the MNIST dataset
logging.info("loading MNIST dataset...")
((trainX, _), (testX, _)) = mnist.load_data()

# add a channel dimension to every image in the dataset, then scale
# the pixel intensities to the range [0, 1]
trainX = np.expand_dims(trainX, axis=-1)  # (sample, w, h) -> (sample, w, h, d)
testX = np.expand_dims(testX, axis=-1)
trainX = trainX.astype("float32") / 255.0
testX = testX.astype("float32") / 255.0

factor_noise = args['factor_noise']
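# A minimal sketch of the step this excerpt is heading toward (an assumption, not part of the
# original listing): corrupt the images with zero-mean Gaussian noise scaled by factor_noise,
# as typically done for a denoising autoencoder, and clip back to the valid pixel range.
# The helper name and its usage below are hypothetical.
def _add_noise_sketch(images, factor):
    # additive Gaussian noise, clipped back to [0, 1]
    noise = np.random.normal(loc=0.0, scale=1.0, size=images.shape)
    return np.clip(images + factor * noise, 0.0, 1.0)

# e.g. trainX_noisy = _add_noise_sketch(trainX, factor_noise)
#      testX_noisy = _add_noise_sketch(testX, factor_noise)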