Example #1
        logger.info(f'Converting: {src_collection.__name__} ({db.get_collection_size(src_collection)})'
                    f' -> {dst_collection.__name__} ({db.get_collection_size(dst_collection)})')

    docs = src_collection.objects()
    total_count = docs.count()
    for current_count, src_doc in enumerate(docs):
        log_progress(current_count, total_count)

        try:
            mapped_doc = map_document(src_doc)
        except (DocumentConversionError, DocumentConstructionError) as e:
            logger.warning(f'Skipping: {src_doc} because of: {e}')
            continue

        mapped_doc.create_or_update()

    with db.connect():
        logger.info(f'Total {dst_collection.__name__} count: ({db.get_collection_size(dst_collection)})')
        logger.info(f'Documents.Conclusion count: ({db.get_collection_size(Documents.Conclusion)})')
        logger.info(f'Documents.License count: ({db.get_collection_size(Documents.License)})')


if __name__ == '__main__':
    logger = root_logger('convert_data', logging.INFO)
    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        convert_data(Documents.FileRaw, Documents.File)
        logger.info('Success')
    except Exception as e:
        logger.error(e, exc_info=True)
            'function': multi_label,
            'x': x_multi_label,
            'y': y_multi_label
        },
        "DP": {
            'function': single_label,
            'x': x_dual_problem,
            'y': y_dual_problem
        },
    }

    train_problems_on_data(pipeline, data, train_in_parallel)


if __name__ == '__main__':
    logger = root_logger('train_sk_pipeline', logging.INFO)
    load_dotenv(find_dotenv())

    scenario_dir = Path(
        get_train_dir() /
        f'sklearn_{datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")}')
    scenario_dir.mkdir(parents=True, exist_ok=True)
    add_file_handler_to_logger(logger, scenario_dir, 'train_sk_pipeline')

    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        n_cores = cpu_count()  # number of processors to use for loading data from MongoDB
        max_samples = 10_000  # max number of samples per license, used for the single and multi label problems
        min_samples = 1_000  # min number of samples per license; decides whether the license is included in training, internally limited to 10
            'x': x_special_cases[basename],
            'y': y_special_cases[basename]
        }

    train_problems_on_data(pipeline, data, train_in_parallel)


def generalize_special_cases(license: str) -> str:
    """Collapse a versioned special-case license id (prefix + '-...') to its special-case prefix."""
    for special_case in SPECIAL_CASES:
        if license.startswith(special_case + "-"):
            return special_case
    return license
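
A minimal usage sketch of the helper above; the concrete entries of SPECIAL_CASES are an assumption for illustration only, and just the prefix-matching behaviour comes from the function itself:

# hypothetical contents of SPECIAL_CASES, for illustration only
SPECIAL_CASES = ['GPL', 'LGPL', 'CC-BY']

assert generalize_special_cases('GPL-3.0-only') == 'GPL'  # versioned id collapses to its family
assert generalize_special_cases('MIT') == 'MIT'           # non-special licenses pass through unchanged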


if __name__ == '__main__':
    logger = root_logger('train_sk_pipeline_special_cases', logging.INFO)
    load_dotenv(find_dotenv())

    scenario_dir = Path(
        get_train_dir() /
        f'sklearn_{datetime.datetime.now().strftime("%Y_%m_%d_%H_%M")}')
    scenario_dir.mkdir(parents=True, exist_ok=True)
    add_file_handler_to_logger(logger, scenario_dir,
                               'train_sk_pipeline_special_cases')

    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        n_cores = cpu_count()  # number of processors to use for loading data from MongoDB
        max_samples = 10_000  # max number of samples per license, used for the single and multi label problems
        min_samples = 1_000  # min number of samples per license; decides whether the license is included in training, internally limited to 10
Example #4
        logger.info('Exiting: Nothing to update!')
        return 0

    logger.info('[%s] Starting process %d', 'analyze_batch', process_id)

    with db.connect():
        ca = CollectionAnalysis(_collection, **query)
        ca.analyze_batch(skip_n, limit_n)

    logger.info('[%s] Completed process %d', 'analyze_batch', process_id)

    return ca, None


if __name__ == '__main__':
    logger = root_logger('analyze_data', logging.INFO)
    load_dotenv(find_dotenv())

    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        n_cores = cpu_count()  # the number of processors to use can be set up here
        analysis = analyze_in_parallel(db,
                                       n_cores=n_cores,
                                       batch_size=10_000,
                                       collection=Documents.Conclusion)

        logger.info(analysis.print_counts())
        logger.info(analysis.print_percentages())

        analysis.save_statistics(db, output_path=None)
Example #5
            tagged = 0
            for doc in docs:
                if tagged == max_samples:
                    break
                doc.modify(add_to_set__tags=tag)
                tagged += 1
        except StopIteration:
            pass
        finally:
            logger.info(f'{license} Finished tagging {tagged} samples with {tag}')
            docs._cursor.close()


# end functions for tagging the benchmark set

if __name__ == '__main__':
    logger = root_logger('update_data', logging.INFO)
    load_dotenv(find_dotenv())
    preprocessor = PreprocessorSpacy()

    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        n_cores = cpu_count()  # the number of processors to use can be set up here

        license_mapping = load_license_mapping_file(get_train_dir() /
                                                    LICENSE_MAPPING_FILENAME)
        update(update_document,
               n_cores=n_cores,
               batch_size=10_000,
               collection=Documents.Conclusion)
Example #6
    # construct a plot of the training loss history and save it to disk
    N = np.arange(0, EPOCHS)
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(N, H.history["loss"], label="train_loss")
    plt.plot(N, H.history["val_loss"], label="val_loss")
    plt.title("Training and Validation Loss")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss")
    plt.legend(loc="lower left")
    plt.savefig(filename)


args = arg_parse()

root_logger('./logs/log.txt')
logging.info(str(args))

# load the MNIST dataset
logging.info(" loading MNIST dataset...")
((trainX, _), (testX, _)) = mnist.load_data()

# add a channel dimension to every image in the dataset, then scale
# the pixel intensities to the range [0, 1]
trainX = np.expand_dims(trainX, axis=-1)  # (sample,w,h) -> (sample, w, h, d)
testX = np.expand_dims(testX, axis=-1)

trainX = trainX.astype("float32") / 255.0
testX = testX.astype("float32") / 255.0

factor_noise = args['factor_noise']
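
For context, a minimal sketch of how such a noise factor is commonly applied to build a denoising-autoencoder training set; this is an assumption about how factor_noise is used downstream, not part of the original snippet:

# add zero-mean Gaussian noise scaled by factor_noise, then clip back to the valid [0, 1] pixel range
noise_train = factor_noise * np.random.normal(loc=0.0, scale=1.0, size=trainX.shape)
noise_test = factor_noise * np.random.normal(loc=0.0, scale=1.0, size=testX.shape)
trainX_noisy = np.clip(trainX + noise_train, 0.0, 1.0)
testX_noisy = np.clip(testX + noise_test, 0.0, 1.0)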