def train_and_save_distance_model(
    ethnicity_model_path,
    save_distance_model_path,
    sampled_pairs_size,
):
    """Fit a DistanceEstimator on curated signatures and persist it.

    The curated signatures are fetched, grouped into input clusters, and
    sampled into pairs; the estimator is then loaded with that data,
    fitted and saved.

    Args:
        ethnicity_model_path (str): Full path of the saved ethnicity model
            handed to ``EthnicityEstimator``.
        save_distance_model_path (str): Full path the trained distance
            model is written to.
        sampled_pairs_size (int): Number of pairs to generate for the
            training. Note: Must be multiple of 12.
    """
    LOGGER.info("Pulling training data from ES")
    signatures = get_signatures(only_curated=True)
    clusters = get_input_clusters(signatures)

    LOGGER.info(
        "Preparing %s pairs from sampled data for training.",
        sampled_pairs_size,
    )
    # Materialise the sampler output before handing it to the estimator.
    training_pairs = [
        *sample_signature_pairs(signatures, clusters, sampled_pairs_size)
    ]

    model = DistanceEstimator(EthnicityEstimator(ethnicity_model_path))
    model.load_data(signatures, training_pairs, sampled_pairs_size)

    LOGGER.info("Training DistanceEstimator...")
    model.fit()
    model.save_model(save_distance_model_path)
def train_and_save_ethnicity_model(load_data_path, save_model_path):
    """Fit an EthnicityEstimator on the given data and persist it.

    Args:
        load_data_path (str): Full path to the training data for the
            ethnicity estimator.
        save_model_path (str): Full path the trained ethnicity model is
            written to.
    """
    ethnicity_model = EthnicityEstimator()
    ethnicity_model.load_data(load_data_path)

    LOGGER.info("Training EthnicityEstimator. May take a while...")
    ethnicity_model.fit()

    ethnicity_model.save_model(save_model_path)
def train_and_save_distance_model(
    ethnicity_model_path,
    save_distance_model_path,
    sampled_pairs_size,
):
    """Fit a DistanceEstimator, persist it, and log how long each stage took.

    A wall-clock checkpoint is taken after every stage; the differences
    between consecutive checkpoints are logged once the model is saved.

    Args:
        ethnicity_model_path (str): Full path of the saved ethnicity model
            handed to ``EthnicityEstimator``.
        save_distance_model_path (str): Full path the trained distance
            model is written to.
        sampled_pairs_size (int): Number of pairs to generate for the
            training. Note: Must be multiple of 12.
    """
    LOGGER.info("Pulling training data from ES")
    started_at = datetime.now()

    signatures = get_signatures(only_curated=True)
    clusters = get_input_clusters(signatures)
    input_ready_at = datetime.now()

    LOGGER.info(
        "Preparing pairs from sampled data for training.",
        pairs_count=sampled_pairs_size,
    )
    # Materialise the sampler output before handing it to the estimator.
    training_pairs = [
        *sample_signature_pairs(signatures, clusters, sampled_pairs_size)
    ]
    pairs_ready_at = datetime.now()

    model = DistanceEstimator(EthnicityEstimator(ethnicity_model_path))
    estimators_ready_at = datetime.now()

    model.load_data(signatures, training_pairs, sampled_pairs_size)
    data_loaded_at = datetime.now()

    LOGGER.info("Training DistanceEstimator...")
    model.fit()
    trained_at = datetime.now()

    model.save_model(save_distance_model_path)
    saved_at = datetime.now()

    # Each entry is the elapsed time between two consecutive checkpoints.
    stage_runtimes = {
        "prepare_input_runtime": str(input_ready_at - started_at),
        "prepare_pairs_runtime": str(pairs_ready_at - input_ready_at),
        "prepare_estimators_runtime": str(estimators_ready_at - pairs_ready_at),
        "load_data_runtime": str(data_loaded_at - estimators_ready_at),
        "training_model_runtime": str(trained_at - data_loaded_at),
        "save_model_runtime": str(saved_at - trained_at),
        "total_runtime": str(saved_at - started_at),
    }
    LOGGER.info("Train distance model", **stage_runtimes)
def train_and_save_ethnicity_model(load_data_path, save_model_path):
    """Fit an EthnicityEstimator, persist it, and log how long each stage took.

    Args:
        load_data_path (str): Full path to the training data for the
            ethnicity estimator.
        save_model_path (str): Full path the trained ethnicity model is
            written to.
    """
    started_at = datetime.now()

    ethnicity_model = EthnicityEstimator()
    ethnicity_model.load_data(load_data_path)
    data_loaded_at = datetime.now()

    LOGGER.info("Training EthnicityEstimator. May take a while...")
    ethnicity_model.fit()
    trained_at = datetime.now()

    ethnicity_model.save_model(save_model_path)
    saved_at = datetime.now()

    # Each entry is the elapsed time between two consecutive checkpoints.
    stage_runtimes = {
        "load_data_runtime": str(data_loaded_at - started_at),
        "training_model_runtime": str(trained_at - data_loaded_at),
        "save_model_runtime": str(saved_at - trained_at),
        "total_runtime": str(saved_at - started_at),
    }
    LOGGER.info("Training ethnicity model", **stage_runtimes)
def train_and_save_distance_model(
    ethnicity_model_path,
    save_distance_model_path,
    sampled_pairs_size,
    train_to_validation_split_fraction=0.8,
):
    """Train the distance estimator model, save it to disk and score it.

    The curated signatures are split into a training and a test part. The
    model is fitted on pairs sampled from the training part and saved;
    afterwards the test pairs are loaded into the same estimator and its
    score is logged together with the runtime of each stage.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        save_distance_model_path (str): Full path where trained distance
            model will be saved.
        sampled_pairs_size (int): Number of pairs to be generated for the
            training. Note: Must be multiple of 4.
        train_to_validation_split_fraction (float): fraction of the data
            used for training. It is used as a divisor when sizing the
            test pair sample, so it must not be 0.

    Returns:
        set: The keys of the test-signature mapping produced by
        ``train_validation_split`` (presumably the identifiers of the
        held-out signatures -- verify against that helper).
    """
    start_time = datetime.now()
    curated_signatures = get_signatures(only_curated=True)
    LOGGER.info(
        "Splitting data into training and test set.",
        training_set_fraction=train_to_validation_split_fraction,
    )
    train_signatures_dict, test_signatures_dict = train_validation_split(
        curated_signatures, train_to_validation_split_fraction)
    # NOTE: despite the names these are dict views, not lists.
    train_signatures_list = train_signatures_dict.values()
    test_signatures_list = test_signatures_dict.values()
    input_clusters_train = get_input_clusters(train_signatures_list)
    input_clusters_test = get_input_clusters(test_signatures_list)
    prepare_input_time = datetime.now()

    LOGGER.info(
        "Preparing pairs from sampled data for training.",
        pairs_count=sampled_pairs_size,
    )
    pairs_train = list(
        sample_signature_pairs(train_signatures_list, input_clusters_train,
                               sampled_pairs_size))
    # Scale the test sample by the squared test/train ratio and round up to
    # the next multiple of 4 (the sampler requires a multiple of 4).
    test_to_train_ratio = ((1 - train_to_validation_split_fraction)
                           / train_to_validation_split_fraction)
    pair_size_test = 4 * math.ceil(
        (test_to_train_ratio**2 * sampled_pairs_size) / 4)
    pairs_test = list(
        sample_signature_pairs(test_signatures_list, input_clusters_test,
                               pair_size_test))
    # Take the checkpoint only after BOTH pair sets are sampled, so that the
    # test-pair sampling is reported as pair preparation and not as part of
    # the estimator construction time.
    prepare_pairs_time = datetime.now()
    LOGGER.info(
        "Pairs prepared.",
        n_training_pairs=len(pairs_train),
        n_test_pairs=len(pairs_test),
    )

    ethnicity_estimator = EthnicityEstimator(ethnicity_model_path)
    distance_estimator = DistanceEstimator(ethnicity_estimator)
    prepare_estimators_time = datetime.now()

    distance_estimator.load_data(train_signatures_list, pairs_train,
                                 sampled_pairs_size)
    load_data_to_model_time = datetime.now()

    distance_estimator.fit()
    training_model_time = datetime.now()

    distance_estimator.save_model(save_distance_model_path)
    save_model_time = datetime.now()

    # Evaluation reuses the fitted estimator with the held-out data. It runs
    # after the last checkpoint, so it is not part of any logged runtime
    # (total_runtime ends when the model has been saved).
    distance_estimator.load_data(test_signatures_list, pairs_test,
                                 pair_size_test)
    test_score = distance_estimator.score()

    LOGGER.info(
        "Train distance model",
        prepare_input_runtime=str(prepare_input_time - start_time),
        prepare_pairs_runtime=str(prepare_pairs_time - prepare_input_time),
        prepare_estimators_runtime=str(prepare_estimators_time -
                                       prepare_pairs_time),
        load_data_runtime=str(load_data_to_model_time -
                              prepare_estimators_time),
        training_model_runtime=str(training_model_time -
                                   load_data_to_model_time),
        save_model_runtime=str(save_model_time - training_model_time),
        total_runtime=str(save_model_time - start_time),
        test_score=str(test_score),
    )
    return set(test_signatures_dict)