def generic_cccv(type, genotype, phenotype, folds, replicates, threads,
                 comple_steps, conta_steps, verb, groups=None, rank=None,
                 optimize=False, out=None, n_features=None, params_file=None,
                 *args, **kwargs):
    """
    Perform crossvalidation over a range of simulated completeness/contamination
    values, and save output.

    :param type: Key into CLF_MAPPER selecting the classifier class.
        (NOTE: shadows the builtin ``type``; name kept for caller compatibility.)
    :param genotype: Path to the genotype file.
    :param phenotype: Path to the phenotype file.
    :param folds: Number of cross-validation folds.
    :param replicates: Number of CV replicates.
    :param threads: Number of parallel jobs for the CCCV run.
    :param comple_steps: Number of simulated completeness steps.
    :param conta_steps: Number of simulated contamination steps.
    :param verb: Verbosity flag passed through to loaders and the classifier.
    :param groups: Must be None; LOGO CV is not implemented for CCCV.
    :param rank: Accepted for interface parity with generic_cv; unused here.
    :param optimize: Must be False; parameter search over CCCV is not implemented.
    :param out: Path for the CCCV accuracy output file.
    :param n_features: If given, enables feature reduction to this many features.
    :param params_file: Optional parameter file merged into ``kwargs``.
    :raises NotImplementedError: If ``groups`` or ``optimize`` is requested.
    """
    kwargs = _fix_uppercase(kwargs)
    # Fail fast on unsupported options. Plain raises instead of asserts so the
    # checks survive `python -O`.
    if groups is not None:
        raise NotImplementedError('Usage of LOGO in CCCV not currently implemented.')
    if optimize:
        raise NotImplementedError('Parameter search over CCCV not currently implemented.')
    training_records, *_ = load_training_files(
        genotype_file=genotype,
        phenotype_file=phenotype,
        verb=verb
    )
    if params_file is not None:
        loaded_params = load_params_file(params_file)
        logger.info('Parameters loaded from file:')
        logger.info('\n' + pformat(loaded_params))
        # TODO: should loaded params have precedence over explicit kwargs?
        kwargs = {**kwargs, **loaded_params}
    clf = CLF_MAPPER[type](verb=verb, *args, **kwargs)
    reduce_features = n_features is not None
    cccv = clf.crossvalidate_cc(records=training_records, cv=folds,
                                n_replicates=replicates,
                                comple_steps=comple_steps,
                                conta_steps=conta_steps,
                                n_jobs=threads,
                                reduce_features=reduce_features,
                                n_features=n_features)
    write_cccv_accuracy_file(out, cccv)
def test_resampling(trait_name):
    """Fit a TrainingRecordResampler on a trait's training data and resample one record."""
    trait_dir = FLAT_PATH / trait_name
    td, *_ = load_training_files(
        trait_dir / f'{trait_name}.genotype',
        trait_dir / f'{trait_name}.phenotype')
    resampler = TrainingRecordResampler(random_state=2, verb=True)
    resampler.fit(td)
    resampler.get_resampled(td[0], comple=.5, conta=.5)
def generic_cv(type, genotype, phenotype, folds, replicates, threads, verb,
               optimize=False, optimize_out=None, optimize_n_iter=None,
               groups=None, rank=None, out=None, n_features=None,
               params_file=None, *args, **kwargs):
    """
    Estimate model performance by cross-validation.
    Optionally, perform parameter search and save found parameters.

    :param type: Key into CLF_MAPPER selecting the classifier class.
        (NOTE: shadows the builtin ``type``; name kept for caller compatibility.)
    :param genotype: Path to the genotype file.
    :param phenotype: Path to the phenotype file.
    :param folds: Number of cross-validation folds.
    :param replicates: Number of CV replicates.
    :param threads: Number of parallel jobs for the CV run.
    :param verb: Verbosity flag; also controls demotion of CV logging.
    :param optimize: If True, run a parameter search before CV.
    :param optimize_out: Save path for found parameters (required if optimizing).
    :param optimize_n_iter: Number of parameter-search iterations.
    :param groups: Optional groups file enabling grouped (e.g. LOGO) CV.
    :param rank: Taxonomic rank selected when loading groups.
    :param out: Optional path for the misclassifications output file.
    :param n_features: If given, enables feature reduction to this many features.
    :param params_file: Optional parameter file merged into ``kwargs``.
    :raises ValueError: If ``optimize`` is set without ``optimize_out``.
    """
    kwargs = _fix_uppercase(kwargs)
    training_records, *_ = load_training_files(
        genotype_file=genotype,
        phenotype_file=phenotype,
        groups_file=groups,
        selected_rank=rank,
        verb=verb
    )
    if params_file is not None:
        loaded_params = load_params_file(params_file)
        logger.info('Parameters loaded from file:')
        logger.info('\n' + pformat(loaded_params))
        # TODO: should loaded params have precedence over explicit kwargs?
        kwargs = {**kwargs, **loaded_params}
    clf = CLF_MAPPER[type](verb=verb, *args, **kwargs)
    if optimize:
        # Plain raise instead of assert so the check survives `python -O`.
        if optimize_out is None:
            raise ValueError('No savepath for found parameters passed.')
        logger.info('Optimizing parameters...')
        found_params = clf.parameter_search(training_records,
                                            n_iter=optimize_n_iter)
        # Found parameters override any user-supplied ones, and the classifier
        # is rebuilt so CV below uses the optimized configuration.
        params = {**kwargs, **found_params}
        write_params_file(optimize_out, params)
        logger.info(f'Optimized parameters written to file {optimize_out}.')
        clf = CLF_MAPPER[type](verb=verb, *args, **params)
    reduce_features = n_features is not None
    use_groups = groups is not None
    logger.info('Running CV...')
    score_mean, score_sd, misclass = clf.crossvalidate(
        records=training_records, cv=folds,
        n_replicates=replicates, groups=use_groups,
        n_jobs=threads,
        reduce_features=reduce_features,
        n_features=n_features, demote=not verb)
    logger.info(f"CV score: {round(score_mean, 4)} +/- {round(score_sd, 4)}")
    if out is not None:
        write_misclassifications_file(out, training_records, misclass,
                                      use_groups=use_groups)
def generic_train(type, genotype, phenotype, verb, weights, out,
                  n_features=None, params_file=None, *args, **kwargs):
    """
    Train and save a TrexClassifier model.

    :param type: Key into CLF_MAPPER selecting the classifier class.
        (NOTE: shadows the builtin ``type``; name kept for caller compatibility.)
    :param genotype: Path to the genotype file.
    :param phenotype: Path to the phenotype file.
    :param verb: Verbosity flag passed through to loaders and the classifier.
    :param weights: If truthy, also write a feature-weights rank file.
    :param out: Save path for the classifier; the rank file gets ``.rank`` appended.
    :param n_features: If given, enables feature reduction to this many features.
    :param params_file: Optional parameter file merged into ``kwargs``.
    """
    kwargs = _fix_uppercase(kwargs)
    training_records, *_ = load_training_files(genotype_file=genotype,
                                               phenotype_file=phenotype,
                                               verb=verb)
    if params_file is not None:
        loaded_params = load_params_file(params_file)
        logger.info('Parameters loaded from file:')
        logger.info('\n' + pformat(loaded_params))
        # TODO: should loaded params have precedence over explicit kwargs?
        kwargs = {**kwargs, **loaded_params}
    clf = CLF_MAPPER[type](verb=verb, *args, **kwargs)
    reduce_features = n_features is not None
    clf.train(records=training_records, reduce_features=reduce_features,
              n_features=n_features)
    if weights:
        # Distinct name: the original rebound the boolean `weights` flag to the
        # weights dict, shadowing the parameter.
        feature_weights = clf.get_feature_weights()
        weights_file_name = f"{out}.rank"
        if clf.feature_type.startswith('eggNOG5'):
            # feature_type looks like 'eggNOG5-<taxon_id>'; annotate each
            # feature with its eggNOG text description for that taxon.
            text_annotator = Eggnog5TextAnnotator()
            feature_taxon = int(clf.feature_type.split('-')[-1])
            annots = [
                text_annotator.annotate(taxon_id=feature_taxon, enog_id=x)[1]
                for x in feature_weights
            ]
        else:
            annots = None
        write_weights_file(weights_file=weights_file_name,
                           weights=feature_weights, annots=annots)
    save_classifier(obj=clf, filename=out, verb=verb)
def test_get_shap(self, trait_name, classifier_type):
    """
    Get ShapHandler and SHAP data from classifier and genotype file.

    :param trait_name:
    :param classifier_type:
    :return:
    """
    trait_dir = FLAT_PATH / trait_name
    training_records, genotype, phenotype, group = load_training_files(
        genotype_file=trait_dir / f"{trait_name}.genotype",
        phenotype_file=trait_dir / f"{trait_name}.phenotype",
        verb=True)
    # Keep the SHAP computation cheap: only the first three records.
    tr = training_records[:3]
    model_path = (MODELS_PATH / trait_name
                  / f'{trait_name}.{classifier_type.lower()}.pkl')
    clf = load_classifier(model_path, verb=True)
    sh = ShapHandler.from_clf(clf)
    fs, sv, bv = clf.get_shap(tr, n_samples=50)
    return tr, sh, fs, sv, bv
def test_load_data(self, trait_name, do_write):
    """
    Test training data loading. Check/catch invalid file formats.

    :param trait_name:
    :return:
    """
    trait_dir = FLAT_PATH / trait_name
    training_records, genotype, phenotype, group = load_training_files(
        genotype_file=trait_dir / f"{trait_name}.genotype",
        phenotype_file=trait_dir / f"{trait_name}.phenotype",
        groups_file=trait_dir / f"{trait_name}.taxids",
        verb=True)
    if do_write:
        # Round-trip check: the loaded genotype must be writable again.
        with TemporaryDirectory() as tmpdir:
            gt_out = Path(tmpdir) / 'gt.genotype'
            write_genotype_file(genotype, gt_out)
            assert gt_out.is_file()
    return training_records, genotype, phenotype, group
def generic_train(type, genotype, phenotype, verb, weights, out,
                  n_features=None, params_file=None, *args, **kwargs):
    """
    Train and save a TrexClassifier model.

    :param type: Key into CLF_MAPPER selecting the classifier class.
        (NOTE: shadows the builtin ``type``; name kept for caller compatibility.)
    :param genotype: Path to the genotype file.
    :param phenotype: Path to the phenotype file.
    :param verb: Verbosity flag passed through to loaders and the classifier.
    :param weights: If truthy, also write a feature-weights rank file.
    :param out: Save path for the classifier; the rank file gets ``.rank`` appended.
    :param n_features: If given, enables feature reduction to this many features.
    :param params_file: Optional parameter file merged into ``kwargs``.
    """
    kwargs = _fix_uppercase(kwargs)
    training_records, *_ = load_training_files(
        genotype_file=genotype,
        phenotype_file=phenotype,
        verb=verb
    )
    if params_file is not None:
        loaded_params = load_params_file(params_file)
        logger.info('Parameters loaded from file:')
        logger.info('\n' + pformat(loaded_params))
        # TODO: should loaded params have precedence over explicit kwargs?
        kwargs = {**kwargs, **loaded_params}
    clf = CLF_MAPPER[type](verb=verb, *args, **kwargs)
    reduce_features = n_features is not None
    clf.train(records=training_records, reduce_features=reduce_features,
              n_features=n_features)
    if weights:
        # Distinct name: the original rebound the boolean `weights` flag to the
        # weights dict, shadowing the parameter.
        feature_weights = clf.get_feature_weights()
        weights_file_name = f"{out}.rank"
        write_weights_file(weights_file=weights_file_name,
                           weights=feature_weights)
    save_classifier(obj=clf, filename=out, verb=verb)
def get_training_data(self):
    """Load and return the training records for the test trait."""
    # NOTE(review): `trait_name` is a free variable here — it is neither a
    # parameter nor read from `self`. Presumably a module-level constant or a
    # fixture defined in the enclosing file; confirm it is in scope, otherwise
    # this raises NameError at call time.
    td, *_ = load_training_files(
        FLAT_PATH/trait_name/f'{trait_name}.genotype',
        FLAT_PATH/trait_name/f'{trait_name}.phenotype'
    )
    return td