def load_training_files(
    genotype_file: str,
    phenotype_file: str,
    groups_file: str = None,
    selected_rank: str = None,
    verb=False
) -> Tuple[
    List[TrainingRecord], List[GenotypeRecord], List[PhenotypeRecord], List[GroupRecord]
]:
    """
    Convenience function to load phenotype, genotype and optionally groups files together,
    and return a list of TrainingRecord.

    :param genotype_file: The path to the input genotype file.
    :param phenotype_file: The path to the input phenotype file.
    :param groups_file: The path to the input groups file. Optional.
    :param selected_rank: The selected standard rank to use for taxonomic grouping.
    :param verb: Toggle verbosity.
    :return: The collated TrainingRecords as well as the individual genotype, phenotype
             and group records.
    """
    logger = get_logger(__name__, verb=verb)
    gr = load_genotype_file(genotype_file)
    pr = load_phenotype_file(phenotype_file)
    if groups_file:
        gp = load_groups_file(groups_file, selected_rank=selected_rank)
    else:
        # if no groups file is given, each sample gets its own group (not used currently)
        gp = [
            GroupRecord(identifier=x.identifier, group_name=x.identifier, group_id=y)
            for y, x in enumerate(pr)
        ]
    tr = collate_training_data(gr, pr, gp, verb=verb)
    logger.info("Records successfully loaded from file.")
    return tr, gr, pr, gp
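# Example usage (illustrative sketch; the file paths below are hypothetical):
#
#   tr, gr, pr, gp = load_training_files(
#       genotype_file="data/T3SS.genotype",
#       phenotype_file="data/T3SS.phenotype",
#       verb=True,
#   )
#
# Without a groups file, each sample is assigned its own singleton group.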
def __init__(self, random_state: float = None, verb: bool = False):
    self.logger = get_logger(initname=self.__class__.__name__, verb=verb)
    self.random_state = random_state if type(random_state) is RandomState \
        else RandomState(random_state)
    self.conta_source_pos = None
    self.conta_source_neg = None
    self.fitted = False
def load_classifier(filename: str, verb=False):
    """
    Load a pickled TrexClassifier into a usable object.

    :param filename: Input filename
    :param verb: Toggle verbosity
    :return: an unpickled PICA ml classifier
    """
    logger = get_logger(initname=__name__, verb=verb)
    if not os.path.isfile(filename):
        raise RuntimeError(f"Input file does not exist: {filename}")
    try:
        obj = joblib.load(filename)
    except ModuleNotFoundError:
        # fall back for models pickled under the old 'pica' package name
        sys.modules['pica'] = phenotrex
        obj = joblib.load(filename)
    if not hasattr(obj, 'feature_type'):
        obj.feature_type = 'legacy'
        logger.warning(
            'The loaded classifier does not advertise the feature_type it was trained on. '
            'Consider re-training it with a .genotype file containing the requisite metadata '
            'to ensure feature types of the model and the data it is applied to are aligned.'
        )
    logger.info(f"Successfully loaded classifier (feature_type={obj.feature_type}).")
    return obj
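# Example usage (sketch; 'model.pkl' is a hypothetical path):
#
#   clf = load_classifier("model.pkl", verb=True)
#
# Models pickled under the legacy 'pica' package name are transparently
# re-mapped to phenotrex before unpickling.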
def collate_training_data(
    genotype_records: List[GenotypeRecord],
    phenotype_records: List[PhenotypeRecord],
    group_records: List[GroupRecord],
    verb: bool = False
) -> List[TrainingRecord]:
    """
    Return a list of TrainingRecord built from lists of GenotypeRecord, PhenotypeRecord
    and GroupRecord. To be used for training and CV of TrexClassifier.
    Checks that each PhenotypeRecord has a matching GenotypeRecord and GroupRecord,
    and that all PhenotypeRecords pertain to the same trait.

    :param genotype_records: List[GenotypeRecord]
    :param phenotype_records: List[PhenotypeRecord]
    :param group_records: List[GroupRecord]; used if leave-one-group-out is the split strategy.
    :param verb: Toggle verbosity.
    :return: A list of TrainingRecords.
    """
    logger = get_logger(__name__, verb=verb)
    gr_dict = {x.identifier: x for x in genotype_records}
    pr_dict = {x.identifier: x for x in phenotype_records}
    gp_dict = {x.identifier: x for x in group_records}
    traits = set(x.trait_name for x in phenotype_records)
    if not set(gr_dict.keys()).issuperset(set(pr_dict.keys())):
        raise RuntimeError(
            "Not all identifiers of phenotype records were found in the genotype file. "
            "Cannot collate to TrainingRecords."
        )
    if not set(gp_dict.keys()).issuperset(set(pr_dict.keys())):
        raise RuntimeError(
            "Not all identifiers of phenotype records were found in the groups file. "
            "Cannot collate to TrainingRecords."
        )
    if len(traits) > 1:
        raise RuntimeError(
            "More than one trait has been found in phenotype records. "
            "Cannot collate to TrainingRecords."
        )
    ret = [
        TrainingRecord(
            identifier=pr_dict[x].identifier,
            trait_name=pr_dict[x].trait_name,
            trait_sign=pr_dict[x].trait_sign,
            feature_type=gr_dict[x].feature_type,
            features=gr_dict[x].features,
            group_name=gp_dict[x].group_name,
            group_id=gp_dict[x].group_id
        ) for x in pr_dict.keys()
    ]
    logger.info(f"Collated genotype and phenotype records into {len(ret)} TrainingRecords.")
    return ret
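# Sketch of the expected inputs (identifiers below are hypothetical): every
# PhenotypeRecord identifier must also appear among the GenotypeRecords and
# GroupRecords, and all PhenotypeRecords must share a single trait_name:
#
#   gr = [GenotypeRecord(identifier="GCF_000005845.2", feature_type="eggNOG5-tax-2",
#                        features=[...]), ...]
#   pr = [PhenotypeRecord(identifier="GCF_000005845.2", trait_name="T3SS",
#                         trait_sign=1), ...]
#   gp = [GroupRecord(identifier="GCF_000005845.2", group_name="Escherichia",
#                     group_id=0), ...]
#   records = collate_training_data(gr, pr, gp)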
def _completeness_cv(self, param, **kwargs) -> Dict[float, Dict[float, float]]:
    """
    Perform completeness/contamination simulation and testing for one fold.
    This is a separate function only called by run_cccv, which spawns subprocesses
    using a ProcessPoolExecutor from concurrent.futures.

    :param param: List [test_records, training_records, comple_steps, conta_steps,
                  verb, starting_message];
                  a workaround to get multiple parameters into this function
                  (using executor.map).
    """
    # unpack parameters
    test_records, training_records, comple_steps, conta_steps, verb, starting_message = param

    # a new logger is needed; self.logger is not accessible from a different process
    logger = get_logger(__name__, verb=verb)
    logger.info(starting_message)

    classifier = copy.deepcopy(self.pipeline)
    if self.reduce_features:
        recursive_feature_elimination(
            training_records, classifier,
            n_features=self.n_features,
            random_state=self.random_state
        )

    X_train, y_train, tn, ft = get_x_y_tn_ft(training_records)
    classifier.fit(X=X_train, y=y_train, **kwargs)

    # initialize the resampler with the test_records only,
    # so the samples are unknown to the classifier
    resampler = TrainingRecordResampler(random_state=self.random_state, verb=False)
    resampler.fit(records=test_records)
    cv_scores = {}
    comple_increment = 1 / comple_steps
    conta_increment = 1 / conta_steps
    for comple in np.arange(0, 1.05, comple_increment):
        comple = np.round(comple, 2)
        cv_scores[comple] = {}
        for conta in np.arange(0, 1.05, conta_increment):
            conta = np.round(conta, 2)
            resampled_set = [
                resampler.get_resampled(x, comple, conta) for x in test_records
            ]
            cv_scores[comple][conta] = self._validate_subset(resampled_set, classifier)
    return cv_scores
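# Hedged sketch of how a `param` tuple would be assembled for one fold by the
# calling code; the layout follows the docstring above, but the concrete values
# are illustrative, not taken from run_cccv itself:
#
#   param = (
#       test_records,         # held-out fold, used for resampling only
#       training_records,     # remaining folds, used to fit the pipeline
#       20, 20,               # comple_steps, conta_steps
#       False,                # verb
#       "Fold 1/5 started.",  # starting_message
#   )
#   scores = self._completeness_cv(param)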
def __init__(self, max_depth: int = 4, learning_rate: float = 0.05,
             n_estimators: int = 30, gamma: float = 0., min_child_weight: int = 1,
             subsample: float = 0.7, colsample_bytree: float = 0.3, n_jobs: int = 1,
             random_state: int = None, verb=False, *args, **kwargs):
    super().__init__(random_state=random_state, verb=verb)
    if n_jobs == -1:
        n_jobs = os.cpu_count()
    self.n_jobs = n_jobs
    self.logger = get_logger(__name__, verb=verb)
    self.default_search_params = {
        'n_estimators': np.array([20, 30, 50, 80, 100, 150]),
        'subsample': np.arange(0.2, 1., 0.1).round(2),
        'colsample_bytree': np.arange(0.2, 1., 0.1).round(2),
        'min_child_weight': np.arange(1, 20),
        'gamma': np.array([0, 0.2, 0.5, 1, 5, 10]),
        'max_depth': np.arange(3, 7),
        'scale_pos_weight': np.array([1, 1.5, 2, 3, 5, 8]),
        'learning_rate': np.arange(0.01, 0.11, 0.01).round(4),
        'eval_metric': ['auc', 'aucpr']
    }
    classifier = xgb.sklearn.XGBClassifier(
        missing=0, max_depth=max_depth, learning_rate=learning_rate,
        n_estimators=n_estimators, gamma=gamma, min_child_weight=min_child_weight,
        subsample=subsample, colsample_bytree=colsample_bytree, n_jobs=n_jobs,
        **kwargs
    )
    self.pipeline = Pipeline(steps=[("vec", self.vectorizer), ("clf", classifier)])
    self.cv_pipeline = clone(self.pipeline)
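# Example usage (sketch; assumes the TrexClassifier base class exposes a
# train(records=...) method as used elsewhere in phenotrex, and that
# `training_records` was produced by load_training_files above):
#
#   clf = TrexXGB(n_estimators=100, n_jobs=-1, random_state=2, verb=True)
#   clf.train(records=training_records)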
def save_classifier(obj, filename: str, overwrite=False, verb=False):
    """
    Save a TrexClassifier as a pickled object.

    :param obj: the Python3 object to be saved.
    :param filename: Output filename
    :param overwrite: Overwrite existing files with the same name
    :param verb: Toggle verbosity
    """
    logger = get_logger(initname=__name__, verb=verb)
    basefolder = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(basefolder):
        raise RuntimeError(f"Output folder does not exist: {basefolder}")
    if os.path.isfile(filename):
        if overwrite:
            logger.warning("Overwriting existing file.")
        else:
            raise RuntimeError("Output file exists.")
    logger.info("Begin saving classifier...")
    joblib.dump(obj, filename=filename)
    logger.info("Classifier saved.")
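# Example round trip with load_classifier above (sketch; the path is hypothetical):
#
#   save_classifier(clf, "T3SS.svm.pkl", overwrite=True, verb=True)
#   clf = load_classifier("T3SS.svm.pkl")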
def __init__(self, C: float = 5., penalty: str = "l2", tol: float = 1.,
             random_state: int = None, verb=False, *args, **kwargs):
    super().__init__(random_state=random_state, verb=verb)
    self.C = C
    self.penalty = penalty
    self.tol = tol
    self.default_search_params = {
        'C': np.logspace(-6, 4, 30).round(8),
        'tol': np.logspace(0, -5, 10).round(8),
        'max_iter': np.logspace(2, 4.3, 20).astype(int)
    }
    self.logger = get_logger(__name__, verb=verb)
    self.shap_explainer = None
    # LinearSVC only supports the l1 penalty with the primal formulation
    if self.penalty == "l1":
        self.dual = False
    else:
        self.dual = True
    classifier = LinearSVC(C=self.C, tol=self.tol, penalty=self.penalty,
                           dual=self.dual, class_weight="balanced",
                           random_state=self.random_state, **kwargs)
    self.pipeline = Pipeline(steps=[
        ("vec", self.vectorizer),
        ("clf", CalibratedClassifierCV(classifier, method="sigmoid", cv=5))
    ])
    self.cv_pipeline = Pipeline(steps=[("vec", self.vectorizer), ("clf", classifier)])
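# Example usage (sketch; assumes the same train()/predict() interface as TrexXGB):
#
#   clf = TrexSVM(C=5., penalty="l2", random_state=2, verb=True)
#   clf.train(records=training_records)
#
# Design note: the main pipeline wraps LinearSVC in CalibratedClassifierCV so that
# calibrated probabilities are available at prediction time, while cv_pipeline keeps
# the uncalibrated LinearSVC, which is cheaper to refit during cross-validation.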
def __init__(self, pipeline: Pipeline, scoring_function: Callable = balanced_accuracy_score,
             cv: int = 5, comple_steps: int = 20, conta_steps: int = 20,
             n_jobs: int = -1, n_replicates: int = 10,
             random_state: np.random.RandomState = None, verb: bool = False,
             reduce_features: bool = False, n_features: int = 10000):
    self.pipeline = pipeline
    self.cv = cv
    self.scoring_method = scoring_function
    self.logger = get_logger(__name__, verb=verb)
    if comple_steps < 1:
        self.logger.warning(
            f"Completeness steps parameter is out of range: "
            f"{comple_steps}, was set to 1 instead")
        comple_steps = 1
    if conta_steps < 1:
        self.logger.warning(
            f"Contamination steps parameter is out of range: "
            f"{conta_steps}, was set to 1 instead")
        conta_steps = 1
    self.comple_steps = comple_steps
    self.conta_steps = conta_steps
    if n_jobs is not None:
        self.n_jobs = n_jobs if n_jobs > 0 else os.cpu_count()
    else:
        self.n_jobs = None
    self.n_replicates = n_replicates
    self.random_state = random_state if type(random_state) is np.random.RandomState \
        else np.random.RandomState(random_state)
    self.reduce_features = reduce_features
    self.n_features = n_features
def __init__(self):
    self._known_taxa = {}
    self.logger = get_logger(self.__class__.__name__)
def recursive_feature_elimination(records: List[TrainingRecord], pipeline: Pipeline,
                                  step: float = DEFAULT_STEP_SIZE,
                                  n_features: int = None,
                                  random_state: np.random.RandomState = None):
    """
    Apply RFE to limit the vocabulary used by the CustomVectorizer, with an optional step size.

    :param records: list of TrainingRecords, entire training set.
    :param pipeline: the pipeline whose vocabulary should be modified.
    :param step: rate of features to eliminate at each step; the lower the number, the more steps.
    :param n_features: number of features to select (if None: half of the provided features).
    :param random_state: random state for deterministic results.
    :return: number of features used.
    """
    t1 = time()
    X, y, tn, ft = get_x_y_tn_ft(records)
    vec = pipeline.named_steps["vec"]
    estimator = pipeline.named_steps["clf"]
    if not vec.vocabulary:
        vec.fit(X)
    previous_vocabulary = vec.vocabulary_
    if not n_features:
        n_features = len(previous_vocabulary) // 2
    X_trans = vec.transform(X)
    logger = get_logger(__name__, verb=True)
    split = StratifiedKFold(shuffle=True, n_splits=5, random_state=random_state)
    selector = RFECV(estimator, step=step, min_features_to_select=n_features,
                     cv=split, n_jobs=5, scoring=DEFAULT_SCORING_FUNCTION)
    selector = selector.fit(X=X_trans, y=y)
    original_size = len(previous_vocabulary)
    support = selector.get_support()
    support = support.nonzero()[0]
    new_id = {support[x]: x for x in range(len(support))}
    vocabulary = {
        feature: new_id[i]
        for feature, i in previous_vocabulary.items()
        if new_id.get(i) is not None
    }
    size_after = selector.n_features_
    t2 = time()
    logger.info(
        f"{size_after}/{original_size} features selected using Recursive Feature Elimination "
        f"in {np.round(t2 - t1, 2)} seconds.")
    # set the reduced vocabulary on the vectorizer
    pipeline.named_steps["vec"].vocabulary = vocabulary
    pipeline.named_steps["vec"].vocabulary_ = vocabulary
    pipeline.named_steps["vec"].fixed_vocabulary_ = True
    return size_after
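# Example usage (sketch; `training_records` and `clf.cv_pipeline` as constructed
# by the classes above):
#
#   n_used = recursive_feature_elimination(training_records, clf.cv_pipeline,
#                                          n_features=10000)
#
# Afterwards the pipeline's vectorizer carries the reduced, fixed vocabulary,
# so subsequent fits only see the selected features.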
from pprint import pformat

from phenotrex.io.flat import (
    load_training_files, load_genotype_file, load_params_file,
    write_weights_file, write_params_file, write_misclassifications_file,
    write_cccv_accuracy_file
)
from phenotrex.io.serialization import save_classifier, load_classifier
from phenotrex.util.logging import get_logger
from phenotrex.ml import TrexSVM, TrexXGB, ShapHandler
from phenotrex.transforms.annotation import fastas_to_grs

CLF_MAPPER = {'svm': TrexSVM, 'xgb': TrexXGB}
logger = get_logger("phenotrex", verb=True)


def _fix_uppercase(kwargs):
    """
    Properly handle uppercase arguments which are normalized by click.
    """
    if 'c' in kwargs:
        kwargs['C'] = kwargs.pop('c')
    return kwargs


def generic_train(type, genotype, phenotype, verb, weights, out,
                  n_features=None, params_file=None, *args, **kwargs):
    """
    Train and save a TrexClassifier model.
    """
    kwargs = _fix_uppercase(kwargs)
    training_records, *_ = load_training_files(
        genotype_file=genotype,
        phenotype_file=phenotype,
        verb=verb
    )