Example #1
def load_training_files(
    genotype_file: str,
    phenotype_file: str,
    groups_file: str = None,
    selected_rank: str = None,
    verb=False
) -> Tuple[
    List[TrainingRecord], List[GenotypeRecord], List[PhenotypeRecord], List[GroupRecord]
]:
    """
    Convenience function to load a phenotype, genotype and (optionally) groups file together,
    and return the collated TrainingRecords along with the individual record lists.

    :param genotype_file: The path to the input genotype file.
    :param phenotype_file: The path to the input phenotype file.
    :param groups_file: The path to the input groups file. Optional.
    :param selected_rank: The selected standard rank to use for taxonomic grouping.
    :param verb: toggle verbosity.
    :return: The collated TrainingRecords as well as single genotype, phenotype and group records
    """
    logger = get_logger(__name__, verb=verb)
    gr = load_genotype_file(genotype_file)
    pr = load_phenotype_file(phenotype_file)
    if groups_file:
        gp = load_groups_file(groups_file, selected_rank=selected_rank)
    else:
        # if not given, each sample gets its own group (not used currently)
        gp = [
            GroupRecord(identifier=x.identifier, group_name=x.identifier, group_id=y)
            for y, x in enumerate(pr)
        ]
    tr = collate_training_data(gr, pr, gp, verb=verb)
    logger.info("Records successfully loaded from file.")
    return tr, gr, pr, gp
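
A minimal usage sketch (the input paths here are hypothetical placeholders, not files shipped with phenotrex):

# Hypothetical usage sketch for load_training_files; paths are placeholders.
from phenotrex.io.flat import load_training_files

training_records, genotype_records, phenotype_records, group_records = load_training_files(
    genotype_file="data/samples.genotype",   # placeholder path
    phenotype_file="data/trait.phenotype",   # placeholder path
    verb=True,
)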
Example #2
    def __init__(self, random_state: int = None, verb: bool = False):
        self.logger = get_logger(initname=self.__class__.__name__, verb=verb)
        # accept either a seed or an already-constructed RandomState
        self.random_state = random_state if isinstance(random_state, RandomState) \
            else RandomState(random_state)
        self.conta_source_pos = None
        self.conta_source_neg = None
        self.fitted = False
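
The constructor accepts either a plain seed or an already-constructed RandomState and normalizes both to a RandomState. A self-contained sketch of the same pattern:

# Self-contained sketch of the seed-or-RandomState normalization used above.
from numpy.random import RandomState

def normalize_random_state(random_state=None) -> RandomState:
    # Pass an existing RandomState through unchanged; wrap seeds (or None) otherwise.
    if isinstance(random_state, RandomState):
        return random_state
    return RandomState(random_state)

rs = RandomState(0)
assert normalize_random_state(rs) is rs
assert isinstance(normalize_random_state(42), RandomState)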
Example #3
def load_classifier(filename: str, verb=False):
    """
    Load a pickled TrexClassifier to a usable object.

    :param filename: Input filename
    :param verb: Toggle verbosity
    :return: an unpickled PICA ML classifier
    """
    logger = get_logger(initname=__name__, verb=verb)
    if not os.path.isfile(filename):
        raise RuntimeError(f"Input file does not exist: {filename}")
    try:
        obj = joblib.load(filename)
    except ModuleNotFoundError:  # load old models
        sys.modules['pica'] = phenotrex
        obj = joblib.load(filename)
    if not hasattr(obj, 'feature_type'):
        obj.feature_type = 'legacy'
        logger.warning(
            'The loaded classifier does not advertise the feature_type it was trained on. '
            'Consider re-training it with a .genotype file containing the requisite metadata '
            'to ensure feature types of the model and the data it is applied to are aligned.'
        )
    logger.info(
        f"Successfully loaded classifier (feature_type={obj.feature_type}).")
    return obj
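
A hypothetical usage sketch ('my_model.pkl' is a placeholder filename):

# Hypothetical usage sketch; 'my_model.pkl' is a placeholder filename.
from phenotrex.io.serialization import load_classifier

clf = load_classifier("my_model.pkl", verb=True)
print(clf.feature_type)  # 'legacy' if the model predates feature_type metadata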
Example #4
def collate_training_data(
    genotype_records: List[GenotypeRecord],
    phenotype_records: List[PhenotypeRecord],
    group_records: List[GroupRecord],
    verb: bool = False
) -> List[TrainingRecord]:
    """
    Build a list of TrainingRecords from lists of GenotypeRecord, PhenotypeRecord and
    GroupRecord, to be used for training and CV of TrexClassifier.
    Checks that every phenotype record has a matching genotype and group record,
    and that all PhenotypeRecords pertain to the same trait.

    :param genotype_records: List[GenotypeRecord]
    :param phenotype_records: List[PhenotypeRecord]
    :param group_records: List[GroupRecord] optional, if leave one group out is the split strategy
    :param verb: toggle verbosity.
    :return: A list of TrainingRecords.
    """
    logger = get_logger(__name__, verb=verb)
    gr_dict = {x.identifier: x for x in genotype_records}
    pr_dict = {x.identifier: x for x in phenotype_records}
    gp_dict = {x.identifier: x for x in group_records}
    traits = set(x.trait_name for x in phenotype_records)
    if not set(gr_dict.keys()).issuperset(set(pr_dict.keys())):
        raise RuntimeError(
            "Not all identifiers of phenotype records were found in the phenotype file. "
            "Cannot collate to TrainingRecords."
        )
    if not set(gp_dict.keys()).issuperset(set(pr_dict.keys())):
        raise RuntimeError(
            "Not all identifiers of phenotype records were found in the groups file. "
            "Cannot collate to TrainingRecords."
        )
    if len(traits) > 1:
        raise RuntimeError(
            "More than one trait has been found in phenotype records. "
            "Cannot collate to TrainingRecords."
        )
    ret = [
        TrainingRecord(
            identifier=pr_dict[x].identifier,
            trait_name=pr_dict[x].trait_name,
            trait_sign=pr_dict[x].trait_sign,
            feature_type=gr_dict[x].feature_type,
            features=gr_dict[x].features,
            group_name=gp_dict[x].group_name,
            group_id=gp_dict[x].group_id
        ) for x in pr_dict.keys()
    ]
    logger.info(f"Collated genotype and phenotype records into {len(ret)} TrainingRecord.")
    return ret
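
The superset checks can be illustrated in isolation with plain sets; a self-contained sketch (the accession-style identifiers are made up for illustration):

# Self-contained sketch of the identifier-consistency check performed above:
# every phenotype identifier must also appear among the genotype identifiers.
genotype_ids = {"GCA_000005845", "GCA_000006945", "GCA_000008865"}
phenotype_ids = {"GCA_000005845", "GCA_000006945"}

if not genotype_ids.issuperset(phenotype_ids):
    raise RuntimeError("Not all identifiers of phenotype records were found "
                       "in the genotype file.")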
Example #5
def load_classifier(filename: str, verb=False):
    """
    Load a pickled TrexClassifier to a usable object.

    :param filename: Input filename
    :param verb: Toggle verbosity
    :return: an unpickled PICA ML classifier
    """
    logger = get_logger(initname=__name__, verb=verb)
    if not os.path.isfile(filename):
        raise RuntimeError(f"Input file does not exist: {filename}")
    obj = joblib.load(filename)
    logger.info(
        f"Successfully loaded classifier (feature_type={obj.feature_type}).")
    return obj
Example #6
    def _completeness_cv(self, param,
                         **kwargs) -> Dict[float, Dict[float, float]]:
        """
        Perform completeness/contamination simulation and testing for one fold.
        This is a separate function only called by run_cccv which spawns
        subprocesses using a ProcessPoolExecutor from concurrent.futures

        :param param: List [test_records, X_train, y_train, comple_steps, conta_steps, starting_message]
                      workaround to get multiple parameters into this function. (using processor.map)
        """
        # unpack parameters
        test_records, training_records, comple_steps, conta_steps, verb, starting_message = param

        # needed to create a new logger, self.logger not accessible from a different process
        logger = get_logger(__name__, verb=verb)
        logger.info(starting_message)

        classifier = copy.deepcopy(self.pipeline)
        if self.reduce_features:
            recursive_feature_elimination(training_records,
                                          classifier,
                                          n_features=self.n_features,
                                          random_state=self.random_state)

        X_train, y_train, tn, ft = get_x_y_tn_ft(training_records)
        classifier.fit(X=X_train, y=y_train, **kwargs)

        # initialize the resampler with the test_records only,
        # so the samples are unknown to the classifier
        resampler = TrainingRecordResampler(random_state=self.random_state,
                                            verb=False)
        resampler.fit(records=test_records)
        cv_scores = {}
        comple_increment = 1 / comple_steps
        conta_increment = 1 / conta_steps
        for comple in np.arange(0, 1.05, comple_increment):
            comple = np.round(comple, 2)
            cv_scores[comple] = {}
            for conta in np.arange(0, 1.05, conta_increment):
                conta = np.round(conta, 2)
                resampled_set = [
                    resampler.get_resampled(x, comple, conta)
                    for x in test_records
                ]
                cv_scores[comple][conta] = self._validate_subset(
                    resampled_set, classifier)
        return cv_scores
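
With comple_steps=20, the increment is 0.05 and the completeness grid runs from 0 to 1 inclusive, i.e. comple_steps + 1 levels. A quick self-contained check of the grid construction used above:

# Quick check of the completeness/contamination grid built above.
import numpy as np

comple_steps = 20
comple_increment = 1 / comple_steps                       # 0.05
grid = np.round(np.arange(0, 1.05, comple_increment), 2)
print(grid)  # [0.   0.05 0.1  ... 0.95 1.  ] -> 21 levels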
Example #7
    def __init__(self,
                 max_depth: int = 4,
                 learning_rate: float = 0.05,
                 n_estimators: int = 30,
                 gamma: float = 0.,
                 min_child_weight: int = 1,
                 subsample: float = 0.7,
                 colsample_bytree: float = 0.3,
                 n_jobs: int = 1,
                 random_state: int = None,
                 verb=False,
                 *args,
                 **kwargs):
        super().__init__(random_state=random_state, verb=verb)
        if n_jobs == -1:
            n_jobs = os.cpu_count()
        self.n_jobs = n_jobs
        self.logger = get_logger(__name__, verb=verb)
        self.default_search_params = {
            'n_estimators': np.array([20, 30, 50, 80, 100, 150]),
            'subsample': np.arange(0.2, 1., 0.1).round(2),
            'colsample_bytree': np.arange(0.2, 1., 0.1).round(2),
            'min_child_weight': np.arange(1, 20),
            'gamma': np.array([0, 0.2, 0.5, 1, 5, 10]),
            'max_depth': np.arange(3, 7),
            'scale_pos_weight': np.array([1, 1.5, 2, 3, 5, 8]),
            'learning_rate': np.arange(0.01, 0.11, 0.01).round(4),
            'eval_metric': ['auc', 'aucpr']
        }

        classifier = xgb.sklearn.XGBClassifier(
            missing=0,
            max_depth=max_depth,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            gamma=gamma,
            min_child_weight=min_child_weight,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            n_jobs=n_jobs,
            **kwargs)

        self.pipeline = Pipeline(steps=[("vec",
                                         self.vectorizer), ("clf",
                                                            classifier)])

        self.cv_pipeline = clone(self.pipeline)
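
The default search grids above are plain numpy arrays; for instance, the subsample grid expands as follows (a quick self-contained check):

# Quick check of one of the default hyperparameter search grids above.
import numpy as np

subsample_grid = np.arange(0.2, 1., 0.1).round(2)
print(subsample_grid)  # [0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]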
Example #8
def save_classifier(obj, filename: str, overwrite=False, verb=False):
    """
    Save a TrexClassifier as a pickled object.

    :param obj: the Python3 object to be saved.
    :param filename: Output filename
    :param overwrite: Overwrite existing files with same name
    :param verb: Toggle verbosity
    """
    logger = get_logger(initname=__name__, verb=verb)
    basefolder = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(basefolder):
        raise RuntimeError(f"Output folder does not exist: {basefolder}")
    if os.path.isfile(filename):
        if overwrite:
            logger.warning("Overwriting existing file.")
        else:
            raise RuntimeError("Output file exists.")
    logger.info("Begin saving classifier...")
    joblib.dump(obj, filename=filename)
    logger.info("Classifier saved.")
Example #9
File: svm.py Project: univieCUBE/phenotrex
    def __init__(self,
                 C: float = 5.,
                 penalty: str = "l2",
                 tol: float = 1.,
                 random_state: int = None,
                 verb=False,
                 *args,
                 **kwargs):
        super().__init__(random_state=random_state, verb=verb)
        self.C = C
        self.penalty = penalty
        self.tol = tol
        self.default_search_params = {
            'C': np.logspace(-6, 4, 30).round(8),
            'tol': np.logspace(0, -5, 10).round(8),
            'max_iter': np.logspace(2, 4.3, 20).astype(int)
        }
        self.logger = get_logger(__name__, verb=verb)
        self.shap_explainer = None

        if self.penalty == "l1":
            self.dual = False
        else:
            self.dual = True

        classifier = LinearSVC(C=self.C,
                               tol=self.tol,
                               penalty=self.penalty,
                               dual=self.dual,
                               class_weight="balanced",
                               random_state=self.random_state,
                               **kwargs)

        self.pipeline = Pipeline(steps=[
            ("vec", self.vectorizer),
            ("clf", CalibratedClassifierCV(classifier, method="sigmoid", cv=5))
        ])
        self.cv_pipeline = Pipeline(steps=[("vec", self.vectorizer), ("clf", classifier)])
Example #10
    def __init__(self,
                 pipeline: Pipeline,
                 scoring_function: Callable = balanced_accuracy_score,
                 cv: int = 5,
                 comple_steps: int = 20,
                 conta_steps: int = 20,
                 n_jobs: int = -1,
                 n_replicates: int = 10,
                 random_state: np.random.RandomState = None,
                 verb: bool = False,
                 reduce_features: bool = False,
                 n_features: int = 10000):
        self.pipeline = pipeline
        self.cv = cv
        self.scoring_method = scoring_function
        self.logger = get_logger(__name__, verb=verb)
        if comple_steps < 1:
            self.logger.warning(
                f"Completeness steps parameter is out of range: "
                f"{comple_steps}, was set to 1 instead")
            comple_steps = 1
        if conta_steps < 1:
            self.logger.warning(
                f"Contamination steps parameter is out of range: "
                f"{conta_steps}, was set to 1 instead")
            conta_steps = 1

        self.comple_steps = comple_steps
        self.conta_steps = conta_steps
        if n_jobs is not None:
            self.n_jobs = n_jobs if n_jobs > 0 else os.cpu_count()
        else:
            self.n_jobs = None
        self.n_replicates = n_replicates
        self.random_state = random_state if isinstance(random_state, np.random.RandomState) \
            else np.random.RandomState(random_state)
        self.reduce_features = reduce_features
        self.n_features = n_features
Example #11
    def __init__(self):
        self._known_taxa = {}
        self.logger = get_logger(self.__class__.__name__)
Example #12
def recursive_feature_elimination(records: List[TrainingRecord],
                                  pipeline: Pipeline,
                                  step: float = DEFAULT_STEP_SIZE,
                                  n_features: int = None,
                                  random_state: np.random.RandomState = None):
    """
    Apply recursive feature elimination (RFE) to limit the vocabulary used by the
    CustomVectorizer.

    :param records: list of TrainingRecords, the entire training set.
    :param pipeline: the pipeline whose vocabulary should be modified
    :param step: fraction of features to eliminate at each step; the lower the number, the more steps
    :param n_features: number of features to select (if None: half of the provided features)
    :param random_state: random state for deterministic results
    :return: number of features used
    """
    t1 = time()

    X, y, tn, ft = get_x_y_tn_ft(records)
    vec = pipeline.named_steps["vec"]
    estimator = pipeline.named_steps["clf"]

    if not vec.vocabulary:
        vec.fit(X)
    previous_vocabulary = vec.vocabulary_

    if not n_features:
        n_features = len(previous_vocabulary) // 2

    X_trans = vec.transform(X)

    logger = get_logger(__name__, verb=True)
    split = StratifiedKFold(shuffle=True,
                            n_splits=5,
                            random_state=random_state)
    selector = RFECV(estimator,
                     step=step,
                     min_features_to_select=n_features,
                     cv=split,
                     n_jobs=5,
                     scoring=DEFAULT_SCORING_FUNCTION)
    selector = selector.fit(X=X_trans, y=y)

    original_size = len(previous_vocabulary)
    support = selector.get_support()
    support = support.nonzero()[0]
    new_id = {support[x]: x for x in range(len(support))}
    vocabulary = {
        feature: new_id[i]
        for feature, i in previous_vocabulary.items()
        if new_id.get(i) is not None
    }
    size_after = selector.n_features_

    t2 = time()

    logger.info(
        f"{size_after}/{original_size} features selected using Recursive Feature Elimination "
        f"in {np.round(t2 - t1, 2)} seconds.")

    # set vocabulary to vectorizer
    pipeline.named_steps["vec"].vocabulary = vocabulary
    pipeline.named_steps["vec"].vocabulary_ = vocabulary
    pipeline.named_steps["vec"].fixed_vocabulary_ = True

    return size_after
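
The index remapping can be seen in isolation: features surviving RFE keep their relative order but are renumbered contiguously from zero. A self-contained sketch (the K-number feature names are made up for illustration):

# Self-contained sketch of the vocabulary remapping performed above.
previous_vocabulary = {"K00001": 0, "K00002": 1, "K00003": 2, "K00004": 3}
support = [0, 2, 3]                       # feature indices retained by the selector
new_id = {old: new for new, old in enumerate(support)}
vocabulary = {
    feature: new_id[i]
    for feature, i in previous_vocabulary.items()
    if new_id.get(i) is not None
}
print(vocabulary)  # {'K00001': 0, 'K00003': 1, 'K00004': 2}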
Example #13
from pprint import pformat

from phenotrex.io.flat import (load_training_files, load_genotype_file, load_params_file,
                               write_weights_file, write_params_file,
                               write_misclassifications_file,
                               write_cccv_accuracy_file)
from phenotrex.io.serialization import save_classifier, load_classifier
from phenotrex.util.logging import get_logger
from phenotrex.ml import TrexSVM, TrexXGB, ShapHandler
from phenotrex.transforms.annotation import fastas_to_grs

CLF_MAPPER = {'svm': TrexSVM, 'xgb': TrexXGB}
logger = get_logger("phenotrex", verb=True)


def _fix_uppercase(kwargs):
    """
    Properly handle uppercase arguments which are normalized by click.
    """
    if 'c' in kwargs:
        kwargs['C'] = kwargs.pop('c')
    return kwargs
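
A quick check of the fix-up (click lowercases option names, so the hyperparameter C arrives as 'c'):

# Quick check of the uppercase fix-up above.
assert _fix_uppercase({'c': 5.0, 'tol': 1.0}) == {'C': 5.0, 'tol': 1.0}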


def generic_train(type, genotype, phenotype, verb, weights, out,
                  n_features=None, params_file=None, *args, **kwargs):
    """
    Train and save a TrexClassifier model.
    """
    kwargs = _fix_uppercase(kwargs)
    training_records, *_ = load_training_files(