def full_train(model, haplotypes, group_keys, iteration):
    """Fit ``model`` on the full data set ``iteration`` times and keep the best round.

    The same data is used both for fitting and for scoring (there is no
    hold-out set).  Every round is ranked by the harmonic mean of the 'F'
    values found in the 'MIN' and 'MEAN' rows ('REG' column) of the score
    table returned by ``lkprof.calculate_scores``.

    Intended return value, per the original docstring:
    (matrix_profiles, snp_list).

    NOTE(review): as written this function looks unfinished:
      * ``k``, ``results``, ``snps``, ``simid`` and ``k_fold`` are never
        assigned in this function, so unless they exist as module-level
        globals the body raises NameError;
      * there is no ``return`` statement, so callers receive None and the
        ``log`` list built at the end is discarded;
      * ``fit_and_predict`` is unpacked into three values here, while
        ``validator_worker`` in this file unpacks four values from the same
        call - confirm which arity is current;
      * ``lkprof.calculate_scores`` is called with ``model=``/``selector=``/
        ``iter=`` keywords, which differs from the other call sites in this
        file - confirm against the current signature.
    """

    # train and test on the very same samples
    X_train = X_test = haplotypes
    y_train = y_test = group_keys

    # (f_score, scores, orig_scores, snplist) of the best round seen so far
    best_score = (-1, None, None, None)
    for i in range(iteration):
        # the iteration here is used for stochastic models where each iteration can yield
        # different result
        lk_predictions, snplist, orig_predictions = fit_and_predict(model, X_train, y_train, X_test, k)
        scores = lkprof.calculate_scores(y_test, lk_predictions, len(snplist), model='lk'
                , selector=model.code, iter = i)
        if orig_predictions is not None:
            orig_scores = lkprof.calculate_scores(y_test, orig_predictions
                , len(snplist), model=model.code, selector=model.code, iter = i)
        else:
            orig_scores = None

        # harmonic mean of the MIN and MEAN F values
        # NOTE(review): the denominator is zero when both values are 0
        f_min = scores.loc[ scores['REG'] == 'MIN', 'F'].values[0]
        f_mean = scores.loc[ scores['REG'] == 'MEAN', 'F'].values[0]
        f_score = 2 * f_min * f_mean / (f_min + f_mean)
        if f_score > best_score[0]:
            best_score = (f_score, scores, orig_scores, snplist.tolist())

    # NOTE(review): results / snps / simid / k_fold / k are not defined in this scope
    results.append( best_score[1] )
    if best_score[2] is not None:
        results.append( best_score[2] )
    snps['%d/%d/%d/%d' % (simid, k_fold, k, len(best_score[3]))] = best_score[3]

    # reformat model log
    log = [ '[I - {%d} %s]' % (simid, line) for line in model.get_loglines()]
def select_2(self, haplotypes1, haplotypes2):
    """Refine the SNP choice for two populations with a decision tree.

    Both haplotype sets are stacked into one training matrix (first set
    labelled 1, second set labelled 2).  Three rounds are run; in each one
    a ``DecisionTreeClassifier`` is fitted, the feature indices it split on
    are collected, and that SNP set is scored by predicting the training
    data itself.

    Returns:
        (snplist, F): ``snplist`` is the list of SNP positions of the
        best-scoring round and ``F`` its 'F' value from the 'MIN' row of
        the score table.  When no round scores above -1 the result is
        (None, -1).

    NOTE(review): ``select`` in this file builds the selector as
    ``FixSNPSelector('dummy', snpindex=...)`` and calls
    ``model.fit_and_predict(...)`` as a method, whereas this method uses
    ``FixSNPSelector(features)`` and the free function ``fit_and_predict``;
    confirm both call styles are still supported.
    """
    X_train = np.append(haplotypes1, haplotypes2, axis=0)
    labels = [1] * len(haplotypes1) + [2] * len(haplotypes2)
    y_train = np.array(labels)

    top_f = -1
    top_features = None

    for _ in range(3):
        tree = DecisionTreeClassifier(
            class_weight='balanced',
            random_state=self.randomstate,
            min_samples_leaf=2,
        ).fit(X_train, y_train)

        # keep each non-negative feature index once (negative entries are dropped)
        used = tree.tree_.feature
        used = np.unique(used[used >= 0])

        selector = FixSNPSelector(used)
        lk_predictions, _snplist, _orig, _params = fit_and_predict(
            selector, X_train, y_train, X_train, len(used))

        score_table = lkprof.calculate_scores(y_train, lk_predictions)
        min_rows = score_table['REG'] == 'MIN'
        f_score = score_table.loc[min_rows, 'F'].values[0]

        if f_score > top_f:
            top_f = f_score
            top_features = used.tolist()

    return top_features, top_f
def _best_of_iterations(model, X_train, y_train, X_test, y_test, k,
                        iteration, simid, k_fold):
    """Run ``iteration`` fit/predict rounds for one ``k`` and keep the best round.

    Rounds are ranked by the 'F' value in the 'MIN' row ('REG' column) of the
    score table computed for the lk predictions.

    Returns:
        (f_score, scores, orig_scores, snplist) of the best round, or
        (-1, None, None, None) when no round scored above -1 (for example
        when ``iteration`` is 0).
    """
    best = (-1, None, None, None)
    for _ in range(iteration):
        # repeated rounds matter for stochastic models, where each round can
        # yield a different result
        lk_predictions, snplist, orig_predictions, params = fit_and_predict(
            model, X_train, y_train, X_test, k)

        scores = lkprof.calculate_scores(
            y_test, lk_predictions,
            k=len(snplist), _k=k, EST='lk', SELECTOR=model.code,
            SIMID=simid, FOLD=k_fold, **params)

        orig_scores = None
        if orig_predictions is not None:
            orig_scores = lkprof.calculate_scores(
                y_test, orig_predictions,
                k=len(snplist), _k=k, EST=model.code, SELECTOR=model.code,
                SIMID=simid, FOLD=k_fold, **params)

        f_score = scores.loc[scores['REG'] == 'MIN', 'F'].values[0]
        if f_score > best[0]:
            best = (f_score, scores, orig_scores, snplist.tolist())

    return best


def validator_worker(args):
    """Score a model for every k in ``k_list``, with or without k-fold CV.

    Args:
        args: tuple ``(model, y, k_list, fold, iteration, simid)``.
            ``fold <= 0`` disables cross-validation: the model is trained
            and scored on the full data set and the fold number is
            reported as -1.  Otherwise a shuffled ``StratifiedKFold`` with
            ``fold`` splits is used and folds are numbered from 0.

    The data matrix X is read from the ``var_dict`` mapping (defined outside
    this function): when ``var_dict['X_shape']`` is None, ``var_dict['X']``
    is used as is; otherwise it is treated as an int8 buffer and reshaped to
    ``var_dict['X_shape']``.

    Returns:
        (simid, scores, snplist, log) where
            simid: the simulation/repeat identifier taken from ``args``
            scores: pandas DataFrame with the score tables of the best round
                for each (fold, k); empty if nothing could be scored
            snplist: dict mapping 'simid/fold/k/n_snps' to the list of SNPs
            log: list of formatted log messages from the model
    """
    model, y, k_list, fold, iteration, simid = args
    pid = os.getpid()

    cerr('[I - pid %d: validator_worker() started]' % pid)

    np.random.seed(simid % pid)
    model.reseed(simid)

    if var_dict['X_shape'] is None:
        X = var_dict['X']
    else:
        cerr('[I - pid %d: validator_worker() is mapping numpy array]' % pid)
        X = np.frombuffer(var_dict['X'], dtype=np.int8).reshape(var_dict['X_shape'])

    if fold <= 0:
        # no cross-validation: train and test on the same data, fold = -1
        splits = [(-1, X, y, X, y)]
    else:
        # check for sample size suitability for k-folding
        X, y = prepare_stratified_samples(X, y, fold)
        skf = StratifiedKFold(n_splits=fold, shuffle=True,
                              # integer bound (was the float literal 1e8)
                              random_state=np.random.randint(10 ** 8))
        splits = (
            (fold_no, X[train_index], y[train_index], X[test_index], y[test_index])
            for fold_no, (train_index, test_index) in enumerate(skf.split(X, y))
        )

    results = []
    snps = {}

    for k_fold, X_train, y_train, X_test, y_test in splits:
        for k in k_list:
            # best score is based on the highest MIN F score
            best_score = _best_of_iterations(
                model, X_train, y_train, X_test, y_test, k,
                iteration, simid, k_fold)

            # no round produced a usable score: nothing to record for this k
            if best_score[0] < 0:
                continue

            results.append(best_score[1])
            if best_score[2] is not None:
                results.append(best_score[2])
            snps['%d/%d/%d/%d' % (simid, k_fold, k, len(best_score[3]))] = best_score[3]

    # reformat model log
    log = ['[I - {%d} %s]' % (simid, line) for line in model.get_loglines()]

    # pd.concat() raises on an empty list, so hand back an empty frame instead
    if not results:
        return (simid, pd.DataFrame(), snps, log)
    return (simid, pd.concat(results), snps, log)
def score(self, genotype_train, group_train, genotype_test, group_test, simid, k_fold):
    """Score this model on one train/test split for every k in ``self.k_list``.

    For each k, ``self.iteration`` fit/predict rounds are run; rounds whose
    lk prediction is None are skipped.  The round with the highest 'F' value
    in the 'MIN' row of its score table is kept.  A k for which no round
    produced a score is left out of the results.

    Returns:
        (scores, snps, log) where ``scores`` is a DataFrame concatenating the
        kept score tables (an empty DataFrame when there are none), ``snps``
        maps 'model_id/simid/fold/k/n_snps' to the selected SNP list, and
        ``log`` is a list of formatted log lines.
    """
    score_frames = []
    snps = {}
    log = []

    for k in self.k_list:
        # best round so far: F value, lk score table, original score table, SNPs
        top_f = -1
        top_scores = top_orig = top_snps = None

        for _ in range(self.iteration):
            lk_pred, snplist, orig_pred, params = self.fit_and_predict(
                genotype_train, group_train, genotype_test, k)
            if lk_pred is None:
                continue

            # tags attached to both score tables of this round
            tags = dict(k=len(snplist), _k=k, SELECTOR=self.code,
                        MODELID=self.model_id, SIMID=simid, FOLD=k_fold)

            scores = lkprof.calculate_scores(
                group_test, lk_pred, EST='lk', **tags, **params)

            if orig_pred is None:
                orig_scores = None
            else:
                orig_scores = lkprof.calculate_scores(
                    group_test, orig_pred, EST=self.code, **tags, **params)

            min_rows = scores['REG'] == 'MIN'
            f_score = scores.loc[min_rows, 'F'].values[0]
            if f_score > top_f:
                top_f = f_score
                top_scores = scores
                top_orig = orig_scores
                top_snps = snplist.tolist()

        if top_f < 0:
            continue

        score_frames.append(top_scores)
        if top_orig is not None:
            score_frames.append(top_orig)

        snps['%s/%d/%d/%d/%d'
             % (self.model_id, simid, k_fold, k, len(top_snps))] = top_snps

        log.extend([
            '[I - {%d|%s}: %s]' % (simid, self.model_id, line)
            for line in self.flush_log()
        ])

    if not score_frames:
        return (pd.DataFrame(), snps, log)
    return (pd.concat(score_frames, sort=False), snps, log)
def select(self, haplotypes, groups, haplotest, k=None):
    """Select SNPs by walking the guide tree and ranking sites by Hudson FST.

    For every split ``(level, pop1, pop2)`` yielded by
    ``traverse(self.guide_tree)`` the samples of both sides are extracted,
    converted to allele counts and compared with ``allel.hudson_fst``.  The
    ``k`` top-ranked sites are the default pick for the split.  When even
    the best of them is below ``self.min_fst``, larger top-ranked sets (up
    to ``self.max_leaf_snp - 1`` sites) and the decision-tree based
    ``self.select_2`` are tried, and the best-scoring SNP set is used
    instead if one is found.

    Args:
        haplotypes: sample-by-site array; rows are selected with a boolean
            mask built from ``groups``.
        groups: per-sample group labels, matched with ``np.isin``.
        haplotest: not used by this method; kept for interface compatibility.
        k: number of top-ranked SNPs kept per split (redundancy parameter);
            0 or None means 1.

    Returns:
        (L, None, {}) where ``L`` is the sorted array of unique selected
        SNP positions.
    """
    # we use k for redundancy parameters
    if k == 0 or k is None:
        k = 1

    candidate_L = []    # [(pos, rank, no_actual_pops)]

    # we traverse through the tree
    for (level, pop1, pop2) in traverse(self.guide_tree):

        n_pops = len(pop1) + len(pop2)
        haplotypes1 = haplotypes[np.isin(groups, pop1)]
        haplotypes2 = haplotypes[np.isin(groups, pop2)]

        # small populations are only reported, processing continues
        if len(haplotypes1) < 4:
            cerr('[I - insufficient population size for %s]' % pop1)
        if len(haplotypes2) < 4:
            cerr('[I - insufficient population size for %s]' % pop2)

        # convert haplotypes to allele counts
        ac1 = count_allele(haplotypes1)
        ac2 = count_allele(haplotypes2)

        # calculate per-site FST
        num, den = allel.hudson_fst(ac1, ac2)

        # NOTE: the line below avoids warning (invalid value in true_divide)
        # when den == 0, which should be perfectly ok for FST calculation
        den[den == 0] = -1
        fst = num / den

        # check for FST == 1.0
        ultimate_fst_pos = np.nonzero(fst == 1.0)[0]
        if len(ultimate_fst_pos) > 0:
            self.log('FST: 1.0 at %s for pop %s <> %s'
                     % (str(ultimate_fst_pos), pop1, pop2))

        # sortidx is an ascending ranking: the best candidate comes last
        if len(ultimate_fst_pos) > k and self.priority is not None:
            # more perfect sites than needed: rank them by priority
            ultimate_priority = self.priority[ultimate_fst_pos]
            sortidx = ultimate_fst_pos[np.argsort(ultimate_priority)]
        else:
            # argsort puts NaN last, i.e. in the top slots; rank undefined
            # FST values lowest so they cannot displace real candidates
            sortidx = np.argsort(np.where(np.isnan(fst), -np.inf, fst))

        # get highest FST: the last k entries of the ascending ranking.
        # (The previous slice, sortidx[-(k + 1):-1], stopped before the last
        # element and therefore always skipped the top-ranked site.)
        highest_fst_pos = sortidx[-k:]
        highest_fst_val = fst[highest_fst_pos]

        # check suitability of SNPs
        snplist, F = None, -1
        if highest_fst_val.max() < self.min_fst:

            if self.max_leaf_snp > k:
                # try progressively larger sets of top-ranked sites and keep
                # the set with the best MIN F score on the training data
                X_train = np.append(haplotypes1, haplotypes2, axis=0)
                y_train = np.array([1] * len(haplotypes1) + [2] * len(haplotypes2))

                best_iteration = (-1, None)
                for i in range(k, self.max_leaf_snp):
                    # top i sites (i >= k >= 1, so -i is never 0)
                    features = sortidx[-i:]

                    model = FixSNPSelector('dummy', snpindex=features)
                    lk_predictions, snplist, _, params = model.fit_and_predict(
                        X_train, y_train, X_train, len(features))
                    scores = lkprof.calculate_scores(y_train, lk_predictions)

                    F = scores.loc[scores['REG'] == 'MIN', 'F'].values[0]
                    if best_iteration[0] < F:
                        best_iteration = (F, snplist)

                snplist, F = best_iteration[1], best_iteration[0]

            # decision-tree based alternative; use it when it scores better
            snplist_2, F_2 = self.select_2(haplotypes1, haplotypes2)
            if F_2 > F:
                snplist, F = snplist_2, F_2

            if snplist is not None:
                self.log('F: %5.4f SNP: %d for pop %s <> %s => %s'
                         % (F, len(snplist), pop1, pop2, snplist))
                candidate_L.extend((p, level, n_pops) for p in snplist)
                continue

            # TODO: 2nd approach: find 2 SNPs with highest r^2(st) eg r^2 subpopulation vs r^2 total population

            # if snplist is None, just provide warning notice !
            self.log('low FST = %5.4f for %s vs %s'
                     % (highest_fst_val.max(), pop1, pop2))

        # append to candidate_L
        candidate_L.extend((p, level, n_pops) for p in highest_fst_pos)
        self.log('FST: %s SNP: %d for pop %s <> %s => %s'
                 % (str(highest_fst_val), len(highest_fst_pos), pop1, pop2,
                    str(highest_fst_pos)))

    # process candidate_L: np.unique returns the sorted unique positions
    L = np.unique([pos for pos, _, _ in candidate_L])

    # return snp position
    return (L, None, {})