def cvtest(name, base_sp, nsp, fs, base_featstruct, kfold=2, clf_type='svm',
        nfeats=100, norm=True, ppi_output=None, train_limit=None,
        save_data=True, balance_train=False, keep_cols=None, clf_factory=None,
        clffact_feats=None, **kwargs):
    """
    Run k-fold cross-validation over a feature array and pool the scored
    results of every fold.

    name: stored unchanged on the result as `name`.
    base_sp, fs, base_featstruct, nsp, **kwargs: forwarded to
        ppi.feature_array when `ppi_output` is None. `base_sp` is also stored
        on the result as `species`.
    kfold: number of folds; must be greater than 1.
    clf_type: key looked up in `clf_factories` to obtain
        (clf_factory, clffact_feats) when `clf_factory` is not supplied.
    nfeats, norm, balance_train: passed through to fold_test.
    ppi_output: precomputed examples object exposing `.arrfeats` and
        `.ntest_pos`; when given, ppi.feature_array is not called.
    train_limit: if truthy, the number of rows randomly sampled for
        train/cv; otherwise every row is used (in random order).
    save_data: if true, the examples object is attached as `result.exs`.
    keep_cols: if not None, passed to fe.keep_cols to restrict columns.
    clf_factory, clffact_feats: passed through to fold_test; override the
        `clf_type` lookup when `clf_factory` is given.

    Returns a Struct with fields traincv, clf, scaler, ppis, ntest_pos, name,
    species, ppi_params, feats, source_feats and balance_train (plus exs when
    `save_data` is true). `clf`, `scaler` and `feats` come from the LAST fold
    only. `ppis` holds the results of all folds sorted by item [2],
    descending.

    Raises AssertionError if kfold <= 1 or there are no examples, and
    ValueError (from random.sample) if train_limit exceeds the number of rows.
    """
    assert kfold > 1, "CV K-fold 1 not possible"
    if ppi_output is None:
        exs = ppi.feature_array(base_sp, fs, base_featstruct, nsp, **kwargs)
    else:
        exs = ppi_output
    # Work on a copy so the caller's / cached feature array is left untouched.
    arrfeats, ntest_pos = fe.arr_copy(exs.arrfeats), exs.ntest_pos
    assert len(arrfeats) > 0, '0 examples not supported'
    if train_limit:
        print('Sampling %s train/cv examples' % train_limit)
    train_limit = train_limit or len(arrfeats)
    if keep_cols is not None:
        arrfeats = fe.keep_cols(arrfeats, keep_cols)
    # Row count BEFORE sampling: needed below to scale ntest_pos. Previously
    # the divisor was read after sampling, when it already equalled
    # int(train_limit), so the scaling silently did nothing.
    n_before_sampling = len(arrfeats)
    # Sampling row indices also shuffles the rows, even when every row is
    # kept. The original author's note says not to use random.shuffle here.
    sampled_rows = random.sample(range(n_before_sampling), int(train_limit))
    arrfeats = fe.keep_rows(arrfeats, sampled_rows)
    # Scale ntest_pos by the fraction of rows that were kept.
    ntest_pos = int(ntest_pos * train_limit / n_before_sampling)
    if clf_type in clf_factories and clf_factory is None:
        clf_factory, clffact_feats = clf_factories[clf_type]
    ppis = []
    for k in range(kfold):
        print('Fold %s:' % k)
        ppis_fold, clf, scaler, feats = fold_test(arrfeats, kfold, k,
                clf_factory, clffact_feats, nfeats, norm, balance_train)
        ppis += ppis_fold
    # Shuffle first: the sort is stable, so equal-scored items end up in
    # random rather than fold order.
    random.shuffle(ppis)
    ppis.sort(key=lambda x: x[2], reverse=True)
    result = Struct(traincv=arrfeats[['id1', 'id2', 'hit']], clf=clf,
            scaler=scaler, ppis=ppis, ntest_pos=ntest_pos, name=name,
            species=base_sp, ppi_params=str(clf), feats=feats,
            source_feats=exs.arrfeats.dtype.names,
            balance_train=balance_train)
    if save_data:
        result.exs = exs
    return result
def enrichment_array_combined(sp_base, sp_dict_elutfs, cxs, func=np.average,
        nsp=1, scores=None, exs=None):
    """
    Build a feature array from the elution files of several species and
    merge each species' columns into one.

    sp_base: passed to ppi.feature_array as the base species.
    sp_dict_elutfs: {'Ce': [Ce_elution_1, Ce_elution_2, ...] , ...}
    cxs: iterable of complexes; each is converted to a set and paired with
        its index to build `exs` when `exs` is not supplied.
    func: combining function handed to fe.merge_features (default
        np.average).
    nsp: passed to ppi.feature_array.
    scores: list of score names passed to ppi.feature_array; defaults to
        ["poisson"]. A fresh list is created on every call so the default can
        never be shared or mutated across calls.
    exs: optional precomputed examples; when falsy it is built from `cxs`
        via correlation_enrichment.

    Returns the feature array after one fe.merge_features pass per species
    key in `sp_dict_elutfs`.
    """
    if scores is None:
        scores = ["poisson"]
    exs = exs or correlation_enrichment(
            [(i, set(c)) for i, c in enumerate(cxs)])
    # Only the per-species file lists are needed here, not the species keys.
    elutfs = ut.flatten(list(sp_dict_elutfs.values()))
    ppio = ppi.feature_array(sp_base, elutfs, exs, nsp, scores=scores,
            extdata=[], do_filter=False)
    newarr = ppio.arrfeats
    for sp in sp_dict_elutfs:
        # "<sp>.*" is presumably a pattern selecting that species' feature
        # columns -- confirm against fe.merge_features.
        newarr = fe.merge_features(newarr, "%s.*" % sp, func, False)
    return newarr