示例#1
0
def compact(d, scoref, dtype='f2'):
    """
    Convert the text score file `scoref` into a compact saved array.

    The file is read with np.loadtxt using `dtype` and written with
    utils.savepy to '<scoref>.<dtype>.pyd'.

    - d: directory; d+'/..' is put on sys.path so that the `utils` module can
      be imported (presumably `utils` lives in the parent of d).
    - scoref: path of the text score file to load.
    - dtype: numpy dtype code, used both for loading and in the output name.
    """
    utils_dir = d+'/..'
    # Extend sys.path only once: unconditionally appending added a duplicate
    # entry on every call.
    if utils_dir not in sys.path:
        sys.path.append(utils_dir)
    import utils as ut
    compactf = '%s.%s.pyd' % (scoref, dtype)
    # Single pre-formatted argument: same output as the old print statement,
    # and valid on both Python 2 and Python 3.
    print('%s %s' % (compactf, dtype))
    ascores = np.loadtxt(scoref, dtype=dtype)
    ut.savepy(ascores, compactf)
示例#2
0
def multi_clust(tested, score_cutoffs=None, length_cutoffs=None,
        fracs=[.012,.014], frac_retain=.1, ds=[.1,.25,.3,.35], ms=[.1,.15,.2],
        penalties=[.1,1], overlaps=[.55], haircuts=[0,.2], max_pval=1,
        savef=None, runid=None, show_stats=True, pres=None, gold_nspecies=1,
        gold_splits=None, gold_minlen=3, mdprod_min=.01, **kwargs):
    """
    Run cl.filter_clust once for every combination in a parameter grid.

    The grid is the cartesian product of fracs, ds, ms, penalties, overlaps
    and haircuts; combinations whose d*m is below mdprod_min are skipped.

    - tested: sequence handed to ut.list_frac (presumably the scored
      interactions, best first -- confirm against callers).
    - fracs: fractions of `tested` to cluster.  Pass fracs=None to derive them
      from score_cutoffs (cl.n_thresh(tested, s)/len(tested)) or, failing
      that, from length_cutoffs (le/len(tested)).
    - frac_retain: fraction used for the second ut.list_frac argument.
    - ds, ms, penalties, haircuts: grid values passed to filter_clust as
      min_density, negmult, penalty and haircut respectively.
    - overlaps: grid values passed as BOTH merge_cutoff and max_overlap.
    - savef: if set, the clusterings so far are saved whenever
      len(clusts) % 10 == 1, under a name built from ut.date() and runid.
    - runid: id for this run; a random one in [1,1000) is drawn when falsy.
    - show_stats, pres, gold_splits, gold_nspecies, gold_minlen: when
      show_stats is set and a clustering is non-empty, cp.result_stats /
      cp.select_best are called on it (needs pres and gold_splits).
    - kwargs: forwarded to cl.filter_clust.

    Returns (clusts, runid).  Raises ValueError if fracs is None and neither
    score_cutoffs nor length_cutoffs is given.

    The list defaults are only read here, never modified.
    """
    runid = runid or random.randrange(1,1000)
    if fracs is None:
        # float() guarantees true fractions; with plain integer operands
        # Python 2 floor division would turn every fraction into 0.
        n_tested = float(len(tested))
        if score_cutoffs is not None:
            fracs = [cl.n_thresh(tested, s)/n_tested for s in score_cutoffs]
        elif length_cutoffs is not None:
            fracs = [le/n_tested for le in length_cutoffs]
        else:
            raise ValueError("multi_clust needs one of fracs, score_cutoffs "
                    "or length_cutoffs")
    # Single pre-formatted argument: same output as the old print statement,
    # and valid on both Python 2 and Python 3.
    print('random id: %s' % (runid,))
    clusts = []
    for (f,d,m,p,o,h) in it.product(fracs, ds, ms, penalties, overlaps,
            haircuts):
        if d*m >= mdprod_min:
            cxstruct = cl.filter_clust(ut.list_frac(tested, f),
                    ut.list_frac(tested, frac_retain), merge_cutoff=o, negmult=m, min_density=d,
                    runid=runid, penalty=p, max_pval=max_pval, max_overlap=o,
                    haircut=h, **kwargs)
            cxstruct.params = ('density=%s,frac=%s,f_retain=%s,negmult=%s,penalty=%s,max_overlap=%s,haircut=%s' % (d,f,frac_retain,m,p,o,h))
            clusts.append(cxstruct)
            if show_stats and len(cxstruct.cxs)>0:
                if pres is not None and gold_splits is not None:
                    # Result is not needed here; the call is kept for
                    # whatever reporting it performs (presumably prints the
                    # stats -- confirm in cp.select_best).
                    cp.select_best(cp.result_stats(pres.species, gold_splits,
                        clusts[-1:], gold_nspecies, min_gold_size=gold_minlen))
                else:
                    print("Can't show stats: pres and gold_splits required.")
            # Periodic checkpoint of everything clustered so far.
            if savef and (len(clusts) % 10 == 1):
                ut.savepy(clusts, ut.pre_ext(savef, "clusts_temp_%s_%s" % (ut.date(),
                    runid)))
    return clusts, runid
示例#3
0
def precalc_scores(scoref, dtype='f2'):
    """
    Load the score matrix for `scoref` and zero out its diagonal.

    Zeroing the diagonal more efficiently removes all self-interactions
    up-front.

    If the compact file '<scoref>.<dtype>.pyd' exists it is loaded with
    ut.loadpy; otherwise the text file `scoref` is parsed with np.loadtxt
    using `dtype`, and saved as the compact file when the
    'save_compact_corrs' config entry is true.

    - scoref: path of the text score file.
    - dtype: numpy dtype code used for parsing and in the compact file name.

    Returns the score matrix.  The diagonal is zeroed on both paths, so the
    result does not depend on whether the compact file already existed.
    """
    save_compact = ut.config()['save_compact_corrs']
    compactf = '%s.%s.pyd' % (scoref, dtype)
    if os.path.exists(compactf):
        mat = ut.loadpy(compactf)
    else:
        # Honour the dtype argument: the compact file name already encodes it,
        # so parsing with a hard-coded 'f2' could mislabel the saved file.
        mat = np.loadtxt(scoref, dtype=dtype)
        if save_compact:
            # Single pre-formatted argument: same output as the old print
            # statement, and valid on both Python 2 and Python 3.
            print('saving compact %s' % (compactf,))
            # Saved before zeroing, so the file holds the scores as parsed.
            ut.savepy(mat, compactf)
    # Score matrix is expected to be square; skip degenerate (<2-d) loads
    # instead of failing on them.
    if mat.ndim >= 2:
        np.fill_diagonal(mat, 0)
    return mat
示例#4
0
def precalc_scores(scoref, dtype='f2'):
    """
    Return the score matrix for `scoref` with its diagonal set to zero.

    Zeroing the diagonal more efficiently removes all self-interactions
    up-front.

    A cached compact copy named '<scoref>.<dtype>.pyd' is used when present
    (ut.loadpy).  Otherwise `scoref` is parsed as text with np.loadtxt using
    `dtype`, and written to the compact file if the 'save_compact_corrs'
    config entry is true.

    - scoref: path of the text score file.
    - dtype: numpy dtype code used for parsing and in the compact file name.

    The diagonal is zeroed whether or not the cache was used, so repeated
    calls give the same matrix.
    """
    save_compact = ut.config()['save_compact_corrs']
    compactf = '%s.%s.pyd' % (scoref, dtype)
    if os.path.exists(compactf):
        scores = ut.loadpy(compactf)
    else:
        # Parse with the requested dtype rather than a hard-coded 'f2', so the
        # data matches the dtype embedded in the compact file name.
        scores = np.loadtxt(scoref, dtype=dtype)
        if save_compact:
            # Single pre-formatted argument: same output as the old print
            # statement, and valid on both Python 2 and Python 3.
            print('saving compact %s' % (compactf,))
            # Written before zeroing: the cache keeps the scores as parsed.
            ut.savepy(scores, compactf)
    # Expected to be a square matrix; anything below 2-d is returned as is.
    if scores.ndim >= 2:
        np.fill_diagonal(scores, 0)
    return scores
示例#5
0
    # Plot the feature importances of the trees and of the forest
    if do_plot:
        import pylab as pl
        pl.figure()
        pl.title("Feature importances")
        for tree in forest.estimators_:
            pl.plot(indnums, tree.feature_importances_[indices], "r")
        pl.plot(indnums, importances[indices], "b")
        pl.show()
    feats, weights = zip(*ranked)
    return list(feats), list(weights)

def _parse_kwarg_spec(kvs):
    """
    Parse a 'key1_val1-key2_val2' command-line spec into a dict of strings.

    Pairs are separated by '-'.  Each pair is split on its LAST underscore, so
    keys that themselves contain underscores (e.g. 'max_depth_3') are
    accepted.  An empty spec gives an empty dict; a pair with no underscore
    raises ValueError.
    """
    if not kvs:
        return {}
    return dict(kv.rsplit('_', 1) for kv in kvs.split('-'))


if __name__ == '__main__':
    # argv layout: script, train_test, feats_f, clf_type, donorm [, kwargs].
    # Four positional arguments are read unconditionally below, so require
    # all of them instead of failing later with an IndexError.
    if len(sys.argv) < 5:
        sys.exit("usage: python ml.py train_test feats_f clf_type "
                 "donorm [kwarg1_val1-kwarg2_val2]")
    ttf = sys.argv[1]
    tt = np.load(ttf)
    feats = ut.loadpy(sys.argv[2])
    k = sys.argv[3]
    # NOTE(review): do_norm stays a string, so any non-empty value (even
    # 'False' or '0') is truthy if fit_and_test tests it as a flag -- confirm.
    do_norm = sys.argv[4]
    # The kwargs spec is optional.
    kvs = sys.argv[5] if len(sys.argv) > 5 else ''
    kwargs = _parse_kwarg_spec(kvs)
    clf = tree(**kwargs) if k=='tree' else svm(kernel=k, **kwargs)
    # One (label, result) pair per number of leading features kept.
    ts = [('%s features, %s kernel, norm: %s, %s' % (n, k, do_norm, kvs),
           fit_and_test([fe.keep_cols(t, ut.i0(feats[:n])) for t in tt],
                        clf, norm=do_norm))
          for n in (20, 30, 40, 50)]
    ut.savepy(ts, 'ts_%s_%s_%s_%s' % (k, do_norm, kvs, ttf))
示例#6
0
def predict_clust(name, sp, nsp, obs=None, exs=None, savef=None, pres=None,
        pd_spcounts=None, cl_kwargs={}, clusts=None, runid=None,
        count_ext=False, cutoff=0.5, n_cvs=7, accept_clust=False,
        obs_fnames=None, base_splits=None, obs_kwargs={}, kfold=3,
        gold_nspecies=2, do_cluster=True, do_2stage_cluster=True,
        cxs_cxppis=None, do_rescue=True, n_rescue=20000, rescue_fracs=20,
        rescue_score=0.9, clstruct=None, **predict_kwargs):
    """
    Predict interactions for `name`, then optionally cluster them, accept a
    clustering, rescue extra ppis and export the result.

    Stages (a stage is skipped when the caller supplies its output):
    1. Prediction -- only when both clusts and pres are None: obs comes from
       ppi.predict_all (if obs is None), exs from cvstd_via_median (if exs is
       None), pres from predict(); pres is saved with the '_pres' suffix.
    2. Clustering -- when do_cluster is set and neither cxs_cxppis nor
       clstruct is given: multi_clust (unless clusts is given), followed by
       either multi_stage2_clust + cp.result_stats (do_2stage_cluster) or
       cp.result_stats on the first-stage clusts.
    3. Acceptance -- when accept_clust is set: cp.select_best on clstruct (or
       the supplied cxs_cxppis), rescue_ppis if do_rescue, then cyto_export.

    Parameters of note:
    - obs_kwargs/predict_kwargs: note obs_kwargs is combined with
      predict_kwargs (for the cvstd_via_median call) to enforce consistency.
    - pd_spcounts: supply from ppi.predict_all if nsp > 1.
    - base_splits: supply exs.splits to generate examples from existing
      division of complexes.  When given, n_cvs is forced to 1.
    - cxs_cxppis: provide if you want to export, or do the ppi rescue
      clustering--also must set accept_clust=True, do_rescue=True
    - savef: output path, defaulting to ut.bigd(name)+'.pyd'; names of the
      intermediate files are derived from it with ut.pre_ext.
    - runid: id used in saved file names; drawn with
      random.randrange(0,1000) when falsy.
    - cl_kwargs: extra keyword arguments for multi_clust and
      multi_stage2_clust.

    Returns pres when do_cluster is false or accept_clust is true, otherwise
    the tuple (pres, clstruct).
    """
    savef = savef if savef else ut.bigd(name)+'.pyd'
    print "Will save output to", savef
    runid = runid or random.randrange(0,1000)
    # Prediction stage.
    # NOTE(review): when clusts is supplied this whole block is skipped, so
    # pres must be supplied too (it is dereferenced right after) and is not
    # copied before being modified below -- confirm that is intended.
    if clusts is None: 
        if pres is None:
            if obs is None:
                obs, pd_spcounts = ppi.predict_all(sp, obs_fnames,
                        save_fname=savef.replace('.pyd',''), nsp=nsp,
                        **obs_kwargs)
            if exs is None:
                # presumably dict_quick_merge returns a new dict and leaves
                # the (shared default) obs_kwargs untouched -- confirm
                cvtest_kwargs = ut.dict_quick_merge(obs_kwargs, predict_kwargs)
                # With fixed base_splits only a single cross-validation is run.
                n_cvs = 1 if base_splits is not None else n_cvs
                cvs, cvstd = cvstd_via_median(name, sp, nsp, obs_fnames, kfold,
                        base_splits, n_cvs, **cvtest_kwargs)
                if n_cvs > 1:
                    ut.savepy(cvs, ut.pre_ext(savef, '_cvs_%s' % n_cvs))
                ut.savepy(cvstd, ut.pre_ext(savef, '_cvstd'))
                exs=cvstd.exs
            pres = predict(name, sp, obs, exs.arrfeats, nsp, **predict_kwargs)
            pres.exs = exs
            ut.savepy(pres, ut.pre_ext(savef, '_pres'), check_exists=True) 
        else:
            # Work on a copy made by ut.struct_copy (presumably so the
            # caller's pres is not modified -- confirm).
            pres=ut.struct_copy(pres)
            if do_rescue:
                assert obs is not None, "Must supply obs for rescue step"
    merged_splits = pres.exs.splits[1] # splits is (lp_splits, clean_splits)
    if do_cluster:
        # Clustering stage: only when no precomputed result was passed in.
        if cxs_cxppis is None and clstruct is None:
            # NOTE(review): the cxs_cxppis test below repeats the enclosing
            # condition; only the clusts test has an effect here.
            if clusts is None and cxs_cxppis is None:
                #if calc_fracs:
                    #cl_kwargs['fracs'] = [cp.find_inflection(pres.ppis, merged_splits,
                        #pres.species, gold_nspecies)]
                # multi_clust returns the runid it actually used.
                clusts, runid = multi_clust(pres.ppis, savef=savef, runid=runid,
                        pres=pres, gold_splits=merged_splits,
                        gold_nspecies=gold_nspecies, **cl_kwargs)
                ut.savepy(clusts, ut.pre_ext(savef, '_clusts_id%s' % runid))
            if do_2stage_cluster:
                clusts2 = multi_stage2_clust(clusts, pres.ppis, runid=runid,
                        **cl_kwargs)
                clstruct = cp.result_stats(sp, merged_splits, clusts2,
                        gold_nspecies) 
                ut.savepy(clstruct, ut.pre_ext(savef, '_clstruct2_id%s' % runid))
            else:
                # NOTE(review): passes nsp where the two-stage branch passes
                # gold_nspecies -- confirm this difference is intended.
                clstruct = cp.result_stats(sp, merged_splits, clusts, nsp) 
                ut.savepy(clstruct, ut.pre_ext(savef, '_clstruct_id%s' % runid))
        # Acceptance stage: attach the chosen complexes to pres and export.
        if accept_clust:
            if cxs_cxppis is None:
                pres.cxs, pres.cxppis, pres.ind = cp.select_best(clstruct)
                ut.savepy([pres.cxs,pres.cxppis],
                        ut.pre_ext(savef,'_cxs_cxppis_id%s_ind%s_%scxs'
                            % (runid, pres.ind, len(pres.cxs))))
            else:
                # Caller-supplied complexes: no selection index, so use 0.
                pres.cxs, pres.cxppis = cxs_cxppis
                pres.ind = 0
            if do_rescue:
                # note cl_kwargs aren't passed--would be messy
                pres.cxs, pres.cxppis, pres.ppis_rescue = rescue_ppis(pres,
                        obs, n_rescue, cutoff_fracs=rescue_fracs,
                        cutoff_score=rescue_score)
            # NOTE(review): count_ext is hard-coded to False here; the
            # count_ext parameter is not used anywhere in this function.
            cyto_export(pres, merged_splits, name_ext='_clust%s_%scxs' % (pres.ind,
                len(pres.cxs)), pd_spcounts=pd_spcounts, arrdata=obs,
                cutoff=cutoff, count_ext=False, arrdata_ppis=None)
            return pres
        else:
            return pres, clstruct
    else:
        return pres