Example #1
def scatter_blake(a, b, which='circles', classes=[0,1], colors=['k','r'],
        maxval=None, **kwargs):
    if maxval:
        a,b = filter_valpairs(a,b,maxval)
    defaults = {'s': 50, 'alpha':.2, 'lw':0}
    kwargs = ut.dict_set_defaults(kwargs, defaults)
    if isinstance(a[0], (list, tuple)):
        # second value is presumed to be class--should be 0 or 1, which will be
        # mapped to the colormap cmap.
        # Also need to clean a and b to just be values rather than values and
        # classes.
        print 'using classes'
        assert ut.i1(a) == ut.i1(b), "Classes not the same between a and b"
        kwargs['c'] = [colors[0] if x==classes[0] else colors[1] for x in
                ut.i1(a)]
        a,b = ut.i0(a), ut.i0(b)
    if which=='pointcloud':
        scatter(a, b, s=50, alpha=0.08, lw=0)
        scatter(a, b, **kwargs)
    elif which in ('points', 'fadepoints'):
        # 'points' and 'fadepoints' were identical branches; merged here
        scatter(a, b, **kwargs)
    elif which=='circles':
        del kwargs['lw']
        # pop any class colors out of kwargs so they become the edge colors
        # rather than clashing with the edgecolors argument; default black
        c = kwargs.pop('c', 'k')
        scatter(a, b, facecolors='none', edgecolors=c, **kwargs)
    title('R-squared: %0.3f' % ut.r_squared(a,b))
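# ut.i0/ut.i1 are used throughout these examples but never defined here
# (scatter and title are presumably matplotlib's pylab functions, imported
# elsewhere). Every call site is consistent with i0/i1 extracting the first
# and second element of each item in a sequence; a minimal sketch under that
# assumption (the real ut module may differ):
def i0(seq):
    # first element of each item, e.g. the values of (value, class) pairs
    return [item[0] for item in seq]

def i1(seq):
    # second element of each item, e.g. the classes of (value, class) pairs
    return [item[1] for item in seq]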
Example #2
def pds_alloverlaps(named_pds):
    """
    input: [(name, pairdict), ...]
    """
    for num in range(2,len(named_pds)+1):
        for n_pd in it.combinations(named_pds, num):
            print ut.i0(n_pd), pds_overlap(ut.i1(n_pd))
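# A small self-contained illustration of the combination sweep above, with
# plain sets standing in for pairdicts and set intersection standing in for
# pds_overlap (both stand-ins are assumptions for the sketch):
import itertools as it
named = [('a', set([1, 2])), ('b', set([2, 3])), ('c', set([2, 4]))]
for num in range(2, len(named) + 1):
    for group in it.combinations(named, num):
        print [g[0] for g in group], set.intersection(*[g[1] for g in group])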
Example #3
def hpa_stats(ppis, locs, max_set_size=None):
    s = attr_to_sets(locs)
    if max_set_size is not None: 
        s = [c for c in s if len(c) < max_set_size]
    plocs = co.pairs_from_complexes(s)
    ppiprots = set(ut.i0(ppis)+ut.i1(ppis))
    anprots = set(ut.i0(locs))
    intprots = set.intersection(ppiprots, anprots)
    # counts: proteins in the ppis, proteins with location annotations, and
    # the overlap usable for scoring
    print len(ppiprots), len(anprots), len(intprots)
    return ppis_stats(ppis, plocs, intprots)
Example #4
def nonpairs_gen(pairs, n):
    items = list(set(ut.i0(pairs) + ut.i1(pairs)))
    exclude = set(pairs)
    pdexclude = pd.PairDict(exclude)
    count = 0
    while count < n:
        pair = (random.choice(items), random.choice(items))
        if not pdexclude.contains(pair):
            yield pair
            count += 1
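# The same rejection-sampling idea without pd.PairDict, treating (a, b) and
# (b, a) as one pair (assumed PairDict behavior); note that either version
# loops forever if fewer than n distinct non-pairs exist.
import random

def nonpairs_gen_plain(pairs, n):
    items = list(set([p[0] for p in pairs] + [p[1] for p in pairs]))
    exclude = set(frozenset(p) for p in pairs)
    count = 0
    while count < n:
        pair = (random.choice(items), random.choice(items))
        if frozenset(pair) not in exclude:
            yield pair
            count += 1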
Example #5
def load_pepcount(f):
    lol = ut.load_lol(f)
    print "Omitting header:", lol[0]
    lol = lol[1:]
    # after dropping the first header row, lol[0] is treated as the
    # column-label row: sample names start at its column 2, and peptide ids
    # sit in column 0 of the remaining data rows
    peps = ut.i0(lol[1:])
    samples = lol[0][2:]
    arr = np.zeros((len(peps), len(samples)))
    for i,row in enumerate(lol[1:]):
        arr[i,:] = row[2:]
    return peps, samples, arr
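# Hedged sketch of the layout the slicing above implies: one discarded run
# header, then a column-label row, then data rows.
example_lol = [
    ['# run info', '', '', ''],                    # dropped by lol = lol[1:]
    ['peptide', 'protein', 'sampleA', 'sampleB'],  # labels; samples = lol[0][2:]
    ['PEP1', 'P1', '0', '3'],                      # data; peps from column 0
    ['PEP2', 'P2', '2', '5'],
]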
Example #6
def ppis_scatter(ppis1, ppis2, useinds=range(3)):
    """
    useinds: set to [0,1,3,2] to take ppi.learning_examples output into (score,
    t/f) tuples; [0,1,3] to exclude the class.
    """
    pd1,pd2 = [pd.PairDict([[p[i] for i in useinds] for p in ppis]) 
            for ppis in ppis1,ppis2]
    nvals = len(useinds)-2
    pdcomb = pd.pd_union_disjoint_vals(pd1, pd2, adefaults=[0]*nvals,
            bdefaults=[0]*nvals)
    vals = zip(*ut.i1(pdcomb.d.items()))
    v1s,v2s = zip(*vals[:nvals]), zip(*vals[nvals:])
    v1s,v2s = [ut.i0(x) for x in v1s,v2s]
    return v1s,v2s
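# Hedged usage sketch: each ppi is assumed to be an (id1, id2, score, ...)
# tuple, so with useinds=[0,1,3] the pair key comes from columns 0 and 1 and
# the plotted value from column 3; pairs present in only one input get the
# 0 default from pd_union_disjoint_vals.
# v1s, v2s = ppis_scatter(ppis_run1, ppis_run2, useinds=[0,1,3])
# scatter_blake(v1s, v2s, which='points')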
Example #7
def collapse_alternatives(ppis):
    """
    Go through the whole set of interactions and group together nodes whose
    partner sets are identical. (The intended score tolerance--within 20% or
    0.01--is not checked here; only partner identity is compared.) Returns
    the groups of interchangeable nodes rather than a rewritten network.
    """
    nodes = set([p[0] for p in ppis]+[p[1] for p in ppis])
    d_ints = defaultdict(set)
    for node1,node2,score in ppis:
        d_ints[node1].add((node2,score)); d_ints[node2].add((node1,score))
    alt_groups = []
    for node1, ints1 in d_ints.items():
        # first go through the existing collections
        found_group=False
        for group in alt_groups:
            anode1,an1ints = group[0]
            if (len(ints1)==len(an1ints) and
                    set(ut.i0(ints1))==set(ut.i0(an1ints))):
                group.append((node1,ints1))
                found_group=True
                break
        if not found_group:
            alt_groups.append([(node1,ints1)])
    return alt_groups
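# Tiny worked example: B and C interact with exactly the same partner set
# {A}, so they land in one group (group order depends on dict iteration):
# collapse_alternatives([('A', 'B', 0.9), ('A', 'C', 0.88)])
# -> [[('A', set([('B', 0.9), ('C', 0.88)]))],
#     [('B', set([('A', 0.9)])), ('C', set([('A', 0.88)]))]]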
Example #8
def stats(gold, cxs_list, cxppis_list=None, conv_to_sets=True, funcs=None):
    # a None cxppis_list would crash the zip below; pad it with Nones
    if cxppis_list is None:
        cxppis_list = [None] * len(cxs_list)
    if conv_to_sets:
        gold = [set(c) for c in gold]
        cxs_list = [[set(c) for c in cxs] for cxs in cxs_list]
    funcs = funcs or ut.i0(cp_funcs)
    use_funcs = [f for f in cp_funcs if f[0] in funcs]
    print funcs
    arr = np.zeros(len(cxs_list),dtype=','.join(['f8']*len(funcs)))
    arr.dtype.names = funcs
    print '%s total maps.' % len(cxs_list)
    for i, (cxs, cxppis) in enumerate(zip(cxs_list, cxppis_list)):
        #sys.stdout.write(str(i))
        print i
        arr[i] = np.array([f(gold, cxs, cxppis) for f in ut.i1(use_funcs)])
    return arr
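# cp_funcs is not defined in this excerpt; ut.i0(cp_funcs) for names and
# ut.i1(use_funcs) for callables imply a list of (name, function) pairs,
# along these (assumed) lines:
# cp_funcs = [
#     ('sensitivity', lambda gold, cxs, cxppis: ...),
#     ('mmr',         lambda gold, cxs, cxppis: ...),
# ]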
Example #9
def load_paralogs(sp_base, sp_other, ogroup_max, other_ogroup_max):
    """
    Get all pairwise base species paralogs from sharing orthogroups in the
    inParanoid orthology files between the base and other species.
    """
    ogs = orth.load_ogroups(sp_base, sp_other)
    use_ogs = []
    if ogroup_max:
        for og_base, og_other in ogs:
            if (len(og_base) <= ogroup_max 
                    and (other_ogroup_max is None
                        or len(og_other) <= other_ogroup_max)):
                use_ogs.append(og_base)
    else:
        use_ogs = ut.i0(ogs)
    base_paralogs = pu.groups_to_pairs(use_ogs)
    return base_paralogs
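# Hedged sketch of what pu.groups_to_pairs presumably does: expand each
# orthogroup into all within-group pairs.
import itertools as it

def groups_to_pairs_sketch(groups):
    return [pair for g in groups for pair in it.combinations(g, 2)]

# groups_to_pairs_sketch([['g1', 'g2', 'g3']])
# -> [('g1', 'g2'), ('g1', 'g3'), ('g2', 'g3')]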
Example #10
def merge_atonce(psets, cutoff, func, sep):
    """
    Once difference seems to be that say a series of overlapping doubles can't
    string together with this approach, whereas it can with the iter approach.
    """
    to_merge = list(psets)
    merged = []
    while len(to_merge)>1:
        c1,c1ps = random.choice(to_merge)
        to_merge.remove((c1,c1ps))
        matches = [(c,ps) for c,ps in to_merge if func(ps,c1ps)>cutoff]
        for m in matches: to_merge.remove(m)
        newname = sep.join([c1]+ut.i0(matches))
        newps = reduce(set.union, [c1ps]+ut.i1(matches))
        merged.append((newname,newps))
    # everything may have merged away; guard against an empty remainder
    if to_merge:
        merged.append(to_merge[0])
    return merged
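# Illustration of the docstring's point, with Jaccard overlap standing in
# for func (an assumption for the sketch):
# psets = [('a', set([1, 2])), ('b', set([2, 3])), ('c', set([3, 4]))]
# jaccard = lambda s1, s2: len(s1 & s2) / float(len(s1 | s2))
# With cutoff 0.2 and 'a' drawn first, 'b' merges in (overlap 1/3), but the
# merged set {1,2,3} goes straight to the output and is never re-tested
# against 'c', even though jaccard({1,2,3}, {3,4}) = 0.25 passes the cutoff.
# An iterative variant that re-queues merged sets would chain all three.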
Example #11
def merge(proj_dir, dirnames, pq_new_path):
    """
    Combine the pepquant quantitation from each numbered project directory's
    PQ_FILE (project_1, etc.) into a single file at pq_new_path.
    """
    if not os.path.exists(proj_dir):
        os.mkdir(proj_dir)
    proj_name = ut.shortname(proj_dir)
    assert not os.path.exists(pq_new_path), "%s exists. Exiting." % pq_new_path
    dirnames = ut.i0(sort_numbered(dirnames))
    #print "Sorted dirnames:", dirnames
    pq_files = [os.path.join(d,PQ_FILE) for d in dirnames]
    for f in pq_files:
        if not os.path.exists(f):
            print "No Elution File:", f
    eluts = (el.load_elution(f) for f in pq_files if os.path.exists(f))
    merged = reduce(el.combine_elutions, eluts)
    el.write_elution(merged, pq_new_path)
Example #12
def select_best(clstruct,
        scorenames=['sensitivity','mmr','aupr','cliqueness_3_20','nonov_iter',
            'n_proteins','n_complexes_3_20'],
        rfunc=operator.add, use_norm=False, dispn=15, score_factors=None,
        use_ranks=True, output_ranks=False, print_ranks=False,
        require_scores=None):
    cxstructs, stats = clstruct.cxstructs, clstruct.stats
    clusts = [cxstr.cxs for cxstr in cxstructs]
    scorenames = scorenames or list(stats.dtype.names)
    stats = stats[scorenames]
    ranks = rank_columns(stats)
    if use_ranks:
        stats = ranks
    else:
        if use_norm: stats = norm_columns(stats)
        if score_factors: stats = rescale_columns(stats, score_factors)
    inds = np.argsort(reduce(rfunc, [stats[n] for n in scorenames]))[::-1]
    if require_scores is not None:
        for req_name,thresh in require_scores:
            thresh = (np.median(clstruct.stats[req_name]) if thresh is None
                    else thresh)
            inds = [i for i in inds if clstruct.stats[req_name][i] > thresh]
    nstats = len(stats)
    def filt_params(s):
        return " ".join([p[:2]+p.split('=')[1] for p in s.split(',')])
    show_columns = (scorenames if require_scores is None else
            scorenames+ut.i0(require_scores))
    d = DataFrame(clstruct.stats[inds[:dispn]][show_columns],
            index=["#%s: %sc %si %s" %
                (i,len(clusts[i]),len(cxstructs[i].cxppis),
                    filt_params(cxstructs[i].params)) for i in inds[:dispn]])
    print d.head(dispn)
    for i in inds[:dispn]:
        #print (i, ["%0.4f " % s for s in clstruct.stats[i]], len(clusts[i]),
        #        len(cxstructs[i].cxppis), cxstructs[i].params)
        if print_ranks:
            print i, [nstats-s for s in ranks[i]]
    if output_ranks:
        return inds
    else:
        return clusts[inds[0]], cxstructs[inds[0]].cxppis, inds[0]
Example #13
def set_from_pairs(ppis):
    return set(ut.i0(ppis) + ut.i1(ppis))
Example #14
    # Plot the feature importances of the trees and of the forest
    if do_plot:
        import pylab as pl
        pl.figure()
        pl.title("Feature importances")
        for tree in forest.estimators_:
            pl.plot(indnums, tree.feature_importances_[indices], "r")
        pl.plot(indnums, importances[indices], "b")
        pl.show()
    feats, weights = zip(*ranked)
    return list(feats), list(weights)

if __name__ == '__main__':
    if len(sys.argv) < 6:
        sys.exit("usage: python ml.py train_test feats_f clf_type "
                "donorm kwarg1_val1-kwarg2_val2")
    ttf = sys.argv[1]
    tt = np.load(ttf)
    feats = ut.loadpy(sys.argv[2])
    k = sys.argv[3]
    do_norm = sys.argv[4]
    kvs = sys.argv[5]
    kwargs = dict([tuple(kv.split('_')) for kv in kvs.split('-')]) \
        if kvs else {}
    clf = tree(**kwargs) if k=='tree' else svm(kernel=k, **kwargs)
    ts = [('%s features, %s kernel, norm: %s, %s' % (n, k, do_norm, kvs),
        fit_and_test([fe.keep_cols(t, ut.i0(feats[:n])) for t in tt], 
                        clf, norm=do_norm)) 
        for n in 20,30,40,50]
    ut.savepy(ts, 'ts_%s_%s_%s_%s' %(k,do_norm,kvs,ttf))
Example #15
def items_from_pairs(pairs):
    return set(ut.i0(pairs)+ut.i1(pairs))
Example #16
def random_pairs(pairs, npairs):
    ps = list(set(ut.i0(pairs) + ut.i1(pairs)))
    # oversample by 1.5x so enough pairs survive deduplication
    rpairs = [(random.choice(ps), random.choice(ps)) for i in
            range(int(npairs*1.5))]
    return pu.dedupe(rpairs)[:npairs]
Example #17
def unique_items(pairs):
    return set(ut.i0(pairs) + ut.i1(pairs))