Example 1
def scatter_blake(a, b, which='circles', classes=[0,1], colors=['k','r'],
        maxval=None, **kwargs):
    if maxval:
        a,b = filter_valpairs(a,b,maxval)
    defaults = {'s': 50, 'alpha':.2, 'lw':0}
    kwargs = ut.dict_set_defaults(kwargs, defaults)
    if type(a[0]) in (list, tuple):
        # The second value is presumed to be the class--0 or 1--and is mapped
        # to the corresponding entry in colors. Also clean a and b to be just
        # values rather than (value, class) pairs.
        print 'using classes'
        assert ut.i1(a) == ut.i1(b), "Classes not the same between a and b"
        kwargs['c'] = [colors[0] if x==classes[0] else colors[1] for x in
                ut.i1(a)]
        a,b = ut.i0(a), ut.i0(b)
    if which=='pointcloud':
        # Faint background cloud, with the styled points drawn on top.
        scatter(a, b, s=50, alpha=0.08, lw=0)
        scatter(a, b, **kwargs)
    elif which in ('points', 'fadepoints'):
        scatter(a, b, **kwargs)
    elif which=='circles':
        # Open circles; class colors (default black) go on the edges.
        del kwargs['lw']
        c = kwargs.pop('c', 'k')
        scatter(a, b, facecolors='none', edgecolors=c, **kwargs)
    title('R-squared: %0.3f' % ut.r_squared(a,b))
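The class-coloring trick above doesn't depend on the ut helpers; a minimal standalone sketch with made-up data, assuming only matplotlib:

# Standalone sketch of the (value, class) -> color mapping; data is made up.
import matplotlib.pyplot as plt

a = [(0.1, 0), (0.5, 1), (0.9, 0)]
b = [(0.2, 0), (0.6, 1), (0.8, 0)]
colors = ['k' if cls == 0 else 'r' for _, cls in a]
xs, ys = [v for v, _ in a], [v for v, _ in b]
plt.scatter(xs, ys, c=colors, s=50, alpha=0.2, lw=0)
plt.show()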
Example 2
def result_gold(splits, species, split_inds, make_unmerged=False,
        consv_sp='Dm'):
    if make_unmerged: 
        print "Converting to unmerged using conserved:", (consv_sp if consv_sp
                else "None")
        ppi_corum,_,_ = ppi.load_training_complexes(species,'',consv_sp)
        splits = unmerged_splits_from_merged_splits(ut.i1(ppi_corum),
                [[ut.i1(s) for s in split] for split in splits])
    gold = ut.i1(reduce(operator.add, [splits[i] for i in split_inds]))
    return gold
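The gold set is built by concatenating the chosen splits before taking second elements; reduce(operator.add, ...) is plain list concatenation:

# How the selected splits are concatenated into one list.
import operator
print reduce(operator.add, [[1, 2], [3], [4, 5]]) # [1, 2, 3, 4, 5]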
Example 3
def unmerged_splits_from_merged_splits(unmerged, merged_splits):
    """
    Unmerged: list of enumerated/named complex sets, usually from
    ppi.load_training_complexes.
    Merged_splits: A list of sets for each split (often 3 splits).
    """
    merged_splits = [ut.i1(ms) for ms in merged_splits]
    unmerged = ut.i1(unmerged)
    usplits = [[] for ms in merged_splits] + [[]] # extra for non-matches
    for u in unmerged:
        # best_match and sensitivity (a set-overlap scoring function) are
        # assumed to be defined at module level.
        matches = [best_match(u, ms, sensitivity) for ms in merged_splits]
        which_split = np.argmax(matches) if max(matches) > 0 else -1
        usplits[which_split].append(u)
    return usplits
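A toy run with hypothetical stand-ins for best_match and sensitivity (the real scoring functions live elsewhere in the module) makes the argmax assignment concrete:

import numpy as np

def sensitivity(u, m):
    # Hypothetical stand-in: fraction of the unmerged set covered by m.
    return len(u & m) / float(len(u))

def best_match(u, ms, score):
    # Hypothetical stand-in: best score of u against any set in the split.
    return max([score(u, m) for m in ms]) if ms else 0

merged_splits = [[set([1, 2, 3])], [set([7, 8])]]
u = set([1, 2])
matches = [best_match(u, ms, sensitivity) for ms in merged_splits]
print np.argmax(matches) if max(matches) > 0 else -1 # 0: the first split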
Example 4
def triple_venn(three_ppis, names=['a','b','c']):
    # Can send output to venn3(sizes, names) for plotting
    ppis_names = zip(three_ppis, names)
    full_sizes = [len(p) for p in three_ppis]
    print zip(names, full_sizes)
    trip = ints_overlap(three_ppis)
    print names, trip
    intersects_2 = []
    for (a,namea),(b,nameb) in it.combinations(ppis_names,2):
        intersect = ints_overlap([a,b])
        print namea, nameb, "--", intersect
        intersect_2 = intersect - trip
        print namea, nameb, "-only-", intersect_2
        intersects_2.append((set([namea,nameb]), intersect_2))
    only_singles = []
    for i,(a,namea) in enumerate(ppis_names):
        #print namea, len(a), trip, intersects_2 #debug
        only_single = (len(a) - trip - sum([x[1] for x in intersects_2 if namea in x[0]]))
        print namea, "only:", only_single
        only_singles.append(only_single)
    # Pack into matplotlib_venn's venn3 subset order:
    # (Abc, aBc, ABc, abC, AbC, aBC, ABC)
    set_sizes = [0] * 7
    set_sizes[6] = trip
    set_sizes[:2], set_sizes[3] = only_singles[:2], only_singles[2]
    set_sizes[2], set_sizes[4], set_sizes[5] = ut.i1(intersects_2)
    return set_sizes, names
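As the leading comment notes, the packed sizes feed matplotlib_venn directly; a usage sketch, with three hypothetical interaction lists:

from matplotlib_venn import venn3
import matplotlib.pyplot as plt

set_sizes, names = triple_venn([ppis_a, ppis_b, ppis_c]) # hypothetical inputs
venn3(subsets=set_sizes, set_labels=names)
plt.show()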
Example 5
def pds_alloverlaps(named_pds):
    """
    input: [(name, pairdict), ...]
    """
    for num in range(2,len(named_pds)+1):
        for n_pd in it.combinations(named_pds, num):
            print ut.i0(n_pd), pds_overlap(ut.i1(n_pd))
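it.combinations drives the enumeration: for three named pairdicts it yields every pair, then the full triple:

import itertools as it
names = ['a', 'b', 'c']
for num in range(2, len(names)+1):
    print list(it.combinations(names, num))
# [('a', 'b'), ('a', 'c'), ('b', 'c')]
# [('a', 'b', 'c')]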
Example 6
def triple_venn_consv():
    hints = co.load_havug_ints()
    ppi_cxs, clean_cxs, corconsv = ppi.load_training_complexes("Hs", "Dm")
    cints = co.pairs_from_complexes(ut.i1(ppi_cxs)) # exclude huge ones
    ints23 = ut.loadpy(ut.bigd("../23_collapsenodes/Hs_filtorth025_withsc_2sp_refilt2sp_cxs_cxppis_clust27_532cxs"))[1]
    # h2d, presumably a human-to-fly ortholog mapping, is assumed to be
    # defined at module level.
    ints3 = [cp.consv_pairs(i, h2d) for i in ints23, hints, cints]
    cp.triple_venn(ints3, ["map23", "havug", "corum"])
Example 7
def ppis_gold_standard(ppis, cxs_splits, species):
    pdppis = pd.PairDict([p[:3] for p in ppis])
    print len(pdppis.d), "predicted interactions"
    ppi_cxs,_,all_cxs = ppi.load_training_complexes(species, None,'') #conv doesn't matter
    pdcorum = pd.PairDict([(i[0],i[1],'gold') for i in
                        co.pairs_from_complexes(ut.i1(all_cxs))])
    print len(pdcorum.d), "total gold standard"
    pdcomb = pd.pd_union_disjoint_vals(pdppis, pdcorum)
    unmr_splits = cp.unmerged_splits_from_merged_splits(ppi_cxs,cxs_splits)
    print "unmerged split assignment lengths", [len(s) for s in unmr_splits]
    pdtrainpos = pd.PairDict([(t[0],t[1]) for t in
        co.pairs_from_complexes(unmr_splits[0])])
    print len(pdtrainpos.d), "total train interactions"
    counterrs = 0
    for tpair in pdtrainpos.d:
        cpair = pdcomb.find(tpair)
        #assert cpair is not None, "Gold standard problem--filter_methods changed since run?"
        if cpair is None or pdcomb.d[cpair][1] != 'gold':
            #print 'error: train should be subset', tpair
            counterrs += 1
        else:
            pdcomb.d[cpair][1] = 'train'
    if counterrs: print "number of training not found in gold std:", counterrs
    comblist = [list(k)+list(v) for k,v in pdcomb.d.items()]
    print len([1 for p in comblist if p[2] and p[3]=='gold']), \
            "ppis in gold not train"
    print len([1 for p in comblist if p[2] and p[3]=='train']), "ppis in train"
    # only return those that are predictions
    return [p for p in comblist if p[2]]
Example 8
def prot_counts_pep2prots(peplist, only_uniques, pep2prots):
    """
    - pep2prots: use supplied dict of {peptide: set(proteinids)} instead of the
      protein ids on the lines in the pep_list file, in which case sum up the
      counts for a peptide-spectral combination. 
    """
    assert only_uniques, "Only handles only_uniques=True so far."
    exclude_peps = (set([pep for pep,prots in pep2prots.items() if len(prots)>1])
            if only_uniques else set([]))
    print "%s non-unique peptides to exclude." % len(exclude_peps)
    pep_samp_counts = defaultdict(float)
    for _,sample,pep,count in peplist:
        if pep not in exclude_peps:
            pep_samp_counts[(pep,sample)] += float(count)
    # Currently ignoring peptides without a mapping.
    prots = sorted(list(reduce(set.union, [pep2prots[pep] for (pep,_),_ in
        pep_samp_counts.items() if pep in pep2prots])))
    samples = sorted(list(set(ut.i1(peplist))))
    print "%s unique proteins. %s samples." % (len(prots), len(samples))
    dprots, dsamples = [ut.list_inv_to_dict(lst) for lst in prots, samples]
    counts = np.zeros((len(prots), len(samples)), dtype='float32')
    for (pep,sample),count in pep_samp_counts.items():
        if pep in pep2prots: # Currently ignoring peptides without a mapping.
            assert len(pep2prots[pep])==1, "Non-unique peptide found"
            counts[dprots[list(pep2prots[pep])[0]], dsamples[sample]] += count
    totals = counts.sum(axis=1)
    nonzero = totals > 0
    prots, totals = [list(np.array(lst)[nonzero]) for lst in prots, totals]
    counts = counts[nonzero,:]
    return prots, samples, counts, totals
Example 9
def hist_prot_counts(fs, in_prots_sets=[None], **kwargs):
    """
    Usually set linewidth=3, histtype='step', range=(0,18), bins=36
    """
    all_pcounts = prot_counts(fs).items()
    for prots in in_prots_sets:
        # Filter from the full list each time so successive subsets don't
        # compound.
        pcounts = ([(p,c) for p,c in all_pcounts if p in prots] if prots
                else all_pcounts)
        hist(np.log2(ut.i1(pcounts)), **kwargs)
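A usage sketch following the docstring's suggested styling; fs and my_prots are hypothetical placeholders:

# Overlay the full distribution and one protein subset as step histograms.
hist_prot_counts(fs, in_prots_sets=[None, my_prots],
        linewidth=3, histtype='step', range=(0,18), bins=36)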
Example 10
def attr_to_sets(atts):
    """
    Given a list of items with attributes, return a list of item sets.
    Input: [(item1, attr1), (item2, attr2), (item3, attr1), ...]
    Output: [{item1,item3}, {item2}, ...]
    """
    datts = defaultdict(set)
    for i,a in atts:
        datts[a].add(i)
    return ut.i1(datts.items())
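A toy run matching the docstring; ut.i1(d.items()) is just list(d.values()), so the order of the returned sets is unspecified:

pairs = [('item1', 'attr1'), ('item2', 'attr2'), ('item3', 'attr1')]
print attr_to_sets(pairs)
# e.g. [set(['item1', 'item3']), set(['item2'])]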
Example 11
def arrfeats_prep_all_data(arrfeats, ppis, sp="Hs", gold_consv="Dm", cutoff=0.5):
    print "Adding species summary."
    arrfeats = fe.arr_add_spsummary(arrfeats, cutoff)
    print "Adding ppis."
    arrfeats = fe.arrfeats_add_ppis(arrfeats, ppis)
    _, _, all_cxs = ppi.load_training_complexes(sp, None, gold_consv)
    pdgold = pd.PairDict(co.pairs_from_complexes(ut.i1(all_cxs)))
    print "Setting trues."
    arrfeats = fe.arrfeats_set_gold(arrfeats, pdgold)
    return arrfeats
Example 12
def nonpairs_gen(pairs, n):
    items = list(set(ut.i0(pairs) + ut.i1(pairs)))
    pdexclude = pd.PairDict(set(pairs))
    count = 0
    while count < n:
        # Rejection sampling: draw random pairs, skip any known pair.
        pair = (random.choice(items), random.choice(items))
        if not pdexclude.contains(pair):
            yield pair
            count += 1
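Being a generator, it is usually drained into a list; a usage sketch with made-up pairs, assuming the module's pd.PairDict:

pairs = [('a', 'b'), ('b', 'c')]
negatives = list(nonpairs_gen(pairs, 2))
print negatives # e.g. [('a', 'c'), ('c', 'a')]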
Example 13
def hpa_stats(ppis, locs, max_set_size=None):
    s = attr_to_sets(locs)
    if max_set_size is not None: 
        s = [c for c in s if len(c) < max_set_size]
    plocs = co.pairs_from_complexes(s)
    ppiprots = set(ut.i0(ppis)+ut.i1(ppis))
    anprots = set(ut.i0(locs))
    intprots = set.intersection(ppiprots, anprots)
    print len(ppiprots), len(anprots), len(intprots)
    return ppis_stats(ppis, plocs, intprots)
Example 14
def ppis_scatter(ppis1, ppis2, useinds=range(3)):
    """
    useinds: set to [0,1,3,2] to take ppi.learning_examples output into (score,
    t/f) tuples; [0,1,3] to exclude the class.
    """
    pd1,pd2 = [pd.PairDict([[p[i] for i in useinds] for p in ppis]) 
            for ppis in ppis1,ppis2]
    nvals = len(useinds)-2
    pdcomb = pd.pd_union_disjoint_vals(pd1, pd2, adefaults=[0]*nvals,
            bdefaults=[0]*nvals)
    vals = zip(*ut.i1(pdcomb.d.items()))
    v1s,v2s = zip(*vals[:nvals]), zip(*vals[nvals:])
    v1s,v2s = [ut.i0(x) for x in v1s,v2s]
    return v1s,v2s
Example 15
def stats(gold, cxs_list, cxppis_list=None, conv_to_sets=True, funcs=None):
    cxppis_list = cxppis_list or [None]*len(cxs_list)
    if conv_to_sets:
        gold = [set(c) for c in gold]
        cxs_list = [[set(c) for c in cxs] for cxs in cxs_list]
    funcs = funcs or ut.i0(cp_funcs)
    use_funcs = [f for f in cp_funcs if f[0] in funcs]
    print funcs
    arr = np.zeros(len(cxs_list),dtype=','.join(['f8']*len(funcs)))
    arr.dtype.names = funcs
    print '%s total maps.' % len(cxs_list)
    for i, (cxs, cxppis) in enumerate(zip(cxs_list, cxppis_list)):
        #sys.stdout.write(str(i))
        print i
        arr[i] = np.array([f(gold, cxs, cxppis) for f in ut.i1(use_funcs)])
    return arr
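The per-map results accumulate in a NumPy structured array with one named float field per metric; a minimal illustration of that layout:

import numpy as np
arr = np.zeros(2, dtype=','.join(['f8']*2))
arr.dtype.names = ('sens', 'ppv') # one field per metric name
arr[0] = (0.5, 0.9)
print arr['sens'] # [ 0.5  0. ]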
Example 16
def merge_atonce(psets, cutoff, func, sep):
    """
    One difference seems to be that a series of overlapping doubles can't
    string together with this approach, whereas it can with the iter approach.
    """
    to_merge = list(psets)
    merged = []
    while len(to_merge)>1:
        c1,c1ps = random.choice(to_merge)
        to_merge.remove((c1,c1ps))
        matches = [(c,ps) for c,ps in to_merge if func(ps,c1ps)>cutoff]
        for m in matches: to_merge.remove(m)
        newname = sep.join([c1]+ut.i0(matches))
        newps = reduce(set.union, [c1ps]+ut.i1(matches))
        merged.append((newname,newps))
    if to_merge: # the last round may have merged everything away
        merged.append(to_merge[0])
    return merged
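A toy run in the module's context, with a hypothetical Jaccard overlap standing in for func; output order varies because of random.choice:

def jaccard(s1, s2):
    return len(s1 & s2) / float(len(s1 | s2))

psets = [('a', set([1, 2])), ('b', set([2, 3])), ('c', set([7, 8]))]
print merge_atonce(psets, 0.2, jaccard, '-')
# e.g. [('a-b', set([1, 2, 3])), ('c', set([7, 8]))]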
Example 17
def load_kegg_sequentials(fname, do_convert=True):
    dkegg = load_kegg_brite(fname)
    kegg_paths = [ut.i1(v) for v in dkegg.values() if v]
    def path_pairs(list_path):
        return [(list_path[i],list_path[i+1]) for i in range(len(list_path)-1)]
    group_pairs = ut.flatten([path_pairs(lpath) for lpath in kegg_paths])
    single_pairs = [(xi,yi) for x,y in group_pairs for xi in x for yi in y]
    unique_pairs = pu.dedupe(single_pairs)
    print "%s total, %s single, %s unique pairs returned" % (
            len(group_pairs), len(single_pairs), len(unique_pairs)) 
    if do_convert:
        conv_dict = ut.dict_inverse_sets(orth.convert_dict('Hs','Hs_entrez'))
        conv_pairs = convert_pairs_singles(unique_pairs, conv_dict)
        print "%s converted pairs with 1-1 matches" % len(conv_pairs)
        return conv_pairs
    else:
        return unique_pairs
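The group-to-single expansion is the core step: each (group_x, group_y) edge becomes the cross product of the groups' members:

group_pairs = [(('x1', 'x2'), ('y1',))]
single_pairs = [(xi, yi) for x, y in group_pairs for xi in x for yi in y]
print single_pairs # [('x1', 'y1'), ('x2', 'y1')]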
Example 18
def load_havug_cxppis():
    hints = load_havug_ppis()
    hcxs = load_havug_cxs()
    hints = cl._filter_ints(hints, ut.i1(hcxs))
    return hints
Example 19
def random_pairs(pairs, npairs):
    ps = list(set(ut.i0(pairs) + ut.i1(pairs)))
    # Oversample by 1.5x since dedupe shrinks the list; may still return
    # fewer than npairs for small item sets.
    rpairs = [(random.choice(ps), random.choice(ps)) for i in
            range(int(npairs*1.5))]
    return pu.dedupe(rpairs)[:npairs]
Example 20
def set_from_pairs(ppis):
    return set(ut.i0(ppis) + ut.i1(ppis))
Example 21
def unique_items(pairs):
    return set(ut.i0(pairs) + ut.i1(pairs))
Example 22
def items_from_pairs(pairs):
    return set(ut.i0(pairs)+ut.i1(pairs))
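These three near-identical helpers all rely on ut.i0/ut.i1, which, judging from usage throughout, take the first and second element of each pair; a plain-Python equivalent:

# Assumes ut.i0/ut.i1 extract columns 0 and 1 of each tuple.
def items_from_pairs_plain(pairs):
    return set([x for pair in pairs for x in pair[:2]])

print items_from_pairs_plain([('a', 'b'), ('b', 'c')]) # set(['a', 'c', 'b'])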