Пример #1
0
def family_exemplar_structs(rfid,
                            refseq_method = None,
                            sp_method = None,
                            aff_type = None):
    """Draw PCA embeddings of a family's structures, highlighting exemplars.

    NOTE(review): as written this function cannot run to completion. It
    raises a bare ``Exception`` right after the first drawing call, it reads
    ``spairs`` and ``ungapped_ref`` which are never assigned in this body,
    and it reads ``sp`` before the loop below assigns it. This looks like a
    leftover of an interactive debugging session -- confirm before use.

    Parameters:
        rfid: family identifier passed to ``rutils.family_suboptimals``.
        refseq_method: not used in this body.
        sp_method: not used in this body.
        aff_type: forwarded to ``rutils.affinity_matrix``.

    Returns:
        ``(pca_vecs, exemplar_structs)``. Currently unreachable, and
        ``exemplar_structs`` is only assigned in a commented-out line.
    """

    suboptimals = rutils.family_suboptimals(rfid)
    # `spairs` and `ungapped_ref` are not defined in this function.
    c2 = rutils.cluster_2(spairs, ungapped_ref)

    arr = rutils.rna_draw(ungapped_ref.seq, 
                          rutils.pairs_stk(sp,len(ungapped_ref)),
                          'name' )

    # Unconditional stop: nothing below this line executes.
    raise Exception()
    # Two affinity matrices: one with the caller's aff_type, one fixed to
    # 'easy' with ss_multiplier = .5.
    affinities, ss = rutils.affinity_matrix(spairs, aff_type = aff_type)
    aff_shape, ss_shape = rutils.affinity_matrix(spairs, aff_type = 'easy', ss_multiplier = .5)
    

    # Project each affinity matrix onto its own principal components.
    pca_vecs = mlab.PCA(affinities).project(affinities)  
    pca_vecs_shape = mlab.PCA(aff_shape).project(aff_shape)  
    # `inds` holds one exemplar index per structure; the distinct values are
    # the exemplars themselves.
    inds = compute_clusters(aff_shape, ss_shape)
    exemplars = list(set(inds))
    
    import compbio.utils.colors as mycolors
    # One colour per exemplar.
    ct = mycolors.getct(len(exemplars))
    import matplotlib.pyplot as plt
    f = plt.gcf()
    plt.clf()
    
    # One subplot per embedding (rows 1 and 2 of a 2x1 grid).
    for idx0, embeddings in enumerate([pca_vecs, pca_vecs_shape]):
            ax = f.add_subplot('21{0}'.format(idx0 +1))

            # Axis limits: the data range of the first two components,
            # widened on each side by half of that range.
            lims =[ [min(embeddings[:,0]),max(embeddings[:,0])],
                         [min(embeddings[:,1]),max(embeddings[:,1])] ]
            lims += [-.5,.5] *squeeze(diff(lims,1))[:,newaxis]
            

            ax.set_xlim(lims[0])
            ax.set_ylim(lims[1])
    
            print sum(embeddings)
            for idx, embedding in enumerate(embeddings):
              # mod(idx, 1) is always 0, so no structure is skipped here.
              if mod(idx,1) != 0: continue
              sp = spairs[idx]
              arr = rutils.rna_draw(ungapped_ref.seq, 
                              rutils.pairs_stk(sp,len(ungapped_ref)),
                              'name' )
              # Drawing coordinates shifted to the structure's 2-D embedding
              # (only used by the commented-out plot call below).
              struct_emb = arr + embedding[0:2]
              #plt.plot(*struct_emb.T)
              
              # Exemplars are drawn thick and opaque, all other structures
              # thin and faint; the colour is that of the structure's exemplar.
              pkw = {'color':ct[exemplars.index(inds[idx])],
                     'lw':8 if idx in inds else 1,
                     'alpha': 1 if idx in inds else .2}
              
              lc = rplots.show_rna(embedding, arr, pkw = pkw)
    #exemplar_structs = [spairs[e] for e in set(inds)]  
    # Second unconditional stop; the return below is never reached.
    raise Exception()

    return pca_vecs, exemplar_structs
Пример #2
0
def main():
    """Compare two corpora in PCA space and report a separating threshold.

    Builds a feature matrix for each corpus, stacks them (sonnets first),
    runs ``mlab.PCA`` on the stacked data, plots the first two projected
    coordinates and prints the share of each corpus lying on "its" side of
    a hand-picked threshold along the first coordinate.
    """
    sonets = open_corpus('corpus1.txt')
    anna = open_corpus('corpus2.txt')

    sonets_data = make_features(sonets)
    anna_data = make_features(anna)

    p = mlab.PCA(np.vstack((sonets_data, anna_data)), True)
    # Rows [0, N) of the projection are sonnets; the remaining rows are
    # Anna Karenina.
    N = len(sonets_data)
    print(p.Wt)

    sonets_xy = p.Y[:N]
    anna_xy = p.Y[N:]
    # Green circles: Anna Karenina; blue squares: sonnets.
    plt.plot(anna_xy[:, 0], anna_xy[:, 1], 'og',
             sonets_xy[:, 0], sonets_xy[:, 1], 'sb')

    # Assignment question: is there a linear combination of features (i.e.
    # the value along the first axis of the PCA-transformed data) and a
    # threshold such that more than 70% of the texts of each genre lie on
    # one side of the threshold? Write a program genre-by-pos.py that
    # demonstrates the answer.

    # Judging by the picture, the answer appears to be yes.
    print(
        'Линейная комбинация и пороговое значение, при которых больше 70% текстов каждого жанра находятся с одной стороны от порогового значения, существуют.'
    )
    # plt.savefig('result.png')
    plt.show()

    # Pick a threshold by eye from the plot such that more than 70% of the
    # Anna Karenina sentences lie to its right and more than 70% of the
    # sonnet sentences lie to its left.
    threshold = -4.2
    anna_axis = anna_xy[:, 0]
    sonets_axis = sonets_xy[:, 0]

    print('Пороговое значение: -4.2')
    anna_share = sum(anna_axis > threshold) / len(anna_axis) * 100
    print(
        anna_share,
        '- процент предложений "Анны Карениной", которые лежат справа от порога'
    )
    sonets_share = sum(sonets_axis < threshold) / len(sonets_axis) * 100
    print(
        sonets_share,
        '- процент предложений сонетов, которые лежат слева от порога')
Пример #3
0
def get_test_data():
    f = File(
        'C:/Users/sommerc/data/Chromatin-Microtubles/Analysis/H2b_aTub_MD20x_exp911_2_channels_nozip/dump_save/two_positions.hdf5'
    )
    pos = f[f.positions[0]]
    print f.positions[0]
    events = pos.get_objects('event')
    feature_matrix = []
    labels = []
    for e in events:
        item_features = e.item_features
        item_labels = e.item_labels
        if item_features is not None:
            feature_matrix.append(item_features)
            labels.append(item_labels)
    feature_matrix = numpy.concatenate(feature_matrix)
    feature_matrix = remove_constant_columns(feature_matrix)
    feature_matrix = stats.zscore(feature_matrix)
    pca = mlab.PCA(feature_matrix)
    feature_matrix = pca.project(feature_matrix)
    feature_matrix = feature_matrix.reshape(len(events), len(item_features),
                                            -1)
    f.close()
    labels2 = numpy.asarray(labels)
    labels2[labels2 == 7] = 1
    labels2 += 1
    return feature_matrix, labels2
Пример #4
0
    def preprocess(self):
        """Preprocess data for further analysis.

        Steps:
        1) remove columns that contain a zero
        2) remove columns with NaNs (via ``self._filter_nans``)
        3) z-score the data
        4) run PCA and keep the leading principal components

        Returns:
            (data_pca, nodes): the PCA projection of the z-scored data
            restricted to the leading components, and the ``nodes``
            returned by ``self._filter_nans``.

        Raises:
            EventSelectionError: if the data has no more rows than columns,
            or if the cumulative variance fraction never exceeds
            ``self.varfrac``.
        """

        data, nodes = self.data_matrix()

        if data.shape[0] <= data.shape[1]:
            # Build ONE message string. The previous code separated the
            # pieces with commas, which made `msg` a tuple of strings.
            msg = ("Not enough objects in data set to proceed. "
                   "Number of objects is not larger than the number of "
                   "features (%d <= %d)" % tuple(data.shape))
            raise EventSelectionError(msg)

        # delete every column that contains at least one zero
        ind = np.where(data == 0)[1]
        data = np.delete(data, ind, 1)

        # remove columns with nans
        data, nodes = self._filter_nans(data, nodes)

        data_zs = stats.zscore(data)
        pca = mlab.PCA(data_zs)
        # Index of the first component at which the cumulative variance
        # fraction exceeds varfrac.
        above = np.nonzero(np.cumsum(pca.fracs) > self.varfrac)[0]
        if len(above) == 0:
            # Previously this surfaced as a bare IndexError.
            raise EventSelectionError(
                "Cumulative PCA variance never exceeds varfrac=%s"
                % (self.varfrac,))
        num_features = above[0]
        # NOTE(review): slicing with 0:num_features excludes the component
        # that crosses the threshold, so the kept components explain at
        # most `varfrac` (and none are kept if the first one already
        # exceeds it). Kept as-is; confirm whether `+ 1` was intended.
        data_pca = pca.project(data_zs)[:, 0:num_features]
        return data_pca, nodes
Пример #5
0
def plotKMedoid(K, X):
    """Cluster ``X`` with K-medoids and plot the result in PCA space.

    The data and the medoids are projected with ``mlab.PCA`` (keeping the
    components whose variance fraction is at least that of the second
    component). Medoids are drawn as filled circles and every sample is
    drawn as its row index, coloured by its cluster id.

    The colour palette is cycled, so any ``K`` works. The previous code
    passed a fixed 4-colour list to ``scatter`` and indexed it directly,
    which only worked for ``K == 4``.

    Parameters:
        K: number of clusters, forwarded to ``trainKMedoid`` and
            ``assignKMedoids``.
        X: data matrix, forwarded to the clustering helpers and to
            ``mlab.PCA``.
    """
    # Used demo from https://stackoverflow.com/questions/9847026/plotting-output-of-kmeanspycluster-impl
    # cluster
    kMedoids = trainKMedoid(K, X)
    clustersIds = assignKMedoids(K, X, kMedoids)

    # reduce dimensionality
    iris_pca = mlab.PCA(X)
    cutoff = iris_pca.fracs[1]
    iris_2d = iris_pca.project(X, minfrac=cutoff)
    medoids_2d = iris_pca.project(list(kMedoids.values()), minfrac=cutoff)

    # make a plot
    palette = ['red', 'green', 'blue', 'yellow']
    n_colors = len(palette)
    plt.figure()
    plt.xlim([iris_2d[:, 0].min() - .5, iris_2d[:, 0].max() + .5])
    plt.ylim([iris_2d[:, 1].min() - .5, iris_2d[:, 1].max() + .5])
    plt.xticks([], [])
    plt.yticks([], [])  # numbers aren't meaningful

    # show the medoids, one colour per medoid (identical to the old list
    # when there are exactly four of them)
    medoid_colors = [palette[i % n_colors] for i in range(len(medoids_2d))]
    plt.scatter(medoids_2d[:, 0],
                medoids_2d[:, 1],
                marker='o',
                c=medoid_colors,
                s=100)

    # show sample numbers, colored by their cluster id
    for i, ((x, y), kls) in enumerate(zip(iris_2d, clustersIds.values())):
        plt.annotate(str(i),
                     xy=(x, y),
                     xytext=(0, 0),
                     textcoords='offset points',
                     color=palette[kls % n_colors])
Пример #6
0
 def pca(self, feature, var_lim):
     """Project ``feature`` with PCA and keep the leading components.

     Components are added, in the order given by ``fracs``, until their
     cumulative variance fraction exceeds ``var_lim``.

     The previous code computed this count but then always returned the
     first five columns, and it indexed past the end of ``fracs`` when
     ``var_lim`` was never exceeded.

     Parameters:
         feature: data matrix passed to ``mlab.PCA``.
         var_lim: variance fraction the kept components must exceed.

     Returns:
         The first ``k`` columns of the PCA projection ``Y``.
     """
     results = mlab.PCA(feature)
     n_components = 0
     explained = 0
     # Bounded by len(fracs) so var_lim >= 1 keeps every component instead
     # of raising IndexError.
     while explained <= var_lim and n_components < len(results.fracs):
         explained += results.fracs[n_components]
         n_components += 1
     return results.Y[:, 0:n_components]
Пример #7
0
def surf_segmentation(points, config):
    global ELAPSE_SEG
    config.slice_count = min(int(len(points) / config.origin_points),
                             config.slice_count)
    assert len(points) / config.slice_count >= config.origin_points
    surfs = []
    npoints = point_normalize(points)
    # cov = np.cov(npoints)
    pca_md = mlab.PCA(np.copy(npoints))
    projection0 = pca_md.Y[:, 0]
    step_count = len(projection0) / config.slice_count
    pointsets = [np.array([]).reshape(0, 3)] * config.slice_count
    starttime = time.clock()

    # projection0_index = np.hstack((projection0, np.arange(len(projection0))))
    sorted_projection0_index = np.argsort(projection0)
    # sorted_projection0 = projection0[sorted_projection0_index]
    current_slot_count, ptsetid = 0, 0
    # for (index, value) in zip(sorted_projection0_index, sorted_projection0):
    for index in sorted_projection0_index:
        pointsets[ptsetid] = np.vstack((pointsets[ptsetid], npoints[index]))
        current_slot_count += 1
        if current_slot_count > step_count:
            current_slot_count = 0
            ptsetid += 1

    partial_surfs = []
    for ptset in pointsets:
        print "before segment", len(partial_surfs), len(ptset)
        if len(ptset) > 0:
            partial_surfs, _ = identifysurf(np.copy(ptset),
                                            AdaSurfConfig({
                                                'origin_points':
                                                config.origin_points,
                                                'most_combination_points':
                                                config.most_combination_points,
                                                'same_threshold':
                                                config.same_threshold,
                                                'filter_rate':
                                                config.filter_rate,
                                                'ori_adarate':
                                                config.ori_adarate
                                            }),
                                            donorm=False,
                                            surfs=partial_surfs)
        print "after segment", len(partial_surfs)
    surfs.extend(partial_surfs)

    return surfs, npoints
Пример #8
0
def get_pca_variance(df, dates, loopback=30):
    """
    computes the variance of each dimension per date (with 30 days loopback)
    """
    result = {}
    for day in dates:
        try:
            end_day = day + BDay(loopback - 1)
            sd = mlab.PCA(df.ix[day:end_day]).sigma
            variance = [x**2 for x in sd]
            result[end_day] = Series(variance, index=df.columns)
            print '%s done' % day
        except:
            print 'error in PCA computation of %s' % day
    return DataFrame.from_dict(result, orient='index')
Пример #9
0
def runMUSIC():
    """Main loop of the PCA adapter: decay the state, fit a PCA once, and
    push projections and spikes to the display.

    Runs until ``runtime.time()`` reaches ``stoptime``, calling
    ``runtime.tick()`` once per iteration. All inputs and outputs are
    module-level globals; ``tau`` and ``PROJ_HIST_LENGTH`` are also read
    from module scope although they are not listed in the ``global``
    statement.

    Every 50th iteration:
      * while ``runtime.time() < PCA_HIST_LENGTH``, the current state is
        appended to ``state_hist``;
      * once ``runtime.time() > PCA_HIST_LENGTH``, the state is projected
        with the PCA fitted on ``state_hist['states']`` and the first
        three coordinates are appended to ``proj_hist``;
      * the spike history is trimmed and ``d.on_running`` is called.
    """
    global runtime, stoptime, timestep, state, state_hist, d, PCA_HIST_LENGTH, spikes, SPIKE_HIST_LENGTH, proj_hist
    print "running PCA adapter"
    t = 0 
    pca_created = False
    
    while runtime.time() < stoptime:
        
        # Fit the PCA exactly once, the first time the history window has
        # elapsed; it is never refitted afterwards.
        if runtime.time() > PCA_HIST_LENGTH and not pca_created:
            pca = mlab.PCA(state_hist['states'])
            pca_created = True

        # Exponential decay of the state with time constant tau.
        state = state * np.exp(-timestep/ tau)
        if t % 50 == 0:
            if runtime.time() < PCA_HIST_LENGTH:
                state_hist['states'] = np.append(state_hist['states'], [state], axis = 0)
                state_hist['times'] = np.append(state_hist['times'], [runtime.time()], axis = 0)

                # Keep only samples newer than (latest time - PCA_HIST_LENGTH).
                state_hist_mask = np.where(state_hist['times'] > max(state_hist['times']) - PCA_HIST_LENGTH)
                state_hist['times'] = state_hist['times'][state_hist_mask]
                state_hist['states'] = state_hist['states'][state_hist_mask]

            #print "states", state_hist['states']
            

            # Same condition as the PCA creation above, so `pca` is bound
            # here provided runtime.time() does not change within one
            # iteration -- presumably it only advances in tick(); confirm.
            if runtime.time() > PCA_HIST_LENGTH:
                projection = pca.project(state)
                #print "proj", len(projection)
                # Only the first three projected coordinates are kept.
                projection = projection[:3]

                proj_hist['projs'] = np.append(proj_hist['projs'], [projection], axis = 0)
                proj_hist['times'] = np.append(proj_hist['times'], [runtime.time()], axis = 0)

                # Keep only projections newer than (latest - PROJ_HIST_LENGTH).
                proj_hist_mask = np.where(proj_hist['times'] > max(proj_hist['times']) - PROJ_HIST_LENGTH)
                proj_hist['times'] = proj_hist['times'][proj_hist_mask]
                proj_hist['projs'] = proj_hist['projs'][proj_hist_mask]


            # Trim the spike history to the last SPIKE_HIST_LENGTH time units.
            # NOTE(review): max() raises ValueError on an empty sequence --
            # confirm spikes['times'] is never empty here.
            spike_hist_mask = np.where(spikes['times'] > max(spikes['times']) - SPIKE_HIST_LENGTH)
            spikes['times'] = spikes['times'][spike_hist_mask]
            spikes['senders'] = spikes['senders'][spike_hist_mask]

            d.on_running(spikes['times'], spikes['senders'], proj_hist['projs'])
        #print spikes
        #print t, runtime.time()
        runtime.tick()
        t += 1 
Пример #10
0
def surf_segmentation(points, config):
    """Split the points into equal-width slices along the first principal
    axis and identify surfaces slice by slice.

    The normalized points are binned into ``config.slice_count`` slices of
    equal width over the range of their first PCA coordinate.
    ``identifysurf`` is then run on every non-empty slice, each call
    receiving the surfaces found so far. The time spent binning is added
    to the global ``ELAPSE_SEG``.

    Returns:
        (surfs, npoints): the surfaces returned by the last
        ``identifysurf`` call (as a list) and the normalized points.
    """
    global ELAPSE_SEG
    assert len(points) / config.slice_count >= config.origin_points
    npoints = point_normalize(points)
    # cov = np.cov(npoints)
    pca_md = mlab.PCA(npoints)
    # Coordinate of every point along the first principal axis.
    projection0 = pca_md.Y[:, 0]
    projection0min, projection0max = np.min(projection0), np.max(projection0)
    slice_step = (projection0max - projection0min) / config.slice_count
    # The list repeats one empty (0, 3) array; that is safe because each
    # entry is replaced by the new array np.vstack returns, never mutated.
    pointsets = [np.array([]).reshape(0,3)] * config.slice_count
    surfs = []
    starttime = time.clock()
    for row_id in xrange(len(projection0)): 
        # The maximum would otherwise compute index == slice_count, so it
        # is placed in the last slice explicitly.
        if projection0[row_id] == projection0max:
            ptsetid = config.slice_count - 1
        else:
            ptsetid = int((projection0[row_id]-projection0min) / slice_step)
        pointsets[ptsetid] = np.vstack((pointsets[ptsetid], npoints[row_id]))
    ELAPSE_SEG += time.clock() - starttime
    partial_surfs = []
    for ptset in pointsets:
        print "before segment", len(partial_surfs)
        if len(ptset) > 0:
            # Each call receives the surfaces found so far and returns the
            # updated collection.
            partial_surfs, _ = identifysurf(ptset, AdaSurfConfig(
                {'origin_points': config.origin_points, 'most_combination_points': config.most_combination_points, 'same_threshold': config.same_threshold}), donorm = False, surfs = partial_surfs)
            # # Note: we cannot simply extend here; surfs and partial_surfs should be de-duplicated first
            # if len(partial_surfs) > 0:
            #     surfs.extend(partial_surfs)
        print "after segment", len(partial_surfs)
    surfs.extend(partial_surfs)
    # print np.std(pca_md.Y[:, 0]),np.std(pca_md.Y[:, 1]),np.std(pca_md.Y[:, 2])
    # print pca_md.Y[:, 0]
    # print projection0.shape, npoints.shape
    # print np.linalg.norm(pca_md.Wt[0])
    # fig = pl.figure()
    # ax = fig.add_subplot(111, projection='3d')
    # ax.scatter(npoints[:, 0], npoints[:, 1], npoints[:, 2], c='r')
    # x = np.linspace(0, pca_md.Wt[0, 0], 300)
    # y = np.linspace(0, pca_md.Wt[0, 1], 300)
    # z = np.linspace(0, pca_md.Wt[0, 2], 300)
    # ax.plot(x, y, z, c='k')
    # x = np.linspace(0, pca_md.Wt[1, 0], 300)
    # y = np.linspace(0, pca_md.Wt[1, 1], 300)
    # z = np.linspace(0, pca_md.Wt[1, 2], 300)
    # ax.plot(x, y, z, c='g')
    # pl.show()
    return surfs, npoints
Пример #11
0
    def _run_pca(self):
        self.feature_matrix = []
        self.item_colors = []
        for well_key in self.data_provider.positions:
            for pos_key in self.data_provider.positions[well_key]:
                position = self.data_provider.get_position(well_key, pos_key)
                events = position.get_events()
                for t in events:
                    item_features = t.item_features
                    if item_features is not None:
                        self.feature_matrix.append(item_features)

                    item_colors = t.item_colors
                    if item_colors is not None:
                        self.item_colors.extend(item_colors)

        print 'number ofevents', len(self.feature_matrix)
        self.feature_matrix = numpy.concatenate(self.feature_matrix)

        nan_index = ~numpy.isnan(self.feature_matrix).any(1)
        self.feature_matrix = self.feature_matrix[nan_index, :]
        self.item_colors = numpy.asarray(self.item_colors)[nan_index]
        print self.feature_matrix.shape, self.item_colors.shape

        temp_pca = mlab.PCA(self.feature_matrix)
        result = temp_pca.project(self.feature_matrix)[:, :4]

        for cnt, (i, j) in enumerate([(1, 2), (1, 3), (2, 3), (1, 4)]):
            self.axes = self.fig.add_subplot(221 + cnt)

            means = kmeans(result[:, [i - 1, j - 1]], 7)[0]

            self.axes.scatter(result[:, i - 1],
                              result[:, j - 1],
                              c=self.item_colors)
            self.axes.plot(means[:, 0],
                           means[:, 1],
                           'or',
                           markeredgecolor='r',
                           markerfacecolor='None',
                           markersize=12,
                           markeredgewidth=3)
            self.axes.set_xlabel('Principle component %d' % i)
            self.axes.set_ylabel('Principle component %d' % j)
            self.axes.set_title('Events in PCA Subspace %d' % (cnt + 1))
Пример #12
0
 def evaluate(self):
     """
     Run a PCA across nodes for every state-variable and mode of the
     time_series.

     The time-series data is indexed as [time, state-variable, node, mode].
     For each (state-variable, mode) pair the [time, node] slice is passed
     to mlab.PCA; its variance fractions and weight matrix are collected
     and returned as a PrincipalComponents object.

     Raises an Exception when there are fewer time points than nodes.
     """
     cls_attr_name = self.__class__.__name__+".time_series"
     self.time_series.trait["data"].log_debug(owner = cls_attr_name)

     ts_shape = self.time_series.data.shape
     n_tpts = ts_shape[0]
     n_nodes = ts_shape[2]

     # Need more measurements than variables
     if n_tpts < n_nodes:
         msg = "PCA requires a longer timeseries (tpts > number of nodes)."
         LOG.error(msg)
         raise Exception(msg)

     n_vars = ts_shape[1]
     n_modes = ts_shape[3]

     # (nodes, nodes, state-variables, modes)
     weights_shape = (n_nodes, n_nodes, n_vars, n_modes)
     LOG.info("weights shape will be: %s" % str(weights_shape))

     # (nodes, state-variables, modes)
     fractions_shape = (n_nodes, n_vars, n_modes)
     LOG.info("fractions shape will be: %s" % str(fractions_shape))

     weights = numpy.zeros(weights_shape)
     fractions = numpy.zeros(fractions_shape)

     # One PCA per state-variable and mode, over the [time, node] slice.
     for mode in range(n_modes):
         for var in range(n_vars):
             decomposition = mlab.PCA(self.time_series.data[:, var, :, mode])
             fractions[:, var, mode ] = decomposition.fracs
             weights[:, :, var, mode] = decomposition.Wt

     util.log_debug_array(LOG, fractions, "fractions")
     util.log_debug_array(LOG, weights, "weights")

     return mode_decompositions.PrincipalComponents(
         source = self.time_series,
         fractions = fractions,
         weights = weights,
         use_storage = False)
Пример #13
0
def main():
    """Compare three corpora with PCA and save the resulting plots.

    Builds a feature matrix for each corpus, stacks them (sonnets, then
    Anna Karenina, then news) and runs ``mlab.PCA`` on the stacked data.
    Saves ``PCA-result.png`` with the first two projected coordinates, and
    one ``<feature>_vs_<feature>.png`` scatter plot of the raw data for
    every pair among the three selected features.

    Marker legend in every plot: green circles -- first corpus, blue
    squares -- second corpus, red crosses -- third corpus.
    """
    print('Читаю тексты...')
    sonets = Text('corpus1.txt')
    anna = Text('corpus2.txt')
    news = Text('corpus3.txt')

    print('Считаю характеристики...')
    sonets_data = sonets.make_features()
    anna_data = anna.make_features()
    news_data = news.make_features()

    print('Использую метод главных компонент...')
    data = np.vstack((sonets_data, anna_data, news_data))
    p = mlab.PCA(data, True)
    # Row layout of `data` and `p.Y`: [0, N1) sonnets, [N1, N1 + N2) Anna
    # Karenina, [N1 + N2, end) news.
    N1 = len(sonets_data)
    N2 = len(anna_data)

    print('Значимые признаки:')
    # Triples (feature name, p.s value, column index); the three with the
    # largest absolute p.s value are kept.
    main_features = sorted(zip(Text.features, p.s, range(12)),
                           key=lambda pair: -abs(pair[1]))[:3]
    print('\r\n'.join('   ' + par[0] + ' - ' + str(par[1])
                      for par in main_features))

    print('Рисую графики...')
    plt.figure()
    plt.plot(p.Y[:N1, 0], p.Y[:N1, 1], 'og', p.Y[N1:N2 + N1, 0],
             p.Y[N1:N2 + N1, 1], 'sb', p.Y[N2 + N1:, 0], p.Y[N2 + N1:,
                                                             1], 'xr')
    plt.savefig('PCA-result.png')
    plt.close()

    for i, j in itertools.combinations(main_features, 2):
        x_col, y_col = i[2], j[2]
        plt.figure()
        # The third series previously plotted column j against itself;
        # like the other two it must be x = column i, y = column j.
        plt.plot(data[:N1, x_col], data[:N1, y_col], 'og',
                 data[N1:N2 + N1, x_col], data[N1:N2 + N1, y_col], 'sb',
                 data[N2 + N1:, x_col], data[N2 + N1:, y_col], 'xr')
        plt.savefig('%s_vs_%s.png' %
                    (Text.features[x_col], Text.features[y_col]))
        plt.close()

    print('Ура! Конец!')
Пример #14
0
def train(d, attrCount):
    """Estimate per-class mean vectors and covariance matrices in PCA space.

    Samples are read with ``read()``; each sample is indexed as
    ``(label, image)``. The image is centred on its own mean and
    flattened. For every label the first samples are stacked into a
    training matrix until that label's counter reaches 100; later samples
    of that label go to the test list.

    For every label's matrix a PCA (``standardize=False``) is computed,
    the first ``attrCount`` projected rows are kept, their covariance is
    shown with ``show`` and stored, and their row means are stored.

    Parameters:
        d: NOTE(review): overwritten by ``read()`` before use, so the
            caller's value is ignored -- confirm whether that is intended.
        attrCount: number of leading PCA rows to keep per class.

    Returns:
        (u, sigma, test): per-class lists of means, per-class covariance
        matrices, and the held-out samples.
    """
    # Ten label slots (labels are used as indices 0..9).
    matrix = [0] * 10
    count = [0] * 10
    test = []
    u = []
    sigma = []
    d = read()
    for img in d:
        label = img[0]

        # Subtract mean
        m = np.subtract(img[1], np.mean(img[1]))

        # Compare by value. `is 0` tested object identity, which only
        # works by accident of CPython's small-int caching.
        if count[label] == 0:
            matrix[label] = np.reshape(m, -1).T
            count[label] = 1

        # NOTE(review): this is `if`, not `elif`, so the first sample of
        # each label is stacked twice. Kept as-is; confirm intent.
        if count[label] < 100:
            count[label] += 1
            centered = np.reshape(m, -1).T
            matrix[label] = np.vstack((matrix[label], centered))

        else:
            test.append(img)

    for num in matrix:
        # NOTE(review): a label with no samples leaves the int 0 here and
        # `num.T` then fails -- confirm every label always has data.
        pca = (mlab.PCA(num.T, standardize=False)).Y.T
        pca = pca[0:attrCount]

        show(np.cov(pca.T))

        sigma.append(np.cov(pca.T))
        u.append([np.mean(pca[i]) for i in range(attrCount)])

    return u, sigma, test
Пример #15
0
def test_colinear_pca():
    """PCA of the built-in colinear data set must put no variance and no
    projected signal beyond the first two components."""
    colinear = mlab.PCA._get_colinear()
    model = mlab.PCA(colinear)

    trailing_fracs = model.fracs[2:]
    trailing_projection = model.Y[:, 2:]
    assert np.allclose(trailing_fracs, 0.)
    assert np.allclose(trailing_projection, 0.)
        c.append(y)
    c.append(len(i.split()))
    c2data.append(c)

# Combine, sentence by sentence, the two feature lists of each corpus.
anna_data = []
sonet_data = []

for t, extra in enumerate(dataa):
    anna_data.append(c1data[t] + extra)

for t, extra in enumerate(datas):
    sonet_data.append(c2data[t] + extra)

# Stack both corpora (Anna Karenina first) and run PCA on the result.
data = np.vstack((anna_data, sonet_data))
N = len(anna_data)
p = mlab.PCA(data, True)
# Weights of the first principal component, one per feature column.
a = p.Wt[0]

# Map every weight of the first component to the name of its feature.
_FEATURE_NAMES = (
    'A',
    'S',
    'V',
    'ADV',
    'SPRO',
    'len_in_letters',
    'len_in_diff_letters',
    'len_in_vowels',
    'median_length_of_words',
    'mean_length_of_words',
    'median_vowels_in_words',
    'length_of_sent_in_words',
)
di = {}
for _idx, _name in enumerate(_FEATURE_NAMES):
    di[a[_idx]] = _name
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 15 17:47:42 2011

@author: Sat Kumar Tomer
@website: www.ambhas.com
@email: [email protected]
"""

import matplotlib.mlab as ml
import numpy as np

# Parameters of a zero-mean, 3-dimensional normal distribution with
# correlated components.
mean = [0, 0, 0]
cov = [[1, 0.2, 0.5], [0.2, 1, 0.8], [0.5, 0.8, 1]]
print(np.array(cov))

# Draw 100 samples (one per row) from that distribution.
data = np.random.multivariate_normal(mean, cov, 100)

# Fit matplotlib's PCA to the samples.
foo = ml.PCA(data)
Пример #18
0
def get_seq_groups(rfid = 'RF00167', reset = True, tree = True,
        draw_distances = draw_all_easy,
        draw_clusters = draw_all_easy,
        draw_single_cluster = draw_all_hard):
    '''
Cluster the sequences of an Rfam family and return them grouped by cluster.

1) Fetch the alignment, tree and infos of the family with rfam.get_fam.

2) Optionally (draw_distances) plot tree-based against Levenshtein
   sequence distances, annotated with the R-squared of their linear
   regression, and save the figure.

3) Get a distance matrix through mem.getOrSet(setDistances, ...) and cut
   it into clusters with maxclust_dists(k = 5, method = 'complete').

4) Optionally (draw_clusters) scatter the sequences in the first two
   principal components of the distance matrix, coloured by cluster, and
   save the figure.

   kwds:
     rfid                 Rfam family id.
     reset                Forwarded to mem.getOrSet.
     tree                 NOTE(review): overwritten by the tree returned
                          from rfam.get_fam before it is ever read, so
                          the caller's value has no effect.
     draw_distances       Draw the distance comparison figure.
     draw_clusters        Draw the cluster figure.
     draw_single_cluster  Not used in this body.

   returns:
     A list with one entry per cluster; each entry is the list of
     alignment records (ali[idx]) belonging to that cluster.

NOTE(review): relies on Python 2 semantics (dict.values() returning an
indexable list).
'''
    rutils = utils

    # From here on `tree` is the family tree, not the keyword argument.
    ali, tree, infos = rfam.get_fam(rfid)
    n = len(ali)

    if draw_distances:
        # Compare tree-based and Levenshtein pairwise distances.
        dists_t = seq_dists(ali,rfid, tree = True)
        dists_l = seq_dists(ali,rfid, tree = False)
        dtf = dists_t.flatten()
        dlf = dists_l.flatten()
        lin = linregress(dtf, dlf)
        # lin[2] is the correlation coefficient returned by linregress.
        rsquared = lin[2]**2

        f = myplots.fignum(5, (7,7))
        ax = f.add_subplot(111)
        ax.annotate('Levenshtein distance vs. BioNJ branch lengths',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('R-Squared: {0}'.format(rsquared),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('BIONJ Tree ML Distance')
        ax.set_ylabel('Levenshtein Distance')

        ax.scatter(dtf, dlf, 100)
        
        datafile = cfg.dataPath('figs/gpm2/pt2_lev_tree_dists.tiff')
        f.savefig(datafile)
        
    dists = mem.getOrSet(setDistances, ali = ali, tree = tree, run_id = rfid,
                         register = rfid, 
                         on_fail = 'compute',
                         reset = reset)
    
    clusters = maxclust_dists(dists, k = 5, method = 'complete')
    # Presumably shifts 1-based cluster labels to 0-based so they can
    # index the colour table below -- confirm against maxclust_dists.
    clusters -= 1

    if draw_clusters:

        # One colour per distinct cluster label.
        ct = mycolors.getct(len(set(clusters)))
        colors = [ct[elt] for elt in clusters]
        pca_vecs = mlab.PCA(dists).project(dists) 
        
        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Rfam sequence clusters in first 2 PC of sequence space.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        ax.annotate('Number of Clusters: {0}'.format(len(ct)),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 1')
        ax.set_ylabel('PC 2')

        ax.scatter(pca_vecs[:,0],pca_vecs[:,1], 20, color = colors)
        
        datafile = cfg.dataPath('figs/gpm2/pt2_all_seqs_clustered.ps')
        f.savefig(datafile)        

    #now take the largest cluster and do the analysis.
    
    # Group (sequence index, cluster label) pairs by label; groupby needs
    # its input sorted by the same key.
    cgrps = dict([ (k, list(g)) 
              for k , g  in it.groupby(\
                sorted( list(enumerate(clusters)),key = lambda x: x[1]),
                key = lambda x: x[1])])
    # Position (within cgrps.values()) of the largest cluster, and the
    # sequence indices belonging to it.
    cbig = argmax([len(x) for x in cgrps.values()])
    cluster_seqs = [ elt[0] for elt in cgrps.values()[cbig] ] 
    csize = len(cluster_seqs)
    seqs =[ali[c] for c in cluster_seqs]

    
    
    # Disabled diagnostic plot; this branch never runs.
    if 0:

        ct = mycolors.getct(2)
        pca_vecs = mlab.PCA(dists).project(dists) 
        colors =[ct[1] if elt in cluster_seqs else ct[0] for elt in range(len(pca_vecs))] 
        
        f = myplots.fignum(5, (8,8))
        ax = f.add_subplot(111)
        ax.annotate('Inter and intra cluster distances vs. PC0 component for chosen cluster.',
                    [0,1], xycoords = 'axes fraction', va = 'top',
                    xytext = [10,-10],textcoords = 'offset pixels')
        # NOTE(review): the format string has only one placeholder, so the
        # second argument (n - csize) is never shown.
        ax.annotate('Number of cluster sequences: {0}, Number of total sequences'.format(csize, n  - csize),
                    [1,0], xycoords = 'axes fraction', ha = 'right',
                    xytext = [-10,10],textcoords = 'offset pixels')
        ax.set_xlabel('PC 0')
        ax.set_ylabel('Distance')


        for s in cluster_seqs:
            ax.scatter(pca_vecs[:,0],dists[s,:] ,200 *exp(-(dists[s,:] / .5) **2),  color = colors, alpha = .2)
        
        datafile = cfg.dataPath('figs/gpm2/pt2_focused_cluster_dists.ps')
        f.savefig(datafile)        
        
    # Sequence indices per cluster, then the alignment records themselves.
    clusters_final  = [ [ elt[0] for elt in cgrps.values()[i] ] for i in range(len(cgrps.values()))]
    seqs_final = [ [ ali[idx] for idx in clust ] for clust in clusters_final]
    return seqs_final
Пример #19
0
    def run(self):
        """
        Task run method.

        Computes the principal component decomposition of the input images and
        populates the output eigenimages and projection matrix.

        Side effects visible in this method: creates the directory
        ``<stem>_eigen`` under ``self.baseDir()``, writes one plot and one
        raw image per eigenimage, adds the statistics, projection,
        covariance and eigenimage BDPs as outputs, and stores the summary
        entry ``self._summary["pca"]``.

        Parameters
        ----------
        None
        
        Returns
        -------
        None
        """
	self._summary = {}

	# BDP output uses the alias name if provided, else a flow-unique one.
	stem = self._alias
	if not stem: stem = "pca%d" % (self.id())

	inum = 0
	data = []
	icols = []
        for ibdp in self._bdp_in:
	  # Convert input CASA images to numpy arrays.
	  istem = ibdp.getimagefile(bt.CASA)
	  ifile = ibdp.baseDir() + istem
	  # Column label: the image file name without its extension, or its
	  # directory part when it has one.
	  icols.append(os.path.splitext(istem)[0])
	  if os.path.dirname(icols[-1]):
	    icols[-1] = os.path.dirname(icols[-1])  # Typical line cube case.
	  img = admit.casautil.getdata(ifile, zeromask=True).data
	  data.append(img)
	  admit.logging.info("%s shape=%s min=%g max=%g" % 
              (icols[-1], str(img.shape), np.amin(img), np.amax(img)))
	  # Every input must be 2-D and have the shape of the first input.
	  assert len(data[0].shape) == 2, "Only 2-D input images supported"
	  assert data[0].shape == data[inum].shape, "Input shapes must match"
	  inum += 1

	# At least two inputs required for meaningful PCA!
	assert inum >= 2, "At least two input images required"

	# Each 2-D input image is a plane in a single multi-color image.
	# Each color multiplet (one per pixel) is an observation.
	# For PCA we collate the input images into a vector of observations.
	shape = data[0].shape
	npix = shape[0] * shape[1]
	clip = self.getkey('clipvals')
	# No clip values given: clip every input at 0.
	if not clip: clip = [0 for i in range(inum)]
	assert len(clip) >= inum, "Too few clipvals provided"

	# Clip input values and stack into a vector of observations.
	# `nd` is the same object as data[i], so the clipping also changes the
	# arrays that the reconstruction cross-check below compares against.
	pca_data = []
        for i in range(inum):
          nd = data[i]
	  nd[nd < clip[i]] = 0.0
	  pca_data.append(np.reshape(nd, (npix,1)))
	# One row per pixel, one column per input image.
	pca_in = np.hstack(pca_data)
	pca = mlab.PCA(pca_in)

	# Input statistics and output variance fractions.
	#print "fracs:", pca.fracs
	#print "mean:", pca.mu
	#print "sdev:", pca.sigma
	obdp = admit.Table_BDP(stem+"_stats")
	obdp.table.setData(np.vstack([pca.mu, pca.sigma,pca.fracs]).T)
	obdp.table.columns = ["Input mean", "Input deviation",
			      "Eigenimage variance fraction"]
	obdp.table.description = "PCA Image Statistics"
	self.addoutput(obdp)

	# Pre-format columns for summary output.
	# This is required when mixing strings and numbers in a table.
	# (NumPy will output the array as all strings.)
	table1 = admit.Table()
	table1.setData(np.vstack([[i for i in range(inum)],
                                  icols,
	                          ["%.3e" % x for x in pca.mu],
	                          ["%.3e" % x for x in pca.sigma],
	                          ["%s_eigen/%d.im" % (stem, i) 
                                      for i in range(inum)],
				  ["%.4f" % x for x in pca.fracs]]).T)
	table1.columns = ["Index", "Input", "Input mean",
			  "Input deviation",
                          "Eigenimage",
			  "Eigenimage variance fraction"]
	table1.description = "PCA Image Statistics"

	# Projection matrix (eigenvectors).
	#print "projection:", pca.Wt
	obdp = admit.Table_BDP(stem + "_proj")
	obdp.table.setData(pca.Wt)
	obdp.table.columns = icols
	obdp.table.description = \
	    "PCA Projection Matrix (normalized input to output)"
	self.addoutput(obdp)

	# Covariance matrix.
	# Computed from pca.a with columns as variables (rowvar=0) and
	# normalization by N (bias=1).
	covar = np.cov(pca.a, rowvar=0, bias=1)
	#print "covariance:", covar
	obdp = admit.Table_BDP(stem + "_covar")
	obdp.table.setData(covar)
	obdp.table.columns = icols
	obdp.table.description = "PCA Covariance Matrix"
	self.addoutput(obdp)

	# Collate projected observations into eigenimages and save output.
	# NOTE(review): os.mkdir raises if the directory already exists --
	# confirm that a re-run cleans it up first.
	os.mkdir(self.baseDir()+stem+"_eigen")
	# One column of pca.Y per eigenimage.
	pca_out = np.hsplit(pca.Y, inum)
	odata = []
        for i in range(inum):
	  ofile = "%s_eigen/%d" % (stem, i)
	  # Back from a per-pixel column to the 2-D image shape.
	  img = np.reshape(pca_out[i], shape)
          odata.append(img)
	  #print ofile, "shape, min, max:", img.shape, np.amin(img), np.amax(img)

	  # Every eigenimage plot uses the same figure number (inum).
	  aplot = admit.util.APlot(figno=inum, abspath=self.baseDir(),
                                   ptype=admit.util.PlotControl.PNG)
	  aplot.map1(np.rot90(img), title=ofile, figname=ofile)
	  aplot.final()

	  # NOTE(review): an earlier comment here said the eigenimages are
	  # stored as PNG files only, but the call below also writes
	  # "<ofile>.im"; it passes `ifile`, which at this point is the last
	  # input image from the first loop.
	  admit.casautil.putdata_raw(self.baseDir()+ofile+".im", img, ifile)
	  oimg = admit.Image()
	  oimg.addimage(admit.imagedescriptor(ofile+".im",  format=bt.CASA))
	  oimg.addimage(admit.imagedescriptor(ofile+".png", format=bt.PNG))
          obdp = admit.Image_BDP(ofile)
	  obdp.addimage(oimg)
	  self.addoutput(obdp)

	# As a cross-check, reconstruct input images and compute differences.
        for k in range(inum):
	  # Weighted sum of all eigenimages using column k of pca.Wt ...
	  ximg = pca.Wt[0][k]*odata[0]
	  for l in range(1,inum):
	    ximg += pca.Wt[l][k]*odata[l]

	  # ... then undo the standardization with input k's mean and deviation.
	  ximg = pca.mu[k] + pca.sigma[k]*ximg
          admit.logging.regression("PCA: %s residual: " % icols[k] +
                                   str(np.linalg.norm(ximg - data[k])))

	# Collect large covariance values for summary.
	# Only pairs i < j are visited; cvmax tracks the largest absolute
	# covariance among them.
	cvmin = self.getkey('covarmin')
	cvsum = []
        cvmax = 0.0
	for i in range(inum):
	  for j in range(i+1, inum):
            if abs(covar[i][j]) >= cvmax:
              cvmax = abs(covar[i][j])
	    if abs(covar[i][j]) >= cvmin:
	      cvsum.append([icols[i], icols[j], "%.4f" % (covar[i][j])])
        admit.logging.regression("PCA: Covariances > %.4f: %s (max: %.4f)" % (cvmin,str(cvsum),cvmax))

	table2 = admit.Table()
	table2.columns = ["Input1", "Input2", "Covariance"]
	table2.setData(cvsum)
	table2.description = "PCA High Covariance Summary"

	# Summary entry: both tables plus the key values used for this run.
	keys = "covarmin=%.4f clipvals=%s" % (cvmin, str(clip))
        self._summary["pca"] = admit.SummaryEntry([table1.serialize(),
						   table2.serialize()
						  ],
						  "PrincipalComponent_AT",
						  self.id(True), keys)
Пример #20
0
def _split_into_slices(npoints, slice_count, split_by_count):
    """Partition ``npoints`` into ``slice_count`` arrays along column 0.

    Args:
        npoints: 2-D array whose rows are stacked onto (0, 3) arrays, so it
            must have three columns.
        slice_count: number of slices to produce.
        split_by_count: when true, rows are dealt out in ascending order of
            column 0 in runs of equal length; otherwise the value range of
            column 0 is cut into ``slice_count`` equal-width bins.

    Returns:
        A list of ``slice_count`` arrays of shape (k, 3); slices may be empty.
    """
    projection0 = npoints[:, 0]
    pointsets = [np.array([]).reshape(0, 3) for _ in range(slice_count)]

    if split_by_count:
        step_count = len(projection0) // slice_count
        current_slot_count, ptsetid = 0, 0
        for index in np.argsort(projection0):
            pointsets[ptsetid] = np.vstack(
                (pointsets[ptsetid], npoints[index, :]))
            current_slot_count += 1
            # Strict '>' puts step_count + 1 rows in each slot, which keeps
            # ptsetid below slice_count for every row that is stored.
            if current_slot_count > step_count:
                current_slot_count = 0
                ptsetid += 1
        return pointsets

    projection0min = np.min(projection0)
    projection0max = np.max(projection0)
    step_len = (projection0max - projection0min) / slice_count
    for i, value in enumerate(projection0):
        if value == projection0max:
            # The maximum would otherwise map to index slice_count.
            ptsetid = slice_count - 1
        else:
            ptsetid = int((value - projection0min) / step_len)
        pointsets[ptsetid] = np.vstack((pointsets[ptsetid], npoints[i]))
    return pointsets


def surf_segmentation(points, config, paint_when_end=False):
    """Segment a point cloud into surfaces, one slice at a time.

    The points are normalised with ``point_normalize``, partitioned into
    slices along their first coordinate, and ``identifysurf`` is run on each
    slice in turn. Each call receives the surfaces found so far plus the
    points the previous call returned as failed.

    Args:
        points: input points, handed to ``point_normalize``; the normalised
            result is indexed as a three-column 2-D array.
        config: settings object. ``config.slice_count`` is clamped IN PLACE
            to at most ``len(points) / config.origin_points``. The attributes
            copied into ``AdaSurfConfig`` and ``config.split_by_count`` are
            also read.
        paint_when_end: forwarded to ``identifysurf``; when true, the first
            element of each call's extra data is collected.

    Returns:
        ``(surfs, npoints, (slice_fig,))``: the list of surfaces, the
        normalised points, and a 1-tuple holding the collected extra data
        (empty unless ``paint_when_end`` is true).

    Raises:
        AssertionError: if a slice would hold fewer than
            ``config.origin_points`` points.
    """
    config.slice_count = min(int(len(points) / config.origin_points),
                             config.slice_count)
    assert len(points) // config.slice_count >= config.origin_points

    forwarded_keys = (
        'origin_points',
        'most_combination_points',
        'same_threshold',
        'filter_rate',
        'ori_adarate',
        'step_adarate',
        'max_adarate',
        'pointsame_threshold',
        'filter_count',
        'weak_abort',
    )
    adasurconfig = AdaSurfConfig(
        {key: getattr(config, key) for key in forwarded_keys})

    slice_fig = []
    npoints = point_normalize(points)

    # NOTE(review): the PCA result is not used anywhere below. The call is
    # kept only because it can raise on degenerate input and callers may
    # (unintentionally) rely on that -- confirm, then remove.
    pca_md = mlab.PCA(np.copy(npoints))

    # Slices are taken along the raw first coordinate, so no explicit
    # direction is handed to identifysurf.
    projection0_direction = None

    pointsets = _split_into_slices(npoints, config.slice_count,
                                   config.split_by_count)

    partial_surfs, fail = [], np.array([]).reshape(0, 3)
    for ptsetindex, ptset in enumerate(pointsets):
        print("--------------------------------------")
        print("before segment {0} / {1}".format(ptsetindex, len(pointsets)))
        print('derived surfs:')

        # 'is None' rather than '== None': on an ndarray the latter is an
        # elementwise comparison and cannot be used as a condition.
        if fail is None:
            allptfortest = np.array(ptset)
        else:
            # Retry the points the previous slice could not assign.
            allptfortest = np.vstack((ptset, np.array(fail).reshape(-1, 3)))
        print("len of surf is:  {0} , len of points is:  {1}".format(
            len(partial_surfs), len(allptfortest)))

        if len(allptfortest) > 0:
            partial_surfs, _, fail, extradata = identifysurf(
                allptfortest,
                adasurconfig,
                donorm=False,
                surfs=partial_surfs,
                title=str(ptsetindex),
                paint_when_end=paint_when_end,
                current_direction=projection0_direction)
            if paint_when_end:
                slice_fig.append(extradata[0])

        if fail is None:
            print("after segment {0} len of surf {1} fail is None {2}".format(
                ptsetindex, len(partial_surfs), fail))
        else:
            print("after segment {0} len of surf {1} len of fail {2}".format(
                ptsetindex, len(partial_surfs), len(fail)))

        for x in partial_surfs:
            x.printf()

    surfs = list(partial_surfs)
    return surfs, npoints, (slice_fig, )
Пример #21
0
    def test_colinear_pca(self):
        """PCA of the built-in colinear data set has no weight past axis 2.

        Both the variance fractions and the projected coordinates from the
        third component onwards must be zero to within 1e-8.
        """
        colinear = mlab.PCA._get_colinear()
        model = mlab.PCA(colinear)

        for trailing in (model.fracs[2:], model.Y[:, 2:]):
            np.testing.assert_allclose(trailing, 0., atol=1e-8)
Пример #22
0
def project_structs(structs,
                    ptype ='l',
                    affinities = None,
                    n_comp = None,
                    l = None,
                    vecs = None):
    '''
    Project RNA structures in any one of several ways.
    Different projections require different inputs.

    inputs:
     structs:    list of structures (specified as base pairs); used by
                 the 'l' and 'full_pairs' projections
     ptype:      one of 'pca', 'rnd', 'l', 'full_pairs'
     affinities: affinity matrix for 'pca'
     n_comp:     number of components for 'pca' and 'rnd'
     l:          sequence length for 'l' and 'full_pairs'; for 'rnd' it
                 is the dimension of vecs and is read from vecs when
                 omitted
     vecs:       vectors (from an 'l' / 'full_pairs' projection) to push
                 through a random matrix for 'rnd'

    outputs:
     'pca': the PCA projection of affinities, first n_comp columns
     'l':   an array with one struct_project_l result per structure
     'full_pairs': a list with one struct_project_l2 result per
            structure
     'rnd': vecs multiplied by a random n_comp x l matrix with entries
            +/- 1/sqrt(l); one row per input vector

    The projections requiring the fewest inputs are 'l' and
    'full_pairs', as these only need the structures and a sequence
    length. The others can be fed from their output.

    raises:
     AssertionError if an input required by ptype is None
     Exception      if ptype is not one of the values listed above

    Side effect: the 'rnd' projection draws its random matrix with
    plt.imshow.
    '''

    # Use identity tests against None: '!=' on an array compares
    # elementwise, and asserting on the resulting array raises ValueError.
    if ptype == 'pca':
        assert affinities is not None, "'pca' projection needs affinities"
        assert n_comp is not None, "'pca' projection needs n_comp"
        pca_vecs = mlab.PCA(affinities).project(affinities)
        return pca_vecs[:, 0:n_comp]
    elif ptype == 'l':
        assert l is not None, "'l' projection needs l"
        return array([struct_project_l(p, l) for p in structs])
    elif ptype == 'full_pairs':
        assert l is not None, "'full_pairs' projection needs l"
        return [struct_project_l2(p, l) for p in structs]
    elif ptype == 'rnd':
        assert n_comp is not None, "'rnd' projection needs n_comp"
        assert vecs is not None, "'rnd' projection needs vecs"
        if l is None:
            # dot(mat, vecs.T) only works when mat has as many columns as
            # vecs has entries along its last axis.
            l = vecs.shape[-1]
        # Random sign matrix: entries are -1 or +1, scaled by 1/sqrt(l).
        mat = array(np.round(random.rand(n_comp, l)), float)
        mat *= 2
        mat -= 1
        mat /= sqrt(l)
        plt.imshow(mat)
        return dot(mat, vecs.T).T
    else:
        raise Exception('Projection type: {0} not yet implemented'.format(ptype)) 
Пример #23
0
# <markdowncell>

# Principal Components Analysis and Display
# -----------------------------------------
#
# The first three principal components identify the three major risk factors, and account for 95% of the total variance:
#
# * The first factor represents an approximate parallel shift
# * The second factor represents a twist
# * The third factor represents a change in convexity

# <codecell>

# PCA on rate change
# The PCA input is the first difference of zc_rate along axis 0
# (presumably one row per observation date -- confirm where zc_rate is built).
zc_pca = ml.PCA(np.diff(zc_rate, axis=0))

fig = plt.figure()
fig.set_size_inches(10, 6)

# First panel of a 1-row, 2-column subplot grid.
ax = fig.add_subplot(121)

# compute x-axis limits
# Build the zero curve for the first entry of dtObs; the first element of the
# returned dtMat is kept as dtMin.
dtCalc = dtObs[0]
ts = get_term_structure(df_libor, dtCalc)
(dtMat, zc) = zero_curve(ts, days, dtCalc)
dtMin = dtMat[0]

# Repeat for the last entry of dtObs. This overwrites dtCalc, ts, dtMat and
# zc; only dtMin survives from the first pass.
dtCalc = dtObs[-1]
ts = get_term_structure(df_libor, dtCalc)
(dtMat, zc) = zero_curve(ts, days, dtCalc)
Пример #24
0
# For every topic, look up the vocabulary entries with the largest weights
# (indices sorted by descending weight, truncated to num_top_words).
topic_words = []
for topic in clf.components_:
    descending = np.argsort(topic)[::-1]
    word_idx = descending[:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])

# Print at most 15 words per topic.
for t, top_words in enumerate(topic_words):
    print("Topic {}: {}".format(t, ' '.join(top_words[:15])))

# Cluster the document-topic matrix into 11 groups.
kmeans = KMeans(n_clusters=11, random_state=0)
kmeans.fit(doctopic)
clusters = kmeans.predict(doctopic).reshape(-1, 1)

centroid, clusterid = kmeans.cluster_centers_, kmeans.labels_

# Project documents and centroids with the same PCA, using the second
# component's variance fraction as the minfrac cut-off.
doctopic_pca = mlab.PCA(doctopic)
cutoff = doctopic_pca.fracs[1]
doctopic_2d = doctopic_pca.project(doctopic, minfrac=cutoff)
centroid_2d = doctopic_pca.project(centroid, minfrac=cutoff)

# One colour per cluster centroid.
colors = [
    'red',
    'green',
    'blue',
    'yellow',
    'black',
    'cyan',
    'magenta',
    'brown',
    'tomato',
    'c',
    'slateblue',
]

plt.figure()
# Axis limits: extent of the projected documents plus a margin of 0.5.
first_axis, second_axis = doctopic_2d[:, 0], doctopic_2d[:, 1]
plt.xlim([first_axis.min() - 0.5, first_axis.max() + 0.5])
plt.ylim([second_axis.min() - 0.5, second_axis.max() + 0.5])
plt.xticks([], [])
plt.yticks([], [])

plt.scatter(centroid_2d[:, 0], centroid_2d[:, 1], marker='o', c=colors, s=100)
Пример #25
0
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

# Load the data frame and keep a transposed copy for the PCA.
pca_df = pd.read_csv('../data/parsed/results/DataFrameProto.csv')
pca_T = pca_df.T

x, y = pca_df[0], pca_df[1]
mean_x, mean_y = np.mean(x), np.mean(y)

pca = mlab.PCA(pca_T)

# Standard deviations of transposed columns '0' and '1'.
sig_x = np.std(pca_T['0'])  # ~ 2.01
sig_y = np.std(pca_T['1'])  # ~ 0.96

# Scatter the samples in the plane of the first two PCA output columns.
plt.figure(1)
plt.plot(pca.Y[:, 0], pca.Y[:, 1], 'o', alpha=0.5, color='blue')

#plt.axis('equal')
plt.title('Transformed PCA samples')
plt.xlim(xmin=(-3 * sig_x), xmax=(3 * sig_x))
plt.xticks(np.arange(-2 * sig_x, 3 * sig_x, sig_x))

# Dotted guide lines at -2, -1, +1 and +2 times sig_x.
for n_sigma in (-2, -1, 1, 2):
    plt.axvline(x=sig_x * n_sigma, linestyle='dotted')
Пример #26
0
#from code import operators
import code
from code.operators import *
from math import *
from code.utils.mathlogic import *
from code.rencode import *
from code.evodevo import *
from random import sample
import matplotlib.mlab as mlab
from numpy import array as nparray
import numpy

numclasses = 5

# Class labels: one integer per line. The file is read inside a `with`
# block so the handle is closed as soon as the data has been parsed.
with open('datafiles/MIREXclasses.txt') as class_file:
    allclasses = nparray([int(line) for line in class_file])

# Feature matrix: one comma-separated row of floats per line. A nested list
# comprehension (rather than map) yields real lists on Python 2 and 3 alike.
with open('datafiles/FMnorm.txt') as feature_file:
    allfeatures = nparray([
        [float(field) for field in line.split(',')]
        for line in feature_file
    ])

pcafeats = mlab.PCA(allfeatures)

#print pcafeats.fracs
#print pcafeats.Y[0,:]

# Project the features with minfrac = 0.03 (per the mlab.PCA docs this drops
# axes whose variance fraction is below the threshold -- confirm for the
# matplotlib version in use), show the first row and save the result.
projected = pcafeats.project(allfeatures, 0.03)
print(projected[:1])
numpy.save('datafiles/projectedfeat-03', projected)