Example #1
def do_fast_ica(pca_first):
    mo1_cj_inverse = numpy.array(mo1_cj).T
    mo2_cj_inverse = numpy.array(mo2_cj).T
    if pca_first:
        mo1_cj_array = mdp.pca(mo1_cj_inverse, input_dim=4, output_dim=3)
        mo2_cj_array = mdp.pca(mo2_cj_inverse, input_dim=4, output_dim=3)
    else:
        mo1_cj_array = mo1_cj_inverse
        mo2_cj_array = mo2_cj_inverse
    a = mdp.fastica(mo1_cj_array)
    b = mdp.fastica(mo2_cj_array)
    return a, b
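A minimal, self-contained sketch of the same PCA -> FastICA chain (the snippet above depends on the globals mo1_cj and mo2_cj, which are not shown, so the input here is synthetic):

import numpy
import mdp

# four observed channels: a random linear mixture of four source signals
t = numpy.linspace(0, 8 * numpy.pi, 2000)
sources = numpy.c_[numpy.sin(t), numpy.sign(numpy.sin(3 * t)),
                   numpy.cos(2 * t), t % 1.0]
observed = sources.dot(numpy.random.random((4, 4)))  # rows are observations

reduced = mdp.pca(observed, input_dim=4, output_dim=3)  # drop one component
unmixed = mdp.fastica(reduced)                          # estimate the sources
print(unmixed.shape)  # (2000, 3)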
Example #3
 def _pca(self):
     #self.pca_box_surface_area= 2*( self.pca_lengths[0]*self.pca_lengths[1]
     #                + self.pca_lengths[1]*self.pca_lengths[2]
     #                + self.pca_lengths[2]*self.pca_lengths[0]
     #                )
     '''
     2 * (
         1/2. * self.pca_lengths[0] * numpy.sqrt(numpy.square(self.pca_lengths[1]/2) + numpy.square(self.pca_lengths[2]/2)) 
         +
         1/2. * self.pca_lengths[0] * numpy.sqrt(numpy.square(self.pca_lengths[2]/2) + numpy.square(self.pca_lengths[1]/2)) 
         )
     '''
     #self.pca_rhombus =  self.pca_lengths[0] * numpy.sqrt(numpy.square(self.pca_lengths[2]) + numpy.square(self.pca_lengths[1]))
     mins   = [float('inf'), float('inf'), float('inf')]
     maxs   = [float('-inf'),float('-inf'),float('-inf')]
     for x in mdp.pca( numpy.array([[compartment.x, compartment.y, compartment.z] for compartment in self.morphology.compartments]) ):
         for d in xrange(3):
             if x[d] < mins[d]:
                 mins[d]     = x[d]
             if x[d] > maxs[d]:
                 maxs[d]     = x[d]
     self._pca_length_x  = maxs[0] - mins[0]
     self._pca_length_y  = maxs[1] - mins[1]
     self._pca_length_z  = maxs[2] - mins[2]
     
     self._pca_lengths = (
         self._pca_length_x,
         self._pca_length_y,
         self._pca_length_z
     )
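The core computation above, isolated: the extent of a 3-D point cloud along its principal axes. The point cloud here is a synthetic stand-in, not real morphology compartments:

import numpy
import mdp

points = numpy.random.randn(500, 3) * [12.0, 3.0, 0.8]  # elongated cloud
rotated = mdp.pca(points)  # the same points in the principal-axes frame
lengths = rotated.max(axis=0) - rotated.min(axis=0)
print(lengths)  # extents, roughly from longest to shortest axis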
Example #4
def tweet_pca_reduce(tweets_train, tweets_test, output_dim):

    # convert dictionary feature vecs to numpy array
    print '--> Converting dictionaries to NumPy arrays'
    train_arr = numpy.array( [tweet_features.tweet_dict_to_nparr(t) for \
                              (t,s) in tweets_train])

    test_arr = numpy.array( [tweet_features.tweet_dict_to_nparr(t) for \
                             (t,s) in tweets_test])

    # compute principal components over the training set
    print '--> Computing PCA'
    pca_array = mdp.pca( train_arr.transpose(), \
                         svd=True, output_dim=output_dim )

    # project both train and test sets to PC space
    print '--> Projecting feature vectors to PC space'

    train_arr = numpy.dot(train_arr, pca_array)
    test_arr = numpy.dot(test_arr, pca_array)

    # convert projected vecs back to reduced dictionaries
    print '--> Converting NumPy arrays to dictionaries'

    reduced_train = \
        zip( [tweet_features.tweet_nparr_to_dict(v) for v in train_arr], \
             [s for (t,s) in tweets_train] )

    reduced_test  = \
        zip( [tweet_features.tweet_nparr_to_dict(v) for v in test_arr], \
             [s for (t,s) in tweets_test])

    return (reduced_train, reduced_test)
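The projection pattern above, reduced to its shapes with toy data (the tweet_features helpers are not shown): PCA of the transposed matrix yields an (n_features, output_dim) basis onto which both sets are projected:

import numpy
import mdp

train_arr = numpy.random.rand(100, 20)  # (tweets, features)
test_arr = numpy.random.rand(30, 20)

basis = mdp.pca(train_arr.transpose(), svd=True, output_dim=5)  # (20, 5)
train_red = numpy.dot(train_arr, basis)  # (100, 5)
test_red = numpy.dot(test_arr, basis)    # (30, 5)
print(train_red.shape, test_red.shape)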
Example #5
def pca(self):
    import numpy
    import mdp
    m = morphjongleur.model.morphology.Morphology(
      name                  = self.name,
      file_origin           = self.file_origin,
      description           = self.description,
      datetime_recording    = self.datetime_recording
    )
    assert m.number_of_compartments == 0
    pca_cs  = mdp.pca( numpy.array([ [c.x, c.y, c.z] for c in self.compartments ] ) )
    assert self.number_of_compartments == len(pca_cs)
    for i in xrange( self.number_of_compartments ):
        m.add_compartment(
            morphjongleur.model.morphology.Compartment( 
                self._compartments[i].compartment_id, 
                self._compartments[i].compartment_parent_id, 
                self._compartments[i].radius, 
                x=pca_cs[i][0],
                y=pca_cs[i][1],
                z=pca_cs[i][2]
            ) 
        )
    assert self.number_of_compartments == m.number_of_compartments
    return m
Example #6
def eval_func(chromosome):
    """ The evaluation function """
    indices_values = []
    sellTrendVector = []
    buyTrendVector = []

    for gene in chromosome:
        indices_values.append(gene.getResult())
        sellTrendVector.append(gene.getResult()[:tradingGA.sellTrendBeginning])
        buyTrendVector.append(gene.getResult()[:tradingGA.buyTrendBeginning])

    #raw_input("Press ENTER to exit")

    indices_values = indicesNormalizer().normalize(indices_values)
    indices_values = numpy.asarray(indices_values)

    result = mdp.pca(indices_values.T, reduce=True)  #, svd=True)
    sell_center = calculate_centroid_center(result[:4], sellTrendVector)
    buy_center = calculate_centroid_center(result[:4], buyTrendVector)

    #print sell_center, len(sell_center)
    #print buy_center, len(buy_center)

    wynik = numpy.linalg.norm(
        numpy.asarray(sell_center) - numpy.asarray(buy_center))

    return wynik
Example #9
def PCA(data):
    NBD = np.zeros((len(data) - 1, len(data[0]) - 1))
    for k in range(1, len(data), 1):
        row = []
        for k1 in range(1, len(data[0]), 1):
            if (is_number(data[k][k1])):
                row.append(float(data[k][k1]))
        NBD[k - 1] = row
    pca = mdp.pca(NBD, svd=True)
    return pca
Example #10
 def _reduce_dimensions(
     self, vectors,
     output_dim=6
 ):
     """
     Scales image data vectors to lower dimension
     """
     matrix = np.array(vectors, dtype='float32')
     scaled = mdp.pca(matrix, output_dim=output_dim)
     return scaled
Example #11
def simple_pca(a1):

    M = np.zeros((len(a1), len(a1)))

    for i in range(len(a1)):
        for j in range(i, len(a1)):
            M[i, j] = M[j, i] = (a1[i] - a1[j])**2

    import mdp

    return mdp.pca(M, output_dim=2)
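For reference, the same squared-distance matrix can be built without the double loop; a small usage sketch with made-up values (svd=True is added here as a guard, since such a matrix can have a rank-deficient covariance):

import numpy as np
import mdp

a1 = np.array([0.0, 0.9, 1.1, 3.8, 4.0, 7.5])
M = (a1[:, None] - a1[None, :]) ** 2  # pairwise squared differences
coords = mdp.pca(M, output_dim=2, svd=True)
print(coords.shape)  # (6, 2)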
Example #12
def pca_distance( m ):
    # perform PCA, then add random noise scaled to the spread of the data

    comps = mdp.pca( m, output_dim = 2 )
    a = comps[1,0]
    comps[:,0] += jitters( comps[:,0] )
    b = comps[1,0]
    assert a != b
    comps[:,1] += jitters( comps[:,1] )

    return comps
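jitters is not defined in this snippet or the next one; a plausible stand-in (an assumption, not the original implementation) that adds uniform noise scaled to the column's spread:

import numpy as np

def jitters(col, scale=0.01):
    # uniform noise proportional to the column's range
    span = col.max() - col.min()
    return np.random.uniform(-scale * span, scale * span, size=col.shape)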
Example #13
def pca_distance( aDistanceMatrix, dim = 2 ):

    comps = mdp.pca( aDistanceMatrix.M, output_dim = dim )
    a = comps[1,0]
    comps[:,0] += jitters( comps[:,0] )
    b = comps[1,0]
    #assert a != b
    comps[:,1] += jitters( comps[:,1] )

    return comps
Example #15
def getPcaTransformedMatrix(samples, group2samples, type2intersectGenes, selectedvjs, genetype, abs, outfile, options):
    m, rownames = preparePcaMatrix(samples, group2samples, type2intersectGenes, selectedvjs, genetype, abs)
    transformedM = mdp.pca(m, output_dim=4)
    
    #Write to text file
    f = open("%s.txt" %outfile, 'w')
    for i,r in enumerate(transformedM):
        f.write("%s\t%s\t%s\n" %(rownames[i][0], rownames[i][1], "\t".join( [str(c) for c in r] )))
    f.close()

    #Draw plot:
    drawPca(rownames, transformedM, outfile, options)
Example #16
    def Main(self,model):
        # self.model = model
        data = array(model.GetCurrentData()[:])

        k = wx.GetNumberFromUser("PCA Dialog",
                                 "Enter number of principal components",
                                 "k",
                                 1)

        pca_data = mdp.pca(data, output_dim=k)
        # ica_data = r.fastICA(data, k, alg_typ = "deflation", fun = "logcosh", alpha = 1, method = "R", row_norm = 0, maxit = 200, tol = 0.0001, verbose = 1)
        fields = ['Comp%02d' % c for c in range(1, k+1)]
        model.updateHDF('PcaPY', pca_data, fields=fields)
Example #17
def computeClusterCentre(chromosome, trendBeginning):
    indices_values = []
    trendVector = []

    for gene in chromosome:
        indices_values.append(gene.getResult())
        trendVector.append(gene.getResult()[:trendBeginning])

    indices_values = indicesNormalizer().normalize(indices_values)
    indices_values = numpy.asarray(indices_values)

    result = mdp.pca(indices_values.T, reduce=True)
    center = calculate_centroid_center(result[:4], trendVector)

    return center
Example #19
def PCAAlg(x, fps):
    global lastValue
    global counter
    global pca_bpms
    primo = True
    prova = -1
    x = np.transpose(x)
    #print("dopo x: " + str(x))
    y = mdp.pca(x)
    #print("pca: " + str(y))
    secondComponent = y[:, 1]
    #print("second: " + str(secondComponent))
    freqs, pruned = searchFreqs(secondComponent, fps, len(secondComponent))
    prova, index = calcolaProssimaFreqSensata(freqs, pruned)
    #print("pca: " + str(prova))
    pca_bpms.append(prova)
Example #20
    def plot_clusters(self, spikes, noise_cov=None):
        """:spikeplot.cluster: and :spikeplot.cluster_projection: plots

        There will be two plots visualizing the clustering and discrimination
        of the sorting. One showing the clustering of units (scatter plot
        using the first two principal components). The initial cluster
        labels are preserved as colorization in the projected data.

        Additionally there will be a plot showing the projection of each
        cluster coupling onto the vector connecting the corresponding cluster
        means/centers.

        :type spikes: dict
        :param spikes: one set of waveforms per unit {k:[n,samples]}
        :type noise_cov: ndarray
        :param noise_cov: noise covariance matrix compatible with the
            dimension of individual observations in :spikes:
        """

        # prepare data
        tf = sum(self.parameters['cut'])
        # TODO: prewhiten !!!
        data_stacked = pca(sp.vstack(spikes.values()), output_dim=4)
        data = {}
        idx = 0
        for k, v in spikes.items():
            n = v.shape[0]
            data[k] = data_stacked[idx:idx + n]
            idx += n

        # produce scatter plots
        for pcs in [(0, 1), (2, 3)]:
            self.result.append(
                cluster(
                    data,
                    data_dim=pcs,
                    plot_mean=True,
                    title='cluster plot',
                    xlabel='PC%s' % (pcs[0] + 1),
                    ylabel='PC%s' % (pcs[1] + 1)))

        # cluster projection
        self.result.append(cluster_projection(data))
Example #22
    def compute(self,
                waveforms,
                sampling_rate=None,
                output_dim=2,
                start_sample=0,
                num_samples=0):
        """Computes PCA of waveforms concatenated across recording points.
        
        waveforms : ndarray of waveforms, of shape
            (N_spikes, N_recordingpoints, len(waveform))        
        sampling_rate : not used
        output_dim : Number of features (eigenvalues) to return per waveform
        start_sample : Index of first sample in each waveform to slice out
            to use for PCA
        num_samples : Number of samples of each waveform to use for PCA.
            The default is '0', which means to use all samples, regardless
            of the value of `start_sample`.
        
        Returns : pca_mat, a matrix of components. shape: (N_spikes, N_features)
        """
        lenwf = waveforms.shape[2]

        if num_samples > 0:
            # We're not using all samples
            if start_sample < 0 or start_sample >= lenwf:
                # garbage input, use all samples
                print "warning: start_sample must be in [0, %d)" % lenwf
                start_sample = 0

            # slice
            waveforms = waveforms[:, :,
                                  start_sample:(start_sample + num_samples)]

        # reshape into the format PCA expects. Each row is now a concatenation
        # of waveforms from each channel in the group.
        waveforms2 = waveforms.reshape(waveforms.shape[0],
                                       waveforms.shape[1] * waveforms.shape[2])

        # do PCA and return results in (N_spikes, N_features) shape
        pca_mat = mdp.pca(waveforms2, output_dim=output_dim)
        return pca_mat
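A hypothetical usage of compute() with synthetic waveforms, following the shapes from the docstring:

import numpy as np
import mdp

waveforms = np.random.randn(300, 4, 32)  # (N_spikes, N_recordingpoints, len(waveform))
flat = waveforms.reshape(waveforms.shape[0], -1)  # one concatenated row per spike
pca_mat = mdp.pca(flat, output_dim=2)  # (N_spikes, N_features)
print(pca_mat.shape)  # (300, 2)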
Example #24
def reduce_dimensions(myarray):
    # With the standard test input, plain mdp.pca fails with:
    # "Covariance matrix may be singular. Try instantiating the node with svd=True."
    return mdp.pca(myarray, svd=True)
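A sketch of the situation the comment refers to: with more features than observations the covariance matrix is singular, and svd=True lets the decomposition proceed anyway. The data here is made up:

import numpy as np
import mdp

x = np.random.randn(10, 50)  # 10 observations, 50 features -> singular covariance
y = mdp.pca(x, svd=True)     # the default eigensolver would complain here
print(y.shape)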
Example #25
import mdp

# x is matrix of all instances and features
y = mdp.pca(x)

# TODO: check whether mdp.ica yields anything useful
Example #27
def runPCA(paramFile):
  data = np.loadtxt(paramFile)
  y = mdp.pca(data, reduce=True)
  print y
Example #28
def pca(data, singleValueDecomp=True):
    return (mdp.pca(data, svd=singleValueDecomp))
Example #29
    if ideology != prev:
        stats.append([])
        cursor2 = connection.cursor()
        cursor2.execute("select ideology from ideology where id=%s" % ideology)
        idrow = cursor2.fetchone()
        idname = idrow[0]
        if idname == "":
            idname = "[ideology #%d]" % ideology
        ideologies.append(idname)
    stats[-1].append(float(row[2]))
    prev = ideology
    row = cursor.fetchone()


raw = array(stats)
cooked = mdp.pca(raw, output_dim=2) # see http://nullege.com/codes/search/mdp.pca


(xmax, ymax) = cooked.max(0) # max value in each column vector of y, see http://mathesaurus.sourceforge.net/numeric-numpy.html
(xmin, ymin) = cooked.min(0) # And min.  These will be used to interpolate the x,y coordinates for plotting


idmap=Image.new("RGB", (width+240, height+12), (128,128,128))
draw=ImageDraw.Draw(idmap)
for i in range(len(ideologies)):
    ts=draw.textsize(ideologies[i]) # center the name over its coordinates
    x=width*(cooked[i,0]-xmin)/(xmax-xmin)-math.trunc(ts[0]/2)+120
    y=height*(cooked[i,1]-ymin)/(ymax-ymin)-math.trunc(ts[1]/2)+6
    newcolor=[]
    for j in range(3): # generate a random color, one that contrasts w. midtone gray background
        newrand=random.random()+random.random()
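The coordinate mapping used in the drawing loop above, isolated: linearly rescale one PCA coordinate into pixel space (the helper name is illustrative, and the text-centering term is omitted):

def to_pixels(value, vmin, vmax, extent, offset):
    # map value from [vmin, vmax] onto [offset, offset + extent]
    return extent * (value - vmin) / (vmax - vmin) + offset

# e.g. x = to_pixels(cooked[i, 0], xmin, xmax, width, 120)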
Example #30
from mdp.nodes import RBMNode
import mdp
from numpy import *
import time

import read_spro

X = read_spro.load_mfcc_file()

rbm = RBMNode(10, X.shape[1])  # constructed but never used in this snippet

x2 = X.dot(X.T)  # Gram matrix of the loaded MFCC frames

print x2.shape

mdp_pca = mdp.pca(x2)

print X.shape
Example #31
print "read data"
for line in co_occ_file:
    if count < 100:
        count += 1
        line = line.replace("\n", "")
        instance = line.split(" ")
        if (indexes[instance[0]] == -1):
            indexes[instance[0]] = current_index
            current_index += 1
        if (indexes[instance[1]] == -1):
            indexes[instance[1]] = current_index
            current_index += 1
        matrix[indexes[instance[0]], indexes[instance[1]]] = float(instance[2])
    else:
        break

    if count % 10000 == 0:
        print count, "entries processed"

co_occ_file.close()

if current_index != nr_functional_words - 1:
    print "not the same", current_index, nr_functional_words

print "perform pca"
matrix = mdp.pca(matrix, output_dim=30)  # mdp.pca takes the target dimension as a keyword argument
print "pca done, start tsne"

#Y = tsne.tsne(X, no_dims, perplexity)
y = tsne.tsne(matrix, 2, nr_functional_words, perplexity)
Example #32
def featureSelection(trainData, labels, featureSelectionMechanism, numFeatures,
                     minNumSongs, maxNumSongs, trainSongs, featureExtractor):
    """
    Given the feature sets of the examples, returns a reduced feature set.
    
    @param list of training data
    @param list of strings - a list of the unique labels
    @param string - type of feature selection we want to perform
    @param number of features we want to choose
    @param number of min songs for num_songs feature selection
    @param number of max songs for num_songs feature selection
    @return list with a reduced set of features
    """
    informationGains = []
    featureNames = []
    featureLibraryInfo = []
    featureLibrary = []

    # Calculate all of the features (not just those from the example)
    print("Calculate all of the given features")
    allFeatures = Counter()
    featureArray = []
    featureArray = [fs for (fs, label) in trainData]
    for featureSet in featureArray:
        allFeatures.update(featureSet)
    if (numFeatures < len(allFeatures) and numFeatures != 0):
        if (featureSelectionMechanism == "information_gain"):
            selected_features_info = []
            selected_features = []
            print("Using information gain to select features")
            # Loop through all of the features and calculate the information gain for each
            for feature in allFeatures:
                print("Calculating information gain for %s " % feature)
                informationGains.append(
                    informationGain(trainData, allFeatures, feature, labels))
                featureNames.append(feature)
            informationGains = np.array(informationGains)
            sortedargs = np.argsort(informationGains)
            featureNames = [featureNames[i] for i in sortedargs]
            print informationGains
            #informationGains.reverse()
            #featureNames.reverse()
            # Add the top numFeatures to the counter.
            # if requesting too many features change number of requested features.
            if (numFeatures > len(featureNames)):
                numFeatures = len(featureNames)
            for i in range(0, numFeatures):
                selected_features_info.append(informationGains[i])
                selected_features.append(featureNames[i])
            # print featureLibraryInfo
        elif (featureSelectionMechanism == "num_songs"):
            print("Using min/max song metric to select features")
            selected_features = getDict(minNumSongs, maxNumSongs, trainSongs,
                                        featureExtractor)
        else:
            print("Using PCA to select features")
            # Create a matrix with the relevant labels
            allFeaturePairList = list(allFeatures.items())
            allFeatureKeyList = [pair[0] for pair in allFeaturePairList]
            index = 0
            data_features_matrix = np.zeros((len(trainData), len(allFeatures)))
            print("Creating the matrix for PCA input")
            for (features, label) in trainData:
                # Loop through each feature and populate matrix
                for feature in features:
                    data_features_matrix[index][allFeatureKeyList.index(
                        feature)] = allFeatures[feature]
                index = index + 1

            # Run PCA to reduce feature size.
            # print(data_features_matrix)

            print("Using PCA to reduce the number of features")
            reduced_features = mdp.pca(transpose(data_features_matrix),
                                       output_dim=2,
                                       svd=True)
            u1 = reduced_features[:, 1]
            order = np.argsort(u1)[::-1]
            order = order[1:numFeatures]
            print(
                "Populate the selected_features based on representation in first principal component"
            )
            selected_features = [allFeatureKeyList[index] for index in order]
            print(selected_features)
    else:
        for feature in allFeatures:
            featureNames.append(feature)
        selected_features = featureNames

    # Return the selected features regardless of algorithm used.
    return selected_features
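The PCA branch above in isolation: rank features by their weight in a principal component of the transposed examples-by-features matrix. The matrix here is a toy stand-in:

import numpy as np
import mdp

data_features_matrix = np.random.rand(40, 12)  # (examples, features)
reduced = mdp.pca(data_features_matrix.T, output_dim=2, svd=True)
u1 = reduced[:, 1]            # one weight per feature
order = np.argsort(u1)[::-1]  # feature indices, most- to least-weighted
print(order[:5])              # the top features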
Example #33
def pca(data, singleValueDecomp=True):
    return mdp.pca(data, svd=singleValueDecomp)

random.seed(3)  # set the random seed (seeding with a list raises TypeError)

Y = get_headers()
for i in Y:
    F = split()
    for j in K:
        #should return a number between 0 and Y.size()
        #The algorithm should delete a random subset of classes
        keepCols = []
        deletedCols = []
        for jj in K:
            if random.randrange(2) == 1: keepCols.append(jj)  # every column has a 50% chance to be deleted
            else: deletedCols.append(jj)
        #the 'p' means that's a prime
        Xijp = bootstrap(F[keepCols], 1, len(F)*0.75) #option 2 http://climateecology.wordpress.com/2013/08/19/r-vs-python-speed-comparison-for-bootstrapping/
        Cij = mdp.pca(Xijp)

        # arranging the rotation matrix
        Ri = [[0] * len(K) for _ in range(len(Cij))]
        id = 0
        for a in range(len(Cij)):
            aux = 0
            for b in K:
                aux += Cij[a][b]
            if id == a: Ri[a][a] = aux  # fills the diagonal
            id += 1  # "++id" in the original was a no-op

        # It should have the same order but without some columns, so it is ok

Example #35
        cur = con.cursor()
        cur.execute("select * from bu_cat")
        rows = cur.fetchall()
        A = rows[0]
        for row in rows[1:]:
            A = np.vstack([A, list(row)])

except mdb.Error, e:
    print e
    sys.exit(1)

finally:
    if con:
        con.close()

A = mdp.pca(A.astype('float32'), reduce=True)

##distances = pdist(A, cosine)
##distances_2d = squareform(distances)
clusters = hierarchy.linkage(A, method='complete', metric='cosine')
flat_clusters = hierarchy.fcluster(clusters.clip(0, 100000), 0.8, 'inconsistent')
plt.scatter(*np.transpose(A), c=flat_clusters)  # color points by flat cluster label
plt.axis("equal")
title = "threshold: %f, number of clusters: %d" % (thresh, len(set(flat_clusters)))
plt.title(title)
plt.show()
with open('Clusters.dat', 'w+') as f:
    count = 0
    for v in flat_clusters:
        count += 1
        f.write(str(count) + "\t" + str(v) + "\n")
Example #37
abec = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


correcto = ['P', 'W', '3', '7', '6', '8', '8', 'D', 'Z', 'G', '2', '0', '1', 'R', 'S', '2', '0', '0']
for k in range(len(correcto)):
    im = Image.open("muestras/%s.png" % k)
    pix = im.load()
    w, h = im.size
    x = []

    for i in range(w):
        tmp = []
        for j in range(h):
            if pix[i, j] == (255, 255, 255):
                tmp.append(1)
            else:
                tmp.append(0)
        x.append(tmp)

    y = mdp.pca(np.array(x, dtype=np.float64), output_dim=7)
    y = y.transpose()
    res = []
    for value in y:
        res.append(value.sum() * 1.0e+14)
    y = res
    f.write("%s " % bin(abec.index(correcto[k]))[2:].zfill(7))
    for value in y:
        f.write("%s " % str(value))
    f.write("\n")
    if debug: print "Data for image %s: %s\n" % (k, y)
Example #38
 def do_pca(self,args):
     '''
     PCA -> "pca gaeta_coor_blind50.txt 1,3,6"
     Automatically measures PCA from the coordinates file and shows two interactive plots.
     With the (optional) second argument you can select the columns and the multiplier
     factors to use for the PCA (e.g. "1,3*50,6,8x10,9"). Don't use spaces. "*" and "x"
     mean the same thing.
     Without the second argument it reads the pca_config.txt file.
     (c)Paolo Pancaldi, Massimo Sandal 2009
     '''
     
     # reads the columns of pca
     if self.config['hookedir'][0]=='/':
         slash='/' #a Unix or Unix-like system
     else:
         slash='\\'
     self.my_hooke_dir = self.config['hookedir']+slash
     #self.my_work_dir = os.getcwd()+slash+"pCluster_"+time.strftime("%Y%m%d_%H%M")+slash
     #self.my_curr_dir = os.path.basename(os.getcwd())
     conf=open(self.my_hooke_dir+"pca_config.txt")
     config = conf.readlines()
     conf.close()
     
     self.plot_myCoord = []          # coordinates taken directly from the file created with pCluster
     self.plot_origCoord = []        # coordinates of only the chosen columns, multiplied by the chosen factors
     self.plot_pcaCoord = []         # the two PCA columns
     self.plot_pcaCoordTr = []       # the two PCA columns, transposed
     self.plot_FiltOrigCoord = []    # coordinates of only the points kept by the density filter
     self.plot_FiltPaths = []        # plot paths of only the points kept by the density filter
     self.plot_paths = []            # paths of all the plots
     self.plot_NewPcaCoord = []      # the two PCA columns after density filtering
     self.plot_NewPcaCoordTr=[]      # the two PCA columns after density filtering, transposed
     plot_path_temp = ""
     
     # takes one argument (the file name)
     # and optionally a second one with the columns to work on (e.g. "1,2,3")
     arg = args.split(" ")
     if arg[0]==args:
         file_name=args
     else:
         file_name=arg[0]
         config[0] = arg[1]
     
     # build the array "plot_myCoord" with all the plot coordinates
     # and the array plot_paths with all the plot paths
     nPlotTot = -3 # skip the first 3 header lines of the file
     f=open(file_name)
     rows = f.readlines()
     for row in rows:
         if row[0]!=" " and row[0]!="":
             nPlotTot = nPlotTot+1
             plot_path_temp = row
         if row[0]==" " and row.find('nan')==-1 and row.find("-1.#IND")==-1:
             row = row[row.index(";",2)+2:].split(" ; ")  # ignore the first column with the peak count
             row = [float(i) for i in row]
             
             #0:Mean delta, 1:Median delta, 2:Mean force, 3:Median force, 4:First peak length, 5:Last peak length
             #6:Max delta 7:Min delta 8:Max force 9:Min force 10:Std delta 11:Std force
             if (row[0]<500 and row[1]<500 and row[2]<500 and row[3]<500 and row[4]<500 and row[5]<500 and row[6]<500 and row[7]<500 and row[8]<500 and row[9]<500 and row[10]<500 and row[11]<500):
                 if (row[0]>0 and row[1]>0 and row[2]>0 and row[3]>0 and row[4]>0 and row[5]>0 and row[6]>0 and row[7]>0 and row[8]>0 and row[9]>0 and row[10]>0 and row[11]>0):
                     #row = row[0], row[2], row[3]*3, row[6], row[7]*56, row[8]
                     self.plot_myCoord.append(row)
                     self.plot_paths.append(plot_path_temp)
     f.close()
     
     # build the array with only some columns, already multiplied by their factors
     for row in self.plot_myCoord:
         res=[]
         for cols in config[0].split(","):
             if cols.find("*")!=-1:
                 col = int(cols.split("*")[0])
                 molt = int(cols.split("*")[1])
             elif cols.find("x")!=-1:
                 col = int(cols.split("x")[0])
                 molt = int(cols.split("x")[1])
             else:
                 col = int(cols)
                 molt = 1
             res.append(row[col]*molt)
         self.plot_origCoord.append(res)
     
     # array convert, calculate PCA, transpose
     self.plot_origCoord = np.array(self.plot_origCoord,dtype='float')
     #print self.plot_origCoord.shape
     self.plot_pcaCoord = pca(self.plot_origCoord, output_dim=2)	#other way -> y = mdp.nodes.PCANode(output_dim=2)(array)
     self.plot_pcaCoordTr = np.transpose(self.plot_pcaCoord)
     pca_X=np.array(self.plot_pcaCoordTr[0],dtype='float')
     pca_Y=np.array(self.plot_pcaCoordTr[1],dtype='float')
     
     '''
     # Start section of testing with good plots                                  # 4 TESTING!
     Xsyn_1=[]
     Ysyn_1=[]        
     Xgb1_1=[]
     Ygb1_1=[]
     Xbad_1=[]
     Ybad_1=[]
     goodnamefile=open(file_name.replace("coordinate", "good"),'r')
     goodnames=goodnamefile.readlines()
     nPlotGood = len(goodnames)-2 # drop the first and last lines
     goodnames=[i.split()[0] for i in goodnames[1:]]
     
     for index in range(len(self.plot_paths)):
         if self.plot_paths[index][:-1] in goodnames:
             Xsyn_1.append(pca_X[index])
             Ysyn_1.append(pca_Y[index])
         else:
             Xbad_1.append(pca_X[index])
             Ybad_1.append(pca_Y[index])
     # Stop section of testing with good plots                                   # 4 TESTING!
     '''
     
     # print first plot
     clustplot1=lhc.PlotObject()
     clustplot1.add_set(pca_X,pca_Y)
     #clustplot1.add_set(Xbad_1,Ybad_1) # 4 TESTING!
     #clustplot1.add_set(Xsyn_1,Ysyn_1) # 4 TESTING!
     clustplot1.normalize_vectors()
     clustplot1.styles=['scatter', 'scatter','scatter']
     clustplot1.colors=[None,'red','green']
     clustplot1.destination=0
     self._send_plot([clustplot1])
     self.clustplot1=clustplot1
     
     # density and filter estimation
     kernel = sp.stats.kde.gaussian_kde(sp.c_[pca_X,pca_Y].T)
     tallest = 0
     for i in range(len(pca_X)):
         kern_value = kernel.evaluate([pca_X[i],pca_Y[i]])
         if tallest < kern_value:
                 tallest = float(kern_value)
     if float(config[1]) == 0:
         my_filter = float(tallest / 3.242311147)
     else:
         my_filter = float(config[1])
     '''
     # section useful only for graphic printing
     xmin = pca_X.min()
     xmax = pca_X.max()
     ymin = pca_Y.min()
     ymax = pca_Y.max()
     mX, mY = sp.mgrid[xmin:xmax:100j, ymin:ymax:100j]
     Z = sp.rot90(sp.fliplr(sp.reshape(kernel(sp.c_[mX.ravel(), mY.ravel()].T).T, mX.T.shape)))
     axis_X = np.linspace(xmin,xmax,num=100)
     axis_Y = np.linspace(ymin,ymax,num=100)
     '''
     
     # density filtering:
     # "kernel.evaluate" scores the height of every coordinate, which decides whether to keep it
     filtered_pca_X = []
     filtered_pca_Y = []
     filtered_PcaCoordTr = []
     filtered_PcaCoord = []
     for i in range(len(pca_X)):
         kern_value = kernel.evaluate([pca_X[i],pca_Y[i]])
         if kern_value > my_filter:
             filtered_pca_X.append(pca_X[i])
             filtered_pca_Y.append(pca_Y[i])
     filtered_PcaCoordTr.append(filtered_pca_X)
     filtered_PcaCoordTr.append(filtered_pca_Y)
     filtered_PcaCoord = np.transpose(filtered_PcaCoordTr)
     
     # build the two arrays "plot_FiltOrigCoord" and "plot_FiltPaths" with only the high-density filtered data
     for index in range(len(self.plot_pcaCoord)):
         if self.plot_pcaCoord[index] in filtered_PcaCoord:
             self.plot_FiltOrigCoord.append(self.plot_myCoord[index])
             self.plot_FiltPaths.append(self.plot_paths[index])
     
     '''
     # START PCA#2: USELESS!!!
     
     # build the array with only some columns, already multiplied by their factors
     temp_coord = []
     for row in self.plot_FiltOrigCoord:
         res=[]
         for cols in config[2].split(","):
             if cols.find("*")!=-1:
                 col = int(cols.split("*")[0])
                 molt = int(cols.split("*")[1])
             elif cols.find("x")!=-1:
                 col = int(cols.split("x")[0])
                 molt = int(cols.split("x")[1])
             else:
                 col = int(cols)
                 molt = 1
             res.append(row[col]*molt)
         temp_coord.append(res)
     self.plot_FiltOrigCoord = temp_coord
             
     # recompute the PCA: array convert, calculate PCA, transpose
     self.plot_FiltOrigCoord = np.array(self.plot_FiltOrigCoord,dtype='float')
     #print self.plot_FiltOrigCoord.shape
     self.plot_NewPcaCoord = pca(self.plot_FiltOrigCoord, output_dim=2)	#other way -> y = mdp.nodes.PCANode(output_dim=2)(array)
     self.plot_NewPcaCoordTr = np.transpose(self.plot_NewPcaCoord)
     pca_X2=np.array(self.plot_NewPcaCoordTr[0],dtype='float')
     pca_Y2=np.array(self.plot_NewPcaCoordTr[1],dtype='float')
     
     # Start section of testing with good plots                              # 4 TESTING!
     Xsyn_2=[]
     Ysyn_2=[]
     Xbad_2=[]
     Ybad_2=[]
     for index in range(len(self.plot_FiltPaths)):
         if self.plot_FiltPaths[index][:-1] in goodnames:
             Xsyn_2.append(pca_X2[index])
             Ysyn_2.append(pca_Y2[index])
         else:
             Xbad_2.append(pca_X2[index])
             Ybad_2.append(pca_Y2[index])
     
     # print second plot
     clustplot2=lhc.PlotObject()
     #clustplot2.add_set(pca_X2,pca_Y2)
     clustplot2.add_set(Xbad_2,Ybad_2)                                       # 4 TESTING!
     clustplot2.add_set(Xsyn_2,Ysyn_2)                                       # 4 TESTING!
     clustplot2.normalize_vectors()
     clustplot2.styles=['scatter', 'scatter','scatter']
     clustplot2.colors=[None,'red','green']
     clustplot2.destination=1
     self._send_plot([clustplot2])
     self.clustplot2=clustplot2
     '''
     
     # PRINT density plot
     clustplot2=lhc.PlotObject()
     clustplot2.add_set(filtered_pca_X,filtered_pca_Y)
     clustplot2.normalize_vectors()
     clustplot2.styles=['scatter', 'scatter','scatter']
     clustplot2.colors=[None,'red','green']
     clustplot2.destination=1
     self._send_plot([clustplot2])
     self.clustplot2=clustplot2
     
     # printing results
     config_pca1 = config[0].replace("*", "x").rstrip("\n")
     config_pca2 = config[2].replace("*", "x").rstrip("\n")
     print ""
     print "- START: "+file_name
     print "Curve totali: ", nPlotTot
     #print "Curve totali good: ", nPlotGood                                  # 4 TESTING!
     print "- FILTRO 1: 0-500 e NaN"
     print "Curve totali rimaste: ", len(self.plot_origCoord)
     #print 'Curve good rimaste: ', len(Xsyn_1)                               # 4 TESTING!
     print "- FILTRO 2: PCA:"+config_pca1+" e DENSITA:"+str(my_filter)
     print "Curve totali rimaste: ", len(self.plot_FiltOrigCoord)
     #print 'Curve good rimaste: ', len(Xsyn_2)                               # 4 TESTING!
     print "Piu alta: ", tallest
     #print "- FILTRO 3: 2'PCA:"+config_pca2
     print ""
     
     # -- exporting coordinates and plot of PCA in debug mode! --
     if config[3].find("true")!=-1:
         # 1st PCA: save the plot and build the coordinates file
         self.do_export(file_name.replace("coordinate_", "debug_pca1graph_").replace('.txt','_'+config_pca1) + " 0")
         f = open(file_name.replace("coordinate_", "debug_pca1coor_").replace('.txt','_'+config_pca1+'.txt'),'w')
         for i in range(len(pca_X)):
             f.write (str(i) + "\t" + str(pca_X[i]) + "\t" + str(pca_Y[i]) + "\n")
         f.close()
         # 2nd PCA: save the plot and build the coordinates file
         #self.do_export(file_name.replace("coordinate_", "debug_pca2graph_").replace('.txt','_'+config_pca2) + " 1")
         #f = open(file_name.replace("coordinate_", "debug_pca2coor_").replace('.txt','_'+config_pca2+'.txt'),'w')
         #for i in range(len(pca_X2)):
         #    f.write (str(i) + "\t" + str(pca_X2[i]) + "\t" + str(pca_Y2[i]) + "\n")
         #f.close()
         #DENSITY: save plot
         self.do_export(file_name.replace("coordinate_", "debug_densitygraph_").replace('.txt','_'+config_pca1+'_'+str(my_filter).replace(".",",")) + " 1")
         f = open(file_name.replace("coordinate_", "debug_densitycoor_").replace('.txt','_'+config_pca1+'_'+str(my_filter).replace(".",",")+'.txt'),'w')
         for i in range(len(filtered_pca_X)):
             f.write (str(i) + "\t" + str(filtered_pca_X[i]) + "\t" + str(filtered_pca_Y[i]) + "\n")
         f.close()
         #ALL GOOD COORDINATES (without NaN and 0<x<500)
         f = open(file_name.replace("coordinate_", "debug_allgoodcoor_"),'w')
         for i in range(len(self.plot_myCoord)):
             for cel in self.plot_myCoord[i]:
                 f.write (" ; " + str(cel))
             f.write ("\n")
         f.close()
     
     # pCLUSTER SAVING!!!
     import shutil
     pcl_name = file_name.replace("coordinate_", "goodplots_").replace('.txt','_'+config_pca1+'_'+str(my_filter).replace(".",","))
     if os.path.exists(pcl_name+slash): shutil.rmtree(pcl_name)
     os.mkdir(pcl_name+slash)
     f = open(pcl_name+'.txt','w')
     for i in range(len(self.plot_FiltPaths)):
         myfile = str(self.plot_FiltPaths[i]).rstrip("\n")
         f.write (myfile+"\n")
         shutil.copy2(myfile, pcl_name)
     f.close()
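The density filter at the heart of do_pca, in isolation: score every PCA point with a Gaussian kernel-density estimate and keep only the points above a threshold. The points are synthetic; the magic divisor is the default used above:

import numpy as np
import scipy.stats

pts = np.random.randn(2, 400)            # two rows: pca_X and pca_Y
kernel = scipy.stats.gaussian_kde(pts)
density = kernel(pts)                    # one score per point
my_filter = density.max() / 3.242311147  # same default threshold as above
kept = pts[:, density > my_filter]
print(kept.shape)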
Example #39
def PCA(x):
    # http://mdp-toolkit.sourceforge.net/
    return mdp.pca(x)