Example #1
def getNaiveDists(oc, fld, topN, rt='t', reduc='none', comps=[]):
    '''
    For origin scale degree set oc, calculate its naive (or PCA-reduced)
    distance from the topN most unigram-probable chords, in terms of TPDs
    taken from directory fld.
    Output a ranked list of the closest chords and their Manhattan distances.
    If reduc=='PCA', calculate Manhattan distances based on the components
    listed in comps.
    '''
    import csv
    import os
    import operator
    
    distList = []#here are all the distances between the oc TPD and the topN TPD matrices
    
    #figure out which chords are in the topN most probable for keeping
    allChords = csv.reader(open('50ms 3 SDSets.csv', 'r', newline=''))
    allChordsList = []#the names of the topN origin chords
    uniProbs = {}#a dict of their unigram probs (not used further in this function)
    for i, row in enumerate(allChords):
        #Make a list of the topN most unigram-probable sds
        if i > topN - 1:
            break
        allChordsList.append(row[0])
        uniProbs[row[0]] = float(row[1])#probabilities are fractional, so read as float
    #print(allChordsList[0])#Can leave chord names as strings
    
    #get data for this origin chord (oc)
    if rt=='t':
        originpath = 'C:/Users/Andrew/workspace/DissWork/'+str(oc)+' SDs prog probs 50ms.csv'
    elif rt=='c':
        originpath = 'C:/Users/Andrew/workspace/DissWork/'+str(oc)+' SDs prog probs 50msTRANS.csv'
    ocTPD = getOrderedTPDdata(originpath, allChordsList, reduc=reduc,rowtype=rt)
    
    #now iterate through all DC TPDs in fld
    listing = os.listdir(fld)
    flist = []#origin chord labels/names, in order
    for f in listing:
        #Toss out those not in allChordsList (i.e., not topN prob)
        chdStr = f.split('.')[0]#more csv kludging
        sdsStr = chdStr.split(']')[0] + ']'
        if sdsStr not in allChordsList:
            #print('skipping '+f)
            continue
        #any f reaching this point is a topN chord; reuse sdsStr as its label
        flist.append(sdsStr)
        
        #get the data for the origin chord's destinations over time
        address = fld + f
        if rt=='c':
            csvTransposer(address, 'temp_dcMat_c.csv')
            address = 'temp_dcMat_c.csv'
        dcTPD = getOrderedTPDdata(address, allChordsList, reduc=reduc,rowtype=rt)
            
        #now get the distance between given oc and this origin chord's TPD array
        if reduc=='PCA': hdr='no'
        else: hdr='yes'
        #get manhattan distance between arrays
        distList.append([f,naiveDistance(ocTPD, dcTPD,headers=hdr,pcaComp=comps)])
    
    #now sort and output the distList
    s_distList = sorted(distList,key=operator.itemgetter(1))
    if reduc=='PCA':
        outName = str(oc) + ' neighbors ' + str(topN) + 'PCA.csv'
    else:
        outName = str(oc) + ' neighbors ' + str(topN) + '.csv'
    with open(outName, 'w', newline='') as file:#with-block ensures the file is closed
        lw = csv.writer(file)
        lw.writerow(['scale degree set','TPD distance'])
        for row in s_distList:
            lw.writerow(row)
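
#The helper naiveDistance is not shown above. The following is a hypothetical
#sketch inferred from the call site, NOT the original implementation: it assumes
#both TPD arguments are rectangular arrays of equal shape, that headers=='yes'
#means the first row and first column hold labels, and that pcaComp lists the
#component columns to compare under PCA reduction.
import numpy as np

def naiveDistance(ocTPD, dcTPD, headers='yes', pcaComp=[]):
    '''Hypothetical sketch: Manhattan (L1) distance between two TPD arrays.'''
    a = np.asarray(ocTPD, dtype=object)
    b = np.asarray(dcTPD, dtype=object)
    if headers == 'yes':
        #assumption: first row and first column are labels, not data
        a = a[1:, 1:]
        b = b[1:, 1:]
    a = a.astype(float)
    b = b.astype(float)
    if pcaComp:
        #assumption: columns are PCA components; compare only those requested
        a = a[:, pcaComp]
        b = b[:, pcaComp]
    return float(np.abs(a - b).sum())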
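
#csvTransposer is also undefined in these examples. A minimal sketch, assuming
#it reads the csv at inPath and writes its transpose (rows swapped with
#columns) to outPath; zip_longest pads ragged rows with empty strings.
def csvTransposer(inPath, outPath):
    '''Hypothetical sketch: write the transpose of a csv file.'''
    import csv
    from itertools import zip_longest
    with open(inPath, 'r', newline='') as fin:
        rows = list(csv.reader(fin))
    with open(outPath, 'w', newline='') as fout:
        csv.writer(fout).writerows(zip_longest(*rows, fillvalue=''))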
Example #2
def bigramTPDcluster(fld, k):
    '''
    Input: the collection of sds temporal probability distributions (TPDs), abs(P) vs. time
    Pull each individual (100-dim) TPD and label it as an OC-DC bigram vector
    Run k-medoids on all bigram vectors to cluster by "progression similarity type"
    (i.e., OC1 goes to DC1 in the same way that OC2 goes to DC2)
    NB: k-medoids converges only locally, and this is very inefficient!
    [Likely a dead end, but kept in case it is needed in future]
    '''
    import csv
    import os
    import numpy
    import scipy.spatial.distance
    from sklearn.decomposition import PCA
    
    listing = os.listdir(fld)
    bigramTPDList = []
    bgList = []
    for f in listing:
        #Get origin chord data
        address = fld + f
        #these start out as (row, col) = (time window, dest chord)
        csvTransposer(address, 'tempTRANSmat.csv')#return value unused; the transposed csv is what matters
        allDests = csv.reader(open('tempTRANSmat.csv', 'r', newline=''))
        listOfRows = []
        #now, each row is a destination chord with (time window) cols
        for row in allDests:
            listOfRows.append(row)
        #turn all the empty entries into 0 abs probs
        for row in listOfRows:
            for j in range(len(row)):#label col + 100 time-window cols
                if row[j] == '':
                    row[j] = 0
        
        #run PCA on destination chord spectrum for given origin chord
        distprobs = []
        for i in range(1,len(listOfRows)):
            distprobs.append([float(x) for x in listOfRows[i][1:]])
        #print(len(distprobs[0]),distprobs[0])
        #convert into numpy array for PCA; num DCs rows x 100 ts cols
        probarr = numpy.array(distprobs)
        #print(probarr.shape,probarr[0])
        pca = PCA(n_components=3)
        #fit PCA and put the DC vectors into the PCA basis in one pass
        transformed_data = pca.fit_transform(probarr)
        compn = pca.components_
        
        #to orient the components for comparison, we need at least 3 of them
        if len(compn) < 3:
            continue
        #NB: for many chords, the components are noisy and do not tell us anything useful!
        if compn[0][0] < 0:#set the first component to start positive (usually phonetic data)
            for dcrow in transformed_data:
                dcrow[0] = -1*dcrow[0]
        if compn[1][0] > 0:#set the second component to start negative (usually long-range key data)
            for dcrow in transformed_data:
                dcrow[1] = -1*dcrow[1]
        if compn[2][0] > 0:#set the third component to start negative (usually syntactic[?] data)
            for dcrow in transformed_data:
                dcrow[2] = -1*dcrow[2]
        #print(len(transformed_data),len(transformed_data[1]))
        for i,dcrow in enumerate(transformed_data):
            bigramTPDList.append(dcrow)#put PCA-basis bigram data in list
            bgList.append([f,listOfRows[i][0]])#labels; same order as actual TPD list

    #now, k-medoids cluster based on bigramTPDList
    distMat = scipy.spatial.distance.pdist(bigramTPDList, 'cosine')#condensed
    distMat_sq = scipy.spatial.distance.squareform(distMat)#redundant, square
    clus_and_med = cluster(distMat_sq,k)
    
    #send out the results via csv
    #NB: cluster() is assumed to return [assignments, medoid indices], where each
    #point's assignment is the index of its cluster's medoid
    meds = [bgList[med] for med in clus_and_med[1]]
    clus = []#format: [bigram label, cluster assignment number, cluster medoid label]
    for l, bg in enumerate(bgList):
        clus.append([bg, clus_and_med[0][l], bgList[clus_and_med[0][l]]])
    csvName = 'bigramTPD kmedoids PCAtest.csv'
    with open(csvName, 'w', newline='') as file:#with-block ensures the file is closed
        lw = csv.writer(file)
        lw.writerow(meds)
        for row in clus:
            lw.writerow(row)
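
#The cluster helper called above is not shown either. Below is a hypothetical
#k-medoids sketch, NOT the original code, assuming cluster() takes a square
#distance matrix and k and returns [assignments, medoids], where each point's
#assignment is the index of its cluster's medoid (this matches how the labeling
#loop above indexes bgList with the assignment values).
import numpy as np

def cluster(distMat_sq, k, iters=100, seed=0):
    '''Hypothetical sketch of k-medoids on a precomputed square distance matrix.'''
    rng = np.random.default_rng(seed)
    n = distMat_sq.shape[0]
    medoids = rng.choice(n, size=k, replace=False)
    for _ in range(iters):
        #assign every point to its nearest medoid (by stored distance)
        assignments = medoids[np.argmin(distMat_sq[:, medoids], axis=1)]
        #move each medoid to the member minimizing total within-cluster distance
        new_medoids = []
        for m in medoids:
            members = np.where(assignments == m)[0]
            within = distMat_sq[np.ix_(members, members)].sum(axis=1)
            new_medoids.append(members[np.argmin(within)])
        new_medoids = np.array(sorted(new_medoids))
        if np.array_equal(new_medoids, medoids):
            break#converged (locally, as the docstring above warns)
        medoids = new_medoids
    assignments = medoids[np.argmin(distMat_sq[:, medoids], axis=1)]
    return [assignments, medoids]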