def getNaiveDists(oc, fld, topN, rt='t', reduc='none', comps=[]):
    '''
    For origin scale degree set oc, calculate its naive (or PCA-reduced) distance
    from the topN most unigram-probable chords in terms of TPDs taken from directory fld.
    Output a ranked list of the closest chords and their Manhattan distances.
    If reduc=='PCA', calculates Manhattan distances based on the components listed in comps.
    '''
    import csv
    import os
    import operator
    distList = []  # all the distances between the oc TPD and the topN TPD matrices
    # figure out which chords are in the topN most probable, for keeping
    allChordsList = []  # the names of the topN origin chords; chord names can stay as strings
    uniProbs = {}  # a dict of their unigram probs (float, so fractional values survive; unused below)
    with open('50ms 3 SDSets.csv', 'r', newline='\n') as chordFile:
        allChords = csv.reader(chordFile)
        for i, row in enumerate(allChords):
            # make a list of the topN most unigram-probable sds
            if i > topN - 1:
                break
            allChordsList.append(row[0])
            uniProbs[row[0]] = float(row[1])
    # get data for this origin chord (oc)
    if rt == 't':
        originpath = 'C:/Users/Andrew/workspace/DissWork/' + str(oc) + ' SDs prog probs 50ms.csv'
    elif rt == 'c':
        originpath = 'C:/Users/Andrew/workspace/DissWork/' + str(oc) + ' SDs prog probs 50msTRANS.csv'
    ocTPD = getOrderedTPDdata(originpath, allChordsList, reduc=reduc, rowtype=rt)
    # now iterate through all DC TPDs in fld
    listing = os.listdir(fld)
    flist = []  # origin chord labels/names, in order
    for f in listing:
        # toss out those not in allChordsList (i.e., not topN-probable)
        chdStr = f.split('.')[0]  # more csv kludging
        sdsStr = chdStr.split(']')[0] + ']'
        if sdsStr not in allChordsList:
            continue
        # any f reaching this point is a topN chord
        ocName = f.split(']')[0] + ']'
        flist.append(ocName)
        # get the data for this origin chord's destinations over time
        address = fld + f
        if rt == 'c':
            csvTransposer(address, 'temp_dcMat_c.csv')
            address = 'temp_dcMat_c.csv'
        dcTPD = getOrderedTPDdata(address, allChordsList, reduc=reduc, rowtype=rt)
        # now get the distance between the given oc and this origin chord's TPD array
        if reduc == 'PCA':
            hdr = 'no'
        else:
            hdr = 'yes'
        # Manhattan distance between the arrays
        distList.append([f, naiveDistance(ocTPD, dcTPD, headers=hdr, pcaComp=comps)])
    # now sort and output the distList
    s_distList = sorted(distList, key=operator.itemgetter(1))
    if reduc == 'PCA':
        outName = str(oc) + ' neighbors ' + str(topN) + 'PCA.csv'
    else:
        outName = str(oc) + ' neighbors ' + str(topN) + '.csv'
    with open(outName, 'w', newline='\n') as file:
        lw = csv.writer(file)
        lw.writerow(['scale degree set', 'TPD distance'])
        for row in s_distList:
            lw.writerow(row)
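# A minimal sketch of the Manhattan-distance helper called above. The real
# naiveDistance is defined elsewhere in this module; the signature here is
# inferred from the call site, and the row/column layout is an assumption:
# each TPD arrives as rows of values with an optional leading label column
# (headers=='yes'), and pcaComp, when non-empty, lists the component columns
# to compare for PCA-reduced data.
def naiveDistance(ocTPD, dcTPD, headers='yes', pcaComp=[]):
    start = 1 if headers == 'yes' else 0  # skip the label column if present
    dist = 0.0
    for rowA, rowB in zip(ocTPD, dcTPD):
        # compare either the listed PCA components or all data columns
        cols = pcaComp if pcaComp else range(start, len(rowA))
        for j in cols:
            dist += abs(float(rowA[j]) - float(rowB[j]))  # L1 (Manhattan) term
    return dist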
def bigramTPDcluster(fld, k):
    '''
    Input: the collection of sds temporal probability distributions (TPDs), abs(P) vs. time.
    Pull each individual (100-dim) TPD and label it as an OC-DC bigram vector.
    Run k-medoids on all bigram vectors to cluster by "progression similarity type"
    (i.e., OC1 goes to DC1 in the same way that OC2 -> DC2).
    NB: k-medoids converges LOCALLY, and this is very inefficient!
    [Likely a dead end, but kept in case it is needed in the future.]
    '''
    import csv
    import os
    import numpy
    import scipy.spatial.distance
    from sklearn.decomposition import PCA
    listing = os.listdir(fld)
    bigramTPDList = []
    bgList = []
    for f in listing:
        # get origin chord data; these files start out as (row, col) = (time window, dest chord)
        address = fld + f
        csvTransposer(address, 'tempTRANSmat.csv')
        with open('tempTRANSmat.csv', 'r', newline='\n') as transFile:
            # after transposing, each row is a destination chord with (time window) cols
            listOfRows = [row for row in csv.reader(transFile)]
        # turn all the empty entries into 0 abs probs
        for row in listOfRows:
            for j in range(len(row)):  # label col + 100 time windows
                if row[j] == '':
                    row[j] = 0
        # run PCA on the destination chord spectrum for the given origin chord,
        # skipping the header row and the label column
        distprobs = []
        for i in range(1, len(listOfRows)):
            distprobs.append([float(x) for x in listOfRows[i][1:]])
        # convert into a numpy array for PCA: (num DCs) rows x (100 time steps) cols
        probarr = numpy.array(distprobs)
        pca = PCA(n_components=3)
        # put the DC vectors into the PCA basis (fit once, not twice)
        transformed_data = pca.fit_transform(probarr)
        compn = pca.components_
        # to orient the components for comparison, we need at least 3 of them
        if len(compn) < 3:
            continue
        # NB!: for many chords, the components are noisy and don't tell us anything useful!
        if compn[0][0] < 0:  # set the first component to start positive (usually phonetic data)
            for dcrow in transformed_data:
                dcrow[0] = -1 * dcrow[0]
        if compn[1][0] > 0:  # set the second component to start negative (usually long-range key data)
            for dcrow in transformed_data:
                dcrow[1] = -1 * dcrow[1]
        if compn[2][0] > 0:  # set the third component to start negative (usually syntactic[?] data)
            for dcrow in transformed_data:
                dcrow[2] = -1 * dcrow[2]
        for i, dcrow in enumerate(transformed_data):
            bigramTPDList.append(dcrow)  # put PCA-basis bigram data in the list
            # labels, in the same order as the TPD list; distprobs skipped the
            # header row, so row i of transformed_data matches listOfRows[i+1]
            bgList.append([f, listOfRows[i + 1][0]])
    # now, k-medoids cluster based on bigramTPDList
    distMat = scipy.spatial.distance.pdist(bigramTPDList, 'cosine')  # condensed
    distMat_sq = scipy.spatial.distance.squareform(distMat)  # redundant, square
    clus_and_med = cluster(distMat_sq, k)
    # send out the results via csv
    meds = [bgList[med] for med in clus_and_med[1]]
    clus = []  # format: [bigram label, cluster assignment (medoid index), cluster medoid label]
    for l, bg in enumerate(bgList):
        clus.append([bg, clus_and_med[0][l], bgList[clus_and_med[0][l]]])
    with open('bigramTPD kmedoids PCAtest.csv', 'w', newline='\n') as file:
        lw = csv.writer(file)
        lw.writerow(meds)
        for row in clus:
            lw.writerow(row)
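# A minimal sketch of the k-medoids helper assumed by bigramTPDcluster above.
# The real cluster() lives elsewhere in this module; its contract is inferred
# from usage: take a square distance matrix and k, and return (assignments,
# medoid_indices), where each assignment is the *index* of the item's medoid
# (which is why bgList[assignment] recovers the medoid's label). This is plain
# Voronoi iteration, so like any k-medoids it only converges locally.
def cluster(distMat_sq, k, maxIter=100):
    import random
    n = len(distMat_sq)
    medoids = random.sample(range(n), k)  # random initial medoids
    for _ in range(maxIter):
        # assign each item to its nearest medoid
        assign = [min(medoids, key=lambda m: distMat_sq[i][m]) for i in range(n)]
        # for each cluster, promote the member minimizing total within-cluster distance
        newMedoids = []
        for m in medoids:
            members = [i for i in range(n) if assign[i] == m]
            if not members:  # degenerate empty cluster; keep the old medoid
                newMedoids.append(m)
                continue
            best = min(members, key=lambda c: sum(distMat_sq[c][j] for j in members))
            newMedoids.append(best)
        if set(newMedoids) == set(medoids):
            break  # locally converged
        medoids = newMedoids
    assign = [min(medoids, key=lambda m: distMat_sq[i][m]) for i in range(n)]
    return assign, medoids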