def getFeatureFilesDetails(featureNames):
    ''' Get a list of the files for a particular feature and their details '''
    fileDetails = []
    for featureName in featureNames:
        # Get names of feature folders
        rootPath = FFP.getRootPath(featureName)
        featuresPath = FFP.getFeatureFolderPath(rootPath, featureName)
        pieceFolders = getFolderNames(featuresPath, orderAlphabetically=True)
        # Iterate over pieces
        for pieceFolder in pieceFolders:
            performanceFiles = getFileNames(featuresPath + pieceFolder,
                                            endsWith='.csv',
                                            orderAlphabetically=True)
            # Iterate over performances
            for performanceFile in performanceFiles:
                fileDetails.append({
                    'Feature': featureName,
                    'Piece': pieceFolder,
                    'Performance': rcut(performanceFile,
                                        FFP.fileSuffix[featureName]),
                    'Filename': performanceFile
                })
    df = pd.DataFrame(fileDetails)
    df.to_csv('Feature File Details.csv')
def getFeatureValuesDataFrame(featureName, numFolders, numFilesPerFolder,
                              featureFileType='.csv'):
    ''' Returns a dataframe of the feature values for a specified number of performances '''
    # Get names of feature folders
    rootPath = FFP.getRootPath(featureName)
    pieceFolders = getFolderNames(rootPath, contains='mazurka',
                                  orderAlphabetically=True)
    # Limit the number of piece folders if requested
    if numFolders is not None:
        pieceFolders = pieceFolders[:numFolders]
    featureDataFrames = []
    # Iterate over pieces
    for pieceFolder in pieceFolders:
        print 'processing folder: %s' % pieceFolder
        featuresPath = FFP.getFeatureFolderPath(rootPath + pieceFolder,
                                                featureName)
        performanceFiles = getFileNames(
            featuresPath, endsWith=featureFileType,
            orderAlphabetically=True)[:numFilesPerFolder]
        # Iterate over performances
        for performanceFile in performanceFiles:
            print '\tprocessing file: %s' % performanceFile
            featureFn = os.path.join(featuresPath, performanceFile)
            if featureFileType == '.csv':
                featureDataFrames.append(
                    pd.read_csv(featureFn, header=None, index_col=0))
            elif featureFileType == '.pkl':
                featureDataFrames.append(pd.read_pickle(featureFn))
    dfAllPerformances = pd.concat(featureDataFrames, ignore_index=True)
    return dfAllPerformances
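# A minimal usage sketch for getFeatureValuesDataFrame. The feature name and
# folder/file counts below are illustrative assumptions, not values taken from
# this codebase: load the values for the first 5 performances of the first 10
# pieces and inspect the per-column means.
dfChroma = getFeatureValuesDataFrame('chroma', numFolders=10,
                                     numFilesPerFolder=5,
                                     featureFileType='.csv')
print dfChroma.mean()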
def getFeatureFileDict(piecesPath, pieceFolder, featuresPath, featureName,
                       numFiles=None):
    '''
    Returns a dictionary of all the performances of a given piece

    The dictionary keys are the filenames with the standard suffix for that
    feature removed. The dictionary values are FeatureFileProps objects
    holding the FileName, FilePath and PieceId.

    If numFiles is None then all files will be returned, otherwise return the
    first numFiles files alphabetically.
    '''
    featureFileDict = {}
    featureFileSuffix = FeatureFileProps.fileSuffix[featureName]
    featureFileNames = sorted(getFileNames(featuresPath, featureFileSuffix,
                                           True))
    if numFiles is not None:
        featureFileNames = featureFileNames[:numFiles]
    featureFilePaths = [featuresPath + fName for fName in featureFileNames]
    # Remove the feature file suffix to get the performance ids
    # (str.rstrip strips a character set, not a suffix, so slice instead)
    featureFileIds = [
        fName[:-len(featureFileSuffix)] for fName in featureFileNames
    ]
    for i in np.arange(len(featureFileIds)):
        fileName = featureFileNames[i]
        filePath = featureFilePaths[i]
        pieceId = pieceFolder
        featureFileDict[featureFileIds[i]] = FeatureFileProps(
            fileName, filePath, pieceId)
    return featureFileDict
def loadCRPfiles():
    ''' Loads all CRP files into memory '''
    CRPfiles = sorted(getFileNames(CRPpath, '.npy'))
    CRPs = []
    for CRPfile in CRPfiles:
        with open(CRPpath + CRPfile, 'rb') as fCRP:
            CRPs.append(fCRP.read())
    return CRPfiles, CRPs
def cleanRunFolder(runName=None, cleanCRPfolder=True, cleanNCDfolder=True):
    # Remove CRP files
    if cleanCRPfolder:
        crpFiles = getFileNames(CRPpath, endsWith='.npy')
        for crpFile in crpFiles:
            os.remove(CRPpath + crpFile)
    # Remove NCD files
    if cleanNCDfolder:
        ncdFiles = getFileNames(NCDpath, endsWith='.pkl')
        for ncdFile in ncdFiles:
            os.remove(NCDpath + ncdFile)
    # Empty Run Folder and Run History Folder
    if runName is not None:
        resultsPath = NCDpath + runName + '/'
        resultsFiles = getFileNames(resultsPath)
        for resultsFile in resultsFiles:
            os.remove(resultsPath + resultsFile)
        historyPath = runHistoryPath + runName + '/'
        historyFiles = getFileNames(historyPath)
        for historyFile in historyFiles:
            os.remove(historyPath + historyFile)
def getRunHistoryDataFrame():
    ''' Loads the history of runs and returns a dataframe of the settings '''
    lstRunHistory = []
    runHistoryFiles = getFileNames(runHistoryPath, endsWith='.pkl',
                                   orderAlphabetically=True)
    for rhFile in runHistoryFiles:
        with open(runHistoryPath + rhFile, 'rb') as f:
            lstRunHistory.append(pickle.load(f))
    df = pd.DataFrame(lstRunHistory)
    return df
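# Example usage (a sketch, assuming the run-history pickles carry the keys
# used by getNCDresults further down, e.g. 'featureName'): count how many
# recorded runs exist per feature.
dfRuns = getRunHistoryDataFrame()
print dfRuns.groupby('featureName').size()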
def getFeatureFileDictTestSet(cls, pieceFolder, featuresPath, featureName):
    featureFileDict = {}
    featureFileSuffix = FeatureFileProps.getFileSuffix(featureName)
    featureFileNames = getFileNames(featuresPath,
                                    endsWith=featureFileSuffix,
                                    orderAlphabetically=True)
    featureFileNames = getTestSetPerformances(featureFileNames)
    featureFilePaths = [featuresPath + fName for fName in featureFileNames]
    featureFileIds = [
        fName[:-len(featureFileSuffix)] for fName in featureFileNames
    ]
    for i in range(len(featureFileIds)):
        fileName = featureFileNames[i]
        filePath = featureFilePaths[i]
        pieceId = pieceFolder
        featureFileDict[featureFileIds[i]] = FeatureFileProps(
            fileName, filePath, pieceId, featureFileIds[i])
    return featureFileDict
def getFeatureFrequenciesDataFrame(featureName, weightMatrix, biasesMatrix,
                                   featureOffset, featureScaling,
                                   NNtimeStacking, numFolders,
                                   numFilesPerFolder):
    # Get the piece folders
    # ('contains' avoids picking up the new powerspectrum folder)
    piecesPath = FFP.getRootPath(featureName)
    piecesFolders = getFolderNames(piecesPath, contains='mazurka',
                                   orderAlphabetically=True)[:20]
    if numFolders is not None:
        piecesFolders = piecesFolders[:numFolders]
    # For each piece
    featureDataFrames = []
    for piecesFolder in piecesFolders:
        # Get performances of the piece
        featuresPath = FFP.getFeatureFolderPath(piecesPath + piecesFolder,
                                                featureName)
        performances = getFileNames(featuresPath, orderAlphabetically=True,
                                    endsWith='.csv')
        if numFilesPerFolder is not None:
            performances = performances[:numFilesPerFolder]
        pf = 0
        for performance in performances:
            pf += 1
            print 'Transforming Features %i' % pf
            # Load feature file and transform
            dfTransformedFeatures = loadAndTransformFeatureFile(
                featuresPath + performance, featureOffset, featureScaling,
                NNtimeStacking, weightMatrix, biasesMatrix)
            featureDataFrames.append(dfTransformedFeatures)
    # Concatenate the transformed features from all performances
    dfAllPerformances = pd.concat(featureDataFrames, ignore_index=True)
    return dfAllPerformances
def getFeatureFileDict(piecesPath, pieceFolder, featuresPath, featureName):
    '''
    Returns a dictionary of all the performances of a given piece

    The dictionary keys are the filenames with the standard suffix for that
    feature removed. The dictionary values are new dictionaries with keys and
    associated values for FileName, FilePath and PieceId.
    '''
    featureFileDict = {}
    featureFileSuffix = featuresDict[featureName]['file suffix']
    featureFileNames = getFileNames(featuresPath, featureFileSuffix)
    featureFilePaths = [featuresPath + fName for fName in featureFileNames]
    # Remove the feature file suffix to get the performance ids
    # (str.rstrip strips a character set, not a suffix, so slice instead)
    featureFileIds = [
        fName[:-len(featureFileSuffix)] for fName in featureFileNames
    ]
    for i in np.arange(len(featureFileIds)):
        featureFileDict[featureFileIds[i]] = {}
        featureFileDict[featureFileIds[i]]['FileName'] = featureFileNames[i]
        featureFileDict[featureFileIds[i]]['FilePath'] = featureFilePaths[i]
        featureFileDict[featureFileIds[i]]['PieceId'] = pieceFolder
    return featureFileDict
def getFeatureFileDict(piecesPath, pieceFolder, featuresPath, featureName,
                       runType):
    '''
    Returns a dictionary of all the performances of a given piece

    The dictionary keys are the filenames with the standard suffix for that
    feature removed. The dictionary values are FeatureFileProps objects
    holding the FileName, FilePath and PieceId.

    runType selects which split of the performances is returned:
    'training', 'validation' or 'test'.
    '''
    featureFileDict = {}
    featureFileSuffix = FeatureFileProps.getFileSuffix(featureName)
    featureFileNames = getFileNames(featuresPath,
                                    endsWith=featureFileSuffix,
                                    orderAlphabetically=True)
    if runType == 'training':
        featureFileNames = getTrainingSetPerformances(featureFileNames)
    elif runType == 'validation':
        featureFileNames = getValidationSetPerformances(featureFileNames)
    elif runType == 'test':
        featureFileNames = getTestSetPerformances(featureFileNames)
    featureFilePaths = [featuresPath + fName for fName in featureFileNames]
    # Strip the feature file suffix (not the literal string 'featureFileSuffix')
    featureFileIds = [
        fName[:-len(featureFileSuffix)] for fName in featureFileNames
    ]
    for i in np.arange(len(featureFileIds)):
        fileName = featureFileNames[i]
        filePath = featureFilePaths[i]
        pieceId = pieceFolder
        featureFileDict[featureFileIds[i]] = FeatureFileProps(
            fileName, filePath, pieceId, featureFileIds[i])
    return featureFileDict
def createNCDfiles(existingNCDs, processPool, featureName, downSampleFactor,
                   timeDelay, dimension, method, neighbourhoodSize,
                   numFilesPerFolder, sequenceLength, weightMatrix=None,
                   biases=None, featureOffset=0.0, featureScaling=1.0):
    '''
    Inputs:
    :existingNCDs: a list of existing NCD files in order to avoid duplication
    :processPool: a pool of multiprocessing processes to use for running the script
    :featureName: the name of the feature e.g. 'chroma', 'mfcc'
    :downSampleFactor: the factor to use in downsampling the original signals before creating CRPs
    :timeDelay: the time delay to use in creating the CRPs
    :dimension: the embedding dimension to use in creating the CRPs
    :method: the method to use in creating the CRPs
    :neighbourhoodSize: the neighbourhood size to use in creating the CRPs
    :numFilesPerFolder: the number of performances of each piece to use - set to None to use all performances
    :sequenceLength: fixed sequence length to normalise CRPs to (use 'var' for variable length)
    :weightMatrix: a matrix of weights (inputFeatureLength rows x outputFeatureLength columns)
        to transform the input feature files with before calculating the CRPs
    '''
    mazurkasPath = FeatureFileProps.rootPath[featureName]
    mazurkaIds = getFolderNames(mazurkasPath, True)[:20]
    if existingNCDs is not None:
        existingNCDs = set(existingNCDs)  # makes checking faster

    # Get performances from folders
    featureFileDict = getFeatureFileDictAllFolders(mazurkasPath, mazurkaIds,
                                                   featureName,
                                                   numFilesPerFolder)

    # Create list of required NCD files
    requiredNCDs = []
    featureFileIds = featureFileDict.keys()
    numFeatureFiles = len(featureFileIds)
    print 'Checking for existing NCD files...'
    for f1 in np.arange(numFeatureFiles - 1):
        featureFilePath1 = featureFileDict[featureFileIds[f1]].filePath
        pc1Id = featureFileDict[featureFileIds[f1]].pieceId
        pc1pfId = featureFileIds[f1]
        for f2 in np.arange(f1, numFeatureFiles):
            featureFilePath2 = featureFileDict[featureFileIds[f2]].filePath
            pc2Id = featureFileDict[featureFileIds[f2]].pieceId
            pc2pfId = featureFileIds[f2]
            ncdProps = NCDprops(pc1Id, pc1pfId, pc2Id, pc2pfId, method,
                                dimension, timeDelay, neighbourhoodSize,
                                downSampleFactor, sequenceLength, featureName,
                                featureFilePath1, featureFilePath2)
            if not NCDexists(ncdProps.getFileName(),
                             existingNCDs=existingNCDs):
                requiredNCDs.append(ncdProps)
    print 'Number of NCD files missing for combination: %i' % len(requiredNCDs)

    # Create required CRPs for NCD files
    if len(requiredNCDs) > 0:
        # Create CRP files and save to the CRPs folder
        print 'Calculating # of required CRP files'
        requiredCRPs = []
        sourceCRPs = []
        for requiredNCD in requiredNCDs:
            crp1 = requiredNCD.getCRP1()
            crp2 = requiredNCD.getCRP2()
            sourceCRPs.append(crp1)
            sourceCRPs.append(crp2)
            if not crp1.hasExistingFile():
                requiredCRPs.append(crp1)
            if not crp2.hasExistingFile():
                requiredCRPs.append(crp2)
        requiredCRPs = CRPprops.uniqueCRPprops(requiredCRPs)
        sourceCRPs = CRPprops.uniqueCRPprops(sourceCRPs)
        numRequiredCRPs = len(requiredCRPs)
        print 'Creating %i required CRP files' % numRequiredCRPs
        if numRequiredCRPs > 0:
            CRPargList = []
            for crp in requiredCRPs:
                crp.weightMatrix = weightMatrix
                crp.biases = biases
                crp.featureOffset = featureOffset
                crp.featureScaling = featureScaling
                CRPargList.append((crp, ))
            processPool.map(multi_createCRPfile, CRPargList)

        # Load CRP files into memory
        print 'Loading %i CRP files' % len(sourceCRPs)
        CRPfiles = loadCRPfiles(sourceCRPs)

        # Create NCD files in batches of 100
        numNCDs = len(requiredNCDs)
        print 'Creating %i NCD files' % numNCDs
        NCDindex = 0
        while NCDindex < numNCDs:
            NCDargList = []
            for iNCD in np.arange(NCDindex, min(NCDindex + 100, numNCDs)):
                requiredNCD = requiredNCDs[iNCD]
                NCDfn = requiredNCD.getFileName()
                CRPtuple1 = requiredNCD.getCRP1().toTuple(False)
                CRPtuple2 = requiredNCD.getCRP2().toTuple(False)
                try:
                    NCDargList.append(
                        (NCDfn, CRPfiles[CRPtuple1], CRPfiles[CRPtuple2]))
                except KeyError:
                    # Skip pairs whose CRP data could not be loaded
                    pass
            if NCDargList:
                processPool.map(multi_createNCDfile, NCDargList)
            NCDindex += 100
            print '\r%i...' % NCDindex,

    # Delete CRP files
    print 'Deleting CRP files'
    for CRPfilename in getFileNames(CRPpath, '.npy', True):
        try:
            os.remove(CRPpath + CRPfilename)
        except OSError:
            pass
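# Sketch of how createNCDfiles might be driven end to end. The CRP settings
# below (downSampleFactor, timeDelay, dimension, method, neighbourhoodSize)
# are illustrative assumptions rather than values prescribed by this codebase,
# and the existing-NCD list is taken here from the NCD folder contents.
from multiprocessing import Pool

processPool = Pool(4)
existingNCDs = getFileNames(NCDpath, endsWith='.pkl')
createNCDfiles(existingNCDs, processPool, 'chroma', downSampleFactor=2,
               timeDelay=1, dimension=1, method='fan', neighbourhoodSize=0.05,
               numFilesPerFolder=5, sequenceLength='var')
processPool.close()
processPool.join()
# Consolidate the newly written NCD pickles into a results dataframe
convertNCDfiles('example_run')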
# Load weights and biases
if NNtype is not None:
    weightMatrix, biases, featureOffset, featureScaling = get_NN_NCD_params(
        NNtype, featureName, learningRate, learningRateBoostFactor,
        corruptionLevel, numOriginalFeatures, numNewFeatures, batchSize,
        freqStd=frequencyStandardisation, NNnumFolders=numFolders,
        NNnumFilesPerFolder=numFilesPerFolder, NNtimeStacking=timeStacking)

# Load (and optionally transform) the feature files
p = 0
featuresDataFrames = []
for piecesFolder in piecesFolders:
    performancesPath = FFP.getFeatureFolderPath(piecesPath + piecesFolder + '/',
                                                featureName)
    performances = getFileNames(performancesPath, orderAlphabetically=True,
                                endsWith='.csv')
    if numFilesPerFolder is not None:
        performances = performances[:numFilesPerFolder]
    for performance in performances:
        p += 1
        print '\rloading feature file %i...' % p,
        performanceFilePath = performancesPath + performance
        if NNtype is None:
            featuresDataFrames.append(loadFeatureFile(performanceFilePath))
        else:
            featuresDataFrames.append(
                loadAndTransformFeatureFile(performanceFilePath,
                                            featureOffset, featureScaling,
                                            timeStacking, weightMatrix,
                                            biases))
print
def cleanNCDfolder():
    # Remove NCD files
    ncdFiles = getFileNames(NCDpath, endsWith='.pkl')
    for ncdFile in ncdFiles:
        os.remove(NCDpath + ncdFile)
def cleanCRPfolder():
    # Remove CRP files
    crpFiles = getFileNames(CRPpath, endsWith='.npy')
    for crpFile in crpFiles:
        os.remove(CRPpath + crpFile)
def convertNCDfiles(dataFrameFileName):
    '''
    Converts NCD results files in the NCD folder into a pandas dataframe

    If the dataframe already exists with old results then the new results are appended
    '''
    dataFrameFileName = rcut(dataFrameFileName, '.pkl.res') + '.pkl.res'

    # Load new NCD files
    NCDfiles = [
        fn for fn in getFileNames(NCDpath, endsWith='.pkl')
        if reNCDfilename.search(fn)
    ]
    print 'Total number of files: %i' % len(NCDfiles)
    lstNCDs = []
    iFile = 0
    print 'Reading files...'
    for NCDfile in NCDfiles:
        try:
            NCDfileDict = pickle.load(open(NCDpath + NCDfile, 'rb'))
            # Remove the '.pkl' suffix (rstrip would strip a character set)
            NCDfileDict['FileName'] = rcut(NCDfile, '.pkl')
            m = reNCDfilename.search(NCDfile)
            NCDfileDict['Piece 1 Id'] = m.group(1)
            NCDfileDict['Piece 1 Performance Id'] = m.group(2)
            NCDfileDict['Piece 2 Id'] = m.group(3)
            NCDfileDict['Piece 2 Performance Id'] = m.group(4)
            NCDfileDict['CRP Method'] = m.group(5)
            NCDfileDict['CRP Dimension'] = float(m.group(6))
            NCDfileDict['CRP Time Delay'] = float(m.group(7))
            NCDfileDict['CRP Neighbourhood Size'] = float(m.group(8))
            NCDfileDict['Downsample Factor'] = m.group(9)
            NCDfileDict['Feature'] = m.group(10)
            NCDfileDict['Sequence Length'] = m.group(11)
            NCDfileDict['File DateTime'] = time.ctime(
                os.path.getmtime(NCDpath + NCDfile))
            lstNCDs.append(copy.deepcopy(NCDfileDict))
        except Exception:
            print 'Error reading file: %s' % NCDfile
        iFile += 1
        if iFile % 10000 == 0:
            print 'Processing file #%i' % iFile

    print 'Creating dataframe from results files'
    dfNewNCDs = pd.DataFrame(lstNCDs)

    # Check for existing NCD dataframe
    if os.path.exists(NCDpath + dataFrameFileName):
        # Read old NCDs dataframe and concatenate new NCDs
        print 'Reading existing results dataframe...'
        dfOldNCDs = pd.read_pickle(NCDpath + dataFrameFileName)
        dfAllNCDs = pd.concat([dfOldNCDs, dfNewNCDs], ignore_index=True)
    else:
        dfAllNCDs = dfNewNCDs

    # Save file
    print 'Saving results dataframe %s...' % dataFrameFileName
    dfAllNCDs.to_pickle(NCDpath + dataFrameFileName)

    # Delete old NCD files
    print 'Deleting old results files...'
    for NCDfile in NCDfiles:
        os.remove(NCDpath + NCDfile)
    return dfAllNCDs
def getNCDresults(subFolder='', featureNames=None, downSampleFactors=None,
                  methods=None, dimensions=None, timeDelays=None,
                  neighbourhoodSizes=None, numFilesPerFolder=None,
                  sequenceLengths=None):
    '''
    Loads NCD results from results dataframes in the NCD folder

    If you want to only select some results then for each parameter you want
    to filter, include a list of the values you want to keep
    '''
    if subFolder != '':
        subFolder = subFolder.rstrip('/') + '/'
    runHistoryFiles = getFileNames(runHistoryPath + subFolder,
                                   endsWith='.pkl',
                                   orderAlphabetically=True)
    resultsDataFrames = []
    for rhFile in runHistoryFiles:
        # Load history file
        runDict = pickle.load(open(runHistoryPath + subFolder + rhFile, 'rb'))
        useFile = True
        # Check filters
        if featureNames is not None and runDict['featureName'] not in featureNames:
            useFile = False
        if downSampleFactors is not None and runDict['downSampleFactor'] not in downSampleFactors:
            useFile = False
        if methods is not None and runDict['method'] not in methods:
            useFile = False
        if dimensions is not None and runDict['dimension'] not in dimensions:
            useFile = False
        if timeDelays is not None and runDict['timeDelay'] not in timeDelays:
            useFile = False
        if neighbourhoodSizes is not None and runDict['neighbourhoodSize'] not in neighbourhoodSizes:
            useFile = False
        if numFilesPerFolder is not None and runDict['numFilesPerFolder'] not in numFilesPerFolder:
            useFile = False
        if sequenceLengths is not None and runDict['sequenceLength'] not in sequenceLengths:
            useFile = False
        # Load results file
        if useFile:
            # Read dataframe and append results
            print 'Reading %s...' % (rhFile + '.res')
            resultsDataFrames.append(
                pd.read_pickle(NCDpath + subFolder + rhFile + '.res'))
    # Create and return dataframe of all results
    print 'Creating results dataframe'
    dfAll = pd.concat(resultsDataFrames)
    return dfAll
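# Example of filtering results (a sketch; the feature name and downsample
# factor are illustrative values): load only the chroma runs with a downsample
# factor of 2 and count results per CRP neighbourhood size.
dfChromaResults = getNCDresults(featureNames=['chroma'],
                                downSampleFactors=[2])
print dfChromaResults.groupby('CRP Neighbourhood Size').size()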
    'numFilesPerFolder': numFilesPerFolder,
    'sequenceLength': setting['Sequence Length']
}
runTime = str(datetime.now()).replace(':', '-')
pickle.dump(runDict, open(runHistoryPath + runTime + '.pkl', 'wb'))

# Convert NCD files into a dataframe
convertNCDfiles(runTime)

# Create subfolders and move results files into them
NCDdest = NCDpath + subFolder + '/'
runHistDest = runHistoryPath + subFolder + '/'
if not os.path.exists(NCDdest):
    os.makedirs(NCDdest)
if not os.path.exists(runHistDest):
    os.makedirs(runHistDest)
for fn in getFileNames(NCDpath, '.pkl.res'):
    shutil.move(NCDpath + fn, NCDdest)
for fn in getFileNames(runHistoryPath, '.pkl'):
    shutil.move(runHistoryPath + fn, runHistDest)

# Get the overall MAP of the run and add to the setting
MAPresult = getMAPresult(featureName, CRPmethod, setting['Dimension'],
                         setting['Neighbourhood Size'],
                         setting['Time Delay'],
                         setting['DownSample Factor'], numFilesPerFolder,
                         setting['Sequence Length'], subFolder)
if MAPresult is not None:
    print 'Mean Average Precision: %0.3f\n' % MAPresult
else:
    print 'No MAP result found!'
setting['Mean Average Precision'] = MAPresult
def csvsToTheanoDataSet2(inputPaths, outputFn, numFilesPerFolder,
                         timeStepsPerFeature, cropFeaturesToSize=None,
                         frequencyStandardisation=False,
                         trainPercentage=70.0, validationPercentage=15.0,
                         testPercentage=15.0):
    '''
    Convert a batch of .csv files created by SonicAnnotator to a .pkl.gz
    training and testing set for input into Theano, with sequential stacking
    of features to incorporate temporal effects

    Inputs:
    :inputPaths: The input folders to get examples from
    :outputFn: Path to the output file
    :numFilesPerFolder: The number of files to use from each folder (set to None to use all files)
    :timeStepsPerFeature: The number of time steps of original features to include in each new feature
    :cropFeaturesToSize: Set to an integer if only some features should be used -
        features from the lower end of the range will be used, i.e. lower frequencies
    :frequencyStandardisation: whether to standardise the range of each frequency band individually
    :trainPercentage: The percentage of examples to use for training
    :validationPercentage: The percentage of examples to use for validation
    :testPercentage: The percentage of examples to use for testing

    TODO: implement a standardisation function argument
    '''
    allFeatures = None
    pieceIndex = 0
    # For each folder (piece)
    for inputPath in inputPaths:
        print 'Converting features in folder %s' % inputPath
        # Get list of numFilesPerFolder feature files
        inputFiles = getFileNames(inputPath, endsWith='.csv',
                                  orderAlphabetically=True)
        if numFilesPerFolder is not None:
            inputFiles = inputFiles[:numFilesPerFolder]
        # For each file (performance)
        for inputFn in inputFiles:
            print '\t%s' % inputFn
            # Read file
            fileFeatures = np.genfromtxt(inputPath + inputFn, delimiter=',')
            # Drop first column (time)
            fileFeatures = fileFeatures[:, 1:]
            # Drop upper columns if specified
            if cropFeaturesToSize is not None:
                fileFeatures = fileFeatures[:, :cropFeaturesToSize]
            # Drop rows where all columns are zero
            fileFeatures = fileFeatures[~np.all(fileFeatures == 0, axis=1)]
            # Stack features according to the time argument
            numExamples = fileFeatures.shape[0]
            numFeatures = fileFeatures.shape[1]
            ffNew = np.zeros([numExamples - timeStepsPerFeature + 1,
                              timeStepsPerFeature * numFeatures])
            for ts in range(timeStepsPerFeature):
                ffNew[:, ts * numFeatures:(ts + 1) * numFeatures] = \
                    fileFeatures[ts:numExamples + 1 + ts - timeStepsPerFeature, :]
            fileFeatures = ffNew
            # Add a label column to the end
            numFeatures = fileFeatures.shape[1]
            labelledFileFeatures = np.ones(
                [fileFeatures.shape[0], numFeatures + 1]) * pieceIndex
            labelledFileFeatures[:, :-1] = fileFeatures
            # Add to allFeatures array
            if allFeatures is None:
                allFeatures = copy.deepcopy(labelledFileFeatures)
            else:
                allFeatures = np.vstack(
                    (allFeatures, copy.deepcopy(labelledFileFeatures)))
        pieceIndex += 1

    # Standardise feature range from 0 to 1
    print 'Standardising range...'
    standardisationFn = rcut(outputFn, '.pkl.gz') + '_standardisationValues.pkl.gz'
    if frequencyStandardisation:
        minFeatureValue = np.min(allFeatures[:, 0:numFeatures], axis=0)
        maxFeatureValue = np.max(allFeatures[:, 0:numFeatures], axis=0)
    else:
        minFeatureValue = np.min(np.min(allFeatures[:, 0:numFeatures]))
        maxFeatureValue = np.max(np.max(allFeatures[:, 0:numFeatures]))
    standardisationDict = {
        'Min Value': minFeatureValue,
        'Max Value': maxFeatureValue
    }
    pickle.dump(standardisationDict, open(standardisationFn, 'wb'))
    allFeatures[:, 0:numFeatures] = (allFeatures[:, 0:numFeatures] -
                                     minFeatureValue) / (maxFeatureValue -
                                                         minFeatureValue)
    print 'minFeatureValue = %s\nmaxFeatureValue = %s' % (minFeatureValue,
                                                          maxFeatureValue)

    # Shuffle Features 10 times
    print 'Shuffling...'
    for _ in np.arange(10):
        np.random.shuffle(allFeatures)

    # Extract training, validation and test sets
    numExamples = allFeatures.shape[0]
    numFeatures = allFeatures.shape[1] - 1
    trainingExamples = allFeatures[0:int(trainPercentage * numExamples / 100)]
    validationExamples = allFeatures[
        int(trainPercentage * numExamples / 100):
        int((trainPercentage + validationPercentage) * numExamples / 100)]
    testExamples = allFeatures[
        int((trainPercentage + validationPercentage) * numExamples / 100):]
    train_set = (trainingExamples[:, :numFeatures],
                 trainingExamples[:, numFeatures])
    valid_set = (validationExamples[:, :numFeatures],
                 validationExamples[:, numFeatures])
    test_set = (testExamples[:, :numFeatures],
                testExamples[:, numFeatures])

    # Write file for Theano
    print 'Writing Theano file...'
    outputFn = rcut(outputFn, '.pkl.gz') + '.pkl.gz'
    f = gzip.open(outputFn, 'wb')
    cPickle.dump((train_set, valid_set, test_set), f)
    f.close()
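# Reading the archive back (a sketch showing the inverse of the dump above;
# 'dataset.pkl.gz' is an illustrative path): the file holds the
# (train_set, valid_set, test_set) tuple written by cPickle, where each set is
# an (examples, labels) pair.
f = gzip.open('dataset.pkl.gz', 'rb')
train_set, valid_set, test_set = cPickle.load(f)
f.close()
trainX, trainY = train_set
print 'training examples: %i, features per example: %i' % trainX.shape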
def csvsToTheanoDataSet(inputPaths, outputFn, numFilesPerFolder,
                        trainPercentage=70.0, validationPercentage=15.0,
                        testPercentage=15.0):
    '''
    Convert a batch of .csv files created by SonicAnnotator to a .pkl.gz
    training and testing set for input into Theano

    Inputs:
    :inputPaths: The input folders to get examples from
    :outputFn: Path to the output file
    :numFilesPerFolder: The number of files to use from each folder (set to None to use all files)
    :trainPercentage: The percentage of examples to use for training
    :validationPercentage: The percentage of examples to use for validation
    :testPercentage: The percentage of examples to use for testing
    '''
    allFeatures = None
    pieceIndex = 0
    # For each folder (piece)
    for inputPath in inputPaths:
        print 'Converting features in folder %s' % inputPath
        # Get list of numFilesPerFolder feature files
        inputFiles = getFileNames(inputPath, endsWith='.csv',
                                  orderAlphabetically=True)
        if numFilesPerFolder is not None:
            inputFiles = inputFiles[:numFilesPerFolder]
        # For each file (performance)
        for inputFn in inputFiles:
            print '\t%s' % inputFn
            # Read file
            fileFeatures = np.genfromtxt(inputPath + inputFn, delimiter=',')
            # Drop first column (time)
            fileFeatures = fileFeatures[:, 1:]
            # Drop rows where all columns are zero
            fileFeatures = fileFeatures[~np.all(fileFeatures == 0, axis=1)]
            # Add a label column to the end
            numFeatures = fileFeatures.shape[1]
            labelledFileFeatures = np.ones(
                [fileFeatures.shape[0], numFeatures + 1]) * pieceIndex
            labelledFileFeatures[:, :-1] = fileFeatures
            # Add to allFeatures array
            if allFeatures is None:
                allFeatures = copy.deepcopy(labelledFileFeatures)
            else:
                allFeatures = np.vstack(
                    (allFeatures, copy.deepcopy(labelledFileFeatures)))
        pieceIndex += 1

    # Standardise feature range from 0 to 1
    print 'Standardising range...'
    minFeatureValue = np.min(allFeatures[:, 0:numFeatures])
    maxFeatureValue = np.max(allFeatures[:, 0:numFeatures])
    print 'minFeatureValue = %f, maxFeatureValue = %f' % (minFeatureValue,
                                                          maxFeatureValue)
    allFeatures[:, 0:numFeatures] = (allFeatures[:, 0:numFeatures] -
                                     minFeatureValue) / (maxFeatureValue -
                                                         minFeatureValue)

    # Shuffle Features 10 times
    print 'Shuffling...'
    for _ in np.arange(10):
        np.random.shuffle(allFeatures)

    # Extract training, validation and test sets
    numExamples = allFeatures.shape[0]
    numFeatures = allFeatures.shape[1] - 1
    trainingExamples = allFeatures[0:int(trainPercentage * numExamples / 100)]
    validationExamples = allFeatures[
        int(trainPercentage * numExamples / 100):
        int((trainPercentage + validationPercentage) * numExamples / 100)]
    testExamples = allFeatures[
        int((trainPercentage + validationPercentage) * numExamples / 100):]
    train_set = (trainingExamples[:, :numFeatures],
                 trainingExamples[:, numFeatures])
    valid_set = (validationExamples[:, :numFeatures],
                 validationExamples[:, numFeatures])
    test_set = (testExamples[:, :numFeatures],
                testExamples[:, numFeatures])

    # Write file for Theano
    print 'Writing Theano file...'
    outputFn = rcut(outputFn, '.pkl.gz') + '.pkl.gz'
    f = gzip.open(outputFn, 'wb')
    cPickle.dump((train_set, valid_set, test_set), f)
    f.close()