def combined_sn_gal_arrays_multiprocessing(self, snTemplateLocation, snTempFileList, galTemplateLocation, galTempFileList):
    # TODO: Maybe do memory mapping for these arrays
    """Build the combined SN + host-galaxy training arrays in parallel.

    Each (galaxy template, SN template) pair is processed by a worker via
    combined_sn_gal_templates_to_arrays; results are gathered into
    self.images / self.labelsIndexes / self.filenames / self.typeNames.
    Returns the four arrays (labels cast to uint16).
    """
    self.images = []
    self.labelsIndexes = []
    self.filenames = []
    self.typeNames = []

    # Without host-galaxy templates, use a single placeholder and pure SN spectra.
    if galTemplateLocation is None or galTempFileList is None:
        galTempList = [None]
        galTemplateLocation = None
        snFractions = [1.0]
    else:
        galTempList = temp_list(galTempFileList)
        snFractions = [0.99, 0.98, 0.95, 0.93, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]

    # snTempFileList may be a dict mapping SN filename -> age indexes to use,
    # or a plain template-list file.
    if isinstance(snTempFileList, dict):
        snTempList = list(snTempFileList.keys())
        ageIndexesDict = snTempFileList
    else:
        snTempList = temp_list(snTempFileList)
        ageIndexesDict = None

    argsList = []
    for galFile, snFile in itertools.product(galTempList, snTempList):
        if ageIndexesDict is not None:
            ageIdxDict = {snFile: ageIndexesDict[snFile]}
        else:
            # No explicit selection: allow any age index (1000 is an upper bound).
            ageIdxDict = {snFile: range(0, 1000)}
        argsList.append((snTemplateLocation, [snFile], galTemplateLocation, [galFile], snFractions, ageIdxDict))

    workerPool = mp.Pool()
    asyncResult = workerPool.map_async(self.combined_sn_gal_templates_to_arrays, argsList)
    workerPool.close()
    workerPool.join()

    outputs = asyncResult.get()
    for outputIndex, output in enumerate(outputs):
        self.collect_results(output)
        print('combining results...', outputIndex, len(outputs))

    self.images = np.array(self.images)
    self.labelsIndexes = np.array(self.labelsIndexes)
    self.filenames = np.array(self.filenames)
    self.typeNames = np.array(self.typeNames)

    print("Completed Creating Arrays!")

    return self.images, self.labelsIndexes.astype(np.uint16), self.filenames, self.typeNames
def train_test_split(self):
    """ Split training set before creating arrays. Maybe should change this to
    include ages in train/test split instead of just SN files.
    """
    fileList = temp_list(copy.copy(self.snidTempFileList))
    # Seeded shuffle so the same split is produced on every run.
    random.Random(42).shuffle(fileList)

    trainSize = int(self.trainFraction * len(fileList))
    dirName = os.path.dirname(self.snidTempFileList)
    trainListFileName = os.path.join(dirName, 'train_templist.txt')
    testListFileName = os.path.join(dirName, 'test_templist.txt')

    # Write one filename per line for each split.
    splits = ((trainListFileName, fileList[:trainSize]),
              (testListFileName, fileList[trainSize:]))
    for listFileName, subset in splits:
        with open(listFileName, 'w') as f:
            f.writelines("%s\n" % name for name in subset)

    return trainListFileName, testListFileName
def template_spectra_to_list(self, tempFileList, templateDirectory):
    """Read every template named in tempFileList and return the spectra as a list."""
    spectra = []
    for name in temp_list(tempFileList):
        spectra.append(self.read_template_file(templateDirectory + name))
        print(name)  # progress indicator
    return spectra
def _load_binning_params(paramsFile):
    """Load wavelength/type/age binning parameters from the pickled params file."""
    with open(paramsFile, 'rb') as f:
        pars = pickle.load(f)
    return (pars['w0'], pars['w1'], pars['nw'], pars['typeList'],
            pars['galTypeList'], pars['minAge'], pars['maxAge'], pars['ageBinSize'])


def _bin_sn_templates(snTemplateDirectory, snList, snTemplates, ageBinning, ageLabels, minAge, maxAge, w0, w1, nw):
    """Bin each SN template column whose age is in range into snTemplates (in place)."""
    for snFile in snList:
        snBinTemplate = BinTemplate(snTemplateDirectory + snFile, 'sn', w0, w1, nw)
        nAges = snBinTemplate.nCols
        ages = snBinTemplate.ages
        snType = snBinTemplate.tType
        filename = snBinTemplate.filename
        for ageIdx in range(nAges):
            age = ages[ageIdx]
            # NOTE(review): bounds are strict, so spectra exactly at minAge/maxAge
            # are excluded — confirm this is intended.
            if minAge < age < maxAge:
                ageLabel = ageLabels[ageBinning.age_bin(age)]
                snTemplates[snType][ageLabel]['snInfo'].append(snBinTemplate.bin_template(ageIdx))
                snTemplates[snType][ageLabel]['names'].append("%s_%s" % (filename, age))
            print("Reading {} {} out of {}".format(snFile, ageIdx, nAges))


def _bin_gal_templates(galTemplateDirectory, galList, galTemplates, w0, w1, nw):
    """Bin each host-galaxy template into galTemplates (in place)."""
    for galFile in galList:
        galBinTemplate = BinTemplate(galTemplateDirectory + galFile, 'gal', w0, w1, nw)
        galType = galBinTemplate.tType
        galTemplates[galType]['galInfo'].append(galBinTemplate.bin_template())
        galTemplates[galType]['names'].append(galBinTemplate.filename)
        print("Reading {}".format(galFile))


def _lists_to_arrays(snTemplates, galTemplates, snTypes, galTypes, ageLabels):
    """Convert the accumulated per-type lists to numpy arrays (in place)."""
    for snType in snTypes:
        for ageLabel in ageLabels:
            snTemplates[snType][ageLabel]['snInfo'] = np.array(snTemplates[snType][ageLabel]['snInfo'])
            snTemplates[snType][ageLabel]['names'] = np.array(snTemplates[snType][ageLabel]['names'])
    for galType in galTypes:
        galTemplates[galType]['galInfo'] = np.array(galTemplates[galType]['galInfo'])
        galTemplates[galType]['names'] = np.array(galTemplates[galType]['names'])


def create_sn_and_host_arrays(snTemplateDirectory, snTempFileList, galTemplateDirectory, galTempFileList, paramsFile):
    """Bin all SN and host-galaxy template spectra into nested dictionaries.

    Args:
        snTemplateDirectory: directory containing SN template files.
        snTempFileList: template-list file naming the SN templates.
        galTemplateDirectory: directory containing galaxy template files.
        galTempFileList: template-list file naming the galaxy templates.
        paramsFile: pickled dict with w0, w1, nw, typeList, galTypeList,
            minAge, maxAge, ageBinSize.

    Returns:
        (snTemplates, galTemplates) where
        snTemplates[snType][ageLabel] holds 'snInfo'/'names' numpy arrays and
        galTemplates[galType] holds 'galInfo'/'names' numpy arrays.
    """
    snList = temp_list(snTempFileList)
    galList = temp_list(galTempFileList)

    w0, w1, nw, snTypes, galTypes, minAge, maxAge, ageBinSize = _load_binning_params(paramsFile)

    ageBinning = AgeBinning(minAge, maxAge, ageBinSize)
    ageLabels = ageBinning.age_labels()

    # Dictionary of dictionaries keyed by SN type then age bin label.
    snTemplates = {snType: {ageLabel: {'snInfo': [], 'names': []} for ageLabel in ageLabels}
                   for snType in snTypes}
    galTemplates = {galType: {'galInfo': [], 'names': []} for galType in galTypes}

    _bin_sn_templates(snTemplateDirectory, snList, snTemplates, ageBinning, ageLabels,
                      minAge, maxAge, w0, w1, nw)
    _bin_gal_templates(galTemplateDirectory, galList, galTemplates, w0, w1, nw)
    _lists_to_arrays(snTemplates, galTemplates, snTypes, galTypes, ageLabels)

    return snTemplates, galTemplates
def combined_sn_gal_arrays_multiprocessing(self, snTemplateLocation, snTempFileList, galTemplateLocation, galTempFileList):
    """Memory-mapped variant: build combined SN + host-galaxy arrays in parallel.

    Pre-allocates np.memmap output arrays and fills them via apply_async
    workers, each writing a contiguous row range collected by
    collect_results(output, startRow).

    NOTE(review): this version contains several debug/WIP artifacts flagged
    inline below — confirm before using in production.
    """
    # Without host-galaxy templates, use a single placeholder and pure SN spectra.
    if galTemplateLocation is None or galTempFileList is None:
        galTempList = [None]
        galTemplateLocation = None
        snFractions = [1.0]
    else:
        galTempList = temp_list(galTempFileList)
        snFractions = [
            0.99, 0.98, 0.95, 0.93, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1
        ]

    snTempList = temp_list(snTempFileList)  # 514 files
    # NOTE(review): the [0:2] slice keeps only the first two (gal, sn) pairs —
    # looks like a debugging leftover; remove to process the full product.
    galAndSnTemps = list(itertools.product(galTempList, snTempList))[0:2]
    argsList = []
    for gal, sn in galAndSnTemps:
        argsList.append((snTemplateLocation, [sn], galTemplateLocation, [gal], snFractions))

    # NOTE(review): hard-coded spectrum count (presumably sum of nColsDict for
    # this specific template set) — verify against the actual data.
    numSpectra = 3968  # sum nColsDict
    arraySize = numSpectra * len(galTempList) * len(
        snFractions) * self.numOfRedshifts
    print("Arraysize is:", arraySize)
    self.images = np.memmap('all_images.dat', dtype=np.float16, mode='w+', shape=(arraySize, int(self.nw)))
    self.labelsIndexes = np.memmap('all_labels.dat', dtype=np.uint16, mode='w+', shape=arraySize)
    # NOTE(review): np.memmap with dtype=object is not supported by numpy
    # (object arrays cannot be memory-mapped) — these two lines likely fail;
    # consider fixed-width string dtypes or a separate non-mapped store.
    self.filenames = np.memmap('all_filenames.dat', dtype=object, mode='w+', shape=arraySize)
    self.typeNames = np.memmap('all_typeNames.dat', dtype=object, mode='w+', shape=arraySize)
    print("images GiB:", self.images.nbytes / 2**30)
    print("labels GiB:", self.labelsIndexes.nbytes / 2**30)
    print("filenames GiB:", self.filenames.nbytes / 2**30)
    print("typeNames GiB:", self.typeNames.nbytes / 2**30)

    # # Multiprocessing with map_async (faster)
    # pool = mp.Pool()
    # results = pool.map_async(self.combined_sn_gal_templates_to_arrays, argsList)
    # pool.close()
    # pool.join()
    # outputs = results.get()
    # for i, output in enumerate(outputs):
    #     self.collect_results(output)
    #     print('combining results...', output[-1], i, len(outputs))

    print("Begin pooling...")
    # # Multiprocessing with apply_async (better when arrays are large - i.e. agnostic redshift)
    results = []
    # NOTE(review): hard-coded 50 worker processes — consider mp.cpu_count().
    pool = mp.Pool(processes=50)
    print("pool")
    for arg in argsList:
        print("argLoop")
        result = pool.apply_async(self.combined_sn_gal_templates_to_arrays, [arg])
        results.append(result)
    print("close pool")
    pool.close()
    pool.join()
    print("Finished Pooling")

    # Collect each worker's rows in submission order; output[-1] is the row
    # count, so wheretostartappending tracks the next free row offset.
    wheretostartappending = 0
    for i, p in enumerate(results):
        output = p.get()
        nRows = output[-1]
        self.collect_results(output, wheretostartappending)
        wheretostartappending += nRows
        print('combining results...', nRows, i, len(results))

    print("Completed Creating Arrays!")

    # Delete temporary memory mapping files
    # NOTE(review): these patterns (images_*.dat etc.) don't match the
    # all_*.dat files created above — presumably per-worker temp files made
    # by combined_sn_gal_templates_to_arrays; confirm.
    for filename in glob.glob('images_*.dat'):
        os.remove(filename)
    for filename in glob.glob('labels_*.dat'):
        os.remove(filename)
    for filename in glob.glob('filenames_*.dat'):
        os.remove(filename)
    for filename in glob.glob('typeNames_*.dat'):
        os.remove(filename)

    return self.images, self.labelsIndexes, self.filenames, self.typeNames
def train_test_split(self):
    """Split the template spectra into train and test sets at the spectrum level.

    Each SNID template file's header is parsed to find its ages line; every
    (filename, ageIndex) pair whose age lies within [minAge, maxAge] becomes
    one spectrum entry. The entries are shuffled and split by trainFraction.

    Returns:
        (trainDict, testDict): OrderedDicts mapping SN filename -> list of
        age indexes assigned to that split.
    """
    snTempFileList = copy.copy(self.snidTempFileList)
    fileList = temp_list(snTempFileList)
    spectraList = []

    # Enumerate every in-range (file, ageIndex) spectrum.
    for sn in fileList:
        with open(os.path.join(self.snidTemplateLocation, sn), 'r') as FileObj:
            mostKnots = None
            for lineNum, line in enumerate(FileObj):
                if lineNum == 0:
                    # Header: numAges nw w0 w1 mostKnots tname dta ttype ittype itstype
                    header = [x for x in (line.strip('\n')).split(' ') if x != '']
                    numAges, nwx, w0x, w1x, mostKnots, tname, dta, ttype, ittype, itstype = header
                    numAges, mostKnots = map(int, (numAges, mostKnots))
                elif lineNum == mostKnots + 2:
                    # Ages line: first token is a label, the rest are ages.
                    ages = np.array(line.split()[1:]).astype(float)
                    agesIndexesInRange = np.where((ages >= self.minAge) & (ages <= self.maxAge))[0]
                    for ageIdx in agesIndexesInRange:
                        spectraList.append((sn, ageIdx))
                    break  # nothing else is read from this file

    # Seeded shuffle: the unseeded random.shuffle made the split irreproducible
    # across runs; seed 42 matches the filename-level split variant.
    random.Random(42).shuffle(spectraList)

    trainSize = int(self.trainFraction * len(spectraList))
    trainSpectra = spectraList[:trainSize]
    testSpectra = spectraList[trainSize:]

    # Regroup the flat (file, ageIdx) pairs by filename.
    trainDict, testDict = OrderedDict(), OrderedDict()
    for k, v in trainSpectra:
        trainDict.setdefault(k, []).append(v)
    for k, v in testSpectra:
        testDict.setdefault(k, []).append(v)

    print("trainDict", trainDict)
    print("testDict", testDict)

    return trainDict, testDict