Example #1
0
    def combined_sn_gal_arrays_multiprocessing(self, snTemplateLocation,
                                               snTempFileList,
                                               galTemplateLocation,
                                               galTempFileList):
        """Build the combined SN + host-galaxy training arrays in parallel.

        One worker task is created per (galaxy template, SN template) pair;
        each task is handled by ``combined_sn_gal_templates_to_arrays`` and the
        partial results are merged via ``collect_results``.

        Parameters
        ----------
        snTemplateLocation : str or None
            Directory holding the SN template files.
        snTempFileList : str or dict
            Either a path to a template list file, or a mapping of
            SN filename -> iterable of age indexes to use (as produced by a
            per-spectrum train/test split).
        galTemplateLocation : str or None
            Directory holding the galaxy template files; ``None`` for SN-only.
        galTempFileList : str or None
            Galaxy template list file; ``None`` for SN-only.

        Returns
        -------
        tuple
            (images, labelsIndexes.astype(np.uint16), filenames, typeNames)
            as numpy arrays; the same arrays are also stored on ``self``.
        """
        # TODO: Maybe do memory mapping for these arrays
        self.images = []
        self.labelsIndexes = []
        self.filenames = []
        self.typeNames = []

        if galTemplateLocation is None or galTempFileList is None:
            # SN-only mode: one placeholder "galaxy" and a pure-SN fraction.
            galTempList = [None]
            galTemplateLocation = None
            snFractions = [1.0]
        else:
            galTempList = temp_list(galTempFileList)
            snFractions = [
                0.99, 0.98, 0.95, 0.93, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,
                0.1
            ]

        if isinstance(snTempFileList, dict):
            # Dict input carries the allowed age indexes per SN file.
            snTempList = list(snTempFileList.keys())
            ageIndexesDict = snTempFileList
        else:
            snTempList = temp_list(snTempFileList)
            ageIndexesDict = None

        argsList = []
        for gal, sn in itertools.product(galTempList, snTempList):
            if ageIndexesDict is not None:
                ageIdxDict = {sn: ageIndexesDict[sn]}
            else:
                # No restriction: pass a range wide enough to cover any file.
                ageIdxDict = {sn: range(0, 1000)}
            argsList.append((snTemplateLocation, [sn], galTemplateLocation,
                             [gal], snFractions, ageIdxDict))

        # Context manager guarantees the worker pool is torn down even if a
        # task raises; pool.map preserves argsList order like map_async+get.
        with mp.Pool() as pool:
            outputs = pool.map(self.combined_sn_gal_templates_to_arrays,
                               argsList)

        for i, output in enumerate(outputs):
            self.collect_results(output)
            print('combining results...', i, len(outputs))

        self.images = np.array(self.images)
        self.labelsIndexes = np.array(self.labelsIndexes)
        self.filenames = np.array(self.filenames)
        self.typeNames = np.array(self.typeNames)

        print("Completed Creating Arrays!")

        return self.images, self.labelsIndexes.astype(
            np.uint16), self.filenames, self.typeNames
Example #2
0
    def train_test_split(self):
        """
        Split the SN template file list into train and test list files.

        Maybe should change this to include ages in train/test split instead of
        just SN files. Returns the paths of the two list files written next to
        the original template list.
        """
        fileList = temp_list(copy.copy(self.snidTempFileList))
        # Fixed seed so the split is reproducible across runs.
        random.Random(42).shuffle(fileList)

        trainSize = int(self.trainFraction * len(fileList))
        dirName = os.path.dirname(self.snidTempFileList)
        trainListFileName = os.path.join(dirName, 'train_templist.txt')
        testListFileName = os.path.join(dirName, 'test_templist.txt')

        # Write both partitions, one filename per line.
        partitions = ((trainListFileName, fileList[:trainSize]),
                      (testListFileName, fileList[trainSize:]))
        for listFileName, names in partitions:
            with open(listFileName, 'w') as f:
                for name in names:
                    f.write("%s\n" % name)

        return trainListFileName, testListFileName
    def template_spectra_to_list(self, tempFileList, templateDirectory):
        """Read every template named in *tempFileList* and return the spectra.

        Each filename is resolved against *templateDirectory* (assumed to end
        with a path separator, matching the rest of this module) and parsed
        via ``read_template_file``. Prints each filename as progress output.
        """
        templates = []
        for name in temp_list(tempFileList):
            templates.append(self.read_template_file(templateDirectory + name))
            print(name)

        return templates
def create_sn_and_host_arrays(snTemplateDirectory, snTempFileList,
                              galTemplateDirectory, galTempFileList,
                              paramsFile):
    """Read and bin all SN and host-galaxy templates into nested dicts.

    Returns a pair ``(snTemplates, galTemplates)`` where
    ``snTemplates[snType][ageLabel]`` holds ``'snInfo'``/``'names'`` numpy
    arrays and ``galTemplates[galType]`` holds ``'galInfo'``/``'names'``
    numpy arrays. Binning parameters come from the pickled *paramsFile*.
    """
    snList = temp_list(snTempFileList)
    galList = temp_list(galTempFileList)

    # Load wavelength/type/age parameters saved by the preprocessing step.
    with open(paramsFile, 'rb') as f:
        pars = pickle.load(f)
    w0 = pars['w0']
    w1 = pars['w1']
    nw = pars['nw']
    snTypes = pars['typeList']
    galTypes = pars['galTypeList']
    minAge = pars['minAge']
    maxAge = pars['maxAge']
    ageBinSize = pars['ageBinSize']

    ageBinning = AgeBinning(minAge, maxAge, ageBinSize)
    ageLabels = ageBinning.age_labels()

    # Dictionary of dictionaries keyed by SN type then age-bin label.
    snTemplates = {
        snType: {label: {'snInfo': [], 'names': []} for label in ageLabels}
        for snType in snTypes
    }
    galTemplates = {
        galType: {'galInfo': [], 'names': []} for galType in galTypes
    }

    for snFile in snList:
        binTemplate = BinTemplate(snTemplateDirectory + snFile, 'sn', w0, w1,
                                  nw)
        nAges = binTemplate.nCols
        ages = binTemplate.ages
        for ageIdx in range(nAges):
            age = ages[ageIdx]
            # NOTE: boundary ages are excluded (strict inequalities).
            if minAge < age < maxAge:
                label = ageLabels[ageBinning.age_bin(age)]
                entry = snTemplates[binTemplate.tType][label]
                entry['snInfo'].append(binTemplate.bin_template(ageIdx))
                entry['names'].append("%s_%s" % (binTemplate.filename, age))

            print("Reading {} {} out of {}".format(snFile, ageIdx, nAges))

    for galFile in galList:
        binTemplate = BinTemplate(galTemplateDirectory + galFile, 'gal', w0,
                                  w1, nw)
        entry = galTemplates[binTemplate.tType]
        entry['galInfo'].append(binTemplate.bin_template())
        entry['names'].append(binTemplate.filename)

        print("Reading {}".format(galFile))

    # Freeze the accumulated lists into numpy arrays.
    for ageDict in snTemplates.values():
        for entry in ageDict.values():
            entry['snInfo'] = np.array(entry['snInfo'])
            entry['names'] = np.array(entry['names'])
    for entry in galTemplates.values():
        entry['galInfo'] = np.array(entry['galInfo'])
        entry['names'] = np.array(entry['names'])

    return snTemplates, galTemplates
    def combined_sn_gal_arrays_multiprocessing(self, snTemplateLocation,
                                               snTempFileList,
                                               galTemplateLocation,
                                               galTempFileList):
        """Memory-mapped variant of the combined SN+galaxy array builder.

        Pre-allocates the output arrays as on-disk ``np.memmap`` files
        (``all_*.dat``) so very large runs do not need to fit in RAM, then
        dispatches one ``combined_sn_gal_templates_to_arrays`` task per
        (galaxy, SN) template pair via ``apply_async`` and copies each
        worker's rows into the memmaps through ``collect_results``.

        NOTE(review): this version contains what looks like debugging
        scaffolding — the ``[0:2]`` slice on the template pairs, the
        hard-coded ``numSpectra = 3968`` and ``processes=50`` — confirm
        these before using it for a full run.

        Returns (images, labelsIndexes, filenames, typeNames) memmaps.
        """
        if galTemplateLocation is None or galTempFileList is None:
            # SN-only mode: one placeholder "galaxy" and a pure-SN fraction.
            galTempList = [None]
            galTemplateLocation = None
            snFractions = [1.0]
        else:
            galTempList = temp_list(galTempFileList)
            snFractions = [
                0.99, 0.98, 0.95, 0.93, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2,
                0.1
            ]

        snTempList = temp_list(snTempFileList)  # 514 files
        # NOTE(review): the [0:2] slice limits the run to the first two
        # (gal, sn) pairs — presumably leftover debugging; verify intent.
        galAndSnTemps = list(itertools.product(galTempList, snTempList))[0:2]
        argsList = []
        for gal, sn in galAndSnTemps:
            argsList.append((snTemplateLocation, [sn], galTemplateLocation,
                             [gal], snFractions))

        # NOTE(review): hard-coded total spectra count; must match the actual
        # templates on disk or the memmaps below will be mis-sized.
        numSpectra = 3968  # sum nColsDict
        arraySize = numSpectra * len(galTempList) * len(
            snFractions) * self.numOfRedshifts
        print("Arraysize is:", arraySize)

        # Pre-allocate on-disk arrays ('w+' creates/overwrites the files).
        self.images = np.memmap('all_images.dat',
                                dtype=np.float16,
                                mode='w+',
                                shape=(arraySize, int(self.nw)))
        self.labelsIndexes = np.memmap('all_labels.dat',
                                       dtype=np.uint16,
                                       mode='w+',
                                       shape=arraySize)
        # NOTE(review): np.memmap with dtype=object cannot round-trip Python
        # objects through a file — confirm these two arrays actually work as
        # intended (a fixed-width string dtype may be required).
        self.filenames = np.memmap('all_filenames.dat',
                                   dtype=object,
                                   mode='w+',
                                   shape=arraySize)
        self.typeNames = np.memmap('all_typeNames.dat',
                                   dtype=object,
                                   mode='w+',
                                   shape=arraySize)

        print("images GiB:", self.images.nbytes / 2**30)
        print("labels GiB:", self.labelsIndexes.nbytes / 2**30)
        print("filenames GiB:", self.filenames.nbytes / 2**30)
        print("typeNames GiB:", self.typeNames.nbytes / 2**30)

        # #  Multiprocessing with map_async (faster)
        # pool = mp.Pool()
        # results = pool.map_async(self.combined_sn_gal_templates_to_arrays, argsList)
        # pool.close()
        # pool.join()
        # outputs = results.get()
        # for i, output in enumerate(outputs):
        #     self.collect_results(output)
        #     print('combining results...', output[-1], i, len(outputs))

        print("Begin pooling...")
        # #  Multiprocessing with apply_async (better when arrays are large - i.e. agnostic redshift)
        results = []
        pool = mp.Pool(processes=50)
        print("pool")
        for arg in argsList:
            print("argLoop")
            result = pool.apply_async(self.combined_sn_gal_templates_to_arrays,
                                      [arg])
            results.append(result)
        print("close pool")
        pool.close()
        pool.join()
        print("Finished Pooling")

        # Each worker reports its row count as output[-1]; rows are written
        # into the shared memmaps at a running offset.
        wheretostartappending = 0
        for i, p in enumerate(results):
            output = p.get()
            nRows = output[-1]
            self.collect_results(output, wheretostartappending)
            wheretostartappending += nRows
            print('combining results...', nRows, i, len(results))

        print("Completed Creating Arrays!")

        # Delete temporary memory mapping files
        for filename in glob.glob('images_*.dat'):
            os.remove(filename)
        for filename in glob.glob('labels_*.dat'):
            os.remove(filename)
        for filename in glob.glob('filenames_*.dat'):
            os.remove(filename)
        for filename in glob.glob('typeNames_*.dat'):
            os.remove(filename)

        return self.images, self.labelsIndexes, self.filenames, self.typeNames
Example #6
0
    def train_test_split(self):
        """
        Split the SNID template spectra into train and test sets per spectrum.

        Every usable (file, ageIndex) pair is collected, shuffled, and split
        by ``trainFraction``. Maybe should change this to include ages in
        train/test split instead of just SN files. Returns two OrderedDicts
        mapping filename -> list of age indexes.
        """
        fileList = temp_list(copy.copy(self.snidTempFileList))
        snAndAgeIdxDict = OrderedDict()
        spectraList = []

        # SPLIT BY SPECTRA
        # Collect every (file, ageIndex) pair whose age lies within
        # [minAge, maxAge]; the header line locates the ages row.
        for sn in fileList:
            with open(os.path.join(self.snidTemplateLocation, sn),
                      'r') as FileObj:
                for lineNum, line in enumerate(FileObj):
                    if lineNum == 0:
                        # Header: whitespace-split, empty tokens dropped.
                        fields = [x for x in line.strip('\n').split(' ') if x]
                        (numAges, nwx, w0x, w1x, mostKnots, tname, dta, ttype,
                         ittype, itstype) = fields
                        numAges, mostKnots = int(numAges), int(mostKnots)
                    elif lineNum == mostKnots + 2:
                        # Ages row sits two lines past the knot table.
                        ages = np.array(line.split()[1:]).astype(float)
                        agesIndexesInRange = np.where(
                            (ages >= self.minAge) &
                            (ages <= self.maxAge))[0]
                        snAndAgeIdxDict[sn] = agesIndexesInRange
                        spectraList.extend(
                            (sn, ageIdx) for ageIdx in agesIndexesInRange)

        # Shuffle (unseeded, matching original behavior) then split.
        random.shuffle(spectraList)
        trainSize = int(self.trainFraction * len(spectraList))

        # Group each partition back into filename -> [ageIdx, ...].
        trainDict, testDict = OrderedDict(), OrderedDict()
        for sn, ageIdx in spectraList[:trainSize]:
            trainDict.setdefault(sn, []).append(ageIdx)
        for sn, ageIdx in spectraList[trainSize:]:
            testDict.setdefault(sn, []).append(ageIdx)

        print("trainDict", trainDict)
        print("testDict", testDict)
        return trainDict, testDict