def Basic(trainSet, testSet, file, fout, name, vecin, clfName, isCount=False):
    auc = 0
    l = GLOB(clfName).getClassifier()
    l.buildClassifier(trainSet)
    vec = l.evaluateModel(testSet)
    actual = testSet[:, -1]
    if isCount:
        vals = DPLIB.getMeasuresCount(actual, vec)
        print(name + ":" + file + ": " + str(vals))
        fout.write("\n" + name + ":" + file + ": " + "Vals=" + str(vals))
    else:
        tvals = DPLIB.getConfMatrix(actual, vec)
        vals = DPLIB.getMeasures(tvals)
        auc = DPLIB.getAUC(actual, vec)
        print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
        fout.write("\n" + name + ":" + file + ": " + " AUC = " + str(auc) + ";" + "Vals=" + str(vals))
def NNFilterMulti(trainSeti, testSet, file, fout, name, vecin, count, clfName, tunelrn, vSets):
    startTime = Common.getCurrentTimeMil()
    trainSet = DPLIB.NNFilterMulti(trainSeti, testSet, count)
    l = GLOB(clfName, tunelrn).getClassifier()
    if tunelrn:
        l = l.getTunedCLF(trainSet, vSets, fout, name, file)
        print("#TUNE-LRN-PARAMS-" + name + ":" + file + ": " + str(l.selectedParams))
        fout.write("#TUNE-LRN-PARAMS-" + name + ":" + file + ": ")
        fout.write(str(l.selectedParams))
        fout.write("\n")
        sCheck = l.getCLFOptions()
        print("#SETSET-LRN-PARAMS-" + name + ":" + file + ": " + str(sCheck))
        fout.write("#SETSET-LRN-PARAMS-" + name + ":" + file + ": ")
        fout.write(str(sCheck))
        fout.write("\n")
    l.buildClassifier(trainSet)
    vec = l.evaluateModel(testSet)
    vecin = vec
    tvals = DPLIB.getConfMatrix(testSet[:, -1], vecin)
    print("#CONF-TEST-" + name + ":" + file + ": " + str(tvals))
    fout.write("#CONF-TEST-" + name + ":" + file + ": ")
    fout.write(str(tvals))
    fout.write("\n")
    auc = DPLIB.getAUC(testSet[:, -1], vec)
    vals = DPLIB.getMeasures(tvals)
    print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
    fout.write(name + ":" + file + ": ")
    fout.write(str(vals))
    fout.write(" AUC = ")
    fout.write(str(auc))
    fout.write("\n")
    time = Common.getCurrentTimeMil() - startTime
    print("#TIME-FOR:" + name + ":" + file + ": " + str(time))
    fout.write("#TIME-FOR:" + name + ":" + file + ": " + str(time) + "\n")
    return vecin
def fit(self, trainSet, testSet, vSets, vSetType, clfName):
    if self.isCount:
        self.mad = GA.getBugSTDForMutation(trainSet)
    else:
        # Default MAD for the binary case so the mutation call below is defined.
        self.mad = 0.0
    if len(set(list(trainSet[:, -1]))) < 2:
        self.prnt('Error: Number of classes cannot be less than two.')
        print('Error: Number of classes cannot be less than two.')
        return
    trainSet, testSet = np.copy(trainSet), np.copy(testSet)
    tstSize = len(testSet)
    partSize = int(tstSize / self.numParts)
    isOK = True
    np.random.shuffle(testSet)
    self.FinalLearner = None
    self.FinalDataset = None
    diffs = []
    # NOTE: the original source referenced undefined names here (`testPart`,
    # `fixedTrainSize`, `mad`); the whole test set and the corresponding
    # attributes on `self` are used instead, which is an assumption about
    # the intended values.
    vSets = ValidationSetManager.getValidationSets(vSets, vSetType, trainSet, testSet)
    pop = GA.createInitialPopulation(trainSet, self.popSize, self.fixedTrainSize, self.chrmSize)
    pop = GA.assignFitness(pop, GLOB(clfName).getClassifier(), vSets, self.isCount)
    pop = DPLIB.SortPopulation(pop)
    for g in range(self.numGens):
        self.prnt(str(g) + " ")
        newPop = GA.generateNewPopulation(pop, self.sizeTopP, selectionType='TORNAMENT',
                                          isCount=self.isCount, mad=self.mad)
        newPop = GA.assignFitness(newPop, GLOB(clfName).getClassifier(), vSets, self.isCount)
        newPop = DPLIB.SortPopulation(newPop)
        newPop, rdel = DPLIB.CombinePops(pop, newPop)
        rdel = None
        diff, exit = GA.checkExit(pop, newPop, self.countComp)
        diffs.append(diff)
        pop.clear()
        pop = newPop
        if (pop[0].getFitness() > 0.0) and exit:
            break
    self.FinalDataset = pop[0].ds
def getKeySetKeys(self):
    if self.isCount:
        self.keySet = list(DPLIB.getMeasuresCount([0, 1, 2, 3], [0, 1, 2, 3]).keys())
    else:
        self.keySet = list(DPLIB.MEASURES_BIN.keys())
    return self.keySet
def WMulti(files, file, testSet, fout, features, name, clfName, dp, convertToBinary=True):
    train = []
    for file2 in files:
        # Same project prefix, and only releases that sort before the test
        # release. (The original read `file2 < file < 0`, a leftover from a
        # Java compareTo(...) < 0 check; a plain string comparison is the
        # intended test.)
        if file2[0:3] == file[0:3] and file2 < file:
            train.append(file2)
    if len(train):
        trainSet = DPLIB.LoadCSV(train, dp, features, convertToBinary)
        if name.lower().find("infogain") >= 0:
            # Feature selection is currently disabled; the original (Java) code was:
            # int indi[] = DPLIB.fSelectInfoGain(trainSet);
            # if (DPLIB.useIterativeInfoGainSubsetting)
            #     indi = DPLIB.iterativeInfoGainSubsetting(trainSet, indi, clfName);
            # else
            #     indi = DPLIB.getTopX(indi);
            # trainSet = DPLIB.fSelectSet(trainSet, indi);
            # testSet = DPLIB.fSelectSet(testSet, indi);
            pass
        l = GLOB(clfName).getClassifier()
        l.buildClassifier(trainSet)
        vec = l.evaluateModel(testSet)
        tvals = DPLIB.getConfMatrix(testSet[:, -1], vec)
        auc = DPLIB.getAUC(testSet[:, -1], vec)
        vals = DPLIB.getMeasures(tvals)
        print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
        fout.write("\n" + name + ":" + file + ": " + " AUC = " + str(auc) + ";" + "Vals=" + str(vals))
    else:
        print(name + ":" + file + ": " + "!!!" + " AUC = !!!")
        fout.write("\n" + name + ":" + file + ": !!!")
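# Example of the release-filtering rule above (file names are illustrative,
# not from the original code): with
#   files = ['ant-1.5.csv', 'ant-1.6.csv', 'ant-1.7.csv', 'ivy-2.0.csv']
# and file = 'ant-1.7.csv', the shared 3-character prefix 'ant' plus the
# lexicographic test file2 < file selects ['ant-1.5.csv', 'ant-1.6.csv'],
# i.e. training uses only earlier releases of the same project.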
def score(self, predictions, testLabels):
    if self.isCount:
        pass
    else:
        if testLabels is not None:
            confs, measures = DPLIB.getConfAndExtMeasures(testLabels, predictions)
            return confs, measures
    return None
def toDict(lds):
    measures = None
    expNames = ('GIS', 'LSH', 'FIXED', 'VAR', 'VR', 'FX',)
    out = {}
    for line in DPLIB.doReplaces('\n'.join(['\n'.join(l) for l in lds])).split('\n'):
        line = line.strip()
        if line.startswith(expNames):
            parts = line.replace("': ", "'=>").split(':')
            perf = parts[2].replace('{', '').replace('}', '').strip()
            method = parts[0]
            ds = parts[1]
            learner = method.split('-')[-1]
            apprName = method[:method.rfind('-')]
            featureSpace = apprName.split('-')[-1]
            vals = []
            measures = []
            for p in perf.split(','):
                p = p.strip().split('=>')
                p[0] = p[0].strip().replace("'", "")
                p[1] = p[1].strip()
                if p[0] not in measures:
                    measures.append(p[0])
                vals.append(float(p[1]))
            if method not in out.keys():
                out[method] = {}
            if ds not in out[method].keys():
                out[method][ds] = {}
                out[method][ds]['measures'] = {}
                out[method][ds]['confs'] = []
            for mindex, m in enumerate(measures):
                if m not in out[method][ds]['measures'].keys():
                    out[method][ds]['measures'][m] = []
                out[method][ds]['measures'][m].append(vals[mindex])
        elif line.startswith("#CONF-TEST:"):
            parts = line.replace("': ", "'=>").split(':')[1:]
            perf = parts[2].replace('{', '').replace('}', '').strip()
            method = parts[0]
            ds = parts[1]
            learner = method.split('-')[-1]
            apprName = method[:method.rfind('-')]
            featureSpace = apprName.split('-')[-1]
            if method not in out.keys():
                out[method] = {}
            if ds not in out[method].keys():
                out[method][ds] = {}
                out[method][ds]['measures'] = {}
                out[method][ds]['confs'] = []
            out[method][ds]['confs'].append(perf)
    return out
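# A minimal sketch of the log-line format toDict expects, inferred from the
# parsing above (the method suffix and dataset name are illustrative; 'F',
# 'GMean1', and 'auc' are measure names used elsewhere in this code):
#
#   GIS-IG-NB:ant-1.7: {'F': 0.52, 'GMean1': 0.61, 'auc': 0.70}
#   #CONF-TEST:GIS-IG-NB:ant-1.7: {'tp': 30, 'tn': 80, 'fp': 12, 'fn': 9}
#
# After parsing, values are grouped per method and dataset, e.g.:
#
#   out['GIS-IG-NB']['ant-1.7']['measures']['F']  ->  [0.52, ...]
#   out['GIS-IG-NB']['ant-1.7']['confs']          ->  ["'tp'=>30, ...", ...]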
def createInitialPopulation(pool, popSize, fixedTrainSize, maxChromosomeSize):
    pop = []
    for i in range(popSize):
        uinds = set()
        size = GA.getChromosomeSize(fixedTrainSize, maxChromosomeSize)
        while True:
            # Select a subset that contains both classes
            trSet = DPLIB.getRandomSubSet(size, pool)
            if len(set(list(trSet[:, -1]))) >= 2:
                break
        pop.append(trSet)
    return pop
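# A minimal usage sketch mirroring how fit() seeds and ranks a population
# (illustrative only; 'NB' is a placeholder learner name, and `trainSet` /
# `vSets` are assumed to be NumPy arrays whose last column holds the label):
#
#   pop = GA.createInitialPopulation(trainSet, popSize=30,
#                                    fixedTrainSize=True, maxChromosomeSize=100)
#   pop = GA.assignFitness(pop, GLOB('NB').getClassifier(), vSets, isCount=False)
#   pop = DPLIB.SortPopulation(pop)   # pop[0] is now the fittest chromosome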
def LOC50(testSeti, file, fout, name, locIndex):
    startTime = Common.getCurrentTimeMil()
    # The "instance selection" cost here is just the median split itself.
    # (The original logged the total elapsed time under #TIME-FOR-IS; the
    # separately measured selection time is reported instead.)
    tempTime = Common.getCurrentTimeMil()
    allloc = testSeti[:, locIndex]
    med = np.median(allloc)
    predicted = [1 if t >= med else 0 for t in allloc]
    spentISTime = Common.getCurrentTimeMil() - tempTime
    actual = testSeti[:, -1]
    tvals = DPLIB.getConfMatrix(actual, predicted)
    print("#CONF-TEST-" + name + ":" + file + ": " + str(tvals))
    fout.write("#CONF-TEST-" + name + ":" + file + ": ")
    fout.write(str(tvals))
    fout.write("\n")
    vals = DPLIB.getMeasures(tvals)
    auc = DPLIB.getAUC(actual, predicted)
    print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
    fout.write(name + ":" + file + ": ")
    fout.write(str(vals))
    fout.write(" AUC = ")
    fout.write(str(auc))
    fout.write("\n")
    time = Common.getCurrentTimeMil() - startTime
    print("#TIME-FOR:" + name + ":" + file + ": " + str(time))
    fout.write("#TIME-FOR:" + name + ":" + file + ": " + str(time) + "\n")
    print("#TIME-FOR-IS:" + name + ":" + file + ": " + str(spentISTime))
    fout.write("#TIME-FOR-IS:" + name + ":" + file + ": " + str(spentISTime) + "\n")
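# Quick illustrative check of the LOC50 baseline logic (toy numbers, not from
# the original code): modules at or above the median LOC are predicted defective.
#
#   >>> import numpy as np
#   >>> loc = np.array([10, 25, 40, 80])
#   >>> med = np.median(loc)          # 32.5
#   >>> [1 if t >= med else 0 for t in loc]
#   [0, 0, 1, 1]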
def assignFitness(pop, clf, vSets, isCount):
    for i in range(len(pop)):
        clf.buildClassifier(pop[i])
        # The original referenced an undefined `all_measures`; the conf-matrix/
        # measure pairs are split into two lists here (mirroring how score()
        # unpacks DPLIB.getConfAndExtMeasures), which is an assumption about
        # what CHRM_GIS expects.
        all_measures = []
        all_confs_measures = []
        all_predictions = clf.evaluateMultiModel(vSets)
        if isCount:
            pass
        else:
            for index, predictions in enumerate(all_predictions):
                confs, measures = DPLIB.getConfMatrixAndExtMeasures(
                    vSets[index][:, -1], predictions)
                all_confs_measures.append(confs)
                all_measures.append(measures)
        h = CHRM_GIS(pop[i], all_measures, all_confs_measures)
        pop[i] = h
    return pop
def Mutate(ds, mProb=0.1, mCount=1, isCount=False, mad=0.0):
    """
    Performs mutation with the specified parameters.

    Note that datasets may contain repeated instances of the same data row,
    so the operation must preserve consistency: after mutation, all instances
    with exactly the same data must carry the same label.
    """
    r2 = np.random.rand()
    if r2 <= mProb:
        rands = set()
        i = 0
        while i < mCount:
            r1 = np.random.randint(0, len(ds))
            if len(rands) == len(ds):
                return ds
            if r1 in rands:
                continue
            instLabel = ds[r1, -1]
            if isCount:
                # Non-binary class values: shift by a normally distributed
                # random value scaled by the MAD, clamped at zero.
                shift = int(np.random.randn() * mad)
                classVal = instLabel + shift
                if classVal < 0:
                    classVal = 0
            else:
                # Binary class values: flip the label.
                classVal = 1 - instLabel
            # Apply the same new label to every identical instance.
            st = DPLIB.FindAllSimilarInstancesIndexes(r1, ds)
            for r1 in st:
                rands.add(r1)
                ds[r1, -1] = classVal
            i += 1
    return ds
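# A minimal sketch of GA.Mutate on a toy dataset (the 4-row array is
# illustrative, not from the original code). With mProb=1.0 the mutation
# always fires; duplicated rows keep a consistent label afterwards:
#
#   import numpy as np
#   ds = np.array([[1.0, 2.0, 0],
#                  [1.0, 2.0, 0],   # duplicate of row 0
#                  [3.0, 4.0, 1],
#                  [5.0, 6.0, 0]], dtype=float)
#   mutated = GA.Mutate(ds, mProb=1.0, mCount=1)
#   assert mutated[0, -1] == mutated[1, -1]  # duplicates stay consistent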
def crossOver(ds1, ds2, fixedSize, isCount=False):
    """
    Crossover operator supporting both one-point and two-point crossover.
    It can either keep the datasets at the same size or recombine the data
    so as to generate chromosomes of varying size.

    Note that datasets may contain repeated instances of the same data row,
    so the operation must preserve consistency. This is especially important
    because data can come from multiple sources, and mutation in previous
    generations may have altered particular instances. After crossover, all
    instances with exactly the same data must share one label; this is
    enforced here by majority voting.
    """
    ss = len(ds1)
    if fixedSize:
        point1 = np.random.randint(ss)
        point2 = point1
    else:
        point1 = np.random.randint(ss)
        point2 = np.random.randint(len(ds2))
    if len(ds1) >= 4000:
        point1 = int(len(ds1) / 2)
    if len(ds2) >= 4000:
        point2 = int(len(ds2) / 2)
    np.random.shuffle(ds1)
    np.random.shuffle(ds2)
    ds1c = np.copy(ds1[:point1, :])
    ds2c = np.copy(ds2[:point2, :])
    ds1c = np.append(ds1c, ds2[point2:, :], axis=0)
    ds2c = np.append(ds2c, ds1[point1:, :], axis=0)
    pSet = set()
    for i in range(len(ds1c)):
        if i in pSet:
            continue
        t = list(DPLIB.FindAllSimilarInstancesIndexes(i, ds1c))
        lbl = 0
        for j in range(len(t)):
            index = t[j]
            lbl += ds1c[index, -1]
            pSet.add(index)
        lbl = lbl / len(t)
        if not isCount:
            # Majority vote over the duplicates
            lbl = 1 if lbl >= 0.5 else 0
        else:
            if lbl < 0:
                lbl = 0
        for j in range(len(t)):
            index = t[j]
            # Process extra (kept from the original Java version):
            # if ((int)ds1c.instance(index).classValue() != (int)lbl)
            #     ds1c.instance(index).SetExtra(ds1c.instance(index).GetExtra()
            #         + "-C=" + String.valueOf((int)(1-lbl)) + ">" + String.valueOf((int)lbl));
            ds1c[index, -1] = lbl
    pSet.clear()
    for i in range(len(ds2c)):
        if i in pSet:
            continue
        t = list(DPLIB.FindAllSimilarInstancesIndexes(i, ds2c))
        lbl = 0
        for j in range(len(t)):
            index = t[j]
            lbl += ds2c[index, -1]
            pSet.add(index)
        lbl = lbl / len(t)
        if not isCount:
            lbl = 1 if lbl >= 0.5 else 0
        else:
            if lbl < 0:
                lbl = 0
        for j in range(len(t)):
            index = t[j]
            # Process extra (kept from the original Java version):
            # if ((int)ds2c.instance(index).classValue() != (int)lbl)
            #     ds2c.instance(index).SetExtra(ds2c.instance(index).GetExtra()
            #         + "-C=" + String.valueOf((int)(1-lbl)) + ">" + String.valueOf((int)lbl));
            ds2c[index, -1] = lbl
    return ds1c, ds2c
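# A minimal sketch of GA.crossOver (toy arrays, illustrative only): with
# fixedSize=True and equal-size parents both children keep their parents'
# sizes, and label consistency across duplicated rows is restored by the
# majority vote inside the operator:
#
#   import numpy as np
#   a = np.array([[1., 0., 0], [2., 1., 1], [3., 1., 0], [4., 0., 1]])
#   b = np.array([[5., 2., 1], [6., 3., 0], [7., 3., 1], [8., 2., 0]])
#   c1, c2 = GA.crossOver(a, b, fixedSize=True)
#   assert len(c1) == len(a) and len(c2) == len(b)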
def CreateBuckets(self, trainSet, testSet, vSets, name, testCut, iternum, save,
                  superbit, stages, buckets, doprint, clfName, tunelrn):
    out = []
    if self.isCount:
        keySet = list(DPLIB.getMeasuresCount([0, 1, 2, 3], [0, 1, 2, 3]).keys())
    else:
        keySet = list(DPLIB.getExtMeasures({
            "tp": 1, "tn": 2, "fp": 3, "fn": 4
        }).keys())
    out.append("#STARTED FOR-" + name + ":" + self.file + ": ")
    startTime = Common.getCurrentTimeMil()
    spentIsTime = 0
    tempTime = 0
    out.append("#Using also Label For train in LSH")
    if vSets is None:
        vSets = []
        vSets.append(trainSet)
    if save:
        DPLIB.SaveToCsv(trainSet, "MAIN-TRAIN-FILE-" + "ITER=" + str(iternum) + "--" +
                        "METHOD=" + name + "--FILE=" + self.file + "--")
        DPLIB.SaveToCsv(testSet, "MAIN-TEST-FILE-" + "ITER=" + str(iternum) + "--" +
                        "METHOD=" + name + "--FILE=" + self.file + "--")
        for i in range(len(vSets)):
            DPLIB.SaveToCsv(trainSet, "VSET-FILE-" + "INDEX=" + str(i) + "ITER=" + str(iternum) +
                            "--" + "METHOD=" + name + "--FILE=" + self.file + "--")
    np.random.shuffle(trainSet)
    np.random.shuffle(testSet)
    tempTime = Common.getCurrentTimeMil()
    count = len(trainSet)
    bins = {}
    # R^n
    n = trainSet.shape[1] - 1
    binid = 0
    # lshmin = LSHMinHash(stages, buckets, n)
    try:
        lshsuper = LSHSuperBit(stages=stages, buckets=buckets, dimensions=n)
    except Exception as ex:
        print('##SuperBit with specified parameters failed:' + str(ex))
        return None
    sp = 0.75
    # Compute a SuperBit signature and an LSH hash
    for i in range(count):
        vector = trainSet[i, 1:].tolist()
        hash = None
        if superbit:
            hash = lshsuper.hash(vector)
        else:
            # MinHash support:
            # hash = lshmin.hash(vecBool)
            pass
        binid = hash[0]
        if binid not in bins.keys():
            bins[binid] = []
        bins[binid].append(trainSet[i])
    spentIsTime += Common.getCurrentTimeMil() - tempTime
    numBins = len(bins.keys())
    for binid in bins.keys():
        bins[binid] = np.array(bins[binid])
    out.append("#Number of BINS:" + name + ":" + self.file + ": " + str(numBins))
    pop = []
    for i in bins.keys():
        trSet = bins[i]
        l = GLOB(clfName, tunelrn).getClassifier()
        # if tunelrn:
        #     l = l.getTunedCLF(trSet, vSets, fout, name, file)
        l.buildClassifier(trSet)
        cf = 0
        allvecs = []
        confs = []
        allcfs = []
        allaucs = []
        valsA = None
        confsA = None
        aucA = 0.0
        for vSet in vSets:
            vec = l.evaluateModel(vSet)
            actuals = vSet[:, -1]
            vals = None
            auc = 0
            if self.isCount:
                vals = DPLIB.getMeasuresCount(actuals, vec)
            else:
                auc = DPLIB.getAUC(actuals, vec)
                aucA += auc
                allaucs.append(auc)
                if testCut:
                    # Sweep the cutoff from 0.1 to 0.9 and keep the best F * GMean1.
                    vCF = 0.1
                    bestCF = 0
                    bestCFVal = -1
                    bestVals = None
                    while True:
                        tvals = DPLIB.getConfMatrix(actuals, vec, vCF)
                        measures = DPLIB.getMeasures(tvals)
                        fit = measures["F"] * measures["GMean1"]
                        if fit > bestCFVal or bestVals is None:
                            bestCFVal = fit
                            bestCF = vCF
                            bestVals = tvals
                        vCF += 0.1
                        if vCF >= 1:
                            break
                    if confsA is None:
                        confsA = {key: 0 for key in bestVals.keys()}
                    for j in confsA.keys():
                        confsA[j] += bestVals[j]
                    confs.append(bestVals)
                    vals = DPLIB.getMeasures(bestVals)
                    cf += bestCF
                    allcfs.append(bestCF)
                else:
                    tvals = DPLIB.getConfMatrix(actuals, vec)
                    if confsA is None:
                        confsA = {key: 0 for key in tvals.keys()}
                    for j in confsA.keys():
                        confsA[j] += tvals[j]
                    confs.append(tvals)
                    vals = DPLIB.getMeasures(tvals)
                    allcfs.append(DPLIB.DefaultCF)
            allvecs.append(vals)
            if valsA is None:
                valsA = {key: 0 for key in keySet}
            for j in keySet:
                valsA[j] += vals[j]
        for j in keySet:
            valsA[j] /= len(vSets)
        h = None
        if not self.isCount:
            for j in confsA.keys():
                confsA[j] /= len(vSets)
            if testCut:
                cf /= len(vSets)
            aucA /= len(vSets)
            h = CHRM_GIS(trSet, valsA, aucA)
            h.fitnesses = allvecs
            h.aucs = allaucs
            h.conf = confsA
            h.confs = confs
            h.allcfs = allcfs
            if testCut:
                h.bestCF = cf
            else:
                h.bestCF = DPLIB.DefaultCF
        else:
            h = CHRM_GIS_Count(trSet, valsA)
            h.fitnesses = allvecs
        pop.append(h)
        l = None
    tempTime = Common.getCurrentTimeMil()
    pop = DPLIB.MySort(pop)
    spentIsTime += Common.getCurrentTimeMil() - tempTime
    top = pop[0]
    out.append("#Instances in Top:" + str(len(top.ds)))
    out.append("#STAGES:" + name + ":" + self.file + ": " + str(stages))
    out.append("#BUCKETS:" + name + ":" + self.file + ": " + str(buckets))
    if not self.isCount:
        out.append("#BEST-CF-VALUE:" + name + ":" + self.file + ": " + str(top.bestCF))
    l = GLOB(clfName, tunelrn).getClassifier()
    if tunelrn:
        # NOTE: the original passed undefined `fout`/`file` here; self.file is
        # used and no log handle is forwarded, which is an assumption.
        l = l.getTunedCLF(top.ds, vSets, None, name, self.file)
        out.append("#TUNE-LRN-PARAMS-" + name + ":" + self.file + ": " + str(l.selectedParams))
        sCheck = l.getCLFOptions()
        out.append("#SETSET-LRN-PARAMS-" + name + ":" + self.file + ": " + str(sCheck))
    l.buildClassifier(top.ds)
    vec = l.evaluateModel(testSet)
    out.append("#LSH-FOR-TOP-ONLY")
    if self.isCount:
        vals = DPLIB.getMeasuresCount(testSet[:, -1], vec)
        out.append(name + ":" + self.file + ": " + str(vals))
    else:
        tvals = DPLIB.getConfMatrix(testSet[:, -1], vec, top.bestCF)
        out.append("#CONF-TEST-" + name + ":" + self.file + ": " + str(tvals))
        vals = DPLIB.getMeasures(tvals)
        auc = DPLIB.getAUC(testSet[:, -1], vec)
        vals['auc'] = auc
        out.append(name + ":" + self.file + ": " + str(vals))
    for i in range(len(pop)):
        pop[i] = None
    pop = None
    for i in bins.keys():
        bins[i] = None
    bins = None
    time = Common.getCurrentTimeMil() - startTime
    if name.find("LSHTune") < 0:
        out.append("#TIME-FOR:" + name + ":" + self.file + ": " + str(time))
        out.append("#TIME-FOR-IS:" + name + ":" + self.file + ": " + str(spentIsTime))
    self.output += out
    top.addToExtra("SPENT-TIME-IS", float(spentIsTime))
    return top, out
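# A minimal sketch of the bucketing step above (illustrative; it only uses the
# LSHSuperBit constructor and hash() call that already appear in CreateBuckets,
# and assumes hash() returns a sequence of bucket ids, one per stage):
#
#   lsh = LSHSuperBit(stages=4, buckets=10, dimensions=trainSet.shape[1] - 1)
#   bins = {}
#   for row in trainSet:
#       binid = lsh.hash(row[1:].tolist())[0]   # first-stage bucket id
#       bins.setdefault(binid, []).append(row)
#   # each bin then becomes a candidate training set, ranked via the validation sets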
def run(self, trainSeti, testSeti, name, fout, vSets, vSetType, fixedTrainSize,
        log, ignoreOK, threshold, thresholds, rejectedFits, rejectedPerfs,
        rejectedTestPerfs, clfName):
    mad = 0.0
    if self.isCount:
        keySet = list(DPLIB.getMeasuresCount([0, 1, 2, 3], [0, 1, 2, 3]).keys())
        mad = DPLIB.SetBugCountForMut(trainSeti)
    else:
        keySet = list(DPLIB.getExtMeasures({
            "tp": 1, "tn": 2, "fp": 3, "fn": 4
        }).keys())
    startTime = Common.getCurrentTimeMil()
    tempTime = 0
    spentISTime = 0
    # For binary prediction (isCount == False)
    auc = 0.0
    preds = []
    pop = []
    trainSet = np.copy(trainSeti)
    testSet = np.copy(testSeti)
    tstSize = len(testSet)
    partSize = int(tstSize / self.numParts)
    diffs = []
    # For count prediction (isCount == True)
    actuals = []
    prrs = []
    if log:
        self.prnt("#GIS-OPTIONS;;For=" + name + "@" + ":iters=" + str(self.iters)
                  + "-POPSIZE=" + str(self.popSize) + "-NumParts=" + str(self.numParts)
                  + "-NumGens=" + str(self.numGens) + "-sizeTop=" + str(self.sizeTopP)
                  + "-Learner=" + clfName + "\n")
    isOK = True
    np.random.shuffle(testSet)
    self.FinalLearners = []
    self.FinalDatasets = []
    for p in range(self.numParts):
        diffp = []
        self.prnt("\n" + str(p) + ": ")
        tempTime = Common.getCurrentTimeMil()
        pop.clear()
        start = p * partSize
        end = (p + 1) * partSize
        if end > tstSize:
            end = tstSize
        if p == self.numParts - 1:
            end = tstSize
        testPart = testSet[start:end, :]
        spentISTime += Common.getCurrentTimeMil() - tempTime
        uinds = set()
        if vSets is None or len(vSets) == 0:
            if vSets is None:
                vSets = []
            vSet = None
            retVal = ""
            if vSetType == 'Train Set':
                vSet = trainSeti
                if log:
                    retVal = DPLIB.getStats(vSet, True, True, True)
                    self.prnt("#VSETINFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                    retVal = None
            elif vSetType == 'NN-Filter':
                tempTime = Common.getCurrentTimeMil()
                vSet = DPLIB.NNFilter(trainSet, testPart, 1)
                spentISTime += Common.getCurrentTimeMil() - tempTime
                if log:
                    retVal = DPLIB.getStats(vSet, True, True, True)
                    self.prnt("#VSETINFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                    retVal = None
            elif vSetType == 'Multiple Random' or vSetType == 'Single Random':
                # If random but not supplied, generate one randomly with the size of testPart.
                # (The original appended the sampled rows to `vSets`, leaving `vSet` empty;
                # appending to `vSet` is the intended behaviour.)
                size = len(testPart)
                vSet = []
                j = 0
                while j < size:
                    index = np.random.randint(len(trainSet))
                    if index not in uinds:
                        uinds.add(index)
                    else:
                        continue
                    vSet.append(trainSet[index])
                    j += 1
                vSet = np.array(vSet)
                if log:
                    retVal = DPLIB.getStats(vSet, True, True, True)
                    self.prnt("#VSETINFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                    retVal = None
            elif vSetType == '!!TEST!!':
                # Upper bound. Should not be used.
                self.prnt("Should not be used.")
                vSet = testSeti
                if log:
                    retVal = DPLIB.getStats(vSet, True, True, True)
                    self.prnt("#VSETINFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                    retVal = None
            elif vSetType == 'KS2':
                vSet = None
            vSets.append(vSet)
        else:
            retVal = ""
            for vSet in vSets:
                if log:
                    retVal = DPLIB.getStats(vSet, True, True, True)
                    self.prnt("#VSETINFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                    retVal = None
        for i in range(self.popSize):
            tempTime = Common.getCurrentTimeMil()
            uinds.clear()
            if fixedTrainSize:
                size = self.chrmSize
            else:
                size = np.random.randint(self.chrmSize) + 10
            while True:
                trSet = []
                j = 0
                while j < size:
                    index = np.random.randint(len(trainSet))
                    trSet.append(trainSet[index])
                    if index not in uinds:
                        uinds.add(index)
                    j += 1
                spentISTime += Common.getCurrentTimeMil() - tempTime
                trSet = np.array(trSet)
                if len(set(list(trSet[:, -1]))) >= 2:
                    break
            tempTime = Common.getCurrentTimeMil()
            pv, p_vals = DPLIB.checkSimilarity(trSet[:, :-1], testPart[:, :-1])
            if self.isCount:
                h = CHRM_GIS_Count(trSet, None, extraAsFitness='p-val')
            else:
                h = CHRM_GIS(trSet, None, None, extraAsFitness='p-val')
            h.addToExtra('p-val', sum(p_vals))
            pop.append(h)
            spentISTime += Common.getCurrentTimeMil() - tempTime
        tempTime = Common.getCurrentTimeMil()
        pop = DPLIB.MySort(pop)
        spentISTime += Common.getCurrentTimeMil() - tempTime
        cnt = 0
        g = 0
        for g in range(self.numGens):
            self.prnt(str(g) + " ")
            if log:
                pass
                # retVal = ""
                # for i in range(len(pop)):
                #     chrm = pop[i]
                #     retVal = DPLIB.getStats(chrm.ds, False, False, False)
                #     self.prnt("#POPITNFO;;gn=" + str(g) + ";;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                #     self.prnt("#POPITVALS;;gn=" + str(g) + ";;prt=" + str(p) + ";;For=" + name + "@" + ":"
                #               + "rpaf=" + str(chrm.fitness).replace(", ", ",")
                #               + ";;conf=" + str(chrm.conf).replace(", ", ",") + ";;fit=" + str(chrm.getFitness())
                #               + ";;TConf2=" + str(chrm.testConf).replace(", ", ",")
                #               + ";;TRpaf2=" + str(chrm.testFitness).replace(", ", ",") + "\n")
                #     retVal = None
            tempTime = Common.getCurrentTimeMil()
            newPop = []
            for i in range(self.sizeTopP):
                newPop.append(pop[i])
            for i in range(0, len(pop) - self.sizeTopP, 2):
                idx1 = 0
                idx2 = 0
                while idx1 == idx2:
                    if cnt >= 3:
                        idx1 = np.random.randint(len(pop))
                        idx2 = np.random.randint(len(pop))
                    else:
                        idx1 = GA.tornament(pop)
                        idx2 = GA.tornament(pop)
                    cnt += 1
                cnt = 0
                ds1 = pop[idx1].ds
                ds2 = pop[idx2].ds
                while True:
                    ds1, ds2 = GA.crossOver(ds1, ds2, fixedTrainSize, isCount=self.isCount)
                    if len(set(list(ds1[:, -1]))) >= 2 and len(set(list(ds2[:, -1]))) >= 2:
                        break
                    self.prnt('repeat cross')
                while True:
                    ds1 = GA.Mutate(ds1, isCount=self.isCount, mad=mad)
                    if len(set(list(ds1[:, -1]))) >= 2:
                        break
                    self.prnt('repeat mut ds1, because all elements are of one class')
                while True:
                    ds2 = GA.Mutate(ds2, isCount=self.isCount, mad=mad)
                    if len(set(list(ds2[:, -1]))) >= 2:
                        break
                    self.prnt('repeat mut ds2, because all elements are of one class')
                if self.isCount:
                    newPop.append(CHRM_GIS_Count(ds1, None, extraAsFitness='p-val'))
                    newPop.append(CHRM_GIS_Count(ds2, None, extraAsFitness='p-val'))
                else:
                    newPop.append(CHRM_GIS(ds1, None, None, extraAsFitness='p-val'))
                    newPop.append(CHRM_GIS(ds2, None, None, extraAsFitness='p-val'))
            spentISTime += Common.getCurrentTimeMil() - tempTime
            for i in range(len(newPop)):
                tempTime = Common.getCurrentTimeMil()
                pv, p_vals = DPLIB.checkSimilarity(newPop[i].ds[:, :-1], testPart[:, :-1])
                newPop[i].addToExtra('p-val', sum(p_vals))
                spentISTime += Common.getCurrentTimeMil() - tempTime
            tempTime = Common.getCurrentTimeMil()
            newPop = DPLIB.MySort(newPop)
            exit = False
            countComp = 0
            newPop, rdel = DPLIB.CombinePops(pop, newPop)
            if log:
                pass
                # retVal = ""
                # for i in range(len(rdel)):
                #     chrm = rdel[i]
                #     retVal = DPLIB.getStats(chrm.ds, False, False, False)
                #     self.prnt("#POPDELITNFO;;gn=" + str(g) + ";;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal
                #               + ";;rpaf=" + str(chrm.fitness).replace(", ", ",")
                #               + ";;conf=" + str(chrm.conf).replace(", ", ",") + ";;fit=" + str(chrm.getFitness())
                #               + ";;TConf2=" + str(chrm.testConf).replace(", ", ",")
                #               + ";;TRpaf2=" + str(chrm.testFitness).replace(", ", ",") + "\n")
                #     retVal = None
            rdel = None
            diff = abs(GA.GetMeanFittness(pop, countComp) - GA.GetMeanFittness(newPop, countComp))
            if diff < 0.000001:
                exit = True
            diffp.append(diff)
            pop = newPop
            if (pop[0].getFitness() > 0.0) and exit:
                break
            exit = False
            spentISTime += Common.getCurrentTimeMil() - tempTime
        w = []
        if self.count == 0:
            self.count = len(pop)
        for i in range(self.count):
            l = GLOB(clfName).getClassifier()
            tds = pop[i].ds
            self.FinalLearners.append(l)
            self.FinalDatasets.append(tds)
            testPartI = testPart
            l.buildClassifier(tds)
            if self.isCount:
                actual = DPLIB.getActuals(testPartI)
                prr = l.evaluateModel(testPartI)
                # vals = DPLIB.getMeasuresCount(actual, prr)
                if len(actuals) == self.count:
                    actuals[i] = actuals[i] + actual
                    prrs[i] = prrs[i] + prr
                else:
                    actuals.append(actual)
                    prrs.append(prr)
            else:
                vec = l.evaluateModel(testPartI)
                if len(preds) == self.count:
                    preds[i] += list(vec)
                else:
                    preds.append(list(vec))
            if log:
                pass
                # retVal = DPLIB.getStats(tds, True, True, True)
                # self.prnt("#TRPRTNFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                # retVal = DPLIB.getStats(testPart, True, True, True)
                # self.prnt("#TSTPRTNFO;;prt=" + str(p) + ";;For=" + name + "@" + ":" + retVal + "\n")
                # vals = DPLIB.getConfMatrix(testPart[:, -1], vec)
                # self.prnt("#TSTPRTVALS;;prt=" + str(p) + ";;For=" + name + "@" + ":"
                #           + "rpaf=" + str(DPLIB.getMeasures(vals)).replace(", ", ",")
                #           + ";;conf=" + str(vals).replace(", ", ",") + "\n")
                # retVal = None
            w.append(pop[i].getFitness())
        isOK = True
    if not isOK:
        pass
    else:
        thresholds.append(pop[0].getFitness())
    self.prnt()
    self.prnt("Best Top Fitness:" + str(pop[0].fitness))
    self.prnt("Best Fitness (mean):", pop[0].getMeanFitness())
    if self.isCount:
        vals = DPLIB.getMeasuresCountSet(actuals, prrs)
    else:
        vals1 = DPLIB.getConfMatrixSet(testSet[:, -1], preds)
        vals = DPLIB.getMeasures(vals1)
    if isOK:
        if not self.isCount:
            if len(preds) == 1:
                auc = DPLIB.getAUC(testSet[:, -1], preds[0])
            else:
                auc = DPLIB.getAUCSet(testSet[:, -1], preds)
            vals['auc'] = auc
            self.prnt()
            self.prnt("#CONF-TEST:" + name + ":" + self.file + ": " + str(vals1))
            self.prnt()
            self.prnt(name + ":" + self.file + ": " + str(vals))
            self.prnt()
        else:
            self.prnt()
            self.prnt(name + ":" + self.file + ": " + str(vals))
            self.prnt()
    else:
        bestI = pop[0]
        rejectedFits.append(bestI.getFitness())
        rejVals = copy.deepcopy(bestI.fitness)
        rejectedPerfs.append(rejVals)
        testRejVals = copy.deepcopy(vals)
        rejectedTestPerfs.append(testRejVals)
        self.prnt("#NOTOKPREDS----" + name + ":" + self.file + ": " + str(vals))
        if not self.isCount:
            self.prnt()
            self.prnt("#NOTOKPREDS----" + "#CONF-TEST:" + name + ":" + self.file + ": " + str(vals1))
    time = Common.getCurrentTimeMil() - startTime
    self.prnt("#TIME-FOR:" + name + ":" + self.file + ": " + str(time))
    self.prnt("#TIME-FOR-IS:" + name + ":" + self.file + ": " + str(spentISTime))
    return isOK
def run(self):
    lrnrnames = self.lrnrs
    gis = None
    lsh = None
    # NOTE: the original wrote to an undefined `fout`; all fout writes are
    # commented out, so no log handle is opened here.
    fout = None
    try:
        rnd = random.Random(Common.getCurrentTimeMil())
        if self.expType == 'GIS':
            if self.isKS:
                gis = GISKS2(self.pars, self.file)
            else:
                gis = GIS(self.pars)
            self.gis = gis
        elif self.expType == 'LSH':
            lsh = CPDP_LSH_Binary(self.pars, self.file)
        trainSetAll = DPLIB.LoadCSV(self.train, self.dp, self.features,
                                    convertToBinary=not self.isCount)
        testSetAll = DPLIB.LoadCSV(self.test, self.dp, self.features,
                                   convertToBinary=not self.isCount)
        ft = 'A'
        indi = None
        if not self.isCount:
            if self.pars['features'] == 'Iterative InfoGain Subsetting':
                ft = 'IG'
                indi = DPLIB.fSelectInfoGain(trainSetAll)
            if self.pars['features'] == 'All':
                print('All')
            if self.pars['features'] == 'PCA':
                ft = 'PCA'
                print('PCA')
                trainSetAll, testSetAll = DPLIB.applyPCA(trainSetAll, testSetAll, 0.95)
        for lk in range(len(lrnrnames)):
            lrnr = "-" + lrnrnames[lk]
            clfName = lrnrnames[lk]
            vSets = None
            if not self.isCount:
                if self.pars['features'] == 'Iterative InfoGain Subsetting':
                    indis2 = DPLIB.iterativeInfoGainSubsetting(trainSetAll, indi, clfName)
                    trainSetAll = DPLIB.fSelectSet(trainSetAll, indis2)
                    testSetAll = DPLIB.fSelectSet(testSetAll, indis2)
            if self.pars['vSetType'] in ['Single Random', 'Multiple Random']:
                vSets = DatasetUtils.getRandomDatasets()
            c = 0
            while c < self.iters:
                print("Start:" + self.file + ": " + str(c))
                print("====================================================")
                # fout.write("#ITERINFO:For File=" + self.file + "-Iter:" + str(c) + "\n")
                stages = None
                buckets = None
                sbtx = ""
                if self.expType == 'GIS':
                    self.doGIS(trainSetAll, testSetAll, "FIXED-VMUL-GEN-" + ft, lrnr,
                               fout, vSets, False, clfName, gis=gis)
                    gis.prnt('---------------------------------------\n')
                elif self.expType == 'LSH':
                    lsh.CreateBucketsTune(trainSetAll, testSetAll, vSets,
                                          name="LSHTune-ALL-TOP-SUPER" + sbtx + lrnr,
                                          testCut=self.pars['tunecut'], iternum=c, save=False,
                                          superbit=self.pars['lshType'] == 'SuperBit',
                                          clfName=clfName, tunelrn=self.pars['tunelrnr'])
                    lsh.prnt('---------------------------------------\n')
                c += 1
            # fout.write("===================================================================\n")
        # fout.close()
        print("File Processing Ended:" + self.file)
    except Exception as e:
        try:
            print(str(e))
            print(traceback.format_exc())
        except Exception as ex2:
            print("X2", str(ex2))
            print(traceback.format_exc())
    if self.expType == 'GIS':
        return gis
    elif self.expType == 'LSH':
        return lsh
def WCFolds(testSet, folds, file, fout, name, clfName):
    auc = 0
    preds = []
    actuals = []
    vals = None
    # Copy before shuffling: the original used a slice view, so shuffling
    # also reordered the caller's array.
    tssCopy = np.copy(testSet)
    rnd = random.Random(Common.getCurrentTimeMil())
    np.random.shuffle(tssCopy)
    skf = StratifiedKFold(n_splits=folds)
    X = tssCopy[:, :-1]
    y = tssCopy[:, -1]
    for train_index, test_index in skf.split(X, y):
        cvtrain, cvtest = X[train_index], X[test_index]
        cvtrainY, cvtestY = y[train_index], y[test_index]
        cvtrain = np.append(cvtrain, cvtrainY.reshape((len(cvtrainY), 1)), axis=1)
        cvtest = np.append(cvtest, cvtestY.reshape((len(cvtestY), 1)), axis=1)
        if name.lower().find("infogain") >= 0:
            # Feature selection is currently disabled; the original (Java) code was:
            # int indi[] = DPLIB.fSelectInfoGain(cvtrain);
            # if (DPLIB.useIterativeInfoGainSubsetting)
            #     indi = DPLIB.iterativeInfoGainSubsetting(cvtrain, indi, clfName);
            # else
            #     indi = DPLIB.getTopX(indi);
            # cvtrain = DPLIB.fSelectSet(cvtrain, indi);
            # cvtest = DPLIB.fSelectSet(cvtest, indi);
            pass
        m = GLOB(clfName).getClassifier()
        m.buildClassifier(cvtrain)
        vec = m.evaluateModel(cvtest)
        preds.append(vec)
        actuals.append(cvtestY)
        if vals is None:
            vals = DPLIB.getConfMatrix(cvtestY, vec)
        else:
            v2 = DPLIB.getConfMatrix(cvtestY, vec)
            for key in vals.keys():
                vals[key] += v2[key]
    auc = DPLIB.getAUCCV(actuals, preds)
    vals1 = DPLIB.getMeasures(vals)
    print(name + ":" + file + ": " + str(vals1) + " AUC = " + str(auc))
    fout.write("\n" + name + ":" + file + ": " + " AUC = " + str(auc) + ";" + "Vals=" + str(vals1))
def NNFilter(trainSeti, testSet, file, fout, name, vecin, count, clfName, tunelrn, vSets, testCut):
    startTime = Common.getCurrentTimeMil()
    spentISTime = 0
    tempTime = 0
    bestFit = 0.0
    bestCount = 0
    btrainSet = None
    cfbf = DPLIB.DefaultCF
    if count == 0:
        # Search the neighbour count k in 1..10 for the best validation fitness.
        for i in range(1, 11):
            tempTime = Common.getCurrentTimeMil()
            trainSet = DPLIB.NNFilter(trainSeti, testSet, i)
            spentISTime += Common.getCurrentTimeMil() - tempTime
            l = GLOB(clfName, tunelrn).getClassifier()
            if tunelrn:
                l = l.getTunedCLF(trainSet, vSets, fout, name, file)
            l.buildClassifier(trainSet)
            avgFit = 0.0
            for j in range(len(vSets)):
                vec = l.evaluateModel(vSets[j])
                tvals = DPLIB.getConfMatrix(vSets[j][:, -1], vec)
                measures = DPLIB.getExtMeasures(tvals)
                fit = measures["F"] * measures["GMean1"]
                avgFit += fit
            avgFit /= len(vSets)
            if avgFit > bestFit:
                bestFit = avgFit
                bestCount = i
                btrainSet = trainSet[:, :]
        if testCut:
            cf = 0
            trainSet = btrainSet
            l = GLOB(clfName, tunelrn).getClassifier()
            if tunelrn:
                l = l.getTunedCLF(trainSet, vSets, fout, name, file)
            l.buildClassifier(trainSet)
            avgFit = 0.0
            for j in range(len(vSets)):
                vec = l.evaluateModel(vSets[j])
                vCF = 0.1
                bestCF = 0
                bestCFVal = -1
                bestVals = None
                while True:
                    tvals = DPLIB.getConfMatrix(vSets[j][:, -1], vec, vCF)
                    measures = DPLIB.getExtMeasures(tvals)
                    fit = measures["F"] * measures["GMean1"]
                    if fit > bestCFVal or bestVals is None:
                        bestCFVal = fit
                        bestCF = vCF
                        bestVals = tvals
                    vCF += 0.1
                    if vCF >= 1:
                        break
                cf += bestCF
            # (The original read `cf /= vSets.size()`, a Java remnant.)
            cf /= len(vSets)
            cfbf = cf
    trainSet = None
    if count == 0:
        trainSet = btrainSet
    else:
        tempTime = Common.getCurrentTimeMil()
        trainSet = DPLIB.NNFilter(trainSeti, testSet, count)
        spentISTime = Common.getCurrentTimeMil() - tempTime
        bestCount = count
    l = GLOB(clfName, tunelrn).getClassifier()
    if tunelrn:
        l = l.getTunedCLF(trainSet, vSets, fout, name, file)
        print("#TUNE-LRN-PARAMS-" + name + ":" + file + ": " + str(l.selectedParams))
        fout.write("#TUNE-LRN-PARAMS-" + name + ":" + file + ": ")
        fout.write(str(l.selectedParams))
        fout.write("\n")
        sCheck = l.getCLFOptions()
        print("#SETSET-LRN-PARAMS-" + name + ":" + file + ": " + str(sCheck))
        fout.write("#SETSET-LRN-PARAMS-" + name + ":" + file + ": ")
        fout.write(str(sCheck))
        fout.write("\n")
    l.buildClassifier(trainSet)
    vec = l.evaluateModel(testSet)
    vecin = vec
    tvals = DPLIB.getConfMatrix(testSet[:, -1], vecin, cfbf)
    if count == 0:
        print("#BESTCOUNT-" + name + ":" + file + ": " + str(bestCount))
        fout.write("#BESTCOUNT-" + name + ":" + file + ": ")
        fout.write(str(bestCount))
        fout.write("\n")
        print("#BESTFIT-" + name + ":" + file + ": " + str(bestFit))
        fout.write("#BESTFIT-" + name + ":" + file + ": ")
        fout.write(str(bestFit))
        fout.write("\n")
    print("#CONF-TEST-" + name + ":" + file + ": " + str(tvals))
    fout.write("#CONF-TEST-" + name + ":" + file + ": ")
    fout.write(str(tvals))
    fout.write("\n")
    if testCut:
        print("#NN-BEST-CF-VALUE:" + name + ":" + file + ": " + str(cfbf))
        fout.write("#NN-BEST-CF-VALUE:" + name + ":" + file + ": ")
        fout.write(str(cfbf))
        fout.write("\n")
    vals = DPLIB.getMeasures(tvals)
    auc = DPLIB.getAUC(testSet[:, -1], vecin)
    print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
    fout.write(name + ":" + file + ": ")
    fout.write(str(vals))
    fout.write(" AUC = ")
    fout.write(str(auc))
    fout.write("\n")
    time = Common.getCurrentTimeMil() - startTime
    print("#TIME-FOR:" + name + ":" + file + ": " + str(time))
    fout.write("#TIME-FOR:" + name + ":" + file + ": " + str(time) + "\n")
    print("#TIME-FOR-IS:" + name + ":" + file + ": " + str(spentISTime))
    fout.write("#TIME-FOR-IS:" + name + ":" + file + ": " + str(spentISTime) + "\n")
    return vecin
for key in FutureBugs.keys():
    bds = sorted([bugDates[b] for b in FutureBugs[key]])
    if len(bds) > 1:
        timespans.append((datetime.datetime.fromtimestamp(bds[-1])
                          - datetime.datetime.fromtimestamp(bds[0])).total_seconds() / 86400)
        timespansNoz.append(timespans[-1])
        ids.append(key)
    else:
        timespans.append(0)

print('2 Timespan of Bugs: Mean, STD, Median ',
      np.mean(timespans), np.std(timespans), np.median(timespans))
print('2 Timespan of Bugs (for multibugs only): Mean, STD, Median ',
      np.mean(timespansNoz), np.std(timespansNoz), np.median(timespansNoz))
print('2 Count Multi Bug:', len(timespansNoz))
print('2 Count One or Multi Bug:', len(timespans))

mad = DPLIB.MAD(timespansNoz)
umad = mad + np.median(timespansNoz)
lmad = np.median(timespansNoz) - mad
if len(timespansNoz) > 0:
    print('2 MAD, Median, UMAD, Percent>UMAD, Median Filtered:',
          mad, np.median(timespansNoz), umad,
          '%.2f' % (100 * len([t for t in timespansNoz if t > umad]) / len(timespansNoz)),
          np.median([t for t in timespansNoz if t > umad]))
else:
    # With no multi-bug timespans, the percentage and the medians are undefined.
    print('2 MAD, Median, UMAD, Percent>UMAD, Median Filtered:',
          mad, 'UNDEF', 'UNDEF', 'PERCENT UNDEF', 'UNDEF')
# numfiles = [(key, len(logsd[int(key)][logcolnames.index('files')].split('---')[:-1]))
#             for index, key in enumerate(ids) if timespansNoz[index] > umad]
# print(numfiles)
if len(timespansNoz) > 0:
    try:
        pass
        # plt.figure(figsize=(10,10))
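# Worked example of the MAD-based upper threshold used above (toy numbers;
# DPLIB.MAD is assumed to compute the median absolute deviation):
#
#   spans  = [1, 2, 2, 3, 10]      # timespans in days
#   median = 2                     # np.median(spans)
#   mad    = 1                     # median of |1-2|,|2-2|,|2-2|,|3-2|,|10-2| = median(1,0,0,1,8)
#   umad   = mad + median = 3      # spans strictly above 3 count as outliers -> [10]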