Example #1
0
    def condense(self):
        """Condense samples into randomly grouped, summed pseudo-samples.

        For every group size in self.args.size, samples sharing the same
        'groups'-key option are shuffled and split into chunks of that size;
        each chunk's values are summed column-wise and appended to the
        original data as new "<option>@<n>" samples.  Two Datasets are pushed
        onto self.results per size: the condensed counts and a matching
        sample-key annotation table.
        """
        # D: input dataset; key: the 'groups' sample key; kName: the first
        # name registered under that key (the only one used here).
        D, key, kName = self.args.dataset, self.args.n_keys[
            'groups'], self.args.n_keys['groups'].names[0]
        # kData: one [option, 1, 'NO'] annotation row per original sample
        # ('NO' = not a condensed group).  kOpts: unique option labels k[2]
        # taken from key rows whose first field is truthy.
        # NOTE(review): kType is assigned but never read (dead local).
        kData, kType, kOpts = [
            [k[2], 1, 'NO'] for k in key.data[kName]
        ], key.types[kName], list(set([k[2] for k in key.data[kName] if k[0]]))

        if not key.default:
            # groups[j]: sample indices whose option label equals kOpts[j].
            groups = [[
                i for i in range(len(kData)) if kData[i][0] == kOpts[j]
            ] for j in range(len(kOpts))]

        else:
            # Keys carrying a default value are unsupported: dump and exit.
            # (Python 2 print statement; sys presumed imported at module
            # level — TODO confirm.)
            print key.default
            sys.exit()

        for size in self.args.size:

            # Raise size to at least min(half the smallest group, 2) so every
            # group yields usable chunks.  grps is a SHALLOW copy of groups —
            # NOTE(review): random.shuffle below therefore mutates the inner
            # lists shared with groups across loop iterations.
            size, grps = max(size,
                             min(min([int(len(g) / 2.0) for g in groups]),
                                 2)), [g for g in groups]

            # Shuffle each option's sample indices, then chunk them into
            # consecutive slices of length `size` (last slice may be shorter).
            for i, group in enumerate(grps):
                random.shuffle(group)
                grps[i] = [
                    group[j:j + size] for j in range(0, len(group), size)
                ]

            # New sample names: originals followed by "<option>@<chunk-no>".
            n = [x for x in D.n] + [
                a for b in
                [[kOpts[j] + '@' + str(i + 1) for i in range(len(grps[j]))]
                 for j in range(len(kOpts))] for a in b
            ]
            # Key rows: originals plus [option, chunk-size, 'YES'] per chunk.
            gk = kData + [
                a for b in [[[kOpts[j], len(g), 'YES'] for g in grps[j]]
                            for j in range(len(kOpts))] for a in b
            ]
            # Each value row gains one summed entry per chunk (sum of that
            # row's values at the chunk's sample indices); j is unused.
            gv = [
                v + [
                    a for b in [[sum([v[x] for x in g]) for g in grps[i]]
                                for i in range(len(grps))] for a in b
                ] for j, v in enumerate(D.v)
            ]
            # Output file prefix and dataset name carry the chunk size.
            gP, gN = 'condense' + str(size) + '.', D.N + '_c' + str(size)
            #gP,gN = ".".join([D.name.split(".")[0],'condense',kName,str(size),".".join(D.name.split(".")[1:-1])]),D.N+'.'+str(size)+'.condense'
            # Optional log(x+1) transform; `log` presumed imported at module
            # level (e.g. from math) — TODO confirm.
            if self.args.log: gv = [[log(x + 1) for x in X] for X in gv]

            # Condensed counts dataset, tagged with its parent dataset name.
            self.results.append(
                Dataset(gP + D.V).create(D.M, gN, D.V, D.m, n, gv))
            self.results[-1].parents['N'].append(D.N)
            # Companion key table describing each original/condensed sample:
            # columns [kName, size, condensed].
            self.results.append(
                Dataset(gP + 'key', False,
                        'hide').create(gN, 'info', 'anno', n,
                                       [kName, 'size', 'condensed'], gk, gN))
Example #2
0
    def rpm_normalize(self):
        """Normalize counts to a common per-sample depth (RPM-style).

        Each value v[j][i] is scaled by constant / total(sample i), where
        `constant` is the mean of all per-sample totals, so every sample
        column sums to roughly the same depth.  Stores the scaled matrix in
        self.rpm_vals and appends a new 'rpm-cnts' Dataset to self.results.
        """
        D = self.args.dataset

        # Per-sample totals: sum of column i across all rows (features).
        # (The original also built an unused transposed copy of the whole
        # value matrix here; that dead O(n*m) work is removed.)
        nTotals = [
            float(sum([D.v[j][i] for j in range(len(D.m))]))
            for i in range(len(D.n))
        ]

        # Common target depth: the mean of the per-sample totals.
        constant = sum(nTotals) / float(len(nTotals))

        # Scale every value; +0.01 guards against division by zero for an
        # all-zero sample column.
        self.rpm_vals = [[
            (D.v[j][i] * constant) / (nTotals[i] + 0.01)
            for i in range(len(D.n))
        ] for j in range(len(D.m))]

        # Output name records whether a log transform is in effect.
        if self.args.log: name = 'rpm.log.cnts'
        else: name = 'rpm.raw.cnts'

        self.results.append(
            Dataset(name, False,
                    self.transform).create(D.M, D.N, 'rpm-cnts',
                                           D.m, D.n, self.rpm_vals))
        self.talk("RPM Normalization Complete\n")
Example #3
0
    def produce_quantiles(self):
        """Package the precomputed quantile-normalized values into a Dataset.

        Reads self.quantile_vals (computed elsewhere), wraps it in a new
        'quantile-cnts' Dataset mirroring the input dataset's labels, appends
        it to self.results, and reports completion.
        """
        # Output name records whether a log transform is in effect.
        name = 'quantile.log.cnts' if self.args.log else 'quantile.raw.cnts'

        src = self.args.dataset
        quantile_dataset = Dataset(name, False, self.transform).create(
            src.M, src.N, 'quantile-cnts',
            src.m, src.n, self.quantile_vals)
        self.results.append(quantile_dataset)

        self.talk("Quantile Normalization Complete\n")
Example #4
0
    def downsample(self):
        """Randomly downsample every sample's counts to a common target total.

        The target is self.args.size[0]; if that is 0, the 10th-percentile
        sample total is used instead.  For each sample, `target` read
        positions are drawn (re-drawing in rounds, so samples smaller than
        the target are effectively upsampled) and tallied per feature; the
        tallies become a new Dataset appended to self.results.
        """
        # Flatten the size option to a scalar; nKey will map
        # sample-name -> {feature-name -> downsampled count}.
        D, nKey, self.args.size = self.args.dataset, {}, self.args.size[0]

        # nVals[i]: full count vector of sample i (transpose of D.v).
        nVals = [[
            self.args.dataset.v[j][i] for j in range(len(self.args.dataset.m))
        ] for i in range(len(self.args.dataset.n))]

        # (total, sample-name) pairs, ascending by total.
        nTotals = sorted([(int(
            sum([
                self.args.dataset.v[j][i]
                for j in range(len(self.args.dataset.m))
            ])), self.args.dataset.n[i])
                          for i in range(len(self.args.dataset.n))])
        if self.args.size == 0:
            # No target supplied: use the 10th-percentile sample total.
            self.args.size = int(nTotals[int(len(nTotals) / 10.0)][0])

            # Percentile cut points into the sorted totals (only perc25/75
            # are used below; the rest are computed but unused).
            nLen = float(len(nTotals))
            perc5, perc10, perc20, perc25, perc75, perc80, perc90, perc95 = int(
                nLen * 0.05), int(nLen * 0.10), int(nLen * 0.20), int(
                    nLen * 0.25), int(nLen * 0.75), int(nLen * 0.8), int(
                        nLen * 0.9), int(nLen * 0.95)

            # Inter-quartile (25th-75th percentile) slice of the totals.
            nFloats = [float(N[0]) for N in nTotals][perc25:perc75]

            # Python 2 debug print left in by the author.
            print nFloats[0:20]

            # NOTE(review): nStd is the sample VARIANCE (no sqrt applied);
            # both values are only printed, never used downstream.
            nMean = sum(nFloats) / len(nFloats)
            nStd = (sum([(nf - nMean) * (nf - nMean)
                         for nf in nFloats]) / (len(nFloats) - 1.0))

            print nMean, nStd

            self.talk("No minimum size supplied - will use 10th percentile: " +
                      str(self.args.size) + "\n")
        else:
            self.talk("Using supplied minimum downsample value: " +
                      str(self.args.size) + "\n")

        # Warn about samples whose total is under a fifth of the target —
        # they will be drawn from repeatedly.  ("heavilty" is the author's
        # typo, preserved because it is a runtime string.)
        nSmall = [x for x in nTotals if x[0] * 5 < self.args.size]
        if len(nSmall) > 0:
            nList = ",".join([x[1] for x in nSmall])
            self.talk(
                "Warning: Outlier samples with low counts will be heavilty upsampled: "
                + nList + "\n")

        for i, V in enumerate(nVals):
            # idx maps cumulative read-position ranges to feature names:
            # entry [(lo, hi), feature] owns draws with lo < s <= hi.
            # dd presumed to be collections.defaultdict — TODO confirm.
            idx, iRand, nKey[D.n[i]], k = [[(0, 0), 'INIT']], [], dd(int), 0
            for j, v in enumerate(V):
                if v >= 1:
                    idx.append([(idx[-1][0][1], idx[-1][0][1] + int(v)),
                                D.m[j]])
            # NOTE(review): the +1 extends the draw range one past the total,
            # and a draw of position 0 is attributed to the 'INIT' sentinel
            # (silently dropped when tallies are read back by feature name).
            iRange, iLen = range(idx[-1][0][1] + 1), len(
                range(idx[-1][0][1] + 1))

            # Draw without replacement per round, repeating rounds until the
            # target count is reached (this is what upsamples small samples).
            while len(iRand) < self.args.size:
                iRand += random.sample(iRange,
                                       min(iLen, self.args.size - len(iRand)))
            # Walk the sorted draws through the cumulative ranges, adding one
            # count per draw to the owning feature.
            for s in sorted(iRand):
                while s > idx[k][0][1]:
                    k += 1
                nKey[D.n[i]][idx[k][1]] += 1
        # Reassemble a features-by-samples value matrix from the tallies.
        vals = [[nKey[s][D.m[j]] for s in D.n] for j in range(len(D.m))]

        # Output name records the target size and any log transform.
        if self.args.log: name = 'ds.' + str(self.args.size) + '.log.cnts'
        else: name = 'ds.' + str(self.args.size) + '.cnts'

        # NOTE(review): 'dowsampled-cnts' is misspelled but may be matched
        # downstream, so it is left unchanged here.
        self.results.append(
            Dataset(name, False,
                    self.transform).create(D.M, D.N, 'dowsampled-cnts', D.m,
                                           D.n, vals))
        self.talk("Downsampling Complete\n")