Example #1
 def getBiBW(self, numIters, memSize):
     logging.debug("STATUS: begin BiBW test.")
     self.collectiveArgs.asyncOp = True
     # get bidirectional bandwidth
     biLatencyNS = []
     for _ in range(numIters):
         self.backendFuncs.sync_barrier(self.collectiveArgs)
         start = time.monotonic()
         for w in range(self.collectiveArgs.window):
             if self.collectiveArgs.global_rank == self.collectiveArgs.src_rank:
                 self.backendFuncs.isend(self.collectiveArgs,
                                         self.collectiveArgs.dst_rank,
                                         tag=w)
                 self.backendFuncs.irecv(self.collectiveArgs,
                                         self.collectiveArgs.dst_rank,
                                         tag=w + self.collectiveArgs.window)
             elif self.collectiveArgs.global_rank == self.collectiveArgs.dst_rank:
                 self.backendFuncs.irecv(self.collectiveArgs,
                                         self.collectiveArgs.src_rank,
                                         tag=w)
                 self.backendFuncs.isend(self.collectiveArgs,
                                         self.collectiveArgs.src_rank,
                                         tag=w + self.collectiveArgs.window)
         self.backendFuncs.complete_accel_ops(self.collectiveArgs)
         biLatencyNS.append(
             (time.monotonic() - start) * 1e9
        )  # keeping time in ns helps when dividing data by nanoseconds
     biLatencyNS = [lat / self.collectiveArgs.window for lat in biLatencyNS]
     biLatencyNS = np.mean(np.array(biLatencyNS))
     _, avgBiBW = comms_utils.getAlgBW(biLatencyNS, 2 * memSize, 1)
    logging.debug("STATUS: end BiBW test.")
     return avgBiBW
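Note that getAlgBW(biLatencyNS, 2 * memSize, 1) doubles the payload because each window slot moves memSize bytes in each direction. A worked example of that arithmetic with made-up numbers, assuming the bytes-per-nanosecond convention that the unit tests below pin down:

    # Hypothetical numbers, for illustration only.
    memSize = 1 * 1024**2                  # 1 MiB sent each way per window slot
    biLatencyNS = 250_000.0                # mean per-slot latency in ns
    avgBiBW = (2 * memSize) / biLatencyNS  # bytes / ns is numerically GB/s
    print(avgBiBW)                         # ~8.39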
Example #2
 def test_no_iterations(self):
     elapsedTimeNs = 30000
     dataSize = 90000 # bytes
     numIters = 0
     (avgIterNS, algBW) = comms_utils.getAlgBW(elapsedTimeNs, dataSize, numIters)
    # With no iterations, there is no average iteration time and no algBW.
    self.assertEqual(0.0, avgIterNS)
    self.assertEqual(0.0, algBW)
Example #3
File: comms.py Project: louisfeng/param
 def getUniBW(self, numIters, memSize):
    logger.debug(
        "STATUS: begin UniBW test with src_ranks=%s, dst_ranks=%s.",
        self.collectiveArgs.src_ranks, self.collectiveArgs.dst_ranks)
     self.collectiveArgs.asyncOp = True
     # get unidirectional bandwidth
     uniLatencyNS = []
     for _ in range(numIters):
         self.backendFuncs.sync_barrier(self.collectiveArgs)
         start = time.monotonic()
         for w in range(self.collectiveArgs.window):
             if self.collectiveArgs.global_rank in self.collectiveArgs.src_ranks:
                 idx = self.collectiveArgs.src_ranks.index(
                     self.collectiveArgs.global_rank)
                 self.backendFuncs.isend(self.collectiveArgs,
                                         self.collectiveArgs.dst_ranks[idx],
                                         tag=w)
             elif self.collectiveArgs.global_rank in self.collectiveArgs.dst_ranks:
                 idx = self.collectiveArgs.dst_ranks.index(
                     self.collectiveArgs.global_rank)
                 self.backendFuncs.irecv(self.collectiveArgs,
                                         self.collectiveArgs.src_ranks[idx],
                                         tag=w)
         self.backendFuncs.complete_accel_ops(self.collectiveArgs)
         uniLatencyNS.append(
             (time.monotonic() - start) * 1e9
        )  # keeping time in ns helps when dividing data by nanoseconds
     uniLatencyNS = [
         lat / self.collectiveArgs.window for lat in uniLatencyNS
     ]
     uniLatencyNS = np.mean(np.array(uniLatencyNS))
     _, avgUniBW = comms_utils.getAlgBW(uniLatencyNS, memSize, 1)
     logger.debug("STATUS: end UniBW test.")
     return avgUniBW
Example #4
 def test_iterations(self):
     elapsedTimeNs = 30000
     dataSize = 90000 # bytes
     numIters = 3
     (avgIterNS, algBW) = comms_utils.getAlgBW(elapsedTimeNs, dataSize, numIters)
     # avgIterNS = elapsedTimeNS / numIters = 10000
     self.assertEqual(10000.0, avgIterNS)
    # algBW = dataSize / avgIterNS = 9
     self.assertEqual(9.0, algBW)
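Taken together, these two tests pin down the contract of comms_utils.getAlgBW: zero iterations yields (0.0, 0.0); otherwise avgIterNS = elapsedTimeNS / numIters and algBW = dataSize / avgIterNS, i.e. bytes per nanosecond, which is numerically GB/s. A minimal sketch consistent with both tests (the actual implementation in param may differ in details):

    def getAlgBW(elapsedTimeNS, dataSize, numIters):
        # Sketch reconstructed from the tests above, not param's exact code.
        if numIters == 0:
            return (0.0, 0.0)  # no iterations: no avg time, no bandwidth
        avgIterNS = elapsedTimeNS / numIters  # ns per iteration
        algBW = dataSize / avgIterNS          # bytes/ns == GB/s
        return (avgIterNS, algBW)

    assert getAlgBW(30000, 90000, 0) == (0.0, 0.0)
    assert getAlgBW(30000, 90000, 3) == (10000.0, 9.0)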
Example #5
File: comms.py Project: pallab-zz/param
    def runColl(self, comm_fn=None, compute_fn=None):
        self.backendFuncs.complete_accel_ops(self.collectiveArgs, initOp=True)
        numElements = self.collectiveArgs.numElements
        # Initial warmup iters.
        for _ in range(self.collectiveArgs.numWarmupIters):
            if comm_fn is not None:
                self.collectiveArgs.waitObj.append(
                    comm_fn(self.collectiveArgs,
                            retFlag=self.collectiveArgs.asyncOp))
            if compute_fn is not None:
                for _ in range(self.collectiveArgs.numComputePerColl):
                    compute_fn(self.collectiveArgs)
            if not self.collectiveArgs.asyncOp:  # should be synchronous, do wait.
                self.backendFuncs.complete_accel_ops(self.collectiveArgs)
        self.backendFuncs.complete_accel_ops(
            self.collectiveArgs
        )  # should be done regardless of blocking or non-blocking.

        self.backendFuncs.barrier(self.collectiveArgs, "runcoll")

        # Measuring time.
        start = time.monotonic()  # available only in py3
        for _ in range(self.collectiveArgs.numIters):
            if comm_fn is not None:
                self.collectiveArgs.waitObj.append(
                    comm_fn(self.collectiveArgs,
                            retFlag=self.collectiveArgs.asyncOp))
            if compute_fn is not None:
                for _ in range(self.collectiveArgs.numComputePerColl):
                    # TODO: investigate the cache effect
                    # Flush the cache
                    # _ = torch.rand(6 * 1024 * 1024 // 4).float() * 2  # V100 6MB L2 cache
                    compute_fn(self.collectiveArgs)
            if not self.collectiveArgs.asyncOp:  # should be synchronous, do wait.
                self.backendFuncs.complete_accel_ops(self.collectiveArgs)

        self.backendFuncs.complete_accel_ops(self.collectiveArgs)
        end = time.monotonic()  # available only in py3
        # read the last element to ensure the collective won't be optimized away
        x = self.collectiveArgs.opTensor[numElements - 1].item()

        elapsedTimeNS = (
            end - start
        ) * 1e9  # keeping time in ns helps when dividing data by nanoseconds
        avgIterNS, algBW = comms_utils.getAlgBW(elapsedTimeNS,
                                                self.collectiveArgs.dataSize,
                                                self.collectiveArgs.numIters)
        busBW = self.backendFuncs.getBusBW(self.collectiveArgs.collective,
                                           algBW,
                                           self.collectiveArgs.world_size)
        memSize = self.backendFuncs.get_mem_size(self.collectiveArgs)

        self.backendFuncs.barrier(self.collectiveArgs, "runcoll2")
        return (avgIterNS, algBW, busBW, memSize, x)
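The busBW figure comes from backendFuncs.getBusBW, which rescales algBW by a collective-specific factor. A plausible sketch following the nccl-tests convention (2(n-1)/n for all-reduce, (n-1)/n for all-gather and reduce-scatter); the collective names and factors here are assumptions, not necessarily what param's backend implements:

    def getBusBW(collective, algBW, world_size):
        # nccl-tests style correction factors; hypothetical names/values,
        # not necessarily what param's backendFuncs.getBusBW implements.
        n = world_size
        factor = {
            "all_reduce": 2.0 * (n - 1) / n,
            "all_gather": (n - 1) / n,
            "reduce_scatter": (n - 1) / n,
        }.get(collective, 1.0)
        return algBW * factor

    print(getBusBW("all_reduce", 9.0, 8))  # 15.75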
Example #6
File: comms.py Project: pallab-zz/param
    def reportBenchTime(self, commsParams, allSizes, tensorList, results):
        self.collectiveArgs.collective = commsParams.collective
        self.collectiveArgs.numIters = 1  # commsParams.numIters

        print(
            "\n\tCOMMS-RES\tsize (B)\t num-elements\t Latency(us):p50\tp75\t\tp95\t algBW(GB/s)\t busBW(GB/s)"
        )
        for idx, curSize in enumerate(allSizes):
            if commsParams.backend == "xla":
                latencyAcrossRanks = torch.transpose(
                    tensorList.view(-1, len(allSizes)), 0, 1)[idx]
                latencyAcrossRanks = latencyAcrossRanks.cpu().detach().numpy()
            else:
                latencyAcrossRanks = []
                for curRankTensor in tensorList:
                    rank_lat = curRankTensor[idx].item()
                    latencyAcrossRanks.append(rank_lat)

                latencyAcrossRanks = np.array(latencyAcrossRanks)

            p50 = np.percentile(latencyAcrossRanks, 50)
            p75 = np.percentile(latencyAcrossRanks, 75)
            p95 = np.percentile(latencyAcrossRanks, 95)

            self.collectiveArgs.dataSize = curSize
            avgIterNS, algBW = comms_utils.getAlgBW(
                p50 * 1e3, self.collectiveArgs.dataSize,
                self.collectiveArgs.numIters)
            busBW = self.backendFuncs.getBusBW(self.collectiveArgs.collective,
                                               algBW,
                                               self.collectiveArgs.world_size)

            print("\tCOMMS-RES\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s" % (
                results[curSize]["memSize"],
                str("%d" % (results[curSize]["num_elements"])),
                str("%.1f" % (p50)),
                str("%.1f" % (p75)),
                str("%.1f" % (p95)),
                str("%.3f" % (algBW)),
                str("%.3f" % (busBW)),
            ))
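Note the unit conversion in getAlgBW's first argument: the percentile latencies are in microseconds (the header prints Latency(us)), so p50 * 1e3 converts them to nanoseconds before the bytes-per-nanosecond division. For example, with hypothetical numbers:

    p50_us = 120.0                     # hypothetical median latency in us
    dataSize = 4 * 1024**2             # hypothetical 4 MiB message
    algBW = dataSize / (p50_us * 1e3)  # bytes / ns == GB/s
    print(round(algBW, 3))             # 34.953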
Example #7
File: comms.py Project: manjugv/param
def reportBenchTime(collectiveArgs, commsParams, allSizes, tensorList,
                    results):
    collectiveArgs.collective = commsParams.collective
    collectiveArgs.numIters = 1  # commsParams.numIters

    print(
        "\n\tCOMMS-RES\tsize (B)\t num-elements\t Latency(us):p50\tp75\t\tp95\t algBW(GB/s)\t busBW(GB/s)"
    )
    for idx, curSize in enumerate(allSizes):
        latencyAcrossRanks = []
        for curRankTensor in tensorList:
            rank_lat = curRankTensor[idx].item()
            latencyAcrossRanks.append(rank_lat)

        latencyAcrossRanks = np.array(latencyAcrossRanks)
        p50 = np.percentile(latencyAcrossRanks, 50)
        p75 = np.percentile(latencyAcrossRanks, 75)
        p95 = np.percentile(latencyAcrossRanks, 95)

        collectiveArgs.dataSize = curSize
        avgIterNS, algBW = comms_utils.getAlgBW(p50 * 1e3,
                                                collectiveArgs.dataSize,
                                                collectiveArgs.numIters)
        busBW = collectiveArgs.backendFuncs.getBusBW(collectiveArgs.collective,
                                                     algBW,
                                                     collectiveArgs.world_size)

        print("\tCOMMS-RES\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s\t%12s" % (
            results[curSize]["memSize"],
            str("%d" % (results[curSize]["num_elements"])),
            str("%.1f" % (p50)),
            str("%.1f" % (p75)),
            str("%.1f" % (p95)),
            str("%.3f" % (algBW)),
            str("%.3f" % (busBW)),
        ))
Example #8
    def runColl(self, comm_fn=None, compute_fn=None, comm_fn_pair=None):
        self.backendFuncs.complete_accel_ops(self.collectiveArgs, initOp=True)
        numElements = self.collectiveArgs.numElements
        if comm_fn_pair is not None:
            numElements_pair = self.collectiveArgs.numElements_pair
        # Initial warmup iters.
        for _ in range(self.collectiveArgs.numWarmupIters):
            if comm_fn is not None:
                if self.collectiveArgs.num_pgs > 1:
                    self.collectiveArgs.group = self.collectiveArgs.groups[0]
                comm_fn(self.collectiveArgs)
            if comm_fn_pair is not None:
                if self.collectiveArgs.num_pgs > 1:
                    self.collectiveArgs.group = self.collectiveArgs.groups[1]
                comm_fn_pair(self.collectiveArgs, pair=True)
            if compute_fn is not None:
                for _ in range(self.collectiveArgs.numComputePerColl):
                    compute_fn(self.collectiveArgs)
            if not self.collectiveArgs.asyncOp:  # should be synchronous, do wait.
                self.backendFuncs.complete_accel_ops(self.collectiveArgs)

        self.backendFuncs.sync_barrier(self.collectiveArgs,
                                       desc="runColl_begin")

        # Measuring time.
        elapsedTimeNS = 0.0
        for _ in range(self.collectiveArgs.numIters):
            if not self.collectiveArgs.asyncOp:  # should be synchronous; do barrier and wait for the collective
                self.setTensorVal(
                    self.collectiveArgs.opTensor)  # reset tensor values
                if comm_fn_pair is not None:
                    self.setTensorVal(self.collectiveArgs.opTensor_pair)
                self.backendFuncs.sync_barrier(self.collectiveArgs)
            oldAsyncOp = self.collectiveArgs.asyncOp
            round_robin_group = cycle(self.collectiveArgs.groups)
            if comm_fn_pair is not None:
                self.collectiveArgs.asyncOp = True

            start = time.monotonic()  # available only in py3
            if comm_fn is not None:
                self.collectiveArgs.group = next(round_robin_group)
                comm_fn(self.collectiveArgs)
            if comm_fn_pair is not None:
                self.collectiveArgs.group = next(round_robin_group)
                comm_fn_pair(self.collectiveArgs, pair=True)
            if compute_fn is not None:
                for _ in range(self.collectiveArgs.numComputePerColl):
                    # TODO: investigate the cache effect
                    # Flush the cache
                    # _ = torch.rand(6 * 1024 * 1024 // 4).float() * 2  # V100 6MB L2 cache
                    compute_fn(self.collectiveArgs)
            self.collectiveArgs.asyncOp = oldAsyncOp
            if not self.collectiveArgs.asyncOp:  # should be synchronous, wait for the collective
                self.backendFuncs.complete_accel_ops(self.collectiveArgs)
            # Measuring time.
            elapsedTimeNS += (
                time.monotonic() - start
            ) * 1e9  # keeping time in ns helps when dividing data by nanoseconds

        start = time.monotonic()  # available only in py3
        self.backendFuncs.complete_accel_ops(self.collectiveArgs)
        end = time.monotonic()  # available only in py3
        if isinstance(self.collectiveArgs.opTensor, list):
            # allgather produces a list of tensors; read one element to
            # ensure the collective won't be optimized away
            x = self.collectiveArgs.opTensor[-1][-1].item()
        else:
            x = self.collectiveArgs.opTensor[numElements - 1].item()
        x_pair = None
        if comm_fn_pair is not None:
            if isinstance(self.collectiveArgs.opTensor_pair, list):
                # allgather produces a list of tensors
                x_pair = self.collectiveArgs.opTensor_pair[-1][-1].item()
            else:
                x_pair = self.collectiveArgs.opTensor_pair[
                    numElements_pair - 1].item()

        elapsedTimeNS += (
            end - start
        ) * 1e9  # keeping time in ns helps when dividing data by nanoseconds

        memSize = self.backendFuncs.get_mem_size(self.collectiveArgs)

        avgIterNS, algBW = comms_utils.getAlgBW(elapsedTimeNS, memSize,
                                                self.collectiveArgs.numIters)
        busBW = self.backendFuncs.getBusBW(self.collectiveArgs.collective,
                                           algBW,
                                           self.collectiveArgs.world_size)
        if comm_fn_pair is not None:
            memSize_pair = self.backendFuncs.get_mem_size(self.collectiveArgs,
                                                          pair=True)
            memSize += memSize_pair

            _, algBW_pair = comms_utils.getAlgBW(elapsedTimeNS, memSize_pair,
                                                 self.collectiveArgs.numIters)
            algBW += algBW_pair
            busBW_pair = self.backendFuncs.getBusBW(
                self.collectiveArgs.collective_pair, algBW_pair,
                self.collectiveArgs.world_size)
            busBW += busBW_pair

        self.backendFuncs.sync_barrier(self.collectiveArgs, "runColl_end")
        return (avgIterNS, algBW, busBW, memSize, x, x_pair)
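In pair mode both collectives are in flight during the same timed span, so the code charges each collective its own memSize against the shared elapsedTimeNS and then sums the two bandwidths. A worked example of that accounting, with made-up numbers:

    elapsedTimeNS = 2_000_000.0           # hypothetical total over numIters
    numIters = 10
    avgIterNS = elapsedTimeNS / numIters  # 200000 ns per iteration
    algBW = 8 * 1024**2 / avgIterNS       # main collective, 8 MiB: ~41.94
    algBW_pair = 2 * 1024**2 / avgIterNS  # pair collective, 2 MiB: ~10.49
    print(round(algBW + algBW_pair, 2))   # 52.43 combined, as reported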
Example #9
File: comms.py Project: louisfeng/param
    def runColl(self, comm_fn=None, compute_fn=None, comm_fn_pair=None):
        self.backendFuncs.complete_accel_ops(self.collectiveArgs, initOp=True)
        self.backendFuncs.sync_barrier(self.collectiveArgs,
                                       desc="runColl_begin")

        elapsedTimeNS = 0.0
        is_blocking = not self.collectiveArgs.asyncOp
        enable_comms = comm_fn is not None and comm_fn != self.backendFuncs.noop
        enable_compute = (compute_fn is not None
                          and compute_fn != self.backendFuncs.noop)
        enable_comms_pair = (comm_fn_pair is not None
                             and comm_fn_pair != self.backendFuncs.noop)

        # for comms pair mode, force async comms for overlapping evaluation
        if enable_comms_pair:
            self.collectiveArgs.asyncOp = True
        for nIter in range(self.collectiveArgs.numWarmupIters +
                           self.collectiveArgs.numIters):
            if nIter == self.collectiveArgs.numWarmupIters:
                # Start measuring time after warmup iterations
                elapsedTimeNS = 0.0
                self.collectiveArgs.quant_time.reset()
                self.collectiveArgs.dequant_time.reset()
            # reset tensor values for data validation check
            if enable_comms:
                self.setTensorVal(self.collectiveArgs.opTensor)
            # for blocking mode, do barrier before starting collective
            if is_blocking:
                self.backendFuncs.sync_barrier(self.collectiveArgs)

            start = time.monotonic()  # available only in py3
            self.collectiveArgs.group = self.backendFuncs.get_next_group()
            comm_fn(self.collectiveArgs)
            # post another collective in comms-pair mode; otherwise it's a noop
            self.collectiveArgs.group = self.backendFuncs.get_next_group()
            comm_fn_pair(self.collectiveArgs, pair=enable_comms_pair)

            if enable_compute:
                for _ in range(self.collectiveArgs.numComputePerColl):
                    # TODO: investigate the cache effect
                    # Flush the cache
                    # _ = torch.rand(6 * 1024 * 1024 // 4).float() * 2  # V100 6MB L2 cache
                    compute_fn(self.collectiveArgs)
            if is_blocking:  # should be synchronous, wait for the collective
                self.backendFuncs.complete_accel_ops(self.collectiveArgs)
            # Measuring time.
            elapsedTimeNS += (
                time.monotonic() - start
            ) * 1e9  # keeping time in ns helps when dividing data by nanoseconds

        start = time.monotonic()  # available only in py3
        self.backendFuncs.complete_accel_ops(self.collectiveArgs)
        end = time.monotonic()  # available only in py3

        ensureTensorFlush(self.collectiveArgs.opTensor)
        if enable_comms_pair:
            ensureTensorFlush(self.collectiveArgs.opTensor_pair)

        elapsedTimeNS += (
            end - start
        ) * 1e9  # keeping time in ns helps when dividing data by nanoseconds

        memSize = self.backendFuncs.get_mem_size(self.collectiveArgs)

        avgIterNS, algBW = comms_utils.getAlgBW(elapsedTimeNS, memSize,
                                                self.collectiveArgs.numIters)
        busBW = self.backendFuncs.getBusBW(
            self.collectiveArgs.collective,
            algBW,
            self.collectiveArgs,
        )
        if enable_comms_pair:
            memSize_pair = self.backendFuncs.get_mem_size(
                self.collectiveArgs, pair=enable_comms_pair)
            memSize += memSize_pair

            _, algBW_pair = comms_utils.getAlgBW(elapsedTimeNS, memSize_pair,
                                                 self.collectiveArgs.numIters)
            algBW += algBW_pair

            busBW += self.backendFuncs.getBusBW(
                self.collectiveArgs.collective_pair,
                algBW_pair,
                self.collectiveArgs,
            )

        self.backendFuncs.sync_barrier(self.collectiveArgs, desc="runColl_end")

        results = {
            "timeUS": avgIterNS / 1e3,
            "algBW": algBW,
            "busBW": busBW,
            "memSize": memSize,
        }
        return results
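Unlike the tuple-returning variants above, this version hands back a dict, with the average iteration time already converted to microseconds (timeUS = avgIterNS / 1e3). A caller might consume it like this (the field names match the code above; the values are hypothetical):

    # Hypothetical return value, shaped like the dict built above.
    results = {"timeUS": 200.0, "algBW": 41.94, "busBW": 73.4,
               "memSize": 8 * 1024**2}
    print("memSize=%d B  time=%.1f us  algBW=%.3f GB/s  busBW=%.3f GB/s" % (
        results["memSize"], results["timeUS"], results["algBW"],
        results["busBW"]))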