Example #1
def broad_func(node_count, am_partitions, inputs, rank, size, group):
    global device
    global comm_time
    global comp_time
    global scomp_time
    global bcast_comm_time
    global run

    # Number of input rows owned by each process
    n_per_proc = math.ceil(float(node_count) / size)

    # Output matrix for the local spmm results
    z_loc = torch.cuda.FloatTensor(am_partitions[0].size(0),
                                   inputs.size(1),
                                   device=device).fill_(0)

    # Receive buffer for each broadcast input block
    inputs_recv = torch.cuda.FloatTensor(n_per_proc,
                                         inputs.size(1),
                                         device=device).fill_(0)

    for i in range(size):
        # If this rank is the src rank for the bcast, send its local input block;
        # the last partition may be smaller, so resize the recv buffer for it
        if i == rank:
            inputs_recv = inputs.clone()
        elif i == size - 1:
            inputs_recv = torch.cuda.FloatTensor(am_partitions[i].size(1),
                                                 inputs.size(1),
                                                 device=device).fill_(0)

        tstart_comm = start_time(group, rank)

        # Broadcast the i-th input block to every rank
        dist.broadcast(inputs_recv, src=i, group=group)

        dur = stop_time(group, rank, tstart_comm)
        comm_time[run][rank] += dur
        bcast_comm_time[run][rank] += dur

        tstart_comp = start_time(group, rank)

        # Local spmm
        spmm_gpu(am_partitions[i].indices()[0].int(),
                 am_partitions[i].indices()[1].int(),
                 am_partitions[i].values(), am_partitions[i].size(0),
                 am_partitions[i].size(1), inputs_recv, z_loc)

        dur = stop_time(group, rank, tstart_comp)
        comp_time[run][rank] += dur
        scomp_time[run][rank] += dur

    return z_loc
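
Example #1 assumes am_partitions holds a 1-D column-block partition of the adjacency matrix, with the last block absorbing the remainder. A minimal sketch of how such a partition could be built from a coalesced COO tensor (split_coo_by_col is a hypothetical helper; the project may construct its partitions differently):

import math
import torch

def split_coo_by_col(adj_matrix, node_count, size):
    # Split a coalesced COO matrix into `size` column blocks of
    # ceil(node_count / size) columns each; the last block may be smaller.
    n_per_proc = math.ceil(float(node_count) / size)
    indices = adj_matrix.indices()
    values = adj_matrix.values()
    parts = []
    for i in range(size):
        start = i * n_per_proc
        end = min((i + 1) * n_per_proc, node_count)
        mask = (indices[1] >= start) & (indices[1] < end)
        part_idx = indices[:, mask].clone()
        part_idx[1] -= start  # re-index columns relative to the block
        parts.append(torch.sparse_coo_tensor(
            part_idx, values[mask],
            (adj_matrix.size(0), end - start)).coalesce())
    return parts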
Example #2
def split3dspmm_sparse(adj_matrix, inputs, rank, row, col, rank_c, size,
                       acc_per_rank, row_groups, col_groups, c_groups, height,
                       middim, width):

    proc_row = proc_row_size(size)
    proc_col = proc_col_size(size)
    proc_c = proc_c_size(size)

    # Compute the height, middim, and width for the local spmm
    height_per_proc = height // proc_row
    width_per_proc = width // proc_col

    middim_per_proc = middim // (proc_col * proc_c)
    device = torch.device('cuda:{}'.format(rank_to_devid(rank, acc_per_rank)))

    # Handle boundary conditions if this rank is in the last process row or column
    if row == proc_row - 1:
        height_per_proc = height - height_per_proc * (proc_row - 1)

    if col == proc_col - 1:
        width_per_proc = width - width_per_proc * (proc_col - 1)

    # Initialize output matrix for local spmm
    z_loc = torch.cuda.FloatTensor(height_per_proc,
                                   width_per_proc,
                                   device=device).fill_(0)

    # Determine column size to split output matrix after local spmm's
    chunk_sizes_col = []
    chunk_len = inputs.size(1) // proc_c
    for i in range(proc_c):
        if i == proc_c - 1:
            chunk_sizes_col.append(inputs.size(1) - chunk_len * (proc_c - 1))
        else:
            chunk_sizes_col.append(chunk_len)

    for k in range(proc_col):

        # Src ranks for the row and col bcasts
        row_src_rank = row * (proc_col * proc_c) + rank_c + k * proc_c
        col_src_rank = col * proc_row + rank_c + k * proc_c * proc_row

        # Determine middle dimension of matrices for local spmm
        middim_per_col = middim // proc_col
        if k == proc_col - 1:
            middim_per_col = middim - middim_per_col * (proc_col - 1)

        middim_per_proc = middim_per_col // proc_c
        if rank_c == proc_c - 1:
            middim_per_proc = middim_per_col - middim_per_proc * (proc_c - 1)

        if row_src_rank == rank:
            acol_indices_len = torch.cuda.LongTensor(
                [adj_matrix.indices().contiguous()[0].size(0)], device=device)
            acol_values_len = torch.cuda.LongTensor(
                [adj_matrix.values().contiguous().size(0)], device=device)
        else:
            acol_indices_len = torch.cuda.LongTensor([0], device=device)
            acol_values_len = torch.cuda.LongTensor([0], device=device)

        # Broadcast nnz across rows (necessary for row bcast)
        dist.broadcast(acol_indices_len, row_src_rank, row_groups[row][rank_c])

        acol_indices_len = acol_indices_len.item()  # nnz
        acol_values_len = acol_indices_len

        # Initialize new empty matrix for row bcast if this rank is not the src rank
        if row_src_rank == rank:
            acol_indices = adj_matrix.indices().contiguous().long()
            acol_values = adj_matrix.values().contiguous().float()
        else:
            acol_indices = torch.cuda.LongTensor(2,
                                                 acol_indices_len,
                                                 device=device).fill_(0)
            acol_values = torch.cuda.FloatTensor(acol_values_len,
                                                 device=device).fill_(0)

        # Pack indices and values into one dense buffer; make it contiguous
        # up front so the bcast fills the same tensor that is unpacked below
        acol = torch.cat((acol_indices.float(), acol_values.unsqueeze(0)),
                         dim=0).contiguous()

        # Row bcast
        dist.broadcast(acol, row_src_rank, row_groups[row][rank_c])

        acol_indices = acol[:2].long()
        acol_values = acol[2].squeeze(0)

        if row_src_rank == rank:
            acol = adj_matrix
        else:
            acol = sparse_coo_tensor_gpu(
                acol_indices, acol_values,
                torch.Size([height_per_proc, middim_per_proc]))

        # Initialize new empty matrix for col bcast if this rank is not the src rank
        if col_src_rank == rank:
            brow = inputs
        else:
            brow = torch.cuda.FloatTensor(middim_per_proc,
                                          width_per_proc,
                                          device=device)

        # Col bcast
        brow = brow.contiguous()
        dist.broadcast(brow, col_src_rank, col_groups[col][rank_c])

        # Local spmm
        spmm_gpu(acol_indices[0].int(), acol_indices[1].int(), acol_values,
                 height_per_proc, middim_per_proc, brow, z_loc)

    z_loc = z_loc.contiguous()

    # All-Reduce across third process grid dimension
    dist.all_reduce(z_loc, group=c_groups[int(rank // proc_c)])

    # Split the output of the all-reduce across third process grid dimension
    # Each rank only keeps its submatrix
    z_loc = torch.split(z_loc, chunk_sizes_col, dim=1)
    z_loc = z_loc[rank_c].contiguous()

    return z_loc
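
The final all-reduce indexes c_groups by rank // proc_c, which suggests one group per block of proc_c consecutive ranks along the third grid dimension. A sketch consistent with that indexing (build_c_groups is a hypothetical helper; dist.new_group must be called collectively by all ranks):

import torch.distributed as dist

def build_c_groups(size, proc_c):
    # Rank r lands in c_groups[r // proc_c].
    return [dist.new_group(list(range(g * proc_c, (g + 1) * proc_c)))
            for g in range(size // proc_c)]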
Example #3
def broad_func(node_count, am_partitions, inputs, rank, size, row_groups,
               col_groups, group):
    global device
    global comm_time
    global comp_time
    global scomp_time
    global bcast_comm_time
    global bcast_words
    global reduce_comm_time
    global run
    global replication

    # Number of input rows owned by each process column
    n_per_proc = math.ceil(float(node_count) / (size / replication))

    # Output matrix for the local spmm results
    z_loc = torch.cuda.FloatTensor(am_partitions[0].size(0),
                                   inputs.size(1),
                                   device=device).fill_(0)

    # Receive buffer for each broadcast input block
    inputs_recv = torch.cuda.FloatTensor(n_per_proc,
                                         inputs.size(1),
                                         device=device).fill_(0)

    rank_c = rank // replication
    rank_col = rank % replication

    stages = size // (replication**2)
    if rank_col == replication - 1:
        stages = (size // replication) - (replication - 1) * stages

    for i in range(stages):
        # Compute src rank in bcast
        q = (rank_col * (size //
                         (replication**2)) + i) * replication + rank_col

        q_c = q // replication

        am_partid = rank_col * (size // replication**2) + i

        # If this rank is the src rank for the bcast, send its local input block;
        # the last partition may be smaller, so resize the recv buffer for it
        if q == rank:
            inputs_recv = inputs.clone()
        elif q_c == size // replication - 1:
            inputs_recv = torch.cuda.FloatTensor(
                am_partitions[am_partid].size(1),
                inputs.size(1),
                device=device).fill_(0)

        tstart_comm = start_time(col_groups[rank_col], rank)

        inputs_recv = inputs_recv.contiguous()
        bcast_words[run][rank] += inputs_recv.size(0) * inputs_recv.size(1)
        dist.broadcast(inputs_recv, src=q, group=col_groups[rank_col])

        dur = stop_time(col_groups[rank_col], rank, tstart_comm)

        comm_time[run][rank] += dur
        bcast_comm_time[run][rank] += dur

        tstart_comp = start_time(col_groups[rank_col], rank)

        # Local spmm
        spmm_gpu(am_partitions[am_partid].indices()[0].int(),
                 am_partitions[am_partid].indices()[1].int(),
                 am_partitions[am_partid].values(),
                 am_partitions[am_partid].size(0),
                 am_partitions[am_partid].size(1), inputs_recv, z_loc)

        dur = stop_time(col_groups[rank_col], rank, tstart_comp)
        comp_time[run][rank] += dur
        scomp_time[run][rank] += dur

    z_loc = z_loc.contiguous()

    tstart_comm = start_time(row_groups[rank_c], rank)
    # Sum partial products across the replication dimension
    dist.all_reduce(z_loc, op=dist.ReduceOp.SUM, group=row_groups[rank_c])
    dur = stop_time(row_groups[rank_c], rank, tstart_comm)

    comm_time[run][rank] += dur
    reduce_comm_time[run][rank] += dur

    return z_loc
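
Examples #3 and #4 index col_groups by rank_col = rank % replication and row_groups by rank_c = rank // replication. A sketch of a group layout consistent with that indexing (build_15d_groups is a hypothetical name; every rank must execute all dist.new_group calls in the same order):

import torch.distributed as dist

def build_15d_groups(size, replication):
    # Column groups: ranks sharing rank_col = rank % replication.
    col_groups = [dist.new_group([r for r in range(size)
                                  if r % replication == c])
                  for c in range(replication)]
    # Row groups: blocks of `replication` consecutive ranks sharing
    # rank_c = rank // replication.
    row_groups = [dist.new_group(list(range(g * replication,
                                            (g + 1) * replication)))
                  for g in range(size // replication)]
    return row_groups, col_groups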
Example #4
File: nccl_ex.py Project: alokpathy/spmm
def dspmm(node_count, am_partitions, inputs, rank, size, replication,
          row_groups, col_groups, group, device):
    global comm_time
    global comp_time
    global bcast_comm_time
    global reduce_comm_time

    n_per_proc = math.ceil(float(node_count) / (size / replication))

    z_loc = torch.cuda.FloatTensor(am_partitions[0].size(0),
                                   inputs.size(1),
                                   device=device).fill_(0)

    inputs_recv = torch.cuda.FloatTensor(n_per_proc,
                                         inputs.size(1),
                                         device=device).fill_(0)

    rank_c = rank // replication  # effectively row-rank
    rank_col = rank % replication

    stages = size // (replication**2)
    if rank_col == replication - 1:
        stages = (size // replication) - (replication - 1) * stages

    for i in range(stages):
        # Compute src rank in bcast
        q = (rank_col * (size //
                         (replication**2)) + i) * replication + rank_col

        q_c = q // replication

        am_partid = rank_col * (size // replication**2) + i

        # If this rank is the src rank for bcast, set inputs_recv to the local matrix
        # Else, instantiate a new empty matrix
        if q == rank:
            inputs_recv = inputs.clone()
        elif q_c == size // replication - 1:
            inputs_recv = torch.cuda.FloatTensor(
                am_partitions[am_partid].size(1),
                inputs.size(1),
                device=device).fill_(0)

        inputs_recv = inputs_recv.contiguous()
        tstart_comm = start_time(col_groups[rank_col], rank)
        dist.broadcast(inputs_recv, src=q, group=col_groups[rank_col])
        dur = stop_time(col_groups[rank_col], rank, tstart_comm)

        comm_time[rank] += dur
        bcast_comm_time[rank] += dur

        tstart_comp = start_time(col_groups[rank_col], rank)

        # Local spmm
        spmm_gpu(am_partitions[am_partid].indices()[0].int(),
                 am_partitions[am_partid].indices()[1].int(),
                 am_partitions[am_partid].values(),
                 am_partitions[am_partid].size(0),
                 am_partitions[am_partid].size(1), inputs_recv, z_loc)

        dur = stop_time(col_groups[rank_col], rank, tstart_comp)
        comp_time[rank] += dur

    z_loc = z_loc.contiguous()

    tstart_comm = start_time(row_groups[rank_c], rank)
    # Sum partial products across the replication dimension
    dist.all_reduce(z_loc, op=dist.ReduceOp.SUM, group=row_groups[rank_c])
    dur = stop_time(row_groups[rank_c], rank, tstart_comm)

    comm_time[rank] += dur
    reduce_comm_time[rank] += dur

    return z_loc
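
The timing helpers start_time and stop_time are defined elsewhere in the project. A plausible sketch, assuming they synchronize the group and the GPU so per-phase timings are comparable across ranks (the bodies below are an assumption, not the project's actual implementation):

import time
import torch
import torch.distributed as dist

def start_time(group, rank):
    # Barrier so every rank in the group starts its timer together.
    dist.barrier(group)
    return time.time()

def stop_time(group, rank, tstart):
    # Drain pending GPU work, re-synchronize, and return the elapsed time.
    torch.cuda.synchronize()
    dist.barrier(group)
    return time.time() - tstart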
Example #5
def summa_sparse(adj_matrix, inputs, rank, row, col, size, acc_per_rank,
                 row_groups, col_groups, height, middim, width):

    proc_row = proc_row_size(size)
    proc_col = proc_col_size(size)

    # Compute the height, middim, and width for the local spmm
    height_per_proc = height // proc_row
    width_per_proc = width // proc_col

    middim_per_proc = middim // proc_col
    device = torch.device('cuda:{}'.format(rank_to_devid(rank, acc_per_rank)))

    # Handle boundary conditions if this rank is in the last process row or column
    if row == proc_row - 1:
        height_per_proc = height - height_per_proc * (proc_row - 1)

    if col == proc_col - 1:
        width_per_proc = width - width_per_proc * (proc_col - 1)

    # Initialize output matrix for local spmm
    z_loc = torch.cuda.FloatTensor(height_per_proc,
                                   width_per_proc,
                                   device=device).fill_(0)

    for k in range(proc_col):

        row_src_rank = k + proc_col * row  # src rank for row bcast
        col_src_rank = k * proc_col + col  # src rank for col bcast

        # Determine middle dimension of matrices for local spmm
        if k == proc_col - 1:
            middim_per_proc = middim - middim_per_proc * (proc_col - 1)

        if row_src_rank == rank:
            acol_indices_len = torch.cuda.LongTensor(
                [adj_matrix.indices().contiguous()[0].size(0)], device=device)
            acol_values_len = torch.cuda.LongTensor(
                [adj_matrix.values().contiguous().size(0)], device=device)
        else:
            acol_indices_len = torch.cuda.LongTensor([0], device=device)
            acol_values_len = torch.cuda.LongTensor([0], device=device)

        # Broadcast nnz across rows (necessary for row bcast)
        dist.broadcast(acol_indices_len, row_src_rank, row_groups[row])

        acol_indices_len = acol_indices_len.item()  # nnz
        acol_values_len = acol_indices_len

        # Initialize new empty matrix for row bcast if this rank is not the src rank
        if row_src_rank == rank:
            acol_indices = adj_matrix.indices().contiguous().long()
            acol_values = adj_matrix.values().contiguous().float()
        else:
            acol_indices = torch.cuda.LongTensor(2,
                                                 acol_indices_len,
                                                 device=device).fill_(0)
            acol_values = torch.cuda.FloatTensor(acol_values_len,
                                                 device=device).fill_(0)

        acol = torch.cat((acol_indices.float(), acol_values.unsqueeze(0)),
                         dim=0).contiguous()

        # Row bcast
        dist.broadcast(acol, row_src_rank, row_groups[row])

        acol_indices = acol[:2].long()
        acol_values = acol[2].squeeze(0)

        if row_src_rank == rank:
            acol = adj_matrix
        else:
            acol = sparse_coo_tensor_gpu(
                acol_indices, acol_values,
                torch.Size([height_per_proc, middim_per_proc]))

        # Initialize new empty matrix for col bcast if this rank is not the src rank
        if col_src_rank == rank:
            brow = inputs
        else:
            brow = torch.cuda.FloatTensor(middim_per_proc,
                                          width_per_proc,
                                          device=device)

        # Col bcast
        brow = brow.contiguous()
        dist.broadcast(brow, col_src_rank, col_groups[col])

        # Local spmm
        spmm_gpu(acol_indices[0].int(), acol_indices[1].int(), acol_values,
                 height_per_proc, middim_per_proc, brow, z_loc)

    return z_loc
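
summa_sparse relies on proc_row_size, proc_col_size, and rank_to_devid from the surrounding project. A minimal sketch under the assumptions of a square process grid and round-robin GPU assignment (both are assumptions, not confirmed by these snippets):

import math

def proc_row_size(size):
    # Assumes a square sqrt(size) x sqrt(size) process grid.
    return int(math.sqrt(size))

def proc_col_size(size):
    return int(math.sqrt(size))

def rank_to_devid(rank, acc_per_rank):
    # Round-robin mapping of ranks onto the acc_per_rank local GPUs.
    return rank % acc_per_rank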