Example #1
def MetropolisCuda(sigma, T, J, B, iterations, ParaStyle, Alu, Device):

    # With PyCUDA autoinit, nothing more to do!

    sigmaCU = cuda.InOut(sigma)

    mod = SourceModule(KERNEL_CODE_CUDA)

    MetropolisCU = mod.get_function("MainLoopOne")

    start = pycuda.driver.Event()
    stop = pycuda.driver.Event()

    SizeX, SizeY = sigma.shape

    start.record()
    start.synchronize()
    MetropolisCU(sigmaCU,
                 numpy.float32(T),
                 numpy.float32(J),
                 numpy.float32(B),
                 numpy.uint32(SizeX),
                 numpy.uint32(SizeY),
                 numpy.uint32(iterations),
                 numpy.uint32(nprnd(2**31 - 1)),
                 numpy.uint32(nprnd(2**31 - 1)),
                 grid=(1, 1),
                 block=(1, 1, 1))

    print "%s with %i %s done" % (Alu, 1, ParaStyle)

    stop.record()
    stop.synchronize()

    #elapsed = stop.time_since(start)*1e-3
    elapsed = start.time_till(stop) * 1e-3

    return (elapsed)
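
This example relies on names defined elsewhere in the script (KERNEL_CODE_CUDA, nprnd, the PyCUDA modules). A minimal sketch of the assumed imports, with nprnd guessed as numpy.random.randint:

import numpy
import pycuda.autoinit                      # creates and activates a default CUDA context
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
from numpy.random import randint as nprnd   # assumed: nprnd(n) returns a random int in [0, n)
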
Example #2
def MetropolisOpenCL(sigma, T, J, B, iterations, ParaStyle, Alu, Device):

    # Initialize the variables, CASTing them correctly

    # Detect a GPU device in the list of devices
    # for platform in cl.get_platforms():
    #     for device in platform.get_devices():
    #             if cl.device_type.to_string(device.type)=='GPU':
    #                     GPU=device
    #print "GPU detected: ",device.name

    HasGPU = False
    Id = 1
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            #deviceType=cl.device_type.to_string(device.type)
            deviceType = "xPU"
            if Id == Device and not HasGPU:
                GPU = device
                print "CPU/GPU selected: ", device.name
                HasGPU = True
            Id = Id + 1

    # Create the context and the command queue for execution
    # ctx = cl.create_some_context()
    ctx = cl.Context([GPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Get the available flags for the buffers
    mf = cl.mem_flags

    # Watch the CAST! It is an int8, i.e. a char in OpenCL!
    sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=sigma)

    MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
        options = "-cl-mad-enable -cl-fast-relaxed-math")

    SizeX, SizeY = sigma.shape

    if ParaStyle == 'Blocks':
        # Call OpenCL kernel
        # (1,) is the global work size (a single work-item)
        # None lets the runtime choose the local work size
        CLLaunch = MetropolisCL.MainLoopOne(queue, (1, ), None, sigmaCL,
                                            numpy.float32(T), numpy.float32(J),
                                            numpy.float32(B),
                                            numpy.uint32(SizeX),
                                            numpy.uint32(SizeY),
                                            numpy.uint32(iterations),
                                            numpy.uint32(nprnd(2**31 - 1)),
                                            numpy.uint32(nprnd(2**31 - 1)))
        print "%s with %i %s done" % (Alu, 1, ParaStyle)
    else:
        # In OpenCL, the global_id must be set identical to the local_id
        CLLaunch = MetropolisCL.MainLoopOne(queue, (1, ), (1, ), sigmaCL,
                                            numpy.float32(T), numpy.float32(J),
                                            numpy.float32(B),
                                            numpy.uint32(SizeX),
                                            numpy.uint32(SizeY),
                                            numpy.uint32(iterations),
                                            numpy.uint32(nprnd(2**31 - 1)),
                                            numpy.uint32(nprnd(2**31 - 1)))
        print "%s with %i %s done" % (Alu, 1, ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    elapsed = 1e-9 * (CLLaunch.profile.end - CLLaunch.profile.start)
    sigmaCL.release()

    return (elapsed)
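
The context/buffer/launch/profile pattern used above can be checked end to end with a trivial kernel; a minimal, self-contained sketch (the increment kernel is purely illustrative):

import numpy
import pyopencl as cl

a = numpy.zeros(16).astype(numpy.float32)
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
mf = cl.mem_flags
aCL = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)
prg = cl.Program(ctx, """
__kernel void inc(__global float *a) { a[get_global_id(0)] += 1.0f; }
""").build()
evt = prg.inc(queue, a.shape, None, aCL)              # same launch convention as MainLoopOne
evt.wait()
cl.enqueue_copy(queue, a, aCL).wait()
print 1e-9 * (evt.profile.end - evt.profile.start)    # kernel time in seconds
aCL.release()
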
Example #3
def MetropolisOpenCL(circle,iterations,steps,jobs,ParaStyle,Alu,Device,
                     RNG,ValueType):
	
  # Initialize the variables, CASTing them correctly
    
  if Device==0:
    print "Enter XPU selector based on ALU type: first selected"
    HasXPU=False
    # Default Device selection based on ALU Type
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        if deviceType=="GPU" and Alu=="GPU" and not HasXPU:
          XPU=device
          print "GPU selected: ",device.name
          HasXPU=True
        if deviceType=="CPU" and Alu=="CPU" and not HasXPU:        
          XPU=device
          print "CPU selected: ",device.name
          HasXPU=True
  else:
    print "Enter XPU selector based on device number & ALU type"
    Id=1
    HasXPU=False
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
      for device in platform.get_devices():
        deviceType=cl.device_type.to_string(device.type)
        if Id==Device and Alu==deviceType and HasXPU==False:
          XPU=device
          print "CPU/GPU selected: ",device.name.lstrip()
          HasXPU=True
        Id=Id+1
    if HasXPU==False:
      print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
          (Device,Alu,Id-1)
      return(0,0,0)
				
  # Create the context and the command queue for execution
  ctx = cl.Context([XPU])
  queue = cl.CommandQueue(ctx,
                          properties=cl.command_queue_properties.PROFILING_ENABLE)

  # Get the available flags for the buffers
  mf = cl.mem_flags
	
  circleCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=circle)

  
  MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
    options = "-cl-mad-enable -cl-fast-relaxed-math -DTRNG=%i -DTYPE=%s" % (Marsaglia[RNG],Computing[ValueType]))

  i=0

  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)
  
  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=numpy.uint64(iterationsCL*jobs)
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=numpy.uint64(iterations)

  for i in range(steps):
		
    if ParaStyle=='Blocks':
      # Call OpenCL kernel
      # (jobs,) is the global work size
      # None lets the runtime choose the local work size
      # circleCL is the hit counter translated into CL format
      # iterationsCL is the number of iterations per work-item
      # the two uint32 arguments seed the RNG
      CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
                                           circleCL,
                                           numpy.uint64(iterationsCL),
                                           numpy.uint32(nprnd(2**30/jobs)),
                                           numpy.uint32(nprnd(2**30/jobs)))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      # In OpenCL, the global_id must be set identical to the local_id
      CLLaunch=MetropolisCL.MainLoopHybrid(queue,(jobs,),(threads,),
                                          circleCL,
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))
        
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      # In OpenCL, the global_id must be set identical to the local_id
      CLLaunch=MetropolisCL.MainLoopLocal(queue,(jobs,),(jobs,),
                                          circleCL,
                                          numpy.uint64(iterationsCL),
                                          numpy.uint32(nprnd(2**30/jobs)),
                                          numpy.uint32(nprnd(2**30/jobs)))
      print "%s with %i %s done" % (Alu,jobs,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, circle, circleCL).wait()

    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)

    print circle,numpy.mean(circle),numpy.median(circle),numpy.std(circle)
    MyDuration[i]=elapsed
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    MyPi[i]=numpy.median(AllPi)
    print MyPi[i],numpy.std(AllPi),MyDuration[i]

  circleCL.release()

  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration)
	
  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration))
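
BestThreadsNumber is not shown in this listing; a plausible sketch, assuming it simply picks a power-of-two work-group size that divides jobs (a guess, not the original implementation):

def BestThreadsNumber(jobs):
    # Hypothetical helper: largest power of two that divides jobs, capped at 512
    # so that (jobs,) remains an exact multiple of (threads,) for the local size.
    threads = 1
    while jobs % (threads * 2) == 0 and threads * 2 <= 512:
        threads *= 2
    return threads
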
Example #4
    # Create the context and the command queue for execution
    try:
        ctx = cl.Context([XPU])
        queue = cl.CommandQueue(ctx,properties=cl.command_queue_properties.PROFILING_ENABLE)
    except:
        print "Crash during context creation"
   

    MyRoutines = cl.Program(ctx, BlobOpenCL).build()

    mf = cl.mem_flags
    clData = cl.Buffer(ctx, mf.READ_WRITE, MyData.nbytes)

    print 'All at the same location',MyData

    MyRoutines.SplutterPoints(queue,(Number,1),None,clData,np.float32(SizeOfBox-LengthOfSegment),np.uint32(nprnd(2**32)),np.uint32(nprnd(2**32)))

    cl.enqueue_copy(queue, MyData, clData)

    print 'All scattered',MyData

    MyRoutines.ExtendSegment(queue,(Number,1),None,clData,np.float32(LengthOfSegment),np.uint32(nprnd(2**32)),np.uint32(nprnd(2**32)))

    cl.enqueue_copy(queue, MyData, clData)

    print 'All with their endpoints',MyData

    MySize = np.zeros(len(MyData), dtype=np.float32)
    clSize = cl.Buffer(ctx, mf.READ_WRITE, MySize.nbytes)

    MyRoutines.EstimateLength(queue, (Number,1), None, clData, clSize)
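
    # The fragment ends right after launching EstimateLength; reading the result
    # back would follow the same enqueue_copy pattern as the earlier steps (a sketch):
    cl.enqueue_copy(queue, MySize, clSize)
    print 'Estimated lengths', MySize
    clData.release()
    clSize.release()
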
Example #5
def MetropolisAllCuda(sigmaDict, TList, J, B, iterations, jobs, ParaStyle, Alu,
                      Device):

    # sigmaDict & TList are NOT a plain array & a float respectively
    # sigmaDict : dict of one lattice array per temperature
    # TList : list of temperatures

    # With PyCUDA autoinit, nothing more to do!

    mod = SourceModule(KERNEL_CODE_CUDA)

    MetropolisBlocksCuda = mod.get_function("MainLoopGlobal")
    MetropolisThreadsCuda = mod.get_function("MainLoopLocal")
    MetropolisHybridCuda = mod.get_function("MainLoopHybrid")

    # Concatenate all sigma in single array
    sigma = numpy.copy(sigmaDict[TList[0]])
    for T in TList[1:]:
        sigma = numpy.concatenate((sigma, sigmaDict[T]), axis=1)

    sigmaCU = cuda.InOut(sigma)
    TCU = cuda.InOut(TList)

    SizeX, SizeY = sigmaDict[TList[0]].shape

    start = pycuda.driver.Event()
    stop = pycuda.driver.Event()

    start.record()
    start.synchronize()
    if ParaStyle == 'Blocks':
        # Call CUDA kernel
        # grid=(jobs,1) launches one block per job, block=(1,1,1) one thread each
        # sigmaCU holds the concatenated lattices, TCU the temperatures
        # the two uint32 arguments seed the RNG
        # iterations is the number of Metropolis sweeps
        MetropolisBlocksCuda(sigmaCU,
                             TCU,
                             numpy.float32(J),
                             numpy.float32(B),
                             numpy.uint32(SizeX),
                             numpy.uint32(SizeY),
                             numpy.uint32(iterations),
                             numpy.uint32(nprnd(2**31 - 1)),
                             numpy.uint32(nprnd(2**31 - 1)),
                             grid=(jobs, 1),
                             block=(1, 1, 1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs,1,ParaStyle)
    elif ParaStyle == 'Threads':
        MetropolisThreadsCuda(sigmaCU,
                              TCU,
                              numpy.float32(J),
                              numpy.float32(B),
                              numpy.uint32(SizeX),
                              numpy.uint32(SizeY),
                              numpy.uint32(iterations),
                              numpy.uint32(nprnd(2**31 - 1)),
                              numpy.uint32(nprnd(2**31 - 1)),
                              grid=(1, 1),
                              block=(jobs, 1, 1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,1,jobs,ParaStyle)
    else:
        threads = BestThreadsNumber(jobs)
        MetropolisHybridCuda(sigmaCU,
                             TCU,
                             numpy.float32(J),
                             numpy.float32(B),
                             numpy.uint32(SizeX),
                             numpy.uint32(SizeY),
                             numpy.uint32(iterations),
                             numpy.uint32(nprnd(2**31 - 1)),
                             numpy.uint32(nprnd(2**31 - 1)),
                             grid=(jobs / threads, 1),
                             block=(threads, 1, 1))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs/threads,threads,ParaStyle)

    stop.record()
    stop.synchronize()
    elapsed = start.time_till(stop) * 1e-3

    results = numpy.split(sigma, len(TList), axis=1)
    for T in TList:
        sigmaDict[T] = numpy.copy(results[numpy.nonzero(TList == T)[0][0]])

    return (elapsed)
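
The packing above concatenates the per-temperature lattices along axis=1 and numpy.split undoes it; a small standalone check of that round trip:

import numpy
a = numpy.arange(6).reshape(2, 3)
b = 10 + numpy.arange(6).reshape(2, 3)
packed = numpy.concatenate((a, b), axis=1)   # shape (2, 6): lattices side by side
back = numpy.split(packed, 2, axis=1)        # two (2, 3) blocks, in the original order
assert (back[0] == a).all() and (back[1] == b).all()
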
Example #6
def MetropolisCuda(circle,iterations,steps,jobs,ParaStyle,RNG,ValueType):

  # With PyCUDA autoinit, nothing more to do!
  
  circleCU = cuda.InOut(circle)

  try:
    mod = SourceModule(KERNEL_CODE_CUDA,options=['--compiler-options','-Wall -DTRNG=%i -DTYPE=%s' % (Marsaglia[RNG],Computing[ValueType])])
  except:
    print "Compilation seems to brake"
  
  MetropolisBlocksCU=mod.get_function("MainLoopBlocks")
  MetropolisJobsCU=mod.get_function("MainLoopThreads")
  MetropolisHybridCU=mod.get_function("MainLoopHybrid")
  
  start = pycuda.driver.Event()
  stop = pycuda.driver.Event()
  
  MyPi=numpy.zeros(steps)
  MyDuration=numpy.zeros(steps)

  if iterations%jobs==0:
    iterationsCL=numpy.uint64(iterations/jobs)
    iterationsNew=iterationsCL*jobs
  else:
    iterationsCL=numpy.uint64(iterations/jobs+1)
    iterationsNew=iterations

  for i in range(steps):
    start.record()
    start.synchronize()
    if ParaStyle=='Blocks':
      MetropolisBlocksCU(circleCU,
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(1,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)      
    elif ParaStyle=='Hybrid':
      threads=BestThreadsNumber(jobs)
      MetropolisHybridCU(circleCU,
                         numpy.uint64(iterationsCL),
                         numpy.uint32(nprnd(2**30/jobs)),
                         numpy.uint32(nprnd(2**30/jobs)),
                         grid=(jobs,1),
                         block=(threads,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs/threads,threads,ParaStyle)
    else:
      MetropolisJobsCU(circleCU,
                       numpy.uint64(iterationsCL),
                       numpy.uint32(nprnd(2**30/jobs)),
                       numpy.uint32(nprnd(2**30/jobs)),
                       grid=(1,1),
                       block=(jobs,1,1))
      print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
            (Alu,jobs,1,ParaStyle)
    stop.record()
    stop.synchronize()
                
    elapsed = start.time_till(stop)*1e-3

    MyDuration[i]=elapsed
    AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
    MyPi[i]=numpy.median(AllPi)
    print MyPi[i],numpy.std(AllPi),MyDuration[i]


  print jobs,numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration)

  return(numpy.mean(MyDuration),numpy.median(MyDuration),numpy.std(MyDuration),numpy.mean(Iterations/MyDuration),numpy.median(Iterations/MyDuration),numpy.std(Iterations/MyDuration))
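
Each work-item counts how many of its iterationsCL draws fall inside the quarter circle, so the per-item estimate is 4*hits/iterationsCL; the reduction used above, in isolation with made-up hit counts:

import numpy
iterationsCL = numpy.uint64(1000000)
circle = numpy.array([785512, 785190, 785733], dtype=numpy.uint64)   # hypothetical hit counts
AllPi = 4. / numpy.float32(iterationsCL) * circle.astype(numpy.float32)
print numpy.median(AllPi), numpy.std(AllPi)   # ~3.14 with a small spread
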
Example #7
    LAPIMAGE = False

    sigmaIn = numpy.where(numpy.random.randn(Size, Size) > 0, 1,
                          -1).astype(numpy.int32)

    ImageOutput(sigmaIn, "Ising2D_Serial_%i_Initial" % (Size))

    E = []
    M = []

    for T in Trange:
        # Using copy is essential here: [:] does not work with numpy!
        sigma = numpy.copy(sigmaIn)
        # duration=Metropolis(sigma,J,B,T,Iterations)
        SeedW, SeedZ = numpy.int32(nprnd(2**31 - 1)), numpy.int32(
            nprnd(2**31 - 1))
        start = time.time()
        array_module_np.array_metropolis_np(sigma, J, B, T, Iterations, SeedW,
                                            SeedZ)
        duration = time.time() - start
        E = numpy.append(E, Energy(sigma, J))
        M = numpy.append(M, Magnetization(sigma, B))
        ImageOutput(sigma, "Ising2D_Serial_%i_%1.1f_Final" % (Size, T))

        print "CPU Time : %f" % (duration)
        print "Total Energy at Temperature %f : %f" % (T, E[-1])
        print "Total Magnetization at Temperature %f : %f" % (T, M[-1])

    if Curves:
        DisplayCurves(Trange, E, M, J, B)
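
Energy and Magnetization are defined elsewhere in the script; for a 2D Ising lattice with periodic boundaries they could look like the sketch below (assuming the usual H = -J*sum(s_i*s_j) - B*sum(s_i) convention, which may differ from the original helpers):

import numpy

def Energy(sigma, J):
    # Each nearest-neighbour bond is counted once via a right roll and a down roll.
    right = numpy.roll(sigma, 1, axis=1)
    down = numpy.roll(sigma, 1, axis=0)
    return -J * numpy.sum(sigma * (right + down))

def Magnetization(sigma, B):
    # B is accepted only to mirror the original call signature; plain spin sum shown here.
    return numpy.sum(sigma)
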
Example #8
def MetropolisAllOpenCL(sigmaDict, TList, J, B, iterations, jobs, ParaStyle,
                        Alu, Device):

    # sigmaDict & TList are NOT a plain array & a float respectively
    # sigmaDict : dict of one lattice array per temperature
    # TList : list of temperatures

    # Initialize the variables, CASTing them correctly

    # Detect a GPU device in the list of devices

    HasGPU = False
    Id = 1
    # Primary Device selection based on Device Id
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            #deviceType=cl.device_type.to_string(device.type)
            deviceType = "xPU"
            if Id == Device and not HasGPU:
                GPU = device
                print "CPU/GPU selected: ", device.name
                HasGPU = True
            Id = Id + 1

    # Create the context and the command queue for execution
    # ctx = cl.create_some_context()
    ctx = cl.Context([GPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Get the available flags for the buffers
    mf = cl.mem_flags

    # Concatenate all sigma in single array
    sigma = numpy.copy(sigmaDict[TList[0]])
    for T in TList[1:]:
        sigma = numpy.concatenate((sigma, sigmaDict[T]), axis=1)

    sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=sigma)
    TCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=TList)

    MetropolisCL = cl.Program(ctx,KERNEL_CODE_OPENCL).build( \
        options = "-cl-mad-enable -cl-fast-relaxed-math")

    SizeX, SizeY = sigmaDict[TList[0]].shape

    if ParaStyle == 'Blocks':
        # Call OpenCL kernel
        # (jobs,) is the global work size
        # None lets the runtime choose the local work size
        # sigmaCL holds the concatenated lattices, TCL the temperatures
        # the two uint32 arguments seed the RNG
        # iterations is the number of Metropolis sweeps
        CLLaunch = MetropolisCL.MainLoopGlobal(queue, (jobs, ), None, sigmaCL,
                                               TCL, numpy.float32(J),
                                               numpy.float32(B),
                                               numpy.uint32(SizeX),
                                               numpy.uint32(SizeY),
                                               numpy.uint32(iterations),
                                               numpy.uint32(nprnd(2**31 - 1)),
                                               numpy.uint32(nprnd(2**31 - 1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs,1,ParaStyle)
    elif ParaStyle == 'Threads':
        # It is necessary to set the local size equal to the global size
        # Jobs is to be considered as the global number of jobs to do,
        # distributed over the compute units
        # For example:
        # G_ID=10 & L_ID=10 : 10 threads on 1 compute unit
        # G_ID=10 & L_ID=1 : 10 threads on 1 compute unit

        CLLaunch = MetropolisCL.MainLoopLocal(queue, (jobs, ), (jobs, ),
                                              sigmaCL, TCL, numpy.float32(J),
                                              numpy.float32(B),
                                              numpy.uint32(SizeX),
                                              numpy.uint32(SizeY),
                                              numpy.uint32(iterations),
                                              numpy.uint32(nprnd(2**31 - 1)),
                                              numpy.uint32(nprnd(2**31 - 1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,1,jobs,ParaStyle)
    else:
        threads = BestThreadsNumber(jobs)
        # In OpenCL, the global_id must be set identical to the local_id
        CLLaunch = MetropolisCL.MainLoopHybrid(queue, (jobs, ), (threads, ),
                                               sigmaCL, TCL, numpy.float32(J),
                                               numpy.float32(B),
                                               numpy.uint32(SizeX),
                                               numpy.uint32(SizeY),
                                               numpy.uint32(iterations),
                                               numpy.uint32(nprnd(2**31 - 1)),
                                               numpy.uint32(nprnd(2**31 - 1)))
        print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
              (Alu,jobs/threads,threads,ParaStyle)

    CLLaunch.wait()
    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    elapsed = 1e-9 * (CLLaunch.profile.end - CLLaunch.profile.start)
    sigmaCL.release()

    results = numpy.split(sigma, len(TList), axis=1)
    for T in TList:
        sigmaDict[T] = numpy.copy(results[numpy.nonzero(TList == T)[0][0]])

    return (elapsed)
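
The Device argument indexes OpenCL devices in enumeration order, starting at 1; a small helper to print that numbering so the right Id can be passed in:

import pyopencl as cl

Id = 1
for platform in cl.get_platforms():
    for device in platform.get_devices():
        print "Device #%i: %s on platform %s" % (Id, device.name, platform.name)
        Id += 1
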
Example #9
def MetropolisOpenCL(circle, iterations, steps, jobs, ParaStyle, Alu, Device,
                     Memory):

    # Initialize the variables, CASTing them correctly

    MaxMemoryXPU = 0
    MinMemoryXPU = 0

    if Device == 0:
        print "Enter XPU selector based on ALU type: first selected"
        HasXPU = False
        # Default Device selection based on ALU Type
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                #deviceType=cl.device_type.to_string(device.type)
                deviceMemory = device.max_mem_alloc_size
                if deviceMemory > MaxMemoryXPU:
                    MaxMemoryXPU = deviceMemory
                if deviceMemory < MinMemoryXPU or MinMemoryXPU == 0:
                    MinMemoryXPU = deviceMemory
                if not HasXPU:
                    XPU = device
                    print "XPU selected with Allocable Memory %i: %s" % (
                        deviceMemory, device.name)
                    HasXPU = True
                    MemoryXPU = deviceMemory

    else:
        print "Enter XPU selector based on device number & ALU type"
        Id = 1
        HasXPU = False
        # Primary Device selection based on Device Id
        for platform in cl.get_platforms():
            for device in platform.get_devices():
                #deviceType=cl.device_type.to_string(device.type)
                deviceMemory = device.max_mem_alloc_size
                if deviceMemory > MaxMemoryXPU:
                    MaxMemoryXPU = deviceMemory
                if deviceMemory < MinMemoryXPU or MinMemoryXPU == 0:
                    MinMemoryXPU = deviceMemory
                if Id == Device and HasXPU == False:
                    XPU = device
                    print "CPU/GPU selected with Allocable Memory %i: %s" % (
                        deviceMemory, device.name)
                    HasXPU = True
                    MemoryXPU = deviceMemory
                Id = Id + 1
        if HasXPU == False:
            print "No XPU #%i of type %s found in all of %i devices, sorry..." % \
                (Device,Alu,Id-1)
            return (0, 0, 0)

    print "Allocable Memory is %i, between %i and %i " % (
        MemoryXPU, MinMemoryXPU, MaxMemoryXPU)

    # Create the context and the command queue for execution
    ctx = cl.Context([XPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Get the available flags for the buffers
    mf = cl.mem_flags

    MetropolisCL = cl.Program(ctx, KERNEL_CODE_OPENCL).build(
        options="-cl-mad-enable -cl-fast-relaxed-math")

    MyDuration = numpy.zeros(steps)

    if iterations % jobs == 0:
        iterationsCL = numpy.uint64(iterations / jobs)
    else:
        iterationsCL = numpy.uint64(iterations / jobs + 1)

    iterationsNew = numpy.uint64(iterationsCL * jobs)

    MySplutter = numpy.zeros(steps)

    MaxWorks = 2**(int)(numpy.log2(MinMemoryXPU / 4))
    print MaxWorks, 2**(int)(numpy.log2(MemoryXPU))

    #Splutter=numpy.zeros((MaxWorks/jobs)*jobs).astype(numpy.uint32)
    #Splutter=numpy.zeros(jobs*16).astype(numpy.uint32)
    Splutter = numpy.zeros(Memory).astype(numpy.uint32)

    for i in range(steps):

        #Splutter=numpy.zeros(2**(int)(numpy.log2(MemoryXPU/4))).astype(numpy.uint32)
        #Splutter=numpy.zeros(1024).astype(numpy.uint32)

        #Splutter=numpy.zeros(jobs).astype(numpy.uint32)

        Splutter[:] = 0

        print Splutter, len(Splutter)

        h2d_time = time.time()
        SplutterCL = cl.Buffer(ctx,
                               mf.WRITE_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=Splutter)
        print('From Host to Device time %f' % (time.time() - h2d_time))

        start_time = time.time()
        if ParaStyle == 'Blocks':
            # Call OpenCL kernel
            # (jobs,) is the global work size
            # None lets the runtime choose the local work size
            # SplutterCL is the scatter counter buffer in CL format
            # iterationsCL is the number of iterations per work-item
            # the two uint32 arguments seed the RNG
            # CLLaunch=MetropolisCL.MainLoopGlobal(queue,(jobs,),None,
            #                                      SplutterCL,
            #                                      numpy.uint32(len(Splutter)),
            #                                      numpy.uint64(iterationsCL),
            #                                      numpy.uint32(nprnd(2**30/jobs)),
            #                                      numpy.uint32(nprnd(2**30/jobs)))
            CLLaunch = MetropolisCL.SplutterGlobal(
                queue, (jobs, ), None, SplutterCL, numpy.uint32(len(Splutter)),
                numpy.uint64(iterationsCL), numpy.uint32(nprnd(2**30 / jobs)),
                numpy.uint32(nprnd(2**30 / jobs)))

            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                  (Alu,jobs,1,ParaStyle)
        elif ParaStyle == 'Hybrid':
            #threads=BestThreadsNumber(jobs)
            threads = BestThreadsNumber(256)
            print "print", threads
            # In OpenCL, the global_id must be set identical to the local_id
            CLLaunch = MetropolisCL.SplutterHybrid(
                queue, (jobs, ), (threads, ), SplutterCL,
                numpy.uint32(len(Splutter)), numpy.uint64(iterationsCL),
                numpy.uint32(nprnd(2**30 / jobs)),
                numpy.uint32(nprnd(2**30 / jobs)))

            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                  (Alu,jobs/threads,threads,ParaStyle)
        else:
            # In OpenCL, the global_id must be set identical to the local_id
            CLLaunch = MetropolisCL.SplutterLocal(
                queue, (jobs, ), (jobs, ), SplutterCL,
                numpy.uint32(len(Splutter)), numpy.uint64(iterationsCL),
                numpy.uint32(nprnd(2**30 / jobs)),
                numpy.uint32(nprnd(2**30 / jobs)))

            print "%s with %i %s done" % (Alu, jobs, ParaStyle)

        CLLaunch.wait()
        d2h_time = time.time()
        cl.enqueue_copy(queue, Splutter, SplutterCL).wait()
        print('From Device to Host %f' % (time.time() - d2h_time))

        #    elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
        elapsed = time.time() - start_time
        print('Elapsed compute time %f' % elapsed)

        MyDuration[i] = elapsed
        #print Splutter,sum(Splutter)
        #MySplutter[i]=numpy.median(Splutter)
        #print numpy.mean(Splutter)*len(Splutter),MySplutter[i]*len(Splutter),numpy.std(Splutter)

    SplutterCL.release()

    print jobs, numpy.mean(MyDuration), numpy.median(MyDuration), numpy.std(
        MyDuration)

    return (numpy.mean(MyDuration), numpy.median(MyDuration),
            numpy.std(MyDuration))
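
MaxWorks above bounds the splutter buffer by the smallest max_mem_alloc_size seen, at 4 bytes per uint32 entry; the sizing expression in isolation, with an assumed 256 MiB device limit:

import numpy
MinMemoryXPU = 256 * 1024 * 1024                      # assumed max_mem_alloc_size in bytes
MaxWorks = 2**(int)(numpy.log2(MinMemoryXPU / 4))     # largest power-of-two count of uint32 that fits
print MaxWorks                                        # 67108864 entries, i.e. 256 MiB of uint32
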
Example #10
def MetropolisCuda(circle, iterations, steps, jobs, ParaStyle, Density,
                   Memory):

    # With PyCUDA autoinit, nothing more to do!

    circleCU = cuda.InOut(circle)

    mod = SourceModule(KERNEL_CODE_CUDA)

    if Density == 'Dense':
        MetropolisBlocksCU = mod.get_function("SplutterGlobalDense")
        MetropolisThreadsCU = mod.get_function("SplutterLocalDense")
        MetropolisHybridCU = mod.get_function("SplutterHybridDense")
    elif Density == 'Sparse':
        MetropolisBlocksCU = mod.get_function("SplutterGlobalSparse")
        MetropolisThreadsCU = mod.get_function("SplutterLocalSparse")
        MetropolisHybridCU = mod.get_function("SplutterHybridSparse")
    else:
        MetropolisBlocksCU = mod.get_function("SplutterGlobal")

    start = pycuda.driver.Event()
    stop = pycuda.driver.Event()

    MySplutter = numpy.zeros(steps)
    MyDuration = numpy.zeros(steps)

    if iterations % jobs == 0:
        iterationsCL = numpy.uint64(iterations / jobs)
    else:
        iterationsCL = numpy.uint64(iterations / jobs + 1)

    iterationsNew = iterationsCL * jobs

    Splutter = numpy.zeros(jobs * 16).astype(numpy.uint32)

    for i in range(steps):

        start_time = time.time()
        Splutter[:] = 0

        print Splutter, len(Splutter)

        SplutterCU = cuda.InOut(Splutter)

        start.record()
        start.synchronize()
        if ParaStyle == 'Blocks':
            MetropolisBlocksCU(SplutterCU,
                               numpy.uint32(len(Splutter)),
                               numpy.uint64(iterationsCL),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               grid=(jobs, 1),
                               block=(1, 1, 1))

            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                  (Alu,jobs,1,ParaStyle)
        elif ParaStyle == 'Hybrid':
            threads = BestThreadsNumber(jobs)
            MetropolisHybridCU(SplutterCU,
                               numpy.uint32(len(Splutter)),
                               numpy.uint64(iterationsCL),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               numpy.uint32(nprnd(2**30 / jobs)),
                               grid=(jobs, 1),
                               block=(threads, 1, 1))
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                  (Alu,jobs/threads,threads,ParaStyle)
        else:
            MetropolisThreadsCU(SplutterCU,
                                numpy.uint32(len(Splutter)),
                                numpy.uint64(iterationsCL),
                                numpy.uint32(nprnd(2**30 / jobs)),
                                numpy.uint32(nprnd(2**30 / jobs)),
                                grid=(1, 1),
                                block=(jobs, 1, 1))
            print "%s with (WorkItems/Threads)=(%i,%i) %s method done" % \
                  (Alu,1,jobs,ParaStyle)
        stop.record()
        stop.synchronize()

        #    elapsed = start.time_till(stop)*1e-3
        elapsed = time.time() - start_time

        print Splutter, sum(Splutter)
        MySplutter[i] = numpy.median(Splutter)
        print numpy.mean(Splutter), MySplutter[i], numpy.std(Splutter)

        MyDuration[i] = elapsed

        #AllPi=4./numpy.float32(iterationsCL)*circle.astype(numpy.float32)
        #MyPi[i]=numpy.median(AllPi)
        #print MyPi[i],numpy.std(AllPi),MyDuration[i]

    print jobs, numpy.mean(MyDuration), numpy.median(MyDuration), numpy.std(
        MyDuration)

    return (numpy.mean(MyDuration), numpy.median(MyDuration),
            numpy.std(MyDuration))
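
cuda.InOut(Splutter) wraps the host array so PyCUDA copies it to the device before the launch and back afterwards; the explicit driver-API equivalent, as a sketch:

import numpy
import pycuda.autoinit
import pycuda.driver as cuda

Splutter = numpy.zeros(16).astype(numpy.uint32)
splutter_gpu = cuda.mem_alloc(Splutter.nbytes)    # explicit device allocation
cuda.memcpy_htod(splutter_gpu, Splutter)          # host -> device, implicit with cuda.InOut
# ... the kernel launch would take splutter_gpu where the example passes SplutterCU ...
cuda.memcpy_dtoh(Splutter, splutter_gpu)          # device -> host, also implicit with cuda.InOut
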
Example #11
def Metropolis(sigma, J, B, T, iterations, Device, Divider):

    kernel_params = {'block_size': sigma.shape[0] / Divider}

    # Detect a GPU device in the list of devices
    Id = 1
    HasXPU = False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id == Device:
                XPU = device
                print "CPU/GPU selected: ", device.name.lstrip()
                HasXPU = True
            Id += 1

    if HasXPU == False:
        print "No XPU #%i found in all of %i devices, sorry..." % (Device,
                                                                   Id - 1)
        sys.exit()

    ctx = cl.Context([XPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Get the available flags for the buffers
    mf = cl.mem_flags

    sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=sigma)
    # Program based on Kernel2
    MetropolisCL = cl.Program(ctx,
                              KERNEL_CODE.substitute(kernel_params)).build()

    divide = Divider * Divider
    step = STEP / divide
    i = 0
    duration = 0.
    while (step * i < iterations / divide):

        # Call OpenCL kernel
        # (Divider,Divider) is global work size
        # sigmaCL is lattice translated in CL format
        # step is number of iterations

        start_time = time.time()
        CLLaunch = MetropolisCL.MainLoop(queue, (Divider, Divider), None,
                                         sigmaCL, numpy.float32(J),
                                         numpy.float32(B), numpy.float32(T),
                                         numpy.uint32(sigma.shape[0]),
                                         numpy.uint32(step),
                                         numpy.uint32(nprnd(2**32)),
                                         numpy.uint32(nprnd(2**32)))

        CLLaunch.wait()
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
        elapsed = time.time() - start_time
        print "Iteration %i with T=%f and %i iterations in %f: " % (i, T, step,
                                                                    elapsed)
        if LAPIMAGE:
            cl.enqueue_copy(queue, sigma, sigmaCL).wait()
            CheckLattice(sigma)
            ImageOutput(sigma,
                        "Ising2D_GPU_Local_%i_%1.1f_%.3i_Lap" % (SIZE, T, i))
        i = i + 1
        duration = duration + elapsed

    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    CheckLattice(sigma)
    sigmaCL.release()

    return (duration)
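
KERNEL_CODE.substitute(kernel_params) implies KERNEL_CODE is a string.Template with a ${block_size} placeholder; a minimal sketch of that pattern (the kernel body here is illustrative, not the original):

from string import Template

KERNEL_CODE = Template("""
#define BLOCK_SIZE ${block_size}
__kernel void MainLoop(__global int *sigma)
{
    // ... checkerboard update over a BLOCK_SIZE x BLOCK_SIZE tile ...
}
""")

print KERNEL_CODE.substitute({'block_size': 128})
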
Example #12
def Metropolis(sigma,J,B,T,iterations,Device):
		
    # Initialize the variables, CASTing them correctly

    # Detect a GPU device in the list of devices
    Id=1
    HasXPU=False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id==Device:
                XPU=device
                print "CPU/GPU selected: ",device.name.lstrip()
                HasXPU=True
            Id+=1

    if HasXPU==False:
        print "No XPU #%i found in all of %i devices, sorry..." % (Device,Id-1)
        sys.exit()
     
    # Create the context and the command queue for execution
    ctx = cl.Context([XPU])
    queue = cl.CommandQueue(ctx,
        properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Get the available flags for the buffers
    mf = cl.mem_flags

    sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=sigma)

    MetropolisCL = cl.Program(ctx,KERNEL_CODE).build(
        options = "-cl-mad-enable -cl-fast-relaxed-math")

    i=0
    step=STEP
    duration=0.
	
    while (step*i < iterations):
        # Call OpenCL kernel
        # (1,) is global work size (only 1 work size)
        # (1,) is local work size
        # sigmaCL is lattice translated in CL format
        # step is number of iterations
        
        start_time=time.time()
        CLLaunch=MetropolisCL.MainLoop(queue,(1,),None,
                                       sigmaCL,
                                       numpy.float32(J),numpy.float32(B),
                                       numpy.float32(T),
                                       numpy.uint32(sigma.shape[0]),
                                       numpy.uint32(step),
                                       numpy.uint32(nprnd(2**32)),
                                       numpy.uint32(nprnd(2**32)))
        CLLaunch.wait()
        # Does not seem to work under AMD/ATI
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
        elapsed = time.time()-start_time
        print "Iteration %i with T=%f and %i iterations in %f: " % (i,T,step,elapsed)
        if LAPIMAGE:
            cl.enqueue_copy(queue, sigma, sigmaCL).wait()
            ImageOutput(sigma,"Ising2D_GPU_Global_%i_%1.1f_%.3i_Lap" %  
                (SIZE,T,i))
        i=i+1
        duration=duration+elapsed

    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    sigmaCL.release()

    return(duration)
Example #13
    clPotential = cl.Buffer(ctx, mf.READ_WRITE, MyPotential.nbytes)
    clKinetic = cl.Buffer(ctx, mf.READ_WRITE, MyKinetic.nbytes)
    clCoM = cl.Buffer(ctx, mf.READ_WRITE, MyCoM.nbytes)

    # Write/HostPointer approach for buffering
    # clDataX = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataX)
    # clDataV = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyDataV)
    # clPotential = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyPotential)  # noqa: E501
    # clKinetic = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyKinetic)
    # clCoM = cl.Buffer(ctx, mf.WRITE_ONLY|mf.COPY_HOST_PTR,hostbuf=MyCoM)

    print("All particles superimposed.")

    # Set particles to RNG points
    if InitialRandom:
        seed_w = np.uint32(nprnd(2**32))
        seed_z = np.uint32(nprnd(2**32))
    else:
        seed_w = np.uint32(19710211)
        seed_z = np.uint32(20081010)

    if Shape == "Ball":
        MyRoutines.InBallSplutterPoints(queue, (Number, 1), None, clDataX,
                                        SizeOfShape, seed_w, seed_z)
    else:
        MyRoutines.InBoxSplutterPoints(queue, (Number, 1), None, clDataX,
                                       SizeOfShape, seed_w, seed_z)

    print("All particules distributed")

    CLLaunch = MyRoutines.CenterOfMass(queue, (1, 1), None, clDataX, clCoM,