def copy_to_shared( start_loop, array_name, alignment ):
    """Insert a chill data copy of `array_name` at `start_loop` and retile its copy loops.

    What the code below does, in order:

    1. Looks up the level of `start_loop` in statement 0 and calls
       chill.datacopy_9arg with copy-loop index names "tmp1" and "tmp2",
       then chill.add_sync(0, start_loop).
    2. For every statement number that chill.num_statements() reports as
       new after the data copy, tiles/renames the "tmp1"/"tmp2" loops into
       loops named "tx" and "ty" (plus "tmp_tx"/"tmp_ty" where a loop is
       larger than the matching entry of chill.thread_dims()), then calls
       chill.add_sync for that statement.

    Per the function name the copy presumably targets CUDA shared memory;
    that is decided inside chill and is not visible here -- TODO confirm.

    Args:
        start_loop: loop index name; passed to find_cur_level() and
            chill.add_sync().
        array_name: array to copy; passed through to chill.datacopy_9arg.
        alignment: passed through to chill.datacopy_9arg unchanged.

    Returns:
        None. All effects happen through the `chill` module.

    Relies on names defined outside this function: chill, find_cur_level,
    counted, math and sys.
    """
    stmt = 0 # assume statement 0

    # NOTE(review): `cur` is never read again in this function.
    cur = chill.cur_indices(stmt)

    start_level = find_cur_level( stmt, start_loop )

    # Remember the statement count so the statements added by the data copy
    # can be identified afterwards.
    old_num_statements = chill.num_statements()

    # Now, we give it indices for up to two dimensions for copy loop
    copy_loop_idxs = ["tmp1","tmp2"]
    # The binding is handed one flat tuple.  It stands for the call
    #   datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs,
    #                 False, 0, 1, alignment, True)
    # with the index list flattened to "count, name, name" and the booleans
    # written as 0/1.
    passtoC = [stmt, start_level, array_name]   # a list
    passtoC.append( len(copy_loop_idxs))
    for i in copy_loop_idxs:
        passtoC.append(i)
    passtoC.append( 0 ) # False
    passtoC.append( 0 )
    passtoC.append( 1 )
    passtoC.append( alignment )
    passtoC.append( 1 )   # True

    chill.datacopy_9arg( tuple( passtoC ))

    chill.add_sync( stmt, start_loop )

    new_num_statements = chill.num_statements()

    #  This is fairly CUBLAS2 specific, not sure how well it generalizes,
    #  but for a 2D copy, what we want to do is "normalize" the first loop
    #  "tmp1" then get its hard upper bound. We then want to tile it to
    #  make the control loop of that tile "ty". We then tile "tmp2" with a
    #  size of 1 and make it "tx".

    # NOTE(review): the flush is issued twice; the second call is redundant.
    sys.stdout.flush()
    sys.stdout.flush()

    # Statement numbers in [old_num_statements, new_num_statements) are the
    # ones that appeared after the datacopy/add_sync calls above.
    for stmt in range(old_num_statements, new_num_statements):
        # -1 is treated as "this statement has no tmp2 loop".
        level = find_cur_level( stmt, "tmp2")

        if level != -1:
            # Two copy loops (tmp1 and tmp2) exist.
            # NOTE(review): the result of this lookup is discarded.
            find_cur_level(stmt,"tmp2")
            # tile3 with identical levels is presumably the "normalize" step
            # mentioned in the comment above -- confirm against chill.
            chill.tile3( stmt, level, level )

            # hard_loop_bounds gives (lower, upper) with an inclusive upper
            # bound (see the "<=" remark in the one-loop branch below), so
            # +1 turns it into a count comparable with the thread dims.
            bounds = chill.hard_loop_bounds(stmt, level)
            lower = bounds[0]
            upper = 1+ bounds[1]

            dims = chill.thread_dims()
            tx = dims[0]
            ty = dims[1]

            level = find_cur_level(stmt,"tmp1")
            if tx == upper and ty == 1:
                # Don't need an extra tile level, just move this loop up
                second_level = find_cur_level(stmt,"tmp2")
                chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted)

            else:
                # NOTE(review): new_ctrl is assigned here but never read.
                if ty == 1:
                    new_ctrl = "tmp3"
                else:
                    new_ctrl = "ty"

                # Porting note: cudaize.lua has a lot of commented-out code
                # at this point that was not carried over.

                # ---- tmp2 -> tx ----
                first_level  = find_cur_level(stmt,"tmp1")
                second_level = find_cur_level(stmt,"tmp2")
                bounds = chill.hard_loop_bounds(stmt, second_level)
                lower = bounds[0]
                upper = 1 + bounds[1]   # BROKEN?

                # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
                chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted)

                # Levels are looked up again after every tile call; the code
                # never reuses a level obtained before a transformation.
                first_level = find_cur_level(stmt,"tmp1")
                bounds = chill.hard_loop_bounds(stmt, first_level)
                # NOTE(review): lower_1/upper_1 are not used afterwards.
                lower_1 =     bounds[0]
                upper_1 = 1 + bounds[1]
                tx_level = find_cur_level(stmt,"tx")
                bounds = chill.hard_loop_bounds(stmt,tx_level)
                lower_tx =   bounds[0]
                upper_tx = 1+bounds[1]

                # The tx loop has more iterations than tx threads: split it
                # into a "tx" loop of size tx plus a "tmp_tx" loop.
                if int(math.ceil( float(upper_tx)/float(tx))) > 1:
                     chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)

                     repeat = find_cur_level(stmt,"tx")
                     chill.tile3(stmt, repeat, repeat)  #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))

                     # If tx ended up deeper than tmp_tx, move it to
                     # tmp_tx's level.
                     if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"):
                        chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))

                # ---- tmp1 -> ty ----
                # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
                chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))

                ty_level = find_cur_level(stmt,"tmp1")
                bounds = chill.hard_loop_bounds(stmt,ty_level)
                lower_ty = bounds[0]
                upper_ty = 1 + bounds[1]

                # NOTE(review): tx_level/lower_tx/upper_tx are recomputed
                # here but not read again below.
                tx_level = find_cur_level(stmt,"tx")
                bounds = chill.hard_loop_bounds(stmt,tx_level)
                lower_tx = bounds[0]
                upper_tx = 1 + bounds[1]

                if(math.ceil(float(upper_ty)/float(ty)) > 1):
                    # More tmp1 iterations than ty threads: split into a
                    # "ty" loop of size ty plus a "tmp_ty" loop.
                    chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)

                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))

                    # Snapshot of the index names; it is not refreshed
                    # before the second membership test further down.
                    cur_idxs = chill.cur_indices(stmt)

                    # Putting ty before any tmp_tx
                    # idx_flag is only tested for ">= 0"; its exact value
                    # (1-based position, as in the Lua original) is unused.
                    # NOTE(review): the guard looks for "tmp_tx" but the move
                    # below compares against "tmp_ty" -- confirm intent.
                    idx_flag = -1
                    if "tmp_tx" in cur_idxs:
                        idx_flag = 1 + cur_idxs.index("tmp_tx")   # lua index starts at 1

                    if idx_flag >= 0:
                         if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"):
                             chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))


                    #  Now Putting ty before any tmp_ty
                    sys.stdout.flush()
                    idx_flag = -1
                    if "tmp_ty" in cur_idxs:
                        idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1

                    if idx_flag >= 0:
                        sys.stdout.flush()
                        if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"):
                            chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))

                else:
                    # tmp1 fits within ty threads: rename it to "ty" with a
                    # tile of size 1, then move it to the level just after tx.
                    chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted )

                    sys.stdout.flush()

                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)

                    idx_flag = -1
                    # Porting note: the Lua original searched cur_idxs for
                    # "tmp_ty" here and set idx_flag to its level.  That
                    # search was not ported, so idx_flag is still -1 below.
                    # Unconditional diagnostic; with the code as written it
                    # always reports -1.
                    print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag
                    if idx_flag >= 0:  # can't happen
                        print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
                        # The matching tile3(stmt, level of ty, level of tmp_ty)
                        # call is disabled in this port.

        else:
            #  copy to shared only created one level, not two, so we use a different approach (MV & TMV)
            level = find_cur_level(stmt,"tmp1")
            chill.tile3(stmt, level, level)

            dims = chill.thread_dims()
            tx = dims[0]
            ty = dims[1]

            bounds = chill.hard_loop_bounds(stmt, level)
            lower = bounds[0]
            upper = bounds[1]

            upper = upper+1 # upper bound given as <=, compare to dimensions tx which is <
            if upper == tx:
                # Exactly one iteration per tx thread: a rename is enough.
                chill.rename_index( stmt, "tmp1", "tx")
            else:
                # Split tmp1 by tx into "tx" and "tmp_tx", then re-tile and
                # reorder the two resulting levels.
                chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted)

                chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted)
                chill.tile3( stmt, level+1, level)

                if ty > 1:
                   # Split the loop at level+1 across ty as well.
                   # NOTE(review): unlike every other bound in this function,
                   # `upper` is used here without the +1 adjustment -- confirm.
                   bounds = chill.hard_loop_bounds(stmt, level+1)
                   lower = bounds[0]
                   upper = bounds[1]
                   # NOTE(review): floatdiv is never read; `bound` recomputes
                   # the same quotient.
                   floatdiv = float(upper)/float(ty)
                   bound =  int(math.ceil(float(upper)/float(ty)))
                   chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted)

        # Always add sync
        chill.add_sync( stmt, start_loop )
# Example no. 2 (0) -- separator text from the code listing, not part of the program
def copy_to_shared( start_loop, array_name, alignment ):
    """Insert a chill data copy of `array_name` at `start_loop` and retile its copy loops.

    What the code below does, in order:

    1. Looks up the level of `start_loop` in statement 0 and calls
       chill.datacopy_9arg with copy-loop index names "tmp1" and "tmp2",
       then chill.add_sync(0, start_loop).
    2. For every statement number that chill.num_statements() reports as
       new after the data copy, tiles/renames the "tmp1"/"tmp2" loops into
       loops named "tx" and "ty" (plus "tmp_tx"/"tmp_ty" where a loop is
       larger than the matching entry of chill.thread_dims()), then calls
       chill.add_sync for that statement.

    Per the function name the copy presumably targets CUDA shared memory;
    that is decided inside chill and is not visible here -- TODO confirm.

    Args:
        start_loop: loop index name; passed to find_cur_level() and
            chill.add_sync().
        array_name: array to copy; passed through to chill.datacopy_9arg.
        alignment: passed through to chill.datacopy_9arg unchanged.

    Returns:
        None. All effects happen through the `chill` module.

    Relies on names defined outside this function: chill, find_cur_level,
    counted, math and sys.
    """
    stmt = 0 # assume statement 0

    # NOTE(review): `cur` is never read again in this function.
    cur = chill.cur_indices(stmt)

    start_level = find_cur_level( stmt, start_loop )

    # Remember the statement count so the statements added by the data copy
    # can be identified afterwards.
    old_num_statements = chill.num_statements()

    # Now, we give it indices for up to two dimensions for copy loop
    copy_loop_idxs = ["tmp1","tmp2"]
    # The binding is handed one flat tuple.  It stands for the call
    #   datacopy_9arg(stmt, start_level, array_name, copy_loop_idxs,
    #                 False, 0, 1, alignment, True)
    # with the index list flattened to "count, name, name" and the booleans
    # written as 0/1.
    passtoC = [stmt, start_level, array_name]   # a list
    passtoC.append( len(copy_loop_idxs))
    for i in copy_loop_idxs:
        passtoC.append(i)
    passtoC.append( 0 ) # False
    passtoC.append( 0 )
    passtoC.append( 1 )
    passtoC.append( alignment )
    passtoC.append( 1 )   # True

    chill.datacopy_9arg( tuple( passtoC ))

    chill.add_sync( stmt, start_loop )

    new_num_statements = chill.num_statements()

    #  This is fairly CUBLAS2 specific, not sure how well it generalizes,
    #  but for a 2D copy, what we want to do is "normalize" the first loop
    #  "tmp1" then get its hard upper bound. We then want to tile it to
    #  make the control loop of that tile "ty". We then tile "tmp2" with a
    #  size of 1 and make it "tx".

    # NOTE(review): the flush is issued twice; the second call is redundant.
    sys.stdout.flush()
    sys.stdout.flush()

    # Statement numbers in [old_num_statements, new_num_statements) are the
    # ones that appeared after the datacopy/add_sync calls above.
    for stmt in range(old_num_statements, new_num_statements):
        # -1 is treated as "this statement has no tmp2 loop".
        level = find_cur_level( stmt, "tmp2")

        if level != -1:
            # Two copy loops (tmp1 and tmp2) exist.
            # NOTE(review): the result of this lookup is discarded.
            find_cur_level(stmt,"tmp2")
            # tile3 with identical levels is presumably the "normalize" step
            # mentioned in the comment above -- confirm against chill.
            chill.tile3( stmt, level, level )

            # hard_loop_bounds gives (lower, upper) with an inclusive upper
            # bound (see the "<=" remark in the one-loop branch below), so
            # +1 turns it into a count comparable with the thread dims.
            bounds = chill.hard_loop_bounds(stmt, level)
            lower = bounds[0]
            upper = 1+ bounds[1]

            dims = chill.thread_dims()
            tx = dims[0]
            ty = dims[1]

            level = find_cur_level(stmt,"tmp1")
            if tx == upper and ty == 1:
                # Don't need an extra tile level, just move this loop up
                second_level = find_cur_level(stmt,"tmp2")
                chill.tile7(stmt, second_level, 1, level, "tx", "tx", counted)

            else:
                # NOTE(review): new_ctrl is assigned here but never read.
                if ty == 1:
                    new_ctrl = "tmp3"
                else:
                    new_ctrl = "ty"

                # Porting note: cudaize.lua has a lot of commented-out code
                # at this point that was not carried over.

                # ---- tmp2 -> tx ----
                first_level  = find_cur_level(stmt,"tmp1")
                second_level = find_cur_level(stmt,"tmp2")
                bounds = chill.hard_loop_bounds(stmt, second_level)
                lower = bounds[0]
                upper = 1 + bounds[1]   # BROKEN?

                # Move the fastest changing dimension loop to the outermost,identified by "tmp2" and to be identified as tx.
                chill.tile7(stmt, second_level,1,first_level,"tx","tx",counted)

                # Levels are looked up again after every tile call; the code
                # never reuses a level obtained before a transformation.
                first_level = find_cur_level(stmt,"tmp1")
                bounds = chill.hard_loop_bounds(stmt, first_level)
                # NOTE(review): lower_1/upper_1 are not used afterwards.
                lower_1 =     bounds[0]
                upper_1 = 1 + bounds[1]
                tx_level = find_cur_level(stmt,"tx")
                bounds = chill.hard_loop_bounds(stmt,tx_level)
                lower_tx =   bounds[0]
                upper_tx = 1+bounds[1]

                # The tx loop has more iterations than tx threads: split it
                # into a "tx" loop of size tx plus a "tmp_tx" loop.
                if int(math.ceil( float(upper_tx)/float(tx))) > 1:
                     chill.tile7(stmt,tx_level,tx,tx_level,"tx","tmp_tx",counted)

                     repeat = find_cur_level(stmt,"tx")
                     chill.tile3(stmt, repeat, repeat)  #find_cur_level(stmt,"tx"),find_cur_level(stmt,"tx"))

                     # If tx ended up deeper than tmp_tx, move it to
                     # tmp_tx's level.
                     if find_cur_level(stmt,"tx")>find_cur_level(stmt,"tmp_tx"):
                        chill.tile3(stmt,find_cur_level(stmt,"tx"),find_cur_level(stmt,"tmp_tx"))

                # ---- tmp1 -> ty ----
                # Handle the other slower changing dimension, the original outermost loop, now identified by "tmp1", to be identified as "ty".
                chill.tile3(stmt,find_cur_level(stmt,"tmp1"),find_cur_level(stmt,"tmp1"))

                ty_level = find_cur_level(stmt,"tmp1")
                bounds = chill.hard_loop_bounds(stmt,ty_level)
                lower_ty = bounds[0]
                upper_ty = 1 + bounds[1]

                # NOTE(review): tx_level/lower_tx/upper_tx are recomputed
                # here but not read again below.
                tx_level = find_cur_level(stmt,"tx")
                bounds = chill.hard_loop_bounds(stmt,tx_level)
                lower_tx = bounds[0]
                upper_tx = 1 + bounds[1]

                if(math.ceil(float(upper_ty)/float(ty)) > 1):
                    # More tmp1 iterations than ty threads: split into a
                    # "ty" loop of size ty plus a "tmp_ty" loop.
                    chill.tile7(stmt,ty_level,ty,ty_level,"ty","tmp_ty",counted)

                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"ty"))

                    # Snapshot of the index names; it is not refreshed
                    # before the second membership test further down.
                    cur_idxs = chill.cur_indices(stmt)

                    # Putting ty before any tmp_tx
                    # idx_flag is only tested for ">= 0"; its exact value
                    # (1-based position, as in the Lua original) is unused.
                    # NOTE(review): the guard looks for "tmp_tx" but the move
                    # below compares against "tmp_ty" -- confirm intent.
                    idx_flag = -1
                    if "tmp_tx" in cur_idxs:
                        idx_flag = 1 + cur_idxs.index("tmp_tx")   # lua index starts at 1

                    if idx_flag >= 0:
                         if find_cur_level(stmt,"ty") > find_cur_level(stmt,"tmp_ty"):
                             chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))


                    #  Now Putting ty before any tmp_ty
                    sys.stdout.flush()
                    idx_flag = -1
                    if "tmp_ty" in cur_idxs:
                        idx_flag = 1 + cur_idxs.index("tmp_ty") # lua index starts at 1

                    if idx_flag >= 0:
                        sys.stdout.flush()
                        if find_cur_level(stmt,"ty")>find_cur_level(stmt,"tmp_ty"):
                            chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))

                else:
                    # tmp1 fits within ty threads: rename it to "ty" with a
                    # tile of size 1, then move it to the level just after tx.
                    chill.tile7( stmt, ty_level, 1, ty_level, "ty", "ty", counted )

                    sys.stdout.flush()

                    chill.tile3(stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tx")+1)

                    idx_flag = -1
                    # Porting note: the Lua original searched cur_idxs for
                    # "tmp_ty" here and set idx_flag to its level.  That
                    # search was not ported, so idx_flag is still -1 below.
                    # Unconditional diagnostic; with the code as written it
                    # always reports -1.
                    print "\n ELSE so i have found out the value of idx flag as %d" % idx_flag
                    if idx_flag >= 0:  # can't happen
                        print "tile( stmt %d, level ty %d, level ty %d" % ( stmt,find_cur_level(stmt,"ty"),find_cur_level(stmt,"tmp_ty"))
                        # The matching tile3(stmt, level of ty, level of tmp_ty)
                        # call is disabled in this port.

        else:
            #  copy to shared only created one level, not two, so we use a different approach (MV & TMV)
            level = find_cur_level(stmt,"tmp1")
            chill.tile3(stmt, level, level)

            dims = chill.thread_dims()
            tx = dims[0]
            ty = dims[1]

            bounds = chill.hard_loop_bounds(stmt, level)
            lower = bounds[0]
            upper = bounds[1]

            upper = upper+1 # upper bound given as <=, compare to dimensions tx which is <
            if upper == tx:
                # Exactly one iteration per tx thread: a rename is enough.
                chill.rename_index( stmt, "tmp1", "tx")
            else:
                # Split tmp1 by tx into "tx" and "tmp_tx", then re-tile and
                # reorder the two resulting levels.
                chill.tile7( stmt, level, tx, level, "tx", "tmp_tx", counted)

                chill.tile7( stmt, level+1,1,level+1,"tx", "tx",counted)
                chill.tile3( stmt, level+1, level)

                if ty > 1:
                   # Split the loop at level+1 across ty as well.
                   # NOTE(review): unlike every other bound in this function,
                   # `upper` is used here without the +1 adjustment -- confirm.
                   bounds = chill.hard_loop_bounds(stmt, level+1)
                   lower = bounds[0]
                   upper = bounds[1]
                   # NOTE(review): floatdiv is never read; `bound` recomputes
                   # the same quotient.
                   floatdiv = float(upper)/float(ty)
                   bound =  int(math.ceil(float(upper)/float(ty)))
                   chill.tile7(stmt, level+1, bound, level+1, "tmp_ty", "ty", counted)

        # Always add sync
        chill.add_sync( stmt, start_loop )
def _parse_tile_key(control):
    """Split an index_names key of the form "l<N>_control" or "l<N>_tile".

    Returns (N, "control") or (N, "tile") with N as an int, or (None, None)
    when the key does not have that form.  Slices are used instead of
    indexing so that empty or one-character keys are rejected rather than
    raising IndexError.
    """
    if control[:1] != "l" or not control[1:2].isdigit():
        return None, None
    for kind in ("control", "tile"):
        suffix = "_" + kind
        if control.endswith(suffix):
            try:
                return int(control[1:-len(suffix)]), kind
            except ValueError:
                # e.g. "l1x_tile": starts like a loop key but the part
                # between "l" and the suffix is not a number.
                return None, None
    return None, None


def tile_by_index( tile_indices, sizes, index_names, final_order, tile_method):
    """Tile each loop named in tile_indices (statement 0) and permute the result.

    Args:
        tile_indices: sequence of loop index names to tile; checked with
            valid_indices() for statement 0.
        sizes: sizes[i] is the tile size handed to chill.tile7 for
            tile_indices[i].
        index_names: dict whose keys are "l<N>_control" or "l<N>_tile"
            (N is the 1-based position in tile_indices) and whose values are
            the names to give the control loop / tile loop of that index.
            Keys of any other form are reported on stdout and ignored.
        final_order: passed to build_order() to derive each permutation.
        tile_method: passed to chill.tile7; None selects 1 ("counted").

    Raises:
        KeyError: if some entry of tile_indices has no "l<N>_control" name.
            This is checked before any transformation is applied, so the
            loop nest is not left half-transformed.

    Exits the process with status -1 (sys.exit) when valid_indices() rejects
    tile_indices.  Relies on names defined outside this function: chill,
    valid_indices, build_order, find_cur_level, find_offset and sys.
    """
    stmt = 0 # assume statement 0
    if not valid_indices( stmt, tile_indices):
        # Single-argument print(...) behaves the same under Python 2 and 3;
        # the doubled spaces reproduce the output of the former three
        # comma-terminated print statements.
        print("python tile_by_index() one or more of  %s  is not valid" % (tile_indices,))
        sys.exit(-1)

    if tile_method is None:
        tile_method = 1 # "counted"

    # Tile loops keep their original name unless an "l<N>_tile" key renames them.
    tile_index_names = list(tile_indices)
    control_index_names = {}   # 0-based position -> control loop name
    tile_index_map = {}        # new tile loop name -> original index name

    for control, name in index_names.items():
        index, kind = _parse_tile_key(control)
        if kind == "control":
            control_index_names[index-1] = name
        elif kind == "tile" and 1 <= index <= len(tile_indices):
            # The range check keeps "l0_tile" from silently overwriting the
            # last entry through a negative list index.
            tile_index_names[index-1] = name
            tile_index_map[name] = tile_indices[index-1]
        else:
            print("%s is not a proper key for specifying tile or control loop indices\n" % (control,))

    # Every tiled index needs a control-loop name further down; fail before
    # touching the loop nest rather than part-way through.
    for i in range(len(tile_index_names)):
        if i not in control_index_names:
            raise KeyError("index_names has no 'l%d_control' entry" % (i + 1))

    # filter out control indices (and do name substitution of unprocessed tile indices) for a given level
    cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1)

    print(cur_order)
    chill.permute(stmt, list(cur_order))

    for i, cur_idx in enumerate(tile_indices):
        cur_order = build_order( final_order, tile_indices, control_index_names, tile_index_map, i)

        # Find an offset between tile loop and control loop
        #  0   = control loop one level above tile loop
        #  -1  = control loop two levels above tile loop
        #  > 0 = tile loop above control loop
        #  In the last case, we do two extra tile commands to get the control
        #  above the tile and then rely on the final permute to handle the
        #  rest
        level = find_cur_level(stmt,cur_idx)
        tile_name = tile_index_names[i]
        control_name = control_index_names[i]
        offset = find_offset(cur_order, tile_name, control_name)

        if offset <= 0:
            chill.tile7( stmt, level, sizes[i], level+offset, tile_name, control_name, tile_method )
        else:
            chill.tile7( stmt, level, sizes[i], level+offset-1, tile_name, control_name, tile_method ) # regular level

            # flip and tile control loop
            chill.tile3( stmt, level+1, level+1)
            chill.tile3( stmt, level+1, level)

        # Do permutation based on cur_order.  build_order is called again
        # after tiling, exactly as before, in case its result depends on
        # the transformed loop nest.
        cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i)
        chill.permute(stmt, list(cur_order))
# Example no. 4 (0) -- separator text from the code listing, not part of the program
def _parse_tile_key(key):
    """Split an index_names key of the form "l<N>_control" or "l<N>_tile".

    Returns a (kind, n) tuple, where kind is "control" or "tile" and n is the
    number embedded in the key, or None when the key does not have that shape
    (wrong prefix, wrong suffix, or a non-numeric <N>).
    """
    for suffix in ("_control", "_tile"):
        if key.startswith("l") and key.endswith(suffix):
            digits = key[1:-len(suffix)]
            if digits.isdigit():
                return suffix[1:], int(digits)
    return None


def tile_by_index( tile_indices, sizes, index_names, final_order, tile_method):
    """Tile the loops named in tile_indices of statement 0, one at a time.

    Parameters:
      tile_indices -- sequence of loop index names to tile; checked with
                      valid_indices() and located with find_cur_level().
      sizes        -- sizes[i] is handed to chill.tile7() for tile_indices[i].
      index_names  -- iterable of (key, name) pairs.  A key "l<N>_control"
                      names the control loop for the N-th tile index, a key
                      "l<N>_tile" renames the N-th tile loop (N counts from 1).
                      Keys of any other shape, and "l<N>_tile" keys whose N is
                      outside 1..len(tile_indices), are reported on stdout and
                      ignored.
      final_order  -- passed through to build_order() to produce the loop
                      order used for each permute.
      tile_method  -- passed through to chill.tile7(); None selects 1
                      ("counted").

    Exits the process with status -1 when valid_indices() rejects
    tile_indices.  Raises KeyError when a tile index has no matching
    "l<N>_control" entry in index_names.
    """
    stmt = 0 # assume statement 0
    if not valid_indices( stmt, tile_indices):
        # Single string so the text matches what the former three chained
        # print statements produced (including the doubled spaces).
        print("python tile_by_index() one or more of  %s  is not valid" % (tile_indices,))
        sys.exit(-1)

    if tile_method is None:
        tile_method = 1 # "counted"

    # Tile loops keep their original names unless an "l<N>_tile" key renames them.
    tile_index_names = list(tile_indices)
    control_index_names = {}  # 0-based position -> control loop name
    tile_index_map = {}       # new tile loop name -> original index name

    for pair in index_names:
        control = pair[0]
        name    = pair[1]

        valid = False
        parsed = _parse_tile_key(control)
        if parsed is not None:
            kind, index = parsed
            if kind == "control":
                control_index_names[index-1] = name
                valid = True
            elif 1 <= index <= len(tile_index_names):
                # The range check keeps "l0_tile" from wrapping around to the
                # last entry and an oversized N from raising IndexError.
                tile_index_names[index-1] = name
                tile_index_map[name] = tile_indices[index-1]
                valid = True
        if not valid:
            print("%s is not a proper key for specifying tile or control loop indices\n" % control)

    # filter out control indices (and do name substitution of unprocessed tile indices) for a given level
    cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, -1)

    # chill.permute() takes one tuple: the statement number followed by the order.
    cur_order.insert(0, stmt)
    chill.permute( tuple( cur_order))

    for i, cur_idx in enumerate(tile_indices):
        cur_order = build_order( final_order, tile_indices, control_index_names, tile_index_map, i)

        # Find an offset between tile loop and control loop
        #  0   = control loop one level above tile loop
        #  -1  = control loop two levels above tile loop
        #  > 0 = tile loop above control loop
        #  In the last case, we do two extra tile commands to get the control
        #  above the tile and then rely on the final permute to handle the
        #  rest
        level = find_cur_level(stmt,cur_idx)

        # control_index_names is a dict: a tile index without an
        # "l<N>_control" entry raises KeyError here.
        offset = find_offset(cur_order, tile_index_names[i], control_index_names[i])

        if offset <= 0:
            chill.tile7( stmt, level, sizes[i], level+offset, tile_index_names[i], control_index_names[i], tile_method  )
        else:
            chill.tile7( stmt, level, sizes[i], level+offset-1, tile_index_names[i], control_index_names[i], tile_method  ) # regular level

            # flip and tile control loop
            chill.tile3( stmt, level+1, level+1)
            chill.tile3( stmt, level+1, level)

        # Do permutation based on the order rebuilt after tiling
        cur_order = build_order(final_order, tile_indices, control_index_names, tile_index_map, i)
        cur_order.insert(0, stmt)
        chill.permute( tuple(cur_order) )