__local DTYPE sdata[blockSize];
        const int our_index = get_global_id(1); //The index of the vector that we will sum
        const int offset = get_local_id(0);       

        sdata[offset] = 0;        
        if(our_index < other_size)
        {
            //Load the values
            for(int i = offset; i<sum_size;i+=blockSize )
            {
               sdata[offset] += from_matrix(in,i,our_index);
            }
        }
        
        NOWARPBLOCK
"""+opencl_tools.get_inkernal_reduction('sdata','blockSize','offset')+ """

        if(our_index < other_size && offset == 0)
        {
            out[our_index] = sdata[0];
        }
        
        
    }
"""

class SumKernal(object):
    # NOTE(review): only the opening lines of this class are visible in this
    # chunk. Presumably it wraps an OpenCL kernel that sums a matrix along
    # one axis -- confirm against the rest of the class.
    
    def __init__(self,matrix,axis,queue=None):
        """Initialise the object for ``matrix`` and ``axis``.

        Parameters
        ----------
        matrix
            Input the object is built for. Its type and memory layout are
            not visible from here -- TODO confirm against the caller.
        axis
            Must satisfy ``0 <= axis <= 1``; anything else fails the
            assertion below.
        queue
            Optional, defaults to ``None``. Presumably a command queue;
            how ``None`` is handled is not visible here -- TODO confirm.

        Raises
        ------
        AssertionError
            If ``axis`` is outside ``[0, 1]``.
        """
        # The range check is an assert, so it is skipped entirely when Python
        # runs with -O; out-of-range values would then pass through unchecked.
        assert axis >= 0 and axis <= 1
        
                const uint column_index = indices[c];
                #ifdef INPUT_COLUMN_MAJOR
                    const float in_value = in[column_index+out_col*out_rows];
                #else
                    const float in_value = in[column_index*max_cols+out_col];
                #endif
                sdata[offset] += data[c]*in_value;
            }
        }
        
        #ifdef WARPSPEED
        if (blockSize > 32)
        #endif        
            barrier(CLK_LOCAL_MEM_FENCE);  
"""
    + opencl_tools.get_inkernal_reduction("sdata", "blockSize", "offset")
    + """
        //Return the output
        if (offset == 0 && row < num_rows && out_col < max_cols)
        {
            #ifdef OUTPUT_COLUMN_MAJOR
                out[row+out_col*num_rows] = sdata[0];
            #else
                out[row*max_cols+out_col] = sdata[0];
            #endif
        }
      
    }
"""
)